# -*- coding: utf-8 -*-
# Copyright 2020 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import proto # type: ignore
from google.protobuf import duration_pb2 # type: ignore
from google.protobuf import timestamp_pb2 # type: ignore
from google.rpc import status_pb2 # type: ignore
__protobuf__ = proto.module(
package="google.cloud.videointelligence.v1",
manifest={
"Feature",
"LabelDetectionMode",
"Likelihood",
"AnnotateVideoRequest",
"VideoContext",
"LabelDetectionConfig",
"ShotChangeDetectionConfig",
"ObjectTrackingConfig",
"FaceDetectionConfig",
"PersonDetectionConfig",
"ExplicitContentDetectionConfig",
"TextDetectionConfig",
"VideoSegment",
"LabelSegment",
"LabelFrame",
"Entity",
"LabelAnnotation",
"ExplicitContentFrame",
"ExplicitContentAnnotation",
"NormalizedBoundingBox",
"FaceDetectionAnnotation",
"PersonDetectionAnnotation",
"FaceSegment",
"FaceFrame",
"FaceAnnotation",
"TimestampedObject",
"Track",
"DetectedAttribute",
"DetectedLandmark",
"VideoAnnotationResults",
"AnnotateVideoResponse",
"VideoAnnotationProgress",
"AnnotateVideoProgress",
"SpeechTranscriptionConfig",
"SpeechContext",
"SpeechTranscription",
"SpeechRecognitionAlternative",
"WordInfo",
"NormalizedVertex",
"NormalizedBoundingPoly",
"TextSegment",
"TextFrame",
"TextAnnotation",
"ObjectTrackingFrame",
"ObjectTrackingAnnotation",
"LogoRecognitionAnnotation",
},
)
class Feature(proto.Enum):
r"""Video annotation feature."""
FEATURE_UNSPECIFIED = 0
LABEL_DETECTION = 1
SHOT_CHANGE_DETECTION = 2
EXPLICIT_CONTENT_DETECTION = 3
FACE_DETECTION = 4
SPEECH_TRANSCRIPTION = 6
TEXT_DETECTION = 7
OBJECT_TRACKING = 9
LOGO_RECOGNITION = 12
PERSON_DETECTION = 14
class LabelDetectionMode(proto.Enum):
r"""Label detection mode."""
LABEL_DETECTION_MODE_UNSPECIFIED = 0
SHOT_MODE = 1
FRAME_MODE = 2
SHOT_AND_FRAME_MODE = 3
class Likelihood(proto.Enum):
r"""Bucketized representation of likelihood."""
LIKELIHOOD_UNSPECIFIED = 0
VERY_UNLIKELY = 1
UNLIKELY = 2
POSSIBLE = 3
LIKELY = 4
VERY_LIKELY = 5
class AnnotateVideoRequest(proto.Message):
r"""Video annotation request.
Attributes:
input_uri (str):
Input video location. Currently, only `Cloud
Storage <https://cloud.google.com/storage/>`__ URIs are
supported. URIs must be specified in the following format:
``gs://bucket-id/object-id`` (other URI formats return
[google.rpc.Code.INVALID_ARGUMENT][google.rpc.Code.INVALID_ARGUMENT]).
For more information, see `Request
URIs <https://cloud.google.com/storage/docs/request-endpoints>`__.
To identify multiple videos, a video URI may include
wildcards in the ``object-id``. Supported wildcards: '*' to
match 0 or more characters; '?' to match 1 character. If
unset, the input video should be embedded in the request as
``input_content``. If set, ``input_content`` must be unset.
input_content (bytes):
The video data bytes. If unset, the input video(s) should be
specified via the ``input_uri``. If set, ``input_uri`` must
be unset.
features (Sequence[google.cloud.videointelligence_v1.types.Feature]):
Required. Requested video annotation
features.
video_context (google.cloud.videointelligence_v1.types.VideoContext):
Additional video context and/or feature-specific parameters.
output_uri (str):
Optional. Location where the output (in JSON format) should
be stored. Currently, only `Cloud
Storage <https://cloud.google.com/storage/>`__ URIs are
supported. These must be specified in the following format:
``gs://bucket-id/object-id`` (other URI formats return
[google.rpc.Code.INVALID_ARGUMENT][google.rpc.Code.INVALID_ARGUMENT]).
For more information, see `Request
URIs <https://cloud.google.com/storage/docs/request-endpoints>`__.
location_id (str):
Optional. Cloud region where annotation should take place.
Supported cloud regions are: ``us-east1``, ``us-west1``,
``europe-west1``, ``asia-east1``. If no region is specified,
the region will be determined based on video file location.
"""
input_uri = proto.Field(proto.STRING, number=1,)
input_content = proto.Field(proto.BYTES, number=6,)
features = proto.RepeatedField(proto.ENUM, number=2, enum="Feature",)
video_context = proto.Field(proto.MESSAGE, number=3, message="VideoContext",)
output_uri = proto.Field(proto.STRING, number=4,)
location_id = proto.Field(proto.STRING, number=5,)
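# A minimal sketch of constructing an AnnotateVideoRequest from the types in
# this module. The Cloud Storage URI and the region are placeholders, and
# proto-plus keyword construction is assumed. Note that ``input_uri`` and
# ``input_content`` are mutually exclusive.
def _example_annotate_video_request():
    return AnnotateVideoRequest(
        input_uri="gs://example-bucket/example-video.mp4",  # placeholder URI
        features=[Feature.LABEL_DETECTION, Feature.SHOT_CHANGE_DETECTION],
        location_id="us-east1",
    )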
class VideoContext(proto.Message):
r"""Video context and/or feature-specific parameters.
Attributes:
segments (Sequence[google.cloud.videointelligence_v1.types.VideoSegment]):
Video segments to annotate. The segments may
overlap and are not required to be contiguous or
span the whole video. If unspecified, each video
is treated as a single segment.
label_detection_config (google.cloud.videointelligence_v1.types.LabelDetectionConfig):
Config for LABEL_DETECTION.
shot_change_detection_config (google.cloud.videointelligence_v1.types.ShotChangeDetectionConfig):
Config for SHOT_CHANGE_DETECTION.
explicit_content_detection_config (google.cloud.videointelligence_v1.types.ExplicitContentDetectionConfig):
Config for EXPLICIT_CONTENT_DETECTION.
face_detection_config (google.cloud.videointelligence_v1.types.FaceDetectionConfig):
Config for FACE_DETECTION.
speech_transcription_config (google.cloud.videointelligence_v1.types.SpeechTranscriptionConfig):
Config for SPEECH_TRANSCRIPTION.
text_detection_config (google.cloud.videointelligence_v1.types.TextDetectionConfig):
Config for TEXT_DETECTION.
person_detection_config (google.cloud.videointelligence_v1.types.PersonDetectionConfig):
Config for PERSON_DETECTION.
object_tracking_config (google.cloud.videointelligence_v1.types.ObjectTrackingConfig):
Config for OBJECT_TRACKING.
"""
segments = proto.RepeatedField(proto.MESSAGE, number=1, message="VideoSegment",)
label_detection_config = proto.Field(
proto.MESSAGE, number=2, message="LabelDetectionConfig",
)
shot_change_detection_config = proto.Field(
proto.MESSAGE, number=3, message="ShotChangeDetectionConfig",
)
explicit_content_detection_config = proto.Field(
proto.MESSAGE, number=4, message="ExplicitContentDetectionConfig",
)
face_detection_config = proto.Field(
proto.MESSAGE, number=5, message="FaceDetectionConfig",
)
speech_transcription_config = proto.Field(
proto.MESSAGE, number=6, message="SpeechTranscriptionConfig",
)
text_detection_config = proto.Field(
proto.MESSAGE, number=8, message="TextDetectionConfig",
)
person_detection_config = proto.Field(
proto.MESSAGE, number=11, message="PersonDetectionConfig",
)
object_tracking_config = proto.Field(
proto.MESSAGE, number=13, message="ObjectTrackingConfig",
)
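# A short sketch of a VideoContext that restricts annotation to the first
# minute of the video and tunes label detection; the segment bounds are
# illustrative, and Duration fields are assumed to accept datetime.timedelta
# under proto-plus.
def _example_video_context():
    import datetime

    first_minute = VideoSegment(
        start_time_offset=datetime.timedelta(seconds=0),
        end_time_offset=datetime.timedelta(seconds=60),
    )
    return VideoContext(
        segments=[first_minute],  # annotate only this span
        label_detection_config=LabelDetectionConfig(
            label_detection_mode=LabelDetectionMode.SHOT_AND_FRAME_MODE,
        ),
    )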
class LabelDetectionConfig(proto.Message):
r"""Config for LABEL_DETECTION.
Attributes:
label_detection_mode (google.cloud.videointelligence_v1.types.LabelDetectionMode):
What labels should be detected with LABEL_DETECTION, in
addition to video-level labels or segment-level labels. If
unspecified, defaults to ``SHOT_MODE``.
stationary_camera (bool):
Whether the video has been shot from a stationary (i.e.,
non-moving) camera. When set to true, it might improve
detection accuracy for moving objects. It should be used with
``SHOT_AND_FRAME_MODE`` enabled.
model (str):
Model to use for label detection.
Supported values: "builtin/stable" (the default
if unset) and "builtin/latest".
frame_confidence_threshold (float):
The confidence threshold used to filter labels from
frame-level detection. If not set, it defaults to 0.4. The
valid range for this threshold is [0.1, 0.9]; any value
outside this range will be clipped. Note: for best results,
use the default threshold. We will update the default
threshold every time we release a new model.
video_confidence_threshold (float):
The confidence threshold used to filter labels from
video-level and shot-level detections. If not set, it
defaults to 0.3. The valid range for this threshold is
[0.1, 0.9]; any value outside this range will be clipped.
Note: for best results, use the default threshold. We will
update the default threshold every time we release a new
model.
"""
label_detection_mode = proto.Field(proto.ENUM, number=1, enum="LabelDetectionMode",)
stationary_camera = proto.Field(proto.BOOL, number=2,)
model = proto.Field(proto.STRING, number=3,)
frame_confidence_threshold = proto.Field(proto.FLOAT, number=4,)
video_confidence_threshold = proto.Field(proto.FLOAT, number=5,)
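# A hedged example of a LabelDetectionConfig that opts into frame-level labels
# and tightens both confidence thresholds. Per the docstrings above, values
# outside [0.1, 0.9] would be clipped by the service.
def _example_label_detection_config():
    return LabelDetectionConfig(
        label_detection_mode=LabelDetectionMode.SHOT_AND_FRAME_MODE,
        stationary_camera=True,  # only set when the camera truly does not move
        frame_confidence_threshold=0.6,
        video_confidence_threshold=0.5,
    )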
class ShotChangeDetectionConfig(proto.Message):
r"""Config for SHOT_CHANGE_DETECTION.
Attributes:
model (str):
Model to use for shot change detection.
Supported values: "builtin/stable" (the default
if unset) and "builtin/latest".
"""
model = proto.Field(proto.STRING, number=1,)
class ObjectTrackingConfig(proto.Message):
r"""Config for OBJECT_TRACKING.
Attributes:
model (str):
Model to use for object tracking.
Supported values: "builtin/stable" (the default
if unset) and "builtin/latest".
"""
model = proto.Field(proto.STRING, number=1,)
class FaceDetectionConfig(proto.Message):
r"""Config for FACE_DETECTION.
Attributes:
model (str):
Model to use for face detection.
Supported values: "builtin/stable" (the default
if unset) and "builtin/latest".
include_bounding_boxes (bool):
Whether bounding boxes are included in the
face annotation output.
include_attributes (bool):
Whether to enable face attributes detection, such as
glasses, dark_glasses, mouth_open etc. Ignored if
'include_bounding_boxes' is set to false.
"""
model = proto.Field(proto.STRING, number=1,)
include_bounding_boxes = proto.Field(proto.BOOL, number=2,)
include_attributes = proto.Field(proto.BOOL, number=5,)
class PersonDetectionConfig(proto.Message):
r"""Config for PERSON_DETECTION.
Attributes:
include_bounding_boxes (bool):
Whether bounding boxes are included in the
person detection annotation output.
include_pose_landmarks (bool):
Whether to enable pose landmarks detection. Ignored if
'include_bounding_boxes' is set to false.
include_attributes (bool):
Whether to enable person attributes detection, such as cloth
color (black, blue, etc), type (coat, dress, etc), pattern
(plain, floral, etc), hair, etc. Ignored if
'include_bounding_boxes' is set to false.
"""
include_bounding_boxes = proto.Field(proto.BOOL, number=1,)
include_pose_landmarks = proto.Field(proto.BOOL, number=2,)
include_attributes = proto.Field(proto.BOOL, number=3,)
class ExplicitContentDetectionConfig(proto.Message):
r"""Config for EXPLICIT_CONTENT_DETECTION.
Attributes:
model (str):
Model to use for explicit content detection.
Supported values: "builtin/stable" (the default
if unset) and "builtin/latest".
"""
model = proto.Field(proto.STRING, number=1,)
class TextDetectionConfig(proto.Message):
r"""Config for TEXT_DETECTION.
Attributes:
language_hints (Sequence[str]):
Language hints can be specified if the
language to be detected is known a priori;
they can increase the accuracy of the
detection. Each hint must be a language code
in BCP-47 format. Automatic language detection
is performed if no hint is provided.
model (str):
Model to use for text detection.
Supported values: "builtin/stable" (the default
if unset) and "builtin/latest".
"""
language_hints = proto.RepeatedField(proto.STRING, number=1,)
model = proto.Field(proto.STRING, number=2,)
class VideoSegment(proto.Message):
r"""Video segment.
Attributes:
start_time_offset (google.protobuf.duration_pb2.Duration):
Time-offset, relative to the beginning of the
video, corresponding to the start of the segment
(inclusive).
end_time_offset (google.protobuf.duration_pb2.Duration):
Time-offset, relative to the beginning of the
video, corresponding to the end of the segment
(inclusive).
"""
start_time_offset = proto.Field(
proto.MESSAGE, number=1, message=duration_pb2.Duration,
)
end_time_offset = proto.Field(
proto.MESSAGE, number=2, message=duration_pb2.Duration,
)
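# A small helper sketch: the length of a VideoSegment. Proto-plus is assumed
# to marshal Duration fields to datetime.timedelta on read, so plain
# subtraction works here.
def _example_segment_length(segment):
    return segment.end_time_offset - segment.start_time_offset  # timedelta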
class LabelSegment(proto.Message):
r"""Video segment level annotation results for label detection.
Attributes:
segment (google.cloud.videointelligence_v1.types.VideoSegment):
Video segment where a label was detected.
confidence (float):
Confidence that the label is accurate. Range: [0, 1].
"""
segment = proto.Field(proto.MESSAGE, number=1, message="VideoSegment",)
confidence = proto.Field(proto.FLOAT, number=2,)
class LabelFrame(proto.Message):
r"""Video frame level annotation results for label detection.
Attributes:
time_offset (google.protobuf.duration_pb2.Duration):
Time-offset, relative to the beginning of the
video, corresponding to the video frame for this
location.
confidence (float):
Confidence that the label is accurate. Range: [0, 1].
"""
time_offset = proto.Field(proto.MESSAGE, number=1, message=duration_pb2.Duration,)
confidence = proto.Field(proto.FLOAT, number=2,)
class Entity(proto.Message):
r"""Detected entity from video analysis.
Attributes:
entity_id (str):
Opaque entity ID. Some IDs may be available in `Google
Knowledge Graph Search
API <https://developers.google.com/knowledge-graph/>`__.
description (str):
Textual description, e.g., ``Fixed-gear bicycle``.
language_code (str):
Language code for ``description`` in BCP-47 format.
"""
entity_id = proto.Field(proto.STRING, number=1,)
description = proto.Field(proto.STRING, number=2,)
language_code = proto.Field(proto.STRING, number=3,)
class LabelAnnotation(proto.Message):
r"""Label annotation.
Attributes:
entity (google.cloud.videointelligence_v1.types.Entity):
Detected entity.
category_entities (Sequence[google.cloud.videointelligence_v1.types.Entity]):
Common categories for the detected entity. For example, when
the label is ``Terrier``, the category is likely ``dog``. In
some cases there might be more than one category; e.g.,
``Terrier`` could also be a ``pet``.
segments (Sequence[google.cloud.videointelligence_v1.types.LabelSegment]):
All video segments where a label was
detected.
frames (Sequence[google.cloud.videointelligence_v1.types.LabelFrame]):
All video frames where a label was detected.
version (str):
Feature version.
"""
entity = proto.Field(proto.MESSAGE, number=1, message="Entity",)
category_entities = proto.RepeatedField(proto.MESSAGE, number=2, message="Entity",)
segments = proto.RepeatedField(proto.MESSAGE, number=3, message="LabelSegment",)
frames = proto.RepeatedField(proto.MESSAGE, number=4, message="LabelFrame",)
version = proto.Field(proto.STRING, number=5,)
class ExplicitContentFrame(proto.Message):
r"""Video frame level annotation results for explicit content.
Attributes:
time_offset (google.protobuf.duration_pb2.Duration):
Time-offset, relative to the beginning of the
video, corresponding to the video frame for this
location.
pornography_likelihood (google.cloud.videointelligence_v1.types.Likelihood):
Likelihood of the pornography content.
"""
time_offset = proto.Field(proto.MESSAGE, number=1, message=duration_pb2.Duration,)
pornography_likelihood = proto.Field(proto.ENUM, number=2, enum="Likelihood",)
class ExplicitContentAnnotation(proto.Message):
r"""Explicit content annotation (based on per-frame visual
signals only). If no explicit content has been detected in a
frame, no annotations are present for that frame.
Attributes:
frames (Sequence[google.cloud.videointelligence_v1.types.ExplicitContentFrame]):
All video frames where explicit content was
detected.
version (str):
Feature version.
"""
frames = proto.RepeatedField(
proto.MESSAGE, number=1, message="ExplicitContentFrame",
)
version = proto.Field(proto.STRING, number=2,)
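# A sketch that collects the time offsets of frames at or above a given
# Likelihood bucket. Ordering comparisons work because the enum values run
# from VERY_UNLIKELY (1) up to VERY_LIKELY (5).
def _example_flag_explicit_frames(annotation, minimum=Likelihood.LIKELY):
    return [
        frame.time_offset
        for frame in annotation.frames
        if frame.pornography_likelihood >= minimum
    ]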
class NormalizedBoundingBox(proto.Message):
r"""Normalized bounding box. The normalized vertex coordinates are
relative to the original image. Range: [0, 1].
Attributes:
left (float):
Left X coordinate.
top (float):
Top Y coordinate.
right (float):
Right X coordinate.
bottom (float):
Bottom Y coordinate.
"""
left = proto.Field(proto.FLOAT, number=1,)
top = proto.Field(proto.FLOAT, number=2,)
right = proto.Field(proto.FLOAT, number=3,)
bottom = proto.Field(proto.FLOAT, number=4,)
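# Because the coordinates are normalized, callers need the frame dimensions to
# recover pixel values. A minimal conversion sketch (the frame size comes from
# the caller, e.g. from their video decoder):
def _example_box_to_pixels(box, frame_width, frame_height):
    return (
        int(box.left * frame_width),     # left edge in pixels
        int(box.top * frame_height),     # top edge in pixels
        int(box.right * frame_width),    # right edge in pixels
        int(box.bottom * frame_height),  # bottom edge in pixels
    )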
class FaceDetectionAnnotation(proto.Message):
r"""Face detection annotation.
Attributes:
tracks (Sequence[google.cloud.videointelligence_v1.types.Track]):
The face tracks with attributes.
thumbnail (bytes):
The thumbnail of a person's face.
version (str):
Feature version.
"""
tracks = proto.RepeatedField(proto.MESSAGE, number=3, message="Track",)
thumbnail = proto.Field(proto.BYTES, number=4,)
version = proto.Field(proto.STRING, number=5,)
class PersonDetectionAnnotation(proto.Message):
r"""Person detection annotation per video.
Attributes:
tracks (Sequence[google.cloud.videointelligence_v1.types.Track]):
The detected tracks of a person.
version (str):
Feature version.
"""
tracks = proto.RepeatedField(proto.MESSAGE, number=1, message="Track",)
version = proto.Field(proto.STRING, number=2,)
class FaceSegment(proto.Message):
r"""Video segment level annotation results for face detection.
Attributes:
segment (google.cloud.videointelligence_v1.types.VideoSegment):
Video segment where a face was detected.
"""
segment = proto.Field(proto.MESSAGE, number=1, message="VideoSegment",)
class FaceFrame(proto.Message):
r"""Deprecated. No effect.
Attributes:
normalized_bounding_boxes (Sequence[google.cloud.videointelligence_v1.types.NormalizedBoundingBox]):
Normalized bounding boxes in a frame. There
can be more than one box if the same face is
detected in multiple locations within the
current frame.
time_offset (google.protobuf.duration_pb2.Duration):
Time-offset, relative to the beginning of the
video, corresponding to the video frame for this
location.
"""
normalized_bounding_boxes = proto.RepeatedField(
proto.MESSAGE, number=1, message="NormalizedBoundingBox",
)
time_offset = proto.Field(proto.MESSAGE, number=2, message=duration_pb2.Duration,)
class FaceAnnotation(proto.Message):
r"""Deprecated. No effect.
Attributes:
thumbnail (bytes):
Thumbnail of a representative face view (in
JPEG format).
segments (Sequence[google.cloud.videointelligence_v1.types.FaceSegment]):
All video segments where a face was detected.
frames (Sequence[google.cloud.videointelligence_v1.types.FaceFrame]):
All video frames where a face was detected.
"""
thumbnail = proto.Field(proto.BYTES, number=1,)
segments = proto.RepeatedField(proto.MESSAGE, number=2, message="FaceSegment",)
frames = proto.RepeatedField(proto.MESSAGE, number=3, message="FaceFrame",)
class TimestampedObject(proto.Message):
r"""For tracking related features. An object at time_offset with
attributes, and located with normalized_bounding_box.
Attributes:
normalized_bounding_box (google.cloud.videointelligence_v1.types.NormalizedBoundingBox):
Normalized bounding box in a frame where the
object is located.
time_offset (google.protobuf.duration_pb2.Duration):
Time-offset, relative to the beginning of the
video, corresponding to the video frame for this
object.
attributes (Sequence[google.cloud.videointelligence_v1.types.DetectedAttribute]):
Optional. The attributes of the object in the
bounding box.
landmarks (Sequence[google.cloud.videointelligence_v1.types.DetectedLandmark]):
Optional. The detected landmarks.
"""
normalized_bounding_box = proto.Field(
proto.MESSAGE, number=1, message="NormalizedBoundingBox",
)
time_offset = proto.Field(proto.MESSAGE, number=2, message=duration_pb2.Duration,)
attributes = proto.RepeatedField(
proto.MESSAGE, number=3, message="DetectedAttribute",
)
landmarks = proto.RepeatedField(
proto.MESSAGE, number=4, message="DetectedLandmark",
)
class Track(proto.Message):
r"""A track of an object instance.
Attributes:
segment (google.cloud.videointelligence_v1.types.VideoSegment):
Video segment of a track.
timestamped_objects (Sequence[google.cloud.videointelligence_v1.types.TimestampedObject]):
The object with timestamp and attributes per
frame in the track.
attributes (Sequence[google.cloud.videointelligence_v1.types.DetectedAttribute]):
Optional. Attributes in the track level.
confidence (float):
Optional. The confidence score of the tracked
object.
"""
segment = proto.Field(proto.MESSAGE, number=1, message="VideoSegment",)
timestamped_objects = proto.RepeatedField(
proto.MESSAGE, number=2, message="TimestampedObject",
)
attributes = proto.RepeatedField(
proto.MESSAGE, number=3, message="DetectedAttribute",
)
confidence = proto.Field(proto.FLOAT, number=4,)
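# A sketch that walks a Track, yielding each per-frame observation as a
# (time_offset, normalized_bounding_box) pair; useful, for example, for
# drawing a tracked object over the video.
def _example_track_positions(track):
    for obj in track.timestamped_objects:
        yield obj.time_offset, obj.normalized_bounding_box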
class DetectedAttribute(proto.Message):
r"""A generic detected attribute represented by name in string
format.
Attributes:
name (str):
The name of the attribute, for example, glasses,
dark_glasses, mouth_open. A full list of supported type
names will be provided in the document.
confidence (float):
Detected attribute confidence. Range [0, 1].
value (str):
Text value of the detection result. For
example, the value for "HairColor" can be
"black", "blonde", etc.
"""
name = proto.Field(proto.STRING, number=1,)
confidence = proto.Field(proto.FLOAT, number=2,)
value = proto.Field(proto.STRING, number=3,)
class DetectedLandmark(proto.Message):
r"""A generic detected landmark represented by name in string
format and a 2D location.
Attributes:
name (str):
The name of this landmark, for example, left_hand,
right_shoulder.
point (google.cloud.videointelligence_v1.types.NormalizedVertex):
The 2D point of the detected landmark using
the normalized image coordinate system. The
normalized coordinates range from 0 to 1.
confidence (float):
The confidence score of the detected landmark. Range [0, 1].
"""
name = proto.Field(proto.STRING, number=1,)
point = proto.Field(proto.MESSAGE, number=2, message="NormalizedVertex",)
confidence = proto.Field(proto.FLOAT, number=3,)
class VideoAnnotationResults(proto.Message):
r"""Annotation results for a single video.
Attributes:
input_uri (str):
Video file location in `Cloud
Storage <https://cloud.google.com/storage/>`__.
segment (google.cloud.videointelligence_v1.types.VideoSegment):
Video segment on which the annotation is run.
segment_label_annotations (Sequence[google.cloud.videointelligence_v1.types.LabelAnnotation]):
Topical label annotations on video level or
user-specified segment level. There is exactly
one element for each unique label.
segment_presence_label_annotations (Sequence[google.cloud.videointelligence_v1.types.LabelAnnotation]):
Presence label annotations on video level or user-specified
segment level. There is exactly one element for each unique
label. Compared to the existing topical
``segment_label_annotations``, this field presents more
fine-grained, segment-level labels detected in video content
and is made available only when the client sets
``LabelDetectionConfig.model`` to "builtin/latest" in the
request.
shot_label_annotations (Sequence[google.cloud.videointelligence_v1.types.LabelAnnotation]):
Topical label annotations on shot level.
There is exactly one element for each unique
label.
shot_presence_label_annotations (Sequence[google.cloud.videointelligence_v1.types.LabelAnnotation]):
Presence label annotations on shot level. There is exactly
one element for each unique label. Compared to the existing
topical ``shot_label_annotations``, this field presents more
fine-grained, shot-level labels detected in video content
and is made available only when the client sets
``LabelDetectionConfig.model`` to "builtin/latest" in the
request.
frame_label_annotations (Sequence[google.cloud.videointelligence_v1.types.LabelAnnotation]):
Label annotations on frame level.
There is exactly one element for each unique
label.
face_annotations (Sequence[google.cloud.videointelligence_v1.types.FaceAnnotation]):
Deprecated. Please use ``face_detection_annotations``
instead.
face_detection_annotations (Sequence[google.cloud.videointelligence_v1.types.FaceDetectionAnnotation]):
Face detection annotations.
shot_annotations (Sequence[google.cloud.videointelligence_v1.types.VideoSegment]):
Shot annotations. Each shot is represented as
a video segment.
explicit_annotation (google.cloud.videointelligence_v1.types.ExplicitContentAnnotation):
Explicit content annotation.
speech_transcriptions (Sequence[google.cloud.videointelligence_v1.types.SpeechTranscription]):
Speech transcription.
text_annotations (Sequence[google.cloud.videointelligence_v1.types.TextAnnotation]):
OCR text detection and tracking.
Annotations for a list of detected text
snippets. Each will have a list of frame
information associated with it.
object_annotations (Sequence[google.cloud.videointelligence_v1.types.ObjectTrackingAnnotation]):
Annotations for a list of objects detected
and tracked in the video.
logo_recognition_annotations (Sequence[google.cloud.videointelligence_v1.types.LogoRecognitionAnnotation]):
Annotations for a list of logos detected,
tracked, and recognized in the video.
person_detection_annotations (Sequence[google.cloud.videointelligence_v1.types.PersonDetectionAnnotation]):
Person detection annotations.
error (google.rpc.status_pb2.Status):
If set, indicates an error. Note that for a single
``AnnotateVideoRequest`` some videos may succeed and some
may fail.
"""
input_uri = proto.Field(proto.STRING, number=1,)
segment = proto.Field(proto.MESSAGE, number=10, message="VideoSegment",)
segment_label_annotations = proto.RepeatedField(
proto.MESSAGE, number=2, message="LabelAnnotation",
)
segment_presence_label_annotations = proto.RepeatedField(
proto.MESSAGE, number=23, message="LabelAnnotation",
)
shot_label_annotations = proto.RepeatedField(
proto.MESSAGE, number=3, message="LabelAnnotation",
)
shot_presence_label_annotations = proto.RepeatedField(
proto.MESSAGE, number=24, message="LabelAnnotation",
)
frame_label_annotations = proto.RepeatedField(
proto.MESSAGE, number=4, message="LabelAnnotation",
)
face_annotations = proto.RepeatedField(
proto.MESSAGE, number=5, message="FaceAnnotation",
)
face_detection_annotations = proto.RepeatedField(
proto.MESSAGE, number=13, message="FaceDetectionAnnotation",
)
shot_annotations = proto.RepeatedField(
proto.MESSAGE, number=6, message="VideoSegment",
)
explicit_annotation = proto.Field(
proto.MESSAGE, number=7, message="ExplicitContentAnnotation",
)
speech_transcriptions = proto.RepeatedField(
proto.MESSAGE, number=11, message="SpeechTranscription",
)
text_annotations = proto.RepeatedField(
proto.MESSAGE, number=12, message="TextAnnotation",
)
object_annotations = proto.RepeatedField(
proto.MESSAGE, number=14, message="ObjectTrackingAnnotation",
)
logo_recognition_annotations = proto.RepeatedField(
proto.MESSAGE, number=19, message="LogoRecognitionAnnotation",
)
person_detection_annotations = proto.RepeatedField(
proto.MESSAGE, number=20, message="PersonDetectionAnnotation",
)
error = proto.Field(proto.MESSAGE, number=9, message=status_pb2.Status,)
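# A hedged sketch of consuming VideoAnnotationResults once annotation has
# finished. The per-video ``error`` field is checked first, since a single
# AnnotateVideoRequest can succeed for some videos and fail for others.
def _example_print_segment_labels(results):
    if results.error.code:  # non-zero google.rpc code: this video failed
        print("annotation failed:", results.input_uri, results.error.message)
        return
    for annotation in results.segment_label_annotations:
        for segment in annotation.segments:
            print(annotation.entity.description, segment.confidence)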
class AnnotateVideoResponse(proto.Message):
r"""Video annotation response. Included in the ``response`` field of the
``Operation`` returned by the ``GetOperation`` call of the
``google::longrunning::Operations`` service.
Attributes:
annotation_results (Sequence[google.cloud.videointelligence_v1.types.VideoAnnotationResults]):
Annotation results for all videos specified in
``AnnotateVideoRequest``.
"""
annotation_results = proto.RepeatedField(
proto.MESSAGE, number=1, message="VideoAnnotationResults",
)
class VideoAnnotationProgress(proto.Message):
r"""Annotation progress for a single video.
Attributes:
input_uri (str):
Video file location in `Cloud
Storage <https://cloud.google.com/storage/>`__.
progress_percent (int):
Approximate percentage processed thus far.
Guaranteed to be 100 when fully processed.
start_time (google.protobuf.timestamp_pb2.Timestamp):
Time when the request was received.
update_time (google.protobuf.timestamp_pb2.Timestamp):
Time of the most recent update.
feature (google.cloud.videointelligence_v1.types.Feature):
Specifies which feature is being tracked if
the request contains more than one feature.
segment (google.cloud.videointelligence_v1.types.VideoSegment):
Specifies which segment is being tracked if
the request contains more than one segment.
"""
input_uri = proto.Field(proto.STRING, number=1,)
progress_percent = proto.Field(proto.INT32, number=2,)
start_time = proto.Field(proto.MESSAGE, number=3, message=timestamp_pb2.Timestamp,)
update_time = proto.Field(proto.MESSAGE, number=4, message=timestamp_pb2.Timestamp,)
feature = proto.Field(proto.ENUM, number=5, enum="Feature",)
segment = proto.Field(proto.MESSAGE, number=6, message="VideoSegment",)
class AnnotateVideoProgress(proto.Message):
r"""Video annotation progress. Included in the ``metadata`` field of the
``Operation`` returned by the ``GetOperation`` call of the
``google::longrunning::Operations`` service.
Attributes:
annotation_progress (Sequence[google.cloud.videointelligence_v1.types.VideoAnnotationProgress]):
Progress metadata for all videos specified in
``AnnotateVideoRequest``.
"""
annotation_progress = proto.RepeatedField(
proto.MESSAGE, number=1, message="VideoAnnotationProgress",
)
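# A sketch of reporting progress from the long-running operation returned by
# the service; it assumes, per the docstring above, that the operation's
# ``metadata`` field carries an AnnotateVideoProgress message.
def _example_report_progress(operation):
    for progress in operation.metadata.annotation_progress:
        print(progress.input_uri, f"{progress.progress_percent}% processed")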
class SpeechTranscriptionConfig(proto.Message):
r"""Config for SPEECH_TRANSCRIPTION.
Attributes:
language_code (str):
Required. The language of the supplied audio as a
`BCP-47 <https://www.rfc-editor.org/rfc/bcp/bcp47.txt>`__
language tag. Example: "en-US". See `Language
Support <https://cloud.google.com/speech/docs/languages>`__
for a list of the currently supported language codes.
max_alternatives (int):
Optional. Maximum number of recognition hypotheses to be
returned. Specifically, the maximum number of
``SpeechRecognitionAlternative`` messages within each
``SpeechTranscription``. The server may return fewer than
``max_alternatives``. Valid values are ``0``-``30``. A value
of ``0`` or ``1`` will return a maximum of one. If omitted,
it will return a maximum of one.
filter_profanity (bool):
Optional. If set to ``true``, the server will attempt to
filter out profanities, replacing all but the initial
character in each filtered word with asterisks, e.g. "f***".
If set to ``false`` or omitted, profanities won't be
filtered out.
speech_contexts (Sequence[google.cloud.videointelligence_v1.types.SpeechContext]):
Optional. A means to provide context to
assist the speech recognition.
enable_automatic_punctuation (bool):
Optional. If 'true', adds punctuation to
recognition result hypotheses. This feature is
only available in select languages. Setting this
for requests in other languages has no effect.
The default 'false' value does not add
punctuation to result hypotheses. NOTE: this is
currently offered as an experimental service,
complimentary to all users. In the future it
may be exclusively available as a premium
feature.
audio_tracks (Sequence[int]):
Optional. For file formats, such as MXF or
MKV, supporting multiple audio tracks, specify
up to two tracks. Default: track 0.
enable_speaker_diarization (bool):
Optional. If 'true', enables speaker detection for each
recognized word in the top alternative of the recognition
result using a speaker_tag provided in the WordInfo. Note:
When this is true, we send all the words from the beginning
of the audio for the top alternative in every consecutive
response. This is done in order to improve our speaker tags
as our models learn to identify the speakers in the
conversation over time.
diarization_speaker_count (int):
Optional. If set, specifies the estimated number of speakers
in the conversation. If not set, defaults to '2'. Ignored
unless enable_speaker_diarization is set to true.
enable_word_confidence (bool):
Optional. If ``true``, the top result includes a list of
words and the confidence for those words. If ``false``, no
word-level confidence information is returned. The default
is ``false``.
"""
language_code = proto.Field(proto.STRING, number=1,)
max_alternatives = proto.Field(proto.INT32, number=2,)
filter_profanity = proto.Field(proto.BOOL, number=3,)
speech_contexts = proto.RepeatedField(
proto.MESSAGE, number=4, message="SpeechContext",
)
enable_automatic_punctuation = proto.Field(proto.BOOL, number=5,)
audio_tracks = proto.RepeatedField(proto.INT32, number=6,)
enable_speaker_diarization = proto.Field(proto.BOOL, number=7,)
diarization_speaker_count = proto.Field(proto.INT32, number=8,)
enable_word_confidence = proto.Field(proto.BOOL, number=9,)
class SpeechContext(proto.Message):
r"""Provides "hints" to the speech recognizer to favor specific
words and phrases in the results.
Attributes:
phrases (Sequence[str]):
Optional. A list of strings containing words and phrases
"hints" so that the speech recognition is more likely to
recognize them. This can be used to improve the accuracy for
specific words and phrases, for example, if specific
commands are typically spoken by the user. This can also be
used to add additional words to the vocabulary of the
recognizer. See `usage
limits <https://cloud.google.com/speech/limits#content>`__.
"""
phrases = proto.RepeatedField(proto.STRING, number=1,)
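# A sketch combining SpeechTranscriptionConfig with a SpeechContext phrase
# hint; the language code and phrases are placeholders.
def _example_speech_transcription_config():
    return SpeechTranscriptionConfig(
        language_code="en-US",
        enable_automatic_punctuation=True,
        speech_contexts=[SpeechContext(phrases=["Cloud Video Intelligence"])],
    )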
class SpeechTranscription(proto.Message):
r"""A speech recognition result corresponding to a portion of the
audio.
Attributes:
alternatives (Sequence[google.cloud.videointelligence_v1.types.SpeechRecognitionAlternative]):
May contain one or more recognition hypotheses (up to the
maximum specified in ``max_alternatives``). These
alternatives are ordered in terms of accuracy, with the top
(first) alternative being the most probable, as ranked by
the recognizer.
language_code (str):
Output only. The
`BCP-47 <https://www.rfc-editor.org/rfc/bcp/bcp47.txt>`__
language tag of the language in this result. This language
code was detected as the most likely language spoken in the
audio.
"""
alternatives = proto.RepeatedField(
proto.MESSAGE, number=1, message="SpeechRecognitionAlternative",
)
language_code = proto.Field(proto.STRING, number=2,)
class SpeechRecognitionAlternative(proto.Message):
r"""Alternative hypotheses (a.k.a. n-best list).
Attributes:
transcript (str):
Transcript text representing the words that
the user spoke.