// Copyright 2020 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
syntax = "proto3";
package google.cloud.videointelligence.v1p3beta1;
import "google/api/annotations.proto";
import "google/api/client.proto";
import "google/api/field_behavior.proto";
import "google/longrunning/operations.proto";
import "google/protobuf/duration.proto";
import "google/protobuf/timestamp.proto";
import "google/rpc/status.proto";
option csharp_namespace = "Google.Cloud.VideoIntelligence.V1P3Beta1";
option go_package = "google.golang.org/genproto/googleapis/cloud/videointelligence/v1p3beta1;videointelligence";
option java_multiple_files = true;
option java_outer_classname = "VideoIntelligenceServiceProto";
option java_package = "com.google.cloud.videointelligence.v1p3beta1";
option php_namespace = "Google\\Cloud\\VideoIntelligence\\V1p3beta1";
option ruby_package = "Google::Cloud::VideoIntelligence::V1p3beta1";
// Service that implements the Video Intelligence API.
service VideoIntelligenceService {
option (google.api.default_host) = "videointelligence.googleapis.com";
option (google.api.oauth_scopes) =
"https://www.googleapis.com/auth/cloud-platform";
// Performs asynchronous video annotation. Progress and results can be
// retrieved through the `google.longrunning.Operations` interface.
// `Operation.metadata` contains `AnnotateVideoProgress` (progress).
// `Operation.response` contains `AnnotateVideoResponse` (results).
rpc AnnotateVideo(AnnotateVideoRequest)
returns (google.longrunning.Operation) {
option (google.api.http) = {
post: "/v1p3beta1/videos:annotate"
body: "*"
};
option (google.api.method_signature) = "input_uri,features";
option (google.longrunning.operation_info) = {
response_type: "AnnotateVideoResponse"
metadata_type: "AnnotateVideoProgress"
};
}
}
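// Illustrative use of the endpoint above (a sketch, not an official sample;
// the bucket and object names are hypothetical). In the REST mapping, the
// request body is the JSON encoding of `AnnotateVideoRequest`:
//
//   POST https://videointelligence.googleapis.com/v1p3beta1/videos:annotate
//   {
//     "inputUri": "gs://example-bucket/example-video.mp4",
//     "features": ["LABEL_DETECTION", "SHOT_CHANGE_DETECTION"]
//   }
//
// The call returns a `google.longrunning.Operation`; poll it with
// `GetOperation` until `done` is true, then read the `AnnotateVideoResponse`
// from its `response` field.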
// Service that implements streaming Video Intelligence API.
service StreamingVideoIntelligenceService {
option (google.api.default_host) = "videointelligence.googleapis.com";
option (google.api.oauth_scopes) =
"https://www.googleapis.com/auth/cloud-platform";
// Performs video annotation with bidirectional streaming: emitting results
// while sending video/audio bytes.
// This method is only available via the gRPC API (not REST).
rpc StreamingAnnotateVideo(stream StreamingAnnotateVideoRequest)
returns (stream StreamingAnnotateVideoResponse) {}
}
// Video annotation request.
message AnnotateVideoRequest {
// Input video location. Currently, only
// [Cloud Storage](https://cloud.google.com/storage/) URIs are
// supported. URIs must be specified in the following format:
// `gs://bucket-id/object-id` (other URI formats return
// [google.rpc.Code.INVALID_ARGUMENT][google.rpc.Code.INVALID_ARGUMENT]). For
// more information, see [Request
// URIs](https://cloud.google.com/storage/docs/request-endpoints). To identify
// multiple videos, a video URI may include wildcards in the `object-id`.
// Supported wildcards: '*' to match 0 or more characters;
// '?' to match 1 character. If unset, the input video should be embedded
// in the request as `input_content`. If set, `input_content` must be unset.
string input_uri = 1;
// The video data bytes.
// If unset, the input video(s) should be specified via the `input_uri`.
// If set, `input_uri` must be unset.
bytes input_content = 6;
// Required. Requested video annotation features.
repeated Feature features = 2 [(google.api.field_behavior) = REQUIRED];
// Additional video context and/or feature-specific parameters.
VideoContext video_context = 3;
// Optional. Location where the output (in JSON format) should be stored.
// Currently, only [Cloud Storage](https://cloud.google.com/storage/)
// URIs are supported. These must be specified in the following format:
// `gs://bucket-id/object-id` (other URI formats return
// [google.rpc.Code.INVALID_ARGUMENT][google.rpc.Code.INVALID_ARGUMENT]). For
// more information, see [Request
// URIs](https://cloud.google.com/storage/docs/request-endpoints).
string output_uri = 4 [(google.api.field_behavior) = OPTIONAL];
// Optional. Cloud region where annotation should take place. Supported cloud
// regions are: `us-east1`, `us-west1`, `europe-west1`, `asia-east1`. If no
// region is specified, the region will be determined based on video file
// location.
string location_id = 5 [(google.api.field_behavior) = OPTIONAL];
}
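// A fuller request sketch in JSON form, using a wildcard to match several
// videos and writing results to Cloud Storage (bucket and paths are
// hypothetical):
//
//   {
//     "inputUri": "gs://example-bucket/videos/*.mp4",
//     "features": ["LABEL_DETECTION"],
//     "outputUri": "gs://example-bucket/output/results.json",
//     "locationId": "us-east1"
//   }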
// Video context and/or feature-specific parameters.
message VideoContext {
// Video segments to annotate. The segments may overlap and are not required
// to be contiguous or span the whole video. If unspecified, each video is
// treated as a single segment.
repeated VideoSegment segments = 1;
// Config for LABEL_DETECTION.
LabelDetectionConfig label_detection_config = 2;
// Config for SHOT_CHANGE_DETECTION.
ShotChangeDetectionConfig shot_change_detection_config = 3;
// Config for EXPLICIT_CONTENT_DETECTION.
ExplicitContentDetectionConfig explicit_content_detection_config = 4;
// Config for FACE_DETECTION.
FaceDetectionConfig face_detection_config = 5;
// Config for SPEECH_TRANSCRIPTION.
SpeechTranscriptionConfig speech_transcription_config = 6;
// Config for TEXT_DETECTION.
TextDetectionConfig text_detection_config = 8;
// Config for PERSON_DETECTION.
PersonDetectionConfig person_detection_config = 11;
// Config for OBJECT_TRACKING.
ObjectTrackingConfig object_tracking_config = 13;
}
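// Sketch of a `videoContext` in the JSON request encoding, restricting
// annotation to the first 30 seconds of the video and tuning label detection
// (all values are illustrative):
//
//   "videoContext": {
//     "segments": [
//       { "startTimeOffset": "0s", "endTimeOffset": "30s" }
//     ],
//     "labelDetectionConfig": {
//       "labelDetectionMode": "SHOT_AND_FRAME_MODE",
//       "stationaryCamera": true
//     }
//   }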
// Label detection mode.
enum LabelDetectionMode {
// Unspecified.
LABEL_DETECTION_MODE_UNSPECIFIED = 0;
// Detect shot-level labels.
SHOT_MODE = 1;
// Detect frame-level labels.
FRAME_MODE = 2;
// Detect both shot-level and frame-level labels.
SHOT_AND_FRAME_MODE = 3;
}
// Bucketized representation of likelihood.
enum Likelihood {
// Unspecified likelihood.
LIKELIHOOD_UNSPECIFIED = 0;
// Very unlikely.
VERY_UNLIKELY = 1;
// Unlikely.
UNLIKELY = 2;
// Possible.
POSSIBLE = 3;
// Likely.
LIKELY = 4;
// Very likely.
VERY_LIKELY = 5;
}
// Config for LABEL_DETECTION.
message LabelDetectionConfig {
// What labels should be detected with LABEL_DETECTION, in addition to
// video-level labels or segment-level labels.
// If unspecified, defaults to `SHOT_MODE`.
LabelDetectionMode label_detection_mode = 1;
// Whether the video has been shot from a stationary (i.e., non-moving)
// camera. When set to true, might improve detection accuracy for moving
// objects. Should be used with `SHOT_AND_FRAME_MODE` enabled.
bool stationary_camera = 2;
// Model to use for label detection.
// Supported values: "builtin/stable" (the default if unset) and
// "builtin/latest".
string model = 3;
// The confidence threshold used to filter labels from frame-level
// detection. If not set, it defaults to 0.4. The valid range for this
// threshold is [0.1, 0.9]; any value outside this range will be clipped.
// Note: For best results, keep the default threshold. We will update the
// default threshold every time we release a new model.
float frame_confidence_threshold = 4;
// The confidence threshold used to filter labels from video-level and
// shot-level detections. If not set, it defaults to 0.3. The valid range
// for this threshold is [0.1, 0.9]; any value outside this range will be
// clipped.
// Note: For best results, keep the default threshold. We will update the
// default threshold every time we release a new model.
float video_confidence_threshold = 5;
}
// Streaming video annotation feature.
enum StreamingFeature {
// Unspecified.
STREAMING_FEATURE_UNSPECIFIED = 0;
// Label detection. Detect objects, such as dog or flower.
STREAMING_LABEL_DETECTION = 1;
// Shot change detection.
STREAMING_SHOT_CHANGE_DETECTION = 2;
// Explicit content detection.
STREAMING_EXPLICIT_CONTENT_DETECTION = 3;
// Object detection and tracking.
STREAMING_OBJECT_TRACKING = 4;
// Action recognition based on AutoML model.
STREAMING_AUTOML_ACTION_RECOGNITION = 23;
// Video classification based on AutoML model.
STREAMING_AUTOML_CLASSIFICATION = 21;
// Object detection and tracking based on AutoML model.
STREAMING_AUTOML_OBJECT_TRACKING = 22;
}
// Video annotation feature.
enum Feature {
// Unspecified.
FEATURE_UNSPECIFIED = 0;
// Label detection. Detect objects, such as dog or flower.
LABEL_DETECTION = 1;
// Shot change detection.
SHOT_CHANGE_DETECTION = 2;
// Explicit content detection.
EXPLICIT_CONTENT_DETECTION = 3;
// Human face detection.
FACE_DETECTION = 4;
// Speech transcription.
SPEECH_TRANSCRIPTION = 6;
// OCR text detection and tracking.
TEXT_DETECTION = 7;
// Object detection and tracking.
OBJECT_TRACKING = 9;
// Logo detection, tracking, and recognition.
LOGO_RECOGNITION = 12;
// Celebrity recognition.
CELEBRITY_RECOGNITION = 13;
// Person detection.
PERSON_DETECTION = 14;
}
// Config for SHOT_CHANGE_DETECTION.
message ShotChangeDetectionConfig {
// Model to use for shot change detection.
// Supported values: "builtin/stable" (the default if unset) and
// "builtin/latest".
string model = 1;
}
// Config for OBJECT_TRACKING.
message ObjectTrackingConfig {
// Model to use for object tracking.
// Supported values: "builtin/stable" (the default if unset) and
// "builtin/latest".
string model = 1;
}
// Config for EXPLICIT_CONTENT_DETECTION.
message ExplicitContentDetectionConfig {
// Model to use for explicit content detection.
// Supported values: "builtin/stable" (the default if unset) and
// "builtin/latest".
string model = 1;
}
// Config for FACE_DETECTION.
message FaceDetectionConfig {
// Model to use for face detection.
// Supported values: "builtin/stable" (the default if unset) and
// "builtin/latest".
string model = 1;
// Whether bounding boxes are included in the face annotation output.
bool include_bounding_boxes = 2;
// Whether to enable face attributes detection, such as glasses, dark_glasses,
// mouth_open etc. Ignored if 'include_bounding_boxes' is set to false.
bool include_attributes = 5;
}
// Config for PERSON_DETECTION.
message PersonDetectionConfig {
// Whether bounding boxes are included in the person detection annotation
// output.
bool include_bounding_boxes = 1;
// Whether to enable pose landmarks detection. Ignored if
// 'include_bounding_boxes' is set to false.
bool include_pose_landmarks = 2;
// Whether to enable person attributes detection, such as cloth color (black,
// blue, etc), type (coat, dress, etc), pattern (plain, floral, etc), hair,
// etc.
// Ignored if 'include_bounding_boxes' is set to false.
bool include_attributes = 3;
}
// Config for TEXT_DETECTION.
message TextDetectionConfig {
// Language hint can be specified if the language to be detected is known a
// priori. It can increase the accuracy of the detection. The language hint
// must be a language code in BCP-47 format.
//
// Automatic language detection is performed if no hint is provided.
repeated string language_hints = 1;
// Model to use for text detection.
// Supported values: "builtin/stable" (the default if unset) and
// "builtin/latest".
string model = 2;
}
// Video segment.
message VideoSegment {
// Time-offset, relative to the beginning of the video,
// corresponding to the start of the segment (inclusive).
google.protobuf.Duration start_time_offset = 1;
// Time-offset, relative to the beginning of the video,
// corresponding to the end of the segment (inclusive).
google.protobuf.Duration end_time_offset = 2;
}
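// In the JSON encoding, `google.protobuf.Duration` values are strings with
// an `s` suffix, so a segment covering 1.5s through 120s of the video looks
// like (illustrative):
//
//   { "startTimeOffset": "1.5s", "endTimeOffset": "120s" }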
// Video segment level annotation results for label detection.
message LabelSegment {
// Video segment where a label was detected.
VideoSegment segment = 1;
// Confidence that the label is accurate. Range: [0, 1].
float confidence = 2;
}
// Video frame level annotation results for label detection.
message LabelFrame {
// Time-offset, relative to the beginning of the video, corresponding to the
// video frame for this location.
google.protobuf.Duration time_offset = 1;
// Confidence that the label is accurate. Range: [0, 1].
float confidence = 2;
}
// Detected entity from video analysis.
message Entity {
// Opaque entity ID. Some IDs may be available in
// [Google Knowledge Graph Search
// API](https://developers.google.com/knowledge-graph/).
string entity_id = 1;
// Textual description, e.g., `Fixed-gear bicycle`.
string description = 2;
// Language code for `description` in BCP-47 format.
string language_code = 3;
}
// Label annotation.
message LabelAnnotation {
// Detected entity.
Entity entity = 1;
// Common categories for the detected entity.
// For example, when the label is `Terrier`, the category is likely `dog`. In
// some cases there may be more than one category; e.g., `Terrier` could also
// be a `pet`.
repeated Entity category_entities = 2;
// All video segments where a label was detected.
repeated LabelSegment segments = 3;
// All video frames where a label was detected.
repeated LabelFrame frames = 4;
}
// Video frame level annotation results for explicit content.
message ExplicitContentFrame {
// Time-offset, relative to the beginning of the video, corresponding to the
// video frame for this location.
google.protobuf.Duration time_offset = 1;
// Likelihood of pornographic content.
Likelihood pornography_likelihood = 2;
}
// Explicit content annotation (based on per-frame visual signals only).
// If no explicit content has been detected in a frame, no annotations are
// present for that frame.
message ExplicitContentAnnotation {
// All video frames where explicit content was detected.
repeated ExplicitContentFrame frames = 1;
}
// Normalized bounding box.
// The normalized vertex coordinates are relative to the original image.
// Range: [0, 1].
message NormalizedBoundingBox {
// Left X coordinate.
float left = 1;
// Top Y coordinate.
float top = 2;
// Right X coordinate.
float right = 3;
// Bottom Y coordinate.
float bottom = 4;
}
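// For example, a box covering the top-left quadrant of a frame would be
// (illustrative values):
//
//   { "left": 0.0, "top": 0.0, "right": 0.5, "bottom": 0.5 }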
// For tracking related features.
// An object at time_offset with attributes, and located with
// normalized_bounding_box.
message TimestampedObject {
// Normalized Bounding box in a frame, where the object is located.
NormalizedBoundingBox normalized_bounding_box = 1;
// Time-offset, relative to the beginning of the video,
// corresponding to the video frame for this object.
google.protobuf.Duration time_offset = 2;
// Optional. The attributes of the object in the bounding box.
repeated DetectedAttribute attributes = 3
[(google.api.field_behavior) = OPTIONAL];
// Optional. The detected landmarks.
repeated DetectedLandmark landmarks = 4
[(google.api.field_behavior) = OPTIONAL];
}
// A track of an object instance.
message Track {
// Video segment of a track.
VideoSegment segment = 1;
// The object with timestamp and attributes per frame in the track.
repeated TimestampedObject timestamped_objects = 2;
// Optional. Attributes in the track level.
repeated DetectedAttribute attributes = 3
[(google.api.field_behavior) = OPTIONAL];
// Optional. The confidence score of the tracked object.
float confidence = 4 [(google.api.field_behavior) = OPTIONAL];
}
// A generic detected attribute represented by name in string format.
message DetectedAttribute {
// The name of the attribute, for example, glasses, dark_glasses, mouth_open.
// A full list of supported type names will be provided in the document.
string name = 1;
// Detected attribute confidence. Range [0, 1].
float confidence = 2;
// Text value of the detection result. For example, the value for "HairColor"
// can be "black", "blonde", etc.
string value = 3;
}
// Celebrity definition.
message Celebrity {
// The resource name of the celebrity. It has the format
// `video-intelligence/kg-mid`, indicating a celebrity from the preloaded
// gallery, where `kg-mid` is the celebrity's unique ID in the Google
// Knowledge Graph.
string name = 1;
// The celebrity name.
string display_name = 2;
// Textual description of additional information about the celebrity, if
// applicable.
string description = 3;
}
// The annotation result of a celebrity face track. RecognizedCelebrity field
// could be empty if the face track does not have any matched celebrities.
message CelebrityTrack {
// The recognized celebrity with confidence score.
message RecognizedCelebrity {
// The recognized celebrity.
Celebrity celebrity = 1;
// Recognition confidence. Range [0, 1].
float confidence = 2;
}
// The top-N celebrity matches for the face in this track.
repeated RecognizedCelebrity celebrities = 1;
// A track of a person's face.
Track face_track = 3;
}
// Celebrity recognition annotation per video.
message CelebrityRecognitionAnnotation {
// The tracks detected from the input video, including recognized celebrities
// and other detected faces in the video.
repeated CelebrityTrack celebrity_tracks = 1;
}
// A generic detected landmark represented by name in string format and a 2D
// location.
message DetectedLandmark {
// The name of this landmark, for example, left_hand, right_shoulder.
string name = 1;
// The 2D point of the detected landmark using the normalized image
// coordinate system. The normalized coordinates range from 0 to 1.
NormalizedVertex point = 2;
// The confidence score of the detected landmark. Range [0, 1].
float confidence = 3;
}
// Face detection annotation.
message FaceDetectionAnnotation {
// The face tracks with attributes.
repeated Track tracks = 3;
// The thumbnail of a person's face.
bytes thumbnail = 4;
}
// Person detection annotation per video.
message PersonDetectionAnnotation {
// The detected tracks of a person.
repeated Track tracks = 1;
}
// Annotation results for a single video.
message VideoAnnotationResults {
// Video file location in
// [Cloud Storage](https://cloud.google.com/storage/).
string input_uri = 1;
// Video segment on which the annotation is run.
VideoSegment segment = 10;
// Topical label annotations on video level or user-specified segment level.
// There is exactly one element for each unique label.
repeated LabelAnnotation segment_label_annotations = 2;
// Presence label annotations on video level or user-specified segment level.
// There is exactly one element for each unique label. Compared to the
// existing topical `segment_label_annotations`, this field presents more
// fine-grained, segment-level labels detected in video content and is made
// available only when the client sets `LabelDetectionConfig.model` to
// "builtin/latest" in the request.
repeated LabelAnnotation segment_presence_label_annotations = 23;
// Topical label annotations on shot level.
// There is exactly one element for each unique label.
repeated LabelAnnotation shot_label_annotations = 3;
// Presence label annotations on shot level. There is exactly one element for
// each unique label. Compared to the existing topical
// `shot_label_annotations`, this field presents more fine-grained, shot-level
// labels detected in video content and is made available only when the client
// sets `LabelDetectionConfig.model` to "builtin/latest" in the request.
repeated LabelAnnotation shot_presence_label_annotations = 24;
// Label annotations on frame level.
// There is exactly one element for each unique label.
repeated LabelAnnotation frame_label_annotations = 4;
// Face detection annotations.
repeated FaceDetectionAnnotation face_detection_annotations = 13;
// Shot annotations. Each shot is represented as a video segment.
repeated VideoSegment shot_annotations = 6;
// Explicit content annotation.
ExplicitContentAnnotation explicit_annotation = 7;
// Speech transcription.
repeated SpeechTranscription speech_transcriptions = 11;
// OCR text detection and tracking.
// Annotations for the list of detected text snippets. Each snippet has a
// list of associated frame information.
repeated TextAnnotation text_annotations = 12;
// Annotations for the list of objects detected and tracked in the video.
repeated ObjectTrackingAnnotation object_annotations = 14;
// Annotations for the list of logos detected, tracked, and recognized in
// the video.
repeated LogoRecognitionAnnotation logo_recognition_annotations = 19;
// Person detection annotations.
repeated PersonDetectionAnnotation person_detection_annotations = 20;
// Celebrity recognition annotations.
CelebrityRecognitionAnnotation celebrity_recognition_annotations = 21;
// If set, indicates an error. Note that for a single `AnnotateVideoRequest`
// some videos may succeed and some may fail.
google.rpc.Status error = 9;
}
// Video annotation response. Included in the `response`
// field of the `Operation` returned by the `GetOperation`
// call of the `google::longrunning::Operations` service.
message AnnotateVideoResponse {
// Annotation results for all videos specified in `AnnotateVideoRequest`.
repeated VideoAnnotationResults annotation_results = 1;
}
// Annotation progress for a single video.
message VideoAnnotationProgress {
// Video file location in
// [Cloud Storage](https://cloud.google.com/storage/).
string input_uri = 1;
// Approximate percentage processed thus far. Guaranteed to be
// 100 when fully processed.
int32 progress_percent = 2;
// Time when the request was received.
google.protobuf.Timestamp start_time = 3;
// Time of the most recent update.
google.protobuf.Timestamp update_time = 4;
// Specifies which feature is being tracked if the request contains more than
// one feature.
Feature feature = 5;
// Specifies which segment is being tracked if the request contains more than
// one segment.
VideoSegment segment = 6;
}
// Video annotation progress. Included in the `metadata`
// field of the `Operation` returned by the `GetOperation`
// call of the `google::longrunning::Operations` service.
message AnnotateVideoProgress {
// Progress metadata for all videos specified in `AnnotateVideoRequest`.
repeated VideoAnnotationProgress annotation_progress = 1;
}
// Config for SPEECH_TRANSCRIPTION.
message SpeechTranscriptionConfig {
// Required. The language of the supplied audio as a
// [BCP-47](https://www.rfc-editor.org/rfc/bcp/bcp47.txt) language tag.
// Example: "en-US".
// See [Language Support](https://cloud.google.com/speech/docs/languages)
// for a list of the currently supported language codes.
string language_code = 1 [(google.api.field_behavior) = REQUIRED];
// Optional. Maximum number of recognition hypotheses to be returned.
// Specifically, the maximum number of `SpeechRecognitionAlternative` messages
// within each `SpeechTranscription`. The server may return fewer than
// `max_alternatives`. Valid values are `0`-`30`. A value of `0` or `1`, or
// omitting the field, returns at most one alternative.
int32 max_alternatives = 2 [(google.api.field_behavior) = OPTIONAL];
// Optional. If set to `true`, the server will attempt to filter out
// profanities, replacing all but the initial character in each filtered word
// with asterisks, e.g. "f***". If set to `false` or omitted, profanities
// won't be filtered out.
bool filter_profanity = 3 [(google.api.field_behavior) = OPTIONAL];
// Optional. A means to provide context to assist the speech recognition.
repeated SpeechContext speech_contexts = 4
[(google.api.field_behavior) = OPTIONAL];
// Optional. If 'true', adds punctuation to recognition result hypotheses.
// This feature is only available in select languages. Setting this for
// requests in other languages has no effect at all. The default 'false' value
// does not add punctuation to result hypotheses. NOTE: "This is currently
// offered as an experimental service, complimentary to all users. In the
// future this may be exclusively available as a premium feature."
bool enable_automatic_punctuation = 5
[(google.api.field_behavior) = OPTIONAL];
// Optional. For file formats, such as MXF or MKV, supporting multiple audio
// tracks, specify up to two tracks. Default: track 0.
repeated int32 audio_tracks = 6 [(google.api.field_behavior) = OPTIONAL];
// Optional. If 'true', enables speaker detection for each recognized word in
// the top alternative of the recognition result using a speaker_tag provided
// in the WordInfo.
// Note: When this is true, we send all the words from the beginning of the
// audio for the top alternative in every consecutive response.
// This is done in order to improve our speaker tags as our models learn to
// identify the speakers in the conversation over time.
bool enable_speaker_diarization = 7 [(google.api.field_behavior) = OPTIONAL];
// Optional. If set, specifies the estimated number of speakers in the
// conversation. If not set, defaults to '2'. Ignored unless
// enable_speaker_diarization is set to true.
int32 diarization_speaker_count = 8 [(google.api.field_behavior) = OPTIONAL];
// Optional. If `true`, the top result includes a list of words and the
// confidence for those words. If `false`, no word-level confidence
// information is returned. The default is `false`.
bool enable_word_confidence = 9 [(google.api.field_behavior) = OPTIONAL];
}
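// Sketch of a transcription config in the JSON request encoding, with
// punctuation and two-speaker diarization enabled (values illustrative):
//
//   "speechTranscriptionConfig": {
//     "languageCode": "en-US",
//     "enableAutomaticPunctuation": true,
//     "enableSpeakerDiarization": true,
//     "diarizationSpeakerCount": 2
//   }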
// Provides "hints" to the speech recognizer to favor specific words and phrases
// in the results.
message SpeechContext {
// Optional. A list of strings containing words and phrases "hints" so that
// the speech recognition is more likely to recognize them. This can be used
// to improve the accuracy for specific words and phrases, for example, if
// specific commands are typically spoken by the user. This can also be used
// to add additional words to the vocabulary of the recognizer. See
// [usage limits](https://cloud.google.com/speech/limits#content).
repeated string phrases = 1 [(google.api.field_behavior) = OPTIONAL];
}
// A speech recognition result corresponding to a portion of the audio.
message SpeechTranscription {
// May contain one or more recognition hypotheses (up to the maximum specified
// in `max_alternatives`). These alternatives are ordered in terms of
// accuracy, with the top (first) alternative being the most probable, as
// ranked by the recognizer.
repeated SpeechRecognitionAlternative alternatives = 1;
// Output only. The [BCP-47](https://www.rfc-editor.org/rfc/bcp/bcp47.txt)
// language tag of the language in this result. This language code was
// detected to have the most likelihood of being spoken in the audio.
string language_code = 2 [(google.api.field_behavior) = OUTPUT_ONLY];
}
// Alternative hypotheses (a.k.a. n-best list).
message SpeechRecognitionAlternative {
// Transcript text representing the words that the user spoke.
string transcript = 1;
// Output only. The confidence estimate between 0.0 and 1.0. A higher number
// indicates an estimated greater likelihood that the recognized words are
// correct. This field is set only for the top alternative.
// This field is not guaranteed to be accurate and users should not rely on it
// to be always provided.
// The default of 0.0 is a sentinel value indicating `confidence` was not set.
float confidence = 2 [(google.api.field_behavior) = OUTPUT_ONLY];
// Output only. A list of word-specific information for each recognized word.
// Note: When `enable_speaker_diarization` is set to true, you will see all
// the words from the beginning of the audio.
repeated WordInfo words = 3 [(google.api.field_behavior) = OUTPUT_ONLY];
}
// Word-specific information for recognized words. Word information is only
// included in the response when certain request parameters are set, such
// as `enable_word_time_offsets`.
message WordInfo {
// Time offset relative to the beginning of the audio, and
// corresponding to the start of the spoken word. This field is only set if
// `enable_word_time_offsets=true` and only in the top hypothesis. This is an
// experimental feature and the accuracy of the time offset can vary.
google.protobuf.Duration start_time = 1;
// Time offset relative to the beginning of the audio, and
// corresponding to the end of the spoken word. This field is only set if
// `enable_word_time_offsets=true` and only in the top hypothesis. This is an
// experimental feature and the accuracy of the time offset can vary.
google.protobuf.Duration end_time = 2;
// The word corresponding to this set of information.
string word = 3;
// Output only. The confidence estimate between 0.0 and 1.0. A higher number
// indicates an estimated greater likelihood that the recognized words are
// correct. This field is set only for the top alternative.
// This field is not guaranteed to be accurate and users should not rely on it
// to be always provided.
// The default of 0.0 is a sentinel value indicating `confidence` was not set.
float confidence = 4 [(google.api.field_behavior) = OUTPUT_ONLY];
// Output only. A distinct integer value is assigned for every speaker within
// the audio. This field specifies which one of those speakers was detected to
// have spoken this word. Value ranges from 1 up to diarization_speaker_count,
// and is only set if speaker diarization is enabled.
int32 speaker_tag = 5 [(google.api.field_behavior) = OUTPUT_ONLY];
}
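// An illustrative `WordInfo` element in JSON form, assuming word time
// offsets and diarization are populated (values invented for illustration):
//
//   { "startTime": "4.1s", "endTime": "4.5s", "word": "hello",
//     "confidence": 0.92, "speakerTag": 1 }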
// A vertex represents a 2D point in the image.
// NOTE: the normalized vertex coordinates are relative to the original image
// and range from 0 to 1.
message NormalizedVertex {
// X coordinate.
float x = 1;
// Y coordinate.
float y = 2;
}
// Normalized bounding polygon for text (that might not be aligned with axis).
// Contains list of the corner points in clockwise order starting from
// top-left corner. For example, for a rectangular bounding box:
// When the text is horizontal it might look like:
// 0----1
// | |
// 3----2
//
// When it's clockwise rotated 180 degrees around the top-left corner it
// becomes:
// 2----3
// | |
// 1----0
//
// and the vertex order will still be (0, 1, 2, 3). Note that values can be
// less than 0 or greater than 1 due to trigonometric calculations for the
// location of the box.
message NormalizedBoundingPoly {
// Normalized vertices of the bounding polygon.
repeated NormalizedVertex vertices = 1;
}
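// For the horizontal case in the diagram above, the vertices might be
// (illustrative coordinates, clockwise from the top-left corner):
//
//   { "vertices": [ { "x": 0.1, "y": 0.1 }, { "x": 0.9, "y": 0.1 },
//                   { "x": 0.9, "y": 0.2 }, { "x": 0.1, "y": 0.2 } ] }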
// Video segment level annotation results for text detection.
message TextSegment {
// Video segment where a text snippet was detected.
VideoSegment segment = 1;
// Confidence for the track of detected text. It is calculated as the highest
// over all frames where OCR detected text appears.
float confidence = 2;
// Information related to the frames where OCR detected text appears.
repeated TextFrame frames = 3;
}
// Video frame level annotation results for text annotation (OCR).
// Contains information regarding timestamp and bounding box locations for the
// frames containing detected OCR text snippets.
message TextFrame {
// Bounding polygon of the detected text for this frame.
NormalizedBoundingPoly rotated_bounding_box = 1;
// Timestamp of this frame.
google.protobuf.Duration time_offset = 2;
}
// Annotations related to one detected OCR text snippet. This will contain the
// corresponding text, confidence value, and frame level information for each
// detection.
message TextAnnotation {
// The detected text.
string text = 1;
// All video segments where OCR detected text appears.
repeated TextSegment segments = 2;
}
// Video frame level annotations for object detection and tracking. This field
// stores per frame location, time offset, and confidence.
message ObjectTrackingFrame {
// The normalized bounding box location of this object track for the frame.
NormalizedBoundingBox normalized_bounding_box = 1;
// The timestamp of the frame, as a time offset relative to the beginning
// of the video.
google.protobuf.Duration time_offset = 2;
}
// Annotations corresponding to one tracked object.
message ObjectTrackingAnnotation {
// Different representation of tracking info in non-streaming batch
// and streaming modes.
oneof track_info {
// Non-streaming batch mode ONLY.
// Each object track corresponds to one video segment where it appears.
VideoSegment segment = 3;
// Streaming mode ONLY.
// In streaming mode, we do not know the end time of a tracked object
// before it is completed, so no VideoSegment info is returned. Instead, we
// provide a unique, identifiable integer `track_id` so that customers can
// correlate the results of the ongoing `ObjectTrackingAnnotation` messages
// that share the same `track_id` over time.
int64 track_id = 5;
}
// Entity to specify the object category that this track is labeled as.
Entity entity = 1;
// Object category's labeling confidence of this track.
float confidence = 4;
// Information corresponding to all frames where this object track appears.
// Non-streaming batch mode: `frames` may contain one or more
// ObjectTrackingFrame messages.
// Streaming mode: `frames` contains exactly one ObjectTrackingFrame message.
repeated ObjectTrackingFrame frames = 2;
}
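// Illustrative shapes of the two modes in JSON form (field values invented):
// a batch annotation might look like
//
//   { "segment": {...}, "entity": {...}, "confidence": 0.8,
//     "frames": [ <one or more ObjectTrackingFrame> ] }
//
// while streaming responses for the same object each carry a single frame
// and are correlated by `trackId` (int64 encodes as a JSON string):
//
//   { "trackId": "17", "entity": {...}, "confidence": 0.8,
//     "frames": [ <exactly one ObjectTrackingFrame> ] }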
// Annotation corresponding to one detected, tracked and recognized logo class.
message LogoRecognitionAnnotation {
// Entity category information to specify the logo class that all the logo
// tracks within this LogoRecognitionAnnotation are recognized as.
Entity entity = 1;
// All logo tracks where the recognized logo appears. Each track corresponds
// to one logo instance appearing in consecutive frames.
repeated Track tracks = 2;
// All video segments where the recognized logo appears. There might be
// multiple instances of the same logo class appearing in one VideoSegment.
repeated VideoSegment segments = 3;
}
// The top-level message sent by the client for the `StreamingAnnotateVideo`
// method. Multiple `StreamingAnnotateVideoRequest` messages are sent.
// The first message must only contain a `StreamingVideoConfig` message.
// All subsequent messages must only contain `input_content` data.
message StreamingAnnotateVideoRequest {
// *Required* The streaming request, which is either a streaming config or
// video content.
oneof streaming_request {
// Provides information to the annotator, specifying how to process the
// request. The first `StreamingAnnotateVideoRequest` message must only
// contain a `video_config` message.
StreamingVideoConfig video_config = 1;
// The video data to be annotated. Chunks of video data are sent
// sequentially in `StreamingAnnotateVideoRequest` messages. Except for the
// initial `StreamingAnnotateVideoRequest` message containing only
// `video_config`, all subsequent `StreamingAnnotateVideoRequest` messages
// must only contain the `input_content` field.
// Note: as with all bytes fields, protocol buffers use a pure binary
// representation (not base64).
bytes input_content = 2;
}
}
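// Illustrative message sequence on the gRPC stream (a sketch in textproto
// form; chunk contents elided):
//
//   # 1st message: configuration only.
//   video_config {
//     feature: STREAMING_LABEL_DETECTION
//     label_detection_config {}
//   }
//   # 2nd and later messages: raw video bytes only.
//   input_content: "<chunk of video bytes>"
//   input_content: "<next chunk of video bytes>"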
// Provides information to the annotator that specifies how to process the
// request.
message StreamingVideoConfig {
// Config for requested annotation feature.
oneof streaming_config {
// Config for STREAMING_SHOT_CHANGE_DETECTION.
StreamingShotChangeDetectionConfig shot_change_detection_config = 2;
// Config for STREAMING_LABEL_DETECTION.
StreamingLabelDetectionConfig label_detection_config = 3;
// Config for STREAMING_EXPLICIT_CONTENT_DETECTION.
StreamingExplicitContentDetectionConfig explicit_content_detection_config =
4;
// Config for STREAMING_OBJECT_TRACKING.
StreamingObjectTrackingConfig object_tracking_config = 5;
// Config for STREAMING_AUTOML_ACTION_RECOGNITION.
StreamingAutomlActionRecognitionConfig automl_action_recognition_config =
23;
// Config for STREAMING_AUTOML_CLASSIFICATION.
StreamingAutomlClassificationConfig automl_classification_config = 21;
// Config for STREAMING_AUTOML_OBJECT_TRACKING.
StreamingAutomlObjectTrackingConfig automl_object_tracking_config = 22;
}
// Requested annotation feature.
StreamingFeature feature = 1;
// Streaming storage option. By default: storage is disabled.
StreamingStorageConfig storage_config = 30;
}