@@ -306,19 +306,24 @@ message RecognitionConfig {
306
306
// *Optional* If 'true', enables speaker detection for each recognized word in
307
307
// the top alternative of the recognition result using a speaker_tag provided
308
308
// in the WordInfo.
309
- // Note: When this is true, we send all the words from the beginning of the
309
+ // Note: Use diarization_config instead.
310
+ bool enable_speaker_diarization = 16 [deprecated = true ];
311
+
312
+ // *Optional*
313
+ // If set, specifies the estimated number of speakers in the conversation.
314
+ // Defaults to '2'. Ignored unless enable_speaker_diarization is set to true.
315
+ // Note: Use diarization_config instead.
316
+ int32 diarization_speaker_count = 17 [deprecated = true ];
317
+
318
+ // *Optional* Config to enable speaker diarization and set additional
319
+ // parameters to make diarization better suited for your application.
320
+ // Note: When this is enabled, we send all the words from the beginning of the
310
321
// audio for the top alternative in every consecutive STREAMING response.
311
322
// This is done in order to improve our speaker tags as our models learn to
312
323
// identify the speakers in the conversation over time.
313
324
// For non-streaming requests, the diarization results will be provided only
314
325
// in the top alternative of the FINAL SpeechRecognitionResult.
315
- bool enable_speaker_diarization = 16 ;
316
-
317
- // *Optional*
318
- // If set, specifies the estimated number of speakers in the conversation.
319
- // If not set, defaults to '2'.
320
- // Ignored unless enable_speaker_diarization is set to true."
321
- int32 diarization_speaker_count = 17 ;
326
+ SpeakerDiarizationConfig diarization_config = 19 ;
322
327
323
328
// *Optional* Metadata regarding this request.
324
329
RecognitionMetadata metadata = 9 ;
@@ -368,6 +373,29 @@ message RecognitionConfig {
368
373
bool use_enhanced = 14 ;
369
374
}
370
375
376
// Config to enable speaker diarization.
377
+ message SpeakerDiarizationConfig {
378
+ // *Optional* If 'true', enables speaker detection for each recognized word in
379
+ // the top alternative of the recognition result using a speaker_tag provided
380
+ // in the WordInfo.
381
+ bool enable_speaker_diarization = 1 ;
382
+
383
+ // Note: Set min_speaker_count equal to max_speaker_count to fix the number
384
+ // of speakers to be detected in the audio.
385
+
386
+ // *Optional*
387
+ // Minimum number of speakers in the conversation. Together with
388
+ // max_speaker_count, this defines the range within which the system can
389
+ // automatically determine the number of speakers. If not set, defaults to 2.
390
+ int32 min_speaker_count = 2 ;
391
+
392
+ // *Optional*
393
+ // Maximum number of speakers in the conversation. Together with
394
+ // min_speaker_count, this defines the range within which the system can
395
+ // automatically determine the number of speakers. If not set, defaults to 6.
396
+ int32 max_speaker_count = 3 ;
397
+ }
398
+
371
399
// Description of audio data to be recognized.
372
400
message RecognitionMetadata {
373
401
// Use case categories that the audio recognition request can be described
0 commit comments