feat: Support for spoken punctuation and spoken emojis (#143)
yoshi-automation committed Apr 8, 2021
1 parent cc9cc3e commit b6bddbe
Showing 7 changed files with 60 additions and 16 deletions.
4 changes: 2 additions & 2 deletions google/cloud/speech_v1p1beta1/__init__.py
@@ -60,6 +60,7 @@ class SpeechClient(SpeechHelpers, SpeechClient):


__all__ = (
"AdaptationClient",
"CreateCustomClassRequest",
"CreatePhraseSetRequest",
"CustomClass",
@@ -82,7 +83,6 @@ class SpeechClient(SpeechHelpers, SpeechClient):
"RecognizeResponse",
"SpeakerDiarizationConfig",
"SpeechAdaptation",
"SpeechClient",
"SpeechContext",
"SpeechRecognitionAlternative",
"SpeechRecognitionResult",
@@ -94,5 +94,5 @@ class SpeechClient(SpeechHelpers, SpeechClient):
"UpdateCustomClassRequest",
"UpdatePhraseSetRequest",
"WordInfo",
"AdaptationClient",
"SpeechClient",
)
26 changes: 22 additions & 4 deletions google/cloud/speech_v1p1beta1/proto/cloud_speech.proto
@@ -1,3 +1,4 @@
+
// Copyright 2021 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
@@ -24,6 +25,7 @@ import "google/longrunning/operations.proto";
import "google/protobuf/any.proto";
import "google/protobuf/duration.proto";
import "google/protobuf/timestamp.proto";
import "google/protobuf/wrappers.proto";
import "google/rpc/status.proto";

option cc_enable_arenas = true;
@@ -316,15 +318,15 @@ message RecognitionConfig {
// Speech adaptation configuration improves the accuracy of speech
// recognition. When speech adaptation is set it supersedes the
// `speech_contexts` field. For more information, see the [speech
- // adaptation](https://cloud.google.com/speech-to-text/docs/context-strength)
+ // adaptation](https://cloud.google.com/speech-to-text/docs/adaptation)
// documentation.
SpeechAdaptation adaptation = 20;

// Array of [SpeechContext][google.cloud.speech.v1p1beta1.SpeechContext].
// A means to provide context to assist the speech recognition. For more
// information, see
// [speech
- // adaptation](https://cloud.google.com/speech-to-text/docs/context-strength).
+ // adaptation](https://cloud.google.com/speech-to-text/docs/adaptation).
repeated SpeechContext speech_contexts = 6;

// If `true`, the top result includes a list of words and
@@ -344,6 +346,22 @@
// The default 'false' value does not add punctuation to result hypotheses.
bool enable_automatic_punctuation = 11;

+ // The spoken punctuation behavior for the call
+ // If not set, uses default behavior based on model of choice
+ // e.g. command_and_search will enable spoken punctuation by default
+ // If 'true', replaces spoken punctuation with the corresponding symbols in
+ // the request. For example, "how are you question mark" becomes "how are
+ // you?". See https://cloud.google.com/speech-to-text/docs/spoken-punctuation
+ // for support. If 'false', spoken punctuation is not replaced.
+ google.protobuf.BoolValue enable_spoken_punctuation = 22;
+
+ // The spoken emoji behavior for the call
+ // If not set, uses default behavior based on model of choice
+ // If 'true', adds spoken emoji formatting for the request. This will replace
+ // spoken emojis with the corresponding Unicode symbols in the final
+ // transcript. If 'false', spoken emojis are not replaced.
+ google.protobuf.BoolValue enable_spoken_emojis = 23;
+
// If 'true', enables speaker detection for each recognized word in
// the top alternative of the recognition result using a speaker_tag provided
// in the WordInfo.
@@ -674,8 +692,8 @@ message LongRunningRecognizeMetadata {
// audio, and `single_utterance` is set to false, then no messages are streamed
// back to the client.
//
- // Here's an example of a series of ten `StreamingRecognizeResponse`s that might
- // be returned while processing audio:
+ // Here's an example of a series of `StreamingRecognizeResponse`s that might be
+ // returned while processing audio:
//
// 1. results { alternatives { transcript: "tube" } stability: 0.01 }
//
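Both new fields are typed google.protobuf.BoolValue rather than plain bool, so a request can distinguish "not set, use the model's default" from an explicit false. A minimal sketch of that tri-state, assuming only the protobuf package is installed (values are illustrative):

```python
# Tri-state behavior of the new BoolValue fields; the semantics are taken
# from the proto comments above, the values are illustrative.
from google.protobuf import wrappers_pb2

explicit_on = wrappers_pb2.BoolValue(value=True)    # replace spoken punctuation
explicit_off = wrappers_pb2.BoolValue(value=False)  # never replace it
# Leaving the field unset entirely (no BoolValue) lets the service fall
# back to the model's default, e.g. command_and_search enables spoken
# punctuation by default.
```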
2 changes: 1 addition & 1 deletion google/cloud/speech_v1p1beta1/services/speech/async_client.py
@@ -394,7 +394,7 @@ def streaming_recognize(
single_utterance is set to false, then no messages
are streamed back to the client.
- Here's an example of a series of ten
+ Here's an example of a series of
StreamingRecognizeResponses that might be returned
while processing audio:
2 changes: 1 addition & 1 deletion google/cloud/speech_v1p1beta1/services/speech/client.py
@@ -584,7 +584,7 @@ def streaming_recognize(
single_utterance is set to false, then no messages
are streamed back to the client.
- Here's an example of a series of ten
+ Here's an example of a series of
StreamingRecognizeResponses that might be returned
while processing audio:
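For context on the docstring being corrected here, a streaming call looks roughly like the sketch below. It assumes google-cloud-speech is installed; "audio.raw" (16 kHz LINEAR16 audio) and the 4096-byte chunk size are illustrative.

```python
# A sketch of a streaming call that would yield the series of
# StreamingRecognizeResponse messages described in the docstring above.
from google.cloud import speech_v1p1beta1 as speech

client = speech.SpeechClient()

streaming_config = speech.StreamingRecognitionConfig(
    config=speech.RecognitionConfig(
        encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=16000,
        language_code="en-US",
    ),
    interim_results=True,  # interim results carry the stability scores
)

def audio_chunks():
    with open("audio.raw", "rb") as audio:
        while chunk := audio.read(4096):
            yield speech.StreamingRecognizeRequest(audio_content=chunk)

# The helper wraps the config and audio requests into one bidirectional
# stream and yields StreamingRecognizeResponse messages as they arrive.
for response in client.streaming_recognize(streaming_config, audio_chunks()):
    for result in response.results:
        print(result.stability, result.alternatives[0].transcript)
```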
35 changes: 30 additions & 5 deletions google/cloud/speech_v1p1beta1/types/cloud_speech.py
@@ -21,6 +21,7 @@
from google.cloud.speech_v1p1beta1.types import resource
from google.protobuf import duration_pb2 as duration # type: ignore
from google.protobuf import timestamp_pb2 as timestamp # type: ignore
+ from google.protobuf import wrappers_pb2 as wrappers # type: ignore
from google.rpc import status_pb2 as status # type: ignore


@@ -271,14 +272,14 @@ class RecognitionConfig(proto.Message):
speech recognition. When speech adaptation is set it
supersedes the ``speech_contexts`` field. For more
information, see the `speech
- adaptation <https://cloud.google.com/speech-to-text/docs/context-strength>`__
+ adaptation <https://cloud.google.com/speech-to-text/docs/adaptation>`__
documentation.
speech_contexts (Sequence[google.cloud.speech_v1p1beta1.types.SpeechContext]):
Array of
[SpeechContext][google.cloud.speech.v1p1beta1.SpeechContext].
A means to provide context to assist the speech recognition.
For more information, see `speech
- adaptation <https://cloud.google.com/speech-to-text/docs/context-strength>`__.
+ adaptation <https://cloud.google.com/speech-to-text/docs/adaptation>`__.
enable_word_time_offsets (bool):
If ``true``, the top result includes a list of words and the
start and end time offsets (timestamps) for those words. If
@@ -296,6 +297,23 @@
requests in other languages has no effect at
all. The default 'false' value does not add
punctuation to result hypotheses.
+ enable_spoken_punctuation (google.protobuf.wrappers_pb2.BoolValue):
+     The spoken punctuation behavior for the call If not set,
+     uses default behavior based on model of choice e.g.
+     command_and_search will enable spoken punctuation by default
+     If 'true', replaces spoken punctuation with the
+     corresponding symbols in the request. For example, "how are
+     you question mark" becomes "how are you?". See
+     https://cloud.google.com/speech-to-text/docs/spoken-punctuation
+     for support. If 'false', spoken punctuation is not replaced.
+ enable_spoken_emojis (google.protobuf.wrappers_pb2.BoolValue):
+     The spoken emoji behavior for the call
+     If not set, uses default behavior based on model
+     of choice If 'true', adds spoken emoji
+     formatting for the request. This will replace
+     spoken emojis with the corresponding Unicode
+     symbols in the final transcript. If 'false',
+     spoken emojis are not replaced.
enable_speaker_diarization (bool):
If 'true', enables speaker detection for each recognized
word in the top alternative of the recognition result using
@@ -436,6 +454,14 @@ class AudioEncoding(proto.Enum):

enable_automatic_punctuation = proto.Field(proto.BOOL, number=11)

+ enable_spoken_punctuation = proto.Field(
+     proto.MESSAGE, number=22, message=wrappers.BoolValue,
+ )
+
+ enable_spoken_emojis = proto.Field(
+     proto.MESSAGE, number=23, message=wrappers.BoolValue,
+ )
+
enable_speaker_diarization = proto.Field(proto.BOOL, number=16)

diarization_speaker_count = proto.Field(proto.INT32, number=17)
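At the client level, the new wrapper-typed fields are set like any other message field. A sketch assuming a version of google-cloud-speech that includes this change; the encoding, sample rate, and language are illustrative:

```python
# Constructing a RecognitionConfig that uses the new fields. proto-plus
# accepts the BoolValue wrapper message for these message-typed fields.
from google.cloud import speech_v1p1beta1 as speech
from google.protobuf import wrappers_pb2

config = speech.RecognitionConfig(
    encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
    sample_rate_hertz=16000,
    language_code="en-US",
    # "how are you question mark" -> "how are you?"
    enable_spoken_punctuation=wrappers_pb2.BoolValue(value=True),
    # Leave enable_spoken_emojis unset to keep the model's default.
)
```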
@@ -749,9 +775,8 @@ class StreamingRecognizeResponse(proto.Message):
client. If there is no recognizable audio, and ``single_utterance``
is set to false, then no messages are streamed back to the client.
- Here's an example of a series of ten
- ``StreamingRecognizeResponse``\ s that might be returned while
- processing audio:
+ Here's an example of a series of ``StreamingRecognizeResponse``\ s
+ that might be returned while processing audio:
1. results { alternatives { transcript: "tube" } stability: 0.01 }
6 changes: 3 additions & 3 deletions synth.metadata
@@ -4,15 +4,15 @@
"git": {
"name": ".",
"remote": "https://github.com/googleapis/python-speech.git",
"sha": "5da4b5590e092c993688f9a048efd08ff2e65407"
"sha": "cc9cc3ecdef32a8bf1198aa2ff8561398bf359f8"
}
},
{
"git": {
"name": "googleapis",
"remote": "https://github.com/googleapis/googleapis.git",
"sha": "149a3a84c29c9b8189576c7442ccb6dcf6a8f95b",
"internalRef": "364411656"
"sha": "847464c110e3cb6a5078b6b15086c73c4b622938",
"internalRef": "367346981"
}
},
{
1 change: 1 addition & 0 deletions tests/unit/gapic/speech_v1p1beta1/test_speech.py
@@ -42,6 +42,7 @@
from google.cloud.speech_v1p1beta1.types import resource
from google.longrunning import operations_pb2
from google.oauth2 import service_account
+ from google.protobuf import wrappers_pb2 as wrappers # type: ignore
from google.rpc import status_pb2 as status # type: ignore
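The wrappers import added here lets the unit tests construct and compare the new BoolValue fields. A round-trip check in the same spirit; the test name and values are hypothetical, not taken from this commit:

```python
# Round-trip sketch: the wrapper fields should survive construction and
# conversion to the underlying protobuf message.
from google.cloud.speech_v1p1beta1.types import cloud_speech
from google.protobuf import wrappers_pb2 as wrappers


def test_spoken_punctuation_roundtrip():
    config = cloud_speech.RecognitionConfig(
        enable_spoken_punctuation=wrappers.BoolValue(value=True),
    )
    pb = cloud_speech.RecognitionConfig.pb(config)
    assert pb.enable_spoken_punctuation.value is True
    # An unset wrapper field stays unset, preserving tri-state semantics.
    assert not pb.HasField("enable_spoken_emojis")
```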


