Skip to content
This repository has been archived by the owner on Apr 20, 2024. It is now read-only.

feat: Support output transcript to Google Cloud Storage for LongRunningRecognize #128

Merged
merged 5 commits into from Mar 19, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
44 changes: 22 additions & 22 deletions google/cloud/speech_v1/types/__init__.py
Expand Up @@ -16,41 +16,41 @@
#

from .cloud_speech import (
RecognizeRequest,
LongRunningRecognizeMetadata,
LongRunningRecognizeRequest,
StreamingRecognizeRequest,
StreamingRecognitionConfig,
LongRunningRecognizeResponse,
RecognitionAudio,
RecognitionConfig,
SpeakerDiarizationConfig,
RecognitionMetadata,
SpeechContext,
RecognitionAudio,
RecognizeRequest,
RecognizeResponse,
LongRunningRecognizeResponse,
LongRunningRecognizeMetadata,
StreamingRecognizeResponse,
StreamingRecognitionResult,
SpeechRecognitionResult,
SpeakerDiarizationConfig,
SpeechContext,
SpeechRecognitionAlternative,
SpeechRecognitionResult,
StreamingRecognitionConfig,
StreamingRecognitionResult,
StreamingRecognizeRequest,
StreamingRecognizeResponse,
WordInfo,
)

__all__ = (
"RecognizeRequest",
"LongRunningRecognizeMetadata",
"LongRunningRecognizeRequest",
"StreamingRecognizeRequest",
"StreamingRecognitionConfig",
"LongRunningRecognizeResponse",
"RecognitionAudio",
"RecognitionConfig",
"SpeakerDiarizationConfig",
"RecognitionMetadata",
"SpeechContext",
"RecognitionAudio",
"RecognizeRequest",
"RecognizeResponse",
"LongRunningRecognizeResponse",
"LongRunningRecognizeMetadata",
"StreamingRecognizeResponse",
"StreamingRecognitionResult",
"SpeechRecognitionResult",
"SpeakerDiarizationConfig",
"SpeechContext",
"SpeechRecognitionAlternative",
"SpeechRecognitionResult",
"StreamingRecognitionConfig",
"StreamingRecognitionResult",
"StreamingRecognizeRequest",
"StreamingRecognizeResponse",
"WordInfo",
)
6 changes: 4 additions & 2 deletions google/cloud/speech_v1p1beta1/__init__.py
Expand Up @@ -33,6 +33,7 @@
from .types.cloud_speech import StreamingRecognitionResult
from .types.cloud_speech import StreamingRecognizeRequest
from .types.cloud_speech import StreamingRecognizeResponse
from .types.cloud_speech import TranscriptOutputConfig
from .types.cloud_speech import WordInfo
from .types.cloud_speech_adaptation import CreateCustomClassRequest
from .types.cloud_speech_adaptation import CreatePhraseSetRequest
Expand All @@ -59,7 +60,6 @@ class SpeechClient(SpeechHelpers, SpeechClient):


__all__ = (
"AdaptationClient",
"CreateCustomClassRequest",
"CreatePhraseSetRequest",
"CustomClass",
Expand All @@ -82,15 +82,17 @@ class SpeechClient(SpeechHelpers, SpeechClient):
"RecognizeResponse",
"SpeakerDiarizationConfig",
"SpeechAdaptation",
"SpeechClient",
"SpeechContext",
"SpeechRecognitionAlternative",
"SpeechRecognitionResult",
"StreamingRecognitionConfig",
"StreamingRecognitionResult",
"StreamingRecognizeRequest",
"StreamingRecognizeResponse",
"TranscriptOutputConfig",
"UpdateCustomClassRequest",
"UpdatePhraseSetRequest",
"WordInfo",
"SpeechClient",
"AdaptationClient",
)
75 changes: 46 additions & 29 deletions google/cloud/speech_v1p1beta1/proto/cloud_speech.proto
Expand Up @@ -19,7 +19,6 @@ package google.cloud.speech.v1p1beta1;
import "google/api/annotations.proto";
import "google/api/client.proto";
import "google/api/field_behavior.proto";
import "google/api/resource.proto";
import "google/cloud/speech/v1p1beta1/resource.proto";
import "google/longrunning/operations.proto";
import "google/protobuf/any.proto";
Expand All @@ -37,8 +36,7 @@ option objc_class_prefix = "GCS";
// Service that implements Google Cloud Speech API.
service Speech {
option (google.api.default_host) = "speech.googleapis.com";
option (google.api.oauth_scopes) =
"https://www.googleapis.com/auth/cloud-platform";
option (google.api.oauth_scopes) = "https://www.googleapis.com/auth/cloud-platform";

// Performs synchronous speech recognition: receive results after all audio
// has been sent and processed.
Expand All @@ -56,8 +54,7 @@ service Speech {
// a `LongRunningRecognizeResponse` message.
// For more information on asynchronous speech recognition, see the
// [how-to](https://cloud.google.com/speech-to-text/docs/async-recognize).
rpc LongRunningRecognize(LongRunningRecognizeRequest)
returns (google.longrunning.Operation) {
rpc LongRunningRecognize(LongRunningRecognizeRequest) returns (google.longrunning.Operation) {
option (google.api.http) = {
post: "/v1p1beta1/speech:longrunningrecognize"
body: "*"
Expand All @@ -71,8 +68,8 @@ service Speech {

// Performs bidirectional streaming speech recognition: receive results while
// sending audio. This method is only available via the gRPC API (not REST).
rpc StreamingRecognize(stream StreamingRecognizeRequest)
returns (stream StreamingRecognizeResponse) {}
rpc StreamingRecognize(stream StreamingRecognizeRequest) returns (stream StreamingRecognizeResponse) {
}
}

// The top-level message sent by the client for the `Recognize` method.
Expand All @@ -94,6 +91,19 @@ message LongRunningRecognizeRequest {

// Required. The audio data to be recognized.
RecognitionAudio audio = 2 [(google.api.field_behavior) = REQUIRED];

// Optional. Specifies an optional destination for the recognition results.
TranscriptOutputConfig output_config = 4 [(google.api.field_behavior) = OPTIONAL];
}

// Specifies an optional destination for the recognition results.
//
// Carried as `output_config` on `LongRunningRecognizeRequest`, and echoed
// back as `output_config` on `LongRunningRecognizeResponse` and
// `LongRunningRecognizeMetadata`.
message TranscriptOutputConfig {
// The kind of destination. Members of this oneof are mutually exclusive;
// `gcs_uri` is the only member currently defined.
oneof output_type {
// Specifies a Cloud Storage URI for the recognition results. Must be
// specified in the format: `gs://bucket_name/object_name`, and the bucket
// must already exist.
string gcs_uri = 1;
}
}

// The top-level message sent by the client for the `StreamingRecognize` method.
Expand Down Expand Up @@ -171,7 +181,7 @@ message RecognitionConfig {
// a lossless encoding (`FLAC` or `LINEAR16`). The accuracy of the speech
// recognition can be reduced if lossy codecs are used to capture or transmit
// audio, particularly if background noise is present. Lossy codecs include
// `MULAW`, `AMR`, `AMR_WB`, `OGG_OPUS`, `SPEEX_WITH_HEADER_BYTE`, and `MP3`.
// `MULAW`, `AMR`, `AMR_WB`, `OGG_OPUS`, `SPEEX_WITH_HEADER_BYTE`, `MP3`.
//
// The `FLAC` and `WAV` audio file formats include a header that describes the
// included audio content. You can request recognition for `WAV` files that
Expand All @@ -182,8 +192,7 @@ message RecognitionConfig {
// an `AudioEncoding` when you send `FLAC` or `WAV` audio, the
// encoding configuration must match the encoding described in the audio
// header; otherwise the request returns an
// [google.rpc.Code.INVALID_ARGUMENT][google.rpc.Code.INVALID_ARGUMENT] error
// code.
// [google.rpc.Code.INVALID_ARGUMENT][google.rpc.Code.INVALID_ARGUMENT] error code.
enum AudioEncoding {
// Not specified.
ENCODING_UNSPECIFIED = 0;
Expand Down Expand Up @@ -237,8 +246,7 @@ message RecognitionConfig {

// Encoding of audio data sent in all `RecognitionAudio` messages.
// This field is optional for `FLAC` and `WAV` audio files and required
// for all other audio formats. For details, see
// [AudioEncoding][google.cloud.speech.v1p1beta1.RecognitionConfig.AudioEncoding].
// for all other audio formats. For details, see [AudioEncoding][google.cloud.speech.v1p1beta1.RecognitionConfig.AudioEncoding].
AudioEncoding encoding = 1;

// Sample rate in Hertz of the audio data sent in all
Expand All @@ -247,8 +255,7 @@ message RecognitionConfig {
// source to 16000 Hz. If that's not possible, use the native sample rate of
// the audio source (instead of re-sampling).
// This field is optional for FLAC and WAV audio files, but is
// required for all other audio formats. For details, see
// [AudioEncoding][google.cloud.speech.v1p1beta1.RecognitionConfig.AudioEncoding].
// required for all other audio formats. For details, see [AudioEncoding][google.cloud.speech.v1p1beta1.RecognitionConfig.AudioEncoding].
int32 sample_rate_hertz = 2;

// The number of channels in the input audio data.
Expand Down Expand Up @@ -424,8 +431,10 @@ message SpeakerDiarizationConfig {
int32 max_speaker_count = 3;

// Output only. Unused.
int32 speaker_tag = 5
[deprecated = true, (google.api.field_behavior) = OUTPUT_ONLY];
int32 speaker_tag = 5 [
deprecated = true,
(google.api.field_behavior) = OUTPUT_ONLY
];
}

// Description of audio data to be recognized.
Expand Down Expand Up @@ -589,8 +598,8 @@ message SpeechContext {

// Contains audio data in the encoding specified in the `RecognitionConfig`.
// Either `content` or `uri` must be supplied. Supplying both or neither
// returns [google.rpc.Code.INVALID_ARGUMENT][google.rpc.Code.INVALID_ARGUMENT].
// See [content limits](https://cloud.google.com/speech-to-text/quotas#content).
// returns [google.rpc.Code.INVALID_ARGUMENT][google.rpc.Code.INVALID_ARGUMENT]. See
// [content limits](https://cloud.google.com/speech-to-text/quotas#content).
message RecognitionAudio {
// The audio source, which is either inline content or a Google Cloud
// Storage uri.
Expand All @@ -605,9 +614,8 @@ message RecognitionAudio {
// Currently, only Google Cloud Storage URIs are
// supported, which must be specified in the following format:
// `gs://bucket_name/object_name` (other URI formats return
// [google.rpc.Code.INVALID_ARGUMENT][google.rpc.Code.INVALID_ARGUMENT]).
// For more information, see [Request
// URIs](https://cloud.google.com/storage/docs/reference-uris).
// [google.rpc.Code.INVALID_ARGUMENT][google.rpc.Code.INVALID_ARGUMENT]). For more information, see
// [Request URIs](https://cloud.google.com/storage/docs/reference-uris).
string uri = 2;
}
}
Expand All @@ -630,6 +638,12 @@ message LongRunningRecognizeResponse {
// Sequential list of transcription results corresponding to
// sequential portions of audio.
repeated SpeechRecognitionResult results = 2;

// Original output config if present in the request.
TranscriptOutputConfig output_config = 6;

// If the transcript output fails, this field contains the relevant error.
google.rpc.Status output_error = 7;
}

// Describes the progress of a long-running `LongRunningRecognize` call. It is
Expand All @@ -646,9 +660,12 @@ message LongRunningRecognizeMetadata {
// Time of the most recent processing update.
google.protobuf.Timestamp last_update_time = 3;

// Output only. The URI of the audio file being transcribed. Empty if the
// audio was sent as byte content.
// Output only. The URI of the audio file being transcribed. Empty if the audio was sent
// as byte content.
string uri = 4 [(google.api.field_behavior) = OUTPUT_ONLY];

// Output only. A copy of the TranscriptOutputConfig if it was set in the request.
TranscriptOutputConfig output_config = 5 [(google.api.field_behavior) = OUTPUT_ONLY];
}

// `StreamingRecognizeResponse` is the only message returned to the client by
Expand Down Expand Up @@ -762,9 +779,9 @@ message StreamingRecognitionResult {
// For audio_channel_count = N, its output values can range from '1' to 'N'.
int32 channel_tag = 5;

// Output only. The [BCP-47](https://www.rfc-editor.org/rfc/bcp/bcp47.txt)
// language tag of the language in this result. This language code was
// detected to have the most likelihood of being spoken in the audio.
// Output only. The [BCP-47](https://www.rfc-editor.org/rfc/bcp/bcp47.txt) language tag
// of the language in this result. This language code was detected to have
// the most likelihood of being spoken in the audio.
string language_code = 6 [(google.api.field_behavior) = OUTPUT_ONLY];
}

Expand All @@ -781,9 +798,9 @@ message SpeechRecognitionResult {
// For audio_channel_count = N, its output values can range from '1' to 'N'.
int32 channel_tag = 2;

// Output only. The [BCP-47](https://www.rfc-editor.org/rfc/bcp/bcp47.txt)
// language tag of the language in this result. This language code was
// detected to have the most likelihood of being spoken in the audio.
// Output only. The [BCP-47](https://www.rfc-editor.org/rfc/bcp/bcp47.txt) language tag
// of the language in this result. This language code was detected to have
// the most likelihood of being spoken in the audio.
string language_code = 5 [(google.api.field_behavior) = OUTPUT_ONLY];
}

Expand Down
Expand Up @@ -69,6 +69,7 @@ service Adaptation {
patch: "/v1p1beta1/{phrase_set.name=projects/*/locations/*/phraseSets/*}"
body: "phrase_set"
};
option (google.api.method_signature) = "phrase_set,update_mask";
}

// Delete a phrase set.
Expand Down Expand Up @@ -110,6 +111,7 @@ service Adaptation {
patch: "/v1p1beta1/{custom_class.name=projects/*/locations/*/customClasses/*}"
body: "custom_class"
};
option (google.api.method_signature) = "custom_class,update_mask";
}

// Delete a custom class.
Expand Down