1
- // Copyright 2018 Google LLC.
1
+ // Copyright 2019 Google LLC.
2
2
//
3
3
// Licensed under the Apache License, Version 2.0 (the "License");
4
4
// you may not use this file except in compliance with the License.
@@ -18,6 +18,8 @@ syntax = "proto3";
18
18
package google.cloud.texttospeech.v1 ;
19
19
20
20
import "google/api/annotations.proto" ;
21
+ import "google/api/client.proto" ;
22
+ import "google/api/field_behavior.proto" ;
21
23
22
24
option cc_enable_arenas = true ;
23
25
option csharp_namespace = "Google.Cloud.TextToSpeech.V1" ;
@@ -29,35 +31,79 @@ option php_namespace = "Google\\Cloud\\TextToSpeech\\V1";
29
31
30
32
// Service that implements Google Cloud Text-to-Speech API.
31
33
service TextToSpeech {
34
+ option (google.api.default_host ) = "texttospeech.googleapis.com" ;
35
+ option (google.api.oauth_scopes ) = "https://www.googleapis.com/auth/cloud-platform" ;
36
+
32
37
// Returns a list of Voice supported for synthesis.
33
38
rpc ListVoices (ListVoicesRequest ) returns (ListVoicesResponse ) {
34
39
option (google.api.http ) = {
35
40
get : "/v1/voices"
36
41
};
42
+ option (google.api.method_signature ) = "language_code" ;
37
43
}
38
44
39
45
// Synthesizes speech synchronously: receive results after all text input
40
46
// has been processed.
41
- rpc SynthesizeSpeech (SynthesizeSpeechRequest )
42
- returns (SynthesizeSpeechResponse ) {
47
+ rpc SynthesizeSpeech (SynthesizeSpeechRequest ) returns (SynthesizeSpeechResponse ) {
43
48
option (google.api.http ) = {
44
49
post : "/v1/text:synthesize"
45
50
body : "*"
46
51
};
52
+ option (google.api.method_signature ) = "input,voice,audio_config" ;
47
53
}
48
54
}
49
55
50
56
// The top-level message sent by the client for the `ListVoices` method.
51
57
message ListVoicesRequest {
52
- // Optional (but recommended)
58
+ // Optional. Recommended.
53
59
// [BCP-47](https://www.rfc-editor.org/rfc/bcp/bcp47.txt) language tag. If
54
60
// specified, the ListVoices call will only return voices that can be used to
55
61
// synthesize this language_code. E.g. when specifying "en-NZ", you will get
56
62
// supported "en-*" voices; when specifying "no", you will get supported
57
63
// "no-*" (Norwegian) and "nb-*" (Norwegian Bokmal) voices; specifying "zh"
58
64
// will also get supported "cmn-*" voices; specifying "zh-hk" will also get
59
65
// supported "yue-*" voices.
60
- string language_code = 1 ;
66
+ string language_code = 1 [(google.api.field_behavior ) = OPTIONAL ];
67
+ }
68
+
69
+ // Gender of the voice as described in
70
+ // [SSML voice element](https://www.w3.org/TR/speech-synthesis11/#edef_voice).
71
+ enum SsmlVoiceGender {
72
+ // An unspecified gender.
73
+ // In VoiceSelectionParams, this means that the client doesn't care which
74
+ // gender the selected voice will have. In the Voice field of
75
+ // ListVoicesResponse, this may mean that the voice doesn't fit any of the
76
+ // other categories in this enum, or that the gender of the voice isn't known.
77
+ SSML_VOICE_GENDER_UNSPECIFIED = 0 ;
78
+
79
+ // A male voice.
80
+ MALE = 1 ;
81
+
82
+ // A female voice.
83
+ FEMALE = 2 ;
84
+
85
+ // A gender-neutral voice.
86
+ NEUTRAL = 3 ;
87
+ }
88
+
89
+ // Configuration to set up audio encoder. The encoding determines the output
90
+ // audio format that we'd like.
91
+ enum AudioEncoding {
92
+ // Not specified. Will return result [google.rpc.Code.INVALID_ARGUMENT][].
93
+ AUDIO_ENCODING_UNSPECIFIED = 0 ;
94
+
95
+ // Uncompressed 16-bit signed little-endian samples (Linear PCM).
96
+ // Audio content returned as LINEAR16 also contains a WAV header.
97
+ LINEAR16 = 1 ;
98
+
99
+ // MP3 audio at 32kbps.
100
+ MP3 = 2 ;
101
+
102
+ // Opus encoded audio wrapped in an ogg container. The result will be a
103
+ // file which can be played natively on Android, and in browsers (at least
104
+ // Chrome and Firefox). The quality of the encoding is considerably higher
105
+ // than MP3 while using approximately the same bitrate.
106
+ OGG_OPUS = 3 ;
61
107
}
62
108
63
109
// The message returned to the client by the `ListVoices` method.
@@ -86,13 +132,13 @@ message Voice {
86
132
// The top-level message sent by the client for the `SynthesizeSpeech` method.
87
133
message SynthesizeSpeechRequest {
88
134
// Required. The Synthesizer requires either plain text or SSML as input.
89
- SynthesisInput input = 1 ;
135
+ SynthesisInput input = 1 [ (google.api .field_behavior ) = REQUIRED ] ;
90
136
91
137
// Required. The desired voice of the synthesized audio.
92
- VoiceSelectionParams voice = 2 ;
138
+ VoiceSelectionParams voice = 2 [ (google.api .field_behavior ) = REQUIRED ] ;
93
139
94
140
// Required. The configuration of the synthesized audio.
95
- AudioConfig audio_config = 3 ;
141
+ AudioConfig audio_config = 3 [ (google.api .field_behavior ) = REQUIRED ] ;
96
142
}
97
143
98
144
// Contains text input to be synthesized. Either `text` or `ssml` must be
@@ -115,9 +161,9 @@ message SynthesisInput {
115
161
116
162
// Description of which voice to use for a synthesis request.
117
163
message VoiceSelectionParams {
118
- // The language (and optionally also the region) of the voice expressed as a
164
+ // Required. The language (and potentially also the region) of the voice expressed as a
119
165
// [BCP-47](https://www.rfc-editor.org/rfc/bcp/bcp47.txt) language tag, e.g.
120
- // "en-US". Required. This should not include a script tag (e.g. use
166
+ // "en-US". This should not include a script tag (e.g. use
121
167
// "cmn-cn" rather than "cmn-Hant-cn"), because the script will be inferred
122
168
// from the input provided in the SynthesisInput. The TTS service
123
169
// will use this parameter to help choose an appropriate voice. Note that
@@ -126,13 +172,13 @@ message VoiceSelectionParams {
126
172
// (e.g. using en-US rather than en-CA if there isn't a Canadian voice
127
173
// available), or even a different language, e.g. using "nb" (Norwegian
128
174
// Bokmal) instead of "no" (Norwegian).
129
- string language_code = 1 ;
175
+ string language_code = 1 [ (google.api .field_behavior ) = REQUIRED ] ;
130
176
131
- // The name of the voice. Optional; if not set, the service will choose a
177
+ // The name of the voice. If not set, the service will choose a
132
178
// voice based on the other parameters such as language_code and gender.
133
179
string name = 2 ;
134
180
135
- // The preferred gender of the voice. Optional; if not set, the service will
181
+ // The preferred gender of the voice. If not set, the service will
136
182
// choose a voice based on the other parameters such as language_code and
137
183
// name. Note that this is only a preference, not a requirement; if a
138
184
// voice of the appropriate gender is not available, the synthesizer should
@@ -142,94 +188,66 @@ message VoiceSelectionParams {
142
188
143
189
// Description of audio data to be synthesized.
144
190
message AudioConfig {
145
- // Required. The format of the requested audio byte stream.
146
- AudioEncoding audio_encoding = 1 ;
147
-
148
- // Optional speaking rate/speed, in the range [0.25, 4.0]. 1.0 is the normal
149
- // native speed supported by the specific voice. 2.0 is twice as fast, and
150
- // 0.5 is half as fast. If unset(0.0), defaults to the native 1.0 speed. Any
151
- // other values < 0.25 or > 4.0 will return an error.
152
- double speaking_rate = 2 ;
153
-
154
- // Optional speaking pitch, in the range [-20.0, 20.0]. 20 means increase 20
155
- // semitones from the original pitch. -20 means decrease 20 semitones from the
156
- // original pitch.
157
- double pitch = 3 ;
158
-
159
- // Optional volume gain (in dB) of the normal native volume supported by the
160
- // specific voice, in the range [-96.0, 16.0]. If unset, or set to a value of
161
- // 0.0 (dB), will play at normal native signal amplitude. A value of -6.0 (dB)
162
- // will play at approximately half the amplitude of the normal native signal
163
- // amplitude. A value of +6.0 (dB) will play at approximately twice the
164
- // amplitude of the normal native signal amplitude. Strongly recommend not to
165
- // exceed +10 (dB) as there's usually no effective increase in loudness for
166
- // any value greater than that.
167
- double volume_gain_db = 4 ;
168
-
169
- // The synthesis sample rate (in hertz) for this audio. Optional. If this is
170
- // different from the voice's natural sample rate, then the synthesizer will
171
- // honor this request by converting to the desired sample rate (which might
172
- // result in worse audio quality), unless the specified sample rate is not
173
- // supported for the encoding chosen, in which case it will fail the request
174
- // and return [google.rpc.Code.INVALID_ARGUMENT][].
175
- int32 sample_rate_hertz = 5 ;
176
-
177
- // An identifier which selects 'audio effects' profiles that are applied on
178
- // (post synthesized) text to speech.
179
- // Effects are applied on top of each other in the order they are given.
180
- // See
181
- //
182
- // [audio-profiles](https:
183
- // //cloud.google.com/text-to-speech/docs/audio-profiles)
184
- // for current supported profile ids.
185
- repeated string effects_profile_id = 6 ;
191
+ // Required. The format of the audio byte stream.
192
+ AudioEncoding audio_encoding = 1 [(google.api.field_behavior ) = REQUIRED ];
193
+
194
+ // Optional. Input only. Speaking rate/speed, in the range [0.25, 4.0]. 1.0 is
195
+ // the normal native speed supported by the specific voice. 2.0 is twice as
196
+ // fast, and 0.5 is half as fast. If unset (0.0), defaults to the native 1.0
197
+ // speed. Any other values < 0.25 or > 4.0 will return an error.
198
+ double speaking_rate = 2 [
199
+ (google.api.field_behavior ) = INPUT_ONLY ,
200
+ (google.api.field_behavior ) = OPTIONAL
201
+ ];
202
+
203
+ // Optional. Input only. Speaking pitch, in the range [-20.0, 20.0]. 20 means
204
+ // increase 20 semitones from the original pitch. -20 means decrease 20
205
+ // semitones from the original pitch.
206
+ double pitch = 3 [
207
+ (google.api.field_behavior ) = INPUT_ONLY ,
208
+ (google.api.field_behavior ) = OPTIONAL
209
+ ];
210
+
211
+ // Optional. Input only. Volume gain (in dB) of the normal native volume
212
+ // supported by the specific voice, in the range [-96.0, 16.0]. If unset, or
213
+ // set to a value of 0.0 (dB), will play at normal native signal amplitude. A
214
+ // value of -6.0 (dB) will play at approximately half the amplitude of the
215
+ // normal native signal amplitude. A value of +6.0 (dB) will play at
216
+ // approximately twice the amplitude of the normal native signal amplitude.
217
+ // Strongly recommend not to exceed +10 (dB) as there's usually no effective
218
+ // increase in loudness for any value greater than that.
219
+ double volume_gain_db = 4 [
220
+ (google.api.field_behavior ) = INPUT_ONLY ,
221
+ (google.api.field_behavior ) = OPTIONAL
222
+ ];
223
+
224
+ // Optional. The synthesis sample rate (in hertz) for this audio. When this is
225
+ // specified in SynthesizeSpeechRequest, if this is different from the voice's
226
+ // natural sample rate, then the synthesizer will honor this request by
227
+ // converting to the desired sample rate (which might result in worse audio
228
+ // quality), unless the specified sample rate is not supported for the
229
+ // encoding chosen, in which case it will fail the request and return
230
+ // [google.rpc.Code.INVALID_ARGUMENT][].
231
+ int32 sample_rate_hertz = 5 [(google.api.field_behavior ) = OPTIONAL ];
232
+
233
+ // Optional. Input only. An identifier which selects 'audio effects' profiles
234
+ // that are applied on (post synthesized) text to speech. Effects are applied
235
+ // on top of each other in the order they are given. See
236
+ // [audio
237
+ // profiles](https://cloud.google.com/text-to-speech/docs/audio-profiles) for
238
+ // current supported profile ids.
239
+ repeated string effects_profile_id = 6 [
240
+ (google.api.field_behavior ) = INPUT_ONLY ,
241
+ (google.api.field_behavior ) = OPTIONAL
242
+ ];
186
243
}
187
244
188
245
// The message returned to the client by the `SynthesizeSpeech` method.
189
246
message SynthesizeSpeechResponse {
190
247
// The audio data bytes encoded as specified in the request, including the
191
- // header (For LINEAR16 audio, we include the WAV header). Note: as
248
+ // header for encodings that are wrapped in containers (e.g. MP3, OGG_OPUS).
249
+ // For LINEAR16 audio, we include the WAV header. Note: as
192
250
// with all bytes fields, protobuffers use a pure binary representation,
193
251
// whereas JSON representations use base64.
194
252
bytes audio_content = 1 ;
195
253
}
196
-
197
- // Gender of the voice as described in
198
- // [SSML voice element](https://www.w3.org/TR/speech-synthesis11/#edef_voice).
199
- enum SsmlVoiceGender {
200
- // An unspecified gender.
201
- // In VoiceSelectionParams, this means that the client doesn't care which
202
- // gender the selected voice will have. In the Voice field of
203
- // ListVoicesResponse, this may mean that the voice doesn't fit any of the
204
- // other categories in this enum, or that the gender of the voice isn't known.
205
- SSML_VOICE_GENDER_UNSPECIFIED = 0 ;
206
-
207
- // A male voice.
208
- MALE = 1 ;
209
-
210
- // A female voice.
211
- FEMALE = 2 ;
212
-
213
- // A gender-neutral voice.
214
- NEUTRAL = 3 ;
215
- }
216
-
217
- // Configuration to set up audio encoder. The encoding determines the output
218
- // audio format that we'd like.
219
- enum AudioEncoding {
220
- // Not specified. Will return result [google.rpc.Code.INVALID_ARGUMENT][].
221
- AUDIO_ENCODING_UNSPECIFIED = 0 ;
222
-
223
- // Uncompressed 16-bit signed little-endian samples (Linear PCM).
224
- // Audio content returned as LINEAR16 also contains a WAV header.
225
- LINEAR16 = 1 ;
226
-
227
- // MP3 audio.
228
- MP3 = 2 ;
229
-
230
- // Opus encoded audio wrapped in an ogg container. The result will be a
231
- // file which can be played natively on Android, and in browsers (at least
232
- // Chrome and Firefox). The quality of the encoding is considerably higher
233
- // than MP3 while using approximately the same bitrate.
234
- OGG_OPUS = 3 ;
235
- }
0 commit comments