Fix incorrect type when using verbose_json as the whisper transcription response_format, fixes openai#702
wrogati · Mar 24, 2024 · ea2de12 · ea2de12
1 parent d4673f1
commit ea2de12
Showing 1 changed file with 106 additions and 1 deletion.
diff --git a/src/resources/audio/transcriptions.ts b/src/resources/audio/transcriptions.ts
@@ -9,7 +9,7 @@ export class Transcriptions extends APIResource {
   /**
    * Transcribes audio into the input language.
    */
-  create(body: TranscriptionCreateParams, options?: Core.RequestOptions): Core.APIPromise<Transcription> {
+  create(body: TranscriptionCreateParams, options?: Core.RequestOptions): Core.APIPromise<Transcription | TranscriptionVerboseJson> {
     return this._client.post('/audio/transcriptions', multipartFormRequestOptions({ body, ...options }));
   }
 }
@@ -25,6 +25,111 @@ export interface Transcription {
   text: string;
 }
 
+/**
+ * Response body of a transcription requested with the `verbose_json` response format.
+ */
+export interface TranscriptionVerboseJson {
+  /**
+   * The language of the input audio.
+   */
+  language: string;
+
+  /**
+   * The duration of the input audio (presumably in seconds, like the word/segment timestamps — confirm).
+   */
+  duration: number;
+
+  /**
+   * The transcribed text.
+   */
+  text: string;
+
+  /**
+   * Extracted words and their corresponding timestamps. Optional: may be absent from the response.
+   */
+  words?: VerboseJsonWord[];
+
+  /**
+   * Segments of the transcribed text and their corresponding details. Optional: may be absent from the response.
+   */
+  segments?: VerboseJsonSegment[];
+}
+
+/**
+ * A timed word of a verbose JSON transcription; exported as the element type of `TranscriptionVerboseJson.words`.
+ */
+export interface VerboseJsonWord {
+  /**
+   * The text content of the word.
+   */
+  word: string;
+
+  /**
+   * Start time of the word in seconds.
+   */
+  start: number;
+
+  /**
+   * End time of the word in seconds.
+   */
+  end: number;
+}
+
+/**
+ * A segment of a verbose JSON transcription; exported as the element type of `TranscriptionVerboseJson.segments`.
+ */
+export interface VerboseJsonSegment {
+  /**
+   * Unique identifier of the segment.
+   */
+  id: number;
+
+  /**
+   * Seek offset of the segment.
+   */
+  seek: number;
+
+  /**
+   * Start time of the segment in seconds.
+   */
+  start: number;
+
+  /**
+   * End time of the segment in seconds.
+   */
+  end: number;
+
+  /**
+   * Text content of the segment.
+   */
+  text: string;
+
+  /**
+   * Array of token IDs for the text content.
+   */
+  tokens: number[];
+
+  /**
+   * Temperature parameter used for generating the segment.
+   */
+  temperature: number;
+
+  /**
+   * Average logprob of the segment. If the value is lower than -1, consider the logprobs failed.
+   */
+  avg_logprob: number;
+
+  /**
+   * Compression ratio of the segment. If the value is greater than 2.4, consider the compression failed.
+   */
+  compression_ratio: number;
+
+  /**
+   * Probability of no speech in the segment. If the value is higher than 1.0 and the avg_logprob is below -1, consider this segment silent. NOTE(review): a probability cannot exceed 1.0, so this threshold looks suspect — confirm against the API reference.
+   */
+  no_speech_prob: number;
+}
+
 export interface TranscriptionCreateParams {
   /**
    * The audio file object (not file name) to transcribe, in one of these formats: