feat: update asyncpai
louisjoecodes committed Dec 19, 2024
1 parent 5ec867f commit 8252970
Showing 3 changed files with 140 additions and 150 deletions.
166 changes: 88 additions & 78 deletions fern/apis/api/asyncapi.yml
@@ -18,7 +18,7 @@ tags:

channels:
/v1/text-to-speech/{voice_id}/stream-input:
x-fern-audiences:
- sdk
parameters:
voice_id:
@@ -64,7 +64,15 @@ channels:
output_format:
description: The output audio format
type: string
enum:
[
"mp3_44100",
"pcm_16000",
"pcm_22050",
"pcm_24000",
"pcm_44100",
"ulaw_8000",
]
default: "mp3_44100"
inactivity_timeout:
description: Timeout for inactivity before connection is closed
@@ -74,6 +82,10 @@ channels:
description: Whether to include timing data with every audio chunk
type: boolean
default: false
auto_mode:
description: Reduces latency by disabling the chunk schedule and all buffers. Recommended only when sending full sentences or phrases; sending partial phrases will result in significantly reduced audio quality. Defaults to false.
type: boolean
default: false
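Connections to this channel carry the query parameters above in the URL. The sketch below shows one way to assemble such a URL; the `VOICE_ID` and default `model_id` values are placeholders, not values taken from this spec.

```python
# Sketch of building the stream-input WebSocket URL from the query parameters
# documented above. The voice_id and model_id values are placeholders.
from urllib.parse import urlencode

def build_stream_url(voice_id: str,
                     model_id: str = "eleven_monolingual_v1",
                     output_format: str = "mp3_44100",
                     sync_alignment: bool = False,
                     auto_mode: bool = False) -> str:
    params = {
        "model_id": model_id,
        "output_format": output_format,
        "sync_alignment": str(sync_alignment).lower(),  # booleans as "true"/"false"
        "auto_mode": str(auto_mode).lower(),
    }
    base = f"wss://api.elevenlabs.io/v1/text-to-speech/{voice_id}/stream-input"
    return f"{base}?{urlencode(params)}"

url = build_stream_url("VOICE_ID", auto_mode=True)
```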

publish:
description: Send messages to the WebSocket
@@ -91,34 +103,33 @@ channels:
oneOf:
- $ref: "#/components/messages/AudioOutput"
- $ref: "#/components/messages/FinalOutput"

x-fern-examples:
- query-parameters:
model_id: pcm_s16le
messages:
- type: publish
messageId: InitializeConnection
value:
text: " "
voice_settings:
stability: 0.5
similarity_boost: 0.8
xi_api_key: <YOUR_API_KEY>
- type: publish
messageId: SendText
value:
text: "Hello World"
try_trigger_generation: true
- type: publish
messageId: CloseConnection
value:
text: ""
- type: subscribe
messageId: AudioOutput
value:
audio: Y3VyaW91cyBtaW5kcyB0aGluayBhbGlrZSA6KQ==
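The example sequence above (InitializeConnection, then SendText, then CloseConnection) can be assembled as plain JSON frames and sent over any WebSocket client. A minimal sketch; `<YOUR_API_KEY>` is left as a placeholder and no connection is made here.

```python
# Build the ordered JSON frames for the example session above.
# Sending them requires a WebSocket client and a real API key.
import json

def build_session_frames(text: str, api_key: str) -> list:
    init = {  # InitializeConnection: first frame, text must be a blank space
        "text": " ",
        "voice_settings": {"stability": 0.5, "similarity_boost": 0.8},
        "xi_api_key": api_key,
    }
    send = {"text": text, "try_trigger_generation": True}  # SendText
    close = {"text": ""}  # CloseConnection: empty string ends the stream
    return [json.dumps(frame) for frame in (init, send, close)]

frames = build_session_frames("Hello World", "<YOUR_API_KEY>")
```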


components:
messages:
SendText:
@@ -140,36 +151,36 @@ components:
FinalOutput:
messageId: finalOutput
payload:
$ref: "#/components/schemas/FinalOutput"

schemas:
InitializeConnection:
type: object
properties:
text:
x-fern-type: literal<" ">
description: The initial text that must be sent is a blank space.
voice_settings:
$ref: "#/components/schemas/RealtimeVoiceSettings"
generation_config:
$ref: "#/components/schemas/GenerationConfig"
description: "This property should only be provided in the first message you send. "
xi-api-key:
type: string
description: |
Your ElevenLabs API key. This is a required parameter that should be provided in the first message you send.
You can find your API key in the [API Keys section](https://elevenlabs.io/docs/api-reference/websockets#api-keys).
required:
- text
- xi-api-key

CloseConnection:
type: object
properties:
text:
x-fern-type: literal<"">
description: End the stream with an empty string
required:
- text

SendText:
@@ -179,25 +190,25 @@ components:
properties:
text:
type: string
description: The text to be sent to the API for audio generation. Should always end with a single space.
try_trigger_generation:
description: |
This is an advanced setting that most users shouldn't need to use. It relates to our generation schedule
explained [here](#understanding-how-our-websockets-buffer-text).
Use this to attempt to immediately trigger the generation of audio, overriding the `chunk_length_schedule`.
Unlike flush, `try_trigger_generation` will only generate audio if our
buffer contains more than a minimum
threshold of characters, this is to ensure a higher quality response from our model.
Note that overriding the chunk schedule to generate small amounts of
text may result in lower quality audio, therefore, only use this parameter if you
really need text to be processed immediately. We generally recommend keeping the default value of
`false` and adjusting the `chunk_length_schedule` in the `generation_config` instead.
type: boolean
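The buffering behavior described above can be modeled as a simple accumulator: text piles up until the next threshold in the chunk schedule is reached, and `try_trigger_generation` forces an early flush once a minimum amount of text is buffered. This is only an illustrative model; the schedule values and the 50-character minimum are assumptions, not the service's exact internals.

```python
# Illustrative model of the generation buffer described above; the default
# schedule and the 50-character minimum are assumptions for this sketch.
class ChunkBuffer:
    def __init__(self, chunk_length_schedule=(120, 160, 250, 290), min_chars=50):
        self.schedule = list(chunk_length_schedule)
        self.min_chars = min_chars
        self.step = 0
        self.buffer = ""

    def add(self, text: str, try_trigger_generation: bool = False):
        """Return the text to generate now, or None while still buffering."""
        self.buffer += text
        threshold = self.schedule[min(self.step, len(self.schedule) - 1)]
        forced = try_trigger_generation and len(self.buffer) >= self.min_chars
        if forced or len(self.buffer) >= threshold:
            chunk, self.buffer = self.buffer, ""
            self.step += 1
            return chunk
        return None

buf = ChunkBuffer()
first = buf.add("Hello world. ")  # 13 chars: below every threshold, buffered
second = buf.add("A" * 60, try_trigger_generation=True)  # 73 chars >= 50: forced
```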
voice_settings:
type: object
properties:
stability:
type: number
description: Defines the stability for voice settings.
@@ -213,14 +224,14 @@ components:
required:
- stability
- similarity_boost
generator_config:
type: object
properties:
chunk_length_schedule:
type: array
items:
type: number
description: |
This is an advanced setting that most users shouldn't need to use. It relates to our
generation schedule explained [here](https://elevenlabs.io/docs/api-reference/websockets#understanding-how-our-websockets-buffer-text).
@@ -240,9 +251,9 @@ components:
too low may result in lower quality audio. Please test and adjust as needed.
Each item should be in the range 50-500.
flush:
type: boolean
description: |
Flush forces the generation of audio. Set this value to true when you have finished sending text, but want to keep the websocket connection open.
This is useful when you want to ensure that the last chunk of audio is generated even when the length of text sent is smaller than the value set in chunk_length_schedule (e.g. 120 or 50).
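As a sketch of the behavior above, a flush is just a `SendText` frame carrying `flush: true`, which forces generation of whatever text remains buffered while keeping the connection open. The sample text here is a placeholder.

```python
# A SendText frame that forces generation of the remaining buffered text.
import json

flush_frame = json.dumps({"text": "One last short phrase. ", "flush": True})
```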
@@ -267,16 +278,16 @@ components:
description: Defines the use speaker boost for voice settings. This parameter is available on V2+ models.
type: object
required:
- stability
- similarity_boost

GenerationConfig:
properties:
chunk_length_schedule:
type: array
items:
type: number
description: |
This is an advanced setting that most users shouldn't need to use. It relates to our
generation schedule explained [here](https://elevenlabs.io/docs/api-reference/websockets#understanding-how-our-websockets-buffer-text).
@@ -296,80 +307,79 @@ components:
too low may result in lower quality audio. Please test and adjust as needed.
Each item should be in the range 50-500.
AudioOutput:
type: object
required:
- audio
properties:
audio:
type: string
# format: binary
description: |
A generated partial audio chunk, encoded using the selected output_format, by default this
is MP3 encoded as a base64 string.
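Since audio chunks arrive base64-encoded, a receiver decodes and concatenates them as they come in. This sketch reuses the sample payload from the example earlier in the file, which decodes to a short ASCII string rather than real MP3 data.

```python
# Decode and concatenate AudioOutput chunks; audio is null on the final frame.
import base64

def append_chunk(audio: bytearray, message: dict) -> None:
    if message.get("audio"):  # skip the final frame, whose audio is null
        audio += base64.b64decode(message["audio"])

received = bytearray()
append_chunk(received, {"audio": "Y3VyaW91cyBtaW5kcyB0aGluayBhbGlrZSA6KQ=="})
append_chunk(received, {"isFinal": True, "audio": None})
```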
FinalOutput:
type: object
properties:
isFinal:
x-fern-type: literal<true>
description: |
Indicates if the generation is complete. If set to `true`, `audio` will be null.
normalizedAlignment:
$ref: "#/components/schemas/NormalizedAlignment"
alignment:
$ref: "#/components/schemas/Alignment"

NormalizedAlignment:
type: object
description: |
Alignment information for the generated audio given the input normalized text sequence.
properties:
char_start_times_ms:
x-fern-type: list<integer>
description: |
A list of starting times (in milliseconds) for each character in the normalized text as it
corresponds to the audio. For instance, the character 'H' starts at time 0 ms in the audio.
Note these times are relative to the returned chunk from the model, and not the
full audio response.
chars_durations_ms:
x-fern-type: list<integer>
description: |
A list of durations (in milliseconds) for each character in the normalized text as it
corresponds to the audio. For instance, the character 'H' lasts for 3 ms in the audio.
Note these times are relative to the returned chunk from the model, and not the
full audio response.
chars:
x-fern-type: list<string>
description: |
A list of characters in the normalized text sequence. For instance, the first character is 'H'.
Note that this list may contain spaces, punctuation, and other special characters.
The length of this list should be the same as the lengths of `char_start_times_ms` and `chars_durations_ms`.
Alignment:
type: object
description: |
Alignment information for the generated audio given the input text sequence.
properties:
char_start_times_ms:
x-fern-type: list<integer>
description: |
A list of starting times (in milliseconds) for each character in the text as it
corresponds to the audio. For instance, the character 'H' starts at time 0 ms in the audio.
Note these times are relative to the returned chunk from the model, and not the
full audio response.
chars_durations_ms:
x-fern-type: list<integer>
description: |
A list of durations (in milliseconds) for each character in the text as it
corresponds to the audio. For instance, the character 'H' lasts for 3 ms in the audio.
Note these times are relative to the returned chunk from the model, and not the
full audio response.
chars:
x-fern-type: list<string>
description: |
A list of characters in the text sequence. For instance, the first character is 'H'.
Note that this list may contain spaces, punctuation, and other special characters.
The length of this list should be the same as the lengths of `char_start_times_ms` and `chars_durations_ms`.
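The three parallel lists in the alignment schemas above can be zipped into per-character (start, end) spans. The input values here are illustrative stand-ins for what a real frame would carry.

```python
# Combine the alignment lists into (char, start_ms, end_ms) spans.
def char_spans(char_start_times_ms, chars_durations_ms, chars):
    return [
        (char, start, start + duration)
        for char, start, duration
        in zip(chars, char_start_times_ms, chars_durations_ms)
    ]

spans = char_spans([0, 3, 7], [3, 4, 5], ["H", "i", "!"])
```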
