7
7
import argparse
8
8
import config
9
9
import gradio as gr
10
+ import json
10
11
import openai
11
12
import os
12
13
import requests
22
23
# Fail fast at startup: an unset *or empty* key would otherwise only surface
# later as an authentication error on the first OpenAI API call.
if not openai.api_key:
    raise ValueError("OpenAI API Key not set as environment variable OPENAI_API_KEY")
24
25
26
# Get the Eleven.io API key. It is only read (and only required) when
# Eleven.io is the configured speech method; otherwise it stays None.
elevenio_api_key = None
if config.SPEECH_METHOD == SpeechMethod.ELEVENIO:
    elevenio_api_key = os.environ.get("ELEVENIO_API_KEY")
    # Treat an empty value like a missing one so a blank variable fails here
    # instead of being sent as the "xi-api-key" header on the first request.
    if not elevenio_api_key:
        raise ValueError(
            "Eleven.io API Key not set as environment variable ELEVENIO_API_KEY"
        )
34
+
25
35
# Initial message
26
36
messages = [
27
37
{
@@ -123,8 +133,7 @@ def generate_image(text_input: str) -> str:
123
133
return config .IMAGE_PATH
124
134
125
135
126
- # Call Google Cloud Text-to-Speech API to convert text to speech
127
- def text_to_speech (input_text : str , tts_voice_label : str ) -> str :
136
+ def text_to_speech_gcp (input_text : str , tts_voice_label : str ) -> str :
128
137
"""
129
138
Use GCP Text-to-Speech API to convert text to a WAV file.
130
139
@@ -170,6 +179,53 @@ def text_to_speech(input_text: str, tts_voice_label: str) -> str:
170
179
return config .GENERATED_SPEECH_PATH
171
180
172
181
182
def text_to_speech_elevenio(
    input_text: str,
    tts_voice_id: str,
    stability: float = 0.65,
    similarity_boost: float = 0.85,
    *,
    timeout: float = 60.0,
) -> str:
    """
    Use Eleven.io Text-to-Speech API to convert text to an audio file.

    The request asks for "audio/mpeg", so the bytes written to
    config.GENERATED_SPEECH_PATH are MP3 data regardless of that path's
    file extension.

    Args:
        input_text: Text to convert to speech
        tts_voice_id: Currently ignored. It is accepted so this function can be
            wired to the same UI inputs as the other TTS backend, but the voice
            actually used is always config.ELEVENIO_VOICE_ID.
        stability: Stability for voice (sent as voice_settings.stability)
        similarity_boost: Similarity boost for voice
            (sent as voice_settings.similarity_boost)
        timeout: Seconds to wait for the Eleven.io server before giving up,
            passed straight to requests

    Returns:
        str: Path to output audio file (config.GENERATED_SPEECH_PATH)

    Raises:
        requests.HTTPError: If Eleven.io answers with a 4xx/5xx status
        requests.Timeout: If the server does not respond within `timeout`
    """
    print(f"Convert text to speech: {input_text}")
    tts_voice_id = config.ELEVENIO_VOICE_ID  # Use pre-assigned from config
    url = f"{config.ELEVENIO_TTS_BASE_URL}/{tts_voice_id}"

    payload = json.dumps(
        {
            "text": input_text,
            "voice_settings": {
                "stability": stability,
                "similarity_boost": similarity_boost,
            },
        }
    )
    headers = {
        "xi-api-key": elevenio_api_key,
        "Content-Type": "application/json",
        "Accept": "audio/mpeg",
    }

    # Always bound the wait: without a timeout a stalled connection would
    # block the UI callback forever.
    response = requests.post(url, headers=headers, data=payload, timeout=timeout)

    # Stop on API errors (bad key, quota, unknown voice). Otherwise the JSON
    # error body would be written to disk and returned as if it were audio.
    response.raise_for_status()

    # Save the response audio (MP3 bytes) to the shared output path
    with open(config.GENERATED_SPEECH_PATH, "wb") as out:
        out.write(response.content)

    return config.GENERATED_SPEECH_PATH
227
+
228
+
173
229
"""
174
230
Gradio UI Definition
175
231
"""
@@ -203,7 +259,10 @@ def text_to_speech(input_text: str, tts_voice_label: str) -> str:
203
259
story_msg = gr .Textbox (label = "Story" )
204
260
205
261
# Add components for TTS
206
- if config .SPEECH_METHOD == SpeechMethod .GCP :
262
+ if (
263
+ config .SPEECH_METHOD == SpeechMethod .GCP
264
+ or config .SPEECH_METHOD == SpeechMethod .ELEVENIO
265
+ ):
207
266
# Audio output box if using an external TTS API (Google Cloud or Eleven.io)
208
267
audio_output = gr .Audio (label = "Output" , elem_id = "speaker" )
209
268
@@ -226,7 +285,7 @@ def text_to_speech(input_text: str, tts_voice_label: str) -> str:
226
285
story_msg .change (generate_image , story_msg , gen_image )
227
286
228
287
"""
229
- Used for GCP TTS only
288
+ Used for External API TTS only
230
289
231
290
Workaround: Custom (hacky) Javascript function to autoplay audio
232
291
Derived from: https://github.com/gradio-app/gradio/issues/1349
@@ -243,9 +302,15 @@ def text_to_speech(input_text: str, tts_voice_label: str) -> str:
243
302
speech_delay = int (config .TTS_SPEECH_DELAY * 1000 )
244
303
)
245
304
305
+ tts_fn = None
246
306
if config .SPEECH_METHOD == SpeechMethod .GCP :
307
+ tts_fn = text_to_speech_gcp
308
+ elif config .SPEECH_METHOD == SpeechMethod .ELEVENIO :
309
+ tts_fn = text_to_speech_elevenio
310
+
311
+ if tts_fn :
247
312
# Connect story output to audio output after calling TTS on it
248
- story_msg .change (text_to_speech , [story_msg , voice_selection ], audio_output )
313
+ story_msg .change (tts_fn , [story_msg , voice_selection ], audio_output )
249
314
250
315
# Separately trigger the autoplay audio function
251
316
story_msg .change (None , None , None , _js = autoplay_audio )
0 commit comments