5
5
python storyteller.py --address=127.0.0.1 --port=7860
6
6
"""
7
7
import argparse
8
+ import base64
8
9
import config
9
10
import gradio as gr
11
+ import io
10
12
import json
11
13
import openai
12
14
import os
@@ -133,6 +135,30 @@ def generate_image(text_input: str) -> str:
133
135
return config .IMAGE_PATH
134
136
135
137
138
def audio_file_to_html(audio_file: str) -> str:
    """
    Convert an audio file to an auto-playing HTML audio player.

    Args:
        audio_file: Path to the audio file on disk.

    Returns:
        audio_player: HTML ``<audio>`` element whose source is the file's
        contents embedded as a base64 data URI, with autoplay enabled.
    """
    # Read the file's bytes once and base64-encode them directly; the
    # original round-tripped through an io.BytesIO buffer for no benefit.
    with open(audio_file, "rb") as f:
        audio = base64.b64encode(f.read()).decode("utf-8")

    # Embedding as a data URI lets the browser autoplay without a second
    # request back to the server for the audio file.
    # NOTE(review): the MIME type is hard-coded to audio/mpeg — callers pass
    # both WAV (GCP) and MP3 paths; browsers generally sniff the real codec,
    # but confirm playback for WAV output.
    audio_player = (
        f'<audio src="data:audio/mpeg;base64,{audio}" controls autoplay></audio>'
    )
    return audio_player
160
+
161
+
136
162
def text_to_speech_gcp (input_text : str , tts_voice_label : str ) -> str :
137
163
"""
138
164
Use GCP Text-to-Speech API to convert text to a WAV file.
@@ -175,8 +201,10 @@ def text_to_speech_gcp(input_text: str, tts_voice_label: str) -> str:
175
201
with open (config .GENERATED_SPEECH_PATH , "wb" ) as out :
176
202
out .write (response .audio_content )
177
203
178
- # return response.audio_content
179
- return config .GENERATED_SPEECH_PATH
204
+ # Generate audio player HTML object for autoplay
205
+ audio_player = audio_file_to_html (config .GENERATED_SPEECH_PATH )
206
+
207
+ return audio_player
180
208
181
209
182
210
def text_to_speech_elevenio (
@@ -222,8 +250,11 @@ def text_to_speech_elevenio(
222
250
with open (config .GENERATED_SPEECH_PATH , "wb" ) as out :
223
251
out .write (response .content )
224
252
253
+ # Generate audio player HTML object for autoplay
254
+ audio_player = audio_file_to_html (config .GENERATED_SPEECH_PATH )
255
+
225
256
# return response.audio_content
226
- return config . GENERATED_SPEECH_PATH
257
+ return audio_player
227
258
228
259
229
260
"""
@@ -233,6 +264,14 @@ def text_to_speech_elevenio(
233
264
# Session state box containing all user/system messages, hidden
234
265
messages = gr .State (list ())
235
266
267
+ # Initialize TTS
268
+ tts_fn = None
269
+ if config .SPEECH_METHOD == SpeechMethod .GCP :
270
+ tts_fn = text_to_speech_gcp
271
+ elif config .SPEECH_METHOD == SpeechMethod .ELEVENIO :
272
+ tts_fn = text_to_speech_elevenio
273
+
274
+ # Set up layout and link actions together
236
275
with gr .Row ():
237
276
with gr .Column (scale = 1 ):
238
277
with gr .Accordion ("Click for Instructions & Configuration:" , open = False ):
@@ -258,16 +297,10 @@ def text_to_speech_elevenio(
258
297
# Story Output Box
259
298
story_msg = gr .Textbox (label = "Story" )
260
299
261
- # Add components for TTS
262
- if (
263
- config .SPEECH_METHOD == SpeechMethod .GCP
264
- or config .SPEECH_METHOD == SpeechMethod .ELEVENIO
265
- ):
266
- # Audio output box if using Google Cloud TTS
267
- audio_output = gr .Audio (label = "Output" , elem_id = "speaker" )
268
-
269
- # Just a sink to pass through and trigger Javascript audio autoplay on
270
- text_sink = gr .Textbox (label = "Debug" , visible = False )
300
+ if tts_fn :
301
+ # Connect story output to audio output after calling TTS on it
302
+ html = gr .HTML ()
303
+ story_msg .change (tts_fn , [story_msg , voice_selection ], html )
271
304
272
305
with gr .Column (scale = 1 ):
273
306
# Story Generated Image
@@ -284,36 +317,6 @@ def text_to_speech_elevenio(
284
317
# Connect story output to image generation
285
318
story_msg .change (generate_image , story_msg , gen_image )
286
319
287
- """
288
- Used for External API TTS only
289
-
290
- Workaround: Custom (hacky) Javascript function to autoplay audio
291
- Derived from: https://github.com/gradio-app/gradio/issues/1349
292
- Needs a timeout to wait for the Google TTS call to complete and the audio
293
- file sent to the gradio object in browser.
294
- """
295
- autoplay_audio = """
296
- async () => {{
297
- setTimeout(() => {{
298
- document.querySelector('#speaker audio').play();
299
- }}, {speech_delay});
300
- }}
301
- """ .format (
302
- speech_delay = int (config .TTS_SPEECH_DELAY * 1000 )
303
- )
304
-
305
- tts_fn = None
306
- if config .SPEECH_METHOD == SpeechMethod .GCP :
307
- tts_fn = text_to_speech_gcp
308
- elif config .SPEECH_METHOD == SpeechMethod .ELEVENIO :
309
- tts_fn = text_to_speech_elevenio
310
-
311
- if tts_fn :
312
- # Connect story output to audio output after calling TTS on it
313
- story_msg .change (tts_fn , [story_msg , voice_selection ], audio_output )
314
-
315
- # Separately trigger the autoplay audio function
316
- story_msg .change (None , None , None , _js = autoplay_audio )
317
320
318
321
if __name__ == "__main__" :
319
322
# Add a address string argument that defaults to 127.0.0.1
0 commit comments