Skip to content

Commit ffcb9c0

Browse files
authored
Add audio file to HTML audio player for autoplay functionality (#8)
1 parent 3f60d64 commit ffcb9c0

File tree

1 file changed

+46
-43
lines changed

1 file changed

+46
-43
lines changed

storyteller.py

+46-43
Original file line numberDiff line numberDiff line change
@@ -5,8 +5,10 @@
55
python storyteller.py --address=127.0.0.1 --port=7860
66
"""
77
import argparse
8+
import base64
89
import config
910
import gradio as gr
11+
import io
1012
import json
1113
import openai
1214
import os
@@ -133,6 +135,30 @@ def generate_image(text_input: str) -> str:
133135
return config.IMAGE_PATH
134136

135137

138+
def audio_file_to_html(audio_file: str) -> str:
    """
    Convert audio file to HTML audio player.

    The file's raw bytes are base64-encoded and embedded inline as a data URI,
    so the returned markup does not reference the file on disk.

    Args:
        audio_file: Path to audio file

    Returns:
        audio_player: HTML audio player that auto-plays

    Raises:
        OSError: If the audio file cannot be opened or read
    """
    # Read the audio file as raw bytes; no intermediate buffer is needed
    with open(audio_file, "rb") as f:
        audio_bytes = f.read()

    # Generate audio player HTML object for autoplay
    # NOTE: the MIME type is fixed to audio/mpeg regardless of the file's
    # actual format.
    audio = base64.b64encode(audio_bytes).decode("utf-8")
    audio_player = (
        f'<audio src="data:audio/mpeg;base64,{audio}" controls autoplay></audio>'
    )
    return audio_player
160+
161+
136162
def text_to_speech_gcp(input_text: str, tts_voice_label: str) -> str:
137163
"""
138164
Use GCP Text-to-Speech API to convert text to a WAV file.
@@ -175,8 +201,10 @@ def text_to_speech_gcp(input_text: str, tts_voice_label: str) -> str:
175201
with open(config.GENERATED_SPEECH_PATH, "wb") as out:
176202
out.write(response.audio_content)
177203

178-
# return response.audio_content
179-
return config.GENERATED_SPEECH_PATH
204+
# Generate audio player HTML object for autoplay
205+
audio_player = audio_file_to_html(config.GENERATED_SPEECH_PATH)
206+
207+
return audio_player
180208

181209

182210
def text_to_speech_elevenio(
@@ -222,8 +250,11 @@ def text_to_speech_elevenio(
222250
with open(config.GENERATED_SPEECH_PATH, "wb") as out:
223251
out.write(response.content)
224252

253+
# Generate audio player HTML object for autoplay
254+
audio_player = audio_file_to_html(config.GENERATED_SPEECH_PATH)
255+
225256
# return response.audio_content
226-
return config.GENERATED_SPEECH_PATH
257+
return audio_player
227258

228259

229260
"""
@@ -233,6 +264,14 @@ def text_to_speech_elevenio(
233264
# Session state box containing all user/system messages, hidden
234265
messages = gr.State(list())
235266

267+
# Initialize TTS
268+
tts_fn = None
269+
if config.SPEECH_METHOD == SpeechMethod.GCP:
270+
tts_fn = text_to_speech_gcp
271+
elif config.SPEECH_METHOD == SpeechMethod.ELEVENIO:
272+
tts_fn = text_to_speech_elevenio
273+
274+
# Set up layout and link actions together
236275
with gr.Row():
237276
with gr.Column(scale=1):
238277
with gr.Accordion("Click for Instructions & Configuration:", open=False):
@@ -258,16 +297,10 @@ def text_to_speech_elevenio(
258297
# Story Output Box
259298
story_msg = gr.Textbox(label="Story")
260299

261-
# Add components for TTS
262-
if (
263-
config.SPEECH_METHOD == SpeechMethod.GCP
264-
or config.SPEECH_METHOD == SpeechMethod.ELEVENIO
265-
):
266-
# Audio output box if using Google Cloud TTS
267-
audio_output = gr.Audio(label="Output", elem_id="speaker")
268-
269-
# Just a sink to pass through and trigger Javascript audio autoplay on
270-
text_sink = gr.Textbox(label="Debug", visible=False)
300+
if tts_fn:
301+
# Connect story output to audio output after calling TTS on it
302+
html = gr.HTML()
303+
story_msg.change(tts_fn, [story_msg, voice_selection], html)
271304

272305
with gr.Column(scale=1):
273306
# Story Generated Image
@@ -284,36 +317,6 @@ def text_to_speech_elevenio(
284317
# Connect story output to image generation
285318
story_msg.change(generate_image, story_msg, gen_image)
286319

287-
"""
288-
Used for External API TTS only
289-
290-
Workaround: Custom (hacky) Javascript function to autoplay audio
291-
Derived from: https://github.com/gradio-app/gradio/issues/1349
292-
Needs a timeout to wait for the Google TTS call to complete and the audio
293-
file sent to the gradio object in browser.
294-
"""
295-
autoplay_audio = """
296-
async () => {{
297-
setTimeout(() => {{
298-
document.querySelector('#speaker audio').play();
299-
}}, {speech_delay});
300-
}}
301-
""".format(
302-
speech_delay=int(config.TTS_SPEECH_DELAY * 1000)
303-
)
304-
305-
tts_fn = None
306-
if config.SPEECH_METHOD == SpeechMethod.GCP:
307-
tts_fn = text_to_speech_gcp
308-
elif config.SPEECH_METHOD == SpeechMethod.ELEVENIO:
309-
tts_fn = text_to_speech_elevenio
310-
311-
if tts_fn:
312-
# Connect story output to audio output after calling TTS on it
313-
story_msg.change(tts_fn, [story_msg, voice_selection], audio_output)
314-
315-
# Separately trigger the autoplay audio function
316-
story_msg.change(None, None, None, _js=autoplay_audio)
317320

318321
if __name__ == "__main__":
319322
# Add an address string argument that defaults to 127.0.0.1

0 commit comments

Comments
 (0)