Skip to content

Commit 3f60d64

Browse files
authored
Voice Cloning Example using Eleven.io (#7)
* Set SPEECH_METHOD to ELEVENIO and add text-to-speech function for Eleven.io API in storyteller.py * Update speech method to use GCP instead of ElevenIO * Refactor text_to_speech_elevenio function to use pre-assigned voice ID from config in storytelling.py * Refactor the code to make the API options more generic by creating options like GCP above in config.py
1 parent 46d4679 commit 3f60d64

File tree

2 files changed

+88
-5
lines changed

2 files changed

+88
-5
lines changed

config.py

+18
Original file line numberDiff line numberDiff line change
@@ -5,9 +5,12 @@
55
None: No speech
66
"gcp": Google Cloud Platform Text-to-Speech API
77
"mac": Mac OS X say command
8+
"elevenio": Eleven.io Text-to-Speech API
89
910
Note: For GCP, you must be authenticated with the gcloud CLI or set the
1011
GOOGLE_APPLICATION_CREDENTIALS environment variable
12+
13+
For Eleven.io, you must set the ELEVENIO_API_KEY environment variable to your API key
1114
"""
1215

1316

@@ -16,6 +19,7 @@ class SpeechMethod(Enum):
1619
NONE = 1
1720
GCP = 2
1821
MAC = 3
22+
ELEVENIO = 4
1923

2024

2125
# Set the method here
@@ -46,6 +50,20 @@ class SpeechMethod(Enum):
4650
"GB Male": "en-GB-Neural2-D",
4751
}
4852

53+
"""
54+
Eleven.io Text-to-Speech API Config
55+
"""
56+
# You can get a voice's ID from the API's /v1/voices endpoint, or from the URL called when
57+
# generating online
58+
"""
59+
curl --location 'https://api.elevenlabs.io/v1/voices' \
60+
--header 'xi-api-key: [API_KEY]' \
61+
--header 'Accept: application/json'
62+
"""
63+
# TODO: Offer a dict of named voice options (like the GCP voices above) to make this more generic
64+
ELEVENIO_VOICE_ID = "21m00Tcm4TlvDq8ikWAM"
65+
ELEVENIO_TTS_BASE_URL = "https://api.elevenlabs.io/v1/text-to-speech"
66+
4967
"""
5068
Example Prompts
5169

storyteller.py

+70-5
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
import argparse
88
import config
99
import gradio as gr
10+
import json
1011
import openai
1112
import os
1213
import requests
@@ -22,6 +23,15 @@
2223
if openai.api_key is None:
2324
raise ValueError("OpenAI API Key not set as environnment variable OPENAI_API_KEY")
2425

26+
# Get the Eleven.io API key (only required when Eleven.io is the speech method)
elevenio_api_key = None
if config.SPEECH_METHOD == SpeechMethod.ELEVENIO:
    elevenio_api_key = os.environ.get("ELEVENIO_API_KEY")
    # An empty value is as unusable as a missing one, so reject both up front
    # instead of failing later with an authentication error from the API
    if not elevenio_api_key:
        raise ValueError(
            "Eleven.io API Key not set as environment variable ELEVENIO_API_KEY"
        )
34+
2535
# Initial message
2636
messages = [
2737
{
@@ -123,8 +133,7 @@ def generate_image(text_input: str) -> str:
123133
return config.IMAGE_PATH
124134

125135

126-
# Call Google Cloud Text-to-Speech API to convert text to speech
127-
def text_to_speech(input_text: str, tts_voice_label: str) -> str:
136+
def text_to_speech_gcp(input_text: str, tts_voice_label: str) -> str:
128137
"""
129138
Use GCP Text-to-Speech API to convert text to a WAV file.
130139
@@ -170,6 +179,53 @@ def text_to_speech(input_text: str, tts_voice_label: str) -> str:
170179
return config.GENERATED_SPEECH_PATH
171180

172181

182+
def text_to_speech_elevenio(
    input_text: str,
    tts_voice_id: str,
    stability: float = 0.65,
    similarity_boost: float = 0.85,
) -> str:
    """
    Use Eleven.io Text-to-Speech API to convert text to an MP3 audio file.

    Args:
        input_text: Text to convert to speech
        tts_voice_id: Currently ignored; the voice is always taken from
            ELEVENIO_VOICE_ID in config. The parameter is kept so this function
            accepts the same two positional arguments as the other TTS callbacks.
        stability: Stability setting sent in the request's voice_settings
        similarity_boost: Similarity boost setting sent in the request's voice_settings

    Returns
        str: Path to output audio file (config.GENERATED_SPEECH_PATH)

    Raises:
        requests.HTTPError: If the API answers with an error status code
        requests.RequestException: If the request fails or times out
    """
    print(f"Convert text to speech: {input_text}")
    tts_voice_id = config.ELEVENIO_VOICE_ID  # Use pre-assigned from config
    url = f"{config.ELEVENIO_TTS_BASE_URL}/{tts_voice_id}"

    payload = {
        "text": input_text,
        "voice_settings": {
            "stability": stability,
            "similarity_boost": similarity_boost,
        },
    }
    headers = {
        "xi-api-key": elevenio_api_key,
        "Content-Type": "application/json",
        "Accept": "audio/mpeg",
    }

    # A timeout keeps the UI from hanging forever if the API never answers
    response = requests.post(url, headers=headers, json=payload, timeout=120)

    # On failure the body is a JSON error document, not audio; raise instead of
    # writing it to disk where it would be handed to the player as a broken file
    response.raise_for_status()

    # Save the response audio as an MP3 file
    with open(config.GENERATED_SPEECH_PATH, "wb") as out:
        out.write(response.content)

    return config.GENERATED_SPEECH_PATH
227+
228+
173229
"""
174230
Gradio UI Definition
175231
"""
@@ -203,7 +259,10 @@ def text_to_speech(input_text: str, tts_voice_label: str) -> str:
203259
story_msg = gr.Textbox(label="Story")
204260

205261
# Add components for TTS
206-
if config.SPEECH_METHOD == SpeechMethod.GCP:
262+
if (
263+
config.SPEECH_METHOD == SpeechMethod.GCP
264+
or config.SPEECH_METHOD == SpeechMethod.ELEVENIO
265+
):
207266
# Audio output box if using an external TTS API (Google Cloud or Eleven.io)
208267
audio_output = gr.Audio(label="Output", elem_id="speaker")
209268

@@ -226,7 +285,7 @@ def text_to_speech(input_text: str, tts_voice_label: str) -> str:
226285
story_msg.change(generate_image, story_msg, gen_image)
227286

228287
"""
229-
Used for GCP TTS only
288+
Used for External API TTS only
230289
231290
Workaround: Custom (hacky) Javascript function to autoplay audio
232291
Derived from: https://github.com/gradio-app/gradio/issues/1349
@@ -243,9 +302,15 @@ def text_to_speech(input_text: str, tts_voice_label: str) -> str:
243302
speech_delay=int(config.TTS_SPEECH_DELAY * 1000)
244303
)
245304

305+
tts_fn = None
246306
if config.SPEECH_METHOD == SpeechMethod.GCP:
307+
tts_fn = text_to_speech_gcp
308+
elif config.SPEECH_METHOD == SpeechMethod.ELEVENIO:
309+
tts_fn = text_to_speech_elevenio
310+
311+
if tts_fn:
247312
# Connect story output to audio output after calling TTS on it
248-
story_msg.change(text_to_speech, [story_msg, voice_selection], audio_output)
313+
story_msg.change(tts_fn, [story_msg, voice_selection], audio_output)
249314

250315
# Separately trigger the autoplay audio function
251316
story_msg.change(None, None, None, _js=autoplay_audio)

0 commit comments

Comments
 (0)