Skip to content

Commit dde12eb

Browse files
Merge pull request elizaOS#512 from ai16z/fix/speech
fix: speech service fix
2 parents a6c1b1b + 79f3ce4 commit dde12eb

File tree

5 files changed

+1936
-8530
lines changed

5 files changed

+1936
-8530
lines changed

packages/client-discord/src/voice.ts

-5
Original file line numberDiff line numberDiff line change
@@ -416,11 +416,6 @@ export class VoiceManager extends EventEmitter {
416416
ServiceType.TRANSCRIPTION
417417
);
418418

419-
console.log(
420-
"transcriptionService: ",
421-
transcriptionService
422-
);
423-
424419
if (!transcriptionService) {
425420
throw new Error(
426421
"Transcription generation service not found"

packages/plugin-node/package.json

+1
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020
"cldr-segmentation": "2.2.1",
2121
"command-exists": "1.2.9",
2222
"csv-writer": "1.6.0",
23+
"echogarden": "^2.0.5",
2324
"espeak-ng": "1.0.2",
2425
"ffmpeg-static": "5.2.0",
2526
"fluent-ffmpeg": "2.1.3",
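(file path not captured in this extract; the hunks below modify the module that defines SpeechService)

+226
-116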
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,9 @@
11
import { PassThrough, Readable } from "stream";
2-
import {
3-
IAgentRuntime,
4-
ISpeechService,
5-
ITranscriptionService,
6-
ServiceType,
7-
} from "@ai16z/eliza";
2+
import { IAgentRuntime, ISpeechService, ServiceType } from "@ai16z/eliza";
83
import { getWavHeader } from "./audioUtils.ts";
9-
import { synthesize } from "../vendor/vits.ts";
104
import { Service } from "@ai16z/eliza";
115
import { validateNodeConfig } from "../enviroment.ts";
6+
import * as Echogarden from "echogarden";
127

138
function prependWavHeader(
149
readable: Readable,
@@ -40,77 +35,141 @@ function prependWavHeader(
4035

4136
async function textToSpeech(runtime: IAgentRuntime, text: string) {
4237
await validateNodeConfig(runtime);
43-
const body = {
44-
model_id: runtime.getSetting("ELEVENLABS_MODEL_ID"),
45-
text: text,
46-
voice_settings: {
47-
similarity_boost: runtime.getSetting(
48-
"ELEVENLABS_VOICE_SIMILARITY_BOOST"
49-
),
50-
stability: runtime.getSetting("ELEVENLABS_VOICE_STABILITY"),
51-
style: runtime.getSetting("ELEVENLABS_VOICE_STYLE"),
52-
use_speaker_boost: runtime.getSetting(
53-
"ELEVENLABS_VOICE_USE_SPEAKER_BOOST"
54-
),
55-
},
56-
};
57-
const options = {
58-
method: "POST",
59-
headers: {
60-
"Content-Type": "application/json",
61-
"xi-api-key": runtime.getSetting("ELEVENLABS_XI_API_KEY"),
62-
},
63-
body: JSON.stringify(body),
64-
};
65-
66-
const response = await fetch(
67-
`https://api.elevenlabs.io/v1/text-to-speech/${runtime.getSetting("ELEVENLABS_VOICE_ID")}/stream?optimize_streaming_latency=${runtime.getSetting("ELEVENLABS_OPTIMIZE_STREAMING_LATENCY")}&output_format=${runtime.getSetting("ELEVENLABS_OUTPUT_FORMAT")}`,
68-
options
69-
);
7038

71-
const status = response.status;
72-
if (status != 200) {
73-
console.log(`Received status ${status} from Eleven Labs API`);
74-
const errorBodyString = await response.text();
75-
throw new Error(
76-
`Received status ${status} from Eleven Labs API: ${errorBodyString}`
39+
try {
40+
const response = await fetch(
41+
`https://api.elevenlabs.io/v1/text-to-speech/${runtime.getSetting("ELEVENLABS_VOICE_ID")}/stream?optimize_streaming_latency=${runtime.getSetting("ELEVENLABS_OPTIMIZE_STREAMING_LATENCY")}&output_format=${runtime.getSetting("ELEVENLABS_OUTPUT_FORMAT")}`,
42+
{
43+
method: "POST",
44+
headers: {
45+
"Content-Type": "application/json",
46+
"xi-api-key": runtime.getSetting("ELEVENLABS_XI_API_KEY"),
47+
},
48+
body: JSON.stringify({
49+
model_id: runtime.getSetting("ELEVENLABS_MODEL_ID"),
50+
text: text,
51+
voice_settings: {
52+
similarity_boost: runtime.getSetting(
53+
"ELEVENLABS_VOICE_SIMILARITY_BOOST"
54+
),
55+
stability: runtime.getSetting(
56+
"ELEVENLABS_VOICE_STABILITY"
57+
),
58+
style: runtime.getSetting("ELEVENLABS_VOICE_STYLE"),
59+
use_speaker_boost: runtime.getSetting(
60+
"ELEVENLABS_VOICE_USE_SPEAKER_BOOST"
61+
),
62+
},
63+
}),
64+
}
7765
);
78-
}
7966

80-
if (response) {
81-
const reader = response.body?.getReader();
82-
const readable = new Readable({
83-
read() {
84-
reader &&
85-
reader.read().then(({ done, value }) => {
86-
if (done) {
87-
this.push(null);
88-
} else {
89-
this.push(value);
90-
}
91-
});
92-
},
93-
});
94-
95-
if (runtime.getSetting("ELEVENLABS_OUTPUT_FORMAT").startsWith("pcm_")) {
96-
const sampleRate = parseInt(
97-
runtime.getSetting("ELEVENLABS_OUTPUT_FORMAT").substring(4)
98-
);
99-
const withHeader = prependWavHeader(
100-
readable,
101-
1024 * 1024 * 100,
102-
sampleRate,
103-
1,
104-
16
67+
const status = response.status;
68+
if (status != 200) {
69+
const errorBodyString = await response.text();
70+
const errorBody = JSON.parse(errorBodyString);
71+
72+
// Check for quota exceeded error
73+
if (
74+
status === 401 &&
75+
errorBody.detail?.status === "quota_exceeded"
76+
) {
77+
console.log("ElevenLabs quota exceeded, falling back to VITS");
78+
throw new Error("QUOTA_EXCEEDED");
79+
}
80+
81+
throw new Error(
82+
`Received status ${status} from Eleven Labs API: ${errorBodyString}`
10583
);
106-
return withHeader;
84+
}
85+
86+
if (response) {
87+
const reader = response.body?.getReader();
88+
const readable = new Readable({
89+
read() {
90+
reader &&
91+
reader.read().then(({ done, value }) => {
92+
if (done) {
93+
this.push(null);
94+
} else {
95+
this.push(value);
96+
}
97+
});
98+
},
99+
});
100+
101+
if (
102+
runtime
103+
.getSetting("ELEVENLABS_OUTPUT_FORMAT")
104+
.startsWith("pcm_")
105+
) {
106+
const sampleRate = parseInt(
107+
runtime.getSetting("ELEVENLABS_OUTPUT_FORMAT").substring(4)
108+
);
109+
const withHeader = prependWavHeader(
110+
readable,
111+
1024 * 1024 * 100,
112+
sampleRate,
113+
1,
114+
16
115+
);
116+
return withHeader;
117+
} else {
118+
return readable;
119+
}
107120
} else {
108-
return readable;
121+
return new Readable({
122+
read() {},
123+
});
124+
}
125+
} catch (error) {
126+
if (error.message === "QUOTA_EXCEEDED") {
127+
// Fall back to VITS
128+
const { audio } = await Echogarden.synthesize(text, {
129+
engine: "vits",
130+
voice: "en_US-hfc_female-medium",
131+
});
132+
133+
let wavStream: Readable;
134+
if (audio instanceof Buffer) {
135+
console.log("audio is a buffer");
136+
wavStream = Readable.from(audio);
137+
} else if ("audioChannels" in audio && "sampleRate" in audio) {
138+
console.log("audio is a RawAudio");
139+
const floatBuffer = Buffer.from(audio.audioChannels[0].buffer);
140+
console.log("buffer length: ", floatBuffer.length);
141+
142+
// Get the sample rate from the RawAudio object
143+
const sampleRate = audio.sampleRate;
144+
145+
// Create a Float32Array view of the floatBuffer
146+
const floatArray = new Float32Array(floatBuffer.buffer);
147+
148+
// Convert 32-bit float audio to 16-bit PCM
149+
const pcmBuffer = new Int16Array(floatArray.length);
150+
for (let i = 0; i < floatArray.length; i++) {
151+
pcmBuffer[i] = Math.round(floatArray[i] * 32767);
152+
}
153+
154+
// Prepend WAV header to the buffer
155+
const wavHeaderBuffer = getWavHeader(
156+
pcmBuffer.length * 2,
157+
sampleRate,
158+
1,
159+
16
160+
);
161+
const wavBuffer = Buffer.concat([
162+
wavHeaderBuffer,
163+
Buffer.from(pcmBuffer.buffer),
164+
]);
165+
166+
wavStream = Readable.from(wavBuffer);
167+
} else {
168+
throw new Error("Unsupported audio format");
169+
}
170+
return wavStream;
109171
}
110-
} else {
111-
return new Readable({
112-
read() {},
113-
});
172+
throw error; // Re-throw other errors
114173
}
115174
}
116175

@@ -124,53 +183,104 @@ export class SpeechService extends Service implements ISpeechService {
124183
}
125184

126185
async generate(runtime: IAgentRuntime, text: string): Promise<Readable> {
127-
// check for elevenlabs API key
128-
if (runtime.getSetting("ELEVENLABS_XI_API_KEY")) {
129-
return textToSpeech(runtime, text);
130-
}
131-
const { audio } = await synthesize(text, {
132-
engine: "vits",
133-
voice: "en_US-hfc_female-medium",
134-
});
135-
136-
let wavStream: Readable;
137-
if (audio instanceof Buffer) {
138-
console.log("audio is a buffer");
139-
wavStream = Readable.from(audio);
140-
} else if ("audioChannels" in audio && "sampleRate" in audio) {
141-
console.log("audio is a RawAudio");
142-
const floatBuffer = Buffer.from(audio.audioChannels[0].buffer);
143-
console.log("buffer length: ", floatBuffer.length);
144-
145-
// Get the sample rate from the RawAudio object
146-
const sampleRate = audio.sampleRate;
147-
148-
// Create a Float32Array view of the floatBuffer
149-
const floatArray = new Float32Array(floatBuffer.buffer);
150-
151-
// Convert 32-bit float audio to 16-bit PCM
152-
const pcmBuffer = new Int16Array(floatArray.length);
153-
for (let i = 0; i < floatArray.length; i++) {
154-
pcmBuffer[i] = Math.round(floatArray[i] * 32767);
186+
try {
187+
// check for elevenlabs API key
188+
if (runtime.getSetting("ELEVENLABS_XI_API_KEY")) {
189+
return await textToSpeech(runtime, text);
155190
}
156191

157-
// Prepend WAV header to the buffer
158-
const wavHeaderBuffer = getWavHeader(
159-
pcmBuffer.length * 2,
160-
sampleRate,
161-
1,
162-
16
163-
);
164-
const wavBuffer = Buffer.concat([
165-
wavHeaderBuffer,
166-
Buffer.from(pcmBuffer.buffer),
167-
]);
192+
// Default to VITS if no ElevenLabs API key
193+
const { audio } = await Echogarden.synthesize(text, {
194+
engine: "vits",
195+
voice: "en_US-hfc_female-medium",
196+
});
168197

169-
wavStream = Readable.from(wavBuffer);
170-
} else {
171-
throw new Error("Unsupported audio format");
172-
}
198+
let wavStream: Readable;
199+
if (audio instanceof Buffer) {
200+
console.log("audio is a buffer");
201+
wavStream = Readable.from(audio);
202+
} else if ("audioChannels" in audio && "sampleRate" in audio) {
203+
console.log("audio is a RawAudio");
204+
const floatBuffer = Buffer.from(audio.audioChannels[0].buffer);
205+
console.log("buffer length: ", floatBuffer.length);
206+
207+
// Get the sample rate from the RawAudio object
208+
const sampleRate = audio.sampleRate;
209+
210+
// Create a Float32Array view of the floatBuffer
211+
const floatArray = new Float32Array(floatBuffer.buffer);
212+
213+
// Convert 32-bit float audio to 16-bit PCM
214+
const pcmBuffer = new Int16Array(floatArray.length);
215+
for (let i = 0; i < floatArray.length; i++) {
216+
pcmBuffer[i] = Math.round(floatArray[i] * 32767);
217+
}
218+
219+
// Prepend WAV header to the buffer
220+
const wavHeaderBuffer = getWavHeader(
221+
pcmBuffer.length * 2,
222+
sampleRate,
223+
1,
224+
16
225+
);
226+
const wavBuffer = Buffer.concat([
227+
wavHeaderBuffer,
228+
Buffer.from(pcmBuffer.buffer),
229+
]);
230+
231+
wavStream = Readable.from(wavBuffer);
232+
} else {
233+
throw new Error("Unsupported audio format");
234+
}
173235

174-
return wavStream;
236+
return wavStream;
237+
} catch (error) {
238+
console.error("Speech generation error:", error);
239+
// If ElevenLabs fails for any reason, fall back to VITS
240+
const { audio } = await Echogarden.synthesize(text, {
241+
engine: "vits",
242+
voice: "en_US-hfc_female-medium",
243+
});
244+
245+
let wavStream: Readable;
246+
if (audio instanceof Buffer) {
247+
console.log("audio is a buffer");
248+
wavStream = Readable.from(audio);
249+
} else if ("audioChannels" in audio && "sampleRate" in audio) {
250+
console.log("audio is a RawAudio");
251+
const floatBuffer = Buffer.from(audio.audioChannels[0].buffer);
252+
console.log("buffer length: ", floatBuffer.length);
253+
254+
// Get the sample rate from the RawAudio object
255+
const sampleRate = audio.sampleRate;
256+
257+
// Create a Float32Array view of the floatBuffer
258+
const floatArray = new Float32Array(floatBuffer.buffer);
259+
260+
// Convert 32-bit float audio to 16-bit PCM
261+
const pcmBuffer = new Int16Array(floatArray.length);
262+
for (let i = 0; i < floatArray.length; i++) {
263+
pcmBuffer[i] = Math.round(floatArray[i] * 32767);
264+
}
265+
266+
// Prepend WAV header to the buffer
267+
const wavHeaderBuffer = getWavHeader(
268+
pcmBuffer.length * 2,
269+
sampleRate,
270+
1,
271+
16
272+
);
273+
const wavBuffer = Buffer.concat([
274+
wavHeaderBuffer,
275+
Buffer.from(pcmBuffer.buffer),
276+
]);
277+
278+
wavStream = Readable.from(wavBuffer);
279+
} else {
280+
throw new Error("Unsupported audio format");
281+
}
282+
283+
return wavStream;
284+
}
175285
}
176286
}

0 commit comments

Comments
 (0)