Skip to content

Commit e1a838c

Browse files
speech service fix
1 parent a6c1b1b commit e1a838c

File tree

5 files changed

+1938
-8531
lines changed

5 files changed

+1938
-8531
lines changed

packages/client-discord/src/voice.ts

-5
Original file line number | Diff line number | Diff line change
@@ -416,11 +416,6 @@ export class VoiceManager extends EventEmitter {
416416
ServiceType.TRANSCRIPTION
417417
);
418418

419-
console.log(
420-
"transcriptionService: ",
421-
transcriptionService
422-
);
423-
424419
if (!transcriptionService) {
425420
throw new Error(
426421
"Transcription generation service not found"

packages/plugin-node/package.json

+1
Original file line number | Diff line number | Diff line change
@@ -20,6 +20,7 @@
2020
"cldr-segmentation": "2.2.1",
2121
"command-exists": "1.2.9",
2222
"csv-writer": "1.6.0",
23+
"echogarden": "^2.0.5",
2324
"espeak-ng": "1.0.2",
2425
"ffmpeg-static": "5.2.0",
2526
"fluent-ffmpeg": "2.1.3",
+226 -115
Original file line number | Diff line number | Diff line change
@@ -1,14 +1,9 @@
11
import { PassThrough, Readable } from "stream";
2-
import {
3-
IAgentRuntime,
4-
ISpeechService,
5-
ITranscriptionService,
6-
ServiceType,
7-
} from "@ai16z/eliza";
2+
import { IAgentRuntime, ISpeechService, ServiceType } from "@ai16z/eliza";
83
import { getWavHeader } from "./audioUtils.ts";
9-
import { synthesize } from "../vendor/vits.ts";
104
import { Service } from "@ai16z/eliza";
115
import { validateNodeConfig } from "../enviroment.ts";
6+
import * as Echogarden from "echogarden";
127

138
function prependWavHeader(
149
readable: Readable,
@@ -40,77 +35,142 @@ function prependWavHeader(
4035

4136
async function textToSpeech(runtime: IAgentRuntime, text: string) {
4237
await validateNodeConfig(runtime);
43-
const body = {
44-
model_id: runtime.getSetting("ELEVENLABS_MODEL_ID"),
45-
text: text,
46-
voice_settings: {
47-
similarity_boost: runtime.getSetting(
48-
"ELEVENLABS_VOICE_SIMILARITY_BOOST"
49-
),
50-
stability: runtime.getSetting("ELEVENLABS_VOICE_STABILITY"),
51-
style: runtime.getSetting("ELEVENLABS_VOICE_STYLE"),
52-
use_speaker_boost: runtime.getSetting(
53-
"ELEVENLABS_VOICE_USE_SPEAKER_BOOST"
54-
),
55-
},
56-
};
57-
const options = {
58-
method: "POST",
59-
headers: {
60-
"Content-Type": "application/json",
61-
"xi-api-key": runtime.getSetting("ELEVENLABS_XI_API_KEY"),
62-
},
63-
body: JSON.stringify(body),
64-
};
65-
66-
const response = await fetch(
67-
`https://api.elevenlabs.io/v1/text-to-speech/${runtime.getSetting("ELEVENLABS_VOICE_ID")}/stream?optimize_streaming_latency=${runtime.getSetting("ELEVENLABS_OPTIMIZE_STREAMING_LATENCY")}&output_format=${runtime.getSetting("ELEVENLABS_OUTPUT_FORMAT")}`,
68-
options
69-
);
7038

71-
const status = response.status;
72-
if (status != 200) {
73-
console.log(`Received status ${status} from Eleven Labs API`);
74-
const errorBodyString = await response.text();
75-
throw new Error(
76-
`Received status ${status} from Eleven Labs API: ${errorBodyString}`
39+
try {
40+
const body = {
41+
model_id: runtime.getSetting("ELEVENLABS_MODEL_ID"),
42+
text: text,
43+
voice_settings: {
44+
similarity_boost: runtime.getSetting(
45+
"ELEVENLABS_VOICE_SIMILARITY_BOOST"
46+
),
47+
stability: runtime.getSetting("ELEVENLABS_VOICE_STABILITY"),
48+
style: runtime.getSetting("ELEVENLABS_VOICE_STYLE"),
49+
use_speaker_boost: runtime.getSetting(
50+
"ELEVENLABS_VOICE_USE_SPEAKER_BOOST"
51+
),
52+
},
53+
};
54+
const options = {
55+
method: "POST",
56+
headers: {
57+
"Content-Type": "application/json",
58+
"xi-api-key": runtime.getSetting("ELEVENLABS_XI_API_KEY"),
59+
},
60+
body: JSON.stringify(body),
61+
};
62+
63+
const response = await fetch(
64+
`https://api.elevenlabs.io/v1/text-to-speech/${runtime.getSetting("ELEVENLABS_VOICE_ID")}/stream?optimize_streaming_latency=${runtime.getSetting("ELEVENLABS_OPTIMIZE_STREAMING_LATENCY")}&output_format=${runtime.getSetting("ELEVENLABS_OUTPUT_FORMAT")}`,
65+
options
7766
);
78-
}
7967

80-
if (response) {
81-
const reader = response.body?.getReader();
82-
const readable = new Readable({
83-
read() {
84-
reader &&
85-
reader.read().then(({ done, value }) => {
86-
if (done) {
87-
this.push(null);
88-
} else {
89-
this.push(value);
90-
}
91-
});
92-
},
93-
});
68+
const status = response.status;
69+
if (status != 200) {
70+
const errorBodyString = await response.text();
71+
const errorBody = JSON.parse(errorBodyString);
9472

95-
if (runtime.getSetting("ELEVENLABS_OUTPUT_FORMAT").startsWith("pcm_")) {
96-
const sampleRate = parseInt(
97-
runtime.getSetting("ELEVENLABS_OUTPUT_FORMAT").substring(4)
98-
);
99-
const withHeader = prependWavHeader(
100-
readable,
101-
1024 * 1024 * 100,
102-
sampleRate,
103-
1,
104-
16
73+
// Check for quota exceeded error
74+
if (
75+
status === 401 &&
76+
errorBody.detail?.status === "quota_exceeded"
77+
) {
78+
console.log("ElevenLabs quota exceeded, falling back to VITS");
79+
throw new Error("QUOTA_EXCEEDED");
80+
}
81+
82+
throw new Error(
83+
`Received status ${status} from Eleven Labs API: ${errorBodyString}`
10584
);
106-
return withHeader;
85+
}
86+
87+
if (response) {
88+
const reader = response.body?.getReader();
89+
const readable = new Readable({
90+
read() {
91+
reader &&
92+
reader.read().then(({ done, value }) => {
93+
if (done) {
94+
this.push(null);
95+
} else {
96+
this.push(value);
97+
}
98+
});
99+
},
100+
});
101+
102+
if (
103+
runtime
104+
.getSetting("ELEVENLABS_OUTPUT_FORMAT")
105+
.startsWith("pcm_")
106+
) {
107+
const sampleRate = parseInt(
108+
runtime.getSetting("ELEVENLABS_OUTPUT_FORMAT").substring(4)
109+
);
110+
const withHeader = prependWavHeader(
111+
readable,
112+
1024 * 1024 * 100,
113+
sampleRate,
114+
1,
115+
16
116+
);
117+
return withHeader;
118+
} else {
119+
return readable;
120+
}
107121
} else {
108-
return readable;
122+
return new Readable({
123+
read() {},
124+
});
125+
}
126+
} catch (error) {
127+
if (error.message === "QUOTA_EXCEEDED") {
128+
// Fall back to VITS
129+
const { audio } = await Echogarden.synthesize(text, {
130+
engine: "vits",
131+
voice: "en_US-hfc_female-medium",
132+
});
133+
134+
let wavStream: Readable;
135+
if (audio instanceof Buffer) {
136+
console.log("audio is a buffer");
137+
wavStream = Readable.from(audio);
138+
} else if ("audioChannels" in audio && "sampleRate" in audio) {
139+
console.log("audio is a RawAudio");
140+
const floatBuffer = Buffer.from(audio.audioChannels[0].buffer);
141+
console.log("buffer length: ", floatBuffer.length);
142+
143+
// Get the sample rate from the RawAudio object
144+
const sampleRate = audio.sampleRate;
145+
146+
// Create a Float32Array view of the floatBuffer
147+
const floatArray = new Float32Array(floatBuffer.buffer);
148+
149+
// Convert 32-bit float audio to 16-bit PCM
150+
const pcmBuffer = new Int16Array(floatArray.length);
151+
for (let i = 0; i < floatArray.length; i++) {
152+
pcmBuffer[i] = Math.round(floatArray[i] * 32767);
153+
}
154+
155+
// Prepend WAV header to the buffer
156+
const wavHeaderBuffer = getWavHeader(
157+
pcmBuffer.length * 2,
158+
sampleRate,
159+
1,
160+
16
161+
);
162+
const wavBuffer = Buffer.concat([
163+
wavHeaderBuffer,
164+
Buffer.from(pcmBuffer.buffer),
165+
]);
166+
167+
wavStream = Readable.from(wavBuffer);
168+
} else {
169+
throw new Error("Unsupported audio format");
170+
}
171+
return wavStream;
109172
}
110-
} else {
111-
return new Readable({
112-
read() {},
113-
});
173+
throw error; // Re-throw other errors
114174
}
115175
}
116176

@@ -124,53 +184,104 @@ export class SpeechService extends Service implements ISpeechService {
124184
}
125185

126186
async generate(runtime: IAgentRuntime, text: string): Promise<Readable> {
127-
// check for elevenlabs API key
128-
if (runtime.getSetting("ELEVENLABS_XI_API_KEY")) {
129-
return textToSpeech(runtime, text);
130-
}
131-
const { audio } = await synthesize(text, {
132-
engine: "vits",
133-
voice: "en_US-hfc_female-medium",
134-
});
135-
136-
let wavStream: Readable;
137-
if (audio instanceof Buffer) {
138-
console.log("audio is a buffer");
139-
wavStream = Readable.from(audio);
140-
} else if ("audioChannels" in audio && "sampleRate" in audio) {
141-
console.log("audio is a RawAudio");
142-
const floatBuffer = Buffer.from(audio.audioChannels[0].buffer);
143-
console.log("buffer length: ", floatBuffer.length);
144-
145-
// Get the sample rate from the RawAudio object
146-
const sampleRate = audio.sampleRate;
147-
148-
// Create a Float32Array view of the floatBuffer
149-
const floatArray = new Float32Array(floatBuffer.buffer);
150-
151-
// Convert 32-bit float audio to 16-bit PCM
152-
const pcmBuffer = new Int16Array(floatArray.length);
153-
for (let i = 0; i < floatArray.length; i++) {
154-
pcmBuffer[i] = Math.round(floatArray[i] * 32767);
187+
try {
188+
// check for elevenlabs API key
189+
if (runtime.getSetting("ELEVENLABS_XI_API_KEY")) {
190+
return await textToSpeech(runtime, text);
155191
}
156192

157-
// Prepend WAV header to the buffer
158-
const wavHeaderBuffer = getWavHeader(
159-
pcmBuffer.length * 2,
160-
sampleRate,
161-
1,
162-
16
163-
);
164-
const wavBuffer = Buffer.concat([
165-
wavHeaderBuffer,
166-
Buffer.from(pcmBuffer.buffer),
167-
]);
193+
// Default to VITS if no ElevenLabs API key
194+
const { audio } = await Echogarden.synthesize(text, {
195+
engine: "vits",
196+
voice: "en_US-hfc_female-medium",
197+
});
168198

169-
wavStream = Readable.from(wavBuffer);
170-
} else {
171-
throw new Error("Unsupported audio format");
172-
}
199+
let wavStream: Readable;
200+
if (audio instanceof Buffer) {
201+
console.log("audio is a buffer");
202+
wavStream = Readable.from(audio);
203+
} else if ("audioChannels" in audio && "sampleRate" in audio) {
204+
console.log("audio is a RawAudio");
205+
const floatBuffer = Buffer.from(audio.audioChannels[0].buffer);
206+
console.log("buffer length: ", floatBuffer.length);
207+
208+
// Get the sample rate from the RawAudio object
209+
const sampleRate = audio.sampleRate;
210+
211+
// Create a Float32Array view of the floatBuffer
212+
const floatArray = new Float32Array(floatBuffer.buffer);
213+
214+
// Convert 32-bit float audio to 16-bit PCM
215+
const pcmBuffer = new Int16Array(floatArray.length);
216+
for (let i = 0; i < floatArray.length; i++) {
217+
pcmBuffer[i] = Math.round(floatArray[i] * 32767);
218+
}
219+
220+
// Prepend WAV header to the buffer
221+
const wavHeaderBuffer = getWavHeader(
222+
pcmBuffer.length * 2,
223+
sampleRate,
224+
1,
225+
16
226+
);
227+
const wavBuffer = Buffer.concat([
228+
wavHeaderBuffer,
229+
Buffer.from(pcmBuffer.buffer),
230+
]);
231+
232+
wavStream = Readable.from(wavBuffer);
233+
} else {
234+
throw new Error("Unsupported audio format");
235+
}
236+
237+
return wavStream;
238+
} catch (error) {
239+
console.error("Speech generation error:", error);
240+
// If ElevenLabs fails for any reason, fall back to VITS
241+
const { audio } = await Echogarden.synthesize(text, {
242+
engine: "vits",
243+
voice: "en_US-hfc_female-medium",
244+
});
245+
246+
let wavStream: Readable;
247+
if (audio instanceof Buffer) {
248+
console.log("audio is a buffer");
249+
wavStream = Readable.from(audio);
250+
} else if ("audioChannels" in audio && "sampleRate" in audio) {
251+
console.log("audio is a RawAudio");
252+
const floatBuffer = Buffer.from(audio.audioChannels[0].buffer);
253+
console.log("buffer length: ", floatBuffer.length);
254+
255+
// Get the sample rate from the RawAudio object
256+
const sampleRate = audio.sampleRate;
257+
258+
// Create a Float32Array view of the floatBuffer
259+
const floatArray = new Float32Array(floatBuffer.buffer);
173260

174-
return wavStream;
261+
// Convert 32-bit float audio to 16-bit PCM
262+
const pcmBuffer = new Int16Array(floatArray.length);
263+
for (let i = 0; i < floatArray.length; i++) {
264+
pcmBuffer[i] = Math.round(floatArray[i] * 32767);
265+
}
266+
267+
// Prepend WAV header to the buffer
268+
const wavHeaderBuffer = getWavHeader(
269+
pcmBuffer.length * 2,
270+
sampleRate,
271+
1,
272+
16
273+
);
274+
const wavBuffer = Buffer.concat([
275+
wavHeaderBuffer,
276+
Buffer.from(pcmBuffer.buffer),
277+
]);
278+
279+
wavStream = Readable.from(wavBuffer);
280+
} else {
281+
throw new Error("Unsupported audio format");
282+
}
283+
284+
return wavStream;
285+
}
175286
}
176287
}

0 commit comments

Comments
 (0)