1
1
import { PassThrough , Readable } from "stream" ;
2
- import {
3
- IAgentRuntime ,
4
- ISpeechService ,
5
- ITranscriptionService ,
6
- ServiceType ,
7
- } from "@ai16z/eliza" ;
2
+ import { IAgentRuntime , ISpeechService , ServiceType } from "@ai16z/eliza" ;
8
3
import { getWavHeader } from "./audioUtils.ts" ;
9
- import { synthesize } from "../vendor/vits.ts" ;
10
4
import { Service } from "@ai16z/eliza" ;
11
5
import { validateNodeConfig } from "../enviroment.ts" ;
6
+ import * as Echogarden from "echogarden" ;
12
7
13
8
function prependWavHeader (
14
9
readable : Readable ,
@@ -40,77 +35,142 @@ function prependWavHeader(
40
35
41
36
/**
 * Synthesizes `text` locally with the Echogarden VITS engine and returns it
 * as a WAV byte stream.
 *
 * Echogarden may hand back either an already-encoded Buffer or a raw audio
 * object (`audioChannels` + `sampleRate`); the latter is converted from
 * 32-bit float samples to 16-bit PCM and prefixed with a WAV header.
 *
 * @param text - The text to speak.
 * @returns A readable stream of WAV audio.
 * @throws Error("Unsupported audio format") when the result is neither shape.
 */
async function vitsFallbackToWavStream(text: string): Promise<Readable> {
    const { audio } = await Echogarden.synthesize(text, {
        engine: "vits",
        voice: "en_US-hfc_female-medium",
    });

    if (audio instanceof Buffer) {
        console.log("audio is a buffer");
        return Readable.from(audio);
    }

    if ("audioChannels" in audio && "sampleRate" in audio) {
        console.log("audio is a RawAudio");
        const samples = audio.audioChannels[0];
        if (!samples) {
            throw new Error("Unsupported audio format");
        }
        console.log("buffer length: ", samples.byteLength);

        // Iterate the channel view itself (not its underlying ArrayBuffer) so a
        // non-zero byteOffset or a shorter view length is respected.
        const pcm = new Int16Array(samples.length);
        for (let i = 0; i < samples.length; i++) {
            // Clamp to [-1, 1]; out-of-range floats would otherwise wrap
            // around when stored into the Int16Array.
            const clamped = Math.max(-1, Math.min(1, samples[i]));
            pcm[i] = Math.round(clamped * 32767);
        }

        // 2 bytes per sample, mono, 16-bit.
        const wavHeader = getWavHeader(
            pcm.length * 2,
            audio.sampleRate,
            1,
            16
        );
        const wavBuffer = Buffer.concat([
            wavHeader,
            Buffer.from(pcm.buffer, pcm.byteOffset, pcm.byteLength),
        ]);
        return Readable.from(wavBuffer);
    }

    throw new Error("Unsupported audio format");
}

/**
 * Streams speech for `text` from the ElevenLabs text-to-speech API.
 *
 * All ElevenLabs parameters (voice, model, output format, API key, ...) are
 * read from `runtime.getSetting`. When the configured output format starts
 * with `pcm_`, the raw PCM stream is wrapped with a WAV header whose sample
 * rate is taken from the format suffix.
 *
 * If the API answers 401 with `detail.status === "quota_exceeded"`, the text
 * is synthesized locally with VITS instead. Every other failure is thrown to
 * the caller.
 *
 * @param runtime - Agent runtime providing the ElevenLabs settings.
 * @param text - The text to speak.
 * @returns A readable stream of audio bytes.
 * @throws Error when the API returns a non-200 status (other than the quota
 *         case above) or a response without a body.
 */
async function textToSpeech(runtime: IAgentRuntime, text: string) {
    await validateNodeConfig(runtime);

    try {
        const outputFormat = runtime.getSetting("ELEVENLABS_OUTPUT_FORMAT");

        const body = {
            model_id: runtime.getSetting("ELEVENLABS_MODEL_ID"),
            text: text,
            voice_settings: {
                similarity_boost: runtime.getSetting(
                    "ELEVENLABS_VOICE_SIMILARITY_BOOST"
                ),
                stability: runtime.getSetting("ELEVENLABS_VOICE_STABILITY"),
                style: runtime.getSetting("ELEVENLABS_VOICE_STYLE"),
                use_speaker_boost: runtime.getSetting(
                    "ELEVENLABS_VOICE_USE_SPEAKER_BOOST"
                ),
            },
        };
        const options = {
            method: "POST",
            headers: {
                "Content-Type": "application/json",
                "xi-api-key": runtime.getSetting("ELEVENLABS_XI_API_KEY"),
            },
            body: JSON.stringify(body),
        };

        const response = await fetch(
            `https://api.elevenlabs.io/v1/text-to-speech/${runtime.getSetting("ELEVENLABS_VOICE_ID")}/stream?optimize_streaming_latency=${runtime.getSetting("ELEVENLABS_OPTIMIZE_STREAMING_LATENCY")}&output_format=${outputFormat}`,
            options
        );

        const status = response.status;
        if (status !== 200) {
            const errorBodyString = await response.text();

            // The error body is not guaranteed to be JSON (e.g. a proxy's
            // HTML error page); a parse failure must not hide the status.
            let errorBody: { detail?: { status?: string } } | null = null;
            try {
                errorBody = JSON.parse(errorBodyString);
            } catch {
                errorBody = null;
            }

            // Check for quota exceeded error
            if (
                status === 401 &&
                errorBody?.detail?.status === "quota_exceeded"
            ) {
                console.log("ElevenLabs quota exceeded, falling back to VITS");
                throw new Error("QUOTA_EXCEEDED");
            }

            throw new Error(
                `Received status ${status} from Eleven Labs API: ${errorBodyString}`
            );
        }

        if (!response.body) {
            // Without a body there is nothing to stream; failing here avoids
            // handing the caller a stream that never ends.
            throw new Error("Eleven Labs API returned no response body");
        }

        const reader = response.body.getReader();
        const readable = new Readable({
            read() {
                reader.read().then(
                    ({ done, value }) => {
                        this.push(done ? null : value);
                    },
                    (err: unknown) => {
                        // Surface network/read failures as a stream error
                        // instead of an unhandled rejection and a stalled stream.
                        this.destroy(
                            err instanceof Error ? err : new Error(String(err))
                        );
                    }
                );
            },
            destroy(err, callback) {
                // Release the underlying HTTP body when the consumer stops early.
                reader.cancel().then(
                    () => callback(err),
                    () => callback(err)
                );
            },
        });

        if (outputFormat?.startsWith("pcm_")) {
            // Format looks like "pcm_<sampleRate>", e.g. "pcm_16000".
            const sampleRate = parseInt(outputFormat.substring(4), 10);
            return prependWavHeader(
                readable,
                1024 * 1024 * 100,
                sampleRate,
                1,
                16
            );
        }
        return readable;
    } catch (error: unknown) {
        if (error instanceof Error && error.message === "QUOTA_EXCEEDED") {
            // Fall back to VITS
            return await vitsFallbackToWavStream(text);
        }
        throw error; // Re-throw other errors
    }
}
116
176
@@ -124,53 +184,104 @@ export class SpeechService extends Service implements ISpeechService {
124
184
}
125
185
126
186
/**
 * Produces speech audio for `text`.
 *
 * When the `ELEVENLABS_XI_API_KEY` setting is present, ElevenLabs is tried
 * first via `textToSpeech`. If that call fails for any reason the error is
 * logged and synthesis falls through to the local Echogarden VITS engine,
 * which is also the default when no API key is configured.
 *
 * The VITS path runs at most once; if it fails, its error is thrown to the
 * caller rather than being retried with the identical call.
 *
 * @param runtime - Agent runtime used to read settings.
 * @param text - The text to speak.
 * @returns A readable stream of audio bytes (WAV on the VITS path).
 * @throws Error("Unsupported audio format") when VITS returns neither a
 *         Buffer nor a raw audio object.
 */
async generate(runtime: IAgentRuntime, text: string): Promise<Readable> {
    // check for elevenlabs API key
    if (runtime.getSetting("ELEVENLABS_XI_API_KEY")) {
        try {
            return await textToSpeech(runtime, text);
        } catch (error: unknown) {
            console.error("Speech generation error:", error);
            // If ElevenLabs fails for any reason, fall back to VITS below
        }
    }

    // Default to VITS if no ElevenLabs API key (or ElevenLabs failed)
    const { audio } = await Echogarden.synthesize(text, {
        engine: "vits",
        voice: "en_US-hfc_female-medium",
    });

    if (audio instanceof Buffer) {
        console.log("audio is a buffer");
        return Readable.from(audio);
    }

    if ("audioChannels" in audio && "sampleRate" in audio) {
        console.log("audio is a RawAudio");
        const samples = audio.audioChannels[0];
        if (!samples) {
            throw new Error("Unsupported audio format");
        }
        console.log("buffer length: ", samples.byteLength);

        // Convert 32-bit float audio to 16-bit PCM. Read through the channel
        // view itself so its byteOffset/length are honoured, and clamp to
        // [-1, 1] so out-of-range samples do not wrap in the Int16Array.
        const pcmBuffer = new Int16Array(samples.length);
        for (let i = 0; i < samples.length; i++) {
            const clamped = Math.max(-1, Math.min(1, samples[i]));
            pcmBuffer[i] = Math.round(clamped * 32767);
        }

        // Prepend WAV header to the buffer (2 bytes per sample, mono, 16-bit)
        const wavHeaderBuffer = getWavHeader(
            pcmBuffer.length * 2,
            audio.sampleRate,
            1,
            16
        );
        const wavBuffer = Buffer.concat([
            wavHeaderBuffer,
            Buffer.from(
                pcmBuffer.buffer,
                pcmBuffer.byteOffset,
                pcmBuffer.byteLength
            ),
        ]);

        return Readable.from(wavBuffer);
    }

    throw new Error("Unsupported audio format");
}
176
287
}
0 commit comments