1
1
import { PassThrough , Readable } from "stream" ;
2
- import {
3
- IAgentRuntime ,
4
- ISpeechService ,
5
- ITranscriptionService ,
6
- ServiceType ,
7
- } from "@ai16z/eliza" ;
2
+ import { IAgentRuntime , ISpeechService , ServiceType } from "@ai16z/eliza" ;
8
3
import { getWavHeader } from "./audioUtils.ts" ;
9
- import { synthesize } from "../vendor/vits.ts" ;
10
4
import { Service } from "@ai16z/eliza" ;
11
5
import { validateNodeConfig } from "../enviroment.ts" ;
6
+ import * as Echogarden from "echogarden" ;
12
7
13
8
function prependWavHeader (
14
9
readable : Readable ,
@@ -40,77 +35,141 @@ function prependWavHeader(
40
35
41
36
/**
 * Synthesizes `text` through the ElevenLabs streaming text-to-speech endpoint.
 *
 * Voice, model, latency, output format and API key are all read from the
 * `ELEVENLABS_*` runtime settings. When the configured output format starts
 * with `pcm_`, the response stream is passed through `prependWavHeader`
 * (sample rate parsed from the format suffix, 1 channel, 16 bits).
 *
 * If ElevenLabs answers 401 with `detail.status === "quota_exceeded"`, the
 * text is synthesized locally with Echogarden's VITS engine instead.
 *
 * @param runtime - Agent runtime used to validate config and read settings.
 * @param text - The text to speak.
 * @returns A readable stream of audio bytes.
 * @throws Error when ElevenLabs returns any other non-200 status, or when the
 *   VITS fallback yields an audio shape this function does not understand.
 */
async function textToSpeech(
    runtime: IAgentRuntime,
    text: string
): Promise<Readable> {
    await validateNodeConfig(runtime);

    const response = await fetch(
        `https://api.elevenlabs.io/v1/text-to-speech/${runtime.getSetting("ELEVENLABS_VOICE_ID")}/stream?optimize_streaming_latency=${runtime.getSetting("ELEVENLABS_OPTIMIZE_STREAMING_LATENCY")}&output_format=${runtime.getSetting("ELEVENLABS_OUTPUT_FORMAT")}`,
        {
            method: "POST",
            headers: {
                "Content-Type": "application/json",
                "xi-api-key": runtime.getSetting("ELEVENLABS_XI_API_KEY"),
            },
            body: JSON.stringify({
                model_id: runtime.getSetting("ELEVENLABS_MODEL_ID"),
                text: text,
                voice_settings: {
                    similarity_boost: runtime.getSetting(
                        "ELEVENLABS_VOICE_SIMILARITY_BOOST"
                    ),
                    stability: runtime.getSetting(
                        "ELEVENLABS_VOICE_STABILITY"
                    ),
                    style: runtime.getSetting("ELEVENLABS_VOICE_STYLE"),
                    use_speaker_boost: runtime.getSetting(
                        "ELEVENLABS_VOICE_USE_SPEAKER_BOOST"
                    ),
                },
            }),
        }
    );

    const status = response.status;
    if (status !== 200) {
        const errorBodyString = await response.text();

        // Check for quota exceeded error
        if (status === 401 && isElevenLabsQuotaError(errorBodyString)) {
            console.log("ElevenLabs quota exceeded, falling back to VITS");
            return synthesizeWithVits(text);
        }

        throw new Error(
            `Received status ${status} from Eleven Labs API: ${errorBodyString}`
        );
    }

    const reader = response.body?.getReader();
    const readable = new Readable({
        read() {
            if (!reader) {
                // No body to read from: end the stream instead of leaving
                // the consumer waiting forever.
                this.push(null);
                return;
            }
            reader
                .read()
                .then(({ done, value }) => {
                    if (done) {
                        this.push(null);
                    } else {
                        this.push(value);
                    }
                })
                .catch((err: unknown) => {
                    // Surface network failures to the consumer rather than
                    // hanging with an unhandled rejection.
                    this.destroy(
                        err instanceof Error ? err : new Error(String(err))
                    );
                });
        },
    });

    const outputFormat = runtime.getSetting("ELEVENLABS_OUTPUT_FORMAT") ?? "";
    if (outputFormat.startsWith("pcm_")) {
        const sampleRate = parseInt(outputFormat.substring(4), 10);
        // The total length is not known while streaming, so a 100 MiB
        // placeholder is passed — presumably as the declared data size;
        // confirm against prependWavHeader.
        return prependWavHeader(readable, 1024 * 1024 * 100, sampleRate, 1, 16);
    }
    return readable;
}

/**
 * Reports whether an ElevenLabs error body carries
 * `detail.status === "quota_exceeded"`. Bodies that are not JSON, or not
 * objects, are treated as "not a quota error" instead of throwing.
 */
function isElevenLabsQuotaError(errorBodyString: string): boolean {
    try {
        const parsed: unknown = JSON.parse(errorBodyString);
        if (typeof parsed !== "object" || parsed === null) {
            return false;
        }
        const detail = (parsed as { detail?: unknown }).detail;
        return (
            typeof detail === "object" &&
            detail !== null &&
            (detail as { status?: unknown }).status === "quota_exceeded"
        );
    } catch {
        return false;
    }
}

/**
 * Synthesizes `text` with Echogarden's VITS engine and returns the audio as a
 * readable stream. A Buffer result is streamed as-is; a RawAudio result is
 * converted to a mono 16-bit WAV first.
 */
async function synthesizeWithVits(text: string): Promise<Readable> {
    const { audio } = await Echogarden.synthesize(text, {
        engine: "vits",
        voice: "en_US-hfc_female-medium",
    });

    if (audio instanceof Buffer) {
        console.log("audio is a buffer");
        return Readable.from(audio);
    }
    if ("audioChannels" in audio && "sampleRate" in audio) {
        console.log("audio is a RawAudio");
        return Readable.from(encodeMonoWav(audio));
    }
    throw new Error("Unsupported audio format");
}

/**
 * Encodes the first channel of float audio as a 16-bit PCM WAV buffer
 * (header from `getWavHeader` followed by the samples).
 */
function encodeMonoWav(audio: {
    audioChannels: Float32Array[];
    sampleRate: number;
}): Buffer {
    const samples = audio.audioChannels[0];
    if (!samples) {
        throw new Error("Unsupported audio format");
    }
    console.log("buffer length: ", samples.byteLength);

    // Convert 32-bit float audio to 16-bit PCM. Samples are clamped to
    // [-1, 1] first: Int16Array stores out-of-range values modulo 2^16, so
    // an unclamped overshoot would wrap around into a loud click.
    const pcmBuffer = new Int16Array(samples.length);
    for (let i = 0; i < samples.length; i++) {
        const clamped = Math.max(-1, Math.min(1, samples[i]));
        pcmBuffer[i] = Math.round(clamped * 32767);
    }

    // Prepend WAV header to the buffer
    const wavHeaderBuffer = getWavHeader(
        pcmBuffer.length * 2,
        audio.sampleRate,
        1,
        16
    );
    return Buffer.concat([
        wavHeaderBuffer,
        Buffer.from(pcmBuffer.buffer, pcmBuffer.byteOffset, pcmBuffer.byteLength),
    ]);
}
116
175
@@ -124,53 +183,104 @@ export class SpeechService extends Service implements ISpeechService {
124
183
}
125
184
126
185
/**
 * Produces speech audio for `text`.
 *
 * When the `ELEVENLABS_XI_API_KEY` setting is present, ElevenLabs is tried
 * first via `textToSpeech`; if that throws for any reason the error is logged
 * and the text is synthesized locally with Echogarden's VITS engine. Without
 * an API key, VITS is used directly.
 *
 * @param runtime - Agent runtime used to read settings.
 * @param text - The text to speak.
 * @returns A readable stream of audio bytes.
 * @throws Error when VITS synthesis fails or returns an audio shape that is
 *   neither a Buffer nor a RawAudio-like object.
 */
async generate(runtime: IAgentRuntime, text: string): Promise<Readable> {
    // Single VITS path, shared by the default case and the ElevenLabs
    // fallback. Kept as a closure so the method does not rely on `this`.
    const synthesizeWithVits = async (): Promise<Readable> => {
        const { audio } = await Echogarden.synthesize(text, {
            engine: "vits",
            voice: "en_US-hfc_female-medium",
        });

        if (audio instanceof Buffer) {
            console.log("audio is a buffer");
            return Readable.from(audio);
        }
        if (!("audioChannels" in audio && "sampleRate" in audio)) {
            throw new Error("Unsupported audio format");
        }

        console.log("audio is a RawAudio");
        // Only the first channel is encoded; the header declares 1 channel.
        const samples: Float32Array | undefined = audio.audioChannels[0];
        if (!samples) {
            throw new Error("Unsupported audio format");
        }
        console.log("buffer length: ", samples.byteLength);

        // Convert 32-bit float audio to 16-bit PCM. Clamp to [-1, 1] first:
        // Int16Array stores out-of-range values modulo 2^16, so an unclamped
        // overshoot would wrap around.
        const pcmBuffer = Int16Array.from(samples, (sample) =>
            Math.round(Math.max(-1, Math.min(1, sample)) * 32767)
        );

        // Prepend WAV header to the buffer
        const wavHeaderBuffer = getWavHeader(
            pcmBuffer.length * 2,
            audio.sampleRate,
            1,
            16
        );
        const wavBuffer = Buffer.concat([
            wavHeaderBuffer,
            Buffer.from(
                pcmBuffer.buffer,
                pcmBuffer.byteOffset,
                pcmBuffer.byteLength
            ),
        ]);
        return Readable.from(wavBuffer);
    };

    // check for elevenlabs API key
    if (runtime.getSetting("ELEVENLABS_XI_API_KEY")) {
        try {
            return await textToSpeech(runtime, text);
        } catch (error) {
            // If ElevenLabs fails for any reason, fall back to VITS
            console.error("Speech generation error:", error);
        }
    }

    // Default to VITS if no ElevenLabs API key (or ElevenLabs failed).
    // A VITS failure propagates to the caller; re-running the identical
    // synthesis would not help.
    return synthesizeWithVits();
}
176
286
}
0 commit comments