Commit 5d926c1

feat: add /:agentId/speak endpoint for text-to-speech functionality
- Implemented a new POST endpoint that allows users to send text to a specified agent.
- The endpoint processes the message, generates a response, and converts it to speech using the ElevenLabs API.
- Added error handling for missing text, agent not found, and ElevenLabs API errors.
- Responses are streamed back as audio in MPEG format.
1 parent a930c84 commit 5d926c1
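
For context, here is a minimal sketch of exercising the new endpoint from a Node script. The base URL, port, and agent identifier are placeholders and not part of this commit; only the route, the request body fields, and the audio/mpeg response come from the change below.

// speak-client.ts — hypothetical usage sketch, not shipped with this commit
import { writeFile } from "node:fs/promises";

async function requestSpeech(): Promise<void> {
    // Assumed base URL; the direct client's actual host and port depend on your setup.
    const res = await fetch("http://localhost:3000/my-agent/speak", {
        method: "POST",
        headers: { "Content-Type": "application/json" },
        body: JSON.stringify({
            text: "Hello, can you introduce yourself?",
            // Optional fields: the handler derives UUIDs from these and falls back to defaults if omitted.
            userId: "user",
            roomId: "example-room",
        }),
    });

    if (!res.ok) {
        throw new Error(`speak request failed: ${res.status} ${await res.text()}`);
    }

    // The endpoint responds with MPEG audio bytes; write them to disk.
    await writeFile("reply.mp3", Buffer.from(await res.arrayBuffer()));
}

requestSpeech().catch(console.error);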

File tree

1 file changed: +157 -0 lines changed

packages/client-direct/src/index.ts (+157)
@@ -370,6 +370,163 @@ export class DirectClient {
                 }
             }
         );
+
+        this.app.post("/:agentId/speak", async (req, res) => {
+            const agentId = req.params.agentId;
+            const roomId = stringToUuid(req.body.roomId ?? "default-room-" + agentId);
+            const userId = stringToUuid(req.body.userId ?? "user");
+            const text = req.body.text;
+
+            if (!text) {
+                res.status(400).send("No text provided");
+                return;
+            }
+
+            let runtime = this.agents.get(agentId);
+
+            // if runtime is null, look for runtime with the same name
+            if (!runtime) {
+                runtime = Array.from(this.agents.values()).find(
+                    (a) => a.character.name.toLowerCase() === agentId.toLowerCase()
+                );
+            }
+
+            if (!runtime) {
+                res.status(404).send("Agent not found");
+                return;
+            }
+
+            try {
+                // Process message through agent (same as /message endpoint)
+                await runtime.ensureConnection(
+                    userId,
+                    roomId,
+                    req.body.userName,
+                    req.body.name,
+                    "direct"
+                );
+
+                const messageId = stringToUuid(Date.now().toString());
+
+                const content: Content = {
+                    text,
+                    attachments: [],
+                    source: "direct",
+                    inReplyTo: undefined,
+                };
+
+                const userMessage = {
+                    content,
+                    userId,
+                    roomId,
+                    agentId: runtime.agentId,
+                };
+
+                const memory: Memory = {
+                    id: messageId,
+                    agentId: runtime.agentId,
+                    userId,
+                    roomId,
+                    content,
+                    createdAt: Date.now(),
+                };
+
+                await runtime.messageManager.createMemory(memory);
+
+                const state = await runtime.composeState(userMessage, {
+                    agentName: runtime.character.name,
+                });
+
+                const context = composeContext({
+                    state,
+                    template: messageHandlerTemplate,
+                });
+
+                const response = await generateMessageResponse({
+                    runtime: runtime,
+                    context,
+                    modelClass: ModelClass.LARGE,
+                });
+
+                // save response to memory
+                const responseMessage = {
+                    ...userMessage,
+                    userId: runtime.agentId,
+                    content: response,
+                };
+
+                await runtime.messageManager.createMemory(responseMessage);
+
+                if (!response) {
+                    res.status(500).send("No response from generateMessageResponse");
+                    return;
+                }
+
+                let message = null as Content | null;
+
+                await runtime.evaluate(memory, state);
+
+                const _result = await runtime.processActions(
+                    memory,
+                    [responseMessage],
+                    state,
+                    async (newMessages) => {
+                        message = newMessages;
+                        return [memory];
+                    }
+                );
+
+                // Get the text to convert to speech
+                const textToSpeak = response.text;
+
+                // Convert to speech using ElevenLabs
+                const elevenLabsApiUrl = `https://api.elevenlabs.io/v1/text-to-speech/${process.env.ELEVENLABS_VOICE_ID}`;
+                const apiKey = process.env.ELEVENLABS_XI_API_KEY;
+
+                if (!apiKey) {
+                    throw new Error("ELEVENLABS_XI_API_KEY not configured");
+                }
+
+                const speechResponse = await fetch(elevenLabsApiUrl, {
+                    method: "POST",
+                    headers: {
+                        "Content-Type": "application/json",
+                        "xi-api-key": apiKey,
+                    },
+                    body: JSON.stringify({
+                        text: textToSpeak,
+                        model_id: process.env.ELEVENLABS_MODEL_ID || "eleven_multilingual_v2",
+                        voice_settings: {
+                            stability: parseFloat(process.env.ELEVENLABS_VOICE_STABILITY || "0.5"),
+                            similarity_boost: parseFloat(process.env.ELEVENLABS_VOICE_SIMILARITY_BOOST || "0.9"),
+                            style: parseFloat(process.env.ELEVENLABS_VOICE_STYLE || "0.66"),
+                            use_speaker_boost: process.env.ELEVENLABS_VOICE_USE_SPEAKER_BOOST === "true",
+                        },
+                    }),
+                });
+
+                if (!speechResponse.ok) {
+                    throw new Error(`ElevenLabs API error: ${speechResponse.statusText}`);
+                }
+
+                const audioBuffer = await speechResponse.arrayBuffer();
+
+                // Set appropriate headers for audio streaming
+                res.set({
+                    'Content-Type': 'audio/mpeg',
+                    'Transfer-Encoding': 'chunked'
+                });
+
+                res.send(Buffer.from(audioBuffer));
+
+            } catch (error) {
+                console.error("Error processing message or generating speech:", error);
+                res.status(500).json({
+                    error: "Error processing message or generating speech",
+                    details: error.message
+                });
+            }
+        });
     }
 
     // agent/src/index.ts:startAgent calls this
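
Because the handler replies with audio/mpeg, a browser front end could consume the endpoint roughly as sketched below. This is an illustrative example only, assuming the page is served from the same origin as the direct client; the helper function and its name are not part of the commit.

// playAgentSpeech.ts — illustrative browser-side sketch, not part of this commit
export async function playAgentSpeech(agentId: string, text: string): Promise<void> {
    const res = await fetch(`/${agentId}/speak`, {
        method: "POST",
        headers: { "Content-Type": "application/json" },
        body: JSON.stringify({ text }),
    });

    if (!res.ok) {
        throw new Error(`speak request failed: ${res.status}`);
    }

    // Wrap the returned MPEG bytes in a Blob and play them via an Audio element.
    const blob = await res.blob();
    const url = URL.createObjectURL(blob);
    const audio = new Audio(url);
    audio.onended = () => URL.revokeObjectURL(url);
    await audio.play();
}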
