Skip to content

Commit d371fee

Browse files
authored
Merge pull request #1528 from HowieDuhzit/main
feat: add /:agentId/speak endpoint for text-to-speech functionality
2 parents 8453774 + a43da32 commit d371fee

File tree

1 file changed

+157
-0
lines changed

1 file changed

+157
-0
lines changed

packages/client-direct/src/index.ts

+157
Original file line number | Diff line number | Diff line change
@@ -445,6 +445,163 @@ export class DirectClient {
445445
}
446446
}
447447
);
448+
449+
/**
 * POST /:agentId/speak
 *
 * Runs `req.body.text` through the agent (same flow as the /message
 * endpoint), then sends the agent's reply text to the ElevenLabs
 * text-to-speech API and returns the resulting audio as `audio/mpeg`.
 *
 * Request body: `text` (required), `roomId`, `userId`, `userName`, `name`.
 * `:agentId` may be an agent id or, as a fallback, a character name
 * (matched case-insensitively).
 *
 * Responses:
 *   200 - audio/mpeg body
 *   400 - no `text` in the request body
 *   404 - no agent matches `:agentId`
 *   500 - no response was generated, ElevenLabs is not configured, or
 *         any step of processing / speech synthesis threw
 *
 * Environment: ELEVENLABS_XI_API_KEY and ELEVENLABS_VOICE_ID are required;
 * ELEVENLABS_MODEL_ID and the ELEVENLABS_VOICE_* settings are optional and
 * fall back to the defaults written below.
 */
this.app.post("/:agentId/speak", async (req, res) => {
    const agentId = req.params.agentId;
    const roomId = stringToUuid(req.body.roomId ?? "default-room-" + agentId);
    const userId = stringToUuid(req.body.userId ?? "user");
    const text = req.body.text;

    if (!text) {
        res.status(400).send("No text provided");
        return;
    }

    let runtime = this.agents.get(agentId);

    // No agent registered under that id: fall back to a character-name match.
    if (!runtime) {
        runtime = Array.from(this.agents.values()).find(
            (a) => a.character.name.toLowerCase() === agentId.toLowerCase()
        );
    }

    if (!runtime) {
        res.status(404).send("Agent not found");
        return;
    }

    try {
        // Process message through agent (same as /message endpoint)
        await runtime.ensureConnection(
            userId,
            roomId,
            req.body.userName,
            req.body.name,
            "direct"
        );

        const messageId = stringToUuid(Date.now().toString());

        const content: Content = {
            text,
            attachments: [],
            source: "direct",
            inReplyTo: undefined,
        };

        const userMessage = {
            content,
            userId,
            roomId,
            agentId: runtime.agentId,
        };

        const memory: Memory = {
            id: messageId,
            agentId: runtime.agentId,
            userId,
            roomId,
            content,
            createdAt: Date.now(),
        };

        await runtime.messageManager.createMemory(memory);

        const state = await runtime.composeState(userMessage, {
            agentName: runtime.character.name,
        });

        const context = composeContext({
            state,
            template: messageHandlerTemplate,
        });

        const response = await generateMessageResponse({
            runtime: runtime,
            context,
            modelClass: ModelClass.LARGE,
        });

        // Check for a missing response BEFORE persisting it, so that a
        // memory with empty content is never written.
        if (!response) {
            res.status(500).send("No response from generateMessageResponse");
            return;
        }

        // save response to memory
        const responseMessage = {
            ...userMessage,
            userId: runtime.agentId,
            content: response,
        };

        await runtime.messageManager.createMemory(responseMessage);

        await runtime.evaluate(memory, state);

        // Messages produced by actions are not part of the spoken reply, so
        // the callback only hands back the originating memory.
        await runtime.processActions(
            memory,
            [responseMessage],
            state,
            async () => [memory]
        );

        // Get the text to convert to speech
        const textToSpeak = response.text;
        if (!textToSpeak) {
            throw new Error("Agent response contained no text to speak");
        }

        // Convert to speech using ElevenLabs
        const apiKey = process.env.ELEVENLABS_XI_API_KEY;
        if (!apiKey) {
            throw new Error("ELEVENLABS_XI_API_KEY not configured");
        }

        // Without this guard an unset voice id would be interpolated into the
        // URL as the literal string "undefined".
        const voiceId = process.env.ELEVENLABS_VOICE_ID;
        if (!voiceId) {
            throw new Error("ELEVENLABS_VOICE_ID not configured");
        }

        const elevenLabsApiUrl = `https://api.elevenlabs.io/v1/text-to-speech/${encodeURIComponent(voiceId)}`;

        const speechResponse = await fetch(elevenLabsApiUrl, {
            method: "POST",
            headers: {
                "Content-Type": "application/json",
                "xi-api-key": apiKey,
            },
            body: JSON.stringify({
                text: textToSpeak,
                model_id: process.env.ELEVENLABS_MODEL_ID || "eleven_multilingual_v2",
                voice_settings: {
                    stability: parseFloat(process.env.ELEVENLABS_VOICE_STABILITY || "0.5"),
                    similarity_boost: parseFloat(process.env.ELEVENLABS_VOICE_SIMILARITY_BOOST || "0.9"),
                    style: parseFloat(process.env.ELEVENLABS_VOICE_STYLE || "0.66"),
                    use_speaker_boost: process.env.ELEVENLABS_VOICE_USE_SPEAKER_BOOST === "true",
                },
            }),
        });

        if (!speechResponse.ok) {
            throw new Error(`ElevenLabs API error: ${speechResponse.statusText}`);
        }

        const audioBuffer = await speechResponse.arrayBuffer();

        // The audio is fully buffered and sent in one piece; res.send() sets
        // Content-Length for a Buffer, so Transfer-Encoding must not be set
        // by hand (the two headers are mutually exclusive in HTTP/1.1).
        res.set("Content-Type", "audio/mpeg");
        res.send(Buffer.from(audioBuffer));
    } catch (error) {
        console.error("Error processing message or generating speech:", error);
        res.status(500).json({
            error: "Error processing message or generating speech",
            // The thrown value is not guaranteed to be an Error instance.
            details: error instanceof Error ? error.message : String(error),
        });
    }
});
448605
}
449606

450607
// agent/src/index.ts:startAgent calls this

0 commit comments

Comments
 (0)