plugin-tts: enhance TTS generation flow and caching (#2506)

bfontes · wtfsayo · web-flow · commit cfd1f48c3f13 · 2025-01-19T20:47:09.000+05:30
* refactor(tts-generation): enhance TTS generation flow and caching

* refactor(tts-generation): add error handling for language detection

---------

Co-authored-by: Sayo <hi@sayo.wtf>
diff --git a/packages/plugin-tts/src/index.ts b/packages/plugin-tts/src/index.ts
@@ -14,7 +14,7 @@ import * as fs from "fs";
 import { Buffer } from "buffer";
 import * as path from "path";
 import * as process from "process";
-import { detect } from 'langdetect'; 
+import { detect } from 'langdetect';
 
 const generateTTS = async (prompt: string, voice: string, runtime: IAgentRuntime) => {
     process.env["FAL_KEY"] =
@@ -27,26 +27,24 @@ const generateTTS = async (prompt: string, voice: string, runtime: IAgentRuntime
             input: {
                 input: prompt,
                 voice: voice
-              },
+            },
             logs: true,
             onQueueUpdate: (update) => {
                 if (update.status === "IN_PROGRESS") {
                     update.logs
                         .map((log) => log.message)
-                        .forEach(elizaLogger.log);
+                        .forEach(elizaLogger.debug);
                 }
             },
         });
 
-        elizaLogger.log(
+        elizaLogger.debug(
             "Generation request successful, received response:",
             response
         );
 
-        return {success: true, 
-                data: response.data};
-    } 
-    catch (error) {
+        return { success: true, data: response.data };
+    } catch (error) {
         elizaLogger.error("TTS generation error:", error);
         return {
             success: false,
@@ -67,9 +65,9 @@ const TTSGeneration: Action = {
     ],
     description: "Generate a tts audio based on a text prompt",
     validate: async (runtime: IAgentRuntime, _message: Memory) => {
-        elizaLogger.log("Validating TTS action");
+        elizaLogger.debug("Validating TTS action");
         const FalApiKey = runtime.getSetting("FAL_API_KEY");
-        elizaLogger.log("FAL_API_KEY present:", !!FalApiKey);
+        elizaLogger.debug("FAL_API_KEY present:", !!FalApiKey);
         return !!FalApiKey;
     },
     handler: async (
@@ -79,120 +77,98 @@ const TTSGeneration: Action = {
         _options: any,
         callback: HandlerCallback
     ) => {
-        elizaLogger.log("TTS request:", message);
-
+        elizaLogger.debug("TTS request:", message);
         // Clean up the prompt by removing mentions and commands
         const TTSPrompt = message.content.text
             .replace(/<@\d+>/g, "") // Remove mentions
-            .replace(/generate TTS|create TTS|make TTS|render TTS/gi, "") // Remove commands
+            .replace(/generate TTS|create TTS|make TTS|render TTS/gi, "")
             .trim();
 
         if (!TTSPrompt || TTSPrompt.length < 3) {
-            callback({
-                text: "Please input a word at least of length 3",
-            });
+            callback({ text: "Please input a word at least of length 3" });
             return;
         }
 
-        elizaLogger.log("TTS prompt:", TTSPrompt);
+        elizaLogger.debug("TTS prompt:", TTSPrompt);
 
         callback({
-            text: `I'll generate a audio based on your prompt: "${TTSPrompt}". This might take a few seconds...`,
+            text: `I'll generate an audio based on your prompt: "${TTSPrompt}". This might take a few seconds...`,
         });
 
-        const language = detect(TTSPrompt);
-        const voice_subject = VOICE_MAP[language[0].lang];
-        const target_voice = getRandomVoice(voice_subject).fullName;
+        let target_voice;
+        try {
+            const language = detect(TTSPrompt);
+            if (language && language.length > 0) {
+            const voice_subject = VOICE_MAP[language[0].lang];
+            target_voice = getRandomVoice(voice_subject).fullName;
+            } else {
+            throw new Error("Language detection failed, no language detected.");
+            }
+        } catch (error) {
+            elizaLogger.error("Language detection error:", error);
+            const defaultVoice = VOICE_MAP['en'];
+            target_voice = getRandomVoice(defaultVoice).fullName;
+        }
 
-        elizaLogger.log("Starting TTS generation with prompt:", prompt, "and voice:", target_voice);
+        elizaLogger.debug("Starting TTS generation with prompt:", TTSPrompt, "and voice:", target_voice);
 
         try {
             const result = await generateTTS(TTSPrompt, target_voice, runtime);
 
             if (result.success && result.data.audio.url) {
-                // Download the Audio file
-                const response = await fetch(result.data.audio.url);
-                const arrayBuffer = await response.arrayBuffer();
-                const TTSFileName = `content_cache/tts_${result.data.audio.file_name}`;
-
-                // ensure the directory is existed
-                const directoryPath = path.dirname(TTSFileName);
-                if (!fs.existsSync(directoryPath)) {
-                    fs.mkdirSync(directoryPath, { recursive: true });
+                const cachedFile = `content_cache/tts_${result.data.audio.file_name}`;
+                if (fs.existsSync(cachedFile)) {
+                    elizaLogger.debug("Using cached audio:", cachedFile);
+                } else {
+                    const response = await fetch(result.data.audio.url);
+                    const arrayBuffer = await response.arrayBuffer();
+
+                    const directoryPath = path.dirname(cachedFile);
+                    if (!fs.existsSync(directoryPath)) {
+                        fs.mkdirSync(directoryPath, { recursive: true });
+                    }
+
+                    fs.writeFileSync(cachedFile, Buffer.from(arrayBuffer));
+                    elizaLogger.debug("Audio Duration:", result.data.audio.duration);
                 }
 
-                // Save Audio file
-                fs.writeFileSync(TTSFileName, Buffer.from(arrayBuffer));
-
-                elizaLogger.log("Audio Duration:", result.data.audio.duration);
-                callback(
-                    {
-                        text: "TTS Success! Here's your generated audio!",
-                        attachments: [
-                            {
-                                id: crypto.randomUUID(),
-                                url: result.data.audio.url,
-                                title: "TTS Generation",
-                                source: "TTSGeneration",
-                                description: TTSPrompt,
-                                text: TTSPrompt,
-                            },
-                        ],
-                    },
-                    [TTSFileName]
-                ); // Add the audio file to the attachments
-            } else {
                 callback({
-                    text: `TTS generation failed: ${result.error}`,
-                    error: true,
-                });
+                    text: "TTS Success! Here's your generated audio!",
+                    attachments: [
+                        {
+                            id: crypto.randomUUID(),
+                            url: result.data.audio.url,
+                            title: "TTS Generation",
+                            source: "TTSGeneration",
+                            description: TTSPrompt,
+                            text: TTSPrompt,
+                        },
+                    ],
+                }, [cachedFile]);
+            } else {
+                callback({ text: `TTS generation failed: ${result.error}`, error: true });
             }
         } catch (error) {
             elizaLogger.error(`Failed to generate TTS. Error: ${error}`);
-            callback({
-                text: `TTS generation failed: ${error.message}`,
-                error: true,
-            });
+            callback({ text: `TTS generation failed: ${error.message}`, error: true });
         }
     },
     examples: [
         [
-            {
-                user: "{{user1}}",
-                content: {
-                    text: "Generate a TTS of prompt: Hello world!",
-                },
-            },
-            {
-                user: "{{agentName}}",
-                content: {
-                    text: "I'll call a TTS to generate an audio based on your input prompt",
-                    action: "CREATE_TTS",
-                },
-            },
+            { user: "{{user1}}", content: { text: "Generate a TTS of prompt: Hello world!" } },
+            { user: "{{agentName}}", content: { text: "I'll call a TTS to generate an audio based on your input prompt", action: "CREATE_TTS" } },
         ],
         [
-            {
-                user: "{{user1}}",
-                content: {
-                    text: "Please do TTS to a prompt: Sam is busy now",
-                },
-            },
-            {
-                user: "{{agentName}}",
-                content: {
-                    text: "Ok, please wait for the tts generation~",
-                    action: "AUDIO_CREATE",
-                },
-            },
+            { user: "{{user1}}", content: { text: "Please do TTS to a prompt: Sam is busy now" } },
+            { user: "{{agentName}}", content: { text: "Ok, please wait for the tts generation~", action: "AUDIO_CREATE" } },
         ],
     ],
-} as Action;
+};
 
 export const TTSGenerationPlugin: Plugin = {
     name: "TTSGeneration",
     description: "Generate TTS using PlayAI tts (v3)",
     actions: [TTSGeneration],
     evaluators: [],
     providers: [],
-};
+};