Merge pull request #1026 from ai16z/tcm-improve-discord-voice

odilitime · web-flow · commit 2ffa4b45c4f8 · 2024-12-13T09:35:51.000-08:00
feat: improve voice processing and add deepgram transcription option
diff --git a/.env.example b/.env.example
@@ -270,3 +270,7 @@ AWS_SECRET_ACCESS_KEY=
 AWS_REGION=
 AWS_S3_BUCKET=
 AWS_S3_UPLOAD_PATH=
+
+
+# Deepgram (optional) - when set, audio transcription uses Deepgram before the OpenAI/local fallbacks
+DEEPGRAM_API_KEY=
diff --git a/package.json b/package.json
@@ -54,9 +54,10 @@
     "dependencies": {
         "@0glabs/0g-ts-sdk": "0.2.1",
         "@coinbase/coinbase-sdk": "0.10.0",
+        "@deepgram/sdk": "^3.9.0",
+        "@vitest/eslint-plugin": "1.0.1",
         "amqplib": "0.10.5",
         "csv-parse": "5.6.0",
-        "@vitest/eslint-plugin": "1.0.1",
         "ollama-ai-provider": "0.16.1",
         "optional": "0.1.4",
         "pnpm": "9.14.4",
diff --git a/packages/client-discord/src/voice.ts b/packages/client-discord/src/voice.ts
@@ -46,14 +46,12 @@ import {
     discordShouldRespondTemplate,
     discordVoiceHandlerTemplate,
 } from "./templates.ts";
-import debounce from "lodash/debounce.js";
 import { getWavHeader } from "./utils.ts";
 
 // These values are chosen for compatibility with picovoice components
 const DECODE_FRAME_SIZE = 1024;
 const DECODE_SAMPLE_RATE = 16000;
 
-// Buffers all audio
 export class AudioMonitor {
     private readable: Readable;
     private buffers: Buffer[] = [];
@@ -64,6 +62,7 @@ export class AudioMonitor {
     constructor(
         readable: Readable,
         maxSize: number,
+        onStart: () => void,
         callback: (buffer: Buffer) => void
     ) {
         this.readable = readable;
@@ -98,6 +97,7 @@ export class AudioMonitor {
         });
         this.readable.on("speakingStarted", () => {
             if (this.ended) return;
+            onStart();
             elizaLogger.log("Speaking started");
             this.reset();
         });
@@ -138,6 +138,8 @@ export class AudioMonitor {
 }
 
 export class VoiceManager extends EventEmitter {
+    private processingVoice: boolean = false;
+    private transcriptionTimeout: NodeJS.Timeout | null = null;
     private userStates: Map<
         string,
         {
@@ -373,6 +375,7 @@ export class VoiceManager extends EventEmitter {
                 if (avgVolume > SPEAKING_THRESHOLD) {
                     volumeBuffer.length = 0;
                     this.cleanupAudioPlayer(this.activeAudioPlayer);
+                    this.processingVoice = false;
                 }
             }
         });
@@ -453,6 +456,52 @@ export class VoiceManager extends EventEmitter {
         // this.scanGuild(guild);
     }
 
+    /**
+     * Schedules transcription of a user's buffered audio once
+     * DEBOUNCE_TRANSCRIPTION_THRESHOLD milliseconds pass without another call.
+     *
+     * Every call restarts the single timer held in `transcriptionTimeout`.
+     * While `activeAudioPlayer` is set or `processingVoice` is true, the
+     * caller's buffered audio is discarded and nothing is scheduled.
+     *
+     * @param userId - Key into `userStates` for the speaking user.
+     * @param name - Forwarded unchanged to `processTranscription`.
+     * @param userName - Forwarded unchanged to `processTranscription`.
+     * @param channel - Voice channel the audio came from; `channel.id` is forwarded as well.
+     */
+    async debouncedProcessTranscription(
+        userId: UUID,
+        name: string,
+        userName: string,
+        channel: BaseGuildVoiceChannel
+    ): Promise<void> {
+        const DEBOUNCE_TRANSCRIPTION_THRESHOLD = 1500; // wait for 1.5 seconds of silence
+
+        if (this.activeAudioPlayer?.state?.status === "idle") {
+            elizaLogger.log("Cleaning up idle audio player.");
+            this.cleanupAudioPlayer(this.activeAudioPlayer);
+        }
+
+        if (this.activeAudioPlayer || this.processingVoice) {
+            // Drop this user's pending audio. The map lookup can miss, so guard
+            // it rather than throwing on an undefined state.
+            const state = this.userStates.get(userId);
+            if (state) {
+                state.buffers.length = 0;
+                state.totalLength = 0;
+            }
+            return;
+        }
+
+        if (this.transcriptionTimeout) {
+            clearTimeout(this.transcriptionTimeout);
+            this.transcriptionTimeout = null;
+        }
+
+        this.transcriptionTimeout = setTimeout(async () => {
+            // The timer has fired, so the stored handle can no longer be cancelled.
+            this.transcriptionTimeout = null;
+            this.processingVoice = true;
+            try {
+                await this.processTranscription(
+                    userId,
+                    channel.id,
+                    channel,
+                    name,
+                    userName
+                );
+
+                // Clean all users' previous buffers
+                this.userStates.forEach((state) => {
+                    state.buffers.length = 0;
+                    state.totalLength = 0;
+                });
+            } catch (error) {
+                // Nothing awaits this timer callback, so a rejection escaping it
+                // would surface as an unhandled promise rejection.
+                console.error(
+                    `Error processing transcription for user ${userId}:`,
+                    error
+                );
+            } finally {
+                this.processingVoice = false;
+            }
+        }, DEBOUNCE_TRANSCRIPTION_THRESHOLD);
+    }
+
     async handleUserStream(
         userId: UUID,
         name: string,
@@ -461,7 +510,6 @@ export class VoiceManager extends EventEmitter {
         audioStream: Readable
     ) {
         console.log(`Starting audio monitor for user: ${userId}`);
-        const channelId = channel.id;
         if (!this.userStates.has(userId)) {
             this.userStates.set(userId, {
                 buffers: [],
@@ -473,25 +521,17 @@ export class VoiceManager extends EventEmitter {
 
         const state = this.userStates.get(userId);
 
-        const DEBOUNCE_TRANSCRIPTION_THRESHOLD = 2500; // wait for 1.5 seconds of silence
-
-        const debouncedProcessTranscription = debounce(async () => {
-            await this.processTranscription(
-                userId,
-                channelId,
-                channel,
-                name,
-                userName
-            );
-        }, DEBOUNCE_TRANSCRIPTION_THRESHOLD);
-
         const processBuffer = async (buffer: Buffer) => {
             try {
                 state!.buffers.push(buffer);
                 state!.totalLength += buffer.length;
                 state!.lastActive = Date.now();
-
-                debouncedProcessTranscription();
+                this.debouncedProcessTranscription(
+                    userId,
+                    name,
+                    userName,
+                    channel
+                );
             } catch (error) {
                 console.error(
                     `Error processing buffer for user ${userId}:`,
@@ -500,13 +540,22 @@ export class VoiceManager extends EventEmitter {
             }
         };
 
-        new AudioMonitor(audioStream, 10000000, async (buffer) => {
-            if (!buffer) {
-                console.error("Received empty buffer");
-                return;
+        new AudioMonitor(
+            audioStream,
+            10000000,
+            () => {
+                if (this.transcriptionTimeout) {
+                    clearTimeout(this.transcriptionTimeout);
+                }
+            },
+            async (buffer) => {
+                if (!buffer) {
+                    console.error("Received empty buffer");
+                    return;
+                }
+                await processBuffer(buffer);
             }
-            await processBuffer(buffer);
-        });
+        );
     }
 
     private async processTranscription(
@@ -520,12 +569,11 @@ export class VoiceManager extends EventEmitter {
         if (!state || state.buffers.length === 0) return;
         try {
             const inputBuffer = Buffer.concat(state.buffers, state.totalLength);
+
             state.buffers.length = 0; // Clear the buffers
             state.totalLength = 0;
-
             // Convert Opus to WAV
             const wavBuffer = await this.convertOpusToWav(inputBuffer);
-
             console.log("Starting transcription...");
 
             const transcriptionText = await this.runtime
diff --git a/packages/plugin-node/src/services/transcription.ts b/packages/plugin-node/src/services/transcription.ts
@@ -14,6 +14,7 @@ import os from "os";
 import path from "path";
 import { fileURLToPath } from "url";
 import { promisify } from "util";
+import { createClient, DeepgramClient } from "@deepgram/sdk";
 
 // const __dirname = path.dirname(new URL(import.meta.url).pathname); #compatibility issues with windows
 const __filename = fileURLToPath(import.meta.url);
@@ -25,17 +26,23 @@ export class TranscriptionService
     extends Service
     implements ITranscriptionService
 {
+    private runtime: IAgentRuntime | null = null;
     static serviceType: ServiceType = ServiceType.TRANSCRIPTION;
     private CONTENT_CACHE_DIR: string;
     private DEBUG_AUDIO_DIR: string;
     private TARGET_SAMPLE_RATE = 16000; // Common sample rate for speech recognition
     private isCudaAvailable: boolean = false;
     private openai: OpenAI | null = null;
+    private deepgram?: DeepgramClient;
 
     private queue: { audioBuffer: ArrayBuffer; resolve: Function }[] = [];
     private processing: boolean = false;
 
-    async initialize(_runtime: IAgentRuntime): Promise<void> {}
+    /**
+     * Stores the agent runtime and creates a Deepgram client when the
+     * DEEPGRAM_API_KEY setting is present.
+     *
+     * @param _runtime - Runtime used to look up settings; kept on `this.runtime`.
+     */
+    async initialize(_runtime: IAgentRuntime): Promise<void> {
+        this.runtime = _runtime;
+        const deepgramKey = _runtime.getSetting("DEEPGRAM_API_KEY");
+        // `deepgram` is declared as an optional field, so "not configured" is
+        // represented by undefined rather than null.
+        this.deepgram = deepgramKey ? createClient(deepgramKey) : undefined;
+    }
 
     constructor() {
         super();
@@ -194,8 +201,9 @@ export class TranscriptionService
         while (this.queue.length > 0) {
             const { audioBuffer, resolve } = this.queue.shift()!;
             let result: string | null = null;
-
-            if (this.openai) {
+            if (this.deepgram) {
+                result = await this.transcribeWithDeepgram(audioBuffer);
+            } else if (this.openai) {
                 result = await this.transcribeWithOpenAI(audioBuffer);
             } else {
                 result = await this.transcribeLocally(audioBuffer);
@@ -207,6 +215,23 @@ export class TranscriptionService
         this.processing = false;
     }
 
+    /**
+     * Transcribes an audio buffer with Deepgram's prerecorded API
+     * (model "nova-2", language "en-US", smart formatting enabled).
+     *
+     * @param audioBuffer - Raw audio bytes to send.
+     * @returns The first alternative of the first channel, or null when no
+     *          Deepgram client is configured, the API reports an error, or the
+     *          response contains no transcript.
+     */
+    private async transcribeWithDeepgram(
+        audioBuffer: ArrayBuffer
+    ): Promise<string | null> {
+        if (!this.deepgram) {
+            return null;
+        }
+        const buffer = Buffer.from(audioBuffer);
+        // The SDK reports failures through `error` and leaves `result` null,
+        // so both must be checked before reading the transcript.
+        const { result, error } =
+            await this.deepgram.listen.prerecorded.transcribeFile(buffer, {
+                model: "nova-2",
+                language: "en-US",
+                smart_format: true,
+            });
+        if (error || !result) {
+            console.error("Deepgram transcription failed:", error);
+            return null;
+        }
+        const transcript =
+            result.results?.channels?.[0]?.alternatives?.[0]?.transcript;
+        return transcript ?? null;
+    }
+
     private async transcribeWithOpenAI(
         audioBuffer: ArrayBuffer
     ): Promise<string | null> {
diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml