Integrate Deepgram into TranscriptionService (moved out of the Discord VoiceManager)

tcm390 · tcm390 · commit 296b6a806f52 · 2024-12-12T22:34:49.000-05:00
diff --git a/packages/client-discord/src/voice.ts b/packages/client-discord/src/voice.ts
@@ -47,7 +47,6 @@ import {
     discordVoiceHandlerTemplate,
 } from "./templates.ts";
 import { getWavHeader } from "./utils.ts";
-import { createClient, DeepgramClient } from "@deepgram/sdk";
 
 // These values are chosen for compatibility with picovoice components
 const DECODE_FRAME_SIZE = 1024;
@@ -139,7 +138,6 @@ export class AudioMonitor {
 }
 
 export class VoiceManager extends EventEmitter {
-    private deepgram?: DeepgramClient;
     private processingVoice: boolean = false;
     private transcriptionTimeout: NodeJS.Timeout | null = null;
     private userStates: Map<
@@ -165,9 +163,6 @@ export class VoiceManager extends EventEmitter {
         super();
         this.client = client.client;
         this.runtime = client.runtime;
-
-        const deepgramKey = this.runtime.getSetting("DEEPGRAM_API_KEY");
-        this.deepgram = deepgramKey ? createClient(deepgramKey) : null;
     }
 
     async handleVoiceStateUpdate(oldState: VoiceState, newState: VoiceState) {
@@ -583,26 +578,9 @@ export class VoiceManager extends EventEmitter {
 
             let transcriptionText: string;
 
-            if (this.deepgram) {
-                const response =
-                    await this.deepgram.listen.prerecorded.transcribeFile(
-                        wavBuffer,
-                        {
-                            model: "nova-2",
-                            language: "en-US",
-                            smart_format: true,
-                        }
-                    );
-                transcriptionText =
-                    response.result.results.channels[0].alternatives[0]
-                        .transcript;
-            } else {
-                transcriptionText = await this.runtime
-                    .getService<ITranscriptionService>(
-                        ServiceType.TRANSCRIPTION
-                    )
-                    .transcribe(wavBuffer);
-            }
+            transcriptionText = await this.runtime
+                .getService<ITranscriptionService>(ServiceType.TRANSCRIPTION)
+                .transcribe(wavBuffer);
 
             function isValidTranscription(text: string): boolean {
                 if (!text || text.includes("[BLANK_AUDIO]")) return false;
diff --git a/packages/plugin-node/src/services/transcription.ts b/packages/plugin-node/src/services/transcription.ts
@@ -14,6 +14,7 @@ import os from "os";
 import path from "path";
 import { fileURLToPath } from "url";
 import { promisify } from "util";
+import { createClient, DeepgramClient } from "@deepgram/sdk";
 
 // const __dirname = path.dirname(new URL(import.meta.url).pathname); #compatibility issues with windows
 const __filename = fileURLToPath(import.meta.url);
@@ -25,17 +26,23 @@ export class TranscriptionService
     extends Service
     implements ITranscriptionService
 {
+    private runtime: IAgentRuntime | null = null;
     static serviceType: ServiceType = ServiceType.TRANSCRIPTION;
     private CONTENT_CACHE_DIR: string;
     private DEBUG_AUDIO_DIR: string;
     private TARGET_SAMPLE_RATE = 16000; // Common sample rate for speech recognition
     private isCudaAvailable: boolean = false;
     private openai: OpenAI | null = null;
+    private deepgram?: DeepgramClient;
 
     private queue: { audioBuffer: ArrayBuffer; resolve: Function }[] = [];
     private processing: boolean = false;
 
-    async initialize(_runtime: IAgentRuntime): Promise<void> {}
+    async initialize(_runtime: IAgentRuntime): Promise<void> {
+        this.runtime = _runtime;
+        const key = _runtime.getSetting("DEEPGRAM_API_KEY"); // Deepgram is opt-in
+        this.deepgram = key ? createClient(key) : undefined; // matches `deepgram?:`
+    }
 
     constructor() {
         super();
@@ -194,8 +201,12 @@ export class TranscriptionService
         while (this.queue.length > 0) {
             const { audioBuffer, resolve } = this.queue.shift()!;
             let result: string | null = null;
-
-            if (this.openai) {
+            if (this.deepgram) {
+                console.log(
+                    "%%%%%%%&&&&%%%%%%%&&&&%%%%%%%&&&&%%%%%%%&&&&%%%%%%%&&&&%%%%%%%&&&&%%%%%%%&&&&%%%%%%%&&&&%%%%%%%&&&&%%%%%%%&&&&%%%%%%%&&&&"
+                );
+                result = await this.transcribeWithDeepgram(audioBuffer);
+            } else if (this.openai) {
                 result = await this.transcribeWithOpenAI(audioBuffer);
             } else {
                 result = await this.transcribeLocally(audioBuffer);
@@ -207,6 +218,23 @@ export class TranscriptionService
         this.processing = false;
     }
 
+    /**
+     * Transcribes via Deepgram nova-2; null if no client, API error or no transcript.
+     */
+    private async transcribeWithDeepgram(
+        audioBuffer: ArrayBuffer
+    ): Promise<string | null> {
+        if (!this.deepgram) return null;
+        const { result, error } =
+            await this.deepgram.listen.prerecorded.transcribeFile(
+                Buffer.from(audioBuffer),
+                { model: "nova-2", language: "en-US", smart_format: true }
+            );
+        if (error) console.error("Deepgram transcription failed:", error);
+        const best = result?.results?.channels?.[0]?.alternatives?.[0];
+        return best?.transcript ?? null;
+    }
+
     private async transcribeWithOpenAI(
         audioBuffer: ArrayBuffer
     ): Promise<string | null> {