diff --git a/.env.example b/.env.example
index 623fb3569cf..0d808f655b0 100644
--- a/.env.example
+++ b/.env.example
@@ -270,3 +270,7 @@ AWS_SECRET_ACCESS_KEY=
 AWS_REGION=
 AWS_S3_BUCKET=
 AWS_S3_UPLOAD_PATH=
+
+
+# Deepgram
+DEEPGRAM_API_KEY=
diff --git a/package.json b/package.json
index ac6da05dad9..55021a62d38 100644
--- a/package.json
+++ b/package.json
@@ -54,9 +54,10 @@
     "dependencies": {
         "@0glabs/0g-ts-sdk": "0.2.1",
         "@coinbase/coinbase-sdk": "0.10.0",
+        "@deepgram/sdk": "^3.9.0",
+        "@vitest/eslint-plugin": "1.0.1",
         "amqplib": "0.10.5",
         "csv-parse": "5.6.0",
-        "@vitest/eslint-plugin": "1.0.1",
         "ollama-ai-provider": "0.16.1",
         "optional": "0.1.4",
         "pnpm": "9.14.4",
diff --git a/packages/client-discord/src/voice.ts b/packages/client-discord/src/voice.ts
index c8b2bb5447b..97f2a81b6e4 100644
--- a/packages/client-discord/src/voice.ts
+++ b/packages/client-discord/src/voice.ts
@@ -46,14 +46,12 @@ import {
     discordShouldRespondTemplate,
     discordVoiceHandlerTemplate,
 } from "./templates.ts";
-import debounce from "lodash/debounce.js";
 import { getWavHeader } from "./utils.ts";
 
 // These values are chosen for compatibility with picovoice components
 const DECODE_FRAME_SIZE = 1024;
 const DECODE_SAMPLE_RATE = 16000;
 
-// Buffers all audio
 export class AudioMonitor {
     private readable: Readable;
     private buffers: Buffer[] = [];
@@ -64,6 +62,7 @@ export class AudioMonitor {
     constructor(
         readable: Readable,
         maxSize: number,
+        onStart: () => void,
         callback: (buffer: Buffer) => void
     ) {
         this.readable = readable;
@@ -98,6 +97,7 @@ export class AudioMonitor {
         });
         this.readable.on("speakingStarted", () => {
             if (this.ended) return;
+            onStart();
             elizaLogger.log("Speaking started");
             this.reset();
         });
@@ -138,6 +138,8 @@ export class AudioMonitor {
 }
 
 export class VoiceManager extends EventEmitter {
+    private processingVoice: boolean = false;
+    private transcriptionTimeout: NodeJS.Timeout | null = null;
     private userStates: Map<
         string,
         {
@@ -373,6 +375,7 @@ export class VoiceManager extends EventEmitter {
                 if (avgVolume > SPEAKING_THRESHOLD) {
                     volumeBuffer.length = 0;
                     this.cleanupAudioPlayer(this.activeAudioPlayer);
+                    this.processingVoice = false;
                 }
             }
         });
@@ -453,6 +456,52 @@ export class VoiceManager extends EventEmitter {
         // this.scanGuild(guild);
     }
 
+    async debouncedProcessTranscription(
+        userId: UUID,
+        name: string,
+        userName: string,
+        channel: BaseGuildVoiceChannel
+    ) {
+        const DEBOUNCE_TRANSCRIPTION_THRESHOLD = 1500; // wait for 1.5 seconds of silence
+
+        if (this.activeAudioPlayer?.state?.status === "idle") {
+            elizaLogger.log("Cleaning up idle audio player.");
+            this.cleanupAudioPlayer(this.activeAudioPlayer);
+        }
+
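+        // If the bot is currently playing audio, or a transcription is
+        // already being processed, drop this user's buffered audio rather
+        // than queueing another transcription.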
+        if (this.activeAudioPlayer || this.processingVoice) {
+            const state = this.userStates.get(userId);
+            state.buffers.length = 0;
+            state.totalLength = 0;
+            return;
+        }
+
+        if (this.transcriptionTimeout) {
+            clearTimeout(this.transcriptionTimeout);
+        }
+
+        this.transcriptionTimeout = setTimeout(async () => {
+            this.processingVoice = true;
+            try {
+                await this.processTranscription(
+                    userId,
+                    channel.id,
+                    channel,
+                    name,
+                    userName
+                );
+
+                // Clean all users' previous buffers
+                this.userStates.forEach((state, id) => {
+                    state.buffers.length = 0;
+                    state.totalLength = 0;
+                });
+            } finally {
+                this.processingVoice = false;
+            }
+        }, DEBOUNCE_TRANSCRIPTION_THRESHOLD);
+    }
+
     async handleUserStream(
         userId: UUID,
         name: string,
@@ -461,7 +510,6 @@ export class VoiceManager extends EventEmitter {
         audioStream: Readable
     ) {
         console.log(`Starting audio monitor for user: ${userId}`);
-        const channelId = channel.id;
         if (!this.userStates.has(userId)) {
             this.userStates.set(userId, {
                 buffers: [],
@@ -473,25 +521,17 @@ export class VoiceManager extends EventEmitter {
 
         const state = this.userStates.get(userId);
 
-        const DEBOUNCE_TRANSCRIPTION_THRESHOLD = 2500; // wait for 1.5 seconds of silence
-
-        const debouncedProcessTranscription = debounce(async () => {
-            await this.processTranscription(
-                userId,
-                channelId,
-                channel,
-                name,
-                userName
-            );
-        }, DEBOUNCE_TRANSCRIPTION_THRESHOLD);
-
         const processBuffer = async (buffer: Buffer) => {
             try {
                 state!.buffers.push(buffer);
                 state!.totalLength += buffer.length;
                 state!.lastActive = Date.now();
-
-                debouncedProcessTranscription();
+                this.debouncedProcessTranscription(
+                    userId,
+                    name,
+                    userName,
+                    channel
+                );
             } catch (error) {
                 console.error(
                     `Error processing buffer for user ${userId}:`,
@@ -500,13 +540,22 @@ export class VoiceManager extends EventEmitter {
             }
         };
 
-        new AudioMonitor(audioStream, 10000000, async (buffer) => {
-            if (!buffer) {
-                console.error("Received empty buffer");
-                return;
+        new AudioMonitor(
+            audioStream,
+            10000000,
+            () => {
+                if (this.transcriptionTimeout) {
+                    clearTimeout(this.transcriptionTimeout);
+                }
+            },
+            async (buffer) => {
+                if (!buffer) {
+                    console.error("Received empty buffer");
+                    return;
+                }
+                await processBuffer(buffer);
             }
-            await processBuffer(buffer);
-        });
+        );
     }
 
     private async processTranscription(
@@ -520,12 +569,11 @@ export class VoiceManager extends EventEmitter {
         if (!state || state.buffers.length === 0) return;
         try {
             const inputBuffer = Buffer.concat(state.buffers, state.totalLength);
+
             state.buffers.length = 0; // Clear the buffers
             state.totalLength = 0;
-
             // Convert Opus to WAV
             const wavBuffer = await this.convertOpusToWav(inputBuffer);
-
             console.log("Starting transcription...");
 
             const transcriptionText = await this.runtime
diff --git a/packages/plugin-node/src/services/transcription.ts b/packages/plugin-node/src/services/transcription.ts
index 43bbf373607..35c8546a768 100644
--- a/packages/plugin-node/src/services/transcription.ts
+++ b/packages/plugin-node/src/services/transcription.ts
@@ -14,6 +14,7 @@ import os from "os";
 import path from "path";
 import { fileURLToPath } from "url";
 import { promisify } from "util";
+import { createClient, DeepgramClient } from "@deepgram/sdk";
 
 // const __dirname = path.dirname(new URL(import.meta.url).pathname); #compatibility issues with windows
 const __filename = fileURLToPath(import.meta.url);
@@ -25,17 +26,23 @@ export class TranscriptionService
     extends Service
     implements ITranscriptionService
 {
+    private runtime: IAgentRuntime | null = null;
     static serviceType: ServiceType = ServiceType.TRANSCRIPTION;
     private CONTENT_CACHE_DIR: string;
     private DEBUG_AUDIO_DIR: string;
     private TARGET_SAMPLE_RATE = 16000; // Common sample rate for speech recognition
     private isCudaAvailable: boolean = false;
     private openai: OpenAI | null = null;
+    private deepgram?: DeepgramClient;
     private queue: { audioBuffer: ArrayBuffer; resolve: Function }[] = [];
     private processing: boolean = false;
 
-    async initialize(_runtime: IAgentRuntime): Promise<void> {}
+    async initialize(_runtime: IAgentRuntime): Promise<void> {
+        this.runtime = _runtime;
+        const deepgramKey = this.runtime.getSetting("DEEPGRAM_API_KEY");
+        this.deepgram = deepgramKey ? createClient(deepgramKey) : null;
+    }
 
     constructor() {
         super();
@@ -194,8 +201,9 @@ export class TranscriptionService
         while (this.queue.length > 0) {
             const { audioBuffer, resolve } = this.queue.shift()!;
             let result: string | null = null;
-
-            if (this.openai) {
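+            // Pick a transcription backend: Deepgram when an API key is
+            // configured, then OpenAI, otherwise the local transcriber.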
+            if (this.deepgram) {
+                result = await this.transcribeWithDeepgram(audioBuffer);
+            } else if (this.openai) {
                 result = await this.transcribeWithOpenAI(audioBuffer);
             } else {
                 result = await this.transcribeLocally(audioBuffer);
@@ -207,6 +215,23 @@ export class TranscriptionService
         this.processing = false;
     }
 
+    private async transcribeWithDeepgram(
+        audioBuffer: ArrayBuffer
+    ): Promise<string> {
+        const buffer = Buffer.from(audioBuffer);
+        const response = await this.deepgram.listen.prerecorded.transcribeFile(
+            buffer,
+            {
+                model: "nova-2",
+                language: "en-US",
+                smart_format: true,
+            }
+        );
+        const result =
+            response.result.results.channels[0].alternatives[0].transcript;
+        return result;
+    }
+
     private async transcribeWithOpenAI(
         audioBuffer: ArrayBuffer
     ): Promise<string | null> {
diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml
index 7175ed08e7a..7ee84a25ecb 100644
--- a/pnpm-lock.yaml
+++ b/pnpm-lock.yaml
@@ -37030,4 +37030,4 @@ snapshots:
   zx@8.2.4:
     optionalDependencies:
       '@types/fs-extra': 11.0.4
-      '@types/node': 20.17.9
+      '@types/node': 20.17.9
\ No newline at end of file