Skip to content

Commit 2ffa4b4

Browse files
authored
Merge pull request #1026 from ai16z/tcm-improve-discord-voice
feat: improve voice processing and add deepgram transcription option
2 parents d8bfbcd + 6be1b68 commit 2ffa4b4

File tree

5 files changed

+108
-30
lines changed

5 files changed

+108
-30
lines changed

.env.example

+4
Original file line number | Diff line number | Diff line change
@@ -270,3 +270,7 @@ AWS_SECRET_ACCESS_KEY=
270270
AWS_REGION=
271271
AWS_S3_BUCKET=
272272
AWS_S3_UPLOAD_PATH=
273+
274+
275+
# Deepgram
276+
DEEPGRAM_API_KEY=

package.json

+2 -1
Original file line number | Diff line number | Diff line change
@@ -54,9 +54,10 @@
5454
"dependencies": {
5555
"@0glabs/0g-ts-sdk": "0.2.1",
5656
"@coinbase/coinbase-sdk": "0.10.0",
57+
"@deepgram/sdk": "^3.9.0",
58+
"@vitest/eslint-plugin": "1.0.1",
5759
"amqplib": "0.10.5",
5860
"csv-parse": "5.6.0",
59-
"@vitest/eslint-plugin": "1.0.1",
6061
"ollama-ai-provider": "0.16.1",
6162
"optional": "0.1.4",
6263
"pnpm": "9.14.4",

packages/client-discord/src/voice.ts

+73 -25
Original file line number | Diff line number | Diff line change
@@ -46,14 +46,12 @@ import {
4646
discordShouldRespondTemplate,
4747
discordVoiceHandlerTemplate,
4848
} from "./templates.ts";
49-
import debounce from "lodash/debounce.js";
5049
import { getWavHeader } from "./utils.ts";
5150

5251
// These values are chosen for compatibility with picovoice components
5352
const DECODE_FRAME_SIZE = 1024;
5453
const DECODE_SAMPLE_RATE = 16000;
5554

56-
// Buffers all audio
5755
export class AudioMonitor {
5856
private readable: Readable;
5957
private buffers: Buffer[] = [];
@@ -64,6 +62,7 @@ export class AudioMonitor {
6462
constructor(
6563
readable: Readable,
6664
maxSize: number,
65+
onStart: () => void,
6766
callback: (buffer: Buffer) => void
6867
) {
6968
this.readable = readable;
@@ -98,6 +97,7 @@ export class AudioMonitor {
9897
});
9998
this.readable.on("speakingStarted", () => {
10099
if (this.ended) return;
100+
onStart();
101101
elizaLogger.log("Speaking started");
102102
this.reset();
103103
});
@@ -138,6 +138,8 @@ export class AudioMonitor {
138138
}
139139

140140
export class VoiceManager extends EventEmitter {
141+
private processingVoice: boolean = false;
142+
private transcriptionTimeout: NodeJS.Timeout | null = null;
141143
private userStates: Map<
142144
string,
143145
{
@@ -373,6 +375,7 @@ export class VoiceManager extends EventEmitter {
373375
if (avgVolume > SPEAKING_THRESHOLD) {
374376
volumeBuffer.length = 0;
375377
this.cleanupAudioPlayer(this.activeAudioPlayer);
378+
this.processingVoice = false;
376379
}
377380
}
378381
});
@@ -453,6 +456,52 @@ export class VoiceManager extends EventEmitter {
453456
// this.scanGuild(guild);
454457
}
455458

459+
async debouncedProcessTranscription(
460+
userId: UUID,
461+
name: string,
462+
userName: string,
463+
channel: BaseGuildVoiceChannel
464+
) {
465+
const DEBOUNCE_TRANSCRIPTION_THRESHOLD = 1500; // wait for 1.5 seconds of silence
466+
467+
if (this.activeAudioPlayer?.state?.status === "idle") {
468+
elizaLogger.log("Cleaning up idle audio player.");
469+
this.cleanupAudioPlayer(this.activeAudioPlayer);
470+
}
471+
472+
if (this.activeAudioPlayer || this.processingVoice) {
473+
const state = this.userStates.get(userId);
474+
state.buffers.length = 0;
475+
state.totalLength = 0;
476+
return;
477+
}
478+
479+
if (this.transcriptionTimeout) {
480+
clearTimeout(this.transcriptionTimeout);
481+
}
482+
483+
this.transcriptionTimeout = setTimeout(async () => {
484+
this.processingVoice = true;
485+
try {
486+
await this.processTranscription(
487+
userId,
488+
channel.id,
489+
channel,
490+
name,
491+
userName
492+
);
493+
494+
// Clean all users' previous buffers
495+
this.userStates.forEach((state, id) => {
496+
state.buffers.length = 0;
497+
state.totalLength = 0;
498+
});
499+
} finally {
500+
this.processingVoice = false;
501+
}
502+
}, DEBOUNCE_TRANSCRIPTION_THRESHOLD);
503+
}
504+
456505
async handleUserStream(
457506
userId: UUID,
458507
name: string,
@@ -461,7 +510,6 @@ export class VoiceManager extends EventEmitter {
461510
audioStream: Readable
462511
) {
463512
console.log(`Starting audio monitor for user: ${userId}`);
464-
const channelId = channel.id;
465513
if (!this.userStates.has(userId)) {
466514
this.userStates.set(userId, {
467515
buffers: [],
@@ -473,25 +521,17 @@ export class VoiceManager extends EventEmitter {
473521

474522
const state = this.userStates.get(userId);
475523

476-
const DEBOUNCE_TRANSCRIPTION_THRESHOLD = 2500; // wait for 1.5 seconds of silence
477-
478-
const debouncedProcessTranscription = debounce(async () => {
479-
await this.processTranscription(
480-
userId,
481-
channelId,
482-
channel,
483-
name,
484-
userName
485-
);
486-
}, DEBOUNCE_TRANSCRIPTION_THRESHOLD);
487-
488524
const processBuffer = async (buffer: Buffer) => {
489525
try {
490526
state!.buffers.push(buffer);
491527
state!.totalLength += buffer.length;
492528
state!.lastActive = Date.now();
493-
494-
debouncedProcessTranscription();
529+
this.debouncedProcessTranscription(
530+
userId,
531+
name,
532+
userName,
533+
channel
534+
);
495535
} catch (error) {
496536
console.error(
497537
`Error processing buffer for user ${userId}:`,
@@ -500,13 +540,22 @@ export class VoiceManager extends EventEmitter {
500540
}
501541
};
502542

503-
new AudioMonitor(audioStream, 10000000, async (buffer) => {
504-
if (!buffer) {
505-
console.error("Received empty buffer");
506-
return;
543+
new AudioMonitor(
544+
audioStream,
545+
10000000,
546+
() => {
547+
if (this.transcriptionTimeout) {
548+
clearTimeout(this.transcriptionTimeout);
549+
}
550+
},
551+
async (buffer) => {
552+
if (!buffer) {
553+
console.error("Received empty buffer");
554+
return;
555+
}
556+
await processBuffer(buffer);
507557
}
508-
await processBuffer(buffer);
509-
});
558+
);
510559
}
511560

512561
private async processTranscription(
@@ -520,12 +569,11 @@ export class VoiceManager extends EventEmitter {
520569
if (!state || state.buffers.length === 0) return;
521570
try {
522571
const inputBuffer = Buffer.concat(state.buffers, state.totalLength);
572+
523573
state.buffers.length = 0; // Clear the buffers
524574
state.totalLength = 0;
525-
526575
// Convert Opus to WAV
527576
const wavBuffer = await this.convertOpusToWav(inputBuffer);
528-
529577
console.log("Starting transcription...");
530578

531579
const transcriptionText = await this.runtime

packages/plugin-node/src/services/transcription.ts

+28 -3
Original file line number | Diff line number | Diff line change
@@ -14,6 +14,7 @@ import os from "os";
1414
import path from "path";
1515
import { fileURLToPath } from "url";
1616
import { promisify } from "util";
17+
import { createClient, DeepgramClient } from "@deepgram/sdk";
1718

1819
// const __dirname = path.dirname(new URL(import.meta.url).pathname); #compatibility issues with windows
1920
const __filename = fileURLToPath(import.meta.url);
@@ -25,17 +26,23 @@ export class TranscriptionService
2526
extends Service
2627
implements ITranscriptionService
2728
{
29+
private runtime: IAgentRuntime | null = null;
2830
static serviceType: ServiceType = ServiceType.TRANSCRIPTION;
2931
private CONTENT_CACHE_DIR: string;
3032
private DEBUG_AUDIO_DIR: string;
3133
private TARGET_SAMPLE_RATE = 16000; // Common sample rate for speech recognition
3234
private isCudaAvailable: boolean = false;
3335
private openai: OpenAI | null = null;
36+
private deepgram?: DeepgramClient;
3437

3538
private queue: { audioBuffer: ArrayBuffer; resolve: Function }[] = [];
3639
private processing: boolean = false;
3740

38-
async initialize(_runtime: IAgentRuntime): Promise<void> {}
41+
async initialize(_runtime: IAgentRuntime): Promise<void> {
42+
this.runtime = _runtime;
43+
const deepgramKey = this.runtime.getSetting("DEEPGRAM_API_KEY");
44+
this.deepgram = deepgramKey ? createClient(deepgramKey) : null;
45+
}
3946

4047
constructor() {
4148
super();
@@ -194,8 +201,9 @@ export class TranscriptionService
194201
while (this.queue.length > 0) {
195202
const { audioBuffer, resolve } = this.queue.shift()!;
196203
let result: string | null = null;
197-
198-
if (this.openai) {
204+
if (this.deepgram) {
205+
result = await this.transcribeWithDeepgram(audioBuffer);
206+
} else if (this.openai) {
199207
result = await this.transcribeWithOpenAI(audioBuffer);
200208
} else {
201209
result = await this.transcribeLocally(audioBuffer);
@@ -207,6 +215,23 @@ export class TranscriptionService
207215
this.processing = false;
208216
}
209217

218+
private async transcribeWithDeepgram(
219+
audioBuffer: ArrayBuffer
220+
): Promise<string | null> {
221+
const buffer = Buffer.from(audioBuffer);
222+
const response = await this.deepgram.listen.prerecorded.transcribeFile(
223+
buffer,
224+
{
225+
model: "nova-2",
226+
language: "en-US",
227+
smart_format: true,
228+
}
229+
);
230+
const result =
231+
response.result.results.channels[0].alternatives[0].transcript;
232+
return result;
233+
}
234+
210235
private async transcribeWithOpenAI(
211236
audioBuffer: ArrayBuffer
212237
): Promise<string | null> {

pnpm-lock.yaml

+1 -1
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

0 commit comments

Comments
 (0)