Skip to content

Commit 1f6013f

Browse files
committed
improve voice processing and add deepgram transcription option
1 parent 1c9a5a1 commit 1f6013f

File tree

1 file changed

+100
-28
lines changed

1 file changed

+100
-28
lines changed

packages/client-discord/src/voice.ts

+100-28
Original file line numberDiff line numberDiff line change
@@ -46,14 +46,13 @@ import {
4646
discordShouldRespondTemplate,
4747
discordVoiceHandlerTemplate,
4848
} from "./templates.ts";
49-
import debounce from "lodash/debounce.js";
5049
import { getWavHeader } from "./utils.ts";
50+
import { createClient, DeepgramClient } from "@deepgram/sdk";
5151

5252
// These values are chosen for compatibility with picovoice components
5353
const DECODE_FRAME_SIZE = 1024;
5454
const DECODE_SAMPLE_RATE = 16000;
5555

56-
// Buffers all audio
5756
export class AudioMonitor {
5857
private readable: Readable;
5958
private buffers: Buffer[] = [];
@@ -64,6 +63,7 @@ export class AudioMonitor {
6463
constructor(
6564
readable: Readable,
6665
maxSize: number,
66+
onStart: () => void,
6767
callback: (buffer: Buffer) => void
6868
) {
6969
this.readable = readable;
@@ -98,6 +98,7 @@ export class AudioMonitor {
9898
});
9999
this.readable.on("speakingStarted", () => {
100100
if (this.ended) return;
101+
onStart();
101102
elizaLogger.log("Speaking started");
102103
this.reset();
103104
});
@@ -138,6 +139,9 @@ export class AudioMonitor {
138139
}
139140

140141
export class VoiceManager extends EventEmitter {
142+
private deepgram?: DeepgramClient;
143+
private processingVoice: boolean = false;
144+
private transcriptionTimeout: NodeJS.Timeout | null = null;
141145
private userStates: Map<
142146
string,
143147
{
@@ -161,6 +165,9 @@ export class VoiceManager extends EventEmitter {
161165
super();
162166
this.client = client.client;
163167
this.runtime = client.runtime;
168+
169+
const deepgramKey = this.runtime.getSetting("DEEPGRAM_API_KEY");
170+
this.deepgram = deepgramKey ? createClient(deepgramKey) : null;
164171
}
165172

166173
async handleVoiceStateUpdate(oldState: VoiceState, newState: VoiceState) {
@@ -373,6 +380,7 @@ export class VoiceManager extends EventEmitter {
373380
if (avgVolume > SPEAKING_THRESHOLD) {
374381
volumeBuffer.length = 0;
375382
this.cleanupAudioPlayer(this.activeAudioPlayer);
383+
this.processingVoice = false;
376384
}
377385
}
378386
});
@@ -453,6 +461,52 @@ export class VoiceManager extends EventEmitter {
453461
// this.scanGuild(guild);
454462
}
455463

464+
/**
 * Debounced entry point for voice transcription: restarts a silence timer on
 * every incoming audio chunk and only transcribes once the user has been
 * quiet for DEBOUNCE_TRANSCRIPTION_THRESHOLD ms.
 *
 * @param userId   - ID of the speaking Discord user.
 * @param name     - Display name forwarded to processTranscription.
 * @param userName - Username forwarded to processTranscription.
 * @param channel  - Voice channel the audio was captured in.
 */
async debouncedProcessTranscription(
    userId: UUID,
    name: string,
    userName: string,
    channel: BaseGuildVoiceChannel
) {
    const DEBOUNCE_TRANSCRIPTION_THRESHOLD = 1500; // wait for 1.5 seconds of silence

    // An idle player has finished speaking; release it so the guard below
    // no longer blocks transcription.
    if (this.activeAudioPlayer?.state?.status === "idle") {
        elizaLogger.log("Cleaning up idle audio player.");
        this.cleanupAudioPlayer(this.activeAudioPlayer);
    }

    // While the bot is speaking or a transcription is already in flight,
    // drop the user's buffered audio instead of queueing it.
    if (this.activeAudioPlayer || this.processingVoice) {
        const state = this.userStates.get(userId);
        // Guard: the map may have no entry for this user yet.
        if (state) {
            state.buffers.length = 0;
            state.totalLength = 0;
        }
        return;
    }

    // New audio arrived — restart the silence timer.
    if (this.transcriptionTimeout) {
        clearTimeout(this.transcriptionTimeout);
    }

    this.transcriptionTimeout = setTimeout(async () => {
        this.processingVoice = true;
        try {
            await this.processTranscription(
                userId,
                channel.id,
                channel,
                name,
                userName
            );

            // Clear every user's buffered audio so stale speech is not
            // transcribed on the next pass.
            this.userStates.forEach((state) => {
                state.buffers.length = 0;
                state.totalLength = 0;
            });
        } catch (error) {
            // A setTimeout callback has no caller to propagate to; log the
            // failure rather than leaving an unhandled promise rejection.
            elizaLogger.error("Error processing transcription:", error);
        } finally {
            this.processingVoice = false;
            // Drop the stale handle now that the timer has fired.
            this.transcriptionTimeout = null;
        }
    }, DEBOUNCE_TRANSCRIPTION_THRESHOLD);
}
509+
456510
async handleUserStream(
457511
userId: UUID,
458512
name: string,
@@ -461,7 +515,6 @@ export class VoiceManager extends EventEmitter {
461515
audioStream: Readable
462516
) {
463517
console.log(`Starting audio monitor for user: ${userId}`);
464-
const channelId = channel.id;
465518
if (!this.userStates.has(userId)) {
466519
this.userStates.set(userId, {
467520
buffers: [],
@@ -473,25 +526,17 @@ export class VoiceManager extends EventEmitter {
473526

474527
const state = this.userStates.get(userId);
475528

476-
const DEBOUNCE_TRANSCRIPTION_THRESHOLD = 2500; // wait for 1.5 seconds of silence
477-
478-
const debouncedProcessTranscription = debounce(async () => {
479-
await this.processTranscription(
480-
userId,
481-
channelId,
482-
channel,
483-
name,
484-
userName
485-
);
486-
}, DEBOUNCE_TRANSCRIPTION_THRESHOLD);
487-
488529
const processBuffer = async (buffer: Buffer) => {
489530
try {
490531
state!.buffers.push(buffer);
491532
state!.totalLength += buffer.length;
492533
state!.lastActive = Date.now();
493-
494-
debouncedProcessTranscription();
534+
this.debouncedProcessTranscription(
535+
userId,
536+
name,
537+
userName,
538+
channel
539+
);
495540
} catch (error) {
496541
console.error(
497542
`Error processing buffer for user ${userId}:`,
@@ -500,13 +545,22 @@ export class VoiceManager extends EventEmitter {
500545
}
501546
};
502547

503-
new AudioMonitor(audioStream, 10000000, async (buffer) => {
504-
if (!buffer) {
505-
console.error("Received empty buffer");
506-
return;
548+
new AudioMonitor(
549+
audioStream,
550+
10000000,
551+
() => {
552+
if (this.transcriptionTimeout) {
553+
clearTimeout(this.transcriptionTimeout);
554+
}
555+
},
556+
async (buffer) => {
557+
if (!buffer) {
558+
console.error("Received empty buffer");
559+
return;
560+
}
561+
await processBuffer(buffer);
507562
}
508-
await processBuffer(buffer);
509-
});
563+
);
510564
}
511565

512566
private async processTranscription(
@@ -520,17 +574,35 @@ export class VoiceManager extends EventEmitter {
520574
if (!state || state.buffers.length === 0) return;
521575
try {
522576
const inputBuffer = Buffer.concat(state.buffers, state.totalLength);
577+
523578
state.buffers.length = 0; // Clear the buffers
524579
state.totalLength = 0;
525-
526580
// Convert Opus to WAV
527581
const wavBuffer = await this.convertOpusToWav(inputBuffer);
528-
529582
console.log("Starting transcription...");
530583

531-
const transcriptionText = await this.runtime
532-
.getService<ITranscriptionService>(ServiceType.TRANSCRIPTION)
533-
.transcribe(wavBuffer);
584+
let transcriptionText: string;
585+
586+
if (this.deepgram) {
587+
const response =
588+
await this.deepgram.listen.prerecorded.transcribeFile(
589+
wavBuffer,
590+
{
591+
model: "nova-2",
592+
language: "en-US",
593+
smart_format: true,
594+
}
595+
);
596+
transcriptionText =
597+
response.result.results.channels[0].alternatives[0]
598+
.transcript;
599+
} else {
600+
transcriptionText = await this.runtime
601+
.getService<ITranscriptionService>(
602+
ServiceType.TRANSCRIPTION
603+
)
604+
.transcribe(wavBuffer);
605+
}
534606

535607
function isValidTranscription(text: string): boolean {
536608
if (!text || text.includes("[BLANK_AUDIO]")) return false;

0 commit comments

Comments
 (0)