@@ -46,14 +46,13 @@ import {
     discordShouldRespondTemplate,
     discordVoiceHandlerTemplate,
 } from "./templates.ts";
-import debounce from "lodash/debounce.js";
 import { getWavHeader } from "./utils.ts";
+import { createClient, DeepgramClient } from "@deepgram/sdk";
 
 // These values are chosen for compatibility with picovoice components
 const DECODE_FRAME_SIZE = 1024;
 const DECODE_SAMPLE_RATE = 16000;
 
-// Buffers all audio
 export class AudioMonitor {
     private readable: Readable;
     private buffers: Buffer[] = [];
@@ -64,6 +63,7 @@ export class AudioMonitor {
     constructor(
         readable: Readable,
         maxSize: number,
+        onStart: () => void,
         callback: (buffer: Buffer) => void
     ) {
         this.readable = readable;
@@ -98,6 +98,7 @@ export class AudioMonitor {
         });
         this.readable.on("speakingStarted", () => {
             if (this.ended) return;
+            onStart();
             elizaLogger.log("Speaking started");
             this.reset();
         });
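AudioMonitor now takes an onStart callback as its third constructor argument, invoked on the stream's "speakingStarted" event before any new audio is buffered. A minimal usage sketch (audioStream, transcriptionTimeout, and transcribe are illustrative names, not code from the patch; the patch itself wires this up in handleUserStream further down):

const monitor = new AudioMonitor(
    audioStream,   // Readable carrying one user's voice audio
    10_000_000,    // maxSize: upper bound on buffered bytes
    () => {
        // speech onset: cancel any pending debounced transcription
        if (transcriptionTimeout) clearTimeout(transcriptionTimeout);
    },
    async (buffer) => {
        // speech ended / flush: hand the accumulated audio to transcription
        await transcribe(buffer);
    }
);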
@@ -138,6 +139,9 @@ export class AudioMonitor {
 }
 
 export class VoiceManager extends EventEmitter {
+    private deepgram?: DeepgramClient;
+    private processingVoice: boolean = false;
+    private transcriptionTimeout: NodeJS.Timeout | null = null;
     private userStates: Map<
         string,
         {
@@ -161,6 +165,9 @@ export class VoiceManager extends EventEmitter {
         super();
         this.client = client.client;
         this.runtime = client.runtime;
+
+        const deepgramKey = this.runtime.getSetting("DEEPGRAM_API_KEY");
+        this.deepgram = deepgramKey ? createClient(deepgramKey) : null;
     }
 
     async handleVoiceStateUpdate(oldState: VoiceState, newState: VoiceState) {
@@ -373,6 +380,7 @@ export class VoiceManager extends EventEmitter {
                 if (avgVolume > SPEAKING_THRESHOLD) {
                     volumeBuffer.length = 0;
                     this.cleanupAudioPlayer(this.activeAudioPlayer);
+                    this.processingVoice = false;
                 }
             }
         });
@@ -453,6 +461,52 @@ export class VoiceManager extends EventEmitter {
         // this.scanGuild(guild);
     }
 
+    async debouncedProcessTranscription(
+        userId: UUID,
+        name: string,
+        userName: string,
+        channel: BaseGuildVoiceChannel
+    ) {
+        const DEBOUNCE_TRANSCRIPTION_THRESHOLD = 1500; // wait for 1.5 seconds of silence
+
+        if (this.activeAudioPlayer?.state?.status === "idle") {
+            elizaLogger.log("Cleaning up idle audio player.");
+            this.cleanupAudioPlayer(this.activeAudioPlayer);
+        }
+
+        if (this.activeAudioPlayer || this.processingVoice) {
+            const state = this.userStates.get(userId);
+            state.buffers.length = 0;
+            state.totalLength = 0;
+            return;
+        }
+
+        if (this.transcriptionTimeout) {
+            clearTimeout(this.transcriptionTimeout);
+        }
+
+        this.transcriptionTimeout = setTimeout(async () => {
+            this.processingVoice = true;
+            try {
+                await this.processTranscription(
+                    userId,
+                    channel.id,
+                    channel,
+                    name,
+                    userName
+                );
+
+                // Clean all users' previous buffers
+                this.userStates.forEach((state, id) => {
+                    state.buffers.length = 0;
+                    state.totalLength = 0;
+                });
+            } finally {
+                this.processingVoice = false;
+            }
+        }, DEBOUNCE_TRANSCRIPTION_THRESHOLD);
+    }
+
     async handleUserStream(
         userId: UUID,
         name: string,
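This new method replaces the lodash debounce (its import is dropped above and the inline debounce is removed further down): each incoming audio buffer clears and re-arms transcriptionTimeout, so processTranscription runs only after 1.5 seconds with no new audio, and because the handle lives on the class, AudioMonitor's onStart hook can cancel it when the user resumes speaking. Reduced to its core, the pattern looks roughly like this (illustrative names, not code from the patch):

let timer: NodeJS.Timeout | null = null;

// Trailing-edge debounce: fn runs only after waitMs of quiet.
function debounceCall(fn: () => void, waitMs: number) {
    if (timer) clearTimeout(timer); // a new call restarts the quiet window
    timer = setTimeout(fn, waitMs); // fires only if no further calls arrive
}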
@@ -461,7 +515,6 @@ export class VoiceManager extends EventEmitter {
         audioStream: Readable
     ) {
         console.log(`Starting audio monitor for user: ${userId}`);
-        const channelId = channel.id;
         if (!this.userStates.has(userId)) {
             this.userStates.set(userId, {
                 buffers: [],
@@ -473,25 +526,17 @@ export class VoiceManager extends EventEmitter {
 
         const state = this.userStates.get(userId);
 
-        const DEBOUNCE_TRANSCRIPTION_THRESHOLD = 2500; // wait for 1.5 seconds of silence
-
-        const debouncedProcessTranscription = debounce(async () => {
-            await this.processTranscription(
-                userId,
-                channelId,
-                channel,
-                name,
-                userName
-            );
-        }, DEBOUNCE_TRANSCRIPTION_THRESHOLD);
-
         const processBuffer = async (buffer: Buffer) => {
             try {
                 state!.buffers.push(buffer);
                 state!.totalLength += buffer.length;
                 state!.lastActive = Date.now();
-
-                debouncedProcessTranscription();
+                this.debouncedProcessTranscription(
+                    userId,
+                    name,
+                    userName,
+                    channel
+                );
             } catch (error) {
                 console.error(
                     `Error processing buffer for user ${userId}:`,
@@ -500,13 +545,22 @@ export class VoiceManager extends EventEmitter {
             }
         };
 
-        new AudioMonitor(audioStream, 10000000, async (buffer) => {
-            if (!buffer) {
-                console.error("Received empty buffer");
-                return;
+        new AudioMonitor(
+            audioStream,
+            10000000,
+            () => {
+                if (this.transcriptionTimeout) {
+                    clearTimeout(this.transcriptionTimeout);
+                }
+            },
+            async (buffer) => {
+                if (!buffer) {
+                    console.error("Received empty buffer");
+                    return;
+                }
+                await processBuffer(buffer);
             }
-            await processBuffer(buffer);
-        });
+        );
     }
 
     private async processTranscription(
@@ -520,17 +574,35 @@ export class VoiceManager extends EventEmitter {
         if (!state || state.buffers.length === 0) return;
         try {
             const inputBuffer = Buffer.concat(state.buffers, state.totalLength);
+
            state.buffers.length = 0; // Clear the buffers
             state.totalLength = 0;
-
             // Convert Opus to WAV
             const wavBuffer = await this.convertOpusToWav(inputBuffer);
-
             console.log("Starting transcription...");
 
-            const transcriptionText = await this.runtime
-                .getService<ITranscriptionService>(ServiceType.TRANSCRIPTION)
-                .transcribe(wavBuffer);
+            let transcriptionText: string;
+
+            if (this.deepgram) {
+                const response =
+                    await this.deepgram.listen.prerecorded.transcribeFile(
+                        wavBuffer,
+                        {
+                            model: "nova-2",
+                            language: "en-US",
+                            smart_format: true,
+                        }
+                    );
+                transcriptionText =
+                    response.result.results.channels[0].alternatives[0]
+                        .transcript;
+            } else {
+                transcriptionText = await this.runtime
+                    .getService<ITranscriptionService>(
+                        ServiceType.TRANSCRIPTION
+                    )
+                    .transcribe(wavBuffer);
+            }
 
             function isValidTranscription(text: string): boolean {
                 if (!text || text.includes("[BLANK_AUDIO]")) return false;
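When DEEPGRAM_API_KEY is set, transcription now goes through the @deepgram/sdk prerecorded API as shown above; otherwise the runtime's ITranscriptionService path is unchanged. A self-contained sketch of the same Deepgram call (the environment variable and the WAV path are placeholders):

import { readFileSync } from "node:fs";
import { createClient } from "@deepgram/sdk";

// Sketch: transcribe a WAV file the way processTranscription does above.
async function transcribeWav(path: string): Promise<string> {
    const deepgram = createClient(process.env.DEEPGRAM_API_KEY!);
    const { result, error } = await deepgram.listen.prerecorded.transcribeFile(
        readFileSync(path),
        { model: "nova-2", language: "en-US", smart_format: true }
    );
    if (error || !result) throw error ?? new Error("empty Deepgram response");
    return result.results.channels[0].alternatives[0].transcript;
}

transcribeWav("sample.wav").then((text) => console.log(text));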