@@ -46,14 +46,12 @@ import {
     discordShouldRespondTemplate,
     discordVoiceHandlerTemplate,
 } from "./templates.ts";
-import debounce from "lodash/debounce.js";
 import { getWavHeader } from "./utils.ts";
 
 // These values are chosen for compatibility with picovoice components
 const DECODE_FRAME_SIZE = 1024;
 const DECODE_SAMPLE_RATE = 16000;
 
-// Buffers all audio
 export class AudioMonitor {
     private readable: Readable;
     private buffers: Buffer[] = [];
@@ -64,6 +62,7 @@ export class AudioMonitor {
     constructor(
         readable: Readable,
         maxSize: number,
+        onStart: () => void,
         callback: (buffer: Buffer) => void
     ) {
         this.readable = readable;
@@ -98,6 +97,7 @@ export class AudioMonitor {
         });
         this.readable.on("speakingStarted", () => {
             if (this.ended) return;
+            onStart();
             elizaLogger.log("Speaking started");
             this.reset();
         });
@@ -138,6 +138,8 @@ export class AudioMonitor {
 }
 
 export class VoiceManager extends EventEmitter {
+    private processingVoice: boolean = false;
+    private transcriptionTimeout: NodeJS.Timeout | null = null;
     private userStates: Map<
         string,
         {
@@ -373,6 +375,7 @@ export class VoiceManager extends EventEmitter {
                 if (avgVolume > SPEAKING_THRESHOLD) {
                     volumeBuffer.length = 0;
                     this.cleanupAudioPlayer(this.activeAudioPlayer);
+                    this.processingVoice = false;
                 }
             }
         });
@@ -453,6 +456,52 @@ export class VoiceManager extends EventEmitter {
         // this.scanGuild(guild);
     }
 
+    async debouncedProcessTranscription(
+        userId: UUID,
+        name: string,
+        userName: string,
+        channel: BaseGuildVoiceChannel
+    ) {
+        const DEBOUNCE_TRANSCRIPTION_THRESHOLD = 1500; // wait for 1.5 seconds of silence
+
+        if (this.activeAudioPlayer?.state?.status === "idle") {
+            elizaLogger.log("Cleaning up idle audio player.");
+            this.cleanupAudioPlayer(this.activeAudioPlayer);
+        }
+
+        if (this.activeAudioPlayer || this.processingVoice) {
+            const state = this.userStates.get(userId);
+            state.buffers.length = 0;
+            state.totalLength = 0;
+            return;
+        }
+
+        if (this.transcriptionTimeout) {
+            clearTimeout(this.transcriptionTimeout);
+        }
+
+        this.transcriptionTimeout = setTimeout(async () => {
+            this.processingVoice = true;
+            try {
+                await this.processTranscription(
+                    userId,
+                    channel.id,
+                    channel,
+                    name,
+                    userName
+                );
+
+                // Clean all users' previous buffers
+                this.userStates.forEach((state, id) => {
+                    state.buffers.length = 0;
+                    state.totalLength = 0;
+                });
+            } finally {
+                this.processingVoice = false;
+            }
+        }, DEBOUNCE_TRANSCRIPTION_THRESHOLD);
+    }
+
     async handleUserStream(
         userId: UUID,
         name: string,
@@ -461,7 +510,6 @@ export class VoiceManager extends EventEmitter {
         audioStream: Readable
     ) {
         console.log(`Starting audio monitor for user: ${userId}`);
-        const channelId = channel.id;
         if (!this.userStates.has(userId)) {
             this.userStates.set(userId, {
                 buffers: [],
@@ -473,25 +521,17 @@ export class VoiceManager extends EventEmitter {
 
         const state = this.userStates.get(userId);
 
-        const DEBOUNCE_TRANSCRIPTION_THRESHOLD = 2500; // wait for 1.5 seconds of silence
-
-        const debouncedProcessTranscription = debounce(async () => {
-            await this.processTranscription(
-                userId,
-                channelId,
-                channel,
-                name,
-                userName
-            );
-        }, DEBOUNCE_TRANSCRIPTION_THRESHOLD);
-
         const processBuffer = async (buffer: Buffer) => {
             try {
                 state!.buffers.push(buffer);
                 state!.totalLength += buffer.length;
                 state!.lastActive = Date.now();
-
-                debouncedProcessTranscription();
+                this.debouncedProcessTranscription(
+                    userId,
+                    name,
+                    userName,
+                    channel
+                );
             } catch (error) {
                 console.error(
                     `Error processing buffer for user ${userId}:`,
@@ -500,13 +540,22 @@ export class VoiceManager extends EventEmitter {
             }
         };
 
-        new AudioMonitor(audioStream, 10000000, async (buffer) => {
-            if (!buffer) {
-                console.error("Received empty buffer");
-                return;
+        new AudioMonitor(
+            audioStream,
+            10000000,
+            () => {
+                if (this.transcriptionTimeout) {
+                    clearTimeout(this.transcriptionTimeout);
+                }
+            },
+            async (buffer) => {
+                if (!buffer) {
+                    console.error("Received empty buffer");
+                    return;
+                }
+                await processBuffer(buffer);
             }
-            await processBuffer(buffer);
-        });
+        );
     }
 
     private async processTranscription(
@@ -520,12 +569,11 @@ export class VoiceManager extends EventEmitter {
         if (!state || state.buffers.length === 0) return;
         try {
             const inputBuffer = Buffer.concat(state.buffers, state.totalLength);
+
             state.buffers.length = 0; // Clear the buffers
             state.totalLength = 0;
-
             // Convert Opus to WAV
             const wavBuffer = await this.convertOpusToWav(inputBuffer);
-
             console.log("Starting transcription...");
 
             const transcriptionText = await this.runtime
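Note: the diff above swaps the per-stream lodash `debounce` for a single class-level `setTimeout` handle (`transcriptionTimeout`) guarded by a `processingVoice` flag, so the new `onStart` callback can cancel a pending transcription when the user starts speaking again. The sketch below is only an illustration of that timer-plus-guard pattern, not code from the PR; the names (`TranscriptionDebouncer`, `poke`, `cancel`, `silenceMs`) are made up for the example.

```ts
// Minimal sketch of the debounce pattern: a shared timeout is restarted on
// every new audio chunk, and a boolean guard prevents overlapping runs.
class TranscriptionDebouncer {
    private timeout: NodeJS.Timeout | null = null;
    private busy = false;

    constructor(
        private readonly silenceMs: number,
        private readonly transcribe: () => Promise<void>
    ) {}

    // Call for every incoming audio chunk: restart the silence timer.
    poke(): void {
        if (this.busy) return; // a transcription run is already in flight
        if (this.timeout) clearTimeout(this.timeout);
        this.timeout = setTimeout(async () => {
            this.busy = true;
            try {
                await this.transcribe();
            } finally {
                this.busy = false;
            }
        }, this.silenceMs);
    }

    // Call when the user starts speaking again (cf. the onStart callback).
    cancel(): void {
        if (this.timeout) clearTimeout(this.timeout);
    }
}

// Usage: run transcription only after roughly 1.5 s with no new chunks.
const debouncer = new TranscriptionDebouncer(1500, async () => {
    console.log("transcribing buffered audio...");
});
debouncer.poke();
```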