From 2976bf4ec8c8ef2ce2db8512a500a90f7557d57b Mon Sep 17 00:00:00 2001 From: Ting Chien Meng <tcm390@nyu.edu> Date: Fri, 15 Nov 2024 12:22:22 -0500 Subject: [PATCH 01/19] fix: ensure unique instances for each Service subclass using Map --- packages/core/src/types.ts | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/packages/core/src/types.ts b/packages/core/src/types.ts index 040b743a68c..88f0bcdfd98 100644 --- a/packages/core/src/types.ts +++ b/packages/core/src/types.ts @@ -503,15 +503,17 @@ export interface IMemoryManager { } export abstract class Service { - private static instance: Service | null = null; + private static instances: Map<any, Service> = new Map(); static serviceType: ServiceType; public static getInstance<T extends Service>(): T { - if (!Service.instance) { - // Use this.prototype.constructor to instantiate the concrete class - Service.instance = new (this as any)(); + if (!Service.instances.has(this)) { + Service.instances.set( + this, + new (this as unknown as { new (): T })() + ); } - return Service.instance as T; + return Service.instances.get(this) as T; } } From 1ad97e33af4aef7a8d861749f10efc79b42ecc7d Mon Sep 17 00:00:00 2001 From: Ting Chien Meng <tcm390@nyu.edu> Date: Fri, 15 Nov 2024 16:35:50 -0500 Subject: [PATCH 02/19] reply with a text message if the user types something --- packages/client-discord/src/messages.ts | 27 ++++++++++++------------- 1 file changed, 13 insertions(+), 14 deletions(-) diff --git a/packages/client-discord/src/messages.ts b/packages/client-discord/src/messages.ts index 4d233f658de..d114639ea4d 100644 --- a/packages/client-discord/src/messages.ts +++ b/packages/client-discord/src/messages.ts @@ -341,7 +341,7 @@ export class MessageManager { if ( message.interaction || message.author.id === - this.client.user?.id /* || message.author?.bot*/ + this.client.user?.id /* || message.author?.bot*/ ) return; const userId = message.author.id as UUID; @@ -389,10 +389,10 @@ export class 
MessageManager { url: message.url, inReplyTo: message.reference?.messageId ? stringToUuid( - message.reference.messageId + - "-" + - this.runtime.agentId - ) + message.reference.messageId + + "-" + + this.runtime.agentId + ) : undefined, }; @@ -501,13 +501,11 @@ export class MessageManager { message.id + "-" + this.runtime.agentId ); } - if (message.channel.type === ChannelType.GuildVoice) { + if (false) { // For voice channels, use text-to-speech - const audioStream = await ( - this.runtime.getService( - ServiceType.SPEECH_GENERATION - ) - ).getInstance<ISpeechService>() + const audioStream = await this.runtime + .getService(ServiceType.SPEECH_GENERATION) + .getInstance<ISpeechService>() .generate(this.runtime, content.text); await this.voiceManager.playAudioStream( userId, @@ -659,14 +657,15 @@ export class MessageManager { for (const url of urls) { if ( - this.runtime.getService(ServiceType.VIDEO) + this.runtime + .getService(ServiceType.VIDEO) .getInstance<IVideoService>() .isVideoUrl(url) ) { - const videoInfo = await (this.runtime + const videoInfo = await this.runtime .getService(ServiceType.VIDEO) .getInstance<IVideoService>() - .processVideo(url)); + .processVideo(url); attachments.push({ id: `youtube-${Date.now()}`, url: url, From 997dc42dfb559e9bad364beafcade9378dfa246f Mon Sep 17 00:00:00 2001 From: Ting Chien Meng <tcm390@nyu.edu> Date: Fri, 15 Nov 2024 16:37:34 -0500 Subject: [PATCH 03/19] refactor stream handling: add debounce for transcription processing --- packages/client-discord/src/voice.ts | 503 +++++++++++++++------------ 1 file changed, 280 insertions(+), 223 deletions(-) diff --git a/packages/client-discord/src/voice.ts b/packages/client-discord/src/voice.ts index 744a8106b8e..c3857ea0b49 100644 --- a/packages/client-discord/src/voice.ts +++ b/packages/client-discord/src/voice.ts @@ -1,4 +1,5 @@ import { + AudioPlayer, AudioReceiveStream, NoSubscriberBehavior, StreamType, @@ -175,6 +176,17 @@ export class AudioMonitor { } export class 
VoiceManager extends EventEmitter { + private userStates: Map< + string, + { + buffers: Buffer[]; + totalLength: number; + lastActive: number; + transcriptionText: string; + } + > = new Map(); + private activeAudioPlayer: AudioPlayer | null = null; + private speaking: boolean = false; private client: Client; private runtime: IAgentRuntime; private streams: Map<string, Readable> = new Map(); @@ -236,14 +248,14 @@ export class VoiceManager extends EventEmitter { }); for (const [, member] of channel.members) { - if (!member.user.bot) { + if (member && !member.user.bot) { this.monitorMember(member, channel); } } connection.receiver.speaking.on("start", (userId: string) => { const user = channel.members.get(userId); - if (!user?.user.bot) { + if (user && !user?.user.bot) { this.monitorMember(user as GuildMember, channel); this.streams.get(userId)?.emit("speakingStarted"); } @@ -361,237 +373,266 @@ export class VoiceManager extends EventEmitter { channel: BaseGuildVoiceChannel, audioStream: Readable ) { + console.log(`Starting audio monitor for user: ${userId}`); const channelId = channel.id; - const buffers: Buffer[] = []; - let totalLength = 0; - const maxSilenceTime = 1000; // Maximum pause duration in milliseconds - const minSilenceTime = 50; // Minimum silence duration to trigger transcription - let lastChunkTime = Date.now(); - let transcriptionStarted = false; - let transcriptionText = ""; - console.log("new audio monitor for: ", userId); + if (!this.userStates.has(userId)) { + this.userStates.set(userId, { + buffers: [], + totalLength: 0, + lastActive: Date.now(), + transcriptionText: "", + }); + } + + const state = this.userStates.get(userId); + + const processBuffer = async (buffer: Buffer) => { + try { + state!.buffers.push(buffer); + state!.totalLength += buffer.length; + state!.lastActive = Date.now(); + + const DEBOUNCE_TRANSCRIPTION_THRESHOLD = 3000; // wait for 3 seconds of silence + + clearTimeout(state!["debounceTimeout"]); + state!["debounceTimeout"] = 
setTimeout(async () => { + await this.processTranscription( + userId, + channelId, + channel, + name, + userName + ); + }, DEBOUNCE_TRANSCRIPTION_THRESHOLD); + } catch (error) { + console.error( + `Error processing buffer for user ${userId}:`, + error + ); + } + }; const monitor = new AudioMonitor( audioStream, 10000000, async (buffer) => { - console.log("buffer: ", buffer); - const currentTime = Date.now(); - const silenceDuration = currentTime - lastChunkTime; if (!buffer) { - // Handle error - console.error("Empty buffer received"); + console.error("Received empty buffer"); return; } - buffers.push(buffer); - totalLength += buffer.length; - lastChunkTime = currentTime; - - if (silenceDuration > minSilenceTime && !transcriptionStarted) { - transcriptionStarted = true; - const inputBuffer = Buffer.concat(buffers, totalLength); - buffers.length = 0; - totalLength = 0; - - try { - // Convert Opus to WAV and add the header - const wavBuffer = - await this.convertOpusToWav(inputBuffer); - - console.log("starting transcription"); - const text = await this.runtime - .getService(ServiceType.TRANSCRIPTION) - .getInstance<ITranscriptionService>() - .transcribe(wavBuffer); - console.log("transcribed text: ", text); - transcriptionText += text; - } catch (error) { - console.error("Error processing audio stream:", error); - } + await processBuffer(buffer); + } + ); + } + + private async processTranscription( + userId: UUID, + channelId: string, + channel: BaseGuildVoiceChannel, + name: string, + userName: string + ) { + const state = this.userStates.get(userId); + if (!state || state.buffers.length === 0) return; + + try { + const inputBuffer = Buffer.concat(state.buffers, state.totalLength); + state.buffers.length = 0; // Clear the buffers + state.totalLength = 0; + + // Convert Opus to WAV + const wavBuffer = await this.convertOpusToWav(inputBuffer); + + console.log("Starting transcription..."); + + const transcriptionText = await this.runtime + 
.getService(ServiceType.TRANSCRIPTION) + .getInstance<ITranscriptionService>() + .transcribe(wavBuffer); + + function invalidText(text: string): boolean { + if (text.includes("[BLANK_AUDIO]")) { + return true; + } + // if (text.length < 5 && text.toLowerCase().includes("you")) { // not sure what is this + // return true; + // } + if (text === null) { + return true; } + return false; + } + + if (transcriptionText && !invalidText(transcriptionText)) { + state.transcriptionText += transcriptionText; + } + + if (state.transcriptionText.length) { + const finalText = state.transcriptionText; + state.transcriptionText = ""; + await this.handleTranscriptionResult( + finalText, + userId, + channelId, + channel, + name, + userName + ); + } + } catch (error) { + console.error( + `Error transcribing audio for user ${userId}:`, + error + ); + } + } + + private async handleTranscriptionResult( + text: string, + userId: UUID, + channelId: string, + channel: BaseGuildVoiceChannel, + name: string, + userName: string + ) { + try { + const roomId = stringToUuid(channelId + "-" + this.runtime.agentId); + const userIdUUID = stringToUuid(userId); + + await this.runtime.ensureConnection( + userIdUUID, + roomId, + userName, + name, + "discord" + ); - if (silenceDuration > maxSilenceTime && transcriptionStarted) { - console.log("transcription finished"); - transcriptionStarted = false; + let state = await this.runtime.composeState( + { + agentId: this.runtime.agentId, + content: { text: text, source: "Discord" }, + userId: userIdUUID, + roomId, + }, + { + discordChannel: channel, + discordClient: this.client, + agentName: this.runtime.character.name, + } + ); - if (!transcriptionText) return; + if (text && text.startsWith("/")) { + return null; + } - try { - const text = transcriptionText; + const memory = { + id: stringToUuid(channelId + "-voice-message-" + Date.now()), + agentId: this.runtime.agentId, + content: { + text: text, + source: "discord", + url: channel.url, + }, + userId: 
userIdUUID, + roomId, + embedding: embeddingZeroVector, + createdAt: Date.now(), + }; + + if (!memory.content.text) { + return { text: "", action: "IGNORE" }; + } - // handle whisper cases - if ( - (text.length < 15 && - text.includes("[BLANK_AUDIO]")) || - (text.length < 5 && - text.toLowerCase().includes("you")) - ) { - transcriptionText = ""; // Reset transcription text - return; - } + await this.runtime.messageManager.createMemory(memory); - const roomId = stringToUuid( - channelId + "-" + this.runtime.agentId - ); - const userIdUUID = stringToUuid(userId); - - await this.runtime.ensureConnection( - userIdUUID, - roomId, - userName, - name, - "discord" - ); + state = await this.runtime.updateRecentMessageState(state); - let state = await this.runtime.composeState( - { - agentId: this.runtime.agentId, - content: { text: text, source: "Discord" }, - userId: userIdUUID, - roomId, - }, - { - discordChannel: channel, - discordClient: this.client, - agentName: this.runtime.character.name, - } - ); + const shouldIgnore = await this._shouldIgnore(memory); - if (text && text.startsWith("/")) { - transcriptionText = ""; // Reset transcription text - return null; - } - - const memory = { - id: stringToUuid( - channelId + "-voice-message-" + Date.now() - ), - agentId: this.runtime.agentId, - content: { - text: text, - source: "discord", - url: channel.url, - }, - userId: userIdUUID, - roomId, - embedding: embeddingZeroVector, - createdAt: Date.now(), - }; - - if (!memory.content.text) { - transcriptionText = ""; // Reset transcription text - return { text: "", action: "IGNORE" }; - } - - await this.runtime.messageManager.createMemory(memory); - - state = - await this.runtime.updateRecentMessageState(state); - - const shouldIgnore = await this._shouldIgnore(memory); - - if (shouldIgnore) { - transcriptionText = ""; // Reset transcription text - return { text: "", action: "IGNORE" }; - } - - const context = composeContext({ - state, - template: - 
this.runtime.character.templates - ?.discordVoiceHandlerTemplate || - this.runtime.character.templates - ?.messageHandlerTemplate || - discordVoiceHandlerTemplate, - }); - - const responseContent = await this._generateResponse( - memory, - state, - context - ); + if (shouldIgnore) { + return { text: "", action: "IGNORE" }; + } - const callback: HandlerCallback = async ( - content: Content - ) => { - console.log("callback content: ", content); - const { roomId } = memory; - - const responseMemory: Memory = { - id: stringToUuid( - memory.id + "-voice-response-" + Date.now() - ), - agentId: this.runtime.agentId, - userId: this.runtime.agentId, - content: { - ...content, - user: this.runtime.character.name, - inReplyTo: memory.id, - }, - roomId, - embedding: embeddingZeroVector, - }; - - if (responseMemory.content.text?.trim()) { - await this.runtime.messageManager.createMemory( - responseMemory - ); - state = - await this.runtime.updateRecentMessageState( - state - ); - const responseStream = await this.runtime - .getService(ServiceType.SPEECH_GENERATION) - .getInstance<ISpeechService>() - .generate(this.runtime, content.text); - - if (responseStream) { - await this.playAudioStream( - userId, - responseStream as Readable - ); - } - await this.runtime.evaluate(memory, state); - } else { - console.warn("Empty response, skipping"); - } - return [responseMemory]; - }; - - const responseMemories = - await callback(responseContent); - - const response = responseContent; - - const content = (response.responseMessage || - response.content || - response.message) as string; - - if (!content) { - transcriptionText = ""; // Reset transcription text - return null; - } - - console.log("responseMemories: ", responseMemories); - - await this.runtime.processActions( - memory, - responseMemories, - state, - callback - ); + const context = composeContext({ + state, + template: + this.runtime.character.templates + ?.discordVoiceHandlerTemplate || + 
this.runtime.character.templates?.messageHandlerTemplate || + discordVoiceHandlerTemplate, + }); + + const responseContent = await this._generateResponse( + memory, + state, + context + ); - transcriptionText = ""; // Reset transcription text - } catch (error) { - console.error( - "Error processing transcribed text:", - error + const callback: HandlerCallback = async (content: Content) => { + console.log("callback content: ", content); + const { roomId } = memory; + + const responseMemory: Memory = { + id: stringToUuid( + memory.id + "-voice-response-" + Date.now() + ), + agentId: this.runtime.agentId, + userId: this.runtime.agentId, + content: { + ...content, + user: this.runtime.character.name, + inReplyTo: memory.id, + }, + roomId, + embedding: embeddingZeroVector, + }; + + if (responseMemory.content.text?.trim()) { + await this.runtime.messageManager.createMemory( + responseMemory + ); + state = await this.runtime.updateRecentMessageState(state); + const responseStream = await this.runtime + .getService(ServiceType.SPEECH_GENERATION) + .getInstance<ISpeechService>() + .generate(this.runtime, content.text); + + if (responseStream) { + await this.playAudioStream( + userId, + responseStream as Readable ); - transcriptionText = ""; // Reset transcription text } + await this.runtime.evaluate(memory, state); + } else { + console.warn("Empty response, skipping"); } + return [responseMemory]; + }; + + const responseMemories = await callback(responseContent); + + const response = responseContent; + + const content = (response.responseMessage || + response.content || + response.message) as string; + + if (!content) { + return null; } - ); + + console.log("responseMemories: ", responseMemories); + + await this.runtime.processActions( + memory, + responseMemories, + state, + callback + ); + } catch (error) { + console.error("Error processing transcribed text:", error); + } } private async convertOpusToWav(pcmBuffer: Buffer): Promise<Buffer> { @@ -723,11 +764,15 @@ export 
class VoiceManager extends EventEmitter { console.log(`No connection for user ${userId}`); return; } + + this.cleanupAudioPlayer(this.activeAudioPlayer); + const audioPlayer = createAudioPlayer({ behaviors: { noSubscriber: NoSubscriberBehavior.Pause, }, }); + this.activeAudioPlayer = audioPlayer; connection.subscribe(audioPlayer); const audioStartTime = Date.now(); @@ -737,21 +782,33 @@ export class VoiceManager extends EventEmitter { }); audioPlayer.play(resource); - audioPlayer.on("error", (err: any) => { + const handleError = (err: any) => { console.log(`Audio player error: ${err}`); - }); - - audioPlayer.on( - "stateChange", - (oldState: any, newState: { status: string }) => { - if (newState.status == "idle") { - const idleTime = Date.now(); - console.log( - `Audio playback took: ${idleTime - audioStartTime}ms` - ); - } + }; + const handleStateChange = ( + oldState: any, + newState: { status: string } + ) => { + if (newState.status == "idle") { + const idleTime = Date.now(); + console.log( + `Audio playback took: ${idleTime - audioStartTime}ms` + ); } - ); + }; + + audioPlayer.on("stateChange", handleStateChange); + audioPlayer.on("error", handleError); + } + + cleanupAudioPlayer(audioPlayer: AudioPlayer) { + if (!audioPlayer) return; + + audioPlayer.stop(); + audioPlayer.removeAllListeners(); + if (audioPlayer === this.activeAudioPlayer) { + this.activeAudioPlayer = null; + } } async handleJoinChannelCommand(interaction: any) { From e090654da4008fcafb3a996fd2820077ab6e53ad Mon Sep 17 00:00:00 2001 From: Ting Chien Meng <tcm390@nyu.edu> Date: Fri, 15 Nov 2024 17:24:29 -0500 Subject: [PATCH 04/19] ensure accurate member retrieval in voice channel events --- packages/client-discord/src/voice.ts | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/packages/client-discord/src/voice.ts b/packages/client-discord/src/voice.ts index c3857ea0b49..12be7d7be88 100644 --- a/packages/client-discord/src/voice.ts +++ 
b/packages/client-discord/src/voice.ts @@ -253,8 +253,15 @@ export class VoiceManager extends EventEmitter { } } - connection.receiver.speaking.on("start", (userId: string) => { - const user = channel.members.get(userId); + connection.receiver.speaking.on("start", async (userId: string) => { + let user = channel.members.get(userId); + if (!user) { + try { + user = await channel.guild.members.fetch(userId); + } catch (error) { + console.error("Failed to fetch user:", error); + } + } if (user && !user?.user.bot) { this.monitorMember(user as GuildMember, channel); this.streams.get(userId)?.emit("speakingStarted"); From 86862cd340fb4a1fd8db5861ee07e179e977a4a5 Mon Sep 17 00:00:00 2001 From: Ting Chien Meng <tcm390@nyu.edu> Date: Fri, 15 Nov 2024 21:38:30 -0500 Subject: [PATCH 05/19] stop playback if someone speaks --- packages/client-discord/src/voice.ts | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/packages/client-discord/src/voice.ts b/packages/client-discord/src/voice.ts index 12be7d7be88..f8e2565ff16 100644 --- a/packages/client-discord/src/voice.ts +++ b/packages/client-discord/src/voice.ts @@ -399,7 +399,9 @@ export class VoiceManager extends EventEmitter { state!.totalLength += buffer.length; state!.lastActive = Date.now(); - const DEBOUNCE_TRANSCRIPTION_THRESHOLD = 3000; // wait for 3 seconds of silence + console.log(buffer.length); + + const DEBOUNCE_TRANSCRIPTION_THRESHOLD = 2000; // wait for 2 seconds of silence clearTimeout(state!["debounceTimeout"]); state!["debounceTimeout"] = setTimeout(async () => { @@ -475,6 +477,7 @@ export class VoiceManager extends EventEmitter { } if (state.transcriptionText.length) { + this.cleanupAudioPlayer(this.activeAudioPlayer); const finalText = state.transcriptionText; state.transcriptionText = ""; await this.handleTranscriptionResult( From ff72588448987ba7d1c5ded899d6e78ddde4257a Mon Sep 17 00:00:00 2001 From: Ting Chien Meng <tcm390@nyu.edu> Date: Fri, 15 Nov 2024 21:39:03 -0500 Subject: [PATCH 
06/19] clean code --- packages/client-discord/src/voice.ts | 2 -- 1 file changed, 2 deletions(-) diff --git a/packages/client-discord/src/voice.ts b/packages/client-discord/src/voice.ts index f8e2565ff16..e77b22c8a50 100644 --- a/packages/client-discord/src/voice.ts +++ b/packages/client-discord/src/voice.ts @@ -399,8 +399,6 @@ export class VoiceManager extends EventEmitter { state!.totalLength += buffer.length; state!.lastActive = Date.now(); - console.log(buffer.length); - const DEBOUNCE_TRANSCRIPTION_THRESHOLD = 2000; // wait for 2 seconds of silence clearTimeout(state!["debounceTimeout"]); From e49c83ebf57ab88b20797dd361513dae9043ac05 Mon Sep 17 00:00:00 2001 From: Ting Chien Meng <tcm390@nyu.edu> Date: Sat, 16 Nov 2024 08:39:11 -0500 Subject: [PATCH 07/19] stop the agent's current audio playback if the user is speaking --- packages/client-discord/src/voice.ts | 30 ++++++++++++++++++++++++---- 1 file changed, 26 insertions(+), 4 deletions(-) diff --git a/packages/client-discord/src/voice.ts b/packages/client-discord/src/voice.ts index e77b22c8a50..1f9436b0466 100644 --- a/packages/client-discord/src/voice.ts +++ b/packages/client-discord/src/voice.ts @@ -186,7 +186,6 @@ export class VoiceManager extends EventEmitter { } > = new Map(); private activeAudioPlayer: AudioPlayer | null = null; - private speaking: boolean = false; private client: Client; private runtime: IAgentRuntime; private streams: Map<string, Readable> = new Map(); @@ -293,9 +292,32 @@ export class VoiceManager extends EventEmitter { } const opusDecoder = new prism.opus.Decoder({ channels: 1, - rate: DECODE_SAMPLE_RATE, - frameSize: DECODE_FRAME_SIZE, + rate: 16000, + frameSize: 1024, }); + + opusDecoder.on("data", (pcmData: Buffer) => { + // If the agent is currently speaking, monitor the volume of each user's audio. + // If the user's volume exceeds the threshold, it indicates that the user is actively speaking. + // In such cases, stop the agent's current audio playback. 
+ if (this.activeAudioPlayer) { + const samples = new Int16Array( + pcmData.buffer, + pcmData.byteOffset, + pcmData.length / 2 + ); + const rms = Math.sqrt( + samples.reduce((sum, sample) => sum + sample * sample, 0) / + samples.length + ); + const volume = rms / 32768; + const SPEAKING_THRESHOLD = 0.1; + if (volume > SPEAKING_THRESHOLD) { + this.cleanupAudioPlayer(this.activeAudioPlayer); + } + } + }); + pipeline( receiveStream as AudioReceiveStream, opusDecoder as any, @@ -399,7 +421,7 @@ export class VoiceManager extends EventEmitter { state!.totalLength += buffer.length; state!.lastActive = Date.now(); - const DEBOUNCE_TRANSCRIPTION_THRESHOLD = 2000; // wait for 2 seconds of silence + const DEBOUNCE_TRANSCRIPTION_THRESHOLD = 1000; // wait for 1 seconds of silence clearTimeout(state!["debounceTimeout"]); state!["debounceTimeout"] = setTimeout(async () => { From de646f01f4867e190f3bc32633ca1728d906f7e6 Mon Sep 17 00:00:00 2001 From: Ting Chien Meng <tcm390@nyu.edu> Date: Sat, 16 Nov 2024 10:48:43 -0500 Subject: [PATCH 08/19] use slide window for volume detection --- packages/client-discord/src/voice.ts | 62 ++++++++++++++++++---------- 1 file changed, 41 insertions(+), 21 deletions(-) diff --git a/packages/client-discord/src/voice.ts b/packages/client-discord/src/voice.ts index 1f9436b0466..0ec18fbc706 100644 --- a/packages/client-discord/src/voice.ts +++ b/packages/client-discord/src/voice.ts @@ -176,6 +176,7 @@ export class AudioMonitor { } export class VoiceManager extends EventEmitter { + private currentTranscriptionId: string | null = null; private userStates: Map< string, { @@ -292,14 +293,17 @@ export class VoiceManager extends EventEmitter { } const opusDecoder = new prism.opus.Decoder({ channels: 1, - rate: 16000, - frameSize: 1024, + rate: DECODE_SAMPLE_RATE, + frameSize: DECODE_FRAME_SIZE, }); - + const volumeBuffer: number[] = []; + const VOLUME_WINDOW_SIZE = 30; + const SPEAKING_THRESHOLD = 0.05; opusDecoder.on("data", (pcmData: Buffer) => { - // If 
the agent is currently speaking, monitor the volume of each user's audio. - // If the user's volume exceeds the threshold, it indicates that the user is actively speaking. - // In such cases, stop the agent's current audio playback. + // Monitor the audio volume while the agent is speaking. + // If the average volume of the user's audio exceeds the defined threshold, it indicates active speaking. + // When active speaking is detected, stop the agent's current audio playback to avoid overlap. + if (this.activeAudioPlayer) { const samples = new Int16Array( pcmData.buffer, @@ -311,8 +315,16 @@ export class VoiceManager extends EventEmitter { samples.length ); const volume = rms / 32768; - const SPEAKING_THRESHOLD = 0.1; - if (volume > SPEAKING_THRESHOLD) { + volumeBuffer.push(volume); + if (volumeBuffer.length > VOLUME_WINDOW_SIZE) { + volumeBuffer.shift(); + } + const avgVolume = + volumeBuffer.reduce((sum, v) => sum + v, 0) / + VOLUME_WINDOW_SIZE; + + if (avgVolume > SPEAKING_THRESHOLD) { + volumeBuffer.length = 0; this.cleanupAudioPlayer(this.activeAudioPlayer); } } @@ -421,7 +433,7 @@ export class VoiceManager extends EventEmitter { state!.totalLength += buffer.length; state!.lastActive = Date.now(); - const DEBOUNCE_TRANSCRIPTION_THRESHOLD = 1000; // wait for 1 seconds of silence + const DEBOUNCE_TRANSCRIPTION_THRESHOLD = 1500; // wait for 1 seconds of silence clearTimeout(state!["debounceTimeout"]); state!["debounceTimeout"] = setTimeout(async () => { @@ -463,6 +475,8 @@ export class VoiceManager extends EventEmitter { ) { const state = this.userStates.get(userId); if (!state || state.buffers.length === 0) return; + const transcriptionId = Date.now().toString(); + this.currentTranscriptionId = transcriptionId; try { const inputBuffer = Buffer.concat(state.buffers, state.totalLength); @@ -506,7 +520,8 @@ export class VoiceManager extends EventEmitter { channelId, channel, name, - userName + userName, + transcriptionId ); } } catch (error) { @@ -523,7 +538,8 @@ 
export class VoiceManager extends EventEmitter { channelId: string, channel: BaseGuildVoiceChannel, name: string, - userName: string + userName: string, + transcriptionId: string ) { try { const roomId = stringToUuid(channelId + "-" + this.runtime.agentId); @@ -622,16 +638,20 @@ export class VoiceManager extends EventEmitter { responseMemory ); state = await this.runtime.updateRecentMessageState(state); - const responseStream = await this.runtime - .getService(ServiceType.SPEECH_GENERATION) - .getInstance<ISpeechService>() - .generate(this.runtime, content.text); - - if (responseStream) { - await this.playAudioStream( - userId, - responseStream as Readable - ); + if (transcriptionId === this.currentTranscriptionId) { + // Ensure that only the latest transcription triggers the Eleven Labs API + // to avoid overlapping audio responses and unnecessary expenses + const responseStream = await this.runtime + .getService(ServiceType.SPEECH_GENERATION) + .getInstance<ISpeechService>() + .generate(this.runtime, content.text); + + if (responseStream) { + await this.playAudioStream( + userId, + responseStream as Readable + ); + } } await this.runtime.evaluate(memory, state); } else { From bdd6f9a2e78791e69ac235faa3ea5611182befcf Mon Sep 17 00:00:00 2001 From: Ting Chien Meng <tcm390@nyu.edu> Date: Sat, 16 Nov 2024 12:53:59 -0500 Subject: [PATCH 09/19] use maxAmplitude instead of rms --- packages/client-discord/src/voice.ts | 61 ++++++++++------------------ 1 file changed, 21 insertions(+), 40 deletions(-) diff --git a/packages/client-discord/src/voice.ts b/packages/client-discord/src/voice.ts index 0ec18fbc706..c6a209fb921 100644 --- a/packages/client-discord/src/voice.ts +++ b/packages/client-discord/src/voice.ts @@ -176,7 +176,6 @@ export class AudioMonitor { } export class VoiceManager extends EventEmitter { - private currentTranscriptionId: string | null = null; private userStates: Map< string, { @@ -310,12 +309,9 @@ export class VoiceManager extends EventEmitter { 
pcmData.byteOffset, pcmData.length / 2 ); - const rms = Math.sqrt( - samples.reduce((sum, sample) => sum + sample * sample, 0) / - samples.length - ); - const volume = rms / 32768; - volumeBuffer.push(volume); + const maxAmplitude = Math.max(...samples.map(Math.abs)) / 32768; + volumeBuffer.push(maxAmplitude); + if (volumeBuffer.length > VOLUME_WINDOW_SIZE) { volumeBuffer.shift(); } @@ -475,9 +471,6 @@ export class VoiceManager extends EventEmitter { ) { const state = this.userStates.get(userId); if (!state || state.buffers.length === 0) return; - const transcriptionId = Date.now().toString(); - this.currentTranscriptionId = transcriptionId; - try { const inputBuffer = Buffer.concat(state.buffers, state.totalLength); state.buffers.length = 0; // Clear the buffers @@ -493,20 +486,12 @@ export class VoiceManager extends EventEmitter { .getInstance<ITranscriptionService>() .transcribe(wavBuffer); - function invalidText(text: string): boolean { - if (text.includes("[BLANK_AUDIO]")) { - return true; - } - // if (text.length < 5 && text.toLowerCase().includes("you")) { // not sure what is this - // return true; - // } - if (text === null) { - return true; - } - return false; + function isValidTranscription(text: string): boolean { + if (!text || text.includes("[BLANK_AUDIO]")) return false; + return true; } - if (transcriptionText && !invalidText(transcriptionText)) { + if (transcriptionText && isValidTranscription(transcriptionText)) { state.transcriptionText += transcriptionText; } @@ -520,8 +505,7 @@ export class VoiceManager extends EventEmitter { channelId, channel, name, - userName, - transcriptionId + userName ); } } catch (error) { @@ -538,8 +522,7 @@ export class VoiceManager extends EventEmitter { channelId: string, channel: BaseGuildVoiceChannel, name: string, - userName: string, - transcriptionId: string + userName: string ) { try { const roomId = stringToUuid(channelId + "-" + this.runtime.agentId); @@ -638,21 +621,19 @@ export class VoiceManager extends 
EventEmitter { responseMemory ); state = await this.runtime.updateRecentMessageState(state); - if (transcriptionId === this.currentTranscriptionId) { - // Ensure that only the latest transcription triggers the Eleven Labs API - // to avoid overlapping audio responses and unnecessary expenses - const responseStream = await this.runtime - .getService(ServiceType.SPEECH_GENERATION) - .getInstance<ISpeechService>() - .generate(this.runtime, content.text); - - if (responseStream) { - await this.playAudioStream( - userId, - responseStream as Readable - ); - } + + const responseStream = await this.runtime + .getService(ServiceType.SPEECH_GENERATION) + .getInstance<ISpeechService>() + .generate(this.runtime, content.text); + + if (responseStream) { + await this.playAudioStream( + userId, + responseStream as Readable + ); } + await this.runtime.evaluate(memory, state); } else { console.warn("Empty response, skipping"); From 7627145ac637f78a7d2399d068441cc91f289cae Mon Sep 17 00:00:00 2001 From: Ting Chien Meng <tcm390@nyu.edu> Date: Sun, 17 Nov 2024 11:20:00 -0500 Subject: [PATCH 10/19] integrate shouldRespond logic for voice chat and moved templates into a templates file for reusability --- packages/client-discord/src/messages.ts | 106 +------------------ packages/client-discord/src/templates.ts | 126 +++++++++++++++++++++++ packages/client-discord/src/voice.ts | 115 +++++++++++++++------ 3 files changed, 216 insertions(+), 131 deletions(-) create mode 100644 packages/client-discord/src/templates.ts diff --git a/packages/client-discord/src/messages.ts b/packages/client-discord/src/messages.ts index d114639ea4d..91542e5a1b3 100644 --- a/packages/client-discord/src/messages.ts +++ b/packages/client-discord/src/messages.ts @@ -37,6 +37,10 @@ import { elizaLogger } from "@ai16z/eliza/src/logger.ts"; import { AttachmentManager } from "./attachments.ts"; import { VoiceManager } from "./voice.ts"; import { Service } from "@ai16z/eliza"; +import { + discordShouldRespondTemplate, + 
discordMessageHandlerTemplate, +} from "./templates.ts"; const MAX_MESSAGE_LENGTH = 1900; async function generateSummary( @@ -88,108 +92,6 @@ export type InterestChannels = { }; }; -const discordShouldRespondTemplate = - `# Task: Decide if {{agentName}} should respond. -About {{agentName}}: -{{bio}} - -# INSTRUCTIONS: Determine if {{agentName}} should respond to the message and participate in the conversation. Do not comment. Just respond with "RESPOND" or "IGNORE" or "STOP". - -# RESPONSE EXAMPLES -<user 1>: I just saw a really great movie -<user 2>: Oh? Which movie? -Result: [IGNORE] - -{{agentName}}: Oh, this is my favorite scene -<user 1>: sick -<user 2>: wait, why is it your favorite scene -Result: [RESPOND] - -<user>: stfu bot -Result: [STOP] - -<user>: Hey {{agent}}, can you help me with something -Result: [RESPOND] - -<user>: {{agentName}} stfu plz -Result: [STOP] - -<user>: i need help -{{agentName}}: how can I help you? -<user>: no. i need help from someone else -Result: [IGNORE] - -<user>: Hey {{agent}}, can I ask you a question -{{agentName}}: Sure, what is it -<user>: can you ask claude to create a basic react module that demonstrates a counter -Result: [RESPOND] - -<user>: {{agentName}} can you tell me a story -<user>: {about a girl named elara -{{agentName}}: Sure. -{{agentName}}: Once upon a time, in a quaint little village, there was a curious girl named Elara. -{{agentName}}: Elara was known for her adventurous spirit and her knack for finding beauty in the mundane. -<user>: I'm loving it, keep going -Result: [RESPOND] - -<user>: {{agentName}} stop responding plz -Result: [STOP] - -<user>: okay, i want to test something. can you say marco? -{{agentName}}: marco -<user>: great. okay, now do it again -Result: [RESPOND] - -Response options are [RESPOND], [IGNORE] and [STOP]. - -{{agentName}} is in a room with other users and is very worried about being annoying and saying too much. 
-Respond with [RESPOND] to messages that are directed at {{agentName}}, or participate in conversations that are interesting or relevant to their background. -If a message is not interesting or relevant, respond with [IGNORE] -Unless directly responding to a user, respond with [IGNORE] to messages that are very short or do not contain much information. -If a user asks {{agentName}} to be quiet, respond with [STOP] -If {{agentName}} concludes a conversation and isn't part of the conversation anymore, respond with [STOP] - -IMPORTANT: {{agentName}} is particularly sensitive about being annoying, so if there is any doubt, it is better to respond with [IGNORE]. -If {{agentName}} is conversing with a user and they have not asked to stop, it is better to respond with [RESPOND]. - -{{recentMessages}} - -# INSTRUCTIONS: Choose the option that best describes {{agentName}}'s response to the last message. Ignore messages if they are addressed to someone else. -` + shouldRespondFooter; - -export const discordMessageHandlerTemplate = - // {{goals}} - `# Action Examples -{{actionExamples}} -(Action examples are for reference only. Do not use the information from them in your response.) - -# Knowledge -{{knowledge}} - -# Task: Generate dialog and actions for the character {{agentName}}. -About {{agentName}}: -{{bio}} -{{lore}} - -Examples of {{agentName}}'s dialog and actions: -{{characterMessageExamples}} - -{{providers}} - -{{attachments}} - -{{actions}} - -# Capabilities -Note that {{agentName}} is capable of reading/seeing/hearing various forms of media, including images, videos, audio, plaintext and PDFs. Recent attachments have been included above under the "Attachments" section. - -{{messageDirections}} - -{{recentMessages}} - -# Instructions: Write the next message for {{agentName}}. Include an action, if appropriate. 
{{actionNames}} -` + messageCompletionFooter; - export async function sendMessageInChunks( channel: TextChannel, content: string, diff --git a/packages/client-discord/src/templates.ts b/packages/client-discord/src/templates.ts new file mode 100644 index 00000000000..18345fc02e0 --- /dev/null +++ b/packages/client-discord/src/templates.ts @@ -0,0 +1,126 @@ +import { + shouldRespondFooter, + messageCompletionFooter, +} from "@ai16z/eliza/src/parsing.ts"; + +export const discordShouldRespondTemplate = + `# Task: Decide if {{agentName}} should respond. +About {{agentName}}: +{{bio}} + +# INSTRUCTIONS: Determine if {{agentName}} should respond to the message and participate in the conversation. Do not comment. Just respond with "RESPOND" or "IGNORE" or "STOP". + +# RESPONSE EXAMPLES +<user 1>: I just saw a really great movie +<user 2>: Oh? Which movie? +Result: [IGNORE] + +{{agentName}}: Oh, this is my favorite scene +<user 1>: sick +<user 2>: wait, why is it your favorite scene +Result: [RESPOND] + +<user>: stfu bot +Result: [STOP] + +<user>: Hey {{agent}}, can you help me with something +Result: [RESPOND] + +<user>: {{agentName}} stfu plz +Result: [STOP] + +<user>: i need help +{{agentName}}: how can I help you? +<user>: no. i need help from someone else +Result: [IGNORE] + +<user>: Hey {{agent}}, can I ask you a question +{{agentName}}: Sure, what is it +<user>: can you ask claude to create a basic react module that demonstrates a counter +Result: [RESPOND] + +<user>: {{agentName}} can you tell me a story +<user>: {about a girl named elara +{{agentName}}: Sure. +{{agentName}}: Once upon a time, in a quaint little village, there was a curious girl named Elara. +{{agentName}}: Elara was known for her adventurous spirit and her knack for finding beauty in the mundane. +<user>: I'm loving it, keep going +Result: [RESPOND] + +<user>: {{agentName}} stop responding plz +Result: [STOP] + +<user>: okay, i want to test something. can you say marco? 
+{{agentName}}: marco +<user>: great. okay, now do it again +Result: [RESPOND] + +Response options are [RESPOND], [IGNORE] and [STOP]. + +{{agentName}} is in a room with other users and is very worried about being annoying and saying too much. +Respond with [RESPOND] to messages that are directed at {{agentName}}, or participate in conversations that are interesting or relevant to their background. +If a message is not interesting or relevant, respond with [IGNORE] +Unless directly responding to a user, respond with [IGNORE] to messages that are very short or do not contain much information. +If a user asks {{agentName}} to be quiet, respond with [STOP] +If {{agentName}} concludes a conversation and isn't part of the conversation anymore, respond with [STOP] + +IMPORTANT: {{agentName}} is particularly sensitive about being annoying, so if there is any doubt, it is better to respond with [IGNORE]. +If {{agentName}} is conversing with a user and they have not asked to stop, it is better to respond with [RESPOND]. + +{{recentMessages}} + +# INSTRUCTIONS: Choose the option that best describes {{agentName}}'s response to the last message. Ignore messages if they are addressed to someone else. +` + shouldRespondFooter; + +export const discordVoiceHandlerTemplate = + `# Task: Generate conversational voice dialog for {{agentName}}. +About {{agentName}}: +{{bio}} + +# Attachments +{{attachments}} + +# Capabilities +Note that {{agentName}} is capable of reading/seeing/hearing various forms of media, including images, videos, audio, plaintext and PDFs. Recent attachments have been included above under the "Attachments" section. + +{{actions}} + +{{messageDirections}} + +{{recentMessages}} + +# Instructions: Write the next message for {{agentName}}. Include an optional action if appropriate. 
{{actionNames}} +` + messageCompletionFooter; + +export const discordMessageHandlerTemplate = + // {{goals}} + `# Action Examples +{{actionExamples}} +(Action examples are for reference only. Do not use the information from them in your response.) + +# Knowledge +{{knowledge}} + +# Task: Generate dialog and actions for the character {{agentName}}. +About {{agentName}}: +{{bio}} +{{lore}} + +Examples of {{agentName}}'s dialog and actions: +{{characterMessageExamples}} + +{{providers}} + +{{attachments}} + +{{actions}} + +# Capabilities +Note that {{agentName}} is capable of reading/seeing/hearing various forms of media, including images, videos, audio, plaintext and PDFs. Recent attachments have been included above under the "Attachments" section. + +{{messageDirections}} + +{{recentMessages}} + +# Instructions: Write the next message for {{agentName}}. Include an action, if appropriate. {{actionNames}} +` + messageCompletionFooter; diff --git a/packages/client-discord/src/voice.ts b/packages/client-discord/src/voice.ts index c6a209fb921..8a77bd3855a 100644 --- a/packages/client-discord/src/voice.ts +++ b/packages/client-discord/src/voice.ts @@ -22,7 +22,10 @@ import EventEmitter from "events"; import prism from "prism-media"; import { Readable, pipeline } from "stream"; import { composeContext } from "@ai16z/eliza/src/context.ts"; -import { generateMessageResponse } from "@ai16z/eliza/src/generation.ts"; +import { + generateMessageResponse, + generateShouldRespond, +} from "@ai16z/eliza/src/generation.ts"; import { embeddingZeroVector } from "@ai16z/eliza/src/memory.ts"; import { Content, @@ -38,6 +41,10 @@ import { UUID, } from "@ai16z/eliza/src/types.ts"; import { stringToUuid } from "@ai16z/eliza/src/uuid.ts"; +import { + discordShouldRespondTemplate, + discordVoiceHandlerTemplate, +} from "./templates.ts"; export function getWavHeader( audioLength: number, @@ -65,28 +72,6 @@ export function getWavHeader( return wavHeader; } -import { messageCompletionFooter } 
from "@ai16z/eliza/src/parsing.ts"; - -const discordVoiceHandlerTemplate = - `# Task: Generate conversational voice dialog for {{agentName}}. -About {{agentName}}: -{{bio}} - -# Attachments -{{attachments}} - -# Capabilities -Note that {{agentName}} is capable of reading/seeing/hearing various forms of media, including images, videos, audio, plaintext and PDFs. Recent attachments have been included above under the "Attachments" section. - -{{actions}} - -{{messageDirections}} - -{{recentMessages}} - -# Instructions: Write the next message for {{agentName}}. Include an optional action if appropriate. {{actionNames}} -` + messageCompletionFooter; - // These values are chosen for compatibility with picovoice components const DECODE_FRAME_SIZE = 1024; const DECODE_SAMPLE_RATE = 16000; @@ -499,7 +484,7 @@ export class VoiceManager extends EventEmitter { this.cleanupAudioPlayer(this.activeAudioPlayer); const finalText = state.transcriptionText; state.transcriptionText = ""; - await this.handleTranscriptionResult( + await this.handleUserMessage( finalText, userId, channelId, @@ -516,8 +501,8 @@ export class VoiceManager extends EventEmitter { } } - private async handleTranscriptionResult( - text: string, + private async handleUserMessage( + message: string, userId: UUID, channelId: string, channel: BaseGuildVoiceChannel, @@ -539,7 +524,7 @@ export class VoiceManager extends EventEmitter { let state = await this.runtime.composeState( { agentId: this.runtime.agentId, - content: { text: text, source: "Discord" }, + content: { text: message, source: "Discord" }, userId: userIdUUID, roomId, }, @@ -550,7 +535,7 @@ export class VoiceManager extends EventEmitter { } ); - if (text && text.startsWith("/")) { + if (message && message.startsWith("/")) { return null; } @@ -558,7 +543,7 @@ export class VoiceManager extends EventEmitter { id: stringToUuid(channelId + "-voice-message-" + Date.now()), agentId: this.runtime.agentId, content: { - text: text, + text: message, source: 
"discord", url: channel.url, }, @@ -582,6 +567,17 @@ export class VoiceManager extends EventEmitter { return { text: "", action: "IGNORE" }; } + const shouldRespond = await this._shouldRespond( + message, + userId, + channel, + state + ); + + if (!shouldRespond) { + return; + } + const context = composeContext({ state, template: @@ -666,6 +662,67 @@ export class VoiceManager extends EventEmitter { } } + private async _shouldRespond( + message: string, + userId: UUID, + channel: BaseGuildVoiceChannel, + state: State + ): Promise<boolean> { + if (userId === this.client.user?.id) return false; + // if (message.author.bot) return false; + const lowerMessage = message.toLowerCase(); + const botName = this.client.user.username.toLowerCase(); + const characterName = this.runtime.character.name.toLowerCase(); + const guild = channel.guild; + const member = guild?.members.cache.get(this.client.user?.id as string); + const nickname = member?.nickname; + + if ( + lowerMessage.includes(botName as string) || + lowerMessage.includes(characterName) || + lowerMessage.includes( + this.client.user?.tag.toLowerCase() as string + ) || + (nickname && lowerMessage.includes(nickname.toLowerCase())) + ) { + return true; + } + + if (!channel.guild) { + return true; + } + + // If none of the above conditions are met, use the generateText to decide + const shouldRespondContext = composeContext({ + state, + template: + this.runtime.character.templates + ?.discordShouldRespondTemplate || + this.runtime.character.templates?.shouldRespondTemplate || + discordShouldRespondTemplate, + }); + + const response = await generateShouldRespond({ + runtime: this.runtime, + context: shouldRespondContext, + modelClass: ModelClass.SMALL, + }); + + if (response === "RESPOND") { + return true; + } else if (response === "IGNORE") { + return false; + } else if (response === "STOP") { + return false; + } else { + console.error( + "Invalid response from response generateText:", + response + ); + return false; + } 
+ } + private async convertOpusToWav(pcmBuffer: Buffer): Promise<Buffer> { try { // Generate the WAV header From b498c11d7de21714a8f550719edf1bfc133a2fbe Mon Sep 17 00:00:00 2001 From: Ting Chien Meng <tcm390@nyu.edu> Date: Sun, 17 Nov 2024 11:26:21 -0500 Subject: [PATCH 11/19] clean code --- packages/client-discord/src/messages.ts | 97 +++++++++---------------- packages/client-discord/src/voice.ts | 1 - 2 files changed, 34 insertions(+), 64 deletions(-) diff --git a/packages/client-discord/src/messages.ts b/packages/client-discord/src/messages.ts index 91542e5a1b3..0595795a1d3 100644 --- a/packages/client-discord/src/messages.ts +++ b/packages/client-discord/src/messages.ts @@ -4,10 +4,6 @@ import { generateShouldRespond, } from "@ai16z/eliza/src/generation.ts"; import { embeddingZeroVector } from "@ai16z/eliza/src/memory.ts"; -import { - messageCompletionFooter, - shouldRespondFooter, -} from "@ai16z/eliza/src/parsing.ts"; import { Content, HandlerCallback, @@ -36,7 +32,6 @@ import { import { elizaLogger } from "@ai16z/eliza/src/logger.ts"; import { AttachmentManager } from "./attachments.ts"; import { VoiceManager } from "./voice.ts"; -import { Service } from "@ai16z/eliza"; import { discordShouldRespondTemplate, discordMessageHandlerTemplate, @@ -403,73 +398,49 @@ export class MessageManager { message.id + "-" + this.runtime.agentId ); } - if (false) { - // For voice channels, use text-to-speech - const audioStream = await this.runtime - .getService(ServiceType.SPEECH_GENERATION) - .getInstance<ISpeechService>() - .generate(this.runtime, content.text); - await this.voiceManager.playAudioStream( - userId, - audioStream - ); + + // For text channels, send the message + const messages = await sendMessageInChunks( + message.channel as TextChannel, + content.text, + message.id, + files + ); + + const memories: Memory[] = []; + for (const m of messages) { + let action = content.action; + // If there's only one message or it's the last message, keep the original 
action + // For multiple messages, set all but the last to 'CONTINUE' + if ( + messages.length > 1 && + m !== messages[messages.length - 1] + ) { + action = "CONTINUE"; + } + const memory: Memory = { id: stringToUuid( - message.id + "-" + this.runtime.agentId + m.id + "-" + this.runtime.agentId ), userId: this.runtime.agentId, agentId: this.runtime.agentId, - content, + content: { + ...content, + action, + inReplyTo: messageId, + url: m.url, + }, roomId, embedding: embeddingZeroVector, + createdAt: m.createdTimestamp, }; - return [memory]; - } else { - // For text channels, send the message - const messages = await sendMessageInChunks( - message.channel as TextChannel, - content.text, - message.id, - files - ); - - const memories: Memory[] = []; - for (const m of messages) { - let action = content.action; - // If there's only one message or it's the last message, keep the original action - // For multiple messages, set all but the last to 'CONTINUE' - if ( - messages.length > 1 && - m !== messages[messages.length - 1] - ) { - action = "CONTINUE"; - } - - const memory: Memory = { - id: stringToUuid( - m.id + "-" + this.runtime.agentId - ), - userId: this.runtime.agentId, - agentId: this.runtime.agentId, - content: { - ...content, - action, - inReplyTo: messageId, - url: m.url, - }, - roomId, - embedding: embeddingZeroVector, - createdAt: m.createdTimestamp, - }; - memories.push(memory); - } - for (const m of memories) { - await this.runtime.messageManager.createMemory( - m - ); - } - return memories; + memories.push(memory); + } + for (const m of memories) { + await this.runtime.messageManager.createMemory(m); } + return memories; } catch (error) { console.error("Error sending message:", error); return []; diff --git a/packages/client-discord/src/voice.ts b/packages/client-discord/src/voice.ts index 8a77bd3855a..d4779c96e5e 100644 --- a/packages/client-discord/src/voice.ts +++ b/packages/client-discord/src/voice.ts @@ -35,7 +35,6 @@ import { ITranscriptionService, 
Memory, ModelClass, - Service, ServiceType, State, UUID, From 282c3d8753a078df7607ae5b8285b675263978e1 Mon Sep 17 00:00:00 2001 From: Ting Chien Meng <tcm390@nyu.edu> Date: Tue, 19 Nov 2024 19:44:16 -0500 Subject: [PATCH 12/19] join specific channel id --- packages/client-discord/src/voice.ts | 48 ++++++++++++++++++++-------- 1 file changed, 34 insertions(+), 14 deletions(-) diff --git a/packages/client-discord/src/voice.ts b/packages/client-discord/src/voice.ts index d4779c96e5e..58592714909 100644 --- a/packages/client-discord/src/voice.ts +++ b/packages/client-discord/src/voice.ts @@ -824,24 +824,44 @@ export class VoiceManager extends EventEmitter { } async scanGuild(guild: Guild) { - const channels = (await guild.channels.fetch()).filter( - (channel) => channel?.type == ChannelType.GuildVoice - ); let chosenChannel: BaseGuildVoiceChannel | null = null; - for (const [, channel] of channels) { - const voiceChannel = channel as BaseGuildVoiceChannel; - if ( - voiceChannel.members.size > 0 && - (chosenChannel === null || - voiceChannel.members.size > chosenChannel.members.size) - ) { - chosenChannel = voiceChannel; + try { + const channelId = this.runtime.getSetting( + "DISCORD_VOICE_CHANNEL_ID" + ) as string; + if (channelId) { + const channel = await guild.channels.fetch(channelId); + if (channel?.isVoiceBased()) { + chosenChannel = channel as BaseGuildVoiceChannel; + } } - } - if (chosenChannel != null) { - this.joinChannel(chosenChannel); + if (!chosenChannel) { + const channels = (await guild.channels.fetch()).filter( + (channel) => channel?.type == ChannelType.GuildVoice + ); + for (const [, channel] of channels) { + const voiceChannel = channel as BaseGuildVoiceChannel; + if ( + voiceChannel.members.size > 0 && + (chosenChannel === null || + voiceChannel.members.size > + chosenChannel.members.size) + ) { + chosenChannel = voiceChannel; + } + } + } + + if (chosenChannel) { + console.log(`Joining channel: ${chosenChannel.name}`); + await 
this.joinChannel(chosenChannel); + } else { + console.warn("No suitable voice channel found to join."); + } + } catch (error) { + console.error("Error selecting or joining a voice channel:", error); } } From 9a154406f163c8dfbe5a78c4edf0f703bcb0ca02 Mon Sep 17 00:00:00 2001 From: Ting Chien Meng <tcm390@nyu.edu> Date: Thu, 21 Nov 2024 12:11:23 -0500 Subject: [PATCH 13/19] clean code --- packages/client-discord/src/messages.ts | 14 +------------- packages/client-discord/src/voice.ts | 3 --- 2 files changed, 1 insertion(+), 16 deletions(-) diff --git a/packages/client-discord/src/messages.ts b/packages/client-discord/src/messages.ts index 0595795a1d3..9e0902c4748 100644 --- a/packages/client-discord/src/messages.ts +++ b/packages/client-discord/src/messages.ts @@ -106,13 +106,6 @@ export async function sendMessageInChunks( content: message.trim(), }; - // if (i === 0 && inReplyTo) { - // // Reply to the specified message for the first chunk - // options.reply = { - // messageReference: inReplyTo, - // }; - // } - if (i === messages.length - 1 && files && files.length > 0) { // Attach files to the last message chunk options.files = files; @@ -235,11 +228,7 @@ export class MessageManager { } async handleMessage(message: DiscordMessage) { - if ( - message.interaction || - message.author.id === - this.client.user?.id /* || message.author?.bot*/ - ) + if (message.interaction || message.author.id === this.client.user?.id) return; const userId = message.author.id as UUID; const userName = message.author.username; @@ -694,7 +683,6 @@ export class MessageManager { state: State ): Promise<boolean> { if (message.author.id === this.client.user?.id) return false; - // if (message.author.bot) return false; if (message.mentions.has(this.client.user?.id as string)) return true; const guild = message.guild; diff --git a/packages/client-discord/src/voice.ts b/packages/client-discord/src/voice.ts index 58592714909..1ceb58784d8 100644 --- a/packages/client-discord/src/voice.ts +++ 
b/packages/client-discord/src/voice.ts @@ -91,7 +91,6 @@ export class AudioMonitor { this.readable = readable; this.maxSize = maxSize; this.readable.on("data", (chunk: Buffer) => { - //console.log('AudioMonitor got data'); if (this.lastFlagged < 0) { this.lastFlagged = this.buffers.length; } @@ -384,7 +383,6 @@ export class VoiceManager extends EventEmitter { async handleGuildCreate(guild: Guild) { console.log(`Joined guild ${guild.name}`); - // this.scanGuild(guild); } async handleUserStream( @@ -668,7 +666,6 @@ export class VoiceManager extends EventEmitter { state: State ): Promise<boolean> { if (userId === this.client.user?.id) return false; - // if (message.author.bot) return false; const lowerMessage = message.toLowerCase(); const botName = this.client.user.username.toLowerCase(); const characterName = this.runtime.character.name.toLowerCase(); From 169fd72cc77a87210154b6e51f3f1abeba49ec26 Mon Sep 17 00:00:00 2001 From: Ting Chien Meng <tcm390@nyu.edu> Date: Thu, 21 Nov 2024 13:40:24 -0500 Subject: [PATCH 14/19] use lodash for debounce --- packages/client-discord/src/voice.ts | 27 +++++++++++++++------------ 1 file changed, 15 insertions(+), 12 deletions(-) diff --git a/packages/client-discord/src/voice.ts b/packages/client-discord/src/voice.ts index 1ceb58784d8..a94fd061069 100644 --- a/packages/client-discord/src/voice.ts +++ b/packages/client-discord/src/voice.ts @@ -45,6 +45,8 @@ import { discordVoiceHandlerTemplate, } from "./templates.ts"; +import debounce from "lodash/debounce.js"; + export function getWavHeader( audioLength: number, sampleRate: number, @@ -405,24 +407,25 @@ export class VoiceManager extends EventEmitter { const state = this.userStates.get(userId); + const DEBOUNCE_TRANSCRIPTION_THRESHOLD = 1500; // wait for 1.5 seconds of silence + + const debouncedProcessTranscription = debounce(async () => { + await this.processTranscription( + userId, + channelId, + channel, + name, + userName + ); + }, DEBOUNCE_TRANSCRIPTION_THRESHOLD); + const 
processBuffer = async (buffer: Buffer) => { try { state!.buffers.push(buffer); state!.totalLength += buffer.length; state!.lastActive = Date.now(); - const DEBOUNCE_TRANSCRIPTION_THRESHOLD = 1500; // wait for 1 seconds of silence - - clearTimeout(state!["debounceTimeout"]); - state!["debounceTimeout"] = setTimeout(async () => { - await this.processTranscription( - userId, - channelId, - channel, - name, - userName - ); - }, DEBOUNCE_TRANSCRIPTION_THRESHOLD); + debouncedProcessTranscription(); } catch (error) { console.error( `Error processing buffer for user ${userId}:`, From 00a64dc571c26ad4e99c8b50f2509b3a0da64164 Mon Sep 17 00:00:00 2001 From: Ting Chien Meng <tcm390@nyu.edu> Date: Thu, 21 Nov 2024 13:59:38 -0500 Subject: [PATCH 15/19] utils file --- packages/client-discord/src/messages.ts | 179 +----------------------- packages/client-discord/src/voice.ts | 28 +--- 2 files changed, 8 insertions(+), 199 deletions(-) diff --git a/packages/client-discord/src/messages.ts b/packages/client-discord/src/messages.ts index 9e0902c4748..8d4c2039bab 100644 --- a/packages/client-discord/src/messages.ts +++ b/packages/client-discord/src/messages.ts @@ -37,179 +37,12 @@ import { discordMessageHandlerTemplate, } from "./templates.ts"; -const MAX_MESSAGE_LENGTH = 1900; -async function generateSummary( - runtime: IAgentRuntime, - text: string -): Promise<{ title: string; description: string }> { - // make sure text is under 128k characters - text = trimTokens(text, 100000, "gpt-4o-mini"); // TODO: clean this up - - const prompt = `Please generate a concise summary for the following text: - - Text: """ - ${text} - """ - - Respond with a JSON object in the following format: - \`\`\`json - { - "title": "Generated Title", - "summary": "Generated summary and/or description of the text" - } - \`\`\``; - - const response = await generateText({ - runtime, - context: prompt, - modelClass: ModelClass.SMALL, - }); - - const parsedResponse = parseJSONObjectFromText(response); - - if 
(parsedResponse) { - return { - title: parsedResponse.title, - description: parsedResponse.summary, - }; - } - - return { - title: "", - description: "", - }; -} - -export type InterestChannels = { - [key: string]: { - lastMessageSent: number; - messages: { userId: UUID; userName: string; content: Content }[]; - }; -}; - -export async function sendMessageInChunks( - channel: TextChannel, - content: string, - inReplyTo: string, - files: any[] -): Promise<DiscordMessage[]> { - const sentMessages: DiscordMessage[] = []; - const messages = splitMessage(content); - try { - for (let i = 0; i < messages.length; i++) { - const message = messages[i]; - if ( - message.trim().length > 0 || - (i === messages.length - 1 && files && files.length > 0) - ) { - const options: any = { - content: message.trim(), - }; - - if (i === messages.length - 1 && files && files.length > 0) { - // Attach files to the last message chunk - options.files = files; - } - - const m = await channel.send(options); - sentMessages.push(m); - } - } - } catch (error) { - elizaLogger.error("Error sending message:", error); - } - - return sentMessages; -} - -function splitMessage(content: string): string[] { - const messages: string[] = []; - let currentMessage = ""; - - const rawLines = content?.split("\n") || []; - // split all lines into MAX_MESSAGE_LENGTH chunks so any long lines are split - const lines = rawLines - .map((line) => { - const chunks = []; - while (line.length > MAX_MESSAGE_LENGTH) { - chunks.push(line.slice(0, MAX_MESSAGE_LENGTH)); - line = line.slice(MAX_MESSAGE_LENGTH); - } - chunks.push(line); - return chunks; - }) - .flat(); - - for (const line of lines) { - if (currentMessage.length + line.length + 1 > MAX_MESSAGE_LENGTH) { - messages.push(currentMessage.trim()); - currentMessage = ""; - } - currentMessage += line + "\n"; - } - - if (currentMessage.trim().length > 0) { - messages.push(currentMessage.trim()); - } - - return messages; -} - -function canSendMessage(channel) { - // if it 
is a DM channel, we can always send messages - if (channel.type === ChannelType.DM) { - return { - canSend: true, - reason: null, - }; - } - const botMember = channel.guild?.members.cache.get(channel.client.user.id); - - if (!botMember) { - return { - canSend: false, - reason: "Not a guild channel or bot member not found", - }; - } - - // Required permissions for sending messages - const requiredPermissions = [ - PermissionsBitField.Flags.ViewChannel, - PermissionsBitField.Flags.SendMessages, - PermissionsBitField.Flags.ReadMessageHistory, - ]; - - // Add thread-specific permission if it's a thread - if (channel instanceof ThreadChannel) { - requiredPermissions.push( - PermissionsBitField.Flags.SendMessagesInThreads - ); - } - - // Check permissions - const permissions = channel.permissionsFor(botMember); - - if (!permissions) { - return { - canSend: false, - reason: "Could not retrieve permissions", - }; - } - - // Check each required permission - const missingPermissions = requiredPermissions.filter( - (perm) => !permissions.has(perm) - ); - - return { - canSend: missingPermissions.length === 0, - missingPermissions: missingPermissions, - reason: - missingPermissions.length > 0 - ? 
`Missing permissions: ${missingPermissions.map((p) => String(p)).join(", ")}` - : null, - }; -} +import { + canSendMessage, + generateSummary, + InterestChannels, + sendMessageInChunks, +} from "./utils.ts"; export class MessageManager { private client: Client; diff --git a/packages/client-discord/src/voice.ts b/packages/client-discord/src/voice.ts index a94fd061069..48d35123901 100644 --- a/packages/client-discord/src/voice.ts +++ b/packages/client-discord/src/voice.ts @@ -45,33 +45,9 @@ import { discordVoiceHandlerTemplate, } from "./templates.ts"; -import debounce from "lodash/debounce.js"; +import { getWavHeader } from "./utils.ts"; -export function getWavHeader( - audioLength: number, - sampleRate: number, - channelCount: number = 1, - bitsPerSample: number = 16 -): Buffer { - const wavHeader = Buffer.alloc(44); - wavHeader.write("RIFF", 0); - wavHeader.writeUInt32LE(36 + audioLength, 4); // Length of entire file in bytes minus 8 - wavHeader.write("WAVE", 8); - wavHeader.write("fmt ", 12); - wavHeader.writeUInt32LE(16, 16); // Length of format data - wavHeader.writeUInt16LE(1, 20); // Type of format (1 is PCM) - wavHeader.writeUInt16LE(channelCount, 22); // Number of channels - wavHeader.writeUInt32LE(sampleRate, 24); // Sample rate - wavHeader.writeUInt32LE( - (sampleRate * bitsPerSample * channelCount) / 8, - 28 - ); // Byte rate - wavHeader.writeUInt16LE((bitsPerSample * channelCount) / 8, 32); // Block align ((BitsPerSample * Channels) / 8) - wavHeader.writeUInt16LE(bitsPerSample, 34); // Bits per sample - wavHeader.write("data", 36); // Data chunk header - wavHeader.writeUInt32LE(audioLength, 40); // Data chunk size - return wavHeader; -} +import debounce from "lodash/debounce.js"; // These values are chosen for compatibility with picovoice components const DECODE_FRAME_SIZE = 1024; From 1f2a7f3f802662bf1c05c3858a246b2361eb6618 Mon Sep 17 00:00:00 2001 From: Ting Chien Meng <tcm390@nyu.edu> Date: Thu, 21 Nov 2024 13:59:52 -0500 Subject: [PATCH 16/19] 
// packages/client-discord/src/utils.ts — new file created by this patch.
// Shared helpers for the Discord client: WAV header construction, text
// summarisation, channel permission checks and chunked message sending.

import {
    Content,
    IAgentRuntime,
    ModelClass,
    UUID,
} from "@ai16z/eliza/src/types.ts";
import { generateText, trimTokens } from "@ai16z/eliza/src/generation.ts";
import { parseJSONObjectFromText } from "@ai16z/eliza/src/parsing.ts";
import {
    ChannelType,
    Message as DiscordMessage,
    PermissionsBitField,
    TextChannel,
    ThreadChannel,
} from "discord.js";
import { elizaLogger } from "@ai16z/eliza/src/logger.ts";

/**
 * Builds the 44-byte RIFF/WAVE header that precedes a PCM audio payload.
 *
 * Only the header is produced; the caller is responsible for appending the
 * audio bytes themselves.
 *
 * @param audioLength - Size in bytes of the audio data that follows the header.
 * @param sampleRate - Samples per second.
 * @param channelCount - Number of audio channels (defaults to 1).
 * @param bitsPerSample - Bits per sample (defaults to 16).
 * @returns A 44-byte buffer holding the header fields in little-endian order.
 */
export function getWavHeader(
    audioLength: number,
    sampleRate: number,
    channelCount: number = 1,
    bitsPerSample: number = 16
): Buffer {
    const wavHeader = Buffer.alloc(44);
    wavHeader.write("RIFF", 0);
    wavHeader.writeUInt32LE(36 + audioLength, 4); // Length of entire file in bytes minus 8
    wavHeader.write("WAVE", 8);
    wavHeader.write("fmt ", 12);
    wavHeader.writeUInt32LE(16, 16); // Length of format data
    wavHeader.writeUInt16LE(1, 20); // Type of format (1 is PCM)
    wavHeader.writeUInt16LE(channelCount, 22); // Number of channels
    wavHeader.writeUInt32LE(sampleRate, 24); // Sample rate
    wavHeader.writeUInt32LE(
        (sampleRate * bitsPerSample * channelCount) / 8,
        28
    ); // Byte rate
    wavHeader.writeUInt16LE((bitsPerSample * channelCount) / 8, 32); // Block align ((BitsPerSample * Channels) / 8)
    wavHeader.writeUInt16LE(bitsPerSample, 34); // Bits per sample
    wavHeader.write("data", 36); // Data chunk header
    wavHeader.writeUInt32LE(audioLength, 40); // Data chunk size
    return wavHeader;
}

/**
 * Asks the small model class for a title and summary of `text`.
 *
 * @param runtime - Agent runtime forwarded to `generateText`.
 * @param text - Text to summarise; it is trimmed before being embedded in the prompt.
 * @returns The generated title and description. Both are empty strings when
 *          the model response cannot be parsed, and a field missing from the
 *          parsed response also falls back to an empty string.
 */
export async function generateSummary(
    runtime: IAgentRuntime,
    text: string
): Promise<{ title: string; description: string }> {
    // Cap the input via trimTokens at 100000 units for "gpt-4o-mini"
    // (presumably tokens, going by the helper's name — not characters).
    text = trimTokens(text, 100000, "gpt-4o-mini"); // TODO: clean this up

    const prompt = `Please generate a concise summary for the following text:

  Text: """
  ${text}
  """

  Respond with a JSON object in the following format:
  \`\`\`json
  {
    "title": "Generated Title",
    "summary": "Generated summary and/or description of the text"
  }
  \`\`\``;

    const response = await generateText({
        runtime,
        context: prompt,
        modelClass: ModelClass.SMALL,
    });

    const parsedResponse = parseJSONObjectFromText(response);

    if (parsedResponse) {
        return {
            // The declared return type promises strings, so never leak undefined.
            title: parsedResponse.title ?? "",
            description: parsedResponse.summary ?? "",
        };
    }

    return {
        title: "",
        description: "",
    };
}

/**
 * Per-channel bookkeeping keyed by a string: when the last message was sent
 * and the messages recorded for that channel.
 */
export type InterestChannels = {
    [key: string]: {
        lastMessageSent: number;
        messages: { userId: UUID; userName: string; content: Content }[];
    };
};

/** Outcome of {@link canSendMessage}. */
type SendPermissionCheck = {
    canSend: boolean;
    reason: string | null;
    missingPermissions?: bigint[];
};

/**
 * Checks whether the bot has the permissions needed to post in `channel`.
 *
 * DM channels are always allowed. For other channels the bot's guild member
 * must be present in the member cache and hold ViewChannel, SendMessages and
 * ReadMessageHistory, plus SendMessagesInThreads when the channel is a thread.
 *
 * @param channel - The Discord channel to check.
 * @returns `canSend` plus a human-readable `reason` when sending is not
 *          possible; `missingPermissions` is included once permissions were
 *          actually evaluated.
 */
export function canSendMessage(channel): SendPermissionCheck {
    // if it is a DM channel, we can always send messages
    if (channel.type === ChannelType.DM) {
        return {
            canSend: true,
            reason: null,
        };
    }

    // Guard against an unset client.user instead of throwing; a missing id
    // simply yields no cached member and falls into the branch below.
    const botMember = channel.guild?.members.cache.get(
        channel.client.user?.id
    );

    if (!botMember) {
        return {
            canSend: false,
            reason: "Not a guild channel or bot member not found",
        };
    }

    // Required permissions for sending messages
    const requiredPermissions = [
        PermissionsBitField.Flags.ViewChannel,
        PermissionsBitField.Flags.SendMessages,
        PermissionsBitField.Flags.ReadMessageHistory,
    ];

    // Add thread-specific permission if it's a thread
    if (channel instanceof ThreadChannel) {
        requiredPermissions.push(
            PermissionsBitField.Flags.SendMessagesInThreads
        );
    }

    const permissions = channel.permissionsFor(botMember);

    if (!permissions) {
        return {
            canSend: false,
            reason: "Could not retrieve permissions",
        };
    }

    const missingPermissions = requiredPermissions.filter(
        (perm) => !permissions.has(perm)
    );

    return {
        canSend: missingPermissions.length === 0,
        missingPermissions: missingPermissions,
        reason:
            missingPermissions.length > 0
                ? // Report flag names rather than the raw bigint bit values.
                  `Missing permissions: ${new PermissionsBitField(missingPermissions).toArray().join(", ")}`
                : null,
    };
}

// Maximum number of characters placed in a single outgoing chunk.
// NOTE(review): presumably chosen to stay under Discord's per-message
// content limit — confirm before changing.
const MAX_MESSAGE_LENGTH = 1900;

/**
 * Sends `content` to `channel` as one or more messages, each at most
 * MAX_MESSAGE_LENGTH characters, attaching `files` to the final message.
 *
 * Sending is best-effort: the first failure is logged and ends the loop, and
 * the messages sent up to that point are still returned.
 *
 * @param channel - Channel to send to.
 * @param content - Full text to send; may be empty when only files are sent.
 * @param inReplyTo - Currently not read by this function; kept so existing
 *                    callers keep working.
 * @param files - Attachments for the last chunk; may be empty.
 * @returns The messages that were successfully sent, in order.
 */
export async function sendMessageInChunks(
    channel: TextChannel,
    content: string,
    inReplyTo: string,
    files: any[]
): Promise<DiscordMessage[]> {
    const sentMessages: DiscordMessage[] = [];
    const hasFiles = (files?.length ?? 0) > 0;
    const messages = splitMessage(content);

    // With no text at all there would be no chunk to carry the attachments,
    // and they would be dropped silently; send one empty-content chunk instead.
    if (messages.length === 0 && hasFiles) {
        messages.push("");
    }

    try {
        for (let i = 0; i < messages.length; i++) {
            const message = messages[i].trim();
            const attachFiles = hasFiles && i === messages.length - 1;

            // Skip blank chunks unless they have to carry the attachments.
            if (message.length === 0 && !attachFiles) {
                continue;
            }

            const options: { content: string; files?: any[] } = {
                content: message,
            };

            if (attachFiles) {
                // Attach files to the last message chunk
                options.files = files;
            }

            const m = await channel.send(options);
            sentMessages.push(m);
        }
    } catch (error) {
        elizaLogger.error("Error sending message:", error);
    }

    return sentMessages;
}

/**
 * Splits `content` into trimmed chunks of at most MAX_MESSAGE_LENGTH
 * characters, breaking on newlines where possible and hard-wrapping any
 * single line that is itself too long. Never returns empty chunks.
 *
 * @param content - Text to split; a nullish value yields an empty array.
 * @returns The non-empty chunks in order.
 */
function splitMessage(content: string): string[] {
    const messages: string[] = [];
    let currentMessage = "";

    // split all lines into MAX_MESSAGE_LENGTH chunks so any long lines are split
    const lines = (content?.split("\n") ?? []).flatMap((line) => {
        const pieces: string[] = [];
        let rest = line;
        while (rest.length > MAX_MESSAGE_LENGTH) {
            pieces.push(rest.slice(0, MAX_MESSAGE_LENGTH));
            rest = rest.slice(MAX_MESSAGE_LENGTH);
        }
        pieces.push(rest);
        return pieces;
    });

    for (const line of lines) {
        if (currentMessage.length + line.length + 1 > MAX_MESSAGE_LENGTH) {
            // Flush only a non-empty buffer: a piece that is already at the
            // limit would otherwise push an empty chunk ahead of itself.
            if (currentMessage.trim().length > 0) {
                messages.push(currentMessage.trim());
            }
            currentMessage = "";
        }
        currentMessage += line + "\n";
    }

    if (currentMessage.trim().length > 0) {
        messages.push(currentMessage.trim());
    }

    return messages;
}

// From 5c863f1243962c6edf9c0ecd851a9bddea835950 Mon Sep 17 00:00:00
// (mbox separator of the next patch in this series; its header continues on the following line)
2001 From: Ting Chien Meng <tcm390@nyu.edu> Date: Thu, 21 Nov 2024 14:03:50 -0500 Subject: [PATCH 17/19] clean code --- packages/client-discord/src/messages.ts | 4 ---- 1 file changed, 4 deletions(-) diff --git a/packages/client-discord/src/messages.ts b/packages/client-discord/src/messages.ts index 8d4c2039bab..144c360f91d 100644 --- a/packages/client-discord/src/messages.ts +++ b/packages/client-discord/src/messages.ts @@ -19,15 +19,11 @@ import { UUID, } from "@ai16z/eliza/src/types.ts"; import { stringToUuid } from "@ai16z/eliza/src/uuid.ts"; -import { generateText, trimTokens } from "@ai16z/eliza/src/generation.ts"; -import { parseJSONObjectFromText } from "@ai16z/eliza/src/parsing.ts"; import { ChannelType, Client, Message as DiscordMessage, - PermissionsBitField, TextChannel, - ThreadChannel, } from "discord.js"; import { elizaLogger } from "@ai16z/eliza/src/logger.ts"; import { AttachmentManager } from "./attachments.ts"; From 107b885c27694d5f33c15e8db60871ef12401043 Mon Sep 17 00:00:00 2001 From: Ting Chien Meng <tcm390@nyu.edu> Date: Thu, 21 Nov 2024 14:06:20 -0500 Subject: [PATCH 18/19] moved type from utils --- packages/client-discord/src/messages.ts | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/packages/client-discord/src/messages.ts b/packages/client-discord/src/messages.ts index 144c360f91d..3400d06f8ea 100644 --- a/packages/client-discord/src/messages.ts +++ b/packages/client-discord/src/messages.ts @@ -36,10 +36,16 @@ import { import { canSendMessage, generateSummary, - InterestChannels, sendMessageInChunks, } from "./utils.ts"; +type InterestChannels = { + [key: string]: { + lastMessageSent: number; + messages: { userId: UUID; userName: string; content: Content }[]; + }; +}; + export class MessageManager { private client: Client; private runtime: IAgentRuntime; From c821a6da3effbc1fed93b3ab2cc8f95ffd8c5e29 Mon Sep 17 00:00:00 2001 From: Ting Chien Meng <tcm390@nyu.edu> Date: Thu, 21 Nov 2024 14:06:29 -0500 Subject: 
[PATCH 19/19] moved type from utils --- packages/client-discord/src/utils.ts | 7 ------- 1 file changed, 7 deletions(-) diff --git a/packages/client-discord/src/utils.ts b/packages/client-discord/src/utils.ts index a440fb98560..6b6adf6e70c 100644 --- a/packages/client-discord/src/utils.ts +++ b/packages/client-discord/src/utils.ts @@ -83,13 +83,6 @@ export async function generateSummary( }; } -export type InterestChannels = { - [key: string]: { - lastMessageSent: number; - messages: { userId: UUID; userName: string; content: Content }[]; - }; -}; - export function canSendMessage(channel) { // if it is a DM channel, we can always send messages if (channel.type === ChannelType.DM) {