From 2976bf4ec8c8ef2ce2db8512a500a90f7557d57b Mon Sep 17 00:00:00 2001
From: Ting Chien Meng <tcm390@nyu.edu>
Date: Fri, 15 Nov 2024 12:22:22 -0500
Subject: [PATCH 01/19] fix: ensure unique instances for each Service subclass
 using Map

---
 packages/core/src/types.ts | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/packages/core/src/types.ts b/packages/core/src/types.ts
index 040b743a68c..88f0bcdfd98 100644
--- a/packages/core/src/types.ts
+++ b/packages/core/src/types.ts
@@ -503,15 +503,17 @@ export interface IMemoryManager {
 }
 
 export abstract class Service {
-    private static instance: Service | null = null;
+    private static instances: Map<any, Service> = new Map();
     static serviceType: ServiceType;
 
     public static getInstance<T extends Service>(): T {
-        if (!Service.instance) {
-            // Use this.prototype.constructor to instantiate the concrete class
-            Service.instance = new (this as any)();
+        if (!Service.instances.has(this)) {
+            Service.instances.set(
+                this,
+                new (this as unknown as { new (): T })()
+            );
         }
-        return Service.instance as T;
+        return Service.instances.get(this) as T;
     }
 }
 

From 1ad97e33af4aef7a8d861749f10efc79b42ecc7d Mon Sep 17 00:00:00 2001
From: Ting Chien Meng <tcm390@nyu.edu>
Date: Fri, 15 Nov 2024 16:35:50 -0500
Subject: [PATCH 02/19] reply with a text message if the user types something

---
 packages/client-discord/src/messages.ts | 27 ++++++++++++-------------
 1 file changed, 13 insertions(+), 14 deletions(-)

diff --git a/packages/client-discord/src/messages.ts b/packages/client-discord/src/messages.ts
index 4d233f658de..d114639ea4d 100644
--- a/packages/client-discord/src/messages.ts
+++ b/packages/client-discord/src/messages.ts
@@ -341,7 +341,7 @@ export class MessageManager {
         if (
             message.interaction ||
             message.author.id ===
-            this.client.user?.id /* || message.author?.bot*/
+                this.client.user?.id /* || message.author?.bot*/
         )
             return;
         const userId = message.author.id as UUID;
@@ -389,10 +389,10 @@ export class MessageManager {
                 url: message.url,
                 inReplyTo: message.reference?.messageId
                     ? stringToUuid(
-                        message.reference.messageId +
-                        "-" +
-                        this.runtime.agentId
-                    )
+                          message.reference.messageId +
+                              "-" +
+                              this.runtime.agentId
+                      )
                     : undefined,
             };
 
@@ -501,13 +501,11 @@ export class MessageManager {
                                 message.id + "-" + this.runtime.agentId
                             );
                         }
-                        if (message.channel.type === ChannelType.GuildVoice) {
+                        if (false) {
                             // For voice channels, use text-to-speech
-                            const audioStream = await (
-                                this.runtime.getService(
-                                    ServiceType.SPEECH_GENERATION
-                                )
-                            ).getInstance<ISpeechService>()
+                            const audioStream = await this.runtime
+                                .getService(ServiceType.SPEECH_GENERATION)
+                                .getInstance<ISpeechService>()
                                 .generate(this.runtime, content.text);
                             await this.voiceManager.playAudioStream(
                                 userId,
@@ -659,14 +657,15 @@ export class MessageManager {
 
         for (const url of urls) {
             if (
-                this.runtime.getService(ServiceType.VIDEO)
+                this.runtime
+                    .getService(ServiceType.VIDEO)
                     .getInstance<IVideoService>()
                     .isVideoUrl(url)
             ) {
-                const videoInfo = await (this.runtime
+                const videoInfo = await this.runtime
                     .getService(ServiceType.VIDEO)
                     .getInstance<IVideoService>()
-                    .processVideo(url));
+                    .processVideo(url);
                 attachments.push({
                     id: `youtube-${Date.now()}`,
                     url: url,

From 997dc42dfb559e9bad364beafcade9378dfa246f Mon Sep 17 00:00:00 2001
From: Ting Chien Meng <tcm390@nyu.edu>
Date: Fri, 15 Nov 2024 16:37:34 -0500
Subject: [PATCH 03/19] refactor stream handling: add debounce for
 transcription processing

---
 packages/client-discord/src/voice.ts | 503 +++++++++++++++------------
 1 file changed, 280 insertions(+), 223 deletions(-)

diff --git a/packages/client-discord/src/voice.ts b/packages/client-discord/src/voice.ts
index 744a8106b8e..c3857ea0b49 100644
--- a/packages/client-discord/src/voice.ts
+++ b/packages/client-discord/src/voice.ts
@@ -1,4 +1,5 @@
 import {
+    AudioPlayer,
     AudioReceiveStream,
     NoSubscriberBehavior,
     StreamType,
@@ -175,6 +176,17 @@ export class AudioMonitor {
 }
 
 export class VoiceManager extends EventEmitter {
+    private userStates: Map<
+        string,
+        {
+            buffers: Buffer[];
+            totalLength: number;
+            lastActive: number;
+            transcriptionText: string;
+        }
+    > = new Map();
+    private activeAudioPlayer: AudioPlayer | null = null;
+    private speaking: boolean = false;
     private client: Client;
     private runtime: IAgentRuntime;
     private streams: Map<string, Readable> = new Map();
@@ -236,14 +248,14 @@ export class VoiceManager extends EventEmitter {
         });
 
         for (const [, member] of channel.members) {
-            if (!member.user.bot) {
+            if (member && !member.user.bot) {
                 this.monitorMember(member, channel);
             }
         }
 
         connection.receiver.speaking.on("start", (userId: string) => {
             const user = channel.members.get(userId);
-            if (!user?.user.bot) {
+            if (user && !user?.user.bot) {
                 this.monitorMember(user as GuildMember, channel);
                 this.streams.get(userId)?.emit("speakingStarted");
             }
@@ -361,237 +373,266 @@ export class VoiceManager extends EventEmitter {
         channel: BaseGuildVoiceChannel,
         audioStream: Readable
     ) {
+        console.log(`Starting audio monitor for user: ${userId}`);
         const channelId = channel.id;
-        const buffers: Buffer[] = [];
-        let totalLength = 0;
-        const maxSilenceTime = 1000; // Maximum pause duration in milliseconds
-        const minSilenceTime = 50; // Minimum silence duration to trigger transcription
-        let lastChunkTime = Date.now();
-        let transcriptionStarted = false;
-        let transcriptionText = "";
-        console.log("new audio monitor for: ", userId);
+        if (!this.userStates.has(userId)) {
+            this.userStates.set(userId, {
+                buffers: [],
+                totalLength: 0,
+                lastActive: Date.now(),
+                transcriptionText: "",
+            });
+        }
+
+        const state = this.userStates.get(userId);
+
+        const processBuffer = async (buffer: Buffer) => {
+            try {
+                state!.buffers.push(buffer);
+                state!.totalLength += buffer.length;
+                state!.lastActive = Date.now();
+
+                const DEBOUNCE_TRANSCRIPTION_THRESHOLD = 3000; // wait for 3 seconds of silence
+
+                clearTimeout(state!["debounceTimeout"]);
+                state!["debounceTimeout"] = setTimeout(async () => {
+                    await this.processTranscription(
+                        userId,
+                        channelId,
+                        channel,
+                        name,
+                        userName
+                    );
+                }, DEBOUNCE_TRANSCRIPTION_THRESHOLD);
+            } catch (error) {
+                console.error(
+                    `Error processing buffer for user ${userId}:`,
+                    error
+                );
+            }
+        };
 
         const monitor = new AudioMonitor(
             audioStream,
             10000000,
             async (buffer) => {
-                console.log("buffer: ", buffer);
-                const currentTime = Date.now();
-                const silenceDuration = currentTime - lastChunkTime;
                 if (!buffer) {
-                    // Handle error
-                    console.error("Empty buffer received");
+                    console.error("Received empty buffer");
                     return;
                 }
-                buffers.push(buffer);
-                totalLength += buffer.length;
-                lastChunkTime = currentTime;
-
-                if (silenceDuration > minSilenceTime && !transcriptionStarted) {
-                    transcriptionStarted = true;
-                    const inputBuffer = Buffer.concat(buffers, totalLength);
-                    buffers.length = 0;
-                    totalLength = 0;
-
-                    try {
-                        // Convert Opus to WAV and add the header
-                        const wavBuffer =
-                            await this.convertOpusToWav(inputBuffer);
-
-                        console.log("starting transcription");
-                        const text = await this.runtime
-                            .getService(ServiceType.TRANSCRIPTION)
-                            .getInstance<ITranscriptionService>()
-                            .transcribe(wavBuffer);
-                        console.log("transcribed text: ", text);
-                        transcriptionText += text;
-                    } catch (error) {
-                        console.error("Error processing audio stream:", error);
-                    }
+                await processBuffer(buffer);
+            }
+        );
+    }
+
+    private async processTranscription(
+        userId: UUID,
+        channelId: string,
+        channel: BaseGuildVoiceChannel,
+        name: string,
+        userName: string
+    ) {
+        const state = this.userStates.get(userId);
+        if (!state || state.buffers.length === 0) return;
+
+        try {
+            const inputBuffer = Buffer.concat(state.buffers, state.totalLength);
+            state.buffers.length = 0; // Clear the buffers
+            state.totalLength = 0;
+
+            // Convert Opus to WAV
+            const wavBuffer = await this.convertOpusToWav(inputBuffer);
+
+            console.log("Starting transcription...");
+
+            const transcriptionText = await this.runtime
+                .getService(ServiceType.TRANSCRIPTION)
+                .getInstance<ITranscriptionService>()
+                .transcribe(wavBuffer);
+
+            function invalidText(text: string): boolean {
+                if (text.includes("[BLANK_AUDIO]")) {
+                    return true;
+                }
+                // if (text.length < 5 && text.toLowerCase().includes("you")) { // not sure what is this
+                //     return true;
+                // }
+                if (text === null) {
+                    return true;
                 }
+                return false;
+            }
+
+            if (transcriptionText && !invalidText(transcriptionText)) {
+                state.transcriptionText += transcriptionText;
+            }
+
+            if (state.transcriptionText.length) {
+                const finalText = state.transcriptionText;
+                state.transcriptionText = "";
+                await this.handleTranscriptionResult(
+                    finalText,
+                    userId,
+                    channelId,
+                    channel,
+                    name,
+                    userName
+                );
+            }
+        } catch (error) {
+            console.error(
+                `Error transcribing audio for user ${userId}:`,
+                error
+            );
+        }
+    }
+
+    private async handleTranscriptionResult(
+        text: string,
+        userId: UUID,
+        channelId: string,
+        channel: BaseGuildVoiceChannel,
+        name: string,
+        userName: string
+    ) {
+        try {
+            const roomId = stringToUuid(channelId + "-" + this.runtime.agentId);
+            const userIdUUID = stringToUuid(userId);
+
+            await this.runtime.ensureConnection(
+                userIdUUID,
+                roomId,
+                userName,
+                name,
+                "discord"
+            );
 
-                if (silenceDuration > maxSilenceTime && transcriptionStarted) {
-                    console.log("transcription finished");
-                    transcriptionStarted = false;
+            let state = await this.runtime.composeState(
+                {
+                    agentId: this.runtime.agentId,
+                    content: { text: text, source: "Discord" },
+                    userId: userIdUUID,
+                    roomId,
+                },
+                {
+                    discordChannel: channel,
+                    discordClient: this.client,
+                    agentName: this.runtime.character.name,
+                }
+            );
 
-                    if (!transcriptionText) return;
+            if (text && text.startsWith("/")) {
+                return null;
+            }
 
-                    try {
-                        const text = transcriptionText;
+            const memory = {
+                id: stringToUuid(channelId + "-voice-message-" + Date.now()),
+                agentId: this.runtime.agentId,
+                content: {
+                    text: text,
+                    source: "discord",
+                    url: channel.url,
+                },
+                userId: userIdUUID,
+                roomId,
+                embedding: embeddingZeroVector,
+                createdAt: Date.now(),
+            };
+
+            if (!memory.content.text) {
+                return { text: "", action: "IGNORE" };
+            }
 
-                        // handle whisper cases
-                        if (
-                            (text.length < 15 &&
-                                text.includes("[BLANK_AUDIO]")) ||
-                            (text.length < 5 &&
-                                text.toLowerCase().includes("you"))
-                        ) {
-                            transcriptionText = ""; // Reset transcription text
-                            return;
-                        }
+            await this.runtime.messageManager.createMemory(memory);
 
-                        const roomId = stringToUuid(
-                            channelId + "-" + this.runtime.agentId
-                        );
-                        const userIdUUID = stringToUuid(userId);
-
-                        await this.runtime.ensureConnection(
-                            userIdUUID,
-                            roomId,
-                            userName,
-                            name,
-                            "discord"
-                        );
+            state = await this.runtime.updateRecentMessageState(state);
 
-                        let state = await this.runtime.composeState(
-                            {
-                                agentId: this.runtime.agentId,
-                                content: { text: text, source: "Discord" },
-                                userId: userIdUUID,
-                                roomId,
-                            },
-                            {
-                                discordChannel: channel,
-                                discordClient: this.client,
-                                agentName: this.runtime.character.name,
-                            }
-                        );
+            const shouldIgnore = await this._shouldIgnore(memory);
 
-                        if (text && text.startsWith("/")) {
-                            transcriptionText = ""; // Reset transcription text
-                            return null;
-                        }
-
-                        const memory = {
-                            id: stringToUuid(
-                                channelId + "-voice-message-" + Date.now()
-                            ),
-                            agentId: this.runtime.agentId,
-                            content: {
-                                text: text,
-                                source: "discord",
-                                url: channel.url,
-                            },
-                            userId: userIdUUID,
-                            roomId,
-                            embedding: embeddingZeroVector,
-                            createdAt: Date.now(),
-                        };
-
-                        if (!memory.content.text) {
-                            transcriptionText = ""; // Reset transcription text
-                            return { text: "", action: "IGNORE" };
-                        }
-
-                        await this.runtime.messageManager.createMemory(memory);
-
-                        state =
-                            await this.runtime.updateRecentMessageState(state);
-
-                        const shouldIgnore = await this._shouldIgnore(memory);
-
-                        if (shouldIgnore) {
-                            transcriptionText = ""; // Reset transcription text
-                            return { text: "", action: "IGNORE" };
-                        }
-
-                        const context = composeContext({
-                            state,
-                            template:
-                                this.runtime.character.templates
-                                    ?.discordVoiceHandlerTemplate ||
-                                this.runtime.character.templates
-                                    ?.messageHandlerTemplate ||
-                                discordVoiceHandlerTemplate,
-                        });
-
-                        const responseContent = await this._generateResponse(
-                            memory,
-                            state,
-                            context
-                        );
+            if (shouldIgnore) {
+                return { text: "", action: "IGNORE" };
+            }
 
-                        const callback: HandlerCallback = async (
-                            content: Content
-                        ) => {
-                            console.log("callback content: ", content);
-                            const { roomId } = memory;
-
-                            const responseMemory: Memory = {
-                                id: stringToUuid(
-                                    memory.id + "-voice-response-" + Date.now()
-                                ),
-                                agentId: this.runtime.agentId,
-                                userId: this.runtime.agentId,
-                                content: {
-                                    ...content,
-                                    user: this.runtime.character.name,
-                                    inReplyTo: memory.id,
-                                },
-                                roomId,
-                                embedding: embeddingZeroVector,
-                            };
-
-                            if (responseMemory.content.text?.trim()) {
-                                await this.runtime.messageManager.createMemory(
-                                    responseMemory
-                                );
-                                state =
-                                    await this.runtime.updateRecentMessageState(
-                                        state
-                                    );
-                                const responseStream = await this.runtime
-                                    .getService(ServiceType.SPEECH_GENERATION)
-                                    .getInstance<ISpeechService>()
-                                    .generate(this.runtime, content.text);
-
-                                if (responseStream) {
-                                    await this.playAudioStream(
-                                        userId,
-                                        responseStream as Readable
-                                    );
-                                }
-                                await this.runtime.evaluate(memory, state);
-                            } else {
-                                console.warn("Empty response, skipping");
-                            }
-                            return [responseMemory];
-                        };
-
-                        const responseMemories =
-                            await callback(responseContent);
-
-                        const response = responseContent;
-
-                        const content = (response.responseMessage ||
-                            response.content ||
-                            response.message) as string;
-
-                        if (!content) {
-                            transcriptionText = ""; // Reset transcription text
-                            return null;
-                        }
-
-                        console.log("responseMemories: ", responseMemories);
-
-                        await this.runtime.processActions(
-                            memory,
-                            responseMemories,
-                            state,
-                            callback
-                        );
+            const context = composeContext({
+                state,
+                template:
+                    this.runtime.character.templates
+                        ?.discordVoiceHandlerTemplate ||
+                    this.runtime.character.templates?.messageHandlerTemplate ||
+                    discordVoiceHandlerTemplate,
+            });
+
+            const responseContent = await this._generateResponse(
+                memory,
+                state,
+                context
+            );
 
-                        transcriptionText = ""; // Reset transcription text
-                    } catch (error) {
-                        console.error(
-                            "Error processing transcribed text:",
-                            error
+            const callback: HandlerCallback = async (content: Content) => {
+                console.log("callback content: ", content);
+                const { roomId } = memory;
+
+                const responseMemory: Memory = {
+                    id: stringToUuid(
+                        memory.id + "-voice-response-" + Date.now()
+                    ),
+                    agentId: this.runtime.agentId,
+                    userId: this.runtime.agentId,
+                    content: {
+                        ...content,
+                        user: this.runtime.character.name,
+                        inReplyTo: memory.id,
+                    },
+                    roomId,
+                    embedding: embeddingZeroVector,
+                };
+
+                if (responseMemory.content.text?.trim()) {
+                    await this.runtime.messageManager.createMemory(
+                        responseMemory
+                    );
+                    state = await this.runtime.updateRecentMessageState(state);
+                    const responseStream = await this.runtime
+                        .getService(ServiceType.SPEECH_GENERATION)
+                        .getInstance<ISpeechService>()
+                        .generate(this.runtime, content.text);
+
+                    if (responseStream) {
+                        await this.playAudioStream(
+                            userId,
+                            responseStream as Readable
                         );
-                        transcriptionText = ""; // Reset transcription text
                     }
+                    await this.runtime.evaluate(memory, state);
+                } else {
+                    console.warn("Empty response, skipping");
                 }
+                return [responseMemory];
+            };
+
+            const responseMemories = await callback(responseContent);
+
+            const response = responseContent;
+
+            const content = (response.responseMessage ||
+                response.content ||
+                response.message) as string;
+
+            if (!content) {
+                return null;
             }
-        );
+
+            console.log("responseMemories: ", responseMemories);
+
+            await this.runtime.processActions(
+                memory,
+                responseMemories,
+                state,
+                callback
+            );
+        } catch (error) {
+            console.error("Error processing transcribed text:", error);
+        }
     }
 
     private async convertOpusToWav(pcmBuffer: Buffer): Promise<Buffer> {
@@ -723,11 +764,15 @@ export class VoiceManager extends EventEmitter {
             console.log(`No connection for user ${userId}`);
             return;
         }
+
+        this.cleanupAudioPlayer(this.activeAudioPlayer);
+
         const audioPlayer = createAudioPlayer({
             behaviors: {
                 noSubscriber: NoSubscriberBehavior.Pause,
             },
         });
+        this.activeAudioPlayer = audioPlayer;
         connection.subscribe(audioPlayer);
 
         const audioStartTime = Date.now();
@@ -737,21 +782,33 @@ export class VoiceManager extends EventEmitter {
         });
         audioPlayer.play(resource);
 
-        audioPlayer.on("error", (err: any) => {
+        const handleError = (err: any) => {
             console.log(`Audio player error: ${err}`);
-        });
-
-        audioPlayer.on(
-            "stateChange",
-            (oldState: any, newState: { status: string }) => {
-                if (newState.status == "idle") {
-                    const idleTime = Date.now();
-                    console.log(
-                        `Audio playback took: ${idleTime - audioStartTime}ms`
-                    );
-                }
+        };
+        const handleStateChange = (
+            oldState: any,
+            newState: { status: string }
+        ) => {
+            if (newState.status == "idle") {
+                const idleTime = Date.now();
+                console.log(
+                    `Audio playback took: ${idleTime - audioStartTime}ms`
+                );
             }
-        );
+        };
+
+        audioPlayer.on("stateChange", handleStateChange);
+        audioPlayer.on("error", handleError);
+    }
+
+    cleanupAudioPlayer(audioPlayer: AudioPlayer) {
+        if (!audioPlayer) return;
+
+        audioPlayer.stop();
+        audioPlayer.removeAllListeners();
+        if (audioPlayer === this.activeAudioPlayer) {
+            this.activeAudioPlayer = null;
+        }
     }
 
     async handleJoinChannelCommand(interaction: any) {

From e090654da4008fcafb3a996fd2820077ab6e53ad Mon Sep 17 00:00:00 2001
From: Ting Chien Meng <tcm390@nyu.edu>
Date: Fri, 15 Nov 2024 17:24:29 -0500
Subject: [PATCH 04/19] ensure accurate member retrieval in voice channel
 events

---
 packages/client-discord/src/voice.ts | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/packages/client-discord/src/voice.ts b/packages/client-discord/src/voice.ts
index c3857ea0b49..12be7d7be88 100644
--- a/packages/client-discord/src/voice.ts
+++ b/packages/client-discord/src/voice.ts
@@ -253,8 +253,15 @@ export class VoiceManager extends EventEmitter {
             }
         }
 
-        connection.receiver.speaking.on("start", (userId: string) => {
-            const user = channel.members.get(userId);
+        connection.receiver.speaking.on("start", async (userId: string) => {
+            let user = channel.members.get(userId);
+            if (!user) {
+                try {
+                    user = await channel.guild.members.fetch(userId);
+                } catch (error) {
+                    console.error("Failed to fetch user:", error);
+                }
+            }
             if (user && !user?.user.bot) {
                 this.monitorMember(user as GuildMember, channel);
                 this.streams.get(userId)?.emit("speakingStarted");

From 86862cd340fb4a1fd8db5861ee07e179e977a4a5 Mon Sep 17 00:00:00 2001
From: Ting Chien Meng <tcm390@nyu.edu>
Date: Fri, 15 Nov 2024 21:38:30 -0500
Subject: [PATCH 05/19] stop playback if someone speaks

---
 packages/client-discord/src/voice.ts | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/packages/client-discord/src/voice.ts b/packages/client-discord/src/voice.ts
index 12be7d7be88..f8e2565ff16 100644
--- a/packages/client-discord/src/voice.ts
+++ b/packages/client-discord/src/voice.ts
@@ -399,7 +399,9 @@ export class VoiceManager extends EventEmitter {
                 state!.totalLength += buffer.length;
                 state!.lastActive = Date.now();
 
-                const DEBOUNCE_TRANSCRIPTION_THRESHOLD = 3000; // wait for 3 seconds of silence
+                console.log(buffer.length);
+
+                const DEBOUNCE_TRANSCRIPTION_THRESHOLD = 2000; // wait for 2 seconds of silence
 
                 clearTimeout(state!["debounceTimeout"]);
                 state!["debounceTimeout"] = setTimeout(async () => {
@@ -475,6 +477,7 @@ export class VoiceManager extends EventEmitter {
             }
 
             if (state.transcriptionText.length) {
+                this.cleanupAudioPlayer(this.activeAudioPlayer);
                 const finalText = state.transcriptionText;
                 state.transcriptionText = "";
                 await this.handleTranscriptionResult(

From ff72588448987ba7d1c5ded899d6e78ddde4257a Mon Sep 17 00:00:00 2001
From: Ting Chien Meng <tcm390@nyu.edu>
Date: Fri, 15 Nov 2024 21:39:03 -0500
Subject: [PATCH 06/19] remove leftover debug logging of buffer length

---
 packages/client-discord/src/voice.ts | 2 --
 1 file changed, 2 deletions(-)

diff --git a/packages/client-discord/src/voice.ts b/packages/client-discord/src/voice.ts
index f8e2565ff16..e77b22c8a50 100644
--- a/packages/client-discord/src/voice.ts
+++ b/packages/client-discord/src/voice.ts
@@ -399,8 +399,6 @@ export class VoiceManager extends EventEmitter {
                 state!.totalLength += buffer.length;
                 state!.lastActive = Date.now();
 
-                console.log(buffer.length);
-
                 const DEBOUNCE_TRANSCRIPTION_THRESHOLD = 2000; // wait for 2 seconds of silence
 
                 clearTimeout(state!["debounceTimeout"]);

From e49c83ebf57ab88b20797dd361513dae9043ac05 Mon Sep 17 00:00:00 2001
From: Ting Chien Meng <tcm390@nyu.edu>
Date: Sat, 16 Nov 2024 08:39:11 -0500
Subject: [PATCH 07/19] stop the agent's current audio playback if the user is
 speaking

---
 packages/client-discord/src/voice.ts | 30 ++++++++++++++++++++++++----
 1 file changed, 26 insertions(+), 4 deletions(-)

diff --git a/packages/client-discord/src/voice.ts b/packages/client-discord/src/voice.ts
index e77b22c8a50..1f9436b0466 100644
--- a/packages/client-discord/src/voice.ts
+++ b/packages/client-discord/src/voice.ts
@@ -186,7 +186,6 @@ export class VoiceManager extends EventEmitter {
         }
     > = new Map();
     private activeAudioPlayer: AudioPlayer | null = null;
-    private speaking: boolean = false;
     private client: Client;
     private runtime: IAgentRuntime;
     private streams: Map<string, Readable> = new Map();
@@ -293,9 +292,32 @@ export class VoiceManager extends EventEmitter {
         }
         const opusDecoder = new prism.opus.Decoder({
             channels: 1,
-            rate: DECODE_SAMPLE_RATE,
-            frameSize: DECODE_FRAME_SIZE,
+            rate: 16000,
+            frameSize: 1024,
         });
+
+        opusDecoder.on("data", (pcmData: Buffer) => {
+            // If the agent is currently speaking, monitor the volume of each user's audio.
+            // If the user's volume exceeds the threshold, it indicates that the user is actively speaking.
+            // In such cases, stop the agent's current audio playback.
+            if (this.activeAudioPlayer) {
+                const samples = new Int16Array(
+                    pcmData.buffer,
+                    pcmData.byteOffset,
+                    pcmData.length / 2
+                );
+                const rms = Math.sqrt(
+                    samples.reduce((sum, sample) => sum + sample * sample, 0) /
+                        samples.length
+                );
+                const volume = rms / 32768;
+                const SPEAKING_THRESHOLD = 0.1;
+                if (volume > SPEAKING_THRESHOLD) {
+                    this.cleanupAudioPlayer(this.activeAudioPlayer);
+                }
+            }
+        });
+
         pipeline(
             receiveStream as AudioReceiveStream,
             opusDecoder as any,
@@ -399,7 +421,7 @@ export class VoiceManager extends EventEmitter {
                 state!.totalLength += buffer.length;
                 state!.lastActive = Date.now();
 
-                const DEBOUNCE_TRANSCRIPTION_THRESHOLD = 2000; // wait for 2 seconds of silence
+                const DEBOUNCE_TRANSCRIPTION_THRESHOLD = 1000; // wait for 1 seconds of silence
 
                 clearTimeout(state!["debounceTimeout"]);
                 state!["debounceTimeout"] = setTimeout(async () => {

From de646f01f4867e190f3bc32633ca1728d906f7e6 Mon Sep 17 00:00:00 2001
From: Ting Chien Meng <tcm390@nyu.edu>
Date: Sat, 16 Nov 2024 10:48:43 -0500
Subject: [PATCH 08/19] use sliding window for volume detection

---
 packages/client-discord/src/voice.ts | 62 ++++++++++++++++++----------
 1 file changed, 41 insertions(+), 21 deletions(-)

diff --git a/packages/client-discord/src/voice.ts b/packages/client-discord/src/voice.ts
index 1f9436b0466..0ec18fbc706 100644
--- a/packages/client-discord/src/voice.ts
+++ b/packages/client-discord/src/voice.ts
@@ -176,6 +176,7 @@ export class AudioMonitor {
 }
 
 export class VoiceManager extends EventEmitter {
+    private currentTranscriptionId: string | null = null;
     private userStates: Map<
         string,
         {
@@ -292,14 +293,17 @@ export class VoiceManager extends EventEmitter {
         }
         const opusDecoder = new prism.opus.Decoder({
             channels: 1,
-            rate: 16000,
-            frameSize: 1024,
+            rate: DECODE_SAMPLE_RATE,
+            frameSize: DECODE_FRAME_SIZE,
         });
-
+        const volumeBuffer: number[] = [];
+        const VOLUME_WINDOW_SIZE = 30;
+        const SPEAKING_THRESHOLD = 0.05;
         opusDecoder.on("data", (pcmData: Buffer) => {
-            // If the agent is currently speaking, monitor the volume of each user's audio.
-            // If the user's volume exceeds the threshold, it indicates that the user is actively speaking.
-            // In such cases, stop the agent's current audio playback.
+            // Monitor the audio volume while the agent is speaking.
+            // If the average volume of the user's audio exceeds the defined threshold, it indicates active speaking.
+            // When active speaking is detected, stop the agent's current audio playback to avoid overlap.
+
             if (this.activeAudioPlayer) {
                 const samples = new Int16Array(
                     pcmData.buffer,
@@ -311,8 +315,16 @@ export class VoiceManager extends EventEmitter {
                         samples.length
                 );
                 const volume = rms / 32768;
-                const SPEAKING_THRESHOLD = 0.1;
-                if (volume > SPEAKING_THRESHOLD) {
+                volumeBuffer.push(volume);
+                if (volumeBuffer.length > VOLUME_WINDOW_SIZE) {
+                    volumeBuffer.shift();
+                }
+                const avgVolume =
+                    volumeBuffer.reduce((sum, v) => sum + v, 0) /
+                    VOLUME_WINDOW_SIZE;
+
+                if (avgVolume > SPEAKING_THRESHOLD) {
+                    volumeBuffer.length = 0;
                     this.cleanupAudioPlayer(this.activeAudioPlayer);
                 }
             }
@@ -421,7 +433,7 @@ export class VoiceManager extends EventEmitter {
                 state!.totalLength += buffer.length;
                 state!.lastActive = Date.now();
 
-                const DEBOUNCE_TRANSCRIPTION_THRESHOLD = 1000; // wait for 1 seconds of silence
+                const DEBOUNCE_TRANSCRIPTION_THRESHOLD = 1500; // wait for 1 seconds of silence
 
                 clearTimeout(state!["debounceTimeout"]);
                 state!["debounceTimeout"] = setTimeout(async () => {
@@ -463,6 +475,8 @@ export class VoiceManager extends EventEmitter {
     ) {
         const state = this.userStates.get(userId);
         if (!state || state.buffers.length === 0) return;
+        const transcriptionId = Date.now().toString();
+        this.currentTranscriptionId = transcriptionId;
 
         try {
             const inputBuffer = Buffer.concat(state.buffers, state.totalLength);
@@ -506,7 +520,8 @@ export class VoiceManager extends EventEmitter {
                     channelId,
                     channel,
                     name,
-                    userName
+                    userName,
+                    transcriptionId
                 );
             }
         } catch (error) {
@@ -523,7 +538,8 @@ export class VoiceManager extends EventEmitter {
         channelId: string,
         channel: BaseGuildVoiceChannel,
         name: string,
-        userName: string
+        userName: string,
+        transcriptionId: string
     ) {
         try {
             const roomId = stringToUuid(channelId + "-" + this.runtime.agentId);
@@ -622,16 +638,20 @@ export class VoiceManager extends EventEmitter {
                         responseMemory
                     );
                     state = await this.runtime.updateRecentMessageState(state);
-                    const responseStream = await this.runtime
-                        .getService(ServiceType.SPEECH_GENERATION)
-                        .getInstance<ISpeechService>()
-                        .generate(this.runtime, content.text);
-
-                    if (responseStream) {
-                        await this.playAudioStream(
-                            userId,
-                            responseStream as Readable
-                        );
+                    if (transcriptionId === this.currentTranscriptionId) {
+                        // Ensure that only the latest transcription triggers the Eleven Labs API
+                        // to avoid overlapping audio responses and unnecessary expenses
+                        const responseStream = await this.runtime
+                            .getService(ServiceType.SPEECH_GENERATION)
+                            .getInstance<ISpeechService>()
+                            .generate(this.runtime, content.text);
+
+                        if (responseStream) {
+                            await this.playAudioStream(
+                                userId,
+                                responseStream as Readable
+                            );
+                        }
                     }
                     await this.runtime.evaluate(memory, state);
                 } else {

From bdd6f9a2e78791e69ac235faa3ea5611182befcf Mon Sep 17 00:00:00 2001
From: Ting Chien Meng <tcm390@nyu.edu>
Date: Sat, 16 Nov 2024 12:53:59 -0500
Subject: [PATCH 09/19] use maxAmplitude instead of rms

---
 packages/client-discord/src/voice.ts | 61 ++++++++++------------------
 1 file changed, 21 insertions(+), 40 deletions(-)

diff --git a/packages/client-discord/src/voice.ts b/packages/client-discord/src/voice.ts
index 0ec18fbc706..c6a209fb921 100644
--- a/packages/client-discord/src/voice.ts
+++ b/packages/client-discord/src/voice.ts
@@ -176,7 +176,6 @@ export class AudioMonitor {
 }
 
 export class VoiceManager extends EventEmitter {
-    private currentTranscriptionId: string | null = null;
     private userStates: Map<
         string,
         {
@@ -310,12 +309,9 @@ export class VoiceManager extends EventEmitter {
                     pcmData.byteOffset,
                     pcmData.length / 2
                 );
-                const rms = Math.sqrt(
-                    samples.reduce((sum, sample) => sum + sample * sample, 0) /
-                        samples.length
-                );
-                const volume = rms / 32768;
-                volumeBuffer.push(volume);
+                const maxAmplitude = Math.max(...samples.map(Math.abs)) / 32768;
+                volumeBuffer.push(maxAmplitude);
+
                 if (volumeBuffer.length > VOLUME_WINDOW_SIZE) {
                     volumeBuffer.shift();
                 }
@@ -475,9 +471,6 @@ export class VoiceManager extends EventEmitter {
     ) {
         const state = this.userStates.get(userId);
         if (!state || state.buffers.length === 0) return;
-        const transcriptionId = Date.now().toString();
-        this.currentTranscriptionId = transcriptionId;
-
         try {
             const inputBuffer = Buffer.concat(state.buffers, state.totalLength);
             state.buffers.length = 0; // Clear the buffers
@@ -493,20 +486,12 @@ export class VoiceManager extends EventEmitter {
                 .getInstance<ITranscriptionService>()
                 .transcribe(wavBuffer);
 
-            function invalidText(text: string): boolean {
-                if (text.includes("[BLANK_AUDIO]")) {
-                    return true;
-                }
-                // if (text.length < 5 && text.toLowerCase().includes("you")) { // not sure what is this
-                //     return true;
-                // }
-                if (text === null) {
-                    return true;
-                }
-                return false;
+            function isValidTranscription(text: string): boolean {
+                if (!text || text.includes("[BLANK_AUDIO]")) return false;
+                return true;
             }
 
-            if (transcriptionText && !invalidText(transcriptionText)) {
+            if (transcriptionText && isValidTranscription(transcriptionText)) {
                 state.transcriptionText += transcriptionText;
             }
 
@@ -520,8 +505,7 @@ export class VoiceManager extends EventEmitter {
                     channelId,
                     channel,
                     name,
-                    userName,
-                    transcriptionId
+                    userName
                 );
             }
         } catch (error) {
@@ -538,8 +522,7 @@ export class VoiceManager extends EventEmitter {
         channelId: string,
         channel: BaseGuildVoiceChannel,
         name: string,
-        userName: string,
-        transcriptionId: string
+        userName: string
     ) {
         try {
             const roomId = stringToUuid(channelId + "-" + this.runtime.agentId);
@@ -638,21 +621,19 @@ export class VoiceManager extends EventEmitter {
                         responseMemory
                     );
                     state = await this.runtime.updateRecentMessageState(state);
-                    if (transcriptionId === this.currentTranscriptionId) {
-                        // Ensure that only the latest transcription triggers the Eleven Labs API
-                        // to avoid overlapping audio responses and unnecessary expenses
-                        const responseStream = await this.runtime
-                            .getService(ServiceType.SPEECH_GENERATION)
-                            .getInstance<ISpeechService>()
-                            .generate(this.runtime, content.text);
-
-                        if (responseStream) {
-                            await this.playAudioStream(
-                                userId,
-                                responseStream as Readable
-                            );
-                        }
+
+                    const responseStream = await this.runtime
+                        .getService(ServiceType.SPEECH_GENERATION)
+                        .getInstance<ISpeechService>()
+                        .generate(this.runtime, content.text);
+
+                    if (responseStream) {
+                        await this.playAudioStream(
+                            userId,
+                            responseStream as Readable
+                        );
                     }
+
                     await this.runtime.evaluate(memory, state);
                 } else {
                     console.warn("Empty response, skipping");

From 7627145ac637f78a7d2399d068441cc91f289cae Mon Sep 17 00:00:00 2001
From: Ting Chien Meng <tcm390@nyu.edu>
Date: Sun, 17 Nov 2024 11:20:00 -0500
Subject: [PATCH 10/19] integrate shouldRespond logic for voice chat and move
 templates into a templates file for reusability

---
 packages/client-discord/src/messages.ts  | 106 +------------------
 packages/client-discord/src/templates.ts | 126 +++++++++++++++++++++++
 packages/client-discord/src/voice.ts     | 115 +++++++++++++++------
 3 files changed, 216 insertions(+), 131 deletions(-)
 create mode 100644 packages/client-discord/src/templates.ts

diff --git a/packages/client-discord/src/messages.ts b/packages/client-discord/src/messages.ts
index d114639ea4d..91542e5a1b3 100644
--- a/packages/client-discord/src/messages.ts
+++ b/packages/client-discord/src/messages.ts
@@ -37,6 +37,10 @@ import { elizaLogger } from "@ai16z/eliza/src/logger.ts";
 import { AttachmentManager } from "./attachments.ts";
 import { VoiceManager } from "./voice.ts";
 import { Service } from "@ai16z/eliza";
+import {
+    discordShouldRespondTemplate,
+    discordMessageHandlerTemplate,
+} from "./templates.ts";
 
 const MAX_MESSAGE_LENGTH = 1900;
 async function generateSummary(
@@ -88,108 +92,6 @@ export type InterestChannels = {
     };
 };
 
-const discordShouldRespondTemplate =
-    `# Task: Decide if {{agentName}} should respond.
-About {{agentName}}:
-{{bio}}
-
-# INSTRUCTIONS: Determine if {{agentName}} should respond to the message and participate in the conversation. Do not comment. Just respond with "RESPOND" or "IGNORE" or "STOP".
-
-# RESPONSE EXAMPLES
-<user 1>: I just saw a really great movie
-<user 2>: Oh? Which movie?
-Result: [IGNORE]
-
-{{agentName}}: Oh, this is my favorite scene
-<user 1>: sick
-<user 2>: wait, why is it your favorite scene
-Result: [RESPOND]
-
-<user>: stfu bot
-Result: [STOP]
-
-<user>: Hey {{agent}}, can you help me with something
-Result: [RESPOND]
-
-<user>: {{agentName}} stfu plz
-Result: [STOP]
-
-<user>: i need help
-{{agentName}}: how can I help you?
-<user>: no. i need help from someone else
-Result: [IGNORE]
-
-<user>: Hey {{agent}}, can I ask you a question
-{{agentName}}: Sure, what is it
-<user>: can you ask claude to create a basic react module that demonstrates a counter
-Result: [RESPOND]
-
-<user>: {{agentName}} can you tell me a story
-<user>: {about a girl named elara
-{{agentName}}: Sure.
-{{agentName}}: Once upon a time, in a quaint little village, there was a curious girl named Elara.
-{{agentName}}: Elara was known for her adventurous spirit and her knack for finding beauty in the mundane.
-<user>: I'm loving it, keep going
-Result: [RESPOND]
-
-<user>: {{agentName}} stop responding plz
-Result: [STOP]
-
-<user>: okay, i want to test something. can you say marco?
-{{agentName}}: marco
-<user>: great. okay, now do it again
-Result: [RESPOND]
-
-Response options are [RESPOND], [IGNORE] and [STOP].
-
-{{agentName}} is in a room with other users and is very worried about being annoying and saying too much.
-Respond with [RESPOND] to messages that are directed at {{agentName}}, or participate in conversations that are interesting or relevant to their background.
-If a message is not interesting or relevant, respond with [IGNORE]
-Unless directly responding to a user, respond with [IGNORE] to messages that are very short or do not contain much information.
-If a user asks {{agentName}} to be quiet, respond with [STOP]
-If {{agentName}} concludes a conversation and isn't part of the conversation anymore, respond with [STOP]
-
-IMPORTANT: {{agentName}} is particularly sensitive about being annoying, so if there is any doubt, it is better to respond with [IGNORE].
-If {{agentName}} is conversing with a user and they have not asked to stop, it is better to respond with [RESPOND].
-
-{{recentMessages}}
-
-# INSTRUCTIONS: Choose the option that best describes {{agentName}}'s response to the last message. Ignore messages if they are addressed to someone else.
-` + shouldRespondFooter;
-
-export const discordMessageHandlerTemplate =
-    // {{goals}}
-    `# Action Examples
-{{actionExamples}}
-(Action examples are for reference only. Do not use the information from them in your response.)
-
-# Knowledge
-{{knowledge}}
-
-# Task: Generate dialog and actions for the character {{agentName}}.
-About {{agentName}}:
-{{bio}}
-{{lore}}
-
-Examples of {{agentName}}'s dialog and actions:
-{{characterMessageExamples}}
-
-{{providers}}
-
-{{attachments}}
-
-{{actions}}
-
-# Capabilities
-Note that {{agentName}} is capable of reading/seeing/hearing various forms of media, including images, videos, audio, plaintext and PDFs. Recent attachments have been included above under the "Attachments" section.
-
-{{messageDirections}}
-
-{{recentMessages}}
-
-# Instructions: Write the next message for {{agentName}}. Include an action, if appropriate. {{actionNames}}
-` + messageCompletionFooter;
-
 export async function sendMessageInChunks(
     channel: TextChannel,
     content: string,
diff --git a/packages/client-discord/src/templates.ts b/packages/client-discord/src/templates.ts
new file mode 100644
index 00000000000..18345fc02e0
--- /dev/null
+++ b/packages/client-discord/src/templates.ts
@@ -0,0 +1,126 @@
+import {
+    shouldRespondFooter,
+    messageCompletionFooter,
+} from "@ai16z/eliza/src/parsing.ts";
+
+export const discordShouldRespondTemplate =
+    `# Task: Decide if {{agentName}} should respond.
+About {{agentName}}:
+{{bio}}
+
+# INSTRUCTIONS: Determine if {{agentName}} should respond to the message and participate in the conversation. Do not comment. Just respond with "RESPOND" or "IGNORE" or "STOP".
+
+# RESPONSE EXAMPLES
+<user 1>: I just saw a really great movie
+<user 2>: Oh? Which movie?
+Result: [IGNORE]
+
+{{agentName}}: Oh, this is my favorite scene
+<user 1>: sick
+<user 2>: wait, why is it your favorite scene
+Result: [RESPOND]
+
+<user>: stfu bot
+Result: [STOP]
+
+<user>: Hey {{agent}}, can you help me with something
+Result: [RESPOND]
+
+<user>: {{agentName}} stfu plz
+Result: [STOP]
+
+<user>: i need help
+{{agentName}}: how can I help you?
+<user>: no. i need help from someone else
+Result: [IGNORE]
+
+<user>: Hey {{agent}}, can I ask you a question
+{{agentName}}: Sure, what is it
+<user>: can you ask claude to create a basic react module that demonstrates a counter
+Result: [RESPOND]
+
+<user>: {{agentName}} can you tell me a story
+<user>: {about a girl named elara
+{{agentName}}: Sure.
+{{agentName}}: Once upon a time, in a quaint little village, there was a curious girl named Elara.
+{{agentName}}: Elara was known for her adventurous spirit and her knack for finding beauty in the mundane.
+<user>: I'm loving it, keep going
+Result: [RESPOND]
+
+<user>: {{agentName}} stop responding plz
+Result: [STOP]
+
+<user>: okay, i want to test something. can you say marco?
+{{agentName}}: marco
+<user>: great. okay, now do it again
+Result: [RESPOND]
+
+Response options are [RESPOND], [IGNORE] and [STOP].
+
+{{agentName}} is in a room with other users and is very worried about being annoying and saying too much.
+Respond with [RESPOND] to messages that are directed at {{agentName}}, or participate in conversations that are interesting or relevant to their background.
+If a message is not interesting or relevant, respond with [IGNORE]
+Unless directly responding to a user, respond with [IGNORE] to messages that are very short or do not contain much information.
+If a user asks {{agentName}} to be quiet, respond with [STOP]
+If {{agentName}} concludes a conversation and isn't part of the conversation anymore, respond with [STOP]
+
+IMPORTANT: {{agentName}} is particularly sensitive about being annoying, so if there is any doubt, it is better to respond with [IGNORE].
+If {{agentName}} is conversing with a user and they have not asked to stop, it is better to respond with [RESPOND].
+
+{{recentMessages}}
+
+# INSTRUCTIONS: Choose the option that best describes {{agentName}}'s response to the last message. Ignore messages if they are addressed to someone else.
+` + shouldRespondFooter;
+
+export const discordVoiceHandlerTemplate =
+    `# Task: Generate conversational voice dialog for {{agentName}}.
+About {{agentName}}:
+{{bio}}
+
+# Attachments
+{{attachments}}
+
+# Capabilities
+Note that {{agentName}} is capable of reading/seeing/hearing various forms of media, including images, videos, audio, plaintext and PDFs. Recent attachments have been included above under the "Attachments" section.
+
+{{actions}}
+
+{{messageDirections}}
+
+{{recentMessages}}
+
+# Instructions: Write the next message for {{agentName}}. Include an optional action if appropriate. {{actionNames}}
+` + messageCompletionFooter;
+
+export const discordMessageHandlerTemplate =
+    // {{goals}}
+    `# Action Examples
+{{actionExamples}}
+(Action examples are for reference only. Do not use the information from them in your response.)
+
+# Knowledge
+{{knowledge}}
+
+# Task: Generate dialog and actions for the character {{agentName}}.
+About {{agentName}}:
+{{bio}}
+{{lore}}
+
+Examples of {{agentName}}'s dialog and actions:
+{{characterMessageExamples}}
+
+{{providers}}
+
+{{attachments}}
+
+{{actions}}
+
+# Capabilities
+Note that {{agentName}} is capable of reading/seeing/hearing various forms of media, including images, videos, audio, plaintext and PDFs. Recent attachments have been included above under the "Attachments" section.
+
+{{messageDirections}}
+
+{{recentMessages}}
+
+# Instructions: Write the next message for {{agentName}}. Include an action, if appropriate. {{actionNames}}
+` + messageCompletionFooter;
diff --git a/packages/client-discord/src/voice.ts b/packages/client-discord/src/voice.ts
index c6a209fb921..8a77bd3855a 100644
--- a/packages/client-discord/src/voice.ts
+++ b/packages/client-discord/src/voice.ts
@@ -22,7 +22,10 @@ import EventEmitter from "events";
 import prism from "prism-media";
 import { Readable, pipeline } from "stream";
 import { composeContext } from "@ai16z/eliza/src/context.ts";
-import { generateMessageResponse } from "@ai16z/eliza/src/generation.ts";
+import {
+    generateMessageResponse,
+    generateShouldRespond,
+} from "@ai16z/eliza/src/generation.ts";
 import { embeddingZeroVector } from "@ai16z/eliza/src/memory.ts";
 import {
     Content,
@@ -38,6 +41,10 @@ import {
     UUID,
 } from "@ai16z/eliza/src/types.ts";
 import { stringToUuid } from "@ai16z/eliza/src/uuid.ts";
+import {
+    discordShouldRespondTemplate,
+    discordVoiceHandlerTemplate,
+} from "./templates.ts";
 
 export function getWavHeader(
     audioLength: number,
@@ -65,28 +72,6 @@ export function getWavHeader(
     return wavHeader;
 }
 
-import { messageCompletionFooter } from "@ai16z/eliza/src/parsing.ts";
-
-const discordVoiceHandlerTemplate =
-    `# Task: Generate conversational voice dialog for {{agentName}}.
-About {{agentName}}:
-{{bio}}
-
-# Attachments
-{{attachments}}
-
-# Capabilities
-Note that {{agentName}} is capable of reading/seeing/hearing various forms of media, including images, videos, audio, plaintext and PDFs. Recent attachments have been included above under the "Attachments" section.
-
-{{actions}}
-
-{{messageDirections}}
-
-{{recentMessages}}
-
-# Instructions: Write the next message for {{agentName}}. Include an optional action if appropriate. {{actionNames}}
-` + messageCompletionFooter;
-
 // These values are chosen for compatibility with picovoice components
 const DECODE_FRAME_SIZE = 1024;
 const DECODE_SAMPLE_RATE = 16000;
@@ -499,7 +484,7 @@ export class VoiceManager extends EventEmitter {
                 this.cleanupAudioPlayer(this.activeAudioPlayer);
                 const finalText = state.transcriptionText;
                 state.transcriptionText = "";
-                await this.handleTranscriptionResult(
+                await this.handleUserMessage(
                     finalText,
                     userId,
                     channelId,
@@ -516,8 +501,8 @@ export class VoiceManager extends EventEmitter {
         }
     }
 
-    private async handleTranscriptionResult(
-        text: string,
+    private async handleUserMessage(
+        message: string,
         userId: UUID,
         channelId: string,
         channel: BaseGuildVoiceChannel,
@@ -539,7 +524,7 @@ export class VoiceManager extends EventEmitter {
             let state = await this.runtime.composeState(
                 {
                     agentId: this.runtime.agentId,
-                    content: { text: text, source: "Discord" },
+                    content: { text: message, source: "Discord" },
                     userId: userIdUUID,
                     roomId,
                 },
@@ -550,7 +535,7 @@ export class VoiceManager extends EventEmitter {
                 }
             );
 
-            if (text && text.startsWith("/")) {
+            if (message && message.startsWith("/")) {
                 return null;
             }
 
@@ -558,7 +543,7 @@ export class VoiceManager extends EventEmitter {
                 id: stringToUuid(channelId + "-voice-message-" + Date.now()),
                 agentId: this.runtime.agentId,
                 content: {
-                    text: text,
+                    text: message,
                     source: "discord",
                     url: channel.url,
                 },
@@ -582,6 +567,17 @@ export class VoiceManager extends EventEmitter {
                 return { text: "", action: "IGNORE" };
             }
 
+            const shouldRespond = await this._shouldRespond(
+                message,
+                userId,
+                channel,
+                state
+            );
+
+            if (!shouldRespond) {
+                return;
+            }
+
             const context = composeContext({
                 state,
                 template:
@@ -666,6 +662,67 @@ export class VoiceManager extends EventEmitter {
         }
     }
 
+    private async _shouldRespond(
+        message: string,
+        userId: UUID,
+        channel: BaseGuildVoiceChannel,
+        state: State
+    ): Promise<boolean> {
+        if (userId === this.client.user?.id) return false;
+        // if (message.author.bot) return false;
+        const lowerMessage = message.toLowerCase();
+        const botName = this.client.user.username.toLowerCase();
+        const characterName = this.runtime.character.name.toLowerCase();
+        const guild = channel.guild;
+        const member = guild?.members.cache.get(this.client.user?.id as string);
+        const nickname = member?.nickname;
+
+        if (
+            lowerMessage.includes(botName as string) ||
+            lowerMessage.includes(characterName) ||
+            lowerMessage.includes(
+                this.client.user?.tag.toLowerCase() as string
+            ) ||
+            (nickname && lowerMessage.includes(nickname.toLowerCase()))
+        ) {
+            return true;
+        }
+
+        if (!channel.guild) {
+            return true;
+        }
+
+        // If none of the above conditions are met, use the generateText to decide
+        const shouldRespondContext = composeContext({
+            state,
+            template:
+                this.runtime.character.templates
+                    ?.discordShouldRespondTemplate ||
+                this.runtime.character.templates?.shouldRespondTemplate ||
+                discordShouldRespondTemplate,
+        });
+
+        const response = await generateShouldRespond({
+            runtime: this.runtime,
+            context: shouldRespondContext,
+            modelClass: ModelClass.SMALL,
+        });
+
+        if (response === "RESPOND") {
+            return true;
+        } else if (response === "IGNORE") {
+            return false;
+        } else if (response === "STOP") {
+            return false;
+        } else {
+            console.error(
+                "Invalid response from response generateText:",
+                response
+            );
+            return false;
+        }
+    }
+
     private async convertOpusToWav(pcmBuffer: Buffer): Promise<Buffer> {
         try {
             // Generate the WAV header

From b498c11d7de21714a8f550719edf1bfc133a2fbe Mon Sep 17 00:00:00 2001
From: Ting Chien Meng <tcm390@nyu.edu>
Date: Sun, 17 Nov 2024 11:26:21 -0500
Subject: [PATCH 11/19] clean code

---
 packages/client-discord/src/messages.ts | 97 +++++++++----------------
 packages/client-discord/src/voice.ts    |  1 -
 2 files changed, 34 insertions(+), 64 deletions(-)

diff --git a/packages/client-discord/src/messages.ts b/packages/client-discord/src/messages.ts
index 91542e5a1b3..0595795a1d3 100644
--- a/packages/client-discord/src/messages.ts
+++ b/packages/client-discord/src/messages.ts
@@ -4,10 +4,6 @@ import {
     generateShouldRespond,
 } from "@ai16z/eliza/src/generation.ts";
 import { embeddingZeroVector } from "@ai16z/eliza/src/memory.ts";
-import {
-    messageCompletionFooter,
-    shouldRespondFooter,
-} from "@ai16z/eliza/src/parsing.ts";
 import {
     Content,
     HandlerCallback,
@@ -36,7 +32,6 @@ import {
 import { elizaLogger } from "@ai16z/eliza/src/logger.ts";
 import { AttachmentManager } from "./attachments.ts";
 import { VoiceManager } from "./voice.ts";
-import { Service } from "@ai16z/eliza";
 import {
     discordShouldRespondTemplate,
     discordMessageHandlerTemplate,
@@ -403,73 +398,49 @@ export class MessageManager {
                                 message.id + "-" + this.runtime.agentId
                             );
                         }
-                        if (false) {
-                            // For voice channels, use text-to-speech
-                            const audioStream = await this.runtime
-                                .getService(ServiceType.SPEECH_GENERATION)
-                                .getInstance<ISpeechService>()
-                                .generate(this.runtime, content.text);
-                            await this.voiceManager.playAudioStream(
-                                userId,
-                                audioStream
-                            );
+
+                        // For text channels, send the message
+                        const messages = await sendMessageInChunks(
+                            message.channel as TextChannel,
+                            content.text,
+                            message.id,
+                            files
+                        );
+
+                        const memories: Memory[] = [];
+                        for (const m of messages) {
+                            let action = content.action;
+                            // If there's only one message or it's the last message, keep the original action
+                            // For multiple messages, set all but the last to 'CONTINUE'
+                            if (
+                                messages.length > 1 &&
+                                m !== messages[messages.length - 1]
+                            ) {
+                                action = "CONTINUE";
+                            }
+
                             const memory: Memory = {
                                 id: stringToUuid(
-                                    message.id + "-" + this.runtime.agentId
+                                    m.id + "-" + this.runtime.agentId
                                 ),
                                 userId: this.runtime.agentId,
                                 agentId: this.runtime.agentId,
-                                content,
+                                content: {
+                                    ...content,
+                                    action,
+                                    inReplyTo: messageId,
+                                    url: m.url,
+                                },
                                 roomId,
                                 embedding: embeddingZeroVector,
+                                createdAt: m.createdTimestamp,
                             };
-                            return [memory];
-                        } else {
-                            // For text channels, send the message
-                            const messages = await sendMessageInChunks(
-                                message.channel as TextChannel,
-                                content.text,
-                                message.id,
-                                files
-                            );
-
-                            const memories: Memory[] = [];
-                            for (const m of messages) {
-                                let action = content.action;
-                                // If there's only one message or it's the last message, keep the original action
-                                // For multiple messages, set all but the last to 'CONTINUE'
-                                if (
-                                    messages.length > 1 &&
-                                    m !== messages[messages.length - 1]
-                                ) {
-                                    action = "CONTINUE";
-                                }
-
-                                const memory: Memory = {
-                                    id: stringToUuid(
-                                        m.id + "-" + this.runtime.agentId
-                                    ),
-                                    userId: this.runtime.agentId,
-                                    agentId: this.runtime.agentId,
-                                    content: {
-                                        ...content,
-                                        action,
-                                        inReplyTo: messageId,
-                                        url: m.url,
-                                    },
-                                    roomId,
-                                    embedding: embeddingZeroVector,
-                                    createdAt: m.createdTimestamp,
-                                };
-                                memories.push(memory);
-                            }
-                            for (const m of memories) {
-                                await this.runtime.messageManager.createMemory(
-                                    m
-                                );
-                            }
-                            return memories;
+                            memories.push(memory);
+                        }
+                        for (const m of memories) {
+                            await this.runtime.messageManager.createMemory(m);
                         }
+                        return memories;
                     } catch (error) {
                         console.error("Error sending message:", error);
                         return [];
diff --git a/packages/client-discord/src/voice.ts b/packages/client-discord/src/voice.ts
index 8a77bd3855a..d4779c96e5e 100644
--- a/packages/client-discord/src/voice.ts
+++ b/packages/client-discord/src/voice.ts
@@ -35,7 +35,6 @@ import {
     ITranscriptionService,
     Memory,
     ModelClass,
-    Service,
     ServiceType,
     State,
     UUID,

From 282c3d8753a078df7607ae5b8285b675263978e1 Mon Sep 17 00:00:00 2001
From: Ting Chien Meng <tcm390@nyu.edu>
Date: Tue, 19 Nov 2024 19:44:16 -0500
Subject: [PATCH 12/19] join specific channel id

---
 packages/client-discord/src/voice.ts | 48 ++++++++++++++++++++--------
 1 file changed, 34 insertions(+), 14 deletions(-)

diff --git a/packages/client-discord/src/voice.ts b/packages/client-discord/src/voice.ts
index d4779c96e5e..58592714909 100644
--- a/packages/client-discord/src/voice.ts
+++ b/packages/client-discord/src/voice.ts
@@ -824,24 +824,44 @@ export class VoiceManager extends EventEmitter {
     }
 
     async scanGuild(guild: Guild) {
-        const channels = (await guild.channels.fetch()).filter(
-            (channel) => channel?.type == ChannelType.GuildVoice
-        );
         let chosenChannel: BaseGuildVoiceChannel | null = null;
 
-        for (const [, channel] of channels) {
-            const voiceChannel = channel as BaseGuildVoiceChannel;
-            if (
-                voiceChannel.members.size > 0 &&
-                (chosenChannel === null ||
-                    voiceChannel.members.size > chosenChannel.members.size)
-            ) {
-                chosenChannel = voiceChannel;
+        try {
+            const channelId = this.runtime.getSetting(
+                "DISCORD_VOICE_CHANNEL_ID"
+            ) as string;
+            if (channelId) {
+                const channel = await guild.channels.fetch(channelId);
+                if (channel?.isVoiceBased()) {
+                    chosenChannel = channel as BaseGuildVoiceChannel;
+                }
             }
-        }
 
-        if (chosenChannel != null) {
-            this.joinChannel(chosenChannel);
+            if (!chosenChannel) {
+                const channels = (await guild.channels.fetch()).filter(
+                    (channel) => channel?.type == ChannelType.GuildVoice
+                );
+                for (const [, channel] of channels) {
+                    const voiceChannel = channel as BaseGuildVoiceChannel;
+                    if (
+                        voiceChannel.members.size > 0 &&
+                        (chosenChannel === null ||
+                            voiceChannel.members.size >
+                                chosenChannel.members.size)
+                    ) {
+                        chosenChannel = voiceChannel;
+                    }
+                }
+            }
+
+            if (chosenChannel) {
+                console.log(`Joining channel: ${chosenChannel.name}`);
+                await this.joinChannel(chosenChannel);
+            } else {
+                console.warn("No suitable voice channel found to join.");
+            }
+        } catch (error) {
+            console.error("Error selecting or joining a voice channel:", error);
         }
     }
 

From 9a154406f163c8dfbe5a78c4edf0f703bcb0ca02 Mon Sep 17 00:00:00 2001
From: Ting Chien Meng <tcm390@nyu.edu>
Date: Thu, 21 Nov 2024 12:11:23 -0500
Subject: [PATCH 13/19] clean code

---
 packages/client-discord/src/messages.ts | 14 +-------------
 packages/client-discord/src/voice.ts    |  3 ---
 2 files changed, 1 insertion(+), 16 deletions(-)

diff --git a/packages/client-discord/src/messages.ts b/packages/client-discord/src/messages.ts
index 0595795a1d3..9e0902c4748 100644
--- a/packages/client-discord/src/messages.ts
+++ b/packages/client-discord/src/messages.ts
@@ -106,13 +106,6 @@ export async function sendMessageInChunks(
                     content: message.trim(),
                 };
 
-                // if (i === 0 && inReplyTo) {
-                //   // Reply to the specified message for the first chunk
-                //   options.reply = {
-                //     messageReference: inReplyTo,
-                //   };
-                // }
-
                 if (i === messages.length - 1 && files && files.length > 0) {
                     // Attach files to the last message chunk
                     options.files = files;
@@ -235,11 +228,7 @@ export class MessageManager {
     }
 
     async handleMessage(message: DiscordMessage) {
-        if (
-            message.interaction ||
-            message.author.id ===
-                this.client.user?.id /* || message.author?.bot*/
-        )
+        if (message.interaction || message.author.id === this.client.user?.id)
             return;
         const userId = message.author.id as UUID;
         const userName = message.author.username;
@@ -694,7 +683,6 @@ export class MessageManager {
         state: State
     ): Promise<boolean> {
         if (message.author.id === this.client.user?.id) return false;
-        // if (message.author.bot) return false;
         if (message.mentions.has(this.client.user?.id as string)) return true;
 
         const guild = message.guild;
diff --git a/packages/client-discord/src/voice.ts b/packages/client-discord/src/voice.ts
index 58592714909..1ceb58784d8 100644
--- a/packages/client-discord/src/voice.ts
+++ b/packages/client-discord/src/voice.ts
@@ -91,7 +91,6 @@ export class AudioMonitor {
         this.readable = readable;
         this.maxSize = maxSize;
         this.readable.on("data", (chunk: Buffer) => {
-            //console.log('AudioMonitor got data');
             if (this.lastFlagged < 0) {
                 this.lastFlagged = this.buffers.length;
             }
@@ -384,7 +383,6 @@ export class VoiceManager extends EventEmitter {
 
     async handleGuildCreate(guild: Guild) {
         console.log(`Joined guild ${guild.name}`);
-        // this.scanGuild(guild);
     }
 
     async handleUserStream(
@@ -668,7 +666,6 @@ export class VoiceManager extends EventEmitter {
         state: State
     ): Promise<boolean> {
         if (userId === this.client.user?.id) return false;
-        // if (message.author.bot) return false;
         const lowerMessage = message.toLowerCase();
         const botName = this.client.user.username.toLowerCase();
         const characterName = this.runtime.character.name.toLowerCase();

From 169fd72cc77a87210154b6e51f3f1abeba49ec26 Mon Sep 17 00:00:00 2001
From: Ting Chien Meng <tcm390@nyu.edu>
Date: Thu, 21 Nov 2024 13:40:24 -0500
Subject: [PATCH 14/19] use lodash for debounce

---
 packages/client-discord/src/voice.ts | 27 +++++++++++++++------------
 1 file changed, 15 insertions(+), 12 deletions(-)

diff --git a/packages/client-discord/src/voice.ts b/packages/client-discord/src/voice.ts
index 1ceb58784d8..a94fd061069 100644
--- a/packages/client-discord/src/voice.ts
+++ b/packages/client-discord/src/voice.ts
@@ -45,6 +45,8 @@ import {
     discordVoiceHandlerTemplate,
 } from "./templates.ts";
 
+import debounce from "lodash/debounce.js";
+
 export function getWavHeader(
     audioLength: number,
     sampleRate: number,
@@ -405,24 +407,25 @@ export class VoiceManager extends EventEmitter {
 
         const state = this.userStates.get(userId);
 
+        const DEBOUNCE_TRANSCRIPTION_THRESHOLD = 1500; // wait for 1.5 seconds of silence
+
+        const debouncedProcessTranscription = debounce(async () => {
+            await this.processTranscription(
+                userId,
+                channelId,
+                channel,
+                name,
+                userName
+            );
+        }, DEBOUNCE_TRANSCRIPTION_THRESHOLD);
+
         const processBuffer = async (buffer: Buffer) => {
             try {
                 state!.buffers.push(buffer);
                 state!.totalLength += buffer.length;
                 state!.lastActive = Date.now();
 
-                const DEBOUNCE_TRANSCRIPTION_THRESHOLD = 1500; // wait for 1 seconds of silence
-
-                clearTimeout(state!["debounceTimeout"]);
-                state!["debounceTimeout"] = setTimeout(async () => {
-                    await this.processTranscription(
-                        userId,
-                        channelId,
-                        channel,
-                        name,
-                        userName
-                    );
-                }, DEBOUNCE_TRANSCRIPTION_THRESHOLD);
+                debouncedProcessTranscription();
             } catch (error) {
                 console.error(
                     `Error processing buffer for user ${userId}:`,

From 00a64dc571c26ad4e99c8b50f2509b3a0da64164 Mon Sep 17 00:00:00 2001
From: Ting Chien Meng <tcm390@nyu.edu>
Date: Thu, 21 Nov 2024 13:59:38 -0500
Subject: [PATCH 15/19] utils file

---
 packages/client-discord/src/messages.ts | 179 +-----------------------
 packages/client-discord/src/voice.ts    |  28 +---
 2 files changed, 8 insertions(+), 199 deletions(-)

diff --git a/packages/client-discord/src/messages.ts b/packages/client-discord/src/messages.ts
index 9e0902c4748..8d4c2039bab 100644
--- a/packages/client-discord/src/messages.ts
+++ b/packages/client-discord/src/messages.ts
@@ -37,179 +37,12 @@ import {
     discordMessageHandlerTemplate,
 } from "./templates.ts";
 
-const MAX_MESSAGE_LENGTH = 1900;
-async function generateSummary(
-    runtime: IAgentRuntime,
-    text: string
-): Promise<{ title: string; description: string }> {
-    // make sure text is under 128k characters
-    text = trimTokens(text, 100000, "gpt-4o-mini"); // TODO: clean this up
-
-    const prompt = `Please generate a concise summary for the following text:
-  
-  Text: """
-  ${text}
-  """
-  
-  Respond with a JSON object in the following format:
-  \`\`\`json
-  {
-    "title": "Generated Title",
-    "summary": "Generated summary and/or description of the text"
-  }
-  \`\`\``;
-
-    const response = await generateText({
-        runtime,
-        context: prompt,
-        modelClass: ModelClass.SMALL,
-    });
-
-    const parsedResponse = parseJSONObjectFromText(response);
-
-    if (parsedResponse) {
-        return {
-            title: parsedResponse.title,
-            description: parsedResponse.summary,
-        };
-    }
-
-    return {
-        title: "",
-        description: "",
-    };
-}
-
-export type InterestChannels = {
-    [key: string]: {
-        lastMessageSent: number;
-        messages: { userId: UUID; userName: string; content: Content }[];
-    };
-};
-
-export async function sendMessageInChunks(
-    channel: TextChannel,
-    content: string,
-    inReplyTo: string,
-    files: any[]
-): Promise<DiscordMessage[]> {
-    const sentMessages: DiscordMessage[] = [];
-    const messages = splitMessage(content);
-    try {
-        for (let i = 0; i < messages.length; i++) {
-            const message = messages[i];
-            if (
-                message.trim().length > 0 ||
-                (i === messages.length - 1 && files && files.length > 0)
-            ) {
-                const options: any = {
-                    content: message.trim(),
-                };
-
-                if (i === messages.length - 1 && files && files.length > 0) {
-                    // Attach files to the last message chunk
-                    options.files = files;
-                }
-
-                const m = await channel.send(options);
-                sentMessages.push(m);
-            }
-        }
-    } catch (error) {
-        elizaLogger.error("Error sending message:", error);
-    }
-
-    return sentMessages;
-}
-
-function splitMessage(content: string): string[] {
-    const messages: string[] = [];
-    let currentMessage = "";
-
-    const rawLines = content?.split("\n") || [];
-    // split all lines into MAX_MESSAGE_LENGTH chunks so any long lines are split
-    const lines = rawLines
-        .map((line) => {
-            const chunks = [];
-            while (line.length > MAX_MESSAGE_LENGTH) {
-                chunks.push(line.slice(0, MAX_MESSAGE_LENGTH));
-                line = line.slice(MAX_MESSAGE_LENGTH);
-            }
-            chunks.push(line);
-            return chunks;
-        })
-        .flat();
-
-    for (const line of lines) {
-        if (currentMessage.length + line.length + 1 > MAX_MESSAGE_LENGTH) {
-            messages.push(currentMessage.trim());
-            currentMessage = "";
-        }
-        currentMessage += line + "\n";
-    }
-
-    if (currentMessage.trim().length > 0) {
-        messages.push(currentMessage.trim());
-    }
-
-    return messages;
-}
-
-function canSendMessage(channel) {
-    // if it is a DM channel, we can always send messages
-    if (channel.type === ChannelType.DM) {
-        return {
-            canSend: true,
-            reason: null,
-        };
-    }
-    const botMember = channel.guild?.members.cache.get(channel.client.user.id);
-
-    if (!botMember) {
-        return {
-            canSend: false,
-            reason: "Not a guild channel or bot member not found",
-        };
-    }
-
-    // Required permissions for sending messages
-    const requiredPermissions = [
-        PermissionsBitField.Flags.ViewChannel,
-        PermissionsBitField.Flags.SendMessages,
-        PermissionsBitField.Flags.ReadMessageHistory,
-    ];
-
-    // Add thread-specific permission if it's a thread
-    if (channel instanceof ThreadChannel) {
-        requiredPermissions.push(
-            PermissionsBitField.Flags.SendMessagesInThreads
-        );
-    }
-
-    // Check permissions
-    const permissions = channel.permissionsFor(botMember);
-
-    if (!permissions) {
-        return {
-            canSend: false,
-            reason: "Could not retrieve permissions",
-        };
-    }
-
-    // Check each required permission
-    const missingPermissions = requiredPermissions.filter(
-        (perm) => !permissions.has(perm)
-    );
-
-    return {
-        canSend: missingPermissions.length === 0,
-        missingPermissions: missingPermissions,
-        reason:
-            missingPermissions.length > 0
-                ? `Missing permissions: ${missingPermissions.map((p) => String(p)).join(", ")}`
-                : null,
-    };
-}
+import {
+    canSendMessage,
+    generateSummary,
+    InterestChannels,
+    sendMessageInChunks,
+} from "./utils.ts";
 
 export class MessageManager {
     private client: Client;
diff --git a/packages/client-discord/src/voice.ts b/packages/client-discord/src/voice.ts
index a94fd061069..48d35123901 100644
--- a/packages/client-discord/src/voice.ts
+++ b/packages/client-discord/src/voice.ts
@@ -45,33 +45,9 @@ import {
     discordVoiceHandlerTemplate,
 } from "./templates.ts";
 
-import debounce from "lodash/debounce.js";
+import { getWavHeader } from "./utils.ts";
 
-export function getWavHeader(
-    audioLength: number,
-    sampleRate: number,
-    channelCount: number = 1,
-    bitsPerSample: number = 16
-): Buffer {
-    const wavHeader = Buffer.alloc(44);
-    wavHeader.write("RIFF", 0);
-    wavHeader.writeUInt32LE(36 + audioLength, 4); // Length of entire file in bytes minus 8
-    wavHeader.write("WAVE", 8);
-    wavHeader.write("fmt ", 12);
-    wavHeader.writeUInt32LE(16, 16); // Length of format data
-    wavHeader.writeUInt16LE(1, 20); // Type of format (1 is PCM)
-    wavHeader.writeUInt16LE(channelCount, 22); // Number of channels
-    wavHeader.writeUInt32LE(sampleRate, 24); // Sample rate
-    wavHeader.writeUInt32LE(
-        (sampleRate * bitsPerSample * channelCount) / 8,
-        28
-    ); // Byte rate
-    wavHeader.writeUInt16LE((bitsPerSample * channelCount) / 8, 32); // Block align ((BitsPerSample * Channels) / 8)
-    wavHeader.writeUInt16LE(bitsPerSample, 34); // Bits per sample
-    wavHeader.write("data", 36); // Data chunk header
-    wavHeader.writeUInt32LE(audioLength, 40); // Data chunk size
-    return wavHeader;
-}
+import debounce from "lodash/debounce.js";
 
 // These values are chosen for compatibility with picovoice components
 const DECODE_FRAME_SIZE = 1024;

From 1f2a7f3f802662bf1c05c3858a246b2361eb6618 Mon Sep 17 00:00:00 2001
From: Ting Chien Meng <tcm390@nyu.edu>
Date: Thu, 21 Nov 2024 13:59:52 -0500
Subject: [PATCH 16/19] utils file

---
 packages/client-discord/src/utils.ts | 217 +++++++++++++++++++++++++++
 1 file changed, 217 insertions(+)
 create mode 100644 packages/client-discord/src/utils.ts

diff --git a/packages/client-discord/src/utils.ts b/packages/client-discord/src/utils.ts
new file mode 100644
index 00000000000..a440fb98560
--- /dev/null
+++ b/packages/client-discord/src/utils.ts
@@ -0,0 +1,217 @@
+import {
+    Content,
+    IAgentRuntime,
+    ModelClass,
+    UUID,
+} from "@ai16z/eliza/src/types.ts";
+import { generateText, trimTokens } from "@ai16z/eliza/src/generation.ts";
+import { parseJSONObjectFromText } from "@ai16z/eliza/src/parsing.ts";
+import {
+    ChannelType,
+    Message as DiscordMessage,
+    PermissionsBitField,
+    TextChannel,
+    ThreadChannel,
+} from "discord.js";
+import { elizaLogger } from "@ai16z/eliza/src/logger.ts";
+
+export function getWavHeader(
+    audioLength: number,
+    sampleRate: number,
+    channelCount: number = 1,
+    bitsPerSample: number = 16
+): Buffer {
+    const wavHeader = Buffer.alloc(44);
+    wavHeader.write("RIFF", 0);
+    wavHeader.writeUInt32LE(36 + audioLength, 4); // Length of entire file in bytes minus 8
+    wavHeader.write("WAVE", 8);
+    wavHeader.write("fmt ", 12);
+    wavHeader.writeUInt32LE(16, 16); // Length of format data
+    wavHeader.writeUInt16LE(1, 20); // Type of format (1 is PCM)
+    wavHeader.writeUInt16LE(channelCount, 22); // Number of channels
+    wavHeader.writeUInt32LE(sampleRate, 24); // Sample rate
+    wavHeader.writeUInt32LE(
+        (sampleRate * bitsPerSample * channelCount) / 8,
+        28
+    ); // Byte rate
+    wavHeader.writeUInt16LE((bitsPerSample * channelCount) / 8, 32); // Block align ((BitsPerSample * Channels) / 8)
+    wavHeader.writeUInt16LE(bitsPerSample, 34); // Bits per sample
+    wavHeader.write("data", 36); // Data chunk header
+    wavHeader.writeUInt32LE(audioLength, 40); // Data chunk size
+    return wavHeader;
+}
+
+export async function generateSummary(
+    runtime: IAgentRuntime,
+    text: string
+): Promise<{ title: string; description: string }> {
+    // trim text to at most 100000 tokens (not characters) before building the prompt
+    text = trimTokens(text, 100000, "gpt-4o-mini"); // TODO: clean this up
+
+    const prompt = `Please generate a concise summary for the following text:
+  
+  Text: """
+  ${text}
+  """
+  
+  Respond with a JSON object in the following format:
+  \`\`\`json
+  {
+    "title": "Generated Title",
+    "summary": "Generated summary and/or description of the text"
+  }
+  \`\`\``;
+
+    const response = await generateText({
+        runtime,
+        context: prompt,
+        modelClass: ModelClass.SMALL,
+    });
+
+    const parsedResponse = parseJSONObjectFromText(response);
+
+    if (parsedResponse) {
+        return {
+            title: parsedResponse.title,
+            description: parsedResponse.summary,
+        };
+    }
+
+    return {
+        title: "",
+        description: "",
+    };
+}
+
+export type InterestChannels = {
+    [key: string]: {
+        lastMessageSent: number;
+        messages: { userId: UUID; userName: string; content: Content }[];
+    };
+};
+
+export function canSendMessage(channel) {
+    // if it is a DM channel, we can always send messages
+    if (channel.type === ChannelType.DM) {
+        return {
+            canSend: true,
+            reason: null,
+        };
+    }
+    const botMember = channel.guild?.members.cache.get(channel.client.user.id);
+
+    if (!botMember) {
+        return {
+            canSend: false,
+            reason: "Not a guild channel or bot member not found",
+        };
+    }
+
+    // Required permissions for sending messages
+    const requiredPermissions = [
+        PermissionsBitField.Flags.ViewChannel,
+        PermissionsBitField.Flags.SendMessages,
+        PermissionsBitField.Flags.ReadMessageHistory,
+    ];
+
+    // Add thread-specific permission if it's a thread
+    if (channel instanceof ThreadChannel) {
+        requiredPermissions.push(
+            PermissionsBitField.Flags.SendMessagesInThreads
+        );
+    }
+
+    // Check permissions
+    const permissions = channel.permissionsFor(botMember);
+
+    if (!permissions) {
+        return {
+            canSend: false,
+            reason: "Could not retrieve permissions",
+        };
+    }
+
+    // Check each required permission
+    const missingPermissions = requiredPermissions.filter(
+        (perm) => !permissions.has(perm)
+    );
+
+    return {
+        canSend: missingPermissions.length === 0,
+        missingPermissions: missingPermissions,
+        reason:
+            missingPermissions.length > 0
+                ? `Missing permissions: ${missingPermissions.map((p) => String(p)).join(", ")}`
+                : null,
+    };
+}
+
+const MAX_MESSAGE_LENGTH = 1900;
+
+export async function sendMessageInChunks(
+    channel: TextChannel,
+    content: string,
+    inReplyTo: string,
+    files: any[]
+): Promise<DiscordMessage[]> {
+    const sentMessages: DiscordMessage[] = [];
+    const messages = splitMessage(content);
+    try {
+        for (let i = 0; i < messages.length; i++) {
+            const message = messages[i];
+            if (
+                message.trim().length > 0 ||
+                (i === messages.length - 1 && files && files.length > 0)
+            ) {
+                const options: any = {
+                    content: message.trim(),
+                };
+
+                if (i === messages.length - 1 && files && files.length > 0) {
+                    // Attach files to the last message chunk
+                    options.files = files;
+                }
+
+                const m = await channel.send(options);
+                sentMessages.push(m);
+            }
+        }
+    } catch (error) {
+        elizaLogger.error("Error sending message:", error);
+    }
+
+    return sentMessages;
+}
+
+function splitMessage(content: string): string[] {
+    const messages: string[] = [];
+    let currentMessage = "";
+
+    const rawLines = content?.split("\n") || [];
+    // split all lines into MAX_MESSAGE_LENGTH chunks so any long lines are split
+    const lines = rawLines
+        .map((line) => {
+            const chunks = [];
+            while (line.length > MAX_MESSAGE_LENGTH) {
+                chunks.push(line.slice(0, MAX_MESSAGE_LENGTH));
+                line = line.slice(MAX_MESSAGE_LENGTH);
+            }
+            chunks.push(line);
+            return chunks;
+        })
+        .flat();
+
+    for (const line of lines) {
+        if (currentMessage.length + line.length + 1 > MAX_MESSAGE_LENGTH) {
+            messages.push(currentMessage.trim());
+            currentMessage = "";
+        }
+        currentMessage += line + "\n";
+    }
+
+    if (currentMessage.trim().length > 0) {
+        messages.push(currentMessage.trim());
+    }
+
+    return messages;
+}

From 5c863f1243962c6edf9c0ecd851a9bddea835950 Mon Sep 17 00:00:00 2001
From: Ting Chien Meng <tcm390@nyu.edu>
Date: Thu, 21 Nov 2024 14:03:50 -0500
Subject: [PATCH 17/19] clean code

---
 packages/client-discord/src/messages.ts | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/packages/client-discord/src/messages.ts b/packages/client-discord/src/messages.ts
index 8d4c2039bab..144c360f91d 100644
--- a/packages/client-discord/src/messages.ts
+++ b/packages/client-discord/src/messages.ts
@@ -19,15 +19,11 @@ import {
     UUID,
 } from "@ai16z/eliza/src/types.ts";
 import { stringToUuid } from "@ai16z/eliza/src/uuid.ts";
-import { generateText, trimTokens } from "@ai16z/eliza/src/generation.ts";
-import { parseJSONObjectFromText } from "@ai16z/eliza/src/parsing.ts";
 import {
     ChannelType,
     Client,
     Message as DiscordMessage,
-    PermissionsBitField,
     TextChannel,
-    ThreadChannel,
 } from "discord.js";
 import { elizaLogger } from "@ai16z/eliza/src/logger.ts";
 import { AttachmentManager } from "./attachments.ts";

From 107b885c27694d5f33c15e8db60871ef12401043 Mon Sep 17 00:00:00 2001
From: Ting Chien Meng <tcm390@nyu.edu>
Date: Thu, 21 Nov 2024 14:06:20 -0500
Subject: [PATCH 18/19] moved type from utils

---
 packages/client-discord/src/messages.ts | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/packages/client-discord/src/messages.ts b/packages/client-discord/src/messages.ts
index 144c360f91d..3400d06f8ea 100644
--- a/packages/client-discord/src/messages.ts
+++ b/packages/client-discord/src/messages.ts
@@ -36,10 +36,16 @@ import {
 import {
     canSendMessage,
     generateSummary,
-    InterestChannels,
     sendMessageInChunks,
 } from "./utils.ts";
 
+type InterestChannels = {
+    [key: string]: {
+        lastMessageSent: number;
+        messages: { userId: UUID; userName: string; content: Content }[];
+    };
+};
+
 export class MessageManager {
     private client: Client;
     private runtime: IAgentRuntime;

From c821a6da3effbc1fed93b3ab2cc8f95ffd8c5e29 Mon Sep 17 00:00:00 2001
From: Ting Chien Meng <tcm390@nyu.edu>
Date: Thu, 21 Nov 2024 14:06:29 -0500
Subject: [PATCH 19/19] moved type from utils

---
 packages/client-discord/src/utils.ts | 7 -------
 1 file changed, 7 deletions(-)

diff --git a/packages/client-discord/src/utils.ts b/packages/client-discord/src/utils.ts
index a440fb98560..6b6adf6e70c 100644
--- a/packages/client-discord/src/utils.ts
+++ b/packages/client-discord/src/utils.ts
@@ -83,13 +83,6 @@ export async function generateSummary(
     };
 }
 
-export type InterestChannels = {
-    [key: string]: {
-        lastMessageSent: number;
-        messages: { userId: UUID; userName: string; content: Content }[];
-    };
-};
-
 export function canSendMessage(channel) {
     // if it is a DM channel, we can always send messages
     if (channel.type === ChannelType.DM) {