diff --git a/packages/client-discord/src/messages.ts b/packages/client-discord/src/messages.ts index 4d233f658d..3400d06f8e 100644 --- a/packages/client-discord/src/messages.ts +++ b/packages/client-discord/src/messages.ts @@ -4,10 +4,6 @@ import { generateShouldRespond, } from "@ai16z/eliza/src/generation.ts"; import { embeddingZeroVector } from "@ai16z/eliza/src/memory.ts"; -import { - messageCompletionFooter, - shouldRespondFooter, -} from "@ai16z/eliza/src/parsing.ts"; import { Content, HandlerCallback, @@ -23,304 +19,33 @@ import { UUID, } from "@ai16z/eliza/src/types.ts"; import { stringToUuid } from "@ai16z/eliza/src/uuid.ts"; -import { generateText, trimTokens } from "@ai16z/eliza/src/generation.ts"; -import { parseJSONObjectFromText } from "@ai16z/eliza/src/parsing.ts"; import { ChannelType, Client, Message as DiscordMessage, - PermissionsBitField, TextChannel, - ThreadChannel, } from "discord.js"; import { elizaLogger } from "@ai16z/eliza/src/logger.ts"; import { AttachmentManager } from "./attachments.ts"; import { VoiceManager } from "./voice.ts"; -import { Service } from "@ai16z/eliza"; - -const MAX_MESSAGE_LENGTH = 1900; -async function generateSummary( - runtime: IAgentRuntime, - text: string -): Promise<{ title: string; description: string }> { - // make sure text is under 128k characters - text = trimTokens(text, 100000, "gpt-4o-mini"); // TODO: clean this up - - const prompt = `Please generate a concise summary for the following text: - - Text: """ - ${text} - """ - - Respond with a JSON object in the following format: - \`\`\`json - { - "title": "Generated Title", - "summary": "Generated summary and/or description of the text" - } - \`\`\``; - - const response = await generateText({ - runtime, - context: prompt, - modelClass: ModelClass.SMALL, - }); - - const parsedResponse = parseJSONObjectFromText(response); - - if (parsedResponse) { - return { - title: parsedResponse.title, - description: parsedResponse.summary, - }; - } +import { + discordShouldRespondTemplate, + discordMessageHandlerTemplate, +} from "./templates.ts"; - return { - title: "", - description: "", - }; -} +import { + canSendMessage, + generateSummary, + sendMessageInChunks, +} from "./utils.ts"; -export type InterestChannels = { +type InterestChannels = { [key: string]: { lastMessageSent: number; messages: { userId: UUID; userName: string; content: Content }[]; }; }; -const discordShouldRespondTemplate = - `# Task: Decide if {{agentName}} should respond. -About {{agentName}}: -{{bio}} - -# INSTRUCTIONS: Determine if {{agentName}} should respond to the message and participate in the conversation. Do not comment. Just respond with "RESPOND" or "IGNORE" or "STOP". - -# RESPONSE EXAMPLES -<user 1>: I just saw a really great movie -<user 2>: Oh? Which movie? -Result: [IGNORE] - -{{agentName}}: Oh, this is my favorite scene -<user 1>: sick -<user 2>: wait, why is it your favorite scene -Result: [RESPOND] - -<user>: stfu bot -Result: [STOP] - -<user>: Hey {{agent}}, can you help me with something -Result: [RESPOND] - -<user>: {{agentName}} stfu plz -Result: [STOP] - -<user>: i need help -{{agentName}}: how can I help you? -<user>: no. i need help from someone else -Result: [IGNORE] - -<user>: Hey {{agent}}, can I ask you a question -{{agentName}}: Sure, what is it -<user>: can you ask claude to create a basic react module that demonstrates a counter -Result: [RESPOND] - -<user>: {{agentName}} can you tell me a story -<user>: {about a girl named elara -{{agentName}}: Sure. -{{agentName}}: Once upon a time, in a quaint little village, there was a curious girl named Elara. -{{agentName}}: Elara was known for her adventurous spirit and her knack for finding beauty in the mundane. -<user>: I'm loving it, keep going -Result: [RESPOND] - -<user>: {{agentName}} stop responding plz -Result: [STOP] - -<user>: okay, i want to test something. can you say marco? -{{agentName}}: marco -<user>: great. okay, now do it again -Result: [RESPOND] - -Response options are [RESPOND], [IGNORE] and [STOP]. - -{{agentName}} is in a room with other users and is very worried about being annoying and saying too much. -Respond with [RESPOND] to messages that are directed at {{agentName}}, or participate in conversations that are interesting or relevant to their background. -If a message is not interesting or relevant, respond with [IGNORE] -Unless directly responding to a user, respond with [IGNORE] to messages that are very short or do not contain much information. -If a user asks {{agentName}} to be quiet, respond with [STOP] -If {{agentName}} concludes a conversation and isn't part of the conversation anymore, respond with [STOP] - -IMPORTANT: {{agentName}} is particularly sensitive about being annoying, so if there is any doubt, it is better to respond with [IGNORE]. -If {{agentName}} is conversing with a user and they have not asked to stop, it is better to respond with [RESPOND]. - -{{recentMessages}} - -# INSTRUCTIONS: Choose the option that best describes {{agentName}}'s response to the last message. Ignore messages if they are addressed to someone else. -` + shouldRespondFooter; - -export const discordMessageHandlerTemplate = - // {{goals}} - `# Action Examples -{{actionExamples}} -(Action examples are for reference only. Do not use the information from them in your response.) - -# Knowledge -{{knowledge}} - -# Task: Generate dialog and actions for the character {{agentName}}. -About {{agentName}}: -{{bio}} -{{lore}} - -Examples of {{agentName}}'s dialog and actions: -{{characterMessageExamples}} - -{{providers}} - -{{attachments}} - -{{actions}} - -# Capabilities -Note that {{agentName}} is capable of reading/seeing/hearing various forms of media, including images, videos, audio, plaintext and PDFs. Recent attachments have been included above under the "Attachments" section. - -{{messageDirections}} - -{{recentMessages}} - -# Instructions: Write the next message for {{agentName}}. Include an action, if appropriate. {{actionNames}} -` + messageCompletionFooter; - -export async function sendMessageInChunks( - channel: TextChannel, - content: string, - inReplyTo: string, - files: any[] -): Promise<DiscordMessage[]> { - const sentMessages: DiscordMessage[] = []; - const messages = splitMessage(content); - try { - for (let i = 0; i < messages.length; i++) { - const message = messages[i]; - if ( - message.trim().length > 0 || - (i === messages.length - 1 && files && files.length > 0) - ) { - const options: any = { - content: message.trim(), - }; - - // if (i === 0 && inReplyTo) { - // // Reply to the specified message for the first chunk - // options.reply = { - // messageReference: inReplyTo, - // }; - // } - - if (i === messages.length - 1 && files && files.length > 0) { - // Attach files to the last message chunk - options.files = files; - } - - const m = await channel.send(options); - sentMessages.push(m); - } - } - } catch (error) { - elizaLogger.error("Error sending message:", error); - } - - return sentMessages; -} - -function splitMessage(content: string): string[] { - const messages: string[] = []; - let currentMessage = ""; - - const rawLines = content?.split("\n") || []; - // split all lines into MAX_MESSAGE_LENGTH chunks so any long lines are split - const lines = rawLines - .map((line) => { - const chunks = []; - while (line.length > MAX_MESSAGE_LENGTH) { - chunks.push(line.slice(0, MAX_MESSAGE_LENGTH)); - line = line.slice(MAX_MESSAGE_LENGTH); - } - chunks.push(line); - return chunks; - }) - .flat(); - - for (const line of lines) { - if (currentMessage.length + line.length + 1 > MAX_MESSAGE_LENGTH) { - messages.push(currentMessage.trim()); - currentMessage = ""; - } - currentMessage += line + "\n"; - } - - if (currentMessage.trim().length > 0) { - messages.push(currentMessage.trim()); - } - - return messages; -} - -function canSendMessage(channel) { - // if it is a DM channel, we can always send messages - if (channel.type === ChannelType.DM) { - return { - canSend: true, - reason: null, - }; - } - const botMember = channel.guild?.members.cache.get(channel.client.user.id); - - if (!botMember) { - return { - canSend: false, - reason: "Not a guild channel or bot member not found", - }; - } - - // Required permissions for sending messages - const requiredPermissions = [ - PermissionsBitField.Flags.ViewChannel, - PermissionsBitField.Flags.SendMessages, - PermissionsBitField.Flags.ReadMessageHistory, - ]; - - // Add thread-specific permission if it's a thread - if (channel instanceof ThreadChannel) { - requiredPermissions.push( - PermissionsBitField.Flags.SendMessagesInThreads - ); - } - - // Check permissions - const permissions = channel.permissionsFor(botMember); - - if (!permissions) { - return { - canSend: false, - reason: "Could not retrieve permissions", - }; - } - - // Check each required permission - const missingPermissions = requiredPermissions.filter( - (perm) => !permissions.has(perm) - ); - - return { - canSend: missingPermissions.length === 0, - missingPermissions: missingPermissions, - reason: - missingPermissions.length > 0 - ? `Missing permissions: ${missingPermissions.map((p) => String(p)).join(", ")}` - : null, - }; -} - export class MessageManager { private client: Client; private runtime: IAgentRuntime; @@ -338,11 +63,7 @@ export class MessageManager { } async handleMessage(message: DiscordMessage) { - if ( - message.interaction || - message.author.id === - this.client.user?.id /* || message.author?.bot*/ - ) + if (message.interaction || message.author.id === this.client.user?.id) return; const userId = message.author.id as UUID; const userName = message.author.username; @@ -389,10 +110,10 @@ export class MessageManager { url: message.url, inReplyTo: message.reference?.messageId ? stringToUuid( - message.reference.messageId + - "-" + - this.runtime.agentId - ) + message.reference.messageId + + "-" + + this.runtime.agentId + ) : undefined, }; @@ -501,75 +222,49 @@ export class MessageManager { message.id + "-" + this.runtime.agentId ); } - if (message.channel.type === ChannelType.GuildVoice) { - // For voice channels, use text-to-speech - const audioStream = await ( - this.runtime.getService( - ServiceType.SPEECH_GENERATION - ) - ).getInstance<ISpeechService>() - .generate(this.runtime, content.text); - await this.voiceManager.playAudioStream( - userId, - audioStream - ); + + // For text channels, send the message + const messages = await sendMessageInChunks( + message.channel as TextChannel, + content.text, + message.id, + files + ); + + const memories: Memory[] = []; + for (const m of messages) { + let action = content.action; + // If there's only one message or it's the last message, keep the original action + // For multiple messages, set all but the last to 'CONTINUE' + if ( + messages.length > 1 && + m !== messages[messages.length - 1] + ) { + action = "CONTINUE"; + } + const memory: Memory = { id: stringToUuid( - message.id + "-" + this.runtime.agentId + m.id + "-" + this.runtime.agentId ), userId: this.runtime.agentId, agentId: this.runtime.agentId, - content, + content: { + ...content, + action, + inReplyTo: messageId, + url: m.url, + }, roomId, embedding: embeddingZeroVector, + createdAt: m.createdTimestamp, }; - return [memory]; - } else { - // For text channels, send the message - const messages = await sendMessageInChunks( - message.channel as TextChannel, - content.text, - message.id, - files - ); - - const memories: Memory[] = []; - for (const m of messages) { - let action = content.action; - // If there's only one message or it's the last message, keep the original action - // For multiple messages, set all but the last to 'CONTINUE' - if ( - messages.length > 1 && - m !== messages[messages.length - 1] - ) { - action = "CONTINUE"; - } - - const memory: Memory = { - id: stringToUuid( - m.id + "-" + this.runtime.agentId - ), - userId: this.runtime.agentId, - agentId: this.runtime.agentId, - content: { - ...content, - action, - inReplyTo: messageId, - url: m.url, - }, - roomId, - embedding: embeddingZeroVector, - createdAt: m.createdTimestamp, - }; - memories.push(memory); - } - for (const m of memories) { - await this.runtime.messageManager.createMemory( - m - ); - } - return memories; + memories.push(memory); } + for (const m of memories) { + await this.runtime.messageManager.createMemory(m); + } + return memories; } catch (error) { console.error("Error sending message:", error); return []; @@ -659,14 +354,15 @@ export class MessageManager { for (const url of urls) { if ( - this.runtime.getService(ServiceType.VIDEO) + this.runtime + .getService(ServiceType.VIDEO) .getInstance<IVideoService>() .isVideoUrl(url) ) { - const videoInfo = await (this.runtime + const videoInfo = await this.runtime .getService(ServiceType.VIDEO) .getInstance<IVideoService>() - .processVideo(url)); + .processVideo(url); attachments.push({ id: `youtube-${Date.now()}`, url: url, @@ -822,7 +518,6 @@ export class MessageManager { state: State ): Promise<boolean> { if (message.author.id === this.client.user?.id) return false; - // if (message.author.bot) return false; if (message.mentions.has(this.client.user?.id as string)) return true; const guild = message.guild; diff --git a/packages/client-discord/src/templates.ts b/packages/client-discord/src/templates.ts new file mode 100644 index 0000000000..18345fc02e --- /dev/null +++ b/packages/client-discord/src/templates.ts @@ -0,0 +1,126 @@ +import { + shouldRespondFooter, + messageCompletionFooter, +} from "@ai16z/eliza/src/parsing.ts"; + +export const discordShouldRespondTemplate = + `# Task: Decide if {{agentName}} should respond. +About {{agentName}}: +{{bio}} + +# INSTRUCTIONS: Determine if {{agentName}} should respond to the message and participate in the conversation. Do not comment. Just respond with "RESPOND" or "IGNORE" or "STOP". + +# RESPONSE EXAMPLES +<user 1>: I just saw a really great movie +<user 2>: Oh? Which movie? +Result: [IGNORE] + +{{agentName}}: Oh, this is my favorite scene +<user 1>: sick +<user 2>: wait, why is it your favorite scene +Result: [RESPOND] + +<user>: stfu bot +Result: [STOP] + +<user>: Hey {{agent}}, can you help me with something +Result: [RESPOND] + +<user>: {{agentName}} stfu plz +Result: [STOP] + +<user>: i need help +{{agentName}}: how can I help you? +<user>: no. i need help from someone else +Result: [IGNORE] + +<user>: Hey {{agent}}, can I ask you a question +{{agentName}}: Sure, what is it +<user>: can you ask claude to create a basic react module that demonstrates a counter +Result: [RESPOND] + +<user>: {{agentName}} can you tell me a story +<user>: {about a girl named elara +{{agentName}}: Sure. +{{agentName}}: Once upon a time, in a quaint little village, there was a curious girl named Elara. +{{agentName}}: Elara was known for her adventurous spirit and her knack for finding beauty in the mundane. +<user>: I'm loving it, keep going +Result: [RESPOND] + +<user>: {{agentName}} stop responding plz +Result: [STOP] + +<user>: okay, i want to test something. can you say marco? +{{agentName}}: marco +<user>: great. okay, now do it again +Result: [RESPOND] + +Response options are [RESPOND], [IGNORE] and [STOP]. + +{{agentName}} is in a room with other users and is very worried about being annoying and saying too much. +Respond with [RESPOND] to messages that are directed at {{agentName}}, or participate in conversations that are interesting or relevant to their background. +If a message is not interesting or relevant, respond with [IGNORE] +Unless directly responding to a user, respond with [IGNORE] to messages that are very short or do not contain much information. +If a user asks {{agentName}} to be quiet, respond with [STOP] +If {{agentName}} concludes a conversation and isn't part of the conversation anymore, respond with [STOP] + +IMPORTANT: {{agentName}} is particularly sensitive about being annoying, so if there is any doubt, it is better to respond with [IGNORE]. +If {{agentName}} is conversing with a user and they have not asked to stop, it is better to respond with [RESPOND]. + +{{recentMessages}} + +# INSTRUCTIONS: Choose the option that best describes {{agentName}}'s response to the last message. Ignore messages if they are addressed to someone else. +` + shouldRespondFooter; + +export const discordVoiceHandlerTemplate = + `# Task: Generate conversational voice dialog for {{agentName}}. +About {{agentName}}: +{{bio}} + +# Attachments +{{attachments}} + +# Capabilities +Note that {{agentName}} is capable of reading/seeing/hearing various forms of media, including images, videos, audio, plaintext and PDFs. Recent attachments have been included above under the "Attachments" section. + +{{actions}} + +{{messageDirections}} + +{{recentMessages}} + +# Instructions: Write the next message for {{agentName}}. Include an optional action if appropriate. {{actionNames}} +` + messageCompletionFooter; + +export const discordMessageHandlerTemplate = + // {{goals}} + `# Action Examples +{{actionExamples}} +(Action examples are for reference only. Do not use the information from them in your response.) + +# Knowledge +{{knowledge}} + +# Task: Generate dialog and actions for the character {{agentName}}. +About {{agentName}}: +{{bio}} +{{lore}} + +Examples of {{agentName}}'s dialog and actions: +{{characterMessageExamples}} + +{{providers}} + +{{attachments}} + +{{actions}} + +# Capabilities +Note that {{agentName}} is capable of reading/seeing/hearing various forms of media, including images, videos, audio, plaintext and PDFs. Recent attachments have been included above under the "Attachments" section. + +{{messageDirections}} + +{{recentMessages}} + +# Instructions: Write the next message for {{agentName}}. Include an action, if appropriate. {{actionNames}} +` + messageCompletionFooter; diff --git a/packages/client-discord/src/utils.ts b/packages/client-discord/src/utils.ts new file mode 100644 index 0000000000..6b6adf6e70 --- /dev/null +++ b/packages/client-discord/src/utils.ts @@ -0,0 +1,210 @@ +import { + Content, + IAgentRuntime, + ModelClass, + UUID, +} from "@ai16z/eliza/src/types.ts"; +import { generateText, trimTokens } from "@ai16z/eliza/src/generation.ts"; +import { parseJSONObjectFromText } from "@ai16z/eliza/src/parsing.ts"; +import { + ChannelType, + Message as DiscordMessage, + PermissionsBitField, + TextChannel, + ThreadChannel, +} from "discord.js"; +import { elizaLogger } from "@ai16z/eliza/src/logger.ts"; + +export function getWavHeader( + audioLength: number, + sampleRate: number, + channelCount: number = 1, + bitsPerSample: number = 16 +): Buffer { + const wavHeader = Buffer.alloc(44); + wavHeader.write("RIFF", 0); + wavHeader.writeUInt32LE(36 + audioLength, 4); // Length of entire file in bytes minus 8 + wavHeader.write("WAVE", 8); + wavHeader.write("fmt ", 12); + wavHeader.writeUInt32LE(16, 16); // Length of format data + wavHeader.writeUInt16LE(1, 20); // Type of format (1 is PCM) + wavHeader.writeUInt16LE(channelCount, 22); // Number of channels + wavHeader.writeUInt32LE(sampleRate, 24); // Sample rate + wavHeader.writeUInt32LE( + (sampleRate * bitsPerSample * channelCount) / 8, + 28 + ); // Byte rate + wavHeader.writeUInt16LE((bitsPerSample * channelCount) / 8, 32); // Block align ((BitsPerSample * Channels) / 8) + wavHeader.writeUInt16LE(bitsPerSample, 34); // Bits per sample + wavHeader.write("data", 36); // Data chunk header + wavHeader.writeUInt32LE(audioLength, 40); // Data chunk size + return wavHeader; +} + +export async function generateSummary( + runtime: IAgentRuntime, + text: string +): Promise<{ title: string; description: string }> { + // make sure text is under 128k characters + text = trimTokens(text, 100000, "gpt-4o-mini"); // TODO: clean this up + + const prompt = `Please generate a concise summary for the following text: + + Text: """ + ${text} + """ + + Respond with a JSON object in the following format: + \`\`\`json + { + "title": "Generated Title", + "summary": "Generated summary and/or description of the text" + } + \`\`\``; + + const response = await generateText({ + runtime, + context: prompt, + modelClass: ModelClass.SMALL, + }); + + const parsedResponse = parseJSONObjectFromText(response); + + if (parsedResponse) { + return { + title: parsedResponse.title, + description: parsedResponse.summary, + }; + } + + return { + title: "", + description: "", + }; +} + +export function canSendMessage(channel) { + // if it is a DM channel, we can always send messages + if (channel.type === ChannelType.DM) { + return { + canSend: true, + reason: null, + }; + } + const botMember = channel.guild?.members.cache.get(channel.client.user.id); + + if (!botMember) { + return { + canSend: false, + reason: "Not a guild channel or bot member not found", + }; + } + + // Required permissions for sending messages + const requiredPermissions = [ + PermissionsBitField.Flags.ViewChannel, + PermissionsBitField.Flags.SendMessages, + PermissionsBitField.Flags.ReadMessageHistory, + ]; + + // Add thread-specific permission if it's a thread + if (channel instanceof ThreadChannel) { + requiredPermissions.push( + PermissionsBitField.Flags.SendMessagesInThreads + ); + } + + // Check permissions + const permissions = channel.permissionsFor(botMember); + + if (!permissions) { + return { + canSend: false, + reason: "Could not retrieve permissions", + }; + } + + // Check each required permission + const missingPermissions = requiredPermissions.filter( + (perm) => !permissions.has(perm) + ); + + return { + canSend: missingPermissions.length === 0, + missingPermissions: missingPermissions, + reason: + missingPermissions.length > 0 + ? `Missing permissions: ${missingPermissions.map((p) => String(p)).join(", ")}` + : null, + }; +} + +const MAX_MESSAGE_LENGTH = 1900; + +export async function sendMessageInChunks( + channel: TextChannel, + content: string, + inReplyTo: string, + files: any[] +): Promise<DiscordMessage[]> { + const sentMessages: DiscordMessage[] = []; + const messages = splitMessage(content); + try { + for (let i = 0; i < messages.length; i++) { + const message = messages[i]; + if ( + message.trim().length > 0 || + (i === messages.length - 1 && files && files.length > 0) + ) { + const options: any = { + content: message.trim(), + }; + + if (i === messages.length - 1 && files && files.length > 0) { + // Attach files to the last message chunk + options.files = files; + } + + const m = await channel.send(options); + sentMessages.push(m); + } + } + } catch (error) { + elizaLogger.error("Error sending message:", error); + } + + return sentMessages; +} + +function splitMessage(content: string): string[] { + const messages: string[] = []; + let currentMessage = ""; + + const rawLines = content?.split("\n") || []; + // split all lines into MAX_MESSAGE_LENGTH chunks so any long lines are split + const lines = rawLines + .map((line) => { + const chunks = []; + while (line.length > MAX_MESSAGE_LENGTH) { + chunks.push(line.slice(0, MAX_MESSAGE_LENGTH)); + line = line.slice(MAX_MESSAGE_LENGTH); + } + chunks.push(line); + return chunks; + }) + .flat(); + + for (const line of lines) { + if (currentMessage.length + line.length + 1 > MAX_MESSAGE_LENGTH) { + messages.push(currentMessage.trim()); + currentMessage = ""; + } + currentMessage += line + "\n"; + } + + if (currentMessage.trim().length > 0) { + messages.push(currentMessage.trim()); + } + + return messages; +} diff --git a/packages/client-discord/src/voice.ts b/packages/client-discord/src/voice.ts index 744a8106b8..48d3512390 100644 --- a/packages/client-discord/src/voice.ts +++ b/packages/client-discord/src/voice.ts @@ -1,4 +1,5 @@ import { + AudioPlayer, AudioReceiveStream, NoSubscriberBehavior, StreamType, @@ -21,7 +22,10 @@ import EventEmitter from "events"; import prism from "prism-media"; import { Readable, pipeline } from "stream"; import { composeContext } from "@ai16z/eliza/src/context.ts"; -import { generateMessageResponse } from "@ai16z/eliza/src/generation.ts"; +import { + generateMessageResponse, + generateShouldRespond, +} from "@ai16z/eliza/src/generation.ts"; import { embeddingZeroVector } from "@ai16z/eliza/src/memory.ts"; import { Content, @@ -31,60 +35,19 @@ import { ITranscriptionService, Memory, ModelClass, - Service, ServiceType, State, UUID, } from "@ai16z/eliza/src/types.ts"; import { stringToUuid } from "@ai16z/eliza/src/uuid.ts"; +import { + discordShouldRespondTemplate, + discordVoiceHandlerTemplate, +} from "./templates.ts"; -export function getWavHeader( - audioLength: number, - sampleRate: number, - channelCount: number = 1, - bitsPerSample: number = 16 -): Buffer { - const wavHeader = Buffer.alloc(44); - wavHeader.write("RIFF", 0); - wavHeader.writeUInt32LE(36 + audioLength, 4); // Length of entire file in bytes minus 8 - wavHeader.write("WAVE", 8); - wavHeader.write("fmt ", 12); - wavHeader.writeUInt32LE(16, 16); // Length of format data - wavHeader.writeUInt16LE(1, 20); // Type of format (1 is PCM) - wavHeader.writeUInt16LE(channelCount, 22); // Number of channels - wavHeader.writeUInt32LE(sampleRate, 24); // Sample rate - wavHeader.writeUInt32LE( - (sampleRate * bitsPerSample * channelCount) / 8, - 28 - ); // Byte rate - wavHeader.writeUInt16LE((bitsPerSample * channelCount) / 8, 32); // Block align ((BitsPerSample * Channels) / 8) - wavHeader.writeUInt16LE(bitsPerSample, 34); // Bits per sample - wavHeader.write("data", 36); // Data chunk header - wavHeader.writeUInt32LE(audioLength, 40); // Data chunk size - return wavHeader; -} - -import { messageCompletionFooter } from "@ai16z/eliza/src/parsing.ts"; - -const discordVoiceHandlerTemplate = - `# Task: Generate conversational voice dialog for {{agentName}}. -About {{agentName}}: -{{bio}} - -# Attachments -{{attachments}} - -# Capabilities -Note that {{agentName}} is capable of reading/seeing/hearing various forms of media, including images, videos, audio, plaintext and PDFs. Recent attachments have been included above under the "Attachments" section. - -{{actions}} - -{{messageDirections}} - -{{recentMessages}} +import { getWavHeader } from "./utils.ts"; -# Instructions: Write the next message for {{agentName}}. Include an optional action if appropriate. {{actionNames}} -` + messageCompletionFooter; +import debounce from "lodash/debounce.js"; // These values are chosen for compatibility with picovoice components const DECODE_FRAME_SIZE = 1024; @@ -106,7 +69,6 @@ export class AudioMonitor { this.readable = readable; this.maxSize = maxSize; this.readable.on("data", (chunk: Buffer) => { - //console.log('AudioMonitor got data'); if (this.lastFlagged < 0) { this.lastFlagged = this.buffers.length; } @@ -175,6 +137,16 @@ export class AudioMonitor { } export class VoiceManager extends EventEmitter { + private userStates: Map< + string, + { + buffers: Buffer[]; + totalLength: number; + lastActive: number; + transcriptionText: string; + } + > = new Map(); + private activeAudioPlayer: AudioPlayer | null = null; private client: Client; private runtime: IAgentRuntime; private streams: Map<string, Readable> = new Map(); @@ -236,14 +208,21 @@ export class VoiceManager extends EventEmitter { }); for (const [, member] of channel.members) { - if (!member.user.bot) { + if (member && !member.user.bot) { this.monitorMember(member, channel); } } - connection.receiver.speaking.on("start", (userId: string) => { - const user = channel.members.get(userId); - if (!user?.user.bot) { + connection.receiver.speaking.on("start", async (userId: string) => { + let user = channel.members.get(userId); + if (!user) { + try { + user = await channel.guild.members.fetch(userId); + } catch (error) { + console.error("Failed to fetch user:", error); + } + } + if (user && !user?.user.bot) { this.monitorMember(user as GuildMember, channel); this.streams.get(userId)?.emit("speakingStarted"); } @@ -277,6 +256,37 @@ export class VoiceManager extends EventEmitter { rate: DECODE_SAMPLE_RATE, frameSize: DECODE_FRAME_SIZE, }); + const volumeBuffer: number[] = []; + const VOLUME_WINDOW_SIZE = 30; + const SPEAKING_THRESHOLD = 0.05; + opusDecoder.on("data", (pcmData: Buffer) => { + // Monitor the audio volume while the agent is speaking. + // If the average volume of the user's audio exceeds the defined threshold, it indicates active speaking. + // When active speaking is detected, stop the agent's current audio playback to avoid overlap. + + if (this.activeAudioPlayer) { + const samples = new Int16Array( + pcmData.buffer, + pcmData.byteOffset, + pcmData.length / 2 + ); + const maxAmplitude = Math.max(...samples.map(Math.abs)) / 32768; + volumeBuffer.push(maxAmplitude); + + if (volumeBuffer.length > VOLUME_WINDOW_SIZE) { + volumeBuffer.shift(); + } + const avgVolume = + volumeBuffer.reduce((sum, v) => sum + v, 0) / + VOLUME_WINDOW_SIZE; + + if (avgVolume > SPEAKING_THRESHOLD) { + volumeBuffer.length = 0; + this.cleanupAudioPlayer(this.activeAudioPlayer); + } + } + }); + pipeline( receiveStream as AudioReceiveStream, opusDecoder as any, @@ -351,7 +361,6 @@ export class VoiceManager extends EventEmitter { async handleGuildCreate(guild: Guild) { console.log(`Joined guild ${guild.name}`); - // this.scanGuild(guild); } async handleUserStream( @@ -361,237 +370,332 @@ export class VoiceManager extends EventEmitter { channel: BaseGuildVoiceChannel, audioStream: Readable ) { + console.log(`Starting audio monitor for user: ${userId}`); const channelId = channel.id; - const buffers: Buffer[] = []; - let totalLength = 0; - const maxSilenceTime = 1000; // Maximum pause duration in milliseconds - const minSilenceTime = 50; // Minimum silence duration to trigger transcription - let lastChunkTime = Date.now(); - let transcriptionStarted = false; - let transcriptionText = ""; - console.log("new audio monitor for: ", userId); + if (!this.userStates.has(userId)) { + this.userStates.set(userId, { + buffers: [], + totalLength: 0, + lastActive: Date.now(), + transcriptionText: "", + }); + } + + const state = this.userStates.get(userId); + + const DEBOUNCE_TRANSCRIPTION_THRESHOLD = 1500; // wait for 1.5 seconds of silence + + const debouncedProcessTranscription = debounce(async () => { + await this.processTranscription( + userId, + channelId, + channel, + name, + userName + ); + }, DEBOUNCE_TRANSCRIPTION_THRESHOLD); + + const processBuffer = async (buffer: Buffer) => { + try { + state!.buffers.push(buffer); + state!.totalLength += buffer.length; + state!.lastActive = Date.now(); + + debouncedProcessTranscription(); + } catch (error) { + console.error( + `Error processing buffer for user ${userId}:`, + error + ); + } + }; const monitor = new AudioMonitor( audioStream, 10000000, async (buffer) => { - console.log("buffer: ", buffer); - const currentTime = Date.now(); - const silenceDuration = currentTime - lastChunkTime; if (!buffer) { - // Handle error - console.error("Empty buffer received"); + console.error("Received empty buffer"); return; } - buffers.push(buffer); - totalLength += buffer.length; - lastChunkTime = currentTime; - - if (silenceDuration > minSilenceTime && !transcriptionStarted) { - transcriptionStarted = true; - const inputBuffer = Buffer.concat(buffers, totalLength); - buffers.length = 0; - totalLength = 0; - - try { - // Convert Opus to WAV and add the header - const wavBuffer = - await this.convertOpusToWav(inputBuffer); - - console.log("starting transcription"); - const text = await this.runtime - .getService(ServiceType.TRANSCRIPTION) - .getInstance<ITranscriptionService>() - .transcribe(wavBuffer); - console.log("transcribed text: ", text); - transcriptionText += text; - } catch (error) { - console.error("Error processing audio stream:", error); - } + await processBuffer(buffer); + } + ); + } + + private async processTranscription( + userId: UUID, + channelId: string, + channel: BaseGuildVoiceChannel, + name: string, + userName: string + ) { + const state = this.userStates.get(userId); + if (!state || state.buffers.length === 0) return; + try { + const inputBuffer = Buffer.concat(state.buffers, state.totalLength); + state.buffers.length = 0; // Clear the buffers + state.totalLength = 0; + + // Convert Opus to WAV + const wavBuffer = await this.convertOpusToWav(inputBuffer); + + console.log("Starting transcription..."); + + const transcriptionText = await this.runtime + .getService(ServiceType.TRANSCRIPTION) + .getInstance<ITranscriptionService>() + .transcribe(wavBuffer); + + function isValidTranscription(text: string): boolean { + if (!text || text.includes("[BLANK_AUDIO]")) return false; + return true; + } + + if (transcriptionText && isValidTranscription(transcriptionText)) { + state.transcriptionText += transcriptionText; + } + + if (state.transcriptionText.length) { + this.cleanupAudioPlayer(this.activeAudioPlayer); + const finalText = state.transcriptionText; + state.transcriptionText = ""; + await this.handleUserMessage( + finalText, + userId, + channelId, + channel, + name, + userName + ); + } + } catch (error) { + console.error( + `Error transcribing audio for user ${userId}:`, + error + ); + } + } + + private async handleUserMessage( + message: string, + userId: UUID, + channelId: string, + channel: BaseGuildVoiceChannel, + name: string, + userName: string + ) { + try { + const roomId = stringToUuid(channelId + "-" + this.runtime.agentId); + const userIdUUID = stringToUuid(userId); + + await this.runtime.ensureConnection( + userIdUUID, + roomId, + userName, + name, + "discord" + ); + + let state = await this.runtime.composeState( + { + agentId: this.runtime.agentId, + content: { text: message, source: "Discord" }, + userId: userIdUUID, + roomId, + }, + { + discordChannel: channel, + discordClient: this.client, + agentName: this.runtime.character.name, } + ); - if (silenceDuration > maxSilenceTime && transcriptionStarted) { - console.log("transcription finished"); - transcriptionStarted = false; + if (message && message.startsWith("/")) { + return null; + } - if (!transcriptionText) return; + const memory = { + id: stringToUuid(channelId + "-voice-message-" + Date.now()), + agentId: this.runtime.agentId, + content: { + text: message, + source: "discord", + url: channel.url, + }, + userId: userIdUUID, + roomId, + embedding: embeddingZeroVector, + createdAt: Date.now(), + }; + + if (!memory.content.text) { + return { text: "", action: "IGNORE" }; + } - try { - const text = transcriptionText; + await this.runtime.messageManager.createMemory(memory); - // handle whisper cases - if ( - (text.length < 15 && - text.includes("[BLANK_AUDIO]")) || - (text.length < 5 && - text.toLowerCase().includes("you")) - ) { - transcriptionText = ""; // Reset transcription text - return; - } + state = await this.runtime.updateRecentMessageState(state); - const roomId = stringToUuid( - channelId + "-" + this.runtime.agentId - ); - const userIdUUID = stringToUuid(userId); - - await this.runtime.ensureConnection( - userIdUUID, - roomId, - userName, - name, - "discord" - ); + const shouldIgnore = await this._shouldIgnore(memory); - let state = await this.runtime.composeState( - { - agentId: this.runtime.agentId, - content: { text: text, source: "Discord" }, - userId: userIdUUID, - roomId, - }, - { - discordChannel: channel, - discordClient: this.client, - agentName: this.runtime.character.name, - } - ); + if (shouldIgnore) { + return { text: "", action: "IGNORE" }; + } - if (text && text.startsWith("/")) { - transcriptionText = ""; // Reset transcription text - return null; - } - - const memory = { - id: stringToUuid( - channelId + "-voice-message-" + Date.now() - ), - agentId: this.runtime.agentId, - content: { - text: text, - source: "discord", - url: channel.url, - }, - userId: userIdUUID, - roomId, - embedding: embeddingZeroVector, - createdAt: Date.now(), - }; - - if (!memory.content.text) { - transcriptionText = ""; // Reset transcription text - return { text: "", action: "IGNORE" }; - } - - await this.runtime.messageManager.createMemory(memory); - - state = - await this.runtime.updateRecentMessageState(state); - - const shouldIgnore = await this._shouldIgnore(memory); - - if (shouldIgnore) { - transcriptionText = ""; // Reset transcription text - return { text: "", action: "IGNORE" }; - } - - const context = composeContext({ - state, - template: - this.runtime.character.templates - ?.discordVoiceHandlerTemplate || - this.runtime.character.templates - ?.messageHandlerTemplate || - discordVoiceHandlerTemplate, - }); - - const responseContent = await this._generateResponse( - memory, - state, - context - ); + const shouldRespond = await this._shouldRespond( + message, + userId, + channel, + state + ); - const callback: HandlerCallback = async ( - content: Content - ) => { - console.log("callback content: ", content); - const { roomId } = memory; - - const responseMemory: Memory = { - id: stringToUuid( - memory.id + "-voice-response-" + Date.now() - ), - agentId: this.runtime.agentId, - userId: this.runtime.agentId, - content: { - ...content, - user: this.runtime.character.name, - inReplyTo: memory.id, - }, - roomId, - embedding: embeddingZeroVector, - }; - - if (responseMemory.content.text?.trim()) { - await this.runtime.messageManager.createMemory( - responseMemory - ); - state = - await this.runtime.updateRecentMessageState( - state - ); - const responseStream = await this.runtime - .getService(ServiceType.SPEECH_GENERATION) - .getInstance<ISpeechService>() - .generate(this.runtime, content.text); - - if (responseStream) { - await this.playAudioStream( - userId, - responseStream as Readable - ); - } - await this.runtime.evaluate(memory, state); - } else { - console.warn("Empty response, skipping"); - } - return [responseMemory]; - }; - - const responseMemories = - await callback(responseContent); - - const response = responseContent; - - const content = (response.responseMessage || - response.content || - response.message) as string; - - if (!content) { - transcriptionText = ""; // Reset transcription text - return null; - } - - console.log("responseMemories: ", responseMemories); - - await this.runtime.processActions( - memory, - responseMemories, - state, - callback - ); + if (!shouldRespond) { + return; + } + + const context = composeContext({ + state, + template: + this.runtime.character.templates + ?.discordVoiceHandlerTemplate || + this.runtime.character.templates?.messageHandlerTemplate || + discordVoiceHandlerTemplate, + }); + + const responseContent = await this._generateResponse( + memory, + state, + context + ); + + const callback: HandlerCallback = async (content: Content) => { + console.log("callback content: ", content); + const { roomId } = memory; + + const responseMemory: Memory = { + id: stringToUuid( + memory.id + "-voice-response-" + Date.now() + ), + agentId: this.runtime.agentId, + userId: this.runtime.agentId, + content: { + ...content, + user: this.runtime.character.name, + inReplyTo: memory.id, + }, + roomId, + embedding: embeddingZeroVector, + }; + + if (responseMemory.content.text?.trim()) { + await this.runtime.messageManager.createMemory( + responseMemory + ); + state = await this.runtime.updateRecentMessageState(state); + + const responseStream = await this.runtime + .getService(ServiceType.SPEECH_GENERATION) + .getInstance<ISpeechService>() + .generate(this.runtime, content.text); - transcriptionText = ""; // Reset transcription text - } catch (error) { - console.error( - "Error processing transcribed text:", - error + if (responseStream) { + await this.playAudioStream( + userId, + responseStream as Readable ); - transcriptionText = ""; // Reset transcription text } + + await this.runtime.evaluate(memory, state); + } else { + console.warn("Empty response, skipping"); } + return [responseMemory]; + }; + + const responseMemories = await callback(responseContent); + + const response = responseContent; + + const content = (response.responseMessage || + response.content || + response.message) as string; + + if (!content) { + return null; } - ); + + console.log("responseMemories: ", responseMemories); + + await this.runtime.processActions( + memory, + responseMemories, + state, + callback + ); + } catch (error) { + console.error("Error processing transcribed text:", error); + } + } + + private async _shouldRespond( + message: string, + userId: UUID, + channel: BaseGuildVoiceChannel, + state: State + ): Promise<boolean> { + if (userId === this.client.user?.id) return false; + const lowerMessage = message.toLowerCase(); + const botName = this.client.user.username.toLowerCase(); + const characterName = this.runtime.character.name.toLowerCase(); + const guild = channel.guild; + const member = guild?.members.cache.get(this.client.user?.id as string); + const nickname = member?.nickname; + + if ( + lowerMessage.includes(botName as string) || + lowerMessage.includes(characterName) || + lowerMessage.includes( + this.client.user?.tag.toLowerCase() as string + ) || + (nickname && lowerMessage.includes(nickname.toLowerCase())) + ) { + return true; + } + + if (!channel.guild) { + return true; + } + + // If none of the above conditions are met, use the generateText to decide + const shouldRespondContext = composeContext({ + state, + template: + this.runtime.character.templates + ?.discordShouldRespondTemplate || + this.runtime.character.templates?.shouldRespondTemplate || + discordShouldRespondTemplate, + }); + + const response = await generateShouldRespond({ + runtime: this.runtime, + context: shouldRespondContext, + modelClass: ModelClass.SMALL, + }); + + if (response === "RESPOND") { + return true; + } else if (response === "IGNORE") { + return false; + } else if (response === "STOP") { + return false; + } else { + console.error( + "Invalid response from response generateText:", + response + ); + return false; + } } private async convertOpusToWav(pcmBuffer: Buffer): Promise<Buffer> { @@ -696,24 +800,44 @@ export class VoiceManager extends EventEmitter { } async scanGuild(guild: Guild) { - const channels = (await guild.channels.fetch()).filter( - (channel) => channel?.type == ChannelType.GuildVoice - ); let chosenChannel: BaseGuildVoiceChannel | null = null; - for (const [, channel] of channels) { - const voiceChannel = channel as BaseGuildVoiceChannel; - if ( - voiceChannel.members.size > 0 && - (chosenChannel === null || - voiceChannel.members.size > chosenChannel.members.size) - ) { - chosenChannel = voiceChannel; + try { + const channelId = this.runtime.getSetting( + "DISCORD_VOICE_CHANNEL_ID" + ) as string; + if (channelId) { + const channel = await guild.channels.fetch(channelId); + if (channel?.isVoiceBased()) { + chosenChannel = channel as BaseGuildVoiceChannel; + } } - } - if (chosenChannel != null) { - this.joinChannel(chosenChannel); + if (!chosenChannel) { + const channels = (await guild.channels.fetch()).filter( + (channel) => channel?.type == ChannelType.GuildVoice + ); + for (const [, channel] of channels) { + const voiceChannel = channel as BaseGuildVoiceChannel; + if ( + voiceChannel.members.size > 0 && + (chosenChannel === null || + voiceChannel.members.size > + chosenChannel.members.size) + ) { + chosenChannel = voiceChannel; + } + } + } + + if (chosenChannel) { + console.log(`Joining channel: ${chosenChannel.name}`); + await this.joinChannel(chosenChannel); + } else { + console.warn("No suitable voice channel found to join."); + } + } catch (error) { + console.error("Error selecting or joining a voice channel:", error); } } @@ -723,11 +847,15 @@ export class VoiceManager extends EventEmitter { console.log(`No connection for user ${userId}`); return; } + + this.cleanupAudioPlayer(this.activeAudioPlayer); + const audioPlayer = createAudioPlayer({ behaviors: { noSubscriber: NoSubscriberBehavior.Pause, }, }); + this.activeAudioPlayer = audioPlayer; connection.subscribe(audioPlayer); const audioStartTime = Date.now(); @@ -737,21 +865,33 @@ export class VoiceManager extends EventEmitter { }); audioPlayer.play(resource); - audioPlayer.on("error", (err: any) => { + const handleError = (err: any) => { console.log(`Audio player error: ${err}`); - }); - - audioPlayer.on( - "stateChange", - (oldState: any, newState: { status: string }) => { - if (newState.status == "idle") { - const idleTime = Date.now(); - console.log( - `Audio playback took: ${idleTime - audioStartTime}ms` - ); - } + }; + const handleStateChange = ( + oldState: any, + newState: { status: string } + ) => { + if (newState.status == "idle") { + const idleTime = Date.now(); + console.log( + `Audio playback took: ${idleTime - audioStartTime}ms` + ); } - ); + }; + + audioPlayer.on("stateChange", handleStateChange); + audioPlayer.on("error", handleError); + } + + cleanupAudioPlayer(audioPlayer: AudioPlayer) { + if (!audioPlayer) return; + + audioPlayer.stop(); + audioPlayer.removeAllListeners(); + if (audioPlayer === this.activeAudioPlayer) { + this.activeAudioPlayer = null; + } } async handleJoinChannelCommand(interaction: any) { diff --git a/packages/core/src/types.ts b/packages/core/src/types.ts index 040b743a68..88f0bcdfd9 100644 --- a/packages/core/src/types.ts +++ b/packages/core/src/types.ts @@ -503,15 +503,17 @@ export interface IMemoryManager { } export abstract class Service { - private static instance: Service | null = null; + private static instances: Map<any, Service> = new Map(); static serviceType: ServiceType; public static getInstance<T extends Service>(): T { - if (!Service.instance) { - // Use this.prototype.constructor to instantiate the concrete class - Service.instance = new (this as any)(); + if (!Service.instances.has(this)) { + Service.instances.set( + this, + new (this as unknown as { new (): T })() + ); } - return Service.instance as T; + return Service.instances.get(this) as T; } }