Merge branch 'image-gen' of https://github.com/ai16z/eliza into HEAD

lalalune · lalalune · commit 0e9b1e846515 · 2024-10-31T17:48:11.000-07:00
diff --git a/package-lock.json b/package-lock.json
diff --git a/package.json b/package.json
@@ -39,7 +39,6 @@
     "@rollup/plugin-json": "6.1.0",
     "@rollup/plugin-node-resolve": "15.3.0",
     "@rollup/plugin-replace": "6.0.1",
-    "@rollup/plugin-terser": "0.1.0",
     "@rollup/plugin-typescript": "12.1.1",
     "@types/better-sqlite3": "7.6.11",
     "@types/fluent-ffmpeg": "2.1.27",
@@ -75,6 +74,7 @@
     "@ai-sdk/openai": "^0.0.70",
     "@anthropic-ai/sdk": "^0.30.1",
     "@cliqz/adblocker-playwright": "1.34.0",
+    "@diffusionstudio/vits-web": "1.0.2",
     "@discordjs/opus": "github:discordjs/opus",
     "@discordjs/rest": "2.4.0",
     "@discordjs/voice": "0.17.0",
diff --git a/src/actions/ImageGeneration.ts b/src/actions/ImageGeneration.ts
@@ -20,35 +20,34 @@ export default {
         state = (await runtime.composeState(message)) as State;
         const userId = runtime.agentId;
         
-        const imagePrompt = "";
+        const imagePrompt = message.content.text;
         const res: { image: string, caption: string }[] = [];
         const images = await generateImage({
-            apiKey: runtime.getSetting("ANTHROPIC_API_KEY"),
             prompt: imagePrompt,
             width: 1024,
             height: 1024,
-            steps: 4,
             count: 1
-        })
+        }, runtime);
         if (images.success && images.data && images.data.length > 0) {
             for(let i = 0; i < images.data.length; i++) {
                 const image = images.data[i];
                 const caption = await generateCaption({
-                    apiKey: runtime.getSetting("ANTHROPIC_API_KEY"),
                     imageUrl: image
-                })
-                if (caption.success) {
-                    res.push({image: image, caption: caption.caption});
-                } else {
-                    console.error("Failed to generate caption for image", image, caption.error);
-                    res.push({image: image, caption: "Uncaptioned image"});
-                }
+                }, runtime);
+                res.push({image: image, caption: caption.title})
+                callback({
+                    text: caption.description,
+                    attachments: [{
+                        id: crypto.randomUUID(),
+                        url: image,
+                        title: "Generated image",
+                        source: "imageGeneration",
+                        description: caption.title,
+                        text: caption.description
+                    }]
+                }, [])
             }
         }
-        callback(null, {
-            success: true,
-            data: res
-        });
     },
     examples: [
         [
diff --git a/src/actions/imageGenerationUtils.ts b/src/actions/imageGenerationUtils.ts
@@ -1,37 +1,39 @@
 // TODO: Replace with the vercel ai sdk and support all providers
-import Anthropic from "@anthropic-ai/sdk";
 import { Buffer } from 'buffer';
 import Together from "together-ai";
+import { IAgentRuntime } from "../core/types";
+import { getModel, ImageGenModel } from "../core/imageGenModels.ts";
+import OpenAI from "openai";
 
 export const generateImage = async (data: {
-    apiKey: string, 
     prompt: string, 
     width: number, 
     height: number, 
-    steps?: number, 
     count?: number
-}): Promise<{
+}, runtime: IAgentRuntime): Promise<{
     success: boolean,
     data?: string[],
     error?: any
 }> => {
-    const { apiKey, prompt, width, height } = data;
-    let { steps, count } = data;
-    if (!steps) {
-        steps = 4;
-    }
+    const { prompt, width, height } = data;
+    let { count } = data;
     if (!count) {
         count = 1;
     }
 
+    const imageGenModel = runtime.imageGenModel;
+    const model = getModel(imageGenModel);
+    const apiKey = imageGenModel === ImageGenModel.TogetherAI ? runtime.getSetting("TOGETHER_API_KEY") : runtime.getSetting("OPENAI_API_KEY");
+
     try {
+        if (imageGenModel === ImageGenModel.TogetherAI) {
         const together = new Together({ apiKey });
         const response = await together.images.create({
             model: "black-forest-labs/FLUX.1-schnell",
             prompt,
             width,
             height,
-            steps,
+            steps: model.steps,
             n: count,
         });
         const urls: string[] = [];
@@ -48,63 +50,37 @@ export const generateImage = async (data: {
             base64 = "data:image/jpeg;base64," + base64;
             return base64;
         }));
-        return { success: true, data: base64s };
+            return { success: true, data: base64s };
+        } else {
+            let targetSize = `${width}x${height}`;
+            if (targetSize !== "1024x1024" && targetSize !== "1792x1024" && targetSize !== "1024x1792") {
+                targetSize = "1024x1024";
+            }
+            const openai = new OpenAI({ apiKey });
+            const response = await openai.images.generate({
+                model: model.subModel,
+                prompt,
+                size: targetSize as "1024x1024" | "1792x1024" | "1024x1792",
+                n: count,
+                response_format: "b64_json",
+            });
+            const base64s = response.data.map((image) => `data:image/png;base64,${image.b64_json}`);
+            return { success: true, data: base64s };
+        }
   } catch (error) {
         console.error(error);
         return { success: false, error: error };
   }
 };
 
-export const generateCaption = async (data: {apiKey: string, imageUrl: string}) => {
-    const { apiKey, imageUrl } = data;
-
-    try {
-        const anthropic = new Anthropic({
-            apiKey,
-        });
-
-        const base64Data = imageUrl.replace(/^data:image\/\w+;base64,/, "");
-        const buffer = Buffer.from(base64Data, 'base64');
-        const imageType = detectImageType(buffer);
-        
-        if (!imageType) {
-            throw new Error("Invalid image data");
-        }
-
-        const response = await anthropic.messages.create({
-            model: "claude-3-5-sonnet-20240620",
-            max_tokens: 8192,
-            temperature: 0,
-            messages: [
-              {
-                role: "user",
-                content: [
-                    {type: "text", text: "What do you see in this image? Generate a caption for it! Keep it short, max one phrase. Caption:"},
-                    //@ts-ignore
-                    {type: "image", source: {data: base64Data, media_type: `image/${imageType}`, type: "base64"}}
-                ]
-              },
-            ],
-            tools: [],
-          });
-
-          const responseContent = ((response.content[0] as any).text as string).replace("Caption:", "").trim();
-          return { success: true, caption: responseContent };
-    } catch (error) {
-        console.error(error);
-        return { success: false, error: error, caption: "" };
-    }
-}
-
-function detectImageType(buffer: Buffer): string | null {
-    if (buffer[0] === 0xFF && buffer[1] === 0xD8 && buffer[2] === 0xFF) {
-        return 'jpeg';
-    } else if (buffer[0] === 0x89 && buffer[1] === 0x50 && buffer[2] === 0x4E && buffer[3] === 0x47) {
-        return 'png';
-    } else if (buffer[0] === 0x47 && buffer[1] === 0x49 && buffer[2] === 0x46) {
-        return 'gif';
-    } else if (buffer[0] === 0x42 && buffer[1] === 0x4D) {
-        return 'bmp';
-    }
-    return null;
+/**
+ * Produces a title and description for an image.
+ *
+ * Delegates to `runtime.imageDescriptionService.describeImage` and trims
+ * surrounding whitespace from both returned fields.
+ *
+ * Captioning is best-effort: callers invoke this after an image has already
+ * been generated and do not catch errors themselves, so a failure here is
+ * logged and answered with a fallback caption rather than propagated.
+ *
+ * @param data - Holds `imageUrl`, which is passed to `describeImage` unchanged.
+ * @param runtime - Agent runtime that provides `imageDescriptionService`.
+ * @returns The trimmed title and description, or the title
+ *          "Uncaptioned image" with an empty description if describing the
+ *          image fails or the response lacks either field.
+ */
+export const generateCaption = async (data: {imageUrl: string}, runtime: IAgentRuntime): Promise<{
+    title: string,
+    description: string
+}> => {
+    const { imageUrl } = data;
+    try {
+        const resp = await runtime.imageDescriptionService.describeImage(imageUrl);
+        return {
+            title: resp.title.trim(),
+            description: resp.description.trim()
+        };
+    } catch (error) {
+        // The image itself is deliberately not logged: it may be a large data URL.
+        console.error("Failed to generate caption for image", error);
+        return { title: "Uncaptioned image", description: "" };
+    }
 }
diff --git a/src/clients/direct/index.ts b/src/clients/direct/index.ts
@@ -207,19 +207,12 @@ this.app.post("/:agentId/whisper", upload.single('file'), async (req: CustomRequ
         return;
       }
 
-      const togetherApiKey = agent.getSetting("TOGETHER_API_KEY");
-      const claudeApiKey = agent.getSetting("ANTHROPIC_API_KEY");
-      
-      const images = await generateImage({...req.body, apiKey: togetherApiKey });
+      const images = await generateImage({...req.body }, agent);
       const imagesRes: {image: string, caption: string}[] = [];
       if (images.data && images.data.length > 0) {
         for(let i = 0; i < images.data.length; i++) {
-          const caption = await generateCaption({apiKey: claudeApiKey, imageUrl: images.data[i]});
-          if (caption.success) {
-            imagesRes.push({image: images.data[i], caption: caption.caption});
-          } else {
-            imagesRes.push({image: images.data[i], caption: "Uncaptioned image"});
-          }
+          const caption = await generateCaption({imageUrl: images.data[i]}, agent);
+          imagesRes.push({image: images.data[i], caption: caption.title});
         }
       }
       res.json({images: imagesRes});
diff --git a/src/clients/telegram/src/messageManager.ts b/src/clients/telegram/src/messageManager.ts
@@ -207,6 +207,12 @@ export class MessageManager {
       return; // Exit if no message or sender info
     }
 
+    //@ts-ignore
+    if (ctx.message.text.startsWith("/")) {
+      //Handle commands?
+      return;
+    }
+
     const message = ctx.message;
 
     try {
diff --git a/src/core/defaultCharacter.ts b/src/core/defaultCharacter.ts
@@ -3,6 +3,7 @@ import { Character, ModelProvider } from "./types.ts";
 const defaultCharacter: Character = {
   name: "Eliza",
   clients: [
+    "telegram",
     // "discord",
     // "twitter"
   ],
diff --git a/src/core/imageGenModels.ts b/src/core/imageGenModels.ts
@@ -0,0 +1,42 @@
+/**
+ * Image generation backends that an agent runtime can be configured to use.
+ */
+export enum ImageGenModel {
+    TogetherAI = "TogetherAI",
+    Dalle = "Dalle",
+}
+
+/**
+ * Per-backend settings returned by {@link getModel}.
+ */
+export interface ImageGenModelSettings {
+    /** Step count handed to the backend's image request. */
+    steps: number;
+    /** Provider-specific model identifier. */
+    subModel: string;
+}
+
+// Typed as a Record keyed by the enum so that adding an ImageGenModel member
+// without a matching entry here is a compile error instead of an `undefined`
+// lookup at runtime.
+const imageGenModels: Record<ImageGenModel, ImageGenModelSettings> = {
+    [ImageGenModel.TogetherAI]: {
+        steps: 4,
+        subModel: "black-forest-labs/FLUX.1-schnell",
+    },
+    [ImageGenModel.Dalle]: {
+        // NOTE(review): 0 looks like a placeholder; the OpenAI request in generateImage sends no step count.
+        steps: 0,
+        subModel: "dall-e-3",
+    },
+};
+
+/**
+ * Looks up the settings registered for an image generation backend.
+ *
+ * @param model - Backend to look up.
+ * @returns The settings entry for `model`.
+ */
+export function getModel(model: ImageGenModel): ImageGenModelSettings {
+    return imageGenModels[model];
+}
diff --git a/src/core/runtime.ts b/src/core/runtime.ts
@@ -55,6 +55,7 @@ import { defaultProviders, getProviders } from "./providers.ts";
 import settings from "./settings.ts";
 import { UUID, type Actor } from "./types.ts";
 import { stringToUuid } from "./uuid.ts";
+import { ImageGenModel } from "./imageGenModels.ts";
 
 /**
  * Represents the runtime environment for an agent, handling message processing,
@@ -105,6 +106,11 @@ export class AgentRuntime implements IAgentRuntime {
    */
   modelProvider = ModelProvider.LLAMALOCAL;
 
+  /**
+   * The model to use for image generation.
+   */
+  imageGenModel: ImageGenModel = ImageGenModel.TogetherAI;
+
   /**
    * Local Llama if no OpenAI key is present
    */
@@ -189,6 +195,7 @@ export class AgentRuntime implements IAgentRuntime {
     actions?: Action[]; // Optional custom actions
     evaluators?: Evaluator[]; // Optional custom evaluators
     providers?: Provider[];
+    imageGenModel?: ImageGenModel;
     modelProvider: ModelProvider;
     databaseAdapter: IDatabaseAdapter; // The database adapter used for interacting with the database
     fetch?: typeof fetch | unknown;
@@ -241,6 +248,7 @@ export class AgentRuntime implements IAgentRuntime {
     if (!this.serverUrl) {
       console.warn("No serverUrl provided, defaulting to localhost");
     }
+    this.imageGenModel = this.character.imageGenModel ?? opts.imageGenModel ?? this.imageGenModel;
 
     this.token = opts.token;
 
diff --git a/src/core/types.ts b/src/core/types.ts
@@ -1,4 +1,5 @@
 import { Keypair } from "@solana/web3.js";
+import { ImageGenModel } from "./imageGenModels";
 
 /**
  * Represents a UUID, which is a universally unique identifier conforming to the UUID standard.
@@ -283,6 +284,7 @@ export type Character = {
   name: string;
   system?: string;
   modelProvider: ModelProvider;
+  imageGenModel?: ImageGenModel;
   modelOverride?: string;
   bio: string | string[];
   lore: string[];
@@ -448,6 +450,7 @@ export interface IAgentRuntime {
   databaseAdapter: IDatabaseAdapter;
   token: string | null;
   modelProvider: ModelProvider;
+  imageGenModel: ImageGenModel;
   character: Character;
   providers: Provider[];
   actions: Action[];