Skip to content

Commit 0e9b1e8

Browse files
committed
Merge branch 'image-gen' of https://github.com/ai16z/eliza into HEAD
2 parents b9b692f + e517338 commit 0e9b1e8

10 files changed

+120
-426
lines changed

package-lock.json

+25-336
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

package.json

+1-1
Original file line number | Diff line number | Diff line change
@@ -39,7 +39,6 @@
3939
"@rollup/plugin-json": "6.1.0",
4040
"@rollup/plugin-node-resolve": "15.3.0",
4141
"@rollup/plugin-replace": "6.0.1",
42-
"@rollup/plugin-terser": "0.1.0",
4342
"@rollup/plugin-typescript": "12.1.1",
4443
"@types/better-sqlite3": "7.6.11",
4544
"@types/fluent-ffmpeg": "2.1.27",
@@ -75,6 +74,7 @@
7574
"@ai-sdk/openai": "^0.0.70",
7675
"@anthropic-ai/sdk": "^0.30.1",
7776
"@cliqz/adblocker-playwright": "1.34.0",
77+
"@diffusionstudio/vits-web": "1.0.2",
7878
"@discordjs/opus": "github:discordjs/opus",
7979
"@discordjs/rest": "2.4.0",
8080
"@discordjs/voice": "0.17.0",

src/actions/ImageGeneration.ts

+15-16
Original file line number | Diff line number | Diff line change
@@ -20,35 +20,34 @@ export default {
2020
state = (await runtime.composeState(message)) as State;
2121
const userId = runtime.agentId;
2222

23-
const imagePrompt = "";
23+
const imagePrompt = message.content.text;
2424
const res: { image: string, caption: string }[] = [];
2525
const images = await generateImage({
26-
apiKey: runtime.getSetting("ANTHROPIC_API_KEY"),
2726
prompt: imagePrompt,
2827
width: 1024,
2928
height: 1024,
30-
steps: 4,
3129
count: 1
32-
})
30+
}, runtime);
3331
if (images.success && images.data && images.data.length > 0) {
3432
for(let i = 0; i < images.data.length; i++) {
3533
const image = images.data[i];
3634
const caption = await generateCaption({
37-
apiKey: runtime.getSetting("ANTHROPIC_API_KEY"),
3835
imageUrl: image
39-
})
40-
if (caption.success) {
41-
res.push({image: image, caption: caption.caption});
42-
} else {
43-
console.error("Failed to generate caption for image", image, caption.error);
44-
res.push({image: image, caption: "Uncaptioned image"});
45-
}
36+
}, runtime);
37+
res.push({image: image, caption: caption.title})
38+
callback({
39+
text: caption.description,
40+
attachments: [{
41+
id: crypto.randomUUID(),
42+
url: image,
43+
title: "Generated image",
44+
source: "imageGeneration",
45+
description: caption.title,
46+
text: caption.description
47+
}]
48+
}, [])
4649
}
4750
}
48-
callback(null, {
49-
success: true,
50-
data: res
51-
});
5251
},
5352
examples: [
5453
[

src/actions/imageGenerationUtils.ts

+39-63
Original file line number | Diff line number | Diff line change
@@ -1,37 +1,39 @@
11
// TODO: Replace with the vercel ai sdk and support all providers
2-
import Anthropic from "@anthropic-ai/sdk";
32
import { Buffer } from 'buffer';
43
import Together from "together-ai";
4+
import { IAgentRuntime } from "../core/types";
5+
import { getModel, ImageGenModel } from "../core/imageGenModels.ts";
6+
import OpenAI from "openai";
57

68
export const generateImage = async (data: {
7-
apiKey: string,
89
prompt: string,
910
width: number,
1011
height: number,
11-
steps?: number,
1212
count?: number
13-
}): Promise<{
13+
}, runtime: IAgentRuntime): Promise<{
1414
success: boolean,
1515
data?: string[],
1616
error?: any
1717
}> => {
18-
const { apiKey, prompt, width, height } = data;
19-
let { steps, count } = data;
20-
if (!steps) {
21-
steps = 4;
22-
}
18+
const { prompt, width, height } = data;
19+
let { count } = data;
2320
if (!count) {
2421
count = 1;
2522
}
2623

24+
const imageGenModel = runtime.imageGenModel;
25+
const model = getModel(imageGenModel);
26+
const apiKey = imageGenModel === ImageGenModel.TogetherAI ? runtime.getSetting("TOGETHER_API_KEY") : runtime.getSetting("OPENAI_API_KEY");
27+
2728
try {
29+
if (imageGenModel === ImageGenModel.TogetherAI) {
2830
const together = new Together({ apiKey });
2931
const response = await together.images.create({
3032
model: "black-forest-labs/FLUX.1-schnell",
3133
prompt,
3234
width,
3335
height,
34-
steps,
36+
steps: model.steps,
3537
n: count,
3638
});
3739
const urls: string[] = [];
@@ -48,63 +50,37 @@ export const generateImage = async (data: {
4850
base64 = "data:image/jpeg;base64," + base64;
4951
return base64;
5052
}));
51-
return { success: true, data: base64s };
53+
return { success: true, data: base64s };
54+
} else {
55+
let targetSize = `${width}x${height}`;
56+
if (targetSize !== "1024x1024" && targetSize !== "1792x1024" && targetSize !== "1024x1792") {
57+
targetSize = "1024x1024";
58+
}
59+
const openai = new OpenAI({ apiKey });
60+
const response = await openai.images.generate({
61+
model: model.subModel,
62+
prompt,
63+
size: targetSize as "1024x1024" | "1792x1024" | "1024x1792",
64+
n: count,
65+
response_format: "b64_json",
66+
});
67+
const base64s = response.data.map((image) => `data:image/png;base64,${image.b64_json}`);
68+
return { success: true, data: base64s };
69+
}
5270
} catch (error) {
5371
console.error(error);
5472
return { success: false, error: error };
5573
}
5674
};
5775

58-
export const generateCaption = async (data: {apiKey: string, imageUrl: string}) => {
59-
const { apiKey, imageUrl } = data;
60-
61-
try {
62-
const anthropic = new Anthropic({
63-
apiKey,
64-
});
65-
66-
const base64Data = imageUrl.replace(/^data:image\/\w+;base64,/, "");
67-
const buffer = Buffer.from(base64Data, 'base64');
68-
const imageType = detectImageType(buffer);
69-
70-
if (!imageType) {
71-
throw new Error("Invalid image data");
72-
}
73-
74-
const response = await anthropic.messages.create({
75-
model: "claude-3-5-sonnet-20240620",
76-
max_tokens: 8192,
77-
temperature: 0,
78-
messages: [
79-
{
80-
role: "user",
81-
content: [
82-
{type: "text", text: "What do you see in this image? Generate a caption for it! Keep it short, max one phrase. Caption:"},
83-
//@ts-ignore
84-
{type: "image", source: {data: base64Data, media_type: `image/${imageType}`, type: "base64"}}
85-
]
86-
},
87-
],
88-
tools: [],
89-
});
90-
91-
const responseContent = ((response.content[0] as any).text as string).replace("Caption:", "").trim();
92-
return { success: true, caption: responseContent };
93-
} catch (error) {
94-
console.error(error);
95-
return { success: false, error: error, caption: "" };
96-
}
97-
}
98-
99-
function detectImageType(buffer: Buffer): string | null {
100-
if (buffer[0] === 0xFF && buffer[1] === 0xD8 && buffer[2] === 0xFF) {
101-
return 'jpeg';
102-
} else if (buffer[0] === 0x89 && buffer[1] === 0x50 && buffer[2] === 0x4E && buffer[3] === 0x47) {
103-
return 'png';
104-
} else if (buffer[0] === 0x47 && buffer[1] === 0x49 && buffer[2] === 0x46) {
105-
return 'gif';
106-
} else if (buffer[0] === 0x42 && buffer[1] === 0x4D) {
107-
return 'bmp';
108-
}
109-
return null;
76+
export const generateCaption = async (data: {imageUrl: string}, runtime: IAgentRuntime): Promise<{
77+
title: string,
78+
description: string
79+
}> => {
80+
const { imageUrl } = data;
81+
const resp = await runtime.imageDescriptionService.describeImage(imageUrl);
82+
return {
83+
title: resp.title.trim(),
84+
description: resp.description.trim()
85+
};
11086
}

src/clients/direct/index.ts

+3-10
Original file line number | Diff line number | Diff line change
@@ -207,19 +207,12 @@ this.app.post("/:agentId/whisper", upload.single('file'), async (req: CustomRequ
207207
return;
208208
}
209209

210-
const togetherApiKey = agent.getSetting("TOGETHER_API_KEY");
211-
const claudeApiKey = agent.getSetting("ANTHROPIC_API_KEY");
212-
213-
const images = await generateImage({...req.body, apiKey: togetherApiKey });
210+
const images = await generateImage({...req.body }, agent);
214211
const imagesRes: {image: string, caption: string}[] = [];
215212
if (images.data && images.data.length > 0) {
216213
for(let i = 0; i < images.data.length; i++) {
217-
const caption = await generateCaption({apiKey: claudeApiKey, imageUrl: images.data[i]});
218-
if (caption.success) {
219-
imagesRes.push({image: images.data[i], caption: caption.caption});
220-
} else {
221-
imagesRes.push({image: images.data[i], caption: "Uncaptioned image"});
222-
}
214+
const caption = await generateCaption({imageUrl: images.data[i]}, agent);
215+
imagesRes.push({image: images.data[i], caption: caption.title});
223216
}
224217
}
225218
res.json({images: imagesRes});

src/clients/telegram/src/messageManager.ts

+6
Original file line number | Diff line number | Diff line change
@@ -207,6 +207,12 @@ export class MessageManager {
207207
return; // Exit if no message or sender info
208208
}
209209

210+
//@ts-ignore
211+
if (ctx.message.text.startsWith("/")) {
212+
//Handle commands?
213+
return;
214+
}
215+
210216
const message = ctx.message;
211217

212218
try {

src/core/defaultCharacter.ts

+1
Original file line number | Diff line number | Diff line change
@@ -3,6 +3,7 @@ import { Character, ModelProvider } from "./types.ts";
33
const defaultCharacter: Character = {
44
name: "Eliza",
55
clients: [
6+
"telegram",
67
// "discord",
78
// "twitter"
89
],

src/core/imageGenModels.ts

+19
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,19 @@
1+
export enum ImageGenModel {
2+
TogetherAI = "TogetherAI",
3+
Dalle = "Dalle",
4+
}
5+
6+
const imageGenModels = {
7+
[ImageGenModel.TogetherAI]: {
8+
steps: 4,
9+
subModel: "black-forest-labs/FLUX.1-schnell"
10+
},
11+
[ImageGenModel.Dalle]: {
12+
steps: 0,
13+
subModel: "dall-e-3"
14+
}
15+
}
16+
17+
export function getModel(model: ImageGenModel) {
18+
return imageGenModels[model];
19+
}

src/core/runtime.ts

+8
Original file line number | Diff line number | Diff line change
@@ -55,6 +55,7 @@ import { defaultProviders, getProviders } from "./providers.ts";
5555
import settings from "./settings.ts";
5656
import { UUID, type Actor } from "./types.ts";
5757
import { stringToUuid } from "./uuid.ts";
58+
import { ImageGenModel } from "./imageGenModels.ts";
5859

5960
/**
6061
* Represents the runtime environment for an agent, handling message processing,
@@ -105,6 +106,11 @@ export class AgentRuntime implements IAgentRuntime {
105106
*/
106107
modelProvider = ModelProvider.LLAMALOCAL;
107108

109+
/**
110+
* The model to use for image generation.
111+
*/
112+
imageGenModel: ImageGenModel = ImageGenModel.TogetherAI;
113+
108114
/**
109115
* Local Llama if no OpenAI key is present
110116
*/
@@ -189,6 +195,7 @@ export class AgentRuntime implements IAgentRuntime {
189195
actions?: Action[]; // Optional custom actions
190196
evaluators?: Evaluator[]; // Optional custom evaluators
191197
providers?: Provider[];
198+
imageGenModel?: ImageGenModel;
192199
modelProvider: ModelProvider;
193200
databaseAdapter: IDatabaseAdapter; // The database adapter used for interacting with the database
194201
fetch?: typeof fetch | unknown;
@@ -241,6 +248,7 @@ export class AgentRuntime implements IAgentRuntime {
241248
if (!this.serverUrl) {
242249
console.warn("No serverUrl provided, defaulting to localhost");
243250
}
251+
this.imageGenModel = this.character.imageGenModel ?? opts.imageGenModel ?? this.imageGenModel;
244252

245253
this.token = opts.token;
246254

src/core/types.ts

+3
Original file line number | Diff line number | Diff line change
@@ -1,4 +1,5 @@
11
import { Keypair } from "@solana/web3.js";
2+
import { ImageGenModel } from "./imageGenModels";
23

34
/**
45
* Represents a UUID, which is a universally unique identifier conforming to the UUID standard.
@@ -283,6 +284,7 @@ export type Character = {
283284
name: string;
284285
system?: string;
285286
modelProvider: ModelProvider;
287+
imageGenModel?: ImageGenModel;
286288
modelOverride?: string;
287289
bio: string | string[];
288290
lore: string[];
@@ -448,6 +450,7 @@ export interface IAgentRuntime {
448450
databaseAdapter: IDatabaseAdapter;
449451
token: string | null;
450452
modelProvider: ModelProvider;
453+
imageGenModel: ImageGenModel;
451454
character: Character;
452455
providers: Provider[];
453456
actions: Action[];

0 commit comments

Comments
 (0)