Skip to content

Commit 0f6f3ec

Browse files
ae9is and wtfsayo authored
fix: elizaOS#2373 Fix image description (elizaOS#2375)
* fix: gif frame extraction and remove outdated dependency "gif-frames"
* fix: local image description
* fix: describe image action to account for differences in file location object between models
* fix: image description service to handle more input formats correctly
* fix: route to local vision model provider when using ollama
* fix: improve temp file cleanup in image conversion

---------

Co-authored-by: Sayo <82053242+wtfsayo@users.noreply.github.com>
1 parent 9683fc5 commit 0f6f3ec

File tree

3 files changed

+81
-61
lines changed

3 files changed

+81
-61
lines changed

packages/plugin-node/package.json

+1-2
Original file line numberDiff line numberDiff line change
@@ -23,13 +23,13 @@
2323
"tsup.config.ts"
2424
],
2525
"dependencies": {
26-
"@elizaos/core": "workspace:*",
2726
"@aws-sdk/client-s3": "^3.705.0",
2827
"@aws-sdk/s3-request-presigner": "^3.705.0",
2928
"@cliqz/adblocker-playwright": "1.34.0",
3029
"@echogarden/espeak-ng-emscripten": "0.3.3",
3130
"@echogarden/kissfft-wasm": "0.2.0",
3231
"@echogarden/speex-resampler-wasm": "0.2.1",
32+
"@elizaos/core": "workspace:*",
3333
"@huggingface/transformers": "3.0.2",
3434
"@opendocsg/pdf2md": "0.1.32",
3535
"@types/uuid": "10.0.0",
@@ -46,7 +46,6 @@
4646
"formdata-node": "6.0.3",
4747
"fs-extra": "11.2.0",
4848
"gaxios": "6.7.1",
49-
"gif-frames": "0.4.1",
5049
"glob": "11.0.0",
5150
"graceful-fs": "4.2.11",
5251
"html-escaper": "3.0.3",

packages/plugin-node/src/actions/describe-image.ts

+8-2
Original file line numberDiff line numberDiff line change
@@ -43,12 +43,18 @@ export const describeImage: Action = {
4343
stop: ["\n"],
4444
});
4545

46-
if (!isFileLocationResult(fileLocationResultObject?.object)) {
46+
if (
47+
!isFileLocationResult(
48+
fileLocationResultObject?.object ?? fileLocationResultObject
49+
)
50+
) {
4751
elizaLogger.error("Failed to generate file location");
4852
return false;
4953
}
5054

51-
const { fileLocation } = fileLocationResultObject.object;
55+
let fileLocation = (fileLocationResultObject?.object as any)
56+
?.fileLocation;
57+
fileLocation ??= fileLocationResultObject;
5258

5359
const { description } = await runtime
5460
.getService<IImageDescriptionService>(ServiceType.IMAGE_DESCRIPTION)

packages/plugin-node/src/services/image.ts

+72-57
Original file line numberDiff line numberDiff line change
@@ -19,8 +19,8 @@ import {
1919
RawImage,
2020
type Tensor,
2121
} from "@huggingface/transformers";
22+
import sharp, { AvailableFormatInfo, FormatEnum } from "sharp";
2223
import fs from "fs";
23-
import gifFrames from "gif-frames";
2424
import os from "os";
2525
import path from "path";
2626

@@ -111,15 +111,14 @@ class LocalImageProvider implements ImageProvider {
111111
}
112112

113113
async describeImage(
114-
imageData: Buffer
114+
imageData: Buffer,
115+
mimeType: string
115116
): Promise<{ title: string; description: string }> {
116117
if (!this.model || !this.processor || !this.tokenizer) {
117118
throw new Error("Model components not initialized");
118119
}
119-
120-
const base64Data = imageData.toString("base64");
121-
const dataUrl = `data:image/jpeg;base64,${base64Data}`;
122-
const image = await RawImage.fromURL(dataUrl);
120+
const blob = new Blob([imageData], { type: mimeType });
121+
const image = await RawImage.fromBlob(blob);
123122
const visionInputs = await this.processor(image);
124123
const prompts = this.processor.construct_prompts("<DETAILED_CAPTION>");
125124
const textInputs = this.tokenizer(prompts);
@@ -314,10 +313,12 @@ export class ImageDescriptionService
314313
if (this.runtime.imageVisionModelProvider) {
315314
if (
316315
this.runtime.imageVisionModelProvider ===
317-
ModelProviderName.LLAMALOCAL
316+
ModelProviderName.LLAMALOCAL ||
317+
this.runtime.imageVisionModelProvider ===
318+
ModelProviderName.OLLAMA
318319
) {
319320
this.provider = new LocalImageProvider();
320-
elizaLogger.debug("Using llama local for vision model");
321+
elizaLogger.debug("Using local provider for vision model");
321322
} else if (
322323
this.runtime.imageVisionModelProvider ===
323324
ModelProviderName.GOOGLE
@@ -343,9 +344,12 @@ export class ImageDescriptionService
343344
);
344345
return false;
345346
}
346-
} else if (model === models[ModelProviderName.LLAMALOCAL]) {
347+
} else if (
348+
model === models[ModelProviderName.LLAMALOCAL] ||
349+
model === models[ModelProviderName.OLLAMA]
350+
) {
347351
this.provider = new LocalImageProvider();
348-
elizaLogger.debug("Using llama local for vision model");
352+
elizaLogger.debug("Using local provider for vision model");
349353
} else if (model === models[ModelProviderName.GOOGLE]) {
350354
this.provider = new GoogleImageProvider(this.runtime);
351355
elizaLogger.debug("Using google for vision model");
@@ -369,74 +373,85 @@ export class ImageDescriptionService
369373
}
370374

371375
private async loadImageData(
372-
imageUrl: string
376+
imageUrlOrPath: string
373377
): Promise<{ data: Buffer; mimeType: string }> {
374-
const isGif = imageUrl.toLowerCase().endsWith(".gif");
375-
let imageData: Buffer;
376-
let mimeType: string;
377-
378-
if (isGif) {
379-
const { filePath } = await this.extractFirstFrameFromGif(imageUrl);
380-
imageData = fs.readFileSync(filePath);
381-
mimeType = "image/png";
382-
fs.unlinkSync(filePath); // Clean up temp file
378+
let loadedImageData: Buffer;
379+
let loadedMimeType: string;
380+
const { imageData, mimeType } = await this.fetchImage(imageUrlOrPath);
381+
const skipConversion =
382+
mimeType === "image/jpeg" ||
383+
mimeType === "image/jpg" ||
384+
mimeType === "image/png";
385+
if (skipConversion) {
386+
loadedImageData = imageData;
387+
loadedMimeType = mimeType;
383388
} else {
384-
if (fs.existsSync(imageUrl)) {
385-
imageData = fs.readFileSync(imageUrl);
386-
const ext = path.extname(imageUrl).slice(1);
387-
mimeType = ext ? `image/${ext}` : "image/jpeg";
388-
} else {
389-
const response = await fetch(imageUrl);
390-
if (!response.ok) {
391-
throw new Error(
392-
`Failed to fetch image: ${response.statusText}`
393-
);
394-
}
395-
imageData = Buffer.from(await response.arrayBuffer());
396-
mimeType = response.headers.get("content-type") || "image/jpeg";
397-
}
389+
const converted = await this.convertImageDataToFormat(
390+
imageData,
391+
"png"
392+
);
393+
loadedImageData = converted.imageData;
394+
loadedMimeType = converted.mimeType;
398395
}
399-
400-
if (!imageData || imageData.length === 0) {
396+
if (!loadedImageData || loadedImageData.length === 0) {
401397
throw new Error("Failed to fetch image data");
402398
}
403-
404-
return { data: imageData, mimeType };
399+
return { data: loadedImageData, mimeType: loadedMimeType };
405400
}
406401

407-
private async extractFirstFrameFromGif(
408-
gifUrl: string
409-
): Promise<{ filePath: string }> {
410-
const frameData = await gifFrames({
411-
url: gifUrl,
412-
frames: 1,
413-
outputType: "png",
414-
});
415-
402+
private async convertImageDataToFormat(
403+
data: Buffer,
404+
format: keyof FormatEnum | AvailableFormatInfo = "png"
405+
): Promise<{ imageData: Buffer; mimeType: string }> {
416406
const tempFilePath = path.join(
417407
os.tmpdir(),
418-
`gif_frame_${Date.now()}.png`
408+
`tmp_img_${Date.now()}.${format}`
419409
);
410+
try {
411+
await sharp(data).toFormat(format).toFile(tempFilePath);
412+
const { imageData, mimeType } = await this.fetchImage(tempFilePath);
413+
return {
414+
imageData,
415+
mimeType,
416+
};
417+
} finally {
418+
fs.unlinkSync(tempFilePath); // Clean up temp file
419+
}
420+
}
420421

421-
return new Promise((resolve, reject) => {
422-
const writeStream = fs.createWriteStream(tempFilePath);
423-
frameData[0].getImage().pipe(writeStream);
424-
writeStream.on("finish", () => resolve({ filePath: tempFilePath }));
425-
writeStream.on("error", reject);
426-
});
422+
private async fetchImage(
423+
imageUrlOrPath: string
424+
): Promise<{ imageData: Buffer; mimeType: string }> {
425+
let imageData: Buffer;
426+
let mimeType: string;
427+
if (fs.existsSync(imageUrlOrPath)) {
428+
imageData = fs.readFileSync(imageUrlOrPath);
429+
const ext = path.extname(imageUrlOrPath).slice(1).toLowerCase();
430+
mimeType = ext ? `image/${ext}` : "image/jpeg";
431+
} else {
432+
const response = await fetch(imageUrlOrPath);
433+
if (!response.ok) {
434+
throw new Error(
435+
`Failed to fetch image: ${response.statusText}`
436+
);
437+
}
438+
imageData = Buffer.from(await response.arrayBuffer());
439+
mimeType = response.headers.get("content-type") || "image/jpeg";
440+
}
441+
return { imageData, mimeType };
427442
}
428443

429444
async describeImage(
430-
imageUrl: string
445+
imageUrlOrPath: string
431446
): Promise<{ title: string; description: string }> {
432447
if (!this.initialized) {
433448
this.initialized = await this.initializeProvider();
434449
}
435450

436451
if (this.initialized) {
437452
try {
438-
const { data, mimeType } = await this.loadImageData(imageUrl);
439-
return await this.provider!.describeImage(data, mimeType);
453+
const { data, mimeType } = await this.loadImageData(imageUrlOrPath);
454+
return await this.provider.describeImage(data, mimeType);
440455
} catch (error) {
441456
elizaLogger.error("Error in describeImage:", error);
442457
throw error;

0 commit comments

Comments
 (0)