@@ -19,8 +19,8 @@ import {
19
19
RawImage ,
20
20
type Tensor ,
21
21
} from "@huggingface/transformers" ;
22
+ import sharp , { AvailableFormatInfo , FormatEnum } from "sharp" ;
22
23
import fs from "fs" ;
23
- import gifFrames from "gif-frames" ;
24
24
import os from "os" ;
25
25
import path from "path" ;
26
26
@@ -111,15 +111,14 @@ class LocalImageProvider implements ImageProvider {
111
111
}
112
112
113
113
async describeImage (
114
- imageData : Buffer
114
+ imageData : Buffer ,
115
+ mimeType : string
115
116
) : Promise < { title : string ; description : string } > {
116
117
if ( ! this . model || ! this . processor || ! this . tokenizer ) {
117
118
throw new Error ( "Model components not initialized" ) ;
118
119
}
119
-
120
- const base64Data = imageData . toString ( "base64" ) ;
121
- const dataUrl = `data:image/jpeg;base64,${ base64Data } ` ;
122
- const image = await RawImage . fromURL ( dataUrl ) ;
120
+ const blob = new Blob ( [ imageData ] , { type : mimeType } ) ;
121
+ const image = await RawImage . fromBlob ( blob ) ;
123
122
const visionInputs = await this . processor ( image ) ;
124
123
const prompts = this . processor . construct_prompts ( "<DETAILED_CAPTION>" ) ;
125
124
const textInputs = this . tokenizer ( prompts ) ;
@@ -314,10 +313,12 @@ export class ImageDescriptionService
314
313
if ( this . runtime . imageVisionModelProvider ) {
315
314
if (
316
315
this . runtime . imageVisionModelProvider ===
317
- ModelProviderName . LLAMALOCAL
316
+ ModelProviderName . LLAMALOCAL ||
317
+ this . runtime . imageVisionModelProvider ===
318
+ ModelProviderName . OLLAMA
318
319
) {
319
320
this . provider = new LocalImageProvider ( ) ;
320
- elizaLogger . debug ( "Using llama local for vision model" ) ;
321
+ elizaLogger . debug ( "Using local provider for vision model" ) ;
321
322
} else if (
322
323
this . runtime . imageVisionModelProvider ===
323
324
ModelProviderName . GOOGLE
@@ -343,9 +344,12 @@ export class ImageDescriptionService
343
344
) ;
344
345
return false ;
345
346
}
346
- } else if ( model === models [ ModelProviderName . LLAMALOCAL ] ) {
347
+ } else if (
348
+ model === models [ ModelProviderName . LLAMALOCAL ] ||
349
+ model === models [ ModelProviderName . OLLAMA ]
350
+ ) {
347
351
this . provider = new LocalImageProvider ( ) ;
348
- elizaLogger . debug ( "Using llama local for vision model" ) ;
352
+ elizaLogger . debug ( "Using local provider for vision model" ) ;
349
353
} else if ( model === models [ ModelProviderName . GOOGLE ] ) {
350
354
this . provider = new GoogleImageProvider ( this . runtime ) ;
351
355
elizaLogger . debug ( "Using google for vision model" ) ;
@@ -369,74 +373,85 @@ export class ImageDescriptionService
369
373
}
370
374
371
375
/**
 * Loads an image from a URL or local path and returns bytes plus MIME type.
 *
 * JPEG and PNG payloads are returned as loaded; any other format is
 * re-encoded to PNG through `convertImageDataToFormat`.
 *
 * @param imageUrlOrPath - Remote URL or local filesystem path of the image.
 * @returns The image bytes and the MIME type describing them.
 * @throws Error with message "Failed to fetch image data" when the loaded
 *   (or converted) payload is empty; errors from fetching or converting
 *   propagate unchanged.
 */
private async loadImageData(
    imageUrlOrPath: string
): Promise<{ data: Buffer; mimeType: string }> {
    const { imageData, mimeType } = await this.fetchImage(imageUrlOrPath);

    // Validate before converting so an empty payload is reported as such
    // instead of surfacing as a decoder error from the conversion step.
    if (!imageData || imageData.length === 0) {
        throw new Error("Failed to fetch image data");
    }

    // A Content-Type value may carry parameters ("image/png; charset=...")
    // or mixed case; compare on the bare, lower-cased type only.
    const essence = (mimeType.split(";")[0] ?? "").trim().toLowerCase();
    // "image/jpg" is a common alias (e.g. derived from a ".jpg" extension);
    // report it under the registered name "image/jpeg".
    const normalizedMimeType =
        essence === "image/jpg" ? "image/jpeg" : essence;

    const skipConversion =
        normalizedMimeType === "image/jpeg" ||
        normalizedMimeType === "image/png";
    if (skipConversion) {
        return { data: imageData, mimeType: normalizedMimeType };
    }

    const converted = await this.convertImageDataToFormat(imageData, "png");
    if (!converted.imageData || converted.imageData.length === 0) {
        throw new Error("Failed to fetch image data");
    }
    return { data: converted.imageData, mimeType: converted.mimeType };
}
406
401
407
- private async extractFirstFrameFromGif (
408
- gifUrl : string
409
- ) : Promise < { filePath : string } > {
410
- const frameData = await gifFrames ( {
411
- url : gifUrl ,
412
- frames : 1 ,
413
- outputType : "png" ,
414
- } ) ;
415
-
402
/**
 * Re-encodes image bytes into the requested format using sharp.
 *
 * The conversion happens entirely in memory: no temporary file is written,
 * so concurrent calls cannot collide on a file name and there is nothing to
 * clean up when encoding fails.
 *
 * @param data - Encoded source image bytes.
 * @param format - Target format, as a sharp format name or format info
 *   object. Defaults to "png".
 * @returns The re-encoded bytes and an `image/<format>` MIME type built from
 *   the output format sharp reports.
 * @throws Whatever sharp rejects with when the input cannot be decoded or
 *   the format cannot be written.
 */
private async convertImageDataToFormat(
    data: Buffer,
    format: keyof FormatEnum | AvailableFormatInfo = "png"
): Promise<{ imageData: Buffer; mimeType: string }> {
    // resolveWithObject also yields the output info, so the MIME type comes
    // from the format actually produced rather than from stringifying
    // `format` (which may be an object, not a name).
    const { data: imageData, info } = await sharp(data)
        .toFormat(format)
        .toBuffer({ resolveWithObject: true });

    return { imageData, mimeType: `image/${info.format}` };
}
420
421
421
- return new Promise ( ( resolve , reject ) => {
422
- const writeStream = fs . createWriteStream ( tempFilePath ) ;
423
- frameData [ 0 ] . getImage ( ) . pipe ( writeStream ) ;
424
- writeStream . on ( "finish" , ( ) => resolve ( { filePath : tempFilePath } ) ) ;
425
- writeStream . on ( "error" , reject ) ;
426
- } ) ;
422
/**
 * Reads raw image bytes from a local file or, failing that, over HTTP.
 *
 * If `imageUrlOrPath` names an existing filesystem entry it is read from
 * disk and the MIME type is derived from the file extension; otherwise the
 * value is fetched as a URL and the MIME type is taken from the response's
 * Content-Type header. In both cases "image/jpeg" is the fallback.
 *
 * @param imageUrlOrPath - Local filesystem path or remote URL.
 * @returns The bytes and a bare, lower-cased MIME type (no parameters).
 * @throws Error when the HTTP response status is not OK; filesystem and
 *   network errors propagate unchanged.
 */
private async fetchImage(
    imageUrlOrPath: string
): Promise<{ imageData: Buffer; mimeType: string }> {
    if (fs.existsSync(imageUrlOrPath)) {
        // Asynchronous read: avoids blocking the event loop on large files.
        const imageData = await fs.promises.readFile(imageUrlOrPath);
        const ext = path.extname(imageUrlOrPath).slice(1).toLowerCase();
        // ".jpg" files are JPEG; use the registered subtype name.
        const subtype = ext === "jpg" ? "jpeg" : ext;
        const mimeType = subtype ? `image/${subtype}` : "image/jpeg";
        return { imageData, mimeType };
    }

    const response = await fetch(imageUrlOrPath);
    if (!response.ok) {
        // statusText can be empty, so include the numeric status as well.
        throw new Error(
            `Failed to fetch image: ${response.status} ${response.statusText}`.trimEnd()
        );
    }
    const imageData = Buffer.from(await response.arrayBuffer());
    // Drop any parameters ("; charset=...") so callers can compare the type
    // directly; an absent or blank header falls back to JPEG.
    const headerType = response.headers
        .get("content-type")
        ?.split(";")[0]
        ?.trim()
        .toLowerCase();
    const mimeType = headerType || "image/jpeg";
    return { imageData, mimeType };
}
428
443
429
444
async describeImage (
430
- imageUrl : string
445
+ imageUrlOrPath : string
431
446
) : Promise < { title : string ; description : string } > {
432
447
if ( ! this . initialized ) {
433
448
this . initialized = await this . initializeProvider ( ) ;
434
449
}
435
450
436
451
if ( this . initialized ) {
437
452
try {
438
- const { data, mimeType } = await this . loadImageData ( imageUrl ) ;
439
- return await this . provider ! . describeImage ( data , mimeType ) ;
453
+ const { data, mimeType } = await this . loadImageData ( imageUrlOrPath ) ;
454
+ return await this . provider . describeImage ( data , mimeType ) ;
440
455
} catch ( error ) {
441
456
elizaLogger . error ( "Error in describeImage:" , error ) ;
442
457
throw error ;
0 commit comments