File tree 5 files changed +311
-236
lines changed
5 files changed +311
-236
lines changed Original file line number Diff line number Diff line change 1
1
import settings from "./settings.ts" ;
2
2
import {
3
- Models ,
4
- ModelProviderName ,
3
+ EmbeddingModelSettings ,
4
+ ImageModelSettings ,
5
5
ModelClass ,
6
+ ModelProviderName ,
7
+ Models ,
6
8
ModelSettings ,
7
- ImageModelSettings ,
8
- EmbeddingModelSettings ,
9
9
} from "./types.ts" ;
10
10
11
11
export const models : Models = {
@@ -332,6 +332,7 @@ export const models: Models = {
332
332
} ,
333
333
} ,
334
334
[ ModelProviderName . GOOGLE ] : {
335
+ endpoint : "https://generativelanguage.googleapis.com" ,
335
336
model : {
336
337
[ ModelClass . SMALL ] : {
337
338
name :
Original file line number Diff line number Diff line change @@ -80,7 +80,51 @@ Provides web scraping and content extraction capabilities using Playwright.
80
80
81
81
### ImageDescriptionService
82
82
83
- Processes and analyzes images to generate descriptions.
83
+ Processes and analyzes images to generate descriptions. Supports multiple providers:
84
+
85
+ - Local processing using Florence model
86
+ - OpenAI Vision API
87
+ - Google Gemini
88
+
89
+ Configuration:
90
+
91
+ ``` env
92
+ # For OpenAI Vision
93
+ OPENAI_API_KEY=your_openai_api_key
94
+
95
+ # For Google Gemini
96
+ GOOGLE_GENERATIVE_AI_API_KEY=your_google_api_key
97
+ ```
98
+
99
+ Provider selection:
100
+
101
+ - If `imageVisionModelProvider` is set to `google` or `openai`, that provider is used.
102
+ - Otherwise, if `model` is set to `google` or `openai`, that provider is used.
103
+ - If neither is set, OpenAI is used by default.
104
+
105
+ The service automatically handles different image formats, including GIFs (first frame extraction).
106
+
107
+ Features by provider:
108
+
109
+ **Local (Florence):**
110
+
111
+ - Basic image captioning
112
+ - Local processing without API calls
113
+
114
+ **OpenAI Vision:**
115
+
116
+ - Detailed image descriptions
117
+ - Text detection
118
+ - Object recognition
119
+
120
+ **Google Gemini 1.5:**
121
+
122
+ - High-quality image understanding
123
+ - Detailed descriptions with natural language
124
+ - Multi-modal context understanding
125
+ - Support for complex scenes and content
126
+
127
+ The provider can be configured through the runtime settings, allowing easy switching between providers based on your needs.
84
128
85
129
### LlamaService
86
130
Original file line number Diff line number Diff line change @@ -2,17 +2,17 @@ export * from "./services/index.ts";
2
2
3
3
import { Plugin } from "@elizaos/core" ;
4
4
5
+ import { describeImage } from "./actions/describe-image.ts" ;
5
6
import {
7
+ AwsS3Service ,
6
8
BrowserService ,
7
9
ImageDescriptionService ,
8
10
LlamaService ,
9
11
PdfService ,
10
12
SpeechService ,
11
13
TranscriptionService ,
12
14
VideoService ,
13
- AwsS3Service ,
14
15
} from "./services/index.ts" ;
15
- import { describeImage } from "./actions/describe-image.ts" ;
16
16
17
17
export type NodePlugin = ReturnType < typeof createNodePlugin > ;
18
18
You can’t perform that action at this time.
0 commit comments