Skip to content

Commit 0efa787

Browse files
authored
Merge branch 'develop' into main
2 parents 7d7929d + c5b3e73 commit 0efa787

File tree

5 files changed

+311
-236
lines changed

5 files changed

+311
-236
lines changed

packages/core/src/models.ts

+5-4
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,11 @@
11
import settings from "./settings.ts";
22
import {
3-
Models,
4-
ModelProviderName,
3+
EmbeddingModelSettings,
4+
ImageModelSettings,
55
ModelClass,
6+
ModelProviderName,
7+
Models,
68
ModelSettings,
7-
ImageModelSettings,
8-
EmbeddingModelSettings,
99
} from "./types.ts";
1010

1111
export const models: Models = {
@@ -332,6 +332,7 @@ export const models: Models = {
332332
},
333333
},
334334
[ModelProviderName.GOOGLE]: {
335+
endpoint: "https://generativelanguage.googleapis.com",
335336
model: {
336337
[ModelClass.SMALL]: {
337338
name:

packages/plugin-node/README.md

+45-1
Original file line numberDiff line numberDiff line change
@@ -80,7 +80,51 @@ Provides web scraping and content extraction capabilities using Playwright.
8080

8181
### ImageDescriptionService
8282

83-
Processes and analyzes images to generate descriptions.
83+
Processes and analyzes images to generate descriptions. Supports multiple providers:
84+
85+
- Local processing using Florence model
86+
- OpenAI Vision API
87+
- Google Gemini
88+
89+
Configuration:
90+
91+
```env
92+
# For OpenAI Vision
93+
OPENAI_API_KEY=your_openai_api_key
94+
95+
# For Google Gemini
96+
GOOGLE_GENERATIVE_AI_API_KEY=your_google_api_key
97+
```
98+
99+
Provider selection:
100+
101+
- If `imageVisionModelProvider` is set to `google` or `openai`, that provider is used.
102+
- Otherwise, if `model` is set to `google` or `openai`, that provider is used.
103+
- If neither setting is present, OpenAI is used by default.
104+
105+
The service automatically handles different image formats, including GIFs (first frame extraction).
106+
107+
Features by provider:
108+
109+
**Local (Florence):**
110+
111+
- Basic image captioning
112+
- Local processing without API calls
113+
114+
**OpenAI Vision:**
115+
116+
- Detailed image descriptions
117+
- Text detection
118+
- Object recognition
119+
120+
**Google Gemini 1.5:**
121+
122+
- High-quality image understanding
123+
- Detailed descriptions with natural language
124+
- Multi-modal context understanding
125+
- Support for complex scenes and content
126+
127+
The provider can be configured through the runtime settings, allowing easy switching between providers based on your needs.
84128

85129
### LlamaService
86130

packages/plugin-node/src/index.ts

+2-2
Original file line numberDiff line numberDiff line change
@@ -2,17 +2,17 @@ export * from "./services/index.ts";
22

33
import { Plugin } from "@elizaos/core";
44

5+
import { describeImage } from "./actions/describe-image.ts";
56
import {
7+
AwsS3Service,
68
BrowserService,
79
ImageDescriptionService,
810
LlamaService,
911
PdfService,
1012
SpeechService,
1113
TranscriptionService,
1214
VideoService,
13-
AwsS3Service,
1415
} from "./services/index.ts";
15-
import { describeImage } from "./actions/describe-image.ts";
1616

1717
export type NodePlugin = ReturnType<typeof createNodePlugin>;
1818

0 commit comments

Comments
 (0)