Commit 71839f1

add eval data
1 parent 1e2eac6 commit 71839f1

7 files changed: +221 -489 lines changed

README.md

Lines changed: 13 additions & 3 deletions
````diff
@@ -61,15 +61,25 @@ Stay tuned for updates on pretrained models and datasets, which will be made ava
 ## 💻 Installation Guide
 
 1. **Create the environment**:
-
 ```bash
 conda create -n llm2clip python=3.8
 conda activate llm2clip
 pip install -r requirements.txt
 ```
-2. **Data Preparation**:
+2. **Data Preparation for LLM2CLIP**:
+
+```bash
+cd llm2clip/data
 
-*(Coming Soon)*
+# training datasets
+DATASET=cc3m #options: "cc3m", "cc12m", "yfcc15m"
+bash download_dataset.sh $DATASET
+python extract_embedding.py $DATASET
+
+# eval datasets
+bash setup_eval_datasets.sh
+python extract_eval_embedding.py
+```
 
 3. **🔥 Training**:
````
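Once the data-preparation commands above have been run from `llm2clip/data`, a quick way to confirm the caption features were actually written is to load them back with `torch.load` (they are plain tensors written by `torch.save` in the new `extract_eval_embedding.py`). The snippet below is only an illustrative sketch, not part of this commit; the `*_8B_llm_features.dpt` names come from that script's `CONFIG`.

```python
# list_generated_features.py -- illustrative sketch, not part of the repo.
# Run from llm2clip/data after the eval-preparation step; each eval dataset
# should have a *_8B_llm_features.dpt tensor file under eval_data/.
import glob
import torch

for path in sorted(glob.glob("eval_data/**/*_8B_llm_features.dpt", recursive=True)):
    feats = torch.load(path, map_location="cpu")
    print(f"{path}: shape={tuple(feats.shape)}, dtype={feats.dtype}")
```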

llm2clip/data/README.md

Lines changed: 11 additions & 0 deletions
````markdown
## Train
```bash
DATASET=cc3m #options: "cc3m", "cc12m", "yfcc15m"
bash download_dataset.sh $DATASET
python extract_embedding.py $DATASET
```
## Eval
```bash
bash setup_eval_datasets.sh
python extract_eval_embedding.py
```
````
llm2clip/data/extract_eval_embedding.py

Lines changed: 154 additions & 0 deletions
```python
import os
import json
import torch
import logging
from llm2vec import LLM2Vec
from typing import List, Dict, Any
from transformers import AutoModel, AutoConfig, AutoTokenizer

os.environ["CUDA_VISIBLE_DEVICES"] = "0"

logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

CONFIG = {
    "llm_model_name": "microsoft/LLM2CLIP-Llama-3-8B-Instruct-CC-Finetuned",
    "flickr": {
        "ann_path": "eval_data/flickr30k/test.json",
        "root": "eval_data/flickr30k/",
        "save_filename": "flickr30k_8B_llm_features.dpt"
    },
    "coco": {
        "ann_path": "eval_data/coco/coco_karpathy_test.json",
        "root": "eval_data/coco/",
        "save_filename": "coco_8B_llm_features.dpt"
    },
    "sharegpt4v": {
        "path": "eval_data/sharegpt4v/share-captioner_coco_lcs_sam_1246k_1107.json",
        "ann_path": "eval_data/sharegpt4v/validation_1k.json",
        "root": "eval_data/sharegpt4v/",
        "save_filename": "sv_8B_llm_features.dpt",
        "total_len": 1000
    },
    "urban1k": {
        "ann_path": "eval_data/Urban1k/test.json",
        "root": "eval_data/Urban1k",
        "save_filename": "urban1k_8B_llm_features.dpt"
    },
    "docci": {
        "path": "eval_data/docci/docci_descriptions.jsonlines",
        "ann_path": "eval_data/docci/test.json",
        "root": "eval_data/docci",
        "save_filename": "docci_8B_llm_features.dpt"
    }
}

def load_json(file_path: str) -> List[Dict[str, Any]]:
    try:
        with open(file_path, 'r') as f:
            return json.load(f)
    except Exception as e:
        logging.error(f"Failed to load JSON file {file_path}: {e}")
        raise

def save_embeddings(embeddings: torch.Tensor, save_path: str) -> None:
    try:
        torch.save(embeddings, save_path)
        logging.info(f"Embeddings saved to {save_path}")
    except Exception as e:
        logging.error(f"Failed to save embeddings to {save_path}: {e}")
        raise

def process_multi_texts_dataset(data: List[Dict[str, Any]], llm_model: LLM2Vec, save_path: str) -> None:
    texts = [caption for item in data for caption in item['caption']]
    with torch.no_grad():
        embeddings = llm_model.encode(texts, convert_to_tensor=True, batch_size=196)

    texts_num = len(data[0]['caption'])
    embeddings = embeddings.view(-1, texts_num, embeddings.size(-1))
    save_embeddings(embeddings, save_path)

def process_dataset(texts: List, llm_model: LLM2Vec, save_path: str) -> None:
    with torch.no_grad():
        embeddings = llm_model.encode(texts, convert_to_tensor=True, batch_size=128)
    save_embeddings(embeddings, save_path)

def flickr(llm_model: LLM2Vec) -> None:
    config = CONFIG["flickr"]
    data = load_json(config["ann_path"])
    save_path = os.path.join(config["root"], config["save_filename"])
    process_multi_texts_dataset(data, llm_model, save_path)

def coco(llm_model: LLM2Vec) -> None:
    config = CONFIG["coco"]
    data = load_json(config["ann_path"])
    save_path = os.path.join(config["root"], config["save_filename"])
    process_multi_texts_dataset(data, llm_model, save_path)

def sharegpt4v(llm_model: LLM2Vec) -> None:
    config = CONFIG["sharegpt4v"]
    data = load_json(config["path"])[:config["total_len"]]
    captions = []
    for it in data:
        dic = {}
        dic['caption'] = it['conversations'][1]['value']
        dic['image'] = it['image']
        captions.append(dic)

    json.dump(captions, open(config['ann_path'], 'w'))

    texts = [item['caption'] for item in captions]
    save_path = os.path.join(config["root"], config["save_filename"])
    process_dataset(texts, llm_model, save_path)


def urban1k(llm_model: LLM2Vec) -> None:
    config = CONFIG["urban1k"]
    eval_data = []
    for i in range(1, 1001):
        caption_path = os.path.join(config["root"], f'caption/{i}.txt')
        image_path = os.path.join(config["root"], f'image/{i}.jpg')
        caption = open(caption_path, 'r').readlines()[0]
        eval_data.append({'caption': caption, 'image': image_path})

    json.dump(eval_data, open(config['ann_path'], 'w'))

    texts = [item['caption'] for item in eval_data]
    save_path = os.path.join(config["root"], config["save_filename"])
    process_dataset(texts, llm_model, save_path)

def docci(llm_model: LLM2Vec) -> None:
    config = CONFIG["docci"]
    data = open(config["path"], 'r').readlines()
    eval_data = []
    for line in data:
        dic = json.loads(line)
        if dic['split'] == "test":
            eval_data.append({'caption': dic['description'], 'image': dic['image_file']})

    json.dump(eval_data, open(config['ann_path'], 'w'))

    texts = [item['caption'] for item in eval_data]
    save_path = os.path.join(config["root"], config["save_filename"])
    process_dataset(texts, llm_model, save_path)

def main() -> None:
    llm_model_name = CONFIG["llm_model_name"]
    config = AutoConfig.from_pretrained(llm_model_name, trust_remote_code=True)
    llm_model = AutoModel.from_pretrained(
        llm_model_name,
        torch_dtype=torch.bfloat16,
        config=config,
        trust_remote_code=True,
    )
    tokenizer = AutoTokenizer.from_pretrained(llm_model_name)
    llm_model.config._name_or_path = "meta-llama/Meta-Llama-3-8B-Instruct"
    model = LLM2Vec(llm_model, tokenizer, pooling_mode="mean", max_length=512, doc_max_length=512)

    flickr(model)
    coco(model)
    sharegpt4v(model)
    urban1k(model)
    docci(model)

if __name__ == '__main__':
    main()
```
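For the multi-caption datasets (Flickr30k and COCO), `process_multi_texts_dataset` flattens every caption, encodes the flat list, then reshapes the result to `(num_images, captions_per_image, hidden_dim)`. The sketch below is an illustrative check of that invariant, assuming the annotation file and feature file above already exist on disk; it is not part of the commit.

```python
# check_multi_caption_shapes.py -- illustrative sketch, not part of the commit.
# Verifies that the saved Flickr30k features produced by
# process_multi_texts_dataset() line up with the annotation file.
import json
import torch

ann_path = "eval_data/flickr30k/test.json"
feat_path = "eval_data/flickr30k/flickr30k_8B_llm_features.dpt"

data = json.load(open(ann_path))
feats = torch.load(feat_path, map_location="cpu")

assert feats.shape[0] == len(data), "one row of caption embeddings per image"
assert feats.shape[1] == len(data[0]["caption"]), "one embedding per caption of an image"
print(f"{feats.shape[0]} images x {feats.shape[1]} captions x {feats.shape[2]}-dim features")
```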

llm2clip/data/setup_eval_datasets.sh

Lines changed: 27 additions & 0 deletions
@@ -0,0 +1,27 @@
1+
#!/bin/bash
2+
3+
echo "flickr30k"
4+
mkdir -p eval_data/flickr30k/
5+
wget https://storage.googleapis.com/sfr-vision-language-research/datasets/flickr30k_test.json -O eval_data/flickr30k/test.json
6+
mkdir -P eval_data/flickr30k/flickr30k-images
7+
8+
echo "coco"
9+
mkdir -p eval_data/coco/
10+
wget https://storage.googleapis.com/sfr-vision-language-research/datasets/coco_karpathy_test.json -P eval_data/coco/
11+
mkdir eval_data/coco/val2014
12+
13+
echo "sharegpt4v"
14+
mkdir -p eval_data/sharegpt4v/
15+
wget https://huggingface.co/datasets/Lin-Chen/ShareGPT4V/resolve/main/share-captioner_coco_lcs_sam_1246k_1107.json -P eval_data/sharegpt4v/
16+
17+
echo "Urban1k"
18+
mkdir -p eval_data/
19+
wget https://huggingface.co/datasets/BeichenZhang/Urban1k/resolve/main/Urban1k.zip -P eval_data/
20+
unzip eval_data/Urban1k.zip -d eval_data/
21+
22+
echo "docci"
23+
mkdir -p eval_data/docci/
24+
wget https://storage.googleapis.com/docci/data/docci_descriptions.jsonlines -P eval_data/docci/
25+
26+
echo "Please download the images of flickr30k, coco2014, sharegpt4v and docci manually, and then change the paths in the eval_datasets.yaml accordingly"
27+
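Because the image folders for flickr30k, coco2014, sharegpt4v, and docci have to be downloaded by hand, a small pre-flight check can save a failed embedding run. The sketch below is illustrative only and not part of the commit; the expected paths mirror the ones used by the script above and by `extract_eval_embedding.py`, and it assumes it is run from `llm2clip/data`.

```python
# preflight_eval_data.py -- illustrative sketch, not part of the commit.
# Checks that setup_eval_datasets.sh and the manual image downloads left the
# inputs that extract_eval_embedding.py expects under eval_data/.
import os

expected = [
    "eval_data/flickr30k/test.json",
    "eval_data/flickr30k/flickr30k-images",                              # manual download
    "eval_data/coco/coco_karpathy_test.json",
    "eval_data/coco/val2014",                                            # manual download
    "eval_data/sharegpt4v/share-captioner_coco_lcs_sam_1246k_1107.json",
    "eval_data/Urban1k/caption",                                         # from Urban1k.zip
    "eval_data/Urban1k/image",                                           # from Urban1k.zip
    "eval_data/docci/docci_descriptions.jsonlines",
    "eval_data/docci/images",                                            # manual download
]

missing = [p for p in expected if not os.path.exists(p)]
print("all eval inputs found" if not missing else "missing:\n  " + "\n  ".join(missing))
```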

llm2clip/run.sh

Lines changed: 1 addition & 1 deletion
```diff
@@ -10,7 +10,7 @@ python -m torch.distributed.launch --nproc_per_node=8 \
     --report-to="tensorboard, wandb" \
     --wandb-project-name="LLM2CLIP" \
     --wandb-notes="EVA02-CLIP-L-14-336" \
-    --train-data-list "data/cc3m/cc3m-train-{00..0287}.tar;data/cc12m/cc12m-train-{00..1001}.tar" \
+    --train-data-list "data/cc3m/{00..00287}.tar;data/cc12m/{00..01001}.tar" \
     --train-num-samples-list 2873538 10000225 \
     --eval-data-file=training/eval_datasets.yaml \
     --pretrained=${PRETRAINED} \
```
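The updated `--train-data-list` switches to plain webdataset-style brace patterns for the renamed shards. To see which shard files a pattern such as `data/cc3m/{00..00287}.tar` resolves to, and to spot missing shards before launching training, something like the following sketch can help; it assumes the `braceexpand` package (which webdataset uses for the same purpose) is installed, and is not part of this commit.

```python
# expand_shards.py -- illustrative sketch, not part of the commit.
# Expands the brace patterns from --train-data-list and reports missing shards.
import os
from braceexpand import braceexpand

train_data_list = "data/cc3m/{00..00287}.tar;data/cc12m/{00..01001}.tar"

for pattern in train_data_list.split(";"):
    shards = list(braceexpand(pattern))
    missing = [s for s in shards if not os.path.exists(s)]
    print(f"{pattern}: {len(shards)} shards, {len(missing)} missing")
```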

llm2clip/training/eval_datasets.yaml

Lines changed: 15 additions & 15 deletions
```diff
@@ -1,25 +1,25 @@
 
 - name: ret_flickr
-  json_file: eval_data/flickr30k/annotations/test.json
-  img_root: eval_data/flickr30k/
-  text_feature_path: eval_data/flickr30k/flickr30k_llm2vec_0923_features.dpt
+  json_file: data/eval_data/flickr30k/annotations/test.json
+  img_root: data/eval_data/flickr30k/
+  text_feature_path: data/eval_data/flickr30k/flickr30k_8B_llm_features.dpt
 
 - name: ret_coco
-  json_file: eval_data/coco/annotations/coco_karpathy_test.json
-  img_root: eval_data/coco/images/
-  text_feature_path: eval_data/coco/coco_llm2vec_0923_features.dpt
+  json_file: data/eval_data/coco/annotations/coco_karpathy_test.json
+  img_root: data/eval_data/coco/images/
+  text_feature_path: data/eval_data/coco/coco_8B_llm_features.dpt
 
 - name: sharegpt4v
-  json_file: eval_data/sharegpt4v/annotations/validation_1k.json
-  img_root: eval_data/sharegpt4v/
-  text_feature_path: eval_data/sharegpt4v/sv_llm2vec_0923_features.dpt
+  json_file: data/eval_data/sharegpt4v/annotations/validation_1k.json
+  img_root: data/eval_data/sharegpt4v/
+  text_feature_path: data/eval_data/sharegpt4v/sv_8B_llm_features.dpt
 
 - name: Urban1k
-  json_file: eval_data/Urban1k/annotations/test.json
-  img_root: eval_data/Urban1k/
-  text_feature_path: eval_data/Urban1k/urban1k_llm2vec_0923_features.dpt
+  json_file: data/eval_data/Urban1k/annotations/test.json
+  img_root: data/eval_data/Urban1k/
+  text_feature_path: data/eval_data/Urban1k/urban1k_8B_llm_features.dpt
 
 - name: DOCCI
-  json_file: eval_data/docci/test.json
-  img_root: eval_data/docci/images/
-  text_feature_path: eval_data/docci/docci_llm2vec_0923_features.dpt
+  json_file: data/eval_data/docci/test.json
+  img_root: data/eval_data/docci/images/
+  text_feature_path: data/eval_data/docci/docci_8B_llm_features.dpt
```
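Since every eval entry now points under `data/eval_data/` and at the new `*_8B_llm_features.dpt` files, a quick validation pass over the YAML helps catch stale paths before training. The sketch below is illustrative only; it assumes PyYAML is installed and that it is run from the `llm2clip/` directory, where these relative paths are resolved.

```python
# validate_eval_yaml.py -- illustrative sketch, not part of the commit.
# Confirms every dataset entry in training/eval_datasets.yaml points at files
# and folders that actually exist on disk.
import os
import yaml

with open("training/eval_datasets.yaml") as f:
    datasets = yaml.safe_load(f)

for entry in datasets:
    for key in ("json_file", "img_root", "text_feature_path"):
        path = entry[key]
        status = "ok" if os.path.exists(path) else "MISSING"
        print(f"{entry['name']:12s} {key:18s} {status:8s} {path}")
```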
