+ import os
+ import json
+ import torch
+ import logging
+ from llm2vec import LLM2Vec
+ from typing import List, Dict, Any
+ from transformers import AutoModel, AutoConfig, AutoTokenizer
+
+ os.environ["CUDA_VISIBLE_DEVICES"] = "0"
+
+ logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
+
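+ # Precompute text-side embeddings for the retrieval eval sets (Flickr30k, COCO,
+ # ShareGPT4V, Urban1k, DOCCI) with the LLM2CLIP Llama-3-8B text encoder wrapped
+ # in LLM2Vec, and cache them to the .dpt files listed in CONFIG below.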
+ CONFIG = {
+     "llm_model_name": "microsoft/LLM2CLIP-Llama-3-8B-Instruct-CC-Finetuned",
+     "flickr": {
+         "ann_path": "eval_data/flickr30k/test.json",
+         "root": "eval_data/flickr30k/",
+         "save_filename": "flickr30k_8B_llm_features.dpt"
+     },
+     "coco": {
+         "ann_path": "eval_data/coco/coco_karpathy_test.json",
+         "root": "eval_data/coco/",
+         "save_filename": "coco_8B_llm_features.dpt"
+     },
+     "sharegpt4v": {
+         "path": "eval_data/sharegpt4v/share-captioner_coco_lcs_sam_1246k_1107.json",
+         "ann_path": "eval_data/sharegpt4v/validation_1k.json",
+         "root": "eval_data/sharegpt4v/",
+         "save_filename": "sv_8B_llm_features.dpt",
+         "total_len": 1000
+     },
+     "urban1k": {
+         "ann_path": "eval_data/Urban1k/test.json",
+         "root": "eval_data/Urban1k",
+         "save_filename": "urban1k_8B_llm_features.dpt"
+     },
+     "docci": {
+         "path": "eval_data/docci/docci_descriptions.jsonlines",
+         "ann_path": "eval_data/docci/test.json",
+         "root": "eval_data/docci",
+         "save_filename": "docci_8B_llm_features.dpt"
+     }
+ }
+
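+ # Note: flickr and coco read pre-existing annotation files, while sharegpt4v,
+ # urban1k, and docci rebuild their ann_path files from the raw caption sources
+ # below before encoding.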
+ def load_json(file_path: str) -> List[Dict[str, Any]]:
+     """Load a JSON annotation file, logging and re-raising on failure."""
+     try:
+         with open(file_path, 'r') as f:
+             return json.load(f)
+     except Exception as e:
+         logging.error(f"Failed to load JSON file {file_path}: {e}")
+         raise
+
+ def save_embeddings(embeddings: torch.Tensor, save_path: str) -> None:
+     """Serialize the embedding tensor with torch.save, logging and re-raising on failure."""
+     try:
+         torch.save(embeddings, save_path)
+         logging.info(f"Embeddings saved to {save_path}")
+     except Exception as e:
+         logging.error(f"Failed to save embeddings to {save_path}: {e}")
+         raise
+
+ def process_multi_texts_dataset(data: List[Dict[str, Any]], llm_model: LLM2Vec, save_path: str) -> None:
+     """Encode datasets with several captions per image and save a (num_images, captions_per_image, dim) tensor."""
+     texts = [caption for item in data for caption in item['caption']]
+     with torch.no_grad():
+         embeddings = llm_model.encode(texts, convert_to_tensor=True, batch_size=196)
+
+     # The reshape assumes every item carries the same number of captions as the first one.
+     texts_num = len(data[0]['caption'])
+     embeddings = embeddings.view(-1, texts_num, embeddings.size(-1))
+     save_embeddings(embeddings, save_path)
+
+ def process_dataset(texts: List[str], llm_model: LLM2Vec, save_path: str) -> None:
+     """Encode a flat list of captions and save a (num_texts, dim) tensor."""
+     with torch.no_grad():
+         embeddings = llm_model.encode(texts, convert_to_tensor=True, batch_size=128)
+     save_embeddings(embeddings, save_path)
+
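+ # Hypothetical downstream usage sketch (not part of this script): the cached
+ # tensors can be reloaded with torch.load, e.g.
+ #   feats = torch.load(os.path.join(CONFIG["coco"]["root"], CONFIG["coco"]["save_filename"]))
+ #   feats.shape  # (num_images, captions_per_image, dim) for flickr/coco, (num_texts, dim) otherwise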
+ def flickr(llm_model: LLM2Vec) -> None:
+     config = CONFIG["flickr"]
+     data = load_json(config["ann_path"])
+     save_path = os.path.join(config["root"], config["save_filename"])
+     process_multi_texts_dataset(data, llm_model, save_path)
+
+ def coco(llm_model: LLM2Vec) -> None:
+     config = CONFIG["coco"]
+     data = load_json(config["ann_path"])
+     save_path = os.path.join(config["root"], config["save_filename"])
+     process_multi_texts_dataset(data, llm_model, save_path)
+
+ def sharegpt4v(llm_model: LLM2Vec) -> None:
+     config = CONFIG["sharegpt4v"]
+     data = load_json(config["path"])[:config["total_len"]]
+     captions = []
+     for it in data:
+         captions.append({'caption': it['conversations'][1]['value'], 'image': it['image']})
+
+     with open(config['ann_path'], 'w') as f:
+         json.dump(captions, f)
+
+     texts = [item['caption'] for item in captions]
+     save_path = os.path.join(config["root"], config["save_filename"])
+     process_dataset(texts, llm_model, save_path)
+
+ def urban1k(llm_model: LLM2Vec) -> None:
+     config = CONFIG["urban1k"]
+     eval_data = []
+     for i in range(1, 1001):
+         caption_path = os.path.join(config["root"], f'caption/{i}.txt')
+         image_path = os.path.join(config["root"], f'image/{i}.jpg')
+         with open(caption_path, 'r') as f:
+             caption = f.readlines()[0]
+         eval_data.append({'caption': caption, 'image': image_path})
+
+     with open(config['ann_path'], 'w') as f:
+         json.dump(eval_data, f)
+
+     texts = [item['caption'] for item in eval_data]
+     save_path = os.path.join(config["root"], config["save_filename"])
+     process_dataset(texts, llm_model, save_path)
+
+ def docci(llm_model: LLM2Vec) -> None:
+     config = CONFIG["docci"]
+     eval_data = []
+     with open(config["path"], 'r') as f:
+         for line in f:
+             dic = json.loads(line)
+             if dic['split'] == "test":
+                 eval_data.append({'caption': dic['description'], 'image': dic['image_file']})
+
+     with open(config['ann_path'], 'w') as f:
+         json.dump(eval_data, f)
+
+     texts = [item['caption'] for item in eval_data]
+     save_path = os.path.join(config["root"], config["save_filename"])
+     process_dataset(texts, llm_model, save_path)
+
+ def main() -> None:
+     llm_model_name = CONFIG["llm_model_name"]
+     config = AutoConfig.from_pretrained(llm_model_name, trust_remote_code=True)
+     llm_model = AutoModel.from_pretrained(
+         llm_model_name,
+         torch_dtype=torch.bfloat16,
+         config=config,
+         trust_remote_code=True,
+     )
+     tokenizer = AutoTokenizer.from_pretrained(llm_model_name)
+     # Point the config at the base checkpoint, presumably so code that inspects
+     # config._name_or_path treats this fine-tuned model as Meta-Llama-3-8B-Instruct.
+     llm_model.config._name_or_path = "meta-llama/Meta-Llama-3-8B-Instruct"
+     model = LLM2Vec(llm_model, tokenizer, pooling_mode="mean", max_length=512, doc_max_length=512)
+
+     flickr(model)
+     coco(model)
+     sharegpt4v(model)
+     urban1k(model)
+     docci(model)
+
+ if __name__ == '__main__':
+     main()
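+ # Usage sketch (assuming this file is saved as extract_text_features.py and the
+ # eval_data/ directories from CONFIG are in place):
+ #   python extract_text_features.py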