Skip to content

Commit de4c07f

Browse files
authored
clip : cap max image size 1024 for qwen vl model (#13478)
1 parent 10d2af0 commit de4c07f

File tree

1 file changed

+8
-4
lines changed

1 file changed

+8
-4
lines changed

tools/mtmd/clip.cpp

+8-4
Original file line numberDiff line numberDiff line change
@@ -1909,16 +1909,20 @@ struct clip_model_loader {
19091909
} break;
19101910
case PROJECTOR_TYPE_QWEN2VL:
19111911
{
1912-
// max image size = sqrt(max_pixels)
1913-
// https://huggingface.co/Qwen/Qwen2-VL-7B-Instruct/blob/main/preprocessor_config.json
1914-
hparams.image_size = 3584;
1912+
// max image size = sqrt(max_pixels) = 3584
1913+
// ref: https://huggingface.co/Qwen/Qwen2-VL-7B-Instruct/blob/main/preprocessor_config.json
1914+
// however, the model uses an unreasonable amount of memory for image sizes above 1024, so we force it to 1024; otherwise it is unusable
1915+
// ref: https://huggingface.co/Qwen/Qwen2-VL-2B-Instruct/discussions/10
1916+
hparams.image_size = 1024;
19151917
hparams.warmup_image_size = hparams.patch_size * 8;
19161918
} break;
19171919
case PROJECTOR_TYPE_QWEN25VL:
19181920
{
19191921
// max image size = sqrt(max_pixels)
19201922
// https://huggingface.co/Qwen/Qwen2.5-VL-7B-Instruct/blob/main/preprocessor_config.json
1921-
hparams.image_size = 3584;
1923+
// however, the model uses an unreasonable amount of memory for image sizes above 1024, so we force it to 1024; otherwise it is unusable
1924+
// ref: https://huggingface.co/Qwen/Qwen2-VL-2B-Instruct/discussions/10
1925+
hparams.image_size = 1024;
19221926
hparams.warmup_image_size = hparams.patch_size * 8;
19231927
get_u32(KEY_WIN_ATTN_PATTERN, hparams.n_wa_pattern);
19241928
} break;

0 commit comments

Comments
 (0)