4 changes: 3 additions & 1 deletion .gitignore
@@ -28,4 +28,6 @@ results/
*.slurm
*.arrow
/shards/*
*.png
*.png

CLAUDE.md
2 changes: 1 addition & 1 deletion data/advanced_datasets.py
@@ -16,7 +16,7 @@ def __init__(
max_sample_length: int = 1024,
seq_length: int = 1024,
num_of_sequences: int = 1024,
queue_size: int = 2048,
queue_size: int = 1024,
max_images_per_example: int = 4,
max_images_per_knapsack: int = 18,
):
3 changes: 2 additions & 1 deletion data/collators.py
@@ -39,7 +39,8 @@ def _discard_samples_that_are_too_long(self, batch, max_length):
if len(ids) <= max_length
]
if not filtered:
return [], [], [], []
print("Warning: No samples in the batch are short enough to fit within the max_length limit.")
return {"input_ids": [], "labels": [], "attention_mask": [], "images": []}
batch_token_ids, batch_labels, batch_attentions, batch_images = zip(*filtered)
return {"input_ids": list(batch_token_ids), "labels": list(batch_labels), "attention_mask": list(batch_attentions), "images": list(batch_images)}
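Since `_discard_samples_that_are_too_long` now returns a dict of empty lists rather than an empty tuple, callers can detect the degenerate case with a single key lookup. A minimal caller-side guard, assuming the collator forwards this dict unchanged (the helper name `is_empty_batch` is hypothetical):

```python
def is_empty_batch(batch: dict) -> bool:
    """True when the length filter discarded every sample in the batch.

    Assumes the dict layout returned above: lists under "input_ids",
    "labels", "attention_mask" and "images" that are all empty together.
    """
    return len(batch.get("input_ids", [])) == 0


# Hypothetical usage in a training loop (sketch, not the repo's code):
# for batch in train_loader:
#     if is_empty_batch(batch):
#         continue  # skip the step instead of feeding empty tensors to the model
```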

32 changes: 27 additions & 5 deletions models/config.py
@@ -54,24 +54,46 @@ class TrainConfig:
lr_mp: float = 0.00512
lr_backbones: float = 5e-5
data_cutoff_idx: int = None
val_ratio: float = 0.025
val_ratio: float = 0.05
batch_size: int = 8
gradient_accumulation_steps: int = 8
max_grad_norm: float = 1.0
eval_in_epochs: bool = True
eval_interval: int = gradient_accumulation_steps * 100
stats_log_interval: int = gradient_accumulation_steps * 25
max_training_steps: int = 5000
max_training_steps: int = 20000
max_images_per_example: int = 4
max_images_per_knapsack: int = 18
max_sample_length: int = 1024
compile: bool = False
resume_from_vlm_checkpoint: bool = False # Whether to resume training from a checkpoint of the whole VLM or to start from scratch
train_dataset_path: str = 'HuggingFaceM4/the_cauldron'
train_dataset_name: tuple[str, ...] = ("all", )
train_dataset_path: str = 'HuggingFaceM4/cauldron_v3_test'
# train_dataset_name: tuple[str, ...] = ("all", )
train_dataset_name: tuple[str, ...] = ('CoSyn_400k_chart', 'CoSyn_400k_chemical', 'CoSyn_400k_circuit', 'CoSyn_400k_diagram',
'CoSyn_400k_document', 'CoSyn_400k_graphic', 'CoSyn_400k_math', 'CoSyn_400k_music',
'CoSyn_400k_nutrition', 'CoSyn_400k_table', 'a_okvqa', 'ai2d', 'ai2d(gpt4v)', 'ai2d(internvl)',
'aokvqa', 'arxivqa', 'blockdiagramcomputerized', 'blockdiagramhandwritten',
'cambrian(filtered)_processed', 'chart2text', 'chart2text(cauldron)', 'chartqa',
'chrome_writting', 'clevr', 'clevr_math(mathv360k)', 'cocoqa', 'diagram_image_to_text',
'docvqa', 'drivelm', 'dvqa', 'figureqa', 'figureqa(mathv360k)', 'finqa', 'funsd', 'geo170k(align)',
'geo170k(qa)', 'geo3k', 'geometry3k(mathv360k)', 'geomverse', 'geos(mathv360k)', 'groundui',
'hateful_memes', 'hitab', 'hme100k', 'iam', 'iconqa', 'iconqa(mathv360k)', 'idk', 'iiit5k',
'image_textualization(filtered)', 'imgur5k', 'infographic(gpt4v)', 'infographic_vqa',
'infographic_vqa_llava_format', 'intergps', 'latex_handwritten', 'latexformulas', 'llavar_gpt4_20k',
'lnqa', 'localized_narratives', 'lrv_chart', 'lrv_normal(filtered)', 'lvis_instruct4v', 'mapqa',
'mapqa(mathv360k)', 'maptext', 'mavis_math_metagen', 'mavis_math_rule_geo', 'memotion',
'mimic_cgd', 'mmra', 'mmsoc_memotion', 'multihiertt', 'nlvr2', 'ocrvqa', 'oodvqa', 'orand_car_a',
'pathvqa', 'pdfvqa', 'plotqa', 'pmc_vqa(mathv360k)', 'raven', 'rendered_text', 'robut_sqa',
'robut_wikisql', 'robut_wtq', 'scienceqa', 'scienceqa(nona_context)', 'screen2words', 'screenqa',
'sharegpt4o', 'sharegpt4v(coco)', 'sharegpt4v(knowledge)', 'sharegpt4v(llava)', 'sharegpt4v(sam)',
'sketchyvqa', 'slidevqa', 'spark', 'spatialsense', 'spot_the_diff', 'sroie', 'st_vqa',
'super_clevr(mathv360k)', 'synthdog', 'tabmwp', 'tabmwp(mathv360k)', 'tallyqa', 'tat_qa',
'textcaps', 'textocr(gpt4v)', 'textvqa', 'tqa', 'ureader_cap', 'ureader_ie', 'ureader_kg_processed',
'vision_flan(filtered)', 'vistext', 'vistext(cauldron)', 'visual7w', 'visualmrc',
'visualwebinstruct(filtered)', 'vizwiz(mathv360k)', 'vqarad', 'vqav2', 'vsr', 'websight', 'wildvision', 'wordart', 'yesbut')
wandb_entity: str = "HuggingFace" # Indicate the entity to log to in wandb
log_wandb: bool = True
use_lmms_eval: bool = True # Use lmms-eval for evaluation
lmms_eval_tasks: str = 'mmstar,mmmu,ocrbench,textvqa' # Pass additional tasks as one string, separated by commas without spaces (e.g. 'mmstar,mmmu,ocrbench')
lmms_eval_limit: int = 2000
lmms_eval_limit: int = None
lmms_eval_batch_size: int = 128
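The enlarged `train_dataset_name` tuple now lists individual dataset configs instead of the single `("all",)` sentinel. A minimal sketch of how such a (path, names) pair is typically materialized with the `datasets` API already imported in `train.py`; the function below is an assumption for illustration, not the repo's own loader:

```python
from datasets import load_dataset, concatenate_datasets, get_dataset_config_names


def build_train_split(path: str, names: tuple[str, ...]):
    """Load every named config of `path` and concatenate them into one training split."""
    if names == ("all",):
        # Assumed meaning of the "all" sentinel: every config the dataset exposes.
        names = tuple(get_dataset_config_names(path))
    parts = [load_dataset(path, name, split="train") for name in names]
    return concatenate_datasets(parts)


# e.g. build_train_split(TrainConfig.train_dataset_path, TrainConfig.train_dataset_name)
```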
29 changes: 10 additions & 19 deletions train.py
@@ -10,7 +10,7 @@
from statistics import mean
from dataclasses import asdict
from datasets import load_dataset, concatenate_datasets, get_dataset_config_names
from torch.utils.data import DataLoader, DistributedSampler
from torch.utils.data import DataLoader
import torch.distributed as dist
from torch.nn.parallel import DistributedDataParallel

@@ -31,10 +31,6 @@
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Fix for "Decompressed data too large" error with certain PNGs
import PIL.PngImagePlugin
PIL.PngImagePlugin.MAX_TEXT_CHUNK = 100 * 1024 * 1024

def seed_worker(worker_id):
worker_seed = torch.initial_seed() % 2**32
numpy.random.seed(worker_seed)
@@ -122,11 +118,13 @@ def get_dataloaders(train_cfg, vlm_cfg)
train_size = total_samples - val_size

train_dataset = VQADataset(train_ds.select(range(train_size)), tokenizer, image_processor, vlm_cfg.mp_image_token_length)

train_dataset = ConstantLengthDataset(train_dataset, infinite=False, max_sample_length=train_cfg.max_sample_length, seq_length=vlm_cfg.lm_max_length, num_of_sequences=train_cfg.batch_size*64, queue_size=train_cfg.batch_size*64*2,
max_images_per_example=train_cfg.max_images_per_example, max_images_per_knapsack=train_cfg.max_images_per_knapsack)

val_dataset = VQADataset(train_ds.select(range(train_size, total_samples)), tokenizer, image_processor, vlm_cfg.mp_image_token_length)

val_dataset = ConstantLengthDataset(val_dataset, infinite=False, max_sample_length=train_cfg.max_sample_length, seq_length=vlm_cfg.lm_max_length, num_of_sequences=train_cfg.batch_size*64, queue_size=train_cfg.batch_size*64*2,
max_images_per_example=train_cfg.max_images_per_example, max_images_per_knapsack=train_cfg.max_images_per_knapsack)

# Create collators
vqa_collator = VQACollator(tokenizer, vlm_cfg.lm_max_length)

@@ -139,26 +137,19 @@ def get_dataloaders(train_cfg, vlm_cfg)
train_dataset,
batch_size=train_cfg.batch_size, # =per device BS in DDP
collate_fn=vqa_collator,
num_workers=8,
num_workers=2,
pin_memory=True,
drop_last=True,
worker_init_fn=seed_worker,
generator=g,
)

val_sampler = DistributedSampler(
val_dataset,
rank=get_rank(),
num_replicas=get_world_size(),
shuffle=False # Usually False for validation
)

val_loader = DataLoader(
val_dataset,
batch_size=train_cfg.batch_size,
sampler=val_sampler,
batch_size=train_cfg.batch_size, # =per device BS in DDP
collate_fn=vqa_collator,
num_workers=8,
shuffle=False,
num_workers=2,
pin_memory=True,
drop_last=True,
worker_init_fn=seed_worker,
@@ -350,7 +341,7 @@ def train(train_cfg, vlm_cfg)
torch.cuda.empty_cache()
with torch.no_grad():
total_val_loss = 0
for batch in val_loader:
for batch in synchronized_dataloader_step(val_loader, is_dist()):
images = batch["images"]
input_ids = batch["input_ids"].to(device)
labels = batch["labels"].to(device)
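The validation loop now iterates via `synchronized_dataloader_step(val_loader, is_dist())` instead of looping over `val_loader` directly. A sketch of what such a helper typically does under DDP; the name matches the call above, but the body is an assumption rather than the repo's implementation: each rank advances its own iterator, then all ranks agree on whether any of them ran out of batches, so they all stop on the same step and no rank hangs inside a collective.

```python
import torch
import torch.distributed as dist


def synchronized_dataloader_step(loader, is_distributed: bool):
    """Yield batches while keeping all DDP ranks in lock-step (assumed behaviour)."""
    device = "cuda" if torch.cuda.is_available() else "cpu"  # NCCL requires GPU tensors
    it = iter(loader)
    while True:
        try:
            batch = next(it)
            exhausted = torch.zeros(1, device=device)
        except StopIteration:
            batch = None
            exhausted = torch.ones(1, device=device)
        if is_distributed:
            # If any rank is out of data, every rank sees a value > 0 and stops too.
            dist.all_reduce(exhausted, op=dist.ReduceOp.MAX)
        if exhausted.item() > 0:
            return
        yield batch
```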