---
# Boolean flag, written in canonical lowercase form.
# NOTE(review): the leading underscore in the key looks deliberate -- confirm
# the consumer reads this exact name before renaming it.
_gradient_checkpointing: true

# Pretrained component identifiers. Every *_id below except mllm_id points at
# the same repository string.
vae_id: 'Efficient-Large-Model/Sana_1600M_512px_diffusers'
noise_scheduler_id: 'Efficient-Large-Model/Sana_1600M_512px_diffusers'
scheduler_id: 'Efficient-Large-Model/Sana_1600M_512px_diffusers'
diffusion_model_id: 'Efficient-Large-Model/Sana_1600M_512px_diffusers'
mllm_id: 'llava-hf/llava-onevision-qwen2-0.5b-ov-hf'

# Image / latent geometry. target_image_size matches the "512px" in the
# repository name above.
# NOTE(review): presumably vae_downsample_f is the VAE's spatial downsampling
# factor and in_channels the latent channel count -- confirm against the model.
vae_downsample_f: 32
in_channels: 32
target_image_size: 512
# Optimisation settings.
# NOTE(review): most key names here mirror Hugging Face TrainingArguments
# fields; presumably they are parsed into it -- confirm in the training script.
per_device_train_batch_size: 16
num_train_epochs: 10.0
learning_rate: 0.00005
warmup_steps: 2000
lr_scheduler_type: 'cosine_with_min_lr'
lr_scheduler_kwargs:
  min_lr: 0.000001

# eval_steps and save_steps are kept at the same value.
eval_steps: 500
save_steps: 500

# Objective / model options; their semantics are defined by the consuming code.
loss_type: 'flow'
num_metaqueries: 256
# Module paths listed for freezing.
modules_to_freeze:
  - 'vae'
  - 'model.mllm_backbone'

# Module paths listed for unfreezing. The entry below is nested under
# 'model.mllm_backbone', which also appears in modules_to_freeze.
# NOTE(review): presumably unfreezing is applied after freezing, so this
# sub-module stays trainable -- confirm the order in the training code.
modules_to_unfreeze:
  - 'model.mllm_backbone.language_model.model.embed_tokens'

# Number of hidden layers configured for the connector.
connector_num_hidden_layers: 24

# Relative path to a DeepSpeed JSON config.
# NOTE(review): the file name suggests ZeRO stage 1, and the path is presumably
# resolved from the launch directory -- confirm both.
deepspeed: 'configs/zero1.json'

# Training datasets: dataset name -> integer.
# NOTE(review): -1 presumably means "no limit / use every sample" -- confirm
# against the dataset loader.
train_datasets:
  inst2m: -1

# Evaluation uses the same dataset name that appears under train_datasets.
eval_dataset: 'inst2m'
