# Gradient-checkpointing toggle.
# Written as canonical lowercase `true`, the boolean spelling accepted by
# YAML 1.1 and YAML 1.2 loaders alike (the capitalised `True` is not
# recognised by the stricter schemas).
# NOTE(review): the leading underscore in the key is kept exactly as found;
# the code that reads this key is not visible here, so confirm the spelling
# against the consumer before renaming.
_gradient_checkpointing: true
# --- Pretrained components -------------------------------------------------
# The diffusion model, the VAE and both scheduler entries all carry the same
# identifier string; only the multimodal LLM comes from a different one.
# NOTE(review): presumably these are Hugging Face Hub repo ids - confirm
# against the loader.
diffusion_model_id: "Efficient-Large-Model/Sana_1600M_512px_diffusers"
vae_id: "Efficient-Large-Model/Sana_1600M_512px_diffusers"
scheduler_id: "Efficient-Large-Model/Sana_1600M_512px_diffusers"
noise_scheduler_id: "Efficient-Large-Model/Sana_1600M_512px_diffusers"
mllm_id: "llava-hf/llava-onevision-qwen2-0.5b-ov-hf"

# --- Image / latent geometry -----------------------------------------------
# NOTE(review): presumably target_image_size / vae_downsample_f
# (512 / 32 = 16) is the latent side length and in_channels is the latent
# channel count - verify against the model code.
target_image_size: 512
vae_downsample_f: 32
in_channels: 32
# --- Run length and batch size ---------------------------------------------
num_train_epochs: 10.0
per_device_train_batch_size: 64

# --- Learning-rate schedule ------------------------------------------------
# Peak rate is 1e-4 and the configured floor (min_lr) is 1e-5, i.e. one tenth
# of the peak. Both are kept in plain decimal notation so every loader reads
# them as floats.
learning_rate: 0.0001
warmup_steps: 5000
lr_scheduler_type: "cosine_with_min_lr"
lr_scheduler_kwargs:
  min_lr: 0.00001

# --- Objective -------------------------------------------------------------
# NOTE(review): the set of accepted loss_type values is defined by the
# training code, which is not visible here.
loss_type: "flow"

# --- Evaluation / checkpoint cadence ---------------------------------------
# Both intervals are set to the same step count.
eval_steps: 1000
save_steps: 1000
# --- Query / connector sizing ----------------------------------------------
# NOTE(review): 256 equals (512 / 32)^2 given the image-geometry values in
# this file; whether that relationship is required is not visible here.
num_metaqueries: 256
connector_num_hidden_layers: 24

# --- Trainable-parameter selection -----------------------------------------
# The whole "model.mllm_backbone" prefix is listed as frozen, while the
# embed_tokens path beneath that same prefix is listed again under
# modules_to_unfreeze.
# NOTE(review): presumably unfreezing is applied after freezing so the
# embedding table stays trainable - confirm the order in the training code.
modules_to_freeze:
  - "vae"
  - "model.mllm_backbone"

modules_to_unfreeze:
  - "model.mllm_backbone.language_model.model.embed_tokens"

# --- Distributed runtime ---------------------------------------------------
# Path to the DeepSpeed JSON config.
# NOTE(review): the path is relative; which directory it resolves against
# depends on how the job is launched.
deepspeed: "configs/zero1.json"

# --- Datasets --------------------------------------------------------------
# The evaluation dataset name matches the single training dataset key.
# NOTE(review): confirm that the loader selects a held-out split for
# evaluation rather than reusing training samples.
eval_dataset: "cc12m_t2i"

# Mapping of dataset name -> integer setting.
# NOTE(review): -1 presumably means "no cap / use every sample"; verify
# against the dataset-loading code.
train_datasets:
  cc12m_t2i: -1
