# Weights & Biases logging settings.
wandb:
  # Explicit null: no entity is configured in this file.
  entity: null
  # Commented-out run id, kept for reference.
  # run_id: askkz9i2
  resume: "auto"

# Experiment bookkeeping: names, output location, example caps, and the
# intervals for saving / evaluating / generating / logging.
experiment:
  project: "training"
  name: "hermesflow-training-pairdpo"
  output_dir: "hermesflow-training-pairdpo_vqa_iteration1"

  # Caps on the number of training examples for the t2i and mmu streams.
  max_train_examples_t2i: 20000000
  max_train_examples_mmu: 40000000

  # Periodic-action intervals.
  # NOTE(review): presumably counted in training steps — confirm.
  save_every: 1000
  eval_every: 25000
  generate_every: 10000
  log_every: 50
  log_grad_norm_every: 500

  # NOTE(review): "latest" presumably selects the most recent checkpoint
  # found for this run — confirm against the training script.
  resume_from_checkpoint: "latest"

# Model settings: the `vq_model` and `showo` sub-configs plus the
# gradient-checkpointing switch.
# NOTE(review): every " " below is a single-space placeholder string;
# presumably a real name/path must be supplied before running.
model:
  vq_model:
    type: "magvitv2"
    vq_model_name: " "

  showo:
    load_from_showo: true
    pretrained_model_path: " "
    w_clip_vit: false
    # NOTE(review): llm_vocab_size + codebook_size + num_new_special_tokens
    # = 50295 + 8192 + 10 = 58497, one less than vocab_size — confirm the
    # extra slot is intended.
    vocab_size: 58498
    llm_vocab_size: 50295
    llm_model_path: " "
    codebook_size: 8192
    num_vq_tokens: 256
    # Special tokens counted here:
    # <|soi|> <|eoi|> <|sov|> <|eov|> <|t2i|> <|mmu|> <|t2v|> <|v2v|>
    # <|lvg|> <|pad|>
    num_new_special_tokens: 10

  gradient_checkpointing: true

# Dataset selection, data-loader parameters, and preprocessing options.
dataset:
  gen_type: "t2i"
  und_type: "captioning"
  combined_loader_mode: "max_size_cycle"
  add_system_prompt: false

  # Data source and loader parameters.
  params:
    # NOTE(review): single-space placeholder; presumably the real DPO data
    # path must be filled in before running.
    dpo_data_path: " "
    add_caption_prompt: true
    shuffle_buffer_size: 1000
    num_workers: 32
    # Same value as preprocessing.resolution below; keep the two in sync.
    resolution: 256
    pin_memory: true
    persistent_workers: true

  # Per-sample preprocessing.
  preprocessing:
    # Applies to text tokens.
    max_seq_length: 128
    # Same value as params.resolution above; keep the two in sync.
    resolution: 256
    center_crop: false
    random_flip: false

# Optimizer selection and hyper-parameters.
optimizer:
  name: "adamw"
  # Default AdamW parameters.
  params:
    # Floats in scientific notation carry an explicit decimal point:
    # YAML 1.1 loaders (e.g. PyYAML) resolve dot-less forms such as `2e-5`
    # to strings rather than floats. The numeric values are unchanged.
    # lr_scheduler.params.learning_rate interpolates this value.
    learning_rate: 2.0e-5
    # Whether to scale the learning rate by the total batch size.
    scale_lr: false
    beta1: 0.9
    beta2: 0.999
    weight_decay: 0.01
    epsilon: 1.0e-8

# Learning-rate schedule.
lr_scheduler:
  scheduler: "cosine"
  params:
    # `${...}` reference to optimizer.params.learning_rate, so the rate is
    # defined in one place only.
    # NOTE(review): relies on the config loader resolving `${...}`
    # interpolations (looks like OmegaConf syntax — confirm).
    learning_rate: ${optimizer.params.learning_rate}
    warmup_steps: 5000

# Training-loop settings: batching, precision, masking, and loss weights.
training:
  gradient_accumulation_steps: 1
  noise_type: "mask"

  # Batch sizes for the t2i and mmu streams; the lm entry is disabled.
  batch_size_t2i: 1
  # batch_size_lm: 4
  batch_size_mmu: 1

  # Numerics and reproducibility.
  mixed_precision: "bf16"
  enable_tf32: true
  seed: 10086

  # To be determined according to the scale of the high-quality dataset.
  max_train_steps: 50000
  overfit_one_batch: false

  # Regularisation / masking knobs.
  cond_dropout_prob: 0.1
  min_masking_rate: 0.0
  label_smoothing: 0.0
  # Explicit null: no max gradient norm is configured.
  max_grad_norm: null

  # Generation settings.
  guidance_scale: 0.0
  generation_timesteps: 12

  # Loss coefficients.
  # NOTE(review): lm_coeff is still set although batch_size_lm is commented
  # out above — confirm whether the lm term is actually used.
  t2i_coeff: 1.0
  lm_coeff: 0.1
  mmu_coeff: 1.0

  # NOTE(review): presumably the DPO beta, given the "pairdpo" run name —
  # confirm against the training script.
  beta: 0.2
