# Weights & Biases logging settings.
# NOTE(review): presumably forwarded to wandb.init() by the training
# script — confirm against the consumer.
wandb:
    entity: null  # no entity is set in this file
    resume: "auto"
    mode: "offline"

# Run-level settings: project/run naming, output location, example budgets,
# and the save / eval / generate / log intervals.
# NOTE(review): the *_every values are presumably counted in training
# steps — confirm against the training loop.
experiment:
    project: "tuning"
    # Left empty in this file; presumably supplied per run — confirm.
    name: ""
    output_dir: ""
    max_train_examples_t2i: 20000000
    max_train_examples_mmu: 40000000
    save_every: 5000
    eval_every: 4000
    # Evaluation input files are left empty in this file.
    eval_t2i_file: ""
    eval_mmu_file: ""
    eval_mmu_max_new_tokens: 200
    generate_every: 1000
    log_every: 50
    log_grad_norm_every: 500
    resume_from_checkpoint: "latest"

# Model configuration: the VQ image tokenizer (`vq_model`), the main network
# (`face_model`) with its mixture-of-experts settings, and a
# gradient-checkpointing switch.
model:
    vq_model:
        type: "magvitv2"

    face_model:
        load_from_showo: true
        pretrained_model_path: ""
        load_ckpt: true
        # Explicit null: no checkpoint path is set in this file.
        # NOTE(review): load_ckpt is true while the path is null — presumably
        # the path is supplied by an override at launch; confirm.
        ckpt_path: null
        w_clip_vit: false
        # NOTE(review): 58498 = 50295 + 8192 + 10 + 1, i.e. llm_vocab_size +
        # codebook_size + num_new_special_tokens + 1 — presumably intentional;
        # confirm before changing any of the four values.
        vocab_size: 58498
        llm_vocab_size: 50295
        llm_model_path: "microsoft/phi-1_5"
        codebook_size: 8192
        num_vq_tokens: 256
        num_new_special_tokens: 10
        use_moe: true

        # Original MLP is 2048 x 8192 (intermediate_size).
        moe:
            aux_loss_alpha: 0.001
            num_experts_per_tok: 2
            n_routed_experts_t2i: 8
            n_routed_experts_mmu: 8
            # Total experts = n_routed_experts + n_shared_experts
            n_shared_experts: 1
            scoring_func: "softmax"
            seq_aux: true
            norm_topk_prob: false
            moe_intermediate_size: 256
            pretraining_tp: 1

            use_instance_moe: true

            # Instance-level MoE settings: shared resampler hyper-parameters,
            # then per-task (t2i / mmu) and per-modality (image / text)
            # expert counts.
            instance:
                resampler_intermediate_size: 256
                resampler_depth: 4
                resampler_dim_head: 32
                resampler_heads: 10
                resampler_num_tokens: 256
                resampler_ff_mult: 2
                t2i:
                    image:
                        n_copy_experts: 2
                        n_zero_experts: 1
                        n_noise_experts: 1
                        num_experts_per_instance: 2
                    # Unused
                    text:
                        n_copy_experts: 2
                        n_zero_experts: 2
                        num_experts_per_instance: 2
                mmu:
                    image:
                        n_copy_experts: 2
                        n_clip_experts: 1
                        n_face_experts: 1
                        num_experts_per_instance: 2
                    # Unused
                    text:
                        n_copy_experts: 1
                        n_zero_experts: 1
                        n_clip_experts: 1
                        n_face_experts: 1
                        num_experts_per_instance: 2

    gradient_checkpointing: true

# Dataset selection, data-loader parameters, and preprocessing options.
dataset:
    gen_type: "t2i"
    und_type: "llava_tuning"
    combined_loader_mode: "min_size"
    add_system_prompt: false
    params:
        # All paths are left empty in this file; presumably supplied per
        # run — confirm.
        train_t2i_shards_path_or_url: ""
        test_t2i_shards_path_or_url: ""
        clip_features_dir: ""
        face_features_dir: ""
        # Empty string, like the sibling path entries. A third quote here
        # would open an unterminated double-quoted scalar and break parsing.
        validation_prompts_file: ""
        shuffle_buffer_size: 1000
        num_workers: 32
        resolution: 256
        pin_memory: true
        persistent_workers: true

    preprocessing:
        # Text-token sequence length: stage 1 uses 300, stage 2 uses 700.
        max_seq_length: 700
        # Same value as params.resolution above; the two keys live in
        # different mappings, so they must be kept in sync by hand.
        resolution: 256
        center_crop: false
        random_flip: false

# Optimizer selection and hyper-parameters.
optimizer:
    name: "adamw"
    # Default AdamW parameters.
    # NOTE(review): `5e-05` and `1e-8` have no decimal point. Some YAML 1.1
    # loaders (e.g. plain PyYAML) read such literals as strings, while
    # OmegaConf's loader reads them as floats — confirm which loader
    # consumes this file.
    params:
        learning_rate: 5e-05
        # Scale learning rate by total batch size.
        scale_lr: false
        beta1: 0.9
        beta2: 0.999
        weight_decay: 0.01
        epsilon: 1e-8

# Learning-rate schedule.
lr_scheduler:
    scheduler: "cosine"
    params:
        # `${...}` is an interpolation reference to optimizer.params.learning_rate
        # (OmegaConf-style syntax), so the two values cannot drift apart.
        learning_rate: ${optimizer.params.learning_rate}
        # Stage 1 used 5000.
        warmup_steps: 10000

# Training-loop settings: batch sizes, precision, step budget, loss weights,
# loss/mask-schedule choices, and the training stage.
training:
    gradient_accumulation_steps: 4
    noise_type: "mask"
    # Stage 1 used 6.
    batch_size_t2i: 4
    # The language-modelling batch size and lm_coeff below are both zero
    # in this file.
    batch_size_lm: 0
    # Stage 1 used 9.
    batch_size_mmu: 10
    mixed_precision: "bf16"
    enable_tf32: true
    seed: 20250223
    # Author's note: 10000 steps = 10 epochs with 1 node, 4 GPUs,
    # 4 gradient-accumulation steps and (8+12) batch size; "100 epochs".
    # NOTE(review): the batch sizes in this file are 4 and 10, not 8 and 12,
    # so this estimate may be stale — confirm.
    max_train_steps: 65000
    overfit_one_batch: false
    cond_dropout_prob: 0.1
    min_masking_rate: 0.0
    label_smoothing: 0.0
    # NOTE(review): null presumably disables gradient-norm clipping — confirm.
    max_grad_norm: null
    guidance_scale: 0.0
    generation_timesteps: 12
    # Per-task loss coefficients.
    t2i_coeff: 1.0
    lm_coeff: 0.0
    mmu_coeff: 1.0
    t2i_dwdse_coeff: 0.001

    # Alternative: t2i_loss_type: "cross_entropy"
    t2i_loss_type: "with_DWDSE"

    # Alternative: mask_schedule: "cosine"
    mask_schedule: "linear_new"

    stage: 2
    stage2_with_desc: true