# Settings grouped under the `wandb` key (presumably consumed by the
# Weights & Biases logger — confirm against the training script).
wandb:
  entity: null
  # Disabled pin to a specific run; uncomment to set it explicitly.
  # run_id: askkz9i2
  resume: "auto"

# Run bookkeeping: naming, output locations, periodic-action intervals,
# and evaluation options.
experiment:
  project: "pretrain_stage1"
  name: ""
  output_dir: ""
  ablation_name: "pretrain_1.0_data"
  dataset_ratio: 1  # alt: 0.25
  max_train_examples_mmu: 10000000

  # Intervals; presumably counted in training steps — TODO confirm.
  save_every: 10000
  eval_every: 10000
  generate_every: 1000
  log_every: 50
  log_grad_norm_every: 500

  resume_from_checkpoint: false  # alt: 'latest'
  weight_path: ""
  img_show_path: ""

  eval:
    use_fvd: true
    max_eval_iters: 10000
    use_frame_metrics: true
    eval_generate_times: 4
    max_generate_batchsize: 20
    action_conditioned: true
    max_decode_batchsize: 2
# Model components: the VQ tokenizer, the `showo` backbone settings, and
# model-level evaluation assets.
model:
  vq_model:
    type: "Compressive_magvit_v2"
    vq_model_name: ""
    mask_l: 0
    mask_h: 1
    latent_size: 4
    patch_size: 2
    num_vq_embeddings: 8192
    num_dyn_embeddings: 8192
    pretrained_model_path: ""
    # The literal string "none", not a YAML null.
    pretrained_model_name_or_path: "none"

  clip_model: ""
  cond_enable: true  # alt: false

  showo:
    load_from_showo: true  # alt: False
    pretrained_model_path: ""
    w_clip_vit: true
    # NOTE(review): llm_vocab_size + num_new_special_tokens + codebook_size
    # = 50295 + 10 + 8192 = 58497, one less than vocab_size; presumably one
    # extra id is reserved (e.g. a mask token) — confirm against the model.
    vocab_size: 58498
    llm_vocab_size: 50295
    llm_model_path: ""
    codebook_size: 8192
    num_vq_tokens: 256  # alt: 128
    # Special tokens counted below:
    # <|soi|> <|eoi|> <|sov|> <|eov|> <|t2i|> <|mmu|> <|t2v|> <|v2v|> <|lvg|> <|pad|>
    num_new_special_tokens: 10
    dynamic_size: 8192

  eval:
    i3d_path: ""

  gradient_checkpointing: true

# Dataset selection, loader parameters, preprocessing, and data sources.
dataset:
  gen_type: "t2d"
  gen_und_type: "d2t"
  und_type: "captioning"
  lm_type: "lm"
  dataset_use: "pretraining_stage1"
  clip_img_token: -200
  clip_vq_token: -100
  combined_loader_mode: "max_size_cycle"
  json_root: ""

  params:
    add_caption_prompt: true
    validation_prompts_file: "validation_prompts/showoprompts.txt"
    shuffle_buffer_size: 1000
    num_workers: 4  # alt: 32
    pin_memory: true
    persistent_workers: true
    context_length: 2
    resolution_h: 128
    resolution_w: 224

  preprocessing:
    max_seq_length: 2048  # for text tokens
    resolution: [256, 256]  # alt: 256
    center_crop: false
    random_flip: false

  # `ctd` repeats context_length / resolution_h / resolution_w with the
  # same values as `params` above; keep the two in sync when editing.
  ctd:
    context_length: 2
    condition_length: 2
    segment_horizon: 24
    segment_length: 24
    resolution_h: 128
    resolution_w: 224
    Con_resolution_h: 256
    Con_resolution_w: 448
    split_frames: 5
    # The values here are the same names used as keys under `video_root`.
    mix_data:
      OPENDV_MINI: "opendvmini"
      OPENDV_FULL: "opendv"
    video_root:
      opendv: "/dataset"
      opendvmini: "/defaultShare/OPendvmini"

  # Same entries as `ctd.mix_data`, kept at the dataset level as well.
  mix_data:
    OPENDV_MINI: "opendvmini"
    OPENDV_FULL: "opendv"

  mmu_data:
    cc12m: "LLaVA-ReCap-CC12M"
    resolution: null

  lm_data:
    fineweb: ""



# Optimizer selection and hyper-parameters.
optimizer:
  name: adamw
  params:  # default adamw params
    learning_rate: 0.0001
    scale_lr: false  # scale learning rate by total batch size
    beta1: 0.9
    beta2: 0.999
    weight_decay: 0.01
    # Written with an explicit mantissa dot: YAML 1.1 loaders (e.g. PyYAML)
    # resolve a bare `1e-8` as the string '1e-8' rather than a float, while
    # `1.0e-8` is a float under every common loader. Same numeric value.
    epsilon: 1.0e-8

# Learning-rate schedule.
lr_scheduler:
  scheduler: "cosine"
  params:
    # Interpolation reference to the optimizer's learning rate (looks like
    # OmegaConf-style `${...}` syntax — confirm the loader). Quoted so the
    # scalar is unambiguously a string at YAML-parse time.
    learning_rate: "${optimizer.params.learning_rate}"
    warmup_steps: 1000

# Training-loop settings: batch sizes, precision, regularisation,
# generation options, and loss coefficients.
training:
  gradient_accumulation_steps: 2

  # Train / validation batch sizes; the t2d, d2t, mmu and lm suffixes
  # mirror the names used under `dataset`.
  batch_size_t2d: 1
  batch_size_d2t: 2
  batch_size_mmu: 1
  batch_size_lm: 1
  val_batch_size_t2d: 4
  val_batch_size_d2t: 6
  val_batch_size_mmu: 5
  val_batch_size_lm: 3

  eval_only: false
  mixed_precision: "bf16"
  enable_tf32: true
  seed: 10086
  num_workers: 4
  max_train_steps: 1000000
  overfit_one_batch: false

  cond_dropout_prob: 0.1
  min_masking_rate: 0.0
  label_smoothing: 0.0
  max_grad_norm: 8.0  # alt: null

  guidance_scale: 0.0
  generation_timesteps: 5  # alt: 12
  lora_enable: false

  # Coefficients with the same suffixes as the batch-size keys
  # (presumably loss weights — TODO confirm in the trainer).
  t2d_coeff: 1.0
  d2t_coeff: 1.0
  lm_coeff: 0.5
  mmu_coeff: 0.5
