# Experiment-tracking settings (presumably forwarded to wandb.init — confirm).
wandb:
    # Explicit null: no entity is set in this file.
    entity: null
    resume: "auto"

# Run-level settings: naming, output locations, save/eval/log cadence,
# loss coefficients and evaluation options.
experiment:
    project: "sft_nuscenes"
    # Blank in this file; presumably supplied per run (e.g. command-line
    # override) — confirm.
    name: ""
    output_dir: ""
    ablation_name: ""
    max_train_examples_mmu: 50000
    # Cadence settings; units are presumably training steps — confirm against
    # the training loop.
    save_every: 5000
    eval_every: 5000
    generate_every: 200
    log_every: 50
    # NOTE(review): 500000 is larger than training.max_train_steps (50000), so
    # if this is a step interval it would never fire within a run — confirm.
    log_grad_norm_every: 500000
    eval_from_checkpoint: true  # alt: 'latest'
    # NOTE(review): training.eval_only is also defined in this file; confirm
    # which of the two flags the code reads.
    eval_only: false  # alt: true
    resume_dir: ""
    img_show_path: ""
    text_root: ""
    add_ego: false  # alt: true
    add_cmd: true
    nfp_loss:
        # NOTE(review): "coffe" looks like a misspelling of "coeff". The keys
        # are kept verbatim because consumers look them up by name.
        alpha_coffe: 1.0
        beta_coffe: 0.4
    eval:
        use_fvd: true
        max_eval_iters: 10000
        use_frame_metrics: true
        eval_generate_times: 4
        max_generate_batchsize: 20
        action_conditioned: true
        max_decode_batchsize: 2
        use_text_metrics: true
        use_trj_metrics: true
        trj_anno_path: ""
        eval_dir: ""

# Model settings, grouped into the vq_model, showo and eval sub-sections plus
# a few top-level switches.
model:
    vq_model:
        type: "Compressive_magvit_v2"
        vq_model_name: ""
        mask_l: 0
        mask_h: 1
        latent_size: 4
        patch_size: 2
        # Both counts are 8192, the same value as showo.codebook_size and
        # showo.dynamic_size further down in this section.
        num_vq_embeddings: 8192
        num_dyn_embeddings: 8192
        # Checkpoint locations are blank here; presumably filled in per
        # environment — confirm.
        pretrained_model_path: ""
        pretrained_model_name_or_path: ""
        pretrained_model_name_or_path_con: ""
    clip_model: ""
    cond_enable: true  # alt: false
    showo:
        load_from_showo: true  # alt: false
        pretrained_model_path: ""
        w_clip_vit: false  # alt: true
        # llm_vocab_size (50295) + num_new_special_tokens (10) +
        # codebook_size (8192) = 58497, one less than vocab_size; presumably
        # one additional reserved token — confirm.
        vocab_size: 58498
        llm_vocab_size: 50295
        llm_model_path: ""
        codebook_size: 8192
        num_vq_tokens: 256
        # The ten tokens: <|soi|> <|eoi|> <|sov|> <|eov|> <|t2i|> <|mmu|>
        # <|t2v|> <|v2v|> <|lvg|> <|pad|>
        num_new_special_tokens: 10
        dynamic_size: 8192
        resume_from_pretrain: ""
    eval:
        i3d_path: ""
    gradient_checkpointing: true

# Dataset selection, loader parameters, preprocessing and the `ctd`
# clip/segment settings.
dataset:
    # The original left this value empty, which YAML loads as null; written
    # explicitly so the intent is unambiguous.
    dataset_use: null
    clip_img_token: -200
    clip_vq_token: -100
    json_root: ""
    params:
        add_caption_prompt: true
        validation_prompts_file: "validation_prompts/showoprompts.txt"
        shuffle_buffer_size: 1000
        # NOTE(review): training.num_workers (10) is a separate setting in
        # this file; confirm which one the data loader uses.
        num_workers: 8  # alt: 32
        pin_memory: true
        persistent_workers: true
        context_length: 2
        # Same numbers as ctd.d_resolution ([128, 224]).
        resolution_h: 128
        resolution_w: 224
    preprocessing:
        max_seq_length: 2048  # for text tokens
        resolution: [256, 448]  # alt: 256
        center_crop: false
        random_flip: false
    ctd:
        context_length: 2
        condition_length: 2
        segment_horizon: 21
        segment_length: 21
        d_resolution: [128, 224]
        c_resolution: [256, 448]
        split_frames: 5
        prev_frames: 12
        next_frames: 12
        anno_path: ""
        omini_path: ""
        image_file: ""
        image_root: ""
        views:
            # Disabled alternative entries (upper-case keys), kept for
            # reference:
            #   'NUSCENES_FRONT': 'nuscenes_front'
            #   'NUSCENES_BACK': 'nuscenes_back'
            #   'NUSCENES_FRONT_LEFT': 'nuscenes_front_left'
            #   'NUSCENES_FRONT_RIGHT': 'nuscenes_front_right'
            #   'NUSCENES_BACK_LEFT': 'nuscenes_back_left'
            #   'NUSCENES_BACK_RIGHT': 'nuscenes_back_right'
            # Only the front view has a non-empty value in this file.
            nuscenes_front: "/nuscenes_front"
            nuscenes_back: ""
            nuscenes_front_left: ""
            nuscenes_front_right: ""
            nuscenes_back_left: ""
            nuscenes_back_right: ""

        nuscenes_data_path: "/nuscenes"


# Optimizer selection and hyperparameters.
optimizer:
    name: adamw
    params:  # default adamw params
        learning_rate: 0.00003  # alt: 0.00005
        scale_lr: false  # scale learning rate by total batch size
        beta1: 0.9
        beta2: 0.999
        weight_decay: 0.01
        # Written with an explicit decimal point on purpose: YAML 1.1 resolvers
        # (e.g. stock PyYAML) load a bare `1e-8` as the string "1e-8" rather
        # than a float. `1.0e-8` is a float under every loader.
        epsilon: 1.0e-8

# Learning-rate schedule settings.
lr_scheduler:
    scheduler: "cosine"
    params:
        # OmegaConf-style interpolation (presumably resolved by the config
        # loader — confirm): reuses optimizer.params.learning_rate instead of
        # repeating the number. Quoting does not change the parsed string.
        learning_rate: "${optimizer.params.learning_rate}"
        warmup_steps: 1000

# Training-loop settings: batch sizes, precision, step budget, regularisation
# and loss weights.
training:
    gradient_accumulation_steps: 4
    batch_size_train_nus: 4
    batch_size_val_nus: 16
    # NOTE(review): experiment.eval_only is also defined in this file; confirm
    # which of the two flags the code reads.
    eval_only: false
    mixed_precision: "bf16"
    enable_tf32: true
    seed: 10086
    # NOTE(review): dataset.params.num_workers (8) is a separate setting in
    # this file; confirm which one the data loader uses.
    num_workers: 10
    max_train_steps: 50000
    overfit_one_batch: false
    cond_dropout_prob: 0.1
    min_masking_rate: 0.0
    label_smoothing: 0.0
    max_grad_norm: 4.0  # alt: null
    guidance_scale: 0.0
    generation_timesteps: 5  # alt: 12
    lora_enable: false
    # Loss weights (presumably for the QA, video and trajectory terms —
    # confirm against the loss computation).
    qa_coeff: 0.1
    video_coeff: 0.3  # alt: 1.0
    tj_coeff: 1.0
    motion_weight: true
