# Launch settings for the `vjepa_wm` app.
# NOTE(review): nodes / tasks_per_node / cpus_per_task look like cluster
# resource requests (4 x 8 = 32 tasks in total) — confirm against the launcher.
app: vjepa_wm
nodes: 4
tasks_per_node: 8
cpus_per_task: 16
# Output folder. ${JEPAWM_LOGS} has to be expanded by whatever loads this file;
# plain YAML performs no variable substitution.
# NOTE(review): the `256`, `bs4` and `4n` parts of the name appear to mirror
# data.img_size, data.loader.batch_size and `nodes` — keep them in sync.
folder: ${JEPAWM_LOGS}/vm2m/opensource_decs/step2_lpips_vm2m_vjepa2vitgopen_vgxf_vitldec_dup_256_vjtransf_norm_bs4_4n
# Data pipeline: which datasets are mixed, validation clips, and loader settings.
data:
  # Dataset configuration
  dataset_type: mixed_dataset
  # Three datasets are mixed.
  # NOTE(review): cluster_sample_config and custom.dataset_fpcs also have three
  # entries, presumably one per dataset in this order — confirm with the loader.
  datasets: [K710, SSv2, HowTo100M]
  datasets_weights: null
  cluster_sample_config: [null, null, null]
  # Same value as meta.seed.
  seed: 234
  img_size: 256
  # Validation configuration
  validation:
    val_datasets: null
    num_frames_val: 8
    val_dataset_fpcs: [8]
  # DataLoader configuration
  loader:
    batch_size: 4
    num_workers: 16
    pin_mem: true
    persistent_workers: true
  # Mixed dataset parameters
  custom:
    dataset_fpcs: [16, 16, 16]
    state_skip: 1
    frameskip: 1
    normalize_action: false
    fps: 4
  # NOTE(review): no droid entry appears in `datasets` above, so this section
  # is presumably unused by this run — confirm before removing.
  droid:
    camera_views: [left_mp4_path]
# Augmentation and pixel preprocessing. Auto-augment, horizontal flip and
# motion shift are all switched off.
data_aug:
  auto_augment: false
  random_horizontal_flip: false
  motion_shift: false
  random_resize_aspect_ratio: [0.75, 1.35]
  # Both bounds are equal, so this range collapses to the single value 1.777.
  random_resize_scale: [1.777, 1.777]
  reprob: 0.0
  # NOTE(review): the two rows match the commonly used ImageNet per-channel
  # mean and std, presumably in [mean, std] order — confirm with the transform.
  normalize:
    - [0.485, 0.456, 0.406]
    - [0.229, 0.224, 0.225]
  hwc: true
  do_255_to_1: true
# Logging settings. Weights & Biases is switched off here (use_wandb: false).
logging:
  write_tag: jepa
  wandb:
    use_wandb: false
    debug: false
    project: vjepa_wm
    # NOTE(review): presumably media is kept out of W&B and written to local
    # disk instead — confirm with the logging code.
    disable_wandb_media: true
    log_media_locally: true
# Loss term weights. Only the L2 term has a non-zero weight in this config;
# the cos, L1 and smooth-L1 weights are all 0.0.
# NOTE(review): model.heads_cfg's image_head separately sets pixelloss_weight
# and perceptual_weight; how those interact with these weights is not visible
# in this file.
loss:
  cos_loss_weight: 0.0
  l1_loss_weight: 0.0
  l2_loss_weight: 1.0
  smooth_l1_loss_weight: 0.0
# Run-level switches: eval/debug modes, checkpoint handling, seed, cadence.
meta:
  # All eval-only and debug modes are off in this config.
  plan_only_eval_mode: false
  light_eval_only_mode: false
  unroll_decode_eval_only_mode: false
  quick_debug: false
  freeze_encoder: true
  load_checkpoint: true
  load_opt_scale_epoch: true
  # No explicit checkpoint path is given here.
  read_checkpoint: null
  # Same value as data.seed.
  seed: 234
  # NOTE(review): the unit of these frequencies (epochs vs. iterations) is not
  # stated in this file — confirm with the training loop.
  eval_freq: 1
  light_eval_freq: 2000
  save_every_freq: 1
  dtype: bfloat16
  # Both optional evals below are disabled.
  data_traj_rollout_eval:
    do_data_traj_rollout_eval: false
  energy_landscape_eval:
    do_energy_landscape_eval: false
    energy_landscape_rollout_steps: 1
    energy_landscape_ctxt_window: 3
# Model definition: visual encoder, action/proprio encoders, predictor,
# rollout/attention settings, and the decoder head(s).
model:
  # Shared fields
  grid_size: 16
  tubelet_size_enc: 1
  use_activation_checkpointing: false
  # No action or proprioceptive conditioning is configured (both `none`).
  action_conditioning: none
  proprio_encoding: none
  num_frames_pred: 8
  # Visual encoder config
  visual_encoder:
    enc_type: vjepa
    enc_version: v2_open
    # ${JEPAWM_OSSCKPT} has to be expanded by whatever loads this file.
    pretrain_enc_path: ${JEPAWM_OSSCKPT}/vjepa2_opensource/vjepa2_vit_giant.pth
    pretrain_enc_ckpt_key: encoder
    embed_dim: 1408
    enc_use_rope: true
    enc_name: vit_giant_xformers
    use_sdpa_enc: null
    num_frames_enc: 64
    uniform_power: true
  # Action encoder config
  action_encoder:
    action_tokens: 0
    action_emb_dim: 0
    act_mlp: true
    action_encoder_inpred: true
  # Proprio encoder config
  proprio_encoder:
    proprio_tokens: 0
    proprio_emb_dim: 0
    prop_mlp: true
    proprio_encoder_inpred: false
  # Predictor config
  # NOTE(review): pred_type is `none` and optimization.heads.train_predictor is
  # false, so the size fields here are presumably unused — confirm.
  predictor:
    tubelet_size: 1
    pred_num_heads: 16
    pred_depth: 6
    pred_embed_dim: 384
    pred_type: none
  # VideoWM encoding
  wm_encoding:
    batchify_video: true
    dup_image: true
    normalize_reps: true
  # Rollout config
  rollout_cfg:
    rollout_steps: 1
    train_rollout_prefixes: random
    rollout_stop_gradient: true
    # Start and end are both 0, so the linear schedule is constant at 0.
    sampling_scheduler:
      type: linear
      start: 0.
      end: 0.
  # Attention config
  attn:
    local_window_time: 8
    local_window_h: -1
    local_window_w: -1
  # Heads config
  heads_cfg:
    architectures:
      image_head:
        kind: vit
        config:
          patch_size: 8
          in_chans: 3
          # Same resolution as data.img_size (256).
          img_size: [256, 256]
          # Same value as visual_encoder.embed_dim (1408).
          embed_dim: 1408
          decoder_embed_dim: 1024
          depth: 12
          num_heads: 16
          mlp_ratio: 4.0
          num_views: 1
          use_activation_checkpointing: false
          # NOTE(review): presumably a pixel loss (weight 10) combined with an
          # LPIPS perceptual loss (weight 1) — confirm with the head code.
          use_lpips: true
          pixelloss_weight: 10
          perceptual_weight: 1
    # No pretrained decoder checkpoint is given here.
    pretrain_dec_path: null
# Optimizer and schedule settings. Heads are trained (train_heads: true) while
# the predictor is not (train_predictor: false); `image_head` is the main
# optimizer.
optimization:
  main_optimizer: image_head
  train_heads: true
  heads:
    train_predictor: false
    train_heads_on_predictor: false
    image_head:
      use_radamw: false
      betas: [0.9, 0.999]
      # Written with an explicit fraction on purpose: YAML 1.1 loaders such as
      # PyYAML read a dot-less `1e-8` as the string "1e-8", not as a float.
      eps: 1.0e-8
      # NOTE(review): presumably stretches the schedule length relative to
      # num_epochs * iterations_per_epoch — confirm with the scheduler code.
      ipe_scale: 1.25
      # Start and final weight decay are equal, i.e. no decay schedule.
      weight_decay: 0.1
      final_weight_decay: 0.1
      final_lr: 1.0e-8
      start_lr: 0.0
      ref_lr: 1.0e-5
      warmup: 0
      # 80 x 1000 = 80,000 iterations if both values are taken literally.
      num_epochs: 80
      iterations_per_epoch: 1000
      clip_grad: 1
      mixed_precision: true
# Evaluation configs: eval_cfg_paths is null for this run.
evals:
  eval_cfg_paths: null
