# Run bookkeeping: output directory, RNG seed, and the logging / validation /
# checkpoint intervals.
# NOTE(review): interval units are presumably training steps — confirm.
results_dir: experiments
global_seed: 42
log_every: 50
val_every: 5000
ckpt_every: 5000

# Dataset and sampling.
# YOUR_DATA_DIR / YOUR_VAL_DATA are placeholders; substitute real CSV paths.
dataset: k600
# data_path_list, fps_list and fps_weight each hold four entries, and each CSV
# is listed twice.
# NOTE(review): presumably the three lists are matched by position (one fps
# and one weight per path entry) — confirm against the data loader.
data_path_list:
  - YOUR_DATA_DIR/K600.csv
  - YOUR_DATA_DIR/K600.csv
  - YOUR_DATA_DIR/BVI_HFR.csv
  - YOUR_DATA_DIR/BVI_HFR.csv
fps_list: [24, 30, 60, 120]
fps_weight: [10, 100, 5, 5]
# NOTE(review): one validation path but four validation fps values — confirm
# how the loader pairs them.
val_data_path_list:
  - YOUR_DATA_DIR/YOUR_VAL_DATA.csv
val_fps_list: [30, 24, 15, 10]
period: 0.6666666666667  # 2/3
data_column: video_path
image_size: 256
num_frames: 16
variable_num_frames: true
mix_fps: true
mix_fps_rate: 0.5

# Tokenizer / autoencoder architecture.
vq_model: AE-16
# Checkpoint paths are unset. They were bare empty values, which parse as
# null; written explicitly here so the intent is unambiguous.
vq_ckpt: null
disc_ckpt: null
num_latent_tokens: 128
codebook_embed_dim: 128
commit_loss_beta: 0.0
# Decoder and encoder below use the same backbone name, patch size, and tuning
# method.
# NOTE(review): the model names say "patch14" while *_patch_size is 8 —
# presumably the patch embedding is overridden; confirm in the model code.
decoder_patch_size: 8
decoder_pretrained: false
decoder_tuning_method: full
dec_type: vit
decoder_model: vit_base_patch14_dinov2.lvd142m
dropout_p: 0.0
encoder_patch_size: 8
encoder_pretrained: false
encoder_tuning_method: full
enc_type: vit
encoder_model: vit_base_patch14_dinov2.lvd142m
t_patch_size: 4
# Key is spelled "seperate" (sic); left unchanged because consumers look it up
# by this exact name.
dec_seperate_mask_token: false

# REPA settings. The feature is switched off here.
# NOTE(review): the remaining repa_* values are presumably only read when
# `repa` is true — confirm.
repa: false
repa_align: repeat
repa_loss_weight: 0.1
repa_model: vit_large_patch14_dinov2.lvd142m
repa_patch_size: 8
repa_proj_dim: 1024

# Loss configuration: reconstruction, perceptual, temporal-shift, and
# discriminator terms.
# NOTE(review): weights/ratios of 0.0 presumably disable the corresponding
# term — confirm in the loss module.
entropy_loss_ratio: 0.0
kl_loss_weight: 0.0
vq_loss_ratio: 0.0
reconstruction_weight: 1.0
reconstruction_loss: l1
perceptual_weight: 1.0
perceptual_warmup: 1
reconstruction_std_weight: 0.0
perceptual_std_weight: 0.0
t_shift_loss: false
max_t_shift: 0
t_shift_rec_weight: 0.0
disc_weight: 0.2
disc_adaptive_weight: true
use_diff_aug: true
disc_cr_loss_weight: 4.0
# NOTE(review): units of disc_start / perceptual_warmup (steps vs. epochs) are
# not visible here — confirm.
disc_start: 1
disc_type: dino
disc_loss: hinge
gen_loss: hinge
lecam_loss_weight: 0.001

# Latent statistics, positional-embedding options, and output head.
# RoPE is enabled and APE is disabled in this configuration.
# NOTE(review): vq_mean / vq_std presumably normalise the latents — confirm.
vq_mean: 0.0
vq_std: 1.0
rope_mixed: false
rope_theta: 10000.0
rope_theta_t: 10000.0
rope_heads: 6
tau: 0.07
to_pixel: linear
use_ape: false
use_rope: true

# NOTE(review): presumably the number of training epochs — confirm against the
# trainer.
epochs: 100