# Dataset locations: absolute paths to the train and validation splits.
data_path: /mnt/localssd/ImageNet2012/train
val_data_path: /mnt/localssd/ImageNet2012/val
save_best: true
# Input resolution and model-type selectors.
# NOTE(review): enc_type, dec_type and semantic_guide all name `dinov2`, as do
# encoder_model / decoder_model further down in this file — presumably these
# have to agree; confirm against the code that consumes this config.
image_size: 256
vq_model: VQ-16
enc_type: dinov2
dec_type: dinov2
semantic_guide: dinov2
# Optimisation schedule.
global_batch_size: 1024
epochs: 200
lr_scheduler: cosine
# Written with an explicit decimal point on purpose: YAML 1.1 loaders (e.g.
# PyYAML) resolve a bare `3e-5` as the string "3e-5", not as a float.
lr: 3.0e-5
# Disabled overrides, kept for reference.
# max_grad_norm: 1.0
# reconstruction_weight: 4.0
# Quantiser / codebook settings.
codebook_embed_dim: 32
codebook_size: 4096
abs_pos_embed: true
product_quant: 2
ema: true
codebook_drop: 0.1
# Encoder and decoder backbone identifiers (same model for both).
encoder_model: vit_base_patch14_dinov2.lvd142m
decoder_model: vit_base_patch14_dinov2.lvd142m
# Latent token layout.
# NOTE(review): 121 == 11 * 11, the square of the last v_patch_nums entry —
# presumably these two settings must be kept in sync; confirm before editing
# either one.
num_latent_tokens: 121
v_patch_nums: [1, 1, 2, 3, 3, 4, 5, 6, 8, 11]
half_sem: true
start_drop: 3
ckpt_every: 10000
# Loss weights and weight decay.
sem_loss_weight: 0.1
enc_tuning_method: full
lecam_loss_weight: 0.001
weight_decay: 0.0
disc_weight_decay: 0.0005
# Discriminator settings.
disc_epoch_start: 56
disc_type: dinodisc
aug_fade_steps: 0
disc_adaptive_weight: true
# Output location (relative path).
# NOTE(review): the suffix `msvr10p2-4096` appears to encode the number of
# v_patch_nums entries (10), product_quant (2) and codebook_size (4096) —
# TODO confirm, and rename if any of those values change.
cloud_save_path: output/exp-msvr10p2-4096
