# Dataset name and the train / validation data locations.
dataset: odir
data_path: /ODIR_ImageFolder/train.X
val_data_path: /ODIR_ImageFolder/val.X

# Input resolution.
# Remember to remove one more downsample layer in the encoder.
image_size: 128

# Logging / visualization intervals and best-checkpoint saving.
# NOTE(review): the unit of the two intervals (steps vs. epochs) is not
# visible in this file - confirm against the trainer.
log_every: 30
vis_every: 100
save_best: true


# Tokenizer architecture: VQ-16 with a cnn encoder and a cnn decoder.
vq_model: VQ-16
dropout_p: 0.1
enc_type: cnn
dec_type: cnn

# Training schedule.
# Note: a plain `none` is loaded as the string "none", not as YAML null.
semantic_guide: none
global_batch_size: 128  # 32 for original VAR
epochs: 500
lr_scheduler: cosine
# Written with an explicit decimal point on purpose: YAML 1.1 loaders such as
# PyYAML only resolve exponent notation to a float when the mantissa contains
# a ".", so a bare `1e-4` is loaded as the string "1e-4" instead of a number.
lr: 1.0e-4
cosine_min_ratio: 0.01


# Codebook.
codebook_size: 16384
codebook_embed_dim: 32
codebook_l2_norm: true

# Quantize once (necessary for original VAR).
product_quant: 1

ema: false

# No dropout for original VAR.
codebook_drop: 0.0

# Set as 256 for our VAR setting (16 * 16 patches).
num_latent_tokens: 256

# Original VAR setting (16 * 16 patches).
v_patch_nums: [1, 2, 3, 4, 5, 6, 8, 10, 13, 16]

# Disabled - useless for cnn backbone:
# abs_pos_embed: true
# encoder_model: vit_base_patch14_dinov2.lvd142m
# decoder_model: vit_base_patch14_dinov2.lvd142m



ckpt_per_epoch: 5
kmeans_init: true
weight_decay: 0.0

# TODO: confirm lecam loss weight.
lecam_loss_weight: 0.001

# Disabled - useless for cnn backbone:
# half_sem: true
# sem_loss_weight: 0.1

# Disabled - no codebook drop for original VAR:
# start_drop: 3

# Disabled:
# enc_tuning_method: full

# Discriminator settings.
# Original VAR seems to use dinodisc as discriminator, which is different from
# the previous work. A plain `none` is loaded as the string "none", not null.
disc_type: none
disc_epoch_start: 56
disc_weight_decay: 0.0005
disc_adaptive_weight: true

# TODO: confirm aug fade steps.
aug_fade_steps: 0

# Save path for this run.
cloud_save_path: /ImageFolder/output/exp-ODIR_l2/My_VAR_MSVR10P1_Ori_16384_no_disc_e500_lr1e-4_bs128_cosine
