# Launch run_vqkd_training.py through torch.distributed.launch with four
# processes per node. CUDA_VISIBLE_DEVICES is set for this one command only and
# exposes devices 0, 1, 4 and 5 to it.
#
# --output_dir and --log_dir are deliberately given the same directory, and
# --save_ckpt_freq equals --epochs.
#
# NOTE(review): the meaning of each flag is defined by run_vqkd_training.py,
# which is not visible here — confirm against that script before changing values.
# NOTE(review): recent PyTorch releases mark torch.distributed.launch as
# deprecated in favour of torchrun; verify how the training script receives its
# local rank before switching launchers.

# Shared path prefixes, kept in one place so train/val and output/log stay in sync.
imagenet_root=/workspace/sync/imagenet-1k
results_dir=/workspace/sync/Feature-Distillation/results/deit/beit2/default

CUDA_VISIBLE_DEVICES=0,1,4,5 python -m torch.distributed.launch \
    --nproc_per_node=4 \
    run_vqkd_training.py \
    --data_set image_folder \
    --data_path "${imagenet_root}/train" \
    --eval_data_path "${imagenet_root}/val" \
    --output_dir "${results_dir}" \
    --log_dir "${results_dir}" \
    --process_type imagenet_norm --train_interpolation bicubic --min_crop_scale 0.08 \
    --model vqkd_encoder_base_decoder_1x768x12_deit --teacher_input_size 224 \
    --codebook_n_emd 8192 --codebook_emd_dim 32 --quantize_kmeans_init \
    --rec_loss_type cosine \
    --batch_size 128 \
    --opt adamw --opt_betas 0.9 0.99 --weight_decay 1e-4 \
    --warmup_epochs 10 --epochs 100 --save_ckpt_freq 100