#!/usr/bin/env bash
#
# Launch run_beitv2_pretraining.py through torch.distributed.launch.
#
# Run from the directory that contains run_beitv2_pretraining.py (the script
# path below is relative).
#
# Optional environment overrides. Every default reproduces the value that was
# previously hard-coded, so running with none of them set issues exactly the
# same command as before:
#   BEIT2_NPROC_PER_NODE    value for --nproc_per_node            (default: 8)
#   BEIT2_DATA_PATH         value for --data_path
#   BEIT2_RESULTS_ROOT      directory holding pretrained/ and tokenizer/
#   BEIT2_OUTPUT_DIR        value for both --output_dir and --log_dir
#                           (default: $BEIT2_RESULTS_ROOT/pretrained)
#   BEIT2_TOKENIZER_WEIGHT  value for --tokenizer_weight
#                           (default: checkpoint under $BEIT2_RESULTS_ROOT/tokenizer)
#
# Only POSIX sh constructs are used, so `sh <script>` keeps working as well.

# Abort on a failed command or on a reference to an unset variable.
set -eu

nproc_per_node=${BEIT2_NPROC_PER_NODE:-8}
data_path=${BEIT2_DATA_PATH:-/workspace/sync/imagenet-1k/train}
results_root=${BEIT2_RESULTS_ROOT:-/workspace/sync/Feature-Distillation/results/clip/beit2}
# Checkpoints and logs share one directory, as in the original command.
output_dir=${BEIT2_OUTPUT_DIR:-${results_root}/pretrained}
tokenizer_weight=${BEIT2_TOKENIZER_WEIGHT:-${results_root}/tokenizer/vqkd_encoder_base_decoder_1x768x12_clip-d93179da.pth}

# NOTE(review): presumably --batch_size is per process and --accum_iter is the
# number of gradient-accumulation steps, which would make the effective batch
# nproc_per_node * 128 * 2 (2048 with the default 8 processes). If so, --lr may
# need rescaling when BEIT2_NPROC_PER_NODE is changed -- confirm against the
# training script.
#
# NOTE(review): torch.distributed.launch is deprecated in recent PyTorch
# releases in favour of torchrun. It is kept here because torchrun passes the
# local rank through the LOCAL_RANK environment variable rather than a
# --local_rank argument; verify how run_beitv2_pretraining.py reads it before
# switching.
python -m torch.distributed.launch --nproc_per_node="${nproc_per_node}" run_beitv2_pretraining.py \
    --data_set image_folder \
    --data_path "${data_path}" \
    --output_dir "${output_dir}" \
    --log_dir "${output_dir}" \
    --model beit_base_patch16_224_8k_vocab \
    --shared_lm_head True \
    --early_layers 9 \
    --head_layers 2 \
    --num_mask_patches 75 \
    --second_input_size 224 \
    --second_interpolation bicubic \
    --min_crop_scale 0.2 \
    --tokenizer_model vqkd_encoder_base_decoder_1x768x12_clip \
    --tokenizer_weight "${tokenizer_weight}" \
    --batch_size 128 \
    --accum_iter 2 \
    --lr 1.5e-3 \
    --warmup_epochs 10 \
    --clip_grad 3.0 \
    --drop_path 0.0 \
    --layer_scale_init_value 0.1 \
    --imagenet_default_mean_and_std \
    --opt_betas 0.98 0.999 \
    --opt_eps 1e-8 \
    --epochs 300 \
    --save_ckpt_freq 20