# Stage 1: launch run_glue_slim.py on QNLI starting from bert-base-uncased,
# with LoRA enabled (--lora_r 16) and both L1 loss coefficients set to 1e-4.
# Output is written to ./qnli_slim. The later launches in this file read
# from that directory (model path, *_slimming_coef_records.npy and
# pytorch_model.bin), so this stage has to finish before they start.
OUTPUT_DIR='./qnli_slim'

# One worker process per device listed in CUDA_VISIBLE_DEVICES below.
# This must be assigned before the launch line, which expands ${num_gpus};
# previously it was only assigned further down the file, leaving
# --nproc_per_node empty here.
num_gpus=4

# The job runs in the foreground (no trailing '&'): the following launches
# use the same GPUs and consume this run's output directory. If it fails,
# stop instead of continuing with missing or stale ./qnli_slim contents.
NCCL_P2P_DISABLE=1 CUDA_VISIBLE_DEVICES=4,5,6,7 python -m torch.distributed.launch --nproc_per_node="${num_gpus}" --master_port=12348 non-GPT-2/examples/pytorch/text-classification/run_glue_slim.py \
    --save_total_limit 1 \
    --model_name_or_path bert-base-uncased \
    --task_name qnli \
    --output_dir "${OUTPUT_DIR}" \
    --do_train \
    --do_eval \
    --num_train_epochs 3 \
    --save_steps 1000 \
    --seed 1 \
    --per_device_train_batch_size 8 \
    --max_seq_length 128 \
    --per_device_eval_batch_size 8 \
    --overwrite_output_dir \
    --logging_steps 1000 \
    --load_best_model_at_end True \
    --metric_for_best_model eval_accuracy \
    --l1_loss_self_coef  1e-4 \
    --l1_loss_inter_coef 1e-4 \
    --apply_lora \
    --lora_r 16 \
    --evaluation_strategy steps || {
    echo "run_glue_slim.py failed; not starting the runs that depend on ${OUTPUT_DIR}" >&2
    exit 1
}


# Launch run_glue_slim_train.py on QNLI with --prune_before_train, using
# self pruning ratio 0.33 (layerwise) and inter pruning ratio 0.4 (global).
# All three L1 loss coefficients are 0.0 for this run.
#
# Inputs are taken from the qnli_slim directory: the model path, the two
# *_slimming_coef_records.npy files and the LoRA weights (pytorch_model.bin).
#
# NOTE(review): a later launch in this file writes to the same
# ./qnli_slim_train directory with --overwrite_output_dir, so this run's
# saved output does not appear to survive the whole script - confirm
# whether that is intended.
OUTPUT_DIR='./qnli_slim_train'
num_gpus=4

# Directory holding the artifacts this run loads.
slim_dir=qnli_slim

NCCL_P2P_DISABLE=1 CUDA_VISIBLE_DEVICES=4,5,6,7 \
python -m torch.distributed.launch \
    --nproc_per_node="${num_gpus}" \
    --master_port=12349 \
    non-GPT-2/examples/pytorch/text-classification/run_glue_slim_train.py \
    --save_total_limit 1 \
    --model_name_or_path "${slim_dir}" \
    --task_name qnli \
    --output_dir "${OUTPUT_DIR}" \
    --do_train \
    --do_eval \
    --num_train_epochs 3 \
    --save_steps 1000 \
    --seed 1 \
    --per_device_train_batch_size 8 \
    --max_seq_length 128 \
    --per_device_eval_batch_size 8 \
    --overwrite_output_dir \
    --logging_steps 1000 \
    --load_best_model_at_end True \
    --metric_for_best_model eval_accuracy \
    --l1_loss_coef 0.0 \
    --l1_loss_self_coef 0.0 \
    --l1_loss_inter_coef 0.0 \
    --self_pruning_ratio 0.33 \
    --self_pruning_method layerwise \
    --inter_pruning_ratio 0.4 \
    --inter_pruning_method global \
    --prune_before_train \
    --self_slimming_coef_file "${slim_dir}/self_slimming_coef_records.npy" \
    --inter_slimming_coef_file "${slim_dir}/inter_slimming_coef_records.npy" \
    --slimming_coef_step 345 \
    --apply_lora \
    --lora_r 16 \
    --lora_path "${slim_dir}/pytorch_model.bin" \
    --evaluation_strategy steps

# Launch run_glue_slim_train.py on QNLI with --prune_before_train, using
# self pruning ratio 0.25 (layerwise) and inter pruning ratio 0.4 (global).
# All three L1 loss coefficients are 0.0 for this run.
#
# Inputs are taken from the qnli_slim directory: the model path, the two
# *_slimming_coef_records.npy files and the LoRA weights (pytorch_model.bin).
#
# NOTE(review): this writes to the same ./qnli_slim_train directory as the
# earlier 0.33-ratio launch in this file, and --overwrite_output_dir is set,
# so that run's saved output looks like it gets replaced here. Confirm
# whether a per-ratio output directory was intended.
OUTPUT_DIR='./qnli_slim_train'
num_gpus=4

# Directory holding the artifacts this run loads.
slim_dir=qnli_slim

NCCL_P2P_DISABLE=1 CUDA_VISIBLE_DEVICES=4,5,6,7 \
python -m torch.distributed.launch \
    --nproc_per_node="${num_gpus}" \
    --master_port=12349 \
    non-GPT-2/examples/pytorch/text-classification/run_glue_slim_train.py \
    --save_total_limit 1 \
    --model_name_or_path "${slim_dir}" \
    --task_name qnli \
    --output_dir "${OUTPUT_DIR}" \
    --do_train \
    --do_eval \
    --num_train_epochs 3 \
    --save_steps 1000 \
    --seed 1 \
    --per_device_train_batch_size 8 \
    --max_seq_length 128 \
    --per_device_eval_batch_size 8 \
    --overwrite_output_dir \
    --logging_steps 1000 \
    --load_best_model_at_end True \
    --metric_for_best_model eval_accuracy \
    --l1_loss_coef 0.0 \
    --l1_loss_self_coef 0.0 \
    --l1_loss_inter_coef 0.0 \
    --self_pruning_ratio 0.25 \
    --self_pruning_method layerwise \
    --inter_pruning_ratio 0.4 \
    --inter_pruning_method global \
    --prune_before_train \
    --self_slimming_coef_file "${slim_dir}/self_slimming_coef_records.npy" \
    --inter_slimming_coef_file "${slim_dir}/inter_slimming_coef_records.npy" \
    --slimming_coef_step 345 \
    --apply_lora \
    --lora_r 16 \
    --lora_path "${slim_dir}/pytorch_model.bin" \
    --evaluation_strategy steps