#!/usr/bin/env bash
#
# Launches run_glue.py with microsoft/deberta-xlarge on the GLUE "mrpc" task,
# running training and evaluation (--do_train / --do_eval) as one process per
# GPU via torch.distributed.launch.
#
# The script path below is relative, so run this from the directory that
# contains the non-GPT-2/ checkout.

# Directory passed as --output_dir; existing contents are overwritten
# because --overwrite_output_dir is set.
OUTPUT_DIR='./deberta_mrpc'

# Number of worker processes to spawn. Keep this equal to the number of
# device ids listed in CUDA_VISIBLE_DEVICES on the launch line.
num_gpus=4

# NOTE(review): save, logging and (presumably, via the trainer's default)
# evaluation all happen every 1000 steps; --load_best_model_at_end relies on
# save and eval steps lining up -- confirm against the installed transformers
# version before changing either value.
# NOTE(review): newer PyTorch releases recommend torchrun over
# torch.distributed.launch; left unchanged here because the two pass the local
# rank to the script differently.
CUDA_VISIBLE_DEVICES=4,5,6,7 python -m torch.distributed.launch --nproc_per_node="${num_gpus}" --master_port=12368 non-GPT-2/examples/pytorch/text-classification/run_glue.py \
    --save_total_limit 20 \
    --model_name_or_path microsoft/deberta-xlarge \
    --task_name mrpc \
    --output_dir "${OUTPUT_DIR}" \
    --do_train \
    --do_eval \
    --learning_rate 1e-4 \
    --num_train_epochs 5 \
    --save_steps 1000 \
    --seed 1 \
    --per_device_train_batch_size 2 \
    --max_seq_length 256 \
    --per_device_eval_batch_size 2 \
    --overwrite_output_dir \
    --logging_steps 1000 \
    --weight_decay 0.01 \
    --load_best_model_at_end True \
    --metric_for_best_model eval_accuracy \
    --evaluation_strategy steps
