#!/bin/bash

###################  modified params  ##########################
# Positional arguments, forwarded to exps/run_glue_llama.py by the run section:
#   $1 - value for --modified_dropout_pattern
#   $2 - value for --modified_dropout_rate
#   $3 - value for --modified_aug_loss
#   $4 - value for --modified_aug_loss_weight
#   $5 - GPU id, exported as CUDA_VISIBLE_DEVICES for the training process
modified_dropout_pattern=$1
modified_dropout_rate=$2
modified_aug_loss=$3
modified_aug_loss_weight=$4
GPU_ID=$5
# Template placeholders: replace with the absolute path of the project checkout
# and of the Python interpreter to use. They are quoted because an unquoted
# <name> is parsed by bash as redirection operators, which makes the whole
# script a syntax error until the placeholders are filled in.
project_root="<project_root>"
python_path="<python_path>"

###################  LoRA params  ##########################
# Each active setting below is forwarded to the training script as a flag of
# the same name.

# Task type.
task_type="SEQ_CLS"
# Whether to use inference mode.
inference_mode="False"
# LoRA attention dimension.
r="8"
# LoRA alpha.
lora_alpha="16"
# LoRA dropout.
lora_dropout="0.0"

# Options that are currently disabled (kept for reference):
#   target_modules=["q_proj","v_proj",]
#       List of module names, or a regex over module names, to replace with LoRA.
#   fan_in_fan_out=False
#       Set to True if the layer to replace stores weight like (fan_in, fan_out).
#   bias="none"
#       Bias type for LoRA. Can be 'none', 'all' or 'lora_only'.
#   modules_to_save=None
#       Modules apart from the LoRA layers to be set as trainable and saved in
#       the final checkpoint.
#   init_lora_weights=True
#       Whether to initialize the weights of the LoRA layers.

###################  Data  ##########################
# GLUE task name; passed as --task_name and embedded in the checkpoint/log
# directory names and in run_name.
task_name=rte
# Passed as --max_seq_length.
max_seq_length=512
# Disabled data options, kept for reference:
#"pad_to_max_length": true,
#max_train_samples=None
#max_eval_samples=None
#max_predict_samples=None

###################  Model  ##########################
# Model identifier or local path, passed as --model_name_or_path.
model_name_or_path=yahma/llama-7b-hf

###################  Training params  ##########################
# Each value below is forwarded to the training script as the flag of the
# same name.
num_train_epochs=10
per_device_train_batch_size=16
per_device_eval_batch_size=16
gradient_accumulation_steps=4
learning_rate=4e-4
warmup_ratio=0.06
weight_decay=0.1
# Metric used with --load_best_model_at_end; higher values count as better.
metric_for_best_model=accuracy
greater_is_better=True
# Disabled training options, kept for reference:
#label_smoothing_factor=0.0,
#resume_from_checkpoint=None,
disable_tqdm=True
# Uses the lowercase task_name defined above. The previous ${TASK_NAME} was
# never set in this script, so the run name silently collapsed to "glue.".
run_name=glue.${task_name}

#######################  Run  ############################
# Runs exps/run_glue_llama.py once per seed in [seed_min, seed_max]. Each run
# gets its own timestamped checkpoint and log directory under project_root,
# and the combined stdout/stderr of the run is appended to <logging_dir>/log.txt.

# The five positional arguments feed flags that take a value; refuse to start
# rather than launch every seed with a malformed command line.
if (($# < 5)); then
  printf 'Usage: %s <dropout_pattern> <dropout_rate> <aug_loss> <aug_loss_weight> <gpu_id>\n' \
    "${0##*/}" >&2
  exit 2
fi

# Prepend the project root; only add the ':' separator when PYTHONPATH already
# has content, so no empty (current-directory) entry is introduced.
export PYTHONPATH="${project_root}${PYTHONPATH:+:${PYTHONPATH}}"

# Change directory once, before the loop: all output paths below are relative
# to project_root, and repeating the cd per seed would fail on the second
# iteration if project_root were itself a relative path.
cd "${project_root}" || {
  printf 'error: cannot cd to project root: %s\n' "${project_root}" >&2
  exit 1
}

seed_min=0
seed_max=4
for ((seed = seed_min; seed <= seed_max; seed++)); do
  timestamp=$(date "+%Y%m%d-%H%M%S")
  infix=${timestamp}_GPU_${GPU_ID}_sd_${seed}_llama7b
  infix+=_dp_${modified_dropout_pattern}_${modified_dropout_rate}
  infix+=_ls_${modified_aug_loss}_${modified_aug_loss_weight}
  output_dir=checkpoints/glue_${task_name}/${infix}
  logging_dir=logs/glue_${task_name}_${infix}
  log_pth=${logging_dir}/log.txt

  printf 'Model Dir: %s \nLog Dir: %s\n' "${output_dir}" "${logging_dir}"
  mkdir -p -- "${output_dir}" "${logging_dir}"

  # Build the argument list as an array so that every value reaches the
  # trainer as exactly one word, even if it is empty or contains spaces.
  train_args=(
    --seed "${seed}"
    "--modified_aug_loss=${modified_aug_loss}"
    "--modified_aug_loss_weight=${modified_aug_loss_weight}"
    --modified_dropout_pattern "${modified_dropout_pattern}"
    --modified_dropout_rate "${modified_dropout_rate}"
    "--model_name_or_path=${model_name_or_path}"
    --task_name "${task_name}"
    --max_seq_length "${max_seq_length}"
    --do_train
    --do_eval
    --disable_tqdm "${disable_tqdm}"
    --per_device_train_batch_size "${per_device_train_batch_size}"
    --per_device_eval_batch_size "${per_device_eval_batch_size}"
    --gradient_accumulation_steps "${gradient_accumulation_steps}"
    --learning_rate "${learning_rate}"
    --num_train_epochs "${num_train_epochs}"
    --weight_decay "${weight_decay}"
    --warmup_ratio "${warmup_ratio}"
    --logging_steps 10
    --save_total_limit 1
    --evaluation_strategy epoch
    --save_strategy epoch
    --load_best_model_at_end
    --report_to tensorboard
    --overwrite_output_dir
    --metric_for_best_model "${metric_for_best_model}"
    --greater_is_better "${greater_is_better}"
    "--run_name=${run_name}"
    --output_dir "${output_dir}"
    --logging_dir "${logging_dir}"
    --task_type "${task_type}"
    --inference_mode "${inference_mode}"
    --r "${r}"
    --lora_alpha "${lora_alpha}"
    --lora_dropout "${lora_dropout}"
    --fp16 True
    --optim adamw_torch
  )

  # stderr is merged into stdout so both end up in the log file via tee.
  CUDA_VISIBLE_DEVICES="${GPU_ID}" \
    "${python_path}" -u exps/run_glue_llama.py "${train_args[@]}" 2>&1 |
    tee -a "${log_pth}"
  # tee's status would otherwise hide a failed run; read the trainer's own
  # status right after the pipeline. Remaining seeds still run, as before.
  train_status=${PIPESTATUS[0]}
  if ((train_status != 0)); then
    printf 'warning: seed %s exited with status %s (log: %s)\n' \
      "${seed}" "${train_status}" "${log_pth}" >&2
  fi

done
