#!/bin/bash
# Hyper-parameter and run configuration for the training loop later in this
# script. Apart from train_with_input, each value set here is consumed by that
# loop (as a command-line flag, or as part of the run name / output paths).

# Rendezvous port: pseudo-random pick from 20000..20706, printed for reference.
MASTER_PORT=$((20000 + RANDOM % 707))
printf '%s\n' "${MASTER_PORT}"

# ---- Experiment identity ----------------------------------------------------
# NOTE(review): the project name mentions "gemma" while the model below is
# Mistral-7B — confirm this is the intended project name.
project_name="LIMA_gemma_7b_FFT"
model="Mistral-7B-v0.1"
model_name_or_path="mistralai/${model}"
dataset="LIMA"

# ---- Which phases the training script should run ----------------------------
do_train="True"
do_eval="False"
do_mmlu_eval="False"

# ---- Fine-tuning mode -------------------------------------------------------
full_finetune="True"
use_gradient_score="False"
all_layers="ALL+LM+EMBED"
lora_r="32"
lora_dropout="0.1"
whether_unfreeze_normal="False"
whether_localization="False"
default_localization="True"
# dropout=0.2

# ---- Numeric precision / quantization ---------------------------------------
whether_quantize="False"
bits="16"
bf16="True"
fp16="False"
gradient_checkpointing="False"

# ---- Optimizer and schedule -------------------------------------------------
optim="adamw_torch"
beta1="0.9"
beta2="0.999"
weight_decay="0.1"
max_grad_norm="1.0"
lr_scheduler_type="cosine"
warmup_ratio="0.03"
max_steps="-1"
# num_train_epochs=2   (the epoch count is chosen by the loop instead)

# ---- Data handling ----------------------------------------------------------
seed="1997"
data_seed="1997"
dataloader_num_workers="4"
group_by_length="False"
# max_train_samples=25000
# NOTE(review): train_with_input is not referenced anywhere in the visible
# part of this script — confirm whether it is still needed.
train_with_input="True"
train_on_source="True"
train_without_system="True"
# NOTE(review): source_max_len=6 is very small next to target_max_len=1024 —
# confirm it is not a typo.
source_max_len="6"
target_max_len="1024"

# ---- Logging and checkpointing ----------------------------------------------
logging_strategy="steps"
save_strategy="no"
save_interval="500"
save_total_limit="3"
save_only_model="True"

#######################################
# Evaluate a checkpoint with lm_eval, started through `accelerate launch`.
# Globals:   CUDA, MASTER_PORT (read)
# Arguments: $1 - checkpoint directory, passed to lm_eval as `pretrained=`
#            $2 - comma-separated task list
#            $3 - number of few-shot examples
#            $4 - output path for the results
# Returns:   exit status of the launched command
#######################################
run_lm_eval() {
    local ckpt=$1 task_list=$2 fewshot=$3 out_path=$4

    CUDA_VISIBLE_DEVICES="$CUDA" accelerate launch --main_process_port "$MASTER_PORT" \
        -m lm_eval --model hf --model_args "pretrained=${ckpt},dtype=bfloat16" \
        --tasks "$task_list" --batch_size auto --output_path "$out_path" --num_fewshot "$fewshot"
}

#######################################
# Generate benchmark answers for a checkpoint with gen_model_answer.py.
# The generation settings are function-local so that they cannot overwrite
# the training flags (notably `bf16`) used by later loop iterations.
# Globals:   CUDA (read)
# Arguments: $1 - checkpoint directory
#            $2 - model name passed as --model_name
#            $3 - benchmark name (e.g. vicuna_bench, mt_bench)
#            $4 - value for --num-gpus-total
# Returns:   exit status of gen_model_answer.py
#######################################
run_generation() {
    local ckpt=$1 answer_name=$2 bench=$3 total_gpus=$4
    local gen_bf16=True
    local choices=1
    local gen_model_id='LIMA'
    local gpus_per_model=1
    local token_budget=1024

    CUDA_VISIBLE_DEVICES="$CUDA" python \
        gen_model_answer.py \
        --model_name_or_path "$ckpt" --model_name "$answer_name" \
        --bf16 "$gen_bf16" --num-choices "$choices" --model_id "$gen_model_id" \
        --num-gpus-per-model "$gpus_per_model" --num-gpus-total "$total_gpus" --bench-name "$bench" \
        --max_new_token "$token_budget"
}

# Sweep over learning rates and epoch counts. For each combination: train,
# run the lm_eval benchmarks, then generate benchmark answers.
for learning_rate in 0.00001; do
    for num_train_epochs in 0.1; do
        n_gpus=4
        # NOTE(review): eight devices are exposed although n_gpus is 4 — confirm
        # this mismatch is intended.
        CUDA=0,1,2,3,4,5,6,7
        per_device_train_batch_size=2
        gradient_accumulation_steps=2

        run_name="FFT-FT${full_finetune}-${dataset}-${model}-bf16-${all_layers}-lr${learning_rate}-ep${num_train_epochs}-s${data_seed}-wd${weight_decay}-mg${max_grad_norm}-b2${beta2}-TOS${train_on_source}"

        output_dir="./output/${model}/${run_name}/"
        results_dir="./results/${model}/${run_name}/"
        # Every evaluation and generation stage below reads the model from here.
        # Presumably run_llama2.py writes the final weights to this path — TODO confirm.
        checkpoint_dir="./output/${model}/${run_name}/checkpoint-final"

        # Arguments for run_llama2.py, kept in an array so each value stays one word.
        train_args=(
            --model_name_or_path "$model_name_or_path" --output_dir "$output_dir" --do_mmlu_eval "$do_mmlu_eval"
            --do_eval "$do_eval" --max_steps "$max_steps" --whether_localization "$whether_localization"
            --optim "$optim" --run_name "$run_name" --project_name "$project_name" --do_train "$do_train" --bf16 "$bf16" --fp16 "$fp16"
            --whether_quantize "$whether_quantize" --bits "$bits" --gradient_checkpointing "$gradient_checkpointing" --whether_unfreeze_normal "$whether_unfreeze_normal"
            --data_seed "$data_seed" --seed "$seed" --dataloader_num_workers "$dataloader_num_workers" --lora_dropout "$lora_dropout"
            --lr_scheduler_type "$lr_scheduler_type" --dataset "$dataset" --source_max_len "$source_max_len" --target_max_len "$target_max_len" --learning_rate "$learning_rate"
            --default_localization "$default_localization" --warmup_ratio "$warmup_ratio" --all_layers "$all_layers" --logging_strategy "$logging_strategy"
            --lora_r "$lora_r" --per_device_train_batch_size "$per_device_train_batch_size" --gradient_accumulation_steps "$gradient_accumulation_steps"
            --full_finetune "$full_finetune" --weight_decay "$weight_decay"
            --use_gradient_score "$use_gradient_score" --save_interval "$save_interval"
            --save_only_model "$save_only_model" --results_dir "$results_dir" --save_strategy "$save_strategy" --save_total_limit "$save_total_limit"
            --num_train_epochs "$num_train_epochs" --max_grad_norm "$max_grad_norm" --deepspeed zero1_config_accelerate.json
            --beta1 "$beta1" --beta2 "$beta2"
            --group_by_length "$group_by_length"
            --train_on_source "$train_on_source"
            --train_without_system "$train_without_system"
        )

        ##########################################################################################################
        # Training
        # A failed training run leaves nothing to evaluate, so skip the
        # remaining stages for this configuration instead of launching them
        # against a missing or partial checkpoint.
        if ! CUDA_VISIBLE_DEVICES="$CUDA" torchrun --nproc_per_node "$n_gpus" --master_port "$MASTER_PORT" \
            run_llama2.py "${train_args[@]}"; then
            printf 'error: training failed for %s; skipping evaluation and generation\n' "$run_name" >&2
            continue
        fi

        if [[ ! -d "$checkpoint_dir" ]]; then
            printf 'error: checkpoint directory %s not found; skipping evaluation and generation\n' "$checkpoint_dir" >&2
            continue
        fi

        ##########################################################################################################
        # MMLU (5-shot)
        run_lm_eval "$checkpoint_dir" mmlu 5 "$results_dir"

        # HellaSwag and WinoGrande (0-shot)
        run_lm_eval "$checkpoint_dir" hellaswag,winogrande 0 "$results_dir"

        ##########################################################################################################
        # Generation: vicuna_bench first, then mt_bench
        for bench_name in vicuna_bench mt_bench; do
            run_generation "$checkpoint_dir" "$run_name" "$bench_name" "$n_gpus"
        done
    done
done
