#!/bin/bash

# ---------------------------------------------------------------------------
# Environment for the tools launched by this script.
# ---------------------------------------------------------------------------

#######################################
# Derive a per-job port from a scheduler job id so that concurrent jobs are
# unlikely to collide on the same port.
# Arguments: $1 - job id (may be empty or omitted)
# Outputs:   10000 + the numeric value of the last four characters of the
#            id; plain 10000 when the id is empty or that tail is not
#            purely numeric.
# Returns:   0 always
#######################################
master_port_for_job() {
  local job_id=${1:-}
  local -r base_port=10000

  # Keep at most the trailing four characters of the id.
  if (( ${#job_id} > 4 )); then
    job_id=${job_id: -4}
  fi

  if [[ $job_id =~ ^[0-9]+$ ]]; then
    # 10# forces base 10 so a tail such as "0089" is not parsed as octal.
    printf '%d\n' "$(( base_port + 10#$job_id ))"
  else
    # No usable job id (e.g. running outside the scheduler): use the base port
    # instead of exporting an empty value.
    printf '%d\n' "$base_port"
  fi
}

# Root directory under which the adapter and results paths are built.
export PROJECT_CACHE=save_output
export WANDB_MODE=offline
# Handed to the trainer as fsdp_port. Assigned separately from `export` so a
# failure in the computation is not masked by export's own exit status.
MASTER_PORT=$(master_port_for_job "${SLURM_JOBID:-}")
export MASTER_PORT
export TORCH_DISTRIBUTED_DEBUG=OFF
export HYDRA_FULL_ERROR=1
export HF_DATASETS_OFFLINE=1
export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True

# ---------------------------------------------------------------------------
# Run configuration (experiment tag: idLora_m2)
# ---------------------------------------------------------------------------

# Base model, dataset and index file handed to the trainer.
model=mistral7b
dataset_name=multi_alpaca
indexfile=src/dataset/copy_from_dense_log128_mistral.json

# Optimisation settings; grad_norm is forwarded as max_grad_norm.
lr=5e-5
n_epochs=2
batch_size=32
gradient_accumulation_steps=1
grad_norm=1
save_every=epoch_1

# LoRA adapter settings.
lora_alpha=256
lora_rank=128

# Forwarded unchanged to both evaluation commands.
sparsity_ratio=0.0

# Derived names: results are grouped per dataset/model pair, and the experiment
# name encodes the main hyperparameters of this run.
results_path="${PROJECT_CACHE}/${dataset_name}_${model}"
printf -v exp_name '%s_%s/idLora_m2_rank_%s_alpha_%s_lr_%s_bs_%s' \
        "$dataset_name" "$model" "$lora_rank" "$lora_alpha" "$lr" "$batch_size"
# Adapter location used for evaluation -- presumably the checkpoint the trainer
# writes after epoch ${n_epochs}; confirm against src/train_id.py.
adapter_path="${PROJECT_CACHE}/${exp_name}/epoch-${n_epochs}"


# ---------------------------------------------------------------------------
# Stage 1: training.
# Every override is quoted: without quotes, `datasets=[...]` is a shell glob
# pattern and any value containing whitespace would be split into extra args.
# If training fails the script stops with the trainer's exit status, because
# both evaluation stages below read ${adapter_path}.
# ---------------------------------------------------------------------------
python -u src/train_id.py \
        "model=${model}" \
        "datasets=[${dataset_name}]" \
        "exp_name=${exp_name}" \
        "lr=${lr}" \
        "save_every=${save_every}" \
        "n_epochs=${n_epochs}" \
        "batch_size=${batch_size}" \
        model.fsdp_policy_mp=bfloat16 \
        "fsdp_port=${MASTER_PORT}" \
        optimizer=AdamW \
        grad_norm_strategy=even \
        "max_grad_norm=${grad_norm}" \
        "lora_rank=${lora_rank}" \
        "lora_alpha=${lora_alpha}" \
        "indexfile=${indexfile}" \
        "gradient_accumulation_steps=${gradient_accumulation_steps}" \
    || {
        train_status=$?
        echo "error: src/train_id.py exited with status ${train_status}; skipping evaluation of ${adapter_path}" >&2
        exit "${train_status}"
    }

# ---------------------------------------------------------------------------
# Stage 2: evaluate the trained adapter with src/eval_model.py.
# Stages 2 and 3 are independent: a failure here does not prevent stage 3.
# ---------------------------------------------------------------------------
python src/eval_model.py \
        --model_name "${model}" \
        --adapter_path "${adapter_path}" \
        --datasets "${dataset_name}" \
        --results_path "${results_path}" \
        --sparsity_ratio "${sparsity_ratio}" \
        --batch_size 16

# ---------------------------------------------------------------------------
# Stage 3: run the humaneval task through bigcode/main.py.
# ---------------------------------------------------------------------------
accelerate launch bigcode/main.py \
        --model "${model}" \
        --peft_model "${adapter_path}" \
        --metric_output_path "${results_path}" \
        --tasks humaneval \
        --temperature 0.2 \
        --n_samples 20 \
        --batch_size 10 \
        --sparsity_ratio "${sparsity_ratio}" \
        --allow_code_execution