#!/bin/bash
# Environment switches, exported so that child processes (the python trainer
# started by run_task) inherit them.
export WANDB_DISABLED=true
export TOKENIZERS_PARALLELISM=false

# ---- Base model ------------------------------------------------------------
# Alternatives kept for reference:
#   model="microsoft_Phi-3-mini-4k-instruct"
#   model="Qwen-7B"
#   pretrained_model="/data/${model}"
model='Llama-3'
pretrained_model="/home/${model}"

# ---- Optimisation settings (forwarded unchanged to the trainer) ------------
lr='1e-5'
per_device_train_batch_size='4'
gradient_accumulation_steps='8'
max_seq_length='512'

# ---- LoRA settings ---------------------------------------------------------
lora_rank='64'
lora_alpha='128'
lora_dropout='0.05'
# Comma-separated module names passed as --trainable. Other lists that were
# used previously:
#   lora_trainable="self_attn.o_proj,self_attn.qkv_proj,mlp.gate_up_proj,mlp.down_proj"
#   lora_trainable="attn.c_attn,attn.c_proj,mlp.w1,mlp.w2,mlp.c_proj"
lora_trainable='self_attn.q_proj,self_attn.v_proj,self_attn.k_proj,self_attn.o_proj,mlp.gate_proj,mlp.down_proj,mlp.up_proj'
modules_to_save='None'
# Forwarded as --full_finetuning.
no_lora='False'

# NOTE(review): RANDOM is a special bash variable. Assigning to it seeds the
# shell's pseudo-random generator; it does not make $RANDOM expand to 0.
# run_task passes $RANDOM as --seed, so the training seed is presumably not 0.
# Confirm whether a fixed seed was intended.
RANDOM=0

# ---- Data and output locations ---------------------------------------------
# Previous data location: datadir="/home/gsm8k_split_data"
datadir='/home/Logic_data'
datasets=('EN_ProofWriter' 'EN_FOLIO' 'EN_LogicalDeduction')

output_dir_base_file='/data/Llama-3_begin_end_layer'
mkdir -p "${output_dir_base_file}"

# ---- GPUs used for scheduling ----------------------------------------------
# Multi-GPU example: available_gpus=(3 5 6)
available_gpus=(0)

#######################################
# Fine-tune one (dataset, layer range) combination on a single GPU.
#
# Globals (read):
#   output_dir_base, model, pretrained_model, datadir, lr,
#   per_device_train_batch_size, gradient_accumulation_steps, max_seq_length,
#   lora_rank, lora_alpha, lora_trainable, lora_dropout, modules_to_save,
#   no_lora
# Arguments:
#   $1 - GPU id, exported to the trainer as CUDA_VISIBLE_DEVICES
#   $2 - value forwarded as --begin_layer
#   $3 - value forwarded as --end_layer
#   $4 - dataset name (sub-directory of datadir)
# Outputs:
#   Progress messages on stdout, failure notice on stderr. The trainer's own
#   stdout/stderr are written to <output_dir>/<dataset>_output.txt.
# Returns:
#   0 when the run is skipped (output directory already exists) or the
#   trainer succeeds; otherwise the status of the failing command.
#######################################
run_task() {
  local gpu_id=$1
  local begin_layer=$2
  local end_layer=$3
  local dataset=$4
  local output_dir stdout_log status

  output_dir="${output_dir_base}/${model}_begin_layer_${begin_layer}_end_layer_${end_layer}_${dataset}_model"

  # An existing output directory is treated as "already trained".
  # Caveat: the directory is created below, before training starts, so a
  # failed or interrupted run is skipped as well until its directory is
  # removed by hand.
  if [[ -d "${output_dir}" ]]; then
    echo "Output directory ${output_dir} already exists. Skipping training for dataset ${dataset} with begin_layer ${begin_layer}, end_layer ${end_layer} on GPU ${gpu_id}."
    return 0
  fi

  mkdir -p -- "${output_dir}" || return
  stdout_log="${output_dir}/${dataset}_output.txt"

  echo "Starting training for dataset ${dataset} with begin_layer ${begin_layer}, end_layer ${end_layer} on GPU ${gpu_id}. Save output to ${output_dir}"

  # NOTE(review): --seed receives bash's special $RANDOM, i.e. a pseudo-random
  # number, not a constant. Callers launch this function with '&', and bash
  # appears to reseed RANDOM inside subshells, so the seed presumably differs
  # from run to run. Confirm whether a fixed seed was intended.
  CUDA_VISIBLE_DEVICES="${gpu_id}" python sft_with_only_layers.py \
    --model_name_or_path "${pretrained_model}" \
    --tokenizer_name_or_path "${pretrained_model}" \
    --dataset_dir "${datadir}/${dataset}" \
    --per_device_train_batch_size "${per_device_train_batch_size}" \
    --do_train \
    --low_cpu_mem_usage \
    --do_eval False \
    --seed "$RANDOM" \
    --bf16 \
    --num_train_epochs 3 \
    --lr_scheduler_type cosine \
    --learning_rate "${lr}" \
    --warmup_ratio 0.03 \
    --logging_strategy steps \
    --logging_steps 10 \
    --save_strategy steps \
    --save_total_limit 100 \
    --save_steps 10000 \
    --gradient_accumulation_steps "${gradient_accumulation_steps}" \
    --preprocessing_num_workers 1 \
    --max_seq_length "${max_seq_length}" \
    --output_dir "${output_dir}" \
    --overwrite_output_dir \
    --ddp_timeout 30000 \
    --logging_first_step True \
    --lora_rank "${lora_rank}" \
    --lora_alpha "${lora_alpha}" \
    --trainable "${lora_trainable}" \
    --lora_dropout "${lora_dropout}" \
    --modules_to_save "${modules_to_save}" \
    --torch_dtype bfloat16 \
    --load_in_kbits 16 \
    --ddp_find_unused_parameters False \
    --full_finetuning "${no_lora}" \
    --begin_layer "${begin_layer}" \
    --end_layer "${end_layer}" > "${stdout_log}" 2>&1
  status=$?

  # Surface failures on stderr; otherwise they are only visible in the log.
  if (( status != 0 )); then
    echo "Training failed for dataset ${dataset} with begin_layer ${begin_layer}, end_layer ${end_layer} on GPU ${gpu_id} (exit status ${status}). See ${stdout_log}" >&2
  fi
  return "${status}"
}

# Per-dataset layer window. For every dataset four runs are scheduled:
#   [begin, end], [0, begin], [end, num_layers] and [0, num_layers].
declare -A begin_layers=( ["gsm8k_alpaca"]=3 ["EN_ProofWriter"]=4 ["EN_FOLIO"]=3 ["EN_LogicalDeduction"]=4 )
declare -A end_layers=( ["gsm8k_alpaca"]=20 ["EN_ProofWriter"]=19 ["EN_FOLIO"]=21 ["EN_LogicalDeduction"]=18 )

# Upper layer bound for the "tail" and "all layers" runs.
# NOTE(review): presumably the number of transformer layers of the base
# model; confirm when switching models.
num_layers=32

task_id=0        # number of tasks launched so far (drives GPU round-robin)
failed_tasks=0   # number of background tasks that returned non-zero
batch_pids=()    # PIDs of the tasks launched since the last barrier

# Wait for every task of the current batch and count the failed ones.
wait_for_batch() {
  local pid
  for pid in "${batch_pids[@]}"; do
    wait "${pid}" || failed_tasks=$((failed_tasks + 1))
  done
  batch_pids=()
}

# Launch run_task in the background on the next GPU (round-robin over
# available_gpus). Once one task per GPU is running, block until the whole
# batch has finished so concurrency never exceeds the GPU count.
# Arguments: $1 - begin layer, $2 - end layer, $3 - dataset name
schedule_task() {
  local range_begin=$1
  local range_end=$2
  local dataset_name=$3
  local gpu_count=${#available_gpus[@]}
  local gpu_id=${available_gpus[$((task_id % gpu_count))]}

  run_task "${gpu_id}" "${range_begin}" "${range_end}" "${dataset_name}" &
  batch_pids+=("$!")
  task_id=$((task_id + 1))

  if (( task_id % gpu_count == 0 )); then
    wait_for_batch
  fi
}

# Train the datasets listed in `datasets`, in that order. Iterating over the
# keys of begin_layers instead would use an unspecified order and would also
# pick up entries (e.g. gsm8k_alpaca) that are not part of the configured
# dataset list. Fall back to the map keys only when `datasets` is empty.
dataset_list=("${datasets[@]}")
if (( ${#dataset_list[@]} == 0 )); then
  dataset_list=("${!begin_layers[@]}")
fi

if (( ${#available_gpus[@]} == 0 )); then
  # Without this guard the round-robin modulo would divide by zero.
  echo "No GPU listed in available_gpus; no training task was started." >&2
else
  for dataset in "${dataset_list[@]}"; do
    begin_layer=${begin_layers[$dataset]:-}
    end_layer=${end_layers[$dataset]:-}
    if [[ -z "${begin_layer}" || -z "${end_layer}" ]]; then
      echo "No begin/end layer configured for dataset ${dataset}; skipping." >&2
      continue
    fi

    # run_task reads output_dir_base as a global.
    output_dir_base="${output_dir_base_file}/${model}_${dataset}_begin_end_layer"
    mkdir -p -- "${output_dir_base}"

    # Four runs per dataset, GPUs assigned in turn.
    schedule_task "${begin_layer}" "${end_layer}" "${dataset}"
    schedule_task 0 "${begin_layer}" "${dataset}"
    schedule_task "${end_layer}" "${num_layers}" "${dataset}"
    schedule_task 0 "${num_layers}" "${dataset}"
  done

  # Wait for any remaining tasks to complete.
  wait_for_batch

  if (( failed_tasks > 0 )); then
    echo "${failed_tasks} training task(s) failed; check the *_output.txt logs under ${output_dir_base_file}." >&2
  else
    echo "Training completed for all datasets, begin_layer, and end_layer values."
  fi
fi
