#!/bin/bash

### You can uncomment these lines to set the HF cache and token globally
# export HF_HOME=/dccstor/larimar/irene/.cache/ # HF cache directory, used to store models downloaded from Hugging Face
# export HF_TOKEN= ## PLEASE ADD YOUR HF TOKEN HERE ##

#######################################
# Print a comma-separated list of GPU IDs to use.
# Globals:   CUDA_VISIBLE_DEVICES (read) - when set and non-empty it is the
#            candidate list; otherwise nvidia-smi is queried for all indices.
# Arguments: $1 - maximum number of GPUs to return (0 or omitted = all)
# Outputs:   the selected IDs on stdout, e.g. "0,1"
#######################################
pick_gpus() {
  local n="${1:-0}"
  local ids
  # Prefer scheduler-provided visibility; else list all GPU indices
  ids="${CUDA_VISIBLE_DEVICES:-$(nvidia-smi --query-gpu=index --format=csv,noheader | paste -sd, -)}"
  if (( n > 0 )); then
    # Take the first n IDs. The separator is written *before* every field
    # except the first, so requesting more GPUs than are available does not
    # leave a trailing comma (e.g. ids="0,1", n=4 -> "0,1", not "0,1,").
    printf '%s\n' "$ids" |
      awk -F, -v n="$n" '{
        for (i = 1; i <= NF && i <= n; i++) printf "%s%s", (i > 1 ? "," : ""), $i
        print ""
      }'
  else
    printf '%s\n' "$ids"
  fi
}

# --- CUDA toolchain discovery ---
# DeepSpeed compatibility: when nvcc is on PATH, derive CUDA_HOME from the
# resolved nvcc location (two directory levels up) and export the related
# variables. Nothing is changed when nvcc cannot be found.
if command -v nvcc >/dev/null 2>&1; then
  CUDA_HOME="$(readlink -f "$(command -v nvcc)")"  # symlink-resolved nvcc binary
  CUDA_HOME="$(dirname "$CUDA_HOME")"              # directory containing nvcc
  CUDA_HOME="$(dirname "$CUDA_HOME")"              # its parent, used as the CUDA root
  CUDA_PATH="$CUDA_HOME"
  PATH="$CUDA_HOME/bin:$PATH"
  export CUDA_HOME CUDA_PATH PATH
  # Prepend the CUDA library directory, preferring lib64 and falling back to
  # lib (covers both conda and system CUDA installations)
  if [[ -d "$CUDA_HOME/lib64" ]]; then
    LD_LIBRARY_PATH="$CUDA_HOME/lib64:${LD_LIBRARY_PATH:-}"
    export LD_LIBRARY_PATH
  elif [[ -d "$CUDA_HOME/lib" ]]; then
    LD_LIBRARY_PATH="$CUDA_HOME/lib:${LD_LIBRARY_PATH:-}"
    export LD_LIBRARY_PATH
  fi
fi

##### Fixed Configuration #####
# Base directory; experiment results are written under ${output_home}/output/
output_home='./'
# DeepSpeed master port; change this if running multiple DeepSpeed jobs on the same node
master_port='16909'

# Model configuration: the model identifier and the short name used in output paths
model='Qwen/Qwen3-0.6B-Base'
model_abbr='qwen-3-0.6b'

# GPU allocation: comma-separated GPU IDs handed to train.sh / test.sh via --gpus
train_gpus='0,1,2,3'
test_gpus='0,1,2,3'

# Batch size settings
#   inference_bs: VLLM inference batch size (does not affect model performance)
#   per_bs:       per-GPU training batch size for DeepSpeed (affects memory usage;
#                 set to lower values if you encounter OOM errors)
inference_bs='800'
per_bs='4'

##### Dynamic Experiment Configuration #####
# Every combination of the values below is run by the main loop.
#   seeds:        random seeds for reproducibility
#   ranks:        LoRA rank(s) to experiment with
#   tasks:        task(s) to run, e.g. "metamath:5000" (first 5000 samples)
#                 or "metamath-ep2" (2 epochs)
#   train_bs:     training batch size(s), passed to train.sh as --bs
#   peft_methods: PEFT methods to evaluate
seeds=(1 2 3)
ranks=(128)
tasks=('metamath')
train_bs=(16 128)
peft_methods=(
  'LoRA'
  'PiSSA'
  'MiLoRA'
  'InitAB'
  'DoRA'
)

##### Learning Rate Grid for Each PEFT Method #####
# Logarithmically spaced learning rates for hyperparameter search
# (2e-6 up to 2e-3, four points per decade, 13 values in total).
#
# default_lrs holds the shared grid. The main loop also falls back to it for
# any PEFT method that has no (or an empty) peft_lrs entry.
default_lrs="2.0000e-6 3.5566e-6 6.3246e-6 1.1247e-5 2.0000e-5 3.5566e-5 6.3246e-5 1.1247e-4 2.0000e-4 3.5566e-4 6.3246e-4 1.1247e-3 2.0000e-3"

# Per-method grids as space-separated strings. Every method currently sweeps
# the shared grid; replace an entry with its own list to give that method a
# different search range.
declare -A peft_lrs
peft_lrs["LoRA"]="$default_lrs"
peft_lrs["PiSSA"]="$default_lrs"
peft_lrs["InitAB"]="$default_lrs"
peft_lrs["MiLoRA"]="$default_lrs"
peft_lrs["DoRA"]="$default_lrs"

##### Main Experiment Loop #####
# Sweeps seed x rank x batch size x task x PEFT method x learning rate.
# Each combination is trained with scripts/train.sh and evaluated with
# scripts/test.sh, unless results from an earlier run are already present.

# Succeeds when file $1 exists and contains at least one digit.
# (Any match of the former pattern '[0-9]+\.?[0-9]*' implies a digit and vice
# versa, so testing for a single digit is equivalent.)
# NOTE(review): this is only a heuristic for "results were written" - a digit
# anywhere in the file, e.g. inside a key name, also counts.
perf_has_numbers() {
  [[ -f "$1" ]] && grep -q '[0-9]' -- "$1"
}

for seed in "${seeds[@]}"; do
  for rank in "${ranks[@]}"; do
    for bs in "${train_bs[@]}"; do
      for task in "${tasks[@]}"; do
        for peft in "${peft_methods[@]}"; do

          # Task-specific settings
          data="pissa-dataset"
          model_max_length=512

          # Retrieve learning rates for current PEFT method
          lr_string="${peft_lrs[$peft]:-}"

          if [[ -z "$lr_string" ]]; then
            echo "Warning: No learning rates defined for '$peft', using default_lrs"
            # default_lrs may legitimately be unset; treat that as empty
            lr_string="${default_lrs:-}"
          fi

          if [[ -z "$lr_string" ]]; then
            echo ">>> Skipping $peft: no learning rates available (peft_lrs entry and default_lrs are both empty)"
            continue
          fi

          # Split the space-separated grid into an array
          read -ra curr_lrs <<< "$lr_string"
          echo ">>> Finetuning $peft with ${#curr_lrs[@]} learning rates: ${curr_lrs[*]}"

          # Inner loop: iterate over all learning rates
          for lr in "${curr_lrs[@]}"; do
            echo ">>> Running experiment: seed=${seed}, rank=${rank}, bs=${bs}, task=${task}, peft=${peft}, lr=${lr}"
            # The timestamp tags this run's temporary directories
            timestamp=$(date +"%Y%m%d-%H%M%S")
            output_path=$(readlink -m "${output_home}/output/${task}-${peft}-${model_abbr}-r${rank}/bs${bs}-lr${lr}-trial${seed}")
            adapter_path="${output_path}/adapter_model"
            perf_json_path="${output_path}/perf.json"
            temp_path=$(readlink -m "${output_home}/output/temp_merged_model-${timestamp}")

            echo ">>> Experiment output path: $output_path"
            echo ">>> Temp merged model save path: $temp_path"

            # Skip if performance results already exist
            if perf_has_numbers "$perf_json_path"; then
              echo "✓ perf.json exists and contains numbers"
              echo "Skip this experiment"
              continue
            fi

            ##### Training Phase #####
            train_status=0
            if [[ -d "$adapter_path" ]]; then
              echo ">>> Skipping training: adapter already exists at $adapter_path"
            else
              echo ">>> Starting training, adapters will be saved at $adapter_path"
              ./scripts/train.sh \
                --data "$data" \
                --master_port "$master_port" \
                --model "$model" \
                --model_abbr "$model_abbr" \
                --output_path "$output_path" \
                --task "$task" \
                --peft "$peft" \
                --rank "$rank" \
                --gpus "$train_gpus" \
                --trial_id "$seed" \
                --lr "$lr" \
                --bs "$bs" \
                --model_max_length "$model_max_length" \
                --timestamp "$timestamp" \
                --per_bs "$per_bs" \
                --output_home "$output_home" || train_status=$?
            fi

            if (( train_status != 0 )); then
              # Do not evaluate after a failed training run; the cleanup steps
              # further down still execute.
              echo "✗ Training failed (exit code ${train_status}); skipping testing for this experiment"
            else
              ##### Testing Phase #####
              echo ">>> Starting testing, performance will be saved at $perf_json_path"
              test_status=0
              ./scripts/test.sh \
                --model "$model" \
                --model_abbr "$model_abbr" \
                --task "$task" \
                --peft "$peft" \
                --rank "$rank" \
                --bs "$inference_bs" \
                --gpus "$test_gpus" \
                --output_home "$output_home" \
                --output_path "$output_path" \
                --temp_path "$temp_path" \
                --timestamp "$timestamp" || test_status=$?
              if (( test_status != 0 )); then
                echo "✗ Testing exited with code ${test_status}"
              fi

              ##### Cleanup: Remove temporary merged model #####
              if [[ -d "$temp_path" ]]; then
                rm -rf -- "${temp_path:?}"
                echo "✓ Successfully deleted: $temp_path"
              else
                echo "✗ Directory not found: $temp_path (should already be deleted by Python script)"
              fi
            fi

            ##### Cleanup: Remove adapters if results are valid #####
            # This saves disk space after successful experiments
            if perf_has_numbers "$perf_json_path"; then
              echo "✓ perf.json exists and contains numbers"
              adapter_dir=$(readlink -m "${output_path}/adapter_model")
              if [[ -d "$adapter_dir" ]]; then
                rm -rf -- "${adapter_dir:?}"
                echo "✓ Successfully deleted adapter: $adapter_dir"
              else
                echo "✗ Adapter directory not found: $adapter_dir"
              fi
            else
              echo "✗ perf.json missing or contains no numbers"
              echo "Keeping adapter for debugging"
            fi

            ##### Special Cleanup for InitAB #####
            # InitAB uses random initialization, so we delete initialization files after each run
            # PiSSA and MiLoRA reuse the same initialization across learning rates
            if [[ "$peft" == "InitAB" ]]; then
              initAB_res_model=$(readlink -m "${output_home}/output/InitAB-${model_abbr}-r${rank}-${timestamp}")
              if [[ -d "$initAB_res_model" ]]; then
                echo "Deleting InitAB initialization files after experiment"
                rm -rf -- "${initAB_res_model:?}"
              fi
            fi
          done

        done
      done
    done
  done
done