#!/bin/bash
# Resolve the project root: the parent of the directory that contains this
# script. Abort if it cannot be resolved, because the output directory and the
# FastChat directory used later are both built from this value.
PROJECT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)" || {
    echo "ERROR: Failed to resolve project root directory" >&2
    exit 1
}
# Append the project root to PYTHONPATH. The ${VAR:+...} form only inserts the
# ':' separator when PYTHONPATH already has content; a bare "$PYTHONPATH:dir"
# with an empty PYTHONPATH would produce a leading ':' (an empty search-path
# entry, which Python may interpret as the current working directory).
export PYTHONPATH="${PYTHONPATH:+${PYTHONPATH}:}${PROJECT_DIR}"

# cleanup: delete the temporary model directory named by TMP_MODEL_PATH.
# Globals:  TMP_MODEL_PATH (read) - path of the temporary model copy, may be
#           unset or empty.
# Outputs:  a one-line notice on stdout when a directory is actually removed.
# Returns:  0 when there is nothing to remove, otherwise the status of rm.
cleanup() {
    # No path recorded: nothing to do.
    [ -z "$TMP_MODEL_PATH" ] && return 0
    # Path recorded but the directory does not exist (or was already removed).
    [ -d "$TMP_MODEL_PATH" ] || return 0

    echo "Cleaning up temporary model copy..."
    rm -rf "$TMP_MODEL_PATH"
}

# Run cleanup whenever the shell exits, whether normally or after an error.
trap cleanup EXIT

# --- Process-wide environment for the Python runs launched below ---

# Fix Python's hash seed so hash-dependent behavior is the same on every run.
export PYTHONHASHSEED=42

# Ask the PyTorch CUDA allocator to use expandable segments (the original note
# marks this as critical for avoiding out-of-memory errors).
export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True

# Limit each listed library to one thread.
# NOTE(review): the previous comment here said these thread restrictions were
# being commented out because they may cause memory issues, but they are
# active. Confirm which behavior is intended. GRB_* is presumably Gurobi.
export GRB_NUM_THREADS=1 \
       OMP_NUM_THREADS=1 \
       MKL_NUM_THREADS=1 \
       NUMEXPR_NUM_THREADS=1 \
       OPENBLAS_NUM_THREADS=1

# --- Experiment configuration ---

# Seeds to run; the experiment loop executes once per entry.
SEEDS=(42 777 2025)

# Model, task and LoRA search space handed to the training script.
MODEL_NAME="meta-llama/Meta-Llama-3-8B"
TASK_NAME="alpaca"
LORA_R_VALUES="0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16"
LORA_BUDGET=40000000

# Training hyperparameters forwarded as command-line flags to the training run.
batch_size=2
gradient_accumulation_steps=16
learning_rate=1.0e-04
num_epochs=5
weight_decay=0.08
warmup_ratio=0.044
max_grad_norm=0.612
max_seq_length=256
lora_dropout=0.069

# The settings below are defined but not referenced by any command visible in
# this script; they are kept so the recorded configuration stays complete.
pruning_target_reduction=0.8
pruning_steps=10
importance_ema_decay=0.015
momentum_penalty_weight=0.809
RECOVERY_STEPS=100  # Recovery steps after pruning
EXTENDED_RECOVERY_STEPS=200

# Per-seed experiment loop.
#
# For every entry in SEEDS this block:
#   1. trains with scripts/run_alpaca.py into a fresh, timestamped OUTPUT_DIR;
#   2. copies OUTPUT_DIR/final_model to a temporary directory;
#   3. runs the FastChat llm_judge scripts (gen_model_answer.py,
#      gen_judgment.py, show_result.py) against that copy;
#   4. moves the generated result files into OUTPUT_DIR/mt_bench_results.
#
# Globals read:    SEEDS, PROJECT_DIR, MODEL_NAME, TASK_NAME, LORA_R_VALUES,
#                  LORA_BUDGET and the lowercase training hyperparameters.
# Globals written: SUCCESSFUL_RUNS, FAILED_RUNS, TMP_MODEL_PATH (plus the
#                  per-seed working variables).
# Uses cleanup() to delete the temporary model copy as soon as a seed is
# finished (successfully or not), so copies do not pile up until script exit.

# Initialize counters for tracking results
SUCCESSFUL_RUNS=0
FAILED_RUNS=0

# Loop through each seed and run the experiment
for SEED in "${SEEDS[@]}"; do
    echo "========================================"
    echo "Starting training with seed ${SEED}"
    echo "========================================"

    # Generate single timestamp for entire experiment
    EXPERIMENT_TIMESTAMP=$(date +%Y%m%d_%H%M%S)

    # Create unique output directory for this seed (use absolute path)
    OUTPUT_DIR="${PROJECT_DIR}/output/${TASK_NAME}_optimal_lora_seed${SEED}_${EXPERIMENT_TIMESTAMP}"
    mkdir -p "$OUTPUT_DIR"

    # Run the training script with current seed (exact same as trial_0).
    # The script is addressed through PROJECT_DIR so the launch does not depend
    # on the directory this wrapper was started from.
    python "${PROJECT_DIR}/scripts/run_alpaca.py" \
        --model_name_or_path "$MODEL_NAME" \
        --task_name "$TASK_NAME" \
        --output_dir "$OUTPUT_DIR" \
        --lora_r_values "$LORA_R_VALUES" \
        --lora_budget "$LORA_BUDGET" \
        --per_device_train_batch_size "$batch_size" \
        --per_device_eval_batch_size "$batch_size" \
        --gradient_accumulation_steps "$gradient_accumulation_steps" \
        --learning_rate "$learning_rate" \
        --num_train_epochs "$num_epochs" \
        --weight_decay "$weight_decay" \
        --warmup_ratio "$warmup_ratio" \
        --max_seq_length "$max_seq_length" \
        --lr_scheduler_type cosine \
        --max_grad_norm "$max_grad_norm" \
        --lora_dropout "$lora_dropout" \
        --seed "$SEED" \
        --overwrite_output_dir

    # Must be captured immediately: any later command overwrites $?.
    TRAIN_EXIT_CODE=$?

    # Save final reproducibility verification
    {
        echo "Execution completed: $(date)"
        echo "Exit code: $TRAIN_EXIT_CODE"
        echo "Used seed: ${SEED}"
    } >> "$OUTPUT_DIR/reproducibility_info.txt"

    # Check if training was successful
    if [ "$TRAIN_EXIT_CODE" -ne 0 ]; then
        echo "ERROR: Training failed with exit code $TRAIN_EXIT_CODE"
        echo "Skipping MT-Bench evaluation for seed ${SEED}"
        FAILED_RUNS=$((FAILED_RUNS + 1))
        continue
    fi

    echo "Completed training with seed ${SEED}"
    echo "========================================"

    # Run MT-Bench evaluation
    echo ""
    echo "========================================"
    echo "Starting MT-Bench evaluation for seed ${SEED}"
    echo "========================================"

    # Use the same timestamp from experiment start
    EVAL_TIMESTAMP="$EXPERIMENT_TIMESTAMP"

    # Check if final_model exists
    if [ ! -d "$OUTPUT_DIR/final_model" ]; then
        echo "Error: $OUTPUT_DIR/final_model does not exist. Skipping MT-Bench evaluation."
        FAILED_RUNS=$((FAILED_RUNS + 1))
        continue
    fi

    # Ensure tmp directory exists
    mkdir -p /home/work/tmp

    # Copy model to tmp directory for MT-Bench evaluation
    TMP_MODEL_PATH="/home/work/tmp/llama3_model_for_mtbench_seed${SEED}_${EVAL_TIMESTAMP}"
    echo "Copying model to temporary location: ${TMP_MODEL_PATH}..."
    rm -rf "$TMP_MODEL_PATH"
    if ! cp -r "$OUTPUT_DIR/final_model" "$TMP_MODEL_PATH"; then
        echo "ERROR: Failed to copy model to tmp directory"
        # Remove whatever was partially copied.
        cleanup
        FAILED_RUNS=$((FAILED_RUNS + 1))
        continue
    fi

    # Use model from tmp directory
    MT_BENCH_MODEL_PATH="$TMP_MODEL_PATH"
    echo "Using model from ${MT_BENCH_MODEL_PATH}..."

    # Set MT-Bench model ID with timestamp to avoid conflicts
    MT_BENCH_MODEL_ID="trial_seed_${SEED}_${EVAL_TIMESTAMP}"

    # Save current directory and change to FastChat directory; the llm_judge
    # scripts below are invoked with paths relative to that directory.
    ORIGINAL_DIR=$(pwd)
    cd "${PROJECT_DIR}/FastChat/fastchat/llm_judge" || {
        echo "ERROR: Failed to cd to FastChat directory"
        cleanup
        FAILED_RUNS=$((FAILED_RUNS + 1))
        continue
    }

    # Set environment variables
    if [ -f "${PROJECT_DIR}/set_env.sh" ]; then
        # shellcheck disable=SC1091 -- file is resolved at runtime
        source "${PROJECT_DIR}/set_env.sh"
    else
        echo "Warning: set_env.sh not found at ${PROJECT_DIR}/set_env.sh"
    fi

    # Step 1: Generate model answers
    echo "Generating model answers..."
    if ! python gen_model_answer.py \
        --model-path "$MT_BENCH_MODEL_PATH" \
        --model-id "$MT_BENCH_MODEL_ID" \
        --num-gpus-per-model 1 \
        --num-gpus-total 1; then
        echo "ERROR: gen_model_answer.py failed"
        cd "$ORIGINAL_DIR"
        cleanup
        FAILED_RUNS=$((FAILED_RUNS + 1))
        continue
    fi

    # Clean up any existing judgment file to prevent accumulation
    rm -f data/mt_bench/model_judgment/gpt-4_single.jsonl

    # Step 2: Generate GPT-4 judgments
    echo "Generating GPT-4 judgments..."
    if ! python gen_judgment.py \
        --model-list "$MT_BENCH_MODEL_ID" \
        --parallel 2 \
        --mode single; then
        echo "ERROR: gen_judgment.py failed"
        cd "$ORIGINAL_DIR"
        cleanup
        FAILED_RUNS=$((FAILED_RUNS + 1))
        continue
    fi

    # Step 3: Show and save results
    echo "Showing results..."
    # Store MT-Bench results in the output directory
    MT_BENCH_RESULTS_DIR="${OUTPUT_DIR}/mt_bench_results"
    mkdir -p "$MT_BENCH_RESULTS_DIR"
    python show_result.py --model-list "$MT_BENCH_MODEL_ID" \
        | tee "$MT_BENCH_RESULTS_DIR/results_seed${SEED}_${EVAL_TIMESTAMP}.txt"

    # PIPESTATUS[0] is python's status; $? alone would report tee's.
    if [ "${PIPESTATUS[0]}" -ne 0 ]; then
        echo "Warning: show_result.py may have failed, but continuing..."
    fi

    # Move the entire judgment file to our results directory
    if [ -f "data/mt_bench/model_judgment/gpt-4_single.jsonl" ]; then
        mv "data/mt_bench/model_judgment/gpt-4_single.jsonl" "$MT_BENCH_RESULTS_DIR/gpt4_judgment_seed${SEED}_${EVAL_TIMESTAMP}.jsonl"
        echo "GPT-4 judgments saved to $MT_BENCH_RESULTS_DIR/gpt4_judgment_seed${SEED}_${EVAL_TIMESTAMP}.jsonl"
    else
        echo "Warning: No judgment file generated"
    fi

    # Move (not copy) the model answer file to our results directory to avoid
    # accumulation inside the FastChat tree
    if [ -f "data/mt_bench/model_answer/${MT_BENCH_MODEL_ID}.jsonl" ]; then
        mv "data/mt_bench/model_answer/${MT_BENCH_MODEL_ID}.jsonl" "$MT_BENCH_RESULTS_DIR/model_answer_seed${SEED}_${EVAL_TIMESTAMP}.jsonl"
    fi

    # Return to original directory
    cd "$ORIGINAL_DIR"

    # The temporary copy is no longer needed; free the disk space before the
    # next seed starts training.
    cleanup

    echo "MT-Bench evaluation completed for seed ${SEED}"
    echo "========================================"
    SUCCESSFUL_RUNS=$((SUCCESSFUL_RUNS + 1))
done

# Final report: print how many seeds succeeded and failed, and exit with
# status 1 when at least one run failed.
printf '\n'
printf '%s\n' \
    "========================================" \
    "EXPERIMENT SUMMARY" \
    "========================================"
printf '%s\n' \
    "Successful runs: $SUCCESSFUL_RUNS" \
    "Failed runs: $FAILED_RUNS"

if [ "$FAILED_RUNS" -eq 0 ]; then
    echo "All training runs and MT-Bench evaluations completed successfully!"
else
    echo "WARNING: $FAILED_RUNS runs failed. Check logs for details."
    exit 1
fi