#!/bin/bash

# =================================================================
# Parallel Grid Search Script for Federated Learning
# Usage: ./run_grid_search.sh
# Description: Runs experiments in parallel across multiple GPUs using 
#              a producer-consumer model (FIFO queue).
# =================================================================

# ==========================================
# 1. Tuning Configuration
# ==========================================
# Everything a user is expected to edit lives in this section. The values are
# plain globals read by the dispatch loop further down.

# --- Method under test ---
# [Modify Here] Forwarded to main.py as --method and embedded in the name of
# the output directory.
METHOD=FedNewton

# --- Hardware ---
# GPU IDs forming the worker pool; each ID runs at most one job at a time.
GPUS=(0 1 2)

# --- Search space (every combination is run once per seed) ---
# Server-side learning rate (for Newton-based methods).
NEWTON_LR_LIST=(
    0.1
    0.01
    0.001
)
# Damping factor (regularization term).
DAMPING_LIST=(
    0.000001
    0.00001
    0.0001
)

# --- Settings held constant across the sweep ---
CLIENT_LR=0.01          # Local SGD learning rate
G_NUM_CLIENTS=10
G_FRACTION=0.5
G_ROUNDS=10
G_LOCAL_EPOCHS=1
G_BATCH_SIZE=64
ALPHA=0.1               # Dirichlet alpha (data heterogeneity); 0.1 = highly non-IID

# --- Core training setup ---
TRAINING_MODE=head      # 'head' (fine-tune head) or 'full' (end-to-end)
DATASET=cifar10
MODEL_NAME=resnet18
# Alternative backbones (uncomment exactly one to switch):
# MODEL_NAME="clip_vit_b32"
# MODEL_NAME="vit_base_patch14_dinov2.lvd142m"
OPTIMIZER=sgd
NEWTON_HESSIAN_BATCHES=64

# --- Seeds ---
# Each grid point is repeated once per seed for statistical significance.
SEEDS=(0 1 2)

# --- Paths ---
# Absolute directory containing this script.
BASE_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
# One timestamp for the whole sweep so every job shares the same root.
TIMESTAMP_GLOBAL=$(date '+%Y%m%d_%H%M%S')

# [Modify Here] Root output directory (relative to the current working
# directory); the name records the method, dataset, model and training mode.
GLOBAL_LOG_DIR="runs/grid_search_${TIMESTAMP_GLOBAL}"
GLOBAL_LOG_DIR+="_${METHOD}_${DATASET}_${MODEL_NAME}_${TRAINING_MODE}"

# --- Print Config to Terminal ---
# Echo the effective sweep settings before any job starts, one argument per
# output line. The lists are joined with [*] so each stays a single word
# inside its string (space-separated under the default IFS).
printf '%s\n' \
    "==================================================" \
    "       Starting Parallel Grid Search: ${METHOD}" \
    "==================================================" \
    "Method:            ${METHOD}" \
    "Client LR (Fixed): ${CLIENT_LR}" \
    "Newton LR List:    ${NEWTON_LR_LIST[*]}" \
    "Damping List:      ${DAMPING_LIST[*]}" \
    "Global Dir:        ${GLOBAL_LOG_DIR}" \
    "--------------------------------------------------"

# -------------------------------------------------------
# Step 0: Preparation
# -------------------------------------------------------
# Create the run directory and snapshot the scripts next to the results for
# reproducibility.

# Without the run directory no job can write its log, so stop here instead of
# launching a sweep in which every job fails.
if ! mkdir -p "${GLOBAL_LOG_DIR}"; then
    echo "Error: cannot create log directory '${GLOBAL_LOG_DIR}'" >&2
    exit 1
fi

# Best-effort backup: a file missing from BASE_DIR is skipped silently, but a
# copy that fails for a file that does exist is reported.
for backup_name in main.py plot_separate.py plot_grid_search.py run.sh; do
    if [ -f "${BASE_DIR}/${backup_name}" ]; then
        cp "${BASE_DIR}/${backup_name}" "${GLOBAL_LOG_DIR}/${backup_name}" \
            || echo "Warning: could not back up ${backup_name} to ${GLOBAL_LOG_DIR}" >&2
    fi
done

# Also snapshot this script itself, whatever its file name: it holds the grid
# definition and would otherwise only be saved when it is called run.sh.
cp "${BASH_SOURCE[0]}" "${GLOBAL_LOG_DIR}/" \
    || echo "Warning: could not back up ${BASH_SOURCE[0]} to ${GLOBAL_LOG_DIR}" >&2


# ==========================================
# 2. Concurrency Control Initialization
# ==========================================
# The GPU pool is a named pipe (FIFO) held open read/write on fd 6. Each line
# in the pipe is one free GPU ID: the dispatcher reads a line to claim a GPU
# and a finished job writes its ID back to release it.

# Create the FIFO inside a private mktemp directory rather than at a
# predictable path in /tmp.
fifo_dir="$(mktemp -d)" || {
    echo "Error: mktemp -d failed; cannot create the GPU pool" >&2
    exit 1
}
tmp_fifo="${fifo_dir}/gpu_pool.fifo"

if ! mkfifo "${tmp_fifo}"; then
    echo "Error: mkfifo '${tmp_fifo}' failed" >&2
    rm -rf -- "${fifo_dir:?}"
    exit 1
fi

# Open read/write so that neither opening nor writing blocks waiting for a
# peer on the other end of the pipe.
if ! exec 6<>"${tmp_fifo}"; then
    echo "Error: cannot open '${tmp_fifo}' on fd 6" >&2
    rm -rf -- "${fifo_dir:?}"
    exit 1
fi

# The open descriptor keeps the pipe usable after its name is removed, so
# nothing is left on disk when the script ends.
rm -rf -- "${fifo_dir:?}"

# Fill the FIFO with available GPU IDs
for gpu in "${GPUS[@]}"; do
    echo "${gpu}" >&6
done


# ==========================================
# 3. Job Generation and Dispatch
# ==========================================
# Launch one background job per (seed, damping, newton_lr) combination. A job
# starts only after a GPU ID has been read from the pool on fd 6 and writes
# that ID back when it ends, so at most one job runs per GPU at any time.

job_pids=()     # PIDs of all launched jobs, used to collect exit statuses

for seed in "${SEEDS[@]}"; do
    for damp in "${DAMPING_LIST[@]}"; do
        for newton_lr in "${NEWTON_LR_LIST[@]}"; do

            # Wait for a GPU to become available (read from FIFO). If the pool
            # cannot be read, stop: continuing would start every remaining job
            # at once with an empty CUDA_VISIBLE_DEVICES.
            if ! read -r -u6 gpu_id; then
                echo "Error: could not read a GPU ID from the pool (fd 6)" >&2
                wait
                exit 1
            fi

            {
                # Define sub-task specific variables
                # Naming convention: only keep varying parameters (nlr, damp);
                # the seed distinguishes log files inside the task directory.
                TASK_NAME="nlr${newton_lr}_damp${damp}"
                LOG_DIR="${GLOBAL_LOG_DIR}/${TASK_NAME}"
                mkdir -p "${LOG_DIR}"

                LOG_FILE="${LOG_DIR}/log_s${seed}.txt"

                echo ">>> [Start] GPU:${gpu_id} | Method=${METHOD} | NLR=${newton_lr} | Damp=${damp} | Seed=${seed}"

                # Run the main python script on the claimed GPU only; stdout
                # and stderr both go to the per-seed log file.
                job_status=0
                CUDA_VISIBLE_DEVICES="${gpu_id}" python main.py \
                    --method "${METHOD}" \
                    --training_mode "${TRAINING_MODE}" \
                    --dataset "${DATASET}" \
                    --model_name "${MODEL_NAME}" \
                    --seed "${seed}" \
                    --lr "${CLIENT_LR}" \
                    --newton_lr "${newton_lr}" \
                    --damping "${damp}" \
                    --hessian_batches "${NEWTON_HESSIAN_BATCHES}" \
                    --num_clients "${G_NUM_CLIENTS}" \
                    --fraction "${G_FRACTION}" \
                    --rounds "${G_ROUNDS}" \
                    --local_epochs "${G_LOCAL_EPOCHS}" \
                    --batch_size "${G_BATCH_SIZE}" \
                    --optimizer "${OPTIMIZER}" \
                    --log_dir "${LOG_DIR}" \
                    --alpha "${ALPHA}" \
                    > "${LOG_FILE}" 2>&1 || job_status=$?

                # Report the real outcome instead of always claiming success.
                if [ "${job_status}" -eq 0 ]; then
                    echo "    [Done]  GPU:${gpu_id} | Method=${METHOD} | NLR=${newton_lr} | Damp=${damp} | Seed=${seed}"
                else
                    echo "    [Fail]  GPU:${gpu_id} | Method=${METHOD} | NLR=${newton_lr} | Damp=${damp} | Seed=${seed} | exit=${job_status} | log=${LOG_FILE}" >&2
                fi

                # Return the GPU ID to the pool, on failure too, so the
                # remaining jobs are not starved.
                echo "${gpu_id}" >&6

                # The backgrounded group runs in a subshell, so this ends only
                # the job and hands its status to the 'wait' below.
                exit "${job_status}"
            } &
            job_pids+=("$!")

        done
    done
done

# Wait for all background processes to finish, counting the ones that failed
failed_jobs=0
for job_pid in "${job_pids[@]}"; do
    wait "${job_pid}" || failed_jobs=$((failed_jobs + 1))
done
exec 6>&-

if [ "${failed_jobs}" -gt 0 ]; then
    echo "Warning: ${failed_jobs} of ${#job_pids[@]} job(s) failed; see the log files under ${GLOBAL_LOG_DIR}" >&2
fi

printf '%s\n' \
    "==================================================" \
    "       All tasks completed. Starting Plotting." \
    "=================================================="


# ==========================================
# 4. Post-Processing
# ==========================================
printf '%s\n' "Running plot_grid_search.py..."

# Use the copy of the plotting script saved in the run directory when there is
# one; otherwise fall back to plot_grid_search.py in the current directory.
plot_script="plot_grid_search.py"
if [ -f "${GLOBAL_LOG_DIR}/plot_grid_search.py" ]; then
    plot_script="${GLOBAL_LOG_DIR}/plot_grid_search.py"
fi
python "${plot_script}" --root_dir "${GLOBAL_LOG_DIR}"

printf '%s\n' "All Done. Results saved to: ${GLOBAL_LOG_DIR}"