#!/bin/bash

# =================================================================
# Script Name: exp_comparison.sh
# Description: Orchestrates Federated Learning experiments across 
#              multiple datasets and methods using GPU concurrency.
# =================================================================

# =================================================================
# 1. Experimental Environment Configuration
# =================================================================

# --- Hardware Resources ---
# Every entry of GPUS becomes one token in the GPU pool, i.e. one concurrent
# job slot. Repeating an id, e.g. (0 0 0), stacks several jobs on one device.
# ⚠️  WARNING: stacking jobs like (0 0 0) with BatchSize 64 carries a high
#    risk of GPU OOM (Out of Memory). Size the list to the available VRAM.
GPUS=( 0 1 2 )

# --- Basic FL Settings ---
# Number of clients (--num_clients).
G_NUM_CLIENTS="10"
# Fraction of clients selected per round (--fraction).
G_FRACTION="0.5"
# Number of rounds (--rounds).
G_ROUNDS="50"
# Local epochs (--local_epochs).
G_LOCAL_EPOCHS="1"
# Batch size (--batch_size).
G_BATCH_SIZE="64"

# --- Model & Task Settings ---
# Training mode: 'head' (classifier only) or 'full'.
TRAINING_MODE='head'
MODEL_NAME='resnet18'
OPTIMIZER='sgd'
# Dirichlet Alpha (Data Heterogeneity): 0.1 (High), 0.5 (Mid), 10 (Low).
ALPHA="0.1"

# --- Experiment Scope ---
# Full sweep, kept for reference:
# DATASETS=("mnist" "fashionmnist" "cifar10")
# METHODS_LIST=("FedAvg" "FedProx" "FedSophia" "FedNew" "FedDANE" "FedNewton")
DATASETS=( 'cifar10' )
METHODS_LIST=( 'FedNewton' )
SEEDS=( 0 1 2 )

# --- Static Hyperparameters ---
# [FedSophia / FedNew] passed as --sophia_lr, --rho and --betas.
SOPHIA_LR="0.05"
SOPHIA_RHO="0.04"
BETAS='0.9,0.99'

# [FedDANE] passed as --mu and --newton_lr.
DANE_MU="0.01"
DANE_LR="0.1"

# [FedNewton] passed as --hessian_batches, --newton_lr and --damping.
NEWTON_HESSIAN_BATCHES="64"
NEWTON_LR="0.001"
NEWTON_DAMPING="0.00001"

# [Common / FedProx] FIXED_LR is passed as --lr; FIXED_MU as --mu (FedProx).
FIXED_LR="0.01"
FIXED_MU="0.01"

# =================================================================
# 2. Directory Initialization & Concurrency Control
# =================================================================

# Per-run output root, named after the start time, model and training mode.
# NOTE: RUN_ROOT is relative to the current working directory, whereas
# BASE_DIR is the absolute directory that contains this script.
TIMESTAMP=$(date +"%Y%m%d_%H%M%S")
RUN_ROOT="runs/run_${TIMESTAMP}_${MODEL_NAME}_${TRAINING_MODE}"
BASE_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"

# Nothing below can work without the run directory, so fail loudly here.
if ! mkdir -p "${RUN_ROOT}"; then
    echo "ERROR: cannot create run directory '${RUN_ROOT}'" >&2
    exit 1
fi

# Snapshot the code used for this run for reproducibility. Besides the fixed
# file names, this script itself is archived under its real file name, so the
# snapshot is complete even when the script is not called run.sh.
# Files that do not exist are skipped silently (best effort); a copy that
# fails for an existing file is reported but does not abort the run.
self_name="${BASH_SOURCE[0]##*/}"
for backup_name in "main.py" "analysis.py" "run.sh" "${self_name}"; do
    [[ -n "${backup_name}" && -f "${BASE_DIR}/${backup_name}" ]] || continue
    cp -- "${BASE_DIR}/${backup_name}" "${RUN_ROOT}/${backup_name}" \
        || echo "WARNING: could not back up ${backup_name} to ${RUN_ROOT}" >&2
done

# --- Initialize FIFO pipe for GPU pooling ---
# File descriptor 6 is used as a token pool: each line in the pipe is the id
# of a free GPU slot. A job takes a token by reading a line from fd 6 and
# gives it back by writing the id to fd 6 again.
#
# The FIFO is created inside a private mktemp directory instead of a
# predictable /tmp/<pid>.fifo path, so it cannot clash with, or be
# pre-created by, another user or process.
fifo_dir=$(mktemp -d) || {
    echo "ERROR: cannot create a temporary directory for the GPU FIFO" >&2
    exit 1
}
tmp_fifo="${fifo_dir}/gpu_pool.fifo"
if ! mkfifo "${tmp_fifo}"; then
    echo "ERROR: cannot create FIFO '${tmp_fifo}'" >&2
    rm -rf -- "${fifo_dir:?}"
    exit 1
fi

# Open read/write so that neither opening nor writing blocks while no job is
# reading yet; the path can be removed once the descriptor is open.
exec 6<>"${tmp_fifo}" || {
    echo "ERROR: cannot open FIFO '${tmp_fifo}' on fd 6" >&2
    rm -rf -- "${fifo_dir:?}"
    exit 1
}
rm -rf -- "${fifo_dir:?}"

# Populate the FIFO with one token per entry of GPUS
for gpu in "${GPUS[@]}"; do
    printf '%s\n' "${gpu}" >&6
done

# Signal handler: announce the shutdown, terminate every process in this
# script's process group (background jobs included), then exit.
# Arguments: $1 - exit status to report (defaults to 130)
terminate_all_jobs() {
    # `kill 0` below also signals this shell itself. Ignore INT/TERM first so
    # the handler is not re-entered and the explicit exit status is kept.
    trap '' SIGINT SIGTERM
    echo ">>> Terminating... Killing all jobs."
    kill 0
    exit "${1:-130}"
}

# Trap interrupt signals to kill background processes on exit.
# Exit statuses follow the 128+signal convention (INT=2, TERM=15) so callers
# can tell an interrupted run from a completed one.
trap 'terminate_all_jobs 130' SIGINT
trap 'terminate_all_jobs 143' SIGTERM

echo "=================================================="
echo "       Start Training Phase"
echo "=================================================="

# =================================================================
# 3. Main Training Loop
# =================================================================

# Fills the global array METHOD_ARGS with the method-specific CLI flags.
# Arguments: $1 - method name (FedAvg, FedProx, FedSophia, FedNew, FedDANE,
#                 FedNewton)
# Globals:   reads FIXED_LR, FIXED_MU, SOPHIA_LR, SOPHIA_RHO, BETAS, DANE_LR,
#            DANE_MU, NEWTON_LR, NEWTON_DAMPING, NEWTON_HESSIAN_BATCHES;
#            writes METHOD_ARGS
# Returns:   0 on success; 1 (METHOD_ARGS left empty) for an unknown method
set_method_args() {
    METHOD_ARGS=()
    case "$1" in
        "FedAvg")
            METHOD_ARGS=(--lr "$FIXED_LR")
            ;;
        "FedProx")
            METHOD_ARGS=(--lr "$FIXED_LR" --mu "$FIXED_MU")
            ;;
        "FedSophia"|"FedNew")
            METHOD_ARGS=(--lr "$FIXED_LR" --sophia_lr "$SOPHIA_LR" --rho "$SOPHIA_RHO" --betas "$BETAS")
            ;;
        "FedDANE")
            METHOD_ARGS=(--lr "$FIXED_LR" --newton_lr "$DANE_LR" --mu "$DANE_MU")
            ;;
        "FedNewton")
            METHOD_ARGS=(--newton_lr "$NEWTON_LR" --damping "$NEWTON_DAMPING" --hessian_batches "$NEWTON_HESSIAN_BATCHES")
            ;;
        *)
            return 1
            ;;
    esac
}

# PIDs of all background training jobs, so each exit status can be collected.
JOB_PIDS=()

for DATASET in "${DATASETS[@]}"; do
    echo "##################################################"
    echo "Queuing Dataset: ${DATASET}"
    echo "##################################################"

    # Optional: Pre-flight check/download (commented out)
    # python main.py --method FedAvg --dataset "${DATASET}" --model_name "${MODEL_NAME}" --rounds 0 --num_clients 2 > /dev/null 2>&1

    LOG_DIR="${RUN_ROOT}/${DATASET}/logs"
    FIG_DIR="${RUN_ROOT}/${DATASET}/figs"
    mkdir -p "${LOG_DIR}" "${FIG_DIR}"

    for METHOD in "${METHODS_LIST[@]}"; do
        # Method-specific arguments do not depend on the seed: resolve them
        # once per method and skip unknown methods entirely.
        if ! set_method_args "$METHOD"; then
            echo "Skipping unknown method: $METHOD"
            continue
        fi

        for seed in "${SEEDS[@]}"; do

            # 1. Acquire GPU Token (blocks until a running job returns one)
            if ! read -r -u 6 gpu_id; then
                echo "ERROR: could not read a GPU token from fd 6" >&2
                exit 1
            fi

            echo "    -> [GPU $gpu_id] Submitted: $METHOD (Seed $seed, Data $DATASET)"

            # 2. Execute task in background (the group runs in a subshell, so
            #    it works on a snapshot of the loop variables).
            {
                # Built as an array so every value stays one argument even if
                # it contains whitespace. main.py is resolved next to this
                # script, like analysis.py below, not from the current dir.
                job_cmd=(
                    python -u "${BASE_DIR:-.}/main.py"
                    --method "$METHOD"
                    --training_mode "$TRAINING_MODE"
                    --dataset "$DATASET"
                    --model_name "$MODEL_NAME"
                    --seed "$seed"
                    --num_clients "$G_NUM_CLIENTS"
                    --fraction "$G_FRACTION"
                    --rounds "$G_ROUNDS"
                    --local_epochs "$G_LOCAL_EPOCHS"
                    --batch_size "$G_BATCH_SIZE"
                    --optimizer "$OPTIMIZER"
                    --log_dir "$LOG_DIR"
                    --alpha "$ALPHA"
                    "${METHOD_ARGS[@]}"
                )

                # Execute command, capturing stdout and stderr in the job log
                job_log="${LOG_DIR}/log_${METHOD}_s${seed}.txt"
                job_status=0
                CUDA_VISIBLE_DEVICES="$gpu_id" "${job_cmd[@]}" > "${job_log}" 2>&1 || job_status=$?

                if (( job_status == 0 )); then
                    echo "    <- [GPU $gpu_id] Finished:  $METHOD (Seed $seed, Data $DATASET)"
                else
                    echo "    <- [GPU $gpu_id] FAILED (exit ${job_status}): $METHOD (Seed $seed, Data $DATASET) - see ${job_log}" >&2
                fi

                # 3. Return GPU Token (also after a failure, so the queue
                #    keeps moving)
                echo "$gpu_id" >&6

                # Report the job result to the `wait` in the parent shell
                exit "${job_status}"
            } &
            JOB_PIDS+=("$!")

        done
    done
done

echo ">>> All tasks submitted. Waiting for completion..."
# Wait for every background job individually to collect its exit status
FAILED_JOBS=0
for job_pid in "${JOB_PIDS[@]}"; do
    wait "${job_pid}" || FAILED_JOBS=$((FAILED_JOBS + 1))
done
# Barrier for any remaining background processes
wait

# Close file descriptor
exec 6>&-

if (( FAILED_JOBS > 0 )); then
    echo ">>> WARNING: ${FAILED_JOBS} training job(s) failed; see the logs under ${RUN_ROOT}." >&2
fi

# =================================================================
# 4. Post-Processing & Analysis Phase
# =================================================================

echo "=================================================="
echo "       Start Analysis"
echo "=================================================="

# Check which analysis script to use (backup vs source)
if [[ -f "${RUN_ROOT}/analysis.py" ]]; then
    ANALYSIS_SCRIPT="${RUN_ROOT}/analysis.py"
else
    ANALYSIS_SCRIPT="${BASE_DIR}/analysis.py"
fi

FAILED_ANALYSES=0
for DATASET in "${DATASETS[@]}"; do
    echo "Generating Analysis for ${DATASET}..."

    # Paths are recomputed per dataset: LOG_DIR/FIG_DIR still hold the values
    # of the last dataset processed by the training loop.
    CURRENT_LOG_DIR="${RUN_ROOT}/${DATASET}/logs"
    CURRENT_FIG_DIR="${RUN_ROOT}/${DATASET}/figs"

    if ! python "${ANALYSIS_SCRIPT}" --log_dir "${CURRENT_LOG_DIR}" --fig_dir "${CURRENT_FIG_DIR}"; then
        echo ">>> WARNING: analysis failed for ${DATASET}" >&2
        FAILED_ANALYSES=$((FAILED_ANALYSES + 1))
    fi
done

# Only claim success when every training job and every analysis succeeded
if (( FAILED_JOBS > 0 || FAILED_ANALYSES > 0 )); then
    echo "Experiments finished with errors: ${FAILED_JOBS} training job(s) and ${FAILED_ANALYSES} analysis run(s) failed." >&2
    exit 1
fi

echo "All experiments completed successfully."
