#!/bin/bash

# =================================================================
# Script Name: exp_solvers.sh
# Description: Compares different linear system solvers for the 
#              FedNewton algorithm (Second-Order Federated Learning).
# =================================================================

# =================================================================
# 1. Environment Configuration
# =================================================================

# --- Hardware Resources ---
# One entry per GPU id that this script may hand out to a job.
GPUS=(
    0
    1
    2
)

# --- Basic Settings ---
# Forwarded to main.py as --num_clients, --fraction, --rounds,
# --local_epochs and --batch_size respectively.
G_BATCH_SIZE=64
G_LOCAL_EPOCHS=1
G_ROUNDS=50
G_FRACTION=0.5
G_NUM_CLIENTS=10

# --- Target Configuration (User Defined) ---
# TRAINING_MODE: 'head' is the recommended value. Full training of models
# like ResNet18 with the 'exact' Newton solver often causes OOM (Out of Memory).
TRAINING_MODE="head"
MODEL_NAME="tv_resnet50"
DATASET="cifar100"
OPTIMIZER="sgd"
# Dirichlet alpha (data heterogeneity).
ALPHA=0.5

# --- Comparison Variable: Solvers ---
# [Key Configuration] Solvers to compare, launched in the order listed:
#   cg:      Conjugate Gradient (Hessian-Free optimization)
#   exact:   Exact Matrix Inversion (Baseline, computationally expensive)
#   diag:    Diagonal Approximation (Fast, low memory)
#   neumann: Neumann Series Expansion (Iterative approximation)
#   lbfgs:   Limited-memory BFGS (Quasi-Newton)
#   lowrank: Low-Rank Approximation (Top-k Eigenvalues)
SOLVERS=(
    "cg"
    "exact"
    "diag"
    "neumann"
    "lbfgs"
    "lowrank"
)

# Every solver is run once per seed.
SEEDS=(0 1 2)

# --- Hyperparameters ---
# Base learning rate (for the gradient calculation step).
FIXED_LR=0.01
# Newton step learning rate (Newton methods typically allow for larger values).
NEWTON_LR=0.01
# Damping coefficient (prevents singular matrices).
NEWTON_DAMPING=0.00001
# Number of batches used to estimate the Hessian.
NEWTON_HESSIAN_BATCHES=64

# [Solver Specific Parameters]
# Max iterations for the CG solver.
NEWTON_CG_MAX_ITER=10
# History size for L-BFGS updates.
LBFGS_M=5


# =================================================================
# 2. Directory Initialization
# =================================================================

TIMESTAMP=$(date +"%Y%m%d_%H%M%S")
# Dedicated output directory for this comparison run. The path is relative to
# the current working directory and is tagged with launch time, model name and
# training mode.
RUN_ROOT="runs/compare_solvers_${TIMESTAMP}_${MODEL_NAME}_${TRAINING_MODE}"
# Absolute path of the directory containing this script.
BASE_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
# File name of this launcher script, whatever it is called on disk.
SELF_NAME="$(basename "${BASH_SOURCE[0]}")"

# All logs are written below RUN_ROOT, so there is no point in continuing
# if it cannot be created.
if ! mkdir -p "${RUN_ROOT}"; then
    echo ">>> ERROR: cannot create run directory '${RUN_ROOT}'." >&2
    exit 1
fi

# Backup source code for reproducibility. This launcher is included as well,
# since it is where every hyperparameter of the run is defined.
# The copy is deliberately best-effort: a missing file must not abort the run,
# hence the silenced error and the '|| true'.
for snapshot_file in main.py analysis.py run.sh "${SELF_NAME}"; do
    cp "${BASE_DIR}/${snapshot_file}" "${RUN_ROOT}/${snapshot_file}" 2>/dev/null || true
done

# --- Initialize FIFO Pipe (Concurrency Control) ---
# File descriptor 6 is used as a pool of GPU tokens: a job reads one token
# before it starts and writes it back when it is done, so the number of
# tokens bounds the number of jobs running at the same time.
#
# The FIFO is created inside a private mktemp directory rather than under a
# predictable name in /tmp.
fifo_dir=$(mktemp -d) || {
    echo ">>> ERROR: mktemp failed; cannot create the GPU token FIFO." >&2
    exit 1
}
tmp_fifo="${fifo_dir}/gpu_tokens.fifo"
if ! mkfifo "${tmp_fifo}"; then
    echo ">>> ERROR: mkfifo '${tmp_fifo}' failed." >&2
    rm -rf -- "${fifo_dir:?}"
    exit 1
fi
# Open read-write so that neither the open nor later reads/writes block on a
# missing peer. The path can be removed as soon as the descriptor is open.
exec 6<>"${tmp_fifo}"
rm -rf -- "${fifo_dir:?}"

# Populate FIFO with GPU tokens
for gpu in "${GPUS[@]}"; do
    echo "$gpu" >&6
done

# Trap interrupt signals to ensure clean exit.
# 'kill 0' signals the whole process group, which includes this shell, so the
# handler first ignores further SIGINT/SIGTERM to avoid re-entering itself.
# It exits non-zero so that an interrupted run is not reported as a success.
on_interrupt() {
    trap '' SIGINT SIGTERM
    echo ">>> Terminating... Killing all background jobs."
    kill 0
    exit 1
}
trap on_interrupt SIGINT SIGTERM

# Start-up banner: lists the solvers being compared and the dataset/model.
printf '%s\n' \
    "==================================================" \
    "   Start Comparison: FedNewton Solvers" \
    "   Methods: ${SOLVERS[*]}" \
    "   Dataset: ${DATASET} | Model: ${MODEL_NAME}" \
    "=================================================="

# =================================================================
# 3. Main Training Loop
# =================================================================

# Heading for the dataset processed by the loop that follows.
printf '%s\n' \
    "##################################################" \
    "Processing Dataset: ${DATASET}" \
    "##################################################"

# Jobs are throttled by GPU tokens read from fd 6. With no GPUs configured
# there are no tokens and the first 'read' below would block forever.
if (( ${#GPUS[@]} == 0 )); then
    echo ">>> ERROR: GPUS is empty; no GPU tokens are available to schedule jobs." >&2
    exit 1
fi

# Iterate over all selected solvers
for SOLVER in "${SOLVERS[@]}"; do

    # Create a specific log subdirectory for each solver to keep results organized
    LOG_DIR="${RUN_ROOT}/${DATASET}/logs/${SOLVER}"
    mkdir -p "${LOG_DIR}"

    for seed in "${SEEDS[@]}"; do

        # 1. Acquire GPU Token (blocks until a running job returns one)
        if ! read -r -u6 gpu_id; then
            echo ">>> ERROR: failed to acquire a GPU token from fd 6." >&2
            exit 1
        fi

        echo "    -> [GPU $gpu_id] Submitted: Solver=${SOLVER} (Seed $seed)"

        # 2. Execute Task in Background
        {
            # The command is an argv array, not a string, so each value is
            # passed to main.py as exactly one argument even if it contains
            # whitespace or glob characters.
            CMD=(
                python -u main.py
                --method FedNewton
                --training_mode "$TRAINING_MODE"
                --dataset "$DATASET"
                --model_name "$MODEL_NAME"
                --seed "$seed"
                --num_clients "$G_NUM_CLIENTS"
                --fraction "$G_FRACTION"
                --rounds "$G_ROUNDS"
                --local_epochs "$G_LOCAL_EPOCHS"
                --batch_size "$G_BATCH_SIZE"
                --optimizer "$OPTIMIZER"
                --log_dir "$LOG_DIR"
                --alpha "$ALPHA"
                --lr "$FIXED_LR"
                --newton_lr "$NEWTON_LR"
                --damping "$NEWTON_DAMPING"
                --hessian_batches "$NEWTON_HESSIAN_BATCHES"
                --newton_solver "$SOLVER"
                --newton_cg_max_iter "$NEWTON_CG_MAX_ITER"
                --lbfgs_m "$LBFGS_M"
            )
            LOG_FILE="${LOG_DIR}/log_FedNewton_${SOLVER}_s${seed}.txt"

            # Redirect output to the solver-specific directory. The exit
            # status is captured so a crashed run is not reported as finished.
            job_status=0
            CUDA_VISIBLE_DEVICES="$gpu_id" "${CMD[@]}" > "${LOG_FILE}" 2>&1 || job_status=$?

            if (( job_status == 0 )); then
                echo "    <- [GPU $gpu_id] Finished:  Solver=${SOLVER} (Seed $seed)"
            else
                echo "    <- [GPU $gpu_id] FAILED (exit ${job_status}): Solver=${SOLVER} (Seed $seed), see ${LOG_FILE}" >&2
            fi

            # 3. Return GPU Token (always, so a failed job does not shrink the pool)
            echo "$gpu_id" >&6
        } &

    done
done

printf '%s\n' ">>> All tasks submitted. Waiting for completion..."
# Block until every background job started by this shell has exited.
wait

# Close file descriptor 6
exec 6>&-

# Closing summary: where the results are stored and how to analyse them.
printf '%s\n' \
    "==================================================" \
    "       Comparison Experiments Completed" \
    "==================================================" \
    "Results are saved in structure: ${RUN_ROOT}/${DATASET}/logs/<SOLVER_NAME>/" \
    "" \
    "NOTE ON ANALYSIS:" \
    "1. Each solver has its own sub-folder to prevent file name collisions." \
    "2. When running analysis.py, you may need to iterate through these sub-folders" \
    "   manually or point the script to specific paths."
