#!/bin/bash

# =================================================================
# Script Name: exp_backbones.sh
# Description: Orchestrates a Federated Learning experiment comparing 
#              multiple backbones (ViTs, ResNets) using varying methods.
#              Uses FIFO for GPU concurrency control.
# =================================================================

# =================================================================
# 1. Configuration
# =================================================================

# --- Hardware Resources ---
# GPU IDs that are written into the FIFO token pool below; each running job
# holds exactly one of these IDs via CUDA_VISIBLE_DEVICES.
GPUS=(0 1 2)

# --- Basic Environment ---
# Forwarded to main.py as --num_clients / --fraction / --rounds / --local_epochs.
G_NUM_CLIENTS="10"
G_FRACTION="0.5"          # Fraction of clients selected per round
G_ROUNDS="50"
G_LOCAL_EPOCHS="1"

# --- Task Settings ---
DATASET="cifar10"         # Fixed dataset; also becomes part of the run directory name
TRAINING_MODE="head"      # 'head' (fine-tune classifier only) or 'full'
OPTIMIZER="sgd"
ALPHA="0.5"               # Dirichlet alpha, forwarded as --alpha

# --- Model Backbones to Compare ---
# Full comparison set (uncomment the ones you want to add to MODELS_LIST):
#     "resnet18"
#     "tv_resnet50"
#     "vit_base_patch16_clip_224"
#     "vit_base_patch14_dinov2.lvd142m"
MODELS_LIST=("vit_base_patch14_dinov2.lvd142m")

# --- FL Methods to Compare ---
# Methods the scheduler knows how to configure: "FedAvg" "FedProx" "FedNewton"
METHODS_LIST=("FedNewton")

# One job is launched per (model, method, seed) combination.
SEEDS=(0 1 2)

# --- Static Hyperparameters ---

# [FedNewton Parameters]
NEWTON_HESSIAN_BATCHES="64"
# NOTE(review): the previous guidance here recommended 1e-3 or 1e-4 and warned
# that smaller values may diverge; the configured value is 1e-6. Confirm that
# this is intentional.
NEWTON_DAMPING="0.000001"
NEWTON_LR="0.001"         # Server-side learning rate (used with damping)
# NEWTON_MAX_NORM=1.0     # Optional clipping safety valve; currently disabled and not forwarded

# [FedAvg / Common Parameters]
FIXED_LR="0.01"

# [FedProx Parameters]
FEDPROX_MU="0.01"         # Regularization term (mu)

# =================================================================
# 2. Initialization & Concurrency Control (FIFO)
# =================================================================

# Per-run output root. It is relative to the current working directory and is
# made unique by a second-resolution timestamp plus the dataset name.
TIMESTAMP=$(date +"%Y%m%d_%H%M%S")
RUN_ROOT="runs/multimodel_${TIMESTAMP}_${DATASET}"
# Absolute directory containing this script; used to locate sibling sources.
BASE_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"

# Every later step writes below RUN_ROOT, so fail fast if it cannot be created.
mkdir -p "${RUN_ROOT}" || {
    echo "ERROR: cannot create run directory: ${RUN_ROOT}" >&2
    exit 1
}

# Snapshot the sources into the run directory for reproducibility.
# The copy is deliberately best-effort: a missing file must not abort the run.
# The script's own file is included under its real name, so the orchestrator
# is captured even when it is not called run.sh.
SELF_NAME="$(basename "${BASH_SOURCE[0]}")"
for snapshot_file in main.py analysis.py run.sh "${SELF_NAME}"; do
    cp "${BASE_DIR}/${snapshot_file}" "${RUN_ROOT}/${snapshot_file}" 2>/dev/null || true
done

# --- Initialize FIFO pipe for GPU pooling ---
# The FIFO acts as a counting semaphore: it holds one line per free GPU.
# A job reads a line to claim a GPU and writes it back when it finishes.
#
# The FIFO is created inside a private mktemp directory instead of a
# predictable /tmp/<pid>.fifo path, so another user cannot pre-create or
# hijack the name.
fifo_dir=$(mktemp -d "${TMPDIR:-/tmp}/gpu_pool.XXXXXX") || {
    echo "ERROR: cannot create temporary directory for the GPU FIFO" >&2
    exit 1
}
tmp_fifo="${fifo_dir}/tokens.fifo"
mkfifo "$tmp_fifo" || {
    echo "ERROR: cannot create FIFO: ${tmp_fifo}" >&2
    rm -rf -- "$fifo_dir"
    exit 1
}

# Open the FIFO read-write on fd 6 so neither reads nor writes block on a
# missing peer, then unlink it: the open descriptor keeps the pipe alive and
# nothing is left behind on disk.
exec 6<>"$tmp_fifo"
rm -rf -- "$fifo_dir"

# Populate the FIFO with GPU tokens
for gpu in "${GPUS[@]}"; do
    echo "$gpu" >&6
done

# Ensure background processes are killed if script is interrupted.
#
# terminate_all EXIT_CODE
#   Signal handler for SIGINT/SIGTERM. `kill 0` sends SIGTERM to the whole
#   process group, which includes this script. Both signals are therefore set
#   to "ignore" first, so the handler cannot be re-entered by its own
#   broadcast. The script then exits with the conventional 128+signal status
#   instead of the exit status of `kill`.
terminate_all() {
    trap '' SIGINT SIGTERM
    echo ">>> Terminating... Killing all background jobs."
    kill 0
    exit "${1:-1}"
}
trap 'terminate_all 130' SIGINT
trap 'terminate_all 143' SIGTERM

# Print the start banner and a one-glance summary of what is about to run.
banner_rule="=================================================="
printf '%s\n' \
    "$banner_rule" \
    "       Start Multi-Backbone Comparison" \
    "$banner_rule" \
    "Dataset: $DATASET | Alpha: $ALPHA" \
    "Models:  ${MODELS_LIST[*]}" \
    "Methods: ${METHODS_LIST[*]}" \
    "--------------------------------------------------"

# =================================================================
# 3. Main Loop (Model -> Method -> Seed)
# =================================================================

# -----------------------------------------------------------------
# select_batch_size MODEL_NAME
#   Sets the global CURRENT_BATCH_SIZE for the given backbone and prints the
#   decision. Names containing "vit", "efficientnet" or "convnext" get 32 to
#   reduce the risk of OOM; everything else gets 64.
# -----------------------------------------------------------------
select_batch_size() {
    case "$1" in
        *"vit"*|*"efficientnet"*|*"convnext"*)
            CURRENT_BATCH_SIZE=32
            echo "    -> [Config] Memory-intensive model detected. Using BatchSize=32"
            ;;
        *)
            CURRENT_BATCH_SIZE=64
            echo "    -> [Config] Standard model. Using BatchSize=64"
            ;;
    esac
}

# -----------------------------------------------------------------
# build_method_args METHOD
#   Fills the global array EXTRA_ARGS with the method-specific main.py flags.
#   Returns 1 (and leaves EXTRA_ARGS empty) for an unknown method.
# -----------------------------------------------------------------
build_method_args() {
    case "$1" in
        "FedAvg")
            EXTRA_ARGS=(--lr "$FIXED_LR")
            ;;
        "FedProx")
            EXTRA_ARGS=(--lr "$FIXED_LR" --mu "$FEDPROX_MU")
            ;;
        "FedNewton")
            EXTRA_ARGS=(--newton_lr "$NEWTON_LR" --damping "$NEWTON_DAMPING" --hessian_batches "$NEWTON_HESSIAN_BATCHES")
            ;;
        *)
            EXTRA_ARGS=()
            return 1
            ;;
    esac
}

# PIDs of all submitted background jobs, so the barrier below can collect
# their individual exit statuses.
JOB_PIDS=()

# [Optimization] Iterate through Models first to organize results by Backbone
for MODEL_NAME in "${MODELS_LIST[@]}"; do
    echo "##################################################"
    echo "Scheduling Model: ${MODEL_NAME}"
    echo "##################################################"

    # --- Dynamic Batch Size Adjustment (Prevent OOM) ---
    select_batch_size "$MODEL_NAME"

    # Create specific logging directories for this model
    LOG_DIR="${RUN_ROOT}/${MODEL_NAME}/logs"
    FIG_DIR="${RUN_ROOT}/${MODEL_NAME}/figs"
    mkdir -p "${LOG_DIR}" "${FIG_DIR}"

    for METHOD in "${METHODS_LIST[@]}"; do
        # Method arguments do not depend on the seed, so resolve them once
        # per method and skip unknown methods before any GPU is claimed.
        if ! build_method_args "$METHOD"; then
            echo "Skipping unknown method: $METHOD"
            continue
        fi

        for seed in "${SEEDS[@]}"; do

            # 1. Acquire GPU Token (blocks until some job returns one).
            #    A failed read means the token pool is unusable; stop
            #    scheduling instead of launching jobs without a GPU.
            if ! read -r -u6 gpu_id; then
                echo "ERROR: cannot read a GPU token from fd 6; no further jobs scheduled." >&2
                break 3
            fi

            echo "    -> [GPU $gpu_id] SUBMIT: $MODEL_NAME | $METHOD | Seed $seed"

            # 2. Execute in Background
            {
                # The command is an argv array, so every value reaches
                # main.py as exactly one argument and no `eval` is needed.
                job_cmd=(
                    python -u main.py
                    --method "$METHOD"
                    --training_mode "$TRAINING_MODE"
                    --dataset "$DATASET"
                    --model_name "$MODEL_NAME"
                    --seed "$seed"
                    --num_clients "$G_NUM_CLIENTS"
                    --fraction "$G_FRACTION"
                    --rounds "$G_ROUNDS"
                    --local_epochs "$G_LOCAL_EPOCHS"
                    --batch_size "$CURRENT_BATCH_SIZE"
                    --optimizer "$OPTIMIZER"
                    --log_dir "$LOG_DIR"
                    --alpha "$ALPHA"
                    "${EXTRA_ARGS[@]}"
                )

                job_status=0
                CUDA_VISIBLE_DEVICES="$gpu_id" "${job_cmd[@]}" \
                    > "${LOG_DIR}/log_${METHOD}_s${seed}.txt" 2>&1 || job_status=$?

                # Check exit status
                if [ "$job_status" -ne 0 ]; then
                    echo "    !! [GPU $gpu_id] FAILED: $MODEL_NAME | $METHOD | Seed $seed"
                else
                    echo "    <- [GPU $gpu_id] DONE:   $MODEL_NAME | $METHOD | Seed $seed"
                fi

                # 3. Return GPU Token (always, even on failure, so the pool
                #    never shrinks), then report the job's status to `wait`.
                echo "$gpu_id" >&6
                exit "$job_status"
            } &
            JOB_PIDS+=("$!")

        done
    done
done

echo ">>> All tasks submitted. Waiting for completion..."
# Wait for each job individually so failures can be counted; the final bare
# `wait` is a barrier for any child not tracked in JOB_PIDS.
failed_jobs=0
for job_pid in "${JOB_PIDS[@]}"; do
    wait "$job_pid" || failed_jobs=$((failed_jobs + 1))
done
wait
exec 6>&-

if [ "$failed_jobs" -gt 0 ]; then
    echo ">>> WARNING: ${failed_jobs} job(s) exited with a non-zero status; check the logs under ${RUN_ROOT}." >&2
fi

# =================================================================
# 4. Results Analysis
# =================================================================

echo "=================================================="
echo "       Start Post-Processing & Analysis"
echo "=================================================="

# -----------------------------------------------------------------
# has_txt_logs DIR
#   Returns 0 if DIR contains at least one *.txt entry, 1 otherwise
#   (including when DIR does not exist). Uses a glob instead of parsing
#   `ls` output.
# -----------------------------------------------------------------
has_txt_logs() {
    local candidate
    for candidate in "$1"/*.txt; do
        # An unmatched glob stays literal; -e filters that case out.
        [[ -e "$candidate" ]] && return 0
    done
    return 1
}

# Use the backed-up analysis script if available, otherwise use the source.
# Resolved once because the choice does not depend on the model.
if [[ -f "${RUN_ROOT}/analysis.py" ]]; then
    ANALYSIS_SCRIPT="${RUN_ROOT}/analysis.py"
else
    ANALYSIS_SCRIPT="${BASE_DIR}/analysis.py"
fi

analysis_failures=0
for MODEL_NAME in "${MODELS_LIST[@]}"; do
    echo "Generating Analysis for ${MODEL_NAME}..."

    CURRENT_LOG_DIR="${RUN_ROOT}/${MODEL_NAME}/logs"
    CURRENT_FIG_DIR="${RUN_ROOT}/${MODEL_NAME}/figs"

    # Check if logs exist before running analysis
    if has_txt_logs "$CURRENT_LOG_DIR"; then
        if ! python "${ANALYSIS_SCRIPT}" --log_dir "${CURRENT_LOG_DIR}" --fig_dir "${CURRENT_FIG_DIR}"; then
            echo "    [Warning] Analysis failed for ${MODEL_NAME}." >&2
            analysis_failures=$((analysis_failures + 1))
        fi
    else
        echo "    [Warning] No logs found for ${MODEL_NAME}, skipping analysis."
    fi
done

# Only claim success when every analysis run actually succeeded.
if [ "$analysis_failures" -eq 0 ]; then
    echo "All experiments completed successfully."
else
    echo "Experiments finished, but analysis failed for ${analysis_failures} model(s)." >&2
fi
