#!/usr/bin/env bash

# ======================================================================================
# Unified Evaluation Script for RO-GRPO
#
# Description:
# This script launches the evaluation process for the experiments described in the paper
# "Balancing the Experts: Unlocking LoRA-MoE for GRPO via Mechanism-Aware Rewards".
# It supports both unimodal evaluation using OpenCompass and multimodal evaluation
# using VLMEvalKit.
#
# All paths and sensitive information have been replaced with placeholders.
# Please configure the variables in the "CONFIGURATION" section before running.
# ======================================================================================

# Strict mode:
#   -e           abort on the first command that fails,
#   -u           treat expansion of an unset variable as an error (catches typos in the
#                configuration names used further down),
#   -o pipefail  make a pipeline fail when any stage fails, not only the last one.
# NOTE: with -u, bash releases older than 4.4 report an error when an *empty* array is
# expanded as "${arr[@]}"; keep at least one entry in each list of the CONFIGURATION section.
set -euo pipefail

# --- Helper Functions ---
function usage() {
    echo "Usage: $0 [task_type]"
    echo ""
    echo "Arguments:"
    echo "  task_type:  Specifies the evaluation task to run."
    echo "              Options: 'unimodal', 'multimodal'"
    echo ""
    echo "Example:"
    echo "  # Run unimodal evaluation on all specified checkpoints and datasets"
    echo "  bash $0 unimodal"
    echo ""
    echo "  # Run multimodal evaluation"
    echo "  bash $0 multimodal"
}

# --- Argument Validation ---
# Exactly one positional argument is accepted; anything else prints the usage
# text and terminates with status 1.
[[ $# -eq 1 ]] || {
    echo "Error: Invalid number of arguments."
    usage
    exit 1
}

# The value itself is checked later by the dispatch in the execution section.
TASK_TYPE="$1"

# ======================================================================================
# --- CONFIGURATION ---
# Please modify the paths and settings below to match your environment.
# ======================================================================================

# --- General Configuration ---
NUM_GPUS=8 # Number of GPUs to use per evaluation task. This is the single source of truth.

# --- Unimodal Evaluation Configuration (OpenCompass) ---
# Judge-model settings, exported so that the evaluation child processes inherit them.
# A value that is already set (and non-empty) in the calling environment takes precedence,
# which lets the API key be supplied without writing it into this file, e.g.
#   OC_JUDGE_API_KEY=... bash <this script> unimodal
# The literals below are only used as fallbacks when the variable is unset or empty.
export OC_JUDGE_MODEL="${OC_JUDGE_MODEL:-gpt-4-turbo}"
export OC_JUDGE_API_KEY="${OC_JUDGE_API_KEY:-YOUR_API_KEY_HERE}"
export OC_JUDGE_API_BASE="${OC_JUDGE_API_BASE:-YOUR_API_URL_HERE}"

# Path to the base instruct-tuned model for unimodal tasks
UNIMODAL_BASE_MODEL_PATH='path/to/your/Qwen2.5-7B-Instruct'

# PEFT checkpoints to evaluate on the unimodal benchmarks.
# The literal entry "base" selects the base model with no adapter attached.
UNIMODAL_PEFT_PATHS=('base')
UNIMODAL_PEFT_PATHS+=(
    './output/unimodal/ro_grpo_smooth/checkpoint-final'
    './output/unimodal/ro_grpo_relative/checkpoint-final'
    './output/unimodal/lora_moe_baseline/checkpoint-final'
    './output/unimodal/lora_baseline/checkpoint-final'
)

# Unimodal benchmarks, given as OpenCompass dataset names.
UNIMODAL_DATASETS=(
    'gsm8k_0shot_gen_a58960'
    'math_0shot_gen_393424'
    'svamp_0shot_gen'
    'mgsm_gen_d967bc'
)

# --- Multimodal Evaluation Configuration (VLMEvalKit) ---
MULTIMODAL_MODEL_NAME='Qwen2.5-VL-7B-Instruct'

# PEFT checkpoints to evaluate on the multimodal benchmarks.
# The literal entry "base" selects the base model with no adapter attached.
MULTIMODAL_PEFT_PATHS=('base')
MULTIMODAL_PEFT_PATHS+=(
    './output/multimodal/ro_grpo_smooth/checkpoint-final'
    './output/multimodal/ro_grpo_relative/checkpoint-final'
    './output/multimodal/lora_moe_baseline/checkpoint-final'
    './output/multimodal/lora_baseline/checkpoint-final'
)

# Multimodal benchmarks, given as VLMEvalKit dataset names.
MULTIMODAL_DATASETS=(
    'MathVista_MINI'
    'MathVerse_MINI'
    'Geometry3k'
    'WeMath'
)

# Root directory for multimodal results; one sub-directory is used per
# dataset and per evaluated model.
MULTIMODAL_OUTPUT_DIR_BASE='./outputs/evaluation/multimodal'

# ======================================================================================
# --- DYNAMIC CONFIGURATION ---
# Automatically generate GPU device IDs based on NUM_GPUS
# ======================================================================================
#######################################
# Print the comma-separated device indices "0,1,...,N-1" for N GPUs.
# Implemented with shell built-ins only: the previous `seq ... | paste -sd,`
# pipeline called paste without a file operand, which some paste
# implementations reject.
# Arguments: $1 - number of GPUs
# Outputs:   the index list on stdout; nothing when $1 is empty, zero,
#            negative or not a decimal integer
# Returns:   0 always
#######################################
_gpu_id_list() {
    local count=$1 ids="" i
    # 10# forces base 10 so that a value such as "08" is not parsed as octal.
    if [[ ! $count =~ ^[0-9]+$ ]] || (( 10#$count == 0 )); then
        return 0
    fi
    for (( i = 0; i < 10#$count; i++ )); do
        # ${ids:+,} inserts the separator only once the list is non-empty.
        ids+="${ids:+,}${i}"
    done
    printf '%s\n' "$ids"
}

# Empty when NUM_GPUS does not describe at least one GPU.
GPU_DEVICE_IDS=$(_gpu_id_list "$NUM_GPUS")


# ======================================================================================
# --- EXECUTION LOGIC ---
# Do not modify below this line unless you know what you are doing.
# ======================================================================================

#######################################
# Log a command and then run it.
# The command is handled as an argument vector and executed directly instead
# of being assembled into a string and passed through `eval`: with the string
# form, the unquoted expansion turned every backslash-newline continuation
# into an escaped space (so the option that followed reached the program with
# a leading blank), and dataset names or paths were re-parsed by the shell.
# Arguments: the command and its arguments, one word each
# Outputs:   "Executing command:" followed by the shell-quoted command line,
#            then whatever the command itself prints
# Returns:   the exit status of the command
#######################################
_run_logged() {
    echo "Executing command:"
    printf '%q ' "$@"
    printf '\n'
    "$@"
}

#######################################
# Derive an output-directory name from a PEFT checkpoint path.
# For a path with at least three components the result is
# "<grandparent>_<parent>" of the last component, e.g.
#   ./output/multimodal/ro_grpo_smooth/checkpoint-final -> multimodal_ro_grpo_smooth
# Shorter paths fall back to the path with "/" replaced by "_".
# Arguments: $1 - checkpoint path (one trailing "/" is ignored)
# Outputs:   the derived name on stdout
#######################################
_peft_run_name() {
    local path=${1%/}
    local -a parts
    IFS=/ read -r -a parts <<< "$path"
    local n=${#parts[@]}
    if (( n >= 3 )); then
        printf '%s_%s\n' "${parts[n-3]}" "${parts[n-2]}"
    else
        printf '%s\n' "${path//\//_}"
    fi
}

# Dispatch on the requested task type. Each branch evaluates every configured
# dataset against every configured checkpoint, one run at a time; because the
# script runs with errexit, the first failing run stops the whole sweep.
case "$TASK_TYPE" in
    unimodal)
        echo "==================================================="
        echo "Starting Unimodal Evaluation with OpenCompass"
        echo "==================================================="
        echo "Using $NUM_GPUS GPUs with IDs: $GPU_DEVICE_IDS"
        echo "Judge Model: $OC_JUDGE_MODEL"
        echo "Base LLM Path: $UNIMODAL_BASE_MODEL_PATH"
        echo "---------------------------------------------------"

        for dataset in "${UNIMODAL_DATASETS[@]}"; do
            for peft_path in "${UNIMODAL_PEFT_PATHS[@]}"; do

                # `env VAR=value cmd` keeps the per-command environment
                # assignment while the command is stored as an array.
                cmd=(
                    env "CUDA_VISIBLE_DEVICES=$GPU_DEVICE_IDS"
                    opencompass
                    --datasets "$dataset"
                    --hf-path "$UNIMODAL_BASE_MODEL_PATH"
                    --batch-size 8
                    --max-out-len 4096
                    --max-num-worker "$NUM_GPUS"
                )

                if [[ "$peft_path" == "base" ]]; then
                    echo "Evaluating Dataset: $dataset | Model: Base Model"
                else
                    echo "Evaluating Dataset: $dataset | PEFT Adapter: $peft_path"
                    cmd+=(--peft-path "$peft_path")
                fi

                _run_logged "${cmd[@]}"

                echo "Evaluation finished for this configuration."
                echo "---------------------------------------------------"
            done
        done
        ;;

    multimodal)
        echo "==================================================="
        echo "Starting Multimodal Evaluation with VLMEvalKit"
        echo "==================================================="
        echo "Using $NUM_GPUS GPUs"
        echo "Base Model Name: $MULTIMODAL_MODEL_NAME"
        echo "---------------------------------------------------"

        for dataset in "${MULTIMODAL_DATASETS[@]}"; do
            for peft_path in "${MULTIMODAL_PEFT_PATHS[@]}"; do

                # run.py is resolved relative to the current directory.
                # NOTE(review): presumably the VLMEvalKit entry point, so the
                # script has to be started from that checkout - confirm.
                cmd=(
                    torchrun "--nproc_per_node=$NUM_GPUS" run.py
                    --data "$dataset"
                    --model "$MULTIMODAL_MODEL_NAME"
                )

                if [[ "$peft_path" == "base" ]]; then
                    # Base model without a PEFT adapter.
                    work_dir="${MULTIMODAL_OUTPUT_DIR_BASE}/${dataset}/base"
                    echo "Evaluating Dataset: $dataset | Model: Base Model"
                else
                    # Adapter run: results go to a directory named after the
                    # checkpoint's parent directories.
                    peft_name=$(_peft_run_name "$peft_path")
                    work_dir="${MULTIMODAL_OUTPUT_DIR_BASE}/${dataset}/${peft_name}"
                    echo "Evaluating Dataset: $dataset | PEFT Adapter: $peft_path"
                    cmd+=(--peft_path "$peft_path")
                fi

                cmd+=(--work-dir "$work_dir" --verbose)

                echo "Work Directory: $work_dir"
                _run_logged "${cmd[@]}"

                echo "Evaluation finished for this configuration."
                echo "---------------------------------------------------"
            done
        done
        ;;

    *)
        echo "Error: Invalid task_type '$TASK_TYPE'."
        usage
        exit 1
        ;;
esac

echo "==================================================="
echo "All '$TASK_TYPE' evaluations finished."
echo "==================================================="