#!/bin/bash
#
# 分阶段测试脚本 - 使用 test_trained_model.py 测试训练好的模型
# 用法: 
#   ./test_trained_model.sh                    # 完整测试（三个阶段）
#   ./test_trained_model.sh --stage 1          # 只运行阶段1（生成）
#   ./test_trained_model.sh --stage 2          # 只运行阶段2（评估unsafe）
#   ./test_trained_model.sh --stage 3          # 只运行阶段3（评估similarity）
#   ./test_trained_model.sh --stage 23         # 运行阶段2和3（跳过生成）
#   ./test_trained_model.sh --stage 12         # 运行阶段1和2
#   ./test_trained_model.sh --stage 13         # 运行阶段1和3
#   ./test_trained_model.sh --quick            # 快速测试（10个样本）
#   ./test_trained_model.sh --no-adapter       # 测试基础模型
#   ./test_trained_model.sh --method dpo       # 测试 DPO 模型
#   ./test_trained_model.sh --method sft --adapter-type prompt-tuning  # 测试 Prompt-Tuning SFT
#   ./test_trained_model.sh --method sft --adapter-type prefix-tuning  # 测试 Prefix-Tuning SFT
#   ./test_trained_model.sh --method sft --adapter-type p-tuning       # 测试 P-Tuning SFT
#   ./test_trained_model.sh --checkpoint 1200  # 指定使用 checkpoint-1200（覆盖自动选择最新 checkpoint）
#   ./test_trained_model.sh --gpt-rewrite      # 使用 GPT 模型进行改写
#   ./test_trained_model.sh --gpt-rewrite --gpt-rewrite-model gpt-4o  # 指定 GPT 模型
#   ./test_trained_model.sh --local-rewrite    # 使用本地 SGLang 服务器进行改写
#   ./test_trained_model.sh --local-rewrite --sglang-server-node localhost --sglang-server-port 30000  # 指定 SGLang 服务器
#   ./test_trained_model.sh --use-gptjudge --gptjudge-model gpt-4o  # 使用 GPTJudge 评估
#   ./test_trained_model.sh --no-gptjudge     # 禁用 GPTJudge，使用 LlamaGuard
#

set -e

# Absolute path of the directory that contains this script.
SCRIPT_DIR="$(
    cd "$(dirname "${BASH_SOURCE[0]}")" && pwd
)"
# The project root is the parent of the script directory.
PROJECT_ROOT="$(
    cd "${SCRIPT_DIR}/.." && pwd
)"

# Run everything from the project root so that relative paths resolve correctly.
cd "${PROJECT_ROOT}"

# ============================================================================
# Model configuration - edit BASE_MODEL here to switch models
# ============================================================================
BASE_MODEL="Qwen/Qwen3-4B-Instruct-2507"

# Derive the lowercase model identifier (used when building paths) from BASE_MODEL.
# Example: "Qwen/Qwen3-4B-Instruct-2507" -> "qwen3-4b-instruct-2507"
# "${BASE_MODEL##*/}" removes everything up to the last slash and leaves a
# slash-free name untouched, so both cases share a single code path.
MODEL_ID="$(printf '%s\n' "${BASE_MODEL##*/}" | tr '[:upper:]' '[:lower:]')"

# Startup banner.
banner_rule="========================================================================"
printf '%s\n' "$banner_rule" "Testing Trained Model - 3-Stage Process" "$banner_rule" ""

# ---- Default configuration (overridable from the command line) ----

# Training variant under test.
DIRECTION="increase"          # "decrease" or "increase"
METHOD="dpo"                  # "sft" | "dpo"
ADAPTER_TYPE="lora"
CHECKPOINT=""
ADAPTER_PATH=""               # derived from METHOD / ADAPTER_TYPE after argument parsing
MERGED_BASE_MODEL=""          # only used for prompt-tuning / prefix-tuning / p-tuning
USER_SET_ADAPTER_PATH=false   # becomes true when the user passes --adapter_path

# Input and output files.
TEST_FILE="data/pku_saferlhf_test.jsonl"
GENERATION_FILE="results/test_generation_results.jsonl"
OUTPUT_FILE="results/test_trained_model_results.jsonl"
METRICS_FILE="results/test_trained_model_metrics.json"

# Generation settings and stage selection.
BATCH_SIZE=32
MAX_NEW_TOKENS=2048
STAGE="all"
MAX_SAMPLES=""
NO_ADAPTER=false
NO_SYSTEM=false

# Rewrite through an external GPT model.
GPT_REWRITE=false
GPT_REWRITE_MODEL="gpt-4.1-mini-2025-04-14"

# Rewrite through a locally served model.
LOCAL_REWRITE=false
LOCAL_BACKEND="vllm"          # "sglang" | "vllm"
SGLANG_SERVER_NODE="localhost"
SGLANG_SERVER_PORT="8000"

# Judge used for evaluation.
USE_GPTJUDGE=false
GPTJUDGE_MODEL="gpt-4o-2024-11-20"
# ----------------------------------------------------------------------------
# Command-line parsing
# ----------------------------------------------------------------------------

# Abort with a clear message when a value-taking option is the last argument.
# Call as: require_option_value "$@" (with the option itself as $1).
# Without this check `shift 2` would fail and `set -e` would end the script
# without printing anything.
require_option_value() {
    if [ "$#" -lt 2 ]; then
        echo "❌ Error: option '$1' requires a value" >&2
        exit 1
    fi
}

# Parse command-line options into the global configuration variables.
# Arguments: the script's command-line arguments ("$@").
# Globals written: STAGE, TEST_FILE, ADAPTER_PATH, USER_SET_ADAPTER_PATH,
#   BASE_MODEL, METHOD, ADAPTER_TYPE, NO_ADAPTER, NO_SYSTEM, GPT_REWRITE,
#   GPT_REWRITE_MODEL, LOCAL_REWRITE, LOCAL_BACKEND, SGLANG_SERVER_NODE,
#   SGLANG_SERVER_PORT, USE_GPTJUDGE, GPTJUDGE_MODEL, MAX_SAMPLES, BATCH_SIZE,
#   GENERATION_FILE, OUTPUT_FILE, METRICS_FILE, CHECKPOINT.
# Behaviour:
#   - every option accepts both "--option value" and "--option=value";
#   - a value option without a value is a fatal error (exit 1);
#   - unknown arguments are skipped with a warning on stderr.
parse_cli_args() {
    while [[ $# -gt 0 ]]; do
        # Rewrite "--option=value" as "--option value" so one case arm handles both forms.
        if [[ "$1" == --*=* ]]; then
            set -- "${1%%=*}" "${1#*=}" "${@:2}"
        fi

        case "$1" in
            --stage)
                require_option_value "$@"
                STAGE="$2"
                shift 2
                ;;
            --test-file|--test_file)
                require_option_value "$@"
                TEST_FILE="$2"
                shift 2
                ;;
            --adapter_path|--adapter-path)
                require_option_value "$@"
                ADAPTER_PATH="$2"
                # Remember the explicit choice so the default path is not generated later.
                USER_SET_ADAPTER_PATH=true
                shift 2
                ;;
            --base_model|--base-model)
                require_option_value "$@"
                BASE_MODEL="$2"
                shift 2
                ;;
            --method)
                require_option_value "$@"
                METHOD="$2"
                shift 2
                ;;
            --adapter-type|--adapter_type)
                require_option_value "$@"
                ADAPTER_TYPE="$2"
                shift 2
                ;;
            --no-adapter)
                NO_ADAPTER=true
                shift
                ;;
            --no-system|--no_system)
                NO_SYSTEM=true
                shift
                ;;
            --with-system|--with_system)
                NO_SYSTEM=false
                shift
                ;;
            --gpt-rewrite|--gpt_rewrite)
                GPT_REWRITE=true
                shift
                ;;
            --gpt-rewrite-model|--gpt_rewrite_model)
                require_option_value "$@"
                GPT_REWRITE_MODEL="$2"
                shift 2
                ;;
            --local-rewrite|--local_rewrite|--sglang-rewrite|--sglang_rewrite)
                LOCAL_REWRITE=true
                LOCAL_BACKEND="sglang"
                shift
                ;;
            --vllm-rewrite|--vllm_rewrite)
                LOCAL_REWRITE=true
                LOCAL_BACKEND="vllm"
                shift
                ;;
            --local-backend|--local_backend)
                require_option_value "$@"
                LOCAL_BACKEND="$2"
                shift 2
                ;;
            --sglang-server-node|--sglang_server_node)
                require_option_value "$@"
                SGLANG_SERVER_NODE="$2"
                shift 2
                ;;
            --sglang-server-port|--sglang_server_port)
                require_option_value "$@"
                SGLANG_SERVER_PORT="$2"
                shift 2
                ;;
            --use-gptjudge|--use_gptjudge)
                USE_GPTJUDGE=true
                shift
                ;;
            --no-gptjudge|--no_gptjudge)
                USE_GPTJUDGE=false
                shift
                ;;
            --gptjudge-model|--gptjudge_model)
                require_option_value "$@"
                GPTJUDGE_MODEL="$2"
                shift 2
                ;;
            --quick)
                # Quick mode: 10 samples, a smaller batch, and separate "_quick" result files.
                MAX_SAMPLES="--max_samples 10"
                BATCH_SIZE=4
                GENERATION_FILE="results/test_generation_results_quick.jsonl"
                OUTPUT_FILE="results/test_trained_model_results_quick.jsonl"
                METRICS_FILE="results/test_trained_model_metrics_quick.json"
                shift
                ;;
            --checkpoint)
                require_option_value "$@"
                CHECKPOINT="$2"
                shift 2
                ;;
            *)
                # Keep going, but make typos visible instead of dropping them silently.
                echo "⚠️  Warning: ignoring unknown argument: $1" >&2
                shift
                ;;
        esac
    done
}

parse_cli_args "$@"

# Turn a model name or path into the lowercase identifier used in model/ paths.
# Arguments: $1 - model name ("Org/Name"), bare name, or filesystem path.
# Outputs:   the part after the last slash, lowercased, on stdout.
model_id_from_name() {
    local name="$1"
    # Drop trailing slashes so a directory path such as "models/foo/" still yields "foo".
    while [[ "$name" == */ ]]; do
        name="${name%/}"
    done
    printf '%s\n' "${name##*/}" | tr '[:upper:]' '[:lower:]'
}

# MODEL_ID was first derived before the command line was parsed. Re-derive it
# here so that a --base_model override is reflected in the default paths below
# instead of silently pointing at the hard-coded default model's directories.
MODEL_ID="$(model_id_from_name "$BASE_MODEL")"

# Generate the default adapter path from METHOD and ADAPTER_TYPE unless the user supplied one.
if [ "$USER_SET_ADAPTER_PATH" = false ]; then
    ADAPTER_PATH="model/${MODEL_ID}/${ADAPTER_TYPE}/${DIRECTION}/${METHOD}"
fi

case "$ADAPTER_TYPE" in
    prompt-tuning|prefix-tuning|p-tuning)
        # Prompt-like methods run on top of the merged base model directory
        # (produced by merging the DPO LoRA into the base model).
        MERGED_BASE_MODEL="model/${MODEL_ID}/peftfactory/${DIRECTION}/dpo_lora_merged_model"
        BASE_MODEL_FOR_INFER="$MERGED_BASE_MODEL"
        ;;
    *)
        # LoRA (and anything else): no merged model, infer on the original BASE_MODEL.
        MERGED_BASE_MODEL=""
        BASE_MODEL_FOR_INFER="$BASE_MODEL"
        ;;
esac

# Print the effective configuration for this run.
echo "Configuration:"
echo "  Stage: $STAGE"

if [[ "$GPT_REWRITE" != false || "$LOCAL_REWRITE" != false ]]; then
    # A rewrite service is selected, so the local model/adapter settings are not listed.
    echo "  Base model: N/A (using external rewrite service)"
else
    echo "  Base model: $BASE_MODEL_FOR_INFER"
    echo "  Method: $METHOD"
    echo "  Adapter type: $ADAPTER_TYPE"
    if [[ "$NO_ADAPTER" == true ]]; then
        echo "  Adapter: None (using base model only)"
    else
        echo "  Adapter: $ADAPTER_PATH"
    fi
    echo "  No system: $NO_SYSTEM"
fi

# Input/output files and direction, printed as "  <label>: <value>" lines.
printf '  %s: %s\n' \
    "Test file" "$TEST_FILE" \
    "Generation file" "$GENERATION_FILE" \
    "Output file" "$OUTPUT_FILE" \
    "Metrics file" "$METRICS_FILE" \
    "Direction" "$DIRECTION"

if [[ -n "$CHECKPOINT" ]]; then
    echo "  Checkpoint: $CHECKPOINT (manual override)"
fi

# Rewrite mode; batch size and token limit are only shown for local generation.
if [[ "$GPT_REWRITE" == true ]]; then
    echo "  GPT Rewrite: Enabled (model: $GPT_REWRITE_MODEL)"
elif [[ "$LOCAL_REWRITE" == true ]]; then
    echo "  Local Rewrite: Enabled (backend: $LOCAL_BACKEND, server: $SGLANG_SERVER_NODE:$SGLANG_SERVER_PORT)"
else
    echo "  Batch size: $BATCH_SIZE"
    echo "  Max new tokens: $MAX_NEW_TOKENS"
fi

# Which judge evaluates the outputs.
if [[ "$USE_GPTJUDGE" == true ]]; then
    echo "  Use GPTJudge: Enabled (model: $GPTJUDGE_MODEL)"
else
    echo "  Use GPTJudge: Disabled (using LlamaGuard)"
fi

echo "  Determinism: Enabled by default (seed=123, greedy decoding)"
if [[ -n "$MAX_SAMPLES" ]]; then
    echo "  Max samples: 10 (quick test)"
fi
echo ""

# Abort early when the test dataset is missing.
[[ -f "$TEST_FILE" ]] || {
    echo "❌ Error: Test file not found: $TEST_FILE"
    exit 1
}

echo "✅ Found test file: $TEST_FILE"

# Apply the --checkpoint override (only when an adapter is in use).
# This has to run before the local rewrite server section below, so that a
# server started by this script is given the adapter path of the right checkpoint.
if [[ "$NO_ADAPTER" == false && -n "$CHECKPOINT" ]]; then
    # Accept both "1200" and "checkpoint-1200" as input.
    case "$CHECKPOINT" in
        checkpoint-*) CHECKPOINT_DIRNAME="$CHECKPOINT" ;;
        *)            CHECKPOINT_DIRNAME="checkpoint-$CHECKPOINT" ;;
    esac

    case "$ADAPTER_PATH" in
        */checkpoint-*)
            # ADAPTER_PATH already names a checkpoint: swap it for the requested
            # one inside the same parent directory.
            ADAPTER_PARENT_DIR="$(dirname "$ADAPTER_PATH")"
            ADAPTER_PATH="${ADAPTER_PARENT_DIR}/${CHECKPOINT_DIRNAME}"
            ;;
        *)
            ADAPTER_PATH="${ADAPTER_PATH}/${CHECKPOINT_DIRNAME}"
            ;;
    esac

    if [[ ! -d "$ADAPTER_PATH" ]]; then
        echo "⚠️  Warning: Specified checkpoint not found: $ADAPTER_PATH"
        echo "    Falling back to automatic checkpoint selection"
        # Go back to the adapter root directory by stripping the trailing checkpoint-xxx.
        ADAPTER_PATH="$(dirname "$ADAPTER_PATH")"
    else
        echo "✅ Using specified checkpoint: $ADAPTER_PATH"
    fi
fi

# ----------------------------------------------------------------------------
# Local rewrite server bootstrap
#
# When local rewriting is enabled and the run includes stage 1 (generation),
# make sure a server answers at SGLANG_SERVER_NODE:SGLANG_SERVER_PORT. If none
# answers and the node is local, start one through the backend's deploy script
# and remember its PID so it can be stopped later.
#
# Globals written: SGLANG_SERVER_STARTED_BY_SCRIPT, SGLANG_SERVER_PID,
#   SGLANG_PID_FILE (exported when a server was started), SGLANG_URL.
# ----------------------------------------------------------------------------
SGLANG_SERVER_STARTED_BY_SCRIPT=false
SGLANG_SERVER_PID=""
SGLANG_PID_FILE=""

# Succeeds when the given --stage value includes stage 1 (generation).
# Uses the same set of values as cleanup_sglang_server, so that a server is
# started for every stage combination that the cleanup later handles
# ("12", "13" and "123" were previously missed here).
stage_includes_generation() {
    case "$1" in
        1|12|13|123|all) return 0 ;;
        *) return 1 ;;
    esac
}

# Print the PID of the first local server process that matches the selected
# backend and port; prints nothing when no such process exists.
find_local_server_pid() {
    local pattern
    if [ "$LOCAL_BACKEND" = "vllm" ]; then
        pattern="vllm serve .*--port ${SGLANG_SERVER_PORT}"
    else
        pattern="sglang.launch_server.*${SGLANG_SERVER_PORT}"
    fi
    # The pipeline's status is head's, so "no match" from pgrep does not trip set -e.
    pgrep -f "$pattern" | head -1 || echo ""
}

if [ "$LOCAL_REWRITE" = true ] && stage_includes_generation "$STAGE"; then
    echo "Checking/Starting SGLang server..."
    SGLANG_URL="http://${SGLANG_SERVER_NODE}:${SGLANG_SERVER_PORT}"

    # PID lookup and auto-start are only possible for a server on this machine.
    SERVER_IS_LOCAL=false
    if [ "$SGLANG_SERVER_NODE" = "localhost" ] || [ "$SGLANG_SERVER_NODE" = "127.0.0.1" ]; then
        SERVER_IS_LOCAL=true
    fi

    if curl -s "$SGLANG_URL/health" > /dev/null 2>&1; then
        echo "✅ SGLang server is already running at $SGLANG_URL"
        # The server was not started by this script, so SGLANG_SERVER_STARTED_BY_SCRIPT
        # stays false; the PID is looked up for display only.
        if [ "$SERVER_IS_LOCAL" = true ]; then
            SGLANG_SERVER_PID="$(find_local_server_pid)"
            if [ -n "$SGLANG_SERVER_PID" ]; then
                echo "  Found existing server PID: $SGLANG_SERVER_PID (will not be stopped by this script)"
            fi
        fi
    elif [ "$SERVER_IS_LOCAL" != true ]; then
        echo "⚠️  Warning: SGLang server is not accessible at $SGLANG_URL"
        echo "    Please ensure the server is running on $SGLANG_SERVER_NODE:$SGLANG_SERVER_PORT"
    else
        # No server is answering and the node is local: try to start one.
        if [ "$LOCAL_BACKEND" = "vllm" ]; then
            echo "Starting vLLM server using deploy_vllm_background.sh..."
            DEPLOY_SERVER_SCRIPT="$SCRIPT_DIR/deploy_vllm_background.sh"
        else
            echo "Starting SGLang server using deploy_sglang_background.sh..."
            DEPLOY_SERVER_SCRIPT="$SCRIPT_DIR/deploy_sglang_background.sh"
        fi

        if [ ! -f "$DEPLOY_SERVER_SCRIPT" ]; then
            echo "⚠️  Warning: deploy server script not found at $DEPLOY_SERVER_SCRIPT"
            echo "    Please ensure SGLang server is running at $SGLANG_URL"
        else
            # PID before startup (expected to be empty), used to tell a newly
            # started server apart from one that was already there.
            OLD_PID="$(find_local_server_pid)"
            if [ -n "$OLD_PID" ]; then
                echo "  Found existing server PID before startup: $OLD_PID"
            else
                echo "  No existing server found (will start new one)"
            fi

            # Align the server weights with the current test configuration.
            # NOTE: deploy_sglang.sh has a non-empty default LORA_PATH, so
            # "base model only" must be requested explicitly with --lora-path "".
            SGLANG_MODEL_ARG=(--model "$BASE_MODEL_FOR_INFER")
            SGLANG_LORA_ARG=(--lora-path "")
            if [ "$NO_ADAPTER" != true ]; then
                if [ "$ADAPTER_TYPE" = "lora" ] && [ -n "$ADAPTER_PATH" ]; then
                    SGLANG_LORA_ARG=(--lora-path "$ADAPTER_PATH")
                elif [ "$ADAPTER_TYPE" != "lora" ]; then
                    echo "⚠️  Warning: adapter_type=$ADAPTER_TYPE is not supported by SGLang here; starting server without LoRA"
                fi
            fi

            # Start the server; a failure is reported but does not abort the run.
            bash "$DEPLOY_SERVER_SCRIPT" \
                --server-node "$SGLANG_SERVER_NODE" \
                --server-port "$SGLANG_SERVER_PORT" \
                "${SGLANG_MODEL_ARG[@]}" \
                "${SGLANG_LORA_ARG[@]}" || {
                echo "⚠️  Warning: Failed to start SGLang server. Continuing anyway..."
            }

            # Per the original note, the deploy script already waits for readiness,
            # so only a short pause is needed before looking up the PID.
            sleep 2
            NEW_PID="$(find_local_server_pid)"

            if [ -z "$NEW_PID" ]; then
                echo "  ⚠️  Warning: Could not find server PID after startup"
            elif [ "$NEW_PID" = "$OLD_PID" ]; then
                # Same PID as before: the server was already running (for example
                # a concurrent start), so this script does not own it.
                SGLANG_SERVER_PID="$NEW_PID"
                echo "  Server already running with PID: $SGLANG_SERVER_PID (not started by this script)"
                SGLANG_PID_FILE=""
            else
                # A new PID appeared (or replaced the old one): this script started it.
                SGLANG_SERVER_PID="$NEW_PID"
                SGLANG_SERVER_STARTED_BY_SCRIPT=true
                if [ -z "$OLD_PID" ]; then
                    echo "  ✓ Server started by this script with PID: $SGLANG_SERVER_PID (will be stopped after Stage 1)"
                else
                    echo "  ✓ Server started by this script with PID: $SGLANG_SERVER_PID (replaced old PID: $OLD_PID, will be stopped after Stage 1)"
                fi
                # Write the PID to a file for the Python script to use.
                SGLANG_PID_FILE="/tmp/sglang_server_pid_$$.txt"
                echo "$SGLANG_SERVER_PID" > "$SGLANG_PID_FILE"
                export SGLANG_PID_FILE
            fi
        fi
    fi
    echo ""
fi

# Pick the adapter checkpoint to load for local generation.
#
# Runs only when neither GPT nor local rewrite is used, an adapter is wanted,
# and the run includes stage 1. The stage test accepts "1", "12", "13", "123"
# and "all"; combined stages such as "12" were previously skipped, so the
# adapter root directory was passed on without resolving a checkpoint.
# A --checkpoint override has already been applied to ADAPTER_PATH earlier.
#
# Globals read:    GPT_REWRITE, LOCAL_REWRITE, NO_ADAPTER, STAGE, ADAPTER_PATH
# Globals written: ADAPTER_PATH (resolved checkpoint directory),
#                  NO_ADAPTER (set to true when the adapter directory is missing)
resolve_adapter_checkpoint() {
    local trainer_state global_step latest candidate

    if [ "$GPT_REWRITE" != false ] || [ "$LOCAL_REWRITE" != false ] || [ "$NO_ADAPTER" != false ]; then
        return 0
    fi
    case "$STAGE" in
        1|12|13|123|all) ;;
        *) return 0 ;;
    esac

    # ADAPTER_PATH already names a checkpoint directory: use it as is.
    if [[ "$ADAPTER_PATH" == */checkpoint-* ]]; then
        if [ -d "$ADAPTER_PATH" ]; then
            echo "✅ Using specified checkpoint: $ADAPTER_PATH"
        else
            echo "⚠️  Warning: Specified checkpoint not found: $ADAPTER_PATH"
            echo "    Using directory directly"
        fi
        return 0
    fi

    if [ ! -d "$ADAPTER_PATH" ]; then
        echo "⚠️  Warning: Adapter not found: $ADAPTER_PATH"
        echo "    Using base model only"
        NO_ADAPTER=true
        return 0
    fi

    # Prefer the checkpoint matching global_step of the current training run, so
    # that stale checkpoints left in the directory are not picked by mistake.
    trainer_state="$ADAPTER_PATH/trainer_state.json"
    if [ -f "$trainer_state" ]; then
        # The path is passed as an argument rather than pasted into the Python
        # source, so quotes in the path cannot break the snippet.
        global_step="$(python3 -c 'import json, sys; print(json.load(open(sys.argv[1], "r")).get("global_step", ""))' "$trainer_state" 2>/dev/null || true)"
        if [[ "$global_step" =~ ^[0-9]+$ ]] && [ -d "$ADAPTER_PATH/checkpoint-$global_step" ]; then
            ADAPTER_PATH="$ADAPTER_PATH/checkpoint-$global_step"
            echo "✅ Found checkpoint from trainer_state.json: $ADAPTER_PATH (global_step=$global_step)"
            return 0
        fi
    fi

    # Fallback: the most recently modified checkpoint-* entry, found by comparing
    # modification times directly instead of parsing `ls` output.
    latest=""
    for candidate in "$ADAPTER_PATH"/checkpoint-*; do
        # An unmatched glob stays literal; skip it.
        [ -e "$candidate" ] || continue
        if [ -z "$latest" ] || [ "$candidate" -nt "$latest" ]; then
            latest="$candidate"
        fi
    done

    if [ -n "$latest" ]; then
        ADAPTER_PATH="$latest"
        echo "✅ Found latest checkpoint by mtime: $ADAPTER_PATH"
    else
        echo "⚠️  Warning: No checkpoint found in $ADAPTER_PATH"
        echo "    Using directory directly"
    fi
}

resolve_adapter_checkpoint

# Make sure the parent directory of every result file exists.
for output_path in "$GENERATION_FILE" "$OUTPUT_FILE" "$METRICS_FILE"; do
    mkdir -p "$(dirname "$output_path")"
done

# Assemble the test_trained_model.py invocation into the global string CMD.
#
# CMD is executed later with `eval`, so every argument is collected in an array
# and shell-quoted with printf %q. Values containing spaces or shell
# metacharacters (file paths, model names) therefore reach Python as exactly
# one argument each instead of being split or interpreted by the shell.
#
# Globals read:    STAGE, BASE_MODEL_FOR_INFER, TEST_FILE, GENERATION_FILE,
#   OUTPUT_FILE, METRICS_FILE, DIRECTION, MAX_SAMPLES, GPT_REWRITE,
#   LOCAL_REWRITE, BATCH_SIZE, MAX_NEW_TOKENS, NO_ADAPTER, ADAPTER_PATH,
#   NO_SYSTEM, METHOD, ADAPTER_TYPE, GPT_REWRITE_MODEL, SGLANG_SERVER_NODE,
#   SGLANG_SERVER_PORT, SGLANG_SERVER_STARTED_BY_SCRIPT, SGLANG_PID_FILE,
#   USE_GPTJUDGE, GPTJUDGE_MODEL
# Globals written: CMD
build_test_command() {
    local -a args=(
        python test_trained_model.py
        --stage "$STAGE"
        --base_model "$BASE_MODEL_FOR_INFER"
        --test_file "$TEST_FILE"
        --generation_file "$GENERATION_FILE"
        --output_file "$OUTPUT_FILE"
        --metrics_file "$METRICS_FILE"
        --direction "$DIRECTION"
    )

    # MAX_SAMPLES is either empty or the two words "--max_samples 10";
    # it is split into separate arguments on purpose.
    # shellcheck disable=SC2206
    args+=($MAX_SAMPLES)

    if [ "$GPT_REWRITE" = false ] && [ "$LOCAL_REWRITE" = false ]; then
        # Local generation: batch size, token limit and adapter options apply.
        args+=(--batch_size "$BATCH_SIZE" --max_new_tokens "$MAX_NEW_TOKENS")

        if [ "$NO_ADAPTER" = true ]; then
            args+=(--no_adapter)
        else
            args+=(--adapter_path "$ADAPTER_PATH")
        fi

        if [ "$NO_SYSTEM" = true ]; then
            args+=(--no_system)
        fi

        # Pass method and adapter type through to the Python script.
        args+=(--method "$METHOD" --adapter_type "$ADAPTER_TYPE")
    elif [ "$GPT_REWRITE" = true ]; then
        # Rewrite through a GPT model.
        args+=(--gpt_rewrite --gpt_rewrite_model "$GPT_REWRITE_MODEL")
    elif [ "$LOCAL_REWRITE" = true ]; then
        # Rewrite through the local server.
        args+=(--local_rewrite
               --sglang_server_node "$SGLANG_SERVER_NODE"
               --sglang_server_port "$SGLANG_SERVER_PORT")
        # Hand over the PID file only when this script started the server.
        if [ "$SGLANG_SERVER_STARTED_BY_SCRIPT" = true ] && [ -n "$SGLANG_PID_FILE" ]; then
            args+=(--sglang_pid_file "$SGLANG_PID_FILE")
        fi
    fi

    # GPTJudge options.
    if [ "$USE_GPTJUDGE" = true ]; then
        args+=(--use_gptjudge --gptjudge_model "$GPTJUDGE_MODEL")
    fi

    # Serialise the array into an eval-safe command string.
    CMD="$(printf '%q ' "${args[@]}")"
}

build_test_command

echo "Running test (Stage: $STAGE)..."
echo ""

# Fallback cleanup for the local rewrite server; registered as an EXIT trap
# further down so that it also runs when the script stops on an error.
#
# Acts only when local rewrite is enabled, the selected stage includes stage 1,
# and the PID file still exists (per the original note, the Python script
# deletes that file once it has shut the server down itself). The server is
# stopped only when this script started it: SIGTERM first, SIGKILL if the
# process is still alive two seconds later.
cleanup_sglang_server() {
    # Nothing to do unless local rewrite is enabled.
    [ "$LOCAL_REWRITE" = true ] || return 0

    # Does the selected stage include stage 1?
    case "$STAGE" in
        1|all|12|13|123) STAGE_CONTAINS_1=true ;;
        *)               STAGE_CONTAINS_1=false ;;
    esac
    [ "$STAGE_CONTAINS_1" = true ] || return 0

    # A PID file that is gone means the server has already been shut down.
    { [ -n "$SGLANG_PID_FILE" ] && [ -f "$SGLANG_PID_FILE" ]; } || return 0

    # The PID file is still there, so the Python script may not have stopped
    # the server (for example because it exited with an error).
    echo ""
    echo "Stage 1 completed. Shutting down SGLang server (fallback cleanup)..."

    # Only stop a server that this script started.
    { [ "$SGLANG_SERVER_STARTED_BY_SCRIPT" = true ] && [ -n "$SGLANG_SERVER_PID" ]; } || return 0

    if ! ps -p "$SGLANG_SERVER_PID" > /dev/null 2>&1; then
        echo "  Server process (PID: $SGLANG_SERVER_PID) is not running"
    else
        echo "  Stopping SGLang server (PID: $SGLANG_SERVER_PID)..."
        kill "$SGLANG_SERVER_PID" 2>/dev/null || true
        # Give the process time to exit.
        sleep 2
        # Still alive: force-kill it.
        if ps -p "$SGLANG_SERVER_PID" > /dev/null 2>&1; then
            kill -9 "$SGLANG_SERVER_PID" 2>/dev/null || true
            sleep 1
        fi
        echo "  ✓ SGLang server stopped"
    fi

    # Remove the PID file.
    rm -f "$SGLANG_PID_FILE" 2>/dev/null || true
}

# Register the cleanup function so it runs on every exit path, including
# unexpected ones.
trap cleanup_sglang_server EXIT

# Run the test command and record its exit status.
# The `|| EXIT_CODE=$?` form keeps `set -e` from ending the script at this line
# when the command fails, so the status is really captured and the explicit
# cleanup below is reached. CMD is quoted so the shell does not word-split or
# glob-expand it before eval parses it.
EXIT_CODE=0
eval "$CMD" || EXIT_CODE=$?

# Clean up now, then drop the trap so the cleanup does not run a second time.
cleanup_sglang_server
trap - EXIT

# Exit with the Python script's exit status.
exit "$EXIT_CODE"

