#!/bin/bash

# Load conda environment
# Fail fast if the environment or the repository checkout is unavailable:
# continuing would run the rest of the script with the wrong interpreter or
# from the wrong working directory (the default OUTPUT_DIR is relative).
source /data/user/miniconda3/etc/profile.d/conda.sh || {
    echo "[error] Cannot source /data/user/miniconda3/etc/profile.d/conda.sh" >&2
    exit 1
}
conda activate rllm2 || {
    echo "[error] Cannot activate conda environment: rllm2" >&2
    exit 1
}
cd /data/user/rllm || {
    echo "[error] Cannot cd to /data/user/rllm" >&2
    exit 1
}

# Auto-export every variable assigned while sourcing .env (set -a), then
# switch auto-export back off.
set -a
. /data/user/rllm/.env
set +a

# Trace commands from here on. Tracing starts only after .env has been
# sourced, so the assignments made there are not echoed to the trace.
set -x
# Print GPU info
srun -l bash -c 'echo "Node: $(hostname -s)"; nvidia-smi -L'

# --- vLLM / torch env
# Remove the ROCm/HIP device-visibility variables from the environment.
# NOTE(review): presumably so they cannot interfere with the CUDA device
# selection used below — confirm.
unset ROCR_VISIBLE_DEVICES ROCM_VISIBLE_DEVICES HIP_VISIBLE_DEVICES
export VLLM_ATTENTION_BACKEND=FLASH_ATTN
export PYTORCH_CUDA_ALLOC_CONF="expandable_segments:False"
export VLLM_USE_V1=1
export VLLM_ALLOW_LONG_MAX_MODEL_LEN=1
# NOTE(review): assuming the unit is seconds (as the _S suffix suggests), this
# value is large enough to effectively disable the timeout — confirm.
export VLLM_ENGINE_ITERATION_TIMEOUT_S=1000000000
export CUDA_DEVICE_ORDER=PCI_BUS_ID
export HYDRA_FULL_ERROR=1
export RAY_DISABLE_DASHBOARD=1

# Best-effort cleanup of leftovers from earlier runs. Every step is allowed
# to fail (`|| true`) so a clean machine does not abort the script.
#   - stop any running Ray instance (forced),
#   - SIGKILL processes whose command line matches "ray::",
#   - delete ray_* entries under /tmp/$USER (errors silenced).
ray stop -f || true
pkill -9 -f "ray::" || true
rm -rf "/tmp/$USER"/ray_* 2>/dev/null || true
# Point Ray's temp dir and the generic TMPDIR at directories under /data/user,
# create them if missing, and restrict them to the owner (mode 700).
export RAY_TMPDIR="/data/user/ray"
export TMPDIR="/data/user/tmp"
mkdir -p "$RAY_TMPDIR" "$TMPDIR"
chmod 700 "$RAY_TMPDIR" "$TMPDIR"
export RAY_object_store_allow_fallback_to_memory=1

# Physical (symlink-resolved) path of the current working directory.
# NOTE(review): not exported and not referenced again in the visible part of
# this script — confirm whether it is still needed.
RLLM_DIR="$(pwd -P)"

# -----------------------------
# Configuration
# -----------------------------
# Model handed to `vllm serve` and the name it is served under.
MODEL_PATH='Qwen/Qwen2.5-Coder-7B-Instruct'
MODEL_NAME='Qwen/Qwen2.5-Coder-7B-Instruct'

# Server bind address, port, parallelism and GPU selection.
HOST='0.0.0.0'
PORT='30000'
TP='1'
DP='1'
CUDA_DEVICES='0'

# Endpoint the evaluation client talks to (loopback, same port as the server).
BASE_URL="http://127.0.0.1:${PORT}/v1"

# Evaluation settings.
N_PARALLEL='64'
FIXER_ATTEMPTS_VAL='1'

# Settings below keep a value already present in the environment and fall
# back to the default shown when it is unset or empty.
# VAL_DATASETS="${VAL_DATASETS:-bugbench:test bugbench_human:test bugbench_qwen7b_sampled:test bugbench_gpt-oss-20b_sampled:test bugbench_adversarial:test}"
: "${VAL_DATASETS:=bugbench:test_small bugbench_human:test_small bugbench_qwen7b_sampled:test_small bugbench_gpt-oss-20b_sampled:test_small}"
: "${OUTPUT_DIR:=logs/eval/bugs/small_base}"

# Number of repeated evaluation runs and the seed used for the first one.
: "${NUM_SEEDS:=3}"
: "${SEED_START:=0}"

# Print the run summary (one argument per output line), followed by a blank
# line, then make sure the output root directory exists.
printf '%s\n' \
    "==============================================" \
    "Base Model Evaluation" \
    "==============================================" \
    "Model: $MODEL_PATH" \
    "Configuration:" \
    "  Host: $HOST:$PORT" \
    "  Tensor Parallel: $TP" \
    "  Data Parallel: $DP" \
    "  CUDA Devices: $CUDA_DEVICES" \
    "  Parallel tasks: $N_PARALLEL" \
    "  Fixer attempts: $FIXER_ATTEMPTS_VAL" \
    "  Val datasets: $VAL_DATASETS" \
    "  Output dir: $OUTPUT_DIR" \
    "  Num seeds: $NUM_SEEDS (start: $SEED_START)" \
    "==============================================" \
    ""

mkdir -p "$OUTPUT_DIR"

# -----------------------------
# Helper: Kill vLLM server
# -----------------------------
# Stops the background vLLM server and any stray server on the same port.
#
# Globals:
#   VLLM_PID           (read/written) PID of the launched server; cleared
#                      once the process has been reaped.
#   PORT               (read) used to match stray vllm command lines.
#   VLLM_STOP_GRACE_S  (read, optional) seconds to wait after SIGTERM before
#                      escalating to SIGKILL; defaults to 30.
# Outputs:   one progress line on stdout.
# Returns:   always 0; every step is best-effort.
cleanup_vllm() {
    local grace="${VLLM_STOP_GRACE_S:-30}"
    local waited=0

    echo "[cleanup] Stopping vLLM server..."
    if [[ -n "${VLLM_PID:-}" ]]; then
        kill "$VLLM_PID" 2>/dev/null || true

        # Give the server a bounded time to exit on SIGTERM. A bare `wait`
        # here would block forever if the process ignores the signal.
        while kill -0 "$VLLM_PID" 2>/dev/null && (( waited < grace )); do
            sleep 1
            waited=$((waited + 1))
        done

        # Still alive after the grace period: force it down.
        if kill -0 "$VLLM_PID" 2>/dev/null; then
            kill -9 "$VLLM_PID" 2>/dev/null || true
        fi

        # Reap the child; its exit status is irrelevant here.
        wait "$VLLM_PID" 2>/dev/null || true
        VLLM_PID=""
    fi
    # Also kill any stray vllm processes on this port
    pkill -9 -f "vllm.*--port.*${PORT}" 2>/dev/null || true
    sleep 2
}

# Run cleanup_vllm whenever the script exits, whatever the exit path
trap 'cleanup_vllm' EXIT

# -----------------------------
# Helper: Wait for vLLM to be ready
# -----------------------------
# Polls ${BASE_URL}/models until the response contains "data".
#
# Globals:   BASE_URL (read), VLLM_PID (read)
# Arguments: $1 - model name; accepted for call-site compatibility, the
#                 readiness probe does not inspect it
#            $2 - maximum number of polls, 5 seconds apart (default 60)
# Outputs:   progress lines on stdout
# Returns:   0 once the endpoint answers; 1 if the server process is gone
#            or the polls are exhausted
wait_for_vllm() {
    local max_attempts="${2:-60}"
    local attempt   # kept local so the caller's variables are not clobbered

    echo "[vllm] Waiting for server to be ready..."
    for ((attempt = 1; attempt <= max_attempts; attempt++)); do
        # --max-time bounds each probe; without it a connection that is
        # accepted but never answered would stall the loop indefinitely and
        # max_attempts would no longer limit the total wait.
        if curl -fsS --max-time 10 "${BASE_URL}/models" 2>/dev/null | grep -q "data"; then
            echo "[vllm] Server is ready!"
            return 0
        fi

        # Check if server process is still alive
        if ! kill -0 "$VLLM_PID" 2>/dev/null; then
            echo "[vllm] ERROR: Server process died"
            return 1
        fi

        echo "[vllm] Waiting... (${attempt}/${max_attempts})"
        sleep 5
    done

    echo "[vllm] ERROR: Server did not become ready in time"
    return 1
}

# -----------------------------
# Start vLLM server
# -----------------------------
VLLM_LOG="${OUTPUT_DIR}/vllm_server.log"

echo "[vllm] Starting server for: $MODEL_PATH"
echo "[vllm] Log: $VLLM_LOG"

# Arguments for `vllm serve`, collected in an array so each value stays a
# single word.
vllm_serve_args=(
    "$MODEL_PATH"
    --host "$HOST"
    --port "$PORT"
    --served-model-name "$MODEL_NAME"
    --tensor-parallel-size "$TP"
    --data-parallel-size "$DP"
)

# Launch in the background with the configured device selection; stdout and
# stderr are appended to the server log.
CUDA_VISIBLE_DEVICES="$CUDA_DEVICES" vllm serve "${vllm_serve_args[@]}" >> "$VLLM_LOG" 2>&1 &
VLLM_PID=$!

echo "[vllm] Launched with PID: $VLLM_PID"

# Block until the server answers; on failure show the tail of its log and
# abort (the EXIT trap takes care of cleanup).
wait_for_vllm "$MODEL_NAME" 60 || {
    echo "[error] Failed to start vLLM for $MODEL_NAME"
    echo "[error] Last 50 lines of log:"
    tail -n 50 "$VLLM_LOG" 2>/dev/null || true
    exit 1
}

# -----------------------------
# Run evaluation (repeated)
# -----------------------------
# Runs the evaluator NUM_SEEDS times against the running server. Each run
# writes to ${OUTPUT_DIR}/seed_<SEED>/ and its combined output is also saved
# to eval.log in that directory. A failed run is recorded in ANY_FAIL but
# does not stop the remaining runs.
#
# NOTE(review): SEED reaches the evaluator only through PYTHONHASHSEED; no
# seed option is passed on the command line and temperature is 0.0 — confirm
# that this is the intended source of run-to-run variation.

# Reject values that would make the loop run zero times and let the script
# report success without evaluating anything. SEED_START must be a
# non-negative integer because it is used as PYTHONHASHSEED.
uint_re='^(0|[1-9][0-9]*)$'
if ! [[ "$NUM_SEEDS" =~ $uint_re ]] || (( NUM_SEEDS < 1 )); then
    echo "[error] NUM_SEEDS must be a positive integer (got: '${NUM_SEEDS}')" >&2
    exit 1
fi
if ! [[ "$SEED_START" =~ $uint_re ]]; then
    echo "[error] SEED_START must be a non-negative integer (got: '${SEED_START}')" >&2
    exit 1
fi

echo ""
echo "[eval] Running evaluation for NUM_SEEDS=${NUM_SEEDS} (SEED_START=${SEED_START})..."

# Split the whitespace-separated dataset list into an array so every entry is
# passed as its own argument without being subject to pathname expansion.
# `read -d ''` consumes the whole string and returns non-zero at end of
# input, hence the `|| true`.
val_datasets=()
read -r -d '' -a val_datasets <<< "$VAL_DATASETS" || true

ANY_FAIL=0

for ((idx = 0; idx < NUM_SEEDS; idx++)); do
    SEED=$((SEED_START + idx))
    RUN_OUTPUT_DIR="${OUTPUT_DIR}/seed_${SEED}"
    mkdir -p "$RUN_OUTPUT_DIR"

    echo ""
    echo "[eval] =============================================="
    echo "[eval] Seed ${SEED} ($((idx + 1))/${NUM_SEEDS})"
    echo "[eval] Output: ${RUN_OUTPUT_DIR}"
    echo "[eval] =============================================="

    EVAL_LOG="${RUN_OUTPUT_DIR}/eval.log"

    PYTHONHASHSEED="$SEED" \
    python -m examples.bugs_refactor.run_generator_fixer_flow \
        --val_datasets "${val_datasets[@]}" \
        --model "$MODEL_NAME" \
        --base_url "${BASE_URL}" \
        --n_parallel "${N_PARALLEL}" \
        --eval_pregenerated_only \
        --evaluate_codegen \
        --include_failed_test_output \
        --fixer_attempts_val "${FIXER_ATTEMPTS_VAL}" \
        --temperature 0.0 \
        --save_results \
        --output_dir "${RUN_OUTPUT_DIR}" \
        2>&1 | tee "$EVAL_LOG"

    # get python exit status (not tee); must be read immediately after the
    # pipeline, before any other command overwrites PIPESTATUS
    EVAL_STATUS=${PIPESTATUS[0]}

    if [[ $EVAL_STATUS -eq 0 ]]; then
        echo "[eval] ✅ Evaluation complete for seed ${SEED}"
    else
        echo "[eval] ❌ Evaluation failed for seed ${SEED} (exit code: $EVAL_STATUS)"
        ANY_FAIL=1
    fi
done

# Cleanup
# Stop the server now so the summary below is printed after shutdown, then
# drop the EXIT trap: left in place it would invoke cleanup_vllm a second
# time on the final `exit` (another pkill -9 and sleep for no benefit).
cleanup_vllm
trap - EXIT

echo ""
echo "=============================================="
if [[ $ANY_FAIL -eq 0 ]]; then
    echo "All evaluations complete!"
else
    echo "Evaluations complete (some seeds failed)."
fi
echo "Results saved under: $OUTPUT_DIR/seed_*"
echo "=============================================="

# Exit status is 1 if any seed failed, 0 otherwise.
exit "$ANY_FAIL"
