#!/bin/bash
# Bridges 8x8de + Undead 5x5de pass@32 on novelty_production_gspo_topk100_a01 step 15.
# Uses lm_eval_dp_diverse.py for proper per-prompt seed diversification (CLAUDE.md rule).
# GPUs: 0,1,2 only (GPU 3 reserved for user's microrun training).

# Abort the script as soon as any unguarded command fails.
set -o errexit

# Repository root: keep a caller-supplied PROJECT_DIR; otherwise resolve it as
# two directory levels above the directory that contains this script.
if [[ -z "${PROJECT_DIR:-}" ]]; then
    PROJECT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)"
fi
cd "$PROJECT_DIR"

# Activate the Python virtualenv; VLLM_VENV_PATH overrides the default
# location under $HOME.
. "${VLLM_VENV_PATH:-$HOME/verl-vllm012}/bin/activate"

# Environment handed to every child process started below (wandb, Python and
# vLLM settings). CUDA_VISIBLE_DEVICES exposes only GPUs 0, 1 and 2.
export \
    WANDB_CONSOLE=off \
    PYTHONUNBUFFERED=1 \
    VLLM_USE_TRTLLM_ATTENTION=0 \
    VLLM_ATTENTION_BACKEND=FLASH_ATTN \
    VLLM_USE_V1=1 \
    CUDA_VISIBLE_DEVICES=0,1,2

# Checkpoint under evaluation and the locations of its inputs/outputs.
# All paths are relative and resolve against the current working directory.
MODEL_PATH='checkpoints/olmo3-puzzle-grpo/novelty_production_gspo_topk100_a01/merged_step_15'
CUSTOM_TASKS_PATH='evaluate/custom_tasks'
OUTPUT_DIR='results/novelty_prod_s15_puzzle_pass32'
LABEL='novelty_prod_s15'
# NOTE(review): LOG_FILE is not referenced anywhere else in the visible script;
# presumably the caller redirects output to it — confirm or remove.
LOG_FILE='logs/novelty_prod_s15_puzzle_eval.log'

# System prompt passed to the eval driver via --system_instruction.
SYSTEM_INSTRUCTION='A conversation between User and Assistant. The user asks a question, and the Assistant solves it step by step by reasoning. Provide the reasoning in <reasoning> reasoning here </reasoning> and the final solution within <answer> answer here </answer>'

# Make sure the results directory and the logs directory exist up front.
mkdir -p "$OUTPUT_DIR" logs

# Comma-separated engine options passed to the eval driver via --model_args.
# data_parallel_size=3 lines up with the three GPUs in CUDA_VISIBLE_DEVICES.
model_args="pretrained=${MODEL_PATH}"
model_args+=",tensor_parallel_size=1"
model_args+=",data_parallel_size=3"
model_args+=",gpu_memory_utilization=0.85"
model_args+=",max_model_len=26000"

run_task() {
    local task=$1
    local out="${OUTPUT_DIR}/${LABEL}/${task}"

    if ls "${out}"/*/results*.json 2>/dev/null | head -1 | grep -q .; then
        echo "[SKIP] ${task} (results exist in ${out})"
        return 0
    fi

    mkdir -p "${out}"

    echo "=========================================="
    echo "${task}: novelty_production s15 (n=4)"
    echo "Model: ${MODEL_PATH}"
    echo "Output: ${out}"
    echo "Started: $(date '+%Y-%m-%d %H:%M:%S')"
    echo "=========================================="

    python scripts/evals/lm_eval_dp_diverse.py \
        --model vllm \
        --model_args "${model_args}" \
        --include_path ${CUSTOM_TASKS_PATH} \
        --tasks "${task}" \
        --batch_size auto \
        --apply_chat_template \
        --system_instruction "${SYSTEM_INSTRUCTION}" \
        --seed 42 \
        --output_path "${out}" \
        --log_samples

    echo "[DONE] ${task} at $(date '+%Y-%m-%d %H:%M:%S')"
}

# Tasks are evaluated one after another, each in its own eval invocation
# (per the original author's note, vLLM does not batch multiple tasks across
# schemas in a single run).
for puzzle_task in bridges_8x8de_pass32 undead_5x5de_pass32; do
    run_task "${puzzle_task}"
done

echo
printf '%s\n' \
    "==========================================" \
    "Computing pass@k..." \
    "=========================================="

# Aggregation is best-effort: a failure is reported but does not abort the
# script or change its exit status.
if ! python scripts/evals/compute_pass_at_k.py "${OUTPUT_DIR}/${LABEL}" \
    --k_values 1,4,8,16,32 \
    --workers 8 \
    --json_output "${OUTPUT_DIR}/${LABEL}_pass_at_k.json" \
    --per_problem; then
    echo "compute_pass_at_k.py exited non-zero — see output"
fi

echo
printf 'Done. %s\n' "$(date '+%Y-%m-%d %H:%M:%S')"
