#!/bin/bash
# OLMo3 base — hard puzzle pass@32 (bridges_8x8de + undead_5x5de) for §4.1
# primitive/motif analysis.
#
# Same recipe as scripts/evals/run_hard_puzzle_pass32.sh, but pointed at the
# base model without puzzle SFT, allenai/OLMo-3-7B-Instruct-SFT. Uses 100
# problems × 32 rollouts per task to match the v90_sft_puzzles /
# v90_gspo_puzzles input scale.
#
# Usage:
#   nohup ./scripts/evals/eval_olmo3_base_puzzles_pass32.sh \
#       > logs/olmo3_base_hard_puzzle_pass32.log 2>&1 &

# Strict mode: stop on a failing command, on use of an unset variable, and on
# a failure in any stage of a pipeline.
set -o errexit -o nounset -o pipefail

# Repository root. A non-empty PROJECT_DIR from the environment wins;
# otherwise it is resolved as two directories above this script's location.
if [[ -z "${PROJECT_DIR:-}" ]]; then
    PROJECT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)"
fi
cd "$PROJECT_DIR"

# Activate the Python virtualenv. VLLM_VENV_PATH overrides the default
# location under $HOME.
. "${VLLM_VENV_PATH:-$HOME/verl-vllm012}/bin/activate"

# Environment for the eval process; exported so the Python child inherits it.
# vLLM engine / attention-backend switches (semantics are defined by vLLM).
export VLLM_USE_V1=1 \
       VLLM_USE_TRTLLM_ATTENTION=0 \
       VLLM_ATTENTION_BACKEND=FLASH_ATTN
# Unbuffered Python output so a redirected log file updates as the run goes;
# WANDB_CONSOLE=off presumably disables W&B console capture — confirm.
export PYTHONUNBUFFERED=1 WANDB_CONSOLE=off

# Run configuration.
MODEL='allenai/OLMo-3-7B-Instruct-SFT'                  # checkpoint under evaluation
TASKS='bridges_8x8de_pass32,undead_5x5de_pass32'        # comma-separated task names
OUTPUT_DIR='results/olmo3_base/hard_puzzle_pass32'      # relative to the working directory
SEED=42

# System prompt, assembled sentence by sentence for readability; the pieces
# concatenate into a single line with single spaces between sentences.
SYS_INST='A conversation between User and Assistant. '
SYS_INST+='The user asks a question, and the Assistant solves it step by step by reasoning. '
SYS_INST+='Provide the reasoning in <reasoning> reasoning here </reasoning> '
SYS_INST+='and the final solution within <answer> answer here </answer>'

# Make sure the results directory and the logs directory exist.
mkdir -p "${OUTPUT_DIR}" logs

# Start-of-run banner: one printf, one output line per argument.
printf '%s\n' \
    "==========================================" \
    "OLMo3 base — hard puzzle pass@32" \
    "Model: ${MODEL}" \
    "Tasks: ${TASKS}" \
    "Limit: 100 problems x 32 rollouts per task" \
    "Output: ${OUTPUT_DIR}" \
    "Started: $(date '+%Y-%m-%d %H:%M:%S')" \
    "=========================================="

# Launch the evaluation through the project's lm_eval wrapper script.
# Engine settings for the vllm backend travel as one comma-separated
# --model_args string.
model_args="pretrained=${MODEL},data_parallel_size=4,gpu_memory_utilization=0.95,max_model_len=24000,trust_remote_code=True"

# Arguments are collected in an array so every expansion stays a single,
# quoted word (including the seed) and each flag can carry its own note.
eval_args=(
    --model vllm
    --model_args "${model_args}"
    --include_path evaluate/custom_tasks    # resolved against the working directory
    --tasks "${TASKS}"
    --batch_size auto
    --apply_chat_template
    --system_instruction "${SYS_INST}"
    --limit 100                             # keep in sync with the "Limit:" banner line
    --seed "${SEED}"
    --output_path "${OUTPUT_DIR}"
    --log_samples
)

python3 scripts/evals/lm_eval_dp_diverse.py "${eval_args[@]}"

# Completion banner (blank line first) with the finish timestamp, followed by
# a listing of the output directory.
printf '%s\n' \
    "" \
    "==========================================" \
    "Eval done at $(date '+%Y-%m-%d %H:%M:%S')" \
    "=========================================="
ls -la "${OUTPUT_DIR}"
