#!/usr/bin/env bash
# Strict mode, spelled out with long option names:
#   errexit  - stop at the first command that fails
#   nounset  - expanding an unset variable is an error
#   xtrace   - print every command before it runs (verbose job logs)
#   pipefail - a pipeline fails if any of its stages fails
set -o errexit -o nounset -o xtrace -o pipefail

#=========================================================================================
# Batched progressive evaluation script for large datasets (100K+ examples)
#
# This version processes the data in batches to avoid OOM issues when each job
# handles the MATH-B dataset (181 problems).
#
# Usage: <this script> <job_id>
#=========================================================================================

# --- Condor Job Parameters ---
# The job ID from Condor is passed as the first argument.
# "${1:-}" is used rather than a bare "$1" because this script runs with
# `set -u`: with no arguments, "$1" would abort with "unbound variable"
# before the usage message below could be printed.
if [[ -z "${1:-}" ]]; then
    # Diagnostics go to stderr so they are not mixed into regular output.
    echo "Error: Job ID not provided." >&2
    echo "Usage: $0 <job_id>" >&2
    exit 1
fi
JOB_ID=$1

# Total number of jobs (adjust based on your setup)
NUM_JOBS=1

# --- Model Configuration ---
# Model identifier (hub-style "org/name") handed to the evaluator as
# --model_path.
# Alternative model, kept for reference:
#   MODEL_PATH="deepseek-ai/DeepSeek-R1-Distill-Qwen-7B"
MODEL_PATH='meta-llama/Llama-3.1-8B'


# --- Dataset Configuration ---
# DATA_HOME is the directory holding the evaluation dataset. It may be
# overridden from the environment and defaults to the current directory.
DATA_HOME=${DATA_HOME:-"./"}
# DATASET_PATH is derived from DATA_HOME so that overriding DATA_HOME actually
# relocates the dataset. "${DATA_HOME%/}" drops one trailing slash, so the
# default "./" yields exactly "./union_dataset_comprehensive.parquet".
DATASET_PATH="${DATA_HOME%/}/union_dataset_comprehensive.parquet"

# --- Progressive Evaluation Parameters ---
# Alternative configuration with a smaller sample budget, kept for reference:
#   INITIAL_SAMPLES=64
#   SAMPLE_INCREMENT=64
#   TARGET_SAMPLES=128
#   N_SAMPLES_CHUNK=64

# Active configuration.
# Samples drawn in the first round, to quickly eliminate easy problems.
INITIAL_SAMPLES="64"
# Samples added in each further round.
SAMPLE_INCREMENT="64"
# Total samples at which to stop; also used as the suffix of EXP_NAME.
TARGET_SAMPLES="1024"
# Samples generated at a time; kept equal to SAMPLE_INCREMENT.
N_SAMPLES_CHUNK="64"

# --- Generation Parameters ---
# Sampling temperature.
# NOTE(review): this value was chosen as the recommendation for DeepSeek-R1
# models, while MODEL_PATH currently points at Llama-3.1-8B - confirm it is
# still the intended setting.
TEMPERATURE="0.6"
# Standard top-p sampling.
TOP_P="0.95"
# Generation length limit, sized for complex reasoning chains.
MAX_TOKENS="32768"

# --- Batching & Performance ---
# Number of problems processed per batch (--batch_size).
BATCH_SIZE="64"
# Tensor-parallel size (--tensor_parallel_size); presumably the number of
# GPUs the model is sharded across - confirm against the evaluator.
TP_SIZE="1"
# Number of grading workers (--num_grading_workers).
NUM_GRADING_WORKERS="32"

# --- Output Configuration ---
OUTPUT_DIR='./evaluations'
DATASET_NAME='math_beyond'
# Experiment name = model tag + dataset name + run type + sample budget.
printf -v EXP_NAME 'llama3.1_8b_%s_progressive_batched_%s' \
    "${DATASET_NAME}" "${TARGET_SAMPLES}"

# --- Environment Setup ---
# Exported so that processes started by this script inherit the setting.
TOKENIZERS_PARALLELISM=false
export TOKENIZERS_PARALLELISM
# Load the CUDA 12.4 environment module.
module load cuda/12.4
#module load gcc/4.7
# Activate the virtualenv in ./venv (`.` is equivalent to `source` in bash).
. ./venv/bin/activate

# --- Execution ---
# Print the run summary to stdout; each printf argument becomes one line.
printf '%s\n' \
    "======================================" \
    "Batched Progressive Evaluation - Job ${JOB_ID}/${NUM_JOBS}" \
    "======================================" \
    "Model: ${MODEL_PATH}" \
    "Dataset: ${DATASET_PATH}" \
    "Progressive: ${INITIAL_SAMPLES} -> ${TARGET_SAMPLES} (increment: ${SAMPLE_INCREMENT})" \
    "Batch size: ${BATCH_SIZE} problems" \
    "Temperature: ${TEMPERATURE}, Top-p: ${TOP_P}" \
    "Output: ${OUTPUT_DIR}/${EXP_NAME}" \
    "======================================"

# Run the batched progressive evaluation.
# The command line is collected in an array first, one flag/value pair per
# line, and then expanded with "${eval_args[@]}" so every value stays a single
# argument.
eval_args=(
    --model_path "${MODEL_PATH}"
    --dataset_path "${DATASET_PATH}"
    --output_dir "${OUTPUT_DIR}"
    --exp_name "${EXP_NAME}"
    --initial_samples "${INITIAL_SAMPLES}"
    --sample_increment "${SAMPLE_INCREMENT}"
    --target_samples "${TARGET_SAMPLES}"
    --temperature "${TEMPERATURE}"
    --top_p "${TOP_P}"
    --max_tokens "${MAX_TOKENS}"
    --batch_size "${BATCH_SIZE}"
    --n_samples_chunk "${N_SAMPLES_CHUNK}"
    --n_jobs "${NUM_JOBS}"
    --job_id "${JOB_ID}"
    --tensor_parallel_size "${TP_SIZE}"
    --num_grading_workers "${NUM_GRADING_WORKERS}"
    --save_intermediate
)
python3 ./evaluate_progressive_batched.py "${eval_args[@]}"

# Completion banner; only reached when the evaluator exited successfully,
# because the script runs with `set -e`.
printf '%s\n' \
    "======================================" \
    "Job ${JOB_ID}/${NUM_JOBS} completed successfully!" \
    "Results saved to: ${OUTPUT_DIR}/${EXP_NAME}" \
    "======================================"