#!/bin/bash

# Runs the judge evaluation over multiple single-turn model response files.

# Models to judge in this batch. Only uncommented entries are part of the
# array; the commented-out names are kept so they can be re-enabled quickly.
models=(
    # "o4-mini"
    # "gpt-4.1"
    # "gpt-4o"
    "gpt-5"
    # "gpt-5-think"
    # "o3"
    # "claude-opus-4-1-thinking24"
    # "claude-sonnet-4-20250514-thinking24"
    # "claude-sonnet-4-20250514"
    "claude-opus-4-1"
    # "gemini-2.5-flash"
    "gemini-2.5-pro"
    # "us.amazon.nova-premier-v1:0"
    # "pixtral-large-latest"
    # "llama4-maverick-instruct-basic"
    # "llama4-scout-instruct-basic"
)

# Locations of inputs and outputs
OUTPUT_DIR="./eval_no_python_processing_results"
INPUT_DIR="eval_no_python_processing"
CLEAN_RUBRICS="data/single_turn_data_corrected_with_rubrics_weights.json"

# Judge settings forwarded to run_judge_single_turn.py
NUM_WORKERS=32
JUDGE_MODEL="o4-mini"

# Ensure the results directory is present before anything is written to it
mkdir -p "$OUTPUT_DIR"

# Run banner: one line per printf argument
printf '%s\n' \
    "Starting batch judge evaluation..." \
    "Judge model: $JUDGE_MODEL" \
    "Number of workers: $NUM_WORKERS" \
    "Clean rubrics: $CLEAN_RUBRICS" \
    "Models to evaluate: ${#models[@]}" \
    "----------------------------------------"

# Counters. Every model ends up in exactly one of completed/skipped/failed,
# so the three always add up to total_models.
total_models=${#models[@]}
completed_models=0   # judge ran in this invocation and exited with status 0
skipped_models=0     # results file already present, judge not re-run
failed_models=0      # input file missing, or judge exited non-zero
model_index=0        # 1-based position in the list, used for progress output

# Loop through each model
for model in "${models[@]}"; do
    # Advance the progress index for every model, whatever its outcome
    model_index=$((model_index + 1))

    echo ""
    echo "=== Processing model: $model ($model_index/$total_models) ==="

    # Define file paths
    model_response_file="$INPUT_DIR/${model}_w_tool_system_low_max_tool_calls_20_trial_1.json"
    eval_results_file="$OUTPUT_DIR/${model}_eval_results.json"
    eval_summary_file="$OUTPUT_DIR/${model}_eval_summary.json"

    # A missing input counts as a failure so the script exits non-zero
    if [[ ! -f "$model_response_file" ]]; then
        echo "WARNING: Model response file not found: $model_response_file"
        echo "Skipping $model..."
        failed_models=$((failed_models + 1))
        continue
    fi

    # Do not re-run the judge when a results file is already on disk
    if [[ -f "$eval_results_file" ]]; then
        echo "Results already exist for $model. Skipping..."
        skipped_models=$((skipped_models + 1))
        continue
    fi

    echo "Input file: $model_response_file"
    echo "Output file: $eval_results_file"
    echo "Summary file: $eval_summary_file"

    # Run the judge evaluation command
    python run_judge_single_turn.py \
        --model_response "$model_response_file" \
        --eval_results "$eval_results_file" \
        --eval_summary "$eval_summary_file" \
        --judge_model "$JUDGE_MODEL" \
        --num_workers "$NUM_WORKERS" \
        --clean_rubrics "$CLEAN_RUBRICS"
    # Capture the status immediately: any later command (including the test
    # below) overwrites $?, which would make the failure message misleading.
    exit_code=$?

    if [[ "$exit_code" -eq 0 ]]; then
        echo "✓ $model evaluation completed successfully!"
        completed_models=$((completed_models + 1))
    else
        echo "✗ $model evaluation failed with exit code $exit_code"
        failed_models=$((failed_models + 1))
    fi

    echo "----------------------------------------"
done

# Final summary
echo ""
echo "=== BATCH EVALUATION SUMMARY ==="
echo "Total models processed: $total_models"
echo "Successfully completed: $completed_models"
echo "Failed: $failed_models"
echo "Skipped: $skipped_models"

# Exit status reflects failures only; skipped models do not make the run fail
if [[ "$failed_models" -eq 0 ]]; then
    echo "All evaluations completed successfully! 🎉"
    exit 0
else
    echo "Some evaluations failed. Check the logs above for details."
    exit 1
fi