#!/usr/bin/env bash


# ==========================================================================================
# This script runs a round-robin "LLM-as-a-Judge" evaluation using GPT-4 for all model
# responses found in a directory. It finds all `response.jsonl` files in the subdirectories
# and runs a pairwise comparison for every unique pair.
#
# The OpenAI API key is read from the file `./openai_api_key.env`, resolved relative to the
# current working directory.
#
# Sample Usage
# bash scripts/test/gpt4_evaluation.sh --response_dir results/generated_responses --prompt harmlessness
# ==========================================================================================


# Bail out early under any other shell: later sections use bash-only features
# (arrays, `[[ ]]`, `mapfile`). The guard itself sticks to POSIX `[` so it still
# evaluates correctly when a non-bash shell is running the file.
[ -n "${BASH_VERSION}" ] || {
	echo "Please use bash to run this script." >&2
	exit 1
}

set -x # Trace each command as it is executed.
set -e # Abort as soon as any command exits with a non-zero status.

# Absolute path of the directory holding this script, and of its parent.
# NOTE(review): the sample usage in the header invokes the script from
# scripts/test/, in which case ROOT_DIR resolves to scripts/ rather than the
# repository root -- confirm that this is the directory PYTHONPATH should get.
SCRIPT_DIR="$(
	cd "$(dirname "$0")" &>/dev/null && pwd
)"
ROOT_DIR="$(dirname "${SCRIPT_DIR}")"

# Put ROOT_DIR at the front of PYTHONPATH, keeping any entries already present.
if [ -n "${PYTHONPATH}" ]; then
	PYTHONPATH="${ROOT_DIR}:${PYTHONPATH}"
else
	PYTHONPATH="${ROOT_DIR}"
fi
export PYTHONPATH

# --- Configuration & Argument Parsing ---
# Defaults. PROMPT and DRY_RUN_FLAG are initialised explicitly so that values
# inherited from the caller's environment cannot leak into the run.
RESPONSE_DIR=""
REDO_EVALUATIONS="false"
SECRET_FILE_PATH="./openai_api_key.env"
PROMPT=""
DRY_RUN_FLAG=""

#######################################
# Parse command-line options into the configuration globals.
# Globals:   RESPONSE_DIR, PROMPT, REDO_EVALUATIONS, DRY_RUN_FLAG (written)
# Arguments: the script's command-line arguments
#            --response_dir DIR | --response_dir=DIR
#            --prompt NAME      | --prompt=NAME
#            --redo
#            --dry_run
# Outputs:   an error message on stderr for an unknown option or a missing value
# Returns:   0 on success; exits the script with status 1 on a usage error
#######################################
parse_args() {
	local arg
	while [[ "$#" -gt 0 ]]; do
		arg="$1"
		shift
		case "${arg}" in
			--response_dir)
				# Report a missing value explicitly rather than dying on `shift`.
				if [[ "$#" -eq 0 ]]; then
					echo "Error: ${arg} requires a value." >&2
					exit 1
				fi
				RESPONSE_DIR="$1"
				shift
				;;
			--response_dir=*)
				RESPONSE_DIR="${arg#*=}"
				;;
			--prompt)
				if [[ "$#" -eq 0 ]]; then
					echo "Error: ${arg} requires a value." >&2
					exit 1
				fi
				PROMPT="$1"
				shift
				;;
			--prompt=*)
				PROMPT="${arg#*=}"
				;;
			--redo)
				REDO_EVALUATIONS="true"
				;;
			--dry_run)
				# Stored as the literal flag that is later forwarded to the evaluator.
				DRY_RUN_FLAG="--dry_run"
				;;
			*)
				echo "Unknown parameter passed: '${arg}'" >&2
				exit 1
				;;
		esac
	done
}

parse_args "$@"

# --- Validate Arguments and API Key ---
# All hard failures are checked first, so the 5-second grace period of the
# directory-name warning below is never spent on a run that cannot start.
if [[ -z "${RESPONSE_DIR}" ]]; then
	echo "Error: --response_dir is required." >&2
	exit 1
fi

# Fail with a clear message here instead of relying on `find` to complain later.
if [[ ! -d "${RESPONSE_DIR}" ]]; then
	echo "Error: response directory not found: ${RESPONSE_DIR}" >&2
	exit 1
fi

# The secret file must exist as a regular file and be readable.
if [[ ! -f "${SECRET_FILE_PATH}" || ! -r "${SECRET_FILE_PATH}" ]]; then
	echo "Error: OpenAI API key file not found or not readable at ${SECRET_FILE_PATH}" >&2
	echo "Please ensure the file exists and you have read permissions." >&2
	exit 1
fi

# Safety Check: Ensure the directory is named `generated_responses` to prevent accidental loops.
# This is only a warning: the user gets 5 seconds to abort, then the run continues.
if [[ "$(basename "${RESPONSE_DIR}")" != "generated_responses" ]]; then
	echo "Warning: For safety, this script is designed to read from a directory named 'generated_responses'." >&2
	echo "You provided: ${RESPONSE_DIR}" >&2
	echo "Continuing in 5 seconds... (Press Ctrl+C to cancel)" >&2
	sleep 5
fi

# --- Find Response Files ---
# Collect every `<RESPONSE_DIR>/<model>/response.jsonl` (exactly one directory
# level below RESPONSE_DIR). `find` emits entries in no guaranteed order, so the
# list is sorted byte-wise: this keeps the `<A>_vs_<B>` pairing, and therefore
# the "results already exist" check in the main loop, stable from run to run.
mapfile -t RESPONSE_FILES < <(
    find "${RESPONSE_DIR}" -mindepth 2 -maxdepth 2 -type f -name "response.jsonl" \
        | LC_ALL=C sort
)

# A pairwise comparison needs at least two participants.
if ((${#RESPONSE_FILES[@]} < 2)); then
    echo "Error: Need at least two 'response.jsonl' files in the subdirectories of ${RESPONSE_DIR} to run a comparison." >&2
    exit 1
fi

echo "Found ${#RESPONSE_FILES[@]} response files. Starting pairwise GPT-4 evaluations..."
printf " - %s\n" "${RESPONSE_FILES[@]}"

# --- Determine Output Directory ---
# Results are written next to the response directory, not inside it:
#   <parent of RESPONSE_DIR>/gpt4_tournament/<PROMPT>/
PARENT_DIR="$(dirname "${RESPONSE_DIR}")"
TOURNAMENT_BASE_DIR="${PARENT_DIR}/gpt4_tournament"
EVALUATION_ROOT_DIR="${TOURNAMENT_BASE_DIR}/${PROMPT}"
printf '%s\n' "GPT-4 evaluation results will be saved in subdirectories under: ${EVALUATION_ROOT_DIR}"

# Load the API key from the secret file. This script enables `set -x`, and the
# trace of the assignment (or of any test on the variable) would print the key
# to stderr, so tracing is switched off while the key is handled and restored
# afterwards only if it was on.
# NOTE(review): the whole file content is used as the key -- confirm the file
# holds the bare key and not a `NAME=value` line.
_xtrace_was_on=0
case "$-" in *x*) _xtrace_was_on=1 ;; esac
{ set +x; } 2>/dev/null
API_KEY="$(<"${SECRET_FILE_PATH}")"
if [[ -z "${API_KEY}" ]]; then
    echo "Error: OpenAI API key file is empty: ${SECRET_FILE_PATH}" >&2
    exit 1
fi
if [[ "${_xtrace_was_on}" -eq 1 ]]; then
    set -x
fi
unset _xtrace_was_on

# --- Main Evaluation Loop ---

#######################################
# Run the GPT-4 judge on one pair of response files.
#
# A pair whose output directory already exists is skipped unless --redo was
# given. While a pair is being evaluated its directory contains an
# `.incomplete` marker file; the marker is removed only after the evaluator
# succeeds, so a failed or interrupted attempt is re-run next time instead of
# being mistaken for finished results.
#
# Globals:   EVALUATION_ROOT_DIR, REDO_EVALUATIONS, API_KEY, PROMPT,
#            DRY_RUN_FLAG (all read)
# Arguments: $1 - response file of the first model
#            $2 - response file of the second model
# Outputs:   progress messages on stdout; evaluator stdout/stderr are passed
#            through and copied to stdout.log / stderr.log in the pair directory
# Returns:   0 when the pair was evaluated or skipped; exits the script with
#            the evaluator's exit status if the evaluator fails
#######################################
evaluate_pair() {
    local model_1_file="$1"
    local model_2_file="$2"
    local model_1_name model_2_name pair_output_dir incomplete_marker status

    # The model name is the name of the directory containing the response file.
    model_1_name="$(basename "$(dirname "${model_1_file}")")"
    model_2_name="$(basename "$(dirname "${model_2_file}")")"

    echo "------------------------------------------------------------"
    echo "GPT-4 Evaluation: ${model_1_name} vs. ${model_2_name}"
    echo "------------------------------------------------------------"

    pair_output_dir="${EVALUATION_ROOT_DIR}/${model_1_name}_vs_${model_2_name}"
    incomplete_marker="${pair_output_dir}/.incomplete"

    if [[ -d "${pair_output_dir}" ]]; then
        if [[ -e "${incomplete_marker}" ]]; then
            echo "Re-running evaluation for ${model_1_name} vs. ${model_2_name}: the previous attempt did not complete."
        elif [[ "${REDO_EVALUATIONS}" == "true" ]]; then
            echo "Re-running evaluation for ${model_1_name} vs. ${model_2_name} as requested by --redo flag."
        else
            echo "Skipping ${model_1_name} vs. ${model_2_name}: Results already exist. Use --redo to re-run."
            echo
            return 0
        fi
    fi

    mkdir -p "${pair_output_dir}"
    : >"${incomplete_marker}"

    # The evaluator runs in a subshell with command tracing switched off, so
    # that `set -x` does not print the API key; the parent shell's tracing
    # state is unaffected. The key is still passed as a command-line argument.
    # NOTE(review): consider a channel other than argv if eval2.py supports one.
    # DRY_RUN_FLAG expands to nothing when empty or unset.
    status=0
    (
        { set +x; } 2>/dev/null
        python3 -u safe_rlhf/evaluate/gpt4/eval2.py \
            --model_1_response_file "${model_1_file}" \
            --model_2_response_file "${model_2_file}" \
            --output_dir "${pair_output_dir}" \
            --openai_api_key "${API_KEY}" \
            --prompt "${PROMPT}" \
            ${DRY_RUN_FLAG:+"${DRY_RUN_FLAG}"}
    ) > >(tee "${pair_output_dir}/stdout.log") 2> >(tee "${pair_output_dir}/stderr.log" >&2) || status=$?

    if [[ "${status}" -ne 0 ]]; then
        # Leave the marker in place so the next run retries this pair.
        echo "Error: GPT-4 evaluation failed for ${model_1_name} vs. ${model_2_name} (exit status ${status})." >&2
        exit "${status}"
    fi
    rm -f -- "${incomplete_marker}"

    echo "GPT-4 evaluation complete for ${model_1_name} vs. ${model_2_name}."
    echo "Results saved in ${pair_output_dir}"
    echo
}

# Round-robin: every unordered pair (i < j) is evaluated exactly once.
num_files=${#RESPONSE_FILES[@]}
for ((i = 0; i < num_files; i++)); do
    for ((j = i + 1; j < num_files; j++)); do
        evaluate_pair "${RESPONSE_FILES[i]}" "${RESPONSE_FILES[j]}"
    done
done

echo "All pairwise GPT-4 evaluations are complete."