#!/usr/bin/env bash
set -euo pipefail

# End-to-end pipeline:
# 1) Generate model answers (gen_api_answer.py)
# 2) Rewrite answers with defense (dj_defense.py) [default: increase, set REWRITE_DIRECTION=decrease for decrease]
# 3) Generate GPT-4 judgments (gen_judgment.py) for both before+after (non-interactive)
# 4) Compare metrics (compare_judgments_before_after.py)
#
# Defaults:
# - generation: use OpenAI API (model id from MODEL, default gpt-4) - set GEN_API_BASE to use local server
# - rewrite:     use local vLLM server http://localhost:8000/v1 (override with REWRITE_SERVER_URL*)
# - rewrite (optional): set REWRITE_WITH_OPENAI=1 to use OpenAI official API (default model: gpt-4.1-mini)

# Resolve the directory that contains this script and make it the working
# directory, so the relative paths used below (data/..., gen_api_answer.py,
# ../defense/dj_defense.py) do not depend on where the script is invoked from.
#
# CDPATH is cleared for the lookup: when CDPATH is exported and the script is
# invoked through a relative path, `cd` may resolve to a different directory and
# print it to stdout, which would end up inside the captured value.
# `--` keeps a path that starts with '-' from being parsed as an option.
SCRIPT_DIR="$(CDPATH='' cd -- "$(dirname -- "${BASH_SOURCE[0]}")" && pwd)"
cd -- "$SCRIPT_DIR"

# Every setting below can be overridden from the environment; a value that is
# unset or empty falls back to the default shown.

# --- Generation ---------------------------------------------------------------
: "${BENCH_NAME:=mt_bench}"
: "${MODEL:=gpt-4}"                  # model id; also names the files under data/<bench-name>/
: "${GEN_PARALLEL:=8}"
: "${GEN_MAX_TOKENS:=1024}"

# Generation endpoint. Empty (default) means the official OpenAI API; a
# non-empty value is forwarded to gen_api_answer.py as --openai-api-base.
: "${GEN_API_BASE:=}"

# --- Rewrite (defense) --------------------------------------------------------
: "${REWRITE_DIRECTION:=increase}"
: "${REWRITE_PARALLEL:=8}"

# Which backend performs the rewrite:
#   REWRITE_WITH_OPENAI=0 -> local vLLM (OpenAI-compatible), http://localhost:8000/v1 by default
#   REWRITE_WITH_OPENAI=1 -> OpenAI official API (https://api.openai.com/v1), model REWRITE_OPENAI_MODEL
: "${REWRITE_WITH_OPENAI:=0}"
: "${REWRITE_OPENAI_MODEL:=gpt-4.1-mini}"

# Model name sent to the local rewrite server. Left empty here; Step 2 then
# falls back to the direction name (increase|decrease).
# (Per the original note, dj_defense.py itself already defaults
# rewrite-server-url to http://localhost:8000/v1.)
: "${REWRITE_MODEL:=}"

# --- Judging ------------------------------------------------------------------
: "${JUDGE_MODEL:=gpt-4}"
: "${JUDGE_PARALLEL:=8}"
: "${FIRST_N:=}"                     # optional, e.g. 5; forwarded as --first-n when non-empty

# --- Re-run control -----------------------------------------------------------
# OVERWRITE=1 removes existing outputs first; 0 (default) keeps them and skips
# any step whose output is already present.
: "${OVERWRITE:=0}"

# REWRITE_EVAL_ONLY=1 redoes only rewrite + judgment + compare:
#   - Step 1 (generation) is skipped and the existing answers are reused
#   - rewritten answers and their judgments are deleted so Steps 2/3 rerun
: "${REWRITE_EVAL_ONLY:=0}"

# SKIP_BEFORE_JUDGE=1 (default): don't re-judge the baseline model if a
# baseline judgment file already exists.
: "${SKIP_BEFORE_JUDGE:=1}"
#
# Baseline judgments selection:
# Prefer reusing an existing baseline judgment file, so a new baseline is not
# generated on every run.
# - Set it explicitly:
#     BASELINE_JUDGE_FILE=...  BASELINE_MODEL=...
# - Or let it be auto-picked:
#     If MODEL=gpt-4 and JUDGE_MODEL=gpt-4 and the benchmark's own
#     data/<bench-name>/model_judgment/gpt-4_single.jsonl exists, use that file.
BASELINE_JUDGE_FILE="${BASELINE_JUDGE_FILE:-}"
BASELINE_MODEL="${BASELINE_MODEL:-${MODEL}}"

#######################################
# Validate or auto-select the baseline ("before") judgment file.
# Globals:
#   BENCH_NAME, MODEL, JUDGE_MODEL (read)
#   BASELINE_JUDGE_FILE, BASELINE_MODEL (read, possibly written)
# Outputs:
#   Error message on stderr when an explicit baseline file is missing.
# Returns:
#   0 on success; exits the script with status 3 when BASELINE_JUDGE_FILE is
#   set but does not exist.
#######################################
resolve_baseline_judgments() {
  # The candidate lives under the selected benchmark, so judgments from one
  # benchmark are never used as the baseline for another.
  local bundled="data/${BENCH_NAME}/model_judgment/gpt-4_single.jsonl"

  if [[ -n "${BASELINE_JUDGE_FILE}" ]]; then
    # Fail before any generation/judging work is started rather than at the
    # final compare step.
    if [[ ! -f "${BASELINE_JUDGE_FILE}" ]]; then
      echo "ERROR: BASELINE_JUDGE_FILE not found: ${BASELINE_JUDGE_FILE}" >&2
      exit 3
    fi
    return 0
  fi

  if [[ "${MODEL}" == "gpt-4" && "${JUDGE_MODEL}" == "gpt-4" && -f "${bundled}" ]]; then
    BASELINE_JUDGE_FILE="${bundled}"
    BASELINE_MODEL="gpt-4"
  fi
}

resolve_baseline_judgments

# Output layout. "before" files belong to the original model, "after" files to
# the defended variant, whose id is "<MODEL>-defended-<REWRITE_DIRECTION>".
MODEL_AFTER="${MODEL}-defended-${REWRITE_DIRECTION}"

bench_root="data/${BENCH_NAME}"
ANSWER_DIR="${bench_root}/model_answer"
JUDGE_DIR="${bench_root}/model_judgment"

# Answer files are named after the model id.
printf -v ANSWER_BEFORE '%s/%s.jsonl' "${ANSWER_DIR}" "${MODEL}"
printf -v ANSWER_AFTER '%s/%s.jsonl' "${ANSWER_DIR}" "${MODEL_AFTER}"

# Judgment files are named "<judge>_single_<judged model>.jsonl".
printf -v JUDGE_BEFORE_OUT '%s/%s_single_%s.jsonl' "${JUDGE_DIR}" "${JUDGE_MODEL}" "${MODEL}"
printf -v JUDGE_AFTER_OUT '%s/%s_single_%s.jsonl' "${JUDGE_DIR}" "${JUDGE_MODEL}" "${MODEL_AFTER}"

# Print the effective configuration once, one line per argument, so a run's
# inputs and output locations are visible at the top of the log.
printf '%s\n' \
  "=== MT-bench defense pipeline ===" \
  "bench        : ${BENCH_NAME}" \
  "model(before): ${MODEL}" \
  "model(after) : ${MODEL_AFTER}" \
  "gen_api_base : ${GEN_API_BASE}" \
  "rewrite_dir  : ${REWRITE_DIRECTION}" \
  "judge_model  : ${JUDGE_MODEL}" \
  "outputs:" \
  "  answers(before): ${ANSWER_BEFORE}" \
  "  answers(after) : ${ANSWER_AFTER}" \
  "  judgments(before): ${JUDGE_BEFORE_OUT}" \
  "  judgments(after) : ${JUDGE_AFTER_OUT}" \
  "rewrite_eval_only: ${REWRITE_EVAL_ONLY}" \
  "skip_before_judge: ${SKIP_BEFORE_JUDGE}" \
  "baseline_judge_file: ${BASELINE_JUDGE_FILE:-<pipeline>}" \
  "baseline_model     : ${BASELINE_MODEL}" \
  ""

# Make sure both output directories exist before any step writes into them.
mkdir -p "${ANSWER_DIR}" "${JUDGE_DIR}"

#######################################
# Remove outputs that must be regenerated, according to OVERWRITE and
# REWRITE_EVAL_ONLY.
#
# OVERWRITE=1 removes the pipeline outputs. When REWRITE_EVAL_ONLY=1 is set as
# well, the generated answers are kept: that mode reuses them and Step 1 will
# not recreate them, so deleting them would make the run fail right away.
#
# REWRITE_EVAL_ONLY=1 requires the generated answers to exist, removes the
# rewritten answers and their judgments, and removes the baseline judgments
# only when SKIP_BEFORE_JUDGE is not 1.
#
# Globals:
#   OVERWRITE, REWRITE_EVAL_ONLY, SKIP_BEFORE_JUDGE (read)
#   ANSWER_BEFORE, ANSWER_AFTER, JUDGE_BEFORE_OUT, JUDGE_AFTER_OUT (read)
# Outputs:
#   Progress messages on stdout, errors on stderr.
# Returns:
#   0 on success; exits the script with status 2 when REWRITE_EVAL_ONLY=1 and
#   the generated answers are missing.
#######################################
reset_stale_outputs() {
  if [[ "${OVERWRITE}" == "1" ]]; then
    echo "OVERWRITE=1: removing existing outputs (if any)"
    if [[ "${REWRITE_EVAL_ONLY}" == "1" ]]; then
      echo "  REWRITE_EVAL_ONLY=1: keeping generated answers: ${ANSWER_BEFORE}"
      rm -f -- "${ANSWER_AFTER}" "${JUDGE_BEFORE_OUT}" "${JUDGE_AFTER_OUT}"
    else
      rm -f -- "${ANSWER_BEFORE}" "${ANSWER_AFTER}" "${JUDGE_BEFORE_OUT}" "${JUDGE_AFTER_OUT}"
    fi
  fi

  if [[ "${REWRITE_EVAL_ONLY}" == "1" ]]; then
    if [[ ! -f "${ANSWER_BEFORE}" ]]; then
      echo "ERROR: REWRITE_EVAL_ONLY=1 but missing generated answers: ${ANSWER_BEFORE}" >&2
      echo "  Run once without REWRITE_EVAL_ONLY or set MODEL to match an existing answer file." >&2
      exit 2
    fi
    echo "REWRITE_EVAL_ONLY=1: will reuse ${ANSWER_BEFORE} and rerun rewrite + judge"
    rm -f -- "${ANSWER_AFTER}" "${JUDGE_AFTER_OUT}"
    if [[ "${SKIP_BEFORE_JUDGE}" != "1" ]]; then
      rm -f -- "${JUDGE_BEFORE_OUT}"
    fi
  fi
}

reset_stale_outputs

# Step 1: generate the baseline ("before") answers with gen_api_answer.py.
# Skipped when REWRITE_EVAL_ONLY=1 or when ${ANSWER_BEFORE} already exists.
# The header is backend-neutral because generation can target either the
# official OpenAI API or a custom endpoint (GEN_API_BASE); the branch below
# reports which one is used.
echo "== Step 1: Generate model answers =="
if [[ "${REWRITE_EVAL_ONLY}" == "1" ]]; then
  echo "  Skip: REWRITE_EVAL_ONLY=1"
elif [[ -f "${ANSWER_BEFORE}" ]]; then
  echo "  Skip: answers already exist: ${ANSWER_BEFORE}"
else
  # Arguments are collected in an array so each value stays a single word.
  GEN_ARGS=(--bench-name "${BENCH_NAME}" --model "${MODEL}" --parallel "${GEN_PARALLEL}" --max-tokens "${GEN_MAX_TOKENS}")
  if [[ -n "${GEN_API_BASE}" ]]; then
    # A non-empty GEN_API_BASE redirects generation to that endpoint.
    GEN_ARGS+=(--openai-api-base "${GEN_API_BASE}")
    echo "  Using local API: ${GEN_API_BASE}"
  else
    echo "  Using OpenAI API (official)"
  fi
  # NOTE(review): presumably writes ${ANSWER_BEFORE}; the output path is not
  # passed explicitly, so confirm against gen_api_answer.py.
  python3 gen_api_answer.py "${GEN_ARGS[@]}"
fi

echo ""
# The header is backend-neutral: the rewrite may run against a local
# OpenAI-compatible server or the official OpenAI API (REWRITE_WITH_OPENAI=1).
# The selected backend is reported below.
echo "== Step 2: Rewrite answers with defense =="

#######################################
# Choose the rewrite backend and publish the choice in
# REWRITE_SERVER_URL_SELECTED, REWRITE_MODEL_SELECTED, REWRITE_API_KEY_SELECTED.
#
# OpenAI backend (REWRITE_WITH_OPENAI=1):
#   fixed URL https://api.openai.com/v1, model REWRITE_OPENAI_MODEL, key from
#   REWRITE_API_KEY or OPENAI_API_KEY (one of them is required).
# Local backend (anything else), URL precedence:
#   1. REWRITE_SERVER_URL                 (applies to both directions)
#   2. REWRITE_SERVER_URL_<DIRECTION>     (REWRITE_SERVER_URL_INCREASE / _DECREASE)
#   3. REWRITE_SERVER_URL_INCREASE        (kept as a fallback for "decrease")
#   4. http://localhost:8000/v1
#   The model defaults to the direction name unless REWRITE_MODEL is set.
#
# Globals:
#   REWRITE_WITH_OPENAI, REWRITE_OPENAI_MODEL, REWRITE_DIRECTION, REWRITE_MODEL,
#   REWRITE_SERVER_URL, REWRITE_SERVER_URL_INCREASE, REWRITE_SERVER_URL_DECREASE,
#   REWRITE_API_KEY, OPENAI_API_KEY (read)
#   REWRITE_SERVER_URL_SELECTED, REWRITE_MODEL_SELECTED, REWRITE_API_KEY_SELECTED (written)
# Outputs:
#   One "Rewrite backend" line on stdout; error on stderr.
# Returns:
#   0 on success; exits the script with status 4 when the OpenAI backend is
#   requested without an API key.
#######################################
select_rewrite_backend() {
  if [[ "${REWRITE_WITH_OPENAI}" == "1" ]]; then
    REWRITE_SERVER_URL_SELECTED="https://api.openai.com/v1"
    REWRITE_MODEL_SELECTED="${REWRITE_OPENAI_MODEL}"
    REWRITE_API_KEY_SELECTED="${REWRITE_API_KEY:-${OPENAI_API_KEY:-}}"
    if [[ -z "${REWRITE_API_KEY_SELECTED}" ]]; then
      echo "ERROR: REWRITE_WITH_OPENAI=1 but missing API key. Set OPENAI_API_KEY or REWRITE_API_KEY." >&2
      exit 4
    fi
    echo "  Rewrite backend    : OpenAI"
  else
    # Direction-specific URL override, if one is set for the active direction.
    local direction_url=""
    case "${REWRITE_DIRECTION}" in
      increase) direction_url="${REWRITE_SERVER_URL_INCREASE:-}" ;;
      decrease) direction_url="${REWRITE_SERVER_URL_DECREASE:-}" ;;
    esac
    REWRITE_SERVER_URL_SELECTED="${REWRITE_SERVER_URL:-${direction_url:-${REWRITE_SERVER_URL_INCREASE:-http://localhost:8000/v1}}}"
    REWRITE_MODEL_SELECTED="${REWRITE_MODEL:-${REWRITE_DIRECTION}}"
    REWRITE_API_KEY_SELECTED="${REWRITE_API_KEY:-}"
    echo "  Rewrite backend    : local"
  fi
}

if [[ -f "${ANSWER_AFTER}" ]]; then
  echo "  Skip: rewritten answers already exist: ${ANSWER_AFTER}"
else
  select_rewrite_backend

  # NOTE(review): the API key is passed on the command line and is therefore
  # visible in the process list; consider an environment-based option if
  # dj_defense.py offers one.
  # --overwrite is always passed; whether to rewrite at all was already decided
  # by the existence check above.
  REWRITE_ARGS=(
    --bench-name "${BENCH_NAME}"
    --in-file "${ANSWER_BEFORE}"
    --out-model "${MODEL_AFTER}"
    --direction "${REWRITE_DIRECTION}"
    --rewrite-model "${REWRITE_MODEL_SELECTED}"
    --rewrite-server-url "${REWRITE_SERVER_URL_SELECTED}"
    --rewrite-api-key "${REWRITE_API_KEY_SELECTED}"
    --parallel "${REWRITE_PARALLEL}"
    --overwrite
  )

  # Optional prompt overrides (default prompt path is Decoy-for-the-Judge/prompt/system_prompt_{increase,decrease}.txt)
  if [[ -n "${SYSTEM_PROMPT_INCREASE:-}" ]]; then
    REWRITE_ARGS+=(--system-prompt-increase "${SYSTEM_PROMPT_INCREASE}")
  fi
  if [[ -n "${SYSTEM_PROMPT_DECREASE:-}" ]]; then
    REWRITE_ARGS+=(--system-prompt-decrease "${SYSTEM_PROMPT_DECREASE}")
  fi

  echo "  Using rewrite server: ${REWRITE_SERVER_URL_SELECTED}"
  echo "  Using rewrite model : ${REWRITE_MODEL_SELECTED}"
  python3 ../defense/dj_defense.py "${REWRITE_ARGS[@]}"
fi

#######################################
# Run gen_judgment.py in single-answer mode for one model.
# Globals:
#   BENCH_NAME, JUDGE_MODEL, JUDGE_PARALLEL, FIRST_N (read)
# Arguments:
#   $1 - model id to judge (passed as --model-list)
#   $2 - judgment output file (passed as --output-file)
# Returns:
#   Exit status of gen_judgment.py.
#######################################
run_single_judgment() {
  local judged_model="$1"
  local out_file="$2"
  local -a judge_args=(
    --bench-name "${BENCH_NAME}"
    --mode single
    --judge-model "${JUDGE_MODEL}"
    --parallel "${JUDGE_PARALLEL}"
    --yes
    --output-file "${out_file}"
    --model-list "${judged_model}"
  )
  # FIRST_N is optional; only forward it when a value was given.
  if [[ -n "${FIRST_N}" ]]; then
    judge_args+=(--first-n "${FIRST_N}")
  fi
  python3 gen_judgment.py "${judge_args[@]}"
}

echo ""
echo "== Step 3: Generate GPT-4 judgments (non-interactive) =="

# FastChat expects reference answers keyed by the judge model name for math
# categories. When the selected judge has no reference file of its own but the
# gpt-4 one exists, expose the gpt-4 file under the judge's name via a symlink.
REF_DIR="data/${BENCH_NAME}/reference_answer"
mkdir -p "${REF_DIR}"
judge_reference="${REF_DIR}/${JUDGE_MODEL}.jsonl"
if [[ -f "${REF_DIR}/gpt-4.jsonl" && ! -f "${judge_reference}" ]]; then
  ln -sf "gpt-4.jsonl" "${judge_reference}"
  echo "  Linked reference answers: ${judge_reference} -> gpt-4.jsonl"
fi

# BEFORE (baseline) judgments. The decision is the same for either value of
# SKIP_BEFORE_JUDGE -- an external baseline or an existing file means no
# judging -- only the log messages differ.
if [[ -n "${BASELINE_JUDGE_FILE}" ]]; then
  if [[ "${SKIP_BEFORE_JUDGE}" != "1" ]]; then
    echo "  Skip: BASELINE_JUDGE_FILE is set (will not judge BEFORE in pipeline)"
  else
    echo "  Skip: BASELINE_JUDGE_FILE is set (will use external baseline)"
  fi
elif [[ -f "${JUDGE_BEFORE_OUT}" ]]; then
  # With SKIP_BEFORE_JUDGE=1 an existing baseline is reused silently.
  if [[ "${SKIP_BEFORE_JUDGE}" != "1" ]]; then
    echo "  Skip: before-judgments already exist: ${JUDGE_BEFORE_OUT}"
  fi
else
  if [[ "${SKIP_BEFORE_JUDGE}" != "1" ]]; then
    echo "  Judging BEFORE: ${MODEL}"
  else
    # A baseline is still needed for the comparison; generate it once.
    echo "  Baseline judgments missing; generating once: ${MODEL} -> ${JUDGE_BEFORE_OUT}"
  fi
  run_single_judgment "${MODEL}" "${JUDGE_BEFORE_OUT}"
fi

# AFTER (defended) judgments.
if [[ ! -f "${JUDGE_AFTER_OUT}" ]]; then
  echo "  Judging AFTER: ${MODEL_AFTER}"
  run_single_judgment "${MODEL_AFTER}" "${JUDGE_AFTER_OUT}"
else
  echo "  Skip: after-judgments already exist: ${JUDGE_AFTER_OUT}"
fi

echo ""
echo "== Step 4: Compare metrics (before vs after) =="

# Baseline side of the comparison: the pipeline's own before-judgments by
# default, or the external baseline file/model when BASELINE_JUDGE_FILE is set.
compare_before_file="${JUDGE_BEFORE_OUT}"
compare_before_model="${MODEL}"
if [[ -n "${BASELINE_JUDGE_FILE}" ]]; then
  if [[ ! -f "${BASELINE_JUDGE_FILE}" ]]; then
    echo "ERROR: BASELINE_JUDGE_FILE not found: ${BASELINE_JUDGE_FILE}" >&2
    exit 3
  fi
  compare_before_file="${BASELINE_JUDGE_FILE}"
  compare_before_model="${BASELINE_MODEL}"
fi

COMPARE_ARGS=(
  --bench-name "${BENCH_NAME}"
  --judge-file-before "${compare_before_file}"
  --judge-file-after "${JUDGE_AFTER_OUT}"
  --before "${compare_before_model}"
  --after "${MODEL_AFTER}"
)
python3 compare_judgments_before_after.py "${COMPARE_ARGS[@]}"

echo ""
echo "Done."


