#!/usr/bin/env python3
"""
comparative_analysis_2.py

Simplified comparative evaluation pipeline:
- Extract pairwise judge winner columns from CSV files.
- Aggregate pairwise win statistics and per-model win shares.
- Perform binomial tests (with FDR).
- Produce several diagnostic plots (pairwise heatmaps, overall win rates, rank correlations, entropy).

This is a cleaned / fixed version of the previously malformed file:
  * Correct indentation
  * Repaired regex, glob patterns, arithmetic, try/except
  * Added defensive checks
"""

import os
import re
import glob
import argparse
import logging
from dataclasses import dataclass
from typing import List, Optional, Dict, Tuple

import yaml
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from scipy.stats import spearmanr

# ---- Binomial test compatibility (SciPy renamed binom_test -> binomtest) ----
# Newer SciPy exposes ``binomtest`` (returns a result object carrying
# ``.pvalue``); older releases only have ``binom_test`` (returns the p-value
# directly). Either way this module ends up with one ``binom_test`` callable
# that returns a plain float.
try:
    from scipy.stats import binomtest as _scipy_binomtest
except ImportError:
    # Older SciPy. Only a missing name triggers the fallback; any other
    # failure while importing SciPy is a real problem and must surface.
    from scipy.stats import binom_test as _legacy_binom_test  # type: ignore

    def binom_test(k: int, n: int, p: float = 0.5, alternative: str = "two-sided") -> float:
        """Return the exact binomial test p-value for ``k`` successes in ``n`` trials."""
        return float(_legacy_binom_test(k, n, p=p, alternative=alternative))
else:

    def binom_test(k: int, n: int, p: float = 0.5, alternative: str = "two-sided") -> float:
        """Return the exact binomial test p-value for ``k`` successes in ``n`` trials."""
        return float(_scipy_binomtest(k, n, p=p, alternative=alternative).pvalue)

from itertools import combinations
from math import log2


# --------------------------------------------------------------------------- #
# Config dataclass
# --------------------------------------------------------------------------- #
@dataclass
class Config:
    """Run-time settings for the comparative pipeline, as built by ``load_config``."""

    # Model identifiers; a comparison column is only used when both of its
    # models appear in this list. Also defines heatmap axis order.
    models: List[str]
    # Judge identifiers; comparison columns from other judges are ignored.
    judges: List[str]
    # Directory that is searched for CSV files.
    csv_root: str
    # Search sub-directories of ``csv_root`` as well.
    recursive: bool
    # Case-insensitive substrings; when non-empty, a CSV is kept only if its
    # base name contains at least one of them.
    filters: List[str]
    # When True, files whose language cannot be inferred are skipped;
    # otherwise they are labelled "unknown".
    skip_unknown_language: bool
    # Language entries; each is read for "name" and "match_substrings".
    languages: List[Dict]
    # "half" (tie = half a win each), "ignore" (ties dropped from the
    # denominator); any other value leaves ties in the denominator only.
    tie_handling: str
    # Directory receiving all CSV and figure outputs.
    output_dir: str
    # Resolution used when saving figures.
    dpi: int
    # Style name handed to ``sns.set_style``.
    style: str
    # Also produce per-language tables and the per-language bar plot.
    per_language: bool
    # Palette name used by the win-rate bar plots.
    palette: str
    # Logging level name, e.g. "INFO".
    log_level: str
    # Minimum trials for a binomial test, and minimum shared items for a
    # judge-pair kappa; below it the result is NaN.
    min_matches_for_test: int


# Column pattern: answer_<m1>_AND_answer_<m2>_comparison_by_<judge>_judge_winner
# Each of <m1>, <m2> and <judge> must consist solely of ASCII letters and
# digits, so identifiers containing "_", "-" or "." do not match and such
# columns are ignored. The pattern is anchored at both ends.
COL_PATTERN = re.compile(
    r"^answer_(?P<m1>[A-Za-z0-9]+)_AND_answer_(?P<m2>[A-Za-z0-9]+)_comparison_by_(?P<judge>[A-Za-z0-9]+)_judge_winner$"
)


# --------------------------------------------------------------------------- #
# Configuration / IO helpers
# --------------------------------------------------------------------------- #
def load_config(path: str) -> Config:
    """Read the YAML file at *path* and build a :class:`Config`.

    Sections ``evaluation``, ``plots``, ``logging`` and ``comparative`` are
    optional; a section that is missing or explicitly null is treated as an
    empty mapping, and list-valued settings that are null become empty lists.
    ``data.csv_root`` is mandatory.

    Raises:
        KeyError: if ``data.csv_root`` is not present.
        OSError: if *path* cannot be opened.
    """
    with open(path, "r", encoding="utf-8") as f:
        # safe_load yields None for an empty document; fall back to {} so the
        # lookups below fail (if at all) with a meaningful KeyError.
        raw = yaml.safe_load(f) or {}

    def section(name: str) -> dict:
        # "plots:" with nothing under it parses to None, not {}.
        return raw.get(name) or {}

    eval_raw = section("evaluation")
    plots_raw = section("plots")
    logging_raw = section("logging")
    comparative_raw = section("comparative")
    data_raw = section("data")

    if "csv_root" not in data_raw:
        raise KeyError("data.csv_root")

    # "base_models"/"evaluators" take precedence over "models"/"judges" when present.
    models = eval_raw.get("base_models", eval_raw.get("models", [])) or []
    judge_entries = eval_raw.get("evaluators", eval_raw.get("judges", [])) or []

    return Config(
        models=models,
        # Judges may be given as plain names or as mappings with a "name" key.
        judges=[j.get("name") if isinstance(j, dict) else j for j in judge_entries],
        csv_root=data_raw["csv_root"],
        recursive=data_raw.get("recursive", True),
        filters=data_raw.get("include_only_filename_substrings", []) or [],
        skip_unknown_language=data_raw.get("skip_unknown_language", True),
        languages=raw.get("languages", []) or [],
        tie_handling=comparative_raw.get("tie_handling", "half"),
        output_dir=plots_raw.get("output_dir", "output"),
        dpi=plots_raw.get("dpi", 150),
        style=plots_raw.get("style", "whitegrid"),
        per_language=plots_raw.get("per_language", True),
        palette=plots_raw.get("palette", "muted"),
        log_level=logging_raw.get("level", "INFO"),
        min_matches_for_test=comparative_raw.get("min_matches_for_test", 10),
    )


def setup_logging(level: str):
    """Configure root logging at the named *level*.

    The name is matched case-insensitively against the ``logging`` module's
    attributes; an unrecognised name falls back to ``logging.INFO``.
    """
    resolved = getattr(logging, level.upper(), logging.INFO)
    logging.basicConfig(level=resolved, format="%(asctime)s %(levelname)s %(message)s")


def ensure_dir(path: str):
    """Create directory *path* with any missing parents; an existing directory is fine."""
    os.makedirs(name=path, exist_ok=True)


def discover_csv_files(root: str, recursive: bool, filters: List[str]) -> List[str]:
    """Return the sorted paths of ``*.csv`` files found under *root*.

    Args:
        root: Directory to search. It is treated as a literal path, so glob
            metacharacters in it (``[``, ``*``, ``?``) are matched verbatim.
        recursive: Also search all sub-directories.
        filters: Optional substrings; when non-empty, a file is kept only if
            its base name contains at least one of them (case-insensitive).
    """
    pattern = "**/*.csv" if recursive else "*.csv"
    # Without escaping, a root such as "runs[2024]" is read as a character
    # class and silently matches nothing.
    files = glob.glob(os.path.join(glob.escape(root), pattern), recursive=recursive)
    if filters:
        needles = [s.lower() for s in filters]
        files = [f for f in files if any(n in os.path.basename(f).lower() for n in needles)]
    return sorted(files)


def infer_language(filename: str, lang_cfg: List[Dict], skip_unknown: bool) -> Optional[str]:
    """Map a file to a language by looking for configured substrings in its base name.

    Entries of *lang_cfg* are tried in order and the first one with a
    matching substring (case-insensitive) supplies the result via its
    ``"name"``. With no match the result is ``None`` when *skip_unknown* is
    true and ``"unknown"`` otherwise.
    """
    base = os.path.basename(filename).lower()
    for spec in lang_cfg:
        needles = spec.get("match_substrings", [])
        if any(needle.lower() in base for needle in needles):
            return spec["name"]
    return None if skip_unknown else "unknown"


# --------------------------------------------------------------------------- #
# Extraction
# --------------------------------------------------------------------------- #
def find_comparative_columns(df: pd.DataFrame) -> List[Tuple[str, str, str, str]]:
    """List the judge-winner columns of *df*.

    Returns one ``(column, m1, m2, judge)`` tuple per column whose name
    matches ``COL_PATTERN``, in column order.
    """
    candidates = ((name, COL_PATTERN.match(name)) for name in df.columns)
    return [
        (name, hit.group("m1"), hit.group("m2"), hit.group("judge"))
        for name, hit in candidates
        if hit
    ]


def process_file(path: str, cfg: Config) -> pd.DataFrame:
    """Extract one comparison event per non-empty judge verdict in a CSV file.

    Only columns recognised by ``find_comparative_columns`` whose judge is in
    ``cfg.judges`` and whose two models are both in ``cfg.models`` are used.
    A cell counts as a tie when it reads "tie"/"draw" (any case), and as a
    win when it reads ``answer_<model>`` with ``<model>`` being one of the
    two models that the column compares. Anything else is skipped.

    Returns:
        A frame with columns ``source_file`` (path relative to
        ``cfg.csv_root``), ``row_index``, ``language``, ``judge``,
        ``model_a``, ``model_b``, ``winner`` (``None`` for ties) and
        ``is_tie``; or an empty frame when the file is unreadable, has no
        usable column, has no inferable language, or yields no event.
    """
    try:
        df = pd.read_csv(path, encoding="utf-8")
    except Exception as e:
        # Best effort: one unreadable file must not abort the whole run.
        logging.warning("Skipping %s: %s", path, e)
        return pd.DataFrame()

    cols = find_comparative_columns(df)
    if not cols:
        return pd.DataFrame()

    lang = infer_language(path, cfg.languages, cfg.skip_unknown_language)
    if lang is None:
        return pd.DataFrame()

    prefix = "answer_"
    rows = []
    foreign_winners = 0
    rel = os.path.relpath(path, cfg.csv_root)

    for (col, m1, m2, judge) in cols:
        if judge not in cfg.judges:
            continue
        if m1 not in cfg.models or m2 not in cfg.models:
            continue

        for idx, val in df[col].items():
            if pd.isna(val):
                continue
            s = str(val).strip()
            if not s:
                continue
            tie = False
            winner = None
            if s.lower() in {"tie", "draw"}:
                tie = True
            elif s.startswith(prefix):
                # Strip the prefix only; str.replace would also remove any
                # later "answer_" occurrence inside the value.
                winner = s[len(prefix):]
                if winner not in (m1, m2):
                    # A verdict for a model outside this pair would count as
                    # a match that neither side won; treat it as unrecognized.
                    foreign_winners += 1
                    continue
            else:
                # Unrecognized token
                continue
            rows.append(
                {
                    "source_file": rel,
                    "row_index": int(idx),
                    "language": lang,
                    "judge": judge,
                    "model_a": m1,
                    "model_b": m2,
                    "winner": winner,
                    "is_tie": tie,
                }
            )

    if foreign_winners:
        logging.warning(
            "%s: ignored %d verdict(s) naming a model outside the compared pair",
            rel,
            foreign_winners,
        )

    if not rows:
        return pd.DataFrame()
    return pd.DataFrame(rows)


def build_events(cfg: Config) -> pd.DataFrame:
    """Collect comparison events from every discovered CSV into one frame.

    Returns an empty frame (after logging a warning) when no CSV is found or
    none of them yields an event.
    """
    csv_paths = discover_csv_files(cfg.csv_root, cfg.recursive, cfg.filters)
    if not csv_paths:
        logging.warning("No CSV files discovered.")
        return pd.DataFrame()

    per_file = (process_file(csv_path, cfg) for csv_path in csv_paths)
    frames = [frame for frame in per_file if not frame.empty]
    if not frames:
        logging.warning("No comparative events found.")
        return pd.DataFrame()

    combined = pd.concat(frames, ignore_index=True)
    judge_names = combined.judge.unique().tolist()
    logging.info("Extracted comparative events rows=%d judges=%s", len(combined), judge_names)
    return combined


# --------------------------------------------------------------------------- #
# Aggregations
# --------------------------------------------------------------------------- #
def pairwise_stats(events: pd.DataFrame, cfg: Config, per_language: bool = False) -> pd.DataFrame:
    """Summarise wins, ties and win rates per (judge, model_a, model_b).

    With *per_language* the grouping additionally splits by language. Win
    rates follow ``cfg.tie_handling``: "half" credits each side half a win
    per tie, "ignore" drops ties from the denominator (NaN when nothing is
    left), and any other value keeps ties in the denominator without credit.
    """
    if events.empty:
        return pd.DataFrame()

    keys = ["judge", "model_a", "model_b"]
    if per_language:
        keys.insert(1, "language")

    def rate(score, denom):
        # A zero or NaN denominator means "no usable matches".
        usable = bool(denom) and not np.isnan(denom)
        return float(score / denom) if usable else np.nan

    mode = cfg.tie_handling
    table = []
    for key_values, chunk in events.groupby(keys):
        n_matches = len(chunk)
        wins_a = int((chunk["winner"] == chunk["model_a"]).sum())
        wins_b = int((chunk["winner"] == chunk["model_b"]).sum())
        n_ties = int(chunk["is_tie"].sum())

        # Start from the "ties count as losses" treatment and adjust.
        denom = n_matches
        score_a, score_b = wins_a, wins_b
        if mode == "half":
            score_a = wins_a + 0.5 * n_ties
            score_b = wins_b + 0.5 * n_ties
        elif mode == "ignore":
            decisive = n_matches - n_ties
            denom = decisive if decisive > 0 else np.nan

        record = dict(zip(keys, key_values))
        record.update(
            matches_total=int(n_matches),
            a_wins=wins_a,
            b_wins=wins_b,
            ties=n_ties,
            a_win_rate=rate(score_a, denom),
            b_win_rate=rate(score_b, denom),
        )
        table.append(record)

    return pd.DataFrame(table)


def model_winshare(events: pd.DataFrame, cfg: Config, per_language: bool = False) -> pd.DataFrame:
    """Aggregate each model's wins, ties, matches and win rate per judge.

    Every event contributes one appearance for each of its two models. With
    *per_language* the result is additionally split by language. The win
    rate follows ``cfg.tie_handling`` ("half", "ignore", or ties counted in
    the denominator only).
    """
    if events.empty:
        return pd.DataFrame()

    # One record per (event, participating model).
    appearances = []
    for _, ev in events.iterrows():
        tied = bool(ev.is_tie)
        for side in (ev.model_a, ev.model_b):
            won = (not tied) and bool(ev.winner == side)
            lost = (not tied) and bool(ev.winner is not None and ev.winner != side)
            appearances.append(
                {
                    "judge": ev.judge,
                    "language": ev.language,
                    "model": side,
                    "is_win": won,
                    "is_loss": lost,
                    "is_tie": tied,
                }
            )

    long_df = pd.DataFrame(appearances)
    by = ["judge", "language", "model"] if per_language else ["judge", "model"]
    mode = cfg.tie_handling

    summary = []
    for key_values, chunk in long_df.groupby(by):
        n_wins = int(chunk.is_win.sum())
        n_ties = int(chunk.is_tie.sum())
        n_matches = int(len(chunk))

        score = n_wins + 0.5 * n_ties if mode == "half" else n_wins
        if mode == "ignore":
            decisive = n_matches - n_ties
            denom = decisive if decisive > 0 else np.nan
        else:
            denom = n_matches

        usable = bool(denom) and not np.isnan(denom)
        record = dict(zip(by, key_values))
        record.update(
            wins=n_wins,
            ties=n_ties,
            matches=n_matches,
            win_rate=float(score / denom) if usable else np.nan,
        )
        summary.append(record)

    return pd.DataFrame(summary)


# --------------------------------------------------------------------------- #
# Statistical tests (binomial + FDR)
# --------------------------------------------------------------------------- #
def benjamini_hochberg(pvals: List[float], alpha: float = 0.05):
    """Apply the Benjamini-Hochberg step-up procedure at FDR level *alpha*.

    Args:
        pvals: p-values, one per hypothesis. NaN entries are never rejected.
        alpha: Target false discovery rate.

    Returns:
        ``(rejected, p_thresh)``. ``rejected`` is a boolean array aligned
        with *pvals* (element ``i`` refers to ``pvals[i]``). ``p_thresh`` is
        the largest rejected p-value. When nothing is rejected it is the
        smallest observed p-value, reported for information only and not a
        cutoff. For empty input it is ``0.0``.
    """
    p = np.asarray(pvals, dtype=float)
    m = p.size
    if m == 0:
        return np.array([], dtype=bool), 0.0

    sorted_p = np.sort(p)
    # The k-th smallest p-value is compared against (k / m) * alpha.
    thresholds = (np.arange(1, m + 1) / m) * alpha
    below = sorted_p <= thresholds
    if not below.any():
        return np.zeros(m, dtype=bool), float(sorted_p.min())

    # Step-up: reject everything up to the largest k that passes.
    p_thresh = float(sorted_p[np.flatnonzero(below).max()])
    # The comparison is made on the input-ordered array, so the mask is
    # already aligned with pvals; it must not be permuted again.
    return p <= p_thresh, p_thresh


def pairwise_binomial_tests(pairwise_df: pd.DataFrame, cfg: Config) -> pd.DataFrame:
    """Test each pair for "model_a wins half of the time" and apply BH-FDR.

    For every row of *pairwise_df* (as produced by ``pairwise_stats``) a
    two-sided exact binomial test of ``a_wins`` successes against p = 0.5 is
    run. With tie handling "half" or "ignore" only decisive (non-tie) matches
    count as trials; otherwise all matches do, so ties act as losses for
    model_a. Rows with fewer than ``cfg.min_matches_for_test`` trials, or
    with no trials at all, are not tested and get ``binom_p = NaN``.

    The Benjamini-Hochberg correction (alpha = 0.05) is applied to the tested
    rows only. Untested rows are never rejected and do not enlarge the
    family of hypotheses.

    Returns:
        *pairwise_df*'s columns plus ``binom_p``, ``trials_for_test``,
        ``binom_fdr_reject`` and ``binom_fdr_threshold``; an empty frame for
        empty input.
    """
    if pairwise_df.empty:
        return pd.DataFrame()

    rows = []
    for _, r in pairwise_df.iterrows():
        total = int(r.matches_total)
        ties = int(r.ties)
        successes = int(r.a_wins)

        if cfg.tie_handling in ("half", "ignore"):
            # A tie carries no information about direction. If every match
            # was a tie there is nothing to test; falling back to `total`
            # would test 0 successes in `total` trials and report a
            # spuriously tiny p-value.
            trials = total - ties
        else:  # "loss"
            trials = total

        pval = np.nan
        if trials > 0 and trials >= cfg.min_matches_for_test:
            try:
                pval = binom_test(successes, trials, p=0.5, alternative="two-sided")
            except (ValueError, TypeError) as exc:
                # Keep the run alive, but leave a trace of what went wrong.
                logging.warning(
                    "Binomial test failed for %s vs %s (judge %s): %s",
                    r.get("model_a"),
                    r.get("model_b"),
                    r.get("judge"),
                    exc,
                )

        rows.append({**r.to_dict(), "binom_p": pval, "trials_for_test": trials})

    out = pd.DataFrame(rows)

    tested = out["binom_p"].notna().to_numpy()
    reject = np.zeros(len(out), dtype=bool)
    p_thresh = 0.0
    if tested.any():
        mask, p_thresh = benjamini_hochberg(out.loc[tested, "binom_p"].tolist(), alpha=0.05)
        reject[tested] = np.asarray(mask, dtype=bool)

    out["binom_fdr_reject"] = reject
    out["binom_fdr_threshold"] = float(p_thresh)
    return out


# --------------------------------------------------------------------------- #
# Plotting
# --------------------------------------------------------------------------- #
def plot_pairwise_heatmaps(pairwise_df: pd.DataFrame, cfg: Config, out_dir: str):
    """Save one win-rate heatmap per judge as ``pairwise_heatmap_<judge>.png`` in *out_dir*.

    Cell (row, column) holds the row model's win rate against the column
    model; the diagonal is fixed at 0.5. Axes follow ``cfg.models``.
    Nothing is written for empty input.
    """
    if pairwise_df.empty:
        return

    model_names = cfg.models
    n_models = len(model_names)

    for judge_name in sorted(pairwise_df.judge.unique()):
        judge_rows = pairwise_df[pairwise_df.judge == judge_name]

        grid = pd.DataFrame(index=model_names, columns=model_names, dtype=float)
        pairs = zip(
            judge_rows["model_a"],
            judge_rows["model_b"],
            judge_rows["a_win_rate"],
            judge_rows["b_win_rate"],
        )
        for left, right, left_rate, right_rate in pairs:
            grid.loc[left, right] = left_rate
            grid.loc[right, left] = right_rate
        for name in model_names:
            grid.loc[name, name] = 0.5

        plt.figure(figsize=(1 + 1.1 * n_models, 1 + 0.9 * n_models))
        sns.heatmap(grid.astype(float), annot=True, fmt=".3f", cmap="vlag", vmin=0, vmax=1)
        plt.title(f"Pairwise Win Rates - Judge: {judge_name}")
        plt.tight_layout()
        target = os.path.join(out_dir, f"pairwise_heatmap_{judge_name}.png")
        plt.savefig(target, dpi=cfg.dpi)
        plt.close()


def plot_overall_winrate(winshare: pd.DataFrame, cfg: Config, out_path: str):
    """Save a grouped bar chart of win rate per model, one bar colour per judge.

    The figure widens with the number of configured models. Nothing is
    written for empty input.
    """
    if winshare.empty:
        return

    width = 1.6 + 2.2 * len(cfg.models)
    fig, ax = plt.subplots(figsize=(width, 5))
    sns.barplot(
        data=winshare,
        x="model",
        y="win_rate",
        hue="judge",
        palette=cfg.palette,
        edgecolor="k",
        ax=ax,
    )
    ax.set_ylim(0, 1)
    ax.set_title("Overall Model Win Rate by Judge")
    ax.set_ylabel("Win rate")
    ax.set_xlabel("Model")
    fig.tight_layout()
    fig.savefig(out_path, dpi=cfg.dpi)
    plt.close(fig)


def plot_overall_winrate_language(winshare_lang: pd.DataFrame, cfg: Config, out_path: str):
    """Save a faceted bar chart of win rate per model and judge, one panel per language.

    Panels share the y axis (fixed to 0..1) and wrap after three columns.
    Nothing is written for empty input.
    """
    if winshare_lang.empty:
        return

    facet_options = dict(
        x="model",
        y="win_rate",
        hue="judge",
        col="language",
        kind="bar",
        palette=cfg.palette,
        sharey=True,
        col_wrap=3,
    )
    grid = sns.catplot(data=winshare_lang, **facet_options)
    grid.set(ylim=(0, 1))

    figure = grid.fig
    # Leave head-room above the panels for the overall title.
    figure.subplots_adjust(top=0.88)
    figure.suptitle("Model Win Rate by Judge & Language")

    grid.savefig(out_path, dpi=cfg.dpi)
    plt.close(figure)


def compute_judge_kappa(events: pd.DataFrame, cfg: Config) -> pd.DataFrame:
    """Compute a Cohen-style kappa between every pair of judges.

    Items are identified by source file, row index, the unordered model pair
    and language. Each verdict is reduced to "A" (alphabetically first model
    won), "B" (second won) or "T" (tie). For two judges, kappa is
    ``(observed - expected) / (1 - expected)`` over the items both judged.
    It is NaN when they share fewer than ``cfg.min_matches_for_test`` items
    or when expected agreement is 1.

    Returns:
        A symmetric judge-by-judge frame with 1.0 on the diagonal, limited to
        judges from ``cfg.judges`` that have verdicts; empty for empty input.
    """
    if events.empty:
        return pd.DataFrame()

    def item_key(source_file, row_index, model_a, model_b, language):
        pair_label = "-".join(sorted([model_a, model_b]))
        return "{}::{}::{}::{}".format(source_file, row_index, pair_label, language)

    def verdict_label(model_a, model_b, winner, is_tie):
        first, second = sorted([model_a, model_b])
        if is_tie:
            return "T"
        if pd.isna(winner):
            return np.nan
        if winner == first:
            return "A"
        if winner == second:
            return "B"
        return np.nan

    ev = events.copy()
    ev["pair_key"] = [
        item_key(*fields)
        for fields in zip(ev["source_file"], ev["row_index"], ev["model_a"], ev["model_b"], ev["language"])
    ]
    ev["canon_winner"] = [
        verdict_label(*fields)
        for fields in zip(ev["model_a"], ev["model_b"], ev["winner"], ev["is_tie"])
    ]

    # One row per item, one column per judge; duplicates keep the first verdict.
    pivot = ev.pivot_table(
        index="pair_key",
        columns="judge",
        values="canon_winner",
        aggfunc=lambda x: x.iloc[0] if len(x) > 0 else np.nan,
    )

    judges = [name for name in cfg.judges if name in pivot.columns]
    kappa_mat = pd.DataFrame(index=judges, columns=judges, dtype=float)
    for name in judges:
        kappa_mat.loc[name, name] = 1.0

    labels = ["A", "B", "T"]
    for left, right in combinations(judges, 2):
        shared = pivot[[left, right]].dropna()
        n_shared = shared.shape[0]
        kappa = np.nan
        if n_shared >= cfg.min_matches_for_test:
            votes_l = shared[left].values
            votes_r = shared[right].values
            observed = sum(1 for x, y in zip(votes_l, votes_r) if x == y) / len(votes_l)
            share_l = {lab: (votes_l == lab).sum() / len(votes_l) for lab in labels}
            share_r = {lab: (votes_r == lab).sum() / len(votes_r) for lab in labels}
            expected = sum(share_l[lab] * share_r[lab] for lab in labels)
            if expected < 1.0:
                kappa = (observed - expected) / (1 - expected)
        kappa_mat.loc[left, right] = kappa
        kappa_mat.loc[right, left] = kappa

    return kappa_mat


def plot_judge_kappa_heatmap(kappa_mat: pd.DataFrame, cfg: Config, out_path: str):
    """Save an annotated heatmap of the judge-by-judge kappa matrix.

    The colour scale is fixed to [-1, 1]. Nothing is written when
    *kappa_mat* is ``None`` or empty.
    """
    if kappa_mat is None or kappa_mat.empty:
        return

    n_judges = len(kappa_mat)
    plt.figure(figsize=(1 + 1.0 * n_judges, 1 + 0.9 * n_judges))
    heatmap_options = dict(annot=True, fmt=".3f", cmap="coolwarm", vmin=-1, vmax=1)
    sns.heatmap(kappa_mat.astype(float), **heatmap_options)
    plt.title("Pairwise Judge Cohen-like Kappa")
    plt.tight_layout()
    plt.savefig(out_path, dpi=cfg.dpi)
    plt.close()


def plot_judge_rank_correlation(winshare: pd.DataFrame, cfg: Config, out_path: str):
    """Save a heatmap of Spearman correlations between judges' model win rates.

    *winshare* is reshaped to models x judges. A model's missing win rate
    under a judge is replaced by that judge's mean before correlating.
    Nothing is written for empty input.
    """
    if winshare.empty:
        return

    rates = winshare.pivot(index="model", columns="judge", values="win_rate")
    judge_cols = [name for name in rates.columns if name in cfg.judges]
    filled = rates.fillna(rates.mean())

    corr_mat = pd.DataFrame(index=judge_cols, columns=judge_cols, dtype=float)
    for name in judge_cols:
        corr_mat.loc[name, name] = 1.0
    for left, right in combinations(judge_cols, 2):
        rho, _pvalue = spearmanr(filled[left], filled[right], nan_policy="omit")
        corr_mat.loc[left, right] = rho
        corr_mat.loc[right, left] = rho

    n_judges = len(judge_cols)
    plt.figure(figsize=(1 + 1.0 * n_judges, 1 + 0.9 * n_judges))
    sns.heatmap(corr_mat.astype(float), annot=True, fmt=".3f", cmap="vlag", vmin=-1, vmax=1)
    plt.title("Spearman Rank Correlation between Judges")
    plt.tight_layout()
    plt.savefig(out_path, dpi=cfg.dpi)
    plt.close()


def _judge_model_entropy(events: pd.DataFrame) -> pd.DataFrame:
    """Tabulate win/loss/tie shares and their Shannon entropy per (judge, model)."""
    recs = []
    for _, r in events.iterrows():
        # Each event is one outcome for each of its two models.
        for m in (r.model_a, r.model_b):
            if r.is_tie:
                outcome = "tie"
            elif r.winner == m:
                outcome = "win"
            else:
                outcome = "loss"
            recs.append({"judge": r.judge, "model": m, "outcome": outcome})

    df = pd.DataFrame(recs)
    if df.empty:
        return df

    rows = []
    for (judge, model), grp in df.groupby(["judge", "model"]):
        shares = grp.outcome.value_counts(normalize=True).to_dict()
        p_win = shares.get("win", 0.0)
        p_loss = shares.get("loss", 0.0)
        p_tie = shares.get("tie", 0.0)
        # Zero-probability outcomes contribute nothing (and log2(0) is
        # undefined). max() turns the -0.0 of a one-outcome distribution
        # into a plain 0.0.
        ent = max(0.0, -sum(p * log2(p) for p in (p_win, p_loss, p_tie) if p > 0))
        rows.append(
            {
                "judge": judge,
                "model": model,
                "entropy_bits": ent,
                "p_win": p_win,
                "p_loss": p_loss,
                "p_tie": p_tie,
            }
        )
    return pd.DataFrame(rows)


def plot_judge_entropy(events: pd.DataFrame, cfg: Config, out_path: str):
    """Plot, per judge, the entropy of each model's win/loss/tie outcome distribution.

    Side effects:
        * writes ``judge_model_entropy.csv`` into ``cfg.output_dir``;
        * saves the figure to *out_path* (at no less than 300 dpi) and a PDF
          copy next to it with the same stem.

    Judges are placed on the x axis in ``cfg.judges`` order; judges that are
    not configured follow, sorted by name. One line with markers is drawn per
    model. The dashed line marks log2(3), the maximum entropy of three
    outcomes, and the shaded band its top 10 %. Nothing is written for empty
    input.
    """
    if events.empty:
        return

    out = _judge_model_entropy(events)
    if out.empty:
        return

    out = out.sort_values(["judge", "model"])
    out.to_csv(os.path.join(cfg.output_dir, "judge_model_entropy.csv"), index=False)

    # Configured judges first (config order), then the rest by name. The
    # tuple key keeps ints and strings from ever being compared, which would
    # raise TypeError.
    judge_rank = {}
    for i, j in enumerate(cfg.judges):
        judge_rank.setdefault(j, i)
    judges = sorted(
        out.judge.unique(),
        key=lambda j: (0, judge_rank[j], "") if j in judge_rank else (1, 0, str(j)),
    )
    models = cfg.models if cfg.models else sorted(out.model.unique())

    x_pos = {j: i for i, j in enumerate(judges)}
    max_theoretical = log2(3)
    high_band_start = max_theoretical * 0.9

    base_palette = sns.color_palette("colorblind", n_colors=len(models))
    model_color = {m: base_palette[i] for i, m in enumerate(models)}
    markers = ["o", "s", "^", "D", "P", "X", "*", "v", "<", ">"]
    # Marker shapes repeat once there are more models than shapes.
    model_marker = {m: markers[i % len(markers)] for i, m in enumerate(models)}

    fig, ax = plt.subplots(figsize=(1.4 + 1.4 * len(judges), 4.8))
    ax.axhspan(high_band_start, max_theoretical, color="#f5c97f", alpha=0.20, lw=0)
    ax.axhline(max_theoretical, color="#772222", linestyle="--", linewidth=1.0, alpha=0.85)

    for m in models:
        sub = out[out.model == m].copy()
        if sub.empty:
            continue
        sub["x"] = sub.judge.map(x_pos)
        sub.sort_values("x", inplace=True)
        ax.plot(sub["x"], sub["entropy_bits"], color=model_color[m], linewidth=1.4, alpha=0.6, zorder=2)
        ax.scatter(
            sub["x"],
            sub["entropy_bits"],
            color=model_color[m],
            marker=model_marker[m],
            s=70,
            edgecolor="white",
            linewidth=0.8,
            label=m,
            zorder=3,
        )

    ax.set_xticks([x_pos[j] for j in judges])
    ax.set_xticklabels(judges)
    ax.set_xlim(-0.4, len(judges) - 0.2)
    y_upper = min(max_theoretical * 1.08, max_theoretical + 0.18)
    ax.set_ylim(0, y_upper)
    ax.set_ylabel("Outcome Entropy (bits)")
    ax.set_xlabel("Judge")
    ax.set_title("Judge Consistency Across Models", pad=10)

    # Legend sits outside the axes, to the right.
    ax.legend(
        title="Model",
        loc="upper left",
        bbox_to_anchor=(1.02, 1.0),
        frameon=False,
        handlelength=1.2,
        borderaxespad=0.6,
    )

    ax.text(
        0.005,
        0.985,
        "Shaded: top 10% of possible entropy",
        transform=ax.transAxes,
        ha="left",
        va="top",
        fontsize=9,
        color="#555555",
    )

    # Label for the maximum line: x in axes fraction, y in data units.
    ymin, ymax = ax.get_ylim()
    offset = (ymax - ymin) * 0.035
    ax.text(
        0.995,
        max_theoretical - offset,
        f"Max log2(3) = {max_theoretical:.2f}",
        transform=ax.get_yaxis_transform(),
        ha="right",
        va="top",
        fontsize=9,
        color="#772222",
    )

    for spine in ["top", "right"]:
        ax.spines[spine].set_visible(False)

    plt.tight_layout()
    fig.savefig(out_path, dpi=max(cfg.dpi, 300), bbox_inches="tight")
    fig.savefig(os.path.splitext(out_path)[0] + ".pdf", bbox_inches="tight")
    plt.close(fig)


# --------------------------------------------------------------------------- #
# Orchestration
# --------------------------------------------------------------------------- #
def run(cfg: Config):
    """Execute the full pipeline: extract events, aggregate, test, plot and summarise.

    All artefacts are written into ``cfg.output_dir``. When no comparison
    event can be extracted an error is logged and nothing else is produced.
    """

    def artifact(filename: str) -> str:
        # Every output lives directly in the configured output directory.
        return os.path.join(cfg.output_dir, filename)

    sns.set_style(cfg.style)
    ensure_dir(cfg.output_dir)

    files = discover_csv_files(cfg.csv_root, cfg.recursive, cfg.filters)
    logging.info("Discovered %d CSV files under %s", len(files), cfg.csv_root)

    events = build_events(cfg)
    if events.empty:
        logging.error("No comparative events extracted; exiting.")
        return
    events.to_csv(artifact("pairwise_events.csv"), index=False)

    # --- Tables -------------------------------------------------------------
    pairwise_overall = pairwise_stats(events, cfg, per_language=False)
    pairwise_overall.to_csv(artifact("pairwise_results.csv"), index=False)

    winshare_overall = model_winshare(events, cfg, per_language=False)
    winshare_overall.to_csv(artifact("overall_model_winshare.csv"), index=False)

    winshare_lang = pd.DataFrame()
    if cfg.per_language:
        pairwise_lang = pairwise_stats(events, cfg, per_language=True)
        pairwise_lang.to_csv(artifact("pairwise_results_by_language.csv"), index=False)
        winshare_lang = model_winshare(events, cfg, per_language=True)
        winshare_lang.to_csv(artifact("overall_model_winshare_by_language.csv"), index=False)

    # --- Plots --------------------------------------------------------------
    plot_overall_winrate(winshare_overall, cfg, artifact("overall_winrate_bars.png"))
    if cfg.per_language:
        plot_overall_winrate_language(
            winshare_lang, cfg, artifact("overall_winrate_bars_by_language.png")
        )
    plot_pairwise_heatmaps(pairwise_overall, cfg, cfg.output_dir)

    # --- Significance tests ---------------------------------------------------
    pairwise_tests = pairwise_binomial_tests(pairwise_overall, cfg)
    pairwise_tests.to_csv(artifact("pairwise_stat_tests.csv"), index=False)

    # --- Judge agreement diagnostics -----------------------------------------
    kappa_mat = compute_judge_kappa(events, cfg)
    if not kappa_mat.empty:
        kappa_mat.to_csv(artifact("judge_pairwise_kappa.csv"))
        plot_judge_kappa_heatmap(kappa_mat, cfg, artifact("judge_kappa_heatmap.png"))

    plot_judge_rank_correlation(winshare_overall, cfg, artifact("judge_rank_correlation.png"))
    plot_judge_entropy(events, cfg, artifact("judge_entropy_by_model.png"))

    # --- One-row run summary --------------------------------------------------
    summary = {
        "n_files": len(files),
        "n_events": len(events),
        "n_pairwise_rows": len(pairwise_overall),
        "n_models": len(cfg.models),
        "judges": ",".join(cfg.judges),
    }
    pd.DataFrame([summary]).to_csv(artifact("comparative_summary.csv"), index=False)

    logging.info("Comparative analysis complete. Outputs saved in %s", cfg.output_dir)


# --------------------------------------------------------------------------- #
# CLI
# --------------------------------------------------------------------------- #
def parse_args():
    """Parse the command line; ``--config`` (path to the YAML file) is required."""
    parser = argparse.ArgumentParser(description="Comparative pairwise evaluation pipeline")
    parser.add_argument("--config", required=True, help="Path to YAML config.")
    parsed = parser.parse_args()
    return parsed


def main():
    """Script entry point: load the config named on the command line and run the pipeline."""
    cli = parse_args()
    config = load_config(cli.config)
    setup_logging(config.log_level)
    ensure_dir(config.output_dir)
    run(config)


# Run the pipeline only when executed as a script, not when imported.
if __name__ == "__main__":
    main()