#!/usr/bin/env python3
import os
import glob
import argparse
import logging
from dataclasses import dataclass
from typing import List, Optional, Tuple

import yaml
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from numpy.linalg import eig
from sklearn.linear_model import Ridge
from sklearn.preprocessing import StandardScaler

@dataclass
class EvaluatorCfg:
    """One evaluator (judge) whose score columns are read from the input CSVs."""
    # Label written to the "evaluator" column of the tidy table.
    name: str
    # Text appended after "_eval" when building this evaluator's score column names.
    suffix: str

@dataclass
class LanguageCfg:
    """A language label together with the filename fragments that identify it."""
    # Label assigned to rows coming from a matching file.
    name: str
    # Substrings looked up (case-insensitively) in a CSV file's base name.
    match_substrings: List[str]

@dataclass
class RootCfg:
    """Flattened run configuration assembled by ``load_config``."""
    # Model names and criteria; together with an evaluator suffix they form score column names.
    base_models: List[str]
    criteria: List[str]
    # Evaluators and languages to look for.
    evaluators: List[EvaluatorCfg]
    languages: List[LanguageCfg]
    # Directory searched for CSV files, and whether sub-directories are searched too.
    csv_root: str
    recursive: bool
    # If non-empty, only files whose base name contains one of these substrings are kept.
    filename_filters: List[str]
    # When True, files whose language cannot be inferred are skipped instead of labelled "unknown".
    skip_unknown_language: bool
    # Destination directory for CSV/PNG outputs, figure DPI, and seaborn style name.
    output_dir: str
    dpi: int
    style: str

def load_config(path: str) -> RootCfg:
    """Read the YAML file at ``path`` and flatten it into a ``RootCfg``.

    Required keys: ``evaluation`` (with ``base_models``, ``criteria`` and
    ``evaluators``) and ``data`` (with ``csv_root``).  ``languages`` (read
    from the top level of the document) and ``plots`` are optional.

    Optional sections that are present but empty in the YAML (for example a
    bare ``plots:`` line, which loads as ``None``) are treated the same as
    missing sections instead of raising.

    Raises:
        KeyError: if a required key is absent.
        TypeError: if an evaluator/language entry has unexpected fields.
    """
    with open(path, "r", encoding="utf-8") as f:
        raw = yaml.safe_load(f)

    eval_raw = raw["evaluation"]
    data_raw = raw["data"]
    # ``or`` (rather than a .get default) also covers keys whose value is null.
    plots_raw = raw.get("plots") or {}
    language_entries = raw.get("languages") or []

    evaluators = [EvaluatorCfg(**entry) for entry in eval_raw["evaluators"]]
    languages = [LanguageCfg(**entry) for entry in language_entries]

    return RootCfg(
        base_models=eval_raw["base_models"],
        criteria=eval_raw["criteria"],
        evaluators=evaluators,
        languages=languages,
        csv_root=data_raw["csv_root"],
        recursive=data_raw.get("recursive", True),
        filename_filters=data_raw.get("include_only_filename_substrings") or [],
        skip_unknown_language=data_raw.get("skip_unknown_language", True),
        output_dir=plots_raw.get("output_dir", "extended_analysis_out"),
        dpi=plots_raw.get("dpi", 150),
        style=plots_raw.get("style", "white"),
    )

def ensure_dir(p: str) -> None:
    """Create directory ``p`` (and any missing parents); a no-op if it already exists."""
    os.makedirs(p, exist_ok=True)

def discover_csv_files(root: str, recursive: bool, filters: List[str]) -> List[str]:
    """Return the sorted list of ``*.csv`` paths found under ``root``.

    Args:
        root: Directory to search.  It is taken literally: glob
            metacharacters in the directory name (``[``, ``]``, ``*``, ``?``)
            are escaped so that e.g. ``runs[1]`` is searched as-is.
        recursive: When True, sub-directories are searched as well.
        filters: If non-empty, keep only files whose base name contains at
            least one of these substrings (case-insensitive).

    Returns:
        Matching paths in lexicographic order; empty if nothing matches.
    """
    pattern = "**/*.csv" if recursive else "*.csv"
    # Only the wildcard part may be interpreted as a pattern, never the root.
    files = glob.glob(os.path.join(glob.escape(root), pattern), recursive=recursive)
    if filters:
        wanted = [s.lower() for s in filters]
        files = [
            f for f in files
            if any(s in os.path.basename(f).lower() for s in wanted)
        ]
    return sorted(files)

def infer_language(path: str, languages: List[LanguageCfg]) -> Optional[str]:
    """Return the name of the first language whose substring occurs in the file name.

    Only the base name of ``path`` is inspected and the comparison is
    case-insensitive.  Languages are tried in the given order; ``None`` is
    returned when none of them matches.
    """
    basename = os.path.basename(path).lower()
    return next(
        (
            candidate.name
            for candidate in languages
            if any(token.lower() in basename for token in candidate.match_substrings)
        ),
        None,
    )

def score_col(crit: str, model: str, suffix: str) -> str:
    """Build the CSV column name holding ``model``'s score on ``crit`` for one evaluator.

    The layout is ``score_<crit>_<model>_eval<suffix>``.
    """
    return "score_{}_{}_eval{}".format(crit, model, suffix)

def build_tidy_from_csvs(cfg: RootCfg) -> pd.DataFrame:
    """Assemble a long-format ("tidy") score table from the configured CSV files.

    Every discovered CSV is assigned a language from its file name, then each
    (evaluator, model, criterion) score column that exists in the file
    contributes one output row per non-missing numeric cell.

    Output columns: ``evaluator``, ``model``, ``criterion``, ``language``,
    ``score``, ``source_file`` (path relative to ``cfg.csv_root``) and
    ``row_index`` (row label inside the source CSV).

    Side effect: writes ``tidy_snapshot.csv`` into ``cfg.output_dir`` when at
    least one row was assembled.

    Returns:
        The tidy DataFrame, or an empty DataFrame when no file or no score
        was found.
    """
    files = discover_csv_files(cfg.csv_root, cfg.recursive, cfg.filename_filters)
    if not files:
        logging.warning("No CSV files discovered in csv_root: %s", cfg.csv_root)
        return pd.DataFrame()

    rows = []
    for fp in files:
        lang = infer_language(fp, cfg.languages)
        if lang is None:
            if cfg.skip_unknown_language:
                continue
            lang = "unknown"
        try:
            df = pd.read_csv(fp, encoding="utf-8")
        except Exception as e:
            # Best effort at the I/O boundary: an unreadable file (parse,
            # encoding or OS error) must not abort the whole run.
            logging.warning("Skipping %s: %s", fp, e)
            continue

        rel_path = os.path.relpath(fp, cfg.csv_root)
        available = set(df.columns)
        for ev in cfg.evaluators:
            # Resolve once per evaluator which score columns this file has.
            # Checking every criterion (not just the first one) keeps files
            # that lack the first criterion but carry the others.
            targets = []
            for model in cfg.base_models:
                for crit in cfg.criteria:
                    col = score_col(crit, model, ev.suffix)
                    if col in available:
                        targets.append((model, crit, col))
            if not targets:
                continue

            for row_idx, r in df.iterrows():
                for model, crit, col in targets:
                    val = r[col]
                    if pd.isna(val):
                        continue
                    try:
                        score = float(val)
                        row_label = int(row_idx)
                    except (TypeError, ValueError):
                        # Non-numeric cell (or row label): skip just this value.
                        logging.debug("Ignoring non-numeric value %r in %s[%s]", val, fp, col)
                        continue
                    rows.append({
                        "evaluator": ev.name,
                        "model": model,
                        "criterion": crit,
                        "language": lang,
                        "score": score,
                        "source_file": rel_path,
                        "row_index": row_label,
                    })

    if not rows:
        logging.error("No scoring rows assembled from CSVs.")
        return pd.DataFrame()

    tidy = pd.DataFrame(rows)
    tidy['evaluator'] = tidy['evaluator'].astype(str)
    ensure_dir(cfg.output_dir)
    tidy.to_csv(os.path.join(cfg.output_dir, "tidy_snapshot.csv"), index=False)
    logging.info("Tidy assembled: rows=%d, evaluators=%s", len(tidy), tidy.evaluator.unique().tolist())
    return tidy

def bootstrap_mean_ci(data: np.ndarray, n_boot: int = 2000, alpha: float = 0.05, seed: Optional[int] = None) -> Tuple[float,float,float]:
    """Percentile-bootstrap confidence interval for the (NaN-ignoring) mean.

    Args:
        data: 1-D sample.
        n_boot: Number of resamples drawn with replacement.
        alpha: Two-sided level; the interval spans the ``alpha/2`` and
            ``1 - alpha/2`` quantiles of the resampled means.
        seed: Seed for ``numpy.random.default_rng``.

    Returns:
        ``(mean, lower, upper)``; three NaNs when ``data`` is empty.
    """
    rng = np.random.default_rng(seed)
    sample_size = len(data)
    if sample_size == 0:
        missing = float('nan')
        return missing, missing, missing

    resampled_means = [
        np.nanmean(rng.choice(data, size=sample_size, replace=True))
        for _ in range(n_boot)
    ]
    lower_pct = 100*alpha/2
    upper_pct = 100*(1-alpha/2)
    ci_low = np.percentile(resampled_means, lower_pct)
    ci_high = np.percentile(resampled_means, upper_pct)
    point_estimate = np.nanmean(data)
    return float(point_estimate), float(ci_low), float(ci_high)

def bootstrap_variance_ratio(human_scores: np.ndarray, llm_scores: np.ndarray, n_boot: int = 2000, alpha: float = 0.05, seed: Optional[int]=0):
    """Bootstrap the ratio ``var(human) / var(llm)`` of sample variances (ddof=1).

    Both samples are resampled independently with replacement.  Resamples in
    which the LLM variance is not positive are discarded, because the ratio
    is undefined there.

    Args:
        human_scores: 1-D sample forming the numerator variance.
        llm_scores: 1-D sample forming the denominator variance.
        n_boot: Number of bootstrap resamples.
        alpha: Two-sided level of the percentile interval (0.05 gives the
            2.5th and 97.5th percentiles).
        seed: Seed for ``numpy.random.default_rng``.

    Returns:
        ``(mean_ratio, ci_lower, ci_upper)``; three NaNs if either sample has
        fewer than two values or no resample produced a usable ratio.
    """
    rng = np.random.default_rng(seed)
    if len(human_scores) < 2 or len(llm_scores) < 2:
        return np.nan, np.nan, np.nan
    ratios = []
    for _ in range(n_boot):
        ha = rng.choice(human_scores, size=len(human_scores), replace=True)
        la = rng.choice(llm_scores, size=len(llm_scores), replace=True)
        vh = np.nanvar(ha, ddof=1)
        vl = np.nanvar(la, ddof=1)
        if vl <= 0:
            continue
        ratios.append(vh / vl)
    if not ratios:
        return np.nan, np.nan, np.nan
    # Honour the requested level instead of a fixed 95% interval.
    lower = np.percentile(ratios, 100 * alpha / 2)
    upper = np.percentile(ratios, 100 * (1 - alpha / 2))
    return float(np.mean(ratios)), float(lower), float(upper)

def permutation_test_wasserstein(a: np.ndarray, b: np.ndarray, n_perm: int = 2000, seed: Optional[int]=0):
    """Permutation test for the 1-D Wasserstein distance between two samples.

    The pooled values are shuffled ``n_perm`` times; after each shuffle the
    first ``len(a)`` values play the role of ``a`` and the rest the role of
    ``b``.

    Returns:
        ``(observed_distance, p_value)`` where the p-value is
        ``(hits + 1) / (n_perm + 1)`` and ``hits`` counts the shuffles whose
        distance is at least the observed one.
    """
    rng = np.random.default_rng(seed)
    observed = float(stats.wasserstein_distance(a, b))
    pooled = np.concatenate([a, b])
    split = len(a)

    def shuffled_distance():
        # Shuffles accumulate in place on the pooled copy; inputs stay untouched.
        rng.shuffle(pooled)
        return stats.wasserstein_distance(pooled[:split], pooled[split:])

    hits = int(sum(1 for _ in range(n_perm) if shuffled_distance() >= observed))
    p_value = (hits + 1) / (n_perm + 1)
    return observed, p_value

def bootstrap_kl(a: np.ndarray, b: np.ndarray, bins: int=4, n_boot: int=1000, seed: Optional[int]=0):
    """Bootstrap the KL divergence ``KL(a || b)`` of two binned samples.

    Bin edges are the pooled-sample quantiles (``bins`` bins), widened by
    1e-6 at both ends.  Bin counts get an additive 1e-6 before normalising so
    that no bin has zero probability.

    Returns:
        ``(mean, 2.5th percentile, 97.5th percentile)`` of the bootstrap KL
        values; three NaNs if either sample has fewer than two values.
    """
    rng = np.random.default_rng(seed)
    if len(a) < 2 or len(b) < 2:
        return np.nan, np.nan, np.nan

    edges = np.quantile(np.concatenate([a, b]), np.linspace(0, 1, bins+1))
    edges[0] -= 1e-6
    edges[-1] += 1e-6

    def smoothed_pmf(sample):
        counts = np.histogram(sample, bins=edges, density=False)[0].astype(float) + 1e-6
        return counts / counts.sum()

    kl_draws = np.empty(n_boot)
    for draw in range(n_boot):
        resampled_a = rng.choice(a, size=len(a), replace=True)
        resampled_b = rng.choice(b, size=len(b), replace=True)
        kl_draws[draw] = stats.entropy(smoothed_pmf(resampled_a), smoothed_pmf(resampled_b))

    ci_low = np.percentile(kl_draws, 2.5)
    ci_high = np.percentile(kl_draws, 97.5)
    return float(np.mean(kl_draws)), float(ci_low), float(ci_high)

def benjamini_hochberg(pvals: List[float], alpha: float = 0.05) -> Tuple[np.ndarray, float]:
    """Benjamini-Hochberg step-up procedure.

    Args:
        pvals: P-values, in any order.
        alpha: Target false-discovery rate.

    Returns:
        ``(rejected, p_threshold)``: a boolean mask aligned with ``pvals`` and
        the largest p-value that is rejected.  When nothing is rejected (or
        the input is empty) the threshold is ``0.0``.
    """
    p_arr = np.array(pvals)
    n_tests = len(p_arr)
    if n_tests == 0:
        return np.array([], dtype=bool), 0.0

    ascending = p_arr[np.argsort(p_arr)]
    ranks = np.arange(1, n_tests+1)
    cutoffs = (ranks / n_tests) * alpha
    passing = np.flatnonzero(ascending <= cutoffs)
    if passing.size == 0:
        return np.zeros(n_tests, dtype=bool), 0.0

    # Step-up rule: everything up to the highest-ranked passing p-value is rejected.
    cutoff_p = ascending[passing[-1]]
    return p_arr <= cutoff_p, float(cutoff_p)

def item_level_matrix(tidy: pd.DataFrame) -> pd.DataFrame:
    """Pivot the tidy table into an item x evaluator matrix of mean scores.

    An item is identified by ``source_file::row_index::model``; each cell is
    the mean of all scores that evaluator gave the item.  The input frame is
    not modified.
    """
    item_key = (
        tidy['source_file'].astype(str)
        + "::" + tidy['row_index'].astype(str)
        + "::" + tidy['model'].astype(str)
    )
    keyed = tidy.assign(item_id=item_key)
    per_item_mean = keyed.groupby(['item_id', 'evaluator'])['score'].mean()
    return per_item_mean.unstack()

def compute_percent_agreement_matrix(item_avg: pd.DataFrame, tolerance: float = 0.0, round_decimals: Optional[int]=None) -> pd.DataFrame:
    """Pairwise share of items on which two evaluators give the same score.

    Args:
        item_avg: Item x evaluator matrix of scores.
        tolerance: Two scores agree when their absolute difference is at most
            this value.
        round_decimals: If given, scores are rounded to this many decimals
            before being compared.

    Returns:
        Square evaluator x evaluator frame.  The diagonal is 1.0; a pair with
        no commonly rated items gets NaN.
    """
    evaluators = list(item_avg.columns)
    agreement = pd.DataFrame(index=evaluators, columns=evaluators, dtype=float)
    for rater_a in evaluators:
        for rater_b in evaluators:
            if rater_a == rater_b:
                agreement.loc[rater_a, rater_b] = 1.0
                continue
            shared = item_avg[[rater_a, rater_b]].dropna()
            if shared.shape[0] == 0:
                agreement.loc[rater_a, rater_b] = np.nan
                continue
            left, right = shared[rater_a], shared[rater_b]
            if round_decimals is not None:
                left = left.round(round_decimals)
                right = right.round(round_decimals)
            within_tol = (left - right).abs() <= tolerance
            agreement.loc[rater_a, rater_b] = within_tol.sum() / len(within_tol)
    return agreement

def _discretize_pair(a_scores, b_scores, k_bins, custom_edges):
    """Map two aligned score Series to category codes; ``(None, None)`` if all values are equal."""
    vals = np.concatenate([a_scores.values, b_scores.values])
    uniq = np.unique(vals)
    if uniq.size < 2:
        return None, None

    if custom_edges:
        # Only cut points strictly inside the observed range create bins.
        inner = [e for e in sorted(custom_edges) if vals.min() < e < vals.max()]
        if inner:
            full_edges = [vals.min() - 1e-6] + inner + [vals.max() + 1e-6]
            return (
                pd.cut(a_scores, bins=full_edges, labels=False, include_lowest=True),
                pd.cut(b_scores, bins=full_edges, labels=False, include_lowest=True),
            )
        # NOTE(review): the fallback ranks each rater separately, so equal
        # codes mean "same rank within the rater", not "same score" —
        # confirm this is the intended behaviour.
        return (
            a_scores.rank(method='dense').astype(int),
            b_scores.rank(method='dense').astype(int),
        )

    if uniq.size <= k_bins:
        # Few distinct values: every distinct value is its own category.
        mapping = {v: code for code, v in enumerate(np.sort(uniq))}
        return a_scores.map(mapping), b_scores.map(mapping)

    edges = np.quantile(vals, np.linspace(0, 1, k_bins + 1))
    # Tied quantiles would give non-increasing edges; nudge them apart.
    for t in range(1, len(edges)):
        if edges[t] <= edges[t - 1]:
            edges[t] = edges[t - 1] + 1e-6
    return (
        pd.cut(a_scores, bins=edges, labels=False, include_lowest=True),
        pd.cut(b_scores, bins=edges, labels=False, include_lowest=True),
    )


def _kappa_from_codes(a_codes, b_codes):
    """Cohen's kappa for two aligned category-code sequences; NaN codes are ignored."""
    a_codes = np.asarray(a_codes, dtype=float)
    b_codes = np.asarray(b_codes, dtype=float)
    a_ok = ~np.isnan(a_codes)
    b_ok = ~np.isnan(b_codes)
    cats = np.unique(np.concatenate([a_codes[a_ok], b_codes[b_ok]]))
    if cats.size < 2:
        return np.nan

    # Confusion matrix over the items where both raters have a category.
    both = a_ok & b_ok
    mat = np.zeros((cats.size, cats.size), dtype=int)
    np.add.at(
        mat,
        (np.searchsorted(cats, a_codes[both]), np.searchsorted(cats, b_codes[both])),
        1,
    )
    total = mat.sum()
    if total == 0:
        return np.nan
    po = np.trace(mat) / total
    pe = (mat.sum(axis=1) * mat.sum(axis=0)).sum() / (total ** 2)
    if pe >= 1.0:
        return np.nan
    return (po - pe) / (1 - pe)


def compute_pairwise_kappa(item_avg: pd.DataFrame, k_bins: int = 3, custom_edges: Optional[List[float]] = None, min_overlap: int = 3):
    """Pairwise Cohen's kappa between evaluators on discretised item scores.

    For every ordered pair of evaluator columns, items rated by both are
    discretised and an unweighted Cohen's kappa is computed.  Discretisation:

    * ``custom_edges`` given: bins are delimited by the edges lying strictly
      inside the pair's value range; if none does, each rater's scores are
      dense-ranked instead.
    * otherwise, with at most ``k_bins`` distinct pooled values: each distinct
      value is its own category.
    * otherwise: ``k_bins`` bins from the pooled quantiles.

    Args:
        item_avg: Item x evaluator matrix of scores.
        k_bins: Number of quantile bins.
        custom_edges: Optional explicit cut points.
        min_overlap: Minimum number of commonly rated items for a pair.

    Returns:
        ``(kappa, overlap)``: two evaluator x evaluator frames.  ``kappa`` is
        1.0 on the diagonal and NaN where it cannot be computed (too little
        overlap, constant scores, a single category, or chance agreement of
        1).  ``overlap`` holds the number of common items (NaN on the
        diagonal).
    """
    evals = list(item_avg.columns)
    kappa = pd.DataFrame(index=evals, columns=evals, dtype=float)
    overlap = pd.DataFrame(index=evals, columns=evals, dtype=float)
    for i, a in enumerate(evals):
        for j, b in enumerate(evals):
            if i == j:
                kappa.loc[a, b] = 1.0
                overlap.loc[a, b] = np.nan
                continue
            pair = item_avg[[a, b]].dropna()
            n = pair.shape[0]
            overlap.loc[a, b] = n
            if n < min_overlap:
                kappa.loc[a, b] = np.nan
                continue
            a_codes, b_codes = _discretize_pair(pair[a], pair[b], k_bins, custom_edges)
            if a_codes is None:
                kappa.loc[a, b] = np.nan
                continue
            kappa.loc[a, b] = _kappa_from_codes(a_codes, b_codes)
    return kappa, overlap

def spectral_dominant_direction(tidy: pd.DataFrame):
    """Leading eigen-direction of the model x model Gram matrix of mean scores.

    Mean scores are arranged as an evaluator x model matrix (missing cells
    are filled with 0), each evaluator row is centred on its own mean across
    models, and the Gram matrix ``centered.T @ centered`` is decomposed.

    The Gram matrix is symmetric, so ``numpy.linalg.eigh`` is used: it
    returns real eigenvalues in ascending order.  An eigenvector is only
    defined up to sign, so the returned vector is oriented to make its
    largest-magnitude component positive; this keeps results comparable
    across calls.

    Returns:
        ``(eigenvalue, eigenvector, model_names)`` for the largest eigenvalue,
        or ``(None, None, None)`` when there are fewer than two evaluators or
        no model column.
    """
    mat = (
        tidy.groupby(["evaluator", "model"], as_index=False).score.mean()
        .pivot(index="evaluator", columns="model", values="score")
        .fillna(0)
    )
    if mat.shape[0] < 2 or mat.shape[1] < 1:
        return None, None, None

    vals = mat.values
    centered = vals - np.nanmean(vals, axis=1, keepdims=True)
    gram = centered.T @ centered

    eigvals, eigvecs = np.linalg.eigh(gram)
    # eigh sorts ascending, so the dominant pair is the last one.
    dominant_val = eigvals[-1]
    dominant_vec = eigvecs[:, -1]
    if dominant_vec[np.argmax(np.abs(dominant_vec))] < 0:
        dominant_vec = -dominant_vec
    return dominant_val, dominant_vec, list(mat.columns)

def bootstrap_spectral_vector(tidy: pd.DataFrame, n_boot: int = 500):
    """Bootstrap the dominant spectral direction by resampling items.

    An item is a unique ``(source_file, row_index, model)`` triple.  Items
    are drawn with replacement (fixed seed 0), the tidy rows of the drawn
    items are collected, and ``spectral_dominant_direction`` is recomputed on
    each resample.

    Eigenvectors are only defined up to sign, so every bootstrap vector is
    flipped, if necessary, to point the same way as the full-data vector
    before averaging.  Resamples whose model set differs from the full data
    are dropped, because their vectors would not be comparable
    component-wise.

    Returns:
        ``(mean_vector, std_vector, model_names)``, or ``None`` when there
        are no items, no reference direction, or no usable resample.
    """
    id_cols = ["source_file", "row_index", "model"]
    items = tidy.groupby(id_cols).size().reset_index()[id_cols]
    n_items = len(items)
    if n_items == 0:
        return None

    # Reference direction: fixes both the sign convention and the model order.
    _, ref_vec, ref_models = spectral_dominant_direction(tidy)
    if ref_vec is None:
        return None

    rng = np.random.default_rng(0)
    boot_vecs = []
    for _ in range(n_boot):
        picks = rng.choice(np.arange(n_items), size=n_items, replace=True)
        # Duplicated picks duplicate the item's rows in the merge, as intended.
        merged = pd.merge(tidy, items.iloc[picks], on=id_cols, how="inner")
        if merged.empty:
            continue
        _, vec, models = spectral_dominant_direction(merged)
        if vec is None or list(models) != list(ref_models):
            continue
        if np.dot(vec, ref_vec) < 0:
            vec = -vec
        boot_vecs.append(vec)

    if not boot_vecs:
        return None
    stacked = np.vstack(boot_vecs)
    return np.mean(stacked, axis=0), np.std(stacked, axis=0), ref_models

def build_item_features_for_evaluator(tidy: pd.DataFrame, evaluator: str, cfg: RootCfg) -> Tuple[List[str], np.ndarray]:
    """Build the item x (model, criterion) feature matrix for one evaluator.

    Items are keyed by ``source_file::row_index``.  Columns follow the order
    of ``cfg.base_models`` x ``cfg.criteria``, restricted to the combinations
    this evaluator actually scored; missing cells are filled with 0.0.

    Returns:
        ``(item_ids, matrix)``; ``([], empty (0, 0) array)`` when the
        evaluator has no rows or none of the configured columns.
    """
    nothing = ([], np.empty((0,0)))

    subset = tidy.loc[tidy["evaluator"] == evaluator].copy()
    if subset.empty:
        return nothing

    subset["item_id"] = subset["source_file"].astype(str) + "::" + subset["row_index"].astype(str)
    wide = subset.pivot_table(
        index="item_id",
        columns=["model", "criterion"],
        values="score",
        aggfunc="mean",
    )
    available = [
        (model, crit)
        for model in cfg.base_models
        for crit in cfg.criteria
        if (model, crit) in wide.columns
    ]
    if not available:
        return nothing

    features = wide.loc[:, available].fillna(0.0)
    return features.index.astype(str).tolist(), features.values

def fit_linear_operator_ridge(X: np.ndarray, Y: np.ndarray, ridge_alpha: float = 1e-3) -> np.ndarray:
    """Fit a linear map from the columns of ``X`` to the columns of ``Y`` by ridge regression.

    ``X`` is standardised, one ridge model (with intercept) is fitted per
    column of ``Y``, and the coefficients are divided by the scaler's scale
    so that they apply to the unstandardised inputs.  Intercepts are not
    returned.

    Returns:
        Array of shape ``(Y.shape[1], X.shape[1])``; an empty ``(0, 0)``
        array when either input is empty.
    """
    if X.size == 0 or Y.size == 0:
        return np.empty((0,0))

    scaler = StandardScaler(with_mean=True, with_std=True)
    X_std = scaler.fit_transform(X)
    n_targets, n_features = Y.shape[1], X.shape[1]
    operator = np.zeros((n_targets, n_features))
    # The epsilon guards the division below against a zero scale.
    unscale = scaler.scale_ + 1e-12
    for row in range(n_targets):
        model = Ridge(alpha=ridge_alpha, fit_intercept=True)
        model.fit(X_std, Y[:, row])
        operator[row, :] = model.coef_ / unscale
    return operator

def operator_spectral_radius(A: np.ndarray) -> float:
    """Return the size of the linear operator ``A`` as a single float.

    Despite the name, the primary result is the largest singular value of
    ``A`` (its spectral norm), which is also defined for non-square
    matrices.  Only if the SVD fails does the function fall back to the
    largest eigenvalue modulus (the actual spectral radius).

    Returns:
        The value as a Python float; NaN when ``A`` is empty or neither
        decomposition can be computed.
    """
    if A.size == 0:
        return np.nan
    failures = (np.linalg.LinAlgError, ValueError, TypeError)
    try:
        singular_values = np.linalg.svd(A, compute_uv=False)
        return float(np.max(singular_values))
    except failures:
        pass
    try:
        # np.abs of a complex eigenvalue is its modulus; taking the real part
        # first would under-report complex-conjugate pairs.
        return float(np.max(np.abs(np.linalg.eigvals(A))))
    except failures:
        return np.nan

def fit_meta_operators(tidy: pd.DataFrame, cfg: RootCfg, min_common: int = 5, n_boot: int = 500):
    """Fit a ridge linear operator between every ordered pair of evaluators.

    For each pair ``(ev_prev, ev_next)`` with at least ``min_common`` items
    rated by both, feature matrices over the common items are built, an
    operator ``A`` mapping ``ev_prev`` features to ``ev_next`` features is
    fitted, and its size (``operator_spectral_radius``) is bootstrapped over
    items (``n_boot`` resamples, fixed seed 0).

    Side effects: writes ``operator_A_<from>_to_<to>.csv`` for each fitted
    pair and ``meta_operator_rho_results.csv`` into ``cfg.output_dir``
    (created if missing).  Rows and columns of the operator files are
    labelled with the feature columns that were actually used.

    Returns:
        DataFrame with one row per fitted pair: ``from``, ``to``,
        ``n_items``, ``rho_point``, ``rho_boot_mean``, ``rho_ci_lo``,
        ``rho_ci_hi``.
    """
    evaluators = [e.name for e in cfg.evaluators]
    desired_cols = [(m, c) for m in cfg.base_models for c in cfg.criteria]
    result_columns = ["from", "to", "n_items", "rho_point", "rho_boot_mean", "rho_ci_lo", "rho_ci_hi"]
    os.makedirs(cfg.output_dir, exist_ok=True)

    def build_for_ids(ev, ids):
        """Return (matrix, column names) for ``ev`` restricted to ``ids``, or None."""
        df_ev = tidy[tidy.evaluator == ev].copy()
        df_ev["item_id"] = df_ev["source_file"].astype(str) + "::" + df_ev["row_index"].astype(str)
        df_ev = df_ev[df_ev["item_id"].isin(ids)]
        if df_ev.empty:
            return None
        wide = df_ev.pivot_table(index="item_id", columns=["model","criterion"], values="score", aggfunc="mean")
        present = [col for col in desired_cols if col in wide.columns]
        if not present:
            return None
        names = [f"{m}__{c}" for (m, c) in present]
        wide = wide.loc[:, present].fillna(0.0)
        wide.columns = names
        # Same row order for both evaluators of a pair.
        wide = wide.reindex(index=ids).fillna(0.0)
        return wide.values, names

    items_map = {
        ev: set(build_item_features_for_evaluator(tidy, ev, cfg)[0])
        for ev in evaluators
    }

    results = []
    for ev_prev in evaluators:
        for ev_next in evaluators:
            if ev_prev == ev_next:
                continue
            common = sorted(items_map.get(ev_prev, set()) & items_map.get(ev_next, set()))
            if len(common) < min_common:
                logging.info("Skipping operator %s -> %s: common items %d < %d", ev_prev, ev_next, len(common), min_common)
                continue
            built_in = build_for_ids(ev_prev, common)
            built_out = build_for_ids(ev_next, common)
            if built_in is None or built_out is None:
                continue
            X, col_in = built_in
            Y, col_out = built_out
            if X.shape[0] < min_common or Y.shape[0] < min_common:
                continue

            A = fit_linear_operator_ridge(X, Y, ridge_alpha=1e-3)
            rho = operator_spectral_radius(A)

            rng = np.random.default_rng(0)
            rhos = []
            n_items = X.shape[0]
            for _ in range(n_boot):
                idxs = rng.choice(np.arange(n_items), size=n_items, replace=True)
                try:
                    Ab = fit_linear_operator_ridge(X[idxs, :], Y[idxs, :], ridge_alpha=1e-3)
                    rb = operator_spectral_radius(Ab)
                except Exception as exc:
                    # Best effort: one degenerate resample must not abort the pair.
                    logging.debug("Bootstrap fit failed for %s -> %s: %s", ev_prev, ev_next, exc)
                    continue
                if np.isfinite(rb):
                    rhos.append(rb)
            if rhos:
                rho_mean = float(np.mean(rhos))
                rho_lo = float(np.percentile(rhos, 2.5))
                rho_hi = float(np.percentile(rhos, 97.5))
            else:
                rho_mean = rho_lo = rho_hi = float(np.nan)

            results.append({"from": ev_prev, "to": ev_next, "n_items": int(len(common)),
                            "rho_point": float(rho), "rho_boot_mean": rho_mean,
                            "rho_ci_lo": rho_lo, "rho_ci_hi": rho_hi})
            # Label with the columns actually present for each evaluator; a
            # prefix of the full model x criterion list can name the wrong ones.
            A_df = pd.DataFrame(A, index=[f"out::{n}" for n in col_out], columns=[f"in::{n}" for n in col_in])
            A_df.to_csv(os.path.join(cfg.output_dir, f"operator_A_{ev_prev}_to_{ev_next}.csv"))
            logging.info("Fitted operator %s->%s rho=%.4f (boot mean=%.4f CI=[%.4f,%.4f]) items=%d",
                         ev_prev, ev_next, rho, rho_mean, rho_lo, rho_hi, len(common))

    # Explicit columns keep the CSV header even when no pair was fitted.
    res_df = pd.DataFrame(results, columns=result_columns)
    res_df.to_csv(os.path.join(cfg.output_dir, "meta_operator_rho_results.csv"), index=False)
    return res_df

def compute_item_avg_across_criteria_and_languages(tidy: pd.DataFrame) -> pd.DataFrame:
    """Collapse the tidy table to one mean score per (evaluator, item).

    An item is identified by ``source_file::row_index::model``.  The result
    has the columns ``evaluator``, ``item_id``, ``mean_score`` and
    ``n_entries`` (number of scores averaged).  The input is not modified.
    """
    item_key = (
        tidy['source_file'].astype(str)
        + "::" + tidy['row_index'].astype(str)
        + "::" + tidy['model'].astype(str)
    )
    keyed = tidy.assign(item_id=item_key)
    summary = keyed.groupby(['evaluator', 'item_id']).agg(
        mean_score=('score', 'mean'),
        n_entries=('score', 'count'),
    )
    return summary.reset_index()

def plot_violin_avg_all(cfg: RootCfg, per_item_df: pd.DataFrame, min_items_for_violin: int = 5) -> None:
    """Draw per-evaluator violin plots of per-item mean scores.

    Reads the ``evaluator``, ``item_id`` and ``mean_score`` columns of
    ``per_item_df``.  Evaluators with fewer than ``min_items_for_violin``
    distinct items are left out of the figure.

    Files written to ``cfg.output_dir``:
      * ``per_evaluator_item_counts.csv`` (always),
      * ``violin_avg_all_criteria_langs.png`` and ``per_item_mean_scores.csv``
        (only when at least one evaluator qualifies).
    """
    outdir = cfg.output_dir
    # Distinct item count per evaluator; saved even when nothing gets plotted.
    counts = per_item_df.groupby('evaluator')['item_id'].nunique().reset_index().rename(columns={'item_id':'n_items'})
    counts.to_csv(os.path.join(outdir, "per_evaluator_item_counts.csv"), index=False)
    allowed = counts[counts.n_items >= min_items_for_violin].evaluator.tolist()
    if len(allowed) == 0:
        logging.warning("No evaluator has >=%d items for violin plotting. Saved per_evaluator_item_counts.csv", min_items_for_violin)
        return
    df_plot = per_item_df[per_item_df.evaluator.isin(allowed)].copy()
    # "human" (any letter case) is placed first; the rest follow alphabetically.
    evals = sorted(df_plot.evaluator.unique(), key=lambda x: (0 if x.lower()=="human" else 1, x.lower()))
    plt.figure(figsize=(max(8, len(evals)*1.0), 6))
    # Three layers on the same axes: distribution, raw points, per-evaluator mean marker.
    sns.violinplot(data=df_plot, x='evaluator', y='mean_score', order=evals, inner=None, cut=0)
    sns.stripplot(data=df_plot, x='evaluator', y='mean_score', order=evals, color='k', size=3, jitter=0.15, alpha=0.4)
    means = df_plot.groupby('evaluator')['mean_score'].mean().reset_index()
    # NOTE(review): join/scale/errwidth are reportedly deprecated in newer seaborn releases — confirm against the pinned version.
    sns.pointplot(data=means, x='evaluator', y='mean_score', order=evals, color='white', join=False, markers='D', scale=0.8, errwidth=0)
    plt.title("Per-evaluator score distributions — per-item mean (collapsed over criteria & languages)")
    plt.ylabel("Per-item mean score")
    plt.xlabel("Evaluator (human + LLM judges)")
    # Pad the y-range by 5% of the data span (at least 0.02); the lower limit never goes below 0.
    vmin = float(df_plot['mean_score'].min()); vmax = float(df_plot['mean_score'].max())
    margin = max(0.02, (vmax - vmin) * 0.05) if vmax > vmin else 0.05
    plt.ylim(max(0.0, vmin - margin), vmax + margin)
    plt.tight_layout()
    outpath = os.path.join(outdir, "violin_avg_all_criteria_langs.png")
    plt.savefig(outpath, dpi=cfg.dpi)
    plt.close()
    # Persist exactly the rows that were plotted.
    df_plot.to_csv(os.path.join(outdir, "per_item_mean_scores.csv"), index=False)
    logging.info("Saved violin plot and per_item_mean_scores.csv (n_evaluators=%d)", len(evals))

def run_all(cfg: RootCfg):
    sns.set_style(cfg.style)
    ensure_dir(cfg.output_dir)
    tidy = build_tidy_from_csvs(cfg)
    if tidy.empty:
        logging.error("Tidy is empty, abort.")
        return
    tidy.to_csv(os.path.join(cfg.output_dir, "tidy_snapshot.csv"), index=False)
    overall = tidy.groupby(["evaluator","model"], as_index=False).score.mean().pivot(index="evaluator", columns="model", values="score")
    overall.to_csv(os.path.join(cfg.output_dir, "agg_overall_model.csv"))
    model_vs_lang = tidy.groupby(["evaluator","language","model"], as_index=False).score.mean()
    model_vs_lang.to_csv(os.path.join(cfg.output_dir, "agg_model_vs_language.csv"), index=False)
    model_vs_crit = tidy.groupby(["evaluator","criterion","model"], as_index=False).score.mean()
    model_vs_crit.to_csv(os.path.join(cfg.output_dir, "agg_model_vs_criteria.csv"), index=False)
    plt.figure(figsize=(8,5))
    overall_plot = overall.reset_index().melt(id_vars="evaluator", value_name="mean_score", var_name="model")
    sns.barplot(data=overall_plot, x="evaluator", y="mean_score", hue="model")
    plt.title("Overall model performance by evaluator (avg over lang+crit)")
    plt.ylabel("Mean score")
    plt.tight_layout()
    plt.savefig(os.path.join(cfg.output_dir, "overall_model_by_evaluator.png"), dpi=cfg.dpi)
    plt.close()
    evaluators_list = tidy.evaluator.unique().tolist()
    for ev in evaluators_list:
        sub = model_vs_lang[model_vs_lang.evaluator == ev]
        if sub.empty:
            continue
        plt.figure(figsize=(8,4))
        sns.barplot(data=sub, x="language", y="score", hue="model")
        plt.title(f"Model vs Language (evaluator = {ev})")
        plt.ylabel("Mean score")
        plt.tight_layout()
        plt.savefig(os.path.join(cfg.output_dir, f"model_vs_language_{ev}.png"), dpi=cfg.dpi)
        plt.close()
    for ev in evaluators_list:
        sub = model_vs_crit[model_vs_crit.evaluator == ev]
        if sub.empty:
            continue
        plt.figure(figsize=(8,4))
        sns.barplot(data=sub, x="criterion", y="score", hue="model")
        plt.title(f"Model vs Criterion (evaluator = {ev})")
        plt.ylabel("Mean score")
        plt.tight_layout()
        plt.savefig(os.path.join(cfg.output_dir, f"model_vs_criterion_{ev}.png"), dpi=cfg.dpi)
        plt.close()
    item_avg = item_level_matrix(tidy)
    item_avg.to_csv(os.path.join(cfg.output_dir, "item_level_means.csv"))
    corr = item_avg.corr(method="pearson")
    plt.figure(figsize=(max(5, corr.shape[1]*1.0), max(4, corr.shape[0]*0.8)))
    sns.heatmap(corr, annot=True, fmt=".3f", cmap="vlag", vmin=-1, vmax=1)
    plt.title("Inter-evaluator Pearson Correlation (item-level means)")
    plt.tight_layout()
    plt.savefig(os.path.join(cfg.output_dir, "evaluator_correlation_heatmap.png"), dpi=cfg.dpi)
    plt.close()
    corr.to_csv(os.path.join(cfg.output_dir, "evaluator_correlation.csv"))
    pa_exact = compute_percent_agreement_matrix(item_avg, tolerance=0.0, round_decimals=None)
    pa_exact.to_csv(os.path.join(cfg.output_dir, "pairwise_percent_agreement_exact.csv"))
    plt.figure(figsize=(max(5, pa_exact.shape[1]*1.0), max(4, pa_exact.shape[0]*0.8)))
    sns.heatmap(pa_exact.astype(float), annot=True, fmt=".3f", cmap="Greens", vmin=0, vmax=1)
    plt.title("Pairwise Percent Agreement (exact match)")
    plt.tight_layout()
    plt.savefig(os.path.join(cfg.output_dir, "pairwise_percent_agreement_exact.png"), dpi=cfg.dpi)
    plt.close()
    pa_rounded = compute_percent_agreement_matrix(item_avg, tolerance=0.0, round_decimals=3)
    pa_rounded.to_csv(os.path.join(cfg.output_dir, "pairwise_percent_agreement_rounded3.csv"))
    plt.figure(figsize=(max(5, pa_rounded.shape[1]*1.0), max(4, pa_rounded.shape[0]*0.8)))
    sns.heatmap(pa_rounded.astype(float), annot=True, fmt=".3f", cmap="Greens", vmin=0, vmax=1)
    plt.title("Pairwise Percent Agreement (rounded 3 decimals)")
    plt.tight_layout()
    plt.savefig(os.path.join(cfg.output_dir, "pairwise_percent_agreement_rounded3.png"), dpi=cfg.dpi)
    plt.close()
    kappa, overlap = compute_pairwise_kappa(item_avg, k_bins=3, custom_edges=None, min_overlap=3)
    kappa.to_csv(os.path.join(cfg.output_dir, "pairwise_kappa_matrix.csv"))
    plt.figure(figsize=(max(5, kappa.shape[1]*1.0), max(4, kappa.shape[0]*0.8)))
    sns.heatmap(kappa.astype(float), annot=True, fmt=".3f", cmap="rocket", vmin=0, vmax=1)
    plt.title("Pairwise Cohen's Kappa (discretized)")
    plt.tight_layout()
    plt.savefig(os.path.join(cfg.output_dir, "pairwise_kappa_heatmap.png"), dpi=cfg.dpi)
    plt.close()
    overlap.to_csv(os.path.join(cfg.output_dir, "pairwise_overlap_counts.csv"))
    llm_judges = [e.name for e in cfg.evaluators if e.name.lower() != "human"]
    vr_rows = []
    for crit in cfg.criteria:
        for lang in [l.name for l in cfg.languages]:
            human_scores = tidy[(tidy.evaluator=="human") & (tidy.criterion==crit) & (tidy.language==lang)].score.dropna().values
            llm_scores_all = tidy[(tidy.evaluator.isin(llm_judges)) & (tidy.criterion==crit) & (tidy.language==lang)].score.dropna().values
            if human_scores.size < 2 or llm_scores_all.size < 2:
                continue
            mean_r, lo, hi = bootstrap_variance_ratio(human_scores, llm_scores_all, n_boot=2000)
            vr_rows.append({"criterion": crit, "language": lang, "variance_ratio": mean_r, "ci_lo": lo, "ci_hi": hi,
                            "n_human": human_scores.size, "n_llm": llm_scores_all.size})
    vr_df = pd.DataFrame(vr_rows)
    vr_df.to_csv(os.path.join(cfg.output_dir, "variance_ratio_by_criterion_language.csv"), index=False)
    if not vr_df.empty:
        heat = vr_df.pivot(index="criterion", columns="language", values="variance_ratio")
        plt.figure(figsize=(max(6, heat.shape[1]*1.6), max(4, heat.shape[0]*1.0)))
        sns.heatmap(heat, annot=True, fmt=".2f", cmap="vlag", center=1.0)
        plt.title("Variance ratio (Human variance / mean LLM variance)")
        plt.tight_layout()
        plt.savefig(os.path.join(cfg.output_dir, "variance_ratio_heatmap.png"), dpi=cfg.dpi)
        plt.close()
    divergence_rows = []
    for crit in cfg.criteria:
        for lang in [l.name for l in cfg.languages]:
            human_scores = tidy[(tidy.evaluator=="human") & (tidy.criterion==crit) & (tidy.language==lang)].score.dropna().values
            if human_scores.size < 2:
                continue
            for ev in llm_judges:
                llm_scores = tidy[(tidy.evaluator==ev) & (tidy.criterion==crit) & (tidy.language==lang)].score.dropna().values
                if llm_scores.size < 2:
                    continue
                wass, p = permutation_test_wasserstein(human_scores, llm_scores, n_perm=2000)
                rng = np.random.default_rng(0)
                boots = []
                for _ in range(1000):
                    ha = rng.choice(human_scores, size=len(human_scores), replace=True)
                    la = rng.choice(llm_scores, size=len(llm_scores), replace=True)
                    boots.append(stats.wasserstein_distance(ha, la))
                boots = np.array(boots) if boots else np.array([])
                w_lo = float(np.percentile(boots, 2.5)) if boots.size else float(np.nan)
                w_hi = float(np.percentile(boots, 97.5)) if boots.size else float(np.nan)
                kl_mean, kl_lo, kl_hi = bootstrap_kl(human_scores, llm_scores, bins=4, n_boot=1000)
                divergence_rows.append({"criterion": crit, "language": lang, "llm_judge": ev, "wasserstein": wass, "wass_p": p, "wass_lo": w_lo, "wass_hi": w_hi, "kl_mean": kl_mean, "kl_lo": kl_lo, "kl_hi": kl_hi, "n_human": human_scores.size, "n_llm": llm_scores.size})
    div_df = pd.DataFrame(divergence_rows)
    if not div_df.empty:
        rejected, p_thresh = benjamini_hochberg(div_df["wass_p"].tolist(), alpha=0.05)
        div_df["wass_fdr_reject"] = rejected
        div_df["wass_fdr_threshold"] = p_thresh
    div_df.to_csv(os.path.join(cfg.output_dir, "divergence_metrics_by_crit_lang.csv"), index=False)
    if not div_df.empty:
        plt.figure(figsize=(10,5))
        sns.barplot(data=div_df, x="criterion", y="wasserstein", hue="llm_judge")
        plt.title("Wasserstein distance between human and LLM-judges by criterion")
        plt.tight_layout()
        plt.savefig(os.path.join(cfg.output_dir, "wass_by_criterion.png"), dpi=cfg.dpi)
        plt.close()
        for ev in div_df.llm_judge.unique():
            sub = div_df[div_df.llm_judge==ev]
            heat = sub.pivot(index="criterion", columns="language", values="kl_mean")
            if heat is None or heat.empty:
                continue
            plt.figure(figsize=(max(6, heat.shape[1]*1.6), max(4, heat.shape[0]*1.0)))
            sns.heatmap(heat, annot=True, fmt=".3f", cmap="mako")
            plt.title(f"KL (human || {ev}) by criterion & language")
            plt.tight_layout()
            plt.savefig(os.path.join(cfg.output_dir, f"kl_heatmap_{ev}.png"), dpi=cfg.dpi)
            plt.close()
    mean_diff_rows = []
    for crit in cfg.criteria:
        for lang in [l.name for l in cfg.languages]:
            for model in cfg.base_models:
                h = tidy[(tidy.evaluator=="human") & (tidy.criterion==crit) & (tidy.language==lang) & (tidy.model==model)].score.dropna().values
                if h.size < 2:
                    continue
                for ev in llm_judges:
                    l = tidy[(tidy.evaluator==ev) & (tidy.criterion==crit) & (tidy.language==lang) & (tidy.model==model)].score.dropna().values
                    if l.size < 2:
                        continue
                    rng = np.random.default_rng(0)
                    diffs = []
                    n_boot = 2000
                    for _ in range(n_boot):
                        ha = rng.choice(h, size=len(h), replace=True)
                        la = rng.choice(l, size=len(l), replace=True)
                        diffs.append(np.nanmean(ha) - np.nanmean(la))
                    diffs = np.array(diffs)
                    mean_diff = float(np.nanmean(diffs)); lo = float(np.percentile(diffs, 2.5)); hi = float(np.percentile(diffs, 97.5))
                    p_boot = float((np.sum(np.abs(diffs) >= np.abs(mean_diff)) + 1) / (n_boot + 1))
                    mean_diff_rows.append({"criterion": crit, "language": lang, "model": model, "llm_judge": ev, "mean_diff": mean_diff, "ci_lo": lo, "ci_hi": hi, "p_boot": p_boot, "n_human": h.size, "n_llm": l.size})
    mean_df = pd.DataFrame(mean_diff_rows)
    if not mean_df.empty:
        rejected, p_thresh = benjamini_hochberg(mean_df["p_boot"].tolist(), alpha=0.05)
        mean_df["fdr_reject"] = rejected
        mean_df["fdr_threshold"] = p_thresh
        mean_df.to_csv(os.path.join(cfg.output_dir, "mean_diff_human_minus_llm_with_fdr.csv"), index=False)
        mean_df['p_for_plot'] = mean_df['p_boot'].clip(lower=1e-10)
        mean_df['neglogp'] = -np.log10(mean_df['p_for_plot'])
        plt.figure(figsize=(8,5))
        sns.scatterplot(data=mean_df, x='mean_diff', y='neglogp', hue='fdr_reject', palette={True:'red', False:'gray'}, alpha=0.8)
        plt.axvline(0, color='k', linestyle='--')
        plt.xlabel("Mean(Human) - Mean(LLM judge)")
        plt.ylabel("-log10(p_boot)")
        plt.title("Mean difference (human - LLM) with FDR flags")
        plt.tight_layout()
        plt.savefig(os.path.join(cfg.output_dir, "mean_diff_volcano.png"), dpi=cfg.dpi)
        plt.close()
    dom_val, dom_vec, model_names = spectral_dominant_direction(tidy)
    if dom_vec is not None:
        pd.DataFrame({"model": model_names, "dominant_weight": dom_vec}).to_csv(os.path.join(cfg.output_dir, "spectral_dominant_vector.csv"), index=False)
        plt.figure(figsize=(7,4))
        sns.barplot(x=model_names, y=dom_vec)
        plt.title("Dominant spectral direction across models (avg over evaluators)")
        plt.tight_layout()
        plt.savefig(os.path.join(cfg.output_dir, "spectral_dominant_evec.png"), dpi=cfg.dpi)
        plt.close()
        boot_stats = bootstrap_spectral_vector(tidy, n_boot=500)
        if boot_stats is not None:
            mean_boot, std_boot, models_boot = boot_stats
            pd.DataFrame({"model": models_boot, "boot_mean": mean_boot, "boot_std": std_boot}).to_csv(os.path.join(cfg.output_dir, "spectral_bootstrap_stats.csv"), index=False)
    op_results = fit_meta_operators(tidy, cfg, min_common=5, n_boot=500)
    if not op_results.empty:
        plt.figure(figsize=(max(6, op_results.shape[0]*0.6), 4))
        op_results['pair'] = op_results['from'] + "→" + op_results['to']
        sns.barplot(data=op_results, x='pair', y='rho_point')
        for i, r in op_results.iterrows():
            rho_pt = r.get('rho_point', np.nan)
            rho_lo = r.get('rho_ci_lo', np.nan)
            rho_hi = r.get('rho_ci_hi', np.nan)
            try:
                rho_pt = float(rho_pt)
            except Exception:
                rho_pt = np.nan
            try:
                rho_lo = float(rho_lo)
                rho_hi = float(rho_hi)
            except Exception:
                rho_lo = rho_hi = np.nan
            if np.isfinite(rho_pt) and np.isfinite(rho_lo) and np.isfinite(rho_hi):
                lower = rho_pt - rho_lo
                upper = rho_hi - rho_pt
                if not (lower >= 0 and upper >= 0):
                    logging.warning("Invalid CI for operator %s->%s: rho_pt=%s, lo=%s, hi=%s. Clamping to non-negative widths.",
                                    r['from'], r['to'], rho_pt, rho_lo, rho_hi)
                    lower = max(0.0, lower if np.isfinite(lower) else 0.0)
                    upper = max(0.0, upper if np.isfinite(upper) else 0.0)
                yerr = np.array([[lower], [upper]])
                plt.errorbar(i, rho_pt, yerr=yerr, fmt='none', color='k', capsize=4)
            else:
                logging.info("Skipping errorbar for operator %s->%s due to missing CI.", r['from'], r['to'])
        plt.xticks(rotation=45, ha='right')
        plt.title("Operator spectral radii (ρ) per evaluator pair")
        plt.tight_layout()
        plt.savefig(os.path.join(cfg.output_dir, "operator_rho_barplot.png"), dpi=cfg.dpi)
        plt.close()
    per_item_df = compute_item_avg_across_criteria_and_languages(tidy)
    per_item_df.to_csv(os.path.join(cfg.output_dir, "per_item_mean_scores_raw.csv"), index=False)
    plot_violin_avg_all(cfg, per_item_df, min_items_for_violin=5)
    logging.info("All analyses complete. Outputs saved in %s", cfg.output_dir)

def setup_logging():
    """Set up root logging at INFO level with timestamped records.

    Each record is rendered as "<time> <level> <message>". Note that
    ``logging.basicConfig`` leaves the root logger untouched when it already
    has handlers attached.
    """
    record_format = "%(asctime)s %(levelname)s %(message)s"
    logging.basicConfig(format=record_format, level=logging.INFO)

def parse_args(argv: Optional[List[str]] = None) -> argparse.Namespace:
    """Parse the command-line options of the analysis pipeline.

    Args:
        argv: Argument strings to parse, excluding the program name. When
            ``None`` (the default) argparse falls back to ``sys.argv[1:]``,
            which keeps the original no-argument call working unchanged.

    Returns:
        The parsed namespace; ``config`` holds the value given to
        ``--config``.

    Raises:
        SystemExit: Raised by argparse when ``--config`` is missing or an
            unknown option is supplied.
    """
    ap = argparse.ArgumentParser(description="Extended evaluator analysis pipeline.")
    ap.add_argument("--config", required=True, help="Path to YAML config.")
    # Forwarding argv lets callers and tests drive the parser without
    # having to mutate the process-wide sys.argv.
    return ap.parse_args(argv)

def main():
    """Command-line entry point: parse options, load the config, run the pipeline.

    Steps, in order: parse the CLI arguments, configure logging, load the
    YAML configuration named by ``--config``, create the configured output
    directory, and hand the configuration to ``run_all``.
    """
    cli_options = parse_args()
    setup_logging()
    config = load_config(cli_options.config)
    # The output directory is created before the analyses run.
    ensure_dir(config.output_dir)
    run_all(config)

# Run the pipeline only when this file is executed as a script, so that
# importing the module does not trigger argument parsing or any analysis.
if __name__ == "__main__":
    main()