import os
import glob
import argparse
import logging
from dataclasses import dataclass
from typing import List, Optional, Dict, Any
import yaml
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.colors import LinearSegmentedColormap

@dataclass
class EvaluatorCfg:
    """One evaluator whose scores are read from the input CSV files.

    ``name`` is the label written to the ``evaluator`` column of the tidy data
    (see ``build_tidy``). ``suffix`` is appended after ``_eval`` when score
    column names are built (see ``score_col``).
    """
    name: str
    suffix: str

@dataclass
class LanguageCfg:
    """A language label plus the filename substrings that identify it.

    ``infer_language`` assigns ``name`` to a CSV file when any entry of
    ``match_substrings`` occurs, case-insensitively, in the file's basename.
    """
    name: str
    match_substrings: List[str]

@dataclass
class PlotFacetCfg:
    """Layout and styling options for one bar-chart figure.

    One instance is loaded per plot section (see ``_load_plot_section``). Not
    every option is read by every plotting function; the comments below note
    where each group is consumed in this file.
    """
    # Figure size: ``facet_bar`` multiplies these by the grid's column/row count.
    fig_width_per_col: float = 5.4
    fig_height_per_row: float = 4.2
    # Upper bound on the number of facet columns in ``facet_bar``.
    max_cols: int = 3
    # Forwarded to ``sns.barplot(width=...)``.
    bar_width: float = 0.55
    # Forwarded to ``ax.margins(x=...)``.
    category_margin: float = 0.05
    # Rotation applied to the x tick labels.
    x_label_rotation: int = 25
    # Font sizes for the figure title, per-facet titles and bar value labels.
    title_fontsize: int = 15
    facet_title_fontsize: int = 11
    annotation_fontsize: int = 8
    # Number of legend columns; when unset the plotting code falls back to the
    # number of models.
    legend_ncol: Optional[int] = None
    # Legend text and spacing options; in this file they are only passed to the
    # figure legend drawn by ``plot_overall_models_combined``.
    legend_fontsize: int = 10
    legend_title_fontsize: int = 11
    legend_labelspacing: float = 0.6
    legend_columnspacing: float = 1.0
    legend_handlelength: float = 1.4
    # Explicit y anchor for the legend in ``plot_overall_models_combined``.
    legend_anchor_y: Optional[float] = None
    # Whether ``facet_bar`` draws the shared figure-level legend.
    top_legend: bool = True
    # Y-axis range, tick spacing and extra space above ``y_max``
    # (see ``apply_y_limits``).
    y_min: float = 0.0
    y_max: float = 5.0
    y_tick_step: float = 1.0
    y_headroom: float = 0.05
    # Y position of the suptitle in ``facet_bar``.
    suptitle_y: float = 1.04
    # Whether ``tight_layout`` is applied before saving.
    tighten: bool = True
    # Padding passed to ``tight_layout`` by ``plot_overall_models_combined``.
    tight_layout_pad: float = 0.5
    # Title/legend placement used by ``plot_overall_models_combined`` when
    # ``auto_legend_below_title`` is set: the legend is anchored
    # ``title_legend_gap`` below ``title_y`` unless ``legend_anchor_y`` is given.
    title_y: Optional[float] = None
    title_legend_gap: float = 0.05
    auto_legend_below_title: bool = True
    # When true, ``run`` draws the overall chart as a single panel
    # (``plot_overall_models_combined``) instead of one facet per evaluator.
    combined_single_panel: bool = False
    # Optional explicit figure size for that single-panel chart.
    single_panel_fig_width: Optional[float] = None
    single_panel_fig_height: Optional[float] = None

@dataclass
class AgreementCfg:
    """Options for the pairwise evaluator-agreement outputs produced by ``run``."""
    # Toggle for the Cohen's kappa heatmap and its CSV/summary files.
    enable_pairwise_kappa: bool = True
    # Number of quantile bins used when scores must be discretized for kappa
    # (see ``_pairwise_discretize``).
    kappa_bins: int = 3
    # Evaluator pairs with fewer shared items than this get NaN.
    min_overlap_items: int = 3
    # If set, NaN cells are replaced by this value in the plotted matrix only;
    # the CSV files keep the NaN values.
    fill_na_with: Optional[float] = None
    # Toggle for the percent-agreement heatmap and its CSV/summary files.
    enable_percent_agreement: bool = True
    # Two item scores count as a match when their absolute difference is at
    # most this value.
    percent_match_tolerance: float = 0.0
    # If set, scores are rounded to this many decimals before comparison.
    percent_round_decimals: Optional[int] = None
    # Optional explicit bin edges for kappa, tried before the other
    # discretization strategies.
    custom_kappa_edges: Optional[List[float]] = None

@dataclass
class RootCfg:
    """Complete run configuration as assembled by ``load_config``."""
    # From the ``evaluation`` section: model names and criteria that appear in
    # score column names, plus the evaluators to read.
    base_models: List[str]
    criteria: List[str]
    evaluators: List[EvaluatorCfg]
    # From the top-level ``languages`` list.
    languages: List[LanguageCfg]
    # From the ``data`` section: where to look for CSV files and how to filter
    # them (``filename_filters`` is read from the key
    # ``include_only_filename_substrings``).
    csv_root: str
    recursive: bool
    filename_filters: List[str]
    skip_unknown_language: bool
    # From the ``plots`` section: output location, resolution and seaborn style.
    output_dir: str
    dpi: int
    style: str
    # Per-figure layout options, one sub-section of ``plots`` each.
    model_vs_language_cfg: PlotFacetCfg
    model_vs_criterion_cfg: PlotFacetCfg
    overall_model_cfg: PlotFacetCfg
    # From the ``agreement`` section.
    agreement: AgreementCfg

def _load_plot_section(plots_raw: Dict[str, Any], key: str) -> PlotFacetCfg:
    """Build the ``PlotFacetCfg`` for one sub-section of the ``plots`` mapping.

    Args:
        plots_raw: The parsed ``plots`` mapping from the YAML config.
        key: Name of the sub-section, e.g. ``"model_vs_language"``.

    Returns:
        A ``PlotFacetCfg`` in which every option present in the section
        overrides the dataclass default. Absent options keep that default and
        keys that are not ``PlotFacetCfg`` fields are ignored.
    """
    from dataclasses import fields

    # A section written as a bare ``key:`` in YAML parses to None; treat it
    # like a missing section instead of failing on ``None.get``.
    section = plots_raw.get(key) or {}
    # The defaults live on the dataclass, so only explicit overrides are
    # forwarded; this keeps the loader and the dataclass from drifting apart.
    overrides = {
        f.name: section[f.name]
        for f in fields(PlotFacetCfg)
        if f.name in section
    }
    return PlotFacetCfg(**overrides)

def load_config(path: str) -> RootCfg:
    """Read the YAML file at ``path`` and convert it into a ``RootCfg``.

    The ``evaluation`` and ``data`` sections are required. ``languages``,
    ``plots`` and ``agreement`` are optional and may be absent or empty.

    Args:
        path: Location of the YAML configuration file.

    Returns:
        The populated ``RootCfg``.

    Raises:
        ValueError: If the file does not contain a mapping at the top level
            (for example, an empty file).
        KeyError: If a required section or required key is missing.
    """
    with open(path, "r", encoding="utf-8") as f:
        raw = yaml.safe_load(f)
    if not isinstance(raw, dict):
        raise ValueError(
            f"Config file {path!r} must contain a YAML mapping at the top level."
        )

    eval_raw = raw["evaluation"]
    data_raw = raw["data"]
    # Optional sections written as a bare ``key:`` parse to None; ``or``
    # normalises them to an empty container so the ``.get`` calls below work.
    plots_raw = raw.get("plots") or {}
    agreement_raw = raw.get("agreement") or {}
    languages_raw = raw.get("languages") or []

    evaluators = [EvaluatorCfg(**e) for e in eval_raw["evaluators"]]
    languages = [LanguageCfg(**l) for l in languages_raw]
    agreement_cfg = AgreementCfg(
        enable_pairwise_kappa=agreement_raw.get("enable_pairwise_kappa", True),
        kappa_bins=agreement_raw.get("kappa_bins", 3),
        min_overlap_items=agreement_raw.get("min_overlap_items", 3),
        fill_na_with=agreement_raw.get("fill_na_with"),
        enable_percent_agreement=agreement_raw.get("enable_percent_agreement", True),
        percent_match_tolerance=agreement_raw.get("percent_match_tolerance", 0.0),
        percent_round_decimals=agreement_raw.get("percent_round_decimals"),
        custom_kappa_edges=agreement_raw.get("custom_kappa_edges"),
    )
    return RootCfg(
        base_models=eval_raw["base_models"],
        criteria=eval_raw["criteria"],
        evaluators=evaluators,
        languages=languages,
        csv_root=data_raw["csv_root"],
        recursive=data_raw.get("recursive", True),
        filename_filters=data_raw.get("include_only_filename_substrings", []),
        skip_unknown_language=data_raw.get("skip_unknown_language", True),
        output_dir=plots_raw.get("output_dir", "output"),
        dpi=plots_raw.get("dpi", 150),
        style=plots_raw.get("style", "white"),
        model_vs_language_cfg=_load_plot_section(plots_raw, "model_vs_language"),
        model_vs_criterion_cfg=_load_plot_section(plots_raw, "model_vs_criterion"),
        overall_model_cfg=_load_plot_section(plots_raw, "overall_model"),
        agreement=agreement_cfg,
    )

def ensure_dir(p: str):
    """Create directory ``p`` together with any missing parents.

    Does nothing when the directory already exists.
    """
    if not os.path.isdir(p):
        os.makedirs(p, exist_ok=True)

def discover_csv_files(root: str, recursive: bool, filters: List[str]) -> List[str]:
    """Return the sorted paths of the ``*.csv`` files found under ``root``.

    Args:
        root: Directory to search. It is taken literally, so characters such
            as ``[``, ``*`` or ``?`` in the directory name are not treated as
            glob wildcards.
        recursive: When true, sub-directories are searched as well.
        filters: If non-empty, only files whose basename contains at least one
            of these substrings (case-insensitive) are kept.

    Returns:
        The matching paths in sorted order; empty when nothing matches.
    """
    pattern = "**/*.csv" if recursive else "*.csv"
    # Escape the root so that only the trailing pattern carries wildcards.
    search = os.path.join(glob.escape(root), pattern)
    files = glob.glob(search, recursive=recursive)
    if filters:
        needles = [s.lower() for s in filters]
        files = [
            f for f in files
            if any(needle in os.path.basename(f).lower() for needle in needles)
        ]
    return sorted(files)

def infer_language(path: str, languages: List[LanguageCfg]) -> Optional[str]:
    """Return the name of the first configured language matching ``path``.

    A language matches when one of its ``match_substrings`` occurs in the
    lower-cased basename of ``path``. Languages are tried in the given order;
    ``None`` is returned when none matches.
    """
    basename = os.path.basename(path).lower()
    matching_names = (
        candidate.name
        for candidate in languages
        if any(token.lower() in basename for token in candidate.match_substrings)
    )
    return next(matching_names, None)

def score_col(crit: str, model: str, suffix: str) -> str:
    """Return the CSV column name holding ``model``'s score for ``crit``.

    The name has the form ``score_<crit>_<model>_eval<suffix>``.
    """
    return "score_{}_{}_eval{}".format(crit, model, suffix)

def build_tidy(df: pd.DataFrame, evaluator: EvaluatorCfg, base_models: List[str], criteria: List[str], language: str, rel_file: str) -> pd.DataFrame:
    """Reshape one evaluator's wide score columns into long ("tidy") rows.

    For every row of ``df`` and every (model, criterion) pair whose score
    column exists, one record is emitted unless the cell is missing. Records
    carry the evaluator name, model, criterion, language, score, source file
    and the original row index. An empty frame is returned when nothing
    qualifies.
    """
    # Resolve the available score columns once; they do not depend on the row.
    targets = []
    for model in base_models:
        for crit in criteria:
            column = score_col(crit, model, evaluator.suffix)
            if column in df.columns:
                targets.append((model, crit, column))

    records = []
    for row_idx, row in df.iterrows():
        for model, crit, column in targets:
            score = row.get(column, np.nan)
            if pd.isna(score):
                continue
            records.append({
                "evaluator": evaluator.name,
                "model": model,
                "criterion": crit,
                "language": language,
                "score": score,
                "source_file": rel_file,
                "row_index": row_idx,
            })
    return pd.DataFrame(records)

def assemble_tidy(cfg: RootCfg) -> pd.DataFrame:
    """Collect the scores of every configured evaluator from all CSV files.

    Files are discovered under ``cfg.csv_root``. A file whose language cannot
    be inferred is skipped when ``cfg.skip_unknown_language`` is set and
    labelled ``"unknown"`` otherwise. Files that cannot be read are skipped
    with a logged warning rather than aborting the run.

    Args:
        cfg: The run configuration.

    Returns:
        The concatenated tidy rows of all files and evaluators, or an empty
        ``DataFrame`` when no file yields any score.
    """
    log = logging.getLogger(__name__)
    files = discover_csv_files(cfg.csv_root, cfg.recursive, cfg.filename_filters)
    if not files:
        log.warning("No CSV files found under %s", cfg.csv_root)
        return pd.DataFrame()

    parts = []
    for fp in files:
        lang = infer_language(fp, cfg.languages)
        if lang is None:
            if cfg.skip_unknown_language:
                log.info("Skipping %s: no configured language matches its filename", fp)
                continue
            lang = "unknown"
        try:
            df = pd.read_csv(fp, encoding="utf-8")
        except Exception:
            # Best effort: one unreadable file must not stop the whole run,
            # but the failure is reported instead of being swallowed.
            log.warning("Skipping %s: could not be read as CSV", fp, exc_info=True)
            continue

        rel_file = os.path.relpath(fp, cfg.csv_root)
        columns = set(df.columns)
        for ev in cfg.evaluators:
            # Consider every criterion, not only the first one, so a file that
            # lacks just the first criterion still contributes its other
            # scores. This also avoids indexing into an empty criteria list.
            has_scores = any(
                score_col(crit, model, ev.suffix) in columns
                for crit in cfg.criteria
                for model in cfg.base_models
            )
            if not has_scores:
                continue
            t = build_tidy(df, ev, cfg.base_models, cfg.criteria, lang, rel_file)
            if not t.empty:
                parts.append(t)

    if not parts:
        return pd.DataFrame()
    return pd.concat(parts, ignore_index=True)

def agg_model_language(tidy: pd.DataFrame) -> pd.DataFrame:
    """Return mean score and score count per (evaluator, language, model)."""
    group_keys = ["evaluator", "language", "model"]
    grouped = tidy.groupby(group_keys, as_index=False)
    return grouped.agg(
        mean_score=pd.NamedAgg(column="score", aggfunc="mean"),
        n=pd.NamedAgg(column="score", aggfunc="count"),
    )

def agg_model_criteria(tidy: pd.DataFrame) -> pd.DataFrame:
    """Return mean score and score count per (evaluator, criterion, model)."""
    group_keys = ["evaluator", "criterion", "model"]
    grouped = tidy.groupby(group_keys, as_index=False)
    return grouped.agg(
        mean_score=pd.NamedAgg(column="score", aggfunc="mean"),
        n=pd.NamedAgg(column="score", aggfunc="count"),
    )

def agg_overall_model(tidy: pd.DataFrame) -> pd.DataFrame:
    """Return mean score and score count per (evaluator, model)."""
    group_keys = ["evaluator", "model"]
    grouped = tidy.groupby(group_keys, as_index=False)
    return grouped.agg(
        mean_score=pd.NamedAgg(column="score", aggfunc="mean"),
        n=pd.NamedAgg(column="score", aggfunc="count"),
    )

def item_level_matrix(tidy: pd.DataFrame) -> pd.DataFrame:
    """Return an item-by-evaluator matrix of per-item mean scores.

    An item is one (source file, row index, model) combination; its value for
    an evaluator is the mean of that evaluator's scores for the item. The
    result is indexed by ``"<source_file>::<row_index>::<model>"`` with one
    column per evaluator.
    """
    keys = ["evaluator", "source_file", "row_index", "model"]
    per_item = tidy.groupby(keys, as_index=False).agg(item_mean=("score", "mean"))
    file_part, row_part, model_part = (
        per_item[column].astype(str)
        for column in ("source_file", "row_index", "model")
    )
    per_item["item_id"] = file_part + "::" + row_part + "::" + model_part
    return per_item.pivot(index="item_id", columns="evaluator", values="item_mean")

def apply_y_limits(ax, y_min, y_max, step, headroom):
    """Set the y range and tick positions of ``ax``.

    Args:
        ax: Axes-like object providing ``set_ylim`` and ``set_yticks``.
        y_min: Lower axis limit and first tick.
        y_max: Last tick; the axis itself extends to ``y_max + headroom``.
        step: Tick spacing. A non-positive step, or one at least as large as
            the range, yields just the two ticks ``y_min`` and ``y_max``.
        headroom: Extra space above ``y_max``.
    """
    ax.set_ylim(y_min, y_max + headroom)
    if step <= 0 or step >= (y_max - y_min):
        ticks = [y_min, y_max]
    else:
        ticks = [float(t) for t in np.arange(y_min, y_max + 1e-9, step)]
        # Accumulated float error can leave the last generated tick a hair
        # away from y_max (0.3 * 3 != 0.9). Snap it instead of appending a
        # second, visually overlapping tick.
        if abs(ticks[-1] - y_max) <= step * 1e-6:
            ticks[-1] = y_max
        else:
            ticks.append(y_max)
    ax.set_yticks(ticks)

def annotate_bars(ax, fmt="%.2f", offset=2, fontsize=8):
    """Write each bar's height above it.

    Every patch of ``ax`` whose height is a number gets a centred text label
    formatted with ``fmt`` and shifted ``offset`` points upwards. Patches with
    a missing or NaN height are left unlabelled.
    """
    for bar in ax.patches:
        height = bar.get_height()
        if height is not None and not np.isnan(height):
            center_x = bar.get_x() + bar.get_width() / 2
            ax.annotate(
                fmt % height,
                (center_x, height),
                ha="center",
                va="bottom",
                fontsize=fontsize,
                xytext=(0, offset),
                textcoords="offset points",
            )

def facet_bar(df: pd.DataFrame, cfg: RootCfg, plot_cfg: PlotFacetCfg, facet_col: str, x: str, hue: str, x_order: List[str], model_order: List[str], evaluator_order: List[str], title: str, out_path: str):
    """Draw a grid of grouped bar charts, one panel per facet value, and save it.

    Args:
        df: Aggregated data with a ``mean_score`` column plus the columns named
            by ``facet_col``, ``x`` and ``hue``.
        cfg: Run configuration; supplies the output ``dpi``.
        plot_cfg: Layout and styling options for this figure.
        facet_col: Column whose values select the panels.
        x: Column plotted on the category axis.
        hue: Column that determines the bar colour.
        x_order: Preferred order of the ``x`` categories.
        model_order: Preferred order of the ``hue`` levels; also fixes which
            colour each level receives.
        evaluator_order: Preferred order of the panels.
        title: Figure title.
        out_path: File the figure is written to.

    Nothing is drawn when ``df`` is empty or contains none of the values in
    ``evaluator_order``.
    """
    if df.empty:
        return
    facets_in_data = set(df[facet_col].unique())
    facets = [e for e in evaluator_order if e in facets_in_data]
    n = len(facets)
    if n == 0:
        # No panel to draw; the grid arithmetic below would divide by zero.
        return
    cols = max(1, min(plot_cfg.max_cols, n))
    rows = (n + cols - 1) // cols
    fig_w = plot_cfg.fig_width_per_col * cols
    fig_h = plot_cfg.fig_height_per_row * rows
    fig, axes = plt.subplots(rows, cols, figsize=(fig_w, fig_h), squeeze=False)

    # Bind each model to a fixed colour. With a plain list seaborn hands out
    # colours by position in the per-panel ``hue_order``, so a model missing
    # from one panel would shift the colours of the others and the shared
    # legend would no longer match the bars.
    colors = sns.color_palette("muted", n_colors=len(model_order))
    color_by_model = dict(zip(model_order, colors))

    for idx, ev in enumerate(facets):
        ax = axes[idx // cols][idx % cols]
        sub = df[df[facet_col] == ev]
        x_in_panel = set(sub[x].unique())
        hue_in_panel = set(sub[hue].unique())
        sns.barplot(
            data=sub,
            x=x,
            y="mean_score",
            hue=hue,
            order=[v for v in x_order if v in x_in_panel],
            hue_order=[m for m in model_order if m in hue_in_panel],
            palette=color_by_model,
            errorbar=None,
            ax=ax,
            width=plot_cfg.bar_width,
        )
        # A single figure-level legend is used instead of one per panel.
        panel_legend = ax.get_legend()
        if panel_legend is not None:
            panel_legend.remove()
        ax.set_title(ev, fontsize=plot_cfg.facet_title_fontsize, pad=8)
        ax.set_xlabel(x.capitalize())
        ax.set_ylabel("Mean score")
        for label in ax.get_xticklabels():
            label.set_rotation(plot_cfg.x_label_rotation)
        apply_y_limits(ax, plot_cfg.y_min, plot_cfg.y_max, plot_cfg.y_tick_step, plot_cfg.y_headroom)
        annotate_bars(ax, fontsize=plot_cfg.annotation_fontsize)
        ax.margins(x=plot_cfg.category_margin)

    # Hide the unused cells of the grid.
    for j in range(n, rows * cols):
        axes[j // cols][j % cols].axis("off")

    if plot_cfg.top_legend:
        models_in_data = set(df[hue].unique())
        legend_models = [m for m in model_order if m in models_in_data]
        handles = [
            plt.Rectangle((0, 0), 1, 1, color=color_by_model[m])
            for m in legend_models
        ]
        legend_ncol = plot_cfg.legend_ncol or len(model_order)
        fig.legend(handles, legend_models, loc="upper center", bbox_to_anchor=(0.5, 1.01), frameon=False, ncol=legend_ncol, title="Model")
    fig.suptitle(title, fontsize=plot_cfg.title_fontsize, y=plot_cfg.suptitle_y)
    if plot_cfg.tighten:
        fig.tight_layout(rect=[0.02, 0.02, 0.98, 0.97])
    fig.savefig(out_path, dpi=cfg.dpi, bbox_inches="tight")
    plt.close(fig)

def plot_evaluator_correlation(tidy: pd.DataFrame, cfg: RootCfg, out_path: str):
    """Plot and export the Pearson correlation between evaluators.

    Correlations are computed on the per-item mean scores returned by
    ``item_level_matrix``. The heatmap is saved to ``out_path`` and the matrix
    to a CSV file next to it (same name, ``.csv`` extension). The colour scale
    spans 0 to 1. Nothing is produced when fewer than two evaluators have data.

    Args:
        tidy: Long-format scores.
        cfg: Run configuration; supplies evaluator order and ``dpi``.
        out_path: Image file to write.
    """
    wide = item_level_matrix(tidy)
    if wide.shape[1] < 2:
        return
    corr = wide.corr(method="pearson")
    eval_order = [e.name for e in cfg.evaluators if e.name in corr.columns]
    corr = corr.loc[eval_order, eval_order]
    base = sns.color_palette("muted", n_colors=6)
    cmap = LinearSegmentedColormap.from_list("muted_continuous", base, N=256)
    side = corr.shape[1]
    fig, ax = plt.subplots(figsize=(1 + 1.0 * side, 1 + 0.9 * side))
    sns.heatmap(corr, annot=True, fmt=".3f", cmap=cmap, vmin=0.0, vmax=1.0, linewidths=0.4, linecolor="#ffffff", ax=ax)
    ax.set_title("Inter-Annotator Pearson Correlation")
    fig.tight_layout()
    fig.savefig(out_path, dpi=cfg.dpi)
    plt.close(fig)
    # Swap only the final extension. ``str.replace`` would also rewrite a
    # ".png" occurring in a directory name and would leave a path without
    # ".png" unchanged, so the CSV would overwrite the image.
    stem, _ = os.path.splitext(out_path)
    corr.to_csv(stem + ".csv")

def plot_overall_models_combined(overall_df: pd.DataFrame, cfg: RootCfg, overall_cfg: PlotFacetCfg, out_path: str):
    """Draw the overall per-model means of all evaluators in one panel.

    Evaluators are placed on the x axis and models are distinguished by
    colour. With ``overall_cfg.auto_legend_below_title`` the title and legend
    are figure-level and stacked above the axes; otherwise both are attached
    to the axes.

    Args:
        overall_df: Output of ``agg_overall_model``.
        cfg: Run configuration; supplies evaluator/model order and ``dpi``.
        overall_cfg: Layout and styling options for this figure.
        out_path: Image file to write.

    Nothing is drawn when ``overall_df`` is empty.
    """
    if overall_df.empty:
        return
    pcfg = overall_cfg
    plot_title = "Overall Model Performance (Lang+Criterion Averaged)"
    evaluators_in_data = set(overall_df["evaluator"].unique())
    models_in_data = set(overall_df["model"].unique())
    evaluator_order = [e.name for e in cfg.evaluators if e.name in evaluators_in_data]
    model_order = cfg.base_models
    legend_models = [m for m in model_order if m in models_in_data]

    fig_w = pcfg.single_panel_fig_width or max(3.0, pcfg.fig_width_per_col * len(evaluator_order))
    fig_h = pcfg.single_panel_fig_height or pcfg.fig_height_per_row
    fig, ax = plt.subplots(figsize=(fig_w, fig_h))
    palette = sns.color_palette("muted", n_colors=len(model_order))
    sns.barplot(data=overall_df, x="evaluator", y="mean_score", hue="model", order=evaluator_order, hue_order=legend_models, palette=palette, errorbar=None, ax=ax, width=pcfg.bar_width)
    if ax.legend_:
        ax.legend_.remove()
    handles = [plt.Rectangle((0, 0), 1, 1, color=palette[i % len(palette)]) for i, _ in enumerate(legend_models)]
    legend_ncol = pcfg.legend_ncol or len(legend_models) or 1

    # Top edge, as a fraction of the figure height, that the axes may occupy
    # when the title and legend are figure-level. None means no reservation.
    reserved_top = None
    if pcfg.auto_legend_below_title:
        title_y = pcfg.title_y or 1.05
        legend_y = pcfg.legend_anchor_y or (title_y - pcfg.title_legend_gap)
        if legend_y >= title_y:
            # Keep the legend strictly below the title.
            legend_y = title_y - 0.02
        fig.suptitle(plot_title, fontsize=pcfg.title_fontsize, y=title_y)
        fig.legend(handles, legend_models, loc="upper center", bbox_to_anchor=(0.5, legend_y), frameon=False, ncol=legend_ncol, fontsize=pcfg.legend_fontsize, title="Model", title_fontsize=pcfg.legend_title_fontsize, labelspacing=pcfg.legend_labelspacing, columnspacing=pcfg.legend_columnspacing, handlelength=pcfg.legend_handlelength)
        reserved_top = max(0.30, min(0.90, legend_y - 0.05))
    else:
        ax.set_title(plot_title, fontsize=pcfg.title_fontsize, pad=10)
        ax.legend(handles, legend_models, loc="upper center", bbox_to_anchor=(0.5, pcfg.legend_anchor_y or 1.10), frameon=False, ncol=legend_ncol, title="Model")

    ax.set_xlabel("Evaluator")
    ax.set_ylabel("Mean score")
    for tick in ax.get_xticklabels():
        tick.set_rotation(pcfg.x_label_rotation)
    apply_y_limits(ax, pcfg.y_min, pcfg.y_max, pcfg.y_tick_step, pcfg.y_headroom)
    annotate_bars(ax, fontsize=pcfg.annotation_fontsize)
    ax.margins(x=pcfg.category_margin)

    # ``tight_layout`` recomputes the subplot parameters, so a prior
    # ``subplots_adjust(top=...)`` would be discarded. Pass the reserved
    # headroom as the layout rectangle so the legend keeps its space.
    if pcfg.tighten:
        if reserved_top is None:
            fig.tight_layout(pad=pcfg.tight_layout_pad)
        else:
            fig.tight_layout(pad=pcfg.tight_layout_pad, rect=(0, 0, 1, reserved_top))
    elif reserved_top is not None:
        fig.subplots_adjust(top=reserved_top)
    fig.savefig(out_path, dpi=cfg.dpi, bbox_inches="tight")
    plt.close(fig)

def plot_overall_combined(overall_df: pd.DataFrame, cfg: RootCfg, out_path: str):
    """Draw the overall per-model means of all evaluators in a single panel.

    Evaluators are on the x axis and models are distinguished by colour. The
    title and legend are attached to the axes and styling comes from
    ``cfg.overall_model_cfg``. Nothing is drawn when ``overall_df`` is empty.
    """
    if overall_df.empty:
        return
    style = cfg.overall_model_cfg
    models = cfg.base_models
    evaluators_in_data = overall_df.evaluator.unique()
    models_in_data = overall_df.model.unique()
    shown_evaluators = [e.name for e in cfg.evaluators if e.name in evaluators_in_data]
    shown_models = [m for m in models if m in models_in_data]

    # Width grows with the number of evaluators, scaled down by max_cols when
    # that is non-zero, and never drops below 3 inches.
    if style.max_cols:
        width_units = len(shown_evaluators) / style.max_cols
    else:
        width_units = len(shown_evaluators)
    figure_size = (max(3.0, style.fig_width_per_col * width_units), style.fig_height_per_row)
    fig, ax = plt.subplots(figsize=figure_size)

    colors = sns.color_palette("muted", n_colors=len(models))
    sns.barplot(
        data=overall_df,
        x="evaluator",
        y="mean_score",
        hue="model",
        order=shown_evaluators,
        hue_order=shown_models,
        palette=colors,
        errorbar=None,
        ax=ax,
        width=style.bar_width,
    )
    swatches = [
        plt.Rectangle((0, 0), 1, 1, color=colors[pos % len(colors)])
        for pos in range(len(shown_models))
    ]
    ax.legend(
        swatches,
        shown_models,
        loc="upper center",
        bbox_to_anchor=(0.5, 1.10),
        frameon=False,
        ncol=style.legend_ncol or len(shown_models),
        title="Model",
    )
    ax.set_title("Overall Model Performance (Lang+Criterion Averaged)", fontsize=style.title_fontsize, pad=12)
    ax.set_xlabel("Evaluator")
    ax.set_ylabel("Mean score")
    for tick_label in ax.get_xticklabels():
        tick_label.set_rotation(style.x_label_rotation)
    apply_y_limits(ax, style.y_min, style.y_max, style.y_tick_step, style.y_headroom)
    annotate_bars(ax, fontsize=style.annotation_fontsize)
    ax.margins(x=style.category_margin)
    fig.tight_layout()
    fig.savefig(out_path, dpi=cfg.dpi, bbox_inches="tight")
    plt.close(fig)

def _pairwise_discretize(a: pd.Series, b: pd.Series, k_bins: int, custom_edges: Optional[List[float]] = None):
    """Map two paired score series onto shared integer category codes.

    Rows where either series is missing are dropped first. Categories are then
    taken from the first strategy that applies:

    1. ``custom_edges``, if they split the pooled values into at least two
       bins that are actually used;
    2. the distinct values themselves, if there are at most ``k_bins``;
    3. quantile bins of the pooled values.

    Returns:
        ``(a_codes, b_codes)`` as arrays, or ``(None, None)`` when no rows
        remain, the pooled values have fewer than two distinct values, or
        quantile binning leaves fewer than two used categories.
    """
    # Keep only rows that both series have a value for.
    df = pd.DataFrame({"a": a, "b": b}).dropna()
    if df.empty:
        return None, None
    # Bins are derived from both raters' values pooled together so that the
    # same thresholds apply to each of them.
    vals = pd.concat([df.a, df.b], ignore_index=True).values
    uniq = np.unique(vals)
    if uniq.size < 2:
        return None, None
    if custom_edges:
        edges = sorted(custom_edges)
        # Only edges strictly inside the observed range separate any values.
        edges = [e for e in edges if vals.min() < e < vals.max()]
        if edges:
            # Pad the outer edges slightly so the extreme values fall inside
            # the first and last bin.
            full_edges = [vals.min() - 1e-6] + edges + [vals.max() + 1e-6]
            a_cat = pd.cut(df.a, bins=full_edges, labels=False, include_lowest=True)
            b_cat = pd.cut(df.b, bins=full_edges, labels=False, include_lowest=True)
            cats_used = np.unique(np.concatenate([a_cat.dropna(), b_cat.dropna()]))
            if cats_used.size >= 2:
                return a_cat.values, b_cat.values
            # Otherwise fall through to the automatic strategies below.
    if uniq.size <= k_bins:
        # Few enough distinct scores: use each distinct value as its own
        # category, numbered in ascending order.
        mapping = {v: i for i, v in enumerate(np.sort(uniq))}
        return df.a.map(mapping).values, df.b.map(mapping).values
    q = np.linspace(0, 1, k_bins + 1)
    edges = np.quantile(vals, q)
    # Repeated quantiles would give duplicate bin edges; nudge each one just
    # above its predecessor so the edges are strictly increasing.
    for i in range(1, len(edges)):
        if edges[i] <= edges[i-1]:
            edges[i] = edges[i-1] + 1e-6
    # NOTE(review): after the loop above the edges are strictly increasing, so
    # this fallback looks reachable only when k_bins < 1 — confirm.
    if np.unique(edges).size < 2:
        edges = np.linspace(vals.min(), vals.max() + 1e-6, min(k_bins, uniq.size) + 1)
    a_cat = pd.cut(df.a, bins=edges, labels=False, include_lowest=True)
    b_cat = pd.cut(df.b, bins=edges, labels=False, include_lowest=True)
    if len(np.unique(np.concatenate([a_cat.dropna(), b_cat.dropna()]))) < 2:
        return None, None
    return a_cat.values, b_cat.values

def _cohen_kappa_cats(a_cats: np.ndarray, b_cats: np.ndarray) -> float:
    """Return Cohen's kappa for two aligned arrays of category codes.

    NaN is returned when either input is ``None`` or empty, when fewer than
    two distinct categories occur overall, or when the expected chance
    agreement is 1 (kappa is undefined in that case).
    """
    if a_cats is None or b_cats is None:
        return np.nan
    if 0 in (a_cats.size, b_cats.size):
        return np.nan
    labels = np.unique(np.concatenate([a_cats, b_cats]))
    n_labels = labels.size
    if n_labels < 2:
        return np.nan

    # Confusion matrix: rows follow the first rater, columns the second.
    position = {label: pos for pos, label in enumerate(labels)}
    confusion = np.zeros((n_labels, n_labels), dtype=int)
    for left, right in zip(a_cats, b_cats):
        confusion[position[left], position[right]] += 1

    n_items = confusion.sum()
    if n_items == 0:
        return np.nan
    observed = np.trace(confusion) / n_items
    row_totals = confusion.sum(axis=1)
    col_totals = confusion.sum(axis=0)
    expected = (row_totals * col_totals).sum() / (n_items ** 2)
    if expected >= 1.0:
        return np.nan
    return (observed - expected) / (1 - expected)

def compute_pairwise_kappa_heatmap(wide: pd.DataFrame, evaluator_order: List[str], k_bins: int, min_overlap: int, out_path: str, dpi: int, fill_na_with: Optional[float] = None, custom_edges: Optional[List[float]] = None):
    """Compute, plot and export pairwise Cohen's kappa between evaluators.

    For each pair, the items scored by both evaluators are discretized with
    ``_pairwise_discretize`` and compared with ``_cohen_kappa_cats``. Pairs
    sharing fewer than ``min_overlap`` items get NaN; the diagonal is 1.

    Files written, all named after ``out_path`` with its extension removed:
    the heatmap at ``out_path`` itself, ``<stem>.csv`` (kappa matrix),
    ``<stem>_overlap_counts.csv`` (shared item counts) and, if at least one
    pair has a value, ``<stem>_summary.txt`` with the mean pairwise kappa.

    Args:
        wide: Item-by-evaluator score matrix.
        evaluator_order: Preferred evaluator order; names missing from
            ``wide`` are ignored.
        k_bins: Number of quantile bins for discretization.
        min_overlap: Minimum number of shared items for a pair to be scored.
        out_path: Image file to write.
        dpi: Resolution of the saved image.
        fill_na_with: If given, replaces NaN in the plotted matrix only.
        custom_edges: Optional explicit bin edges for discretization.

    Nothing is produced when fewer than two evaluators are available.
    """
    evals = [e for e in evaluator_order if e in wide.columns]
    if len(evals) < 2:
        return
    kappa_mat = pd.DataFrame(index=evals, columns=evals, dtype=float)
    overlap_mat = pd.DataFrame(index=evals, columns=evals, dtype=float)
    pair_values = []
    for i, a in enumerate(evals):
        kappa_mat.loc[a, a] = 1.0
        overlap_mat.loc[a, a] = np.nan
        for b in evals[i + 1:]:
            pair = wide[[a, b]].dropna()
            overlap = pair.shape[0]
            overlap_mat.loc[a, b] = overlap_mat.loc[b, a] = overlap
            if overlap < min_overlap:
                val = np.nan
            else:
                a_cats, b_cats = _pairwise_discretize(pair[a], pair[b], k_bins, custom_edges=custom_edges)
                val = _cohen_kappa_cats(a_cats, b_cats)
            kappa_mat.loc[a, b] = val
            kappa_mat.loc[b, a] = val
            if pd.notna(val):
                pair_values.append(val)

    plot_mat = kappa_mat.fillna(fill_na_with) if fill_na_with is not None else kappa_mat.copy()
    base = sns.color_palette("muted", n_colors=6)
    cmap = LinearSegmentedColormap.from_list("muted_kappa", base, N=256)
    fig, ax = plt.subplots(figsize=(1 + 1.0 * len(evals), 1 + 0.9 * len(evals)))
    sns.heatmap(plot_mat, annot=True, fmt=".3f", cmap=cmap, vmin=0.0, vmax=1.0, linewidths=0.4, linecolor="#ffffff", ax=ax)
    ax.set_title("Pairwise Cohen's Kappa")
    fig.tight_layout()
    fig.savefig(out_path, dpi=dpi)
    plt.close(fig)

    # Derive companion file names from the path without its final extension.
    # ``str.replace(".png", ...)`` would also rewrite ".png" inside directory
    # names and would leave a path without ".png" unchanged.
    stem, _ = os.path.splitext(out_path)
    kappa_mat.to_csv(stem + ".csv")
    overlap_mat.to_csv(stem + "_overlap_counts.csv")
    if pair_values:
        with open(stem + "_summary.txt", "w", encoding="utf-8") as f:
            f.write(f"mean_pairwise_kappa,{float(np.mean(pair_values)):.6f}\n")

def compute_pairwise_percent_agreement(wide: pd.DataFrame, evaluator_order: List[str], min_overlap: int, tolerance: float, round_decimals: Optional[int], out_path: str, dpi: int, fill_na_with: Optional[float] = None):
    """Compute, plot and export pairwise percent agreement between evaluators.

    For each pair, agreement is the fraction of shared items whose scores
    differ by at most ``tolerance`` (after optional rounding). Pairs sharing
    fewer than ``min_overlap`` items get NaN; the diagonal is 1.

    Files written, all named after ``out_path`` with its extension removed:
    the heatmap at ``out_path`` itself, ``<stem>.csv`` (agreement matrix),
    ``<stem>_overlap_counts.csv`` (shared item counts) and, if at least one
    pair has a value, ``<stem>_summary.txt`` with the mean pairwise agreement.

    Args:
        wide: Item-by-evaluator score matrix.
        evaluator_order: Preferred evaluator order; names missing from
            ``wide`` are ignored.
        min_overlap: Minimum number of shared items for a pair to be scored.
        tolerance: Largest absolute score difference that counts as a match.
        round_decimals: If given, scores are rounded to this many decimals
            before they are compared.
        out_path: Image file to write.
        dpi: Resolution of the saved image.
        fill_na_with: If given, replaces NaN in the plotted matrix only.

    Nothing is produced when fewer than two evaluators are available.
    """
    evals = [e for e in evaluator_order if e in wide.columns]
    if len(evals) < 2:
        return
    pa = pd.DataFrame(index=evals, columns=evals, dtype=float)
    overlap_mat = pd.DataFrame(index=evals, columns=evals, dtype=float)
    pair_values = []
    for i, a in enumerate(evals):
        pa.loc[a, a] = 1.0
        overlap_mat.loc[a, a] = np.nan
        for b in evals[i + 1:]:
            pair = wide[[a, b]].dropna()
            n_overlap = pair.shape[0]
            overlap_mat.loc[a, b] = overlap_mat.loc[b, a] = n_overlap
            if n_overlap < min_overlap or n_overlap == 0:
                val = np.nan
            else:
                col_a = pair[a]
                col_b = pair[b]
                if round_decimals is not None:
                    col_a = col_a.round(round_decimals)
                    col_b = col_b.round(round_decimals)
                matches = (col_a - col_b).abs() <= tolerance
                val = float(matches.sum() / n_overlap)
            pa.loc[a, b] = val
            pa.loc[b, a] = val
            if pd.notna(val):
                pair_values.append(val)

    plot_mat = pa.fillna(fill_na_with) if fill_na_with is not None else pa
    base = sns.color_palette("muted", n_colors=6)
    cmap = LinearSegmentedColormap.from_list("muted_percent_agree", base, N=256)
    suffix = f"tol={tolerance}"
    if round_decimals is not None:
        suffix += f", round={round_decimals}"
    fig, ax = plt.subplots(figsize=(1 + 1.0 * len(evals), 1 + 0.9 * len(evals)))
    sns.heatmap(plot_mat, annot=True, fmt=".3f", cmap=cmap, vmin=0.0, vmax=1.0, linewidths=0.4, linecolor="#ffffff", ax=ax)
    ax.set_title(f"Pairwise Percent Agreement ({suffix})")
    fig.tight_layout()
    fig.savefig(out_path, dpi=dpi)
    plt.close(fig)

    # Derive companion file names from the path without its final extension.
    # ``str.replace(".png", ...)`` would also rewrite ".png" inside directory
    # names and would leave a path without ".png" unchanged.
    stem, _ = os.path.splitext(out_path)
    pa.to_csv(stem + ".csv")
    overlap_mat.to_csv(stem + "_overlap_counts.csv")
    if pair_values:
        with open(stem + "_summary.txt", "w", encoding="utf-8") as f:
            f.write(f"mean_pairwise_percent_agreement,{float(np.mean(pair_values)):.6f}\n")

def run(cfg: RootCfg):
    """Run the whole pipeline: load scores, aggregate, plot and export.

    Writes the three aggregate CSV files, the faceted bar charts, the
    evaluator correlation heatmap and, depending on ``cfg.agreement``, the
    pairwise kappa and percent-agreement outputs into ``cfg.output_dir``.
    When no scores can be assembled a warning is logged and only the output
    directory is created.

    Args:
        cfg: The run configuration.
    """
    sns.set_style(cfg.style)
    ensure_dir(cfg.output_dir)
    tidy = assemble_tidy(cfg)
    if tidy.empty:
        logging.getLogger(__name__).warning("No score rows were assembled; nothing to plot.")
        return

    def out_file(name: str) -> str:
        return os.path.join(cfg.output_dir, name)

    model_lang = agg_model_language(tidy)
    model_crit = agg_model_criteria(tidy)
    overall = agg_overall_model(tidy)
    model_lang.to_csv(out_file("agg_model_vs_language.csv"), index=False)
    model_crit.to_csv(out_file("agg_model_vs_criteria.csv"), index=False)
    overall.to_csv(out_file("agg_overall_model.csv"), index=False)

    evaluator_order = [e.name for e in cfg.evaluators]
    model_order = cfg.base_models
    crit_order = cfg.criteria
    lang_order = [l.name for l in cfg.languages]
    facet_bar(model_lang, cfg, cfg.model_vs_language_cfg, facet_col="evaluator", x="language", hue="model", x_order=lang_order, model_order=model_order, evaluator_order=evaluator_order, title="Model vs Language (Criteria Averaged)", out_path=out_file("facet_model_vs_language.png"))
    facet_bar(model_crit, cfg, cfg.model_vs_criterion_cfg, facet_col="evaluator", x="criterion", hue="model", x_order=crit_order, model_order=model_order, evaluator_order=evaluator_order, title="Model vs Criterion (Languages Averaged)", out_path=out_file("facet_model_vs_criterion.png"))

    overall_out = out_file("facet_overall_model.png")
    if cfg.overall_model_cfg.combined_single_panel:
        plot_overall_models_combined(overall, cfg, cfg.overall_model_cfg, overall_out)
    else:
        facet_bar(overall, cfg, cfg.overall_model_cfg, facet_col="evaluator", x="model", hue="model", x_order=model_order, model_order=model_order, evaluator_order=evaluator_order, title="Overall Model Performance (Lang+Criterion Averaged)", out_path=overall_out)

    plot_evaluator_correlation(tidy, cfg, out_file("evaluator_correlation_heatmap.png"))

    agreement = cfg.agreement
    if agreement.enable_pairwise_kappa or agreement.enable_percent_agreement:
        # Both agreement outputs use the same item-level matrix; build it once.
        wide_items = item_level_matrix(tidy)
        if agreement.enable_pairwise_kappa:
            compute_pairwise_kappa_heatmap(wide_items, evaluator_order, agreement.kappa_bins, agreement.min_overlap_items, out_file("evaluator_pairwise_kappa_heatmap.png"), cfg.dpi, fill_na_with=agreement.fill_na_with, custom_edges=agreement.custom_kappa_edges)
        if agreement.enable_percent_agreement:
            compute_pairwise_percent_agreement(wide_items, evaluator_order, agreement.min_overlap_items, agreement.percent_match_tolerance, agreement.percent_round_decimals, out_file("evaluator_pairwise_percent_agreement_heatmap.png"), cfg.dpi, fill_na_with=agreement.fill_na_with)

def setup_logging():
    """Configure root logging at INFO level with timestamp, level and message."""
    log_format = "%(asctime)s %(levelname)s %(message)s"
    logging.basicConfig(format=log_format, level=logging.INFO)

def parse_args():
    """Parse the command line and return the resulting namespace.

    The only option is the mandatory ``--config``, the path of the YAML file.
    """
    parser = argparse.ArgumentParser(
        description="Evaluator aggregation & agreement plots.",
    )
    parser.add_argument(
        "--config",
        required=True,
        help="Path to YAML config.",
    )
    namespace = parser.parse_args()
    return namespace

def main():
    """Script entry point: parse arguments, set up logging and run the pipeline."""
    cli = parse_args()
    setup_logging()
    run(load_config(cli.config))

# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()