import argparse
import csv
from pathlib import Path


def fmt(value: str) -> str:
    """Format a numeric string with three decimal places.

    Args:
        value: Text expected to hold a number.

    Returns:
        The number rendered with three decimals, or ``value`` itself
        (unchanged) when ``float`` cannot convert it.
    """
    try:
        number = float(value)
    except (TypeError, ValueError):
        # Non-numeric cells (blank, labels, None) pass through untouched.
        return value
    return format(number, ".3f")


def as_float(value: str) -> float | None:
    try:
        return float(value)
    except (TypeError, ValueError):
        return None


def bold(value: str) -> str:
    """Wrap ``value`` in a LaTeX ``\\textbf{...}`` command."""
    return "\\textbf{{{}}}".format(value)


def maybe_bold(value: str, should_bold: bool) -> str:
    """Return ``value`` in bold when ``should_bold`` is true, else unchanged.

    Args:
        value: Already formatted cell text.
        should_bold: Whether the cell should be emphasized.

    Returns:
        ``bold(value)`` or ``value``.
    """
    if not should_bold:
        return value
    return bold(value)


def close_enough(value: str, target: float | None, tol: float = 5e-4) -> bool:
    """Tell whether a numeric string lies within ``tol`` of ``target``.

    Args:
        value: Text expected to hold a number.
        target: Reference value; ``None`` never matches.
        tol: Maximum absolute difference, inclusive.

    Returns:
        ``True`` only when ``value`` parses, ``target`` is not ``None`` and
        the absolute difference is at most ``tol``.
    """
    number = as_float(value)
    if number is None or target is None:
        return False
    return abs(number - target) <= tol


def fmt_stat(value: str) -> str:
    """Format a statistic: whole numbers as integers, others with 3 decimals.

    Args:
        value: Text expected to hold a number.

    Returns:
        ``"500"`` for ``"500.0"``, ``"3.142"`` for ``"3.14159"``, and
        ``value`` itself when it is not numeric.
    """
    try:
        number = float(value)
    except (TypeError, ValueError):
        return value
    # Counts are often stored as floats such as "500.0"; drop the fraction.
    return str(int(number)) if number.is_integer() else format(number, ".3f")


def fmt_p(value: str) -> str:
    """Format a p-value for a LaTeX table cell.

    Args:
        value: Text expected to hold a number.

    Returns:
        ``"$<0.001$"`` for values below 0.001, the value with three decimals
        otherwise, and ``value`` itself when it is not numeric.
    """
    try:
        p_value = float(value)
    except (TypeError, ValueError):
        return value
    if p_value >= 0.001:
        return format(p_value, ".3f")
    if p_value < 0.001:
        return "$<0.001$"
    # Only NaN fails both comparisons; it is formatted like any other float.
    return format(p_value, ".3f")


def read_csv(path: Path) -> list[dict[str, str]]:
    """Load a UTF-8 CSV file with a header row into a list of dicts.

    Args:
        path: Location of the CSV file.

    Returns:
        One dict per data row, keyed by the header names.
    """
    # newline="" lets the csv module handle line endings itself.
    with path.open("r", encoding="utf-8", newline="") as handle:
        reader = csv.DictReader(handle)
        return [record for record in reader]


# Characters that cannot appear unescaped in a LaTeX table cell: "&" ends the
# cell, "%" comments out the rest of the line, "#" and "_" raise errors.
_LATEX_CELL_ESCAPES = str.maketrans({"_": "\\_", "&": "\\&", "%": "\\%", "#": "\\#"})


def latex_escape(value: str) -> str:
    """Escape plain text for use inside a LaTeX table cell.

    The input is treated as plain text, not as LaTeX: every ``_``, ``&``,
    ``%`` and ``#`` is prefixed with a backslash, including ones that are
    already escaped.

    Args:
        value: Plain-text label.

    Returns:
        ``value`` with the four characters above backslash-escaped.
    """
    return value.translate(_LATEX_CELL_ESCAPES)


# Display labels for representation identifiers, looked up by
# display_representation(). Three keys ("oracle_premise", "premise" and
# "Oracle premise") share the label "Future premise", which is the label
# is_future_representation() tests for.
REPRESENTATION_LABELS = {
    "raw": "Raw text",
    "normalized": "Normalized text",
    "structured": "Structured",
    "state_only": "State-only",
    "state_meta": "State+metadata",
    "retrieved_premise": "Retrieved-premise",
    "oracle_premise": "Future premise",
    "premise": "Future premise",
    "Oracle premise": "Future premise",
}


# Display labels for model identifiers, looked up by display_model().
# "text_naive_bayes" and "naive_bayes" both render as "Naive Bayes".
MODEL_LABELS = {
    "majority_class": "Majority",
    "keyword_heuristic": "Keyword",
    "text_naive_bayes": "Naive Bayes",
    "tfidf_logistic_regression": "TF-IDF LR",
    "tfidf_linear_svm": "TF-IDF SVM",
    "naive_bayes": "Naive Bayes",
    "logistic_regression": "Logistic",
}


# Display labels for retrieval-strategy identifiers, looked up by
# display_strategy(). "oracle_family" and "Oracle family" both render as
# "True family".
STRATEGY_LABELS = {
    "unguided": "Unguided",
    "family_guided": "Hard family",
    "family_soft": "Soft family",
    "family_top_m": "Top-m family",
    "family_rrf": "Family RRF",
    "oracle_family": "True family",
    "Oracle family": "True family",
}


def display_representation(value: str) -> str:
    """Return the display label for a representation identifier.

    Identifiers missing from ``REPRESENTATION_LABELS`` are returned as given.
    """
    if value in REPRESENTATION_LABELS:
        return REPRESENTATION_LABELS[value]
    return value


def display_model(value: str) -> str:
    """Return the display label for a model identifier.

    Identifiers missing from ``MODEL_LABELS`` are returned as given.
    """
    if value in MODEL_LABELS:
        return MODEL_LABELS[value]
    return value


def display_strategy(value: str) -> str:
    """Return the display label for a strategy identifier.

    Identifiers missing from ``STRATEGY_LABELS`` are returned as given.
    """
    if value in STRATEGY_LABELS:
        return STRATEGY_LABELS[value]
    return value


def is_future_representation(row: dict[str, str]) -> bool:
    """Tell whether a row's representation displays as "Future premise".

    Args:
        row: Record with an optional ``representation`` field; a missing
            field is treated as the empty string.

    Returns:
        ``True`` when the display label equals ``"Future premise"``.
    """
    identifier = row.get("representation", "")
    label = display_representation(identifier)
    return label == "Future premise"


def metric_lookup(rows: list[dict[str, str]]) -> dict[str, str]:
    """Index ``metric``/``value`` records by metric name.

    Args:
        rows: Records that each carry ``metric`` and ``value`` fields.

    Returns:
        Mapping from metric name to value; when a metric repeats, the last
        occurrence wins.

    Raises:
        KeyError: If a record lacks ``metric`` or ``value``.
    """
    lookup: dict[str, str] = {}
    for record in rows:
        name = record["metric"]
        lookup[name] = record["value"]
    return lookup


def metric_value(rows: list[dict[str, str]], key: str, default: str = "") -> str:
    """Fetch one metric's value from ``metric``/``value`` records.

    Args:
        rows: Records that each carry ``metric`` and ``value`` fields.
        key: Metric name to look up.
        default: Value returned when ``key`` is absent.

    Returns:
        The stored value, or ``default``.
    """
    # The index is rebuilt on every call.
    index = metric_lookup(rows)
    return index.get(key, default)


def fmt_pm(mean: str, std: str) -> str:
    """Render ``mean $\\pm$ std`` with both parts formatted by ``fmt``."""
    centre = fmt(mean)
    spread = fmt(std)
    return "{} $\\pm$ {}".format(centre, spread)


def dataset_table(rows: list[dict[str, str]]) -> str:
    """Render the dataset-summary statistics as a two-column LaTeX table.

    Args:
        rows: Records that each carry ``metric`` and ``value`` fields.

    Returns:
        LaTeX source of the table, ending with a newline.
    """
    stats = metric_lookup(rows)
    # (row label, metric name) pairs in display order.
    layout = (
        ("Proof steps", "n_steps"),
        ("Theorems", "n_theorems"),
        ("Tactic families", "n_labels"),
        ("Mean steps/theorem", "mean_steps_per_theorem"),
        ("Mean goal tokens", "mean_goal_tokens"),
        ("Mean context size", "mean_context_size"),
    )
    header = [
        "\\begin{table}[t]",
        "\\centering",
        "\\caption{Controlled LeanDojo subset characteristics. The dataset is extracted from a curated mathlib4 subset rather than a full-mathlib trace.}",
        "\\label{tab:dataset}",
        "\\begin{tabular}{lr}",
        "\\toprule",
        "Statistic & Value \\\\",
        "\\midrule",
    ]
    # A metric missing from the input yields an empty value cell.
    body = [f"{label} & {fmt_stat(stats.get(name, ''))} \\\\" for label, name in layout]
    footer = ["\\bottomrule", "\\end{tabular}", "\\end{table}", ""]
    return "\n".join(header + body + footer)


def classification_table(rows: list[dict[str, str]]) -> str:
    """Render per-representation classification results as a LaTeX table.

    Consecutive rows whose representation has the same display label form
    one group: the label appears once in a ``\\multirow`` cell and groups
    are separated by ``\\cmidrule``. In each metric column the best value
    among rows that are not "Future premise" is set in bold.

    Args:
        rows: Records with ``representation``, ``model``, ``accuracy``,
            ``macro_f1`` and ``top3_accuracy`` fields.

    Returns:
        LaTeX source of the table, ending with a newline.
    """
    columns = ("accuracy", "macro_f1", "top3_accuracy")
    state_visible = [row for row in rows if not is_future_representation(row)]

    # Column maxima over state-visible rows; None when nothing parses.
    top_score: dict[str, float | None] = {}
    for column in columns:
        parsed = [as_float(row.get(column, "")) for row in state_visible]
        top_score[column] = max((score for score in parsed if score is not None), default=None)

    def render(row: dict[str, str], column: str) -> str:
        raw = row[column]
        text = fmt(raw)
        highlight = not is_future_representation(row) and close_enough(raw, top_score[column])
        return maybe_bold(text, highlight)

    # Split the rows into runs of adjacent rows that share a display label.
    runs: list[tuple[str, list[dict[str, str]]]] = []
    for row in rows:
        label = display_representation(row["representation"])
        if runs and runs[-1][0] == label:
            runs[-1][1].append(row)
        else:
            runs.append((label, [row]))

    out = [
        "\\begin{table*}[t]",
        "\\centering",
        "\\footnotesize",
        "\\caption{Tactic-family prediction results under by-theorem splitting. Accuracy, macro-F1 and top-3 accuracy compare the same model families across proof-state representations. Bold marks the strongest state-visible entry in each metric; future premise is a hindsight comparison.}",
        "\\label{tab:classification}",
        "\\begin{tabular}{clrrr}",
        "\\toprule",
        "Representation & Model & Accuracy & Macro-F1 & Top-3 \\\\",
        "\\midrule",
    ]
    final_run = len(runs) - 1
    for position, (label, members) in enumerate(runs):
        for offset, row in enumerate(members):
            # Only the first row of a run carries the spanning label cell.
            lead = ""
            if offset == 0:
                lead = f"\\multirow{{{len(members)}}}{{*}}{{{latex_escape(label)}}}"
            model = latex_escape(display_model(row["model"]))
            accuracy = render(row, "accuracy")
            macro_f1 = render(row, "macro_f1")
            top3 = render(row, "top3_accuracy")
            out.append(f"{lead} & {model} & {accuracy} & {macro_f1} & {top3} \\\\")
        if position < final_run:
            out.append("\\cmidrule(lr){1-5}")
    out += ["\\bottomrule", "\\end{tabular}", "\\end{table*}", ""]
    return "\n".join(out)


def aggregate_table(rows: list[dict[str, str]]) -> str:
    """Render mean and standard deviation per representation and model.

    Only rows whose ``model`` is ``text_naive_bayes`` or ``tfidf_linear_svm``
    are kept. Consecutive kept rows with the same representation display
    label share one ``\\multirow`` cell. In each metric the best mean among
    rows that are not "Future premise" is set in bold; the standard
    deviation is never bold.

    Args:
        rows: Records with ``representation``, ``model`` and the
            ``*_mean`` / ``*_std`` fields for accuracy, macro-F1 and top-3
            accuracy.

    Returns:
        LaTeX source of the table, ending with a newline.
    """
    kept_models = frozenset({"text_naive_bayes", "tfidf_linear_svm"})
    kept = [row for row in rows if row["model"] in kept_models]
    mean_columns = ("accuracy_mean", "macro_f1_mean", "top3_accuracy_mean")
    state_visible = [row for row in kept if not is_future_representation(row)]

    # Best mean per column over state-visible rows; None when nothing parses.
    top_mean: dict[str, float | None] = {}
    for column in mean_columns:
        parsed = [as_float(row.get(column, "")) for row in state_visible]
        top_mean[column] = max((score for score in parsed if score is not None), default=None)

    def render(row: dict[str, str], mean_key: str, std_key: str) -> str:
        raw_mean = row[mean_key]
        highlight = not is_future_representation(row) and close_enough(raw_mean, top_mean[mean_key])
        mean_text = maybe_bold(fmt(raw_mean), highlight)
        return f"{mean_text} $\\pm$ {fmt(row[std_key])}"

    # Split the kept rows into runs of adjacent rows sharing a display label.
    runs: list[tuple[str, list[dict[str, str]]]] = []
    for row in kept:
        label = display_representation(row["representation"])
        if runs and runs[-1][0] == label:
            runs[-1][1].append(row)
        else:
            runs.append((label, [row]))

    out = [
        "\\begin{table*}[t]",
        "\\centering",
        "\\footnotesize",
        "\\caption{Split-seed stability on the controlled subset. Values are mean $\\pm$ population standard deviation over six by-theorem random splits. Bold marks the strongest state-visible mean in each metric.}",
        "\\label{tab:stability}",
        "\\begin{tabular}{clrrr}",
        "\\toprule",
        "Representation & Model & Accuracy & Macro-F1 & Top-3 \\\\",
        "\\midrule",
    ]
    final_run = len(runs) - 1
    for position, (label, members) in enumerate(runs):
        for offset, row in enumerate(members):
            # Only the first row of a run carries the spanning label cell.
            lead = ""
            if offset == 0:
                lead = f"\\multirow{{{len(members)}}}{{*}}{{{latex_escape(label)}}}"
            model = latex_escape(display_model(row["model"]))
            accuracy = render(row, "accuracy_mean", "accuracy_std")
            macro_f1 = render(row, "macro_f1_mean", "macro_f1_std")
            top3 = render(row, "top3_accuracy_mean", "top3_accuracy_std")
            out.append(f"{lead} & {model} & {accuracy} & {macro_f1} & {top3} \\\\")
        if position < final_run:
            out.append("\\cmidrule(lr){1-5}")
    out += ["\\bottomrule", "\\end{tabular}", "\\end{table*}", ""]
    return "\n".join(out)


def search_table(rows: list[dict[str, str]]) -> str:
    """Render retrieval-ranking success rates per strategy as a LaTeX table.

    Strategies named in the fixed order list come first; for those, rows
    sharing a strategy name collapse to the last such row. All remaining
    rows follow in input order. Bold marks values within 0.0015 of the
    column maximum taken over rows other than ``oracle_family``.

    Args:
        rows: Records with a ``strategy`` field and, for each metric, either
            a ``<metric>`` or a ``<metric>_mean`` field. ``family_model``,
            ``family_weight`` and ``family_weights`` are optional and only
            affect the row label.

    Returns:
        LaTeX source of the table, ending with a newline.
    """

    def score(row: dict[str, str], name: str) -> str:
        # Prefer the plain column; fall back to the "_mean" variant when the
        # plain one is missing or empty.
        direct = row.get(name)
        if direct:
            return direct
        return row.get(f"{name}_mean", "")

    def describe(row: dict[str, str]) -> str:
        text = display_strategy(row["strategy"])
        model = row.get("family_model")
        if model:
            weight = row.get("family_weight")
            detail = f", w={weight}" if weight else ""
            return text + f" ({display_model(model)}{detail})"
        weights = row.get("family_weights")
        if weights and weights not in {"None", "none"}:
            return text + " (val-selected)"
        return text

    preferred = ("unguided", "family_soft", "family_guided", "family_rrf", "family_top_m", "oracle_family")
    known = set(preferred)
    latest: dict[str, dict[str, str]] = {}
    for row in rows:
        latest[row["strategy"]] = row
    ordered = [latest[name] for name in preferred if name in latest]
    ordered += [row for row in rows if row["strategy"] not in known]

    columns = (
        "family_success_at_1",
        "family_success_at_5",
        "exact_tactic_success_at_1",
        "exact_tactic_success_at_5",
    )
    # The oracle row is excluded from the maxima and is never bold.
    contenders = [row for row in ordered if row["strategy"] != "oracle_family"]
    top_score: dict[str, float | None] = {}
    for column in columns:
        parsed = [as_float(score(row, column)) for row in contenders]
        top_score[column] = max((value for value in parsed if value is not None), default=None)

    def render(row: dict[str, str], column: str) -> str:
        raw = score(row, column)
        text = fmt(raw)
        highlight = row["strategy"] != "oracle_family" and close_enough(raw, top_score[column], tol=0.0015)
        return maybe_bold(text, highlight)

    out = [
        "\\begin{table*}[t]",
        "\\centering",
        "\\footnotesize",
        "\\caption{Retrieval ranking before Lean checking. The table reports family-level and trace-tactic success at fixed ranks. Bold marks leading non-ceiling entries; near-equal exact@5 values are treated as ties.}",
        "\\label{tab:search}",
        "\\begin{tabular}{lrrrr}",
        "\\toprule",
        "Strategy & Family@1 & Family@5 & Exact@1 & Exact@5 \\\\",
        "\\midrule",
    ]
    for row in ordered:
        name = latex_escape(describe(row))
        family_1 = render(row, "family_success_at_1")
        family_5 = render(row, "family_success_at_5")
        exact_1 = render(row, "exact_tactic_success_at_1")
        exact_5 = render(row, "exact_tactic_success_at_5")
        out.append(f"{name} & {family_1} & {family_5} & {exact_1} & {exact_5} \\\\")
    out += ["\\bottomrule", "\\end{tabular}", "\\end{table*}", ""]
    return "\n".join(out)


def execution_table(rows: list[dict[str, str]]) -> str:
    """Render Lean-checking acceptance and coverage per strategy.

    Strategies named in the fixed order list come first; for those, rows
    sharing a strategy name collapse to the last such row. All remaining
    rows follow in input order. Bold marks the column maximum for the
    acceptance and coverage columns and the column minimum for the
    reconstruction failure rate. The caption quotes the ``queries`` and
    ``candidates`` fields of the first ordered row when both are non-empty.

    Args:
        rows: Records with ``strategy``, ``accept_at_1_all``,
            ``accept_at_3_all``, ``accept_at_5_all``,
            ``candidate_execution_coverage`` and
            ``reconstruction_failure_rate`` fields.

    Returns:
        LaTeX source of the table, ending with a newline.
    """
    preferred = ("unguided", "family_soft", "family_guided", "family_rrf", "family_top_m")
    known = set(preferred)
    latest: dict[str, dict[str, str]] = {}
    for row in rows:
        latest[row["strategy"]] = row
    ordered = [latest[name] for name in preferred if name in latest]
    ordered += [row for row in rows if row["strategy"] not in known]

    def column_values(column: str) -> list[float]:
        parsed = (as_float(row.get(column, "")) for row in ordered)
        return [value for value in parsed if value is not None]

    # Target value per column: the maximum where higher is better, the
    # minimum for the failure rate. None when no value parses.
    target: dict[str, float | None] = {}
    for column in (
        "accept_at_1_all",
        "accept_at_3_all",
        "accept_at_5_all",
        "candidate_execution_coverage",
    ):
        target[column] = max(column_values(column), default=None)
    for column in ("reconstruction_failure_rate",):
        target[column] = min(column_values(column), default=None)

    def render(row: dict[str, str], column: str) -> str:
        raw = row[column]
        return maybe_bold(fmt(raw), close_enough(raw, target[column]))

    first = ordered[0] if ordered else None
    n_queries = first.get("queries", "") if first is not None else ""
    n_candidates = first.get("candidates", "") if first is not None else ""
    if n_queries and n_candidates:
        intro = (
            f"Lean checking results on {fmt_stat(n_queries)} held-out S4 states. "
            f"Each strategy checks {fmt_stat(n_candidates)} candidate tactics. "
        )
    else:
        intro = "Lean checking results on held-out S4 states. "
    caption = intro + (
        "Accept@k uses all queries as the denominator; coverage is the fraction of "
        "candidate tactics that reached ordinary Lean acceptance or rejection after "
        "state reconstruction. Bold marks the best value in each column."
    )

    out = [
        "\\begin{table*}[t]",
        "\\centering",
        "\\footnotesize",
        f"\\caption{{{caption}}}",
        "\\label{tab:micro_execution}",
        "\\begin{tabular}{lrrrrr}",
        "\\toprule",
        "Strategy & Accept@1 & Accept@3 & Accept@5 & Coverage & Recon. fail \\\\",
        "\\midrule",
    ]
    for row in ordered:
        name = latex_escape(display_strategy(row["strategy"]))
        accept_1 = render(row, "accept_at_1_all")
        accept_3 = render(row, "accept_at_3_all")
        accept_5 = render(row, "accept_at_5_all")
        coverage = render(row, "candidate_execution_coverage")
        failures = render(row, "reconstruction_failure_rate")
        out.append(f"{name} & {accept_1} & {accept_3} & {accept_5} & {coverage} & {failures} \\\\")
    out += ["\\bottomrule", "\\end{tabular}", "\\end{table*}", ""]
    return "\n".join(out)


def execution_appendix_table(rows: list[dict[str, str]]) -> str:
    """Render acceptance rates restricted to executable queries.

    Strategies named in the fixed order list come first; for those, rows
    sharing a strategy name collapse to the last such row. All remaining
    rows follow in input order. The three acceptance columns go through
    ``fmt``; the three count columns are printed exactly as read.

    Args:
        rows: Records with ``strategy``, ``accept_at_{1,3,5}_executable``,
            ``executable_queries_at_5``,
            ``reconstruction_unknown_identifier`` and
            ``reconstruction_elaboration`` fields.

    Returns:
        LaTeX source of the table, ending with a newline.
    """
    preferred = ("unguided", "family_soft", "family_guided", "family_rrf", "family_top_m")
    known = set(preferred)
    latest: dict[str, dict[str, str]] = {}
    for row in rows:
        latest[row["strategy"]] = row
    ordered = [latest[name] for name in preferred if name in latest]
    ordered += [row for row in rows if row["strategy"] not in known]

    out = [
        "\\begin{table*}[t]",
        "\\centering",
        "\\footnotesize",
        "\\caption{Lean checking after removing queries with reconstruction or infrastructure failures. Exec.Q@5 reports the remaining denominator.}",
        "\\label{tab:micro_execution_executable}",
        "\\begin{tabular}{lrrrrrr}",
        "\\toprule",
        "Strategy & A@1 exe. & A@3 exe. & A@5 exe. & Exec.Q@5 & Unknown id & Elab. \\\\",
        "\\midrule",
    ]
    for row in ordered:
        name = latex_escape(display_strategy(row["strategy"]))
        accept_1 = fmt(row["accept_at_1_executable"])
        accept_3 = fmt(row["accept_at_3_executable"])
        accept_5 = fmt(row["accept_at_5_executable"])
        remaining = row["executable_queries_at_5"]
        unknown_id = row["reconstruction_unknown_identifier"]
        elaboration = row["reconstruction_elaboration"]
        out.append(
            f"{name} & {accept_1} & {accept_3} & {accept_5} & {remaining} & {unknown_id} & {elaboration} \\\\"
        )
    out += ["\\bottomrule", "\\end{tabular}", "\\end{table*}", ""]
    return "\n".join(out)


def execution_significance_table(rows: list[dict[str, str]]) -> str:
    """Render the paired Accept@5 comparison against unguided retrieval.

    Only rows whose ``metric`` equals ``accept_at_5`` are used. Strategies
    named in the fixed order list come first; for those, rows sharing a
    strategy name collapse to the last such row. All remaining rows follow
    in input order. The caption quotes ``n_pairs`` from the first
    matching row when it is non-empty.

    Args:
        rows: Records with ``metric``, ``strategy``, ``diff``,
            ``bootstrap_ci_low``, ``bootstrap_ci_high`` and ``mcnemar_p``
            fields; ``n_pairs`` is optional.

    Returns:
        LaTeX source of the table, ending with a newline.
    """
    relevant = [row for row in rows if row.get("metric") == "accept_at_5"]
    preferred = ("family_soft", "family_guided", "family_rrf", "family_top_m")
    known = set(preferred)
    latest: dict[str, dict[str, str]] = {}
    for row in relevant:
        latest[row["strategy"]] = row
    ordered = [latest[name] for name in preferred if name in latest]
    ordered += [row for row in relevant if row["strategy"] not in known]

    # The pair count comes from the first filtered row, not the first ordered one.
    n_pairs = relevant[0].get("n_pairs", "") if relevant else ""
    scope = f"{fmt_stat(n_pairs)} states" if n_pairs else "held-out states"
    caption = (
        "Paired Lean checking comparison against unguided retrieval on Accept@5. "
        "Differences are strategy minus unguided over the same " + scope + "."
    )

    out = [
        "\\begin{table}[!htbp]",
        "\\centering",
        "\\footnotesize",
        f"\\caption{{{caption}}}",
        "\\label{tab:execution_significance}",
        "\\begin{tabular}{lrrr}",
        "\\toprule",
        "Strategy & $\\Delta$A@5 & 95\\% CI & McNemar $p$ \\\\",
        "\\midrule",
    ]
    for row in ordered:
        interval = f"[{fmt(row['bootstrap_ci_low'])}, {fmt(row['bootstrap_ci_high'])}]"
        name = latex_escape(display_strategy(row["strategy"]))
        delta = fmt(row["diff"])
        p_value = fmt_p(row["mcnemar_p"])
        out.append(f"{name} & {delta} & {interval} & {p_value} \\\\")
    out += ["\\bottomrule", "\\end{tabular}", "\\end{table}", ""]
    return "\n".join(out)


def proxy_execution_gap_table(rows: list[dict[str, str]]) -> str:
    """Render trace-match versus Lean-acceptance rates per strategy.

    Strategies named in the fixed order list come first; for those, rows
    sharing a strategy name collapse to the last such row. All remaining
    rows follow in input order.

    Args:
        rows: Records with ``strategy``, ``proxy_exact_at_5``,
            ``lean_accept_at_5`` and ``lean_minus_proxy_exact_at_5`` fields.

    Returns:
        LaTeX source of the table, ending with a newline.
    """
    preferred = ("unguided", "family_soft", "family_guided", "family_rrf", "family_top_m")
    known = set(preferred)
    latest: dict[str, dict[str, str]] = {}
    for row in rows:
        latest[row["strategy"]] = row
    ordered = [latest[name] for name in preferred if name in latest]
    ordered += [row for row in rows if row["strategy"] not in known]

    out = [
        "\\begin{table}[t]",
        "\\centering",
        "\\footnotesize",
        "\\caption{Trace matching and Lean acceptance on the S4 execution set. The columns summarize two different views of the same candidate lists.}",
        "\\label{tab:trace_execution_gap}",
        "\\begin{tabular}{lrrr}",
        "\\toprule",
        "Strategy & Trace E@5 & Lean A@5 & Difference \\\\",
        "\\midrule",
    ]
    for row in ordered:
        name = latex_escape(display_strategy(row["strategy"]))
        trace = fmt(row["proxy_exact_at_5"])
        lean = fmt(row["lean_accept_at_5"])
        gap = fmt(row["lean_minus_proxy_exact_at_5"])
        out.append(f"{name} & {trace} & {lean} & {gap} \\\\")
    out += ["\\bottomrule", "\\end{tabular}", "\\end{table}", ""]
    return "\n".join(out)


def accepted_alternatives_table(rows: list[dict[str, str]], n_states: int = 500) -> str:
    """Render rates of Lean-accepted tactics that differ from the trace.

    Strategies named in the fixed order list come first; for those, rows
    sharing a strategy name collapse to the last such row. All remaining
    rows follow in input order.

    Args:
        rows: Records with ``strategy``, ``proxy_exact_at_5_on_sample``,
            ``lean_accept_at_5``, ``accepted_not_gold_at_5``,
            ``accept_without_exact_at_5`` and ``exact_without_accept_at_5``
            fields.
        n_states: Size of the execution set quoted in the caption. The
            default of 500 reproduces the caption that used to be
            hard-coded; pass the real size when the execution set changes.

    Returns:
        LaTeX source of the table, ending with a newline.
    """
    order = ("unguided", "family_soft", "family_guided", "family_rrf", "family_top_m")
    known = set(order)
    row_by_strategy = {row["strategy"]: row for row in rows}
    ordered_rows = [row_by_strategy[name] for name in order if name in row_by_strategy]
    ordered_rows.extend(row for row in rows if row["strategy"] not in known)
    caption = (
        "Lean-accepted alternatives that differ from the traced tactic. All rates use the same "
        f"query-level denominator as the {n_states}-state execution set. A-no-E is acceptance "
        "without a trace hit; E-no-A is a trace hit without Lean acceptance in the reconstructed state."
    )
    lines = [
        "\\begin{table}[!htbp]",
        "\\centering",
        "\\scriptsize",
        f"\\caption{{{caption}}}",
        "\\label{tab:accepted_alternatives}",
        "\\begin{tabular}{lrrrrr}",
        "\\toprule",
        "Strategy & Trace & Lean & Non-trace & A-no-E & E-no-A \\\\",
        "\\midrule",
    ]
    for row in ordered_rows:
        lines.append(
            f"{latex_escape(display_strategy(row['strategy']))} & "
            f"{fmt(row['proxy_exact_at_5_on_sample'])} & {fmt(row['lean_accept_at_5'])} & "
            f"{fmt(row['accepted_not_gold_at_5'])} & {fmt(row['accept_without_exact_at_5'])} & "
            f"{fmt(row['exact_without_accept_at_5'])} \\\\"
        )
    lines.extend(["\\bottomrule", "\\end{tabular}", "\\end{table}", ""])
    return "\n".join(lines)


def reconstruction_failure_class_table(rows: list[dict[str, str]]) -> str:
    """Render failure-class counts and their share of all candidates.

    The caption states that timeouts and other tool failures did not occur
    only when both the ``timeout`` and ``infrastructure_error`` counts are
    zero; otherwise it reports their combined count, so the caption cannot
    contradict the table body.

    Args:
        rows: Records with ``metric`` and ``value`` fields. The metrics read
            are ``n_candidates``, ``sample_size``,
            ``reconstruction_unknown_identifier``,
            ``reconstruction_elaboration``, ``parse_error``, ``timeout`` and
            ``infrastructure_error``; a missing or blank metric counts as 0.

    Returns:
        LaTeX source of the table, ending with a newline.

    Raises:
        ValueError: If a count metric holds non-numeric text.
    """
    values = metric_lookup(rows)

    def count_of(key: str) -> float:
        # Missing, blank or None values are treated as zero.
        return float(values.get(key, "0") or 0)

    n_candidates = count_of("n_candidates")
    reconstruction_total = (
        count_of("reconstruction_unknown_identifier")
        + count_of("reconstruction_elaboration")
        + count_of("parse_error")
    )
    tool_failures = count_of("timeout") + count_of("infrastructure_error")
    selected = [
        ("Unknown identifier", "reconstruction_unknown_identifier"),
        ("Elab./type class", "reconstruction_elaboration"),
        ("Parse error", "parse_error"),
        ("Timeout", "timeout"),
        ("Other tool failure", "infrastructure_error"),
    ]
    sample_size = values.get("sample_size", "")
    total_label = fmt_stat(str(reconstruction_total))
    if sample_size:
        opening = f"Reconstruction and checking failures in the {fmt_stat(sample_size)}-state Lean check. "
    else:
        opening = "Reconstruction and checking failures in reconstructed-state Lean checking. "
    if tool_failures:
        closing = f"{fmt_stat(str(tool_failures))} timeouts or other tool failures are listed separately."
    else:
        closing = "timeouts and other tool failures did not occur."
    caption = (
        opening
        + f"The {total_label} reconstruction failures are separated from normal Lean rejections; "
        + closing
    )
    lines = [
        "\\begin{table}[!htbp]",
        "\\centering",
        "\\footnotesize",
        f"\\caption{{{caption}}}",
        "\\label{tab:execution_failure_classes}",
        "\\begin{tabular}{lrr}",
        "\\toprule",
        "Class & Count & Cand. frac. \\\\",
        "\\midrule",
    ]
    for label, key in selected:
        count = count_of(key)
        # Avoid dividing by zero when the candidate total is absent.
        fraction = count / n_candidates if n_candidates else 0.0
        lines.append(f"{label} & {fmt_stat(str(count))} & {fmt(str(fraction))} \\\\")
    lines.extend(["\\bottomrule", "\\end{tabular}", "\\end{table}", ""])
    return "\n".join(lines)


def confusion_table(rows: list[dict[str, str]], max_rows: int = 8) -> str:
    """Render the largest off-diagonal confusions as a LaTeX table.

    Off-diagonal rows (``gold`` differs from ``predicted``) are sorted by
    ``count`` in descending order before the first ``max_rows`` are kept, so
    the table matches its "Largest" caption whatever the input order. The
    sort is stable: input already sorted by count, and ties, keep their
    original order.

    Args:
        rows: Records with ``gold``, ``predicted`` and ``count`` fields.
        max_rows: Maximum number of confusions to list.

    Returns:
        LaTeX source of the table, ending with a newline.
    """

    def count_key(row: dict[str, str]) -> float:
        # Counts that are missing, non-numeric or NaN sort after all others.
        try:
            number = float(row.get("count"))
        except (TypeError, ValueError):
            return float("-inf")
        return number if number == number else float("-inf")

    off_diagonal = [row for row in rows if row["gold"] != row["predicted"]]
    off_diagonal.sort(key=count_key, reverse=True)
    off_diagonal = off_diagonal[:max_rows]
    lines = [
        "\\begin{table}[t]",
        "\\centering",
        "\\caption{Largest off-diagonal tactic-family confusions for a selected lightweight model.}",
        "\\label{tab:confusions}",
        "\\begin{tabular}{llr}",
        "\\toprule",
        "Gold family & Predicted family & Count \\\\",
        "\\midrule",
    ]
    for row in off_diagonal:
        lines.append(
            f"{latex_escape(row['gold'])} & {latex_escape(row['predicted'])} & {row['count']} \\\\"
        )
    lines.extend(["\\bottomrule", "\\end{tabular}", "\\end{table}", ""])
    return "\n".join(lines)


def main() -> None:
    """Parse the command line, build the tables and write the two .tex files.

    The classification and search CSVs are read unconditionally; every other
    input is skipped when its file does not exist. The appendix file is
    written only when the execution CSV exists.
    """
    tables_dir = Path("results/tables")
    paper_dir = Path("paper")

    parser = argparse.ArgumentParser(description="Generate LaTeX tables from result CSV files.")
    for flag, filename in (
        ("--dataset", "s4_dataset_summary.csv"),
        ("--classification", "s4_classification_summary.csv"),
        ("--aggregate", "s4_classification_aggregate.csv"),
        ("--search", "s4_search_validation_aggregate.csv"),
        ("--execution", "execution_audit_by_strategy.csv"),
        ("--execution-summary", "execution_audit_summary.csv"),
        ("--execution-significance", "execution_accept_significance.csv"),
    ):
        parser.add_argument(flag, type=Path, default=tables_dir / filename)
    # NOTE(review): this option is parsed but its value is not read anywhere
    # in this function - confirm whether proxy_execution_gap_table should be
    # wired in.
    parser.add_argument(
        "--trace-execution-gap",
        "--proxy-execution-gap",
        dest="trace_execution_gap",
        type=Path,
        default=tables_dir / "trace_execution_gap.csv",
    )
    parser.add_argument(
        "--accepted-alternatives",
        type=Path,
        default=tables_dir / "execution_accepted_alternatives.csv",
    )
    parser.add_argument("--confusion", type=Path, default=tables_dir / "s4_confusion_matrix.csv")
    parser.add_argument("--output", type=Path, default=paper_dir / "generated_tables.tex")
    parser.add_argument(
        "--execution-appendix-output",
        type=Path,
        default=paper_dir / "generated_execution_appendix_tables.tex",
    )
    args = parser.parse_args()

    def add_if_present(source: Path, build, sink: list[str]) -> None:
        # Optional inputs are skipped silently when the file is absent.
        if source.exists():
            sink.append(build(read_csv(source)))

    main_tables: list[str] = []
    add_if_present(args.dataset, dataset_table, main_tables)
    main_tables.append(classification_table(read_csv(args.classification)))
    add_if_present(args.aggregate, aggregate_table, main_tables)
    main_tables.append(search_table(read_csv(args.search)))
    if args.execution.exists():
        main_tables.append(execution_table(read_csv(args.execution)))
        appendix_path = args.execution_appendix_output
        appendix_path.parent.mkdir(parents=True, exist_ok=True)
        appendix_tables: list[str] = []
        add_if_present(args.execution_significance, execution_significance_table, appendix_tables)
        add_if_present(args.accepted_alternatives, accepted_alternatives_table, appendix_tables)
        add_if_present(args.execution_summary, reconstruction_failure_class_table, appendix_tables)
        # The appendix file is written even when no appendix table was built.
        appendix_path.write_text("\n".join(appendix_tables), encoding="utf-8")
    add_if_present(args.confusion, confusion_table, main_tables)

    args.output.parent.mkdir(parents=True, exist_ok=True)
    args.output.write_text("\n".join(main_tables), encoding="utf-8")
    print(f"Wrote paper tables to: {args.output}")


# Generate the tables only when this file is run as a script, not on import.
if __name__ == "__main__":
    main()
