#!/usr/bin/env python3
import json
import os
import signal
import sys
import time
import urllib.error
import urllib.parse
import urllib.request
from dataclasses import dataclass
from pathlib import Path
from subprocess import Popen
from typing import Callable

from rdflib import BNode, Graph, Literal, Namespace, RDF, RDFS, URIRef
from rdflib.namespace import XSD

# Directory that contains this script; the files below are resolved relative to it,
# and ServerManager uses it as the working directory for the servers it launches.
ROOT = Path(__file__).resolve().parent
# JSON configuration read by load_config() and rewritten by write_config().
CONFIG_PATH = ROOT / "config.json"
# Result file locations. NOTE(review): the code that reads/writes them is outside
# the visible part of this file — confirm which path holds which format.
RESULTS_PATH = ROOT / "results.txt"
RESULTS_JSON_PATH = ROOT / "results_json.json"
PARTIAL_RESULTS_PATH = ROOT / "partial_results.txt"

# RDF namespace used for the profile and signifier terms (hasContext, Signifier, salience, ...).
HMAS = Namespace("https://purl.org/hmas/")
# Temperature that set_temperature_zero() writes into every agent section of the config.
EVALUATION_TEMPERATURE = 0


@dataclass
class LlmConfig:
    provider: str
    model: str
    reasoning_effort: str | None = None

    def result_label(self) -> str:
        label = f"{self.provider}:{self.model}"
        if self.reasoning_effort is not None:
            return f"{label} (reasoning={self.reasoning_effort})"
        return label


@dataclass
class GoalCase:
    """A goal statement, the tool labels counted as relevant for it, and optional profile limits."""

    # Natural-language goal; update_profile() writes it as the profile context comment.
    goal: str
    # Labels that count as correct selections when precision/recall are computed.
    relevant_tools: list[str]
    # Cap on the number of selected signifiers; None leaves the property out of the profile.
    maximum_signifiers: int | None = None
    # Minimum relevance for selection; None leaves the property out of the profile
    # (filter_signifier_relevance() then falls back to a threshold of 0.5).
    minimum_relevance_value: float | None = None


@dataclass(frozen=True)
class SignifierRelevance:
    """Immutable pair of a signifier label and the relevance value reported for it."""

    # rdfs:label of the signifier, or its URI string when no label is present
    # (see fetch_signifier_relevance()).
    label: str
    # Value parsed from the signifier's hmas:salience; 0.0 when missing or unparsable.
    relevance: float


# Server processes launched by ServerManager.start(), in this order.
# Each entry is (name printed at start-up, seconds to sleep after launching, command argv).
# A delay of 0 means the next step proceeds immediately.
SERVER_COMMANDS = [
    ("cherrybot_simu", 5, ["uv", "run", "wot_sem/cherrybot_simulation.py"]),
    ("formalizer_coala", 2, ["uv", "run", "a2a_sem/formalizer/formalizer_coala.py"]),
    ("goal_mcp", 2, ["uv", "run", "mcp_sem/goal_mcp.py"]),
    ("cherrybot_proxy", 10, ["uv", "run", "wot_sem/cherrybot_proxy.py"]),
    ("app", 2, ["uv", "run", "app.py"]),
    ("sem_mcp", 0, ["uv", "run", "mcp_sem/sem_mcp.py"]),
]

# Goal text of the robot-control case. iter_base_goal_cases() pairs it with the
# cherrybot tool label supplied by the caller, and _canonical_goal_name() uses it
# as the marker for robot-control rows.
ROBOT_GOAL = "The agent wants to perform a formal operation on the robot."

# Goal cases whose relevant tool labels are fixed (no profile parameters set here;
# the parameter combinations are added by _with_profile_parameter_combinations()).
BASE_GOAL_CASES = [
    GoalCase(
        goal="The agent wants to read the user goal.",
        relevant_tools=["goal_mcp_CurrentGoal"],
    ),
    GoalCase(
        goal="The agent wants to provide feedback to the user.",
        relevant_tools=["goal_mcp_provide_feedback"],
    ),
    GoalCase(
        goal="The agent wants to convert the natural language goal into a formal representation.",
        relevant_tools=["formalizer_Formalize goal"],
    ),
    GoalCase(
        goal="The agent wants to read the goal or provide feedback to the user.",
        relevant_tools=["goal_mcp_CurrentGoal", "goal_mcp_provide_feedback"],
    ),
]

# LLM configurations under evaluation. Their result labels also determine the
# leading column order (_model_labels_for_results) and the "LLM <index>" display
# names (_configured_llm_display_labels) used by the table helpers.
LLMS_TO_EVALUATE = [
    LlmConfig(provider="ollama", model="gemma3:1b"),
    LlmConfig(provider="ollama", model="ministral-3:latest"),
    LlmConfig(provider="ollama", model="ministral-3:14b"),
    LlmConfig(provider="openai", model="gpt-4.1-mini"),
    LlmConfig(provider="openai", model="gpt-5.4-mini", reasoning_effort="none"),
    LlmConfig(provider="openai", model="gpt-5.4-mini", reasoning_effort="medium"),
]

# Cherrybot interface descriptions (UTCP and TD variants).
# NOTE(review): these four names and ARTIFACT_REGISTRATION_URL are not referenced
# in the visible part of the file — presumably used by the registration code further down.
CHERRYBOT_UTCP_INSTANCE = "cherrybot_utcp"
CHERRYBOT_UTCP_URL = "http://localhost:8086/utcp"
CHERRYBOT_TD_INSTANCE = "cherrybot_td"
CHERRYBOT_TD_URL = "http://localhost:8086/td"

ARTIFACT_REGISTRATION_URL = "http://localhost:5000/artifacts/registration"

# Profile resource updated by update_profile() and passed as the "profile" query
# parameter when signifiers are fetched.
PROFILE_ID = "evaluation"
PROFILE_URL = f"http://localhost:5000/profile/{PROFILE_ID}"
SIGNIFIERS_URL = "http://localhost:5000/signifiers"
PROFILE_CONTEXT_URL = f"http://localhost:5000/profile/{PROFILE_ID}/nl_context"

# Number of runs evaluate_llm() records for every goal/parameter combination.
RUNS_PER_GOAL = 25
# NOTE(review): not referenced in the visible part of the file — presumably toggles
# the WoT robot-control rows; confirm against the caller.
RUN_WOT_ROBOT_CONTROL_TEST = False

# Profile parameter grid; _with_profile_parameter_combinations() builds the full
# cross product. None means the property is left out of the profile.
MAXIMUM_SIGNIFIERS_VALUES = [None, 1, 2, 10]
MINIMUM_RELEVANCE_VALUES = [None, 0, 0.25, 0.75, 1]
# minimumRelevanceValue that evaluate_llm() forces while collecting relevance values;
# presumably low enough that the server returns every signifier — TODO confirm.
RELEVANCE_COLLECTION_THRESHOLD = -1.0


class ServerManager:
    """Start and stop the helper server processes listed in ``SERVER_COMMANDS``.

    Every server is launched in its own session, so its PID is also its
    process-group id and the whole group can be signalled on shutdown.
    """

    def __init__(self) -> None:
        self.processes: list[Popen] = []
        self.started = False

    def start(self) -> None:
        """Launch all servers in order, sleeping the configured delay after each.

        Does nothing when the servers are already marked as started.
        """
        if self.started:
            return
        for name, delay, cmd in SERVER_COMMANDS:
            print(f"Starting: {name}")
            # start_new_session=True makes the child a group leader so stop()
            # can signal the command together with anything it spawned.
            proc = Popen(cmd, cwd=str(ROOT), start_new_session=True)
            self.processes.append(proc)
            print(f"  -> PID {proc.pid}")
            if delay:
                print(f"Sleeping {delay}s...")
                time.sleep(delay)
        self.started = True

    def stop(self) -> None:
        """Terminate every launched process group and reset the manager.

        Each group first receives SIGTERM. A process that has not exited
        after 5 seconds gets SIGKILL. Afterwards the process list is cleared
        and ``started`` is reset, so a later ``start()`` launches fresh
        servers and a repeated ``stop()`` cannot signal stale PIDs.
        """
        # Local import: the file only imports Popen from subprocess.
        from subprocess import TimeoutExpired

        if not self.processes:
            return
        print("\nStopping all servers...")
        for proc in self.processes:
            self._signal_group(proc, signal.SIGTERM)
        for proc in self.processes:
            try:
                proc.wait(timeout=5)
            except TimeoutExpired:
                # The server ignored SIGTERM; escalate instead of leaking it.
                self._signal_group(proc, signal.SIGKILL)
                try:
                    proc.wait(timeout=5)
                except TimeoutExpired:
                    print(f"  -> PID {proc.pid} did not exit after SIGKILL")
        self.processes.clear()
        self.started = False

    @staticmethod
    def _signal_group(proc: Popen, sig: int) -> None:
        """Send ``sig`` to the process group of ``proc``; ignore groups that are already gone."""
        try:
            os.killpg(proc.pid, sig)
        except ProcessLookupError:
            pass


def _http_request(
    url: str, method: str = "GET", data: bytes | None = None, headers: dict | None = None
):
    req = urllib.request.Request(url, data=data, method=method)
    if headers:
        for key, value in headers.items():
            req.add_header(key, value)
    # Intentionally no timeout: wait until the endpoint responds.
    return urllib.request.urlopen(req)


def update_profile_context(context: str) -> None:
    """PUT ``context`` to ``PROFILE_CONTEXT_URL`` as the JSON body ``{"context": ...}``.

    Raises:
        RuntimeError: If the response status is 400 or higher.
    """
    body = json.dumps({"context": context}).encode("utf-8")
    response = _http_request(
        PROFILE_CONTEXT_URL,
        method="PUT",
        data=body,
        headers={"Content-Type": "application/json"},
    )
    with response as resp:
        if not resp.status < 400:
            raise RuntimeError(f"Failed to update profile context: HTTP {resp.status}")


def fetch_signifier_relevance() -> list[SignifierRelevance]:
    """Fetch the signifiers for the evaluation profile and return them ranked.

    The signifier list is requested as Turtle from ``SIGNIFIERS_URL`` with the
    profile URL as query parameter. For every ``hmas:Signifier`` the first
    ``rdfs:label`` (falling back to the URI string) and the first
    ``hmas:salience`` (falling back to 0.0 when missing or not a number) are
    taken.

    Returns:
        Entries ordered by decreasing relevance, ties broken by label.
    """
    query = urllib.parse.urlencode({"profile": PROFILE_URL})
    with _http_request(f"{SIGNIFIERS_URL}?{query}", headers={"Accept": "text/turtle"}) as resp:
        turtle = resp.read()

    graph = Graph()
    graph.parse(data=turtle, format="turtle")

    entries: list[SignifierRelevance] = []
    for signifier in graph.subjects(RDF.type, HMAS["Signifier"]):
        # Only the first label / salience triple is considered.
        first_label = next(iter(graph.objects(signifier, RDFS["label"])), None)
        label = None if first_label is None else str(first_label)

        first_salience = next(iter(graph.objects(signifier, HMAS["salience"])), None)
        relevance = 0.0
        if first_salience is not None:
            try:
                relevance = float(str(first_salience))
            except (ValueError, TypeError):
                relevance = 0.0

        entries.append(SignifierRelevance(label or str(signifier), relevance))

    entries.sort(key=lambda entry: (-entry.relevance, entry.label))
    return entries


def filter_signifier_relevance(
    signifier_relevance: list[SignifierRelevance], goal_case: GoalCase
) -> list[SignifierRelevance]:
    """Apply the goal case's profile limits to an already ranked relevance list.

    Entries below the minimum relevance (0.5 when the goal case sets none) are
    dropped. The remainder is truncated to ``maximum_signifiers`` when that is set.
    Input order is preserved.
    """
    minimum = goal_case.minimum_relevance_value
    threshold = 0.5 if minimum is None else minimum

    kept = [entry for entry in signifier_relevance if entry.relevance >= threshold]

    limit = goal_case.maximum_signifiers
    if limit is None:
        return kept
    return kept[:limit]


def compute_precision_recall(
    selected: set[str], relevant: set[str]
) -> tuple[float, float, int, int, int]:
    """Return ``(precision, recall, tp, fp, fn)`` for a selection against the relevant set.

    Empty sets are handled by convention: with nothing selected, precision is
    1.0 only when nothing was relevant either; with nothing relevant, recall
    is 1.0 only when nothing was selected.
    """
    tp = len(selected & relevant)
    fp = len(selected) - tp
    fn = len(relevant) - tp

    if not selected:
        precision = 0.0 if relevant else 1.0
    else:
        precision = tp / len(selected)

    if not relevant:
        recall = 0.0 if selected else 1.0
    else:
        recall = tp / len(relevant)

    return precision, recall, tp, fp, fn


def compute_r_precision(ranked_labels: list[str], relevant: set[str]) -> float:
    """Return the fraction of the top ``len(relevant)`` ranked labels that are relevant.

    With no relevant labels the result is 1.0.
    """
    cutoff = len(relevant)
    if not cutoff:
        return 1.0
    top_labels = set(ranked_labels[:cutoff])
    return len(top_labels & relevant) / cutoff


def compute_r_precision_bounds(
    signifier_relevance: list[SignifierRelevance], relevant: set[str]
) -> tuple[float, float, list[str], list[str]]:
    """
    Compute the lowest and highest R-precision consistent with the relevance values.

    Labels are ranked by decreasing relevance. Within a group of equal relevance the
    order is undetermined, so the best ranking places relevant labels first and the
    worst ranking places them last (alphabetical otherwise).

    Returns ``(minimum, maximum, best_ranking, worst_ranking)``.
    """
    # Bucket the entries by their exact relevance value.
    tiers: dict[float, list[SignifierRelevance]] = {}
    for entry in signifier_relevance:
        if entry.relevance not in tiers:
            tiers[entry.relevance] = []
        tiers[entry.relevance].append(entry)

    best_ranking: list[str] = []
    worst_ranking: list[str] = []
    for score in sorted(tiers, reverse=True):
        tier = tiers[score]
        relevant_first = sorted(
            tier, key=lambda entry: (entry.label not in relevant, entry.label)
        )
        relevant_last = sorted(
            tier, key=lambda entry: (entry.label in relevant, entry.label)
        )
        best_ranking += [entry.label for entry in relevant_first]
        worst_ranking += [entry.label for entry in relevant_last]

    minimum = compute_r_precision(worst_ranking, relevant)
    maximum = compute_r_precision(best_ranking, relevant)
    return minimum, maximum, best_ranking, worst_ranking


def load_config() -> dict:
    """Read ``CONFIG_PATH`` and return the parsed JSON document."""
    with CONFIG_PATH.open() as handle:
        return json.load(handle)


def write_config(config: dict) -> None:
    """Write ``config`` to ``CONFIG_PATH`` as 2-space indented JSON with a trailing newline."""
    serialized = json.dumps(config, indent=2)
    CONFIG_PATH.write_text(f"{serialized}\n")


def set_temperature_zero(config: dict) -> None:
    """Set the temperature of every agent section to ``EVALUATION_TEMPERATURE`` in place.

    The sections ``sem``, ``llm_agent`` and ``formalizer_agent`` are created when missing.
    """
    for section_name in ("sem", "llm_agent", "formalizer_agent"):
        section = config.setdefault(section_name, {})
        section["temperature"] = EVALUATION_TEMPERATURE


def iter_goal_cases() -> list[GoalCase]:
    """Return every base goal case expanded over all profile parameter combinations."""
    return [
        expanded
        for base_case in BASE_GOAL_CASES
        for expanded in _with_profile_parameter_combinations(base_case)
    ]


def iter_base_goal_cases(cherrybot_tool: str) -> list[GoalCase]:
    """Return fresh copies of the base goal cases followed by the robot goal case.

    The robot case uses ``ROBOT_GOAL`` with ``cherrybot_tool`` as its only
    relevant tool. No profile parameters are set on any returned case.
    """
    goal_cases: list[GoalCase] = []
    for base_case in BASE_GOAL_CASES:
        copied = GoalCase(goal=base_case.goal, relevant_tools=list(base_case.relevant_tools))
        goal_cases.append(copied)

    robot_case = GoalCase(goal=ROBOT_GOAL, relevant_tools=[cherrybot_tool])
    goal_cases.append(robot_case)
    return goal_cases


def _with_profile_parameter_combinations(goal_case: GoalCase) -> list[GoalCase]:
    """Expand ``goal_case`` into one copy per (maximum signifiers, minimum relevance) pair.

    The maximum-signifiers values vary slowest, the minimum-relevance values fastest.
    """
    expanded: list[GoalCase] = []
    for maximum_signifiers in MAXIMUM_SIGNIFIERS_VALUES:
        for minimum_relevance_value in MINIMUM_RELEVANCE_VALUES:
            expanded.append(
                GoalCase(
                    goal=goal_case.goal,
                    relevant_tools=list(goal_case.relevant_tools),
                    maximum_signifiers=maximum_signifiers,
                    minimum_relevance_value=minimum_relevance_value,
                )
            )
    return expanded


def _parameter_key(goal_case: GoalCase) -> dict[str, int | float | None]:
    return {
        "maximumSignifiers": goal_case.maximum_signifiers,
        "minimumRelevanceValue": goal_case.minimum_relevance_value,
    }


def _parameter_label(goal_case: GoalCase) -> str:
    maximum_signifiers = (
        "absent" if goal_case.maximum_signifiers is None else str(goal_case.maximum_signifiers)
    )
    minimum_relevance_value = (
        "absent"
        if goal_case.minimum_relevance_value is None
        else _format_ratio(goal_case.minimum_relevance_value)
    )
    return f"max={maximum_signifiers}, min={minimum_relevance_value}"


def update_profile(
    goal_case: GoalCase,
    *,
    force_minimum_relevance_value: float | None = None,
) -> None:
    """Replace the evaluation profile with one describing ``goal_case``.

    The profile carries the goal text as the comment of its context node, plus
    ``maximumSignifiers`` and ``minimumRelevanceValue`` when they are set. It is
    sent as Turtle with a PUT to ``PROFILE_URL``.

    Args:
        goal_case: Goal text and profile parameters to publish.
        force_minimum_relevance_value: When not None, used instead of the goal
            case's own minimum relevance value.

    Raises:
        RuntimeError: If the response status is 400 or higher.
    """
    graph = Graph()
    profile_uri = URIRef(PROFILE_URL)
    context_node = BNode()
    graph.add((profile_uri, HMAS["hasContext"], context_node))
    graph.add((context_node, RDFS["comment"], Literal(goal_case.goal)))

    if goal_case.maximum_signifiers is not None:
        maximum_literal = Literal(goal_case.maximum_signifiers, datatype=XSD.integer)
        graph.add((profile_uri, HMAS["maximumSignifiers"], maximum_literal))

    if force_minimum_relevance_value is None:
        minimum_relevance_value = goal_case.minimum_relevance_value
    else:
        minimum_relevance_value = force_minimum_relevance_value
    if minimum_relevance_value is not None:
        minimum_literal = Literal(minimum_relevance_value, datatype=XSD.decimal)
        graph.add((profile_uri, HMAS["minimumRelevanceValue"], minimum_literal))

    body = graph.serialize(format="turtle").encode("utf-8")
    response = _http_request(
        PROFILE_URL, method="PUT", data=body, headers={"Content-Type": "text/turtle"}
    )
    with response as resp:
        if not resp.status < 400:
            raise RuntimeError(f"Failed to update profile: HTTP {resp.status}")


def wait_for_app_ready(timeout: float | None = None, poll_interval: float = 1.0) -> None:
    """Poll the app's signifier list endpoint until it answers successfully.

    Args:
        timeout: Maximum number of seconds to keep polling. ``None`` (the
            default) polls forever, as before. The deadline is only checked
            between attempts, and a single request has no timeout of its own,
            so a hanging request can overrun it.
        poll_interval: Seconds to sleep between attempts.

    Raises:
        RuntimeError: If ``timeout`` elapses before the endpoint answers with a
            status below 400. The message includes the last error seen.
    """
    deadline = None if timeout is None else time.monotonic() + timeout
    last_error: Exception | None = None
    while deadline is None or time.monotonic() < deadline:
        try:
            with _http_request("http://localhost:5000/signifiers/list") as resp:
                if resp.status < 400:
                    return
                last_error = RuntimeError(f"HTTP {resp.status}")
        except Exception as exc:
            # Any failure (connection refused, HTTP error, ...) means "not ready yet".
            last_error = exc
        time.sleep(poll_interval)
    raise RuntimeError(f"App did not become ready: {last_error}")


def _find_existing_result(
    all_results: list[dict], llm: LlmConfig, interface_name: str
) -> dict | None:
    llm_label = llm.result_label()
    for entry in all_results:
        if entry.get("llm") == llm_label and entry.get("interface") == interface_name:
            return entry

    legacy_label = f"{llm.provider}:{llm.model}"
    if llm.reasoning_effort == "none" and legacy_label != llm_label:
        for entry in all_results:
            if entry.get("llm") == legacy_label and entry.get("interface") == interface_name:
                entry["llm"] = llm_label
                entry["reasoning"] = llm.reasoning_effort
                return entry
    return None


def _find_or_create_goal_result(results: dict, goal_case: GoalCase) -> dict:
    """Return the goal entry in ``results`` matching the goal text and profile parameters.

    A matching entry has its relevant tools and parameters refreshed and is
    guaranteed to have a ``runs`` list. When no entry matches, a new one with
    an empty ``runs`` list is appended to ``results["goals"]``.
    """
    parameter_key = _parameter_key(goal_case)
    match = next(
        (
            candidate
            for candidate in results.get("goals", [])
            if candidate.get("goal") == goal_case.goal
            and candidate.get("profile_parameters") == parameter_key
        ),
        None,
    )

    if match is None:
        created = {
            "goal": goal_case.goal,
            "relevant_tools": list(goal_case.relevant_tools),
            "profile_parameters": parameter_key,
            "runs": [],
        }
        results.setdefault("goals", []).append(created)
        return created

    match["relevant_tools"] = list(goal_case.relevant_tools)
    match["profile_parameters"] = parameter_key
    match.setdefault("runs", [])
    return match


def evaluate_llm(
    llm: LlmConfig,
    cherrybot_tool: str,
    interface_name: str,
    all_results: list[dict],
    on_run_persist: Callable[[], None] | None = None,
) -> dict:
    """Evaluate one LLM on every goal case and record the runs in ``all_results``.

    The config file is rewritten to select ``llm`` (provider, model, optional
    reasoning effort, evaluation temperature). For each base goal, relevance
    values are collected once per run and then filtered locally for every
    profile parameter combination, so all combinations of a run share the same
    relevance data. Existing runs in ``all_results`` are reused; only missing
    runs up to ``RUNS_PER_GOAL`` are executed.

    Args:
        llm: LLM configuration to evaluate.
        cherrybot_tool: Tool label counted as relevant for the robot goal.
        interface_name: Interface name stored with (and used to look up) the result entry.
        all_results: Result entries of all evaluations; updated in place.
        on_run_persist: Optional callback invoked after each collected run.

    Returns:
        The result entry for this LLM/interface (also contained in ``all_results``).
    """
    # Point the "sem" section of the config file at the LLM under evaluation.
    config = load_config()
    config.setdefault("sem", {})
    config["sem"]["provider"] = llm.provider
    config["sem"]["model"] = llm.model
    if llm.reasoning_effort is not None:
        config["sem"]["reasoning"] = llm.reasoning_effort
    else:
        # Drop a reasoning setting left over from a previously evaluated LLM.
        config["sem"].pop("reasoning", None)
    set_temperature_zero(config)
    write_config(config)

    # Reuse the stored entry for this LLM/interface, or register a new one.
    llm_label = llm.result_label()
    results = _find_existing_result(all_results, llm, interface_name)
    if results is None:
        results = {
            "llm": llm_label,
            "interface": interface_name,
            "goals": [],
        }
        if llm.reasoning_effort is not None:
            results["reasoning"] = llm.reasoning_effort
        all_results.append(results)
    elif llm.reasoning_effort is not None:
        results["reasoning"] = llm.reasoning_effort

    for base_goal_case in iter_base_goal_cases(cherrybot_tool):
        parameter_cases = _with_profile_parameter_combinations(base_goal_case)
        goal_results = [
            (goal_case, _find_or_create_goal_result(results, goal_case))
            for goal_case in parameter_cases
        ]

        # Never keep more than RUNS_PER_GOAL runs per parameter combination.
        for _goal_case, goal_result in goal_results:
            goal_result["runs"] = list(goal_result.get("runs", []))[:RUNS_PER_GOAL]

        # Resume from the combination with the fewest stored runs.
        completed_runs = min(len(goal_result["runs"]) for _goal_case, goal_result in goal_results)
        for run_index in range(completed_runs, RUNS_PER_GOAL):
            run_start = time.monotonic()
            # Publish the goal with the collection threshold instead of the goal
            # case's own minimum; the per-combination filtering happens locally below.
            update_profile(
                base_goal_case,
                force_minimum_relevance_value=RELEVANCE_COLLECTION_THRESHOLD,
            )
            signifier_relevance = fetch_signifier_relevance()
            serialized_relevance = [
                {"label": item.label, "relevance": item.relevance}
                for item in signifier_relevance
            ]

            for goal_case, goal_result in goal_results:
                # This combination already has a run at this index.
                if len(goal_result["runs"]) > run_index:
                    continue

                selected_relevance = filter_signifier_relevance(signifier_relevance, goal_case)
                selected = {item.label for item in selected_relevance}
                relevant_set = set(goal_case.relevant_tools)
                precision, recall, tp, fp, fn = compute_precision_recall(selected, relevant_set)
                # The R-precision bounds use the unfiltered relevance list.
                (
                    min_r_precision,
                    max_r_precision,
                    best_ranking,
                    worst_ranking,
                ) = compute_r_precision_bounds(
                    signifier_relevance,
                    relevant_set,
                )
                goal_result["runs"].append(
                    {
                        "profile_parameters": _parameter_key(goal_case),
                        "signifier_relevance": serialized_relevance,
                        "selected": sorted(selected),
                        "precision": precision,
                        "recall": recall,
                        "minimum_possible_r_precision": min_r_precision,
                        "maximum_possible_r_precision": max_r_precision,
                        "best_ranking": best_ranking,
                        "worst_ranking": worst_ranking,
                        "tp": tp,
                        "fp": fp,
                        "fn": fn,
                        # Measured from the start of the shared collection step.
                        "elapsed_seconds": time.monotonic() - run_start,
                    }
                )

            if on_run_persist is not None:
                on_run_persist()

    return results


def summarize_results(results: dict) -> None:
    """Print per-goal averages for one LLM's result entry.

    Args:
        results: Result entry with ``llm``, optional ``interface`` and a
            ``goals`` list. Each goal needs ``goal``, ``relevant_tools`` and
            ``runs``; every run needs ``precision`` and ``recall``.

    A goal without runs is reported as ``No runs recorded.`` instead of
    raising ``ZeroDivisionError``.
    """
    print("\nEvaluation results:")
    print(f"LLM: {results['llm']}")
    if results.get("interface"):
        print(f"Interface: {results['interface']}")
    for goal in results["goals"]:
        runs = goal["runs"]
        averages = None
        if runs:
            count = len(runs)
            averages = (
                sum(r["precision"] for r in runs) / count,
                sum(r["recall"] for r in runs) / count,
                # Older runs may lack these fields; they count as 0.0.
                sum(float(r.get("minimum_possible_r_precision", 0.0)) for r in runs) / count,
                sum(float(r.get("maximum_possible_r_precision", 0.0)) for r in runs) / count,
                sum(float(r.get("elapsed_seconds", 0.0)) for r in runs) / count,
            )

        print("-")
        print(f"Goal: {goal['goal']}")
        if goal.get("profile_parameters"):
            print(f"Profile parameters: {goal['profile_parameters']}")
        print(f"Relevant tools: {goal['relevant_tools']}")
        if averages is None:
            print("No runs recorded.")
            continue

        (
            avg_precision,
            avg_recall,
            avg_min_r_precision,
            avg_max_r_precision,
            avg_elapsed_seconds,
        ) = averages
        print(f"Avg precision: {avg_precision:.3f}")
        print(f"Avg recall: {avg_recall:.3f}")
        print(
            "Avg possible R-precision: "
            f"{avg_min_r_precision:.3f}-{avg_max_r_precision:.3f}"
        )
        print(f"Avg time: {avg_elapsed_seconds:.2f}s")


def _load_existing_results(path: Path) -> dict | None:
    if not path.exists():
        return None
    text = path.read_text()
    try:
        parsed = json.loads(text)
        return parsed if isinstance(parsed, dict) else None
    except json.JSONDecodeError:
        # Support files where a human-readable table is appended after JSON.
        decoder = json.JSONDecoder()
        try:
            parsed, _end = decoder.raw_decode(text.lstrip())
        except json.JSONDecodeError:
            return None
        return parsed if isinstance(parsed, dict) else None


def _format_cell_stats(goal_entry: dict) -> str:
    runs = goal_entry.get("runs", [])
    if not runs:
        return "-"
    n = len(runs)
    avg_precision = sum(float(run.get("precision", 0.0)) for run in runs) / n
    avg_recall = sum(float(run.get("recall", 0.0)) for run in runs) / n
    avg_elapsed_seconds = sum(float(run.get("elapsed_seconds", 0.0)) for run in runs) / n
    return (
        f"P:{_format_ratio(avg_precision)}, "
        f"R:{_format_ratio(avg_recall)}, "
        f"T:{_format_seconds(avg_elapsed_seconds)}"
    )


def _format_r_precision_cell(runs: list[dict]) -> str:
    if not runs:
        return "-"
    records = {
        (
            float(run.get("minimum_possible_r_precision", 0.0)),
            float(run.get("maximum_possible_r_precision", 0.0)),
            tuple(str(label) for label in run.get("best_ranking", [])),
            tuple(str(label) for label in run.get("worst_ranking", [])),
        )
        for run in runs
        if "minimum_possible_r_precision" in run and "maximum_possible_r_precision" in run
    }
    if not records:
        return "-"

    n = len(records)
    avg_min_r_precision = sum(record[0] for record in records) / n
    avg_max_r_precision = sum(record[1] for record in records) / n
    best_rankings = sorted({record[2] for record in records})
    worst_rankings = sorted({record[3] for record in records})
    return (
        f"RP:{_format_ratio(avg_min_r_precision)}-{_format_ratio(avg_max_r_precision)}; "
        f"best_ranking:{_format_rankings(best_rankings)}; "
        f"worst_ranking:{_format_rankings(worst_rankings)}"
    )


def _format_rankings(rankings: list[tuple[str, ...]]) -> str:
    return " | ".join("[" + ", ".join(ranking) + "]" for ranking in rankings)


def _runs_with_r_precision_rankings(goal_entry: dict) -> list[dict]:
    relevant = {str(label) for label in goal_entry.get("relevant_tools", [])}
    enriched_runs: list[dict] = []
    for run in goal_entry.get("runs", []):
        if not isinstance(run, dict):
            continue
        enriched_run = dict(run)
        if (
            "best_ranking" not in enriched_run
            or "worst_ranking" not in enriched_run
            or "minimum_possible_r_precision" not in enriched_run
            or "maximum_possible_r_precision" not in enriched_run
        ):
            signifier_relevance = [
                SignifierRelevance(str(item.get("label", "")), float(item.get("relevance", 0.0)))
                for item in enriched_run.get("signifier_relevance", [])
                if isinstance(item, dict) and item.get("label")
            ]
            if signifier_relevance:
                (
                    min_r_precision,
                    max_r_precision,
                    best_ranking,
                    worst_ranking,
                ) = compute_r_precision_bounds(signifier_relevance, relevant)
                enriched_run["minimum_possible_r_precision"] = min_r_precision
                enriched_run["maximum_possible_r_precision"] = max_r_precision
                enriched_run["best_ranking"] = best_ranking
                enriched_run["worst_ranking"] = worst_ranking
        enriched_runs.append(enriched_run)
    return enriched_runs


def _format_ratio(value: float) -> str:
    rounded = round(value, 2)
    if rounded.is_integer():
        return str(int(rounded))
    return f"{rounded:.2f}"


def _format_seconds(value: float) -> str:
    rounded = round(value, 2)
    if rounded.is_integer():
        return f"{int(rounded)}s"
    return f"{rounded:.2f}s"


def _format_duration(value: float) -> str:
    total_seconds = int(round(value))
    hours, remainder = divmod(total_seconds, 3600)
    minutes, seconds = divmod(remainder, 60)
    if hours:
        return f"{hours}h {minutes}m {seconds}s"
    if minutes:
        return f"{minutes}m {seconds}s"
    return _format_seconds(value)


def _canonical_goal_name(goal_name: str) -> str:
    normalized = " ".join(goal_name.strip().lower().split())
    if normalized == "the agent wants to read the user goal.":
        return "Goal Signifier"
    if normalized in {
        "the agent wants to provide feedback to the user.",
    }:
        return "Feedback"
    if normalized == "the agent wants to convert the natural language goal into a formal representation.":
        return "Formalizer"
    if normalized in {
        "the agent wants to read the goal or provide feedback to the user."
    }:
        return "Goal/Feedback"
    if "perform a formal operation on the robot" in normalized:
        return ROBOT_GOAL
    return goal_name.strip()


def _goal_row_label(goal_name: str, interface_name: str) -> str:
    """Return the table row label for a goal evaluated over the given interface.

    Robot goals become ``Robot Control``, or ``Robot Control (WoT)`` for the
    ``td``/``wot`` interfaces. Other goals use their canonical name.
    """
    canonical_goal = _canonical_goal_name(goal_name)
    if canonical_goal != ROBOT_GOAL:
        return canonical_goal
    uses_wot = interface_name.strip().lower() in {"td", "wot"}
    return "Robot Control (WoT)" if uses_wot else "Robot Control"


def _goal_entry_row_label(goal_entry: dict, interface_name: str) -> str:
    """Return the row label of a stored goal entry, with its profile parameters appended.

    The parameter suffix is only added when the entry stores its profile
    parameters as a dict.
    """
    goal_text = goal_entry.get("goal", "unknown")
    base_label = _goal_row_label(str(goal_text), interface_name)

    parameters = goal_entry.get("profile_parameters")
    if not isinstance(parameters, dict):
        return base_label

    # Rebuild a GoalCase only to reuse the shared parameter label formatting.
    parameter_case = GoalCase(
        goal=str(goal_entry.get("goal", "")),
        relevant_tools=[],
        maximum_signifiers=parameters.get("maximumSignifiers"),
        minimum_relevance_value=parameters.get("minimumRelevanceValue"),
    )
    return f"{base_label} ({_parameter_label(parameter_case)})"


def _configured_llm_labels() -> list[str]:
    """Return the result labels of all configured LLMs, in configuration order."""
    labels: list[str] = []
    for llm in LLMS_TO_EVALUATE:
        labels.append(llm.result_label())
    return labels


def _configured_llm_display_labels() -> dict[str, str]:
    """Map each configured LLM result label to its ``LLM <index>`` display name."""
    display_labels: dict[str, str] = {}
    for index, label in enumerate(_configured_llm_labels()):
        display_labels[label] = f"LLM {index}"
    return display_labels


def _model_labels_for_results(results_entries: list[dict]) -> list[str]:
    """Return the model column order: configured LLMs first, then any others found.

    Labels that occur in ``results_entries`` but are not configured are
    appended in order of first appearance.
    """
    model_labels = list(_configured_llm_labels())
    for entry in results_entries:
        label = str(entry.get("llm", "unknown"))
        if label not in model_labels:
            model_labels.append(label)
    return model_labels


def _expected_goal_rows(include_wot_robot_control: bool) -> list[str]:
    """Return the row labels expected in the full results table, in display order.

    Base goals come first, one row per profile parameter combination, followed
    by the robot-control rows and, when requested, their WoT variants.
    """
    rows: list[str] = []
    for goal_case in iter_goal_cases():
        base_label = _goal_row_label(goal_case.goal, "utcp")
        labelled = f"{base_label} ({_parameter_label(goal_case)})"
        if labelled not in rows:
            rows.append(labelled)

    robot_goal_cases = _with_profile_parameter_combinations(
        GoalCase(goal=ROBOT_GOAL, relevant_tools=[])
    )
    suffixes = [_parameter_label(goal_case) for goal_case in robot_goal_cases]
    rows += [f"Robot Control ({suffix})" for suffix in suffixes]
    if include_wot_robot_control:
        rows += [f"Robot Control (WoT) ({suffix})" for suffix in suffixes]
    return rows


def _expected_r_precision_goal_rows(include_wot_robot_control: bool) -> list[str]:
    """Return the row labels expected in the R-precision table (one row per goal)."""
    rows: list[str] = []
    for goal_case in BASE_GOAL_CASES:
        label = _goal_row_label(goal_case.goal, "utcp")
        if label in rows:
            continue
        rows.append(label)

    robot_rows = ["Robot Control"]
    if include_wot_robot_control:
        robot_rows.append("Robot Control (WoT)")
    return rows + robot_rows


def _display_model_label(
    llm_label: str, model_display_labels: dict[str, str] | None = None
) -> str:
    display_labels = model_display_labels or _configured_llm_display_labels()
    return display_labels.get(llm_label, llm_label)


def _is_wot_entry(result_entry: dict) -> bool:
    return str(result_entry.get("interface", "")).strip().lower() in {"td", "wot"}


def _filtered_results_entries(
    results_entries: list[dict], include_wot_robot_control: bool
) -> list[dict]:
    if include_wot_robot_control:
        return list(results_entries)
    return [entry for entry in results_entries if not _is_wot_entry(entry)]


def _latex_escape(value: str) -> str:
    replacements = {
        "&": r"\&",
        "%": r"\%",
        "_": r"\_",
        "#": r"\#",
    }
    out = value
    for old, new in replacements.items():
        out = out.replace(old, new)
    return out


def build_results_table(
    results_entries: list[dict],
    include_wot_robot_control: bool = False,
    model_labels: list[str] | None = None,
    model_display_labels: dict[str, str] | None = None,
) -> str:
    """Build the LaTeX results table: one row per goal/parameter case, one column per model.

    Args:
        results_entries: Stored result entries of all evaluated LLMs.
        include_wot_robot_control: Whether WoT entries and rows are included.
        model_labels: Column order; derived from the results when None.
        model_display_labels: Optional mapping from result label to column header.

    Returns:
        The LaTeX ``table`` environment, or a LaTeX comment when there are no results.
    """
    results_entries = _filtered_results_entries(results_entries, include_wot_robot_control)
    if not results_entries:
        return "% No results available."

    if model_labels is None:
        model_labels = _model_labels_for_results(results_entries)

    goals = _expected_goal_rows(include_wot_robot_control)

    # One pass collects both the rows that are not among the expected ones
    # and the runs belonging to every (model, row) cell.
    unexpected_rows: list[str] = []
    runs_by_cell: dict[tuple[str, str], list[dict]] = {}
    for entry in results_entries:
        model = str(entry.get("llm", "unknown"))
        interface_name = str(entry.get("interface", ""))
        for goal in entry.get("goals", []):
            row_label = _goal_entry_row_label(goal, interface_name)
            if row_label == "Robot Control (WoT)" and not include_wot_robot_control:
                continue
            if row_label not in goals and row_label not in unexpected_rows:
                unexpected_rows.append(row_label)
            runs_by_cell.setdefault((model, row_label), []).extend(goal.get("runs", []))
    goals.extend(unexpected_rows)

    header_cells = [
        _latex_escape(_display_model_label(label, model_display_labels))
        for label in model_labels
    ]
    col_spec = "l" + ("c" * len(model_labels))
    lines = [
        r"\begin{table}[htbp]",
        r"\centering",
        f"\\begin{{tabular}}{{{col_spec}}}",
        "  Task/Model & " + " & ".join(header_cells) + r"\\",
    ]

    for goal_name in goals:
        cells: list[str] = []
        for model in model_labels:
            runs = runs_by_cell.get((model, goal_name), [])
            if runs:
                cells.append(_format_cell_stats({"runs": runs}))
            else:
                cells.append("-")
        escaped_cells = " & ".join(_latex_escape(cell) for cell in cells)
        lines.append("  " + _latex_escape(goal_name) + " & " + escaped_cells + r"\\")

    lines += [
        r"\end{tabular}",
        r"\caption{Results}",
        r"\label{table:results}",
        r"\end{table}",
    ]
    return "\n".join(lines)


def _profile_parameters_match(
    goal_entry: dict,
    maximum_signifiers: int | None,
    minimum_relevance_value: float | None,
) -> bool:
    parameters = goal_entry.get("profile_parameters")
    if not isinstance(parameters, dict):
        return maximum_signifiers is None and minimum_relevance_value is None
    return (
        parameters.get("maximumSignifiers") == maximum_signifiers
        and parameters.get("minimumRelevanceValue") == minimum_relevance_value
    )


def _expected_filtered_goal_rows(include_wot_robot_control: bool) -> list[str]:
    """Return the row labels expected in a single-parameter table (one row per goal)."""
    rows: list[str] = []
    for goal_case in BASE_GOAL_CASES:
        label = _goal_row_label(goal_case.goal, "utcp")
        if label in rows:
            continue
        rows.append(label)

    robot_rows = ["Robot Control"]
    if include_wot_robot_control:
        robot_rows.append("Robot Control (WoT)")
    return rows + robot_rows


def build_parameter_results_table(
    results_entries: list[dict],
    maximum_signifiers: int | None,
    minimum_relevance_value: float | None,
    include_wot_robot_control: bool = False,
    model_labels: list[str] | None = None,
    model_display_labels: dict[str, str] | None = None,
) -> str:
    """Render a LaTeX task-by-model table for one profile-parameter setting.

    Only goal entries whose recorded profile parameters equal
    ``maximum_signifiers`` / ``minimum_relevance_value`` contribute runs.

    Args:
        results_entries: Per-(LLM, interface) result dicts; each is read for
            its ``llm``, ``interface`` and ``goals`` keys.
        maximum_signifiers: Parameter value the goal entries must match.
        minimum_relevance_value: Parameter value the goal entries must match.
        include_wot_robot_control: When false, "Robot Control (WoT)" rows are
            left out.
        model_labels: Column order; derived from the results when ``None``.
        model_display_labels: Optional mapping handed to
            ``_display_model_label`` for the column headers.

    Returns:
        The table as LaTeX source, or the comment ``% No results available.``
        when nothing is left after filtering the entries.
    """
    entries = _filtered_results_entries(results_entries, include_wot_robot_control)
    if not entries:
        return "% No results available."

    columns = _model_labels_for_results(entries) if model_labels is None else model_labels

    # Expected rows come first; labels only seen in the data are appended after.
    row_names = _expected_filtered_goal_rows(include_wot_robot_control)
    observed_rows: list[str] = []
    runs_by_cell: dict[tuple[str, str], list[dict]] = {}
    for entry in entries:
        llm_label = str(entry.get("llm", "unknown"))
        interface = str(entry.get("interface", ""))
        for goal_entry in entry.get("goals", []):
            matches = _profile_parameters_match(
                goal_entry, maximum_signifiers, minimum_relevance_value
            )
            if not matches:
                continue
            label = _goal_row_label(str(goal_entry.get("goal", "unknown")), interface)
            if label == "Robot Control (WoT)" and not include_wot_robot_control:
                continue
            if label not in observed_rows:
                observed_rows.append(label)
            # Pool the runs of every matching goal entry that shares a cell.
            runs_by_cell.setdefault((llm_label, label), []).extend(
                goal_entry.get("runs", [])
            )
    unexpected_rows = [label for label in observed_rows if label not in row_names]
    row_names.extend(unexpected_rows)

    parameter_label = _parameter_label(
        GoalCase(
            goal="",
            relevant_tools=[],
            maximum_signifiers=maximum_signifiers,
            minimum_relevance_value=minimum_relevance_value,
        )
    )
    # Build the \label suffix by dashing out "=", ", " and "." in that order.
    label_suffix = parameter_label
    for token in ("=", ", ", "."):
        label_suffix = label_suffix.replace(token, "-")

    line_end = r"\\"
    column_spec = "l" + "c" * len(columns)
    header_cells = " & ".join(
        _latex_escape(_display_model_label(column, model_display_labels))
        for column in columns
    )
    lines = [
        r"\begin{table}[htbp]",
        r"\centering",
        r"\begin{tabular}{" + column_spec + "}",
        f"  Task/Model & {header_cells}{line_end}",
    ]
    for row_name in row_names:
        cells = [
            _format_cell_stats({"runs": cell_runs}) if cell_runs else "-"
            for cell_runs in (
                runs_by_cell.get((column, row_name), []) for column in columns
            )
        ]
        escaped_name = _latex_escape(row_name)
        escaped_cells = " & ".join(map(_latex_escape, cells))
        lines.append(f"  {escaped_name} & {escaped_cells}{line_end}")
    lines += [
        r"\end{tabular}",
        r"\caption{Results (" + _latex_escape(parameter_label) + ")}",
        r"\label{table:results-" + label_suffix + "}",
        r"\end{table}",
    ]
    return "\n".join(lines)


def build_r_precision_table(
    results_entries: list[dict],
    include_wot_robot_control: bool = False,
    model_labels: list[str] | None = None,
    model_display_labels: dict[str, str] | None = None,
) -> str:
    """Render the LaTeX task-by-model table of R-precision bounds.

    Args:
        results_entries: Per-(LLM, interface) result dicts; each is read for
            its ``llm``, ``interface`` and ``goals`` keys.
        include_wot_robot_control: When false, "Robot Control (WoT)" rows are
            left out.
        model_labels: Column order; derived from the results when ``None``.
        model_display_labels: Optional mapping handed to
            ``_display_model_label`` for the column headers.

    Returns:
        The table as LaTeX source, or the comment
        ``% No R-precision results available.`` when nothing is left after
        filtering the entries.
    """
    entries = _filtered_results_entries(results_entries, include_wot_robot_control)
    if not entries:
        return "% No R-precision results available."

    columns = _model_labels_for_results(entries) if model_labels is None else model_labels

    # Expected rows come first; labels only seen in the data are appended after.
    row_names = _expected_r_precision_goal_rows(include_wot_robot_control)
    observed_rows: list[str] = []
    ranked_runs_by_cell: dict[tuple[str, str], list[dict]] = {}
    for entry in entries:
        llm_label = str(entry.get("llm", "unknown"))
        interface = str(entry.get("interface", ""))
        for goal_entry in entry.get("goals", []):
            label = _goal_row_label(str(goal_entry.get("goal", "unknown")), interface)
            if label == "Robot Control (WoT)" and not include_wot_robot_control:
                continue
            if label not in observed_rows:
                observed_rows.append(label)
            ranked_runs_by_cell.setdefault((llm_label, label), []).extend(
                _runs_with_r_precision_rankings(goal_entry)
            )
    unexpected_rows = [label for label in observed_rows if label not in row_names]
    row_names.extend(unexpected_rows)

    line_end = r"\\"
    column_spec = "l" + "c" * len(columns)
    header_cells = " & ".join(
        _latex_escape(_display_model_label(column, model_display_labels))
        for column in columns
    )
    lines = [
        r"\begin{table}[htbp]",
        r"\centering",
        r"\begin{tabular}{" + column_spec + "}",
        f"  Task/Model & {header_cells}{line_end}",
    ]
    for row_name in row_names:
        # A cell with no usable runs is rendered as a dash.
        cells = [
            _format_r_precision_cell(cell_runs) if cell_runs else "-"
            for cell_runs in (
                ranked_runs_by_cell.get((column, row_name), []) for column in columns
            )
        ]
        escaped_name = _latex_escape(row_name)
        escaped_cells = " & ".join(map(_latex_escape, cells))
        lines.append(f"  {escaped_name} & {escaped_cells}{line_end}")
    lines += [
        r"\end{tabular}",
        r"\caption{R-precision bounds}",
        r"\label{table:r-precision-bounds}",
        r"\end{table}",
    ]
    return "\n".join(lines)


def write_results(
    path: Path,
    payload: dict,
    include_wot_robot_control: bool = False,
    total_elapsed_seconds: float | None = None,
    json_path: Path | None = None,
) -> None:
    """Write the results report (JSON followed by LaTeX tables) to ``path``.

    The report consists of the filtered payload as pretty-printed JSON, an
    optional runtime section, the overall statistics table, one statistics
    table per (maximum signifiers, minimum relevance) combination, and the
    R-precision table.

    Args:
        path: Destination of the text report.
        payload: Dict whose ``results`` list is filtered and reported; a
            non-list ``results`` value is treated as empty.
        include_wot_robot_control: Forwarded to the filter and table builders.
        total_elapsed_seconds: Runtime to report. When ``None``, a numeric
            ``total_elapsed_seconds`` stored in ``payload`` is used instead;
            with neither available the runtime section is omitted.
        json_path: When given, the filtered JSON alone is also written there.
    """
    raw_entries = payload.get("results", [])
    if isinstance(raw_entries, list):
        entries = _filtered_results_entries(raw_entries, include_wot_robot_control)
    else:
        entries = []

    overall_table = build_results_table(
        entries,
        include_wot_robot_control=include_wot_robot_control,
    )

    # One (heading, table) pair per parameter combination, signifiers-major.
    per_parameter_tables: list[tuple[str, str]] = []
    for maximum_signifiers in MAXIMUM_SIGNIFIERS_VALUES:
        for minimum_relevance_value in MINIMUM_RELEVANCE_VALUES:
            heading = _parameter_label(
                GoalCase(
                    goal="",
                    relevant_tools=[],
                    maximum_signifiers=maximum_signifiers,
                    minimum_relevance_value=minimum_relevance_value,
                )
            )
            parameter_table = build_parameter_results_table(
                entries,
                maximum_signifiers,
                minimum_relevance_value,
                include_wot_robot_control=include_wot_robot_control,
            )
            per_parameter_tables.append((heading, parameter_table))

    r_precision_table = build_r_precision_table(
        entries,
        include_wot_robot_control=include_wot_robot_control,
    )

    # An explicit argument wins; otherwise fall back to a stored numeric value.
    elapsed = total_elapsed_seconds
    if elapsed is None:
        stored_elapsed = payload.get("total_elapsed_seconds")
        if isinstance(stored_elapsed, (int, float)):
            elapsed = float(stored_elapsed)

    filtered_payload: dict[str, object] = {"results": entries}
    runtime_section = ""
    if elapsed is not None:
        filtered_payload["total_elapsed_seconds"] = elapsed
        runtime_section = (
            "\n\n# Evaluation Runtime\n\n"
            f"Total elapsed time: {_format_duration(elapsed)} ({elapsed:.2f}s)\n"
        )

    # Serialise once; the same text feeds the JSON file and the report.
    serialized = json.dumps(filtered_payload, indent=2, sort_keys=True)
    if json_path is not None:
        json_path.write_text(serialized + "\n")

    sections = [
        serialized,
        runtime_section,
        "\n\n# Task x Model Statistics (LaTeX)\n\n",
        overall_table,
    ]
    for heading, parameter_table in per_parameter_tables:
        sections.append(
            f"\n\n# Task x Model Statistics ({_latex_escape(heading)}) (LaTeX)\n\n"
        )
        sections.append(parameter_table)
    sections.append("\n\n# Task x Model R-Precision Bounds (LaTeX)\n\n")
    sections.append(r_precision_table)
    sections.append("\n")
    path.write_text("".join(sections))


def _post_artifact_registration(payload: dict) -> None:
    """POST ``payload`` as JSON to the artifact registration endpoint.

    Raises:
        RuntimeError: If the response status is 400 or above; the message
            carries the status and the decoded response body.
    """
    encoded = json.dumps(payload).encode("utf-8")
    with _http_request(
        ARTIFACT_REGISTRATION_URL,
        method="POST",
        data=encoded,
        headers={"Content-Type": "application/json"},
    ) as response:
        if response.status < 400:
            return
        # Undecodable bytes are replaced so the error can always be reported.
        body = response.read().decode("utf-8", errors="replace")
        raise RuntimeError(f"Artifact registration failed: HTTP {response.status} {body}")


def switch_to_td_interface() -> None:
    """Replace the UTCP robot artifact with the WoT (TD) one.

    Sends a ``delete`` request for ``CHERRYBOT_UTCP_INSTANCE`` and then a
    ``register`` request of kind ``wot`` for ``CHERRYBOT_TD_INSTANCE`` at
    ``CHERRYBOT_TD_URL``. An exception from the delete request prevents the
    registration from being sent.
    """
    removal = {"action": "delete", "instance_name": CHERRYBOT_UTCP_INSTANCE}
    _post_artifact_registration(removal)

    registration = dict(
        action="register",
        kind="wot",
        instance_name=CHERRYBOT_TD_INSTANCE,
        url=CHERRYBOT_TD_URL,
    )
    _post_artifact_registration(registration)


def main() -> int:
    """Run the evaluation for every configured LLM and write the results.

    Starts the servers, evaluates each entry of ``LLMS_TO_EVALUATE`` against
    the UTCP interface and, when ``RUN_WOT_ROBOT_CONTROL_TEST`` is set, again
    against the WoT interface. Results found in ``PARTIAL_RESULTS_PATH`` are
    loaded first and carried over into ``all_results``. On success the final
    report is written and the partial results file is removed.

    Whatever happens inside the evaluation, the original configuration is
    written back and the servers are stopped before returning.

    Returns:
        0 on success, 1 if an exception was raised during the evaluation.
    """
    server_manager = ServerManager()
    evaluation_start = time.monotonic()

    def _cleanup(*_args):
        # Signal handler: stop the servers and exit with a failure status.
        # sys.exit raises SystemExit, so the ``finally`` clause below still
        # runs when the signal arrives inside the ``try`` block.
        server_manager.stop()
        sys.exit(1)

    signal.signal(signal.SIGINT, _cleanup)
    signal.signal(signal.SIGTERM, _cleanup)

    original_config = load_config()
    partial_payload = _load_existing_results(PARTIAL_RESULTS_PATH)
    if isinstance(partial_payload, dict) and isinstance(partial_payload.get("results"), list):
        all_results = _filtered_results_entries(
            partial_payload["results"], RUN_WOT_ROBOT_CONTROL_TEST
        )
    else:
        all_results = []

    # NOTE(review): the elapsed time reported below always counts from this
    # process's start; a total stored in the partial results file is not
    # added on resume -- confirm that this is intended.
    def persist_partial_results() -> None:
        # Snapshot of everything collected so far; passed to evaluate_llm.
        write_results(
            PARTIAL_RESULTS_PATH,
            {"results": all_results},
            include_wot_robot_control=RUN_WOT_ROBOT_CONTROL_TEST,
            total_elapsed_seconds=time.monotonic() - evaluation_start,
        )

    try:
        server_manager.start()
        wait_for_app_ready()

        for llm in LLMS_TO_EVALUATE:
            results = evaluate_llm(
                llm,
                f"{CHERRYBOT_UTCP_INSTANCE}_operation",
                "utcp",
                all_results,
                persist_partial_results,
            )
            summarize_results(results)

        if RUN_WOT_ROBOT_CONTROL_TEST:
            switch_to_td_interface()
            time.sleep(2)
            for llm in LLMS_TO_EVALUATE:
                results = evaluate_llm(
                    llm,
                    f"{CHERRYBOT_TD_INSTANCE}_operation",
                    "wot",
                    all_results,
                    persist_partial_results,
                )
                summarize_results(results)
        write_results(
            RESULTS_PATH,
            {"results": all_results},
            include_wot_robot_control=RUN_WOT_ROBOT_CONTROL_TEST,
            total_elapsed_seconds=time.monotonic() - evaluation_start,
            json_path=RESULTS_JSON_PATH,
        )
        # The final report supersedes the partial one.
        if PARTIAL_RESULTS_PATH.exists():
            PARTIAL_RESULTS_PATH.unlink()
    except Exception as exc:
        print(f"Evaluation failed: {exc}")
        return 1
    finally:
        # Restore the configuration, but never let a failure to do so skip
        # the server shutdown: stop() must run even if write_config raises,
        # otherwise the spawned servers would be left running.
        try:
            write_config(original_config)
        finally:
            server_manager.stop()

    return 0


if __name__ == "__main__":
    # Script entry point: main()'s return value becomes the process exit status.
    sys.exit(main())
