import json
import sys
from pathlib import Path

# ROOT is the parent of the directory that contains this file (resolved to an
# absolute path first, so the result does not depend on the working directory).
ROOT = Path(__file__).resolve().parents[1]
# Prepend ROOT to sys.path (only if it is not already listed) before the
# `evaluation` imports below execute.
# NOTE(review): presumably the `evaluation` module lives directly under ROOT — confirm.
if str(ROOT) not in sys.path:
    sys.path.insert(0, str(ROOT))

from evaluation import GoalCase, LlmConfig, SignifierRelevance, build_r_precision_table
from evaluation import build_results_table
from evaluation import compute_r_precision_bounds
from evaluation import compute_precision_recall, filter_signifier_relevance
from evaluation import _configured_llm_labels, _find_existing_result
from evaluation import write_results


def test_compute_r_precision_bounds_orders_relevant_ties_best_and_worst():
    """Check the R-precision bounds and rankings when relevant items are tied.

    Three signifiers share the score 0.5 (two relevant, one irrelevant), with
    one irrelevant signifier above and one below them. The best ranking is
    expected to list the tied relevant labels before the tied irrelevant one;
    the worst ranking is expected to list them after it.
    """
    scored_labels = (
        ("high_irrelevant", 0.9),
        ("relevant_a", 0.5),
        ("irrelevant_a", 0.5),
        ("relevant_b", 0.5),
        ("low_irrelevant", 0.1),
    )
    signifiers = [SignifierRelevance(label, score) for label, score in scored_labels]
    relevant_labels = {"relevant_a", "relevant_b"}

    lower_bound, upper_bound, best_order, worst_order = compute_r_precision_bounds(
        signifiers,
        relevant_labels,
    )

    assert lower_bound == 0.0
    assert upper_bound == 0.5

    # Ties at 0.5 resolved in favour of the relevant labels.
    expected_best = [
        "high_irrelevant",
        "relevant_a",
        "relevant_b",
        "irrelevant_a",
        "low_irrelevant",
    ]
    assert best_order == expected_best

    # Ties at 0.5 resolved against the relevant labels.
    expected_worst = [
        "high_irrelevant",
        "irrelevant_a",
        "relevant_a",
        "relevant_b",
        "low_irrelevant",
    ]
    assert worst_order == expected_worst


def test_minimum_relevance_filter_includes_exact_threshold_match():
    """A signifier whose relevance equals the minimum threshold must be kept.

    The goal case sets the minimum relevance to 1 and allows one signifier;
    the only relevant tool scores exactly 1.0, so it is expected to be the
    sole selected label, yielding full recall.
    """
    case = GoalCase(
        goal="The agent wants to read the goal or provide feedback to the user.",
        relevant_tools=["goal_mcp_CurrentGoal"],
        maximum_signifiers=1,
        minimum_relevance_value=1,
    )
    scored = [
        SignifierRelevance("goal_mcp_CurrentGoal", 1.0),
        SignifierRelevance("irrelevant", 0.75),
    ]

    kept = filter_signifier_relevance(scored, case)
    selected_labels = {entry.label for entry in kept}
    expected_labels = set(case.relevant_tools)

    # Only recall and the tp/fn counts are checked; precision and fp are ignored.
    metrics = compute_precision_recall(selected_labels, expected_labels)
    _precision, recall, true_positives, _false_positives, false_negatives = metrics

    assert selected_labels == {"goal_mcp_CurrentGoal"}
    assert recall == 1.0
    assert true_positives == 1
    assert false_negatives == 0


def test_r_precision_is_presented_in_separate_parameter_independent_table():
    """R-precision data belongs to its own table, not the statistics table.

    Builds one result entry with a single goal and run, renders both tables,
    and checks that the statistics table carries precision/recall/time but no
    "RP:" marker, while the R-precision table carries the bounds and both
    rankings without any "(max=" parameter label.
    """
    single_run = {
        "precision": 1.0,
        "recall": 1.0,
        "minimum_possible_r_precision": 0.0,
        "maximum_possible_r_precision": 1.0,
        "best_ranking": ["goal_mcp_CurrentGoal", "other"],
        "worst_ranking": ["other", "goal_mcp_CurrentGoal"],
        "elapsed_seconds": 2.0,
    }
    goal_entry = {
        "goal": "The agent wants to read the user goal.",
        "relevant_tools": ["goal_mcp_CurrentGoal"],
        "profile_parameters": {
            "maximumSignifiers": 1,
            "minimumRelevanceValue": 0.75,
        },
        "runs": [single_run],
    }
    entries = [
        {
            "llm": "ollama:ministral-3:14b",
            "interface": "utcp",
            "goals": [goal_entry],
        }
    ]

    stats_table = build_results_table(entries)
    r_precision_table = build_r_precision_table(entries)

    # Statistics table: precision/recall/time only.
    assert "RP:" not in stats_table
    assert "P:1, R:1, T:2s" in stats_table

    # R-precision table: heading, bounds and LaTeX-escaped rankings.
    expected_fragments = (
        "Goal Signifier",
        "RP:0-1",
        r"best\_ranking:[goal\_mcp\_CurrentGoal, other]",
        r"worst\_ranking:[other, goal\_mcp\_CurrentGoal]",
    )
    for fragment in expected_fragments:
        assert fragment in r_precision_table
    assert "(max=" not in r_precision_table


def test_r_precision_table_derives_missing_rankings_from_signifier_relevance():
    """Rankings absent from a run are expected to be rebuilt from its scores.

    The run provides only the R-precision bounds and two equally scored
    signifiers (no ``best_ranking``/``worst_ranking`` keys); the rendered
    table must still contain both rankings.
    """
    run_without_rankings = {
        "minimum_possible_r_precision": 0.0,
        "maximum_possible_r_precision": 1.0,
        "signifier_relevance": [
            {"label": "other", "relevance": 0.5},
            {"label": "goal_mcp_CurrentGoal", "relevance": 0.5},
        ],
    }
    goal_entry = {
        "goal": "The agent wants to read the user goal.",
        "relevant_tools": ["goal_mcp_CurrentGoal"],
        "profile_parameters": {
            "maximumSignifiers": 1,
            "minimumRelevanceValue": 0.75,
        },
        "runs": [run_without_rankings],
    }
    entries = [
        {
            "llm": "ollama:ministral-3:14b",
            "interface": "utcp",
            "goals": [goal_entry],
        }
    ]

    rendered = build_r_precision_table(entries)

    expected_best = r"best\_ranking:[goal\_mcp\_CurrentGoal, other]"
    expected_worst = r"worst\_ranking:[other, goal\_mcp\_CurrentGoal]"
    assert expected_best in rendered
    assert expected_worst in rendered


def test_configured_llm_labels_include_reasoning_effort_for_gpt5_variants():
    """gpt-5-mini labels must carry a reasoning suffix and be free of duplicates."""
    configured = _configured_llm_labels()

    suffixed_labels = (
        "openai:gpt-5-mini (reasoning=none)",
        "openai:gpt-5-mini (reasoning=medium)",
    )
    for label in suffixed_labels:
        assert label in configured

    # The bare, suffix-less label must not be present.
    assert "openai:gpt-5-mini" not in configured

    total = len(configured)
    distinct = len(set(configured))
    assert total == distinct


def test_find_existing_result_renames_legacy_gpt5_none_reasoning_bucket():
    """A suffix-less legacy entry is claimed by the ``none`` reasoning config.

    Looking up the ``reasoning_effort="none"`` config is expected to return the
    stored legacy entry itself, with its ``llm`` label rewritten and a
    ``reasoning`` key set; the ``medium`` config must find nothing.
    """
    stored_results = [
        {
            "llm": "openai:gpt-5-mini",
            "interface": "utcp",
            "goals": [],
        }
    ]

    def lookup(effort):
        # Same provider/model/interface every time; only the effort varies.
        config = LlmConfig(provider="openai", model="gpt-5-mini", reasoning_effort=effort)
        return _find_existing_result(stored_results, config, "utcp")

    # Order matters: the "none" lookup runs before the "medium" one.
    found_for_none = lookup("none")
    found_for_medium = lookup("medium")

    assert found_for_none is not None
    assert found_for_none is stored_results[0]
    assert found_for_none["llm"] == "openai:gpt-5-mini (reasoning=none)"
    assert found_for_none["reasoning"] == "none"
    assert found_for_medium is None


def test_write_results_appends_table_for_each_profile_parameter_combination(tmp_path):
    """One statistics table heading is expected per (max, min) combination.

    With four maximum-signifier labels and five minimum-relevance labels, the
    written file must contain exactly 20 parameterised headings, one for each
    pairing.
    """
    target = tmp_path / "results.txt"
    write_results(target, {"results": []})
    written = target.read_text()

    max_labels = ("absent", "1", "2", "10")
    min_labels = ("absent", "0", "0.25", "0.75", "1")
    expected_headings = [
        f"# Task x Model Statistics (max={max_label}, min={min_label}) (LaTeX)"
        for max_label in max_labels
        for min_label in min_labels
    ]

    assert written.count("# Task x Model Statistics (max=") == 20
    for heading in expected_headings:
        assert heading in written


def test_write_results_can_write_json_only_companion_file(tmp_path):
    """The companion file given via ``json_path`` must hold JSON and no tables.

    The parsed payload is expected to be the results dict plus the total
    elapsed seconds, and the raw text must not contain a statistics heading.
    """
    text_target = tmp_path / "results.txt"
    json_target = tmp_path / "results_json.json"

    write_results(
        text_target,
        {"results": []},
        total_elapsed_seconds=12.34,
        json_path=json_target,
    )

    # Read once; the same text feeds both the parsed and the raw check.
    raw_json = json_target.read_text()
    parsed = json.loads(raw_json)

    assert parsed == {"results": [], "total_elapsed_seconds": 12.34}
    assert "# Task x Model Statistics" not in raw_json
