import tempfile
import unittest
from pathlib import Path

from scripts.analyze_execution_audit import classify_records, summarize_by_strategy
from scripts.analyze_execution_significance import exact_mcnemar_pvalue, significance_rows
from scripts.execution_audit_common import (
    EXECUTED_REJECT,
    INFRASTRUCTURE_ERROR,
    RECON_UNKNOWN_IDENTIFIER,
    classify_execution_error,
)


class ExecutionAuditTests(unittest.TestCase):
    """Tests for execution-error classification, per-strategy summaries, and paired significance rows."""

    def test_classifies_reconstruction_and_infrastructure_errors(self) -> None:
        """Check ``classify_execution_error`` on three sample (status, accepted, log) inputs."""
        # Lean "unknown identifier" diagnostic reported under an environment_error status.
        unknown_identifier_log = "tmp.lean:11:6: error(lean.unknownIdentifier): Unknown identifier `f`"
        category = classify_execution_error("environment_error", False, unknown_identifier_log)
        self.assertEqual(category, RECON_UNKNOWN_IDENTIFIER)

        # Failed `git` invocation while cloning mathlib, also under environment_error.
        git_failure_log = "info: mathlib: cloning https://github.com/leanprover-community/mathlib4\nerror: external command 'git' exited with code 255"
        category = classify_execution_error("environment_error", False, git_failure_log)
        self.assertEqual(category, INFRASTRUCTURE_ERROR)

        # Tactic failure on a candidate whose status is "executed" but was not accepted.
        tactic_failure_log = "error: Tactic `rewrite` failed: Did not find an occurrence of the pattern"
        category = classify_execution_error("executed", False, tactic_failure_log)
        self.assertEqual(category, EXECUTED_REJECT)

    def test_summary_uses_all_query_and_executable_denominators(self) -> None:
        """Check the first ``summarize_by_strategy`` row for a two-query, four-candidate fixture."""

        def attempt(query_id, rank, status, accepted, error=""):
            # Every fixture row belongs to the single "unguided" strategy.
            return {
                "strategy": "unguided",
                "query_id": query_id,
                "rank": rank,
                "status": status,
                "accepted": accepted,
                "error": error,
            }

        # q1: accepted at rank 1, rejected at rank 2.
        # q2: environment error at rank 1, accepted at rank 2.
        raw_attempts = [
            attempt("q1", 1, "executed", True),
            attempt("q1", 2, "executed", False, "tactic failed"),
            attempt("q2", 1, "environment_error", False, "Unknown identifier `f`"),
            attempt("q2", 2, "executed", True),
        ]
        classified = classify_records(raw_attempts)
        per_strategy = summarize_by_strategy(classified, [1, 2])
        unguided = per_strategy[0]

        self.assertEqual(unguided["queries"], 2)
        self.assertAlmostEqual(unguided["accept_at_1_all"], 0.5)
        self.assertAlmostEqual(unguided["accept_at_2_all"], 1.0)
        self.assertEqual(unguided["executable_queries_at_1"], 1)
        self.assertAlmostEqual(unguided["candidate_execution_coverage"], 0.75)

    def test_paired_execution_significance(self) -> None:
        """Check the first ``significance_rows`` row and one ``exact_mcnemar_pvalue`` value."""
        # (strategy, query_id, accepted) for rank-1 candidates; the baseline
        # "unguided" is accepted only on q1, "family_soft" on neither query.
        outcomes = [
            ("unguided", "q1", True),
            ("unguided", "q2", False),
            ("family_soft", "q1", False),
            ("family_soft", "q2", False),
        ]
        paired_records = [
            {"strategy": strategy, "query_id": query_id, "rank": 1, "accepted": accepted}
            for strategy, query_id, accepted in outcomes
        ]
        result_rows = significance_rows(paired_records, "unguided", [1], n_bootstrap=100, seed=1)
        first_row = result_rows[0]

        self.assertEqual(first_row["n_pairs"], 2)
        self.assertAlmostEqual(first_row["diff"], -0.5)
        self.assertEqual(first_row["mcnemar_c_baseline_only"], 1)
        self.assertAlmostEqual(exact_mcnemar_pvalue(0, 1), 1.0)


if __name__ == "__main__":
    # Run this module's tests when the file is executed directly as a script.
    unittest.main()
