import importlib.util
import hashlib
import json
import os
from pathlib import Path
import sys
from types import SimpleNamespace

import pytest

# Seed a dummy API key, but only when the environment does not already define
# one (setdefault never overwrites). Presumably this keeps the openevolve
# imports/config loading below from requiring real credentials -- TODO confirm
# which code path actually reads the variable.
os.environ.setdefault("OPENAI_API_KEY", "test")

# REPO_ROOT is the parent of the directory that contains this file. It is put at
# the front of sys.path (at most once) before the openevolve imports that
# follow, and it is also the base of every example/manifest path used below.
REPO_ROOT = Path(__file__).resolve().parents[1]
if str(REPO_ROOT) not in sys.path:
    sys.path.insert(0, str(REPO_ROOT))

from openevolve.config import Config
from openevolve.database import Program, ProgramDatabase
from openevolve.multi_task_shared_then_specialize.k_module_problem import (
    K_MODULE_TASK_SPECS,
    K_MODULE_TASK_SELECTOR_ENV_VAR,
    aggregate_task_results,
    build_task_result,
    count_correct_modules,
    extract_task_result,
)
import openevolve.multi_task_shared_then_specialize.spawn as mt_sts_spawn
from openevolve.multi_task_shared_then_specialize.spawn import spawn_task_checkpoints


def _load_k_module_evaluator_module():
    """Execute the k-module MT-STS example evaluator file and return it as a module.

    The file ``examples/k_module_problem_mt_sts/evaluator.py`` (relative to
    ``REPO_ROOT``) is loaded by path under the module name
    ``k_module_mt_sts_test_module``.

    Returns:
        The freshly executed module object.

    Raises:
        ImportError: If importlib cannot build a spec, or the spec has no loader.
    """
    evaluator_path = REPO_ROOT.joinpath("examples", "k_module_problem_mt_sts", "evaluator.py")
    module_spec = importlib.util.spec_from_file_location(
        "k_module_mt_sts_test_module",
        evaluator_path,
    )
    # A missing spec and a spec without a loader are reported the same way.
    loader = None if module_spec is None else module_spec.loader
    if loader is None:
        raise ImportError(f"Could not load evaluator from {evaluator_path}")

    evaluator_module = importlib.util.module_from_spec(module_spec)
    loader.exec_module(evaluator_module)
    return evaluator_module


# Loaded once at import time; the evaluator tests below call
# ``k_module_evaluator.evaluate(...)`` on this shared module object.
k_module_evaluator = _load_k_module_evaluator_module()


def _load_mt_sts_reporter_module():
    """Execute the MT-STS results reporter script and return it as a module.

    The file ``multi_task_shared_then_adapt/report_mt_sts_results.py`` (relative
    to ``REPO_ROOT``) is loaded by path under the module name
    ``mt_sts_reporter_test_module_k_module``.

    Returns:
        The freshly executed module object.

    Raises:
        ImportError: If importlib cannot build a spec, or the spec has no loader.
    """
    reporter_path = REPO_ROOT.joinpath(
        "multi_task_shared_then_adapt",
        "report_mt_sts_results.py",
    )
    module_spec = importlib.util.spec_from_file_location(
        "mt_sts_reporter_test_module_k_module",
        reporter_path,
    )
    # A missing spec and a spec without a loader are reported the same way.
    loader = None if module_spec is None else module_spec.loader
    if loader is None:
        raise ImportError(f"Could not load MT-STS reporter from {reporter_path}")

    reporter_module = importlib.util.module_from_spec(module_spec)
    loader.exec_module(reporter_module)
    return reporter_module


# Loaded once at import time; the reporter tests below call functions such as
# ``load_run_report`` and ``build_markdown_report`` on this shared module object.
mt_sts_reporter = _load_mt_sts_reporter_module()


def _write_summary_phase_output(output_dir: Path, *, score: float, iterations: int) -> None:
    """Create ``output_dir`` and write a ``summary.json`` fixture into it.

    Args:
        output_dir: Directory to create (parents included) and write into.
        score: Stored as ``best_fitness`` and as both ``score`` and
            ``combined_score`` under ``best_metrics``.
        iterations: Stored as both ``total_iterations`` and ``best_iteration``.

    The ``wall_clock_time_sec`` field is always ``1.0``.
    """
    output_dir.mkdir(parents=True, exist_ok=True)

    summary = {
        "best_fitness": score,
        "best_metrics": dict.fromkeys(("score", "combined_score"), score),
        "total_iterations": iterations,
        "best_iteration": iterations,
        "wall_clock_time_sec": 1.0,
    }
    summary_path = output_dir / "summary.json"
    summary_path.write_text(json.dumps(summary), encoding="utf-8")


def _write_spawn_output(output_dir: Path, *, score: float, last_iteration: int) -> None:
    """Create ``output_dir`` and write the two spawn-checkpoint fixture files.

    Args:
        output_dir: Directory to create (parents included) and write into.
        score: Stored as both ``score`` and ``combined_score`` in
            ``best_program_info.json`` (whose ``iteration`` is always ``0``).
        last_iteration: Stored in ``spawn_metadata.json``.
    """
    output_dir.mkdir(parents=True, exist_ok=True)

    # File name -> JSON payload, written in this order.
    payload_by_filename = {
        "best_program_info.json": {
            "iteration": 0,
            "metrics": {
                "score": score,
                "combined_score": score,
            },
        },
        "spawn_metadata.json": {"last_iteration": last_iteration},
    }
    for filename, payload in payload_by_filename.items():
        (output_dir / filename).write_text(json.dumps(payload), encoding="utf-8")


def _build_reporter_run_with_seed_results(tmp_path: Path) -> Path:
    """Lay out a complete fake run directory, including both seed ablations.

    Under ``tmp_path / "run_01_seed_42"`` this writes:

    * ``configs/`` with the shared config plus, per task, adaptation,
      best-shared-seed, best-task-seed and baseline configs (JSON text in
      ``.yaml`` files);
    * a ``shared_run`` summary, per-task spawn outputs, and per-task summaries
      for adaptation, both ablations and the baseline;
    * ``comparison_summary.json`` marking both ablations as requested/enabled.

    Scores grow by ``0.01`` per task index from these bases: spawn 0.40,
    baseline 0.50, best-shared-seed 0.55, best-task-seed 0.58, adaptation 0.60.

    Returns:
        The run root directory.
    """
    run_root = tmp_path / "run_01_seed_42"
    configs_root = run_root / "configs"
    configs_root.mkdir(parents=True, exist_ok=True)

    shared_config = {
        "max_iterations": 20,
        "llm": {"primary_model": "claude-sonnet-4-6"},
        "wandb": {"project": "openevolve-mt-sts", "run_id": "run-kmodule"},
    }
    (configs_root / "shared_config.yaml").write_text(
        json.dumps(shared_config),
        encoding="utf-8",
    )
    _write_summary_phase_output(run_root / "shared_run", score=0.4, iterations=20)

    # (config file prefix, max_iterations) for each per-task config file.
    config_budgets = (
        ("adaptation", 10),
        ("best_shared_seed_adaptation", 10),
        ("best_task_seed_adaptation", 10),
        ("baseline", 25),
    )

    comparison_tasks = {}
    for index, task in enumerate(K_MODULE_TASK_SPECS):
        task_id = task.task_id
        spawn_score = 0.40 + 0.01 * index
        adaptation_score = 0.60 + 0.01 * index
        best_shared_seed_score = 0.55 + 0.01 * index
        best_task_seed_score = 0.58 + 0.01 * index
        baseline_score = 0.50 + 0.01 * index

        for prefix, max_iterations in config_budgets:
            (configs_root / f"{prefix}_{task_id}.yaml").write_text(
                json.dumps({"max_iterations": max_iterations}),
                encoding="utf-8",
            )

        _write_spawn_output(
            run_root / "spawned_checkpoints" / task_id,
            score=spawn_score,
            last_iteration=20,
        )

        # (phase directory, best score, iteration count) for each summary.json.
        phase_outputs = (
            ("adaptation", adaptation_score, 10),
            ("adaptation_best_shared_seed_ablation", best_shared_seed_score, 10),
            ("adaptation_best_task_seed_ablation", best_task_seed_score, 10),
            ("baselines", baseline_score, 25),
        )
        for phase_dir, phase_score, phase_iterations in phase_outputs:
            _write_summary_phase_output(
                run_root / phase_dir / task_id,
                score=phase_score,
                iterations=phase_iterations,
            )

        comparison_tasks[task_id] = {
            "best_shared_seed_adaptation_best_score": best_shared_seed_score,
            "best_task_seed_adaptation_best_score": best_task_seed_score,
        }

    ablation_settings = {"requested": True, "enabled": True, "iterations": 10}
    comparison_summary = {
        "best_shared_seed_adaptation_ablation": dict(ablation_settings),
        "best_task_seed_adaptation_ablation": dict(ablation_settings),
        "tasks": comparison_tasks,
    }
    (run_root / "comparison_summary.json").write_text(
        json.dumps(comparison_summary),
        encoding="utf-8",
    )
    return run_root


def _build_candidate_program_code(config: dict[str, str]) -> str:
    """Render Python source for a candidate program that returns ``config``.

    The source is wrapped in ``EVOLVE-BLOCK-START`` / ``EVOLVE-BLOCK-END``
    marker comments and defines ``configure_pipeline()`` (returns the config,
    embedded as key-sorted, 4-space-indented JSON) and ``run_pipeline()``
    (delegates to ``configure_pipeline()``).

    Args:
        config: Mapping to embed in the generated source.

    Returns:
        The generated source text, ending with a newline.
    """
    config_literal = json.dumps(config, sort_keys=True, indent=4)
    source_lines = [
        "# EVOLVE-BLOCK-START",
        "def configure_pipeline():",
        f"    return {config_literal}",
        "",
        "def run_pipeline():",
        "    return configure_pipeline()",
        "# EVOLVE-BLOCK-END",
    ]
    return "\n".join(source_lines) + "\n"


def _task_metrics_for_config(
    task,
    candidate_config: dict[str, str],
    *,
    eval_time: float,
) -> dict[str, float]:
    """Build the raw per-task metrics dict for one candidate configuration.

    Args:
        task: Task spec forwarded to ``count_correct_modules``.
        candidate_config: Candidate configuration forwarded alongside ``task``.
        eval_time: Value stored verbatim under ``eval_time``.

    Returns:
        A dict with ``correct_modules``, ``total_modules`` (always 4),
        ``accuracy``/``score``/``combined_score`` (all ``correct_modules / 4.0``)
        and ``eval_time``.
    """
    hits = count_correct_modules(task, candidate_config)
    fraction_correct = hits / 4.0

    metrics = {"correct_modules": hits, "total_modules": 4}
    # The three score-like fields are deliberately the same number.
    metrics.update(dict.fromkeys(("accuracy", "score", "combined_score"), fraction_correct))
    metrics["eval_time"] = eval_time
    return metrics


def test_shared_mode_returns_aggregate_metrics_and_task_artifacts(monkeypatch):
    """With the task selector set to ``all``, the evaluator reports every task.

    Checks the aggregate metrics (``combined_score`` mirrors ``score``, four
    tasks counted) and that each per-task artifact carries matching scores and
    the expected four-module spec.
    """
    monkeypatch.setenv(K_MODULE_TASK_SELECTOR_ENV_VAR, "all")
    program_path = REPO_ROOT / "examples" / "k_module_problem_mt_sts" / "initial_program.py"

    result = k_module_evaluator.evaluate(str(program_path))

    aggregate = result.metrics
    assert aggregate["combined_score"] == pytest.approx(aggregate["score"])
    assert aggregate["task_count"] == pytest.approx(4.0)

    task_results = result.artifacts["task_results"]
    assert len(task_results) == 4
    reported_task_ids = {entry["task_id"] for entry in task_results}
    known_task_ids = {task.task_id for task in K_MODULE_TASK_SPECS}
    assert reported_task_ids == known_task_ids

    expected_spec = {
        "module_names": ["loader", "preprocess", "algorithm", "formatter"],
        "num_modules": 4,
    }
    for entry in task_results:
        entry_metrics = entry["metrics"]
        assert entry_metrics["combined_score"] == pytest.approx(entry_metrics["score"])
        assert entry["spec"] == expected_spec


def test_task_specific_mode_returns_one_task(monkeypatch):
    """Selecting a single task id yields exactly that task's result."""
    selected_task_id = "km_task_c"
    monkeypatch.setenv(K_MODULE_TASK_SELECTOR_ENV_VAR, selected_task_id)
    program_path = REPO_ROOT / "examples" / "k_module_problem_mt_sts" / "initial_program.py"

    result = k_module_evaluator.evaluate(str(program_path))

    assert result.metrics["combined_score"] == pytest.approx(result.metrics["score"])

    artifacts = result.artifacts
    assert artifacts["task_selector"] == selected_task_id
    assert len(artifacts["task_results"]) == 1
    assert artifacts["task_results"][0]["task_id"] == selected_task_id


def test_spawn_builds_loadable_task_checkpoint_without_reevaluation(tmp_path, monkeypatch):
    """Spawn a task checkpoint from stored artifacts and verify it loads cleanly.

    Two programs carrying full per-task ``task_results`` artifacts are saved to
    a shared checkpoint at iteration 5. Spawning ``km_task_d`` from it must not
    re-evaluate anything (the re-evaluation hook is replaced by a guard that
    fails the test), and the spawned checkpoint must load into a fresh
    ``ProgramDatabase`` with ``last_iteration == 0`` and task-specific metrics,
    metadata and artifacts for both programs.
    """
    target_task_id = "km_task_d"
    example_dir = REPO_ROOT / "examples" / "k_module_problem_mt_sts"
    base_config_path = example_dir / "config.yaml"
    evaluation_file = example_dir / "evaluator.py"
    initial_program = example_dir / "initial_program.py"

    config = Config.from_yaml(base_config_path)
    config.database.db_path = None
    shared_database = ProgramDatabase(config.database)

    candidate_configs = [
        {
            "loader": "csv_reader",
            "preprocess": "normalize",
            "algorithm": "quicksort",
            "formatter": "xml",
        },
        {
            "loader": "csv_reader",
            "preprocess": "minmax",
            "algorithm": "quicksort",
            "formatter": "yaml",
        },
    ]

    # Expectations recorded while seeding, keyed by program id.
    source_metrics_by_program: dict[str, dict[str, dict[str, float]]] = {}
    source_candidate_configuration_by_program: dict[str, str] = {}
    for program_index, candidate_config in enumerate(candidate_configs):
        task_results = [
            build_task_result(
                task,
                raw_metrics=_task_metrics_for_config(
                    task,
                    candidate_config,
                    eval_time=0.01 + 0.01 * program_index + 0.001 * task.task_index,
                ),
            )
            for task in K_MODULE_TASK_SPECS
        ]
        candidate_configuration = json.dumps(candidate_config, sort_keys=True)
        program = Program(
            id=f"program_{program_index}",
            code=_build_candidate_program_code(candidate_config),
            changes_description=f"program {program_index}",
            language="python",
            generation=program_index,
            iteration_found=program_index,
            metrics=aggregate_task_results(task_results),
            metadata={"island": program_index % config.database.num_islands},
            artifacts_json=json.dumps(
                {
                    "task_selector": "all",
                    "task_results": task_results,
                    "search_space_size": 625,
                    "candidate_configuration": candidate_configuration,
                    "status": "Shared evaluation complete across 4 hidden tasks.",
                    "suggestion": "Improve the average score across the hidden task family.",
                }
            ),
        )
        shared_database.add(program, target_island=program.metadata["island"])
        source_metrics_by_program[program.id] = {
            task_result["task_id"]: task_result["metrics"] for task_result in task_results
        }
        source_candidate_configuration_by_program[program.id] = candidate_configuration

    shared_checkpoint = tmp_path / "shared_checkpoint"
    shared_database.save(str(shared_checkpoint), iteration=5)

    def fail_if_reevaluated(*args, **kwargs):
        # Accept any call shape so that a positional call still produces this
        # message instead of a TypeError about the guard's own signature.
        raise AssertionError("Spawn should use stored task_results artifacts instead of reevaluation")

    # Patch via the module object, like the other spawn monkeypatch in this file.
    monkeypatch.setattr(mt_sts_spawn, "_reevaluate_program_for_task", fail_if_reevaluated)

    spawned_root = tmp_path / "spawned"
    spawn_results = spawn_task_checkpoints(
        shared_checkpoint_path=shared_checkpoint,
        output_root=spawned_root,
        base_config_path=base_config_path,
        evaluation_file=evaluation_file,
        family="k_module_problem",
        task_ids=[target_task_id],
        initial_program=initial_program,
    )

    assert target_task_id in spawn_results
    spawned_checkpoint = spawned_root / target_task_id
    assert (spawned_checkpoint / "metadata.json").is_file()
    assert (spawned_checkpoint / "best_program_info.json").is_file()

    # Reload the spawned checkpoint into a database built from a fresh config.
    spawned_config = Config.from_yaml(base_config_path)
    spawned_config.database.db_path = None
    spawned_database = ProgramDatabase(spawned_config.database)
    spawned_database.load(str(spawned_checkpoint))

    assert spawned_database.last_iteration == 0
    assert len(spawned_database.programs) == 2
    assert spawned_database.best_program_id is not None

    for program_id, program in spawned_database.programs.items():
        expected_metrics = source_metrics_by_program[program_id][target_task_id]
        assert program.metrics["combined_score"] == pytest.approx(program.metrics["score"])
        assert program.metrics["combined_score"] == pytest.approx(expected_metrics["combined_score"])
        assert program.metadata["sts_warmstarted"] is True
        assert program.metadata["sts_target_task_id"] == target_task_id
        assert program.artifact_dir is None

        task_artifacts = spawned_database.get_artifacts(program_id)
        assert task_artifacts["task_selector"] == target_task_id
        assert len(task_artifacts["task_results"]) == 1
        assert task_artifacts["task_results"][0]["task_id"] == target_task_id
        assert task_artifacts["search_space_size"] == 625
        assert (
            task_artifacts["candidate_configuration"]
            == source_candidate_configuration_by_program[program_id]
        )
        assert task_artifacts["status"].endswith("modules correct.")
        assert task_artifacts["suggestion"] == "Try different module combinations to improve the score."


def test_load_run_report_uses_manifest_family_in_setting_fingerprint(tmp_path):
    """The setting id is the first 12 hex chars of SHA-1 over the setting payload.

    A run with only shared, adaptation and baseline configs is loaded with
    ``manifest_family="k_module_problem"``. The expected payload is rebuilt
    here (family included, both seed-ablation iteration fields ``None``, all
    prompt modes ``generic``) and hashed the same way for comparison.
    """
    run_root = tmp_path / "run_01_seed_42"
    configs_dir = run_root / "configs"
    shared_config = {
        "max_iterations": 20,
        "llm": {"primary_model": "claude-sonnet-4-6"},
        "wandb": {"project": "openevolve-mt-sts", "run_id": "run-kmodule"},
    }
    configs_dir.mkdir(parents=True, exist_ok=True)
    (configs_dir / "shared_config.yaml").write_text(
        json.dumps(shared_config),
        encoding="utf-8",
    )
    for task in K_MODULE_TASK_SPECS:
        for prefix, max_iterations in (("adaptation", 10), ("baseline", 25)):
            (configs_dir / f"{prefix}_{task.task_id}.yaml").write_text(
                json.dumps({"max_iterations": max_iterations}),
                encoding="utf-8",
            )

    manifest_path = REPO_ROOT / "multi_task_shared_then_adapt" / "k_module_problem_mt_sts.yaml"
    run_report = mt_sts_reporter.load_run_report(
        run_root,
        repo_root=REPO_ROOT,
        manifest_path=manifest_path,
        manifest_family="k_module_problem",
        task_specs=K_MODULE_TASK_SPECS,
        wandb_entity_override=None,
    )

    expected_payload = {
        "manifest_path": mt_sts_reporter.display_repo_relative(manifest_path, REPO_ROOT),
        "family": "k_module_problem",
        "shared_config": mt_sts_reporter.normalize_config_for_setting(shared_config),
        "shared_iterations": 20,
        "adaptation_iterations": 10,
        "best_shared_seed_adaptation_iterations": None,
        "best_task_seed_adaptation_iterations": None,
        "baseline_iterations": 25,
        "prompt_modes": {
            "shared_prompt_mode": "generic",
            "adaptation_prompt_mode": "generic",
            "baseline_prompt_mode": "generic",
        },
        "task_ids": [task.task_id for task in K_MODULE_TASK_SPECS],
    }
    canonical_payload = json.dumps(expected_payload, sort_keys=True)
    expected_fingerprint = hashlib.sha1(canonical_payload.encode("utf-8")).hexdigest()[:12]

    setting = run_report["setting"]
    assert setting["family"] == "k_module_problem"
    assert setting["id"] == expected_fingerprint


def test_load_run_report_setting_fingerprint_ignores_llm_transport_fields(tmp_path):
    """Runs differing only in LLM ``api_base``/``api_key`` share one setting id.

    Two runs are written with different endpoints and keys (and different run
    names / wandb run ids). Normalising their shared configs must drop
    ``api_base`` and ``api_key`` at both the ``llm`` level and inside
    ``llm.models[0]``, and the loaded reports must have equal setting ids.
    """
    manifest_path = REPO_ROOT / "multi_task_shared_then_adapt" / "k_module_problem_mt_sts.yaml"

    def build_run(run_name: str, *, api_base: str, api_key: str):
        # Write the config files for one run; return (run root, shared config).
        run_root = tmp_path / run_name
        configs_dir = run_root / "configs"
        transport = {"api_base": api_base, "api_key": api_key}
        shared_config = {
            "max_iterations": 20,
            "llm": {
                "primary_model": "claude-haiku-4-5",
                **transport,
                "models": [
                    {
                        "name": "claude-haiku-4-5",
                        "weight": 1.0,
                        **transport,
                    }
                ],
            },
            "wandb": {"project": "openevolve-mt-sts", "run_id": f"run-{run_name}"},
        }
        configs_dir.mkdir(parents=True, exist_ok=True)
        (configs_dir / "shared_config.yaml").write_text(
            json.dumps(shared_config),
            encoding="utf-8",
        )
        for task in K_MODULE_TASK_SPECS:
            for prefix, max_iterations in (("adaptation", 10), ("baseline", 25)):
                (configs_dir / f"{prefix}_{task.task_id}.yaml").write_text(
                    json.dumps({"max_iterations": max_iterations}),
                    encoding="utf-8",
                )
        return run_root, shared_config

    run_a_root, shared_config_a = build_run(
        "run_01_seed_42",
        api_base="http://127.0.0.1:4001",
        api_key="litellm-a",
    )
    run_b_root, shared_config_b = build_run(
        "run_02_seed_43",
        api_base="http://127.0.0.1:4002",
        api_key="litellm-b",
    )

    normalized_a = mt_sts_reporter.normalize_config_for_setting(shared_config_a)
    normalized_b = mt_sts_reporter.normalize_config_for_setting(shared_config_b)
    for section in (normalized_a["llm"], normalized_a["llm"]["models"][0]):
        for transport_field in ("api_base", "api_key"):
            assert transport_field not in section
    assert normalized_a == normalized_b

    setting_ids = []
    for run_root in (run_a_root, run_b_root):
        run_report = mt_sts_reporter.load_run_report(
            run_root,
            repo_root=REPO_ROOT,
            manifest_path=manifest_path,
            manifest_family="k_module_problem",
            task_specs=K_MODULE_TASK_SPECS,
            wandb_entity_override=None,
        )
        setting_ids.append(run_report["setting"]["id"])

    assert setting_ids[0] == setting_ids[1]


def test_load_run_report_includes_seed_adaptation_metrics(tmp_path):
    """A run with both seed ablations reports their means, gaps and win counts.

    The fixture scores are fixed offsets apart for every task (adaptation 0.60,
    best-task-seed 0.58, best-shared-seed 0.55, each plus ``0.01 * index``), so
    every gap is constant and adaptation / best-task-seed win on every task.
    """
    run_root = _build_reporter_run_with_seed_results(tmp_path)
    manifest_path = REPO_ROOT / "multi_task_shared_then_adapt" / "k_module_problem_mt_sts.yaml"

    run_report = mt_sts_reporter.load_run_report(
        run_root,
        repo_root=REPO_ROOT,
        manifest_path=manifest_path,
        manifest_family="k_module_problem",
        task_specs=K_MODULE_TASK_SPECS,
        wandb_entity_override=None,
    )

    task_total = len(K_MODULE_TASK_SPECS)

    def expected_mean(base_score):
        # Mean of the fixture scores that start at ``base_score``.
        scores = [base_score + 0.01 * index for index in range(task_total)]
        return sum(scores) / len(scores)

    # (macro key, per-task key, expected constant gap)
    expected_gaps = (
        ("adaptation_minus_best_shared_seed_mean", "adaptation_minus_best_shared_seed", 0.05),
        ("adaptation_minus_best_task_seed_mean", "adaptation_minus_best_task_seed", 0.02),
        (
            "best_task_seed_minus_best_shared_seed_mean",
            "best_task_seed_minus_best_shared_seed",
            0.03,
        ),
    )
    clean_sweep = {
        "wins": task_total,
        "ties": 0,
        "losses": 0,
        "comparable": task_total,
    }

    setting = run_report["setting"]
    assert setting["best_shared_seed_adaptation_iterations"] == 10
    assert setting["best_task_seed_adaptation_iterations"] == 10

    macro = run_report["macro"]
    assert macro["best_shared_seed_mean_score"] == pytest.approx(expected_mean(0.55))
    assert macro["best_task_seed_mean_score"] == pytest.approx(expected_mean(0.58))
    for macro_key, _task_key, gap in expected_gaps:
        assert macro[macro_key] == pytest.approx(gap)
    for counts_key in (
        "adaptation_vs_best_shared_seed_counts",
        "adaptation_vs_best_task_seed_counts",
        "best_task_seed_vs_best_shared_seed_counts",
    ):
        assert macro[counts_key] == clean_sweep

    first_task = run_report["tasks"]["km_task_a"]
    assert first_task["best_shared_seed_adaptation"]["score"] == pytest.approx(0.55)
    assert first_task["best_task_seed_adaptation"]["score"] == pytest.approx(0.58)
    for _macro_key, task_key, gap in expected_gaps:
        assert first_task[task_key] == pytest.approx(gap)

    assert macro["adaptation_mean_score"] == pytest.approx(expected_mean(0.60))


def test_build_markdown_report_includes_seed_adaptation_columns(tmp_path):
    """The markdown report exposes the seed-ablation summary labels and table header.

    One fixture run is loaded, wrapped in a single-run group, aggregated and
    rendered; the rendered text must contain each ablation label, the full
    per-task table header and the ``km_task_a`` row key.
    """
    run_root = _build_reporter_run_with_seed_results(tmp_path)
    manifest_path = REPO_ROOT / "multi_task_shared_then_adapt" / "k_module_problem_mt_sts.yaml"

    run_report = mt_sts_reporter.load_run_report(
        run_root,
        repo_root=REPO_ROOT,
        manifest_path=manifest_path,
        manifest_family="k_module_problem",
        task_specs=K_MODULE_TASK_SPECS,
        wandb_entity_override=None,
    )

    setting = run_report["setting"]
    single_run_group = {
        "setting_id": setting["id"],
        "setting": setting,
        "runs": [run_report],
        "problem_runs": [],
        "excluded_runs": [],
        "all_runs": [run_report],
    }
    group = mt_sts_reporter.aggregate_group(single_run_group)
    report = mt_sts_reporter.build_report(
        manifest_path=manifest_path,
        manifest_family="k_module_problem",
        results_dir=tmp_path,
        run_roots=[run_root],
        groups=[group],
        latest_per_setting=1,
        include_all_runs=False,
    )

    markdown = mt_sts_reporter.build_markdown_report(report, repo_root=REPO_ROOT)

    task_table_header = (
        "| task | spawn | adapted | best-shared-seed | best-task-seed | baseline | "
        "adapt - spawn | adapt - best-shared | adapt - best-task | best-task - "
        "best-shared | adapt - baseline | adapt>spawn | adapt>best-shared | "
        "adapt>best-task | best-task>best-shared | adapt>baseline |"
    )
    expected_fragments = (
        "best-shared-seed mean",
        "best-task-seed mean",
        "adapt>best-shared W/T/L",
        "adapt>best-task W/T/L",
        "best-task>best-shared W/T/L",
        task_table_header,
        "km_task_a",
    )
    for fragment in expected_fragments:
        assert fragment in markdown


def test_serialize_report_path_makes_repo_local_paths_portable():
    """Paths inside the repo serialise repo-relative; others as resolved absolute paths."""
    serialize = mt_sts_reporter.serialize_report_path
    repo_local_path = REPO_ROOT / "multi_task_shared_then_adapt" / "results" / "demo_run"
    external_path = Path("/tmp/mt_sts_demo")

    portable = serialize(repo_local_path, REPO_ROOT)
    assert portable == "multi_task_shared_then_adapt/results/demo_run"

    # A path outside the repository is expected back as its resolved string form.
    outside = serialize(external_path, REPO_ROOT)
    assert outside == str(external_path.resolve())


def test_extract_task_result_returns_none_for_malformed_stored_metrics():
    """``extract_task_result`` yields ``None`` when the stored metrics are unusable.

    Two stored shapes are covered: metrics that are a string rather than a
    mapping, and a metrics mapping without ``eval_time`` (presumably rejected
    because that field is required -- confirm against ``extract_task_result``).
    """
    malformed_metrics = (
        "bad",
        {
            "correct_modules": 2,
            "total_modules": 4,
            "accuracy": 0.5,
            "score": 0.5,
            "combined_score": 0.5,
        },
    )

    for stored_metrics in malformed_metrics:
        artifacts = {
            "task_results": [
                {
                    "task_id": "km_task_a",
                    "metrics": stored_metrics,
                }
            ]
        }
        assert extract_task_result(artifacts, "km_task_a") is None


def test_reevaluate_program_for_task_supports_plain_dict_results(tmp_path, monkeypatch):
    """Re-evaluation copes with an evaluator that returns a bare metrics dict.

    The evaluation-module loader is replaced by a stub whose ``evaluate``
    returns a plain dict (metrics plus an ``artifacts`` entry); the resulting
    task result must carry the task id and the stubbed 0.75 score.
    """

    def evaluate_as_plain_dict(_):
        return {
            "correct_modules": 3,
            "total_modules": 4,
            "accuracy": 0.75,
            "score": 0.75,
            "combined_score": 0.75,
            "eval_time": 0.01,
            "artifacts": {"status": "plain_dict_ok"},
        }

    def load_stub_module(_):
        # The evaluation file argument is ignored; no file is ever read.
        return SimpleNamespace(evaluate=evaluate_as_plain_dict)

    monkeypatch.setattr(mt_sts_spawn, "_load_evaluation_module", load_stub_module)

    candidate_code = _build_candidate_program_code(
        {
            "loader": "json_reader",
            "preprocess": "standardize",
            "algorithm": "mergesort",
            "formatter": "xml",
        }
    )
    program = Program(
        id="program_plain_dict_eval",
        code=candidate_code,
        changes_description="plain dict evaluator compatibility",
        language="python",
        metrics={"score": 0.0, "combined_score": 0.0},
    )

    task_result = mt_sts_spawn._reevaluate_program_for_task(
        program=program,
        task_id="km_task_a",
        family="k_module_problem",
        evaluation_file=tmp_path / "unused_evaluator.py",
    )

    assert task_result["task_id"] == "km_task_a"
    reported = task_result["metrics"]
    assert reported["combined_score"] == pytest.approx(reported["score"])
    assert reported["combined_score"] == pytest.approx(0.75)


def test_hidden_correct_configs_are_not_leaked_in_task_results(monkeypatch):
    """None of the listed module option names appear in the serialised task results.

    The evaluator is run over all tasks and its ``task_results`` artifact is
    dumped to JSON; the check is a plain substring search over that text.
    """
    monkeypatch.setenv(K_MODULE_TASK_SELECTOR_ENV_VAR, "all")
    program_path = REPO_ROOT / "examples" / "k_module_problem_mt_sts" / "initial_program.py"

    result = k_module_evaluator.evaluate(str(program_path))
    serialized = json.dumps(result.artifacts["task_results"], sort_keys=True)

    hidden_values = (
        "csv_reader",
        "normalize",
        "quicksort",
        "json",
        "heapsort",
        "parquet_reader",
        "minmax",
        "yaml",
    )
    leaked = [value for value in hidden_values if value in serialized]
    assert not leaked
