#!/usr/bin/env python3

from __future__ import annotations

import argparse
import csv
import json
import os
import re
import sys
import tempfile
from pathlib import Path
import shutil
from typing import Dict, List, Tuple

from swebench.harness.constants import KEY_INSTANCE_ID, KEY_MODEL, KEY_PREDICTION
from swebench.harness.run_evaluation import main as harness_main
from swebench.harness.path_utils import safe_component
from swebench.harness.utils import load_swebench_dataset

# Minimal git-style diff whose only change is to add a one-line file named
# `.swebench_noop`; none of the repository's existing files are touched.
# main() submits this text as the "baseline" prediction for every instance.
# NOTE(review): presumably a non-empty placeholder is used so that baseline
# predictions are not classified as empty patches by the harness -- confirm.
NOOP_PATCH = (
    "diff --git a/.swebench_noop b/.swebench_noop\n"
    "new file mode 100644\n"
    "index 0000000..1111111\n"
    "--- /dev/null\n"
    "+++ b/.swebench_noop\n"
    "@@ -0,0 +1 @@\n"
    "+noop\n"
)

# Matches the start of a diff: a `diff --git ` line or a `--- a/` file header
# at the beginning of any line.
PATCH_HEADER_PATTERN = re.compile(r"^(diff --git |---\s+a/)", re.MULTILINE)


def normalize_patch(patch: str) -> str:
    """Pre-clean a patch string before it is written to a predictions file.

    The harness also normalizes patches; this is an extra pre-clean layer.

    Steps, in order:
      1. Convert CRLF / CR line endings to LF and drop leading BOM characters.
      2. Drop log-style noise lines (``patching file``, ``Checking patch``,
         ``error:``, ``warning:``, ``Only in``) and the
         "No newline at end of file" marker lines.
      3. Strip trailing whitespace from every remaining line.
      4. Discard any lead-in text before the first diff header.
      5. Make sure the result ends with a newline.

    Args:
        patch: Raw patch text. ``None`` is accepted and treated as empty.

    Returns:
        The cleaned patch ending in ``"\\n"``, or ``""`` when nothing but
        whitespace remains after cleaning.
    """
    if patch is None:
        return ""

    noise_prefixes = (
        "patching file ",
        "Checking patch ",
        "error: ",
        "warning: ",
        "Only in ",
    )
    no_newline_marker = r"\ No newline at end of file"

    # Normalize newlines, then strip any BOM left at the very start.
    text = patch.replace("\r\n", "\n").replace("\r", "\n").lstrip("\ufeff")

    kept = [
        line.rstrip()
        for line in text.split("\n")
        if not line.startswith(noise_prefixes) and line.strip() != no_newline_marker
    ]
    text = "\n".join(kept)

    # Trim chatter that precedes the first diff header, if there is one.
    header = PATCH_HEADER_PATTERN.search(text)
    if header:
        text = text[header.start():]

    # An all-whitespace result carries no change at all. Return "" rather than
    # "\n" so callers that test for an empty prediction still recognise it.
    # NOTE(review): the SWE-bench harness appears to detect empty patches by
    # comparing against ""/None -- confirm against the harness version in use.
    if not text.strip():
        return ""

    if not text.endswith("\n"):
        text += "\n"
    return text


def read_predictions_file(path: Path) -> Dict[str, str]:
    """Load predictions from a .json or .jsonl file as {instance_id: patch}.

    A ``.jsonl`` file is parsed one JSON document per non-blank line. Any other
    suffix is parsed as a single JSON document that must be a list of records
    or a dict whose values are the records.

    For each dict record, the instance id is the first non-blank string found
    under ``instance_id`` / ``task_id`` / ``id``; records without one are
    skipped. The patch is the first string found under the known patch keys;
    if none of those keys holds a string, the first string value that looks
    like a diff is used instead. Every patch goes through ``normalize_patch``.
    When the same id appears more than once, the last record wins.

    Raises:
        ValueError: if a non-JSONL document is neither a dict nor a list.
    """
    raw = path.read_text(encoding="utf-8")
    records: List[dict]
    if path.suffix.lower() == ".jsonl":
        records = [json.loads(row) for row in raw.splitlines() if row.strip()]
    else:
        payload = json.loads(raw)
        if isinstance(payload, list):
            records = payload
        elif isinstance(payload, dict):
            # Some tools export {id: {...}}
            records = list(payload.values())
        else:
            raise ValueError(f"Unsupported predictions JSON structure in {path}")

    id_fields = ("instance_id", "task_id", "id")
    patch_fields = ("model_patch", "patch", "modelDiff", "diff", "model_patch_text")

    result: Dict[str, str] = {}
    for record in records:
        if not isinstance(record, dict):
            continue

        instance_id = next(
            (
                record[field].strip()
                for field in id_fields
                if isinstance(record.get(field), str) and record[field].strip()
            ),
            None,
        )
        if instance_id is None:
            continue

        patch_text = next(
            (record[field] for field in patch_fields if isinstance(record.get(field), str)),
            None,
        )
        if patch_text is None:
            # No known patch key: fall back to any string value that looks like a diff.
            patch_text = next(
                (
                    value
                    for value in record.values()
                    if isinstance(value, str)
                    and ("diff --git" in value or value.startswith(("--- a/", "--- ")))
                ),
                None,
            )

        result[instance_id] = normalize_patch(patch_text or "")
    return result


def write_predictions_json(pred_map: Dict[str, str], model_name: str, out_dir: Path) -> Path:
    """Write ``pred_map`` as a JSON list of prediction records and return its path.

    Each record carries the instance id, the patch text and ``model_name``
    under the harness key constants. Entries whose patch is ``None`` are left
    out. The file is written to ``out_dir / "predictions_<model_name>.json"``.
    """
    records = []
    for instance_id, patch_text in pred_map.items():
        if patch_text is None:
            continue
        records.append(
            {
                KEY_INSTANCE_ID: instance_id,
                KEY_PREDICTION: patch_text,
                KEY_MODEL: model_name,
            }
        )

    target = out_dir / f"predictions_{model_name}.json"
    target.write_text(json.dumps(records, indent=2), encoding="utf-8")
    return target


def run_harness(dataset_name: str, split: str, instance_ids: List[str], predictions_path: str,
                run_id: str, max_workers: int, timeout: int, force_rebuild: bool,
                cache_level: str, clean: bool, namespace: str | None, instance_image_tag: str,
                te_enabled: bool, te_id: str | None, open_file_limit: int = 4096,
                rerun_completed: bool = False):
    """Run the SWE-bench harness once, after setting the TE environment switches.

    TE injection is controlled through two process-wide environment variables:
    with ``te_enabled`` the ``TE`` variable is removed and ``TE_ID`` is set to
    ``te_id`` (when non-empty); otherwise ``TE`` is set to ``"1"`` and
    ``TE_ID`` is removed. These changes persist in ``os.environ`` after the
    call returns.

    A ``namespace`` of ``"none"``, ``"null"`` or ``""`` (case-insensitive) is
    passed to the harness as ``None``. ``rerun_completed`` is forwarded as the
    harness's ``force_rerun`` flag.

    Returns:
        Whatever ``harness_main`` returns.
    """
    if not te_enabled:
        # Disable TE injection by setting TE
        os.environ["TE"] = "1"
        os.environ.pop("TE_ID", None)
    else:
        # Enable TE injection by ensuring TE is UNSET and TE_ID is set
        os.environ.pop("TE", None)
        if te_id:
            os.environ["TE_ID"] = te_id

    namespace_is_placeholder = (
        isinstance(namespace, str) and namespace.lower() in {"none", "null", ""}
    )

    harness_kwargs = {
        "dataset_name": dataset_name,
        "split": split,
        "instance_ids": instance_ids,
        "predictions_path": predictions_path,
        "max_workers": max_workers,
        "force_rebuild": bool(force_rebuild),
        "cache_level": cache_level,
        "clean": bool(clean),
        "open_file_limit": open_file_limit,
        "run_id": run_id,
        "timeout": timeout,
        "namespace": None if namespace_is_placeholder else namespace,
        "rewrite_reports": False,
        "modal": False,
        "instance_image_tag": instance_image_tag,
        "report_dir": ".",
        "force_rerun": bool(rerun_completed),
    }
    return harness_main(**harness_kwargs)


def _existing_completed_instances(run_id: str) -> set[str]:
    """Return the names of instance folders that already hold a report.json.

    Looks under ``logs/run_evaluation/<safe run_id>/<model dir>/<instance dir>``
    and collects the name of every instance entry containing ``report.json``.
    The scan is best-effort: if it raises part-way through, whatever was
    collected up to that point is returned.
    """
    completed: set[str] = set()
    run_root = Path("logs") / "run_evaluation" / safe_component(run_id)
    if run_root.exists():
        try:
            for model_dir in run_root.iterdir():
                if model_dir.is_dir():
                    completed.update(
                        entry.name
                        for entry in model_dir.iterdir()
                        if (entry / "report.json").exists()
                    )
        except Exception:
            pass
    return completed


def _read_cov_pct(p: Path) -> float | None:
    """Return ``totals.percent_covered`` from ``p / "coverage.json"`` as a float.

    Returns ``None`` when the file cannot be read or parsed, when the document
    or its ``totals`` entry is not a dict, or when ``percent_covered`` is not
    an int or float.
    """
    try:
        payload = json.loads((p / "coverage.json").read_text(encoding="utf-8"))
        totals = payload.get("totals") if isinstance(payload, dict) else None
        pct = totals.get("percent_covered") if isinstance(totals, dict) else None
        if isinstance(pct, (int, float)):
            return float(pct)
    except Exception:
        pass
    return None


def _merge_te_sources_for_instances(target_te_id: str, source_te_ids: List[str], instance_ids: List[str]):
    """Copy numbered TE folders from other TE IDs into ``target_te_id``.

    For every instance id, each numeric subfolder found under
    ``logs/test_enhancer/<source_te_id>/<instance_id>/`` is copied to the next
    free numeric index under
    ``logs/test_enhancer/<target_te_id>/<instance_id>/``. Sources are handled
    in the order given and their subfolders in ascending numeric order, so the
    relative order of the copied folders is preserved.

    The merge is best-effort: a folder that fails to copy is reported with a
    ``[WARN]`` line and the remaining folders are still processed.

    NOTE: nothing records which folders were merged before, so every call
    copies all matching source folders again.

    Args:
        target_te_id: TE ID that receives the copies; created if missing.
        source_te_ids: TE IDs to copy from; an entry equal to
            ``target_te_id`` is ignored.
        instance_ids: Instance ids whose folders should be merged.
    """

    def numbered_dirs(parent: Path) -> List[Path]:
        # isdecimal() rather than isdigit(): every accepted name must be
        # parseable by int() (e.g. "²" is a digit but not a valid int literal).
        return sorted(
            (p for p in parent.iterdir() if p.is_dir() and p.name.isdecimal()),
            key=lambda p: int(p.name),
        )

    root = Path("logs") / "test_enhancer"
    target_root = root / target_te_id
    target_root.mkdir(parents=True, exist_ok=True)

    for iid in instance_ids:
        dst_inst = target_root / iid
        dst_inst.mkdir(parents=True, exist_ok=True)

        # Start numbering after the highest index already present in the target.
        existing = numbered_dirs(dst_inst)
        next_idx = int(existing[-1].name) + 1 if existing else 0

        for src_id in source_te_ids:
            if src_id == target_te_id:
                continue
            src_inst = root / src_id / iid
            if not src_inst.is_dir():
                continue

            for sd in numbered_dirs(src_inst):
                # If the index is already taken (a stray file with that name,
                # or a folder created concurrently), move on to the next free
                # index instead of dropping this source folder.
                while (dst_inst / str(next_idx)).exists():
                    next_idx += 1
                dst = dst_inst / str(next_idx)
                try:
                    shutil.copytree(sd, dst)
                except Exception as exc:
                    # Best-effort: report the failure and keep going.
                    print(f"[WARN] Failed to copy TE folder {sd} -> {dst}: {exc}")
                next_idx += 1


def main():
    parser = argparse.ArgumentParser(description="Batch SWE-bench runner for baseline, gold, and model patches with dataset/TE tests.")
    parser.add_argument("--dataset_name", type=str, default="SWE-bench/SWE-bench")
    parser.add_argument("--split", type=str, default="test")

    parser.add_argument("--pred_dir", type=Path, default=Path("model_generated"), help="Directory containing model predictions JSON/JSONL files")
    parser.add_argument("--predictions_paths", nargs="*", type=Path, default=None, help="Optional explicit list of prediction files (.json/.jsonl); overrides --pred_dir if provided")
    parser.add_argument("--limit_files", type=int, default=0, help="Limit to first N predictions files (0=all)")

    parser.add_argument("--te_id", type=str, default="TE_batch_fast_new", help="Primary TE logs ID under logs/test_enhancer/<TE_ID>/instance_id/")
    parser.add_argument("--merge_te_ids", nargs="*", type=str, default=None, help="Additional TE IDs to merge into --te_id before running (e.g., TE_20)")

    parser.add_argument("--max_workers", type=int, default=4)
    parser.add_argument("--timeout", type=int, default=1800)
    parser.add_argument("--force_rebuild", action="store_true")
    parser.add_argument("--cache_level", type=str, default="env", choices=["none", "base", "env", "instance"])
    parser.add_argument("--clean", action="store_true")
    parser.add_argument("--namespace", type=str, default="none", help="Docker images namespace; use 'none' for local-only builds")
    parser.add_argument("--instance_image_tag", type=str, default="latest")
    parser.add_argument("--run_prefix", type=str, default="batch", help="Prefix for run IDs")
    parser.add_argument("--accumulate", action="store_true", help="Skip instances that already have report.json under a scenario run_id and accumulate results instead of re-running them")
    parser.add_argument(
        "--purge_scenarios",
        nargs="*",
        type=str,
        default=None,
        help=(
            "Optional list of scenarios to purge before running (delete logs/run_evaluation/<run_id>) "
            "Choices per scenario: dataset.baseline, dataset.gold, dataset.model, te.baseline, te.gold, te.model"
        ),
    )

    args = parser.parse_args()

    # Warn if user passed a non-empty namespace (this disables local image builds)
    if isinstance(args.namespace, str) and args.namespace.lower() not in {"none", "null", ""}:
        print(
            f"[WARN] Using remote namespace '{args.namespace}'. The harness will NOT build env/instance images locally and will attempt to pull them instead.\n"
            "       If these images are not published to a registry under that namespace, you will see 'pull access denied'.\n"
            "       For local builds, use: --namespace none"
        )

    if args.predictions_paths:
        pred_files = [Path(p) for p in args.predictions_paths if Path(p).is_file()]
    else:
        pred_files = sorted([p for p in Path(args.pred_dir).glob("*.json*") if p.is_file()])
    if args.limit_files > 0:
        pred_files = pred_files[: args.limit_files]
    if not pred_files:
        print(f"No prediction files found in {args.pred_dir}")
        sys.exit(1)

    # Scratch workspace for generated predictions
    tmp_root = Path(tempfile.mkdtemp(prefix="swebench_batch_"))
    print(f"[INFO] Using workspace: {tmp_root}")

    # Keep records of all scenario reports for CSV summarization
    summary_rows: List[Dict[str, str | int | float]] = []
    per_instance_rows: List[Dict[str, str]] = []

    for pf_idx, pf in enumerate(pred_files, 1):
        model_label = pf.stem  # use filename as model label
        print(f"\n===== Processing predictions file [{pf_idx}/{len(pred_files)}]: {pf.name} =====")
        pred_map = read_predictions_file(pf)
        instance_ids = sorted(pred_map.keys())
        if not instance_ids:
            print(f"[WARN] No valid predictions parsed from {pf}")
            continue

        if args.merge_te_ids:
            _merge_te_sources_for_instances(args.te_id, args.merge_te_ids, instance_ids)

        te_root = Path("logs") / "test_enhancer" / args.te_id
        for iid in instance_ids:
            inst_dir = te_root / iid
            if not inst_dir.exists():
                print(f"[TE] {iid}: skipped, no tests (missing {inst_dir})")
                continue
            # Look for any numbered subfolder 0..39 with at least one file
            has_any = False
            for n in range(40):
                sub = inst_dir / str(n)
                if sub.is_dir():
                    try:
                        # Consider presence of any non-empty file under this subdir as a signal of tests
                        if any(p.is_file() and p.stat().st_size > 0 for p in sub.rglob("*")):
                            has_any = True
                            break
                    except Exception:
                        # Ignore filesystem errors for scan, continue checking others
                        pass
            if not has_any:
                print(f"[TE] {iid}: skipped, no tests (no numbered subfolders 0..39 with content)")

        # Create predictions files for three variants: baseline(noop), model, gold
        work_dir = tmp_root / model_label
        work_dir.mkdir(parents=True, exist_ok=True)

        # A) baseline/no-op
        baseline_map = {iid: NOOP_PATCH for iid in instance_ids}
        preds_baseline = write_predictions_json(baseline_map, f"{model_label}__baseline", work_dir)
        # C) model
        preds_model = write_predictions_json(pred_map, f"{model_label}__model", work_dir)
        # B) gold handled by 'gold' shortcut in harness (use dataset patch); we just pass string 'gold'

        # Ensure dataset subset exists (for early validation)
        try:
            load_swebench_dataset(args.dataset_name, args.split, instance_ids=instance_ids)
        except Exception as e:
            print(f"[ERROR] Loading dataset subset failed for {len(instance_ids)} ids: {e}")
            continue

        # Run scenarios with dataset tests (TE disabled)
        print("\n--- Running dataset tests (TE disabled) ---")
        # Baseline
        run_id = f"{args.run_prefix}.dataset.baseline.{model_label}"
        if args.purge_scenarios and "dataset.baseline" in args.purge_scenarios:
            run_dir = Path("logs")/"run_evaluation"/safe_component(run_id)
            if run_dir.exists():
                print(f"[PURGE] Removing existing run dir: {run_dir}")
                shutil.rmtree(run_dir, ignore_errors=True)
        pending_ids = instance_ids
        if args.accumulate:
            done = _existing_completed_instances(run_id)
            pending_ids = [i for i in instance_ids if i not in done]
        rpt = run_harness(
            dataset_name=args.dataset_name,
            split=args.split,
            instance_ids=pending_ids,
            predictions_path=str(preds_baseline),
            run_id=run_id,
            max_workers=args.max_workers,
            timeout=args.timeout,
            force_rebuild=args.force_rebuild,
            cache_level=args.cache_level,
            clean=args.clean,
            namespace=args.namespace,
            instance_image_tag=args.instance_image_tag,
            te_enabled=False,
            te_id=None,
        )
        # Collect summary
        try:
            with open(rpt, "r", encoding="utf-8") as f:
                data = json.load(f)
            summary_rows.append({
                "run_id": run_id,
                "tests": "dataset",
                "patch": "baseline",
                "model": model_label,
                "submitted": len(instance_ids),
                "completed": int(data.get("completed_instances", 0)),
                "resolved": int(data.get("resolved_instances", 0)),
                "unresolved": int(data.get("unresolved_instances", 0)),
                "errors": int(data.get("error_instances", 0)),
                "empty_patches": int(data.get("empty_patch_instances", 0)),
            })
            # Per-instance status
            sid = set(data.get("submitted_ids", []))
            rid = set(data.get("resolved_ids", []))
            uid = set(data.get("unresolved_ids", []))
            eid = set(data.get("error_ids", []))
            iid_incomplete = set(data.get("incomplete_ids", []))
            emp = set(data.get("empty_patch_ids", []))
            for iid in sid | rid | uid | eid | iid_incomplete | emp:
                if iid in rid:
                    st = "resolved"
                elif iid in uid:
                    st = "unresolved"
                elif iid in emp:
                    st = "empty"
                elif iid in eid:
                    st = "error"
                elif iid in iid_incomplete:
                    st = "incomplete"
                else:
                    st = "submitted"
                # coverage (dataset baseline)
                cov = _read_cov_pct(Path("logs")/"run_evaluation"/run_id/model_label/iid) if False else None
                per_instance_rows.append({
                    "run_id": run_id,
                    "tests": "dataset",
                    "patch": "baseline",
                    "model": model_label,
                    "instance_id": iid,
                    "status": st,
                    "dataset_coverage_pct": "",
                    "te_coverage_pct": "",
                    "coverage_delta_pct": "",
                })
        except Exception:
            pass
        # Gold
        run_id = f"{args.run_prefix}.dataset.gold.{model_label}"
        if args.purge_scenarios and "dataset.gold" in args.purge_scenarios:
            run_dir = Path("logs")/"run_evaluation"/safe_component(run_id)
            if run_dir.exists():
                print(f"[PURGE] Removing existing run dir: {run_dir}")
                shutil.rmtree(run_dir, ignore_errors=True)
        pending_ids = instance_ids
        if args.accumulate:
            done = _existing_completed_instances(run_id)
            pending_ids = [i for i in instance_ids if i not in done]
        rpt = run_harness(
            dataset_name=args.dataset_name,
            split=args.split,
            instance_ids=pending_ids,
            predictions_path="gold",
            run_id=run_id,
            max_workers=args.max_workers,
            timeout=args.timeout,
            force_rebuild=args.force_rebuild,
            cache_level=args.cache_level,
            clean=args.clean,
            namespace=args.namespace,
            instance_image_tag=args.instance_image_tag,
            te_enabled=False,
            te_id=None,
        )
        try:
            with open(rpt, "r", encoding="utf-8") as f:
                data = json.load(f)
            summary_rows.append({
                "run_id": run_id,
                "tests": "dataset",
                "patch": "gold",
                "model": model_label,
                "submitted": len(instance_ids),
                "completed": int(data.get("completed_instances", 0)),
                "resolved": int(data.get("resolved_instances", 0)),
                "unresolved": int(data.get("unresolved_instances", 0)),
                "errors": int(data.get("error_instances", 0)),
                "empty_patches": int(data.get("empty_patch_instances", 0)),
            })
            sid = set(data.get("submitted_ids", []))
            rid = set(data.get("resolved_ids", []))
            uid = set(data.get("unresolved_ids", []))
            eid = set(data.get("error_ids", []))
            iid_incomplete = set(data.get("incomplete_ids", []))
            emp = set(data.get("empty_patch_ids", []))
            for iid in sid | rid | uid | eid | iid_incomplete | emp:
                if iid in rid:
                    st = "resolved"
                elif iid in uid:
                    st = "unresolved"
                elif iid in emp:
                    st = "empty"
                elif iid in eid:
                    st = "error"
                elif iid in iid_incomplete:
                    st = "incomplete"
                else:
                    st = "submitted"
                per_instance_rows.append({
                    "run_id": run_id,
                    "tests": "dataset",
                    "patch": "gold",
                    "model": model_label,
                    "instance_id": iid,
                    "status": st,
                    "dataset_coverage_pct": "",
                    "te_coverage_pct": "",
                    "coverage_delta_pct": "",
                })
        except Exception:
            pass
        # Model
        run_id = f"{args.run_prefix}.dataset.model.{model_label}"
        if args.purge_scenarios and "dataset.model" in args.purge_scenarios:
            run_dir = Path("logs")/"run_evaluation"/safe_component(run_id)
            if run_dir.exists():
                print(f"[PURGE] Removing existing run dir: {run_dir}")
                shutil.rmtree(run_dir, ignore_errors=True)
        pending_ids = instance_ids
        if args.accumulate:
            done = _existing_completed_instances(run_id)
            pending_ids = [i for i in instance_ids if i not in done]
        rpt = run_harness(
            dataset_name=args.dataset_name,
            split=args.split,
            instance_ids=pending_ids,
            predictions_path=str(preds_model),
            run_id=run_id,
            max_workers=args.max_workers,
            timeout=args.timeout,
            force_rebuild=args.force_rebuild,
            cache_level=args.cache_level,
            clean=args.clean,
            namespace=args.namespace,
            instance_image_tag=args.instance_image_tag,
            te_enabled=False,
            te_id=None,
        )
        try:
            with open(rpt, "r", encoding="utf-8") as f:
                data = json.load(f)
            summary_rows.append({
                "run_id": run_id,
                "tests": "dataset",
                "patch": "model",
                "model": model_label,
                "submitted": len(instance_ids),
                "completed": int(data.get("completed_instances", 0)),
                "resolved": int(data.get("resolved_instances", 0)),
                "unresolved": int(data.get("unresolved_instances", 0)),
                "errors": int(data.get("error_instances", 0)),
                "empty_patches": int(data.get("empty_patch_instances", 0)),
            })
            sid = set(data.get("submitted_ids", []))
            rid = set(data.get("resolved_ids", []))
            uid = set(data.get("unresolved_ids", []))
            eid = set(data.get("error_ids", []))
            iid_incomplete = set(data.get("incomplete_ids", []))
            emp = set(data.get("empty_patch_ids", []))
            for iid in sid | rid | uid | eid | iid_incomplete | emp:
                if iid in rid:
                    st = "resolved"
                elif iid in uid:
                    st = "unresolved"
                elif iid in emp:
                    st = "empty"
                elif iid in eid:
                    st = "error"
                elif iid in iid_incomplete:
                    st = "incomplete"
                else:
                    st = "submitted"
                per_instance_rows.append({
                    "run_id": run_id,
                    "tests": "dataset",
                    "patch": "model",
                    "model": model_label,
                    "instance_id": iid,
                    "status": st,
                    "dataset_coverage_pct": "",
                    "te_coverage_pct": "",
                    "coverage_delta_pct": "",
                })
        except Exception:
            pass

        # Run scenarios with TE tests (TE enabled)
        print("\n--- Running TE tests (TE enabled) ---")
        # Baseline
        run_id = f"{args.run_prefix}.te.baseline.{model_label}"
        if args.purge_scenarios and "te.baseline" in args.purge_scenarios:
            run_dir = Path("logs")/"run_evaluation"/safe_component(run_id)
            if run_dir.exists():
                print(f"[PURGE] Removing existing run dir: {run_dir}")
                shutil.rmtree(run_dir, ignore_errors=True)
        pending_ids = instance_ids
        if args.accumulate:
            done = _existing_completed_instances(run_id)
            pending_ids = [i for i in instance_ids if i not in done]
        rpt = run_harness(
            dataset_name=args.dataset_name,
            split=args.split,
            instance_ids=pending_ids,
            predictions_path=str(preds_baseline),
            run_id=run_id,
            max_workers=args.max_workers,
            timeout=args.timeout,
            force_rebuild=args.force_rebuild,
            cache_level=args.cache_level,
            clean=args.clean,
            namespace=args.namespace,
            instance_image_tag=args.instance_image_tag,
            te_enabled=True,
            te_id=args.te_id,
            rerun_completed=True,
        )
        try:
            with open(rpt, "r", encoding="utf-8") as f:
                data = json.load(f)
            summary_rows.append({
                "run_id": run_id,
                "tests": "te",
                "patch": "baseline",
                "model": model_label,
                "submitted": len(instance_ids),
                "completed": int(data.get("completed_instances", 0)),
                "resolved": int(data.get("resolved_instances", 0)),
                "unresolved": int(data.get("unresolved_instances", 0)),
                "errors": int(data.get("error_instances", 0)),
                "empty_patches": int(data.get("empty_patch_instances", 0)),
            })
            sid = set(data.get("submitted_ids", []))
            rid = set(data.get("resolved_ids", []))
            uid = set(data.get("unresolved_ids", []))
            eid = set(data.get("error_ids", []))
            iid_incomplete = set(data.get("incomplete_ids", []))
            emp = set(data.get("empty_patch_ids", []))
            for iid in sid | rid | uid | eid | iid_incomplete | emp:
                if iid in rid:
                    st = "resolved"
                elif iid in uid:
                    st = "unresolved"
                elif iid in emp:
                    st = "empty"
                elif iid in eid:
                    st = "error"
                elif iid in iid_incomplete:
                    st = "incomplete"
                else:
                    st = "submitted"
                # Compute coverage for dataset baseline vs TE baseline and delta
                ds_run_id = f"{args.run_prefix}.dataset.baseline.{model_label}"
                ds_cov = _read_cov_pct(
                    Path("logs")/"run_evaluation"/safe_component(ds_run_id)/safe_component(f"{model_label}__baseline")/safe_component(iid)
                )
                te_cov = _read_cov_pct(
                    Path("logs")/"run_evaluation"/safe_component(run_id)/safe_component(f"{model_label}__baseline")/safe_component(iid)
                )
                delta = (te_cov - ds_cov) if (ds_cov is not None and te_cov is not None) else None
                per_instance_rows.append({
                    "run_id": run_id,
                    "tests": "te",
                    "patch": "baseline",
                    "model": model_label,
                    "instance_id": iid,
                    "status": st,
                    "dataset_coverage_pct": f"{ds_cov:.3f}" if ds_cov is not None else "",
                    "te_coverage_pct": f"{te_cov:.3f}" if te_cov is not None else "",
                    "coverage_delta_pct": f"{delta:.3f}" if delta is not None else "",
                })
        except Exception:
            pass
        # Gold
        run_id = f"{args.run_prefix}.te.gold.{model_label}"
        if args.purge_scenarios and "te.gold" in args.purge_scenarios:
            run_dir = Path("logs")/"run_evaluation"/safe_component(run_id)
            if run_dir.exists():
                print(f"[PURGE] Removing existing run dir: {run_dir}")
                shutil.rmtree(run_dir, ignore_errors=True)
        pending_ids = instance_ids
        if args.accumulate:
            done = _existing_completed_instances(run_id)
            pending_ids = [i for i in instance_ids if i not in done]
        rpt = run_harness(
            dataset_name=args.dataset_name,
            split=args.split,
            instance_ids=pending_ids,
            predictions_path="gold",
            run_id=run_id,
            max_workers=args.max_workers,
            timeout=args.timeout,
            force_rebuild=args.force_rebuild,
            cache_level=args.cache_level,
            clean=args.clean,
            namespace=args.namespace,
            instance_image_tag=args.instance_image_tag,
            te_enabled=True,
            te_id=args.te_id,
            rerun_completed=True,
        )
        try:
            with open(rpt, "r", encoding="utf-8") as f:
                data = json.load(f)
            summary_rows.append({
                "run_id": run_id,
                "tests": "te",
                "patch": "gold",
                "model": model_label,
                "submitted": len(instance_ids),
                "completed": int(data.get("completed_instances", 0)),
                "resolved": int(data.get("resolved_instances", 0)),
                "unresolved": int(data.get("unresolved_instances", 0)),
                "errors": int(data.get("error_instances", 0)),
                "empty_patches": int(data.get("empty_patch_instances", 0)),
            })
            sid = set(data.get("submitted_ids", []))
            rid = set(data.get("resolved_ids", []))
            uid = set(data.get("unresolved_ids", []))
            eid = set(data.get("error_ids", []))
            iid_incomplete = set(data.get("incomplete_ids", []))
            emp = set(data.get("empty_patch_ids", []))
            for iid in sid | rid | uid | eid | iid_incomplete | emp:
                if iid in rid:
                    st = "resolved"
                elif iid in uid:
                    st = "unresolved"
                elif iid in emp:
                    st = "empty"
                elif iid in eid:
                    st = "error"
                elif iid in iid_incomplete:
                    st = "incomplete"
                else:
                    st = "submitted"
                per_instance_rows.append({
                    "run_id": run_id,
                    "tests": "te",
                    "patch": "gold",
                    "model": model_label,
                    "instance_id": iid,
                    "status": st,
                    "dataset_coverage_pct": "",
                    "te_coverage_pct": "",
                    "coverage_delta_pct": "",
                })
        except Exception:
            pass
        # Model
        run_id = f"{args.run_prefix}.te.model.{model_label}"
        if args.purge_scenarios and "te.model" in args.purge_scenarios:
            run_dir = Path("logs")/"run_evaluation"/safe_component(run_id)
            if run_dir.exists():
                print(f"[PURGE] Removing existing run dir: {run_dir}")
                shutil.rmtree(run_dir, ignore_errors=True)
        pending_ids = instance_ids
        if args.accumulate:
            done = _existing_completed_instances(run_id)
            pending_ids = [i for i in instance_ids if i not in done]
        rpt = run_harness(
            dataset_name=args.dataset_name,
            split=args.split,
            instance_ids=pending_ids,
            predictions_path=str(preds_model),
            run_id=run_id,
            max_workers=args.max_workers,
            timeout=args.timeout,
            force_rebuild=args.force_rebuild,
            cache_level=args.cache_level,
            clean=args.clean,
            namespace=args.namespace,
            instance_image_tag=args.instance_image_tag,
            te_enabled=True,
            te_id=args.te_id,
            rerun_completed=True,
        )
        try:
            with open(rpt, "r", encoding="utf-8") as f:
                data = json.load(f)
            summary_rows.append({
                "run_id": run_id,
                "tests": "te",
                "patch": "model",
                "model": model_label,
                "submitted": len(instance_ids),
                "completed": int(data.get("completed_instances", 0)),
                "resolved": int(data.get("resolved_instances", 0)),
                "unresolved": int(data.get("unresolved_instances", 0)),
                "errors": int(data.get("error_instances", 0)),
                "empty_patches": int(data.get("empty_patch_instances", 0)),
            })
            sid = set(data.get("submitted_ids", []))
            rid = set(data.get("resolved_ids", []))
            uid = set(data.get("unresolved_ids", []))
            eid = set(data.get("error_ids", []))
            iid_incomplete = set(data.get("incomplete_ids", []))
            emp = set(data.get("empty_patch_ids", []))
            for iid in sid | rid | uid | eid | iid_incomplete | emp:
                if iid in rid:
                    st = "resolved"
                elif iid in uid:
                    st = "unresolved"
                elif iid in emp:
                    st = "empty"
                elif iid in eid:
                    st = "error"
                elif iid in iid_incomplete:
                    st = "incomplete"
                else:
                    st = "submitted"
                per_instance_rows.append({
                    "run_id": run_id,
                    "tests": "te",
                    "patch": "model",
                    "model": model_label,
                    "instance_id": iid,
                    "status": st,
                    "dataset_coverage_pct": "",
                    "te_coverage_pct": "",
                    "coverage_delta_pct": "",
                })
        except Exception:
            pass

    # Write comprehensive CSVs
    out_dir = Path("combined_preds")
    out_dir.mkdir(parents=True, exist_ok=True)
    summary_csv = out_dir / f"{args.te_id}_batch_eval_summary.csv"
    per_instance_csv = out_dir / f"{args.te_id}_batch_eval_per_instance.csv"
    if summary_rows:
        # add derived rates
        for r in summary_rows:
            try:
                completed = int(r.get("completed", 0) or 0)
                submitted = int(r.get("submitted", 0) or 0)
                resolved = int(r.get("resolved", 0) or 0)
                r["completion_rate"] = (completed / submitted) if submitted else 0.0
                r["resolve_rate"] = (resolved / submitted) if submitted else 0.0
            except Exception:
                r["completion_rate"] = 0.0
                r["resolve_rate"] = 0.0
        header = [
            "run_id","tests","patch","model",
            "submitted","completed","resolved","unresolved","errors","empty_patches",
            "completion_rate","resolve_rate",
        ]
        with open(summary_csv, "w", newline="", encoding="utf-8") as f:
            w = csv.DictWriter(f, fieldnames=header)
            w.writeheader()
            for r in summary_rows:
                w.writerow({k: r.get(k, "") for k in header})
        print(f"Wrote summary CSV: {summary_csv}")
    if per_instance_rows:
        header = [
            "run_id","tests","patch","model","instance_id","status",
            "dataset_coverage_pct","te_coverage_pct","coverage_delta_pct",
        ]
        with open(per_instance_csv, "w", newline="", encoding="utf-8") as f:
            w = csv.DictWriter(f, fieldnames=header)
            w.writeheader()
            for r in per_instance_rows:
                w.writerow({k: r.get(k, "") for k in header})
        print(f"Wrote per-instance CSV: {per_instance_csv}")

    print("\nAll scenarios complete.")


# Script entry point: invoke main() only when this file is executed directly,
# so that importing the module does not trigger any evaluation run.
if __name__ == "__main__":
    main()
