from __future__ import annotations

import base64
import hashlib
import json
import math
import os
import re
import signal
import subprocess
import sys
import time
import uuid
from typing import Any, Dict, List, Optional, Tuple

from python_src.precision import normalize_precision_model, precision_format_to_dag_dtype
from python_src.io_utils import read_json, write_json_atomic
from schemas.mcp_contract import SCHEMA_VERSION, build_artifact_id, make_response, parse_artifact_id
from tools.spec_validate import normalize_spec_dict


def _safe_write_json(path: str, payload: Dict[str, Any]) -> None:
    """Write *payload* to *path* via ``write_json_atomic``.

    The document is emitted ASCII-escaped with a two-space indent.
    """
    write_json_atomic(path, payload, indent=2, ensure_ascii=True)


def _read_json(path: str) -> Dict[str, Any]:
    """Load the JSON document stored at *path* via ``read_json``."""
    document = read_json(path)
    return document


def _stable_payload_hash(payload: Dict[str, Any]) -> str:
    """Return a SHA-256 hex digest of *payload* that ignores key order.

    The payload is serialized as compact, key-sorted, ASCII-only JSON, so two
    dictionaries with the same content always produce the same digest.
    """
    canonical = json.dumps(
        payload,
        ensure_ascii=True,
        sort_keys=True,
        separators=(",", ":"),
    )
    digest = hashlib.sha256()
    digest.update(canonical.encode("utf-8"))
    return digest.hexdigest()


def _sanitize_task_tag(value: str) -> str:
    """Turn a caller-supplied task tag into a safe single directory name.

    Surrounding whitespace is stripped, every run of characters outside
    ``[\\w\\-.]`` collapses to one underscore, and the result is capped at 80
    characters. An empty tag becomes ``"run"``.

    Tags made only of dots (``"."``, ``".."``, ...) pass the character
    whitelist but are special path components: joined onto the op root they
    would address the root itself or its parent. Their dots are therefore
    replaced by underscores.

    Args:
        value: Raw tag text.

    Returns:
        A non-empty name of at most 80 characters.
    """
    value = value.strip()
    if not value:
        return "run"
    value = re.sub(r"[^\w\-\.]+", "_", value)
    if not value.strip("."):
        # Dot-only names would escape (or alias) the base folder when joined.
        value = "_" * len(value)
    return value[:80]


def _sanitize_log_label(value: str) -> str:
    """Reduce *value* to a short label built from word chars, ``-`` and ``.``.

    Surrounding whitespace is dropped, each run of other characters becomes a
    single underscore, and the label is truncated to 80 characters. A blank
    input yields ``"log"``.
    """
    trimmed = value.strip()
    if trimmed:
        return re.sub(r"[^\w\-\.]+", "_", trimmed)[:80]
    return "log"


def _safe_relpath(rel_path: str) -> str:
    """Normalize *rel_path* and reject paths that climb above their base.

    The path is collapsed with ``os.path.normpath`` and leading separators are
    removed so that an absolute-looking path is treated as relative.

    Only a leading ``..`` *component* counts as an escape. A plain
    ``startswith("..")`` test would also reject ordinary names that merely
    begin with two dots (for example ``"..cache/x.json"``).

    Args:
        rel_path: Path relative to a task directory.

    Returns:
        The normalized relative path.

    Raises:
        ValueError: If the normalized path starts with a ``..`` component.
    """
    normalized = os.path.normpath(rel_path).lstrip(os.sep)
    if normalized == os.pardir or normalized.startswith(os.pardir + os.sep):
        raise ValueError("artifact path escapes task directory")
    return normalized


def _count_ops(dag_data: Dict[str, Any]) -> int:
    """Count the entries of ``dag_data["nodes"]`` whose ``type`` is 2 through 6."""
    op_type_codes = (2, 3, 4, 5, 6)
    return sum(
        1
        for node in dag_data.get("nodes", [])
        if node.get("type") in op_type_codes
    )


def _best_candidate_from_bestof(task_dir: str, task_tag: str) -> Optional[Dict[str, Any]]:
    """Pick the best candidate among the ``*.json`` files in ``<task_dir>/bestof``.

    A candidate is better when its ``optimization_error`` is lower; equal
    errors are broken by the smaller ``_count_ops`` value. Files are visited
    in sorted name order, so a full tie always resolves to the same file
    regardless of the order the filesystem lists them in.

    Files that cannot be read, are not JSON objects, lack a finite numeric
    ``optimization_error`` or have a malformed node list are skipped instead
    of aborting the whole scan.

    Args:
        task_dir: Directory of the task.
        task_tag: Tag used to build the returned artifact id.

    Returns:
        A dict with ``artifact_id``, ``optimization_error``, ``ops`` and
        ``source`` (file name), or ``None`` when no usable candidate exists.
    """
    bestof_dir = os.path.join(task_dir, "bestof")
    if not os.path.isdir(bestof_dir):
        return None

    best_path: Optional[str] = None
    best_error = float("inf")
    best_ops: Optional[int] = None
    for name in sorted(os.listdir(bestof_dir)):
        if not name.endswith(".json"):
            continue
        path = os.path.join(bestof_dir, name)
        try:
            data = _read_json(path)
        except Exception:
            # Candidates may be mid-write or corrupt; just ignore them.
            continue
        if not isinstance(data, dict):
            continue
        try:
            error = float(data.get("optimization_error"))
        except (TypeError, ValueError, OverflowError):
            continue
        if not math.isfinite(error):
            continue
        try:
            ops = _count_ops(data)
        except (TypeError, AttributeError):
            # "nodes" is not a list of dicts; the candidate is unusable.
            continue
        if (
            error < best_error
            or (error == best_error and (best_ops is None or ops < best_ops))
        ):
            best_error = error
            best_ops = ops
            best_path = path

    if best_path is None:
        return None

    best_rel_path = os.path.relpath(best_path, task_dir)
    return {
        "artifact_id": build_artifact_id(task_tag, best_rel_path),
        "optimization_error": best_error,
        "ops": best_ops,
        "source": os.path.basename(best_path),
    }


def _pid_alive(pid: Any) -> bool:
    """Report whether signal 0 can be delivered to process *pid*.

    Args:
        pid: Anything convertible with ``int()``; typically a value read back
            from ``run.json``.

    Returns:
        ``True`` when ``os.kill(pid, 0)`` succeeds. ``False`` when *pid* is
        not a positive integer, does not fit the platform's pid type, or the
        call fails with any ``OSError`` (which includes ``PermissionError``).
    """
    try:
        pid_int = int(pid)
    except (TypeError, ValueError, OverflowError):
        # OverflowError: int(float("inf")), which JSON "Infinity" decodes to.
        return False
    if pid_int <= 0:
        return False
    try:
        os.kill(pid_int, 0)
    except (OSError, OverflowError):
        # OverflowError: the integer does not fit the C pid type.
        return False
    return True


def _process_liveness(run_info: Dict[str, Any]) -> Tuple[int, int]:
    """Count tracked processes of a run and how many of them are still alive.

    Entries of ``run_info["processes"]`` that are not dicts, or whose ``pid``
    is missing, non-numeric, non-finite or not positive, are ignored so that a
    damaged ``run.json`` cannot break status reporting.

    Args:
        run_info: Parsed run metadata; ``processes`` may be absent or ``None``.

    Returns:
        ``(alive, total)`` where ``total`` is the number of entries with a
        usable pid and ``alive`` the number for which ``_pid_alive`` is true.
    """
    alive = 0
    total = 0
    for proc in run_info.get("processes", []) or []:
        if not isinstance(proc, dict):
            continue
        try:
            pid_int = int(proc.get("pid"))
        except (TypeError, ValueError, OverflowError):
            continue
        if pid_int <= 0:
            continue
        total += 1
        if _pid_alive(pid_int):
            alive += 1
    return alive, total


def _tracked_alive_pids(run_info: Dict[str, Any]) -> List[int]:
    """Return the pids recorded in *run_info* that are currently alive.

    Entries of ``run_info["processes"]`` that are not dicts, or whose ``pid``
    is missing, non-numeric, non-finite or not positive, are skipped.

    Args:
        run_info: Parsed run metadata; ``processes`` may be absent or ``None``.

    Returns:
        Pids, in recorded order, for which ``_pid_alive`` is true.
    """
    alive: List[int] = []
    for proc in run_info.get("processes", []) or []:
        if not isinstance(proc, dict):
            continue
        try:
            pid_int = int(proc.get("pid"))
        except (TypeError, ValueError, OverflowError):
            continue
        if pid_int > 0 and _pid_alive(pid_int):
            alive.append(pid_int)
    return alive


def _status_updated_recently(status_payload: Dict[str, Any], now_s: float, stale_after_s: int = 120) -> bool:
    """Tell whether ``status_payload["updated_at"]`` is within the stale window.

    Args:
        status_payload: Status document carrying an ``updated_at`` timestamp.
        now_s: Current time in the same unit as ``updated_at``.
        stale_after_s: Window length; values below 1 are treated as 1.

    Returns:
        ``True`` when ``now_s - updated_at`` is strictly smaller than the
        window; ``False`` otherwise or when ``updated_at`` is not numeric.
    """
    try:
        last_update_s = float(status_payload.get("updated_at"))
    except (TypeError, ValueError):
        return False
    window_s = float(max(1, stale_after_s))
    age_s = now_s - last_update_s
    return age_s < window_s


class RunManager:
    def __init__(self, base_op_folder: Optional[str] = None, repo_root: Optional[str] = None):
        """Remember where the repository and the per-task folders live.

        Args:
            base_op_folder: Directory holding one sub-directory per task. When
                omitted, the ``ANUM_OP_ROOT`` environment variable is used if
                it is set and non-empty, else ``<repo_root>/op``.
            repo_root: Repository root. When omitted, the directory two levels
                above this module file is used.
        """
        if repo_root is None:
            module_path = os.path.abspath(__file__)
            repo_root = os.path.dirname(os.path.dirname(module_path))
        self.repo_root = repo_root

        if base_op_folder is None:
            env_root = os.getenv("ANUM_OP_ROOT")
            base_op_folder = env_root if env_root else os.path.join(self.repo_root, "op")
        self.base_op_folder = base_op_folder

    def _task_dir(self, task_tag: str) -> str:
        """Return the path of *task_tag*'s folder under ``base_op_folder``."""
        root = self.base_op_folder
        return os.path.join(root, task_tag)

    def _normalize_spec(self, spec: Dict[str, Any]) -> Dict[str, Any]:
        """Return *spec* as processed by ``normalize_spec_dict``."""
        normalized = normalize_spec_dict(spec)
        return normalized

    def _load_run_info(self, task_tag: str) -> Dict[str, Any]:
        """Read ``run.json`` of *task_tag*; ``{}`` if it is missing or unreadable."""
        run_file = os.path.join(self._task_dir(task_tag), "run.json")
        info: Dict[str, Any] = {}
        if os.path.exists(run_file):
            try:
                info = _read_json(run_file)
            except Exception:
                info = {}
        return info

    def _load_status_info(self, task_tag: str) -> Dict[str, Any]:
        """Read ``status.json`` of *task_tag*; ``{}`` if it is missing or unreadable."""
        status_file = os.path.join(self._task_dir(task_tag), "status.json")
        info: Dict[str, Any] = {}
        if os.path.exists(status_file):
            try:
                info = _read_json(status_file)
            except Exception:
                info = {}
        return info

    def _load_saved_spec(self, task_tag: str) -> Dict[str, Any]:
        """Read ``spec.json`` of *task_tag*; ``{}`` if it is missing or unreadable."""
        spec_file = os.path.join(self._task_dir(task_tag), "spec.json")
        saved: Dict[str, Any] = {}
        if os.path.exists(spec_file):
            try:
                saved = _read_json(spec_file)
            except Exception:
                saved = {}
        return saved

    def _reconcile_status_with_liveness(
        self,
        task_tag: str,
        status_payload: Dict[str, Any],
        run_info: Dict[str, Any],
    ) -> Tuple[Dict[str, Any], int, int]:
        """Downgrade an active status whose tracked processes have all exited.

        The status is rewritten only when all of the following hold: it is
        ``queued``, ``running`` or ``stopping``; at least one pid is tracked;
        none of the tracked pids is alive; and ``updated_at`` is older than
        the per-status stale window (overridable via ``ANUM_STATUS_STALE_*_S``).
        The new status is ``stopped`` if a stop was requested (status
        ``stopping``, ``stop_requested_at`` set, or a ``STOP`` file exists),
        otherwise ``failed``. The rewritten status is saved to ``status.json``
        on a best-effort basis.

        Returns:
            ``(status, alive_count, total_count)`` where ``status`` is either
            the original payload or the reconciled copy.
        """
        alive_pids, total_pids = _process_liveness(run_info)
        unchanged = (status_payload, alive_pids, total_pids)

        # Active statuses mapped to (environment override, default seconds).
        stale_window_env = {
            "queued": ("ANUM_STATUS_STALE_QUEUED_S", "120"),
            "running": ("ANUM_STATUS_STALE_RUNNING_S", "45"),
            "stopping": ("ANUM_STATUS_STALE_STOPPING_S", "30"),
        }
        status_value = str(status_payload.get("status", "")).lower()
        if status_value not in stale_window_env:
            return unchanged
        if total_pids <= 0 or alive_pids > 0:
            return unchanged

        env_name, default_s = stale_window_env[status_value]
        stale_after_s = int(os.getenv(env_name, default_s))
        if _status_updated_recently(status_payload, time.time(), stale_after_s=stale_after_s):
            return unchanged

        task_dir = self._task_dir(task_tag)
        stop_requested = status_payload.get("stop_requested_at") is not None
        stop_file_exists = os.path.exists(os.path.join(task_dir, "STOP"))
        if status_value == "stopping" or stop_requested or stop_file_exists:
            next_status, note = "stopped", "all_tracked_processes_exited_after_stop"
        else:
            next_status, note = "failed", "all_tracked_processes_exited_unexpectedly"

        reconciled = {
            **status_payload,
            "status": next_status,
            "updated_at": int(time.time()),
            "note": note,
        }
        try:
            _safe_write_json(os.path.join(task_dir, "status.json"), reconciled)
        except Exception:
            # Persisting is best effort; the caller still gets the new status.
            pass
        return reconciled, alive_pids, total_pids

    def _resolve_artifact_path(self, artifact_id: str) -> Tuple[str, str]:
        """Translate *artifact_id* into ``(task_tag, absolute_path)``.

        The relative part ``"root"`` addresses the task directory itself.

        Raises:
            FileNotFoundError: The task directory does not exist.
            ValueError: The path would leave the task directory.
        """
        task_tag, rel_path = parse_artifact_id(artifact_id)
        task_root = self._task_dir(task_tag)
        if not os.path.isdir(task_root):
            raise FileNotFoundError("task_tag not found")

        rel_path = _safe_relpath("." if rel_path == "root" else rel_path)
        root_abs = os.path.abspath(task_root)
        candidate = os.path.abspath(os.path.join(task_root, rel_path))
        inside_root = candidate == root_abs or candidate.startswith(root_abs + os.sep)
        if not inside_root:
            raise ValueError("artifact path escapes task directory")
        return task_tag, candidate

    def submit(
        self,
        spec: Dict[str, Any],
        task_tag: Optional[str] = None,
        runner: Optional[Dict[str, Any]] = None,
        request_id: Optional[str] = None,
    ) -> Dict[str, Any]:
        """Validate *spec*, prepare the task directory and start a new run.

        Args:
            spec: Raw run specification; normalized via ``_normalize_spec``.
            task_tag: Optional task folder name. Defaults to the request id
                and is always passed through ``_sanitize_task_tag``.
            runner: Optional runner descriptor, defaulting to
                ``{"type": "local", "params": {}}``. Only the ``"local"`` type
                is launched here; other types leave the run ``queued``.
            request_id: Optional idempotency key. Falls back to the spec's own
                ``request_id`` and then to a random value.

        Returns:
            A ``make_response`` envelope. ``error`` for an invalid spec or for
            a request id reused with a different spec while the run is active.
            Otherwise ``ok`` with ``run_id``, ``task_tag``, ``request_id``,
            ``status`` and the root ``artifact_id``. A replayed request or a
            submit against an already active task returns the existing run
            with a warning instead of starting another one.

        Raises:
            Exception: Anything raised by ``_start_local_run`` is re-raised
                after the status has been marked ``failed``.
        """
        try:
            normalized = self._normalize_spec(spec)
        except Exception as exc:
            return make_response(
                "error",
                errors=[{"code": "spec_invalid", "message": str(exc), "details": {}}],
            )

        request_id = request_id or normalized.get("request_id") or uuid.uuid4().hex[:16]
        normalized["request_id"] = request_id
        task_tag = _sanitize_task_tag(task_tag or request_id)
        task_dir = self._task_dir(task_tag)
        submit_warnings: List[Dict[str, Any]] = []

        existing_run = self._load_run_info(task_tag)
        existing_status = self._load_status_info(task_tag)
        if existing_run and existing_status:
            existing_status, _, _ = self._reconcile_status_with_liveness(task_tag, existing_status, existing_run)
        existing_status_value = str(existing_status.get("status", "")).lower()
        existing_run_id = existing_run.get("run_id")
        existing_request_id = existing_run.get("request_id")

        # Idempotent replay: same task_tag + same request_id should return existing run metadata.
        if existing_run_id and existing_request_id and request_id == existing_request_id:
            existing_spec = self._load_saved_spec(task_tag)
            if existing_spec and _stable_payload_hash(existing_spec) != _stable_payload_hash(normalized):
                # A differing spec is a conflict only while the old run is
                # queued/running; otherwise the request_id is refreshed so the
                # task can be continued with the new spec.
                if existing_status_value in ("queued", "running"):
                    return make_response(
                        "error",
                        errors=[
                            {
                                "code": "idempotency_conflict",
                                "message": "request_id already exists with a different spec payload",
                                "details": {
                                    "task_tag": task_tag,
                                    "request_id": request_id,
                                },
                            }
                        ],
                        request_id=request_id,
                        run_id=existing_run_id,
                        task_tag=task_tag,
                    )
                old_request_id = request_id
                request_id = f"{old_request_id}_{int(time.time())}_{uuid.uuid4().hex[:8]}"
                normalized["request_id"] = request_id
                submit_warnings.append(
                    {
                        "code": "request_id_refreshed",
                        "message": "request_id conflicted with historical run; refreshed automatically.",
                        "details": {
                            "task_tag": task_tag,
                            "old_request_id": old_request_id,
                            "new_request_id": request_id,
                        },
                    }
                )
            else:
                return make_response(
                    "ok",
                    data={
                        "run_id": existing_run_id,
                        "task_tag": task_tag,
                        "request_id": existing_request_id,
                        "status": existing_status_value or "unknown",
                        "artifact_id": build_artifact_id(task_tag, "root"),
                    },
                    warnings=[
                        {
                            "code": "duplicate_submit_ignored",
                            "message": "Idempotent replay detected; returning existing run.",
                            "details": {
                                "task_tag": task_tag,
                                "request_id": request_id,
                            },
                        }
                    ],
                    request_id=existing_request_id,
                    run_id=existing_run_id,
                    task_tag=task_tag,
                )

        # task_tag-level dedup to block duplicate active runs from repeated SUBMIT actions.
        if existing_run_id and existing_status_value in ("queued", "running"):
            return make_response(
                "ok",
                data={
                    "run_id": existing_run_id,
                    "task_tag": task_tag,
                    "request_id": existing_request_id or request_id,
                    "status": existing_status_value,
                    "artifact_id": build_artifact_id(task_tag, "root"),
                },
                warnings=[
                    {
                        "code": "task_active_reused",
                        "message": "task_tag already has an active run; reusing existing run.",
                        "details": {
                            "task_tag": task_tag,
                            "status": existing_status_value,
                        },
                    }
                ],
                request_id=existing_request_id or request_id,
                run_id=existing_run_id,
                task_tag=task_tag,
            )

        run_id = uuid.uuid4().hex[:12]
        os.makedirs(task_dir, exist_ok=True)
        stop_path = os.path.join(task_dir, "STOP")
        if os.path.exists(stop_path):
            try:
                os.remove(stop_path)
                submit_warnings.append(
                    {
                        "code": "stale_stop_cleared",
                        "message": "Removed stale STOP marker before submitting a new run.",
                        "details": {"task_tag": task_tag},
                    }
                )
            except OSError:
                submit_warnings.append(
                    {
                        "code": "stale_stop_clear_failed",
                        "message": "Failed to remove stale STOP marker; new run may stop immediately.",
                        "details": {"task_tag": task_tag},
                    }
                )

        spec_path = os.path.join(task_dir, "spec.json")
        _safe_write_json(spec_path, normalized)

        status_path = os.path.join(task_dir, "status.json")
        status_payload = {
            "status": "queued",
            "task_tag": task_tag,
            "progress": {},
            "elapsed_s": 0.0,
            "updated_at": int(time.time()),
        }
        _safe_write_json(status_path, status_payload)

        run_payload = {
            "schema_version": SCHEMA_VERSION,
            "run_id": run_id,
            "task_tag": task_tag,
            "request_id": request_id,
            "created_at": int(time.time()),
            "runner": runner or {"type": "local", "params": {}},
            "processes": [],
        }
        runner = run_payload["runner"]
        is_local = runner.get("type") == "local"

        if is_local:
            # run.json is about to be replaced by a record with no processes,
            # so this is the last point where the previous run's pids are
            # known. Kill stragglers now; the cleanup in _start_local_run
            # reads the new run.json and would not see them.
            for pid in _tracked_alive_pids(existing_run):
                try:
                    os.kill(pid, signal.SIGKILL)
                except OSError:
                    pass

        run_path = os.path.join(task_dir, "run.json")
        _safe_write_json(run_path, run_payload)

        if is_local:
            try:
                processes = self._start_local_run(task_tag, normalized, spec_path, runner.get("params", {}))
            except Exception:
                # Without this the task would stay "queued" with no tracked
                # pids forever: liveness reconciliation skips runs without
                # pids and every later submit would reuse the dead run.
                status_payload["status"] = "failed"
                status_payload["updated_at"] = int(time.time())
                status_payload["note"] = "local_launch_failed"
                try:
                    _safe_write_json(status_path, status_payload)
                except Exception:
                    # Best effort; the launch error below is what matters.
                    pass
                raise
            run_payload["processes"] = processes
            _safe_write_json(run_path, run_payload)
            status_payload["status"] = "running"
            status_payload["updated_at"] = int(time.time())
            _safe_write_json(status_path, status_payload)

        return make_response(
            "ok",
            data={
                "run_id": run_id,
                "task_tag": task_tag,
                "request_id": request_id,
                "status": status_payload.get("status"),
                "artifact_id": build_artifact_id(task_tag, "root"),
            },
            warnings=submit_warnings,
            request_id=request_id,
            run_id=run_id,
            task_tag=task_tag,
        )

    def poll(self, task_tag: str) -> Dict[str, Any]:
        """Return the current status document of *task_tag*.

        When ``run.json`` is available the status is first reconciled with
        process liveness and enriched with ``run_id``, ``request_id`` and
        ``process_liveness``. The root ``artifact_id`` is always added, and
        ``suggested_poll_interval_ms`` defaults to 5000 when absent.

        Returns:
            ``not_found`` when ``status.json`` is missing, else ``ok`` with
            the status payload as data.
        """
        status_path = os.path.join(self._task_dir(task_tag), "status.json")
        if not os.path.exists(status_path):
            return make_response(
                "not_found",
                errors=[{"code": "status_missing", "message": "status.json not found", "details": {}}],
            )

        payload = _read_json(status_path)
        run_info = self._load_run_info(task_tag)
        if run_info:
            payload, alive_pids, total_pids = self._reconcile_status_with_liveness(task_tag, payload, run_info)
            payload.update(
                run_id=run_info.get("run_id"),
                request_id=run_info.get("request_id"),
                process_liveness={"alive_pids": alive_pids, "total_pids": total_pids},
            )
        payload["artifact_id"] = build_artifact_id(task_tag, "root")
        if "suggested_poll_interval_ms" not in payload:
            payload["suggested_poll_interval_ms"] = 5000

        return make_response(
            "ok",
            data=payload,
            request_id=run_info.get("request_id"),
            run_id=run_info.get("run_id"),
            task_tag=task_tag,
        )

    def stop(self, task_tag: str) -> Dict[str, Any]:
        """Request that the run of *task_tag* stops and terminate its processes.

        Steps, in order:
          1. write a ``STOP`` marker file into the task directory;
          2. mark ``status.json`` as ``stopping`` (only if the status was
             empty or still active) and record ``stop_requested_at``;
          3. send ``SIGTERM`` to every tracked live pid, wait up to
             ``ANUM_STOP_FORCE_KILL_GRACE_S`` seconds (default 1.0, minimum
             0.1) and ``SIGKILL`` whatever is still alive.

        ``run.json`` is read through the guarded loader, so a damaged file is
        treated as "no tracked processes" rather than raising after the stop
        request has already been recorded. An unparsable or non-finite grace
        setting falls back to the default.

        Returns:
            ``not_found`` when the task directory does not exist, else ``ok``
            with ``stopped_pids``, ``force_killed_pids`` and
            ``stop_requested_at``.
        """
        task_dir = self._task_dir(task_tag)
        if not os.path.isdir(task_dir):
            return make_response(
                "not_found",
                errors=[{"code": "task_missing", "message": "task_tag not found", "details": {}}],
            )
        stop_path = os.path.join(task_dir, "STOP")
        stop_requested_at = int(time.time())
        with open(stop_path, "w", encoding="utf-8") as handle:
            handle.write(f"stop_requested_at={stop_requested_at}\n")

        status_path = os.path.join(task_dir, "status.json")
        status_payload: Dict[str, Any] = {}
        if os.path.exists(status_path):
            try:
                status_payload = _read_json(status_path)
            except Exception:
                status_payload = {}
        current_status = str(status_payload.get("status", "")).lower()
        if current_status in ("", "queued", "running", "stopping"):
            status_payload["status"] = "stopping"
        status_payload["stop_requested_at"] = stop_requested_at
        status_payload["updated_at"] = stop_requested_at
        try:
            _safe_write_json(status_path, status_payload)
        except Exception:
            # Best effort: the STOP marker above already carries the request.
            pass

        run_info = self._load_run_info(task_tag)
        stopped: List[int] = []
        force_killed: List[int] = []
        if run_info:
            for pid in _tracked_alive_pids(run_info):
                try:
                    os.kill(pid, signal.SIGTERM)
                except OSError:
                    continue
                stopped.append(pid)

            # If tasks ignore SIGTERM, force-kill after a short grace period so
            # status can converge instead of lingering in stopping.
            try:
                force_kill_grace_s = float(os.getenv("ANUM_STOP_FORCE_KILL_GRACE_S", "1.0"))
            except ValueError:
                force_kill_grace_s = 1.0
            if not math.isfinite(force_kill_grace_s):
                # "inf" would wait forever on a process ignoring SIGTERM.
                force_kill_grace_s = 1.0
            deadline = time.time() + max(0.1, force_kill_grace_s)
            while time.time() < deadline:
                if not any(_pid_alive(pid) for pid in stopped):
                    break
                time.sleep(0.05)
            for pid in stopped:
                if not _pid_alive(pid):
                    continue
                try:
                    os.kill(pid, signal.SIGKILL)
                except OSError:
                    continue
                force_killed.append(pid)

        return make_response(
            "ok",
            data={
                "task_tag": task_tag,
                "stopped_pids": stopped,
                "force_killed_pids": force_killed,
                "stop_requested_at": stop_requested_at,
            },
            request_id=run_info.get("request_id"),
            run_id=run_info.get("run_id"),
            task_tag=task_tag,
        )

    def result(self, task_tag: str) -> Dict[str, Any]:
        """Return the final result of *task_tag*, or a preview while it runs.

        Behaviour by state:
          * status ``queued``/``running``, or ``stopping`` with live
            processes: a ``running`` response carrying progress, process
            liveness and a preview of the current best candidate;
          * a ``result.json`` with the current schema version that is not
            older than the status (or whose timestamps are not both
            integers): that cached result is returned;
          * otherwise the result is rebuilt from the ``bestof`` directory,
            written to ``result.json`` and returned.

        A ``result.json`` that cannot be read or is not a JSON object is
        ignored and the result is rebuilt.

        Returns:
            A ``make_response`` envelope with status ``running``, ``ok``,
            ``not_found`` (no ``spec.json``) or ``error`` (a stopping,
            stopped, failed or error run without any valid candidate).
        """
        task_dir = self._task_dir(task_tag)
        run_info = self._load_run_info(task_tag)
        status_path = os.path.join(task_dir, "status.json")
        status_payload: Dict[str, Any] = {}
        if os.path.exists(status_path):
            try:
                status_payload = _read_json(status_path)
            except Exception:
                status_payload = {}
        if run_info and status_payload:
            status_payload, alive_pids, total_pids = self._reconcile_status_with_liveness(
                task_tag,
                status_payload,
                run_info,
            )
        else:
            alive_pids, total_pids = _process_liveness(run_info)
        status_value = str(status_payload.get("status", "")).lower()
        stopping_still_active = status_value == "stopping" and alive_pids > 0
        active_statuses = {"queued", "running"}

        # Never report "ok/completed" while the run is still active.
        if status_value in active_statuses or stopping_still_active:
            best_candidate = _best_candidate_from_bestof(task_dir, task_tag)
            preview = None
            if best_candidate:
                preview = {
                    "optimization_error": best_candidate.get("optimization_error"),
                    "ops": best_candidate.get("ops"),
                    "source": best_candidate.get("source"),
                    "artifact_id": best_candidate.get("artifact_id"),
                }
            return make_response(
                "running",
                data={
                    "schema_version": SCHEMA_VERSION,
                    "task_tag": task_tag,
                    "run_id": run_info.get("run_id"),
                    "request_id": run_info.get("request_id"),
                    "task_status": status_value,
                    "progress": status_payload.get("progress"),
                    "elapsed_s": status_payload.get("elapsed_s"),
                    "updated_at": status_payload.get("updated_at"),
                    "process_liveness": {
                        "alive_pids": alive_pids,
                        "total_pids": total_pids,
                    },
                    "best_preview": preview,
                    "artifact_id": build_artifact_id(task_tag, "root"),
                },
                request_id=run_info.get("request_id"),
                run_id=run_info.get("run_id"),
                task_tag=task_tag,
            )

        result_path = os.path.join(task_dir, "result.json")
        cached: Any = None
        if os.path.exists(result_path):
            try:
                cached = _read_json(result_path)
            except Exception:
                # The cache is derived data; fall through and rebuild it.
                cached = None
        if isinstance(cached, dict) and cached.get("schema_version") == SCHEMA_VERSION:
            status_updated = status_payload.get("updated_at")
            cached_generated = cached.get("generated_at")
            comparable = isinstance(status_updated, int) and isinstance(cached_generated, int)
            # Freshness can only be judged when both timestamps are integers;
            # otherwise the cache is served as-is.
            if not comparable or cached_generated >= status_updated:
                return make_response(
                    "ok",
                    data=cached,
                    request_id=run_info.get("request_id"),
                    run_id=run_info.get("run_id"),
                    task_tag=task_tag,
                )

        spec_path = os.path.join(task_dir, "spec.json")
        if not os.path.exists(spec_path):
            return make_response(
                "not_found",
                errors=[{"code": "spec_missing", "message": "spec.json not found", "details": {}}],
            )

        best_candidate = _best_candidate_from_bestof(task_dir, task_tag)
        if not best_candidate:
            # Task is no longer active but no usable best candidate exists.
            if status_value in ("stopping", "stopped", "failed", "error"):
                return make_response(
                    "error",
                    errors=[
                        {
                            "code": "bestof_missing_terminal",
                            "message": "task finished without any valid bestof candidate",
                            "details": {"task_status": status_value},
                        }
                    ],
                )
            return make_response(
                "running",
                errors=[{"code": "bestof_invalid", "message": "no valid bestof entries", "details": {}}],
            )

        result_payload = {
            "schema_version": SCHEMA_VERSION,
            "task_tag": task_tag,
            "run_id": run_info.get("run_id"),
            "request_id": run_info.get("request_id"),
            "best_candidate": best_candidate,
            "artifacts": {
                "spec": build_artifact_id(task_tag, "spec.json"),
                "status": build_artifact_id(task_tag, "status.json"),
                "run": build_artifact_id(task_tag, "run.json"),
                "bestof": build_artifact_id(task_tag, "bestof"),
                "dag_pool": build_artifact_id(task_tag, os.path.join("dag", "pool")),
                "dag_archive": build_artifact_id(task_tag, os.path.join("dag", "archive")),
                "logs": build_artifact_id(task_tag, "logs"),
            },
            "generated_at": int(time.time()),
        }
        _safe_write_json(result_path, result_payload)
        return make_response(
            "ok",
            data=result_payload,
            request_id=run_info.get("request_id"),
            run_id=run_info.get("run_id"),
            task_tag=task_tag,
        )

    def list(self, status_filter: Optional[str] = None) -> Dict[str, Any]:
        """Summarize every task folder under ``base_op_folder``.

        Folders without a readable ``status.json`` are skipped. When a
        ``run.json`` is present the status is reconciled with process
        liveness before it is reported.

        Args:
            status_filter: When given, only runs whose status equals this
                value exactly are returned.

        Returns:
            An ``ok`` response whose data holds ``runs`` (sorted by folder
            name, each with task_tag, run_id, request_id, status, elapsed_s,
            best_error, best_ops and updated_at) and ``count``.
        """
        runs: List[Dict[str, Any]] = []
        if not os.path.isdir(self.base_op_folder):
            return make_response("ok", data={"runs": runs, "count": 0})

        for name in sorted(os.listdir(self.base_op_folder)):
            task_dir = os.path.join(self.base_op_folder, name)
            status_file = os.path.join(task_dir, "status.json")
            if not (os.path.isdir(task_dir) and os.path.exists(status_file)):
                continue
            try:
                status_data = _read_json(status_file)
            except Exception:
                continue

            run_info = self._load_run_info(name)
            if run_info:
                status_data, _, _ = self._reconcile_status_with_liveness(name, status_data, run_info)

            run_status = status_data.get("status", "unknown")
            if status_filter is not None and run_status != status_filter:
                continue

            best_summary = status_data.get("best_summary") or {}
            summary = {
                "task_tag": name,
                "run_id": run_info.get("run_id"),
                "request_id": run_info.get("request_id"),
                "status": run_status,
                "elapsed_s": status_data.get("elapsed_s", 0),
                "best_error": best_summary.get("error"),
                "best_ops": best_summary.get("ops"),
                "updated_at": status_data.get("updated_at"),
            }
            runs.append(summary)

        return make_response("ok", data={"runs": runs, "count": len(runs)})

    def artifact_get(self, artifact_id: str, fmt: Optional[str] = None) -> Dict[str, Any]:
        """Return the content of one artifact file.

        Args:
            artifact_id: Identifier resolved via ``_resolve_artifact_path``.
            fmt: ``"json"`` (default; parsed document), ``"text"`` (UTF-8
                string) or ``"base64"`` (raw bytes, base64-encoded).
                Case-insensitive, surrounding whitespace ignored.

        Returns:
            ``ok`` with ``content``, ``mime_type``, ``size_bytes`` and
            ``updated_at``; ``not_found`` when the task or path is missing;
            ``error`` for an invalid id, a directory, an unknown format, or a
            file that cannot be read or decoded in the requested format.
        """
        started = time.time()
        try:
            task_tag, abs_path = self._resolve_artifact_path(artifact_id)
        except FileNotFoundError:
            return make_response(
                "not_found",
                errors=[{"code": "artifact_missing", "message": "artifact_id not found", "details": {}}],
            )
        except ValueError as exc:
            return make_response(
                "error",
                errors=[{"code": "invalid_artifact", "message": str(exc), "details": {}}],
            )

        if not os.path.exists(abs_path):
            return make_response(
                "not_found",
                errors=[{"code": "artifact_missing", "message": "artifact path not found", "details": {}}],
            )
        if os.path.isdir(abs_path):
            return make_response(
                "error",
                errors=[{"code": "artifact_is_dir", "message": "artifact is a directory", "details": {}}],
            )

        fmt = (fmt or "json").strip().lower()
        if fmt not in ("json", "text", "base64"):
            return make_response(
                "error",
                errors=[{"code": "invalid_format", "message": "format must be json|text|base64", "details": {}}],
            )

        content: Any
        try:
            if fmt == "json":
                with open(abs_path, "r", encoding="utf-8") as handle:
                    content = json.load(handle)
                mime_type = "application/json"
            elif fmt == "text":
                with open(abs_path, "r", encoding="utf-8") as handle:
                    content = handle.read()
                mime_type = "text/plain"
            else:
                with open(abs_path, "rb") as handle:
                    content = base64.b64encode(handle.read()).decode("ascii")
                mime_type = "application/octet-stream"
            size_bytes = os.path.getsize(abs_path)
            updated_at = int(os.path.getmtime(abs_path))
        except (OSError, ValueError) as exc:
            # ValueError covers json.JSONDecodeError and UnicodeDecodeError;
            # OSError covers a file that vanished or cannot be opened.
            return make_response(
                "error",
                errors=[
                    {
                        "code": "artifact_unreadable",
                        "message": f"cannot read artifact as {fmt}: {exc}",
                        "details": {"format": fmt},
                    }
                ],
            )

        run_info = self._load_run_info(task_tag)
        return make_response(
            "ok",
            data={
                "artifact_id": artifact_id,
                "format": fmt,
                "mime_type": mime_type,
                "content": content,
                "size_bytes": size_bytes,
                "updated_at": updated_at,
            },
            request_id=run_info.get("request_id"),
            run_id=run_info.get("run_id"),
            task_tag=task_tag,
            duration_ms=(time.time() - started) * 1000.0,
        )

    def artifact_list(self, artifact_id: str) -> Dict[str, Any]:
        """List the direct children of an artifact directory.

        Entries are sorted by name. Each carries ``name``, its own
        ``artifact_id``, ``is_dir`` and ``updated_at``; regular files also
        carry ``size_bytes``. An entry that disappears (or cannot be
        stat-ed) between the directory listing and the metadata lookup is
        left out instead of failing the whole call.

        Returns:
            ``ok`` with ``entries`` and ``count``; ``not_found`` when the
            task or path is missing; ``error`` for an invalid id or when the
            artifact is not a directory.
        """
        started = time.time()
        try:
            task_tag, abs_path = self._resolve_artifact_path(artifact_id)
        except FileNotFoundError:
            return make_response(
                "not_found",
                errors=[{"code": "artifact_missing", "message": "artifact_id not found", "details": {}}],
            )
        except ValueError as exc:
            return make_response(
                "error",
                errors=[{"code": "invalid_artifact", "message": str(exc), "details": {}}],
            )

        if not os.path.exists(abs_path):
            return make_response(
                "not_found",
                errors=[{"code": "artifact_missing", "message": "artifact path not found", "details": {}}],
            )
        if not os.path.isdir(abs_path):
            return make_response(
                "error",
                errors=[{"code": "artifact_not_dir", "message": "artifact is not a directory", "details": {}}],
            )

        entries = []
        task_dir = self._task_dir(task_tag)
        for name in sorted(os.listdir(abs_path)):
            full_path = os.path.join(abs_path, name)
            try:
                is_dir = os.path.isdir(full_path)
                updated_at = int(os.path.getmtime(full_path))
                size_bytes = os.path.getsize(full_path) if os.path.isfile(full_path) else None
            except OSError:
                # Removed or replaced while listing (or a dangling symlink).
                continue
            entry = {
                "name": name,
                "artifact_id": build_artifact_id(task_tag, os.path.relpath(full_path, task_dir)),
                "is_dir": is_dir,
                "updated_at": updated_at,
            }
            if size_bytes is not None:
                entry["size_bytes"] = size_bytes
            entries.append(entry)

        run_info = self._load_run_info(task_tag)
        return make_response(
            "ok",
            data={
                "artifact_id": artifact_id,
                "count": len(entries),
                "entries": entries,
            },
            request_id=run_info.get("request_id"),
            run_id=run_info.get("run_id"),
            task_tag=task_tag,
            duration_ms=(time.time() - started) * 1000.0,
        )

    def _start_local_run(
        self,
        task_tag: str,
        spec: Dict[str, Any],
        spec_path: str,
        params: Dict[str, Any],
    ) -> List[Dict[str, Any]]:
        """Spawn the master process and, optionally, the workers for a task.

        Processes tracked by a previous run of the same ``task_tag`` are killed
        first. The master (``master/master_node.py``) and each worker
        (``worker/worker_node_log.py``) are started with ``sys.executable`` in
        ``self.repo_root``, each in its own session, with stdout and stderr
        appended to a log file under ``<task_dir>/logs``.

        The worker count comes from ``params["worker_num"]`` when given, else
        from the ``ANUM_DEFAULT_WORKER_NUM`` environment variable (falling back
        to the CPU count if it is not an integer), else from the CPU count. It
        is then raised to at least the number of pieces that have a
        ``piece_id``. Workers are assigned to pieces round-robin.

        Args:
            task_tag: Task identifier passed to every spawned process.
            spec: Spec dictionary; ``domain.pieces``, ``precision_model`` and
                ``search_config.evolution.num_mantain`` are read from it.
            spec_path: Path handed to workers via ``--spec_path``.
            params: Optional tuning values (``worker_num``, ``batch_size``,
                ``check_interval``, ``run_time``, ``max_tasks``,
                ``archive_threshold``, ``stats_frequency``,
                ``init_multiplier``, ``force_init``, ``min_dag_count``,
                ``type_string``, ``start_workers``, ``worker_check_interval``,
                ``worker_run_time``).

        Returns:
            One dict per spawned process with ``role``, ``pid``, ``cmd`` and
            ``log_path``; worker entries also carry ``piece_id``.

        Raises:
            Any exception raised while parsing parameters, opening a log file
            or spawning a process. If spawning fails part-way, the processes
            already started by this call are killed before re-raising.
        """
        # Kill any leftover processes from a previous run of the same task_tag
        # to prevent worker over-subscription when force_resubmit is used.
        existing_run = self._load_run_info(task_tag)
        if existing_run:
            for pid in _tracked_alive_pids(existing_run):
                try:
                    os.kill(pid, signal.SIGKILL)
                except OSError:
                    # Best effort: the process may have exited in the meantime.
                    pass

        pieces = spec.get("domain", {}).get("pieces", [])
        piece_ids = [piece.get("piece_id") for piece in pieces if piece.get("piece_id") is not None]

        requested_workers = params.get("worker_num")
        env_workers = os.getenv("ANUM_DEFAULT_WORKER_NUM")
        # Default to host CPU count to avoid under-utilization on single-piece runs.
        cpu_workers = max(1, os.cpu_count() or 1)
        if requested_workers is not None:
            worker_num = max(1, int(requested_workers))
        elif env_workers:
            try:
                worker_num = max(1, int(env_workers))
            except ValueError:
                worker_num = cpu_workers
        else:
            worker_num = cpu_workers
        if piece_ids:
            # Ensure each piece can have at least one worker assigned.
            worker_num = max(worker_num, len(piece_ids))

        batch_size = int(params.get("batch_size", 40))
        check_interval = int(params.get("check_interval", 60))
        run_time = int(params.get("run_time", 0))
        max_tasks = params.get("max_tasks")
        archive_threshold = int(params.get("archive_threshold", 1000))
        stats_frequency = int(params.get("stats_frequency", 10000))
        init_multiplier = int(params.get("init_multiplier", 1))
        force_init = bool(params.get("force_init", False))
        min_dag_count = int(params.get("min_dag_count", 10))
        precision_info = normalize_precision_model(spec.get("precision_model"))
        type_string = str(
            params.get("type_string")
            or precision_format_to_dag_dtype(precision_info["compute_format"])
        )

        logs_dir = os.path.join(self._task_dir(task_tag), "logs")
        os.makedirs(logs_dir, exist_ok=True)

        master_cmd = [
            sys.executable,
            "master/master_node.py",
            "--task_tag",
            task_tag,
            "--worker_num",
            str(worker_num),
            "--batch_size",
            str(batch_size),
            "--check_interval",
            str(check_interval),
            "--archive_threshold",
            str(archive_threshold),
            "--stats_frequency",
            str(stats_frequency),
            "--init_multiplier",
            str(init_multiplier),
            "--min_dag_count",
            str(min_dag_count),
            "--type_string",
            type_string,
        ]
        if run_time:
            master_cmd += ["--run_time", str(run_time)]
        if max_tasks is not None:
            master_cmd += ["--max_tasks", str(int(max_tasks))]
        if force_init:
            master_cmd.append("--force_init")

        processes: List[Dict[str, Any]] = []
        spawned: List[subprocess.Popen] = []

        def _spawn(cmd: List[str], log_path: str) -> int:
            # The child receives its own copy of the log descriptor, so the
            # parent's handle is closed as soon as Popen returns; the `with`
            # block also closes it when Popen raises.
            with open(log_path, "ab") as log_handle:
                proc = subprocess.Popen(
                    cmd,
                    cwd=self.repo_root,
                    stdout=log_handle,
                    stderr=log_handle,
                    start_new_session=True,
                )
            spawned.append(proc)
            return proc.pid

        try:
            master_log_path = os.path.join(logs_dir, "master.log")
            master_pid = _spawn(master_cmd, master_log_path)
            processes.append(
                {
                    "role": "master",
                    "pid": master_pid,
                    "cmd": master_cmd,
                    "log_path": master_log_path,
                }
            )

            if params.get("start_workers", True):
                num_mantain = (
                    spec.get("search_config", {})
                    .get("evolution", {})
                    .get("num_mantain", 40)
                )
                worker_check_interval = int(params.get("worker_check_interval", 5))
                worker_run_time = int(params.get("worker_run_time", 0))
                for idx in range(worker_num):
                    # Round-robin assignment of workers to pieces.
                    piece_id = piece_ids[idx % len(piece_ids)] if piece_ids else None
                    worker_cmd = [
                        sys.executable,
                        "worker/worker_node_log.py",
                        "--task_tag",
                        task_tag,
                        "--check_interval",
                        str(worker_check_interval),
                        "--run_time",
                        str(worker_run_time),
                        "--num_mantain",
                        str(num_mantain),
                        "--spec_path",
                        spec_path,
                    ]
                    worker_label = f"worker_{idx:03d}"
                    if piece_id is not None:
                        worker_cmd += ["--piece_id", str(piece_id)]
                        worker_label += f"_piece_{_sanitize_log_label(str(piece_id))}"
                    worker_log_path = os.path.join(logs_dir, f"{worker_label}.log")
                    worker_pid = _spawn(worker_cmd, worker_log_path)
                    processes.append(
                        {
                            "role": "worker",
                            "pid": worker_pid,
                            "cmd": worker_cmd,
                            "piece_id": piece_id,
                            "log_path": worker_log_path,
                        }
                    )
        except BaseException:
            # The caller never receives the PIDs when start-up fails, so kill
            # what was already spawned instead of leaving untracked processes.
            for proc in spawned:
                try:
                    proc.kill()
                except OSError:
                    pass
            raise

        return processes
