#!/usr/bin/env python3
"""Run the Claude Code operational workflow traces.

This script intentionally shells out to the local `claude` CLI because the
paper evaluates Claude Code as an agentic consumer of Lacuna pages.
"""

from __future__ import annotations

import argparse
import json
import re
import subprocess
import time
from datetime import datetime, timezone
from pathlib import Path


# Directory containing this script; the prompt files are looked up next to it.
ROOT = Path(__file__).resolve().parent

# Prompt file for each evaluation condition, keyed by condition name.
# Insertion order matters: it is the order in which conditions are iterated
# when existing summaries are reloaded.
PROMPTS = {
    "lacuna_navigation": ROOT.joinpath("lacuna_navigation_prompt.md"),
    "pdf_to_chat_baseline": ROOT.joinpath("pdf_to_chat_baseline_prompt.md"),
}


def iter_json_lines(text: str):
    """Yield the decoded value of every line in *text* that is valid JSON.

    Each line is stripped of surrounding whitespace first. Blank lines and
    lines that fail to decode are skipped silently, so non-JSON noise mixed
    into a JSONL stream does not abort the iteration.
    """
    for raw_line in text.splitlines():
        candidate = raw_line.strip()
        if candidate:
            try:
                decoded = json.loads(candidate)
            except json.JSONDecodeError:
                # Not a JSON record; ignore it and keep scanning.
                pass
            else:
                yield decoded


def collect_tool_commands(events: list[dict]) -> list[str]:
    """Return the sanitized Bash commands found anywhere inside *events*.

    Every event is searched recursively through nested dicts and lists. A
    dict counts as a Bash invocation when its ``type`` is ``"tool_use"`` and
    its ``name`` is ``"Bash"``; its ``input["command"]`` is passed through
    ``sanitize_command`` when non-empty. Commands are returned in the order
    they are encountered (a node is examined before its children).
    """

    def bash_commands(node):
        if isinstance(node, dict):
            is_bash_call = node.get("type") == "tool_use" and node.get("name") == "Bash"
            if is_bash_call:
                raw_command = (node.get("input") or {}).get("command")
                if raw_command:
                    yield sanitize_command(raw_command)
            # Keep descending: tool calls may be nested inside other payloads.
            for child in node.values():
                yield from bash_commands(child)
        elif isinstance(node, list):
            for child in node:
                yield from bash_commands(child)

    return [command for event in events for command in bash_commands(event)]


def sanitize_command(command: str, max_chars: int = 1600) -> str:
    """Redact machine-specific paths and shorten bulky commands for reports.

    Args:
        command: Shell command text as captured from a Bash tool call, or a
            command that was already sanitized on an earlier run.
        max_chars: Longest command that is kept verbatim; also the cap
            applied to the retained first line of a shortened command.

    Returns:
        The command with ``/Users/.../.claude/projects/...`` paths replaced
        by ``<claude_tool_result_path>``. Commands that contain ``<<`` or
        exceed *max_chars* are reduced to their first line followed by an
        ``<omitted N chars>`` marker line. The function is idempotent, so
        re-sanitizing a stored result leaves it unchanged.
    """
    command = re.sub(
        # The dot is escaped so only a literal ".claude" directory matches.
        r"/Users/[^\s'\"]+/\.claude/projects/[^\s'\"]+",
        "<claude_tool_result_path>",
        command,
    )
    # Already-shortened input (for example commands reloaded from a saved
    # summary) must pass through untouched; recomputing the marker from the
    # marker itself would overwrite the real omitted-character count.
    if re.fullmatch(r"[^\n]*\n<omitted \d+ chars>", command):
        return command
    if "<<" not in command and len(command) <= max_chars:
        return command
    # Cap the retained line as well, otherwise an over-long single-line
    # command would be kept in full.
    first_line = command.splitlines()[0][:max_chars]
    omitted = len(command) - len(first_line)
    if omitted == 0:
        # Nothing to drop (e.g. a short one-line here-string): no marker.
        return command
    return f"{first_line}\n<omitted {omitted} chars>"


def collect_tool_result_bytes(events: list[dict]) -> int:
    """Return the total UTF-8 size of tool-result text in ``user`` events.

    Only events whose ``type`` is ``"user"`` are inspected. Within their
    ``message["content"]`` list, every ``tool_result`` block contributes the
    encoded length of its ``content``. That payload may be a plain string or
    a list of content blocks; for lists, the ``text`` of each block is
    counted and blocks without text (such as images) contribute nothing.
    Malformed or unexpected shapes are ignored instead of raising.
    """

    def payload_bytes(payload) -> int:
        if isinstance(payload, str):
            return len(payload.encode("utf-8"))
        if isinstance(payload, list):
            return sum(payload_bytes(part) for part in payload)
        if isinstance(payload, dict):
            return payload_bytes(payload.get("text"))
        return 0  # None or any other non-text payload

    total = 0
    for event in events:
        if not isinstance(event, dict) or event.get("type") != "user":
            continue
        message = event.get("message")
        blocks = message.get("content") if isinstance(message, dict) else None
        if not isinstance(blocks, list):
            # A plain-string user message carries no tool results.
            continue
        for block in blocks:
            if isinstance(block, dict) and block.get("type") == "tool_result":
                total += payload_bytes(block.get("content"))
    return total


def parse_result_text(result_text: str):
    """Decode the JSON payload carried by *result_text*, if there is one.

    Three candidates are tried in order, and the first that decodes wins:

    1. the whole text,
    2. the object inside the first fenced ``json`` code block,
    3. the span from the first ``{`` to the last ``}``.

    Returns the decoded value, or *result_text* unchanged when no candidate
    is valid JSON.
    """

    def candidates():
        yield result_text

        fenced = re.search(r"```json\s*(\{.*?\})\s*```", result_text, flags=re.DOTALL)
        if fenced:
            yield fenced.group(1)

        opening = result_text.find("{")
        closing = result_text.rfind("}")
        if -1 < opening < closing:
            yield result_text[opening : closing + 1]

    for candidate in candidates():
        try:
            return json.loads(candidate)
        except json.JSONDecodeError:
            continue
    return result_text


def _select_primary_usage(model_usage: dict, model: str) -> dict:
    """Pick the usage record that belongs to the model requested for the run."""
    records = {key: usage for key, usage in model_usage.items() if isinstance(usage, dict)}
    wanted = model.lower()
    for key, usage in records.items():
        if wanted in str(key).lower():
            return usage
    # Heuristic fallback when the requested name matches no usage key: treat
    # the record with the most output tokens as the primary one.
    return max(records.values(), key=lambda usage: usage.get("outputTokens") or 0, default={})


def run_condition(name: str, output_dir: Path, model: str, max_budget_usd: str) -> dict:
    """Run one evaluation condition through the `claude` CLI and save its trace.

    The prompt registered under *name* in ``PROMPTS`` is piped to ``claude``
    on stdin. The process runs with a fixed scratch directory as its working
    directory, and its stdout is treated as a stream of JSON lines.

    Args:
        name: Condition key; must exist in ``PROMPTS``.
        output_dir: Existing directory that receives ``<name>.jsonl``,
            ``<name>.stderr.txt`` and ``<name>.summary.json``.
        model: Value passed to ``--model``. Also used to choose which entry
            of the result's ``modelUsage`` feeds the ``primary_model_*``
            fields.
        max_budget_usd: Value passed verbatim to ``--max-budget-usd``.

    Returns:
        The summary dict that was also written to ``<name>.summary.json``.
        A non-zero exit status does not raise; it is recorded under
        ``returncode``.
    """
    prompt = PROMPTS[name].read_text()
    run_dir = Path("/private/tmp/lacuna_workflow_eval")
    run_dir.mkdir(parents=True, exist_ok=True)

    cmd = [
        "claude",
        "-p",
        "--model",
        model,
        "--permission-mode",
        "bypassPermissions",
        "--no-session-persistence",
        "--output-format",
        "stream-json",
        "--verbose",
        "--max-budget-usd",
        max_budget_usd,
        "--tools=Bash",
    ]

    started_at = datetime.now(timezone.utc).isoformat()
    t0 = time.monotonic()
    proc = subprocess.run(
        cmd,
        input=prompt,
        text=True,
        cwd=run_dir,
        capture_output=True,
        check=False,
    )
    wall_seconds = time.monotonic() - t0
    finished_at = datetime.now(timezone.utc).isoformat()

    # Persist the raw streams before any parsing so a trace is never lost.
    jsonl_path = output_dir / f"{name}.jsonl"
    stderr_path = output_dir / f"{name}.stderr.txt"
    jsonl_path.write_text(proc.stdout)
    stderr_path.write_text(proc.stderr)

    # Only JSON objects are events; a stray scalar or array line would
    # otherwise break the `.get` lookups below.
    events = [event for event in iter_json_lines(proc.stdout) if isinstance(event, dict)]
    # The last `result` event carries the run-level totals.
    final = next((event for event in reversed(events) if event.get("type") == "result"), {})
    result_text = final.get("result") or ""
    parsed_result = parse_result_text(result_text)
    model_usage = final.get("modelUsage") or {}
    primary_model = _select_primary_usage(model_usage, model)

    summary = {
        "condition": name,
        "started_at": started_at,
        "finished_at": finished_at,
        "wall_seconds": round(wall_seconds, 3),
        "returncode": proc.returncode,
        "claude_duration_ms": final.get("duration_ms"),
        "claude_api_duration_ms": final.get("duration_api_ms"),
        "num_turns": final.get("num_turns"),
        "total_cost_usd": final.get("total_cost_usd"),
        "stop_reason": final.get("stop_reason"),
        "terminal_reason": final.get("terminal_reason"),
        "permission_denials": final.get("permission_denials"),
        "tool_commands": collect_tool_commands(events),
        "tool_result_bytes": collect_tool_result_bytes(events),
        "primary_model_output_tokens": primary_model.get("outputTokens"),
        "primary_model_input_tokens": primary_model.get("inputTokens"),
        "primary_model_cache_read_tokens": primary_model.get("cacheReadInputTokens"),
        "primary_model_cache_creation_tokens": primary_model.get("cacheCreationInputTokens"),
        "result": parsed_result,
    }
    (output_dir / f"{name}.summary.json").write_text(json.dumps(summary, indent=2))
    return summary


def write_markdown(summaries: list[dict], output_dir: Path) -> None:
    """Write ``results.md``, a Markdown report of the given run summaries.

    The report has three parts: a summary table with one row per condition,
    the list of captured Bash commands per condition, and each condition's
    final result as pretty-printed JSON.

    Args:
        summaries: Summary dicts shaped like the ones ``run_condition``
            produces.
        output_dir: Directory that receives ``results.md``. It is shown
            relative to ``ROOT`` in the report when it lies inside it.
    """

    def inline_code(text: str) -> str:
        # A code span must be delimited by more backticks than its longest
        # inner run; otherwise a command using `...` ends the span early.
        runs = re.findall(r"`+", text)
        if not runs:
            return f"`{text}`"
        fence = "`" * (max(map(len, runs)) + 1)
        return f"{fence} {text} {fence}"

    try:
        display_dir = output_dir.resolve().relative_to(ROOT)
    except ValueError:
        # Output lives outside the script directory; show the path as given.
        display_dir = output_dir
    lines = [
        "# Claude Code Workflow Evaluation",
        "",
        f"Run directory: `{display_dir}`",
        "",
        "This is an operational trace, not a human-subjects study. Both conditions used Claude Code in non-interactive mode with the same model and a fixed task.",
        "",
        "## Summary Table",
        "",
        "| Condition | Wall time | Claude turns | Tool calls | Tool text | Output tokens | Estimated blocking model calls | Cost | Outcome |",
        "|---|---:|---:|---:|---:|---:|---:|---:|---|",
    ]
    for summary in summaries:
        # A result that did not parse to a JSON object has no table fields.
        result = summary.get("result") if isinstance(summary.get("result"), dict) else {}
        estimated_calls = result.get("blocking_model_calls_needed", "")
        final_question = str(result.get("final_question") or "").replace("\n", " ")
        if len(final_question) > 120:
            final_question = final_question[:117] + "..."
        # Escape after truncating so an escape sequence is never cut in
        # half; a bare pipe would otherwise start a new table cell.
        final_question = final_question.replace("|", "\\|")
        lines.append(
            f"| `{summary['condition']}` | {summary['wall_seconds']:.1f}s | "
            f"{summary.get('num_turns') or ''} | {len(summary.get('tool_commands') or [])} | "
            f"{int(summary.get('tool_result_bytes') or 0) / 1000:.1f}KB | "
            f"{summary.get('primary_model_output_tokens') or ''} | "
            f"{estimated_calls} | ${float(summary.get('total_cost_usd') or 0):.3f} | "
            f"{final_question} |"
        )

    lines.extend(["", "## Tool Commands", ""])
    for summary in summaries:
        lines.append(f"### {summary['condition']}")
        commands = summary.get("tool_commands") or []
        if not commands:
            lines.append("")
            lines.append("No Bash commands captured.")
            lines.append("")
            continue
        lines.append("")
        for command in commands:
            lines.append(f"- {inline_code(command)}")
        lines.append("")

    lines.extend(["## Final Outputs", ""])
    for summary in summaries:
        lines.append(f"### {summary['condition']}")
        lines.append("")
        lines.append("```json")
        lines.append(json.dumps(summary.get("result"), indent=2))
        lines.append("```")
        lines.append("")

    (output_dir / "results.md").write_text("\n".join(lines))


def main() -> int:
    """Parse CLI options, run or reload every condition, and write the report.

    Without ``--summarize-only`` each condition in ``PROMPTS`` is executed
    via ``run_condition``. With it, the existing ``<name>.summary.json``
    files in the output directory are reloaded, re-normalized and rewritten
    in place. Either way ``results.md`` is regenerated and the summaries are
    printed to stdout as JSON.

    Returns:
        ``0`` on success. A summary file missing in ``--summarize-only``
        mode ends the program through ``parser.error``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--model", default="sonnet")
    parser.add_argument("--max-budget-usd", default="1.00")
    parser.add_argument("--output-dir", type=Path, default=ROOT / "outputs" / "2026-05-08")
    parser.add_argument("--summarize-only", action="store_true")
    args = parser.parse_args()

    args.output_dir.mkdir(parents=True, exist_ok=True)
    if args.summarize_only:
        summaries = []
        for name in PROMPTS:
            summary_path = args.output_dir / f"{name}.summary.json"
            try:
                raw_summary = summary_path.read_text()
            except FileNotFoundError:
                # Report a usage error rather than a traceback when the
                # condition has not been run in this output directory yet.
                parser.error(f"--summarize-only requires an existing summary file: {summary_path}")
            summary = json.loads(raw_summary)
            # Summaries may hold an unparsed result string; try parsing again.
            if isinstance(summary.get("result"), str):
                summary["result"] = parse_result_text(summary["result"])
            summary["tool_commands"] = [sanitize_command(c) for c in summary.get("tool_commands") or []]
            summary_path.write_text(json.dumps(summary, indent=2))
            summaries.append(summary)
    else:
        # Iterate PROMPTS here too, so the set of conditions that is run
        # always matches the set that --summarize-only expects to find.
        summaries = [
            run_condition(name, args.output_dir, args.model, args.max_budget_usd)
            for name in PROMPTS
        ]
    write_markdown(summaries, args.output_dir)
    print(json.dumps({"output_dir": str(args.output_dir), "summaries": summaries}, indent=2))
    return 0


# Script entry point: exit the process with the status returned by main().
if __name__ == "__main__":
    raise SystemExit(main())
