"""
Real Wikipedia runner using BrowserGym with localhost:9999.

This bypasses WebArena's task system and uses BrowserGym directly
to navigate real Wikipedia running on localhost:9999.
"""

from typing import Dict, Any, Optional
import json
import random
import os

from .operator import JudgmentOperator
from .suites import make_artifact_for_suite
from .webarena_violations import detect_violation_for_webarena, classify_action_type
from .multi_agent_coordinator import MultiAgentCoordinator, simulate_multi_agent_action
from .baselines import BaselineOperator


# Decide once, at import time, whether the LLM-backed agent can be used.
# The flag ends up True only when OPENAI_API_KEY is set AND the agent module
# imports cleanly; in every other case the runner has no real agent class.
USE_REAL_AGENT = False
if os.getenv("OPENAI_API_KEY"):
    try:
        from .webarena_agent import WebArenaAgent
    except ImportError as e:
        print(f"Warning: Could not import WebArenaAgent: {e}")
    else:
        USE_REAL_AGENT = True
        print("✓ Using REAL LLM agent (OpenAI API detected)")
else:
    print("⚠️  Using SIMULATED agent (no OpenAI API key)")
    print("   Set OPENAI_API_KEY for real LLM calls!")


def _wiki_real_close_browser(env: Any) -> None:
    """Best-effort shutdown of a browser env dict.

    Closes ``env['context']``, ``env['browser']`` and stops
    ``env['playwright']``, in that order. Each handle is released
    independently so that a failure on one does not leave the others open.
    Missing or ``None`` handles are skipped; anything that is not a non-empty
    dict is ignored.
    """
    if not env or not isinstance(env, dict):
        return
    for key, method_name in (("context", "close"), ("browser", "close"), ("playwright", "stop")):
        handle = env.get(key)
        if handle is None:
            continue
        try:
            getattr(handle, method_name)()
        except Exception as e:
            print(f"Warning: Cleanup failed: {e}")


def _wiki_real_launch_browser(starting_url: str) -> Dict[str, Any]:
    """Start headless Chromium through Playwright and open ``starting_url``.

    Returns a dict holding the ``playwright``, ``browser``, ``context`` and
    ``page`` handles. If any step after the driver has started fails, whatever
    was already started is shut down before the exception propagates, so a
    failed launch (e.g. the initial navigation timing out) does not leave a
    browser process behind.
    """
    # Function-scope import: Playwright is only needed when a browser is
    # actually launched.
    from playwright.sync_api import sync_playwright

    playwright = sync_playwright().start()
    browser = None
    try:
        browser = playwright.chromium.launch(headless=True)
        context = browser.new_context()
        page = context.new_page()
        page.goto(starting_url, timeout=10000)
    except Exception:
        _wiki_real_close_browser({"browser": browser, "playwright": playwright})
        raise

    print("✓ Playwright browser created")
    print(f"✓ Navigated to: {starting_url}")
    return {
        'playwright': playwright,
        'browser': browser,
        'context': context,
        'page': page,
    }


def _wiki_real_score_answer(answer_text: str, gold_answer: Dict[str, Any], constraints: Dict[str, Any]) -> bool:
    """Return True when ``answer_text`` satisfies ``gold_answer`` and ``constraints``.

    Checks, in order: every string value in ``gold_answer`` (except the
    ``citation`` / ``citation_pattern`` fields) occurs case-insensitively in
    the answer; the citation requirement, when enabled; the pipe-separated
    format requirement, when enabled.
    """
    import re

    answer_lower = answer_text.lower()

    # Collect all expected values from gold_answer
    # (handles various formats: "text", "end_date", "capital", etc.)
    expected_values = [
        value.lower()
        for key, value in gold_answer.items()
        if key not in ("citation", "citation_pattern") and isinstance(value, str)
    ]

    if expected_values:
        # Success if ALL expected values appear in the answer
        success = all(val in answer_lower for val in expected_values)
    else:
        # No expected values - default to checking "text" field
        expected = gold_answer.get("text", "").lower()
        expected_tokens = expected.split()
        success = len(expected_tokens) > 0 and all(token in answer_lower for token in expected_tokens)

    # NOTE(review): this reads "require_citations" (plural) while the baseline
    # operator config in run_episode_wikipedia_real reads "require_citation"
    # (singular). Confirm which key the task data uses; left unchanged here so
    # reported success rates do not shift silently.
    if success and constraints.get("require_citations"):
        if "CITATION:" in answer_text or "citation:" in answer_text:
            # Extract all URLs from answer
            urls_in_answer = re.findall(r'http://[^\s,]+', answer_text)

            citation_patterns = gold_answer.get("citation_pattern", [])
            if not citation_patterns:
                # No specific pattern required, just need any citation
                success = len(urls_in_answer) > 0
            elif isinstance(citation_patterns, list):
                # Count how many required patterns appear in the cited URLs
                min_citations = constraints.get("min_citations", 1)
                matched = sum(
                    1 for pattern in citation_patterns
                    if any(pattern.lower() in url.lower() for url in urls_in_answer)
                )
                success = matched >= min_citations
            else:
                # Single pattern (string)
                success = any(citation_patterns.lower() in url.lower() for url in urls_in_answer)
        else:
            success = False  # Missing citation

    if success and constraints.get("require_format"):
        # Require pipe-separated format: "X: A | Y: B | Z: C"
        if "|" not in answer_text or ":" not in answer_text:
            success = False  # Missing required format

    return success


def _wiki_real_store_violations(precedent_store: Any, violations: Any, task_id: Any) -> None:
    """Add every collected violation to ``precedent_store`` as a precedent."""
    for violation in violations:
        precedent_store.add(
            site="wikipedia",
            intent=violation["intent"],
            failure_mode=violation["failure_mode"],
            state=violation["route_ctx"],
            bad_action_text=json.dumps(violation["candidate_action"]),
            approved_action=violation["gold_action"],
            task_id=task_id,  # Track source task for CTHR
        )


def run_episode_wikipedia_real(
    instance=None,
    instances=None,
    run_cfg=None,
    env=None,
    n_agents=1,
    seed=0,
    max_steps=20,
    retrieve_threshold=0.70,
    suite_cfg=None,
    log_trajectory=True,
    persistent_operator=None,
    persistent_precedent_store=None,
    task_index=0,  # For invariant logging
):
    """
    Run one episode against real Wikipedia content on localhost:9999.

    A Playwright Chromium page is driven directly (no WebArena task system).
    Each step: an agent proposes an action, the action is checked for
    violations, optionally routed through a judgment or baseline operator,
    and then executed against the page.

    Args:
        instance: Task object; ``instance_id``, ``prompt``, ``policy_text``
            and ``extra`` are read from it. ``extra`` is a dict with optional
            keys ``constraints``, ``task_type``, ``starting_url``,
            ``gold_answer``, ``agent_roles`` and ``agent_role``.
        instances: If non-empty, its first element replaces ``instance``.
        run_cfg: Optional run configuration. ``method_kind`` selects the
            operator ("judgment_routing" builds a JudgmentOperator;
            "constrained_decoding", "schema_validation", "guardrails" and
            "shield" build a BaselineOperator). Other attributes are read
            with ``getattr`` and fall back to defaults.
        env: Optional dict with ``page``, ``context``, ``browser`` and
            ``playwright`` handles. When None a browser is launched here.
            The env is shut down before this function exits in either case.
        n_agents: Echoed into the result.
        seed: Seeds the RNG used by the simulated agent; echoed into the result.
        max_steps: Maximum number of loop iterations.
        retrieve_threshold: Fallback threshold for a newly built
            JudgmentOperator when ``run_cfg`` does not define one.
        suite_cfg: Not used by this function.
        log_trajectory: When True, a per-step record list is returned under
            ``"trajectory"``.
        persistent_operator: Operator to reuse instead of building a new one.
        persistent_precedent_store: Returned as ``"precedent_store"`` unless a
            new operator exposing ``precedent_store`` is built here.
        task_index: Not used by this function.

    Returns:
        Dict with episode metrics (``ts``, ``vr``, ``im``, ``er``, retrieval
        hit counts and rates, ``cthr``), identifiers, ``success``, the
        ``operator`` and ``precedent_store`` objects, theta info, and
        ``trajectory`` when ``log_trajectory`` is True.

    Raises:
        ValueError: No instance was provided.
        RuntimeError: ``OPENAI_API_KEY`` is unset, or the browser could not be
            launched / the starting URL could not be opened.
    """
    rng = random.Random(seed)

    # Use first instance if instances list provided
    if instances:
        instance = instances[0]

    if instance is None:
        raise ValueError("No instance provided")

    # Extract task details
    task_id = instance.instance_id
    task_prompt = instance.prompt
    policy_text = instance.policy_text
    extra = instance.extra

    constraints = extra.get("constraints", {})
    task_type = extra.get("task_type", "multi_agent")
    starting_url = extra.get("starting_url", "http://localhost:9999/wikipedia_en_all_maxi_2022-05/A/Main_Page")
    gold_answer = extra.get("gold_answer", None)

    print(f"\n{'='*60}")
    print(f"Task: {task_id}")
    print(f"Prompt: {task_prompt[:80]}...")
    print(f"Method: {run_cfg.method_name if run_cfg else 'NR'}")
    print(f"{'='*60}")

    # Refuse to run without an API key rather than silently producing
    # simulated-agent results.
    if not os.getenv('OPENAI_API_KEY'):
        raise RuntimeError(
            "OPENAI_API_KEY not set!\n"
            "Without API key, experiment will use simulated agent.\n"
            "Run: export OPENAI_API_KEY='sk-...'"
        )

    # Build artifact
    is_judgment_routing = bool(run_cfg and run_cfg.method_kind == "judgment_routing")
    load_static = is_judgment_routing

    artifact = make_artifact_for_suite(
        suite_type="webarena_wiki",
        policy_text=policy_text,
        precedents=[],
        resource_policy=None,
        load_static_precedents=load_static,
    )

    # Create agent
    agent = None
    if USE_REAL_AGENT:
        model_name = getattr(run_cfg, "model_name", "gpt-4o-mini") if run_cfg else "gpt-4o-mini"
        few_shot_examples = getattr(run_cfg, "few_shot_examples", None) if run_cfg else None
        api_base = getattr(run_cfg, "api_base", None) if run_cfg else None
        api_key_env = getattr(run_cfg, "api_key_env", None) if run_cfg else None
        # gpt-5-* models only support temperature=1.0 and need higher max_tokens
        is_gpt5 = "gpt-5" in model_name
        agent = WebArenaAgent(
            model=model_name,
            temperature=1.0 if is_gpt5 else 0.7,
            max_tokens=2000 if is_gpt5 else 500,
            few_shot_examples=few_shot_examples,
            api_base=api_base,
            api_key_env=api_key_env,
        )

    # Create or reuse operator
    operator = persistent_operator
    precedent_store = persistent_precedent_store

    print(f"  [RUNNER] Task start: persistent_operator={id(persistent_operator) if persistent_operator else None}, persistent_precedent_store={id(persistent_precedent_store) if persistent_precedent_store else None}")
    if persistent_precedent_store:
        print(f"  [RUNNER] persistent_precedent_store.size() = {persistent_precedent_store.size()}")

    if operator is None and is_judgment_routing:
        use_dynamic = getattr(run_cfg, "dynamic_precedents", False)
        operator = JudgmentOperator(
            artifact=artifact,
            allowed_outcomes=getattr(run_cfg, "allowed_outcomes", ["ALLOW", "EDIT", "ESCALATE", "DENY"]),
            disable_precedents=False,
            dynamic_precedents=use_dynamic,
            retrieve_threshold=getattr(run_cfg, "retrieve_threshold", retrieve_threshold),
            max_precedent_capacity=getattr(run_cfg, "max_precedent_capacity", 100),
            # Theta ablation settings
            enable_theta_updates=getattr(run_cfg, "enable_theta_updates", False),
            theta_lambda=getattr(run_cfg, "theta_lambda", 1.0),
            theta_eta=getattr(run_cfg, "theta_eta", 0.01),
            theta_max_norm=getattr(run_cfg, "theta_max_norm", 5.0),
            # Multi-candidate selection
            enable_multi_candidate=getattr(run_cfg, "enable_multi_candidate", False),
        )

        if hasattr(operator, "precedent_store"):
            precedent_store = operator.precedent_store

            # Load static precedents into precedent_store for semantic matching
            if precedent_store is not None and load_static and task_type == "multi_agent":
                # Choose precedent file based on constraints
                if constraints.get("require_format", False):
                    # Use format conversion precedents
                    precedent_path = "data/webarena/multiagent_format_precedents.json"
                elif constraints.get("min_citations", 3) >= 5:
                    # Use citation precedents (5-citation variant)
                    precedent_path = "data/webarena/multiagent_static_precedents_5citations.json"
                else:
                    # Use citation precedents
                    precedent_path = "data/webarena/multiagent_static_precedents.json"

                if os.path.exists(precedent_path):
                    # Best-effort: a broken precedent file only produces a warning.
                    try:
                        with open(precedent_path, 'r', encoding="utf-8") as f:
                            static_precs = json.load(f)
                        for prec in static_precs:
                            # Add each static precedent to the precedent store
                            precedent_store.add(
                                site=prec.get("site", "wikipedia"),
                                intent=prec.get("intent", ""),
                                failure_mode=prec.get("failure_mode", ""),
                                approved_action=prec.get("approved_action", {}),
                                bad_action_text=prec.get("bad_action_text", ""),
                                state=prec.get("state", {}),
                            )
                        print(f"✓ Added {len(static_precs)} static precedents to precedent store")
                    except Exception as e:
                        print(f"Warning: Failed to load static precedents into store: {e}")

            # Load precedents for transfer learning (if specified)
            load_precedents_from = getattr(run_cfg, "load_precedents_from", None)
            if load_precedents_from and precedent_store is not None:
                if os.path.exists(load_precedents_from):
                    try:
                        with open(load_precedents_from, 'r', encoding="utf-8") as f:
                            transfer_precs = json.load(f)
                        precedent_store.import_from_dict(transfer_precs)
                        print(f"✓ Loaded {len(transfer_precs)} transfer precedents from {load_precedents_from}")
                    except Exception as e:
                        print(f"Warning: Failed to load transfer precedents: {e}")
                else:
                    print(f"Warning: Transfer precedents file not found: {load_precedents_from}")

    # Create baseline operator (for constrained_decoding, schema_validation, guardrails, shield)
    baseline_operator = None
    if operator is None and run_cfg and run_cfg.method_kind in ["constrained_decoding", "schema_validation", "guardrails", "shield"]:
        baseline_config = {
            "require_citation": constraints.get("require_citation", False),
            "require_format": constraints.get("require_format", False),
            "format_pattern": r".*\|.*" if constraints.get("require_format") else None,
            "role_constraints": {
                "RESEARCHER": ["navigate", "click", "inform"],
                "WRITER": ["answer", "inform"],
                "VERIFIER": ["answer", "inform", "noop"],
            } if constraints.get("enforce_role_separation") else {},
        }
        baseline_operator = BaselineOperator(
            baseline_type=run_cfg.method_kind,
            config=baseline_config,
        )
        print(f"✓ Created {run_cfg.method_kind} baseline operator")

    # Launch a Playwright browser on the starting page unless the caller
    # supplied an environment.
    if env is None:
        try:
            env = _wiki_real_launch_browser(starting_url)
        except Exception as e:
            raise RuntimeError(
                f"Failed to create browser environment.\n"
                f"Error: {e}\n"
                f"Make sure Wikipedia is running on localhost:9999"
            ) from e

    # Everything below runs with a live browser; the finally clause guarantees
    # it is shut down even when the agent, operator or page raises mid-episode.
    try:
        # Metrics
        violations = 0
        edits = 0
        escalations = 0
        retrieve_hits = 0
        # Decomposed retrieval tracking (CTHR fix)
        static_hits = 0       # origin_task_id is None (static/seed precedents)
        within_task_hits = 0  # origin_task_id == current task
        cross_task_hits = 0   # origin_task_id != current task AND origin is not None
        steps = 0
        success = False

        # Trajectory logging
        trajectory = [] if log_trajectory else None

        # Collect violations for cross-episode learning
        episode_violations = []

        # Context tracking
        history_tail = []
        visited_urls = [starting_url]
        current_url = starting_url

        # Multi-agent coordinator
        agent_roles_list = extra.get("agent_roles", [])
        coordinator = None
        if len(agent_roles_list) > 1:
            coordinator = MultiAgentCoordinator(agent_roles_list, max_steps)
            print(f"✓ Multi-agent mode: {' → '.join(agent_roles_list)}")

        # Get initial observation from Wikipedia page
        page = env['page']
        try:
            obs_text = page.content()
            current_url = page.url
            print(f"✓ Got initial page content ({len(obs_text)} chars)")
        except Exception as e:
            print(f"Warning: Failed to get page content: {e}")
            obs_text = "Wikipedia homepage"

        def snapshot_step():
            """Build the trajectory record from the current step's state."""
            return {
                "t": t,
                "obs_text_trunc": obs_text[:200] if obs_text else "",
                "url": current_url,
                "candidate_action": candidate_action,
                "operator_outcome": decision_outcome,
                "edited_action": edited_action,
                "would_violate": would_violate,
                "failure_mode": failure_mode,
                "reason": reason,
                "retrieve_hit": retrieved,
                "retrieve_score": decision.retrieve_score if decision and hasattr(decision, 'retrieve_score') else 0.0,
                "budget_left": max_steps - t,
                "executed": executed,
                "agent_role": current_agent_role,
            }

        # Episode loop
        for t in range(max_steps):
            steps += 1

            # Determine current agent role
            if coordinator:
                current_agent_role = coordinator.get_current_agent()
                agent_context = coordinator.get_agent_context()
            else:
                current_agent_role = extra.get("agent_role", "default")
                agent_context = {}

            # Get agent proposal
            if agent is not None:
                # Real LLM agent
                candidate_action = agent.act(
                    task_prompt=task_prompt,
                    obs_text=obs_text,
                    current_url=current_url,
                    visited_urls=visited_urls,
                    step=t,
                    max_steps=max_steps,
                    task_type=task_type,
                    constraints=constraints,
                    agent_role=current_agent_role,
                )
            elif coordinator:
                # Simulated multi-agent proposal
                candidate_action = simulate_multi_agent_action(
                    agent_role=current_agent_role,
                    task_prompt=task_prompt,
                    obs_text=obs_text,
                    current_url=current_url,
                    visited_urls=visited_urls,
                    step=t,
                    max_steps=max_steps,
                    real_agent=None,
                    task_type=task_type,
                    constraints=constraints,
                )
            else:
                # Simulated single-agent proposal
                candidate_action = _simulate_agent_action(obs_text, task_prompt, t, max_steps, rng)

            # Build context for violation detection
            ctx = {
                "t": t,
                "task_type": task_type,
                "url": current_url,
                "visited_urls": visited_urls,
                "history_tail": history_tail[-10:],
                "agent_role": current_agent_role,
                "budget_left": max_steps - t,
                "last_obs": obs_text[:500] if obs_text else "",
                "gold_answer": gold_answer,
                **agent_context,
            }

            # Detect violations
            would_violate, reason, failure_mode = detect_violation_for_webarena(
                action=candidate_action,
                obs=obs_text,
                ctx=ctx,
                constraints=constraints,
            )

            # Routing
            final_action = candidate_action
            decision_outcome = "ALLOW"
            retrieved = False
            edited_action = None
            decision = None

            # Handle baseline operators (constrained_decoding, schema_validation, guardrails, shield)
            if baseline_operator is not None:
                baseline_ctx = {
                    **ctx,
                    "require_format": constraints.get("require_format", False),
                    "require_citation": constraints.get("require_citation", False),
                }
                baseline_result = baseline_operator.route(candidate_action, context=baseline_ctx)
                decision_outcome = baseline_result.get("outcome", "ALLOW")

                if baseline_result.get("action") != candidate_action:
                    edits += 1
                    edited_action = baseline_result["action"]
                    final_action = edited_action

                if decision_outcome in ("BLOCK", "DENY"):
                    final_action = None

            elif operator is not None:
                route_ctx = {
                    **ctx,
                    "would_violate": would_violate,
                    "reason": reason,
                    "failure_mode": failure_mode,
                    "scenario": "webarena_wiki",
                }

                decision = operator.route(candidate_action, context=route_ctx)
                decision_outcome = decision.outcome

                if decision.outcome == "EDIT":
                    edits += 1
                    edited_action = decision.edited_action
                    final_action = edited_action if edited_action is not None else candidate_action

                    # The operator signals a precedent-based repair through its reason text.
                    from_precedent = "retrieved precedent" in (decision.reason or "").lower()

                    # Record repair outcome for theta learning
                    if hasattr(operator, 'record_repair_outcome'):
                        repair_origin = "precedent" if from_precedent else "static"
                        repair_ctx = {
                            "failure_mode": failure_mode,
                            "repair_type": repair_origin,
                            "repair_source": repair_origin,
                            "repair_retrieve_score": getattr(decision, 'retrieve_score', 0.0),
                            "severity": 0.7 if failure_mode in ["missing_citation", "format"] else 0.5,
                        }
                        operator.record_repair_outcome(
                            repair_x_star=final_action,
                            violated_x_tilde=candidate_action,
                            original_action=candidate_action,
                            context=repair_ctx,
                            success=True,  # Assume success on EDIT (no crash)
                        )

                    if from_precedent:
                        retrieve_hits += 1
                        retrieved = True

                        # === INVARIANT C + CTHR DECOMPOSITION ===
                        origin_task_id = getattr(decision, 'source_task_id', None)

                        # Log for Invariant C
                        print(f"    [INVARIANT-C] Retrieval: origin_task_id={origin_task_id}, current_task_id={task_id}")

                        # Decompose into static/within/cross
                        if origin_task_id is None:
                            static_hits += 1  # Static/seed precedent
                        elif origin_task_id == task_id:
                            within_task_hits += 1  # Same task
                        else:
                            cross_task_hits += 1  # Different task (TRUE cross-task)

                elif decision.outcome == "ESCALATE":
                    escalations += 1
                    final_action = None

                elif decision.outcome == "DENY":
                    # Don't increment violations here - counted below via would_violate flag
                    final_action = None

            # Track violations - count whenever a violation is detected
            if would_violate:
                violations += 1

            # Execute action
            executed = final_action is not None

            if executed:
                action_type = classify_action_type(final_action)

                try:
                    if action_type == "search":
                        # Simulate search - for now just continue
                        obs_text = page.content()
                        current_url = page.url

                    elif action_type == "navigate":
                        # Only URLs on the local Wikipedia server are followed.
                        url = final_action.get("url", "")
                        if url and "localhost:9999" in url:
                            page.goto(url, timeout=5000)
                            obs_text = page.content()
                            current_url = page.url
                            if current_url not in visited_urls:
                                visited_urls.append(current_url)

                    elif action_type == "answer":
                        answer_text = final_action.get("text", "")

                        # Update observation to include the answer for next agent
                        obs_text = f"Previous answer from {current_agent_role}: {answer_text}\n\nOriginal page content:\n{obs_text[:1500]}"

                        # Without a gold answer the episode can never be marked successful.
                        if gold_answer:
                            success = _wiki_real_score_answer(answer_text, gold_answer, constraints)

                        # Single-agent: any answer ends the episode. Multi-agent: end on the
                        # VERIFIER's answer or on a correct answer; otherwise let later
                        # roles continue (e.g., VERIFIER adds citation).
                        should_end = not coordinator or current_agent_role == "VERIFIER" or success

                        if should_end:
                            # Log trajectory before breaking
                            if log_trajectory:
                                trajectory.append(snapshot_step())
                            break  # End episode after answer

                    else:
                        # Other action types - just get current content
                        obs_text = page.content()
                        current_url = page.url

                except Exception as e:
                    print(f"Warning: Action execution failed: {e}")
                    obs_text = page.content()
                    current_url = page.url

            # Whether this operator learns new precedents from observed violations.
            # `is not None` (not truthiness) so an empty store still accepts precedents.
            learns = (
                operator is not None
                and operator.dynamic_precedents
                and operator.precedent_store is not None
            )

            # Collect violations for learning
            if learns and would_violate:
                gold_action_for_precedent = _generate_gold_action(
                    candidate_action=candidate_action,
                    failure_mode=failure_mode,
                    task_type=task_type,
                    gold_answer=gold_answer,
                    current_url=current_url,
                    agent_role=current_agent_role,
                )

                if gold_action_for_precedent is not None:
                    episode_violations.append({
                        "candidate_action": candidate_action,
                        "failure_mode": failure_mode,
                        "gold_action": gold_action_for_precedent,
                        "route_ctx": route_ctx,
                        "intent": classify_action_type(candidate_action),
                    })

            # Intra-episode learning (every 5 steps)
            if (t + 1) % 5 == 0 and learns:
                _wiki_real_store_violations(operator.precedent_store, episode_violations, task_id)
                episode_violations.clear()

            # Log trajectory
            if log_trajectory:
                trajectory.append(snapshot_step())

            # Update history
            if executed:
                history_tail.append(json.dumps(final_action))

            # Advance coordinator
            if coordinator:
                coordinator.advance_step()

        # Calculate metrics
        vr = violations / max(1, steps)
        er = escalations / max(1, steps)
        im = edits / max(1, steps)
        ts = 1.0 if success else 0.0
        retrieve_hit_rate = retrieve_hits / max(1, steps)

        # Decomposed retrieval rates (CTHR fix)
        total_retrieval_hits = static_hits + within_task_hits + cross_task_hits
        static_hit_rate = static_hits / max(1, total_retrieval_hits)
        within_task_hit_rate = within_task_hits / max(1, total_retrieval_hits)
        # TRUE CTHR: Only cross-task hits with known origin (excludes static precedents)
        cthr = cross_task_hits / max(1, total_retrieval_hits)

        # Add remaining precedents
        if operator is not None and operator.dynamic_precedents and operator.precedent_store is not None:
            _wiki_real_store_violations(operator.precedent_store, episode_violations, task_id)

        # Get theta info if available
        theta_info = {}
        if operator is not None and hasattr(operator, 'get_theta_info'):
            theta_info = operator.get_theta_info()

        result = {
            "suite_name": instance.suite if hasattr(instance, "suite") else "webarena_wiki_real",
            "task_id": task_id,
            "method": run_cfg.method_name if run_cfg else "NR",
            "seed": seed,
            "n_agents": n_agents,
            "n_steps": steps,
            "ts": float(ts),
            "vr": float(vr),
            "im": float(im),
            "er": float(er),
            "retrieve_hit_rate": float(retrieve_hit_rate),
            # Decomposed retrieval metrics (CTHR fix)
            "static_hits": static_hits,
            "within_task_hits": within_task_hits,
            "cross_task_hits": cross_task_hits,
            "static_hit_rate": float(static_hit_rate),
            "within_task_hit_rate": float(within_task_hit_rate),
            "cthr": float(cthr),  # TRUE Cross-Task Hit Rate (excludes static)
            "success": success,
            "failure_type": "success" if success else "fail_incomplete",
            "operator": operator,
            "precedent_store": precedent_store,
            # Theta ablation info
            "theta_updates": theta_info.get("theta_updates_count", 0),
            "theta_values": theta_info.get("theta", [0, 0, 0, 0]),
        }

        if log_trajectory:
            result["trajectory"] = trajectory

        print(f"\nCompleted: {steps} steps, Success: {success}, TS: {ts:.2f}")

        return result
    finally:
        # Clean up playwright (runs on success and on failure)
        _wiki_real_close_browser(env)


def _simulate_agent_action(obs_text: str, task_prompt: str, t: int, max_steps: int, rng: random.Random) -> Dict[str, Any]:
    """Produce a stand-in agent action for runs without a real LLM agent.

    During the last two steps of the budget (``t >= max_steps - 2``) a fixed,
    citation-free answer is returned. Before that, ``rng`` picks between a
    canned search and a click on a random element id in 1..100.
    ``obs_text`` and ``task_prompt`` are accepted but not consulted.
    """
    out_of_exploration_budget = t >= max_steps - 2
    if out_of_exploration_budget:
        return {"action_type": "answer", "text": "Sample answer without citation"}

    # One draw decides the action kind; a click consumes a second draw.
    coin = rng.random()
    if coin >= 0.5:
        return {"action_type": "click", "bid": rng.randint(1, 100)}
    return {"action_type": "search", "query": "relevant search query"}


def _convert_to_pipe_format(text: str) -> str:
    """
    Convert a prose answer to pipe-separated ``Key: value`` format.

    CONSTRAINT: This is a SYNTAX-LEVEL transformation only.
    - Keys and values are spans copied out of the input text
    - No new facts, entities, or semantic content are added
    - The only inserted tokens are structural markers (``:``, ``|`` and a
      ``CITATIONS:`` label when the input carried a ``CITATION:`` suffix)

    Example:
        Input:  "Einstein was born in 1879, Newton in 1643, and Darwin in 1809."
        Output: "Einstein: 1879 | Newton: 1643 | Darwin: 1809"

    Strategy:
        1. Split off a trailing ``CITATION:`` part so it can be re-attached.
        2. Split the remaining text into phrases on ", " / " and " (only when
           followed by a letter, so "6,650" stays intact) and drop a leading
           "the" / "and" / "and the".
        3. For each phrase try, in order: "X - Y", "X (Y)", verb patterns
           ("X was born in Y", "X is about Y", ...), and "X in <digits>".
           Keys made of preamble words ("the", "lengths", ...) are rejected.
        4. If nothing matched, retry on a coarser split with a generic
           "X is/was/... Y" pattern.
        5. If still nothing matched, fall back to replacing ", " / " and "
           with " | " and putting a colon after the first word of each part.

    Args:
        text: Prose answer, optionally ending in ``CITATION: <source>``.

    Returns:
        The reformatted answer string, with the citation (if any) appended
        as `` | CITATIONS: <source>``.
    """
    import re

    # Preserve citations if present
    citation_part = ""
    if "CITATION:" in text:
        parts = text.split("CITATION:", 1)
        text = parts[0].strip()
        citation_part = " | CITATIONS: " + parts[1].strip()

    # Preamble words to reject (not entities)
    preamble_words = {
        'the', 'lengths', 'boiling', 'melting', 'populations', 'capitals',
        'founding', 'years', 'points', 'temperatures', 'heights', 'weights',
        'densities', 'speeds', 'rates', 'counts', 'numbers', 'amounts'
    }

    def is_preamble(key: str) -> bool:
        # Compare whole words. A substring test would wrongly reject real
        # entities such as "Athens" (contains "the") or "Euphrates"
        # (contains "rates").
        return any(word in preamble_words for word in key.lower().split())

    # Split by comma when followed by word (but not number)
    # This preserves commas in numbers (6,650)
    phrases = re.split(r',\s+(?=[a-zA-Z])|(?:\s+and\s+)(?=[a-zA-Z])', text)

    # Remove leading articles and conjunctions. "and the" must be listed
    # before "and": regex alternation takes the first alternative that
    # matches, so the shorter one would otherwise leave a stray "the".
    cleaned_phrases = []
    for p in phrases:
        p = re.sub(r'^(?:and\s+the|the|and)\s+', '', p.strip(), flags=re.IGNORECASE)
        if p:
            cleaned_phrases.append(p)
    phrases = cleaned_phrases

    items = []
    for phrase in phrases:
        # Pattern 1: Dash-separated "X - Y" (most specific)
        match = re.search(r'([A-Za-z][a-z]+(?:\s+[A-Za-z][a-z]+)*)\s+-\s+(.+)', phrase)
        if match:
            key = match.group(1).strip()
            value = match.group(2).strip().rstrip('.')
            if not is_preamble(key):
                items.append(f"{key}: {value}")
            continue

        # Pattern 2: Parenthetical "X (Y)"
        match = re.search(r'([A-Z][a-z]+)\s*\(([^\)]+)\)', phrase)
        if match:
            key = match.group(1).strip()
            value = match.group(2).strip()
            items.append(f"{key}: {value}")
            continue

        # Pattern 3: "X was/has born/founded/traced/roots Y"
        # Also handles: "X's origins can be traced back to Y"
        match = re.search(r"([A-Z][a-z]+(?:\s+of\s+[A-Z][a-z]+|\s+[A-Z][a-z]+)*)'?s?\s+(?:origins|roots)?\s*(?:was|were|has|have|can be)?\s*(?:born in|founded in|traced back to|traced to|dating back to)\s+(.+)", phrase)
        if not match:
            match = re.search(r'([A-Z][a-z]+(?:\s+of\s+[A-Z][a-z]+|\s+[A-Z][a-z]+)*)\s+(?:was|were|has|have)\s+(?:born in|founded in|established in|created in)\s+(.+)', phrase)
        if not match:
            # Pattern for "X is/has about/approximately/around Y"
            match = re.search(r'([A-Z][a-z]+(?:\s+[A-Z][a-z]+)*)\s+(?:is|has|are)\s+(?:about|approximately|around)?\s*(.+)', phrase)
        if match:
            key = match.group(1).strip()
            value = match.group(2).strip().rstrip('.')
            if not is_preamble(key):
                items.append(f"{key}: {value}")
            continue

        # Pattern 4: "X in Y" (for short forms like "Newton in 1643")
        match = re.search(r'([A-Z][a-z]+(?:\s+[A-Z][a-z]+)*)\s+in\s+(\d+)', phrase)
        if match:
            key = match.group(1).strip()
            value = match.group(2).strip()
            if not is_preamble(key):
                items.append(f"{key}: {value}")
            continue

    # If no patterns matched, try simple comma/period split (STRICT - proper nouns only)
    if not items:
        sentences = re.split(r'[,;]|\sand\s|\.|(?:\s+CITATION)', text)
        for sent in sentences:
            sent = sent.strip()
            if not sent or len(sent) < 5:
                continue
            # Only match proper nouns (capitalized words) - skip preamble
            match = re.search(r'([A-Z][a-z]+(?:\s+[A-Z][a-z]+)?)\s+(?:is|was|are|were|has|have)\s+(.+)', sent)
            if match:
                key = match.group(1).strip()
                value = match.group(2).strip()
                if is_preamble(key):
                    continue
                items.append(f"{key}: {value}")

    if items:
        return ' | '.join(items) + citation_part

    # Last resort: the text has no recognizable key/value structure. If it
    # already contains ':' or '|' leave it alone; otherwise insert pipes at
    # list separators and a colon after the first word of each section.
    if ':' not in text and '|' not in text:
        text = text.replace(', ', ' | ').replace(' and ', ' | ')
        if '|' in text:
            formatted_parts = []
            for part in text.split('|'):
                part = part.strip()
                if not part:
                    continue
                words = part.split()
                if len(words) > 1:
                    formatted_parts.append(f"{words[0]}: {' '.join(words[1:])}")
                else:
                    formatted_parts.append(part)
            text = ' | '.join(formatted_parts)
    return text.strip() + citation_part


def _convert_to_numbered_format(text: str) -> str:
    """
    Convert prose answer to numbered list format.
    Template B: First: X | Second: Y | Third: Z

    The text is split into phrases on ", " / " and " (only when followed by
    a letter), a leading "the" / "and" / "and the" is dropped from each
    phrase, and phrases of three characters or fewer are skipped. Each kept
    phrase is labelled with the next ordinal ("First" .. "Eighth", then
    "Item9", "Item10", ...), so skipped phrases never leave a gap in the
    numbering. A trailing ``CITATION: <source>`` is re-attached as
    `` | CITATIONS: <source>``.

    Args:
        text: Prose answer, optionally ending in ``CITATION: <source>``.

    Returns:
        The numbered, pipe-separated string. If no phrase qualifies, the
        whole text is returned with a ``First: `` prefix.
    """
    import re

    # Preserve citations if present
    citation_part = ""
    if "CITATION:" in text:
        parts = text.split("CITATION:", 1)
        text = parts[0].strip()
        citation_part = " | CITATIONS: " + parts[1].strip()

    ordinals = ["First", "Second", "Third", "Fourth", "Fifth", "Sixth", "Seventh", "Eighth"]

    # Split by comma or "and"
    phrases = re.split(r',\s+(?=[a-zA-Z])|(?:\s+and\s+)(?=[a-zA-Z])', text)

    items = []
    for phrase in phrases:
        # "and the" must come before "and" in the alternation; otherwise the
        # shorter alternative wins and a stray "the" is left behind.
        phrase = re.sub(r'^(?:and\s+the|the|and)\s+', '', phrase.strip(), flags=re.IGNORECASE)
        if len(phrase) <= 3:
            continue
        # Number by emitted position, not by raw phrase index, so a skipped
        # phrase does not make the list start at "Second".
        position = len(items)
        ordinal = ordinals[position] if position < len(ordinals) else f"Item{position + 1}"
        items.append(f"{ordinal}: {phrase.rstrip('.')}")

    if items:
        return ' | '.join(items) + citation_part

    # Fallback: just add "First:" prefix
    return f"First: {text.strip()}" + citation_part


def _convert_to_bullet_format(text: str) -> str:
    """
    Convert prose answer to bullet format.
    Template C: * X: val * Y: val

    The text is split into phrases on ", " / " and " (only when followed by
    a letter), a leading "the" / "and" / "and the" is dropped from each
    phrase, and phrases of three characters or fewer are skipped. A phrase
    of the form "<Capitalized words> is|was|has|in <rest>" becomes
    ``* <key>: <rest>``; any other phrase becomes ``* <phrase>``. Bullets
    are joined with single spaces. A trailing ``CITATION: <source>`` is
    re-attached as `` [Source: <source>]``.

    Args:
        text: Prose answer, optionally ending in ``CITATION: <source>``.

    Returns:
        The bulleted string. If no phrase qualifies, the whole text is
        returned as a single bullet.
    """
    import re

    # Preserve citations if present
    citation_part = ""
    if "CITATION:" in text:
        parts = text.split("CITATION:", 1)
        text = parts[0].strip()
        citation_part = " [Source: " + parts[1].strip() + "]"

    # Split by comma or "and"
    phrases = re.split(r',\s+(?=[a-zA-Z])|(?:\s+and\s+)(?=[a-zA-Z])', text)

    items = []
    for phrase in phrases:
        # "and the" must come before "and" in the alternation; otherwise the
        # shorter alternative wins and bullets start with a stray "the".
        phrase = re.sub(r'^(?:and\s+the|the|and)\s+', '', phrase.strip(), flags=re.IGNORECASE)
        if len(phrase) <= 3:
            continue
        # Try to extract key-value
        match = re.search(r'([A-Z][a-z]+(?:\s+[A-Z][a-z]+)*)\s+(?:is|was|has|in)\s+(.+)', phrase)
        if match:
            key = match.group(1).strip()
            val = match.group(2).strip().rstrip('.')
            items.append(f"* {key}: {val}")
        else:
            items.append(f"* {phrase.rstrip('.')}")

    if items:
        return ' '.join(items) + citation_part

    # Fallback
    return f"* {text.strip()}" + citation_part


def _generate_citation_variants(text: str, url: str) -> list:
    """
    Build three differently formatted citations for the same answer text.

    Returns a list of ``(formatted_text, template_id)`` tuples, always in
    this order:

    - ``citation_append``: ``text`` followed by ``CITATION: <url>``, or
      ``text`` unchanged when it already contains ``CITATION:``.
    - ``citation_inline``: an existing ``CITATION:`` marker rewritten as
      ``[Source: ...]`` (closing bracket added if missing), otherwise
      ``text`` followed by ``[Source: <url>]``.
    - ``citation_see``: the text before any ``CITATION:`` marker followed
      by ``(See: <title>)``, where the title is the URL segment after
      ``/A/`` with underscores turned into spaces, or "Article" when the
      URL has no such segment.
    """
    import re

    # Article title comes from the path segment after "/A/".
    title_match = re.search(r'/A/([^/#]+)', url)
    article_title = title_match.group(1).replace('_', ' ') if title_match else "Article"

    already_cited = "CITATION:" in text

    # Template A: Append CITATION: url
    appended = text if already_cited else f"{text} CITATION: {url}"

    # Template B: Inline [Source: url]
    inline = text.replace("CITATION:", "[Source:").rstrip()
    if "[Source:" not in inline:
        inline = f"{text} [Source: {url}]"
    elif not inline.endswith("]"):
        inline = inline + "]"

    # Template C: See: article_title
    body = text.split("CITATION:")[0].strip() if already_cited else text
    see_also = f"{body} (See: {article_title})"

    return [
        (appended, "citation_append"),
        (inline, "citation_inline"),
        (see_also, "citation_see"),
    ]


def _generate_gold_action(
    candidate_action: Dict[str, Any],
    failure_mode: str,
    task_type: str,
    gold_answer: Optional[Dict[str, Any]],
    current_url: str,
    agent_role: str = "",
) -> Optional[Dict[str, Any]]:
    """Build the corrected ("gold") action for a violating candidate action.

    Rules are checked in this order:

    1. ``failure_mode == "role_leakage"`` on a ``navigate`` action from a
       WRITER or VERIFIER: return an ``answer`` action with a fixed
       role-violation message.
    2. ``answer`` action whose failure mode mentions "format" or "pipe"
       (case-insensitive): return the answer text run through
       ``_convert_to_pipe_format``.
    3. ``answer`` action with ``failure_mode == "missing_citation"``: return
       the answer text with ``CITATION: <url>`` appended, using
       ``current_url`` or a placeholder article URL when it is empty.
    4. Anything else: return a ``navigate`` action to ``current_url``, or to
       the Main_Page URL when ``current_url`` is empty.

    ``task_type`` and ``gold_answer`` are accepted but not used here.
    """
    kind = candidate_action.get("action_type", "")

    # WRITER/VERIFIER trying to navigate should use answer instead.
    leaked_navigation = (
        failure_mode == "role_leakage"
        and kind == "navigate"
        and agent_role in ("WRITER", "VERIFIER")
    )
    if leaked_navigation:
        return {
            "action_type": "answer",
            "text": "[ROLE VIOLATION: Use ANSWER action to provide your response]"
        }

    if kind == "answer":
        answer_text = candidate_action.get("text", "")
        mode = failure_mode.lower()

        # Format violations take precedence over citation violations.
        if "format" in mode or "pipe" in mode:
            return {
                "action_type": "answer",
                "text": _convert_to_pipe_format(answer_text)
            }

        if failure_mode == "missing_citation":
            source = current_url or "http://localhost:9999/wikipedia_en_all_maxi_2022-05/A/Article"
            return {
                "action_type": "answer",
                "text": f"{answer_text} CITATION: {source}"
            }

    # Default: navigate action (for navigation errors, etc.)
    target = current_url or "http://localhost:9999/wikipedia_en_all_maxi_2022-05/A/Main_Page"
    return {"action_type": "navigate", "url": target}
