#!/usr/bin/env python3
"""
Reproduce WebArena-Wikipedia experiments from the paper.

Experiments:
- Table 3: Format constraint enforcement (NO vs JO)
- Table 5: Recurrent pattern learning
"""

import sys
import argparse
from pathlib import Path

sys.path.insert(0, str(Path(__file__).parent.parent))
sys.path.insert(0, str(Path(__file__).parent.parent / "src"))

from jo.precedent_store import PrecedentStore


def pr(*args, **kwargs):
    """Print like the built-in ``print`` and flush the output immediately.

    Accepts exactly the positional and keyword arguments of ``print``.
    ``sys.stdout`` is always flushed after printing. When a ``file=``
    destination other than ``sys.stdout`` is given, that stream is flushed
    as well, so the text just written does not sit in its buffer.
    """
    print(*args, **kwargs)
    sys.stdout.flush()

    # The text may have gone to a different stream than stdout; flush that
    # one too. Destinations that only implement ``write`` are left alone.
    target = kwargs.get("file")
    if target is not None and target is not sys.stdout:
        flush = getattr(target, "flush", None)
        if callable(flush):
            flush()


# Experiment settings taken from the paper
N_TASKS = 100
N_SEEDS = 2  # total N = N_TASKS * N_SEEDS = 100 * 2 = 200
MAX_STEPS = 10
MODEL = "gpt-4o-mini"


# Format constraints for Wikipedia answers
WIKIPEDIA_CONSTRAINTS = dict(
    enumeration_format=True,  # values are pipe-separated
    citation_required=True,   # the source URL must be cited
    verbatim_quotes=True,     # quoted text must match the source
)


def run_table3_wikipedia(seeds=None):
    """Print the run plan for Table 3 (Wikipedia format constraint enforcement).

    Walks the ``NO``, ``JO_static`` and ``JO_dynamic`` conditions and, for
    every seed, announces a batch of ``N_TASKS`` Wikipedia tasks. The task
    execution itself is not wired in yet (see the TODO in the loop), so the
    function currently only emits progress lines.

    Args:
        seeds: Seeds to iterate over. A falsy value (``None`` or an empty
            sequence) selects ``range(N_SEEDS)``.
    """
    rule = "=" * 70
    for line in (rule, "TABLE 3: Wikipedia Format Constraint Enforcement", rule):
        pr(line)

    active_seeds = seeds if seeds else list(range(N_SEEDS))
    conditions = ("NO", "JO_static", "JO_dynamic")

    for condition in conditions:
        pr(f"\n--- Condition: {condition} ---")

        for seed in active_seeds:
            pr(f"  Seed {seed}: Running {N_TASKS} Wikipedia tasks...")

            # TODO: the actual experiment would
            #   1. create a JudgmentOperator in the mode matching `condition`,
            #   2. run the browsergym Wikipedia tasks,
            #   3. track the success rate and the violation rate.
            # Sketch:
            #   results = run_wikipedia_tasks(condition, n_tasks=N_TASKS, seed=seed)
            #   pr(f"  Seed {seed}: Success={results['success']:.1%}, VR={results['vr']:.1%}")


def run_table5_recurrent(seeds=None):
    """Print the run plan for Table 5 (recurrent pattern learning).

    For each of the ``NO`` and ``JO_dynamic`` conditions, goes through the
    task templates and announces one template-grouped run per seed. The
    runs themselves are not wired in yet (see the TODO in the loop), so the
    function currently only emits progress lines.

    Args:
        seeds: Seeds to iterate over. A falsy value (``None`` or an empty
            sequence) selects ``range(N_SEEDS)``.
    """
    rule = "=" * 70
    for line in (rule, "TABLE 5: Recurrent Pattern Learning", rule):
        pr(line)

    active_seeds = seeds if seeds else list(range(N_SEEDS))

    # Tasks are grouped by template so recurrence can be analysed per group.
    templates = (
        "person_birth_year",
        "person_birth_location",
        "country_capital",
        "company_founding_year",
    )
    conditions = ("NO", "JO_dynamic")

    for condition in conditions:
        pr(f"\n--- Condition: {condition} ---")

        for template in templates:
            pr(f"\n  Template: {template}")

            for seed in active_seeds:
                pr(f"    Seed {seed}: Running template-grouped tasks...")

                # TODO: track precedent reuse across similar tasks.
                # Sketch:
                #   store = PrecedentStore()
                #   results = run_template_tasks(template, condition, seed, store)
                #   pr(f"    Precedent reuse rate: {results['reuse_rate']:.1%}")


if __name__ == "__main__":
    # Command-line entry point: pick which table(s) to reproduce and,
    # optionally, which seeds to use.
    parser = argparse.ArgumentParser(description="Run Wikipedia experiments")
    parser.add_argument("--experiment", type=str, default="all",
                       choices=["all", "table3", "table5"],
                       help="Which experiment to run")
    parser.add_argument("--seeds", type=str, default=None,
                       help="Comma-separated seeds")
    args = parser.parse_args()

    # No --seeds (or an empty value) leaves seeds as None, which lets each
    # experiment fall back to its default seed range.
    seeds = None
    if args.seeds:
        try:
            seeds = [int(s) for s in args.seeds.split(",")]
        except ValueError:
            # Report malformed input as a usage error (exit status 2)
            # instead of letting a raw traceback escape.
            parser.error(
                f"--seeds must be comma-separated integers, got {args.seeds!r}"
            )

    if args.experiment in ["all", "table3"]:
        run_table3_wikipedia(seeds)
    if args.experiment in ["all", "table5"]:
        run_table5_recurrent(seeds)
