"""Stage 1.5: repair Gemini's char offsets using context-based anchoring.

Gemini's `char_offset_start` / `char_offset_end` are frequently wrong because
the model cannot reliably count characters through LaTeX escapes and Unicode
forms. The `context` field it provides is, however, semantically accurate.

This stage reads labeled_numbers.jsonl, relocates each candidate using
`fix_offset_by_context`, and writes labeled_numbers_fixed.jsonl with verified
offsets. Candidates whose offset cannot be recovered are dropped (so the pool
that reaches `select_candidates.py` is clean).
"""

from __future__ import annotations

import argparse
import json
import sys
from pathlib import Path

# Directory one level above this file's own directory: parents[0] is the
# directory containing this file, parents[1] is that directory's parent.
ROOT = Path(__file__).resolve().parents[1]
# Prepend ROOT to sys.path unless it is already listed. Presumably this lets
# the `number_edit` package import below resolve when the file is executed
# directly as a script — TODO confirm the package lives under ROOT.
if str(ROOT) not in sys.path:
    sys.path.insert(0, str(ROOT))

from number_edit.common import fix_offset_by_context, load_jsonl, write_jsonl


def _coerce_offset(raw: object) -> int:
    """Return *raw* as an integer offset, or -1 if it is missing or malformed.

    The labels are model output, so an offset may arrive as ``null``, a float,
    or a non-numeric string. -1 is the same sentinel already used when the key
    is absent, so such a candidate is still handed to `fix_offset_by_context`
    instead of aborting the whole run with a TypeError/ValueError.
    """
    if isinstance(raw, (int, float, str)):
        try:
            return int(raw)
        except (ValueError, OverflowError):
            # e.g. "12.5", "", NaN or infinity
            return -1
    return -1


def _relocate_candidate(cand: dict, stmt: str, proof: str) -> "tuple[dict, bool] | None":
    """Re-anchor one candidate against the text named by its ``source`` field.

    Returns ``(fixed_candidate, was_ok)`` where ``fixed_candidate`` is a shallow
    copy of *cand* carrying the offsets returned by `fix_offset_by_context` and
    ``was_ok`` tells whether the incoming offsets already sliced out exactly
    the candidate's value. Returns ``None`` when the candidate must be dropped:
    its source is neither "statement" nor "proof", the selected text is empty,
    or `fix_offset_by_context` returned ``None``.
    """
    source = cand.get("source", "")
    if source == "statement":
        text = stmt
    elif source == "proof":
        text = proof
    else:
        text = ""
    if not text:
        return None

    value = str(cand.get("value", "")).strip()
    # A null/absent context is normalised to "" so the helper always gets a str.
    ctx = str(cand.get("context") or "")
    start = _coerce_offset(cand.get("char_offset_start", -1))
    end = _coerce_offset(cand.get("char_offset_end", -1))

    # Checked before relocating so the summary can separate offsets that were
    # already right from those that had to be repaired.
    was_ok = 0 <= start < end <= len(text) and text[start:end] == value

    result = fix_offset_by_context(text, value, ctx, start, end)
    if result is None:
        return None

    new_start, new_end = result
    fixed = dict(cand)  # copy so the loaded label row is not mutated
    fixed["char_offset_start"] = new_start
    fixed["char_offset_end"] = new_end
    return fixed, was_ok


def main() -> None:
    """Command-line entry point: rewrite labeled offsets and print a summary.

    Arguments (parsed from ``sys.argv``):
      --input   dataset JSONL; rows are indexed by their ``name`` field and
                supply ``informal_statement`` / ``informal_proof`` text.
      --labels  JSONL of label rows, each with ``problem_name`` and ``numbers``.
      --output  destination JSONL for the rows with corrected offsets.

    Label rows whose problem is not present in the dataset are written through
    unchanged, and their candidates are not included in the summary counts.
    """
    parser = argparse.ArgumentParser(description="Fix numeric char offsets via context anchoring")
    parser.add_argument("--input", required=True, help="Original dataset JSONL (for text lookup)")
    parser.add_argument("--labels", default="./number_edit/data/labeled_numbers.jsonl")
    parser.add_argument("--output", default="./number_edit/data/labeled_numbers_fixed.jsonl")
    args = parser.parse_args()

    problems = {row["name"]: row for row in load_jsonl(args.input)}
    labeled = load_jsonl(args.labels)

    fixed_rows = []
    n_in = n_out = n_dropped = n_already_ok = n_recovered = 0

    for row in labeled:
        problem = problems.get(row.get("problem_name", ""))
        if problem is None:
            fixed_rows.append(row)  # keep as-is, select will skip later
            continue

        stmt = str(problem.get("informal_statement", "") or "")
        proof = str(problem.get("informal_proof", "") or "")

        kept = []
        for cand in row.get("numbers", []) or []:
            n_in += 1
            # A non-dict entry cannot carry offsets; count it as dropped
            # rather than crashing on `.get`.
            outcome = _relocate_candidate(cand, stmt, proof) if isinstance(cand, dict) else None
            if outcome is None:
                n_dropped += 1
                continue

            fixed, was_ok = outcome
            if was_ok:
                n_already_ok += 1
            else:
                n_recovered += 1
            kept.append(fixed)
            n_out += 1

        new_row = dict(row)
        new_row["numbers"] = kept
        fixed_rows.append(new_row)

    write_jsonl(args.output, fixed_rows)

    print("=== Offset Fix Summary ===")
    print(f"  candidates in:         {n_in}")
    print(f"  candidates out:        {n_out}")
    print(f"  already correct:       {n_already_ok}")
    print(f"  recovered via context: {n_recovered}")
    print(f"  dropped (unrecoverable): {n_dropped}")
    # max(..., 1) avoids division by zero when there were no candidates.
    print(f"  fix rate:              {100 * n_out / max(n_in, 1):.1f}%")
    print(f"  output: {args.output}")


# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()
