#!/usr/bin/env python3
"""
Verify that the generated BACKEND TEST CASES fully cover – and only cover –
the requirements of each WebGen-Bench instruction.

Differences from the old DB-verification script
-----------------------------------------------
1. Works on `backend_test_cases` instead of `data_structures`.
2. Produces two new keys in every sample:
   - backend_adjustment_suggestions   – list[str]
   - backend_adjusted_test_cases      – list[dict | str]
3. New prompt template geared toward test-case coverage rather than tables.
4. Keeps the same error handling, multi-threading and I/O conventions.
"""

import json
import multiprocessing
import os
import re
import sys
import time
from concurrent.futures import ThreadPoolExecutor, as_completed
from pathlib import Path

from tqdm import tqdm

# --------------------------------------------------------------- #
# Make utils importable (utils.py is one directory above)         #
# --------------------------------------------------------------- #
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from utils import llm_generation  # noqa: E402

# ----------  paths  ---------- #
# GEN_PATH is the input read by main(); OUT_PATH is where main() writes the
# verified samples.  Both are relative paths.
GEN_PATH = "src/generate_fullstack_tests/test_backend_model-gen.jsonl"
OUT_PATH = "src/generate_fullstack_tests/test_backend_model-verify.json"

# Model name passed to llm_generation() by llm_call().
MODEL_ID = "Qwen3-Coder-480B-A35B-Instruct-FP8"

# --------------------------------------------------------------- #
#                 I / O    H E L P E R S                          #
# --------------------------------------------------------------- #
def load_json(path):
    """Read the UTF-8 text file at *path* and return its decoded JSON value."""
    with open(path, mode="r", encoding="utf-8") as handle:
        raw_text = handle.read()
    return json.loads(raw_text)

def dump_json(obj, path):
    """Write *obj* to *path* as 2-space-indented JSON, keeping non-ASCII text.

    Any missing parent directories of *path* are created beforehand.
    """
    parent_dir = Path(path).parent
    parent_dir.mkdir(parents=True, exist_ok=True)
    with open(path, mode="w", encoding="utf-8") as sink:
        json.dump(obj, sink, indent=2, ensure_ascii=False)

def load_jsonl(path: str):
    """Return a list with one decoded JSON value per non-blank line of *path*.

    Blank or whitespace-only lines (e.g. a stray empty line at the end of the
    file) are skipped instead of being handed to ``json.loads``, which would
    reject them with ``json.JSONDecodeError``.

    Raises:
        json.JSONDecodeError: if a non-blank line is not valid JSON.
    """
    with open(path, "r", encoding="utf-8") as f:
        return [json.loads(line) for line in f if line.strip()]

def dump_jsonl(objs, path: str):
    """Write an iterable of JSON-serialisable objects to *path*, one per line.

    Missing parent directories of *path* are created first, matching what
    ``dump_json`` does, so writing into a fresh output directory works.
    Non-ASCII characters are written as-is (``ensure_ascii=False``).
    """
    Path(path).parent.mkdir(parents=True, exist_ok=True)
    with open(path, "w", encoding="utf-8") as f:
        f.writelines(json.dumps(obj, ensure_ascii=False) + "\n" for obj in objs)

# --------------------------------------------------------------- #
#                P R O M P T    T E M P L A T E                   #
# --------------------------------------------------------------- #
# Prompt text used by build_validation_prompt().  It is rendered with
# str.format, so the only live placeholders are {instruction} and
# {test_cases}; the doubled braces ({{ and }}) are escapes that come out as
# literal { and } in the final prompt.
VERIFY_TEMPLATE = """You are a senior backend QA engineer.

Your tasks:
1. Carefully examine whether every *requirement* in the website instruction below is covered by at least one BACKEND test case.
   - If important functionality is missing, ADD a new test case at the end of the "backend_adjusted_test_cases" list.
   - If a GUI requirement can be satisfied purely on the front-end, you do NOT need to create a backend test case for it.
   - An API endpoint should be tested by invalid inputs and valid inputs (if applicable).

2. Check that **every test case** in the proposed list is absolutely required
   to satisfy a backend requirement.
   - If a test case is unnecessary, mark it for removal.

3. Produce:
   a) A detailed step-by-step analysis mapping requirements ↔︎
      backend test cases and explaining any changes.
   b) A JSON object inside a ```json code fence with the exact format:
      {{
        "backend_adjustment_suggestions": [
          "... textual remarks ...",
          "... more remarks ...",
          ...
        ],
        "backend_adjusted_test_cases": [
          {{}}        # empty object if unchanged, or
          "(removed)" # string literal if deleted, or
          {{ "instruction": "...", "expected_result": "..." }}  # new/edited
        ]
      }}

Remember:
- KEEP THE ORIGINAL ORDER of test cases.  
  - If a test case is deleted put the string "(removed)" in that slot.  
  - If unchanged put {{}} (empty JSON object) in that slot.
- Append any brand-new test cases after the last original slot.
- The JSON block must be valid and must appear at the very end.

----------------------
WEBSITE INSTRUCTION:
{instruction}

----------------------
PROPOSED BACKEND TEST CASES:
{test_cases}
"""


def build_validation_prompt(instr: str, test_cases: list[dict]) -> str:
    """Render VERIFY_TEMPLATE for one sample.

    Each test case becomes a two-line bullet showing its "instruction" and
    "expected_result" values (empty string when a key is absent); the bullets
    are newline-joined and substituted together with *instr*.
    """
    rendered_cases = "\n".join(
        '- instruction: "{}"\n  expected_result: "{}"'.format(
            case.get("instruction", ""),
            case.get("expected_result", ""),
        )
        for case in test_cases
    )
    return VERIFY_TEMPLATE.format(instruction=instr, test_cases=rendered_cases)


# --------------------------------------------------------------- #
#            P A R S E    L L M    R E S P O N S E                #
# --------------------------------------------------------------- #
# Captures the body of every ```json ... ``` fenced block (DOTALL so the
# body may span multiple lines).
_JSON_FENCE_RE = re.compile(r"```json(.*?)```", re.S)


def parse_verify_response(txt: str):
    """
    Return (suggestions:list[str], adjusted:list[dict|str]).
    On failure → ([], []).

    The fenced ```json blocks in *txt* are tried from last to first, because
    the prompt requires the answer block to be the final one and the analysis
    text before it may contain other fenced snippets.  The first candidate
    that decodes to a JSON object wins; its
    "backend_adjustment_suggestions" and "backend_adjusted_test_cases"
    values are returned (``[]`` for a missing key).

    "Failure" covers: *txt* is not a string, no fenced block is present, no
    block is valid JSON, or no block decodes to a JSON object.  None of these
    raise.
    """
    if not isinstance(txt, str):
        return [], []

    for payload in reversed(_JSON_FENCE_RE.findall(txt)):
        try:
            parsed = json.loads(payload.strip())
        except json.JSONDecodeError:
            continue
        # A list / string / number is valid JSON but has no keys to read.
        if not isinstance(parsed, dict):
            continue
        return (
            parsed.get("backend_adjustment_suggestions", []),
            parsed.get("backend_adjusted_test_cases", []),
        )
    return [], []


# --------------------------------------------------------------- #
#                       L L M   C A L L                           #
# --------------------------------------------------------------- #
def llm_call(prompt: str) -> str:
    """Send *prompt* as a single user message to MODEL_ID.

    Returns the "content" entry of whatever llm_generation() hands back, or
    "" when that entry is absent.
    """
    conversation = [{"role": "user", "content": prompt}]
    reply = llm_generation(conversation, model=MODEL_ID)
    return reply.get("content", "")


# --------------------------------------------------------------- #
#                 P E R - S A M P L E  W O R K E R                #
# --------------------------------------------------------------- #
def verify_sample(sample: dict) -> dict:
    """Verify one sample with the LLM and attach the verdict to it.

    Builds the prompt from the sample's "instruction" and
    "backend_test_cases", queries the model, parses the reply, and stores the
    result under "backend_adjustment_suggestions" and
    "backend_adjusted_test_cases".  *sample* is modified in place and also
    returned.
    """
    prompt = build_validation_prompt(
        sample.get("instruction", ""),
        sample.get("backend_test_cases", []),
    )
    suggestions, adjusted = parse_verify_response(llm_call(prompt))

    sample.update(
        backend_adjustment_suggestions=suggestions,
        backend_adjusted_test_cases=adjusted,
    )
    return sample


# --------------------------------------------------------------- #
#                             M A I N                              #
# --------------------------------------------------------------- #
def main():
    """Verify every sample in GEN_PATH concurrently and write OUT_PATH.

    Each sample is processed by verify_sample() on a thread pool.  A sample
    whose verification raises is kept with empty suggestion / adjusted lists
    and a warning is printed, so one failure does not abort the run.
    """
    # GEN_PATH has a .jsonl suffix: a multi-record JSON Lines file is not a
    # single JSON document and makes json.load fail with "Extra data".  Try
    # the whole-document reader first (keeps working for a plain JSON array)
    # and fall back to the line-by-line reader.
    try:
        data = load_json(GEN_PATH)
    except json.JSONDecodeError:
        data = load_jsonl(GEN_PATH)
    # A one-line JSONL file decodes as a single object; treat it as one sample
    # instead of iterating over its keys.
    if isinstance(data, dict):
        data = [data]

    verified = []
    max_workers = min(32, multiprocessing.cpu_count() * 2)

    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futs = {executor.submit(verify_sample, itm): idx for idx, itm in enumerate(data)}
        for fut in tqdm(as_completed(futs), total=len(data), desc="Verifying backend tests"):
            idx = futs[fut]
            try:
                verified.append(fut.result())
            except Exception as e:
                # Top-level boundary: record the sample with empty results and
                # keep going rather than losing the whole batch.
                bad = data[idx]
                bad["backend_adjustment_suggestions"] = []
                bad["backend_adjusted_test_cases"] = []
                verified.append(bad)
                print(f"[WARN] verification failed for id={bad.get('id')}: {e}")

    try:
        verified.sort(key=lambda d: d.get("id", ""))
    except TypeError as e:
        # Ids of mixed types (e.g. ints plus the "" default for a missing id)
        # cannot be compared; still write the results instead of discarding
        # all completed LLM work.
        print(f"[WARN] could not sort results by id ({e}); writing them unsorted")
    dump_json(verified, OUT_PATH)
    print(f"Verification file written to {OUT_PATH}")


if __name__ == "__main__":
    # Run the verification and report the wall-clock duration.
    started_at = time.time()
    main()
    elapsed = time.time() - started_at
    print(f"Done in {elapsed:.1f}s")