# qoj_submissions_scraper.py
# Scrapes QOJ submissions list and augments each row with subtask_result_1..50.
# For each submission row, opens the first-td link in a NEW TAB (same window),
# reads subtask scores from the details accordion, or falls back to user input mode.
#
# If verdict == "Compile Error", or the problem link is listed in SPECIAL_CASE_LINKS,
# sets all subtask_result_* = None and skips details parsing.
# After processing each row, prints one compact JSON line for debugging.
#
# Requirements:
#   pip install playwright datasets
#   playwright install chromium
#
# Usage:
#   python qoj_submissions_scraper.py
#
from playwright.sync_api import sync_playwright, TimeoutError as PlaywrightTimeoutError
from urllib.parse import urljoin
from datasets import Dataset,Value
from typing import List, Dict, Tuple
import sys
import re
import json
import traceback

# Site root; relative hrefs scraped from the tables are joined onto this.
BASE = "https://qoj.ac/"

# Column names subtask_result_1 .. subtask_result_50.
SUBTASK_COLS = [f"subtask_result_{i}" for i in range(1, 51)]

# Header label of a real subtask, e.g. "Subtask #3:"; group 1 captures the number.
SUBTASK_NUM_RE = re.compile(
    r"^\s*Subtask\s*#\s*(\d+)\s*:\s*$",
    flags=re.I,
)

# Header label of an extra test, e.g. "Extra Test #1: ..."; these headers are skipped.
EXTRA_TEST_RE = re.compile(
    r"^\s*Extra\s*Test\s*#\s*\d+\s*:",
    flags=re.I,
)

# Problem pages whose rows get every subtask column set to None without parsing details.
SPECIAL_CASE_LINKS = [
    "https://qoj.ac/contest/1355/problem/7161",
    "https://qoj.ac/contest/1354/problem/7156",
]

def _blank_subtask_dict() -> Dict[str, str]:
    """Build a fresh mapping where each of subtask_result_1..50 is the empty string."""
    return dict.fromkeys(SUBTASK_COLS, "")


def _none_subtask_dict() -> Dict[str, None]:
    """Build a fresh mapping where each of subtask_result_1..50 is None."""
    return dict.fromkeys(SUBTASK_COLS)


def _debug_print_row(row: Dict[str, str]) -> None:
    """Emit `row` on stdout as one JSON line; if that fails, emit its str() form instead."""
    try:
        serialized = json.dumps(row, ensure_ascii=False)
        print(serialized, flush=True)
    except Exception:
        # Anything that cannot be serialised (or printed) as JSON still gets logged.
        fallback = str(row)
        print(fallback, flush=True)


def _parse_subtasks_from_details(context, details_url: str) -> Tuple[Dict[str, str], bool]:
    """
    Load details_url in a fresh tab of `context` and read the per-subtask score texts.

    Returns:
        (mapping, ok)
        - mapping: {"subtask_result_X": "score: ..."} for each subtask header found
          with 1 <= X <= 50; empty whenever ok is False
        - ok: True when every header matched the expected structure; False tells the
          caller to fall back to user input
    """
    found: Dict[str, str] = {}
    tab = context.new_page()  # always closed again in the finally clause below
    try:
        try:
            tab.goto(details_url, wait_until="domcontentloaded", timeout=15000)
        except Exception as e:
            print(f"[warn] goto failed for {details_url}: {e}", flush=True)
            return found, False

        try:
            tab.wait_for_selector("#details_details_accordion", timeout=8000)
        except PlaywrightTimeoutError as e:
            print(f"[warn] details accordion not found on {details_url}: {e}", flush=True)
            return found, False

        headers = tab.query_selector_all(
            "#details_details_accordion .card-header[data-parent='#details_details_accordion']"
        )
        if not headers:
            print(f"[warn] no .card-header elements on {details_url}", flush=True)
            return found, False

        # Every header must follow the expected layout; the first violation aborts
        # the whole parse so the caller can ask the user instead.
        for header in headers:
            rows = header.query_selector_all(".row")
            if len(rows) != 1:
                print(f"[warn] expected exactly one .row, got {len(rows)} on {details_url}", flush=True)
                return {}, False

            only_row = rows[0]
            # Direct children first; if that yields nothing, take every descendant div.
            cells = only_row.query_selector_all(":scope > div")
            if not cells:
                cells = only_row.query_selector_all("div")
            if len(cells) < 2:
                print(f"[warn] row has <2 inner divs on {details_url}", flush=True)
                return {}, False

            subtask_label = (cells[0].inner_text() or "").strip()
            score_text = (cells[1].inner_text() or "").strip()

            # "Extra Test #x:" headers carry no subtask score.
            if EXTRA_TEST_RE.match(subtask_label):
                continue

            # The label has to look like "Subtask #x:".
            label_match = SUBTASK_NUM_RE.match(subtask_label)
            if label_match is None:
                print(f"[warn] label not like 'Subtask #x:' -> {subtask_label!r} on {details_url}", flush=True)
                return {}, False
            subtask_num = int(label_match.group(1))

            # The second cell has to begin with "score:" (any letter case).
            if not score_text.lower().startswith("score:"):
                print(f"[warn] score div does not start with 'score: ' -> {score_text!r} on {details_url}", flush=True)
                return {}, False

            # Numbers outside 1..50 have no column and are dropped without a warning.
            if 0 < subtask_num < 51:
                found[f"subtask_result_{subtask_num}"] = score_text

        return found, True

    except Exception as e:
        print(f"[warn] parsing error on {details_url}: {e}", flush=True)
        traceback.print_exc()
        return {}, False
    finally:
        # Best-effort cleanup: a failure to close the tab must not hide the result.
        try:
            tab.close()
        except Exception:
            pass


def _user_input_subtasks(prompt_hint: str = "") -> Dict[str, str]:
    """
    Prompt on stdin for a comma-separated list of subtask scores.

    Entries are positional: the k-th comma-separated entry belongs to subtask k.
    Each entry may be written as 'score: 10' or as a bare value like '10'; bare
    values get the 'score: ' prefix added. An empty entry (the middle of '10,,20',
    or a trailing comma) leaves that subtask unset and does NOT shift later
    entries onto earlier subtasks. Entries beyond the 50th position are ignored.

    Args:
        prompt_hint: optional context (e.g. the details URL) shown in the banner.

    Returns:
        {"subtask_result_k": "score: ..."} for every non-empty entry k (1 <= k <= 50);
        an empty dict when the user enters nothing.
    """
    print(
        "\n[User Input Mode] Could not auto-parse subtasks."
        f"{(' Context: ' + prompt_hint) if prompt_hint else ''}\n"
        "Please enter a comma-separated list of subtask scores.\n"
        "Examples:\n"
        "  score: 10, score: 0, score: 20\n"
        "  10, 0, 20\n",
        flush=True
    )
    line = input("Enter scores for subtasks 1..N (N≤50): ").strip()
    mapping: Dict[str, str] = {}
    if not line:
        return mapping

    # Enumerate BEFORE dropping blanks so each score keeps its subtask position.
    for i, raw in enumerate(line.split(",")[:50], start=1):
        val = raw.strip()
        if not val:
            continue
        if not val.lower().startswith("score:"):
            val = f"score: {val}"
        mapping[f"subtask_result_{i}"] = val
    return mapping


def _absolute_href(row, selector: str) -> str:
    """Return the absolute URL of the first anchor matching `selector` in `row`, or ''."""
    anchor = row.query_selector(selector)
    if not anchor:
        return ""
    # get_attribute returns None when the anchor has no href; treat that as "no link".
    href = (anchor.get_attribute("href") or "").strip()
    return urljoin(BASE, href) if href else ""


def scrape_page(page) -> List[Dict[str, object]]:
    """
    Scrape the submissions table on the current page into a list of row dicts.

    Each row dict has the keys "link" (2nd td anchor, absolute URL), "verdict"
    (4th td anchor text), "submission_link" (1st td anchor, absolute URL) and
    subtask_result_1..50. Subtask values are:
      - None for every column when the verdict is "Compile Error" (any case) or
        the problem link is in SPECIAL_CASE_LINKS;
      - otherwise the score texts parsed from the details page, or, when that
        fails or there is no details link, whatever the user types in;
      - '' for columns nothing was provided for.

    Every table row yields exactly one dict, which is also printed as a debug
    line, even if scraping that row raised (the dict then holds whatever was
    filled in before the failure).

    Returns:
        The row dicts in table order; [] if the table does not appear within 8 s.
    """
    table_selector = ".table.table-bordered.table-hover.table-striped.table-text-center"
    try:
        page.wait_for_selector(table_selector, timeout=8000)
    except PlaywrightTimeoutError:
        return []

    out: List[Dict[str, object]] = []

    for r in page.query_selector_all(f"{table_selector} tbody tr"):
        # Build the base row first so something can still be printed on failure.
        row_dict: Dict[str, object] = {
            "link": "",
            "verdict": "",
            "submission_link": "",
            **_blank_subtask_dict(),
        }
        try:
            # 2nd td -> problem link, 4th td -> verdict text, 1st td -> details link.
            row_dict["link"] = _absolute_href(r, "td:nth-child(2) a")

            a4 = r.query_selector("td:nth-child(4) a")
            verdict = ((a4.inner_text() or "") if a4 else "").strip()
            row_dict["verdict"] = verdict

            details_url = _absolute_href(r, "td:nth-child(1) a")
            row_dict["submission_link"] = details_url

            if verdict.lower() == "compile error" or row_dict["link"] in SPECIAL_CASE_LINKS:
                # Nothing to parse for these rows: mark every subtask as None.
                row_dict.update(_none_subtask_dict())
            elif details_url:
                # page.context is the BrowserContext the details tab is opened in.
                mapping, ok = _parse_subtasks_from_details(page.context, details_url)
                if not (ok and mapping):
                    # Auto-parse failed or found no subtasks -> ask the user.
                    mapping = _user_input_subtasks(prompt_hint=details_url)
                row_dict.update(mapping)
            else:
                # No details link -> ask the user.
                row_dict.update(_user_input_subtasks(prompt_hint="(no details link in first td)"))

        except Exception as e:
            print(f"[error] row scraping failed: {e}", flush=True)
            traceback.print_exc()
        finally:
            # Print once per processed row, and append to output.
            _debug_print_row(row_dict)
            out.append(row_dict)

    return out


def is_last_page(page) -> bool:
    """
    Tell whether the pagination has run out of pages.

    Inspects the element matched by 'li.page-item:last-of-type': the page counts as
    the last one when that item carries the 'disabled' class, or when no such item
    exists at all (no pagination, i.e. a single page).
    """
    last_item = page.query_selector("li.page-item:last-of-type")
    if last_item:
        css_classes = (last_item.get_attribute("class") or "").split()
        return "disabled" in css_classes
    return True


def click_next(page) -> bool:
    """
    Advance to the next page by clicking the anchor inside the last li.page-item.

    Returns True once the click has been issued; False when there is no such item,
    the item is marked 'disabled', or it contains no anchor.
    """
    last_item = page.query_selector("li.page-item:last-of-type")
    if not last_item:
        return False
    if "disabled" in (last_item.get_attribute("class") or "").split():
        return False
    next_link = last_item.query_selector("a")
    if next_link:
        next_link.click()
        return True
    return False


def post_process(ds: Dataset) -> Dataset:
    """
    Normalise a raw scraped dataset.

    Per cell:
      1. None / blank values become 0.0 in subtask_result_* columns and "0" elsewhere.
      2. "verdict" stays a string: "100 ✓" becomes "100"; any value other than
         "Compile Error" that does not parse as a number becomes "0".
      3. subtask_result_* values become floats: the number is extracted from
         "score: x" (any letter case, matching what the scraper and the user-input
         mode accept); anything that still cannot be converted is reported with a
         [WARN] line and stored as 0.0.
      4. Rows whose link is https://qoj.ac/contest/194/problem/792 get three
         hard-coded subtask score replacements (reason not visible in this file).

    Finally, the subtask columns that exist in the dataset are cast to float64.
    A dataset without those columns (e.g. an empty scrape) is returned without a cast.

    Args:
        ds: dataset built from the scraped row dicts.

    Returns:
        The cleaned dataset.
    """
    subtask_cols = [f"subtask_result_{i}" for i in range(1, 51)]
    subtask_col_set = set(subtask_cols)  # O(1) membership inside the per-cell loop
    # Case-insensitive so "Score: 10" is handled like "score: 10".
    score_re = re.compile(r"\s*score:\s*([0-9]+(?:\.[0-9]+)?)", re.IGNORECASE)

    def clean(batch):
        # Raw link column, read once per batch; used for the per-problem adjustments.
        links = batch["link"] if "link" in batch else None
        out = {}
        for col, vals in batch.items():
            is_subtask = col in subtask_col_set
            new_vals = []
            for row_idx, v in enumerate(vals):
                link = links[row_idx] if links is not None else None

                # 1) Fill None or blank -> 0.0 for subtask columns, "0" otherwise
                if v is None or (isinstance(v, str) and not v.strip()):
                    v = 0.0 if is_subtask else "0"

                # 2) verdict cleaning (kept as a string)
                if col == "verdict":
                    if v == "100 ✓":
                        v = "100"
                    if v != "Compile Error":
                        try:
                            float(v)
                        except (TypeError, ValueError):
                            v = "0"

                # 3) subtask_result_i -> float, with regex extraction & warnings
                if is_subtask:
                    if isinstance(v, str):
                        m = score_re.match(v)
                        if m:
                            v = m.group(1)
                    try:
                        v = float(v)
                    except (TypeError, ValueError):
                        print(f'[WARN] Undefined string in row {row_idx} in column {col}: {v!r}')
                        v = 0.0
                    # 4) hard-coded replacements for one specific problem
                    if link == "https://qoj.ac/contest/194/problem/792":
                        if col == "subtask_result_1" and v == 20.0:
                            v = 30.0
                        if col == "subtask_result_2" and v == 25.0:
                            v = 30.0
                        if col == "subtask_result_3" and v == 20.0:
                            v = 40.0
                new_vals.append(v)
            out[col] = new_vals
        return out

    # Clean everything in one batched map
    ds = ds.map(clean, batched=True, desc="Post-processing all columns")

    # Cast the subtask columns to float64 so Arrow sees a uniform type. Only columns
    # actually present are touched: adding absent ones to the features would make
    # cast() fail on a dataset that has no subtask columns.
    features = ds.features.copy()
    present = [col for col in subtask_cols if col in features]
    if not present:
        return ds
    for col in present:
        features[col] = Value("float64")
    return ds.cast(features)


def main():
    """
    Interactive entry point.

    Opens a headful Chromium window on qoj.ac, waits for the operator to log in,
    asks for a submitter ID, walks every page of that user's submissions list,
    then writes two parquet files in the working directory:
      - "<user_id>-submissions-temp.parquet": the raw scraped rows
      - "<user_id>-submissions.parquet": the rows after post_process()

    Exits with status 1 when no user ID is entered. The browser context and the
    browser are closed on every exit path, including errors during scraping.
    """
    # Local import: only this function needs it.
    from urllib.parse import quote

    print("Opening browser and navigating to qoj.ac ...")
    with sync_playwright() as p:
        # Headful launch so the operator can log in by hand.
        # Tip: pass slow_mo=120 to launch() to slow actions down for visual debugging.
        browser = p.chromium.launch(headless=False)
        context = None
        try:
            # Use an EXPLICIT context so context.new_page() works (same window, new tab).
            context = browser.new_context()
            page = context.new_page()
            page.goto(BASE, wait_until="load")

            print("Please log in on the opened window.")
            input("After logging in successfully, press [y] + Enter here to continue: ")

            user_id = input("Enter the user ID to scrape (the submitter ID in qoj): ").strip()
            if not user_id:
                print("No user ID provided. Exiting.")
                sys.exit(1)

            # Percent-encode the ID so characters such as '&' or spaces cannot
            # corrupt the query string.
            submissions_url = f"{BASE}submissions?submitter={quote(user_id, safe='')}"
            print(f"Navigating to {submissions_url}")
            page.goto(submissions_url, wait_until="load")

            all_rows: List[Dict[str, object]] = []
            page_index = 1

            while True:
                print(f"Scraping page {page_index} ...")
                page.wait_for_load_state("domcontentloaded")
                page_rows = scrape_page(page)
                all_rows.extend(page_rows)

                if is_last_page(page):
                    print("Reached last page.")
                    break

                if not click_next(page):
                    print("Could not locate next page link; stopping.")
                    break

                try:
                    page.wait_for_load_state("load", timeout=15000)
                except PlaywrightTimeoutError:
                    # Still proceed; DOM might be ready
                    pass
                page_index += 1

            # Build and persist dataset
            print(f"Collected {len(all_rows)} rows.")
            ds = Dataset.from_list(all_rows)
            out_name = f"{user_id}-submissions-temp.parquet"
            ds.to_parquet(out_name)
            print(f"Saved raw data to {out_name}")

            print("Post-processing dataset...")
            ds = post_process(ds)

            out_name = f"{user_id}-submissions.parquet"
            ds.to_parquet(out_name)
            print(f"Saved to {out_name}")

            print("Done. Leaving the browser window open.")
            input("Press Enter here to close the browser... ")
        finally:
            # Close the context first, then the browser, even if scraping failed.
            try:
                if context is not None:
                    context.close()
            finally:
                browser.close()


# Run the scraper only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()
