#!/usr/bin/env python3
# -*- coding: utf-8 -*-

"""Book-oriented PDF to Markdown OCR."""

from __future__ import annotations

import argparse
import base64
import io
import json
import re
import sys
import threading
import time
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple
from concurrent.futures import ThreadPoolExecutor, as_completed

from PIL import Image, ImageOps
import pdf2image
from tqdm import tqdm
from tenacity import retry, stop_after_attempt, wait_exponential, retry_if_exception_type
from openai import OpenAI


# Instruction sent with the page image on the first OCR request.
OCR_MD_PROMPT = "Extract all text and math from the image and output Markdown only."
# Shorter instruction used for a second request when the first result is flagged as bad.
OCR_MD_PROMPT_FALLBACK = "OCR to Markdown. Output only the content."


# -------- config helpers --------

def find_config_json() -> Path:
    """
    Locate ``config.json`` and return its resolved path.

    The current working directory is tried first; after that the directory
    containing this script and then each of its ancestors, nearest first.

    Raises:
        FileNotFoundError: none of those directories contains the file.
    """

    def _search_dirs():
        yield Path.cwd()
        # __file__ is only consulted once the CWD lookup has missed.
        script_dir = Path(__file__).resolve().parent
        yield script_dir
        yield from script_dir.parents

    for directory in _search_dirs():
        candidate = directory / "config.json"
        if candidate.exists():
            return candidate.resolve()
    raise FileNotFoundError("config.json not found (checked CWD and script parents).")


def load_config() -> Dict[str, Any]:
    """
    Parse the ``config.json`` found by ``find_config_json``.

    Returns:
        The decoded top-level JSON object.

    Raises:
        ValueError: the file's top-level JSON value is not an object.
    """
    config_file = find_config_json()
    with config_file.open("r", encoding="utf-8") as handle:
        parsed = json.load(handle)
    if isinstance(parsed, dict):
        return parsed
    raise ValueError(f"{config_file} must contain a JSON object.")


def require_str(cfg: Dict[str, Any], key: str) -> str:
    """
    Return the whitespace-trimmed string stored under ``key``.

    Raises:
        KeyError: the key is absent, the value is not a string, or the value
            is empty after trimming.
    """
    value = cfg.get(key)
    if isinstance(value, str):
        trimmed = value.strip()
        if trimmed:
            return trimmed
    raise KeyError(f"Missing/invalid '{key}' in config.json (must be non-empty string)")


def get_cfg(
    cfg: Dict[str, Any],
    key: str,
    default: Any,
    *,
    expected_type: Optional[type] = None,
    nonempty: bool = False,
    min_value: Optional[float] = None,
    max_value: Optional[float] = None,
    allow_none: bool = False,
) -> Any:
    """
    Typed config getter with optional range checks.
    Keeps config.json structure unchanged, but enforces types/ranges when requested.

    Args:
        cfg: Parsed config mapping.
        key: Key to look up; ``default`` is used when the key is absent.
        default: Fallback value. It goes through the same checks as a stored value.
        expected_type: When given, the value must be an instance of this type.
            A JSON boolean is rejected for ``int``, and an integer is accepted
            (and converted) for ``float``.
        nonempty: When true, a string value must contain non-whitespace characters.
        min_value: Inclusive lower bound applied to numeric values.
        max_value: Inclusive upper bound applied to numeric values.
        allow_none: When true, a ``None`` value is returned without further checks.

    Returns:
        The validated value.

    Raises:
        TypeError: the value does not match ``expected_type``.
        ValueError: an empty string was given with ``nonempty``, or a number is
            outside ``[min_value, max_value]``.
    """
    v = cfg.get(key, default)
    if v is None and allow_none:
        return None

    # bool is a subclass of int, so a plain isinstance check would let JSON
    # `true`/`false` through as integers and range-check them as 1/0.
    is_bool = isinstance(v, bool)

    if expected_type is not None:
        # JSON has a single number type; "2" for a float setting is legitimate.
        if expected_type is float and isinstance(v, int) and not is_bool:
            v = float(v)
        wrong_type = not isinstance(v, expected_type) or (is_bool and expected_type is int)
        if wrong_type:
            raise TypeError(
                f"Invalid '{key}' in config.json: expected {expected_type.__name__}, got {type(v).__name__}"
            )

    if isinstance(v, str) and nonempty and not v.strip():
        raise ValueError(f"Invalid '{key}' in config.json: must be non-empty string")

    if isinstance(v, (int, float)) and not is_bool:
        if min_value is not None and float(v) < float(min_value):
            raise ValueError(f"Invalid '{key}' in config.json: {v} < min {min_value}")
        if max_value is not None and float(v) > float(max_value):
            raise ValueError(f"Invalid '{key}' in config.json: {v} > max {max_value}")

    return v


# -------- settings helpers (module-local JSON) --------

def find_settings_json() -> Path:
    """
    Return the resolved path of the ``settings.json`` sitting beside this script.

    Raises:
        FileNotFoundError: no such file exists next to the script.
    """
    script_path = Path(__file__).resolve()
    candidate = script_path.with_name("settings.json")
    if not candidate.exists():
        raise FileNotFoundError(f"settings.json not found next to script: {candidate}")
    return candidate.resolve()


def load_settings() -> Dict[str, Any]:
    """
    Parse the ``settings.json`` found by ``find_settings_json``.

    Returns:
        The decoded top-level JSON object.

    Raises:
        ValueError: the file's top-level JSON value is not an object.
    """
    settings_file = find_settings_json()
    with settings_file.open("r", encoding="utf-8") as handle:
        parsed = json.load(handle)
    if isinstance(parsed, dict):
        return parsed
    raise ValueError(f"{settings_file} must contain a JSON object.")


def get_setting(settings: Dict[str, Any], name: str, default: Any) -> Any:
    """Return ``settings[name]`` when the key is present, otherwise ``default``."""
    if name in settings:
        return settings[name]
    return default


# -------- PDF render (stream per page) --------

def get_pdf_page_count(pdf_path: Path) -> int:
    """
    Return the page count that ``pdf2image.pdfinfo_from_path`` reports for the PDF.

    Raises:
        RuntimeError: the "Pages" entry is missing, not an int, or not positive.
    """
    page_total = pdf2image.pdfinfo_from_path(str(pdf_path)).get("Pages")
    if isinstance(page_total, int) and page_total > 0:
        return page_total
    raise RuntimeError(f"Failed to read PDF page count: {pdf_path}")


def render_single_page(pdf_path: Path, dpi: int, page_1based: int) -> Image.Image:
    """
    Render exactly one page of the PDF and return it as an RGB image.

    Args:
        pdf_path: PDF to render from.
        dpi: Render resolution passed to pdf2image.
        page_1based: Page number, used as both first and last page of the request.

    Raises:
        RuntimeError: pdf2image returned no image for the page.
    """
    page_range = {"first_page": page_1based, "last_page": page_1based}
    rendered = pdf2image.convert_from_path(
        str(pdf_path),
        dpi=dpi,
        fmt="png",
        thread_count=1,
        **page_range,
    )
    if not rendered:
        raise RuntimeError(f"Failed to render page {page_1based} at dpi={dpi}")

    page_img = rendered[0]
    return page_img if page_img.mode == "RGB" else page_img.convert("RGB")


# -------- image utilities --------

def pad_image(img: Image.Image, pad_top: float = 0.0, pad_other: float = 0.0, *, max_px: int = 120) -> Image.Image:
    """
    Surround the image with a white border (helps OCR not miss edge-touching glyphs).

    Args:
        img: Image to pad.
        pad_top: Top padding. A value in (0, 1] is a fraction of the image
            height; a value above 1 is a pixel count; zero or less means none.
        pad_other: Left/right/bottom padding, interpreted the same way but
            relative to min(width, height).
        max_px: Upper limit for each padding in pixels; the limit is skipped
            when it is zero or negative.

    Returns:
        The original image object when both paddings come out as zero,
        otherwise a new, expanded image.
    """
    width, height = img.size

    def _resolve(amount: float, reference: int) -> int:
        if amount <= 0:
            return 0
        pixels = reference * amount if amount <= 1 else amount
        return int(round(pixels))

    top = _resolve(float(pad_top), height)
    sides = _resolve(float(pad_other), min(width, height))

    if max_px > 0:
        cap = int(max_px)
        top, sides = min(top, cap), min(sides, cap)

    if top <= 0 and sides <= 0:
        return img

    # Border order expected by ImageOps.expand: left, top, right, bottom.
    return ImageOps.expand(img, border=(sides, top, sides, sides), fill="white")


def save_debug_images(
    debug_dir: Path,
    page_i: int,
    raw_img: Image.Image,
    ocr_img: Image.Image,
    tag: str,
    *,
    jpeg_quality: int = 90,
) -> None:
    """
    Save images for debugging (JPEG only):
      - raw render
      - final OCR input (after padding)

    This is best-effort: a failure never propagates to the caller. Unlike a
    silent swallow, each failure is reported on stderr, and the two images are
    written independently so one failed save does not suppress the other.

    Args:
        debug_dir: Target directory; created (with parents) when missing.
        page_i: Page number used in the file names (zero-padded to 4 digits).
        raw_img: Image saved as ``page_NNNN_<tag>_raw.jpg``.
        ocr_img: Image saved as ``page_NNNN_<tag>_ocr.jpg``.
        tag: Free-form label embedded in both file names.
        jpeg_quality: JPEG quality, clamped to 1..95.
    """
    q = int(max(1, min(95, jpeg_quality)))

    try:
        debug_dir.mkdir(parents=True, exist_ok=True)
    except OSError as exc:
        print(f"[debug] cannot create {debug_dir}: {exc}", file=sys.stderr)
        return

    for suffix, image in (("raw", raw_img), ("ocr", ocr_img)):
        target = debug_dir / f"page_{page_i:04d}_{tag}_{suffix}.jpg"
        try:
            image.save(target, "JPEG", quality=q, optimize=True, progressive=True)
        except Exception as exc:  # debug output must never break the OCR run
            print(f"[debug] could not save {target.name}: {exc}", file=sys.stderr)


# -------- OCR helpers --------

def _default_upload_max_side_for_dpi(dpi: int) -> int:
    """
    Heuristic upload size limit: six pixels per DPI unit, kept within 1800..3600.

    Examples: 350 dpi -> 2100, 450 dpi -> 2700, 600 dpi and above -> 3600.
    """
    lower, upper = 1800, 3600
    suggested = int(round(dpi * 6))
    if suggested < lower:
        return lower
    if suggested > upper:
        return upper
    return suggested


def _downscale_if_needed(img: Image.Image, max_side: Optional[int]) -> Image.Image:
    """
    Shrink the image so its longer edge is at most ``max_side``, keeping aspect ratio.

    The input object is returned untouched when ``max_side`` is None, zero or
    negative, or when the image already fits. Otherwise a LANCZOS-resampled
    copy is returned; each resulting edge is at least 1 pixel.
    """
    if not max_side or max_side <= 0:
        return img

    width, height = img.size
    longest = max(width, height)
    if longest <= max_side:
        return img

    ratio = max_side / float(longest)
    target = tuple(max(1, int(round(edge * ratio))) for edge in (width, height))
    return img.resize(target, resample=Image.Resampling.LANCZOS)


def pil_image_to_data_url(
    img: Image.Image,
    *,
    fmt: str = "JPEG",                 # "PNG" | "JPEG"
    jpeg_quality: int = 85,            # 1..95
    max_side: Optional[int] = 2000,    # int or None
    grayscale: bool = False,
) -> Tuple[str, Dict[str, Any]]:
    """
    Encode an image as a base64 ``data:`` URL for API upload.

    The image is first downscaled to ``max_side`` (if needed), then converted
    to mode "L" when ``grayscale`` is set or to "RGB" otherwise, and finally
    serialized. ``fmt`` equal to "PNG" (case-insensitive) produces PNG; any
    other value, including an empty one, produces JPEG with the quality
    clamped to 1..95.

    Returns:
        (data_url, info) where ``info`` records the original and uploaded
        dimensions, mode, format, JPEG quality (None for PNG), the requested
        ``max_side``/``grayscale`` and the byte/base64 sizes of the payload.
    """
    source_w, source_h = img.size

    resized = _downscale_if_needed(img, max_side=max_side)
    sent_w, sent_h = resized.size

    if grayscale:
        resized = resized.convert("L")
    elif resized.mode != "RGB":
        resized = resized.convert("RGB")

    payload = io.BytesIO()
    wants_png = (fmt or "JPEG").upper() == "PNG"

    if wants_png:
        quality = None
        resized.save(payload, format="PNG", optimize=True)
        mime, encoded_as = "image/png", "PNG"
    else:
        quality = int(max(1, min(95, int(jpeg_quality))))
        resized.save(
            payload,
            format="JPEG",
            quality=quality,
            optimize=True,
            progressive=True,
        )
        mime, encoded_as = "image/jpeg", "JPEG"

    blob = payload.getvalue()
    encoded = base64.b64encode(blob).decode("ascii")

    info: Dict[str, Any] = {
        "orig_w": source_w,
        "orig_h": source_h,
        "upload_w": sent_w,
        "upload_h": sent_h,
        "upload_mode": "L" if grayscale else "RGB",
        "upload_fmt": encoded_as,
        "jpeg_quality": quality,
        "max_side": max_side,
        "grayscale": grayscale,
        "upload_bytes": len(blob),
        "upload_b64_chars": len(encoded),
    }

    return f"data:{mime};base64,{encoded}", info


def strip_code_fences(s: str) -> str:
    """
    Remove a leading ``` fence (optionally tagged md/markdown, any case) and a
    trailing ``` fence from the text, then trim surrounding whitespace.

    A falsy input yields an empty string. Each fence is removed independently,
    so an opening fence without a closing one is still stripped.
    """
    opening = re.compile(r"^\s*```(?:md|markdown)?\s*", re.IGNORECASE)
    closing = re.compile(r"\s*```\s*$")

    text = (s or "").strip()
    text = opening.sub("", text)
    text = closing.sub("", text)
    return text.strip()


def looks_like_layout(md: str) -> bool:
    """
    Report whether the text looks like layout-detection output.

    True when the first 1000 characters contain a region label (text, equation,
    interline_equation, sub_title, title, figure, table) followed by a
    ``[[a, b, c, d]]`` box of four integers. Empty or whitespace-only input
    also returns True.
    """
    text = (md or "").strip()
    if text == "":
        return True

    box_tag = r"\b(text|equation|interline_equation|sub_title|title|figure|table)\s*\[\[\d+,\s*\d+,\s*\d+,\s*\d+\]\]"
    return re.search(box_tag, text[:1000]) is not None


def strip_layout_boxes(md: str) -> str:
    """
    Drop a leading ``label[[a, b, c, d]]`` marker from every line.

    Lines that are empty after the marker is removed are discarded; the
    remaining lines are right-trimmed, joined with newlines and the result is
    trimmed as a whole.
    """
    box_prefix = re.compile(r"^\s*\w+\s*\[\[\d+,\s*\d+,\s*\d+,\s*\d+\]\]\s*")
    unboxed = (box_prefix.sub("", line) for line in (md or "").splitlines())
    return "\n".join(line.rstrip() for line in unboxed if line.strip()).strip()


def compile_boilerplate_patterns(patterns: Any) -> List[re.Pattern]:
    """
    Compile a list of regex strings case-insensitively.

    Anything that is not a non-empty list yields an empty result. Within the
    list, entries that are not strings, are blank, or fail to compile are
    skipped without an error.
    """
    compiled: List[re.Pattern] = []
    if not patterns or not isinstance(patterns, list):
        return compiled

    for source in patterns:
        usable = isinstance(source, str) and bool(source.strip())
        if not usable:
            continue
        try:
            rx = re.compile(source, re.IGNORECASE)
        except re.error:
            continue
        compiled.append(rx)
    return compiled


def strip_boilerplate(md: str, patterns: List[re.Pattern]) -> Tuple[str, int]:
    """
    Remove boilerplate lines from the top of the text.

    Leading blank lines are skipped first. Then, while the current first line
    matches (anchored at its start) any of ``patterns``, that line and the
    blank lines after it are dropped — at most 6 lines in total.

    Returns:
        (cleaned_md, removed_count); the text is trimmed as a whole.
    """
    lines = (md or "").splitlines()
    total = len(lines)

    def _skip_blank(pos: int) -> int:
        while pos < total and not lines[pos].strip():
            pos += 1
        return pos

    cursor = _skip_blank(0)
    removed = 0
    # The limit of 6 keeps a pathological pattern from eating real content.
    while cursor < total and removed < 6:
        if not any(rx.match(lines[cursor]) for rx in patterns):
            break
        removed += 1
        cursor = _skip_blank(cursor + 1)

    return "\n".join(lines[cursor:]).strip(), removed


def postprocess_and_assess(md: str, boilerplate_patterns: List[re.Pattern]) -> Tuple[str, Dict[str, Any]]:
    """
    Clean a raw model response and decide whether it is usable.

    Cleaning steps, in order: strip code fences, strip layout boxes (only kept
    when more than 50 characters survive), strip configured leading
    boilerplate. The cleaned text is then screened for known failure shapes.
    The cleanup runs once; an earlier version repeated the whole pipeline and
    threw the first result away.

    Args:
        md: Raw response text (may be empty or None).
        boilerplate_patterns: Compiled patterns for leading lines to drop.

    Returns:
        (clean_md, meta) where ``meta`` has the keys:
          - "layout": the fence-stripped text looked like layout output
            (this is also set for an empty response);
          - "boilerplate_removed_lines": number of leading lines dropped;
          - "bad": whether a failure shape was detected;
          - "bad_reason": one of "empty", "sub-loop", "prompt_echo",
            "excessive_quad", "excessive_newlines", "char_repetition",
            "unbalanced_braces", or None.
        ``clean_md`` is "" when the reason is "empty"; otherwise it is the
        cleaned text even when flagged bad.
    """
    meta: Dict[str, Any] = {
        "layout": False,
        "boilerplate_removed_lines": 0,
        "bad": False,
        "bad_reason": None,
    }

    s = strip_code_fences(md or "")

    if looks_like_layout(s):
        meta["layout"] = True
        s2 = strip_layout_boxes(s)
        # Keep the unboxed text only when it still carries real content.
        if len(s2) > 50:
            s = s2

    s, removed = strip_boilerplate(s, boilerplate_patterns)
    meta["boilerplate_removed_lines"] = removed

    s_stripped = (s or "").strip()

    def _reject(reason: str, text: str) -> Tuple[str, Dict[str, Any]]:
        meta["bad"] = True
        meta["bad_reason"] = reason
        return text, meta

    # --- 1. Basic Check: Empty ---
    if not s_stripped:
        return _reject("empty", "")

    head = s_stripped[:800].lower()

    # --- 2. Basic Check: Loop/Hallucination ---
    if "sub-sub-sub" in head:
        return _reject("sub-loop", s_stripped)

    # --- 3. Basic Check: Prompt Echoing ---
    # Only short responses are treated as echoes, so a real page that happens
    # to start with one of these phrases is not discarded.
    echo_phrases = (
        "ocr to markdown",
        "output markdown only",
        "output only the content",
        "extract all text and math",
    )
    if len(s_stripped) < 120 and head.startswith(echo_phrases):
        return _reject("prompt_echo", s_stripped)

    # Math degradation checks.
    # Six or more consecutive \quad commands.
    if re.search(r"(\\quad\s*){6,}", s_stripped):
        return _reject("excessive_quad", s_stripped)

    # Ten or more consecutive LaTeX line breaks (\\).
    if re.search(r"(\\\\\s*){10,}", s_stripped):
        return _reject("excessive_newlines", s_stripped)

    # The same ASCII alphanumeric or full-width character 20+ times in a row.
    if re.search(r"([a-zA-Z0-9\uff01-\uff5e])\1{19,}", s_stripped):
        return _reject("char_repetition", s_stripped)

    # A large brace imbalance is taken as a sign of truncated/garbled LaTeX.
    open_braces = s_stripped.count('{')
    close_braces = s_stripped.count('}')
    if abs(open_braces - close_braces) > 15:
        return _reject("unbalanced_braces", s_stripped)

    return s_stripped, meta


@retry(
    reraise=True,
    stop=stop_after_attempt(4),
    wait=wait_exponential(multiplier=1, min=2, max=20),
    retry=retry_if_exception_type(Exception),
)
def ocr_image_to_markdown(
    client: OpenAI,
    model: str,
    img: Image.Image,
    boilerplate_patterns: List[re.Pattern],
    *,
    max_tokens: Optional[int] = None,
    upload_fmt: str = "JPEG",
    upload_jpeg_quality: int = 85,
    upload_max_side: Optional[int] = 2000,
    upload_grayscale: bool = False,
    api_semaphore: Optional[threading.Semaphore] = None,
) -> Tuple[str, Dict[str, Any]]:
    """
    OCR one image into Markdown through the chat-completions API.

    The image is encoded once, sent with the primary prompt, and the reply is
    cleaned and assessed. When the assessment flags the reply as bad, a second
    request with the fallback prompt is made and its result replaces the first
    one, even if it is flagged bad as well.

    The decorator re-runs the whole function (up to 4 attempts with
    exponential waits between 2 and 20 seconds) on any exception and re-raises
    the last one.

    Args:
        client: API client used for the requests.
        model: Model name passed to the API.
        img: Image to OCR.
        boilerplate_patterns: Patterns for leading lines to strip from replies.
        max_tokens: Sent as ``max_tokens`` only when not None.
        upload_fmt / upload_jpeg_quality / upload_max_side / upload_grayscale:
            Encoding options forwarded to the data-URL encoder.
        api_semaphore: When given, held for the duration of each API request.

    Returns:
        (markdown, meta) where meta includes flags about fallback/layout/boilerplate,
        upload info and request timings.
    """
    data_url, upload_info = pil_image_to_data_url(
        img,
        fmt=upload_fmt,
        jpeg_quality=upload_jpeg_quality,
        max_side=upload_max_side,
        grayscale=upload_grayscale,
    )

    def _request(prompt: str) -> str:
        request_args: Dict[str, Any] = {
            "model": model,
            "messages": [
                {
                    "role": "user",
                    "content": [
                        {"type": "text", "text": prompt},
                        {"type": "image_url", "image_url": {"url": data_url}},
                    ],
                }
            ],
            "temperature": 0.0,
            "top_p": 1.0,
        }
        if max_tokens is not None:
            request_args["max_tokens"] = max_tokens

        def _send() -> str:
            reply = client.chat.completions.create(**request_args)
            return reply.choices[0].message.content or ""

        if api_semaphore is None:
            return _send()
        # Cap the number of simultaneous API requests.
        with api_semaphore:
            return _send()

    meta: Dict[str, Any] = {
        "used_fallback": False,
        "upload_fmt": upload_fmt,
        "upload_jpeg_quality": int(upload_jpeg_quality),
        "upload_max_side": upload_max_side,
        "upload_grayscale": bool(upload_grayscale),
        "upload_info": upload_info,
    }

    primary_started = time.perf_counter()
    primary_raw = _request(OCR_MD_PROMPT)
    meta["t_api_primary_s"] = time.perf_counter() - primary_started

    cleaned, primary_flags = postprocess_and_assess(primary_raw, boilerplate_patterns)
    meta.update(primary_flags)
    if not meta.get("bad"):
        return cleaned.strip(), meta

    meta["used_fallback"] = True
    fallback_started = time.perf_counter()
    fallback_raw = _request(OCR_MD_PROMPT_FALLBACK)
    meta["t_api_fallback_s"] = time.perf_counter() - fallback_started

    # The fallback result wins unconditionally, even if it is still bad.
    cleaned, fallback_flags = postprocess_and_assess(fallback_raw, boilerplate_patterns)
    meta.update(fallback_flags)
    return cleaned.strip(), meta


# -------- main --------

def main() -> None:
    """
    CLI entry point: OCR every page of a PDF and write one Markdown file.

    Flow: parse arguments, read API credentials from config.json and tuning
    values from settings.json, process pages concurrently in a thread pool
    (render -> pad -> OCR, with one higher-DPI retry for pages flagged bad),
    then write the pages in order, each preceded by a ``<!-- PAGE n -->``
    marker. With debugging enabled, images and a ``quality_report.json`` are
    written to ``<output stem>_debug`` next to the output file.

    A page whose processing raises does not abort the run: the failure is
    logged, the page is emitted as ``<!-- OCR FAILED -->``, the remaining
    pages are still written, and the process exits with status 1 at the end.

    Exit status: 2 when the input PDF does not exist, 1 when at least one
    page failed, otherwise normal termination.

    Raises:
        ValueError: OCR_DPI in settings.json is outside 72..1200.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("pdf", type=str, help="Input PDF")
    ap.add_argument("out_md", type=str, help="Output Markdown file")

    ap.add_argument("--max-tokens", type=int, default=None, help="Override settings.OCR_MAX_TOKENS (default: use settings)")
    ap.add_argument("--workers", type=int, default=None, help="Override settings.OCR_WORKERS (default: use settings)")
    ap.add_argument("--debug", action="store_true", help="Override settings.OCR_DEBUG=True for this run")
    ap.add_argument("--no-debug", action="store_true", help="Override settings.OCR_DEBUG=False for this run")
    args = ap.parse_args()

    cfg = load_config()
    api_key = require_str(cfg, "api_key")
    base_url = get_cfg(cfg, "base_url", "https://aihubmix.com/v1", expected_type=str, nonempty=True)
    model = require_str(cfg, "model")

    settings = load_settings()

    # dpi lives in settings.json
    dpi = int(get_setting(settings, "OCR_DPI", 350))
    if dpi < 72 or dpi > 1200:
        raise ValueError(f"Invalid OCR_DPI in settings.json: {dpi} (expected 72..1200)")

    pdf_path = Path(args.pdf).expanduser().resolve()
    out_md = Path(args.out_md).expanduser().resolve()
    if not pdf_path.exists():
        print(f"ERROR: PDF not found: {pdf_path}", file=sys.stderr)
        sys.exit(2)
    out_md.parent.mkdir(parents=True, exist_ok=True)

    # ---- workers (settings, then CLI override) ----
    workers = int(get_setting(settings, "OCR_WORKERS", 4))
    if args.workers is not None:
        workers = int(args.workers)
    if workers <= 0:
        workers = 4

    # ---- settings ----
    # The default API concurrency follows the *effective* worker count, so a
    # --workers override is not throttled by a default derived from settings.
    # An explicit OCR_API_CONCURRENCY may exceed workers; that is allowed.
    api_conc = max(1, int(get_setting(settings, "OCR_API_CONCURRENCY", max(1, min(4, workers)))))
    timeout_s = int(get_setting(settings, "OCR_TIMEOUT", 120))
    max_tokens_setting = get_setting(settings, "OCR_MAX_TOKENS", None)

    debug_enabled = bool(get_setting(settings, "OCR_DEBUG", False))
    verbose_page_log = bool(get_setting(settings, "OCR_VERBOSE_PAGE_LOG", True))
    debug_jpeg_quality = int(get_setting(settings, "OCR_DEBUG_JPEG_QUALITY", 90))

    # padding (proportional)
    pad_top = float(get_setting(settings, "OCR_PAD_TOP", 0.015))
    pad_other = float(get_setting(settings, "OCR_PAD_OTHER", 0.004))
    pad_max_px = int(get_setting(settings, "OCR_PAD_MAX_PX", 80))

    # upload encoding controls
    upload_fmt = str(get_setting(settings, "OCR_UPLOAD_FMT", "JPEG")).upper()
    upload_jpeg_quality = int(get_setting(settings, "OCR_UPLOAD_JPEG_QUALITY", 85))
    upload_max_side_raw = get_setting(settings, "OCR_UPLOAD_MAX_SIDE", "auto")
    upload_grayscale = bool(get_setting(settings, "OCR_UPLOAD_GRAYSCALE", False))

    # boilerplate patterns (configurable)
    boilerplate_patterns = compile_boilerplate_patterns(
        get_setting(settings, "OCR_STRIP_PREFIX_PATTERNS", [])
    )

    # ---- remaining CLI overrides ----
    if args.max_tokens is not None:
        ocr_max_tokens = int(args.max_tokens)
    else:
        ocr_max_tokens = max_tokens_setting
        if ocr_max_tokens is not None:
            ocr_max_tokens = int(ocr_max_tokens)

    # When both flags are given, --no-debug wins (it is applied last).
    if args.debug:
        debug_enabled = True
    if args.no_debug:
        debug_enabled = False

    # resolve upload_max_side (dpi-dynamic "auto")
    upload_max_side: Optional[int]
    if upload_max_side_raw is None:
        upload_max_side = None
    elif isinstance(upload_max_side_raw, str) and upload_max_side_raw.strip().lower() == "auto":
        upload_max_side = _default_upload_max_side_for_dpi(dpi)
    else:
        try:
            upload_max_side = int(upload_max_side_raw)
        except (TypeError, ValueError):
            # Unparseable value: fall back to the dpi-based default.
            upload_max_side = _default_upload_max_side_for_dpi(dpi)

    client = OpenAI(api_key=api_key, base_url=base_url, timeout=timeout_s)
    api_sem = threading.Semaphore(api_conc)

    # Debug dirs
    debug_dir = out_md.parent / f"{out_md.stem}_debug"
    pages_dir = debug_dir / "pages"

    # Page count
    t0 = time.perf_counter()
    n_pages = get_pdf_page_count(pdf_path)
    t1 = time.perf_counter()
    print(f"[init] pdf={pdf_path.name} pages={n_pages} dpi={dpi} (pdfinfo {t1 - t0:.3f}s)")
    print(
        f"[run] model={model} workers={workers} api_concurrency={api_conc} timeout={timeout_s}s "
        f"upload={upload_fmt} q={upload_jpeg_quality} max_side={upload_max_side} gray={upload_grayscale} "
        f"pad_top={pad_top} pad_other={pad_other} pad_max_px={pad_max_px} debug={debug_enabled}"
    )

    results: Dict[int, str] = {}
    quality: Dict[int, Dict[str, Any]] = {}
    failed: Dict[int, str] = {}

    def process_page(page_i: int) -> Tuple[int, str, Dict[str, Any]]:
        """Render, pad and OCR one page; retry once at a higher DPI if the result is flagged bad."""
        t_page0 = time.perf_counter()

        # 1) render single page (Standard Quality)
        t_r0 = time.perf_counter()
        raw_img = render_single_page(pdf_path, dpi=dpi, page_1based=page_i)
        t_r1 = time.perf_counter()

        # 2) padding
        ocr_img = pad_image(raw_img, pad_top=pad_top, pad_other=pad_other, max_px=pad_max_px)

        # 3) debug save (jpeg only)
        if debug_enabled:
            save_debug_images(pages_dir, page_i, raw_img, ocr_img, tag=f"dpi{dpi}", jpeg_quality=debug_jpeg_quality)

        # 4) OCR (First Attempt)
        t_o0 = time.perf_counter()
        md, meta = ocr_image_to_markdown(
            client,
            model,
            ocr_img,
            boilerplate_patterns,
            max_tokens=ocr_max_tokens,
            upload_fmt=upload_fmt,
            upload_jpeg_quality=upload_jpeg_quality,
            upload_max_side=upload_max_side,
            upload_grayscale=upload_grayscale,
            api_semaphore=api_sem,
        )
        t_o1 = time.perf_counter()

        if meta.get("bad"):
            # 5) One retry at a higher render resolution (at least 450 dpi).
            high_dpi = max(dpi + 150, 450)
            tqdm.write(f"[Retry] Page {page_i} bad quality ({meta.get('bad_reason')}). Retrying at DPI {high_dpi}...")

            try:
                raw_img_hq = render_single_page(pdf_path, dpi=high_dpi, page_1based=page_i)
                ocr_img_hq = pad_image(raw_img_hq, pad_top=pad_top, pad_other=pad_other, max_px=pad_max_px)

                if debug_enabled:
                    save_debug_images(pages_dir, page_i, raw_img_hq, ocr_img_hq, tag=f"dpi{high_dpi}_RETRY", jpeg_quality=debug_jpeg_quality)

                md_retry, meta_retry = ocr_image_to_markdown(
                    client,
                    model,
                    ocr_img_hq,
                    boilerplate_patterns,
                    max_tokens=ocr_max_tokens,
                    upload_fmt=upload_fmt,
                    upload_jpeg_quality=upload_jpeg_quality,
                    # The retry always uses the dpi-based size limit.
                    upload_max_side=_default_upload_max_side_for_dpi(high_dpi),
                    upload_grayscale=upload_grayscale,
                    api_semaphore=api_sem,
                )

                md = md_retry
                meta_retry["is_retry"] = True
                meta_retry["retry_reason"] = meta.get("bad_reason")
                meta = meta_retry

            except Exception as e:
                # Keep the first attempt's result when the retry itself fails.
                tqdm.write(f"[Retry Failed] Page {page_i}: {e}")
        t_page1 = time.perf_counter()

        out_meta: Dict[str, Any] = {
            "page": page_i,
            "dpi": dpi,
            "t_render_s": t_r1 - t_r0,
            "t_ocr_total_s": t_o1 - t_o0,
            "t_page_total_s": t_page1 - t_page0,
            "md_len": len(md),
            "full_meta": meta,
            "upload": meta.get("upload_info", {}),
        }

        return page_i, md, out_meta

    # Run with proper tqdm (no broken redraw). Use tqdm.write for per-page logs.
    page_indices = list(range(1, n_pages + 1))
    with ThreadPoolExecutor(max_workers=workers) as ex:
        futs = {ex.submit(process_page, i): i for i in page_indices}
        with tqdm(total=n_pages, desc="OCR pages", dynamic_ncols=True) as pbar:
            for fut in as_completed(futs):
                page_i = futs[fut]
                try:
                    _, md, meta = fut.result()
                except Exception as exc:
                    # Isolate the failure: the pool finishes the other pages
                    # anyway, so their results should not be thrown away.
                    failed[page_i] = f"{type(exc).__name__}: {exc}"
                    quality[page_i] = {"page": page_i, "dpi": dpi, "error": failed[page_i]}
                    tqdm.write(f"[Failed] Page {page_i}: {failed[page_i]}", file=sys.stderr)
                    pbar.update(1)
                    continue

                results[page_i] = md
                quality[page_i] = meta
                pbar.update(1)

                if verbose_page_log:
                    fm = meta.get("full_meta") or {}
                    bad = bool(fm.get("bad"))
                    layout = bool(fm.get("layout"))
                    used_fallback = bool(fm.get("used_fallback"))
                    up = meta.get("upload") or {}
                    up_kb = int(up.get("upload_bytes") or 0) / 1024.0
                    up_wh = f"{up.get('upload_w')}x{up.get('upload_h')}"
                    orig_wh = f"{up.get('orig_w')}x{up.get('orig_h')}"

                    tqdm.write(
                        f"[OCR] page={page_i} bad={bad} layout={layout} fallback={used_fallback} len={len(md)} "
                        f"render={meta.get('t_render_s', 0):.3f}s ocr={meta.get('t_ocr_total_s', 0):.3f}s "
                        f"upload={up.get('upload_fmt')} q={up.get('jpeg_quality')} gray={up.get('grayscale')} "
                        f"orig={orig_wh} up={up_wh} bytes={up_kb:.1f}KB"
                    )

    # Assemble output in order
    chunks: List[str] = []
    for i in range(1, n_pages + 1):
        body = "<!-- OCR FAILED -->" if i in failed else results.get(i, "")
        chunks.append(f"<!-- PAGE {i} -->\n{body}\n")
    out_md.write_text("\n".join(chunks), encoding="utf-8")

    if debug_enabled:
        debug_dir.mkdir(parents=True, exist_ok=True)
        report = {
            "pdf": str(pdf_path),
            "out_md": str(out_md),
            "dpi": dpi,
            "workers": workers,
            "api_concurrency": api_conc,
            "timeout_s": timeout_s,
            "model": model,
            "pad_top": pad_top,
            "pad_other": pad_other,
            "pad_max_px": pad_max_px,
            "upload_fmt": upload_fmt,
            "upload_jpeg_quality": upload_jpeg_quality,
            "upload_max_side": upload_max_side,
            "upload_grayscale": upload_grayscale,
            "strip_prefix_patterns": [rx.pattern for rx in boilerplate_patterns],
            "failed_pages": sorted(failed),
            "pages": [quality[k] for k in sorted(quality.keys())],
        }
        (debug_dir / "quality_report.json").write_text(
            json.dumps(report, ensure_ascii=False, indent=2), encoding="utf-8"
        )

    print(f"DONE: {out_md}")
    if debug_enabled:
        print(f"DEBUG: {debug_dir}")

    if failed:
        failed_list = ", ".join(str(p) for p in sorted(failed))
        print(f"ERROR: OCR failed for {len(failed)} page(s): {failed_list}", file=sys.stderr)
        sys.exit(1)

# Run the CLI only when this file is executed as a script, not when it is imported.
if __name__ == "__main__":
    main()
