#!/usr/bin/env python3
import os
import json
import glob
from pathlib import Path
from extraction_pdf_agent import extract_structured_info   # Reuse original script function

# Root directory that main() searches recursively for "*.pdf" files.
SINGLE_PAPER_DIR = "/home/yueke//autoleaderboard/output/pdfs_loose"
# Directory holding one "<pdf stem>.json" result per PDF. It is a relative
# path, so it resolves against the current working directory.
OUTPUT_DIR       = "extracted_content_hotpotqa_pdfs_loose"
# Log file inside OUTPUT_DIR; main() appends one line per failed retry.
RETRY_LOG        = Path(OUTPUT_DIR) / "retry_error.log"
# Passed unchanged to extract_structured_info() as its second and third
# arguments. NOTE(review): "EMPTY" is presumably a placeholder key for a local
# endpoint that does not check credentials - confirm.
API_KEY          = "EMPTY"
API_BASE         = "http://localhost:8000/v1"

def need_retry(pdf_path: Path) -> bool:
    """Tell whether the extraction for *pdf_path* has to be run again.

    The expected result file is ``OUTPUT_DIR/<pdf stem>.json``. A retry is
    requested when that file is absent, when it cannot be read or parsed, or
    when the parsed object holds no truthy value at all (for example an empty
    dict, or a dict whose values are all null/empty).
    """
    result_path = Path(OUTPUT_DIR) / (pdf_path.stem + ".json")

    # No result on disk yet: the PDF was never processed successfully.
    if not result_path.exists():
        return True

    try:
        payload = json.loads(result_path.read_text(encoding='utf-8'))
        # A single truthy value is enough to consider the result usable.
        for value in payload.values():
            if value:
                return False
        return True
    except Exception:
        # Unreadable file, invalid JSON, or a payload without .values()
        # all count as "needs retry".
        return True

def main():
    """Re-run extraction for every PDF whose JSON result is missing or empty.

    Recursively collects ``*.pdf`` files under ``SINGLE_PAPER_DIR``, keeps the
    ones for which ``need_retry`` returns True, and calls
    ``extract_structured_info`` on each of them. A successful result is
    written to ``OUTPUT_DIR/<pdf stem>.json``. Any failure is appended to
    ``RETRY_LOG`` and echoed to stdout, and processing continues with the
    next PDF.
    """
    Path(OUTPUT_DIR).mkdir(parents=True, exist_ok=True)
    pdfs = list(Path(SINGLE_PAPER_DIR).rglob("*.pdf"))

    retry_list = [p for p in pdfs if need_retry(p)]
    print(f"Total {len(retry_list)} PDFs need retry")

    with open(RETRY_LOG, 'a', encoding='utf-8') as log_f:
        for pdf in retry_list:
            json_path = Path(OUTPUT_DIR) / f"{pdf.stem}.json"

            # Only the extractor call is guarded here, so a ValueError raised
            # later while serializing is not misreported as a corrupt PDF.
            try:
                result = extract_structured_info(str(pdf), API_KEY, API_BASE)
            except ValueError as e:
                # Treated as PDF corruption / structural issues.
                # NOTE(review): assumes the extractor raises ValueError only
                # for unreadable PDFs - confirm against extraction_pdf_agent.
                log_f.write(f"[RETRY SKIP] {pdf}: PDF file corrupted - {e}\n")
                print(f"[RETRY SKIP] {pdf.name}: PDF file corrupted")
                continue
            except Exception as e:
                # Other errors (such as network, API, etc.)
                log_f.write(f"[RETRY ERROR] {pdf}: {e}\n")
                print(f"[RETRY ERROR] {pdf.name}: {e}")
                continue

            try:
                # Serialize before opening the file: opening with 'w'
                # truncates it, so a serialization failure must happen first
                # to avoid leaving a truncated result behind.
                serialized = json.dumps(result, ensure_ascii=False, indent=2)
                with open(json_path, 'w', encoding='utf-8') as out_f:
                    out_f.write(serialized)
            except Exception as e:
                # Serialization or disk errors: log and keep going, as the
                # script is best-effort per PDF.
                log_f.write(f"[RETRY ERROR] {pdf}: {e}\n")
                print(f"[RETRY ERROR] {pdf.name}: {e}")
                continue

            print(f"[RETRY OK] {pdf.name}")

if __name__ == "__main__":
    main()