import os
import requests
from pathlib import Path
from tqdm import tqdm
import argparse
import re
from lxml import etree

# Prefix-to-URI map handed to lxml XPath calls so that queries in this file
# can address elements in the TEI namespace with the "tei:" prefix.
NS = {
    "tei": "http://www.tei-c.org/ns/1.0",
}


def convert_pdf_to_xml(pdf_path, grobid_url, timeout=(10, 600)):
    """Upload one PDF to a GROBID server and return the full-text result.

    The file is posted to the ``/api/processFulltextDocument`` endpoint with
    header, citation and funder consolidation all switched off.

    Args:
        pdf_path: Path of the PDF file to upload.
        grobid_url: Base URL of the GROBID server. Trailing slashes are
            ignored so that ``http://host:8070/`` and ``http://host:8070``
            address the same endpoint.
        timeout: Forwarded to ``requests.post``. Either a single number of
            seconds or a ``(connect, read)`` tuple; ``None`` waits forever.
            The default allows 10 s to connect and 10 min for the server to
            answer, so a stalled server cannot hang the caller indefinitely.

    Returns:
        The response body decoded as text.

    Raises:
        OSError: If ``pdf_path`` cannot be opened.
        requests.HTTPError: If the server answers with an error status.
        requests.RequestException: On connection failures or timeouts.
    """
    base_url = str(grobid_url).rstrip("/")
    endpoint = f"{base_url}/api/processFulltextDocument"
    with open(pdf_path, "rb") as f:
        response = requests.post(
            endpoint,
            files={"input": f},
            data={
                "consolidateHeader": "0",
                "consolidateCitations": "0",
                "consolidateFunders": "0",
            },
            timeout=timeout,
        )
    # The upload is complete once post() returns, so the file handle is
    # released before the status check and decoding.
    response.raise_for_status()
    return response.text


def extract_body_paragraphs_from_xml(xml_path):
    """Return the text of every TEI ``<p>`` found under ``<body>``.

    Each paragraph's text nodes are joined with single spaces, runs of two
    or more whitespace characters are collapsed to one space, and the result
    is stripped. Paragraphs that contain only whitespace are dropped.

    Args:
        xml_path: Path of the XML file to parse.

    Returns:
        The paragraphs separated by blank lines, or ``None`` when no
        non-empty paragraph exists or when any error occurs (the error is
        printed rather than raised).
    """
    try:
        document = etree.parse(xml_path)
        cleaned = []
        for node in document.xpath("//tei:body//tei:p", namespaces=NS):
            # NOTE(review): joining text nodes with a space means punctuation
            # that directly follows an inline child element ends up preceded
            # by a space (only runs of 2+ whitespace are collapsed below).
            raw = " ".join(node.xpath(".//text()"))
            if not raw.strip():
                continue
            cleaned.append(re.sub(r"\s{2,}", " ", raw).strip())
        if not cleaned:
            return None
        return "\n\n".join(cleaned)
    except Exception as e:
        # Deliberate best-effort: report the problem and let the caller
        # carry on with the next file.
        print(f"Failed to extract body text from {xml_path}: {e}")
        return None


def main(input_dir, output_dir, grobid_url):
    """Convert every ``*.pdf`` in a directory to XML and plain body text.

    For each PDF, ``<stem>.xml`` is produced via ``convert_pdf_to_xml``
    unless it already exists in ``output_dir``; then ``<stem>.txt`` is
    (re)written from that XML via ``extract_body_paragraphs_from_xml``.

    A PDF whose conversion fails is reported and skipped so that one bad
    file does not prevent the remaining files from being processed. A
    summary line is printed at the end when any conversion failed.

    Args:
        input_dir: Directory that is searched (non-recursively) for PDFs.
        output_dir: Directory receiving the XML and TXT files; created if
            missing.
        grobid_url: Base URL of the GROBID server.
    """
    # Sorted so that the processing order is the same on every run.
    pdf_files = sorted(Path(input_dir).glob("*.pdf"))
    os.makedirs(output_dir, exist_ok=True)
    print(f"Found {len(pdf_files)} PDF files in {input_dir}")

    failed = []
    for pdf in tqdm(pdf_files):
        file_name = pdf.stem
        xml_output_path = os.path.join(output_dir, file_name + ".xml")
        txt_output_path = os.path.join(output_dir, file_name + ".txt")

        # tqdm.write() prints without corrupting the active progress bar.
        if os.path.exists(xml_output_path):
            tqdm.write(f"Skipped: {xml_output_path} (already exists)")
        else:
            tqdm.write(f"Processing: {file_name}")
            try:
                tei_xml = convert_pdf_to_xml(pdf, grobid_url)
            except (requests.RequestException, OSError) as e:
                tqdm.write(f"Failed to convert {pdf}: {e}")
                failed.append(pdf)
                continue
            # Write to a temporary name and rename afterwards: the
            # "already exists" check above treats any existing XML as
            # complete, so a half-written file must never carry that name.
            tmp_output_path = xml_output_path + ".part"
            with open(tmp_output_path, "w", encoding="utf-8") as f:
                f.write(tei_xml)
            os.replace(tmp_output_path, xml_output_path)

        # Extract body text from XML and output as .txt
        body_text = extract_body_paragraphs_from_xml(xml_output_path)
        if body_text:
            with open(txt_output_path, "w", encoding="utf-8") as f:
                f.write(body_text)
        else:
            tqdm.write(f"No body text extracted for: {file_name}")

    if failed:
        print(f"{len(failed)} of {len(pdf_files)} PDF files could not be converted")

if __name__ == "__main__":
    # Command-line options as (flag, argparse keyword arguments), listed in
    # the order they are registered with the parser.
    cli_options = (
        (
            "--input-dir",
            {
                "type": str,
                "required": True,
                "help": "Directory containing PDF files.",
            },
        ),
        (
            "--output-dir",
            {
                "type": str,
                "required": True,
                "help": "Directory to save XML and TXT files.",
            },
        ),
        (
            "--grobid-url",
            {
                "type": str,
                "default": "http://localhost:8070",
                "help": "GROBID server URL.",
            },
        ),
    )

    cli = argparse.ArgumentParser(
        description="Convert PDFs to XML using GROBID and extract body text."
    )
    for flag, options in cli_options:
        cli.add_argument(flag, **options)

    opts = cli.parse_args()
    main(opts.input_dir, opts.output_dir, opts.grobid_url)
