import arxiv
import json
import re
import pandas as pd
import numpy as np
import os
from create_cite import *
from create_util import *

# arXiv API client created with no arguments, i.e. the library's defaults.
client = arxiv.Client()

# Resolve the topic name through the project's argument parser helper
# (presumably read from the command line -- confirm in create_util).
parser_manager = ArgParser()
topic_parser = parser_manager.add_topic_args()
args = topic_parser.parse()
topic = args.topic

# -----------------------------------------------------------------------------
# Project root: read from the ANON_PROJECT_ROOT environment variable, falling
# back to a relative placeholder directory when the variable is not set.
# -----------------------------------------------------------------------------
BASE_DIR = os.environ.get("ANON_PROJECT_ROOT", "./anonymous_project_root")
# Per-topic folders: <root>/lit_review/<topic> and <root>/meta/<topic>.
LIT_REVIEW_DIR, META_DIR = (
    os.path.join(BASE_DIR, section, topic) for section in ("lit_review", "meta")
)

# Load every JSON file from the literature-review folder and pool the entries
# found under each file's "paper_bank" key.
already_info = []
# Iterate in sorted order: os.listdir() returns names in arbitrary order, and
# that order decides both which duplicate record survives the deduplication
# below and the order of the resulting list.
for file_name in sorted(os.listdir(LIT_REVIEW_DIR)):
    file_path = os.path.join(LIT_REVIEW_DIR, file_name)
    if not os.path.isfile(file_path):
        continue
    try:
        with open(file_path, "r", encoding="utf-8") as f:
            data = json.load(f)
    except (json.JSONDecodeError, UnicodeDecodeError) as exc:
        # A stray non-JSON file in the folder (e.g. .DS_Store or an editor
        # backup) should not abort the whole run; report it and move on.
        print(f"Skipping unreadable JSON file {file_path}: {exc}")
        continue
    # `or []` also covers a "paper_bank" key that is present but null.
    already_info.extend(data.get("paper_bank") or [])

# Deduplicate by paper ID. For a repeated ID the last record seen is kept,
# placed at the position where that ID first appeared.
print("Paper count before deduplication:", len(already_info))
already_info = list({d["id"]: d for d in already_info}.values())
print("Paper count after deduplication:", len(already_info))

# -----------------------------------------------------------------------------
# Fetch arXiv metadata
# -----------------------------------------------------------------------------
# Captures a new-style arXiv identifier that follows "abs/", with an optional
# version suffix (e.g. "abs/2301.12345" or "abs/2301.12345v2"). Compiled once
# instead of being looked up on every loop iteration.
_ARXIV_ABS_RE = re.compile(r"abs/(\d+\.\d+(?:v\d+)?)")

results_all = []
for item in already_info:
    # dict.get()'s default only applies when the key is *missing*. A record
    # with "openAccessPdf": null (or a null "disclaimer") would otherwise
    # raise AttributeError / TypeError here, so fall back explicitly.
    open_access = item.get("openAccessPdf") or {}
    disclaimer = open_access.get("disclaimer") or ""
    match = _ARXIV_ABS_RE.search(disclaimer)
    if not match:
        # No arXiv link in the disclaimer: nothing to look up for this paper.
        continue
    arxiv_id = match.group(1)
    arxiv_content = search_arxiv_by_cite(arxiv_id)
    # Keep only lookups that returned a truthy result.
    if arxiv_content:
        results_all.append(arxiv_content)

# -----------------------------------------------------------------------------
# Turn every fetched arXiv result into one metadata record that also carries a
# ready-made BibTeX entry.
# -----------------------------------------------------------------------------
paper_info = []
for paper_result in results_all:
    author_names = [author.name for author in paper_result.authors]
    # BibTeX separates multiple authors with the word "and".
    authors = " and ".join(author_names)
    # Backslash-escape braces in the title before embedding it in the entry.
    title = paper_result.title.replace("{", "\\{").replace("}", "\\}")
    year = paper_result.published.year
    # Last path segment of entry_id (presumably the arXiv identifier,
    # possibly carrying a version suffix).
    arxiv_id = paper_result.entry_id.split("/")[-1]

    # Citation key: last whitespace-separated token of the first author's
    # name, then the identifier text before its first "v" with dots turned
    # into underscores.
    if author_names:
        first_author_last = author_names[0].split()[-1]
    else:
        first_author_last = "NoAuthor"
    id_without_version = arxiv_id.split("v")[0]
    citation_key = first_author_last + "_" + id_without_version.replace(".", "_")

    bibtex_entry = f"""
@misc{{{citation_key},
title={{{title}}},
author={{{authors}}},
year={{{year}}},
archivePrefix={{arXiv}},
primaryClass={{{paper_result.primary_category}}},
url={{{paper_result.entry_id}}}
}}"""

    # The stored title is the unescaped original; only the BibTeX entry uses
    # the escaped form.
    record = {
        "citation_key": citation_key,
        "entry_id": paper_result.entry_id,
        "title": paper_result.title,
        "year": year,
        "authors": author_names,
        "published_date": paper_result.published.strftime("%Y-%m-%d"),
        "summary": paper_result.summary,
        "primary_category": paper_result.primary_category,
        "pdf_url": paper_result.pdf_url,
        "comment": paper_result.comment,
        "bibtex_entry": bibtex_entry,
    }
    paper_info.append(record)
    print(bibtex_entry)

# -----------------------------------------------------------------------------
# Write the collected records to <META_DIR>/paper_info.json
# -----------------------------------------------------------------------------
# Create the output folder if needed; an existing folder is not an error.
os.makedirs(META_DIR, exist_ok=True)
output_json = os.path.join(META_DIR, "paper_info.json")
with open(output_json, "w", encoding="utf-8") as out_file:
    # ensure_ascii=False writes non-ASCII characters as-is rather than as
    # \uXXXX escapes.
    json.dump(paper_info, out_file, indent=4, ensure_ascii=False)

saved_count = len(paper_info)
print(f"Saved metadata for {saved_count} papers -> {output_json}")