import arxiv
import json
import re
import pandas as pd
import numpy as np
import os
from create_cite import *
from create_util import *

# Shared arXiv API client, created with the library's default settings.
client = arxiv.Client()

# Parse the command-line options (topic and CSV arguments).
parser_manager = ArgParser()
args = parser_manager.add_topic_args().add_csv_args().parse()
topic, original_csv_path = args.topic, args.original_csv_path

# Load the input CSV and pull its 'title' column out as a plain list;
# these titles are the search queries used further down.
df = pd.read_csv(original_csv_path)
title_column = df['title']
names = title_column.tolist()
print(names)

# Imported under an alias so the Google search function can never be shadowed
# by a local variable (e.g. an arxiv.Search object) between retry attempts.
from googlesearch import search as google_search

# Maximum number of attempts (Google lookup + arXiv fetch) per title.
MAX_ATTEMPTS = 5

# Matches the identifier in an arxiv.org abs/pdf/html URL: new-style IDs
# (2405.19320, optionally with a vN suffix) and old-style IDs that carry the
# archive name (hep-th/9901001, math.GT/0309136).
_ARXIV_ID_RE = re.compile(
    r"arxiv\.org/(?:abs|pdf|html)/"
    r"((?:\d{4}\.\d{4,5}|[a-z\-]+(?:\.[A-Z]{2})?/\d{7})(?:v\d+)?)"
)


def _extract_arxiv_id(url):
    """Return the arXiv identifier embedded in *url*, or None if there is none.

    Handles trailing ``.pdf`` suffixes, trailing slashes and old-style
    ``archive/number`` identifiers, which a plain ``url.split("/")[-1]``
    would get wrong.
    """
    match = _ARXIV_ID_RE.search(url)
    return match.group(1) if match else None


# results_all[i] is the arXiv result that was found for original_titles[i].
results_all = []
original_titles = []

for name in names:
    # The CSV title is used verbatim as the Google query; the first arXiv
    # link among the top hits decides which paper is fetched.
    for attempt in range(1, MAX_ATTEMPTS + 1):
        try:
            # Reset for every attempt so an ID from a previous title can
            # never leak into this one.
            paper_id = None
            for url in google_search(name, num_results=8):
                print(url)
                paper_id = _extract_arxiv_id(url)
                if paper_id is not None:
                    break

            if paper_id is None:
                # Not a transient failure: repeating the same query is
                # unlikely to surface an arXiv link, so skip this title.
                print(f"No arXiv link found for: {name}")
                break

            # Materialise the results before recording anything, so a failure
            # halfway through cannot leave partial entries that a retry would
            # then duplicate.
            fetched = list(client.results(arxiv.Search(id_list=[paper_id])))
        except Exception as e:
            # Deliberately broad: both the Google scraper and the arXiv client
            # can fail in many network-related ways, and one bad title must
            # not abort the whole batch.
            print(f"An error occurred: {e}")
            if attempt == MAX_ATTEMPTS:
                print(f"Retried {MAX_ATTEMPTS} times, still failed.")
            continue

        for result in fetched:
            results_all.append(result)
            original_titles.append(name)
            print(f"Found: {result.title}")

        print(f"Successfully retrieved {len(results_all)} results.")
        break

paper_info = []
# Pair every result with the CSV title that produced it, so "original_title"
# is correct for each record.
for name, paper_result in zip(original_titles, results_all):
    author_names = [author.name for author in paper_result.authors]
    authors = " and ".join(author_names)
    title = paper_result.title.replace('{', '\\{').replace('}', '\\}')  # Escape braces for BibTeX
    year = paper_result.published.year
    arxiv_id = paper_result.entry_id.split('/')[-1]  # Keep only the ID (e.g., 2405.19320v4)

    # Citation key: <FirstAuthorLastName>_<arXiv ID without version, dots as
    # underscores>, e.g. Smith_2405_19320.
    first_author_lastname = author_names[0].split(' ')[-1] if author_names else "NoAuthor"
    citation_key = f"{first_author_lastname}_{arxiv_id.split('v')[0].replace('.', '_')}"

    bibtex_entry = f"""
@misc{{{citation_key},
title={{{title}}},
author={{{authors}}},
year={{{year}}},
archivePrefix={{arXiv}},
primaryClass={{{paper_result.primary_category}}},
url={{{paper_result.entry_id}}}
}}"""

    paper_info_dict = {
        "citation_key": citation_key,
        "entry_id": paper_result.entry_id,
        "title": paper_result.title,
        "original_title": name,
        "year": year,
        "authors": author_names,
        "published_date": paper_result.published.strftime("%Y-%m-%d"),  # Format date
        "summary": paper_result.summary,
        "primary_category": paper_result.primary_category,
        "pdf_url": paper_result.pdf_url,  # Provided by the arxiv library
        "comment": paper_result.comment,  # If available
        "bibtex_entry": bibtex_entry
    }
    print(bibtex_entry)
    paper_info.append(paper_info_dict)

# Base directory under which "<topic>/humanpaper/paper_info.json" is written.
# NOTE(review): with an empty base the f-string below resolves to
# "/<topic>/humanpaper", i.e. directly under the filesystem root -- confirm
# this is intended or set a real base directory here.
path = ""

output_dir = f"{path}/{topic}/humanpaper"
# makedirs creates the intermediate "<topic>" directory as well, and
# exist_ok=True makes a separate existence check unnecessary.
os.makedirs(output_dir, exist_ok=True)

# Save JSON file. The encoding is explicit because ensure_ascii=False writes
# non-ASCII characters (author names, abstracts) as-is, which would fail on
# platforms whose default encoding is not UTF-8.
with open(f"{output_dir}/paper_info.json", 'w', encoding="utf-8") as f:
    json.dump(paper_info, f, indent=4, ensure_ascii=False)