import bibtexparser
import requests
from bs4 import BeautifulSoup
import re
import time

def _normalize_title(text):
    """Return *text* case-folded, with BibTeX braces removed and whitespace collapsed."""
    without_braces = text.replace("{", "").replace("}", "")
    return " ".join(without_braces.split()).casefold()


def _find_scholar_id(container, scheme):
    """Return the ID from the first ``<scheme>:<id>:scholar.google.com`` link.

    Searches the ``<a href=...>`` tags inside *container* (a BeautifulSoup tag)
    and returns the captured ID, or None when no such link exists.
    """
    pattern = re.compile(rf'{scheme}:([^:]+):scholar\.google\.com')
    link = container.find('a', href=pattern)
    if link is None:
        return None
    match = pattern.search(link['href'])
    return match.group(1) if match else None


def get_google_scholar_bibtex(title):
    """Search Google Scholar for a paper title and return its BibTeX entry.

    Only the first search result is considered. The paper ID is taken from a
    ``related:`` link in that result, or from an ``info:`` link in its title;
    the ID is then used to open the "cite" page and follow its BibTeX link.

    Args:
        title: Paper title to search for.

    Returns:
        The BibTeX text served by Google Scholar, or None when the title is
        empty, nothing usable was found, or a request failed. Failures are
        reported with ``print`` rather than raised.
    """
    # Local import keeps this block self-contained without touching file imports.
    from urllib.parse import urljoin

    # An empty query cannot identify a paper; skip the network round trip.
    if not title or not title.strip():
        print("Cannot search Google Scholar: the title is empty.")
        return None

    search_url = f"https://scholar.google.com/scholar?q={requests.utils.quote(title)}"
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
    }

    try:
        response = requests.get(search_url, headers=headers, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')

        # Each search result lives in a 'gs_ri' div; only the first one is used.
        first_result_block = soup.find('div', class_='gs_ri')
        if first_result_block is None:
            print(f"No search results found for '{title}'.")
            return None

        title_h3 = first_result_block.find('h3', class_='gs_rt')

        # Prefer the ID from a 'related' link; fall back to an 'info' link in the title.
        paper_id = _find_scholar_id(first_result_block, 'related')
        if not paper_id and title_h3 is not None:
            paper_id = _find_scholar_id(title_h3, 'info')

        # Warn (but keep going) when the first hit does not look like the requested paper.
        if title_h3 is not None:
            title_link = title_h3.find('a')
            # Some results have no anchor in the heading; use the heading text then.
            title_node = title_link if title_link is not None else title_h3
            found_title = title_node.get_text()
            if _normalize_title(found_title) != _normalize_title(title):
                print(f"Warning::: \n Original Paper title:: {title}\n Found:: {found_title.lower()}")

        if not paper_id:
            print(f"Could not extract paper ID for '{title}'.")
            return None

        # Construct the cite URL directly using the extracted paper_id.
        # 'hl=en' requests the English page so the link text below is 'BibTeX'.
        cite_url = f"https://scholar.google.com/scholar?q=info:{paper_id}:scholar.google.com/&output=cite&scirp=0&hl=en"

        cite_response = requests.get(cite_url, headers=headers, timeout=10)
        cite_response.raise_for_status()
        cite_soup = BeautifulSoup(cite_response.text, 'html.parser')

        bibtex_link = cite_soup.find('a', string='BibTeX')
        if bibtex_link is None or not bibtex_link.has_attr('href'):
            print(f"No BibTeX link found on the cite page for '{title}'.")
            return None

        # urljoin leaves an absolute href untouched and resolves a relative one
        # against the cite page, so both forms work.
        bibtex_url = urljoin(cite_url, bibtex_link['href'])

        bibtex_response = requests.get(bibtex_url, headers=headers, timeout=10)
        bibtex_response.raise_for_status()
        return bibtex_response.text

    except requests.exceptions.RequestException as e:
        print(f"Error fetching data for '{title}': {e}")
    except Exception as e:
        # Best-effort boundary: one bad page must not abort a whole batch run.
        print(f"An unexpected error occurred for '{title}': {e}")
    return None

def update_bib_file(bib_file_path, delay_seconds=10):
    """
    Refresh every entry of a BibTeX file from Google Scholar, in place.

    Reads the file, looks up each entry's title with
    get_google_scholar_bibtex, and replaces the entry with the first entry of
    the returned BibTeX while preserving the original citation key. Entries
    without a title, and entries whose lookup or parsing fails, are kept
    unchanged. The result is written back to the same path.

    Args:
        bib_file_path: Path of the BibTeX file to read and overwrite.
        delay_seconds: Pause between consecutive Google Scholar lookups.
            Values <= 0 disable the pause.

    Returns:
        None. Progress and errors are reported with ``print``; read and write
        failures are reported, not raised.
    """
    try:
        with open(bib_file_path, 'r', encoding='utf-8') as bib_file:
            bib_database = bibtexparser.load(bib_file)
    except FileNotFoundError:
        print(f"Error: BibTeX file not found at {bib_file_path}")
        return
    except Exception as e:
        print(f"Error reading BibTeX file: {e}")
        return

    updated_entries = []
    lookups_done = 0
    for entry in bib_database.entries:
        original_key = entry['ID']
        title = entry.get('title')

        if not title:
            print(f"Warning: Entry with key '{original_key}' has no title. Skipping.")
            updated_entries.append(entry)
            continue

        # Be polite to Google Scholar: pause between lookups, but not before
        # the first one or after the last one.
        if lookups_done and delay_seconds > 0:
            time.sleep(delay_seconds)
        lookups_done += 1

        print(f"Searching for '{title}' (key: {original_key})...")
        new_bibtex_content = get_google_scholar_bibtex(title)

        new_entry = None
        if new_bibtex_content:
            try:
                new_bib_database = bibtexparser.loads(new_bibtex_content)
            except Exception as e:
                print(f"Error parsing new BibTeX for '{title}': {e}. Keeping original.")
            else:
                if new_bib_database.entries:
                    new_entry = new_bib_database.entries[0]
                    # Preserve the original citation key
                    new_entry['ID'] = original_key
                    print(f"Successfully updated entry for '{title}'.")
                else:
                    print(f"Warning: No BibTeX entry found in Google Scholar response for '{title}'. Keeping original.")
        else:
            print(f"Warning: Could not find BibTeX on Google Scholar for '{title}'. Keeping original.")

        updated_entries.append(new_entry if new_entry is not None else entry)

    # Reuse the loaded database instead of a fresh one so that the file's
    # @string, @preamble and @comment blocks survive the rewrite.
    bib_database.entries = updated_entries

    try:
        with open(bib_file_path, 'w', encoding='utf-8') as bib_file:
            bibtexparser.dump(bib_database, bib_file)
        print(f"\nSuccessfully updated {bib_file_path}")
    except Exception as e:
        print(f"Error writing updated BibTeX file: {e}")

if __name__ == "__main__":
    import sys

    # Path used when no BibTeX file is given on the command line.
    default_bib_file_path = '/Users/clf/Downloads/Agents4Science/选题/paper/paper/references.bib'
    # Usage: python <script> [path/to/references.bib]
    bib_file_path = sys.argv[1] if len(sys.argv) > 1 else default_bib_file_path
    update_bib_file(bib_file_path)