# !pip install habanero
import pandas as pd
from habanero import Crossref
from tqdm.contrib import tenumerate
import os
import json

# Shared Crossref API client, created once at import time and used by
# get_paper_data() for every DOI lookup.
# NOTE(review): no arguments (e.g. a contact e-mail) are passed here, so the
# client presumably runs with habanero's defaults - confirm this is intended.
cr = Crossref()

def _load_cached_record(cache_path):
  """Return the record stored at cache_path, or None if it is absent or unreadable."""
  if not os.path.exists(cache_path):
    return None
  try:
    with open(cache_path, 'r') as f:
      return json.load(f)
  except ValueError as e:
    # A run interrupted mid-write can leave a truncated file. Treat it as a
    # cache miss so the record is rebuilt instead of aborting the whole job.
    print(f"Ignoring unreadable cache file {cache_path}, Error: {e}")
    return None


def _fetch_title_and_abstract(doi):
  """Query Crossref for doi and return (title, abstract); either may be None."""
  message = cr.works(ids=doi)['message']
  # 'title' is a list and may be missing or empty; take the first entry if any,
  # without letting an empty list discard an abstract that is present.
  titles = message.get('title') or []
  title = titles[0] if titles else None
  return title, message.get('abstract')


def get_paper_data(doi_list, id_list, temp_dir="temp_data"):
  """
  Creates a DataFrame containing paper titles and abstracts from a list of DOIs.

  Each (DOI, ID) pair is looked up through the module-level Crossref client
  and the resulting record is written to "<temp_dir>/<id>.json". On later
  runs a readable cache file is used instead of querying Crossref again.
  Failed lookups are cached as well (with Title and Abstract set to None),
  so delete the corresponding file to retry a DOI.

  Args:
    doi_list: A list of DOIs.
    id_list: A list of COD IDs, paired positionally with doi_list. Each ID
      must be convertible with int().
    temp_dir: Directory for temporary file storage (created if missing).

  Returns:
    A DataFrame with one row per (DOI, ID) pair. Freshly fetched rows have
    the columns 'DOI', 'File', 'Title' and 'Abstract'; cached rows carry
    whatever the cache file contains.
  """

  data = []
  os.makedirs(temp_dir, exist_ok=True)

  for _, (doi, cod_id) in tenumerate(zip(doi_list, id_list)):
    temp_filepath = os.path.join(temp_dir, f"{cod_id}.json")

    cached = _load_cached_record(temp_filepath)
    if cached is not None:
      data.append(cached)
      print(f"Skipping ID {cod_id} - response already cached.")
      continue

    # Only the network lookup is guarded: a failure for one DOI must not stop
    # the batch, but errors while recording the result should not be mistaken
    # for fetch errors (which previously could append the same row twice).
    try:
      title, abstract = _fetch_title_and_abstract(doi)
    except Exception as e:
      print(f"Error fetching data for DOI: {doi}, Error: {e}")
      title, abstract = None, None

    # int(): IDs read from a DataFrame column are e.g. numpy int64, which the
    # json module cannot serialize.
    temp_data = {'DOI': doi, 'File': int(cod_id), 'Title': title, 'Abstract': abstract}
    data.append(temp_data)

    # Serialize before opening the file so a serialization error cannot leave
    # an empty or partial cache file behind.
    payload = json.dumps(temp_data)
    with open(temp_filepath, 'w') as f:
      f.write(payload)
  return pd.DataFrame(data)


if __name__ == '__main__':
    # Change these two paths to point at the metadata snapshot to process.
    save_root_path = "../data/cod_full_20240331"
    cod_metadata = pd.read_csv("../data/cod_metadata_20240331.csv")

    # Create the output directory up front so that a missing folder is dealt
    # with before any Crossref requests are made, not when the CSV is written.
    os.makedirs(save_root_path, exist_ok=True)

    doi_list = cod_metadata['doi'].values
    id_list = cod_metadata['file'].values

    df = get_paper_data(doi_list, id_list)
    # Keep only the papers for which both a title and an abstract were found.
    filtered_df = df.dropna(subset=['Title', 'Abstract'])
    filtered_df.to_csv(f"{save_root_path}/abstruct_doi_filtered.csv")