# !pip install habanero
import pandas as pd
from habanero import Crossref
from tqdm import tqdm
from tqdm.contrib import tenumerate
import os
import json

# Module-level Crossref client; get_paper_data() issues its DOI lookups through it.
cr = Crossref()

def _extract_title_abstract(message):
  """Return ``(title, abstract)`` taken from a Crossref ``message`` mapping.

  The title is the first entry of ``message['title']``; it is None when the
  key is missing or the list is empty. The abstract is None when the
  ``abstract`` key is missing.
  """
  titles = message.get('title')
  title = titles[0] if titles else None
  return title, message.get('abstract')


def get_paper_data(doi_list, id_list, temp_dir="temp_data"):
  """
    Build a DataFrame holding the title and abstract of each paper in a list of DOIs.

    Each result is written to ``<temp_dir>/<cod_id>.json``. When that file
    already exists it is loaded instead of querying Crossref, so an
    interrupted run can be resumed.

    Args:
      doi_list: Iterable of DOIs, paired positionally with ``id_list``.
      id_list: Iterable of COD IDs; each must be convertible with ``int()``.
      temp_dir: Directory for the per-ID JSON cache files (created if missing).

    Returns:
      DataFrame with one row per (DOI, ID) pair and the columns
      ``DOI``, ``File``, ``Title`` and ``Abstract``. ``Title`` and
      ``Abstract`` are None when the lookup failed or the field is absent.
  """

  data = []
  os.makedirs(temp_dir, exist_ok=True)

  for _, (doi, cod_id) in tenumerate(zip(doi_list, id_list)):
    temp_filepath = os.path.join(temp_dir, f"{cod_id}.json")
    if os.path.exists(temp_filepath):
      with open(temp_filepath, 'r', encoding='utf-8') as f:
        data.append(json.load(f))
      print(f"Skipping ID {cod_id} - response already cached.")
      continue

    # Only the lookup and field extraction are guarded: a failure while
    # writing the cache file must not be reported as a fetch error, nor
    # cause the record to be appended twice.
    try:
      message = cr.works(ids=doi)['message']
      title, abstract = _extract_title_abstract(message)
    except Exception as e:
      print(f"Error fetching data for DOI: {doi}, Error: {e}")
      title, abstract = None, None

    # int() turns NumPy integer IDs into plain ints, which json can serialize.
    temp_data = {'DOI': doi, 'File': int(cod_id), 'Title': title, 'Abstract': abstract}
    data.append(temp_data)

    # NOTE: failed lookups are cached as well, so they are not retried on
    # later runs unless the corresponding JSON file is deleted.
    with open(temp_filepath, 'w', encoding='utf-8') as f:
      json.dump(temp_data, f)
  return pd.DataFrame(data)


def filter_dataframe(df):
  """
    Keep only the rows that have both a title and an abstract.

    Args: df: Input DataFrame with 'Title' and 'Abstract' columns.

    Returns: DataFrame: A new DataFrame without the rows where either column is missing.
  """

  required_columns = ['Title', 'Abstract']
  # A row is dropped as soon as any one of the required columns is null.
  complete_rows = df.dropna(axis=0, how='any', subset=required_columns)
  return complete_rows


if __name__ == '__main__':
    # Script entry point: look up every DOI listed in the COD metadata CSV,
    # keep the papers that have both a title and an abstract, and write them
    # out as a CSV plus a plain-text "ID / Title / Abstract" listing.
    save_root_path = "../data/cod_full_20240331"
    cod_metadata = pd.read_csv("../data/cod_metadata_20240331.csv")

    # Create the output directory before the crawl so that a missing folder
    # does not surface only after all lookups have finished.
    os.makedirs(save_root_path, exist_ok=True)

    doi_list = cod_metadata['doi'].values
    id_list = cod_metadata['file'].values

    df = get_paper_data(doi_list, id_list)

    filtered_df = filter_dataframe(df)
    filtered_df.to_csv(f"{save_root_path}/abstruct_doi_filtered.csv")

    # Walk the three columns in lockstep instead of doing positional row
    # lookups for every field.
    output_lines = [
        f"ID {cod_id}\nTitle: {title}\nAbstract: {abstract}\n"
        for cod_id, title, abstract in zip(
            filtered_df['File'], filtered_df['Title'], filtered_df['Abstract']
        )
    ]

    # Explicit UTF-8: titles and abstracts may contain non-ASCII characters
    # that a platform-default encoding could fail to encode.
    with open(f'{save_root_path}/title_abst_pair_all.txt', 'w', encoding='utf-8') as file:
        file.writelines(output_lines)
