import pandas as pd
from pycocotools.coco import COCO
import inflect
import re
import spacy
# Load the spaCy "en_core_web_sm" pipeline once at import time so that
# extract_nouns() can reuse the same instance on every call.
nlp = spacy.load("en_core_web_sm")


def extract_nouns(text):
    """Collect the noun tokens that appear in a piece of text.

    The text is run through the module-level spaCy pipeline ``nlp`` and the
    surface form of every token whose coarse part-of-speech tag is exactly
    ``"NOUN"`` is kept, in document order. The result holds individual
    tokens (not multi-word phrases) and repeated nouns are not deduplicated.

    Args:
        text: The text string to be processed.

    Returns:
        A list of strings, one per noun token found in the text.
    """
    return [tok.text for tok in nlp(text) if tok.pos_ == "NOUN"]


def construct_expanded_dataset(annotation_file, captions_file):
    """Build a table of captions that mention more than one noun.

    Every image id listed in ``annotation_file`` is looked up in
    ``captions_file``; each of its captions is passed to ``extract_nouns``
    and kept only if at least two nouns are found.

    Args:
        annotation_file: Path to a COCO-format annotation JSON. It is used
            solely to enumerate the image ids to process.
        captions_file: Path to a COCO-format captions JSON whose annotations
            carry a ``'caption'`` field.

    Returns:
        A pandas DataFrame with the columns ``'image id'``, ``'caption'`` and
        ``'objs'`` (the list of nouns extracted from the caption), one row
        per retained caption.
    """
    # The first annotation file only supplies the set of image ids; its
    # per-image annotations are never consulted, so they are not loaded.
    coco = COCO(annotation_file)
    captions_data = COCO(captions_file)

    expanded_data = []
    for idx, img_id in enumerate(coco.getImgIds()):
        print(f"Processing {idx}")

        caption_ann_ids = captions_data.getAnnIds(imgIds=[img_id])
        for ann in captions_data.loadAnns(caption_ann_ids):
            caption = ann['caption']
            nouns = extract_nouns(caption)
            # A caption with zero or one noun is dropped.
            if len(nouns) > 1:
                expanded_data.append([img_id, caption, nouns])

    return pd.DataFrame(expanded_data, columns=['image id', 'caption', 'objs'])

    
def main():
    """Build the caption/noun table and write it out as a CSV file.

    The input annotation files and the output location are hard-coded
    relative paths; the DataFrame is written without its index column.
    """
    # Replace these with the paths of your own COCO checkout.
    instances_json = '../../data/coco/annotations/instances_val2017.json'
    captions_json = '../../data/coco/annotations/captions_val2017.json'
    csv_path = '../../data/coco/coco_ours.csv'

    construct_expanded_dataset(instances_json, captions_json).to_csv(
        csv_path, index=False
    )

# Run the export only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
