# Script for checking the intersection between Flickr30K and VIST. Oscar pretraining used Flickr30K, so it is important to know whether the two datasets share images.
import glob
import json
import os

# Directory that is globbed for "*.txt" files; each file's base name (without
# the extension) is used as a Flickr30K image id.
flickr30K = "/n/fs/nlp-xxxx/datasets/flicker30K/Sentences"
# Directory holding the VIST "<split>.story-in-sequence.json" files, which are
# read for the train, val and test splits.
vist_stories = "/n/fs/nlp-xxxx/datasets/VIST/sis"

def _load_flickr30k_ids(sentences_dir):
    """Return the Flickr30K image ids found in *sentences_dir*.

    An id is the base name, without extension, of each ``*.txt`` file in the
    directory. The order is whatever ``glob`` yields.

    Raises:
        FileNotFoundError: if no ``*.txt`` file is found. An empty id list
            would make the overlap trivially empty and the report misleading,
            so a wrong or missing directory is treated as an error, just like
            a missing VIST file already is.
    """
    sentence_files = glob.glob(os.path.join(sentences_dir, "*.txt"))
    if not sentence_files:
        raise FileNotFoundError(
            "no Flickr30K sentence files (*.txt) found in %r" % sentences_dir
        )
    return [os.path.splitext(os.path.basename(path))[0] for path in sentence_files]


def _load_vist_ids(stories_dir, splits=("train", "val", "test")):
    """Return the Flickr photo ids referenced by the VIST story annotations.

    For every split, ``<split>.story-in-sequence.json`` is read from
    *stories_dir* and the ``photo_flickr_id`` of the first element of each
    entry in ``annotations`` is collected. Duplicates are kept and the order
    of appearance is preserved.

    Raises:
        FileNotFoundError: if a split file does not exist (raised by ``open``).
    """
    photo_ids = []
    for split in splits:
        story_path = os.path.join(stories_dir, "%s.story-in-sequence.json" % split)
        # JSON is UTF-8 by specification; do not depend on the locale default.
        with open(story_path, encoding="utf-8") as f:
            data = json.load(f)
        # Normalise to str so that ids compare equal to the Flickr30K filename
        # stems even if the JSON stores them as numbers.
        photo_ids.extend(str(ann[0]["photo_flickr_id"]) for ann in data["annotations"])
    return photo_ids


def main(flickr_dir, vist_dir):
    """Print and return the image ids shared by Flickr30K and VIST.

    Args:
        flickr_dir: directory containing the Flickr30K ``*.txt`` sentence files.
        vist_dir: directory containing the VIST story-in-sequence JSON files.

    Returns:
        The set of ids present in both datasets.

    Raises:
        FileNotFoundError: if *flickr_dir* has no ``*.txt`` files or a VIST
            split file is missing.
    """
    flickr_img_ids = _load_flickr30k_ids(flickr_dir)
    vist_img_ids = _load_vist_ids(vist_dir)
    intersecting_images = set(vist_img_ids).intersection(flickr_img_ids)
    print("sample of vist img ids", vist_img_ids[:5])
    print("sample of flick img ids", flickr_img_ids[:5])
    print("intersecting image ids", intersecting_images)
    print(
        "number of intersecting images",
        len(intersecting_images),
    )
    return intersecting_images


if __name__ == "__main__":
    main(flickr30K, vist_stories)
