#%%
import os
import json
from collections import Counter

# Sanity checks for the train/val split:
#   1. every image listed in train.json exists in the train image directory
#   2. train.json lists no image more than once
#   3. val.json lists no image more than once
#   4. no image appears in both train.json and val.json
# Images are compared by file name only (directory components are dropped).

# Paths
train_json_path = '/fs/scratch/PAS2099/Jiacheng/Orientation/train/train.json'
train_img_dir   = '/fs/scratch/PAS2099/Jiacheng/Orientation/train/image'
val_json_path   = '/fs/scratch/PAS2099/Jiacheng/Orientation/val/val.json'


def find_duplicates(names):
    """Return the names occurring more than once, in first-seen order."""
    return [name for name, count in Counter(names).items() if count > 1]


# Load train.json
with open(train_json_path, 'r', encoding='utf-8') as f:
    train_data = json.load(f)

# Extract train image filenames (strip any directory part of the entry)
train_imgs = [os.path.basename(item['image']) for item in train_data]
# Built once and reused by the disk check and the overlap check below.
train_img_set = set(train_imgs)

# 1. Check that every train.json image exists on disk
# NOTE: os.listdir returns every entry in the directory, so sub-directories
# or hidden files (if any) are reported as "extra".
disk_train_imgs = set(os.listdir(train_img_dir))
missing_on_disk = train_img_set - disk_train_imgs
extra_on_disk   = disk_train_imgs - train_img_set

print("=== Train image existence ===")
if missing_on_disk:
    print(f"Missing on disk ({len(missing_on_disk)}):", sorted(missing_on_disk))
else:
    print("All train.json images are present on disk.")
if extra_on_disk:
    print(f"Extra files in directory ({len(extra_on_disk)}):", sorted(extra_on_disk))
else:
    print("No extra files in train image directory.")
print()

# 2. Check for duplicate entries within train.json
dupe_train = find_duplicates(train_imgs)
print("=== Duplicate images within train.json ===")
print(dupe_train or "None")
print()

# Load val.json
with open(val_json_path, 'r', encoding='utf-8') as f:
    val_data = json.load(f)

# Extract val image filenames.
# Normalised with basename exactly like the train entries: otherwise a val
# entry stored as a path could never equal a train basename and the overlap
# check in step 4 would silently report nothing. This is a no-op when the
# entries are already bare file names.
val_imgs = [os.path.basename(item['image']) for item in val_data]

# 3. Check for duplicate entries within val.json
dupe_val = find_duplicates(val_imgs)
print("=== Duplicate images within val.json ===")
print(dupe_val or "None")
print()

# 4. Check for overlap between train and val
overlap = train_img_set & set(val_imgs)
print("=== Overlap between train and val ===")
print(sorted(overlap) or "None")

# %%
