#%%
import json
import os
from collections import Counter

# Paths
train_json_path = '/fs/scratch/PAS2099/Jiacheng/Texture/train/train.json'
val_json_path   = '/fs/scratch/PAS2099/Jiacheng/Texture/val/val.json'
train_image_dir = '/fs/scratch/PAS2099/Jiacheng/Texture/train/image'

# Load the annotation lists. The encoding is given explicitly so decoding does
# not depend on the locale default of whatever machine runs this check.
with open(train_json_path, 'r', encoding='utf-8') as f:
    train_data = json.load(f)
with open(val_json_path, 'r', encoding='utf-8') as f:
    val_data = json.load(f)

# Reduce every entry's 'image' path to its bare file name so the names can be
# compared with each other and with the directory listing.
train_images = [os.path.basename(item['image']) for item in train_data]
val_images   = [os.path.basename(item['image']) for item in val_data]

# Unique train names, built once and reused by every set comparison below.
train_image_set = set(train_images)

# 1. Duplicates _within_ train.json: names that occur more than once.
#    Counter tallies all names in a single pass; calling list.count() for each
#    element would rescan the whole list every time (quadratic).
train_duplicates = {
    img for img, occurrences in Counter(train_images).items()
    if occurrences > 1
}

# 2. Overlap _between_ train and val: names present in both splits.
overlap = train_image_set & set(val_images)

# 3. Compare train.json against the entries of the train/image directory.
#    NOTE(review): os.listdir returns every entry (sub-directories and
#    non-image files included), so anything of that sort is reported as
#    "not referenced" below -- confirm the directory holds images only.
existing_files = set(os.listdir(train_image_dir))
missing_in_dir = train_image_set - existing_files
extra_in_dir   = existing_files - train_image_set


def _print_names(names):
    """Print each name in *names* in sorted order, or "None" when empty."""
    if not names:
        print("  None")
        return
    for name in sorted(names):
        print("  ", name)


# Print results
print("=== Duplicate images within train.json ===")
_print_names(train_duplicates)

print("\n=== Overlap between train.json and val.json ===")
_print_names(overlap)

print("\n=== Train.json vs train/image directory ===")
print("Images listed in train.json but missing on disk:")
_print_names(missing_in_dir)

print("\nImages in train/image directory not referenced in train.json:")
_print_names(extra_in_dir)

# %%
