import os
import json
from glob import glob

# Filter the poisoned-training JSON files so that only allowed images remain.
#
# An entry is kept when its image path lives under "poisons/" or when the
# path is listed in the CSV file; every other entry is dropped.

# Root of the dataset tree. Image paths are compared relative to this folder.
BASE_PATH = "/home/yaseema/VLM-Poisoning/data/"

# Text file with one image path per line (blank lines are ignored).
CSV_PATH = "/home/yaseema/pfd/real_classified_images.csv"

# Folder that holds the JSON files to filter.
DATASET_DIR = os.path.join(
    BASE_PATH,
    "poisoned_training_data/llava/"
    "ok_vqa-lowFuelLight_base_engineLight_target_filtered",
)
JSON_PATTERN = os.path.join(DATASET_DIR, "poison_*-seed_0.json")

# The output folder is the same as the input folder, so every JSON file is
# overwritten in place with its filtered version.
OUTPUT_DIR = DATASET_DIR

# Entries whose image path starts with this prefix are always kept.
ALWAYS_KEPT_PREFIX = "poisons/"


def csv_path_spellings(raw_path, base_path):
    """Return the spellings under which one CSV path may match a JSON entry.

    Args:
        raw_path: A single non-empty path taken from the CSV file.
        base_path: The data root that paths are made relative to.

    Returns:
        A set with the path made relative to ``base_path`` and the path
        merely normalised.
    """
    # os.path.relpath resolves a relative input against the current working
    # directory, so on its own it only gives the intended result for absolute
    # paths (or when the script is started from the data root). Keeping the
    # normalised spelling as well lets a CSV entry that is already relative
    # to the data root match no matter where the script is launched.
    return {
        os.path.relpath(raw_path, start=base_path),
        os.path.normpath(raw_path),
    }


def load_allowed_paths(csv_path, base_path):
    """Read the CSV file and build the set of allowed image paths.

    Args:
        csv_path: Path of the text file, one image path per line.
        base_path: The data root that paths are made relative to.

    Returns:
        A set of path strings; see ``csv_path_spellings`` for its contents.

    Raises:
        OSError: If the CSV file cannot be opened.
    """
    allowed = set()
    with open(csv_path, "r", encoding="utf-8") as f:
        for line in f:
            raw_path = line.strip()
            if raw_path:
                allowed.update(csv_path_spellings(raw_path, base_path))
    return allowed


def should_keep(image, allowed_paths, base_path):
    """Decide whether an entry with this image path stays in the dataset.

    Args:
        image: The value of the entry's ``"image"`` field.
        allowed_paths: Set produced by ``load_allowed_paths``.
        base_path: The data root that paths are made relative to.

    Returns:
        True when the path (as written, or relative to ``base_path``) starts
        with ``"poisons/"`` or is contained in ``allowed_paths``.
    """
    image_path = os.path.normpath(image)
    relative_path = os.path.relpath(image_path, base_path)
    # NOTE(review): the original comment called the "poisons/" branch
    # "always keep clean data", which does not match the prefix being
    # tested. The behaviour is kept unchanged here - confirm the intent.
    return any(
        candidate.startswith(ALWAYS_KEPT_PREFIX) or candidate in allowed_paths
        for candidate in (image_path, relative_path)
    )


def filter_entries(entries, allowed_paths, base_path):
    """Return the entries that pass ``should_keep``, in their original order.

    Args:
        entries: List of dicts, each with an ``"image"`` key.
        allowed_paths: Set produced by ``load_allowed_paths``.
        base_path: The data root that paths are made relative to.

    Raises:
        KeyError: If an entry has no ``"image"`` key.
    """
    return [
        entry
        for entry in entries
        if should_keep(entry["image"], allowed_paths, base_path)
    ]


def main():
    """Filter every matching JSON file and write the result to OUTPUT_DIR."""
    allowed_paths = load_allowed_paths(CSV_PATH, BASE_PATH)
    print(f"Allowed-path set built from {CSV_PATH}: {len(allowed_paths)} entries")

    os.makedirs(OUTPUT_DIR, exist_ok=True)

    # glob returns files in arbitrary order; sort for reproducible output.
    for json_file in sorted(glob(JSON_PATTERN)):
        with open(json_file, "r", encoding="utf-8") as f:
            data = json.load(f)

        filtered_data = filter_entries(data, allowed_paths, BASE_PATH)

        # The input file is fully read and closed before this write, so
        # overwriting it in place is safe.
        out_path = os.path.join(OUTPUT_DIR, os.path.basename(json_file))
        with open(out_path, "w", encoding="utf-8") as f:
            json.dump(filtered_data, f, indent=4)

        print(f"Saved filtered {os.path.basename(json_file)}: {len(data) - len(filtered_data)} entries removed.")


if __name__ == "__main__":
    main()
