from pathlib import Path
import json
import shutil
from tqdm import tqdm

from datasets import load_dataset

from src.path import source_dataset_download_dir, test_intermediate_data_dir, test_intermediate_dir


# SEED-Bench question types kept by this script:
# 2: Instance Identity, 3: Instance Attribute, 4: Instance Location,
# 6: Spatial Relation, 7: Instance Interaction
SELECTED_QUESTION_TYPE_IDS = frozenset({"2", "3", "4", "6", "7"})


def main() -> None:
    """Export a filtered subset of the SEED-Bench test split.

    Loads the ``AILab-CVC/SEED-Bench`` test split, shuffles it with a fixed
    seed, and keeps every example whose question type is listed in
    ``SELECTED_QUESTION_TYPE_IDS``. For each kept example the image is copied
    from the downloaded source directory into the intermediate image
    directory (with ``.png`` appended to its name), and a record holding the
    relative image path, the question, and the answer is collected.

    The records are written, one JSON object per line, to
    ``general_images/SEED-Bench.jsonl`` under ``test_intermediate_data_dir``.
    """
    dataset = load_dataset("AILab-CVC/SEED-Bench", split="test")

    # Shuffle with a fixed seed so the output order is the same on every run.
    dataset = dataset.shuffle(seed=68)

    # Image paths stored in the records are relative to test_intermediate_dir.
    seedbench_image_dir = Path("images") / "SEED-Bench"
    intermediate_seedbench_image_dir = test_intermediate_dir / seedbench_image_dir
    intermediate_seedbench_image_dir.mkdir(parents=True, exist_ok=True)

    source_image_dir = source_dataset_download_dir / "SEED-Bench/SEED-Bench-image"

    records = []
    for example in tqdm(dataset):
        # Normalise to str so the filter works whether the dataset exposes
        # the type id as a string or as an integer.
        if str(example["question_type_id"]) not in SELECTED_QUESTION_TYPE_IDS:
            continue

        image_name = example["data_id"]
        image_file_name = f"{image_name}.png"

        source_image_path: Path = source_image_dir / image_name
        target_image_path: Path = intermediate_seedbench_image_dir / image_file_name
        # Created per file in case the data id itself contains sub-directories.
        target_image_path.parent.mkdir(parents=True, exist_ok=True)
        shutil.copy(source_image_path, target_image_path)

        records.append(
            {
                "image": str(seedbench_image_dir / image_file_name),
                "original_question": example["question"],
                "original_answer": example["answer"],
            }
        )

    # Save the collected records as JSON Lines.
    general_images_intermediate_dir = test_intermediate_data_dir / "general_images"
    general_images_intermediate_dir.mkdir(parents=True, exist_ok=True)

    output_path = general_images_intermediate_dir / "SEED-Bench.jsonl"
    # Explicit encoding keeps the output independent of the platform locale.
    with open(output_path, "w", encoding="utf-8") as f:
        f.writelines(json.dumps(record) + "\n" for record in records)


if __name__ == "__main__":
    main()
