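# Produce the train/val JSON files and image folders in the expected layout.
# NOTE: the commented-out block below is an earlier version of this script
# (iPhone-only CureOr directories, "train"/"val" output folders), kept for
# reference only; the active code follows it.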

# import os
# import json
# import random
# import shutil
# from collections import defaultdict

# # === Configuration ===
# random.seed(42)

# # Paths
# COMBINED_JSON = "/fs/scratch/PAS2099/Jiacheng/Orientation/output/combined_general_complex_with_top.json"

# # EgoOrient image root
# EGO_ROOT = "/fs/scratch/PAS2099/Jiacheng/EgoOrientBench/EgoOrientBench/all_data/EgocentricDataset/imagenet_after"
# # CureOr image dirs

# CURE_DIRS = [
#     "/fs/scratch/PAS2099/Jiacheng/Cure_or/01_no_challenge/white/iPhone",
#     "/fs/scratch/PAS2099/Jiacheng/Cure_or/01_no_challenge/texture1/iPhone",
#     "/fs/scratch/PAS2099/Jiacheng/Cure_or/01_no_challenge/texture2/iPhone"
# ]

# # Output directories
# BASE_OUT       = "/fs/scratch/PAS2099/Jiacheng/Orientation"
# TRAIN_IMG_DIR  = os.path.join(BASE_OUT, "train", "images")
# VAL_IMG_DIR    = os.path.join(BASE_OUT, "val",   "images")
# TRAIN_JSON     = os.path.join(BASE_OUT, "train", "train.json")
# VAL_JSON       = os.path.join(BASE_OUT, "val",   "val.json")

# # Create output dirs
# for path in (TRAIN_IMG_DIR, VAL_IMG_DIR, os.path.dirname(TRAIN_JSON), os.path.dirname(VAL_JSON)):
#     os.makedirs(path, exist_ok=True)

# # Load combined JSON
# with open(COMBINED_JSON, "r") as f:
#     data = json.load(f)

# # Group entries by (category_name, label)
# groups = defaultdict(list)
# for entry in data:
#     if entry.get("type") != "general_complex":
#         continue
#     key = (entry["category_name"], entry["label"])
#     groups[key].append(entry)

# # Prepare train/val containers
# train_entries = []
# val_entries   = []

# # For each group, split images 80/20
# for (cat, lbl), entries in groups.items():
#     # collect unique image filenames
#     fnames = list({ os.path.basename(e["image"]) for e in entries })
#     random.shuffle(fnames)
#     n_train = round(0.8 * len(fnames))
#     train_f = set(fnames[:n_train])
#     val_f   = set(fnames[n_train:])

#     # assign each entry
#     for e in entries:
#         fn = os.path.basename(e["image"])
#         if fn in train_f:
#             train_entries.append(e)
#         else:
#             val_entries.append(e)

#     # copy train images
#     for fn in train_f:
#         # determine source path
#         src = None
#         if e.get("base_dataset") == "Cure_Or":
#             # search in CureOr dirs
#             for d in CURE_DIRS:
#                 p = os.path.join(d, fn)
#                 if os.path.exists(p):
#                     src = p
#                     break
#         else:
#             # EgoOrient
#             src = os.path.join(EGO_ROOT, fn)
#         # copy if found
#         if src and os.path.exists(src):
#             shutil.copy(src, os.path.join(TRAIN_IMG_DIR, fn))

#     # copy val images
#     for fn in val_f:
#         src = None
#         if e.get("base_dataset") == "Cure_Or":
#             for d in CURE_DIRS:
#                 p = os.path.join(d, fn)
#                 if os.path.exists(p):
#                     src = p
#                     break
#         else:
#             src = os.path.join(EGO_ROOT, fn)
#         if src and os.path.exists(src):
#             shutil.copy(src, os.path.join(VAL_IMG_DIR, fn))

# # Save train/val JSON files
# with open(TRAIN_JSON, "w") as f:
#     json.dump(train_entries, f, indent=2, ensure_ascii=False)
# with open(VAL_JSON, "w") as f:
#     json.dump(val_entries, f, indent=2, ensure_ascii=False)

# print(f"✅ Done. Train entries: {len(train_entries)}, Val entries: {len(val_entries)}")
# print(f"Images copied to {TRAIN_IMG_DIR} and {VAL_IMG_DIR}")


import os
import json
import random
import shutil
from collections import defaultdict

# === Configuration ===
# Seed for the shuffle that decides the train/val membership.
SEED = 42
# Fraction of the unique images of every (category, label) group sent to train.
TRAIN_FRACTION = 0.8

# Input combined JSON (contains both devices)
COMBINED_JSON = "/fs/scratch/PAS2099/Jiacheng/Orientation/output/combined_general_complex_with_top_2devices_v2.json"

# EgoOrient image root directory
EGO_ROOT = "/fs/scratch/PAS2099/Jiacheng/EgoOrientBench/EgoOrientBench/all_data/EgocentricDataset/imagenet_after"
# CureOr image directories (iPhone + LG)
CURE_DIRS = [
    "/fs/scratch/PAS2099/Jiacheng/Cure_or/01_no_challenge/white/iPhone",
    "/fs/scratch/PAS2099/Jiacheng/Cure_or/01_no_challenge/texture1/iPhone",
    "/fs/scratch/PAS2099/Jiacheng/Cure_or/01_no_challenge/texture2/iPhone",
    "/fs/scratch/PAS2099/Jiacheng/Cure_or/01_no_challenge/texture1/LG",
    "/fs/scratch/PAS2099/Jiacheng/Cure_or/01_no_challenge/texture2/LG",
    "/fs/scratch/PAS2099/Jiacheng/Cure_or/01_no_challenge/white/LG",
]

# Output directories
BASE_OUT       = "/fs/scratch/PAS2099/Jiacheng/Orientation"
TRAIN_IMG_DIR  = os.path.join(BASE_OUT, "train_v2", "image")
VAL_IMG_DIR    = os.path.join(BASE_OUT, "val_v2",   "image")
TRAIN_JSON     = os.path.join(BASE_OUT, "train_v2", "train.json")
VAL_JSON       = os.path.join(BASE_OUT, "val_v2",   "val.json")


def _group_entries(data):
    """Group the "general_complex" entries by (category_name, label)."""
    groups = defaultdict(list)
    for entry in data:
        if entry.get("type") != "general_complex":
            continue
        groups[(entry["category_name"], entry["label"])].append(entry)
    return groups


def _split_filenames(entries, rng, train_fraction):
    """Return (train_names, val_names): disjoint sets of image basenames."""
    # Sort before shuffling: a set of str iterates in an order that depends on
    # per-process hash randomization, so shuffling it directly would give a
    # different split on every run even with a fixed seed.
    fnames = sorted({os.path.basename(e["image"]) for e in entries})
    rng.shuffle(fnames)
    n_train = round(train_fraction * len(fnames))
    return set(fnames[:n_train]), set(fnames[n_train:])


def _find_source(fn, cure_dirs, ego_root):
    """Return the existing source path of image *fn*, or None if not found.

    The CureOr directories are searched first, in order; the EgoOrient root
    is the fallback.
    """
    for directory in cure_dirs:
        candidate = os.path.join(directory, fn)
        if os.path.exists(candidate):
            return candidate
    candidate = os.path.join(ego_root, fn)
    return candidate if os.path.exists(candidate) else None


def _copy_images(fnames, dst_dir, cure_dirs, ego_root):
    """Copy every image in *fnames* into *dst_dir*; return the names not found."""
    missing = []
    for fn in sorted(fnames):
        src = _find_source(fn, cure_dirs, ego_root)
        if src is None:
            missing.append(fn)
            continue
        shutil.copy(src, os.path.join(dst_dir, fn))
    return missing


def main(
    combined_json=COMBINED_JSON,
    ego_root=EGO_ROOT,
    cure_dirs=None,
    train_img_dir=TRAIN_IMG_DIR,
    val_img_dir=VAL_IMG_DIR,
    train_json=TRAIN_JSON,
    val_json=VAL_JSON,
    seed=SEED,
    train_fraction=TRAIN_FRACTION,
):
    """Split the combined annotations into train/val and copy their images.

    For every (category_name, label) group the unique image basenames are
    shuffled and split by *train_fraction*; each entry follows its image.
    Images are copied into *train_img_dir* / *val_img_dir* and the entry lists
    are written to *train_json* / *val_json*. All path arguments default to
    the module-level constants; *cure_dirs* defaults to ``CURE_DIRS``.

    Entries whose image cannot be found in any source directory are still
    written to the JSON files; a warning listing how many images were missing
    is printed.

    Returns:
        A ``(train_entries, val_entries)`` tuple of the entry lists written.
    """
    if cure_dirs is None:
        cure_dirs = CURE_DIRS
    # A private generator seeded like the former global random.seed(42) call.
    rng = random.Random(seed)

    # Create output dirs if they do not exist
    for path in (train_img_dir, val_img_dir,
                 os.path.dirname(train_json), os.path.dirname(val_json)):
        if path:  # dirname is "" for a bare file name; nothing to create
            os.makedirs(path, exist_ok=True)

    # Load combined JSON
    with open(combined_json, "r", encoding="utf-8") as f:
        data = json.load(f)

    # 1) Group entries by (category_name, label)
    groups = _group_entries(data)

    train_entries = []
    val_entries = []
    missing = []

    # 2) Split every (object, orientation) combination separately
    for entries in groups.values():
        train_f, val_f = _split_filenames(entries, rng, train_fraction)

        # 3) Assign each entry to train or val according to its image
        for e in entries:
            if os.path.basename(e["image"]) in train_f:
                train_entries.append(e)
            else:
                val_entries.append(e)

        # 4) + 5) Copy the images of both subsets
        missing += _copy_images(train_f, train_img_dir, cure_dirs, ego_root)
        missing += _copy_images(val_f, val_img_dir, cure_dirs, ego_root)

    # 6) Save the split JSON files (UTF-8, since ensure_ascii=False may emit
    #    non-ASCII characters that a locale default encoding cannot represent)
    with open(train_json, "w", encoding="utf-8") as f:
        json.dump(train_entries, f, indent=2, ensure_ascii=False)
    with open(val_json, "w", encoding="utf-8") as f:
        json.dump(val_entries, f, indent=2, ensure_ascii=False)

    if missing:
        print(f"Warning: {len(missing)} image(s) not found in any source directory, "
              f"e.g. {missing[:5]}")
    print(f"✅ Split complete. Train entries: {len(train_entries)}, Val entries: {len(val_entries)}")
    print(f"Images saved to:\n  {train_img_dir}\n  {val_img_dir}")
    return train_entries, val_entries


if __name__ == "__main__":
    main()
