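# NOTE: the commented-out code from here down to the live shebang is an earlier
# revision of this script (it reads scene_labels.csv, writes to
# merged_optimized_data and has no `extra_test` filtering); kept for reference.
# #!/usr/bin/env python3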
# import os
# import shutil
# import csv

# # ———— 配置 ————
# scene_csv     = '/fs/scratch/PAS2099/Jiacheng/Places_merge/output/scene_labels.csv'
# extra_root    = '/fs/scratch/PAS2099/Jiacheng/Place_365_Extra_69/data/data_large_extra'
# place365_root = '/fs/scratch/PAS2099/Jiacheng/place365/output/data_with_category'
# merged_root   = '/fs/scratch/PAS2099/Jiacheng/Places_merge/output/merged_optimized_data'

# # ———— 1. 读取原始→优化标签映射 ————
# mapping = {}
# with open(scene_csv, newline='', encoding='utf-8') as f:
#     reader = csv.DictReader(f)
#     for row in reader:
#         mapping[row['original_label']] = row['optimized_label']

# # ———— 2. 预先创建所有目标文件夹 ————
# for opt in mapping.values():
#     os.makedirs(os.path.join(merged_root, opt), exist_ok=True)

# # ———— 3. 处理 Extra69 数据（支持两级子文件夹） ————
# for letter in os.listdir(extra_root):
#     letter_dir = os.path.join(extra_root, letter)
#     if not os.path.isdir(letter_dir):
#         continue

#     # 第一层遍历：athletic_field, badminton_court, ...
#     for lvl1 in os.listdir(letter_dir):
#         lvl1_path = os.path.join(letter_dir, lvl1)
#         if not os.path.isdir(lvl1_path):
#             continue

#         # 查看是否有第二层子目录
#         subdirs = [d for d in os.listdir(lvl1_path)
#                    if os.path.isdir(os.path.join(lvl1_path, d))]
#         if subdirs:
#             # 两级结构：/a/athletic_field/indoor
#             for lvl2 in subdirs:
#                 src = os.path.join(lvl1_path, lvl2)
#                 orig_key = f"/{letter}/{lvl1}/{lvl2}"
#                 opt_label = mapping.get(orig_key)
#                 if not opt_label:
#                     continue
#                 dst = os.path.join(merged_root, opt_label)
#                 for fn in os.listdir(src):
#                     if fn.lower().endswith(('.jpg', '.jpeg', '.png')):
#                         shutil.copy2(
#                             os.path.join(src, fn),
#                             os.path.join(dst, fn)
#                         )
#         else:
#             # 单级结构：/a/airfield
#             src = lvl1_path
#             orig_key = f"/{letter}/{lvl1}"
#             opt_label = mapping.get(orig_key)
#             if not opt_label:
#                 continue
#             dst = os.path.join(merged_root, opt_label)
#             for fn in os.listdir(src):
#                 if fn.lower().endswith(('.jpg', '.jpeg', '.png')):
#                     shutil.copy2(
#                         os.path.join(src, fn),
#                         os.path.join(dst, fn)
#                     )

# # ———— 4. 处理 Place365 验证集 ————
# for orig_key, opt_label in mapping.items():
#     parts = orig_key.strip('/').split('/')
#     rel_path = os.path.join(*parts[1:])   # e.g. "stadium/baseball" or "airfield"
#     src_dir = os.path.join(place365_root, rel_path)
#     if not os.path.isdir(src_dir):
#         continue
#     dst_dir = os.path.join(merged_root, opt_label)
#     for fn in os.listdir(src_dir):
#         if fn.lower().endswith(('.jpg', '.jpeg', '.png')):
#             shutil.copy2(
#                 os.path.join(src_dir, fn),
#                 os.path.join(dst_dir, fn)
#             )

# # ———— 5. 最终统计 & 列出无图标签 ————
# all_labels = set(mapping.keys())
# has_imgs = set()
# for orig_key, opt_label in mapping.items():
#     folder = os.path.join(merged_root, opt_label)
#     imgs = [f for f in os.listdir(folder)
#             if f.lower().endswith(('.jpg', '.jpeg', '.png'))]
#     if imgs:
#         has_imgs.add(orig_key)

# missing = sorted(all_labels - has_imgs)

# print("映射表中标签总数       :", len(all_labels))
# print("实际包含图像的标签数   :", len(has_imgs))
# print("\n以下 original_label 对应的文件夹无任何图像：")
# for m in missing:
#     print(m)

#!/usr/bin/env python3
import os
import shutil
import csv

# ———— Configuration ————
# CSV read in step 1; must provide `original_label` and `optimized_label` columns.
scene_csv     = '/fs/scratch/PAS2099/Jiacheng/Places_merge/output/llm_optimized_scene_labels_v2.csv'
# Root of the Extra69 tree walked in step 3 (<letter>/<category>[/<sub-category>]).
extra_root    = '/fs/scratch/PAS2099/Jiacheng/Place_365_Extra_69/data/data_large_extra'
# Root of the Place365 images read in step 4, laid out as <category>[/<sub-category>].
place365_root = '/fs/scratch/PAS2099/Jiacheng/place365/output/data_with_category'
# Output root: one sub-folder per optimized label is created and filled here.
merged_root   = '/fs/scratch/PAS2099/Jiacheng/Places_merge/output/merged_optimized_data_v5'

# ———— 1. Load the original → optimized label mapping ————
# Builds `mapping`: `original_label` cell -> `optimized_label` cell, one entry
# per CSV row (a repeated original label keeps its last row).
#
# 'utf-8-sig' decodes plain UTF-8 unchanged but also drops a leading BOM, which
# would otherwise become part of the first header name and make the
# `original_label` lookup below raise KeyError.
mapping = {}
with open(scene_csv, newline='', encoding='utf-8-sig') as f:
    reader = csv.DictReader(f)
    for row in reader:
        # DictReader fills the cells missing from a short row with None;
        # normalise those to '' so later os.path.join / str.strip calls never
        # receive None. Surrounding whitespace is dropped because the keys are
        # later compared against names built from directory entries.
        original = (row['original_label'] or '').strip()
        optimized = (row['optimized_label'] or '').strip()
        if not original:
            # A row without a source label cannot be matched to any folder.
            continue
        mapping[original] = optimized

# ———— 2. Create every destination folder up front ————
# One directory per optimized label under merged_root. exist_ok=True makes
# labels shared by several original labels (and re-runs) harmless.
for target_label in mapping.values():
    target_dir = os.path.join(merged_root, target_label)
    os.makedirs(target_dir, exist_ok=True)

# ———— 3. Merge the Extra69 data ————
_EXTRA_IMAGE_EXTS = ('.jpg', '.jpeg', '.png')


def _copy_extra_images(src, dst):
    """Copy the image files of one Extra69 category folder into *dst*.

    If any regular file in *src* has a name starting with ``extra_test``, only
    those files are candidates; otherwise every regular file is. A candidate is
    copied with ``shutil.copy2`` when its name ends in .jpg/.jpeg/.png
    (case-insensitive). A file already present in *dst* under the same name is
    overwritten.

    Args:
        src: Directory to read images from.
        dst: Existing directory to copy into.
    """
    with os.scandir(src) as entries:
        # Keep regular files only, so a sub-directory whose name happens to end
        # in an image extension can never be handed to copy2.
        files = [entry.name for entry in entries if entry.is_file()]
    extra_files = [name for name in files if name.startswith('extra_test')]
    for name in (extra_files or files):
        if name.lower().endswith(_EXTRA_IMAGE_EXTS):
            shutil.copy2(os.path.join(src, name), os.path.join(dst, name))


for letter in os.listdir(extra_root):
    letter_dir = os.path.join(extra_root, letter)
    if not os.path.isdir(letter_dir):
        continue

    # First level: category folders such as athletic_field, badminton_court, ...
    for lvl1 in os.listdir(letter_dir):
        lvl1_path = os.path.join(letter_dir, lvl1)
        if not os.path.isdir(lvl1_path):
            continue

        # Does this category have a second level of sub-folders?
        subdirs = [d for d in os.listdir(lvl1_path)
                   if os.path.isdir(os.path.join(lvl1_path, d))]
        if subdirs:
            # Two-level category, e.g. /a/athletic_field/indoor: every
            # sub-folder is its own label; files directly in lvl1 are ignored.
            sources = [
                (os.path.join(lvl1_path, lvl2), f"/{letter}/{lvl1}/{lvl2}")
                for lvl2 in subdirs
            ]
        else:
            # Single-level category, e.g. /a/airfield.
            sources = [(lvl1_path, f"/{letter}/{lvl1}")]

        for src, orig_key in sources:
            opt_label = mapping.get(orig_key)
            if not opt_label:
                # No (non-empty) optimized label for this folder: skip it.
                continue
            _copy_extra_images(src, os.path.join(merged_root, opt_label))

# ———— 4. Merge the Place365 validation set ————
# For every mapping entry, drop the first path component of the original label
# (e.g. "/s/stadium/baseball" -> "stadium/baseball") and, if that folder exists
# under place365_root, copy its images into the optimized label's folder.
for orig_key, opt_label in mapping.items():
    if not opt_label:
        # Same rule as the Extra69 step: without a target label the images
        # would land directly in merged_root, so skip the entry.
        continue
    parts = orig_key.strip('/').split('/')
    if len(parts) < 2:
        # Nothing is left after dropping the first component; os.path.join()
        # with no arguments would raise TypeError.
        continue
    rel_path = os.path.join(*parts[1:])   # e.g. "stadium/baseball" or "airfield"
    src_dir = os.path.join(place365_root, rel_path)
    if not os.path.isdir(src_dir):
        continue
    dst_dir = os.path.join(merged_root, opt_label)
    with os.scandir(src_dir) as entries:
        # Regular files only: a directory with an image-like name must not
        # reach copy2.
        image_names = [
            entry.name for entry in entries
            if entry.is_file()
            and entry.name.lower().endswith(('.jpg', '.jpeg', '.png'))
        ]
    for fn in image_names:
        shutil.copy2(os.path.join(src_dir, fn), os.path.join(dst_dir, fn))

# ———— 5. Report statistics & list labels without images ————
# An original label counts as "having images" when the folder of its optimized
# label contains at least one .jpg/.jpeg/.png file, whichever source supplied
# it. Several original labels can share one optimized label, so each folder is
# probed once per distinct label instead of once per original label, and the
# probe stops at the first image found.
all_labels = set(mapping)

folder_has_images = {}
for opt_label in set(mapping.values()):
    folder = os.path.join(merged_root, opt_label)
    folder_has_images[opt_label] = any(
        name.lower().endswith(('.jpg', '.jpeg', '.png'))
        for name in os.listdir(folder)
    )

has_imgs = {
    orig_key for orig_key, opt_label in mapping.items()
    if folder_has_images[opt_label]
}

missing = sorted(all_labels - has_imgs)

print("映射表中标签总数       :", len(all_labels))
print("实际包含图像的标签数   :", len(has_imgs))
print("\n以下 original_label 对应的文件夹无任何图像：")
for m in missing:
    print(m)
