import json
import os
import shutil
from pathlib import Path

from tqdm import tqdm
import re  # used to match the numeric part of file names precisely

def print_json_structure(file_path):
    """Print a shallow summary of the JSON document stored at ``file_path``.

    The top-level type is always reported. For a JSON object the key list is
    printed, followed by a description of the first entry only (its key, its
    value's type, and the value's keys or length when it is an object or an
    array). For a JSON array only the length is printed. Anything else is
    reported as "Unknown structure".

    Args:
        file_path: Path of the JSON file to inspect.
    """
    with open(file_path, 'r') as handle:
        document = json.load(handle)

    print(f"Structure of {file_path}:")
    print(f"Type: {type(document)}")

    # Top-level array: the length is all that is reported.
    if isinstance(document, list):
        print(f"Length: {len(document)}")
        return

    # Top-level scalar (string, number, bool, null).
    if not isinstance(document, dict):
        print("Unknown structure")
        return

    print(f"Keys: {list(document.keys())}")

    # Only the first entry is described in detail; an empty object has none.
    first_entry = next(iter(document.items()), None)
    if first_entry is None:
        return

    name, member = first_entry
    print(f"Key: {name}")
    print(f"Type: {type(member)}")
    if isinstance(member, dict):
        print(f"Keys: {list(member.keys())}")
    elif isinstance(member, list):
        print(f"Length: {len(member)}")
    else:
        print("Unknown structure")
    print()

def organize_coco_train_dataset(
        extracted_json_path: str,
        original_json_path: str,
        src_image_dir: str,
        dst_image_dir: str
) -> None:
    """
    Copy the images of a filtered caption list and write each caption next to its image.

    Every caption in the extracted list is resolved to its image through the
    original COCO annotations. The image is copied into ``dst_image_dir``
    (unless a file of that name is already there) and the caption is written
    to a ``.txt`` file that shares the image's base name.

    Args:
        extracted_json_path: Path of the filtered caption file, a JSON list of
            caption strings (e.g. captions_train2017_extracted.json).
        original_json_path: Path of the original COCO caption annotation file
            (e.g. captions_train2017.json).
        src_image_dir: Directory holding the original images
            (e.g. data/COCO/train2017).
        dst_image_dir: Output directory, created when missing
            (e.g. data/COCO/train2017_extracted).

    Structure of data/prompts/captions_train2017_extracted.json:
    Type: <class 'list'>

    Structure of data/prompts/captions_train2017.json:
    Type: <class 'dict'>
        info: ...
        licenses: ...
        images: ...(list: 118287)
            file_name: xxxxxx.jpg
            id: xxxx
            ...
        annotations: ...(list: 591753)
            image_id: xxxx
            id: xxxx
            caption: ...
            ...

    Note:
        Captions are matched by exact text. When the same caption text occurs
        on several annotations, the image of the last such annotation is used.
        When several extracted captions resolve to the same image, the caption
        file ends up holding the last one written.
    """
    # Make sure the output directory exists.
    os.makedirs(dst_image_dir, exist_ok=True)

    # Read both files explicitly as UTF-8 instead of the platform default, so
    # non-ASCII captions decode the same way everywhere and agree with the
    # UTF-8 caption files written below.
    with open(extracted_json_path, 'r', encoding='utf-8') as f:
        extracted_data = json.load(f)  # list of caption strings

    with open(original_json_path, 'r', encoding='utf-8') as f:
        original_data = json.load(f)  # dict

    # Build the lookup tables.
    # Step 1: image_id -> file_name
    image_id_to_file = {img["id"]: img["file_name"] for img in original_data["images"]}

    # Step 2: caption text -> image_id (later annotations overwrite earlier
    # ones that carry the same caption text)
    annotation_dict = {ann["caption"]: ann["image_id"] for ann in original_data["annotations"]}

    # Handle every extracted caption.
    for caption in tqdm(extracted_data, desc="Processing captions"):
        image_id = annotation_dict.get(caption, None)
        if image_id is None:
            print(f"Warning: Caption <{caption}> not found in original data")
            continue

        # Resolve the image file name.
        file_name = image_id_to_file.get(image_id)
        if not file_name:
            print(f"Warning: Image ID {image_id} not found in original data")
            continue

        # Source image, destination image, and the caption file beside it.
        src_path = os.path.join(src_image_dir, file_name)
        dst_img_path = os.path.join(dst_image_dir, file_name)
        dst_txt_path = os.path.splitext(dst_img_path)[0] + ".txt"

        # Copy the image unless it is already present in the destination.
        if not os.path.exists(dst_img_path):
            try:
                shutil.copy(src_path, dst_img_path)
            except FileNotFoundError:
                # Without the image there is nothing to pair the caption with.
                print(f"Error: Source image {src_path} not found")
                continue

        # Write the caption to its text file (overwrites an existing one).
        with open(dst_txt_path, 'w', encoding='utf-8') as f:
            f.write(caption)

def organize_coco_val_dataset(
        extracted_json_path: str,
        original_json_path: str,
        src_image_dir: str,
        dst_image_dir: str,
        how_many: int = 1000
) -> None:
    """
    Copy the images belonging to (a prefix of) a filtered caption list into a target directory.

    Each selected caption is resolved to its image through the original COCO
    annotations, and that image is copied into ``dst_image_dir``. When a file
    of the same name is already present there, nothing is copied and the
    caption, image id and file name are printed instead. This function copies
    images only; it writes no caption text files.

    If either JSON file is missing or cannot be decoded, the error is printed
    and the function returns without processing anything.

    Args:
        extracted_json_path: Path of the filtered caption file, a JSON list of
            captions.
        original_json_path: Path of the original COCO caption annotation file.
        src_image_dir: Directory holding the original images.
        dst_image_dir: Output directory, created when missing.
        how_many: Number of leading captions to process. ``None``, zero or a
            negative value means all captions are processed.

    Fields read from the original annotation file (a JSON object):
        images: list of entries with
            id: xxxx
            file_name: xxxxxx.jpg
        annotations: list of entries with
            image_id: xxxx
            caption: ...
    """
    print(f"Organizing dataset: processing up to {how_many} captions.")

    # The output directory is created up front, before any file is read.
    os.makedirs(dst_image_dir, exist_ok=True)

    # Load both JSON files; any load problem ends the run early.
    try:
        with open(extracted_json_path, 'r', encoding='utf-8') as extracted_file:
            extracted_data = json.load(extracted_file)  # list of captions
        print(f"Loaded {len(extracted_data)} captions from {extracted_json_path}")

        with open(original_json_path, 'r', encoding='utf-8') as original_file:
            original_data = json.load(original_file)  # dict
        print(f"Loaded original annotations from {original_json_path}")

    except FileNotFoundError as e:
        print(f"Error loading JSON file: {e}")
        return
    except json.JSONDecodeError as e:
        print(f"Error decoding JSON file: {e}")
        return

    # Decide how many captions to handle. Slicing copes with a how_many that
    # exceeds the list length, so no extra bounds check is needed.
    limit_requested = how_many is not None and how_many > 0
    if not limit_requested:
        captions_to_process = extracted_data
        print("Processing all extracted captions (how_many <= 0 or not specified).")
    else:
        captions_to_process = extracted_data[:how_many]
        print(f"Limiting processing to the first {len(captions_to_process)} captions based on how_many={how_many}.")

    # Lookup tables: image id -> file name, and caption text -> image id.
    # For repeated keys the last occurrence wins.
    image_id_to_file = dict(
        (img["id"], img["file_name"]) for img in original_data["images"]
    )
    annotation_dict = dict(
        (ann["caption"], ann["image_id"]) for ann in original_data["annotations"]
    )

    for caption in tqdm(captions_to_process, desc="Processing captions"):
        image_id = annotation_dict.get(caption)
        if image_id is None:
            print(f"Warning: Caption <{caption}> not found in original data")
            continue

        file_name = image_id_to_file.get(image_id)
        if not file_name:
            print(f"Warning: Image ID {image_id} not found in original data")
            continue

        src_path = os.path.join(src_image_dir, file_name)
        dst_img_path = os.path.join(dst_image_dir, file_name)

        # An image already in place is reported rather than copied again.
        if os.path.exists(dst_img_path):
            print(f"Image {dst_img_path} already exists, skipping copy.")
            print(f'caption: {caption}')
            print(f'image_id: {image_id}')
            print(f'file_name: {file_name}')
            continue

        try:
            shutil.copy(src_path, dst_img_path)
        except FileNotFoundError:
            print(f"Error: Source image {src_path} not found")

def clean_ckpt_files(directory, prefix, suffix, threshold_inclusive, dry_run=True):
    """
    Remove numbered checkpoint files whose index is at or above a threshold.

    Files directly inside ``directory`` whose names are exactly
    ``<prefix><digits><suffix>`` are considered. Those with an index greater
    than or equal to ``threshold_inclusive`` are deleted (or only listed in
    dry-run mode); the rest are kept. A report is printed along the way.

    Args:
        directory (str): Folder to scan.
        prefix (str): File name prefix of the checkpoints (e.g. "anole_").
        suffix (str): File name suffix of the checkpoints (e.g. ".ckpt").
        threshold_inclusive (int): First index to delete (inclusive). Files
            with an index below it are kept.
        dry_run (bool): When True, only print what would be deleted and leave
            every file in place.
    """
    if not os.path.isdir(directory):
        print(f"错误：目录 '{directory}' 不存在。")
        return

    separator = "-" * 30

    print(f"正在扫描目录: '{directory}'")
    print(f"文件名前缀: '{prefix}', 文件名后缀: '{suffix}'")
    print(f"将保留编号小于 {threshold_inclusive} 的文件。")
    print(f"将删除编号大于等于 {threshold_inclusive} 的文件。")
    print("模式：试运行 (不会删除任何文件)" if dry_run else "模式：实际删除")
    print(separator)

    # Anchored pattern capturing the numeric part, e.g. anole_(\d+).ckpt
    pattern_str = f"^{re.escape(prefix)}(\\d+){re.escape(suffix)}$"
    try:
        file_pattern = re.compile(pattern_str)
    except re.error as e:
        print(f"错误：无效的文件名匹配模式: {pattern_str} - {e}")
        return

    doomed = []        # (path, index) pairs selected for deletion
    kept_total = 0     # matching files below the threshold
    skipped_total = 0  # sub-directories and files that do not match

    for entry in os.listdir(directory):
        entry_path = os.path.join(directory, entry)

        # Non-files and names outside the target format are merely counted.
        hit = file_pattern.match(entry) if os.path.isfile(entry_path) else None
        if not hit:
            skipped_total += 1
            continue

        try:
            index = int(hit.group(1))  # the digits captured by the group
            if index >= threshold_inclusive:
                doomed.append((entry_path, index))
            else:
                kept_total += 1
        except ValueError:
            print(f"警告：文件名 '{entry}' 中的数字部分无法解析，已跳过。")
            skipped_total += 1

    print(separator)
    if doomed:
        # Report in ascending index order for readability.
        doomed.sort(key=lambda pair: pair[1])
        print(f"找到 {len(doomed)} 个符合删除条件的文件：")
        for target, index in doomed:
            if dry_run:
                print(f"  [试运行] 将删除: {target} (编号: {index})")
                continue
            try:
                os.remove(target)
                print(f"  已删除: {target} (编号: {index})")
            except OSError as e:
                print(f"  错误：删除文件 '{target}' 失败: {e}")
    else:
        print("没有找到符合删除条件的文件。")

    print(separator)
    print("操作总结:")
    print(f"  保留的匹配格式文件 (编号 < {threshold_inclusive}): {kept_total}")
    print(f"  标记/尝试删除的文件 (编号 >= {threshold_inclusive}): {len(doomed)}")
    if not dry_run:
        # Recount from disk so failed removals are not reported as deleted.
        actually_deleted = sum(
            1 for target, _ in doomed if not os.path.exists(target)
        )
        print(f"  实际删除的文件数量: {actually_deleted}")
    print(f"  目录中其他文件/子目录 (已跳过): {skipped_total}")
    print("脚本执行完毕。")


# Usage example
if __name__ == "__main__":
    # organize_coco_train_dataset(
    #     extracted_json_path="data/prompts/captions_train2017_extracted.json",
    #     original_json_path="data/prompts/captions_train2017.json",
    #     src_image_dir="data/COCO/train2017",
    #     dst_image_dir="data/COCO/train2017_extracted"
    # )

    # target_folder = "/home/yekeming/project/LANTERN/data/COCO/val2017_slice0_1000_test"
    # try:
    #     folder = Path(target_folder)
    #     if folder.is_dir():
    #         # sum(1 for item in folder.iterdir() if item.is_file())
    #         # Count the matching files with a generator expression
    #         file_count_oneliner = sum(1 for item in folder.iterdir() if item.is_file())
    #         print(f"(简洁写法) 文件夹 '{target_folder}' 下共有 {file_count_oneliner} 个文件。")
    #     else:
    #         print(f"(简洁写法) 错误：'{target_folder}' 不是一个有效的文件夹。")
    # except Exception as e:
    #     print(f"(简洁写法) 发生错误：{e}")
    #
    # file_list = os.listdir(target_folder)

    # Active job: copy the images for the first 1000 captions of the
    # validation caption list into the test slice directory.
    val_job = {
        "extracted_json_path": "data/prompts/captions_val2017_longest.json",
        "original_json_path": "data/prompts/captions_val2017.json",
        "src_image_dir": "data/COCO/val2017",
        "dst_image_dir": "data/COCO/val2017_slice0_1000_test",
        "how_many": 1000,
    }
    organize_coco_val_dataset(**val_job)


    # # --- Configuration ---
    # DIR_PATH = "/home/yekeming/project/LANTERN/data/COCO/drafter_train_data/anole"
    # FILE_PREFIX = "data_"  # file name prefix
    # FILE_SUFFIX = ".ckpt"  # file name suffix
    # # Upper bound (exclusive) of the file indices to keep. E.g. with 30000, anole_0.ckpt through anole_29999.ckpt are kept.
    # # Files numbered at or above this value are deleted.
    # DELETE_THRESHOLD_INCLUSIVE = 30000
    #
    # # !!! Safety switch: True is a dry run that only prints which files would be deleted; False actually deletes !!!
    # DRY_RUN = True
    # # DRY_RUN = False # once verified, uncomment this line and comment out the previous one to really delete
    #
    # # Run the cleanup
    # clean_ckpt_files(
    #     directory=DIR_PATH,
    #     prefix=FILE_PREFIX,
    #     suffix=FILE_SUFFIX,
    #     threshold_inclusive=DELETE_THRESHOLD_INCLUSIVE,
    #     dry_run=DRY_RUN
    # )



