import os
import json


def replace_image_with_video(obj):
    """
    Swap every "<image>\\n" token for "<video>\\n" throughout a JSON-like value.

    Dictionaries are updated in place (the same dict object is returned),
    lists are rebuilt as new lists, and strings are returned as new strings.
    Any other value is handed back untouched.

    Args:
        obj: A decoded JSON value (dict, list, str, number, bool or None)

    Returns:
        The value with the token replaced at every nesting level
    """
    # Leaf case: the only place an actual substitution happens
    if isinstance(obj, str):
        return obj.replace("<image>\n", "<video>\n")

    # Sequences: produce a fresh list of converted elements
    if isinstance(obj, list):
        return list(map(replace_image_with_video, obj))

    # Mappings: overwrite each existing value in place (keys are not touched,
    # so the dict never changes size while it is being iterated)
    if isinstance(obj, dict):
        for name in obj:
            obj[name] = replace_image_with_video(obj[name])

    # Dicts (now updated) and all remaining primitive types fall through here
    return obj


def extract_prefix(folder_name):
    """
    Derive the duration prefix (such as "0_30_s" or "1_2_m") from a folder name.

    The name is split on underscores; the first two pieces must be digit
    strings and the third must be exactly 's' or 'm'. Anything after the
    third piece is ignored.

    Args:
        folder_name: Folder name string

    Returns:
        The "<digits>_<digits>_<s|m>" prefix, or None when the name does not
        match that layout
    """
    pieces = folder_name.split('_')

    # Too few underscore-separated pieces to hold start, end and unit
    if len(pieces) < 3:
        return None

    start, end, unit = pieces[:3]

    # The unit marker must be seconds or minutes
    if unit not in ('s', 'm'):
        return None

    # Both range bounds have to be purely numeric
    if not (start.isdigit() and end.isdigit()):
        return None

    return "_".join((start, end, unit))


def _rewrite_video_path(record, base_dir):
    """Point a record's relative "video" path inside base_dir (mutates record)."""
    video = record.get("video")
    if not isinstance(video, str):
        return

    # Absolute paths are assumed to be correct already and are left alone
    if os.path.isabs(video):
        print(f"  Skipping absolute path: {video}")
        return

    # Join with the JSON file's folder and normalise to forward slashes
    # (Linux style) so the output is the same regardless of the host OS
    record["video"] = os.path.join(base_dir, video).replace('\\', '/')


def _load_and_process_file(file_path):
    """Load one processed.json file and return its list of converted records."""
    with open(file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    # Iterating a top-level JSON object would merge its *keys* as records,
    # so anything other than an array is rejected up front
    if not isinstance(data, list):
        raise ValueError(
            f"expected a top-level JSON array, got {type(data).__name__}"
        )

    base_dir = os.path.dirname(file_path)
    processed = []
    for item in data:
        # Only dict records can carry a "video" field
        if isinstance(item, dict):
            _rewrite_video_path(item, base_dir)
        # Keep the *returned* value: lists and strings are not modified in
        # place by replace_image_with_video, only dicts are
        processed.append(replace_image_with_video(item))
    return processed


def process_json_files(input_folder, output_dir):
    """
    Process all processed.json files in folders, group and merge by extracted prefix.

    The tree under input_folder is walked recursively. Any folder whose path
    contains "llava_hound" (case-insensitive) is skipped together with its
    subfolders. For every other folder, the prefix is extracted from the
    folder name; folders whose name does not match are reported and their
    files ignored (their subfolders are still visited).

    In each matching folder, every file whose name ends with "processed.json"
    is loaded. For each record, a relative "video" path is rewritten to be
    relative to the JSON file's folder (forward slashes), and every
    "<image>\\n" is replaced with "<video>\\n". A file that cannot be read,
    parsed, or that does not contain a top-level JSON array is reported and
    skipped without aborting the run.

    One "<prefix>_combined.json" file per prefix is written to output_dir
    (created if missing), followed by a printed summary.

    Args:
        input_folder: Root folder to search for JSON files
        output_dir: Output directory to save grouped JSON files
    """
    # Records collected so far, keyed by prefix
    prefix_data = {}
    skipped_folders = []
    invalid_folders = []

    # Walk through directories to find all processed.json files
    for root, dirs, files in os.walk(input_folder):
        # Skip folders containing "llava_hound"
        if "llava_hound" in root.lower():
            print(f"Skipping folder: {root}")
            skipped_folders.append(root)
            dirs[:] = []  # Clear dirs to avoid traversing subdirectories
            continue

        # normpath strips a trailing separator, which would otherwise make
        # basename() return "" for an input folder given as "some/dir/"
        folder_name = os.path.basename(os.path.normpath(root))

        # Extract prefix (supports seconds 's' and minutes 'm' format)
        prefix = extract_prefix(folder_name)

        if prefix is None:
            print(f"Warning: Folder {folder_name} does not follow naming convention (should be number_number_s or number_number_m), skipping")
            invalid_folders.append(root)
            continue

        # Make sure the prefix has a bucket even if no file loads successfully
        bucket = prefix_data.setdefault(prefix, [])

        for file in files:
            if not file.endswith("processed.json"):
                continue

            file_path = os.path.join(root, file)
            print(f"Processing file: {file_path} (prefix: {prefix})")

            # Best effort: one unreadable or malformed file must not abort
            # the whole merge, so the failure is reported and the file skipped
            try:
                processed = _load_and_process_file(file_path)
            except Exception as e:
                print(f"Error processing file {file_path}: {str(e)}")
            else:
                # Add processed data to corresponding prefix list
                bucket.extend(processed)

    # Ensure output directory exists
    os.makedirs(output_dir, exist_ok=True)

    # Write separate JSON file for each prefix
    for prefix, data_list in prefix_data.items():
        output_file = os.path.join(output_dir, f"{prefix}_combined.json")
        with open(output_file, 'w', encoding='utf-8') as f:
            json.dump(data_list, f, indent=2, ensure_ascii=False)
        print(f"Data for prefix {prefix} saved to {output_file}, total {len(data_list)} records")

    # Print summary
    total_records = sum(len(data) for data in prefix_data.values())
    print(f"\n✅ Processing complete! Total records processed: {total_records}")
    print(f"⚠️  Skipped {len(skipped_folders)} folders containing 'llava_hound'")
    print(f"⚠️  Skipped {len(invalid_folders)} folders with invalid naming convention")
    print(f"📁 Results saved to directory: {output_dir}")


if __name__ == "__main__":
    import sys

    # Placeholder defaults; edit them here or override on the command line:
    #   python <this script> [INPUT_FOLDER [OUTPUT_DIR]]
    input_folder = "/path/to/your/input/folder"   # e.g., "/path/to/LLaVA-Video-178K"
    output_dir = "/path/to/your/output/folder"    # e.g., "/path/to/LLaVA-Video-178K/combined_by_prefix"

    cli_args = sys.argv[1:]
    if len(cli_args) > 2:
        print(f"❌ Error: Expected at most 2 arguments (input folder, output folder), got {len(cli_args)}")
        sys.exit(2)
    if len(cli_args) >= 1:
        input_folder = cli_args[0]
    if len(cli_args) == 2:
        output_dir = cli_args[1]

    # Validate input folder exists; exit non-zero so callers can detect failure
    if not os.path.isdir(input_folder):
        print(f"❌ Error: Input folder {input_folder} does not exist")
        sys.exit(1)

    process_json_files(input_folder, output_dir)