#!/usr/bin/env python3
"""
In-depth analysis of ROI path finding issues
Identify why certain samples cannot find ROI files
"""

import os
import sys
import glob
from pathlib import Path
from collections import defaultdict

# Put the directory containing this script at the front of sys.path so the
# project-local `data` package imported below can be resolved
# (presumably `data/` sits next to this script -- confirm).
project_root = Path(__file__).parent
sys.path.insert(0, str(project_root))

from transformers import AutoTokenizer
from data.dataset_english_medical import EnglishMedicalDatasetFast


def analyze_dataset_structure(dataset_root):
    """Analyze the directory structure of the dataset.

    Every immediate sub-directory of ``dataset_root`` is treated as one
    medical domain. For each domain the function counts the image files in
    ``<domain>/images`` and the PNG files in ``<domain>/roi_images`` and
    prints a short report.

    Args:
        dataset_root: Path of the dataset root directory.

    Returns:
        dict: Maps each domain name to a dict with the keys
        ``images_count``, ``roi_count``, ``roi_files`` (at most the first
        10 ROI file names, in sorted order) and ``has_roi_dir``.
    """
    print("=" * 60)
    print("1. Analyzing Dataset Directory Structure")
    print("=" * 60)

    # os.listdir() returns entries in arbitrary order; sort so that the
    # report and the returned samples are reproducible between runs.
    domains = sorted(
        d for d in os.listdir(dataset_root)
        if os.path.isdir(os.path.join(dataset_root, d))
    )

    print(f"Found {len(domains)} medical domains:")

    roi_summary = {}

    for domain in domains:
        domain_path = os.path.join(dataset_root, domain)
        print(f"\n📁 {domain}:")

        # Check images directory. isdir() rather than exists(): a stray
        # regular file named "images" must be reported as missing instead
        # of making os.listdir() raise.
        images_dir = os.path.join(domain_path, 'images')
        images_count = 0
        if os.path.isdir(images_dir):
            images_count = sum(
                1 for f in os.listdir(images_dir)
                if f.lower().endswith(('.png', '.jpg', '.jpeg'))
            )
            print(f"  🖼 images: {images_count} images")
        else:
            print("  ❌ images directory does not exist")

        # Check roi_images directory (same isdir() reasoning as above).
        roi_dir = os.path.join(domain_path, 'roi_images')
        has_roi_dir = os.path.isdir(roi_dir)
        roi_files = []
        if has_roi_dir:
            roi_files = sorted(
                f for f in os.listdir(roi_dir)
                if f.lower().endswith('.png')
            )
            print(f"  🔍 roi_images: {len(roi_files)} ROI files")

            # Analyze ROI file naming patterns
            if roi_files:
                print(f"    Sample files: {roi_files[:5]}")

                # Count Normal vs Abnormal. The substring test is
                # case-sensitive, so "Abnormal" is not counted as "Normal".
                normal_count = sum('Normal' in f for f in roi_files)
                abnormal_count = sum('Abnormal' in f for f in roi_files)
                print(f"    Normal: {normal_count}, Abnormal: {abnormal_count}")
        else:
            print("  ❌ roi_images directory does not exist")

        roi_summary[domain] = {
            'images_count': images_count,
            'roi_count': len(roi_files),
            'roi_files': roi_files[:10],  # Keep only first 10 as samples
            'has_roi_dir': has_roi_dir,
        }

    return roi_summary


def _split_roi_filename(filename):
    """Split an ROI file name into ``(prefix, roi_type, image_id)``.

    Returns ``None`` when the name (without extension) has fewer than three
    ``_``-separated parts.
    """
    # splitext() strips only the real extension and works for ".PNG" too,
    # unlike replace('.png', '') which is case-sensitive and also removes
    # matches in the middle of the name.
    parts = os.path.splitext(filename)[0].split('_')
    if len(parts) < 3:
        return None

    # Anchor on the explicit type token so prefixes that themselves contain
    # an underscore (e.g. "Bone_Joint") are kept in one piece.
    for idx in range(1, len(parts) - 1):
        if parts[idx] in ('Normal', 'Abnormal'):
            return ('_'.join(parts[:idx]), parts[idx],
                    '_'.join(parts[idx + 1:]))

    # No recognizable type token: fall back to the positional layout.
    return parts[0], parts[1], '_'.join(parts[2:])


def analyze_roi_naming_pattern(roi_summary):
    """Analyze the naming patterns of ROI files.

    Prints every sampled ROI file name together with its parsed prefix,
    type and image ID, then lists each ``<prefix>_<type>_<ID>`` pattern
    with the distinct domains that use it.

    Args:
        roi_summary: Mapping of domain name to a dict that has a
            ``roi_files`` list of file names (as returned by
            ``analyze_dataset_structure``).

    Returns:
        None. The result is only printed.
    """
    print("\n" + "=" * 60)
    print("2. Analyzing ROI File Naming Patterns")
    print("=" * 60)

    # pattern -> {domain: None}; the inner dict acts as an insertion-ordered
    # set so a domain is counted once no matter how many of its files share
    # the pattern.
    all_patterns = defaultdict(dict)

    for domain, info in roi_summary.items():
        if not info['roi_files']:
            continue

        print(f"\n📁 {domain}:")
        for filename in info['roi_files']:
            print(f"  {filename}")

            # Analyze naming pattern
            parsed = _split_roi_filename(filename)
            if parsed is None:
                continue
            prefix, roi_type, image_id = parsed

            all_patterns[f"{prefix}_{roi_type}_<ID>"][domain] = None

            print(f"    -> Prefix:{prefix}, Type:{roi_type}, ID:{image_id}")

    print("\nDiscovered naming patterns:")
    for pattern, domain_set in all_patterns.items():
        domains = list(domain_set)
        print(f"  {pattern}: Used by {len(domains)} domains")
        print(f"    Domains: {domains[:3]}{'...' if len(domains) > 3 else ''}")


def _extract_roi_prefix(filename):
    """Return the domain prefix of an ROI file name, or ``None`` if the
    name contains no ``_`` separator."""
    parts = os.path.splitext(filename)[0].split('_')
    if len(parts) < 2:
        return None

    # Everything before the type token is the prefix, so multi-word
    # prefixes such as "Bone_Joint" are not truncated to "Bone".
    for idx in range(1, len(parts)):
        if parts[idx] in ('Normal', 'Abnormal'):
            return '_'.join(parts[:idx])

    # No recognizable type token: fall back to the first component.
    return parts[0]


def test_roi_path_logic(dataset_root, max_domains=3):
    """Test ROI path finding logic.

    For the first ``max_domains`` domains (in sorted order) compare the ROI
    file-name prefix expected from the domain mapping with the prefixes
    actually present in ``<domain>/roi_images`` and print the outcome.

    NOTE(review): the ``test_`` name prefix means pytest will try to collect
    this function if the module name matches its file pattern; the name is
    kept because callers depend on it.

    Args:
        dataset_root: Path of the dataset root directory.
        max_domains: Number of domains to check; ``None`` checks all of
            them. Defaults to 3.

    Returns:
        None. The result is only printed.
    """
    print("\n" + "=" * 60)
    print("3. Testing ROI Path Finding Logic")
    print("=" * 60)

    # Simulate domain mapping logic in the dataset
    domain_mapping = {
        'Abdominal Imaging': 'Abdominal',
        'Bone and Joint Imaging': 'Bone_Joint',
        'Breast Imaging': 'Breast',
        'Cardiac Imaging': 'Cardiac',
        'Chest Imaging': 'Chest',
        'Cranial Imaging': 'Cranial',
        'Dental Imaging': 'Dental',
        'Dermatological Imaging': 'Dermatological',
        'Endoscopy Imaging': 'Endoscopy',
        'Fundus Imaging': 'Fundus',
        'Gynecological Imaging': 'Gynecological',
        'Pathology Slide Imaging': 'Pathology',
    }

    # Sorted so that "the first N domains" is the same set on every run;
    # os.listdir() order is arbitrary.
    domains = sorted(
        d for d in os.listdir(dataset_root)
        if os.path.isdir(os.path.join(dataset_root, d))
    )

    for domain in domains[:max_domains]:  # Only test first N domains
        print(f"\n📁 Testing domain: {domain}")

        # Check domain mapping; unmapped domains fall back to their first
        # word (or the whole name if it has no words at all).
        roi_prefix = domain_mapping.get(domain, (domain.split() or [domain])[0])
        print(f"  Mapped prefix: {roi_prefix}")

        # Find actual ROI files
        roi_dir = os.path.join(dataset_root, domain, 'roi_images')
        if not os.path.isdir(roi_dir):
            print(f"  ❌ ROI directory does not exist: {roi_dir}")
            continue

        # Case-insensitive extension match, consistent with the structure
        # analysis which also counts ".PNG" files.
        roi_files = [f for f in os.listdir(roi_dir)
                     if f.lower().endswith('.png')]
        print(f"  Actual ROI files count: {len(roi_files)}")

        if not roi_files:
            continue

        # Analyze actual prefixes (sorted for a stable report/suggestion)
        actual_prefixes = sorted(
            {prefix for prefix in map(_extract_roi_prefix, roi_files)
             if prefix is not None}
        )

        print(f"  Actual prefixes: {actual_prefixes}")
        print(f"  Expected prefix: {roi_prefix}")

        if roi_prefix in actual_prefixes:
            print("  ✅ Prefix match")
        else:
            suggestion = actual_prefixes[0] if actual_prefixes else 'Unknown'
            print("  ❌ Prefix mismatch!")
            print(f"    Suggested mapping fix: '{domain}' -> '{suggestion}'")


def find_missing_roi_samples(
        dataset_root,
        tokenizer_path="/root/autodl-tmp/pubmedbert-base-uncased-abstract-local",
        sample_ratio=0.01):
    """Find specific samples missing ROI files.

    Builds an ``EnglishMedicalDatasetFast`` over ``dataset_root``, asks it
    for the ROI path of every sample, and reports the samples for which it
    returns ``None``, both as per-domain statistics and as a detailed list
    of the first 10 misses.

    Args:
        dataset_root: Path of the dataset root directory.
        tokenizer_path: Location passed to ``AutoTokenizer.from_pretrained``.
        sample_ratio: Fraction of the data to load, forwarded to the
            dataset constructor (default 1%).

    Returns:
        list[dict]: One entry per sample without an ROI path, with the keys
        ``index``, ``domain``, ``image_name``, ``is_no_finding`` and
        ``expected_roi_type``.
    """
    print("\n" + "=" * 60)
    print("4. Finding Specific Samples Missing ROI Files")
    print("=" * 60)

    tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)

    # Create dataset instance
    dataset = EnglishMedicalDatasetFast(
        dataset_root=dataset_root,
        tokenizer=tokenizer,
        sample_ratio=sample_ratio
    )

    print(f"Dataset size: {len(dataset)}")

    missing_roi_samples = []
    domain_stats = defaultdict(
        lambda: {'total': 0, 'missing_roi': 0, 'no_finding': 0, 'has_finding': 0})

    # Check ROI path for each sample.
    # NOTE(review): indexes dataset.samples by range(len(dataset)) on
    # purpose -- it is not visible here whether len(dataset) always equals
    # len(dataset.samples), so the original iteration bounds are kept.
    for i in range(len(dataset)):
        sample_data = dataset.samples[i]
        domain = sample_data['domain']
        is_no_finding = sample_data['is_no_finding']

        stats = domain_stats[domain]
        stats['total'] += 1
        stats['no_finding' if is_no_finding else 'has_finding'] += 1

        # Test ROI path finding
        roi_path = dataset._find_roi_path(
            sample_data['image_name'],
            domain,
            is_no_finding
        )

        if roi_path is None:
            stats['missing_roi'] += 1
            missing_roi_samples.append({
                'index': i,
                'domain': domain,
                'image_name': sample_data['image_name'],
                'is_no_finding': is_no_finding,
                'expected_roi_type': 'Normal' if is_no_finding else 'Abnormal'
            })

    # Print statistical results
    print("\nDomain Statistics:")
    for domain, stats in domain_stats.items():
        missing_rate = stats['missing_roi'] / stats['total'] * 100 if stats['total'] > 0 else 0
        print(f"  {domain}:")
        print(f"    Total samples: {stats['total']}")
        print(f"    No Finding: {stats['no_finding']}, Has Finding: {stats['has_finding']}")
        print(f"    Missing ROI: {stats['missing_roi']} ({missing_rate:.1f}%)")

    # Domain -> ROI file-name prefix. Built once here instead of once per
    # reported sample, since it never changes inside the loop.
    domain_mapping = {
        'Abdominal Imaging': 'Abdominal',
        'Bone and Joint Imaging': 'Bone_Joint',
        'Breast Imaging': 'Breast',
        'Cardiac Imaging': 'Cardiac',
        'Chest Imaging': 'Chest',
        'Cranial Imaging': 'Cranial',
        'Dental Imaging': 'Dental',
        'Dermatological Imaging': 'Dermatological',
        'Endoscopy Imaging': 'Endoscopy',
        'Fundus Imaging': 'Fundus',
        'Gynecological Imaging': 'Gynecological',
        'Pathology Slide Imaging': 'Pathology',
    }

    # Show some samples missing ROI
    print("\nSamples missing ROI (first 10):")
    for sample in missing_roi_samples[:10]:
        sample_domain = sample['domain']
        print(f"  Sample {sample['index']}: {sample_domain}")
        print(f"    Image: {sample['image_name']}")
        print(f"    Is No Finding: {sample['is_no_finding']}")
        print(f"    Expected ROI type: {sample['expected_roi_type']}")

        # Try to construct expected ROI filename; unmapped domains fall
        # back to their first word (or the whole name if it has no words).
        base_name = os.path.splitext(sample['image_name'])[0]
        roi_prefix = domain_mapping.get(
            sample_domain, (sample_domain.split() or [sample_domain])[0])
        expected_filename = f"{roi_prefix}_{sample['expected_roi_type']}_{base_name}.png"
        print(f"    Expected filename: {expected_filename}")

        # Check if similar files exist in ROI directory. The extension
        # match is case-insensitive so a ".PNG" variant of the expected
        # file shows up as a hint.
        roi_dir = os.path.join(dataset_root, sample_domain, 'roi_images')
        if os.path.isdir(roi_dir):
            similar_files = [f for f in os.listdir(roi_dir)
                             if base_name in f and f.lower().endswith('.png')]
            if similar_files:
                print(f"    Found similar files: {similar_files}")
        print()

    return missing_roi_samples


def main():
    """Run the four ROI diagnostics in order and print a closing summary
    of the samples for which no ROI file was found."""
    dataset_root = "/root/autodl-tmp/dataset"
    separator = "=" * 60

    print("🔍 In-depth analysis of ROI path finding issues...")

    # Steps 1 and 2: directory structure, then naming patterns derived
    # from the structure summary.
    structure_summary = analyze_dataset_structure(dataset_root)
    analyze_roi_naming_pattern(structure_summary)

    # Step 3: expected vs. actual ROI prefixes.
    test_roi_path_logic(dataset_root)

    # Step 4: per-sample ROI lookup.
    missing_samples = find_missing_roi_samples(dataset_root)

    print("\n" + separator)
    print("Summary")
    print(separator)
    print(f"Total samples missing ROI files: {len(missing_samples)}")

    # Nothing further to break down when every sample has an ROI file.
    if not missing_samples:
        return

    # Bucket the misses by the domain they belong to.
    grouped = defaultdict(list)
    for entry in missing_samples:
        grouped[entry['domain']].append(entry)

    print("Distribution by domain:")
    for domain_name in grouped:
        members = grouped[domain_name]
        print(f"  {domain_name}: {len(members)} samples")

        # Every entry is either "no finding" or "has finding", so the
        # second count is the remainder of the first.
        no_finding_total = sum(1 for m in members if m['is_no_finding'])
        has_finding_total = len(members) - no_finding_total
        print(f"    No Finding: {no_finding_total}, Has Finding: {has_finding_total}")


# Run the full analysis only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()