import os
import pandas as pd
from pathlib import Path
import glob

class REFLACXRegionAnalyzer:
    """Check REFLACX bounding-box CSV files for a fixed set of region names.

    Each ``*_bboxes.csv`` file found under ``base_path`` is read with pandas
    and the distinct values of its ``class_name`` column are compared with
    ``expected_regions``. A file is "complete" when every expected region
    name occurs in it.
    """

    def __init__(self, base_path="../../mimic-eye-integrating-mimic-datasets-with-reflacx-and-eye-gaze-for-multimodal-deep-learning-applications-1.0.0/mimic-eye"):
        """Store the dataset root and the list of region names to look for.

        Args:
            base_path: Directory that is searched for bounding-box files.
        """
        self.base_path = base_path
        self.expected_regions = [
            'cardiac silhouette',
            'left clavicle',
            'left costophrenic angle',
            'left hilar structures',
            'left lower lung zone',
            'left lung',
            'left mid lung zone',
            'left upper lung zone',
            'right clavicle',
            'right costophrenic angle',
            'right hilar structures',
            'right lower lung zone',
            'right lung',
            'right mid lung zone',
            'right upper lung zone',
            'trachea',
            'upper mediastinum'
        ]

    def find_reflacx_bbox_files(self):
        """Return the sorted paths of all bounding-box CSV files under ``base_path``.

        Returns:
            A list of path strings matching
            ``<base_path>/*/REFLACX/main_data/*/*_bboxes.csv``.
        """
        pattern = f"{self.base_path}/*/REFLACX/main_data/*/*_bboxes.csv"
        # glob yields matches in arbitrary order; sorting keeps the detailed
        # results and the printed examples reproducible between runs.
        bbox_files = sorted(glob.glob(pattern))
        print(f"Found {len(bbox_files)} REFLACX bounding box files")
        return bbox_files

    def analyze_bbox_file(self, file_path):
        """Compare the region names in one bounding-box file with the expected set.

        Args:
            file_path: Path of the CSV file to read.

        Returns:
            A dict with the keys ``file_path``, ``total_regions`` (number of
            distinct non-missing ``class_name`` values), ``has_all_17``
            (whether every expected region is present), ``missing_regions``
            and ``regions_found`` (both sets). If the file cannot be read or
            has no ``class_name`` column, the dict additionally carries an
            ``error`` message, reports zero regions and lists every expected
            region as missing.
        """
        expected = set(self.expected_regions)

        try:
            df = pd.read_csv(file_path)
            # Blank cells are read as NaN; they are not region names and
            # must not inflate the region count.
            regions_in_file = set(df['class_name'].dropna().unique())
        except Exception as e:
            # Deliberately broad: one unreadable file must not abort a batch
            # run. The failure is recorded on the result instead.
            return {
                'file_path': file_path,
                'error': str(e),
                'total_regions': 0,
                'has_all_17': False,
                'missing_regions': expected,
                'regions_found': set()
            }

        missing_regions = expected - regions_in_file
        return {
            'file_path': file_path,
            'total_regions': len(regions_in_file),
            'has_all_17': not missing_regions,
            'missing_regions': missing_regions,
            'regions_found': regions_in_file
        }

    def analyze_all_files(self):
        """Analyze every bounding-box file and aggregate the per-file results.

        Returns:
            A dict with ``total_files``, ``files_with_all_17``,
            ``files_with_errors``, ``percentage_complete`` (0 when no files
            were found), ``region_count_distribution`` (region count ->
            number of files; files that failed to load are counted under 0)
            and ``detailed_results`` (the per-file dicts, in file order).
        """
        bbox_files = self.find_reflacx_bbox_files()
        total_files = len(bbox_files)

        results = []
        files_with_all_17 = 0
        files_with_errors = 0
        region_count_distribution = {}

        print("Analyzing REFLACX bounding box files...")

        for i, file_path in enumerate(bbox_files):
            if i % 50 == 0:
                print(f"  Progress: {i}/{total_files} files analyzed")

            result = self.analyze_bbox_file(file_path)
            results.append(result)

            if result['has_all_17']:
                files_with_all_17 += 1
            if 'error' in result:
                files_with_errors += 1

            region_count = result['total_regions']
            region_count_distribution[region_count] = region_count_distribution.get(region_count, 0) + 1

        return {
            'total_files': total_files,
            'files_with_all_17': files_with_all_17,
            'files_with_errors': files_with_errors,
            # Guard against division by zero when nothing matched the pattern.
            'percentage_complete': (files_with_all_17 / total_files) * 100 if bbox_files else 0,
            'region_count_distribution': region_count_distribution,
            'detailed_results': results
        }

    def print_summary(self, analysis_results):
        """Print a human-readable summary of ``analyze_all_files`` output.

        Args:
            analysis_results: Dict as returned by ``analyze_all_files``.
        """
        detailed_results = analysis_results['detailed_results']

        print("REFLACX BOUNDING BOX ANALYSIS SUMMARY")
        print("=" * 50)
        print(f"Total REFLACX bounding box files: {analysis_results['total_files']}")
        print(f"Files with all 17 regions: {analysis_results['files_with_all_17']}")
        print(f"Percentage complete: {analysis_results['percentage_complete']:.1f}%")

        print("Region Count Distribution:")
        for count, freq in sorted(analysis_results['region_count_distribution'].items()):
            print(f"  {count} regions: {freq} files")

        # Surface load failures explicitly; otherwise they are only visible
        # as anonymous entries in the "0 regions" bucket above.
        failed_files = [r for r in detailed_results if 'error' in r]
        if failed_files:
            print(f"Files that could not be analyzed (counted under 0 regions): {len(failed_files)}")
            for i, result in enumerate(failed_files[:3]):
                file_name = os.path.basename(result['file_path'])
                print(f"  {i+1}. {file_name}: {result['error']}")

        incomplete_files = [r for r in detailed_results
                            if not r['has_all_17'] and 'error' not in r]

        if incomplete_files:
            print("Examples of incomplete files (showing first 3):")
            for i, result in enumerate(incomplete_files[:3]):
                file_name = os.path.basename(result['file_path'])
                missing = len(result['missing_regions'])
                print(f"  {i+1}. {file_name}: {result['total_regions']} regions ({missing} missing)")
                if missing <= 5:
                    # Sets have no stable order; sort for reproducible output.
                    print(f"     Missing: {', '.join(sorted(result['missing_regions']))}")

    def get_complete_files_list(self, analysis_results):
        """Return the paths of all files that contain every expected region.

        Args:
            analysis_results: Dict as returned by ``analyze_all_files``.

        Returns:
            A list of file paths, in the order of ``detailed_results``.
        """
        return [r['file_path'] for r in analysis_results['detailed_results']
                if r['has_all_17']]

def main():
    """Run the region-completeness analysis over all REFLACX bounding box files.

    Builds an analyzer with its default base path, analyzes every file it
    finds, prints the summary, and collects the files that contain all
    17 regions.

    Returns:
        A tuple ``(results, complete_files)``: the aggregate analysis dict
        and the list of paths of complete files.
    """
    print("Starting REFLACX bounding box region analysis...")

    region_analyzer = REFLACXRegionAnalyzer()
    analysis = region_analyzer.analyze_all_files()
    region_analyzer.print_summary(analysis)

    usable_files = region_analyzer.get_complete_files_list(analysis)
    usable_count = len(usable_files)

    print("Analysis complete!")
    print(f"{usable_count} REFLACX files have all 17 regions and can be used for averaging")

    return analysis, usable_files

# Script entry point: run the analysis only when this file is executed
# directly, not when it is imported. The two values returned by main() are
# bound as module-level names.
if __name__ == "__main__":
    results, complete_files = main()
