import os
from data.dataset_english_medical import create_fast_dataset

def create_dataset_from_config_simple(config, tokenizer):
    """
    Build a dataset from the ``dataset`` section of a configuration mapping.

    Args:
        config: Configuration dictionary; its ``dataset`` entry supplies
            ``root`` (required) and the optional ``image_size``,
            ``max_text_length`` and ``sample_ratio`` settings.
        tokenizer: Tokenizer forwarded unchanged to ``create_fast_dataset``.

    Returns:
        Whatever ``create_fast_dataset`` returns for the resolved parameters.

    Raises:
        ValueError: If ``dataset.root`` is missing or empty.
        FileNotFoundError: If ``dataset.root`` does not exist on disk.
    """
    settings = config.get('dataset', {})
    dataset_root = settings.get('root')

    # Fail early with explicit messages before touching the loader
    if not dataset_root:
        raise ValueError("Missing dataset root directory config dataset.root")
    if not os.path.exists(dataset_root):
        raise FileNotFoundError(f"Dataset path does not exist: {dataset_root}")

    # Optional settings fall back to the defaults given here
    return create_fast_dataset(
        dataset_root=dataset_root,
        tokenizer=tokenizer,
        image_size=settings.get('image_size', 224),
        max_text_length=settings.get('max_text_length', 128),
        sample_ratio=settings.get('sample_ratio', 1.0),
    )

def create_dataset_with_preset_simple(config, preset_name, tokenizer):
    """
    Create a dataset using a named preset layered over the base configuration.

    The preset is looked up under ``config['presets'][preset_name]``. Only its
    ``dataset`` section is merged; keys there override the same keys in
    ``config['dataset']``. The caller's ``config`` is never modified.

    Args:
        config: Base configuration dictionary
        preset_name: Name of the preset to apply
        tokenizer: Tokenizer forwarded to ``create_dataset_from_config_simple``

    Returns:
        Dataset instance produced by ``create_dataset_from_config_simple``
    """
    preset_config = config.get('presets', {}).get(preset_name, {})

    if not preset_config:
        print(f"Warning: Preset config '{preset_name}' does not exist, using default config")
        return create_dataset_from_config_simple(config, tokenizer)

    # Copy the top level, then rebuild the 'dataset' section as a NEW dict.
    # A plain config.copy() is shallow, so updating the nested dict in place
    # would leak the preset overrides into the caller's configuration.
    merged_config = config.copy()
    dataset_config = dict(config.get('dataset') or {})
    if 'dataset' in preset_config:
        dataset_config.update(preset_config['dataset'] or {})
    merged_config['dataset'] = dataset_config

    print(f"Using preset configuration: {preset_name}")
    # Read from the merged section directly; it always exists here, so a
    # config without any 'dataset' section reaches the descriptive error
    # raised downstream instead of failing with a bare KeyError.
    sample_ratio = dataset_config.get('sample_ratio', 1.0)
    print(f"Data ratio: {sample_ratio*100:.1f}%")

    return create_dataset_from_config_simple(merged_config, tokenizer)

def print_dataset_summary_simple(dataset_root):
    """
    Print a short structural report for the dataset directory.

    Every immediate subdirectory of ``dataset_root`` is treated as one
    domain. For each domain the report counts ``*_region_en.jsonl`` files and
    whether ``images`` and ``roi_images`` entries exist. Any error raised
    while scanning is printed rather than propagated.

    Args:
        dataset_root: Path of the dataset directory to inspect
    """
    print(f"\n=== Dataset Summary: {dataset_root} ===")

    try:
        if not os.path.exists(dataset_root):
            # Early exit: the closing separator line is intentionally skipped
            print("Dataset directory does not exist")
            return

        # Collect the domain subdirectories
        domains = []
        for entry in os.listdir(dataset_root):
            if os.path.isdir(os.path.join(dataset_root, entry)):
                domains.append(entry)
        domain_count = len(domains)

        print(f"Number of medical domains: {domain_count}")
        print(f"Domain list: {', '.join(domains[:5])}{'...' if len(domains) > 5 else ''}")

        # Tally data files and image folders across all domains
        jsonl_count = 0
        image_dir_count = 0
        roi_dir_count = 0
        for name in domains:
            base = os.path.join(dataset_root, name)
            jsonl_count += sum(
                1 for fname in os.listdir(base)
                if fname.endswith('_region_en.jsonl')
            )
            image_dir_count += int(os.path.exists(os.path.join(base, 'images')))
            roi_dir_count += int(os.path.exists(os.path.join(base, 'roi_images')))

        print(f"Number of JSONL files: {jsonl_count}")
        print(f"Number of image directories: {image_dir_count}")
        print(f"Number of ROI directories: {roi_dir_count}")

        # Share of domains that have each folder (0 when there are no domains)
        if domains:
            completeness = (image_dir_count / domain_count) * 100
            roi_coverage = (roi_dir_count / domain_count) * 100
        else:
            completeness = 0
            roi_coverage = 0

        print(f"Data completeness: {completeness:.1f}%")
        print(f"ROI coverage: {roi_coverage:.1f}%")

        print("\nRecommended usage ratios:")
        print("  Fast test: sample_ratio=0.01 (1%)")
        print("  Small experiment: sample_ratio=0.1 (10%)")
        print("  Full training: sample_ratio=1.0 (100%)")

    except Exception as e:
        print(f"Error analyzing dataset: {e}")

    print("=" * 50)

def estimate_loading_time(dataset_root, sample_ratio, *,
                          samples_per_domain=1000, samples_per_second=3000):
    """
    Roughly estimate how long loading the dataset will take.

    The estimate is purely heuristic: the number of immediate subdirectories
    of ``dataset_root`` is multiplied by ``samples_per_domain``, scaled by
    ``sample_ratio``, and divided by ``samples_per_second``.

    Args:
        dataset_root: Path of the dataset directory
        sample_ratio: Fraction of the data that will be loaded
        samples_per_domain: Assumed number of samples in each domain
            (keyword-only, defaults to 1000)
        samples_per_second: Assumed loading throughput
            (keyword-only, defaults to 3000)

    Returns:
        A dict with ``estimated_samples`` (int), ``estimated_time_seconds``
        (float) and ``time_range`` (string spanning 0.5x to 2x the estimate),
        or ``None`` when the estimate cannot be computed (for example the
        directory cannot be listed).
    """
    try:
        # Each immediate subdirectory counts as one domain
        domains = [d for d in os.listdir(dataset_root)
                   if os.path.isdir(os.path.join(dataset_root, d))]

        # Rough sample count from the per-domain assumption
        estimated_total_samples = len(domains) * samples_per_domain
        actual_samples = int(estimated_total_samples * sample_ratio)

        # Loading time estimate (seconds)
        estimated_time = actual_samples / samples_per_second

        return {
            'estimated_samples': actual_samples,
            'estimated_time_seconds': estimated_time,
            'time_range': f"{estimated_time*0.5:.1f}-{estimated_time*2:.1f}s"
        }
    except Exception:
        # Best-effort helper: ordinary failures yield None, but unlike a bare
        # ``except`` this lets KeyboardInterrupt and SystemExit propagate.
        return None

def validate_dataset_path(dataset_root):
    """
    Check that the dataset directory has the expected basic layout.

    The path must exist, contain at least one subdirectory (a domain), and
    at least one of those subdirectories must hold a file whose name ends
    with ``_region_en.jsonl``. A confirmation line is printed on success.

    Args:
        dataset_root: Path of the dataset directory to validate

    Raises:
        FileNotFoundError: If ``dataset_root`` does not exist.
        ValueError: If there are no subdirectories, or none of them contains
            a ``*_region_en.jsonl`` file.
    """
    if not os.path.exists(dataset_root):
        raise FileNotFoundError(f"Dataset path does not exist: {dataset_root}")

    # Gather the domain subdirectories
    domains = []
    for entry in os.listdir(dataset_root):
        if os.path.isdir(os.path.join(dataset_root, entry)):
            domains.append(entry)

    if not domains:
        raise ValueError(f"No medical domain subdirectories found in dataset directory: {dataset_root}")

    # Stop scanning as soon as one domain yields a matching JSONL file
    found_jsonl = any(
        fname.endswith('_region_en.jsonl')
        for domain in domains
        for fname in os.listdir(os.path.join(dataset_root, domain))
    )
    if not found_jsonl:
        raise ValueError("No *_region_en.jsonl files found in dataset")

    print(f"Dataset path validation passed: {len(domains)} medical domains")

if __name__ == "__main__":
    # Command-line entry point: validate the given dataset directory, print
    # its summary, then show loading-time estimates for a few sample ratios.
    import sys

    if len(sys.argv) <= 1:
        # No dataset path supplied: show usage help only
        print("Usage: python dataset_factory_simple.py <dataset_root>")
        print("Example: python dataset_factory_simple.py /root/autodl-tmp/dataset")
    else:
        dataset_root = sys.argv[1]

        print("Validating dataset path...")
        try:
            validate_dataset_path(dataset_root)
        except Exception as e:
            # Report the problem and exit with a non-zero status
            print(f"Validation failed: {e}")
            sys.exit(1)

        print_dataset_summary_simple(dataset_root)

        # Estimate loading time for different ratios
        print("\nLoading time estimates:")
        for ratio in (0.01, 0.1, 1.0):
            estimate = estimate_loading_time(dataset_root, ratio)
            if not estimate:
                # No estimate available for this ratio; skip its line
                continue
            print(f"  {ratio*100:.0f}% data: ~{estimate['estimated_samples']:,} samples, {estimate['time_range']}")
