#!/usr/bin/env python3
"""
Memory-Safe Configuration for Schema Induction Pipeline

This module provides memory-safe settings to prevent system freezes
when processing large datasets on systems with limited RAM.
"""

import os
import psutil
from typing import Dict, Any

def get_system_memory_info() -> Dict[str, Any]:
    """Take a snapshot of system RAM usage via ``psutil.virtual_memory()``.

    Returns:
        Dictionary with ``total_gb``, ``available_gb`` and ``used_gb``
        (psutil's byte counts divided by 1024**3) and ``percent_used``
        (psutil's ``percent`` field, passed through unchanged).
    """
    bytes_per_gb = 1024**3
    vm = psutil.virtual_memory()
    return dict(
        total_gb=vm.total / bytes_per_gb,
        available_gb=vm.available / bytes_per_gb,
        used_gb=vm.used / bytes_per_gb,
        percent_used=vm.percent,
    )

def get_memory_safe_settings() -> Dict[str, Any]:
    """
    Pick a settings profile from the machine's total RAM.

    The profile is selected from ``total_gb`` only; ``available_gb`` is
    reported in the printed summary but does not influence the choice.
    A one-line memory summary and a banner naming the chosen profile are
    printed on every call.

    Returns:
        Dictionary with the keys ``embedding_chunk_size``,
        ``max_concurrency``, ``batch_size``, ``max_dataset_size``,
        ``memory_threshold_gb`` and ``cleanup_interval``.
    """
    mem = get_system_memory_info()
    installed_gb = mem['total_gb']
    free_gb = mem['available_gb']

    print(f"💾 System Memory: {installed_gb:.1f}GB total, {free_gb:.1f}GB available")

    # Tier 1: total RAM up to and including 8GB -> smallest chunks/batches.
    if installed_gb <= 8:
        print("⚠️  Using conservative settings for 8GB system")
        return {
            'embedding_chunk_size': 500,
            'max_concurrency': 64,
            'batch_size': 100,
            'max_dataset_size': 50000,
            'memory_threshold_gb': 1.0,
            'cleanup_interval': 10,
        }

    # Tier 2: total RAM above 8GB, up to and including 16GB.
    if installed_gb <= 16:
        print("⚖️  Using balanced settings for 16GB system")
        return {
            'embedding_chunk_size': 1000,
            'max_concurrency': 128,
            'batch_size': 200,
            'max_dataset_size': 100000,
            'memory_threshold_gb': 2.0,
            'cleanup_interval': 20,
        }

    # Tier 3: everything above 16GB. The banner says "32GB+", but this
    # branch is also taken by machines between 16GB and 32GB.
    print("🚀 Using aggressive settings for 32GB+ system")
    return {
        'embedding_chunk_size': 2000,
        'max_concurrency': 256,
        'batch_size': 500,
        'max_dataset_size': 200000,
        'memory_threshold_gb': 4.0,
        'cleanup_interval': 50,
    }

def check_memory_safety() -> bool:
    """
    Compare currently available RAM against the active profile's threshold.

    Reads memory once for the comparison, then calls
    ``get_memory_safe_settings()`` (which prints its own summary and takes
    its own memory reading) to obtain ``memory_threshold_gb``.

    Returns:
        True when available memory is at or above the threshold; False
        (after printing a warning and recommendations) when it is below.
    """
    free_gb = get_system_memory_info()['available_gb']
    limit_gb = get_memory_safe_settings()['memory_threshold_gb']

    low_on_memory = free_gb < limit_gb
    if not low_on_memory:
        print(f"✅ Memory check passed: {free_gb:.1f}GB available")
        return True

    advice = (
        f"🚨 WARNING: Only {free_gb:.1f}GB available, below threshold of {limit_gb}GB",
        "💡 Recommendations:",
        "   1. Close other applications",
        "   2. Restart your computer",
        "   3. Use smaller dataset",
        "   4. Process in smaller chunks",
    )
    for line in advice:
        print(line)
    return False

def get_environment_overrides() -> Dict[str, str]:
    """
    Translate the active settings profile into environment variable values.

    Returns:
        Dictionary mapping ``EMBEDDING_CHUNK_SIZE``, ``VLLM_MAX_CONCURRENCY``
        and ``BATCH_SIZE`` to the stringified profile values.
    """
    profile = get_memory_safe_settings()

    # (environment variable name, key in the settings profile)
    env_to_setting = (
        ('EMBEDDING_CHUNK_SIZE', 'embedding_chunk_size'),
        ('VLLM_MAX_CONCURRENCY', 'max_concurrency'),
        ('BATCH_SIZE', 'batch_size'),
    )
    return {env_name: str(profile[key]) for env_name, key in env_to_setting}

def apply_memory_safe_settings():
    """Write the memory-safe overrides into ``os.environ``.

    Each variable returned by ``get_environment_overrides()`` is assigned
    unconditionally (replacing any existing value) and echoed to stdout.
    """
    for name, setting in get_environment_overrides().items():
        os.environ[name] = setting
        print(f"🔧 Set {name}={setting}")

def get_dataset_size_warning(dataset_size: int) -> str:
    """
    Build a warning when a dataset is larger than the active profile allows.

    Args:
        dataset_size: Number of records in dataset

    Returns:
        A warning message when ``dataset_size`` is strictly greater than the
        profile's ``max_dataset_size``; otherwise an empty string.
    """
    limit = get_memory_safe_settings()['max_dataset_size']

    return (
        f"⚠️  Dataset size ({dataset_size:,}) exceeds recommended limit ({limit:,}) for your system"
        if dataset_size > limit
        else ""
    )

if __name__ == "__main__":
    # Manual smoke run: exercises each public helper and prints the results.
    print("🧪 Testing Memory-Safe Configuration")
    print("=" * 50)

    # 1. Memory check (prints its own pass/fail message).
    check_memory_safety()

    # 2. Active profile. Fetched before the heading is printed because the
    #    call itself emits the memory summary and profile banner.
    profile = get_memory_safe_settings()
    print("\n📋 Safe Settings:")
    for name in profile:
        print(f"   {name}: {profile[name]}")

    # 3. Environment variables the profile would set (not applied here).
    print("\n🔧 Environment Overrides:")
    for env_name, env_value in get_environment_overrides().items():
        print(f"   {env_name}={env_value}")

    # 4. Dataset size warnings at a few representative record counts.
    print("\n📊 Dataset Size Warnings:")
    for record_count in (10000, 50000, 100000, 200000):
        verdict = get_dataset_size_warning(record_count) or "✅ Safe"
        print(f"   {record_count:,} records: {verdict}")