"""
Data processing utilities for MIMIC-IV analysis
"""

import numpy as np
import pandas as pd
import math
from typing import Optional, Tuple


class DataUtils:
    """Stateless helpers for parsing, validation, balance diagnostics and weights.

    Every method is a ``@staticmethod``; the class only serves as a namespace.
    """

    @staticmethod
    def to_datetime_safe(s: pd.Series, fmt: Optional[str] = None) -> pd.Series:
        """Convert a Series to datetimes, turning unparseable entries into NaT.

        Args:
            s: Series of values to parse.
            fmt: Optional explicit format forwarded to ``pd.to_datetime``.

        Returns:
            A new Series of parsed datetimes; values that could not be parsed
            by either pass are NaT.
        """
        parsed = pd.to_datetime(s, errors="coerce", format=fmt)
        failed = parsed.isna()
        if failed.any():
            # Second pass over only the rows that failed, this time without
            # the explicit format, so differently formatted values get
            # another chance to parse.
            retry = pd.to_datetime(s[failed], errors="coerce")
            parsed.loc[failed] = retry
        return parsed

    @staticmethod
    def standardized_mean_difference(x: np.ndarray, t: np.ndarray,
                                     w: Optional[np.ndarray] = None) -> float:
        """Compute the (optionally weighted) standardized mean difference.

        The result is ``(m1 - m0) / sqrt((v1 + v0) / 2 + 1e-9)`` where ``m``
        and ``v`` are the weighted mean and weighted variance of ``x`` within
        the ``t == 1`` and ``t == 0`` groups.

        Args:
            x: Covariate values (any array-like convertible to float).
            t: Group indicator, converted to int; 1 and 0 mark the two groups.
            w: Optional per-observation weights (any array-like). Defaults to
                equal weights.

        Returns:
            The standardized mean difference, or NaN when either group is
            empty or has a total weight of zero.
        """
        x = np.asarray(x, dtype=float)
        t = np.asarray(t, dtype=int)

        if w is None:
            w = np.ones_like(t, dtype=float)
        else:
            # Plain lists/tuples do not support boolean-mask indexing.
            w = np.asarray(w, dtype=float)

        in_group1 = t == 1
        in_group0 = t == 0
        if not in_group1.any() or not in_group0.any():
            return np.nan

        x1, w1 = x[in_group1], w[in_group1]
        x0, w0 = x[in_group0], w[in_group0]

        # np.average raises ZeroDivisionError when the weights sum to zero;
        # treat such a group like an empty one.
        if w1.sum() == 0 or w0.sum() == 0:
            return np.nan

        # Weighted means
        m1 = np.average(x1, weights=w1)
        m0 = np.average(x0, weights=w0)

        # Weighted variances
        v1 = np.average((x1 - m1) ** 2, weights=w1)
        v0 = np.average((x0 - m0) ** 2, weights=w0)

        # The 1e-9 term keeps the denominator non-zero for constant covariates.
        return float((m1 - m0) / np.sqrt((v1 + v0) / 2 + 1e-9))

    @staticmethod
    def evalue_from_hr(hr: float, lcl: float, ucl: float) -> Tuple[float, float]:
        """Calculate E-values for a hazard ratio and its confidence interval.

        Uses ``RR + sqrt(RR * (RR - 1))``. Ratios below 1 are inverted first,
        so protective effects get a proper E-value instead of 1.0. The
        confidence-interval E-value uses the limit closest to the null:
        ``lcl`` when ``hr > 1`` and ``ucl`` when ``hr < 1``. It is 1.0 when
        the interval includes 1.

        Args:
            hr: Point estimate of the hazard ratio.
            lcl: Lower confidence limit.
            ucl: Upper confidence limit (only read when ``hr < 1``).

        Returns:
            ``(e_value_point, e_value_ci)``. Values that are not positive
            ratios (zero, negative, NaN) yield 1.0 rather than raising.
        """
        def _evalue(ratio: float) -> float:
            # Work on the scale where the ratio is above 1.
            if 0 < ratio < 1:
                ratio = 1.0 / ratio
            if ratio > 1:
                return ratio + math.sqrt(ratio * (ratio - 1.0))
            return 1.0

        hr = float(hr)
        point = _evalue(hr)

        if hr > 1:
            lower = float(lcl)
            bound = _evalue(lower) if lower > 1 else 1.0
        elif 0 < hr < 1:
            upper = float(ucl)
            bound = _evalue(upper) if 0 < upper < 1 else 1.0
        else:
            bound = 1.0

        return (point, bound)

    @staticmethod
    def validate_dataframe(df: pd.DataFrame, required_columns: list) -> bool:
        """Check that a DataFrame contains every required column.

        Args:
            df: DataFrame to inspect.
            required_columns: Column names that must be present.

        Returns:
            True when all required columns are present.

        Raises:
            ValueError: If one or more required columns are missing.
        """
        missing = set(required_columns) - set(df.columns)
        if missing:
            raise ValueError(f"Missing required columns: {missing}")
        return True

    @staticmethod
    def safe_numeric_conversion(series: pd.Series, fill_value: float = 0.0) -> pd.Series:
        """Convert a Series to numeric, replacing unparseable or missing values.

        Args:
            series: Values to convert.
            fill_value: Replacement for entries that are NaN after conversion.

        Returns:
            A numeric Series with no NaN left from the conversion.
        """
        return pd.to_numeric(series, errors="coerce").fillna(fill_value)

    @staticmethod
    def calculate_effective_sample_size(weights: np.ndarray) -> float:
        """Calculate the effective sample size ``(sum w)^2 / sum(w^2)``.

        Args:
            weights: Per-observation weights (any array-like).

        Returns:
            The effective sample size, or 0.0 when the weights are empty or
            all zero (where the ratio would otherwise be 0/0).
        """
        # Cast to float so squaring integer weights cannot overflow.
        weights = np.asarray(weights, dtype=float)
        sum_of_squares = (weights ** 2).sum()
        if sum_of_squares == 0:
            return 0.0
        return float((weights.sum() ** 2) / sum_of_squares)

    @staticmethod
    def clip_and_trim_weights(weights: np.ndarray,
                              clip_bounds: Tuple[float, float] = (0.001, 0.999),
                              trim_quantiles: Tuple[float, float] = (0.01, 0.99)) -> np.ndarray:
        """Clamp values to fixed bounds, then to their own empirical quantiles.

        Args:
            weights: Values to process (any array-like).
            clip_bounds: ``(low, high)`` absolute bounds applied first.
            trim_quantiles: ``(low, high)`` quantile levels, evaluated on the
                already clipped values, used as the second pair of bounds.

        Returns:
            A new array of the same length; no element is removed. Empty
            input is returned as an empty array.
        """
        weights = np.asarray(weights, dtype=float)
        if weights.size == 0:
            # Quantiles of an empty array are undefined; nothing to clamp.
            return weights

        # First clip
        weights = np.clip(weights, *clip_bounds)

        # Then trim based on quantiles
        trim_bounds = np.quantile(weights, trim_quantiles)
        weights = np.clip(weights, *trim_bounds)

        return weights
