"""
Configuration settings for MIMIC-IV AKI Causal Inference Analysis
Supplementary Material Implementation
"""

from pathlib import Path
import pandas as pd


class Config:
    """Centralized configuration for reproducible analysis.

    Every setting is a plain instance attribute assigned in ``__init__``.
    Constructing an instance also creates the results directory on disk.
    """

    def __init__(self, mimic_dir=None, results_dir=None):
        """Populate all settings and ensure the results directory exists.

        Args:
            mimic_dir: Optional root of the MIMIC-IV installation (``str`` or
                ``Path``). When omitted, the placeholder path below is used
                and must be edited for your environment.
            results_dir: Optional output directory (``str`` or ``Path``).
                When omitted, ``<BASE_DIR>/results`` is used.
        """
        # Data paths - pass mimic_dir/results_dir, or MODIFY THE DEFAULTS
        # BELOW FOR YOUR ENVIRONMENT
        self.BASE_DIR = Path(__file__).parent.parent
        if mimic_dir is None:
            self.MIMIC_DIR = Path("/path/to/mimic-iv/3.1/")  # UPDATE THIS PATH
        else:
            self.MIMIC_DIR = Path(mimic_dir)
        self.HOSP_DIR = self.MIMIC_DIR / "hosp"
        self.NOTE_DIR = self.MIMIC_DIR / "note"
        if results_dir is None:
            self.RESULTS_DIR = self.BASE_DIR / "results"
        else:
            self.RESULTS_DIR = Path(results_dir)

        # Create results directory (including parents); no error if present
        self.RESULTS_DIR.mkdir(parents=True, exist_ok=True)

        # Analysis parameters (matching paper exactly)
        self.RANDOM_STATE = 7
        self.PS_CLIP = (1e-3, 1-1e-3)        # Propensity score clipping bounds
        self.W_TRIM = (0.01, 0.99)           # Weight trimming quantiles
        self.VPT_WINDOW_HOURS = 6            # Time window for combination therapy
        self.COX_PENALIZER = 0.1             # Cox regression L2 penalty

        # Clinical criteria patterns
        # NOTE(review): every pattern here is lowercase except RX_EMERG_SUB,
        # which is uppercase - presumably callers normalize case or match
        # case-insensitively; verify against the code that applies them.
        self.RX_SCR_LABEL = "creatinine"
        self.RX_SCR_FLUID = r"\b(?:serum|blood)\b"   # whole word "serum" or "blood"
        self.RX_MGDL = r"\bmg/dl\b"
        self.RX_MGL = r"\bmg/l\b"
        self.RX_VANCO_SUB = "vancomycin"
        self.RX_PTZ = r"(?:piperacillin|tazobactam|zosyn)"
        self.RX_EMERG_SUB = "EMER"

        # LLM-derived confounders (as described in paper)
        self.CONFOUNDERS = [
            "f_ckd_pre",        # Pre-existing chronic kidney disease
            "f_dm_pre",         # Pre-existing diabetes mellitus
            "f_hf_pre",         # Pre-existing heart failure
            "f_liver_pre",      # Pre-existing liver disease
            "f_nephrotox_pre",  # Pre-existing nephrotoxic exposure
        ]

        # Data reading parameters
        # (presumably forwarded as keyword arguments to the pandas CSV
        # reader - confirm against callers)
        self.READ_KW = dict(dtype_backend="pyarrow", low_memory=False)

        # LLM processing parameters
        self.LLM_MODEL = "gpt-4o-mini"
        self.LLM_TEMPERATURE = 0.0           # Deterministic extraction
        self.LLM_MAX_TOKENS = 200
        self.MAX_NOTE_CHARS = 15000          # Context limit for discharge notes

        # Validation parameters
        self.BOOTSTRAP_ITERATIONS = 300      # For precision assessment
        self.N_JOBS = -1                     # Parallel processing (-1 = all CPUs)

    def validate_paths(self):
        """Validate that required MIMIC-IV files exist.

        Checks five compressed CSV files directly under ``HOSP_DIR``. A
        missing ``NOTE_DIR`` is not an error; it only prints a warning.

        Raises:
            FileNotFoundError: If any required file is missing. The message
                lists the full path of every missing file, one per line.
        """
        required_files = (
            "prescriptions.csv.gz",
            "admissions.csv.gz",
            "patients.csv.gz",
            "labevents.csv.gz",
            "d_labitems.csv.gz",
        )

        # Build each candidate path once, then keep the ones not on disk.
        candidates = (self.HOSP_DIR / name for name in required_files)
        missing_files = [str(path) for path in candidates if not path.exists()]

        if missing_files:
            raise FileNotFoundError(
                "Missing required MIMIC-IV files:\n"
                + "\n".join(missing_files)
                + "\n\nPlease update MIMIC_DIR in config.py to point to your MIMIC-IV installation."
            )

        # Check note directory if using clinical notes
        if not self.NOTE_DIR.exists():
            print(f"Warning: Note directory not found: {self.NOTE_DIR}")
            print("Clinical note processing will be unavailable.")

    def get_summary(self):
        """Return configuration summary for logging.

        Returns:
            dict: Selected settings keyed by lowercase name. Paths are
            converted to ``str``. ``"confounders"`` is a copy of
            ``CONFOUNDERS``, so editing the summary cannot alter the
            configuration itself.
        """
        return {
            "mimic_dir": str(self.MIMIC_DIR),
            "results_dir": str(self.RESULTS_DIR),
            "random_state": self.RANDOM_STATE,
            "vpt_window_hours": self.VPT_WINDOW_HOURS,
            "confounders": list(self.CONFOUNDERS),
            "llm_model": self.LLM_MODEL,
            "bootstrap_iterations": self.BOOTSTRAP_ITERATIONS,
        }


# Default configuration instance, built once when this module is imported.
# NOTE: Config.__init__ creates RESULTS_DIR on disk, so importing this module
# has that filesystem side effect.
DEFAULT_CONFIG = Config()
