import pandas as pd
import numpy as np

class Framingham:
    """Load the Framingham dataset CSV and prepare it for survival analysis.

    An instance is bound to one target event (e.g. ``'CVD'``). The matching
    time-to-event column is looked up in an internal event -> time-column map.

    Args:
        target_event: Name of the event column to model (case-insensitive).
            Must be one of ``ANGINA``, ``HOSPMI``, ``ANYCHD``, ``STROKE``,
            ``CVD``, ``DEATH`` or ``HYPERTEN``.
        data_path: Location of the CSV file passed to ``pandas.read_csv``.
            Defaults to ``'data/files/framingham.csv'``.

    Raises:
        AssertionError: If ``target_event`` is not a supported event name.
        FileNotFoundError: If the CSV file does not exist at ``data_path``.
    """

    def __init__(self, target_event='CVD', data_path='data/files/framingham.csv'):
        self._target_event = target_event.upper()
        # Event column -> column holding the time until that event.
        self._target_time_map = {
            'ANGINA': 'TIMEAP', 'HOSPMI': 'TIMEMI', 'ANYCHD': 'TIMECHD',
            'STROKE': 'TIMESTRK', 'CVD': 'TIMECVD', 'DEATH': 'TIMEDTH',
            'HYPERTEN': 'TIMEHYP'
        }

        # Raised explicitly (instead of via an ``assert`` statement) so the
        # validation still runs under ``python -O``. AssertionError is kept as
        # the exception type for backward compatibility with existing callers.
        if self._target_event not in self._target_time_map:
            raise AssertionError(
                f"Target event must be one of {list(self._target_time_map)}"
            )

        # Place the dataset at 'data/files/framingham.csv' or pass its actual
        # location through ``data_path``.
        try:
            self._data = pd.read_csv(data_path)
        except FileNotFoundError as err:
            raise FileNotFoundError(
                f"Please download the Framingham dataset and place it at '{data_path}'"
            ) from err

    @property
    def event(self) -> str:
        """Upper-cased name of the selected target event column."""
        return self._target_event

    @property
    def time(self) -> str:
        """Name of the time-to-event column paired with :attr:`event`."""
        return self._target_time_map[self._target_event]

    @property
    def continuous_predictors(self) -> list:
        """Column names treated as continuous predictors.

        Note: 'TIME' here is a generic placeholder for the specific
        time-to-event column; :meth:`preprocess` excludes it from the
        predictor set.
        """
        return ['AGE', 'TOTCHOL', 'HDLC', 'LDLC', 'SYSBP', 'DIABP', 'BMI', 'GLUCOSE', 'CIGPDAY', 'HEARTRTE', 'TIME']

    @property
    def categorical_predictors(self) -> list:
        """Column names treated as categorical predictors."""
        return ['SEX', 'CURSMOKE', 'DIABETES', 'educ', 'BPMEDS', 'PREVAP', 'PREVCHD', 'PREVMI', 'PREVSTRK', 'PREVHYP']

    @property
    def data(self) -> pd.DataFrame:
        """The raw DataFrame exactly as loaded from the CSV file."""
        return self._data

    @staticmethod
    def _is_lossless_int(series) -> bool:
        """Return True if a float column can be cast to int64 without loss."""
        return bool(
            pd.api.types.is_float_dtype(series)
            and series.notna().all()
            and (series % 1 == 0).all()
        )

    def preprocess(self) -> pd.DataFrame:
        """
        Performs standard preprocessing for the Framingham dataset.

        This involves:
        1.  **Cohort Selection**: Filtering for the first examination period (`PERIOD == 1`).
        2.  **Column Removal**: Dropping `HDLC` and `LDLC` when present.
        3.  **Feature Renaming**: Standardizing event and time columns to 'event' and 'time'.
        4.  **Row Filtering**: Keeping only rows with a strictly positive 'time'.
        5.  **Data Cleaning**: Setting `CIGPDAY` to 0 wherever `CURSMOKE == 0`.
        6.  **Feature Selection**: Keeping only the targets and the predictors
            that exist in the data.
        7.  **Type Normalization**: Casting float columns that hold only whole
            numbers and no missing values to `int64`.

        The DataFrame stored on the instance is never modified.

        Returns:
            pd.DataFrame: A cleaned DataFrame ready for survival analysis modeling,
            with a fresh 0..n-1 index.

        Raises:
            KeyError: If a column required by the pipeline is missing.
        """
        raw = self._data

        required = ['PERIOD', self.event, self.time, 'CURSMOKE']
        missing = [col for col in required if col not in raw.columns]
        if missing:
            raise KeyError(f"Required column(s) missing from the dataset: {missing}")

        # 1. Cohort Selection: use the baseline data from the first period.
        #    The explicit copy keeps every later assignment off ``self._data``.
        df = raw.loc[raw['PERIOD'] == 1].copy()

        # 2. drop() returns a new frame, so the result must be re-bound.
        #    NOTE(review): presumably these lipid columns are not populated at
        #    the first examination -- confirm against the data dictionary.
        df = df.drop(columns=['HDLC', 'LDLC'], errors='ignore')

        # 3. Rename the specific event and time columns to generic names.
        df = df.rename(columns={self.event: 'event', self.time: 'time'})

        # 4. Keep only rows with a positive time; copy so the write below
        #    targets an independent frame rather than a slice.
        df = df.loc[df['time'] > 0].copy()

        # 5. Correct logical inconsistency:
        #    a non-smoker should not have cigarettes per day > 0.
        df.loc[df['CURSMOKE'] == 0, 'CIGPDAY'] = 0

        # 6. Select the final feature set, skipping predictors the data lacks.
        predictor_cols = self.categorical_predictors + [
            p for p in self.continuous_predictors if p != 'TIME'
        ]
        final_cols = ['event', 'time'] + [p for p in predictor_cols if p in df.columns]
        df_processed = df[final_cols]

        # 7. Ensure data types are appropriate (e.g., integers for categorical).
        #    Columns containing NaN stay float because int64 cannot hold NaN.
        int_casts = {
            col: 'int64'
            for col in df_processed.columns
            if self._is_lossless_int(df_processed[col])
        }
        df_processed = df_processed.astype(int_casts)

        return df_processed.reset_index(drop=True)

