import sys
import os
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
import pandas as pd
import os
import numpy as np
from utils import check_data

# Lookup tables translating integer codes into display labels.
# Every table is built with enumerate(), so a label's position in the tuple
# is the integer code it stands for (first entry -> 0, second -> 1, ...).
GALLSTONE_STATUS = dict(enumerate(('No', 'Yes')))
GENDER = dict(enumerate(('Male', 'Female')))
COMORBIDITY = dict(enumerate((
    'No comorbidities present',
    'One comorbid condition',
    'Two comorbid conditions',
    'Three or more comorbid conditions',
)))
# The yes/no tables are deliberately separate dict objects, one per column.
CAD = dict(enumerate(('No', 'Yes')))
HYPOTHYROIDISM = dict(enumerate(('No', 'Yes')))
HYPERLIPIDEMIA = dict(enumerate(('No', 'Yes')))
DIABETES = dict(enumerate(('No', 'Yes')))
HFA = dict(enumerate((
    'No fat accumulation',
    'Grade 1 (mild)',
    'Grade 2 (moderate)',
    'Grade 3 (severe)',
    'Grade 4 (very severe)',
)))

################################################################################
# (1) DATA LOADING
################################################################################
# Announce the step between two 50-character rules.
print("=" * 50, "STEP 1: DATA LOADING", "=" * 50, sep="\n")

# The input and output CSV files sit in the same directory as this script.
_SCRIPT_DIR = os.path.dirname(__file__)
RAW_PATH = os.path.join(_SCRIPT_DIR, 'raw.csv')
OUT_PATH = os.path.join(_SCRIPT_DIR, 'gallstone.csv')

# Load every column as text with NA detection switched off, so empty cells
# come through as empty strings instead of NaN.
df = pd.read_csv(RAW_PATH, na_filter=False, dtype=str)

# NOTE(review): the message below mentions a saved raw file, but this step
# only reads RAW_PATH; nothing is written here.
print("STEP 1 COMPLETED: Data loaded and raw file saved", "=" * 50, sep="\n")


################################################################################
# (2) FORMAT
################################################################################
print("STEP 2: FORMAT", "=" * 50, sep="\n")

# The target column keeps its values untouched; only its header is changed
# to 'label'. Nothing happens when the source column is absent.
_LABEL_SOURCE = 'Gallstone Status'
if _LABEL_SOURCE in df.columns:
    df = df.rename(columns={_LABEL_SOURCE: 'label'})
    print("Renamed 'Gallstone Status' column to 'label'.")

print("STEP 2 COMPLETED: Data formatted and cleaned", "=" * 50, sep="\n")


################################################################################
# (3) VALIDATION
################################################################################
print("STEP 3: VALIDATION")
print("=" * 50)

# check_data is a project helper imported from utils; this script treats a
# None return value as "validation failed".
df = check_data(df)
if df is None:
    print("ERROR: Data validation failed!")
    # Use sys.exit(1) instead of the bare exit(): exit() is a convenience
    # injected by the site module (missing under `python -S`) and, called
    # without arguments, terminates with status 0, which would report a
    # failed validation as success to any calling shell or pipeline.
    sys.exit(1)

print("STEP 3 COMPLETED: Data validation passed")
print("=" * 50)


################################################################################
# (4) POSTPROCESSING & SAVE
################################################################################
print("STEP 4: POSTPROCESSING & SAVE")
print("=" * 50)

# One table drives both the numeric conversion and the label mapping, so the
# list of converted columns and the list of mapped columns cannot drift apart.
# 'label' is intentionally absent: it is saved with its original values.
COLUMN_MAPPINGS = {
    'Gender': GENDER,
    'Comorbidity': COMORBIDITY,
    'Coronary Artery Disease (CAD)': CAD,
    'Hypothyroidism': HYPOTHYROIDISM,
    'Hyperlipidemia': HYPERLIPIDEMIA,
    'Diabetes Mellitus (DM)': DIABETES,
    'Hepatic Fat Accumulation (HFA)': HFA,
}

for col, mapping in COLUMN_MAPPINGS.items():
    # Columns missing from the input are skipped rather than treated as errors.
    if col not in df.columns:
        continue

    raw_values = df[col]
    # Codes may arrive as text; anything non-numeric is coerced to NaN, and
    # Series.map also yields NaN for numeric codes that the mapping lacks.
    decoded = pd.to_numeric(raw_values, errors='coerce').map(mapping)

    # NaN results are written as empty cells below. Report how many inputs
    # that actually held a value were lost this way, instead of dropping them
    # silently. The saved data itself is unaffected by this check.
    had_value = raw_values.notna() & raw_values.astype(str).str.strip().ne('')
    lost = int((had_value & decoded.isna()).sum())
    if lost:
        print(f"WARNING: {lost} value(s) in '{col}' could not be mapped and will be saved as empty.")

    df[col] = decoded

# Save final processed data; missing values are written as empty strings.
df.to_csv(OUT_PATH, index=False, na_rep='')
print(f'Preprocessing complete. Saved to {OUT_PATH}')
print("STEP 4 COMPLETED: Final data saved")

print("=" * 50)
print("ALL PREPROCESSING STEPS COMPLETED!")
print("=" * 50)