import sys
import os
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
import pandas as pd
import os
import numpy as np
from utils import check_data
from ucimlrepo import fetch_ucirepo

################################################################################
# (1) DATA LOADING
################################################################################
print("=" * 50, "STEP 1: DATA LOADING", "=" * 50, sep="\n")

# Both the raw dump and the processed output are written next to this script.
base_dir = os.path.dirname(__file__)
RAW_PATH, OUT_PATH = (
    os.path.join(base_dir, file_name)
    for file_name in ('raw.csv', 'cirrhosis.csv')
)

print("Fetching data...")
# Dataset id 878 in the UCI ML repository (fetched through ucimlrepo).
data_instance = fetch_ucirepo(id=878)

# Feature and target frames come back separately; join them column-wise
# into one table and keep an untouched copy on disk.
X, y = data_instance.data.features, data_instance.data.targets
df = pd.concat([X, y], axis=1)
df.to_csv(RAW_PATH, index=False)

print("STEP 1 COMPLETED: Data loaded and raw file saved", "=" * 50, sep="\n")


################################################################################
# (2) FORMAT
################################################################################
print("STEP 2: FORMAT", "=" * 50, sep="\n")


def _count_nann_cells(frame):
    """Return how many cells of *frame* contain 'NaNN' once cast to str."""
    per_column = frame.apply(lambda x: x.astype(str).str.contains('NaNN').sum(), axis=0)
    return per_column.sum()


# The target column arrives as 'Status'; rename it to 'label' when present.
if 'Status' in df.columns:
    df = df.rename(columns={'Status': 'label'})
    print("Renamed 'Status' column to 'label'.")

# Binarise the target: C and CL -> 0, D -> 1.
# Series.map turns any value missing from the mapping into NaN.
if 'label' in df.columns:
    status_to_label = {'C': 0, 'CL': 0, 'D': 1}
    df['label'] = df['label'].map(status_to_label)
    print("Converted Status values: C/CL -> 0, D -> 1")

# Corner case: the raw data encodes some missing values as the literal
# string "NaNN"; swap those cells for real NaN and report the counts.
print("Replacing 'NaNN' strings with np.nan...")
count_NaNN_string_before = _count_nann_cells(df)
df = df.replace('NaNN', np.nan)
count_NaNN_string_after = _count_nann_cells(df)
print("Processing NaNN strings: before: ", count_NaNN_string_before, "after: ", count_NaNN_string_after)

print("STEP 2 COMPLETED: Data formatted and cleaned", "=" * 50, sep="\n")


################################################################################
# (3) VALIDATION
################################################################################
print("STEP 3: VALIDATION")
print("=" * 50)

# Run the shared validation helper from utils; this script treats a None
# return value as "validation failed".
df = check_data(df)

if df is None:
    print("ERROR: Data validation failed!")
    # Exit with a non-zero status so shells and pipelines can detect the
    # failure. The bare exit() helper is injected by the site module (so it
    # is not guaranteed to exist) and would report success (status 0).
    sys.exit(1)

print("STEP 3 COMPLETED: Data validation passed")
print("=" * 50)


################################################################################
# (4) POSTPROCESSING & SAVE
################################################################################
print("STEP 4: POSTPROCESSING & SAVE", "=" * 50, sep="\n")

# Expand the single-letter category codes into readable text.
# Three of the columns share the same N/Y encoding.
yes_no = {'N': 'No', 'Y': 'Yes'}
value_maps = {
    'Sex': {'M': 'male', 'F': 'female'},
    'Ascites': yes_no,
    'Hepatomegaly': yes_no,
    'Spiders': yes_no,
    'Edema': {
        'N': 'no edema and no diuretic therapy for edema',
        'S': 'edema present without diuretics, or edema resolved by diuretics',
        'Y': 'edema despite diuretic therapy',
    },
}
# Only touch columns that are actually present in the frame.
for column_name in value_maps:
    if column_name not in df.columns:
        continue
    df[column_name] = df[column_name].map(value_maps[column_name])

# Write the fully processed table to its final location.
df.to_csv(OUT_PATH, index=False)
print(
    "STEP 4 COMPLETED: Final data saved",
    "=" * 50,
    "ALL PREPROCESSING STEPS COMPLETED!",
    "=" * 50,
    sep="\n",
)