import sys
import os
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
import pandas as pd
import os
import numpy as np
from utils import check_data
from ucimlrepo import fetch_ucirepo


################################################################################
# (1) DATA LOADING
################################################################################
# Stage banner: a 50-character rule above and below the stage title.
print("=" * 50, "STEP 1: DATA LOADING", "=" * 50, sep="\n")

# Both CSV files live next to this script: the untouched download and the
# final processed table.
base_dir = os.path.dirname(__file__)
RAW_PATH, OUT_PATH = (
    os.path.join(base_dir, filename) for filename in ('raw.csv', 'glioma.csv')
)

# Download dataset id 759 from the UCI repository.
print("Fetching glioma data from UCI repository...")
data_instance = fetch_ucirepo(id=759)

# The repository object exposes features and targets as separate tables.
X, y = data_instance.data.features, data_instance.data.targets

# Place the target column(s) to the right of the features in one frame.
print("Combining features and targets...")
df = pd.concat([X, y], axis=1)

# Keep a copy of the data exactly as fetched, before any formatting.
df.to_csv(RAW_PATH, index=False)

print("STEP 1 COMPLETED: Data loaded and raw file saved", "=" * 50, sep="\n")


################################################################################
# (2) FORMAT
################################################################################
print("STEP 2: FORMAT", "=" * 50, sep="\n")

# Rename the 'Grade' column to 'label'; a frame without 'Grade' is left as is.
column_renames = {'Grade': 'label'}
has_grade_column = 'Grade' in df.columns
if has_grade_column:
    print("Converting 'Grade' column to 'label'...")
    df = df.rename(column_renames, axis='columns')
    print("Renamed 'Grade' column to 'label'.")

print("STEP 2 COMPLETED: Data formatted and cleaned", "=" * 50, sep="\n")


################################################################################
# (3) VALIDATION
################################################################################
print("STEP 3: VALIDATION")
print("=" * 50)

# Run the project validator on the frame. The check below treats a None
# result as "validation failed"; any other result replaces df.
df = check_data(df)

if df is None:
    print("ERROR: Data validation failed!")
    # Exit with a non-zero status so shells, CI jobs and calling scripts can
    # detect the failure. The bare exit() builtin is injected by the `site`
    # module (not available under `python -S` or in frozen builds) and, when
    # called without arguments, reports success (status 0).
    sys.exit(1)

print("STEP 3 COMPLETED: Data validation passed")
print("=" * 50)


################################################################################
# (4) POSTPROCESSING & SAVE
################################################################################
print("STEP 4: POSTPROCESSING & SAVE")
print("=" * 50)

# Columns that share the same 0/1 -> NOT_MUTATED/MUTATED decoding.
mutation_columns = (
    'IDH1', 'TP53', 'ATRX', 'PTEN', 'EGFR', 'CIC', 'MUC16', 'PIK3CA', 'NF1',
    'PIK3R1', 'FUBP1', 'RB1', 'NOTCH1', 'BCOR', 'CSMD3', 'SMARCA4', 'GRIN2A',
    'IDH2', 'FAT4', 'PDGFRA',
)

# Per-column decoding tables: numeric code -> human-readable category.
# Every mutation column gets its own dict instance, as in a hand-written table.
value_maps = {'Gender': {0: 'male', 1: 'female'}}
value_maps.update(
    {gene: {0: 'NOT_MUTATED', 1: 'MUTATED'} for gene in mutation_columns}
)

for col, mapping in value_maps.items():
    # Columns missing from the frame are skipped rather than treated as errors.
    if col not in df.columns:
        continue

    decoded = df[col].map(mapping)

    # Series.map() yields NaN for every value that has no key in the mapping.
    # Report those cases instead of letting data disappear silently; values
    # that were already missing before decoding are not counted.
    lost = decoded.isna() & df[col].notna()
    if lost.any():
        unexpected = sorted(df.loc[lost, col].unique().tolist(), key=str)
        print(
            f"WARNING: {int(lost.sum())} value(s) in column '{col}' have no "
            f"mapping and were set to NaN: {unexpected}"
        )

    df[col] = decoded

# Write the decoded table to the final output location.
df.to_csv(OUT_PATH, index=False)
print("STEP 4 COMPLETED: Final data saved")

print("=" * 50)
print("ALL PREPROCESSING STEPS COMPLETED!")
print("=" * 50)