import sys
import os
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
import pandas as pd
import os
import numpy as np
from utils import check_data
from ucimlrepo import fetch_ucirepo

################################################################################
# (1) DATA LOADING
################################################################################
# Stage banner.
print("=" * 50, "STEP 1: DATA LOADING", "=" * 50, sep="\n")

# Both the raw dump and the final output are written next to this script.
base_dir = os.path.dirname(__file__)
RAW_PATH, OUT_PATH = (
    os.path.join(base_dir, file_name) for file_name in ('raw.csv', 'credit.csv')
)

print("Fetching data...")
# Fetch dataset id 350 through ucimlrepo.
data_instance = fetch_ucirepo(id=350)

# Features and targets come back separately (as pandas dataframes);
# join them column-wise into a single frame.
X, y = data_instance.data.features, data_instance.data.targets
df = pd.concat([X, y], axis=1)

# Keep a copy of the data exactly as fetched, before any renaming or mapping.
df.to_csv(RAW_PATH, index=False)

print("STEP 1 COMPLETED: Data loaded and raw file saved", "=" * 50, sep="\n")

################################################################################
# (2) FORMAT
################################################################################
print("STEP 2: FORMAT", "=" * 50, sep="\n")

# Descriptive names for the generic feature columns X1..X23, listed in
# positional order (X1 first). The target column Y becomes 'label'.
feature_names = [
    'LIMIT_BAL', 'SEX', 'EDUCATION', 'MARRIAGE', 'AGE',
    'PAY_0', 'PAY_2', 'PAY_3', 'PAY_4', 'PAY_5', 'PAY_6',
    'BILL_AMT1', 'BILL_AMT2', 'BILL_AMT3', 'BILL_AMT4', 'BILL_AMT5', 'BILL_AMT6',
    'PAY_AMT1', 'PAY_AMT2', 'PAY_AMT3', 'PAY_AMT4', 'PAY_AMT5', 'PAY_AMT6',
]
column_mapping = {
    f'X{position}': name for position, name in enumerate(feature_names, start=1)
}
column_mapping['Y'] = 'label'

# Rename the columns, then echo the full mapping table.
df = df.rename(columns=column_mapping)
print("Applied column mapping:")
print("\n".join(f"  {src} -> {dst}" for src, dst in column_mapping.items()))

print("STEP 2 COMPLETED: Data formatted and cleaned", "=" * 50, sep="\n")

################################################################################
# (3) VALIDATION
################################################################################
print("STEP 3: VALIDATION")
print("=" * 50)

# Run the project's check_data helper. This script treats a None result as a
# validation failure; any other result replaces df for the remaining steps.
df = check_data(df)

if df is None:
    print("ERROR: Data validation failed!")
    # Exit with a non-zero status so shells, schedulers and pipelines see the
    # failure. The bare exit() used before is a helper injected by the `site`
    # module (not guaranteed to exist) and, called without an argument,
    # terminated with status 0, i.e. "success".
    sys.exit(1)

print("STEP 3 COMPLETED: Data validation passed")
print("=" * 50)

################################################################################
# (4) POSTPROCESSING & SAVE
################################################################################
print("STEP 4: POSTPROCESSING & SAVE")
print("=" * 50)

# Code -> label tables for the categorical demographic columns.
value_maps = {
    'SEX': {1: 'male', 2: 'female'},
    'EDUCATION': {1: 'graduate school', 2: 'university', 3: 'high school', 4: 'others'},
    'MARRIAGE': {1: 'married', 2: 'single', 3: 'others'}
}

# Code -> label table shared by the payment-history columns PAY_0 to PAY_6.
# NOTE(review): confirm the labels chosen for codes -2 and 0 against the
# dataset documentation; the table they were taken from is not in this file.
payment_mapping = {
    -2: 'no consumption',
    -1: 'pay duly',
    0: 'no payment',
    1: 'payment delay for 1 month',
    2: 'payment delay for 2 months',
    3: 'payment delay for 3 months',
    4: 'payment delay for 4 months',
    5: 'payment delay for 5 months',
    6: 'payment delay for 6 months',
    7: 'payment delay for 7 months',
    8: 'payment delay for 8 months',
    9: 'payment delay for 9+ months'
}

pay_columns = ['PAY_0', 'PAY_2', 'PAY_3', 'PAY_4', 'PAY_5', 'PAY_6']

# One lookup table per column: the payment columns first, then the
# demographic columns, so both groups go through the same checked path.
code_tables = {col: payment_mapping for col in pay_columns}
code_tables.update(value_maps)

for col, mapping in code_tables.items():
    if col not in df.columns:
        continue
    decoded = df[col].map(mapping)
    # Series.map with a dict yields NaN for every value that is not a key.
    # The saved output is unchanged, but such conversions are reported here
    # instead of happening silently after validation has already passed.
    unmapped = df[col].notna() & decoded.isna()
    if unmapped.any():
        codes = df.loc[unmapped, col].unique().tolist()
        print(
            f"WARNING: {col}: {int(unmapped.sum())} value(s) with unmapped "
            f"codes {codes} were set to missing"
        )
    df[col] = decoded

# Save final processed data
df.to_csv(OUT_PATH, index=False)
print("STEP 4 COMPLETED: Final data saved")

print("=" * 50)
print("ALL PREPROCESSING STEPS COMPLETED!")
print("=" * 50)