import sys
import os
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
import pandas as pd
import numpy as np
from ucimlrepo import fetch_ucirepo
from utils import check_data


################################################################################
# (1) DATA LOADING
################################################################################
for banner_line in ("=" * 50, "STEP 1: DATA LOADING", "=" * 50):
    print(banner_line)

# Both CSV files are written into the directory that holds this script.
base_dir = os.path.dirname(__file__)
RAW_PATH = os.path.join(base_dir, 'raw.csv')    # snapshot saved before formatting
OUT_PATH = os.path.join(base_dir, 'glass.csv')  # destination of the final save

# Download dataset id 42 from the UCI repository.
print("Fetching glass data from UCI repository...")
data_instance = fetch_ucirepo(id=42)

# Pull the feature table, the target table and the metadata out of the result.
dataset = data_instance.data
X, y = dataset.features, dataset.targets
info = data_instance.metadata  # not read again in the visible part of this script

# Put the target column(s) to the right of the features in a single frame.
print("Combining features and targets...")
df = pd.concat([X, y], axis=1)

# Persist the combined frame as-is, without the index column.
df.to_csv(RAW_PATH, index=False)

print("STEP 1 COMPLETED: Data loaded and raw file saved")
print("=" * 50)


################################################################################
# (2) FORMAT
################################################################################
print("STEP 2: FORMAT")
print("=" * 50)

# Binary relabelling of the glass type: classes 1, 2, 3 are treated as
# normal (0) and classes 5, 6, 7 as anomaly (1).
GLASS_TYPE_TO_LABEL = {1: 0, 2: 0, 3: 0, 5: 1, 6: 1, 7: 1}

# Convert target column to 'label' and map values
if 'Type_of_glass' in df.columns:
    print("Converting 'Type_of_glass' column to 'label'...")
    glass_type = df['Type_of_glass']
    df['label'] = glass_type.map(GLASS_TYPE_TO_LABEL)

    # Series.map yields NaN for every value that is not a key of the mapping.
    # Report such rows instead of letting them pass through unnoticed; the
    # data itself is left unchanged so downstream handling is not affected.
    unmapped = df['label'].isna()
    if unmapped.any():
        unexpected_values = glass_type[unmapped].unique().tolist()
        print(f"WARNING: {int(unmapped.sum())} rows have no label "
              f"(unmapped or missing 'Type_of_glass' values: {unexpected_values})")

    df.drop(columns=['Type_of_glass'], inplace=True)
    print("Mapped 'Type_of_glass' to 'label': classes 1,2,3 -> normal (0), classes 5,6,7 -> anomaly (1)")

    # Print class distribution
    print("Class distribution:")
    print(f"Normal (0): {(df['label'] == 0).sum()} samples")
    print(f"Anomaly (1): {(df['label'] == 1).sum()} samples")
else:
    # Without the target column no 'label' can be derived; say so explicitly
    # rather than skipping the conversion silently.
    print("WARNING: 'Type_of_glass' column not found; no 'label' column was created")

print("STEP 2 COMPLETED: Data formatted and cleaned")
print("=" * 50)


################################################################################
# (3) VALIDATION
################################################################################
print("STEP 3: VALIDATION")
print("=" * 50)

# Validate data using check_data function; the check below treats a None
# return value as a validation failure.
df = check_data(df)

if df is None:
    print("ERROR: Data validation failed!")
    # Exit with a non-zero status so calling shells/pipelines can detect the
    # failure. sys.exit is used instead of the bare exit() helper, which is
    # installed by the site module for interactive use and exits with 0 here.
    sys.exit(1)

print("STEP 3 COMPLETED: Data validation passed")
print("=" * 50)


################################################################################
# (4) POSTPROCESSING & SAVE
################################################################################
rule = "=" * 50
print("STEP 4: POSTPROCESSING & SAVE", rule, sep="\n")

# Write the frame returned by validation to the output path, without the index.
df.to_csv(OUT_PATH, index=False)

# Closing banner: stage completion followed by the overall completion notice.
print("\n".join([
    "STEP 4 COMPLETED: Final data saved",
    rule,
    "ALL PREPROCESSING STEPS COMPLETED!",
    rule,
]))
