import pandas as pd
import numpy as np

def preprocess_data(input_path: str = 'raw.csv',
                    output_path: str = 'telco.csv') -> pd.DataFrame:
    """Clean the raw telco customer churn CSV and save the processed copy.

    The pipeline, in order:

    1. Load ``input_path`` and report its shape, target distribution and
       per-column missing-value counts.
    2. Coerce ``TotalCharges`` to numeric (unparseable entries become NaN)
       and drop the rows where it is missing.
    3. Drop any remaining rows that contain a missing value in any column.
    4. Recode ``SeniorCitizen`` from 0/1 to 'No'/'Yes'.
    5. Print the unique values of every object-dtype column except ``label``.
    6. Encode ``label`` as 1 where it equals 'Yes' and 0 otherwise.
    7. Write the result to ``output_path`` without the index.

    Args:
        input_path: CSV file to read. Must contain the ``label``,
            ``TotalCharges`` and ``SeniorCitizen`` columns.
        output_path: Destination CSV file for the processed data.

    Returns:
        The processed DataFrame that was written to ``output_path``.

    Raises:
        KeyError: If one of the required columns is absent.
    """
    # Load raw data
    df = pd.read_csv(input_path)

    print(f"Original shape: {df.shape}")
    print("Original target distribution:")
    print(df['label'].value_counts())

    # Check for missing values
    print("\nMissing values per column:")
    missing = df.isnull().sum()
    if missing.sum() > 0:
        print(missing[missing > 0])
    else:
        print("No missing values found")

    # Handle TotalCharges - convert to numeric; invalid entries become NaN
    print(f"\nTotalCharges data type: {df['TotalCharges'].dtype}")
    df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')

    # Drop only the rows whose TotalCharges is missing, so the count printed
    # here matches the rows actually removed by this step. Rows with gaps in
    # other columns are handled (and reported) by the generic pass below.
    missing_total_charges = int(df['TotalCharges'].isnull().sum())
    if missing_total_charges > 0:
        print(f"Found {missing_total_charges} rows with missing/invalid TotalCharges values")
        print(f"Removing {missing_total_charges} rows with missing values")
        df = df.dropna(subset=['TotalCharges'])
        print(f"Shape after removing missing values: {df.shape}")

    # Remove any other missing values if they exist
    rows_before = len(df)
    df = df.dropna()
    removed_rows = rows_before - len(df)
    if removed_rows > 0:
        print(f"Removed {removed_rows} additional rows with missing values")
        print(f"Final shape after removing all missing values: {df.shape}")

    # Convert SeniorCitizen from 0/1 to No/Yes. Values outside the mapping
    # are kept as they are: a plain dict-based ``map`` would turn them into
    # NaN *after* the missing-value cleanup and silently blank the column
    # (e.g. when the input has already been converted).
    senior_labels = {0: 'No', 1: 'Yes'}
    print("\nConverting SeniorCitizen from numeric to categorical:")
    print(f"Original values: {df['SeniorCitizen'].unique()}")
    df['SeniorCitizen'] = df['SeniorCitizen'].map(
        lambda value: senior_labels.get(value, value)
    )
    print(f"Converted values: {df['SeniorCitizen'].unique()}")

    # Print unique values for all categorical features (target excluded)
    print("\nCategorical features and their unique values:")
    categorical_cols = [
        col for col in df.select_dtypes(include=['object']).columns
        if col != 'label'
    ]
    for col in categorical_cols:
        unique_vals = df[col].unique()
        print(f"{col}: {unique_vals} (count: {len(unique_vals)})")

    # Convert target to binary (Yes=1, No=0) - treating churn as anomaly
    df['label'] = (df['label'] == 'Yes').astype(int)

    print(f"Processed shape: {df.shape}")
    print("Processed target distribution:")
    print(df['label'].value_counts())

    # Save processed data
    df.to_csv(output_path, index=False)
    print("Simple preprocessing completed!")

    return df

# Run the preprocessing pipeline only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    preprocess_data()
