import pandas as pd
import numpy as np

def preprocess_data(input_path: str = 'raw.csv',
                    output_path: str = 'autos.csv') -> pd.DataFrame:
    """Turn the raw automobile CSV into a binary-labelled dataset.

    Steps, in order: load the CSV, report and drop every row containing a
    missing value, derive a binary ``label`` column from ``symboling``
    (1 when symboling is 2 or 3, 0 otherwise), drop ``symboling``, print a
    summary of the remaining object-typed columns, and write the result to
    ``output_path``.

    Args:
        input_path: Path of the raw CSV file to read. Defaults to
            ``'raw.csv'`` (the location the original script used).
        output_path: Path the processed CSV is written to. Defaults to
            ``'autos.csv'``.

    Returns:
        The processed DataFrame (without ``symboling``, with ``label``).

    Raises:
        KeyError: If the input file has no ``symboling`` column.
    """
    # Load raw data
    df = pd.read_csv(input_path)

    if 'symboling' not in df.columns:
        raise KeyError(f"'symboling' column not found in {input_path!r}")

    print(f"Original shape: {df.shape}")
    print("Original symboling distribution:")
    print(df['symboling'].value_counts().sort_index())

    # Check for missing values
    print("\nMissing values per column:")
    missing = df.isnull().sum()
    if missing.sum() > 0:
        print(missing[missing > 0])
    else:
        print("No missing values found")

    # Handle missing values by dropping rows with any missing values
    initial_rows = df.shape[0]
    df = df.dropna()
    removed_rows = initial_rows - df.shape[0]
    if removed_rows > 0:
        print(f"Removed {removed_rows} rows with missing values")
        print(f"Shape after removing missing values: {df.shape}")

    print(f"\nSymboling values: {sorted(df['symboling'].astype(str).unique())}")

    # Create binary label: 1 if symboling is +2 or +3, 0 otherwise.
    # Compare numerically rather than on the string form: when the raw column
    # contained missing values pandas loads it as float, and its string form
    # is then '2.0'/'3.0', which would never match '2'/'3'. Values that cannot
    # be parsed as numbers become NaN and are labelled 0.
    symboling_values = pd.to_numeric(df['symboling'], errors='coerce')
    df['label'] = symboling_values.isin([2, 3]).astype(int)

    print("\nLabel encoding rule: symboling +2 or +3 -> 1 (anomaly), others -> 0 (normal)")
    print("Label distribution:")
    print(df['label'].value_counts())

    # Drop symboling column since it's now used as label
    df = df.drop('symboling', axis=1)

    # Print unique values for categorical features. 'label' is an integer
    # column, so it is never part of the object-typed selection.
    print("\nCategorical features and their unique values:")
    categorical_cols = df.select_dtypes(include=['object']).columns.tolist()

    for col in categorical_cols:
        unique_vals = df[col].unique()
        print(f"{col}: {unique_vals[:5]}{'...' if len(unique_vals) > 5 else ''} (count: {len(unique_vals)})")

    print(f"\nProcessed shape: {df.shape}")

    # Save processed data
    df.to_csv(output_path, index=False)
    print("Preprocessing completed!")

    return df

# Script entry point: run the preprocessing pipeline only when this file is
# executed directly, not when it is imported as a module.
if __name__ == "__main__":
    preprocess_data()