
import numpy as np
import pandas as pd
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split


def load_income_data(random_state=42, *, test_size=0.25):
    """Load the OpenML "adult" dataset as a stratified train/test split.

    Preprocessing applied before splitting:

    * the ``fnlwgt`` and ``education-num`` columns are dropped;
    * the target is encoded as ``1`` where the label equals ``">50K"``
      and ``0`` for every other label;
    * literal ``"?"`` entries in the features are treated as missing, and
      every row with at least one missing feature value is removed;
    * the remaining rows are re-indexed from 0.

    Args:
        random_state: Seed forwarded to ``train_test_split`` so the split
            is reproducible.
        test_size: Size of the test partition, forwarded unchanged to
            ``train_test_split``. Keyword-only; defaults to ``0.25``.

    Returns:
        A tuple ``(X_train, X_test, y_train, y_test)``. The split is
        stratified on the binary target.
    """
    adult = fetch_openml(name="adult", version=2, as_frame=True)

    # ``drop`` and the comparison below each build a new object, so no
    # explicit defensive copy of the fetched data is needed.
    features = adult.data.drop(columns=["fnlwgt", "education-num"])
    target = (adult.target == ">50K").astype(int)

    # Normalise "?" placeholders to NaN so a single ``notna`` check finds
    # every incomplete row.
    features = features.replace("?", np.nan)
    complete_rows = features.notna().all(axis=1)

    # Filter features and target with the same mask so they stay aligned,
    # then reset both to a matching 0..n-1 index.
    features = features.loc[complete_rows].reset_index(drop=True)
    target = target.loc[complete_rows].reset_index(drop=True)

    X_train, X_test, y_train, y_test = train_test_split(
        features,
        target,
        test_size=test_size,
        random_state=random_state,
        stratify=target,
    )

    return X_train, X_test, y_train, y_test









