import pandas as pd

# Load the original dataset. low_memory=False makes pandas infer each column's
# dtype from the whole file instead of chunk by chunk (avoids mixed dtypes).
df = pd.read_csv("loan.csv", low_memory=False)

# Keep only loans whose status is 'Fully Paid' or 'Charged Off', and drop rows
# missing any of the core numeric fields.
df = df[df['loan_status'].isin(['Fully Paid', 'Charged Off'])]
# The explicit .copy() gives an independent frame, so adding the target column
# below does not trigger pandas' SettingWithCopyWarning.
df = df.dropna(subset=['annual_inc', 'loan_amnt', 'int_rate', 'installment']).copy()

# Create binary target: 1 for 'Fully Paid', 0 for 'Charged Off'
df['y'] = (df['loan_status'] == 'Fully Paid').astype(int)

# Summary statistics of the filtered dataset, shown with one row per
# described column (describe() output transposed).
print("📊 Original dataset statistics:")
stats = df.describe()
print(stats.transpose())

# Identify small categorical columns (2–10 distinct non-null values).
# The target and the status column it is derived from are skipped: an
# indicator built from either one just reproduces y and would dominate the
# correlation ranking with a trivial +/-1.
target_cols = {'loan_status', 'y'}
nunique = df.nunique()
small_cat_cols = [
    name
    for name in nunique[(nunique >= 2) & (nunique <= 10)].index
    if name not in target_cols
]

# Create A candidates: one 0/1 indicator per selected column, flagging the
# rows that hold the column's most frequent value (missing counts as a value).
# No all-null check is needed: nunique() ignores NaN, so every selected column
# has at least two distinct non-null values.
A_candidates = {}
for col in small_cat_cols:
    top_val = df[col].value_counts(dropna=False).idxmax()
    new_col_name = f"A__{col}__is_{top_val}"
    if pd.isna(top_val):
        # NaN never compares equal to anything, so `== top_val` would produce
        # an all-zero column; test for missingness explicitly instead.
        is_top = df[col].isna()
    else:
        is_top = df[col] == top_val
    A_candidates[new_col_name] = is_top.astype(int)

# Combine into DataFrame. Passing index=df.index keeps the rows aligned with
# df even when no column qualified as a candidate.
A_binary_df = pd.DataFrame(A_candidates, index=df.index)
df_extended = pd.concat([df, A_binary_df], axis=1)

# Summary statistics of the generated indicator columns only, transposed so
# each indicator is one row.
print("📊 A candidates statistics:")
stats = df_extended.loc[:, A_binary_df.columns].describe()
print(stats.transpose())

# Correlation computation: pair each indicator with the target, take the
# off-diagonal entry of the resulting 2x2 matrix (corr() defaults to Pearson),
# then order the rows by absolute correlation, largest first.
correlation_rows = []
for a_col in A_binary_df.columns:
    pair_corr = df_extended[[a_col, 'y']].corr()
    correlation_rows.append(
        {'A_column': a_col, 'Correlation_with_y': pair_corr.iloc[0, 1]}
    )
correlation_df = pd.DataFrame(correlation_rows)
correlation_df = correlation_df.sort_values(
    by='Correlation_with_y', key=abs, ascending=False
)

# Show the ten most strongly correlated A candidates
print("📈 Correlation with target (y) for A candidates:")
print(correlation_df.head(10))

# Persist both results as CSV files, without writing the row index
for frame, destination in (
    (df_extended, "loan_extended_with_A_candidates.csv"),
    (correlation_df, "A_candidates_correlations.csv"),
):
    frame.to_csv(destination, index=False)
