import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
import os
import shap
def remove_leading_zeros(x):
    """Return ``str(x)`` with every leading ``'0'`` character removed.

    A value whose text consists only of zeros (or is empty) collapses to
    ``'0'`` so the result is never an empty string. Only characters at the
    very start are stripped, so ``'0.5'`` becomes ``'.5'`` and ``'-05'`` is
    left untouched.

    Args:
        x: Any value; it is converted with ``str()`` before stripping.

    Returns:
        The stripped string, or ``x`` itself, unchanged, when converting it
        to a string raises an ordinary exception.
    """
    try:
        stripped = str(x).lstrip('0')
    except Exception:
        # Best effort: hand the value back when it cannot be stringified.
        # Catching Exception (not a bare except) lets KeyboardInterrupt and
        # SystemExit propagate instead of being silently swallowed.
        return x
    return stripped or '0'
def preprocess_credit_approval(input_file='credit_approval.data', output_file='credit_approval_processed.csv'):
    """Clean the raw credit-approval table and save it as a numeric CSV.

    The input is read from ``<base>/FaiGBFC/Datasets/raw dataset/<input_file>``
    with ``?`` parsed as missing. Columns are addressed by position: 1, 2, 7,
    10, 13 and 14 are treated as continuous, 15 is the label, and every other
    column is treated as categorical.

    Processing steps:
        1. Strip leading zeros from the text of columns 10, 13 and 14.
        2. Coerce the continuous columns to numbers (unparseable -> NaN).
        3. Fill missing values: column mean for continuous columns, column
           mode for all other columns (the label included).
        4. Encode the label as 1 for ``'+'`` and 0 for anything else.
        5. Integer-encode the remaining text columns with ``LabelEncoder``.
        6. Move the last column to the front and write the CSV without the
           index.

    Args:
        input_file: File name of the raw data inside the raw-dataset folder.
        output_file: Path of the CSV file to write.

    Returns:
        None. The result is written to ``output_file`` and a confirmation
        message is printed.
    """
    # NOTE: '__file__' is a quoted literal, i.e. a relative file name, so
    # <base> is effectively the current working directory rather than the
    # folder containing this script.
    src_path = os.path.dirname(os.path.realpath('__file__'))
    src_path = os.path.join(src_path, 'FaiGBFC')
    dataset_path = os.path.join(src_path, 'Datasets/raw dataset')
    data_path = os.path.join(dataset_path, input_file)

    # header=0 consumes the first line of the file as column names.
    # NOTE(review): if the raw file ships without a header line, its first
    # record is lost here - confirm against the actual file.
    data = pd.read_csv(data_path, header=0, na_values='?')

    continuous_indices = [1, 2, 7, 10, 13, 14]
    columns = data.columns.tolist()
    continuous_cols = [columns[i] for i in continuous_indices]
    label_col = columns[15]

    # Normalise zero-padded values (e.g. '00202') before numeric conversion.
    for col in [columns[10], columns[13], columns[14]]:
        data[col] = data[col].astype(str).apply(remove_leading_zeros)

    for col in continuous_cols:
        data[col] = pd.to_numeric(data[col], errors='coerce')

    for col in data.columns:
        if col in continuous_cols:
            data[col] = data[col].fillna(data[col].mean())
        else:
            data[col] = data[col].fillna(data[col].mode()[0])
    data[label_col] = data[label_col].apply(lambda x: 1 if x == '+' else 0)

    for col in data.columns:
        if col in continuous_cols or col == label_col:
            continue
        col_dtype = data[col].dtype
        # Detect text columns through the dtype helpers rather than
        # `dtype == object`: with pandas' dedicated string dtype that
        # comparison is False and text columns would be left un-encoded.
        if pd.api.types.is_object_dtype(col_dtype) or pd.api.types.is_string_dtype(col_dtype):
            le = LabelEncoder()
            data[col] = le.fit_transform(data[col].astype(str))

    data = move_last_column_to_first(data)

    data.to_csv(output_file, index=False)
    print(f"✅ 数据预处理完成，保存为：{output_file}")


def preprocess_credit_default(input_file='default of credit card clients.xls', output_file='credit_default.csv'):
    """Prepare the credit-card default spreadsheet and export it as CSV.

    Reads ``<base>/FaiGBFC/Datasets/raw dataset/<input_file>`` using the
    second spreadsheet row as the header, drops ``ID``, renames
    ``default payment next month`` to ``Y``, and then:

    * converts ``PAY_0`` and ``PAY_2`` .. ``PAY_6`` to numbers and adds 2;
    * coerces the continuous columns (``LIMIT_BAL``, ``AGE``,
      ``BILL_AMT1-6``, ``PAY_AMT1-6``) to numbers and fills gaps with the
      column mean;
    * fills gaps in the categorical columns with the column mode;
    * recodes ``SEX`` (1 -> 0, 2 -> 1; any other value becomes NaN);
    * flips ``Y`` so an original 1 becomes 0 and every other value becomes 1;
    * moves the last column to the front and writes ``output_file`` without
      the index.

    Args:
        input_file: File name of the spreadsheet inside the raw-dataset folder.
        output_file: Path of the CSV file to write.

    Returns:
        None. A confirmation message is printed after the file is written.
    """
    # '__file__' is quoted, so it is a plain relative name: the base folder
    # is effectively the current working directory, not this script's folder.
    working_dir = os.path.dirname(os.path.realpath('__file__'))
    raw_dir = os.path.join(working_dir, 'FaiGBFC', 'Datasets/raw dataset')
    frame = pd.read_excel(os.path.join(raw_dir, input_file), header=1)

    frame = frame.drop(columns=['ID']).rename(columns={'default payment next month': 'Y'})

    status_cols = ['PAY_0', 'PAY_2', 'PAY_3', 'PAY_4', 'PAY_5', 'PAY_6']
    category_cols = {'SEX', 'EDUCATION', 'MARRIAGE', 'Y', *status_cols}
    numeric_cols = {'LIMIT_BAL', 'AGE'}
    for k in range(1, 7):
        numeric_cols.add(f'BILL_AMT{k}')
        numeric_cols.add(f'PAY_AMT{k}')

    # Shift the repayment-status codes up by 2.
    # NOTE(review): presumably so the lowest code (-2) maps to 0 - confirm.
    for name in status_cols:
        frame[name] = pd.to_numeric(frame[name], errors='coerce') + 2

    for name in frame.columns:
        if name in numeric_cols:
            as_number = pd.to_numeric(frame[name], errors='coerce')
            frame[name] = as_number.fillna(as_number.mean())
            continue
        if name in category_cols:
            most_common = frame[name].mode()[0]
            frame[name] = frame[name].fillna(most_common)

    sex_codes = {1: 0, 2: 1}
    frame['SEX'] = frame['SEX'].map(sex_codes)
    # Invert the target: 1 in the raw data becomes 0, everything else 1.
    frame['Y'] = frame['Y'].apply(lambda label: 1 if label != 1 else 0)

    frame = move_last_column_to_first(frame)

    frame.to_csv(output_file, index=False)
    print(f"✅ 信用卡违约数据预处理完成，保存为：{output_file}")





def move_last_column_to_first(df):
    """Return a new DataFrame with the last column moved to the front.

    The remaining columns keep their relative order. Columns are selected by
    position, so frames with repeated column names are reordered correctly
    instead of having the repeated columns duplicated by label lookup.

    Args:
        df: The DataFrame to reorder; it is not modified.

    Returns:
        A new DataFrame with the same rows and the reordered columns. A frame
        without columns is returned as an unchanged copy.
    """
    n_cols = df.shape[1]
    if n_cols == 0:
        # Nothing to move; avoid indexing the last column of an empty frame.
        return df.copy()
    positions = [n_cols - 1, *range(n_cols - 1)]
    return df.iloc[:, positions]


def main():
    """Run the selected dataset preprocessing pipeline(s).

    Only the credit-default pipeline is currently active. The commented-out
    calls are switches for other datasets: uncomment a line to run it.
    """
    # NOTE(review): of the disabled calls below, only
    # preprocess_credit_approval is defined in the visible part of this
    # file - confirm the others exist before enabling them.
    # preprocess_credit_approval()
    # preprocess_adult()
    # preprocess_recruitment()
    preprocess_credit_default()
    # preprocess_thyroid()
    # preprocess_bail_data()
    # preprocess_compas_data()
    # preprocess_law_school_data()
    # preprocess_bank_data()
    
# Run the preprocessing only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()