import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
import copy

# Seed NumPy's global RNG so the random sampling done with np.random.choice
# further down in this script is reproducible.
np.random.seed(32)

loans = pd.read_csv(
    '../../_datasets/LendingClub/accepted_2007_to_2018Q4.csv.gz',
    compression='gzip',
    low_memory=False,
)
# Keep only the two loan statuses used to build the binary target. The explicit
# copy makes `loans` an independent frame, so the column assignment below does
# not write into a slice of the raw data.
loans = loans.loc[loans['loan_status'].isin(['Fully Paid', 'Charged Off'])].copy()
# Binary target: 1 = 'Charged Off', 0 = 'Fully Paid'.
# (np.int was removed in NumPy 1.24; astype(int) is the supported spelling.)
loans['charged_off'] = (loans['loan_status'] == 'Charged Off').astype(int)
loans.drop('loan_status', axis=1, inplace=True)

# Balance the two classes: randomly discard as many charged_off == 0 rows as
# that class outnumbers the charged_off == 1 class by.
one_class = loans.loc[loans['charged_off'] == 1]
zero_class = loans.loc[loans['charged_off'] == 0]
excess = zero_class.shape[0] - one_class.shape[0]
# index labels of the class-0 rows to throw away (sampled without replacement)
loc_to_drop = np.random.choice(zero_class.index, size=excess, replace=False)
keep_rows = ~loans.index.isin(loc_to_drop)
loans = loans[keep_rows]

# Drop every column whose fraction of missing values exceeds 30%.
missing_fractions = loans.isna().mean().sort_values(ascending=False)
too_sparse = missing_fractions > 0.3
drop_list = sorted(missing_fractions.loc[too_sparse].index)
loans.drop(columns=drop_list, inplace=True)

# Whitelist of columns to retain; every other column still present is dropped.
keep_list = [
    'addr_state', 'annual_inc', 'application_type', 'charged_off', 'dti',
    'earliest_cr_line', 'emp_length', 'emp_title', 'fico_range_high',
    'fico_range_low', 'grade', 'home_ownership', 'id',
    'initial_list_status', 'installment', 'int_rate', 'issue_d',
    'loan_amnt', 'mort_acc', 'open_acc', 'pub_rec', 'pub_rec_bankruptcies',
    'purpose', 'revol_bal', 'revol_util', 'sub_grade', 'term', 'title',
    'total_acc', 'verification_status', 'zip_code',
]
not_whitelisted = ~loans.columns.isin(keep_list)
drop_list = loans.columns[not_whitelisted].tolist()
loans.drop(columns=drop_list, inplace=True)

# some initial pre-processing
# Columns not used as model inputs.
loans.drop(['id', 'grade', 'emp_title'], axis=1, inplace=True)
# 'term' is a string whose first whitespace-separated token is the number;
# keep just that number. (np.int was removed in NumPy 1.24 -> builtin int.)
loans['term'] = loans['term'].apply(lambda s: int(s.split()[0]))
# Normalise the two irregular employment-length labels so that every label
# starts with a plain integer token. The result is assigned back to the column:
# calling replace(..., inplace=True) on the `loans['emp_length']` selection is a
# chained in-place edit that does not update `loans` under pandas Copy-on-Write.
loans['emp_length'] = loans['emp_length'].replace(
    {'10+ years': '10 years', '< 1 year': '0 years'}
)


def emp_length_to_int(s):
    """Return the leading integer of an employment-length label.

    Args:
        s: A label such as ``'10 years'`` whose first whitespace-separated
            token is an integer, or a missing value.

    Returns:
        The integer value of the first token, or ``s`` itself, unchanged,
        when ``pd.isnull(s)`` is true.
    """
    if pd.isnull(s):
        return s
    # np.int was only an alias of the builtin and was removed in NumPy 1.24.
    return int(s.split()[0])


# Employment length: label string -> integer (missing values stay missing).
loans['emp_length'] = loans['emp_length'].apply(emp_length_to_int)
# Fold the 'NONE' and 'ANY' home-ownership levels into 'OTHER'. Assigned back
# to the column because replace(..., inplace=True) on a column selection is a
# chained in-place edit that does not update `loans` under pandas Copy-on-Write.
loans['home_ownership'] = loans['home_ownership'].replace(['NONE', 'ANY'], 'OTHER')
# log10(x + 1) transform of annual income, replacing the raw column.
loans['log_annual_inc'] = np.log10(loans['annual_inc'] + 1)
loans.drop('annual_inc', axis=1, inplace=True)
# Keep an independent copy of the issue date (used later to split the data)
# before removing it from the feature frame.
issue_d = loans['issue_d'].copy()
loans.drop(['issue_d', 'title', 'zip_code'], axis=1, inplace=True)
# The last four characters of 'earliest_cr_line' are parsed as an integer.
loans['earliest_cr_line'] = loans['earliest_cr_line'].apply(lambda s: int(s[-4:]))
# Single FICO feature: midpoint of the reported low/high range.
loans['fico_score'] = 0.5*loans['fico_range_low'] + 0.5*loans['fico_range_high']
loans.drop(['fico_range_high', 'fico_range_low'], axis=1, inplace=True)
# log10(x + 1) transform of the revolving balance, replacing the raw column.
loans['log_revol_bal'] = np.log10(loans['revol_bal'] + 1)
loans.drop('revol_bal', axis=1, inplace=True)
# convert sub grade to numeric: integer codes assigned in sorted label order
loans['sub_grade'] = loans['sub_grade'].factorize(sort=True)[0]

# One-hot encode the categorical columns.
categorical_variables = [
    'home_ownership', 'verification_status', 'purpose', 'addr_state',
    'term', 'emp_length', 'mort_acc', 'pub_rec_bankruptcies', 'pub_rec',
    'initial_list_status', 'application_type',
]
ordinal_variables = []

# Number of distinct non-NaN levels per categorical column, in list order.
ncat_of_cat_features = loans[categorical_variables].nunique().tolist()
# Indicator columns are named '<column>__<level>'.
loans = pd.get_dummies(data=loans, columns=categorical_variables, prefix_sep='__')
# Warning: some of the columns above contained NaN values.
# NOTE(review): get_dummies is called without dummy_na=True; pandas documents
# that NaNs are ignored in that case, so confirm how those rows end up encoded.

# Impute column means to fill missing values
imputer = SimpleImputer(strategy='mean', missing_values=np.nan)

imputed_values = imputer.fit_transform(loans)
# Rebuild a frame with the original column names (the row index is a fresh
# default index, not the index of `loans`).
imputed_loans = pd.DataFrame(data=imputed_values, columns=loans.columns)
# Optional sanity check, left disabled:
# print((np.where((imputed_loans.values != loans.values))[0] == np.where(loans.isnull())[0]).all())
processed_loans = imputed_loans

# Standardise the continuous columns with StandardScaler.
ss = StandardScaler()

# A column is treated as continuous unless every one of its values is 0 or 1.
continuous_variables = [
    name for name in processed_loans.columns
    if not processed_loans[name].isin([0, 1]).all()
]

loans_cont = processed_loans[continuous_variables]
scaled_values = ss.fit_transform(loans_cont)
# Write the scaled values back over the original continuous columns.
processed_loans[continuous_variables] = pd.DataFrame(
    scaled_values, columns=loans_cont.columns
)

# Drop rows with large outliers: keep a row only if every continuous feature
# has absolute (scaled) value below 10.
bool_mask = (processed_loans[continuous_variables].abs() < 10).all(axis=1)
# Put issue_d on a positional index first so the mask lines up with it, then
# re-number both objects after filtering so they stay aligned row for row.
issue_d = issue_d.reset_index(drop=True)[bool_mask].reset_index(drop=True)
processed_loans = processed_loans.loc[bool_mask].reset_index(drop=True)

# Chronological train/val/test split on issue date:
#   issue date <  85th percentile          -> train
#   85th percentile <= date < 90th         -> validation
#   issue date >= 90th percentile          -> test (the most recent loans)
issue_d = pd.to_datetime(issue_d)
# Compute each cut-off once instead of re-evaluating the quantile per mask.
val_start = issue_d.quantile(q=.85)
test_start = issue_d.quantile(q=.9)

train_split = (issue_d < val_start).reset_index(drop=True)
val_split = ((issue_d < test_start) & (issue_d >= val_start)).reset_index(drop=True)
test_split = (issue_d >= test_start).reset_index(drop=True)

loans_train = processed_loans.loc[train_split]
loans_val = processed_loans.loc[val_split]
loans_test = processed_loans.loc[test_split]


# Report, for each split, the fraction of rows with charged_off == 0.
split_reports = (
    ('class split in training set is: {}', loans_train),
    ('class split in val set is: {}', loans_val),
    ('class split in test set is: {}', loans_test),
)
for report_template, split_frame in split_reports:
    zero_fraction = (split_frame['charged_off'] == 0.).values.sum() / len(split_frame)
    print(report_template.format(zero_fraction))
                                                  
# re-balance validation set
zero_classv = loans_val[loans_val['charged_off'] == 0]
one_classv = loans_val[loans_val['charged_off'] == 1]
excessv = len(zero_classv) - len(one_classv)
# Down-sample whichever class is larger. Sampling only from class 1 with
# size=-excessv would raise in np.random.choice (negative size) whenever
# class 0 is the larger one; when class 1 is larger the draw is unchanged.
majority_classv = zero_classv if excessv > 0 else one_classv
loc_to_dropv = np.random.choice(majority_classv.index, size=abs(excessv), replace=False)
loans_val = loans_val[np.logical_not(loans_val.index.isin(loc_to_dropv))]
print('re-balance val set')
print('class split in val set is: {}'.format((loans_val['charged_off'] == 0.).values.sum() / len(loans_val)))

# Write each split to its own CSV (paths end in .csv.gz).
split_outputs = (
    (loans_train, '../../_datasets/LendingClub/accepted_2007_to_2018Q4_preprocessed_train.csv.gz'),
    (loans_val, '../../_datasets/LendingClub/accepted_2007_to_2018Q4_preprocessed_val.csv.gz'),
    (loans_test, '../../_datasets/LendingClub/accepted_2007_to_2018Q4_preprocessed_test.csv.gz'),
)
for split_frame, csv_path in split_outputs:
    split_frame.to_csv(csv_path)
# Disabled: export of the full, unsplit frame.
# loans.to_csv('../_datasets/LendingClub/accepted_2007_to_2018Q4_preprocessed.csv.gz')

# Column bookkeeping written next to the data: which columns were standardised
# as continuous and which source columns were one-hot encoded.
variable_list = {
    "continuous_variables": continuous_variables,
    "categorical_variables": categorical_variables,
    # Disabled entries:
    # "n_continuous_features": len(continuous_variables),
    # "ncat_of_cat_features": ncat_of_cat_features,
    # "n_ordinal_features": len(ordinal_variables)
}

# The dict is stored as its plain str() representation.
serialized_variables = str(variable_list)
with open("../../_datasets/LendingClub/variable_type_lengths.txt", "w") as output:
    output.write(serialized_variables)
