
import openml
import numpy as np
import matplotlib.pyplot as plt
from sklearn.experimental import enable_iterative_imputer 
from sklearn.impute import SimpleImputer, KNNImputer, IterativeImputer
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from utilities import currupt_data
from architectures import NNClasifier, NNRegressor
from numpy.random import seed
from tensorflow.random import set_seed
import pickle
from sklearn.feature_selection import SelectKBest, mutual_info_classif, mutual_info_regression
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingClassifier, HistGradientBoostingRegressor
from sklearn.compose import TransformedTargetRegressor


############################    CONFIGS    ################################


problem = 'classification'  # Either 'classification' or 'regression'
save_path = '/home/'  # Prefix for result files; joined by plain string concatenation, so keep the trailing '/'
runs = 20  # Number of experiment repetitions
fold_num = 2  # Number of folds in K-fold CV
missing_rate = [0.1, 0.25, 0.50, 0.9]  # Values passed to the corruption routine as `missing_rate`
conditions = ['MCAR', 'MAR', 'MNAR']  # Values passed to the corruption routine as `method`


def problem_settings(kind):
    """Return ``(dataset_ids, scoring)`` for the requested problem type.

    Parameters
    ----------
    kind : str
        ``'classification'`` or ``'regression'``.

    Returns
    -------
    tuple
        A list of OpenML dataset ids and the scikit-learn scoring string
        used for cross-validation.

    Raises
    ------
    ValueError
        If ``kind`` is not one of the two supported problem types. Failing
        here avoids a confusing ``NameError`` on ``dataset_ids`` later on.
    """
    if kind == 'classification':
        ids = [737, 40983, 728, 1489, 803, 923, 725, 42192, 1558, 310,
               1046, 847, 40701, 41146, 1496, 1507, 823, 42493, 1120,
               4135, 137, 251, 1220, 151, 901, 881, 40922, 42477]
        return ids, 'roc_auc'
    if kind == 'regression':
        ids = [42675, 198, 23515, 189, 42636, 42688, 42183, 1199,
               197, 218, 1193, 216, 215, 23395, 42225, 1200, 1213, 42669]
        return ids, 'neg_mean_squared_error'
    raise ValueError(
        "problem must be 'classification' or 'regression', got %r" % (kind,))


# OpenML dataset ids and CV scoring metric for the selected problem type
dataset_ids, scoring = problem_settings(problem)
    

##############################################################################


def _is_classification():
    """Return True for classification, False for regression.

    Raises
    ------
    ValueError
        If the module-level ``problem`` setting is neither value.
    """
    if problem == 'classification':
        return True
    if problem == 'regression':
        return False
    raise ValueError(
        "problem must be 'classification' or 'regression', got %r" % (problem,))


def _reseed(seed_value):
    """Reset both the NumPy and the TensorFlow global RNGs to ``seed_value``."""
    seed(seed_value)
    set_seed(seed_value)


def _hgb_model(data):
    """Build the histogram gradient boosting model for the current problem.

    ``data`` is unused; it is accepted so all model factories share one
    signature.
    """
    if _is_classification():
        return HistGradientBoostingClassifier()
    return TransformedTargetRegressor(regressor=HistGradientBoostingRegressor(),
                                      transformer=StandardScaler())


def _nn_model(data, classifier_kwargs, regressor_kwargs):
    """Build the neural network for the current problem.

    The regressor is wrapped so that the target is standardised during
    fitting. The keyword dictionaries are forwarded unchanged to
    ``NNClasifier`` / ``NNRegressor`` respectively.
    """
    if _is_classification():
        return NNClasifier(data, **classifier_kwargs)
    return TransformedTargetRegressor(regressor=NNRegressor(data, **regressor_kwargs),
                                      transformer=StandardScaler())


def _nn_plain(data):
    """Network used on complete and on imputed data.

    NOTE(review): only the classifier receives ``use_nanDense=False``; the
    regressor is built with its defaults. This asymmetry is kept as-is --
    confirm against ``architectures`` whether it is intended.
    """
    return _nn_model(data, {'use_nanDense': False}, {})


def _nn_promissing(data):
    """Network with default arguments (stored as ``NN_promise_woc``)."""
    return _nn_model(data, {}, {})


def _nn_mpromissing(data):
    """Network built with ``use_c=True`` (stored as ``NN_promise_wc``)."""
    return _nn_model(data, {'use_c': True}, {'use_c': True})


def _zero_imputer():
    """Imputer replacing NaNs with the constant 0."""
    return SimpleImputer(missing_values=np.nan, strategy='constant',
                         fill_value=0)


def _mean_imputer():
    """Imputer replacing NaNs with the per-feature mean."""
    return SimpleImputer(missing_values=np.nan, strategy='mean')


def _knn_imputer():
    """K-nearest-neighbours imputer with default settings."""
    return KNNImputer(missing_values=np.nan)


def _mice_imputer():
    """Iterative imputer that samples from the posterior."""
    return IterativeImputer(missing_values=np.nan, sample_posterior=True)


def _cv_score(seed_value, data, target, model_factory, imputer_factory=None):
    """Cross-validate one ``[imputer ->] scaler -> model`` pipeline.

    Parameters
    ----------
    seed_value : int
        Seed applied to NumPy and TensorFlow before anything is built, so
        every method starts from the same RNG state.
    data, target
        Feature matrix and labels handed to ``cross_val_score``.
    model_factory : callable
        Called as ``model_factory(data)``; returns the final estimator.
    imputer_factory : callable or None
        Called without arguments; when given, its result becomes the first
        pipeline step.

    Returns
    -------
    float
        Mean CV score for classification. For regression, the mean squared
        error divided by the variance of ``target``.
    """
    _reseed(seed_value)
    steps = []
    if imputer_factory is not None:
        steps.append(imputer_factory())
    steps.append(StandardScaler())
    steps.append(model_factory(data))
    estimator = make_pipeline(*steps)
    scores = cross_val_score(estimator, data, target, scoring=scoring,
                             cv=fold_num)
    if _is_classification():
        return scores.mean()
    # scoring is a negated MSE; flip the sign and normalise by target variance
    return -scores.mean() / np.var(target)


for missing_values in conditions:

    ################################ PLACE HOLDERS ###############################

    per_dataset_shape = [runs, len(dataset_ids)]
    per_rate_shape = [runs, len(dataset_ids), len(missing_rate)]

    RF_full = np.zeros(per_dataset_shape)
    NN_full = np.zeros(per_dataset_shape)
    NN_0_impute = np.zeros(per_rate_shape)
    NN_mean_impute = np.zeros(per_rate_shape)
    NN_knn_impute = np.zeros(per_rate_shape)
    NN_ite_impute = np.zeros(per_rate_shape)
    HGBC_clf = np.zeros(per_rate_shape)
    NN_promise_woc = np.zeros(per_rate_shape)
    NN_promise_wc = np.zeros(per_rate_shape)
    s = np.zeros([runs], dtype=np.int32)

    # (result array, model factory, imputer factory) for every method that is
    # evaluated on the corrupted data, in evaluation order.
    corrupted_evaluations = (
        (NN_0_impute, _nn_plain, _zero_imputer),
        (NN_mean_impute, _nn_plain, _mean_imputer),
        (NN_knn_impute, _nn_plain, _knn_imputer),
        (NN_ite_impute, _nn_plain, _mice_imputer),
        (HGBC_clf, _hgb_model, None),
        (NN_promise_woc, _nn_promissing, None),
        (NN_promise_wc, _nn_mpromissing, None),
    )

    ############################## EXPERIMENTAL RUNS #############################

    for r in range(runs):

        # One seed per run, stored so it is saved alongside the results
        s[r] = np.random.randint(100000)

        for d, dataset_id in enumerate(dataset_ids):

            dataset = openml.datasets.get_dataset(dataset_id)
            X, y, _, _ = dataset.get_data(dataset_format="array",
                                          target=dataset.default_target_attribute)

            # Reference scores on the uncorrupted data
            RF_full[r, d] = _cv_score(s[r], X, y, _hgb_model)
            NN_full[r, d] = _cv_score(s[r], X, y, _nn_plain)

            # Rank features by mutual information with the target; the two
            # highest-scoring ones are handed to the corruption routine.
            if _is_classification():
                fs = SelectKBest(score_func=mutual_info_classif, k=2)
            else:
                fs = SelectKBest(score_func=mutual_info_regression, k=2)
            fs.fit(X, y)
            ranking = np.argsort(fs.scores_)
            feature_id = ranking[-1]
            feature_id2 = ranking[-2]

            for j, mr in enumerate(missing_rate):

                # Seed NumPy only, so the corruption is reproducible per run
                seed(s[r])
                Xc = currupt_data(X, feature_id=feature_id, feature_id2=feature_id2,
                                  method=missing_values, missing_rate=mr)

                for scores, model_factory, imputer_factory in corrupted_evaluations:
                    scores[r, d, j] = _cv_score(s[r], Xc, y, model_factory,
                                                imputer_factory)

                print('%s: Run=%d, Dataset=%d, mr=%.2f, RF:%.2f, NN:%.2f, '
                      'ZI:%.2f, MI:%.2f, KNN:%.2f, MICE:%.2f, HGBC:%.2f, '
                      'PRM:%.2f, MPRM:%.2f'
                      % (missing_values, r, dataset_id, mr, RF_full[r, d],
                         NN_full[r, d], NN_0_impute[r, d, j],
                         NN_mean_impute[r, d, j], NN_knn_impute[r, d, j],
                         NN_ite_impute[r, d, j], HGBC_clf[r, d, j],
                         NN_promise_woc[r, d, j], NN_promise_wc[r, d, j]))

        # Checkpoint: the file is rewritten after every run, so rows that
        # belong to runs not yet executed still hold zeros.
        with open(save_path + 'Experiment2_'+ problem + missing_values + '_results.pkl','wb') as file:
            pickle.dump({'RF_full':RF_full, 'NN_full':NN_full, 'NN_0_impute':NN_0_impute,
                         'NN_mean_impute':NN_mean_impute, 'NN_knn_impute':NN_knn_impute,
                         'NN_ite_impute':NN_ite_impute, 'NN_promise_woc':NN_promise_woc,
                         'NN_promise_wc':NN_promise_wc,
                         'HGBC_clf':HGBC_clf, 'seed':s}, file)


############################# PLOTTING RESULTS  ###############################


def _mean_degradation(baseline, scores):
    """Summarise how much a method degrades relative to its baseline.

    Parameters
    ----------
    baseline : ndarray, indexed as ``[run, dataset]``
        Score obtained on the complete data.
    scores : ndarray, indexed as ``[run, dataset, missing rate]``
        Score obtained on the corrupted data.

    Returns
    -------
    ndarray
        One value per missing rate: ``baseline - scores`` (sign flipped for
        regression), negative values clipped to 0, then the median over
        datasets followed by the mean over runs.
    """
    # Broadcast the per-(run, dataset) baseline across the missing-rate axis
    change = baseline[:, :, np.newaxis] - scores
    if problem == 'regression':
        change = -change
    change[change < 0] = 0
    return np.mean(np.median(change, axis=1), axis=0)


# Set to True to also load and plot the separately stored GAIN results
with_gain = False

results = dict()

for m in conditions:
    # NOTE: pickle.load can execute arbitrary code; only load result files
    # that were produced by this script.
    with open(save_path + 'Experiment2_'+ problem + m + '_results.pkl','rb') as file:
        full_results = pickle.load(file)

    RF_full = full_results.pop('RF_full')
    NN_full = full_results.pop('NN_full')
    full_results.pop('seed')

    # Gradient boosting is compared with its own baseline; every other
    # method is compared with the neural network baseline.
    for key in full_results:
        baseline = RF_full if key == 'HGBC_clf' else NN_full
        full_results[key] = _mean_degradation(baseline, full_results[key])

    if with_gain:
        with open(save_path + 'Experiment2_GAIN_'+ problem + m + '_results.pkl','rb') as file:
            full_results_gain = pickle.load(file)
        NN_full_gain = full_results_gain.pop('NN_full')
        full_results['NN_gain_impute'] = _mean_degradation(
            NN_full_gain, full_results_gain['NN_gain_impute'])

    results[m] = full_results


##############################################################################


# Legend labels and the matching result keys, in plotting order
if with_gain:
    names = ['Zero', 'Mean', 'KNN', 'MICE', 'GAIN', 'HGB', 'PROMISSING', 'mPROMISSING']
    methods = ['NN_0_impute', 'NN_mean_impute', 'NN_knn_impute', 'NN_ite_impute',
               'NN_gain_impute', 'HGBC_clf', 'NN_promise_woc', 'NN_promise_wc']
else:
    names = ['Zero', 'Mean', 'KNN', 'MICE', 'HGB', 'PROMISSING', 'mPROMISSING']
    methods = ['NN_0_impute', 'NN_mean_impute', 'NN_knn_impute', 'NN_ite_impute',
               'HGBC_clf', 'NN_promise_woc', 'NN_promise_wc']

# One panel per missingness condition. squeeze=False keeps the result an
# array even for a single panel; ravel() gives the flat row of axes.
fig, ax = plt.subplots(1, len(results), dpi=150, squeeze=False)
ax = ax.ravel()

for i, (condition, panel) in enumerate(zip(results, ax)):
    for method, label in zip(methods, names):
        panel.plot(missing_rate, results[condition][method], label=label)
    panel.set_title('%s' %(str(condition)), fontsize=10)
    panel.grid(linestyle='--', linewidth=0.5)

    # Only the left-most panel carries the y-axis label
    if i == 0:
        if problem == 'regression':
            panel.set_ylabel('SMSE Increase')
        else:
            panel.set_ylabel('AUC Drop')

    panel.set_xlabel('Sample Missing Rate', fontsize=8)
    panel.set_xticks(missing_rate)
    for spine in panel.spines.values():
        spine.set_visible(False)

# The legend is drawn on the last panel only
ax[-1].legend(fontsize=5.75)

fig.savefig(save_path + 'Experiment2_'+ problem +'_results.png', dpi=300)


##############################################################################
