import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from scipy.stats import multivariate_normal
from sklearn.metrics import zero_one_loss
from sklearn.base import clone


def compute_zero_one_loss(true_labels, predicted_labels):
    """Return the zero-one loss between true and predicted labels.

    Thin wrapper that forwards both arguments to
    ``sklearn.metrics.zero_one_loss`` using that function's default
    settings and returns its result unchanged.

    Args:
        true_labels: Ground-truth labels, passed as ``y_true``.
        predicted_labels: Predicted labels, passed as ``y_pred``.

    Returns:
        Whatever ``zero_one_loss`` returns for these inputs.
    """
    loss = zero_one_loss(y_true=true_labels, y_pred=predicted_labels)
    return loss

# Dataset identifiers; each is used to build a CSV path of the form
# '../datasets/<identifier>.csv'. Order must match `names`.
files = [
    'car_evaluation',
    'monks2',
    'monks1',
    'monks3',
    'bar7',
    'compas',
    'fico',
    'bcw_bin',
    'restaurant_20',
    'bar',
    'coffee_house',
]

# Human-readable legend labels, one per dataset, in the same order as `files`.
names = [
    'Car Eval',
    'Monks 2',
    'Monks 1',
    'Monks 3',
    'Bar-7',
    'Compas',
    'FICO',
    'BCW',
    'Restaurant',
    'Bar',
    'Coffee',
]
 
from sklearn.tree import DecisionTreeClassifier
import pandas as pd
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif
from sklearn.metrics import roc_auc_score

# Experiment: for every dataset, fit one decision tree on the clean data, then
# randomly flip the three most label-predictive features with probability
# `delta` and record the variance of the per-sample correctness indicator of
# the tree's predictions on the perturbed data. One curve (mean +/- std over
# repeated noise draws) is plotted per dataset and saved to "b_random.png".
plt.figure()

# Per-feature flip probabilities swept along the x-axis.
delta_values = np.linspace(0, 0.2, 100)

# Settings shared by every dataset; defined once instead of per iteration.
depth = 5  # Chosen tree depth
min_samples_leaf = 10  # Chosen min samples in each leaf
n_top_features = 3  # Number of highest-AUC features that receive noise
n_repeats = 10  # Independent noise draws per noise level

# NOTE: np.random.rand draws from NumPy's global RNG and no seed is set in the
# visible script, so the resulting curves differ from run to run.
for file_id, file in enumerate(files):
    csv_path = '../datasets/' + file + '.csv'
    df = pd.read_csv(csv_path)
    # Extract features and labels: the last column is the label, every other
    # column is a feature.
    X = df.iloc[:, :-1].values
    y = df.iloc[:, -1].values

    # Fit a single tree on the full, clean dataset. There is no train/test
    # split or cross-validation: the same samples are perturbed and scored
    # further down.
    model = DecisionTreeClassifier(max_depth=depth, min_samples_leaf=min_samples_leaf)
    model.fit(X, y)

    # Score each feature by using its raw values as a ranking score for the
    # label. An AUC below 0.5 means the feature is predictive with the
    # opposite sign, so it is mirrored around 0.5.
    auc_scores = []
    for feature_idx in range(X.shape[1]):
        auc = roc_auc_score(y, X[:, feature_idx])
        auc_scores.append((feature_idx, max(auc, 1 - auc)))
    # Sort the AUC scores in descending order (stable, so ties keep their
    # original column order).
    sorted_auc_scores = sorted(auc_scores, key=lambda x: x[1], reverse=True)
    # Indices of the most predictive features; only these columns get noise.
    features_to_adjust = [idx for idx, _ in sorted_auc_scores[:n_top_features]]

    to_plot = []

    for delta in delta_values:

        variance_array = []
        for _ in range(n_repeats):
            X_noisy = X.copy()
            for k in features_to_adjust:
                # Each selected feature gets its own independent flip mask.
                flip_indices = np.random.rand(X.shape[0]) < delta
                # XOR with the boolean mask toggles the flagged entries.
                # Assumes the features are 0/1 integers -- TODO confirm; a
                # float-typed X would make the bitwise XOR raise TypeError.
                X_noisy[:, k] = X_noisy[:, k] ^ flip_indices

            predicted_labels = model.predict(X_noisy)

            # Variance of the 0/1 correctness indicator; identical to the
            # variance of the 0/1 loss because Var(1 - Z) == Var(Z).
            variance = np.var((y == predicted_labels).astype(int))
            variance_array.append(variance)

        to_plot.append((np.mean(variance_array), np.std(variance_array)))

    # Convert once: column 0 holds the means, column 1 the standard deviations.
    curve_stats = np.array(to_plot)
    to_plot_mean = curve_stats[:, 0]
    to_plot_std = curve_stats[:, 1]

    plt.plot(delta_values, to_plot_mean, label=names[file_id])
    plt.fill_between(delta_values, to_plot_mean-to_plot_std, to_plot_mean+to_plot_std, alpha = 0.1)

# Axis labels are the same for every curve, so they are set once here.
plt.xlabel(r'Uniform attribute noise, $\rho_a$', fontsize = 20)
plt.ylabel('Variance of the loss', fontsize = 20)

# The legend is anchored outside the axes, to the right of the plot area.
plt.legend(loc = (1.05, -0.15), fontsize = 16)
plt.xticks(fontsize=16)
plt.yticks(fontsize=16)
plt.locator_params(axis='x', nbins=5)
# bbox_inches='tight' keeps the out-of-axes legend inside the saved image.
plt.savefig("b_random.png", bbox_inches='tight', dpi = 200)