import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from xgboost import XGBClassifier, XGBRegressor
import category_encoders as ce
from sklearn.preprocessing import MinMaxScaler
from datetime import datetime
import random
from sklearn.svm import SVC, SVR
from sklearn.metrics import f1_score
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from ucimlrepo import fetch_ucirepo

def find_best_threshold(scores, labels):
    """Return the score value that maximises F1 when used as a decision threshold.

    Every distinct value in ``scores`` is tried as a threshold: samples whose
    score is ``>=`` the threshold are predicted as 1, the rest as 0, and the
    resulting predictions are compared to ``labels`` with ``f1_score``.

    Parameters
    ----------
    scores : 1-D array-like of numbers
        Scores assigned to each sample (lists are accepted).
    labels : array-like
        Ground-truth binary labels, passed unchanged to ``f1_score``.

    Returns
    -------
    The threshold with the highest F1. If no threshold yields an F1 strictly
    greater than 0 (including when ``scores`` is empty), 0 is returned.
    """
    # Coerce once so that plain Python lists support the vectorised `>=` below.
    scores = np.asarray(scores)

    # Each distinct score only needs to be evaluated once. Candidates are kept
    # in order of first appearance so that ties on F1 resolve exactly as they
    # would when iterating over the raw scores.
    _, first_seen = np.unique(scores, return_index=True)
    candidates = scores[np.sort(first_seen)]

    best_threshold = 0
    best_f1 = 0
    for threshold in candidates:
        preds = np.where(scores >= threshold, 1, 0)
        f1 = f1_score(labels, preds)
        if f1 > best_f1:
            best_f1 = f1
            best_threshold = threshold
    return best_threshold

class OHEWrapper:
    """One-hot encode, min-max scale, then fit/predict with an SVM.

    The wrapper keeps a ``category_encoders.OneHotEncoder`` and a
    ``MinMaxScaler`` fitted on the training data and applies both before
    delegating to the underlying model.

    Parameters
    ----------
    categorical : list
        Columns handed to the one-hot encoder (``cols=``).
    task : str
        ``'classification'`` builds an ``SVC`` and makes :meth:`predict`
        return ``predict_proba`` output; any other value builds an ``SVR``
        and makes :meth:`predict` return ``predict`` output.
    seed : int
        Random state for the classifier (not used by ``SVR``).
    model : str
        Model identifier; must contain ``'svm'``.
    n_estimators, max_depth :
        Accepted for interface compatibility; not used by this class.
    kernel : str
        SVM kernel name.

    Raises
    ------
    ValueError
        If ``model`` does not contain ``'svm'``.
    """

    def __init__(self, categorical, task, seed, model, n_estimators=100, max_depth=None, kernel='linear'):
        # Validate first so that an unsupported model fails loudly here rather
        # than later with an unrelated AttributeError on ``self.model``.
        if 'svm' not in model:
            raise ValueError("Model not recognized")

        self.encoder = ce.OneHotEncoder(use_cat_names=True, cols=categorical)
        self.scaler = MinMaxScaler()
        self.task = task
        if task == 'classification':
            self.model = SVC(probability=True, kernel=kernel, random_state=seed, max_iter=10000)
            self.function = self.model.predict_proba
        else:
            self.model = SVR(kernel=kernel, max_iter=10000)
            self.function = self.model.predict

    def _encode_and_scale(self, X):
        """Apply the fitted encoder and scaler to ``X`` and return a DataFrame."""
        # Rebuild a DataFrame with the training column names so that raw
        # arrays can be passed in as well.
        X = pd.DataFrame(X, columns=self.columns)
        Xte = self.encoder.transform(X)
        return pd.DataFrame(self.scaler.transform(Xte), columns=Xte.columns, index=Xte.index)

    def fit(self, X, y):
        """Fit the encoder, the scaler and the model on ``X`` (DataFrame) and ``y``."""
        self.columns = X.columns
        Xtr = self.encoder.fit_transform(X)
        Xtr = pd.DataFrame(self.scaler.fit_transform(Xtr), columns=Xtr.columns, index=Xtr.index)
        self.model.fit(Xtr, y)

    def predict(self, X):
        """Return the model output for ``X``.

        For the classification task this is the ``predict_proba`` output;
        otherwise it is the ``predict`` output.
        """
        return self.function(self._encode_and_scale(X))

    def predict_estimators(self, X):
        """Return per-estimator predictions as an array of shape (n_samples, n_estimators).

        Only meaningful when the underlying model exposes ``estimators_``.

        Raises
        ------
        AttributeError
            If the underlying model has no ``estimators_`` attribute.
        """
        estimators = getattr(self.model, "estimators_", None)
        if estimators is None:
            raise AttributeError(
                f"{type(self.model).__name__} does not expose per-estimator predictions"
            )
        Xte = self._encode_and_scale(X)
        # Size the output from the fitted estimators rather than a fixed count.
        predictions = np.zeros((Xte.shape[0], len(estimators)))
        for idx, estimator in enumerate(estimators):
            predictions[:, idx] = estimator.predict(Xte)
        return predictions
    

    
def compute_distance_mixed_dataset(x1, x2, used_mixed = False):
    """Return the distance between two 20-feature instances.

    Parameters
    ----------
    x1, x2 : numpy.ndarray
        Instances with (at least) 20 values; they are reshaped to a single row.
    used_mixed : bool
        When True, the plain Euclidean norm of ``x1 - x2`` over all features
        is returned. When False, the squared differences of the numerical
        features are added to the number of mismatching categorical features
        and the square root of that total is returned.

    Returns
    -------
    numpy floating scalar
        The computed distance.
    """
    # Positional indices of categorical and numerical features.
    # NOTE(review): these are hard-coded for one specific 20-column layout;
    # confirm they match the dataset in use.
    cat = [0, 2, 3, 5, 6, 8, 9, 11, 13, 14, 16, 18, 19]
    num = [1, 4, 7, 10, 12, 15, 17]

    x1 = x1.reshape((1, x1.size))
    x2 = x2.reshape((1, x2.size))

    if used_mixed:
        # Norm of the difference; the second positional argument of
        # np.linalg.norm is `ord`, not a second vector.
        return np.linalg.norm(x1 - x2)

    squared_numeric = np.sum((x1[:, num] - x2[:, num]) ** 2)
    mismatches = np.sum(x1[:, cat] != x2[:, cat])
    return np.sqrt(squared_numeric + mismatches)
    
    
def add_noise(x, X_train, distances, df_info, weights, rng, nsamples=1000):
    """Generate ``nsamples`` perturbed copies of the instance ``x``.

    This function is inspired by the instance-generation process in
    'Buciluǎ, Cristian, Rich Caruana, and Alexandru Niculescu-Mizil. "Model compression."
    Proceedings of the 12th ACM SIGKDD international conference on Knowledge discovery and data mining. 2006.'

    For every copy and every feature whose weight is non-zero, a training row
    is drawn with probability proportional to the inverse of its distance.
    Numerical features receive Gaussian noise whose standard deviation is the
    absolute difference between ``x`` and the drawn row; every other feature
    is replaced by the drawn row's value.

    A possible improvement is to distinguish categorical from ordinal features.

    Parameters
    ----------
    x : pandas.Series
        Instance to perturb, indexable by the column names of ``X_train``.
    X_train : pandas.DataFrame
        Reference data from which neighbours are drawn.
    distances : array-like
        One distance per row of ``X_train``.
    df_info : dict
        Must contain the key ``"numerical"`` listing numerical column names.
    weights : numpy.ndarray
        One weight per column; columns with weight 0 are left untouched.
    rng : numpy.random.Generator
        Source of randomness.
    nsamples : int
        Number of perturbed copies to generate.

    Returns
    -------
    pandas.DataFrame
        ``nsamples`` rows with the columns of ``X_train``.
    """
    samples = pd.DataFrame(
        np.repeat(x.values.reshape(1, -1), nsamples, axis=0),
        columns=X_train.columns,
    ).reset_index(drop=True)

    # None of the following depends on the sample or the feature, so compute
    # it once instead of once per (sample, feature) pair.
    features_to_perturb = X_train.columns[np.where(weights != 0)[0]]
    numerical = df_info["numerical"]
    inverse_distances = 1 / distances
    probabilities = np.nan_to_num(inverse_distances / np.sum(inverse_distances), nan=0)
    probabilities /= np.sum(probabilities)  # Ensure the sum is 1

    for i in samples.index:
        for f in features_to_perturb:
            # A fresh neighbour is drawn for every (sample, feature) pair.
            idx_s = rng.choice(X_train.index, p=probabilities)
            if f in numerical:
                spread = np.abs(x[f] - X_train.loc[idx_s, f])
                samples.loc[i, f] += rng.normal(loc=0, scale=spread)
            else:
                samples.loc[i, f] = X_train.loc[idx_s, f]
    return samples

def compute_matrix_distances(X, Xtrain, df_info):
    """Return the matrix of mixed distances between rows of ``X`` and ``Xtrain``.

    The distance between two rows is the Euclidean norm of the difference of
    their numerical features plus the number of categorical features on which
    they differ. Missing values are replaced by -1 beforehand. A distance of
    exactly 0 is replaced by 10000.

    Parameters
    ----------
    X : pandas.DataFrame
        Query instances.
    Xtrain : pandas.DataFrame
        Reference instances.
    df_info : dict
        Must contain ``"categorical"`` and ``"numerical"`` lists of column names.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(X), len(Xtrain))``.
    """
    Xte = X.fillna(-1)
    Xtr = Xtrain.fillna(-1)

    cat = list(df_info["categorical"])
    num = list(df_info["numerical"])

    # Pull the two feature groups out as plain arrays once, so the inner loop
    # over training rows runs inside NumPy rather than in Python.
    te_num = Xte[num].to_numpy(dtype=float)
    tr_num = Xtr[num].to_numpy(dtype=float)
    te_cat = Xte[cat].to_numpy()
    tr_cat = Xtr[cat].to_numpy()

    distances = np.zeros((X.shape[0], Xtrain.shape[0]))
    for i in range(te_num.shape[0]):
        euclidean = np.linalg.norm(tr_num - te_num[i], axis=1)
        hamming = np.sum(tr_cat != te_cat[i], axis=1)
        total = euclidean + hamming
        # Exact matches get the sentinel 10000 instead of 0.
        distances[i] = np.where(total != 0, total, 10000)
    return distances


def load_data(dataset, seed):
    """Load a benchmark dataset and split it into train and test parts.

    Most datasets are fetched with ``fetch_ucirepo``; ``'compas'`` is read
    from ``./compas/compas.csv``. Duplicated rows and rows with missing values
    are dropped. The training split always contains 1000 rows; the remaining
    rows form the test split. Classification datasets are split with
    stratification on the labels, and regression targets are min-max scaled
    to ``[0, 1]``.

    Parameters
    ----------
    dataset : str
        One of ``'adult'``, ``'churn'``, ``'wine'``, ``'parkinson'``,
        ``'bike'``, ``'compas'``, ``'power'``, ``'creditcard'``.
    seed : int
        Random state for ``train_test_split``.

    Returns
    -------
    tuple
        ``(Xtrain, Xtest, ytrain, ytest, cat_names, categorical, task)`` where
        ``categorical`` holds the positional indices of the categorical
        columns, ``cat_names`` the matching column labels, and ``task`` is
        ``'regression'`` or ``'classification'``.

    Raises
    ------
    ValueError
        If ``dataset`` is not one of the supported names.
    """
    if dataset == 'adult':
        # classification
        adult = fetch_ucirepo(id=2)
        df = adult.data.features.copy()
        df.drop(['fnlwgt', 'education'], axis=1, inplace=True)
        # The raw labels come in two spellings (with and without a trailing dot).
        df['labels'] = adult.data.targets.replace({'<=50K': 0, '<=50K.':0, '>50K':1, '>50K.': 1}).astype(int).values.flatten()
        df['sex'] = df['sex'].replace({'Male': 0, 'Female': 1}).astype(int)
        df.drop_duplicates(inplace=True)
        df.dropna(inplace=True)
        categorical = [1,3,4,5,6,11]
        X = df.iloc[:, :-1]
        y = df['labels'].values
        # 1000 stratified training rows; the rest is the test set
        Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, train_size=1000, random_state=seed, stratify=y)
    elif dataset == 'churn':
        # classification
        churn = fetch_ucirepo(id=563)
        df = churn.data.features.copy()
        df['labels'] = churn.data.targets
        df.drop_duplicates(inplace=True)
        df.dropna(inplace=True)
        categorical = []
        X = df.iloc[:, :-1]
        y = df['labels'].values
        # 1000 stratified training rows; the rest is the test set
        Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, train_size=1000, random_state=seed, stratify=y)
    elif dataset == 'wine':
        # regression
        wine = fetch_ucirepo(id=186)
        df = wine.data.features.copy()
        df['labels'] = wine.data.targets.values
        df.drop_duplicates(inplace=True)
        df.dropna(inplace=True)
        X = df.iloc[:, :-1]
        y = df['labels'].values
        y = (y - y.min())/(y.max() - y.min())
        categorical = []
        Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, train_size=1000, random_state=seed)
    elif dataset == 'parkinson':
        # regression
        parkinson = fetch_ucirepo(id=189)
        df = parkinson.data.features.copy()
        df['labels'] = parkinson.data.targets["total_UPDRS"].values
        df.drop_duplicates(inplace=True)
        df.dropna(inplace=True)
        X = df.iloc[:, :-1]
        y = df['labels'].values
        y = (y - y.min())/(y.max() - y.min())
        categorical = []
        Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, train_size=1000, random_state=seed)
    elif dataset == 'bike':
        # regression
        bike = fetch_ucirepo(id=275)
        df = bike.data.features.copy()
        df.drop(columns=['season','dteday','hr'], inplace=True)
        df['labels'] = bike.data.targets.values
        df.drop_duplicates(inplace=True)
        df.dropna(inplace=True)
        X = df.iloc[:, :-1]
        y = df['labels'].values
        y = (y - y.min())/(y.max() - y.min())
        categorical = [1,3,5]
        Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, train_size=1000, random_state=seed)
    elif dataset == 'compas':
        # classification, read from a local CSV
        df = pd.read_csv(f'./{dataset}/{dataset}.csv')
        df.drop(['age_cat', 'score_text', 'v_type_of_assessment', 'v_decile_score', 'v_score_text'], axis=1, inplace=True)
        df.drop_duplicates(inplace=True)
        df.dropna(inplace=True)
        X = df.iloc[:, :-1]
        # NOTE(review): the last column is shifted by one; presumably to remap
        # the label range — confirm against the CSV.
        y = df.iloc[:, -1].values + 1
        Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, train_size=1000, random_state=seed, stratify=y)
        categorical = [2,9,10]
    elif dataset == 'power':
        # regression
        power = fetch_ucirepo(id=849)
        df = power.data.features.copy()
        df.drop(columns=['DateTime'], inplace=True)
        df['labels'] = power.data.targets['Zone 1 Power Consumption'].values
        df.drop_duplicates(inplace=True)
        df.dropna(inplace=True)
        X = df.iloc[:, :-1]
        y = df['labels'].values
        categorical = []
        y = (y - y.min())/(y.max() - y.min())
        Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, train_size=1000, random_state=seed)
    elif dataset == 'creditcard':
        # classification
        creditcard = fetch_ucirepo(id=350)
        df = creditcard.data.features.copy()
        df['labels'] = creditcard.data.targets.values
        df.drop_duplicates(inplace=True)
        df.dropna(inplace=True)
        X = df.iloc[:, :-1]
        y = df['labels'].values
        categorical = []
        Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, train_size=1000, random_state=seed, stratify=y)
    else:
        raise ValueError(f"Unknown dataset: {dataset}")

    # Every dataset whose target was min-max scaled above is a regression
    # problem; everything else is classification.
    if dataset in ('wine', 'parkinson', 'bike', 'power'):
        task = 'regression'
    else:
        task = 'classification'

    cat_names = X.columns[categorical]
    return Xtrain, Xtest, ytrain, ytest, cat_names, categorical, task

def cosine_similarity(x, y):
    """Return the cosine similarity of ``x`` and ``y``, or 0 if either has zero norm."""
    norm_product = np.linalg.norm(x) * np.linalg.norm(y)
    if norm_product == 0:
        return 0
    return np.sum(x * y) / norm_product