import numpy as np
import multiprocessing as mp

from sklearn.model_selection import KFold, StratifiedKFold, train_test_split
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC,SVR
from sklearn.multiclass import OneVsRestClassifier
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.preprocessing import MinMaxScaler

import copy
import time
from persim import sliced_wasserstein as SW
from persim import PersistenceImager
from persim import PersLandscapeApprox

from persistence_spheres import PerSPhere
from persistence_spheres_utils import make_weighting, persistence, from_DGMS_to_H

from pers_splines import make_PersSplines_vec

from pipelines_utils import *


class SpheresPipeline(object):
    """Model selection and evaluation on ``PerSPhere`` features of diagrams.

    The constructor draws a random train/test split, prepares the
    cross-validation folds on the training part and instantiates one
    estimator per requested model and regularisation value.
    ``run_analysis`` grid-searches ``(n_theta, alpha, k, model)`` through
    ``make_CV_fold`` and scores the winning configuration on the test part.
    """

    # Model names for which the CV folds are stratified on the labels.
    _CLASSIFIERS = ('logistic_regression', 'SVC', 'class_tree')

    def __init__(self, DGMS, Y, params,
                 MODELS = ['logistic_regression'], train_split = 0.8, n_splits = 5):
        """Split the data, build the CV folds and the candidate estimators.

        Parameters
        ----------
        DGMS : sequence
            One entry per sample (presumably persistence diagrams; they are
            only forwarded to ``make_CV_fold`` and ``PerSPhere`` here).
        Y : array-like
            Targets, one per entry of ``DGMS``.  Must support indexing with
            an integer array (e.g. a NumPy array).
        params : sequence
            ``params[0][:3]`` holds the grids ``(THETA, K, ALPHA)``;
            ``params[i][-1]`` holds the hyper-parameter grid of ``MODELS[i]``.
        MODELS : list of str
            Any of 'logistic_regression', 'linear_regression',
            'ridge_regression', 'SVC', 'SVR', 'class_tree', 'regr_tree'.
            Unknown names are ignored.  The first name decides whether the
            folds are stratified.
        train_split : float
            Forwarded to ``train_test_split`` as ``train_size``.
        n_splits : int
            Number of cross-validation folds.
        """
        self.DGMS = DGMS
        self.Y = Y

        self.train_split = train_split
        self.n_splits = n_splits

        stratified = MODELS[0] in self._CLASSIFIERS
        if stratified:
            self.kf_CV = StratifiedKFold(n_splits=self.n_splits)
        else:
            self.kf_CV = KFold(n_splits=self.n_splits)

        # Splitting train and test.
        self._split_train_test()

        # Preparing the CV for parameter selection.
        if stratified:
            self.CV_splits = list(self.kf_CV.split(self.DGMS_train, self.y_train))
        else:
            self.CV_splits = list(self.kf_CV.split(self.DGMS_train))

        self.THETA, self.K, self.ALPHA = params[0][:3]
        self.MODELS = self._build_models(MODELS, params)

    def _split_train_test(self):
        """Draw the random train/test split and store both parts in index order."""
        n_samples = len(self.DGMS)
        indexes = np.arange(n_samples, dtype=int)
        train_idxs, test_idxs, self.y_train, self.y_test = train_test_split(
            indexes, self.Y, train_size=self.train_split)

        # The diagrams are selected below in increasing index order, so the
        # labels are reordered the same way to stay aligned with them.
        self.y_train = self.y_train[np.argsort(train_idxs)]
        train_idxs = np.sort(train_idxs)

        self.y_test = self.y_test[np.argsort(test_idxs)]
        test_idxs = np.sort(test_idxs)

        self.idxs_train = np.zeros((n_samples,), dtype=bool)
        self.idxs_train[train_idxs] = True
        self.DGMS_train = [dgm for dgm, keep in zip(self.DGMS, self.idxs_train) if keep]

        self.idxs_test = np.zeros((n_samples,), dtype=bool)
        self.idxs_test[test_idxs] = True
        self.DGMS_test = [dgm for dgm, keep in zip(self.DGMS, self.idxs_test) if keep]

    def _build_models(self, model_names, params):
        """Return one estimator per (model name, grid value); also sets ``self.C``."""
        factories = {
            'logistic_regression': lambda c: OneVsRestClassifier(
                LogisticRegression(penalty='l2', max_iter=100000, C=c)),
            'SVC': lambda c: OneVsRestClassifier(SVC(C=c, max_iter=100000)),
            'class_tree': lambda c: RandomForestClassifier(n_estimators=c),
            'ridge_regression': lambda c: Ridge(alpha=c, fit_intercept=True),
            'SVR': lambda c: SVR(C=c, max_iter=100000),
            'regr_tree': lambda c: RandomForestRegressor(n_estimators=c),
        }

        models = []
        for i, name in enumerate(model_names):
            if name == 'linear_regression':
                # No hyper-parameter grid for plain least squares.
                models.append(LinearRegression(fit_intercept=True))
            elif name in factories:
                # self.C ends up holding the grid of the last gridded model.
                self.C = params[i][-1]
                models.extend(factories[name](c) for c in self.C)
        return models

    def parameter_selection(self, parallel=True):
        """Evaluate every (n_theta, alpha, k, model, fold) with ``make_CV_fold``.

        Results are stored in ``self.RESULTS`` in the nesting order
        THETA > ALPHA > K > MODELS > folds, which ``read_CV_results``
        relies on.

        Parameters
        ----------
        parallel : bool
            Use a ``multiprocessing`` pool (default) or run serially.
        """
        tasks = [[self.y_train, self.DGMS_train,
                  n_theta, alpha, k,
                  model,
                  train_idx, test_idx] for n_theta in self.THETA
                                       for alpha in self.ALPHA
                                       for k in self.K
                                       for model in self.MODELS
                                       for (train_idx, test_idx) in self.CV_splits]

        if parallel:
            # The context manager tears the workers down even if map() raises.
            with mp.Pool() as pool:
                self.RESULTS = pool.map(make_CV_fold, tasks)
        else:
            self.RESULTS = [make_CV_fold(task) for task in tasks]

    def read_CV_results(self,):
        """Sum the fold scores per configuration and record the best one.

        Runs ``parameter_selection`` first if no results are available.
        Sets ``self.SCORES`` (summed score -> index list; configurations
        with an identical sum overwrite each other), ``self.best_score``
        and ``self.best_params``.
        """
        if not hasattr(self, 'RESULTS'):
            self.parameter_selection()

        self.SCORES = {}

        a = 0  # running position in the flat RESULTS list

        for i_theta in range(len(self.THETA)):
            for i_a in range(len(self.ALPHA)):
                for i_k in range(len(self.K)):
                    for i_model in range(len(self.MODELS)):
                        total = sum(self.RESULTS[a + i_fold]['score']
                                    for i_fold in range(self.n_splits))
                        a += self.n_splits
                        self.SCORES[total] = [i_theta, i_a, i_k, i_model]

        self.best_score = np.max(list(self.SCORES.keys()))

        i_theta, i_a, i_k, i_model = self.SCORES[self.best_score]

        self.best_params = {}
        self.best_params['n_theta'] = self.THETA[i_theta]
        self.best_params['alpha'] = self.ALPHA[i_a]
        self.best_params['k'] = self.K[i_k]
        self.best_params['model'] = self.MODELS[i_model]
        self.best_params['idx_model'] = i_model

    def evaluate_best_model(self,):
        """Fit the selected model on the training part and score the test part.

        Returns
        -------
        The value of the estimator's ``score`` method on the test features.
        """
        self.best_weighting = make_weighting(K=self.best_params['k'],
                                             alpha=self.best_params['alpha'])
        self.n_theta = self.best_params['n_theta']
        self.n_phi = 2*self.best_params['n_theta']

        training_data = PerSPhere(self.DGMS_train, weighting = self.best_weighting,
                                  n_theta = self.n_theta, n_phi = self.n_phi).sph_armonics
        test_data = PerSPhere(self.DGMS_test, weighting = self.best_weighting,
                              n_theta = self.n_theta, n_phi = self.n_phi).sph_armonics

        # The features are used as returned by PerSPhere, without rescaling.
        self.best_params['model'].fit(training_data, self.y_train)
        score = self.best_params['model'].score(test_data, self.y_test)

        return score

    def run_analysis(self,):
        """Run the grid search, pick the best configuration and return its test score."""
        self.parameter_selection()
        self.read_CV_results()
        score = self.evaluate_best_model()

        return score





        
class KernelPipeline(object):
    """Model selection and evaluation on a pairwise matrix between diagrams.

    The pairwise matrix ``D`` is either supplied or computed with
    ``wrap_SW`` (presumably a sliced-Wasserstein distance -- confirm in
    ``pipelines_utils``).  ``run_analysis`` grid-searches ``(sigma, model)``
    through ``kernel_make_CV_fold`` and scores the winner on the test part.
    """

    def __init__(self, DGMS, Y, params, D,
                 model = 'SVC', train_split = 0.8, n_splits = 5,
                 M=10):
        """Build or store the pairwise matrix, split the data, prepare the CV.

        Parameters
        ----------
        DGMS : sequence
            One entry per sample, forwarded pairwise to ``wrap_SW``.
        Y : array-like
            Targets, one per entry of ``DGMS``.  Must support indexing with
            an integer array (e.g. a NumPy array).
        params : sequence
            The pair ``(SIGMA, C)`` of hyper-parameter grids.
        D : array or None
            Precomputed square pairwise matrix; computed here when ``None``.
        model : str
            'SVC' (stratified folds) or 'SVR'.  Any other value leaves
            ``SIGMA``, ``C`` and ``MODELS`` unset.
        train_split : float
            Forwarded to ``train_test_split`` as ``train_size``.
        n_splits : int
            Number of cross-validation folds.
        M : int
            Third argument forwarded to ``wrap_SW`` for every pair.
        """
        self.DGMS = DGMS
        self.Y = Y
        self.M = M
        self.train_split = train_split
        self.n_splits = n_splits

        stratified = model == 'SVC'
        if stratified:
            self.kf_CV = StratifiedKFold(n_splits=self.n_splits)
        else:
            self.kf_CV = KFold(n_splits=self.n_splits)

        print('Making Graham Matrix.')

        t0 = time.time()
        if D is None:
            self.make_graham_matrix()
        else:
            self.D = D
        t1 = time.time()

        print('Done!', t1-t0)

        # Splitting train and test.
        n_samples = len(self.DGMS)
        indexes = np.arange(n_samples, dtype=int)
        train_idxs, test_idxs, self.y_train, self.y_test = train_test_split(
            indexes, self.Y, train_size=self.train_split)

        # Rows/columns are selected below with boolean masks, i.e. in
        # increasing index order, so the labels are reordered to match.
        self.y_train = self.y_train[np.argsort(train_idxs)]
        train_idxs = np.sort(train_idxs)

        self.y_test = self.y_test[np.argsort(test_idxs)]
        test_idxs = np.sort(test_idxs)

        self.idxs_train = np.zeros((n_samples,), dtype=bool)
        self.idxs_train[train_idxs] = True

        self.idxs_test = np.zeros((n_samples,), dtype=bool)
        self.idxs_test[test_idxs] = True

        # Train block: train rows x train columns.
        self.X_train = self.D[self.idxs_train,:]
        self.X_train = self.X_train[:,self.idxs_train]
        self.DGMS_train = [dgm for dgm, keep in zip(self.DGMS, self.idxs_train) if keep]

        # Test block: test rows x train columns.
        self.X_test = self.D[self.idxs_test,:]
        self.X_test = self.X_test[:,self.idxs_train]
        self.DGMS_test = [dgm for dgm, keep in zip(self.DGMS, self.idxs_test) if keep]

        # Preparing the CV for parameter selection.
        if stratified:
            self.CV_splits = list(self.kf_CV.split(self.X_train, self.y_train))
        else:
            self.CV_splits = list(self.kf_CV.split(self.X_train))

        if model == 'SVC':
            self.SIGMA, self.C = params
            self.MODELS = [OneVsRestClassifier(SVC(C=c, max_iter=100000)) for c in self.C]
        elif model == 'SVR':
            self.SIGMA, self.C = params
            self.MODELS = [SVR(C=c, max_iter=100000) for c in self.C]

    def make_graham_matrix(self,):
        """Compute the symmetric pairwise matrix ``self.D`` with ``wrap_SW``.

        Only the pairs ``(i, j)`` with ``j < i`` are evaluated (in a
        ``multiprocessing`` pool); the result is mirrored and the diagonal
        is left at zero.
        """
        n_samples = len(self.DGMS)
        pairs = [[self.DGMS[i], self.DGMS[j], self.M]
                 for i in range(n_samples) for j in range(i)]

        # The context manager tears the workers down even if map() raises.
        with mp.Pool() as pool:
            RESULTS = pool.map(wrap_SW, pairs)

        D = np.zeros((n_samples, n_samples))
        # tril_indices walks the strict lower triangle row by row, which is
        # exactly the order in which the pairs were generated above.
        D[np.tril_indices(n_samples, k=-1)] = RESULTS

        self.D = D+D.T

    def parameter_selection(self, parallel=True):
        """Evaluate every (sigma, model, fold) with ``kernel_make_CV_fold``.

        Results are stored in ``self.RESULTS`` in the nesting order
        SIGMA > MODELS > folds, which ``read_CV_results`` relies on.

        Parameters
        ----------
        parallel : bool
            Use a ``multiprocessing`` pool (default) or run serially.
        """
        tasks = [[self.X_train, self.y_train, sigma,
                  model,
                  train_idx, test_idx] for sigma in self.SIGMA
                                       for model in self.MODELS
                                       for (train_idx, test_idx) in self.CV_splits]

        if parallel:
            with mp.Pool() as pool:
                self.RESULTS = pool.map(kernel_make_CV_fold, tasks)
        else:
            self.RESULTS = [kernel_make_CV_fold(task) for task in tasks]

    def read_CV_results(self,):
        """Sum the fold scores per configuration and record the best one.

        Runs ``parameter_selection`` first if no results are available.
        Sets ``self.SCORES`` (summed score -> index list; configurations
        with an identical sum overwrite each other), ``self.best_score``
        and ``self.best_params``.
        """
        if not hasattr(self, 'RESULTS'):
            self.parameter_selection()

        self.SCORES = {}

        a = 0  # running position in the flat RESULTS list
        for i_sigma in range(len(self.SIGMA)):
            for i_model in range(len(self.MODELS)):
                total = sum(self.RESULTS[a + i_fold]['score']
                            for i_fold in range(self.n_splits))
                a += self.n_splits
                self.SCORES[total] = [i_sigma, i_model]

        self.best_score = np.max(list(self.SCORES.keys()))

        i_sigma, i_model = self.SCORES[self.best_score]

        self.best_params = {}
        self.best_params['sigma'] = self.SIGMA[i_sigma]
        self.best_params['model'] = self.MODELS[i_model]

    def evaluate_best_model(self,):
        """Fit the selected model on ``X_train`` and score it on ``X_test``.

        Returns
        -------
        The value of the estimator's ``score`` method on the test block.
        """
        # NOTE(review): the selected 'sigma' is not used here -- the model is
        # fitted on the raw blocks of D.  Confirm against what
        # kernel_make_CV_fold does with sigma during cross-validation.
        self.best_params['model'].fit(self.X_train, self.y_train)
        score = self.best_params['model'].score(self.X_test, self.y_test)

        return score

    def run_analysis(self,):
        """Run the grid search, pick the best configuration and return its test score."""
        self.parameter_selection()
        self.read_CV_results()
        score = self.evaluate_best_model()

        return score




class PiPipeline(object):
    """Model selection and evaluation on persistence-image features.

    ``run_analysis`` grid-searches ``(pix, n_sigma, p, model)`` through
    ``make_CV_fold_PI`` and scores the winning configuration on the test
    part using a ``PersistenceImager``.
    """

    # Model names for which the CV folds are stratified on the labels.
    _CLASSIFIERS = ('logistic_regression', 'SVC', 'class_tree')

    def __init__(self, DGMS, Y, params,
                 MODELS = ['logistic_regression'], train_split = 0.8, n_splits = 5,
                 N=[100], N_SIGMA = [10], P = [1]):
        """Derive the pixel-size grid, split the data and build the estimators.

        Parameters
        ----------
        DGMS : sequence of 2-D arrays
            One array per sample; columns 0 and 1 are read (presumably
            birth and death -- confirm with the data loader).
        Y : array-like
            Targets, one per entry of ``DGMS``.  Must support indexing with
            an integer array (e.g. a NumPy array).
        params : sequence
            ``params[i][-1]`` holds the hyper-parameter grid of ``MODELS[i]``.
        MODELS : list of str
            Any of 'logistic_regression', 'linear_regression',
            'ridge_regression', 'SVC', 'SVR', 'class_tree', 'regr_tree'.
            Unknown names are ignored.  The first name decides whether the
            folds are stratified.
        train_split : float
            Forwarded to ``train_test_split`` as ``train_size``.
        n_splits : int
            Number of cross-validation folds.
        N : list
            Divisors of the data range used to derive the pixel sizes.
        N_SIGMA : list
            Grid searched together with the pixel size; the reported kernel
            sigma is ``pix / n_sigma``.
        P : list
            Grid of values assigned to the imager's weight parameter ``'n'``.
        """
        self.DGMS = DGMS
        self.Y = Y

        # Global range of the first and second coordinate over all diagrams.
        m_b = np.min([np.min(D[:,0]) for D in DGMS])
        m_d = np.min([np.min(D[:,1]) for D in DGMS])

        M_b = np.max([np.max(D[:,0]) for D in DGMS])
        M_d = np.max([np.max(D[:,1]) for D in DGMS])

        self.N = N
        # Smaller of the two ranges divided by n, rounded up to a power of ten.
        self.PIX = [np.min([(M_b-m_b)/n,(M_d-m_d)/n]) for n in self.N]
        self.PIX = [10 ** np.ceil(np.log10(pix)) for pix in self.PIX]
        self.N_SIGMA = N_SIGMA
        # NOTE(review): divides by N, whereas the sigma reported in
        # read_CV_results is pix / N_SIGMA.  This attribute is not read
        # anywhere in this class; kept unchanged -- confirm the intent.
        self.SIGMA = [pix/n for pix in self.PIX for n in self.N]
        self.P = P

        self.train_split = train_split
        self.n_splits = n_splits

        stratified = MODELS[0] in self._CLASSIFIERS
        if stratified:
            self.kf_CV = StratifiedKFold(n_splits=self.n_splits)
        else:
            self.kf_CV = KFold(n_splits=self.n_splits)

        # Splitting train and test.
        self._split_train_test()

        # Preparing the CV for parameter selection.
        if stratified:
            self.CV_splits = list(self.kf_CV.split(self.DGMS_train, self.y_train))
        else:
            self.CV_splits = list(self.kf_CV.split(self.DGMS_train))

        self.MODELS = self._build_models(MODELS, params)

    def _split_train_test(self):
        """Draw the random train/test split and store both parts in index order."""
        n_samples = len(self.DGMS)
        indexes = np.arange(n_samples, dtype=int)
        train_idxs, test_idxs, self.y_train, self.y_test = train_test_split(
            indexes, self.Y, train_size=self.train_split)

        # The diagrams are selected below in increasing index order, so the
        # labels are reordered the same way to stay aligned with them.
        self.y_train = self.y_train[np.argsort(train_idxs)]
        train_idxs = np.sort(train_idxs)

        self.y_test = self.y_test[np.argsort(test_idxs)]
        test_idxs = np.sort(test_idxs)

        self.idxs_train = np.zeros((n_samples,), dtype=bool)
        self.idxs_train[train_idxs] = True
        self.DGMS_train = [dgm for dgm, keep in zip(self.DGMS, self.idxs_train) if keep]

        self.idxs_test = np.zeros((n_samples,), dtype=bool)
        self.idxs_test[test_idxs] = True
        self.DGMS_test = [dgm for dgm, keep in zip(self.DGMS, self.idxs_test) if keep]

    def _build_models(self, model_names, params):
        """Return one estimator per (model name, grid value); also sets ``self.C``."""
        factories = {
            'logistic_regression': lambda c: OneVsRestClassifier(
                LogisticRegression(penalty='l2', max_iter=100000, C=c)),
            'SVC': lambda c: OneVsRestClassifier(SVC(C=c, max_iter=100000)),
            'class_tree': lambda c: RandomForestClassifier(n_estimators=c),
            'ridge_regression': lambda c: Ridge(alpha=c, fit_intercept=True),
            'SVR': lambda c: SVR(C=c, max_iter=100000),
            'regr_tree': lambda c: RandomForestRegressor(n_estimators=c),
        }

        models = []
        for i, name in enumerate(model_names):
            if name == 'linear_regression':
                # No hyper-parameter grid for plain least squares.
                models.append(LinearRegression(fit_intercept=True))
            elif name in factories:
                # self.C ends up holding the grid of the last gridded model.
                self.C = params[i][-1]
                models.extend(factories[name](c) for c in self.C)
        return models

    def parameter_selection(self, parallel=True):
        """Evaluate every (pix, n_sigma, p, model, fold) with ``make_CV_fold_PI``.

        Results are stored in ``self.RESULTS`` in the nesting order
        PIX > N_SIGMA > P > MODELS > folds, which ``read_CV_results``
        relies on.

        Parameters
        ----------
        parallel : bool
            Use a ``multiprocessing`` pool (default) or run serially.
        """
        tasks = [[self.y_train, self.DGMS_train,
                  model, pix, n_sigma, p,
                  train_idx, test_idx] for pix in self.PIX
                                       for n_sigma in self.N_SIGMA
                                       for p in self.P
                                       for model in self.MODELS
                                       for (train_idx, test_idx) in self.CV_splits]

        if parallel:
            # The context manager tears the workers down even if map() raises.
            with mp.Pool() as pool:
                self.RESULTS = pool.map(make_CV_fold_PI, tasks)
        else:
            self.RESULTS = [make_CV_fold_PI(task) for task in tasks]

    def read_CV_results(self,):
        """Sum the fold scores per configuration and record the best one.

        Runs ``parameter_selection`` first if no results are available.
        Sets ``self.SCORES`` (summed score -> index list; configurations
        with an identical sum overwrite each other), ``self.best_score``
        and ``self.best_params``.
        """
        if not hasattr(self, 'RESULTS'):
            self.parameter_selection()

        self.SCORES = {}

        a = 0  # running position in the flat RESULTS list

        for i_p in range(len(self.PIX)):
            for i_sigma in range(len(self.N_SIGMA)):
                for i_n in range(len(self.P)):
                    for i_model in range(len(self.MODELS)):
                        total = sum(self.RESULTS[a + i_fold]['score']
                                    for i_fold in range(self.n_splits))
                        a += self.n_splits
                        self.SCORES[total] = [i_p, i_sigma, i_n, i_model]

        self.best_score = np.max(list(self.SCORES.keys()))

        # Unpack into i_sigma explicitly: reusing the loop variable here
        # would always pick the last N_SIGMA entry instead of the best one.
        i_p, i_sigma, i_n, i_model = self.SCORES[self.best_score]

        self.best_params = {}
        self.best_params['pix'] = self.PIX[i_p]
        self.best_params['sigma'] = self.PIX[i_p]/self.N_SIGMA[i_sigma]
        self.best_params['p'] = self.P[i_n]
        self.best_params['model'] = self.MODELS[i_model]

    def evaluate_best_model(self,):
        """Fit the selected model on the training images and score the test images.

        Returns
        -------
        The value of the estimator's ``score`` method on the test features.
        """
        self.pimgr = PersistenceImager()
        self.pimgr.pixel_size = self.best_params['pix']
        self.pimgr.kernel_params = {'sigma': self.best_params['sigma']}
        self.pimgr.weight_params['n'] = self.best_params['p']

        # The imager is fitted on train and test diagrams together.
        self.pimgr.fit(self.DGMS_train+self.DGMS_test)

        training_data = [img.flatten() for img in self.pimgr.transform(self.DGMS_train)]
        test_data = [img.flatten() for img in self.pimgr.transform(self.DGMS_test)]

        self.best_params['model'].fit(training_data, self.y_train)
        score = self.best_params['model'].score(test_data, self.y_test)

        return score

    def run_analysis(self,):
        """Run the grid search, pick the best configuration and return its test score."""
        self.parameter_selection()
        self.read_CV_results()
        score = self.evaluate_best_model()

        return score



class PLPipeline(object):
    """Model selection and evaluation on persistence-landscape features.

    All landscapes are computed once in the constructor on a common grid and
    zero-padded to a common number of rows.  ``run_analysis`` grid-searches
    the candidate models through ``make_CV_fold_PL`` and scores the winner
    on the test part.
    """

    # Model names for which the CV folds are stratified on the labels.
    _CLASSIFIERS = ('logistic_regression', 'SVC', 'class_tree')

    def __init__(self, DGMS, Y, params,
                 MODELS = ['logistic_regression'], train_split = 0.8, n_splits = 5,
                 n_grid = 5000):
        """Vectorise the diagrams, split the data and build the estimators.

        Parameters
        ----------
        DGMS : sequence of 2-D arrays
            One array per sample; columns 0 and 1 are read (presumably
            birth and death -- confirm with the data loader).
        Y : array-like
            Targets, one per entry of ``DGMS``.  Must support indexing with
            an integer array (e.g. a NumPy array).
        params : sequence
            ``params[i][-1]`` holds the hyper-parameter grid of ``MODELS[i]``.
        MODELS : list of str
            Any of 'logistic_regression', 'linear_regression',
            'ridge_regression', 'SVC', 'SVR', 'class_tree', 'regr_tree'.
            Unknown names are ignored.  The first name decides whether the
            folds are stratified.
        train_split : float
            Forwarded to ``train_test_split`` as ``train_size``.
        n_splits : int
            Number of cross-validation folds.
        n_grid : int
            Number of grid steps passed to ``PersLandscapeApprox``.
        """
        self.DGMS = DGMS
        self.Y = Y

        # Common grid: from the smallest first coordinate to the largest
        # second coordinate over all diagrams.
        m_b = np.min([np.min(D[:,0]) for D in DGMS])
        M_d = np.max([np.max(D[:,1]) for D in DGMS])

        self.grid = np.linspace(m_b, M_d, n_grid)
        self.n_grid = n_grid

        self.train_split = train_split
        self.n_splits = n_splits

        # Each diagram is wrapped in a one-element list so that hom_deg=0
        # selects it.  Every landscape is computed only once and reused for
        # both the row count and the padded copy.
        landscapes = [
            PersLandscapeApprox(dgms=[dgm], hom_deg=0, start=m_b, stop=M_d,
                                num_steps=n_grid).values
            for dgm in DGMS
        ]

        # Largest number of landscape rows over all samples.
        self.N = max((values.shape[0] for values in landscapes), default=0)

        padded = np.zeros((len(landscapes), self.N, n_grid))
        for row, values in zip(padded, landscapes):
            try:
                row[:values.shape[0], :] = values[:values.shape[0], :]
            except (IndexError, ValueError):
                # Best effort: a landscape with an unexpected shape is kept
                # as an all-zero row instead of aborting the pipeline.
                pass

        # One flattened (rows * n_grid) feature vector per sample.
        self.PLS = padded.reshape(len(landscapes), self.N * n_grid)

        stratified = MODELS[0] in self._CLASSIFIERS
        if stratified:
            self.kf_CV = StratifiedKFold(n_splits=self.n_splits)
        else:
            self.kf_CV = KFold(n_splits=self.n_splits)

        # Splitting train and test.
        self._split_train_test()

        # Preparing the CV for parameter selection.
        if stratified:
            self.CV_splits = list(self.kf_CV.split(self.DGMS_train, self.y_train))
        else:
            self.CV_splits = list(self.kf_CV.split(self.DGMS_train))

        self.MODELS = self._build_models(MODELS, params)

    def _split_train_test(self):
        """Draw the random train/test split and store both parts in index order."""
        n_samples = len(self.DGMS)
        indexes = np.arange(n_samples, dtype=int)
        train_idxs, test_idxs, self.y_train, self.y_test = train_test_split(
            indexes, self.Y, train_size=self.train_split)

        # Samples are selected below in increasing index order, so the
        # labels are reordered the same way to stay aligned with them.
        self.y_train = self.y_train[np.argsort(train_idxs)]
        train_idxs = np.sort(train_idxs)

        self.y_test = self.y_test[np.argsort(test_idxs)]
        test_idxs = np.sort(test_idxs)

        self.idxs_train = np.zeros((n_samples,), dtype=bool)
        self.idxs_train[train_idxs] = True
        self.DGMS_train = [dgm for dgm, keep in zip(self.DGMS, self.idxs_train) if keep]
        self.X_train = self.PLS[self.idxs_train]

        self.idxs_test = np.zeros((n_samples,), dtype=bool)
        self.idxs_test[test_idxs] = True
        self.DGMS_test = [dgm for dgm, keep in zip(self.DGMS, self.idxs_test) if keep]
        self.X_test = self.PLS[self.idxs_test]

    def _build_models(self, model_names, params):
        """Return one estimator per (model name, grid value); also sets ``self.C``."""
        factories = {
            'logistic_regression': lambda c: OneVsRestClassifier(
                LogisticRegression(penalty='l2', max_iter=100000, C=c)),
            'SVC': lambda c: OneVsRestClassifier(SVC(C=c, max_iter=100000)),
            'class_tree': lambda c: RandomForestClassifier(n_estimators=c),
            'ridge_regression': lambda c: Ridge(alpha=c, fit_intercept=True),
            'SVR': lambda c: SVR(C=c, max_iter=100000),
            'regr_tree': lambda c: RandomForestRegressor(n_estimators=c),
        }

        models = []
        for i, name in enumerate(model_names):
            if name == 'linear_regression':
                # No hyper-parameter grid for plain least squares.
                models.append(LinearRegression(fit_intercept=True))
            elif name in factories:
                # self.C ends up holding the grid of the last gridded model.
                self.C = params[i][-1]
                models.extend(factories[name](c) for c in self.C)
        return models

    def parameter_selection(self, parallel=True):
        """Evaluate every (model, fold) with ``make_CV_fold_PL``.

        Results are stored in ``self.RESULTS`` in the nesting order
        MODELS > folds, which ``read_CV_results`` relies on.

        Parameters
        ----------
        parallel : bool
            Use a ``multiprocessing`` pool (default) or run serially.
        """
        tasks = [[self.X_train, self.y_train,
                  model,
                  train_idx, test_idx] for model in self.MODELS
                                       for (train_idx, test_idx) in self.CV_splits]

        if parallel:
            # The context manager tears the workers down even if map() raises.
            with mp.Pool() as pool:
                self.RESULTS = pool.map(make_CV_fold_PL, tasks)
        else:
            self.RESULTS = [make_CV_fold_PL(task) for task in tasks]

    def read_CV_results(self,):
        """Sum the fold scores per model and record the best one.

        Runs ``parameter_selection`` first if no results are available.
        Sets ``self.SCORES`` (summed score -> index list; models with an
        identical sum overwrite each other), ``self.best_score`` and
        ``self.best_params``.
        """
        if not hasattr(self, 'RESULTS'):
            self.parameter_selection()

        self.SCORES = {}

        a = 0  # running position in the flat RESULTS list

        for i_model in range(len(self.MODELS)):
            total = sum(self.RESULTS[a + i_fold]['score']
                        for i_fold in range(self.n_splits))
            a += self.n_splits
            self.SCORES[total] = [i_model]

        self.best_score = np.max(list(self.SCORES.keys()))

        i_model = self.SCORES[self.best_score][0]

        self.best_params = {}
        self.best_params['model'] = self.MODELS[i_model]

    def evaluate_best_model(self,):
        """Fit the selected model on ``X_train`` and score it on ``X_test``.

        Returns
        -------
        The value of the estimator's ``score`` method on the test features.
        """
        self.best_params['model'].fit(self.X_train, self.y_train)
        score = self.best_params['model'].score(self.X_test, self.y_test)

        return score

    def run_analysis(self,):
        """Run the grid search, pick the best model and return its test score."""
        self.parameter_selection()
        self.read_CV_results()
        score = self.evaluate_best_model()

        return score



class PersSplinesPipeline(object):
    """Model selection and evaluation on ``make_PersSplines_vec`` features.

    ``run_analysis`` grid-searches ``(h, iterations, model)`` through
    ``make_CV_fold_PersSplines`` and scores the winning configuration on the
    test part.
    """

    # Model names for which the CV folds are stratified on the labels.
    _CLASSIFIERS = ('logistic_regression', 'SVC', 'class_tree')

    def __init__(self, DGMS, Y, params,
                 MODELS = ['logistic_regression'], train_split = 0.8, n_splits = 5,
                 H=[40], ITER=[100]):
        """Compute the global ranges, split the data and build the estimators.

        Parameters
        ----------
        DGMS : sequence of 2-D arrays
            One array per sample; columns 0 and 1 are read (presumably
            birth and death -- confirm with the data loader).
        Y : array-like
            Targets, one per entry of ``DGMS``.  Must support indexing with
            an integer array (e.g. a NumPy array).
        params : sequence
            ``params[i][-1]`` holds the hyper-parameter grid of ``MODELS[i]``.
        MODELS : list of str
            Any of 'logistic_regression', 'linear_regression',
            'ridge_regression', 'SVC', 'SVR', 'class_tree', 'regr_tree'.
            Unknown names are ignored.  The first name decides whether the
            folds are stratified.
        train_split : float
            Forwarded to ``train_test_split`` as ``train_size``.
        n_splits : int
            Number of cross-validation folds.
        H : list
            Grid of ``h`` values forwarded to the spline vectorisation.
        ITER : list
            Grid of iteration counts forwarded to the spline vectorisation.
        """
        self.DGMS = DGMS
        self.Y = Y

        # Global range of column 0 and largest (column 1 - column 0) over
        # all diagrams; forwarded unchanged to the vectorisation.
        self.m_b = np.min([np.min(D[:,0]) for D in DGMS])
        self.M_b = np.max([np.max(D[:,0]) for D in DGMS])

        self.M_p = np.max([np.max(D[:,1]-D[:,0]) for D in DGMS])

        self.H = H
        self.ITER = ITER

        self.train_split = train_split
        self.n_splits = n_splits

        stratified = MODELS[0] in self._CLASSIFIERS
        if stratified:
            self.kf_CV = StratifiedKFold(n_splits=self.n_splits)
        else:
            self.kf_CV = KFold(n_splits=self.n_splits)

        # Splitting train and test.
        self._split_train_test()

        # Preparing the CV for parameter selection.
        if stratified:
            self.CV_splits = list(self.kf_CV.split(self.DGMS_train, self.y_train))
        else:
            self.CV_splits = list(self.kf_CV.split(self.DGMS_train))

        self.MODELS = self._build_models(MODELS, params)

    def _split_train_test(self):
        """Draw the random train/test split and store both parts in index order."""
        n_samples = len(self.DGMS)
        indexes = np.arange(n_samples, dtype=int)
        train_idxs, test_idxs, self.y_train, self.y_test = train_test_split(
            indexes, self.Y, train_size=self.train_split)

        # The diagrams are selected below in increasing index order, so the
        # labels are reordered the same way to stay aligned with them.
        self.y_train = self.y_train[np.argsort(train_idxs)]
        train_idxs = np.sort(train_idxs)

        self.y_test = self.y_test[np.argsort(test_idxs)]
        test_idxs = np.sort(test_idxs)

        self.idxs_train = np.zeros((n_samples,), dtype=bool)
        self.idxs_train[train_idxs] = True
        self.DGMS_train = [dgm for dgm, keep in zip(self.DGMS, self.idxs_train) if keep]

        self.idxs_test = np.zeros((n_samples,), dtype=bool)
        self.idxs_test[test_idxs] = True
        self.DGMS_test = [dgm for dgm, keep in zip(self.DGMS, self.idxs_test) if keep]

    def _build_models(self, model_names, params):
        """Return one estimator per (model name, grid value); also sets ``self.C``."""
        factories = {
            'logistic_regression': lambda c: OneVsRestClassifier(
                LogisticRegression(penalty='l2', max_iter=100000, C=c)),
            'SVC': lambda c: OneVsRestClassifier(SVC(C=c, max_iter=100000)),
            'class_tree': lambda c: RandomForestClassifier(n_estimators=c),
            'ridge_regression': lambda c: Ridge(alpha=c, fit_intercept=True),
            'SVR': lambda c: SVR(C=c, max_iter=100000),
            'regr_tree': lambda c: RandomForestRegressor(n_estimators=c),
        }

        models = []
        for i, name in enumerate(model_names):
            if name == 'linear_regression':
                # No hyper-parameter grid for plain least squares.
                models.append(LinearRegression(fit_intercept=True))
            elif name in factories:
                # self.C ends up holding the grid of the last gridded model.
                self.C = params[i][-1]
                models.extend(factories[name](c) for c in self.C)
        return models

    def parameter_selection(self, parallel=True):
        """Evaluate every (h, iter, model, fold) with ``make_CV_fold_PersSplines``.

        Results are stored in ``self.RESULTS`` in the nesting order
        H > ITER > MODELS > folds, which ``read_CV_results`` relies on.

        Parameters
        ----------
        parallel : bool
            Use a ``multiprocessing`` pool (default) or run serially.
        """
        tasks = [[self.y_train, self.DGMS_train,
                  model,
                  self.m_b, self.M_b, self.M_p, h, iter_,
                  train_idx, test_idx] for h in self.H
                                       for iter_ in self.ITER
                                       for model in self.MODELS
                                       for (train_idx, test_idx) in self.CV_splits]

        if parallel:
            # The context manager tears the workers down even if map() raises.
            with mp.Pool() as pool:
                self.RESULTS = pool.map(make_CV_fold_PersSplines, tasks)
        else:
            self.RESULTS = [make_CV_fold_PersSplines(task) for task in tasks]

    def read_CV_results(self,):
        """Sum the fold scores per configuration and record the best one.

        Runs ``parameter_selection`` first if no results are available.
        Sets ``self.SCORES`` (summed score -> index list; configurations
        with an identical sum overwrite each other), ``self.best_score``
        and ``self.best_params``.
        """
        if not hasattr(self, 'RESULTS'):
            self.parameter_selection()

        self.SCORES = {}

        a = 0  # running position in the flat RESULTS list

        for i_h in range(len(self.H)):
            for i_iter in range(len(self.ITER)):
                for i_model in range(len(self.MODELS)):
                    total = sum(self.RESULTS[a + i_fold]['score']
                                for i_fold in range(self.n_splits))
                    a += self.n_splits
                    self.SCORES[total] = [i_h, i_iter, i_model]

        self.best_score = np.max(list(self.SCORES.keys()))

        i_h, i_iter, i_model = self.SCORES[self.best_score]

        self.best_params = {}
        self.best_params['h'] = self.H[i_h]
        self.best_params['iterations'] = self.ITER[i_iter]
        self.best_params['model'] = self.MODELS[i_model]

    def _vectorize(self, diagrams, h, iterations):
        """Return the flattened spline features of ``diagrams`` as a 2-D array."""
        vectors = [make_PersSplines_vec(dgm, self.m_b, self.M_b, self.M_p, h,
                                        sig=1e-10, iteration=iterations)
                   for dgm in diagrams]
        return np.array([vec.flatten() for vec in vectors])

    def evaluate_best_model(self,):
        """Fit the selected model on the training features and score the test ones.

        Returns
        -------
        The value of the estimator's ``score`` method on the test features.
        """
        h = self.best_params['h']
        iterations = self.best_params['iterations']

        training_data = self._vectorize(self.DGMS_train, h, iterations)
        test_data = self._vectorize(self.DGMS_test, h, iterations)

        self.best_params['model'].fit(training_data, self.y_train)
        score = self.best_params['model'].score(test_data, self.y_test)

        return score

    def run_analysis(self,):
        """Run the grid search, pick the best configuration and return its test score."""
        self.parameter_selection()
        self.read_CV_results()
        score = self.evaluate_best_model()

        return score
                            



