import pandas
import matplotlib.pyplot as plt
from sklearn import model_selection
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier
import numpy
from sklearn import datasets, linear_model
import statistics
import sys
sys.path.append('../')
from aaa_issta import lib
from sklearn.utils import shuffle
from sklearn.datasets import make_moons,make_classification,make_circles
from sklearn import utils
import scipy.stats as stats

from itertools import groupby

def projection(val):
    '''
    Description
        Map a value onto its residue modulo 3

    Parameters
        val: any value supporting the % operator with an int

    Returns
        val % 3
    '''
    residue = val % 3
    return residue


def _load_sklearn_bunch(load_fn):
    '''Call a scikit-learn ``load_*`` function and return its data/target with no hold-out set.'''
    bunch = load_fn()
    return bunch.data, bunch.target, False, False


def _load_car():
    '''Read the car spreadsheet, label-encode every column and use 'class' as the target.'''
    data = pandas.read_excel('./../dataset/car_data1.xlsx')
    le = LabelEncoder()
    for column in data.columns:
        data[column] = le.fit_transform(data[column])
    return data[data.columns[:-1]], data['class'], False, False


def _load_heart():
    '''Read the Cleveland heart CSV, binarise the target and fill missing 'thal'/'ca' with the mean.'''
    df = pandas.read_csv('./../dataset/cleveland.csv', header=None)
    df.columns = ['age', 'sex', 'cp', 'trestbps', 'chol',
                  'fbs', 'restecg', 'thalach', 'exang',
                  'oldpeak', 'slope', 'ca', 'thal', 'target']
    # Collapse the target values 1..4 into a single positive class.
    df['target'] = df.target.map({0: 0, 1: 1, 2: 1, 3: 1, 4: 1})
    # Values other than 0/1 become NaN, exactly as the former
    # 0/1 -> 'female'/'male' -> 0/1 round-trip did.
    df['sex'] = df.sex.map({0: 0, 1: 1})
    df['thal'] = df.thal.fillna(df.thal.mean())
    df['ca'] = df.ca.fillna(df.ca.mean())
    return df.iloc[:, :-1].values, df.iloc[:, -1].values, False, False


def _load_bank():
    '''Read the banking CSV, drop unused columns, label-encode categoricals and split 50/50.'''
    df = pandas.read_csv('./../dataset/banking_updated.csv')
    df.drop(['duration', 'contact', 'month', 'day_of_week', 'default', 'pdays'], axis=1, inplace=True)
    # Merge the three "basic" education levels into one category.
    df.replace(['basic.6y', 'basic.4y', 'basic.9y'], 'basic', inplace=True)
    le = LabelEncoder()
    for column in ('job', 'marital', 'education', 'housing', 'loan', 'poutcome'):
        df[column] = le.fit_transform(df[column])
    X = df.iloc[:, 0:14]
    y = df.iloc[:, 14]
    X_train, X_hold, y_train, y_hold = model_selection.train_test_split(X, y, test_size=0.5,
                                                                        random_state=42)
    return X_train, y_train, X_hold, y_hold


def _read_encoded_csv(path, names, string_columns, labelnum):
    '''Read a headerless CSV, pass it through lib.fit_string_data and split features from the label column.'''
    frame = pandas.read_csv(path, names=names)
    # NOTE(review): fit_string_data is assumed to encode `string_columns`
    # in place; its return value is ignored, as in the original code.
    lib.fit_string_data(frame, string_columns)
    values = frame.values
    return values[:, 0:labelnum], values[:, labelnum]


def _load_adult():
    '''Load the adult training file as the training set and the adult test file as the hold-out set.'''
    names = [str(i) for i in range(1, 15)] + ['result']
    headlist = ['2', '4', '6', '7', '8', '9', '10', '14']
    X_train, y_train = _read_encoded_csv("./../dataset/adult.data.txt", names, headlist, 14)
    X_hold, y_hold = _read_encoded_csv("./../dataset/adult.test.txt", names, headlist, 14)
    return X_train, y_train, X_hold, y_hold


def _load_connect():
    '''Load the connect file and keep 20% for training, 80% as the hold-out set.'''
    headlist = [str(i) for i in range(1, 43)]
    X, y = _read_encoded_csv("./../dataset/connect.data", headlist + ['result'], headlist, 42)
    X_train, X_hold, y_train, y_hold = model_selection.train_test_split(X, y, test_size=0.8,
                                                                        random_state=42)
    return X_train, y_train, X_hold, y_hold


def get_data(datasetname):
    '''
    Description
        This function returns data based on the name of a dataset. The name is
        matched by substring, so any name containing one of the keys below is
        accepted. If a name contains several keys, every matching loader runs
        in the order listed in the code and the last one determines the result.

    Parameters
        datasetname: 'iris','wine','heart','car','cancer','bank','adult','connect'

    Returns
        X_train,y_train,X_hold,y_hold
        X_hold and y_hold are False for datasets without a hold-out split
        ('iris','wine','cancer','car','heart'). The training data is shuffled
        (unseeded) before being returned.

    Raises
        ValueError: if datasetname contains none of the supported keys
    '''
    # The sklearn loaders are wrapped in lambdas so that nothing is loaded
    # (or even looked up) before the name has been validated.
    loaders = (
        ('iris', lambda: _load_sklearn_bunch(datasets.load_iris)),
        ('wine', lambda: _load_sklearn_bunch(datasets.load_wine)),
        ('cancer', lambda: _load_sklearn_bunch(datasets.load_breast_cancer)),
        ('car', _load_car),
        ('heart', _load_heart),
        ('bank', _load_bank),
        ('adult', _load_adult),
        ('connect', _load_connect),
    )

    matching = [loader for key, loader in loaders if key in datasetname]
    if not matching:
        raise ValueError(
            'Unknown dataset name {!r}; expected a name containing one of: {}'.format(
                datasetname, ', '.join(key for key, _ in loaders)))

    # Run every matching loader in order; later matches override earlier ones.
    for loader in matching:
        X_train, y_train, X_hold, y_hold = loader()

    X_train, y_train = shuffle(X_train, y_train)
    return X_train, y_train, X_hold, y_hold


def draw_Figure3(datasetname):
    '''
    Description
        This function runs the depth sweep behind the sub-figures of Figure 3 in our
        submission. For 10 rounds it trains a DecisionTreeClassifier at every depth of
        a dataset-specific range, records which depth maximises PV, cross-validation
        accuracy and (for datasets with a hold-out set) test accuracy, and prints the
        per-round best depths together with their sample variance. Nothing is plotted
        here.

    Parameters
        datasetname: 'iris','wine','heart','car','cancer','bank','adult','connect'

    Returns
        None (results are printed)
    '''
    # Only datasets that come with a hold-out split report test accuracy.
    holdout_keys = ('adult', 'connect', 'bank', 'moon', 'circle')
    usetestaccuracy = any(key in datasetname for key in holdout_keys)

    # Depths to sweep; 'car' and 'connect' use wider, coarser ranges.
    range_ = numpy.arange(1, 10, 1)
    if 'car' in datasetname:
        range_ = numpy.arange(1, 20, 2)
    if 'connect' in datasetname:
        range_ = numpy.arange(5, 31, 5)

    pvbestlist = []
    cvbestlist = []
    testbestlist = []

    for j in range(0, 10):  # repeat the sweep 10 times to observe variance

        pvlist = []
        cvlist = []
        testlist = []
        for i in range_:  # iterate over each depth

            model = DecisionTreeClassifier(max_depth=i)
            # The data is re-fetched (and therefore re-shuffled) for every depth.
            X_train, y_train, X_hold, y_hold = get_data(datasetname)
            print('depth: '+str(i)+'   round: '+str(j))
            if usetestaccuracy:
                dic_metric_value = lib.get_allmetrics_classic(model, X_train, y_train, X_hold, y_hold)
                pvlist.append(dic_metric_value['pv'])
                cvlist.append(dic_metric_value['cv'])
                testlist.append(dic_metric_value['test'])
            else:
                pvlist.append(lib.get_PV_classic(model, X_train, y_train))
                cvlist.append(model_selection.cross_val_score(model, X_train, y_train, cv=3).mean())

        # Store plain ints: with numpy integer inputs the statistics module can
        # coerce its result back to the numpy integer type and truncate it.
        pvbestlist.append(int(range_[pvlist.index(max(pvlist))]))
        cvbestlist.append(int(range_[cvlist.index(max(cvlist))]))
        if usetestaccuracy:
            testbestlist.append(int(range_[testlist.index(max(testlist))]))

    print(pvbestlist)
    print(statistics.variance(pvbestlist))
    print(cvbestlist)
    print(statistics.variance(cvbestlist))

    if usetestaccuracy:
        # Each round's best test depth was already recorded inside the loop;
        # appending the last round again here would skew the variance.
        print(testbestlist)
        print(statistics.variance(testbestlist))







if __name__ == '__main__':
    # The experiment run is currently disabled; uncomment to sweep a dataset.
    # draw_Figure3('connect')

    # Quick check of statistics.variance on a small hand-written sample.
    sample = [0.2, 0.0, 0.0, 0.2, 0.0]
    print(statistics.variance(sample))






