import pandas
import matplotlib.pyplot as plt
from sklearn import model_selection
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier
import numpy
from sklearn import datasets, linear_model
import statistics
import sys
sys.path.append('../')
from aaa_issta import lib
from sklearn.utils import shuffle
from sklearn.datasets import make_moons,make_classification,make_circles
from sklearn import utils
from sklearn.dummy import DummyClassifier
def get_data(datasetname):
    '''
    Description
        Load the dataset selected by ``datasetname`` and return a training
        split (shuffled) together with a hold-out split.

        Selection uses ``in`` tests against ``datasetname``. The tests are
        independent and run in the order listed below, so if more than one
        matches, the last matching dataset is the one returned.

    Parameters
        datasetname: value containing one of
            'moon', 'circle', 'iris', 'wine', 'cancer', 'car', 'heart',
            'bank', 'adult', 'connect'

    Returns
        X_train, y_train, X_hold, y_hold
        X_hold and y_hold are both ``False`` for the datasets that have no
        hold-out split here (iris, wine, cancer, car, heart).

    Raises
        ValueError: if ``datasetname`` matches none of the supported names.
    '''
    # Sentinel: lets an unrecognised name be reported explicitly below
    # instead of failing with an UnboundLocalError at the final shuffle.
    X_train = y_train = X_hold = y_hold = None

    # --- synthetic 2-D datasets: 3000 noise-free samples, split 50/50 ---
    # Generation is seeded, but the split is not, so the partition differs
    # from call to call.
    if 'moon' in datasetname:
        X, y = make_moons(noise=0.0, random_state=1, n_samples=3000)
        X_train, X_hold, y_train, y_hold = model_selection.train_test_split(X, y, test_size=0.5)
    if 'circle' in datasetname:
        X, y = make_circles(noise=0.0, random_state=1, n_samples=3000)
        X_train, X_hold, y_train, y_hold = model_selection.train_test_split(X, y, test_size=0.5)

    # --- scikit-learn bundled datasets: returned whole, no hold-out ---
    if 'iris' in datasetname:
        iris = datasets.load_iris()
        X_train = iris.data
        y_train = iris.target
        X_hold = False
        y_hold = False
    if 'wine' in datasetname:
        wine = datasets.load_wine()
        X_train = wine.data
        y_train = wine.target
        X_hold = False
        y_hold = False
    if 'cancer' in datasetname:
        cancer = datasets.load_breast_cancer()
        X_train = cancer.data
        y_train = cancer.target
        X_hold = False
        y_hold = False

    if 'car' in datasetname:
        data = pandas.read_excel('./../dataset/car_data1.xlsx')
        # Label-encode every column, the target included; the encoder is
        # re-fitted per column, so codes are independent between columns.
        le = LabelEncoder()
        for column in data.columns:
            data[column] = le.fit_transform(data[column])

        X_train = data[data.columns[:-1]]
        y_train = data['class']
        X_hold = False
        y_hold = False

    if 'heart' in datasetname:
        df = pandas.read_csv('./../dataset/cleveland.csv', header=None)
        df.columns = ['age', 'sex', 'cp', 'trestbps', 'chol',
                      'fbs', 'restecg', 'thalach', 'exang',
                      'oldpeak', 'slope', 'ca', 'thal', 'target']
        # Collapse the target values 1-4 into a single positive class.
        df['target'] = df.target.map({0: 0, 1: 1, 2: 1, 3: 1, 4: 1})
        # Round trip through string labels; any 'sex' value other than 0/1
        # ends up as NaN because Series.map leaves unmapped keys as NaN.
        df['sex'] = df.sex.map({0: 'female', 1: 'male'})
        # Missing 'thal' / 'ca' entries are imputed with the column mean.
        df['thal'] = df.thal.fillna(df.thal.mean())
        df['ca'] = df.ca.fillna(df.ca.mean())
        df['sex'] = df.sex.map({'female': 0, 'male': 1})

        X_train = df.iloc[:, :-1].values
        y_train = df.iloc[:, -1].values
        X_hold = False
        y_hold = False

    if 'bank' in datasetname:
        df = pandas.read_csv('./../dataset/banking_updated.csv')

        df.drop(['duration', 'contact', 'month', 'day_of_week', 'default', 'pdays', ], axis=1, inplace=True)
        # Merge the three "basic" education levels into one category.
        df.replace(['basic.6y', 'basic.4y', 'basic.9y'], 'basic', inplace=True)
        le = LabelEncoder()
        for column in ('job', 'marital', 'education', 'housing', 'loan', 'poutcome'):
            df[column] = le.fit_transform(df[column])
        # After the drop, the first 14 columns are used as features and
        # column 14 as the label.
        X = df.iloc[:, 0:14]
        y = df.iloc[:, 14]
        X_train, X_hold, y_train, y_hold = model_selection.train_test_split(X, y, test_size=0.5,
                                                                            random_state=42)

    if 'adult' in datasetname:
        # Columns '1'..'14' followed by the label column 'result'.
        names = [str(k) for k in range(1, 15)] + ['result']
        traindatafilepath = "./../dataset/adult.data.txt"
        testdatafilepath = "./../dataset/adult.test.txt"

        labelnum = 14
        # Columns handed to lib.fit_string_data.
        headlist = ['2', '4', '6', '7', '8', '9', '10', '14']

        # preprocess training data
        train_df = pandas.read_csv(traindatafilepath, names=names)
        # NOTE(review): the return value is ignored, so fit_string_data
        # presumably encodes the listed columns in place - confirm in lib.
        lib.fit_string_data(train_df, headlist)
        array = train_df.values
        X_train = array[:, 0:labelnum]
        y_train = array[:, labelnum]

        # preprocess test data (read from its own file, used as hold-out)
        test_df = pandas.read_csv(testdatafilepath, names=names)
        lib.fit_string_data(test_df, headlist)
        array = test_df.values
        X_hold = array[:, 0:labelnum]
        y_hold = array[:, labelnum]

    if 'connect' in datasetname:
        labelnum = 42
        # Feature columns '1'..'42'; all of them go through fit_string_data.
        headlist = [str(k) for k in range(1, labelnum + 1)]
        names = headlist + ['result']
        traindatafilepath = "./../dataset/connect.data"

        # preprocess training data
        full_df = pandas.read_csv(traindatafilepath, names=names)
        lib.fit_string_data(full_df, headlist)

        array = full_df.values
        X = array[:, 0:labelnum]
        y = array[:, labelnum]

        # test_size=0.8: 20% of the rows are kept for training, 80% held out.
        X_train, X_hold, y_train, y_hold = model_selection.train_test_split(X, y, test_size=0.8,
                                                                            random_state=42)

    if X_train is None:
        raise ValueError(
            "unknown dataset name %r; expected one of 'moon', 'circle', 'iris', "
            "'wine', 'cancer', 'car', 'heart', 'bank', 'adult', 'connect'" % (datasetname,))

    # Only the training split is shuffled (unseeded); hold-out is left as is.
    X_train, y_train = shuffle(X_train, y_train)
    return X_train, y_train, X_hold, y_hold


def draw_Figure8_classic(datasetlist):
    '''
    Description
        Plot, for every dataset in ``datasetlist``, the PV value returned by
        lib.get_PV_classic_withdifferentnoisedegree against the mutation
        degree eta. All datasets are drawn as separate curves on one shared
        axis; the figure is saved to "./../issta/plots/uci-eta.pdf" and then
        shown.

    Parameters
        datasetlist: iterable of dataset names accepted by get_data

    Returns
        None
    '''
    # Mutation degrees on the x axis: 0.1, 0.2, ..., 0.5.
    degrees = numpy.arange(0.1, 0.6, 0.1)

    plt.figure(figsize=(6, 5))
    ax = plt.subplot(111)
    ax.set_ylim([-0.05, 1.05])
    plt.tick_params(axis='x', labelsize=25)
    plt.tick_params(axis='y', labelsize=28)
    plt.xticks(numpy.arange(degrees[0], degrees[-1] + 0.1, 0.1))
    # Raw string: "\e" is not a valid escape sequence in a normal literal;
    # the text handed to matplotlib is unchanged.
    plt.xlabel(r"mutation degree $\eta$", fontsize=25)

    for datasetname in datasetlist:
        print(datasetname)

        pvlist = []
        for degree in degrees:  # iterate over the mutation degrees
            print(degree)
            # A new baseline model per degree, so nothing carries over
            # between evaluations.
            model = DummyClassifier(strategy="most_frequent")
            # Data is reloaded per degree on purpose: get_data reshuffles
            # the training split on every call.
            X_train, y_train, _, _ = get_data(datasetname)
            pvlist.append(
                lib.get_PV_classic_withdifferentnoisedegree(model, X_train, y_train, degree))

        plt.plot(degrees, pvlist, 'o-', label=datasetname)  # draw PV

    plt.legend(fontsize=18, ncol=2)
    plt.savefig("./../issta/plots/uci-eta.pdf", bbox_inches='tight')
    plt.show()

def draw_Figure8_deep(datasetlist):
    '''
    Description
        Plot pre-recorded PV values against the mutation degree eta for the
        deep-learning datasets. Nothing is trained or evaluated here: the
        values are the constants stored in this function. The figure is
        saved to "./../plots/deep-eta.pdf" and then shown.

    Parameters
        datasetlist: iterable of names out of 'fashion', 'mnist', 'cifar10'

    Returns
        None

    Raises
        KeyError: if a name in ``datasetlist`` has no recorded values.
    '''
    # Mutation degrees on the x axis: 0.1, 0.2, ..., 0.5.
    degrees = numpy.arange(0.1, 0.6, 0.1)

    plt.figure(figsize=(6, 5))
    ax = plt.subplot(111)
    ax.set_ylim([-0.05, 1.05])
    plt.tick_params(axis='x', labelsize=25)
    plt.tick_params(axis='y', labelsize=28)
    plt.xticks(numpy.arange(degrees[0], degrees[-1] + 0.1, 0.1))
    # Raw string: "\e" is not a valid escape sequence in a normal literal;
    # the text handed to matplotlib is unchanged.
    plt.xlabel(r"mutation degree $\eta$", fontsize=25)

    # One recorded PV value per mutation degree, in the order of `degrees`.
    pvdic = {
        'fashion': [0.92222501039505, 0.9309083580970765, 0.9317708849906918,
                    0.9298624634742738, 0.9244583547115324],
        'mnist': [0.9060041666030884, 0.9250791668891905, 0.9284292221069333,
                  0.9242000102996826, 0.9195208251476287],
        'cifar10': [0.7547150611877442, 0.8062799811363217, 0.8152400016784667,
                    0.7967499792575836, 0.7899749875068663],
    }

    for datasetname in datasetlist:
        print(datasetname)
        # Echo the degrees so the console output stays the same as before.
        for degree in degrees:
            print(degree)

        pvlist = pvdic[datasetname]
        plt.plot(degrees, pvlist, 'o-', label=datasetname)  # draw PV

    plt.legend(fontsize=18, ncol=2)
    plt.savefig("./../plots/deep-eta.pdf", bbox_inches='tight')
    plt.show()


if __name__ == '__main__':
    # Script entry point: draw the classic (UCI) eta plot for the selected
    # datasets. Extend the list to put more curves on the same figure.
    selected_datasets = ['car']
    draw_Figure8_classic(selected_datasets)
    # Alternative: plot the pre-recorded deep-learning results instead.
    # draw_Figure8_deep(['mnist', 'fashion', 'cifar10'])





