import pandas
import matplotlib.pyplot as plt
from sklearn import model_selection
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier
import numpy
from sklearn.dummy import DummyClassifier
from sklearn import datasets, linear_model
import statistics
import sys
sys.path.append('../')
from aaa_issta import lib
from sklearn.utils import shuffle
from sklearn.datasets import make_moons,make_classification,make_circles
from sklearn import utils


from itertools import groupby

def projection(val):
    '''
    Description
        Grouping key that maps a value to its remainder modulo 3.

    Parameters
        val: any object supporting ``val % 3``

    Returns
        val % 3
    '''
    remainder = val % 3
    return remainder


def _load_iris():
    '''Load scikit-learn's bundled iris data; no hold-out split is produced.'''
    iris = datasets.load_iris()
    return iris.data, iris.target, False, False


def _load_wine():
    '''Load scikit-learn's bundled wine data; no hold-out split is produced.'''
    wine = datasets.load_wine()
    return wine.data, wine.target, False, False


def _load_cancer():
    '''Load scikit-learn's bundled breast-cancer data; no hold-out split is produced.'''
    cancer = datasets.load_breast_cancer()
    return cancer.data, cancer.target, False, False


def _load_car():
    '''Load the car spreadsheet and label-encode every column; no hold-out split.'''
    data = pandas.read_excel('./../dataset/car_data1.xlsx')
    le = LabelEncoder()
    for column in data.columns:
        data[column] = le.fit_transform(data[column])

    # Features are every column except the last; the label is read by name.
    return data[data.columns[:-1]], data['class'], False, False


def _load_heart():
    '''Load the Cleveland heart CSV with a binarised target; no hold-out split.'''
    df = pandas.read_csv('./../dataset/cleveland.csv', header=None)
    df.columns = ['age', 'sex', 'cp', 'trestbps', 'chol',
                  'fbs', 'restecg', 'thalach', 'exang',
                  'oldpeak', 'slope', 'ca', 'thal', 'target']
    # Binarise the target: 0 stays 0, the values 1-4 all become 1.
    df['target'] = df.target.map({0: 0, 1: 1, 2: 1, 3: 1, 4: 1})
    # Missing 'thal' / 'ca' entries are replaced by the column mean.
    df['thal'] = df.thal.fillna(df.thal.mean())
    df['ca'] = df.ca.fillna(df.ca.mean())
    # Keep only 0/1 in 'sex' (any other value becomes NaN). This is the
    # net effect of the former 0/1 -> 'female'/'male' -> 0/1 round trip.
    df['sex'] = df.sex.map({0: 0, 1: 1})

    return df.iloc[:, :-1].values, df.iloc[:, -1].values, False, False


def _load_bank():
    '''Load the banking CSV, encode categorical columns and split 50/50.'''
    df = pandas.read_csv('./../dataset/banking_updated.csv')
    df.drop(['duration', 'contact', 'month', 'day_of_week', 'default', 'pdays'],
            axis=1, inplace=True)
    # Merge the three "basic.*" education levels into a single category.
    df.replace(['basic.6y', 'basic.4y', 'basic.9y'], 'basic', inplace=True)
    le = LabelEncoder()
    for column in ('job', 'marital', 'education', 'housing', 'loan', 'poutcome'):
        df[column] = le.fit_transform(df[column])

    # NOTE(review): assumes the label is the 15th remaining column - confirm
    # against the CSV layout.
    X = df.iloc[:, 0:14]
    y = df.iloc[:, 14]
    X_train, X_hold, y_train, y_hold = model_selection.train_test_split(
        X, y, test_size=0.5, random_state=42)
    return X_train, y_train, X_hold, y_hold


def _read_encoded_csv(path, names, headlist, labelnum):
    '''Read a header-less CSV, encode the string columns and split X / y.'''
    frame = pandas.read_csv(path, names=names)
    # Presumably encodes the columns listed in headlist in place (the return
    # value was never used by the original code) - verify against lib.
    lib.fit_string_data(frame, headlist)
    array = frame.values
    return array[:, 0:labelnum], array[:, labelnum]


def _load_adult():
    '''Load the adult train/test files; the test file is the hold-out set.'''
    names = [str(i) for i in range(1, 15)] + ['result']
    headlist = ['2', '4', '6', '7', '8', '9', '10', '14']
    labelnum = 14

    # The training file is fully processed before the test file is read.
    X_train, y_train = _read_encoded_csv(
        "./../dataset/adult.data.txt", names, headlist, labelnum)
    X_hold, y_hold = _read_encoded_csv(
        "./../dataset/adult.test.txt", names, headlist, labelnum)
    return X_train, y_train, X_hold, y_hold


def _load_connect():
    '''Load the connect data and keep 20% for training, 80% as hold-out.'''
    headlist = [str(i) for i in range(1, 43)]
    names = headlist + ['result']
    labelnum = 42

    X, y = _read_encoded_csv("./../dataset/connect.data", names, headlist, labelnum)
    X_train, X_hold, y_train, y_hold = model_selection.train_test_split(
        X, y, test_size=0.8, random_state=42)
    return X_train, y_train, X_hold, y_hold


# (substring, loader) pairs in the order the datasets were originally checked.
_DATASET_LOADERS = (
    ('iris', _load_iris),
    ('wine', _load_wine),
    ('cancer', _load_cancer),
    ('car', _load_car),
    ('heart', _load_heart),
    ('bank', _load_bank),
    ('adult', _load_adult),
    ('connect', _load_connect),
)


def get_data(datasetname):
    '''
    Description
        This function returns data based on the name of a dataset. The name is
        matched by substring; if several known names occur in datasetname the
        one checked last (see Parameters order below) is used.

    Parameters
        datasetname: a string containing one of
            'iris','wine','cancer','car','heart','bank','adult','connect'

    Returns
        X_train,y_train,X_hold,y_hold
        X_train and y_train are shuffled (unseeded) before being returned.
        X_hold and y_hold are False for datasets without a hold-out split
        ('iris','wine','cancer','car','heart').

    Raises
        ValueError: if datasetname contains none of the known dataset names
    '''
    matching = [loader for key, loader in _DATASET_LOADERS if key in datasetname]
    if not matching:
        known = ', '.join(key for key, _ in _DATASET_LOADERS)
        raise ValueError('Unknown dataset name %r; expected a name containing one of: %s'
                         % (datasetname, known))

    # Later entries take precedence, matching the former chain of independent ifs.
    X_train, y_train, X_hold, y_hold = matching[-1]()

    X_train, y_train = shuffle(X_train, y_train)
    return X_train, y_train, X_hold, y_hold


def draw_Figure3(datasetname):
    '''
    Description
        This function draw sub-figures in Figure 3 in our submission. It plots
        the mean PV (with a +/- one standard deviation band) against the
        maximum depth, one curve per noise degree.

    Parameters
        datasetname: 'iris','wine','heart','car','cancer','bank','adult','connect'

    Returns
        None. The figure is saved to ./../plots/<datasetname>-diffeta.pdf and
        then displayed with plt.show().
    '''
    noiselist = numpy.arange(0.1, 0.6, 0.1)
    modellist = numpy.arange(1, 10, 1)
    # NOTE(review): the original comment said "10 times" but the loop has
    # always run 11 repetitions; the count is kept unchanged here.
    n_runs = 11

    plt.figure(figsize=(4, 5))
    ax = plt.subplot(111)
    ax.set_ylim([-0.05, 1.05])
    plt.figtext(0.5, 0.9, datasetname, fontsize=30, ha='center')
    plt.tick_params(axis='x', labelsize=25)
    plt.tick_params(axis='y', labelsize=28)
    plt.xticks(numpy.arange(modellist[0], modellist[-1] + 2, 2.0))
    plt.xlabel("maximum depth", fontsize=25)

    for m in noiselist:
        pvmeanlist = []
        pvmean_add_std = []
        pvmean_subs_std = []
        for depth in modellist:  # iterate over each depth
            # The depth is currently not passed to the model: the baseline
            # DummyClassifier is used in place of
            # DecisionTreeClassifier(max_depth=depth).
            model = DummyClassifier(strategy="most_frequent")
            pvlist = []
            for _ in range(n_runs):  # repeat to observe variance between runs
                # Data is reloaded (and therefore reshuffled) for every run.
                X_train, y_train, _X_hold, _y_hold = get_data(datasetname)
                pv = lib.get_PV_classic_withdifferentnoisedegree(model, X_train, y_train, m)
                pvlist.append(pv)

            pvmean = statistics.mean(pvlist)
            pvstd = statistics.stdev(pvlist)
            pvmeanlist.append(pvmean)
            pvmean_add_std.append(pvmean + pvstd)
            pvmean_subs_std.append(pvmean - pvstd)

        # Raw string: '\e' is not a valid Python escape sequence.
        plt.plot(modellist, pvmeanlist, 'o-', label=r'$\eta:$' + str(round(m, 2)))  # draw PV

        # draw the orange shadow to show variance between different runs
        plt.fill_between(modellist, pvmean_subs_std,
                         pvmean_add_std, alpha=0.3,
                         color="orange")

    plt.legend(fontsize=20)
    # Name the output after the dataset so figures do not overwrite each other.
    plt.savefig('./../plots/' + datasetname + '-diffeta.pdf', bbox_inches='tight')

    plt.show()



# Script entry point: draw the Figure 3 sub-figure for the iris dataset.
if __name__ == '__main__':
    draw_Figure3('iris')





