import pandas as pd
import numpy as np
import sklearn.metrics as metrics
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler

from sklearn.model_selection import TimeSeriesSplit
from sklearn.linear_model import LinearRegression
from sklearn.neural_network import MLPRegressor 
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.svm import SVR
from sklearn.model_selection import cross_val_score
import matplotlib.pyplot as plt
from sklearn.metrics import make_scorer
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error

# Keras-style hyper-parameter candidates. None of these are constructor
# parameters of the scikit-learn estimators used in this file, so they are kept
# only as module-level constants and are not wired into ``grids``.
optimizer = ['SGD', 'RMSprop', 'Adagrad', 'Adadelta', 'Adam', 'Adamax', 'Nadam']
learn_rate = [0.001, 0.01, 0.1, 0.2, 0.3]
momentum = [0.0, 0.2, 0.4, 0.6, 0.8, 0.9]
init_mode = ['uniform', 'lecun_uniform', 'normal', 'zero', 'glorot_normal', 'glorot_uniform', 'he_normal', 'he_uniform']


# Hyper-parameter grids for GridSearchCV, keyed by the short model names used
# in ``model_delection``. Every key must be a constructor parameter of the
# matching scikit-learn estimator; GridSearchCV rejects unknown names.
grids = {
    # MLPRegressor: the previous Keras-wrapper keys (``model__*``,
    # ``optimizer__*``) are not parameters of the scikit-learn network.
    'NN': {
        'hidden_layer_sizes': [(50,), (100,), (50, 50)],
        'activation': ['relu', 'tanh', 'logistic'],
        'alpha': [0.0001, 0.001, 0.01, 0.1],
    },
    # LinearRegression is unregularised and therefore has no ``alpha``.
    'LR': {
        'fit_intercept': [True, False],
    },
    'KNN': {
        'n_neighbors': list(range(1, 31)),
    },
    'RF': {
        'n_estimators': [20, 50, 100],
        # 'auto' was removed from recent scikit-learn releases; for regressors
        # it meant "use all features", which 1.0 expresses in every version.
        'max_features': [1.0, 'sqrt', 'log2'],
        'max_depth': list(range(5, 10)),
    },
    'SVR': {
        'kernel': ('linear', 'poly', 'rbf', 'sigmoid'),
        'C': [1, 5, 10],
        'degree': [3, 8],
        'coef0': [0.01, 10, 0.5],
        'gamma': ('auto', 'scale'),
    },
}


def rmse(actual, predict):
    """Return the root-mean-square error between ``actual`` and ``predict``.

    Both arguments are converted to NumPy arrays, so any array-like inputs
    that can be subtracted element-wise are accepted.
    """
    residuals = np.array(predict) - np.array(actual)
    return np.sqrt(np.mean(residuals ** 2))


def prepare_data(rankings, data_path, max_rows=100000, test_split=0.1):
    """Load a CSV and build an ordered train/test split for one target column.

    Args:
        rankings: Mapping whose ``'attribute'`` entry is a list of dicts with
            a ``'name'`` key. The first entry names the target column. The
            other ranking categories are not consumed by this function.
        data_path: Path of the CSV file to read. Its first three columns are
            joined with ``'-'`` and parsed as the date index.
        max_rows: Number of leading rows to read from the file, or ``None``
            to read everything.
        test_split: Fraction of rows (taken from the end) held out for
            testing.

    Returns:
        ``(X_train, y_train, X_test, y_test)`` where the training part is the
        leading rows and the test part is the trailing rows, so the original
        row order is preserved and no row is dropped between the two.

    Raises:
        KeyError: If the target column is not present in the CSV, either as
            given or lower-cased.
    """
    # The caller-supplied rankings decide the target (previously a hard-coded
    # copy silently replaced the argument).
    target_attribute = rankings['attribute'][0]['name']

    # Read only the rows that are needed instead of loading the whole file
    # and truncating afterwards.
    data = pd.read_csv(data_path, sep=",", nrows=max_rows)

    # Build a datetime index from the first three columns ("a-b-c").
    date_parts = data.iloc[:, 0:3]
    data['date'] = pd.to_datetime(
        date_parts.apply(lambda row: '-'.join(row.dropna().astype(str)), axis=1)
    )
    data = data.set_index('date')

    # Resolve the actual column name once so the lower-case fallback is used
    # consistently for the rest of the function.
    if target_attribute in data.columns:
        target_column = target_attribute
    elif target_attribute.lower() in data.columns:
        target_column = target_attribute.lower()
    else:
        raise KeyError(
            'target attribute %r not found in %s' % (target_attribute, data_path)
        )
    print(data[target_column])

    # Label-encode every object-typed column; numeric columns are kept as-is.
    categorical = data.select_dtypes(include=['object'])
    if len(categorical.columns) > 0:
        encoder = preprocessing.LabelEncoder()
        categorical = categorical.apply(encoder.fit_transform)
    df = pd.concat([data.drop(categorical.columns, axis=1), categorical], axis=1)
    df = df.fillna(0)

    # Derive the train size from the test size so the two slices always
    # partition the frame exactly.
    test_size = int(test_split * len(df))
    train_size = len(df) - test_size

    x = df.drop([target_column], axis=1)
    y = df[target_column]

    X_train = x.iloc[:train_size]
    y_train = y.iloc[:train_size]
    X_test = x.iloc[train_size:]
    y_test = y.iloc[train_size:]

    return X_train, y_train, X_test, y_test



def regression_results(y_true, y_pred):
    """Print a summary of regression metrics for a set of predictions.

    Args:
        y_true: Ground-truth target values.
        y_pred: Predicted target values, aligned with ``y_true``.

    The mean squared log error is reported as ``nan`` when scikit-learn
    rejects the inputs for that metric (for example negative values), so the
    remaining metrics are still printed.
    """
    explained_variance = metrics.explained_variance_score(y_true, y_pred)
    mae = metrics.mean_absolute_error(y_true, y_pred)
    mse = metrics.mean_squared_error(y_true, y_pred)
    try:
        # Shape/type problems would already have been raised by the calls
        # above, so a ValueError here means the values are outside the
        # metric's domain rather than malformed input.
        msle = metrics.mean_squared_log_error(y_true, y_pred)
    except ValueError:
        msle = float('nan')
    r2 = metrics.r2_score(y_true, y_pred)

    print('explained_variance: ', round(explained_variance, 4))
    print('mean_squared_log_error: ', round(msle, 4))
    print('r2: ', round(r2, 4))
    print('MAE: ', round(mae, 4))
    print('MSE: ', round(mse, 4))
    print('RMSE: ', round(np.sqrt(mse), 4))






# scaler.fit(X_train)
# # df
# normalized_df = scaler.transform(X_train)
# normalized_df = pd.DataFrame(normalized_df, columns=X_train.columns)
# normalized_df

# X_test = data_consumption['2017'].drop(['Consumption'], axis = 1)
# y_test = data_consumption.loc['2017', 'Consumption']




def model_delection(X_train, y_train, X_test, y_test, n_splits=10):
    """Pick the best regressor by time-series CV, tune it, and predict.

    Each candidate model is scored with R^2 under ``TimeSeriesSplit``
    cross-validation on the training data. The model with the highest mean
    score is then tuned with ``GridSearchCV`` using the module-level ``grids``
    entry for its name and an RMSE scorer (lower is better), and the tuned
    model predicts the test set.

    Args:
        X_train: Training features.
        y_train: Training targets.
        X_test: Test features.
        y_test: Test targets; must expose ``.values``.
        n_splits: Number of splits for ``TimeSeriesSplit``.

    Returns:
        ``(y_true, y_pred)``: the test targets as an array and the tuned
        model's predictions for ``X_test``.
    """
    import warnings

    candidates = [
        ('LR', LinearRegression()),
        ('NN', MLPRegressor(solver='lbfgs')),  # neural network
        ('KNN', KNeighborsRegressor()),
        ('RF', RandomForestRegressor(n_estimators=10)),  # ensemble of decision trees
        ('SVR', SVR(gamma='auto')),
    ]
    tscv = TimeSeriesSplit(n_splits=n_splits)

    # Score every candidate with time-series cross-validation.
    mean_scores = []
    for name, model in candidates:
        cv_results = cross_val_score(model, X_train, y_train, cv=tscv, scoring='r2')
        mean_scores.append(cv_results.mean())
        print('%s: %f (%f)' % (name, cv_results.mean(), cv_results.std()))

    # A NaN mean must never win the comparison, so rank it as -inf.
    # Ties keep the first candidate.
    best_index = max(
        range(len(mean_scores)),
        key=lambda i: -np.inf if np.isnan(mean_scores[i]) else mean_scores[i],
    )
    name, model = candidates[best_index]

    # Keep only grid entries the chosen estimator actually accepts;
    # GridSearchCV would otherwise fail on the first unknown parameter.
    supported = model.get_params()
    requested = grids[name]
    param_search = {key: values for key, values in requested.items() if key in supported}
    dropped = sorted(set(requested) - set(param_search))
    if dropped:
        warnings.warn(
            'Ignoring grid parameters not supported by %s: %s'
            % (type(model).__name__, ', '.join(dropped))
        )

    # GridSearchCV needs a scorer (estimator, X, y) where greater is better;
    # a raw metric such as mean_squared_error has the wrong signature and the
    # wrong direction. make_scorer negates RMSE so that lower error wins.
    rmse_score = make_scorer(rmse, greater_is_better=False)
    gsearch = GridSearchCV(estimator=model, cv=tscv, param_grid=param_search, scoring=rmse_score)
    gsearch.fit(X_train, y_train)
    best_model = gsearch.best_estimator_

    y_true = y_test.values
    y_pred = best_model.predict(X_test)
    print('Mean Squire Error', mean_squared_error(y_true, y_pred))
    print('RMSE', rmse(y_true, y_pred))

    return y_true, y_pred

# Ranked candidates for each component of a prediction query. Every category
# ('attribute', 'filter', 'filter_operation', 'aggregator',
# 'prediction_window') maps to a list of {'name', 'distribution'} entries.
rankings = {'attribute': [{'name': 'ARRIVAL_DELAY', 'distribution': 0.058823529411764705}, {'name': 'DAY_OF_WEEK', 'distribution': 0.058823529411764705}, {'name': 'FLIGHT_NUMBER', 'distribution': 0.058823529411764705}, {'name': 'TAIL_NUMBER', 'distribution': 0.058823529411764705}, {'name': 'SCHEDULED_DEPARTURE_HOUR', 'distribution': 0.058823529411764705}, {'name': 'SCHEDULED_TIME', 'distribution': 0.058823529411764705}, {'name': 'ELAPSED_TIME', 'distribution': 0.058823529411764705}, {'name': 'DEPARTURE_DELAY', 'distribution': 0.058823529411764705}, {'name': 'ARRIVAL_DELAY', 'distribution': 0.058823529411764705}, {'name': 'CANCELLED', 'distribution': 0.058823529411764705}, {'name': 'CANCELLATION_REASON', 'distribution': 0.058823529411764705}, {'name': 'AIR_SYSTEM_DELAY', 'distribution': 0.058823529411764705}, {'name': 'SECURITY_DELAY', 'distribution': 0.058823529411764705}, {'name': 'AIRLINE_DELAY', 'distribution': 0.058823529411764705}, {'name': 'LATE_AIRCRAFT_DELAY', 'distribution': 0.058823529411764705}, {'name': 'WEATHER_DELAY', 'distribution': 0.058823529411764705}, {'name': 'NONE', 'distribution': 0.058823529411764705}], 'filter': [{'name': 'DAY_OF_WEEK', 'distribution': 0.058827485960577665}, {'name': 'DATE', 'distribution': 0.05882608005426186}, {'name': 'SCHEDULED_TIME', 'distribution': 0.05882565223048587}, {'name': 'FLIGHT_NUMBER', 'distribution': 0.05882555804791938}, {'name': 'SCHEDULED_DEPARTURE_HOUR', 'distribution': 0.05882460799426518}, {'name': 'CANCELLED', 'distribution': 0.058824417484297835}, {'name': 'TAIL_NUMBER', 'distribution': 0.05882426063008911}, {'name': 'LATE_AIRCRAFT_DELAY', 'distribution': 0.05882416293367387}, {'name': 'AIR_SYSTEM_DELAY', 'distribution': 0.058823823684101}, {'name': 'ELAPSED_TIME', 'distribution': 0.058823572237279055}, {'name': 'ARRIVAL_DELAY', 'distribution': 0.058823521009809844}, {'name': 'SECURITY_DELAY', 'distribution': 0.05882350028018843}, {'name': 'CANCELLATION_REASON', 'distribution': 0.058823286305443194}, 
{'name': 'AIRLINE_DELAY', 'distribution': 0.05882186202419863}, {'name': 'DEPARTURE_DELAY', 'distribution': 0.05882107751944617}, {'name': 'WEATHER_DELAY', 'distribution': 0.058820179712022184}, {'name': 'NONE', 'distribution': 0.05881695189194067}], 'filter_operation': [{'name': 'IS BETWEEN', 'distribution': 0.09090909090909091}, {'name': 'IS AFTER', 'distribution': 0.09090909090909091}, {'name': 'IS MORE THAN', 'distribution': 0.09090909090909091}, {'name': 'IS LESS THAN', 'distribution': 0.09090909090909091}, {'name': 'IS EQUAL TO', 'distribution': 0.09090909090909091}, {'name': 'IS BEFORE', 'distribution': 0.09090909090909091}, {'name': 'HAS', 'distribution': 0.09090909090909091}, {'name': 'IS GREATER THAN', 'distribution': 0.09090909090909091}, {'name': 'IS ABOUT', 'distribution': 0.09090909090909091}, {'name': 'NOT', 'distribution': 0.09090909090909091}], 'aggregator': [{'name': 'TOTAL', 'distribution': 0.125}, {'name': 'MAXIMUM', 'distribution': 0.125}, {'name': 'MIN', 'distribution': 0.125}, {'name': 'MAX', 'distribution': 0.125}, {'name': 'AVERAGE', 'distribution': 0.125}, {'name': 'MEAN', 'distribution': 0.125}, {'name': 'MINIMUM', 'distribution': 0.125}], 'prediction_window': [{'name': 'YEAR', 'distribution': 0.1}, {'name': 'DAY', 'distribution': 0.1}, {'name': 'MONTH', 'distribution': 0.1}, {'name': 'WEEKS', 'distribution': 0.1}, {'name': 'MONTHS', 'distribution': 0.1}, {'name': 'TOMORROW', 'distribution': 0.1}, {'name': 'WEEK', 'distribution': 0.1}, {'name': 'YEARS', 'distribution': 0.1}, {'name': 'FEW DAYS', 'distribution': 0.1}]}
# Script entry: build the train/test split from the flights CSV, then compare,
# tune and evaluate the models. NOTE: these statements run at import time.
X_train, y_train, X_test, y_test = prepare_data(rankings=rankings, data_path='src/data/csv/flights.csv')
model_delection(X_train, y_train, X_test, y_test)