# -*- coding: utf-8 -*-
import os
import sys
import signal
import glob
import time
import random
import concurrent.futures
import pickle
import itertools
import argparse

# Limit each numerical back-end to a single thread. These variables are set
# before numpy is imported below; presumably this is so the setting takes
# effect when the libraries initialise and so that the parallel worker
# processes started at the bottom of this file do not oversubscribe the CPU
# (TODO confirm).
os.environ["OMP_NUM_THREADS"] = "1"
os.environ["OPENBLAS_NUM_THREADS"] = "1"
os.environ["MKL_NUM_THREADS"] = "1"
os.environ["VECLIB_NUM_THREADS"] = "1"
os.environ["NUMEXPR_NUM_THREADS"] = "1"

import numpy as np
import matplotlib
# matplotlib.use("Agg")
import matplotlib.pyplot as plt
import GPy
from scipy.stats import qmc

# add my modules
import script.bayesian_opt as BO
import script.test_functions as test_functions


# Restore the operating system's default action for SIGINT, so Ctrl-C
# terminates the process directly instead of raising KeyboardInterrupt.
signal.signal(signal.SIGINT, signal.SIG_DFL)


def _kernel_spec(kernel_name):
    """Return ``(kernel_class, param_prefix)`` for *kernel_name*.

    ``param_prefix`` is the kernel's name inside a GPy model; it is used to
    build the regular expressions that select the kernel's parameters.
    Prints a message and exits with status 1 when the name is unknown.
    """
    specs = {
        'SE': ('RBF', 'rbf'),
        'Matern32': ('Matern32', 'Mat32'),
        'Matern52': ('Matern52', 'Mat52'),
    }
    if kernel_name not in specs:
        print('Kernel {} is not implemented'.format(kernel_name))
        sys.exit(1)
    class_name, param_prefix = specs[kernel_name]
    return getattr(GPy.kern, class_name), param_prefix


def _build_optimizer(BO_method, gp_regressor, bounds, rng, input_dim, pool_size, sampling_num):
    """Instantiate the Bayesian optimizer selected by *BO_method*.

    *pool_size* is the number of pool candidates when the pool-size based
    confidence schedules should be used, and ``None`` when the
    dimension-based heuristic schedules should be used instead.
    Prints a message and exits with status 1 for an unknown method.
    """
    # Methods whose constructor also takes the Monte Carlo sample count.
    with_sampling = {
        'MES': 'MaxValueEntropySearch',
        'JES': 'JointEntropySearch',
        'EEEI': 'EEEI',
    }
    # Methods that only need the model, the bounds and the generator.
    plain = {
        'PIMS': 'PI_from_MaxSample',
        'EIMS': 'EI_from_MaxSample',
        'TS': 'ThompsonSampling',
        'PI': 'ProbabilityImprovement',
        'US': 'UncertaintySampling',
        'EI': 'ExpectedImprovement',
    }
    if BO_method in with_sampling:
        optimizer_class = getattr(BO, with_sampling[BO_method])
        return optimizer_class(gp_regressor, bounds=bounds, rng=rng, sampling_num=sampling_num)
    if BO_method in plain:
        optimizer_class = getattr(BO, plain[BO_method])
        return optimizer_class(gp_regressor, bounds=bounds, rng=rng)

    # Heuristic schedule, following:
    #   K. Kandasamy, J. Schneider, and B. Poczos. High Dimensional Bayesian
    #   Optimisation and Bandits via Additive Models. ICML, 2015.
    #   K. Kandasamy, G. Dasarathy, J. B. Oliva, J. Schneider, and B. Poczos.
    #   Multi-fidelity Gaussian Process Bandit Optimisation.
    def heuristic_root_width(i):
        return np.sqrt(0.2 * input_dim * np.log(2 * (i+1)))

    if BO_method == 'EI_wang':
        if pool_size is not None:
            # NOTE(review): unlike the heuristic schedule this value is not
            # square-rooted -- confirm against how BO.EI_from_MaxMean uses it.
            def root_nu_func(i):
                return 2 * np.log(pool_size * ((i+1)**2) / np.sqrt(2 * np.pi) + 1)
        else:
            root_nu_func = heuristic_root_width
        return BO.EI_from_MaxMean(gp_regressor, bounds=bounds, root_nu_func=root_nu_func, rng=rng)

    if BO_method == 'GPUCB':
        if pool_size is not None:
            # See Appendix B of https://proceedings.mlr.press/v202/takeno23a.html
            # NOTE(review): unlike the heuristic schedule this value is not
            # square-rooted -- confirm against how BO.GP_UCB uses it.
            def root_beta_func(i):
                return 2 * np.log(pool_size * ((i+1)**2) / np.sqrt(2 * np.pi))
        else:
            root_beta_func = heuristic_root_width
        return BO.GP_UCB(gp_regressor, bounds=bounds, root_beta_func=root_beta_func, rng=rng)

    if BO_method == 'IRGPUCB':
        if pool_size is not None:
            # See Theorem 4.2 of https://proceedings.mlr.press/v202/takeno23a.html
            def s(i):
                return 2 * np.log(pool_size / 2.)
        else:
            def s(i):
                return np.max([0.2 * input_dim * np.log(2 * (i+1)) - 2., 0])
        return BO.IRGP_UCB(gp_regressor, bounds=bounds, s=s, rng=rng)

    print('Corresponding BO method is not implemented')
    sys.exit(1)


def _dump_optimizer(optimizer, path):
    """Pickle *optimizer* to *path*; report and skip if it cannot be pickled.

    The object is serialised in memory first so that a failure does not
    leave a truncated file behind.
    """
    try:
        payload = pickle.dumps(optimizer)
    except Exception as err:  # best-effort snapshot: never abort a finished run
        print('Skipping optimizer dump ({}): {}'.format(type(err).__name__, err))
        return
    with open(path, 'wb') as f:
        f.write(payload)


def main(params):
    """Run one Bayesian-optimization experiment and save its regret curves.

    Parameters
    ----------
    params : tuple
        ``(BO_method, func_name, seed, ell, kernel_name, noise_std)``.
        A ``func_name`` containing ``"GP"`` selects a GP sample path as the
        objective (kernel hyperparameters fixed, never re-optimised);
        any other name is looked up as a class in ``test_functions`` and the
        kernel lengthscales are re-optimised every five iterations.

    Side effects
    ------------
    Creates ``<label>_results/<BO_method>/seed=<seed>/`` under the current
    directory and rewrites the regret pickles there on every iteration.
    On the last iteration a snapshot of the optimizer is written to the
    ``optimizer_log/`` sub-directory when it can be pickled.

    Exits with status 1 (after printing a message) when the kernel, the
    method or the objective configuration is unusable.
    """
    start = time.time()
    (BO_method, func_name, seed, ell, kernel_name, noise_std) = params

    # func_name is replaced by a results label further down, so remember
    # now which kind of objective was requested.
    is_gp_objective = "GP" in func_name
    kernel_class, kernel_prefix = _kernel_spec(kernel_name)

    rng_objective_function = np.random.default_rng(seed+123)
    # Set objective function
    if is_gp_objective:
        theo_flag = True
        hyperparam_optimize = False

        kernel = kernel_class(input_dim=4, lengthscale=ell, variance=1.)
        test_func = test_functions.GP_SamplePath(rng=rng_objective_function, kernel=kernel, noise_std=noise_std, rff_feature_dim=2000)

        # the sample path brings its own pool of candidate points
        input_dim = test_func.d
        if input_dim != kernel.input_dim:
            print('Input dimension of kernel and test function does not match')
            sys.exit(1)
        X_candidates = test_func.X
        X_all = X_candidates.copy()
        maximum = np.max(test_func.values(X_candidates))
    else:
        theo_flag = False
        hyperparam_optimize = True
        # attribute lookup instead of eval(): func_name comes from the command line
        test_func = getattr(test_functions, func_name)(rng=rng_objective_function, noise_std=noise_std)

        if test_func.X is None:
            X_all = None
            X_candidates = None
        else:
            X_candidates = test_func.X.copy()
            X_all = X_candidates.copy()

        input_dim = test_func.d
        maximum = test_func.maximum
        if maximum is None:
            # Every regret below is computed as ``maximum - value``; stop
            # here rather than fail with a TypeError after fitting the GP.
            print('Test function {} does not define a maximum, so regret cannot be computed'.format(func_name))
            sys.exit(1)

    bounds = test_func.bounds
    interval_size = bounds[1] - bounds[0]
    kernel_bounds = np.array([interval_size*1e-2, 0.5 * interval_size])

    # Set parameters for experiment
    ITR_MAX = 200
    if is_gp_objective and noise_std >= 1.:
        ITR_MAX = 400
    FIRST_NUM = 2**input_dim
    eval_num = FIRST_NUM

    # set random seed
    rng = np.random.default_rng(seed)

    # Make initial training data
    if X_all is None or X_candidates is None:
        # no pool of candidates: use a Sobol sequence scaled to the domain
        sampler = qmc.Sobol(d=input_dim, scramble=True, seed=rng)
        training_input = sampler.random_base2(m=input_dim) * interval_size + bounds[0]
    elif is_gp_objective:
        # pool of candidates: take the candidates nearest to a Sobol sequence.
        # The unit-cube Sobol points are mapped into the search domain first
        # (a no-op when the domain is the unit cube).
        sampler = qmc.Sobol(d=input_dim, scramble=True, seed=rng)
        sobol_points = sampler.random_base2(m=input_dim) * interval_size + bounds[0]
        nearest = np.argmin(np.linalg.norm(X_all[:, np.newaxis] - sobol_points, axis=2), axis=0)
        training_input = X_all[nearest]
    else:
        # real function with a pool: uniformly random candidates
        training_input = X_all[rng.choice(X_all.shape[0], FIRST_NUM, replace=False)]
    training_output = test_func.noisy_values(training_input)

    # Set initial Gaussian process regressor
    np.random.seed(seed) # Set seed for GPy models
    variance_pattern = '.*{}.variance'.format(kernel_prefix)
    lengthscale_pattern = '.*{}.lengthscale'.format(kernel_prefix)
    if is_gp_objective:
        # all hyperparameters are known, so every one of them is fixed
        kernel = kernel_class(input_dim=input_dim, lengthscale=ell, variance=1.)
        gp_regressor = GPy.models.gp_regression.GPRegression(X=training_input, Y=training_output, kernel=kernel, noise_var=noise_std**2, normalizer=False)
        gp_regressor[variance_pattern].constrain_fixed(1)
        gp_regressor[lengthscale_pattern].constrain_fixed(test_func.rff_features.lengthscale)
        gp_regressor['.*Gaussian_noise.variance'].constrain_fixed(noise_std**2)

        func_name = 'GP_{}-lengthscale={}-noise_std={}'.format(kernel_name, ell, noise_std)
    else:
        # one lengthscale per input dimension, each bounded relative to the domain size
        kernel = kernel_class(input_dim=input_dim, lengthscale=interval_size / 2., variance=1., ARD=True)
        gp_regressor = GPy.models.gp_regression.GPRegression(X=training_input, Y=training_output, kernel=kernel, noise_var=0.01**2, normalizer=True)
        gp_regressor[variance_pattern].constrain_fixed(1)
        for i in range(input_dim):
            gp_regressor[lengthscale_pattern][[i]].constrain_bounded(kernel_bounds[0, i], kernel_bounds[1, i])
        gp_regressor['.*Gaussian_noise.variance'].constrain_fixed(10**-6)

        gp_regressor.optimize_restarts(num_restarts=10)
        func_name = func_name+'_{}-noise_std={}'.format(kernel_name, noise_std)

    results_path = func_name+'_results/'+BO_method+'/seed='+str(seed)+'/'
    # creates results_path as well, since it is a parent of optimizer_log/
    os.makedirs(results_path+'optimizer_log/', exist_ok=True)

    InferenceRegret_list = list()
    CumulativeRegret_list = list()
    InstantaneousRegret_list = list()
    BestRegret_list = list()
    EvaluatedPrediction_list = list()

    NUM_SAMPLING = 10 # number of Monte Carlo sampling for several acquisition functions
    pool_size = np.shape(X_all)[0] if (theo_flag and X_all is not None) else None
    optimizer = _build_optimizer(BO_method, gp_regressor, bounds, rng, input_dim, pool_size, NUM_SAMPLING)

    print('Elapsed time before loop: {}'.format(time.time() - start))

    # Start BO loop
    for i in range(ITR_MAX):
        print('-------------------------------------')
        print(str(i)+'th iteration')
        print('-------------------------------------')
        if i % 5 == 0:
            print('Standardizing mean: {}, std: {}'.format(optimizer.mean, optimizer.std))
            print(gp_regressor[''])

        # Choose LCB maximizer as the recommended point
        inference_point = optimizer.LCB_maximizer(width_param=2., pool_X=X_all)
        print('posterior at inference point : ', optimizer.GPmodel.predict_noiseless(np.atleast_2d(inference_point)))
        print("posterior at origin point : ", optimizer.GPmodel.predict_noiseless(np.zeros((1, input_dim))))

        InferenceRegret_list.append(maximum - test_func.values(np.atleast_2d(inference_point)).ravel()[0])
        InstantaneousRegret_list.append(maximum - test_func.values(np.atleast_2d(training_input[-1])).ravel()[0])
        CumulativeRegret_list.append(np.sum(InstantaneousRegret_list))
        BestRegret_list.append(np.min(InstantaneousRegret_list))

        start_dump = time.time()

        # rewrite every result file each iteration so partial runs stay usable
        snapshots = (
            ('InferenceRegret.pickle', InferenceRegret_list),
            ('InstantaneousRegret.pickle', InstantaneousRegret_list),
            ('CumulativeRegret.pickle', CumulativeRegret_list),
            ('BestRegret.pickle', BestRegret_list),
            ('EvalNum.pickle', eval_num),
            ('EvaluatedPrediction.pickle', EvaluatedPrediction_list),
        )
        for file_name, payload in snapshots:
            with open(results_path + file_name, 'wb') as f:
                pickle.dump(np.array(payload), f)

        print('Elapsed time for dump: {}'.format(time.time() - start_dump))

        if i % 5 == 0:
            print('Eval_num {}, InferenceRegret {:.3f}, InstantaneousRegret {:.3f}, BestRegre {:.3f}'.format(eval_num, InferenceRegret_list[-1], InstantaneousRegret_list[-1], BestRegret_list[-1]))

        if i == ITR_MAX - 1:
            # Everything for the final iteration has been recorded; another
            # acquisition would never be saved, so snapshot the optimizer
            # and stop here.
            _dump_optimizer(optimizer, results_path + 'optimizer_log/' + 'optimizer'+str(int(i))+'.pickle')
            break

        start_next_input = time.time()
        if BO_method in ["MES", "JES", "PIMS", "EIMS", "EI_wang", "EEEI"]:
            optimizer.pre_computation_acq(X_candidates=X_all)

        # add new input
        if X_all is None:
            new_inputs = optimizer.next_input()
        else:
            new_inputs, _ = optimizer.next_input_pool(X_all)

        print('Elapsed time for next input: {}'.format(time.time() - start_next_input))

        # new observation is added
        new_output = test_func.noisy_values(new_inputs)
        eval_num += 1

        training_input = np.r_[training_input, np.atleast_2d(new_inputs)]
        training_output = np.r_[training_output, np.atleast_2d(new_output)]
        print("new_inputs : ", new_inputs)

        # prediction at the new point, made before the model sees its observation
        mean_t, var_t = optimizer.GPmodel.predict_noiseless(new_inputs)
        print("its prediction:", mean_t, var_t)
        print("new_output : ", new_output)
        EvaluatedPrediction_list.append((mean_t.ravel()[0], np.sqrt(var_t.ravel()[0])))

        # hyperparameters are re-optimised on every fifth update, if enabled
        reoptimize = bool((i+1) % 5 == 0 and hyperparam_optimize)
        optimizer.update(np.atleast_2d(new_inputs), np.atleast_2d(new_output), optimize=reoptimize)

    print('Total time: {}'.format(time.time() - start))



if __name__ == '__main__':
    # Command-line entry point: run one seed in-process, or a batch of seeds
    # in parallel worker processes.
    parse = argparse.ArgumentParser(description='Bayesian Optimization Experiment')
    parse.add_argument('BO_method', type=str, help='Bayesian optimization method to use')
    parse.add_argument('test_func', type=str, help='Test function to optimize')
    parse.add_argument('initial_seed', type=int, help='Initial seed for random number generation')
    parse.add_argument('--ell', type=float, default=0.1, help='Length scale for the kernel')
    parse.add_argument('--kernel_name', type=str, default="SE", help='Kernel name for the Gaussian Process')
    parse.add_argument('--noise_std', type=float, default=0.01, help='Noise standard deviation for the Gaussian Process and actual observations')

    args = parse.parse_args()
    BO_method = args.BO_method
    test_func = args.test_func
    initial_seed = args.initial_seed
    ell = args.ell
    kernel_name = args.kernel_name
    noise_std = args.noise_std

    NUM_WORKER = 16

    # A non-negative seed runs only that experiment, in this process.
    # A negative seed (e.g. -1) runs seeds 0 .. NUM_WORKER-1 in parallel.
    if initial_seed >= 0:
        main((BO_method, test_func, initial_seed, ell, kernel_name, noise_std))
        sys.exit()

    params = [
        (BO_method, test_func, seed, ell, kernel_name, noise_std)
        for seed in range(NUM_WORKER)
    ]

    failed_seeds = []
    with concurrent.futures.ProcessPoolExecutor(max_workers=NUM_WORKER) as executor:
        seed_of = {executor.submit(main, p): p[2] for p in params}
        for future in concurrent.futures.as_completed(seed_of):
            # Retrieve every result so that a failure in a worker is reported
            # instead of being discarded. SystemExit is included because
            # main() terminates through sys.exit() on configuration errors.
            try:
                future.result()
            except (Exception, SystemExit) as err:
                seed = seed_of[future]
                failed_seeds.append(seed)
                print('seed {} failed: {!r}'.format(seed, err), file=sys.stderr)
                if err.__cause__ is not None:
                    # carries the traceback text from the worker process
                    print(err.__cause__, file=sys.stderr)

    if failed_seeds:
        print('Failed seeds: {}'.format(sorted(failed_seeds)), file=sys.stderr)
        sys.exit(1)
