import click
import importlib
import pandas as pd
import numpy as np
import seaborn as sns
from jax import random, vmap
from jax.example_libraries import optimizers
import jax
import matplotlib.pyplot as plt
from tqdm import tqdm
import os
import scipy
import jaxopt
import jax.numpy as jnp

@click.command()
@click.option('--model', default='OutlierRegression')
@click.option('--objective', default='VIBasic')
@click.option('--regularizer', default='VIBasic')
@click.option('--posterior', default='Basic')
@click.option('--seed', default=0)
@click.option('--lamb', default=0.0)
@click.option('--outlier_scale', default=10)
@click.option('--outlier_N', default=10)
@click.option('--alpha2', default=0.1)
@click.option('--s', default=1)
@click.option('--iterations', default=1000)
@click.option('--n', default=100)
@click.option('--g', default=5)
@click.option('--prediction_sample', default=10000)
@click.option('--learning_rate', default=0.001)
@click.option('--vi_training', is_flag=True)
@click.option('--test_bias', is_flag=True)
@click.option('--optimizer', default='sgd')
@click.option('--plot', is_flag=True)
def main(model, objective, regularizer, posterior, seed, lamb, outlier_scale, outlier_n, alpha2, s, iterations, n, g, prediction_sample,
         learning_rate, vi_training, test_bias, optimizer, plot):
    """Train a variational posterior and record its predictive log-likelihood.

    The model class named by --model is looked up in the module ``model``,
    the posterior class named by --posterior in the module ``posterior``, and
    the classes named by --objective and --regularizer in the module
    ``objective``.

    Training runs for --iterations steps with the optimizer chosen by
    --optimizer (sgd, nesterov or rmsprop; anything else raises ValueError).
    The step size is --learning_rate for the first half of the steps and a
    tenth of it afterwards. When --lamb is non-zero, the regularizer value and
    gradient are added to the objective with weight --lamb.

    With --vi_training, iterations // 2 steps of the VIBasic objective are run
    first. With --test_bias, every 1000th step after step 0 prints the norm of
    the difference between the mean PVIRejection gradient and the mean
    objective gradient over 1000 draws.

    The final parameters and log-likelihood are written to a file under
    ``result/`` and the loss curve is saved next to it as a PDF. The --plot
    flag is accepted but currently not used.
    """
    model_module = importlib.import_module('model')
    m = getattr(model_module, model)(n, g, outlier_scale=outlier_scale, outlier_N=outlier_n, alpha=alpha2)
    rng_key = random.PRNGKey(seed)
    # The data key is fixed rather than derived from --seed.
    y = m.data(random.PRNGKey(1))
    test_y = None
    # isinstance (not an exact type comparison) so tuple subclasses such as
    # namedtuples are unpacked as well.
    if isinstance(y, tuple):
        y, test_y = y

    posterior_module = importlib.import_module('posterior')
    p = getattr(posterior_module, posterior)(m.d)
    params = p.gen_params()

    objective_module = importlib.import_module('objective')
    o = getattr(objective_module, objective)(m, p, y, s=s)
    r = getattr(objective_module, regularizer)(m, p, y, s=s)

    def log_predictive(log_likelihoods):
        """Sum of logsumexp over axis 0 minus log(prediction_sample)."""
        per_point = np.array(scipy.special.logsumexp(log_likelihoods, axis=0))
        return np.sum(per_point - np.log(prediction_sample))

    def scheduler(step_index):
        """Step size: full rate for the first half of training, then a tenth."""
        if step_index < iterations // 2:
            return learning_rate
        return learning_rate / 10

    if optimizer == 'sgd':
        opt_init, opt_update, get_params = optimizers.sgd(scheduler)
    elif optimizer == 'nesterov':
        opt_init, opt_update, get_params = optimizers.nesterov(scheduler, 0.9)
    elif optimizer == 'rmsprop':
        opt_init, opt_update, get_params = optimizers.rmsprop_momentum(scheduler)
    else:
        raise ValueError(optimizer)
    # The training state is the pair (optimizer state, PRNG key).
    opt_state = opt_init(params), rng_key

    if vi_training:
        pretrain_objective = getattr(objective_module, 'VIBasic')(m, p, y, s=s)
        print('Pretraining with VI...')

        def pretrain_step(step_index, state):
            """One VIBasic update; returns (value, new state)."""
            opt_params, key = state
            data_key, key = random.split(key)
            value, grads = pretrain_objective.value_and_grad(data_key, get_params(opt_params))
            grads = grads / n
            # Gradients are negated before the update (ascent on the objective value).
            return value, (opt_update(step_index, -grads, opt_params), key)

        for i in tqdm(range(iterations // 2)):
            value, opt_state = pretrain_step(i, opt_state)
            if i % 1000 == 0:
                print(i, value, get_params(opt_state[0]))

    if lamb != 0:
        def regularized_objective(key, current_params):
            """Objective plus lamb-weighted regularizer (value and gradient)."""
            key1, key2 = random.split(key)
            v1, g1 = o.value_and_grad(key1, current_params)
            v2, g2 = r.value_and_grad(key2, current_params)
            return v1 + lamb * v2, g1 + lamb * g2
    else:
        def regularized_objective(key, current_params):
            """Objective alone (value and gradient)."""
            # The key is still split, so the objective sees the same key as in
            # the regularized branch.
            key1, _ = random.split(key)
            return o.value_and_grad(key1, current_params)

    def train_step(step_index, state):
        """One clipped update of the (regularized) objective; returns (value, new state)."""
        opt_params, key = state
        data_key, key = random.split(key)
        value, grads = regularized_objective(data_key, get_params(opt_params))
        # NOTE(review): the norm is measured before the 1/n scaling, so a
        # clipped gradient ends up with norm 100 / n - confirm this is intended.
        norm = jnp.linalg.norm(grads)
        grads = grads / n
        if norm > 100:
            grads = grads / norm * 100
        if jnp.any(jnp.isnan(grads)):
            # Skip the update when the gradient contains NaN.
            return value, (opt_params, key)
        return value, (opt_update(step_index, -grads, opt_params), key)

    rej = None
    if test_bias:
        rej = getattr(objective_module, 'PVIRejection')(m, p, y, s=s)

    data = []
    best_pred = -np.inf
    best_param = params
    for i in tqdm(range(iterations)):
        value, opt_state = train_step(i, opt_state)
        param, rng_key = opt_state
        if i % 1000 == 0:
            current = get_params(param)
            print(i, value, current)

            theta_sample = p.sample(rng_key, current, prediction_sample)
            predictive_ll = log_predictive(vmap(m.valid_log_likelihoods)(theta_sample))
            if predictive_ll >= best_pred:
                best_pred = predictive_ll
                best_param = current
            print(predictive_ll)
            if test_bias and i > 0:
                grad1 = []
                grad2 = []
                # rng_key is advanced locally only; the key stored in
                # opt_state is not affected by these draws.
                for _ in range(1000):
                    bias_key, rng_key = random.split(rng_key)
                    key1, key2 = random.split(bias_key)
                    _, g1 = rej.value_and_grad(key1, current)
                    _, g2 = o.value_and_grad(key2, current)
                    grad1.append(g1)
                    grad2.append(g2)
                print(np.linalg.norm(np.mean(grad1, axis=0) - np.mean(grad2, axis=0)))
        data.append({'step': i, 'loss': float(value)})

    param, rng_key = opt_state
    sample_key, _ = random.split(rng_key)
    param_list = get_params(param)
    theta_sample = p.sample(sample_key, param_list, prediction_sample)
    valid_ll = log_predictive(vmap(m.valid_log_likelihoods)(theta_sample))
    predictive_ll = valid_ll

    # NOTE(review): best_param is printed next to the log-likelihood of the
    # final parameters, not best_pred - confirm which pairing is wanted.
    print(best_param, predictive_ll)

    if test_y is not None:
        test_ll = log_predictive(
            vmap(m.test_log_likelihoods, in_axes=(0, None))(theta_sample, test_y))
        # When a test set exists, its log-likelihood is what gets written out.
        predictive_ll = test_ll
        print('test set :', test_ll)

    result_dir = f"result/{model}_{outlier_scale}_{outlier_n}_{alpha2}_{g}_{n}_{posterior}_{objective}/"
    os.makedirs(result_dir, exist_ok=True)
    result_file = f'{result_dir}{regularizer}_{lamb}_{seed}_{s}'
    print('writing to', result_file)

    with open(result_file, 'w') as f:
        print(' '.join(map(str, param_list)), predictive_ll, file=f)

        if hasattr(m, 'valid_y'):
            print('valid set :', valid_ll)
            print('valid set :', valid_ll, file=f)

    loss_frame = pd.DataFrame(data)
    sns.lineplot(data=loss_frame, x='step', y='loss')
    plt.savefig(result_file + '.pdf')


# Run the click command only when this file is executed as a script.
if __name__ == "__main__":
    main()