import argparse

import jax
import jax.numpy as np

from jax import grad, jit, vmap, pmap, value_and_grad
from jax import random

from jax.tree_util import tree_multimap, tree_map
from utils import optimizers
from utils import adaptation_utils
from utils.regularizers import weighted_parameter_loss
import haiku as hk

import numpy as onp

import tensorflow_datasets as tfds
import tensorflow as tf

from jax.config import config

import os
import requests

import pickle
import time

from models.util import get_model

from utils.training_utils import train_epoch
from utils.eval import eval_ds_all

from utils.losses import nll, accuracy, entropy, brier, ece
from utils.misc import get_single_copy, manual_pmap_tree

from posteriors.utils import sample_weights_diag
from posteriors.swag import init_swag, update_swag, collect_posterior

import resource
# Read the current soft and hard limits on the number of open file descriptors.
# NOTE(review): neither value is used anywhere else in the visible code.
soft, hard = resource.getrlimit(resource.RLIMIT_NOFILE)
# Command-line interface. Every option is optional; omitted options fall back
# to the defaults given below.
parser = argparse.ArgumentParser(
    description='Runs basic train loop on a supervised learning task')

# Locations for logs and datasets.
parser.add_argument("--dir", type=str, default=None,
                    help="Training directory for logging results")
parser.add_argument("--log_prefix", type=str, default=None,
                    help="Name prefix for logging results")
parser.add_argument("--data_dir", type=str, default='datasets',
                    help="Directory for storing datasets")

# Seed (also selects which saved SWAG checkpoint is loaded) and the `wd`
# value handed to the optimizer.
parser.add_argument("--seed", type=int, default=0)
parser.add_argument("--wd", type=float, default=0.)

# Model name passed to get_model.
parser.add_argument("--model", type=str, default="ResNet50", help="Model class")

# Which corrupted ImageNet variant to adapt to and evaluate on.
parser.add_argument("--corruption_type", type=str, default="brightness")
parser.add_argument("--corruption_level", type=int, default=1)

# Adaptation hyperparameters.
parser.add_argument("--n_epochs", type=int, default=1)
parser.add_argument("--batch_size", type=int, default=64)
parser.add_argument("--lr", type=float, default=0.00025)

# Boolean switches (False unless the flag is present).
parser.add_argument("--adapt_bn_only", action='store_true')
parser.add_argument("--use_swag_posterior", action='store_true')

# Scale and damping used by the SWAG-posterior regularizer.
parser.add_argument("--swag_posterior_weight", type=float, default=1e-3)
parser.add_argument("--swag_posterior_damp", type=float, default=1e-4)

args = parser.parse_args()

### Per-channel ImageNet means and standard deviations; preprocess_inputs
### subtracts / divides by these after scaling pixel values by 1/255
channel_means = (0.485, 0.456, 0.406)
channel_stds = (0.229, 0.224, 0.225)

# Number of classes: passed to get_model and used as the one-hot depth
n_classes = 1000

# Batch size of the adaptation (training) stream, taken from --batch_size
batch_size = args.batch_size

# Device count reported by JAX; used when replicating the PRNG key
n_devices = jax.device_count()
def preprocess_inputs(datapoint):
    """Normalize one dataset example and one-hot encode its label.

    Args:
        datapoint: mapping with an 'image' entry and a 'label' entry.

    Returns:
        An (image, label) pair: the image divided by 255 and then standardized
        with channel_means / channel_stds, and the label one-hot encoded with
        depth n_classes.
    """
    scaled = datapoint['image'] / 255
    standardized = (scaled - channel_means) / channel_stds
    one_hot_label = tf.one_hot(datapoint['label'], n_classes)
    return standardized, one_hot_label

def augment_train_data(image, label):
    """Augmentation hook for the data pipeline; currently a pass-through.

    Returns the (image, label) pair exactly as received.
    """
    unchanged_example = (image, label)
    return unchanged_example

# Look up the model named by --model via get_model, passing the class count.
# NOTE(review): the apply wrappers below call
# `model.apply(params, state, rng, x, <bool>)` on the returned object.
model = get_model(args.model, n_classes)
### training-mode forward pass: forwards the RNG key and passes True as the
### last argument of model.apply (the is_training flag)
@jit
def net_apply(params, state, rng, x):
    """Apply the model to `x` in training mode with the given RNG key.

    Returns whatever `model.apply` returns for these arguments.
    """
    training_mode = True
    return model.apply(params, state, rng, x, training_mode)

@jit
def net_apply_eval(params, state, x):
    """Apply the model to `x` for evaluation.

    No RNG key is supplied (None) and the last argument of `model.apply`
    (the training flag) is False. Returns whatever `model.apply` returns.
    """
    no_rng = None
    training_mode = False
    return model.apply(params, state, no_rng, x, training_mode)

@jit
def net_apply_eval_bn(params, state, x):
    """Apply the model to `x` for evaluation with the training flag switched on.

    No RNG key is supplied (None), but the last argument of `model.apply` is
    True. This variant produces the "Batchnorm Adapted" results.
    NOTE(review): presumably this makes batch norm use the statistics of the
    current batch -- confirm against the model definition.
    """
    no_rng = None
    training_mode = True
    return model.apply(params, state, no_rng, x, training_mode)

# Fixed-seed PRNG key, broadcast to one copy per device.
# NOTE(review): `rng` is rebuilt from args.seed further down before anything in
# the visible code reads this value.
rng = random.PRNGKey(0)
rng = np.broadcast_to(rng, (n_devices,) + rng.shape)
# Load the pickled SWAG state saved for this seed.
# NOTE(review): pickle.load can execute arbitrary code from the file; only
# load checkpoints that come from a trusted source.
swag_filename = 'imagenet_models/swag_models/seed{}/saved_swag_state.pkl'.format(args.seed)
with open(swag_filename, 'rb') as f:
    swag_state = pickle.load(f)

# Despite the variable name, this path points at the saved network state of the
# SWA solution (see the file name), not at another SWAG state.
swag_state_filename = 'imagenet_models/swag_models/seed{}/saved_swa_net_state.pkl'.format(args.seed)
with open(swag_state_filename, 'rb') as f:
    # loads batch norm statistics for the SWA solution
    single_state = pickle.load(f)

# collect_posterior turns the SWAG state into two trees; judging by the names
# these are the posterior means and variances (TODO confirm in posteriors.swag).
swag_means, swag_vars = collect_posterior(swag_state)
# The raw SWAG state is no longer needed once the posterior is extracted.
del swag_state
# The SWAG means serve as the network parameters that adaptation starts from.
single_params = swag_means

init_params, init_state = single_params, single_state
net_state = init_state

# Number of passes over the adaptation stream (from --n_epochs).
num_epochs = args.n_epochs

def step_size_schedule(i):
    """Constant step-size schedule: ignores the step index `i`, yields --lr."""
    constant_step_size = args.lr
    return constant_step_size

if not args.adapt_bn_only:
    # Default: every parameter of the network is handed to the optimizer.
    net_params = init_params
else:
    all_param_names = init_params.keys()
    # Split the parameters by module name: entries whose module name contains
    # 'batchnorm' are adapted, all others stay fixed at their initial values.
    bn_params, other_params = hk.data_structures.partition(
        lambda module_name, name, value: 'batchnorm' in module_name,
        init_params)

    # Keep handles on the full-parameter apply functions before rebinding the
    # public names to the batchnorm-only wrappers below.
    orig_net_apply = net_apply
    orig_net_apply_eval = net_apply_eval
    orig_net_apply_eval_bn = net_apply_eval_bn

    def bn_only_net_apply(bn_p, state, rng, x):
        """Training-mode apply taking only the batchnorm parameters."""
        return orig_net_apply(hk.data_structures.merge(bn_p, other_params), state, rng, x)

    def bn_only_net_apply_eval(bn_p, state, x):
        """Evaluation-mode apply taking only the batchnorm parameters."""
        return orig_net_apply_eval(hk.data_structures.merge(bn_p, other_params), state, x)

    def bn_only_net_apply_eval_bn(bn_p, state, x):
        """Evaluation apply (training flag on) taking only the batchnorm parameters."""
        return orig_net_apply_eval_bn(hk.data_structures.merge(bn_p, other_params), state, x)

    # Each wrapper merges the frozen parameters back in, so the rest of the
    # script can keep calling net_apply* with just the adapted subset.
    net_apply = jit(bn_only_net_apply)
    net_apply_eval = jit(bn_only_net_apply_eval)
    net_apply_eval_bn = jit(bn_only_net_apply_eval_bn)

    net_params = bn_params
    print("Working with adapt bn only", flush=True)

if not args.use_swag_posterior:
    # No extra penalty is added to the training objective.
    regularizer = None
else:
    print("Using swag posterior")

    def regularizer(params):
        """Return the SWAG-posterior penalty for `params`.

        The value of weighted_parameter_loss (computed against swag_means and
        swag_vars with damping --swag_posterior_damp) is scaled by
        --swag_posterior_weight.
        NOTE(review): with --adapt_bn_only the optimized parameters are only
        the batchnorm subset while swag_means / swag_vars come from the full
        model; confirm weighted_parameter_loss handles that mismatch.
        """
        penalty = weighted_parameter_loss(params, swag_means, swag_vars, args.swag_posterior_damp)
        return args.swag_posterior_weight * penalty

    # Evaluate once on the full SWAG-mean parameters; the value is discarded.
    regularizer(single_params)

# Momentum optimizer from utils.optimizers, returned as the usual
# (init, update, get_params) triple. The step size comes from
# step_size_schedule; `wd` is taken from --wd (presumably weight decay).
opt_init, opt_update, get_params = optimizers.momentum(
    step_size=step_size_schedule,
    mass=0.9,
    wd=args.wd,
)
# Optimizer state for the parameters that are being adapted.
opt_state = opt_init(net_params)

# Name of the corrupted-ImageNet configuration, e.g. "brightness_1".
corruption_str = '{}_{}'.format(args.corruption_type, args.corruption_level)

# Adaptation stream: the 'validation' split of the corrupted dataset, shuffled
# anew on every pass, preprocessed and batched with the training batch size.
ds_train = (
    tfds.load('imagenet2012_corrupted/{}'.format(corruption_str),
              split='validation', data_dir=args.data_dir, shuffle_files=True)
    .shuffle(50000, reshuffle_each_iteration=True)
    .map(preprocess_inputs, num_parallel_calls=tf.data.experimental.AUTOTUNE)
    .map(augment_train_data, num_parallel_calls=tf.data.experimental.AUTOTUNE)
    .batch(batch_size, drop_remainder=True)
    .prefetch(tf.data.experimental.AUTOTUNE)
)

# Evaluation stream: the same split, unshuffled, in batches of 128.
# NOTE(review): drop_remainder=True excludes a final partial batch from the
# evaluation metrics; confirm that this is intended.
ds_test = (
    tfds.load('imagenet2012_corrupted/{}'.format(corruption_str),
              split='validation', data_dir=args.data_dir)
    .map(preprocess_inputs, num_parallel_calls=tf.data.experimental.AUTOTUNE)
    .map(augment_train_data, num_parallel_calls=tf.data.experimental.AUTOTUNE)
    .batch(128, drop_remainder=True)
    .prefetch(tf.data.experimental.AUTOTUNE)
)

# Threading settings for the adaptation stream. Mutating the object returned
# by `ds_train.options()` does not change the dataset, so the settings are put
# on a fresh Options object and attached explicitly with with_options().
options = tf.data.Options()
options.experimental_threading.private_threadpool_size = 48
options.experimental_threading.max_intra_op_parallelism = 1
ds_train = ds_train.with_options(options)

# Seed the PRNG from --seed and broadcast the key to one copy per device.
# NOTE(review): `rng` is reassigned to an un-replicated key right before the
# training loop, and nothing in between in the visible code reads this value.
rng = random.PRNGKey(args.seed)
rng = np.broadcast_to(rng, (n_devices,) + rng.shape)

def eval(eval_params, eval_net_state, with_logits=False):
    """Evaluate on ds_test using the evaluation-mode apply function.

    Runs eval_ds_all over the test stream with net_apply_eval and the metric
    functions (nll, entropy, accuracy, brier, ece).

    NOTE: the name shadows the builtin eval(); it is kept because the rest of
    the script calls this function by that name.

    Args:
        eval_params: parameters passed to net_apply_eval.
        eval_net_state: network state passed to net_apply_eval.
        with_logits: forwarded to eval_ds_all; when true, eval_ds_all is
            expected to return a (results, logits) pair.

    Returns:
        A 2-tuple. With with_logits true it is the (test_results, logits) pair
        from eval_ds_all. Otherwise the single result of eval_ds_all is
        returned in both positions, so callers can always unpack two values.
    """
    outcome = eval_ds_all(tfds.as_numpy(ds_test),
                          eval_params,
                          eval_net_state,
                          net_apply_eval,
                          (nll, entropy, accuracy, brier, ece),
                          with_logits)
    if with_logits:
        test_results, logits = outcome
        return test_results, logits
    return outcome, outcome

def eval_bn(eval_params, eval_net_state, with_logits=False):
    """Evaluate on ds_test using net_apply_eval_bn.

    Same procedure as the plain evaluation, but the forward pass goes through
    net_apply_eval_bn (evaluation with the model's training flag switched on).
    Metrics are (nll, entropy, accuracy, brier, ece).

    Args:
        eval_params: parameters passed to net_apply_eval_bn.
        eval_net_state: network state passed to net_apply_eval_bn.
        with_logits: forwarded to eval_ds_all; when true, eval_ds_all is
            expected to return a (results, logits) pair.

    Returns:
        A 2-tuple. With with_logits true it is the (test_results, logits) pair
        from eval_ds_all. Otherwise the single result of eval_ds_all is
        returned in both positions, so callers can always unpack two values.
    """
    outcome = eval_ds_all(tfds.as_numpy(ds_test),
                          eval_params,
                          eval_net_state,
                          net_apply_eval_bn,
                          (nll, entropy, accuracy, brier, ece),
                          with_logits)
    if with_logits:
        test_results, logits = outcome
        return test_results, logits
    return outcome, outcome

# Parameters and network state as they are before any adaptation step.
eval_params = get_params(opt_state)
eval_net_state = net_state

# Extra path component that marks batchnorm-only runs.
bn_only_str = '' if not args.adapt_bn_only else 'adaptbnonly_'

# Results file for this run; the path encodes model, regularizer settings,
# learning rate, batch size, seed and corruption.
filename = 'logs/entropy_minimization_imagenet/{}/posteriorweight{}_posteriordamp{}_{}lr{}_batchsize{}/seed{}_{}.pkl'.format(
    args.model,
    args.swag_posterior_weight,
    args.swag_posterior_damp,
    bn_only_str,
    args.lr,
    args.batch_size,
    args.seed,
    corruption_str,
)
log_directory = os.path.dirname(filename)
os.makedirs(log_directory, exist_ok=True)
print(filename, flush=True)
# Probe for a results file left by a previous run with the same settings. The
# loaded contents are discarded; only the outcome is reported.
try:
    # The context manager closes the handle even if unpickling fails.
    with open(filename, 'rb') as previous_results:
        pickle.load(previous_results)
except Exception:
    # Best-effort probe: a missing or unreadable file is only reported.
    # KeyboardInterrupt / SystemExit are deliberately not swallowed.
    print(filename, 'file not found')
else:
    print(filename, 'file loaded')

# Baseline evaluation of the un-adapted model (evaluation-mode forward pass).
t = time.time()
test_results, initial_logits = eval(eval_params, eval_net_state, with_logits=True)
log_dict = {}
# The adaptation and evaluation streams read the same dataset split, so the
# same results are recorded under both the "Test" and "Train" keys.
log_dict["Initial Test"] = test_results
log_dict["Initial Train"] = test_results
print("Initial Results", test_results)
print('eval time', time.time() - t, flush=True)
initial_logits_filename = 'logs/entropy_minimization_imagenet/{}/posteriorweight{}_posteriordamp{}_{}lr{}_batchsize{}/seed{}_{}_initial_logits.npy'.format(
    args.model,
    args.swag_posterior_weight,
    args.swag_posterior_damp,
    bn_only_str,
    args.lr,
    args.batch_size,
    args.seed,
    corruption_str,
)
# Report the shape only; printing the logits themselves dumps the whole array.
print('logits shape', onp.shape(initial_logits))
np.save(initial_logits_filename, initial_logits)

# Second baseline: the same un-adapted parameters, evaluated through eval_bn.
test_results, bn_logits = eval_bn(eval_params, eval_net_state, with_logits=True)
log_dict.update({
    "Initial Batchnorm Adapted Test": test_results,
    "Initial Batchnorm Adapted Train": test_results,
})
print("Initial Batchnorm Results", test_results, flush=True)
initial_bn_logits_filename = 'logs/entropy_minimization_imagenet/{}/posteriorweight{}_posteriordamp{}_{}lr{}_batchsize{}/seed{}_{}_bn_logits.npy'.format(
    args.model,
    args.swag_posterior_weight,
    args.swag_posterior_damp,
    bn_only_str,
    args.lr,
    args.batch_size,
    args.seed,
    corruption_str,
)
np.save(initial_bn_logits_filename, bn_logits)


# Un-replicated PRNG key for the adaptation loop.
# NOTE(review): the same key is passed to train_epoch on every epoch; whether
# train_epoch derives per-epoch randomness from it is not visible here.
rng = random.PRNGKey(args.seed)
for epoch in range(num_epochs):
    # constructs numpy iterator
    start = time.time()
    np_ds = tfds.as_numpy(ds_train)
    # One pass over the adaptation stream. `entropy` is passed as the loss
    # function and `regularizer` (possibly None) as the optional extra penalty.
    opt_state, net_state, train_loss = train_epoch(epoch, 
                                                   opt_state, 
                                                   net_state, 
                                                   rng,
                                                   np_ds, 
                                                   entropy, 
                                                   get_params, 
                                                   net_apply, 
                                                   opt_update, 
                                                   regularizer=regularizer,
                                                   distributed=False)
    print('Epoch {}: {} {}'.format(epoch, train_loss, time.time() - start), flush=True)
    # `epoch % 1 == 0` is always true, so evaluation runs after every epoch.
    if epoch % 1 == 0:
        # needs to flatten params for non-distributed eval, arbitrarily takes first copy of params
        # NOTE(review): train_epoch is called with distributed=False; confirm
        # that the returned state really holds multiple copies, otherwise
        # taking "the first copy" may not be what is wanted.
        eval_params = get_params(opt_state)
        eval_params, eval_net_state = get_single_copy((eval_params, net_state))
        test_results, final_logits = eval_bn(eval_params, eval_net_state, with_logits=True)
        # Same results are stored under both the "Test" and "Train" keys.
        log_dict['Epoch_{} Test'.format(epoch)] = test_results
        log_dict['Epoch_{} Train'.format(epoch)] = test_results
        print("Evaluation {}".format(epoch), test_results, time.time() - start)
        # The path does not depend on the epoch, so each epoch overwrites the
        # previous file and only the last epoch's logits remain on disk.
        final_logits_filename = 'logs/entropy_minimization_imagenet/{}/posteriorweight{}_posteriordamp{}_{}lr{}_batchsize{}/seed{}_{}_final_logits.npy'.format(args.model, args.swag_posterior_weight, args.swag_posterior_damp, bn_only_str, args.lr, args.batch_size, args.seed, corruption_str)
        np.save(final_logits_filename, final_logits)

print(corruption_str)

# Persist every metric collected during the run. The context manager makes
# sure the file is flushed and closed even if pickling raises.
with open(filename, 'wb') as log_file:
    pickle.dump(log_dict, log_file)

