# coding=utf-8
# Copyright 2020 The Google Research Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""All functions related to loss computation and optimization.
"""

import torch
import torch.optim as optim
import numpy as np
import torch.nn as nn
import torch.nn.functional as nf
import utils
from models import utils as mutils
from sde_lib import VESDE, VPSDE
import tensorflow as tf
from lib.data import load_data, batch_generator


def get_optimizer(config, params):
  """Builds the torch optimizer selected by `config.optim.optimizer`.

  Args:
    config: Configuration object. Reads `optimizer`, `lr`, `beta1`, `eps`
      and `weight_decay` from `config.optim`.
    params: Parameters (or parameter groups) handed to the optimizer
      constructor.

  Returns:
    A `torch.optim.Adam` instance; the second beta is fixed at 0.999.

  Raises:
    NotImplementedError: If `config.optim.optimizer` is anything but 'Adam'.
  """
  opt_cfg = config.optim

  # Guard clause: Adam is the only optimizer wired up so far.
  if opt_cfg.optimizer != 'Adam':
    raise NotImplementedError(
      f'Optimizer {config.optim.optimizer} not supported yet!')

  return optim.Adam(
    params,
    lr=opt_cfg.lr,
    betas=(opt_cfg.beta1, 0.999),
    eps=opt_cfg.eps,
    weight_decay=opt_cfg.weight_decay,
  )


def optimization_manager(config):
  """Creates the parameter-update routine configured by `config.optim`.

  Args:
    config: Configuration object. `config.optim.lr`, `config.optim.warmup`
      and `config.optim.grad_clip` become the defaults of the returned
      function's keyword arguments.

  Returns:
    `optimize_fn(optimizer, params, step, lr, warmup, grad_clip)`, which
    applies one optimizer step and returns nothing.
  """

  def optimize_fn(optimizer, params, step, lr=config.optim.lr,
                  warmup=config.optim.warmup,
                  grad_clip=config.optim.grad_clip):
    """Runs one update with linear lr warmup and gradient-norm clipping.

    Warmup is skipped when `warmup` is not positive; clipping is skipped
    when `grad_clip` is negative.
    """
    if warmup > 0:
      # Linear ramp from 0 to `lr` over the first `warmup` steps, then flat.
      warm_lr = lr * np.minimum(step / warmup, 1.0)
      for group in optimizer.param_groups:
        group['lr'] = warm_lr

    if grad_clip >= 0:
      torch.nn.utils.clip_grad_norm_(params, max_norm=grad_clip)

    optimizer.step()

  return optimize_fn

def get_sde_loss_fn(sde, train, eps=1e-5):
  """Builds the conditional denoising score-matching loss over a sequence.

  Args:
    sde: SDE object. The loss reads `sde.T` and calls
      `sde.marginal_prob(x, t)`, which must return a `(mean, std)` pair.
    train: Forwarded to `mutils.get_conditional_score_fn` as `train=`.
    eps: Smallest diffusion time sampled; times are drawn uniformly from
      `[eps, sde.T)`.

  Returns:
    `loss_fn(model, batch)` that returns a scalar loss tensor.
  """

  def step_residual(score_fn, condition, target):
    """Returns the squared residual for one position of the sequence.

    `target` is perturbed to a random diffusion time, the score is predicted
    given `condition`, and the std-scaled score is compared against the
    injected noise.
    """
    # Draw order (time first, then noise) matches the original code so
    # that seeded runs stay reproducible.
    t = torch.rand(target.shape[0], device=target.device) * (sde.T - eps) + eps
    z = torch.randn_like(target)
    mean, std = sde.marginal_prob(target, t)
    perturbed_data = mean + std[:, None] * z
    score = score_fn(condition, perturbed_data, t)
    return torch.square(score * std[:, None] + z)

  def loss_fn(model, batch):
    """Computes the loss for one batch of sequences.

    Args:
      model: Network passed to `mutils.get_conditional_score_fn`.
      batch: Tensor indexed as `batch[:, i, :]`; dimension 1 is iterated as
        the sequence axis and must contain at least one element.

    Returns:
      Scalar tensor: per-position residuals are summed over the sequence
      axis and then averaged over the remaining dimensions.
    """
    score_fn = mutils.get_conditional_score_fn(sde, model, train=train)
    length = batch.shape[1]

    # The first position has no predecessor, so it is conditioned on zeros.
    first = batch[:, 0, :]
    losses = step_residual(score_fn, torch.zeros_like(first), first)

    # Every later position is conditioned on the clean previous position.
    for i in range(1, length):
      losses = losses + step_residual(score_fn, batch[:, i - 1, :], batch[:, i, :])

    return losses.mean()

  return loss_fn

def get_conditional_step_fn(sde, train, optimize_fn=None):
  """Builds a single training or evaluation step for the conditional model.

  Args:
    sde: SDE object forwarded to `get_sde_loss_fn`.
    train: If true, the returned step updates the model; otherwise it only
      evaluates the loss using the EMA weights.
    optimize_fn: Callable invoked as
      `optimize_fn(optimizer, params, step=...)`. It is called only when
      `train` is true, so it must be provided in that case.

  Returns:
    `step_fn(state, batch)` that returns the loss tensor for `batch`.
  """
  loss_fn = get_sde_loss_fn(sde, train)

  def step_fn(state, batch):
    """Runs one step.

    Args:
      state: Dict holding 'conditional_model' and 'ema'; the training path
        also reads 'optimizer' and reads/increments 'step'.
      batch: Data passed unchanged to the loss function.

    Returns:
      The loss tensor computed on `batch`.
    """
    model = state['conditional_model']

    if train:
      optimizer = state['optimizer']
      optimizer.zero_grad()
      loss = loss_fn(model, batch)
      loss.backward()
      optimize_fn(optimizer, model.parameters(), step=state['step'])
      state['step'] += 1
      state['ema'].update(model.parameters())
      return loss

    ema = state['ema']
    with torch.no_grad():
      # Swap the EMA weights in for evaluation. The finally clause makes
      # sure the stored weights are put back even if the evaluation raises,
      # so a failed eval step cannot leave the model on its EMA weights.
      ema.store(model.parameters())
      try:
        ema.copy_to(model.parameters())
        loss = loss_fn(model, batch)
      finally:
        ema.restore(model.parameters())
    return loss

  return step_fn

def get_ER_loss_fn():
  """Creates the encoder/decoder reconstruction loss.

  Returns:
    `loss_fn(encoder, decoder, X)` that returns a 3-tuple: the mean squared
    error between `X` and `decoder(encoder(X))`, then the largest and the
    smallest value of the encoder output.
  """

  def loss_fn(encoder, decoder, X):
    """Encodes `X`, decodes it back, and scores the reconstruction."""
    latent = encoder(X)
    reconstruction = decoder(latent)
    mse = torch.square(X - reconstruction).mean()
    # The latent extremes are returned alongside the loss for the caller.
    return mse, latent.max(), latent.min()

  return loss_fn

def get_ER_step_fn():
  """Creates the joint update step for the encoder/decoder pair.

  Returns:
    `step_fn(state, X)` that performs one optimizer step on both networks
    and returns the `(loss, max_val, min_val)` tuple produced by the loss
    function from `get_ER_loss_fn`.
  """
  compute_loss = get_ER_loss_fn()

  def step_fn(state, X):
    """Runs one update on `X`.

    Args:
      state: Dict holding the networks under 'encoder' and 'decoder' and
        their optimizers under 'opt_e' and 'opt_r'.
      X: Input passed to the loss function.

    Returns:
      The `(loss, max_val, min_val)` tuple from the loss function.
    """
    encoder, decoder = state['encoder'], state['decoder']
    opt_encoder, opt_decoder = state['opt_e'], state['opt_r']

    # Put both networks in training mode and clear stale gradients.
    for net in (encoder, decoder):
      net.train()
    for opt in (opt_encoder, opt_decoder):
      opt.zero_grad()

    loss, max_val, min_val = compute_loss(encoder, decoder, X)
    # NOTE(review): retain_graph=True keeps the autograd graph alive after
    # backward; nothing in this function backpropagates a second time --
    # confirm whether any caller depends on it.
    loss.backward(retain_graph=True)

    for opt in (opt_encoder, opt_decoder):
      opt.step()
    return loss, max_val, min_val

  return step_fn