"""
Use q-iteration to solve for an optimal policy

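Usage: softq_iteration(env, discount=<discount factor>, ent_wt=<entropy bonus weight>)
       q_iteration(env, discount=<discount factor>)  -- hard-max case, ent_wt is fixed to 0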
"""
import numpy as np
from scipy.misc import logsumexp as sp_lse
from rlutil.logging import logger

def softmax(q, alpha=1.0):
    """Return the temperature-scaled softmax of ``q`` taken over all entries.

    The normalisation runs over the whole array (no axis argument), so the
    returned array has the same shape as ``q`` and sums to 1.

    Args:
        q: Array-like of scores.
        alpha: Temperature. Scores are multiplied by ``1/alpha`` before
            exponentiation. ``alpha == 0`` is treated as the zero-temperature
            limit: probability mass is split evenly between the maximal
            entries, matching how ``logsumexp`` in this module maps
            ``alpha == 0`` to a plain max.

    Returns:
        Array of probabilities with the shape of ``q``.
    """
    q = np.asarray(q)
    if alpha == 0:
        # Hard-max limit; dividing by alpha here would raise ZeroDivisionError.
        probs = (q == np.max(q)).astype(float)
    else:
        scaled = (1.0 / alpha) * q
        # Shift by the max so the largest exponent is 0 (avoids overflow).
        probs = np.exp(scaled - np.max(scaled))
    return probs / np.sum(probs)

def logsumexp(q, alpha=1.0, axis=1):
    """Temperature-scaled log-sum-exp of ``q`` along ``axis``.

    Computes ``alpha * log(sum(exp(q / alpha)))`` via the SciPy routine.
    When ``alpha`` is exactly 0 the plain maximum along ``axis`` is
    returned instead, which avoids dividing by zero.

    Args:
        q: Array of values.
        alpha: Temperature; 0 selects the hard max.
        axis: Axis that is reduced.

    Returns:
        ``q`` reduced along ``axis``.
    """
    if alpha != 0:
        inv_temperature = 1.0 / alpha
        return alpha * sp_lse(inv_temperature * q, axis=axis)
    return np.max(q, axis=axis)


def get_policy(q_fn, ent_wt=1.0):
    """Turn a Q-table into per-row action probabilities.

    For ``ent_wt != 0`` each row is proportional to
    ``exp((q - v) / ent_wt)``, where ``v`` is the ``ent_wt``-scaled
    log-sum-exp of that row. For ``ent_wt == 0`` each row is uniform over
    its maximal entries.

    Args:
        q_fn: 2-D array of Q-values; axis 1 is the axis that is normalised.
        ent_wt: Entropy weight (temperature). 0 selects the greedy policy.

    Returns:
        Float array shaped like ``q_fn`` whose rows sum to 1.

    Raises:
        AssertionError: if any row fails to sum to 1 (e.g. NaN inputs).
    """
    q_fn = np.asarray(q_fn)
    v_rew = logsumexp(q_fn, alpha=ent_wt)
    adv_rew = q_fn - np.expand_dims(v_rew, axis=1)
    if ent_wt == 0:
        # v_rew is the row max here, so the advantage is 0 exactly at the
        # maximisers and negative elsewhere.
        pol_probs = np.where(adv_rew >= 0, 1.0, 0.0)
    else:
        pol_probs = np.exp((1.0 / ent_wt) * adv_rew)
    # Out-of-place divide: an in-place `/=` fails with a casting error when
    # the array has an integer dtype.
    pol_probs = pol_probs / np.sum(pol_probs, axis=1, keepdims=True)
    assert np.all(np.isclose(np.sum(pol_probs, axis=1), 1.0)), str(pol_probs)
    return pol_probs


def softq_iteration(env, transition_matrix=None, reward_matrix=None, num_itrs=50, discount=0.99, ent_wt=0.1, warmstart_q=None, policy=None):
    """
    Perform tabular soft Q-iteration.

    Repeats the backup ``Q <- R + discount * T.dot(V)`` for ``num_itrs``
    sweeps, where ``V`` is

    * the ``ent_wt``-scaled log-sum-exp of ``Q`` over actions when
      ``policy`` is None, or
    * ``sum_a policy * (Q - ent_wt * log policy)`` when a fixed ``policy``
      is given (soft policy evaluation).

    Args:
        env: Object exposing ``num_states`` and ``num_actions``, plus
            ``reward_matrix()`` / ``transition_matrix()`` which are only
            called when the corresponding argument is None.
        transition_matrix: Optional transition array; must support
            ``.dot(v_fn)``. Presumably S x A x S -- confirm against the env.
        reward_matrix: Optional reward array with three axes; only the
            slice at index 0 of the last axis is used.
        num_itrs: Number of backup sweeps.
        discount: Discount factor applied to the next-state value.
        ent_wt: Entropy weight; 0 gives the hard-max backup.
        warmstart_q: Optional initial Q-table (it is not modified).
        policy: Optional fixed action probabilities, one row per state.
            Zero-probability actions are allowed.

    Returns:
        The Q-table after the final sweep.
    """
    dim_obs = env.num_states
    dim_act = env.num_actions
    if reward_matrix is None:
        reward_matrix = env.reward_matrix()
    reward_matrix = reward_matrix[:, :, 0]

    q_fn = np.zeros((dim_obs, dim_act)) if warmstart_q is None else warmstart_q
    t_matrix = env.transition_matrix() if transition_matrix is None else transition_matrix

    entropy_bonus = None
    if policy is not None:
        policy = np.asarray(policy)
        # Use the convention 0 * log(0) = 0. Taking log(policy) directly
        # yields -inf * 0 = nan for zero-probability actions, which would
        # turn every Q-value into nan.
        safe_policy = np.where(policy > 0, policy, 1.0)
        # The entropy term does not depend on Q, so compute it once.
        entropy_bonus = -ent_wt * np.sum(policy * np.log(safe_policy), axis=1)

    for _ in range(num_itrs):
        if policy is None:
            v_fn = logsumexp(q_fn, alpha=ent_wt)
        else:
            v_fn = np.sum(q_fn * policy, axis=1) + entropy_bonus
        q_fn = reward_matrix + discount * t_matrix.dot(v_fn)
    return q_fn


def q_iteration(env, **kwargs):
    """Run ``softq_iteration`` with the entropy weight pinned to 0.0.

    All keyword arguments are forwarded unchanged. Because ``ent_wt`` is
    already supplied here, passing it again raises ``TypeError``.
    """
    hard_max_q = softq_iteration(env, ent_wt=0.0, **kwargs)
    return hard_max_q


def compute_visitation(env, q_fn, ent_wt=1.0, env_time_limit=50, discount=1.0):
    """Time-averaged state-action visitation of the policy induced by ``q_fn``.

    The policy comes from ``get_policy(q_fn, ent_wt)``. Starting from
    ``env.initial_state_distribution`` the state distribution is rolled
    forward ``env_time_limit`` steps through ``env.transition_matrix()``;
    the per-step state-action distributions are summed and divided by
    ``env_time_limit``.

    Args:
        env: Object exposing ``num_states``, ``num_actions``,
            ``initial_state_distribution`` (mapping state -> probability)
            and ``transition_matrix()``.
        q_fn: Q-table handed to ``get_policy``.
        ent_wt: Entropy weight handed to ``get_policy``.
        env_time_limit: Number of rollout steps (also the averaging divisor).
        discount: Accepted for signature compatibility; not used here.

    Returns:
        Array of shape (num_states, num_actions).
    """
    policy = get_policy(q_fn, ent_wt=ent_wt)

    n_states = env.num_states
    n_actions = env.num_actions

    state_dist = np.zeros((n_states, 1))
    for start_state, start_prob in env.initial_state_distribution.items():
        state_dist[start_state] = start_prob

    transitions = env.transition_matrix()  # S x A x S
    per_step = np.zeros((n_states, n_actions, env_time_limit))

    for step in range(env_time_limit):
        joint = state_dist * policy
        per_step[:, :, step] = joint
        # Contract over (state, action) to obtain the next-state distribution.
        next_dist = np.einsum('ij,ijk->k', joint, transitions)
        state_dist = next_dist[:, np.newaxis]

    return per_step.sum(axis=2) / float(env_time_limit)


def compute_occupancy(env, q_fn, ent_wt=1.0, env_time_limit=50, discount=1.0):
    """Discounted state-action occupancy of the policy induced by ``q_fn``.

    The policy comes from ``get_policy(q_fn, ent_wt)``. Starting from
    ``env.initial_state_distribution`` the state distribution is rolled
    forward ``env_time_limit`` steps through ``env.transition_matrix()``;
    the state-action distribution at step ``t`` is weighted by
    ``discount ** t`` and the weighted terms are summed. The result is not
    normalised.

    Args:
        env: Object exposing ``num_states``, ``num_actions``,
            ``initial_state_distribution`` (mapping state -> probability)
            and ``transition_matrix()``.
        q_fn: Q-table handed to ``get_policy``.
        ent_wt: Entropy weight handed to ``get_policy``.
        env_time_limit: Number of rollout steps.
        discount: Per-step discount factor.

    Returns:
        Array of shape (num_states, num_actions).
    """
    policy = get_policy(q_fn, ent_wt=ent_wt)

    n_states = env.num_states
    n_actions = env.num_actions

    state_dist = np.zeros((n_states, 1))
    for start_state, start_prob in env.initial_state_distribution.items():
        state_dist[start_state] = start_prob

    transitions = env.transition_matrix()  # S x A x S
    per_step = np.zeros((n_states, n_actions, env_time_limit))

    for step in range(env_time_limit):
        joint = state_dist * policy
        per_step[:, :, step] = (discount ** step) * joint
        # Contract over (state, action) to obtain the next-state distribution.
        next_dist = np.einsum('ij,ijk->k', joint, transitions)
        state_dist = next_dist[:, np.newaxis]

    return per_step.sum(axis=2)
