import copy
import warnings

import numpy as np
from scipy import stats

from sklearn.utils.validation import check_random_state

# --- Sampling budget of the planner -----------------------------------------
# number of rollouts run for every candidate action sequence
N_PARTICLES = 1
# number of candidate action sequences drawn at each CEM iteration
N_ACTION_SEQUENCES = 3000
# number of steps in each candidate action sequence
PLANNING_HORIZON = 10

# --- CEM distribution update ------------------------------------------------
# maximum number of CEM iterations per planning call
MAX_ITER = 5
# upper limit on the number of samples used to update the distribution
N_ELITES = 12
# lower limit on the number of samples used to update the distribution
MIN_ELITES = 5
# weight kept on the previous mean/variance when blending in the new estimate
CEM_LEARNING_RATE = 0.1

# --- Action space -----------------------------------------------------------
N_ACTION_FEATURES = 2

# The sampling distribution is over the flattened sequence [a_1, ..., a_p],
# with p = PLANNING_HORIZON and each a_i of size N_ACTION_FEATURES.
N_DIST_FEATURES = PLANNING_HORIZON * N_ACTION_FEATURES
ACTION_LOWER_BOUND = np.full(N_DIST_FEATURES, -1)
ACTION_UPPER_BOUND = np.full(N_DIST_FEATURES, 1)


class Agent:
    """Cross entropy method (CEM) planner maximizing return under a safety cost.

    See "A Tutorial on the Cross-Entropy Method" by De Boer et al., 2005.

    We follow the code of the PETS paper, see kchua/handful-of-trials github
    repository, MIT license.

    At every call to ``act`` a Gaussian over flattened action sequences is
    refined for a few iterations: sequences are sampled, rolled out in
    ``env``, and the distribution is refit on the elite sequences. Sequences
    whose accumulated safety cost is exactly zero are considered feasible
    and are preferred over infeasible ones.

    Parameters
    ----------
    env : gym environment
        Environment used to roll out the candidate action sequences. It must
        provide ``add_observations_to_history(observations, restarts)`` and a
        batched ``step(actions)`` returning ``(_, rewards, _, info)`` where
        ``info['cost']`` is the safety cost of each rollout.
    epoch_output_dir : string
        Path of the output directory of the current epoch. Can be used to save
        results.
    epsilon : float
        Value of epsilon for the epsilon-greedy exploration. Set to None if
        epsilon-greedy is not used. It is only stored by this class.
    gamma : float
        Discount factor applied to the rewards.
    beta : float
        Discount factor applied to the safety costs.
    random_action : bool
        Whether to draw actions at random.
    seed : int
        Seed of the RNG.
    """

    def __init__(self, env, epoch_output_dir,
                 epsilon=None, gamma=1, beta=0.4, random_action=False,
                 seed=None):

        self.seed(seed)
        self.epoch_output_dir = epoch_output_dir
        self.env = env
        self.epsilon = epsilon
        self.gamma = gamma
        self.beta = beta

        self.random_action = random_action
        self.minimal_elites = MIN_ELITES
        self.init_mean = np.zeros(N_DIST_FEATURES)

    def seed(self, seed=None):
        """Seed the RNG used for every random draw of the agent.

        Returns the seed wrapped in a list.
        """
        self.np_random = check_random_state(seed)
        return [seed]

    def reset(self):
        """Reset the warm-start mean of the sampling distribution to zero."""
        self.init_mean = np.zeros(N_DIST_FEATURES)

    @staticmethod
    def _select_elites(returns, cost, min_elites, max_elites):
        """Return the indices of the elite action sequences, best one first.

        Parameters
        ----------
        returns : array-like, shape (n_sequences,)
            Return of each action sequence.
        cost : array-like, shape (n_sequences,)
            Safety cost of each action sequence. A sequence is feasible when
            its cost is exactly 0.
        min_elites : int
            Minimal number of elites. When fewer sequences are feasible the
            elite set is completed with the least costly infeasible ones.
        max_elites : int
            Maximal number of elites kept when enough sequences are feasible.

        Returns
        -------
        elite_idx : array of int
            Indices into ``returns``/``cost``, without duplicates. Feasible
            sequences come first, by decreasing return; any infeasible
            sequences follow, by increasing cost.
        """
        returns = np.asarray(returns)
        cost = np.asarray(cost)

        feasible = np.flatnonzero(cost == 0)
        # rank on the 1-D per-sequence returns so that the ordering is across
        # sequences; stable sort keeps ties in population order
        feasible = feasible[np.argsort(-returns[feasible], kind="stable")]

        if feasible.size >= min_elites:
            return feasible[:max_elites]

        # not enough safe candidates: keep all of them and complete the set
        # with the least unsafe remaining sequences (never a feasible one
        # twice, which would bias the refit mean and variance)
        infeasible = np.flatnonzero(cost != 0)
        infeasible = infeasible[np.argsort(cost[infeasible], kind="stable")]
        n_missing = min_elites - feasible.size
        return np.concatenate((feasible, infeasible[:n_missing]))

    def act(self, observations, restart):
        """Return the action to take given the observations.

        Parameters
        ----------
        observations : array, shape (1, n_features)
            Observations
        restart : int
            Whether the observation is the first of an episode.

        Returns
        -------
        action : array, shape (N_ACTION_FEATURES,)
            The action to take.
        """
        if self.random_action:
            return self.np_random.uniform(
                low=-1, high=1,
                size=(N_ACTION_FEATURES,))

        # the actions are assumed to be in [-1, 1]; the initial mean is 0 or
        # the shifted mean kept from the previous call
        mean = self.init_mean
        # for initial variance take same value as the model based RL
        # benchmark paper, see singletask_pets_sampler.py script
        var = 0.25
        rv = stats.truncnorm(
            -2, 2, loc=np.zeros_like(mean), scale=np.ones_like(mean))

        n_rollouts = N_ACTION_SEQUENCES * N_PARTICLES
        iteration = 0
        best_return = -np.inf
        # fallback used only if the loop below never runs
        best_action_sequence = mean

        while iteration < MAX_ITER and np.max(var) > 0.001:
            # every iteration starts its rollouts from the current observation
            observation_vec = np.tile(observations, (n_rollouts, 1))
            restart_vec = np.array([restart] * n_rollouts).reshape(-1, 1)
            self.env.add_observations_to_history(observation_vec, restart_vec)

            # constrain the variance so that two standard deviations around
            # the mean stay inside the action bounds
            dist_lower_bound = mean - ACTION_LOWER_BOUND
            dist_upper_bound = ACTION_UPPER_BOUND - mean
            constrained_var = np.minimum(
                np.minimum(np.square(dist_lower_bound / 2),
                           np.square(dist_upper_bound / 2)),
                var)

            # draw with the agent RNG so that `seed` controls the planner
            action_sequences = rv.rvs(
                size=(N_ACTION_SEQUENCES, N_DIST_FEATURES),
                random_state=self.np_random)
            action_sequences = np.sqrt(constrained_var) * action_sequences
            action_sequences = action_sequences + mean
            # reshape for ease of computation of return
            action_sequences = action_sequences.reshape(
                N_ACTION_SEQUENCES, PLANNING_HORIZON, N_ACTION_FEATURES)

            # each action sequence is rolled out N_PARTICLES times
            action_sequences_reps = np.repeat(
                action_sequences, N_PARTICLES, axis=0)

            all_returns = np.zeros(n_rollouts)
            safety_costs = np.zeros(n_rollouts)

            for horizon in range(PLANNING_HORIZON):
                actions = action_sequences_reps[:, horizon]
                _, rewards, _, info = self.env.step(actions)
                all_returns += (self.gamma ** horizon * rewards)
                safety_costs += (self.beta ** horizon * info['cost'])

            all_returns = all_returns.reshape(
                N_ACTION_SEQUENCES, N_PARTICLES)
            safety_costs = safety_costs.reshape(
                N_ACTION_SEQUENCES, N_PARTICLES)

            returns = np.mean(all_returns, axis=1)
            # we have to be safe so we take the max cost among particles
            cost = np.max(safety_costs, axis=1)

            elite_idx = self._select_elites(
                returns, cost, self.minimal_elites, N_ELITES)
            elite_action_sequences = action_sequences[elite_idx].reshape(
                len(elite_idx), N_DIST_FEATURES)

            new_mean = np.mean(elite_action_sequences, axis=0)
            new_var = np.var(elite_action_sequences, axis=0)

            mean = (CEM_LEARNING_RATE * mean +
                    (1 - CEM_LEARNING_RATE) * new_mean)
            var = (CEM_LEARNING_RATE * var +
                   (1 - CEM_LEARNING_RATE) * new_var)

            # elite_idx indexes the full population, so the return looked up
            # here belongs to the very sequence stored as the best one
            candidate_return = returns[elite_idx[0]]
            if candidate_return > best_return or iteration == 0:
                best_return = candidate_return
                best_action_sequence = elite_action_sequences[0]

            iteration += 1

        action = best_action_sequence[:N_ACTION_FEATURES]
        # warm start of the next call: drop the first action of the mean
        # sequence and pad the end with a zero action
        self.init_mean = np.hstack(
            [mean[N_ACTION_FEATURES:].squeeze(), np.zeros(N_ACTION_FEATURES)])

        return action
