import numpy as np

from rlkit.envs.mujoco.ant_multitask_base import MultitaskAntEnv
from rlkit.envs import register_env


@register_env('ant-dir')
class AntDirEnv(MultitaskAntEnv):
    """Ant task whose goal is a heading angle in the x-y plane.

    Each task is a dict ``{'goal': angle}``. The dense reward favours torso
    motion along the unit vector ``(cos(angle), sin(angle))``.
    """

    def __init__(self, task=None, n_tasks=2, forward_backward=False, max_episode_steps=200, randomize_tasks=True, **kwargs):
        """Record the task-sampling mode and delegate to the base class.

        Args:
            task: Task dict forwarded to the base class. ``None`` (the
                default) is replaced by a fresh empty dict.
            n_tasks: Forwarded to the base class.
            forward_backward: If true, ``sample_tasks`` returns exactly the
                two headings ``0`` and ``pi`` instead of random angles.
            max_episode_steps: Forwarded to the base class.
            randomize_tasks: Accepted so callers can pass it, but it is
                neither used nor forwarded by this class.
            **kwargs: Forwarded to the base class.
        """
        # Build the default per call so instances never share one dict.
        if task is None:
            task = {}
        # Assigned before the base constructor runs; presumably the base
        # class calls sample_tasks(), which reads this flag -- TODO confirm.
        self.forward_backward = forward_backward
        super(AntDirEnv, self).__init__(task, n_tasks, max_episode_steps, **kwargs)

    def step(self, action):
        """Advance the simulation by one environment step.

        Args:
            action: Control vector passed to ``do_simulation``.

        Returns:
            A tuple ``(ob, reward, done, info)``. ``info`` holds the four
            reward components and ``torso_velocity``, which is the raw torso
            displacement over the step (not divided by ``dt``).
        """
        torso_xyz_before = np.array(self.get_body_com("torso"))

        # Unit vector of the goal heading. ``self._goal`` is not assigned in
        # this class; presumably the base class sets it -- TODO confirm.
        direct = (np.cos(self._goal), np.sin(self._goal))

        self.do_simulation(action, self.frame_skip)
        torso_xyz_after = np.array(self.get_body_com("torso"))
        torso_velocity = torso_xyz_after - torso_xyz_before
        # Planar velocity projected onto the goal direction.
        forward_reward = np.dot((torso_velocity[:2] / self.dt), direct)

        ctrl_cost = .5 * np.square(action).sum()
        # Contact forces are clipped to [-1, 1] before being squared.
        contact_cost = 0.5 * 1e-3 * np.sum(
            np.square(np.clip(self.sim.data.cfrc_ext, -1, 1)))
        survive_reward = 1.0
        reward = forward_reward - ctrl_cost - contact_cost + survive_reward

        state = self.state_vector()
        # state[2] must stay within [0.2, 1.0]; presumably this is the torso
        # height -- TODO confirm against the model.
        healthy = np.isfinite(state).all() and 0.2 <= state[2] <= 1.0
        done = not healthy
        ob = self._get_obs()

        # Time limit. ``_max_episode_steps`` is not assigned in this class;
        # presumably the base class stores it -- TODO confirm.
        self._step += 1
        if self._step >= self._max_episode_steps:
            done = True

        return ob, reward, done, dict(
            reward_forward=forward_reward,
            reward_ctrl=-ctrl_cost,
            reward_contact=-contact_cost,
            reward_survive=survive_reward,
            torso_velocity=torso_velocity,
        )

    def reset(self):
        """Zero the per-episode step counter and reset via the base class."""
        self._step = 0
        return super().reset()

    def sample_tasks(self, num_tasks):
        """Return a list of ``{'goal': angle}`` task dicts.

        Args:
            num_tasks: Number of tasks to generate. Must be 2 when
                ``forward_backward`` is set.

        Returns:
            A list of dicts. With ``forward_backward`` the goals are ``0``
            and ``pi``; otherwise they are drawn uniformly from
            ``[0, 2*pi)``.
        """
        if self.forward_backward:
            assert num_tasks == 2, "forward_backward mode defines exactly 2 tasks"
            angles = np.array([0., np.pi])
        else:
            angles = np.random.uniform(0., 2.0 * np.pi, size=(num_tasks,))
        return [{'goal': angle} for angle in angles]


@register_env('sparse-ant-dir')
class SparseAntDirEnv(AntDirEnv):
    """AntDirEnv variant that also reports a thresholded reward.

    ``step`` still returns the dense reward; the thresholded value is added
    to the info dict under the key ``'sparse_reward'``.
    """

    def __init__(self, task=None, n_tasks=2, forward_backward=False, max_episode_steps=200, randomize_tasks=True, goal_radius=4, **kwargs):
        """Store the reward threshold and delegate to ``AntDirEnv``.

        Args:
            task: Task dict forwarded to the parent. ``None`` (the default)
                is replaced by a fresh empty dict.
            n_tasks: Forwarded to the parent.
            forward_backward: Forwarded to the parent.
            max_episode_steps: Forwarded to the parent.
            randomize_tasks: Forwarded to the parent.
            goal_radius: Threshold applied to the dense reward by
                ``sparsify_rewards``.
            **kwargs: Forwarded to the parent.
        """
        # Assigned before the parent constructor runs, in case construction
        # triggers step(), which reads it via sparsify_rewards().
        self.goal_radius = goal_radius
        # Build the default per call so instances never share one dict.
        if task is None:
            task = {}
        # Pass the caller's randomize_tasks through rather than a fixed True.
        super(SparseAntDirEnv, self).__init__(task, n_tasks, forward_backward, max_episode_steps, randomize_tasks=randomize_tasks, **kwargs)

    def step(self, action):
        """Run the parent step and attach the sparse reward to the info dict.

        Returns:
            The parent's ``(ob, reward, done, info)`` with the dense reward
            unchanged and ``info['sparse_reward']`` added.
        """
        ob, reward, done, d = super().step(action)
        d.update({'sparse_reward': self.sparsify_rewards(reward)})
        return ob, reward, done, d

    def sparsify_rewards(self, r):
        """Threshold a dense reward at ``goal_radius``.

        Rewards below ``goal_radius`` map to zero. Rewards at or above it
        map to ``(r - goal_radius) * (1 / abs(goal_radius))``.

        NOTE: a ``goal_radius`` of 0 makes the scaling factor divide by zero.
        """
        if r >= self.goal_radius:
            return (r - self.goal_radius) * (1 / abs(self.goal_radius))
        # Multiply instead of returning a literal 0 so the result keeps the
        # same numeric type as ``r``.
        return r * 0