import gym
import numpy as np
import argparse

class NoisyObsWrapper(gym.ObservationWrapper):
	"""Observation wrapper that perturbs every observation with Gaussian noise.

	Each observation returned by the wrapped environment has an independent
	standard-normal sample, scaled by ``sigma``, added to every element.
	Samples come from ``np.random.standard_normal``.
	"""

	def __init__(self, env, sigma):
		"""Wrap ``env`` and store the noise scale ``sigma``."""
		super().__init__(env)
		self.sigma = sigma

	def observation(self, obs):
		"""Return ``obs`` plus ``sigma``-scaled standard-normal noise of the same shape."""
		noise = np.random.standard_normal(size=obs.shape)
		return obs + self.sigma * noise

class BooleanRewardWrapper(gym.RewardWrapper):
	"""Reward wrapper that collapses rewards to 0 or 1.

	Rewards that compare ``<= 0`` become 0; every other reward becomes 1.
	"""

	def __init__(self, env):
		"""Wrap ``env`` without any extra configuration."""
		super().__init__(env)

	def reward(self, reward):
		"""Return 0 when ``reward <= 0`` holds, otherwise 1."""
		if reward <= 0:
			return 0
		return 1

from stable_baselines3.common.vec_env import SubprocVecEnv
from stable_baselines3 import DDPG
from stable_baselines3.common.noise import OrnsteinUhlenbeckActionNoise
from gym.wrappers import FrameStack, FlattenObservation

from stable_baselines3.common.callbacks import EvalCallback

# Command-line interface: one optional float, --sigma (default 0.0), which the
# script below passes to NoisyObsWrapper as the observation-noise scale.
parser = argparse.ArgumentParser(description="Mountain Car DDPG example")
parser.add_argument(
	"--sigma",
	default=0.0,
	type=float,
	metavar="N",
	help="How much noise to smooth observations",
)
# Parsed at module level, so `args` is available to the __main__ section.
args = parser.parse_args()


def _make_env(sigma, n_stack=5):
	"""Build one fully wrapped MountainCarContinuous-v0 environment.

	Wrapper order, innermost first: Gaussian observation noise with scale
	``sigma``, stacking of the last ``n_stack`` observations, flattening of
	the stacked observation, and finally the 0/1 reward wrapper.

	Args:
		sigma: Noise scale passed to ``NoisyObsWrapper``.
		n_stack: Number of frames given to ``FrameStack``.

	Returns:
		The wrapped environment.
	"""
	noisy = NoisyObsWrapper(gym.make("MountainCarContinuous-v0"), sigma)
	flat = FlattenObservation(FrameStack(noisy, n_stack))
	return BooleanRewardWrapper(flat)


if __name__ == '__main__':
	sigma = args.sigma

	# Two separate but identically wrapped environments: one for training,
	# one used only by the evaluation callback.
	env = _make_env(sigma)
	eval_env = _make_env(sigma)

	# Ornstein-Uhlenbeck exploration noise, one component per action dimension.
	n_actions = env.action_space.shape[-1]
	action_noise = OrnsteinUhlenbeckActionNoise(
		mean=np.zeros(n_actions),
		sigma=0.5 * np.ones(n_actions),
	)

	# Evaluation logs and TensorBoard logs share one per-sigma directory.
	log_dir = "./logs_mountain_car/" + str(sigma) + '/'
	eval_callback = EvalCallback(
		eval_env,
		best_model_save_path="mountain_car_sigma_" + str(sigma),
		log_path=log_dir,
		eval_freq=2000,
		n_eval_episodes=10,
	)

	model = DDPG(
		"MlpPolicy",
		env,
		action_noise=action_noise,
		verbose=1,
		tensorboard_log=log_dir,
	)
	# total_timesteps is documented as an int; pass 300_000 rather than the float 3e5.
	model.learn(total_timesteps=300_000, callback=eval_callback)
