import torch

from garage import wrap_experiment
from garage.envs import GymEnv
from garage.experiment.deterministic import set_seed
from garage.sampler import RaySampler
from garage.torch.algos import VPG
from garage.torch.algos import SCRN,TRPO
from garage.torch.policies import GaussianMLPPolicy
from garage.torch.value_functions import GaussianMLPValueFunction
from garage.trainer import Trainer
from garage.np.baselines import LinearFeatureBaseline
from garage.torch.optimizers import OptimizerWrapper
from garage.torch.optimizers.SCRN_optimizer import SCRNOptimizer

# Hyperparameters forwarded as keyword arguments to SCRNOptimizer by
# srcn_reacher(). Their precise meaning is defined by SCRNOptimizer, which is
# not part of this file.

# These five values are also embedded in srcn_reacher's log directory name.
inner_itr = 50
c_prime = 0.5
ro = 1000
l = 300
epsilon = 0.01

# Passed to SCRNOptimizer as well, but not part of the log directory name.
step_size = 1e-3


@wrap_experiment(
    archive_launch_repo=False,
    log_dir="/root/Data/reacher_scrn_itr={}c-prime={}ro={}l={}eps{}".format(
        inner_itr, c_prime, ro, l, epsilon),
)
def srcn_reacher(ctxt=None, seed=1):
    """Train SCRN on the Reacher-v2 environment.

    The policy is a Gaussian MLP with two 64-unit tanh hidden layers, the
    baseline is a LinearFeatureBaseline, and the policy optimizer is an
    SCRNOptimizer configured from the module-level hyperparameters.

    Args:
        ctxt (garage.experiment.ExperimentContext): The experiment
            configuration used by Trainer to create the snapshotter.
        seed (int): Used to seed the random number generator to produce
            determinism.

    """
    # Seed first so that everything constructed below is reproducible.
    set_seed(seed)
    reacher_env = GymEnv('Reacher-v2')
    spec = reacher_env.spec

    trainer = Trainer(ctxt)

    policy = GaussianMLPPolicy(
        spec,
        hidden_sizes=[64, 64],
        hidden_nonlinearity=torch.tanh,
        output_nonlinearity=None,
    )

    baseline = LinearFeatureBaseline(env_spec=spec)

    sampler = RaySampler(
        agents=policy,
        envs=reacher_env,
        max_episode_length=spec.max_episode_length,
    )

    # Keyword arguments handed to SCRNOptimizer through OptimizerWrapper.
    scrn_kwargs = dict(
        inner_itr=inner_itr,
        c_prime=c_prime,
        ro=ro,
        l=l,
        epsilon=epsilon,
        step_size=step_size,
    )
    policy_optimizer = OptimizerWrapper((SCRNOptimizer, scrn_kwargs), policy)

    algo = SCRN(
        env_spec=spec,
        policy=policy,
        value_function=baseline,
        sampler=sampler,
        discount=0.99,
        center_adv=False,
        policy_optimizer=policy_optimizer,
        neural_baseline=False,
    )

    total_epochs = 1000
    samples_per_epoch = 10000
    trainer.setup(algo, reacher_env)
    trainer.train(n_epochs=total_epochs, batch_size=samples_per_epoch)


@wrap_experiment(
    archive_launch_repo=False,
    log_dir="/root/Data/mujoco/trpo_reacher",
)
def trpo_reacher(ctxt=None, seed=1):
    """Train TRPO on the Reacher-v2 environment.

    The policy is a Gaussian MLP with two 64-unit tanh hidden layers and the
    value function is a Gaussian MLP with two 32-unit tanh hidden layers.

    Args:
        ctxt (garage.experiment.ExperimentContext): The experiment
            configuration used by Trainer to create the snapshotter.
        seed (int): Used to seed the random number generator to produce
            determinism.

    """
    # Seed first so that everything constructed below is reproducible.
    set_seed(seed)
    reacher_env = GymEnv('Reacher-v2')
    spec = reacher_env.spec

    trainer = Trainer(ctxt)

    policy = GaussianMLPPolicy(
        spec,
        hidden_sizes=[64, 64],
        hidden_nonlinearity=torch.tanh,
        output_nonlinearity=None,
    )

    critic = GaussianMLPValueFunction(
        env_spec=spec,
        hidden_sizes=(32, 32),
        hidden_nonlinearity=torch.tanh,
        output_nonlinearity=None,
    )

    sampler = RaySampler(
        agents=policy,
        envs=reacher_env,
        max_episode_length=spec.max_episode_length,
    )

    algo = TRPO(
        env_spec=spec,
        policy=policy,
        value_function=critic,
        sampler=sampler,
        discount=0.99,
        center_adv=False,
    )

    total_epochs = 1000
    samples_per_epoch = 10000
    trainer.setup(algo, reacher_env)
    trainer.train(n_epochs=total_epochs, batch_size=samples_per_epoch)


# Launch the TRPO experiment with its default arguments.
# NOTE(review): this call sits at module level, so it also runs whenever the
# file is imported; consider an `if __name__ == '__main__':` guard if that is
# not intended. srcn_reacher is defined above but is not launched here.
trpo_reacher()
