
from garage.torch import set_gpu_mode

import torch

from garage import wrap_experiment
from garage.envs import GymEnv
from garage.experiment.deterministic import set_seed
from garage.sampler import RaySampler
from garage.torch.algos import VPG, TRPO, SCRN
from garage.torch.policies import GaussianMLPPolicy
from garage.torch.value_functions import GaussianMLPValueFunction
from garage.trainer import Trainer
import numpy as np
from garage.torch.optimizers import OptimizerWrapper
from garage.torch.optimizers.SCRN_optimizer import SCRNOptimizer
from garage.torch.optimizers.SGD_optimizer import SGD
from garage.np.baselines import LinearFeatureBaseline

import argparse

# Command-line configuration for the experiment. The parsed values are read
# at module level by the `wrap_experiment` decorator and by `scrn_humanoid`.
parser = argparse.ArgumentParser()

parser.add_argument("--seed", type=int, default=1, help="random seed")
parser.add_argument("--inner_itr", type=int, default=30, help="number of inner iterations")
# argparse applies `type` only to string values, so a non-string default is
# used as-is. The float-typed options therefore get float defaults, keeping
# the default path and the explicit `--opt value` path on the same type.
parser.add_argument("--c_prime", type=float, default=1.0, help="c_prime")
parser.add_argument("--ro", type=float, default=1000.0, help="ro")
parser.add_argument("--l", type=float, default=1000.0, help="l")
parser.add_argument("--epsilon", type=float, default=1e-4, help="epsilon")
parser.add_argument("--hidden_sizes", type=int, default=8, help="hidden size of the policy network")
parser.add_argument("--batch_size", type=int, default=10000, help="batch size")

args = parser.parse_args()
# Echo the effective configuration so it ends up in the run's console output.
print(args)

# Module-level aliases for the hyperparameters handed to SCRNOptimizer inside
# scrn_humanoid. They are looked up by these exact names there, so the
# spellings (including the single-letter `l`) are kept as they are.
inner_itr = args.inner_itr
c_prime = args.c_prime
ro = args.ro
l = args.l
epsilon = args.epsilon

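# Alternative experiment decorator (currently disabled): it names the log
# directory after the seed and the SCRN hyperparameters instead of the seed,
# hidden size and batch size used by the active decorator below.
# @wrap_experiment(archive_launch_repo=False,
#                  log_dir="humanoid_scrn_seed={}itr={}c-prime={}ro={}l={}eps{}".format(args.seed, inner_itr,
#                                                                                 c_prime, ro, l,
#                                                                                 epsilon,
#                                                                                 ))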
@wrap_experiment(
    archive_launch_repo=False,
    log_dir="humanoid_scrn_seed={}hidden_size={}batch_size={}".format(
        args.seed, args.hidden_sizes, args.batch_size),
)
def scrn_humanoid(ctxt=None, seed=args.seed):
    """Train SCRN on the Humanoid-v2 environment.

    Builds a Gaussian MLP policy with two hidden layers of
    ``args.hidden_sizes`` units each, a linear feature baseline, a Ray
    sampler and an SCRN policy optimizer configured from the command-line
    hyperparameters, then trains for 1000 epochs using ``args.batch_size``
    as the per-epoch batch size.

    Args:
        ctxt (garage.experiment.ExperimentContext): The experiment
            configuration used by Trainer to create the snapshotter.
        seed (int): Used to seed the random number generator to produce
            determinism. Defaults to the ``--seed`` command-line value.

    """
    n_epochs = 1000
    sampler_batch_size = args.batch_size

    # Seed first so that everything constructed below is reproducible.
    set_seed(seed)
    env = GymEnv('Humanoid-v2')
    env_spec = env.spec

    trainer = Trainer(ctxt)

    layer_width = args.hidden_sizes
    policy = GaussianMLPPolicy(
        env_spec,
        hidden_sizes=[layer_width, layer_width],
        hidden_nonlinearity=torch.tanh,
        output_nonlinearity=None,
    )

    # A linear feature baseline (not a neural value function) is handed to
    # SCRN through its `value_function` argument; `neural_baseline=False`
    # below is passed alongside it.
    baseline = LinearFeatureBaseline(env_spec=env_spec)

    sampler = RaySampler(
        agents=policy,
        envs=env,
        max_episode_length=env_spec.max_episode_length,
    )

    # Hyperparameters come from the module-level command-line aliases.
    scrn_hyperparams = {
        "inner_itr": inner_itr,
        "c_prime": c_prime,
        "ro": ro,
        "l": l,
        "epsilon": epsilon,
    }
    policy_optimizer = OptimizerWrapper((SCRNOptimizer, scrn_hyperparams),
                                        policy)
    # Plain-SGD alternative kept for reference:
    #     policy_optimizer = OptimizerWrapper((SGD, {"lr": 0.01}), policy)

    algo = SCRN(
        env_spec=env_spec,
        policy=policy,
        value_function=baseline,
        sampler=sampler,
        discount=0.99,
        center_adv=False,
        policy_optimizer=policy_optimizer,
        neural_baseline=False,
    )

    trainer.setup(algo, env)
    trainer.train(n_epochs=n_epochs, batch_size=sampler_batch_size)

# Launch the experiment only when this file is executed as a script, so that
# importing the module does not kick off a full training run.
if __name__ == "__main__":
    scrn_humanoid()