import numpy as np
from stable_baselines3 import PPO
from stable_baselines3.ppo import MlpPolicy
from stable_baselines3.common.evaluation import evaluate_policy

from imitations.policies.serialize import load_policy
from imitations.util.util import make_vec_env
from imitations.data.wrappers import RolloutInfoWrapper
from imitations.data import rollout
from imitations.algorithms.adversarial.airl import AIRL
from imitations.rewards.reward_nets import BasicShapedRewardNet
from imitations.feasibility.feasibility_nets import BasicShapedFeasibilityNet
from imitations.util.networks import RunningNorm
from gymnasium.envs.classic_control.meerkat import MeerkatVectorEnv


SEED = 42

FAST = True

if FAST:
    N_RL_TRAIN_STEPS = 500_000
else:
    N_RL_TRAIN_STEPS = 2_000_000

venv = make_vec_env(
    "Meerkat-v0",
    rng=np.random.default_rng(SEED),
    n_envs=8,
    post_wrappers=[
        lambda env, _: RolloutInfoWrapper(env)
    ],  # needed for computing rollouts later
)
a = MeerkatVectorEnv(num_envs=8)
rollouts = a.rollout()

learner = PPO(
    env=venv,
    policy=MlpPolicy,
    n_steps=4091,
    batch_size=32728,
    ent_coef=0.0,
    learning_rate=0.0005,
    gamma=0.95,
    clip_range=0.1,
    vf_coef=0.1,
    n_epochs=5,
    seed=SEED,
)
reward_net = BasicShapedRewardNet(
    observation_space=venv.observation_space,
    action_space=venv.action_space,
    normalize_input_layer=RunningNorm,
)
feasibility_net = BasicShapedFeasibilityNet(
    observation_space=venv.observation_space,
    action_space=venv.action_space,
    normalize_input_layer=RunningNorm,
)
airl_trainer = AIRL(
    demonstrations=rollouts,
    demo_batch_size=32738,
    gen_replay_buffer_capacity=512,
    n_disc_updates_per_round=16,
    venv=venv,
    gen_algo=learner,
    reward_net=reward_net,
    feasibility_net=feasibility_net,
)

venv.seed(SEED)
learner_rewards_before_training, _ = evaluate_policy(
    learner, venv, 100, return_episode_rewards=True
)
airl_trainer.train(N_RL_TRAIN_STEPS)
venv.seed(SEED)
learner_rewards_after_training, _ = evaluate_policy(
    learner, venv, 100, return_episode_rewards=True
)

print("mean reward after training:", np.mean(learner_rewards_after_training))
print("mean reward before training:", np.mean(learner_rewards_before_training))
