import numpy as np
import pandas as pd
from stable_baselines3 import PPO
from stable_baselines3.ppo import MlpPolicy
from stable_baselines3.common.evaluation import evaluate_policy

from imitations.policies.serialize import load_policy
from imitations.util.util import make_vec_env
from imitations.data.wrappers import RolloutInfoWrapper
from imitations.data import rollout
from imitations.algorithms.adversarial.airl import AIRL
from imitations.rewards.reward_nets import BasicShapedRewardNet
from imitations.feasibility.feasibility_nets import BasicShapedFeasibilityNet
from imitations.util.networks import RunningNorm
from gymnasium.envs.classic_control.wall_gird_word import WallGridworld

# FAST selects the short training budget; set it to False for the full run.
FAST = True
N_RL_TRAIN_STEPS = 100_000 if FAST else 2_000_000

# Output CSV files; the experiment loop below appends one row per run to each.
# NOTE: the backslash separators make these relative paths Windows-specific.
csv_file_path1 = r'.\New_Dis_Acc\CCAIRL_Grid.csv'
csv_file_path2 = r'.\New_Violation_Rate\CCAIRL_Grid.csv'

for i in range(10):  # Run the experiment 10 times, each with a fresh seed
    # Seed is drawn from [10, 100] and logged to the CSV files below so that
    # an individual run can be reproduced afterwards.
    SEED = np.random.randint(10, 101)

    venv = make_vec_env(
        "GridWorld-v0",
        rng=np.random.default_rng(SEED),
        n_envs=8,
        post_wrappers=[
            lambda env, _: RolloutInfoWrapper(env)
        ],  # needed for computing rollouts later
    )

    # A new vectorized environment is built on every iteration; the
    # try/finally guarantees it is closed even if training or evaluation
    # raises, instead of leaking environments across the 10 runs.
    try:
        # Demonstrations handed to AIRL come from WallGridworld.rollout().
        a = WallGridworld()
        rollouts = a.rollout()

        # Generator policy, trained with PPO on the vectorized environment.
        learner = PPO(
            env=venv,
            policy=MlpPolicy,
            batch_size=16384,
            ent_coef=0.0,
            learning_rate=0.0005,
            gamma=0.95,
            clip_range=0.1,
            vf_coef=0.1,
            n_epochs=5,
            seed=SEED,
        )
        reward_net = BasicShapedRewardNet(
            observation_space=venv.observation_space,
            action_space=venv.action_space,
            normalize_input_layer=RunningNorm,
        )
        feasibility_net = BasicShapedFeasibilityNet(
            observation_space=venv.observation_space,
            action_space=venv.action_space,
            normalize_input_layer=RunningNorm,
        )
        airl_trainer = AIRL(
            demonstrations=rollouts,
            demo_batch_size=700,
            gen_replay_buffer_capacity=512,
            n_disc_updates_per_round=16,
            venv=venv,
            gen_algo=learner,
            reward_net=reward_net,
            feasibility_net=feasibility_net,
        )

        # Re-seed before each evaluation so the "before" and "after"
        # evaluations both start from the same environment seed.
        venv.seed(SEED)
        learner_rewards_before_training, _ = evaluate_policy(
            learner, venv, 100, return_episode_rewards=True
        )
        airl_trainer.train(N_RL_TRAIN_STEPS)
        venv.seed(SEED)
        learner_rewards_after_training, _ = evaluate_policy(
            learner, venv, 100, return_episode_rewards=True
        )
    finally:
        venv.close()

    # Record the completed iteration in both CSV files.
    df = pd.DataFrame({'iteration': [i + 1], 'seed': [SEED]})
    for csv_path in (csv_file_path1, csv_file_path2):
        with open(csv_path, encoding='utf-8-sig', mode='a', newline='') as f:
            # Emit the header row only when the file is still empty.
            df.to_csv(f, header=f.tell() == 0, index=False)

    print(f"Experiment {i + 1} with SEED {SEED}:")
    print("Iteration completed and recorded.")
    # Surface the evaluation results, which were previously computed but
    # never reported anywhere.
    print(
        "Mean episode reward before/after training: "
        f"{np.mean(learner_rewards_before_training):.3f} / "
        f"{np.mean(learner_rewards_after_training):.3f}"
    )
    print()
