import time
import wandb
import numpy as np
from functools import reduce
import torch
import pickle
from tqdm import tqdm
from runner.shared.base_runner import Runner
from utils.util import TrainDataSampler


def _t2n(x):
    return x.detach().cpu().numpy()


class HandsRunner(Runner):
    """Runner class to perform training, evaluation. and data collection for SMAC. See parent class for details."""
    def __init__(self, config):
        """Initialise the runner by forwarding ``config`` to the parent ``Runner``."""
        super().__init__(config)

    def run(self):
        """Run the on-policy training loop.

        Each iteration ("episode") steps the vectorised envs for
        ``self.episode_length`` steps, inserts every transition into the
        buffer, then calls ``self.compute()`` and ``self.train()``.
        Checkpointing, logging and evaluation happen on ``save_interval``,
        ``log_interval`` and ``eval_interval`` respectively.
        """
        self.warmup()

        start = time.time()
        episodes = int(self.num_env_steps) // self.episode_length // self.n_rollout_threads

        # Running (not yet finished) return of each rollout thread. Kept as an
        # array so the per-step accumulation below is an explicit elementwise add.
        train_episode_rewards = np.zeros(self.n_rollout_threads)
        done_episodes_rewards = []

        for episode in range(episodes):
            if self.use_linear_lr_decay:
                self.trainer.policy.lr_decay(episode, episodes)

            # act with env to sample training data
            for step in range(self.episode_length):
                # Sample actions. collect() returns seven arrays; all of them
                # must be unpacked here or the assignment raises ValueError.
                values, actions, action_log_probs, action_probs, \
                    rnn_states, rnn_states_critic, disc_values = self.collect(step)

                # Observe reward and next obs
                obs, share_obs, rewards, dones, infos, available_actions = \
                    self.envs.step(torch.tensor(actions.transpose(1, 0, 2)))
                obs = _t2n(obs)
                share_obs = _t2n(share_obs)
                rewards = _t2n(rewards)
                dones = _t2n(dones)

                # A thread's episode is over only when every agent reports done.
                dones_env = np.all(dones, axis=1)
                reward_env = np.mean(rewards, axis=1).flatten()
                train_episode_rewards += reward_env
                for t in range(self.n_rollout_threads):
                    if dones_env[t]:
                        done_episodes_rewards.append(train_episode_rewards[t])
                        train_episode_rewards[t] = 0

                # insert() expects the 14-field layout also used by run_gail();
                # this loop has no classifier reward, so that slot is None.
                data = obs, share_obs, rewards, dones, infos, available_actions, \
                       values, actions, action_log_probs, action_probs, \
                       rnn_states, rnn_states_critic, disc_values, None

                # insert data into buffer
                self.insert(data)

            # get next value from critic and compute gae advantage
            self.compute()
            # update network
            train_infos = self.train()

            # calculate total simulation steps
            total_num_steps = (episode + 1) * self.episode_length * self.n_rollout_threads
            # save model at each saving interval or at last episode
            if episode % self.save_interval == 0 or episode == episodes - 1:
                self.save(episode)

            # log information at each log interval
            if episode % self.log_interval == 0:
                end = time.time()
                print("\n Task {} Algo {} Exp {} updates {}/{} episodes, total num timesteps {}/{}, FPS {}.\n"
                        .format(self.all_args.task,
                                self.algorithm_name,
                                self.experiment_name,
                                episode,
                                episodes,
                                total_num_steps,
                                self.num_env_steps,
                                int(total_num_steps / (end - start))))

                self.log_train(train_infos, total_num_steps)

                if len(done_episodes_rewards) > 0:
                    aver_episode_rewards = np.mean(done_episodes_rewards)
                    print("some episodes done, average rewards: ", aver_episode_rewards)
                    self.writter.add_scalars("train_episode_rewards", {"aver_rewards": aver_episode_rewards}, total_num_steps)
                    # start a fresh window for the next logging interval
                    done_episodes_rewards = []

            # eval model at each eval interval
            if episode % self.eval_interval == 0 and self.use_eval:
                self.eval(total_num_steps)

    ############################# my method
    ######## behaviour cloning
    def run_bc(self):
        """Train the policy by behaviour cloning on a pickled offline dataset.

        For each of ``all_args.num_epochs`` epochs this runs
        ``all_args.num_steps_per_epochs`` calls to ``train_offline`` on batches
        drawn from ``TrainDataSampler``, optionally evaluates online, logs the
        epoch summary via ``log_episode`` and saves on ``save_interval`` or at
        the last epoch.
        """
        # NOTE(security): pickle.load can execute arbitrary code embedded in
        # the file; only point train_data_path at datasets you trust.
        with open(self.all_args.train_data_path, 'rb') as f:
            train_set = pickle.load(f)
        train_data_sampler = TrainDataSampler(train_set)

        num_epochs = self.all_args.num_epochs
        steps_per_epoch = self.all_args.num_steps_per_epochs
        for ep_i in range(num_epochs):
            # start a training episode
            self.policy.train()
            train_pbar = tqdm(range(steps_per_epoch), total=steps_per_epoch,
                              mininterval=1 if self.all_args.quick_tqdm else 10)
            all_train_losses = []
            train_info = {}
            for _ in train_pbar:
                share_states, states, actions = train_data_sampler.sample_batch_data(
                    batch_size=self.all_args.batch_size, device=self.device)
                # update network and get train info
                # (assumes the trainer returns a plain numeric loss -- TODO confirm)
                train_loss = self.train_offline(share_states, states, actions)
                all_train_losses.append(train_loss)

            # Average over every step of the epoch, not only the last batch.
            # An epoch with zero steps reports NaN instead of raising.
            mean_loss = np.mean(all_train_losses) if all_train_losses else float('nan')
            print('episode', ep_i, 'mean_loss', mean_loss)
            train_info['mean_loss'] = mean_loss

            # eval model at each episode
            if self.use_eval:
                eval_env_infos = self.eval_online()
                for k in eval_env_infos:
                    train_info[k] = eval_env_infos[k]
            # log intermediate result
            self.log_episode(train_info, ep_i)
            # save model at each saving interval or at last episode
            if ep_i % self.save_interval == 0 or ep_i == num_epochs - 1:
                self.save(ep_i)

    def train_offline(self, share_states, states, actions):
        return self.trainer.train_offline(share_states, states, actions)
    ######## behaviour cloning

    ######## gail / wail
    def run_gail(self):
        """Training loop used for the GAIL / WAIL setting.

        Same outer structure as ``run``: roll out ``self.episode_length`` steps,
        insert each transition, then ``compute()`` and ``train()``. When
        ``all_args.use_classifier_reward`` is set, a per-step reward is queried
        from ``self.policy.get_classifier_reward`` and stored with the
        transition; its running mean is written out at every log interval.
        """
        self.warmup()

        t_start = time.time()
        num_episodes = int(self.num_env_steps) // self.episode_length // self.n_rollout_threads

        running_returns = [0] * self.n_rollout_threads
        finished_returns = []
        # per-step mean classifier reward since the last log
        clf_reward_log = []

        progress_template = "\n Task {} Algo {} Exp {} updates {}/{} episodes, total num timesteps {}/{}, FPS {}.\n"

        for episode in range(num_episodes):
            if self.use_linear_lr_decay:
                self.trainer.policy.lr_decay(episode, num_episodes)

            # ---- rollout phase ----
            for step in range(self.episode_length):
                (values, actions, action_log_probs, action_probs,
                 rnn_states, rnn_states_critic, disc_values) = self.collect(step)

                # step the envs and bring everything we post-process to NumPy
                env_out = self.envs.step(torch.tensor(actions.transpose(1, 0, 2)))
                obs, share_obs, rewards, dones, infos, available_actions = env_out
                obs, share_obs = _t2n(obs), _t2n(share_obs)
                rewards, dones = _t2n(rewards), _t2n(dones)

                # a thread's episode ends only when all of its agents are done
                dones_env = np.all(dones, axis=1)
                running_returns += np.mean(rewards, axis=1).flatten()
                for t in range(self.n_rollout_threads):
                    if not dones_env[t]:
                        continue
                    finished_returns.append(running_returns[t])
                    running_returns[t] = 0

                # optional reward from the agent classifier
                classifier_rewards = None
                if self.all_args.use_classifier_reward:
                    if not self.all_args.classifier_use_gru:
                        classifier_input_action = actions
                    else:
                        # push the current action into the history record,
                        # then feed the updated record to the classifier
                        self.buffer.update_history_act_record(actions, dones)
                        classifier_input_action = self.buffer.get_history_act_record()
                    raw_rewards = self.policy.get_classifier_reward(obs, classifier_input_action)
                    classifier_rewards = raw_rewards.detach().cpu().numpy().reshape(
                        self.n_rollout_threads, self.num_agents, 1)
                    clf_reward_log.append(classifier_rewards.reshape(-1).mean())

                self.insert((
                    obs, share_obs, rewards, dones, infos, available_actions,
                    values, actions, action_log_probs, action_probs,
                    rnn_states, rnn_states_critic, disc_values, classifier_rewards,
                ))

            # ---- update phase ----
            self.compute()
            train_infos = self.train()

            total_num_steps = (episode + 1) * self.episode_length * self.n_rollout_threads

            # checkpoint on the save interval and always after the final episode
            if episode % self.save_interval == 0 or episode == num_episodes - 1:
                self.save(episode)

            # ---- logging ----
            if episode % self.log_interval == 0:
                t_now = time.time()
                fps = int(total_num_steps / (t_now - t_start))
                print(progress_template.format(
                    self.all_args.task, self.algorithm_name, self.experiment_name,
                    episode, num_episodes, total_num_steps, self.num_env_steps, fps))

                if self.all_args.use_classifier_reward:
                    self.writter.add_scalars(
                        "classifier_rewards",
                        {"classifier_rewards": np.mean(clf_reward_log)},
                        total_num_steps)
                    clf_reward_log = []

                self.log_train(train_infos, total_num_steps)

                if len(finished_returns) > 0:
                    aver_episode_rewards = np.mean(finished_returns)
                    print("some episodes done, average rewards: ", aver_episode_rewards)
                    self.writter.add_scalars(
                        "train_episode_rewards",
                        {"aver_rewards": aver_episode_rewards},
                        total_num_steps)
                    finished_returns = []

            # ---- evaluation ----
            if episode % self.eval_interval == 0 and self.use_eval:
                self.eval(total_num_steps)
    ######## gail / wail

    ######## pretrain classifier
    def pretrain_classifier(self):
        """Pretrain ``self.policy.agent_classifier`` on the expert buffer.

        Every epoch runs ``all_args.num_steps_per_epochs`` updates through
        ``self.agent_classifier_trainer``, then iterates the validation data of
        ``self.expert_buffer`` to get a mean validation loss. Both means are
        passed to ``log_train`` and the classifier is saved on
        ``save_interval`` or at the last epoch.
        """
        args = self.all_args
        for ep_i in range(args.num_epochs):
            # ---- training pass ----
            train_pbar = tqdm(range(args.num_steps_per_epochs),
                              total=args.num_steps_per_epochs,
                              mininterval=1 if args.quick_tqdm else 10)
            train_losses = []
            train_infos = {}
            self.policy.agent_classifier.train()
            for _ in train_pbar:
                # sample a batch from the expert dataset
                obs, actions, tags = self.expert_buffer.sample_classifier_batch_data(
                    batch_size=args.batch_size)
                train_losses.append(self.agent_classifier_trainer.update(obs, actions, tags))
            print('episode', ep_i, 'mean_train_loss', np.mean(train_losses))
            train_infos['train_loss'] = np.mean(train_losses)

            # ---- validation pass ----
            valid_losses = []
            self.expert_buffer.start_val()
            self.policy.agent_classifier.eval()
            while True:
                # the buffer signals the last validation batch through `finish`
                val_states, val_actions, val_tags, finish = \
                    self.expert_buffer.get_all_valid_data(args.batch_size)
                val_states = val_states.to(self.device)
                val_actions = val_actions.to(self.device)
                val_tags = val_tags.to(self.device)
                val_loss = self.policy.agent_classifier.get_loss(val_states, val_actions, val_tags)
                valid_losses.append(val_loss.item())
                if finish:
                    break
            print('episode', ep_i, 'mean_valid_loss', np.mean(valid_losses))
            train_infos['val_loss'] = np.mean(valid_losses)

            # log the epoch summary
            self.log_train(train_infos, ep_i)
            # save on the saving interval and always after the final epoch
            if ep_i % self.save_interval == 0 or ep_i == args.num_epochs - 1:
                self.save_classifier(ep_i)
    ######## pretrain classifier

    ############################# my method

    def warmup(self):
        """Reset the training envs and write the first observation into slot 0 of the buffer.

        When ``self.use_centralized_V`` is false the per-agent observation is
        stored as the shared observation too. With ``all_args.mat_use_history``
        the buffer's observation-history record is also updated and copied
        into ``his_obs[0]``.
        """
        obs, share_obs, _ = self.envs.reset()

        obs_np = _t2n(obs)
        share_np = _t2n(share_obs) if self.use_centralized_V else obs_np

        buf = self.buffer
        buf.share_obs[0] = share_np.copy()
        buf.obs[0] = obs_np.copy()

        if not self.all_args.mat_use_history:
            return

        last = buf.history_obs_len - 1
        # NOTE(review): this shift slices axis 0, while the write below indexes
        # axis 2 -- confirm which axis of history_obs_record is the time axis.
        buf.history_obs_record[0:last] = buf.history_obs_record[1:buf.history_obs_len]
        buf.history_obs_record[:, :, last] = obs_np.copy()
        buf.his_obs[0] = buf.history_obs_record.copy()

    @torch.no_grad()
    def collect(self, step):
        """Query the policy for buffer slot ``step`` and return its outputs per rollout thread.

        The buffer entries for ``step`` are concatenated along their first axis
        before being passed to ``policy.get_actions``; each returned tensor is
        converted to NumPy and split back into ``self.n_rollout_threads``
        chunks.

        Returns a 7-tuple: ``(values, actions, action_log_probs, action_probs,
        rnn_states, rnn_states_critic, disc_values)``.
        """
        self.trainer.prep_rollout()

        buf = self.buffer
        policy_outputs = self.trainer.policy.get_actions(
            np.concatenate(buf.share_obs[step]),
            np.concatenate(buf.obs[step]),
            buf.get_history_obs_record(),
            np.concatenate(buf.rnn_states[step]),
            np.concatenate(buf.rnn_states_critic[step]),
            np.concatenate(buf.masks[step]))
        # strict unpack: the policy must return exactly these seven tensors
        value, action, action_log_prob, action_prob, rnn_state, rnn_state_critic, disc_value \
            = policy_outputs

        n_threads = self.n_rollout_threads
        # regroup each flat output as [envs, agents, dim]
        return tuple(
            np.array(np.split(_t2n(tensor), n_threads))
            for tensor in (value, action, action_log_prob, action_prob,
                           rnn_state, rnn_state_critic, disc_value)
        )

    def insert(self, data):
        obs, share_obs, rewards, dones, infos, available_actions, \
        values, actions, action_log_probs, action_probs, \
        rnn_states, rnn_states_critic, disc_values, classifier_rewards = data

        dones_env = np.all(dones, axis=1)

        rnn_states[dones_env == True] = np.zeros(((dones_env == True).sum(), self.num_agents, self.recurrent_N, self.hidden_size), dtype=np.float32)
        rnn_states_critic[dones_env == True] = np.zeros(((dones_env == True).sum(), self.num_agents, *self.buffer.rnn_states_critic.shape[3:]), dtype=np.float32)

        masks = np.ones((self.n_rollout_threads, self.num_agents, 1), dtype=np.float32)
        masks[dones_env == True] = np.zeros(((dones_env == True).sum(), self.num_agents, 1), dtype=np.float32)

        active_masks = np.ones((self.n_rollout_threads, self.num_agents, 1), dtype=np.float32)
        active_masks[dones == True] = np.zeros(((dones == True).sum(), 1), dtype=np.float32)
        active_masks[dones_env == True] = np.ones(((dones_env == True).sum(), self.num_agents, 1), dtype=np.float32)

        # bad_masks = np.array([[[0.0] if info[agent_id]['bad_transition'] else [1.0] for agent_id in range(self.num_agents)] for info in infos])

        if not self.use_centralized_V:
            share_obs = obs

        self.buffer.insert(share_obs, obs, rnn_states, rnn_states_critic,
                           actions, action_log_probs, action_probs, values, rewards, dones, masks, None, active_masks,
                           None, disc_values, classifier_rewards)

    def log_train(self, train_infos, total_num_steps):
        train_infos["average_step_rewards"] = np.mean(self.buffer.rewards)
        print("average_step_rewards is {}.".format(train_infos["average_step_rewards"]))
        for k, v in train_infos.items():
            if self.use_wandb:
                wandb.log({k: v}, step=total_num_steps)
            else:
                if k != 'disc_expert_score' and k != 'disc_policy_score' and \
                        k != 'all_expert_agent_mean_scores' and k != 'all_policy_agent_mean_scores':
                    self.writter.add_scalars(k, {k: v}, total_num_steps)
                else:
                    self.writter.add_scalars(k, v, total_num_steps)

    @torch.no_grad()
    def eval(self, total_num_steps):
        """Roll out the deterministic policy on ``self.eval_envs`` and log episode rewards.

        Runs until at least ``all_args.eval_episodes`` episodes have finished,
        then passes the collected rewards to ``self.log_env`` at
        ``total_num_steps``.

        Env outputs are converted to NumPy when they arrive as torch tensors
        (as the training envs in this file do), and in that case actions are
        sent back as a tensor as well. Array-based envs are handled exactly as
        before.
        """
        def _as_numpy(x):
            # Leave non-tensor outputs untouched so array-based envs keep working.
            return _t2n(x) if torch.is_tensor(x) else x

        n_threads = self.n_eval_rollout_threads
        rnn_shape = (self.num_agents, self.recurrent_N, self.hidden_size)

        eval_episode = 0
        eval_episode_rewards = []
        # NOTE(review): this list is shared by all eval threads and cleared as
        # soon as any thread finishes -- confirm that is intended when
        # n_eval_rollout_threads > 1.
        one_episode_rewards = []

        eval_obs, eval_share_obs, _ = self.eval_envs.reset()
        # Remember which interface the env speaks so actions go back in kind.
        tensor_env = torch.is_tensor(eval_obs)
        eval_obs = _as_numpy(eval_obs)
        eval_share_obs = _as_numpy(eval_share_obs)

        eval_rnn_states = np.zeros((n_threads, *rnn_shape), dtype=np.float32)
        eval_masks = np.ones((n_threads, self.num_agents, 1), dtype=np.float32)

        while True:
            self.trainer.prep_rollout()
            eval_actions, eval_rnn_states = \
                self.trainer.policy.act(np.concatenate(eval_share_obs),
                                        np.concatenate(eval_obs),
                                        np.concatenate(eval_rnn_states),
                                        np.concatenate(eval_masks),
                                        deterministic=True)
            eval_actions = np.array(np.split(_t2n(eval_actions), n_threads))
            eval_rnn_states = np.array(np.split(_t2n(eval_rnn_states), n_threads))

            # Observe reward and next obs
            step_actions = eval_actions.transpose(1, 0, 2)
            if tensor_env:
                step_actions = torch.tensor(step_actions)
            eval_obs, eval_share_obs, eval_rewards, eval_dones, _, _ = self.eval_envs.step(step_actions)
            eval_obs = _as_numpy(eval_obs)
            eval_share_obs = _as_numpy(eval_share_obs)
            eval_rewards = _as_numpy(eval_rewards)
            eval_dones = _as_numpy(eval_dones)
            one_episode_rewards.append(eval_rewards)

            # a thread's episode ends only when all of its agents are done
            eval_dones_env = np.all(eval_dones, axis=1)
            n_done = (eval_dones_env == True).sum()

            # reset recurrent state and masks for finished threads
            eval_rnn_states[eval_dones_env == True] = np.zeros((n_done, *rnn_shape), dtype=np.float32)

            # thread count taken from the same attribute as eval_rnn_states so
            # both arrays always agree in their first dimension
            eval_masks = np.ones((n_threads, self.num_agents, 1), dtype=np.float32)
            eval_masks[eval_dones_env == True] = np.zeros((n_done, self.num_agents, 1), dtype=np.float32)

            for eval_i in range(n_threads):
                if eval_dones_env[eval_i]:
                    eval_episode += 1
                    eval_episode_rewards.append(np.sum(one_episode_rewards, axis=0))
                    one_episode_rewards = []

            if eval_episode >= self.all_args.eval_episodes:
                eval_env_infos = {'eval_average_episode_rewards': eval_episode_rewards,
                                  'eval_max_episode_rewards': [np.max(eval_episode_rewards)]}

                self.log_env(eval_env_infos, total_num_steps)
                print("eval_average_episode_rewards is {}.".format(np.mean(eval_episode_rewards)))
                break

    @torch.no_grad()
    def eval_online(self):
        """Roll out the deterministic policy on ``self.eval_envs`` and return reward statistics.

        Runs until at least ``all_args.eval_episodes`` episodes have finished.
        Gradients are disabled for the whole rollout, matching ``eval``.

        Returns a dict with ``'eval_average_episode_rewards'``,
        ``'eval_max_episode_rewards'`` and ``'eval_step_reward'``. Rewards are
        averaged over agents before being accumulated.
        """
        n_threads = self.n_eval_rollout_threads
        rnn_shape = (self.num_agents, self.recurrent_N, self.hidden_size)

        eval_episode = 0
        eval_episode_rewards = []
        # running return of the episode currently in progress on each thread
        all_episode_rewards = np.zeros(n_threads)
        all_step_rewards = []

        eval_obs, eval_share_obs, _ = self.eval_envs.reset()

        eval_rnn_states = np.zeros((n_threads, *rnn_shape), dtype=np.float32)
        eval_masks = np.ones((n_threads, self.num_agents, 1), dtype=np.float32)

        while True:
            self.trainer.prep_rollout()
            eval_actions, eval_rnn_states = \
                self.trainer.policy.act(np.concatenate(eval_share_obs.cpu().numpy()),
                                        np.concatenate(eval_obs.cpu().numpy()),
                                        np.concatenate(eval_rnn_states),
                                        np.concatenate(eval_masks),
                                        deterministic=True)
            eval_actions = np.array(np.split(_t2n(eval_actions), n_threads))
            eval_rnn_states = np.array(np.split(_t2n(eval_rnn_states), n_threads))

            # Observe reward and next obs
            eval_obs, eval_share_obs, eval_rewards, eval_dones, _, _ = \
                self.eval_envs.step(torch.tensor(eval_actions.transpose(1, 0, 2)))
            eval_rewards = eval_rewards.cpu().numpy()
            eval_dones = eval_dones.cpu().numpy()

            # agent-averaged reward per thread, computed once per step
            step_rewards = np.mean(eval_rewards, axis=1).flatten()
            all_episode_rewards += step_rewards
            all_step_rewards.append(step_rewards)

            # a thread's episode ends only when all of its agents are done
            eval_dones_env = np.all(eval_dones, axis=1)
            n_done = (eval_dones_env == True).sum()
            eval_rnn_states[eval_dones_env == True] = np.zeros((n_done, *rnn_shape), dtype=np.float32)
            # thread count taken from the same attribute as eval_rnn_states so
            # both arrays always agree in their first dimension
            eval_masks = np.ones((n_threads, self.num_agents, 1), dtype=np.float32)
            eval_masks[eval_dones_env == True] = np.zeros((n_done, self.num_agents, 1), dtype=np.float32)

            for eval_i in range(n_threads):
                if eval_dones_env[eval_i]:
                    eval_episode += 1
                    eval_episode_rewards.append(all_episode_rewards[eval_i])
                    all_episode_rewards[eval_i] = 0

            if eval_episode >= self.all_args.eval_episodes:
                average_episode_reward = np.mean(eval_episode_rewards)
                max_episode_reward = np.max(eval_episode_rewards)
                step_reward = np.mean(np.concatenate(all_step_rewards, axis=0))
                print('eval_average_episode_rewards', average_episode_reward)
                print('eval_max_episode_rewards', max_episode_reward)
                print('eval_step_reward', step_reward)
                return {
                    'eval_average_episode_rewards': average_episode_reward,
                    'eval_max_episode_rewards': max_episode_reward,
                    'eval_step_reward': step_reward
                }

    def log_episode(self, train_infos, episode_num):
        for k, v in train_infos.items():
            if self.use_wandb:
                wandb.log({k: v}, step=episode_num)
            else:
                self.writter.add_scalars(k, {k: v}, episode_num)

