import numpy as np
from .utils.buffer import ReplayMemory
from .utils.disc import DiscTrainer
from .utils.sac import SACTrainer

class DiaynTrainer:
    """Train one SAC policy per mode alongside a shared discriminator.

    Each mode owns its own ``SACTrainer`` and ``ReplayMemory``; a single
    ``DiscTrainer`` (with its own replay memory of ``(label, obs)`` pairs)
    scores observations against the active mode label. The reward stored for
    the policy is ``reward * rc + sr * src``, where ``sr`` is the
    discriminator score clipped from below at ``sr_min``.

    Typical call order: ``start_episode`` -> (``act`` / ``record``)* ->
    ``end_episode``, with the ``update_*`` methods called whenever the
    matching ``can_update_*`` check passes.
    """

    def __init__(self, obs_shape, action_space, rc, src, args) -> None:
        """Build the per-mode trainers/memories and the discriminator.

        Args:
            obs_shape: Forwarded to ``SACTrainer`` and ``DiscTrainer``.
            action_space: Forwarded to ``SACTrainer``.
            rc: Weight applied to the environment reward.
            src: Weight applied to the clipped discriminator score.
            args: Configuration object; this class reads ``buffer_limit``,
                ``num_modes``, ``batch_size``, ``disc_batch_size`` and
                ``scenario`` from it.
        """
        self.memories = [ReplayMemory(args.buffer_limit) for _ in range(args.num_modes)]
        self.disc_memory = ReplayMemory(args.buffer_limit)
        self.trainers = [SACTrainer(obs_shape, action_space, args) for _ in range(args.num_modes)]
        self.disc_trainer = DiscTrainer(obs_shape, args)
        self.total_episodes = 0
        # Lower bound for the discriminator score: 10 * log(1 / num_modes).
        self.sr_min = np.log(1/args.num_modes)*10
        self.args = args
        self.rc = rc
        self.src = src
        # No mode is selected until start_episode() is called.
        self.current_id = None
        self.current_trainer = None
        self.current_memory = None
        self._reset_episode_buffers()

    def _reset_episode_buffers(self):
        """Clear all per-episode accumulators and the per-episode step counter."""
        self.trajectory = []
        self.obs_list = []
        self.reward_list = []
        self.sr_list = []
        self.acc_list = []
        self.r_list = []
        # Kept in lockstep with len(self.trajectory); end_episode() asserts this.
        self.step_t = 0

    def _require_mode_selected(self):
        """Raise a clear error if no mode has been selected via start_episode()."""
        if getattr(self, "current_trainer", None) is None:
            raise RuntimeError("start_episode() must be called before using the current mode")

    def start_episode(self, label):
        """Select mode ``label`` and reset all per-episode state.

        Args:
            label: Index into ``self.trainers`` / ``self.memories``.
        """
        self.current_id = label
        self.current_trainer = self.trainers[label]
        self.current_memory = self.memories[label]
        # Also resets step_t, so an episode abandoned without end_episode()
        # cannot leave a stale counter that trips the end_episode() assertion.
        self._reset_episode_buffers()

    def act(self, obs):
        """Query the active mode's policy for an action.

        Returns:
            ``(action, logprob)`` as produced by the active trainer's ``act``;
            if the action has more than one dimension only its first row is
            returned.
        """
        self._require_mode_selected()
        a, logprob = self.current_trainer.act(obs)
        if len(a.shape) > 1:
            a = a[0]
        return a, logprob

    def record(self, obs, action, logprob, reward, new_obs, mask):
        """Store one transition and its discriminator-shaped reward.

        The transition is pushed to the active mode's memory with the reward
        replaced by ``reward * rc + sr * src``; ``(label, obs)`` is pushed to
        the discriminator memory.
        """
        self._require_mode_selected()
        # Score first so a failure in the discriminator leaves the episode
        # buffers untouched.
        label = np.array([self.current_id])
        score = self.disc_trainer.score(obs, label)
        # exp(score) is tracked as an accuracy-like statistic; presumably the
        # score is a log-probability -- confirm against DiscTrainer.score.
        acc = np.exp(score)
        sr = np.clip(score, self.sr_min, None)
        comp_reward = reward * self.rc + sr * self.src

        self.trajectory.append([obs, action, logprob, reward, new_obs, mask])
        self.obs_list.append(obs)
        self.r_list.append(reward)
        self.step_t += 1
        self.sr_list.append(sr)
        self.acc_list.append(acc)
        self.reward_list.append(comp_reward)
        self.current_memory.push((obs, action, logprob, comp_reward, new_obs, mask))
        self.disc_memory.push((label, obs))

    def end_episode(self):
        """Summarise the finished episode and reset per-episode state.

        Returns:
            A 4-tuple ``(original_return, sr_sum, combined_return, avg_acc)``:
            the sum of raw rewards, the sum of clipped discriminator scores,
            the sum of combined rewards, and the mean of ``exp(score)``.
            ``avg_acc`` is NaN when no step was recorded.
        """
        assert self.step_t == len(self.trajectory), "step counter out of sync with trajectory"
        original_episode_return = np.sum(self.r_list)
        original_episode_sr = np.sum(self.sr_list)
        episode_return = np.sum(self.reward_list)
        # np.mean([]) yields NaN but also emits a RuntimeWarning; return NaN directly.
        avg_acc = np.mean(self.acc_list) if self.acc_list else np.nan

        self.total_episodes += 1
        self._reset_episode_buffers()
        return original_episode_return, original_episode_sr, episode_return, avg_acc

    def update_policy(self, updates):
        """Run one SAC update for the active mode on a sampled batch.

        Args:
            updates: Forwarded unchanged to the trainer's ``update_parameters``.

        Returns:
            ``(c1_loss, c2_loss, p_loss, ent_loss, alpha)`` from the trainer.
        """
        self._require_mode_selected()
        state_batch, action_batch, logprob_batch, reward_batch, next_state_batch, mask_batch = (
            self.current_memory.sample(batch_size=self.args.batch_size)
        )
        batch = (state_batch, action_batch, logprob_batch, reward_batch, next_state_batch, mask_batch)
        c1_loss, c2_loss, p_loss, ent_loss, alpha = self.current_trainer.update_parameters(batch, updates)
        return c1_loss, c2_loss, p_loss, ent_loss, alpha

    def can_update_policy(self):
        """Return True when the active memory holds more than ``batch_size`` items."""
        self._require_mode_selected()
        return len(self.current_memory) > self.args.batch_size

    def update_disc(self):
        """Run one discriminator update on a sampled ``(label, state)`` batch.

        Returns:
            The loss value returned by the discriminator trainer.
        """
        label_batch, state_batch = self.disc_memory.sample(batch_size=self.args.disc_batch_size)
        d_loss = self.disc_trainer.update_parameters((label_batch, state_batch))
        return d_loss

    def can_update_disc(self):
        """Return True when the discriminator memory holds more than ``disc_batch_size`` items."""
        return len(self.disc_memory) > self.args.disc_batch_size

    def save_models(self):
        """Save every per-mode trainer, using the mode index as the file suffix."""
        for index, trainer in enumerate(self.trainers):
            trainer.save_model(env_name=self.args.scenario, suffix="{}".format(index))