from core import utils
import numpy as np, random, sys

def rollout_worker(args, id, env_constructor, worker_type, task_pipe, result_pipe, models_bucket, store_transitions, random_baseline):
    """Rollout Worker: plays episodes for a requested team and reports the averaged fitness

        The worker builds one environment, then loops forever: it blocks on task_pipe until a
        team index arrives, plays FITNESS_AVERAGING episodes with that team and sends the
        outcome through result_pipe. Receiving the string 'TERMINATE' ends the process.

        Parameters:
            args (object): Not read by this function; kept so the call signature stays unchanged
            id (int): Worker id, also used to seed the numpy and random generators
            env_constructor (object): Object exposing make_env(), called once to build the environment
            worker_type (str): 'pg' makes every agent act through noisy_action, any other value uses clean_action
            task_pipe (pipe): Receiver end of the task pipe; delivers a team index or 'TERMINATE'
            result_pipe (pipe): Sender end of the pipe used to report back results
            models_bucket (indexable): Collection of teams; each team is a sequence of agents (one per state in the env's joint state)
            store_transitions (bool): Record per-agent (state, action, next_state, [reward], [done]) tuples?
            random_baseline (object): Not read by this function; kept so the call signature stays unchanged

        Returns:
            None. Each finished task is reported on result_pipe as
            [team_id, [fitness], frame, rollout_trajectory, entropy_fitness, shaped_reward]
    """

    FITNESS_AVERAGING = 4     # Number of episodes played per task; fitness is averaged over them
    MAX_EPISODE_STEPS = 3000  # Hard cap on the length of a single episode

    env = env_constructor.make_env()

    np.random.seed(id)
    random.seed(id)

    # worker_type never changes during the worker's lifetime, so resolve the method name once
    action_type = "noisy_action" if worker_type == 'pg' else "clean_action"

    while True:

        team_id = task_pipe.recv()  # Wait until a signal is received to start rollout
        if team_id == 'TERMINATE':
            # sys.exit rather than the builtin exit(): the latter is injected by the
            # site module and is not guaranteed to exist in every interpreter setup
            sys.exit(0)

        team_id = team_id % len(models_bucket)  # Wrap the requested index into the bucket's range
        team = models_bucket[team_id]  # Get the current team actors

        # entropy_bonus and shaped_reward are never updated below; they are carried
        # through unchanged so the result message keeps its layout
        fitness = 0
        shaped_reward = [0, 0]
        entropy_bonus = 0
        frame = 0  # Total environment steps across all episodes of this task

        for _ in range(FITNESS_AVERAGING):
            episode_step = 0
            state_n = env.reset()
            # NOTE(review): the buffers are re-created for every episode, so only the last
            # episode's transitions reach result_pipe while `frame` counts the steps of all
            # episodes -- confirm this is intended
            rollout_trajectory = [[] for _ in range(len(team))]

            while episode_step < MAX_EPISODE_STEPS:  # unless done
                joint_state = [utils.to_tensor(state)[None] for state in state_n]
                action_n = [
                    getattr(agent, action_type)(state, return_only_action=True)[0].detach().numpy()
                    for agent, state in zip(team, joint_state)
                ]
                # Simulate one step in environment (the fourth return value is not used here)
                next_state_n, reward_n, done_n, _ = env.step(action_n)
                episode_step += 1

                # Fitness accumulates the mean per-agent reward of every step
                fitness += sum(reward_n) / len(reward_n)

                if store_transitions:
                    for i, trajectory in enumerate(rollout_trajectory):
                        if action_n[i].ndim == 0:
                            # Promote 0-d actions to 1-d arrays before storing them
                            action_n[i] = np.array([action_n[i]])
                        trajectory.append((state_n[i],
                                           action_n[i],
                                           next_state_n[i],
                                           [reward_n[i]],
                                           [done_n[i]]))

                state_n = next_state_n
                frame += 1

                # Episode ends once every entry of done_n is set
                if all(done_n):
                    break

        # Renormalize over the number of episodes played
        entropy_bonus /= (FITNESS_AVERAGING * 1.0)
        fitness /= (FITNESS_AVERAGING * 1.0)
        shaped_reward = [item / (FITNESS_AVERAGING * 1.0) + entropy_bonus for item in shaped_reward]
        entropy_fitness = fitness + entropy_bonus

        # Send back id, fitness, total length and shaped fitness using the result pipe
        result_pipe.send([team_id, [fitness], frame, rollout_trajectory, entropy_fitness, shaped_reward])
