#!/usr/bin/env python
# coding: utf-8

# In[ ]:


def _sample_candidate_locations(agent, agents, length, n_actions):
    """Draw one random set of candidate locations for the gradient estimate.

    A threshold ``z`` is drawn first; then every (agent, action) pair is kept
    independently when a fresh uniform draw is at most
    ``z * agent.pre_prob_dist[agent_idx][action_idx]``.  Each kept pair
    contributes ``state + action`` (first two components) as a location.
    """
    # z = ln((1 - e^-1) * U + e^-1) + 1 with U uniform on [0, 1), so z is in [0, 1).
    z = math.log((1 - math.exp(-1)) * np.random.rand() + math.exp(-1)) + 1
    locations = []
    for other_idx in range(length):
        state = agents[other_idx].state
        for action_idx in range(n_actions):
            action = ACTIONS[action_idx]
            if np.random.rand() <= z * agent.pre_prob_dist[other_idx][action_idx]:
                locations.append(
                    np.array([state[0] + action[0], state[1] + action[1]])
                )
    return locations


def _exponentiated_update(weights, exponent):
    """Return ``weights * exp(exponent)`` normalised to sum to one.

    The maximum exponent is subtracted before exponentiating.  The common
    factor cancels in the normalisation, so the result is mathematically
    unchanged, but ``np.exp`` can no longer overflow and the largest entry maps
    to ``exp(0) == 1`` so the normaliser does not underflow to zero as long as
    the weight at that entry is positive.
    """
    shifted = exponent - np.max(exponent)
    scaled = weights * np.exp(shifted)
    return scaled / scaled.sum()


def MA_OSEA(agents, targets):
    """Run the MA_OSEA simulation loop, mutating ``agents`` and ``targets``.

    For each of ``N_TRIAL`` trials (NumPy's global RNG is seeded with the trial
    index) and each of ``N_STEPS`` steps:

    1. every target updates its distance, reward and state and logs its state;
    2. every agent samples its next action from its own row of
       ``pre_prob_dist``, aggregates its neighbours' ``pre_prob_dist`` with the
       weights in ``neighbors``, mixes in a small uniform term, estimates a
       gradient from ``SAMPLE_SIZE`` random location sets via ``get_losses``
       and reweights its own row exponentially;
    3. every agent applies the sampled action, logs its state and commits
       ``action_prob_dist`` to ``pre_prob_dist``.

    Args:
        agents: Indexable, sized collection of agent objects.  Uses the
            attributes ``pre_prob_dist``, ``action_prob_dist``,
            ``action_indices``, ``next_action_index``, ``neighbors`` (mapping
            of agent index to weight), ``state``, ``traj`` and the methods
            ``get_losses(locations, targets)`` and ``apply_next_action()``.
        targets: Iterable of target objects providing ``update_distance``,
            ``record_reward``, ``update_state``, ``state`` and ``traj``.

    Returns:
        None.  All results are stored on the agent and target objects.

    Relies on the module-level names ``N_STEPS``, ``N_TRIAL``, ``ACTIONS``,
    ``HORIZON`` and ``SAMPLE_SIZE``.

    NOTE(review): nothing here resets agents or targets between trials; if a
    per-trial reset is required it has to happen elsewhere -- confirm.
    """
    rate = 1 / (N_STEPS ** (1.5))
    length = len(agents)
    n_actions = len(ACTIONS)
    # Loop-invariant uniform term mixed into every aggregated distribution.
    # NOTE(review): each row of this term sums to rate / length, and only the
    # agent's own row is renormalised afterwards -- confirm this is intended.
    uniform_floor = np.full((length, n_actions), rate / (length * n_actions))

    for idx in range(N_TRIAL):
        np.random.seed(idx)
        for t in range(N_STEPS):
            # All targets move for one step.
            for target in targets:
                target.update_distance(agents)
                target.record_reward(agents)
                target.update_state(HORIZON, agents, t)
                target.traj.append(target.state)

            for i in range(length):
                agent = agents[i]

                # Sample the next action from the agent's own row of the
                # distribution committed at the previous step.
                pro = agent.pre_prob_dist[i]
                agent.next_action_index = np.random.choice(
                    agent.action_indices, 1, p=pro / sum(pro)
                )[0]

                # Aggregate the neighbours' previous distributions, then mix
                # in the uniform term.
                aggregated = sum(
                    [
                        weight * agents[id_num].pre_prob_dist
                        for id_num, weight in agent.neighbors.items()
                    ]
                )
                agent.action_prob_dist = (1 - rate) * aggregated + uniform_floor

                # 1. Stochastic gradient: average get_losses over SAMPLE_SIZE
                #    random location sets.
                gradient = np.zeros(n_actions)
                for _ in range(SAMPLE_SIZE):
                    locations = _sample_candidate_locations(
                        agent, agents, length, n_actions
                    )
                    losses = agent.get_losses(locations, targets)
                    gradient += losses.reshape(n_actions) / SAMPLE_SIZE

                # 2. Exponentiated reweighting of the agent's own row with a
                #    step size that decays as 1 / sqrt(t + 1).  The helper
                #    shifts by the max exponent instead of rescaling the
                #    exponents (the previous 700 / max clamp), which avoids
                #    overflow without distorting the relative weights and
                #    also guards against an all-zero (NaN) row on underflow.
                exponent = 100 * gradient / math.sqrt(t + 1)
                agent.action_prob_dist[i] = _exponentiated_update(
                    agent.action_prob_dist[i], exponent
                )

            # Apply all sampled actions only after every agent has updated,
            # so the updates above all read the same previous-step data.
            for i in range(length):
                agent = agents[i]
                agent.apply_next_action()
                agent.traj.append(agent.state)
                agent.pre_prob_dist = copy.deepcopy(agent.action_prob_dist)

