#!/usr/bin/env python
# coding: utf-8

# In[ ]:


def MA_MPL(agents, targets, K=15, batch_size=10):
    """Run the multi-agent simulation with K projected-gradient oracles per agent.

    For each of ``N_TRIAL`` trials and each of ``N_STEPS`` time steps:

    1. Every target is advanced one step and its state is appended to its
       trajectory.
    2. For each of the ``K`` oracle rounds, every agent adds ``1/K`` of its
       k-th oracle vector to its own row of ``pre_prob_dist``; then every
       agent merges the ``pre_prob_dist`` arrays of its neighbours by
       element-wise maximum, estimates a gradient from ``batch_size`` sampled
       joint actions (via ``get_losses2``), takes a gradient step on the
       oracle with step size ``10 / sqrt(t + 1)`` and projects it back with a
       quadratic program.
    3. Every agent samples an action from its own (normalised) row of
       ``action_prob_dist``, applies it, records its state and is
       re-initialised.

    The function relies on the module-level names ``ACTIONS``, ``N_TRIAL``
    and ``N_STEPS``.

    Args:
        agents: Indexable sequence of agent objects. The attributes
            ``pre_prob_dist``, ``action_prob_dist``, ``neighbors``,
            ``action_indices``, ``next_action_index``, ``traj``, ``state``
            and the methods ``get_losses2``, ``apply_next_action`` and
            ``Initialize`` are used.
        targets: Iterable of target objects providing ``update_distance``,
            ``record_reward``, ``update_state``, ``traj`` and ``state``.
        K: Number of oracles kept per agent (and oracle rounds per step).
        batch_size: Number of sampled joint actions averaged into each
            stochastic gradient estimate.

    Returns:
        None. Results are left in the ``traj`` lists of the agents and
        targets.
    """
    length = len(agents)
    n_actions = len(ACTIONS)

    # Constant data of the projection QP. cvxopt.solvers.qp minimises
    # (1/2) x'Px + q'x subject to Gx <= g; with P = 2I and q = -2y this is the
    # Euclidean projection of y onto {x : sum(x) <= 1, 0 <= x <= 1}.
    G = cvxopt.matrix(np.concatenate(
        (np.ones((1, n_actions)), np.eye(n_actions), -np.eye(n_actions)), axis=0))
    g = cvxopt.matrix(np.concatenate(
        (np.ones((1, 1)), np.ones((n_actions, 1)), np.zeros((n_actions, 1))), axis=0))
    P = cvxopt.matrix(2 * np.eye(n_actions))

    # Initialise K oracles for each agent with the uniform vector.
    # NOTE(review): the oracles are created once and carried over from one
    # trial to the next -- confirm that this is intended.
    Oracles = [[(1 / n_actions) * np.ones((1, n_actions)) for k in range(K)]
               for i in range(length)]

    for idx in range(N_TRIAL):
        # Seed per trial so each trial's random draws are reproducible.
        np.random.seed(idx)
        for t in range(N_STEPS):
            for target in targets:
                # All targets move for one step.
                target.update_distance(agents)
                target.record_reward(agents)
                target.update_state()
                target.traj.append(target.state)

            for k in range(K):
                # Each agent accumulates 1/K of its k-th oracle into its own row.
                for i, agent in enumerate(agents):
                    agent.pre_prob_dist[i] += (1 / K) * Oracles[i][k][0]

                for i, agent in enumerate(agents):
                    # Merge the neighbours' tables by element-wise maximum.
                    agent.action_prob_dist = np.maximum.reduce(
                        [agents[id_num].pre_prob_dist
                         for id_num, _ in agent.neighbors.items()])

                    # 1. Compute the stochastic gradient.
                    gradient = np.zeros((1, n_actions))
                    for size in range(batch_size):
                        Location = []
                        for i1 in range(length):
                            # Sample agent i1's action from agent i's row for
                            # agent i1. (Previously row i was used for every
                            # i1, so all agents were drawn from agent i's own
                            # distribution and the merged rows were ignored.)
                            row = agent.action_prob_dist[i1]
                            # The remaining mass is the probability of taking
                            # no action; clamp it so floating-point round-off
                            # cannot produce a tiny negative probability.
                            new_pre_prob = row.tolist() + [max(0.0, 1 - sum(row))]
                            new_action_indices = agents[i1].action_indices.tolist() + ['empty']
                            new_action = np.random.choice(
                                new_action_indices, 1, p=new_pre_prob)[0]
                            if new_action != 'empty':
                                Location.append((i1, int(new_action)))
                        gradient += agent.get_losses2(
                            Location, i, targets, agents).reshape(1, n_actions) / batch_size

                    # 2. Gradient step followed by projection.
                    Oracles[i][k] += 10 * gradient / math.sqrt(t + 1)
                    q = cvxopt.matrix(-2 * Oracles[i][k][0])
                    sol = cvxopt.solvers.qp(P, q, G, g)
                    point = np.array(sol['x']).reshape(1, n_actions)
                    # The solver result is only approximately feasible: zero
                    # out negative entries and shrink to total mass 0.999 if
                    # the sum exceeds 1.
                    point = (point > 0) * point
                    SUM = sum(point[0])
                    if SUM > 1:
                        point = 0.999 * point / SUM
                    Oracles[i][k] = point

                for agent in agents:
                    # The merged table becomes the starting point of the next round.
                    agent.pre_prob_dist = copy.deepcopy(agent.action_prob_dist)

            for i, agent in enumerate(agents):
                # Take an action: sample from the agent's own normalised row.
                pro = agent.action_prob_dist[i]
                next_action_index = np.random.choice(
                    agent.action_indices, 1, p=pro / sum(pro))[0]
                agent.next_action_index = next_action_index
                agent.apply_next_action()
                agent.traj.append(agent.state)
                # NOTE(review): presumably resets the per-step probability
                # tables -- confirm against Initialize's definition.
                agent.Initialize(length, False)

