#!/usr/bin/env python
# coding: utf-8

# In[ ]:


def MA_OSMA(agents,targets):
    """Run the multi-agent simulation loop, updating agents and targets in place.

    For each of ``N_TRIAL`` trials the global NumPy RNG is reseeded with the
    trial index and ``N_STEPS`` time steps are simulated.  One time step is:

    1. Every target calls ``update_distance``, ``record_reward`` and
       ``update_state`` and appends its new ``state`` to ``target.traj``.
    2. Every agent ``i``
       * samples ``next_action_index`` from row ``i`` of its
         ``pre_prob_dist`` (normalised to sum to one),
       * rebuilds ``action_prob_dist`` as the weighted sum of the
         ``pre_prob_dist`` of the agents listed in ``agent.neighbors``,
       * averages ``SAMPLE_SIZE`` calls to ``agent.get_losses`` on randomly
         sampled candidate locations and adds ``10 / sqrt(t + 1)`` times that
         average to row ``i`` of ``action_prob_dist``,
       * replaces row ``i`` by the solution of a QP that is its Euclidean
         projection onto ``{x : 0 <= x <= 1, sum(x) <= 1}`` and zeroes any
         non-positive entries left by the solver.
    3. Every agent calls ``apply_next_action``, appends its ``state`` to
       ``agent.traj`` and snapshots ``action_prob_dist`` into
       ``pre_prob_dist`` for the next step.

    Args:
        agents: indexable collection of agent objects, accessed as
            ``agents[0] .. agents[len(agents) - 1]`` and by the keys of each
            agent's ``neighbors`` mapping.
        targets: iterable of target objects.

    Returns:
        None.  All results are stored on the agent and target objects.

    Depends on the module-level names ``ACTIONS``, ``N_TRIAL``, ``N_STEPS``,
    ``HORIZON`` and ``SAMPLE_SIZE``.

    NOTE(review): nothing in this function resets agents or targets between
    trials, so trajectories keep growing across trials -- confirm intended.
    """
    n_agents = len(agents)
    n_actions = len(ACTIONS)

    # Constraint system G x <= g for the projection QP:
    #   row 0          : sum(x) <= 1
    #   next n_actions :      x <= 1
    #   last n_actions :     -x <= 0
    identity = np.eye(n_actions)
    G = cvxopt.matrix(
        np.concatenate((np.ones((1, n_actions)), identity, -identity), axis=0)
    )
    g = cvxopt.matrix(
        np.concatenate(
            (np.ones((1, 1)), np.ones((n_actions, 1)), np.zeros((n_actions, 1))),
            axis=0,
        )
    )
    # With P = 2I and q = -2y the QP objective is ||x - y||^2 up to a constant.
    P = cvxopt.matrix(2 * identity)

    inv_e = math.exp(-1)

    for trial in range(N_TRIAL):
        np.random.seed(trial)
        for t in range(N_STEPS):
            # Move every target by one step.
            for target in targets:
                target.update_distance(agents)
                target.record_reward(agents)
                target.update_state(HORIZON, agents, t)
                target.traj.append(target.state)

            for i in range(n_agents):
                agent = agents[i]

                # Sample the next action from the agent's own row.
                own_probs = agent.pre_prob_dist[i]
                agent.next_action_index = np.random.choice(
                    agent.action_indices, 1, p=own_probs / sum(own_probs)
                )[0]

                # Aggregate the neighbors' previous distributions.
                agent.action_prob_dist = sum(
                    weight * agents[neighbor_id].pre_prob_dist
                    for neighbor_id, weight in agent.neighbors.items()
                )

                # Step 1: stochastic gradient estimate, averaged over samples.
                grad = np.zeros((1, n_actions))
                for _ in range(SAMPLE_SIZE):
                    # One uniform draw mapped into [0, 1]; it scales every
                    # inclusion probability used for this sample.
                    scale = math.log((1 - inv_e) * np.random.rand() + inv_e) + 1
                    candidates = []
                    for other in range(n_agents):
                        position = agents[other].state
                        for k in range(n_actions):
                            move = ACTIONS[k]
                            # One fresh uniform draw per (agent, action) pair.
                            if np.random.rand() <= scale * agent.pre_prob_dist[other][k]:
                                candidates.append(
                                    np.array(
                                        [position[0] + move[0], position[1] + move[1]]
                                    )
                                )
                    grad += (
                        agent.get_losses(candidates, targets).reshape(1, n_actions)
                        / SAMPLE_SIZE
                    )

                # Step 2: take the step, then project the agent's own row.
                agent.action_prob_dist[i] += 10 * grad[0] / math.sqrt(t + 1)
                q = cvxopt.matrix(-2 * agent.action_prob_dist[i])
                solution = cvxopt.solvers.qp(P, q, G, g)
                projected = np.array(solution['x']).reshape(1, n_actions)
                agent.action_prob_dist[i] = projected
                # The solver may return non-positive entries; mask them out.
                own_row = agent.action_prob_dist[i]
                agent.action_prob_dist[i] = (own_row > 0) * own_row

            # Only after every agent has updated: act and roll the
            # distributions over to the next time step.
            for i in range(n_agents):
                agent = agents[i]
                agent.apply_next_action()
                agent.traj.append(agent.state)
                agent.pre_prob_dist = copy.deepcopy(agent.action_prob_dist)

