clear all; close all; clc;

% Problem dimensions and run parameters
n = 3;               % number of states
m = 4;               % number of actions per state
num_iter = 1000000;  % number of iterations of the main loop
gamma = 0.8;         % weight on the continuation-payoff term in the Q updates
                     % (note: this variable shadows MATLAB's built-in gamma function)

% Payoff matrices: one m-by-m slice per state.
% Slice s of R1 has entries drawn uniformly from (0, s^2); slice s of R2 is
% the transpose of the corresponding slice of R1.
R1 = zeros(m,m,n);
R2 = zeros(m,m,n);
for s = 1:n
    payoff = (s^2)*rand(m,m);
    R1(:,:,s) = payoff;
    R2(:,:,s) = payoff.';
end
% Transition probabilities.
% pp{next_state, current_state}(j1, j2) is the probability of moving from
% current_state to next_state under the joint action index (j1, j2)
% (presumably j1 is player 1's action and j2 is player 2's -- confirm
% against the helpers that consume pp).
% For every (current_state, j1, j2) the entries across next_state sum to 1,
% and each is strictly positive because 0.1 is added before normalization.
pp = cell(n,n);
pp(:) = {zeros(m,m)}; % preallocate each m-by-m table instead of growing it entry by entry
for i = 1:n
    for j1 = 1:m
        for j2 = 1:m
            % random positive weights over the next states, normalized to sum to 1
            ppp = rand(n,1)*rand + 0.1*ones(n,1); ppp = ppp/sum(ppp);
            for ii = 1:n
                pp{ii,i}(j1,j2) = ppp(ii);
            end
        end
    end
end
state = 1; % initial state

% Initialization
% Q estimates start at the stage payoffs.
Q1 = R1;
Q2 = R2;
% Value histories: the main loop reads row k and writes row k+1 for
% k = 1..num_iter, so all three arrays need num_iter+1 rows. vdif used to be
% allocated with only num_iter rows, which forced MATLAB to grow (reallocate
% and copy) the whole array on the final iteration.
v1 = zeros(num_iter+1,n);
v2 = zeros(num_iter+1,n);
vdif = zeros(num_iter+1,n);
% p1(:,i) and p2(:,i) are random nonnegative vectors over the m actions in
% state i, normalized so that each column sums to 1.
p1 = zeros(m,n);
p2 = zeros(m,n);
for i = 1:n
    p1(:,i) = rand(m,1);
    p1(:,i) = p1(:,i)/sum(p1(:,i));
    p2(:,i) = rand(m,1);
    p2(:,i) = p2(:,i)/sum(p2(:,i));
end
% Iterations
% pstate counts how many iterations were spent in each state; it is turned
% into visit frequencies after the loop.
pstate = zeros(n,1);
for k = 1:num_iter
    pstate(state) = pstate(state) + 1;

    % Payoff of each own action against the other player's current mixed
    % strategy in the current state, evaluated with the current Q estimates.
    u1 = Q1(:,:,state)*p2(:,state);
    u2 = Q2(:,:,state)*p1(:,state);
    % best is defined outside this script; the updates below treat its
    % result as a vector with the same shape as p1(:,state).
    a1 = best(u1); % uniform tie breaking rule
    a2 = best(u2); % uniform tie breaking rule

    % Move each mixed strategy toward the selected action with step alpha(k-1).
    d1 = a1 - p1(:,state);
    p1(:,state) = p1(:,state) + alpha(k-1)*d1;
    d2 = a2 - p2(:,state);
    p2(:,state) = p2(:,state) + alpha(k-1)*d2;

    % Move the Q estimates of the current state toward
    % (stage payoff + gamma * continuation term) with step beta(k-1).
    % The helpers are called in the same order and as many times as before,
    % since their definitions are not visible here.
    step1 = beta(k-1);
    cont1 = exp_cont_payoff(v1(k,:),pp,state);
    Q1(:,:,state) = Q1(:,:,state) + step1*...
        (R1(:,:,state) + gamma*cont1 - Q1(:,:,state));
    step2 = beta(k-1);
    cont2 = exp_cont_payoff(v2(k,:),pp,state)'; % transposed, matching R2 = R1'
    Q2(:,:,state) = Q2(:,:,state) + step2*...
        (R2(:,:,state) + gamma*cont2 - Q2(:,:,state));

    % Record the value of every state for both players and their difference.
    for s = 1:n
        v1(k+1,s) = max(Q1(:,:,s)*p2(:,s));
        v2(k+1,s) = max(Q2(:,:,s)*p1(:,s));
    end
    vdif(k+1,:) = v1(k+1,:) - v2(k+1,:);

    state = next(pp,state,a1,a2);
end

% Fraction of iterations spent in each state
pstate = pstate/num_iter;

% Plot every 100th recorded value on a logarithmic iteration axis:
% v1 in red, v2 in blue and their difference in black, one triple per state.
figure,
samples = 100:100:num_iter;
for s = 1:n
    semilogx(samples, v1(samples,s).', 'r', ...
        samples, v2(samples,s).', 'b', ...
        samples, vdif(samples,s).', 'k')
    % hold is switched on only after the first semilogx call so that the
    % axes keep the logarithmic x scale for the remaining states
    hold on
end