function Policy=Whittle_index(R, lam1, lam2)
%WHITTLE_INDEX Simulation-based index policy for two bandits.
%   Policy = Whittle_index(R, lam1, lam2) simulates each bandit under every
%   threshold policy N = 0..10 (passive while the state is below N, active
%   otherwise), estimates the mean accumulated cost per threshold, and uses
%   the difference between consecutive thresholds as the index of a state.
%   The bandit with the larger index is selected in each joint state.
%
%   Inputs
%     R    - value used as rho; each bandit uses mu = lambda*rho in the
%            active transition probability lambda/(lambda + mu*s).
%     lam1 - lambda of bandit 1 (also the divisor of its per-step cost).
%     lam2 - lambda of bandit 2 (same role).
%
%   Output
%     Policy - column vector with one entry per joint state (121-by-1 for
%              the 11 states used here). Entry (n1-1)*11 + n2 is the
%              action code for bandit 1 in state n1-1 and bandit 2 in
%              state n2-1. Codes index the action list
%              {[0 0], [0 1], [1 0]}: 1 is returned only for the joint
%              state (0,0), 3 when W1(n1) >= W2(n2), 2 otherwise.
%
%   The result is stochastic: it depends on the current state of rand.
%   Seed the generator before calling if reproducibility is needed.

state = 0:1:10;              % set of states; at most 10 requests per bandit
nStates = numel(state);
maxState = max(state);

rho = R;
lambda1 = lam1;
lambda2 = lam2;
mu1 = lambda1*rho;
mu2 = lambda2*rho;

%% Estimate the expected cost of every threshold policy
Iteration = 100;             % number of sample paths averaged per threshold
T = 100;                     % number of time steps per sample path

Ecost1 = zeros(1, nStates);  % mean cost of bandit 1 under threshold N (index N+1)
Ecost2 = zeros(1, nStates);  % mean cost of bandit 2 under threshold N (index N+1)

s1 = zeros(1, T);            % state trajectories, preallocated once
s2 = zeros(1, T);

for N = 0:maxState
    % Reset the accumulators for every threshold. Without this the costs
    % of all lower thresholds leak into mean(C1)/mean(C2) and Ecost
    % becomes a running total instead of a per-threshold expectation.
    C1 = zeros(1, Iteration);
    C2 = zeros(1, Iteration);

    for index = 1:Iteration
        if index == 1
            % First path of each threshold starts from the mid state.
            s1(1) = 5;
            s2(1) = 5;
        else
            % Later paths continue from where the previous one ended.
            s1(1) = s1(T);
            s2(1) = s2(T);
        end

        for i = 2:T
            % Bandit 1 is advanced before bandit 2 so that random numbers
            % are consumed in a fixed order.
            s1(i) = threshold_step(s1(i-1), N, lambda1, mu1, maxState);
            s2(i) = threshold_step(s2(i-1), N, lambda2, mu2, maxState);

            % Cost is charged on the state the step started from.
            C1(index) = C1(index) + reward(s1(i-1), lambda1);
            C2(index) = C2(index) + reward(s2(i-1), lambda2);
        end
    end

    Ecost1(N+1) = mean(C1);
    Ecost2(N+1) = mean(C2);
end

%% Index of a state = cost increase from raising the threshold by one
% The first entry has no predecessor and stays 0.
W1 = [0, diff(Ecost1)];
W2 = [0, diff(Ecost2)];

%% Build the joint-state action table
I = zeros(nStates);
for n1 = 1:nStates
    for n2 = 1:nStates
        if n1 == 1 && n2 == 1
            I(n1, n2) = 1;           % both bandits in state 0
        elseif W1(n1) >= W2(n2)
            I(n1, n2) = 3;           % ties go to code 3
        else
            I(n1, n2) = 2;
        end
    end
end

% Flatten row by row: n2 varies fastest in the returned vector.
Policy = reshape(I.', [], 1);
end


function next = threshold_step(s, N, lambda, mu, maxState)
% One transition of a single bandit under the threshold policy N.
% Below the threshold the bandit is passive and its state grows by one.
% At or above it the bandit is active: it moves up with probability
% lambda/(lambda + mu*s) unless already at maxState, otherwise down.
if s < N
    next = s + 1;
else
    % Draw before testing the bound so one random number is consumed on
    % every active step, including steps taken from maxState.
    u = rand(1);
    if u < lambda/(lambda + mu*s) && s < maxState
        next = s + 1;
    else
        next = s - 1;
    end
end
end


%% this function is the per-step cost used by the simulation (deterministic)
% the inputs are: the current state s and a scaling value a
% (Whittle_index passes the bandit's lambda as a)
% the output is the ratio s/a; it does not depend on the chosen action or the next state
function value = reward(s,a)
%REWARD Ratio of the first argument to the second.
%   value = reward(s, a) returns s/a. No randomness is involved, and
%   nothing besides the two arguments is read.
value = s / a;
end