%function Policy=RMAB_value_iteration
% Q-value iteration for a small bandit model with 4 states and 2 actions.
% The transition probabilities and rewards come from the local function
% model() defined further down in this file.  The script prints the
% (normalised) Q matrix, the best value per state and the greedy policy,
% and leaves the policy in the variable "Policy".

% initialization
clearvars                                   % reset workspace variables only (unlike "clear all", keeps loaded functions and breakpoints)


S = [1 2 3 4];                      % set of states.

action ={0 1};                            % set of actions.  maximum number of active bandits is 1


% make all (state,action) pairs indices
[A,B] = meshgrid(1:length(S),1:length(action));
C = cat(2,A',B');
D = reshape(C,[],2);                        % each row of D is one (state index, action index) pair; all combinations are present
%%

%%==================================
Q = zeros(length(S),length(action));    % initial Q can be chosen arbitrarily
Qold = Q;                                   % save a backup to compare later
Horizon = 10000;                                     % maximum number of iterations
gamma = 1;                                % discounting factor
epsilon = 0.0001;                            % final error to stop the algorithm
numIter = 0;                                 % number of sweeps actually performed
%% Stochastic Q-iteration algorithm
for l = 1:Horizon
    disp(['iteration: ' num2str(l)]);
    numIter = l;
    for ii = 1:size(D,1)
        s = D(ii,1);                        % state index of this pair
        a = D(ii,2);                        % action index of this pair (1 or 2, not the action value 0/1)
        [Next_state,Prob,Reward] = model(s,a); %% includes the next states and the probability and Reward

        % Expected one-step backup over every next state returned by the model:
        %   sum_k Prob(k) * (Reward + gamma * max_a' Q(Next_state(k), a'))
        % Q is updated in place, so later pairs in the same sweep already
        % see the values written earlier in this sweep.
        bestNext = max(Q(Next_state,:),[],2);           % column vector, one entry per next state
        Q(s,a) = Prob(:).' * (Reward + gamma*bestNext);
    end
    % Stop when no entry of Q moved by more than epsilon.  Summing the signed
    % differences would let positive and negative changes cancel and could
    % report convergence too early.
    if max(abs(Q(:) - Qold(:))) < epsilon
        disp('Epsilon criteria satisfied!');
        break;
    else
        % disp(Q);                            % show Q matrix in each iteration
        Qold = Q;
    end
end
%% show the results
% Normalise by the number of sweeps that were actually run (equals Horizon
% when the loop did not stop early).  Presumably this turns the accumulated
% undiscounted value into a per-iteration average -- TODO confirm intent.
Q=Q/max(numIter,1);
disp(Q)                                       % show the final Q matrix
[C,I]=max(Q,[],2);                              % best value (C) and best action index (I) for every state
disp('Q(optimal):');
disp(C);
disp('Optimal Policy');
disp('*');
Policy=I;                                     % action indices: 1 -> action{1}, 2 -> action{2}
disp(I)
%disp(S)
disp('*');
%%


 
%% find the possible next states together with their probabilities, and the reward
function [Next_state,Prob,Reward] = model(s,a)
% MODEL  One-step model for the pair (s,a).
%   s          - current state index
%   a          - action index
%   Next_state - row vector of candidate next states (always 1..4)
%   Prob       - row vector, Prob(k) = prob(s,a,Next_state(k))
%   Reward     - value of reward(s,a)

   Next_state = 1:4;

   % Query the transition probability for each candidate next state in turn.
   Prob = zeros(1, numel(Next_state));
   for k = 1:numel(Next_state)
       Prob(k) = prob(s, a, Next_state(k));
   end

   Reward = reward(s, a);
end
%% this function is the transition model of the bandit
% the inputs are: the current state, the chosen action, and the candidate next state
% the output is the probability of moving to that next state, taking the stochasticity into account
%
% ** in the deterministic case a model would give the next state directly
% ** but in the stochastic case the model gives the probability of going
% to the next state
function f = prob(s,a,s_next)
% PROB  Transition probability from state s to state s_next under action index a.
%   a = 1 selects the base kernel, a = 2 selects its transpose.
%   Rows are indexed by the current state, columns by the next state.
baseKernel = [0.5 0   0   0.5; ...
              0.5 0.5 0   0  ; ...
              0   0.5 0.5 0  ; ...
              0   0   0.5 0.5];

% One kernel per action index; the second is the transpose of the first.
kernels = {baseKernel; baseKernel.'};

chosen = kernels{a};
f = chosen(s, s_next);
end
%% this function is the reward function for the task (stochastic)
% the inputs are: the current state and the chosen action (the next state is not an input)
% the output is the expected reward for that (state, action) pair
function r = reward(s,a)
% REWARD  Reward for being in state s and taking action a.
%   State 1 yields -1, states 2 and 3 yield 0, any other state yields +1.
%   The action enters only through a term multiplied by zero, so it does
%   not change the result for finite a.

% State-dependent part of the reward.
if s == 1
    stateReward = -1;
elseif s == 2 || s == 3
    stateReward = 0;
else
    stateReward = 1;
end

% Action-dependent part; its coefficient is currently zero.
actionTerm = (1 - a) * (0);

r = stateReward + actionTerm;
end