function Reward=UCWhittle1(N, M, P0, P0next, P0prev, P1, P1next, P1prev, H, gamma, epsilon, R, T)
%UCWHITTLE1 Simulate a Whittle-index policy driven by optimistic transition estimates.
%
%   Reward = UCWhittle1(N, M, P0, P0next, P0prev, P1, P1next, P1prev, ...
%                       H, gamma, epsilon, R, T)
%
%   For each of T episodes the function
%     1. computes a Whittle index table for every arm by calling
%        WhittleIndex1 with the current transition estimates,
%     2. runs H steps in which the M arms with the largest index (for their
%        current state) are activated and every arm performs one random
%        stay / next / prev transition,
%     3. refreshes the transition estimates from the accumulated counts.
%
%   Inputs
%     N        number of arms.
%     M        number of arms activated at every step.
%     P0       indexed as P0(i,t): probability that arm i keeps its state
%              under action 0 in episode t.
%     P0next   indexed as P0next(i,t): probability that arm i moves to
%              state+1 under action 0. The remaining probability mass moves
%              the arm to state-1.
%     P0prev   not used by this function (kept for interface compatibility).
%     P1, P1next, P1prev
%              same as the three arguments above, for action 1.
%     H        number of steps per episode.
%     gamma    discount factor applied to the per-step reward and forwarded
%              to WhittleIndex1.
%     epsilon  not used by this function (kept for interface compatibility).
%     R        vector of per-state rewards; length(R) is the number of states.
%     T        number of episodes.
%
%   Output
%     Reward   1-by-T vector; Reward(t) is the discounted sum over the H
%              steps of episode t of the rewards of all arms, evaluated at
%              the state reached after each transition.
%
%   Every arm is reset to state 1 at the beginning of each episode.
%   Moves past the first or last state leave the state unchanged but are
%   still counted as "prev" / "next" observations.

%% reward initialization
Reward=zeros(1,T);
numStates=length(R);

%% initialization of counts
% NOTE(review): every (arm, episode) cell starts at 1, so the cumulative sums
% used by the estimator below include one pseudo-count per elapsed episode.
% Confirm that this is intended.
C0=ones(N, T);
C0next=ones(N, T);   % recorded but not read by the estimator below
C0prev=ones(N, T);
C1=ones(N, T);
C1next=ones(N, T);
C1prev=ones(N, T);   % recorded but not read by the estimator below
eta=0.05;            % constant used inside the confidence radius

%% Initial probability
Q0next=0.25*ones(1,N);   % never updated afterwards
Q0prev=0.25*ones(1,N);
Q0=1-Q0next-Q0prev;
Q1next=0.25*ones(1,N);
Q1prev=0.25*ones(1,N);   % never updated afterwards
Q1=1-Q1next-Q1prev;
Q0Last=0.25*ones(1,N);
Q1Last=0.25*ones(1,N);

Whittle=zeros(N, numStates);
W=zeros(1, N);
for t=1:T

    %% Whittle Index Calculation
    % Numerator of the confidence radius for this episode.
    % NOTE(review): uses log10 rather than the natural logarithm; confirm.
    num=sqrt(2*log10(numStates*N*T*(t^4)/eta));
    state=ones(1, N);
    for i=1:N
        Whittle(i,:)=WhittleIndex1(R, Q0(i), Q0next(i), Q0prev(i), Q0Last(i), Q1(i), Q1next(i), Q1prev(i), Q1Last(i),  gamma);
    end

    %% Episodes
    for h=1:H

        %% action selection
        % Activate the M arms whose index at their current state is largest.
        for i=1:N
            W(i)=Whittle(i,state(i));
        end
        a=zeros(1, N);
        [~, index]=maxk(W, M);
        a(index)=1;

        %% State Transitions
        for i=1:N
            u=rand;
            if a(i)==1
                if u<=P1(i, t)
                    % stay
                    C1(i, t)=C1(i, t)+1;
                elseif u<=P1(i, t)+P1next(i, t)
                    % move up, saturating at the last state
                    C1next(i, t)=C1next(i, t)+1;
                    state(i)=min(state(i)+1, numStates);
                else
                    % move down, saturating at the first state
                    C1prev(i, t)=C1prev(i, t)+1;
                    state(i)=max(state(i)-1, 1);
                end
            else
                if u<=P0(i, t)
                    % stay
                    C0(i, t)=C0(i, t)+1;
                elseif u<=P0(i, t)+P0next(i, t)
                    % move up, saturating at the last state
                    C0next(i, t)=C0next(i, t)+1;
                    state(i)=min(state(i)+1, numStates);
                else
                    % move down, saturating at the first state
                    C0prev(i, t)=C0prev(i, t)+1;
                    state(i)=max(state(i)-1, 1);
                end
            end
        end

        %% Reward Calculation
        discount=gamma^(h-1);
        for i=1:N
            Reward(t)=Reward(t)+discount*R(state(i));
        end
    end

    %% Optimistic Probability
    for i=1:N
        % Action 0: only the "stay" and "prev" counts up to episode t are used.
        stay0=sum(C0(i,1:t));
        prev0=sum(C0prev(i,1:t));
        % in optimistic estimation, action value is maximized when last state is visited
        Q0Last(i)=min(num/sqrt(stay0+prev0), 1);
        % Empirical stay frequency, capped so that Q0 + Q0Last <= 1.
        Q0(i)=min(stay0/(stay0+prev0), 1-Q0Last(i));
        Q0prev(i)=max(1-Q0(i)-Q0Last(i), 0);

        % Action 1: only the "stay" and "next" counts up to episode t are used.
        stay1=sum(C1(i,1:t));
        next1=sum(C1next(i,1:t));
        Q1Last(i)=min(num/sqrt(stay1+next1), 1);
        % Empirical next frequency, capped so that Q1next + Q1Last <= 1.
        Q1next(i)=min(next1/(stay1+next1), 1-Q1Last(i));
        % Clamp at zero, mirroring Q0prev above (the original single-argument
        % max() call did not clamp anything).
        Q1(i)=max(1-Q1Last(i)-Q1next(i), 0);
    end
end
