function Reward=OurPolicyWindow(N, M, P0, P0next, P0prev, P1, P1next, P1prev, H, gamma, epsilon, win, R, T)
%OURPOLICYWINDOW Simulate a Whittle-index policy with windowed online estimates.
%
%   Reward = OurPolicyWindow(N, M, P0, P0next, P0prev, P1, P1next, P1prev,
%                            H, gamma, epsilon, win, R, T)
%
%   Runs T episodes of H steps each over N arms. At every step the M arms
%   with the largest index value (row of the Whittle table at the arm's
%   current state) are selected. Arms with i <= N/2 use the supplied
%   transition probabilities; arms with i > N/2 use probabilities that are
%   re-estimated from observed transition counts after every episode.
%
%   Inputs
%     N        number of arms.
%     M        number of arms selected at each step (passed to maxk).
%     P0       indexed as P0(i,t): probability that arm i keeps its state
%              in episode t when it is NOT selected; otherwise it moves one
%              state down (floored at state 1).
%     P1       indexed as P1(i,t): probability that arm i keeps its state
%              in episode t when it IS selected; otherwise it moves one
%              state up (capped at length(R)).
%     P0next, P0prev, P1next, P1prev
%              indexed as X(i,t); only forwarded to WhittleIndex1 for arms
%              i <= N/2. Their exact meaning is defined by WhittleIndex1
%              (not visible here) -- presumably up/down transition
%              probabilities; confirm against that helper.
%     H        number of steps per episode.
%     gamma    discount factor; step h is weighted by gamma^(h-1).
%     epsilon  initial value of the drift estimate; it is only ever
%              increased (running max) inside the function.
%     win      accepted for interface compatibility only: the incoming
%              value is never read, the window is recomputed from epsilon.
%     R        reward per state; length(R) is the number of states.
%     T        number of episodes.
%
%   Output
%     Reward   1-by-T vector; Reward(t) is the discounted reward summed
%              over all arms and all H steps of episode t.
%
%   Requires WhittleIndex1 on the path.

Reward=zeros(1,T);

% Per-episode transition counters, updated only for arms i > N/2.
% They start at 1 so the ratios below never divide by zero.
C0=ones(1, T);       % not selected, state kept
C0prev=ones(1, T);   % not selected, state moved down
C1=ones(1, T);       % selected, state kept
C1next=ones(1, T);   % selected, state moved up

% Confidence-radius numerator: delta = num / sqrt(number of samples).
eta=0.05;
num=sqrt(2*log10(N*T/eta));

% Initial estimates used for arms i > N/2 before any data is available.
Q0prev=0.5;
Q0=1-Q0prev;
Q1next=0.5;
Q1=1-Q1next;

numStates=length(R);
learned=(1:N)>N/2;            % arms whose dynamics are estimated online
Whittle=zeros(N, numStates);
W=zeros(1, N);
Qemp=zeros(1, T);             % per-episode empirical "stay" frequency (not selected)

for t=1:T
    state=ones(1, N);         % every arm restarts each episode in state 1

    % Index table for this episode: estimates for learned arms, supplied
    % probabilities for the others.
    for i=1:N
        if learned(i)
            Whittle(i,:)=WhittleIndex1(R, Q0, 0, Q0prev, 0, Q1, Q1next, 0, 0, gamma);
        else
            Whittle(i,:)=WhittleIndex1(R, P0(i, t), P0next(i, t), P0prev(i, t), 0, P1(i, t), P1next(i, t), P1prev(i, t), 0, gamma);
        end
    end

    for h=1:H
        % Select the M arms with the largest index at their current state.
        for i=1:N
            W(i)=Whittle(i, state(i));
        end
        [~, index]=maxk(W, M);
        a=zeros(1, N);
        a(index)=1;

        % Simulate one transition per arm (one rand draw per arm, in order).
        for i=1:N
            u=rand;
            if a(i)==1
                if u<=P1(i, t)
                    if learned(i)
                        C1(t)=C1(t)+1;
                    end
                else
                    if learned(i)
                        C1next(t)=C1next(t)+1;
                    end
                    state(i)=min(state(i)+1, numStates);
                end
            else
                if u<=P0(i, t)
                    if learned(i)
                        C0(t)=C0(t)+1;
                    end
                else
                    if learned(i)
                        C0prev(t)=C0prev(t)+1;
                    end
                    state(i)=max(state(i)-1, 1);
                end
            end
        end

        % Discounted reward collected in the post-transition states.
        for i=1:N
            Reward(t)=Reward(t)+gamma^(h-1)*R(state(i));
        end
    end

    % --- Selected-arm estimate: uses the full history 1..t. ---
    % Q1next is the upper-confidence estimate of the "move up" probability,
    % so its empirical part is the fraction of observed "move up" events.
    c=sum(C1(1:t));
    cnext=sum(C1next(1:t));
    delta=num/sqrt(c+cnext);
    Q1next=min(cnext/(c+cnext)+delta, 1);
    Q1=1-Q1next;

    % --- Drift (epsilon) estimate from consecutive empirical frequencies. ---
    if t>1
        % NOTE(review): the "stay" count spans episodes t-1..t while the
        % "move down" count spans episode t only. Kept as in the original;
        % confirm whether both were meant to use the same window.
        c=sum(C0(t-1:t));
        cprev=sum(C0prev(t:t));
        Qemp(t)=min(c/(c+cprev), 1);
    end
    if t>2
        epsilon=max(epsilon, 2*abs(Qemp(t)-Qemp(t-1)));
    end

    % Window length and drift allowance derived from epsilon. With
    % epsilon == 0 the product Inf*0 would be NaN, so treat that case as
    % "no drift": use the whole history and add no allowance.
    if epsilon>0
        win=ceil(1/epsilon);
        drift=win*epsilon/2;
    else
        win=Inf;
        drift=0;
    end

    % --- Not-selected estimate: uses only the last `win` episodes. ---
    lw=max(t-win+1, 1);
    c=sum(C0(lw:t));
    cprev=sum(C0prev(lw:t));
    delta=num/sqrt(c+cprev);
    Q0=min(c/(c+cprev)+delta+drift, 1);
    Q0prev=1-Q0;
end

end

