function Reward=OurPolicy1(user, M,  Channel, T, H, gamma, actualprob, w, epsilon)
%OURPOLICY1 Simulate an index-based scheduling policy with windowed estimation.
%
%   Reward = OurPolicy1(user, M, Channel, T, H, gamma, actualprob, w, epsilon)
%   runs T episodes of H steps each. In every step the M users with the
%   largest WhittleIndex value are selected; each selected user succeeds
%   with probability actualprob(i, t). After each episode the per-user
%   success probability used by WhittleIndex is re-estimated from the
%   success/failure counts of the most recent episodes plus an additive
%   bonus, clamped to 1.
%
%   Inputs
%     user       - number of users (loops run over 1:user).
%     M          - number of users selected in each step.
%     Channel    - not used by this function; kept for caller compatibility.
%     T          - number of episodes.
%     H          - number of steps per episode.
%     gamma      - discount factor; step h is weighted by gamma^(h-1).
%     actualprob - matrix indexed as actualprob(i, t): success probability
%                  of user i in episode t. Column 1 is also used as the
%                  initial estimate.
%     w          - per-user value passed to WhittleIndex and AoIfunction.
%     epsilon    - per-user value; sets the estimation window length
%                  ceil((1/epsilon(i))^(2/3)) and the bonus sqrt(epsilon(i)/2).
%
%   Output
%     Reward     - 1-by-T row vector; Reward(t) is the discounted sum over
%                  steps and users of AoIfunction(AoI(i), w(i)) in episode t.
%
%   Requires WhittleIndex and AoIfunction on the path (defined elsewhere).

Reward = zeros(1, T);
count0 = zeros(user, T);   % per-user, per-episode failure counts
count1 = zeros(user, T);   % per-user, per-episode success counts

eta = 0.05;
num = 2*log10(user*T/eta); % numerator of the additive bonus
prob = actualprob(:, 1)';  % initial estimate taken from the first column

% Probability of selecting M users uniformly at random instead of by
% index. It is 0, so the random branch below is never taken; the rand
% draw is kept so the random-number stream of seeded runs is unchanged.
exploreProb = 0;

% Estimation window length per user; it does not depend on the episode.
win = zeros(1, user);
for i = 1:user
    win(i) = ceil((1/epsilon(i))^(2/3));
end

Whittle = zeros(1, user);

for t = 1:T
    %% Episode
    AoI = zeros(1, user);

    for h = 1:H
        for i = 1:user
            Whittle(i) = WhittleIndex(AoI(i), w(i), prob(i));
        end

        if rand < exploreProb
            index = randperm(user, M);
        else
            [~, index] = maxk(Whittle, M);
        end

        scheduled = false(1, user);
        scheduled(index) = true;

        % Users are visited in ascending order so that the per-user rand
        % draws happen in a fixed sequence.
        for i = 1:user
            if scheduled(i) && rand <= actualprob(i, t)
                % Selected and succeeded: record it and reset the counter.
                count1(i, t) = count1(i, t) + 1;
                AoI(i) = 1;
            else
                if scheduled(i)
                    % Selected but failed.
                    count0(i, t) = count0(i, t) + 1;
                end
                AoI(i) = AoI(i) + 1;
            end
        end

        discount = gamma^(h-1);
        for i = 1:user
            Reward(t) = Reward(t) + discount*AoIfunction(AoI(i), w(i));
        end
    end

    %% Estimate
    for i = 1:user
        first = max(t - win(i) + 1, 1);   % first episode inside the window
        C0 = sum(count0(i, first:t));
        C1 = sum(count1(i, first:t));
        % Guard against an empty window: dividing by C1+C0 directly would
        % give 0/0 = NaN when the user was never selected in the window.
        den = max(C1 + C0, 1);
        prob(i) = min(C1/den + num/den + sqrt(epsilon(i)/2), 1);
    end
end




