% Driver script: builds a random MDP, derives the target/optimal policies and
% runs nine independent simulations (three attack/no-attack settings for each
% of the UCB-B, UCBVI and UCB-H agents) on a parallel pool, then saves the
% whole workspace to a MAT-file.
H=10; % horizon: number of steps per episode
S=10; % number of states
A=5; % number of actions
[P,R,P_kernel]=buildmodel(S,A);
[pi_tar,pi_opt,delta1,delta2,Q_tar,V_opt]=pi_gen(P,R,H,0.2);
%%  Set parameters
K=10^9; % number of episodes
core = 3; % number of parallel workers
delete(gcp('nocreate'));
parpool(core);
c1=1;
c2=0.25;
tic
% Each simulator returns one sample per perfect-square time step and, when
% H*K is not a perfect square, one extra final sample. That is
% ceil(sqrt(H*K)) entries in total, which is the length allocated here.
T=ceil(sqrt(H*K));
Loss_alpha_UCBH=zeros(1,T);
Cost_alpha_UCBH=zeros(1,T);
Loss_LCB_UCBH=zeros(1,T);
Cost_LCB_UCBH=zeros(1,T);
Regret_UCBH=zeros(1,T);
Regret_r_UCBH=zeros(1,T);
Loss_alpha_UCBVI=zeros(1,T);
Cost_alpha_UCBVI=zeros(1,T);
Loss_LCB_UCBVI=zeros(1,T);
Cost_LCB_UCBVI=zeros(1,T);
Regret_UCBVI=zeros(1,T);
Regret_r_UCBVI=zeros(1,T);
Loss_alpha_UCBB=zeros(1,T);
Cost_alpha_UCBB=zeros(1,T);
Loss_LCB_UCBB=zeros(1,T);
Cost_LCB_UCBB=zeros(1,T);
Regret_UCBB=zeros(1,T);
Regret_r_UCBB=zeros(1,T);
% One parfor iteration per experiment. The result vectors are accumulated
% with "+" so that parfor treats them as reduction variables.
% Every call ends with a semicolon: without it each worker would echo its
% complete result vectors to the command window.
parfor (i=1:9,core)
    if i==1
        [Loss_1,Cost_1]=alpha_UCBB(Q_tar,K,H,S,A,P_kernel,pi_tar,pi_opt,c1,c2,R);
        Loss_alpha_UCBB=Loss_1+Loss_alpha_UCBB;
        Cost_alpha_UCBB=Cost_1+Cost_alpha_UCBB;
    end
    if i==2
        [Loss_2,Cost_2]=LCBH_UCBB(K,H,S,A,P_kernel,pi_tar,pi_opt,c1,c2,R);
        Loss_LCB_UCBB=Loss_2+Loss_LCB_UCBB;
        Cost_LCB_UCBB=Cost_2+Cost_LCB_UCBB;
    end
    if i==3
        [Regret_1,Regret_2]=UCBB_agent(P,K,H,S,A,P_kernel,pi_opt,c1,c2,R);
        Regret_UCBB=Regret_UCBB+Regret_1;
        Regret_r_UCBB=Regret_r_UCBB+Regret_2;
    end
    if i==4
        [Loss_3,Cost_3]=alpha_UCBVI(Q_tar,K,H,S,A,P_kernel,pi_tar,pi_opt,c2,R);
        Loss_alpha_UCBVI=Loss_3+Loss_alpha_UCBVI;
        Cost_alpha_UCBVI=Cost_3+Cost_alpha_UCBVI;
    end
    if i==5
        [Loss_4,Cost_4]=LCBH_UCBVI(K,H,S,A,P_kernel,pi_tar,pi_opt,c2,R);
        Loss_LCB_UCBVI=Loss_4+Loss_LCB_UCBVI;
        Cost_LCB_UCBVI=Cost_4+Cost_LCB_UCBVI;
    end
    if i==6
        [Regret_3,Regret_4]=UCBVI_agent(P,K,H,S,A,P_kernel,pi_opt,c2,R);
        Regret_UCBVI=Regret_UCBVI+Regret_3;
        Regret_r_UCBVI=Regret_r_UCBVI+Regret_4;
    end
    if i==7
        [Loss_5,Cost_5]=alpha_UCBH(Q_tar,K,H,S,A,P_kernel,pi_tar,pi_opt,c2,R);
        Loss_alpha_UCBH=Loss_5+Loss_alpha_UCBH;
        Cost_alpha_UCBH=Cost_5+Cost_alpha_UCBH;
    end
    if i==8
        [Loss_6,Cost_6]=LCBH_UCBH(K,H,S,A,P_kernel,pi_tar,pi_opt,c2,R);
        Loss_LCB_UCBH=Loss_6+Loss_LCB_UCBH;
        Cost_LCB_UCBH=Cost_6+Cost_LCB_UCBH;
    end
    if i==9
        [Regret_5,Regret_6]=UCBH_agent(P,K,H,S,A,P_kernel,pi_opt,c2,R);
        Regret_UCBH=Regret_UCBH+Regret_5;
        Regret_r_UCBH=Regret_r_UCBH+Regret_6;
    end
end
toc
% Persist every workspace variable (model, policies and all result curves).
filename = 'UCBH_UCBVI_UCBB.mat';
save(filename)

%% plot UCBB
% Horizontal axis: entry i of every curve is plotted at t = i^2, except the
% last entry, which is plotted at the final time step t = K*H.
x=[(1:(length(Loss_LCB_UCBB)-1)).^2,K*H];
% Figure 1: logarithmic time axis.
figure()
semilogx(x,Loss_LCB_UCBB)
hold on % a single call keeps all following curves on the same axes
semilogx(x,Cost_LCB_UCBB)
semilogx(x,Loss_alpha_UCBB)
semilogx(x,Cost_alpha_UCBB)
semilogx(x,Regret_UCBB)
semilogx(x,Regret_r_UCBB)
%xlim([10^6 10^10])
legend('Loss of LCB-H black-box attack', 'Cost of LCB-H black-box attack','Loss of 1/H-portion white-box attack', 'Cost of 1/H-portion white-box attack','Non-optimal action pull count under no attack','Regret of UCB-B under no attack','Location','northwest')
xlabel('Time step (t)')
% Figure 2: same curves on a linear time axis.
figure()
plot(x,Loss_LCB_UCBB)
hold on
plot(x,Cost_LCB_UCBB)
plot(x,Loss_alpha_UCBB)
plot(x,Cost_alpha_UCBB)
plot(x,Regret_UCBB)
plot(x,Regret_r_UCBB)
legend('Loss of LCB-H black-box attack', 'Cost of LCB-H black-box attack','Loss of 1/H-portion white-box attack', 'Cost of 1/H-portion white-box attack','Non-optimal action pull count under no attack','Regret of UCB-B under no attack','Position',[0.5 0.3 0.1 0.2])
xlabel('Time step (t)')
%% plot UCBVI
% Horizontal axis: entry i of every curve is plotted at t = i^2, except the
% last entry, which is plotted at the final time step t = K*H.
x=[(1:(length(Loss_LCB_UCBVI)-1)).^2,K*H];
% Figure 1: logarithmic time axis.
figure()
semilogx(x,Loss_LCB_UCBVI)
hold on % a single call keeps all following curves on the same axes
semilogx(x,Cost_LCB_UCBVI)
semilogx(x,Loss_alpha_UCBVI)
semilogx(x,Cost_alpha_UCBVI)
semilogx(x,Regret_UCBVI)
semilogx(x,Regret_r_UCBVI)
%xlim([10^6 10^10])
legend('Loss of LCB-H black-box attack', 'Cost of LCB-H black-box attack','Loss of 1/H-portion white-box attack', 'Cost of 1/H-portion white-box attack','Non-optimal action pull count under no attack','Regret of UCBVI under no attack','Location','northwest')
xlabel('Time step (t)','FontSize',12)
% Figure 2: same curves on a linear time axis.
figure()
plot(x,Loss_LCB_UCBVI)
hold on
plot(x,Cost_LCB_UCBVI)
plot(x,Loss_alpha_UCBVI)
plot(x,Cost_alpha_UCBVI)
plot(x,Regret_UCBVI)
plot(x,Regret_r_UCBVI)
legend('Loss of LCB-H black-box attack', 'Cost of LCB-H black-box attack','Loss of 1/H-portion white-box attack', 'Cost of 1/H-portion white-box attack','Non-optimal action pull count under no attack','Regret of UCBVI under no attack','Position',[0.5 0.3 0.1 0.2])
xlabel('Time step (t)','FontSize',12)

%% plot UCBH
% Horizontal axis: entry i of every curve is plotted at t = i^2, except the
% last entry, which is plotted at the final time step t = K*H.
x=[(1:(length(Loss_LCB_UCBH)-1)).^2,K*H];
% Figure 1: logarithmic time axis.
figure()
semilogx(x,Loss_LCB_UCBH)
hold on % a single call keeps all following curves on the same axes
semilogx(x,Cost_LCB_UCBH)
semilogx(x,Loss_alpha_UCBH)
semilogx(x,Cost_alpha_UCBH)
semilogx(x,Regret_UCBH)
semilogx(x,Regret_r_UCBH)
%xlim([10^6 10^10])
legend('Loss of LCB-H black-box attack', 'Cost of LCB-H black-box attack','Loss of 1/H-portion white-box attack', 'Cost of 1/H-portion white-box attack','Non-optimal action pull count under no attack','Regret of UCB-H under no attack','Location','northwest')
xlabel('Time step (t)')
% Figure 2: same curves on a linear time axis.
figure()
plot(x,Loss_LCB_UCBH)
hold on
plot(x,Cost_LCB_UCBH)
plot(x,Loss_alpha_UCBH)
plot(x,Cost_alpha_UCBH)
plot(x,Regret_UCBH)
plot(x,Regret_r_UCBH)
legend('Loss of LCB-H black-box attack', 'Cost of LCB-H black-box attack','Loss of 1/H-portion white-box attack', 'Cost of 1/H-portion white-box attack','Non-optimal action pull count under no attack','Regret of UCB-H under no attack','Position',[0.5 0.3 0.1 0.2])
xlabel('Time step (t)')

%% 1/H-portion attack on UCBB
function [Loss,Cost]=alpha_UCBB(Q_tar,K,H,S,A,P_kernel,pi_tar,pi_opt,c1,c2,R)
%ALPHA_UCBB Simulate the 1/H-portion (white-box) action attack on the UCB-B agent.
%
%   Inputs
%     Q_tar    S-by-H-by-A array; the attacker plays its arg-min action when
%              it neither follows the agent nor forces the target action.
%     K, H     number of episodes and steps per episode.
%     S, A     number of states and actions.
%     P_kernel transition kernel passed through to state_tans.
%     pi_tar   S-by-H target policy the attacker steers towards.
%     pi_opt   accepted for interface compatibility; not used here.
%     c1, c2   scale factors of the two exploration-bonus candidates.
%     R        reward table passed through to reward_gen.
%
%   Outputs
%     Loss(i)  number of steps up to t = i^2 where the agent's own action
%              differed from pi_tar.
%     Cost(i)  number of steps up to t = i^2 where the executed action
%              differed from the agent's action.
%     If H*K is not a perfect square, the totals at t = H*K are appended as
%     one extra final entry.

%% Initialization
iota=log(2*S*A*K*H/0.1);
t=0;
loss=0;
cost=0;
T=fix(sqrt(H*K));
Loss=zeros(1,T);
Cost=zeros(1,T);
Q_agt=H*ones(S,H,A);        % optimistic initial Q estimates
N_agt=zeros(S,H,A);         % visit counts as seen by the agent
V_agt=H*ones(S,H+1);
V_agt(:,H+1)=zeros(S,1);    % value after the last step is zero
mu_agt=zeros(S,H,A);        % running sum of next-state values
sigma_agt=zeros(S,H,A);     % running sum of squared next-state values
beta_agt=zeros(S,H,A);      % previous bonus level per (s,h,a)
% main
for k=1:K
    s=randi(S);
    for h=1:H
        t=t+1;
        %agent algorithm: greedy in its current Q estimate
        [~,a_agt]=max(Q_agt(s,h,:));
        %attacker algorithm
        if a_agt == pi_tar(s,h)
            a_atk=a_agt;                      % already on target: no attack
        elseif rand > 1/H
            a_atk=pi_tar(s,h);                % force the target action
        else
            [~,a_atk]=min(Q_tar(s,h,:));      % play the lowest-Q_tar action
        end
        %environment step is driven by the attacker's action
        r=reward_gen(R,s,a_atk);
        s_next=state_tans(P_kernel,s,a_atk,S);
        cost=cost+(a_agt~=a_atk);
        loss=loss+(a_agt~=pi_tar(s,h));
        %record the running totals at perfect-square time steps
        root_t=sqrt(t);
        if mod(root_t,1)==0
            Cost(root_t)=cost;
            Loss(root_t)=loss;
        end
        %update agent's observations (credited to the action it chose)
        N_agt(s,h,a_agt)=N_agt(s,h,a_agt)+1;
        alpha=(H+1)/(H+N_agt(s,h,a_agt));
        mu_agt(s,h,a_agt)=mu_agt(s,h,a_agt)+V_agt(s_next,h+1);
        sigma_agt(s,h,a_agt)=sigma_agt(s,h,a_agt)+V_agt(s_next,h+1)^2;
        % NOTE(review): mu_agt is a raw sum, so (sigma_agt - mu_agt^2) can be
        % negative and sqrt then returns a complex value. Confirm the intended
        % variance estimate against the algorithm reference.
        beta=min([ c1*(sqrt(iota*H/N_agt(s,h,a_agt)*(H+(sigma_agt(s,h,a_agt)-mu_agt(s,h,a_agt)^2)/N_agt(s,h,a_agt))) + sqrt(H^7*S*A)*iota/N_agt(s,h,a_agt)), c2*sqrt(H^3*iota/N_agt(s,h,a_agt))]);
        b=(beta-(1-alpha)*beta_agt(s,h,a_agt))/(2*alpha);
        beta_agt(s,h,a_agt) = beta;
        Q_agt(s,h,a_agt)=(1-alpha)*Q_agt(s,h,a_agt)+alpha*(r+V_agt(s_next,h+1)+b);
        V_agt(s,h)=min(H,max(Q_agt(s,h,:)));
        s=s_next;
    end
end
% Append the final totals when the last time step was not itself recorded.
if mod(sqrt(t),1)~=0 
   Cost=[Cost,cost];
   Loss=[Loss,loss];
end
end

%% LCB-attack on UCBB
function [Loss,Cost]=LCBH_UCBB(K,H,S,A,P_kernel,pi_tar,pi_opt,c1,c2,R)
%LCBH_UCBB Simulate the LCB-H (black-box) action attack on the UCB-B agent.
%
%   The attacker keeps its own Q estimate (Q_atk) and lower bound (L) built
%   from the trajectories it observes. When it does not force the target
%   action it plays the non-target action with the smallest lower bound.
%
%   Inputs
%     K, H     number of episodes and steps per episode.
%     S, A     number of states and actions.
%     P_kernel transition kernel passed through to state_tans.
%     pi_tar   S-by-H target policy the attacker steers towards.
%     pi_opt   accepted for interface compatibility; not used here.
%     c1, c2   scale factors of the agent's two bonus candidates; c2 also
%              scales the attacker's confidence width.
%     R        reward table passed through to reward_gen.
%
%   Outputs
%     Loss(i)  number of steps up to t = i^2 where the agent's own action
%              differed from pi_tar.
%     Cost(i)  number of steps up to t = i^2 where the executed action
%              differed from the agent's action.
%     If H*K is not a perfect square, the totals at t = H*K are appended as
%     one extra final entry.
iota=log(2*S*A*K*H/0.1);
t=0;
loss=0;
cost=0;
T=fix(sqrt(H*K));
Loss=zeros(1,T);
Cost=zeros(1,T);
N_hsa=zeros(S,H,A);         % counts of executed (attacker-side) actions
L=-inf*ones(S,H,A);         % attacker's lower bounds; -inf until first visit
Q_agt=H*ones(S,H,A);
N_agt=zeros(S,H,A);
V_agt=H*ones(S,H+1);
V_agt(:,H+1)=zeros(S,1);
mu_agt=zeros(S,H,A);
sigma_agt=zeros(S,H,A);
beta_agt=zeros(S,H,A);
w=ones(1,H+1);              % per-step weights; w(H+1) stays 1
G=zeros(1,H+1);             % per-step rewards / returns; G(H+1) stays 0
Q_atk=zeros(S,H,A);
% main
for k=1:K
    s=randi(S);
    traj=zeros(2,H);        % row 1: states, row 2: executed actions
    for h=1:H
        t=t+1;
        %agent algorithm
        [~,a_agt]=max(Q_agt(s,h,:));
        %attacker algorithm
        if a_agt == pi_tar(s,h)
            a_atk=a_agt;
            w(h)=1;
        elseif rand > 1/H
            a_atk=pi_tar(s,h);
            w(h)=H/(H-1);
        else
            % pick the non-target action with the smallest lower bound
            min_L=inf;
            for a=1:A
                if a~=pi_tar(s,h) && L(s,h,a)<min_L
                    a_atk=a; %attack
                    min_L=L(s,h,a);
                end
            end
            w(h)=0;
        end
        %environment step is driven by the attacker's action
        r=reward_gen(R,s,a_atk);
        s_next=state_tans(P_kernel,s,a_atk,S);
        cost=cost+(a_agt~=a_atk);
        loss=loss+(a_agt~=pi_tar(s,h));
        %record the running totals at perfect-square time steps
        root_t=sqrt(t);
        if mod(root_t,1)==0
            Cost(root_t)=cost;
            Loss(root_t)=loss;
        end
        %update attacker's observations
        N_hsa(s,h,a_atk)=N_hsa(s,h,a_atk)+1;
        G(h)=r;
        traj(1,h)=s;
        traj(2,h)=a_atk;
        %update agent's observations (credited to the action it chose)
        N_agt(s,h,a_agt)=N_agt(s,h,a_agt)+1;
        alpha=(H+1)/(H+N_agt(s,h,a_agt));
        mu_agt(s,h,a_agt)=mu_agt(s,h,a_agt)+V_agt(s_next,h+1);
        sigma_agt(s,h,a_agt)=sigma_agt(s,h,a_agt)+V_agt(s_next,h+1)^2;
        % NOTE(review): mu_agt is a raw sum, so (sigma_agt - mu_agt^2) can be
        % negative and sqrt then returns a complex value. Confirm the intended
        % variance estimate against the algorithm reference.
        beta=min([ c1*(sqrt(iota*H/N_agt(s,h,a_agt)*(H+(sigma_agt(s,h,a_agt)-mu_agt(s,h,a_agt)^2)/N_agt(s,h,a_agt))) + sqrt(H^7*S*A)*iota/N_agt(s,h,a_agt)), c2*sqrt(H^3*iota/N_agt(s,h,a_agt))]);
        b=(beta-(1-alpha)*beta_agt(s,h,a_agt))/(2*alpha);
        beta_agt(s,h,a_agt) = beta;
        Q_agt(s,h,a_agt)=(1-alpha)*Q_agt(s,h,a_agt)+alpha*(r+V_agt(s_next,h+1)+b);
        V_agt(s,h)=min(H,max(Q_agt(s,h,:)));
        s=s_next;
    end
    % Backward pass over the episode: running-average update of Q_atk with the
    % weighted return, then turn G and w into cumulative quantities for the
    % next (earlier) step.
    for h=H:-1:1
        s=traj(1,h);
        a_atk=traj(2,h);
        Q_atk(s,h,a_atk)=(1-1/N_hsa(s,h,a_atk))*Q_atk(s,h,a_atk)+1/N_hsa(s,h,a_atk)*(G(h)+G(h+1)*w(h+1));
        L(s,h,a_atk)=Q_atk(s,h,a_atk)-c2*(exp(1)*(H-h)+1)*sqrt(2*iota/N_hsa(s,h,a_atk));
        G(h)=G(h)+G(h+1);
        w(h)=w(h)*w(h+1);
    end
end
% Append the final totals when the last time step was not itself recorded.
if mod(sqrt(t),1)~=0 
   Cost=[Cost,cost];
   Loss=[Loss,loss];
end
end

%% UCBB algorithm
function [Regret,Regret_r]=UCBB_agent(P,K,H,S,A,P_kernel,pi_opt,c1,c2,R)
%UCBB_AGENT Run the UCB-B agent with no attacker and track its regret.
%
%   Regret(i)   number of steps up to t = i^2 where the chosen action
%               differed from pi_opt.
%   Regret_r(i) accumulated V_opt(s_1,1) minus collected reward up to t = i^2.
%   If H*K is not a perfect square, the totals at t = H*K are appended as one
%   extra final entry.
[~,~,~,~,~,V_opt]=pi_gen(P,R,H,0);
logTerm=log(2*S*A*K*H/0.1);
nSamples=fix(sqrt(H*K));
Regret=zeros(1,nSamples);
Regret_r=zeros(1,nSamples);
step=0;
subOptCount=0;
rewardGap=0;
Qhat=H*ones(S,H,A);
visits=zeros(S,H,A);
Vhat=H*ones(S,H+1);
Vhat(:,H+1)=zeros(S,1);
sumV=zeros(S,H,A);       % running sum of next-state values
sumV2=zeros(S,H,A);      % running sum of squared next-state values
prevBeta=zeros(S,H,A);
for episode=1:K
    s=randi(S);
    rewardGap=rewardGap+V_opt(s,1);
    for h=1:H
        step=step+1;
        % greedy action; it is executed unchanged (no attacker)
        [~,act]=max(Qhat(s,h,:));
        r=reward_gen(R,s,act);
        s_next=state_tans(P_kernel,s,act,S);
        subOptCount=subOptCount+(act~=pi_opt(s,h));
        rewardGap=rewardGap-r;
        % snapshot at perfect-square time steps
        root=sqrt(step);
        if root==floor(root)
            Regret(root)=subOptCount;
            Regret_r(root)=rewardGap;
        end
        % learning update
        visits(s,h,act)=visits(s,h,act)+1;
        n=visits(s,h,act);
        vNext=Vhat(s_next,h+1);
        lr=(H+1)/(H+n);
        sumV(s,h,act)=sumV(s,h,act)+vNext;
        sumV2(s,h,act)=sumV2(s,h,act)+vNext^2;
        % NOTE(review): sumV is a raw sum, so this term can be negative and
        % sqrt then returns a complex value. Confirm the intended estimate.
        varTerm=(sumV2(s,h,act)-sumV(s,h,act)^2)/n;
        candA=c1*(sqrt(logTerm*H/n*(H+varTerm)) + sqrt(H^7*S*A)*logTerm/n);
        candB=c2*sqrt(H^3*logTerm/n);
        betaNow=min([candA, candB]);
        bonus=(betaNow-(1-lr)*prevBeta(s,h,act))/(2*lr);
        prevBeta(s,h,act)=betaNow;
        Qhat(s,h,act)=(1-lr)*Qhat(s,h,act)+lr*(r+vNext+bonus);
        Vhat(s,h)=min(H,max(Qhat(s,h,:)));
        s=s_next;
    end
end
% final totals are appended when the last step was not a perfect square
if mod(sqrt(step),1)~=0
    Regret(end+1)=subOptCount;
    Regret_r(end+1)=rewardGap;
end
end


%% 1/H-portion attack on UCBVI
function [Loss,Cost]=alpha_UCBVI(Q_tar,K,H,S,A,P_kernel,pi_tar,pi_opt,c,R)
%ALPHA_UCBVI Simulate the 1/H-portion (white-box) action attack on the UCBVI agent.
%
%   Inputs
%     Q_tar    S-by-H-by-A array; the attacker plays its arg-min action when
%              it neither follows the agent nor forces the target action.
%     K, H     number of episodes and steps per episode.
%     S, A     number of states and actions.
%     P_kernel transition kernel passed through to state_tans.
%     pi_tar   S-by-H target policy the attacker steers towards.
%     pi_opt   accepted for interface compatibility; not used here.
%     c        scale factor of the agent's exploration bonus.
%     R        reward table passed through to reward_gen.
%
%   Outputs
%     Loss(i)  number of steps up to t = i^2 where the agent's own action
%              differed from pi_tar.
%     Cost(i)  number of steps up to t = i^2 where the executed action
%              differed from the agent's action.
%     If H*K is not a perfect square, the totals at t = H*K are appended as
%     one extra final entry.

%% Initialization
iota=log(2*S*A*K*H/0.1);
t=0;
loss=0;
cost=0;
T=fix(sqrt(H*K));
Loss=zeros(1,T);
Cost=zeros(1,T);

N_agt=zeros(S,H,A);         % visit counts as seen by the agent
r_hsa=zeros(S,H,A);         % running mean reward per (s,h,a)
N_hssa=zeros(S,S,H,A);      % transition counts (s, s_next, h, a)

Q_agt=H*ones(S,H,A);        % optimistic initial Q estimates

% main
for k=1:K
    s=randi(S);
    for h=1:H
        t=t+1;
        %agent algorithm: greedy in its current Q estimate
        [~,a_agt]=max(Q_agt(s,h,:));
        %attacker algorithm
        if a_agt == pi_tar(s,h)
            a_atk=a_agt;                      % already on target: no attack
        elseif rand > 1/H
            a_atk=pi_tar(s,h);                % force the target action
        else
            [~,a_atk]=min(Q_tar(s,h,:));      % play the lowest-Q_tar action
        end
        %environment step is driven by the attacker's action
        r=reward_gen(R,s,a_atk);
        s_next=state_tans(P_kernel,s,a_atk,S);
        cost=cost+(a_agt~=a_atk);
        loss=loss+(a_agt~=pi_tar(s,h));
        %record the running totals at perfect-square time steps
        root_t=sqrt(t);
        if mod(root_t,1)==0
            Cost(root_t)=cost;
            Loss(root_t)=loss;
        end
        %update agent's observations (credited to the action it chose)
        N_agt(s,h,a_agt)=N_agt(s,h,a_agt)+1;
        N_hssa(s,s_next,h,a_agt)=N_hssa(s,s_next,h,a_agt)+1;
        r_hsa(s,h,a_agt)=(1-1/N_agt(s,h,a_agt))*r_hsa(s,h,a_agt)+r/N_agt(s,h,a_agt);
        s=s_next;
    end
    % Planning step after each episode: backward induction on the empirical
    % model with bonus b. Q_agt is only ever lowered, never raised.
    V_agt=zeros(S,H+1);
    for h=H:-1:1
        for s =1:S
            for a =1:A
                if N_agt(s,h,a)~= 0
                    b=H*c*sqrt(iota/N_agt(s,h,a));
                    Q_bonus=r_hsa(s,h,a)+b;
                    for ss =1:S
                        Q_bonus=Q_bonus+N_hssa(s,ss,h,a)./N_agt(s,h,a)*V_agt(ss,h+1);
                    end
                    if Q_bonus < Q_agt(s,h,a)
                        Q_agt(s,h,a) = Q_bonus;
                    end
                end
                if Q_agt(s,h,a) > V_agt(s,h)
                    V_agt(s,h)=Q_agt(s,h,a);
                end
            end
        end
    end
end
% Append the final totals when the last time step was not itself recorded.
if mod(sqrt(t),1)~=0 
   Cost=[Cost,cost];
   Loss=[Loss,loss];
end
end

%% LCB-attack on UCBVI
function [Loss,Cost]=LCBH_UCBVI(K,H,S,A,P_kernel,pi_tar,pi_opt,c,R)
%LCBH_UCBVI Simulate the LCB-H (black-box) action attack on the UCBVI agent.
%
%   The attacker keeps its own Q estimate (Q_atk) and lower bound (L) built
%   from the trajectories it observes. When it does not force the target
%   action it plays the non-target action with the smallest lower bound.
%
%   Inputs
%     K, H     number of episodes and steps per episode.
%     S, A     number of states and actions.
%     P_kernel transition kernel passed through to state_tans.
%     pi_tar   S-by-H target policy the attacker steers towards.
%     pi_opt   accepted for interface compatibility; not used here.
%     c        scale factor of the agent's bonus and of the attacker's
%              confidence width.
%     R        reward table passed through to reward_gen.
%
%   Outputs
%     Loss(i)  number of steps up to t = i^2 where the agent's own action
%              differed from pi_tar.
%     Cost(i)  number of steps up to t = i^2 where the executed action
%              differed from the agent's action.
%     If H*K is not a perfect square, the totals at t = H*K are appended as
%     one extra final entry.
iota=log(2*S*A*K*H/0.1);
t=0;
loss=0;
cost=0;
T=fix(sqrt(H*K));
Loss=zeros(1,T);
Cost=zeros(1,T);
N_hsa=zeros(S,H,A);         % counts of executed (attacker-side) actions
L=-inf*ones(S,H,A);         % attacker's lower bounds; -inf until first visit
Q_agt=H*ones(S,H,A);

N_agt=zeros(S,H,A);         % visit counts as seen by the agent
r_hsa=zeros(S,H,A);         % running mean reward per (s,h,a)
N_hssa=zeros(S,S,H,A);      % transition counts (s, s_next, h, a)

w=ones(1,H+1);              % per-step weights; w(H+1) stays 1
G=zeros(1,H+1);             % per-step rewards / returns; G(H+1) stays 0
Q_atk=zeros(S,H,A);
% main
for k=1:K
    s=randi(S);
    traj=zeros(2,H);        % row 1: states, row 2: executed actions
    for h=1:H
        t=t+1;
        %agent algorithm
        [~,a_agt]=max(Q_agt(s,h,:));
        %attacker algorithm
        if a_agt == pi_tar(s,h)
            a_atk=a_agt;
            w(h)=1;
        elseif rand > 1/H
            a_atk=pi_tar(s,h);
            w(h)=H/(H-1);
        else
            % pick the non-target action with the smallest lower bound
            min_L=inf;
            for a=1:A
                if a~=pi_tar(s,h) && L(s,h,a)<min_L
                    a_atk=a; %attack
                    min_L=L(s,h,a);
                end
            end
            w(h)=0;
        end
        %environment step is driven by the attacker's action
        r=reward_gen(R,s,a_atk);
        s_next=state_tans(P_kernel,s,a_atk,S);
        cost=cost+(a_agt~=a_atk);
        loss=loss+(a_agt~=pi_tar(s,h));
        %record the running totals at perfect-square time steps
        root_t=sqrt(t);
        if mod(root_t,1)==0
            Cost(root_t)=cost;
            Loss(root_t)=loss;
        end
        %update attacker's observations
        N_hsa(s,h,a_atk)=N_hsa(s,h,a_atk)+1;
        G(h)=r;
        traj(1,h)=s;
        traj(2,h)=a_atk;
        %update agent's observations (credited to the action it chose)
        N_agt(s,h,a_agt)=N_agt(s,h,a_agt)+1;
        N_hssa(s,s_next,h,a_agt)=N_hssa(s,s_next,h,a_agt)+1;
        r_hsa(s,h,a_agt)=(1-1/N_agt(s,h,a_agt))*r_hsa(s,h,a_agt)+r/N_agt(s,h,a_agt);
        s=s_next;
    end
    % Backward pass over the episode: running-average update of Q_atk with the
    % weighted return, then turn G and w into cumulative quantities for the
    % next (earlier) step.
    for h=H:-1:1
        s=traj(1,h);
        a_atk=traj(2,h);
        Q_atk(s,h,a_atk)=(1-1/N_hsa(s,h,a_atk))*Q_atk(s,h,a_atk)+1/N_hsa(s,h,a_atk)*(G(h)+G(h+1)*w(h+1));
        L(s,h,a_atk)=Q_atk(s,h,a_atk)-c*(exp(1)*(H-h)+1)*sqrt(2*iota/N_hsa(s,h,a_atk));
        G(h)=G(h)+G(h+1);
        w(h)=w(h)*w(h+1);
    end
    % Agent's planning step: backward induction on its empirical model with
    % bonus b. Q_agt is only ever lowered, never raised.
    V_agt=zeros(S,H+1);
    for h=H:-1:1
        for s =1:S
            for a =1:A
                if N_agt(s,h,a)~= 0
                    b=H*c*sqrt(iota/N_agt(s,h,a));
                    Q_bonus=r_hsa(s,h,a)+b;
                    for ss =1:S
                        Q_bonus=Q_bonus+N_hssa(s,ss,h,a)./N_agt(s,h,a)*V_agt(ss,h+1);
                    end
                    if Q_bonus < Q_agt(s,h,a)
                        Q_agt(s,h,a) = Q_bonus;
                    end
                end
                if Q_agt(s,h,a) > V_agt(s,h)
                    V_agt(s,h)=Q_agt(s,h,a);
                end
            end
        end
    end
end
% Append the final totals when the last time step was not itself recorded.
if mod(sqrt(t),1)~=0 
   Cost=[Cost,cost];
   Loss=[Loss,loss];
end
end

%% UCBVI algorithm
function [Regret,Regret_r]=UCBVI_agent(P,K,H,S,A,P_kernel,pi_opt,c,R)
%UCBVI_AGENT Run the UCBVI agent with no attacker and track its regret.
%
%   Regret(i)   number of steps up to t = i^2 where the chosen action
%               differed from pi_opt.
%   Regret_r(i) accumulated V_opt(s_1,1) minus collected reward up to t = i^2.
%   If H*K is not a perfect square, the totals at t = H*K are appended as one
%   extra final entry.
[~,~,~,~,~,V_opt]=pi_gen(P,R,H,0);
logTerm=log(2*S*A*K*H/0.1);
nSamples=fix(sqrt(H*K));
Regret=zeros(1,nSamples);
Regret_r=zeros(1,nSamples);
step=0;
subOptCount=0;
rewardGap=0;
Qhat=H*ones(S,H,A);
visits=zeros(S,H,A);
meanRew=zeros(S,H,A);        % running mean reward per (s,h,a)
transCount=zeros(S,S,H,A);   % counts of (s, s_next, h, a)
for episode=1:K
    s=randi(S);
    rewardGap=rewardGap+V_opt(s,1);
    % ---- roll out one episode ----
    for h=1:H
        step=step+1;
        % greedy action; it is executed unchanged (no attacker)
        [~,act]=max(Qhat(s,h,:));
        r=reward_gen(R,s,act);
        s_next=state_tans(P_kernel,s,act,S);
        subOptCount=subOptCount+(act~=pi_opt(s,h));
        rewardGap=rewardGap-r;
        % snapshot at perfect-square time steps
        root=sqrt(step);
        if root==floor(root)
            Regret(root)=subOptCount;
            Regret_r(root)=rewardGap;
        end
        % update the empirical model
        visits(s,h,act)=visits(s,h,act)+1;
        transCount(s,s_next,h,act)=transCount(s,s_next,h,act)+1;
        meanRew(s,h,act)=(1-1/visits(s,h,act))*meanRew(s,h,act)+r/visits(s,h,act);
        s=s_next;
    end
    % ---- planning: backward induction with bonus; Qhat never increases ----
    Vhat=zeros(S,H+1);
    for h=H:-1:1
        for st=1:S
            for a=1:A
                if N_is_positive(visits(st,h,a))
                    bonus=H*c*sqrt(logTerm/visits(st,h,a));
                    qNew=meanRew(st,h,a)+bonus;
                    for nxt=1:S
                        qNew=qNew+transCount(st,nxt,h,a)./visits(st,h,a)*Vhat(nxt,h+1);
                    end
                    if qNew < Qhat(st,h,a)
                        Qhat(st,h,a)=qNew;
                    end
                end
                if Qhat(st,h,a) > Vhat(st,h)
                    Vhat(st,h)=Qhat(st,h,a);
                end
            end
        end
    end
end
% final totals are appended when the last step was not a perfect square
if mod(sqrt(step),1)~=0
    Regret(end+1)=subOptCount;
    Regret_r(end+1)=rewardGap;
end

    function tf=N_is_positive(n)
        % True when the (s,h,a) triple has been visited at least once.
        tf=(n~=0);
    end
end

%% 1/H-portion UCBH
function [Loss,Cost]=alpha_UCBH(Q_tar,K,H,S,A,P_kernel,pi_tar,pi_opt,c,R)
%ALPHA_UCBH Simulate the 1/H-portion (white-box) action attack on the UCB-H agent.
%
%   Inputs
%     Q_tar    S-by-H-by-A array; the attacker plays its arg-min action when
%              it neither follows the agent nor forces the target action.
%     K, H     number of episodes and steps per episode.
%     S, A     number of states and actions.
%     P_kernel transition kernel passed through to state_tans.
%     pi_tar   S-by-H target policy the attacker steers towards.
%     pi_opt   accepted for interface compatibility; not used here.
%     c        scale factor of the agent's exploration bonus.
%     R        reward table passed through to reward_gen.
%
%   Outputs
%     Loss(i)  number of steps up to t = i^2 where the agent's own action
%              differed from pi_tar.
%     Cost(i)  number of steps up to t = i^2 where the executed action
%              differed from the agent's action.
%     If H*K is not a perfect square, the totals at t = H*K are appended as
%     one extra final entry.

%% Initialization
iota=log(2*S*A*K*H/0.1);
t=0;
T=fix(sqrt(H*K));
loss=0;
cost=0;
Loss=zeros(1,T);
Cost=zeros(1,T);
Q_agt=H*ones(S,H,A);        % optimistic initial Q estimates
N_agt=zeros(S,H,A);         % visit counts as seen by the agent
V_agt=H*ones(S,H+1);
V_agt(:,H+1)=zeros(S,1);    % value after the last step is zero
% main
for k=1:K
    s=randi(S);
    for h=1:H
        t=t+1;
        %agent algorithm: greedy in its current Q estimate
        [~,a_agt]=max(Q_agt(s,h,:));
        %attacker algorithm
        if a_agt == pi_tar(s,h)
            a_atk=a_agt;                      % already on target: no attack
        elseif rand > 1/H
            a_atk=pi_tar(s,h);                % force the target action
        else
            [~,a_atk]=min(Q_tar(s,h,:));      % play the lowest-Q_tar action
        end
        %environment step is driven by the attacker's action
        r=reward_gen(R,s,a_atk);
        s_next=state_tans(P_kernel,s,a_atk,S);
        cost=cost+(a_agt~=a_atk);
        loss=loss+(a_agt~=pi_tar(s,h));
        %record the running totals at perfect-square time steps
        root_t=sqrt(t);
        if mod(root_t,1)==0
            Cost(root_t)=cost;
            Loss(root_t)=loss;
        end
        %update agent's observations (credited to the action it chose)
        N_agt(s,h,a_agt)=N_agt(s,h,a_agt)+1;
        b=c*sqrt(H*(H-h+1)^2*iota/N_agt(s,h,a_agt));
        alpha=(H+1)/(H+N_agt(s,h,a_agt));
        Q_agt(s,h,a_agt)=(1-alpha)*Q_agt(s,h,a_agt)+alpha*(r+V_agt(s_next,h+1)+b);
        V_agt(s,h)=min(H,max(Q_agt(s,h,:)));
        s=s_next;
    end

end
% Append the final totals when the last time step was not itself recorded.
if mod(sqrt(t),1)~=0 
   Cost=[Cost,cost];
   Loss=[Loss,loss];
end
end

%% LCB-attack UCBH
function [Loss,Cost]=LCBH_UCBH(K,H,S,A,P_kernel,pi_tar,pi_opt,c,R)
%LCBH_UCBH Simulate the LCB-H (black-box) action attack on the UCB-H agent.
%
%   The attacker keeps its own Q estimate (Q_atk) and lower bound (L) built
%   from the trajectories it observes. When it does not force the target
%   action it plays the non-target action with the smallest lower bound.
%
%   Inputs
%     K, H     number of episodes and steps per episode.
%     S, A     number of states and actions.
%     P_kernel transition kernel passed through to state_tans.
%     pi_tar   S-by-H target policy the attacker steers towards.
%     pi_opt   accepted for interface compatibility; not used here.
%     c        scale factor of the agent's bonus and of the attacker's
%              confidence width.
%     R        reward table passed through to reward_gen.
%
%   Outputs
%     Loss(i)  number of steps up to t = i^2 where the agent's own action
%              differed from pi_tar.
%     Cost(i)  number of steps up to t = i^2 where the executed action
%              differed from the agent's action.
%     If H*K is not a perfect square, the totals at t = H*K are appended as
%     one extra final entry.
iota=log(2*S*A*K*H/0.1);
t=0;
T=fix(sqrt(H*K));
loss=0;
cost=0;
Loss=zeros(1,T);
Cost=zeros(1,T);
N_hsa=zeros(S,H,A);         % counts of executed (attacker-side) actions
L=-inf*ones(S,H,A);         % attacker's lower bounds; -inf until first visit
Q_agt=H*ones(S,H,A);
N_agt=zeros(S,H,A);
V_agt=H*ones(S,H+1);
V_agt(:,H+1)=zeros(S,1);
w=ones(1,H+1);              % per-step weights; w(H+1) stays 1
G=zeros(1,H+1);             % per-step rewards / returns; G(H+1) stays 0
Q_atk=zeros(S,H,A);
% main
for k=1:K
    s=randi(S);
    traj=zeros(2,H);        % row 1: states, row 2: executed actions
    for h=1:H
        t=t+1;
        %agent algorithm
        [~,a_agt]=max(Q_agt(s,h,:));
        %attacker algorithm
        if a_agt == pi_tar(s,h)
            a_atk=a_agt;
            w(h)=1;
        elseif rand > 1/H
            a_atk=pi_tar(s,h);
            w(h)=H/(H-1);
        else
            % pick the non-target action with the smallest lower bound
            min_L=inf;
            for a=1:A
                if a~=pi_tar(s,h) && L(s,h,a)<min_L
                    a_atk=a; %attack
                    min_L=L(s,h,a);
                end
            end
            w(h)=0;
        end
        %environment step is driven by the attacker's action
        r=reward_gen(R,s,a_atk);
        s_next=state_tans(P_kernel,s,a_atk,S);
        cost=cost+(a_agt~=a_atk);
        loss=loss+(a_agt~=pi_tar(s,h));
        %record the running totals at perfect-square time steps
        root_t=sqrt(t);
        if mod(root_t,1)==0
            Cost(root_t)=cost;
            Loss(root_t)=loss;
        end
        %update attacker's observations
        N_hsa(s,h,a_atk)=N_hsa(s,h,a_atk)+1;
        G(h)=r;
        traj(1,h)=s;
        traj(2,h)=a_atk;
        %update agent's observations (credited to the action it chose)
        N_agt(s,h,a_agt)=N_agt(s,h,a_agt)+1;
        b=c*sqrt(H*(H-h+1)^2*iota/N_agt(s,h,a_agt));
        alpha=(H+1)/(H+N_agt(s,h,a_agt));
        Q_agt(s,h,a_agt)=(1-alpha)*Q_agt(s,h,a_agt)+alpha*(r+V_agt(s_next,h+1)+b);
        V_agt(s,h)=min(H,max(Q_agt(s,h,:)));
        s=s_next;
    end
    % Backward pass over the episode: running-average update of Q_atk with the
    % weighted return, then turn G and w into cumulative quantities for the
    % next (earlier) step.
    for h=H:-1:1
        s=traj(1,h);
        a_atk=traj(2,h);
        Q_atk(s,h,a_atk)=(1-1/N_hsa(s,h,a_atk))*Q_atk(s,h,a_atk)+1/N_hsa(s,h,a_atk)*(G(h)+G(h+1)*w(h+1));
        L(s,h,a_atk)=Q_atk(s,h,a_atk)-c*(exp(1)*(H-h)+1)*sqrt(2*iota/N_hsa(s,h,a_atk));
        G(h)=G(h)+G(h+1);
        w(h)=w(h)*w(h+1);
    end
end
% Append the final totals when the last time step was not itself recorded.
if mod(sqrt(t),1)~=0 
   Cost=[Cost,cost];
   Loss=[Loss,loss];
end
end

%% UCBH algorithm
function [Regret,Regret_r]=UCBH_agent(P,K,H,S,A,P_kernel,pi_opt,c,R)
%UCBH_AGENT Run the UCB-H agent with no attacker and track its regret.
%
%   Regret(i)   number of steps up to t = i^2 where the chosen action
%               differed from pi_opt.
%   Regret_r(i) accumulated V_opt(s_1,1) minus collected reward up to t = i^2.
%   If H*K is not a perfect square, the totals at t = H*K are appended as one
%   extra final entry.
[~,~,~,~,~,V_opt]=pi_gen(P,R,H,0);
logTerm=log(2*S*A*K*H/0.1);
nSamples=fix(sqrt(H*K));
Regret=zeros(1,nSamples);
Regret_r=zeros(1,nSamples);
step=0;
subOptCount=0;
rewardGap=0;
Qhat=H*ones(S,H,A);
visits=zeros(S,H,A);
Vhat=H*ones(S,H+1);
Vhat(:,H+1)=zeros(S,1);
for episode=1:K
    s=randi(S);
    rewardGap=rewardGap+V_opt(s,1);
    for h=1:H
        step=step+1;
        % greedy action; it is executed unchanged (no attacker)
        [~,act]=max(Qhat(s,h,:));
        r=reward_gen(R,s,act);
        s_next=state_tans(P_kernel,s,act,S);
        subOptCount=subOptCount+(act~=pi_opt(s,h));
        rewardGap=rewardGap-r;
        % snapshot at perfect-square time steps
        root=sqrt(step);
        if root==floor(root)
            Regret(root)=subOptCount;
            Regret_r(root)=rewardGap;
        end
        % Q-learning update with a count-based bonus
        visits(s,h,act)=visits(s,h,act)+1;
        n=visits(s,h,act);
        bonus=c*sqrt(H*(H-h+1)^2*logTerm/n);
        lr=(H+1)/(H+n);
        Qhat(s,h,act)=(1-lr)*Qhat(s,h,act)+lr*(r+Vhat(s_next,h+1)+bonus);
        Vhat(s,h)=min(H,max(Qhat(s,h,:)));
        s=s_next;
    end
end
% final totals are appended when the last step was not a perfect square
if mod(sqrt(step),1)~=0
    Regret(end+1)=subOptCount;
    Regret_r(end+1)=rewardGap;
end
end

%% Functions
function [P,R,P_kernel]=buildmodel(S,A)
%BUILDMODEL Build a random MDP on a ring of S states.
%
%   Each action moves the state by an offset in -2..+2 (wrapping around the
%   ring). Offset j-3 is taken with probability kernel(j); action a puts
%   the bulk of its probability mass on kernel entry a.
%
%   Inputs
%     S  number of states (any positive integer; no longer tied to 10).
%     A  number of actions, at most 5 (one per kernel entry).
%
%   Outputs
%     P        S-by-S-by-A array, P(s,s2,a) = probability of s -> s2 under a.
%     R        S-by-A array of reward probabilities; each row is a random
%              permutation of A evenly spaced values from 0.2 to 0.8.
%     P_kernel S-by-5-by-A array, P_kernel(s,:,a) = offset distribution.
%
%   The transition matrix is filled by direct circular indexing, so the
%   ring size follows S and no circular-convolution routine is needed.
    kernelLen=5;                       % offsets -2..+2
    if A>kernelLen
        error('buildmodel:tooManyActions', ...
            'A must not exceed %d (one kernel entry per action).',kernelLen);
    end
    P=zeros(S,S,A);
    P_kernel=zeros(S,kernelLen,A);
    p_main=rand(S,A)/2;
    R_set=0.2:0.6/(A-1):0.8;
    R=zeros(S,A);
    for s=1:S
        % destination of kernel entry j is state s+j-3, wrapped onto 1..S
        dest=mod(s+(1:kernelLen)-4,S)+1;
        for a=1:A
            kernel=ones(1,kernelLen)*p_main(s,a)/kernelLen;
            kernel(a)=kernel(a)+(1-p_main(s,a));
            % accumulate (not assign): for S<5 several offsets share a state
            for j=1:kernelLen
                P(s,dest(j),a)=P(s,dest(j),a)+kernel(j);
            end
            P_kernel(s,:,a)=kernel;
        end
        R(s,:)=R_set(randperm(A));
    end
end

function [pi_tar,pi_opt,delta1,delta2,Q_tar,V_opt]=pi_gen(P,R,H,delta)
%PI_GEN Backward induction for the optimal policy and a random target policy.
%
%   pi_opt  S-by-H greedy (optimal) policy; V_opt its value.
%   pi_tar  S-by-H target policy: at each (s,h) an action drawn uniformly
%           from those whose target-policy Q value exceeds the minimum Q
%           value by more than delta.
%   Q_tar   S-by-H-by-A Q values under pi_tar.
%   delta1  smallest gap (capped at 1) between the optimal value and the
%           best strictly smaller optimal Q value.
%   delta2  smallest gap (capped at 1) between the target action's Q value
%           and the minimum Q value.
    nS=size(R,1);
    nA=size(R,2);
    pi_tar=zeros(nS,H);
    pi_opt=zeros(nS,H);
    V=zeros(nS,H);            % value of the target policy
    V_opt=zeros(nS,H);
    Q_tar=zeros(nS,H,nA);
    delta1=1;
    delta2=1;
    % ---- last stage: Q values are the immediate rewards ----
    for s=1:nS
        rew=R(s,:);
        [~,pi_opt(s,H)]=max(rew);
        cand=find(rew>(min(rew)+delta));
        pi_tar(s,H)=cand(randi(length(cand)));
        V(s,H)=R(s,pi_tar(s,H));
        V_opt(s,H)=R(s,pi_opt(s,H));
        runnerUp=max(R(s,rew<max(rew)));
        delta1=min(delta1,R(s,pi_opt(s,H))-runnerUp);
        delta2=min(delta2,R(s,pi_tar(s,H))-min(rew));
        Q_tar(s,H,:)=rew;
    end
    % ---- earlier stages: one-step look-ahead on V and V_opt ----
    for h=H-1:-1:1
        for s=1:nS
            P_s=reshape(P(s,:,:),size(R));      % next-state by action
            qTarget=R(s,:)'+P_s'*V(:,h+1);
            Q_tar(s,h,:)=qTarget;
            qOptimal=R(s,:)'+P_s'*V_opt(:,h+1);
            [~,pi_opt(s,h)]=max(qOptimal);
            cand=find(qTarget>(min(qTarget)+delta));
            pi_tar(s,h)=cand(randi(length(cand)));
            V(s,h)=qTarget(pi_tar(s,h));
            V_opt(s,h)=qOptimal(pi_opt(s,h));
            runnerUp=max(qOptimal(qOptimal<V_opt(s,h)));
            delta1=min(delta1,V_opt(s,h)-runnerUp);
            delta2=min(delta2,V(s,h)-min(qTarget));
        end
    end
end
function r= reward_gen(R,s,a)
%REWARD_GEN Draw a 0/1 reward; it is 0 exactly when a uniform draw exceeds R(s,a).
    missed = rand > R(s,a);
    r = double(~missed);
end
function s_next=state_tans(P_kernel,s,a,S)
%STATE_TANS Sample the next state on a ring of S states.
%   A kernel index is drawn by inverse-CDF sampling from P_kernel(s,:,a);
%   index j moves the state by j-3, wrapping around once at either end.
    u=rand;
    cdf=cumsum(P_kernel(s,:,a));
    step=find(cdf>u,1);
    s_next=s+step-3;
    if s_next > S
        s_next=s_next-S;
    elseif s_next < 1
        s_next=s_next+S;
    end
end

