
We are now ready to analyze the theoretical guarantees of our algorithm under each type of uncertainty set.

\subsection{Results under $(s,a)$-rectangular uncertainty set} Equipped with Algorithm \ref{alg} and the bonus function described in Equation (\ref{bonus_sa}),
%\begin{align*}
%    b_h^k(s,a) = \sqrt{\frac{2 \log(3SAH^2K/ \delta)}{N_h^k(s,a)}} + H \sqrt{\frac{4 S \log(3SAH^2K^{3/2}(4 + \rho)/ \delta)}{N_h^k(s,a)}} + \frac{1}{\sqrt{K}}\,,
%\end{align*}
we obtain the regret upper bound under the $(s,a)$-rectangular uncertainty set stated in the following theorem.
\begin{thm}\label{thm:sa}
With learning rate $\beta = \sqrt{\frac{2 \log A}{H^2 K}}$ and bonus term $b_h^k$ as (\ref{bonus_sa}), with probability at least $ 1 - \delta$, the regret incurred by Algorithm \ref{alg} over $K$ episodes is bounded by 
$$
    \text{Regret}(K) 
    = O \left( \frac{H^2 S}{c} \sqrt{AK\log \left( SAH^2 K^{3/2} ( 1 + \rho) / \delta \right)}\right) \,.
$$
\end{thm}
%\begin{restatable*}[Regret under $(s,a)$-rectangular uncertainty set]{thm}{sa}
%\label{thm:sa}

%\end{restatable*}

Since $\rho$ bounds the $\ell_1$ distance between two probability distributions, it is effectively at most $2$. This indicates that the regret scales logarithmically with $\rho$ for small $\rho$, and is capped at some constant in the large-$\rho$ regime.
We conclude that our algorithm learns a sufficiently robust policy, in the sense that whenever there exists a policy that achieves a high return, the learned policy also achieves a high return.

%Thus when $\rho > 0$, our result is only logarithmically dependent on $\rho$. 

%We further note that this also matches the dependency in the sample complexity results \cite{yang2021towards,panaganti2022sample}.

%Our algorithm subtly characterizes the uncertainty from all sources, it 


\begin{rem}
When $\rho = 0$, the problem reduces to non-robust reinforcement learning. In this case, our regret upper bound is $\tilde{O}\left( H^2 S \sqrt{AK}/c \right)$, which is a factor of $1/c$ away from the regret bound of policy optimization algorithms for the non-robust case \cite{shani2020optimistic}. This is due to the over-cautiousness of our algorithm for robust performance. Our algorithm carefully characterizes the uncertainty from all sources and derives a sufficiently robust policy, in the sense that whenever there exists a policy that achieves a high return, the learned policy also achieves a high return. When $\rho > 0$, our result depends only logarithmically on $\rho$. We further note that this also matches the dependency in the sample complexity results \cite{yang2021towards,panaganti2022sample}. 
\end{rem}
While we defer the detailed proof to Appendix \ref{appendix:thm1}, we highlight the challenges in the proof below.
%    Under the convention of optimism, we should hope that the bonus function $b_h^k(s,a)$ ``covers'' the estimation errors of the reward and transition functions. That is $r(s, a) - \hat{r}(s,a)$ and $\sPsa(\hVpn)(s,a) -  \hPsa(\hVpn)(s,a)$ are both smaller than our choice of bonus function. We now show that this is indeed the case under our designed bonus function.
    


% {\color{purple}First we decomposed the regret as follows:
% \begin{align*}
%     \text{Regret}(K) 
%     = \ & \sum^K_{k=1} V_1^{\ast}(s) - V_1^{\pi_k}(s)
%     =  \sum^K_{k=1} \left(V_1^{\ast} (s)- \hat{V}_1^{\pi_k}(s) \right) + \left(\hat{V}_1^{\pi_k} (s) - V_1^{\pi_k}(s)\right) \,.
%     \end{align*}

% For the first term, we use the robust bellman equation and 
% the update rule to derive the recursion inequality conditioned on the good event:
% \begin{align*}
%     & V_h^{\ast}(s) - \hat{V}_h^{\pi_k}(s) 
%     \le  \mathbb{E}_{\pi_\ast, p_h} \left[ V_{h+1}^\ast(s) - \hVpn(s) \right] + \langle \hat{Q}_h^{\pi_k}(s, \cdot) , \pi_\ast(\cdot \mid s) - \pi_k(\cdot \mid s)  \rangle \,.
% \end{align*}

% Then we apply above relation recursively and standard results for online mirror descent.
% % \begin{align*}
% %      \sum^K_{k=1}V_1^{\ast}(s) - \hat{V}_1^{\pi_k}(s) = O \left(H^2\sqrt{K \log A} \right)\,.
% % \end{align*}

% For the second term, By applying the recursion inequality and the design of bonus function we have 
% \begin{align*}
%     & (\hat{V}_1^{\pi_k} - V_1^{\pi_k})(s) 
%     \leq \sum^H_{h=1} \mathbb{E} \left[ \hat{r}_h^k(s, a) - r_h^k(s, a) + \hPsa(\hVpn)(s) -  \sPsa (\hVpn)(s) + b_h^k(s,a)\right]\,.
% \end{align*}

% Then the main challenge of deriving a robust regret is to quantify the uncertainty of the transition.  With the compound causes of uncertainty we choose not to directly use concentration inequality $\sigma_{\hat{\gP}_{(s,a)}}(V) - \sigma_{\gP_{(s,a)}}(V)$ and instead resort to the dual form Equation (\ref{eq:inner_sa}). }

% Using the dual form, we get the following bound:
% \begin{align*}
%     \sPsa (\hVpn)(s)  - \hPsa(\hVpn)(s)
%     \leq \ &   H\sqrt{\frac{4 S \log(3SAH^3K^{3/2}(4+\rho)/ \delta^\prime)}{N_h^k(s,a)}} + \frac{1}{H\sqrt{K}}\,.
% \end{align*}

% Then we have
% \begin{align*}
%     \sum^K_{k=1}(\hat{V}_1^{\pi_k} - V_1^{\pi_k})(s)  = O \left(H^2 S\sqrt{AK \log \left( SAH^2 K^{3/2} ( 1 + \rho) / \delta \right)} \right) \,.
% \end{align*}
%To overcome this, we focus on the dual formulation of $\sigma_{\hat{\gP}_{(s,a)}}(V)$ to decouple the uncertainty caused by limited interaction and uncertainty set. 
% Notice that now the difference of $\sigma_{\hat{\gP}_{(s,a)}}(V) - \sigma_{\gP_{(s,a)}}(V)$ is only incurred by difference in the value of $\sum_{s^\prime}P_h^o(s^\prime \mid s,a) \left( \eta - \hVpn(s^\prime)\right)_{+}$. When $\eta$ is bounded, we can use Hoeffding's inequality to control it. We then investigate the range of possible optimal values of $\eta$ and use an $\epsilon$-net argument.
We start with decomposing the regret as 
\begin{align*}
    \sum^K_{k=1} (V_1^{\ast} (s)- \hat{V}_1^{\pi_k}(s) ) + \sum^K_{k=1}(\hat{V}_1^{\pi_k} (s) - V_1^{\pi_k}(s)) \,.
\end{align*}
    
%For the first term, for any $h \in [H]$, we can further decompose it as 
%\begin{align*}
%& (\hVp - \Vp )(s) \\
%= \ & \langle \hQp (s, \cdot) - \Qp (s, \cdot) , \pi_k(\cdot \mid s) \rangle \\
%= \ & \left\langle \hat{r}(s, \cdot) - r(s, \cdot)  + ( \hPsa(\hVpn)(s,\cdot) - \sPsa (\Vpn)(s,\cdot) ) + b_h^k(s,\cdot),  \pi_k(\cdot \mid s) \right\rangle  \,.
%\end{align*}
%The main technical challenge lies on deriving the appropriate bonus function with a time-varying transition kernel due to the robustness concern. 
In the case of policy optimization for non-robust MDPs, the first term is upper bounded through the value difference lemma \citep{shani2020optimistic}. Yet this lemma can no longer be applied in the robust MDP case, due to the presence of a policy-dependent adversarial transition kernel. Moreover, naively employing a recursive relation with respect to a fixed transition kernel, in a similar way to the value difference lemma, may lead to linear regret. 

To address the issue of the varying transition kernel, we bound the first term as
\begin{align*}
&  V_h^{\ast}(s) - \hat{V}_h^{\pi_k}(s) \\
 \leq \ & 
    \langle \hat{Q}_h^{\pi_k}(s, \cdot) , \pi_\ast(\cdot \mid s) - \pi_k(\cdot \mid s)  \rangle + \mathbb{E}_{\pi_\ast} \left[ (r_h(s, a) - \hat{r}_{h}^{k}(s,a)) \right. \\
    & \ + \left. (\sPsa(\hVpn)(s,a) -  \hPsa(\hVpn)(s,a)) - b_h^k(s,a)\right] \\
    & \ + \mathbb{E}_{\pi_\ast} \left[  \sPsa (V_{h+1}^\ast)(s,a) - \sPsa(\hVpn)(s,a)\right]\,.
\end{align*}
We then apply this decomposition recursively, conditioning on the varying transition kernel $q_h(\cdot \mid s,a) = \mathop{\arg\max}\limits_{P_h \in \gP_h} P_h(\cdot \mid s,a)(\hVpn - \Vpn)$. By setting the optimism bonus $b_h^k(s,a)$ carefully, we obtain 
\begin{align*}
   &  \sum^K_{k=1} \left( V_1^{\ast}(s) - \hat{V}_1^{\pi_k}(s) \right)\\
   \le \ & \sum^K_{k=1}\sum^H_{h=1} \mathbb{E}_{\pi_\ast, \{q_{t}\}^{h-1}_{t=1}} \left[ \langle \hat{Q}_h^{\pi_k}(s, \cdot) , \pi_\ast(\cdot \mid s) - \pi_k(\cdot \mid s)  \rangle \right] \,,
\end{align*}
% $
%     \sum^K_{k=1}V_1^{\ast}(s) - \hat{V}_1^{\pi_k}(s)\le \sum^K_{k=1}\sum^H_{h=1} \mathbb{E}_{\pi_\ast, \{q_{t}\}^{h-1}_{t=1}} \left[ \langle \hat{Q}_h^{\pi_k}(s, \cdot) , \pi_\ast(\cdot \mid s) - \pi_k(\cdot \mid s)  \rangle \right] \,,
%     %&= \sum^H_{h=1} \mathbb{E}_{\pi_\ast, \{q_{t}\}^{h-1}_{t=1}} \left[ \sum^K_{k=1}\langle \hat{Q}_h^{\pi_k}(s, \cdot) , \pi_\ast(\cdot \mid s) - \pi_k(\cdot \mid s)  \rangle \right] \,.
% $
This can be upper bounded by standard results of online mirror descent. 

However, we remark that designing such a bonus function is non-trivial, as the expectation at each time step $h$ is taken with respect to a different transition kernel.
%One key difficulty of ensuring optimism under this case is due to the coupled uncertainty caused by limited interaction and uncertainty set. 
To establish such an optimism bonus, we first derive the dual formulation of the inner optimization problem $\sigma_{\hat{\gP}_{(s,a)}}(V)$ (Equation (\ref{eq:inner_sa})). This allows us to decouple the uncertainty and bound each source of uncertainty separately. With a change of variable $\Tilde{P}_h(s^\prime \mid s, a) = \frac{ P_h(s^\prime \mid s,a) }{ P_h^o(s^\prime \mid s,a)}$, we can write the Lagrangian form of $\sPsa (\hVpn)(s,a)$ as
\begin{align*}
    & \ \sum_{s^\prime} \Tilde{P}_h(s^\prime \mid s,a) P_h^o(s^\prime \mid s,a) \hVpn(s^\prime) \\
    & \ + \lambda \left( \sum_{s^\prime} | \Tilde{P}_h(s^\prime \mid s,a)  - 1| P_h^o(s^\prime \mid s,a)  - \rho \right)\\
    & \ - \eta \left(\sum_{s^\prime} \Tilde{P}_h(s^\prime \mid s,a) P_h^o(s^\prime \mid s,a) - 1 \right) \,,
\end{align*}
where $\eta$ and $\lambda$ are Lagrange multipliers. 

Under the characterization of the $\ell_1$ distance, we can use the convex conjugate of $f(x) = |x - 1|$ to optimize out $\Tilde{P}$, resulting in Equation (\ref{eq:inner_sa}).
Notice that the difference $\sigma_{\hat{\gP}_{(s,a)}}(V) - \sigma_{\gP_{(s,a)}}(V)$ now arises only from the difference in $\sum_{s^\prime}P_h^o(s^\prime \mid s,a) \left( \eta - \hVpn(s^\prime)\right)_{+}$. We then show that $\eta$ must be bounded at its optimum by inspecting certain pivot points and by using the convexity of the dual. Once $\eta$ is bounded in this way, applying Hoeffding's inequality together with an $\epsilon$-net argument yields the desired bonus function. 

%If the dual variable $\eta$ takes an  arbitrary value, then this difference in value can also be arbitrary. We thus characterize the optimal solution region of the dual formulation through convexity and uniformly apply Hoeffding's type concentration over it with an $\epsilon$-net argument. %When $\eta$ is bounded, we can use Hoeffding's inequality to control it. 
Our algorithm and analysis techniques also extend to other probability distances, such as KL-divergence-constrained uncertainty sets. We include the result for the KL divergence in Appendix \ref{appendix:thm3}. 

\subsection{Results under $s$-rectangular uncertainty set}
Beyond the $(s,a)$-rectangular uncertainty set, we also extend our results to the $s$-rectangular uncertainty set (Definition \ref{def:s}).
Recall that value-based methods do not extend to $s$-rectangular uncertainty set as there might not exist a deterministic optimal policy.
%In this setting, all optimal policy can be stochastic \citep{wiesemann2013robust}. This implies the inefficiency of value-based learning methods which are studied more extensively in robust RL literature \citep{panaganti2022sample,yang2021towards}. To our best knowledge, this is the first theoretical guarantee of RL under robust MDPs with $s$-rectangular uncertainty set.

%Equipped with Algorithm \ref{alg}, which learns a stochastic policy, and the bonus function described in Equation \ref{bonus_s}, we present the following Theorem. 
\begin{thm}[Regret under $s$-rectangular uncertainty set]
    \label{thm:s}
With learning rate $\beta = \sqrt{\frac{2 \log A}{H^2 K}}$ and bonus term $b_h^k$ as (\ref{bonus_s}), with probability at least $ 1 - \delta$, the regret of Algorithm \ref{alg} is bounded by 
$$
    \text{Regret}(K) 
    =O \left( \frac{SA^2 H^2}{c}\sqrt{K\log(SA^2H^2K^{3/2}(1+\rho) / \delta)}\right) \,.
$$
\end{thm}
%\begin{restatable*}[Regret under $s$-rectangular uncertainty set]{thm}{s}
%\end{restatable*}
%\input{s_rect}
\begin{rem}
When $\rho = 0$, the problem reduces to non-robust reinforcement learning. In this case, our regret upper bound is $\tilde{O}\left( SA^2 H^2 \sqrt{K}/c \right)$. Our result is the first theoretical result for learning a robust policy under the $s$-rectangular uncertainty set, as previous results only learn the robust value function \citep{yang2021towards}. When $\rho > 0$, our result depends only logarithmically on $\rho$, which matches the dependency in the sample complexity results \cite{yang2021towards,panaganti2022sample}. %We remark that $\rho$ is at most a constant due to the definition of $\ell_1$ distance.
\end{rem}

The analysis and techniques used for Theorem \ref{thm:s} are very similar to those used for Theorem \ref{thm:sa}. The main difference lies in bounding $\hPs (\hVpn)(s,a)  - \sPs(\hVpn)(s,a)$. We defer the detailed proof to Appendix \ref{appendix:thm2}.







% As the robustness of $\hPs (\hVpn)(s,a)$ is no longer independent for different actions, we can not reduce the problem of $\hPs (\hVpn)(s,a)$ into a scalar optimization problem. Instead, through analyzing the Lagrangian form, we obtain the $A$-dimensional convex optimization problem (\ref{eq:inner_s}), which is solvable in $O(A)$. Different from the $(s,a)$-rectangular case, our Lagrangian form has $A$ times more variables, which complicates the solution regions of the optimum. Through proof by contradiction argument, we find the optimal ranges of each dual variable separately. With the optimum of $\eta$, we can apply concentration inequalities uniformly over the range of dual variables. The theorem follows the same arguments of Theorem \ref{thm:sa} thereafter. 


