\section{Proof of Theorem \ref{thm:sa}}\label{appendix:thm1}
\iffalse
First, we restate here the Algorithm and the notations for readability:
%\addtocounter{algorithm}{1}
\begin{algorithm}
\renewcommand{\thealgorithm}{}
    \caption{Robust Optimistic Policy Optimization} 
    %\label{alg}
    \begin{algorithmic}
    \STATE Input: learning rate $\beta$, bonus function $b_h^k$.
        \FOR{$k = 1, \ldots, K$}
        \STATE Collect a trajectory of samples by executing $\pi_k$.
        \STATE{ {\color{gray}\# Robust Policy Evaluation}}
        \FOR{$h = H, \ldots, 1$}
        \FOR{ $\forall (s,a) \in \gS \times \gA$}
        \STATE Solve $\sigma_{\hat{\gP}_h}(\hat{V}_{h+1}^\pi)(s,a)$ according to Equation (\ref{eq:inner_sa}).
        \STATE $\hat{Q}^{k}_h (s,a) = \min\left\{\hat{r}(s,a) + \sigma_{\hat{\gP}_h}(\hat{V}_{h+1}^\pi)(s,a) + b_h^k(s,a), H\right\} $.
        \ENDFOR
        \FOR{ $\forall s \in \gS$}
        \STATE $\hat{V}_h^k(s) = \left\langle \hat{Q}_h^k(s, \cdot), \pi_h^k(\cdot \mid s) \right\rangle$.
        \ENDFOR
        \ENDFOR
        \STATE{ {\color{gray} \# Policy Improvement}}
        \FOR{$\forall h, s, a \in [H] \times \gS \times \gA$}
        \STATE $\pi_h^{k+1}(a \mid s) = \frac{\pi_h^{k}\exp(-\beta \hat{Q}^{\pi}_h (s,a))}{\sum_{a^\prime} \exp(-\beta \hat{Q}^{\pi}_h (s,a^\prime))} $.
        \ENDFOR
        \STATE Update empirical estimate $\hat{r}$, $\hat{P}$ with Equation (\ref{eq:empirical}).
        \ENDFOR
    \end{algorithmic}
\end{algorithm}


To estimate the rewards and the nominal transition function, we use the following estimators. 
\begin{align*}
    \hat{r}_h^k(s,a) = \frac{\sum^{k - 1}_{k^\prime = 1} R_h^{k^\prime}(s,a)\mathbb{I} \left\{s_h^{k^\prime} = s, a_h^{k^\prime} = a\right\}}{N_h^k(s,a)} \,, \quad \hat{P}_h^{o,k}(s,a) = \frac{\sum^{k - 1}_{k^\prime = 1} \mathbb{I} \left\{s_h^{k^\prime} = s, a_h^{k^\prime} = a, s_{h+1}^{k^\prime} = s^\prime\right\}}{N_h^k(s,a)} \,,
\end{align*}
where $N_h^k(s,a) = \max \left\{ \sum^{k-1}_{k^\prime = 1}  \mathbb{I}\left\{s_h^{k^\prime} = s, a_h^{k^\prime} = a\right\},1\right\}$ is the counter for cumulative visitation.

%\begin{rem}[Bounded $\mathrm{Q}$ and value estimators]
%For any $k, h, s, a, Q_{h}^{k}(s, a) \in[0, H]$ and $V_{h}^{k}(s) \in[0, H]$. To see that, first note that by the update rule, we have that for any $k, h, s, a, Q_{h}^{k}(s, a) \le H$. Moreover, by the design of bonus function, $Q_{h}^{k}$ is always bigger than 0.

%\end{rem}


%\subsection{Design of the bonus function}

In the case of $(s,a)$-rectangular uncertainty set, we use the following bonus function $b_h^k(s,a)$ to encourage exploration. 
\begin{align*}
    b_h^k(s,a) = \sqrt{\frac{2 \log(3SAH^2K/ \delta)}{N_h^k(s,a)}} + (2 \rho + H) \sqrt{\frac{4 S \log(3SAH^2K/ \delta)}{N_h^k(s,a)}} + \frac{6 + \rho}{H\sqrt{K}}.
\end{align*}


The proof of Theorem \ref{thm:sa} is then structured as follows. We first define a few good events that happen with high probability if one runs the algorithm with our designed bonus function. Then, conditioned on the good events, we prove the high-probability result in Theorem \ref{thm:sa}.
\fi




\subsection{Good events}
We first define the following good events, in which case we estimate the reward function and the nominal transition functions fairly accurately. 
\begin{align*}
    \gG_{k}^{r} = \ & \left\{\forall s, a, h:\left|r_{h}(s, a)-\hat{r}_{h}^{k}(s, a)\right| \leq \sqrt{\frac{2 \ln (2 SAH^2K / \delta^\prime)}{N_h^k(s,a)}}\right\} \,, \\
    \gG_k^{p} = \ & \left\{\forall s, a, h:\sPsa (\hVpn)(s,a)   - \hPsa(\hVpn)(s,a) 
    \leq C_h^k(s,a) \right\}\,,
\end{align*}
where $C_h^k(s,a) = H \sqrt{\frac{4 S \log(3SAH^2K^{3/2}(4 + \rho)/ \delta^\prime)}{N_h^k(s,a)}} + \frac{1}{\sqrt{K}}$.

When the two good events happen at the same time, we say the algorithm is inside the good event $\gG = \left( \bigcap^K_{k=1} \gG_{k}^{r}\right) \bigcap \left( \bigcap^K_{k=1} \gG_{k}^{p}\right)$. The following lemma shows that $\gG$ happens with high probability by setting $\delta^\prime$ properly. 

\begin{lem}[Good event]
Let $\delta = 2 \delta^{\prime}$,  then the good event happens with high probability, i.e. $\mathbb{P}\left[ \gG\right] \geq 1 - \delta$.
\end{lem}
\begin{proof}
    By Hoeffding's inequality and a union bound on all $s,a$, all possible values of $N_h^k(s,a)$ and $k$, we have $\mathbb{P}\left[ \bigcap^K_{k=1} \gG_{k}^{r}\right] \geq 1 - \delta^\prime$. By Lemma \ref{lem:sa_con}, we have $\mathbb{P}\left[ \bigcap^K_{k=1} \gG_{k}^{p}\right] \geq 1 - \delta^\prime$. Setting $\delta = 2\delta^\prime$ and taking a union bound over the two events gives the desired result. 
\end{proof}

\subsection{Design of the bonus function}

In the case of $(s,a)$-rectangular uncertainty set, we use the following bonus function $b_h^k(s,a)$ to encourage exploration. 
\begin{align}\label{bonus_sa}
    b_h^k(s,a) = \sqrt{\frac{2 \log(3SAH^2K/ \delta)}{N_h^k(s,a)}} + H \sqrt{\frac{4 S \log(3SAH^2K^{3/2}(4 + \rho)/ \delta)}{N_h^k(s,a)}} + \frac{1}{\sqrt{K}} \,. 
\end{align}



\subsection{Regret Analysis}


Armed with the defined good event, we are now ready to present the analysis of Theorem \ref{thm:sa}, which establishes the regret of the Algorithm under $(s,a)$-uncertainty set.



\begin{proof}
    We start with decomposing the regret as follows,
    \begin{align*}
    \text{Regret}(K) 
    = \ & \sum^K_{k=1} V_1^{\ast}(s) - V_1^{\pi_k}(s)\\
    = \ & \sum^K_{k=1} \left(V_1^{\ast} (s)- \hat{V}_1^{\pi_k}(s) \right) + \left(\hat{V}_1^{\pi_k} (s) - V_1^{\pi_k}(s)\right) \,.
    \end{align*}
    By Lemma \ref{lem:pseudo_regret_sa} and Lemma \ref{lem:estimation_sa}, with probability at least $1 - \delta$, we have 
    \begin{align*}
        \text{Regret}(K) 
        = \ & O \left(\frac{H^2\sqrt{K \log A} }{c}\right) +O  \left( \frac{H^2 S}{c}\sqrt{AK \log \left( SAH^2 K^{3/2} ( 1 + \rho) / \delta \right)} \right) \\
        = \ & O \left( \frac{H^2 S}{c} \sqrt{AK\log \left( SAH^2 K^{3/2} ( 1 + \rho) / \delta \right)}\right) \,.
    \end{align*}
\end{proof}

\begin{lem}\label{lem:pseudo_regret_sa}
With probability at least $ 1 - \delta$, we have
\begin{align*}
     \sum^K_{k=1}V_1^{\ast}(s) - \hat{V}_1^{\pi_k}(s) = O \left(\frac{H^2\sqrt{K \log A} }{c}\right)\,.
\end{align*}
\end{lem}

\begin{proof}
    For any $h \in [1, H]$, we have
\begin{align*}
    & V_h^{\ast}(s) - \hat{V}_h^{\pi_k}(s) \\
    = \ & \langle Q_h^{\ast}(s, \cdot) , \pi_\ast(\cdot \mid s) \rangle - \langle \hat{Q}_h^{\pi_k}(s, \cdot) , \pi_k(\cdot \mid s) \rangle \\
    = \ & \langle Q_h^{\ast}(s, \cdot) -  \hat{Q}_h^{\pi_k}(s, \cdot), \pi_\ast(\cdot \mid s) \rangle + \langle \hat{Q}_h^{\pi_k}(s, \cdot) , \pi_\ast(\cdot \mid s) - \pi_k(\cdot \mid s)  \rangle \\
    = \ & \mathbb{E}_{\pi_\ast} \left[ (r_h(s, a) - \hat{r}_{h}^{k}(s,a)) + (\sPsa (V_{h+1}^\ast)(s,a) -  \hPsa(\hVpn)(s,a)) - b_h^k(s,a)\right] \\
    & \ + \langle \hat{Q}_h^{\pi_k}(s, \cdot) , \pi_\ast(\cdot \mid s) - \pi_k(\cdot \mid s)  \rangle \\
    = \ & \mathbb{E}_{\pi_\ast} \left[ (r_h(s, a) - \hat{r}_{h}^{k}(s,a)) +  (\sPsa(\hVpn)(s,a) -  \hPsa(\hVpn)(s,a)) - b_h^k(s,a)\right] \\
    & \ + \mathbb{E}_{\pi_\ast} \left[  \sPsa (V_{h+1}^\ast)(s,a) - \sPsa(\hVpn)(s,a)\right] + \langle \hat{Q}_h^{\pi_k}(s, \cdot) , \pi_\ast(\cdot \mid s) - \pi_k(\cdot \mid s)  \rangle \,,
\end{align*}
where the third equality is by the update rule of our algorithm and the robust Bellman equation.

By the design of our bonus function, conditioned on the good event, we have
\begin{align*}
     (r_h(s, a) - \hat{r}_{h}^{k}(s,a)) + (\sPsa(\hVpn)(s,a) -  \hPsa(\hVpn)(s,a)) - b_h^k(s,a) \leq 0 \,.
\end{align*}

Let $q_h(\cdot \mid s,a) = \mathop{\arg\min}_{P_h \in \gP_h} \limits P_h(\cdot \mid s,a) \hVpn $, then we have
\begin{align*}
    & \sPsa (V_{h+1}^\ast)(s,a) - \sPsa(\hVpn)(s,a)  \\
    = \ & \min_{P_h \in \gP_h} \limits P_h(\cdot \mid s,a) V_{h+1}^\ast - \min_{P_h \in \gP_h} \limits P_h(\cdot \mid s,a) \hVpn  \\
    = \ & \min_{P_h \in \gP_h} \limits P_h(\cdot \mid s,a) V_{h+1}^\ast - q_h(\cdot \mid s,a) \hVpn  \\
    \le \ & q_h(\cdot \mid s,a) (V_{h+1}^\ast -  \hVpn)  \\
     \le \ & \max_{P_h\in \gP_h} \limits P_h(\cdot \mid s,a) (V_{h+1}^\ast - \hVpn)\,.
\end{align*}


With a slight abuse of notation, we now let $q_h(\cdot \mid s,a) = \mathop{\arg\max}_{P_h \in \gP_h} \limits P_h(\cdot \mid s,a) \left(V_{h+1}^\ast - \hVpn\right)$. Then the following relation holds conditioned on the good event:
\begin{align*}
    & V_h^{\ast}(s) - \hat{V}_h^{\pi_k}(s) \\
    \le \ & \mathbb{E}_{\pi_\ast} \left[  \sup_{P_h\in \gP_h} \limits P_h(\cdot \mid s,a) (V_{h+1}^\ast - \hVpn)\right] + \langle \hat{Q}_h^{\pi_k}(s, \cdot) , \pi_\ast(\cdot \mid s) - \pi_k(\cdot \mid s)  \rangle \\
    = \ & \mathbb{E}_{\pi_\ast, q_h} \left[ V_{h+1}^\ast(s) - \hVpn(s) \right] + \langle \hat{Q}_h^{\pi_k}(s, \cdot) , \pi_\ast(\cdot \mid s) - \pi_k(\cdot \mid s)  \rangle \,.
\end{align*}

Then, by applying the above relation recursively and with the fact that for any policy $\pi$ and state $s$, $V_{H+1}^{\ast}(s) = \hat{V}_{H+1}^{\pi_k}(s) = 0$, we have 
\begin{align*}
    V_1^{\ast}(s) - \hat{V}_1^{\pi_k}(s)
    \le \sum^H_{h=1} \mathbb{E}_{\pi_\ast, \{q_{t}\}^{h-1}_{t=1}} \left[ \langle \hat{Q}_h^{\pi_k}(s, \cdot) , \pi_\ast(\cdot \mid s) - \pi_k(\cdot \mid s)  \rangle \right] \,.
\end{align*}

Let $\omega_h =  \{q_t\}^{h-1}_{t=1} / P_h^o$, and assume $P_h^o(\cdot \mid s,a) \geq c, \forall (s,a)$.
Summing over $k$, we get

\begin{align*}
    \sum^K_{k=1}V_1^{\ast}(s) - \hat{V}_1^{\pi_k}(s)
    & \le \sum^K_{k=1}\sum^H_{h=1} \mathbb{E}_{\pi_\ast, \{q_{t}\}^{h-1}_{t=1}} \left[ \langle \hat{Q}_h^{\pi_k}(s, \cdot) , \pi_\ast(\cdot \mid s) - \pi_k(\cdot \mid s)  \rangle \right] \\ 
    & \leq \sum^K_{k=1}\sum^H_{h=1} \mathbb{E}_{\pi_\ast, P_h^o} \left[ \omega_h\langle \hat{Q}_h^{\pi_k}(s, \cdot) , \pi_\ast(\cdot \mid s) - \pi_k(\cdot \mid s)  \rangle \right] \\
    & \leq \frac{1}{c}\sum^H_{h=1} \mathbb{E}_{\pi_\ast,P_h^o} \left[ \sum^K_{k=1}\langle \hat{Q}_h^{\pi_k}(s, \cdot) , \pi_\ast(\cdot \mid s) - \pi_k(\cdot \mid s)  \rangle \right] \,.
\end{align*}

By standard results for online mirror descent (Lemma \ref{lem:omd}), we have
%\dong{add expect}
\begin{align*}
    \sum^K_{k=1} \langle \hat{Q}_h^{\pi_k}(s, \cdot) , \pi_\ast(\cdot \mid s) - \pi_k(\cdot \mid s)  \rangle \leq \frac{\log (A)}{\beta} + \frac{\beta}{2} \sum^K_{k=1} \sum_{a \in \gA} \pi_h^\ast (a \mid s) (\hat{Q}_h^{\pi_k}(s, a) )^2 \,.
\end{align*}
By the update rule of Algorithm \ref{alg}, we have $0 \le \hat{Q}_h^{\pi_k}(s, a) \le H$, for all $h, k$. Then, taking $\beta = \sqrt{\frac{2 \log A}{H^2 K}}$, we obtain

\begin{align*}
    \sum^K_{k=1} \langle \hat{Q}_h^{\pi_k}(s, \cdot) , \pi_\ast(\cdot \mid s) - \pi_k(\cdot \mid s)  \rangle \leq 
   \sqrt{2 H^2 K\log A}\,.
\end{align*}

Finally, we have
\begin{align*}
     \sum^K_{k=1}V_1^{\ast}(s) - \hat{V}_1^{\pi_k}(s) \le \frac{H}{c} \sqrt{2 H^2 K\log A} = O \left(\frac{H^2\sqrt{K \log A} }{c}\right)\,.
\end{align*}
\end{proof}

\begin{lem}\label{lem:estimation_sa}
With probability at least $ 1 - \delta$, we have
\begin{align*}
    \sum^K_{k=1}(\hat{V}_1^{\pi_k} - V_1^{\pi_k})(s)  =  O \left( \frac{H^2 S}{c}\sqrt{AK \log \left( SAH^2 K^{3/2} ( 1 + \rho) / \delta \right)} \right)\,.
\end{align*}
\end{lem}

\begin{proof}
    By the algorithm's update rule and the robust Bellman equation, we have 
\begin{align*}
    (\hVp - \Vp )(s)
    = \ & \langle \hQp (s, \cdot) - \Qp (s, \cdot) , \pi_k(\cdot \mid s) \rangle \\
    = \ & \left\langle \hat{r}_h^k(s, \cdot) - r_h^k(s, \cdot)  + ( \sigma_{\hat{\gP}_{(s,\cdot)}}(\hVpn)(s,\cdot) - \sigma_{\gP_{(s,\cdot)}}(\Vpn)(s,\cdot) ) + b_h^k(s,\cdot),  \pi_k(\cdot \mid s) \right\rangle  \\
    = \ & \mathbb{E}_{\pi_k}\left[ \hat{r}_h^k(s, a) - r_h^k(s, a)  + ( \hPsa(\hVpn)(s,a)  - \sPsa (\Vpn)(s,a)  ) + b_h^k(s,a) \right]\,.
\end{align*}
By adding and subtracting a term $\sPsa (\hVpn)(s, a) $, we have 
\begin{align*}
    & \hPsa(\hVpn)(s,a)  - \sPsa (\Vpn)(s,a)  \\
    = \ & \hPsa(\hVpn)(s,a)   - \sPsa (\hVpn)(s,a)  + \sPsa (\hVpn)(s,a)   - \sPsa (\Vpn)(s,a) \\
    \leq \ &   \hPsa (\hVpn)(s,a)  - \sPsa (\hVpn)(s,a)  + \max_{P_h \in \gP_h}P_h(\cdot \mid s,a)( \hVpn - \Vpn) \,.
\end{align*}
%where the last inequality follows from the similar calculation of $\sPsa (V_{h+1}^\ast)(s,a) - \sPsa(\hVpn)(s,a) $ in Lemma \ref{lem:pseudo_regret_sa}.

Let $p_h(\cdot \mid s,a) = \mathop{\arg\max}_{P_h \in \gP_h} \limits P_h(\cdot \mid s,a)(\hVpn - \Vpn)$, we have
\begin{align*}
    &(\hVp - \Vp )(s) \\
    \leq \ & \mathbb{E}_{\pi_k}\left[ \hat{r}_h^k(s, a) - r_h^k(s, a)+ \hPsa (\hVpn)(s,a)  - \sPsa (\hVpn)(s,a)  + p_h(\cdot \mid s,a)(\hVpn - \Vpn)  + b_h^k(s,a) \right] \\
    = \ & \mathbb{E}_{\pi_k, p_h}\left[ \hat{r}_h^k(s, a) - r_h^k(s, a)+ \hPsa (\hVpn)(s,a)  - \sPsa (\hVpn)(s,a) + \hVpn(s) - \Vpn(s)  + b_h^k(s,a) \right] \,.
\end{align*}

By applying the above relation recursively and with the fact that for any policy $\pi$ and state $s$, $V_{H+1}^{\pi_k}(s) =  \hat{V}_{H+1}^{\pi_k}(s) = 0$, we have 
\begin{align*}
    & (\hat{V}_1^{\pi_k} - V_1^{\pi_k})(s) \\
    \leq \ &\sum^H_{h=1} \mathbb{E}_{\pi_k, \{p_t\}^h_{t=1}} \left[ \hat{r}_h^k(s, a) - r_h^k(s, a) + \hPsa(\hVpn)(s,a)  -  \sPsa (\hVpn)(s,a)  + b_h^k(s,a)\right]\,.
\end{align*}

Conditioned on the good event and by the design of our bonus function, we have 
\begin{align*}
    \hat{r}_h^k(s, a) - r_h^k(s, a) + \hPsa(\hVpn)(s,a)  -  \sPsa (\hVpn)(s,a)  \le b_h^k(s,a) \,.
\end{align*}
Then, with probability at least $1 - \delta$, we have
\begin{align*}
     \sum^K_{k=1}(\hat{V}_1^{\pi_k} - V_1^{\pi_k})(s) 
    \leq \ &\sum^K_{k=1}\sum^H_{h=1} \mathbb{E}_{\pi_k, \{p_t\}^h_{t=1}} \left[ 2 b_h^k(s,a)\right]\\
    \leq \ & H\sqrt{K}+ O \left(  H \sqrt{S \log(SAH^2K^{3/2}(4 + \rho) / \delta)} \right)\sum^K_{k=1}\sum^H_{h=1} \mathbb{E}_{\pi_k, \{p_t\}^h_{t=1}} \left[\sqrt{\frac{1}{N_h^k(s,a)}} \right]\,.
\end{align*}
%{\color{blue}FIX} 

Let $\omega_h =  \{p_t\}^h_{t=1} / P_h^o$, and assume $P_h^o(\cdot \mid s,a) \geq c, \forall (s,a)$.
By Lemma \ref{lem:visit}, we have the bound of the visitation counts:
\begin{align*}
    \sum^K_{k=1}\sum^H_{h=1}  \mathbb{E}_{\pi_k, \{p_t\}^h_{t=1}} \left[\sqrt{\frac{1}{N_h^k(s,a)}}\right] 
    \leq \ & \sum^K_{k=1}\sum^H_{h=1}  \mathbb{E}_{\pi_k, P_h^o}  \left[\omega_h\sqrt{\frac{1}{N_h^k(s,a)}}\right] \\
    \leq \ & \max_{h \in [H]}\omega_h  \sum^K_{k=1}  \sum^H_{h=1}  \mathbb{E}_{\pi_k, P_h^o}  \left[\sqrt{\frac{1}{N_h^k(s,a)}}\right] \\
    \leq \ &  \frac{2 H \sqrt{SAK}}{c} \,.
\end{align*}
Combining everything, with probability at least $1 - \delta$
\begin{align*}
    \sum^K_{k=1}(\hat{V}_1^{\pi_k} - V_1^{\pi_k})(s)  = O \left( \frac{H^2 S}{c}\sqrt{AK \log \left( SAH^2 K^{3/2} ( 1 + \rho) / \delta \right)} \right) \,.
\end{align*}

\end{proof}

\begin{lem}\label{lem:sa_con}
For any $h,k,s,a$, the following inequality holds with probability at least $1-\delta^\prime$,
\begin{align*}
    \sPsa (\hVpn)(s,a)   - \hPsa(\hVpn)(s,a) 
    \leq \ &   H\sqrt{\frac{4 S \log(3SAH^2K^{3/2}(4+\rho)/ \delta^\prime)}{N_h^k(s,a)}} + \frac{1}{\sqrt{K}}\,.
\end{align*}
\end{lem}
\begin{proof}
    By the definition of $ \sPsa (\hVpn)(s,a) 
    = \  \min_{P_h \in \gP_h} \limits \sum_{s^\prime} P_h(s^\prime \mid s,a) \hVpn(s^\prime) $, we have the following optimization problem:
\begin{equation*}
\begin{split}
&\min_{P_h} \,\, \sum_{s^\prime} P_h(s^\prime \mid s,a) \hVpn(s^\prime) \\
&\text{s.t.}\quad  \left\{\begin{array}{lc}
\sum_{s^\prime} | P_h(s^\prime \mid s,a) - P_h^o(s^\prime \mid s,a)| \leq \rho \,, \\
\sum_{s^\prime} P_h(s^\prime \mid s,a) = 1 \,, \\
P_h^o(\cdot \mid s,a) > 0, P_h(\cdot \mid s,a) \ge 0  \,. 
\end{array}\right.
\end{split}
\end{equation*}

    

Define $\Tilde{P}_h(s^\prime \mid s, a) = \frac{ P_h(s^\prime \mid s,a) }{ P_h^o(s^\prime \mid s,a)}$, we can rewrite the above optimization problem as
\begin{equation*}
\begin{split}
&\min_{\Tilde{P}_h } \sum_{s^\prime} \Tilde{P}_h(s^\prime \mid s,a) P_h^o(s^\prime \mid s,a) \hVpn(s^\prime) \\
&\text{s.t.}\quad  \left\{\begin{array}{lc}
\sum_{s^\prime} | \Tilde{P}_h(s^\prime \mid s,a)  - 1| P_h^o(s^\prime \mid s,a) \leq \rho \,, \\
\sum_{s^\prime} \Tilde{P}_h(s^\prime \mid s,a) P_h^o(s^\prime \mid s,a) = 1 \,, \\
\Tilde{P}_h(s^\prime \mid s,a) \ge 0 \quad \forall s^\prime \in \gS\,. 
\end{array}\right.
\end{split}
\end{equation*}


Using the Lagrangian multiplier method, we have the following Lagrangian $L(\Tilde{P}_h, \eta, \lambda)$ with Lagrangian multiplier $\eta\in \mathbb{R}, \lambda\geq 0$,
\begin{align*}
    L(\Tilde{P}_h, \eta, \lambda) (s,a) 
    = \ & \sum_{s^\prime} \Tilde{P}_h(s^\prime \mid s,a) P_h^o(s^\prime \mid s,a) \hVpn(s^\prime) + \lambda \left( \sum_{s^\prime} | \Tilde{P}_h(s^\prime \mid s,a)  - 1| P_h^o(s^\prime \mid s,a)  - \rho \right) \\
    & \ - \eta \left(\sum_{s^\prime} \Tilde{P}_h(s^\prime \mid s,a) P_h^o(s^\prime \mid s,a) - 1 \right) \\
    = \ &  \eta - \lambda \rho - \lambda \sum_{s^\prime}P_h^o(s^\prime \mid s,a) \left(\frac{\eta}{\lambda}\Tilde{P}_h(s^\prime \mid s,a) - | \Tilde{P}_h(s^\prime \mid s,a)  - 1| - \frac{\Tilde{P}_h(s^\prime \mid s,a) \hVpn(s^\prime)}{\lambda} \right)  \\
    = \ &  \eta - \lambda \rho - \lambda \sum_{s^\prime}P_h^o(s^\prime \mid s,a) \left(\frac{\eta - \hVpn(s^\prime) }{\lambda}\Tilde{P}_h(s^\prime \mid s,a) - | \Tilde{P}_h(s^\prime \mid s,a)  - 1| \right)  \,.
\end{align*}

We define $f(x) = |x - 1|$ and the convex conjugate is $f^\ast(y) = \max_{x} \limits  \langle x, y\rangle - f(x) $. 
Let $x$ be $\Tilde{P}_h$ and by using $f^\ast$, we can optimize over $\Tilde{P}_h$ and rewrite the Lagrangian as 
\begin{align*}
    L(\eta, \lambda) (s,a) = \min_{\Tilde{P}_h} \limits L(\Tilde{P}_h, \eta, \lambda) (s,a) 
    =  \eta - \lambda \rho - \lambda \sum_{s^\prime}P_h^o(s^\prime \mid s,a) f^\ast \left(\frac{\eta - \hVpn(s^\prime)}{\lambda} \right) \,.
\end{align*}

Notice that conditioned on $x \ge 0$,  $f(x) = |x-1|$'s convex conjugate has the following closed form:
\begin{equation*}
f^\ast(y)=\max_{x} \limits  \langle x, y\rangle - f(x) =\left\{
\begin{aligned}
-1& \quad  \text{$y \le -1$} \,,\\
y&  \quad \text{$y \in [-1,1]$} \,,\\
+\infty&  \quad \text{$y > 1$} \,.
\end{aligned}
\right.
\end{equation*}

Let $\Tilde{\eta} = \eta + \lambda$. Then, using the closed form of $f^\ast(y)$, the identity $\max\left\{a,b\right\} = (a - b)_{+} + b$, and conditioning on $\frac{\eta - \hVpn(s^\prime)}{\lambda} \le 1$,  we can rewrite the optimization problem as
\begin{align*}
    L(\Tilde{\eta}, \lambda) (s,a)
   = \ & \eta - \lambda \rho - \lambda \sum_{s^\prime}P_h^o(s^\prime \mid s,a) f^\ast \left(\frac{\eta - \hVpn(s^\prime)}{\lambda} \right) \\
   = \ &\Tilde{\eta} - \lambda -  \lambda \rho - \lambda \sum_{s^\prime}P_h^o(s^\prime \mid s,a) \max\left\{ \frac{\eta - \hVpn(s^\prime)}{\lambda}, -1 \right\} \\
   = \ & \Tilde{\eta} - \lambda -  \lambda \rho - \lambda \sum_{s^\prime}P_h^o(s^\prime \mid s,a) \left(\left( \frac{\eta - \hVpn(s^\prime)}{\lambda} - (-1)\right)_{+} + (-1)\right)\\
   = \ & \Tilde{\eta} - \lambda -  \lambda \rho -  \sum_{s^\prime}P_h^o(s^\prime \mid s,a) (\Tilde{\eta} - \hVpn(s^\prime))_{+} + \lambda\\
   = \ & \Tilde{\eta} - \lambda \rho -  \sum_{s^\prime}P_h^o(s^\prime \mid s,a) (\Tilde{\eta} - \hVpn(s^\prime))_{+} \,.
\end{align*}
The multiplier $\lambda$ is subject to the constraints
\[\lambda \ge 0,   \quad \Tilde{\eta} - \min_s \limits \hVpn(s) \leq 2 \lambda .\]

Next we examine the range of $\Tilde{\eta}= \eta + \lambda$ and show that it can take any value in $\mathbb{R}$. We discuss this by cases. 

For any $x \le  \min_s \limits \hVpn(s)$, taking $\eta = x$, $\lambda = 0$, then we have $\Tilde{\eta} = x$.

For any $x > \min_s \limits \hVpn(s)$, taking $\eta = \frac{x + \min_s \limits \hVpn(s)}{2}$, $\lambda = \frac{x - \min_s \limits \hVpn(s)}{2}$, then we have $\Tilde{\eta} = x$.

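Then we have $\Tilde{\eta} \in \mathbb{R}$. Fix any $\Tilde{\eta}$. Since $L$ is non-increasing in $\lambda$ and the constraints require $\lambda \ge 0$ and $2\lambda \ge \Tilde{\eta} - \min_s \limits \hVpn(s)$, the maximum of $L$ over $\lambda$ is attained at the smallest feasible value $\lambda = 
\frac{(\Tilde{\eta} - \min_s \limits \hVpn(s))_{+}}{2}$. Substituting this choice of $\lambda$, we can reduce the problem to 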
\begin{align*}
    L(\Tilde{\eta}) (s,a) = \Tilde{\eta} - \frac{ (\Tilde{\eta} - \min_s \limits \hVpn(s))_{+}}{2}\rho - \sum_{s^\prime}P_h^o(s^\prime \mid s,a) ( \Tilde{\eta} - \hVpn(s^\prime))_{+} \,.
\end{align*}
with the constraint $\Tilde{\eta} \in \mathbb{R}$.

Define the function $g$ as 
\[
g(\Tilde{\eta}, P_h^o) = - L(\Tilde{\eta})(s,a) = \sum_{s^\prime}P_h^o(s^\prime \mid s,a) \left( \Tilde{\eta} - \hVpn(s^\prime)\right)_{+}  - \Tilde{\eta} + \frac{ (\Tilde{\eta} - \min_s\limits  \hVpn(s))_{+}}{2} \rho \,.
\]

Then we investigate the optimum of $g$.
First notice that, since $\hVpn(s^\prime) \geq 0$ for all $s^\prime$, we have $g(0, P_h^o) = 0$ and, when $\Tilde{\eta} \leq 0$, $g(\Tilde{\eta}, P_h^o) = - \Tilde{\eta} \geq 0$.

On the other hand, when $\Tilde{\eta} \ge H$, 
\begin{align*}
    g(\Tilde{\eta}, P_h^o) 
    = \ &\sum_{s^\prime}P_h^o(s^\prime \mid s,a) ( \Tilde{\eta} - \hVpn(s^\prime))  - \Tilde{\eta} + \frac{ (\Tilde{\eta} - \min_s\limits  \hVpn(s))}{2} \rho \\
    = \ & -\sum_{s^\prime}P_h^o(s^\prime \mid s,a) \hVpn(s^\prime)  + \frac{ (\Tilde{\eta} - \min_s\limits  \hVpn(s))}{2} \rho \,.
\end{align*}

Note that $g$ is then an affine, non-decreasing function of $\Tilde{\eta}$ (with slope $\rho / 2 \geq 0$); therefore $g$ attains its minimum within the range $\Tilde{\eta} \in [0, H]$. We remark that the same form is also used for analyzing robust policy evaluation \citep[Lemma B.1]{yang2021towards}.

With this, we can rewrite 
\begin{align*}
    \hPsa(\hVpn)(s,a)  - \sPsa (\hVpn)(s,a)   
    = \ & -\min_{\eta_1 \in [0, H] } g(\eta_1, \hat{P}_h^{o,k}) + \min_{\eta_2 \in [0, H] }g\left(\eta_2, P_h^o\right)\\
    \leq \ & \max_{\eta \in [0, H ]}| g\left(\eta, \hat{P}_h^{o,k}\right)  - g\left(\eta, P_h^o\right) |\,.
\end{align*}

To upper bound $ \hPsa(\hVpn)(s,a)  - \sPsa (\hVpn)(s,a)  $, we first upper bound $| g\left(\eta, \hat{P}_h^{o,k}\right)  - g\left(\eta, P_h^o\right) |$.
\begin{align*}
    | g\left(\eta, \hat{P}_h^{o,k}\right)  - g\left(\eta, P_h^o\right) |
    = \ & \left|  \sum_{s^\prime}\hat{P}_h^{o,k}(s^\prime \mid s,a) \left( \eta - \hVpn(s^\prime)\right)_{+}  - \sum_{s^\prime}P_h^o(s^\prime \mid s,a) \left( \eta - \hVpn(s^\prime)\right)_{+} \right|  \\
    \leq \ & \left\|\hat{P}_h^{o,k}(\cdot \mid s,a) -  P_h^o(\cdot \mid s,a)\right\|_1 \ \max_{s \in \gS} \limits | \eta - \hVpn(s)|_\infty \\ 
    \leq \ &  H \left\|\hat{P}_h^{o,k}(\cdot \mid s,a) -  P_h^o(\cdot \mid s,a)\right\|_1 \,,
\end{align*}
where the first inequality is by H\"older's inequality, and the second inequality follows from $\eta \in [0, H ]$ and $\hVpn(s) \in [0, H]$.

By Hoeffding's inequality and a union bound over all $s,a$, the following inequality holds with probability at least $1 - \delta^\prime$:
\begin{align*}
    \left\|\hat{P}_h^{o,k}(\cdot \mid s,a) -  P_h^o(\cdot \mid s,a)\right\|_1 \leq \sqrt{\frac{4 S \log(3SAH^2K/ \delta^\prime)}{N_h^k(s,a)}} \,.
\end{align*}

To upper bound the error with maximum over $\eta$, we first create an $\epsilon$-net $N_\epsilon(\eta)$ with $g$ over $\eta\in [0, H ]$ such that
\begin{align*}
    \max_{\eta\in [0, H ]} |  g\left(\eta, \hat{P}_h^{o,k}\right)  - g\left(\eta, P_h^o\right)| \leq \max_{\eta\in N_\epsilon(\eta)} |  g\left(\eta, \hat{P}_h^{o,k}\right)  - g\left(\eta, P_h^o\right)| + 2 \epsilon \,.
\end{align*}

By taking a union bound over $N_\epsilon(\eta)$, we have 
\begin{align*}
    \max_{\eta\in [0, H ]} |  g\left(\eta, \hat{P}_h^{o,k}\right)  - g\left(\eta, P_h^o\right)| \leq H \sqrt{\frac{4 S \log(3SAH^2K | N_\epsilon(\eta)| / \delta^\prime)}{N_h^k(s,a)}} + 2 \epsilon \,,
\end{align*}
where $| N_\epsilon(\eta)| $ is the size of the $\epsilon$-net.

It now remains to bound the size of $| N_\epsilon(\eta)| $, which can be obtained easily if $g$ is Lipschitz. 
Notice that
\begin{align*}
     |g(\Tilde{\eta_1}, P_h^o) -  g(\Tilde{\eta_2}, P_h^o) | 
     \le \ &  \sum_{s^\prime}P_h^o(s^\prime \mid s,a) | \Tilde{\eta_1} - \Tilde{\eta_2}|  + | \Tilde{\eta_1} - \Tilde{\eta_2}| + \frac{ |\Tilde{\eta_1} - \Tilde{\eta_2}|}{2} \rho \\
     = \ & \frac{4 + \rho}{2} |\Tilde{\eta_1} - \Tilde{\eta_2}| \,, 
\end{align*}
where the first inequality is by the triangle inequality and 
$|(a)_{+}-(b)_{+}| \le |a-b|$.

Then $g$ is a $\frac{4 + \rho}{2}$-Lipschitz function over $\eta\in [0, H ]$, thus combined with Lemma \ref{lem:eps_cover}, we have $| N_\epsilon(\eta)| = O\left(\frac{4 + \rho}{2 \epsilon}\right)$. Hence, the following inequality holds with probability at least $1-\delta^\prime$:
\begin{align*}
    \max_{\eta\in [0, H ]} |  g\left(\eta, \hat{P}_h^{o,k}\right)  - g\left(\eta, P_h^o\right)| \leq  H \sqrt{\frac{4 S \log(3SAH^2K (4 + \rho)/(2\epsilon \delta^\prime))}{N_h^k(s,a)}} + 2 \epsilon\,.
\end{align*}
Taking $\epsilon = \frac{1}{2\sqrt{K}}$, the following inequality holds with probability at least $1-\delta^\prime$:
\begin{align*}
    \sPsa (\hVpn)(s,a)   - \hPsa(\hVpn)(s,a) 
    \leq \ & \max_{\eta\in [0, H ]} |  g\left(\eta, \hat{P}_h^{o,k}\right)  - g\left(\eta, P_h^o\right)| \\
    \leq \ &  H \sqrt{\frac{4 S \log(3SAH^2K^{3/2}(4 + \rho)/ \delta^\prime)}{N_h^k(s,a)}} + \frac{1}{\sqrt{K}}\,.
\end{align*}
\end{proof}


\newpage
\section{Proof of Theorem \ref{thm:s}}\label{appendix:thm2}




\subsection{Good events}
We first define the following good events, in which case we estimate the reward function and the nominal transition functions fairly accurately. 
\begin{align*}
    \gG_{k}^{r} = \ & \left\{\forall s, a, h:\left|r_{h}(s, a)-\hat{r}_{h}^{k}(s, a)\right| \leq \sqrt{\frac{2 \ln (2 SAH^2K / \delta^\prime)}{N_h^k(s,a)}}\right\} \,, \\
    \gG_k^{p} = \ & \left\{\forall s, a, h: \sPs (\hVpn)(s,a)   - \hPs(\hVpn)(s,a) 
    \leq C_h^k(s,a)\right\}\,,
\end{align*}
where \[C_h^k(s,a) = AH\sqrt{\frac{4 S A\log(3SA^2H^2K^{3/2}(4+\rho) / \delta^\prime)}{N_h^k(s,a)}} +  \frac{1}{\sqrt{K}} \,. \]
When the two good events happen at the same time, we say the algorithm is inside the good event $\gG = \left( \bigcap^K_{k=1} \gG_{k}^{r}\right) \bigcap \left( \bigcap^K_{k=1} \gG_{k}^{p}\right)$. The following lemma shows that $\gG$ happens with high probability.

\begin{lem}[Good event]
Let $\delta = 2 \delta^{\prime}$,  then the good event happens with high probability, i.e. $\mathbb{P}\left[ \gG\right] \geq 1 - \delta$.
\end{lem}

\begin{proof}
    By Hoeffding's inequality and a union bound on all $s,a$, all possible values of $N_h^k(s,a)$ and $k$, we have $\mathbb{P}\left[ \bigcap^K_{k=1} \gG_{k}^{r}\right] \geq 1 - \delta^\prime$. By Lemma \ref{lem:con_s}, we have $\mathbb{P}\left[ \bigcap^K_{k=1} \gG_{k}^{p}\right] \geq 1 - \delta^\prime$. Setting $\delta = 2\delta^\prime$ and taking a union bound over the two events gives the desired result. 
\end{proof}

\subsection{Design of the bonus function}

In the case of $s$-rectangular uncertainty set, we use the following bonus function $b_h^k(s,a)$ to encourage exploration. 
\begin{align}\label{bonus_s}
    b_h^k(s,a) =  AH\sqrt{\frac{4 S A\log(3SA^2H^2K^{3/2}(4+\rho) / \delta)}{N_h^k(s,a)}} +  \frac{1}{\sqrt{K}}   + \sqrt{\frac{2 \log(3SAH^2K/ \delta)}{N_h^k(s,a)}}  \,.
\end{align}
\subsection{Regret analysis}


\begin{proof}
    Similar to the case of $(s,a)$-rectangular set, we start with decomposing the regret as follows,
    \begin{align*}
    \text{Regret}(K) 
    = \ & \sum^K_{k=1} V_1^{\ast}(s) - V_1^{\pi_k}(s)\\
    = \ & \sum^K_{k=1} \left(V_1^{\ast} (s)- \hat{V}_1^{\pi_k}(s) \right) + \left(\hat{V}_1^{\pi_k} (s) - V_1^{\pi_k}(s)\right) \,.
    \end{align*}
    By Lemma \ref{lem:pseudo_regret_sa} and Lemma \ref{lem:estimation_s}, with probability at least $1 - \delta$, we have 
    \begin{align*}
        \text{Regret}(K) 
        = \ & O \left(\frac{H^2\sqrt{K \log A} }{c}\right) + O \left( \frac{SA^2 H^2}{c}\sqrt{K\log(SA^2H^2K^{3/2}(1+\rho) / \delta)}\right)  \\
        = \ & O \left( \frac{SA^2 H^2}{c}\sqrt{K\log(SA^2H^2K^{3/2}(1+\rho) / \delta)}\right)  \,.
    \end{align*}
\end{proof}

\begin{lem}\label{lem:estimation_s}
    With Algorithm \ref{alg}, we have
\begin{align*}
    \sum^K_{k=1}(\hat{V}_1^{\pi_k} - V_1^{\pi_k})(s) = O \left( \frac{SA^2 H^2}{c}\sqrt{K\log(SA^2H^2K^{3/2}(1+\rho) / \delta)}\right) \,.
\end{align*}
\end{lem}

\begin{proof}
    Similar to the case with $(s,a)$-rectangular uncertainty set, for any $k$, we can decompose $(\hat{V}_1^{\pi_k} - V_1^{\pi_k})(s)$ as, 
\begin{align*}
    & (\hat{V}_1^{\pi_k} - V_1^{\pi_k})(s) \\
    \leq \ & \sum^H_{h=1} \mathbb{E}_{\pi_k, \{p_t\}^h_{t=1}} \left[ (\hat{r}_h^k(s, a) - r_h^k(s, a)) + \left( \hPs \left(\hVpn\right)(s,a)   - \sPs\left(\hVpn\right)(s,a) \right) + b_h^k(s,a)\right]\,.
\end{align*}


Thus by the design of our bonus function and with probability at least $ 1 - \delta$, we have
\begin{align*}
    & \sum^K_{k=1}(\hat{V}_1^{\pi_k} - V_1^{\pi_k})(s) \\
    \leq \ & 2\sum^K_{k=1}\sum^H_{h=1} \mathbb{E}_{\pi_k, \{p_t\}^h_{t=1}} \left[b_h^k(s,a) \right] \\
    = \ & H\sqrt{K} + O\left(  HA\sqrt{SA\log(SA^2H^2K^{3/2}(1+\rho) / \delta)}\right)\sum^K_{k=1}\sum^H_{h=1} \mathbb{E}_{\pi_k, \{p_t\}^h_{t=1}} \left[ \sqrt{\frac{1}{N_h^k(s,a)}}\right]  \,.
\end{align*}

% By Lemma \ref{lem:visit}, 
% we have the bound of visitation counts:
% \begin{align*}
%     \sum^K_{k=1}\sum^H_{h=1} \sqrt{\frac{1}{N_h^k(s,a)}} \leq 2 H \sqrt{SAK} \,.
% \end{align*}
Let $\omega_h =  \{p_t\}^h_{t=1} / P_h^o$, and assume $P_h^o(\cdot \mid s,a) \geq c, \forall (s,a)$. By Lemma \ref{lem:visit}, we have the bound of the visitation counts:
\begin{align*}
    \sum^K_{k=1}\sum^H_{h=1}  \mathbb{E}_{\pi_k, \{p_t\}^h_{t=1}} \left[\sqrt{\frac{1}{N_h^k(s,a)}}\right] 
    \leq \ & \sum^K_{k=1}\sum^H_{h=1}  \mathbb{E}_{\pi_k, P_h^o}  \left[\omega_h\sqrt{\frac{1}{N_h^k(s,a)}}\right] \\
    \leq \ & \max_{h \in [H]}\omega_h  \sum^K_{k=1}  \sum^H_{h=1}  \mathbb{E}_{\pi_k, P_h^o}  \left[\sqrt{\frac{1}{N_h^k(s,a)}}\right] \\
    \leq \ &  \frac{2 H \sqrt{SAK}}{c} \,.
\end{align*}

Combining everything, conditioned on the good event we have
\begin{align*}
    \sum^K_{k=1}(\hat{V}_1^{\pi_k} - V_1^{\pi_k})(s) = O \left( \frac{SA^2 H^2}{c}\sqrt{K\log(SA^2H^2K^{3/2}(1+\rho) / \delta)}\right) \,.
\end{align*}

\end{proof}

\begin{lem}\label{lem:con_s}
    For any $h,k,s,a$, the following inequality holds with probability at least $1-\delta$,
    \begin{align*}
    \hPs (\hVpn)(s,a)   - \sPs (\hVpn)(s,a) 
    \leq \ & AH\sqrt{\frac{4 S A\log(3SA^2H^2K^{3/2}(4+\rho) / \delta)}{N_h^k(s,a)}} +  \frac{1}{\sqrt{K}} \,.
\end{align*}
\end{lem}
\begin{proof}
  By the definition of $ \sPs (\hVpn)(s,a) 
    =  \inf_{P_h \in \gP_h} \limits \sum_{s^\prime} P_h(s^\prime \mid s,a) \hVpn(s^\prime) $, we consider the following optimization problem:
\begin{equation*}
\begin{split}
&\min_{P_h} \,\, \sum_{s^\prime} P_h(s^\prime \mid s,a) \hVpn(s^\prime) \\
&\text{s.t.}\quad  \left\{\begin{array}{lc}
\sum_{s^\prime, a^\prime} | P_h(s^\prime \mid s,a^\prime) - P_h^o(s^\prime \mid s,a^\prime)| \leq A\rho \,, \\
\sum_{s^\prime} P_h(s^\prime \mid s,a^\prime) = 1 \,,  \forall a^\prime \in \gA \,, \\
P_h^o(\cdot \mid s,a^\prime) > 0, P_h(\cdot \mid s,a^\prime) \ge 0  \,,  \forall a^\prime \in \gA \,. \\
\end{array}\right.
\end{split}
\end{equation*}

Let $\Tilde{P}_h(s^\prime \mid s, a) = \frac{ P_h(s^\prime \mid s,a) }{ P_h^o(s^\prime \mid s,a)}$, we can rewrite the above optimization problem as
\begin{equation*}
\begin{split}
&\min_{\Tilde{P}_h } \sum_{s^\prime} \Tilde{P}_h(s^\prime \mid s,a) P_h^o(s^\prime \mid s,a) \hVpn(s^\prime) \\
&\text{s.t.}\quad  \left\{\begin{array}{lc}
\sum_{s^\prime, a^\prime} | \Tilde{P}_h(s^\prime \mid s,a^\prime) - 1| P_h^o(s^\prime \mid s,a^\prime) \leq A\rho \,, \\
\sum_{s^\prime} \Tilde{P}_h(s^\prime \mid s,a^\prime)P_h^o(s^\prime \mid s,a^\prime) = 1 \,, \quad  \forall a^\prime \in \gA\\
\Tilde{P}_h(\cdot \mid s,a^\prime) \geq 0\,, \quad  \forall a^\prime \in \gA\,.
\end{array}\right.
\end{split}
\end{equation*}

Using the Lagrangian multiplier method and $f(x) = |x - 1|$, we have the Lagrangian $L(\Tilde{P}_h, \eta, \lambda)$ with multiplier $\eta = \{\eta_a\}_{a\in\gA}, \eta_a \in \mathbb{R}$, $\lambda \geq 0$,
\begin{align*}
     & L\left(\Tilde{P}_h, \eta, \lambda\right) (s,a) \\
    = \ & \sum_{s^\prime} \Tilde{P}_h(s^\prime \mid s,a) P_h^o(s^\prime \mid s,a) \hVpn(s^\prime) + \lambda \left( \sum_{s^\prime, a^\prime} \left| \Tilde{P}_h(s^\prime \mid s,a^\prime) - 1\right| P_h^o(s^\prime \mid s,a^\prime) - A\rho \right)\\ 
    & - \sum_{a^\prime} \eta_{a^\prime} \left( \sum_{s^\prime} \Tilde{P}_h(s^\prime \mid s,a^\prime)P_h^o(s^\prime \mid s,a^\prime) - 1 \right) \\
     = \ &  - \lambda A \rho + \sum_{a^\prime} \eta_{a^\prime} + 
    \lambda \sum_{s^\prime, a^\prime} P_h^o(s^\prime \mid s,a^\prime) \left( f\left(\Tilde{P}_h(s^\prime \mid s,a^\prime) \right) - \Tilde{P}_h(s^\prime \mid s,a^\prime) \left(\frac{\eta_{a^\prime} - \mathbb{I}\{a^\prime = a\} V_{h+1}^{\pi_k}(s^\prime)}{\lambda} \right)\right) \,.
\end{align*}


The convex conjugate of $f$ is $f^\ast(y) = \max_{x} \limits \langle x, y\rangle - f(x) $. 
Using $f^\ast$, we can thus optimize over $\Tilde{P}_h$ and rewrite the Lagrangian as
\begin{align*}
    L(\eta, \lambda) (s,a) = \ &  \min_{\Tilde{P}_h} \limits L\left(\Tilde{P}_h, \eta, \lambda\right) (s,a) \\
    = \ & - \lambda A \rho +  \sum_{a^\prime} \eta_{a^\prime} - \lambda \sum_{s^\prime, a^\prime} P_h^o(s^\prime \mid s,a^\prime) f^\ast \left( \frac{\eta_{a^\prime} - \mathbb{I}\{a^\prime = a\} V_{h+1}^{\pi_k}(s^\prime)}{\lambda}\right) \,.
\end{align*}

Conditioned on $x \ge 0$, $f(x) = |x-1|$, notice that the conjugate $f^\ast(y)$ has the following closed form,
\begin{equation*}
f^\ast(y)=\max_{x} \limits  \langle x, y\rangle - f(x) =\left\{
\begin{aligned}
-1& \quad  \text{$y \le -1$} \,,\\
y&  \quad \text{$y \in [-1,1]$} \,,\\
+\infty&  \quad \text{$y > 1$} \,.
\end{aligned}
\right.
\end{equation*}

Let $\Tilde{\eta}_a = \eta_a + \lambda$, using the closed form of $f^\ast(y)$, the equality $\max\left\{a,b\right\} = (a - b)_{+} + b$ and conditioned on $\frac{\eta_{a^\prime} - \mathbb{I}\{a^\prime = a\} V_{h+1}^{\pi_k}(s^\prime)}{\lambda} \le 1$,  we can rewrite the optimization problem as
\begin{align*}
    L(\Tilde{\eta}, \lambda) (s,a) &= - \lambda A \rho +  \sum_{a^\prime} \eta_{a^\prime} - \lambda \sum_{s^\prime, a^\prime} P_h^o(s^\prime \mid s,a^\prime) f^\ast \left( \frac{\eta_{a^\prime} - \mathbb{I}\{a^\prime = a\} V_{h+1}^{\pi_k}(s^\prime)}{\lambda}\right) \\
    &= - \lambda A \rho -\lambda A +  \sum_{a^\prime} \Tilde{\eta}_{a^\prime} - \lambda \sum_{s^\prime, a^\prime} P_h^o(s^\prime \mid s,a^\prime) \max \left\{ \frac{\eta_{a^\prime} - \mathbb{I}\{a^\prime = a\} V_{h+1}^{\pi_k}(s^\prime)}{\lambda}, -1\right\} \\
    &= - \lambda A \rho + \sum_{a^\prime} \Tilde{\eta}_{a^\prime} -  \sum_{s^\prime, a^\prime} P_h^o(s^\prime \mid s,a^\prime) \left(\Tilde{\eta}_{a^\prime} - \mathbb{I}\{a^\prime = a\} V_{h+1}^{\pi_k}(s^\prime) \right)_{+} \,.
\end{align*}
where the constraint on $\lambda$ is 
\[\lambda \ge 0,   \quad \Tilde{\eta}_{a^\prime} - \mathbb{I}\{a^\prime = a\} V_{h+1}^{\pi_k}(s^\prime)\leq 2\lambda, \ \forall a^\prime,s^\prime \,.\]



Note that the above Lagrangian is decreasing in $\lambda$, so it achieves its maximum at the smallest feasible value $\lambda = 
\max_{s^\prime, a^\prime} \limits \frac{ (\Tilde{\eta}_{a^\prime} - \mathbb{I}\{a^\prime = a\} V_{h+1}^{\pi_k}(s^\prime))_{+}}{2} $. 
Directly optimizing over $\lambda$, we can reduce the problem to
\begin{align*}
    L(\Tilde{\eta}) (s,a) = \sum_{a^\prime} \Tilde{\eta}_{a^\prime} -  \sum_{s^\prime, a^\prime} P_h^o(s^\prime \mid s,a^\prime) \left(\Tilde{\eta}_{a^\prime} - \mathbb{I}\{a^\prime = a\} V_{h+1}^{\pi_k}(s^\prime) \right)_{+} - \max_{s^\prime, a^\prime}\frac{A \rho (\Tilde{\eta}_{a^\prime} - \mathbb{I}\{a^\prime = a\} V_{h+1}^{\pi_k}(s^\prime))_{+}}{2} \,.
\end{align*}
Define $g\left(\Tilde{\eta}, P_h^o\right) = -L(\Tilde{\eta}) (s,a) $ as 
\begin{align*}
    g(\Tilde{\eta}, P_h^o) 
    = \ &  - \sum_{a^\prime} \Tilde{\eta}_{a^\prime} +  \sum_{s^\prime, a^\prime} P_h^o(s^\prime \mid s,a^\prime) \left(\Tilde{\eta}_{a^\prime} - \mathbb{I}\{a^\prime = a\} V_{h+1}^{\pi_k}(s^\prime) \right)_{+} + \max_{s^\prime, a^\prime} \frac{A \rho (\Tilde{\eta}_{a^\prime} - \mathbb{I}\{a^\prime = a\} V_{h+1}^{\pi_k}(s^\prime))_{+}}{2} \,.
\end{align*}

%Then we need to find the minimum of $g$.
Assume $g$ achieves its minimum when $\Tilde{\eta} = \left\{\Tilde{\eta}_1, \cdots , \Tilde{\eta}_A\right\}$.
Suppose $\Tilde{\eta}$ has a component $\Tilde{\eta}_a < 0$. Consider $\eta^\prime = \left\{\Tilde{\eta}_1, \cdots, 0, \cdots, \Tilde{\eta}_A\right\}$, where we change the negative element $\Tilde{\eta}_a$ to 0 and keep other components unchanged. Then we have
$$g(\Tilde{\eta}, P_h^o) - g(\eta^\prime, P_h^o) = -\Tilde{\eta}_a > 0\,, $$
which contradicts the hypothesis that $g$ achieves its minimum at $\Tilde{\eta}$.

On the other hand, suppose $\Tilde{\eta} $ has a component $\Tilde{\eta}_a > H$. Then consider $\eta^\prime = \left\{\Tilde{\eta}_1, \cdots, H, \cdots, \Tilde{\eta}_A\right\}$, where we change the corresponding $\Tilde{\eta}_a$ to $H$ and keep other components unchanged. Denote $f(\Tilde{\eta}) = \max_{s^\prime, a^\prime} \limits \frac{A \rho (\Tilde{\eta}_{a^\prime} - \mathbb{I}\{a^\prime = a\} V_{h+1}^{\pi_k}(s^\prime))_{+}}{2}$, and we have

\begin{align*}
    g\left(\Tilde{\eta}, P_h^o\right) - g\left(\eta^\prime, P_h^o\right) 
    =\ & -\Tilde{\eta}_a + H + \sum_{s^\prime} P_h^o(s^\prime \mid s,a) (\Tilde{\eta}_{a} - H)  + f(\Tilde{\eta}) - f(\eta^\prime)\\
    \ge \ & -\Tilde{\eta}_a + H + \sum_{s^\prime} P_h^o(s^\prime \mid s,a) (\Tilde{\eta}_{a} - H) \\
    = \ & 0 \,.
\end{align*}
Therefore, $g$ achieves its minimum with $\Tilde{\eta}$, with $0 \le \eta_{a} \le H, \forall a \in \gA$. We remark that a similar form and technique are also used for analyzing robust policy evaluation (Lemma C.1 \citep{yang2021towards}).

We can now rewrite 
\begin{align*}
     \hPs \left(\hVpn\right)(s,a)  - \sPs\left(\hVpn\right)(s,a)
     = \ & \min_{\eta_{1}\in [0, H]^{|\gA|}} g(\eta_1, \hat{P}_h^{o,k}) - \min_{\eta_{2}\in [0, H]^{|\gA|}} g(\eta_2, P_h^o) \\
     \leq \ & \max_{\eta\in [0, H]^{|\gA|}} \left|g\left(\eta, \hat{P}_h^{o,k}\right) -  g\left(\eta, P_h^o\right) \right| \,.
     %= \ & \sup_{\eta_{a^\prime} \in [0, 3 H + 2 \rho], \ \forall a^\prime}  \left|g(\eta, \hat{P}_h^{o,k}^o) -  g\left(\eta, P_h^o\right) \right| \,.
\end{align*}

To upper bound $\hPs \left(\hVpn\right)(s,a)  - \sPs\left(\hVpn\right)(s,a)$, we first consider the bound of $\left|g\left(\eta, \hat{P}_h^{o,k}\right) -  g\left(\eta, P_h^o\right) \right|$,
\begin{align*}
    &  \left|g\left(\eta, \hat{P}_h^{o,k}\right) -  g\left(\eta, P_h^o\right) \right| \\
    = \ & \left| \sum_{s^\prime, a^\prime} \hat{P}_h^{o,k}(s^\prime \mid s,a^\prime) \left(\eta_{a^\prime} - \mathbb{I}\{a^\prime = a\} V_{h+1}^{\pi_k}(s^\prime) \right)_{+} - \sum_{s^\prime, a^\prime} P_h^o(s^\prime \mid s,a^\prime) \left(\eta_{a^\prime} - \mathbb{I}\{a^\prime = a\} V_{h+1}^{\pi_k}(s^\prime) \right)_{+} \right|\\
    = \ & \left| \  \sum_{a^\prime}  \sum_{s^\prime} \left( \hat{P}_h^{o,k}(s^\prime \mid s,a^\prime) - P_h^o(s^\prime \mid s,a^\prime) \right)\left(\eta_{a^\prime} - \mathbb{I}\{a^\prime = a\} V_{h+1}^{\pi_k}(s^\prime) \right)_{+} \right|  \\
    \leq \ & \sum_{a^\prime} \left\| \hat{P}_h^{o,k}(\cdot \mid s,a^\prime) - P_h^o(\cdot \mid s,a^\prime) \right\|_1 \max_{s \in \gS}\left| \eta_{a^\prime} - \mathbb{I}\{a^\prime = a\} V_{h+1}^{\pi_k}(s)  \right|\\
    \le \ & H \sum_{a^\prime} \left\| \hat{P}_h^{o,k}(\cdot \mid s,a^\prime) - P_h^o(\cdot \mid s,a^\prime) \right\|_1\,,
\end{align*}
where the first inequality is by H\"older's inequality, and the second inequality follows from $\eta_a \in [0, H ] , \ \forall a \in \gA$.

By Hoeffding's inequality and a union bound over all $s,a^\prime$, $N_h^k(s,a)$, the following inequality holds with probability at least $1 - \delta$,
\begin{align*}
    \left\| \hat{P}_h^{o,k}(\cdot \mid s,a^\prime) - P_h^o(\cdot \mid s,a^\prime) \right\|_1  
    \leq \ &  \sqrt{\frac{4S \log(SAH^2 K / \delta)}{N_h^k(s,a)}} \,.
\end{align*}

To upper bound $\max_{\eta\in [0, H]^{|\gA|}} \left|g\left(\eta, \hat{P}_h^{o,k}\right) -  g\left(\eta, P_h^o\right) \right|$, we first create an $\epsilon$-net $N_\epsilon(\eta)$ with $g$ over $\eta\in [0, H ]$ such that
\begin{align*}
    \max_{\eta\in [0, H ]} \left|  g\left(\eta, \hat{P}_h^{o,k}\right)  - g\left(\eta, P_h^o\right)\right| \leq \max_{\eta\in N_\epsilon(\eta)} \left|  g\left(\eta, \hat{P}_h^{o,k}\right)  - g\left(\eta, P_h^o\right)\right| + 2 \epsilon \,.
\end{align*}

Taking a union bound over $N_\epsilon(\eta)$, we have 
\begin{align*}
    \max_{\eta\in [0, H ]} \left|  g\left(\eta, \hat{P}_h^{o,k}\right)  - g\left(\eta, P_h^o\right)\right| \leq HA \sqrt{\frac{4 S \log(3SAH^2K | N_\epsilon(\eta)| / \delta)}{N_h^k(s,a)}} + 2 \epsilon \,,
\end{align*}
where $| N_\epsilon(\eta)|$ is the size of the $\epsilon$-net.

It now remains to find the size of the $\epsilon$-net, which can be easily obtained if $g$ is Lipschitz. Notice that 
\begin{align*}
     &|g(\Tilde{\eta}_1, P_h^o) -  g(\Tilde{\eta}_2, P_h^o) | \\
     \le \  &  \sum_{s^\prime, a^\prime}P_h^o(s^\prime \mid s,a) | \Tilde{\eta}_{1, a^\prime} - \Tilde{\eta}_{2, a^\prime}|  + \sum_{a^\prime}| \Tilde{\eta}_{1, a^\prime} - \Tilde{\eta}_{2, a^\prime}| + \frac{ \max_{a^\prime}\limits| \Tilde{\eta}_{1, a^\prime} - \Tilde{\eta}_{2, a^\prime}|}{2} A\rho \\
     \le \ & \frac{A(4 + \rho)}{2} \|\Tilde{\eta_1} - \Tilde{\eta_2}\|_{\infty} \,, 
\end{align*}
where the first inequality is by the triangle inequality, the property of the maximum function and $|(a)_{+}-(b)_{+}| \le |a-b|$, and the second inequality follows from the definition of the infinity norm.

Therefore $g$ is a $\frac{A(4 + \rho)}{2}$-Lipschitz function over $\eta\in [0, H ]$. Thus combining with Lemma \ref{lem:eps_cover}, we have $| N_\epsilon(\eta)| \leq \left( \frac{A(4 + \rho)}{2\epsilon}\right)^A$. 
Hence, the following inequality holds with probability at least $1-\delta^\prime$:
\begin{align*}
\hPs (\hVpn)(s,a)   - \sPs (\hVpn)(s,a) 
    \leq \ & \max_{\eta_{a} \in [0, H]^{|\gA|}} \left|  g\left(\eta, \hat{P}_h^{o,k}\right)  - g\left(\eta, P_h^o\right)\right| \\
    \leq \ & AH\sqrt{\frac{4 S A\log(3SA^2H^2K ( 4 + \rho)/ 2\epsilon \delta^\prime)}{N_h^k(s,a)}} + 2 \epsilon \,.
\end{align*}

Take $\epsilon = \frac{1}{2\sqrt{K}}$, then 
\begin{align*}
    \hPs (\hVpn)(s,a)   - \sPs (\hVpn)(s,a) 
    \leq \ & AH\sqrt{\frac{4 S A\log(3SA^2H^2K^{3/2}(4+\rho) / \delta^\prime)}{N_h^k(s,a)}} +  \frac{1}{\sqrt{K}} \,.
\end{align*}
\end{proof}

\newpage
\section{Extension to uncertainty set with KL divergence}\label{appendix:thm3}

In this section, we extend our algorithm and analysis to uncertainty sets with KL divergence as a distance metric. We first formally define the uncertainty set considered, which is similar to the one in Definition \ref{def:sa}. 
\begin{defn}[$(s,a)$-rectangular uncertainty set \cite{iyengar2005robust,wiesemann2013robust}]\label{def:kl}
For all time step $h$ and with a given state-action pair $(s,a)$, the $(s,a)$-rectangular uncertainty set $\gP_h(s,a)$ is defined as 
\[
\gP_h(s,a) = \left\{\text{D}_{KL}\left(P_h(\cdot \mid s,a),  P_h^o(\cdot \mid s,a)\right) \leq \rho \,,P_h(\cdot \mid s,a) \in \Delta(\gS) \right\} \,,
\]
where $P_h^o$ is the nominal transition kernel at $h$, $P_h^o(\cdot \mid s,a) > 0, \forall (s,a) \in \gS \times \gA$, $\rho$ is the level of uncertainty and $\text{D}_{KL}\left(p(\cdot \mid s,a), q(\cdot \mid s, a)\right) = \sum_{s^\prime \in \gS} p(s^\prime \mid s,a) \log \left( \frac{ p(s^\prime \mid s,a)}{ q(s^\prime\mid s,a)}\right)$.
\end{defn}

With the above described uncertainty set, our algorithm solves $\sigma_{\hat{\gP}_h}(\hat{V}_{h+1}^\pi)(s,a)$ by solving the following sub-problem, 
\begin{align*}
    \min_{\lambda} \lambda \rho + \lambda \log\left(\sum_{s^\prime} \hat{P}_h^o(s^\prime \mid s,a)\exp\left(\frac{ -  \hat{V}_{h+1}^{\pi_k}(s^\prime) }{\lambda}\right)\right)\,.
\end{align*}
Our algorithm also uses the following bonus function in the robust policy evaluation step, 
\begin{align*}
    b_h^k(s,a) 
    = \ & C_h^k(s,a)+ \sqrt{\frac{2 \log(3SAH^2K/ \delta^\prime)}{N_h^k(s,a)}}  \,.
\end{align*}

With these modifications to Algorithm~\ref{alg}, the following theorem states the formal regret guarantee. 
\begin{thm}[Regret under KL divergence $(s,a)$-rectangular uncertainty set]
\label{thm:kl}
Setting the learning rate $\beta = \sqrt{\frac{2 \log A}{H^2 K}}$, then with probability at least $ 1 - \delta$, the regret incurred by Algorithm~\ref{alg} over $K$ episodes is bounded by 
\begin{align*}
    \text{Regret}(K) 
    = O \left(\frac{SH}{\rho c^2} \sqrt{AK\log(SAH^4K^{3/2}/ \delta)}\right) \,,
\end{align*}
where $0 < c \leq 1$ is the minimal element of $P_h^o$, over all $h \in [H]$.
\end{thm}
In the following, we present the detailed analysis of Theorem \ref{thm:kl}.

\subsection{Good events}
We first define the following good events, in which case we estimate the reward function and the nominal transition functions fairly accurately. 
\begin{align*}
    \gG_{k}^{r} = \ & \left\{\forall s, a, h:\left|r_{h}(s, a)-\hat{r}_{h}^{k}(s, a)\right| \leq \sqrt{\frac{2 \ln (2 SAH^2K / \delta^\prime)}{N_h^k(s,a)}}\right\} \,, \\
    \gG_k^{p} = \ & \left\{\forall s, a, h: \sPs (\hVpn)(s,a)  - \hPs(\hVpn)(s,a)
    \leq C_h^k(s,a)\right\}\,,
\end{align*}
where \[C_h^k(s,a) =\frac{2H}{\rho c}  \sqrt{\frac{4 S  \log(8SAH^4K^2/ \delta^\prime \rho)}{N_h^k(s,a)}} +  \frac{1}{\sqrt{K}} \,,\]
and $c$ is the minimal element of $P_h^o$, over all $h \in [H]$.
When the two good events happen at the same time, we say the algorithm is inside the good event $\gG = \left( \bigcap^K_{k=1} \gG_{k}^{r}\right) \bigcap \left( \bigcap^K_{k=1} \gG_{k}^{p}\right)$. The following lemma shows that $\gG$ happens with high probability.

\begin{lem}[Good event]
Let $\delta = 2 \delta^{\prime}$,  then the good event happens with high probability, i.e. $\mathbb{P}\left[ \gG\right] \geq 1 - \delta$.
\end{lem}

\begin{proof}
    By Hoeffding's inequality and a union bound on all $s,a$, all possible values of $N_k(s,a)$ and $k$, we have $\mathbb{P}\left[ \bigcap^K_{k=1} \gG_{k}^{r}\right] \geq 1 - \delta^\prime$. By Lemma \ref{lem:kl_con_sa}, we have $\mathbb{P}\left[ \bigcap^K_{k=1} \gG_{k}^{p}\right] \geq 1 - \delta^\prime$. Then set $\delta = 2\delta^\prime$ and we have the desired result. 
\end{proof}

\subsection{Regret analysis}
%\kl
%\begin{restatable*}[Regret under KL divergence $(s,a)$-rectangular uncertainty set]{thm}{kl}


%where $\rho$ is the radius of the $(s,a)$-rectangular uncertainty set defined in Definition \ref{def:sa}.
%\end{restatable*}

\begin{proof}
    Similar to the case of $(s,a)$-rectangular set, we start with decomposing the regret as follows,
    \begin{align*}
    \text{Regret}(K) 
    = \ & \sum^K_{k=1} V_1^{\ast}(s) - V_1^{\pi_k}(s)\\
    = \ & \sum^K_{k=1} \left(V_1^{\ast} (s)- \hat{V}_1^{\pi_k}(s) \right) + \left(\hat{V}_1^{\pi_k} (s) - V_1^{\pi_k}(s)\right) \,.
    \end{align*}
    By Lemma \ref{lem:pseudo_regret_sa} and Lemma \ref{lem:kl}, with probability at least $1 - \delta$, we have 
    \begin{align*}
        \text{Regret}(K) 
        = \ & O \left(\frac{H^2\sqrt{K \log A} }{c}\right)  + O \left(\frac{S H}{\rho c^2}\sqrt{AK\log(SAH^4K^{3/2}/ \delta)}\right)  \\
        = \ & O \left(\frac{S H}{\rho c^2}\sqrt{AK\log(SAH^4K^{3/2}/ \delta)}\right)\,,
    \end{align*}
    where $c$ is the minimal element of $P_h^o$, over all $h \in [H]$.
\end{proof}

\begin{lem}\label{lem:kl}
    With Algorithm \ref{alg}, we have
\begin{align*}
    \sum^K_{k=1}(\hat{V}_1^{\pi_k} - V_1^{\pi_k})(s) = O \left(\frac{S H}{\rho c^2} \sqrt{AK\log(SAH^4K^{3/2}/ \delta)}\right)  \,.
\end{align*}
\end{lem}

\begin{proof}
    Similar to the case with $(s,a)$-rectangular uncertainty set, for any $k$, we can decompose $(\hat{V}_1^{\pi_k} - V_1^{\pi_k})(s)$ as, 
\begin{align*}
    (\hat{V}_1^{\pi_k} - V_1^{\pi_k})(s) 
    \leq \ \sum^H_{h=1} \mathbb{E}_{\pi_k, \{p_t\}^h_{t=1}} \left[ (r_h^k(s, a) - \hat{r}_h^k(s, a)) + \left( \hPs \left(\hVpn\right)(s,a)  - \sPs\left(\hVpn\right)(s,a)\right) + b_h^k(s,a)\right]\,.
\end{align*}


Thus by the design of our bonus function and with probability at least $ 1 - \delta$, we have
\begin{align*}
    & \sum^K_{k=1}(\hat{V}_1^{\pi_k} - V_1^{\pi_k})(s) \\
    \leq \ & 2\sum^K_{k=1}\sum^H_{h=1} \mathbb{E}_{\pi_k, \{p_t\}^h_{t=1}} \left[b_h^k(s,a) \right] \\
    = \ &H\sqrt{K} + O\left( \frac{1}{\rho c}\sqrt{S\log(SAH^4K^{3/2}/ \delta)}\right)\sum^K_{k=1}\sum^H_{h=1} \mathbb{E}_{\pi_k, \{p_t\}^h_{t=1}} \left[ \sqrt{\frac{1}{N_h^k(s,a)}}\right]  \,,
\end{align*}
where $c$ is a problem dependent constant.

Let $\omega_h =  \{p_t\}^h_{t=1} / P_h^o$, and assume $P_h^o(\cdot \mid s,a) \geq c, \forall (s,a)$. By Lemma \ref{lem:visit}, we have the bound of the visitation counts:
\begin{align*}
    \sum^K_{k=1}\sum^H_{h=1}  \mathbb{E}_{\pi_k, \{p_t\}^h_{t=1}} \left[\sqrt{\frac{1}{N_h^k(s,a)}}\right] 
    \leq \ & \sum^K_{k=1}\sum^H_{h=1}  \mathbb{E}_{\pi_k, P_h^o}  \left[\omega_h\sqrt{\frac{1}{N_h^k(s,a)}}\right] \\
    \leq \ & \max_{h \in [H]}\omega_h  \sum^K_{k=1}  \sum^H_{h=1}  \mathbb{E}_{\pi_k, P_h^o}  \left[\sqrt{\frac{1}{N_h^k(s,a)}}\right] \\
    \leq \ &  \frac{2 H \sqrt{SAK}}{c} \,.
\end{align*}
Combining everything, conditioned on the good event we have
\begin{align*}
    \sum^K_{k=1}(\hat{V}_1^{\pi_k} - V_1^{\pi_k})(s) = O \left(\frac{SH}{\rho c^2} \sqrt{AK\log(SAH^4K^{3/2}/ \delta)}\right) \,.
\end{align*}

\end{proof}

\begin{lem}\label{lem:kl_con_sa}
    For any $h,k,s,a$, the following inequality holds with probability at least $1-\delta^\prime$,
    \begin{align*}
   \hPs (\hVpn)(s,a)  - \sPs (\hVpn)(s,a)
    \leq \ &  \frac{2H}{\rho c}  \sqrt{\frac{4 S  \log(8SAH^4K^2/ \delta^\prime \rho)}{N_h^k(s,a)}} +  \frac{1}{\sqrt{K}} \,.
\end{align*}
where $c$ is the minimal element of $P_h^o$.
\end{lem}
\begin{proof}
  By the definition of $ \sPs \left(\hVpn\right)(s,a) 
    =  \inf_{P_h \in \gP_h} \limits \sum_{s^\prime} P_h(s^\prime \mid s,a) \hVpn(s^\prime) $, we consider the following optimization problem:
\begin{equation*}
\begin{split}
&\min_{P_h} \,\, \sum_{s^\prime} P_h(s^\prime \mid s,a) \hVpn(s^\prime) \\
&\text{s.t.}\quad  \left\{\begin{array}{lc}
\sum_{s^\prime} P_h(s^\prime \mid s,a)\log\left(\frac{P_h(s^\prime \mid s,a)}{P_h^o(s^\prime \mid s,a)}\right)  \leq \rho \,, \\
\sum_{s^\prime} P_h(s^\prime \mid s,a) = 1  \,, \\
P_h^o(\cdot \mid s,a) > 0, P_h(\cdot \mid s,a) \ge 0  \,. \\
\end{array}\right.
\end{split}
\end{equation*}

Let $\Tilde{P}_h(s^\prime \mid s, a) = \frac{ P_h(s^\prime \mid s,a) }{ P_h^o(s^\prime \mid s,a)}$, we can rewrite the above optimization problem as
\begin{equation*}
\begin{split}
&\min_{\Tilde{P}_h } \sum_{s^\prime} \Tilde{P}_h(s^\prime \mid s,a) P_h^o(s^\prime \mid s,a) \hVpn(s^\prime) \\
&\text{s.t.}\quad  \left\{\begin{array}{lc}
\sum_{s^\prime}  \Tilde{P}_h(s^\prime \mid s,a) P_h^o(s^\prime \mid s,a)\log\left(\Tilde{P}_h(s^\prime \mid s, a)\right) \leq \rho \,, \\
\sum_{s^\prime} \Tilde{P}_h(s^\prime \mid s,a)P_h^o(s^\prime \mid s,a) = 1 \,, \\
\Tilde{P}_h(\cdot \mid s,a) \geq 0  \,.
\end{array}\right.
\end{split}
\end{equation*}

Use the Lagrangian multiplier method and $f(x) = x \log x$, we have the Lagrangian $L(\Tilde{P}_h, \eta, \lambda)$ with multiplier $\eta \in \mathbb{R}$, $\lambda \geq 0$,
\begin{align*}
     & L(\Tilde{P}_h, \eta, \lambda) (s,a) \\
    = \ & \sum_{s^\prime} \Tilde{P}_h(s^\prime \mid s,a) P_h^o(s^\prime \mid s,a) \hVpn(s^\prime) + \lambda \left( \sum_{s^\prime}  \Tilde{P}_h(s^\prime \mid s,a) P_h^o(s^\prime \mid s,a)\log(\Tilde{P}_h(s^\prime \mid s, a)) - \rho \right)\\ 
    & -  \eta \left( \sum_{s^\prime} \Tilde{P}_h(s^\prime \mid s,a)P_h^o(s^\prime \mid s,a) - 1 \right) \\
     = \ &  - \lambda \rho +\eta + 
    \lambda \sum_{s^\prime} P_h^o(s^\prime \mid s,a) \left( f\left(\Tilde{P}_h(s^\prime \mid s,a) \right) - \Tilde{P}_h(s^\prime \mid s,a) \left(\frac{\eta - V_{h+1}^{\pi_k}(s^\prime)}{\lambda} \right)\right) \,.
\end{align*}


The convex conjugate of $f$ is $f^\ast(y) = \max_{x} \limits \langle x, y\rangle - f(x) $. 
Using $f^\ast$, we can thus optimize over $\Tilde{P}_h$ and rewrite the Lagrangian as
\begin{align*}
    L(\eta, \lambda) (s,a) = \min_{\Tilde{P}_h} \limits L(\Tilde{P}_h, \eta, \lambda) (s,a)  = - \lambda \rho +   \eta - \lambda \sum_{s^\prime} P_h^o(s^\prime \mid s,a) f^\ast \left( \frac{\eta - V_{h+1}^{\pi_k}(s^\prime)}{\lambda}\right) \,.
\end{align*}

Conditioned on $x \ge 0$, $f(x) = x \log x$, notice that the conjugate $f^\ast(y)$ has the following closed form,
\begin{equation*}
f^\ast(y)=\max_{x} \limits  \langle x, y\rangle - f(x) =\exp(y-1) \,.
\end{equation*}

Using the closed form of $f^\ast(y)$, we can rewrite the optimization problem as
\begin{align*}
    L(\eta, \lambda) (s,a) &= - \lambda  \rho +  \eta - \lambda \sum_{s^\prime} P_h^o(s^\prime \mid s,a) f^\ast \left( \frac{\eta -  V_{h+1}^{\pi_k}(s^\prime)}{\lambda}\right) \\
    &= - \lambda \rho +  \eta - \lambda \sum_{s^\prime} P_h^o(s^\prime \mid s,a) \exp\left(\frac{\eta -  V_{h+1}^{\pi_k}(s^\prime) - \lambda}{\lambda}\right)\,.
\end{align*}

Taking the derivative with respect to $\eta$ and setting it to zero,

\begin{align*}
    \frac{\partial L}{\partial \eta} &= 1 - \sum_{s^\prime} P_h^o(s^\prime \mid s,a) \exp\left(\frac{\eta -  V_{h+1}^{\pi_k}(s^\prime) - \lambda}{\lambda}\right) = 0\,, \\
    \eta &= \lambda - \lambda \log\left(\sum_{s^\prime} P_h^o(s^\prime \mid s,a)\exp\left(\frac{ -  V_{h+1}^{\pi_k}(s^\prime) }{\lambda}\right)\right) \,.
\end{align*}

Directly optimize over $\eta$,  we can reduce the problem to
\begin{align*}
    L(\lambda) (s,a) &= \lambda(1-\rho) -  \lambda \log\left(\sum_{s^\prime} P_h^o(s^\prime \mid s,a)\exp\left(\frac{ -  V_{h+1}^{\pi_k}(s^\prime) }{\lambda}\right)\right)  - \lambda \,, \\
    & = - \lambda \rho - \lambda \log\left(\sum_{s^\prime} P_h^o(s^\prime \mid s,a)\exp\left(\frac{ -  V_{h+1}^{\pi_k}(s^\prime) }{\lambda}\right)\right)   \,.
\end{align*}
Define $g(\lambda, P_h^o) = -L(\lambda) (s,a) $ as 
\begin{align*}
    g(\lambda, P_h^o) 
    = \ &  \lambda \rho + \lambda \log\left(\sum_{s^\prime} P_h^o(s^\prime \mid s,a)\exp\left(\frac{ -  V_{h+1}^{\pi_k}(s^\prime) }{\lambda}\right)\right)\,.
\end{align*}

%Then we need to find the minimum of $g$.

Note that the Lagrangian multiplier $\lambda \ge 0$.
Then we prove $g$ is bounded within $[-H, H]$ over $[0, H/ \rho]$.

\begin{align*}
    g(\lambda, P_h^o) 
    = \ &  \lambda \rho + \lambda \log\left(\sum_{s^\prime} P_h^o(s^\prime \mid s,a)\exp\left(\frac{ -  V_{h+1}^{\pi_k}(s^\prime) }{\lambda}\right)\right)\,, \\
    \le \ &  \lambda \rho + \lambda \log\left(\sum_{s^\prime} P_h^o(s^\prime \mid s,a)\exp\left(\frac{ -  0 }{\lambda}\right)\right)\,, \\
    = \ & \lambda \rho \le H \,,
\end{align*}
where the first inequality follows from $V_{h+1}^{\pi_k}(s^\prime) \ge 0$ and the second inequality is by $\lambda \le H/ \rho$.

\begin{align*}
    g(\lambda, P_h^o) 
    = \ &  \lambda \rho + \lambda \log\left(\sum_{s^\prime} P_h^o(s^\prime \mid s,a)\exp\left(\frac{ -  V_{h+1}^{\pi_k}(s^\prime) }{\lambda}\right)\right)\,, \\
    \ge \ &  \lambda \rho + \lambda \log\left(\sum_{s^\prime} P_h^o(s^\prime \mid s,a)\exp\left(\frac{ -  H }{\lambda}\right)\right)\,, \\
    = \ & \lambda \rho - H  \ge -H \,,
\end{align*}
where the first inequality follows from $V_{h+1}^{\pi_k}(s^\prime) \le H$ and the second inequality is by $\lambda \ge 0$.

Moreover, from the derivation above we know that for any $P$, $g(0, P) \le 0$ and for $\lambda > H / \rho$,
\begin{align*}
    g\left( \lambda , P \right) 
    \geq \lambda \rho  + \lambda \log (\exp( -H / \lambda)) > 0 \,. 
\end{align*}
Therefore, $g$ achieves its minimum over $\lambda \in [0, H / \rho]$. We remark that the same form is also used for sample complexity results \citep{badrinath2021robust,yang2021towards}.

We can now rewrite 
\begin{align*}
     \hPs \left(\hVpn\right)(s,a)  - \sPs\left(\hVpn\right)(s,a)
     = \ & \min_{0 \le \lambda_1 \le H/ \rho} g\left(\lambda_1, \hat{P}_h^{o,k}\right) - \min_{0 \le \lambda_2 \le H/ \rho} g\left(\lambda_2, P_h^o\right) \\
     \leq \ & \max_{0 \le \lambda \le H/ \rho} \left|g\left(\lambda, \hat{P}_h^{o,k}\right) -  g\left(\lambda, P_h^o\right) \right| \,.
\end{align*}

By \cite{nilim2005robust} (Appendix C), when $\lambda = 0$, $g\left(\lambda, \hat{P}_h^{o,k}\right) = g\left(\lambda, P_h^o\right) = -\min_{s \in \gS} V^{\pi_k}_{h+1}(s)$. Therefore, it suffices to bound $\max_{c \leq \lambda \le H/ \rho} \left|g\left(\lambda, \hat{P}_h^{o,k}\right) -  g\left(\lambda, P_h^o\right) \right|$, where $c > 0$. We now have 
\begin{align*}
    &  \left|g\left(\lambda, \hat{P}_h^{o,k}\right) -  g\left(\lambda, P_h^o\right) \right| \\
    = \ & \left| \lambda \log\left(\sum_{s^\prime} \hat{P}_h^{o,k}(s^\prime \mid s,a)\exp\left(\frac{ -  V_{h+1}^{\pi_k}(s^\prime) }{\lambda}\right)\right) - \lambda \log\left(\sum_{s^\prime} P_h^o(s^\prime \mid s,a)\exp\left(\frac{ -  V_{h+1}^{\pi_k}(s^\prime) }{\lambda}\right)\right)\right|\\
    = \ & \left | \lambda \log \left( 1 + \frac{\sum_{s^\prime} (\hat{P}_h^{o,k}(s^\prime \mid s,a) - P_h^o(s^\prime \mid s,a))\exp\left(\frac{ -  V_{h+1}^{\pi_k}(s^\prime) }{\lambda}\right)}{\sum_{s^\prime} P_h^o(s^\prime \mid s,a)\exp\left(\frac{ -  V_{h+1}^{\pi_k}(s^\prime) }{\lambda}\right)} \right)\right| \\
    \leq \ & 2 \lambda \left|  \frac{\sum_{s^\prime} (\hat{P}_h^{o,k}(s^\prime \mid s,a) - P_h^o(s^\prime \mid s,a))\exp\left(\frac{ -  V_{h+1}^{\pi_k}(s^\prime) }{\lambda}\right)}{\sum_{s^\prime} P_h^o(s^\prime \mid s,a)\exp\left(\frac{ -  V_{h+1}^{\pi_k}(s^\prime) }{\lambda}\right)} \right|\\
    \leq \ & 2\lambda \max_{s^\prime} \left| \frac{\hat{P}_h^{o,k}(s^\prime \mid s,a) - P_h^o(s^\prime \mid s,a)}{P_h^o(s^\prime \mid s,a)} \right|
\end{align*}
where the first inequality follows from $|\log(1 + x)| \leq 2|x|$ and the second inequality follows from H\"older's inequality.



By Hoeffding's inequality and a union bound over all $s,a^\prime$, $N_h^k(s,a)$, the following inequality holds with probability at least $1 - \delta$,
\begin{align*}
    \max_{s^\prime} \left| \hat{P}_h^{o,k}(s^\prime \mid s,a) - P_h^o(s^\prime \mid s,a) \right| \le \left\| \hat{P}_h^{o,k}(\cdot \mid s,a) - P_h^o(\cdot \mid s,a) \right\|_1  
    \leq \ &  \sqrt{\frac{4S \log(SAH^2 K / \delta)}{N_h^k(s,a)}} \,.
\end{align*}

Then we create an $\epsilon$-net $N_\epsilon(\lambda)$ with $g$ over $\lambda \in [0, H / \rho]$ such that
\begin{align*}
    \max_{\lambda \in [0, H / \rho]} |  g(\lambda, \hat{P}_h^{o,k})  - g(\lambda, P_h^o)| \leq \max_{\lambda\in N_\epsilon(\lambda)} |  g(\lambda, \hat{P}_h^{o,k})  - g(\lambda, P_h^o)| + 2 \epsilon \,.
\end{align*}

Then we know that $| N_\epsilon(\lambda)| $ is bounded by the area of the rectangle $[0, H/ \rho] \times [-H, H] $ over $\epsilon^2$, 
\begin{align*}
     | N_\epsilon(\lambda)| \leq \frac{2H^2}{\rho \epsilon^2} \,.
\end{align*}


%Therefore $g$ is a $(2\rho + H)$-Lipschitz function over $\lambda \in [c, H / \rho]$. Combining with Lemma \ref{lem:eps_cover}, we have $| N_\epsilon(\lambda)| \leq  \frac{(1 + 2\rho + H)}{2\epsilon}$. 

Taking a union bound over $N_\epsilon(\lambda)$ and denoting $c = \min_{s^\prime} \limits P_h^o(s^\prime \mid s,a)$, the following inequality holds with probability at least $1-\delta^\prime$:
\begin{align*}
\hPs (\hVpn)(s,a)  - \sPs (\hVpn)(s,a)
\leq \ & \max_{\lambda\in [0, H / \rho]} |  g(\lambda, \hat{P}_h^{o,k})  - g(\lambda, P_h^o)| \\
    \leq \ & \max_{\lambda\in N_\epsilon(\lambda)} |  g(\lambda, \hat{P}_h^{o,k})  - g(\lambda, P_h^o)| + 2 \epsilon \\
    %\leq \ &  \lambda\exp(H / \lambda) \sqrt{\frac{4 S \log(3SAH^4K  / \epsilon^2 \rho \delta^\prime)}{N_h^k(s,a)}} + 2 \epsilon \\
    \leq \ & 2 \frac{H}{\rho}  \max_{s^\prime} \left| \frac{\hat{P}_h^{o,k}(s^\prime \mid s,a) - P_h^o(s^\prime \mid s,a)}{P_h^o(s^\prime \mid s,a)} \right| + 2 \epsilon \\
    \le \ &  2 \frac{H}{\rho c} \sqrt{\frac{4S \log(2SAH^4 K / \delta^\prime \rho \epsilon^2)}{N_h^k(s,a)}}  + 2 \epsilon \,,
\end{align*}
%where $c$ is the minimal element of $P_h^o$.


Take $\epsilon = \frac{1}{2\sqrt{K}}$, then 
\begin{align*}
    \hPs (\hVpn)(s,a)  - \sPs (\hVpn)(s,a) 
    \leq \ & 2 \frac{H}{\rho c}  \sqrt{\frac{4 S  \log(8SAH^4K^2/ \delta^\prime \rho)}{N_h^k(s,a)}} +  \frac{1}{\sqrt{K}} \,.
\end{align*}
\end{proof}


\newpage
\section{Proof of Proposition 1}\label{appendix:prop}
\hard*
\begin{proof}
We consider a robust MDP with three states $s_0, s_1, s_2$ and two actions $a_0, a_1$. Without loss of generality, we let $s_0$ be the initial state.  On the initial state $s_0$, both actions will lead to a reward of $0$. On state $s_1$, a reward of $1 / (H-1)$ is given for both actions. On state $s_2$, a reward of $-1 / (H-1)$ is given for both actions. The nominal transition dynamic of the MDP is the following. Taking action $a_0$ on $s_0$ leads to a transition to $s_1$ with probability $\epsilon$ and to $s_2$ with probability $1 - \epsilon$, where $\epsilon > 0.5$. Taking the other action $a_1$ leads to $s_1$ and $s_2$ with equal probability. The states $s_1$ and $s_2$ are absorbing, in the sense that taking any action in these two states leads back to the same state. The transition of the MDP is also illustrated in Figure \ref{fig:hard}, where a dashed line denotes a probabilistic transition and a solid line denotes a deterministic transition.  
\begin{figure}[h]
    \centering
    \includegraphics[width=0.4\textwidth]{hard1.png}
    \includegraphics[width=0.4\textwidth]{hard2.png}
    \caption{The left figure describes the nominal transition dynamic of the MDP. The right figure describes the robust transition dynamic of the MDP. }
    \label{fig:hard}
\end{figure}
With the nominal transition, it is clear that an optimal policy would be always taking $a_0$. Denote this policy as $\pi_{o, \ast}$, the value for this policy under nominal transition over $K$ episodes is 
\begin{align*}
    V^{\pi_{o, \ast}}(s_0) = K (H - 1) \left(\epsilon \cdot \frac{1}{H - 1} - (1 - \epsilon) \cdot \frac{1}{H - 1} \right) = K (2\epsilon - 1) > 0 \,,
\end{align*}
where the last inequality is due to $\epsilon > 0.5$.

However, consider the uncertainty radius $\rho$ and the robust transition denoted by the right figure of Figure \ref{fig:hard}. That is, taking $a_0$ on $s_0$ leads to a transition to $s_1$ with probability $ \epsilon - \rho / 2$ and to $s_2$ with probability $1 - \epsilon + \rho / 2$. Note that as $\epsilon > 0.5$, $\rho \leq 1$, $\epsilon - \rho / 2 > 0$. Moreover, this transition is indeed the worst case transition for any non-uniform policy. Let $\Tilde{V}$ denote the robust value under the above described transition. With a uniform policy $\pi$, its value under this transition is 
\begin{align*}
    \Tilde{V}^\pi(s_0) = K (H - 1) \left(0.5 \left(\epsilon - \frac{\rho}{2} \right)\cdot \frac{1}{H - 1} - 0.5 \left(1 - \epsilon +\frac{\rho}{2}\right) \cdot \frac{1}{H - 1} \right) = K \left(\epsilon - \rho / 2 - 0.5\right) \,.
\end{align*}
The value of $\pi_{o,\ast}$ is, however, 
\begin{align*}
    \Tilde{V}^{\pi_{o, \ast}}(s_0) = K (H - 1) \left(\left(\epsilon - \frac{\rho}{2} \right)\cdot \frac{1}{H - 1} - \left(1 - \epsilon +\frac{\rho}{2}\right) \cdot \frac{1}{H - 1} \right) = K \left(2\epsilon - \rho  - 1\right) \,.
\end{align*}
For any $2\epsilon - 1 \leq \rho \leq 1$, we have $\Tilde{V}^{\pi_{o, \ast}}(s_0) \leq \Tilde{V}^\pi(s_0) $. Since $\epsilon > 0.5$ is arbitrary, the optimal policy under the nominal transition is non-robust even under the slightest perturbation. 
\end{proof}
\newpage
\section{Auxiliary lemmas}
\begin{lem}[\cite{bartlett2013theoretical}]\label{lem:eps_cover}
An $\epsilon$-cover of a subset $T$ of a pseudometric space $(S, d)$ is a set $\hat{T} \subset T$ such that for each $t \in T$ there is a $\hat{t} \in \hat{T}$ such that $d(t, \hat{t}) \leq \epsilon$. The $\epsilon$-covering number of $T$ is
$$
N(\epsilon, T, d)=\min \left\{|\hat{T}|: \hat{T} \text { is an } \epsilon \text {-cover of } T \right\} \,.
$$
    Let $F_{d}$ be the set of $L$-Lipschitz functions (wrt $\|\cdot\|_{\infty}$ ) mapping from $[0,1]^{d}$ to $[0,1]$. Then
$$
\log N\left(\epsilon, F_{d},\|\cdot\|_{\infty}\right)=\Theta\left(\left(\frac{L}{\epsilon}\right)^{d}\right) \,.
$$
\end{lem}

%\begin{lem}[Covering Number of a bounded real line] \label{lem:bounded_covering} Let $\Theta \subset \mathbb{R}$ with $\Theta=[l, u]$ for some real numbers $u>l$. Let $\mathcal{N}_{\Theta}(\eta)$ be a minimal $\eta$-cover of $\Theta$ with respect to the distance metric $d\left(\theta, \theta^{\prime}\right)=\left|\theta-\theta^{\prime}\right|$ for some fixed $\eta \in(0,1)$. Then we have $\left|\mathcal{N}_{\Theta}(\eta)\right| \leq 3(u-l) / \eta$.
%\end{lem}

\begin{lem}[Lemma 7.5 \cite{agarwal2019reinforcement}]\label{lem:visit}
For an arbitrary sequence of $K$ trajectories $\{(s_h^k, a_h^k)\}_{h=1}^H$, $k = 1, \ldots, K$, we have
    \begin{align*}
        \sum^K_{k=1} \sum^H_{h=1} \frac{1}{\sqrt{N_h^k(s_h^k, a_h^k)}} \leq 2 H \sqrt{SAK} \,.
    \end{align*}
\end{lem}
\begin{proof}
We have
\begin{align*}
\sum^K_{k=1} \sum^H_{h=1} \frac{1}{\sqrt{N_h^k\left(s_h^k, a_h^k\right)}} 
= \ & \sum_{h=1}^{H} \sum_{(s, a) \in \gS \times \gA} \sum_{i=1}^{N_h^K(s, a)} \frac{1}{\sqrt{i}} \\
\leq \ & 2 \sum_{h=1}^{H}  \sum_{(s, a) \in \gS \times \gA}\sqrt{N_h^K(s, a)} \\
\leq \ & 2 \sum_{h=1}^{H} \sqrt{S A \sum_{s, a} N_h^K(s, a)} \\
=\ & 2 H \sqrt{S A K} \,,
\end{align*}
where the first inequality is by $\sum^N_{i=1} \frac{1}{ \sqrt{i}} \leq 2 \sqrt{N}$ and the second inequality follows from the Cauchy--Schwarz inequality.
\end{proof}

\iffalse
\paragraph{Online mirror descent}


We need to define the OMD problem first!!!

In each iteration of Online Mirror Descent (OMD), the following problem is solved:
$$
x_{k+1} \in \underset{x \in \Delta_{d}}{\arg \min } t_{K}\left\langle g_{k}, x-x_{k}\right\rangle+B_{\omega}\left(x, x_{k}\right) .
$$
The following lemma provides a fundamental inequality which will be used in our analysis.


\begin{lem}[Fundamental inequality of Online Mirror Descent, \cite{orabona2019modern}, Theorem 10.4]\label{lem:omd}
    
Assume for $g_{k, i} \geq 0$ for $k=1, \ldots, K$ and $i=1, \ldots, d$. Let $C=\Delta_{d}$ and $\eta>0$. Using OMD with the KL-divergence, learning rate $t_{K}$, and with uniform initialization, $x_{1}=[1 / d, \ldots, 1 / d]$, the following holds for any $u \in \Delta_{d}$
$$
\sum_{k=1}^{K}\left\langle g_{t}, x_{k}-u\right\rangle \leq \frac{\log d}{t_{K}}+\frac{t_{K}}{2} \sum_{k=1}^{K} \sum_{i=1}^{d} x_{k, i} g_{k, i}^{2} .
$$
\end{lem}


\fi

% In our analysis, we will be solving the OMD problem for each time-step $h$ and state $s$ separately,
% $$
% \pi_{h}^{k+1}(\cdot \mid s) \in \underset{\pi \in \Delta_{\mathcal{A}}}{\arg \min } t_{K}\left\langle Q_{h}^{k}(s, \cdot), \pi-x_{h}^{k}(\cdot \mid s)\right\rangle+d_{K L}\left(\pi \| \pi_{h}^{k}(\cdot \mid s)\right) .
% $$
% Therefore, by adapting the above lemma to our notation, we get the following lemma,


\begin{lem}[Fundamental inequality of Online Mirror Descent for RL (Lemma 17 \cite{shani2020optimistic})]\label{lem:omd}
Let $\beta>0$. Let $\pi_{h}^{1}(\cdot \mid s)$ be the uniform distribution. Then, by updating with OMD and with KL divergence regularization, for any $k \in[K], h \in[H]$ and $s \in \mathcal{S}$, the following holds for any stationary policy $\pi$, 
\begin{align}\label{eq:omd}
    \sum_{k=1}^{K}\left\langle Q_{h}^{k}(s, \cdot), \pi_{h}^{k}(\cdot \mid s)-\pi_{h}(\cdot \mid s)\right\rangle \leq \frac{\log A}{\beta}+\frac{\beta}{2} \sum_{k=1}^{K} \sum_{a} \pi_{h}^{k}(a \mid s)\left(Q_{h}^{k}(s, a)\right)^{2} \,.
\end{align}
\end{lem}
\newpage
\section{More experimental details}
\paragraph{Other configurations and set up }
The episode length is set to $20$ and all algorithms are trained for $3000$ episodes. The evaluation results are averaged over $20$ runs and are presented with one standard deviation. All experiments are conducted on a 64-core AMD 3990X CPU.

%\paragraph{Results with $\ell_1$ distance constrained s-rectangular uncertainty sets}
%With the uncertainty set described with $\ell_1$ distance with $s$-rectangular set, we present the following experimental results. 

\paragraph{Results with KL divergence constrained $(s,a)$-rectangular uncertainty sets}
With the uncertainty set described by the KL divergence, we present the following experimental results. All other configurations and setup remain the same as those for the uncertainty set with $\ell_1$ distance.
\begin{figure}[h]
     \centering
     \begin{subfigure}[b]{0.3\textwidth}
         \centering
         \includegraphics[width=\textwidth]{POMD_gridworld_kl_rho_01.png}
         \caption{$\rho = 0.1$}
         \label{fig:kl_rho1}
     \end{subfigure}
     \hfill
     \begin{subfigure}[b]{0.3\textwidth}
         \centering
         \includegraphics[width=\textwidth]{POMD_gridworld_kl_rho_02.png}
          \caption{$\rho = 0.2$}
         \label{fig:kl_rho2}
     \end{subfigure}
     \hfill
     \begin{subfigure}[b]{0.3\textwidth}
         \centering
         \includegraphics[width=\textwidth]{POMD_gridworld_kl_rho_03.png}
          \caption{$\rho = 0.3$}         \label{fig:kl_rho3}
     \end{subfigure}
    \caption{Cumulative rewards obtained by robust and non-robust policy optimization on robust transitions with different levels of uncertainty $\rho = 0.1, 0.2, 0.3$ under KL divergence.}
        \label{fig:exp_kl}
\end{figure}