\begin{restatable*}[Regret under $(s,a)$-rectangular uncertainty set]{thm}{sa}
\label{thm:sa}
Setting $\beta = \sqrt{\frac{2 \log A}{H^2 K}}$, the regret of the algorithm is bounded by 
\begin{align*}
    \text{Regret}(K) 
    = \Tilde{O} \left( (\rho + H) HS \sqrt{AK}\right) \,.
\end{align*}
\end{restatable*}


\paragraph{Design of bonus function}
The bonus function $b_h^k(s,a)$ is defined to be 
\begin{align*}
    b_h^k(s,a) = \sqrt{\frac{2 \log(3SAH^2K/ \delta)}{N_h^k(s,a)}} + (2 \rho + H) \sqrt{\frac{4 S \log(3SAH^2K/ \delta)}{N_h^k(s,a)}} + \frac{6 + \rho}{H\sqrt{K}}
\end{align*}

\paragraph{Decomposition and OMD regret}
We first decompose the regret as 
\begin{align*}
    \text{Regret}(K) 
    = \ & \sum^K_{k=1} \left( V_1^{\pi_\ast} - V_1^{\pi_k} \right) \\
    = \ & \sum^K_{k=1} \left( V_1^{\pi_\ast} - \hat{V}_1^{\pi_k} \right) + \sum^K_{k=1} \left( \hat{V}_1^{\pi_k}  - V_1^{\pi_k} \right) \,.
\end{align*}

For any $h \in [1, H]$, by the Bellman equation and our update rule, we have
\begin{align*}
    & V_h^{\pi_\ast}(s) - \hat{V}_h^{\pi_k}(s) \\
    = \ & \langle Q_h^{\pi_\ast}(s, \cdot) , \pi_\ast(\cdot \mid s) \rangle - \langle \hat{Q}_h^{\pi_k}(s, \cdot) , \pi_k(\cdot \mid s) \rangle \\
    = \ & \langle Q_h^{\pi_\ast}(s, \cdot) -  \hat{Q}_h^{\pi_k}(s, \cdot), \pi_\ast(\cdot \mid s) \rangle + \langle \hat{Q}_h^{\pi_k}(s, \cdot) , \pi_\ast(\cdot \mid s) - \pi_k(\cdot \mid s)  \rangle \\
    = \ & \mathbb{E}_{\pi_\ast} \left[ (r(s, a) - \hat{r}(s,a)) + (\sPsa (V_{h+1}^\ast)(s,a) -  \hPsa(\hVpn)(s,a)) - b_h^k(s,a)\right] \\
    & \ + \langle \hat{Q}_h^{\pi_k}(s, \cdot) , \pi_\ast(\cdot \mid s) - \pi_k(\cdot \mid s)  \rangle \\
    = \ & \mathbb{E}_{\pi_\ast} \left[ (r(s, a) - \hat{r}(s,a)) +  (\sPsa(\hVpn)(s,a) -  \hPsa(\hVpn)(s,a)) - b_h^k(s,a)\right] \\
    & \ + \mathbb{E}_{\pi_\ast} \left[  \sPsa (V_{h+1}^\ast)(s,a) - \sPsa(\hVpn)(s,a)\right] + \langle \hat{Q}_h^{\pi_k}(s, \cdot) , \pi_\ast(\cdot \mid s) - \pi_k(\cdot \mid s)  \rangle \,.
\end{align*}

By the design of our bonus function, with probability at least $1 - 2/ H$, we have
\begin{align*}
     (r(s, a) - \hat{r}(s,a)) + (\sPsa (\hVpn)(s,a) -  \hPsa(\hVpn)(s,a)) - b_h^k(s,a) \leq 0 \,.
\end{align*}

Notice that $\sPsa (V_{h+1}^\ast)(s,a) - \sPsa(\hVpn)(s,a) \leq \sup_{p\in \gP_h} p_h(\cdot \mid s,a) (V_{h+1}^\ast - \hVpn) $. Then, by recursion, we have 
\begin{align*}
    \sum^K_{k=1}V_1^{\pi_\ast}(s) - \hat{V}_1^{\pi_k}(s)
    \le \sum^K_{k=1} \sum^H_{h=1} \mathbb{E}_{\pi_\ast, \{q_h\}^H_{h=1}} \left[ \langle \hat{Q}_h^{\pi_k}(s, \cdot) , \pi_\ast(\cdot \mid s) - \pi_k(\cdot \mid s)  \rangle \right] \,,
\end{align*}
where $q_h = \arg\max_{p\in \gP_h} p_h(\cdot \mid s,a) (V_{h+1}^\ast - \hVpn)$.

By the fundamental inequality of online mirror descent~\cite[Theorem~10.4]{orabona2019modern}, we have
%\dong{add expect}
\begin{align*}
    \sum^K_{k=1} \langle \hat{Q}_h^{\pi_k}(s, \cdot) , \pi_\ast(\cdot \mid s) - \pi_k(\cdot \mid s)  \rangle \leq \frac{\log (A)}{\beta} + \frac{\beta}{2} \sum^K_{k=1} \sum_{a \in \gA} \pi_h^\ast (a \mid s) (\hat{Q}_h^{\pi_k}(s, a) )^2 \,.
\end{align*}
By the update rule, we have $\hat{Q}_h^{\pi_k}(s, a) \in [0, H]$ for all $h, k$. Then, taking $\beta = \sqrt{\frac{2 \log A}{H^2 K}}$ and summing over $h \in [1, H]$, we have 
\begin{align*}
     \sum^K_{k=1}V_1^{\pi_\ast}(s) - \hat{V}_1^{\pi_k}(s) = \Tilde{O} \left(H^2\sqrt{K} \right)\,.
\end{align*}
{\color{blue} TODO: Need to create good events. }


\paragraph{Bounding the estimation error}

By the algorithm's update rule and the robust Bellman equation, we have 
\begin{align*}
    (\hVp - \Vp )(s)
    = \ & \langle \hQp (s, \cdot) - \Qp (s, \cdot) , \pi_k(\cdot \mid s) \rangle \\
    = \ & \left\langle \hat{r}(s, \cdot) - r(s, \cdot)  + ( \hPsa(\hVpn)(s,\cdot) - \sPsa (\Vpn)(s,\cdot) ) + b_h^k(s,\cdot),  \pi_k(\cdot \mid s) \right\rangle  \,.
\end{align*}
By adding and subtracting the term $\sPsa (\hVpn)(s,a)$, we have, for every action $a$,
\begin{align*}
    & \hPsa(\hVpn)(s,a) - \sPsa (\Vpn)(s,a) \\
    = \ & \hPsa(\hVpn)(s,a)  - \sPsa (\hVpn)(s,a) + \sPsa (\hVpn)(s,a)  - \sPsa (\Vpn)(s,a)\\
    \leq \ &   \hPsa (\hVpn)(s,a) - \sPsa (\hVpn)(s,a) + \sup_{P_h \in \gP_h}P_h(\cdot \mid s,a)( \hVpn - \Vpn) \,.
\end{align*}

Define $q_h = \arg\max_{P_h \in \gP_h} \limits P_h(\cdot \mid s,a)(\hVpn - \Vpn)$, $q = \{q_h\}$, we have
\begin{align*}
    & (\hVp - \Vp )(s) \\
    \leq \ & \mathbb{E}_{\pi_k}\left[ (\hat{r}(s, a) - r(s, a)) + \left( \hPsa(\hVpn)(s,a) - \sPsa (\hVpn)(s,a)  \right) + b_h^k(s,a) + q_h (\hVpn - \Vpn )(s,a) \right]\,.
\end{align*}

By doing a recursion with $q_h$ and noticing $V_{H+1}^{\pi_k} = \hat{V}_{H+1}^{\pi_k} = 0$, we have
\begin{align*}
    & (\hat{V}_1^{\pi_k} - V_1^{\pi_k})(s) \\
    \leq \ &\sum^H_{h=1} \mathbb{E}_{\pi_k, \{q_h\}^H_{h=1}} \left[ (\hat{r}(s, a) - r(s, a)) + \left( \hPsa(\hVpn)(s,a) -  \sPsa (\hVpn)(s,a) \right) + b_h^k(s,a)\right]\,.
\end{align*}

In the case of $(s,a)$-rectangular set, we have
\begin{align*}
    \sPsa (\hVpn)(s,a)
    = \inf_{P_h \in \gP_h(s,a)} \ & \sum_{s^\prime} P_h(s^\prime \mid s,a) \hVpn(s^\prime) \\
    \text{subject to} \ & \sum_{s^\prime} | P_h(s^\prime \mid s,a) - P_h^o(s^\prime \mid s,a)| \leq \rho \,, \\
    &  \sum_{s^\prime} P_h(s^\prime \mid s,a) = 1 \,, \\
    &  P_h(\cdot \mid s,a) \ll P_h^o(\cdot \mid s,a)\,.
\end{align*}
% Alignment: the single & after the infimum and after "subject to" lines the objective up with every constraint.

Defining $\Tilde{P}_h(s^\prime \mid s, a) = \frac{ P_h(s^\prime \mid s,a) }{ P_h^o(s^\prime \mid s,a)}$, we can rewrite the above problem as
\begin{align*}
    \sPsa (\hVpn)(s,a)
    = \inf_{P_h \in \gP_h(s,a)} \ & \sum_{s^\prime} \Tilde{P}_h(s^\prime \mid s,a) P_h^o(s^\prime \mid s,a) \hVpn(s^\prime) \\
    \text{subject to} \ & \sum_{s^\prime} | \Tilde{P}_h(s^\prime \mid s,a)  - 1| P_h^o(s^\prime \mid s,a) \leq \rho \,, \\
    &  \sum_{s^\prime} \Tilde{P}_h(s^\prime \mid s,a) P_h^o(s^\prime \mid s,a) = 1 \,, \\
    &  \Tilde{P}_h(s^\prime \mid s,a) \ge 0 \quad \forall s^\prime \in \gS\,.
\end{align*}

We can obtain the Lagrangian $L(\Tilde{P}_h, \eta, \lambda)$ with Lagrange multipliers $\eta\in \mathbb{R}$ and $\lambda\geq 0$,
\begin{align*}
    L(\Tilde{P}_h, \eta, \lambda) (s,a) 
    = \ & \sum_{s^\prime} \Tilde{P}_h(s^\prime \mid s,a) P_h^o(s^\prime \mid s,a) \hVpn(s^\prime) + \lambda \left( \sum_{s^\prime} | \Tilde{P}_h(s^\prime \mid s,a)  - 1| P_h^o(s^\prime \mid s,a)  - \rho \right) \\
    & \ - \eta \left(\sum_{s^\prime} \Tilde{P}_h(s^\prime \mid s,a) P_h^o(s^\prime \mid s,a) - 1 \right) \,.
\end{align*}

Let $f(x) = |x - 1|$; its convex conjugate is $f^\ast(y) = \sup_{x} \limits ( \langle x, y\rangle - f(x) )$. 
Using $f^\ast$, we can thus optimize over $\Tilde{P}_h$ and rewrite the Lagrangian as 
\begin{align*}
    L(\eta, \lambda) (s,a) 
    = \ & \eta - \lambda \rho - \lambda \sum_{s^\prime}P_h^o(s^\prime \mid s,a) f^\ast \left(\frac{1}{\lambda} \left(\eta - \hVpn(s^\prime)\right) \right) \,.
\end{align*}
Restricting the supremum to $x \geq 0$ (since $\Tilde{P}_h \geq 0$), we have $f^\ast(y) = -1$ when $y \leq -1$, $f^\ast(y)=y$ when $y \in [-1, 1]$, and $f^\ast(y) = \infty$ when $y > 1$. Let $\Tilde{\eta} = \eta + \lambda$; then, using the values of $f^\ast(y)$ and the equality $\max\left\{a,b\right\} = (a - b)_{+} + b$, we can rewrite the optimization problem as
\begin{align*}
    \sup_{\lambda \geq 0,  \Tilde{\eta} - \min_s \limits \hVpn(s) \leq 2 \lambda} \Tilde{\eta} - (\rho + 2) \lambda - \sum_{s^\prime}P_h^o(s^\prime \mid s,a) \left( \Tilde{\eta} - \hVpn(s^\prime)\right)_{+} \,.
\end{align*}

Optimize over $\lambda$, we can simplify the above problem into 
\begin{align*}
    \sup_{\Tilde{\eta}} \Tilde{\eta} - \frac{ (\Tilde{\eta} - \min_s \limits \hVpn(s))_{+}}{2}(\rho + 2) - \sum_{s^\prime}P_h^o(s^\prime \mid s,a) \left( \Tilde{\eta} - \hVpn(s^\prime)\right)_{+} \,.
\end{align*}
Define $g(\Tilde{\eta}, P_h^o)$ as 
\[
g(\Tilde{\eta}, P_h^o) = \sum_{s^\prime}P_h^o(s^\prime \mid s,a) \left( \Tilde{\eta} - \hVpn(s^\prime)\right)_{+}  - \Tilde{\eta} + \frac{ (\Tilde{\eta} - \min_s\limits  \hVpn(s))_{+}}{2} (\rho + 2) \,.
\]
First notice that $g$ is a convex function. Moreover, notice that when $\Tilde{\eta} \leq 0$, $g(\Tilde{\eta}, P_h^o) = - \Tilde{\eta} \geq 0$. When $\Tilde{\eta} = 2 \rho + 2H$, $g(\Tilde{\eta}, P_h^o) \geq 0$. 
Thus it suffices to optimize $g(\Tilde{\eta}, P_h^o)$ within the range of $\Tilde{\eta} \in [0, 2 \rho + 2H]$.

We can thus rewrite 
\begin{align*}
    \hPsa(\hVpn)(s,a) - \sPsa (\hVpn)(s,a)  
    = \ & \inf_{\eta_2 \in [0, 2 \rho + 2H] }g(\eta_2, P_h^o) - \inf_{\eta_1 \in [0, 2 \rho + 2H] } g(\eta_1, \hat{P}_h)\\
    \leq \ & \sup_{\eta \in [0, 2\rho + 2H ]} | g(\eta, \hat{P}_h)  - g(\eta, P_h^o) |\,.
\end{align*}
With $\eta \in [0, 2\rho + 2H ]$ and by H\"older's inequality, we have 
\begin{align*}
    | g(\eta, \hat{P}_h)  - g(\eta, P_h^o) |
    = \ & \left|  \sum_{s^\prime}\hat{P}_h(s^\prime \mid s,a) \left( \eta - \hVpn(s^\prime)\right)_{+}  - \sum_{s^\prime}P_h^o(s^\prime \mid s,a) \left( \eta - \hVpn(s^\prime)\right)_{+} \right|  \\
    \leq \ & \left\|\hat{P}_h(\cdot \mid s,a) -  P_h^o(\cdot \mid s,a)\right\|_1 \| \eta - \hVpn(s)\|_\infty \\ 
    \leq \ & (2 \rho + H) \left\|\hat{P}_h(\cdot \mid s,a) -  P_h^o(\cdot \mid s,a)\right\|_1 
\end{align*}

By Hoeffding's inequality and a union bound over all $(s,a)$, the following inequality holds with probability at least $1 - \delta$
\begin{align*}
    \left\|\hat{P}_h(\cdot \mid s,a) -  P_h^o(\cdot \mid s,a)\right\|_1 \leq \sqrt{\frac{4 S \log(3SAH^2K/ \delta)}{N_h^k(s,a)}} \,.
\end{align*}
{\color{blue} TODO: put this as a good event.}

To bound $\sup_{\eta\in [0, 2\rho + 2H ]} | g(\eta, \hat{P}_h)  - g(\eta, P_h^o) |$ with high probability, we first create an $\epsilon$-net $N_\epsilon(\eta)$ with $g$ over $\eta\in [0, 2\rho + 2H ]$ such that
\begin{align*}
    \sup_{\eta\in [0, 2\rho + 2H ]} |  g(\eta, \hat{P}_h)  - g(\eta, P_h^o)| \leq \sup_{\eta\in N_\epsilon(\eta)} |  g(\eta, \hat{P}_h)  - g(\eta, P_h^o)| + 2 \epsilon \,.
\end{align*}
Taking a union bound over $N_\epsilon(\eta)$, we have 
\begin{align*}
    \sup_{\eta\in [0, 2\rho + 2H ]} |  g(\eta, \hat{P}_h)  - g(\eta, P_h^o)| \leq (2 \rho + H)\sqrt{\frac{4 S \log(3SAH^2K | N_\epsilon(\eta)| / \delta)}{N_h^k(s,a)}} + 2 \epsilon \,.
\end{align*}

Notice that $g$ is a $\frac{6 + \rho}{2}$-Lipschitz function over $\eta\in [0, 2\rho + 2H ]$, thus we have $| N_\epsilon(\eta)| = O\left(\frac{6 + \rho}{2 \epsilon}\right)$. Hence, we have
\begin{align*}
    \sup_{\eta\in [0, 2\rho + 2H ]} |  g(\eta, \hat{P}_h)  - g(\eta, P_h^o)| \leq  (2 \rho + H) \sqrt{\frac{4 S \log(3SAH^2K (6 + \rho)/2\epsilon \delta)}{N_h^k(s,a)}} + 2 \epsilon\,.
\end{align*}
Taking $\epsilon = \frac{6 + \rho}{2 H\sqrt{K}}$, we have
\begin{align*}
    \sPsa (\hVpn)(s,a)  - \hPsa(\hVpn)(s,a)
    \leq \ & \sup_{\eta\in [0, 2\rho + 2H ]} |  g(\eta, \hat{P}_h)  - g(\eta, P_h^o)| \\
    \leq \ &  (2 \rho + H) \sqrt{\frac{4 S \log(3SAH^3K^{3/2}/ \delta)}{N_h^k(s,a)}} + \frac{6 + \rho}{H\sqrt{K}}\,.
\end{align*}

By Hoeffding's inequality and a union bound over all $(s,a)$, we also have the following with probability at least $1- \delta$
\begin{align*}
    |r(s,a) - \hat{r}(s,a)| \leq \sqrt{\frac{2 \log(3SAH^2K/ \delta)}{N_h^k(s,a)}}  \,.
\end{align*}

Thus, taking a union bound over the two events above (total failure probability $\delta^\prime = 2 \delta$) and setting $\delta = 1/ H$, with probability at least $1 - 2/ H$, we have
\begin{align*}
     \sum^K_{k=1}(\hat{V}_1^{\pi_k} - V_1^{\pi_k})(s) 
    \leq \ &\sum^K_{k=1}\sum^H_{h=1} \mathbb{E}_{\pi_k, \{q_h\}^H_{h=1}} \left[ 2 b_h^k(s,a)\right]\\
    \leq \ & 2\sqrt{K}(6 + \rho) + O \left( (\rho + H) \sqrt{S \log(SAHK^{3/2})} \right)\sum^K_{k=1}\sum^H_{h=1} \mathbb{E}_{\pi_k, \{q_h\}^H_{h=1}} \left[\sqrt{\frac{1}{N_h^k(s,a)}} \right]\,.
\end{align*}

By Lemma 7.5 of RL theory book {\color{blue} add reference here}, 
we have 
\begin{align*}
    \sum^K_{k=1}\sum^H_{h=1} \sqrt{\frac{1}{N_h^k(s,a)}} \leq 2 H \sqrt{SAK} \,.
\end{align*}
Combining everything, with probability at least $1 - 2/ H$, we have
\begin{align*}
    \sum^K_{k=1}(\hat{V}_1^{\pi_k} - V_1^{\pi_k})(s)  = \Tilde{O} \left((\rho + H) HS\sqrt{AK} \right) \,.
\end{align*}
