\begin{restatable*}[Regret under $s$-rectangular uncertainty set]{thm}{s}
\label{thm:s}
Setting $\beta = \sqrt{\frac{2 \log A}{H^2 K}}$, the regret of the algorithm is bounded by 
\begin{align*}
    \text{Regret}(K) 
    = \Tilde{O} \left( (\rho + H) SA H\sqrt{K} \right) \,.
\end{align*}
\end{restatable*}

\paragraph{Design of bonus function}
\begin{align*}
    b_h^k(s,a) = (2 \rho + 4H)\sqrt{\frac{4 S A\log(6SAH^2K^{3/2} / \delta)}{N_h^k(s,a)}} + 2 \frac{(A ( 2 + \rho) + 2)}{\exp(1/A^2)\sqrt{K}}  + \sqrt{\frac{2 \log(3SAH^2K/ \delta)}{N_h^k(s,a)}}  \,.
\end{align*}

\paragraph{Decomposition and OMD regret}
Similar to the $(s,a)$-rectangular case, we have
\begin{align*}
     \sum^K_{k=1} \left( V_1^{\pi_\ast}(s) - \hat{V}_1^{\pi_k}(s) \right) = \Tilde{O} \left(H^2\sqrt{K} \right)\,.
\end{align*}
{\color{blue} TODO: Need to create good events. }


\paragraph{Bounding the estimation error}
Similar to the case with $(s,a)$-rectangular uncertainty set, we have
\begin{align*}
    & (\hat{V}_1^{\pi_k} - V_1^{\pi_k})(s) \\
    \leq \ &\sum^H_{h=1} \mathbb{E}_{\pi_k, \{q_h\}^H_{h=1}} \left[ (r(s, a) - \hat{r}(s, a)) + \left( \hPs (\hVpn)(s,a)  - \sPs(\hVpn)(s,a)\right) + b_h^k(s,a)\right]\,.
\end{align*}
Thus the key is to upper bound $\left( \hPs (\hVpn)(s,a)  - \sPs(\hVpn)(s,a)\right) $.
By the definition of $s$-rectangular set, we have
\begin{align*}
     & \sPs (\hVpn)(s,a) = \inf_{P_h \in \gP_h(s)} \sum_{s^\prime} P_h(s^\prime \mid s,a) \hVpn(s^\prime) \\
     \text{subject to } & \sum_{s^\prime, a^\prime} | P_h(s^\prime \mid s,a^\prime) - P_h^o(s^\prime \mid s,a^\prime)| \leq A\rho \,, \\
    &  \sum_{s^\prime} P_h(s^\prime \mid s,a^\prime) = 1 \,, \quad  \forall a^\prime \in \gA\\
    &  P_h(\cdot \mid s,a^\prime) \ll P_h^o(\cdot \mid s,a^\prime)\,, \quad  \forall a^\prime \in \gA\,.
\end{align*}

Defining $\Tilde{P}_h(s^\prime \mid s, a) = \frac{ P_h(s^\prime \mid s,a) }{ P_h^o(s^\prime \mid s,a)}$, we can rewrite the above optimization problem as 
\begin{align*}
     &\inf_{P_h \in \gP_h(s)} \sum_{s^\prime} \Tilde{P}_h(s^\prime \mid s,a) P_h^o(s^\prime \mid s,a) \hVpn(s^\prime) \\
     \text{subject to } & \sum_{s^\prime, a^\prime} \left| \Tilde{P}_h(s^\prime \mid s,a^\prime) - 1 \right| P_h^o(s^\prime \mid s,a^\prime) \leq A\rho \,, \\
    &  \sum_{s^\prime} \Tilde{P}_h(s^\prime \mid s,a^\prime)P_h^o(s^\prime \mid s,a^\prime) = 1 \,, \quad  \forall a^\prime \in \gA\\
    &  \Tilde{P}_h(\cdot \mid s,a^\prime) \geq 0\,, \quad  \forall a^\prime \in \gA\,.
\end{align*}
We can then obtain the Lagrangian $L(\Tilde{P}_h, \eta, \lambda)$ with $\eta = \{\eta_a\}_{a\in\gA}, \eta_a \in \mathbb{R}$, $\lambda \geq 0$.

\begin{align*}
    L(\Tilde{P}_h, \eta, \lambda) (s,a) 
    = \ & \sum_{s^\prime} \Tilde{P}_h(s^\prime \mid s,a) P_h^o(s^\prime \mid s,a) \hVpn(s^\prime) + \lambda \left( \sum_{s^\prime, a^\prime} \left| \Tilde{P}_h(s^\prime \mid s,a^\prime) - 1 \right| P_h^o(s^\prime \mid s,a^\prime) - A\rho \right) \\
    & \ - \sum_{a^\prime} \eta_{a^\prime} \left( \sum_{s^\prime} \Tilde{P}_h(s^\prime \mid s,a^\prime)P_h^o(s^\prime \mid s,a^\prime) - 1 \right) \,.
\end{align*}

Let $f(x) = |x - 1|$; then the convex conjugate of $f$ is $f^\ast(y) = \sup\limits_{x} ( \langle x, y\rangle - f(x) )$. 
Using $f^\ast$, we can rewrite the Lagrangian as 
\begin{align*}
    & L(\Tilde{P}_h, \eta, \lambda) (s,a) \\
    = \ & - \lambda A \rho + \sum_{a^\prime} \eta_{a^\prime} + \lambda \sum_{s^\prime, a^\prime} P_h^o(s^\prime \mid s,a^\prime) \left( f\left(\Tilde{P}_h(s^\prime \mid s,a^\prime) \right) - \Tilde{P}_h(s^\prime \mid s,a^\prime) \left(\frac{\eta_{a^\prime} - \mathbb{I}\{a^\prime = a\} V_{h+1}^{\pi_k}(s^\prime)}{\lambda} \right)\right) \,.
\end{align*}
Thus using the convex conjugate, we can optimize over $\Tilde{P}$ and obtain
\begin{align*}
    L(\eta, \lambda) (s,a) =  - \lambda A \rho +  \sum_{a^\prime} \eta_{a^\prime} - \lambda \sum_{s^\prime, a^\prime} P_h^o(s^\prime \mid s,a^\prime) f^\ast \left( \frac{\eta_{a^\prime} - \mathbb{I}\{a^\prime = a\} V_{h+1}^{\pi_k}(s^\prime)}{\lambda}\right) \,.
\end{align*}
Notice that, when the domain is restricted to $x \geq 0$, $f^\ast(y) = -1$ when $y \leq -1$, $f^\ast(y) = y$ when $y \in [-1, 1]$, and $f^\ast(y) = \infty$ when $y > 1$. Using these values of $f^\ast(y)$, we can rewrite the optimization problem as
\begin{align*}
    \sup_{\lambda \geq 0, \  (\eta_{a^\prime} - \mathbb{I}\{a^\prime = a\} V_{h+1}^{\pi_k}(s^\prime))_{+}\leq \lambda, \ \forall a^\prime, s^\prime}
    - \lambda A \rho + \sum_{a^\prime} \eta_{a^\prime} -  \sum_{s^\prime, a^\prime} P_h^o(s^\prime \mid s,a^\prime) \max\left\{\eta_{a^\prime} - \mathbb{I}\{a^\prime = a\} V_{h+1}^{\pi_k}(s^\prime), -\lambda\right\} \,.
\end{align*}

Let $\Tilde{\eta}_{a} = \eta_a + \lambda$, then using the identity $\max\{a, b\} = (a -b)_{+} + b$,  the problem is equivalent to
\begin{align*}
     \sup_{\lambda \geq 0, \  (\Tilde{\eta}_{a^\prime} - \mathbb{I}\{a^\prime = a\} V_{h+1}^{\pi_k}(s^\prime))_{+}\leq 2\lambda, \ \forall a^\prime,s^\prime} - \lambda A (\rho + 2) + \sum_{a^\prime} \Tilde{\eta}_{a^\prime} -  \sum_{s^\prime, a^\prime} P_h^o(s^\prime \mid s,a^\prime) \left(\Tilde{\eta}_{a^\prime} - \mathbb{I}\{a^\prime = a\} V_{h+1}^{\pi_k}(s^\prime) \right)_{+} \,.
\end{align*}

Optimizing over $\lambda$, we can reduce the problem to optimizing only with respect to an $A$-dimensional vector $\Tilde{\eta}$.
\begin{align*}
    \sup_{\Tilde{\eta}} \ \sum_{a^\prime} \Tilde{\eta}_{a^\prime} -  \sum_{s^\prime, a^\prime} P_h^o(s^\prime \mid s,a^\prime) \left(\Tilde{\eta}_{a^\prime} - \mathbb{I}\{a^\prime = a\} V_{h+1}^{\pi_k}(s^\prime) \right)_{+} - \max_{s^\prime, a^\prime}\frac{A (\rho + 2) (\Tilde{\eta}_{a^\prime} - \mathbb{I}\{a^\prime = a\} V_{h+1}^{\pi_k}(s^\prime))_{+}}{2} \,.
\end{align*}

Define $g(\Tilde{\eta}, P_h^o)$ as 
\begin{align*}
    g(\Tilde{\eta}, P_h^o) 
    = \ &  - \sum_{a^\prime} \Tilde{\eta}_{a^\prime} +  \sum_{s^\prime, a^\prime} P_h^o(s^\prime \mid s,a^\prime) \left(\Tilde{\eta}_{a^\prime} - \mathbb{I}\{a^\prime = a\} V_{h+1}^{\pi_k}(s^\prime) \right)_{+} + \max_{s^\prime, a^\prime} \frac{A (\rho + 2) (\Tilde{\eta}_{a^\prime} - \mathbb{I}\{a^\prime = a\} V_{h+1}^{\pi_k}(s^\prime))_{+}}{2} \,.
\end{align*}

First notice that $g$ is a convex function with respect to $\Tilde{\eta}$. Moreover, the infimum of $g$ can be achieved when $\Tilde{\eta}$ is a zero vector. 
Then, restricting each value in $\Tilde{\eta}$ to take values $\geq 0$, we have
\begin{align*}
    g(\Tilde{\eta}, P_h^o) 
    \geq \ & - \sum_{a^\prime} \Tilde{\eta}_{a^\prime} + \sum_{s^\prime, a^\prime} P_h^o(s^\prime \mid s,a^\prime) \left(\Tilde{\eta}_{a^\prime} - \mathbb{I}\{a^\prime = a\} V_{h+1}^{\pi_k}(s^\prime) \right) + \max_{s^\prime, a^\prime} \frac{A (\rho + 2) (\Tilde{\eta}_{a^\prime} - \mathbb{I}\{a^\prime = a\} V_{h+1}^{\pi_k}(s^\prime))_{+}}{2} \\
    \geq \ & \max_{s^\prime, a^\prime} \frac{A (\rho + 2) (\Tilde{\eta}_{a^\prime} - \mathbb{I}\{a^\prime = a\} V_{h+1}^{\pi_k}(s^\prime))_{+}}{2} - H \,.
\end{align*}
When $\Tilde{\eta}_{a^\prime} = 3 H + 2 \rho$, $\forall a^\prime \in \gA$, $g(\Tilde{\eta}, P_h^o) \geq  A (2 + \rho)(\rho + H) - H \geq 0$. Thus, it suffices to optimize within the interval $\Tilde{\eta}_{a^\prime} \in [0, 3 H + 2 \rho]$, $\forall a^\prime$. 

We can now rewrite $\hPs (\hVpn)(s,a)  - \sPs(\hVpn)(s,a)$ as 
\begin{align*}
     \hPs (\hVpn)(s,a)  - \sPs(\hVpn)(s,a)
     = \ & \inf_{\eta_{1, a^\prime} \in [0, 3 H + 2 \rho], \ \forall a^\prime} g(\eta_1, \hat{P}_h) - \inf_{\eta_{2, a^\prime} \in [0, 3 H + 2 \rho], \ \forall a^\prime} g(\eta_2, P_h^o) \\
     \leq \ & \sup_{\eta_{a^\prime} \in [0, 3 H + 2 \rho], \ \forall a^\prime} \left|g(\eta, \hat{P}_h) -  g(\eta, P_h^o) \right| \,.
     %= \ & \sup_{\eta_{a^\prime} \in [0, 3 H + 2 \rho], \ \forall a^\prime}  \left|g(\eta, \hat{P}_h^o) -  g(\eta, P_h^o) \right| \,.
\end{align*}

By H\"older's inequality, we have
\begin{align*}
    &  \left|g(\eta, \hat{P}_h) -  g(\eta, P_h^o) \right| \\
    = \ &  \left| \sum_{s^\prime, a^\prime} \hat{P}_h(s^\prime \mid s,a^\prime) \left(\eta_{a^\prime} - \mathbb{I}\{a^\prime = a\} V_{h+1}^{\pi_k}(s^\prime) \right)_{+} - \sum_{s^\prime, a^\prime} P_h^o(s^\prime \mid s,a^\prime) \left(\eta_{a^\prime} - \mathbb{I}\{a^\prime = a\} V_{h+1}^{\pi_k}(s^\prime) \right)_{+} \right| \\
    = \ & \left| \sum_{a^\prime} \sum_{s^\prime} \left( \hat{P}_h(s^\prime \mid s,a^\prime) - P_h^o(s^\prime \mid s,a^\prime) \right)\left(\eta_{a^\prime} - \mathbb{I}\{a^\prime = a\} V_{h+1}^{\pi_k}(s^\prime) \right)_{+} \right| \\
    \leq \ & \sum_{a^\prime} \left\| \hat{P}_h(\cdot \mid s,a^\prime) - P_h^o(\cdot \mid s,a^\prime) \right\|_1 \left\| \left(\eta_{a^\prime} - \mathbb{I}\{a^\prime = a\} V_{h+1}^{\pi_k}(s^\prime) \right)_{+} \right\|_\infty \,. 
\end{align*}

By Hoeffding's inequality and the fact that we optimize within the range $\eta_{a^\prime} \in [0, 3 H + 2 \rho]$, $\forall a^\prime$, the following inequality holds with probability at least $1 - \delta$,
\begin{align*}
    \sum_{a^\prime}\left\| \hat{P}_h(\cdot \mid s,a^\prime) - P_h^o(\cdot \mid s,a^\prime) \right\|_1  \left\| \left(\eta_{a^\prime} - \mathbb{I}\{a^\prime = a\} V_{h+1}^{\pi_k}(s^\prime) \right)_{+} \right\|_\infty 
    \leq \ & A(2\rho + 4H ) \sqrt{\frac{4S \log(SAH^2 K / \delta)}{N_h^k(s,a)}} \,.
\end{align*}

To bound $\sup_{\eta_{a^\prime} \in [0, 3 H + 2 \rho], \ \forall a^\prime} | g(\eta, \hat{P}_h)  - g(\eta, P_h^o) |$ with high probability, we first create an $\epsilon$-net $N_\epsilon(\eta)$ for $g$ over $\eta_{a^\prime} \in [0, 3 H + 2 \rho]$, $\forall a^\prime$, such that
\begin{align*}
    \sup_{\eta_{a^\prime} \in [0, 3 H + 2 \rho], \ \forall a^\prime} |  g(\eta, \hat{P}_h)  - g(\eta, P_h^o)| \leq \sup_{\eta\in N_\epsilon(\eta)} |  g(\eta, \hat{P}_h)  - g(\eta, P_h^o)| + 2 \epsilon \,.
\end{align*}
Taking a union bound over $N_\epsilon(\eta)$, we have 
\begin{align*}
    \sup_{\eta_{a^\prime} \in [0, 3 H + 2 \rho], \ \forall a^\prime} |  g(\eta, \hat{P}_h)  - g(\eta, P_h^o)| \leq A(2 \rho + 4H)\sqrt{\frac{4 S \log(3SAH^2K | N_\epsilon(\eta)| / \delta)}{N_h^k(s,a)}} + 2 \epsilon \,.
\end{align*}

With $\eta = \{\eta_a\}$, notice that $g$ is $(A ( 2 + \rho) + 2)$-Lipschitz. Thus we have $| N_\epsilon(\eta)| \leq \left( \frac{(A ( 2 + \rho) + 2)}{\epsilon}\right)^A$. {\color{blue} add reference here, I found it on Peter Bartlett's lecture slides.}
Thus we have
\begin{align*}
\hPs (\hVpn)(s,a)  - \sPs (\hVpn)(s,a) 
    \leq \ & \sup_{\eta_{a^\prime} \in [0, 3 H + 2 \rho], \ \forall a^\prime} |  g(\eta, \hat{P}_h)  - g(\eta, P_h^o)| \\
    \leq \ & A(2 \rho + 4H)\sqrt{\frac{4 S A\log\left(3SAH^2K  (A ( 2 + \rho) + 2)/ (\epsilon \delta)\right)}{N_h^k(s,a)}} + 2 \epsilon \,.
\end{align*}

Take $\epsilon = \frac{(A ( 2 + \rho) + 2) }{2\exp(1/A^2)\sqrt{K}}$, then 
\begin{align*}
    \hPs (\hVpn)(s,a)  - \sPs (\hVpn)(s,a)
    \leq \ & (2 \rho + 4H)\sqrt{\frac{4 S A\log(6SAH^2K^{3/2} / \delta)}{N_h^k(s,a)}} + 2 \frac{(A ( 2 + \rho) + 2)}{\exp(1/A^2)\sqrt{K}} \,.
\end{align*}
By Hoeffding's inequality, the following inequality also holds with probability at least $1 - \delta$.
\begin{align*}
    |r(s,a) - \hat{r}(s,a)| \leq \sqrt{\frac{2 \log(3SAH^2K/ \delta)}{N_h^k(s,a)}}  \,.
\end{align*}

Thus by the design of our bonus function, we have
\begin{align*}
    \sum^K_{k=1}(\hat{V}_1^{\pi_k} - V_1^{\pi_k})(s)
    \leq \ & 2\sum^K_{k=1}\sum^H_{h=1} \mathbb{E}_{\pi_k, \{q_h\}^H_{h=1}} \left[b_h^k(s,a) \right] \\
    = \ & \Tilde{O} \left( (\rho + H)\sqrt{SA}\right)\sum^K_{k=1}\sum^H_{h=1} \mathbb{E}_{\pi_k, \{q_h\}^H_{h=1}} \left[ \sqrt{\frac{1}{N_h^k(s,a)}}\right]  \,.
\end{align*}

By Lemma 7.5 of RL theory book {\color{blue} add reference here}, 
we have 
\begin{align*}
    \sum^K_{k=1}\sum^H_{h=1} \sqrt{\frac{1}{N_h^k(s,a)}} \leq 2 H \sqrt{SAK} \,.
\end{align*}
Combining everything, set $\delta = 1/H$, then with probability at least $1 - 2/ H$, we have
\begin{align*}
    \sum^K_{k=1}(\hat{V}_1^{\pi_k} - V_1^{\pi_k})(s) = \Tilde{O} \left( (\rho + H) SA H\sqrt{K}\right) \,.
\end{align*}