\begin{proofsketch}
We start by decomposing the regret as follows:
    \begin{align*}
    \text{Regret}(K) 
    = \ & \sum^K_{k=1} \left( V_1^{\ast}(s) - V_1^{\pi_k}(s) \right) \\
    = \ & \sum^K_{k=1}\left(\hat{V}_1^{\pi_k} (s) - V_1^{\pi_k}(s)\right) + \sum^K_{k=1} \left(V_1^{\ast} (s)- \hat{V}_1^{\pi_k}(s) \right)\,.
    \end{align*}
    
    For the first term, for any $h \in [H]$, we can further decompose it as 
    \begin{align*}
    (\hVp - \Vp )(s)
    %= \ & \langle \hQp (s, \cdot) - \Qp (s, \cdot) , \pi_k(\cdot \mid s) \rangle \\
    = \ & \left\langle \hat{r}(s, \cdot) - r(s, \cdot)  + ( \hPsa(\hVpn)(s,\cdot) - \sPsa (\Vpn)(s,\cdot) ) + b_h^k(s,\cdot),  \pi_k(\cdot \mid s) \right\rangle  \,.
\end{align*}
    Following the principle of optimism, we would like the bonus function $b_h^k(s,a)$ to ``cover'' the estimation errors of the reward and transition functions. That is, $r(s, a) - \hat{r}(s,a)$ and $\sPsa(\hVpn)(s,a) -  \hPsa(\hVpn)(s,a)$ should both be bounded by our choice of bonus function. We now show that this is indeed the case under our designed bonus function.
    
    The estimation error of the rewards can be upper bounded in a straightforward manner by Hoeffding's inequality and a union bound argument. In contrast, the estimation error $\sPsa(\hVpn)(s,a) -  \hPsa(\hVpn)(s,a)$ cannot be controlled by a direct application of concentration inequalities due to the presence of the $\sigma$ operator. Therefore, we resort to a primal-dual argument to first simplify the problem into a $1$-dimensional convex optimization problem, where the estimation error can be better quantified. Similar primal-dual techniques are also used in previous works on robust dynamic programming and robust RL~\cite{iyengar2005robust,panaganti2022sample,yang2021towards}. (\dong{this part is very bad})
    
%    First note that in the case of $(s,a)$-rectangular set, we have
%\begin{align*}
%    & \sPsa (\hVpn)(s,a) 
%    = \  \inf_{P_h \in \gP_h(s,a)} \sum_{s^\prime} P_h(s^\prime \mid s,a) %\hVpn(s^\prime) \\
%     \text{subject to } & \sum_{s^\prime} | P_h(s^\prime \mid s,a) - P_h^o(s^\prime \mid s,a)| \leq \rho \,, \\
%    &  \sum_{s^\prime} P_h(s^\prime \mid s,a) = 1 \,, \\
%    &  P_h(\cdot \mid s,a) << P_h^o(\cdot \mid s,a)\,.
%\end{align*}
Through a change of variable with $\Tilde{P}_h(s^\prime \mid s, a) = \frac{ P_h(s^\prime \mid s,a) }{ P_h^o(s^\prime \mid s,a)}$, we can write the Lagrangian form of $\sPsa (\hVpn)(s,a)$ as
\begin{align*}
    L(\Tilde{P}_h, \eta, \lambda) (s,a) 
    = \ & \sum_{s^\prime} \Tilde{P}_h(s^\prime \mid s,a) P_h^o(s^\prime \mid s,a) \hVpn(s^\prime) + \lambda \left( \sum_{s^\prime} | \Tilde{P}_h(s^\prime \mid s,a)  - 1| P_h^o(s^\prime \mid s,a)  - \rho \right) \\
    & \ - \eta \left(\sum_{s^\prime} \Tilde{P}_h(s^\prime \mid s,a) P_h^o(s^\prime \mid s,a) - 1 \right) \,,
\end{align*}
where $\eta, \lambda$ are both Lagrangian multipliers. 
Under the characterization of the $\ell_1$ distance, we can use the convex conjugate of $f(x) = |x - 1|$ to optimize out $\Tilde{P}_h$, resulting in
\begin{align}\label{eq:reduced_sa}
    \sup_{\lambda \geq 0,  \eta - \min\limits_{s^\prime} \hVpn(s^\prime) \leq 2 \lambda} \eta - \rho \lambda - \sum_{s^\prime}P_h^o(s^\prime \mid s,a) \left( \eta - \hVpn(s^\prime)\right)_{+} \,.
\end{align}
Notice that $\sPsa(\hVpn)(s,a) -  \hPsa(\hVpn)(s,a)$ is now the difference between the optimal values of Equation~\eqref{eq:reduced_sa} under $P_h^o$ and under $\hat{P}_h^{o,k}$. Moreover, replacing $P_h^o$ by its estimate only changes the value of the term $\sum_{s^\prime}P_h^o(s^\prime \mid s,a) \left( \eta - \hVpn(s^\prime)\right)_{+}$, and this change can be controlled with Hoeffding's inequality provided that $\eta$ is bounded. Lastly, we investigate the range of possible optimal values of $\eta$ and use an $\epsilon$-net argument to account for the supremum.

By the optimism achieved by our bonus function, we have
\begin{align*}
     \sum^K_{k=1}(\hat{V}_1^{\pi_k} - V_1^{\pi_k})(s) 
    \leq \ & \sum^K_{k=1}\sum^H_{h=1} \mathbb{E}_{\pi_k, \{q_h\}^H_{h=1}} \left[ 2 b_h^k(s,a)\right] \\
    \leq \ & O \left(H^2 S\sqrt{AK \log \left( SAH^2 K^{3/2} ( 1 + \rho) / \delta \right)} \right)\,.
\end{align*}


We then consider the second term. For any $h \in [H]$, by the Bellman equation and our update rule, it can be decomposed as 
\begin{align*}
    & V_h^{\ast}(s) - \hat{V}_h^{\pi_k}(s) \\
    = \ & \mathbb{E}_{\pi_\ast} \left[ (r(s, a) - \hat{r}(s,a)) +  (\sPsa(\hVpn)(s,a) -  \hPsa(\hVpn)(s,a)) - b_h^k(s,a)\right] \\
    & \ + \mathbb{E}_{\pi_\ast} \left[  \sPsa (V_{h+1}^\ast)(s,a) - \sPsa(\hVpn)(s,a)\right] + \langle \hat{Q}_h^{\pi_k}(s, \cdot) , \pi_\ast(\cdot \mid s) - \pi_k(\cdot \mid s)  \rangle \,.
\end{align*}

By the design of our bonus function and recursion, we can obtain
\begin{align*}
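    \sum^K_{k=1} \left( V_1^{\ast}(s) - \hat{V}_1^{\pi_k}(s) \right)
    % NOTE(review): the per-step decomposition of V_h^* - \hat{V}_h^{\pi_k} takes expectations under \pi_\ast; confirm whether the expectation below should be over \pi_\ast rather than \pi_k.
    \le \sum^K_{k=1} \sum^H_{h=1} \mathbb{E}_{\pi_k, \{q_h\}^H_{h=1}} \left[ \langle \hat{Q}_h^{\pi_k}(s, \cdot) , \pi_\ast(\cdot \mid s) - \pi_k(\cdot \mid s)  \rangle \right] \leq \Tilde{O} \left(H^2 \sqrt{K}\right) \,,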
\end{align*}
where $q_h = \arg\max_{p_h \in \gP_h} p_h(\cdot \mid s,a) (V_{h+1}^\ast - \hVpn)$ and the last inequality follows from standard results for online mirror descent.

Combining the two results yields the final regret upper bound.
\end{proofsketch}