\begin{lemma}[Simulation Lemma \citep{zhang2022efficient}]
\label{lemma:simulation}
Given two transition models \(\mathcal{P}\) and \(\mathcal{P}'\), we have:
\[
    V_{\mathcal{P}', r + b, 1}^{\pi, d_{1}} - V_{\mathcal{P}, r, 1}^{\pi, d_{1}} = \sum_{h=1}^{H}\mathbb{E}_{(s,a)\sim d_{\mathcal{P}',h}^{\pi}}[b_{h}(s,a) + (\mathcal{P}_{h}' -\mathcal{P}_{h})V_{\mathcal{P}, r, h+1}^{\pi}(s,a)],
\]
\[
    V_{\mathcal{P}', r + b, 1}^{\pi, d_{1}} - V_{\mathcal{P}, r, 1}^{\pi, d_{1}} = \sum_{h=1}^{H}\mathbb{E}_{(s,a)\sim d_{\mathcal{P},h}^{\pi}}[b_{h}(s,a) + (\mathcal{P}_{h}' -\mathcal{P}_{h})V_{\mathcal{P}', r+b, h+1}^{\pi}(s,a)].
\]
\end{lemma}

\begin{lemma}[\citep{he2021logarithmic}]
\label{lemma:regret_gap_equivalence}
For any \(h\in[H]\), \(s\in\mathcal{S}\), and \(\pi\in\Pi\):
\[
    V_{\mathcal{P}^{\star},r^{\star}, h}^{\pi^{\star}}(s) - V_{\mathcal{P}^{\star},r^{\star}, h}^{\pi}(s) = \mathbb{E}[\sum_{h'=h}^{H}\Delta_{h'}(s_{h'},a_{h'}) \mid s_{h} = s, \pi, \mathcal{P}^{\star}].
\]
Hence the regret after $T$ episodes can be expressed as:
\begin{align*}
    \mathcal{R}(T) &= \sum_{t=1}^{T} \Bigl(V_{\mathcal{P}^{\star}, r^{\star},1}^{\pi^{\star}, d_{1}} - V_{\mathcal{P}^{\star},r^{\star},1}^{\pi_{t}, d_{1}}\Bigr) = \sum_{t=1}^{T} \mathbb{E}_{s\sim d_{1}}\Bigl[\sum_{h=1}^{H}\Delta_{h}(s_{h},a_{h}) \Bigm| s_{1}=s, \pi_{t}, \mathcal{P}^{\star}\Bigr] \\
    &= \sum_{t=1}^{T} \sum_{h=1}^{H}\mathbb{E}_{(s,a)\sim d_{\mathcal{P}^{\star},h}^{\pi_{t}}}[\Delta_{h}(s,a)].
\end{align*}    
\end{lemma}
\begin{proof}
\begin{align*}
    &V_{\mathcal{P}^{\star},r^{\star}, h}^{\pi^{\star}}(s) - V_{\mathcal{P}^{\star}, r^{\star},h}^{\pi}(s) \\
    &= \Delta_{h}(s, \pi_{h}(s)) + Q_{\mathcal{P}^{\star},r^{\star},h}^{\pi^{\star}}(s, \pi_{h}(s)) - V_{\mathcal{P}^{\star},r^{\star},h}^{\pi}(s) \\
    &= \Delta_{h}(s, \pi_{h}(s)) + r_{h}^{\star}(s, \pi_{h}(s)) + \mathcal{P}_{h}^{\star}V_{\mathcal{P}^{\star}, r^{\star}, h+1}^{\pi^{\star}}(s, \pi_{h}(s)) - r_{h}^{\star}(s, \pi_{h}(s)) - \mathcal{P}_{h}^{\star}V_{\mathcal{P}^{\star}, r^{\star}, h+1}^{\pi}(s, \pi_{h}(s)) \\
    &= \Delta_{h}(s, \pi_{h}(s)) + \mathcal{P}_{h}^{\star}\bigl(V_{\mathcal{P}^{\star}, r^{\star}, h+1}^{\pi^{\star}} - V_{\mathcal{P}^{\star}, r^{\star}, h+1}^{\pi}\bigr)(s, \pi_{h}(s)).
\end{align*}
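Unrolling this recursion from step \(h\) to step \(H\) along the trajectory induced by \(\pi\) under \(\mathcal{P}^{\star}\), and using that both value functions are zero at step \(H+1\), gives the result.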
\end{proof}

\begin{theorem}[\citep{piziak1999full}]
\label{thm:low_rank_factorization}
    Every matrix \(A\in\mathbb{C}^{n\times m}\) with \(\textnormal{rank}(A)=r>0\) has infinitely many full rank factorizations. However, if \(A=FG=\bar{F}\bar{G}\) are two full rank factorizations of \(A\), then there exists an invertible matrix \(R\in\mathbb{C}^{r\times r}\) such that \(\bar{F}=FR\) and \(\bar{G}=R^{-1}G\).
\end{theorem}

\begin{lemma}[Lemma~D.1 in \citet{jin2020provably}]
\label{lemma:bounded_squared_uncertainty}
    Let \(\Sigma_{t} = \lambda I + \sum_{i=1}^{t}\phi_{i}\phi_{i}^{\top}\) where \(\phi_{i}\in\mathbb{R}^{d}\) and \(\lambda > 0\). Then,
    \[
        \sum_{i=1}^{t}\phi_{i}^{\top}\Sigma_{t}^{-1}\phi_{i} = \textnormal{Tr}\Bigl(\Sigma_{t}^{-1}\sum_{i=1}^{t}\phi_{i}\phi_{i}^{\top}\Bigr) \leq d.
    \]
\end{lemma}

\begin{lemma}[Elliptical potential lemma \citep{abbasi2011improved}]
\label{lemma:elliptical_potential}
    Consider a sequence of \(d\times d\) positive semidefinite matrices \(X_{1}, \dots, X_{T}\) with \(\textnormal{Tr}(X_{t})\leq 1\) for all \(t\in[T]\). Define \(M_{0}=\lambda_{0}I\) and \(M_{t}=M_{t-1} + X_{t}\). Then
\[
\sum_{t=1}^{T}\textnormal{Tr}(X_{t}M_{t-1}^{-1}) \leq 2d\log\Bigl(1 + \frac{T}{d\lambda_{0}}\Bigr).
\]
\end{lemma}


\begin{prop}[Matrix Azuma \citep{tropp2012user}]
\label{prop:matrix_azuma}
    Let \(\{X_{k}\}_{k=1}^{t}\) be a finite adapted sequence of symmetric matrices of dimension \(d\), and \(\{C_{k}\}_{k=1}^{t}\) a sequence of symmetric matrices such that for all \(k\), \(\mathbb{E}_{k}[X_{k}] = 0\) and \(X_{k}^{2}\preccurlyeq C_{k}^{2}\) almost surely. Then, with probability at least \(1-\delta\):
    \[
    \lambda_{\max}\Bigl(\sum_{k=1}^{t}X_{k}\Bigr) \leq \sqrt{8\sigma^{2}\log(d/\delta)},
    \]
    where \(\sigma^{2}=\Vert\sum_{k=1}^{t}C_{k}^{2}\Vert\).
\end{prop}

\begin{lemma}[Azuma's inequality]
\label{lemma:azuma}
Let \((X_{k})_{k=1}^{t}\) be a finite adapted sequence such that for all \(k\), \(\mathbb{E}_{k}[X_{k}]=0\) and \(|X_{k}|\leq a\) almost surely. Then, with probability at least \(1-\delta\):
\[
    \Bigl|\sum_{k=1}^{t}X_{k}\Bigr| \leq a\sqrt{2t\log(2/\delta)}.
\]
    
\end{lemma}

\begin{lemma}[MLE guarantee \citep{cheng2023improved}]
\label{lemma:MLE}
Fix \(\delta\in(0,1)\). Then, with probability at least \(1 - \delta/2\),

$(1)$ for all \(h=2, \dots, H\) and \(t\in\mathbb{N}\),
\[
    \mathbb{E}_{(s,a)\sim \rho_{t,h}'(s,a)}[\Vert\hat{\mathcal{P}}_{h, t}(\cdot|s, a) - \mathcal{P}_{h}^{\star}(\cdot|s,a)\Vert_{\textnormal{TV}}^{2}] \leq \zeta_{t},
\]

$(2)$ for \(h=1\) and all \(t\in\mathbb{N}\),
\[
    \mathbb{E}_{(s,a)\sim\rho_{t,h}(s,a)}[\Vert\hat{\mathcal{P}}_{h, t}(\cdot|s, a) - \mathcal{P}_{h}^{\star}(\cdot|s,a)\Vert_{\textnormal{TV}}^{2}] \leq \zeta_{t},
\]
where \(\zeta_{t}=\frac{2\log(4t|\Phi||\Psi|H/\delta)}{t}\).
    
\end{lemma}