\subsection{Difference Lemma}
\begin{lemma}[Occupancy Measure Difference, Lemma D.3.1 of \cite{jin2021best}] \label{lemma: general occupancy measure difference}
For any transition functions $\transeasy_1,\transeasy_2$ and any policy $\policy$,
\begin{align}
    \occmeasure^{\transeasy_1, \policy}(\state) - \occmeasure^{\transeasy_2, \policy}(\state) &= \sum_{\horizon=0}^{\horizon(\state)-1} \sum_{\uvw\in\sasspace_\horizon} \occmeasure^{\transeasy_1, \policy}(u,v) \rbr{\transeasy_1(w\vert u,v) - \transeasy_2(w\vert u,v)} \occmeasure^{\transeasy_2, \policy}(\state\vert w) \notag \\
    &= \sum_{\horizon=0}^{\horizon(\state)-1} \sum_{\uvw\in\sasspace_\horizon} \occmeasure^{\transeasy_2, \policy}(u,v) \rbr{\transeasy_1(w\vert u,v) - \transeasy_2(w\vert u,v)} \occmeasure^{\transeasy_1, \policy}(\state\vert w), \notag
\end{align}
where $\occmeasure^{\transeasy', \policy}(\state\vert w)$ is the probability of visiting $\state$ starting from $w$ under policy $\policy$ and transition $\transeasy'$.
\end{lemma}

\begin{lemma}[Lemma C.5 of \cite{dann23b}]
For any policies $\policy_1,\policy_2$ and any transition function $\transeasy$,
\begin{align}\notag
    \sum_{\state\neq\state_\horizontotal}\sum_{\action\in\actionspace} \abr{\occmeasure^{\transeasy, \policy_1}(\state,\action) - \occmeasure^{\transeasy, \policy_2}(\state,\action)} \leq \horizontotal \sum_{\state\neq\state_\horizontotal}\sum_{\action\in\actionspace} \occmeasure^{\transeasy, \policy_1}(\state) \abr{\policy_1(\action\vert\state) - \policy_2(\action\vert\state)}.
\end{align}
\end{lemma}

Following the same idea as in the proofs of Lemma C.4 in \cite{dann23b} and Lemma D.3.8 in \cite{jin2023noregret}, we establish a tight bound on the difference between occupancy measures in the following lemma.
\begin{lemma}\label{lemma: occupancy measure error in transition confidence set}
Suppose the event in Lemma \ref{lemma: concentration of private transition error} holds. 
For any state $\state\neq\state_\horizontotal$, episode $\episode$, transition $\transeasy' \in \transspace_{\episode}$, and policy $\policy_\episode$, we have
\begin{align}
    \abr{\occmeasure^{\transeasy', \policy_\episode}(\state) - \occmeasure^{\transeasy, \policy_\episode}(\state)} \leq &  \cO\rbr{\sum_{\horizon=0}^{\horizon(\state)-1} \sum_{\uvw\in\sasspace_\horizon} \occmeasure^{\transeasy, \policy_\episode}(u,v) \cdot \sqrt{\frac{\transeasy\rbr{w\vert u,v}\rbr{1-\transeasy\rbr{w\vert u,v}}\ln\iota}{\visitxatotalhateasy_\episode\rbr{u,v}}} \cdot \occmeasure^{\transeasy, \policy_\episode}(\state\vert w)}  \notag\\ 
    &+ \cO\rbr{\statesize^2 \sum_{u\neq\state_\horizontotal}\sum_{v\in\actionspace} \occmeasure^{\transeasy, \policy_\episode}(u,v) \cdot \frac{\confcountxa + \ln\iota}{\visitxatotalhateasy_\episode\rbr{u,v}}}. \notag
\end{align}
\end{lemma}
\begin{proof}
% For any valid transition $\transeasy$ and $\sas\in\sasspace$, we denote $\transeasy_{\sas}:= \transeasy\rbr{\state'\vert\state,\action}$ for simplication.
According to Lemma \ref{lemma: general occupancy measure difference}, we have
\begin{align}
\Big\vert & \occmeasure^{\transeasy', \policy_\episode}(\state) - \occmeasure^{\transeasy, \policy_\episode}(\state)\Big\vert 
\leq \sum_{\horizon=0}^{\horizon(\state)-1} \sum_{\uvw\in\sasspace_\horizon} \occmeasure^{\transeasy, \policy_\episode}(u,v) \abr{\transeasy'(w\vert u,v) - \transeasy(w\vert u,v)} \occmeasure^{\transeasy', \policy_\episode}(\state\vert w)  \notag\\
= & \sum_{\horizon=0}^{\horizon(\state)-1} \sum_{\uvw\in\sasspace_\horizon} \occmeasure^{\transeasy, \policy_\episode}(u,v) \abr{\transeasy'(w\vert u,v) - \transeasy(w\vert u,v)} \occmeasure^{\transeasy, \policy_\episode}(\state\vert w)  \notag\\
& + \sum_{\horizon=0}^{\horizon(\state)-1} \sum_{\uvw\in\sasspace_\horizon} \occmeasure^{\transeasy, \policy_\episode}(u,v) \abr{\transeasy'(w\vert u,v) - \transeasy(w\vert u,v)} \rbr{ \occmeasure^{\transeasy', \policy_\episode}(\state\vert w) - \occmeasure^{\transeasy, \policy_\episode}(\state\vert w)}  \notag \\
\leq & \underbrace{\sum_{\horizon=0}^{\horizon(\state)-1} \sum_{\uvw\in\sasspace_\horizon} \occmeasure^{\transeasy, \policy_\episode}(u,v) \abr{\transeasy'(w\vert u,v) - \transeasy(w\vert u,v)} \occmeasure^{\transeasy, \policy_\episode}(\state\vert w)}_{\text{\textsc{Term} (A)}}  \notag\\
& + \underbrace{\sum_{\horizon=0}^{\horizon(\state)-1} \sum_{\uvw\in\sasspace_\horizon} \occmeasure^{\transeasy, \policy_\episode}(u,v) \abr{\transeasy'(w\vert u,v) - \transeasy(w\vert u,v)} \sum_{\horizon'=\horizon+1}^{\horizon(\state)-1} \sum_{\mno\in\sasspace_{\horizon'}} \occmeasure^{\transeasy, \policy_\episode}(m,n\vert w) \abr{\transeasy'(o\vert m,n) - \transeasy(o\vert m,n)}}_{\text{\textsc{Term} (B)}} \notag\\
\leq & \cO\rbr{\sum_{\horizon=0}^{\horizon(\state)-1} \sum_{\uvw\in\sasspace_\horizon} \occmeasure^{\transeasy, \policy_\episode}(u,v) \cdot \sqrt{\frac{\transeasy\rbr{w\vert u,v}\rbr{1-\transeasy\rbr{w\vert u,v}}\ln\iota}{\visitxatotalhateasy_\episode\rbr{u,v}}} \cdot \occmeasure^{\transeasy, \policy_\episode}(\state\vert w)} \tag{\textsc{Term} (A.1)}\\
& + \cO\rbr{\sum_{\horizon=0}^{\horizon(\state)-1} \sum_{\uvw\in\sasspace_\horizon} \occmeasure^{\transeasy, \policy_\episode}(u,v) \cdot \frac{\confcountxa + \ln\iota}{\visitxatotalhateasy_\episode\rbr{u,v}} \cdot \occmeasure^{\transeasy, \policy_\episode}(\state\vert w)} \tag{\textsc{Term} (A.2)}\\
& \tag{\textsc{Term} (B.1)} 
\begin{aligned} 
    + \cO\Biggl(\sum_{\horizon=0}^{\horizon(\state)-1} \sum_{\uvw\in\sasspace_\horizon} &\occmeasure^{\transeasy, \policy_\episode}(u,v) \cdot \sqrt{\frac{\transeasy\rbr{w\vert u,v}\rbr{1-\transeasy\rbr{w\vert u,v}}\ln\iota}{\visitxatotalhateasy_\episode\rbr{u,v}}}  \\
    & \cdot \sum_{\horizon'=\horizon+1}^{\horizon(\state)-1} \sum_{\mno\in\sasspace_{\horizon'}} \occmeasure^{\transeasy, \policy_\episode}(m,n\vert w) \sqrt{\frac{\transeasy\rbr{o\vert m,n}\rbr{1-\transeasy\rbr{o\vert m,n}}\ln\iota}{\visitxatotalhateasy_\episode\rbr{m,n}}}\Biggr) 
\end{aligned}\\
& \tag{\textsc{Term} (B.2)} 
\begin{aligned}
    + \cO\Biggl(\sum_{\horizon=0}^{\horizon(\state)-1} \sum_{\uvw\in\sasspace_\horizon} & \occmeasure^{\transeasy, \policy_\episode}(u,v) \cdot \sqrt{\frac{\transeasy\rbr{w\vert u,v}\rbr{1-\transeasy\rbr{w\vert u,v}}\ln\iota}{\visitxatotalhateasy_\episode\rbr{u,v}}} \\
    & \cdot \sum_{\horizon'=\horizon+1}^{\horizon(\state)-1} \sum_{\mno\in\sasspace_{\horizon'}} \occmeasure^{\transeasy, \policy_\episode}(m,n\vert w) \cdot \frac{\confcountxa + \ln\iota}{\visitxatotalhateasy_\episode\rbr{m,n}}\Biggr)
\end{aligned} \\
& + \cO\rbr{\sum_{\horizon=0}^{\horizon(\state)-1} \sum_{\uvw\in\sasspace_\horizon} \occmeasure^{\transeasy, \policy_\episode}(u,v) \cdot \frac{\confcountxa + \ln\iota}{\visitxatotalhateasy_\episode\rbr{u,v}} \cdot \sum_{\horizon'=\horizon+1}^{\horizon(\state)-1} \sum_{\mno\in\sasspace_{\horizon'}} \occmeasure^{\transeasy, \policy_\episode}(m,n\vert w)}, \tag{\textsc{Term} (B.3)} 
\end{align}
where the second step first adds and subtracts $\occmeasure^{\transeasy,\policy_\episode}\rbr{\state\vert w}$ and then applies Lemma \ref{lemma: general occupancy measure difference} again to $\abr{ \occmeasure^{\transeasy', \policy_\episode}(\state\vert w) - \occmeasure^{\transeasy, \policy_\episode}(\state\vert w)}$ to obtain \textsc{Term} (A) and \textsc{Term} (B).
Following Lemma \ref{lemma: element-wise error in confidence set}, we can decompose them into five terms as \textsc{Term} (A.1), (A.2), and \textsc{Term} (B.1), (B.2), (B.3).
Then, we bound these terms separately.

Clearly, we can bound \textsc{Term} (A.2) by letting $x$ be $x_\horizontotal$, 
$$\sum_{\horizon=0}^{\horizon(\state)-1} \sum_{\uvw\in\sasspace_\horizon} \occmeasure^{\transeasy, \policy_\episode}(u,v) \cdot \frac{\confcountxa + \ln\iota}{\visitxatotalhateasy_\episode\rbr{u,v}} \cdot \occmeasure^{\transeasy, \policy_\episode}(\state\vert w) \leq \statesize \sum_{u\neq\state_\horizontotal}\sum_{v\in\actionspace} \occmeasure^{\transeasy, \policy_\episode}(u,v) \cdot \frac{\confcountxa + \ln\iota}{\visitxatotalhateasy_\episode\rbr{u,v}}. $$

For \textsc{Term} (B.1), we have 
\begin{align}
&\begin{aligned} \notag
    \text{\textsc{Term} (B.1)} = \sum_{\horizon=0}^{\horizon(\state)-1} \sum_{\uvw\in\sasspace_\horizon} \sum_{\horizon'=\horizon+1}^{\horizon(\state)-1} \sum_{\mno\in\sasspace_{\horizon'}} & \occmeasure^{\transeasy, \policy_\episode}(u,v) \cdot \sqrt{\frac{\transeasy\rbr{w\vert u,v}\rbr{1-\transeasy\rbr{w\vert u,v}}\ln\iota}{\visitxatotalhateasy_\episode\rbr{u,v}}}  \\
    & \cdot \occmeasure^{\transeasy, \policy_\episode}(m,n\vert w) \sqrt{\frac{\transeasy\rbr{o\vert m,n}\rbr{1-\transeasy\rbr{o\vert m,n}}\ln\iota}{\visitxatotalhateasy_\episode\rbr{m,n}}} 
\end{aligned}\\
\leq & \sum_{\horizon=0}^{\horizon(\state)-1} \sum_{\uvw\in\sasspace_\horizon} \sum_{\horizon'=\horizon+1}^{\horizon(\state)-1} \sum_{\mno\in\sasspace_{\horizon'}} \occmeasure^{\transeasy, \policy_\episode}(u,v) \cdot \frac{\transeasy\rbr{w\vert u,v} \rbr{1-\transeasy\rbr{o\vert m,n}} \ln\iota}{\visitxatotalhateasy_\episode\rbr{m,n}} \cdot \occmeasure^{\transeasy, \policy_\episode}(m,n\vert w) \notag\\
& + \sum_{\horizon=0}^{\horizon(\state)-1} \sum_{\uvw\in\sasspace_\horizon} \sum_{\horizon'=\horizon+1}^{\horizon(\state)-1} \sum_{\mno\in\sasspace_{\horizon'}} \occmeasure^{\transeasy, \policy_\episode}(u,v) \cdot \occmeasure^{\transeasy, \policy_\episode}(m,n\vert w) \cdot\frac{\transeasy\rbr{o\vert m,n}\rbr{1-\transeasy\rbr{w\vert u,v}}\ln\iota}{\visitxatotalhateasy_\episode\rbr{u,v}} \notag\\
=& \sum_{\horizon=0}^{\horizon(\state)-1} \sum_{\mno\in\sasspace_{\horizon}} \frac{\rbr{1-\transeasy\rbr{o\vert m,n}}\ln\iota}{\visitxatotalhateasy_\episode\rbr{m,n}} \rbr{\sum_{\horizon'=0}^{\horizon(\state)-1} \sum_{\uvw\in\sasspace_{\horizon'}} \occmeasure^{\transeasy, \policy_\episode}(u,v) \cdot \transeasy\rbr{w\vert u,v} \cdot \occmeasure^{\transeasy, \policy_\episode}(m,n\vert w)} \notag\\
&+ \sum_{\horizon=0}^{\horizon(\state)-1} \sum_{\uvw\in\sasspace_\horizon} \frac{\occmeasure^{\transeasy, \policy_\episode}(u,v) \rbr{1-\transeasy\rbr{w\vert u,v}} \ln\iota}{\visitxatotalhateasy_\episode\rbr{u,v}} \rbr{\sum_{\horizon'=\horizon+1}^{\horizon(\state)-1} \sum_{\mno\in\sasspace_{\horizon'}} \occmeasure^{\transeasy, \policy_\episode}(m,n\vert w)\cdot \transeasy\rbr{o\vert m,n}} \notag \\
\leq& \sum_{\horizon=0}^{\horizon(\state)-1} \sum_{\mno\in\sasspace_{\horizon}} \frac{\horizontotal\occmeasure^{\transeasy, \policy_\episode}(m,n)\ln\iota}{\visitxatotalhateasy_\episode\rbr{m,n}} +  \sum_{\horizon=0}^{\horizon(\state)-1} \sum_{\uvw\in\sasspace_{\horizon}} \frac{\horizontotal\occmeasure^{\transeasy, \policy_\episode}(u,v)\ln\iota}{\visitxatotalhateasy_\episode\rbr{u,v}} \notag\\
\leq& \cO\rbr{\horizontotal\statesize\sum_{\horizon=0}^{\horizontotal-1} \sum_{\state\in\statespace_\horizon}\sum_{\action\in\actionspace}\frac{\occmeasure^{\transeasy, \policy_\episode}(\state,\action)\ln\iota}{\visitxatotalhateasy_\episode\rbr{\state,\action}}}, \notag
\end{align}
where the second step applies $\sqrt{xy}\leq x+y$ for any $x,y\geq0$; the third step rearranges the summation order; and the fourth step follows from the facts that $\sum_{\uvw\in\sasspace_{\horizon'}} \occmeasure^{\transeasy, \policy_\episode}(u,v) \cdot \transeasy\rbr{w\vert u,v} \cdot \occmeasure^{\transeasy, \policy_\episode}(m,n\vert w) = \occmeasure^{\transeasy, \policy_\episode}(m,n)$ and $\sum_{m\in\statespace_{\horizon'}}\sum_{n\in\actionspace} \occmeasure^{\transeasy, \policy_\episode}(m,n\vert w)\cdot \transeasy\rbr{o\vert m,n} = \occmeasure^{\transeasy,\policy_\episode}\rbr{o\vert w}$.

Similarly, we have \textsc{Term} (B.2) bounded as 
\begin{align}
&\begin{aligned} \notag
    \text{\textsc{Term} (B.2)} = \sum_{\horizon=0}^{\horizon(\state)-1} \sum_{\uvw\in\sasspace_\horizon} \sum_{\horizon'=\horizon+1}^{\horizon(\state)-1} \sum_{\mno\in\sasspace_{\horizon'}} & \occmeasure^{\transeasy, \policy_\episode}(u,v) \cdot \sqrt{\frac{\transeasy\rbr{w\vert u,v}\rbr{1-\transeasy\rbr{w\vert u,v}}\ln\iota}{\visitxatotalhateasy_\episode\rbr{u,v}}}  \\
    & \cdot \occmeasure^{\transeasy, \policy_\episode}(m,n\vert w) \cdot \frac{\confcountxa + \ln\iota}{\visitxatotalhateasy_\episode\rbr{m,n}}
\end{aligned}\\
\leq & \sum_{\horizon=0}^{\horizon(\state)-1} \sum_{\uvw\in\sasspace_\horizon}  \sum_{\horizon'=\horizon+1}^{\horizon(\state)-1} \sum_{\mno\in\sasspace_{\horizon'}} \occmeasure^{\transeasy, \policy_\episode}(u,v) \cdot \transeasy\rbr{w\vert u,v} \cdot \occmeasure^{\transeasy, \policy_\episode}(m,n\vert w) \cdot \frac{\confcountxa + \ln\iota}{\visitxatotalhateasy_\episode\rbr{m,n}} \notag \\
& + \sum_{\horizon=0}^{\horizon(\state)-1} \sum_{\uvw\in\sasspace_\horizon}  \sum_{\horizon'=\horizon+1}^{\horizon(\state)-1} \sum_{\mno\in\sasspace_{\horizon'}} \occmeasure^{\transeasy, \policy_\episode}(u,v) \cdot \frac{\rbr{1-\transeasy\rbr{w\vert u,v}}\ln\iota}{\visitxatotalhateasy_\episode\rbr{u,v}} \cdot \occmeasure^{\transeasy, \policy_\episode}(m,n\vert w) \notag \\
\leq & \sum_{\horizon'=0}^{\horizontotal-1} \sum_{\mno\in\sasspace_{\horizon'}} \frac{\confcountxa + \ln\iota}{\visitxatotalhateasy_\episode\rbr{m,n}} \rbr{\sum_{\horizon=0}^{\horizon(\state)-1} \sum_{\uvw\in\sasspace_\horizon} \occmeasure^{\transeasy, \policy_\episode}(u,v) \cdot \transeasy\rbr{w\vert u,v} \cdot \occmeasure^{\transeasy, \policy_\episode}(m,n\vert w)} \notag \\
& + \sum_{\horizon=0}^{\horizon(\state)-1} \sum_{\uvw\in\sasspace_\horizon} \frac{\occmeasure^{\transeasy, \policy_\episode}(u,v)\rbr{1-\transeasy\rbr{w\vert u,v}}\ln\iota}{\visitxatotalhateasy_\episode\rbr{u,v}} \rbr{\sum_{\horizon'=\horizon+1}^{\horizon(\state)-1} \sum_{\mno\in\sasspace_{\horizon'}} \occmeasure^{\transeasy, \policy_\episode}(m,n\vert w)} \notag\\
\leq & \sum_{\horizon'=0}^{\horizontotal-1} \sum_{\mno\in\sasspace_{\horizon'}} \sum_{\horizon=0}^{\horizon(\state)-1} \occmeasure^{\transeasy, \policy_\episode}(m,n)\cdot \frac{\confcountxa + \ln\iota}{\visitxatotalhateasy_\episode\rbr{m,n}} + \statesize \sum_{\horizon=0}^{\horizon(\state)-1} \sum_{\uvw\in\sasspace_\horizon} \frac{\occmeasure^{\transeasy, \policy_\episode}(u,v)\ln\iota}{\visitxatotalhateasy_\episode\rbr{u,v}}\notag\\ 
\leq & \horizontotal\statesize\sum_{\state\neq\state_\horizontotal}\sum_{\action\in\actionspace}\occmeasure^{\transeasy,\policy_\episode}\rbr{\state,\action} \frac{\confcountxa + \ln\iota}{\visitxatotalhateasy_\episode\rbr{\state,\action}} + \statesize^2\sum_{\state\neq\state_\horizontotal}\sum_{\action\in\actionspace} \frac{\occmeasure^{\transeasy,\policy_\episode}\rbr{\state,\action} \ln\iota}{\visitxatotalhateasy_\episode\rbr{\state,\action}}, \notag
\end{align}
where the first inequality uses the fact that $\visitxatotalhateasy_\episode\rbr{\state,\action}\geq\cO\rbr{\confcountxa+\ln\iota}$ according to the definition in Eq.~\eqref{def: visitxatotalhat}; the second inequality follows from the facts that $\sum_{\uvw\in\sasspace_\horizon} \occmeasure^{\transeasy, \policy_\episode}(u,v) \cdot \transeasy\rbr{w\vert u,v} \cdot \occmeasure^{\transeasy, \policy_\episode}(m,n\vert w) = \occmeasure^{\transeasy, \policy_\episode}(m,n)$ and $\sum_{m\in\statespace_{\horizon'}} \sum_{n\in\actionspace}\occmeasure^{\transeasy, \policy_\episode}(m,n\vert w) \leq 1$.
For \textsc{Term} (B.3), we have 
\begin{align}
\text{\textsc{Term} (B.3)} &=
\sum_{\horizon=0}^{\horizon(\state)-1} \sum_{\uvw\in\sasspace_\horizon} \occmeasure^{\transeasy, \policy_\episode}(u,v) \cdot \frac{\confcountxa + \ln\iota}{\visitxatotalhateasy_\episode\rbr{u,v}} \cdot \sum_{\horizon'=\horizon+1}^{\horizon(\state)-1} \sum_{\mno\in\sasspace_{\horizon'}} \occmeasure^{\transeasy, \policy_\episode}(m,n\vert w) \notag\\
&= \sum_{\horizon=0}^{\horizon(\state)-1} \sum_{\uvw\in\sasspace_\horizon} \occmeasure^{\transeasy, \policy_\episode}(u,v) \cdot \frac{\confcountxa + \ln\iota}{\visitxatotalhateasy_\episode\rbr{u,v}} \cdot \rbr{\sum_{\horizon'=\horizon+1}^{\horizon(\state)-1} \sum_{o\in\statespace_{\horizon'+1}} 1} \notag \\
&\leq \statesize^2 \sum_{u\neq\state_\horizontotal}\sum_{v\in\actionspace} \occmeasure^{\transeasy, \policy_\episode}(u,v) \cdot \frac{\confcountxa + \ln\iota}{\visitxatotalhateasy_\episode\rbr{u,v}}. \notag 
\end{align}

Putting all the bounds for these terms together yields the bound of the lemma,
\begin{align}
    \abr{\occmeasure^{\transeasy', \policy_\episode}(\state) - \occmeasure^{\transeasy, \policy_\episode}(\state)} \leq &  \cO\rbr{\sum_{\horizon=0}^{\horizon(\state)-1} \sum_{\uvw\in\sasspace_\horizon} \occmeasure^{\transeasy, \policy_\episode}(u,v) \cdot \sqrt{\frac{\transeasy\rbr{w\vert u,v}\rbr{1-\transeasy\rbr{w\vert u,v}}\ln\iota}{\visitxatotalhateasy_\episode\rbr{u,v}}} \cdot \occmeasure^{\transeasy, \policy_\episode}(\state\vert w)}  \notag\\ 
    &+ \cO\rbr{\statesize^2 \sum_{u\neq\state_\horizontotal}\sum_{v\in\actionspace} \occmeasure^{\transeasy, \policy_\episode}(u,v) \cdot \frac{\confcountxa + \ln\iota}{\visitxatotalhateasy_\episode\rbr{u,v}}}. \notag
\end{align}
\end{proof}

% \begin{lemma}
% Suppose the event in Lemma \ref{lemma: concentration of private transition error} holds. 
% For any transition $\transeasy' \in \transspace_{\episode}$ and policy $\policy_\episode$, we have
% \begin{align}
% \sum_{\state\in\statespace} \abr{\occmeasure^{\transeasy',\policy_\episode}(\state) - \occmeasure^{\transeasy,\policy_\episode}(\state)} \leq & \cO\rbr{\horizontotal\sum_{\horizon=0}^{\horizontotal-1} \sum_{u\in\state_\horizon}\sum_{v\in\actionspace} \occmeasure^{\transeasy, \policy_\episode}(u,v) \cdot \sqrt{\frac{\locsupport_{u,v}\ln\iota}{\visitxatotalhateasy_\episode\rbr{u,v}}}} \notag\\
% & + \cO\rbr{\statesize^3 \sum_{u\neq\state_\horizontotal}\sum_{v\in\actionspace} \occmeasure^{\transeasy, \policy_\episode}(u,v) \cdot \frac{\confcountxa + \ln\iota}{\visitxatotalhateasy_\episode\rbr{u,v}}} \notag
% \end{align}
% \end{lemma}
% \begin{proof}
% According to Lemma \ref{lemma: occupancy measure error in transition confidence set}, we have 
% \begin{align}
% & \sum_{\state\in\statespace} \abr{\occmeasure^{\transeasy',\policy_\episode}(\state) - \occmeasure^{\transeasy,\policy_\episode}(\state)} \notag\\
% \leq & \cO\rbr{\sum_{\state\in\statespace} \sum_{\horizon=0}^{\horizon(\state)-1} \sum_{\uvw\in\sasspace_\horizon} \occmeasure^{\transeasy, \policy_\episode}(u,v) \cdot \sqrt{\frac{\transeasy\rbr{w\vert u,v}\rbr{1-\transeasy\rbr{w\vert u,v}}\ln\iota}{\visitxatotalhateasy_\episode\rbr{u,v}}} \cdot \occmeasure^{\transeasy, \policy_\episode}(\state\vert w)} \notag \\ 
% &+ \cO\rbr{\statesize^3 \sum_{u\neq\state_\horizontotal}\sum_{v\in\actionspace} \occmeasure^{\transeasy, \policy_\episode}(u,v) \cdot \frac{\confcountxa + \ln\iota}{\visitxatotalhateasy_\episode\rbr{u,v}}} \notag\\
% \leq & \cO\rbr{\horizontotal\sum_{\horizon=0}^{\horizon(\state)-1} \sum_{\uvw\in\sasspace_\horizon} \occmeasure^{\transeasy, \policy_\episode}(u,v) \cdot \sqrt{\frac{\transeasy\rbr{w\vert u,v}\rbr{1-\transeasy\rbr{w\vert u,v}}\ln\iota}{\visitxatotalhateasy_\episode\rbr{u,v}}}} \notag \\
% &+ \cO\rbr{\statesize^3 \sum_{u\neq\state_\horizontotal}\sum_{v\in\actionspace} \occmeasure^{\transeasy, \policy_\episode}(u,v) \cdot \frac{\confcountxa + \ln\iota}{\visitxatotalhateasy_\episode\rbr{u,v}}} \notag \\
% \leq & \cO\rbr{\horizontotal\sum_{\horizon=0}^{\horizontotal-1} \sum_{u\in\state_\horizon}\sum_{v\in\actionspace} \occmeasure^{\transeasy, \policy_\episode}(u,v) \cdot \sqrt{\frac{\locsupport_{u,v}\ln\iota}{\visitxatotalhateasy_\episode\rbr{u,v}}}} + \cO\rbr{\statesize^3 \sum_{u\neq\state_\horizontotal}\sum_{v\in\actionspace} \occmeasure^{\transeasy, \policy_\episode}(u,v) \cdot \frac{\confcountxa + \ln\iota}{\visitxatotalhateasy_\episode\rbr{u,v}}}. \notag
% \end{align}
% \end{proof}

% \begin{lemma}
% Suppose the event in Lemma \ref{lemma: concentration of private transition error} holds. 
% For any transition sequence $\cbr{\transeasy'_\episode}_{\episode\in[\episodetotal]}$ and policy sequence $\cbr{\policy_\episode}_{\episode\in[\episodetotal]}$, where $\transeasy'_\episode \in \transspace_{\episode}$, we have
% $$\sum_{\episode=1}^\episodetotal \sum_{\state\in\statespace} \abr{\occmeasure^{\transeasy'_\episode,\policy_\episode}(\state) - \occmeasure^{\transeasy_\episode,\policy_\episode}(\state)} \leq \cO\rbr{\horizontotal \sum_{\horizon=0}^{\horizontotal-1} \sqrt{\ln\iota  \sum_{(\state,\action)\in\statespace_\horizon\times\actionspace} \locsupport_{\state,\action}\episodetotal} + \statesize^4\actionsize\rbr{\confcountxa + \ln\iota}}. $$
% \end{lemma}
% \begin{proof}
% According to Lemma \ref{lemma: occupancy measure error in transition confidence set}, we have 
% \begin{align}
% & \sum_{\episode=1}^\episodetotal \sum_{\state\in\statespace} \abr{\occmeasure^{\transeasy'_\episode,\policy_\episode}(\state) - \occmeasure^{\transeasy_\episode,\policy_\episode}(\state)}  \notag \\
% \leq & \cO\rbr{\horizontotal\sum_{\episode=1}^\episodetotal \sum_{\horizon=0}^{\horizontotal-1} \sum_{u\in\state_\horizon}\sum_{v\in\actionspace} \occmeasure^{\transeasy, \policy_\episode}(u,v) \cdot \sqrt{\frac{\locsupport_{u,v}\ln\iota}{\visitxatotaleasy_\episode\rbr{u,v}}}} 
% + \cO\rbr{\statesize^3 \sum_{\episode=1}^\episodetotal \sum_{u\neq\state_\horizontotal}\sum_{v\in\actionspace} \occmeasure^{\transeasy, \policy_\episode}(u,v) \cdot \frac{\confcountxa + \ln\iota}{\visitxatotaleasy_\episode\rbr{u,v}}} \notag \\
% \leq & \cO\rbr{\horizontotal \sum_{\horizon=0}^{\horizontotal-1} \sqrt{\ln\iota}\rbr{ \sqrt{\sum_{(\state,\action)\in\statespace_\horizon\times\actionspace} \locsupport_{\state,\action}\episodetotal} + \sqrt{\statesize_{\horizon+1}}\statesize_\horizon\actionsize\log\episodetotal + \sqrt{\statesize_{\horizon+1}}\log\iota}} \notag\\
% &+ \cO\rbr{\statesize^3 \sum_{\horizon=0}^{\horizontotal-1} \rbr{\confcountxa + \ln\iota} \rbr{\statesize_\horizon\actionsize\log\episodetotal + \log\iota}} \notag \\
% \leq & \cO\rbr{\horizontotal \sum_{\horizon=0}^{\horizontotal-1} \sqrt{\ln\iota  \sum_{(\state,\action)\in\statespace_\horizon\times\actionspace} \locsupport_{\state,\action}\episodetotal} + \statesize^4\actionsize\rbr{\confcountxa + \ln\iota}}. 
% \end{align}
% \end{proof}

\subsection{Estimation Error}
\begin{lemma}[Lemma 10 in \cite{jin20c}]\label{lemma: estimate error without variance parameter}
    With probability at least $1-\delta$, we have for all $\horizon\in [\horizontotal]$, 
$$\sum_{\episode=1}^\episodetotal \sum_{(\state,\action)\in\statespace_\horizon\times\actionspace} \frac{\occmeasure^{\transeasy,\policy_\episode}(\state,\action)}{\max\cbr{\visitxatotal,1}} = \cO\rbr{\statesize_\horizon\actionsize\log\episodetotal + \log\rbr{\frac{\horizontotal}{\delta}}},$$
and 
$$\sum_{\episode=1}^\episodetotal \sum_{(\state,\action)\in\statespace_\horizon\times\actionspace} \frac{\occmeasure^{\transeasy,\policy_\episode}(\state,\action)}{\sqrt{\max\cbr{\visitxatotal,1}}} = \cO\rbr{\sqrt{\statesize_\horizon\actionsize\episodetotal} + \statesize_\horizon\actionsize\log\episodetotal + \log\rbr{\frac{\horizontotal}{\delta}}}.$$
\end{lemma}

\begin{proposition}\label{prop: estimate error without variance parameter in our paper}
Let $\cE_{EST}$ be the event such that we have for all $\horizon\in[\horizontotal]$ simultaneously
$$\sum_{\episode=1}^\episodetotal \sum_{(\state,\action)\in\statespace_\horizon\times\actionspace} \frac{\occmeasure^{\transeasy,\policy_\episode}(\state,\action)}{\max\cbr{\visitxatotal,1}} = \cO\rbr{\statesize_\horizon\actionsize\log\episodetotal + \log\iota},$$
and 
$$\sum_{\episode=1}^\episodetotal \sum_{(\state,\action)\in\statespace_\horizon\times\actionspace} \frac{\occmeasure^{\transeasy,\policy_\episode}(\state,\action)}{\sqrt{\max\cbr{\visitxatotal,1}}} = \cO\rbr{\sqrt{\statesize_\horizon\actionsize\episodetotal} + \statesize_\horizon\actionsize\log\episodetotal + \log\iota}.$$
We have $\prob\sbr{\cE_{EST}} \geq 1-\delta$.
\end{proposition}
\begin{proof}
    The proof directly follows from the definition of $\iota$, which ensures that $\iota\geq\frac{\horizontotal}{\delta}$.
\end{proof}

Based on the event $\cE_{EST}$, we introduce the following lemma which is critical in analyzing the estimation error.
\begin{lemma}\label{lemma: estimate error with variance parameter}
Suppose the event $\cE_{EST}$ defined in Proposition \ref{prop: estimate error without variance parameter in our paper} holds. 
Then we have for all $\horizon\in [\horizontotal]$,  
$$\sum_{\episode=1}^\episodetotal \sum_{(\state,\action)\in\statespace_\horizon\times\actionspace} \frac{\occmeasure^{\transeasy,\policy_\episode}(\state,\action)}{\visitxatotalhat} = \cO\rbr{\statesize_\horizon\actionsize\log\episodetotal + \log\iota},$$
and 
$$\sum_{\episode=1}^\episodetotal \sum_{(\state,\action)\in\statespace_\horizon\times\actionspace} \frac{\occmeasure^{\transeasy,\policy_\episode}(\state,\action)}{\sqrt{\visitxatotalhat}}\sqrt{\locsupport_{\state,\action}} \leq \cO\rbr{\sqrt{\sum_{(\state,\action)\in\statespace_\horizon\times\actionspace} \locsupport_{\state,\action}\episodetotal} + \sqrt{\statesize_{\horizon+1}}\statesize_\horizon\actionsize\log\episodetotal + \sqrt{\statesize_{\horizon+1}}\log\iota}. $$
\end{lemma}
\begin{proof}
Since $\visitxatotalhat\geq\visitxatotal$ always holds by definition, the first equation follows directly from Proposition~\ref{prop: estimate error without variance parameter in our paper}.

For the second inequality, similarly to the proof of Lemma 10 in \cite{jin20c}, we decompose the term as
\begin{align}
\sum_{\episode=1}^\episodetotal \sum_{(\state,\action)\in\statespace_\horizon\times\actionspace} \frac{\occmeasure^{\transeasy,\policy_\episode}(\state,\action)}{\sqrt{\visitxatotalhat}}\sqrt{\locsupport_{\state,\action}} &\leq \sum_{\episode=1}^\episodetotal \sum_{(\state,\action)\in\statespace_\horizon\times\actionspace} \frac{\occmeasure^{\transeasy,\policy_\episode}(\state,\action)}{\sqrt{\visitxatotal}}\sqrt{\locsupport_{\state,\action}}  \notag\\
&= \sum_{\episode=1}^\episodetotal \sum_{(\state,\action)\in\statespace_\horizon\times\actionspace} \frac{\II_\episode\rbr{\state,\action}}{\sqrt{\visitxatotal}}\sqrt{\locsupport_{\state,\action}} 
+ \sum_{\episode=1}^\episodetotal \sum_{(\state,\action)\in\statespace_\horizon\times\actionspace} \frac{\occmeasure^{\transeasy,\policy_\episode}(\state,\action) - \II_\episode\rbr{\state,\action}}{\sqrt{\visitxatotal}}\sqrt{\locsupport_{\state,\action}}.  \notag
\end{align}
The first term is bounded by
\begin{align}
\sum_{\episode=1}^\episodetotal \sum_{(\state,\action)\in\statespace_\horizon\times\actionspace} \frac{\II_\episode\rbr{\state,\action}}{\sqrt{\visitxatotal}}\sqrt{\locsupport_{\state,\action}} &= \cO\rbr{\sum_{(\state,\action)\in\statespace_\horizon\times\actionspace}\sqrt{\locsupport_{\state,\action}}\cdot \sqrt{\visitxatotaleasy_\episodetotal\rbr{\state,\action}}} \notag\\
&\leq \cO\rbr{\sqrt{\sum_{(\state,\action)\in\statespace_\horizon\times\actionspace} \locsupport_{\state,\action}\episodetotal}}, \notag
\end{align}
according to Lemma~\ref{lemma: jaksch sequence number lemma} and the Cauchy--Schwarz inequality.

For the second term, we apply Lemma \ref{lemma: concentration for martingale} with $Y_\episode = \sum_{(\state,\action)\in\statespace_\horizon\times\actionspace} \frac{\occmeasure^{\transeasy,\policy_\episode}(\state,\action) - \II_\episode\rbr{\state,\action}}{\sqrt{\visitxatotal}} \leq 1$, $\lambda=1$, and the fact 
$$\expect_{\episode}\sbr{Y_\episode^2} \leq \expect_{\episode} \sbr{\rbr{ \sum_{(\state,\action)\in\statespace_\horizon\times\actionspace} \frac{\II_\episode\rbr{\state,\action}}{\sqrt{\visitxatotal}}}^2} = \sum_{(\state,\action)\in\statespace_\horizon\times\actionspace} \frac{\occmeasure^{\transeasy,\policy_\episode}(\state,\action)}{\visitxatotal},$$
and combine the upper bound of $\locsupport_{\state,\action}$ in Lemma \ref{lemma: upper bound for local effective support} and Lemma \ref{lemma: estimate error without variance parameter},
$$\sum_{\episode=1}^\episodetotal \sum_{(\state,\action)\in\statespace_\horizon\times\actionspace} \frac{\occmeasure^{\transeasy,\policy_\episode}(\state,\action) - \II_\episode\rbr{\state,\action}}{\sqrt{\visitxatotal}}\sqrt{\locsupport_{\state,\action}} \leq \sqrt{\statesize_{\horizon+1}} \cdot \rbr{ \sum_{\episode=1}^\episodetotal \sum_{(\state,\action)\in\statespace_\horizon\times\actionspace} \frac{\occmeasure^{\transeasy,\policy_\episode}(\state,\action)}{\visitxatotal} + \ln\iota}.$$
Combining both terms, we prove the result.
\end{proof}