\subsection{Confidence Bound with Privacy}
We first define the \emph{non-private} empirical transition probability and the \emph{private} empirical transition probability as follows,
$$\transesteasy_\episode \rbr{\state^\prime\vert\state,\action} := \frac{\visitxaxtotal}{\visitxatotal}, \quad \transprieasy_\episode \rbr{\state^\prime\vert\state,\action} := \frac{\visitxaxtotalpri}{\visitxatotalpri}.$$

% \begin{lemma}\label{lemma: good event non-private transition estimate error bound}
% For any $\delta\in(0,1]$, we have a good event on the non-private transition estimate with high probability $1-\delta$, 
% \begin{align}\notag
% \forall \episode,\state,\action: \Vert\transeasy_\episode \rbr{\cdot \vert \state, \action} & - \transesteasy_\episode \rbr{\cdot \vert \state, \action} \Vert_1 
% \leq \sqrt{\frac{2}{\visitxatotal}\ln\frac{\rbr{2^{\statesize_{\horizon+1}}-2}\statesize\actionsize\episodetotal}{\delta}} \\
% &\leq \sqrt{\frac{2}{\visitxatotal}\rbr{\statesize_{\horizon+1}\ln2 + \ln\frac{\statesize\actionsize\episodetotal}{\delta}}}
% \triangleq \frac{\confconstant}{\sqrt{\visitxatotal}}, \notag
% \end{align}
% where $\confconstant = \sqrt{2\rbr{\statesize_{\horizon+1}\ln2 + \ln\frac{\statesize\actionsize\episodetotal}{\delta}}}$.
% \end{lemma}
% \begin{proof}
%     This result follows from \cite{weissman2003inequalities} by applying logarithmic math operations and the union bound over all $\rbr{\state,\action}\in\statespace\times\actionspace$ and all possible values of $\visitxatotal$ and $\episode$.
% \end{proof}

% \begin{lemma}[Concentration of private estimates]
% \label{lemma: concentration of transition estimates}
%     Fix any $\pripara>0$ and $\delta\in(0,1]$. Then, under assumption \ref{assp: private counts}, with probability at least $1-3\delta$, uniformly over all $\rbr{\state,\action,\horizon,\episode}$,
%     \begin{equation}
%         \normtranserror_\horizon^\episode \rbr{\state,\action} \triangleq \nbr{\transeasy_\horizon \rbr{\cdot \vert \state, \action} - \transprieasy_\horizon^\episode \rbr{\cdot \vert \state, \action} }_1 \leq \confnormtrans{\rbr{\state,\action}},
%     \end{equation}
%     where $\confnormtrans{\rbr{\state,\action}} := \frac{\confconstant}{\sqrt{\visitxatotalpri}} + \frac{2\statesize_{\horizon+1}\confcountxax}{\visitxatotalpri}$, and $\confconstant := \sqrt{2\statesize_{\horizon+1}\ln\frac{\statesize\actionsize\episodetotal}{\delta}}$.
% \end{lemma}

% \begin{proof}[Proof of Lemma \ref{lemma: concentration of private transition L1 error}]
% Assuming that the good event in Lemma~\ref{lemma: good event non-private transition estimate error bound} and the event specified in Assumption \ref{assp: private counts}$(1)$ hold, we employ similar proof used in Lemma 1 in \cite{chowdhury2022differentially} and Lemma B.2 in \cite{qiao2023near}, then Lemma~\ref{lemma: concentration of private transition L1 error} holds.
% That is, with probability $1-3\delta$, we have
% \begin{equation}\label{eq: confidence set of transition}
% \begin{aligned}
%      \forall \episode,\state,\action, \quad \nbr{\transeasy \rbr{\cdot \vert \state, \action} - \transprieasy_{\episode} \rbr{\cdot \vert \state, \action} }_1 
%     \leq \beta_{\episode}\rbr{\state,\action},  
% \end{aligned}
% \end{equation}
% % with $\confnormtrans{\rbr{\state,\action}} := $ $ \sqrt{\frac{2\statesize_{\horizon\rbr{\state}+1}\ln\rbr{\statesize\actionsize\episodetotal/\delta}}{\visitxatotalpri}} + \frac{2\statesize_{\horizon\rbr{\state}+1}\confcountxax}{\visitxatotalpri}$.
% with $\confnormtrans{\rbr{\state,\action}} := \frac{\confconstant}{\sqrt{\visitxatotalpri}} + \frac{2\statesize_{\horizon\rbr{\state}+1}\confcountxax}{\visitxatotalpri}$.
% \end{proof}

\begin{lemma}[Lemma 2, \citep{jin20c}] \label{lemma: element-wise confidence set of empirical transition}
    With probability at least $1-4\delta$, we have a good event
    \begin{equation}\notag
    \begin{aligned}
     \forall \sas\in\sasspace_\horizon, \forall \horizon,\episode, \quad \abr{\transeasy \rbr{\state' \vert \state, \action} - \transesteasy_{\episode} \rbr{\state' \vert \state, \action} } 
    \leq \bar{\confpwtrans}_{\episode}\rbr{\state'\vert\state,\action},  
    \end{aligned}
    \end{equation}
and $\bar{\confpwtrans}_{\episode}\rbr{\state'\vert\state,\action}$ for any $\sas\in\sasspace_\horizon$ and $\horizon\in\sbr{\horizontotal}$ is defined as 
$$\bar{\confpwtrans}_{\episode}\rbr{\state'\vert\state,\action} = \min\cbr{1,\sqrt{\frac{2\transesteasy_\episode\rbr{\state'\vert\state,\action}\rbr{1-\transesteasy_\episode\rbr{\state'\vert\state,\action}}\ln\iota}{\visitxatotal}} + \frac{14\ln\iota}{3\visitxatotal}}.$$
\end{lemma}

Next, we bound the difference between the private transition estimate and the non-private transition estimate in the following lemmas.
\begin{lemma} \label{lemma: difference between private transition estimate and the non-private transition estimate - element wise}
    For all $\sas\in\sasspace_\horizon, \horizon \in [\horizontotal], \episode\in [\episodetotal]$, we have
    $$\abr{\transprieasy_\episode\rbr{\state'\vert\state,\action} - \transesteasy_\episode\rbr{\state'\vert\state,\action} }\leq  \frac{2\confcountxax}{\visitxatotalpri}.$$
\end{lemma}
\begin{proof}
By definition, we have 
\begin{align}
    \abr{\transprieasy_\episode\rbr{\state'\vert\state,\action} - \transesteasy_\episode\rbr{\state'\vert\state,\action} } &\leq \abr{\frac{\visitxaxtotalpri}{\visitxatotalpri} - \frac{\visitxaxtotal}{\visitxatotalpri}} + \abr{\frac{\visitxaxtotal}{\visitxatotalpri} - \frac{\visitxaxtotal}{\visitxatotal}} \notag\\
    &\leq \frac{\confcountxa}{\visitxatotalpri} + \frac{\visitxaxtotal\confcountxa} {\visitxatotal\cdot\visitxatotalpri} \notag\\
    &\leq \frac{2\confcountxax}{\visitxatotalpri}. \notag
\end{align}
\end{proof}

\begin{lemma}\label{lemma: difference between private transition estimate and the non-private transition estimate - element variance wise}
    For all $\sas\in\sasspace_\horizon, \horizon \in [\horizontotal], \episode\in [\episodetotal]$, we have,
    $$\abr{\transprieasy_\episode\rbr{\state'\vert\state,\action} \rbr{1-\transprieasy_\episode\rbr{\state'\vert\state,\action}} - \transesteasy_\episode\rbr{\state'\vert\state,\action} \rbr{1-\transesteasy_\episode\rbr{\state'\vert\state,\action}} }\leq  \frac{4\confcountxax}{\visitxatotalpri}.$$
\end{lemma}
\begin{proof}
Similar to Lemma \ref{lemma: difference between private transition estimate and the non-private transition estimate - element wise}, we have,
\begin{align}
    &\abr{\transprieasy_\episode\rbr{\state'\vert\state,\action} \rbr{1-\transprieasy_\episode\rbr{\state'\vert\state,\action}} - \transesteasy_\episode\rbr{\state'\vert\state,\action} \rbr{1-\transesteasy_\episode\rbr{\state'\vert\state,\action}} }  \notag\\
    =& \abr{\frac{\visitxaxtotal}{\visitxatotal} \cdot \frac{\visitxatotal-\visitxaxtotal}{\visitxatotal} - \frac{\visitxaxtotalpri}{\visitxatotalpri} \cdot \frac{\visitxatotalpri-\visitxaxtotalpri}{\visitxatotalpri}}  \notag\\
    \leq& \abr{\frac{\visitxatotal-\visitxaxtotal}{\visitxatotal}\sbr{\frac{\visitxaxtotal}{\visitxatotal} - \frac{\visitxaxtotal}{\visitxatotalpri}}}  \notag\\
     & + \abr{\frac{\visitxatotalpri-\visitxaxtotalpri}{\visitxatotalpri} \sbr{\frac{\visitxaxtotalpri}{\visitxatotalpri} - \frac{\visitxaxtotal}{\visitxatotalpri}}} \notag\\
     & + \abr{\frac{\visitxaxtotal}{\visitxatotalpri}\sbr{\frac{\visitxaxtotalpri}{\visitxatotalpri} - \frac{\visitxaxtotal}{\visitxatotal}}} \notag\\
   \leq & \frac{\visitxaxtotal\confcountxa}{\visitxatotal\cdot\visitxatotalpri} + \frac{\confcountxax}{\visitxatotalpri} + \frac{\visitxaxtotal}{\visitxatotal}\cdot\frac{2\confcountxa}{\visitxatotalpri} \notag \\
   \leq & \frac{4\confcountxa}{\visitxatotalpri}. \notag
\end{align}
\end{proof}

With the help of the lemmas above, we are now ready to prove Lemma~\ref{lemma: concentration of private transition error}.
\begin{lemma}[Restatement of Lemma \ref{lemma: concentration of private transition error}]\label{lemma: element-wise confidence set of private transition}
    With probability at least $1-6\delta$, we have a good event,
    \begin{align}
     \forall \sas\in\sasspace_\horizon, \forall \horizon,\episode, \quad \abr{\transeasy \rbr{\state' \vert \state, \action} - \transprieasy_{\episode} \rbr{\state' \vert \state, \action} } 
    \leq \confpwtrans_{\episode}\rbr{\state'\vert\state,\action},  \notag
    \end{align}
and $\confpwtrans_{\episode}\rbr{\state'\vert\state,\action}$ for any $\sas\in\sasspace_\horizon$ and $\horizon\in\sbr{\horizontotal}$ is defined as 
\begin{align}
\confpwtrans_{\episode}\rbr{\state'\vert\state,\action} &= \min\cbr{1,\sqrt{\frac{2\transprieasy_\episode\rbr{\state'\vert\state,\action}\rbr{1-\transprieasy_\episode\rbr{\state'\vert\state,\action}}\ln\iota}{\visitxatotalpri}} + \frac{4\confcountxa+7\ln\iota}{\visitxatotalpri}} \notag\\
&= \min\cbr{1,\sqrt{\frac{2\transprieasy_\episode\rbr{\state'\vert\state,\action}\rbr{1-\transprieasy_\episode\rbr{\state'\vert\state,\action}}\ln\iota}{\visitxatotalhat}} + \frac{4\confcountxa+7\ln\iota}{\visitxatotalhat}}. \notag
\end{align}
\end{lemma}
\begin{proof}[Proof of Lemma \ref{lemma: element-wise confidence set of private transition}]
With the standard decomposition, we have,
\begin{align}
    \abr{\transprieasy_{\episode} \rbr{\state' \vert \state, \action} - \transeasy \rbr{\state' \vert \state, \action}} \leq& \abr{\frac{\visitxaxtotalpri-\visitxaxtotal}{\visitxatotalpri}} + \abr{\frac{\visitxaxtotal}{\visitxatotalpri} - \transeasy \rbr{\state' \vert \state, \action}}  \notag\\
    \leq & \frac{\confcountxa}{\visitxatotalpri} + \abr{\frac{\visitxaxtotal}{\visitxatotal}\cdot\frac{\visitxatotal}{\visitxatotalpri} - \transeasy \rbr{\state' \vert \state, \action}} \notag\\
    \leq & \frac{\confcountxa}{\visitxatotalpri} + \abr{\frac{\visitxatotal}{\visitxatotalpri}\cdot\rbr{\frac{\visitxaxtotal}{\visitxatotal} - \transeasy \rbr{\state' \vert \state, \action}}} + \abr{\transeasy \rbr{\state' \vert \state, \action} \rbr{\frac{\visitxatotal}{\visitxatotalpri} - 1}} \notag\\
    \leq & \frac{2\confcountxa}{\visitxatotalpri} + \frac{\visitxatotal}{\visitxatotalpri} \cdot \abr{\transesteasy_{\episode}\rbr{\state' \vert \state, \action} - \transeasy\rbr{\state' \vert \state, \action}} \notag\\
    \leq & \frac{2\confcountxa}{\visitxatotalpri} + \frac{\visitxatotal}{\visitxatotalpri} \cdot \rbr{ \sqrt{\frac{2\transesteasy_\episode\rbr{\state'\vert\state,\action}\rbr{1-\transesteasy_\episode\rbr{\state'\vert\state,\action}}\ln\iota}{\visitxatotal}} + \frac{14\ln\iota}{3\visitxatotal}} \notag\\
    \leq & \frac{2\confcountxa}{\visitxatotalpri} + \sqrt{\frac{2\rbr{\transprieasy_\episode\rbr{\state'\vert\state,\action}\rbr{1-\transprieasy_\episode\rbr{\state'\vert\state,\action}} + \frac{4\confcountxa}{\visitxatotalpri}}\ln\iota}{\visitxatotalpri}} + \frac{14\ln\iota}{3\visitxatotalpri} \notag \\
    % \leq & \frac{2\confcountxa}{\visitxatotalpri} + 2\sqrt{\frac{\transprieasy_\episode\rbr{\state'\vert\state,\action}\rbr{1-\transprieasy_\episode\rbr{\state'\vert\state,\action}}\ln\iota}{\visitxatotalpri}} + \frac{4\sqrt{\confcountxa\ln\iota}}{\visitxatotalpri}+ \frac{14\ln\iota}{3\visitxatotalpri} \\
    \leq& \sqrt{\frac{2\transprieasy_\episode\rbr{\state'\vert\state,\action}\rbr{1-\transprieasy_\episode\rbr{\state'\vert\state,\action}}\ln\iota}{\visitxatotalpri}} + \frac{2\confcountxa+5\ln\iota+4\sqrt{\confcountxa\ln\iota}}{\visitxatotalpri} \notag\\
    \leq& \sqrt{\frac{2\transprieasy_\episode\rbr{\state'\vert\state,\action}\rbr{1-\transprieasy_\episode\rbr{\state'\vert\state,\action}}\ln\iota}{\visitxatotalpri}} + \frac{4\confcountxa+7\ln\iota}{\visitxatotalpri}, \notag
\end{align}
where the fifth inequality follows from Lemma~\ref{lemma: element-wise confidence set of empirical transition}, and the sixth inequality follows from Lemma~\ref{lemma: difference between private transition estimate and the non-private transition estimate - element variance wise}.
The seventh and eighth steps use $\sqrt{x+y}\leq\sqrt{x}+\sqrt{y}$ and $2\sqrt{xy}\leq x+y$ for $x,y\geq0$.
The second equality in the lemma statement follows from the definition of $\visitxatotalhat$ in Eq.~\eqref{def: visitxatotalhat}.
\end{proof}

\begin{lemma}\label{lemma: element-wise error in confidence set}
    Conditioning on the event in Lemma~\ref{lemma: element-wise confidence set of private transition}, it holds for any episode $\episode$ and any transition $\transeasy' \in \transspace_\episode$ that
    $$\abr{\transeasy' \rbr{\state' \vert \state, \action} - \transeasy \rbr{\state' \vert \state, \action} } 
    \leq \min\cbr{1, \sqrt{\frac{2\transeasy\rbr{\state'\vert\state,\action}\rbr{1-\transeasy\rbr{\state'\vert\state,\action}}\ln\iota}{\visitxatotalhat}} + \frac{7\confcountxa + 26\ln\iota}{\visitxatotalhat}}.$$
    % \leq \cO\rbr{\min\cbr{1,\sqrt{\frac{\transeasy\rbr{\state'\vert\state,\action}\rbr{1-\transeasy\rbr{\state'\vert\state,\action}} \ln\iota}{\visitxatotal}} + \frac{\confcountxax+\ln\iota}{\visitxatotal}}}$$
\end{lemma}
\begin{proof}
We have for any episode $\episode$ and any transition $\transeasy' \in \transspace_\episode$,
\begin{equation}\notag
\begin{aligned}
    \Big\vert\transeasy' &\rbr{\state' \vert \state, \action} - \transeasy \rbr{\state' \vert \state, \action} \Big\vert \leq \sqrt{\frac{2\transeasy'\rbr{\state'\vert\state,\action}\rbr{1-\transeasy'\rbr{\state'\vert\state,\action}}\ln\iota}{\visitxatotalhat}} + \frac{4\confcountxa+7\ln\iota}{\visitxatotalhat} \\
    \leq & \sqrt{\frac{2\ln\iota}{\visitxatotalhat}}\cdot\rbr{\sqrt{\transeasy\rbr{\state'\vert\state,\action}\rbr{1-\transeasy\rbr{\state'\vert\state,\action}}} + 3.8\sqrt{\frac{\ln\iota}{\visitxatotalhat}}+  1.5\sqrt{\frac{4\confcountxa+7\ln\iota}{\visitxatotalhat}}} + \frac{4\confcountxa+7\ln\iota}{\visitxatotalhat}  \\
    \leq & \sqrt{\frac{2\transeasy\rbr{\state'\vert\state,\action}\rbr{1-\transeasy\rbr{\state'\vert\state,\action}}\ln\iota}{\visitxatotalhat}} + \frac{7\confcountxa + 26\ln\iota}{\visitxatotalhat},
\end{aligned}
\end{equation}
where we apply Lemma \ref{lemma: revised bourel lemma} to Lemma \ref{lemma: element-wise confidence set of private transition} for the second step, and the third step follows from the fact that $\sqrt{x+y}\leq\sqrt{x}+\sqrt{y}$ and $\sqrt{xy}\leq\frac{1}{2}(x+y)$ for any $x,y\geq0$.
\end{proof}

\begin{corollary}\label{lemma: l1-norm error in confidence set}
    Conditioning on the event in Lemma~\ref{lemma: element-wise confidence set of private transition}, it holds for any episode $\episode$ and any transition $\transeasy' \in \transspace_\episode$ that
    $$ \nbr{\transeasy' \rbr{\cdot \vert \state, \action} - \transeasy \rbr{\cdot \vert \state, \action} }_1 \leq  
    \min\cbr{2,\sqrt{\frac{2\locsupport_{\state,\action}\ln\iota}{\visitxatotalhat}} + \frac{\statesize_{\horizon(\state)+1}\rbr{7\confcountxa + 26\ln\iota}}{\visitxatotalhat}},$$
    % \cO\rbr{\min\cbr{2,\sqrt{\frac{\statesize_{\horizon(\state)} \ln\iota}{\visitxatotal}} + \frac{\statesize_{\horizon(\state)}\rbr{\confcountxax+\ln\iota}}{\visitxatotal}}},$$
    where, for every $\rbr{\state,\action}$, $\locsupport_{\state,\action} := \rbr{\sum_{\state'\in\statespace_{\horizon(\state)+1}} \sqrt{\transeasy\rbr{\state'\vert\state,\action}\rbr{1-\transeasy\rbr{\state'\vert\state,\action}}}}^2$ denotes the \emph{local effective support} introduced in \cite{bourel2020tightening}.
\end{corollary}
\begin{proof}
We introduce \emph{local effective support} as $\locsupport_{\state,\action}$ from \cite{bourel2020tightening}.
We have for any episode $\episode$ and any transition $\transeasy' \in \transspace_\episode$,
\begin{equation}
\begin{aligned}
   \nbr{\transeasy' \rbr{\cdot \vert \state, \action} - \transeasy \rbr{\cdot \vert \state, \action} }_1 &\leq \sum_{\state'\in\statespace_{\horizon(\state)+1}} \rbr{\sqrt{\frac{2\transeasy\rbr{\state'\vert\state,\action}\rbr{1-\transeasy\rbr{\state'\vert\state,\action}}\ln\iota}{\visitxatotalhat}} + \frac{7\confcountxa + 26\ln\iota}{\visitxatotalhat}} \notag\\
   &\leq \sqrt{\frac{2\locsupport_{\state,\action}\ln\iota}{\visitxatotalhat}} + \frac{\statesize_{\horizon(\state)+1}\rbr{7\confcountxa + 26\ln\iota}}{\visitxatotalhat} \notag.
\end{aligned}
\end{equation}
\end{proof}

\begin{lemma}[Local effective support, Lemma 4 in \cite{bourel2020tightening}] \label{lemma: upper bound for local effective support}
    For any state-action pair $\rbr{\state,\action}$, the local effective support is bounded,
    $$\locsupport_{\state,\action} \leq \statesize_{\horizon(\state)+1} - 1.$$
\end{lemma}
    
\begin{lemma}[Lower bound of upper occupancy measure]\label{lemma: lower bound of upper occupancy measure}
For any episode $\episode$ and state $\state\neq\state_\horizontotal$, it always holds that $\uppocc_\episode\rbr{\state}\geq\frac{1}{\statesize\episodetotal}$.
\end{lemma}
\begin{proof}
Similar to Lemma D.28 in \cite{jin2023noregret}, one can construct a specific transition $\hat{\transeasy}\in\transspace_\episode$, such that $\occmeasure^{\hat{\transeasy},\policy}(\state)\geq \frac{1}{\statesize\episodetotal}$ for any $\policy$ given episode $\episode$ and $\state$, which suffices due to the definition of $\uppocc_\episode\rbr{\state}$.

Specifically, for any tuple $\sas\in\sasspace_\horizon$ with $\horizon=0,\ldots,\horizontotal-1$, define
$\hat{\transeasy}(\state'\vert\state,\action) = \transprieasy_\episode\rbr{\state'\vert\state,\action}\cdot\rbr{1-\frac{1}{\episodetotal}} + \frac{1}{\statesize_{\horizon+1}\episodetotal}$.
It is easy to verify that $\hat{\transeasy}$ is a valid transition function and belongs to the confidence set $\transspace_\episode$.
Finally, we have the lower bound of the upper occupancy measure,
$$\occmeasure^{\hat{\transeasy},\policy}(\state) = \sum_{u\in\statespace_{\horizon(\state)-1}}\sum_{v\in\actionspace} \occmeasure^{\hat{\transeasy},\policy}(u,v) \hat{\transeasy}(\state\vert u,v) \geq \sum_{u\in\statespace_{\horizon(\state)-1}}\sum_{v\in\actionspace} \occmeasure^{\hat{\transeasy},\policy}(u,v) \cdot \frac{1}{\statesize\episodetotal} = \frac{1}{\statesize\episodetotal}.$$
\end{proof}