\subsection{Regret guarantee in the full-information setting}
\label{appen: Regret guarantee on full-information setting}
% \subsubsection{Decomposition}
% \begin{align}
%     \regret = \max _{\occmeasure \in \occmeasureset{\transeasy}} \sum_{\episode=1}^\episodetotal \inner{\occmeasure^{\transeasy, \policy_\episode}-\occmeasure}{ \loss_\episode}  
%     = \underbrace{\max _{\occmeasure \in \occmeasureset{\transeasy}} \sum_{\episode=1}^\episodetotal \inner{\occmeasure^{\transeasy, \policy_\episode} - \occmeasure^{\transeasy_\episode, \policy_\episode}}{ \loss_\episode}}_{\textsc{ERROR}}
%         + \underbrace{ \max _{\occmeasure \in \occmeasureset{\transeasy}} \sum_{\episode=1}^\episodetotal \inner{\occmeasure^{\transeasy_\episode, \policy_\episode}-\occmeasure}{ \loss_\episode}}_{\textsc{REG}}
% \end{align}

We decompose the regret in the same way as in the non-private setting \citep{rosenberg2019onlineamdp}, and bound the \textsc{Error} and \textsc{Reg} terms separately:
\begin{align}
    % \regret = \max _{\occmeasure \in \occmeasureset\rbr{\transeasy}} \sum_{\episode=1}^\episodetotal \inner{\occmeasure^{\transeasy, \policy_\episode}-\occmeasure}{ \loss_\episode}  
    % = \underbrace{\max _{\occmeasure \in \occmeasureset\rbr{\transeasy}} \sum_{\episode=1}^\episodetotal \inner{\occmeasure^{\transeasy, \policy_\episode} - \occmeasure^{\transeasy_\episode, \policy_\episode}}{ \loss_\episode}}_{\textsc{Error}}
    %     + \underbrace{ \max _{\occmeasure \in \occmeasureset\rbr{\transeasy}} \sum_{\episode=1}^\episodetotal \inner{\occmeasure^{\transeasy_\episode, \policy_\episode}-\occmeasure}{ \loss_\episode}}_{\textsc{Reg}}
    \regret =  \sum_{\episode=1}^\episodetotal \inner{\occmeasure^{\transeasy, \policy_\episode}-\occmeasure^*}{ \loss_\episode}  
    = \underbrace{ \sum_{\episode=1}^\episodetotal \inner{\occmeasure^{\transeasy, \policy_\episode} - \occmeasure^{\transeasy_\episode, \policy_\episode}}{ \loss_\episode}}_{\textsc{Error}}
        + \underbrace{  \sum_{\episode=1}^\episodetotal \inner{\occmeasure^{\transeasy_\episode, \policy_\episode}-\occmeasure^*}{ \loss_\episode}}_{\textsc{Reg}},
\end{align}
where $\occmeasure^{\transeasy_\episode, \policy_\episode}:=\occmeasure_\episode$ is an intermediate variable that helps bound the regret, and the transition estimate $\transeasy_\episode$ is associated with the occupancy measure $\occmeasure_\episode$.



\subsubsection{Bounding \textsc{Error}}
\begin{lemma} 
\label{lemma: full-info occupancy error by privacy}
Under Assumption~\ref{assp: private counts} and Assumption~\ref{assp: Private loss in full-information setting}, with probability at least $1-7\delta$, we have 
\begin{align}\notag
    \textsc{Error} \leq \cO\rbr{\horizontotal\cumlocsupport\sqrt{\episodetotal} + \horizontotal\statesize^2\actionsize\ln\iota\log\episodetotal + \horizontotal\statesize^2\actionsize\confcountxa\log\episodetotal}.
\end{align}
\end{lemma}
\begin{proof}
For this proof, we condition on the events in Lemma \ref{lemma: element-wise error in confidence set} and $\cE_{EST}$ in Proposition \ref{prop: estimate error without variance parameter in our paper}, which hold with high probability.
Since $\loss_\episode \rbr{\state,\action} \in \sbr{0,1}$ for every state-action pair and episode, H\"{o}lder's inequality yields
\begin{align}
    & \sum_{\episode=1}^\episodetotal \inner{\occmeasure^{\transeasy, \policy_\episode} - \occmeasure^{\transeasy_\episode, \policy_\episode}}{ \loss_\episode} \notag \\ 
    & \leq \sum_{\episode=1}^\episodetotal \sum_{\horizon=0}^{\horizontotal-1} \sum_{\state \in \statespace_\horizon}   \sum_{\action \in \actionspace} \abr{\occmeasure^{\transeasy_\episode,\policy_\episode} \rbr{\state, \action} - \occmeasure^{\transeasy, \policy_\episode} \rbr{\state, \action}} \notag \\
    & = \sum_{\episode=1}^\episodetotal \sum_{\horizon=0}^{\horizontotal-1} \sum_{\state \in \statespace_\horizon} \abr{\occmeasure^{\transeasy_\episode,\policy_\episode} \rbr{\state} - \occmeasure^{\transeasy, \policy_\episode} \rbr{\state}} \notag \\
    & \leq \sum_{\episode=1}^\episodetotal \sum_{\horizon=0}^{\horizontotal-1} \sum_{\state \in \statespace_\horizon}
    \sum_{\horizon'=0}^{\horizon-1} \sum_{\uvw\in\sasspace_{\horizon'}} \occmeasure^{\transeasy, \policy_\episode}(u,v) \abr{\transeasy_\episode(w\vert u,v) - \transeasy(w\vert u,v)} \occmeasure^{\transeasy_\episode, \policy_\episode}(\state\vert w) \notag\\
    &\leq \horizontotal \sum_{\episode=1}^\episodetotal \sum_{\horizon=0}^{\horizontotal-1} \sum_{u\in\statespace_\horizon}\sum_{v\in\actionspace} \occmeasure^{\transeasy, \policy_\episode}(u,v) \nbr{\transeasy_\episode(\cdot\vert u,v) - \transeasy(\cdot\vert u,v)}_1 \notag\\
    &\leq \horizontotal \sum_{\horizon=0}^{\horizontotal-1} \sum_{\episode=1}^\episodetotal  \sum_{u\in\statespace_\horizon}\sum_{v\in\actionspace} \occmeasure^{\transeasy, \policy_\episode}(u,v) \cdot \cO\rbr{\sqrt{\frac{\locsupport_{u,v}\ln\iota}{\visitxatotalhateasy_\episode(u,v)}} + \frac{\statesize_{\horizon+1}\rbr{\confcountxa + \ln\iota}}{\visitxatotalhateasy_\episode(u,v)}} \notag\\
    &\leq \cO\rbr{\horizontotal\sum_{\horizon=0}^{\horizontotal-1}\rbr{\sqrt{\ln\iota\sum_{(\state,\action)\in\statespace_\horizon\times\actionspace} \locsupport_{\state,\action}\episodetotal} + \rbr{\sqrt{\statesize_{\horizon+1}\ln\iota} + \statesize_{\horizon+1} \rbr{\confcountxa+\ln\iota}} \rbr{\statesize_\horizon\actionsize\log\episodetotal + \ln\iota}}} \notag\\
    &\leq \cO\rbr{\horizontotal\cumlocsupport\sqrt{\episodetotal} + \horizontotal\statesize^2\actionsize\ln\iota\log\episodetotal + \horizontotal\statesize^2\actionsize\confcountxa\log\episodetotal},
\end{align}
where the third step applies Lemma \ref{lemma: general occupancy measure difference}; the fourth step rearranges the summation and uses the fact that $\sum_{\horizon=0}^{\horizontotal-1} \sum_{\state\in\statespace_\horizon}\occmeasure^{\transeasy_\episode,\policy_\episode}\rbr{\state\vert w} \leq \horizontotal$; the fifth step follows from the bound on the $L_1$-norm error between transitions within the confidence set in Lemma \ref{lemma: l1-norm error in confidence set}; and the final steps follow from the corollary of $\cE_{EST}$ in Lemma \ref{lemma: estimate error with variance parameter}.
\end{proof}

% \begin{lemma}[{\cite[Lemma B.2]{rosenberg2019onlineamdp}}]
%     Let $\cbr{\policy_\episode}_{\episode=1}^\episodetotal$ be policies and let $\cbr{\transeasy_\episode}_{\episode=1}^\episodetotal$ be transition functions. 
%     Then, for every $\horizon = 1,\cdots,\horizontotal -1$ and every $\episode = 1,\cdots,\episodetotal$, it holds that 
%     \begin{align}
%         \sum_{\state_\horizon \in \statespace_\horizon}   \sum_{\action_\horizon \in \actionspace} \abr{\occmeasure^{\transeasy_\episode,\policy_\episode} \rbr{\state_\horizon, \action_\horizon} - \occmeasure^{\transeasy, \policy_\episode} \rbr{\state_\horizon, \action_\horizon}}  
%         \leq \sum_{s=0}^{\horizon-1} \sum_{\state_s \in \statespace_s} \sum_{\action_s \in \actionspace_s} \occmeasure^{\transeasy,\policy_\episode}\rbr{\state_s,\action_s} \normtranserror_\episode \rbr{\state_s,\action_s},
%     \end{align}
%     where for every $\episode = 1, \cdots, \episodetotal$, and $\rbr{\state,\action}\in\statespace\times\actionspace$,  $\normtranserror_\episode \rbr{\state,\action} \triangleq \nbr{\transeasy_\episode \rbr{\cdot \vert \state, \action} - \transeasy \rbr{\cdot \vert \state, \action}}_1$.
% \end{lemma}

% Subsequently, we present the following lemma to bound the \textsc{Error} term. 
% This lemma represents a private version of the results presented in \cite{neu2012adversarial} and Lemma B.3 in \cite{rosenberg2019onlineamdp}, adapted to the private counters and a transition function confidence set, ensuring the attainment of the specified bounds.
% \begin{lemma}
% Let $\cbr{\policy_\episode}_{\episode=1}^\episodetotal$ be policies and let $\cbr{\transeasy_\episode}_{\episode=1}^\episodetotal$ be transition functions. 
% Then, with high probability at least $1-4\delta$,
% \begin{equation}
%     \begin{aligned}
%         \sum_{\episode=1}^\episodetotal \sum_{\horizon=0}^{\horizontotal-1} \sum_{s=0}^{\horizon-1} \sum_{\state_s \in \statespace_s} \sum_{\action_s \in \actionspace_s} \occmeasure^{\transeasy,\policy_\episode}\rbr{\state_s,\action_s} \normtranserror_\episode \rbr{\state_s,\action_s} 
%         \leq& 3\horizontotal\sqrt{2\statesize\actionsize\episodetotal}\cdot\sqrt{\statesize\ln2+\horizontotal\ln(\statesize\actionsize\episodetotal/\delta)} \\
%         & + 2\horizontotal\statesize \sqrt{2\episodetotal\ln \frac{\horizontotal}{\delta}} + c \horizontotal \statesize^2 \actionsize  \confcountxax \ln\rbr{\frac{\episodetotal}{\actionsize}}.
%     \end{aligned}
% \end{equation}
% \end{lemma}
% \begin{proof}
%     For every $1\leq\episode\leq\episodetotal, 0\leq\horizon\leq\horizontotal-1$, we have 
%     \begin{equation}\label{eq: ERROR term for full-info}
%     \begin{aligned}
%         &\sum_{s=0}^{\horizon-1} \sum_{\state_s \in \statespace_s} \sum_{\action_s \in \actionspace_s} \occmeasure^{\transeasy,\policy_\episode}\rbr{\state_s,\action_s} \normtranserror_\episode \rbr{\state_s,\action_s} \\ 
%         =& \sum_{s=0}^{\horizon-1}  \normtranserror_\episode \rbr{\state_s^\episode,\action_s^\episode} + \sum_{s=0}^{\horizon-1} \sum_{\state_s \in \statespace_s} \sum_{\action_s \in \actionspace_s} \rbr{\occmeasure^{\transeasy,\policy_\episode}\rbr{\state_s,\action_s} - \II\cbr{\state_s^\episode = \state_s, \action_s^\episode = \action_s}} \normtranserror_\episode \rbr{\state_s,\action_s}.
%     \end{aligned}
%     \end{equation}
%     For the first term of Eq.~\eqref{eq: ERROR term for full-info}, denote $\confconstant = \sqrt{2\statesize_{\horizon+1}\ln\rbr{\statesize\actionsize\episodetotal/\delta}}$ by Lemma \ref{lemma: concentration of private transition L1 error}, then we decompose it as term A and term B using the optimism property of private counters (Assumption \ref{assp: private counts}(1)). 
%     \begin{equation}
%     \begin{aligned}
%         \sum_{\episode=1}^\episodetotal \sum_{\horizon=0}^\horizontotal \sum_{s=0}^{\horizon-1} \normtranserror_\episode \rbr{\state_s^\episode, \action_s^\episode} 
%         \leq& \sum_{\episode=1}^\episodetotal \sum_{\horizon=0}^\horizontotal \sum_{s=0}^{\horizon-1} \frac{\confconstant[s]}{\sqrt{\max\cbr{1, \visitxatotalprieasy_\episode \rbr{\state_s^\episode, \action_s^\episode}}}}
%         + \sum_{\episode=1}^\episodetotal \sum_{\horizon=0}^\horizontotal \sum_{s=0}^{\horizon-1} \frac{2\statesize_{s+1}\confcountxax}{\max\cbr{1, \visitxatotalprieasy_\episode \rbr{\state_s^\episode, \action_s^\episode}}} \\
%         \leq& \underbrace{\sum_{\episode=1}^\episodetotal \sum_{\horizon=0}^\horizontotal \sum_{s=0}^{\horizon-1}  \frac{\confconstant[s]}{\sqrt{\max\cbr{1, \visitxatotaleasy_\episode \rbr{\state_s^\episode, \action_s^\episode}}}}}_{\text{term A}}
%         + \underbrace{\sum_{\episode=1}^\episodetotal \sum_{\horizon=0}^\horizontotal \sum_{s=0}^{\horizon-1} \frac{2\statesize_{s+1}\confcountxax}{\max\cbr{1, \visitxatotaleasy_\episode \rbr{\state_s^\episode, \action_s^\episode}}}}_{\text{term B}}. 
%     \end{aligned}
%     \end{equation}
%     We bound term A as below, and the second line results from the argument for the regret analysis of UCRL-2 \citep{jaksch2010near} (Lemma 19).
%     The third step follows from Jensen's inequality and the fourth step follows from Cauchy's inequality.
%     \begin{equation}
%     \begin{aligned}
%         \text{term A} &= \sum_{\episode=1}^\episodetotal \sum_{\horizon=0}^\horizontotal \sum_{s=0}^{\horizon-1}  \frac{\confconstant[s]}{\sqrt{\max\cbr{1, \visitxatotaleasy_\episode \rbr{\state_s^\episode, \action_s^\episode}}}} 
%         = \sum_{\horizon=0}^\horizontotal \sum_{s=0}^{\horizon-1} \sum_{\state_s,\action_s} \sum_{\episode=1}^\episodetotal \frac{\confconstant[s]\cdot \II\rbr{\state_s^\episode,\action_s^\episode=\state_s,\action_s}} {\sqrt{\max\cbr{1, \visitxatotaleasy_\episode \rbr{\state_s, \action_s}}}} \\
%         &\leq \sum_{\horizon=0}^\horizontotal \sum_{s=0}^{\horizon-1} \confconstant[s] \sum_{\state_s,\action_s} 3\sqrt{\visitxatotaleasy_\episode \rbr{\state_s, \action_s}} \\
%         &\leq \sum_{\horizon=0}^\horizontotal \sum_{s=0}^{\horizon-1} \confconstant[s] \cdot 3 \sqrt{\statesize_s\actionsize\episodetotal} 
%         = \sum_{\horizon=0}^\horizontotal \sum_{s=0}^{\horizon-1} 3\sqrt{2} \cdot \sqrt{\rbr{\statesize_{s+1}\ln 2 + \ln(\statesize\actionsize\episodetotal/\delta)}\cdot \statesize_s\actionsize\episodetotal} \\
%         &\leq \sum_{\horizon=0}^\horizontotal 3\sqrt{2} \cdot
%         \sqrt{\rbr{\statesize\ln 2 + \horizon\ln(\statesize\actionsize\episodetotal/\delta)}\cdot \statesize\actionsize\episodetotal} \\
%         &\leq 3\horizontotal\sqrt{2\statesize\actionsize\episodetotal}\cdot\sqrt{\statesize\ln2+\horizontotal\ln(\statesize\actionsize\episodetotal/\delta)}.
%     \end{aligned}
%     \end{equation}
%     Term B can be bound as follows, and $c$ is an appropriate constant,
%     \begin{equation}
%     \begin{aligned}
%         \text{term B} &= \sum_{\episode=1}^\episodetotal \sum_{\horizon=0}^\horizontotal \sum_{s=0}^{\horizon-1} \frac{2\statesize_{s+1}\confcountxax}{\max\cbr{1, \visitxatotaleasy_\episode \rbr{\state_s^\episode, \action_s^\episode}}} \\
%         &= \sum_{\horizon=0}^\horizontotal \sum_{s=0}^{\horizon-1} 2\statesize_{s+1}\confcountxax \cdot \sum_{\state_s,\action_s} \sum_{i=1}^{\visitxatotaleasy_\episode\rbr{\state_s,\action_s}} \frac{1}{i} \\
%         &\leq c \sum_{\horizon=0}^\horizontotal \sum_{s=0}^{\horizon-1} \statesize_{s+1} \confcountxax \cdot \sum_{\state_s,\action_s} \ln\rbr{\visitxatotaleasy_\episode\rbr{\state_s,\action_s}} \\
%         &\leq c \sum_{\horizon=0}^\horizontotal \sum_{s=0}^{\horizon-1} \statesize_{s+1} \confcountxax \cdot \statesize_s \actionsize \ln\rbr{\frac{\episodetotal}{\statesize_s\actionsize}} \\
%         &\leq c \sum_{\horizon=0}^\horizontotal \statesize^2 \actionsize  \confcountxax \ln\rbr{\frac{\episodetotal}{\actionsize}} \\
%         &\leq c \horizontotal \statesize^2 \actionsize  \confcountxax \ln\rbr{\frac{\episodetotal}{\actionsize}}.
%     \end{aligned}
%     \end{equation}
%     For the second term of Eq.~\eqref{eq: ERROR term for full-info}, notice that $\rbr{\occmeasure^{\transeasy,\policy_\episode}\rbr{\state_s} - \II\cbr{\state_s^\episode = \state_s}}$ form a martingale difference sequence with respect to trajectory history $\traj_{1:\episode-1}$
%     % $\history$ 
%     and by Azuma-Hoeffding inequality and $\normtranserror_\episode \rbr{\state,\action} \leq 2$, we have with probability at least $1-\delta$, 
%     \begin{equation}
%     \begin{aligned}
%         \sum_{\episode=1}^\episodetotal & \sum_{\horizon=0}^\horizontotal \sum_{s=0}^{\horizon-1} \sum_{\state_s \in \statespace_s} \sum_{\action_s \in \actionspace_s} \rbr{\occmeasure^{\transeasy,\policy_\episode}\rbr{\state_s,\action_s} - \II\cbr{\state_s^\episode = \state_s, \action_s^\episode = \action_s}} \normtranserror_\episode \rbr{\state_s,\action_s} \\
%         \leq& 2 \sum_{\episode=1}^\episodetotal \sum_{\horizon=0}^\horizontotal \sum_{s=0}^{\horizon-1} \sum_{\state_s \in \statespace_s} \sum_{\action_s \in \actionspace_s} \rbr{\occmeasure^{\transeasy,\policy_\episode} \rbr{\state_s,\action_s} - \II\cbr{\state_s^\episode = \state_s, \action_s^\episode = \action_s}} \\
%         =& 2 \sum_{\episode=1}^\episodetotal \sum_{\horizon=0}^\horizontotal \sum_{s=0}^{\horizon-1} \sum_{\state_s \in \statespace_s} \rbr{\occmeasure^{\transeasy,\policy_\episode} \rbr{\state_s} - \II\cbr{\state_s^\episode = \state_s}} \\
%         \leq& \sum_{\horizon=0}^\horizontotal \sum_{s=0}^{\horizon-1} \sum_{\state_s \in \statespace_s} 2\sqrt{2\episodetotal\ln\frac{\horizontotal}{\delta}} \\
%         =& 2\horizontotal\statesize \sqrt{2\episodetotal\ln\frac{\horizontotal}{\delta}}.
%     \end{aligned}
%     \end{equation}
%     Putting everything together, we obtain the bound of \sc{Error}.
% \end{proof}

\subsubsection{Bounding \textsc{Reg}}
\label{sssection: bounding reg in full-info setting}
To bound \textsc{Reg}, we apply a technique similar to that utilized in the private Follow-The-Regularized-Leader (FTRL) algorithm for private online learning as described in Theorem 3.4 in \cite{agarwal2017price} and Lemma 30 in \cite{agarwal2023differentially}.
\begin{lemma}
\label{lemma: full-info REG bound}
With private loss satisfying Assumption \ref{assp: Private loss in full-information setting}, and parameter $\FTRLpara = \sqrt{\frac{\ln\rbr{\statesize\actionsize/\horizontotal}}{\episodetotal}}$ and $\delta=\frac{\statesize\actionsize}{\episodetotal}$, we have 
    \begin{align}
        \EE\sbr{\textsc{Reg}} \leq 2\horizontotal\sqrt{\episodetotal\ln\rbr{\frac{\statesize\actionsize}{\horizontotal}}}+ \horizontotal\conflossf.
    \end{align}
\end{lemma}
\begin{proof}
    Note that $\noise$ is the difference between the cumulative loss $\losscum$ of a state-action pair and its private version $\losscumpri$ in the $\episode$-th episode, i.e.,
    $\noise = \losscumpri - \losscum.$
    In accordance with Assumption \ref{assp: Private loss in full-information setting} for private cumulative losses, 
    % \begin{align}
    %     \noise = \losscumpri - \losscum = \sum_{i=1}^{\noisecount} b_\horizon^\episode\rbr{i}
    % \end{align}
    % Where each $b_\horizon^\episode\rbr{i}$ is sampled from $\lap{\lambda}$ independently at each step, and $\lambda = \frac{3\horizontotal\log\episodetotal}{\pripara}$ for JDP setting, and $\lambda = \frac{3\horizontotal}{\pripara}$ for LDP setting.
    we can infer that $\noise$ follows the same distribution and is sampled independently at each step for all $\rbr{\episode,\state,\action}$.
    % Where each $b_\horizon^\episode\rbr{i}$ is sampled from $\lap{\lambda}$ independently at each step, and $\lambda = \frac{3\horizontotal\log\episodetotal}{\pripara}$ for JDP setting, and $\lambda = \frac{3\horizontotal}{\pripara}$ for LDP setting.

    For ease of analysis, we introduce a pseudo-private algorithm that performs a one-shot noise injection, which follows the same distribution as $\noise$, but is sampled only once at the beginning of the algorithm.
    The learning agent then operates based on the loss with this one-shot noise.
    % Compared with the noise $\noise$ sampled at each step in our private algorithm, we consider a virtual algorithm that performs a one-shot noise injection, which is sampled at the beginning of the algorithm and plays with respect to that.
    Although the pseudo-private algorithm may not be differentially private, it experiences the same expected regret as our private algorithm due to the equal expected loss.
    
    Formally, consider a one-shot sampled noise $\widehat{\noiseeasy}\rbr{\state,\action}$ that is independently sampled before the interaction begins.
    Using this one-shot noise, we define a pseudo-private cumulative loss $\widehat{\losscumeasy}_\episode\rbr{\state,\action} = \losscum + \widehat{\noiseeasy}\rbr{\state,\action}$ for all $\rbr{\state,\action}$ pair at all episodes.
    Next, we establish the sequence of pseudo occupancy measures $\occmeasvir$ as follows: 
    \begin{equation}
        \occmeasvir_1 \triangleq \occmeasure_1, \quad \occmeasvir_\episode \triangleq \argmin_{\occmeasure \in \occmeasureset\rbr{\transspace_{\episode}}} \inner{\widehat{\losscumeasy}_\episode}{\occmeasure} + \frac{1}{\FTRLpara}\regularizer\rbr{\occmeasure}. \notag
    \end{equation}
    In expectation, it can be observed that $\expect_{\noiseeasy_\episode}\sbr{\inner{\occmeasure_\episode}{\loss_\episode}} = \expect_{\widehat{\noiseeasy}}\sbr{\inner{\occmeasvir_\episode}{\loss_\episode}}$ since $\occmeasvir_\episode$ has the same distribution as $\occmeasure_\episode$. 
    This leads to the following equality, 
    \begin{align}\notag
    \expect_{\noiseeasy_1,\cdots,\noiseeasy_\episodetotal} \sbr{\sum_{\episode=1}^\episodetotal\inner{\occmeasure_\episode}{\loss_\episode}} = \expect_{\widehat{\noiseeasy}}\sbr{\sum_{\episode=1}^\episodetotal\inner{\occmeasvir_\episode}{\loss_\episode}}.
    \end{align}
    Therefore, the pseudo-private algorithm experiences the same regret in expectation as our private algorithm, and it is sufficient to bound the regret of sequence $\cbr{\occmeasvir_\episode}_{\episode=1}^\episodetotal$.
    The proof follows the standard template of FTRL analysis in \cite{hazan2016introduction},
    and the key intuition is that the addition of one-shot noise does not impact the stability term of the FTRL analysis, but incurs a cost in the bias term instead.

    We define an augmented series of loss function as $\loss_0\rbr{\occmeasure} = \inner{\widehat{\noiseeasy}}{\occmeasure} + \frac{1}{\FTRLpara}\regularizer\rbr{\occmeasure}$. 
    By applying the ``Be the Leader'' Lemma in \cite{hazan2016introduction}, we obtain that for any fixed $u \in \cap_\episode \occmeasureset\rbr{\transspace_\episode}$ in the confidence set,
    \begin{equation}\notag
    \sum_{\episode=0}^{\episodetotal} \inner{u}{\loss_\episode} \geq \sum_{\episode=0}^{\episodetotal} \inner{\occmeasvir_{\episode+1}}{\loss_\episode}.
    \end{equation}
    Consequently, we can conclude that 
    \begin{align}
        \sum_{\episode=1}^\episodetotal \inner{\occmeasvir_\episode-u}{\loss_\episode} \leq & \sum_{\episode=1}^\episodetotal \inner{\occmeasvir_\episode-\occmeasvir_{\episode+1}}{\loss_\episode} + \inner{u - \occmeasvir_1}{\loss_0} \notag\\
        \leq & \sum_{\episode=1}^\episodetotal \inner{\occmeasvir_\episode-\occmeasvir_{\episode+1}}{\loss_\episode} + \frac{1}{\FTRLpara} \distc_\psi + \distc_{\widehat{\noiseeasy}}, \notag
    \end{align}
    where the two bias terms are $\distc_\psi \triangleq \max_{\occmeasure\in\occmeasureset\rbr{P}} \regularizer\rbr{\occmeasure} - \min_{\occmeasure\in\occmeasureset\rbr{P}} \regularizer\rbr{\occmeasure}$, and $\distc_{\widehat{\noiseeasy}} \triangleq \max_{\occmeasure\in\occmeasureset\rbr{P}} \inner{\occmeasure}{\widehat{\noiseeasy}} - \min_{\occmeasure\in\occmeasureset\rbr{P}} \inner{\occmeasure}{\widehat{\noiseeasy}} $. 
    Via Jensen's inequality, we have $\distc_\psi \leq \horizontotal\ln\frac{\statesize\actionsize}{\horizontotal}$.
    % Via Lemma \ref{lemma: radius of D_Z} and union bound,
    % we have $\expect\sbr{\distc_{\widehat{\noiseeasy}}} \leq O\rbr{\lambda\noisecount\horizontotal\ln\rbr{\frac{\statesize\actionsize}{\horizontotal}}}$.
    By Assumption \ref{assp: Private loss in full-information setting},
    we have $\expect\sbr{\distc_{\widehat{\noiseeasy}}} \leq \horizontotal\conflossf$.
    Following \cite{hazan2016introduction}, the stability term is bounded by the $L_\infty$ norm of the loss function,
    \begin{equation}
        % \inner{\occmeasvir_\episode-\occmeasvir_{\episode+1}}{\loss_\episode} 
        % \leq \sum_{\horizon=1}^{\horizontotal} \nbr{\occmeasvir_\horizon^{\episode}-\occmeasvir_\horizon^{\episode+1}}_1 \nbr{\loss_\horizon^{\episode}}_\infty 
        % \leq \sum_{\horizon=1}^{\horizontotal} \FTRLpara \nbr{\loss_\horizon^{\episode}}_\infty^2
        % \leq \FTRLpara\horizontotal
        \inner{\occmeasvir_\episode-\occmeasvir_{\episode+1}}{\loss_\episode} 
        \leq \FTRLpara \nbr{\loss_{\episode}}_\infty^2
        \leq \FTRLpara\horizontotal.
    \end{equation}
    Using Lemma \ref{lemma: concentration of private transition error}, we have $\occmeasure^* \in \cap_\episode \occmeasureset\rbr{\transspace_\episode}$ with probability at least $1-6\delta$.
    Thus, putting everything together and letting $\FTRLpara = \sqrt{\frac{\ln\rbr{\statesize\actionsize/\horizontotal}}{\episodetotal}}$, we obtain the following expected bound, which uses the event above that holds with probability at least $1-6\delta$ (on the complementary event, which has probability at most $6\delta$, we bound the regret trivially by $\episodetotal\horizontotal$, and setting $\delta=\frac{\statesize\actionsize}{\episodetotal}$ eliminates this term),
    % \begin{equation}
    % \begin{aligned}
    %     \expect\sbr{\textsc{Reg}} \leq & \FTRLpara \horizontotal\episodetotal + \frac{1}{\FTRLpara} \horizontotal\frac{\statesize\actionsize}{\horizontotal} + O\rbr{\lambda\noisecount\horizontotal\ln\rbr{\frac{\statesize\actionsize}{\horizontotal}}}  \\
    %     \leq & 2\horizontotal\sqrt{\episodetotal\ln\rbr{\frac{\statesize\actionsize}{\horizontotal}}}+ O\rbr{\lambda\noisecount\horizontotal\ln\rbr{\frac{\statesize\actionsize}{\horizontotal}}}
    % \end{aligned}
    % \end{equation}
    \begin{align}\notag
        \expect\sbr{\textsc{Reg}} 
        % & \FTRLpara \horizontotal\episodetotal + \frac{1}{\FTRLpara} \horizontotal\frac{\statesize\actionsize}{\horizontotal} + \horizontotal\conflossf  \\
        \leq 2\horizontotal\sqrt{\episodetotal\ln\rbr{\frac{\statesize\actionsize}{\horizontotal}}}+ \horizontotal\conflossf.
    \end{align}
\end{proof}
Putting everything together and using Lemma \ref{lemma: expectation on random variables}, we obtain the expected regret bound (Theorem \ref{thm: Regret bound of Private UC-O-REPS}), 
\begin{equation}
\begin{aligned} \notag
    \expect{\sbr{\regret}} =& \expect\sbr{\textsc{Error}} + \expect\sbr{\textsc{Reg}} \\
    \leq & \cO\rbr{\horizontotal\cumlocsupport \sqrt{\episodetotal} + \horizontotal\statesize^2\actionsize\confcountxa + \horizontotal\conflossf}.
\end{aligned}
\end{equation}
