\subsection{Regret Guarantee in the Bandit Setting}
\label{appen: Regret guarantee on bandit setting}
Our analysis starts from a decomposition similar to Appendix C.3 in \cite{lee2020bias} as follows. 
% \begin{equation}\notag
% \begin{aligned}
%     \regret = \sum_{\episode=1}^\episodetotal \inner{\occmeasure^{\transeasy,\policy_\episode} - \occmeasure^*}{\loss_\episode} 
%     = \underbrace{\sum_{\episode=1}^\episodetotal \inner{\occmeasure^{\transeasy,\policy_\episode} - \occmeasure^{\transeasy_\episode,\policy_\episode}}{\loss_\episode}}_{\textsc{Error}}
%     + \underbrace{\sum_{\episode=1}^\episodetotal \inner{\occmeasure^{\transeasy_\episode,\policy_\episode} - \occmeasure^*}{\loss_\episode - \lossesteasy_\episode}}_{\textsc{Bias}}
%     + \underbrace{\sum_{\episode=1}^\episodetotal \inner{\occmeasure^{\transeasy_\episode,\policy_\episode} - \occmeasure^*}{\lossesteasy_\episode}}_{\textsc{Reg}},
% \end{aligned}
% \end{equation}
\begin{equation}
\begin{aligned}
    \regret = \sum_{\episode=1}^\episodetotal \inner{\occmeasure^{\transeasy,\policy_\episode} - \occmeasure^*}{\loss_\episode} 
    &= \underbrace{\sum_{\episode=1}^\episodetotal \inner{\occmeasure^{\transeasy,\policy_\episode} - \occmeasure^{\transeasy_\episode,\policy_\episode}}{\loss_\episode}}_{\textsc{Error}}
    + \underbrace{\sum_{\episode=1}^\episodetotal \inner{\occmeasure^{\transeasy_\episode,\policy_\episode} - \occmeascom}{\loss_\episode - \lossesteasy_\episode}}_{\textsc{Bias}} \\
    & + \underbrace{\sum_{\episode=1}^\episodetotal \inner{\occmeasure^{\transeasy_\episode,\policy_\episode} - \occmeascom}{\lossesteasy_\episode}}_{\textsc{Reg}} + \sum_{\episode=1}^\episodetotal \inner{\occmeascom - \occmeasure^*}{\loss_\episode}.
\end{aligned}
\end{equation}
Here, $\occmeascom$ is defined as
$$\occmeascom = \rbr{1-\frac{1}{\episodetotal}}\occmeasure^* + \frac{1}{\actionsize\episodetotal} \sum_{\action\in\actionspace} \occmeasure^{\transeasy_0,\policy_a},$$
where $\policy_a$ is the policy that chooses action $a$ at every state, and a specific transition $\transeasy_0$ is defined in Lemma C.4 in \cite{lee2020bias}, satisfying $\transeasy_0 \in \cap_\episode \transspace_\episode$ and $\transeasy_0\rbr{\state'\vert\state,\action}\geq\frac{1}{\episodetotal\statesize}$ for all $\horizon<\horizontotal,\rbr{\state,\action,\state'}\in\statespace_\horizon\times\actionspace\times\statespace_{\horizon+1}$.
Besides, according to Lemma C.4 in \cite{lee2020bias}, we also have $\occmeasure^{\transeasy_0,\policy_a}\in\cap_\episode\occmeasureset\rbr{\transspace_\episode}$, and $\occmeascom$ also lies in that convex set because it is a convex combination of elements of the set.
Note that the last term can be trivially bounded by,
$$\sum_{\episode=1}^\episodetotal \inner{\occmeascom - \occmeasure^*}{\loss_\episode} \leq \frac{1}{\actionsize\episodetotal}\sum_{\episode=1}^\episodetotal \sum_{\action\in\actionspace} \inner{\occmeasure^{\transeasy_0,\policy_a}}{\loss_\episode} \leq \horizontotal.$$
Then, we bound each term as follows.

\subsubsection{Bounding \textsc{Error}}
% \textbf{Bounding $\textsc{Error}$ term:}
Since we use the same estimator for the transition function for both the full-information setting and bandit-feedback setting, the term \textsc{Error} can be bounded by using Lemma \ref{lemma: full-info occupancy error by privacy}.
That is, with probability at least $1-7\delta$,
\begin{equation}\notag
    \textsc{Error}  
    \leq \cO\rbr{\horizontotal\cumlocsupport \sqrt{\episodetotal} + \horizontotal\statesize^2\actionsize\ln\iota\log\episodetotal + \horizontotal\statesize^2\actionsize\confcountxa\log\episodetotal}.
\end{equation}

\subsubsection{Bounding \textsc{Bias}}
% \begin{lemma} 
%     Under Assumption \ref{assp: private counts}(3) and the $\alpha$-reachability assumption, we have,
%     $$\abr{\expect\sbr{\textsc{Bias}}} \leq \frac{2\ninterval}{2\ninterval+\statesize\episodetotal} \cdot \expect\sbr{{\regret}} + \rbr{\frac{\ninterval+1}{\alpha\cdot(2\ninterval+\statesize\episodetotal)} - \frac{2\ninterval}{2\ninterval+\statesize\episodetotal}} \cdot \expect\sbr{\textsc{Error}}.$$
% \end{lemma}
% \begin{proof}
For all $\episode,\state,\action$, we define intermediate variables $g_\episode(\state,\action) = \frac{\loss_\episode\rbr{\state,\action}}{2\ninterval+1}$.
Thus, the \textsc{Bias} term can be decomposed as follows,
$$\textsc{Bias} = \sum_{\episode=1}^\episodetotal \inner{\occmeasure^{\transeasy_\episode,\policy_\episode} - \occmeascom}{\loss_\episode - \lossesteasy_\episode} 
= \underbrace{\sum_{\episode=1}^\episodetotal \inner{\occmeasure^{\transeasy_\episode,\policy_\episode} - \occmeascom}{\loss_\episode - g_\episode}}_{\textsc{Bias}_1} 
+ \underbrace{\sum_{\episode=1}^\episodetotal \inner{\occmeasure^{\transeasy_\episode,\policy_\episode}}{g_\episode - \lossesteasy_\episode}}_{\textsc{Bias}_2}
+ \underbrace{\sum_{\episode=1}^\episodetotal \inner{\occmeascom}{\lossesteasy_\episode - g_\episode}}_{\textsc{Bias}_3}.$$
Before we bound the three terms, we restate the construction of our loss estimator. 
We first privatize the observed loss $\loss_\episode\rbr{\state,\action}\II_\episode\rbr{\state,\action}$ to $\losspri$, and then scale it to $[0,1]$ to get the intermediate loss $\ddot{\loss}_\episode$.
By using the upper occupancy measure $\uppocc_\episode$, we finally get the loss estimator $\lossest$.
That is,
\begin{equation}
    \lossest = \frac{\ddot{\loss}_\episode(\state,\action)}{\uppocc_\episode\rbr{\state,\action}} = \frac{1}{\uppocc_\episode\rbr{\state,\action}} \cdot \frac{\loss_\episode\rbr{\state,\action}\II_\episode\rbr{\state,\action} + \noise + \ninterval}{2\ninterval+1}.
\end{equation}
Observe that the estimated loss $\lossesteasy_\episode$ is biased when given previous trajectories $\traj_{1:\episode-1}$, 
since $\transeasy_\episode$ may be different from the true transition function $\transeasy$ and the losses are perturbed by noise and then scaled.
The randomness comes from the random policy, stochastic transition function, and zero-mean injected noise.
\begin{equation}\label{eq: expectation of private bandit loss}
\begin{aligned}
    & \expect\sbr{\lossesteasy_{\episode}\rbr{\state,\action} \vert \traj_{1:\episode-1}} = \frac{1}{\uppocc_\episode\rbr{\state,\action}} \cdot \frac{\ninterval}{2\ninterval+1} + \frac{\occmeasure^{\transeasy,\policy_\episode}\rbr{\state,\action}}{\uppocc_\episode\rbr{\state,\action}}\cdot \frac{\loss_\episode\rbr{\state,\action}}{2\ninterval+1}.
    % &= \expect\sbr{\frac{\occmeasure^{\transeasy,\policy_\episode}\rbr{\state,\action}}{2\ninterval + 1}\cdot\rbr{\frac{\loss_\episode\rbr{\state,\action}}{\uppocc_\episode\rbr{\state,\action}} + \noise + \ninterval} + \frac{1-\occmeasure^{\transeasy,\policy_\episode}\rbr{\state,\action}}{2\ninterval + 1}\cdot\rbr{\noise + \ninterval}} \\
    % & = \frac{\ninterval}{2\ninterval+1} + \frac{\loss_\episode\rbr{\state,\action}}{2\ninterval+1}\cdot\frac{\occmeasure^{\transeasy,\policy_\episode}\rbr{\state,\action}}{\uppocc_\episode\rbr{\state,\action}} \\
    % & = \frac{\occmeasure^{\transeasy,\policy_\episode}\rbr{\state}}{\uppocc_\episode\rbr{\state}} \cdot g_\episode(\state,\action) + \rbr{1-\frac{\occmeasure^{\transeasy,\policy_\episode}\rbr{\state}}{\uppocc_\episode\rbr{\state}}}\cdot\frac{\ninterval}{2\ninterval+1}.
\end{aligned}
\end{equation}

\textbf{$\textsc{Bias}_1$}: By the definition of $g_\episode$ and the regret decomposition, $\textsc{Bias}_1$ is controlled by $\regret - \textsc{Error}$.
\begin{equation}\notag
\begin{aligned}
    \textsc{Bias}_1 &= \sum_{\episode=1}^\episodetotal \inner{\occmeasure^{\transeasy_\episode,\policy_\episode} - \occmeascom}{\loss_\episode - g_\episode} = \frac{2\ninterval}{2\ninterval+1} \sum_{\episode=1}^\episodetotal \inner{\occmeasure^{\transeasy_\episode,\policy_\episode} - \occmeascom}{\loss_\episode}\\
    &\leq \frac{2\ninterval}{2\ninterval+1} \sum_{\episode=1}^\episodetotal \inner{\occmeasure^{\transeasy_\episode,\policy_\episode} - \occmeasure^*}{\loss_\episode} \\
    &= \frac{2\ninterval}{2\ninterval+1} \sbr{\sum_{\episode=1}^\episodetotal \inner{\occmeasure^{\transeasy,\policy_\episode} - \occmeasure^*}{\loss_\episode} - \sum_{\episode=1}^\episodetotal \inner{\occmeasure^{\transeasy,\policy_\episode} - \occmeasure^{\transeasy_\episode,\policy_\episode}}{\loss_\episode}} \\
    &= \frac{2\ninterval}{2\ninterval+1}\sbr{\regret - \textsc{Error}}.
\end{aligned}
\end{equation}

\textbf{$\textsc{Bias}_2$}: In expectation, we have,
\begin{equation}
\begin{aligned}\notag
\expect\sbr{\textsc{Bias}_2} &= \expect\sbr{\sum_{\episode=1}^\episodetotal \inner{\occmeasure^{\transeasy_\episode,\policy_\episode}}{g_\episode - \lossesteasy_\episode}} \\
&= \expect\sbr{\sum_{\episode=1}^\episodetotal \sum_{\state,\action} \occmeasure^{\transeasy_\episode,\policy_\episode}\rbr{\state,\action} \rbr{\frac{\loss_\episode\rbr{\state,\action}}{2\ninterval+1} - \frac{1}{\uppocc_\episode\rbr{\state,\action}} \cdot \frac{\ninterval}{2\ninterval+1} - \frac{\occmeasure^{\transeasy,\policy_\episode}\rbr{\state,\action}}{\uppocc_\episode\rbr{\state,\action}}\cdot \frac{\loss_\episode\rbr{\state,\action}}{2\ninterval+1}}} \\
&= \expect\sbr{\sum_{\episode=1}^\episodetotal \sum_{\state,\action} \occmeasure^{\transeasy_\episode,\policy_\episode}\rbr{\state,\action} \rbr{\frac{\uppocc_\episode\rbr{\state,\action}-\occmeasure^{\transeasy,\policy_\episode}\rbr{\state,\action}}{\uppocc_\episode\rbr{\state,\action}}\cdot \frac{\loss_\episode\rbr{\state,\action}}{2\ninterval+1} - \frac{1}{\uppocc_\episode\rbr{\state,\action}} \cdot \frac{\ninterval}{2\ninterval+1} }} \\
&\leq\expect\sbr{\frac{1}{2\ninterval+1}\cdot\sum_{\episode=1}^\episodetotal \sum_{\state} \abr{\uppocc_\episode\rbr{\state} - \occmeasure^{\transeasy,\policy_\episode}\rbr{\state}} - \sum_{\episode=1}^\episodetotal \sum_{\state,\action} \frac{1}{\uppocc_\episode\rbr{\state,\action}} \cdot \frac{\ninterval}{2\ninterval+1}}, 
\end{aligned}
\end{equation}
where the second equality applies the expectation of the private loss estimator in Eq.~\eqref{eq: expectation of private bandit loss}, and the final inequality is due to the definition of the upper occupancy measure, i.e., $\uppocc_\episode\rbr{\state,\action}\geq \occmeasure^{\transeasy_\episode,\policy_\episode}\rbr{\state,\action}$; Lemma~\ref{lemma: concentration of private transition error}, i.e., $\transeasy\in\transspace_\episode$ with high probability; and $\loss_\episode\rbr{\state,\action}\in[0,1]$ for all $\rbr{\state,\action,\episode}$.

Similar to the proof of Lemma~\ref{lemma: full-info occupancy error by privacy}, we assume that the event in Lemma~\ref{lemma: element-wise error in confidence set} and the event $\cE_{EST}$ in Proposition~\ref{prop: estimate error without variance parameter in our paper} both hold, which happens with high probability.
Then, we have 
\begin{align}
& \sum_{\state\in\statespace} \abr{\uppocc_\episode\rbr{\state} - \occmeasure^{\transeasy,\policy_\episode}(\state)} \notag\\
\leq & \cO\rbr{\sum_{\state\in\statespace} \sum_{\horizon=0}^{\horizon(\state)-1} \sum_{\uvw\in\sasspace_\horizon} \occmeasure^{\transeasy, \policy_\episode}(u,v) \cdot \sqrt{\frac{\transeasy\rbr{w\vert u,v}\rbr{1-\transeasy\rbr{w\vert u,v}}\ln\iota}{\visitxatotalhateasy_\episode\rbr{u,v}}} \cdot \occmeasure^{\transeasy, \policy_\episode}(\state\vert w)} \notag \\ 
&+ \cO\rbr{\statesize^3 \sum_{u\neq\state_\horizontotal}\sum_{v\in\actionspace} \occmeasure^{\transeasy, \policy_\episode}(u,v) \cdot \frac{\confcountxa + \ln\iota}{\visitxatotalhateasy_\episode\rbr{u,v}}} \notag\\
\leq & \cO\rbr{\horizontotal\sum_{\horizon=0}^{\horizontotal-1} \sum_{\uvw\in\sasspace_\horizon} \occmeasure^{\transeasy, \policy_\episode}(u,v) \cdot \sqrt{\frac{\transeasy\rbr{w\vert u,v}\rbr{1-\transeasy\rbr{w\vert u,v}}\ln\iota}{\visitxatotalhateasy_\episode\rbr{u,v}}}} \notag \\
&+ \cO\rbr{\statesize^3 \sum_{u\neq\state_\horizontotal}\sum_{v\in\actionspace} \occmeasure^{\transeasy, \policy_\episode}(u,v) \cdot \frac{\confcountxa + \ln\iota}{\visitxatotalhateasy_\episode\rbr{u,v}}} \notag \\
\leq & \cO\rbr{\horizontotal\sum_{\horizon=0}^{\horizontotal-1} \sum_{u\in\statespace_\horizon}\sum_{v\in\actionspace} \occmeasure^{\transeasy, \policy_\episode}(u,v) \cdot \sqrt{\frac{\locsupport_{u,v}\ln\iota}{\visitxatotalhateasy_\episode\rbr{u,v}}}} + \cO\rbr{\statesize^3 \sum_{u\neq\state_\horizontotal}\sum_{v\in\actionspace} \occmeasure^{\transeasy, \policy_\episode}(u,v) \cdot \frac{\confcountxa + \ln\iota}{\visitxatotalhateasy_\episode\rbr{u,v}}}, \notag
\end{align}
where the first step applies the occupancy difference lemma in Lemma \ref{lemma: occupancy measure error in transition confidence set}, and the second inequality is due to the fact that $\sum_{\state\in\statespace} \occmeasure^{\transeasy, \policy_\episode}(\state\vert w) \leq \horizontotal$.

Taking the summation over all episodes yields the following:
% According to Lemma \ref{lemma: occupancy measure error in transition confidence set}, we have 
\begin{align}
& \sum_{\episode=1}^\episodetotal \sum_{\state\in\statespace} \abr{\uppocc_\episode\rbr{\state} - \occmeasure^{\transeasy,\policy_\episode}(\state)}  \notag \\
\leq & \cO\rbr{\horizontotal\sum_{\episode=1}^\episodetotal \sum_{\horizon=0}^{\horizontotal-1} \sum_{u\in\statespace_\horizon}\sum_{v\in\actionspace} \occmeasure^{\transeasy, \policy_\episode}(u,v) \cdot \sqrt{\frac{\locsupport_{u,v}\ln\iota}{\visitxatotalhateasy_\episode\rbr{u,v}}}}
+ \cO\rbr{\statesize^3 \sum_{\episode=1}^\episodetotal \sum_{u\neq\state_\horizontotal}\sum_{v\in\actionspace} \occmeasure^{\transeasy, \policy_\episode}(u,v) \cdot \frac{\confcountxa + \ln\iota}{\visitxatotalhateasy_\episode\rbr{u,v}}} \notag \\
\leq & \cO\rbr{\horizontotal \sum_{\horizon=0}^{\horizontotal-1} \sqrt{\ln\iota}\rbr{ \sqrt{\sum_{(\state,\action)\in\statespace_\horizon\times\actionspace} \locsupport_{\state,\action}\episodetotal} + \sqrt{\statesize_{\horizon+1}}\statesize_\horizon\actionsize\log\episodetotal + \sqrt{\statesize_{\horizon+1}}\log\iota}} \notag\\
&+ \cO\rbr{\statesize^3 \sum_{\horizon=0}^{\horizontotal-1} \rbr{\confcountxa + \ln\iota} \rbr{\statesize_\horizon\actionsize\log\episodetotal + \log\iota}} \notag \\
% \leq & \cO\rbr{\horizontotal\cumlocsupport\sqrt{\episodetotal\ln\iota} + \statesize^4\actionsize\rbr{\confcountxa + \ln\iota}}, \notag
\leq & \cO\rbr{\horizontotal\cumlocsupport\sqrt{\episodetotal\ln\iota} + \statesize^4\actionsize\rbr{\confcountxa + \ln\iota}\ln\iota}, \notag
\end{align}
where the second inequality follows the corollary of $\cE_{EST}$ in Lemma \ref{lemma: estimate error with variance parameter}. 
Thus, we derive the upper bound of $\textsc{Bias}_2$,
$$\expect\sbr{\textsc{Bias}_2}\leq \widetilde{\cO}\rbr{ \frac{1}{2\ninterval+1}\cdot \rbr{\horizontotal\cumlocsupport\sqrt{\episodetotal} + \statesize^4\actionsize\confcountxa}} - \expect\sbr{\sum_{\episode=1}^\episodetotal \sum_{\state,\action} \frac{1}{\uppocc_\episode\rbr{\state,\action}} \cdot \frac{\ninterval}{2\ninterval+1}} .$$

\textbf{$\textsc{Bias}_3$}: Applying Eq.~\eqref{eq: expectation of private bandit loss}, the definition of the upper occupancy measure $\uppocc_\episode\rbr{\state,\action}$, and Lemma~\ref{lemma: concentration of private transition error} directly, we obtain the following bound on $\textsc{Bias}_3$.
\begin{equation}
\begin{aligned}\notag
\expect\sbr{\textsc{Bias}_3} &= \expect\sbr{\sum_{\episode=1}^\episodetotal \inner{\occmeascom}{\lossesteasy_\episode - g_\episode}}  \notag\\
&= \expect\sbr{\sum_{\episode=1}^\episodetotal \sum_{\state,\action} \occmeascom\rbr{\state,\action}\cdot\rbr{\frac{1}{\uppocc_\episode\rbr{\state,\action}} \cdot \frac{\ninterval}{2\ninterval+1} + \frac{\occmeasure^{\transeasy,\policy_\episode}\rbr{\state,\action}}{\uppocc_\episode\rbr{\state,\action}}\cdot \frac{\loss_\episode\rbr{\state,\action}}{2\ninterval+1} - \frac{\loss_\episode\rbr{\state,\action}}{2\ninterval+1}}} \\
&= \expect\sbr{\sum_{\episode=1}^\episodetotal \sum_{\state,\action} \occmeascom\rbr{\state,\action}\cdot\rbr{ \frac{\occmeasure^{\transeasy,\policy_\episode}\rbr{\state,\action} - \uppocc_\episode\rbr{\state,\action}}{\uppocc_\episode\rbr{\state,\action}} \cdot \frac{\loss_\episode\rbr{\state,\action}}{2\ninterval+1} + \frac{1}{\uppocc_\episode\rbr{\state,\action}} \cdot \frac{\ninterval}{2\ninterval+1}}} \\ 
&\leq \expect\sbr{\sum_{\episode=1}^\episodetotal \sum_{\state,\action} \frac{1}{\uppocc_\episode\rbr{\state,\action}} \cdot \frac{\ninterval}{2\ninterval+1}} .
\end{aligned}
\end{equation}
% \end{proof}

\subsubsection{Bounding \textsc{Reg}}
\begin{lemma}
    Suppose the private loss estimator satisfies Assumption~\ref{assp: Private loss in bandit feedback setting}, and let $\FTRLpara=\sqrt{\frac{\statesize}{\episodetotal}},\delta=\frac{\statesize\actionsize}{\episodetotal}$. Then we have
    $$\expect\sbr{\textsc{Reg}}\leq \cO\rbr{\frac{\sqrt{\statesize\episodetotal}}{2\ninterval+1}\cdot\rbr{\actionsize\statesize\ninterval+\horizontotal}}.$$
\end{lemma}
\begin{proof}
Using the standard analysis of OMD with log-barrier (e.g., Lemma 12 in \cite{agarwal2017corralling}), we have, for any $u\in\cap_\episode\occmeasureset\rbr{\transspace_\episode}$,
\begin{equation}\label{eq: REG with log-barrier function}
\begin{aligned}
    \sum_{\episode=1}^{\episodetotal} \inner{\occmeasure^{\transeasy_\episode,\policy_\episode} - u}{\lossesteasy_\episode} &\leq  \frac{\divg{u}{\occmeasure_1}}{\FTRLpara} + \FTRLpara\sum_{\episode=1}^{\episodetotal}\sum_{\state,\action,\state'} \occmeasure^{\transeasy_\episode,\policy_\episode}\rbr{\state,\action,\state'}^2\lossest^2 \\
    &= \frac{\divg{u}{\occmeasure_1}}{\FTRLpara} + \FTRLpara\sum_{\episode=1}^{\episodetotal}\sum_{\state,\action,\state'} \rbr{\occmeasure^{\transeasy_\episode,\policy_\episode}\rbr{\state,\action}\transeasy_\episode\rbr{\state'\vert\state,\action}}^2\lossest^2 \\
    &\leq \frac{\divg{u}{\occmeasure_1}}{\FTRLpara} + \FTRLpara\sum_{\episode=1}^{\episodetotal}\sum_{\state,\action} \occmeasure^{\transeasy_\episode,\policy_\episode}\rbr{\state,\action}^2 \lossest^2  \cdot\rbr{\sum_{\state'}\transeasy_\episode\rbr{\state'\vert\state,\action}^2} \\
    &\leq \frac{\divg{u}{\occmeasure_1}}{\FTRLpara} + \FTRLpara\sum_{\episode=1}^{\episodetotal}\sum_{\state,\action} \occmeasure^{\transeasy_\episode,\policy_\episode}\rbr{\state,\action}^2 \lossest^2,
\end{aligned}
\end{equation}
where the first step is due to $\FTRLpara\occmeasure^{\transeasy_\episode,\policy_\episode}\rbr{\state,\action,\state'}\lossest\geq0$ under Assumption~\ref{assp: Private loss in bandit feedback setting}; the second step uses the fact that $\occmeasure^{\transeasy_\episode,\policy_\episode}\rbr{\state,\action,\state'} = \occmeasure^{\transeasy_\episode,\policy_\episode}\rbr{\state,\action}\transeasy_\episode\rbr{\state'\vert\state,\action}$; and the final step follows from the fact that $\sum_{\state'}\transeasy_\episode\rbr{\state'\vert\state,\action} = 1$ with $\transeasy_\episode\rbr{\state'\vert\state,\action}\in[0,1]$, which implies $\sum_{\state'}\transeasy_\episode\rbr{\state'\vert\state,\action}^2 \leq 1$.

Eq.~\eqref{eq: REG with log-barrier function} also applies to $\occmeascom$, since $\occmeascom\in\cap_\episode\occmeasureset\rbr{\transspace_\episode}$ by construction.
Therefore, by direct calculation, we have
\begin{equation}\notag
\begin{aligned}
    \divg{\occmeascom}{\occmeasure_1} &= \sum_{\horizon=0}^{\horizontotal-1}\sum_{\sas\in\sasspace_\horizon} \rbr{\log\rbr{\frac{\occmeasure_1\sas}{\occmeascom\sas}} + \frac{\occmeascom\sas}{\occmeasure_1\sas} -1 } \\
    &= \sum_{\horizon=0}^{\horizontotal-1}\sum_{\sas\in\sasspace_\horizon} \log\rbr{\frac{\occmeasure_1\sas}{\occmeascom\sas}} + \sum_{\horizon=0}^{\horizontotal-1}\sum_{\sas\in\sasspace_\horizon} \rbr{\statesize_\horizon\actionsize\statesize_{\horizon+1}\occmeascom\sas -1} \\
    &= \sum_{\horizon=0}^{\horizontotal-1}\sum_{\sas\in\sasspace_\horizon} \log\rbr{\frac{\occmeasure_1\sas}{\occmeascom\sas}} \\
    &\leq 3\statesize^2\actionsize\log\iota,
\end{aligned}
\end{equation}
where the second step uses the definition $\occmeasure_1\sas=\frac{1}{\statesize_\horizon\actionsize\statesize_{\horizon+1}}$ for every layer $\horizon$; the third step uses $\sum_{\sas\in\sasspace_\horizon}\occmeascom\sas = 1$ for every layer $\horizon$, so that the second sum vanishes; and the fourth step uses the lower bound $\occmeascom\sas\geq\frac{1}{\statesize^2\actionsize\episodetotal^3}$ from Lemma C.10 in \cite{lee2020bias}, which gives $\occmeascom\sas\geq\frac{1}{\iota^3}$, together with the upper bound $\occmeasure_1\sas\leq1$.
For the second term, we have
\begin{align}
&\expect\sbr{\sum_{\episode=1}^{\episodetotal}\sum_{\state,\action} \occmeasure^{\transeasy_\episode,\policy_\episode}\rbr{\state,\action}^2 \lossest^2} \notag\\
&= \expect\sbr{\sum_{\episode=1}^{\episodetotal}\sum_{\state,\action}\occmeasure^{\transeasy_\episode,\policy_\episode}\rbr{\state,\action}^2 \cdot \frac{\ddot{\loss}_\episode(\state,\action)}{\uppocc_\episode\rbr{\state,\action}} \cdot \lossest} \notag\\
&\leq \expect\sbr{\sum_{\episode=1}^{\episodetotal}\sum_{\state,\action}\occmeasure^{\transeasy_\episode,\policy_\episode}\rbr{\state,\action} \cdot \lossest} \notag \\
&\leq
\expect\sbr{\sum_{\episode=1}^{\episodetotal}\sum_{\state,\action}\occmeasure^{\transeasy_\episode,\policy_\episode}\rbr{\state,\action} \cdot \rbr{\frac{1}{\uppocc_\episode\rbr{\state,\action}} \cdot \frac{\ninterval}{2\ninterval+1} + \frac{\occmeasure^{\transeasy,\policy_\episode}\rbr{\state,\action}}{\uppocc_\episode\rbr{\state,\action}}\cdot \frac{\loss_\episode\rbr{\state,\action}}{2\ninterval+1}}} \notag \\
&\leq \expect\sbr{\sum_{\episode=1}^{\episodetotal}\sum_{\state,\action} \rbr{\frac{\ninterval}{2\ninterval+1} + \occmeasure^{\transeasy,\policy_\episode}\rbr{\state,\action} \cdot \frac{\loss_\episode\rbr{\state,\action}}{2\ninterval+1}}} \notag \\
&\leq \frac{\statesize\actionsize\episodetotal\ninterval+\horizontotal\episodetotal}{2\ninterval+1} , \notag 
\end{align}
where the first step applies the definition of $\lossest$; the second step follows from the definition of $\uppocc_\episode$ and the fact that $\ddot{\loss}_\episode\rbr{\state,\action}\in[0,1]$ for all $\rbr{\state,\action}$ due to the scaling procedure; the third step follows from the expectation of $\lossest$ in Eq.~\eqref{eq: expectation of private bandit loss}; the fourth step applies the definition of $\uppocc_\episode$ again; and the last step uses the fact that $\loss_\episode\in[0,1]$ together with $\sum_{\state,\action}\occmeasure^{\transeasy,\policy_\episode}\rbr{\state,\action}\leq\horizontotal$.

Thus, setting $\FTRLpara=\sqrt{\frac{\statesize}{\episodetotal}},\delta=\frac{\statesize\actionsize}{\episodetotal}$, we obtain the following bound on \textsc{Reg}:
\begin{align}
\expect\sbr{\sum_{\episode=1}^{\episodetotal} \inner{\occmeasure^{\transeasy_\episode,\policy_\episode} - \occmeascom}{\lossesteasy_\episode}} 
\leq \cO\rbr{\frac{\sqrt{\statesize\episodetotal}}{2\ninterval+1}\cdot\rbr{\actionsize\statesize\ninterval+\horizontotal}}.
% \leq \cO\rbr{\frac{\actionsize\ninterval\sqrt{\statesize^3\episodetotal}+\horizontotal\sqrt{\statesize\episodetotal}}{2\ninterval+1}}
% \leq \actionsize\log\iota\sqrt{\statesize^3\episodetotal} + \frac{\actionsize\ninterval\sqrt{\statesize^3\episodetotal} + \horizontotal\sqrt{\statesize\episodetotal}}{2\ninterval+1}
\end{align}
% We first reformulate the occupancy measure updating step as a two-step optimization problem, similar to the technique in \ref{subsec: Updating Occupancy Measure - full-info} and \cite{jin20c},
% \begin{equation}
% \label{eq: update occupancy, bandit-info, two-steps}
% \begin{aligned}
%     \occmeasmid_{\episode+1} &= \argmin_{\occmeasure} \inner{\lossesteasy_{\episode}}{\occmeasure} + \frac{1}{\FTRLpara}\divg{\occmeasure}{\occmeasure_\episode}, \\
%     \occmeasure_{\episode+1} &= \argmin_{\occmeasure \in \occmeasureset\rbr{\transspace_{\episode+1}}} \divg{\occmeasure}{\occmeasmid_{\episode+1}}.
% \end{aligned}
% \end{equation}
% Building upon Lemma \ref{lemma: concentration of private transition error}, we can confidently establish that $\occmeasure^* \in \cap_\episode \occmeasureset\rbr{\transspace_\episode}$ with a high probability. 
% Subsequently, by following the classical proof of Online Mirror Descent (OMD) as presented in \cite{hazan2016introduction}, we can deduce that,
% \begin{equation}
% \begin{aligned}\notag
%     \expect\sbr{\textsc{Reg}} &\leq \expect\sbr{\sum_{\episode=1}^\episodetotal \inner{\occmeasure_\episode - \occmeasure_{\episode+1}^\prime}{\lossesteasy_\episode}} + \frac{\divg{\occmeasure}{\occmeasure_1}}{\FTRLpara}. 
% \end{aligned}
% \end{equation}
% For the unconstrained optimization problem, the exact form of $\occmeasmid_{\episode+1}$ is given by $\occmeasmid_{\episode+1}\rbr{\state,\action} = \frac{\occmeasure_\episode\rbr{\state,\action}\exp\rbr{-\FTRLpara\lossesteasy_\episode\rbr{\state,\action}}}{\sum_{\state,\action}\occmeasure_\episode\rbr{\state,\action}\exp\rbr{-\FTRLpara \lossesteasy_\episode\rbr{\state,\action}}}$. 
% Leveraging the inequality $e^x \geq 1+x$,  and $\lossesteasy_\episode\rbr{\state,\action}\geq 0$ with high probability established by Assumption \ref{assp: Private loss in bandit feedback setting} and our scale step (Eq.~\eqref{eq: loss rescale}), we can deduce the following:
% $\occmeasmid_{\episode+1}\rbr{\state,\action} \geq \occmeasure_\episode\rbr{\state,\action} - \FTRLpara \occmeasure_\episode\rbr{\state,\action} \lossesteasy_\episode\rbr{\state,\action}$.
% Therefore,
% \begin{equation}
% \begin{aligned}\notag
%     \expect\sbr{\sum_{\episode=1}^\episodetotal \inner{\occmeasure_\episode - \occmeasure_{\episode+1}^\prime}{\lossesteasy_\episode}} 
%     &\leq \FTRLpara \expect\sbr{\sum_{\episode=1}^\episodetotal \sum_{\state,\action} \occmeasure_\episode\rbr{\state,\action} \lossesteasy_\episode^2\rbr{\state,\action} } \\
%     &= \FTRLpara \expect\sbr{\sum_{\episode=1}^\episodetotal \sum_{\state,\action} \occmeasure_\episode\rbr{\state,\action} \lossesteasy_\episode\rbr{\state,\action} \cdot \frac{1}{2\ninterval + 1}\cdot\rbr{\frac{\loss_\episode\rbr{\state,\action} \II_\episode\rbr{\state,\action}}{\uppocc_\episode\rbr{\state,\action}} + \noise + \ninterval}} \\
%     &\leq \FTRLpara \expect\rbr{\frac{1}{2\ninterval + 1}\cdot \sum_{\episode=1}^\episodetotal \sum_{\state,\action} \rbr{\frac{\occmeasure_\episode\rbr{\state,\action}}{\uppocc_\episode\rbr{\state,\action}} + \occmeasure_\episode\rbr{\state,\action}\ninterval}} \\
%     &\leq \frac{\eta\episodetotal}{2\ninterval + 1} \cdot\rbr{\statesize\actionsize+\ninterval\horizontotal},
% \end{aligned}
% \end{equation}
% where the second inequality is due to the fact that $\lossesteasy_\episode\rbr{\state,\action}\leq 1$ with high probability, and $\loss_\episode\rbr{\state,\action}\leq 1$.
% Regarding the second term of \textsc{Reg} bound, we apply standard arguments related to the Bregman divergence,
% % associated with negative entropy,
% $\divg{\occmeasure}{\occmeasure_1}\leq \horizontotal\ln\rbr{\frac{\statesize\actionsize}{\horizontotal}}$.
% By setting $\FTRLpara = \sqrt{\frac{\ln\rbr{\frac{\statesize\actionsize}{\horizontotal}}}{\episodetotal}},\delta=\frac{\statesize\actionsize}{\episodetotal}$, we obtain the bound of $\textsc{Reg}$.
\end{proof}

Putting everything together and using Lemma \ref{lemma: expectation on random variables}, we obtain the expected regret bound (Theorem \ref{thm: Regret bound of Private UOB-LBPS}), 
\begin{equation}
\begin{aligned} \notag
    \expect{\sbr{\regret}} \leq& \expect\sbr{\textsc{Error}} + \rbr{2\ninterval+1} \cdot \rbr{\expect\sbr{\textsc{Bias}_2} + \expect\sbr{\textsc{Bias}_3} + \expect\sbr{\textsc{Reg}} + \horizontotal} \\
    % \leq& \cO\rbr{\horizontotal\sum_{\horizon=0}^{\horizontotal-1} \sqrt{\ln\iota\sum_{(\state,\action)\in\statespace_\horizon\times\actionspace} \locsupport_{\state,\action}\episodetotal} + \horizontotal\statesize^2\actionsize\ln\iota\log\episodetotal + \horizontotal\statesize^2\actionsize\confcountxa\log\episodetotal} \\
    % & + \cO\rbr{\horizontotal \sum_{\horizon=0}^{\horizontotal-1} \sqrt{\ln\iota  \sum_{(\state,\action)\in\statespace_\horizon\times\actionspace} \locsupport_{\state,\action}\episodetotal} + \statesize^4\actionsize\rbr{\confcountxa + \ln\iota}} \\
    % & + \cO\rbr{\horizontotal\ninterval\sqrt{\episodetotal} + \statesize\actionsize\sqrt{\episodetotal}} \\
    % \leq & \cO\rbr{\horizontotal \sum_{\horizon=0}^{\horizontotal-1} \sqrt{  \sum_{(\state,\action)\in\statespace_\horizon\times\actionspace} \locsupport_{\state,\action}\episodetotal} + \horizontotal\statesize^4\actionsize\confcountxa + \horizontotal\ninterval\sqrt{\episodetotal}}
    \leq & \widetilde{\cO}\rbr{\horizontotal\cumlocsupport \sqrt{\episodetotal} + \horizontotal\statesize^4\actionsize\confcountxa + \actionsize\ninterval\sqrt{\statesize^3\episodetotal}}.
\end{aligned}
\end{equation}
