\subsection{Private UC-O-REPS Algorithm}
% \XD{When you have two (or more) symbols, do not use $\alpha,\beta,\gamma$, instead, use $\alpha$, $\beta$, $\gamma$.}
\begin{algorithm}[!htbp]
\caption{Private UC-O-REPS} 
\textbf{Parameters:} state space $\statespace$, action space $\actionspace$, episode number $\episodetotal$, episode length $\horizontotal$, learning rate $\FTRLpara >0$, confidence parameter $\delta\in(0,1]$, privacy budget $\pripara > 0$ and a Privatizer \\
\textbf{Initialization:} \\
Initialize confidence set $\transspace_1$ as the set of all transitions. \\
% For all $\horizon = 0,\cdots,\horizontotal-1$ and all $\rbr{\state,\action,\state^\prime}  \in  \statespace_\horizon  \times  \actionspace  $ $\times\statespace_{\horizon+1}$, initialize private counts, 
% $\visitxaxtotalprieasy_h^1 \rbr{\state,\action,\state^\prime} =$ $ \visitxatotalprieasy_h^1 \rbr{\state,\action} = $ $ \losscumprieasy_h^1 \rbr{\state,\action} = 0, $ 
% and occupancy measure, $\occmeasure_1\rbr{\state,\action} = \frac{1}{\statesize_\horizon \actionsize}$. \\
For all $\rbr{\state,\action,\state^\prime}  \in  \statespace  \times  \actionspace \times \statespace_{\horizon\rbr{\state}+1}$, initialize private counts, 
$\visitxaxtotalprieasy_1 \rbr{\state,\action,\state^\prime} =$ $ \visitxatotalprieasy_1 \rbr{\state,\action} = $ $ \losscumprieasy_1 \rbr{\state,\action} = 0, $ 
and occupancy measure, $\occmeasure_1\rbr{\state,\action,\state'} = \frac{1}{\statesize_{\horizon(\state)} \actionsize \statesize_{\horizon\rbr{\state}+1}}$. \\
Initialize policy $\policy_1 = \policy^{\occmeasure_1}$.\\
Set precision levels $\confcountxa$ and $\conflossf$ for Privatizer. 
\begin{algorithmic}[1]
\FOR{$\episode=1$ to $\episodetotal$}
\STATE Execute policy $\policy_\episode$ for $\horizontotal$ steps and obtain interaction history $\cbr{\rbr{\state_{\horizon}^\episode,\action_\horizon^\episode}}_{\horizon=0}^{\horizontotal-1}$, and observe loss function $\loss_\episode$. \
\STATE Receive private counts $\losscumprieasy_{\episode+1}\rbr{\state,\action}$, $\visitxatotalprieasy_{\episode+1}\rbr{\state,\action}$, $\visitxaxtotalprieasy_{\episode+1}\rbr{\state,\action,\state^\prime}$ from Privatizer.\
\STATE Compute private transition estimate, for all $ \rbr{\state,\action,\state'}\in\statespace\times\actionspace\times\statespace_{\horizon(\state)+1}$,
    $$\transprieasy_{\episode+1}\rbr{\state'\vert\state,\action} = \frac{\visitxaxtotalprieasy_{\episode+1}\sas}{\visitxatotalprieasy_{\episode+1}(\state,\action)}.$$
\STATE Update the confidence set $\transspace_{\episode+1}$ based on Equation~\eqref{eq: confidence set of transition},
    $$ \transspace_{\episode+1} =  \cbr{ \transeasy :\abr{\transeasy \rbr{\state' \vert \state, \action}  -  \transprieasy_{\episode+1} \rbr{\state' \vert \state, \action} }  \leq \confpwtrans_{\episode+1}\rbr{\state'\vert\state,\action}, \quad
    \forall \rbr{\state,\action,\state'}\in\statespace_\horizon \times\actionspace \times\statespace_{\horizon+1},\horizon\in\sbr{\horizontotal} }.$$
\STATE Update the occupancy measure 
$$\occmeasure_{\episode+1} = \argmin_{\occmeasure \in \occmeasureset\rbr{\transspace_{\episode+1}}} \inner{\losscumprieasy_{\episode+1}}{\occmeasure} + \frac{1}{\FTRLpara}\regularizer\rbr{\occmeasure},$$\
where $\regularizer\rbr{\occmeasure} = \sum_{\rbr{\state,\action,\state^\prime}\in \statespace\times\actionspace\times\statespace} \occmeasure\rbr{\state,\action,\state^\prime} \ln \occmeasure\rbr{\state,\action,\state^\prime}.$
\STATE Update the policy 
$$\policy_{\episode+1} = \policy^{\occmeasure_{\episode+1}}.$$
\ENDFOR
\end{algorithmic}
\label{algo: private UC-O-REPS}
\end{algorithm}

\newpage
\input{./appendix/update occupancy measure}

\newpage
\clearpage
\subsection{Private UOB-LBPS Algorithm}
\begin{algorithm}[!htbp]
\caption{Private UOB-LBPS} 
\textbf{Parameters:} state space $\statespace$, action space $\actionspace$, episode number $\episodetotal$, episode length $\horizontotal$, learning rate $\FTRLpara >0$, confidence parameter $\delta\in(0,1]$, privacy parameter $\pripara > 0$ and a Privatizer \\
\textbf{Initialization:} \\
Initialize confidence set $\transspace_1$ as the set of all transitions. \\
For all $\rbr{\state,\action,\state^\prime}  \in  \statespace  \times  \actionspace \times \statespace_{\horizon\rbr{\state}+1}$, initialize private counts, 
$\visitxaxtotalprieasy_1 \rbr{\state,\action,\state^\prime} =$ $ \visitxatotalprieasy_1 \rbr{\state,\action} = 0, $ 
and occupancy measure, $\occmeasure_1\rbr{\state,\action,\state'} = \frac{1}{\statesize_{\horizon(\state)} \actionsize \statesize_{\horizon\rbr{\state}+1}}$. \\
Initialize policy $\policy_1 = \policy^{\occmeasure_1}$.\\
Set precision levels $\confcountxa$ and $\ninterval$ for Privatizer. 
\begin{algorithmic}[1]
\FOR{$\episode=1$ to $\episodetotal$}
\STATE Execute policy $\policy_\episode$ for $\horizontotal$ steps and obtain interaction history $\cbr{\rbr{\state_{\horizon}^\episode,\action_\horizon^\episode}}_{\horizon=0}^{\horizontotal-1}$, and observe losses $\cbr{\loss_\episode\rbr{\state_{\horizon}^\episode,\action_\horizon^\episode}}_{\horizon=0}^{\horizontotal-1}$. \
\STATE Receive private counts $\visitxatotalprieasy_{\episode+1}\rbr{\state,\action}$,  $\visitxaxtotalprieasy_{\episode+1}\rbr{\state,\action,\state^\prime}$ and private loss $\lossprieasy_\episode\rbr{\state,\action}$ from Privatizer. \
\STATE Scale private loss to $[0,1]$, for all $\rbr{\state,\action}\in\statespace\times\actionspace$,  
$$\ddot{\loss}_\episode\rbr{\state,\action} = \frac{\lossprieasy_\episode\rbr{\state,\action} + \ninterval}{2\ninterval + 1}.$$
\STATE Compute optimistic loss estimator $\lossest$ for all $\rbr{\state,\action}\in\statespace\times\actionspace$ using upper occupancy measure bound $\uppocc_\episode$, 
$$ \lossest = \frac{\ddot{\loss}_\episode\rbr{\state,\action}}{\uppocc_\episode\rbr{\state,\action}},$$
where $\uppocc_\episode(\state,\action) = \max_{\transeasy\in\transspace_{\episode}} \occmeasure^{\transeasy,\policy_\episode}(\state,\action)$.

\STATE Compute private transition estimate, for all $ \rbr{\state,\action,\state'}\in\statespace\times\actionspace\times\statespace_{\horizon(\state)+1}$,
    $$\transprieasy_{\episode+1}\rbr{\state'\vert\state,\action} = \frac{\visitxaxtotalprieasy_{\episode+1}\sas}{\visitxatotalprieasy_{\episode+1}(\state,\action)}.$$
\STATE Update confidence set $\transspace_{\episode+1}$ using  Equation~\eqref{eq: confidence set of transition},
    $$ \transspace_{\episode+1} =  \cbr{ \transeasy :\abr{\transeasy \rbr{\state' \vert \state, \action}  -  \transprieasy_{\episode+1} \rbr{\state' \vert \state, \action} }  \leq \confpwtrans_{\episode+1}\rbr{\state'\vert\state,\action}, \quad
    \forall \rbr{\state,\action,\state'}\in\statespace_\horizon \times\actionspace \times\statespace_{\horizon+1},\horizon\in\sbr{\horizontotal} }.$$
\STATE Update the occupancy measure
    $$\occmeasure_{\episode+1} = \argmin_{\occmeasure \in \occmeasureset\rbr{\transspace_{\episode+1}}} \inner{\lossesteasy_{\episode}}{\occmeasure} + \frac{1}{\FTRLpara} \divg{\occmeasure}{\occmeasure_{\episode}},$$
    where $\regularizer\rbr{\occmeasure}$ is a log-barrier regularizer defined in Equation~\eqref{eq: regularizer in bandit setting}, and $\divg{\occmeasure}{\occmeasure_{\episode}}$ is the Bregman divergence of $\regularizer$ between $\occmeasure$ and $\occmeasure_\episode$.
\STATE Update the policy 
    $$\policy_{\episode+1} = \policy^{\occmeasure_{\episode+1}}.$$
\ENDFOR
\end{algorithmic}
\label{algo: Private Bounded Bandit UC-O-REPS}
\end{algorithm}

% \begin{algorithm}[!htbp]
% \caption{Private Bounded Bandit UC-O-REPS} 
% \textbf{Parameters:} state space $\statespace$, action space $\actionspace$, episode number $\episodetotal$, horizon number $\horizontotal$, learning rate $\FTRLpara >0$, confidence parameter $\delta\in(0,1]$, state reachable probability $\alpha$, privacy parameter $\pripara > 0$ and a Privatizer \\
% \textbf{Initialization:} \\
% Initialize confidence set $\transspace_1$ as the set of all transitions. \\
% Initialize policy $\policy_1\rbr{\action \vert \state} = \frac{1}{\actionsize}$ for all $\state\in\statespace$.\\
% For all $\horizon = 0,\cdots,\horizontotal-1$ and all $\rbr{\state,\action,\state^\prime} \in \statespace_\horizon \times \actionspace\times\statespace_{\horizon+1}$, initialize private counts, 
% $\visitxatotalprieasy_h^1 \rbr{\state,\action} = \visitxaxtotalprieasy_h^1 \rbr{\state,\action,\state^\prime} = 0 $, and set occupancy measure $\occmeasure_1\rbr{\state,\action} = \frac{1}{\statesize_\horizon \actionsize}$ \\
% Set precision levels $\confcountxa,\ninterval$ for Privatizer. \\
% \begin{algorithmic}[1]
% \FOR{$\episode=1$ to $\episodetotal$}
% \STATE Execute policy $\policy_\episode$ for $\horizontotal$ steps and obtain trajectory $\cbr{\state_{\horizon}^\episode,\action_\horizon^\episode}_{\horizon=0}^{\horizontotal-1}$, and observe losses $\cbr{\loss_\episode\rbr{\state_{\horizon}^\episode,\action_\horizon^\episode}}_{\horizon=0}^{\horizontotal-1}$. \
% \STATE Receive private counts: $\visitxatotalprieasy_{\episode+1}\rbr{\state,\action}$, $\visitxaxtotalprieasy_{\episode+1}\rbr{\state,\action,\state^\prime}$ and $\cbr{\lossprieasy_\episode\rbr{\state_{\horizon}^\episode,\action_\horizon^\episode}}_{\horizon=0}^{\horizontotal-1}$ from Privatizer.\
% \IF{$\forall \horizon, \abr{\lossprieasy_\episode\rbr{\state_{\horizon}^\episode,\action_\horizon^\episode} -\loss_\episode\rbr{\state_{\horizon}^\episode,\action_\horizon^\episode}} \leq \ninterval$} 
% \STATE Scale losses to $[0,1]$,
%     $$\forall \horizon, \lossprieasy_\episode\rbr{\state_{\horizon}^\episode,\action_\horizon^\episode} = \frac{\lossprieasy_\episode\rbr{\state_{\horizon}^\episode,\action_\horizon^\episode} + \ninterval}{2\ninterval + 1}$$
% \STATE Estimate loss,
%     $$\frac{\lossprieasy_\episode\rbr{\state,\action}}{\occmeasure_\episode\rbr{\state,\action}} \II\cbr{\state_{\horizon(\state)}^{\episode} = \state, \action_{\horizon(\state)}^{\episode} = \action}$$
% \STATE Update confidence set $\transspace_{\episode+1}$ using  Equation \ref{eq: confidence set of transition},
%     $$ \transspace_\episode  =  \cbr{ \transeasy  :  \nbr{\transeasy_\horizon \rbr{\cdot \vert \state, \action}  -  \transprieasy_\horizon^\episode \rbr{\cdot \vert \state, \action} }_1  \leq  \confnormtrans{\rbr{\state,\action}}, \forall \rbr{\state,\action}\in\statespace_\horizon\times\actionspace, \horizon=0,\dots,\horizontotal-1 }$$
% \STATE Update occupancy measure, 
%     $$\occmeasure_{\episode+1} = \argmin_{\occmeasure \in \occmeasureset_{\alpha}\rbr{\transspace_{\episode+1}}} \inner{\lossesteasy_{\episode}}{\occmeasure} + \frac{1}{\FTRLpara}\divg{\occmeasure}{\occmeasure_{\episode}}$$
% \STATE Update policy 
%     $$\policy_{\episode+1} = \policy^{\occmeasure_{\episode+1}}$$
% \ELSE
% \STATE Continue 
%     $$\policy_{\episode+1} = \policy_{\episode}$$
% \ENDIF
% \ENDFOR
% \end{algorithmic}
% \label{algo: Private Bounded Bandit UC-O-REPS}
% \end{algorithm}