\subsection{Updating Occupancy Measure Efficiently for Private-UC-O-REPS}
\label{subsec: Updating Occupancy Measure - full-info}
This subsection explains how to implement the update defined in \eqref{eq: update occupancy measure full info} for Algorithm \ref{algo: private UC-O-REPS}, Private-UC-O-REPS, efficiently.

Our approach is similar to those in~\cite{neu2012adversarial,rosenberg2019onlineamdp,jin20c}; we provide the details of the modification here for completeness.
Based on the property of FTRL, this optimization can be reformulated as first solving the unconstrained optimization problem,
\begin{equation}
\label{eq: update occupancy 1 unconstrained step}
    \occmeasmid_{\episode+1} = \argmin_{\occmeasure} \inner{\losscumprieasy_{\episode+1}}{\occmeasure} + \frac{1}{\FTRLpara}\regularizer\rbr{\occmeasure},
\end{equation}
and then projecting the result 
\begin{equation}
\label{eq: update occupancy 2 projection step}
    \occmeasure_{\episode+1} = \argmin_{\occmeasure \in \occmeasureset\rbr{\transspace_{\episode+1}}} \divg{\occmeasure}{\occmeasmid_{\episode+1}},
\end{equation}
where $\regularizer\rbr{\occmeasure}$ is the negative entropy regularizer defined in Eq.~\eqref{eq: regularizer}, and $\divg{\occmeasure}{\occmeasmid} = \sum_{\rbr{\state,\action,\state^\prime}\in \statespace\times\actionspace\times\statespace_{\horizon\rbr{\state}+1}}\rbr{\occmeasure\rbr{\state,\action,\state^\prime} \ln \frac{\occmeasure\rbr{\state,\action,\state^\prime}}{\occmeasmid\rbr{\state,\action,\state^\prime}} - \rbr{\occmeasure\rbr{\state,\action,\state^\prime} - \occmeasmid\rbr{\state,\action,\state^\prime}}}$ is the corresponding Bregman divergence.
For the unconstrained optimization, we have the optimal solution directly through the Lagrange method,
$\occmeasmid_{\episode+1}\rbr{\state,\action,\state^\prime} = \frac{\exp\rbr{-\FTRLpara \losscumprieasy_{\episode+1}\rbr{\state,\action}}}{\sum_{\rbr{\state,\action,\state^\prime}\in \sashspace} \rbr{\exp\rbr{-\FTRLpara \losscumprieasy_{\episode+1}\rbr{\state,\action}}}}$ 
for all $\rbr{\state,\action,\state^\prime}\in \statespace\times\actionspace\times\statespace_{\horizon\rbr{\state}+1}$.

For the second step, we can rewrite this constrained optimization problem as the following convex program with linear constraints, which can be solved in polynomial time.
% \begin{equation}
% \begin{aligned} \label{prob: projection via divergence}
% &\min_{\occmeasure} \quad  \divg{\occmeasure}{\occmeasmid_{\episode+1}}\\
% &\begin{array}{r@{\quad}r@{}l@{\quad}l}
% s.t. &\sum_{\sas\in\sashspace} \occmeasure\rbr{\state,\action,\state^\prime}&= 1  &\forall \horizon\\
%      &\sum_{\action\in\actionspace, \state^\prime\in\statespace_{\horizon+1}} \occmeasure\sas &= \sum_{\state^\prime\in\statespace_{\horizon-1}, \action\in\actionspace} \occmeasure\rbr{\state^\prime,\action,\state}  & \forall \horizon, \forall \state \in \statespace_\horizon\\
%      &\rbr{\transprieasy_{\episode+1}\rbr{\state^\prime\vert\state,\action} + \confpwtrans\sas} \cdot \sum_{\state^\prime\in\statespace_{\horizon+1}} \occmeasure\sas  &\geq \occmeasure\sas & \forall \horizon, \forall \sas\in\statespace_\horizon \times \actionspace \times \statespace_{\horizon+1} \\
%      &\rbr{\transprieasy_{\episode+1}\rbr{\state^\prime\vert\state,\action} - \confpwtrans\sas} \cdot \sum_{\state^\prime\in\statespace_{\horizon+1}} \occmeasure\sas  &\leq \occmeasure\sas & \forall \horizon, \forall \sas\in\statespace_\horizon \times \actionspace \times \statespace_{\horizon+1} \\
%      &\sum_{\state^\prime\in\statespace_{\horizon+1}} \confpwtrans\sas  &\leq \confnormtrans\rbr{\state,\action} & \forall \horizon, \forall \rbr{\state,\action}\in\statespace_\horizon \times \actionspace \\
%      &\occmeasure\sas  &\geq 0 & \forall \horizon, \forall \sas\in\statespace_\horizon \times \actionspace \times \statespace_{\horizon+1} \\
%      &\confpwtrans\sas  &\geq 0 & \forall \horizon, \forall \sas\in\statespace_\horizon \times \actionspace \times \statespace_{\horizon+1}
% \end{array}
% \end{aligned}
% \end{equation}
\begin{equation} \label{prob: projection via divergence}
\begin{aligned}
&\min_{\occmeasure} \quad  \divg{\occmeasure}{\occmeasmid_{\episode+1}}\\
&\begin{array}{r@{\quad}r@{}l@{\quad}l}
s.t. &&\sum_{\sas\in\sashspace} \occmeasure\rbr{\state,\action,\state^\prime}= 1  &\forall \horizon\\
     &&\sum_{\action\in\actionspace, \state^\prime\in\statespace_{\horizon+1}} \occmeasure\sas = \sum_{\state^\prime\in\statespace_{\horizon-1}, \action\in\actionspace} \occmeasure\rbr{\state^\prime,\action,\state}  & \forall \horizon, \forall \state \in \statespace_\horizon\\
     &&\rbr{\transprieasy_{\episode+1}\rbr{\state^\prime\vert\state,\action} + \confpwtrans_{\episode+1}\sas} \cdot \sum_{y\in\statespace_{\horizon+1}} \occmeasure\rbr{\state,\action,y}  \geq \occmeasure\sas & \forall \horizon, \forall \sas\in\statespace_\horizon \times \actionspace \times \statespace_{\horizon+1} \\
     &&\rbr{\transprieasy_{\episode+1}\rbr{\state^\prime\vert\state,\action} - \confpwtrans_{\episode+1}\sas} \cdot \sum_{y\in\statespace_{\horizon+1}} \occmeasure\rbr{\state,\action,y}  \leq \occmeasure\sas & \forall \horizon, \forall \sas\in\statespace_\horizon \times \actionspace \times \statespace_{\horizon+1} \\
     % &&\sum_{\state^\prime\in\statespace_{\horizon+1}} \confpwtrans\sas  \leq \confnormtranseasy_{\episode+1}\rbr{\state,\action} & \forall \horizon, \forall \rbr{\state,\action}\in\statespace_\horizon \times \actionspace \\
     &&\occmeasure\sas  \geq 0 & \forall \horizon, \forall \sas\in\statespace_\horizon \times \actionspace \times \statespace_{\horizon+1} \\
     % &&\confpwtrans\sas  \geq 0 & \forall \horizon, \forall \sas\in\statespace_\horizon \times \actionspace \times \statespace_{\horizon+1}
\end{array}
\end{aligned}
\end{equation}

This problem can be further reformulated into a dual problem, which is a convex optimization problem with only non-negativity constraints and thus can be solved more efficiently.
\begin{lemma}
\label{lemma: dual problem of updating occupancy measure}
    The dual problem of  \eqref{eq: update occupancy 2 projection step} is 
    \begin{equation} \label{prob: dual problem}
    \dualvtrp_\episode, \dualvtrn_\episode, \dualvmid_\episode = \argmin_{\dualvtrp,\dualvtrn,\dualvmid \geq 0} \sum_{\horizon=0}^{\horizontotal-1} \ln \rbr{\sum_{\sas\in \statespace_\horizon \times  \actionspace \times \statespace_{\horizon+1}} \vsdual_{\episode,\horizon}^{\dualvtrp,\dualvtrn,\dualvmid}\sas},
    \end{equation}
    where $\dualvtrp:=\cbr{\dualvtrp\sas}_{\sas}, \dualvtrn:=\cbr{\dualvtrn\sas}_{\sas}$ and $\dualvmid:=\cbr{\dualvmid\rbr{\state}}_\state$ are dual variables and 
    \begin{align}
        & \vsdual_{\episode,\horizon}^{\dualvtrp, \dualvtrn,\dualvmid}\sas 
        = \frac{1}{\sum_{\sas\in \statespace_\horizon \times  \actionspace \times \statespace_{\horizon+1}} \exp \rbr{-\FTRLpara\losscumprieasy_{\episode+1}\rbr{\state,\action}}} \exp\rbr{-\FTRLpara \losscumprieasy_{\episode+1}\rbr{\state,\action} + \vbellman_{\episode,\horizon}^{\dualvtrp,\dualvtrn, \dualvmid} \sas} \label{eq: variable for solving dual variables}, \\
        &\begin{aligned} \notag
        \vbellman_{\episode,\horizon}^{\dualvtrp,\dualvtrn, \dualvmid} &\sas 
        = - \dualvmid\rbr{\state} + \dualvmid\rbr{\state^\prime} - \dualvtrp\sas + \dualvtrn\sas \\
        & + \sum_{y\in\statespace_{\horizon(\state)+1}} \sbr{\transprieasy_{\episode+1}\rbr{y\vert\state,\action}\rbr{\dualvtrp\rbr{\state,\action,y}-\dualvtrn\rbr{\state,\action,y}} + \confpwtrans_{\episode+1}\rbr{\state,\action,y} \rbr{\dualvtrp\rbr{\state,\action,y}+\dualvtrn\rbr{\state,\action,y}} }.
        \end{aligned}
    \end{align}
    Furthermore, the optimal solution to this projection is given by, for any $\sas\in \statespace_\horizon \times  \actionspace \times \statespace_{\horizon+1}$,
    \begin{align}
        \occmeasure_{\episode+1}\sas = \frac{\vsdual_{\episode,\horizon}^{\dualvtrp,\dualvtrn,\dualvmid}\sas}{\sum_{\sas\in \statespace_\horizon \times  \actionspace \times \statespace_{\horizon+1}} \vsdual_{\episode,\horizon}^{\dualvtrp,\dualvtrn,\dualvmid}\sas}.
    \end{align}
\end{lemma}
\begin{proof}
    In the following proof, we omit the non-negativity constraints of Problem \eqref{prob: projection via divergence}.
    This is without loss of generality since the optimal solution for the modified version without non-negativity constraints turns out to always satisfy the non-negativity constraints.
    We write the Lagrangian as,
    \begin{equation}\notag
    \begin{aligned}
        \lag(\occmeasure, \dualvsum, &\dualvmid, \dualvtrp, \dualvtrn)
        = \divg{\occmeasure}{\occmeasmid_{\episode+1}}
        + \sum_{\horizon=0}^{\horizontotal-1}\dualvsum_\horizon \rbr{\sum_{\sas\in\sashspace} \occmeasure\sas - 1} \\
        &+ \sum_{\horizon=1}^{\horizontotal-1} \sum_{\state\in\statespace_\horizon} \dualvmid\rbr{\state} \rbr{\sum_{\action\in\actionspace,\state^\prime\in\statespace_{\horizon+1}}\occmeasure\rbr{\state,\action,\state^\prime} - \sum_{\state^\prime\in\statespace_{\horizon-1},\action\in\actionspace}\occmeasure\rbr{\state^\prime,\action,\state}} \\
        &+ \sum_{\horizon=0}^{\horizontotal-1}\sum_{\sas\in\sashspace} \dualvtrp\sas \rbr{\occmeasure\sas - \rbr{\transprieasy_{\episode+1}\rbr{\state^\prime\vert\state,\action}+ \confpwtrans_{\episode+1}\sas} \cdot \sum_{y\in\statespace_{\horizon+1}} \occmeasure\rbr{\state,\action,y}} \\
        &+ \sum_{\horizon=0}^{\horizontotal-1}\sum_{\sas\in\sashspace} \dualvtrn\sas \rbr{\rbr{\transprieasy_{\episode+1}\rbr{\state^\prime\vert\state,\action}- \confpwtrans_{\episode+1}\sas} \cdot \sum_{y\in\statespace_{\horizon+1}} \occmeasure\rbr{\state,\action,y} - \occmeasure\sas}     \end{aligned}
    \end{equation}
where $\dualvsum:=\cbr{\dualvsum_\horizon}_\horizon$, $\dualvmid:=\cbr{\dualvmid\rbr{\state}}_\state$, $\dualvtrp:=\cbr{\dualvtrp\sas}_{\sas}$ and $\dualvtrn:=\cbr{\dualvtrn\sas}_{\sas}$ are Lagrange multipliers.
We define $\dualvmid(\state_0) = \dualvmid(\state_{\horizontotal}) = 0$ to avoid addressing the edge cases explicitly.
% Now, taking the derivative we have,
% \begin{align}\notag
%     \frac{\partial \lag}{\partial \confpwtrans\sas } &= \rbr{ - \dualvtrp\sas -\dualvtrn\sas} \sum_{\state^\prime}\occmeasure\sas + \dualvtr\rbr{\state,\action}.
% \end{align}
% Setting the gradient to zero, we obtain
% \begin{align}\notag
%    \dualvtr\rbr{\state,\action} =  \rbr{\dualvtrp\sas + \dualvtrn\sas} \sum_{\state^\prime}\occmeasure\sas.
% \end{align}
% Putting $\dualvtr\rbr{\state,\action}$ back and we can obtain an equivalent Lagrangian, then get rid of the $\confpwtrans\sas$ variables.
% \begin{equation}\notag
% \begin{aligned}
%     \lag(\occmeasure, \confpwtrans, \dualvsum, &\dualvmid, \dualvtrp, \dualvtrn\dualvtr)
%     = \divg{\occmeasure}{\occmeasmid_{\episode+1}}
%     + \sum_{\horizon=0}^{\horizontotal-1}\dualvsum_\horizon \rbr{\sum_{\sas\in\sashspace} \occmeasure\sas - 1} \\
%     &+ \sum_{\horizon=1}^{\horizontotal-1} \sum_{\state\in\statespace_\horizon} \dualvmid\rbr{\state} \rbr{\sum_{\action\in\actionspace,\state^\prime\in\statespace_{\horizon+1}}\occmeasure\rbr{\state,\action,\state^\prime} - \sum_{\state^\prime\in\statespace_{\horizon-1},\action\in\actionspace}\occmeasure\rbr{\state^\prime,\action,\state}} \\
%     &+ \sum_{\horizon=0}^{\horizontotal-1}\sum_{\sas\in\sashspace} \dualvtrp\sas \rbr{\occmeasure\sas\rbr{1-\confnormtranseasy_{\episode+1}\rbr{\state,\action}} - \transprieasy_{\episode+1}\rbr{\state^\prime\vert\state,\action}\cdot \sum_{y\in\statespace_{\horizon+1}} \occmeasure\rbr{\state,\action,y}} \\
%     &+ \sum_{\horizon=0}^{\horizontotal-1}\sum_{\sas\in\sashspace} \dualvtrn\sas \rbr{\transprieasy_{\episode+1}\rbr{\state^\prime\vert\state,\action}\cdot \sum_{y\in\statespace_{\horizon+1}} \occmeasure\rbr{\state,\action,y} - \occmeasure\sas \rbr{1+\confnormtranseasy_{\episode+1}\rbr{\state,\action}}}. 
% \end{aligned}
% \end{equation}
Now, we consider the derivative with respect to $\occmeasure\sas$. 
\begin{equation}\notag
\begin{aligned}
    \frac{\partial \lag}{\partial \occmeasure\sas} =& \ln\occmeasure\sas - \ln\occmeasmid\sas + \dualvsum_\horizon + \dualvmid\rbr{\state} - \dualvmid\rbr{\state^\prime} + \dualvtrp\sas - \dualvtrn\sas \\
    & - \sum_{y\in\statespace_{\horizon(\state)+1}} \sbr{\transprieasy_{\episode+1}\rbr{y\vert\state,\action}\rbr{\dualvtrp\rbr{\state,\action,y}-\dualvtrn\rbr{\state,\action,y}} + \confpwtrans_{\episode+1}\rbr{\state,\action,y} \rbr{\dualvtrp\rbr{\state,\action,y}+\dualvtrn\rbr{\state,\action,y}} } \\
    =& \ln\occmeasure\sas - \ln\occmeasmid\sas + \dualvsum_\horizon - \vbellman_{\episode,\horizon}^{\dualvtrp,\dualvtrn, \dualvmid} \sas. 
\end{aligned}
\end{equation}
Setting the gradient to zero and using the explicit form of $\occmeasmid_{\episode+1}\sas$ we obtain, 
\begin{equation}\notag
\begin{aligned}
    \occmeasure_{\episode+1}\sas =& \occmeasmid_{\episode+1}\sas \cdot \exp\rbr{-\dualvsum_\horizon + \vbellman_{\episode,\horizon}^{\dualvtrp,\dualvtrn, \dualvmid} \sas} \\
    =& \frac{1}{\sum_{\sas\in\sashspace} \exp\rbr{-\FTRLpara\losscumprieasy_{\episode+1}\rbr{\state,\action}}} \exp \rbr{-\FTRLpara\losscumprieasy_{\episode+1}\rbr{\state,\action} -\dualvsum_\horizon + \vbellman_{\episode,\horizon}^{\dualvtrp,\dualvtrn, \dualvmid} \sas}.
\end{aligned}
\end{equation}
Using the first constraint $\sum_{\sas\in\sashspace}\occmeasure\sas = 1$, we obtain
\begin{equation}\notag
\begin{aligned}
    e^{\dualvsum_\horizon} &= \sum_{\sas\in\sashspace} \frac{1}{\sum_{\sas\in\sashspace} \exp\rbr{-\FTRLpara\losscumprieasy_{\episode+1}\rbr{\state,\action}}} \exp \rbr{-\FTRLpara\losscumprieasy_{\episode+1}\rbr{\state,\action} + \vbellman_{\episode,\horizon}^{\dualvtrp,\dualvtrn, \dualvmid} \sas} \\
    &= \sum_{\sas\in\sashspace} \vsdual_{\episode,\horizon}^{\dualvtrp, \dualvtrn,\dualvmid}\sas.
\end{aligned}
\end{equation}
Substituting $\dualvsum_\horizon$ back, we obtain the explicit form of the solution,
\begin{align}\notag
    \occmeasure_{\episode+1}\sas = \frac{ \vsdual_{\episode,\horizon}^{\dualvtrp, \dualvtrn,\dualvmid}\sas}{\sum_{\sas\in\sashspace} \vsdual_{\episode,\horizon}^{\dualvtrp, \dualvtrn,\dualvmid}\sas}.
\end{align}
We note the following equivalent form of the Lagrangian,
\begin{align}\notag
    \lag(\occmeasure, \dualvsum, \dualvmid, \dualvtrp, \dualvtrn) = \sum_{\horizon=0}^{\horizontotal-1} \sum_{\sas\in\sashspace} \rbr{\rbr{\frac{\partial \lag}{\partial\occmeasure\sas} - 1} \occmeasure\sas + \occmeasmid_{\episode+1}\sas} - \sum_{\horizon=0}^{\horizontotal-1} \dualvsum_\horizon.
\end{align}
It is straightforward to check that strong duality holds, and thus the optimal dual variables of  $\dualvtrp,\dualvtrn,\dualvmid$ are given by
\begin{align}
    \dualvtrp{}^*,\dualvtrn{}^*,\dualvmid{}^* =& \argmax_{\dualvtrp,\dualvtrn,\dualvmid} \max_{\dualvsum} \min_{\occmeasure} \lag\rbr{\occmeasure, \dualvsum, \dualvmid, \dualvtrp, \dualvtrn} = \argmax_{\dualvtrp,\dualvtrn,\dualvmid} \lag\rbr{\occmeasure^*, \dualvsum^*, \dualvmid, \dualvtrp, \dualvtrn} \\
    =& \argmax_{\dualvtrp,\dualvtrn,\dualvmid} \rbr{-\horizontotal + \sum_{\horizon=0}^{\horizontotal-1} \sum_{\sas\in\sashspace} \occmeasmid_{\episode+1}\sas - \sum_{\horizon=0}^{\horizontotal-1} \dualvsum_\horizon} \label{eq: apply dl/dq=0 to solving dual} \\
    =& \argmin_{\dualvtrp,\dualvtrn,\dualvmid} \sum_{\horizon=0}^{\horizontotal-1} \dualvsum_\horizon = \argmin_{\dualvtrp,\dualvtrn,\dualvmid} \sum_{\horizon=0}^{\horizontotal-1} \ln\rbr{\sum_{\sas\in\sashspace} \vsdual_{\episode,\horizon}^{\dualvtrp, \dualvtrn,\dualvmid}\sas}. \label{eq: apply independence to solving dual}
\end{align}
We apply $\frac{\partial\lag}{\partial\occmeasure\sas} = 0$ to obtain Eq.~\eqref{eq: apply dl/dq=0 to solving dual}, and Eq.~\eqref{eq: apply independence to solving dual} holds because the first two terms are independent of the dual variables.
Thus, combining all equations for $\occmeasure_{\episode+1}, \dualvsum^*, \dualvtrp{}^*,\dualvtrn{}^*,\dualvmid{}^*$ finishes the proof.
\end{proof}

% \clearpage
The pseudo-code of this efficient algorithm for Private-UC-O-REPS is as follows.
\begin{algorithm}[!htbp]
\caption{Updating Occupancy Measure and Policy Procedure} 
\textbf{Input:} transition function estimate $\transprieasy_{\episode+1}$, cumulative private loss function $\losscumprieasy_{\episode+1}$\\
\begin{algorithmic}[1]
\STATE Solve optimization problem~\eqref{prob: dual problem}
$$\dualvtrp_\episode, \dualvtrn_\episode, \dualvmid_\episode = \argmin_{\dualvtrp,\dualvtrn,\dualvmid \geq 0} \sum_{\horizon=0}^{\horizontotal-1} \ln \rbr{\sum_{\sas\in \statespace_\horizon \times  \actionspace \times \statespace_{\horizon+1}} \vsdual_{\episode,\horizon}^{\dualvtrp,\dualvtrn,\dualvmid}\sas},$$
    where $\dualvtrp:=\cbr{\dualvtrp\sas}_{\sas}, \dualvtrn:=\cbr{\dualvtrn\sas}_{\sas}$ and $\dualvmid:=\cbr{\dualvmid\rbr{\state}}_\state$, and $\vsdual_{\episode,\horizon}^{\dualvtrp,\dualvtrn,\dualvmid}$ is defined by~\eqref{eq: variable for solving dual variables}.
\STATE Compute next occupancy measure for all $\sas$:
$$\occmeasure_{\episode+1}\sas = \frac{ \vsdual_{\episode,\horizon}^{\dualvtrp, \dualvtrn,\dualvmid}\sas}{\sum_{\sas\in\sashspace} \vsdual_{\episode,\horizon}^{\dualvtrp, \dualvtrn,\dualvmid}\sas},
$$
where $\horizon=\horizon\rbr{\state}$ is the index of the layer of the state $\state$.
\STATE Compute next policy for all $\rbr{\state,\action}$
$$\policy_{\episode+1}\rbr{\action\vert\state} = \frac{\sum_{\state^\prime}\occmeasure_{\episode+1}\rbr{\state,\action,\state^{\prime}}}{\sum_{b\in\actionspace}\sum_{\state^\prime} \occmeasure_{\episode+1}\rbr{\state,b,\state^{\prime}}}.$$
\STATE \textbf{output:} $\rbr{\occmeasure_{\episode+1}, \policy_{\episode+1}}$
\end{algorithmic}
\label{algo: Updating Occupancy Measure and Policy Procedure}
\end{algorithm}