\section{Preliminary}
\subsection{Adversarial MDPs}
An episodic loop-free adversarial MDP is defined by a tuple $\left(\statespace, \actionspace, \transeasy, \cbr{\loss_\episode}_{\episode=1}^\episodetotal ,H\right)$, where $\statespace,\actionspace$ are state and action spaces with respective cardinalities $\statesize$ and $\actionsize$. %, respectively.
$\transeasy\!:\!\statespace \times \actionspace \times \statespace \rightarrow [0,1]$ is the transition function, with $\trans$ being the probability of transferring to state $\state^{\prime}$ when executing action $\action$ in state $\state$, 
and $\loss_k: \statespace \times \actionspace\!\rightarrow\![0,1]$ is the loss function for episode $k$. Finally, $H$ denotes the length of an episode.
% (sometime it is also defined as $\loss_k: \statespace \times \actionspace \times \statespace \rightarrow[0,1]$) 
% \footnote{Note that for any $\sas\in \sashspace$, here we always have $\loss_\episode\sas = \loss_\episode\rbr{\state,\action}$.} 
We assume that the state space $\mathcal X$ can be decomposed into $\horizontotal+1$ non-intersecting layers $\statespace_0, \ldots, \statespace_\horizontotal$ such that the first and the last layers are singletons, i.e., $\statespace_0=\left\{\state_0\right\}$ and $\statespace_\horizontotal=\left\{\state_\horizontotal\right\}$.
% For a specific horizon $\horizon$, we denote $\statesize_\horizon$ as the size of its state space, and $\statesize=\sum_{\horizon=0}^\horizontotal \statesize_\horizon$. 
Furthermore, the loop-free assumption means that transitions are only possible between consecutive layers\footnote{This assumption -- also known as layered, loop-free assumption -- is a standard one in the adversarial %MDPs 
MDP literature \citep{neu2012adversarial,jin20c}.%Dai2023Refined  
Although not necessary, it will %but 
simplify some arguments.
% and have a nice interpretation as a game with $\horizontotal$ steps played for $\episodetotal$ times. 
This model is a strict generalization of the episodic setting studied in \cite{azar2017minimax,jin2018q}, where the transitions are stationary across episode steps% (except for the first and the last one)
.}. 
In the following, we may write $\horizon(\state)$ to refer to the index of the layer to which $\state$ belongs.
% \sadegh{Regarding the footnote:  in fact the assumption appears to be necessary and may not be easily relaxed without substantially changing algorithms and analyses thereof. Yet, this is a standard assumption in the domain of AMDPs. Maybe the latter argument could be used instead.}

The learner interacts with the %environment 
MDP for $\episodetotal$ episodes without knowing $\transeasy$.
Before the interaction starts, an oblivious adversary arbitrarily selects the loss functions $\cbr{\loss_\episode}_{\episode=1}^\episodetotal$ for all episodes.
In episode $\episode$, the learner starts %from 
at state $\state_0$ and decides a policy $\policy_\episode:\statespace\times\actionspace\rightarrow[0,1]$, where we write $\policy_\episode\rbr{\action\vert\state}$ to  denote the probability of taking action $\action$ at state $\state$.
Then the learner executes $\policy_\episode$ in the MDP, generating $\horizontotal$ state-action pairs $\cbr{\rbr{\state_\horizon^\episode,\action_\horizon^\episode}}_{\horizon=0}^{\horizontotal-1}$, where for $h\!\in\!\{0\}\cup[H\!-\!1]$\footnote{For $n\!\in\!\mathbb N$, we define $[n]\!:=\!\{1,\ldots,n\}$.}, $\action_\horizon^\episode\sim\policy_\episode(\cdot|\state_\horizon^\episode)$ and $\state_{\horizon+1}^\episode\sim \transeasy(\cdot|\state_\horizon^\episode,\action_\horizon^\episode)$.
%In 
At the end of episode $\episode$, the learner observes the loss feedback, which is the entire loss function $\loss_\episode$ under the full-information setting, or the incurred losses $\cbr{\loss_\episode(\state_\horizon^\episode,\action_\horizon^\episode)}_{\horizon=0}^{\horizontotal-1}$ under the bandit setting.
% The interaction between the learner and environment is described in Protocol \ref{protocol: Learner Environment Interaction}. 
% In each episode, the learner traverses a trajectory $\history=(\state_0^{(t)},a_0^{(t)},\dots,x_{L-1}^{(t)},a_{L-1}^{(t)},x_{L}^{(t)})$ using $\pi_t$, and observes bandit feedback.
% \begin{algorithm}[h]
% \small
% \caption{Learner-Environment Interaction}
% \textbf{Parameters:} state space $\statespace$, action space $\actionspace$, unknown transition $\transeasy$
% \begin{algorithmic}[1]
% \FOR{$\episode=1$ to $\episodetotal$}
% \STATE adversary decides a loss function $\loss_\episode: \statespace \times \actionspace \rightarrow[0,1]$\
% \STATE learner decides a policy $\policy_\episode$ and starts in state $\state_0^{\episode}$\
% \FOR{$\horizon=0$ to $\horizontotal-1$}
% \STATE learner selects action $\action_\horizon^{\episode} \sim \policy_\episode\rbr{\cdot \mid \state_\horizon^{\episode}}$
% \STATE learner suffers loss $\loss_\episode(\state_\horizon^{\episode}, \action_\horizon^{\episode})$
% \STATE environment draws new state $\state_{\horizon+1}^{\episode} \sim \transeasy(\cdot|\state_{\horizon}^{\episode},\action_{\horizon}^{\episode})$
% \STATE learner observes state $\state_{\horizon+1}^{\episode}$
% \ENDFOR
% \STATE learner observes loss information, i.e., the loss function $\loss_\episode$ in full-information setting, or the suffered losses $\cbr{\loss_\episode(\state_\horizon^\episode,\action_\horizon^\episode)}_{\horizon=0}^\horizontotal$ in bandit setting.
% \ENDFOR
% \end{algorithmic}
% \label{protocol: Learner Environment Interaction}
% \end{algorithm}

The goal of the learner is to minimize the incurred loss in $\episodetotal$ episodes in expectation. More formally, for any policy $\policy$ and loss function $\ell$, we define %its total 
the corresponding expected cumulative loss per episode %with respect to %loss function 
%$\loss$ and %transition function 
%$\transeasy$ %in one episode
as 
\begin{equation}
    \valuef(\policy,\loss) = \EE\sbr{\sum_{\horizon=0}^{\horizontotal-1}\loss(\state_\horizon,\action_\horizon)\bigg|\transeasy,\policy},
\end{equation}
where the expectation is taken over trajectories %the state-action pairs 
$\cbr{\rbr{\state_\horizon,\action_\horizon}}_{\horizon=0}^{\horizontotal-1}$ generated by following $\policy$.
%are random variables generated according to the transition function $\transeasy$ and stochastic policy $\policy$.
% At the beginning of each episode $\episode$, the learner picks a policy $\policy_\episode$, and 
%For a specific RL algorithm $\mathbb{A}$ deciding $(\pi_k)_{k\ge 1}$, its goal is to minimize its total expected loss.
The performance of a given learning algorithm $\mathbb{A}$ deciding $(\pi_k)_{k\ge 1}$ is measured %by
through the notion of regret, which compares the cumulative expected loss under $\mathbb A$ to that incurred by the best stationary policy in hindsight,
%comparison to the best stationary policy in hindsight, 
i.e., $\policy^* \in \argmin_{\policy} \sum_{\episode=1}^\episodetotal \valuef\rbr{\policy,\loss_\episode}$.
That is,
\begin{equation}
    \cR_\episodetotal^{\mathbb{A}}  = \sum_{\episode=1}^\episodetotal \valuef(\policy_\episode,\loss_\episode) - \sum_{\episode=1}^\episodetotal \valuef(\policy^*,\loss_\episode).
\end{equation}
% \sadegh{To be more precise, let's refer to an algorithm $\mathbb A$ deciding $(\pi_k)_{k\ge 1}$ and then define $\mathcal R_{\mathbb A,K}$. The later dependence on $\mathbb A$ could be dropped. }
Equivalently, the goal of the learner is to minimize the regret. By default, the dependence on $\mathbb{A}$ is omitted unless explicitly noted. 

%\textbf{Notation.} %For $n\!\in\!\mathbb N$, we define $[n]\!:=\!\{1,\ldots,n\}$. 
%We denote $\horizon(\state)$ as the index of the layer that $\state$ belongs to.
% $\II\cbr{\cdot}$ denotes the indicator function whose value is $1$ if its input holds true and $0$ otherwise.  



\subsubsection{Occupancy Measures}
%The adversarial MDP problem 
Learning in adversarial MDPs can be reformulated as an online linear optimization problem, using the notion of occupancy measures \citep{zimin2013online}. Given $\policy$ and $P$,
%For a policy $\policy$ and a transition $P$, 
the occupancy measure $\occmeasure^{\transeasy,\policy}:\statespace \times \actionspace \times \statespace \rightarrow [0,1]$ is defined as follows:
\begin{equation}\notag
\occmeasure^{\transeasy,\policy}\rbr{\state, \action, \state^{\prime}} = \prob\sbr{\state_\horizon=\state, \action_\horizon=\action, \state_{\horizon+1}=\state^{\prime} \vert \transeasy, \policy},
\end{equation}
% where $\state\in \statespace_\horizon$ and $\state^{\prime} \in \statespace_{\horizon+1}$.
where $\horizon=\horizon\rbr{\state}$ is the index of the layer to which $\state$ belongs.
With slight abuse of notation, we define the probability of visiting state-action pair $\rbr{\state,\action}$ and that of visiting state $\state$ as follows,
% \XD{These notations might cause confusion because the same symbol $q^{P,\pi}$ can represent functions of one, two and three arguments!}
\begin{equation}\notag
\begin{aligned}
% \occmeasure^{\transeasy, \policy}(\state, \action) & =\prob\sbr{\state_\horizon=\state, \action_\horizon=\action \mid \transeasy, \policy}=\sum_{\state^{\prime} \in \state_{\horizon+1}} \occmeasure^{\transeasy, \policy}\left(\state, \action, \state^{\prime}\right) \\
% \occmeasure^{\transeasy, \policy}(\state) & =\prob\sbr{\state_\horizon=\state \mid \transeasy, \policy}=\sum_{\action \in \actionspace} \occmeasure^{\transeasy, \policy}(\state, \action) .
\occmeasure^{\transeasy, \policy}(\state, \action)\!= \!\!\!\!\!\!\! \sum_{\state^{\prime} \in \statespace_{\horizon\rbr{\state}+1}} \!\!\!\!\!\! \occmeasure^{\transeasy, \policy}\left(\state, \action, \state^{\prime}\right),  \;
\occmeasure^{\transeasy, \policy}(\state)\! = \!\! \sum_{\action \in \actionspace} \occmeasure^{\transeasy, \policy}(\state, \action) .
\end{aligned}
\end{equation}
As established in \citep{zimin2013online}, a valid occupancy measure $\occmeasure$ satisfies $(i)$ for all $h\!\in\!\{0\}\cup[H\!-\!1]$ and $(ii)$ for all $h\!\in\![H\!-\!1]$:
\begin{align*}
(i)\!:& \sum_{\state \in \statespace_\horizon} \sum_{\action \in \actionspace} \sum_{\state^{\prime} \in \statespace_{\horizon+1}} \occmeasure\left(\state, \action, \state^{\prime}\right)=1,\\
(ii)\!:& \!\!\! \sum_{\state^{\prime} \in \statespace_{\horizon\!-\!1}} \! \sum_{\action \in \actionspace} \occmeasure\left(\state^{\prime}\!, \action, \state\right) 
  \!  = \!\!\!\!\! \sum_{\state^{\prime} \in \statespace_{\horizon+1}} \! \sum_{\action \in \actionspace} \occmeasure\!\left(\state, \action, \state^{\prime}\right),\,\! \forall x \!\in \!\mathcal X_{h}.
\end{align*}
Both $(i)$ and $(ii)$ follow from the loop-free structure: $(i)$ holds since each layer is visited exactly once per episode, whereas $(ii)$ holds by flow conservation, i.e., the probability of entering a state equals the probability of leaving it. %
Further, any function $\occmeasure\!:\!\statespace\times\actionspace\times\statespace\!\rightarrow\!
[0,1]$ satisfying $(i)$-$(ii)$ induces the following transition function and policy
\citep{rosenberg2019onlineamdp}:
\begin{equation}
\label{eq: occupancy, transition, policy}
\transeasy^\occmeasure\rbr{\state^{\prime}\vert\state,\action}=\frac{\occmeasure\left(\state, \action, \state^{\prime}\right)}{\occmeasure(\state, \action)}, \, \policy^\occmeasure(\action \vert \state)=\frac{\occmeasure(\state, \action)}{\occmeasure(\state)}.
\end{equation}
$\occmeasureset$ denotes the set of all valid occupancy measures.
For a fixed transition function $\transeasy$, $\occmeasureset\rbr{\transeasy}$ denotes the set of occupancy measures whose induced transition $\transeasy^\occmeasure$  %is exactly
equals $\transeasy$.
Similarly, given a set of transition functions $\transspace$, $\occmeasureset\rbr{\transspace}$ denotes the set of occupancy measures whose induced transition $\transeasy^\occmeasure$ belongs to %the set of transition functions 
$\transspace$.

% Notice that every occupancy measure $\occmeasure$ induces a transition $\transeasy^\occmeasure$ and a policy $\policy^\occmeasure$, that can be computed as follows:
% \begin{equation}
% \label{eq: occupancy, transition, policy}
% \transeasy^\occmeasure\left(\state^{\prime} \mid \state, \action \right)=\frac{\occmeasure\left(\state, \action, \state^{\prime}\right)}{\occmeasure(\state, \action)} \quad ; \quad \policy^\occmeasure(\action \mid \state)=\frac{\occmeasure(\state, \action)}{\occmeasure(\state)}
% \end{equation}
% \begin{lemma}[Occupancy Measure Space] \label{lemma: Occupancy Measure Space}
% For every $\occmeasure \in[0,1]^{\statesize \times \actionsize \times \statesize}$ it holds that $\occmeasure \in \occmeasureset\rbr{\transeasy}$ if and only if:
% \begin{itemize}
    % \item Distribution property: $\sum_{\state \in \statespace_\horizon} \sum_{\action \in \actionspace} \sum_{\state^{\prime} \in \statespace_{\horizon+1}} \occmeasure\left(\state, \action, \state^{\prime}\right)=1 \quad \forall \horizon=0, \ldots, \horizontotal-1$
    % \item In-out balance: $\sum_{\state^{\prime} \in \statespace_{\horizon+1}} \sum_{\action \in \actionspace} \occmeasure\left(\state, \action, \state^{\prime}\right)
    % =\sum_{\state^{\prime} \in \statespace_{\horizon-1}} \sum_{\action \in \actionspace} \occmeasure\left(\state^{\prime}, \action, \state\right) \quad \forall \horizon=1, \ldots, \horizontotal-1, \forall \state \in \statespace_\horizon$
%     \item Model consistency: $\transeasy^\occmeasure=\transeasy$ 
% \end{itemize}
% \end{lemma}
Equipped with these definitions, we can rewrite %the expected loss of policy $\policy$ for one episode as 
\begin{equation} \notag
    \valuef(\policy,\loss) = \sum_{\horizon=0}^{\horizontotal-1}\sum_{\state\in\statespace_\horizon}\sum_{\action\in\actionspace} \occmeasure^{\transeasy,\policy}(\state, \action) \loss(\state, \action) = \inner{\occmeasure^{\transeasy,\policy}}{\loss}.
\end{equation}
% \XD{Should $q(x,a)$ be $q^{P,\pi}(x,a)$ in the above equation? Superscripts are also missing for $q$ in (2.3) and (2.4).}
Thus, the regret of the learner can be rewritten as 
\begin{equation}
    \regret^{\mathbb A}  = \sum_{\episode=1}^\episodetotal \inner{\occmeasure^{\transeasy,\policy_\episode}-\occmeasure^*}{\loss_\episode},
\end{equation}
where $\occmeasure^*\in\argmin_{\occmeasure\in\occmeasureset\rbr{\transeasy}}\sum_{\episode=1}^\episodetotal \inner{\occmeasure}{\loss_\episode}$ is the optimal occupancy measure in $\occmeasureset\rbr{\transeasy}$.

When the transition function is known and the loss function is revealed at the end of each episode, this problem can be solved by an online linear optimization method \citep{hazan2016introduction}. 
However, in our setting, both $\transeasy$ and $\occmeasureset\rbr{\transeasy}$ are unknown, and we face noisy and even partial information on $\loss_\episode$ under the bandit setting. %, which would be a challenge.

\subsection{Differential Privacy in Adversarial MDPs}
In adversarial MDPs, each episode $\episode \in \sbr{\episodetotal}$ can be viewed as a trajectory representing a specific user. Let  $\userspace$ denote the set of all users, and
let $\userseq_\episodetotal=\rbr{\user_1,\dots,\user_\episodetotal}\in\userspace^\episodetotal$ denote a sequence of $\episodetotal$ users participating in the private adversarial MDP protocol with an RL agent $\Agent$. %, where $\userspace$ is the set of all users.
Each user $\user_\episode$ is identified by her interaction trajectory $\traj_\episode$, including the visited state-action pairs and the observed losses.
% the observed losses ($\loss_\episode$ or $\cbr{\loss_\episode\rbr{\state_\horizon^\episode,\action_\horizon^\episode}}_{\horizon=1}^{\horizontotal-1}$) and state responses $\cbr{\state_{\horizon+1}^\episode}_{\horizon=0}^{\horizontotal-1}$ she/he gives to the actions $\cbr{\action_\horizon^\episode}_{\horizon=1}^\horizontotal$ chosen by the agent.
We denote by $\Agent\rbr{\userseq_\episodetotal}:=\rbr{\action_0^1,\ldots,\action_{\horizontotal-1}^\episodetotal} \in \actionspace^{\episodetotal\horizontotal}$ the sequence of all actions chosen by $\Agent$ when interacting with the user sequence $\userseq_\episodetotal$, 
and by $\Agent_{-\episode}\rbr{\userseq_\episodetotal} \!:=\!\Agent\rbr{\userseq_\episodetotal} \backslash \rbr{\action_\horizon^\episode}_{\horizon=0}^{\horizontotal-1}$ all the actions chosen by $\Agent$ excluding those recommended to $\user_\episode$.
% Informally, we wish the knowledge of the output $\Agent\rbr{\userseq_\episodetotal}$ and all users but $\user_\episode$ together will not reveal "much" information about $\user_\episode$. 
% We formalize this idea by adapting differential privacy \cite{dwork2014algorithmic}.
% \begin{definition}[Differential Privacy (DP)] \label{def: dp}
%     For any $\pripara\geq 0$ and $\delta\in [0,1]$, a mechanism $\Agent: \userspace^\episodetotal \rightarrow \actionspace^{\episodetotal \horizontotal}$ is $(\pripara,\delta)$-differentially private if for all $\userseq_\episodetotal,\userseq^\prime_\episodetotal \in \userspace^\episodetotal$ differing on a single user and for all subset of actions $\cE_0 \subset \actionspace^{\episodetotal \horizontotal}$, 
%     $$\prob\sbr{\Agent\rbr{\userseq_\episodetotal}\in \cE_0}  
%     \leq \exp\rbr{\pripara} \cdot \prob\sbr{ \Agent\rbr{\userseq^\prime_\episodetotal}\in \cE_0} + \delta$$
%     If $\delta=0$, then we have $\pripara$-differential private ($\pripara$-DP).
% \end{definition}
% The standard definition of DP is too stringent for this setting, since it is impractical to privately recommend actions to user $\user_\episode$ while protecting the information of her states and losses.
% Therefore, we consider a relaxed notion of DP, \textit{Joint Differential Privacy} (JDP) \cite{kearns2014mechanism},
We first consider the notion of Joint Differential Privacy (JDP) \citep{kearns2014mechanism,vietri2020private}.
% which ensures that simultaneously {\color{red}for all user $\user_\episode$, the joint distribution of the actions recommended to all users other than $\user_\episode$ be differentially private in the type of the user $\user_\episode$.}
% We denote $\Agent_{-\episode}\rbr{\userseq_\episodetotal}:=\Agent\rbr{\userseq_\episodetotal} \backslash \rbr{\action_\horizon^\episode}_{\horizon=1}^\horizontotal$ as all the actions chosen by $\Agent$ excluding those recommended to $\user_\episode$ and formally define JDP as follows.
% JDP weakens the constraint of DP, while JDP is still a strong definition since it protects $\user_\episode$ from any arbitrary collusion of other users against her.
% JDP is first defined and analyzed under RL by \cite{vietri2020private}.
\begin{definition}[Joint Differential Privacy (JDP)] \label{def: JDP}
    For any $\pripara > 0$, a mechanism $\Agent: \userspace^\episodetotal \rightarrow \actionspace^{\episodetotal \horizontotal}$ is \emph{$\pripara$-Joint Differentially Private ($\pripara$-JDP)} if for all $\episode\in\sbr{\episodetotal}$, for all user sequences $\userseq_\episodetotal,\userseq^\prime_\episodetotal \in \userspace^\episodetotal$ differing only on the $\episode$-th user, and for all sets of actions $\cE_0 \subset \actionspace^{\rbr{\episodetotal-1} \horizontotal}$, 
    $$\prob\sbr{\Agent_{-\episode}\rbr{\userseq_\episodetotal}\in \cE_0}  
    \leq \exp\rbr{\pripara} \cdot \prob\sbr{ \Agent_{-\episode}\rbr{\userseq^\prime_\episodetotal}\in \cE_0}.$$
    % where $\Agent_{-\episode}\rbr{\userseq_\episodetotal}\in \cE_0$ means the sequence of actions recommended to all users but $\user_\episode$ belongs to the set $\cE_0$.
\end{definition}
JDP ensures that even if an adversary can observe the recommended actions to all users but $\user_\episode$, it is still statistically difficult to identify the trajectory of $\user_\episode$ accurately. 
JDP assumes that the agent $\Agent$ is allowed to access the raw trajectories from users. %, but
However, in some scenarios, the users may not be willing to share their data with the agent directly, which motivates Local Differential Privacy (LDP) \citep{duchi2013local}.
% In some scenarios, the users may not be willing to share their data with the agent directly, which motivates \textit{Local Differential Privacy} (LDP) \citep{duchi2013local}.
In this setting, the agent $\Agent$ sends policy $\policy_\episode$ to the user $\user_\episode$, 
and the user executes $\policy_\episode$ and gets her trajectory $\traj_\episode$, 
and then privatizes it to $\widetilde{\traj}_\episode$ and sends it to $\Agent$.
% To this end, we denote by a trajectory  and by $\trajset$ the set of all possible trajectories.
We denote the privacy mechanism on the users' side by $\Agent^\prime$ and %define 
recall the definition of local differential privacy below. %formally below.
% \XD{I suggest using a different symbol for the privacy mechanism on the user's side since $\mathcal{M}$ was used to represent the output of the RL agent before, which is a different thing from privatizing $S$.}
\begin{definition}[Local Differential Privacy (LDP)] \label{def: LDP}
    For any $\pripara\geq 0$, a mechanism $\Agent^\prime$ is \emph{$\pripara$-Local Differentially Private ($\pripara$-LDP)} if for all trajectories $\traj,\traj^\prime \in \trajset$ and for all possible subsets $\cE_0 \subset \cbr{\Agent^\prime\rbr{\traj}\vert \traj\in\trajset}$, 
    $$\prob\sbr{\Agent^\prime\rbr{\traj}\in \cE_0}  
    \leq \exp\rbr{\pripara} \cdot \prob\sbr{ \Agent^\prime\rbr{\traj^\prime}\in \cE_0},$$
    where $\trajset$ is the set of all possible trajectories.
\end{definition}
LDP ensures that even if an adversary observes the privatized response of user $\user_\episode$, it is still statistically difficult to identify her trajectory.
LDP was first introduced and analyzed in the RL setting by \cite{garcelon2021local}.

% \XD{Is it necessary to introduce these notations here? Can they be introduced when they are needed?
% When defining $\datas_\horizon^\episode\rbr{\state,\action}\!:=\!\II\rbr{\state_\horizon^\episode,\action_\horizon^\episode=\state,\action}$, why the indicator function is needed? Is it sufficient to say that $\datas_\horizon^\episode\rbr{\state,\action}\in\{0,1\}$ indicates blablabla
% }
% \sadegh{Also, not so clear why $\sigma$ is introduced. Keeping the indicator should work well as I see. Also in the defs below, an alternative way is to define $N(x,a,x')$ first and write $N(x,a)=\sum_{...} N(x,a,x')$.}
We introduce some notation for later analysis. 
$\II_\episode\cbr{\state,\action}$ denotes an indicator function whose value is $1$ if $\rbr{\state,\action}$ is visited in episode $\episode$ and $0$ otherwise. 
A similar definition applies to $\II_\episode\cbr{\state,\action,\state'}$. 
Denote $\traj_\episode\!\!:=\!\cbr{\!\rbr{\state_\horizon^\episode,\action_\horizon^\episode}\!}_{\horizon=0}^{\horizontotal\!-\!1} \!\cup \cbr{\loss_\episode\rbr{\state,\action}}_{\rbr{\state,\action}}$ and $\traj_\episode\!\!:=\!\!\cbr{\rbr{\state_\horizon^\episode,\action_\horizon^\episode,\loss_\episode\rbr{\state_\horizon^\episode,\action_\horizon^\episode}}}_{\horizon=0}^{\horizontotal-1}$ as the trajectory of episode $\episode$ under full-information setting and bandit setting, respectively.

Under a given algorithm, denote 
% $\visitxaxtotal\!:=\!\sum_{i=1}^{\episode-1} \II \cbr{\state_\horizon^i,\action_\horizon^i,\state_{\horizon+1}^i=\state,\action,\state^\prime}$, where $\horizon=\horizon(\state)$, 
$\visitxaxtotal\!:=\!\sum_{i=1}^{\episode-1} \II_i\cbr{\state,\action,\state'}$
as the number of visits to the state-action pair $(\state,\action)$  followed by a visit to $\state^\prime$ \emph{before} episode $\episode$, 
%
and denote $\visitxatotal\!\!:=\!\!\sum_{\state^\prime\in\statespace_{\horizon(\state)+1}} \visitxaxtotal$. % as the number of times that the state-action pair is visited \emph{before} episode $\episode$.
% We denote $\datas_\horizon^\episode\rbr{\state,\action}\!:=\!\II\rbr{\state_\horizon^\episode,\action_\horizon^\episode=\state,\action}$ indicating whether the state-action pair $\rbr{\state,\action}$ is visited at horizon $\horizon$ in episode $\episode$.
% Then $\visitxatotal\!:=\!\sum_{s=1}^{\episode-1} \datas_\horizon^\episode\rbr{\state,\action}$ denotes the number of times that the state-action pair is visited at horizon $\horizon$ \emph{before} episode $\episode$.
% Similarly, we have the indicator $\datas_\horizon^\episode\rbr{\state,\action,\state^\prime} \!:=\! \II \cbr{\state_\horizon^\episode,\action_\horizon^\episode,\state_{\horizon+1}^\episode=\state,\action,\state^\prime}$ 
% and counter $\visitxaxtotal\!:=\!\sum_{s=1}^{\episode-1} \datas_\horizon^\episode\rbr{\state,\action,\state^\prime}$ denoting the number of going to state $\state^\prime$ from $\state$ upon playing action $\action$ at horizon $\horizon$ \emph{before} episode $\episode$.
Finally, $\losscum\!:=\!\sum_{i=1}^{\episode-1}\loss_i\rbr{\state,\action}$ denotes the cumulative loss of taking action $\action$ at state $\state$ \emph{before} episode $\episode$.
In non-private learning, these counters are sufficient to find estimates of the transition function $\transeasy$ to design %the 
a policy $\policy_\episode$ for each episode $\episode$ by using model-based algorithms \citep{neu2012adversarial}. 
However, these counters are derived from the raw user trajectories, which may contain sensitive information.
Therefore, we must release the counts through a privacy-preserving mechanism, referred to as the Privatizer, on which the learning agent relies.
% Let $\visitxatotalpri,\visitxaxtotalpri$ and $\losscumpri,\lossprieasy_{\episode}\rbr{\state,\action}$ denote the privatized version of $\visitxatotal,\visitxaxtotal$ and $\losscum,\loss_\episode(\state,\action)$, respectively.
Let $\visitxatotalpri,\visitxaxtotalpri$, and $\losscumpri$ denote the privatized versions of $\visitxatotal,\visitxaxtotal$, and $\losscum$, respectively.
Assumption \ref{assp: private counts} below requires that the private visitation counts closely approximate the true counts.
The private loss for full-information and bandit settings will be specified in Section \ref{sec: full-information setting} and Section \ref{sec: bandit-feedback setting}, respectively.
All of the private counters will be justified by our Privatizers in Section \ref{sec: privacy and regret guarantees}.

\begin{assumption}[Private visitation counts] \label{assp: private counts}
For any privacy budget $\pripara >0$ and failure probability $\delta\in(0,1]$, the private visitation counts returned by Privatizer satisfy,
for some $\confcountxa, \confcountxax > 0$, with probability at least $1-2\delta$, uniformly over all $\rbr{\state,\action,\state^\prime,\episode}$,
$\abr{\visitxatotalpri\!-\!\visitxatotal}$ $\leq\!\! \confcountxa, \abr{\visitxaxtotalpri\!-\!\visitxaxtotal}\!\!\leq\!\!\confcountxax$
and $\visitxatotalpri$ $ =\sum_{\state^\prime\in\statespace_{\horizon(\state)+1}}$ $\visitxaxtotalpri$ $\geq$ $\visitxatotal$, $\visitxaxtotalpri > 0$.

% $(2)$ Under the full information setting, for all $\rbr{\state,\action,\episode}$, $\noise\!:=\!\losscumpri - \!\losscum$ follows the same distribution, satisfying
% $\expect\sbr{\max_{\state,\action} \noise - \min_{\state,\action} \noise} \!\leq\! \conflossf$, for some $\conflossf > 0$. 

% $(3)$ Under the bandit setting, for all $\rbr{\state,\action,\episode}$, $\noise:= \losspri - \II\cbr{\state_{\horizon(\state)}^\episode,\action_{\horizon(\state)}^\episode=x,a}\cdot\loss_\episode\rbr{\state_{\horizon(\state)}^\episode,\action_{\horizon(\state)}^\episode}$  follows zero-mean distribution; and for some $\ninterval >0$, with probability at least $1-\delta$ uniformly over all $\rbr{\state,\action,\episode}$, $\abr{\noise}$ $\leq$ $\ninterval$.
\end{assumption}

Using Assumption \ref{assp: private counts}, we introduce the following private estimate of $P$, built using the data collected before episode $\episode$:
\begin{equation}    
\label{eq: private transition estimate}
\transprieasy_\episode\rbr{\state^\prime\vert\state,\action}:= \frac{\widetilde{N}_\episode\rbr{\state,\action,\state^\prime}}{\visitxatotalpri}.
\end{equation}
Note that %Similar to \cite{qiao2023near}, 
by construction of Privatizer, 
$\transprieasy_\episode\rbr{\cdot\vert\state,\action}$ is a valid probability distribution.
%Similar to \cite{qiao2023near}, this private estimation $\transprieasy_\episode\rbr{\cdot\vert\state,\action}$ is a valid probability distribution, resulting from the construction of Privatizer.
