In this section, we present the proofs of the privacy guarantees stated in Section~\ref{sec: privacy and regret guarantees}.
Recall that for visitation counters, $\visitxatotal$ is the original count, $\visitxatotalbin$ is the noisy count after step (1) of both Privatizers, and $\visitxatotalpri$ is the final private count.
For losses, $\losscum$ is the original cumulative loss in the full-information setting, $\ddot{\loss}_\episode\rbr{\state,\action}$ is the non-private loss estimator in the bandit-feedback setting, and $\losscumpri$ and $\losspri$ are the private versions of $\losscum$ and $\ddot{\loss}_\episode\rbr{\state,\action}$, respectively.

\subsection{A Variant of the Binary Mechanism}
\label{subsec: A Variant of the Binary Mechanism}
Firstly, we introduce a variant of the Binary Mechanism (Algorithm \ref{algo: Private Counter (PC) for losscum}), which has also been introduced in \cite{agarwal2017price}. 
This variant of Binary Mechanism deals with a continual observation $\cbr{\datas_\episode}_{\episode\in[\episodetotal]}$ with each $\datas_\episode\in[0,1]$. 
In each step, the mechanism releases a number denoted as $\widetilde{\Sigma}_\episode$, which is the private version of the sum of the first $\episode$ observed numbers, i.e., $\Sigma_\episode = \sum_{i=1}^{\episode}\datas_i$.
Initialized with privacy budget level $\pripara$, this mechanism follows the properties in Lemma~\ref{lemma: Guarantees of the Variant of Binary Mechanism}.
% And it is employed to privatize the cumulative loss $\losscum$ under full information setting, with the privacy budget initialized as $\pripara/3\horizontotal$.
\begin{algorithm}[htbp] 
\caption{A Variant of the Binary Mechanism} 
\textbf{Input:} Time upper bound $\episodetotal$, privacy budget $\pripara > 0$, online stream $\datas \in \sbr{0,1}^{\episodetotal}$, binary tree $\tree$ \\
\textbf{Initialization:} $\pripara^\prime = \pripara / \log \episodetotal$, noisy sum $\widetilde{\Sigma}_\episode = 0$, initialize the binary tree $\tree$ over $\episodetotal$ leaves with all nodes initially unpopulated\\
\begin{algorithmic}[1]
\FOR{$\episode=1$ to $\episodetotal$}
    \STATE Express $\episode$ in binary form: $s_\episode$, where $\episode = \sum_{j=1}^{\lceil \log_2 \episodetotal \rceil} s_\episode\rbr{j}\cdot 2^{\lceil \log_2 \episodetotal \rceil - j}$. 
    For example, if $s_\episode = 110$, then $s_\episode\rbr{1} = 1, s_\episode\rbr{2} = 1, s_\episode\rbr{3} = 0$
    \STATE Populate the $s_\episode$-th entry of $\tree$: $\tree_{s_\episode} \leftarrow \datas_\episode$
    \STATE Perturb the $s_\episode$-th entry of $\tree$: $\hat{\tree}_{s_\episode} \leftarrow \tree_{s_\episode} + b_{s_\episode}$, where $b_{s_\episode} \sim \lap{\frac{1}{\pripara^\prime}}$
    \STATE Let $S_\episode$ be the set of all ancestors $s$ of $s_\episode$ in the tree $\tree$, such that all the leaves in the sub-tree rooted at $s$ are already populated
    \FOR{all $s\in S_\episode$}
    \STATE Update the value of the node with the values of its children: $\tree_s \leftarrow \tree_{s\circ0} + \tree_{s\circ 1}$ 
    \STATE Perturb the value of the node: $\hat{\tree}_{s} \leftarrow \tree_{s} + b_{s}$, where $b_{s} \sim \lap{\frac{1}{\pripara^\prime}}$
    \ENDFOR
    \FOR{$j=1$ to $\lceil \log_2 \episodetotal \rceil$}
    \IF{$s_\episode\rbr{j} = 1$}
    \STATE Form binary string $s^q = s_\episode\rbr{1} \circ s_\episode\rbr{2} \circ \cdots \circ s_\episode\rbr{j-1} \circ 0$ of length $j$
    \STATE $\widetilde{\Sigma}_\episode \leftarrow \widetilde{\Sigma}_\episode + \hat{\tree}_{s^q}$
    \ELSE
    \STATE $\widetilde{\Sigma}_\episode \leftarrow \widetilde{\Sigma}_\episode + b$, where $b \sim \lap{\frac{1}{\pripara^\prime}}$
    \ENDIF
    \ENDFOR
    \STATE \textbf{output:} $\widetilde{\Sigma}_\episode$
\ENDFOR
\end{algorithmic}
\label{algo: Private Counter (PC) for losscum}
\end{algorithm}

\begin{lemma}[Guarantees of the Variant of Binary Mechanism]
\label{lemma: Guarantees of the Variant of Binary Mechanism}
The following claims hold true:\\ 
    $(1)$ \textbf{Privacy}: The sequence $\cbr{\widetilde{\Sigma}_\episode}_{\episode=1}^\episodetotal$ is $\pripara$-differentially private. \\
    $(2)$ \textbf{Distribution}: For all $\episode\in\sbr{\episodetotal}$, the injected noise is $\widetilde{\Sigma}_\episode - \Sigma_\episode = \sum_{i=1}^{\lceil \log \episodetotal \rceil} b_i$, where the $b_i$ are sampled i.i.d.\ from $\lap{\log\episodetotal/\pripara}$. 
    % $(3)$ \textbf{Utility}: For any $1\leq\episode\leq\episodetotal$, 
    % % with probability at least $1-\delta$, $\abr{\Sigma_\episode - \widetilde{\Sigma}_\episode} \leq O\rbr{\frac{1}{\pripara}\log^{1.5}\episodetotal\log\rbr{\frac{1}{\delta}}}$. Beside, 
    % $\expect\sbr{\max \noiseeasy_\episode - \min \noiseeasy_\episode} \leq O\rbr{\frac{1}{\pripara}\log^2\episodetotal \ln\rbr{\statesize\actionsize}}$
\end{lemma}
\begin{proof}
    Our privacy guarantee is established through the Binary Mechanism with Laplace noise, accompanied by the post-processing property as outlined in \cite{dwork2014algorithmic}. 
    We introduce additional noise beyond the standard Binary Mechanism, ensuring the injection of an exact $\lceil \log \episodetotal\rceil$ number of noise samples, in line with our distribution property. 
    % Additionally, the utility bound is derived from the sum of independent and identically distributed (i.i.d.) Laplace random variables, following the principles detailed in \cite{chan2011private}.
\end{proof}

\subsection{Privacy Guarantees}
First, we demonstrate that our private loss for the bandit-feedback setting simultaneously satisfies LDP and DP in the streaming setting.

% \begin{lemma}[Privacy Guarantees of Bandit Losses]
% \label{lemma: Privacy Guarantees of Bandit Losses}
%      Defined in Section \ref{sec: privacy and regret guarantees}, the sequence $\cbr{\lossprieasy_\episode\rbr{\state_\horizon^\episode,\action_\horizon^\episode}}_{\rbr{\horizon,\episode}}$ satisfy $\pripara/3$-DP, as well as $\pripara/3$-LDP.
% \end{lemma}
\begin{proof}[Proof of Lemma~\ref{lemma: Privacy Guarantees of Bandit Losses}]
    First, we focus on a single episode $\episode$, and consider the private version $\lossprieasy_{\episode}\rbr{\state,\action}$ of the observed loss $\loss_{\episode}\rbr{\state,\action}\II_{\episode}\rbr{\state,\action}$.
    This private version $\lossprieasy_{\episode}\rbr{\state,\action}$ is obtained from $\loss_{\episode}\rbr{\state,\action}\II_{\episode}\rbr{\state,\action}$ via the Laplace mechanism with noise level $\frac{3\horizontotal}{\pripara}$.
    Since the sensitivity of the input function is $1$, the Laplace mechanism satisfies $\frac{\pripara}{3\horizontotal}$-DP and $\frac{\pripara}{3\horizontotal}$-LDP (refer to \cite{dwork2014algorithmic}).
    Additionally, since every episode involves at most $\horizontotal$ bandit losses, according to \cite[Lemma 34]{hsu2014private}, the composition of all these $\statesize\actionsize\episodetotal$ different Laplace mechanisms is $\frac{\pripara}{3}$-LDP and $\frac{\pripara}{3}$-DP.
\end{proof}

% \begin{theorem} \label{thm: algo-JDP}
%     Algorithm \ref{algo: private UC-O-REPS} Private-UC-O-REPS is $\pripara$-JDP. \todo{this can be added in appendix}
% \end{theorem}
% We prove the JDP guarantees using billboard lemma (\cite{hsu2014private}, Lemma 9), which, informally, states that an algorithm is JDP if the output sent to each user is a function of the user’s private data and a common quantity computed using a standard DP mechanism.
% Note that by Lemma \ref{} and the post-processing property of DP \cite{dwork2014algorithmic}, the sequence of policies $\cbr{\policy_\episode}_{\episode=1}^\episodetotal$ are $\pripara$-DP.
% Therefore, by the billboard model, the actions $\cbr{\action_\horizon^\episode}_{\horizon,\episode}$ suggested to all the users are $\pripara$-JDP.
Next, we prove the JDP guarantee.
\begin{proof}[Proof of Lemma \ref{lemma: Properties of Central-PRIVATIZER}]
Under the full-information setting, the release of $\cbr{\visitxatotalbin}_{\rbr{\episode,\state,\action}}$ satisfies $\frac{\pripara}{3}$-DP according to \cite[Theorem 3.5]{chan2011private} and \cite[Lemma 34]{hsu2014private}.
Similarly, the releases of $\cbr{\visitxaxtotalbin}_{\rbr{\episode,\state,\action,\state^\prime}}$ and $\cbr{\losscumbin}_{\rbr{\episode,\state,\action}}$ also satisfy $\frac{\pripara}{3}$-DP.
% Therefore, the release of $\cbr{\visitxatotalbin}_{\rbr{\episode,\state,\action}}$,$\cbr{\visitxaxtotalbin}_{\rbr{\episode,\state,\action,\state^\prime}}$, $\cbr{\losscumbin}_{\rbr{\episode,\state,\action}}$ satisfy $\pripara$-DP.
Therefore, the release of all these counts satisfies $\pripara$-DP.
Due to post-processing \citep[Proposition 2.1]{dwork2014algorithmic}, the release of all private counts $\cbr{\visitxatotalpri}_{\rbr{\episode,\state,\action}}$, $ \cbr{\visitxaxtotalpri}_{\rbr{\episode,\state,\action,\state^\prime}}$, $ \cbr{\losscumpri}_{\rbr{\episode,\state,\action}}$ also satisfies $\pripara$-DP.

Under the bandit setting, with the help of Lemma~\ref{lemma: Privacy Guarantees of Bandit Losses} and the post-processing property, the release of all private counts $\cbr{\visitxatotalpri}_{\rbr{\episode,\state,\action}}$, $ \cbr{\visitxaxtotalpri}_{\rbr{\episode,\state,\action,\state^\prime}}$, $\cbr{\lossprieasy_\episode\rbr{\state,\action}}_{\rbr{\episode,\state,\action}}$ also satisfies $\pripara$-DP.

% Then it holds that the release of all $\policy_\episode$ is $\pripara$-DP according to post-processing under both full-information and bandit settings.
% Finally, the guarantee of $\pripara$-JDP results from Billboard Lemma \cite[Lemma 9]{hsu2014private}.

For utility analysis, we analyze the visitation counters and losses, separately.

\textbf{Private visitation counters.} 
By applying \cite[Theorem 3.6]{chan2011private} with $\pripara^\prime=\frac{\pripara}{3\horizontotal\log\episodetotal}$ in the Binary Mechanism, together with a union bound, we can establish that with probability at least $1-2\delta$, for all $\rbr{\episode,\state,\action,\state^\prime}$,
\begin{equation}\notag
\small
\begin{aligned}
    \abr{\visitxaxtotalbin - \visitxaxtotal} \leq O\rbr{\frac{3\horizontotal}{\pripara}\log^{1.5}\episodetotal \log\rbr{ \frac{\statesize^2\actionsize\episodetotal}{\delta}}}, \quad \abr{\visitxatotalbin - \visitxatotal } \leq O\rbr{\frac{3\horizontotal}{\pripara}\log^{1.5}\episodetotal \log\rbr{ \frac{\statesize\actionsize\episodetotal}{\delta}}}.  
\end{aligned}
\end{equation}
Referring to the post-processing procedures and Lemma 5.1 in \cite{qiao2023near}, the Central Privatizer satisfies Assumption \ref{assp: private counts} with $\confcountxax = O\rbr{\frac{\horizontotal}{\pripara}\log^{1.5}\episodetotal \log\rbr{ \frac{\statesize\actionsize\episodetotal}{\delta}}}$, and $\visitxatotal \leq \visitxatotalpri \leq \visitxatotal + \confcountxax$.
Furthermore, in accordance with the constraints of the optimization problem, we also observe that $\visitxatotalbin = \sum_{\state^\prime\in\statespace_{\horizon+1}}\visitxaxtotalbin$, which implies that $\visitxatotalpri = \sum_{\state^\prime\in\statespace_{\horizon+1}} \visitxaxtotalpri$.
% By \cite[Theorem 3.6]{chan2011private}, our choice $\pripara^\prime=\frac{\pripara}{3\horizontotal\log\episodetotal}$ in Binary mechanism and a union bound, with probability $1-2
% \delta$, for all $\rbr{\episode,\state,\action,\state^\prime}$,
% \begin{equation}\notag
% \small
% \begin{aligned}
%     \abr{\visitxaxtotalbin - \visitxaxtotal} \leq O\rbr{\frac{3\horizontotal}{\pripara}\log^{1.5}\episodetotal \log\rbr{ \frac{\statesize^2\actionsize\episodetotal}{\delta}}}, \quad \abr{\visitxatotalbin - \visitxatotal } \leq O\rbr{\frac{3\horizontotal}{\pripara}\log^{1.5}\episodetotal \log\rbr{ \frac{\statesize\actionsize\episodetotal}{\delta}}}.  
% \end{aligned}
% \end{equation}
% Together with Lemma \ref{lemma: counter property of binary mechanism}, the Central Privatizer satisfies Assumption \ref{assp: private counts} with $\confcountxax = O\rbr{\frac{\horizontotal}{\pripara}\log^{1.5}\episodetotal \log\rbr{ \frac{\statesize^2\actionsize\episodetotal}{\delta}}}$.\\
% Since the true counts $\cbr{\visitxaxtotal}_{\state^\prime\in\statespace_{\horizon+1}}$ is a feasible solution to the optimization problem \eqref{opt: counter post-processing}, we have 
% \begin{equation}\notag
%     \max_{\state^\prime} \abr{\visitxaxtotalopt - \visitxaxtotalbin} \leq \max_{\state^\prime} \abr{\visitxaxtotal - \visitxaxtotalbin} \leq \frac{\confcountxax}{4}.
% \end{equation}
% Combining the condition in Lemma \ref{lemma: counter property of binary mechanism} with respect to $\visitxaxtotalbin$, it holds that 
% \begin{equation}\notag
%     \abr{\visitxaxtotalopt - \visitxaxtotal} \leq 
%     \abr{\visitxaxtotalopt - \visitxaxtotalbin}
%     + \abr{\visitxaxtotalbin - \visitxaxtotal}
%     \leq \frac{\confcountxax}{2}.
% \end{equation}
% Since $\visitxaxtotalpri = \visitxaxtotalopt + \frac{\confcountxax}{2\statesize_{\horizon+1}}$, and $\visitxaxtotalopt\geq 0$, we have
% \begin{equation}\notag
%     \visitxaxtotalpri \geq 0, \quad \abr{\visitxaxtotalpri - \visitxaxtotal} \leq \confcountxax.
% \end{equation}
% For $\visitxatotalbin$, according to the constraints in the optimization problem \eqref{opt: counter post-processing}, it holds that 
% $$\visitxaxtotalopt - \visitxatotalbin \leq \frac{\confcountxax}{4}.$$
% Combining the condition in Lemma \ref{lemma: counter property of binary mechanism} with respect to $\visitxatotalbin$, it holds that
% \begin{equation}\notag
%     \abr{\visitxatotalopt - \visitxatotal} \leq 
%     \abr{\visitxatotalopt - \visitxatotalbin}
%     + \abr{\visitxatotalbin - \visitxatotal}
%     \leq \frac{\confcountxax}{2}.
% \end{equation}
% Since $\visitxatotalpri = \visitxatotalopt + \frac{\confcountxax}{2}$, we have
% \begin{equation}\notag
%     \visitxatotal \leq \visitxatotalpri \leq \visitxatotal + \confcountxax.
% \end{equation}
% Finally, according to the constraints of the optimization problem, we have $\visitxatotalbin = \sum_{\state^\prime\in\statespace_{\horizon+1}}\visitxaxtotalbin$, then
% \begin{equation}\notag
%     \visitxatotalpri = \sum_{\state^\prime\in\statespace_{\horizon+1}} \visitxaxtotalpri.
% \end{equation}

\textbf{Private loss in full-information setting.}
In the full-information setting, we have constructed a variant of the Binary Mechanism (Algorithm \ref{algo: Private Counter (PC) for losscum}) to obtain $\losscumpri$ in Section \ref{subsec: A Variant of the Binary Mechanism}.
% Facing with one data stream $\cbr{\datas_\episode}_{\episode=1}^\episodetotal$ where $\datas_\episode\in[0,1]$, in the classical $\episodetotal-$bounded binary mechanism, any partial sum $\widetilde{\Sigma}_\episode = \sum_{i=1}^\episode \datas_i$ is injected a sum of i.i.d. Laplace variable samples, and the number of the samples is the size of the minimum tree nodes that can {\color{red}comput} this partial sum, and is $\log\episodetotal$ at most.
Since the cumulative loss $\losscum$ is privatized by this variant with privacy budget $\frac{\pripara}{3\horizontotal}$, Lemma~\ref{lemma: Guarantees of the Variant of Binary Mechanism} implies that, for all $\rbr{\episode,\state,\action}$, the injected noise $\noise$ is a sample of the sum of $\lceil \log\episodetotal \rceil$ i.i.d.\ Laplace variables with parameter $\frac{3\horizontotal\log\episodetotal}{\pripara}$.
Therefore, for any episode $\episode$, when $\episodetotal>\sqrt{\statesize\actionsize}$, we have $\expect\sbr{\max_{\state,\action} \noise - \min_{\state,\action} \noise} \leq \cO\rbr{\frac{\horizontotal}{\pripara}\sqrt{\log^3\episodetotal \ln\rbr{\statesize\actionsize}}}$, which follows the maxima property of the sum of i.i.d.~Laplace variables as demonstrated in Lemma \ref{lemma: Maxima of Laplace Variables}.

\textbf{Private loss in bandit-feedback setting.} 
Under the bandit setting, we directly apply the Laplace Mechanism, and the noise injected is $\noise = \losspri - \loss_\episode\rbr{\state,\action}\II_\episode\rbr{\state,\action}$, where $\noise \sim \lap{3\horizontotal/\pripara}$.
Then using the concentration of the Laplace variable (refer to Lemma \ref{lemma: Concentration of Laplace variable}) and a union bound, we can conclude that with probability at least $1-\delta$, $\abr{\losspri - \loss_\episode\rbr{\state,\action}\II_\episode\rbr{\state,\action}} \leq \frac{3\horizontotal}{\pripara} \ln\rbr{\frac{\statesize\actionsize\episodetotal}{\delta}}$ for all $\rbr{\episode,\state,\action}$.
\end{proof}

\begin{proof}[Proof of Theorem \ref{crl: Regret under JDP}]
    With the $\pripara$-DP guarantee for private counters and losses, the release of all $\policy_\episode$ is also $\pripara$-DP according to the post-processing property under both full-information and bandit settings.
    Finally, the guarantee of $\pripara$-JDP for final action sequences follows the Billboard Lemma \cite[Lemma 9]{hsu2014private}.
    Besides, the regret bound is obtained by plugging $\confcountxax$, and corresponding $\conflossf,\ninterval$ in Lemma \ref{lemma: Properties of Central-PRIVATIZER} into Theorem \ref{thm: Regret bound of Private UC-O-REPS} and Theorem \ref{thm: Regret bound of Private UOB-LBPS}.
\end{proof}


\begin{proof}[Proof of Lemma \ref{lemma: Properties of Local-PRIVATIZER}]
The privacy guarantee directly results from properties of Laplace Mechanism and composition of DP \citep{dwork2014algorithmic}, and Lemma \ref{lemma: Privacy Guarantees of Bandit Losses}.
For utility analysis, we also analyze the visitation counters and losses as below.

\textbf{Private visitation counters.} According to \cite[Corollary 12.4]{dwork2014algorithmic} and a union bound, with probability at least $1-2\delta$, for all $\rbr{\episode,\state,\action,\state^\prime}$,
\begin{equation}
\small
\begin{aligned}
    \abr{\visitxaxtotalbin - \visitxaxtotal} \leq \cO\rbr{\frac{3\horizontotal}{\pripara}\sqrt{\episodetotal \log\rbr{ \frac{\statesize^2\actionsize\episodetotal}{\delta}}}}, \quad \abr{\visitxatotalbin - \visitxatotal } \leq \cO\rbr{\frac{3\horizontotal}{\pripara} \sqrt{\episodetotal \log\rbr{ \frac{\statesize\actionsize\episodetotal}{\delta}}}}.
\end{aligned}
\end{equation}
Together with Lemma \ref{lemma: counter property of binary mechanism}, the Local Privatizer satisfies Assumption \ref{assp: private counts} with $\confcountxax = \cO\rbr{\frac{\horizontotal}{\pripara}\sqrt{\episodetotal \log\rbr{ \frac{\statesize\actionsize\episodetotal}{\delta}}}}$.

\textbf{Private loss in full-information setting.} 
Under the full-information setting, we apply the Laplace Mechanism with privacy budget $\pripara'=\frac{\pripara}{3\horizontotal}$ directly to the point-wise loss $\loss_\episode\rbr{\state,\action}$ and sum the privatized losses to obtain $\losscumpri$.
Thus, for all $\rbr{\episode,\state,\action}$, the injected noise $\noise$ is a realization of a sum of $\episodetotal$ i.i.d.\ Laplace variables.
Therefore, for any episode $\episode$, when $\episodetotal>\frac{\ln\rbr{\statesize\actionsize}}{2}$, $\expect\sbr{\max_{\state,\action} \noise - \min_{\state,\action} \noise} \leq \cO\rbr{\frac{\horizontotal}{\pripara} \sqrt{\episodetotal\ln\rbr{\statesize\actionsize}}}$, which follows Lemma \ref{lemma: Maxima of Laplace Variables}.

\textbf{Private loss in bandit-feedback setting.} Under the bandit setting, the utility is the same as in the JDP setting.
\end{proof}
\begin{proof}[Proof of Theorem \ref{crl: Regret under LDP}]
    % Similar to the JDP setting, with $\pripara$-LDP guarantee for private counters and losses, the release of all $\policy_\episode$ and policy outputs satisfies $\pripara$-LDP according to post-processing under both full-information and bandit settings.
    The regret bound is obtained by plugging $\confcountxax$, and corresponding $\conflossf,\ninterval$ in Lemma \ref{lemma: Properties of Local-PRIVATIZER} into Theorem \ref{thm: Regret bound of Private UC-O-REPS} and Theorem \ref{thm: Regret bound of Private UOB-LBPS}.
\end{proof}
