\documentclass{article}

\usepackage[margin=1.1in]{geometry} 
  

% \usepackage[utf8]{inputenc} % allow utf-8 input
\usepackage[T1]{fontenc}  

% Reviewer mark-up: typeset the argument in colour. The extra brace pair keeps
% the colour change local to the argument.
% NOTE(review): \color needs color/xcolor, which is not loaded in this file --
% presumably style.sty provides it; confirm.
\newcommand{\remind}[1]{%
  {\color{blue}#1}%
}
\newcommand{\issue}[1]{%
  {\color{red}#1}%
}

\usepackage{style}
 
% \usepackage{refcheck}
\usepackage{mathrsfs}
\usepackage{comment}
\usepackage{booktabs}       % professional-quality tables
\usepackage{amsthm,amsmath,amssymb}
\usepackage{subfigure}
\usepackage{appendix}
\usepackage{xargs}

\usepackage{longtable}

%%%%%%%%%%  Notation  %%%%%%%%%%%%%
% --- MDP model ---
% State space, its size, and a generic state.
\newcommand{\statespace}{\mathcal{X}}
\newcommand{\statesize}{X}
\newcommand{\state}{x}
% Action space, its size, and a generic action.
\newcommand{\actionspace}{\mathcal{A}}
\newcommand{\actionsize}{A}
\newcommand{\action}{a}
% Triple (state, action, next state). \rbr is not defined in this file
% (presumably provided by style.sty -- confirm).
\newcommand{\sas}{\rbr{\state,\action,\state^\prime}}
% State-action-state space at horizon h. \horizon is defined further down the
% preamble, which is fine because the body is only expanded at use time.
\newcommand{\sashspace}{\statespace_{\horizon} \times \actionspace \times \statespace_{\horizon+1}}

% --- Transition kernels ---
% Long forms carry the fixed arguments (x' | x, a); the "easy" forms are the
% bare symbol.
% Set of acceptable transitions.
\newcommand{\transspace}{\mathcal{P}}
% Transition.
\newcommand{\trans}{P(x^{\prime}\vert x,a)}
\newcommand{\transeasy}{P}
% Transition estimated by true counts.
\newcommand{\transest}{\bar{P}(x^{\prime}\vert x,a)}
\newcommand{\transesteasy}{\bar{P}}
% Transition estimated by private counts.
\newcommand{\transpri}{\widetilde{P}(x^{\prime}\vert x,a)}
\newcommand{\transprieasy}{\widetilde{P}}

% --- Episodes and horizon ---
% Episode index and total number of episodes.
\newcommand{\episode}{k}
\newcommand{\episodetotal}{K}
% Horizon index and total horizon.
\newcommand{\horizon}{h}
\newcommand{\horizontotal}{H}
% Total number of time steps (= HK).
\newcommand{\timetotal}{T}

% --- Per-step losses (long forms fix the indices h, k and the arguments x, a) ---
% Private loss.
\newcommand{\losspri}{\widetilde{\ell}_h^k(x,a)}
\newcommand{\lossprieasy}{\widetilde{\ell}}
% Estimated loss.
\newcommand{\lossest}{\widehat{\ell}_h^k(x,a)}
\newcommand{\lossesteasy}{\widehat{\ell}}

% --- State-action cumulative losses ---
% Long forms fix the indices h, k and the arguments x, a; "easy" forms are the
% bare symbol.
% Cumulative loss.
\newcommand{\losscum}{L_h^k(x,a)}
\newcommand{\losscumeasy}{L}
% Cumulative loss from the binary mechanism.
\newcommand{\losscumbin}{\widehat{L}_h^k(x,a)}
\newcommand{\losscumbineasy}{\widehat{L}}
% Private cumulative loss.
\newcommand{\losscumpri}{\widetilde{L}_h^k(x,a)}
\newcommand{\losscumprieasy}{\widetilde{L}}
% Policy.
\newcommand{\policy}{\pi}
% Value function of one episode.
\newcommand{\valuef}{V}
% Regret, with subscript K.
\newcommand{\regret}{\mathcal{R}_K}
% Regret for noises. The subscript is the plain letter K so that it matches
% \regret; it used to be a macro \cK that this file never defines.
% NOTE(review): if a calligraphic K was intended, restore it as \mathcal{K}.
\newcommand{\nregret}{\widetilde{\mathcal{R}}_K}

% --- Algorithm parameters: visit counters ---
% Long forms fix the indices h, k and the arguments (x,a) or (x,a,x');
% "easy" forms are the bare symbol. All of these are zero-argument macros, so
% they are uniformly defined with plain \newcommand (no xargs \newcommandx).

% Total visit counts of a state-action pair / state-action-state triple.
\newcommand{\visitxatotal}{N_h^k(x,a)}
\newcommand{\visitxatotaleasy}{N}
\newcommand{\visitxaxtotal}{N_h^k(x,a,x^{\prime})}
\newcommand{\visitxaxtotaleasy}{N}

% Total counts produced by the binary mechanism.
\newcommand{\visitxatotalbin}{\widehat{N}_h^k(x,a)}
\newcommand{\visitxatotalbineasy}{\widehat{N}}
\newcommand{\visitxaxtotalbin}{\widehat{N}_h^k(x,a,x^{\prime})}
\newcommand{\visitxaxtotalbineasy}{\widehat{N}}

% Total counts after the optimization step.
\newcommand{\visitxatotalopt}{\bar{N}_h^k(x,a)}
\newcommand{\visitxatotalopteasy}{\bar{N}}
\newcommand{\visitxaxtotalopt}{\bar{N}_h^k(x,a,x^{\prime})}
\newcommand{\visitxaxtotalopteasy}{\bar{N}}

% Private total counts.
\newcommand{\visitxatotalpri}{\widetilde{N}_h^k(x,a)}
\newcommand{\visitxatotalprieasy}{\widetilde{N}}
\newcommand{\visitxaxtotalpri}{\widetilde{N}_h^k(x,a,x^{\prime})}
\newcommand{\visitxaxtotalprieasy}{\widetilde{N}}
% --- Confidence events ---
% Bad event: loss out of confidence.
\newcommand{\badeventloss}{F^l}
% Bad event: transition out of confidence.
\newcommand{\badeventtrans}{F^p}
% Overall good event and overall bad event.
\newcommand{\goodevent}{G}
\newcommand{\badevent}{\bar{G}}

% --- Occupancy measures and FTRL ---
% Occupancy measure, its virtual counterpart, and a middle (auxiliary) variable.
\newcommand{\occmeasure}{q}
\newcommand{\occmeasvir}{\widehat{q}}
\newcommand{\occmeasmid}{q^\prime}
% Transition and policy induced by an occupancy measure.
\newcommand{\transbyocc}{P^q(x^\prime\vert x,a)}
\newcommand{\policybyocc}{\pi^q}
% Set of acceptable occupancy measures for a given transition.
\newcommand{\occmeasureset}{\Delta}
% FTRL parameter.
\newcommand{\FTRLpara}{\eta}
% Regularizer \psi applied to #1.
\newcommand{\regularizer}[1]{\psi(#1)}
% Divergence \mathcal{D}_\psi between #1 and #2.
\newcommand{\divg}[2]{\mathcal{D}_{\psi}(#1\Vert#2)}


% --- Privacy parameters and confidence radii ---
% DP parameter.
\newcommand{\pripara}{\varepsilon}
% Binary tree.
\newcommand{\tree}{\mathcal{B}}
\newcommand{\noisecount}{E_{\episodetotal}}
% Confidence radius of the loss noise in the full-information setting.
\newcommand{\conflossf}{E_{\varepsilon}}
% Confidence radius for state-action counts, i.e. N(x,a)-\tilde{N}(x,a).
\newcommand{\confcountxa}{E_{\varepsilon,\delta}}
% Confidence radius for state-action-state counts, i.e. N(x,a,x)-\tilde{N}(x,a,x).
% NOTE(review): expands to the same symbol as \confcountxa -- confirm intended.
\newcommand{\confcountxax}{E_{\varepsilon,\delta}}
% Confidence radius for the transition in L1 norm,
% i.e. |P(\cdot\vert x,a)-\tilde{P}(\cdot\vert x,a)|_1.
\newcommand{\confnormtrans}{\beta_h^{k,p}}
% Pointwise confidence radius for the transition,
% i.e. |P(x^\prime\vert x,a)-\tilde{P}(x^\prime\vert x,a)|.
\newcommand{\confpwtrans}{\beta}
% Constants appearing in the L1-norm and pointwise radii, respectively.
\newcommand{\confconstant}{L_p(\delta)}
\newcommand{\confconstpw}{L_p^\prime(\delta)}
% L1 estimation error of the transition.
\newcommand{\normtranserror}{\xi}
% --- Data streams and noise ---
% Data stream and its private counterpart.
\newcommand{\datas}{\sigma}
\newcommand{\dataspri}{\widetilde{\sigma}}
% Noise added to the real sum (long form fixes h, k, x, a) and its bare symbol.
\newcommand{\noise}{Z_h^k(x,a)}
\newcommand{\noiseeasy}{Z}
% "Lap" applied to #1 (presumably the Laplace distribution). The name is set
% with \mathrm so it stays upright in italic contexts such as theorem bodies;
% \text would pick up the surrounding text font there. \rbr is not defined in
% this file (presumably provided by style.sty -- confirm).
\newcommand{\lap}[1]{\mathrm{Lap}\rbr{#1}}
% Noise interval parameter to ignore.
\newcommand{\ninterval}{c}
% --- Base algorithm, Lagrangian and dual variables ---
% Base non-private algorithm for the bandit setting.
\newcommand{\basealgo}{\Lambda}
% Lagrange function.
\newcommand{\lag}{\mathcal{L}}
% Dual variable of the occupancy sum.
\newcommand{\dualvsum}{\lambda}
% Dual variable of the occupancy-measure middle relation.
\newcommand{\dualvmid}{\nu}
% Dual variables of the positive and negative transition error.
\newcommand{\dualvtrp}{\mu^{+}}
\newcommand{\dualvtrn}{\mu^{-}}
% Dual variable of the transition error sum bound.
\newcommand{\dualvtr}{\mu}
% Middle variable used when solving for the dual variables.
\newcommand{\vsdual}{S}
% Middle variable of the Bellman error in the dual computation.
\newcommand{\vbellman}{B}




% --- Users, agent and trajectories ---
% User space, a single user, and a user sequence.
\newcommand{\userspace}{\mathcal{U}}
\newcommand{\user}{u}
\newcommand{\userseq}{U}
% RL agent algorithm.
\newcommand{\Agent}{\mathcal{M}}
% Random event history; the optional argument is the subscript and defaults to t.
\newcommand{\history}[1][t]{\mathcal{H}_{#1}}
% One trajectory of one episode.
% NOTE(review): expands to the same letter S as \vsdual -- confirm intended.
\newcommand{\traj}{S}
% Trajectory set of one episode.
\newcommand{\trajset}{\mathcal{S}}

% --- Probability ---
% Probability.
\newcommand{\prob}{\mathbb{P}}
% \newcommand{\distb}[0]{\mathbb{D}} %distribution
% Expectation.
\newcommand{\expect}{\mathbb{E}}
% Distance.
\newcommand{\distc}{R}


%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

% Author mark-up helpers: print the argument in red (\sj) or blue (\todo).
% NOTE(review): \textcolor needs color/xcolor, which is not loaded in this
% file -- presumably style.sty provides it; confirm.
\newcommand{\sj}[1]{\textcolor{red}{#1}}
\newcommand{\todo}[1]{\textcolor{blue}{#1}}



% Override \maketag@@@ (presumably the amsmath internal that typesets equation
% tags -- confirm) so that the tag is boxed with \normalfont\normalsize, i.e.
% the tag's font and size do not follow whatever is active around the equation.
% \makeatletter/\makeatother are needed because the macro name contains @.
\makeatletter
\def\maketag@@@#1{\hbox{\m@th\normalfont\normalsize#1}} 
\makeatother
 
% Title block: author and title are declared, then printed by \maketitle.
% \author{Shaojie Bai \and } 
% NOTE(review): \text is used outside math mode for the e-mail address;
% \texttt or \url may be the intended markup -- confirm.
\author{Shaojie Bai\\
	\text{white.shaojie@gmail.com}
}
\begin{document}

\title{Private Online Reinforcement Learning}

% The date argument is left empty on purpose.
\date{}
\maketitle
   
 


\begin{abstract}
This paper studies online reinforcement learning with differential privacy guarantees under both the full-information and the bandit-information settings.

\end{abstract}


% Main body: one \input per section file, in reading order.
\input{1-introduction} 

\input{2-preliminary}

% NOTE(review): the next two file names contain a space; this may not resolve
% on every TeX distribution -- confirm, or rename the files.
\input{3-Full-Information Setting}

\input{4-Bandit-Information Setting}

\input{5-guarantee}

\input{6-Experiments}

\input{7-conclusion}

% References start on a fresh page.
\newpage

\bibliographystyle{ims}
% \bibliographystyle{unsrt}
\bibliography{main}

% The appendix is input after the bibliography.
\input{8-appendix}

\end{document}
