% \documentclass{uai2024} % for initial submission
\documentclass[accepted]{uai2024} % after acceptance, for a revised version; 
% also before submission to see how the non-anonymous paper would look like 
                        
%% There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2024} % ptmx math instead of Computer
                                         % Modern (has noticeable issues)
% \documentclass[mathfont=newtx]{uai2024} % newtx fonts (improves upon
                                          % ptmx; less tested, no support)
% NOTE: Only keep *one* line above as appropriate, as it will be replaced
%       automatically for papers to be published. Do not make any other
%       change above this note for an accepted version.

%% Choose your variant of English; be consistent
\usepackage[american]{babel}
% \usepackage[british]{babel}

%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} % amsmath with fixes and additions
% \usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{tikz} % nice language for creating drawings and diagrams

%% Provided macros
% \smaller: Because the class footnote size is essentially LaTeX's \small,
%           redefining \footnotesize, we provide the original \footnotesize
%           using this macro.
%           (Use only sparingly, e.g., in drawings, as it is quite small.)

%% Self-defined macros
% \swap[<sep>]{<a>}{<b>} typesets <b><sep><a>: the two mandatory arguments
% in reverse order around the optional separator (default: -).
\newcommand{\swap}[3][-]{#3#1#2} % just an example


%%%%%%%%%%%%%
%----- bold fonts -----%

% Upright (roman) lowercase math letters: \<letter>b expands to \mathrm{<letter>}.
% Despite the section header above, these are NOT bold.
% Irregular names: \bbb (b), \cbb (c), \nbb (n) and \sbb (s).
\newcommand*{\ab}{\mathrm{a}}
\newcommand*{\bbb}{\mathrm{b}}
\newcommand*{\cbb}{\mathrm{c}}
\newcommand*{\db}{\mathrm{d}}
\newcommand*{\eb}{\mathrm{e}}
\newcommand*{\fb}{\mathrm{f}}
\newcommand*{\gb}{\mathrm{g}}
\newcommand*{\hb}{\mathrm{h}}
\newcommand*{\ib}{\mathrm{i}}
\newcommand*{\jb}{\mathrm{j}}
\newcommand*{\kb}{\mathrm{k}}
\newcommand*{\lb}{\mathrm{l}}
\newcommand*{\mb}{\mathrm{m}}
\newcommand*{\nbb}{\mathrm{n}}
\newcommand*{\ob}{\mathrm{o}}
\newcommand*{\pb}{\mathrm{p}}
\newcommand*{\qb}{\mathrm{q}}
\newcommand*{\rb}{\mathrm{r}}
\newcommand*{\sbb}{\mathrm{s}}
\newcommand*{\tb}{\mathrm{t}}
\newcommand*{\ub}{\mathrm{u}}
\newcommand*{\vb}{\mathrm{v}}
\newcommand*{\wb}{\mathrm{w}}
\newcommand*{\xb}{\mathrm{x}}
\newcommand*{\yb}{\mathrm{y}}
\newcommand*{\zb}{\mathrm{z}}

% Bold italic lowercase math letters: \b<letter> expands to \bm{<letter>}.
% Irregular names: \bbf (f) and \bbm (m).
\newcommand{\ba}{\bm{a}}
\newcommand{\bb}{\bm{b}}
\newcommand{\bc}{\bm{c}}
\newcommand{\bd}{\bm{d}}
\newcommand{\be}{\bm{e}}
\newcommand{\bbf}{\bm{f}}
\newcommand{\bg}{\bm{g}}
\newcommand{\bh}{\bm{h}}
% Was \bmf{i}; \bmf is not defined anywhere in this preamble, so using \bi
% raised "Undefined control sequence".
\newcommand{\bi}{\bm{i}}
\newcommand{\bj}{\bm{j}}
\newcommand{\bk}{\bm{k}}
\newcommand{\bl}{\bm{l}}
\newcommand{\bbm}{\bm{m}}
\newcommand{\bn}{\bm{n}}
\newcommand{\bo}{\bm{o}}
\newcommand{\bp}{\bm{p}}
\newcommand{\bq}{\bm{q}}
\newcommand{\br}{\bm{r}}
\newcommand{\bs}{\bm{s}}
\newcommand{\bt}{\bm{t}}
\newcommand{\bu}{\bm{u}}
\newcommand{\bv}{\bm{v}}
\newcommand{\bw}{\bm{w}}
\newcommand{\bx}{\bm{x}}
\newcommand{\by}{\bm{y}}
\newcommand{\bz}{\bm{z}}




% Upright (roman) uppercase math letters: \<Letter>b expands to \mathrm{<Letter>}.
% Irregular name: \Sbb (S).
\newcommand*{\Ab}{\mathrm{A}}
\newcommand*{\Bb}{\mathrm{B}}
\newcommand*{\Cb}{\mathrm{C}}
\newcommand*{\Db}{\mathrm{D}}
\newcommand*{\Eb}{\mathrm{E}}
\newcommand*{\Fb}{\mathrm{F}}
\newcommand*{\Gb}{\mathrm{G}}
\newcommand*{\Hb}{\mathrm{H}}
\newcommand*{\Ib}{\mathrm{I}}
\newcommand*{\Jb}{\mathrm{J}}
\newcommand*{\Kb}{\mathrm{K}}
\newcommand*{\Lb}{\mathrm{L}}
\newcommand*{\Mb}{\mathrm{M}}
\newcommand*{\Nb}{\mathrm{N}}
\newcommand*{\Ob}{\mathrm{O}}
\newcommand*{\Pb}{\mathrm{P}}
\newcommand*{\Qb}{\mathrm{Q}}
\newcommand*{\Rb}{\mathrm{R}}
\newcommand*{\Sbb}{\mathrm{S}}
\newcommand*{\Tb}{\mathrm{T}}
\newcommand*{\Ub}{\mathrm{U}}
\newcommand*{\Vb}{\mathrm{V}}
\newcommand*{\Wb}{\mathrm{W}}
\newcommand*{\Xb}{\mathrm{X}}
\newcommand*{\Yb}{\mathrm{Y}}
\newcommand*{\Zb}{\mathrm{Z}}

% Bold italic uppercase math letters: \b<Letter> expands to \bm{<Letter>}.
\newcommand*{\bA}{\bm{A}}
\newcommand*{\bB}{\bm{B}}
\newcommand*{\bC}{\bm{C}}
\newcommand*{\bD}{\bm{D}}
\newcommand*{\bE}{\bm{E}}
\newcommand*{\bF}{\bm{F}}
\newcommand*{\bG}{\bm{G}}
\newcommand*{\bH}{\bm{H}}
\newcommand*{\bI}{\bm{I}}
\newcommand*{\bJ}{\bm{J}}
\newcommand*{\bK}{\bm{K}}
\newcommand*{\bL}{\bm{L}}
\newcommand*{\bM}{\bm{M}}
\newcommand*{\bN}{\bm{N}}
\newcommand*{\bO}{\bm{O}}
\newcommand*{\bP}{\bm{P}}
\newcommand*{\bQ}{\bm{Q}}
\newcommand*{\bR}{\bm{R}}
\newcommand*{\bS}{\bm{S}}
\newcommand*{\bT}{\bm{T}}
\newcommand*{\bU}{\bm{U}}
\newcommand*{\bV}{\bm{V}}
\newcommand*{\bW}{\bm{W}}
\newcommand*{\bX}{\bm{X}}
\newcommand*{\bY}{\bm{Y}}
\newcommand*{\bZ}{\bm{Z}}


%----- calligraphic fonts -----%

% Calligraphic uppercase letters: \c<Letter> expands to \mathcal{<Letter>}.
% \cS and \cT keep the extra brace group they had originally.
\newcommand*{\cA}{\mathcal{A}}
\newcommand*{\cB}{\mathcal{B}}
\newcommand*{\cC}{\mathcal{C}}
\newcommand*{\cD}{\mathcal{D}}
\newcommand*{\cE}{\mathcal{E}}
\newcommand*{\cF}{\mathcal{F}}
\newcommand*{\cG}{\mathcal{G}}
\newcommand*{\cH}{\mathcal{H}}
\newcommand*{\cI}{\mathcal{I}}
\newcommand*{\cJ}{\mathcal{J}}
\newcommand*{\cK}{\mathcal{K}}
\newcommand*{\cL}{\mathcal{L}}
\newcommand*{\cM}{\mathcal{M}}
\newcommand*{\cN}{\mathcal{N}}
\newcommand*{\cO}{\mathcal{O}}
\newcommand*{\cP}{\mathcal{P}}
\newcommand*{\cQ}{\mathcal{Q}}
\newcommand*{\cR}{\mathcal{R}}
\newcommand*{\cS}{{\mathcal{S}}}
\newcommand*{\cT}{{\mathcal{T}}}
\newcommand*{\cU}{\mathcal{U}}
\newcommand*{\cV}{\mathcal{V}}
\newcommand*{\cW}{\mathcal{W}}
\newcommand*{\cX}{\mathcal{X}}
\newcommand*{\cY}{\mathcal{Y}}
\newcommand*{\cZ}{\mathcal{Z}}




%----- blackboard bold fonts-----%
% Blackboard-bold letters: a doubled letter expands to \mathbb{<Letter>}
% (\SSS is the one tripled name).
\newcommand*{\CC}{\mathbb{C}}
\newcommand*{\EE}{\mathbb{E}}
\newcommand*{\VV}{\mathbb{V}}
\newcommand*{\II}{\mathbb{I}}
\newcommand*{\KK}{\mathbb{K}}
\newcommand*{\LL}{\mathbb{L}}
\newcommand*{\MM}{\mathbb{M}}
\newcommand*{\NN}{\mathbb{N}}
\newcommand*{\PP}{\mathbb{P}}
\newcommand*{\QQ}{\mathbb{Q}}
\newcommand*{\RR}{\mathbb{R}}
\newcommand*{\SSS}{\mathbb{S}}
\newcommand*{\ZZ}{\mathbb{Z}}
\newcommand*{\XX}{\mathbb{X}}
\newcommand*{\YY}{\mathbb{Y}}
% NOTE(review): the amssymb \mathbb alphabet presumably covers uppercase Latin
% letters only, so this may not render as a blackboard-bold Omega -- confirm
% the output before relying on \OOmega.
\newcommand*{\OOmega}{\mathbb{\Omega}}




%----- bold greek fonts -----%

% Bold Greek letters: \b<name> expands to \bm{\<name>} unless noted below.
\newcommand{\balpha}{\bm{\alpha}}
\newcommand{\bbeta}{\bm{\beta}}
\newcommand{\bgamma}{\bm{\gamma}}
\newcommand{\bdelta}{\bm{\delta}}
\newcommand{\bepsilon}{\bm{\epsilon}}
\newcommand{\bvarepsilon}{\bm{\varepsilon}}
\newcommand{\bzeta}{\bm{\zeta}}
% NOTE(review): \btheta is the only entry built with \pmb instead of \bm.
% Left unchanged in case it is deliberate -- confirm.
\newcommand{\btheta}{\pmb{\theta}}
\newcommand{\bvartheta}{\bm{\vartheta}}
\newcommand{\bkappa}{\bm{\kappa}}
\newcommand{\blambda}{\bm{\lambda}}
\newcommand{\bmu}{\bm{\mu}}
\newcommand{\bnu}{\bm{\nu}}
\newcommand{\bxi}{\bm{\xi}}
\newcommand{\bpi}{\bm{\pi}}
\newcommand{\bvarpi}{\bm{\varpi}}
% NOTE(review): \brho maps to \varrho, not \rho. Left unchanged -- confirm.
\newcommand{\brho}{\bm{\varrho}}
\newcommand{\bsigma}{\bm{\sigma}}
\newcommand{\bvarsigma}{\bm{\varsigma}}
\newcommand{\btau}{\bm{\tau}}
\newcommand{\bupsilon}{\bm{\upsilon}}
\newcommand{\bphi}{\bm{\phi}}
\newcommand{\bvarphi}{\bm{\varphi}}
\newcommand{\bchi}{\bm{\chi}}
\newcommand{\bpsi}{\bm{\psi}}
% The original definition had trailing spaces and a line break inside the
% braces, which put stray space tokens into the replacement text.
\newcommand{\bomega}{\bm{\omega}}

% Bold uppercase Greek letters.
\newcommand{\bGamma}{\bm{\Gamma}}
\newcommand{\bDelta}{\bm{\Delta}}
\newcommand{\bTheta}{\bm{\Theta}}
\newcommand{\bThetas}{\bm{\Theta}^*} % bold Theta with a star superscript
\newcommand{\bLambda}{\bm{\Lambda}}
\newcommand{\bXi}{\bm{\Xi}}
\newcommand{\bPi}{\bm{\Pi}}
\newcommand{\bSigma}{\bm{\Sigma}}
\newcommand{\bUpsilon}{\bm{\Upsilon}}
\newcommand{\bPhi}{\bm{\Phi}}
\newcommand{\bPsi}{\bm{\Psi}}
\newcommand{\bOmega}{\bm{\Omega}}


% Upright multi-letter names. \mathrm is used rather than \text because \text
% inherits the surrounding text font, so the names would turn italic inside
% italic text such as theorem bodies.
\newcommand{\vect}{\mathrm{vec}}
\newcommand{\Card}{\mathrm{Card}}
\newcommand{\rank}{\mathrm{rank}}
\newcommand{\row}{\mathrm{row}}
\newcommand{\col}{\mathrm{col}}

% Accented shorthands. Prefix t = \tilde, h = \hat, o = \bar;
% \thpi and \htpi stack both accents on \pi (outer accent named first).
\newcommand{\tTheta}{\tilde{\bTheta}}
\newcommand{\tdelta}{\tilde{\delta}}
\newcommand{\oR}{\bar{R}}
\newcommand{\ocR}{\bar{\cR}}
\newcommand{\oP}{\bar{P}}
\newcommand{\ocP}{\bar{\cP}}
\newcommand{\oV}{\bar{V}}
\newcommand{\oQ}{\bar{Q}}
\newcommand{\tQ}{\tilde{Q}}
\newcommand{\hQ}{\hat{Q}}
\newcommand{\tpi}{\tilde{\pi}}
\newcommand{\hpi}{\hat{\pi}}
\newcommand{\thpi}{\tilde{\hat{\pi}}}
\newcommand{\htpi}{\hat{\tilde{\pi}}}




%----- Some standard definitions -----%

% Operator-style names wrapped in \mathop, so they get operator spacing and
% take their subscripts as limits in display style.
\newcommand{\argmin}{\mathop{\mathrm{argmin}}}
\newcommand{\argmax}{\mathop{\mathrm{argmax}}}
\newcommand{\argmaxmin}{\mathop{\mathrm{argmaxmin}}}
\newcommand{\minimize}{\mathop{\mathrm{minimize}}}

\newcommand{\sign}{\mathop{\mathrm{sign}}}
\newcommand{\tr}{\mathop{\mathrm{Tr}}}

% \DeclareMathOperator already sets the name in the upright operator font,
% so the obsolete \rm switches have been dropped. The starred form places
% subscripts as limits, exactly as in the original declarations.
\DeclareMathOperator{\Var}{Var}
\DeclareMathOperator*{\Cor}{Corr}
\DeclareMathOperator*{\Cov}{Cov}
% Indicator. The original used \mathds (dsfont package), which this preamble
% never loads; \mathbbm comes from bbm, which is loaded further down.
\DeclareMathOperator*{\ind}{\mathbbm{1}}
% \smallfrac{a}{b}: a fraction forced to text-style size.
\newcommand{\smallfrac}[2]{{\textstyle \frac{#1}{#2}}}

% Bold 0 and bold 1.
\newcommand*{\zero}{{\bm{0}}}
\newcommand*{\one}{{\bm{1}}}

% Upright "diag", kept as a plain braced group (not an operator) as before.
\newcommand{\diag}{{\mathrm{diag}}}



%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%


%%%%% Norms

% \norm{x}: norm of x at normal size. Uses the amsmath double-bar delimiters
% \lVert/\rVert instead of two adjacent ordinary | characters.
\newcommand{\norm}[1]{\lVert#1\rVert}
% \bignorm{x}: the same with \bigg-sized delimiters.
\newcommand{\bignorm}[1]{\biggl\lVert#1\biggr\rVert}
% \opnorm{x}{p}: triple-bar norm of x with subscript p (bars pulled together
% with negative thin spaces).
\newcommand{\opnorm}[2]{| \! | \! | #1 | \! | \! |_{{#2}}}

%%%%% Dot product
% \dotp{a}{b}: <a,b> with fixed-size angle brackets.
\newcommand{\dotp}[2]{\langle{#1},{#2}\rangle}

%%%%  brackets
% Auto-sized (\left ... \right) delimiters around the argument(s):
%   \inner{a}{b}  angle brackets around a,b
%   \rbr  round ( )     \sbr  square [ ]     \cbr  curly { }
%   \nbr  double bars (norm)                 \abr  single bars (absolute value)
\newcommand{\inner}[2]{\left\langle #1,#2 \right\rangle}
\newcommand{\rbr}[1]{\left(#1\right)}
\newcommand{\sbr}[1]{\left[#1\right]}
\newcommand{\cbr}[1]{\left\{#1\right\}}
\newcommand{\nbr}[1]{\left\|#1\right\|}
\newcommand{\abr}[1]{\left|#1\right|}

%%%%%%%%%  Other commands

% \mcomment{text}: tiny note in the margin; \fcomment{text}: tiny footnote.
\newcommand{\mcomment}[1]{\marginpar{\tiny{#1}}}
\newcommand{\fcomment}[1]{\footnote{\tiny{#1}}}
% \overbar{x}: \overline whose bar is shortened by 1.5mu on each side
% (the commented-out variant below uses 2mu).
%\newcommand{\overbar}[1]{\mkern 2mu\overline{\mkern-2mu#1\mkern-2mu}\mkern 2mu}
\newcommand{\overbar}[1]{\mkern 1.5mu\overline{\mkern-1.5mu#1\mkern-1.5mu}\mkern 1.5mu}
% Upright "d" (presumably the differential -- confirm usage).
\newcommand{\ud}{{\mathrm{d}}}


%%%%%%%% Zhuoran Yang's Modifications %%%%%%%%

% \Sc / \Ac: calligraphic S / A with a \perp superscript.
\newcommand{\Sc}{\cS^{\perp}}
\newcommand{\Ac}{\cA^{\perp}}
\newcommand{\supp}{\mathrm{supp}} %support
% NOTE(review): \tx and \tX are identical (both tilde over bold capital X).
% \tx may have been meant as \tilde{\bx} -- left unchanged, confirm.
\newcommand{\tx}{\tilde{\bX}}
\newcommand{\tX}{\tilde{\bX}}
% Hatted / tilded / starred bold Greek symbols.
\newcommand{\hbbeta}{\hat{\bbeta}}
\newcommand{\hbdelta}{\hat{\bdelta}}
\newcommand{\tbbeta}{\tilde{\bbeta}}
\newcommand{\bbetas}{\bbeta^*}

\newcommand{\hbgamma} {\hat{\bgamma}}
% Was "\tilde \bmeta"; \bmeta is not defined anywhere in this preamble, so
% \tbdelta raised "Undefined control sequence". Following the macro name
% (and \hbdelta above) it now expands to a tilde over bold delta.
\newcommand{\tbdelta}{\tilde{\bdelta}}
\newcommand{\htheta}{\hat \btheta}

\newcommand{\hOmega}{\hat{\bOmega}}
\newcommand{\hTheta}{\hat{\bTheta}}
\newcommand{\hSigma}{\hat{\bSigma}}
\newcommand{\tSigma}{\tilde{\bSigma}}
\newcommand{\hbomega}{\hat{\bomega}}
\newcommand{\hbvartheta}{\hat \bvartheta}

% Short names for angle brackets and for the limit as n tends to infinity.
\newcommand{\la}{\langle}
\newcommand{\ra}{\rangle}
\newcommand{\limn}{\lim\limits_{n\rightarrow \infty} }

\newcommand{\loss}{\ell} %loss function

% Gradient and Hessian of the loss symbol.
\newcommand{\grad}{\nabla \loss }
\newcommand{\hess}{\nabla^2\loss}


\newcommand{\halpha}{\hat\alpha}
\newcommand{\hdelta}{\hat\delta}
\newcommand{\lalpha}{\bar{\alpha}}

%%%%%%%%  amsmath %%%%%%%%%%
% \newtheoremstyle{mytheoremstyle} % name
%     {\topsep}                    % Space above
%     {\topsep}                    % Space below
%     {\normalfont}                   % Body font
%     {}                           % Indent amount
%     {\bfseries}                   % Theorem head font
%     {.}                          % Punctuation after theorem head
%     {.5em}                       % Space after theorem head
%     {}  % Theorem head spec (can be left empty, meaning ‘normal’)

% \theoremstyle{mytheoremstyle}

% End-of-proof markers: \BlackBox is a filled 1.5ex square; \QED is a 5pt
% square (lowered by 1pt) followed by a paragraph break and \medskip.
\newcommand{\BlackBox}{\rule{1.5ex}{1.5ex}}  % end of proof

\def\QED{~\rule[-1pt]{5pt}{5pt}\par\medskip}

% Custom proof environment: unindented bold "Proof" heading, closed by a
% right-flushed \BlackBox and 2mm of extra space.
% \textbf replaces the obsolete two-letter font switch {\bf ...}.
\newenvironment{proof}{\par\noindent\textbf{Proof\ }}{\hfill\BlackBox\\[2mm]}
%\newenvironment{proof}{\emph{Proof. }}{ \hfill \QED}

% Theorem-like environments. Every environment except "property" shares the
% "theorem" counter; "property" has its own.
\newtheorem{theorem}{Theorem}
\newtheorem{example}[theorem]{Example}
\newtheorem{property}{Property}
\newtheorem{lemma}[theorem]{Lemma}
\newtheorem{proposition}[theorem]{Proposition}
\newtheorem{remark}[theorem]{Remark}
\newtheorem{corollary}[theorem]{Corollary}
\newtheorem{definition}[theorem]{Definition}
\newtheorem{conjecture}[theorem]{Conjecture}
\newtheorem{fact}[theorem]{Fact}
\newtheorem{claim}[theorem]{Claim}
\newtheorem{assumption}[theorem]{Assumption}
% Number equations and theorems within sections.
\numberwithin{equation}{section}
\numberwithin{theorem}{section}






%%%%%%%%%%   package   %%%%%%%%%%%%%%%%
% Draft annotations: \remind{text} prints text in blue, \issue{text} in red.
\newcommand{\remind}[1]{{\color{blue}#1}}
\newcommand{\issue}[1]{{\color{red}#1}}
 
% \usepackage{refcheck}
\usepackage{mathrsfs}
\usepackage{comment}
% \usepackage{amsthm,amsmath,amssymb}
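% Original note (translated): "amsmath already includes amsthm, so loading
% only the former is enough; otherwise an error is raised."
% NOTE(review): amsmath does not load amsthm. The error most likely comes from
% amsthm defining its own proof environment, which clashes with the custom
% \newenvironment{proof} defined earlier in this preamble -- confirm.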
\usepackage{amsmath,amssymb}
% \usepackage{subfigure}
\usepackage{appendix}
\usepackage{xargs}
\usepackage{graphicx}

\usepackage{caption}
\usepackage{subcaption}
\usepackage{algorithm}
\usepackage{algorithmic}
\usepackage{bbm,bm}
\usepackage{color}
% Hyperlinks for cross-references and citations
\usepackage[
% colorlinks,
% linkcolor=red,
%anchorcolor=blue
% citecolor=blue
]{hyperref}

% Allow page breaks inside multi-line display equations (amsmath);
% 4 is the most permissive level.
\allowdisplaybreaks[4]
%%%%%%%%%%  Notations %%%%%%%%%%%%%
% MDP model definitions
% Symbols for the MDP model. Names ending in "easy" are the bare symbol
% without its (x,a)-style arguments.
\newcommand*{\statespace}{\mathcal{X}} % state space
\newcommand*{\statesize}{X} % size of the state space
\newcommand*{\state}{x} % state
\newcommand*{\actionspace}{\mathcal{A}} % action space
\newcommand*{\actionsize}{A} % size of the action space
\newcommand*{\action}{a} % action
\newcommand*{\sas}{\rbr{\state,\action,\state^\prime}} % (state, action, next state) triple
\newcommand*{\uvw}{\rbr{u,v,w}} % generic triple (u,v,w)
\newcommand*{\mno}{\rbr{m,n,o}} % generic triple (m,n,o)
\newcommand*{\sasspace}{W} % state-action-state space (horizon left unspecified in the original note)
\newcommand*{\sashspace}{\statespace_{\horizon} \times \actionspace \times \statespace_{\horizon+1}} % state-action-state space at horizon h

\newcommand*{\transspace}{\mathcal{P}} % set of admissible transitions
\newcommand*{\trans}{P(x^{\prime}\vert x,a)} % transition probability
\newcommand*{\transeasy}{P} % transition
\newcommand*{\transest}{\bar{P}(x^{\prime}\vert x,a)} % transition estimated from true counts
\newcommand*{\transesteasy}{\bar{P}} % transition estimated from true counts
\newcommand*{\transpri}{\widetilde{P}(x^{\prime}\vert x,a)} % transition estimated from private counts
\newcommand*{\transprieasy}{\widetilde{P}} % transition estimated from private counts
\newcommand*{\episode}{k} % episode index
\newcommand*{\episodetotal}{K} % total number of episodes
\newcommand*{\horizon}{h} % horizon index
\newcommand*{\horizontotal}{H} % total horizon length
\newcommand*{\timetotal}{T} % total number of time steps, T = HK
\newcommand*{\locsupport}{C} % local effective support
\newcommand*{\cumlocsupport}{C_M} % cumulative local effective support
% Earlier variants carrying an explicit horizon index h, kept for reference:
% \newcommand{\losspri}[0]{\widetilde{\ell}_h^k(x,a)} %private loss
% \newcommand{\lossprieasy}[0]{\widetilde{\ell}} %private loss
% \newcommand{\lossest}[0]{\widehat{\ell}_h^k(x,a)} %estimated loss
% \newcommand{\lossesteasy}[0]{\widehat{\ell}} %estimated loss
% \newcommand{\losscum}[0]{L_h^k(x,a)} %state-action cumulative loss
% \newcommand{\losscumeasy}[0]{L} %state-action cumulative loss
% \newcommand{\losscumbin}[0]{\widehat{L}_h^k(x,a)} %state-action cumulative loss from binary mechanism
% \newcommand{\losscumbineasy}[0]{\widehat{L}} %state-action 
% \newcommand{\losscumpri}[0]{\widetilde{L}_h^k(x,a)} %state-action private cumulative loss  
% \newcommand{\losscumprieasy}[0]{\widetilde{L}} %state-action private cumulative loss  
\newcommand*{\losspri}{\tilde{\ell}_k(x,a)} % private loss
\newcommand*{\lossprieasy}{\tilde{\ell}} % private loss
\newcommand*{\lossest}{\widehat{\ell}_k(x,a)} % estimated loss
\newcommand*{\lossesteasy}{\widehat{\ell}} % estimated loss
\newcommand*{\losscum}{L_k(x,a)} % cumulative loss of a state-action pair
\newcommand*{\losscumeasy}{L} % cumulative loss of a state-action pair
\newcommand*{\losscumbin}{\ddot{L}_k(x,a)} % cumulative loss from the binary mechanism
\newcommand*{\losscumbineasy}{\ddot{L}} % cumulative loss from the binary mechanism
\newcommand*{\losscumpri}{\widetilde{L}_k(x,a)} % private cumulative loss
\newcommand*{\losscumprieasy}{\widetilde{L}} % private cumulative loss
\newcommand*{\policy}{\pi} % policy
\newcommand*{\valuef}{V} % value function of one episode
\newcommand*{\regret}{\mathcal{R}_K} % regret
% NOTE(review): the subscript here is calligraphic \cK whereas \regret uses a
% plain K -- confirm which is intended.
\newcommand*{\nregret}{\widetilde{\mathcal{R}}_{\cK}} % regret for noises

%algorithm parameters
% Visit counters. "xa" = state-action pair, "xax" = state-action-next-state
% triple; names ending in "easy" are the bare symbol without arguments.
% All entries take no arguments, so plain \newcommand* is used throughout.
\newcommand*{\visitxatotal}{N_k(x,a)} % total visits of a state-action pair
\newcommand*{\visitxatotaleasy}{N} % total visits of a state-action pair
\newcommand*{\visitxatotalhateasy}{\widehat{N}} % "hat" total visits of a state-action pair
\newcommand*{\visitxatotalhat}{\widehat{N}_k(x,a)} % "hat" total visits of a state-action pair
\newcommand*{\visitxaxtotal}{N_k(x,a,x^{\prime})} % total visits of a state-action-state triple
\newcommand*{\visitxaxtotaleasy}{N} % total visits of a state-action-state triple
\newcommand*{\visitxatotalbin}{\ddot{N}_k(x,a)} % binary-mechanism count, state-action pair
\newcommand*{\visitxatotalbineasy}{\ddot{N}} % binary-mechanism count, state-action pair
\newcommand*{\visitxaxtotalbin}{\ddot{N}_k(x,a,x^{\prime})} % binary-mechanism count, state-action-state triple
\newcommand*{\visitxaxtotalbineasy}{\ddot{N}}
\newcommand*{\visitxatotalopt}{\bar{N}_k(x,a)} % optimization count, state-action pair
\newcommand*{\visitxatotalopteasy}{\bar{N}} % optimization count, state-action pair
\newcommand*{\visitxaxtotalopt}{\bar{N}_k(x,a,x^{\prime})} % optimization count, state-action-state triple
\newcommand*{\visitxaxtotalopteasy}{\bar{N}}
\newcommand*{\visitxatotalpri}{\widetilde{N}_k(x,a)} % private count, state-action pair
\newcommand*{\visitxatotalprieasy}{\widetilde{N}} % private count, state-action pair
\newcommand*{\visitxaxtotalpri}{\widetilde{N}_k(x,a,x^{\prime})} % private count, state-action-state triple
\newcommand*{\visitxaxtotalprieasy}{\widetilde{N}}




% Good / bad events.
\newcommand*{\badeventloss}{F^l} % bad event: loss outside its confidence set
\newcommand*{\badeventtrans}{F^p} % bad event: transition outside its confidence set
\newcommand*{\goodevent}{G} % overall good event
\newcommand*{\badevent}{\bar{G}} % overall bad event

% Occupancy measures.
\newcommand*{\occmeasure}{q} % occupancy measure
\newcommand*{\occmeasvir}{\widehat{q}} % virtual occupancy measure
\newcommand*{\occmeasmid}{q^\prime} % intermediate occupancy measure
\newcommand*{\uppocc}{u} % upper occupancy measure bound
\newcommand*{\occmeascom}{\acute{q}} % combined occupancy measure


\newcommand*{\transbyocc}{P^q(x^\prime\vert x,a)} % transition induced by an occupancy measure
\newcommand*{\policybyocc}{\pi^q} % policy induced by an occupancy measure
\newcommand*{\occmeasureset}{\Delta} % admissible occupancy measures for a given transition
\newcommand*{\FTRLpara}{\eta} % parameter of FTRL
\newcommand*{\regularizer}{\psi} % FTRL regularizer
% \divg{p}{q}: divergence D_psi(p || q); \divgeasy is the bare symbol.
\newcommand{\divg}[2]{\mathcal{D}_{\psi}(#1\Vert#2)}
\newcommand*{\divgeasy}{\mathcal{D}_{\psi}}



%privacy parameters
% Symbols for the privacy mechanisms, confidence radii and the dual problem.
\newcommand*{\pripara}{\varepsilon} % DP parameter
\newcommand*{\tree}{\mathcal{B}} % binary tree
\newcommand*{\noisecount}{E_{\episodetotal}}
\newcommand*{\conflossf}{E_{\varepsilon}} % full-information loss noise confidence
% NOTE(review): \confcountxa and \confcountxax expand to the same symbol.
\newcommand*{\confcountxa}{E_{\varepsilon,\delta}} % confidence radius for state-action pair counts, i.e., N(x,a)-\tilde{N}(x,a)
\newcommand*{\confcountxax}{E_{\varepsilon,\delta}} % confidence radius for state-action-state counts, i.e., N(x,a,x)-\tilde{N}(x,a,x)
\newcommand*{\confnormtrans}{\beta_k} % confidence radius for the transition L1 norm, i.e., |P(\cdot\vert x,a)-\tilde{P}(\cdot\vert x,a)|_1
\newcommand*{\confnormtranseasy}{\beta} % same, bare symbol
\newcommand*{\confpwtrans}{\beta} % pointwise confidence radius, i.e., |P(x^\prime\vert x,a)-\tilde{P}(x^\prime\vert x,a)|
% \newcommand{\confconstant}[0]{L_p(\delta)} %constant in confidence radius for transition L1 norm.
% \confconstant[<arg>]: L_p(delta, <arg>), with <arg> defaulting to h.
\newcommand{\confconstant}[1][h]{L_p(\delta,#1)} % constant in the confidence radius for the transition L1 norm
\newcommand*{\confconstpw}{L_p^\prime(\delta)} % constant in the pointwise confidence radius
\newcommand*{\normtranserror}{\xi} % L1 estimation error of the transition
\newcommand*{\datas}{\sigma} % data stream
\newcommand*{\dataspri}{\widetilde{\sigma}} % data stream (tilde version; presumably the privatized stream -- confirm)
% \newcommand{\noise}[0]{Z_h^k(x,a)} %noise added to real sum
\newcommand*{\noise}{Z_k(x,a)} % noise added to the real sum
\newcommand*{\noiseeasy}{Z} % noise added to the real sum, bare symbol
% \lap{b}: "Lap" followed by b in auto-sized parentheses. \mathrm is used
% instead of \text so the name stays upright inside italic text such as
% theorem bodies.
\newcommand{\lap}[1]{\mathrm{Lap}\rbr{#1}}
\newcommand*{\ninterval}{E_{\varepsilon,\delta}^\prime} % noise interval parameter to ignore
% \newcommand{\ninterval}[0]{c} %noise interval parameter to ignore
\newcommand*{\basealgo}{\Lambda} % base non-private algorithm for the bandit setting
\newcommand*{\lag}{\mathcal{L}} % Lagrange function
\newcommand*{\dualvsum}{\lambda} % dual variable of the occupancy sum
\newcommand*{\dualvmid}{\nu} % dual variable of the intermediate occupancy-measure relation
\newcommand*{\dualvtrp}{\mu^{+}} % dual variable of the positive transition error
\newcommand*{\dualvtrn}{\mu^{-}} % dual variable of the negative transition error
\newcommand*{\dualvtr}{\mu} % dual variable of the transition error sum bound
\newcommand*{\vsdual}{S} % intermediate variable when solving for the dual variables
\newcommand*{\vbellman}{B} % intermediate Bellman-error variable for the dual computation




% Users, agent and trajectories.
\newcommand*{\userspace}{\mathcal{U}} % user space
\newcommand*{\user}{u} % user
\newcommand*{\userseq}{U} % user sequence
\newcommand*{\Agent}{\mathcal{M}} % RL agent algorithm
% \history[<t>]: calligraphic H with subscript <t> (default t).
\newcommand{\history}[1][t]{\mathcal{H}_{#1}} % random event history up to time t
\newcommand*{\traj}{S} % one trajectory of one episode
\newcommand*{\trajset}{\mathcal{S}} % set of trajectories of one episode

\newcommand*{\prob}{\mathbb{P}} % probability
% \newcommand{\distb}[0]{\mathbb{D}} %distribution
\newcommand*{\expect}{\mathbb{E}} % expectation
\newcommand*{\distc}{R} % distance


%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

% Coloured inline remarks: \sj{text} in blue, \ST{text} in purple.
\newcommand\sj[1]{\textcolor{blue}{#1}}
%\newcommand\todo[1]{\textcolor{blue}{#1}}
\newcommand\ST[1]{{\color{purple}{#1}}}

% Inline orange todo boxes prefixed with the commenter's name.
% \textit replaces the obsolete two-letter font switch {\it ...}.
\usepackage[colorinlistoftodos, textwidth=4cm, shadow]{todonotes}
\newcommand{\sadegh}[1]{\todo[inline,color=orange!40]{\textit{Sadegh:~}#1}}

\newcommand{\XD}[1]{\todo[inline,color=orange!40]{\textit{XD:~}#1}}

% If your paper is accepted and the title of your paper is very long,
% the style will print as headings an error message. Use the following
% command to supply a shorter title of your paper so that it can be
% used as headings.
%
%\runningtitle{I use this title instead because the last one was very long}

% If your paper is accepted and the number of authors is large, the
% style will print as headings an error message. Use the following
% command to supply a shorter version of the authors names so that
% they can be used as headings (for example, use only the surnames)
%
%\runningauthor{Surname 1, Surname 2, Surname 3, ...., Surname n}


\title{Differentially Private No-regret Exploration in \\Adversarial Markov Decision Processes}

% The standard author block has changed for UAI 2024 to provide
% more space for long author lists and allow for complex affiliations
%
% All author information is automatically removed by the class for the
% anonymous submission version of your paper, so you can already add your
% information below.
%
% Add authors
% The bracketed numbers refer to the \affil entries declared below.
\author[1,3]{Shaojie Bai}
\author[1]{Lanting Zeng}
\author[1]{Chengcheng Zhao}
\author[2]{Xiaoming Duan} 
\author[3]{Mohammad Sadegh Talebi}
% NOTE(review): the leading \newline presumably forces the remaining authors
% onto a new line of the author block -- confirm against the rendered title.
\author[1]{\newline Peng Cheng \thanks{Corresponding author}}
\author[1]{Jiming Chen}
% Add affiliations after the authors
\affil[1]{%
    % Computer Science Dept.\\
    Zhejiang University\\
    Hangzhou, China
}
\affil[2]{%
    Shanghai Jiaotong University\\
    Shanghai, China
}
\affil[3]{%
    University of Copenhagen\\
    Copenhagen, Denmark
  }
  
\begin{document}
\maketitle

\begin{abstract}
We study learning in adversarial Markov decision processes (MDPs) in the episodic setting under the constraint of differential privacy (DP).
This is motivated by the widespread applications of reinforcement learning (RL) in non-stationary and even adversarial scenarios, where protecting users' sensitive information is vital.
We first propose two efficient frameworks for adversarial MDPs, spanning full-information and bandit settings.
Within each framework, we consider both Joint DP (JDP), where a central agent is trusted to protect the sensitive data, and Local DP (LDP), where the information is protected directly on the user side.
Then, we design novel privacy mechanisms to privatize the stochastic transition and adversarial losses.
By instantiating such privacy mechanisms to satisfy JDP and LDP requirements, we obtain near-optimal regret guarantees for both frameworks.
To our knowledge, these are the first algorithms to tackle the challenge of private learning in adversarial MDPs.
\end{abstract}

% Main-text sections, in reading order. The experiments section (6) is
% currently commented out. Note that two of the file names contain a space.
\input{./Sections/1-introduction} 
\input{./Sections/2-preliminary}
\input{./Sections/3-Full-Information Setting}
\input{./Sections/4-Bandit-Information Setting}
\input{./Sections/5-guarantee}
% \input{./Sections/6-Experiments}
\input{./Sections/7-conclusion}


% Shaojie Bai is supported by Fundamental Research Funds for the Central Universities (FRF) 226-2023-00111，226-2024-00004, and National Natural Science Foundation of China under Grants (NSFC) 62293511.
% Lanting Zeng is supported by NSFC-62103371 and Zhejiang Provincial Natural Science Foundation under Grant (ZPNSFC) LZ23F030009.
% Chengcheng Zhao is supported by NSFC-62273305 and ZPNSFC-LZ22F030010.
% Xiaoming Duan is supported by Shanghai Pujiang Program under grant 22PJ1404900.

\begin{acknowledgements} 
    We thank the anonymous reviewers and the area chair for their valuable comments, and Chloé Rouyer and Yi-Shan Wu for helpful discussions. 
    This work is supported by the National Natural Science Foundation of China (NSFC) under Grants 62293511, 62103371, and 62273305; 
    Fundamental Research Funds for the Central Universities (FRF) 226-2023-00111, 226-2024-00004; 
    Zhejiang Provincial Natural Science Foundation under Grant (ZPNSFC) LZ23F030009, LZ22F030010; 
    Shanghai Pujiang Program under grant 22PJ1404900; and the Independent Research Fund Denmark under grant 1026-00397B. 
\end{acknowledgements}

% \bibliographystyle{plainnat}
% With the "nat" style (plainnat), in-text citations are not numeric, and the reference list is unnumbered as well.
\bibliography{main}
% Supplementary material: start a new page, switch to a one-column layout and
% print a second title block before entering the appendix.
\newpage

\onecolumn

\title{Differentially Private No-regret Exploration in \\ Adversarial Markov Decision Processes (Supplementary Material)}
\maketitle
\appendix

\input{./Sections/8-appendix}

%\input{./supplement}
\end{document}

