\documentclass[accepted]{uai2022} % for initial submission
% \documentclass[accepted]{uai2022} % after acceptance, for a revised
                                    % version; also before submission to
                                    % see how the non-anonymous paper
                                    % would look like
%% There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2022} % ptmx math instead of Computer
                                         % Modern (has noticable issues)
% \documentclass[mathfont=newtx]{uai2022} % newtx fonts (improves upon
                                          % ptmx; less tested, no support)
% NOTE: Only keep *one* line above as appropriate, as it will be replaced
%       automatically for papers to be published. Do not make any other
%       change above this note for an accepted version.

%% Choose your variant of English; be consistent
\usepackage[american]{babel}
% \usepackage[british]{babel}

%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} % amsmath with fixes and additions
% \usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{tikz} % nice language for creating drawings and diagrams

%% Provided macros
% \smaller: Because the class footnote size is essentially LaTeX's \small,
%           redefining \footnotesize, we provide the original \footnotesize
%           using this macro.
%           (Use only sparingly, e.g., in drawings, as it is quite small.)

%% Self-defined macros

\usepackage{times}
\usepackage{xcolor}

\usepackage{algorithm, algpseudocode}
\usepackage{amsfonts,amssymb, amsmath, amsthm}

\usepackage{microtype}
\usepackage{graphicx}
\usepackage{subfigure}
\usepackage{booktabs} % for professional tables

% \usepackage{geometry} % fix the margin of pdf
\usepackage{wrapfig}
\usepackage{bm}

\usepackage{enumitem}
\usepackage{xspace}
\usepackage{tcolorbox}

\usepackage{xr}
\usepackage{hyperref}
\makeatletter
% \addFileDependency{<file>}: record <file> as a dependency of this document.
%   #1 - file name including its extension.
% NOTE(review): this looks like the usual recipe that lets build tools
% (e.g. latexmk) notice files pulled in through xr -- confirm against the
% project's build setup.
\newcommand*{\addFileDependency}[1]{% argument=file name and extension
  % Write "(<file>)" to the terminal and the log.
  \typeout{(#1)}
  % Append the file to the list that \listfiles reports.
  \@addtofilelist{#1}
  % Print a "No file" notice when the file cannot be found.
  \IfFileExists{#1}{}{\typeout{No file #1.}}
}
\makeatother

% \myexternaldocument{<name>}: import the labels of the external document
% <name> via \externaldocument, then register <name>.tex and <name>.aux
% through \addFileDependency.
%   #1 - base name of the external document, without extension.
\newcommand*{\myexternaldocument}[1]{%
    \externaldocument{#1}%
    \addFileDependency{#1.tex}%
    \addFileDependency{#1.aux}%
}

\myexternaldocument{ren_338}
\usepackage{url}

\hypersetup{
    colorlinks=true,
    linkcolor=blue,
    citecolor=cyan,
    filecolor=green,      
    urlcolor=black,
}

\newcommand{\swap}[3][-]{#3#1#2} % just an example

%\RequirePackage{latexsym} \RequirePackage{amsmath}
%\RequirePackage{amssymb} \RequirePackage{bm} \RequirePackage{url}
\usepackage{latexsym}
\usepackage{amsmath}
\usepackage{amssymb}
\usepackage{bm}
\usepackage{mathrsfs}
\usepackage{url}


%%%%%%%% Stock standard definitions %%%%%%%%%%%%%%%

\newcommand{\mfe}{\mathfrak{e}}
\newcommand{\mfb}{\mathfrak{b}}


\newcommand{\alphab}{\boldsymbol{\alpha}}
\newcommand{\betab}{\boldsymbol{\beta}}
\newcommand{\gammab}{\boldsymbol{\gamma}}
\newcommand{\thetab}{\boldsymbol{\theta}}
\newcommand{\phib}{\boldsymbol{\phi}}
\newcommand{\omegab}{\boldsymbol{\omega}}

\newcommand{\Phib}{\boldsymbol{\Phi}}
% \newcommand{\ab}{\bm{a}}
% \newcommand{\bb}{\bm{b}}
% \newcommand{\cbb}{\bm{c}}
% \newcommand{\db}{\bm{d}}
% \newcommand{\eb}{\bm{e}}
% \newcommand{\fb}{\bm{f}}
% \newcommand{\gb}{\bm{g}}
% \newcommand{\hb}{\bm{h}}
% \newcommand{\ib}{\bm{i}}
% \newcommand{\jb}{\bm{j}}
% \newcommand{\kb}{\bm{k}}
% \newcommand{\lb}{\bm{l}}
% \newcommand{\mb}{\bm{m}}
% \newcommand{\nbb}{\bm{n}}
% \newcommand{\ob}{\bm{o}}
% \newcommand{\pb}{\bm{p}}
% \newcommand{\qb}{\bm{q}}
% \newcommand{\rb}{\bm{r}}
% \newcommand{\sbb}{\bm{s}}
% \newcommand{\tb}{\bm{t}}
% \newcommand{\ub}{\bm{u}}
% \newcommand{\vb}{\bm{v}}
% \newcommand{\wb}{\bm{w}}
% \newcommand{\xb}{\bm{x}}
% \newcommand{\yb}{\bm{y}}
% \newcommand{\zb}{\bm{z}}

\newcommand{\ab}{\mathbf{a}}
\newcommand{\bb}{\mathbf{b}}
\newcommand{\cbb}{\mathbf{c}}
\newcommand{\db}{\mathbf{d}}
\newcommand{\eb}{\mathbf{e}}
\newcommand{\fb}{\mathbf{f}}
\newcommand{\gb}{\mathbf{g}}
\newcommand{\hb}{\mathbf{h}}
\newcommand{\ib}{\mathbf{i}}
\newcommand{\jb}{\mathbf{j}}
\newcommand{\kb}{\mathbf{k}}
\newcommand{\lb}{\mathbf{l}}
\newcommand{\mb}{\mathbf{m}}
\newcommand{\nbb}{\mathbf{n}}
\newcommand{\ob}{\mathbf{o}}
\newcommand{\pb}{\mathbf{p}}
\newcommand{\qb}{\mathbf{q}}
\newcommand{\rb}{\mathbf{r}}
\newcommand{\sbb}{\mathbf{s}}
\newcommand{\tb}{\mathbf{t}}
\newcommand{\ub}{\mathbf{u}}
\newcommand{\vb}{\mathbf{v}}
\newcommand{\wb}{\mathbf{w}}
\newcommand{\xb}{\mathbf{x}}
\newcommand{\yb}{\mathbf{y}}
\newcommand{\zb}{\mathbf{z}}

\newcommand{\abtil}{\tilde{\ab}}
\newcommand{\bbtil}{\tilde{\bb}}
\newcommand{\cbtil}{\tilde{\cbb}}
\newcommand{\dbtil}{\tilde{\db}}
\newcommand{\ebtil}{\tilde{\eb}}
\newcommand{\fbtil}{\tilde{\fb}}
\newcommand{\gbtil}{\tilde{\gb}}
\newcommand{\hbtil}{\tilde{\hb}}
\newcommand{\ibtil}{\tilde{\ib}}
\newcommand{\jbtil}{\tilde{\jb}}
\newcommand{\kbtil}{\tilde{\kb}}
\newcommand{\lbtil}{\tilde{\lb}}
\newcommand{\mbtil}{\tilde{\mb}}
\newcommand{\nbtil}{\tilde{\nbb}}
\newcommand{\obtil}{\tilde{\ob}}
\newcommand{\pbtil}{\tilde{\pb}}
\newcommand{\qbtil}{\tilde{\qb}}
\newcommand{\rbtil}{\tilde{\rb}}
\newcommand{\sbtil}{\tilde{\sbb}}
\newcommand{\tbtil}{\tilde{\tb}}
\newcommand{\ubtil}{\tilde{\ub}}
\newcommand{\vbtil}{\tilde{\vb}}
\newcommand{\wbtil}{\tilde{\wb}}
\newcommand{\xbtil}{\tilde{\xb}}
\newcommand{\ybtil}{\tilde{\yb}}
\newcommand{\zbtil}{\tilde{\zb}}


\newcommand{\atil}{\tilde{a}}
\newcommand{\btil}{\tilde{b}}
\newcommand{\ctil}{\tilde{c}}
\newcommand{\dtil}{\tilde{d}}
\newcommand{\etil}{\tilde{e}}
\newcommand{\ftil}{\tilde{f}}
\newcommand{\gtil}{\tilde{g}}
\newcommand{\htil}{\tilde{h}}
\newcommand{\itil}{\tilde{i}}
\newcommand{\jtil}{\tilde{j}}
\newcommand{\ktil}{\tilde{k}}
\newcommand{\ltil}{\tilde{l}}
\newcommand{\mtil}{\tilde{m}}
\newcommand{\ntil}{\tilde{n}}
\newcommand{\otil}{\tilde{o}}
\newcommand{\ptil}{\tilde{p}}
\newcommand{\qtil}{\tilde{q}}
\newcommand{\rtil}{\tilde{r}}
\newcommand{\stil}{\tilde{s}}
\newcommand{\ttil}{\tilde{t}}
\newcommand{\util}{\tilde{u}}
\newcommand{\vtil}{\tilde{v}}
\newcommand{\wtil}{\tilde{w}}
\newcommand{\xtil}{\tilde{x}}
\newcommand{\ytil}{\tilde{y}}
\newcommand{\ztil}{\tilde{z}}

\newcommand{\Atil}{\tilde{A}}
\newcommand{\Btil}{\tilde{B}}
\newcommand{\Ctil}{\tilde{C}}
\newcommand{\Dtil}{\tilde{D}}
\newcommand{\Etil}{\tilde{E}}
\newcommand{\Ftil}{\tilde{F}}
\newcommand{\Gtil}{\tilde{G}}
\newcommand{\Htil}{\tilde{H}}
\newcommand{\Itil}{\tilde{I}}
\newcommand{\Jtil}{\tilde{J}}
\newcommand{\Ktil}{\tilde{K}}
\newcommand{\Ltil}{\tilde{L}}
\newcommand{\Mtil}{\tilde{M}}
\newcommand{\Ntil}{\tilde{N}}
\newcommand{\Otil}{\tilde{O}}
\newcommand{\Ptil}{\tilde{P}}
\newcommand{\Qtil}{\tilde{Q}}
\newcommand{\Rtil}{\tilde{R}}
\newcommand{\Stil}{\tilde{S}}
\newcommand{\Ttil}{\tilde{T}}
\newcommand{\Util}{\tilde{U}}
\newcommand{\Vtil}{\tilde{V}}
\newcommand{\Wtil}{\tilde{W}}
\newcommand{\Xtil}{\tilde{X}}
\newcommand{\Ytil}{\tilde{Y}}
\newcommand{\Ztil}{\tilde{Z}}

\newcommand{\abar}{\bar{a}}
\newcommand{\bbar}{\bar{b}}
\newcommand{\cbar}{\bar{c}}
\newcommand{\dbar}{\bar{d}}
\newcommand{\ebar}{\bar{e}}
\newcommand{\fbar}{\bar{f}}
\newcommand{\gbar}{\bar{g}}
\newcommand{\hbr}{\bar{h}}
\newcommand{\ibar}{\bar{i}}
\newcommand{\jbar}{\bar{j}}
\newcommand{\kbar}{\bar{k}}
\newcommand{\lbar}{\bar{l}}
\newcommand{\mbar}{\bar{m}}
\newcommand{\nbar}{\bar{n}}
\newcommand{\obar}{\bar{o}}
\newcommand{\pbar}{\bar{p}}
\newcommand{\qbar}{\bar{q}}
\newcommand{\rbar}{\bar{r}}
\newcommand{\sbar}{\bar{s}}
\newcommand{\tbar}{\bar{t}}
\newcommand{\ubar}{\bar{u}}
\newcommand{\vbar}{\bar{v}}
\newcommand{\wbar}{\bar{w}}
\newcommand{\xbar}{\bar{x}}
\newcommand{\ybar}{\bar{y}}
\newcommand{\zbar}{\bar{z}}

\newcommand{\abbar}{\bar{\ab}}
\newcommand{\bbbar}{\bar{\bb}}
% Bar over bold c. The bold-c macro in this file is \cbb (not \cb, which is
% undefined and made every use of \cbbar fail with "Undefined control sequence").
\newcommand{\cbbar}{\bar{\cbb}}
\newcommand{\dbbar}{\bar{\db}}
\newcommand{\ebbar}{\bar{\eb}}
\newcommand{\fbbar}{\bar{\fb}}
\newcommand{\gbbar}{\bar{\gb}}
\newcommand{\hbbar}{\bar{\hb}}
\newcommand{\ibbar}{\bar{\ib}}
\newcommand{\jbbar}{\bar{\jb}}
\newcommand{\kbbar}{\bar{\kb}}
\newcommand{\lbbar}{\bar{\lb}}
\newcommand{\mbbar}{\bar{\mb}}
\newcommand{\nbbar}{\bar{\nbb}}
\newcommand{\obbar}{\bar{\ob}}
\newcommand{\pbbar}{\bar{\pb}}
\newcommand{\qbbar}{\bar{\qb}}
\newcommand{\rbbar}{\bar{\rb}}
\newcommand{\sbbar}{\bar{\sbb}}
\newcommand{\tbbar}{\bar{\tb}}
\newcommand{\ubbar}{\bar{\ub}}
\newcommand{\vbbar}{\bar{\vb}}
\newcommand{\wbbar}{\bar{\wb}}
\newcommand{\xbbar}{\bar{\xb}}
\newcommand{\ybbar}{\bar{\yb}}
\newcommand{\zbbar}{\bar{\zb}}

% \newcommand{\Ab}{\bm{A}}
% \newcommand{\Bb}{\bm{B}}
% \newcommand{\Cb}{\bm{C}}
% \newcommand{\Db}{\bm{D}}
% \newcommand{\Eb}{\bm{E}}
% \newcommand{\Fb}{\bm{F}}
% \newcommand{\Gb}{\bm{G}}
% \newcommand{\Hb}{\bm{H}}
% \newcommand{\Ib}{\bm{I}}
% \newcommand{\Jb}{\bm{J}}
% \newcommand{\Kb}{\bm{K}}
% \newcommand{\Lb}{\bm{L}}
% \newcommand{\Mb}{\bm{M}}
% \newcommand{\Nb}{\bm{N}}
% \newcommand{\Ob}{\bm{O}}
% \newcommand{\Pb}{\bm{P}}
% \newcommand{\Qb}{\bm{Q}}
% \newcommand{\Rb}{\bm{R}}
% \newcommand{\Sbb}{\bm{S}}
% \newcommand{\Tb}{\bm{T}}
% \newcommand{\Ub}{\bm{U}}
% \newcommand{\Vb}{\bm{V}}
% \newcommand{\Wb}{\bm{W}}
% \newcommand{\Xb}{\bm{X}}
% \newcommand{\Yb}{\bm{Y}}
% \newcommand{\Zb}{\bm{Z}}

\newcommand{\Ab}{\mathbf{A}}
\newcommand{\Bb}{\mathbf{B}}
\newcommand{\Cb}{\mathbf{C}}
\newcommand{\Db}{\mathbf{D}}
\newcommand{\Eb}{\mathbf{E}}
\newcommand{\Fb}{\mathbf{F}}
\newcommand{\Gb}{\mathbf{G}}
\newcommand{\Hb}{\mathbf{H}}
\newcommand{\Ib}{\mathbf{I}}
\newcommand{\Jb}{\mathbf{J}}
\newcommand{\Kb}{\mathbf{K}}
\newcommand{\Lb}{\mathbf{L}}
\newcommand{\Mb}{\mathbf{M}}
\newcommand{\Nb}{\mathbf{N}}
\newcommand{\Ob}{\mathbf{O}}
\newcommand{\Pb}{\mathbf{P}}
\newcommand{\Qb}{\mathbf{Q}}
\newcommand{\Rb}{\mathbf{R}}
\newcommand{\Sbb}{\mathbf{S}}
\newcommand{\Tb}{\mathbf{T}}
\newcommand{\Ub}{\mathbf{U}}
\newcommand{\Vb}{\mathbf{V}}
\newcommand{\Wb}{\mathbf{W}}
\newcommand{\Xb}{\mathbf{X}}
\newcommand{\Yb}{\mathbf{Y}}
\newcommand{\Zb}{\mathbf{Z}}

% \newcommand{\Abtil}{\tilde{\Ab}}
% \newcommand{\Bbtil}{\tilde{\Bb}}
% \newcommand{\Cbtil}{\tilde{\Cb}}
% \newcommand{\Dbtil}{\tilde{\Db}}
% \newcommand{\Ebtil}{\tilde{\Eb}}
% \newcommand{\Fbtil}{\tilde{\Fb}}
% \newcommand{\Gbtil}{\tilde{\Gb}}
% \newcommand{\Hbtil}{\tilde{\Hb}}
% \newcommand{\Ibtil}{\tilde{\Ib}}
% \newcommand{\Jbtil}{\tilde{\Jb}}
% \newcommand{\Kbtil}{\tilde{\Kb}}
% \newcommand{\Lbtil}{\tilde{\Lb}}
% \newcommand{\Mbtil}{\tilde{\Mb}}
% \newcommand{\Nbtil}{\tilde{\Nb}}
% \newcommand{\Obtil}{\tilde{\Ob}}
% \newcommand{\Pbtil}{\tilde{\Pb}}
% \newcommand{\Qbtil}{\tilde{\Qb}}
% \newcommand{\Rbtil}{\tilde{\Rb}}
% \newcommand{\Sbtil}{\tilde{\Sbb}}
% \newcommand{\Tbtil}{\tilde{\Tb}}
% \newcommand{\Ubtil}{\tilde{\Ub}}
% \newcommand{\Vbtil}{\tilde{\Vb}}
% \newcommand{\Wbtil}{\tilde{\Wb}}
% \newcommand{\Xbtil}{\tilde{\Xb}}
% \newcommand{\Ybtil}{\tilde{\Yb}}
% \newcommand{\Zbtil}{\tilde{\Zb}}

\newcommand{\Abar}{\bar{A}}
\newcommand{\Bbar}{\bar{B}}
\newcommand{\Cbar}{\bar{C}}
\newcommand{\Dbar}{\bar{D}}
\newcommand{\Ebar}{\bar{E}}
\newcommand{\Fbar}{\bar{F}}
\newcommand{\Gbar}{\bar{G}}
\newcommand{\Hbar}{\bar{H}}
\newcommand{\Ibar}{\bar{I}}
\newcommand{\Jbar}{\bar{J}}
\newcommand{\Kbar}{\bar{K}}
\newcommand{\Lbar}{\bar{L}}
\newcommand{\Mbar}{\bar{M}}
\newcommand{\Nbar}{\bar{N}}
\newcommand{\Obar}{\bar{O}}
\newcommand{\Pbar}{\bar{P}}
\newcommand{\Qbar}{\bar{Q}}
\newcommand{\Rbar}{\bar{R}}
\newcommand{\Sbar}{\bar{S}}
\newcommand{\Tbar}{\bar{T}}
\newcommand{\Ubar}{\bar{U}}
\newcommand{\Vbar}{\bar{V}}
\newcommand{\Wbar}{\bar{W}}
\newcommand{\Xbar}{\bar{X}}
\newcommand{\Ybar}{\bar{Y}}
\newcommand{\Zbar}{\bar{Z}}

\newcommand{\Abbar}{\bar{\Ab}}
\newcommand{\Bbbar}{\bar{\Bb}}
\newcommand{\Cbbar}{\bar{\Cb}}
\newcommand{\Dbbar}{\bar{\Db}}
\newcommand{\Ebbar}{\bar{\Eb}}
\newcommand{\Fbbar}{\bar{\Fb}}
\newcommand{\Gbbar}{\bar{\Gb}}
\newcommand{\Hbbar}{\bar{\Hb}}
\newcommand{\Ibbar}{\bar{\Ib}}
\newcommand{\Jbbar}{\bar{\Jb}}
\newcommand{\Kbbar}{\bar{\Kb}}
\newcommand{\Lbbar}{\bar{\Lb}}
\newcommand{\Mbbar}{\bar{\Mb}}
\newcommand{\Nbbar}{\bar{\Nb}}
\newcommand{\Obbar}{\bar{\Ob}}
\newcommand{\Pbbar}{\bar{\Pb}}
\newcommand{\Qbbar}{\bar{\Qb}}
\newcommand{\Rbbar}{\bar{\Rb}}
% Bar over bold S. The bold-S macro in this file is \Sbb (not \Sb, which is
% undefined and made every use of \Sbbar fail with "Undefined control sequence").
\newcommand{\Sbbar}{\bar{\Sbb}}
\newcommand{\Tbbar}{\bar{\Tb}}
\newcommand{\Ubbar}{\bar{\Ub}}
\newcommand{\Vbbar}{\bar{\Vb}}
\newcommand{\Wbbar}{\bar{\Wb}}
\newcommand{\Xbbar}{\bar{\Xb}}
\newcommand{\Ybbar}{\bar{\Yb}}
\newcommand{\Zbbar}{\bar{\Zb}}

\newcommand{\Ahat}{\widehat{A}}
\newcommand{\Bhat}{\widehat{B}}
\newcommand{\Chat}{\widehat{C}}
\newcommand{\Dhat}{\widehat{D}}
\newcommand{\Ehat}{\widehat{E}}
\newcommand{\Fhat}{\widehat{F}}
\newcommand{\Ghat}{\widehat{G}}
\newcommand{\Hhat}{\widehat{H}}
\newcommand{\Ihat}{\widehat{I}}
\newcommand{\Jhat}{\widehat{J}}
\newcommand{\Khat}{\widehat{K}}
\newcommand{\Lhat}{\widehat{L}}
\newcommand{\Mhat}{\widehat{M}}
\newcommand{\Nhat}{\widehat{N}}
\newcommand{\Ohat}{\widehat{O}}
\newcommand{\Phat}{\widehat{P}}
\newcommand{\Qhat}{\widehat{Q}}
\newcommand{\Rhat}{\widehat{R}}
\newcommand{\Shat}{\widehat{S}}
\newcommand{\That}{\widehat{T}}
\newcommand{\Uhat}{\widehat{U}}
\newcommand{\Vhat}{\widehat{V}}
\newcommand{\What}{\widehat{W}}
\newcommand{\Xhat}{\widehat{X}}
\newcommand{\Yhat}{\widehat{Y}}
\newcommand{\Zhat}{\widehat{Z}}

\newcommand{\ahat}{\widehat{a}}
\newcommand{\bhat}{\widehat{b}}
\newcommand{\chat}{\widehat{c}}
\newcommand{\dhat}{\widehat{d}}
\newcommand{\ehat}{\widehat{e}}
\newcommand{\fhat}{\widehat{f}}
\newcommand{\ghat}{\widehat{g}}
\newcommand{\hhat}{\widehat{h}}
\newcommand{\ihat}{\widehat{i}}
\newcommand{\jhat}{\widehat{j}}
\newcommand{\khat}{\widehat{k}}
\newcommand{\lhat}{\widehat{l}}
\newcommand{\mhat}{\widehat{m}}
\newcommand{\nhat}{\widehat{n}}
\newcommand{\ohat}{\widehat{o}}
\newcommand{\phat}{\widehat{p}}
\newcommand{\qhat}{\widehat{q}}
\newcommand{\rhat}{\widehat{r}}
\newcommand{\shat}{\widehat{s}}
\newcommand{\that}{\widehat{t}}
\newcommand{\uhat}{\widehat{u}}
\newcommand{\vhat}{\widehat{v}}
\newcommand{\what}{\widehat{w}}
\newcommand{\xhat}{\widehat{x}}
\newcommand{\yhat}{\widehat{y}}
\newcommand{\zhat}{\widehat{z}}

\newcommand{\Abhat}{\hat{\Ab}}
\newcommand{\Bbhat}{\hat{\Bb}}
\newcommand{\Cbhat}{\hat{\Cb}}
\newcommand{\Dbhat}{\hat{\Db}}
\newcommand{\Ebhat}{\hat{\Eb}}
\newcommand{\Fbhat}{\hat{\Fb}}
\newcommand{\Gbhat}{\hat{\Gb}}
\newcommand{\Hbhat}{\hat{\Hb}}
\newcommand{\Ibhat}{\hat{\Ib}}
\newcommand{\Jbhat}{\hat{\Jb}}
\newcommand{\Kbhat}{\hat{\Kb}}
\newcommand{\Lbhat}{\hat{\Lb}}
\newcommand{\Mbhat}{\hat{\Mb}}
\newcommand{\Nbhat}{\hat{\Nb}}
\newcommand{\Obhat}{\hat{\Ob}}
\newcommand{\Pbhat}{\hat{\Pb}}
\newcommand{\Qbhat}{\hat{\Qb}}
\newcommand{\Rbhat}{\hat{\Rb}}
% Hat over bold S. The bold-S macro in this file is \Sbb (not \Sb, which is
% undefined and made every use of \Sbhat fail with "Undefined control sequence").
\newcommand{\Sbhat}{\hat{\Sbb}}
\newcommand{\Tbhat}{\hat{\Tb}}
\newcommand{\Ubhat}{\hat{\Ub}}
\newcommand{\Vbhat}{\hat{\Vb}}
\newcommand{\Wbhat}{\hat{\Wb}}
\newcommand{\Xbhat}{\hat{\Xb}}
\newcommand{\Ybhat}{\hat{\Yb}}
\newcommand{\Zbhat}{\hat{\Zb}}

\newcommand{\Acal}{\mathcal{A}}
\newcommand{\Bcal}{\mathcal{B}}
\newcommand{\Ccal}{\mathcal{C}}
\newcommand{\Dcal}{\mathcal{D}}
\newcommand{\Ecal}{\mathcal{E}}
\newcommand{\Fcal}{\mathcal{F}}
\newcommand{\Gcal}{\mathcal{G}}
\newcommand{\Hcal}{\mathcal{H}}
\newcommand{\Ical}{\mathcal{I}}
\newcommand{\Jcal}{\mathcal{J}}
\newcommand{\Kcal}{\mathcal{K}}
\newcommand{\Lcal}{\mathcal{L}}
\newcommand{\Mcal}{\mathcal{M}}
\newcommand{\Ncal}{\mathcal{N}}
\newcommand{\Ocal}{\mathcal{O}}
\newcommand{\Pcal}{\mathcal{P}}
\newcommand{\Qcal}{\mathcal{Q}}
\newcommand{\Rcal}{\mathcal{R}}
\newcommand{\Scal}{{\mathcal{S}}}
\newcommand{\Tcal}{{\mathcal{T}}}
\newcommand{\Ucal}{\mathcal{U}}
\newcommand{\Vcal}{\mathcal{V}}
\newcommand{\Wcal}{\mathcal{W}}
\newcommand{\Xcal}{\mathcal{X}}
\newcommand{\Ycal}{\mathcal{Y}}
\newcommand{\Zcal}{\mathcal{Z}}

\newcommand{\Ascr}{\mathscr{A}}
\newcommand{\Bscr}{\mathscr{B}}
\newcommand{\Cscr}{\mathscr{C}}
\newcommand{\Dscr}{\mathscr{D}}
\newcommand{\Escr}{\mathscr{E}}
\newcommand{\Fscr}{\mathscr{F}}
\newcommand{\Gscr}{\mathscr{G}}
\newcommand{\Hscr}{\mathscr{H}}
\newcommand{\Iscr}{\mathscr{I}}
\newcommand{\Jscr}{\mathscr{J}}
\newcommand{\Kscr}{\mathscr{K}}
\newcommand{\Lscr}{\mathscr{L}}
\newcommand{\Mscr}{\mathscr{M}}
\newcommand{\Nscr}{\mathscr{N}}
\newcommand{\Oscr}{\mathscr{O}}
\newcommand{\Pscr}{\mathscr{P}}
\newcommand{\Qscr}{\mathscr{Q}}
\newcommand{\Rscr}{\mathscr{R}}
\newcommand{\Sscr}{{\mathscr{S}}}
\newcommand{\Tscr}{{\mathscr{T}}}
\newcommand{\Uscr}{\mathscr{U}}
\newcommand{\Vscr}{\mathscr{V}}
\newcommand{\Wscr}{\mathscr{W}}
\newcommand{\Xscr}{\mathscr{X}}
\newcommand{\Yscr}{\mathscr{Y}}
\newcommand{\Zscr}{\mathscr{Z}}

\newcommand{\Afra}{\mathfrak{A}}
\newcommand{\Bfra}{\mathfrak{B}}
\newcommand{\Cfra}{\mathfrak{C}}
\newcommand{\Dfra}{\mathfrak{D}}
\newcommand{\Efra}{\mathfrak{E}}
\newcommand{\Ffra}{\mathfrak{F}}
\newcommand{\Gfra}{\mathfrak{G}}
\newcommand{\Hfra}{\mathfrak{H}}
\newcommand{\Ifra}{\mathfrak{I}}
\newcommand{\Jfra}{\mathfrak{J}}
\newcommand{\Kfra}{\mathfrak{K}}
\newcommand{\Lfra}{\mathfrak{L}}
\newcommand{\Mfra}{\mathfrak{M}}
\newcommand{\Nfra}{\mathfrak{N}}
\newcommand{\Ofra}{\mathfrak{O}}
\newcommand{\Pfra}{\mathfrak{P}}
\newcommand{\Qfra}{\mathfrak{Q}}
\newcommand{\Rfra}{\mathfrak{R}}
\newcommand{\Sfra}{{\mathfrak{S}}}
\newcommand{\Tfra}{{\mathfrak{T}}}
\newcommand{\Ufra}{\mathfrak{U}}
\newcommand{\Vfra}{\mathfrak{V}}
\newcommand{\Wfra}{\mathfrak{W}}
\newcommand{\Xfra}{\mathfrak{X}}
\newcommand{\Yfra}{\mathfrak{Y}}
\newcommand{\Zfra}{\mathfrak{Z}}


\newcommand{\Acalb}{\bm{\Acal}}
\newcommand{\Bcalb}{\bm{\Bcal}}
\newcommand{\Ccalb}{\bm{\Ccal}}
\newcommand{\Dcalb}{\bm{\Dcal}}
\newcommand{\Ecalb}{\bm{\Ecal}}
\newcommand{\Fcalb}{\bm{\Fcal}}
\newcommand{\Gcalb}{\bm{\Gcal}}
\newcommand{\Hcalb}{\bm{\Hcal}}
\newcommand{\Icalb}{\bm{\Ical}}
\newcommand{\Jcalb}{\bm{\Jcal}}
\newcommand{\Kcalb}{\bm{\Kcal}}
\newcommand{\Lcalb}{\bm{\Lcal}}
\newcommand{\Mcalb}{\bm{\Mcal}}
\newcommand{\Ncalb}{\bm{\Ncal}}
\newcommand{\Ocalb}{\bm{\Ocal}}
\newcommand{\Pcalb}{\bm{\Pcal}}
\newcommand{\Qcalb}{\bm{\Qcal}}
\newcommand{\Rcalb}{\bm{\Rcal}}
\newcommand{\Scalb}{\bm{\Scal}}
\newcommand{\Tcalb}{\bm{\Tcal}}
\newcommand{\Ucalb}{\bm{\Ucal}}
\newcommand{\Vcalb}{\bm{\Vcal}}
\newcommand{\Wcalb}{\bm{\Wcal}}
\newcommand{\Xcalb}{\bm{\Xcal}}
\newcommand{\Ycalb}{\bm{\Ycal}}
\newcommand{\Zcalb}{\bm{\Zcal}}

\newcommand{\Ascrb}{\bm{\Ascr}}
\newcommand{\Bscrb}{\bm{\Bscr}}
\newcommand{\Cscrb}{\bm{\Cscr}}
\newcommand{\Dscrb}{\bm{\Dscr}}
\newcommand{\Escrb}{\bm{\Escr}}
\newcommand{\Fscrb}{\bm{\Fscr}}
\newcommand{\Gscrb}{\bm{\Gscr}}
\newcommand{\Hscrb}{\bm{\Hscr}}
\newcommand{\Iscrb}{\bm{\Iscr}}
\newcommand{\Jscrb}{\bm{\Jscr}}
\newcommand{\Kscrb}{\bm{\Kscr}}
\newcommand{\Lscrb}{\bm{\Lscr}}
\newcommand{\Mscrb}{\bm{\Mscr}}
\newcommand{\Nscrb}{\bm{\Nscr}}
\newcommand{\Oscrb}{\bm{\Oscr}}
\newcommand{\Pscrb}{\bm{\Pscr}}
\newcommand{\Qscrb}{\bm{\Qscr}}
\newcommand{\Rscrb}{\bm{\Rscr}}
\newcommand{\Sscrb}{\bm{\Sscr}}
\newcommand{\Tscrb}{\bm{\Tscr}}
\newcommand{\Uscrb}{\bm{\Uscr}}
\newcommand{\Vscrb}{\bm{\Vscr}}
\newcommand{\Wscrb}{\bm{\Wscr}}
\newcommand{\Xscrb}{\bm{\Xscr}}
\newcommand{\Yscrb}{\bm{\Yscr}}
\newcommand{\Zscrb}{\bm{\Zscr}}

\newcommand{\Afrab}{\bm{\Afra}}
\newcommand{\Bfrab}{\bm{\Bfra}}
\newcommand{\Cfrab}{\bm{\Cfra}}
\newcommand{\Dfrab}{\bm{\Dfra}}
\newcommand{\Efrab}{\bm{\Efra}}
\newcommand{\Ffrab}{\bm{\Ffra}}
\newcommand{\Gfrab}{\bm{\Gfra}}
\newcommand{\Hfrab}{\bm{\Hfra}}
\newcommand{\Ifrab}{\bm{\Ifra}}
\newcommand{\Jfrab}{\bm{\Jfra}}
\newcommand{\Kfrab}{\bm{\Kfra}}
\newcommand{\Lfrab}{\bm{\Lfra}}
\newcommand{\Mfrab}{\bm{\Mfra}}
\newcommand{\Nfrab}{\bm{\Nfra}}
\newcommand{\Ofrab}{\bm{\Ofra}}
\newcommand{\Pfrab}{\bm{\Pfra}}
\newcommand{\Qfrab}{\bm{\Qfra}}
\newcommand{\Rfrab}{\bm{\Rfra}}
\newcommand{\Sfrab}{\bm{\Sfra}}
\newcommand{\Tfrab}{\bm{\Tfra}}
\newcommand{\Ufrab}{\bm{\Ufra}}
\newcommand{\Vfrab}{\bm{\Vfra}}
\newcommand{\Wfrab}{\bm{\Wfra}}
\newcommand{\Xfrab}{\bm{\Xfra}}
\newcommand{\Yfrab}{\bm{\Yfra}}
\newcommand{\Zfrab}{\bm{\Zfra}}

\newcommand{\Atilde}{\widetilde{A}}
\newcommand{\Btilde}{\widetilde{B}}
\newcommand{\Ctilde}{\widetilde{C}}
\newcommand{\Dtilde}{\widetilde{D}}
\newcommand{\Etilde}{\widetilde{E}}
\newcommand{\Ftilde}{\widetilde{F}}
\newcommand{\Gtilde}{\widetilde{G}}
\newcommand{\Htilde}{\widetilde{H}}
\newcommand{\Itilde}{\widetilde{I}}
\newcommand{\Jtilde}{\widetilde{J}}
\newcommand{\Ktilde}{\widetilde{K}}
\newcommand{\Ltilde}{\widetilde{L}}
\newcommand{\Mtilde}{\widetilde{M}}
\newcommand{\Ntilde}{\widetilde{N}}
\newcommand{\Otilde}{\widetilde{O}}
\newcommand{\Ptilde}{\widetilde{P}}
\newcommand{\Qtilde}{\widetilde{Q}}
\newcommand{\Rtilde}{\widetilde{R}}
\newcommand{\Stilde}{\widetilde{S}}
\newcommand{\Ttilde}{\widetilde{T}}
\newcommand{\Utilde}{\widetilde{U}}
\newcommand{\Vtilde}{\widetilde{V}}
\newcommand{\Wtilde}{\widetilde{W}}
\newcommand{\Xtilde}{\widetilde{X}}
\newcommand{\Ytilde}{\widetilde{Y}}
\newcommand{\Ztilde}{\widetilde{Z}}


%%%%%%%% Widely accepted definitions %%%%%%%%%%%%%%%

\newcommand{\BB}{\mathbb{B}} % Blackboard-bold B
\newcommand{\CC}{\mathbb{C}} % Complex numbers
\newcommand{\EE}{\mathbb{E}} % Expectation
\newcommand{\VV}{\mathbb{V}} % Variance
\newcommand{\II}{\mathbb{I}} % Indicator
\newcommand{\KK}{\mathbb{K}} % Arbitrary field
\newcommand{\LL}{\mathbb{L}} % Loss
\newcommand{\MM}{\mathbb{M}} % Median
\newcommand{\NN}{\mathbb{N}} % Natural numbers
\newcommand{\PP}{\mathbb{P}} % Probability
\newcommand{\QQ}{\mathbb{Q}} % Rationals
\newcommand{\RR}{\mathbb{R}} % Real numbers
\newcommand{\ZZ}{\mathbb{Z}} % Integers
\newcommand{\XX}{\mathbb{X}} % Blackboard-bold X
\newcommand{\YY}{\mathbb{Y}} % Blackboard-bold Y

\newcommand{\one}{\mathbf{1}}  % Identity
\newcommand{\zero}{\mathbf{0}} % Zero
%\newcommand{\TRUE}{\mathbf{TRUE}}  % True
%\newcommand{\FALSE}{\mathbf{FALSE}}  % False

\newcommand*{\mini}{\mathop{\mathrm{minimize}}}
\newcommand*{\maxi}{\mathop{\mathrm{maximize}}}
\newcommand*{\argmin}{\mathop{\mathrm{argmin}}}
\newcommand*{\argmax}{\mathop{\mathrm{argmax}}}
\newcommand*{\st}{\mathop{\mathrm{s.t.}}}
\newcommand{\sgn}{\mathop{\mathrm{sign}}}
\newcommand{\tr}{\mathop{\mathrm{tr}}}
\newcommand{\diag}{\mathop{\mathrm{diag}}}
\newcommand{\rank}{\mathop{\mathrm{rank}}}
\newcommand{\ovec}{\mathop{\mathrm{vec}}}
\newcommand{\traj}{\mathop{\mathrm{Traj}}}
\newcommand*{\cov}{\mathrm{Cov}}
\newcommand*{\conv}{\mathrm{conv}}
\newcommand*{\const}{\mathrm{constant}}

%%%%%%%% Bold Greek Letters %%%%%%%%%%%%%%%
\newcommand{\sigmab}{\bm{\sigma}}
\newcommand{\Sigmab}{\mathbf{\Sigma}}


%%%%%%%% Mess around with LaTeX %%%%%%%%%%%%%%%

%% Some style files might actually define these variables.
%% So don't mess with them if they are already defined

\ifx\BlackBox\undefined
\newcommand{\BlackBox}{\rule{1.5ex}{1.5ex}}  % end of proof
\fi

\ifx\QED\undefined
\def\QED{~\rule[-1pt]{5pt}{5pt}\par\medskip}
\fi

% Fallback `proof' environment, defined only when \proof does not exist yet.
% It starts a new unindented paragraph with a bold "Proof" and closes with a
% right-flushed \BlackBox followed by a 2mm line skip.
% NOTE(review): amsthm is loaded earlier in this preamble and normally provides
% `proof', in which case this branch is skipped -- confirm.
\ifx\proof\undefined
\newenvironment{proof}{\par\noindent{\bf Proof\ }}{\hfill\BlackBox\\[2mm]}
%\newenvironment{proof}{\emph{Proof. }}{ \hfill \QED}
\fi

% Theorem-like environments, declared only when \theorem is not defined yet
% (the whole group is skipped otherwise).
% Numbering: lemma, proposition, remark, corollary, definition and conjecture
% share the `theorem' counter; example, property and assumption each get
% their own counter.
\ifx\theorem\undefined
\newtheorem{theorem}{Theorem}
\newtheorem{example}{Example}
\newtheorem{property}{Property}
\newtheorem{lemma}[theorem]{Lemma}
\newtheorem{proposition}[theorem]{Proposition}
\newtheorem{remark}[theorem]{Remark}
\newtheorem{corollary}[theorem]{Corollary}
\newtheorem{definition}[theorem]{Definition}
\newtheorem{conjecture}[theorem]{Conjecture}
\newtheorem{assumption}{Assumption}
\fi

% `axiom' environment, declared only when \axiom is not defined yet; it is
% numbered on the `theorem' counter.
\ifx\axiom\undefined
\newtheorem{axiom}[theorem]{Axiom}
\fi

%%%%%%%% Utility functions %%%%%%%%%%%%%%%

\newcommand{\eq}[1]{(\ref{#1})}
\newcommand{\mymatrix}[2]{\left[\begin{array}{#1} #2 \end{array}\right]}
\newcommand{\mychoose}[2]{\left(\begin{array}{c} #1 \\ #2 \end{array}\right)}
\newcommand{\mydet}[1]{\det\left[ #1 \right]}
\newcommand{\sembrack}[1]{[\![#1]\!]}

\newcommand{\ea}{\emph{et al.}}
\newcommand{\eg}{\emph{e.g.}}
\newcommand{\ie}{\emph{i.e.}}
\newcommand{\iid}{\emph{i.i.d.}}
\newcommand{\etc}{\emph{etc.}}

%\newcommand{\alex}[1]{{\bf ALEX: \uppercase{#1}}}
%\newcommand{\vishy}[1]{{\bf VISHY: \uppercase{#1}}}
%\newcommand{\rene}[1]{{\bf RENE: \uppercase{#1}}}
%\newcommand{\karsten}[1]{{\bf KARSTEN: \uppercase{#1}}}

%%%%%%%% Specific symbols for this project %%%%%%%%%%%%%%%

\newcommand{\methodname}{KDE}
\newcommand{\ind}{\boldsymbol{\mathsf{I}}}

\newcommand{\hsic}{\mathrm{HSIC}}
\newcommand{\mmd}{\mathrm{MMD}}

%\newcommand{\tDiag}{\textsf{Diag}}
%\newcommand{\tTr}{\textsf{Tr}}
%\newcommand{\tE}{\textsf{E}}
%\newcommand{\tVec}{\textsf{Vec}}
%\newcommand{\tRank}{\textsf{Rank}}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% math symbols and commands
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%\newcommand{\eq}[1]{(\ref{#1})}
%\newcommand{\mymatrix}[2]{\left[\begin{array}{#1} #2 \end{array}\right]}

%brackets
\newcommand{\inner}[2]{\left\langle #1,#2 \right\rangle}
\newcommand{\rbr}[1]{\left(#1\right)}
\newcommand{\sbr}[1]{\left[#1\right]}
\newcommand{\cbr}[1]{\left\{#1\right\}}
\newcommand{\nbr}[1]{\left\|#1\right\|}
\newcommand{\abr}[1]{\left|#1\right|}
\newcommand{\smallfrac}[2]{{\textstyle \frac{#1}{#2}}}
\renewcommand{\url}[1]{{\sffamily #1}}
\newcommand{\arow}[2]{#1_{#2\cdot}}
\newcommand{\acol}[2]{#1_{\cdot#2}}
\def\ci{\perp\!\!\!\perp}

\newcommand{\ssbr}[1]{\left[\!\left[#1\right]\!\right]}

\newcommand{\twoco}[1]{\multicolumn{2}{c|}{#1}}

\newcommand{\wtimes}[1]{\times_{#1}}
\newcommand{\btimes}[1]{~\bar{\times}_{#1}~}


\newcommand{\secref}[1]{Section~\ref{#1}}
\newcommand{\eqnref}[1]{Eqn~(\ref{#1})}
\newcommand{\eqnsref}[1]{Eqns~(\ref{#1})}
\newcommand{\appref}[1]{Appendix~\ref{#1}}
\newcommand{\algtabref}[1]{Algorithm~\ref{#1}}
\newcommand{\lemref}[1]{Lemma~\ref{#1}}
\newcommand{\propref}[1]{Proposition~\ref{#1}}
\newcommand{\thmref}[1]{Theorem~\ref{#1}}
\newcommand{\corref}[1]{Corollary~\ref{#1}}
\newcommand{\asmpref}[1]{Assumption~\ref{#1}}

\newcommand{\tabref}[1]{Table~\ref{#1}}
\newcommand{\figref}[1]{Figure~\ref{#1}}


\newcommand{\defeq}{:=}

\newcommand{\AlgName}{{Spectral Dynamics Embedding}\xspace}
\newcommand{\algabb}{{SPEDE}\xspace}
\newcommand*{\dif}{\mathop{}\!\mathrm{d}}

\usepackage{xcolor}
\newcommand{\Bo}[1]{{\color{blue} [Bo: #1]}}
\newcommand{\Tongzheng}[1]{{\color{red} [Tongzheng: #1]}}


\allowdisplaybreaks

\title{A Free Lunch from the Noise: Provable and Practical Exploration \\ for Representation Learning (Supplementary Materials)}

% The standard author block has changed for UAI 2022 to provide
% more space for long author lists and allow for complex affiliations
%
% All author information is authomatically removed by the class for the
% anonymous submission version of your paper, so you can already add your
% information below.
%
% Add authors
\author[1, 2, $^\star$]{\href{mailto:<tongzheng@utexas.edu>?Subject=Your UAI 2022 paper}{Tongzheng Ren }{}}
\author[3, $^\star$]{\href{mailto:<tianjunz@berkeley.edu>?Subject=Your UAI 2022 paper}{Tianjun Zhang }{}}
\author[4, 5]{Csaba Szepesv\'{a}ri~}
\author[2]{\href{mailto:<bodai@google.com>?Subject=Your UAI 2022 paper}{Bo Dai}{}
}
% Add affiliations after the authors
\affil[1]{%
    Department of Computer Science, UT Austin
}
\affil[2]{%
    Google Research, Brain Team
}
\affil[3]{
    Department of EECS, UC Berkeley
  }
\affil[4]{DeepMind}
\affil[5]{Department of Computer Science, University of Alberta}
  
\begin{document}
\appendix
\onecolumn
\maketitle

\let\thefootnote\relax\footnotetext{$^\star$ Equal Contribution}

\section{Backgrounds on Reproducing Kernel Hilbert Space}
\label{sec:background}
We briefly introduce the basic concepts of the Reproducing Kernel Hilbert Space, which are helpful for understanding our paper. To start with, we define the inner product.
\begin{definition}[Inner Product]
A function $\langle \cdot, \cdot \rangle_{\mathcal{H}} : \mathcal{H}\times \mathcal{H} \to \mathbb{R}$ is said to be an inner product on $\mathcal{H}$ if it satisfies the following conditions:
\begin{enumerate}
    \item Positive Definiteness: $\forall u \in \mathcal{H}$, $\langle u, u \rangle\geq 0$, and $\langle u, u\rangle = 0 \Longleftrightarrow u = 0$.
    \item Symmetry: $\forall u, v \in \mathcal{H}$, $\langle u, v \rangle = \langle v, u\rangle$.
    \item Bilinearity: $\forall \alpha, \beta \in \mathbb{R}, u, v, w\in\mathcal{H}$, $\langle \alpha u + \beta v, w \rangle = \alpha\langle u, w\rangle + \beta \langle v, w\rangle$.
\end{enumerate}
Additionally, we can define a norm with the inner product: $\|u\| = \sqrt{\langle u, u\rangle}$.
\end{definition}
A Hilbert space is a vector space equipped with an inner product that additionally satisfies the technical condition of completeness. A finite-dimensional vector space with the canonical inner product is an example of a Hilbert space. We remark that $\mathcal{H}$ can also be a function space: for example, the space of all square-integrable functions (i.e., $\int_{\mathbb{R}} f(x)^2 \dif x < \infty$, generally denoted as $L_2$) is also a Hilbert space with inner product $\langle f, g\rangle = \int_{\mathbb{R}} f(x) g(x) \dif x$.

We then define the kernel, and introduce the notion of positive-definite kernel \citep{alvarez2012kernels}.
\begin{definition}[(Positive-Definite) Kernel]
A function $k:\mathcal{X} \times \mathcal{X} \to \mathbb{R}$ is said to be a kernel on non-empty set $\mathcal{X}$ if there exists a Hilbert space $\mathcal{H}$ and a feature map $\phi:\mathcal{X}\to\mathcal{H}$ such that $\forall x, x^\prime\in\mathcal{X}$, we have
\begin{align*}
    k(x, x^\prime) = \langle \phi(x), \phi(x^\prime)\rangle_{\mathcal{H}}.
\end{align*}
Moreover, the kernel is said to be positive definite if $\forall n\geq 1$, for all $\{a_i\}_{i\in [n]} \subset \mathbb{R}$ that are not all zero, and for all mutually distinct $\{x_i\}_{i\in [n]} \subset \mathcal{X}$, we have that
\begin{align*}
    \sum_{i\in[n]}\sum_{j\in[n]} a_i a_j k(x_i, x_j) > 0. 
\end{align*}
\end{definition}
Some well-known kernels include:
\begin{itemize}
    \item Linear Kernel: $k(x, y) = \langle x, y\rangle$, with the canonical feature map $\phi(x) = x$.
    \item Polynomial Kernel: $k(x, y) = (\langle x, y \rangle + c)^m$, where $m\in\mathbb{N}^+$ and $c\in\mathbb{R}^+$.
    \item Gaussian (a.k.a.\ radial basis function, RBF) Kernel: $k(x, y) = \exp\left(-\frac{\|x-y\|_2^2}{2\sigma^2}\right)$. It is known that such a kernel is positive definite.
\end{itemize}
Now we can define the Reproducing Kernel Hilbert space (RKHS) \citep{aronszajn1950theory}.
\begin{definition}[Reproducing Kernel Hilbert Space (RKHS)]
The Hilbert space $\mathcal{H}$ of $\mathbb{R}$-valued functions defined on a non-empty set $\mathcal{X}$ is said to be a reproducing kernel Hilbert space (RKHS) if there is a kernel $k:\mathcal{X} \times \mathcal{X} \to \mathbb{R}$, such that
\begin{enumerate}
    \item $\forall x\in\mathcal{X}$, $k(x, \cdot) \in \mathcal{H}$.
    \item $\forall x\in\mathcal{X}, f\in\mathcal{H}$, $\langle f, k(x, \cdot)\rangle_{\mathcal{H}} = f(x)$ (a.k.a the reproducing property), which also implies that $\langle k(x, \cdot), k(y, \cdot)\rangle = k(x, y)$.
\end{enumerate}
Here $k$ is called a reproducing kernel of $\mathcal{H}$.
\end{definition}
We provide an intuitive interpretation of the definition of RKHS when $\mathcal{H}$ is the space of linear functions. Consider $\mathcal{X} = \mathbb{R}^d$ and $k(x, y) = \langle x, y \rangle$. With the definition of the kernel $k$, we can see that $k(x, \cdot) : \mathcal{X} \to \mathbb{R}$ is a linear function, and thus lies in $\mathcal{H}$. Meanwhile, $\forall f\in\mathcal{H}$, there exists $\theta_f$ such that $f(x) = \theta_f^\top x$. We define the inner product on $\mathcal{H}$ via $\langle f, g\rangle_{\mathcal{H}} = \langle \theta_f, \theta_g\rangle$, and thus $\langle f, k(x, \cdot)\rangle_{\mathcal{H}} = \theta_f^\top x = f(x)$, which demonstrates the reproducing property, and shows that the space of linear functions on any finite-dimensional vector space is an RKHS with the linear kernel as the corresponding reproducing kernel.

We state the following theorems without the proof.
\begin{theorem}[Moore-Aronszajn \citep{aronszajn1950theory}]
Every positive definite kernel $k$ is associated with a unique RKHS $\mathcal{H}$.
\end{theorem}
Notice that the Moore-Aronszajn theorem guarantees that every positive definite kernel can be represented as the inner product in a certain Hilbert space; hence we can have a linear representation of the Gaussian distribution induced by the reproducing property of the Gaussian kernel, as we illustrated in the main text.
\begin{theorem}[Bochner \citep{rudin2017fourier}]
A continuous, shift-invariant kernel (i.e., $k(x, y) = k(x-y)$) is positive definite if and only if $k(x-y)$ is the Fourier transform of a non-negative measure $\mathbb{P}(\omega)$, i.e.,
\begin{align*}
    k(x-y) = \int_{\mathbb{R}^d} \exp(i\omega^\top (x-y)) \dif \mathbb{P}(\omega) = \int_{\mathbb{R}^d \times [0, 2\pi]} 2\cos(\omega^\top x + b) \cos(\omega^\top y + b) \dif (\mathbb{P}(\omega) \times \mathbb{P}(b)),
\end{align*}
where $\mathbb{P}(b)$ is a uniform distribution on $[0, 2\pi]$.
\end{theorem}
Bochner's theorem shows that any continuous positive definite shift-invariant kernel (e.g. Gaussian kernel, Laplacian kernel) can be represented as the inner product of random Fourier feature, which provides an additional way to provide a representation for certain distribution \citep[see][]{rahimi2007random, dai2014scalable}.

% \newpage
\section{An Equivalent Upper Confidence Bound Algorithm}
\label{sec:ucb}
In this section, we provide a generic Upper Confidence Bound (UCB) algorithm with the OFU principle, and show the connections and differences between the UCB algorithm and the TS algorithm. The prototype for our UCB algorithm is illustrated in Algorithm \ref{alg:UCB}.
\begin{algorithm}
\caption{Upper Confidence Bound (UCB) Algorithm} 
\label{alg:UCB}
\begin{algorithmic}[1]
\Require Number of Episodes $K$, Failure Probability $\delta\in(0, 1)$, Reward Function $r(s, a)$.
\State Initialize the history set $\mathcal{H}_0 = \emptyset$.
\For{episodes $k=1, 2, \cdots$}
\State \label{line:optimistic_planning} {\color{blue} Compute $\pi_{k}$ via \Comment{Optimistic Planning.}
\begin{align*}
    (\pi_{k}, \tilde{f}_k) = \mathop{\arg\max}_{\pi\in\Pi, \tilde{f}\in\mathcal{F}_k} \tilde{V}_{0}^\pi(s_0).
\end{align*}
where $\mathcal{F}_k$ is defined in \eqref{eq:confidence_set}.}
\For{steps $h=0, 1, \cdots, H-1$}\Comment{Execute $\pi_k$.}
\State Execute $a_h^k \sim \pi_k^h(s_h^k)$.
\State Observe $s_{h+1}^k$.
\EndFor
\State Set $\mathcal{H}_k = \mathcal{H}_{k-1} \cup \{(s_h^k, a_h^k, s_{h+1}^k)\}_{h=0}^{H-1}$.\Comment{Update the History.}
\EndFor
\end{algorithmic}
\end{algorithm}

Notice that the only difference between the UCB algorithm and the TS algorithm is the mechanism of finding the $f$ we use to plan for each episode (highlighted in blue). For the UCB algorithm, we perform optimistic planning, which finds the $\tilde{f}_k$ that potentially has the largest cumulative reward. However, such a constrained optimization problem is NP-hard even for the simplest linear bandits \citep{dani2008stochastic}. Instead, for the TS algorithm, we only sample $f_k$ from the posterior distribution, which gets rid of the complicated constrained optimization. We are interested in the UCB algorithm, as the worst-case regret bound of the UCB algorithm can be directly translated to the expected regret bound of the TS algorithm without the need for explicit manipulation of the prior and the posterior \citep{russo2013eluder, russo2014learning, osband2014model}.

\paragraph{Confidence Set Construction} Perhaps the most important part in OFU-style algorithm is the construction of confidence set $\mathcal{F}_k$. To enable sample-efficient learning, the confidence set should
\begin{enumerate}
    \item contain $f^*$ with high probability, so that we can identify $f^*$ eventually;
    \item shrink as fast as possible, so that we can identify $f^*$ efficiently.
\end{enumerate}
In the tabular setting, $\mathcal{F}_k$ is constructed via the concentration of sub-Gaussian/sub-Gamma random variables \citep[e.g.][]{azar2017minimax}, and in the linear MDP setting, $\mathcal{F}_k$ is constructed via the concentration on the linear parameters. As we don't assume any specific structures, we instead construct $\mathcal{F}_k$ via the concentration on the $\ell_2$ error, following the idea of \citet{russo2013eluder, osband2014model}. Specifically, consider the least-square estimates defined by
\begin{align}
    \hat{f}_K = \mathop{\arg\min}_{f\in \mathcal{F}}L_{2, K}(f) := \sum_{k\in [K]}\sum_{h=0}^{H-1} \|f(s_h^k, a_h^k) - s_{h+1}^k\|_2^2.
\end{align}
As $s_{h+1}^k = f^*(s_h^k, a_h^k) + \epsilon_h^k$ where $\epsilon_h^k$ is the Gaussian noise added to the step $h$ at the $k$-th episode, we know $\hat{f}_{K}$ will not deviate from $f^*$ a lot. Meanwhile, as $K$ increases, the estimate $\hat{f}_K$ should become closer to $f^*$. Specifically, define the empirical $2$-norm $\|\cdot\|_{2, E_K}$ as
\begin{align*}
    \|g\|_{2, E_K}^2 := \sum_{k\in [K]} \sum_{h=0}^{H-1} \|g(s_h^k, a_h^k)\|_2^2.
\end{align*}
We can construct the confidence set based on the following lemma:
\begin{lemma}[Confidence Set Construction \citep{russo2013eluder, osband2014model}]\label{lem:confidence_set}
Define
\begin{align}
\label{eq:confidence_set}
    \mathcal{F}_K = \left\{f\in\mathcal{F}:\|f - \hat{f}_K\|_{2, E_K}\leq \sqrt{\beta_K^*(\mathcal{F}, \delta, \alpha)}\right\},
\end{align}
then
\begin{align}
    \mathbb{P}_{f^*}\left(f^*\in \bigcap_{k=1}^{\infty}\mathcal{F}_k\right) \geq 1-2\delta,
\label{eq:optimism}
\end{align}
where
\begin{align}
    \beta_K^*(\mathcal{F}, \delta, \alpha) = 8\sigma^2 \log(\mathcal{N}(\mathcal{F}, \alpha, \|\cdot\|_{2})/\delta) + 2H\alpha(12C + \sqrt{8d\sigma^2\log(4K^2H/\delta)}).
\end{align}
\end{lemma}
The proof can be found in Appendix \ref{sec:proof_optimism}. Notice that the squared empirical $2$-norm $\|f - \hat{f}_K\|_{2, E_K}^2$ scales linearly with $K$, while $\beta_K^*(\mathcal{F}, \delta, \alpha)$ only scales as $\log K$, so the confidence set shrinks. Meanwhile, \eqref{eq:optimism} guarantees that $f^* \in \mathcal{F}_k$, $\forall k$ with high probability. Hence, it satisfies our requirement for the confidence set.

{\color{black}
\paragraph{Regret Upper Bound} We have the following upper bound of the regret for the UCB algorithm:
\begin{theorem}[Regret Bound]
\label{thm:regret_bound_UCB}
Assume Assumption \ref{assump:bounded_output} to \ref{assump:bounded_eluder} holds. We have that
\begin{align*}
\textstyle
    \mathrm{Regret}(K) \leq \tilde{O}(\sqrt{H^2 T\cdot \log \mathcal{N}(\mathcal{F}, T^{-1/2}, \|\cdot\|_2) \cdot \mathrm{dim}_{E}(\mathcal{F}, T^{-1/2})}).
\end{align*}
where $\tilde{O}$ represents the order up to logarithm factors.
\end{theorem}
}

\section{Technical Proof}
\label{sec:technical_proof}
\subsection{Proof for Lemma \ref{lem:confidence_set}} 
\label{sec:proof_optimism}
\begin{proof} We first show the following concentration on the $\ell_2$ error:
\begin{lemma}[Concentration of $\ell_2$ error \citep{russo2013eluder, osband2014model, wang2020reinforcement}]
$\forall \delta > 0, f:\mathcal{S}\times \mathcal{A} \to \mathbb{R}$, we have
\begin{align*}
    \mathbb{P}_{f^*}\left(L_{2, K}(f) \geq L_{2, K}(f^*) + \frac{1}{2}\|f - f^*\|^2_{2, E_K}- 4\sigma^2 \log (1/\delta), \quad\forall K\in \mathbb{N}\right) \geq 1-\delta
\end{align*}
\end{lemma}
\begin{proof}
Define the filtration $\mathcal{H}_{k, h} = \{(s_{h'}^i, a_{h'}^i)\}_{i\in [k-1], h' = 0, \cdots, H-1}\cup \{(s_i^k, a_i^k)\}_{i=0}^{h-1}$, and the random variable $Z_{k,h}$ adapted to the filtration $\mathcal{H}_{k, h}$ via:
\begin{align*}
    Z_{k, h} = & \|f^*(s_h^k, a_h^k) - s_{h+1}^k\|^2_2 - \|f(s_h^k, a_h^k) - s_{h+1}^k\|^2_2\\
    = & \|f^*(s_h^k, a_h^k) - s_{h+1}^k\|^2_2 - \|f(s_h^k, a_h^k) - f^*(s_h^k, a_h^k) + f^*(s_h^k, a_h^k) - s_{h+1}^k \|^2_2\\
    = & - \|f(s_h^k, a_h^k) - f^*(s_h^k, a_h^k)\|^2_2 + 2\langle f(s_h^k, a_h^k) - f^*(s_h^k, a_h^k), \epsilon_h^k\rangle,
\end{align*}
% \Bo{The first term should be square, instead of square root and the rest.}
where $\epsilon_h^k = s_{h+1}^k - f^*(s_h^k, a_h^k)$. Thus, $\mathbb{E}(Z_{k, h}|\mathcal{H}_{k, h}) = - \|f(s_h^k, a_h^k) - f^*(s_h^k, a_h^k)\|^2_2$, and $Z_{k, h} + \|f(s_h^k, a_h^k) - f^*(s_h^k, a_h^k)\|^2_2$ is a martingale difference sequence w.r.t.\ $\mathcal{H}_{k, h}$. Notice that we assume $\epsilon$ is an isotropic Gaussian noise with variance $\sigma^2$ on each dimension, thus the conditional log-moment generating function of $Z_{k, h} + \|f(s_h^k, a_h^k) - f^*(s_h^k, a_h^k)\|^2_2$ satisfies:
\begin{align*}
    M_{k, h}(\lambda) = & \log \mathbb{E}[\exp(\lambda(Z_{k, h} + \|f(s_h^k, a_h^k) - f^*(s_h^k, a_h^k)\|^2_2))|\mathcal{H}_{k, h}] \\
    = & \log \mathbb{E}[\exp(2\lambda\langle f(s_h^k, a_h^k) - f^*(s_h^k, a_h^k), \epsilon_h^k \rangle)|\mathcal{H}_{k, h}]\\
    \leq & 2\sigma^2\lambda^2\|f(s_h^k, a_h^k) - f^*(s_h^k, a_h^k)\|_2^2.
\end{align*}
Applying Lemma 4 in \citep{russo2013eluder}, we have that, $\forall x, \lambda \geq 0$,
\begin{align*}
    \mathbb{P}_{f^*}\left(\sum_{k\in [K]}\sum_{h=0}^{H-1}\lambda Z_{k, h} \leq x - \lambda(1-2\lambda  \sigma^2)\sum_{k\in [K]}\sum_{h=0}^{H-1}\|f(s_h^k, a_h^k) - f^*(s_h^k, a_h^k)\|_2^2,\quad \forall K\in \mathbb{N} \right) \geq 1-\exp(-x).
\end{align*}
Take $\lambda = \frac{1}{4\sigma^2}, x = \log 1/\delta$, and notice that $\sum_{k\in [K]}\sum_{h=0}^{H-1}Z_{k, h} = L_{2, K}(f^*) - L_{2, K}(f)$, we have the desired result.
\end{proof}

We construct an $\alpha$-cover $\mathcal{F}^{\alpha}$ of $\mathcal{F}$ with respect to $\|\cdot\|_2$. With a standard union bound, we know that, conditioned on $f^*$, with probability at least $1-\delta$, we have that
\begin{align*}
    L_{2, K}(f^{\alpha}) - L_{2, K}(f^*) \geq \frac{1}{2}\|f^\alpha - f^*\|_{2, E_{K}}^2 - 4\sigma^2 \log (|\mathcal{F}^{\alpha}|/\delta), \quad \forall K\in \mathbb{N}, f^{\alpha} \in \mathcal{F}^{\alpha}.
\end{align*}
Thus, we have that
\begin{align*}
    L_{2, K}(f) - L_{2, K}(f^*) \geq & \frac{1}{2}\|f - f^*\|_{2, E_{K}}^2 - 4\sigma^2 \log (|\mathcal{F}^{\alpha}|/\delta)\\
    & + \underbrace{\min_{f^\alpha \in \mathcal{F}^{\alpha}}\left\{\frac{1}{2}\|f^{\alpha} - f^*\|_{2, E_K}^2 - \frac{1}{2}\|f - f^*\|_{2, E_K}^2 + L_{2, K}(f) - L_{2, K}(f^\alpha)\right\}}_\text{Discretization Error}.
\end{align*}
We then deal with the discretization error. Assume $\alpha \leq 2C$ (or otherwise we only have a trivial cover) and $\|f^{\alpha}(s, a) - f(s, a)\|_2 \leq \alpha$, we have that
\begin{align*}
    & \|f^{\alpha}(s, a) - f^*(s, a)\|_2^2 - \|f(s, a) - f^*(s, a)\|_2^2\\
    = & \|f^\alpha(s, a)\|_2^2-\|f(s, a)\|_2^2 + 2\langle f^*(s, a), f(s, a) - f^{\alpha}(s, a)\rangle\\
    \leq &\max_{\|y\|_2\leq \alpha}\{\|f(s, a) + y\|_2^2 - \|f(s, a)\|_2^2\} + 2C\alpha\\
    = & \max_{\|y\|_2\leq \alpha} \{2\langle f(s, a), y\rangle + \|y\|_2^2\} + 2C\alpha\\
    \leq & 4C\alpha + \alpha^2 \leq 6C\alpha,
\end{align*}
where the inequality is by the Cauchy--Schwarz inequality and $\alpha \leq 2C$. Meanwhile,
\begin{align*}
    & \|s^\prime - f(s, a)\|_2^2 - \|s^\prime - f^{\alpha}(s, a)\|_2^2 \\
    = & 2\langle s^\prime, f^{\alpha}(s, a) - f(s, a)\rangle + \|f(s, a)\|_2^2 - \|f^{\alpha}(s, a)\|_2^2\\
    \leq & 2\langle \epsilon, f^{\alpha}(s, a) - f(s, a)\rangle + 2\langle f^*(s, a), f^{\alpha}(s, a) - f(s, a)\rangle + 2C\alpha + \alpha^2\\
    \leq & 2\|\epsilon\|_2 \alpha + 6C\alpha.
\end{align*}
We now consider the concentration property of $\|\epsilon\|_2$. Here we simply follow \citep{jin2019short} and notice that $\epsilon$ is $\sqrt{d}\sigma$-norm-sub-Gaussian, we have that
\begin{align*}
    \mathbb{P}(\|\epsilon\|_2 > \sqrt{2d\sigma^2 \log (2/\delta)}) \leq \delta.
\end{align*}
By a union bound, we have that
\begin{align*}
    \mathbb{P}\left(\exists k, h:\ \|\epsilon_h^k\|_2 > \sqrt{2d\sigma^2 \log (4k^2H/\delta)}\right) \leq \frac{\delta}{2} \sum_{k=1}^{\infty}\sum_{h=0}^{H-1} \frac{1}{k^2 H} \leq \delta.
\end{align*}
Sum all these up, we can see with probability $1-\delta$, $\forall K\in\mathbb{N}$, the discretization error is upper bounded by:
\begin{align*}
    H\alpha(12C + \sqrt{8d\sigma^2 \log (4K^2 H/\delta)}).
\end{align*}
As we consider the least square estimate $\hat{f}_K$, we have that $L_{2, K}(\hat{f}_K) - L_{2, K}(f^*) \leq 0$. Substitute back, we have the desired results.
\end{proof}
\subsection{Simulation Lemma}
\begin{lemma}[Simulation Lemma (adapted from Lemma 3.9 in \citep{kakade2020information})]\label{lem:simulation}
Given $\hat{f}$, $\forall s\in\mathcal{S}$, the value function $\hat{V}^\pi$ and $V^\pi$ corresponding to the model $\hat{f}$ and $f^*$ satisfies
\begin{align*}
    \hat{V}_0^\pi(s) - V_0^\pi(s) \leq H^{3/2} \sqrt{\mathbb{E}\left[\sum_{h=0}^{H-1}\min\left\{\frac{2\|f^*(s_h, a_h) - \hat{f}(s_h, a_h) \|_2^2}{\sigma^2}, 1\right\}\right]}.
\end{align*}
\end{lemma}
\label{sec:proof_simulation}
\begin{proof}

We first show the following difference lemma:
\begin{lemma}[Difference Lemma]
\label{lem:difference}
Assume the trajectory $\{(s_h, a_h)\}_{h=0}^{H-1}$ is generated via policy $\pi$ and ground truth $f^*$, define
\begin{align*}
    V_h = \sum_{\tau=h}^{H-1}r(s_\tau, a_\tau)
\end{align*}
then $\forall \tau \in \{1, \cdots, H-1\}$, we have:
\begin{align*}
    \hat{V}_0^\pi(s_0) - V_0  = & \mathbb{E}_{s_\tau^\prime\sim \mathcal{N}(\hat{f}(s_{\tau-1}, a_{\tau-1}), \sigma^2 I)} \left[\hat{V}_{\tau}^\pi(s_\tau^\prime)\right] - V_{\tau} \\
    & + \sum_{h=1}^{\tau-1}\left[\mathbb{E}_{s_h^\prime\sim \mathcal{N}(\hat{f}(s_{h-1}, a_{h-1}), \sigma^2 I)} \left[\hat{V}_h^\pi(s_h^\prime)\right] - \hat{V}_h^\pi(s_h) \right].
\end{align*}
\end{lemma}
\begin{proof}
When $\tau = 1$, we can obtain the result with $a_0 = \pi(s_0)$ and
\begin{align*}
    \hat{V}_0^\pi(s_0) = r(s_0, \pi(s_0)) + \mathbb{E}_{s_1^\prime\sim \mathcal{N}(\hat{f}(s_0, a_0), \sigma^2 I)} \hat{V}_1^\pi(s_1^\prime).
\end{align*}
We only need to show the case when $\tau = 2$, and the case when $\tau > 2$ can be derived via recursion. Notice that
\begin{align*}
    \hat{V}_0^\pi(s_0) - V_0  = & \mathbb{E}_{s_1^\prime\sim \mathcal{N}(\hat{f}(s_0, a_0), \sigma^2 I)} \left[\hat{V}_1^\pi(s_1^\prime)\right] - V_1\\
    = & \hat{V}_1^\pi(s_1) - V_1  +  \mathbb{E}_{s_1^\prime\sim \mathcal{N}(\hat{f}(s_0, a_0), \sigma^2 I)} \left[\hat{V}_1^\pi(s_1^\prime)\right] - \hat{V}_1^\pi(s_1)\\
    = & \mathbb{E}_{s_2^\prime\sim \mathcal{N}(\hat{f}(s_1, a_1), \sigma^2 I)}\left[\hat{V}_2^\pi(s_2^\prime)\right] - V_2 + \mathbb{E}_{s_1^\prime\sim \mathcal{N}(\hat{f}(s_0, a_0), \sigma^2 I)} \left[\hat{V}_1^\pi(s_1^\prime)\right] - \hat{V}_1^\pi(s_1),
\end{align*}
where the last equality is due to the fact that $a_1 = \pi(s_1)$.
\end{proof}
We then follow the idea of ``optional stopping'' used in \citep{kakade2020information} and show the following ``optional stopping'' simulation lemma.
\begin{lemma}[``Optional Stopping'' Simulation Lemma]\label{lem:option-stop-simulation}
Consider the stochastic process over the trajectories $\{(s_h, a_h)\}_{h=0}^{H-1}$ generated via policy $\pi$ and ground truth $f^*$, where the randomness is from the Gaussian noise in the dynamics. Define a stopping time $\tau$ w.r.t this stochastic process and a given model $\hat{f}$ via:
\begin{align*}
    \tau := \min\{h\geq 0: \hat{V}_h^\pi(s_h) \leq V_h^\pi(s_h)\}.
\end{align*}
Furthermore, define a random variable:
\begin{align*}
    \tilde{V}_h^\pi(s_h) = \max\{\hat{V}_h^\pi(s_h), V_h^\pi(s_h)\},
\end{align*}
we have that
\begin{align*}
    \hat{V}_0^\pi(s_0) - V_0^\pi(s_0) \leq \mathbb{E}\left[\sum_{h=0}^{H-1}\mathbf{1}_{h<\tau} \left(\mathbb{E}_{s_{h+1}^\prime \sim \mathcal{N}(\hat{f}(s_h, a_h), \sigma^2 I)}\tilde{V}_h^\pi(s_{h+1}^\prime) - \mathbb{E}_{s_{h+1}^\prime \sim \mathcal{N}(f^*(s_h, a_h), \sigma^2 I)}\tilde{V}_h^\pi(s_{h+1}^\prime)\right)\right],
\end{align*}
where the expectation is w.r.t the stochastic process over the trajectories.
\end{lemma}
\begin{proof}
Define the filtration $\mathcal{F}_h :=\{\epsilon_i\}_{i=0}^{h-1}$, where $\epsilon_i$ is the noise added to the dynamics at step $i$. Define
\begin{align*}
    M_h = \mathbb{E}[\hat{V}_0^\pi(s_0) - V_0 | \mathcal{F}_h],
\end{align*}
which is a Doob martingale with respect to $\mathcal{F}_h$ \citep{grimmett2020probability}. As $\tau\leq H$, by Doob's optional stopping theorem, we have that
\begin{align*}
    \mathbb{E}[\hat{V}_0^\pi(s_0) - V_0] = \mathbb{E}[M_{\tau}] = \mathbb{E}[\mathbb{E}[\hat{V}_0^\pi(s_0) - V_0|\mathcal{F}_{\tau}]].
\end{align*}
We then provide a bound for $M_{\tau}$. By Lemma \ref{lem:difference}, we have that
\begin{align*}
    M_{\tau} = & \mathbb{E}[\hat{V}_0^\pi(s_0) - V_0|\mathcal{F}_{\tau}]\\
    = & \mathbb{E}_{s_\tau^\prime\sim \mathcal{N}(\hat{f}(s_{\tau-1}, a_{\tau-1}), \sigma^2 I)} \left[\hat{V}_{\tau}^\pi(s_\tau^\prime)\right] - V_{\tau}^\pi(s_{\tau})\\
    & + \sum_{h=1}^{\tau-1}\left(\mathbb{E}_{s_h^\prime\sim \mathcal{N}(\hat{f}(s_{h-1}, a_{h-1}), \sigma^2 I)} \left[ \hat{V}_h^\pi(s_h^\prime)\right] - \hat{V}_h^\pi(s_h)\right)\\
    = & \sum_{h=1}^{\tau}\left(\mathbb{E}_{s_h^\prime\sim \mathcal{N}(\hat{f}(s_{h-1}, a_{h-1}), \sigma^2 I)} \left[\hat{V}_h^\pi(s_h^\prime)\right] - \tilde{V}_h^\pi(s_h)\right)\\
    \leq & \sum_{h=1}^{\tau}\left( \mathbb{E}_{s_h^\prime\sim \mathcal{N}(\hat{f}(s_{h-1}, a_{h-1}), \sigma^2 I)} \left[\tilde{V}_h^\pi(s_h^\prime)\right] - \tilde{V}_h^\pi(s_h)\right)\\
    = & \sum_{h=1}^{H} \mathbf{1}_{h\leq \tau}\left(\mathbb{E}_{s_h^\prime\sim \mathcal{N}(\hat{f}(s_{h-1}, a_{h-1}), \sigma^2 I)} \left[\tilde{V}_h^\pi(s_h^\prime)\right] - \tilde{V}_h^\pi(s_h)\right),
\end{align*}
where the third equality follows from the definition of $\tau$ (and thus $V_{\tau}^\pi(s_{\tau}) = \tilde{V}_{\tau}^\pi(s_\tau)$ and $\hat{V}_h^\pi(s_h) = \tilde{V}_h^\pi(s_h)$ for $h < \tau$).

The proof is then concluded via the following observation:
\begin{align*}
    & \mathbb{E} \left[\mathbf{1}_{h\leq \tau}\left( \mathbb{E}_{s_h^\prime\sim \mathcal{N}(\hat{f}(s_{h-1}, a_{h-1}), \sigma^2 I)} \left[\tilde{V}_h^\pi(s_h^\prime)\right] - \tilde{V}_h^\pi(s_h)\right)\right]\\
    = & \mathbb{E}\left[\mathbb{E}\left[\mathbf{1}_{h\leq \tau}\left( \mathbb{E}_{s_h^\prime\sim \mathcal{N}(\hat{f}(s_{h-1}, a_{h-1}), \sigma^2 I)} \left[\tilde{V}_h^\pi(s_h^\prime)\right] - \tilde{V}_h^\pi(s_h)\right)\bigg|\mathcal{F}_{h-1}\right]\right]\\
    = & \mathbb{E}\left[\mathbb{E}\left[\mathbf{1}_{h-1<\tau}\left( \mathbb{E}_{s_h^\prime\sim \mathcal{N}(\hat{f}(s_{h-1}, a_{h-1}), \sigma^2 I)} \left[\tilde{V}_h^\pi(s_h^\prime)\right] - \tilde{V}_h^\pi(s_h)\right)\bigg|\mathcal{F}_{h-1}\right]\right]\\
    = & \mathbb{E}\left[\mathbf{1}_{h-1<\tau}\mathbb{E}\left[\left( \mathbb{E}_{s_h^\prime\sim \mathcal{N}(\hat{f}(s_{h-1}, a_{h-1}), \sigma^2 I)} \left[\tilde{V}_h^\pi(s_h^\prime)\right] - \tilde{V}_h^\pi(s_h)\right)\bigg|\mathcal{F}_{h-1}\right]\right]\\
    = & \mathbb{E}\left[\mathbf{1}_{h-1<\tau}\left(\mathbb{E}_{s_h^\prime\sim \mathcal{N}(\hat{f}(s_{h-1}, a_{h-1}), \sigma^2 I)} \left[\tilde{V}_h^\pi(s_h^\prime)\right] - \mathbb{E}_{s_h^\prime\sim \mathcal{N}(f^*(s_{h-1}, a_{h-1}), \sigma^2 I)} \left[\tilde{V}_h^\pi(s_h^\prime)\right]\right)\right],
\end{align*}
where the third equality is due to the fact that $\mathbf{1}_{h-1<\tau}$ is measurable under $\mathcal{F}_{h-1}$. 
\end{proof}
Before we finally provide the proof of Lemma \ref{lem:simulation}, we state the following lemma that bounds the difference of expectations under two isotropic Gaussian distributions with different means:
\begin{lemma}[Difference of Expectation under Different Mean Isotropic Gaussian]
\label{lem:expectation_diff}$\forall$ (appropriately measurable) positive function $g$, we have that
\begin{align*}
    \mathbb{E}_{z\sim\mathcal{N}(\mu_1, \sigma^2 I)} [g(z)] - \mathbb{E}_{z\sim\mathcal{N}(\mu_2, \sigma^2 I)} [g(z)] \leq \min\left\{\frac{\sqrt{2}\|\mu_1 - \mu_2\|}{\sigma}, 1\right\} \sqrt{\mathbb{E}_{z\sim \mathcal{N}(\mu_1, \sigma^2 I)}[g(z)^2]}
\end{align*}
\end{lemma}
\begin{proof}
\begin{align*}
     & \mathbb{E}_{z\sim\mathcal{N}(\mu_1, \sigma^2 I)} [g(z)] - \mathbb{E}_{z\sim\mathcal{N}(\mu_2, \sigma^2 I)} [g(z)] \\
     = & \mathbb{E}_{z\sim\mathcal{N}(\mu_1, \sigma^2 I)} \left[g(z)\left(1 - \exp\left(\frac{2(\mu_2 - \mu_1)^\top z - \|\mu_2\|^2 + \|\mu_1\|^2}{2\sigma^2}\right)\right)\right]\\
     \leq & \sqrt{\mathbb{E}_{z\sim\mathcal{N}(\mu_1, \sigma^2 I)}[g(z)^2]} \sqrt{\mathbb{E}_{z\sim\mathcal{N}(\mu_1, \sigma^2 I)} \left(1 - \exp\left(\frac{2(\mu_2 - \mu_1)^\top z - \|\mu_2\|^2 + \|\mu_1\|^2}{2\sigma^2}\right)\right)^2}
\end{align*}
We then calculate
\begin{align*}
    & \mathbb{E}_{z\sim\mathcal{N}(\mu_1, \sigma^2 I)} \left(1 - \exp\left(\frac{2(\mu_2 - \mu_1)^\top z - \|\mu_2\|^2 + \|\mu_1\|^2}{2\sigma^2}\right)\right)^2\\
    = & 1 - \frac{2}{(2\pi\sigma^2)^{d/2}} \int \exp\left(\frac{-\|z - \mu_1\|_2^2 + 2(\mu_2 - \mu_1)^\top z - \|\mu_2\|^2 + \|\mu_1\|^2}{2\sigma^2}\right) dz \\
    & + \frac{1}{(2\pi\sigma^2)^{d/2}} \int \exp\left(\frac{-\|z - \mu_1\|_2^2 + 4(\mu_2 - \mu_1)^\top z - 2\|\mu_2\|^2 + 2\|\mu_1\|^2}{2\sigma^2}\right) dz\\
    = & -1 + \frac{1}{(2\pi\sigma^2)^{d/2}} \int \exp\left(\frac{-\|z - (2\mu_2 - \mu_1)\|_2^2 + 2\|\mu_2 - \mu_1\|_2^2}{2\sigma^2}\right) dz\\
    = & -1 + \exp\left(\frac{\|\mu_2 - \mu_1\|_2^2}{\sigma^2}\right).
\end{align*}
Also notice that, as $g$ is positive, a simple bound is that
\begin{align*}
     \mathbb{E}_{z\sim\mathcal{N}(\mu_1, \sigma^2 I)} [g(z)] - \mathbb{E}_{z\sim\mathcal{N}(\mu_2, \sigma^2 I)} [g(z)] \leq \mathbb{E}_{z\sim\mathcal{N}(\mu_1, \sigma^2 I)} [g(z)]\leq \sqrt{\mathbb{E}_{z\sim\mathcal{N}(\mu_1, \sigma^2 I)}[g(z)^2]}.
\end{align*}
Thus,
\begin{align*}
    \mathbb{E}_{z\sim\mathcal{N}(\mu_1, \sigma^2 I)} [g(z)] - \mathbb{E}_{z\sim\mathcal{N}(\mu_2, \sigma^2 I)} [g(z)] \leq \sqrt{\mathbb{E}_{z\sim\mathcal{N}(\mu_1, \sigma^2 I)}[g(z)^2]}\sqrt{\min\left\{\exp\left(\frac{\|\mu_2 - \mu_1\|_2^2}{\sigma^2}\right) - 1, 1\right\}}.
\end{align*}
Notice that, if $\|\mu_2 - \mu_1\| \geq \sigma$, then $\exp\left(\frac{\|\mu_2 - \mu_1\|_2^2}{\sigma^2}\right) - 1 \geq 1$. Meanwhile, when $x\in [0, 1]$, $\exp(x) \leq 1 + 2x$. Thus, 
\begin{align*}
    \sqrt{\min\left\{\exp\left(\frac{\|\mu_2 - \mu_1\|_2^2}{\sigma^2}\right) - 1, 1\right\}} \leq \sqrt{\min\left\{1 + \frac{2\|\mu_2 - \mu_1\|_2^2}{\sigma^2} - 1, 1\right\}} = \min\left\{\frac{\sqrt{2}\|\mu_2 - \mu_1\|_2}{\sigma}, 1\right\},
\end{align*}
which finishes the proof.
\end{proof}
With Lemma \ref{lem:option-stop-simulation}, we have that
\begin{align*}
    & \hat{V}_0^\pi(s_0) - V_0^\pi(s_0)  \\
    \leq & \mathbb{E}\left[\sum_{h=0}^{H-1}\mathbf{1}_{h<\tau}\left(\mathbb{E}_{s_{h+1}^\prime\sim \mathcal{N}(\hat{f}(s_h, a_h), \sigma^2 I)} \left[\tilde{V}_h^\pi(s_{h+1}^\prime)\right] - \mathbb{E}_{s_{h+1}^\prime\sim \mathcal{N}(f^*(s_h, a_h), \sigma^2 I)} \left[\tilde{V}_h^\pi(s_{h+1}^\prime)\right]\right)\right]\\
    \leq & \sum_{h=0}^{H-1}  \mathbb{E}\left[\sqrt{\mathbb{E}_{s_{h+1}^\prime\sim \mathcal{N}(\hat{f}(s_h, a_h), \sigma^2 I)}\left[\tilde{V}_h^\pi(s_{h+1}^\prime)^2\right]} \min\left\{\frac{\sqrt{2}\|f^*(s_h, a_h) - \hat{f}(s_h, a_h)\|_2}{\sigma}, 1\right\}\right]\\
    \leq & \sum_{h=0}^{H-1} \sqrt{\mathbb{E}\left[\mathbb{E}_{s_{h+1}^\prime\sim \mathcal{N}(\hat{f}(s_h, a_h), \sigma^2 I)}\left[\tilde{V}_h^\pi(s_{h+1}^\prime)^2\right]\right]} \sqrt{\mathbb{E}\left[\min\left\{\frac{2\|f^*(s_h, a_h) - \hat{f}(s_h, a_h)\|_2^2}{\sigma^2}, 1\right\}\right]}\\
    \leq & \sqrt{\mathbb{E}\left[\sum_{h=0}^{H-1}\mathbb{E}_{s_{h+1}^\prime\sim P(\cdot|f^*(s_h, a_h))}\left[\tilde{V}_h^\pi(s_{h+1}^\prime)^2\right]\right]} \sqrt{\mathbb{E}\left[\sum_{h=0}^{H-1} \min \left\{\frac{2\|f^*(s_h, a_h) - \hat{f}(s_h, a_h)\|_2^2}{\sigma^2}, 1\right\}\right]}\\
    \leq & H^{3/2}\sqrt{\mathbb{E}\left[\sum_{h=0}^{H-1} \min \left\{\frac{2\|f^*(s_h, a_h) - \hat{f}(s_h, a_h)\|_2^2}{\sigma^2}, 1\right\}\right]}
\end{align*}
where the second inequality is due to Lemma \ref{lem:expectation_diff}, and the last inequality is due to the fact that $\tilde{V}_h^\pi(s_{h+1}^\prime) \leq H$, $\forall h$.
\end{proof}
\subsection{Sum of Width Square}
\begin{lemma}[Bound on the Sum of Width Square] \label{lem:width_sum_bound}
Define 
\begin{align*}
    w_{\mathcal{F}}(s, a) := \sup_{\bar{f},\underline{f}\in\mathcal{F}}\|\bar{f}(s, a) - \underline{f}(s, a)\|_2.
\end{align*}
If $\{\beta_k^*\}_{k\in [K]}$ is a non-decreasing sequence, and $\|f\|_{2} < C, \forall f\in \mathcal{F}$, then:
\begin{align*}
    \sum_{k\in [K]} \sum_{h=0}^{H-1} w_{\mathcal{F}_k}^2(s_h^k, a_h^k) \leq 1 + 4C^2 H \mathrm{dim}_{E}\left(\mathcal{F}, T^{-1/2}\right) + 4\beta_K \mathrm{dim}_{E}\left(\mathcal{F}, T^{-1/2}\right) (1 + \log T)
\end{align*}
\end{lemma}
\label{sec:proof_width}
\begin{proof}
We first show the following lemma, which will be helpful in our proof.
\begin{lemma}[Lemma 1 in \citep{osband2014model}]
If $\{\beta_k\}_{k\in [K]}$ is a non-decreasing sequence, we have
\begin{align*}
    \sum_{k\in [K]}\sum_{h=0}^{H-1} \mathbf{1}_{w_{\mathcal{F}_{k}}(s_h^k, a_h^k) > \epsilon} \leq \left(\frac{4\beta_K}{\epsilon^2} +  H \right)\mathrm{dim}_{E}(\mathcal{F}, \epsilon).
\end{align*}
\end{lemma}
\begin{proof}
We first consider when $w_{\mathcal{F}_k}(s_h^k, a_h^k) > \epsilon$ and $(s_h^k, a_h^k)$ is $\epsilon$-dependent on $n$ disjoint sub-sequences of $\{(s_h^i, a_h^i)\}_{i\in [k-1]}$. By the definition of $\epsilon$-dependent, we know $\|\bar{f} - \underline{f}\|_{2, E_k}^2 > n\epsilon^2$. On the other hand, by triangle inequality, we know $\|\bar{f} - \underline{f}\|_{2, E_k}\leq 2\sqrt{\beta_k} \leq 2\sqrt{\beta_K}$, thus $n < \frac{4\beta_K}{\epsilon^2}$. Hence we know when $w_{\mathcal{F}_k}(s_h^k, a_h^k) > \epsilon$, then $(s_h^k, a_h^k)$ is $\epsilon$-dependent on at most $\frac{4\beta_K}{\epsilon^2}$ disjoint sub-sequences of $\{(s_h^i, a_h^i)\}_{i\in [k-1]}$.

We then show that, for any sequence $\{(s_i, a_i)\}_{i\in [N]}$, there is some element $(s_j, a_j)$ that is $\epsilon$-dependent on at least $\frac{n}{\mathrm{dim}_E(\mathcal{F}, \epsilon)} - H$ disjoint sub-sequences of $\{(s_i, a_i)\}_{i\in [j-1]}$. Let $n$ satisfy $n\mathrm{dim}_E(\mathcal{F}, \epsilon) + 1 \leq N \leq (n+1)\mathrm{dim}_E(\mathcal{F}, \epsilon) $, and we will construct $n$ disjoint sub-sequences $\{B_i\}_{i\in [n]}$. We first let $B_i = \{(s_i, a_i)\}, \forall i\in [n]$. If $(s_{k+1}, a_{k+1})$ is $\epsilon$-dependent on each $B_i, i\in [n]$, we have the desired results. Otherwise, we append $(s_{k+1}, a_{k+1})$ to the sub-sequence that it is $\epsilon$-independent with. Repeat this process until some $j > n + 1$ is $\epsilon$-dependent on each sub-sequence or we have reached $N$. In the latter case we have $\sum_{i\in [n]}|B_i| \geq n \mathrm{dim}_E(\mathcal{F}, \epsilon)$ (here we can add at most $H-1$ data to avoid the case we need a new episode of data), and since each element of a sub-sequence is $\epsilon$-independent with its predecessors, $|B_i|\leq \mathrm{dim}_E(\mathcal{F}, \epsilon), \forall i$ by the definition of eluder dimension. Thus $|B_i|=\mathrm{dim}_E(\mathcal{F}, \epsilon), \forall i$. And in this case, $(s_N, a_N)$ must be $\epsilon$-dependent on each sub-sequence by the definition of eluder dimension. Notice that, as our data is collected in an episodic pattern, there are at most $H-1$ sub-sequences that contain ``imaginary'' final episode data introduced to the construction. In this case, we know that there are at least $\frac{n}{\mathrm{dim}_E(\mathcal{F}, \epsilon)} - H$ disjoint sub-sequences that $(s_N, a_N)$ is $\epsilon$-dependent on, which finishes our claim.

We finally consider the sub-sequence $B = \{(s_h^k, a_h^k)\}$ with $w_{\mathcal{F}_k}(s_h^k, a_h^k) > \epsilon$. We know that each element in $B$ is $\epsilon$-dependent on at most $\frac{4\beta_K}{\epsilon^2}$ disjoint sub-sequence of $B$, but at least $\epsilon$-dependent on $\frac{|B|}{\mathrm{dim}_E(\mathcal{F}, \epsilon)} - H$ sub-sequence of $B$. Thus we know $|B| \leq \left(\frac{4\beta_K}{\epsilon^2} +  H \right)\mathrm{dim}_{E}(\mathcal{F}, \epsilon)$, which concludes the proof.
\end{proof}


For notational simplicity, we define $w_{k, h} := w_{\mathcal{F}_k}(s_h^k, a_h^k)$. We first reorder the sequence $\{w_{k, h}\}_{k\in [K], 0\leq h\leq H-1}\to \{w_{i}\}_{i\in [KH]}$, such that $w_1\geq \cdots \geq w_{KH}$. Then we have
\begin{align*}
    \sum_{k\in [K]} \sum_{h=0}^{H-1}w_{\mathcal{F}_k}^2(s_h^k, a_h^k) = \sum_{i\in [KH]} w_i^2 \leq \sum_{i\in [KH]} w_i^2 \mathbf{1}_{w_i < T^{-1/2}} + \sum_{i\in [KH]} w_i^2 \mathbf{1}_{w_i \geq T^{-1/2}}\leq 1 + \sum_{i\in [KH]} w_i^2 \mathbf{1}_{w_i \geq T^{-1/2}}.
\end{align*}
As we order the sequence, $w_j \geq \epsilon$ means
\begin{align*}
    \sum_{k\in [K]}\sum_{h=0}^{H-1}  \mathbf{1}_{w_{\mathcal{F}_{k}}(s_h^k, a_h^k) > \epsilon} \geq j.
\end{align*}
Hence we know
\begin{align*}
    \epsilon \leq \sqrt{\frac{4\beta_K}{\frac{j}{\mathrm{dim}_{E}(\mathcal{F}, \epsilon)} - H}} = \sqrt{\frac{4\beta_K \mathrm{dim}_{E}(\mathcal{F}, \epsilon)}{j - H\mathrm{dim}_{E}(\mathcal{F}, \epsilon)}},
\end{align*}
which means if $w_i \geq T^{-1/2}$, then $w_i < \min \left\{2C, \sqrt{\frac{4\beta_K \mathrm{dim}_{E}(\mathcal{F}, T^{-1/2})}{i - H\mathrm{dim}_{E}(\mathcal{F}, T^{-1/2})}}\right\}$. Hence,
\begin{align*}
    \sum_{i\in [KH]} w_i^2 \mathbf{1}_{w_i\geq T^{-1/2}} \leq & 4C^2 H \mathrm{dim}_{E}\left(\mathcal{F}, T^{-1/2}\right) + \sum_{j=H\mathrm{dim}_{E}(\mathcal{F}, T^{-1/2}) + 1}^T \frac{4\beta_K \mathrm{dim}_{E}(\mathcal{F}, T^{-1/2})}{j - H\mathrm{dim}_{E}(\mathcal{F}, T^{-1/2})}\\
    \leq & 4C^2 H \mathrm{dim}_{E}\left(\mathcal{F}, T^{-1/2}\right) + 4\beta_K \mathrm{dim}_{E}\left(\mathcal{F}, T^{-1/2}\right) (1 + \log T),
\end{align*} 
which finishes the proof.
\end{proof}
\subsection{Proof for Theorem \ref{thm:regret_bound} and Theorem \ref{thm:regret_bound_UCB}}
\label{sec:proof_regret}
\begin{proof}
Define the event $\mathcal{E}_k = \left\{f^* \in \mathcal{F}_k\right\}$. When constructing the confidence set, take $\alpha = T^{-1/2}$ and $\delta = 0.25$ in Lemma \ref{lem:confidence_set}, which leads to 
\begin{align*}
    \beta_k^* := 8\sigma^2 \log(4\mathcal{N}(\mathcal{F}, T^{-1/2}, \|\cdot\|_2)) + HT^{-1/2}(12C + \sqrt{8d\sigma^2 \log (16k^2 H)}).
\end{align*} 
With our confidence set construction, we know that $\sum_{k\in [K]}P(\bar{\mathcal{E}}_k)\leq 0.5$. Notice that
\begin{align*}
    \mathrm{Regret}(K) = &  \sum_{k\in [K]} \left[V_0^*(s_0^k) - V_0^{\pi_k}(s_0^k)\right]\\
    \leq &  \mathbb{E}\left[\sum_{k\in [K]}\mathbb{E}\left[\mathbb{P}(\mathcal{E}_k)[V_0^*(s_0^k) - V_0^{\pi_k}(s_0^k)]\right]\right] + H\sum_{k\in [K]}\mathbb{P}(\bar{\mathcal{E}}_k) \\
    \leq & \mathbb{E}\left[\sum_{k\in [K]}\mathbb{E}\left[\tilde{V}_{0, k}^{\pi_k}(s_0^k) - V_0^{\pi_k}(s_0^k)\right]\right] + 0.5 H \\
    \leq & H^{3/2} \sum_{k\in [K]}\sqrt{\mathbb{E} \left[\sum_{h=0}^{H-1}\min\left\{\frac{2\|\tilde{f}_k(s_h^k, a_h^k) - f^*(s_h^k, a_h^k) \|_2^2}{\sigma^2}, 1\right\}\right]} + 0.5H \\
    \leq & \sqrt{H^2T\mathbb{E}\left[\sum_{k\in [K]} \sum_{h=0}^{H-1}\min\left\{\frac{2\|\tilde{f}_k(s_h^k, a_h^k) - f^*(s_h^k, a_h^k) \|_2^2}{\sigma^2}, 1\right\}\right]} + 0.5H \\
    \leq & \sqrt{\frac{2H^2 T}{\sigma^2} \left(1 + 4C^2 H \mathrm{dim}_{E}\left(\mathcal{F}, T^{-1/2}\right) + 4\beta_K^* \mathrm{dim}_{E}\left(\mathcal{F}, T^{-1/2}\right) (1 + \log T)\right)} + 0.5H,
\end{align*}
where the first inequality is due to the fact that the total reward for each episode is bounded in $[0, H]$, the second inequality is due to the optimism and our confidence set construction, the third inequality is due to Lemma \ref{lem:simulation}, the fourth inequality is due to the Cauchy--Schwarz inequality and the final inequality is due to Lemma \ref{lem:width_sum_bound} {\color{black}, which concludes the proof of Theorem \ref{thm:regret_bound_UCB}. Following the idea of \citet{russo2013eluder, russo2014learning, osband2014model}, we can translate the worst-case regret bound for the UCB algorithm into the expected regret bound for the TS algorithm, which concludes the proof of Theorem \ref{thm:regret_bound}.}
\end{proof}
{\color{black} \paragraph{Remark} It can be undesirable that our regret bound scales with $\sigma^{-1}$, which means our algorithm can perform poorly when the noise level is extremely low. It is also somewhat counter-intuitive. We want to remark that this phenomenon is only an artifact introduced by our proof strategy. The simulation lemma (Lemma \ref{lem:simulation}) works well when $f(s, a) - \tilde{f}(s, a)$ is small. However, we need to tolerate some bad episodes to collect sufficient samples, which can eventually make the error small. Fortunately, the regret of such a bad episode is at most $H$. Hence, we can use the following strategy to get rid of the dependency on $\sigma^{-1}$.

\begin{definition}[Bad and Good Episodes] Define episode $k$ as a bad episode if there exists $h \in \{0, 1, \cdots, H-1\}$ such that $w_{k, h} := w_{\mathcal{F}_k}(s_h^k, a_h^k)$ is among the largest $H \mathrm{dim}_{E}(\mathcal{F}, \sigma^2 T^{-1/2})$ elements of the set $\{w_{k, h}\}_{k\in[K], 0\leq h \leq H-1}$. Define episode $k$ as a good episode if it is not a bad episode.
\end{definition}

By definition, there are at most $H \mathrm{dim}_{E}(\mathcal{F}, \sigma^2 T^{-1/2})$ bad episodes. We then show the following lemma, which is a direct generalization of Lemma~\ref{lem:width_sum_bound}, obtained by setting $\epsilon = \sigma^2 T^{-1/2}$ and removing the terms from bad episodes.
\begin{lemma}
\label{lem:width_sum_bound_good}
If $\{\beta_k^*\}_{k\in [K]}$ is a non-decreasing sequence, and $\|f\|_{2} < C, \forall f\in \mathcal{F}$, then:
\begin{align*}
    \sum_{k\in [K], \text{$k$ is good}} \sum_{h=0}^{H-1} w_{\mathcal{F}_k}^2(s_h^k, a_h^k) \leq \sigma^2 + 4\beta_K^* \mathrm{dim}_{E}\left(\mathcal{F}, \sigma^2 T^{-1/2}\right) (1 + \log T).
\end{align*}
\end{lemma}

Finally, we obtain the following regret bound by bounding the regret of each bad episode by $H$ and bounding the regret of the good episodes with Lemma~\ref{lem:width_sum_bound_good}.

\begin{theorem}[Improved Regret Bound]
\label{thm:regret_bound_UCB_improved}
Assume that Assumptions~\ref{assump:bounded_output} to~\ref{assump:bounded_eluder} hold. Take $\alpha = \sigma^2 T^{-1/2}$ and $\delta = 0.25$ in Lemma~\ref{lem:confidence_set}, which leads to
\begin{align*}
    \beta_k^* := 8\sigma^2 \log(4\mathcal{N}(\mathcal{F}, \sigma^2 T^{-1/2}, \|\cdot\|_2)) + H\sigma^2 T^{-1/2}(12C + \sqrt{8d\sigma^2 \log (16k^2 H)}).
\end{align*}
We have that
\begin{align*}
\textstyle
    \mathrm{Regret}(K) \leq \sqrt{H^2 T \left(\frac{8\beta_K^*}{\sigma^2} + 1\right)\mathrm{dim}_{E}(\mathcal{F}, \sigma^2 T^{-1/2})(1 + \log T)} + 0.5 H + H^2 \mathrm{dim}_{E}(\mathcal{F}, \sigma^2 T^{-1/2}).
\end{align*}
\end{theorem}
We remark that the definition of bad and good episodes is only used in the proof; no modification of the algorithm is needed. Notice that, as $\beta_k^* \propto \sigma^2$, our upper bound in Theorem~\ref{thm:regret_bound_UCB_improved} can only scale with $\sigma^{-1}$ through the log-covering number $\log(4\mathcal{N}(\mathcal{F}, \sigma^2 T^{-1/2}, \|\cdot\|_2))$ and the eluder dimension $\mathrm{dim}_{E}(\mathcal{F}, \sigma^2 T^{-1/2})$. When $\mathcal{F}$ is a linear function class, both terms should scale with $\mathrm{polylog}(\sigma)$, which matches the result of \citet{kakade2020information}.
}

\section{Bounds on the Complexity Term under Linear Realizability}
\label{sec:linear_case}
We provide the upper bound on the covering number and the eluder dimension of $\mathcal{F}$ when $\mathcal{F}:= \{\theta^\top \varphi: \theta\in\mathbb{R}^{d_{\varphi}\times d}, \|\theta\|_2 \leq W\}$ where $\varphi:\mathcal{S}\times\mathcal{A} \to \mathbb{R}^{d_{\varphi}}$ is some known feature map. We first make the following standard assumption:
\begin{assumption}[Bounded Feature]
\begin{align*}
    \|\varphi(s, a)\|_2 \leq B, \forall (s, a)\in\mathcal{S}\times\mathcal{A}.
\end{align*}
\end{assumption}
\subsection{Covering Number}
\begin{theorem}[Covering Number Bound]
We have that
\begin{align*}
    \mathcal{N}(\mathcal{F}, \epsilon, \|\cdot\|_2) \leq \left(1 + \frac{2BW}{\epsilon}\right)^{d_{\varphi}}.
\end{align*}
\end{theorem}
\begin{proof}
Notice that, by the Cauchy--Schwarz inequality, we have that
\begin{align*}
    \max_{(s, a)\in\mathcal{S}\times\mathcal{A}} \|\varepsilon_i^\top \varphi(s, a)\|_2 \leq B \|\varepsilon_i\|_2, \quad \forall \varepsilon_i\in\mathbb{R}^{d_{\varphi}}.
\end{align*}
Thus, denoting $\varepsilon = [\varepsilon_i]_{i\in [d]}$, we have that
\begin{align*}
    \max_{(s, a)\in\mathcal{S}\times\mathcal{A}} \|\varepsilon^\top \varphi(s, a)\|_2^2 = \max_{(s, a)\in\mathcal{S}\times\mathcal{A}} \sum_{i\in [d]}\|\varepsilon_i^\top \varphi(s, a)\|_2^2\leq B^2 \sum_{i\in [d]}\|\varepsilon_i\|_2^2 = B^2 \|\varepsilon\|_2^2.
\end{align*}
Hence, to find an $\epsilon$-cover for $\mathcal{F}$, we just need to find an $\epsilon/B$-cover of $\{\theta:\theta\in\mathbb{R}^{d_{\varphi}\times d}, \|\theta\|_2\leq W\}$. By a standard argument on the covering number of Euclidean balls (e.g., Lemma 5.7 in \citet{wainwright2019high}), we conclude the desired result.
\end{proof}
\subsection{Eluder Dimension}
\begin{theorem}[Eluder Dimension Bound]
We have that
\begin{align*}
    \mathrm{dim}_{E}(\mathcal{F}, \epsilon) \leq \frac{3d_{\varphi} e}{e-1}\log \left(3 + \frac{12W^2 B^2}{\epsilon^2}\right) + 1.
\end{align*}
\end{theorem}
\begin{proof}
Our proof follows the idea of \citet{russo2013eluder}. Define
\begin{align*}
    w_k := \sup\left\{(\theta_1 - \theta_2)^\top \varphi(s, a):\sqrt{\sum_{i\in[k-1]}\left((\theta_1 - \theta_2)^\top \varphi(s_i, a_i)\right)^2 }\leq \epsilon^\prime, \theta_1, \theta_2 \in \mathbb{R}^{d_{\varphi}\times d}, \|\theta_1\|\leq W, \|\theta_2\|\leq W\right\}.
\end{align*}
For notational simplicity, define $\varphi_i := \varphi(s_i, a_i)$, $\theta := \theta_1 - \theta_2$, and $\Phi_k := \sum_{i\in[k-1]}\varphi_i\varphi_i^\top$. Obviously, we have that $\|\theta\| \leq 2W$. Moreover, by a straightforward calculation, we know
\begin{align*}
    \sum_{i\in [k-1]}\left((\theta_1 - \theta_2)^\top \varphi(s_i, a_i)\right)^2 = \mathrm{Trace}(\theta^\top \Phi_k \theta).
\end{align*}
Define $V_k:= \Phi_k + \frac{(\epsilon^\prime)^2}{4W^2} I$. We start by considering the problem
\begin{align*}
    \max_{\theta} \mathrm{Trace}(\theta^\top \varphi_k \varphi_k^\top \theta), \quad \text{subject to}\quad \mathrm{Trace}(\theta^\top V_k  \theta) \leq 2(\epsilon^\prime)^2.
\end{align*}
The Lagrangian can be formed as
\begin{align*}
    \mathcal{L}(\theta, \lambda) = - \mathrm{Trace}(\theta^\top \varphi_k \varphi_k^\top \theta) + \lambda (\mathrm{Trace}(\theta^\top V_k\theta) - 2(\epsilon^\prime)^2), \quad \lambda \geq 0.
\end{align*}
The optimality condition of $\theta$ is
\begin{align*}
    (\lambda V_k - \varphi_k\varphi_k^\top) \theta = 0.
\end{align*}
As $V_k$ is of full rank, $\lambda V_k - \varphi_k\varphi_k^\top$ has rank at least $d_{\varphi} - 1$ (as $\varphi_k \varphi_k^\top$ is of rank $1$). So the equation
\begin{align*}
     (\lambda V_k - \varphi_k\varphi_k^\top) \theta_i = 0, \quad \theta_i \in \mathbb{R}^{d_{\varphi}}
\end{align*}
has at most one non-zero solution up to scaling. Substituting back, we know that (define $\|x\|_{A} := \sqrt{x^\top A x}$):
\begin{align*}
    \sup\{\mathrm{Trace}(\theta^\top \varphi_k \varphi_k^\top \theta):  \mathrm{Trace}(\theta^\top V_k  \theta) \leq 2(\epsilon^\prime)^2\} = 2(\epsilon^\prime)^2 \|\varphi_k\|_{V_k^{-1}}^2.
\end{align*}
With the conclusion above, we have that
\begin{align*}
    w_k \leq \sup\{\theta^\top \varphi_k: \mathrm{Trace}(\theta^\top \Phi_k  \theta) \leq (\epsilon^\prime)^2, \|\theta\|\leq 2W\}\leq \sup\{\theta^\top \varphi_k:  \mathrm{Trace}(\theta^\top V_k  \theta) \leq 2(\epsilon^\prime)^2\} = \sqrt{2}\epsilon^\prime \|\varphi_k\|_{V_k^{-1}}.
\end{align*}
Hence, if $w_{k}\geq \epsilon^\prime$, then $\varphi_k^\top V_k^{-1}\varphi_k \geq 0.5$. Moreover, by the matrix determinant lemma, if $w_i\geq \epsilon^\prime$, $\forall i<k$, we have
\begin{align*}
    \mathrm{det}(V_k) = \mathrm{det}(V_{k-1})(1 + \varphi_{k-1}^\top V_{k-1}^{-1} \varphi_{k-1}) \geq  \mathrm{det}(V_{k-1})\left(\frac{3}{2}\right)\geq \cdots \geq  \mathrm{det}\left(\frac{(\epsilon^\prime)^2}{4W^2} I\right) \left(\frac{3}{2}\right)^{k-1} = \left(\frac{(\epsilon^\prime)^{2}}{4W^{2}}\right)^{d}  \left(\frac{3}{2}\right)^{k-1}.
\end{align*}
Meanwhile,
\begin{align*}
    \mathrm{det}(V_k) \leq \left(\frac{\mathrm{Trace}(V_k)}{d}\right)^d \leq \left(\frac{B^2(k-1)}{d} + \frac{(\epsilon^\prime)^2}{4W^2}\right)^d.
\end{align*}
Hence, we know
\begin{align*}
    \left(\frac{3}{2}\right)^{(k-1)/d} \leq \frac{4W^2B^2}{(\epsilon^\prime)^2}\cdot \frac{k-1}{d} + 1.
\end{align*}
Now we only need to find the largest $k$ for which this inequality holds. For notational simplicity, define $\alpha := \frac{4W^2B^2}{(\epsilon^\prime)^2}$ and $n := \frac{k-1}{d}$. As $\log(1+x) \geq \frac{x}{1+x}$ and $\log x \leq x/e$, we have
\begin{align*}
    \frac{n}{3}\leq n \log(3/2) \leq \log(\alpha + 1) + \log n \leq \log(\alpha + 1) + \log 3 + \log (n/3)\leq \log(\alpha + 1) + \log 3 + \frac{n}{3e}.
\end{align*}
Substituting back, we obtain the desired result.
\end{proof}

\newpage
\section{Experimental Details}
\label{sec:exp_details}
\subsection{Algorithm Summary}
Our algorithm is built on top of SAC. The only difference is that we decouple the critic network into a representation network $\phi(\cdot)$ and a linear layer $l(\cdot)$ on top of the representation. The representation network is trained with the model dynamics loss in \algabb, and we train a linear layer to predict the $Q$-value, since our analysis guarantees that it lies in the linear span of the representation. We update the representation with a momentum factor and keep the policy update procedure the same as in SAC.
\subsection{Full Experiments}\label{appendix:full_exp}
\begin{table*}[h]
\caption{Performance of \algabb on various MuJoCo control suite tasks. Our method achieves strong performance even compared to pure empirical baselines. Specifically, in hard tasks like Humanoid-ET and Ant-ET, \algabb outperforms the baselines significantly. Results with $^*$ are directly adopted from MBBL~\citep{wang2019benchmarking}. We also provide the SoTA model-free RL method SAC as a reference.}
\vspace{0.3em}
\footnotesize
\setlength\tabcolsep{3.5pt}
\label{tab:MuJoCo_results_full}
\centering
\begin{tabular}{p{2.5cm}p{2.5cm}p{2.5cm}p{2.5cm}p{2.5cm}p{2.5cm}p{2cm}}
% {lcccccccccccc}
\toprule
& Swimmer & Ant-ET & Hopper-ET & Pendulum \\ 
\midrule  
ME-TRPO$^*$ & 30.1$\pm$9.7 & 42.6$\pm$21.1 & 4.9$\pm$4.0 & 177.3$\pm$1.9\\
PETS-RS$^*$  & 42.1$\pm$20.2 & 130.0$\pm$148.1 & 205.8$\pm$36.5 & 167.9$\pm$35.8\\
PETS-CEM$^*$  & 22.1$\pm$25.2 & 81.6$\pm$145.8 & 129.3$\pm$36.0 & 167.4$\pm$53.0\\
DeepSF & 25.5$\pm$13.5 & 768.1$\pm$44.1 & 548.9$\pm$253.3 & 168.6$\pm$5.1 \\
{\bf \algabb} & 42.6$\pm$4.2 & 806.2$\pm$60.2 & 732.2$\pm$263.9 & 169.5$\pm$0.6 \\
\midrule
SAC$^*$  & 41.2$\pm$4.6 & 2012.7$\pm$571.3 & 1815.5$\pm$655.1 & 168.2$\pm$9.5 \\
\bottomrule 
\end{tabular}
\centering
\begin{tabular}{p{2.5cm}p{2.5cm}p{2.5cm}p{2.5cm}p{2.5cm}p{2.5cm}p{2cm}}
\toprule
& Reacher & Cartpole & I-pendulum & Walker-ET \\ 
\midrule  
ME-TRPO$^*$ & -13.4$\pm$5.2 & 160.1$\pm$69.1 & -126.2$\pm$86.6 & -9.5$\pm$4.6\\
PETS-RS$^*$ & -40.1$\pm$6.9 &  195.0$\pm$28.0 & -12.1$\pm$25.1 & -0.8$\pm$3.2 \\
PETS-CEM$^*$ & -12.3$\pm$5.2 &  199.5$\pm$3.0 & -20.5$\pm$28.9 & -2.5$\pm$6.8 \\
DeepSF & -16.8$\pm$3.6 & 194.5$\pm$5.8 & -0.2$\pm$0.3 & 165.6$\pm$127.9\\
{\bf \algabb} & -7.2$\pm$1.1 & 138.2$\pm$39.5 & 0.0$\pm$0.0 & 501.6$\pm$204.0  \\
\midrule
SAC$^*$ & -6.4$\pm$0.5 & 199.4$\pm$0.4 & -0.2$\pm$0.1 & 2216.4$\pm$678.7\\
\bottomrule 
\end{tabular}
\centering
\begin{tabular}{p{2.5cm}p{2.5cm}p{2.5cm}p{2.5cm}p{2.5cm}p{2.5cm}p{2cm}}
\toprule
& MountainCar & Acrobot & SlimHumanoid-ET & Humanoid-ET \\ 
\midrule  
ME-TRPO$^*$  & -42.5$\pm$26.6 & 68.1$\pm$6.7 & 76.1$\pm$8.8 & 776.8$\pm$62.9\\
PETS-RS$^*$ & -78.5$\pm$2.1 & -71.5$\pm$44.6 & 320.7$\pm$182.2 & 106.9$\pm$102.6\\
PETS-CEM$^*$  & -57.9$\pm$3.6 & 12.5$\pm$29.0 & 355.1$\pm$157.1 & 110.8$\pm$91.0 \\
DeepSF & -17.0$\pm$23.4 & -74.4$\pm$3.2 & 533.8$\pm$154.9 & 241.1$\pm$116.6 \\
{\bf \algabb}  & 50.3$\pm$1.1 & -69.0$\pm$3.3 & 986.4$\pm$154.7 & 886.9$\pm$95.2 \\
\midrule
SAC$^*$  & 52.6$\pm$0.6 & -52.9$\pm$2.0 & 843.6$\pm$313.1 & 1794.4$\pm$458.3  \\
\bottomrule 
\end{tabular}
\end{table*}
\newpage

\subsection{Ablations}\label{appendix:ablations}
\begin{table*}[h]
\caption{Ablation study of \algabb on MuJoCo tasks. We see that a large momentum factor helps stabilize the performance, especially in environments like Humanoid-ET and Hopper-ET.}
\footnotesize
\setlength\tabcolsep{3.5pt}
\label{tab:ablation}
\centering
\begin{tabular}{p{2.5cm}p{2.5cm}p{2.5cm}p{2.5cm}p{2.5cm}p{2cm}p{2cm}}
\toprule
& Hopper-ET & Ant-ET & S-Humanoid-ET & Humanoid-ET \\ 
\midrule  
\algabb-0.9  & 593.2$\pm$37.4 & \textbf{877.7$\pm$45.9} & 881.6$\pm$385.2 & 232.9$\pm$63.4 \\
\algabb-0.99 & 305.9$\pm$13.4 & 707.9$\pm$51.1 & 629.3$\pm$106.9 & 818.1$\pm$130.6 \\
\algabb-0.999 & \textbf{732.2$\pm$263.9} & 806.2$\pm$60.2 & \textbf{986.4$\pm$154.7} & \textbf{886.9$\pm$95.2} \\
\bottomrule 
\end{tabular}
\end{table*}
\paragraph{Momentum Update} Our ablation experiments study an important design choice of the practical algorithm: the momentum used to update the critic function. We summarize the results in Table~\ref{tab:ablation}. We can see that using a large momentum factor such as 0.999 yields better performance. This is intuitive: a large momentum factor slows down the update of the representation of the critic function and thus stabilizes training. This phenomenon illustrates the importance of slowly updating the representation.

\begin{wrapfigure}[11]{R}{0.4\textwidth}
\centering
\vspace{-9mm}
\includegraphics[width=0.9\linewidth]{figures/random_dim.png}
\vspace{-2mm}
\caption{\small %\textbf{RFF in HalfCheetah.} 
Increasing the number of random features can also lead to a performance gain.}
\label{fig:random_dim}
\end{wrapfigure}

\paragraph{Random Feature Dimension} We also conduct experiments on how the random feature dimension affects the final performance of the algorithm. We plot the results in the HalfCheetah environment in Fig.~\ref{fig:random_dim}. We can see that increasing the random feature dimension leads to a performance gain in the final return. This suggests that using a larger random feature dimension helps the performance.

\paragraph{MLP Network for Critic Network} We also conduct an experiment to study whether adding an MLP network on top of our representation could work. We show this ablation in Tab.~\ref{tab:mlp}. From the results, we see that the performance of the MLP network is generally better than that of the linear network.

\begin{table*}[h]
\centering
{\color{black} 
\caption{Comparison of \algabb with a linear critic network and with an MLP critic network. Results show that, in general, the MLP network further improves the performance.}
\footnotesize
\setlength\tabcolsep{3.5pt}
\label{tab:mlp}
\centering
\begin{tabular}{p{2.5cm}p{2.5cm}p{2.5cm}p{2.5cm}p{2.5cm}p{2cm}p{2cm}}
\toprule
& Reacher & MountainCar & Cartpole & Acrobot\\ 
\midrule  
\algabb-Linear  & -7.2$\pm$1.1 & 50.3$\pm$1.1 & 138.2$\pm$39.5 & -69.0$\pm$3.3\\
\algabb-MLP & -6.8$\pm$0.4 & \textbf{53.8}$\pm$\textbf{1.1} & 171.9$\pm$31.0 & \textbf{-15.6}$\pm$\textbf{1.9}\\
SAC & \textbf{-6.4}$\pm$\textbf{0.5} & 52.6$\pm$0.6 & \textbf{199.4}$\pm$\textbf{0.4} & -52.9$\pm$2.0\\
\bottomrule 
\end{tabular}}

\setlength\tabcolsep{3.5pt}
\begin{tabular}{p{2.5cm}p{2.5cm}p{2.5cm}p{2.5cm}p{2.5cm}p{2cm}p{2cm}}
\toprule
& Pendulum & I-Pendulum & Walker-ET & S-Humanoid-ET\\ 
\midrule  
\algabb-Linear  & \textbf{169.5}$\pm$\textbf{0.6} & \textbf{0.0}$\pm$\textbf{0.0} & 501.6$\pm$204.0 & 986.4$\pm$154.7\\
\algabb-MLP & 165.9$\pm$4.2 & \textbf{0.0}$\pm$\textbf{0.0} & 1005.7$\pm$458.4 & \textbf{2521.1}$\pm$\textbf{420.8}\\
SAC & 168.2$\pm$9.5 & -0.2$\pm$0.1 & \textbf{2216.4}$\pm$\textbf{678.7} & 843.6$\pm$313.1\\
\bottomrule 
\end{tabular}
\end{table*}

{\color{black} 
\subsection{Comparison to LC3}\label{appendix:lc3}
We provide a comparison of empirical results with LC3~\citep{kakade2020information}, which is also an algorithm with rigorous theoretical guarantees. Despite the major difference that we are learning the representation while LC3 assumes a given feature, the performance of \algabb is much better than LC3 in tasks like Mountain Car and Hopper.}

\begin{table*}[h]
\centering
{\color{black} 
\caption{Comparison of \algabb with LC3 on MuJoCo tasks. LC3 only achieves good performance on relatively easy tasks like Reacher. However, its performance on Hopper and MountainCar is much worse than that of \algabb.}
\footnotesize
\setlength\tabcolsep{3.5pt}
\label{tab:lc3}
\centering
\begin{tabular}{p{2.5cm}p{2.5cm}p{2.5cm}p{2.5cm}p{2.5cm}p{2cm}p{2cm}}
\toprule
& Reacher & MountainCar & Hopper \\ 
\midrule  
\algabb  & -7.2$\pm$1.1 & \textbf{50.3}$\pm$\textbf{1.1} & \textbf{732.2}$\pm$\textbf{263.9} \\
LC3 & \textbf{-4.1}$\pm$\textbf{1.6} & 27.3$\pm$8.1 & -1016.5$\pm$607.4 \\
\bottomrule 
\end{tabular}}
\end{table*}

\newpage
\subsection{Performance Curves}
We provide an additional performance curve including ME-TRPO in Figure~\ref{fig:MuJoCo1} for reference.
\begin{figure*}[h]
    \centering
    \includegraphics[width=0.99\textwidth]{figures/MuJoCo_result1.pdf}
    \caption{\footnotesize \textbf{Experiments on MuJoCo:} We show curves of the return versus the training steps for \algabb and model-based RL baselines. We also include the final performance of ME-TRPO from~\citet{wang2019benchmarking} for reference.}
    \label{fig:MuJoCo1}
\end{figure*}

\subsection{Hyperparameters}
\label{appendix:hyperparam}
We summarize the hyperparameters used in our experiments below.
\begin{table*}[h]
\caption{Hyperparameters used for \algabb in all the environments in MuJoCo.}
\footnotesize
\setlength\tabcolsep{3.5pt}
\label{tab:hyper}
\centering
\begin{tabular}{p{5cm}p{3cm}p{5cm}p{2.5cm}p{2.5cm}p{2cm}p{2cm}}
\toprule
& Hyperparameter Value \\ 
\midrule
Actor lr & 0.0003 \\
Model lr & 0.0001 \\
Actor Network Size & (1024, 1024, 1024) \\
Fourier Feature Size & 1024 \\
Discount & 0.99\\
Target Update Tau & 0.005 \\
Model Update Tau & 0.001 \\
Batch Size & 256 \\
\bottomrule 
\end{tabular}
\end{table*}

\bibliography{ren_338}

\end{document}
