\documentclass[accepted]{uai2023} 

\usepackage[american]{babel}


\usepackage{natbib} \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} \usepackage{booktabs} \usepackage{tikz} \usepackage{xfrac}


\usepackage{amsmath,amsfonts,bm,amssymb,amsthm}

\usepackage{xr}
\makeatletter
% \addFileDependency{<file>}: write "(<file>)" to the log, pass <file> to the
% kernel's \@addtofilelist, and report "No file <file>." when it does not exist.
% NOTE(review): presumably this exists so build tools that scan the log pick up
% the external document as a dependency - confirm.
% Every line ends in % so that no space tokens end up in the macro body.
\newcommand*{\addFileDependency}[1]{%
  \typeout{(#1)}%
  \@addtofilelist{#1}%
  \IfFileExists{#1}{}{\typeout{No file #1.}}%
}
\makeatother

% \myexternaldocument{<basename>}: call \externaldocument on <basename> and
% register <basename>.tex and <basename>.aux through \addFileDependency.
\newcommand*{\myexternaldocument}[1]{%
\externaldocument{#1}%
\addFileDependency{#1.tex}%
\addFileDependency{#1.aux}%
}
% NOTE(review): presumably rothfuss_499 is the main paper whose labels this
% supplement references - confirm.
\myexternaldocument{rothfuss_499}

% Define the accent colour hanblue and use it for link, URL and citation
% colours; anchors are set to black.
% NOTE(review): xcolor and hyperref are not loaded explicitly in this preamble -
% presumably the class or tikz provides them; confirm.
% NOTE(review): both hidelinks and colorlinks=true are requested; verify which
% of the two takes effect.
\definecolor{hanblue}{HTML}{3F88C5}\hypersetup{
hidelinks,
    colorlinks=true,
    linkcolor=hanblue,
    urlcolor=hanblue,
    citecolor=hanblue,
    anchorcolor=black}
    
% Theorem-like environments. "theorem" is numbered within sections; every
% environment declared with [theorem] shares its counter.
\newtheorem{theorem}{Theorem}[section]
\newtheorem{assumption}[theorem]{Assumption}
\newtheorem{corollary}[theorem]{Corollary}
\newtheorem{definition}[theorem]{Definition}
\newtheorem{example}[theorem]{Example}
\newtheorem{lemma}[theorem]{Lemma}
\newtheorem{proposition}[theorem]{Proposition}
\newtheorem{remark}[theorem]{Remark}

% Separately counted variants whose numbers are printed as A1, A2, ...
\newtheorem{theoremA}{Theorem}
\renewcommand{\thetheoremA}{A\arabic{theoremA}}

\newtheorem{lemmaA}{Lemma}
\renewcommand{\thelemmaA}{A\arabic{lemmaA}}

% Emphasised position labels, e.g. "(Left)", for use in figure captions.
\newcommand{\figleft}{{\em (Left)}}
\newcommand{\figcenter}{{\em (Center)}}
\newcommand{\figright}{{\em (Right)}}
\newcommand{\figtop}{{\em (Top)}}
\newcommand{\figbottom}{{\em (Bottom)}}
% Emphasised sub-panel labels (a)-(d).
\newcommand{\captiona}{{\em (a)}}
\newcommand{\captionb}{{\em (b)}}
\newcommand{\captionc}{{\em (c)}}
\newcommand{\captiond}{{\em (d)}}

% \newterm{<text>}: typeset <text> in bold.
\newcommand{\newterm}[1]{{\bf #1}}

% \printfnsymbol{<n>}: print the <n>-th footnote symbol as a superscript without
% creating a footnote. The author list uses it to repeat the first author's
% \thanks mark on the other authors.
\makeatletter
\newcommand{\printfnsymbol}[1]{\textsuperscript{\@fnsymbol{#1}}}
\makeatother


% Cross-reference helpers: each prints the object name followed by \ref, in
% lower-case and capitalised variants. A tie (~) keeps the name and the number
% on the same line.
\def\figref#1{figure~\ref{#1}}
\def\Figref#1{Figure~\ref{#1}}
\def\twofigref#1#2{figures~\ref{#1} and~\ref{#2}}
\def\quadfigref#1#2#3#4{figures~\ref{#1}, \ref{#2}, \ref{#3} and~\ref{#4}}
\def\secref#1{section~\ref{#1}}
\def\Secref#1{Section~\ref{#1}}
\def\twosecrefs#1#2{sections~\ref{#1} and~\ref{#2}}
\def\secrefs#1#2#3{sections~\ref{#1}, \ref{#2} and~\ref{#3}}
% NOTE(review): this replaces amsmath's \eqref, which prints the number in
% parentheses - confirm the override is intended.
\def\eqref#1{equation~\ref{#1}}
\def\Eqref#1{Equation~\ref{#1}}
% \plaineqref prints the bare number only.
\def\plaineqref#1{\ref{#1}}
\def\chapref#1{chapter~\ref{#1}}
\def\Chapref#1{Chapter~\ref{#1}}
% The tie after "chapters" was missing, which printed "chapters1--2".
\def\rangechapref#1#2{chapters~\ref{#1}--\ref{#2}}
\def\algref#1{algorithm~\ref{#1}}
\def\Algref#1{Algorithm~\ref{#1}}
\def\twoalgref#1#2{algorithms~\ref{#1} and~\ref{#2}}
\def\Twoalgref#1#2{Algorithms~\ref{#1} and~\ref{#2}}
\def\partref#1{part~\ref{#1}}
\def\Partref#1{Part~\ref{#1}}
\def\twopartref#1#2{parts~\ref{#1} and~\ref{#2}}

% \ceil{x} and \floor{x} wrap x in ceiling / floor brackets; \1 is a bold 1.
\def\ceil#1{\lceil #1 \rceil}
\def\floor#1{\lfloor #1 \rfloor}
\def\1{\bm{1}}
% Calligraphic D, plain and with an upright "valid" / "test" subscript.
% The subscript sits outside \mathcal so the alphabet applies to D only,
% matching \train.
\newcommand{\train}{\mathcal{D}}
\newcommand{\valid}{\mathcal{D}_{\mathrm{valid}}}
\newcommand{\test}{\mathcal{D}_{\mathrm{test}}}

% Shorthand for \epsilon.
\def\eps{{\epsilon}}


% \reta and \r<letter>: the symbol set in \textnormal, for every lowercase
% letter a-z except m (no \rm is defined here).
% NOTE(review): presumably the random-variable notation of the common ML
% math-commands template - confirm.
% NOTE(review): \def overwrites silently, so short names such as \rq may shadow
% existing macros - verify none of them is needed with its original meaning.
\def\reta{{\textnormal{$\eta$}}}
\def\ra{{\textnormal{a}}}
\def\rb{{\textnormal{b}}}
\def\rc{{\textnormal{c}}}
\def\rd{{\textnormal{d}}}
\def\re{{\textnormal{e}}}
\def\rf{{\textnormal{f}}}
\def\rg{{\textnormal{g}}}
\def\rh{{\textnormal{h}}}
\def\ri{{\textnormal{i}}}
\def\rj{{\textnormal{j}}}
\def\rk{{\textnormal{k}}}
\def\rl{{\textnormal{l}}}
\def\rn{{\textnormal{n}}}
\def\ro{{\textnormal{o}}}
\def\rp{{\textnormal{p}}}
\def\rq{{\textnormal{q}}}
\def\rr{{\textnormal{r}}}
\def\rs{{\textnormal{s}}}
\def\rt{{\textnormal{t}}}
\def\ru{{\textnormal{u}}}
\def\rv{{\textnormal{v}}}
\def\rw{{\textnormal{w}}}
\def\rx{{\textnormal{x}}}
\def\ry{{\textnormal{y}}}
\def\rz{{\textnormal{z}}}

% \rvepsilon, \rvtheta and \rv<letter> for a-z: the symbol wrapped in \mathbf.
% NOTE(review): \mathbf typically does not embolden lowercase Greek - confirm
% that \rvepsilon and \rvtheta render as intended.
\def\rvepsilon{{\mathbf{\epsilon}}}
\def\rvtheta{{\mathbf{\theta}}}
\def\rva{{\mathbf{a}}}
\def\rvb{{\mathbf{b}}}
\def\rvc{{\mathbf{c}}}
\def\rvd{{\mathbf{d}}}
\def\rve{{\mathbf{e}}}
\def\rvf{{\mathbf{f}}}
\def\rvg{{\mathbf{g}}}
\def\rvh{{\mathbf{h}}}
% This entry used to define \rvu as bold i; \rvu is defined again further down,
% so that definition was dead and \rvi was missing from the family.
\def\rvi{{\mathbf{i}}}
\def\rvj{{\mathbf{j}}}
\def\rvk{{\mathbf{k}}}
\def\rvl{{\mathbf{l}}}
\def\rvm{{\mathbf{m}}}
\def\rvn{{\mathbf{n}}}
\def\rvo{{\mathbf{o}}}
\def\rvp{{\mathbf{p}}}
\def\rvq{{\mathbf{q}}}
\def\rvr{{\mathbf{r}}}
\def\rvs{{\mathbf{s}}}
\def\rvt{{\mathbf{t}}}
\def\rvu{{\mathbf{u}}}
\def\rvv{{\mathbf{v}}}
\def\rvw{{\mathbf{w}}}
\def\rvx{{\mathbf{x}}}
\def\rvy{{\mathbf{y}}}
\def\rvz{{\mathbf{z}}}

% \erv<letter> for a-z: the lowercase letter set in \textnormal.
% NOTE(review): presumably "element of a random vector" in the ML notation
% template this list appears to come from - confirm.
\def\erva{{\textnormal{a}}}
\def\ervb{{\textnormal{b}}}
\def\ervc{{\textnormal{c}}}
\def\ervd{{\textnormal{d}}}
\def\erve{{\textnormal{e}}}
\def\ervf{{\textnormal{f}}}
\def\ervg{{\textnormal{g}}}
\def\ervh{{\textnormal{h}}}
\def\ervi{{\textnormal{i}}}
\def\ervj{{\textnormal{j}}}
\def\ervk{{\textnormal{k}}}
\def\ervl{{\textnormal{l}}}
\def\ervm{{\textnormal{m}}}
\def\ervn{{\textnormal{n}}}
\def\ervo{{\textnormal{o}}}
\def\ervp{{\textnormal{p}}}
\def\ervq{{\textnormal{q}}}
\def\ervr{{\textnormal{r}}}
\def\ervs{{\textnormal{s}}}
\def\ervt{{\textnormal{t}}}
\def\ervu{{\textnormal{u}}}
\def\ervv{{\textnormal{v}}}
\def\ervw{{\textnormal{w}}}
\def\ervx{{\textnormal{x}}}
\def\ervy{{\textnormal{y}}}
\def\ervz{{\textnormal{z}}}

% \rm<Letter> for A-Z: the uppercase letter set in \mathbf.
% NOTE(review): presumably random matrices in the ML notation template -
% confirm.
\def\rmA{{\mathbf{A}}}
\def\rmB{{\mathbf{B}}}
\def\rmC{{\mathbf{C}}}
\def\rmD{{\mathbf{D}}}
\def\rmE{{\mathbf{E}}}
\def\rmF{{\mathbf{F}}}
\def\rmG{{\mathbf{G}}}
\def\rmH{{\mathbf{H}}}
\def\rmI{{\mathbf{I}}}
\def\rmJ{{\mathbf{J}}}
\def\rmK{{\mathbf{K}}}
\def\rmL{{\mathbf{L}}}
\def\rmM{{\mathbf{M}}}
\def\rmN{{\mathbf{N}}}
\def\rmO{{\mathbf{O}}}
\def\rmP{{\mathbf{P}}}
\def\rmQ{{\mathbf{Q}}}
\def\rmR{{\mathbf{R}}}
\def\rmS{{\mathbf{S}}}
\def\rmT{{\mathbf{T}}}
\def\rmU{{\mathbf{U}}}
\def\rmV{{\mathbf{V}}}
\def\rmW{{\mathbf{W}}}
\def\rmX{{\mathbf{X}}}
\def\rmY{{\mathbf{Y}}}
\def\rmZ{{\mathbf{Z}}}

% \erm<Letter> for A-Z: the uppercase letter set in \textnormal.
% NOTE(review): presumably "element of a random matrix" in the ML notation
% template - confirm.
\def\ermA{{\textnormal{A}}}
\def\ermB{{\textnormal{B}}}
\def\ermC{{\textnormal{C}}}
\def\ermD{{\textnormal{D}}}
\def\ermE{{\textnormal{E}}}
\def\ermF{{\textnormal{F}}}
\def\ermG{{\textnormal{G}}}
\def\ermH{{\textnormal{H}}}
\def\ermI{{\textnormal{I}}}
\def\ermJ{{\textnormal{J}}}
\def\ermK{{\textnormal{K}}}
\def\ermL{{\textnormal{L}}}
\def\ermM{{\textnormal{M}}}
\def\ermN{{\textnormal{N}}}
\def\ermO{{\textnormal{O}}}
\def\ermP{{\textnormal{P}}}
\def\ermQ{{\textnormal{Q}}}
\def\ermR{{\textnormal{R}}}
\def\ermS{{\textnormal{S}}}
\def\ermT{{\textnormal{T}}}
\def\ermU{{\textnormal{U}}}
\def\ermV{{\textnormal{V}}}
\def\ermW{{\textnormal{W}}}
\def\ermX{{\textnormal{X}}}
\def\ermY{{\textnormal{Y}}}
\def\ermZ{{\textnormal{Z}}}

% Bold (\bm) symbols: \vzero, \vone, \vmu, \vtheta and \v<letter> for a-z.
% NOTE(review): presumably the vector notation of the ML template - confirm.
\def\vzero{{\bm{0}}}
\def\vone{{\bm{1}}}
\def\vmu{{\bm{\mu}}}
\def\vtheta{{\bm{\theta}}}
\def\va{{\bm{a}}}
\def\vb{{\bm{b}}}
\def\vc{{\bm{c}}}
\def\vd{{\bm{d}}}
\def\ve{{\bm{e}}}
\def\vf{{\bm{f}}}
\def\vg{{\bm{g}}}
\def\vh{{\bm{h}}}
\def\vi{{\bm{i}}}
\def\vj{{\bm{j}}}
\def\vk{{\bm{k}}}
\def\vl{{\bm{l}}}
\def\vm{{\bm{m}}}
\def\vn{{\bm{n}}}
\def\vo{{\bm{o}}}
\def\vp{{\bm{p}}}
\def\vq{{\bm{q}}}
\def\vr{{\bm{r}}}
\def\vs{{\bm{s}}}
\def\vt{{\bm{t}}}
\def\vu{{\bm{u}}}
\def\vv{{\bm{v}}}
\def\vw{{\bm{w}}}
\def\vx{{\bm{x}}}
\def\vy{{\bm{y}}}
\def\vz{{\bm{z}}}

% \ev<name>: the plain, unstyled symbol for a few Greek letters and for a-z.
% NOTE(review): presumably "element of a vector" in the ML template - confirm.
\def\evalpha{{\alpha}}
\def\evbeta{{\beta}}
\def\evepsilon{{\epsilon}}
\def\evlambda{{\lambda}}
\def\evomega{{\omega}}
\def\evmu{{\mu}}
\def\evpsi{{\psi}}
\def\evsigma{{\sigma}}
\def\evtheta{{\theta}}
\def\eva{{a}}
\def\evb{{b}}
\def\evc{{c}}
\def\evd{{d}}
\def\eve{{e}}
\def\evf{{f}}
\def\evg{{g}}
\def\evh{{h}}
\def\evi{{i}}
\def\evj{{j}}
\def\evk{{k}}
\def\evl{{l}}
\def\evm{{m}}
\def\evn{{n}}
\def\evo{{o}}
\def\evp{{p}}
\def\evq{{q}}
\def\evr{{r}}
\def\evs{{s}}
\def\evt{{t}}
\def\evu{{u}}
\def\evv{{v}}
\def\evw{{w}}
\def\evx{{x}}
\def\evy{{y}}
\def\evz{{z}}

% Bold (\bm) uppercase symbols: \m<Letter> for A-Z plus \mBeta, \mPhi,
% \mLambda and \mSigma.
% NOTE(review): presumably the matrix notation of the ML template - confirm.
\def\mA{{\bm{A}}}
\def\mB{{\bm{B}}}
\def\mC{{\bm{C}}}
\def\mD{{\bm{D}}}
\def\mE{{\bm{E}}}
\def\mF{{\bm{F}}}
\def\mG{{\bm{G}}}
\def\mH{{\bm{H}}}
\def\mI{{\bm{I}}}
\def\mJ{{\bm{J}}}
\def\mK{{\bm{K}}}
\def\mL{{\bm{L}}}
\def\mM{{\bm{M}}}
\def\mN{{\bm{N}}}
\def\mO{{\bm{O}}}
\def\mP{{\bm{P}}}
\def\mQ{{\bm{Q}}}
\def\mR{{\bm{R}}}
\def\mS{{\bm{S}}}
\def\mT{{\bm{T}}}
\def\mU{{\bm{U}}}
\def\mV{{\bm{V}}}
\def\mW{{\bm{W}}}
\def\mX{{\bm{X}}}
\def\mY{{\bm{Y}}}
\def\mZ{{\bm{Z}}}
\def\mBeta{{\bm{\beta}}}
\def\mPhi{{\bm{\Phi}}}
\def\mLambda{{\bm{\Lambda}}}
\def\mSigma{{\bm{\Sigma}}}

% \mathsfit: math alphabet using the default sans-serif family in medium
% series and slanted shape; its bold math version uses series bx, shape n.
\DeclareMathAlphabet{\mathsfit}{\encodingdefault}{\sfdefault}{m}{sl}
\SetMathAlphabet{\mathsfit}{bold}{\encodingdefault}{\sfdefault}{bx}{n}
% \tens{<X>}: <X> in \mathsfit, wrapped in \bm. \t<Letter> applies it to A-Z.
\newcommand{\tens}[1]{\bm{\mathsfit{#1}}}
\def\tA{{\tens{A}}}
\def\tB{{\tens{B}}}
\def\tC{{\tens{C}}}
\def\tD{{\tens{D}}}
\def\tE{{\tens{E}}}
\def\tF{{\tens{F}}}
\def\tG{{\tens{G}}}
\def\tH{{\tens{H}}}
\def\tI{{\tens{I}}}
\def\tJ{{\tens{J}}}
\def\tK{{\tens{K}}}
\def\tL{{\tens{L}}}
\def\tM{{\tens{M}}}
\def\tN{{\tens{N}}}
\def\tO{{\tens{O}}}
\def\tP{{\tens{P}}}
\def\tQ{{\tens{Q}}}
\def\tR{{\tens{R}}}
\def\tS{{\tens{S}}}
\def\tT{{\tens{T}}}
\def\tU{{\tens{U}}}
\def\tV{{\tens{V}}}
\def\tW{{\tens{W}}}
\def\tX{{\tens{X}}}
\def\tY{{\tens{Y}}}
\def\tZ{{\tens{Z}}}


% \g<Letter> for A-Z: the calligraphic (\mathcal) letter.
\def\gA{{\mathcal{A}}}
\def\gB{{\mathcal{B}}}
\def\gC{{\mathcal{C}}}
\def\gD{{\mathcal{D}}}
\def\gE{{\mathcal{E}}}
\def\gF{{\mathcal{F}}}
\def\gG{{\mathcal{G}}}
\def\gH{{\mathcal{H}}}
\def\gI{{\mathcal{I}}}
\def\gJ{{\mathcal{J}}}
\def\gK{{\mathcal{K}}}
\def\gL{{\mathcal{L}}}
\def\gM{{\mathcal{M}}}
\def\gN{{\mathcal{N}}}
\def\gO{{\mathcal{O}}}
\def\gP{{\mathcal{P}}}
\def\gQ{{\mathcal{Q}}}
\def\gR{{\mathcal{R}}}
\def\gS{{\mathcal{S}}}
\def\gT{{\mathcal{T}}}
\def\gU{{\mathcal{U}}}
\def\gV{{\mathcal{V}}}
\def\gW{{\mathcal{W}}}
\def\gX{{\mathcal{X}}}
\def\gY{{\mathcal{Y}}}
\def\gZ{{\mathcal{Z}}}

% \s<Letter>: the blackboard-bold (\mathbb) letter, for A-Z except E
% (no \sE is defined in this list).
\def\sA{{\mathbb{A}}}
\def\sB{{\mathbb{B}}}
\def\sC{{\mathbb{C}}}
\def\sD{{\mathbb{D}}}
\def\sF{{\mathbb{F}}}
\def\sG{{\mathbb{G}}}
\def\sH{{\mathbb{H}}}
\def\sI{{\mathbb{I}}}
\def\sJ{{\mathbb{J}}}
\def\sK{{\mathbb{K}}}
\def\sL{{\mathbb{L}}}
\def\sM{{\mathbb{M}}}
\def\sN{{\mathbb{N}}}
\def\sO{{\mathbb{O}}}
\def\sP{{\mathbb{P}}}
\def\sQ{{\mathbb{Q}}}
\def\sR{{\mathbb{R}}}
\def\sS{{\mathbb{S}}}
\def\sT{{\mathbb{T}}}
\def\sU{{\mathbb{U}}}
\def\sV{{\mathbb{V}}}
\def\sW{{\mathbb{W}}}
\def\sX{{\mathbb{X}}}
\def\sY{{\mathbb{Y}}}
\def\sZ{{\mathbb{Z}}}

% \em<Name>: the plain, unstyled uppercase symbol (A-Z, \Lambda, \Sigma).
% NOTE(review): presumably "entry of a matrix" in the ML template - confirm.
\def\emLambda{{\Lambda}}
\def\emA{{A}}
\def\emB{{B}}
\def\emC{{C}}
\def\emD{{D}}
\def\emE{{E}}
\def\emF{{F}}
\def\emG{{G}}
\def\emH{{H}}
\def\emI{{I}}
\def\emJ{{J}}
\def\emK{{K}}
\def\emL{{L}}
\def\emM{{M}}
\def\emN{{N}}
\def\emO{{O}}
\def\emP{{P}}
\def\emQ{{Q}}
\def\emR{{R}}
\def\emS{{S}}
\def\emT{{T}}
\def\emU{{U}}
\def\emV{{V}}
\def\emW{{W}}
\def\emX{{X}}
\def\emY{{Y}}
\def\emZ{{Z}}
\def\emSigma{{\Sigma}}

% \etens{<X>}: <X> in the \mathsfit alphabet (no \bm). \et<Name> applies it to
% \Lambda and to A-Z.
\newcommand{\etens}[1]{\mathsfit{#1}}
\def\etLambda{{\etens{\Lambda}}}
\def\etA{{\etens{A}}}
\def\etB{{\etens{B}}}
\def\etC{{\etens{C}}}
\def\etD{{\etens{D}}}
\def\etE{{\etens{E}}}
\def\etF{{\etens{F}}}
\def\etG{{\etens{G}}}
\def\etH{{\etens{H}}}
\def\etI{{\etens{I}}}
\def\etJ{{\etens{J}}}
\def\etK{{\etens{K}}}
\def\etL{{\etens{L}}}
\def\etM{{\etens{M}}}
\def\etN{{\etens{N}}}
\def\etO{{\etens{O}}}
\def\etP{{\etens{P}}}
\def\etQ{{\etens{Q}}}
\def\etR{{\etens{R}}}
\def\etS{{\etens{S}}}
\def\etT{{\etens{T}}}
\def\etU{{\etens{U}}}
\def\etV{{\etens{V}}}
\def\etW{{\etens{W}}}
\def\etX{{\etens{X}}}
\def\etY{{\etens{Y}}}
\def\etZ{{\etens{Z}}}

% Probability symbols p / P (optionally with hat or tilde) carrying an upright
% word subscript. \mathrm replaces the obsolete two-letter font switch \rm, so
% the macros no longer depend on the class providing \rm.
\newcommand{\pdata}{p_{\mathrm{data}}}
\newcommand{\ptrain}{\hat{p}_{\mathrm{data}}}
\newcommand{\Ptrain}{\hat{P}_{\mathrm{data}}}
\newcommand{\pmodel}{p_{\mathrm{model}}}
\newcommand{\Pmodel}{P_{\mathrm{model}}}
\newcommand{\ptildemodel}{\tilde{p}_{\mathrm{model}}}
\newcommand{\pencode}{p_{\mathrm{encoder}}}
\newcommand{\pdecode}{p_{\mathrm{decoder}}}
\newcommand{\precons}{p_{\mathrm{reconstruct}}}

% Upright name "Laplace".
\newcommand{\laplace}{\mathrm{Laplace}} 

% Single-symbol shorthands: blackboard-bold E and R, calligraphic L, and
% named aliases for individual Greek letters or upright words.
\newcommand{\E}{\mathbb{E}}
\newcommand{\Ls}{\mathcal{L}}
\newcommand{\R}{\mathbb{R}}
\newcommand{\emp}{\tilde{p}}
\newcommand{\lr}{\alpha}
\newcommand{\reg}{\lambda}
\newcommand{\rect}{\mathrm{rectifier}}
\newcommand{\softmax}{\mathrm{softmax}}
\newcommand{\sigmoid}{\sigma}
\newcommand{\softplus}{\zeta}
\newcommand{\KL}{D_{\mathrm{KL}}}
\newcommand{\Var}{\mathrm{Var}}
\newcommand{\standarderror}{\mathrm{SE}}
\newcommand{\Cov}{\mathrm{Cov}}
% L with a superscript 0, 1, 2, p or infinity.
\newcommand{\normlzero}{L^0}
\newcommand{\normlone}{L^1}
\newcommand{\normltwo}{L^2}
\newcommand{\normlp}{L^p}
\newcommand{\normmax}{L^\infty}

% Prints the letters "Pa" in the default math font.
\newcommand{\parents}{Pa} 

% arg max / arg min operators; the starred declaration places subscripts
% underneath in display style.
\DeclareMathOperator*{\argmax}{arg\,max}
\DeclareMathOperator*{\argmin}{arg\,min}

% Operators "sign" and "Tr" with ordinary subscript placement.
\DeclareMathOperator{\sign}{sign}
\DeclareMathOperator{\Tr}{Tr}
% \ab is an alias for \allowbreak.
\let\ab\allowbreak



% \rom{<n>}: the number <n> as an uppercase Roman numeral, set via \text.
\newcommand{\rom}[1]
    {\text{\MakeUppercase{\romannumeral #1}}}









% \norm{x}: double-bar norm with a subscript 2; \normAny{x}: the same without
% a subscript.
\newcommand{\norm}[1]{{\lVert#1\rVert}_2}
\newcommand{\normAny}[1]{{\lVert#1\rVert}}
% Bold symbols via \boldsymbol (letters a, r, s, x, y and several Greek letters).
\newcommand{\ba}{\boldsymbol{a}}
\newcommand{\br}{\boldsymbol{r}}
\newcommand{\bs}{\boldsymbol{s}}
\newcommand{\bx}{\boldsymbol{x}}
\newcommand{\by}{\boldsymbol{y}}
\newcommand{\beps}{\boldsymbol{\epsilon}}
\newcommand{\boldeta}{\boldsymbol{\eta}}
\newcommand{\bmu}{\boldsymbol{\mu}}
\newcommand{\bnu}{\boldsymbol{\nu}}
\newcommand{\bsigma}{\boldsymbol{\sigma}}
\newcommand{\bSigma}{\boldsymbol{\Sigma}}
\newcommand{\bome}{\boldsymbol{\omega}}
\newcommand{\btheta}{\boldsymbol{\theta}}


% Further bold shorthands; most use \mathbf, \bh uses \boldsymbol.
\newcommand{\bW}{\mathbf{W}}
\newcommand{\bA}{\mathbf{A}}
\newcommand{\bk}{\mathbf{k}}
\newcommand{\bX}{\mathbf{X}}
\newcommand{\bh}{\boldsymbol{h}}
% \obs{x} / \miss{x}: x with a superscript o / m.
\newcommand{\obs}[1]{#1^o}
\newcommand{\miss}[1]{#1^m}
\newcommand{\bz}{\mathbf{z}}
\newcommand{\bM}{\mathbf{M}}
\newcommand{\bbR}{\mathbb{R}}
\newcommand{\bff}{\mathbf{f}}
\newcommand{\bK}{\mathbf{K}}
\newcommand{\bI}{\mathbf{I}}
\newcommand{\bLamb}{\mathbf{\Lambda}}
\newcommand{\bB}{\mathbf{B}}
% Calligraphic shorthands.
\newcommand{\GP}{\mathcal{GP}}
\newcommand{\bigO}{\mathcal{O}}
\newcommand{\loss}{\mathcal{L}}
\newcommand{\normal}{\mathcal{N}}
% \given: a vertical bar with thin spaces on both sides.
\newcommand{\given}{\, \vert \,}
% \expVal{X}: E[X] with auto-sized brackets. \expvalue{p}{X} and \var{p}{X}
% place the first argument underneath the E / Var symbol.
\newcommand{\expVal}[1]{\mathbb{E}\left[#1\right]}
\newcommand{\expvalue}[2]{\underset{#1}{\mathbb{E}}\left[#2\right]}
\newcommand{\var}[2]{\underset{#1}{\text{Var}}\left[#2\right]}
% \cal<Letter>: the calligraphic (\mathcal) letter, for A-Z except I
% (no \calI is defined in this list).
\newcommand{\calA}{\mathcal{A}}
\newcommand{\calB}{\mathcal{B}}
\newcommand{\calC}{\mathcal{C}}
\newcommand{\calD}{\mathcal{D}}
\newcommand{\calE}{\mathcal{E}}
\newcommand{\calF}{\mathcal{F}}
\newcommand{\calG}{\mathcal{G}}
\newcommand{\calH}{\mathcal{H}}
\newcommand{\calJ}{\mathcal{J}}
\newcommand{\calK}{\mathcal{K}}
\newcommand{\calL}{\mathcal{L}}
\newcommand{\calM}{\mathcal{M}}
\newcommand{\calN}{\mathcal{N}}
\newcommand{\calO}{\mathcal{O}}
\newcommand{\calP}{\mathcal{P}}
\newcommand{\calQ}{\mathcal{Q}}
\newcommand{\calR}{\mathcal{R}}
\newcommand{\calS}{\mathcal{S}}
\newcommand{\calT}{\mathcal{T}}
\newcommand{\calU}{\mathcal{U}}
\newcommand{\calV}{\mathcal{V}}
\newcommand{\calW}{\mathcal{W}}

\newcommand{\calX}{\mathcal{X}}
\newcommand{\calY}{\mathcal{Y}}
\newcommand{\calZ}{\mathcal{Z}}


% \expect{p}{X}: E with subscript p, a thick space, then X (no brackets).
\newcommand{\expect}[2]{\mathbb{E}_{#1} \; #2}
% \supp{x}: "supp(x)" with the word set via \text.
\newcommand{\supp}[1]{\text{supp}(#1)}
\newcommand{\bbP}{\mathbb{P}}
\newcommand{\bfP}{\mathbf{P}}
\newcommand{\bfTheta}{\mathbf{\Theta}}
\newcommand{\Rhat}{\hat{R}}

% rho with superscript "pes", b or e.
% NOTE(review): presumably pessimistic / behaviour / evaluation policies - confirm.
\newcommand{\ppol}{\rho^{{\mathrm{pes}}}}
\newcommand{\bpol}{\rho^{b}}
\newcommand{\epol}{\rho^{e}}

% Remaps \overline to \tilde for the whole document: every \overline{x} used
% after this point is typeset as \tilde{x}.
\renewcommand{\overline}{\tilde}




% Margin notes reading "FIX" / "NEW".
\newcommand{\fix}{\marginpar{FIX}}
\newcommand{\new}{\marginpar{NEW}}

% \beginsupplement: reset the table and figure counters to 0 and print their
% numbers as S1, S2, ...
\newcommand{\beginsupplement}{\setcounter{table}{0}
        \renewcommand{\thetable}{S\arabic{table}}\setcounter{figure}{0}
        \renewcommand{\thefigure}{S\arabic{figure}}}
 
% Red, bold "[ADD CITATION]" placeholder.
% NOTE(review): \xspace is normally provided by the xspace package, which this
% preamble does not load explicitly - confirm it is available before using this.
\newcommand{\addcitation}{{\color{red} \textbf{[ADD CITATION]} \xspace}}


% Vertical-spacing hooks for figures, paragraphs, equations and captions.
% Each one currently expands to \vspace{-0pt}, i.e. it adds no space.
\newcommand{\vspacefigure}{\vspace{-0pt}}
\newcommand{\vspaceparagraph}{\vspace{-0pt}}
\newcommand{\vspaceequation}{\vspace{-0pt}}

\newcommand{\vspacecaption}{\vspace{-0pt}}
\newcommand{\vspacecaptionlow}{\vspace{-0pt}}
\newcommand{\vspacesubcaption}{\vspace{-0pt}}
% The line below also loads the algorithm package after the last hook.
\newcommand{\vspacesubcaptionlow}{\vspace{-0pt}} \usepackage{algorithm}
\usepackage{algpseudocode}
\usepackage{caption}
\usepackage{subcaption}
% "diag" operator with ordinary subscript placement.
\DeclareMathOperator{\diag}{diag}




% \swap[<sep>]{<a>}{<b>}: print <b><sep><a>; <sep> defaults to "-".
\newcommand{\swap}[3][-]{#3#1#2} 

\title{Hallucinated Adversarial Control for Conservative Offline Policy Evaluation - Supplementary Material}

\author[1]{Jonas~Rothfuss\thanks{ Equal contribution.}}
\author[1]{Bhavya~Sukhija\printfnsymbol{1}}
\author[1]{Tobias~Birchler\printfnsymbol{1}}
\author[1]{Parnian~Kassraie}
\author[1]{Andreas~Krause}
\affil[1]{ETH Zurich\\
    Switzerland
}
  
\begin{document}
\onecolumn
\maketitle

\appendix

\section{Algorithm and Experiment Details}\label{app:algos}
In the following, we provide algorithmic formalizations and implementation details of the \textsc{HAMBO} framework and its practical variants which were discussed in the main paper. 

\subsection{Generic HAMBO algorithm from Section \ref{subsec:general_hambo_approach}}
First, we formalize the general HAMBO framework from Section \ref{subsec:general_hambo_approach}:
\begin{algorithm}[ht]
\caption{\textsc{HAMBO} Framework}
\label{alg:hambo_basic}
\begin{algorithmic}
\Require Offline dataset $\calD_b$, evaluation policy $\pi_e$, reward function $r(\cdot, \cdot)$, Horizon $T$, initial state distribution $p_0(\bs_0)$
\State $(\bmu_n, \bsigma_n, \beta_n) \leftarrow \mathrm{TrainModel}(\calD_b)$ \Comment{Train statistical model with offline data}
\State $\tilde p_{\bm{\eta}}(\bs_{t+1}\vert \bs_t, \ba_t) \leftarrow  p_{\beps}\big(\bs_{t+1} -  \bmu_n(\bs_t, \ba_t)
    - \beta_n \boldeta(\bs_t, \ba_t)  \bsigma_n(\bs_t, \ba_t) \big)$
    \Comment{Set up adversarial transition model.}
\State $\tilde J(\pi_e) \leftarrow \min_{\bm{\eta}} \mathbb{E}_{\bs_0 \sim p_0}[\mathbb{E}_{\tilde p_{\bm{\eta}},\pi_e}[\sum_{t=0}^T r(\bs_{t}, \ba_{t})]]$  \Comment{Optimize adversary to get pessimistic value estimate.}\\
\Return  $\tilde J(\pi_e)$ \end{algorithmic}
\end{algorithm}

We estimate $J_{\tilde{p}_{\boldeta}}(\pi_e) = \mathbb{E}_{\bs_0 \sim p_0}[\mathbb{E}_{\tilde{p}_{\boldeta},\pi_e}[\sum_{t=0}^T r(\bs_{t}, \ba_{t})]]$ via Monte Carlo estimation, i.e., we roll out $L$ trajectories and estimate the expectation as the average of the trajectory returns:
\begin{equation} \label{eq:mc_estimation_return}
    \hat{J}_{\tilde{p}_{\boldeta}}(\pi_e) = \frac{1}{L} \sum_{l=1}^{L} \sum_{t=0}^T r(\bs_{l,t}, \ba_{l, t}) ~~ \text{where} ~~ \bs_{l,0} \sim p_0, ~ \ba_{l,t} \sim \pi_e(\ba \vert \bs_{l,t}), ~ \bs_{l,t+1} \sim  \tilde p_{\boldeta}(\bs' \vert \bs_{l,t}, \ba_{l,t}) ~.
\end{equation}

The optimization of the adversary corresponds to a standard optimal control problem for which we use traditional methods such as trajectory optimization or model-free RL algorithms such as SAC.
\subsection{BNN Based HAMBO Variants}

\subsubsection{The BNN model}
We use fully connected neural networks with 4 hidden layers each of size 256 with ReLU activation functions.
Before training, the offline data inputs and targets are standardized. The NN takes the concatenated state and action as input (i.e., $d_s + d_a$ dimensional) and outputs a vector of size $2 d_s$ which is split into two vectors of size $d_s$. The first one corresponds to the mean prediction $\bh_{\btheta}(\bs, \ba)$ and the second one is the raw aleatoric standard deviation, which is fed through a softplus function to ensure positivity of $\bnu^2_{\btheta}(\bs, \ba)$.

As BNN prior, we use a standard Normal distribution over the NN parameters $\btheta$, i.e., $p(\btheta) = \calN(\btheta ; \boldsymbol{0}, \bm{I})$. However, as commonly done for BNNs to alleviate the problems of prior misspecification, we add a temperature parameter $\tau$ to the prior, so that we have $p(\btheta| \calD_b) \propto p(\calD_b | \btheta) p(\btheta)^\tau$. This hyper-parameter is chosen as $\tau=0.0001$ for Pendulum and Hopper and $\tau=0.01$ for the HalfCheetah control environment.

We use Stein Variational Gradient Descent (SVGD) \citep{liu2016svgd} for approximate posterior inference. In particular, we approximate the posterior $p(\btheta| \calD_b)$ with $K$ NN particles $\{ \btheta_1, ..., \btheta_K\}$. After randomly initializing the parameters of the $K$ NNs, the parameters are iteratively updated with the SVGD update rule:
 \begin{equation}
     \btheta_k \leftarrow \btheta_k+  \frac{1}{K} \sum_{k'=1}^K \left [k(\btheta_{k'}, \btheta_k) \nabla_{\btheta_{k'}} \log p(\btheta_{k'}| \calD_b) +  \nabla_{\btheta_{k'}} k(\btheta_{k'}, \btheta_k) \right] ~~ \forall k = 1, \dots, K ~.
 \end{equation}
Here $k(\cdot, \cdot)$ is a kernel function on the space of NN parameter vectors. In our experiments, we use an RBF kernel $k(\btheta, \btheta') = \exp \left( - \norm{\btheta - \btheta'}^2 / (2 \ell) \right)$ with a length scale of $\ell = 10$ and $K=5$ NN particles. Note that the kernel here is different from the one in Section~\ref{section:hambo_gp_models}. Algorithm~\ref{alg:SVGD} summarizes how to obtain the SVGD BNN posterior approximation.

\begin{algorithm}[ht]
\caption{\textsc{SVGD}}
\label{alg:SVGD}
\begin{algorithmic}
\Require Training data $\calD$, number of particles $K$
\State Initialize NN parameter vectors $\btheta_1, ..., \btheta_K $ \Comment{Initialize SVGD particles.}
\While{not converged}
    \State $\log p(\btheta_{k}| \calD) \leftarrow \log p(\calD| \btheta_k) + \tau \log p(\btheta_k)  ~~\forall k = 1, ..., K$
    \State $\btheta_k \leftarrow \btheta_k+  \frac{1}{K} \sum_{k'=1}^K \left [k(\btheta_{k'}, \btheta_k) \nabla_{\btheta_{k'}} \log p(\btheta_{k'}| \calD) +  \nabla_{\btheta_{k'}} k(\btheta_{k'}, \btheta_k) \right] ~~ \forall k = 1, \dots, K$
\EndWhile \\
\Return $\{\bh_{\btheta_1}, \bnu^2_{\btheta_1}, \cdots,\bh_{\btheta_K}, \bnu^2_{\btheta_K}\} $ \Comment{Return the $K$ NN predictive mean and aleatoric variance functions}
\end{algorithmic}
\end{algorithm}


\subsection{Recalibration of the BNN uncertainty estimates} 
To obtain well-calibrated confidence sets for HAMBO, we recalibrate the BNN's predictive distribution. In particular, we use temperature scaling based on the regression calibration error \citep{kuleshov2018calibration}. We perform re-calibration based on the predictive distribution $\calN(\bmu_\Theta(\bs, \ba), \bsigma^2_{\Theta}(\bs, \ba))$. The calibration error compares the predictive quantiles of this Normal distribution with the corresponding empirical frequencies of data points that fall below the predicted quantiles. Formally, we define $\Phi_{\boldsymbol{\tau}}^{-1}(\alpha; \bs, \ba): [0,1]^{d_s} \rightarrow \calS$ as the quantile function (inverse cumulative distribution function) of $\calN(\bmu_\Theta(\bs, \ba), \boldsymbol{\tau}^2 \bsigma^2_{\Theta}(\bs, \ba))$ where $\boldsymbol{\tau} \in \mathbb{R}^{d_s}$ is the temperature scaling vector. Given a calibration dataset $\calD_c = \{(\bs, \ba, \bs') \}$, the calibration error \citep{kuleshov2018calibration} for multivariate distributions follows as 
\begin{equation}
    \mathrm{CalErr}(\boldsymbol{\tau})  \coloneqq \frac{1}{d_s} \sum_{j=1}^{d_s} \frac{1}{|A|} \sum_{\alpha \in A}  \left(\mathrm{EmpFreq}(\alpha; \boldsymbol{\tau})_j - \alpha \right)^2 ~,
\end{equation}
where $A = \{0.1, \dots, 0.9, 0.99\}$ is a set of confidence levels and
\begin{equation}
    \mathrm{EmpFreq}(\alpha; \boldsymbol{\tau}) \coloneqq \frac{1}{|\calD_c|} \sum_{(\bs, \ba, \bs') \in \calD_c} \mathbf{1}\{ \bs' \leq \Phi_{\boldsymbol{\tau}}^{-1}(\alpha; \bs, \ba) \}
\end{equation}
is a vector-valued function of the (per dimension) empirical frequencies of the prediction targets that fall below the $\alpha$ quantile.
Finally, we recalibrate the BNN predictions, by choosing the variance scaling vector $\boldsymbol{\tau}$ such that the calibration error is minimized, i.e., we choose
\begin{equation}
    \boldsymbol{\tau}^* = \argmin_{\boldsymbol{\tau}} \mathrm{CalErr}(\boldsymbol{\tau})  ~.
\end{equation}
Algorithm~\ref{alg:BNN_calibrate} summarizes this BNN re-calibration procedure:
\begin{algorithm}[ht]
\caption{\textsc{CalibrateBNN}}
\label{alg:BNN_calibrate}
\begin{algorithmic}
\Require calibration dataset $\calD_c$, predictive mean $\bmu(\cdot, \cdot)$, predictive variance $\bsigma^2(\cdot, \cdot)$
\State $A \leftarrow \{0.1, \cdots, 0.9, 0.99\}$. \Comment{Fix a set of confidence levels.}
\State Define $\Phi_{\boldsymbol{\tau}}^{-1}(\cdot; \bs, \ba): [0,1] \to \sR^{d_s}$ as the inverse CDF of the Gaussian distribution $\calN(\bmu(\bs, \ba), \boldsymbol{\tau}^2\bsigma^2(\bs, \ba))$.
\State Define $\mathrm{EmpFreq}(\alpha; \boldsymbol{\tau}) \leftarrow \frac{1}{|\calD_c|} \sum_{(\bs, \ba, \bs') \in \calD_c} \mathbf{1}\{ \bs' \leq \Phi_{\boldsymbol{\tau}}^{-1}(\alpha; \bs, \ba) \}$ 
\State Define $\mathrm{CalErr}(\boldsymbol{\tau})  \leftarrow \frac{1}{d_s} \sum_{j=1}^{d_s} \frac{1}{|A|} \sum_{\alpha \in A}  \left(\mathrm{EmpFreq}(\alpha; \boldsymbol{\tau})_j - \alpha \right)^2$\\
\Return $\arg\min_{\boldsymbol{\tau}} \mathrm{CalErr}(\boldsymbol{\tau}) $ \Comment{Choose $\boldsymbol{\tau}$ that minimizes the calibration error}
\end{algorithmic}
\end{algorithm}

\subsection{The NN-Based HAMBO variants}
Here, we provide algorithmic descriptions of the NN-Based HAMBO variants from Section \ref{sec:hambo_nn} as well as details about their implementation and how the corresponding experiments were conducted.











\subsubsection{HAMBO with a Continuous Adversary (HAMBO-CA)}

HAMBO-CA directly reflects the hallucinated adversarial transition model, introduced in Section \ref{section:hambo_general_idea}. The adversary $\boldeta(\bs, \ba) \in [-1, 1]^{d_s}$ chooses the mean of the Gaussian transition probability from the epistemic confidence set, i.e.,
\begin{align}
\tilde{p}_{\boldeta}(\bs'| \bs, \ba) \coloneqq \calN \big(\bs';  \bmu_\Theta(\bs, \ba) +  \tau^2 \boldeta(\bs, \ba)  \bsigma^2_{\Theta, e}(\bs, \ba) , \bsigma^2_{\Theta, a}(\bs, \ba) \big) ~.
\end{align}
To obtain the corresponding conservative value estimate $\tilde{J}(\pi_e) = \min_{\boldeta} J_{\tilde{p}_{\boldeta}}(\pi_e)$, we need to find the adversary $\boldeta^\star$ that minimizes the expected return. For this, we parameterize the adversary $\boldeta$ as a neural network policy with two hidden layers of size 256 with ReLU activations and a tanh squashed Gaussian conditional distribution over the adversary actions in $[-1, 1]^{d_s}$. We use SAC \citep{haarnoja2018sac2} to maximize the negative expected return of the adversary policy. As usual, to stabilize the SAC training and avoid Q-value overestimation, we use double critics and trailing target critics. The SAC training is conducted in rounds consisting of rollouts of 1000 episodes under the hallucinated transition model where actions are chosen by $\pi_e$, followed by 1000 gradient steps on the SAC objectives. For the gradient steps, we use a batch size of 1024 and the Adam optimizer with a learning rate of $10^{-3}$ for critic and policy and $5 \cdot 10^{-5}$ for the SAC entropy parameter. After SAC has converged, we take the adversary policy $\boldeta^\star$ and estimate the expected return $\hat{J}_{\tilde p_{\boldeta^\star}}(\pi_e)$ of $\pi_e$ under the adversary transition model, induced by $\boldeta^\star$ with $L = 10^4$ trajectories (see Eq.~\ref{eq:mc_estimation_return}). The HAMBO-CA method is summarized in Algorithm~\ref{alg:hambo_ca}.

\begin{algorithm}[!h]
\caption{\textsc{HAMBO-CA}}
\label{alg:hambo_ca}
\begin{algorithmic}
\Require Offline dataset $\calD_b$, evaluation policy $\pi_e$, Number of BNN particles $K$
\State Select a subset of $\calD_b$ as calibration set $\calD_c$
\State $\{\bh_{\btheta_1}, \bnu^2_{\btheta_1}, \cdots,\bh_{\btheta_K}, \bnu^2_{\btheta_K}\} \leftarrow \mathrm{SVGD}(\calD_b \setminus \calD_c, K)$ \Comment{Train BNN via SVGD and get predictive NN functions}
\State $\bmu_\Theta(\bs, \ba) \leftarrow \frac{1}{K} \sum_{k=1}^K \bh_{\btheta_k}(\bs, \ba)$ \Comment{Calculate posterior mean.}
\State $\bsigma_{\Theta,e}^2(\bs, \ba) \leftarrow \frac{1}{K} \sum_{k=1}^K (\bh_{\btheta_k}(\bs, \ba) - \bmu_\Theta(\bs, \ba))^2$\Comment{Calculate epistemic uncertainty.}
\State $\bsigma_{\Theta,a}^2(\bs, \ba) \leftarrow\frac{1}{K}\sum_{k=1}^K \bnu^2_{\btheta_k}(\bs, \ba)$\Comment{Calculate aleatoric uncertainty.}
\State $\tau \leftarrow \mathrm{CalibrateBNN}(\calD_c, \bmu_\Theta, \bsigma^2_{\Theta, e}+\bsigma^2_{\Theta, a})$ \Comment{Calibrate the model}
\State Initialize adversary policy $\boldeta$
\State $\tilde{p}_{\boldeta}(\bs'| \bs, \ba) \leftarrow \calN \big(\bs' ; \bmu_\Theta(\bs, \ba) +  \tau^2 \boldeta(\bs, \ba)  \bsigma^2_{\Theta, e}(\bs, \ba) , \bsigma^2_{\Theta, a}(\bs, \ba) \big)$ \Comment{Setup hallucinated adversarial transition model}
\State $\boldeta^\star \leftarrow \mathrm{SoftActorCritic}(-J_{\tilde p_{\boldeta}}(\pi_e), \boldeta)$ \Comment{Train adversary $\boldeta$ via SAC to maximize the negative return}
\State $\tilde{J}(\pi_e) \leftarrow \hat{J}_{\tilde p_{\boldeta^\star}}(\pi_e)$ \Comment{Estimate expected return of $\pi_e$ via sampling (see Eq. \ref{eq:mc_estimation_return})} \\
\Return $\tilde{J}(\pi_e)$
\end{algorithmic}
\end{algorithm}

\subsubsection{HAMBO with a Discrete Adversary (\textsc{HAMBO-DA1} and \textsc{HAMBO-DAinf})}
In the case of \textsc{HAMBO-DA1}, the adversary $\vartheta$ has the discrete action space $\{1, \dots, K\}$, i.e., it picks one of the $K$ particles. We parameterize the adversary policy as a neural network with two hidden layers of size 256 with ReLU activations and softmax-categorical distribution over the $K$ discrete actions. To train this adversary policy, we use clipped double DQN \citep{fujimoto2018td3}. The double DQN training is conducted in rounds consisting of rollouts of 1000 episodes under the hallucinated transition model where actions are chosen by $\pi_e$, followed by 1000 gradient steps on the DQN objectives. For the gradient steps, we use a batch size of 1024 and the Adam optimizer with a learning rate of $10^{-3}$. Once double DQN has converged, we take the adversary policy $\vartheta^\star$ and estimate the expected return $\hat{J}_{\tilde p_{\vartheta^\star}}(\pi_e)$ of $\pi_e$ under the adversary transition model, induced by $\vartheta^\star$ with $L = 10^4$ trajectories (see Eq.~\ref{eq:mc_estimation_return}).
The overall \textsc{HAMBO-DA1} method is summarized in Algorithm \ref{alg:hambo_da-1}.
\begin{algorithm}[h]
\caption{\textsc{HAMBO-DA1}}
\label{alg:hambo_da-1}
\begin{algorithmic}
\Require Offline dataset $\calD_b$, evaluation policy $\pi_e$, Number of BNN particles $K$
\State  $\{\bh_{\btheta_1}, \bnu^2_{\btheta_1}, \cdots,\bh_{\btheta_K}, \bnu^2_{\btheta_K}\} \leftarrow \mathrm{SVGD}(\calD_b, K)$ \Comment{Train BNN via SVGD and get predictive NN functions}
\State Initialize adversary policy $\vartheta$
\State $\tilde{p}_{\vartheta}(\bs'| \bs, \ba) :=  \sum_{k=1}^K \vartheta(k | \bs, \ba) \calN \big(\bs' ;  \bh_{\btheta_k}(\bs, \ba) , \bnu^2_{\btheta_k}(\bs, \ba) \big)$ \Comment{Setup hallucinated adversarial transition model}
\State $\vartheta^\star \leftarrow \mathrm{DoubleDQN}(-J_{\tilde p_{\vartheta}}(\pi_e), \vartheta)$ 
\Comment{Train adversary $\vartheta$ via double DQN to maximize the negative return}
\State $\tilde{J}(\pi_e) \leftarrow \hat{J}_{\tilde p_{\vartheta^\star}}(\pi_e)$ \Comment{Estimate expected return of $\pi_e$ via sampling (see Eq. \ref{eq:mc_estimation_return})} \\
\Return $\tilde{J}(\pi_e)$
\end{algorithmic}
\end{algorithm}

In contrast to \textsc{HAMBO-DA1}, \textsc{HAMBO-DAinf} uses a weaker adversary that has to commit to one of the BNN particles for the entire trajectory. As a result, the corresponding pessimistic HAMBO estimate can simply be chosen as the minimum of the expected evaluation policy return under each of the NN models in the particle approximation, i.e., $\tilde{J}(\pi_e) = \min_{k \in \{1, \dots, K\}} J_{p_{\btheta_k}}(\pi_e)$. The \textsc{HAMBO-DAinf} method is summarized in Algorithm~\ref{alg:hambo_dainf}.

\begin{algorithm}[ht]
\caption{\textsc{HAMBO-DAinf}}
\label{alg:hambo_dainf}
\begin{algorithmic}
\Require  Offline dataset $\calD_b$, evaluation policy $\pi_e$, Number of BNN particles $K$
\State $\{\bh_{\btheta_1}, \bnu^2_{\btheta_1}, \cdots,\bh_{\btheta_K}, \bnu^2_{\btheta_K}\} \leftarrow \mathrm{SVGD}(\calD_b, K)$ \Comment{Train BNN via SVGD and get predictive NN functions}
\State $p_{\btheta_k}(\bs'| \bs, \ba) \leftarrow \calN \big(\bs' ;  \bh_{\btheta_k}(\bs, \ba) , \bnu^2_{\btheta_k}(\bs, \ba) \big)$ 
\State $\tilde J(\pi_e) \leftarrow \min_{k \in \{1, \dots, K\}} \hat{J}_{p_{\btheta_k}}(\pi_e)$ \Comment{Estimate return of $\pi_e$ for each model (see Eq. \ref{eq:mc_estimation_return}) and take minimum}\\
\Return $\tilde J(\pi_e)$
\end{algorithmic}
\end{algorithm}


\section{Proofs and Derivations}
\label{sec:missingproofs}

\begin{proof}[\textbf{Proof of Proposition~\ref{prop:valid_lower_bound}}]
By Assumption \ref{assumption:calibration} we have, with probability $1-\delta$, uniformly over $\calS \times \calA$, that
\begin{equation}
    | \bmu_n(\bs, \ba) - f(\bs, \ba) | \leq \beta_n(\delta) \bsigma_n(\bs, \ba) ~.
\end{equation}
Hence, there exists an (adversary) mapping $\boldeta^\dagger: \calS \times \calA \mapsto [-1, 1]^{d_s}$ such that for all $(\bs, \ba) \in \calS \times \calA$ we have
\begin{equation}
f(\bs, \ba) = \bmu_n(\bs, \ba) + \beta_n(\delta) \boldeta^\dagger(\bs, \ba)  \bsigma_n(\bs, \ba) ~,
\end{equation}
and, thus the hallucinated transition model is equal to the true transition dynamics, i.e.,
\begin{align}
\tilde{p}_{\boldeta^\dagger}(\bs_{t+1}| \bs_t, \ba_t) = p_{\beps}\big(\bs_{t+1} - \bmu_n(\bs_t, \ba_t)
    - \beta_n(\delta) \boldeta^\dagger(\bs_t, \ba_t)  \bsigma_n(\bs_t, \ba_t) \big) = p_{\beps}\big(\bs_{t+1} - f(\bs_t, \ba_t) \big) = p(\bs_{t+1}| \bs_t, \ba_t) ~.
\end{align}
Finally, we can use this to show
\begin{align}
    \tilde{J}(\pi_e) := \min_{\boldeta} J_{\tilde{p}_{\boldeta}}(\pi_e) \leq J_{\tilde{p}_{\boldeta^\dagger}}(\pi_e) =  J_{p}(\pi_e) = J(\pi_e) ~,
\end{align}
which concludes the proof.
\end{proof}



\begin{proof}[\textbf{Proof of Example \ref{example:reparametrizable_policy_is_w1_lip}}]
If $\pi(\ba|\bs)$ can be reparametrized as $g(\bs, \boldsymbol{\zeta})$, where $\boldsymbol{\zeta} \sim p(\boldsymbol{\zeta})$ and $g$ is $L_g$-Lipschitz, we have that the two random variables are equal in distribution, i.e.
\smash{
    $\ba \stackrel{d}{=} g(\bs, \boldsymbol{\zeta})$
}. Therefore,
\begin{align}
    \calW_1(\pi(\ba|\bs), \pi(\ba|\bs')) = &~\inf_{\gamma \in \Gamma(\pi(\ba|\bs), \pi(\ba|\bs'))} \E_{\ba, \ba' \sim \gamma} \left[ {\norm{\ba - \ba'}} \right] \\
    \leq &~ \E_{\ba, \ba' \sim \tilde{\gamma}} \left[ {\norm{\ba - \ba'}} \right] = \E_{\boldsymbol{\zeta}} \left[ {\norm{g(\bs, \boldsymbol{\zeta}) - g(\bs', \boldsymbol{\zeta})}} \right] \\
    \leq &~ L_g \norm{\bs - \bs'}
\end{align}
where $\tilde{\gamma}(\ba, \ba')$ is the joint probability distribution of $(g(\bs, \boldsymbol{\zeta}), g(\bs', \boldsymbol{\zeta}))$, and, thus a coupling. Hence, we have shown that $\pi(\ba | \bs)$ is $L_g$-Lipschitz w.r.t. the Wasserstein-1 distance.
\end{proof}

\subsection{Proof of Theorem~\ref{theorem:lower_bound}}

The following lemmata will be used to prove the theorem.

\begin{lemma}[Reparameterizability of two random variables with covariates]
\label{lemma:coupling_lemma}
Let $X$ and $Y$ be random variables with finite expectation and corresponding probability distributions $p$ and $q$. Then, we can reparameterize $Y$ as $Y \stackrel{d}{=} X + \bome_{|X}$, where $\bome_{|X}$ is a covariate that is generally dependent on $X$ and satisfies
\begin{equation}
    \E_X \E_{\bome | X} \left[ \norm{\bome} \right] = \calW_1(p, q) ~,
\end{equation}
where $\calW_1(p, q)$ is the Wasserstein-1 distance between $p$ and $q$.
\end{lemma}

\begin{proof}
Recall that the Wasserstein-1 distance is defined as infimum over couplings between $p$ and $q$, i.e.,
    \begin{equation} \label{eq:w1_def_23423}
        \calW_1(p, q) = \inf_{\gamma \in \Gamma(p, q)} \E_{X', Y' \sim \gamma} \left[ \norm{X'-Y'} \right] ~.
    \end{equation}
    If the expectation of $p$ and $q$ is finite, then the infimum over couplings in (\ref{eq:w1_def_23423}) is attained for some $\gamma^*(x, y)$. 
Now we construct the covariate $\bome_{|X}$ which is defined by applying the change of variable $g_x(x, y) \mapsto (x, y - x) = (x, \bome)$ to $\gamma^*$, so that we get $\tilde{\gamma}^*(x, \bome) = \gamma^*(x, x + \bome)$. The conditional distribution of the covariate $\bome_{|X}$ is 
    \[\tilde{\gamma}^*(\bome | x) = \frac{\gamma^*(x, x + \bome)}{\gamma^*(x)} = \frac{\gamma^*(x, x + \bome)}{p(x)} ~.\]

    Now, given our construction of $\bome_{|X}$, we aim to show that $Y \stackrel{d}{=} X + \bome_{|X}$. Define the random variable $Z:=X + \bome_{|X}$. Then we have 
    \begin{align}
        p(z) &= \int_{\calX} p(x, z-x) dx = \int_{\calX} p(x) \tilde{\gamma}^*(\underbrace{z-x}_{\bome}| x) dx \\
        &= \int_{\calX} \gamma^*(x, x+(z-x)) dx = \int_{\calX} \gamma^*(x, z) dx = q(z)
    \end{align}
    which shows that the pdf of $z$ is $q$, the probability density of $Y$. Since $\gamma^*(x, y)$ is the coupling that minimizes the transport cost, we can write
    \begin{align}
        \calW_1(p, q) = \inf_{\gamma \in \Gamma(p, q)} \E_{x', y' \sim \gamma} \left[ \norm{x'-y'} \right] 
        = &~ \E_{x', y' \sim \gamma^*} \left[ \norm{x'-y'} \right] = \E_{x' \sim p(x')} \E_{\bome \sim \tilde{\gamma}^*(\bome|x')} \left[ \norm{\bome} \right]
    \end{align}
which shows that $\E_X \E_{\bome | X} \left[ \norm{\bome} \right] = \calW_1(p, q)$, and, thus concludes the proof.
\end{proof}

\begin{corollary}
Let $\pi(\ba|\bs)$ be $L_\pi$-Lipschitz w.r.t. the Wasserstein-1 distance. For any arbitrary but fixed $\bs, \bs' \in \calS$ we denote $A$ and $A'$ as the random variables that follow the conditional distributions $\pi(\ba|\bs)$ and $\pi(\ba'|\bs')$ respectively. Then, we can construct a covariate $\bome_{|A}$ such that $A' \stackrel{d}{=} A + \bome_{|A}$ and
$\E_{A}\E_{\bome | A} \left[ \norm{\bome} \right] \leq L_\pi \norm{\bs - \bs'}$.
\label{corr:bound_on_coupling}
\end{corollary}
\begin{proof}
    The corollary directly follows from Lemma \ref{lemma:coupling_lemma} and the definition of the $L_\pi$-Lipschitz continuity w.r.t. the Wasserstein-1, i.e., that $\forall ~ \bs, \bs' \in \calS$ we have that $\calW_1\hspace{-2pt}\left(\pi(\ba|\bs), \pi(\ba|\bs') \right) \leq L_\pi \norm{\bs - \bs'}$.
\end{proof}


\begin{lemma}[Lipschitz continuity of Wasserstein-one distance implies Lipschitz continuity in expectation]
    Let $f: \mathcal{X}_1 \times \mathcal{X}_2 \rightarrow \mathcal{Y}$ be $L_f$ Lipschitz continuous and $x_2$ a random variable with distribution $p(\cdot|x_1)$ that is $L_p$ Lipschitz w.r.t. the Wasserstein-1 distance. Then we have
    \begin{equation*}
        \expvalue{x_2 \sim p(\cdot|x_1)}{f(x_1, x_2)} -    \expvalue{x'_2 \sim p(\cdot|x'_1)}{f(x'_1, x'_2)} \leq \Bar{L}_f||x_1 - x_1'||.
    \end{equation*}
    with $\Bar{L}_f = L_f(1+L_p)$.
    \label{lemma:lipschitz_composition}
\end{lemma}

\begin{proof}
    \begin{align*}
         \expvalue{x_2 \sim p(\cdot|x_1)}{f(x_1, x_2)} -    \expvalue{x'_2 \sim p(\cdot|x'_1)}{f(x'_1, x'_2)} &=  \expvalue{x_2 \sim p(\cdot|x_1)}{f(x_1, x_2)} - \expvalue{x_2 \sim p(\cdot|x_1)} {\expvalue{\bome \sim \Tilde{\gamma}^*(\bome|x_2)}{f(x'_1, x_2 + \bome)}} \tag{Lemma~\ref{lemma:coupling_lemma}. }\\
&= \expvalue{x_2 \sim p(\cdot|x_1)} {\expvalue{\bome \sim \Tilde{\gamma}^*(\bome|x_2)}{{f(x_1, x_2)}  - f(x'_1, x_2 + \bome )}} \\
         &\leq L_f \expvalue{x_2 \sim p(\cdot|x_1)}{ \expvalue{\bome \sim \Tilde{\gamma}^*(\bome|x_2)}{
         {|| x_1 - x_1'||_2 + ||\bome||_2}}} \tag{Lipschitzness of $f$}\\
         &\leq L_f || x_1 - x_1'||_2 + L_fL_p||x_1 - x'_1||_2 \tag{Corollary~\ref{corr:bound_on_coupling}}\\
         &= L_f(1+L_p)||x_1 - x_1'||_2. \end{align*}
\end{proof}

In the following, we bound the difference between the pessimistic and true return with the distance between the true and pessimistic trajectory using the Lipschitz continuity of the reward function and the policy's Wasserstein-one distance.
\begin{lemma}[Bound on difference between pessimistic and true return estimate]
\label{lemma:evaluation_error_bound}
Under Assumption \ref{assumption:lip_continuity} we have
\begin{align*}
\left|J(\pi_e) - \tilde{J}(\pi_e)\right|
&\leq \Bar{L}_r \expvalue{{\beps}_{0:T-1}, a_{0:T}}{\expvalue{{\bome}_{0:T}}{\sum_{t=0}^{T-1} ||\bs_t - \Tilde{\bs}_t||_2}}.
\end{align*}
where $\Bar{L}_r = L_r(1+L_{\pi})$.
\end{lemma}
\begin{proof}
We have
\begin{align*}
\left|J(\pi_e) - \tilde{J}(\pi_e)\right|&= \left|\expvalue{\bs_{0:T}, \ba_{0:T}}{\sum_{t=0}^{T-1}r(\bs_t, \ba_t)} - \expvalue{\tilde{\bs}_{0:T}, \tilde{\ba}_{0:T}}{\sum_{t=0}^{T-1}r(\tilde{\bs}_t,\tilde{\ba}_t)}\right|\\
&= \left|\expvalue{{\beps}_{0:T}, a_{0:T}}{\sum_{t=0}^{T-1}r(\bs_t, \ba_t)} - \expvalue{{\beps}_{0:T}, \ba_{0:T}} {\expvalue{\bome_{0:T}}{{\sum_{t=0}^{T-1}r(\tilde{\bs}_t,\ba_t + \bome_t)}}}\right| \tag{Lemma~\ref{lemma:coupling_lemma}}\\
&= \left|\expvalue{{\beps}_{0:T}, \ba_{0:T}}{\expvalue{\bome_{0:T}}{{\sum_{t=0}^{T-1}r(\bs_t, \ba_t) - r(\tilde{\bs}_t,\ba_t + \bome_t)}}}\right|.
\end{align*}
From Lemma~\ref{lemma:coupling_lemma}, we know that $\mathbb{E} \norm{\bome_t} = \calW_1(\pi(\cdot \vert \bs_t), \pi(\cdot \vert \tilde{\bs}_t))$, where $\pi(\cdot \vert \cdot)$ is Lipschitz continuous w.r.t. the Wasserstein-1 distance. Therefore,
\begin{align*}
\left|J(\pi_e) - \tilde{J}(\pi_e)\right|&\leq \expvalue{{\beps}_{0:T}, \ba_{0:T}}{\expvalue{\bome_{0:T}}{{\sum_{t=0}^{T-1} L_r(1+L_{\pi})||\bs_t - \Tilde{\bs}_t||_2}}} \tag{Lemma~\ref{lemma:lipschitz_composition}}\\
&= \Bar{L}_r \expvalue{{\beps}_{0:T}, \ba_{0:T}}{\expvalue{\bome_{0:T}}{{\sum_{t=0}^{T-1} ||\bs_t - \Tilde{\bs}_t||_2}}} \tag{$\Bar{L}_r = L_r(1+L_{\pi})$}.
\end{align*}
\end{proof}
Next, we bound the distance between the true and pessimistic trajectory with the epistemic uncertainty around the true trajectory.
\begin{lemma}[Bound on pessimistic and true trajectory]
\label{lemma:state_divergence_bound}
Under Assumptions \ref{assumption:calibration} and \ref{assumption:lip_continuity}, with probability at least $1-\delta$, for all $\boldeta: \calS \times \calA \rightarrow [-1,1]^{d_s}$ we have for all $t \in \{0,\dots,T\}$ that
\begin{align*}
\resizebox{0.99\linewidth}{!}{$\expvalue{{\beps}_{0:T}, \ba_{0:T}}{\expvalue{{\bome}_{0:T}}{\norm{\bs_{t+1} - \tilde \bs_{t+1}}}} 
\leq \left(1 + \sqrt{d_s}\right)\beta\sum_{i=0}^{(t+1)-1}\left(\Bar{L}_f + \left(1 + \sqrt{d_s}\right)\beta\Bar{L}_{\sigma}\right)^{(t+1)-1-i}\expvalue{{\beps}_{0:T}, \ba_{0:T}}{\expvalue{{\bome}_{0:T}}{\norm{\bsigma_n(\bs_i, \ba_i)}}}.$
}
\end{align*}
\end{lemma}
\begin{proof}
We prove by induction.
For $t=1$ we have 
\begin{align*}
\expvalue{{\beps}_{0:T}, \ba_{0:T}}{\expvalue{{\bome}_{0:T}}{\norm{\bs_1 - \tilde \bs_1}}} & =  \expvalue{{\beps}_{0:T}, \ba_{0:T}}{\expvalue{{\bome}_{0:T}}{\norm{f(\bs_0,\ba_0) + \beps_0 - \bmu_n(\bs_0,\ba_0) - \beta \bsigma_n(\bs_0,\ba_0)\boldeta(\bs_0, \ba_0) - \beps_0}}} \tag{Lemma~\ref{lemma:coupling_lemma}}\\
&\leq \expvalue{{\beps}_{0:T}, \ba_{0:T}}{\expvalue{{\bome}_{0:T}}{\norm{f(\bs_0,\ba_0) - \bmu_n(\bs_0, \ba_0)} + \norm{\beta_n \bsigma_n(\bs_0,\ba_0)\boldeta(\bs_0, \ba_0)}}}\\
&\leq \left(1 + \sqrt{d_s}\right)\beta_n\expvalue{{\beps}_{0:T}, \ba_{0:T}}{\expvalue{{\bome}_{0:T}}{\norm{\bsigma_n(\bs_0, \ba_0)}}} \tag{$\boldeta \in [-1,1]^{d_s}$}
\end{align*}

We get the induction hypothesis that for an arbitrary but fixed $t \geq 0$ we have
\begin{align*}
\resizebox{0.99\linewidth}{!}{$
\expvalue{{\beps}_{0:T}, \ba_{0:T}}{\expvalue{{\bome}_{0:T}}{\norm{\bs_t - \tilde \bs_t}}}
 \leq \left(1 + \sqrt{d_s}\right)\beta_n\sum_{i=0}^{t-1}\left(\Bar{L}_f + \left(1 + \sqrt{d_s}\right)\beta\Bar{L}_{\sigma}\right)^{t-1-i}\expvalue{{\beps}_{0:T}, \ba_{0:T}}{\expvalue{{\bome}_{0:T}}{\norm{\bsigma_n(\bs_i,\ba_i)}}}
 $
}
\end{align*}

Now for the induction step we can first derive
\begin{small}
\begin{align*}
\expvalue{{\beps}_{0:T}, \ba_{0:T}} {\expvalue{{\bome}_{0:T}}{\norm{\bs_{t+1} - \tilde \bs_{t+1}}}} &= \expvalue{{\beps}_{0:T}, \ba_{0:T}}{\expvalue{{\bome}_{0:T}}{\norm{f(\bs_t,\ba_t) + \beps_t - \bmu_n(\tilde \bs_t, \tilde \ba_t) - \beta_n \bsigma_n(\tilde \bs_t,\tilde \ba_t)\boldeta(\tilde \bs_t, \tilde \ba_t) - \beps_t}}} \\
&= \expvalue{{\beps}_{0:T}, \ba_{0:T}} {\expvalue{{\bome}_{0:T}}{\norm{f(\bs_t,\ba_t) - \bmu_n(\tilde \bs_t,\ba_t + \bome_t) - \beta_n \bsigma_n(\tilde \bs_t,\ba_t + \bome_t) \boldeta(\tilde \bs_t, \ba_t + \bome_t)}}} \tag{Lemma~\ref{lemma:coupling_lemma}}\\
&\leq \expvalue{{\beps}_{0:T}, \ba_{0:T}}{\expvalue{{\bome}_{0:T}}{\norm{f(\bs_t, \ba_t) - f(\tilde \bs_t,\ba_t + \bome_t) + f(\tilde \bs_t,\ba_t + \bome_t) - \bmu_n(\tilde \bs_t, \ba_t + \bome_t)}}}\\
&\quad \quad+ \expvalue{{\beps}_{0:T}, \ba_{0:T}}{\expvalue{{\bome}_{0:T}}{\norm{\beta_n \bsigma_n(\tilde \bs_t, \ba_t + \bome_t) \boldeta(\tilde \bs_t, \ba_t + \bome_t)}}}\\
&\leq \expvalue{{\beps}_{0:T}, \ba_{0:T}}{\expvalue{{\bome}_{0:T}}{\norm{f(\bs_t,\ba_t) - f(\tilde \bs_t,\ba_t + \bome_t)} + \norm{f(\tilde \bs_t,\ba_t + \bome_t) - \bmu_n(\tilde \bs_t, \ba_t + \bome_t)}}} \\
&\quad \quad+\expvalue{{\beps}_{0:T}, \ba_{0:T}}{\expvalue{{\bome}_{0:T}}{\norm{\beta_n \bsigma_n(\Tilde{\bs}_t, \ba_t + \bome_t)\boldeta(\tilde \bs_t, \ba_t + \bome_t)}}}\\
&\leq \expvalue{{\beps}_{0:T}, \ba_{0:T}}{\expvalue{{\bome}_{0:T}}{\Bar{L}_f\norm{\bs_t - \tilde \bs_t} + \left(1 + \sqrt{d_s}\right)\beta_n\norm{\bsigma_n(\tilde \bs_t, \ba_t + \bome_t)}}} \tag{Lemma~\ref{lemma:lipschitz_composition} and $\boldeta \in [-1,1]^{d_s}$}
\end{align*}
\end{small}
By applying the triangle inequality and adding and subtracting $\bsigma_n$ to the second term, 
\begin{small}
\begin{align*}
\expvalue{{\beps}_{0:T}, \ba_{0:T}}{\expvalue{{\bome}_{0:T}}{\norm{\bs_{t+1} - \tilde \bs_{t+1}}}} &\leq \expvalue{{\beps}_{0:T}, \ba_{0:T}}{\expvalue{{\bome}_{0:T}}{\Bar{L}_f\norm{\bs_t - \tilde \bs_t} + \left(1 + \sqrt{d_s}\right)\beta_n\norm{\bsigma_n(\Tilde{\bs}_t, \ba_t + \bome_t)}}}\\
&= \expvalue{{\beps}_{0:T}, \ba_{0:T}}{\expvalue{{\bome}_{0:T}}{\Bar{L}_f\norm{\bs_t - \tilde \bs_t}}} \\
&\quad\quad+ \expvalue{{\beps}_{0:T}, \ba_{0:T}}{\expvalue{{\bome}_{0:T}}{\left(1 + \sqrt{d_s}\right)\beta_n\norm{\bsigma_n(\tilde \bs_t, \ba_t+\bome_t) - \bsigma_n(\bs_t,\ba_t) + \bsigma_n(\bs_t,\ba_t)}}}\\
&\leq \expvalue{{\beps}_{0:T}, \ba_{0:T}}{\expvalue{{\bome}_{0:T}}{\Bar{L}_f\norm{\bs_t - \tilde \bs_t} + \left(1 + \sqrt{d_s}\right)\beta_n\left(\norm{\bsigma_n(\tilde \bs_t, \ba_t + \bome_t) - \bsigma_n(\bs_t, \ba_t)}\right)}} \\
&\quad\quad+ \left(1 + \sqrt{d_s}\right)\beta_n\expvalue{{\beps}_{0:T}, \ba_{0:T}}{\expvalue{{\bome}_{0:T}}{\left(\norm{\bsigma_n(\bs_t,\ba_t)}\right)}}\\
&\leq \expvalue{{\beps}_{0:T}, \ba_{0:T}}{\expvalue{{\bome}_{0:T}}{\Bar{L}_f\norm{\bs_t - \tilde \bs_t} + \left(1 + \sqrt{d_s}\right)\beta_n\left(\Bar{L}_{\sigma}\norm{\bs_t - \tilde \bs_t} + \norm{\bsigma_n(\bs_t,\ba_t)}\right)}} \tag{Lemma~\ref{lemma:coupling_lemma}}\\
&= \expvalue{{\beps}_{0:T}, \ba_{0:T}}{\expvalue{{\bome}_{0:T}}{\left(\Bar{L}_f + \left(1 + \sqrt{d_s}\right)\beta_n\Bar{L}_{\sigma}\right)\norm{\bs_t - \tilde \bs_t} + \left(1 + \sqrt{d_s}\right)\beta_n\norm{\bsigma_n(\bs_t,\ba_t)}}}
\end{align*}
\end{small}
Next, we apply the induction hypothesis
\begin{small}
\begin{align*}
\expvalue{{\beps}_{0:T}, \ba_{0:T}}{\expvalue{{\bome}_{0:T}}{\norm{\bs_{t+1} - \tilde \bs_{t+1}}}} &\leq \expvalue{{\beps}_{0:T}, \ba_{0:T}}{\expvalue{{\bome}_{0:T}}{\left(\Bar{L}_f + \left(1 + \sqrt{d_s}\right)\beta_n\Bar{L}_{\sigma}\right)\norm{\bs_t - \tilde \bs_t} + \left(1 + \sqrt{d_s}\right)\beta_n\norm{\bsigma_n(\bs_t, \ba_t)}}}\\
&\leq \left(\Bar{L}_f + \left(1 + \sqrt{d_s}\right)\beta_n\Bar{L}_{\sigma}\right)\left(1 + \sqrt{d_s}\right)\beta_n \\
& \quad\quad \times \sum_{i=0}^{t-1}\left(\Bar{L}_f + \left(1 + \sqrt{d_s}\right)\beta_n\Bar{L}_{\sigma}\right)^{t-1-i} \expvalue{{\beps}_{0:T}, \ba_{0:T}} {\expvalue{{\bome}_{0:T}}{\norm{\bsigma_n(\bs_i,\ba_i)}}} \\
&\quad\quad + \left(1 + \sqrt{d_s}\right)\beta_n \expvalue{{\beps}_{0:T}, \ba_{0:T}}{\expvalue{{\bome}_{0:T}}{{\norm{\bsigma_n(\bs_t, \ba_t)}}}}\\
&= \left(1 + \sqrt{d_s}\right)\beta_n\sum_{i=0}^{(t+1)-1}\left(\Bar{L}_f + \left(1 + \sqrt{d_s}\right)\beta_n\Bar{L}_{\sigma}\right)^{(t+1)-1-i}\expvalue{{\beps}_{0:T}, \ba_{0:T}}{\expvalue{{\bome}_{0:T}}{\norm{\bsigma_n(\bs_i, \ba_i)}}}\\
\end{align*} 
\end{small}
\end{proof}

Using the above lemmas, we present the proof to the main theorem.

\begin{proof}[\textbf{Proof of Theorem~\ref{theorem:lower_bound}}]
\begin{small}
    \begin{align*}
        \left|J(\pi_e) - \tilde{J}(\pi_e)\right| &\leq \Bar{L}_r \expvalue{{\beps}_{0:T}, \ba_{0:T}}{\expvalue{{\bome}_{0:T}}{\sum_{t=0}^{T-1} ||\bs_t - \Tilde{\bs}_t||_2}} \tag{Lemma~\ref{lemma:evaluation_error_bound}}\\
        &\leq \Bar{L}_r  \sum_{t=0}^{T-1} \left(1 + \sqrt{d_s}\right)\beta_n\sum_{i=0}^{(t+1)-1}\left(\Bar{L}_f + \left(1 + \sqrt{d_s}\right)\beta_n\Bar{L}_{\sigma}\right)^{(t+1)-1-i}\expvalue{{\beps}_{0:T}, \ba_{0:T}}{\expvalue{{\bome}_{0:T}}{\norm{\bsigma_n(\bs_i, \ba_i)}}} \tag{Lemma~\ref{lemma:state_divergence_bound}}
    \end{align*}
\end{small}
Since $\Bar{L}_f \geq 0$ and $\left(1 + \sqrt{d_s}\right)\beta_n\Bar{L}_{\sigma} \geq 0$, we have for all $0\leq i\leq t$ and $0\leq t\leq T-1$, 
\[\left(\Bar{L}_f + \left(1 + \sqrt{d_s}\right)\beta_n\Bar{L}_{\sigma}\right)^{(t+1)-1-i} \leq \left(1 + \Bar{L}_f + \left(1 + \sqrt{d_s}\right)\beta_n\Bar{L}_{\sigma}\right)^{T-1}  \]
which allows us to write, 
\begin{small}
\begin{align*}
\left|J(\pi_e) -  \tilde{J}(\pi_e)\right| &\leq \Bar{L}_r \left(1 + \sqrt{d_s}\right)\beta_n \left(1+\Bar{L}_f + \left(1 + \sqrt{d_s}\right)\beta_n\Bar{L}_{\sigma}\right)^{T-1} T \expvalue{{\beps}_{0:T}, \ba_{0:T}}{\expvalue{{\bome}_{0:T}}{{\sum_{t=0}^{T-1} \norm{\bsigma_n(\bs_t, \ba_t)}}}}\\
        &= \Bar{L}_r \left(1 + \sqrt{d_s}\right)\beta_n \left(1+\Bar{L}_f + \left(1 + \sqrt{d_s}\right)\beta_n\Bar{L}_{\sigma}\right)^{T-1} T \expvalue{{\bs}_{0:T}, \ba_{0:T}}{\sum_{t=0}^{T-1} \norm{\bsigma_n(\bs_t, \ba_t)}} \\
&= \left[\Bar{L}_r \left(1 + \sqrt{d_s}\right)\beta_n \left(1+\Bar{L}_f + \left(1 + \sqrt{d_s}\right)\beta_n\Bar{L}_{\sigma}\right)^{T-1} T\right] \\
        &\hspace{2em} \times \left(\int_{\calS \times \calA} \sum_{t=0}^{T-1}  p(s_t=\bs, a_t=\ba | \pi_e, \mathcal{M})\norm{\bsigma_n(\bs, \ba)} d\bs d\ba \right)\\
        &=  \Bar{L}_r \left(1 + \sqrt{d_s}\right)\beta_n \left(1+\Bar{L}_f + \left(1 + \sqrt{d_s}\right)\beta_n\Bar{L}_{\sigma}\right)^{T-1} T \int_{\calS \times \calA} T \rho^{\pi_e}(\bs, \ba) \norm{\bsigma_n(\bs, \ba)} d\bs d\ba \tag{See Eq.~\ref{eq:state_occ_measure}}\\
        &= \Bar{L}_r \left(1 + \sqrt{d_s}\right)\beta_n \left(1+\Bar{L}_f + \left(1 + \sqrt{d_s}\right)\beta_n\Bar{L}_{\sigma}\right)^{T-1} T^2 \expvalue{\bs, \ba \sim \rho^{\pi_e}}{{\norm{\bsigma_n(\bs, \ba)}}}
    \end{align*}
    \end{small}
\end{proof}

In summary, the deviation between the true and pessimistic return is proportional to the expected epistemic uncertainty under the evaluation policy's state-occupancy measure $\rho^{\pi_e}$, with the constant $C_n$ defined as
\begin{equation*}
    C_n \coloneqq \Bar{L}_r \left(1 + \sqrt{d_s}\right)\beta_n T^2\left(1+\Bar{L}_f + \left(1 + \sqrt{d_s}\right)\beta_n\Bar{L}_{\sigma}\right)^{T-1}.
\end{equation*}
In Appendix~\ref{app:consistency_proof}, we provide consistency guarantees for our method. In particular, we prove under further assumptions on the true dynamics function $f$, that $\left|J(\pi_e) - \tilde{J}(\pi_e)\right| \to 0$, for $n \to \infty$.






















\subsection{Proof of known results for kernel methods} \label{app:gp_proofs}
We first recall the notion of maximum mutual information \citep{srinivas, elementsofIT}.
    The mutual information $I(\vx_{1:n}; k)$ quantifies the reduction in uncertainty due to the observations $\vx_{1:n}$. Given a GP model $\mathcal{GP}(0, k(\cdot, \cdot))$ and a Gaussian noise assumption, the mutual information is equal to
\[
I(\vx_{1:n}) =  \frac{1}{2}\log\det(\mI + \sigma_\epsilon^{-2} \mK)
\]
with the kernel matrix $\mK = [k(\vx_i, \vx_j)]_{i,j \leq n}$.
    The maximum information capacity or maximum mutual information of a kernel $k$ is an upper bound on the mutual information, and is defined as 
    \begin{equation*}
        \gamma_n = \max_{\vx_{1:n}} I(\vx_{1:n}).
    \end{equation*}
  Table~\ref{tab:bound_on_gamma_n} shows the growth rate of $\gamma_n$ with $n$ for multiple different kernels.
  
\begin{proof}[Proof of Lemma \ref{lem:gp_calibrated}]
Let $\gamma_n$ be the maximum mutual information of $\mathcal{GP}(0, k(\cdot, \cdot))$. Set $\beta_n(\delta) := \left(B + \sigma_\epsilon \sqrt{2 (\gamma_n + 1 + \ln(d_s/\delta))} \right)$. 
     Element-wise application of Theorem 2 in \citet{chowdhury2017kernelized} over the dimensions of $\calS$ and taking a union bound proves the lemma.
\end{proof}

\begin{proof}[Proof of Lemma \ref{lemma:lip_f_sigma_gp}]
First, we prove the Lipschitz continuity of $\vf$.
By the Cauchy--Schwarz inequality, we have $\forall ~ \bx, \bx' \in \calX$
\begin{equation}
    |f_j(\bx) - f_j(\bx')| = | \langle f_j, k(\bx, \cdot) - k(\bx', \cdot) \rangle_k | \leq \normAny{f_j}_k ~ d_k(\bx, \bx') 
\end{equation}
Since $\normAny{f_j}_k \leq B, \forall j=1, ..., d_s$ and $d_k(\bx, \bx')$ is $L_k$-Lipschitz, we have that
\begin{equation}
\norm{\vf(\bx) - \vf(\bx')} = \sqrt{\sum_{j=1}^{d_s} (f_j(\bx) - f_j(\bx'))^2} \leq \sqrt{d_s B^2 d_k(\bx, \bx')^2} = \sqrt{d_s} B d_k(\bx, \bx') \leq \sqrt{d_s} B L_k \norm{\bx - \bx'} ~.
\end{equation}
Next, we show the Lipschitz continuity of the GP standard deviation. By Lemma~12 in \citet{curi2020hucrl}, we have, independent of $n$, $|\bsigma_n(\bx) - \bsigma_n(\bx')| \leq d_k(\bx, \bx')$ for the GP standard deviation. Now, we make a similar argument as above:
\begin{equation}
    \norm{\bsigma_n(\bx) - \bsigma_n(\bx')} \leq \sqrt{d_s d_k^2(\bx, \bx')} \leq  \sqrt{d_s} L_k \norm{\bx - \bx'}
\end{equation}
which shows that $\bsigma_n(\cdot)$ is $\sqrt{d_s} L_k$-Lipschitz.
\end{proof}

\subsection{Proof of Theorem~\ref{theorem:consistency}} \label{app:consistency_proof}
For showing consistency of our lower bound in Theorem~\ref{theorem:lower_bound} for the GP case, we first prove that the uncertainty with respect to an i.i.d.\ data sampling distribution $p(x)$ shrinks in expectation. 
\begin{lemma}[Shrinking uncertainty in expectation]
        Let $p(x)$ denote a data sampling distribution with compact support. Then the following holds for sequences $\{x_i\}^{n-1}_{i=0}$ sampled i.i.d.\ from  $p(x)$, 
          \begin{equation}
        C^2_{n}\expvalue{\vx_{1:n}\sim p}{\sigma^2(x_n|\{x_i\}^{n-1}_{i=0})} \leq C^2_{n}\expvalue{x_{1:n-1}\sim p}{\sigma^2(x_{n-1}|\{x_i\}^{n-2}_{i=0})}.
    \end{equation}
    \label{lemm:shrinking_var_in_exp}
\end{lemma}
\begin{proof}
    \begin{align*}
        C^2_{n}\expvalue{\vx_{1:n}\sim p}{\sigma^2(x_n|\{x_i\}^{n-1}_{i=0})} &=  C^2_{n}\expvalue{x_{1:n-1}\sim p}{\expvalue{x_{n}\sim p}{\sigma^2(x_n|\{x_i\}^{n-1}_{i=0})}} \\
        &= C^2_{n}\expvalue{x_{1:n-1}\sim p}{\expvalue{x\sim p}{\sigma^2(x|\{x_i\}^{n-1}_{i=0})}} \\
        &\leq C^2_{n}\expvalue{x_{1:n-1}\sim p}{\expvalue{x\sim p}{\sigma^2(x|\{x_i\}^{n-2}_{i=0})}} \tag{Monotonicity of variance}\\
        & =C^2_{n}\expvalue{x_{1:n-2}\sim p}{{\expvalue{x_{n-1}\sim p}{\expvalue{x\sim p}{\sigma^2(x|\{x_i\}^{n-2}_{i=0})\vert x_{n-1}}}}} \\
        &=C^2_{n}\expvalue{x_{1:n-2}\sim p}{\expvalue{x\sim p}{\sigma^2(x|\{x_i\}^{n-2}_{i=0})}} \tag{All points are sampled i.i.d from $p$} \\
                &=C^2_{n}\expvalue{x_{1:n-2}\sim p}{\expvalue{x_{n-1}\sim p}{\sigma^2(x_{n-1}|\{x_i\}^{n-2}_{i=0})}} \\
        &= C^2_{n}\expvalue{x_{1:n-1}\sim p}{\sigma^2(x_{n-1}|\{x_i\}^{n-2}_{i=0})} .
    \end{align*}
\end{proof}

\begin{lemma}[Bound on expectation of uncertainty at $n$]
    Let $p(\vx)$ denote the data sampling distribution with a compact support. Then the following holds for sequences $\{\vx_i\}^{n-1}_{i=0}$ sampled i.i.d.\ from  $p(\vx)$, 
    \begin{equation}
        n C^2_{n}\expvalue{\vx_{1:n}\sim p}{\sigma^2(\vx_n|\{\vx_i\}^{n-1}_{i=0})} \leq C^2_{n}\expvalue{\vx_{1:n}\sim p}{\sum^n_{j=1}\sigma^2(\vx_{j}|\{\vx_i\}^{j-1}_{i=0})}.
        \label{eq:variance_bound_with_n}
    \end{equation}
    Moreover, we have
    \begin{equation*}
        C^2_{n}\expvalue{\vx_{1:n}\sim p}{\sigma^2(\vx_n|\{\vx_i\}^{n-1}_{i=0})}  \leq \frac{C^2_n\gamma_n}{n},
    \end{equation*}
    where $\gamma_n$ represents the maximum information gain \citep{srinivas, elementsofIT}.
    \label{lemm:bound_var_with_gamma}
\end{lemma}

\begin{proof}
    We prove by induction. For $n=1$, Eq.~\ref{eq:variance_bound_with_n} holds trivially.
    Now assume $n>1$,
    \begin{align*}
        n C^2_{n}\expvalue{\vx_{1:n}\sim p}{\sigma^2(\vx_n|\{\vx_i\}^{n-1}_{i=0})} &= C^2_{n}\expvalue{\vx_{1:n}\sim p}{\sigma^2(\vx_n|\{\vx_i\}^{n-1}_{i=0})} + (n -1 ) C^2_{n}\expvalue{\vx_{1:n}\sim p}{\sigma^2(\vx_n|\{\vx_i\}^{n-1}_{i=0})} \\
        &\leq C^2_{n}\expvalue{\vx_{1:n}\sim p}{\sigma^2(\vx_n|\{\vx_i\}^{n-1}_{i=0})} + (n-1) C^2_{n}\expvalue{\vx_{1:n-1}\sim p}{\sigma^2(\vx_{n-1}|\{\vx_i\}^{n-2}_{i=0})} \tag{Lemma~\ref{lemm:shrinking_var_in_exp}} \\
        &\leq C^2_{n}\expvalue{\vx_{1:n}\sim p}{\sigma^2(\vx_n|\{\vx_i\}^{n-1}_{i=0})} + C^2_{n}\expvalue{\vx_{1:n-1}\sim p}{\sum^{n-1}_{j=1}\sigma^2(\vx_{j}|\{\vx_i\}^{j-1}_{i=0})} \tag{By induction hypothesis} \\
        &= C^2_{n}\expvalue{\vx_{1:n}\sim p}{\sum^n_{j=1}\sigma^2(\vx_{j}|\{\vx_i\}^{j-1}_{i=0})}.
    \end{align*}

    Note, $\sum^{n}_{j=1} {\sigma^2(\vx_{j+1}|\{\vx_i\}^{j}_{i=0})}$ is a measure of the mutual information associated to the sampling scheme, and lower bounds the mutual information. 
    The mutual information $I(\vx_{1:n})$ quantifies the reduction in uncertainty due to the observations $\vx_{1:n}$~\citep{elementsofIT}. When $f \in \calH_k$, the mutual information is equal to
\[
I(\vx_{1:n}) =  \frac{1}{2}\log\det(\mI + \lambda^{-1} \mK)
\]
with the kernel matrix $\mK = [k(\vx_i, \vx_j)]_{i,j \leq n}$.
    Moreover,
    \begin{equation*}
        \expvalue{\vx_{1:n}\sim p} {\sum^{n}_{j=1} {\sigma^2(x_{j+1}|\{x_i\}^{j}_{i=0})} } \leq I(\vx_{1:n}).
    \end{equation*}
The maximum information gain, is an upper bound on the mutual information, and is defined as 
    \begin{equation*}
        \gamma_n = \max_{\vx_{1:n}} I(\vx_{1:n}).
    \end{equation*}
    Therefore, by definition, $\gamma_n$ is at least as large as
    the mutual information of any sampling scheme within the support of $p(x)$, and we obtain
        \begin{equation*}
        C^2_{n}\expvalue{\vx_{1:n}\sim p}{\sigma^2(x_n|\{x_i\}^{n-1}_{i=0})} \leq \frac{C^2_{n}\gamma_n}{n}
    \end{equation*}
\citet{srinivas} derive the bounds on $\gamma_n$ (see Table~\ref{tab:bound_on_gamma_n}) for linear, RBF, and Mat\'ern kernels on compact and convex sets. Hence, we obtain that for the linear and RBF kernel, $C^2_n\gamma_n$ grows sublinearly in $n$, i.e., $\sfrac{C^2_n \gamma_n}{n} \to 0$ for $n\to \infty$. 
    \begin{table}[ht]
    \centering
    \begin{tabular}{c|c}
    Kernel                                   & Bounds on $\gamma_n$ for $x \in \mathbb{R}^d$ \\
    \hline
    Linear                                   &     $\mathcal{O}(d\log{n})$           \\
    RBF                                      &       $\mathcal{O}((\log{n})^{d+1})$         \\
    Mat\'ern $\nu>1/2$ &       $\mathcal{O}(n^{\frac{d}{2\nu + d}}\log^{\frac{2\nu}{2\nu+d}}(n))$         
    \end{tabular}
    \caption{Bounds on $\gamma_n$ from~\citet[Theorem~5]{vakili2021information}.}
    \label{tab:bound_on_gamma_n}
    \end{table}
\end{proof}
\begin{lemma}
    Let $p(\vx)$ denote a distribution with compact support, and assume that $\vx_1, \dots, \vx_{n-1}$ are i.i.d. samples from $p$. Then, the following holds,
    \begin{equation*}
      \sP\left(  \expvalue{\vx \sim p}{C_{n}\sigma_{n}(\vx)} = \calO\left((1+1/\sqrt{\delta})\sqrt{\frac{\gamma^{T+1}_n}{n}}\right),\, \forall \vx_{1:n-1}\right) \geq 1-\delta, \quad\quad \forall n \in \mathbb{N}.
    \end{equation*}
    For kernels with a maximum information capacity $\gamma_n = \calO(\text{poly}(\log(n)))$ that grows at most polylogarithmically with $n$, we have that
    \begin{equation*}
        \mathbb{P}\left(\expvalue{x \sim p}{C_{n}\sigma_{n}(x)} \to 0 \text{ for } n \to \infty\right) = 1.
    \end{equation*}
    \label{lemma:consistency_of_behavioural_policy}
\end{lemma}
\begin{proof}
    From Lemma~\ref{lemm:bound_var_with_gamma}
    \begin{equation}
          \expvalue{x_{1:n-1} \sim p}{ C^2_{n}\expvalue{x \sim p}{\sigma^2_{n}(x)} }  \leq \frac{C^2_{n}\gamma_{n}}{n}.
    \end{equation}
    Using the Markov inequality, we get
     \begin{equation*}
        \mathbb{P}(X \geq a) \leq \frac{\E\left[X\right]}{a}.
    \end{equation*}
    Let $X$ denote $C^2_{n}\expvalue{x \sim p}{\sigma^2_{n}(x)}$. Then we have for all $a>0$, 
    \begin{equation*}
        \mathbb{P}(C^2_{n}\expvalue{x \sim p}{\sigma^2_{n}(x)} \geq a) \leq \frac{\expvalue{x_{1:n-1} \sim p}{ C^2_{n}\expvalue{x \sim p}{\sigma^2_{n}(x)} }}{a} \leq \frac{C^2_{n}\gamma_{n}}{na}.
    \end{equation*}
Therefore, for $n\to \infty$, $C^2_{n}\expvalue{x \sim p}{\sigma^2_{n}(x)} \to 0$ almost surely if $\frac{C^2_{n}\gamma_{n}}{n} \to 0$ for $n\to \infty$. Now by definition of $C_n$ (Theorem~\ref{theorem:lower_bound}) and plugging in the choice of $\beta_n$ (Lemma~\ref{lem:gp_calibrated}), we have $\frac{C^2_n\gamma_n}{n} \propto \frac{\gamma^{T+1}_n}{n}$. By assumption, we have that $\gamma_n = \calO(\text{poly}(\log(n)))$, and, thus $\frac{\gamma^{T+1}_n}{n} = \calO(\text{poly}(\log(n))^{T+1} / n) =\calO(\text{poly}(\log(n)) / n)$. Hence, $\frac{C^2_n\gamma_n}{n} \to 0$ for $n\to \infty$.
    For example, for the linear and RBF kernel, we have (see Table~\ref{tab:bound_on_gamma_n})
    \begin{align*}
        \frac{\gamma^{T+1}_n}{n} &= \calO\left(d^{T+1}\frac{\left(\log{n}\right)^{T+1}}{n}\right) \tag{Linear kernel}\\
          \frac{\gamma^{T+1}_n}{n} &= \calO\left(\frac{\left(\log{n}\right)^{(d+1)(T+1)}}{n}\right) \tag{RBF kernel}.
    \end{align*}

    Now to recover the rate of convergence, let $v = \expvalue{x \sim p}{C_{n}\sigma_{n}(x)}$. We study its variance and expectation with respect to $x_{1:n-1} \sim p$ for a fixed $n$. 
    We have \[
    \text{Var}[v] \leq  \expvalue{}{v^2} \leq  \expvalue{x_{1:n-1} \sim p}{ C^2_{n}\expvalue{x \sim p}{\sigma^2_{n}(x)} }\leq \frac{C^2_{n}\gamma_{n}}{n}.\]
    Additionally, $\expvalue{}{v} \leq \sqrt{\expvalue{}{v^2}} \leq C_{n}\sqrt{\frac{\gamma_{n}}{n}}$. Now, we apply the Chebyshev inequality, i.e., 
    \begin{equation*}
        \mathbb{P}(|v - \expvalue{v}{v}| \geq a) \leq \frac{\text{Var}[v]}{a^2} \leq \frac{\expvalue{v}{v^2}}{a^2}.
    \end{equation*}
Therefore, for $a^2 = \frac{\expvalue{v}{v^2}}{ \delta}$, we have with probability at least $1-{\delta}$, 
\begin{align*}
    v &\leq \expvalue{v}{v} + a \\
    &= \expvalue{v}{v} + \sqrt{\frac{\expvalue{v}{v^2}}{{ \delta}}}  \\
    &\leq \left(1 + \frac{1}{\sqrt{{\delta}}}\right) \sqrt{\expvalue{v}{v^2}}.
\end{align*}
Next, we plug in the definition of $v$, to get
\begin{align*}
        \expvalue{x \sim p}{C_{n}\sigma_{n}(x)} &\leq \left(1 + \frac{1}{\sqrt{{\delta}}}\right)\sqrt{\expvalue{x_{1:n-1} \sim p}{\expvalue{x \sim p}{ C^2_{n}\sigma^2_{n}(x)}}} \\
        &\leq \left(1 + \frac{1}{\sqrt{{ \delta}}}\right)C_n\sqrt{\frac{\gamma_{n}}{n}} = \calO\left(\left(1 + 1/\sqrt{\delta}\right)\sqrt{\frac{\gamma^{T+1}_n}{n}}\right).
\end{align*}
with probability at least $1-{ \delta}$.
\end{proof}


\begin{proof}[Proof of Theorem~\ref{theorem:consistency} (Consistency of HAMBO)]
    For the GP case we prove that the well-calibration assumption and the Lipschitz continuity of $f$ and $\sigma$ are satisfied (see Lemmas~\ref{lem:gp_calibrated}~and~\ref{lemma:lip_f_sigma_gp}). 
    This allows us to apply Theorem~\ref{theorem:lower_bound} and Proposition~\ref{prop:valid_lower_bound}, which gives with probability at least $1-\delta$ that,
        \begin{equation}\label{eq:theorem5_7}
       J(\pi_e) \geq \tilde{J}(\pi_e) \geq J(\pi_e) -  C_n ~ \expvalue{\bs, \ba \sim \rho^{\pi_e}}{ \norm{\bsigma_n(\bs, \ba)}}.
    \end{equation}
     To prove consistency, we then only need to show that $C_n ~ \expvalue{\bs, \ba \sim \rho^{\pi_e}}{ \norm{\bsigma_n(\bs, \ba)}}$ goes to $0$ for $n \to \infty$. 
Since the support of the behavioural policy's state-occupancy measure $\rho^{\pi_b}$ is compact, and $\supp{\rho^{\pi_e}} \subseteq \supp{\rho^{\pi_b}}$, we have $\rho^{\pi_b}(s, a) \geq \hat{C}\rho^{\pi_e}(s,a)$ for all $(s,a) \in \calS \times \calA$, and some $\hat{C} > 0$, i.e., the importance sampling ratio is bounded. We can then write,
     \begin{align*}
             C_n ~ \expvalue{\bs, \ba \sim \rho^{\pi_e}}{ \norm{\bsigma_n(\bs, \ba)}} & \leq \sum^{d_s}_{i=1} \expvalue{\bs, \ba \sim \rho^{\pi_e}}{C_n \sigma_{n,i}(\bs, \ba)}\\
        &=  \sum^{d_s}_{i=1} \expvalue{\bs, \ba \sim \rho^{\pi_b}}{C_n \sigma_{n,i}(\bs, \ba)\frac{\rho^{\pi_e}(\bs, \ba)}{\rho^{\pi_b}(\bs, \ba)}} \\
        &\leq \frac{1}{\hat{C}} \sum^{d_s}_{i=1} \expvalue{\bs, \ba \sim \rho^{\pi_b}}{C_n \sigma_{n,i}(\bs, \ba)}.
    \end{align*}


    Moreover, by taking a union bound over the dimensions $1, \dots, d_s$, Lemma~\ref{lemma:consistency_of_behavioural_policy} implies that with probability greater than $1-\delta$, for any set of i.i.d.~trajectories,
    \begin{equation*}
               \sum^{d_s}_{i=1} \expvalue{\bs, \ba \sim \rho^{\pi_b}}{C_n \sigma_{n,i}(\bs, \ba)}  = \calO\left(d_s\left(1+\sqrt{d_s/\delta}\right)\sqrt{\frac{\gamma^{T+1}_n}{n}}\right).
    \end{equation*}
 Consider a sequence $\{\delta_n\}_{n\geq 0}$ such that $\lim_{n\to\infty} \delta_n = 0$, and $\lim_{n\to\infty} d_s\left(1+\sqrt{d_s/\delta_n}\right)\sqrt{\sfrac{\gamma^{T+1}_n}{n}} = 0$ (e.g., $\delta_n = \gamma^{-1}_n$), and let $S_n = \sum^{d_s}_{i=1} C_n \sigma_{n,i}(\bs, \ba)$.
Then we have for all $\epsilon > 0$
\begin{align*}
     \sum^{\infty}_{n=0} \sP\left(S_n > \epsilon \right) &= \sum^{N^*(\epsilon)-1}_{n=0} \sP\left(S_n > \epsilon \right) + \sum^{\infty}_{n=N^*(\epsilon)} \sP\left(S_n > \epsilon \right) \\ 
     &\leq N^*(\epsilon) +  \sum^{\infty}_{n=N^*(\epsilon)} \delta_n < \infty,
 \end{align*}
 where $N^*(\epsilon)$ is the smallest integer such that $d_s\left(1+\sqrt{d_s/\delta_n}\right)\sqrt{\sfrac{\gamma^{T+1}_n}{n}} \leq \epsilon$ for all $n \geq N^*(\epsilon)$. This implies that
     \begin{equation*}
        \mathbb{P}\left(\sum^{d_s}_{i=1} \expvalue{\bs, \ba \sim \rho^{\pi_e}}{C_n \sigma_{n,i}(\bs, \ba)} \to 0 \text{ for } n \to \infty \right) = 1.
    \end{equation*}
\end{proof}
 



















 \section{HAMBO for Offline Reinforcement Learning}
OPE methods are commonly used in offline reinforcement learning (ORL)~\cite{levine2020tutorial} to recommend/learn an optimal policy. 
Moreover, ORL methods also suffer from distribution shifts and are susceptible to overestimation, i.e., overestimating the performance of the recommended policy. Therefore, in principle, a good COPE method can be applied for ORL applications. 
To this end, we propose a natural modification of \textsc{HAMBO-CA} for ORL.
\begin{equation} \label{eq:pessimistic_value_orl}
\vspaceequation
    \tilde{J}(\pi^*) := \max_{\pi} \min_{\boldeta} J_{\tilde{p}_{\boldeta}}(\pi) ~.
    \vspaceequation
\end{equation}
Our proposed method induces pessimism with respect to the epistemic uncertainty of the learned transition model to tackle distribution shifts. Similar to \textsc{HAMBO-CA}, we can also use the \textsc{HAMBO-DS1} variant to induce pessimism. 



We compare our \textsc{HAMBO}-based ORL variants to other ORL algorithms 
on the OpenAI Gym tasks from the D4RL benchmark~\cite{fu2020d4rl}. Specifically,
we consider the HalfCheetah environment with data sets generated with a random and a mediocre-performing policy. Our results are presented in
Table~\ref{table:orlresults}. 

The max-min optimization in Eq.~\eqref{eq:pessimistic_value_orl} is typically very challenging. For our experiments, we use the soft actor-critic algorithm to train the policy and adversary together (the DQN algorithm is used for the \textsc{HAMBO-DA1} variant). 


Note that our proposed ORL algorithms recommend the policy with the best lower bound and not the best expected return (see Eq.~\eqref{eq:pessimistic_value_orl}). Therefore, in general, they may fail to recommend the optimal policy. This is the price we pay for inducing robustness in our ORL methods. However,
in practice (see Table~\ref{table:orlresults}), we observe that the \textsc{HAMBO}-based ORL methods perform competitively with the state of the art in the field.










\begin{table}
\resizebox{\linewidth}{!}{
\begin{tabular}{c|c|c|c|c|c|c|c|c}
 & \multicolumn{2}{c|}{Ours} & \multicolumn{4}{c|}{Model-based} & \multicolumn{2}{c}{Model-free} \\
\hline
 & \textsc{HAMBO-CA} & \textsc{HAMBO-DA1} & \cite{rigter2022rambo} & \cite{yu2021combo} & \cite{yu2020mopo} & \cite{Morel} & \cite{kumar2020cql} & \cite{kostrikov2022iql} \\
\hline
HalfCheetah-random & 37.1 & 35.1 & 39.5 & 38.8 & 35.4 & 25.6 & 19.6 & - \\
HalfCheetah-medium & 66.9 & 67.9 & 77.9 & 54.2 & 69.5 & 42.1 & 49.0 & 47.4
\end{tabular}
}
\caption{Comparison on the HalfCheetah environment from the D4RL benchmark suite. Results of the other algorithms are taken from \cite{rigter2022rambo}.}
\label{table:orlresults}
\end{table}


\bibliography{rothfuss_499-supp}

\end{document}
