\documentclass[accepted]{uai2023} 

\usepackage[american]{babel}


\usepackage{natbib} \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} \usepackage{booktabs} \usepackage{tikz} \usepackage{xfrac}


\usepackage{amsmath,amsfonts,bm,amssymb,amsthm}



\usepackage{xr}
\makeatletter

% \addFileDependency{<file name with extension>}
% Makes <file> visible to latexmk as a dependency of this document:
%   * writes "(<file>)" to the log, which latexmk picks up when $recorder=0
%     (in that mode latexmk ignores the entry for an existing .aux/.pdf file,
%     but still lists the file as a dependent when it does not exist);
%   * adds <file> to the \listfiles list (not needed by latexmk itself);
%   * writes "No file <file>." when the file does not exist (yet), a message
%     latexmk also recognises.
% Every line of the body ends in % so that the macro does not contribute
% stray space tokens if it is ever called outside the preamble.
\newcommand*{\addFileDependency}[1]{%
  \typeout{(#1)}%
  \@addtofilelist{#1}%
  \IfFileExists{#1}{}{\typeout{No file #1.}}%
}
\makeatother

% \myexternaldocument{<basename>}
% Calls \externaldocument{<basename>} (package xr, loaded above) and registers
% both <basename>.tex and <basename>.aux through \addFileDependency.
\newcommand*{\myexternaldocument}[1]{%
\externaldocument{#1}%
\addFileDependency{#1.tex}%
\addFileDependency{#1.aux}%
}

% Import the labels defined in rothfuss_499-supp.
\myexternaldocument{rothfuss_499-supp}


% Colour used for internal links, URLs and citations.
% \definecolor needs a colour package and \hypersetup needs hyperref; neither is
% loaded explicitly in this preamble, so presumably tikz and the document class
% provide them -- TODO confirm.
% NOTE(review): `hidelinks` and `colorlinks=true` look contradictory; the
% colorlinks setting that comes later presumably wins -- confirm and drop one.
\definecolor{hanblue}{HTML}{3F88C5}\hypersetup{
hidelinks,
    colorlinks=true,
    linkcolor=hanblue,
    urlcolor=hanblue,
    citecolor=hanblue,
    anchorcolor=black}
    
% Theorem-like environments. `theorem` is numbered within each section and all
% other environments in this group share its counter.
\newtheorem{theorem}{Theorem}[section]
\newtheorem{lemma}[theorem]{Lemma}
\newtheorem{proposition}[theorem]{Proposition}
\newtheorem{corollary}[theorem]{Corollary}
\newtheorem{definition}[theorem]{Definition}
\newtheorem{assumption}[theorem]{Assumption}
\newtheorem{example}[theorem]{Example}
\newtheorem{remark}[theorem]{Remark}

% Separately counted variants whose numbers are printed as A1, A2, ...
\newtheorem{lemmaA}{Lemma}
\renewcommand{\thelemmaA}{A\arabic{lemmaA}}

\newtheorem{theoremA}{Theorem}
\renewcommand{\thetheoremA}{A\arabic{theoremA}}

% Emphasised position / sub-panel markers, e.g. "\figleft{} Training curve.".
% \emph replaces the bare {\em ...} declaration so that nesting and italic
% correction are handled by LaTeX.
\newcommand{\figleft}{\emph{(Left)}}
\newcommand{\figcenter}{\emph{(Center)}}
\newcommand{\figright}{\emph{(Right)}}
\newcommand{\figtop}{\emph{(Top)}}
\newcommand{\figbottom}{\emph{(Bottom)}}
\newcommand{\captiona}{\emph{(a)}}
\newcommand{\captionb}{\emph{(b)}}
\newcommand{\captionc}{\emph{(c)}}
\newcommand{\captiond}{\emph{(d)}}

% \newterm{<text>}: typesets <text> in bold (\textbf instead of the obsolete \bf).
\newcommand{\newterm}[1]{\textbf{#1}}

% \printfnsymbol{<n>}: prints the <n>-th footnote symbol as a superscript
% without creating a footnote. The author list uses it to repeat the mark of
% the \thanks note on further authors.
\makeatletter
\newcommand{\printfnsymbol}[1]{\textsuperscript{\@fnsymbol{#1}}}
\makeatother


% Cross-reference helpers. Lower-case names are for mid-sentence use,
% capitalised names for the start of a sentence. Every \ref that follows a word
% is tied to it with ~ so that a line break cannot separate the two.
\def\figref#1{figure~\ref{#1}}
\def\Figref#1{Figure~\ref{#1}}
\def\twofigref#1#2{figures~\ref{#1} and~\ref{#2}}
\def\quadfigref#1#2#3#4{figures~\ref{#1}, \ref{#2}, \ref{#3} and~\ref{#4}}
\def\secref#1{section~\ref{#1}}
\def\Secref#1{Section~\ref{#1}}
\def\twosecrefs#1#2{sections~\ref{#1} and~\ref{#2}}
\def\secrefs#1#2#3{sections~\ref{#1}, \ref{#2} and~\ref{#3}}
% NOTE(review): this \def replaces any \eqref that is already defined at this
% point (amsmath, loaded above, provides one) and prints "equation N" without
% parentheses -- confirm that this is intended.
\def\eqref#1{equation~\ref{#1}}
\def\Eqref#1{Equation~\ref{#1}}
% Bare number only, for use inside hand-written text.
\def\plaineqref#1{\ref{#1}}
\def\chapref#1{chapter~\ref{#1}}
\def\Chapref#1{Chapter~\ref{#1}}
% The tie after "chapters" was missing, which printed "chapters1--2".
\def\rangechapref#1#2{chapters~\ref{#1}--\ref{#2}}
\def\algref#1{algorithm~\ref{#1}}
\def\Algref#1{Algorithm~\ref{#1}}
\def\twoalgref#1#2{algorithms~\ref{#1} and~\ref{#2}}
\def\Twoalgref#1#2{Algorithms~\ref{#1} and~\ref{#2}}
\def\partref#1{part~\ref{#1}}
\def\Partref#1{Part~\ref{#1}}
\def\twopartref#1#2{parts~\ref{#1} and~\ref{#2}}

% Ceiling / floor delimiters around the argument, and a bold "1" (\1).
\def\ceil#1{\lceil #1 \rceil}
\def\floor#1{\lfloor #1 \rfloor}
\def\1{\bm{1}}
% Data-set symbols. The subscript is attached outside \mathcal so that only the
% letter D is passed to the calligraphic alphabet.
\newcommand{\train}{\mathcal{D}}
\newcommand{\valid}{\mathcal{D}_{\mathrm{valid}}}
\newcommand{\test}{\mathcal{D}_{\mathrm{test}}}

% Shorthand for \epsilon.
\def\eps{{\epsilon}}


% \r<letter>: the lowercase letter (and \eta for \reta) set in the upright text
% font via \textnormal. The list has no entry for the letter m (it goes from
% \rl straight to \rn).
% NOTE(review): \rq is, as far as I know, also a LaTeX kernel macro (right
% quote); \def replaces it silently -- confirm nothing relies on the original.
\def\reta{{\textnormal{$\eta$}}}
\def\ra{{\textnormal{a}}}
\def\rb{{\textnormal{b}}}
\def\rc{{\textnormal{c}}}
\def\rd{{\textnormal{d}}}
\def\re{{\textnormal{e}}}
\def\rf{{\textnormal{f}}}
\def\rg{{\textnormal{g}}}
\def\rh{{\textnormal{h}}}
\def\ri{{\textnormal{i}}}
\def\rj{{\textnormal{j}}}
\def\rk{{\textnormal{k}}}
\def\rl{{\textnormal{l}}}
\def\rn{{\textnormal{n}}}
\def\ro{{\textnormal{o}}}
\def\rp{{\textnormal{p}}}
\def\rq{{\textnormal{q}}}
\def\rr{{\textnormal{r}}}
\def\rs{{\textnormal{s}}}
\def\rt{{\textnormal{t}}}
\def\ru{{\textnormal{u}}}
\def\rv{{\textnormal{v}}}
\def\rw{{\textnormal{w}}}
\def\rx{{\textnormal{x}}}
\def\ry{{\textnormal{y}}}
\def\rz{{\textnormal{z}}}

% \rv<letter>: the lowercase letter in bold upright (\mathbf).
% NOTE(review): \mathbf usually has no effect on lowercase Greek, so
% \rvepsilon and \rvtheta probably print unbolded -- confirm whether \bm was
% intended.
\def\rvepsilon{{\mathbf{\epsilon}}}
\def\rvtheta{{\mathbf{\theta}}}
\def\rva{{\mathbf{a}}}
\def\rvb{{\mathbf{b}}}
\def\rvc{{\mathbf{c}}}
\def\rvd{{\mathbf{d}}}
\def\rve{{\mathbf{e}}}
\def\rvf{{\mathbf{f}}}
\def\rvg{{\mathbf{g}}}
\def\rvh{{\mathbf{h}}}
% Was defined under the name \rvu (and overwritten again further down), which
% left \rvi undefined.
\def\rvi{{\mathbf{i}}}
\def\rvj{{\mathbf{j}}}
\def\rvk{{\mathbf{k}}}
\def\rvl{{\mathbf{l}}}
\def\rvm{{\mathbf{m}}}
\def\rvn{{\mathbf{n}}}
\def\rvo{{\mathbf{o}}}
\def\rvp{{\mathbf{p}}}
\def\rvq{{\mathbf{q}}}
\def\rvr{{\mathbf{r}}}
\def\rvs{{\mathbf{s}}}
\def\rvt{{\mathbf{t}}}
\def\rvu{{\mathbf{u}}}
\def\rvv{{\mathbf{v}}}
\def\rvw{{\mathbf{w}}}
\def\rvx{{\mathbf{x}}}
\def\rvy{{\mathbf{y}}}
\def\rvz{{\mathbf{z}}}

% \erv<letter>: the lowercase letter in the upright text font (\textnormal),
% for every letter a-z.
\def\erva{{\textnormal{a}}}
\def\ervb{{\textnormal{b}}}
\def\ervc{{\textnormal{c}}}
\def\ervd{{\textnormal{d}}}
\def\erve{{\textnormal{e}}}
\def\ervf{{\textnormal{f}}}
\def\ervg{{\textnormal{g}}}
\def\ervh{{\textnormal{h}}}
\def\ervi{{\textnormal{i}}}
\def\ervj{{\textnormal{j}}}
\def\ervk{{\textnormal{k}}}
\def\ervl{{\textnormal{l}}}
\def\ervm{{\textnormal{m}}}
\def\ervn{{\textnormal{n}}}
\def\ervo{{\textnormal{o}}}
\def\ervp{{\textnormal{p}}}
\def\ervq{{\textnormal{q}}}
\def\ervr{{\textnormal{r}}}
\def\ervs{{\textnormal{s}}}
\def\ervt{{\textnormal{t}}}
\def\ervu{{\textnormal{u}}}
\def\ervv{{\textnormal{v}}}
\def\ervw{{\textnormal{w}}}
\def\ervx{{\textnormal{x}}}
\def\ervy{{\textnormal{y}}}
\def\ervz{{\textnormal{z}}}

% \rm<LETTER>: the uppercase letter in bold upright (\mathbf), A-Z.
\def\rmA{{\mathbf{A}}}
\def\rmB{{\mathbf{B}}}
\def\rmC{{\mathbf{C}}}
\def\rmD{{\mathbf{D}}}
\def\rmE{{\mathbf{E}}}
\def\rmF{{\mathbf{F}}}
\def\rmG{{\mathbf{G}}}
\def\rmH{{\mathbf{H}}}
\def\rmI{{\mathbf{I}}}
\def\rmJ{{\mathbf{J}}}
\def\rmK{{\mathbf{K}}}
\def\rmL{{\mathbf{L}}}
\def\rmM{{\mathbf{M}}}
\def\rmN{{\mathbf{N}}}
\def\rmO{{\mathbf{O}}}
\def\rmP{{\mathbf{P}}}
\def\rmQ{{\mathbf{Q}}}
\def\rmR{{\mathbf{R}}}
\def\rmS{{\mathbf{S}}}
\def\rmT{{\mathbf{T}}}
\def\rmU{{\mathbf{U}}}
\def\rmV{{\mathbf{V}}}
\def\rmW{{\mathbf{W}}}
\def\rmX{{\mathbf{X}}}
\def\rmY{{\mathbf{Y}}}
\def\rmZ{{\mathbf{Z}}}

% \erm<LETTER>: the uppercase letter in the upright text font (\textnormal), A-Z.
\def\ermA{{\textnormal{A}}}
\def\ermB{{\textnormal{B}}}
\def\ermC{{\textnormal{C}}}
\def\ermD{{\textnormal{D}}}
\def\ermE{{\textnormal{E}}}
\def\ermF{{\textnormal{F}}}
\def\ermG{{\textnormal{G}}}
\def\ermH{{\textnormal{H}}}
\def\ermI{{\textnormal{I}}}
\def\ermJ{{\textnormal{J}}}
\def\ermK{{\textnormal{K}}}
\def\ermL{{\textnormal{L}}}
\def\ermM{{\textnormal{M}}}
\def\ermN{{\textnormal{N}}}
\def\ermO{{\textnormal{O}}}
\def\ermP{{\textnormal{P}}}
\def\ermQ{{\textnormal{Q}}}
\def\ermR{{\textnormal{R}}}
\def\ermS{{\textnormal{S}}}
\def\ermT{{\textnormal{T}}}
\def\ermU{{\textnormal{U}}}
\def\ermV{{\textnormal{V}}}
\def\ermW{{\textnormal{W}}}
\def\ermX{{\textnormal{X}}}
\def\ermY{{\textnormal{Y}}}
\def\ermZ{{\textnormal{Z}}}

% \v<name>: bold symbols via \bm -- the digits 0 and 1, \mu, \theta and the
% lowercase letters a-z. The article body uses e.g. \vf, \vh and \vg.
\def\vzero{{\bm{0}}}
\def\vone{{\bm{1}}}
\def\vmu{{\bm{\mu}}}
\def\vtheta{{\bm{\theta}}}
\def\va{{\bm{a}}}
\def\vb{{\bm{b}}}
\def\vc{{\bm{c}}}
\def\vd{{\bm{d}}}
\def\ve{{\bm{e}}}
\def\vf{{\bm{f}}}
\def\vg{{\bm{g}}}
\def\vh{{\bm{h}}}
\def\vi{{\bm{i}}}
\def\vj{{\bm{j}}}
\def\vk{{\bm{k}}}
\def\vl{{\bm{l}}}
\def\vm{{\bm{m}}}
\def\vn{{\bm{n}}}
\def\vo{{\bm{o}}}
\def\vp{{\bm{p}}}
\def\vq{{\bm{q}}}
\def\vr{{\bm{r}}}
\def\vs{{\bm{s}}}
\def\vt{{\bm{t}}}
\def\vu{{\bm{u}}}
\def\vv{{\bm{v}}}
\def\vw{{\bm{w}}}
\def\vx{{\bm{x}}}
\def\vy{{\bm{y}}}
\def\vz{{\bm{z}}}

% \ev<name>: the unstyled symbol itself (a selection of Greek letters and the
% lowercase letters a-z), wrapped in a brace group.
\def\evalpha{{\alpha}}
\def\evbeta{{\beta}}
\def\evepsilon{{\epsilon}}
\def\evlambda{{\lambda}}
\def\evomega{{\omega}}
\def\evmu{{\mu}}
\def\evpsi{{\psi}}
\def\evsigma{{\sigma}}
\def\evtheta{{\theta}}
\def\eva{{a}}
\def\evb{{b}}
\def\evc{{c}}
\def\evd{{d}}
\def\eve{{e}}
\def\evf{{f}}
\def\evg{{g}}
\def\evh{{h}}
\def\evi{{i}}
\def\evj{{j}}
\def\evk{{k}}
\def\evl{{l}}
\def\evm{{m}}
\def\evn{{n}}
\def\evo{{o}}
\def\evp{{p}}
\def\evq{{q}}
\def\evr{{r}}
\def\evs{{s}}
\def\evt{{t}}
\def\evu{{u}}
\def\evv{{v}}
\def\evw{{w}}
\def\evx{{x}}
\def\evy{{y}}
\def\evz{{z}}

% \m<NAME>: bold symbols via \bm -- the uppercase letters A-Z plus
% \beta, \Phi, \Lambda and \Sigma.
\def\mA{{\bm{A}}}
\def\mB{{\bm{B}}}
\def\mC{{\bm{C}}}
\def\mD{{\bm{D}}}
\def\mE{{\bm{E}}}
\def\mF{{\bm{F}}}
\def\mG{{\bm{G}}}
\def\mH{{\bm{H}}}
\def\mI{{\bm{I}}}
\def\mJ{{\bm{J}}}
\def\mK{{\bm{K}}}
\def\mL{{\bm{L}}}
\def\mM{{\bm{M}}}
\def\mN{{\bm{N}}}
\def\mO{{\bm{O}}}
\def\mP{{\bm{P}}}
\def\mQ{{\bm{Q}}}
\def\mR{{\bm{R}}}
\def\mS{{\bm{S}}}
\def\mT{{\bm{T}}}
\def\mU{{\bm{U}}}
\def\mV{{\bm{V}}}
\def\mW{{\bm{W}}}
\def\mX{{\bm{X}}}
\def\mY{{\bm{Y}}}
\def\mZ{{\bm{Z}}}
\def\mBeta{{\bm{\beta}}}
\def\mPhi{{\bm{\Phi}}}
\def\mLambda{{\bm{\Lambda}}}
\def\mSigma{{\bm{\Sigma}}}

% \mathsfit: math alphabet using the default sans-serif family, medium series
% and slanted shape; in the `bold` math version it uses bold-extended upright.
\DeclareMathAlphabet{\mathsfit}{\encodingdefault}{\sfdefault}{m}{sl}
\SetMathAlphabet{\mathsfit}{bold}{\encodingdefault}{\sfdefault}{bx}{n}
% \tens{<x>}: <x> in \mathsfit wrapped in \bm; \t<LETTER> applies it to A-Z.
\newcommand{\tens}[1]{\bm{\mathsfit{#1}}}
\def\tA{{\tens{A}}}
\def\tB{{\tens{B}}}
\def\tC{{\tens{C}}}
\def\tD{{\tens{D}}}
\def\tE{{\tens{E}}}
\def\tF{{\tens{F}}}
\def\tG{{\tens{G}}}
\def\tH{{\tens{H}}}
\def\tI{{\tens{I}}}
\def\tJ{{\tens{J}}}
\def\tK{{\tens{K}}}
\def\tL{{\tens{L}}}
\def\tM{{\tens{M}}}
\def\tN{{\tens{N}}}
\def\tO{{\tens{O}}}
\def\tP{{\tens{P}}}
\def\tQ{{\tens{Q}}}
\def\tR{{\tens{R}}}
\def\tS{{\tens{S}}}
\def\tT{{\tens{T}}}
\def\tU{{\tens{U}}}
\def\tV{{\tens{V}}}
\def\tW{{\tens{W}}}
\def\tX{{\tens{X}}}
\def\tY{{\tens{Y}}}
\def\tZ{{\tens{Z}}}


% \g<LETTER>: the uppercase letter in the calligraphic alphabet (\mathcal), A-Z.
\def\gA{{\mathcal{A}}}
\def\gB{{\mathcal{B}}}
\def\gC{{\mathcal{C}}}
\def\gD{{\mathcal{D}}}
\def\gE{{\mathcal{E}}}
\def\gF{{\mathcal{F}}}
\def\gG{{\mathcal{G}}}
\def\gH{{\mathcal{H}}}
\def\gI{{\mathcal{I}}}
\def\gJ{{\mathcal{J}}}
\def\gK{{\mathcal{K}}}
\def\gL{{\mathcal{L}}}
\def\gM{{\mathcal{M}}}
\def\gN{{\mathcal{N}}}
\def\gO{{\mathcal{O}}}
\def\gP{{\mathcal{P}}}
\def\gQ{{\mathcal{Q}}}
\def\gR{{\mathcal{R}}}
\def\gS{{\mathcal{S}}}
\def\gT{{\mathcal{T}}}
\def\gU{{\mathcal{U}}}
\def\gV{{\mathcal{V}}}
\def\gW{{\mathcal{W}}}
\def\gX{{\mathcal{X}}}
\def\gY{{\mathcal{Y}}}
\def\gZ{{\mathcal{Z}}}

% \s<LETTER>: the uppercase letter in blackboard bold (\mathbb).
% The list has no \sE (it goes from \sD straight to \sF); \mathbb{E} is
% available as \E further down.
\def\sA{{\mathbb{A}}}
\def\sB{{\mathbb{B}}}
\def\sC{{\mathbb{C}}}
\def\sD{{\mathbb{D}}}
\def\sF{{\mathbb{F}}}
\def\sG{{\mathbb{G}}}
\def\sH{{\mathbb{H}}}
\def\sI{{\mathbb{I}}}
\def\sJ{{\mathbb{J}}}
\def\sK{{\mathbb{K}}}
\def\sL{{\mathbb{L}}}
\def\sM{{\mathbb{M}}}
\def\sN{{\mathbb{N}}}
\def\sO{{\mathbb{O}}}
\def\sP{{\mathbb{P}}}
\def\sQ{{\mathbb{Q}}}
\def\sR{{\mathbb{R}}}
\def\sS{{\mathbb{S}}}
\def\sT{{\mathbb{T}}}
\def\sU{{\mathbb{U}}}
\def\sV{{\mathbb{V}}}
\def\sW{{\mathbb{W}}}
\def\sX{{\mathbb{X}}}
\def\sY{{\mathbb{Y}}}
\def\sZ{{\mathbb{Z}}}

% \em<NAME>: the unstyled symbol itself (uppercase letters A-Z, \Lambda and
% \Sigma), wrapped in a brace group.
\def\emLambda{{\Lambda}}
\def\emA{{A}}
\def\emB{{B}}
\def\emC{{C}}
\def\emD{{D}}
\def\emE{{E}}
\def\emF{{F}}
\def\emG{{G}}
\def\emH{{H}}
\def\emI{{I}}
\def\emJ{{J}}
\def\emK{{K}}
\def\emL{{L}}
\def\emM{{M}}
\def\emN{{N}}
\def\emO{{O}}
\def\emP{{P}}
\def\emQ{{Q}}
\def\emR{{R}}
\def\emS{{S}}
\def\emT{{T}}
\def\emU{{U}}
\def\emV{{V}}
\def\emW{{W}}
\def\emX{{X}}
\def\emY{{Y}}
\def\emZ{{Z}}
\def\emSigma{{\Sigma}}

% \etens{<x>}: <x> in the \mathsfit alphabet, without the \bm that \tens adds.
% \et<NAME> applies it to \Lambda and the uppercase letters A-Z.
\newcommand{\etens}[1]{\mathsfit{#1}}
\def\etLambda{{\etens{\Lambda}}}
\def\etA{{\etens{A}}}
\def\etB{{\etens{B}}}
\def\etC{{\etens{C}}}
\def\etD{{\etens{D}}}
\def\etE{{\etens{E}}}
\def\etF{{\etens{F}}}
\def\etG{{\etens{G}}}
\def\etH{{\etens{H}}}
\def\etI{{\etens{I}}}
\def\etJ{{\etens{J}}}
\def\etK{{\etens{K}}}
\def\etL{{\etens{L}}}
\def\etM{{\etens{M}}}
\def\etN{{\etens{N}}}
\def\etO{{\etens{O}}}
\def\etP{{\etens{P}}}
\def\etQ{{\etens{Q}}}
\def\etR{{\etens{R}}}
\def\etS{{\etens{S}}}
\def\etT{{\etens{T}}}
\def\etU{{\etens{U}}}
\def\etV{{\etens{V}}}
\def\etW{{\etens{W}}}
\def\etX{{\etens{X}}}
\def\etY{{\etens{Y}}}
\def\etZ{{\etens{Z}}}

% Symbols p / P (optionally with hat or tilde) carrying an upright word as
% subscript. \mathrm{...} replaces the obsolete \rm declaration, which only
% worked here because the subscript braces happened to limit its scope.
\newcommand{\pdata}{p_{\mathrm{data}}}
\newcommand{\ptrain}{\hat{p}_{\mathrm{data}}}
\newcommand{\Ptrain}{\hat{P}_{\mathrm{data}}}
\newcommand{\pmodel}{p_{\mathrm{model}}}
\newcommand{\Pmodel}{P_{\mathrm{model}}}
\newcommand{\ptildemodel}{\tilde{p}_{\mathrm{model}}}
\newcommand{\pencode}{p_{\mathrm{encoder}}}
\newcommand{\pdecode}{p_{\mathrm{decoder}}}
\newcommand{\precons}{p_{\mathrm{reconstruct}}}

% General math shorthands: upright names (\laplace, \rect, \softmax, \Var,
% \standarderror, \Cov), blackboard-bold \E and \R, aliases for single symbols
% (\lr, \reg, \sigmoid, \softplus) and the L^0 ... L^\infty symbols.
\newcommand{\laplace}{\mathrm{Laplace}} 

\newcommand{\E}{\mathbb{E}}
\newcommand{\Ls}{\mathcal{L}}
\newcommand{\R}{\mathbb{R}}
\newcommand{\emp}{\tilde{p}}
\newcommand{\lr}{\alpha}
\newcommand{\reg}{\lambda}
\newcommand{\rect}{\mathrm{rectifier}}
\newcommand{\softmax}{\mathrm{softmax}}
\newcommand{\sigmoid}{\sigma}
\newcommand{\softplus}{\zeta}
\newcommand{\KL}{D_{\mathrm{KL}}}
\newcommand{\Var}{\mathrm{Var}}
\newcommand{\standarderror}{\mathrm{SE}}
\newcommand{\Cov}{\mathrm{Cov}}
\newcommand{\normlzero}{L^0}
\newcommand{\normlone}{L^1}
\newcommand{\normltwo}{L^2}
\newcommand{\normlp}{L^p}
\newcommand{\normmax}{L^\infty}

% NOTE(review): "Pa" is set as the product of two italic letters in math mode;
% \mathit{Pa} or an operator may have been intended -- confirm.
\newcommand{\parents}{Pa} 

% Starred form: limits are placed below the operator in display math.
\DeclareMathOperator*{\argmax}{arg\,max}
\DeclareMathOperator*{\argmin}{arg\,min}

\DeclareMathOperator{\sign}{sign}
\DeclareMathOperator{\Tr}{Tr}
% \ab is an alias for \allowbreak.
\let\ab\allowbreak



% \rom{<n>}: the number <n> as a roman numeral passed through \MakeUppercase,
% wrapped in \text.
\newcommand{\rom}[1]
    {\text{\MakeUppercase{\romannumeral #1}}}









% \norm{<x>}: double bars around <x> with a subscript 2; \normAny omits the
% subscript.
\newcommand{\norm}[1]{{\lVert#1\rVert}_2}
\newcommand{\normAny}[1]{{\lVert#1\rVert}}
% Bold italic symbols via \boldsymbol; the article body uses e.g. \bs, \ba,
% \beps, \bmu, \bsigma and \boldeta.
\newcommand{\ba}{\boldsymbol{a}}
\newcommand{\br}{\boldsymbol{r}}
\newcommand{\bs}{\boldsymbol{s}}
\newcommand{\bx}{\boldsymbol{x}}
\newcommand{\by}{\boldsymbol{y}}
\newcommand{\beps}{\boldsymbol{\epsilon}}
\newcommand{\boldeta}{\boldsymbol{\eta}}
\newcommand{\bmu}{\boldsymbol{\mu}}
\newcommand{\bnu}{\boldsymbol{\nu}}
\newcommand{\bsigma}{\boldsymbol{\sigma}}
\newcommand{\bSigma}{\boldsymbol{\Sigma}}
\newcommand{\bome}{\boldsymbol{\omega}}
\newcommand{\btheta}{\boldsymbol{\theta}}


% Bold upright letters (\mathbf) -- except \bh, which uses \boldsymbol --
% followed by calligraphic / blackboard-bold shorthands.
\newcommand{\bW}{\mathbf{W}}
\newcommand{\bA}{\mathbf{A}}
\newcommand{\bk}{\mathbf{k}}
\newcommand{\bX}{\mathbf{X}}
\newcommand{\bh}{\boldsymbol{h}}
% \obs{<x>} / \miss{<x>}: <x> with superscript o / m.
\newcommand{\obs}[1]{#1^o}
\newcommand{\miss}[1]{#1^m}
\newcommand{\bz}{\mathbf{z}}
\newcommand{\bM}{\mathbf{M}}
\newcommand{\bbR}{\mathbb{R}}
\newcommand{\bff}{\mathbf{f}}
\newcommand{\bK}{\mathbf{K}}
\newcommand{\bI}{\mathbf{I}}
\newcommand{\bLamb}{\mathbf{\Lambda}}
\newcommand{\bB}{\mathbf{B}}
\newcommand{\GP}{\mathcal{GP}}
\newcommand{\bigO}{\mathcal{O}}
\newcommand{\loss}{\mathcal{L}}
\newcommand{\normal}{\mathcal{N}}
% Conditioning bar with thin spaces on both sides.
\newcommand{\given}{\, \vert \,}
% \expVal{<x>}: E[<x>] with auto-sized brackets.
% \expvalue{<sub>}{<x>} and \var{<sub>}{<x>}: same for E / Var, with <sub>
% placed underneath the operator.
\newcommand{\expVal}[1]{\mathbb{E}\left[#1\right]}
\newcommand{\expvalue}[2]{\underset{#1}{\mathbb{E}}\left[#2\right]}
\newcommand{\var}[2]{\underset{#1}{\text{Var}}\left[#2\right]}
% \cal<LETTER>: the uppercase letter in the calligraphic alphabet (\mathcal).
% The list has no \calI (it goes from \calH straight to \calJ).
\newcommand{\calA}{\mathcal{A}}
\newcommand{\calB}{\mathcal{B}}
\newcommand{\calC}{\mathcal{C}}
\newcommand{\calD}{\mathcal{D}}
\newcommand{\calE}{\mathcal{E}}
\newcommand{\calF}{\mathcal{F}}
\newcommand{\calG}{\mathcal{G}}
\newcommand{\calH}{\mathcal{H}}
\newcommand{\calJ}{\mathcal{J}}
\newcommand{\calK}{\mathcal{K}}
\newcommand{\calL}{\mathcal{L}}
\newcommand{\calM}{\mathcal{M}}
\newcommand{\calN}{\mathcal{N}}
\newcommand{\calO}{\mathcal{O}}
\newcommand{\calP}{\mathcal{P}}
\newcommand{\calQ}{\mathcal{Q}}
\newcommand{\calR}{\mathcal{R}}
\newcommand{\calS}{\mathcal{S}}
\newcommand{\calT}{\mathcal{T}}
\newcommand{\calU}{\mathcal{U}}
\newcommand{\calV}{\mathcal{V}}
\newcommand{\calW}{\mathcal{W}}

\newcommand{\calX}{\mathcal{X}}
\newcommand{\calY}{\mathcal{Y}}
\newcommand{\calZ}{\mathcal{Z}}


% \expect{<sub>}{<x>}: E with subscript <sub>, a thick space, then <x>
% (no brackets are added).
\newcommand{\expect}[2]{\mathbb{E}_{#1} \; #2}
% \supp{<x>}: the word "supp" followed by (<x>).
\newcommand{\supp}[1]{\text{supp}(#1)}
\newcommand{\bbP}{\mathbb{P}}
\newcommand{\bfP}{\mathbf{P}}
\newcommand{\bfTheta}{\mathbf{\Theta}}
\newcommand{\Rhat}{\hat{R}}

% \rho with superscripts pes / b / e.
\newcommand{\ppol}{\rho^{{\mathrm{pes}}}}
\newcommand{\bpol}{\rho^{b}}
\newcommand{\epol}{\rho^{e}}

% From here on every \overline{...} is typeset as \tilde{...}.
% NOTE(review): this also affects \overline inside packages or later macros --
% confirm that the global redefinition is intended.
\renewcommand{\overline}{\tilde}




% Margin notes reading FIX / NEW.
\newcommand{\fix}{\marginpar{FIX}}
\newcommand{\new}{\marginpar{NEW}}

% \beginsupplement: resets the table and figure counters to 0 and prefixes
% their printed numbers with "S" (S1, S2, ...).
\newcommand{\beginsupplement}{\setcounter{table}{0}
        \renewcommand{\thetable}{S\arabic{table}}\setcounter{figure}{0}
        \renewcommand{\thefigure}{S\arabic{figure}}}
 
% Red, bold "[ADD CITATION]" placeholder.
% NOTE(review): \xspace needs the xspace package, which is not loaded in the
% visible preamble -- confirm that the class or another package provides it.
\newcommand{\addcitation}{{\color{red} \textbf{[ADD CITATION]} \xspace}}


% Hooks for vertical-space tuning around figures, paragraphs, equations and
% captions. All of them currently insert \vspace{-0pt}, i.e. no adjustment.
\newcommand{\vspacefigure}{\vspace{-0pt}}
\newcommand{\vspaceparagraph}{\vspace{-0pt}}
\newcommand{\vspaceequation}{\vspace{-0pt}}

\newcommand{\vspacecaption}{\vspace{-0pt}}
\newcommand{\vspacecaptionlow}{\vspace{-0pt}}
\newcommand{\vspacesubcaption}{\vspace{-0pt}}
\newcommand{\vspacesubcaptionlow}{\vspace{-0pt}} \usepackage{algorithm}
\usepackage{algpseudocode}
\usepackage{caption}
\usepackage{subcaption}
% Upright operator name "diag".
\DeclareMathOperator{\diag}{diag}




% \swap[<sep>]{<a>}{<b>} expands to <b><sep><a>; <sep> defaults to "-".
\newcommand{\swap}[3][-]{#3#1#2} 

\title{Hallucinated Adversarial Control for Conservative Offline Policy Evaluation}

\author[1]{Jonas~Rothfuss\thanks{ Equal contribution.}}
\author[1]{Bhavya~Sukhija\printfnsymbol{1}}
\author[1]{Tobias~Birchler\printfnsymbol{1}}
\author[1]{Parnian~Kassraie}
\author[1]{Andreas~Krause}
\affil[1]{ETH Zurich\\
    Switzerland
}
  
  \begin{document}
\maketitle



\begin{abstract}
\looseness -1  We study the problem of {\em conservative off-policy evaluation (COPE)} where given an offline dataset of environment interactions, collected by other agents, we seek to obtain a (tight) lower bound on a policy's performance. This is crucial when deciding whether a given policy satisfies certain minimal performance/safety criteria before it can be deployed in the real world.
To this end, we introduce \textsc{HAMBO}, which builds on an uncertainty-aware learned model of the transition dynamics. To form a conservative estimate of the policy's performance, \textsc{HAMBO} hallucinates worst-case trajectories that the policy may take, within the margin of the models' epistemic confidence regions.
We prove that the resulting COPE estimates are valid lower bounds, and, under regularity conditions, show their convergence to the true expected return. Finally, we discuss scalable variants of our approach based on Bayesian Neural Networks and empirically demonstrate that they yield reliable and tight lower bounds in various continuous control environments.




\end{abstract}

 %\vspacecaption
 \section{Introduction}
 %\vspacecaptionlow
\looseness -1 Reinforcement learning methods require many interactions with their environment to successfully learn and evaluate policies.
Therefore, they are rarely applied in challenging real-world applications such as medicine \citep{murphy2001marginal}, education \citep{mandel2014offline} or autonomous driving \citep{kiran2021deep}, where a policy can only be deployed in the environment if it exceeds a pre-specified performance threshold or fulfills certain safety criteria. This leaves us with a challenging problem: How do we know whether a policy fulfills the necessary criteria so that it can safely interact with the environment, without testing it on the environment, and in the process, compromising safety?










\looseness -1 Off-policy evaluation (OPE) aims to solve this problem by estimating the performance of an evaluation policy, using only offline data that was previously collected by other agents \citep[e.g.][]{precup2001off, dudik2011doubly}.
In practice, offline datasets are often recorded interactions of a human expert with the environment.
Since the evaluation policy typically induces a different action-state distribution than the offline data, OPE methods often have to make predictions under strong distribution shifts. As a result, most existing OPE estimators suffer from high variance and are prone to overestimating the performance of the policy \citep{thomas2015hcope}. In safety-critical applications, we cannot risk deploying a policy that is potentially much worse than what the OPE estimate suggests.
Therefore, we aim for {\em conservative off-policy evaluation (COPE)} which seeks a (tight) lower bound on the evaluation policy's expected return that holds with high probability.
Once deployed, the policy may end up exploring areas that were not included in the offline data. Thus, reliably bounding the worst-case performance can be quite challenging.

\looseness -1 We develop a novel {\em model-based} COPE approach that hinges upon two key ideas: {\em epistemic uncertainty} and {\em pessimism}. In particular, our approach, {\em Hallucinated Adversarial Model-Based Off-policy evaluation} (\textsc{HAMBO}), builds on a learned statistical model of the transition dynamics that is able to quantify epistemic uncertainty. To obtain a valid lower bound on the policy performance, \textsc{HAMBO} hallucinates adversarial/worst-case trajectories the agent may take within the epistemic confidence sets of the model.








\looseness-1 We prove that \textsc{HAMBO} reliably yields a high-probability bound on the true expected return of the policy, even when the offline data does not cover the areas explored by the evaluation policy (Proposition~\ref{prop:valid_lower_bound}).
Under regularity conditions, we further show that our conservative estimate {\em converges} from below to the true expected return (Theorem~\ref{theorem:consistency}).
To the best of our knowledge, \textsc{HAMBO} is the first provably consistent and conservative approach for OPE in continuous action-state spaces.
We then propose scalable Bayesian neural network (BNN) variants of \textsc{HAMBO} and empirically evaluate them on various continuous control tasks. Importantly, we demonstrate that, {\em even when the regularity conditions are not met}, \textsc{HAMBO} reliably provides tight lower bounds on the true expected return. \looseness -1 %\vspacecaption
\section{Problem Setting}
%\vspacecaptionlow
We consider a finite horizon Markov decision process (MDP) $\mathcal{M} = (\mathcal{S}, \mathcal{A}, p_0, p, r, T)$ 
with continuous state and action spaces $\mathcal{S} \subseteq \R^{d_s}$ and $\mathcal{A} \subseteq \R^{d_a}$, initial state distribution $p_0(\bs_0)$, reward function $r(\bs_t, \ba_t)$ and horizon $T \in \mathbb{N}$.
In particular, we consider stochastic transition dynamics that are governed by 
$
    \bs_{t+1} = \vf(\bs_t, \ba_t) + \beps_t
$
where $\vf: \calS \times \calA \rightarrow \calS$ is unknown and $\beps_t \in \R^{d_s}$ is independent, additive transition noise with distribution $p_{\beps}(\beps_t|\bs_t, \ba_t)$. Hence, the transition distribution $p$ follows as
$p(\bs_{t+1}| \bs_t, \ba_t) = p_{\beps}(\bs_{t+1} - \vf(\bs_t, \ba_t) | \bs_t, \ba_t)$. For simplicity, we assume that the reward function is known. However, all results can straightforwardly be extended to unknown rewards. \looseness-1

The agent interacts with the environment according to a policy $\pi(\ba_t | \bs_t)$, which is a distribution over actions, conditioned on the current state $\bs_t$.
The performance of a policy is typically measured by its expected return $J(\pi) := J_p(\pi) := \mathbb{E}_{\bs_0 \sim p_0}[V_{p, 0}^\pi(\bs_0)]$ where 
$V_t^\pi(\bs) := V_{p, t}^\pi(\bs) := \mathbb{E}_{p,\pi}[G_t|\bs_t = \bs]$ is the value function and $G_t := \sum_{t'=t+1}^T r(\bs_{t'}, \ba_{t'})$ is the return. For simplicity, we omit a discount factor in the return computation. However, all results presented can be straightforwardly adapted to discounted rewards.
Furthermore, we denote the {\em occupancy measure} of policy $\pi$ as
\begin{equation*}
%\vspaceequation
    \rho^{\pi}(\bs, \ba) := \frac{1}{T} \sum_{t=0}^{T-1} p(\bs_t=\bs, \ba_t=\ba | \pi, \mathcal{M}) \;,
    \label{eq:state_occ_measure}
%\vspaceequation
\end{equation*}
that is, the probability density function of being in state $\bs$ and performing action $\ba$ at any point of time $t = 0, \dots, T-1$.













We study the problem of offline policy evaluation where the task is to evaluate the performance, i.e. estimate the expected return $J(\pi_e)$, of a given evaluation policy $\pi_e$ while only using an offline dataset $\calD_b = \{(\bs_i, \ba_i, r_i, \bs'_i)\}_{i=1}^n$
of observed transitions.
The key challenge in OPE is the distribution shift between the (unknown) behavior policy $\pi_b$ which generated the dataset $\calD_b$ and the policy $\pi_e$ which we would like to evaluate. If $\pi_b$ differs from $\pi_e$, their state occupancy measures $\rho^{\pi_b}$ and $\rho^{\pi_e}$ can look significantly different. As a result, the dataset $\calD_b$ which is generated based on $\rho^{\pi_b}$ may contain many samples in regions of the state-action space which $\pi_e$ is unlikely to visit and limited data in regions that are relevant for accurately evaluating $\pi_e$. In some cases, the support of $\rho^{\pi_b}$ might not even contain the support of $\rho^{\pi_e}$, i.e., 
$\exists (\bs, \ba) \in \mathcal{S} \times \mathcal{A}: \rho^{\pi_e}(\bs, \ba) > 0 \wedge \rho^{\pi_b}(\bs, \ba) = 0$. 
Since OPE methods have to make predictions under such strong distribution shifts, their estimates suffer from high variance and are prone to overestimate the performance of the policy.



OPE is particularly relevant in applications where we need to ensure a certain level of performance before a policy can be deployed online. Hence, it is often important to reliably determine whether or not the policy $\pi_e$ meets its minimum performance requirements. We formalize this problem as {\em conservative offline policy evaluation} (see Definition~\ref{definition:conservativeope}) where we ideally want to find a tight lower bound on the expected return that holds with high probability:\looseness-1





\begin{definition}[Conservative Offline Policy Evaluation]\hfill\\
\label{definition:conservativeope}
Let $\mathcal{M}$ be an MDP and $\calD_b \in (\mathcal{S} \times \mathcal{A} \times \mathbb{R} \times \mathcal{S})^n$ a dataset of transitions, collected with a behavior policy $\pi_b$ on $\mathcal{M}$.
Then the task of conservative OPE is:
Given the offline dataset $\calD_b$, a policy $\pi_e$ to evaluate and a confidence level $\delta \in (0,1)$, find the largest possible lower-bound $b \in \mathbb{R}$, which satisfies $b \leq J(\pi_e)$ with probability at least $1-\delta$.
%\vspaceparagraph
\end{definition}
%\vspaceparagraph
\looseness -1 In some applications \citep[e.g.,][]{brunke2022safe}, safety criteria are not directly encoded in the reward and instead, are expressed as additional constraints in the form of $\E_{(\bs, \ba)\sim \rho^{\pi_e}} \left[ c_i(\bs, \ba)\right] \geq 0$. To determine with high confidence whether $\pi_e$ meets these constraints, we can apply COPE to each $c_i$ individually. 



 %\vspacecaption
\section{COPE via Adversarial Transition Models}
%\vspacecaptionlow
\label{section:hambo_general_idea}
We take a model-based approach to COPE, and use a statistical model to estimate which transition functions $\bh: \calS \times \calA \rightarrow \calS$ from a hypothesis space $\calH$ are plausible given the offline data $\calD_b$ of size $n$. Then, we employ this statistical model of the transition dynamics to estimate the policy value $J(\pi_e)$.
For this estimate, we want to guarantee with high probability that it does not exceed the true policy value. To ensure this, we need to be able to reliably quantify the {\em epistemic uncertainty} of our model estimates.

\looseness -1 Uncertainty quantification can be done with either a frequentist approach that produces mean and confidence estimates $\bmu_n(\bs, \ba)$ and $\bsigma_n(\bs, \ba)$ or with a Bayesian model that maintains a posterior distribution $p(\vh|\calD_b)$ over dynamics models in $\calH$. In the Bayesian case, we denote $\bmu_n(\bs, \ba):= \E_{\vh \sim p(\bh|\calD_b)} [\bh(\bs, \ba)]$ as the posterior mean and $\bsigma^2_n(\bs, \ba) :=\diag\big(\E_{\vh \sim p(\bh|\calD_b)}\big[(\vh(\bs, \ba) - \bmu_n(\bs, \ba)) (\vh(\bs, \ba) - \bmu_n(\bs, \ba))^\top\big]\big)$ as the posterior variance. In either case, we require that our statistical model of $\bh$ is calibrated:

\begin{assumption}[Calibrated model] \label{assumption:calibration}
    A statistical model $(\bmu_n, \bsigma_n, \beta_n)$, with $\beta_n(\delta) \in \R^+$ as a scalar function that depends on the confidence level $\delta \in (0, 1]$, is calibrated with respect to $\vf$ if, with probability at least $1-\delta$, for all $ (\bs, \ba) \in \calS \times \calA$ and $j=1, \dots, d_s$
    %\vspaceequation
    \begin{equation*}
    %\vspaceequation
        |\mu_{n,j}(\bs, \ba) - f_{j}(\bs, \ba) | \leq \beta_n(\delta) \sigma_{n,j}(\bs, \ba),
        %\vspaceequation
    \end{equation*}
    where $\mu_{n,j}$ and $\sigma_{n,j}$ denote the $j$-th element in the vector-valued functions $\bmu_n$ and $\bsigma_n$, respectively.
\end{assumption}
%\vspaceparagraph
\looseness -1 Popular statistical models for transition dynamics that capture epistemic uncertainty are {\em Gaussian Processes (GPs)} \citep{rasmussen2005gp},  {\em Probabilistic Neural Network Ensembles} \citep{lakshminarayanan2017uncertaintyensembles} and {\em Bayesian Neural Networks} \citep{blundell2015bnn}. In later sections, we will attend to these specific choices of model in more detail and discuss when they are calibrated.\looseness-1
%\vspacecaption
\subsection{The \textsc{HAMBO} Framework}
%\vspacecaptionlow
%\vspace{-1mm}
 \label{subsec:general_hambo_approach}
\looseness-1 If our model is calibrated, we can, with high probability, use the confidence region 
%\vspaceequation
\[
%\vspaceequation
[\bmu_n(\bs, \ba) - \beta_n(\delta) \bsigma_n(\bs, \ba), \bmu_n(\bs, \ba) + \beta_n(\delta) \bsigma_n(\bs, \ba)]
%\vspaceequation
\]
which is a $d_s$-dimensional hypercube, as a proxy for the true dynamics $\vf(\bs, \ba)$. We then pessimistically select transitions within this region, to guarantee a high probability lower bound on the policy value $J(\pi_e)$. We do so, by introducing an adversary $\boldeta: \calS \times \calA \rightarrow [-1,1]^{d_s}$ that, for every $(\bs, \ba) \in \calS \times \calA$ picks a transition from the confidence region, thereby inducing the following hallucinated transition distribution:
\begin{align} \label{eq:hallucinated_model}
\begin{split}
    \tilde{p}_{\boldeta}(\bs_{t+1}| \bs_t, \ba_t) := p_{\beps}\big(&\bs_{t+1} - \bmu_n(\bs_t, \ba_t)\\
    &- \beta_n \boldeta(\bs_t, \ba_t)  \bsigma_n(\bs_t, \ba_t) \big). 
\end{split}
\end{align}
This allows us to obtain a conservative value estimate for $\pi_e$\looseness-1
\begin{equation} \label{eq:pessimistic_value}
%\vspaceequation
    \tilde{J}(\pi_e) := \min_{\boldeta} J_{\tilde{p}_{\boldeta}}(\pi_e) ~.
    %\vspaceequation
\end{equation}
This equation summarizes our approach {\em hallucinated adversarial model-based off-policy evaluation (\textsc{HAMBO})} and Algorithm~\ref{alg:hambo_basic} presents the pseudo-code. 
Here, the expected reward $J_{\tilde{p}_{\boldeta}}(\pi_e)$ of $\pi_e$ under the hallucinated transition model $\tilde{p}_{\boldeta}$ can, e.g., be estimated via Monte Carlo estimation (i.e., generating trajectory rollouts and averaging the respective returns). To find the adversary $\boldeta(\bs, \ba)$ which minimizes (\ref{eq:pessimistic_value}), we can view $\boldeta(\bs, \ba)$ as a policy that aims to maximize $- J_{\tilde{p}_{\boldeta}}(\pi_e)$ and solve the corresponding optimal control problem. Importantly, with high probability, $\tilde{J}(\pi_e)$ is a lower bound on the true policy value $J(\pi_e)$:

\begin{proposition}[Valid lower bound] \label{prop:valid_lower_bound} 
Given a calibrated model $(\bmu_n, \bsigma_n, \beta_n(\delta))$, the \textsc{HAMBO} estimates satisfy $\tilde{J}(\pi_e) \leq J(\pi_e)$, with probability at least $1-\delta$.
\end{proposition}
%\vspaceparagraph
While Proposition \ref{prop:valid_lower_bound} shows that our estimate $\tilde{J}(\pi_e)$ fulfills the requirements of COPE, $\tilde{J}(\pi_e)$ could potentially be very loose. However, 
 we can further establish a worst-case lower bound on $\tilde{J}(\pi_e)$, if $\vf$, $r$, $\bsigma_n$ and $\pi_e$ are continuous. Formally, we make the following Lipschitz continuity assumption:

\begin{assumption}[Lipschitz continuity] \label{assumption:lip_continuity}
  $\vf$ is $L_f$-Lipschitz, $r$ is $L_r$-Lipschitz, $\bsigma_n$ is $L_\sigma$-Lipschitz and $\pi_e$ is $L_\pi$-Lipschitz w.r.t.\ the Wasserstein-1 distance, i.e., for all $\bs, \bs' \in \calS$
  %\vspaceequation
  \begin{equation}
  %\vspaceequation
      \calW_1\hspace{-2pt}\left(\pi_e(\ba|\bs), \pi_e(\ba|\bs') \right) \leq L_\pi \norm{\bs - \bs'}.
      %\vspaceequation
  \end{equation}
\end{assumption}
Here, the continuity assumption on $\pi_e$ is expressed in terms of the Wasserstein-1 distance and implies that a small change in the state space only induces a proportionally small change in the conditional action distribution of the policy. For instance, this is the case for policies that can be reparametrized with a Lipschitz function, which is very common in practice:
\begin{example} \label{example:reparametrizable_policy_is_w1_lip}
\looseness -1 Any policy $\pi(\ba|\bs)$ that can be reparametrized as $\vg(\bs, \boldsymbol{\zeta})$, where $\boldsymbol{\zeta} \sim p(\boldsymbol{\zeta})$ and $\vg$ is $L_g$-Lipschitz, is also $L_g$-Lipschitz w.r.t.\ the $\calW_1$-distance.
%\vspaceparagraph
\end{example}
%\vspaceparagraph
Such Lipschitz assumptions are common in model-based OPE \citep[e.g.][]{fontenau2009lower_bound, paduraru2013off} and RL more broadly \citep[e.g.][]{berkenkamp2017safe, curi2020hucrl}, and, e.g., hold in many real-world control problems.
With these regularity assumptions, we bound how far away the \textsc{HAMBO} estimate $\tilde{J}(\pi_e)$ is from the true policy value: 

\begin{theorem} \label{theorem:lower_bound}
Under Assumptions~\ref{assumption:calibration} and~\ref{assumption:lip_continuity}, we have, with probability at least $1-\delta$, that
%\vspaceequation
    \begin{equation*}
    %\vspaceequation
       J(\pi_e) - \tilde{J}(\pi_e) \leq  C_n ~ \expvalue{(\bs, \ba) \sim \rho^{\pi_e}}{ \norm{\bsigma_n(\bs, \ba)}}
       %\vspaceequation
    \end{equation*}
    %\vspaceequation
    where 
    %\vspaceequation
    \[
    %\vspaceequation
    C_n \hspace{-2pt} \coloneqq \hspace{-2pt} \Bar{L}_r \left(1\hspace{-2pt} +\hspace{-2pt} \sqrt{d_s}\right)\beta_n T^2 
 \hspace{-2pt} \left(1\hspace{-2pt} +\hspace{-2pt} \Bar{L}_f \hspace{-2pt}  + \hspace{-2pt} (1\hspace{-2pt} +\hspace{-2pt} \sqrt{d_s})\beta_n(\delta)\Bar{L}_{\sigma}\right)^{\hspace{-2pt} T-1}  \hspace{-10pt}
 \]
    with $\Bar{L}_r := L_r (1+L_\pi)$ and $\Bar{L}_f, \Bar{L}_\sigma$ defined analogously.
    %\vspaceparagraph
\end{theorem}
%\vspaceparagraph
This theorem shows how, by tuning the confidence level $\delta$, we can trade off accuracy against reliability. In particular, choosing a small $\delta$ makes it more likely that the lower bound $\tilde{J}(\pi_e) \leq J(\pi_e)$ holds. However, it also increases $\beta_n(\delta)$ and loosens the bound, indicating that the estimate $\tilde J(\pi_e)$ will be less accurate.
Tightness of the HAMBO lower bound $\tilde{J}(\pi_e)$ depends on the
following key factors: Lipschitz-regularity, episode horizon $T$, and epistemic uncertainty.
Mainly, smaller Lipschitz constants and shorter episode lengths improve the bound. Moreover, the smaller the expected epistemic standard deviation $\bsigma_n(\bs_t, \ba_t)$ under the state occupancy measure of $\pi_e$, the tighter the bound. 
While the first two factors are generally dictated by the problem instance, the epistemic uncertainty can be reduced by using more offline data (in the relevant areas of the state-action space). If we can show that the epistemic uncertainty shrinks sufficiently fast with the number of offline data points $n$ (i.e., faster than $\calO(\beta_n^{-T})$, so that it outpaces the growth of $C_n$), then we can prove that $\tilde{J}(\pi_e)$ converges to the true policy value as $n \rightarrow \infty$. 
In the following, we discuss corresponding sufficient convergence conditions for GP models.



\subsection{\textsc{HAMBO} with Smooth GP functions}
 %\vspacecaptionlow
 \label{section:hambo_gp_models}
In this section, we discuss the application of GPs for constructing calibrated confidence regions to be used for \textsc{HAMBO}. For the transition dynamics, we consider vector-valued functions $\vf(\bs, \ba) \mapsto (f_1(\bs, \ba), ..., f_{d_s}(\bs, \ba))$ such that the scalar-valued functions $f_j \in \calH_k$ reside in a Reproducing Kernel Hilbert Space (RKHS) $\calH_k$ with kernel function $k(\cdot, \cdot)$ and have bounded RKHS norm, i.e. $\normAny{f_j}_k \leq B$. 
We denote this space by  \smash{$\vf \in \calH_{k, B}^{d_s} = \{ [f_1, ..., f_{d_s}] : \normAny{f_j}_k \leq B, j=1, ..., d_s \}$}.
We assume that transition noise $\beps \sim \calN(0, \sigma^2_\epsilon \bm{I})$ is normally distributed with variance $\sigma^2_\epsilon$.

By fitting a zero-mean Gaussian Process $\mathcal{GP}(0, k)$ on each dimension $j=1, ..., d_s$ of the next state $\bs_{t+1}$, we can use the posterior means and variances to construct calibrated confidence sets. For brevity, we denote $\bx := (\bs, \ba)$, so that
%\vspaceequation
\begin{align}
\begin{split}
    \mu_{n,j} (\bx)& = {\bm{k}}_{n}^\top(\bx)({\bm K}_{n} + \sigma^2_\epsilon \bm{I})^{-1}\by_{n, j}  \label{eq:GPposteriors}\\
     \sigma^2_{n, j}(\bx) & =  k(\bx, \bx) - {\bm k}^\top_{n}(\bx)({\bm K}_{n}+\sigma^2_\epsilon \bm{I})^{-1}{\bm k}_{n}(\bx)
\end{split}
\end{align}
where $\by_{n, j} = [s'_{i, j}]^\top_{i \leq n}$ is the vector of the $j$-th elements of the observed next states $\bs'_i$, $\bm{k}_{n}(\bx) = [ k(\bx, \bx_i)]^\top_{i \leq n}$, and ${\bm K}_{n} = [ k(\bx_i, \bx_l)]_{i,l \leq n}$ is the kernel matrix. By concatenating the element-wise posterior mean and standard deviation, we obtain $\bmu_n(\bx) = [\mu_{n,j}(\bx)]^\top_{j \leq d_s}$ and
 $\bsigma_n(\bx) = [\sigma_{n,j}(\bx)]^\top_{j \leq d_s}$. Using this, we can construct calibrated confidence intervals that fulfill Assumption \ref{assumption:calibration}:


\begin{lemma}[Calibrated GP confidence sets] \label{lem:gp_calibrated}
Let $\vf \in \calH_{k,B}^{d_s}$.
Suppose $\bmu_n$ and $\bsigma_n$ are the posterior mean and standard deviation of a GP with kernel $k$, fitted to $n$ noisy evaluations of $\vf$.
There exists $\beta_n(\delta)$, for which the tuple $(\bmu_n, \bsigma_n, \beta_n(\delta))$ satisfies Assumption~\ref{assumption:calibration} w.r.t. function $\vf$.
\end{lemma}
%\vspaceparagraph

\looseness -1 In Appendix~\ref{app:gp_proofs} we prove this lemma using results of \citet{chowdhury2017kernelized} and give the exact expression for a $\beta_n(\delta)$ that satisfies it. Generally, $\beta_n(\delta)$ depends on the {\em maximum information capacity} $\gamma_n$ of the kernel (see Appendix \ref{app:gp_proofs} for definition and details).
In the GP setting, we can also show Lipschitz continuity of $\vf$ and $\bsigma$, if the kernel function $k$ is sufficiently regular:
\begin{lemma} \label{lemma:lip_f_sigma_gp}
    If the kernel metric $d_k(\bx, \bx') := (k(\bx, \bx) + k(\bx', \bx') - 2 k(\bx, \bx'))^{\frac{1}{2}}$ is $L_k$-Lipschitz, then every $\vf \in \calH_{k, B}^{d_s}$ is Lipschitz with $L_f = \sqrt{d_s} B L_k$ and the posterior standard deviation $\bsigma$ is Lipschitz with $L_\sigma = \sqrt{d_s} L_k$.
\end{lemma}
%\vspaceparagraph
For common kernels, the kernel metric is Lipschitz continuous, and thus Lemma \ref{lemma:lip_f_sigma_gp} applies. For instance, for the linear kernel we have $L_k = 1$, for the RBF kernel we have $L_k = 1 / \ell$ and for the Matern-$\nu$ kernel we have $L_k = \sqrt{\nu / (\nu -1)} / \ell$, where $\ell$ is the lengthscale and $\nu$ the smoothness parameter of the Matern kernel.

\looseness -1 We can conclude that conditions of Proposition~\ref{prop:valid_lower_bound} and Theorem~\ref{theorem:lower_bound} are met when a GP is used for learning the transition dynamics from offline data. Hence, when the reward and the policy are Lipschitz, the \textsc{HAMBO} estimate satisfies
\[
%\vspaceequation
J(\pi_e) - C_n \expvalue{\rho^{\pi_e}}{ \norm{\bsigma_n(\bs, \ba)}} \leq \tilde J(\pi_e) \leq J(\pi_e)
%\vspaceequation
\]
with high probability. We can show that given a dataset of i.i.d.~trajectories, the difference term shrinks with $n$ sufficiently fast:\looseness-1

\begin{theorem}[Consistency of  \textsc{HAMBO}]  \label{theorem:consistency}
    Let $r$ be $L_r$-Lipschitz, $\pi_e$ be $L_\pi$-Lipschitz w.r.t. the $\calW_1$-distance and $\vf \in \calH_{k, B}^{d_s}$ where $k$ is a kernel with an $L_k$-Lipschitz kernel metric and a maximum information capacity $\gamma_n$ which is $\calO(\mathrm{polylog}(n))$. Suppose both $\rho^{\pi_e}$ and $\rho^{\pi_b}$ have compact support and 
    $\supp{\rho^{\pi_e}} \subseteq \supp{\rho^{\pi_b}}$ and $\calD_{b}$ consists of $n$ data points from i.i.d. trajectories according to the behavior policy $\pi_b$. Then as $n \rightarrow \infty$,
    \[
    \tilde{J}_n(\pi_e) \xrightarrow[]{\mathrm{a.s.}} J(\pi_e).
    \]   
\end{theorem}
  %\vspaceparagraph
\looseness -1 The theorem implies that $\tilde J(\pi_e)$ is not only a conservative estimator for $J(\pi_e)$, but under certain regularity conditions, it is also a consistent estimator of the policy's true value. This means that, for large $n$, the trade-off between reliability and accuracy vanishes.
In Appendix~\ref{app:consistency_proof} we prove this theorem and give the exact rate at which the \textsc{HAMBO} estimate converges to the true value of $\pi_e$. This rate depends on the choice of kernel, time horizon $T$, and dimensions of the environment $(d_a, d_s)$. As an example, if $k$ is a Linear or RBF kernel, with high probability
$\vert \tilde{J}_n(\pi_e) - J(\pi_e)\vert = \tilde \calO\left(n^{-1/2}\right)$
where $\tilde \calO$ omits polylogarithmic factors.

\looseness -1 The kernel assumptions in Theorem \ref{theorem:consistency} hold for many popular kernels such as any inner product kernel $k(\bx, \bx') = \phi(\bx)^\top\phi(\bx')$ with Lipschitz continuous finite-dimensional feature maps $\phi(\cdot)$ or smooth kernels such as the RBF. On the other hand, Theorem \ref{theorem:consistency} does not apply to non-smooth functions, e.g., those corresponding to a Matern kernel, since their maximum information capacity $\gamma_n$ grows polynomially with $n$.

To the best of our knowledge, Theorem~\ref{theorem:consistency} is the first result that shows the consistency of a model-based finite-horizon OPE method for continuous environments.

%\vspacecaption

\section{HAMBO with Neural Networks}
%\vspacecaptionlow

\label{sec:hambo_nn}
In practice, we often want to evaluate policies in settings where the state and action spaces are higher-dimensional, and have access to larger amounts of offline data. In such environments, GPs become impractical as they tend to generalize poorly in high-dimensional domains and their inference becomes prohibitively expensive for larger datasets.

\textbf{The NN-based Statistical Model.} In this section, we discuss practical variants of \textsc{HAMBO} which employ neural networks that scale more favorably to large datasets and high-dimensional domains. Crucially, we need to be able to quantify epistemic uncertainty. For this purpose, we employ Bayesian Neural Networks (BNNs) which model $\bh_{\btheta}(\bs, \ba)$ as a neural network function where $\btheta$ are the parameters of the neural network. BNNs presume a prior distribution $p(\btheta) = \calN(\btheta ; 0, \lambda \bm{I})$ and maintain an approximation of the posterior $p(\btheta| \calD_b) \propto p(\calD_b | \btheta) p(\btheta)$ over neural network parameters. We use an independent Gaussian likelihood $p(\calD_b | \btheta) = \prod_{i=1}^n \calN(\bs_i'; \bh_{\btheta}(\bs_i, \ba_i), \bnu^2_{\btheta}(\bs_i, \ba_i))$ where 
$\bnu^2_{\btheta}(\bs, \ba)$ is the vector of transition noise variances which is also predicted by the BNN.

We use Stein Variational Gradient Descent (SVGD) \citep{liu2016svgd} to approximate the posterior as a set of $K$ particles $\Theta = \{ \btheta_1, ..., \btheta_K \}$. 
We form the mean prediction of our model as the average prediction of the $K$ NNs:
%\vspaceequation
\begin{align*}
%\vspaceequation
    \bmu_\Theta(\bs, \ba) &= \frac{1}{K} \sum_{k=1}^K \bh_{\btheta_k}(\bs, \ba). 
%\vspaceequation
\end{align*}
Similarly, we estimate the epistemic variance as
%\vspaceequation
\begin{align*}
%\vspaceequation
    \bsigma^2_{\Theta, e}(\bs, \ba) &= \frac{1}{K} \sum_{k=1}^K (\bh_{\btheta_k}(\bs, \ba) - \bmu_\Theta(\bs, \ba))^2.
    %\vspaceequation
\end{align*}
The overall predictive distribution is the equally weighted mixture of all $K$ NN-based conditional Gaussians, i.e.,  
\begin{equation} \label{eq:predictive_gmm}
%\vspaceequation
    p(\bs'|\bs, \ba, \calD_b) = \frac{1}{K} \sum_{k=1}^K \calN(\bs'; \bh_{\btheta_k}(\bs, \ba), \bnu^2_{\btheta_k}(\bs, \ba))
    %\vspaceequation
\end{equation}
whose variance is $\bsigma_\Theta^2(\bs, \ba) =  \bsigma^2_{\Theta, e}(\bs, \ba) + \bsigma^2_{\Theta, a}(\bs, \ba)$, where \smash{$\bsigma^2_{\Theta, a}(\bs, \ba) := \frac{1}{K}\sum_{k=1}^K \bnu^2_{\btheta_k}(\bs, \ba)$} represents aleatoric and $\bsigma^2_{\Theta, e}(\bs, \ba)$ the epistemic uncertainty. \looseness-1

\looseness -1 \textbf{Calibrating the Model.} Since our BNN model uses approximate inference and a potentially misspecified prior, it may not satisfy the calibration condition of Assumption \ref{assumption:calibration}. Thus, we re-calibrate the model's uncertainty estimates with a calibration set $\calD_c \subset \calD_b$ that is withheld from the training. In particular, we use temperature scaling which chooses $\tau > 0$ such that the scaled predictive distribution (\ref{eq:predictive_gmm}) with variance $\tau^2 \bsigma_\Theta^2(\bs, \ba)$ has a minimal empirical calibration error on $\calD_c$ \citep{kuleshov2018calibration}. Algorithm~\ref{alg:BNN_calibrate} formalizes this technique. Note that re-calibrating the BNN model does not guarantee formal calibration in the sense of Assumption \ref{assumption:calibration}. However, in our experiments, we found it to reliably yield a conservative value estimate $\tilde{J}(\pi_e)$.


%\vspacesubcaption

\subsection{Practical NN-Based HAMBO Variants} %\vspacesubcaptionlow

\label{sec:variants}
In the following, we discuss three ways of constructing adversarially hallucinated transition models based on our BNN model described in Equation~(\ref{eq:predictive_gmm}). The formal pseudo-code of all algorithms is presented in Appendix~\ref{app:algos}.

\looseness -1 \textbf{Continuous Adversary (\textsc{HAMBO-CA}).} This approach directly reflects the hallucinated adversarial transition model, introduced in (\ref{eq:hallucinated_model}) and (\ref{eq:pessimistic_value}). The adversary $\boldeta(\bs, \ba) \in [-1, 1]^{d_s}$ chooses the mean of the Gaussian transition probability from the epistemic confidence set, i.e.,
\begin{align*}
\resizebox{0.99\hsize}{!}{$\tilde{p}_{\boldeta}(\bs'| \bs, \ba) \coloneqq \calN \big(\bs';  \bmu_\Theta(\bs, \ba) +  \tau \boldeta(\bs, \ba)  \bsigma_{\Theta, e}(\bs, \ba) , \bsigma^2_{\Theta, a}(\bs, \ba) \big) .$
}
\end{align*}
\looseness -1 To get the corresponding conservative value estimate $\tilde{J}(\pi_e)$, we need to solve the minimization problem $\min_{\boldeta} J_{\tilde{p}_{\boldeta}}(\pi_e)$. For this, we parameterize the adversary  $\boldeta(\bs, \ba)$ as a neural network policy and use Soft Actor-Critic (SAC)  \citep{haarnoja2018sac2} to maximize the negative return. 


\textbf{Discrete Adversary (\textsc{HAMBO-DA}).}
\looseness -1 Our BNN posterior is approximated by a set of $K$ NNs whose mean squared deviation from the mean prediction corresponds to the epistemic uncertainty. Thus, we can also construct a pessimistic transition model by letting the adversary choose which of the $K$ NNs to pick. In this case, the adversary $\vartheta(k | \bs, \ba)$ is a categorical distribution over the NN indices $\{1, ..., K\}$. The hallucinated transition model follows as: 
%\vspaceequation
\begin{align*}
%\vspaceequation
\tilde{p}_{\vartheta}(\bs'| \bs, \ba) :=  \sum_{k=1}^K \vartheta(k | \bs, \ba) \calN \big(\bs' ;  \bh_{\btheta_k}(\bs, \ba) , \bnu^2_{\btheta_k}(\bs, \ba) \big).
%\vspaceequation
\end{align*}
Here, the adversary stochastically picks one of the NN models at every step $t=0, ..., T-1$. For this reason, we refer to this variant as $\textsc{DA1}$ (Algorithm~\ref{alg:hambo_da-1}). The corresponding value estimate follows as $\tilde{J}_{\textsc{DA1}}(\pi_e) = \min_{\vartheta} J_{\tilde{p}_{\vartheta}}(\pi_e)$. We solve the optimization problem by parameterizing the adversary $\vartheta$ as a NN policy and use the clipped double DQN algorithm \citep{fujimoto2018td3} to maximize the negative return.

Alternatively, we can constrain the adversary so that it has to commit to one of the $K$ NN models for the entire trajectory. We refer to this variant as $\textsc{DAinf}$ (Algorithm~\ref{alg:hambo_dainf}). In this case, the transition model corresponds to the predictive distribution of one of the NNs $p_{\btheta_k}(\bs'| \bs, \ba) =  \calN \big(\bs' ;  \bh_{\btheta_k}(\bs, \ba), \bnu^2_{\btheta_k}(\bs, \ba) \big)$, and the value estimate follows as the minimum of the policy values under each of the models, i.e.,  $\tilde{J}_{\mathrm{DAinf}}(\pi_e) = \min_{k \in \{1, \dots, K\}} J_{p_{\btheta_k}}(\pi_e)$.
If $K$ is large (e.g., $K > 20$), we recommend taking the empirical $\delta$-quantile of the policy values $\{J_{p_{\btheta_k}}(\pi_e)\}_{k=1}^K$ instead of the minimum. In this case, $\textsc{DAinf}$ has similarities to the model-based bootstrap approach of \citet{kostrikov2020bootstrap}. \looseness -1

Naturally, the value estimates of $\textsc{DAinf}$ are less pessimistic than those of $\textsc{DA1}$, i.e. $\tilde{J}_{\mathrm{DAinf}}(\pi_e) \geq \tilde{J}_{\mathrm{DA1}}(\pi_e)$, because the adversary cannot change which model it picks throughout the trajectory. In the experiment section, we investigate whether $\textsc{DAinf}$ is still conservative enough to reliably yield lower bounds on the true policy values $J(\pi_e)$.
 \begin{figure}[ht]
    \centering
    \includegraphics[width=\columnwidth]{graphics/visual_toyexample.pdf}
    %\vspacefigure
    %\vspacecaption
\caption{Hallucinated trajectories for model-based OPE and pessimistic \textsc{HAMBO}. While OPE overestimates the performance of the unsafe policy, \textsc{HAMBO} correctly gives a conservative estimate through its adversarial transition model. The adversary chooses the worst-case trajectory within the confidence sets (shaded blue areas). }\label{fig:toyexample}
    %\vspacefigure
\end{figure}
%\vspacecaption
\section{Experiments}
%\vspacecaptionlow
\label{sec:results}

\looseness-1 We start this section by illustrating the inner workings of \textsc{HAMBO} with a toy example to show why pessimism is crucial for COPE.
We demonstrate that the convergence guarantees from Section~\ref{section:hambo_gp_models} materialize in practice for GP models.
Finally, we empirically evaluate and compare the practical variants of \textsc{HAMBO} with BNNs on various continuous control tasks. For comparability between our environments, we shift and scale all our results so that the true policy return value $J(\pi_e)$ is 1. 


\begin{figure*}[ht]
    \centering
\includegraphics[width=0.38\textwidth, trim={5mm 3mm 2mm 1mm},clip]{graphics/PointEnv_GP.pdf}
\hspace{20pt}
        \includegraphics[width=0.38\textwidth, trim={5mm 3mm 2mm 1mm}, clip]{graphics/Pendulum_GP.pdf}
    \caption{\looseness -1 GP-based \textsc{HAMBO} for increasing offline dataset sizes $n$ evaluated on the PointEnv and Pendulum-v1. The lower bound approaches the true return.\label{fig:gp_consistency}}    
%\vspacefigure
%\vspacefigure
\end{figure*}
\subsection{Illustrative Example}
%\vspacecaptionlow
To illuminate the core idea of \textsc{HAMBO} and  why pessimism is crucial for COPE, we conduct experiments on a toy environment which we call PointSafety (see Figure~\ref{fig:toyexample}).
In this environment, the agent navigates in the two-dimensional plane by applying actions $\ba \in [-0.5, 0.5]^2$ such that its position (i.e., state $\bs \in \calS = \mathbb{R}^2$) changes to $\bs_{t+1} = \bs_t + \ba_t$.
The agent always starts on the left $\bs_0 = (-2, 0)$ and aims to go to its goal on the right $\bs_{\mathrm{fin}} = (2,0)$. However, the unit circle is a danger zone, in which the agent is subject to highly negative rewards (red shaded area).

We consider evaluation policies $\pi_y$ with an intermediate goal $\bs_{\mathrm{im}} = (0, y)$ on the y-axis that goes in a straight line from $\bs_0$ to $\bs_{\mathrm{im}}$ and then in a straight line from $\bs_{\mathrm{im}}$ to the goal $\bs_{\mathrm{fin}}$.
Note that policies $\pi_y$ with $|y| \leq 1.155$ 
are unsafe.

We generate an offline dataset by rolling out the behavior policy $\pi_{1.6}$ with Gaussian action noise with a standard deviation of $0.1$.
Then, we evaluate $\pi_{1.1}$, which is unsafe (see black trajectory), by rolling it out using \textsc{HAMBO-CA}.


We compare this to a neutral variant that predicts the next state with the predictive mean $\bmu_\Theta(\bs, \ba)$, i.e., without pessimism. As we can observe from the yellow trajectory, it falsely estimates $\pi_{1.1}$ as safe, that is, it predicts that the trajectory lies outside of the danger zone. 
The trajectories with the adversarial transition model and the corresponding epistemic confidence sets for every step are depicted in Fig~\ref{fig:toyexample}. The adversary successfully moves the prediction towards the danger zone within the confidence set, and, thus, correctly estimates the policy to be unsafe.
Overall, this demonstrates a failure case of (neutral) off-policy evaluation and shows how \textsc{HAMBO} reliably gives a conservative estimate of the policy value through its pessimistic transition model.

\subsection{Empirical convergence of \textsc{HAMBO}} %\vspacesubcaptionlow
%\vspacesubcaption
\looseness=-1
For GP models, we show that \textsc{HAMBO} estimates converge to the true policy values (Theorem \ref{theorem:consistency}). Now, we empirically evaluate the behavior of GP-based \textsc{HAMBO} with an RBF kernel, as the number of offline data points grows. 
To this end, we consider two environments: a simple 2D PointEnv ($\calS=\mathbb{R}^2$, $\calA = [-1, 1]^2$), similar to the PointSafety environment, and the Pendulum-v1 environment from the OpenAI Gym \citep{brockman2016openai}. 
In the PointEnv, the agent has to navigate to the origin and accordingly receives the negative distance to the origin as a reward.

To generate the offline dataset, we collect transition data by uniformly sampling states and actions from the state and action space respectively. For the PointEnv, we restrict the sampled states to $[-40, 40]^2$ which covers the relevant part of the state space. 
As the evaluation policy, we use a proportional controller for the PointEnv, and a controller learned with SAC for the Pendulum. 

Figure~\ref{fig:gp_consistency} plots the \textsc{HAMBO} estimates $\tilde{J}(\pi_e)$ for a varying number of offline datapoints $n = |\calD_b|$. We notice that in the PointEnv, when we have insufficient data (here, ca. $n \leq 150$), the epistemic confidence regions of our GP model are large enough so that the transition model adversary sometimes manages to steer the policy outside the data support where the epistemic uncertainty is even higher. As a result, we see that the estimates $\tilde{J}(\pi_e)$ are initially far below the true expected return $J(\pi_e)$. However, as $n$ increases, the GP uncertainty regions become smaller, and, as we can observe in Figure~\ref{fig:gp_consistency}, $\tilde{J}(\pi_e)$ becomes an increasingly tighter lower bound, approaching $J(\pi_e)$ for both environments. 

\begin{figure*}[t]
\centering
\includegraphics[width=0.32\textwidth]{graphics/visual_ope_pendulum_data_noaleatoric.pdf}
\includegraphics[width=0.32\textwidth]{graphics/visual_ope_hopper_data_noaleatoric.pdf}
\includegraphics[width=0.32\textwidth]{graphics/visual_ope_halfcheetah_data_noaleatoric.pdf}
%\vspacecaption
\caption{\textsc{HAMBO} variants and neutral OPE baselines for continuous control. 
Unlike neutral OPE, which frequently overestimates the true expected return, \textsc{HAMBO} always yields a valid lower bound, which becomes more accurate with $n$.\looseness-1 \label{fig:ope_data}}
%\vspacefigure
\end{figure*}
\begin{figure*}[!ht]
\centering
\includegraphics[width=0.32\textwidth]{graphics/visual_ope_pendulum_horizon_noaleatoric.pdf}
\includegraphics[width=0.32\textwidth]{graphics/visual_ope_hopper_horizon_noaleatoric.pdf}
\includegraphics[width=0.32\textwidth]{graphics/visual_ope_halfcheetah_horizon_noaleatoric.pdf}
%\vspacecaption
\caption{\textsc{HAMBO} variants and neutral OPE baselines for different horizons. With longer horizons, the variance of the neutral OPE estimates increases and \textsc{HAMBO} lower bounds become looser.}
\label{fig:ope_horizon}
%\vspacefigure
\end{figure*}
%\vspacesubcaption
\subsection{HAMBO for Continuous Control}
%\vspacesubcaptionlow
\looseness-1 We evaluate the NN-based \textsc{HAMBO} methods from Section \ref{sec:hambo_nn} on the continuous control tasks Pendulum-v1, Hopper-v3, and HalfCheetah-v3 from the OpenAI Gym and compare them to respective neutral (non-pessimistic) OPE methods.

\looseness -1 Our general methodology is as follows:
For a given environment, we first train a policy using the SAC algorithm \citep{haarnoja2018sac,haarnoja2018sac2} and save several checkpoints of the agent. Then, some of the mediocre-performing checkpoints are rolled out to generate an offline dataset. After that, a given policy (usually one of the best checkpoints) is evaluated with the NN-based \textsc{HAMBO} variants. 


We compare our approach to neutral OPE variants that do not use a pessimistic transition model \citep{fonteneau2013batch}. In particular, we consider various trajectory uncertainty propagation methods from \citet{chua2018pets}, employed in the context of OPE: First, we consider \textsc{OPE-DS}, where the transition model is approximated by a Gaussian $p(\bs'|\bs, \ba) = \calN(\bs' ;  \bmu_\Theta(\bs, \ba), \bsigma_\Theta^2(\bs, \ba))$, where the variance is the sum of the epistemic and aleatoric variance. Second, we consider \textsc{OPE-TS1} where the transition model is the mixture of predictive Gaussians in (\ref{eq:predictive_gmm}). This means that, in every step, one of the NN models is chosen uniformly at random to compute the next state distribution. Third, we consider \textsc{OPE-TSinf}, where, for every episode, we randomly commit to one of the $K$ NNs.




\looseness-1 We investigate the following three aspects: 1) whether a method yields reliable lower bounds, 2) the effect of the offline dataset size, and 3) the curse of long horizons. Figures~\ref{fig:ope_data} and~\ref{fig:ope_horizon} report the estimated expected policy returns, averaged over 5 seeds, alongside the corresponding confidence intervals. 

\textbf{Reliable Lower Bounds. }
The \textsc{HAMBO} variants are designed to give reliable lower bounds on the true expected return. The results in Figures~\ref{fig:ope_data} and~\ref{fig:ope_horizon} empirically confirm that, across all seeds, all NN-based \textsc{HAMBO} variants reliably provide lower bounds on $J(\pi_e)$, and, thus, fulfill the COPE requirements from Definition \ref{definition:conservativeope}. In contrast, the neutral OPE variants which do not introduce pessimism w.r.t. the epistemic uncertainty of the transition model fail to do so. In many cases, they overestimate the true policy value, particularly in the Hopper environment. This demonstrates the importance of pessimism in model-based COPE and affirms the validity of \textsc{HAMBO}, even with BNN models, where calibration (Assumption \ref{assumption:calibration}) cannot be formally proven.

\textbf{Offline Dataset Size and Tightness.}
\looseness -1 The difference between \textsc{HAMBO} estimates $\tilde{J}(\pi_e)$ and the true expected reward $J(\pi_e)$ depends on the strength of the transition adversary, which is limited by the size of the epistemic confidence sets. As the size of the offline datasets $\calD_b$ increases, we can generally expect the epistemic uncertainty to shrink. Thus, the adversary $\boldeta$ becomes less powerful and the \textsc{HAMBO} estimates become an increasingly tight lower bound.

In Figure \ref{fig:ope_data}, we empirically investigate this effect by varying the offline dataset size $n$. As we hypothesized, we can observe the general trend that the \textsc{HAMBO} estimates come close to the true policy value, as $n$ increases.
Moreover, we observe that the \textsc{HAMBO-DA1} estimates are always strictly smaller than those of the \textsc{HAMBO-DAinf} variant. This is expected, since in the \textsc{DA1} variant, the adversary can pick the worst-case NN transition model at every step while in the case of \textsc{DAinf} the adversary can only do so per trajectory, and, thus, has less power. Since our experiment results indicate that the pessimism in \textsc{HAMBO-DAinf} is sufficient to obtain reliable lower bounds in practice, we conclude that \textsc{HAMBO-DAinf} is the preferred choice among the two. While \textsc{HAMBO-DAinf} performs better in Hopper and HalfCheetah, \textsc{HAMBO-CA} yields the tightest lower bounds in the Pendulum environment.


\textbf{The Curse of the Long Horizons.}
Finally, we investigate the effect of the horizon length $T$ on our COPE estimates. Over the course of a trajectory, the transition model estimation errors can compound and lead to large discrepancies. This is a well-studied phenomenon in model-based RL \citep[e.g. see][]{janner2019mbpo}. In our case, this is reflected by the worst-case lower bound in Theorem \ref{theorem:lower_bound} which depends exponentially on $T$.


\looseness -1  To evaluate the empirical effect of horizon length, we report the (C)OPE estimates for an offline dataset of size $n= 10^5$ across varying horizon lengths: $T$ = 100, 200 and 400 for the Pendulum and Hopper. For HalfCheetah, we only report horizon lengths of $T$ = 100 and 150. Figure~\ref{fig:ope_horizon} displays the corresponding results. For an increasing horizon length, the variance of the neutral variants increases and the lower bounds of the conservative \textsc{HAMBO} estimates become looser. However, the observed decline in tightness in Figure~\ref{fig:ope_horizon} is much less pronounced than the exponential decline of the worst-case bound in Theorem \ref{theorem:lower_bound}.

\looseness -1  For large horizon lengths, it can happen that the hallucinated trajectory under the pessimistic transition model strays far outside the support of the offline data. In such cases, unlike neutral OPE methods, \textsc{HAMBO} will still provide lower bounds on the true expected return. However, these bounds can be very pessimistic. For instance, this can be observed in the case of Pendulum, where for $T=400$ the estimates of \textsc{HAMBO-DA1} and \textsc{HAMBO-DAinf} go out of the chart.
Making accurate long-horizon predictions is generally very hard. For instance, this is discussed extensively in the context of model-based RL in \citet{janner2019mbpo}. Often, a discount factor is used when computing returns to alleviate these issues. We highlight that we work with undiscounted returns and continuous state-action spaces, and, thus, operate in the most challenging setting for OPE.

 %\vspacecaptionlow

\section{Related Work}
%\vspacecaptionlow

\label{section:related}
This work mainly contributes to the literature on off-policy evaluation for MDPs, which we divide into three categories.



\looseness -1 
\textbf{Model-Free OPE. } The key challenge in OPE is the distribution shift between the behavior and the evaluated policy. A popular natural approach to correct the distribution mismatch is to use importance sampling (IS) ratios to re-weight the rewards collected by the behavior policy \citep{precup2000importancesampling, dudik2011doubly} or to adjust the recursive updates when estimating the values directly via the Bellman equation \citep{precup2001off, sutton2015importancesamplingstatemarginal, hallak2017importancesamplingstatemarginal}. Some works also combine both approaches to obtain a more favorable bias-variance trade-off \citep{jiang2015importancesamplingdr, thomas2016data}. Unlike \textsc{HAMBO}, these approaches are model-free, i.e., they do not learn a model of the state transitions. However, they suffer from three key disadvantages: First, they have notoriously high variance, especially if the evaluated policy differs a lot from the behavior policy \citep{levine2020tutorial}. Second, they require the support of the behavior occupancy measure $\rho^{\pi_b}$ to contain the support of $\rho^{\pi_e}$ which is often not the case. In contrast, \textsc{HAMBO} still provides valid estimates in this scenario. Third, to compute the importance ratios, they assume access to the distribution of the behavior policy which is almost never the case in practical applications where data is often collected by human experts. \textsc{HAMBO} does not require access to the behavior policy and, thus, is much more broadly applicable.


A recent line of work \citep{nachum2019dualdice, nachum2019algaedice, zhang2020gendice, yang2020off} estimates the state occupancy correction ratios via a form of fixed point iteration, and does not require access to the behavior policy. However, the Bellman-like fixed point iteration is not applicable to the finite horizon case that we study in this paper. In addition, due to the fixed point iteration, it is very hard to quantify the uncertainty or bound the error that is associated with such OPE estimates, making them poorly suited to COPE.


\looseness -1 \textbf{Model-Based OPE. } 
This approach first learns the transition dynamics, to then simulate rollouts with the evaluation policy $\pi_e$ and thereby estimate the expected reward of $\pi_e$ \citep[e.g.,][]{fonteneau2013batch, hanna2017bootstrap, kostrikov2020bootstrap}. Due to errors in predicting the transitions, the resulting OPE estimate may overestimate the policy's performance, which is prohibitive in safety-critical applications.
Our approach additionally simulates pessimistic trajectories using the model's epistemic uncertainty, to avoid overestimation.
Further, to the best of our knowledge, Theorem~\ref{theorem:consistency} is the first consistency result for model-based OPE.



\textbf{COPE and High-Confidence OPE. }
\looseness -1 We study the problem of COPE, which seeks a high-probability lower bound on the expected return. This is closely related to estimating confidence bounds for OPE. \citet{thomas2015hcope} provide such confidence bounds for IS-based OPE estimates. However, due to the high variance of IS estimates, such bounds are often very loose \citep{levine2020tutorial}.
\citet{kallus2020double} and \citet{shi2021debiasedope} propose a model-free approach to give asymptotically normal confidence intervals directly for $J(\pi)$.
Assuming that the $Q$-function resides in an RKHS, \citet{feng2020accountable} and \citet{feng2021nonasymptotic} present rates of convergence, under theoretically unverified assumptions about the MDP.
These model-free approaches only work for discounted, infinite-horizon MDPs and are thus not generally applicable to our finite-horizon setting.

\citet{hanna2017bootstrap, kostrikov2020bootstrap} use model-based bootstrapping to construct confidence intervals for the OPE estimates. \citet{kostrikov2020bootstrap} prove the asymptotic correctness of the bootstrap confidence intervals only for finite state-action spaces. In contrast, we show the validity of our COPE estimates non-asymptotically for any $|\calD_b|$, and in continuous state-action spaces.
Alternatively, \citet{fontenau2009lower_bound} and \citet{paduraru2013off} employ a Lipschitz argument to obtain valid COPE estimates. Our derivation of the worst-case lower bound in Theorem~\ref{theorem:lower_bound} also uses Lipschitz continuity. However, the \textsc{HAMBO} estimate provides a tighter lower bound on the true policy value, as we use the local confidence intervals rather than the global Lipschitz constants to introduce pessimism. Furthermore, unlike the mentioned work, \textsc{HAMBO} does not require knowledge of the Lipschitz constant
and works with sub-Gaussian noise.




























































 %\vspacecaption
\section{Conclusion}
%\vspacecaption
We introduce \textsc{HAMBO}, a novel approach for COPE that forms a pessimistic estimate of the expected return by hallucinating adversarial trajectories within the epistemic confidence regions of the estimated transition model. We formally prove the validity and consistency of the resulting COPE estimates. We propose various scalable NN-based variants of \textsc{HAMBO} and empirically demonstrate that they give reliable and tight lower bounds on the true expected return.

Importantly, our approach does not require access to the probability distribution of the behavior policy and gives reliable estimates, even when the support of the evaluation policy's occupancy measure is not contained in the offline data distribution. This makes \textsc{HAMBO} particularly relevant for safety-critical real-world applications, where the offline data is mostly collected by human experts and we need to make reliable decisions about whether a given policy is good enough to be deployed. 

\textsc{HAMBO} can be naturally combined with other offline reinforcement learning (ORL) algorithms to solve safety-critical ORL tasks. We leave this investigation for future work.

 
\begin{acknowledgements} This research was supported by the European Research Council (ERC) under the European Union's Horizon 2020 research and innovation program grant agreement no.\ 815943 and the Swiss National Science Foundation under NCCR Automation, grant agreement 51NF40 180545. Jonas Rothfuss was supported by an Apple Scholars in AI/ML fellowship. We thank Sebastian Curi for contributing to the initial idea for this project.
\end{acknowledgements} 
\bibliography{rothfuss_499}
\end{document}
