\documentclass[accepted]{uai2022} % for initial submission
% \documentclass[accepted]{uai2022} % after acceptance, for a revised
                                    % version; also before submission to
                                    % see how the non-anonymous paper
                                    % would look like
%% There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2022} % ptmx math instead of Computer
                                         % Modern (has noticable issues)
% \documentclass[mathfont=newtx]{uai2022} % newtx fonts (improves upon
                                          % ptmx; less tested, no support)
% NOTE: Only keep *one* line above as appropriate, as it will be replaced
%       automatically for papers to be published. Do not make any other
%       change above this note for an accepted version.

%% Choose your variant of English; be consistent
\usepackage[american]{babel}
% \usepackage[british]{babel}

%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} % amsmath with fixes and additions
% \usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{tikz} % nice language for creating drawings and diagrams

% Additional packages 
\usepackage[utf8]{inputenc} % allow utf-8 input
\usepackage[T1]{fontenc}    % use 8-bit T1 fonts
\usepackage{xr-hyper}
\usepackage{hyperref}       % hyperlinks
\usepackage{url}            % simple URL typesetting
\usepackage{amsfonts}       % blackboard math symbols
\usepackage{nicefrac}       % compact symbols for 1/2, etc.
\usepackage{microtype}      % microtypography
\hypersetup{
    colorlinks=true,
    % linkcolor=brown,
    allcolors=brown
}

\usepackage{algorithm}
\usepackage{algorithmic}
\usepackage{amsmath}
\usepackage{amsthm}
\usepackage{amssymb}
\usepackage{physics} % shortcuts like norm, abs, qty, etc
\usepackage{numprint}
\usepackage{bbm}
\usepackage{dsfont}
\usepackage{float}
\usepackage{graphicx} % default *.eps
\usepackage{subcaption}
\usepackage[font={small}]{caption}
\newtheorem{proposition}{Proposition}

\makeatletter
% \addFileDependency{<file>}: record <file> (name including its extension)
% as a dependency of this document.
%   - \typeout{(<file>)} prints the name, wrapped in parentheses, to the
%     terminal/log. Presumably this imitates TeX's own "(file" markers so that
%     log-scanning build tools (e.g. latexmk) notice the file -- TODO confirm.
%   - \@addtofilelist{<file>} adds the name to LaTeX's internal file list.
%   - If <file> does not exist, "No file <file>." is printed as well.
\newcommand*{\addFileDependency}[1]{% argument=file name and extension
  \typeout{(#1)}
  \@addtofilelist{#1}
  \IfFileExists{#1}{}{\typeout{No file #1.}}
}
\makeatother

% \myexternaldocument{<base>}: make the labels of the external document <base>
% available through \externaldocument, and register both <base>.tex and
% <base>.aux as file dependencies via \addFileDependency.
% The trailing % signs keep the line ends from inserting spaces.
\newcommand*{\myexternaldocument}[1]{%
    \externaldocument{#1}%
    \addFileDependency{#1.tex}%
    \addFileDependency{#1.aux}%
}

% \usepackage{xcolor}
% \usepackage{enumitem}
% \usepackage{sidecap}
% \usepackage{wrapfig,lipsum,booktabs}

%% Provided macros
% \smaller: Because the class footnote size is essentially LaTeX's \small,
%           redefining \footnotesize, we provide the original \footnotesize
%           using this macro.
%           (Use only sparingly, e.g., in drawings, as it is quite small.)

%% Self-defined macros
% \swap[<sep>]{<a>}{<b>}: typesets <b><sep><a>; <sep> defaults to "-".
\newcommand{\swap}[3][-]{#3#1#2} % just an example
% \added{<text>}: wraps <text> in a colour group (presumably to mark text
% added in a revision). The colour is black, so the text currently looks the
% same as its surroundings.
% NOTE(review): \color needs color/xcolor, which this preamble does not load
% explicitly (the xcolor line is commented out); presumably tikz pulls it in
% -- confirm before removing tikz.
\newcommand{\added}[1]{{\color{black}#1}}
%%%%% NEW MATH DEFINITIONS %%%%%

\usepackage{amsmath,amsfonts,bm}

% Mark sections of captions for referring to divisions of figures.
% Each marker is an emphasised parenthesised tag. \emph replaces the obsolete
% two-letter font switch {\em ...} and also adds the italic correction.
\newcommand{\figleft}{\emph{(Left)}}
\newcommand{\figcenter}{\emph{(Center)}}
\newcommand{\figright}{\emph{(Right)}}
\newcommand{\figtop}{\emph{(Top)}}
\newcommand{\figbottom}{\emph{(Bottom)}}
\newcommand{\captiona}{\emph{(a)}}
\newcommand{\captionb}{\emph{(b)}}
\newcommand{\captionc}{\emph{(c)}}
\newcommand{\captiond}{\emph{(d)}}

% Highlight a newly defined term: \newterm{<term>} sets <term> in bold.
% \textbf replaces the obsolete {\bf ...} switch.
\newcommand{\newterm}[1]{\textbf{#1}}


% Cross-reference helpers. Each macro prints a noun followed by \ref, joined
% with a non-breaking space (~) so that a number is never separated from its
% noun (or from the preceding "and") by a line break.
% NOTE: these are plain \def's, so they silently replace any existing macro
% with the same name.

% Figure reference, lower-case.
\def\figref#1{figure~\ref{#1}}
% Figure reference, capital. For start of sentence
\def\Figref#1{Figure~\ref{#1}}
% Reference to two figures.
\def\twofigref#1#2{figures~\ref{#1} and~\ref{#2}}
% Reference to four figures.
\def\quadfigref#1#2#3#4{figures~\ref{#1}, \ref{#2}, \ref{#3} and~\ref{#4}}
% Section reference, lower-case.
\def\secref#1{section~\ref{#1}}
% Section reference, capital.
\def\Secref#1{Section~\ref{#1}}
% Reference to two sections.
\def\twosecrefs#1#2{sections~\ref{#1} and~\ref{#2}}
% Reference to three sections.
\def\secrefs#1#2#3{sections~\ref{#1}, \ref{#2} and~\ref{#3}}
% Reference to an equation, lower-case.
% NOTE(review): this replaces amsmath's \eqref, so \eqref{...} prints
% "equation n" instead of "(n)" everywhere in the document.
\def\eqref#1{equation~\ref{#1}}
% Reference to an equation, upper case
\def\Eqref#1{Equation~\ref{#1}}
% A raw reference to an equation---avoid using if possible
\def\plaineqref#1{\ref{#1}}
% Reference to a chapter, lower-case.
\def\chapref#1{chapter~\ref{#1}}
% Reference to a chapter, upper case.
\def\Chapref#1{Chapter~\ref{#1}}
% Reference to a range of chapters (previously printed "chaptersN--M" because
% the space after "chapters" was missing).
\def\rangechapref#1#2{chapters~\ref{#1}--\ref{#2}}
% Reference to an algorithm, lower-case.
\def\algref#1{algorithm~\ref{#1}}
% Reference to an algorithm, upper case.
\def\Algref#1{Algorithm~\ref{#1}}
% Reference to two algorithms, lower-case.
\def\twoalgref#1#2{algorithms~\ref{#1} and~\ref{#2}}
% Reference to two algorithms, upper case.
\def\Twoalgref#1#2{Algorithms~\ref{#1} and~\ref{#2}}
% Reference to a part, lower case
\def\partref#1{part~\ref{#1}}
% Reference to a part, upper case
\def\Partref#1{Part~\ref{#1}}
% Reference to two parts, lower case
\def\twopartref#1#2{parts~\ref{#1} and~\ref{#2}}

% \ceil{<x>} / \floor{<x>}: <x> between ceiling / floor brackets (fixed size).
\def\ceil#1{\lceil #1 \rceil}
\def\floor#1{\lfloor #1 \rfloor}
% \1: a bold digit one.
\def\1{\bm{1}}
% Data sets: training, validation and test.
% The subscript is attached outside \mathcal so that only the letter D is
% requested from the calligraphic alphabet. The outer braces keep each macro
% a single unit, so it can still be used directly as a script (x_\valid) or
% take a superscript.
\newcommand{\train}{\mathcal{D}}
\newcommand{\valid}{{\mathcal{D}_{\mathrm{valid}}}}
\newcommand{\test}{{\mathcal{D}_{\mathrm{test}}}}

% Short alias for \epsilon.
\def\eps{{\epsilon}}


% Random variables: \r<letter> typesets <letter> upright in the text font via
% \textnormal (for example, \ra gives an upright "a").
% \reta re-enters math mode inside \textnormal, so eta uses the ordinary math
% glyph.
% NOTE(review): these are plain \def's and silently replace existing macros
% of the same name (\rq, for instance, is normally LaTeX's right-quote
% command) -- confirm nothing in the document relies on the originals.
\def\reta{{\textnormal{$\eta$}}}
\def\ra{{\textnormal{a}}}
\def\rb{{\textnormal{b}}}
\def\rc{{\textnormal{c}}}
\def\rd{{\textnormal{d}}}
\def\re{{\textnormal{e}}}
\def\rf{{\textnormal{f}}}
\def\rg{{\textnormal{g}}}
\def\rh{{\textnormal{h}}}
\def\ri{{\textnormal{i}}}
\def\rj{{\textnormal{j}}}
\def\rk{{\textnormal{k}}}
\def\rl{{\textnormal{l}}}
% rm is already a command, just don't name any random variables m
\def\rn{{\textnormal{n}}}
\def\ro{{\textnormal{o}}}
\def\rp{{\textnormal{p}}}
\def\rq{{\textnormal{q}}}
\def\rr{{\textnormal{r}}}
\def\rs{{\textnormal{s}}}
\def\rt{{\textnormal{t}}}
\def\ru{{\textnormal{u}}}
\def\rv{{\textnormal{v}}}
\def\rw{{\textnormal{w}}}
\def\rx{{\textnormal{x}}}
\def\ry{{\textnormal{y}}}
\def\rz{{\textnormal{z}}}

% Random vectors: \rv<letter> typesets <letter> in upright bold via \mathbf.
% NOTE(review): \mathbf normally has no effect on lowercase Greek, so
% \rvepsilon and \rvtheta probably do not come out bold -- check the output.
\def\rvepsilon{{\mathbf{\epsilon}}}
\def\rvtheta{{\mathbf{\theta}}}
\def\rva{{\mathbf{a}}}
\def\rvb{{\mathbf{b}}}
\def\rvc{{\mathbf{c}}}
\def\rvd{{\mathbf{d}}}
\def\rve{{\mathbf{e}}}
\def\rvf{{\mathbf{f}}}
\def\rvg{{\mathbf{g}}}
\def\rvh{{\mathbf{h}}}
% Fixed: this entry used to define \rvu as bold "i". That left \rvi undefined,
% and the stray definition was overwritten by the real \rvu below anyway.
\def\rvi{{\mathbf{i}}}
\def\rvj{{\mathbf{j}}}
\def\rvk{{\mathbf{k}}}
\def\rvl{{\mathbf{l}}}
\def\rvm{{\mathbf{m}}}
\def\rvn{{\mathbf{n}}}
\def\rvo{{\mathbf{o}}}
\def\rvp{{\mathbf{p}}}
\def\rvq{{\mathbf{q}}}
\def\rvr{{\mathbf{r}}}
\def\rvs{{\mathbf{s}}}
\def\rvt{{\mathbf{t}}}
\def\rvu{{\mathbf{u}}}
\def\rvv{{\mathbf{v}}}
\def\rvw{{\mathbf{w}}}
\def\rvx{{\mathbf{x}}}
\def\rvy{{\mathbf{y}}}
\def\rvz{{\mathbf{z}}}

% Elements of random vectors: \erv<letter> typesets <letter> upright in the
% text font via \textnormal. The bodies are identical to the scalar
% random-variable macros (\ra, \rb, ...); only the names differ. Unlike that
% family, an "m" entry (\ervm) exists here.
\def\erva{{\textnormal{a}}}
\def\ervb{{\textnormal{b}}}
\def\ervc{{\textnormal{c}}}
\def\ervd{{\textnormal{d}}}
\def\erve{{\textnormal{e}}}
\def\ervf{{\textnormal{f}}}
\def\ervg{{\textnormal{g}}}
\def\ervh{{\textnormal{h}}}
\def\ervi{{\textnormal{i}}}
\def\ervj{{\textnormal{j}}}
\def\ervk{{\textnormal{k}}}
\def\ervl{{\textnormal{l}}}
\def\ervm{{\textnormal{m}}}
\def\ervn{{\textnormal{n}}}
\def\ervo{{\textnormal{o}}}
\def\ervp{{\textnormal{p}}}
\def\ervq{{\textnormal{q}}}
\def\ervr{{\textnormal{r}}}
\def\ervs{{\textnormal{s}}}
\def\ervt{{\textnormal{t}}}
\def\ervu{{\textnormal{u}}}
\def\ervv{{\textnormal{v}}}
\def\ervw{{\textnormal{w}}}
\def\ervx{{\textnormal{x}}}
\def\ervy{{\textnormal{y}}}
\def\ervz{{\textnormal{z}}}

% Random matrices: \rm<LETTER> typesets the capital letter in upright bold
% via \mathbf (for example, \rmA gives a bold "A").
\def\rmA{{\mathbf{A}}}
\def\rmB{{\mathbf{B}}}
\def\rmC{{\mathbf{C}}}
\def\rmD{{\mathbf{D}}}
\def\rmE{{\mathbf{E}}}
\def\rmF{{\mathbf{F}}}
\def\rmG{{\mathbf{G}}}
\def\rmH{{\mathbf{H}}}
\def\rmI{{\mathbf{I}}}
\def\rmJ{{\mathbf{J}}}
\def\rmK{{\mathbf{K}}}
\def\rmL{{\mathbf{L}}}
\def\rmM{{\mathbf{M}}}
\def\rmN{{\mathbf{N}}}
\def\rmO{{\mathbf{O}}}
\def\rmP{{\mathbf{P}}}
\def\rmQ{{\mathbf{Q}}}
\def\rmR{{\mathbf{R}}}
\def\rmS{{\mathbf{S}}}
\def\rmT{{\mathbf{T}}}
\def\rmU{{\mathbf{U}}}
\def\rmV{{\mathbf{V}}}
\def\rmW{{\mathbf{W}}}
\def\rmX{{\mathbf{X}}}
\def\rmY{{\mathbf{Y}}}
\def\rmZ{{\mathbf{Z}}}

% Elements of random matrices: \erm<LETTER> typesets the capital letter
% upright in the text font via \textnormal (non-bold counterpart of the
% \rm<LETTER> family).
\def\ermA{{\textnormal{A}}}
\def\ermB{{\textnormal{B}}}
\def\ermC{{\textnormal{C}}}
\def\ermD{{\textnormal{D}}}
\def\ermE{{\textnormal{E}}}
\def\ermF{{\textnormal{F}}}
\def\ermG{{\textnormal{G}}}
\def\ermH{{\textnormal{H}}}
\def\ermI{{\textnormal{I}}}
\def\ermJ{{\textnormal{J}}}
\def\ermK{{\textnormal{K}}}
\def\ermL{{\textnormal{L}}}
\def\ermM{{\textnormal{M}}}
\def\ermN{{\textnormal{N}}}
\def\ermO{{\textnormal{O}}}
\def\ermP{{\textnormal{P}}}
\def\ermQ{{\textnormal{Q}}}
\def\ermR{{\textnormal{R}}}
\def\ermS{{\textnormal{S}}}
\def\ermT{{\textnormal{T}}}
\def\ermU{{\textnormal{U}}}
\def\ermV{{\textnormal{V}}}
\def\ermW{{\textnormal{W}}}
\def\ermX{{\textnormal{X}}}
\def\ermY{{\textnormal{Y}}}
\def\ermZ{{\textnormal{Z}}}

% Vectors: \v<letter> typesets <letter> in bold via \bm (bm package), which
% also works for digits and Greek (\vzero, \vone, \vmu, \vtheta).
% NOTE(review): the physics package loaded earlier in this preamble provides
% its own \va, \vb and \vu; the \def's below silently replace them, so the
% physics meanings of those three commands are unavailable -- confirm that
% this is intended.
\def\vzero{{\bm{0}}}
\def\vone{{\bm{1}}}
\def\vmu{{\bm{\mu}}}
\def\vtheta{{\bm{\theta}}}
\def\va{{\bm{a}}}
\def\vb{{\bm{b}}}
\def\vc{{\bm{c}}}
\def\vd{{\bm{d}}}
\def\ve{{\bm{e}}}
\def\vf{{\bm{f}}}
\def\vg{{\bm{g}}}
\def\vh{{\bm{h}}}
\def\vi{{\bm{i}}}
\def\vj{{\bm{j}}}
\def\vk{{\bm{k}}}
\def\vl{{\bm{l}}}
\def\vm{{\bm{m}}}
\def\vn{{\bm{n}}}
\def\vo{{\bm{o}}}
\def\vp{{\bm{p}}}
\def\vq{{\bm{q}}}
\def\vr{{\bm{r}}}
\def\vs{{\bm{s}}}
\def\vt{{\bm{t}}}
\def\vu{{\bm{u}}}
\def\vv{{\bm{v}}}
\def\vw{{\bm{w}}}
\def\vx{{\bm{x}}}
\def\vy{{\bm{y}}}
\def\vz{{\bm{z}}}

% Elements of vectors: \ev<name> expands to the plain (non-bold) math symbol
% wrapped in a brace group, e.g. \evalpha -> {\alpha}, \eva -> {a}.
% These are the unbolded counterparts of the \v<letter> vector macros.
\def\evalpha{{\alpha}}
\def\evbeta{{\beta}}
\def\evepsilon{{\epsilon}}
\def\evlambda{{\lambda}}
\def\evomega{{\omega}}
\def\evmu{{\mu}}
\def\evpsi{{\psi}}
\def\evsigma{{\sigma}}
\def\evtheta{{\theta}}
\def\eva{{a}}
\def\evb{{b}}
\def\evc{{c}}
\def\evd{{d}}
\def\eve{{e}}
\def\evf{{f}}
\def\evg{{g}}
\def\evh{{h}}
\def\evi{{i}}
\def\evj{{j}}
\def\evk{{k}}
\def\evl{{l}}
\def\evm{{m}}
\def\evn{{n}}
\def\evo{{o}}
\def\evp{{p}}
\def\evq{{q}}
\def\evr{{r}}
\def\evs{{s}}
\def\evt{{t}}
\def\evu{{u}}
\def\evv{{v}}
\def\evw{{w}}
\def\evx{{x}}
\def\evy{{y}}
\def\evz{{z}}

% Matrices: \m<LETTER> typesets the capital letter in bold via \bm.
% Four Greek entries follow the Latin alphabet: \mBeta, \mPhi, \mLambda and
% \mSigma.
\def\mA{{\bm{A}}}
\def\mB{{\bm{B}}}
\def\mC{{\bm{C}}}
\def\mD{{\bm{D}}}
\def\mE{{\bm{E}}}
\def\mF{{\bm{F}}}
\def\mG{{\bm{G}}}
\def\mH{{\bm{H}}}
\def\mI{{\bm{I}}}
\def\mJ{{\bm{J}}}
\def\mK{{\bm{K}}}
\def\mL{{\bm{L}}}
\def\mM{{\bm{M}}}
\def\mN{{\bm{N}}}
\def\mO{{\bm{O}}}
\def\mP{{\bm{P}}}
\def\mQ{{\bm{Q}}}
\def\mR{{\bm{R}}}
\def\mS{{\bm{S}}}
\def\mT{{\bm{T}}}
\def\mU{{\bm{U}}}
\def\mV{{\bm{V}}}
\def\mW{{\bm{W}}}
\def\mX{{\bm{X}}}
\def\mY{{\bm{Y}}}
\def\mZ{{\bm{Z}}}
\def\mBeta{{\bm{\beta}}}
\def\mPhi{{\bm{\Phi}}}
\def\mLambda{{\bm{\Lambda}}}
\def\mSigma{{\bm{\Sigma}}}

% Tensors.
% \mathsfit is a math alphabet built from the default sans-serif family
% (\sfdefault) in the default encoding: medium series, slanted shape in the
% normal math version; bold-extended series, upright shape in the `bold`
% math version.
% \tens{<X>} wraps \mathsfit{<X>} in \bm; \t<LETTER> is \tens applied to that
% capital letter.
\DeclareMathAlphabet{\mathsfit}{\encodingdefault}{\sfdefault}{m}{sl}
\SetMathAlphabet{\mathsfit}{bold}{\encodingdefault}{\sfdefault}{bx}{n}
\newcommand{\tens}[1]{\bm{\mathsfit{#1}}}
\def\tA{{\tens{A}}}
\def\tB{{\tens{B}}}
\def\tC{{\tens{C}}}
\def\tD{{\tens{D}}}
\def\tE{{\tens{E}}}
\def\tF{{\tens{F}}}
\def\tG{{\tens{G}}}
\def\tH{{\tens{H}}}
\def\tI{{\tens{I}}}
\def\tJ{{\tens{J}}}
\def\tK{{\tens{K}}}
\def\tL{{\tens{L}}}
\def\tM{{\tens{M}}}
\def\tN{{\tens{N}}}
\def\tO{{\tens{O}}}
\def\tP{{\tens{P}}}
\def\tQ{{\tens{Q}}}
\def\tR{{\tens{R}}}
\def\tS{{\tens{S}}}
\def\tT{{\tens{T}}}
\def\tU{{\tens{U}}}
\def\tV{{\tens{V}}}
\def\tW{{\tens{W}}}
\def\tX{{\tens{X}}}
\def\tY{{\tens{Y}}}
\def\tZ{{\tens{Z}}}


% Graphs: \g<LETTER> typesets the capital letter in the calligraphic alphabet
% via \mathcal. The \<LETTER>c family defined later in this preamble produces
% the same output.
\def\gA{{\mathcal{A}}}
\def\gB{{\mathcal{B}}}
\def\gC{{\mathcal{C}}}
\def\gD{{\mathcal{D}}}
\def\gE{{\mathcal{E}}}
\def\gF{{\mathcal{F}}}
\def\gG{{\mathcal{G}}}
\def\gH{{\mathcal{H}}}
\def\gI{{\mathcal{I}}}
\def\gJ{{\mathcal{J}}}
\def\gK{{\mathcal{K}}}
\def\gL{{\mathcal{L}}}
\def\gM{{\mathcal{M}}}
\def\gN{{\mathcal{N}}}
\def\gO{{\mathcal{O}}}
\def\gP{{\mathcal{P}}}
\def\gQ{{\mathcal{Q}}}
\def\gR{{\mathcal{R}}}
\def\gS{{\mathcal{S}}}
\def\gT{{\mathcal{T}}}
\def\gU{{\mathcal{U}}}
\def\gV{{\mathcal{V}}}
\def\gW{{\mathcal{W}}}
\def\gX{{\mathcal{X}}}
\def\gY{{\mathcal{Y}}}
\def\gZ{{\mathcal{Z}}}

% Sets: \s<LETTER> typesets the capital letter in blackboard bold via \mathbb.
% There is intentionally no \sE (see the note below).
\def\sA{{\mathbb{A}}}
\def\sB{{\mathbb{B}}}
\def\sC{{\mathbb{C}}}
\def\sD{{\mathbb{D}}}
% Don't use a set called E, because this would be the same as our symbol
% for expectation.
\def\sF{{\mathbb{F}}}
\def\sG{{\mathbb{G}}}
\def\sH{{\mathbb{H}}}
\def\sI{{\mathbb{I}}}
\def\sJ{{\mathbb{J}}}
\def\sK{{\mathbb{K}}}
\def\sL{{\mathbb{L}}}
\def\sM{{\mathbb{M}}}
\def\sN{{\mathbb{N}}}
\def\sO{{\mathbb{O}}}
\def\sP{{\mathbb{P}}}
\def\sQ{{\mathbb{Q}}}
\def\sR{{\mathbb{R}}}
\def\sS{{\mathbb{S}}}
\def\sT{{\mathbb{T}}}
\def\sU{{\mathbb{U}}}
\def\sV{{\mathbb{V}}}
\def\sW{{\mathbb{W}}}
\def\sX{{\mathbb{X}}}
\def\sY{{\mathbb{Y}}}
\def\sZ{{\mathbb{Z}}}

% Entries of a matrix: \em<NAME> expands to the plain (non-bold) math symbol
% in a brace group, e.g. \emA -> {A}. \emLambda and \emSigma cover the two
% Greek capitals. These are the unbolded counterparts of the \m<LETTER>
% matrix macros.
\def\emLambda{{\Lambda}}
\def\emA{{A}}
\def\emB{{B}}
\def\emC{{C}}
\def\emD{{D}}
\def\emE{{E}}
\def\emF{{F}}
\def\emG{{G}}
\def\emH{{H}}
\def\emI{{I}}
\def\emJ{{J}}
\def\emK{{K}}
\def\emL{{L}}
\def\emM{{M}}
\def\emN{{N}}
\def\emO{{O}}
\def\emP{{P}}
\def\emQ{{Q}}
\def\emR{{R}}
\def\emS{{S}}
\def\emT{{T}}
\def\emU{{U}}
\def\emV{{V}}
\def\emW{{W}}
\def\emX{{X}}
\def\emY{{Y}}
\def\emZ{{Z}}
\def\emSigma{{\Sigma}}

% Entries of a tensor.
% \etens{<X>} is \mathsfit{<X>}: same font as \tens, without the \bm wrapper.
% \et<NAME> is \etens applied to that capital letter (plus \etLambda).
\newcommand{\etens}[1]{\mathsfit{#1}}
\def\etLambda{{\etens{\Lambda}}}
\def\etA{{\etens{A}}}
\def\etB{{\etens{B}}}
\def\etC{{\etens{C}}}
\def\etD{{\etens{D}}}
\def\etE{{\etens{E}}}
\def\etF{{\etens{F}}}
\def\etG{{\etens{G}}}
\def\etH{{\etens{H}}}
\def\etI{{\etens{I}}}
\def\etJ{{\etens{J}}}
\def\etK{{\etens{K}}}
\def\etL{{\etens{L}}}
\def\etM{{\etens{M}}}
\def\etN{{\etens{N}}}
\def\etO{{\etens{O}}}
\def\etP{{\etens{P}}}
\def\etQ{{\etens{Q}}}
\def\etR{{\etens{R}}}
\def\etS{{\etens{S}}}
\def\etT{{\etens{T}}}
\def\etU{{\etens{U}}}
\def\etV{{\etens{V}}}
\def\etW{{\etens{W}}}
\def\etX{{\etens{X}}}
\def\etY{{\etens{Y}}}
\def\etZ{{\etens{Z}}}

% Named distributions. The subscript labels are words, not variables, so they
% are set upright with \mathrm. (The previous \rm{...} form used the obsolete
% \rm font switch as if it took an argument.)

% The true underlying data generating distribution
\newcommand{\pdata}{p_{\mathrm{data}}}
% The empirical distribution defined by the training set
\newcommand{\ptrain}{\hat{p}_{\mathrm{data}}}
\newcommand{\Ptrain}{\hat{P}_{\mathrm{data}}}
% The model distribution
\newcommand{\pmodel}{p_{\mathrm{model}}}
\newcommand{\Pmodel}{P_{\mathrm{model}}}
\newcommand{\ptildemodel}{\tilde{p}_{\mathrm{model}}}
% Stochastic autoencoder distributions
\newcommand{\pencode}{p_{\mathrm{encoder}}}
\newcommand{\pdecode}{p_{\mathrm{decoder}}}
\newcommand{\precons}{p_{\mathrm{reconstruct}}}

\newcommand{\laplace}{\mathrm{Laplace}} % Laplace distribution

% Frequently used symbols. \newcommand is used here, so each name must be
% unused at this point or LaTeX raises an error.
\newcommand{\E}{\mathbb{E}}       % blackboard-bold E
\newcommand{\Ls}{\mathcal{L}}     % calligraphic L
\newcommand{\R}{\mathbb{R}}       % blackboard-bold R
\newcommand{\emp}{\tilde{p}}      % p with a tilde
\newcommand{\lr}{\alpha}          % alias for \alpha (presumably the learning rate)
\newcommand{\reg}{\lambda}        % alias for \lambda (presumably the regularisation weight)
\newcommand{\rect}{\mathrm{rectifier}}
\newcommand{\softmax}{\mathrm{softmax}}
\newcommand{\sigmoid}{\sigma}     % alias for \sigma
\newcommand{\softplus}{\zeta}     % alias for \zeta
\newcommand{\KL}{D_{\mathrm{KL}}} % D with upright subscript KL
\newcommand{\Var}{\mathrm{Var}}
\newcommand{\standarderror}{\mathrm{SE}}
\newcommand{\Cov}{\mathrm{Cov}}
% Wolfram Mathworld says $L^2$ is for function spaces and $\ell^2$ is for vectors
% But then they seem to use $L^2$ for vectors throughout the site, and so does
% wikipedia.
\newcommand{\normlzero}{L^0}
\newcommand{\normlone}{L^1}
\newcommand{\normltwo}{L^2}
\newcommand{\normlp}{L^p}
\newcommand{\normmax}{L^\infty}

\newcommand{\parents}{Pa} % See usage in notation.tex. Chosen to match Daphne's book.

% Starred form: sub/superscripts are placed as limits in display style.
\DeclareMathOperator*{\argmax}{arg\,max}
\DeclareMathOperator*{\argmin}{arg\,min}

\DeclareMathOperator{\sign}{sign}
% NOTE(review): \Tr is left disabled below, presumably because the physics
% package already provides a \Tr -- confirm before re-enabling.
% \DeclareMathOperator{\Tr}{Tr}
% \ab: short alias for \allowbreak.
\let\ab\allowbreak


% Imported from NeurIPS document

% Upright names for calibration quantities (ECE, LCE, MLCE) and regret.
\newcommand{\ECE}{{\mathrm{ECE}}}
\newcommand{\LCE}{\mathrm{LCE}}
\newcommand{\MLCE}{\mathrm{MLCE}}
% Name of the proposed method, as plain text; the body writes \method{} so
% that the following space is not swallowed.
\newcommand{\method}{LoRe}
\newcommand{\Regret}{{\mathrm{Regret}}}

% Upright Acc / Conf, and ECE with a hat.
\newcommand{\Acc}{{\mathrm{Acc}}}
\newcommand{\Conf}{{\mathrm{Conf}}}
\newcommand{\hECE}{{\hat{\ECE}}}

% Hatted mu and p.
\newcommand{\hmu}{{\hat{\mu}}}
\newcommand{\hp}{{\hat{p}}}

% Theorem-like environments. None of the declarations shares a counter, so
% each environment is numbered independently (Theorem 1, Lemma 1, ...).
% NOTE(review): `prop` prints the same "Proposition" heading as the
% `proposition` environment declared earlier in this preamble, but counts
% separately -- make sure only one of the two is used.
\newtheorem{hyp}{Hypothesis}
\newtheorem{prop}{Proposition}
\newtheorem{conjecture}{Conjecture}
\newtheorem{definition}{Definition}
\newtheorem{corollary}{Corollary}
\newtheorem{theorem}{Theorem}
\newtheorem{lemma}{Lemma}

% Calligraphic capitals: \<LETTER>c gives \mathcal{<LETTER>} as a braced unit.
% Same output as the \g<LETTER> family; kept for the imported notation.
\newcommand*{\Ac}{{\mathcal{A}}}
\newcommand*{\Bc}{{\mathcal{B}}}
\newcommand*{\Cc}{{\mathcal{C}}}
\newcommand*{\Dc}{{\mathcal{D}}}
\newcommand*{\Ec}{{\mathcal{E}}}
\newcommand*{\Fc}{{\mathcal{F}}}
\newcommand*{\Gc}{{\mathcal{G}}}
\newcommand*{\Hc}{{\mathcal{H}}}
\newcommand*{\Ic}{{\mathcal{I}}}
\newcommand*{\Jc}{{\mathcal{J}}}
\newcommand*{\Kc}{{\mathcal{K}}}
\newcommand*{\Lc}{{\mathcal{L}}}
\newcommand*{\Mc}{{\mathcal{M}}}
\newcommand*{\Nc}{{\mathcal{N}}}
\newcommand*{\Oc}{{\mathcal{O}}}
\newcommand*{\Pc}{{\mathcal{P}}}
\newcommand*{\Qc}{{\mathcal{Q}}}
\newcommand*{\Rc}{{\mathcal{R}}}
\newcommand*{\Sc}{{\mathcal{S}}}
\newcommand*{\Tc}{{\mathcal{T}}}
\newcommand*{\Uc}{{\mathcal{U}}}
\newcommand*{\Vc}{{\mathcal{V}}}
\newcommand*{\Wc}{{\mathcal{W}}}
\newcommand*{\Xc}{{\mathcal{X}}}
\newcommand*{\Yc}{{\mathcal{Y}}}
\newcommand*{\Zc}{{\mathcal{Z}}}

% Blackboard-bold capitals: \<LETTER>b gives \mathbb{<LETTER>} as a braced
% unit. Same output as the \s<LETTER> family, but this one includes E (\Eb).
\newcommand*{\Ab}{{\mathbb{A}}}
\newcommand*{\Bb}{{\mathbb{B}}}
\newcommand*{\Cb}{{\mathbb{C}}}
\newcommand*{\Db}{{\mathbb{D}}}
\newcommand*{\Eb}{{\mathbb{E}}}
\newcommand*{\Fb}{{\mathbb{F}}}
\newcommand*{\Gb}{{\mathbb{G}}}
\newcommand*{\Hb}{{\mathbb{H}}}
\newcommand*{\Ib}{{\mathbb{I}}}
\newcommand*{\Jb}{{\mathbb{J}}}
\newcommand*{\Kb}{{\mathbb{K}}}
\newcommand*{\Lb}{{\mathbb{L}}}
\newcommand*{\Mb}{{\mathbb{M}}}
\newcommand*{\Nb}{{\mathbb{N}}}
\newcommand*{\Ob}{{\mathbb{O}}}
\newcommand*{\Pb}{{\mathbb{P}}}
\newcommand*{\Qb}{{\mathbb{Q}}}
\newcommand*{\Rb}{{\mathbb{R}}}
\newcommand*{\Sb}{{\mathbb{S}}}
\newcommand*{\Tb}{{\mathbb{T}}}
\newcommand*{\Ub}{{\mathbb{U}}}
\newcommand*{\Vb}{{\mathbb{V}}}
\newcommand*{\Wb}{{\mathbb{W}}}
\newcommand*{\Xb}{{\mathbb{X}}}
\newcommand*{\Yb}{{\mathbb{Y}}}
\newcommand*{\Zb}{{\mathbb{Z}}}

\myexternaldocument{luo_333}

\title{Local Calibration: Metrics and Recalibration (Supplementary Material)}
% The standard author block has changed for UAI 2022 to provide
% more space for long author lists and allow for complex affiliations
%
% All author information is automatically removed by the class for the
% anonymous submission version of your paper, so you can already add your
% information below.
%
% Add authors
\author[1]{\href{mailto:<rsluo@stanford.edu>?Subject=Your UAI 2022 paper}{Rachel Luo}{}$^*$}
\author[2]{\href{mailto:<abhatnagar@salesforce.com>?Subject=Your UAI 2022 paper}{Aadyot Bhatnagar}{}$^*$}
\author[2]{Yu Bai}
\author[1]{Shengjia Zhao}
\author[2]{Huan Wang}
\author[2]{Caiming Xiong}
\author[1,2]{Silvio Savarese}
\author[1,2]{Stefano Ermon}
\author[1]{Edward Schmerling}
\author[1]{Marco Pavone}
% Add affiliations after the authors
\affil[1]{%
    Stanford University\\
    Stanford, California, USA
}
\affil[2]{%
    Salesforce AI Research\\
    Palo Alto, California, USA
}

\begin{document}
\onecolumn
\appendix
\maketitle

% Local definitions for the appendix, declared after \begin{document}.

% `assumption` environment, labelled with capital letters (A, B, ...) instead
% of numbers.
\newtheorem{assumption}{Assumption}
\renewcommand{\theassumption}{\Alph{assumption}}
% Auto-sized (\left...\right) delimiters around the argument:
% \set -> { }, \abs -> | |, \paren -> ( ), \brac -> [ ].
% \abs is redefined with \renewcommand, i.e. it replaces an existing
% definition (presumably the one from the physics package).
\newcommand{\set}[1]{{\left\{ #1 \right\}}}
\renewcommand{\abs}[1]{{\left| #1 \right|}}
\newcommand{\paren}[1]{{\left( #1 \right)}}
\newcommand{\brac}[1]{{\left[ #1 \right]}}
% Font shorthands: \mc{X} -> \mathcal{X}, \mb{X} -> \mathbf{X}.
\newcommand{\mc}[1]{\mathcal{#1}}
\newcommand{\mb}[1]{\mathbf{#1}}
% \P is redefined as blackboard-bold P, replacing its previous meaning.
\renewcommand{\P}{\mathbb{P}}
\newcommand{\indic}[1]{\mathds{1}\left[#1\right]} % Indicator function
% Upright SLCE and its wide-hat version.
\newcommand{\slce}{\mathrm{SLCE}}
\newcommand{\slcehat}{\widehat{\mathrm{SLCE}}}
\renewcommand{\norm}[1]{\left\|{#1}\right\|} % A norm with 1 argument
% \wt: shorthand for \widetilde (takes its argument at the point of use).
\newcommand{\wt}{\widetilde}

\section{Model Architecture, Training, and Other Hyperparameters}
\label{appendix:details}
For ImageNet and CelebA, we compute the ECE, MCE, and LCE using 15 equal-width confidence bins. For the UCI communities and crime dataset, we use 5 equal-width bins because the dataset is much smaller (500 datapoints for recalibration). These numbers of bins represent a good tradeoff between bias and variance in estimating the relevant calibration errors. We also ran some initial experiments with equal-mass binning, but found that the results were very similar to those obtained with equal-width binning.

\subsection{ImageNet}
For all experiments with the ImageNet dataset, we used the pre-trained ResNet-50 model from the PyTorch \texttt{torchvision} package as our classifier. To calculate the LCE and apply \method{}, we used pre-trained Inception-v3 features, applying either t-SNE to reduce their dimension to 3 \added{or PCA to reduce their dimension to 50}, as a feature representation for the kernel.

\subsection{UCI Communities and Crime}
For all experiments with the UCI communities and crime dataset, we used a 3-hidden-layer dense neural network as our base classifier. Each hidden layer had a width of 100 and was followed by a Leaky ReLU activation. We applied dropout with probability 0.4 after the final hidden layer. We trained the model using the Adam optimizer with a batch size of 64 and a learning rate of $3 \times 10^{-4}$ until the validation accuracy stopped improving. All other hyperparameters were PyTorch defaults. Training was done locally on a laptop CPU. We trained 60 different models with different random seeds to perform the experiments described in Section~\ref{sec:fairness} and Figure~\ref{fig:setting1_mlce}.
To calculate the LCE and apply \method{}, we used the final hidden layer representation learned by our model, applying either t-SNE to reduce its dimension to 2 \added{or PCA to reduce its dimension to 20}, as a feature representation for the kernel.

\subsection{CelebA}
For all experiments with the CelebA dataset, we trained a ResNet-50 model and used it as our base classifier. We applied standard data augmentation to our training data (random crops and random horizontal flips), and trained all models for 10 epochs
using the Adam optimizer with a learning rate of $1 \times 10^{-3}$ and a batch size of 256. %In practice, this was sufficient for convergence. 
All other hyperparameters were PyTorch defaults. Training was distributed over 4 GPUs, and training a single model took about 30 minutes. For both Setting 2 and Setting 3 (described in Section~\ref{sec:fairness}), we trained 20 models with different random seeds to perform the experiments shown in Figures~\ref{fig:setting2_mlce} and~\ref{fig:setting3_mlce}.
To calculate the LCE and apply \method{}, we used pre-trained Inception-v3 features, applying t-SNE to reduce their dimension to 2 \added{or PCA to reduce their dimension to 50}, as a feature representation for the kernel.

\subsection{COMPAS Criminal Recidivism}
For all experiments with the COMPAS criminal recidivism dataset, we used a 3-hidden-layer dense neural network as our base classifier. Each hidden layer had a width of 100 and was followed by a Leaky ReLU activation. We applied dropout with probability 0.4 after the final hidden layer. We trained the model using the Adam optimizer with a batch size of 64 and a learning rate of $3 \times 10^{-4}$ until the validation accuracy stopped improving. All other hyperparameters were PyTorch defaults. Training was done locally on a laptop CPU. We trained 60 different models with different random seeds to perform the experiments described in Section \ref{sec:fairness} and Figure \ref{fig:setting4_mlce}.
To calculate the LCE and apply \method{}, we used the final hidden layer representation learned by our model, applying t-SNE to reduce the dimension to 2 \added{or PCA to reduce their dimension to 20}, as a feature representation for the kernel.


\section{Additional Experimental Results}
\label{appendix:more_results}
In Figures \ref{fig:setting1_mlce}, \ref{fig:setting2_mlce}, \ref{fig:setting3_mlce}, and \ref{fig:setting4_mlce} we visualize the MLCE achieved by all recalibration methods for the four experimental settings evaluated in Section \ref{sec:fairness}. Figure \ref{fig:recalibration_mlce} in the main paper shows the same visualization for all methods on ImageNet. \added{In Figure \ref{fig:cifar100}, we plot the MLCE achieved by all recalibration methods for CIFAR-100, and in Figure \ref{fig:cifar10}, we do the same for CIFAR-10.} Across all settings and datasets, our method \method{} is the most effective at minimizing MLCE across a wide range of $\gamma$, even accounting for variations between runs. 

In these figures, ``Original'' represents no recalibration, ``TS'' represents temperature scaling, ``HB'' represents histogram binning, ``IR'' represents isotonic regression, ``MMCE'' represents direct MMCE optimization, and ``LoRe'' is our method. 

\added{Next, we examine the influence of the specific feature map used. In Figures \ref{fig:mlce_inceptionv3}, \ref{fig:mlce_alexnet}, \ref{fig:mlce_densenet121}, and \ref{fig:mlce_resnet101}, we plot the MLCE achieved by all recalibration methods for ImageNet using Inception-v3, AlexNet, DenseNet121, and ResNet101 features. In Figures \ref{fig:mlce_inceptionv3_lore_alexnet} and \ref{fig:mlce_densenet121_lore_alexnet}, we plot the MLCE achieved by all recalibration methods for ImageNet when the features used to calculate the MLCE are different from the features used by \method{}. For completeness, in Figures \ref{fig:elce_imagenet}, \ref{fig:elce_setting1}, \ref{fig:elce_setting2}, and \ref{fig:elce_setting3}, we also visualize the average LCE for all experimental settings. All plots show similar results: \method{} performs best over a wide range of $\gamma$.}

\begin{figure}[H]
    \centering
    \includegraphics[width=0.48\textwidth]{figures/setting1_mlce.pdf}
    \hfill
    \includegraphics[width=0.48\textwidth]{figures/setting1_mlce_pca.pdf}
    \caption{MLCE vs.\ kernel bandwidth $\gamma$ for all methods on task 1 of Section \ref{sec:fairness}, predicting whether a neighborhood's crime rate is higher than the median. \method{} achieves the best (or competitive) MLCE for most $\gamma$. Left: 2D t-SNE features. Right: 20D PCA features.}
    \label{fig:setting1_mlce}
\end{figure}

\begin{figure}[H]
    \centering
    \includegraphics[width=0.48\textwidth]{figures/setting2_mlce.pdf}
    \hfill
    \includegraphics[width=0.48\textwidth]{figures/setting2_mlce_pca.pdf}
    \captionof{figure}{MLCE vs.\ kernel bandwidth $\gamma$ for all methods on task 2 of Section \ref{sec:fairness}, predicting hair color on CelebA. \method{} achieves the best MLCE for virtually all values of $\gamma$. Left: 2D t-SNE features. Right: 50D PCA features.}
    \label{fig:setting2_mlce}
\end{figure}

\begin{figure}[h]
    \centering
    \includegraphics[width=0.48\textwidth]{figures/setting3_mlce.pdf}
    \hfill
    \includegraphics[width=0.48\textwidth]{figures/setting3_mlce_pca.pdf}
    \captionof{figure}{MLCE vs.\ kernel bandwidth for all methods on task 3 of Section \ref{sec:fairness}, predicting hair type on CelebA. \method{} achieves the best MLCE for all $\gamma < 1$ and is tied with histogram binning for $\gamma > 1$. Left: 2D t-SNE features. Right: 50D PCA features.}
    \label{fig:setting3_mlce}
\end{figure}

\begin{figure}[H]
    \centering
    \includegraphics[width=0.48\textwidth]{figures/setting4_mlce.pdf}
    \hfill
    \includegraphics[width=0.48\textwidth]{figures/setting4_mlce_pca.pdf}
    \captionof{figure}{MLCE vs.\ kernel bandwidth for all methods on task 4 of Section \ref{sec:fairness}, predicting criminal recidivism. \method{} achieves the best (or competitive) MLCE for most $\gamma$. Left: 2D t-SNE features. Right: 20D PCA features.}
    \label{fig:setting4_mlce}
\end{figure}


\begin{figure}[H]
    \begin{minipage}{0.48\textwidth}
        \centering
        \includegraphics[width=\linewidth,trim={0cm 0cm 0cm 0cm}]{figures/mlce_cifar100.pdf}
        \caption{MLCE vs.\ kernel bandwidth $\gamma$ for all recalibration methods for CIFAR-100 (3D t-SNE features). \method{} achieves lower MLCE for most $\gamma$.}
        \label{fig:cifar100}
    \end{minipage}
    \hfill
    \begin{minipage}{0.48\textwidth}
        \centering
        \includegraphics[width=\linewidth, trim={0cm 0cm 0cm 0cm},clip]{figures/mlce_cifar10.pdf}
        \caption{MLCE vs.\ kernel bandwidth $\gamma$ for all recalibration methods for CIFAR-10 (3D t-SNE features). \method{} achieves lower MLCE for most $\gamma$.}
        \label{fig:cifar10}
    \end{minipage}
\end{figure}

\begin{figure}[H]
    \begin{minipage}{0.48\textwidth}
        \centering
        \includegraphics[width=\linewidth,trim={0cm 0cm 0cm 0cm}]{figures/mlce_recalibration.pdf}
        \caption{MLCE vs.\ kernel bandwidth $\gamma$ for all recalibration methods on ImageNet using Inception-v3 features. \method{} achieves the best MLCE for most $\gamma$.}
        \label{fig:mlce_inceptionv3}
    \end{minipage}
    \hfill
    \begin{minipage}{0.48\textwidth}
        \centering
        \includegraphics[width=\linewidth, trim={0cm 0cm 0cm 0cm},clip]{figures/mlce_alexnet.pdf}
        \caption{MLCE vs.\ kernel bandwidth $\gamma$ for all recalibration methods on ImageNet using AlexNet features. \method{} achieves the best MLCE for most $\gamma$.}
        \label{fig:mlce_alexnet}
    \end{minipage}
\end{figure}

\begin{figure}[H]
    \begin{minipage}{0.48\textwidth}
        \centering
        \includegraphics[width=\linewidth,trim={0cm 0cm 0cm 0cm}]{figures/mlce_densenet121.pdf}
        \caption{MLCE vs.\ kernel bandwidth $\gamma$ for all recalibration methods on ImageNet using DenseNet121 features. \method{} achieves the best MLCE for most $\gamma$.}
        \label{fig:mlce_densenet121}
    \end{minipage}
    \hfill
    \begin{minipage}{0.48\textwidth}
        \centering
        \includegraphics[width=\linewidth, trim={0cm 0cm 0cm 0cm},clip]{figures/mlce_resnet101.pdf}
        \caption{MLCE vs.\ kernel bandwidth $\gamma$ for all recalibration methods on ImageNet using ResNet101 features. \method{} achieves the best MLCE for most $\gamma$.}
        \label{fig:mlce_resnet101}
    \end{minipage}
\end{figure}

\begin{figure}[H]
    \begin{minipage}{0.48\textwidth}
        \centering
        \includegraphics[width=\linewidth,trim={0cm 0cm 0cm 0cm}]{figures/mlce_inceptionv3_lore_alexnet.pdf}
        \caption{MLCE vs.\ kernel bandwidth $\gamma$ for all recalibration methods on ImageNet using Inception-v3 features to calculate the MLCE and AlexNet features for applying \method{}.}
        \label{fig:mlce_inceptionv3_lore_alexnet}
    \end{minipage}
    \hfill
    \begin{minipage}{0.48\textwidth}
        \centering
        \includegraphics[width=\linewidth, trim={0cm 0cm 0cm 0cm},clip]{figures/mlce_densenet121_lore_alexnet.pdf}
        \caption{MLCE vs.\ kernel bandwidth $\gamma$ for all recalibration methods on ImageNet using DenseNet121 features to calculate the MLCE and AlexNet features for applying \method{}.}
        \label{fig:mlce_densenet121_lore_alexnet}
    \end{minipage}
\end{figure}

\begin{figure}[H]
    \begin{minipage}{0.48\textwidth}
        \centering
        \includegraphics[width=\linewidth,trim={0cm 0cm 0cm 0cm}]{figures/imagenet_elce.pdf}
        \caption{Average LCE vs.\ kernel bandwidth $\gamma$ for all recalibration methods on ImageNet (3D t-SNE features). \method{} gets lower average LCE for most $\gamma$.}
        \label{fig:elce_imagenet}
    \end{minipage}
    \hfill
    \begin{minipage}{0.48\textwidth}
        \centering
        \includegraphics[width=\linewidth, trim={0cm 0cm 0cm 0cm},clip]{figures/setting1_elce.pdf}
        \caption{Average LCE vs.\ kernel bandwidth $\gamma$ for all recalibration methods in task 1 (crime data, 2D t-SNE features). \method{} gets lower average LCE for most $\gamma$.}
        \label{fig:elce_setting1}
    \end{minipage}
\end{figure}

\begin{figure}[H]
    \begin{minipage}{0.48\textwidth}
        \centering
        \includegraphics[width=\linewidth,trim={0cm 0cm 0cm 0cm}]{figures/setting2_elce.pdf}
        \caption{Average LCE vs.\ kernel bandwidth $\gamma$ for all recalibration methods in task 2 (CelebA, 2D t-SNE features).  \method{} gets lower average LCE for most $\gamma$.}
        \label{fig:elce_setting2}
    \end{minipage}
    \hfill
    \begin{minipage}{0.48\textwidth}
        \centering
        \includegraphics[width=\linewidth, trim={0cm 0cm 0cm 0cm},clip]{figures/setting3_elce.pdf}
        \caption{Average LCE vs.\ kernel bandwidth $\gamma$ for all recalibration methods in task 3 (CelebA, 2D t-SNE features). \method{} gets lower average LCE for most $\gamma$.}
        \label{fig:elce_setting3}
    \end{minipage}
\end{figure}


\section{Proof of Lemma \ref{lemma:global}}
\label{appendix:lemma}
We restate Lemma \ref{lemma:global} below, and provide the proof:
\begin{lemma}
Assume that $\lim_{\gamma \to \infty} k_{\gamma}(x, x') = 1$ for all $x, x' \in \mc{X}$. Then, as $\gamma \to \infty$, the MLCE converges to the MCE.
\end{lemma}
\begin{proof}
Since $\lim_{\gamma \to \infty} k_{\gamma}(x, x') = 1$ identically,
\begin{align*}
    \lim_{\gamma \to \infty} \max_x \widehat{\mathrm{LCE}}_{\gamma}(x; f, \hp)
    &= \max_x \frac{1}{\abs{\beta(x)}} \abs{\sum_{i \in \beta(x)} \paren{\hp(x_i) - \indic{f(x_i) = y_i}}} \\
    &= \max_k \frac{1}{\abs{B_k}} \abs{\sum_{i \in B_k} \paren{\hp(x_i) - \indic{f(x_i) = y_i}}} \\
    &= \max_k \abs{\mathrm{conf}(B_k) - \mathrm{acc}(B_k)} \\
    &= \mathrm{MCE}(f, \hp).
\end{align*}
\end{proof}


\section{Formal Statement and Proof of Theorem \ref{theorem:slce_informal}}
\label{appendix:lce}

Let $B_1,\dots,B_N$ denote a set of bins that partition $[0,1]$, and $B(p)$ denote the bin that a particular $p\in[0,1]$ belongs to. Let $a_f(x, y) = \indic{f(x) = y}$ indicate the accuracy of the classifier $(f, \hp)$ on an input $x$. We consider the signed local calibration error (SLCE):
\begin{align*}
    \quad \slce_{\gamma}(x; f, \hp) &:=
    \frac{\Eb[(\hp(X) - a_f(X, Y)) k_{\gamma}(X, x) \mid \hp(X) \in B(\hp(x))]}{\Eb[k_{\gamma}(X, x) \mid \hp(X) \in B(\hp(x))]} \\
    & \; = \frac{\Eb[(\hp(X) - a_f(X, Y)) k_{\gamma}(X, x) \indic{\hp(X) \in B(\hp(x))}]}{\Eb[k_{\gamma}(X, x) \indic{\hp(X) \in B(\hp(x))}]}.
\end{align*}

\subsection{Assumptions and Formal Statement of Theorem}
We make the following assumptions:

\begin{assumption}[Lipschitz kernel]
\label{assumption:kernel}
The kernel $k_\gamma$ takes the form
\begin{align*}
    k_\gamma(x, x') = g\paren{ \frac{\phi(x) - \phi(x')}{\gamma}},
\end{align*}
where $\phi:\mc{X}\to \R^d$ is a representation function, and $g:\R^d\to [0,1]$ is $L$-Lipschitz with respect to some norm $\norm{\cdot}$.
\end{assumption}
Note this definition may require an implicit rescaling (for example, we can take $\phi(x)\leftarrow \phi^{\rm feature}(x)/d$ for a $d$-dimensional feature map $\phi^{\rm feature}$ and take $g(z)=\exp(-\|z\|_1)$, which corresponds to the Laplacian kernel we used in Section~\ref{section:choice-kernel}).

\begin{assumption}[Binning-aware covering number]
\label{assumption:cover}
For any $\epsilon>0$, the range of the representation function $\phi(\mc{X}):= \set{\phi(x): x \in \mc{X}}$ has an $\epsilon$-cover in the $\norm{\cdot}$-norm of size $(C/\epsilon)^d$ for some absolute constant $C>0$: There exists a set $\mc{N}_\epsilon\subseteq\mc{X}$ with $|\mc{N}_\epsilon|\le (C/\epsilon)^d$ such that for any $x\in\mc{X}$, there exists some $x'\in \mc{N}_\epsilon$ such that $\norm{\phi(x)-\phi(x')}\le \epsilon$ and $B(\hp(x))=B(\hp(x'))$.
\end{assumption}

\begin{assumption}[Lower bound on expectation of kernel within bin]
\label{assumption:lower-bound}
We have
\begin{align*}
    \inf_{x\in\mc{X}} \E\brac{k_\gamma(X, x) \indic{\hp(X)\in B(\hp(x))}} \ge \alpha
\end{align*}
for some constant $\alpha\in(0,1)$. 
\end{assumption}
The constant $\alpha$ characterizes the hardness of estimating the SLCE from samples. Intuitively, with a smaller $\alpha$, the denominator in SLCE gets smaller and we desire a higher accuracy in estimating both the numerator and the denominator. Also note that in practice the value of $\alpha$ typically depends on $\gamma$.


We analyze the following estimator of the SLCE using $n$ samples:
\begin{align}
    \slcehat_\gamma(x; f, \hp) = \frac{ \frac{1}{n}\sum_{i=1}^n (\hp(x_i) - a_f(x_i, y_i))k_\gamma(x_i, x) \indic{\hp(x_i) \in B(\hp(x))} }{ \frac{1}{n}\sum_{i=1}^n k_\gamma(x_i, x) \indic{\hp(x_i) \in B(\hp(x))}  }.
\end{align}

\begin{theorem}
\label{theorem:slce}
Under Assumptions~\ref{assumption:kernel},~\ref{assumption:cover}, and~\ref{assumption:lower-bound}, suppose the sample size $n\ge \widetilde{O}(d/\alpha^4\epsilon^2)$ where $\epsilon>0$ is a target accuracy level, then with probability at least $1-\delta$ we have
\begin{align*}
    \sup_{x\in\mc{X}} \abs{\slcehat_\gamma(x; f, \hp) - \slce_\gamma(x; f, \hp) } \le \epsilon,
\end{align*}
where $\widetilde{O}$ hides log factors of the form $\log(L/\gamma\epsilon\delta \alpha)$.
\end{theorem}
Theorem~\ref{theorem:slce} shows that $\wt{O}(d/\epsilon^2\alpha^4)$ samples are sufficient to estimate the SLCE simultaneously for all $x\in\mc{X}$. When $\alpha=\Omega(1)$, this sample complexity depends only polynomially on the representation dimension $d$ and logarithmically on other constants (such as $L$, $\gamma$, and the failure probability $\delta$).

\subsection{Proof of Theorem~\ref{theorem:slce}}

{\bf Step 1.} We first study the estimation at finitely many $x$'s. Let $\mc{N}\subseteq\mc{X}$ be a finite set of $x$'s with $|\mc{N}|=N$. Since $k_\gamma\in[0,1]$ and $|\hp(x)-a_f(x, y)|\le 1$ are bounded variables, by the Hoeffding inequality and a union bound, we have
\begin{align*}
    & \quad \P\bigg( \sup_{x \in \mc{N}} \bigg| \frac{1}{n}\sum_{i=1}^n (\hp(x_i) - a_f(x_i, y_i))k_\gamma(x_i, x) \indic{\hp(x_i) \in B(\hp(x))} \\
    & \qquad \qquad - \E\brac{ (\hp(X) - a_f(X, Y))k_\gamma(X, x) \indic{\hp(X) \in B(\hp(x)) } }\bigg| > \alpha\epsilon/10\bigg) \\
    & \le \exp(-cn\alpha^2\epsilon^2 + \log N).
\end{align*}
Therefore, as long as $n\ge O(\log (N/\delta)/\epsilon^2\alpha^2)$ samples, the above probability is bounded by $\delta$. In other words, with probability at least $1-\delta$, we have simultaneously
\begin{align*}
    & \bigg| 
    \underbrace{\frac{1}{n}\sum_{i=1}^n (\hp(x_i) - a_f(x_i, y_i))k_\gamma(x_i, x) \indic{\hp(x_i) \in B(\hp(x))}}_{:=\hat{A}(x)} \\
    & \qquad - \underbrace{\E\brac{ (\hp(X) - a_f(X, Y))k_\gamma(X, x) \indic{\hp(X) \in B(\hp(x)) } }}_{:=A(x)} \bigg| \\
    & \le  \alpha\epsilon/10
\end{align*}
for all $x\in \mc{N}$. Similarly, when $n\ge O(\log(N/\delta)/\epsilon^2\alpha^4)$, we also have (with probability at least $1-\delta$)
\begin{align*}
    \bigg| 
    \underbrace{\frac{1}{n}\sum_{i=1}^n k_\gamma(x_i, x) \indic{\hp(x_i) \in B(\hp(x))}}_{:= \hat{B}(x)} - \underbrace{\E\brac{ k_\gamma(X, x) \indic{\hp(X) \in B(\hp(x)) } }}_{:= B(x)}
    \bigg| \le  \alpha^2\epsilon/10.
\end{align*}
On these concentration events, we have for any $x\in \mc{N}$ that
\begin{align*}
    \quad \abs{\slcehat_\gamma(x; f, \hp) - \slce_\gamma(x; f, \hp)} 
    & = \abs{ \frac{\hat{A}(x)}{\hat{B}(x)} - \frac{A(x)}{B(x)} } \\ 
    & \le \abs{\hat{A}(x)} \abs{\frac{1}{\hat{B}(x)} - \frac{1}{B(x)} } + \frac{1}{\abs{B(x)}} \abs{\hat{A}(x) - A(x)} \\
    & \le 1\cdot \frac{\alpha^2 \epsilon/10}{\alpha (\alpha - \alpha^2\epsilon/10)} + \frac{1}{\alpha} \cdot \alpha\epsilon/10 \\
    & \le \epsilon.
\end{align*}

{\bf Step 2.} We now extend the bound to all $x\in\mc{X}$ using the covering argument. By Assumption~\ref{assumption:cover}, we can take an $\alpha^2\epsilon\gamma/(10L)$-covering of $\phi(\mc{X})$ with cardinality $N\le (10CL/\alpha^2\epsilon\gamma)^d$. Let $\mc{N}\subset \mc{X}$ denote the covering set (in the $\mc{X}$ space). This means that for any $x\in\mc{X}$, there exists $x'\in\mc{N}$ such that $\norm{\phi(x) - \phi(x')}\le \alpha^2\epsilon\gamma/(10L)$ and $B(\hp(x))=B(\hp(x'))$, which implies that for any $\wt{x}\in\mc{X}$ we have
\begin{align*}
    \quad \abs{k_\gamma(\wt{x}, x) - k_\gamma(\wt{x}, x')} 
    & = \abs{g\paren{\frac{\phi(\wt{x}) - \phi(x)}{\gamma}} - g\paren{\frac{\phi(\wt{x}) - \phi(x')}{\gamma}}} \\
    & \le \frac{L}{\gamma} \norm{\phi(x) - \phi(x')} \\
    & \le \alpha^2\epsilon/10,
\end{align*}
where we have used the Lipschitzness assumption of $g$ (Assumption~\ref{assumption:kernel}). This further implies
\begin{align*}
    \quad \abs{\hat{A}(x) - \hat{A}(x')} 
    & = \biggl| \frac{1}{n}\sum_{i=1}^n (\hp(x_i) - a_f(x_i, y_i))k_\gamma(x_i, x) \indic{\hp(x_i) \in B(\hp(x))} \\
    & \qquad \qquad - \frac{1}{n}\sum_{i=1}^n (\hp(x_i) - a_f(x_i, y_i))k_\gamma(x_i, x') \indic{\hp(x_i) \in B(\hp(x'))} \biggr| \\
    & = \biggl| \frac{1}{n}\sum_{i=1}^n (\hp(x_i) - a_f(x_i, y_i))\brac{k_\gamma(x_i, x) - k_\gamma(x_i, x')} \indic{\hp(x_i) \in B(\hp(x))} \biggr| \\
    & \le \frac{1}{n}\sum_{i=1}^n \abs{\hp(x_i) - a_f(x_i, y_i)} \cdot \abs{k_\gamma(x_i, x) - k_\gamma(x_i, x')} \cdot \indic{\hp(x_i) \in B(\hp(x))} \\
    & \le \alpha^2\epsilon/10.
\end{align*}
Similarly, we have $|A(x) - A(x')|\le \alpha^2\epsilon/10$, $|\hat{B}(x) - \hat{B}(x')|\le \alpha^2\epsilon/10$, and $|B(x) - B(x')|\le \alpha^2\epsilon/10$. This means that the estimation error at $x$ is close to that at $x'\in\mc{N}$ and consequently also bounded by $\epsilon$:
\begin{align*}
    \abs{\slcehat_\gamma(x; f, \hp) - \slce_\gamma(x; f, \hp)} &= \abs{\frac{\hat{A}(x)}{\hat{B}(x)} - \frac{A(x)}{B(x)} } \\
    & \le \abs{ \frac{\hat{A}(x)}{\hat{B}(x)} - \frac{\hat{A}(x')}{\hat{B}(x')} } + \abs{ \frac{\hat{A}(x')}{\hat{B}(x')} - \frac{A(x')}{B(x')} } + \abs{ \frac{A(x')}{B(x')} - \frac{A(x)}{B(x)} } \\
    & \le 3 \brac{ 1\cdot \frac{\alpha^2 \epsilon/10}{\alpha (\alpha - \alpha^2\epsilon/10)} + \frac{1}{\alpha} \cdot \alpha\epsilon/10 } \\
    & \le \epsilon.
\end{align*}
Therefore, taking this $\mc{N}$ in step 1, we know that as long as the sample size
\begin{align*}
    n \ge O\paren{ \frac{\log(|\mc{N}|/\delta)}{\epsilon^2\alpha^4} } = O\paren{ \frac{d\brac{\log(10CL/\alpha^2\epsilon\gamma) + \log(1/\delta)}}{\alpha^4\epsilon^2} } = \wt{O}\paren{d/\alpha^4\epsilon^2},
\end{align*}
we have with probability at least $1-\delta$ that
\begin{align*}
    \sup_{x\in\mc{X}} \abs{\slcehat_\gamma(x; f, \hp) - \slce_\gamma(x; f, \hp)} \le \epsilon.
\end{align*}
This is the desired result.

\qed

\end{document}
