% \documentclass{uai2022} % for initial submission
\documentclass[accepted]{uai2022} % after acceptance, for a revised
% version; also before submission to
% see how the non-anonymous paper
% would look like
%% There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2022} % ptmx math instead of Computer
% Modern (has noticable issues)
% \documentclass[mathfont=newtx]{uai2022} % newtx fonts (improves upon
% ptmx; less tested, no support)
% NOTE: Only keep *one* line above as appropriate, as it will be replaced
%       automatically for papers to be published. Do not make any other
%       change above this note for an accepted version.

%% Choose your variant of English; be consistent
\usepackage[american]{babel}
% \usepackage[british]{babel}

%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
% \usepackage{mathtools} % amsmath with fixes and additions
% \usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{tikz} % nice language for creating drawings and diagrams

%% Provided macros
% \smaller: Because the class footnote size is essentially LaTeX's \small,
%           redefining \footnotesize, we provide the original \footnotesize
%           using this macro.
%           (Use only sparingly, e.g., in drawings, as it is quite small.)

%% Self-defined macros
\usepackage{xr}
\newcommand{\swap}[3][-]{#3#1#2} % just an example
\usepackage{amsfonts}   
\usepackage{amsmath}
\usepackage{amssymb}
\usepackage{mathtools}
\usepackage{amsthm}
\usepackage{thmtools}
\usepackage{threeparttable}
\usepackage{multirow}
\usepackage{algorithm}
\usepackage{algorithmic}

\usepackage{caption}
\usepackage{subcaption}

%%%%% NEW MATH DEFINITIONS %%%%%

\usepackage{amsmath,amsfonts,bm}

% Mark sections of captions for referring to divisions of figures
\newcommand{\figleft}{{\em (Left)}}
\newcommand{\figcenter}{{\em (Center)}}
\newcommand{\figright}{{\em (Right)}}
\newcommand{\figtop}{{\em (Top)}}
\newcommand{\figbottom}{{\em (Bottom)}}
\newcommand{\captiona}{{\em (a)}}
\newcommand{\captionb}{{\em (b)}}
\newcommand{\captionc}{{\em (c)}}
\newcommand{\captiond}{{\em (d)}}

% Highlight a newly defined term
\newcommand{\newterm}[1]{{\bf #1}}


% Figure reference, lower-case.
\def\figref#1{figure~\ref{#1}}
% Figure reference, capital. For start of sentence
\def\Figref#1{Figure~\ref{#1}}
\def\twofigref#1#2{figures \ref{#1} and \ref{#2}}
\def\quadfigref#1#2#3#4{figures \ref{#1}, \ref{#2}, \ref{#3} and \ref{#4}}
% Section reference, lower-case.
\def\secref#1{section~\ref{#1}}
% Section reference, capital.
\def\Secref#1{Section~\ref{#1}}
% Reference to two sections.
\def\twosecrefs#1#2{sections \ref{#1} and \ref{#2}}
% Reference to three sections.
\def\secrefs#1#2#3{sections \ref{#1}, \ref{#2} and \ref{#3}}
% Reference to an equation, lower-case.
% \def\eqref#1{equation~\ref{#1}}
% Reference to an equation, upper case
\def\Eqref#1{Equation~\ref{#1}}
% A raw reference to an equation---avoid using if possible
\def\plaineqref#1{\ref{#1}}
% Reference to a chapter, lower-case.
\def\chapref#1{chapter~\ref{#1}}
% Reference to a chapter, upper case. For start of sentence.
\def\Chapref#1{Chapter~\ref{#1}}
% Reference to a range of chapters, e.g. "chapters 2--4".
% The tie separates the word from the first number and keeps them on one line.
\def\rangechapref#1#2{chapters~\ref{#1}--\ref{#2}}
% Reference to an algorithm, lower-case.
\def\algref#1{algorithm~\ref{#1}}
% Reference to an algorithm, upper case.
\def\Algref#1{Algorithm~\ref{#1}}
\def\twoalgref#1#2{algorithms \ref{#1} and \ref{#2}}
\def\Twoalgref#1#2{Algorithms \ref{#1} and \ref{#2}}
% Reference to a part, lower case
\def\partref#1{part~\ref{#1}}
% Reference to a part, upper case
\def\Partref#1{Part~\ref{#1}}
\def\twopartref#1#2{parts \ref{#1} and \ref{#2}}

\def\ceil#1{\lceil #1 \rceil}
\def\floor#1{\lfloor #1 \rfloor}
\def\1{\bm{1}}
% Dataset symbols: training, validation and test sets.
\newcommand{\train}{\mathcal{D}}
% The subscript is attached outside \mathcal so that only "D" is set in the
% calligraphic alphabet and the label is an ordinary upright subscript.
\newcommand{\valid}{\mathcal{D}_{\mathrm{valid}}}
\newcommand{\test}{\mathcal{D}_{\mathrm{test}}}

\def\eps{{\epsilon}}


% Random variables
\def\reta{{\textnormal{$\eta$}}}
\def\ra{{\textnormal{a}}}
\def\rb{{\textnormal{b}}}
\def\rc{{\textnormal{c}}}
\def\rd{{\textnormal{d}}}
\def\re{{\textnormal{e}}}
\def\rf{{\textnormal{f}}}
\def\rg{{\textnormal{g}}}
\def\rh{{\textnormal{h}}}
\def\ri{{\textnormal{i}}}
\def\rj{{\textnormal{j}}}
\def\rk{{\textnormal{k}}}
\def\rl{{\textnormal{l}}}
% rm is already a command, just don't name any random variables m
\def\rn{{\textnormal{n}}}
\def\ro{{\textnormal{o}}}
\def\rp{{\textnormal{p}}}
\def\rq{{\textnormal{q}}}
\def\rr{{\textnormal{r}}}
\def\rs{{\textnormal{s}}}
\def\rt{{\textnormal{t}}}
\def\ru{{\textnormal{u}}}
\def\rv{{\textnormal{v}}}
\def\rw{{\textnormal{w}}}
\def\rx{{\textnormal{x}}}
\def\ry{{\textnormal{y}}}
\def\rz{{\textnormal{z}}}

% Random vectors
\def\rvepsilon{{\mathbf{\epsilon}}}
\def\rvtheta{{\mathbf{\theta}}}
\def\rva{{\mathbf{a}}}
\def\rvb{{\mathbf{b}}}
\def\rvc{{\mathbf{c}}}
\def\rvd{{\mathbf{d}}}
\def\rve{{\mathbf{e}}}
\def\rvf{{\mathbf{f}}}
\def\rvg{{\mathbf{g}}}
\def\rvh{{\mathbf{h}}}
% Random vector i. Previously mis-named \rvu, which the later \rvu definition
% for "u" overwrote, leaving \rvi undefined.
\def\rvi{{\mathbf{i}}}
\def\rvj{{\mathbf{j}}}
\def\rvk{{\mathbf{k}}}
\def\rvl{{\mathbf{l}}}
\def\rvm{{\mathbf{m}}}
\def\rvn{{\mathbf{n}}}
\def\rvo{{\mathbf{o}}}
\def\rvp{{\mathbf{p}}}
\def\rvq{{\mathbf{q}}}
\def\rvr{{\mathbf{r}}}
\def\rvs{{\mathbf{s}}}
\def\rvt{{\mathbf{t}}}
\def\rvu{{\mathbf{u}}}
\def\rvv{{\mathbf{v}}}
\def\rvw{{\mathbf{w}}}
\def\rvx{{\mathbf{x}}}
\def\rvy{{\mathbf{y}}}
\def\rvz{{\mathbf{z}}}

% Elements of random vectors
\def\erva{{\textnormal{a}}}
\def\ervb{{\textnormal{b}}}
\def\ervc{{\textnormal{c}}}
\def\ervd{{\textnormal{d}}}
\def\erve{{\textnormal{e}}}
\def\ervf{{\textnormal{f}}}
\def\ervg{{\textnormal{g}}}
\def\ervh{{\textnormal{h}}}
\def\ervi{{\textnormal{i}}}
\def\ervj{{\textnormal{j}}}
\def\ervk{{\textnormal{k}}}
\def\ervl{{\textnormal{l}}}
\def\ervm{{\textnormal{m}}}
\def\ervn{{\textnormal{n}}}
\def\ervo{{\textnormal{o}}}
\def\ervp{{\textnormal{p}}}
\def\ervq{{\textnormal{q}}}
\def\ervr{{\textnormal{r}}}
\def\ervs{{\textnormal{s}}}
\def\ervt{{\textnormal{t}}}
\def\ervu{{\textnormal{u}}}
\def\ervv{{\textnormal{v}}}
\def\ervw{{\textnormal{w}}}
\def\ervx{{\textnormal{x}}}
\def\ervy{{\textnormal{y}}}
\def\ervz{{\textnormal{z}}}

% Random matrices
\def\rmA{{\mathbf{A}}}
\def\rmB{{\mathbf{B}}}
\def\rmC{{\mathbf{C}}}
\def\rmD{{\mathbf{D}}}
\def\rmE{{\mathbf{E}}}
\def\rmF{{\mathbf{F}}}
\def\rmG{{\mathbf{G}}}
\def\rmH{{\mathbf{H}}}
\def\rmI{{\mathbf{I}}}
\def\rmJ{{\mathbf{J}}}
\def\rmK{{\mathbf{K}}}
\def\rmL{{\mathbf{L}}}
\def\rmM{{\mathbf{M}}}
\def\rmN{{\mathbf{N}}}
\def\rmO{{\mathbf{O}}}
\def\rmP{{\mathbf{P}}}
\def\rmQ{{\mathbf{Q}}}
\def\rmR{{\mathbf{R}}}
\def\rmS{{\mathbf{S}}}
\def\rmT{{\mathbf{T}}}
\def\rmU{{\mathbf{U}}}
\def\rmV{{\mathbf{V}}}
\def\rmW{{\mathbf{W}}}
\def\rmX{{\mathbf{X}}}
\def\rmY{{\mathbf{Y}}}
\def\rmZ{{\mathbf{Z}}}

% Elements of random matrices
\def\ermA{{\textnormal{A}}}
\def\ermB{{\textnormal{B}}}
\def\ermC{{\textnormal{C}}}
\def\ermD{{\textnormal{D}}}
\def\ermE{{\textnormal{E}}}
\def\ermF{{\textnormal{F}}}
\def\ermG{{\textnormal{G}}}
\def\ermH{{\textnormal{H}}}
\def\ermI{{\textnormal{I}}}
\def\ermJ{{\textnormal{J}}}
\def\ermK{{\textnormal{K}}}
\def\ermL{{\textnormal{L}}}
\def\ermM{{\textnormal{M}}}
\def\ermN{{\textnormal{N}}}
\def\ermO{{\textnormal{O}}}
\def\ermP{{\textnormal{P}}}
\def\ermQ{{\textnormal{Q}}}
\def\ermR{{\textnormal{R}}}
\def\ermS{{\textnormal{S}}}
\def\ermT{{\textnormal{T}}}
\def\ermU{{\textnormal{U}}}
\def\ermV{{\textnormal{V}}}
\def\ermW{{\textnormal{W}}}
\def\ermX{{\textnormal{X}}}
\def\ermY{{\textnormal{Y}}}
\def\ermZ{{\textnormal{Z}}}

% Vectors
\def\vzero{{\bm{0}}}
\def\vone{{\bm{1}}}
\def\vmu{{\bm{\mu}}}
\def\vtheta{{\bm{\theta}}}
\def\vTheta{{\bm{\Theta}}}
\def\valpha{{\bm{\alpha}}}
\def\vphi{{\bm{\phi}}}
\def\vbeta{{\bm{\beta}}}
\def\vSigma{{\bm{\Sigma}}}
\def\va{{\bm{a}}}
\def\vb{{\bm{b}}}
\def\vc{{\bm{c}}}
\def\vd{{\bm{d}}}
\def\ve{{\bm{e}}}
\def\vf{{\bm{f}}}
\def\vg{{\bm{g}}}
\def\vh{{\bm{h}}}
\def\vi{{\bm{i}}}
\def\vj{{\bm{j}}}
\def\vk{{\bm{k}}}
\def\vl{{\bm{l}}}
\def\vm{{\bm{m}}}
\def\vn{{\bm{n}}}
\def\vo{{\bm{o}}}
\def\vp{{\bm{p}}}
\def\vq{{\bm{q}}}
\def\vr{{\bm{r}}}
\def\vs{{\bm{s}}}
\def\vt{{\bm{t}}}
\def\vu{{\bm{u}}}
\def\vv{{\bm{v}}}
\def\vw{{\bm{w}}}
\def\vx{{\bm{x}}}
\def\vy{{\bm{y}}}
\def\vz{{\bm{z}}}

% Elements of vectors
\def\evalpha{{\alpha}}
\def\evbeta{{\beta}}
\def\evepsilon{{\epsilon}}
\def\evlambda{{\lambda}}
\def\evomega{{\omega}}
\def\evmu{{\mu}}
\def\evpsi{{\psi}}
\def\evsigma{{\sigma}}
\def\evtheta{{\theta}}
\def\eva{{a}}
\def\evb{{b}}
\def\evc{{c}}
\def\evd{{d}}
\def\eve{{e}}
\def\evf{{f}}
\def\evg{{g}}
\def\evh{{h}}
\def\evi{{i}}
\def\evj{{j}}
\def\evk{{k}}
\def\evl{{l}}
\def\evm{{m}}
\def\evn{{n}}
\def\evo{{o}}
\def\evp{{p}}
\def\evq{{q}}
\def\evr{{r}}
\def\evs{{s}}
\def\evt{{t}}
\def\evu{{u}}
\def\evv{{v}}
\def\evw{{w}}
\def\evx{{x}}
\def\evy{{y}}
\def\evz{{z}}

% Matrix
\def\mA{{\bm{A}}}
\def\mB{{\bm{B}}}
\def\mC{{\bm{C}}}
\def\mD{{\bm{D}}}
\def\mE{{\bm{E}}}
\def\mF{{\bm{F}}}
\def\mG{{\bm{G}}}
\def\mH{{\bm{H}}}
\def\mI{{\bm{I}}}
\def\mJ{{\bm{J}}}
\def\mK{{\bm{K}}}
\def\mL{{\bm{L}}}
\def\mM{{\bm{M}}}
\def\mN{{\bm{N}}}
\def\mO{{\bm{O}}}
\def\mP{{\bm{P}}}
\def\mQ{{\bm{Q}}}
\def\mR{{\bm{R}}}
\def\mS{{\bm{S}}}
\def\mT{{\bm{T}}}
\def\mU{{\bm{U}}}
\def\mV{{\bm{V}}}
\def\mW{{\bm{W}}}
\def\mX{{\bm{X}}}
\def\mY{{\bm{Y}}}
\def\mZ{{\bm{Z}}}
\def\mBeta{{\bm{\beta}}}
\def\mPhi{{\bm{\Phi}}}
\def\mLambda{{\bm{\Lambda}}}
\def\mSigma{{\bm{\Sigma}}}

% Tensor
\DeclareMathAlphabet{\mathsfit}{\encodingdefault}{\sfdefault}{m}{sl}
\SetMathAlphabet{\mathsfit}{bold}{\encodingdefault}{\sfdefault}{bx}{n}
\newcommand{\tens}[1]{\bm{\mathsfit{#1}}}
\def\tA{{\tens{A}}}
\def\tB{{\tens{B}}}
\def\tC{{\tens{C}}}
\def\tD{{\tens{D}}}
\def\tE{{\tens{E}}}
\def\tF{{\tens{F}}}
\def\tG{{\tens{G}}}
\def\tH{{\tens{H}}}
\def\tI{{\tens{I}}}
\def\tJ{{\tens{J}}}
\def\tK{{\tens{K}}}
\def\tL{{\tens{L}}}
\def\tM{{\tens{M}}}
\def\tN{{\tens{N}}}
\def\tO{{\tens{O}}}
\def\tP{{\tens{P}}}
\def\tQ{{\tens{Q}}}
\def\tR{{\tens{R}}}
\def\tS{{\tens{S}}}
\def\tT{{\tens{T}}}
\def\tU{{\tens{U}}}
\def\tV{{\tens{V}}}
\def\tW{{\tens{W}}}
\def\tX{{\tens{X}}}
\def\tY{{\tens{Y}}}
\def\tZ{{\tens{Z}}}


% Graph
\def\gA{{\mathcal{A}}}
\def\gB{{\mathcal{B}}}
\def\gC{{\mathcal{C}}}
\def\gD{{\mathcal{D}}}
\def\gE{{\mathcal{E}}}
\def\gF{{\mathcal{F}}}
\def\gG{{\mathcal{G}}}
\def\gH{{\mathcal{H}}}
\def\gI{{\mathcal{I}}}
\def\gJ{{\mathcal{J}}}
\def\gK{{\mathcal{K}}}
\def\gL{{\mathcal{L}}}
\def\gM{{\mathcal{M}}}
\def\gN{{\mathcal{N}}}
\def\gO{{\mathcal{O}}}
\def\gP{{\mathcal{P}}}
\def\gQ{{\mathcal{Q}}}
\def\gR{{\mathcal{R}}}
\def\gS{{\mathcal{S}}}
\def\gT{{\mathcal{T}}}
\def\gU{{\mathcal{U}}}
\def\gV{{\mathcal{V}}}
\def\gW{{\mathcal{W}}}
\def\gX{{\mathcal{X}}}
\def\gY{{\mathcal{Y}}}
\def\gZ{{\mathcal{Z}}}

% Sets
\def\sA{{\mathbb{A}}}
\def\sB{{\mathbb{B}}}
\def\sC{{\mathbb{C}}}
\def\sD{{\mathbb{D}}}
% Don't use a set called E, because this would be the same as our symbol
% for expectation.
\def\sF{{\mathbb{F}}}
\def\sG{{\mathbb{G}}}
\def\sH{{\mathbb{H}}}
\def\sI{{\mathbb{I}}}
\def\sJ{{\mathbb{J}}}
\def\sK{{\mathbb{K}}}
\def\sL{{\mathbb{L}}}
\def\sM{{\mathbb{M}}}
\def\sN{{\mathbb{N}}}
\def\sO{{\mathbb{O}}}
\def\sP{{\mathbb{P}}}
\def\sQ{{\mathbb{Q}}}
\def\sR{{\mathbb{R}}}
\def\sS{{\mathbb{S}}}
\def\sT{{\mathbb{T}}}
\def\sU{{\mathbb{U}}}
\def\sV{{\mathbb{V}}}
\def\sW{{\mathbb{W}}}
\def\sX{{\mathbb{X}}}
\def\sY{{\mathbb{Y}}}
\def\sZ{{\mathbb{Z}}}

% Entries of a matrix
\def\emLambda{{\Lambda}}
\def\emA{{A}}
\def\emB{{B}}
\def\emC{{C}}
\def\emD{{D}}
\def\emE{{E}}
\def\emF{{F}}
\def\emG{{G}}
\def\emH{{H}}
\def\emI{{I}}
\def\emJ{{J}}
\def\emK{{K}}
\def\emL{{L}}
\def\emM{{M}}
\def\emN{{N}}
\def\emO{{O}}
\def\emP{{P}}
\def\emQ{{Q}}
\def\emR{{R}}
\def\emS{{S}}
\def\emT{{T}}
\def\emU{{U}}
\def\emV{{V}}
\def\emW{{W}}
\def\emX{{X}}
\def\emY{{Y}}
\def\emZ{{Z}}
\def\emSigma{{\Sigma}}

% entries of a tensor
% Same font as tensor, without \bm wrapper
\newcommand{\etens}[1]{\mathsfit{#1}}
\def\etLambda{{\etens{\Lambda}}}
\def\etA{{\etens{A}}}
\def\etB{{\etens{B}}}
\def\etC{{\etens{C}}}
\def\etD{{\etens{D}}}
\def\etE{{\etens{E}}}
\def\etF{{\etens{F}}}
\def\etG{{\etens{G}}}
\def\etH{{\etens{H}}}
\def\etI{{\etens{I}}}
\def\etJ{{\etens{J}}}
\def\etK{{\etens{K}}}
\def\etL{{\etens{L}}}
\def\etM{{\etens{M}}}
\def\etN{{\etens{N}}}
\def\etO{{\etens{O}}}
\def\etP{{\etens{P}}}
\def\etQ{{\etens{Q}}}
\def\etR{{\etens{R}}}
\def\etS{{\etens{S}}}
\def\etT{{\etens{T}}}
\def\etU{{\etens{U}}}
\def\etV{{\etens{V}}}
\def\etW{{\etens{W}}}
\def\etX{{\etens{X}}}
\def\etY{{\etens{Y}}}
\def\etZ{{\etens{Z}}}

% Distribution symbols. The subscripts are descriptive labels, so they are set
% upright with \mathrm (the old \rm font switch is obsolete in LaTeX2e).
% The true underlying data generating distribution
\newcommand{\pdata}{p_{\mathrm{data}}}
% The empirical distribution defined by the training set
\newcommand{\ptrain}{\hat{p}_{\mathrm{data}}}
\newcommand{\Ptrain}{\hat{P}_{\mathrm{data}}}
% The model distribution
\newcommand{\pmodel}{p_{\mathrm{model}}}
\newcommand{\Pmodel}{P_{\mathrm{model}}}
\newcommand{\ptildemodel}{\tilde{p}_{\mathrm{model}}}
% Stochastic autoencoder distributions
\newcommand{\pencode}{p_{\mathrm{encoder}}}
\newcommand{\pdecode}{p_{\mathrm{decoder}}}
\newcommand{\precons}{p_{\mathrm{reconstruct}}}

\newcommand{\laplace}{\mathrm{Laplace}} % Laplace distribution

\newcommand{\E}{\mathbb{E}}
\newcommand{\Ls}{\mathcal{L}}
\newcommand{\R}{\mathbb{R}}
\newcommand{\emp}{\tilde{p}}
\newcommand{\lr}{\alpha}
\newcommand{\reg}{\lambda}
\newcommand{\rect}{\mathrm{rectifier}}
\newcommand{\softmax}{\mathrm{softmax}}
\newcommand{\sigmoid}{\sigma}
\newcommand{\softplus}{\zeta}
\newcommand{\KL}{D_{\mathrm{KL}}}
\newcommand{\Var}{\mathrm{Var}}
\newcommand{\standarderror}{\mathrm{SE}}
\newcommand{\Cov}{\mathrm{Cov}}
% Wolfram Mathworld says $L^2$ is for function spaces and $\ell^2$ is for vectors
% But then they seem to use $L^2$ for vectors throughout the site, and so does
% wikipedia.
\newcommand{\normlzero}{L^0}
\newcommand{\normlone}{L^1}
\newcommand{\normltwo}{L^2}
\newcommand{\normlp}{L^p}
\newcommand{\normmax}{L^\infty}

\newcommand{\parents}{Pa} % See usage in notation.tex. Chosen to match Daphne's book.

\DeclareMathOperator*{\argmax}{arg\,max}
\DeclareMathOperator*{\argmin}{arg\,min}

\DeclareMathOperator{\sign}{sign}
\DeclareMathOperator{\Tr}{Tr}
\DeclareMathOperator{\diag}{diag}
\let\ab\allowbreak


% \newtheorem{hypothesis}{Hypothesis}
% \newtheorem{theorem}{Theorem}
% \newtheorem{corollary}{Corollary}[theorem]
\newtheorem{proposition}{Proposition}
\newtheorem*{proposition*}{Proposition}
% \newtheorem{definition}{Definition}[theorem]
% \newtheorem{lemma}[theorem]{Lemma}
\newtheorem*{remark}{Remark}

% \rom{<number>}: typeset <number> as an upright, upper-case Roman numeral.
% \romannumeral produces lower-case letters; the \expandafter makes it expand
% before \uppercase processes the group, so the letters get capitalized.
\newcommand{\rom}[1]{%
  \textup{\uppercase\expandafter{\romannumeral#1}}%
}

\makeatletter
% \addFileDependency{<file>}: announce <file> (name including extension) as a
% file this document depends on.
% Every body line ends with % so the macro inserts no stray spaces, whatever
% mode it is called in.
\newcommand*{\addFileDependency}[1]{%
  \typeout{(#1)}% write the name, in parentheses, to the terminal and log
  \@addtofilelist{#1}% hand the name to LaTeX's internal file list
  \IfFileExists{#1}{}{\typeout{No file #1.}}% report a file that is missing
}
\makeatother

% \myexternaldocument{<basename>}: call \externaldocument on <basename> (so its
% labels can be referenced from this document), then register <basename>.tex
% and <basename>.aux through \addFileDependency.
\newcommand*{\myexternaldocument}[1]{%
    \externaldocument{#1}%
    \addFileDependency{#1.tex}%
    \addFileDependency{#1.aux}%
}

\myexternaldocument{shu_406}

\renewcommand{\thetable}{S\arabic{table}}   
\renewcommand{\thefigure}{S\arabic{figure}}
\renewcommand{\theequation}{S\arabic{equation}}
% \title{Going Beyond Neural Architecture Search via Sampling-Based Neural Ensemble Search}
\title{Neural Ensemble Search via Bayesian Sampling (Supplementary Material)}

% The standard author block has changed for UAI 2022 to provide
% more space for long author lists and allow for complex affiliations
%
% All author information is authomatically removed by the class for the
% anonymous submission version of your paper, so you can already add your
% information below.
%
% Add authors
% \author[1]{\href{mailto:<shuyao@comp.nus.edu.sg>?Subject=About Your UAI 2022 paper}{Yao Shu}{}}
\author[1]{Yao Shu}
\author[1]{Yizhou Chen}
\author[1]{Zhongxiang Dai}
\author[1]{Bryan Kian Hsiang Low}
% Add affiliations after the authors
\affil[1]{%
    Department of Computer Science\\
    National University of Singapore\\
    Singapore
}
  
\begin{document}
\maketitle

\appendix
\section{Proofs}\label{sec:proofs}

\begin{proposition*}{\textbf{\normalfont (Training fairness in the supernet.)}}\label{prop:fair}
Let $T$ and $T_{\gA}$ denote the numbers of steps applied to train the supernet and a candidate architecture $\gA$ in the search space of size $N$, respectively. By uniformly randomly sampling a single architecture from this search space for the model training in each step, we have
\begin{equation*}
    \begin{gathered}
        \normalfont\text{Pr}(\lim_{T\rightarrow\infty} T_{\gA_i} / T = \lim_{T\rightarrow\infty} T_{\gA_j} / T) = 1 \quad \forall i,j \in \{1, \cdots, N\} \ .
    \end{gathered}
    \end{equation*}
\end{proposition*}
\begin{proof}
Let random variable $X_i^t \in \{0,1\}$ denote the selection of candidate architecture $\gA_i$ at training step $t$ under our sampling scheme in the proposition above. For any $t>0$ and $i,j \in [N]$ with $i \neq j$, the random variable $X_i^t - X_j^t$ takes the following values with the corresponding probabilities (denoted by $p$):
\begin{equation}
\begin{gathered}
    X_i^t - X_j^t = 
    \left\{\begin{array}{rcl}
             +1, & & p = 1/N \\
             0, & & p = (N-2)/N \\
             -1, & & p = 1/N
    \end{array}\right. \ .
\end{gathered}
\end{equation}
Consequently, $\E[X_i^t - X_j^t] = 0$. According to the strong law of large numbers, we further have
\begin{equation}
\begin{gathered}
    \text{Pr}(\lim_{T\rightarrow\infty} T^{-1}\sum_{t=1}^T (X_i^t - X_j^t) = 0) = 1 \ .
\end{gathered}
\end{equation}
Note that 
\begin{equation}
\begin{gathered}
    T^{-1}(T_{\gA_i} - T_{\gA_j}) = T^{-1}\sum_{t=1}^T (X_i^t - X_j^t) \ .
\end{gathered}
\end{equation}
We thus can complete this proof by
\begin{equation}
\begin{aligned}
    &\text{Pr}(\lim_{T\rightarrow\infty} T^{-1}(T_{\gA_i} - T_{\gA_j}) = 0) = 1 \ .
\end{aligned}
\end{equation}
\end{proof}

\paragraph{Proof of Proposition \ref{prop:svgdrd-update}.}
As particles $\{\vx_i\}_{i=1}^n$ of size $n$ are applied to approximate the density $q$ in our SVGD-RD, the second term (i.e., the controllable diversity term) in our \eqref{eq:min-kl-max-div} can then be approximated using these particles as
\begin{equation}
\begin{aligned}
    n\delta \E_{\vx,\vx' \sim q} \left[k(\vx, \vx')\right] &\approx  \delta/n \sum_{i=1}^n\sum_{j=1}^n k(\vx_i, \vx_j) \triangleq \sum_{i=1}^n h(\vx_i) \ ,
\end{aligned}
\end{equation}
where $h(\vx)\triangleq\delta/n\sum_{j=1}^n k (\vx, \vx_j)$. We take $\vx_j$ in $k(\vx, \vx_j)$ as a constant for the approximation above. Consequently, we have
\begin{equation}\label{eq:equal-grad}
\begin{aligned}
    \nabla_{\vx_k}\sum_{i=1}^n h(\vx_i) = \nabla_{\vx_k}h(\vx_k) \ .
\end{aligned}
\end{equation}

Let $\vx_i^{+} \triangleq \vx_i + \epsilon\vphi^*(\vx_i)$ ($\forall i\in\{1,\cdots,n\}$) denote the functional gradient descent step in the RKHS $\gH$ to minimize the KL divergence term in our \eqref{eq:min-kl-max-div}. Based on \eqref{eq:equal-grad} above, given the proximal operator $\text{prox}_h(\vx^{+})=\argmin_{\vy} h(\vy)+1/2\|\vy-\vx^{+}\|_2^2$, by using the proximal gradient method \citep{proximal}, our \eqref{eq:min-kl-max-div} can then be optimized via the following update to each particle $\vx_i$:
\begin{equation}
    \vx_i \leftarrow \text{prox}_h(\vx_i^{+})=\argmin_{\vy} h(\vy)+1/2\|\vy-\vx_i^{+}\|_2^2 \ .
\end{equation}

According to the \textit{Karush-Kuhn-Tucker} (KKT) conditions, the local optimum $\vy^*$ of this proximal operator satisfies
\begin{equation}\label{eq:supp-prox-kkt}
\begin{aligned}
    \text{prox}_h(\vx_i^{+}) = \vy^* = \vx_i^{+} - \nabla_{\vy^*} h(\vy^*)\ .
\end{aligned}
\end{equation}
When $h(\cdot)$ is convex, this local optimum is also a global optimum. As \eqref{eq:supp-prox-kkt} is intractable to solve given a complex $h(\cdot)$, we approximate $h(\vy^*)$ with its first-order Taylor expansion, i.e., $h(\vy^*) \approx h(\vx_i) + \nabla_{\vx_i} h(\vx_i)(\vy^* - \vx_i)$, and obtain the following approximation:
\begin{equation}
\begin{aligned}
    \text{prox}_h(\vx_i^{+}) &\approx \vx_i^{+} - \nabla_{\vx_i} h(\vx_i) \\
    &\approx \vx_i + \epsilon\vphi^*(\vx_i) - \nabla_{\vx_i} h(\vx_i) \\
    &\approx \vx_i + \epsilon\vphi^*(\vx_i) - \delta/n \textstyle\sum_{j=1}^n \nabla_{\vx_i} k(\vx_i, \vx_j) \ .
\end{aligned}
\end{equation}

Given the approximation $\vphi^*(\vx_i) \approx \widehat{\vphi}^*(\vx_i)$ and the definition of $\widehat{\vphi}^*(\vx_i)$ in \eqref{eq:svgd-approx}, we complete our proof by
\begin{equation}
\begin{aligned}
    \vx_i \leftarrow \vx_i &+ 1/n \textstyle\sum_{j=1}^n k(\vx_j, \vx_i)\nabla_{\vx_j}\log p(\vx_j) \\
    & + \nabla_{\vx_j}k(\vx_j, \vx_i) - \delta \nabla_{\vx_i} k(\vx_j, \vx_i) \ .
\end{aligned}
\end{equation}

\paragraph{Proof of Proposition \ref{prop:exploitation}.}
Notably, since $k(\vx,\vx')=c$ when $\vx=\vx'$, we will achieve a constant $k(\vx, \vx)$ for any particle $\vx$ in the case of $n=1$, which can be ignored in our SVGD-RD for any $\delta \in \mathbb{R}$. In light of this, our SVGD-RD in the case of $n=1$ degenerates into standard SVGD. Consequently, to prove Proposition \ref{prop:exploitation}, we only need to consider SVGD in the case of $n=1$.

Considering SVGD in the case of $n=1$, we can frame the density $q$ represented by a single particle $\vx'$ as 
\begin{equation}
\begin{aligned}
    q(\vx) = 
    \left\{\begin{array}{rcl}
             1 & & \vx = \vx' \\
             0 & & \vx \neq \vx'
    \end{array}\right. \ .
\end{aligned}
\end{equation}
The KL divergence between $q(\vx)$ and the target density $p(\vx)$ can then be simplified as
\begin{equation}
\begin{aligned}
    \text{KL}(q \| p) &= \E_{q(\vx)}[\log(q(\vx)/p(\vx))] = -\log p(\vx') \ .
\end{aligned}
\end{equation}
Finally, standard SVGD in the case of $n=1$ obtains its optimal particle by solving the following problem:
\begin{equation}
\begin{aligned}
    q^* &= \argmin_q \text{KL}(q \| p) \\
    & = \argmin_{\vx'} \{-\log p(\vx')\} \\
    & = \argmax_{\vx'} p(\vx') \ ,
\end{aligned}
\end{equation}
which finally concludes the proof.

\begin{remark}
\emph{
In practice, this $k(\vx,\vx)=c$ can be well satisfied, such as the radial basis function (RBF) kernel that we have applied in our experiments.
}
\end{remark}

\section{Experimental Settings}\label{sec:setting-exp}
\subsection{The DARTS Search Space}\label{sec:setting-search-space}
In the DARTS \citep{darts} search space, each candidate architecture consists of a stack of $L$ cells, each of which can be represented as a directed acyclic graph (DAG) of $N$ nodes denoted by $\{z_0, z_1, \dots, z_{N-1}\}$. Among these $N$ nodes in a cell, $z_0$ and $z_1$ denote the input nodes produced by two preceding cells, and $z_{N-1}$ denotes the output of a cell, which is the concatenation of all intermediate nodes, i.e., from $z_2$ to $z_{N-2}$. As in the work of \citet{darts}, to select the best-performing architectures, we need to select their corresponding cells, including the normal and reduction cell. We refer to the DARTS paper for more details.
In practice, this search space is conventionally represented as a supernet stacked by 8 cells (6 normal cells and 2 reduction cells) with initial channels of 16.

\subsection{Model Training of Supernet}\label{sec:setting-one-shot}
Following \citet{pc-darts}, we apply a partial channel connection with $K=2$ in the model training of the supernet, which allows us to accelerate this model training and reduce its GPU memory consumption. We split the standard training dataset of CIFAR-10 into two parts in our ensemble search: 70\% randomly sampled data is used in the model training of the supernet, and the rest is used to obtain the posterior distribution of neural architectures in Sec.~\ref{sec:posterior} and also the final selected ensembles in Sec.~\ref{sec:sampling}. To achieve not only a fair but also a sufficient model training for every candidate architecture, we apply \textit{stochastic gradient descent} (SGD) for 50 epochs with a learning rate cosine scheduled from 0.1 to 0, momentum 0.9, weight decay $3\times10^{-4}$ and batch size 128 in the model training of the supernet, where only a single candidate architecture is uniformly randomly sampled from this supernet in every training step.

\subsection{Posterior Distribution}\label{sec:setting-posterior}
\paragraph{Variational posterior distribution.}
Following \citet{snas}, the variational posterior distribution of architectures is represented as $p_{\valpha}(\gA)$ parameterized by $\valpha$. Specifically, within the search space described in our Appendix~\ref{sec:setting-search-space}, each intermediate node $z_i$ is the output of one selected operation $o \sim p_{\valpha_i}(o)$ with $o \in \gO$ using the inputs from its preceding nodes or cells, where $\gO$ is a predefined operation set for our search. Given $\valpha_i=(\alpha_i^{o_1} \cdots \alpha_i^{o_{|\gO|}})$, $p_{\valpha_i}(o)$ can be represented as
\begin{equation}\label{eq:discrete-dist}
    p_{\valpha_i}(o) = \frac{\exp(\alpha_i^o/\tau)}{\sum_{o' \in \gO}\exp(\alpha_i^{o'}/\tau)} \ ,
\end{equation}
where $\tau$ denotes the softmax temperature, which is usually set to be 1 in practice. Based on this defined probability for each intermediate node $z_i$, our variational posterior distribution can be framed as
\begin{equation}
    p_{\valpha}(\gA) = \prod_{i=2}^{N-2}p_{\valpha_i}(o) \ .
\end{equation}
More precisely, this representation is applied for single-path architecture with identical cells. We use it to ease our representation. For double-path architectures consisting of two different cells (i.e., normal and reduction cell), e.g., the candidate architecture in the DARTS search space, a similar representation can be obtained.

\paragraph{Optimization details.}
To optimize \eqref{eq:elbo}, we firstly relax our variational posterior distribution to be differentiable using the Straight-Through (ST) Gumbel-Softmax \citep{concrete, gumbel-softmax} with the reparameterization trick. More precisely, we propose a variant of ST Gumbel-Softmax outputting the double-path architectures in the DARTS search space. Then, we use stochastic gradient-based algorithms to optimize \eqref{eq:elbo} efficiently. In each optimization step, we sample one neural architecture from the distribution $p_{\valpha}(\gA)$ to estimate $\E_{\gA \sim p_{\valpha}(\gA)}\left[\log p(\gD|\gA)\right]$ (i.e., the commonly used Cross-Entropy loss). In practice, we use Adam \citep{adam} with learning rate 0.01, $\beta_1=0.9$, $\beta_2=0.999$ and weight decay $3\times10^{-4}$ to update our variational posterior distribution $p_{\valpha}(\gA)$ for 20 epochs. 

\subsection{SVGD of Regularized Diversity}\label{sec:setting-svgd}
\paragraph{Continuous relaxation of variational posterior distribution.}
Notably, SVGD \citep{svgd} and also our SVGD-RD are designed for continuous distributions. Unfortunately, the variational posterior distribution $p_{\valpha}(\gA)$ is discrete due to a discrete search space. To apply SVGD-RD, we firstly relax this discrete posterior into its continuous counterpart using a mixture of Gaussian distributions. Specifically, we represent each operation $o\in\gO$ in \eqref{eq:discrete-dist} as a one-hot vector $\vh_o$. By introducing the random variable $\vo_i\in\mathbb{R}^{|\gO|}$ and multi-variate normal distribution $\gN(\vo_i | \vh_o, \Sigma)$ into our relaxation, our relaxed posterior distribution of neural architectures can be framed as
\begin{equation}\label{eq:relax-dist}
\begin{gathered}
    \widehat{p}_{\valpha}(\gA) = \prod_{i=2}^{N-2} 1/Z_i\sum_{o \in \gO} p_{\valpha_i}(o) \gN(\vo_i | \vh_o, \Sigma) \ ,
\end{gathered}
\end{equation}
where $Z_i$ denotes the normalization constant. Given the sampled particle $\vx^*=(\cdots \vo_i^* \cdots)$ in SVGD-RD, the final selected architecture can then be derived using the determination of each selected operation $o_i^*$, i.e., 
\begin{equation}
    o_i^* = \argmin_{o \in \gO} \|\vo_i^* - \vh_o\|_2 \ .
\end{equation}

\paragraph{Optimization details.} 
Since \citet{svgd} have demonstrated that SVGD is able to handle unnormalized target distributions, the normalization constant in \eqref{eq:relax-dist} can then be ignored in our SVGD-RD algorithm. In practice, the covariance matrix $\Sigma$ in \eqref{eq:relax-dist} is set to an identity matrix scaled by $|\gO|$. Besides, the parameter $\delta$ is optimized as a hyper-parameter via grid search or Bayesian Optimization \citep{bo} within the range of $[-2, 1]$ in practice. To obtain well-performing particles in our SVGD-RD algorithms efficiently, we apply SGD using the gradient provided in Sec. \ref{sec:svgd-rd} with a \textit{radial basis function} (RBF) kernel on randomly initialized particles for $L{=}1000$ iterations under a learning rate of 0.1 and a momentum of 0.9.

\subsection{Evaluation on Benchmark Datasets}\label{sec:setting-training}
\paragraph{Evaluation on CIFAR-10/100.} 
We apply the same constructions in DARTS \citep{darts} for our final performance evaluation on CIFAR-10/100: The final selected architectures consist of 20 cells, and 18 of them are identical normal cells, with the rest being the identical reduction cell. An auxiliary tower with a weight of 0.4 is located at the 13-th cell of the final selected architectures. The final selected architecture is then trained using stochastic gradient descent (SGD) for 600 epochs with a learning rate cosine scheduled from 0.025 to 0, momentum 0.9, weight decay $3\times10^{-4}$, batch size 96 and initial channels 36. Cutout \citep{cutout}, and a scheduled DropPath, i.e., linearly decayed from 0.2 to 0, are employed to achieve SOTA generalization performance.

\paragraph{Evaluation on ImageNet.}
Following \citet{darts}, the architectures evaluated on ImageNet consist of 14 cells (12 identical normal cells and 2 identical reduction cells). To meet the requirement of evaluation under the mobile setting (less than 600M multiply-add operations), the number of initial channels for final selected architectures is conventionally set to 44. We adopt the training enhancements in \citep{darts,p-darts,sdarts}, including an auxiliary tower of weight 0.4 and label smoothing. Following P-DARTS \citep{p-darts} and SDARTS-ADV \citep{sdarts}, we train the selected architectures from scratch for 250 epochs using a batch size of 1024 on 8 GPUs, SGD optimizer with a momentum of 0.9 and a weight decay of $3\times10^{-5}$. The learning rate applied in this training is warmed up to 0.5 for the first 5 epochs and then decreased to zero linearly.

\subsection{Adversarial Defense}\label{sec:setting-adversarial}
An adversarial attack intends to find a small change for each input such that this input with its corresponding small change will be misclassified by a model. As ensemble is known to be a possible defense against such adversarial attacks \citep{ensemble-for-defense}, we also examine the effectiveness of our NESBS algorithm by comparing the model robustness achieved by our algorithms to other ensemble and ensemble search algorithms under various benchmark adversarial attacks. To the best of our knowledge, we are the first to examine the advantages of ensemble search algorithms in defending against adversarial attacks.

In this experiment, two processes are required, i.e., \emph{attack} and \emph{defense}. The \emph{attack} process is a typical white-box attack scenario: Only a single model (randomly sampled from an ensemble) is attacked by an attacker, and this process will be repeated for $n$ rounds given an ensemble of size $n$ in order to accurately measure the improvement of model robustness induced by an ensemble. In each round, a different model from this ensemble is selected to be attacked.
The \emph{defense} process is then applied using neural network ensembles, i.e., neural network ensembles will make predictions based on those perturbed images produced by the aforementioned attacker. Corresponding to the \emph{attack} process, we also need to repeat this defense process for $n$ rounds. In fact, such an adversarial defense setting is reasonably practical when only a single model from an ensemble is required to be publicly available for model producers.

We apply the following attacks in our experiment: The \emph{Fast Gradient Sign Method} (FGSM) attack \cite{fgsm}, the \emph{Projected Gradient Descent} (PGD) attack \cite{pgd}, the \emph{Carlini--Wagner} (CW) attack \cite{cw} and AutoAttack~\citep{autoattack}. In both the FGSM attack and the PGD attack, we impose an $L_\infty$-norm constraint of $0.01$. The step size and the number of iterations in the PGD attack are set to $0.008$ and $40$, respectively. We adopt the same configurations of the CW attack under an $L_2$-norm constraint as in \citep{cw}: We set the confidence constant, the range of constant $c$, the number of binary search steps, and the maximum number of optimization steps to $0$, $[0.001,10]$, $3$, and $50$, respectively; we then adopt the Adam \citep{adam} optimizer with learning rate $0.01$ and $\beta_1=0.9$, $\beta_2=0.999$ in its search process. Besides, we adopt the same configuration of AutoAttack from \citep{autoattack}.

\section{Complementary Results}

\subsection{Ensemble Performance Estimation}\label{sec:exp-oneshot}
\begin{table}
\renewcommand\multirowsetup{\centering}
\centering
\begin{tabular}{lcccc}
\toprule
\textbf{Metric} & 
\textbf{$n=1$} & 
\textbf{$n=3$} & 
\textbf{$n=5$} & 
\textbf{$n=7$} \\
\midrule 
Spearman & 0.65 & 0.33 & 0.40 & $-$0.12 \\
Pearson & 0.82 & 0.45 & 0.45 & $-$0.16 \\
Agreement-30\% & 33\% & 20\% & 31\% & 25\% \\
\bottomrule
\end{tabular}
\caption{The correlation between the estimated and true performances of candidate architectures and their ensembles in the DARTS search space on CIFAR-10.}
\label{tab:rank}
\end{table}


As shown in Sec.~\ref{sec:oneshot}, we apply the model parameters inherited from a trained supernet to estimate the performance of candidate architectures as well as their ensembles in our NESBS algorithm. We therefore use the following three metrics to measure the effectiveness of such estimation in the DARTS search space: Spearman's rank-order correlation coefficient between the estimated and true performances, the Pearson correlation coefficient between the estimated and true performances, and the percentage of architectures achieving both Top-$k$ estimated performance and Top-$k$ true performance (named the Agreement-$k$). Since the evaluation of the true performances is prohibitively costly, we randomly sample 10 architectures of diverse estimated performances from the DARTS search space for this experiment. Notably, based on these 10 architectures, there are hundreds of possible ensembles under the ensemble sizes of 3, 5, and 7, which we believe is sufficiently large to validate the effectiveness of our performance estimations. To obtain the true performance of candidate architectures as well as their ensembles, we train these architectures independently for 100 epochs following the settings in Appendix~\ref{sec:setting-training}.

Table \ref{tab:rank} summarizes the results. Notably, the estimated and true performances are shown to be positively correlated in the case of $n{=}1,3,5$ by achieving relatively high Spearman and Pearson coefficients as well as a  high agreement in these cases. Although the coefficients are low when the ensemble size is larger (i.e., $n{=}7$), the estimated and true performances are still capable of achieving a reasonably good agreement in this case. 
Based on these results, we argue that our estimated ensemble performance is informative and effective for our ensemble search. This effectiveness can also be supported by the competitive search results achieved by our NESBS in Sec. \ref{sec:darts}.

\begin{figure}[t]
\centering
\resizebox{0.75\columnwidth}{!}{
\begin{tabular}{cc}
    \includegraphics[width=0.36\textwidth]{figs/discrepancy.pdf}
\end{tabular}
}
\caption{The comparison of performance discrepancy with the post-training and best-response posterior distribution on CIFAR-10. This performance discrepancy is measured by the gap of test error between the best-performing architecture (i.e., the architecture with the smallest test error) and the maximal-probability architecture (i.e., the architecture with the largest probability in the corresponding posterior distribution) in the DARTS search space.}
\label{fig:best_vs_post}
\end{figure}

\subsection{Post-training vs.~Best-response Posterior Distribution}\label{sec:exp-best_vs_post}
To examine the advantages of our post-training posterior distribution, we compare it with its best-response counterpart applied in \citep{gdas, snas}. While our post-training posterior distribution is obtained \emph{after} the model training of the supernet, the best-response posterior distribution is updated \emph{during} the model training of the supernet. We refer to \citep{gdas, snas} for more details about this best-response posterior distribution. We follow the optimization details in Appendix \ref{sec:setting-one-shot} and \ref{sec:setting-posterior} to obtain these two posterior distributions.

\paragraph{More accurate characterization of single-model performances using post-training posterior distribution.}
We first compare the characterization of single-model performance using these two posterior distributions by examining the performance discrepancy between their best-performing architecture (i.e., the architecture achieving the smallest test error) and maximal-probability architecture (i.e., the architecture achieving the largest probability in the corresponding posterior distribution) in the search space. In this experiment, the performance discrepancy is measured by the gap of test error achieved by the best-performing architecture and the maximal-probability architecture using the model parameters inherited from the supernet.

Figure~\ref{fig:best_vs_post} illustrates the comparison. The results show that our post-training posterior distribution enjoys a smaller performance discrepancy, suggesting that our post-training posterior distribution is able to provide a more accurate characterization of the single-model performances. Interestingly, the best-response counterpart contributes to the best-performing architecture with a lower test error than our post-training posterior distribution, which should result from the Matthew Effect as justified in \citep{dropnas}. Specifically, well-performing architectures contribute to the frequent selections of these architectures for their model training during the optimization of the best-response posterior distribution. This will finally result in unfair model training in the search space and therefore the inaccurate characterization of single-model performances. Notably, we need a more accurate characterization of single-model performance in this paper, as shown in Sec.~\ref{sec:posterior}. Therefore, our post-training posterior distribution should be more suitable than its best-response counterpart in our ensemble search.

\paragraph{Improved performance of selected ensembles using post-training posterior distribution.}
We then compare the final ensemble test performance achieved by our NESBS algorithm using the post-training posterior distribution and its best-response counterpart on CIFAR-10 with the ensemble size of $n=3$. To obtain the final ensemble performance, we train each architecture in an ensemble for 100 epochs following the settings in Appendix~\ref{sec:setting-training}. Table~\ref{tab:best_vs_post} summarizes the results. Notably, our post-training posterior distribution is shown to be capable of contributing to a better ensemble performance than its best-response counterpart, which further demonstrates the advantages of applying the post-training posterior distribution in our ensemble search.


\begin{table}[t]
\renewcommand\multirowsetup{\centering}
\centering
\resizebox{\columnwidth}{!}{
\begin{tabular}{lcc}
\toprule
\textbf{Method} & 
\textbf{Best-response} & 
\textbf{Post-training} \\
\midrule 
NESBS (MC Sampling) & 4.74 & \textbf{4.54}$_{\Delta{=}0.20}$ \\
NESBS (SVGD-RD) & 4.81 & \textbf{4.48}$_{\Delta{=}0.33}$\\
\bottomrule
\end{tabular}
}
\caption{The comparison of true ensemble test error (\%) on CIFAR-10 achieved by our NESBS algorithm using the post-training posterior distribution and its best-response counterpart with an ensemble size of $n{=}3$. We use $\Delta$ to denote the improved generalization performance achieved by our post-training posterior distribution.}
\label{tab:best_vs_post}
\end{table}

\subsection{Effectiveness and Efficiency}\label{sec:efficient-and-effective}

\begin{figure}[t]
\centering
\includegraphics[width=\columnwidth]{figs/sample_efficiency.pdf}
\caption{The comparison of search effectiveness (test error of ensembles in the $y$-axis) and efficiency (evaluation budget in the $x$-axis) for different ensemble search algorithms under varying ensemble size $n$. The single best baseline refers to the single best architecture achieving the lowest test error in the search space. The $y$-axis is shown in log-scale to ease visualization. Note that the test error for each algorithm is reported with the mean and standard error of five independent trials.
}
\label{fig:search-efficiency}
\end{figure}

As justified in Sec.~\ref{sec:sampling}, both our MC Sampling and SVGD-RD algorithms can sample neural architectures with competitive single-model performances and diverse model predictions, which are known to be the criteria for well-performing ensembles \citep{zhou-ensemble}.
To further demonstrate that our algorithms are capable of selecting well-performing ensembles effectively and efficiently based on this sampling property,
we compare our NESBS algorithm, including NESBS (MC Sampling) and NESBS (SVGD-RD), with the following ensemble search baselines on CIFAR-10 \citep{cifar} in the DARTS \citep{darts} search space: (a) Uniform random sampling which we refer to as URS, and (b) NES-RS \citep{nes}. That is, we only replace the Bayesian sampling in our NESBS algorithm with these two different sampling/selection algorithms in this experiment and we keep using the model parameters inherited from a supernet to estimate the single-model and ensemble performances of architectures (including the test errors).
The detailed experimental settings are in Appendix \ref{sec:setting-exp}.

Figure \ref{fig:search-efficiency} illustrates the search results. Note that both NES-RS and our NESBS are able to achieve lower test errors than the single best-performing architecture in the search space. These results therefore demonstrate that these two ensemble search algorithms are indeed capable of achieving improved performance over conventional NAS algorithms that select only one single architecture from the search space. More importantly, given the same evaluation budgets, our NESBS algorithm consistently achieves lower test errors than URS and NES-RS, indicating the superior search effectiveness achieved by our NESBS algorithm. 
Meanwhile, our NESBS algorithm requires fewer evaluation budgets than URS and NES-RS to achieve comparable test errors, which also suggests that our algorithm is more efficient than URS and NES-RS.
Interestingly, compared with MC Sampling, SVGD-RD can consistently produce improved search effectiveness and efficiency, which likely results from its controllable trade-off between the single-model performances and the diverse model predictions as justified in Sec.~\ref{sec:sampling}.
Overall, these results have well justified the effectiveness and efficiency of our NESBS algorithm.


\begin{figure}[t]
\centering
\includegraphics[width=\columnwidth]{figs/sample_efficiency_tau.pdf}
\caption{
The comparison of search effectiveness (test error of ensembles in the $y$-axis) and efficiency (evaluation budget in the $x$-axis) between our NESBS (MC Sampling) and NESBS (SVGD-RD) algorithm under varying softmax temperature $\tau$. The single best baseline refers to the single best architecture achieving the lowest test error in the search space. Each test error is reported with the mean and standard error of five independent trials.}
\label{fig:robust-tau}
\end{figure}


\begin{figure}[t]
\centering
\includegraphics[width=\columnwidth]{figs/diversity_test.pdf}
\caption{The comparison of ensemble test error achieved by our NESBS (SVGD-RD) algorithm with varying $\delta$ under different softmax temperature $\tau$ given an ensemble size of $n=5$. We use $\delta^*$ to denote the optimal $\delta$ we obtained in our SVGD-RD algorithm under different temperature $\tau$. The test error for each $\delta$ is reported with the mean and standard error of five independent trials.}
\label{fig:optimal-diversity}
\end{figure}

\subsection{The Advantages of Controllable Diversity in SVGD-RD}\label{sec:exp-optimal-diversity}

To examine the advantages of controllable diversity in our SVGD-RD, we first compare the search effectiveness and efficiency achieved by our NESBS (MC Sampling) and NESBS (SVGD-RD) algorithms with varying softmax temperature $\tau$ (appearing in \eqref{eq:discrete-dist}). A larger temperature $\tau$ will lead to a flatter posterior distribution and hence degrade its capability of characterizing single-model performances of neural architectures as indicated in \eqref{eq:discrete-dist}. We use these posterior distributions with varying temperature $\tau$ to simulate the possible posterior distributions we may obtain in practice. Figure~\ref{fig:robust-tau} illustrates the comparison on CIFAR-10 in the DARTS search space with an ensemble size of $n=5$. Notably, our NESBS (SVGD-RD) with controllable diversity can consistently achieve better search effectiveness and efficiency than our NESBS (MC Sampling). Interestingly, this improvement becomes larger in the cases of $\tau=0.1$ and $\tau=10.0$, which should be the consequence of the poor exploration and the poor exploitation achieved by our NESBS (MC Sampling), respectively. These results therefore suggest that the controllable diversity in our SVGD-RD can generally lead to better search effectiveness and efficiency than our NESBS (MC Sampling).


We further provide the comparison of ensemble test error achieved by our SVGD-RD with varying $\delta$ under different softmax temperature $\tau$ in Figure~\ref{fig:optimal-diversity}. Notably, when the posterior distribution tends to be flatter (i.e., $\tau=10$), a smaller $\delta$ is preferred by our SVGD-RD in order to sample architectures with better single-model performances while maintaining the compelling diverse model predictions. Meanwhile, when this posterior distribution tends to be sharper (i.e., $\tau=0.1$), a larger $\delta$ is preferred by our SVGD-RD in order to sample architectures with more diverse model predictions while preserving the competitive single-model performances. Based on this controllable diversity and hence the controllable trade-off between the single-model performances and the diverse model predictions, our SVGD-RD is thus capable of achieving comparable performances under varying $\tau$, which usually improve over those of our NESBS (MC Sampling), as can be seen by comparing them with the results in Figure~\ref{fig:robust-tau}. These results further validate the advantages of the controllable diversity in our SVGD-RD.

\end{document}
