% \documentclass{uai2022} % for initial submission
\documentclass[accepted]{uai2022} % after acceptance, for a revised
% version; also before submission to
% see how the non-anonymous paper
% would look like
%% There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2022} % ptmx math instead of Computer
% Modern (has noticable issues)
% \documentclass[mathfont=newtx]{uai2022} % newtx fonts (improves upon
% ptmx; less tested, no support)
% NOTE: Only keep *one* line above as appropriate, as it will be replaced
%       automatically for papers to be published. Do not make any other
%       change above this note for an accepted version.

%% Choose your variant of English; be consistent
\usepackage[american]{babel}
% \usepackage[british]{babel}

%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
% \usepackage{mathtools} % amsmath with fixes and additions
% \usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{tikz} % nice language for creating drawings and diagrams

%% Provided macros
% \smaller: Because the class footnote size is essentially LaTeX's \small,
%           redefining \footnotesize, we provide the original \footnotesize
%           using this macro.
%           (Use only sparingly, e.g., in drawings, as it is quite small.)

%% Self-defined macros
\usepackage{xr}
\newcommand{\swap}[3][-]{#3#1#2} % just an example
\usepackage{amsfonts}   
\usepackage{amsmath}
\usepackage{amssymb}
\usepackage{mathtools}
\usepackage{amsthm}
\usepackage{thmtools}
\usepackage{threeparttable}
\usepackage{multirow}
\usepackage{algorithm}
\usepackage{algorithmic}

\usepackage{caption}
\usepackage{subcaption}

%%%%% NEW MATH DEFINITIONS %%%%%

\usepackage{amsmath,amsfonts,bm}

% Mark sections of captions for referring to divisions of figures.
% Each macro expands to an emphasized, parenthesized tag naming one region
% (Left/Center/Right/Top/Bottom) or one sub-panel ((a)--(d)) of a figure.
% \emph is used instead of the obsolete {\em ...} switch; it toggles
% emphasis the same way and additionally inserts italic correction.
\newcommand{\figleft}{\emph{(Left)}}
\newcommand{\figcenter}{\emph{(Center)}}
\newcommand{\figright}{\emph{(Right)}}
\newcommand{\figtop}{\emph{(Top)}}
\newcommand{\figbottom}{\emph{(Bottom)}}
\newcommand{\captiona}{\emph{(a)}}
\newcommand{\captionb}{\emph{(b)}}
\newcommand{\captionc}{\emph{(c)}}
\newcommand{\captiond}{\emph{(d)}}

% Highlight a newly defined term: typesets the argument in bold.
% \textbf replaces the obsolete two-letter {\bf ...} font switch, which is
% not provided by every document class.
\newcommand{\newterm}[1]{\textbf{#1}}


% Cross-reference helpers. Each takes label name(s) and prints the object
% name followed by \ref. A tie (~) is placed before a \ref that follows a
% word so that the number is never separated from it by a line break.

% Figure reference, lower-case.
\def\figref#1{fig.~\ref{#1}}
% Figure reference, capital. For start of sentence
\def\Figref#1{Fig.~\ref{#1}}
% Reference to two figures.
\def\twofigref#1#2{figures~\ref{#1} and~\ref{#2}}
% Reference to four figures.
\def\quadfigref#1#2#3#4{figures~\ref{#1}, \ref{#2}, \ref{#3} and~\ref{#4}}
% Section reference, lower-case.
\def\secref#1{section~\ref{#1}}
% Section reference, capital.
\def\Secref#1{Section~\ref{#1}}
% Reference to two sections.
\def\twosecrefs#1#2{sections~\ref{#1} and~\ref{#2}}
% Reference to three sections.
\def\secrefs#1#2#3{sections~\ref{#1}, \ref{#2} and~\ref{#3}}
% Reference to an equation, lower-case.
% (Kept disabled: amsmath, which this preamble loads, already provides \eqref.)
% \def\eqref#1{equation~\ref{#1}}
% Reference to an equation, upper case
\def\Eqref#1{Equation~\ref{#1}}
% A raw reference to an equation---avoid using if possible
\def\plaineqref#1{\ref{#1}}
% Reference to a chapter, lower-case.
\def\chapref#1{chapter~\ref{#1}}
% Reference to a chapter, upper case.
\def\Chapref#1{Chapter~\ref{#1}}
% Reference to a range of chapters
% (the tie after "chapters" was missing, which printed e.g. "chapters1--3").
\def\rangechapref#1#2{chapters~\ref{#1}--\ref{#2}}
% Reference to an algorithm, lower-case.
\def\algref#1{algorithm~\ref{#1}}
% Reference to an algorithm, upper case.
\def\Algref#1{Algorithm~\ref{#1}}
% Reference to two algorithms, lower-case / upper case.
\def\twoalgref#1#2{algorithms~\ref{#1} and~\ref{#2}}
\def\Twoalgref#1#2{Algorithms~\ref{#1} and~\ref{#2}}
% Reference to a part, lower case
\def\partref#1{part~\ref{#1}}
% Reference to a part, upper case
\def\Partref#1{Part~\ref{#1}}
% Reference to two parts.
\def\twopartref#1#2{parts~\ref{#1} and~\ref{#2}}

% Ceiling and floor of the argument.
\def\ceil#1{\lceil #1 \rceil}
\def\floor#1{\lfloor #1 \rfloor}
% Bold digit one (uses \bm from the bm package).
\def\1{\bm{1}}
% Dataset symbols: calligraphic D, with an upright label as subscript for the
% validation and test variants. The subscript is attached outside the
% \mathcal argument (only the letter D is meant to be calligraphic); the
% extra outer braces keep the whole symbol a single group, so it can still be
% used directly as a sub-/superscript or carry its own superscript.
\newcommand{\train}{\mathcal{D}}
\newcommand{\valid}{{\mathcal{D}_{\mathrm{valid}}}}
\newcommand{\test}{{\mathcal{D}_{\mathrm{test}}}}

% Short alias for \epsilon.
\def\eps{{\epsilon}}


% Random variables
\def\reta{{\textnormal{$\eta$}}}
\def\ra{{\textnormal{a}}}
\def\rb{{\textnormal{b}}}
\def\rc{{\textnormal{c}}}
\def\rd{{\textnormal{d}}}
\def\re{{\textnormal{e}}}
\def\rf{{\textnormal{f}}}
\def\rg{{\textnormal{g}}}
\def\rh{{\textnormal{h}}}
\def\ri{{\textnormal{i}}}
\def\rj{{\textnormal{j}}}
\def\rk{{\textnormal{k}}}
\def\rl{{\textnormal{l}}}
% rm is already a command, just don't name any random variables m
\def\rn{{\textnormal{n}}}
\def\ro{{\textnormal{o}}}
\def\rp{{\textnormal{p}}}
\def\rq{{\textnormal{q}}}
\def\rr{{\textnormal{r}}}
\def\rs{{\textnormal{s}}}
\def\rt{{\textnormal{t}}}
\def\ru{{\textnormal{u}}}
\def\rv{{\textnormal{v}}}
\def\rw{{\textnormal{w}}}
\def\rx{{\textnormal{x}}}
\def\ry{{\textnormal{y}}}
\def\rz{{\textnormal{z}}}

% Random vectors: \rv<letter> typesets <letter> with \mathbf.
% NOTE(review): with the default math fonts \mathbf does not embolden
% lower-case Greek, so \rvepsilon and \rvtheta may come out non-bold --
% confirm against the intended notation.
\def\rvepsilon{{\mathbf{\epsilon}}}
\def\rvtheta{{\mathbf{\theta}}}
\def\rva{{\mathbf{a}}}
\def\rvb{{\mathbf{b}}}
\def\rvc{{\mathbf{c}}}
\def\rvd{{\mathbf{d}}}
\def\rve{{\mathbf{e}}}
\def\rvf{{\mathbf{f}}}
\def\rvg{{\mathbf{g}}}
\def\rvh{{\mathbf{h}}}
% Was mistakenly defined as \rvu, which left \rvi undefined and was then
% overwritten by the real \rvu below.
\def\rvi{{\mathbf{i}}}
\def\rvj{{\mathbf{j}}}
\def\rvk{{\mathbf{k}}}
\def\rvl{{\mathbf{l}}}
\def\rvm{{\mathbf{m}}}
\def\rvn{{\mathbf{n}}}
\def\rvo{{\mathbf{o}}}
\def\rvp{{\mathbf{p}}}
\def\rvq{{\mathbf{q}}}
\def\rvr{{\mathbf{r}}}
\def\rvs{{\mathbf{s}}}
\def\rvt{{\mathbf{t}}}
\def\rvu{{\mathbf{u}}}
\def\rvv{{\mathbf{v}}}
\def\rvw{{\mathbf{w}}}
\def\rvx{{\mathbf{x}}}
\def\rvy{{\mathbf{y}}}
\def\rvz{{\mathbf{z}}}

% Elements of random vectors
\def\erva{{\textnormal{a}}}
\def\ervb{{\textnormal{b}}}
\def\ervc{{\textnormal{c}}}
\def\ervd{{\textnormal{d}}}
\def\erve{{\textnormal{e}}}
\def\ervf{{\textnormal{f}}}
\def\ervg{{\textnormal{g}}}
\def\ervh{{\textnormal{h}}}
\def\ervi{{\textnormal{i}}}
\def\ervj{{\textnormal{j}}}
\def\ervk{{\textnormal{k}}}
\def\ervl{{\textnormal{l}}}
\def\ervm{{\textnormal{m}}}
\def\ervn{{\textnormal{n}}}
\def\ervo{{\textnormal{o}}}
\def\ervp{{\textnormal{p}}}
\def\ervq{{\textnormal{q}}}
\def\ervr{{\textnormal{r}}}
\def\ervs{{\textnormal{s}}}
\def\ervt{{\textnormal{t}}}
\def\ervu{{\textnormal{u}}}
\def\ervv{{\textnormal{v}}}
\def\ervw{{\textnormal{w}}}
\def\ervx{{\textnormal{x}}}
\def\ervy{{\textnormal{y}}}
\def\ervz{{\textnormal{z}}}

% Random matrices
\def\rmA{{\mathbf{A}}}
\def\rmB{{\mathbf{B}}}
\def\rmC{{\mathbf{C}}}
\def\rmD{{\mathbf{D}}}
\def\rmE{{\mathbf{E}}}
\def\rmF{{\mathbf{F}}}
\def\rmG{{\mathbf{G}}}
\def\rmH{{\mathbf{H}}}
\def\rmI{{\mathbf{I}}}
\def\rmJ{{\mathbf{J}}}
\def\rmK{{\mathbf{K}}}
\def\rmL{{\mathbf{L}}}
\def\rmM{{\mathbf{M}}}
\def\rmN{{\mathbf{N}}}
\def\rmO{{\mathbf{O}}}
\def\rmP{{\mathbf{P}}}
\def\rmQ{{\mathbf{Q}}}
\def\rmR{{\mathbf{R}}}
\def\rmS{{\mathbf{S}}}
\def\rmT{{\mathbf{T}}}
\def\rmU{{\mathbf{U}}}
\def\rmV{{\mathbf{V}}}
\def\rmW{{\mathbf{W}}}
\def\rmX{{\mathbf{X}}}
\def\rmY{{\mathbf{Y}}}
\def\rmZ{{\mathbf{Z}}}

% Elements of random matrices
\def\ermA{{\textnormal{A}}}
\def\ermB{{\textnormal{B}}}
\def\ermC{{\textnormal{C}}}
\def\ermD{{\textnormal{D}}}
\def\ermE{{\textnormal{E}}}
\def\ermF{{\textnormal{F}}}
\def\ermG{{\textnormal{G}}}
\def\ermH{{\textnormal{H}}}
\def\ermI{{\textnormal{I}}}
\def\ermJ{{\textnormal{J}}}
\def\ermK{{\textnormal{K}}}
\def\ermL{{\textnormal{L}}}
\def\ermM{{\textnormal{M}}}
\def\ermN{{\textnormal{N}}}
\def\ermO{{\textnormal{O}}}
\def\ermP{{\textnormal{P}}}
\def\ermQ{{\textnormal{Q}}}
\def\ermR{{\textnormal{R}}}
\def\ermS{{\textnormal{S}}}
\def\ermT{{\textnormal{T}}}
\def\ermU{{\textnormal{U}}}
\def\ermV{{\textnormal{V}}}
\def\ermW{{\textnormal{W}}}
\def\ermX{{\textnormal{X}}}
\def\ermY{{\textnormal{Y}}}
\def\ermZ{{\textnormal{Z}}}

% Vectors
\def\vzero{{\bm{0}}}
\def\vone{{\bm{1}}}
\def\vmu{{\bm{\mu}}}
\def\vtheta{{\bm{\theta}}}
\def\vTheta{{\bm{\Theta}}}
\def\valpha{{\bm{\alpha}}}
\def\vphi{{\bm{\phi}}}
\def\vbeta{{\bm{\beta}}}
\def\vSigma{{\bm{\Sigma}}}
\def\va{{\bm{a}}}
\def\vb{{\bm{b}}}
\def\vc{{\bm{c}}}
\def\vd{{\bm{d}}}
\def\ve{{\bm{e}}}
\def\vf{{\bm{f}}}
\def\vg{{\bm{g}}}
\def\vh{{\bm{h}}}
\def\vi{{\bm{i}}}
\def\vj{{\bm{j}}}
\def\vk{{\bm{k}}}
\def\vl{{\bm{l}}}
\def\vm{{\bm{m}}}
\def\vn{{\bm{n}}}
\def\vo{{\bm{o}}}
\def\vp{{\bm{p}}}
\def\vq{{\bm{q}}}
\def\vr{{\bm{r}}}
\def\vs{{\bm{s}}}
\def\vt{{\bm{t}}}
\def\vu{{\bm{u}}}
\def\vv{{\bm{v}}}
\def\vw{{\bm{w}}}
\def\vx{{\bm{x}}}
\def\vy{{\bm{y}}}
\def\vz{{\bm{z}}}

% Elements of vectors
\def\evalpha{{\alpha}}
\def\evbeta{{\beta}}
\def\evepsilon{{\epsilon}}
\def\evlambda{{\lambda}}
\def\evomega{{\omega}}
\def\evmu{{\mu}}
\def\evpsi{{\psi}}
\def\evsigma{{\sigma}}
\def\evtheta{{\theta}}
\def\eva{{a}}
\def\evb{{b}}
\def\evc{{c}}
\def\evd{{d}}
\def\eve{{e}}
\def\evf{{f}}
\def\evg{{g}}
\def\evh{{h}}
\def\evi{{i}}
\def\evj{{j}}
\def\evk{{k}}
\def\evl{{l}}
\def\evm{{m}}
\def\evn{{n}}
\def\evo{{o}}
\def\evp{{p}}
\def\evq{{q}}
\def\evr{{r}}
\def\evs{{s}}
\def\evt{{t}}
\def\evu{{u}}
\def\evv{{v}}
\def\evw{{w}}
\def\evx{{x}}
\def\evy{{y}}
\def\evz{{z}}

% Matrix
\def\mA{{\bm{A}}}
\def\mB{{\bm{B}}}
\def\mC{{\bm{C}}}
\def\mD{{\bm{D}}}
\def\mE{{\bm{E}}}
\def\mF{{\bm{F}}}
\def\mG{{\bm{G}}}
\def\mH{{\bm{H}}}
\def\mI{{\bm{I}}}
\def\mJ{{\bm{J}}}
\def\mK{{\bm{K}}}
\def\mL{{\bm{L}}}
\def\mM{{\bm{M}}}
\def\mN{{\bm{N}}}
\def\mO{{\bm{O}}}
\def\mP{{\bm{P}}}
\def\mQ{{\bm{Q}}}
\def\mR{{\bm{R}}}
\def\mS{{\bm{S}}}
\def\mT{{\bm{T}}}
\def\mU{{\bm{U}}}
\def\mV{{\bm{V}}}
\def\mW{{\bm{W}}}
\def\mX{{\bm{X}}}
\def\mY{{\bm{Y}}}
\def\mZ{{\bm{Z}}}
\def\mBeta{{\bm{\beta}}}
\def\mPhi{{\bm{\Phi}}}
\def\mLambda{{\bm{\Lambda}}}
\def\mSigma{{\bm{\Sigma}}}

% Tensor
% \mathsfit is a math alphabet drawn from the sans-serif family (\sfdefault)
% in the document's default encoding: medium series, slanted shape (m/sl) in
% the normal math version.
\DeclareMathAlphabet{\mathsfit}{\encodingdefault}{\sfdefault}{m}{sl}
% In the bold math version the same alphabet is mapped to bold-extended
% series with upright shape (bx/n).
% NOTE(review): the bold variant is upright while the medium one is slanted;
% confirm this asymmetry is intended.
\SetMathAlphabet{\mathsfit}{bold}{\encodingdefault}{\sfdefault}{bx}{n}
% \tens{X}: typeset X in \mathsfit and wrap the result in \bm.
\newcommand{\tens}[1]{\bm{\mathsfit{#1}}}
\def\tA{{\tens{A}}}
\def\tB{{\tens{B}}}
\def\tC{{\tens{C}}}
\def\tD{{\tens{D}}}
\def\tE{{\tens{E}}}
\def\tF{{\tens{F}}}
\def\tG{{\tens{G}}}
\def\tH{{\tens{H}}}
\def\tI{{\tens{I}}}
\def\tJ{{\tens{J}}}
\def\tK{{\tens{K}}}
\def\tL{{\tens{L}}}
\def\tM{{\tens{M}}}
\def\tN{{\tens{N}}}
\def\tO{{\tens{O}}}
\def\tP{{\tens{P}}}
\def\tQ{{\tens{Q}}}
\def\tR{{\tens{R}}}
\def\tS{{\tens{S}}}
\def\tT{{\tens{T}}}
\def\tU{{\tens{U}}}
\def\tV{{\tens{V}}}
\def\tW{{\tens{W}}}
\def\tX{{\tens{X}}}
\def\tY{{\tens{Y}}}
\def\tZ{{\tens{Z}}}


% Graph
\def\gA{{\mathcal{A}}}
\def\gB{{\mathcal{B}}}
\def\gC{{\mathcal{C}}}
\def\gD{{\mathcal{D}}}
\def\gE{{\mathcal{E}}}
\def\gF{{\mathcal{F}}}
\def\gG{{\mathcal{G}}}
\def\gH{{\mathcal{H}}}
\def\gI{{\mathcal{I}}}
\def\gJ{{\mathcal{J}}}
\def\gK{{\mathcal{K}}}
\def\gL{{\mathcal{L}}}
\def\gM{{\mathcal{M}}}
\def\gN{{\mathcal{N}}}
\def\gO{{\mathcal{O}}}
\def\gP{{\mathcal{P}}}
\def\gQ{{\mathcal{Q}}}
\def\gR{{\mathcal{R}}}
\def\gS{{\mathcal{S}}}
\def\gT{{\mathcal{T}}}
\def\gU{{\mathcal{U}}}
\def\gV{{\mathcal{V}}}
\def\gW{{\mathcal{W}}}
\def\gX{{\mathcal{X}}}
\def\gY{{\mathcal{Y}}}
\def\gZ{{\mathcal{Z}}}

% Sets
\def\sA{{\mathbb{A}}}
\def\sB{{\mathbb{B}}}
\def\sC{{\mathbb{C}}}
\def\sD{{\mathbb{D}}}
% Don't use a set called E, because this would be the same as our symbol
% for expectation.
\def\sF{{\mathbb{F}}}
\def\sG{{\mathbb{G}}}
\def\sH{{\mathbb{H}}}
\def\sI{{\mathbb{I}}}
\def\sJ{{\mathbb{J}}}
\def\sK{{\mathbb{K}}}
\def\sL{{\mathbb{L}}}
\def\sM{{\mathbb{M}}}
\def\sN{{\mathbb{N}}}
\def\sO{{\mathbb{O}}}
\def\sP{{\mathbb{P}}}
\def\sQ{{\mathbb{Q}}}
\def\sR{{\mathbb{R}}}
\def\sS{{\mathbb{S}}}
\def\sT{{\mathbb{T}}}
\def\sU{{\mathbb{U}}}
\def\sV{{\mathbb{V}}}
\def\sW{{\mathbb{W}}}
\def\sX{{\mathbb{X}}}
\def\sY{{\mathbb{Y}}}
\def\sZ{{\mathbb{Z}}}

% Entries of a matrix
\def\emLambda{{\Lambda}}
\def\emA{{A}}
\def\emB{{B}}
\def\emC{{C}}
\def\emD{{D}}
\def\emE{{E}}
\def\emF{{F}}
\def\emG{{G}}
\def\emH{{H}}
\def\emI{{I}}
\def\emJ{{J}}
\def\emK{{K}}
\def\emL{{L}}
\def\emM{{M}}
\def\emN{{N}}
\def\emO{{O}}
\def\emP{{P}}
\def\emQ{{Q}}
\def\emR{{R}}
\def\emS{{S}}
\def\emT{{T}}
\def\emU{{U}}
\def\emV{{V}}
\def\emW{{W}}
\def\emX{{X}}
\def\emY{{Y}}
\def\emZ{{Z}}
\def\emSigma{{\Sigma}}

% entries of a tensor
% Same font as tensor, without \bm wrapper
\newcommand{\etens}[1]{\mathsfit{#1}}
\def\etLambda{{\etens{\Lambda}}}
\def\etA{{\etens{A}}}
\def\etB{{\etens{B}}}
\def\etC{{\etens{C}}}
\def\etD{{\etens{D}}}
\def\etE{{\etens{E}}}
\def\etF{{\etens{F}}}
\def\etG{{\etens{G}}}
\def\etH{{\etens{H}}}
\def\etI{{\etens{I}}}
\def\etJ{{\etens{J}}}
\def\etK{{\etens{K}}}
\def\etL{{\etens{L}}}
\def\etM{{\etens{M}}}
\def\etN{{\etens{N}}}
\def\etO{{\etens{O}}}
\def\etP{{\etens{P}}}
\def\etQ{{\etens{Q}}}
\def\etR{{\etens{R}}}
\def\etS{{\etens{S}}}
\def\etT{{\etens{T}}}
\def\etU{{\etens{U}}}
\def\etV{{\etens{V}}}
\def\etW{{\etens{W}}}
\def\etX{{\etens{X}}}
\def\etY{{\etens{Y}}}
\def\etZ{{\etens{Z}}}

% Distribution symbols. The descriptive subscripts are labels, not
% variables, so they are set upright with \mathrm (this replaces the obsolete
% \rm font switch, which not every document class defines).

% The true underlying data generating distribution
\newcommand{\pdata}{p_{\mathrm{data}}}
% The empirical distribution defined by the training set
\newcommand{\ptrain}{\hat{p}_{\mathrm{data}}}
\newcommand{\Ptrain}{\hat{P}_{\mathrm{data}}}
% The model distribution
\newcommand{\pmodel}{p_{\mathrm{model}}}
\newcommand{\Pmodel}{P_{\mathrm{model}}}
\newcommand{\ptildemodel}{\tilde{p}_{\mathrm{model}}}
% Stochastic autoencoder distributions
\newcommand{\pencode}{p_{\mathrm{encoder}}}
\newcommand{\pdecode}{p_{\mathrm{decoder}}}
\newcommand{\precons}{p_{\mathrm{reconstruct}}}

\newcommand{\laplace}{\mathrm{Laplace}} % Laplace distribution

\newcommand{\E}{\mathbb{E}}
\newcommand{\Ls}{\mathcal{L}}
\newcommand{\R}{\mathbb{R}}
\newcommand{\emp}{\tilde{p}}
\newcommand{\lr}{\alpha}
\newcommand{\reg}{\lambda}
\newcommand{\rect}{\mathrm{rectifier}}
\newcommand{\softmax}{\mathrm{softmax}}
\newcommand{\sigmoid}{\sigma}
\newcommand{\softplus}{\zeta}
\newcommand{\KL}{D_{\mathrm{KL}}}
\newcommand{\Var}{\mathrm{Var}}
\newcommand{\standarderror}{\mathrm{SE}}
\newcommand{\Cov}{\mathrm{Cov}}
% Wolfram Mathworld says $L^2$ is for function spaces and $\ell^2$ is for vectors
% But then they seem to use $L^2$ for vectors throughout the site, and so does
% wikipedia.
\newcommand{\normlzero}{L^0}
\newcommand{\normlone}{L^1}
\newcommand{\normltwo}{L^2}
\newcommand{\normlp}{L^p}
\newcommand{\normmax}{L^\infty}

\newcommand{\parents}{Pa} % See usage in notation.tex. Chosen to match Daphne's book.

\DeclareMathOperator*{\argmax}{arg\,max}
\DeclareMathOperator*{\argmin}{arg\,min}

\DeclareMathOperator{\sign}{sign}
\DeclareMathOperator{\Tr}{Tr}
\DeclareMathOperator{\diag}{diag}
\let\ab\allowbreak


% \newtheorem{hypothesis}{Hypothesis}
% \newtheorem{theorem}{Theorem}
% \newtheorem{corollary}{Corollary}[theorem]
\newtheorem{proposition}{Proposition}
\newtheorem*{proposition*}{Proposition}
% \newtheorem{definition}{Definition}[theorem]
% \newtheorem{lemma}[theorem]{Lemma}
\newtheorem*{remark}{Remark}

% \rom{<number>}: print <number> as an upper-case roman numeral in upright
% shape, e.g. \rom{2} gives II.
% \romannumeral produces lower-case numerals; the \expandafter makes it
% expand before \uppercase processes the braced group, so the resulting
% letters are the ones that get upper-cased.
\newcommand{\rom}[1]{%
  \textup{\uppercase\expandafter{\romannumeral#1}}%
}

\makeatletter
% \addFileDependency{<file name with extension>}
% Announces <file> as an input of this document:
%   - writes "(<file>)" to the terminal/log via \typeout (presumably so that
%     build tools which scan the log register the file as a dependency --
%     confirm against the build setup);
%   - appends it to LaTeX's internal file list via \@addtofilelist;
%   - writes "No file <file>." if the file cannot be found.
% Every line inside the body ends with % so that the macro contributes no
% space tokens when it is used outside the preamble.
\newcommand*{\addFileDependency}[1]{% argument=file name and extension
  \typeout{(#1)}%
  \@addtofilelist{#1}%
  \IfFileExists{#1}{}{\typeout{No file #1.}}%
}
\makeatother

% \myexternaldocument{<basename>}
% Imports the labels of another document with \externaldocument (xr package)
% so that they can be \ref'ed from here, and registers both <basename>.tex
% and <basename>.aux as file dependencies through \addFileDependency.
% The argument is the base name without extension.
\newcommand*{\myexternaldocument}[1]{%
    \externaldocument{#1}%
    \addFileDependency{#1.tex}%
    \addFileDependency{#1.aux}%
}

\myexternaldocument{{shu_406-supp}}

% \title{Going Beyond Neural Architecture Search via Sampling-Based Neural Ensemble Search}
\title{Neural Ensemble Search via Bayesian Sampling}

% The standard author block has changed for UAI 2022 to provide
% more space for long author lists and allow for complex affiliations
%
% All author information is authomatically removed by the class for the
% anonymous submission version of your paper, so you can already add your
% information below.
%
% Add authors
% \author[1]{\href{mailto:<shuyao@comp.nus.edu.sg>?Subject=About Your UAI 2022 paper}{Yao Shu}{}}
\author[1]{Yao Shu}
\author[1]{Yizhou Chen}
\author[1]{Zhongxiang Dai}
\author[1]{Bryan Kian Hsiang Low}
% Add affiliations after the authors
\affil[1]{%
    Department of Computer Science\\
    National University of Singapore\\
    Singapore
}
  
\begin{document}
\maketitle

\begin{abstract}
Recently, \emph{neural architecture search} (NAS) has been applied to automate the design of neural networks in real-world applications. A large number of algorithms have been developed to improve the search cost or the performance of the final selected architectures in NAS. Unfortunately, these NAS algorithms aim to select only \emph{one single} well-performing architecture from their search spaces and thus have overlooked the capability of \emph{neural network ensemble} (i.e., an ensemble of neural networks with diverse architectures) in achieving improved performance over a single final selected architecture.
To this end, we introduce a novel neural ensemble search algorithm, called \emph{neural ensemble search via Bayesian sampling} (NESBS), to effectively and efficiently select well-performing neural network ensembles from a NAS search space. In our extensive experiments, our NESBS algorithm is shown to achieve improved performance over state-of-the-art NAS algorithms while incurring a comparable search cost, thus indicating its superior performance over these NAS algorithms in practice.
\end{abstract}

\section{Introduction}

Recent years have witnessed a surging interest in designing well-performing architectures for different tasks. These architectures are typically manually designed by human experts, which requires numerous trials and errors during this manual design process and therefore is prohibitively costly. Consequently, the increasing demand for developing well-performing architectures in different tasks makes this manual design infeasible. To avoid such human efforts, \citet{nas} have introduced \emph{neural architecture search} (NAS) to help automate the design of architectures. Since then, a number of NAS algorithms \citep{enas, darts, p-darts} have been developed to improve the search efficiency (i.e., search cost) or the search effectiveness (i.e., generalization performance of their final selected architectures) in NAS.


However, conventional NAS algorithms aim to select only \emph{one single architecture} from their search spaces and have thus overlooked the capability of other candidate architectures from the same search spaces in helping improve the performance achieved by their final selected single architecture. 
Indeed, \emph{neural network ensembles} are widely known to be capable of achieving an improved performance compared with a single neural network in practice \citep{adanet, mc-dropout, deepens}.
This naturally raises the question: \emph{How to select best-performing neural network ensembles with diverse architectures from a NAS search space in order to improve the performances achieved by existing NAS algorithms?}
To the best of our knowledge, only limited efforts \citep[e.g.,][]{nes} have been devoted to this problem in the NAS literature. Unfortunately, the \emph{neural ensemble search} (NES) algorithm of \citet{nes}, which is based on random search or an evolutionary algorithm, requires excessive search costs to select its final neural network ensembles, which will not be affordable in resource-constrained scenarios.


To this end, this paper introduces a novel algorithm, namely \emph{neural ensemble search via Bayesian sampling} (NESBS), to effectively and efficiently select the well-performing neural network ensemble with diverse architectures from a search space.
We first represent the search space as a supernet following conventional one-shot NAS algorithms and then use the model parameters inherited from this supernet after its model training to estimate the single-model performances and also the ensemble performance of independently trained architectures (Sec.~\ref{sec:oneshot}). Next, since both single-model performances and diverse model predictions affect the final ensemble performance according to \citet{zhou-ensemble}, we propose to use a variational posterior distribution of architectures based on a trained supernet to characterize these two factors, i.e., single-model performances and diverse model predictions (Sec.~\ref{sec:posterior}). We then introduce two novel Bayesian sampling algorithms based on the posterior distribution of architectures, i.e., \emph{Monte Carlo sampling} (MC Sampling) and \emph{Stein Variational Gradient Descent with regularized diversity} (SVGD-RD), to effectively and efficiently select ensembles with both competitive single-model performances and compelling diverse model predictions (Sec.~\ref{sec:sampling}), which is also guaranteed to be able to achieve impressive ensemble performances \citep{zhou-ensemble}. Lastly, we use extensive experiments to show that our NESBS algorithm is indeed able to select well-performing neural network ensembles effectively and efficiently in practice (Sec.~\ref{sec:exps}).


\section{Related Works \& Background}

\subsection{Neural Architecture Search}
In the literature, many NAS algorithms \citep{amoebanet, nas, nasnet} have been developed to automate the design of well-performing neural architectures. However, these NAS algorithms are inefficient in practice due to their requirement of the independent model training for each candidate architecture in the search space. To reduce such training costs, a supernet has been introduced to represent the search space and also share model parameters among the candidate architectures in the search space \citep{enas}. As a result, only the model training of this supernet is required, which can significantly improve the search efficiency of conventional NAS algorithms. After that, a number of one-shot NAS algorithms based on model parameter sharing \citep{p-darts, sdarts, darts-, darts, snas} have been developed. 
Unfortunately, these algorithms aim to select \emph{only one single architecture} from their search spaces. Thus, the capability of other candidate architectures from the same search spaces in helping improve the performance of their final selected single architecture has been overlooked.


\subsection{Neural Network Ensembles}
Meanwhile, neural network ensembles have been widely applied to improve the performance of a single neural network in different applications \citep{eml}. Over the years, a number of methods have been proposed to construct such neural network ensembles. For example, \citet{mc-dropout} have proposed to use Monte Carlo Dropout to obtain neural network ensembles at test time. Meanwhile, \emph{deep ensembles} (DeepEns) \citep{deepens} adopt neural networks trained with different random initializations to construct ensembles and have achieved impressive performances on various tasks.
Another line of ensemble works uses the checkpoints obtained during model training to build neural network ensembles \citep{snapshot}. More recently, \citet{nes} have introduced \emph{neural ensemble search} (NES) into the NAS area to build well-performing neural network ensembles by selecting diverse architectures from the NAS search space, which has achieved competitive performance even compared with other ensemble methods. Unfortunately, the algorithm presented by \citet{nes} is shown to be prohibitively costly, which will not be affordable in resource-constrained scenarios. To this end, this paper presents a novel NESBS algorithm to advance this line of work (e.g., NES) by achieving state-of-the-art performances for neural network ensembles with diverse architectures while incurring a reduced search cost.


\subsection{Stein Variational Gradient Descent}\label{sec:bg-svgd}
\emph{Stein Variational Gradient Descent} (SVGD) \citep{svgd} is a variational inference algorithm that approximates a target distribution $p(\vx)$ with a simpler density $q^*(\vx)$ in a predefined set $\gQ$ by minimizing the \emph{Kullback-Leibler} (KL) divergence between these two densities:
\begin{equation}\label{eq:min-kl}
    q^* = \argmin_{q \in \gQ} \{\text{KL}(q \,\|\, p)\triangleq\E_q\left[\log\left(q(\vx) / p(\vx)\right)\right]\} \ .
\end{equation}
Specifically, SVGD represents $q^*(\vx)$ with a set of particles $\{\vx_i\}_{i=1}^{n}$ which are firstly randomly initialized and then iteratively updated with updates $\vphi^*(\vx_i)$ and a step size $\epsilon$:
\begin{equation}\label{eq:svgd-iter}
    \vx_i \leftarrow \vx_i + \epsilon\vphi^*(\vx_i) \quad \text{for}\ i = 1,\ldots,n \ .
\end{equation}
Let $q_{[\epsilon\vphi]}$ denote the distribution of updated particles $\vx' = \vx + \epsilon\vphi(\vx)$. Let $\mathbb{F}$ denote the unit ball of a vector-valued \emph{reproducing kernel Hilbert space} (RKHS) $\gH\triangleq\gH_0 \times \ldots \times\gH_0$ where $\gH_0$ is an RKHS formed by scalar-valued functions associated with a positive definite kernel $k(\vx, \vx')$. The work of \citet{svgd} has shown that \eqref{eq:svgd-iter} can be viewed as functional gradient descent in the RKHS $\gH$ and the optimal $\vphi^*$ in \eqref{eq:svgd-iter} can be obtained by solving the following problem:
\begin{equation}
    \vphi^* = \argmax_{\vphi \in \mathbb{F}}\left\{-\frac{d}{d\epsilon}\text{KL}(q_{[\epsilon\vphi]} \,\|\, p)\Bigr|_{\epsilon=0}\right\} \ ,
\end{equation}
which yields a closed-form solution: 
\begin{equation}
    \vphi^*(\cdot) = \E_{\vx\sim q}[k(\vx, \cdot)\nabla_{\vx}\log p(\vx) + \nabla_{\vx}k(\vx, \cdot)] \ .
\end{equation}
In practice, \citet{svgd} have approximated the expectation in this closed-form solution with the empirical mean of particles: $\vphi^*(\vx_i) \approx \widehat{\vphi}^*(\vx_i)$ where $\widehat{\vphi}^*(\vx_i)$ is defined as
\begin{equation}\label{eq:svgd-approx}
\begin{aligned}
    \widehat{\vphi}^*(\vx_i) \triangleq \frac{1}{n}\sum_{j=1}^n \bigl[&k(\vx_j, \vx_i)\nabla_{\vx_j}\log p(\vx_j) \\
    &+ \nabla_{\vx_j} k (\vx_j, \vx_i)\bigr] \ .
\end{aligned}
\end{equation}
As revealed by \citet{svgd}, the two terms in the aforementioned closed-form solution have different effects: The first term with $\nabla_{\vx}\log p(\vx)$ favors particles with higher probability density, while the second term pushes the particles away from each other to encourage diversity.


\section{Neural Ensemble Search via Bayesian Sampling}
In contrast to the selection of one single architecture in conventional NAS algorithms, this paper focuses on the problem of selecting a well-performing neural network ensemble with diverse architectures from the NAS search space, i.e., \emph{neural ensemble search} (NES) \citep{nes}. Let $\vf_{\gA}(\vx, \vtheta_{\gA})$ denote the output of an architecture $\gA$ with input data $\vx$ and model parameters $\vtheta_{\gA}$, $S$ be a set of architectures, $\vTheta_S$ be a set of the corresponding model parameters of these architectures, and $\Ls_{\text{train}}$ and $\Ls_{\text{val}}$ denote the training and validation losses, respectively. Given the ensemble scheme $\gF_S(\vx, \vTheta_S)\triangleq n^{-1}\textstyle\sum_{\gA \in S}\vf_{\gA}(\vx, \vtheta_{\gA})$ with an ensemble size of $|S|=n$,\footnote{We apply this ensemble scheme for simplicity. Other ensemble schemes can also be used in the algorithm of this paper.} NES can be formally framed as
\begin{equation}\label{eq:nes}
\begin{gathered}
    \min_{S} \Ls_{\text{val}}(\gF_S(\vx, \vTheta_S^*)) \\
    \text{s.t.} \; \forall  \vtheta^*_{\gA} \in \vTheta^*_S \quad \vtheta^*_{\gA} = \argmin_{\vtheta_{\gA}} \Ls_{\text{train}}(\vf_{\gA}(\vx, \vtheta_{\gA})) \ .
\end{gathered}
\end{equation}


Unfortunately, \eqref{eq:nes} is challenging to solve mainly due to the following two reasons: (\rom{1}) The enormous number of candidate architectures in the NAS search space  (e.g., ${\sim}10^{25}$ in the DARTS search space \citep{darts}) makes the independent model training of every candidate architecture (i.e., lower-level optimization in \eqref{eq:nes}) unaffordable. (\rom{2}) The ensemble search space is exponentially increasing in the ensemble size $n$: For example, there are ${\sim}m^n$ different ensembles given $m$ diverse architectures. The combinatorial optimization problem (i.e., upper-level optimization in \eqref{eq:nes}) is thus intractable to solve within this huge ensemble search space. Recently, \citet{nes} have attempted to avoid these two problems by sampling a small pool of architectures from the search space for their final ensemble search. Thus, they fail to explore the whole search space and may achieve poor ensemble performances in practice. Moreover, their search cost is still unaffordable due to the independent model training of every architecture in the pool.

To this end, we present the \emph{neural ensemble search via Bayesian sampling} (NESBS) algorithm to solve \eqref{eq:nes} effectively and efficiently. We first employ the model parameters inherited from a supernet (i.e., a representation of the NAS search space) after its model training to estimate the single-model performances and also the ensemble performance of independently trained architectures (Sec.~\ref{sec:oneshot}). This only requires the model training of the supernet and thus allows us to overcome the aforementioned challenge \rom{1}. We then derive a posterior distribution of architectures to characterize both the single-model performances and the diverse model predictions of candidate architectures in the search space (Sec.~\ref{sec:posterior}). Finally, based on this posterior distribution and also the aforementioned ensemble performance estimation, we introduce \emph{Monte Carlo Sampling} (MC Sampling) and \emph{Stein Variational Gradient Descent with regularized diversity} (SVGD-RD) to explore the ensembles in the whole~search space~effectively and efficiently (Sec.~\ref{sec:sampling}), which thus allows us to overcome the aforementioned challenge \rom{2}. An overview of our NESBS algorithm is given in Algorithm~\ref{alg:nes}.


\subsection{Model Training of Supernet}\label{sec:oneshot}


Similar to one-shot NAS algorithms \citep{darts, enas}, we represent the NAS search space as a supernet. This then allows us to use the model parameters inherited from this trained supernet to estimate not only the single-model performances but also the ensemble performance of independently trained candidate architectures in the search space. However, in order to realize an accurate and fair estimation of these performances, we need to further ensure that every candidate architecture in the search space is trained for a comparable number of steps, namely, the training fairness among candidate architectures \citep{fairnas}.
To achieve this, in every training step of this supernet, we uniformly randomly sample one single candidate architecture from this supernet for model training (see Fig.~\ref{fig:train}). The training fairness of such a training scheme can then be theoretically guaranteed, as demonstrated in Appendix~\ref{sec:proofs}. Moreover, we provide empirical results in Appendix~\ref{sec:exp-oneshot} to validate the effectiveness of such performance estimations.

\begin{figure}[t]
\centering
\includegraphics[width=\columnwidth]{figs/supernet-training.pdf}
\caption{An illustration of the model training of~supernet. The supernet here consists of three candidate architectures with $r_i$ indicating the selection of one architecture and $\vtheta_i^{t}$ denoting its model parameters at step $t$. In every training step, only one architecture is uniformly sampled to update its parameters and all other architectures will be ignored.}
\label{fig:train}
\end{figure}

\subsection{Distribution of Architectures}\label{sec:posterior}
It has been demonstrated that both competitive single-model performances and diverse model predictions are required to achieve compelling ensemble performances \citep{zhou-ensemble}. That is, NES algorithms should be capable of selecting architectures with both competitive single-model performances and diverse model predictions to achieve competitive ensemble performances. To realize this, we introduce a posterior distribution of architectures to first characterize these two factors. Let $\gD$ denote the validation dataset, and $p(\gA)$ and $p(\gA | \gD)$ denote, respectively, the prior and posterior distributions of a candidate architecture after its model training where $p(\gA)$ follows a categorical uniform distribution, as required in Sec.~\ref{sec:oneshot}. According to Bayes' theorem, since $p(\gA)$ is uniform and $p(\gD)$ is constant,
\begin{equation}\label{eq:bayes}
\begin{gathered}
    p(\gA|\gD) = p(\gD|\gA)p(\gA)/p(\gD) \propto p(\gD|\gA)
\end{gathered}
\end{equation}
where $p(\gD|\gA)$ (i.e., likelihood) is widely used to represent the single-model performance (i.e., loss) in practice.
So, \eqref{eq:bayes} implies that the posterior distribution $p(\gA|\gD)$ can also characterize the single-model performances of architectures.

Meanwhile, given a $\gamma$-Lipschitz continuous loss function $\Ls(\vf)$, 
the diversity of model predictions (i.e., $\|\vf_{\gA_1}{-}\vf_{\gA_2}\|_2$) can then be lower bounded  based on the Lipschitz continuity of $\Ls(\cdot)$:
\begin{equation}\label{eq:approx}
    \begin{gathered}
        \|\vf_{\gA_1} - \vf_{\gA_2}\|_2 \geq \gamma^{-1}|\Ls(\vf_{\gA_1}) - \Ls(\vf_{\gA_2})| \ .
    \end{gathered}
    \end{equation}
Therefore, \eqref{eq:approx} suggests that in addition to being able to characterize the single-model performances of architectures (i.e., $\Ls(\vf)$), the posterior distribution $p(\gA|\gD)$ can estimate the diversity of model predictions for different architectures (e.g., $\gA_1$ and $\gA_2$) using $|p(\gA_1|\gD) - p(\gA_2|\gD)|$.

However, it is intractable to obtain the exact posterior distribution $p(\gA|\gD)$ in the NAS search space. So, we approximate it with a variational distribution $p_{\valpha}(\gA)$ (parameterized by a low-dimensional $\valpha$) that can be optimized via variational inference, i.e., by minimizing the KL divergence between $p_{\valpha}(\gA)$ and $p(\gA|\gD)$. Equivalently, we only need to maximize a lower bound of the log-marginal likelihood (i.e., the \emph{evidence lower bound} (ELBO) \citep{vae}) to get an optimal variational distribution $p_{\valpha^*}(\gA)$:
\begin{equation}\label{eq:elbo}
\begin{gathered}
    \max_{\valpha} \E_{\gA \sim p_{\valpha}(\gA)}\left[\log p(\gD|\gA)\right] - \text{KL}[p_{\valpha}(\gA) || p(\gA)] \ .
\end{gathered}
\end{equation}
Similar to \citet{vae}, a gradient-based optimization algorithm with the reparameterization trick is employed to solve \eqref{eq:elbo} efficiently (see Appendix~\ref{sec:setting-posterior}). While \citet{snas} have adopted a similar form to \eqref{eq:elbo} (without the KL term) \emph{during} the model training of the supernet (namely, the \emph{best-response} posterior distribution), our \emph{post-training} posterior distribution is able to not only provide a more accurate characterization of the single-model performances but also contribute to an improved ensemble search performance, as demonstrated in Appendix~\ref{sec:exp-best_vs_post}.

\subsection{Bayesian Sampling}\label{sec:sampling}
To solve \eqref{eq:nes} effectively and efficiently, we finally introduce two novel Bayesian sampling algorithms based on the posterior distribution of architectures in Sec.~\ref{sec:posterior}, i.e., \emph{Monte Carlo sampling} (MC Sampling) and \emph{Stein Variational Gradient Descent with regularized diversity} (SVGD-RD), to sample ensembles with both competitive single-model performances and compelling diversity of model predictions, as required by well-performing ensembles~\citep{zhou-ensemble}.

\begin{figure}[t]
\begin{minipage}{\columnwidth}
\begin{algorithm}[H]
  \caption{NES via Bayesian Sampling (NESBS)}
  \label{alg:nes}
\begin{algorithmic}[1]
  \STATE {\bfseries Input:} Iterations $T$, ensemble size $n$, a supernet
  \STATE Train the supernet to get its tuned parameters $\vtheta^*$
  \STATE Obtain the posterior distribution $p_{\valpha^*}(\gA)$ with \eqref{eq:elbo}
  \FOR{iteration $t=1, \ldots, T$}
  \STATE Sample $S_t$ of size $n$ via Algorithm \ref{alg:mc} or \ref{alg:svgd-rd} 
  \STATE Evaluate estimated $\Ls_{\text{val}}(\gF_{S_t}(\vx, \vTheta_{S_t}^*))$ given $\vtheta^*$
  \ENDFOR
  \STATE Select optimum $S^* = \argmin_{S_t} \Ls_{\text{val}}(\gF_{S_t}(\vx, \vTheta_{S_t}^*))$
\end{algorithmic}
\end{algorithm}
\end{minipage}
\hfill
\begin{minipage}{\columnwidth}
\begin{algorithm}[H]
  \caption{MC Sampling}
  \label{alg:mc}
\begin{algorithmic}[1]
  \STATE {\bfseries Input:} Ensemble size $n$, set $S=\emptyset$, posterior $p_{\valpha^*}(\gA)$
  \FOR{iteration $i=1, \ldots, n$}
  \STATE Sample $\gA_i \sim p_{\valpha^*}(\gA)$
  \STATE $S \leftarrow S \cup \{\gA_i\}$
  \ENDFOR
  \STATE {\bfseries Output:} $S$
\end{algorithmic}
\end{algorithm}
\end{minipage}

\begin{minipage}{\columnwidth}
\begin{algorithm}[H]
  \caption{SVGD-RD}
  \label{alg:svgd-rd}
\begin{algorithmic}[1]
  \STATE {\bfseries Input:} Diversity coefficient $\delta$, ensemble size $n$, iterations $L$, initial particles $\{\vx_i^{(0)}\}_{i=1}^n$, posterior $p_{\valpha^*}(\gA)$, kernel $k(\vx, \vx')$, step size $\{\epsilon_l\}_{l=1}^L$
  \FOR{iteration $l=0, \ldots, L-1$}
  \STATE Evaluate updates $\widehat{\vphi}_l^*(\vx) =\displaystyle\frac{1}{n}\sum_{j=1}^n\Bigl[\nabla_{\vx_j^{(l)}}k(\vx_j^{(l)}, \vx)- \delta \nabla_{\vx}k(\vx_j^{(l)}, \vx) + k(\vx_j^{(l)},\vx)\nabla_{\vx_j^{(l)}}\log p_{\valpha^*}(\vx_j^{(l)})\Bigr]$
  \STATE Update particles $\vx_i^{(l+1)} \leftarrow \vx_i^{(l)} + \epsilon_l\ \widehat{\vphi}_l^*(\vx_i^{(l)})$
  \ENDFOR
  
  \STATE {\bfseries Output:} $S=\{\gA_i\}_{i=1}^n$ derived based on $\{\vx_i^{(L)}\}_{i=1}^n$
\end{algorithmic}
\end{algorithm}
\end{minipage}
\end{figure}


\subsubsection{Monte Carlo Sampling (MC Sampling)}\label{sec:mc}
Given the posterior distribution of architectures in Sec. \ref{sec:posterior}, we firstly propose to use \emph{Monte Carlo sampling} (MC Sampling) to sample a set of architectures from this posterior distribution (Algorithm \ref{alg:mc}). Note that MC Sampling guarantees that architectures with better single-model performances will be sampled (i.e., exploited) with higher probabilities, while architectures with diverse model predictions
can also be sampled (i.e., explored) due to the inherent randomness in the sampling process.
Compared with conventional NAS algorithms that select only one single well-performing architecture from the search space \citep{gdas, snas}, our MC Sampling algorithm extends these algorithms by exploring the capability of diverse architectures while preserving its exploitation of architectures with compelling single-model performances.


\begin{figure}[t]
\centering
\includegraphics[width=\columnwidth]{figs/svgd_rd.pdf}
\caption{Impact of $\delta$ in SVGD-RD. We use contours and dots to denote the density of target distribution and sampled particles, respectively. The target distribution is chosen to be $p(\vx){=}(1/Z)\left[\gN(\vx | \vu_1, \Sigma_1) + \gN(\vx | \vu_2, \Sigma_2)\right]$ where $\vu_1{=}(-1, 0)$, $\vu_2{=}(0, 1)$, $\Sigma_1{=}\Sigma_2{=}\diag((0.25, 0.5))$, and $Z$ denotes the normalization constant. Those sampled particles are obtained from Algorithm \ref{alg:svgd-rd} using $L{=}1000$, $n{=}15$, $\epsilon_l{=}0.1$, and a \emph{radial basis function} (RBF) kernel. Notably, SVGD-RD tends to sample particles with more diverse probability densities as $\delta$ is increased, hence indicating a controllable (via $\delta$) diversity in our SVGD-RD algorithm. Meanwhile, SVGD-RD can consistently sample particles with high probability densities under varying $\delta$.}
\label{fig:diversity}
\end{figure}


\subsubsection{SVGD with Regularized Diversity (SVGD-RD)}\label{sec:svgd-rd}
However, the diversity of sampled architectures using the MC Sampling algorithm above cannot be controlled and hence may lead to poor ensemble search results.
So, in order to achieve a controllable diversity, we resort to \emph{Stein Variational Gradient Descent} (SVGD). Theoretically, SVGD is capable of sampling particles with both large probability density and good diversity where the diversity is explicitly encouraged (i.e., by the second term in \eqref{eq:svgd-approx}). Nonetheless, in practice, the particles sampled by SVGD may still fail to represent the target distribution well owing to the lack of diversity among those sampled particles, as observed by \citet{mp-svgd}. Besides, the diversity of sampled particles in standard SVGD still cannot be controlled by human experts.

We hence develop an \emph{SVGD with regularized diversity} (SVGD-RD) sampling algorithm that can achieve a controllable diversity among those sampled particles. We follow the notations from Sec. \ref{sec:bg-svgd}.
In particular, when optimizing the distribution $q^*$ (represented by the $n$ particles $\{\vx^*_i\}_{i=1}^{n}$), we modify the objective in \eqref{eq:min-kl} by adding a term representing the (controllable) diversity among the particles measured by the kernel function $k(\vx, \vx')$:
\begin{equation}\label{eq:min-kl-max-div}
    q^* = \argmin_{q \in \gQ}\text{KL}(q||p) + n\delta\  \E_{\vx,\vx' \sim q} \left[k(\vx, \vx')\right] 
\end{equation}
where $\delta$ is the parameter explicitly controlling the diversity, and $p$ in \eqref{eq:min-kl-max-div} denotes the posterior distribution $p_{\valpha^*}(\gA)$ derived in Sec. \ref{sec:posterior} which we intend to sample from.
Following the work of SVGD, $q^*$ in \eqref{eq:min-kl-max-div} is represented by $\{\vx^*_i\}_{i=1}^{n}$ denoting our final selected neural network ensemble that can achieve both competitive single-model performances (i.e., large probability density) and also diverse model predictions. Proposition \ref{prop:svgdrd-update} below provides one possible update rule for the particles $\{\vx_i\}_{i=1}^{n}$ to optimize \eqref{eq:min-kl-max-div} (see its proof in Appendix \ref{sec:proofs}).
Finally, Algorithm \ref{alg:svgd-rd} summarizes the details of our SVGD-RD algorithm and Appendix \ref{sec:setting-svgd} provides its optimization details in practice. After obtaining those optimal particles $\{\vx^*_i\}_{i=1}^{n}$ in our SVGD-RD algorithm, we then apply these particles to derive the architectures in our final selected ensembles (see details in Appendix \ref{sec:setting-svgd}).

\begin{proposition}\label{prop:svgdrd-update}
    Given the proximal operator $\normalfont \text{prox}_h(\vy)=\argmin_{\vz}h(\vz)+1/2\|\vz-\vy\|_2^2$, by applying proximal gradient method \citep{proximal} and proper approximation, \eqref{eq:min-kl-max-div} can be optimized via the following updates of the particles $\{\vx_i\}_{i=1}^{n}$:
    \begin{equation*}
    \begin{array}{l}
        \displaystyle\vx_i \leftarrow \vx_i + \frac{1}{n} \sum_{j=1}^n \Bigl[ k(\vx_j, \vx_i)\nabla_{\vx_j}\log p(\vx_j) \\
        \displaystyle\qquad\qquad\qquad\quad\ + \nabla_{\vx_j} k (\vx_j, \vx_i) - \delta\nabla_{\vx_i} k (\vx_j, \vx_i) \Bigr]\ .
    \end{array}
    \end{equation*}
\end{proposition}


Compared with MC Sampling, our SVGD-RD algorithm provides a controllable trade-off between the single-model performances and the diverse model predictions.
On the one hand, the minimization of the KL divergence term in \eqref{eq:min-kl-max-div} encourages the selection of architectures with competitive single-model performances by favoring particles with high probability densities, as shown by Proposition \ref{prop:exploitation} below (its proof is in Appendix \ref{sec:proofs}).\footnote{Although Proposition \ref{prop:exploitation} is only applicable in the case of $n=1$, our SVGD-RD is still capable of sampling particles with high probability densities when $n>1$, as validated in Fig.~\ref{fig:diversity}.}
On the other hand, the maximization of the scaled distance $-n\delta\ \E_{\vx,\vx' \sim q} \left[k(\vx, \vx')\right]$ among the sampled particles leads to a controllable diversity (via $\delta$) among these sampled particles and also a controllable diversity of the probability densities among these particles (see Fig.~\ref{fig:diversity}), which also implies a controllable diversity of the model predictions, as suggested in Sec.~\ref{sec:posterior}.

\begin{proposition}\label{prop:exploitation}
Let $p$ be a target density and $k(\vx,\vx')=c$ for every $\vx=\vx'$ where $c$ is a constant. For any $\delta \in \mathbb{R}$, our SVGD-RD algorithm is equivalent to the maximization of the density $p$ w.r.t.~$\vx$ in the case of $n=1$.
\end{proposition}



\section{Experiments}\label{sec:exps}
\subsection{Search in NAS-Bench-201}\label{sec:nasbench}

\begin{table*}[t]
\caption{Comparison of architectures selected by different NAS and ensemble (search) algorithms in NAS-Bench-201 with ensemble size $n=3$. Test errors are reported with the mean and standard error of three independent trials and our search costs are evaluated on a single Nvidia $1080$Ti GPU.  Results marked by $\dagger$ are reported by \citet{nasbench201}.}
\centering
\resizebox{0.9\textwidth}{!}{
\begin{threeparttable}
\begin{tabular}{lcccc}
\toprule
\multirow{2}{*}{\textbf{Architecture(s)}} & \multicolumn{3}{c}{\textbf{Test Error} (\%)} &
\multirow{2}{2cm}{\textbf{ Search Cost} (GPU Hours)} \\
\cmidrule(l){2-4}
& \textbf{CIFAR-10} & \textbf{CIFAR-100} & \textbf{ImageNet-16-120} &  \\
\midrule
& \multicolumn{4}{c}{\textbf{Manual design}} \\
ResNet$^{\dagger}$ \citep{resnet} & 6.03 & 29.14 & 56.37 & - \\
\midrule
& \multicolumn{4}{c}{\textbf{NAS algorithms}} \\
ENAS$^{\dagger}$ \citep{enas} & 45.70$\pm$0.00 & 84.39$\pm$0.00 & 83.68$\pm$0.00 & 3.7 \\
DARTS$^{\dagger}$ (2nd) \citep{darts} & 45.70$\pm$0.00 & 84.39$\pm$0.00 & 83.68$\pm$0.00 & 8.3 \\
GDAS$^{\dagger}$ \citep{gdas} & 6.49$\pm$0.13 & 29.39$\pm$0.26 & 58.16$\pm$0.90 & 8.0 \\
SETN$^{\dagger}$ \citep{setn} & 13.81$\pm$4.63 & 43.13$\pm$7.77 & 68.10$\pm$4.07 & 8.6 \\
RSPS$^{\dagger}$ \citep{rsps} & 12.34$\pm$1.69 & 41.67$\pm$4.34 & 68.86$\pm$3.88 & 2.1 \\
\midrule
& \multicolumn{4}{c}{\textbf{Ensemble (search) algorithms}} \\
DeepEns \citep{deepens} & 5.75 & 25.27 & 54.70 & - \\
NES-RS \citep{nes} & 5.83$\pm$0.33 & 25.58$\pm$0.84 & 54.34$\pm$1.67 & 5.1 \\
\midrule
& \multicolumn{4}{c}{\textbf{Our ensemble search algorithm}} \\
NESBS (MC Sampling) & 5.76$\pm$0.25 & 25.39$\pm$0.69 & \textbf{53.47}$\pm$1.75 & \textbf{1.1} \\
NESBS (SVGD-RD) & 5.92$\pm$0.07 & \textbf{25.00}$\pm$0.17 & \textbf{52.68}$\pm$0.35 & \textbf{1.2} \\
\bottomrule
\end{tabular}
\end{threeparttable}
}
\label{tab:nasbench201}
\end{table*}



To verify the effectiveness and efficiency of our NESBS algorithm, we firstly compare it with other well-known NAS and ensemble (search) algorithms in NAS-Bench-201 \citep{nasbench201}. Table~\ref{tab:nasbench201} summarizes the results. Table~\ref{tab:nasbench201} shows that ensemble (search) algorithms, including our NESBS, consistently achieve improved generalization performance over conventional NAS algorithms. This is because ensemble (search) algorithms will select neural network ensembles whereas NAS algorithms will select only one single architecture. Moreover, it has been widely verified that model ensembles generally outperform a single machine learning model in practice \citep{zhou-ensemble}. In addition, our NESBS algorithm outperforms other ensemble (search) baselines (i.e., DeepEns and NES-RS), especially on large-scale datasets (i.e., CIFAR-100 \citep{cifar} and ImageNet-16-120 \citep{imagenet-16-120}) while incurring lower search costs than NES-RS, which thus implies the superior performance of our NESBS over these ensemble (search) baselines. Even on a small-scale dataset (i.e., CIFAR-10), our NESBS can also achieve comparable search results to DeepEns and NES-RS.  Interestingly, our NESBS algorithm is even able to incur lower search costs than conventional NAS algorithms. This is likely because more training epochs have been used in these NAS algorithms, whereas a small number of training epochs can already contribute to well-performing results for our NESBS algorithm.

\subsection{Search in the DARTS Search Space}\label{sec:darts}
We further demonstrate the superior search effectiveness and efficiency of our NESBS by comparing it with other NAS and ensemble (search) baselines in a larger search space (i.e.,  DARTS \citep{darts} search space) using both classification and adversarial defense tasks on CIFAR-10/100 or ImageNet~\citep{imagenet}. We follow Appendix \ref{sec:setting-training} to evaluate the final neural network ensembles selected by our NESBS algorithm with ensemble size $n=3$, $T=5$, and optimization details in Appendix~\ref{sec:setting-exp}.

\begin{table*}[t]
\caption{Comparison of different image classifiers on CIFAR-10/100. Results of MC DropPath are from a drop~probability of $0.01$ and our search costs are evaluated on Nvidia $1080$Ti.}
\centering
\resizebox{0.9\textwidth}{!}{
\begin{threeparttable}
\begin{tabular}{lcccccc}
\toprule
\multirow{2}{*}{\textbf{Architecture(s)}} & \multicolumn{2}{c}{\textbf{Test Error} (\%)} &
\multicolumn{2}{c}{\textbf{Params} (M)} &
% \multirow{2}{*}{\textbf{Search Cost (GPU days)}} &
\multirow{2}{1.8cm}{\textbf{Search Cost} (GPU Days)} &
\multirow{2}{*}{\textbf{Search Method}} \\
\cmidrule(l){2-3} \cmidrule(l){4-5} 
& \textbf{C10} & \textbf{C100} & \textbf{C10} & \textbf{C100} & \\
\midrule 
& \multicolumn{6}{c}{\textbf{NAS algorithms}} \\

NASNet-A \citep{nasnet} & 2.65 & - & 3.3 & - & 2000 & RL\\
AmoebaNet-A \citep{amoebanet} & 3.34 & 18.93 & 3.2 & 3.1 & 3150 & evolution\\
PNAS \citep{pnas} & 3.41 & 19.53 & 3.2 & 3.2 & 225 & SMBO\\
ENAS \citep{enas} & 2.89 & 19.43 & 4.6 & 4.6 & 0.5 & RL\\
DARTS \citep{darts} & 2.76 & 17.54 & 3.3 & 3.4 & 1 & gradient\\
GDAS \citep{gdas} & 2.93 & 18.38 & 3.4 & 3.4 & 0.3 & gradient \\

P-DARTS \citep{p-darts} & 2.50 & - & 3.4 & - & 0.3 & gradient \\
DARTS- (avg) \citep{darts-} & 2.59 & 17.51 & 3.5 & 3.3 & 0.4 & gradient \\
SDARTS-ADV \citep{sdarts} & 2.61 & - & 3.3 & - & 1.3 & gradient \\
\midrule
& \multicolumn{6}{c}{\textbf{Ensemble (search) algorithms}} \\
MC DropPath (ENAS) & 2.88 & 16.83 & 3.8$^\ddagger$ & 3.9$^\ddagger$ & - & - \\

DeepEns (ENAS) & 2.49 & 15.04 & 3.8$^\ddagger$ & 3.9$^\ddagger$ & - & - \\
DeepEns (DARTS) & 2.42 & 14.56 & 3.3$^\ddagger$ & 3.4$^\ddagger$ & - & - \\

NES-RS$^{\sharp}$ \citep{nes} & 2.50 & 15.24 & 3.0$^\ddagger$ & 3.1$^\ddagger$ & 0.7 & greedy \\

\midrule
& \multicolumn{6}{c}{\textbf{Our ensemble search algorithm}} \\
NESBS (MC Sampling) & \textbf{2.41} & 14.70 & 3.8$^\ddagger$ & 3.9$^\ddagger$ & \textbf{0.2} & sampling \\
NESBS (SVGD-RD) & \textbf{2.36} & \textbf{14.55} & 3.7$^\ddagger$ & 3.8$^\ddagger$ & \textbf{0.2} & sampling \\
\bottomrule
\end{tabular}
\begin{tablenotes}\footnotesize
    \item[$\ddagger$] Reported as the averaged parameter size of the architectures in a neural network ensemble.
    \item[$\sharp$] Obtained from a pool of size $50$, in which every architecture is uniformly randomly sampled from the DARTS search space and then trained independently for $50$ epochs following the evaluation settings in Appendix~\ref{sec:setting-training}.
\end{tablenotes}
\end{threeparttable}
}
\label{tab:accuracy-cifar}
\end{table*}


\begin{table}[!t]
\caption{Comparison of image classifiers on ImageNet. The ensemble size is set to $n=3$ for NES-RS and NESBS.}
\centering
\resizebox{\columnwidth}{!}{
\begin{threeparttable}
\begin{tabular}{lcccc}
\toprule
\multirow{2}{*}{\textbf{Architecture(s)}} & \multicolumn{2}{c}{\textbf{Test Error} (\%)} &
\multirow{2}{1.0cm}{\textbf{Params}} &
\multirow{2}{0.6cm}{\textbf{$+\times$}} \\
\cmidrule(l){2-3}
& \textbf{Top-1} & \textbf{Top-5} & (M) & (M) \\

\midrule
\multicolumn{5}{c}{\textbf{NAS algorithms}} \\
NASNet-A  & 26.0 & 8.4 & 5.3 & 564 \\
AmoebaNet-A & 25.5 & 8.0 & 5.1 & 555 \\
PNAS & 25.8 & 8.1 & 5.1 & 588 \\
DARTS & 26.7 & 8.7 & 4.7 & 574 \\
GDAS & 26.0 & 8.5 & 5.3 & 581 \\
P-DARTS & 24.4 & 7.4 & 4.9 & 557 \\
SDARTS-ADV & 25.2 & 7.8 & 5.4 & 594 \\
\midrule
\multicolumn{5}{c}{\textbf{Ensemble (search) algorithm}} \\
NES-RS & 23.4 & 6.8 & 3.9 & 432 \\
\midrule
\multicolumn{5}{c}{\textbf{Our ensemble search algorithm}} \\
NESBS (MC Sampling) & \textbf{22.3} & \textbf{6.2} & 4.6 & 522 \\
NESBS (SVGD-RD) & \textbf{22.3} & \textbf{6.1} & 4.9 & 562 \\
\bottomrule
\end{tabular}
\end{threeparttable}
}
\label{tab:accuracy-imagenet}
\end{table}

\paragraph{Ensemble for classification.}
Table \ref{tab:accuracy-cifar} summarizes the comparison of classification performances on CIFAR-10/100. Similar to the results in Sec. \ref{sec:nasbench}, ensemble (search) algorithms, including our NESBS, are generally able to achieve improved generalization performances over conventional NAS algorithms, which thus justifies the essence of ensemble (search) algorithms for improved performance.
Notably, even compared with other ensemble baselines such as MC DropPath (i.e., developed following Monte Carlo Dropout \citep{mc-dropout}) and DeepEns, our NESBS is still able to achieve improved performances. Since these ensemble baselines are orthogonal to our NESBS, they can be integrated into our NESBS for further performance improvement in real-world applications. More importantly, our algorithm outperforms NES-RS by achieving both improved search effectiveness (lowest test errors) and efficiency (lowest search costs). Furthermore, our NESBS even incurs comparable search costs compared with the most efficient NAS algorithms (e.g., GDAS, P-DARTS), which also highlights the efficiency of our NESBS. Similar results on ImageNet can be achieved by our NESBS as shown in Table~\ref{tab:accuracy-imagenet}.\footnote{Following the convention of NAS and ensemble search algorithms in Table~\ref{tab:accuracy-imagenet}, the ensembles selected by our NESBS are also searched on CIFAR-10 and then transferred to ImageNet.}

\paragraph{Ensemble for adversarial defense.}\label{sec:exp-adversarial}
Ensemble methods have already been shown to be an essential and effective defense mechanism against adversarial attacks~\citep{ensemble-for-defense}.
Specifically, an adversarial attacker can only use \emph{a single model} randomly sampled from an ensemble to generate the adversarial examples, whereas the ensemble method defends against adversarial attacks (i.e., makes its predictions) using \emph{all models} in this ensemble. Ensemble methods can defend against the adversarial attacks in such a setting because the generated adversarial examples using only one single model are unlikely to fool all models in an ensemble.
More details are provided in Appendix \ref{sec:setting-adversarial}.
Table~\ref{tab:adversarial} summarizes the comparison of adversarial defense among ensemble (search) algorithms on CIFAR-10/100 under different
white-box adversarial attacks, including the \emph{Fast Gradient Sign Method} (FGSM) attack~\citep{fgsm}, the \emph{Projected Gradient Descent} (PGD) attack~\citep{pgd}, the \emph{Carlini--Wagner} (CW) attack~\citep{cw}, and the AutoAttack~\citep{autoattack}.
Table~\ref{tab:adversarial} shows that ensemble (search) algorithms are indeed able to significantly improve the performance of adversarial defense, i.e., the test accuracies in the \emph{Defense} columns are consistently higher than the ones in the \emph{Attack} columns. More importantly, even under different white-box adversarial attacks, our NESBS algorithm can generally achieve improved defense performances (i.e., higher test accuracy in the \emph{Defense} columns) than other baselines including DeepEns and NES-RS. These results thus further support the effectiveness of our NESBS over existing ensemble (search) algorithms.
Besides, even regarding the adversarial robustness of the single models in an ensemble, the architectures selected by our NESBS are also more advanced (i.e., by achieving higher test accuracy in the \emph{Attack} columns) than well-known architectures such as RobNet~\citep{robnet} and DARTS.

\begin{table*}[t]
\caption{Comparison of adversarial defense among different ensemble (search) algorithms on CIFAR-10/100 under white-box adversarial attacks.
The \emph{Attack} and \emph{Defense} columns denote the test \emph{accuracy} under the attack using a single model randomly sampled from an ensemble and the defense using the whole ensemble, respectively.
Each result reports the mean and standard deviation of test accuracies for $3$ rounds of the attack-defense process with an ensemble size of $n=3$.}
\newcommand{\ms}{\phantom{-}}
\renewcommand\multirowsetup{\centering}
\centering
\resizebox{\textwidth}{!}{
\begin{tabular}{l*{8}{c}}
\toprule
\multirow{2}{*}{\textbf{Method}} & 
\multicolumn{2}{c}{\textbf{FGSM}} &  
\multicolumn{2}{c}{\textbf{PGD-40}} &
\multicolumn{2}{c}{\textbf{CW}} &
\multicolumn{2}{c}{\textbf{AutoAttack}}
\\
\cmidrule(l){2-3} \cmidrule(l){4-5} \cmidrule(l){6-7} \cmidrule(l){8-9} 
&
Attack (\%) & Defense (\%) &
Attack (\%) & Defense (\%) & 
Attack (\%) & Defense (\%) & 
Attack (\%) & Defense (\%)  \\
\midrule 
& \multicolumn{8}{c}{\textbf{On CIFAR-10 Dataset}} \\
DeepEns & - & - & - & - & - & - & - & - \\
$\quad \hookrightarrow$ RobNet-free & 66.62$\pm$0.32 & 85.25$\pm$0.39 & 41.81$\pm$0.80 & 77.48$\pm$0.67 & $\;\;$5.74$\pm$1.41 & 86.53$\pm$0.50 & 21.35$\pm$0.33 & 45.51$\pm$0.15 \\
$\quad \hookrightarrow$ ENAS & 77.85$\pm$0.58 & 87.94$\pm$0.21 & 59.51$\pm$1.13 & 86.57$\pm$0.15 & 31.36$\pm$1.20 & 85.20$\pm$0.77 & 31.71$\pm$0.72 & 50.96$\pm$0.07 \\
$\quad \hookrightarrow$ DARTS & 76.79$\pm$0.80 & 88.21$\pm$0.14 & 57.71$\pm$1.65 & 82.02$\pm$0.10 & 26.90$\pm$1.37 & 82.46$\pm$0.35 & 29.97$\pm$1.17 & 49.67$\pm$0.14 \\
NES-RS & 79.19$\pm$1.39 & 89.32$\pm$0.27 & 65.59$\pm$2.11 & 85.22$\pm$0.41 & 37.20$\pm$4.62 & 86.75$\pm$0.88 & 35.00$\pm$1.15 & 53.80$\pm$0.14 \\
\cmidrule(l){1-9}
NESBS (MC Sampling) & 78.75$\pm$1.29 & 89.15$\pm$0.08 & 63.60$\pm$1.87 & 85.35$\pm$0.31 & \textbf{37.71}$\pm$1.97 & \textbf{86.86}$\pm$0.66 & \textbf{36.02}$\pm$0.64 & \textbf{56.90}$\pm$0.17 \\
NESBS (SVGD-RD) & 79.12$\pm$0.61 & \textbf{89.86}$\pm$0.33 & 65.53$\pm$1.56 & 85.37$\pm$0.38 & \textbf{38.27}$\pm$1.27 & 86.00$\pm$1.10 & \textbf{37.55}$\pm$0.68 & \textbf{57.15}$\pm$0.20\\
\midrule
& \multicolumn{8}{c}{\textbf{On CIFAR-100 Dataset}} \\
DeepEns & - & - & - & - & - & - & - & - \\
$\quad \hookrightarrow$ RobNet-free & 36.47$\pm$0.25 & 61.39$\pm$0.30 & 18.18$\pm$0.47 & 52.61$\pm$0.13 & 2.36$\pm$0.13 & 69.44$\pm$0.04 & $\;\;$7.31$\pm$0.35 & 24.56$\pm$0.33 \\
$\quad \hookrightarrow$ ENAS & 46.40$\pm$0.37 & 64.94$\pm$0.27 & 28.87$\pm$0.27 & 56.79$\pm$0.25 & 9.60$\pm$0.30 & 69.43$\pm$0.44 & 11.53$\pm$0.47 & 27.01$\pm$0.27 \\
$\quad \hookrightarrow$ DARTS & 46.98$\pm$0.57 & 65.38$\pm$0.23 & 28.78$\pm$0.74 & 57.10$\pm$0.04 & 9.73$\pm$0.43 & 70.15$\pm$0.29 & 11.20$\pm$0.40 & 26.86$\pm$0.36 \\
NES-RS & 47.10$\pm$1.46 & 65.33$\pm$0.36 & 30.68$\pm$1.66 & 58.80$\pm$0.80 & 9.96$\pm$1.45 & 70.24$\pm$0.33 & 12.01$\pm$0.93 & 27.49$\pm$0.34 \\
\cmidrule(l){1-9}
NESBS (MC Sampling) & \textbf{50.69}$\pm$1.58 & \textbf{67.63}$\pm$0.05 & \textbf{33.37}$\pm$0.42 & \textbf{60.36}$\pm$0.62 & \textbf{15.64}$\pm$2.83 & \textbf{71.25}$\pm$1.27 & \textbf{13.11}$\pm$1.16 & \textbf{29.87}$\pm$1.17 \\
NESBS (SVGD-RD) & \textbf{51.47}$\pm$0.40 & \textbf{66.66}$\pm$0.13 & \textbf{35.02}$\pm$0.37 & \textbf{59.96}$\pm$0.18 & \textbf{16.72}$\pm$0.61 & \textbf{69.88}$\pm$0.16 & \textbf{14.62}$\pm$0.55 & \textbf{31.07}$\pm$0.33 \\
\bottomrule
\end{tabular}
}
\label{tab:adversarial}
\end{table*}
 

\subsection{Single-model Performances and Diverse Model Predictions}\label{sec:trade-off}
We demonstrate that the effectiveness of our NESBS results from its ability to achieve a good trade-off between the single-model performances and the diversity of model predictions. 
We firstly quantitatively compare the single-model performances (measured by the \emph{averaged test error} (ATE) of the models in an ensemble) and the diversity of model predictions (measured by the \emph{pairwise predictive disagreement} (PPD) of an ensemble \citep{disagreement}) achieved by different ensemble (search) algorithms on CIFAR-10/100. 
We further qualitatively visualize their single-model performances and diverse model predictions using a histogram of the ATE of the models in their ensembles and a t-SNE \citep{t-sne} plot of their model predictions, respectively.

\begin{table}[t]
\caption{Quantitative comparison of the single-model performances (measured by ATE (\%), smaller is better) and the diversity of model predictions (measured by PPD (\%), larger is better) achieved by different ensemble (search) algorithms with an ensemble size of $3$ on CIFAR-10/100.}
\renewcommand\multirowsetup{\centering}
\centering
\resizebox{\columnwidth}{!}{
\begin{tabular}{l *{4}{c}}
\toprule
\multirow{2}{*}{\textbf{Method}} & 
\multicolumn{2}{c}{\textbf{C10}} &
\multicolumn{2}{c}{\textbf{C100}} \\
\cmidrule(l){2-3} \cmidrule(l){4-5}
& ATE & PPD & ATE & PPD \\
\midrule 
MC DropPath (DARTS) & 2.71 & 0.39 & 16.68 & 2.63 \\
DeepEns (DARTS) & \textbf{2.69} & 2.08 & \textbf{16.18} & 12.45  \\
NES-RS  & 2.87 & 2.29 & 17.20 & \textbf{14.14} \\
\midrule
NESBS (MC Sampling)  & 2.80 & \textbf{2.57}  & 16.70 & 13.84\\
NESBS (SVGD-RD) & 2.78 & 2.27 & 16.50 & 13.16 \\
\bottomrule
\end{tabular}
}
\label{tab:perf_vs_diver}
\end{table}

Table~\ref{tab:perf_vs_diver} and Fig.~\ref{fig:perf_vs_diver} present the results of our quantitative and qualitative comparisons, respectively. Compared with the ensemble baselines of MC DropPath and DeepEns, our NESBS is capable of enjoying a larger diversity of model predictions while preserving competitive single-model performances. Meanwhile, compared with the ensemble search baseline NES-RS, our algorithm can achieve improved single-model performances while maintaining comparably diverse model predictions. These results suggest that our NESBS is able to select ensembles achieving a better trade-off between the single-model performances and the diversity of model predictions than these baselines, which is known to be an important criterion for well-performing ensembles \citep{zhou-ensemble}. Thus, Table~\ref{tab:perf_vs_diver} and Fig.~\ref{fig:perf_vs_diver} provide empirical justifications for the improved effectiveness of NESBS.



\begin{figure}[t]
\centering
\begin{tabular}{cc}
    \hspace{-6mm}\includegraphics[width=0.525\columnwidth]{figs/avg_test_error_cifar10.pdf} & \hspace{-6mm}\includegraphics[width=0.485\columnwidth]{figs/tsne.pdf} \\
    {\hspace{-3mm} (a) Single-model performances} & \hspace{-6mm} {(b) Diverse predictions}
\end{tabular}
\caption{Qualitative comparison of (a) the single-model performances and (b) the diverse model predictions achieved by different ensemble (search) algorithms with an ensemble size of $n=3$ on CIFAR-10. Each architecture in (b) is independently evaluated ten times to visualize its model predictions, which follows from DeepEns.}
\label{fig:perf_vs_diver}
\end{figure}


\section{Conclusion}
This paper presents a novel neural ensemble search algorithm, called NESBS, that can effectively and efficiently select well-performing neural network ensembles with diverse architectures from a NAS search space. Our extensive experiments have shown that NESBS is able to achieve improved performances while preserving a comparable search cost compared with conventional NAS algorithms. Moreover, even compared with other ensemble (search) baselines (e.g., DeepEns and NES-RS), our NESBS is also capable of enjoying boosted search effectiveness and efficiency, which further suggests the superior performance of our NESBS in practice.

\begin{acknowledgements} % will be removed in pdf for initial submission,
                         % so you can already fill it to test with the
                         % ‘accepted’ class option
This research/project is supported by A*STAR under its RIE$2020$ Advanced Manufacturing and Engineering (AME) Programmatic Funds (Award A$20$H$6$b$0151$).

\end{acknowledgements}

\nocite{pc-darts, concrete, gumbel-softmax, adam, bo, cutout, dropnas}
\bibliography{shu_406}

% \appendix
% \input{appendix}

\end{document}
