\documentclass[accepted]{uai2023} % for initial submission
% \documentclass[accepted]{uai2023} % after acceptance, for a revised
                                    % version; also before submission to
                                    % see how the non-anonymous paper
                                    % would look like
%% There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2023} % ptmx math instead of Computer
                                         % Modern (has noticable issues)
% \documentclass[mathfont=newtx]{uai2023} % newtx fonts (improves upon
                                          % ptmx; less tested, no support)
% NOTE: Only keep *one* line above as appropriate, as it will be replaced
%       automatically for papers to be published. Do not make any other
%       change above this note for an accepted version.

%% Choose your variant of English; be consistent
\usepackage[american]{babel}
% For theorems and such
\usepackage{amsmath}
\usepackage{amssymb}
\usepackage{mathtools}
\usepackage{amsthm}
\usepackage{multirow}
\usepackage{graphicx}
\usepackage{amsmath}
\usepackage{amssymb}
\usepackage{booktabs}
\usepackage{wrapfig}
\usepackage{wrapfig}
\usepackage{float}
\floatstyle{plaintop}
\restylefloat{table}
\usepackage{url} 
\graphicspath{ {figure/} }
\usepackage{tikz}
\usetikzlibrary{shapes,arrows}

\usepackage{amsmath}
\usepackage{amsfonts}
\usepackage{graphicx}
% algorithm
\usepackage{algorithm}
\usepackage{algpseudocode}
% \usepackage[british]{babel}

%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} % amsmath with fixes and additions
% \usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{tikz} % nice language for creating drawings and diagrams

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% THEOREMS
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\theoremstyle{plain}
\newtheorem{theorem}{Theorem}[section]
\newtheorem{proposition}[theorem]{Proposition}
\newtheorem{lemma}[theorem]{Lemma}
\newtheorem{corollary}[theorem]{Corollary}
\theoremstyle{definition}
\newtheorem{definition}[theorem]{Definition}
\newtheorem{assumption}[theorem]{Assumption}
\theoremstyle{remark}
\newtheorem{remark}[theorem]{Remark}
%% Provided macros
% \smaller: Because the class footnote size is essentially LaTeX's \small,
%           redefining \footnotesize, we provide the original \footnotesize
%           using this macro.
%           (Use only sparingly, e.g., in drawings, as it is quite small.)

%% Self-defined macros
\newcommand{\swap}[3][-]{#3#1#2} % just an example

\title{Diversity-enhanced Probabilistic Ensemble For
Uncertainty Estimation}

% The standard author block has changed for UAI 2023 to provide
% more space for long author lists and allow for complex affiliations
%
% All author information is authomatically removed by the class for the
% anonymous submission version of your paper, so you can already add your
% information below.
%
% Add authors
\author[1]{\href{mailto:wangh36@rpi.edu}{Hanjing Wang}}
\author[1]{\href{mailto:jiq@rpi.edu}{Qiang Ji}}
% Add affiliations after the authors
\affil[1]{%
   ECSE\\
    Rensselaer Polytechnic Institute\\
    Troy, New York, USA
}
% \affil[2]{%
%     Second Affiliation\\
%     Address\\
%     …
% }
% \affil[3]{%
%     Another Affiliation\\
%     Address\\
%     …
%   }
  
  \begin{document}
\maketitle

\begin{abstract}
Ensemble methods combine multiple individual models for prediction, which have demonstrated their effectiveness in accurate uncertainty quantification (UQ) and strong robustness. Obtaining a diverse ensemble set of model parameters results in better model averaging performance and better approximation of the true posterior distribution of these parameters. In this paper, we propose the diversity-enhanced probabilistic ensemble method with the adaptive uncertainty-guided ensemble learning strategy for better quantifying uncertainty and further improving the model robustness. Specifically, we construct the probabilistic ensemble model by building a Gaussian distribution of the model parameters for each ensemble component using Laplacian approximation in a post-processing manner. Then a mixture of Gaussian model is established with learnable and refinable parameters in an EM-like algorithm. During ensemble training, we leverage the uncertainty estimated from previous models as guidance when training the next one such that the new model will focus more on the less explored regions by previous models. Various experiments including out-of-distribution detection and image classification under distributional shifts have demonstrated better uncertainty estimation and improved model generalization ability of our proposed method.
\end{abstract}

\section{Introduction}
The real world is full of uncertainty. However, deterministic deep learning models might be overconfident in some predictions that they actually do not know due to the lack of knowledge of those data regions \citep{Lakshminarayanan_NIPS17_ensemble}. Hence, establishing deep learning models in a probabilistic manner is very important for a trusted system, which will enable us to tell when the models will fail in their predictions and guide human behaviors with confidence.

There are mainly two types of uncertainty, namely, epistemic uncertainty and aleatoric uncertainty \citep{Kendall2017}. Epistemic uncertainty represents the prediction uncertainty due to the lack of knowledge when building the models. Aleatoric uncertainty measures the inherent data noise in the distribution, which is irreducible. 
%To quantify aleatoric uncertainty, we can assume the target prediction follows a parametric distribution and use a probabilistic neural network to learn the parameters of the distribution as a function of the input~\cite{Nix1994,Kendall2017}. For quantifying epistemic uncertainty, 
For quantifying those uncertainties, we can rely on Bayesian neural networks (BNNs) which aim at constructing the posterior distribution of the neural network parameters. %The epistemic uncertainty can be quantified by the mutual information between the target prediction and model parameters \cite{depeweg2018decomposition}. 
However, the Bayesian inference performs marginalization over the posterior distribution, which is often intractable in practice. 

Alternatively, the deep ensemble method \citep{Lakshminarayanan_NIPS17_ensemble} trains an ensemble of deep neural networks from random initializations, which demonstrates great success in predictive uncertainty calibration and outperforms various approximate BNNs. Generating sufficient and diverse ensemble components can better approximate the complex posterior distribution. Ensemble diversity is also a good indicator of uncertainty quantification performance and model robustness \citep{dusenberry2020efficient}. Higher diversity enables different models to make independent errors such that their individual mistakes will be canceled out during majority voting and model averaging, leading to better prediction accuracy and improved generalization ability \citep{bian2021does,zhang2020diversified}. However, traditional ensemble methods have limited diversity since each component is trained independently with only different initializations. Generating sufficient and diverse ensemble models requires many initializations and is hence computationally expensive. With limited computational resources, ensemble-based methods can only provide several modes to approximate the posterior distribution, which makes it hard to describe the complex posterior landscape. Moreover, previous methods often train each component independently and ignore the important knowledge from previous models when getting a new model. Finally, there are also multiple sources from which we can gain additional diversity during ensemble training besides random initializations.

To overcome the above limitations, we propose the diversity-enhanced probabilistic ensemble (PE) method, which has the following contributions. 
\begin{itemize}
\item We leverage the PE, a Bayesian framework to model aleatoric and epistemic uncertainty by combining the ensemble method and Laplacian approximation (LA)~\citep{MacKay1992} for Bayesian inference. The diversity of ensemble components is enhanced through exploring the neighborhood of each ensemble member by LA, where performance guarantees are provided. 
\item Given the LA for ensemble members, a mixture of Gaussian (MoG) is constructed with learnable and refinable parameters in an EM-like algorithm, enabling a better posterior approximation of model parameters.
\item We propose an adaptive uncertainty-guided ensemble training strategy (AUEL), where the new ensemble model is trained based on the knowledge of previous models with the guidance of uncertainty, leading to an improved ensemble diversity and a better joint model averaging performance.
\item Various applications have been conducted including out-of-distribution detection and image classification under distributional shifts, which showcase the competitive performance of our method in uncertainty estimation and domain generalization. 
\end{itemize}

\section{Related Work}

\paragraph{Laplacian Approximation}
Laplace approximation assumes a Gaussian posterior distribution by performing Taylor expansion around the mode. 
%The Laplacian approximation gains increasing attention recently for uncertainty estimation due to its simplicity for applying to any pre-trained deterministic neural networks without the need of retraining to generate uncertainty. 
However, constructing the Gaussian posterior for large models by LA is not applicable mainly because of the computational difficulty of the large covariance matrix for high-dimensional model parameters. Several methods have been proposed to improve the efficiency of LA. For example, subnetwork LA \citep{daxberger2021bayesian} and last-layer LA \citep{kristiadi2020being} reduce the number of Bayesian parameters by constructing the posterior distribution only for partial neural network weights. Different Hessian matrix factorization methods have also been proposed, such as Kronecker-factored approximate curvature (KFAC) \citep{Ritter_ICLR18_laplace} and low-rank KFAC \citep{lee2020estimating}. Please refer to the survey paper \citep{daxberger2021laplace} for more information.

\paragraph{Ensemble Methods for Uncertainty Estimation}
Besides the deep ensemble method, different ensemble-based variants have been proposed to improve the UQ efficiency or accuracy. For improving efficiency, deep sub-ensemble~\citep{valdenegro2019deep} ensembles only the layers close to the output. The snapshot ensemble \citep{huang2017snapshot} method collects different ensemble components in different epochs of one training attempt. Considering weight sharing, the batch-ensemble \citep{wen2020batchensemble} method proposes a parameter-efficient representation of ensemble weights. For improving the accuracy, some ensemble methods further explore each ensemble subspace by an approximate posterior estimation such as Multi-SWAG \citep{wilson2020bayesian} and ensemble with subspace sampling \citep{fort2019deep}. Multi-SWAG combines the deep ensemble with SWAG to form a mixture of Gaussian distribution with uniform coefficients while \cite{fort2019deep} built an ensemble model by training multiple variational BNNs with empirical analysis. Recently, \cite{eschenhagen2021mixtures} connected ensemble methods with LA for better uncertainty quantification. Some ensemble techniques such as MIMO \citep{havasi2020training} and Rank-1 BNN \citep{dusenberry2020efficient} also use a mixture of approximate posteriors to capture ensemble components. However, they are mainly designed for training multiple subnetworks in one model's capacity, which is less accurate. Compared to the above methods, we focus on diversity-enhanced ensemble learning for improving UQ accuracy and proposed three sub-modules, including probabilistic ensemble, adaptive uncertainty-guided ensemble learning, and MoG refinement.

\paragraph{Diversity-enhanced Ensemble Learning} Diversity matters for improving ensemble performance. One line of work trains ensemble models with special diversity regularization \citep{zhang2020diversified,zaid2021efficiency,jain2020maximizing,liu1999ensemble,pearce2018_Bayesian_ensemble,wabartha2021handling}. For example, \cite{zhang2020diversified} utilized the pair-wise difference between classifiers %adopted from \cite{li2012diversity} 
as regularization. 
\cite{zaid2021efficiency} created a diversity-promoted ensemble loss based on mutual information. \cite{jain2020maximizing} leveraged out-of-distribution samples as regularization to increase ensemble diversity.  Another line of work focuses on training each ensemble component with a subset of data so that each ensemble model has its own learning specialty to increase diversity \citep{lee2015m,zhou2018diverse}. Moreover, EDST \citep{liu2021deep} and SeBayS \citep{jantre2022sequential} make adjustments to the learning process to obtain ensembles sequentially from diverse models. Finally, \cite{wenzel2020hyperparameter} tried to increase ensemble diversity by training with different hyperparameters. \cite{zaidi2020neural} further constructed the ensemble models with different architectures and proposed a special selection procedure for choosing diverse ensemble members from a pre-trained ensemble model pool. Recently, several particle-based function-space variational inference methods \citep{tiulpin2021greedy, d2021repulsive,yashima2022feature} try to utilize a finite number of models to approximate the Bayesian posterior distribution through optimization, where they often consider the interaction between models with an explicit diversity measurement as regularization. We exclude them from comparisons since our proposed method is in weight space and is a randomization-based method, which is often more efficient than function-space methods. In the development of diversity-enhanced learning, different diversity metrics are studied \citep{wu2020promoting,wu2021boosting}. There are also many related applications such as active learning \citep{tan2021diversity} and computer vision tasks \citep{dvornik2019diversity}.

\section{Proposed Method} \label{Proposed_method}
\subsection{Background}
\textbf{General Notations and Assumptions.} 
Denote the input as $x$, the target variable as $y$, the training data as $\mathcal{D} = \{x_m, y_m\}_{m=1}^M$. 
%Assume $y$ is a random variable with a certain distribution assumption, e.g. categorical for classification. 
In this paper, we will focus on classification tasks. We denote $f(x,\theta) \in \mathbb{R}^C$ as the output of the neural network with input $x$ parameterized by $\mathbf{\theta}$, which is the probability logit before the softmax layer. $C$ represents the number of classes. When constructing the ensemble models, $f(x,\mathbf{\theta}_i)$ represents the output of the $i$th ensemble component. $\beta$ represents the hyperparameter of LA for the prior distribution of $\theta$. $\mathbb{E}(\cdot)$ represents the expectation. $\mathcal{H}(\cdot)$ represents the entropy. 

\textbf{Laplacian Approximation.}
The LA constructs the posterior distribution $p(\mathbf{\theta}|\mathcal{D},\beta)$ by a Gaussian distribution around the MAP estimate $\mathbf{\theta}_{map}$ where
\begin{equation}
    \mathbf{\theta}_{map}=\arg \max_{\mathbf{\theta}} \log p(\mathbf{\theta}|\mathcal{D},\beta).
\end{equation}
By taking the second-order Taylor expansion of $\log p(\mathbf{\theta}|\mathcal{D},\beta)$ around $\mathbf{\theta}_{map}$, we can observe that
\begin{equation}
    p(\mathbf{\theta}|\mathcal{D},\beta) \approx \mathcal{N}(\mathbf{\theta}_{map}, \Sigma)
\end{equation}
where $\Sigma=-(H)^{-1}$ and $H=\nabla_{\mathbf{\theta}}^2 \log p(\mathbf{\theta}|\mathcal{D},\beta)|_{\mathbf{\theta} = \mathbf{\theta}_{map}}$. 
Please refer to Appendix A.1 for more details. This paper utilizes the last-layer LA \citep{kristiadi2020being} to achieve competitive accuracy with high efficiency. 

\textbf{Uncertainty Quantification.} For classification problems, we estimate the epistemic uncertainty and the aleatoric uncertainty by the mutual information and the expected entropy \citep{depeweg2018decomposition}. Details can be found in Appendix A.2. 

\begin{figure}[ht]
    \centering
    \includegraphics[width=0.8\linewidth]{PE_illustration.pdf}

    \caption{Posterior Approximation by Samples.}
    \label{fig:PE}
\end{figure}
\subsection{Probabilistic Ensemble} \label{PE}
Given $N$ pre-trained ensemble models, we denote $\theta_{i}$ as the MAP estimation of the $i$th ensemble component parameters. Inspired by \citet{eschenhagen2021mixtures}, we perform the Laplacian approximation for each ensemble component as an approximation of the true posterior, denoted as $\mathcal{N}(\theta; \theta_{i}, \Sigma_i)$. A mixture of Gaussian model is constructed with coefficients $\{\lambda_i\}_{i=1}^N$ as shown in Eq.~\eqref{mixture_of_Gaussian_PE}, which can better approximate the posterior distribution $p(\theta|\mathcal{D},\beta)$.

\begin{equation}
    \label{mixture_of_Gaussian_PE}
    p(\theta|\mathcal{D},\beta) \approx \sum_{i=1}^N \lambda_i \mathcal{N}(\theta;\theta_{i}, \Sigma_i)
\end{equation}
where $\lambda_i\in [0,1], i=1,2,...,N$ and $\sum_{i=1}^N \lambda_i=1$.

As shown in Figure~\ref{fig:PE}, PE can better approximate the posterior distribution than the deep ensemble method by exploring each ensemble subspace using LA. Given the probabilistic ensemble model, the Bayesian inference is performed as shown in Eq.~\eqref{inference_PE}:
\begin{equation}
    \label{inference_PE}
    \begin{split}
        p(y|x,\mathcal{D}) &\approx \int p(y|x,\theta)\sum_{i=1}^N \lambda_i \mathcal{N}(\theta;\theta_{i}, \Sigma_i)d\theta \\  
        &\approx \frac{1}{S} \sum_{s=1}^S p(y|x,\theta^s).
    \end{split}
\end{equation}
where $\theta^s \sim \sum_{i=1}^N \lambda_i \mathcal{N}(\mathbf{\theta};\mathbf{\theta}_{i},\Sigma_i)$ represents the $s$th sample from the Gaussian mixture model. While our suggested probabilistic ensemble approach may bear resemblances to the method outlined by \citet{eschenhagen2021mixtures}, especially in the context of merging LA with ensemble models, our research is primarily driven by a focus on diversity-enhanced ensemble learning, backed with theoretical validations. The most notable distinctions between our strategy and that of \citet{eschenhagen2021mixtures} predominantly involve our process of securing the diverse modes $\{\theta_i\}_{i=1}^N$, and our methodology in formulating the Gaussian mixture. 

Several propositions are shown to demonstrate the effectiveness of the PE model with theoretical guarantees. They are valid for the PE of any pre-trained deterministic ensemble models, regardless of their training methodology. All the proofs can be found in Appendix B. Specifically, approximation guarantees are shown in Propositions~\ref{PE_convergence} and~\ref{better_posterior}. When the sample size is large, Proposition~\ref{PE_convergence} guarantees that the PE model converges to the true posterior distribution. Otherwise, Proposition~\ref{better_posterior} shows theoretical evidence that the PE model bridges the connection of the deep ensemble method to approximate Bayesian inference and has better posterior approximation than single LA.

\begin{proposition}[Convergence of PE]\label{PE_convergence} Denote the data samples as $\mathcal{D} = \{x_m, y_m\}_{m=1}^M$. Under mild regularity conditions \citep{gelman2011induction}, as the sample size $M\rightarrow \infty$, the probabilistic ensemble representation of $\theta$ approaches its posterior distribution, i.e.,
\begin{equation}
    \label{convergence_PE}
    \sup_{\theta} \left|p(\theta|\mathcal{D},\beta) - \sum_{i=1}^N \lambda_i \mathcal{N}(\theta;\theta_{i}, \Sigma_i)\right| \rightarrow 0.
\end{equation}
\end{proposition}

\begin{proposition}[Better posterior approximation]\label{better_posterior} PE models extend the deep ensemble method for approximate Bayesian inference. Denote $p_{PE}(\theta)=\sum_{i=1}^N \lambda_i \mathcal{N}(\theta;\theta_{i}, \Sigma_i)$ and $p_{LA}^{(i)}(\theta) =  \mathcal{N}(\theta;\theta_{i}, \Sigma_i)$ as the PE approximation and the $i$th-network LA, respectively.
The PE model has better posterior approximation compared to the single LA in terms of the KL divergence:
\begin{equation}
    \mathrm{KL}(p(\theta|\mathcal{D},\beta)\,\|\,p_{PE}(\theta)) \leq \sum_{i=1}^N \lambda_i \mathrm{KL}(p(\theta|\mathcal{D},\beta)\,\|\,p_{LA}^{(i)}(\theta)).
\end{equation}
\end{proposition}

\begin{proposition}[Error reduction of the PE and the role of diversity]\label{theorem1}
Denote $\theta  \sim \sum_{i=1}^N \lambda_i \mathcal{N}(\mathbf{\theta};\mathbf{\theta}_{i},\Sigma_i)$ as PE parameters, $x$ as the input, and $y^*$ as the corresponding label. The PE model fulfills 
\begin{equation}
\label{theorem: error_reduction}
\begin{split}
    -\log \mathbb{E}_{\theta}
    [p(y^*|x,\theta)] \leq  &\mathbb{E}_{\theta}[- \log p(y^*|x,\theta)]\\
    &-\inf_{\theta}\frac{1}{2p(y^*|x,\theta)^2} \mathbb{V}_{\theta}[p(y^*|x,\theta)]
\end{split}
\end{equation}
where $\inf_{\theta}\frac{1}{p(y^*|x,\theta)^2}$ is bounded below by $1$ since $p(y^*|x,\theta)\in [0,1]$, and $\mathbb{V}_{\theta}[p(y^*|x,\theta)]$ is the variance of the probabilistic ensemble model prediction:
\begin{equation}
    \label{variance}
    \mathbb{V}_{\theta}[p(y^*|x,\theta)] = \mathbb{E}_{\theta}[(p(y^*|x,\theta) - \mathbb{E}_{\theta}[p(y^*|x,\theta)])^2]
    \end{equation}
\end{proposition}
Proposition \ref{theorem1} shows that the errors of the PE model are reduced compared to single models, which are also bounded by variance $\mathbb{V}_{\theta}[p(y^*|x,\theta)]$. The diversity measurement $\mathbb{V}_{\theta}[p(y^*|x,\theta)]$ can be applied for both regression and classification tasks since $p(y^*|x,\theta)$ is a scalar variable parameterized by $\theta$ given label $y^*$. With a larger variance, the upper bound of the negative log-likelihood (NLL) is reduced.  As a result, we can theoretically show that enhancing diversity improves the prediction performance when $\mathbb{E}_{\theta}[- \log p(y^*|x,\theta)]$ remains similar. Moreover, Proposition \ref{theorem_diversity} shows that PE has better diversity compared to deep ensemble method as the theoretical basis of the improved performance.

\begin{proposition}[Enhanced diversity of PE]\label{theorem_diversity} Let $\mu_D, \Sigma_D$ be the mean and covariance matrix of the deep ensemble representation $p_{DE}(\theta)=\sum_{i=1}^N \lambda_i \delta(\theta,\theta_i)$ where $\delta$ represents the delta function. Let $\mu_P, \Sigma_P$ be the mean and covariance matrix of $p_{PE}(\theta)$. We show that
\begin{equation}
    \mu_D = \mu_P \quad \quad \Sigma_P \geq \Sigma_D
\end{equation}
where $\Sigma_{P}\geq \Sigma_{D}$ means $\Sigma_{P}- \Sigma_{D}$ is positive semi-definite. Compared to deep ensemble method, the PE model gains improved diversity. 
\end{proposition}

\begin{proposition}[Overconfidence reduction of PE]\label{overconfidence} Given a probabilistic ensemble model with $N$ components, let $f_{\theta_i}\colon \mathbb{R}^{|x|}\rightarrow \mathbb{R}^C$ be a ReLU network parameterized by $\theta_i$. Let $|x|$ represent the dimension of $x$ and $\theta \sim \sum_{i=1}^N \lambda_i \mathcal{N}(\theta;\theta_{i}, \Sigma_i)$. Then for any input $x$, the estimated probability based on multi-class probit approximation (see Appendix A.1) of the PE fulfills 
\begin{equation}
    \lim_{\eta \rightarrow \infty} p_{PE}(y=c|\eta x) \leq \sum_{i=1}^N  \frac{\lambda_i}{1+\sum_{j\neq c} \exp \{ -t_i^{(j)}-t_i^{(c)}\}}
\end{equation}
where 
\[
    t_i^{(k)} = \frac{\|w_i^{(k)}\|}{s_{min}(J_{i}^{(k)})\sqrt{\frac{\pi}{8}\lambda_{min}(\Sigma_i)}}, \quad k = 1,2,\cdots,C,
\]
and $w_i = [w_i^{(1)},w_i^{(2)},\cdots,w_i^{(C)}]\in \mathbb{R}^{|x|\times C}$ is a matrix that only depends on $\theta_i$. $J_i^{(j)} = \frac{\partial w_i^{(j)}}{\partial \theta}|_{\theta = \theta_i}$ is the Jacobian matrix of $w_i^{(j)}$ at $\theta = \theta_i$. $\lambda_{min}$ represents the minimum eigenvalue. $s_{min}$ represents the minimum singular value.   
\end{proposition}

Deterministic models suffer from the overconfidence issue such that the estimated probability is very high even if the input is far away from the data distribution. Proposition~\ref{overconfidence} builds an upper bound for the predictive probability of samples $\{\eta x\}$ when $\eta \rightarrow \infty$, which prevents $p_{PE}(y=c|\eta x)$ from being extremely large. Proposition~\ref{overconfidence} also shows that the confidence for far-away samples is upper bounded by the uncertainty estimated from LA. In particular, when the uncertainty is large, i.e., $\lambda_{min}(\Sigma_i) \rightarrow \infty, i=1,2,\cdots,N$, we have $\lim_{\eta \rightarrow \infty} p_{PE}(y=c|\eta x)\leq \frac{1}{C}$. 


\subsection{Adaptive Uncertainty-guided Ensemble Learning} \label{AUEL}
The deep ensemble method trains ensemble models independently, which ignores the information obtained from previous models when getting a new one. This may cause knowledge redundancy that limits the diversity among ensemble models. 
%Like uncertainty-guided active learning \cite{yang2016active} which add new samples for retraining the network based on uncertainty information, 
The key idea of the proposed adaptive uncertainty-guided ensemble learning is to always make the new model focus on the regions which previous models have less explored, measured by uncertainty. Thus the new model will have the ability to provide complementary information to the previous models, which will improve the model averaging performance as well as implicitly enhance the diversity. %The framework is shown in Figure \ref{fig:framework}.

\begin{figure}[ht]
    \centering
    \includegraphics[width=0.6\linewidth]{PE_figure_aistats.pdf}
    \caption{The Adaptive Uncertainty-guided Ensemble Learning Framework}
    \label{fig:framework}
\end{figure}

Given $k$ trained deterministic models with parameters $\{\theta_i\}_{i=1}^k$, we perform the adaptive uncertainty-guided ensemble learning to get the ($k+1$)th model in the following steps. First, we construct the probabilistic ensemble illustrated in Sec. \ref{PE} such that $\theta \sim \sum_{i=1}^k \lambda_i \mathcal{N}(\theta;\theta_{i}, \Sigma_i)$. Then, the epistemic uncertainty $u(x)$ of each training data $x$ is computed. Finally, we use the estimated epistemic uncertainty $u(x)$ from previous models as weights to guide the training of the $(k+1)$th model. Given a batch of data $\mathcal{D}_B = \{x_m,y_m\}_{m=1}^B$ of size $B$, the uncertainty-guided training loss can be expressed as 

\begin{equation}
\label{uncertainty_guided_nll}
    \mathcal{L}_{nll}(\theta) = -\frac{1}{B}\sum_{m=1}^B w(x_m) \log p(y_m|x_m,\theta) 
\end{equation}

where $w(x_m)$ is the weight for sample $x_m$ as a function of $u(x_m)$, which is shown in Eq.~\eqref{weights}.
\begin{equation}
    \label{weights}
    w(x_m) = \frac{\exp(a\log(u(x_m))+b)}{\sum_{j=1}^B \exp(a\log(u(x_j))+b) }
\end{equation}
Eq.~\eqref{weights} guarantees that samples with larger uncertainty will receive larger weights and the weights for a batch of data sum to 1. A $\log$ function is applied on $u(x_m)$ since the epistemic uncertainty is usually small. $a,b>0$ are hyperparameters that can be tuned. The following propositions provide some theoretical evidence of the proposed method with proofs shown in Appendix C. 
\begin{proposition}[Prediction error bound]\label{prediction_error_bound} The prediction error is bounded by the total uncertainty. The epistemic uncertainty is positively correlated with the prediction error. 
\end{proposition} 
\begin{proposition}[Striking the right balance with uncertainty \citep{khan2019striking}]\label{balance} 
For the imbalanced classification problems, minimizing the empirical loss results in a hypothesis that the classification boundary is towards the minority classes, leading to a larger classification region for the majority ones.
\end{proposition}
% In fact, let $f$ be any classifier with input $x$ and denote $y$ as the corresponding unknown true labels. Following \cite{hellman1970probability}, we have
% \begin{equation}
%     \label{prediction_error}
%     Pr(y\neq f(x)) \leq \frac{\mathcal{H}(y)-MI(x,y)}{2} = \frac{1}{2} \mathcal{H}(y|x)
% \end{equation}
% where $MI(x,y)$ is the mutual information between $x$ and $y$. Note that $\mathcal{H}(y|x)=\mathcal{H}[\mathbb{E}_{\theta}[p(y|x,\theta)]]$ is the total uncertainty, which is positively correlated with the prediction error. It indicates that minimizing the total uncertainty can lead to a better prediction error bound. Since total uncertainty is the sum of epistemic uncertainty and aleatoric uncertainty and aleatoric uncertainty is irreducible, the epistemic uncertainty is also positively correlated with the prediction error. 

Proposition \ref{prediction_error_bound} provides theoretical support for uncertainty-guided learning. By putting higher weights on problematic samples, the proposed method can reduce their uncertainty through adaptive ensemble learning to improve overall accuracy. Proposition \ref{balance} shows that a single model tends to sacrifice minority samples to obtain a good overall performance. It motivates us to adaptively learn complementary models focusing on minority samples, in order to achieve better ensemble performance.

Although our method is similar to boosting methods \citep{freund1997decision, hastie2009multi} in terms of reweighting the samples, the two are fundamentally different. As a discriminative model, boosting methods build an ensemble classifier by combining a set of weak classifiers to better classify the data. In contrast, we construct a generative ensemble that better models the posterior distribution of model parameters, through which we perform uncertainty quantification. Moreover, instead of using classification errors to weigh the samples, we use epistemic uncertainty to weigh the training samples. As epistemic uncertainty inversely measures training sample density, training of the next model will focus more on the samples that are not well represented by previous models. 

\subsection{Mixture of Gaussian Refinement}\label{refine}
In this section, we will establish an EM-like algorithm for refining the mixture of Gaussian parameters. To our knowledge, most of the ensemble methods assume that each ensemble component has the same importance, which may not be the case for real-world applications. Denote $\phi = \{\{\lambda_i\}_{i=1}^N, \{\theta_i\}_{i=1}^N, \{\Sigma_i\}_{i=1}^N\}$ as the mixture of Gaussian parameters, $\phi^0 = \{\{\lambda_i^0\}_{i=1}^N, \{\theta_i^0\}_{i=1}^N, \{\Sigma_i^0\}_{i=1}^N\}$ as the previous learned parameters before the refinement, and the training data as $\mathcal{D} = \{\mathcal{D}_m\}_{m=1}^M = \{x_m, y_m\}_{m=1}^M$. Let $Z \sim Cat(\lambda_1,\lambda_2,\dots,\lambda_N)$ be the latent variable indicating membership of $(x,y)$ belonging to which ensemble component. We learn non-uniform $\{\lambda_i\}_{i=1}^N$ and refine $\{\{\theta_i\}_{i=1}^N, \{\Sigma_i\}_{i=1}^N\}$ in the following EM steps.

E-step: construct the loss function $Q(\phi|\phi^0,\mathcal{D})$ as the expected value of the log-likelihood function of $\phi$ with respect to the current conditional distribution of $Z$ given $\phi^0$ and $\mathcal{D}$.
\begin{equation}\label{eq:e-step}
\begin{split}
    \log &p(\mathcal{D}|\phi) = \sum_{m=1}^M \log p(\mathcal{D}_m|\phi) \\%= \sum_{m=1}^M \log \sum_{i=1}^N p(\mathcal{D}_m,Z=i|\phi) \\
    &= \sum_{m=1}^M \log \sum_{i=1}^N \frac{p(Z=i|\mathcal{D}_m,\phi^0)}{p(Z=i|\mathcal{D}_m,\phi^0)} p(\mathcal{D}_m,Z=i|\phi)\\
    &\geq \sum_{m=1}^M \sum_{i=1}^N p(Z=i|\mathcal{D}_m,\phi^0) \log \frac{p(\mathcal{D}_m,Z=i|\phi)}{p(Z=i|\mathcal{D}_m,\phi^0)} \\
    &:= Q(\phi|\phi^0,\mathcal{D})
\end{split}
\end{equation}
M-step: maximize $Q(\phi|\phi^0,\mathcal{D})$ with respect to $\phi$. 
\begin{equation} \label{eq:m-step}
    \phi^* = \arg \max_{\phi} Q(\phi|\phi^0,\mathcal{D})
\end{equation}
Optimizing Eq.~\eqref{eq:m-step} returns a closed-form expression of $\{\lambda_i^*\}_{i=1}^N$.
\begin{equation}
    \lambda_i^* = \frac{\sum_{m=1}^M p(Z=i|\mathcal{D}_m,\phi^0)}{\sum_{m=1}^M \sum_{j=1}^N p(Z=j|\mathcal{D}_m,\phi^0)} 
\end{equation}
Letting $p_m(\theta) =  p(y_m|x_m,\theta)$,
\begin{equation}
    p(Z=i|\mathcal{D}_m,\phi^0) = \frac{\lambda_i^0 \int p_m(\theta)\mathcal{N}(\theta; \theta_i^0,\Sigma_i^0)d\theta}{\sum_{j=1}^N \lambda_j^0 \int p_m(\theta)\mathcal{N}(\theta; \theta_j^0,\Sigma_j^0)d\theta}
\end{equation}
Then, given the distribution $Z \sim Cat(\{\lambda_i^*\}_{i=1}^N)$, we assign each data sample to its top $l$ nearest components based on their weighted log-likelihood (i.e., $l = N/2$). The refinement is conducted by fine-tuning the existing ensemble components on the data samples they receive to further strengthen the specialty and diversity of each ensemble model. Details can be found in Appendix D. 
\subsection{Probabilistic Ensemble Training Strategy}
In this paper, three sub-modules are proposed: the probabilistic ensemble built by LA, the uncertainty-guided ensemble learning, and the mixture of Gaussian refinement. The pseudocode of the overall proposed method is shown in Algorithm \ref{alg:ensemble}, consisting of four steps. Although the final refinement step can further improve performance, it is not required. 

During training, we admit that AUEL requires sequential training, which takes more time than parallel training. However, our methods can achieve similar UQ results with fewer ensemble components, compared to other ensemble baselines in Sec.~\ref{sec:ablation}. It could be more useful when there are limited capacities for parallel training or when there exist parallelly trained ensemble models with low diversity and we want to add a new one for providing complementary information. PE can be applied to any trained ensemble models with high efficiency. The last-layer LA is efficient, with complexity $O(m+c^3+p^3)$, where $m$, $c$, and $p$ represent the total number of parameters, the number of classes, and the number of last-layer parameters, respectively. For the inference complexity of PE, we can generate an arbitrary number of samples from the mixture of Gaussian. Compared to the deep ensemble method, the additional cost to obtain one more sample is $O(p)$, which is minimal since we only sample the last-layer parameters and reuse the intermediate outputs. More importantly, each sub-module can be applied to other ensemble methods separately to make further improvements. Although incorporating all sub-modules leads to the best performance, only applying PE could be an alternative way for efficient training.

The possible parallel training extensions may include: (1) Train one deterministic model using LA for UQ, then parallelly train other models with varying uncertainty-driven weights from Eq.~(11) with different hyperparameters $a,b$; (2) Train all models in parallel, compute LA for each near completion to build PE, and use uncertainty-guided weights to refine the models in their final training phase. We will investigate those possibilities in our future research. It is also worth noting that the proposed method can be applied to autoregressive ensemble training \citep{havasi2020training, dusenberry2020efficient}. Uncertainty-guided weights can promote diversity in MIMO sub-networks, and LA can be used to construct a probabilistic ensemble model after training.

\begin{algorithm}[h]
\caption{Probabilistic Ensemble with Adaptive Uncertainty-guided Ensemble Learning}\label{alg:ensemble}
\begin{algorithmic}
\State {\bfseries Input:} Training data  $\mathcal{D} = \{x_m, y_m\}_{m=1}^M$. Initialize the model pool $P=\{\empty\}$
\State {\bfseries Output:} The probabilistic ensemble model parameters $\theta \sim \sum_{i=1}^N \lambda_i \mathcal{N}(\theta;\theta_{i}, \Sigma_i)$
\State {\bfseries Step 1 (single model):} Train the first model using NLL loss to obtain $\theta_1$; $P = P + \{\theta_1\}$
\State {\bfseries Step 2 (AUEL):} Perform the adaptive uncertainty-guided ensemble learning to obtain $\{\theta_i\}_{i=2}^N$
\For{$k=2:N$}
    \State (1) Given $P$, construct the probabilistic ensemble model with uniform weights.
    \State ~~~~$\theta \sim p_{k-1}(\theta)=\frac{1}{k-1}\sum_{i=1}^{k-1}  \mathcal{N}(\theta;\theta_{i}, \Sigma_i)$
    \State (2) Estimate the epistemic uncertainty $\{u(x_m)\}_{m=1}^M$ using $\theta \sim p_{k-1}(\theta)$
    \State (3) Use the weighted loss in Eq.~\eqref{uncertainty_guided_nll} to train $\theta_k$
    \State (4) Update the model pool: $P = P + \{\theta_k\}$
\EndFor
\State {\bfseries Step 3 (AUEL+PE):} Based on current $P=\{\theta_i\}_{i=1}^N$, construct the probabilistic ensemble model
\State {\bfseries Step 4 (AUEL+RPE):} We refine the Gaussian mixture model parameters based on Sec. \ref{refine}
\end{algorithmic}
\end{algorithm}

\begin{table*}[ht]
% \fontsize{8.5}{9}\selectfont
	\caption{OOD Detection Results for AUROC (\%) and AUPR (\%) on MNIST-related and C10-related Datasets with Epistemic Uncertainty. Each experiment result is aggregated over 3 independent runs.}
	\label{tab:ood_result}
	\centering
\begin{tabular}{|l|cc|cc|cc|}

\hline
	\multirow{2}{*}{Method} & \multicolumn{2}{|c|}{MNIST $\rightarrow$ Omniglot} &\multicolumn{2}{|c|}{MNIST $\rightarrow$ EMNIST} & \multicolumn{2}{|c|}{MNIST$ \rightarrow$ KMNIST} \\ \cline{2-7}
	&AUROC&AUPR &AUROC &AUPR  &AUROC &AUPR \\
\hline  
\multirow{1}{*}{Ours}  &$\textbf{98.49} \pm  \scalebox{0.85}{0.01}$   & $\textbf{98.23}\pm \scalebox{0.85}{0.03}$ & $\textbf{98.01} \pm \scalebox{0.85}{0.07}$ & $\textbf{97.26} \pm \scalebox{0.85}{0.08}$ & $\textbf{98.39}\pm \scalebox{0.85}{0.11}$ & $\textbf{97.98}\pm \scalebox{0.85}{0.08}$ \\
\multirow{1}{*}{ESB}  &$97.92 \pm  \scalebox{0.85}{0.25}$   & $97.33 \pm \scalebox{0.85}{0.34}$ & $97.32 \pm \scalebox{0.85}{0.14}$ & $96.10 \pm \scalebox{0.85}{0.46}$ & $97.92\pm \scalebox{0.85}{0.10}$ & $97.13\pm \scalebox{0.85}{0.27}$ \\
\multirow{1}{*}{Batch-E}  &$95.95 \pm  \scalebox{0.85}{0.17}$   & $94.74 \pm \scalebox{0.85}{0.22}$ & $95.79 \pm \scalebox{0.85}{0.70}$ & $93.76 \pm \scalebox{0.85}{0.75}$ & $96.59\pm \scalebox{0.85}{0.45}$ & $94.72\pm \scalebox{0.85}{0.43}$ \\
\multirow{1}{*}{Hyper-E}  &$97.97 \pm  \scalebox{0.85}{0.26}$   & $97.55 \pm \scalebox{0.85}{0.24}$ & $97.56 \pm \scalebox{0.85}{0.31}$ & $96.68 \pm \scalebox{0.85}{0.51}$ & $97.92\pm \scalebox{0.85}{0.43}$ & $97.32\pm \scalebox{0.85}{0.53}$ \\
\multirow{1}{*}{Bayes-E}  &$97.42 \pm  \scalebox{0.85}{0.28}$   & $96.94 \pm \scalebox{0.85}{0.46}$ & $97.07 \pm \scalebox{0.85}{0.29}$ & $95.86 \pm \scalebox{0.85}{0.33}$ & $97.73\pm \scalebox{0.85}{0.06}$ & $96.72\pm \scalebox{0.85}{0.14}$ \\
\multirow{1}{*}{LPBNN}  &$95.94 \pm  \scalebox{0.85}{0.52}$ & $94.41 \pm \scalebox{0.85}{0.57}$  & $92.84 \pm \scalebox{0.85}{0.69}$ & $92.54 \pm \scalebox{0.85}{0.39}$  & $97.40\pm \scalebox{0.85}{0.71}$ & $95.96\pm \scalebox{0.85}{0.95}$ \\
\multirow{1}{*}{LA}  &$97.87 \pm  \scalebox{0.85}{0.39}$   & $97.49 \pm \scalebox{0.85}{0.37}$ & $97.72 \pm \scalebox{0.85}{0.48}$ & $97.02 \pm \scalebox{0.85}{0.44}$ & $98.11\pm \scalebox{0.85}{0.19}$ & $97.54\pm \scalebox{0.85}{0.17}$ \\
\multirow{1}{*}{Multi-SWAG} &$96.52 \pm  \scalebox{0.85}{0.37}$   & $94.56 \pm \scalebox{0.85}{0.84}$ & $95.81 \pm \scalebox{0.85}{0.60}$ & $90.64 \pm \scalebox{0.85}{1.70}$ & $96.70\pm \scalebox{0.85}{0.42}$ & $94.34\pm \scalebox{0.85}{0.98}$ \\
\multirow{1}{*}{Diversified-E} &$97.92 \pm  \scalebox{0.85}{0.19}$   & $97.21 \pm \scalebox{0.85}{0.23}$ & $94.40 \pm \scalebox{0.85}{0.16}$ & $96.21 \pm \scalebox{0.85}{0.37}$ & $97.93\pm \scalebox{0.85}{0.12}$ & $97.01\pm \scalebox{0.85}{0.32}$ \\
\multirow{1}{*}{MCT} &$97.04 \pm  \scalebox{0.85}{0.34}$   & $95.62 \pm \scalebox{0.85}{0.93}$ & $96.65 \pm \scalebox{0.85}{0.46}$ & $95.61 \pm \scalebox{0.85}{0.82}$ & $97.31\pm \scalebox{0.85}{0.10}$ & $95.81\pm \scalebox{0.85}{0.56}$ \\
\hline
\end{tabular}\\
\begin{tabular}{|l|cc|cc|cc|}

\hline
	\multirow{2}{*}{Method} & \multicolumn{2}{|c|}{C10 $\rightarrow$ SVHN} &\multicolumn{2}{|c|}{C10 $\rightarrow$ LSUN} & \multicolumn{2}{|c|}{C10$ \rightarrow$ C100} \\ \cline{2-7}
	&AUROC&AUPR &AUROC &AUPR  &AUROC &AUPR \\
\hline  
\multirow{1}{*}{Ours}  &$93.88 \pm  \scalebox{0.85}{0.57}$   & $90.58 \pm \scalebox{0.85}{1.58}$ & $\textbf{89.57} \pm \scalebox{0.85}{0.08}$ & $\textbf{86.81} \pm \scalebox{0.85}{0.14}$ & $\textbf{93.80}\pm \scalebox{0.85}{0.11}$ & $91.67\pm \scalebox{0.85}{0.36}$ \\
\multirow{1}{*}{ESB}  &$91.23 \pm  \scalebox{0.85}{1.35}$   & $86.16 \pm \scalebox{0.85}{1.73}$ & $88.42 \pm \scalebox{0.85}{0.85}$ & $84.99 \pm \scalebox{0.85}{0.65}$ & $91.87\pm \scalebox{0.85}{0.58}$ & $88.69\pm \scalebox{0.85}{0.55}$ \\
\multirow{1}{*}{Batch-E}  &$90.40 \pm  \scalebox{0.85}{1.62}$   & $85.12 \pm \scalebox{0.85}{2.64}$ & $86.10 \pm \scalebox{0.85}{0.24}$ & $81.42 \pm \scalebox{0.85}{0.40}$ & $90.15\pm \scalebox{0.85}{0.18}$ & $85.48\pm \scalebox{0.85}{0.49}$ \\
\multirow{1}{*}{Hyper-E}  &$91.11 \pm  \scalebox{0.85}{0.32}$   & $85.86 \pm \scalebox{0.85}{0.46}$ & $88.82 \pm \scalebox{0.85}{0.15}$ & $85.29 \pm \scalebox{0.85}{0.25}$ & $92.59\pm \scalebox{0.85}{0.24}$ & $89.65\pm \scalebox{0.85}{0.71}$ \\
\multirow{1}{*}{Bayes-E}  &$90.96 \pm  \scalebox{0.85}{3.35}$   & $86.57 \pm \scalebox{0.85}{5.27}$ & $87.85 \pm \scalebox{0.85}{1.22}$ & $84.56 \pm \scalebox{0.85}{1.01}$ & $91.80\pm \scalebox{0.85}{0.45}$ & $88.83\pm \scalebox{0.85}{0.02}$ \\
\multirow{1}{*}{LPBNN}  &$89.99 \pm  \scalebox{0.85}{2.44}$   & $85.18 \pm \scalebox{0.85}{4.00}$ & $86.87 \pm \scalebox{0.85}{0.01}$  & $82.14\pm \scalebox{0.85}{0.49}$ & $90.80\pm \scalebox{0.85}{0.22}$ & $85.62 \pm \scalebox{0.85}{1.59}$\\
\multirow{1}{*}{LA}   &$93.39 \pm  \scalebox{0.85}{0.46}$   & $91.17 \pm \scalebox{0.85}{0.98}$ & $87.27 \pm \scalebox{0.85}{0.19}$ & $85.77 \pm \scalebox{0.85}{0.21}$ & $93.45\pm \scalebox{0.85}{1.17}$ & $\textbf{92.59}\pm \scalebox{0.85}{1.47}$ \\
\multirow{1}{*}{Multi-SWAG} &$\textbf{94.06} \pm  \scalebox{0.85}{0.54}$   & $\textbf{93.92} \pm \scalebox{0.85}{0.59}$ & $87.23 \pm \scalebox{0.85}{0.29}$ & $85.44 \pm \scalebox{0.85}{0.61}$ & $90.24\pm \scalebox{0.85}{0.86}$ & $88.05\pm \scalebox{0.85}{1.02}$\\
\multirow{1}{*}{Diversified-E}& $92.56\pm \scalebox{0.85}{1.36}$ & $88.04 \pm \scalebox{0.85}{3.29}$ & $89.06 \pm \scalebox{0.85}{0.09}$ &$85.53\pm \scalebox{0.85}{0.20}$ & $92.90\pm \scalebox{0.85}{0.07}$ & $90.01\pm \scalebox{0.85}{0.16}$\\
\multirow{1}{*}{MCT} &$91.04\pm \scalebox{0.85}{0.44}$ & $84.73 \pm \scalebox{0.85}{0.35}$ & $88.71 \pm \scalebox{0.85}{0.16}$ &$84.86\pm \scalebox{0.85}{0.18}$ & $92.18\pm \scalebox{0.85}{0.03}$ & $88.67\pm \scalebox{0.85}{0.24}$\\
\hline
\end{tabular}

\end{table*}
\section{Experiment}

\subsection{Out-of-distribution Detection}
\label{sub:ood}
Out-of-distribution (OOD) detection tries to detect anomalous data that is inconsistent with the training data distribution. 
Utilizing epistemic uncertainty as a measure for out-of-distribution detection is one of the major applications for demonstrating the quality of UQ performance. We evaluate our methods on the benchmark image classification datasets MNIST \citep{deng2012mnist} and CIFAR-10 (C10) \citep{krizhevsky2014cifar}. We choose Omniglot \citep{lake2015human}, EMNIST \citep{cohen2017emnist}, and KMNIST \citep{clanuwat2018deep} as OOD datasets for MNIST. For the C10 dataset, SVHN \citep{netzer2011reading}, LSUN \citep{journals/corr/YuZSSX15}, and CIFAR-100 (C100) \citep{krizhevsky2009learning} are the OOD datasets. We compare our proposed method (AUEL+PE) with general ensemble-based methods (i.e., ESB \citep{Lakshminarayanan_ensemble_NIPS17}, Batch-E \citep{wen2020batchensemble}, Bayes-E \citep{pearce2018_Bayesian_ensemble}), diversity-promoting ensemble methods (i.e., Hyper-E \citep{wenzel2020hyperparameter}, Multi-SWAG \citep{wilson2020bayesian}, Diversified-E \citep{zhang2020diversified}, MCT \citep{lee2015m}), and approximate BNNs (i.e., LPBNN \citep{franchi2020encoding}, LA). We exclude sequential ensemble methods (EDST \citep{liu2021deep}, SeBayS \citep{jantre2022sequential}) and other mixture posterior approximation methods (MIMO \citep{havasi2020training}, Rank-1 BNN \citep{dusenberry2020efficient}) from the comparison since they have been shown to perform worse than the ESB method. The evaluation metrics include the area under the receiver operating characteristic curve (AUROC $\uparrow$) and the area under the precision-recall curve (AUPR $\uparrow$). All ensemble-based methods have size 5. The experiment settings and implementation details can be found in Appendix E.

The out-of-distribution detection performance is shown in Table \ref{tab:ood_result}. It is obvious that the proposed method (AUEL+PE) can achieve significant improvement over recent ensemble-based methods on various OOD detection tasks. Additional OOD detection experiments are shown in Sec. 4.2 for MNIST and C10 under different levels of distributional shifts. Since the post-processing refinement of the MoG is not required, we will show the effectiveness of the refinement (AUEL+RPE) in Sec. \ref{sec:ablation}. Compared to diversity-enhanced ensemble learning such as Hyper-E and Multi-SWAG, our better OOD detection performance also indicates enhanced diversity. 

\begin{figure*}[ht]
    \centering
    \includegraphics[width=1.0\linewidth]{PE_shift_mnist_cifar.pdf}
    \caption{Predictive Calibration Analysis of Rotated MNIST and Corrupted C10 Datasets. The first row shows the results for MNIST while the second row represents C10. There are three different metrics (ECE, Brier Score, NLL) that are analyzed in each column, respectively. Each experiment result is aggregated over 3 independent runs.}
    \label{fig:ece}
\end{figure*}

\subsection{Image Classification under Distributional Shift}
Bayesian models marginalize all possible solutions for the final prediction, leading to improved robustness. In this section, we will demonstrate the effectiveness of the proposed method for image classification tasks on MNIST and C10 with synthetic distributional shifts. For MNIST, we create the synthetic rotated MNIST dataset, where we increasingly rotate the MNIST testing data from $0^{\circ}$ to $180^{\circ}$ with a step of $20^{\circ}$. For the C10 dataset, we add Gaussian noise with zero mean and variance ranging from 0 to 0.25 with a step of 0.05 to the testing data to create the corrupted C10 dataset. Additional adversarial shifts can be found in Sec.~\ref{sec:ablation}. Note that we keep the original training strategy on MNIST/C10 training data but test on the shifted testing data. During the evaluation, the uncertainty calibration metrics include negative log-likelihood (NLL $\downarrow$), accuracy (ACC $\uparrow$), expected calibration error (ECE $\downarrow$), maximum calibration error (MCE $\downarrow$), and Brier score (BS $\downarrow$). We also provide the OOD detection results of MNIST $\rightarrow$ Rotated MNIST and C10 $\rightarrow$ Corrupted C10 in terms of AUROC and AUPR. The comparisons are conducted under the same experiment settings as Sec.~\ref{sub:ood}. In Figure~\ref{fig:ece}, partial results for ECE, BS, and NLL are shown.  Additional analysis can be found in Appendix F. 

Based on Figure \ref{fig:ece}, we can observe that the probabilistic ensemble method can achieve better calibration performance for both rotated MNIST and corrupted C10 datasets. As the shift level increases, our proposed method consistently outperforms other ensemble-based methods, which demonstrates the great potential of our method in better generalization ability. Besides improved robustness of uncertainty quantification, comparable within-dataset performance can be found in Appendix F.1. 

\begin{table}[h]
    \centering
	\caption{Diversity Analysis of Ensemble-based Methods Trained on C10 dataset}
	\label{tab:diversity}
\begin{tabular}{|l|c|c|c|c|}
\hline
	Method  &QS& BD & FK & KW \\ 
\hline  
\multirow{1}{*}{Ours} &$\textbf{0.174}$ &$\textbf{0.538}$ &$\textbf{0.383}$ &$\textbf{0.857}$    \\
\multirow{1}{*}{ESB}   &$0.185$ &$0.552$ &$0.404$ &$0.860$   \\
\multirow{1}{*}{Batch-E}  &$0.284$ &$0.576$ &$0.422$ &$0.868$  \\
\multirow{1}{*}{Hyper-E} &$0.199$ &$0.554$ &$0.406$ &$0.861$  \\
\multirow{1}{*}{Bayes-E}  &$\textbf{0.174}$ &$0.548$ &$0.406$ &$0.859$  \\
\multirow{1}{*}{LPBNN}  &$0.209$ &$0.557$ &$0.405$ &$0.862$  \\
\multirow{1}{*}{Multi-SWAG}  &$0.246$ &$0.566$ &$0.418$ &$0.865$ \\
\multirow{1}{*}{Diversified-E} &$0.184$&$0.552$&$0.402$&$0.860$\\
\multirow{1}{*}{MCT} &$0.190$&$0.553$&$0.405$&$0.861$\\
\hline
\end{tabular}
\end{table}

\begin{table*}[t]
% \fontsize{8.5}{9}\selectfont
	\caption{Ablation Studies: OOD Detection Results and Robustness Analysis on MNIST/C10 Datasets. The first table shows the effectiveness of the sub-modules. The second table shows the improvement when PE serves as a plug-and-play module. Each experiment result is aggregated over 3 independent runs.}
	\label{tab:ablation}
	\centering
\begin{tabular}{|l|cc|cc|cc|cc|}

\hline
	\multirow{2}{*}{Method} & \multicolumn{2}{|c|}{MNIST $\rightarrow$ Omniglot} &\multicolumn{2}{|c|}{C10 $\rightarrow$ SVHN} &\multicolumn{2}{|c|}{Rotated MNIST $60^{\circ}$} & \multicolumn{2}{|c|}{Noisy C10 Level 0.1} \\ \cline{2-9}
	&AUROC&AUPR 	&AUROC&AUPR &NLL &ECE &NLL  &ECE \\
\hline  
\multirow{1}{*}{Ensemble} &$97.92$ & $97.33$ & $91.23$& $86.16$& $2.30$& $0.256$& $3.58$ &$0.435$\\
\multirow{1}{*}{AUEL}& $98.02$ & $97.50$ &$92.98$ &$88.97$ &$2.24$ & $0.243$ &$3.28$ &$0.394$\\
\multirow{1}{*}{AUEL+PE} & $98.49$ & $98.23$ & $93.88$& $90.58$ & $2.09$ & $0.210$ & $3.06$ & $0.380$\\
\multirow{1}{*}{AUEL+RPE} &$\textbf{98.95}$& $\textbf{98.90}$ & $\textbf{93.93}$& $\textbf{91.93}$ & $\textbf{1.92}$&$\textbf{0.163}$  & $\textbf{3.02}$ & $\textbf{0.375}$\\
\hline
\end{tabular}
\begin{tabular}{|l|cc|cc|cc|cc|}

\hline
	\multirow{2}{*}{Method} & \multicolumn{2}{|c|}{MNIST $\rightarrow$ Omniglot} &\multicolumn{2}{|c|}{C10 $\rightarrow$ SVHN} &\multicolumn{2}{|c|}{Rotated MNIST $120^{\circ}$} & \multicolumn{2}{|c|}{Noisy C10 Level 0.1} \\ \cline{2-9}
	&AUROC&AUPR 	&AUROC&AUPR &NLL &ECE &NLL  &ECE \\
\hline  
\multirow{1}{*}{Hyper-E}&$97.97$&$97.55$ & $91.11$& $85.86$ & $4.40$&$0.468$& $3.23$ &$0.407$\\
\multirow{1}{*}{Hyper-E + PE}&$\textbf{98.56}$ &$\textbf{98.37}$ & $\textbf{92.09}$ &$\textbf{87.52}$  &$\textbf{3.72}$& $\textbf{0.416}$& $\textbf{2.66}$ & $\textbf{0.342}$\\
\multirow{1}{*}{Bayes-E} & $97.42$&$96.94$ & $90.96$ &$86.57$ &$4.55$ &$0.502$ & $3.36$& $0.404$\\
\multirow{1}{*}{Bayes-E + PE} & $\textbf{98.21}$&$\textbf{98.02}$& $\textbf{93.27}$ & $\textbf{90.41}$ & $\textbf{3.68}$& $\textbf{0.450}$& $\textbf{2.40}$ & $\textbf{0.298}$\\
\hline
\end{tabular}
\end{table*}


\subsection{Diversity Analysis}

In addition to the theoretical confirmation of augmented diversity exhibited in Sections \ref{PE} and \ref{AUEL}, we also supply empirical analysis underscoring the diversity benefits derived from our proposed methodology. The diversity metrics we employ originate from \cite{wu2020promoting}, featuring both pairwise diversity measures like Q Statistics (QS) and Binary Disagreement (BD), and non-pairwise metrics such as Fleiss' Kappa (FK) and Kohavi-Wolpert Variance (KW).

We undertake a normalization process for all these scores to ensure that lower values ($\downarrow$) signify a higher degree of diversity. As illustrated in Table \ref{tab:diversity}, our approach surpasses other ensemble-based techniques in relation to diversity. 

It is crucial to highlight that our proposed method leverages uncertainty-guided learning via AUEL to generate diverse modes, utilizes LA for neighborhood exploration to yield diverse samples, and employs an EM-like refinement strategy to further boost diversity. On the other hand, the baseline methods generally concentrate on fostering diversity in a single area.

Aside from numerical findings, we provide visualizations of both parameter space and prediction space diversity in Appendix G. Essentially, we represent the neural network parameters and the predictive logits for MNIST testing data within a two-dimensional space, utilizing principal component analysis (PCA).

\subsection{Ablation Studies and Further Analysis}\label{sec:ablation}

\textbf{Effectiveness of Sub-modules.}
In this section, we evaluate the effectiveness of each step illustrated in Algorithm \ref{alg:ensemble}. Each proposed sub-module helps further improve the OOD detection and uncertainty calibration performance. The MNIST and CIFAR-10 related experiments are shown in Table \ref{tab:ablation}. More analysis for various experiment settings with different metrics can be found in Appendix H.1.\\ \\
\textbf{Probabilistic Ensemble as a Plug-and-Play Module.}
Our method can serve as a plug-and-play module that is easily applied to other ensemble methods to further improve them. Given trained ensemble models from other ensemble methods, we can apply the PE module to construct the mixture of Gaussian model in a post-processing manner.  For example, we combine the Hyper Ensemble (Hyper-E) with PE and the Bayesian Ensemble (Bayes-E) with PE to show further improvements in Table \ref{tab:ablation}. Additional analysis can be found in Appendix H.2. \\\\
\textbf{Efficiency Analysis.} 
In Appendix H.3, we present a thorough theoretical and practical evaluation of our methodology's efficiency against various ensemble baselines. In addition, we extend our analysis to compare ensemble baseline models with varying numbers of components. The findings demonstrate that our approach necessitates a smaller number of ensemble components to reach comparable outcomes.  \\
\textbf{Application to Larger Datasets.} In Appendix I, we demonstrate the suitability of our techniques for handling larger datasets, such as CIFAR-100 and TinyImagenet \citep{Tiny}. Our approach can effectively scale with a large number of parameters in the last layer. We can utilize diagonal or block-diagonal covariance matrices for LA, which scale impressively while maintaining competitive accuracy, as shown by \citet{daxberger2021laplace}. \\\\
\textbf{Other Distributional Shifts.} In Appendix J, we perform adversarial perturbations on the C10 testing dataset using the fast gradient sign method \citep{goodfellow2014explaining}. Then, we compute the ACC and NLL of our proposed methods on the perturbed images compared to various ensemble baselines, indicating the effectiveness of our method against adversarial attacks.  \\\\
\textbf{Synthetic Experiments.} In Appendix K, we provide some toy examples of the one-dimensional regression problem and the two-moon classification problem. These examples show that the estimated epistemic uncertainty of the PE model inversely matches well with the training data density. 

\section{Conclusion}
In this paper, we propose the probabilistic ensemble method with adaptive uncertainty-guided ensemble training to construct the Gaussian mixture model with learnable and refinable parameters. Both theoretical and empirical evidence is provided to show that our proposed method can achieve a better approximation of the posterior distribution with enhanced diversity. Moreover, the proposed method has demonstrated better uncertainty quantification performance as well as improved uncertainty calibration ability for various applications including out-of-distribution detection and image classification under different distributional shifts.

\bibliography{main}

\end{document}
