\documentclass[accepted]{uai2022} % after acceptance, for a revised
                                    % version; also before submission to
                                    % see how the non-anonymous paper
                                    % would look like
%% There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2022} % ptmx math instead of Computer
                                         % Modern (has noticeable issues)
% \documentclass[mathfont=newtx]{uai2022} % newtx fonts (improves upon
                                          % ptmx; less tested, no support)
% NOTE: Only keep *one* line above as appropriate, as it will be replaced
%       automatically for papers to be published. Do not make any other
%       change above this note for an accepted version.

%% Choose your variant of English; be consistent
\usepackage[american]{babel}
% \usepackage[british]{babel}

\usepackage{nameref}
\usepackage{zref-xr}
\zxrsetup{toltxlabel}
\zexternaldocument*{tifrea_575}

%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} % amsmath with fixes and additions
% \usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{tikz} % nice language for creating drawings and diagrams

\usepackage{macros}

\renewcommand\thesection{\Alph{section}}

%% Provided macros
% \smaller: Because the class footnote size is essentially LaTeX's \small,
%           redefining \footnotesize, we provide the original \footnotesize
%           using this macro.
%           (Use only sparingly, e.g., in drawings, as it is quite small.)

%% Self-defined macros
\newcommand{\swap}[3][-]{#3#1#2} % just an example

\title{Semi-supervised novelty detection using\\ ensembles with regularized
disagreement -- Supplementary material}

% The standard author block has changed for UAI 2022 to provide
% more space for long author lists and allow for complex affiliations
%
% All author information is automatically removed by the class for the
% anonymous submission version of your paper, so you can already add your
% information below.
%
% Add authors

\author[1]{\href{mailto:<tifreaa@inf.ethz.ch>?Subject=Your UAI 2022
paper}{Alexandru~Țifrea}{}}
\author[1]{Eric~Stavarache}
\author[1]{Fanny~Yang}
% Add affiliations after the authors
\affil[1]{%
    Department of Computer Science\\
    ETH Zurich\\
    Switzerland
}
  
\begin{document}

\onecolumn

\maketitle

% #############################################################################
\section{Theoretical statements}
\label{sec:appendix_theory}

\begin{definition}[$(\eps, \noise)$-clusterable data set]

  We say that a data set $\DD=\{ (x_i, y_i) \}_{i=1}^n$ is \emph{($\eps,
  \noise$)-clusterable} for fixed $\eps > 0$ and $\noise \in [0, 1]$ if there exists
  a partitioning of it into subsets $\{ C_1, ..., C_K \},$ which we call
  \emph{clusters}, each with their associated unit-norm cluster center $c_i$,
  that satisfy the following conditions:

%   We associate a point $c_i$ with each subset $C_i$, and call these points the
  %   cluster centers.
%   The partitioning $\CC$ is an $\eps$-clustering of $\DD$ for a constant $\eps >
%   0$ if there exist points $\{c_i\}_{i=1}^K$, which we call the cluster centers,
%   associated with each subset $C_i$, that satisfy the following requirements:

  \begin{itemize}[leftmargin=*]

    \item $\bigcup_{i=1}^K C_i = \DD$ and $C_i \cap C_j = \emptyset$ for all
      $i, j \in [K]$ with $i \neq j$;

    \item all the points in a cluster lie in the $\eps$-neighborhood of their
      corresponding cluster center, i.e.\ $||x - c_i||_2 \le \eps$ for all $x
      \in C_i$ and all $i \in [K]$;

    \item a fraction of at least $1-\noise$ of the points in each cluster $C_i$
      have the same label, which we call the \emph{cluster label} and denote
      $y^*(c_i)$. The remaining points suffer from label noise;

    \item if two clusters $C_i$ and $C_j$ have different labels, then their
      centers are at least $2\eps$ apart, i.e.\ $\|c_i - c_j\|_2 \ge
      2\eps$;

    \item the clusters are balanced i.e.\ for all $i \in [K], \alpha_1
      \frac{n}{K} \le |C_i| \le \alpha_2 \frac{n}{K}$, where $\alpha_1$ and
      $\alpha_2$ are two positive constants.

  \end{itemize}

\end{definition}

In our case, for a fixed label $\targetlabel\in\YY$, we assume that the set
$\sourceset \cup \labeledtarget$ is $(\eps, \noise)$-clusterable into $K$
clusters. We further assume that each cluster $C_i$ only includes a few noisy
samples from $\wronglylabeledtargetid$, i.e.\ $\frac{|C_i
\cap\wronglylabeledtargetid|}{|C_i|} \le \noise$ and that for clusters $C_i$ whose
cluster label is not $\targetlabel$, i.e.\ $y^*(c_i) \neq \targetlabel$, it
holds that $C_i \cap \labeledtargetood = \emptyset$. 

We define the matrices $C:=[c_1, \dots, c_K]^T \in \RR^{K \times d}$ and
$\Sigma:=(CC^T)\odot \EE_g[\phi'(Cg)\phi'(Cg)^T]$, with $g \sim \gauss(0,
I_d)$ and where $\odot$ denotes the elementwise product. We use $\| \cdot \|$
and $\lambda_{min}(\cdot)$ to denote the spectral norm and the smallest
eigenvalue of a matrix, respectively.

For prediction, we consider a 2-layer neural network model with $p$ hidden
units, where $p \gtrsim \frac{K^2\|C\|^4}{\lambda_{min}(\Sigma)^4}$. We can
write this model as follows:

\begin{align}
  x \mapsto f(x; W) = v^T\phi(Wx).
\end{align}

The first layer weights $W$ are initialized with random values drawn from
$\gauss(0, 1)$, while the last layer weights $v$ have fixed values: half of them
are set to $1/p$ and the other half is $-1/p$. We consider activation functions
$\phi$ with bounded first and second order derivatives, i.e.\ $|\phi'(x)|\le
\Gamma$ and $|\phi''(x)|\le \Gamma$.
% where $v$ has half of the entries equal to $1/p$ and the other half set to
% $-1/p$. 
We use the squared loss for training, i.e.\ $\LL(W)=\frac{1}{2}\sum_{i=1}^n (y_i
- f(x_i; W))^2$ and take gradient descent steps to find the optimum of the loss
function, i.e.\ $W_{\tau+1} = W_\tau - \eta \nabla \LL(W_\tau)$, where the step
size is set to $\eta \simeq \frac{K}{n\|C\|^2}$.

In the informal Proposition~\ref{proposition_informal} we use the notation
$\separability := \lambda_{min}(\Sigma)$. Intuitively, $\Sigma$ can
be seen as a kernel matrix associated with the two-layer neural network, as
discussed in \citet{mahdi}. In particular, $\Sigma$ depends on the choice of the
nonlinear activation function of the neural network. For instance, for ReLU
activations, $\lambda_{min}(\Sigma)$ can be lower bounded like
$\lambda_{min}(\Sigma) \gtrsim \frac{\mu}{K^2}$, where $\mu$ is the minimum
distance between two clusters.

We can now state the following proposition:

\begin{proposition}
  \label{proposition_appendix}

  Assume that $\noise \le \delta / 8$ and $\eps \le \alpha \delta
  \lambda_{min}(\Sigma)^2 / K^2$, where $\delta$ is a constant such that $\delta
  \le \frac{2}{|\YY| - 1}$ and $\alpha$ is a constant that depends on $\Gamma$.
  Then it holds with probability at least $1 - 3 / K^{100} - Ke^{-100d}$ over the
  initialization of the weights that the neural network trained on $\sourceset
  \cup \labeledtarget$ perfectly fits $\sourceset$, $\correctlylabeledtargetid$
  and $\labeledtargetood$, but not $\wronglylabeledtargetid$, after
  $T=c_4\frac{\|C\|^2}{\lambda_{min}(\Sigma)}$ iterations.
%   predicts the cluster label on all inputs in an $\eps$-neighborhood of the
  %   cluster center for all clusters, for an appropriately chosen stopping
  %   time.

\end{proposition}

This result shows that there exists an optimal stopping time at which the neural
network predicts the correct label on all ID points and the label $\targetlabel$
on all the OOD points. As we will see later in the proof, the proposition is
derived from a more general result which shows that the early stopped model
predicts these labels not only on the points in $\targetset$ but also in an
$\eps$-neighborhood around cluster centers. Hence, an $\method$ ensemble can be
used to detect holdout OOD samples similar to the ones in $\targetset$, after
being tuned on $\targetset$. This follows the intuition that classifiers
regularized with early stopping are smooth and generalize well.

The clusterable data model is generic enough to include data sets with
non-linear decision boundaries. Moreover, notice that the condition in
Proposition~\ref{proposition_appendix} is satisfied when $\sourceset \cup
\labeledtargetid$ is $(\eps, \noise)$-clusterable and $\labeledtargetood$ is
$\eps$-clusterable and if the cluster centers of $\labeledtargetood$ are at
distance at least $2\eps$ from the cluster centers of $\sourceset \cup
\labeledtargetid$. A situation in which these requirements are met is, for
instance, when the OOD data comes from novel classes and all classes
(including the unseen ones that are not in the training set) are well separated,
with cluster centers at least $2\eps$ away in Euclidean distance. In addition,
in order to limit the amount of label noise in each cluster, it is necessary
that the number of incorrectly labeled samples in $\wronglylabeledtargetid$ is
small, relative to the size of $\sourceset$.

In practice, we only need that the decision boundary separating
$\labeledtargetood$ from $\sourceset$ is easier to learn than the classifier
required to interpolate the incorrectly labeled $\wronglylabeledtargetid$, which
is often the case, provided that $\labeledtargetood$ is large enough and the OOD
samples come from novel classes.

We now provide the proof for Proposition~\ref{proposition_appendix}:

\begin{proof}

  We begin by restating a result from \citet{mahdi}:

\begin{theorem}[\citep{mahdi}]
  \label{thm:mahdi}

  Let $\DD:=\{(x_i, y_i)\}_{i=1}^n \subset \RR^d \times \YY$ be an $(\eps, \noise)$-clusterable
  training set, with $\eps \le c_1 \delta \lambda_{min}(\Sigma)^2 / K^2$ and
  $\noise \le \delta / 8$, where $\delta$ is a constant that satisfies $\delta \le
  \frac{2}{|\YY|-1}$. Consider a two-layer neural network as described above,
  and train it with gradient descent starting from initial weights sampled
  i.i.d.\ from $\gauss(0, 1)$. Assume further that the step size is $\eta =
  c_2\frac{K}{n\|C\|^2}$ and that the number of hidden units $p$ is at least
  $c_3 \frac{K^2\|C\|^4}{\lambda_{min}(\Sigma)^4}$. Under these conditions, it
  holds with probability at least $1 - 3 / K^{100} - Ke^{-100d}$ over the random
  draws of the initial weights, that after
  $T=c_4\frac{\|C\|^2}{\lambda_{min}(\Sigma)}$ gradient descent steps, the
  neural network $x\mapsto f(x; W_T)$ predicts the correct cluster label for all
  points in the $\eps$-neighborhood of the cluster center, namely:

  \begin{align}
    \arg\min_{y \in \YY} | f(x; W_T) - \onehot(y) |= y^*(c_i), \text{ for all } x \text{ with }
    \| x - c_i \|_2 \le \eps \text{ and all clusters } i \in [K],
  \end{align}

  where $\onehot: \YY \to \{0, 1\}^{|\YY|}$ yields one-hot embeddings of the
  labels. The constants $c_1, c_2, c_3, c_4$ depend only on $\Gamma$.

\end{theorem}

Notice that, under the assumptions introduced above, the set $\sourceset \cup
\labeledtarget$ is $(\eps, \noise)$-clusterable, since the incorrectly labeled ID
points in $\wronglylabeledtargetid$ constitute at most a fraction $\noise$ of the
clusters they belong to. As a consequence,
Proposition~\ref{proposition_appendix} follows directly from
Theorem~\ref{thm:mahdi}.

\end{proof}

% The following proposition follows as a straight-forward application of
% Theorem~\ref{thm:mahdi}. We use the notation introduced in
% Section~\ref{sec:reto}.
% and further assume that the OOD data comes from novel classes and that
% $\sourceset \cup \targetoodset$ is $(\eps, 0)$-clusterable with no label noise
% and such that clusters with different labels (including the OOD classes not
% present in the training set) are at least $2\eps$ away in Euclidean distance.

% =============================================================================


% #############################################################################
\vspace{-0.5cm}
\section{Disagreement score for novelty detection}
\label{sec:appendix_statistic}

\begin{figure}[t]
  \centering
  \includegraphics[width=0.8\textwidth]{figures/disagreement.png}

  \caption{\small{Cartoon illustration showing a diverse ensemble of linear
      binary classifiers. We compare novelty detection performance for two
      aggregation scores: $\entavg$ (\textbf{Left}) and $\Tdis$ with
      $\rho(f_1(x), f_2(x))=\mathbb{1}_{\sgn(f_1(x))\neq\sgn(f_2(x))}$
      (\textbf{Right}). The two metrics achieve similar TPRs, but using
      $\entavg$ instead of our score, $\Tdis$, leads to more false positives,
      since the former simply flags as OOD a band around the averaged model
      (solid black line) and does not take advantage of the ensemble's
  diversity.}}

  \label{fig:ensemble_disagreement}
\end{figure}


As we argue in Section~\ref{sec:earlystopping},
Algorithm~\ref{algo:reto_training} produces an ensemble that disagrees on OOD
data, and hence, we want to devise a scalar score that reflects this model
diversity.
%% The challenge now is to devise a test statistic that exploits the diversity of
%% an ensemble to detect OOD samples.
Previous works \citep{balaji, ood_ovadia} first average the softmax predictions
of the models in the ensemble and then use the entropy as a metric, i.e.\
$\entavg(f_1(x), ..., f_K(x)):=-\sum_{i=1}^{|\YY|} (f(x))_i \log (f(x))_i$ where
$f(x) := \frac{1}{K} \sum_{i=1}^K f_i(x)$ and $(f(x))_i$ is the $i^{\text{th}}$
element of $f(x) \in [0,1]^{|\YY|}$. We argue later that averaging discards
information about the diversity of the models.

%, which looks as follows for an ensemble of $K$ classifiers: total variation
%distance between the softmax outputs $f_i(x), f_j(x) \in [0, 1]^{|\YY|}$ of
%models $i,j \in \{1, ..., K\}$ in the ensemble:
Recall that our average pairwise \emph{disagreement} between the outputs of $K$
models in an ensemble reads:\footnote{We abuse notation slightly and denote our
  disagreement metric as $\Tdis$ to contrast it with the ensemble entropy metric
  $\entavg$, which first takes the average of the softmax outputs and only
afterwards computes the score.}

\begin{align}
  \label{eq:statistic}
%   T_{\text{avg-TV}}(x) := \frac{1}{K(K-1)} \sum_{i\neq j} \TV \left(f_i(x),
%   f_j(x)\right),
  \Tdis(f_1(x), ..., f_K(x)) := \frac{2}{K(K-1)} \sum_{i < j} \dis \left(f_i(x),
  f_j(x)\right),
\end{align}

\noindent where $\dis$ is a measure of disagreement between the softmax outputs
of two predictors, for example the total variation distance
$\dis_{\text{TV}}(f_i(x), f_j(x))=\frac{1}{2} \|f_i(x) - f_j(x) \|_1$ used in
our experiments.
% \noindent where $d_{TV}$ is the total variation distance.  The null hypothesis
% is rejected for high values of $T_{\text{avg-TV}}$.  as shown in
% Algorithm~\ref{algo:reto_detection}.


We briefly highlight the reason why averaging softmax outputs \emph{first} like
in previous works relinquishes all the benefits of having a more diverse
ensemble, as opposed to the proposed pairwise score in
Equation~\ref{eq:statistic}. Recall that varying thresholds yield different true
negative and true positive rates (TNR and TPR, respectively) for a given
statistic.
% One way to evaluate a metric for a hypothesis test is to report the TNR (the
% larger the better) at a threshold that corresponds to a fixed desired TPR.
% One way to evaluate a metric for a hypothesis test is to report the TPR at a
% threshold that corresponds to a fixed desired FPR~\ref{fig:tpr} or vice versa
% ~\ref{fig:tnr}. The larger the former the better and the smaller the latter
% the better. \fy{rephrase}.
In the sketch in Figure~\ref{fig:ensemble_disagreement} we show that the score
we propose, $\Tdis$, achieves a higher TNR compared to $\entavg$, for a fixed
TPR, which is a common way of evaluating statistical tests. Notice that the
detection region for $\entavg$ is always limited to a band around the average
model for any threshold value $t_0$. In order for $\entavg$ to have a large
TPR, this band needs to be wide, leading to many false positives. Instead, our
disagreement score exploits the diversity of the models to more accurately
detect OOD data. 

\begin{figure}[t]
  \begin{center}
    \includegraphics[width=0.8\textwidth]{figures/vanilla_ensemble_diversity.png}
  \end{center}

  \caption{Relying only on the randomness of SGD and of the weight
    initialization to diversify models is not enough, as it often yields similar
    classifiers. Each column shows a different predictor trained from random
    initializations with Adam. All models have the same 1-hidden layer MLP
  architecture.}

  \label{fig:vanilla_diversity}
\end{figure}

We now provide further quantitative evidence to support the intuition presented
in Figure~\ref{fig:ensemble_disagreement}.  The aggregation metric is tailored
to exploit ensemble diversity, which makes it particularly beneficial for
$\method$.  On the other hand, Vanilla Ensembles only rely on the stochasticity
of the training process and the random initializations of the weights to produce
diverse models, which often leads to classifiers that are strikingly similar as
we show in Figure~\ref{fig:vanilla_diversity} for a few 2D data sets. As a
consequence, using our disagreement score $\Tdis$ for Vanilla Ensembles can
sometimes hurt novelty detection performance. To see this, consider the extreme
situation in which the models in the ensemble are identical, i.e.\ $f_1 = f_2$.
Then it follows that $\Tdis(f_1(x), f_2(x)) = 0$, for all test points $x$ and
for any function $\rho$ that satisfies the distance axioms.  

We note that the disagreement score that we propose takes a form that is similar
to previous diversity scores, e.g.\ \citet{Zhang2010, mcd_ood}. In the context
of regression, one can measure uncertainty using the variance of the outputs, a
metric previously employed in works such as \citet{gal2016}. However, we point
out that using the output variance requires that the ensemble is the result of
sampling from a random process (e.g.\ sampling different training data for the
models, or sampling different parameters from a posterior). In our framework, we
obtain the ensemble by solving a different optimization problem for each of the
models by assigning a different label to the unlabeled data.  Therefore, despite
their similarities, our disagreement score and the output variance are, on a
conceptual level, fundamentally different metrics.

Table~\ref{table:score_comparison} shows that $\Tdis$ leads to worse novelty 
detection performance for Vanilla Ensembles, compared to using the entropy of
the average softmax score, $\entavg$, which was proposed in prior work.
However, if the ensembles are indeed diverse, as we argue is the case for our
method $\method$ (see Section~\ref{sec:earlystopping}), then there is a clear
advantage to using a score that, unlike $\entavg$, takes diversity into account,
as shown in Table~\ref{table:score_comparison} for 5-model $\method$ ensembles. 


\begin{table}[h]
\scriptsize

\caption{\small{The disagreement score that we propose $\Tdis$ exploits ensemble
    diversity and benefits in particular $\method$ ensembles. Novelty detection
    performance is significantly improved when using $\Tdis$ compared to the
    previously proposed $\entavg$ metric. Since Vanilla Ensembles are not diverse
    enough, a score that relies on model diversity can hurt novelty detection
    performance. We highlight the AUROC and the TNR@95 obtained with the score
    function that is $\bestnonreto{best for Vanilla Ensemble}$ and the
$\bestreto{best for \method}$.}}

\label{table:score_comparison}
\begin{center}

\input{tables/score_comparison.tex}

\end{center}
\end{table}

We highlight once again that other methods that attempt to obtain diverse
ensembles, such as MCD, fail to train models with sufficient disagreement, even
when they use oracle OOD for hyperparameter tuning
(Figure~\ref{fig:scores_mcd_es}).

\begin{figure*}[h]
  \centering
  \begin{subfigure}[t]{0.3\textwidth}
    \centering
    \includegraphics[width=\textwidth]{figures/score_distrib_mcd_es.png}
    \caption{Not enough diversity (MCD)}
    \label{fig:scores_mcd_es}
  \end{subfigure}
%   \begin{subfigure}[t]{0.3\textwidth}
%     \centering
%     \includegraphics[width=\textwidth]{figures/score_distrib_mcd_end.png}
%     \caption{Too much diversity\\ (MCD end of training)}
%     \label{fig:scores_mcd_end}
%   \end{subfigure}
  \begin{subfigure}[t]{0.3\textwidth}
    \centering
    \includegraphics[width=\textwidth]{figures/score_distrib_erd.png}
    \caption{Regularized diversity (ERD)}
    \label{fig:scores_erd}
  \end{subfigure}

% \vspace{-0.1cm}
\caption{Distribution of disagreement scores on ID and OOD data
  for an ensemble that is not diverse enough (\textbf{Left}), and an ensemble
  with regularized disagreement (\textbf{Right}). Note that MCD is early-stopped
  using oracle OOD data. ID=CIFAR10[0:4], OOD=CIFAR10[5:9].}
% \vspace{-0.4cm}

\end{figure*}


% We distinguish between ID and OOD data using an approach reminiscent of a
% two-sample statistical test. Consider two finite-sample sets, namely the
% training set $\{ x_i : x_i \sim \iddist, 1 \le i \le n \}$ and the set that only
% contains one test point $\{ x : x \sim \ooddist \}$. We are interested in
% identifying anomalous covariates, so we discard the labels when performing the
% statistical test. The null hypothesis is $H_0: \idsupp = \oodsupp$ and, if it is
% rejected, it indicates that the test sample $x$ is OOD. The various OOD
% detection methods differ in their choice of the test statistic.  For approaches
% that use ensembles of classifiers, the test statistic should reflect the belief
% that the models have similar outputs on ID samples, and disagree on OOD samples.
% For example, \cite{balaji} proposes averaging the softmax outputs of
% all the models in the ensemble and then taking the maximum or the entropy of the
% resulting probability vector as the test statistic. For a $K$-model ensemble and
% an input $x$ this can be written as follows:
% 
% \begin{align*}
%   T_{\text{max-p}}(x) := \max_{i \in \YY} \frac{1}{K} \sum_{k=1}^K \left( f_k(x)\right)_i,
%   \text{ with } f_k(x) \in \RR^{|\YY|} \text{ the softmax output of the k\textsuperscript{th}
%   model.}
% \end{align*}
% 
% \begin{figure}[h]
%   \begin{center}
%     \includegraphics[width=0.5\textwidth]{figures/probability_simplex.png}
%   \end{center}
% 
%   \caption{Averaging the probability outputs of the models in an ensemble can
%   lead to catastrophic information loss, in some cases. The softmax vectors of a
%   3-model ensemble are represented on the 2D probability simplex. For an OOD
%   sample, each model predicts with high confidence the arbitrary label it has
%   seen during training. For the ID sample, the models predict the correct class
%   with moderate confidence. Therefore, the average probability vectors for the
%   ID and the OOD sample are close, which can make it hard to distinguish between
%   them.}
%   \label{fig:probability_simplex}
% \end{figure}
% 
% Averaging the softmax vectors loses some information about the model
% predictions, because different initial probability vectors can map to the same
% averaged vector. In our approach, models are more uncertain on ID samples than
% on OOD samples, which can make the averaged softmax vector fall at the same
% location for an ID and an OOD sample. This makes it impossible to distinguish
% between the two, as illustrated in Figure~\ref{fig:probability_simplex}.
% 
% For neural network ensembles, following a standard training procedure of
% minimizing the cross-entropy loss leads to models that make confident
% predictions on both ID and OOD samples, as shown by \cite{hein, fourier}.
% Consequently, the information lost through averaging is not causing any issues:
% on ID samples, the models will tend to give the same prediction, while on OOD
% samples the models tend to disagree, giving different predictions with high
% confidence in both cases.
% 
% However, in our situation, because of early stopping, the training process is halted
% at different stages for test ID and test OOD samples, as indicated in
% Figure~\ref{fig:training_curves}.
% 
% Recent papers like \cite{tishby, avh} analyze the dynamics of optimizing the
% cross-entropy loss with SGD. They suggest that there might exist two stages: one
% in which a good decision boundary is found, and another in which the margin is
% increased between the representations of inputs from different classes. It is
% this second stage that also leads to overconfident predictions on both ID and
% OOD samples. Thus, early stopping causes the models to be more uncertain on
% test ID samples than on test OOD. This is indeed shown in
% Figure~\ref{fig:confidence}.
% 
% To avoid the problem of information loss described previously, we compute the
% pairwise total variation distances between the softmax outputs of the models in
% the ensemble, and we take the average of these distances as our test statistic:
% 
% \begin{align*}
%   T_{\text{avg-TV}}(x) := \frac{2}{K(K-1)} \sum_{\substack{i,j=1\\i < j}}^K
%   \TV \left(f_i(x), f_j(x)\right)
% \end{align*}
% 
% 
% \begin{figure}[h]
%   \begin{center}
%     \includegraphics[width=\textwidth]{figures/confidence}
%   \end{center}
% 
%   \caption{Distribution of confidence for a model trained on the ID training set
%     alone (\textbf{Top}) and a model trained on both the training set and the
%     arbitrarily labeled unlabeled set, with early stopping (\textbf{Middle}) and
%     after 100 epochs (\textbf{Bottom}). The models in the vanilla ensemble are
%     confident on both ID and OOD samples. The early-stopped model trained on
%     $\sourceset \cup \labeledtarget$ is confident only on the OOD data, even
%     though at convergence it has a high confidence on the test ID samples as
%   well.}
% 
%   \label{fig:confidence}
% \end{figure}

% =============================================================================


% #############################################################################
\section{Taxonomy of OOD detection methods according to overall objective}
\label{sec:appendix_related_work}

We now provide more details regarding the categorization of OOD detection
approaches based on the different surrogate objectives that they
use in order to detect OOD samples.

\paragraph{Learning the ID marginal $\iddist$.} We loosely define OOD samples as
all $x$ for which $\iddist(x) <\alpha$, for a small constant $\alpha > 0$.
Therefore, if we had access to the marginal training distribution $\iddist$,
we would have perfect OOD detection. Realistically, however, $\iddist$ is
unknown, and we need to resort to estimating it. Explicit density estimation
with generative models \citep{ganomaly2018, nalisnick} is inherently
difficult in high dimensions. 
Alternatively, one-class classification \citep{mari2010, Ruff2020, sohn2021} and
PU learning approaches \citep{duPlessis14, Kiryo17} try to directly learn a
discriminator between ID and OOD data in the presence of known (e.g.\ A-UND) or
unknown (e.g.\ SSND) OOD data. However, these methods tend to produce
indistinguishable representations for inliers and outliers when the ID
distribution consists of many diverse classes.

\paragraph{Learning $\iddist$ using label information (ours).} Since in a
prediction problem, the ID training set has class labels, one can take advantage
of that additional information to distinguish points in the support of $\iddist$
from OOD data. For instance, \citet{mahalanobis, gram_ood} propose to use the
intermediate representations of neural networks trained for prediction to detect
OOD data. Often, the task is to also simultaneously predict well on ID data, a
problem known as open-set recognition \citep{Geng2021} and
tackled by approaches like OpenHybrid \citep{openhybrid2020}.

\paragraph{Learning uncertainty estimates for $P_{Y|X}$.} In the prediction
setting, calibrated uncertainty estimates could naturally be used to
detect OOD samples. Many uncertainty quantification methods are based on a
Bayesian framework \citep{gal2016, dpn} or calibration
improvement \citep{odin, Hafner2019}. However, neither of them performs as well
as other OOD methods mentioned above \citep{ood_ovadia}.

% =============================================================================


% #############################################################################
\section{Experiment details}
\label{sec:appendix_experiments}

\subsection{Baselines}

In this section we describe in detail the baselines with which we compare our
method and describe how we choose their hyperparameters. For all baselines we
use the hyperparameters suggested by the authors for the respective data sets
(e.g.\ different hyperparameters for CIFAR10 or ImageNet). For all methods, we
use pretrained models provided by the authors. However, we note that for the
novel-class settings, pretraining on the entire training set means that the
model is exposed to the OOD classes as well, which is undesirable. Therefore,
for these settings we pretrain only on the split of the training set that
contains the ID classes. Since the classification problem is similar to the
original one of training on the entire training set, we use the same
hyperparameters that the authors report in the original papers.

Moreover, we point out that even though different methods use different model
architectures, that is not inherently unreasonable when the goal is novelty
detection, since it is not clear if a complex model is more desirable than a
smaller model. For this reason, we use the model architecture recommended by the
authors of the baselines and which was used to produce the good results reported
in their published works. For Vanilla Ensembles and for $\method$ we show
results for different architectures in
Appendix~\ref{sec:appendix_different_arch}.

\begin{itemize}

%   \item \textbf{k-Nearest Neighbors}: We take $k = 8$. For each test sample, we
%     take the average distance to the nearest neighbors in the input (pixel) space, and we
%   use this as the test statistic.

  \item \textbf{Vanilla Ensembles} \citep{balaji}: We train an ensemble on the
    training set according to the true labels. For a test sample, we average the
    outputs of the softmax probabilities predicted by the models, and use the
    entropy of the resulting distribution as the score for the hypothesis test
    described in Section~\ref{sec:disagreement}. We use ensembles of 5 models,
    with the same architecture and hyperparameters as the ones used for
    $\method$. Hyperparameters are tuned to achieve good validation accuracy.

  \item \textbf{Gram method} \citep{gram_ood}: The Gram baseline is similar to
    the Mahalanobis method in that both use the intermediate feature
    representations obtained with a deep neural network to determine whether a
    test point is an outlier. However, what sets the Gram method apart is the
    fact that it does not need any OOD data for training or calibration. We use
    the pretrained models provided by the authors, or train our own, using the
    same methodology as described for the Mahalanobis baseline. For OOD
    detection, we use the code published by the authors. We note that for MLP
    models, the Gram method is difficult to tune and we could not find a
    configuration that works well, despite our best efforts and following the
    suggestions proposed during our communication with the authors.
%  The Gram method requires laborious hyperparameter tuning for
% multi-layer perceptron (MLP) models, so we do not consider it for the MNIST and
% FMNIST data sets, since the code provided by the authors does
% not include the configurations required for MLP models.

  \item \textbf{Deep Prior Networks (DPN)} \citep{dpn}: DPN is a Bayesian method
    that trains a neural network (Prior Network) to parametrize a Dirichlet
    distribution over the class probabilities.  We train a WideResNet WRN-28-10
    for $100$ epochs using SGD with momentum $0.9$, with an initial learning
    rate of $0.01$, which is decayed by $0.2$ at epochs $50$, $70$, and $90$.
    For MNIST, we use EMNIST/Letters as OOD for tuning. For all other settings,
    we use TinyImages as OOD for tuning.

  \item \textbf{Outlier Exposure} \citep{outlier_exposure}: This approach makes
    a model's softmax predictions close to the uniform distribution on the known
    outliers, while maintaining a good classification performance on the
    training distribution. We use the WideResNet architecture (WRN). For
    fine-tuning, we use the settings recommended by the authors, namely we train
    for $10$ epochs with learning rate $0.001$. For training from scratch, we
    train for $100$ epochs with an initial learning rate of $0.1$.  When the
    training data set is either CIFAR10/CIFAR100 or ImageNet, we use the default
    WRN parameters of the authors' code, namely $40$ layers, $2$ widen-factor,
    droprate $0.3$.  When the training dataset is SVHN, we use the authors'
    recommended parameters of $16$ layers, $4$ widen-factor and droprate $0.4$.
    All settings use the cosine annealing learning rate scheduler provided with
    the authors' code, without any modifications. For all settings, we use
    TinyImages as known OOD data during training. In
    Section~\ref{sec:appendix_more_oe} we show results for known OOD data that
    is similar to the OOD data used for testing.

  \item \textbf{Mahalanobis} \citep{mahalanobis}: The method pretrains models on
    the labeled training data. For a test data point, it uses the intermediate
    representations of each layer as ``extracted features''. It then performs
    binary classification using logistic regression using these extracted
    features. In the original setting, the classification is done on
    ``training'' ID vs ``training'' OOD samples (which are from the same
    distribution as the test OOD samples).  Furthermore, hyperparameter tuning
    for the optimal amount of noise is performed on validation ID and OOD data.
    We use the WRN-28-10 architecture, pretrained for $200$ epochs.  The initial
    learning rate is $0.1$, which is decayed at epochs $60$, $120$, and $160$ by
    $0.2$. We use SGD with momentum $0.9$, and the standard weight decay of $5
    \cdot 10^{-4}$. The code published for the Mahalanobis method performs a
    hyperparameter search automatically for each of the data sets. 

\end{itemize}

The following baselines attempt to leverage the unlabeled data that is available
in applications such as the one depicted in Figure~\ref{fig:practical_sketch},
similar to $\method$. 

\begin{itemize}

  \item \textbf{Non-negative PU learning (nnPU)} \citep{Kiryo17}: The method
    trains a binary predictor to distinguish between a set of known positives
    (in our case the ID data) and a set that contains a mixture of positives and
    negatives (in our case the unlabeled set). To prevent the interpolation of
    all the unlabeled samples, \citet{Kiryo17} propose a regularized objective.
    It is important to note that most training objectives in the PU learning
    literature require that the ratio between the positives and negatives in the
    unlabeled set is known or easy to estimate. For our experiments we always
    use the exact OOD ratio to train the nnPU baseline. Therefore, we obtain an
    upper bound on the AUROC/TNR@95. If the ratio is estimated from finite
    samples, then estimation errors may lead to slightly worse OOD detection
    performance. We perform a grid search over the learning rate and the
    threshold that appears in the nnPU regularizer and pick the option with the
    best validation accuracy measured on a holdout set with only positive
    samples (in our case, ID data). 
%     Notably, PU learning is more similar to semi-supervised classification,
    %     where only few labeled samples are available. In contrast, for
    %     semi-supervised outlier detection it is usually assumed that the ID
    %     labeled data is not scarce.

  \item \textbf{Maximum Classifier Discrepancy (MCD)} \citep{mcd_ood}: The MCD
    method trains two classifiers at the same time, and makes them disagree on
    the unlabeled data, while maintaining good classification performance.  We
    use the WRN-28-10 architecture as suggested in the paper.  We did not change
    the default parameters which came with the authors' code, so weight decay is
    $10^{-4}$, and the optimizer is SGD with momentum $0.9$.  When available
    (for CIFAR10 and CIFAR100), we use the pretrained models provided by the
    authors. For the other training datasets, we use their methodology to
    generate pretrained models: We train a WRN-28-10 for 200 epochs.  The
    learning rate starts at 0.1 and drops by a factor of 10 at $50\%$ and $75\%$
    of the training progress.

  \item \textbf{Mahalanobis-U}: This is a slightly different version of the
    Mahalanobis baseline, for which we use early-stopped logistic regression to
    distinguish between the training set and an unlabeled set with ID and OOD
    samples (instead of discriminating a known OOD set from the inliers). The
    early stopping iteration is chosen to minimize the classification errors on
    a validation set that contains only ID data (recall that we do not assume to
    know which are the OOD samples).

\end{itemize}

In addition to these approaches that have been introduced in prior work, we also
propose a strong novel baseline
that bears some similarity to PU learning and to $\method$. 

\begin{itemize}

 \item \textbf{Binary classifier}: The approach consists in discriminating
   between the labeled ID training set and the mixed unlabeled set, which
   contains both ID and OOD data. We use regularization to prevent the trivial
   solution for which the entire unlabeled set is predicted as OOD. Unlike PU
   learning, the binary classifier does not require that the OOD ratio in the
   test distribution is known. The approach is similar to a method described in
   \citet{scott08} which also requires that the OOD ratio of the unlabeled set is
   known.  We tune the learning rate and the weight of the unlabeled samples in
   the training loss by performing a grid search and selecting the configuration
   with the best validation accuracy, computed on a holdout set containing only
   ID samples.  We note that the binary classifier that appears in
   Section~\ref{sec:appendix_medical} in the medical benchmark, is not the same
   as this baseline. For more details on the binary classifier that appears in
   the medical data experiments we refer the reader to \citet{Cao2020}.

\end{itemize}

\vspace{-0.2cm}
\subsection{Training configuration for $\method$}

% \vspace{-0.2cm}
For $\method$ we always use hyperparameters that give the best validation
accuracy when training a model on the ID training set. In other words, we pick
hyperparameter values that lead to good ID generalization and do not perform
further hyperparameter tuning for the different OOD data sets on which we
evaluate our approach. We point out that, if the ID labeled set is known to
suffer from class imbalance, subpopulation imbalance or label noise, any
training method that addresses these issues can be used instead of standard
empirical risk minimization to train our ensemble (e.g.\ see 
\citet{mahdi}).

For MNIST and FashionMNIST, we train ensembles of 3-layer MLP models with ReLU
activations. Each intermediate layer has 100 neurons. The models are optimized
using Adam, with a learning rate of $0.001$, for $10$ epochs.

For SVHN, CIFAR10/CIFAR100 and ImageNet, we train ensembles of ResNet20
\citep{He2015}. The models are initialized with weights pretrained for $100$
epochs on the labeled training set. We fine-tune each model for 10 epochs using
SGD with momentum $0.9$, and a learning rate of $0.001$.  The weights are
trained with an $\ell_2$ regularization coefficient of $5 \cdot 10^{-4}$.  We use a batch
size of 128 for all scenarios, unless explicitly stated otherwise. We used the
same hyperparameters for all settings. 

For pretraining, we perform SGD for 100 epochs and use the same architecture and
hyperparameters as described above, with the exception of the learning rate that
starts at $0.1$, and is multiplied by $0.2$ at epochs $50$, $70$ and $90$.

Apart from $\method$, which fine-tunes the ensemble models starting from
pretrained weights, we also present in the Appendix results for $\method$++.
This variant of our method trains the models from random initializations, and
hence needs more iterations to converge, making it more computationally
expensive than $\method$. We train all models in the $\method$++ ensembles for
$100$ epochs with a learning rate that starts at $0.1$, and is multiplied by
$0.2$ at epochs $50$, $70$ and $90$. All other hyperparameters are the same as
for $\method$ ensembles.

For the medical data sets, we train a Densenet-121 as the authors do in the
original paper \citep{Cao2020}. For $\method$++, we do not use random weight
initializations, but instead we start with the ImageNet weights provided with
Tensorflow. The training configuration is exactly the same as for ResNet20,
except that we use a batch size of 32 due to GPU memory restrictions, and for
fine tuning we use a constant learning rate of $10^{-5}$.

\vspace{-0.2cm}
\subsection{Computational considerations for $\method$}

We note that $\method$ models reach the optimal stopping time within the first
10 epochs on all the data sets that we consider, which amounts to around $6$
minutes of training time 
%% as few as three
%% epochs of fine-tuning are enough on average to achieve the performance
%% that we report.  This amounts to around $2$ minutes
if the models in the ensemble are fine-tuned in parallel on NVIDIA 1080 Ti GPUs.
This is substantially better than the cost of fine-tuning a large ViT
transformer model (which takes about 1 hour for 2500 iterations on the same
hardware). Moreover, since the loss we use to train the ensemble decouples over
the models, it allows for easy parallelization, unlike objectives like MCD where
the ensemble models are intertwined.

% =============================================================================


% #############################################################################
\vspace{-0.2cm}
\section{ID and OOD data sets}
\label{sec:appendix_datasets}
\vspace{-0.2cm}


% \subsection{Data sets}
For evaluation, we use the following image data sets: MNIST \citep{mnist},
Fashion MNIST \citep{fashion}, SVHN \citep{svhn}, CIFAR10 and CIFAR100
\citep{cifar}.

\begin{figure}[H]
  \centering
  \begin{subfigure}[b]{0.4\textwidth}
     \centering

    \includegraphics[width=\textwidth]{figures/dataset_samples/mnist01234}
    \includegraphics[width=\textwidth]{figures/dataset_samples/mnist56789}
    \includegraphics[width=\textwidth]{figures/dataset_samples/fashion_mnist02378}
    \includegraphics[width=\textwidth]{figures/dataset_samples/fashion_mnist14569}

     \caption{}
  \end{subfigure}
  \hfill
  \begin{subfigure}[b]{0.4\textwidth}
     \centering

    \includegraphics[width=\textwidth]{figures/dataset_samples/cifar1001234}
    \includegraphics[width=\textwidth]{figures/dataset_samples/cifar1056789}
    \includegraphics[width=\textwidth]{figures/dataset_samples/svhn_cropped01234}
    \includegraphics[width=\textwidth]{figures/dataset_samples/svhn_cropped56789}

     \caption{}
  \end{subfigure}

  \caption{Samples for the settings with novel classes. (a) Data samples for the MNIST/FashionMNIST splits. (b) Data samples for the CIFAR10/SVHN splits.}
  \label{fig:data_samples_mnist_splits}

\end{figure}
\vspace{-0.2cm}


For the MNIST and FashionMNIST experiments, the training set size is 50K,
the validation size is 10K, and the test ID and test OOD sizes are both 10K.
For SVHN, CIFAR10 and CIFAR100, the training set size is 40K, the validation
size is 10K, and the unlabeled set contains 10K samples: 5K are ID and 5K are
OOD. For evaluation, we use a holdout set of 10K examples (half ID, half OOD).
When half of the classes are used as ID and the other half as OOD,
all the sizes are halved.



% =============================================================================



% #############################################################################
\newpage
\section{More experiments}
\label{sec:appendix_more_experiments}

We now present more experimental results that provide additional insights about
the proposed approach. We note that, unless otherwise specified, we use 5-model
ERD ensembles in this section.

\vspace{-0.2cm}
\subsection{Evaluation on the unlabeled set}
\label{sec:appendix_transductive_results}

In the main text we describe how one can leverage the unlabeled set $\targetset$
to obtain a novelty detection algorithm that accurately identifies outliers at test
time that are similar to the ones in $\targetset$. It is, however, possible to also
use our method $\method$ to flag the OOD samples contained in the same set
$\targetset$ used for fine-tuning the ensemble. In
Table~\ref{table:ss_vs_transductive} we show that the novelty detection performance
of $\method$ is similar regardless of whether we use $\targetset$ for
evaluation, or a holdout test set $\testset$ drawn from the same distribution as
$\targetset$.

\begin{table}[H]
\scriptsize

\caption{Comparison between the novelty detection performance of $\method$ when
using a holdout test set $\testset$ for evaluation, or the same unlabeled set
$\targetset$ that was used for fine-tuning the models.}
\vspace{-0.5cm}

\begin{center}

\input{tables/holdout}

\label{table:ss_vs_transductive}
\end{center}
\end{table}


\vspace{-0.3cm}
\subsection{Comparison with other related works}
\label{sec:appendix_cifar10_cifar100}

We compare 5-model ERD ensembles to more OOD detection approaches. For various
reasons we did not run these methods ourselves on the data sets for which we
evaluate our method in Section~\ref{sec:experiments} (e.g.\ code not available,
unable to replicate published results, poor performance reported by the authors
etc.). We collected the AUROC numbers presented in
Table~\ref{table_cifar10_cifar100} from the papers that introduce each method.
We note that our approach shows an excellent overall performance, being
consistently better than or on par with the related works that we consider.
While the method of \citet{fort2021} performs significantly better than all
other baselines on CIFAR10/CIFAR100 tasks, we argue in
Appendix~\ref{sec:appendix_vit} that this is primarily due to the convenient
choice of the data set used for pretraining the transformer models (i.e.\
Imagenet21k) which is strikingly similar to the ID and OOD data.
% almost matching the AUROC of the best performing method of \citet{fort2021}
% which uses large scale visual transformer models pretrained on a superset of
% the OOD data, i.e.\ ImageNet21k.


\begin{table*}[h]
\scriptsize
\centering

\caption{\small{AUROC numbers collected from the literature for a number of relevant
OOD detection methods. We note that the method of \citet{fort2021} ($^\dagger$) uses large-scale
visual transformer models pretrained on a superset of the OOD data, i.e.\
ImageNet21k, while the method of \citet{sehwag2021} ($^*$) uses oracle OOD
samples for training from the same data set as test OOD. For the settings with random classes, the numbers are averages over 5
draws and the standard deviation is always strictly smaller than $0.01$ for our
method.}}
\label{table_cifar10_cifar100}

\begin{tabularx}{\textwidth} {@{}ll @{} @{\hskip 0.1cm} XXXXXXXXX @{}} 
  \toprule
ID data  & OOD data & \citet{fort2021}$^\dagger$ & \citet{openhybrid2020} &
\citet{winkens2020} & \citet{Tack2020} & \citet{sehwag2021}$^*$ &\citet{liu2020hybrid} &
\citet{yujie2020} & ERD (ours) & ERD++ (ours)                   \\
\midrule

CIFAR10  & CIFAR100 & 98.52                             & 0.95          & 0.92
         & 0.92           &     0.93               & 0.91  & - & 0.92       & 0.95 \\
CIFAR100 & CIFAR10  & 96.23                             & 0.85
         & 0.78                                 & - & 0.78
         & - & -                                     & 0.91       & 0.94 \\
\midrule
\makecell[l]{SVHN: 6\\ random\\ classes} & \makecell[l]{SVHN: 4\\ random\\
classes}  & -                             & 0.94
          & -               & -                  & -
          & -                                & 0.91      & 0.94       & 0.94 \\
\makecell[l]{CIFAR10: 6\\ random\\ classes} & \makecell[l]{CIFAR10: 4\\ random\\
classes}  & -                             & 0.94
          & -                  & -               & -
          & -                                & 0.85      & 0.94       & 0.97 \\

  \bottomrule
\end{tabularx}

\end{table*}


OpenHybrid \citep{openhybrid2020} is an open set recognition approach which
reports great near OOD detection performance. We note that, despite our best
efforts, we did not manage to match in our own experiments the results reported
in the paper, even after communicating with the authors and using the code that
they have provided. Moreover, we point out that the performance of OpenHybrid
seems to deteriorate significantly when the ID data consists of numerous
classes, as is the case for CIFAR100.

Furthermore, we note that generative models \citep{nalisnick, 
ganomaly2018} and one-class classification approaches \citep{Ruff2020, Tack2020,
sohn2021} showed generally bad performance, in particular on near OOD data. When
the ID training set is made up of several diverse classes, it is difficult to
represent accurately all the ID data, and only the ID data.

\subsection{Shortcomings of pretrained ViT models for novelty detection}
\label{sec:appendix_vit}

In this section we provide further experimental results pointing to the fact
that large pretrained transformer models \citep{fort2021} can only detect near
OOD samples from certain specific data sets, and do not generalize well more
broadly.

\paragraph{Implementation details.} We fine-tune visual transformer (ViT) models
pretrained on Imagenet21k according to the methodology described in
\citet{fort2021}. We report results using the ViT-S-16 architecture ($\sim$22 million
trainable parameters) which we fine-tune for 2500 iterations on labeled ID data.
We use the hyperparameters suggested by the authors and always ensure that the
prediction accuracy of the fine-tuned model on ID data is in the expected range.
The code published by the authors uses three different test statistics to detect
OOD data: the maximum softmax probability \citep{Hendrycks2017}, the vanilla
Mahalanobis distance \citep{mahalanobis} and a recently proposed variant of the
Mahalanobis approach \citep{Ren2021}. In Table~\ref{table:vit} we present only
the metrics obtained with the best-performing test statistic for ViT. We stress
that this favors the ViT method significantly, as different test statistics seem
to perform better on different data sets. Since test OOD data is unknown, it is
not possible to select which test statistic to use a priori, and hence, we use
oracle knowledge to give ViT models an unfair advantage.

\paragraph{Experimental results.} In Table~\ref{table:vit} we compare pretrained
visual transformers with 5-model $\method$ and $\method$++ ensembles.  Notably,
the data sets can be partitioned in two clusters, based on ViT novelty detection
performance. On the one hand, if the ID or OOD data comes from CIFAR10 or
CIFAR100, ViT models can detect novel-class samples well. Perhaps surprisingly,
ViT falls short of detecting OOD data perfectly (i.e.\ AUROC and TNR@95 of 1) on
easy tasks such as CIFAR10 vs SVHN or CIFAR100 vs SVHN, unlike $\method$ and a
number of other baseline approaches.

On the other hand, ViT shows remarkably poor performance on all other data sets,
when neither the ID nor the OOD data come from CIFAR10/CIFAR100. This includes
some of the novel disease use cases from the medical OOD detection benchmark
(see Appendix~\ref{sec:appendix_medical} for more details about the data sets).
This unsatisfactory performance persists even for larger ViT models (we
have tried ViT-S-16 and ViT-B-16 architectures), when fine-tuning for more
iterations (we have tried both 2500 and 10000 iterations), or when varying
hyperparameters such as the learning rate.


\begin{table}[H]
\scriptsize

\caption{Pretrained ViT models tend to perform well when the ID and OOD data is
  semantically similar to (or even included in) the pretraining data, e.g.\
  CIFAR10, CIFAR100 (top part), and their detection performance deteriorates
  drastically otherwise (bottom part). We compare ViT-S-16 models pretrained on
Imagenet21k with 5-model $\method$ and $\method$++ ensembles and
\bestnonreto{highlight} the best method. See Appendix~\ref{sec:appendix_medical}
for more details about the medical data sets.}

\label{table:vit}

\centering

\input{tables/vit_table}

\end{table}

\paragraph{Intuition for why ViT fails.} We conjecture that the novelty
detection performance with pretrained ViT models relies heavily on the choice of
the pretraining data set. In particular, we hypothesize that, since
CIFAR10/CIFAR100 classes are included in the Imagenet21k data set used for
pretraining, the models learn features that are useful for distinguishing ID and
OOD classes when the ID and/or OOD data comes from CIFAR10/CIFAR100. Hence, this
would explain the good performance of pretrained models on the data sets at the
top of Table~\ref{table:vit}. On the other hand, when ID and OOD data is
strikingly different from the pretraining data, both ID and OOD samples are
projected to the same concentrated region of the representation space, which
makes it difficult to detect novel-class points.  Moreover, the process of
fine-tuning as it is described in \citet{fort2021} seems to not help to
alleviate this problem. This leads to the poor performance observed on the near
OOD data sets at the bottom of Table~\ref{table:vit}.

In conclusion, having a large pretraining data set seems to be beneficial when
the OOD data shares many visual and semantic features in common with the
pretraining data. However, in real-world applications it is often difficult to
collect such large data sets, which makes the applicability of pretrained ViT
models limited to only certain specific scenarios.



\vspace{-0.5cm}
\subsection{OOD detection for data with covariate shift}
\label{sec:appendix_cov_shift}

\vspace{-0.2cm}
In this section we evaluate the baselines and the method that we propose on
settings in which the OOD data suffers from covariate shift.
% \citep{Shimodaira2000}
The goal is to identify all samples that come from the shifted distribution,
regardless of how strong the shift is. Notice that mild shifts may be easier to
tackle by domain adaptation algorithms, but when the goal is OOD detection they
pose a much more difficult challenge.

We want to stress that in practice one may not be interested in identifying
\emph{all} samples with distribution shift as OOD, since a classifier may still
produce correct predictions on some of them. 
% Recall that for the novel-class scenarios presented in the main text domain
% adaptation is an ill-posed problems, due to the fact that the set of training
% labels is different from the set of labels of the OOD data.
In contrast, when data suffers from covariate shift we can try to learn
predictors that perform well on both the training and the test distribution, and
we may use a measure of predictive uncertainty to identify only those test
samples on which the classifier cannot make confident predictions. Nevertheless,
we use these covariate shift settings as a challenging OOD detection benchmark
and show in Table~\ref{table:all_resnet_results} that our method $\method$ does
indeed outperform prior baselines on these difficult settings.

We use as outliers corrupted variants of CIFAR10 and CIFAR100 \citep{cifar_c}, as
well as a scenario where ImageNet \citep{Deng2009} is used as ID data and
ObjectNet \citep{Barbu2019} as OOD, both resized to $32 \times 32$.
Figure~\ref{fig:data_samples_cifar_and_objectnet} shows samples from these data
sets. The Gram and nnPU baselines do not give satisfactory results on the
difficult CIFAR10/CIFAR100 settings in Table~\ref{table:main_results} and thus
we do not consider them for the covariate shift cases.  For the SSND methods
(e.g.\ MCD, Mahal-U and $\method$/$\method$++) we evaluate on the same unlabeled
set that is used for training (see the discussion in
Section~\ref{sec:appendix_transductive_results}).
% For ImageNet vs ObjectNet, the training set size is 281,167, the validation
% size is 1,000,000, the test ID size is 50,000, and the test OOD size is
% 50,273.

\begin{figure*}[h]
\vspace{-0.2cm}
  \centering
  \hspace*{\fill}%
  \begin{subfigure}[r]{0.49\textwidth}
  \centering
  \includegraphics[width=\textwidth]{figures/dataset_samples/imagenet_objectnet}

\end{subfigure}
\hfill
\begin{subfigure}[c]{0.5\textwidth}
  \centering
  \includegraphics[width=\textwidth]{figures/dataset_samples/cifar10c_sev2}
  \includegraphics[width=\textwidth]{figures/dataset_samples/cifar10c_sev5}
\end{subfigure}

  \caption{Left: Samples from ObjectNet. Figure from
    \citet{Barbu2019}. Right: Corrupted samples from CIFAR10-C.}

  \label{fig:data_samples_cifar_and_objectnet}
\end{figure*}


Furthermore, we present results on distinguishing between CIFAR10 \citep{cifar}
and CIFAR10v2 \citep{recht}, a data set meant to be drawn from the same
distribution as CIFAR10 (generated from the Tiny Images collection).  In \citet{recht}, the authors argue
that CIFAR10 and CIFAR10v2 come from very similar distributions. They provide
supporting evidence by training a binary classifier to distinguish between them,
and observing that the resulting accuracy of 52.9\% is very close to
random.

Our experiments show that the two data sets are actually distinguishable,
contrary to what previous work has argued. First, our own binary classifier
trained on CIFAR10 vs CIFAR10v2 obtains a test accuracy of 67\%, without any
hyperparameter tuning. The model we use is a ResNet20 trained for 200 epochs
using SGD with momentum 0.9. The learning rate is decayed by 0.2 at epochs 90,
140, 160 and 180. We use 1600 examples from each data set for training, and we
validate using 400 examples from each data set.

% \vspace{-0.2cm}
\begin{table}[H]
\scriptsize
\caption{OOD detection performance on CIFAR10 vs CIFAR10v2. We highlight the
  \bestreto{best $\method$} variant and the \bestnonreto{best baseline}.}
\vspace{-0.5cm}
\label{table:cifar10v2}
\begin{center}
\input{tables/cifar10v2}
\end{center}
\end{table}
\vspace{-0.5cm}

Our OOD detection experiments (presented in Table~\ref{table:cifar10v2}) show
that most baselines are able to distinguish between the two data sets, with
$\method$ achieving the highest performance. The methods which require OOD data
for tuning (Outlier Exposure and DPN) use CIFAR100. 


\begin{table}[H]
\scriptsize

\caption{OOD detection performance on data with covariate shift. For $\method$
and vanilla ensembles, we train 5 ResNet20 models for each setting. The
evaluation metrics are computed on the unlabeled set. We highlight the
  \bestreto{best $\method$} variant and the \bestnonreto{best baseline}.}
\vspace{-0.5cm}

\begin{center}

\input{tables/extended_results_table}

\label{table:all_resnet_results}
\end{center}
\end{table}

\vspace{-0.5cm}
\subsection{Results with a smaller unlabeled set}
\label{sec:appendix_small_test_set}

We now show that our method performs well even when the unlabeled set is
significantly smaller. In particular, we show in the table below that $\method$
maintains a high AUROC and TNR@95 even when only 1,000 unlabeled samples are
used for fine-tuning (500 ID and 500 OOD). 

\begin{table}[H]
\scriptsize

\caption{Experiments with a test set of size 1,000, with an equal number of ID
and OOD test samples. For $\method$ and vanilla ensembles, we train 5 ResNet20 models for
each setting. The evaluation metrics are computed on the unlabeled set. We highlight 
\bestreto{$\method$} and the \bestnonreto{best baseline}.}
\vspace{-0.5cm}

\begin{center}

\input{tables/results_table_test_size_1000}

\end{center}
\end{table}

\newpage

\vspace{-0.5cm}
\subsection{More results for Outlier Exposure}
\label{sec:appendix_more_oe}

The Outlier Exposure method needs access to a set of OOD samples during
training. The numbers we report in the rest of the paper for Outlier Exposure are
obtained by using the TinyImages data set as the OOD samples that are seen
during training.  In this section we explore the use of an OOD$_{\text{train}}$
data set that is more similar to the OOD data observed at test time. This is a
much easier setting for the Outlier Exposure method: the closer
OOD$_{\text{train}}$ is to OOD$_{\text{test}}$, the easier it will be for the
model tuned on OOD$_{\text{train}}$ to detect the test OOD samples.

In the table below we focus only on the settings with corruptions. For each
corruption type, we use the lower severity corruption as OOD$_{\text{train}}$
and evaluate on the higher severity data and vice versa. We report for each metric
the average taken over all corruptions (A), and the value for the worst-case
setting (W).

\vspace{0.2cm}
\begin{table}[H]
  \small

\begin{center}


\input{tables/outlier_exposure}
\end{center}


    \caption{Results for Outlier Exposure, when using the same corruption type,
    but with a higher/lower severity, as OOD data seen during training.}
\label{table:oe}

\end{table}



\vspace{-0.5cm}
\subsection{Results on MNIST and FashionMNIST}

\begin{table}[H]
\scriptsize

\caption{Results on MNIST/FashionMNIST settings. For $\method$ and vanilla
  ensembles, we train five 3-hidden layer MLP models for each setting. The evaluation metrics are computed on the unlabeled set. We highlight the
  \bestreto{best $\method$} variant and the \bestnonreto{best baseline}.}
\vspace{-0.5cm}

\begin{center}

\input{tables/mlp_results}

\end{center}
\end{table}

\vspace{-0.4cm}
For FashionMNIST we chose this particular split (i.e.\ classes 0,2,3,7,8 vs
classes 1,4,5,6,9) because the two partitions are more similar to each other.
This makes novelty detection more difficult than the 0-4 vs 5-9 split.



\vspace{-0.2cm}
\subsection{Vanilla and $\method$ Ensembles with different architectures}
\label{sec:appendix_different_arch}

In this section we present OOD detection results for 5-model Vanilla and
$\method$ ensembles with different architecture choices, and note that the
better performance of our method is maintained across model classes. Moreover,
we observe that $\method$ benefits from employing more complex models, like the
WideResNet.

\begin{table}[H]
\scriptsize

\caption{Results with three different architectures for Vanilla and $\method$
  ensembles. All ensembles comprise 5 models. For the corruption data sets, we
  report for each metric the average taken over all corruptions (A), and the
  value for the worst-case setting (W). The evaluation metrics are computed on
the unlabeled set.}
\vspace{-0.5cm}

\begin{center}

\input{tables/different_arch_results}

\end{center}
\end{table}


\vspace{-0.7cm}
\subsection{Impact of the ensemble size and of the choice of arbitrary label}
\label{sec:appendix_ensemble_size}

\vspace{-0.2cm}
In this section we show novelty detection results with our method using a smaller
number of models for the ensembles. We notice that the performance is not
affected substantially, indicating that the computation cost of our approach
could be further reduced by fine-tuning smaller ensembles.

\begin{table}[H]
\scriptsize

\caption{Results obtained with smaller ensembles for $\method$. The numbers for
  $K < 5$ are averages over 3 runs, where we use a different set of arbitrary
  labels for each run to illustrate our method's stability with respect to the
  choice of labels to be assigned to the unlabeled set. We note that the
standard deviations are small ($\sigma \le 0.01$ for the AUROC values and
$\sigma \le 0.08$ for the TNR@95 values).}
\vspace{-0.4cm}

\begin{center}

\input{tables/different_ensemble_size_results}

\end{center}
\end{table}

\vspace{-0.5cm}
\paragraph{Impact of the choice of arbitrary labels.} Furthermore, we note that
in the table we report averages over 3 runs of our method, where for each
run we use a different subset of $\YY$ to assign arbitrary labels to the
unlabeled data. We do this in order to assess the stability of $\method$
ensembles to the choice of the arbitrary labels and notice that the novelty 
detection performance metrics do not vary significantly.  Concretely, the
standard deviations are consistently below $0.01$ for all data sets for the
AUROC metric, and below $0.07$ for the TNR@95 metric.


\vspace{-0.2cm}
\subsection{Detection performance on different OOD data}
\label{sec:appendix_different_ood}

In this section we investigate whether the proposed method maintains its good
novelty detection performance when the test-time OOD data comes from a different
data set compared to the OOD data that is present in the unlabeled set used for
fine-tuning. In particular, we are interested in whether our approach can still identify
outliers in situations when they suffer from various corruptions. This scenario
can sometimes occur in practice, when machine failure or uncurated data can lead
to mild distribution shift.

Concretely, we focus on the difficult near OOD scenarios and take as ID half of
the CIFAR10 or CIFAR100 classes, while the other half is OOD. For this
experiment, we fine-tune the ERD ensembles using clean OOD data from the other
half of CIFAR10 and CIFAR100, respectively. For evaluation, we use clean ID
data and corrupted OOD samples from CIFAR10-C and CIFAR100-C, respectively.
We give more details on these corrupted data sets in
Appendix~\ref{sec:appendix_cov_shift}. We consider corruptions of severity 2 and
5 from all corruption types.

In Table~\ref{table:different_ood} we show the average AUROC and the worst AUROC
over all corruption types for vanilla and ERD ensembles. Note that our approach
maintains a similar performance compared to the numbers presented in
Table~\ref{table:main_results} for the same test-time OOD data. It is also
noteworthy that all the average AUROC values are consistently larger than the
baselines in Table~\ref{table:main_results}.


\begin{table}[H]
  \small
  \begin{center}

    \caption{Results obtained when evaluating on an OOD data set different from
    the one used for fine-tuning. All ERD ensembles are tuned on clean ID and
  OOD data and are evaluated on OOD data with corruptions.}


\hyphenpenalty10000
\begin{tabularx}{0.83\textwidth}{lll| cc}
\toprule
\makecell{ID data} & \makecell{OOD data in\\unlabeled set} & \makecell{Test-time\\OOD data} & \makecell{Vanilla\\Ensemble} & \makecell{ERD} \\
                   & & & \multicolumn{2}{c}{AUROC $\uparrow$} \\
\midrule
$\text{ CIFAR10[0:4] }$ & $\text{ CIFAR10[5:9] }$ & $\text{ CIFAR10[5:9]-C sev 2 (A) }$ & 0.82 & 0.93 \\
$\text{ CIFAR10[0:4] }$ & $\text{ CIFAR10[5:9] }$ & $\text{ CIFAR10[5:9]-C sev 2 (W) }$ & 0.77 & 0.88 \\
$\text{ CIFAR10[0:4] }$ & $\text{ CIFAR10[5:9] }$ & $\text{ CIFAR10[5:9]-C sev 5 (A) }$ & 0.85 & 0.91 \\
$\text{ CIFAR10[0:4] }$ & $\text{ CIFAR10[5:9] }$ & $\text{ CIFAR10[5:9]-C sev 5 (W) }$ & 0.79 & 0.86 \\
\midrule
$\text{ CIFAR100[0:49] }$ & $\text{ CIFAR100[50:99] }$ & $\text{ CIFAR100[50:99]-C sev 2 (A) }$ & 0.78 & 0.84 \\
$\text{ CIFAR100[0:49] }$ & $\text{ CIFAR100[50:99] }$ & $\text{ CIFAR100[50:99]-C sev 2 (W) }$ & 0.75 & 0.78 \\
$\text{ CIFAR100[0:49] }$ & $\text{ CIFAR100[50:99] }$ & $\text{ CIFAR100[50:99]-C sev 5 (A) }$ & 0.77 & 0.83 \\
$\text{ CIFAR100[0:49] }$ & $\text{ CIFAR100[50:99] }$ & $\text{ CIFAR100[50:99]-C sev 5 (W) }$ & 0.63 & 0.78 \\


\bottomrule
\end{tabularx}

\label{table:different_ood}
\end{center}
\end{table}

% =============================================================================


% #############################################################################
\vspace{-0.5cm}
\section{Medical OOD detection benchmark}
\label{sec:appendix_medical}

The medical OOD detection benchmark is organized as follows. There are four
training (ID) data sets, from three different domains: two data sets with chest
X-rays, one with fundus imaging and one with histology images. For each ID data
set, the authors consider three different OOD scenarios:

\begin{enumerate}[leftmargin=*]
  \compresslist

  \item Use case 1: The OOD data set contains images from a completely different
    domain, similar to our category of easy OOD detection settings.

  \item Use case 2: The OOD data set contains images with various corruptions,
    similar to the hard covariate shift settings that we consider in
    Section~\ref{sec:appendix_cov_shift}.

  \item Use case 3: The OOD data set contains images that come from novel
    classes, not seen during training.

\end{enumerate}

\begin{figure}[H]
  \begin{center}
    \includegraphics[width=0.7\textwidth]{figures/dataset_samples/medical_samples.png}
  \end{center}

\vspace{-0.5cm}
  \caption{Samples from the medical image benchmark. There are 3 ID data sets
  containing frontal and lateral chest X-rays and retinal images. Hard OOD
samples contain images of diseases that are not present in the training set.}

  \label{fig:medical_samples}
\end{figure}

\vspace{-0.2cm}
The authors evaluate a number of methods on all these scenarios. The methods can
be roughly categorized as follows:

\begin{enumerate}[leftmargin=*]
  \compresslist

  \item Data-only methods: Fully non-parametric approaches like kNN.

  \item Classifier-only methods: Methods that use a classifier trained on the
    training set, e.g.\ ODIN \citep{odin}, Mahalanobis \citep{mahalanobis}. $\method$
    falls into this category as well.

  \item Methods with Auxiliary Models: Methods that use an autoencoder or a
    generative model, like a Variational Autoencoder or a Generative Adversarial
    Network. Some of these approaches can be expensive to train and difficult to
    optimize and tune.

\end{enumerate}

We stress the fact that for most of these methods the authors use (known) OOD
data during training. Oftentimes the OOD samples observed during training come
from a data set that is very similar to the OOD data used for evaluation.  For
exact details regarding the data sets and the methods used for the benchmark, we
refer the reader to \citet{Cao2020}. 
% We did not evaluate $\method$ on the histology image data set due to resource
% limitations; the data set is much larger than the others.

\vspace{-0.2cm}
\begin{figure}[H]
  \begin{center}
    \includegraphics[width=\textwidth]{figures/avg_medical_ood.png}
  \end{center}

\vspace{-0.5cm}
  \caption{AUROC averaged over all scenarios in the medical OOD detection
    benchmark \citep{Cao2020}. The values for all the baselines are computed
    using code made available by the authors of \citet{Cao2020}. Notably,
    most of the baselines assume oracle knowledge of OOD data at training time.}

  \label{fig:avg_medical_ood}
\end{figure}

% \vspace{-1cm}
In addition, in Figure~\ref{fig:avg_medical_novel_class} we present the average
taken over only the novel-class settings in the medical benchmark. We observe
that the performance of all methods is drastically affected, all of them
performing much worse than the average presented in
Figure~\ref{fig:avg_medical_ood}. This stark decrease in AUROC and TNR@95
indicates that novelty detection is indeed a challenging task for OOD detection
methods even in realistic settings. Nevertheless, 2-model ERD ensembles maintain
a better performance than the baselines.


% \vspace{-0.2cm}
\begin{figure}[H]
  \begin{center}
    \includegraphics[width=\textwidth]{figures/avg_medical_ood_novel_class.png}
  \end{center}

\vspace{-0.5cm}
  \caption{AUROC averaged over the novel-class scenarios in the medical OOD
  detection benchmark \citep{Cao2020}, i.e.\ only use case 3.}

  \label{fig:avg_medical_novel_class}
%   \vspace{-1cm}
\end{figure}

In Figures~\ref{fig:medical_nih}, \ref{fig:medical_pad} and~\ref{fig:medical_drd}
we present AUROC and AUPR (Area under the Precision Recall curve) for $\method$
for each of the training data sets, and each of the use cases.
Figure~\ref{fig:avg_medical_ood} presents averages over all settings that we
considered, for all the baseline methods in the benchmark.  Notably, $\method$
performs well consistently across data sets. The baselines are ordered by their
average performance on all the settings (see Figure~\ref{fig:avg_medical_ood}).

For all medical benchmarks, the unlabeled set is balanced, with an equal
number of ID and OOD samples (subsampling the bigger data set, if necessary). We
use the unlabeled set for evaluation.

\begin{figure}[H]
  \begin{center}
    \includegraphics[width=0.7\textwidth]{figures/medical_NIHCC.png}
  \end{center}

  \vspace{-0.5cm}
  \caption{Comparison between $\method$ and the various baselines on the NIH chest
  X-ray data set, for use case 1 (top), use case 2 (middle) and use case 3
(bottom). Baselines ordered as in Figure~\ref{fig:avg_medical_ood}.}
  \vspace{-0.2cm}

  \label{fig:medical_nih}
\end{figure}

  \vspace{-0.5cm}
\begin{figure}[H]
  \begin{center}
    \includegraphics[width=0.7\textwidth]{figures/medical_PAD.png}
  \end{center}

  \vspace{-0.5cm}
  \caption{Comparison between $\method$ and the various baselines on the PC chest
  X-ray data set, for use case 1 (top), use case 2 (middle) and use case 3
(bottom). Baselines ordered as in Figure~\ref{fig:avg_medical_ood}.}
  \vspace{-0.2cm}


  \label{fig:medical_pad}
\end{figure}

\begin{figure}[H]
  \begin{center}
    \includegraphics[width=0.7\textwidth]{figures/medical_DRD.png}
  \end{center}

  \vspace{-0.2cm}
  \caption{Comparison between $\method$ and the various baselines on the DRD fundus
    imaging data set, for use case 1 (top), use case 2 (middle) and use case 3
(bottom). Baselines ordered as in Figure~\ref{fig:avg_medical_ood}.}


  \label{fig:medical_drd}
%   \vspace{-1cm}
\end{figure}

% =============================================================================


% #############################################################################
\section{Effect of learning rate and batch size}
\label{sec:appendix_lr_bs}

We show now that $\method$ ensembles are not too sensitive to the choice of
hyperparameters. We illustrate this by varying the learning rate and the batch
size, the hyperparameters that we identify as most impactful. As
Figure~\ref{fig:lr_and_bs_hyperparam} shows, many different configurations lead
to similar novelty detection performance.

\begin{figure}[H]
    \centering
    \includegraphics[width=0.5\textwidth]{figures/learning_rate_and_batch_size_effect.png}

    \caption{
        AUROCs obtained with an ensemble of WRN-28-10 models, as the initial
        learning rate and the batch size are varied.
        We used the hardest setting, CIFAR100[0-49] as ID, and
        CIFAR100[50-99] as OOD.
    }
    \label{fig:lr_and_bs_hyperparam}
\end{figure}

% =============================================================================


% % #############################################################################
% \section{Resource requirements for $\method$}
% \label{sec:appendix_cost}
% 
% \paragraph{Computational cost} Our method can work with training each model in
% the ensemble from scratch (i.e.  random initialization), but it also preforms
% well when fine-tuning a network from pretrained weights. This reduces
% significantly the inference time for each batch of test data. For the settings
% we considered, on average, as few as three epochs of fine tuning are enough to
% achieve the best performance: the training is stopped early, on average, after
% three epochs, according to the condition on the validation loss.
% 
% \subsection{Dependence on ensemble size}
% 
% \begin{figure}[H]
%     \centering
%     \includegraphics[width=0.7\textwidth]{figures/vary_ensemble_size}
% 
%     \caption{Effect of ensemble size on CIFAR100 vs. SVHN. Both methods are trained from scratch for 100 epochs.}
%     \vspace{-0.5cm}
%     \label{fig:vary_ensemble_size}
% 
% \end{figure}
% 
% Figure~\ref{fig:vary_ensemble_size} shows that the good performance of $\method$ does
% not rely on a large number of models in the ensemble. Unlike vanilla ensembles,
% our method achieves a high AUROC with as few as 2 models. This is because, in
% vanilla ensembles, the networks are diverse `by chance', due to the
% stochasticity of the training procedure. On the other hand, in $\method$, our
% training method actively encourages the ensembles to be diverse, and hence two
% models will already disagree on OOD data almost as much as five.
% =============================================================================

% #############################################################################
\vspace{-0.2cm}
\section{Additional figure showing the dependence on the unlabeled set configuration}
\label{sec:appendix_vary_ood_ratio}

The configuration of the unlabeled set (i.e.\ the size of the unlabeled set, the
ratio of OOD samples in the unlabeled set) influences the performance of our
method, as illustrated in Figure~\ref{fig:vary_target_main}. Below, we show that
the same trend persists for different data sets too, e.g.\ when we consider
CIFAR10 as ID data and SVHN as OOD data.

\begin{figure}[h]
  \centering
  \includegraphics[width=0.7\textwidth]{figures/heatmap_cifar10_vs_svhn_cropped_ensemble3_holdout.png}

  \caption{ The AUROC of a 3-model ERD ensemble as the number and proportion of
  ID (CIFAR10) and OOD (SVHN) samples in the unlabeled set are varied.  }

  \label{fig:vary_target}
  \vspace{-0.2cm}

\end{figure}

% =============================================================================


% #############################################################################
\section{Learning curves for other data sets}
\label{sec:appendix_learning_curves}

In addition to Figure~\ref{fig:training_curves}, we present in this section learning curves for
other data sets as well. The trend that persists throughout all figures is that
the arbitrary label is learned first on the unlabeled OOD data. Choosing a
stopping time before the validation accuracy starts to deteriorate prevents the
model from fitting the arbitrary label on unlabeled ID data.

\paragraph{Impact of near OOD data on training $\method$ ensembles.} The
learning curves illustrated in Figure~\ref{fig:appendix_training_curves} provide
insight into what happens when the OOD data is similar to the ID training
samples and the impact that has on training the proposed method. In particular,
notice that for CIFAR10[0-4] vs CIFAR10[5-9] in
Figure~\ref{fig:learning_curves_cifar_split}, the models require more training
epochs before reaching an accuracy on unlabeled OOD samples of 100\%. The
learning of the arbitrary label on the OOD samples is delayed by the fact that
the ID and OOD data are similar, and hence, the bias of the correctly labeled
training set has a strong effect on the predictions of the models on the OOD
inputs. Since we early stop when the validation accuracy starts deteriorating
(e.g.\ at around epoch $8$ in Figure~\ref{fig:learning_curves_cifar_split}), we
end up using models that do not interpolate the arbitrary label on the OOD
samples. Therefore, the ensemble does not disagree on the entirety of the OOD
data in the unlabeled set, which leads to lower novelty detection performance.
Importantly, however, our empirical evaluation reveals that the drop in
performance for $\method$ ensembles is substantially smaller than what we
observe for other OOD detection methods, even on near OOD data sets.

\begin{figure*}[h]
  \centering

  \begin{subfigure}[r]{0.49\textwidth}
    \centering
    \includegraphics[width=1\textwidth]{figures/training_curves_pretrained_svhn_cropped.png}
    \caption{ID = SVHN; OOD = CIFAR10.}
  \end{subfigure}
  \hfill
  \begin{subfigure}[r]{0.49\textwidth}
    \centering
    \includegraphics[width=1\textwidth]{figures/training_curves_pretrained_svhn_cropped01234.png}
    \caption{ID = SVHN[0-4]; OOD = SVHN[5-9].}
  \end{subfigure}

  \begin{subfigure}[r]{0.49\textwidth}
    \centering
    \includegraphics[width=1\textwidth]{figures/training_curves_pretrained_cifar10.png}
    \caption{ID = CIFAR10; OOD = SVHN.}
  \end{subfigure}
  \hfill
  \begin{subfigure}[r]{0.49\textwidth}
    \centering
    \includegraphics[width=1\textwidth]{figures/training_curves_pretrained_cifar1001234.png}
    \caption{ID = CIFAR10[0-4]; OOD = CIFAR10[5-9].}
    \label{fig:learning_curves_cifar_split}
  \end{subfigure}

  \begin{subfigure}[r]{0.49\textwidth}
    \centering
    \includegraphics[width=1\textwidth]{figures/training_curves_pretrained_cifar100.png}
    \caption{ID = CIFAR100; OOD = SVHN.}
  \end{subfigure}
  \hfill
  \begin{subfigure}[r]{0.49\textwidth}
    \centering
    \includegraphics[width=1\textwidth]{figures/training_curves_pretrained_cifar1000-50.png}
    \caption{ID = CIFAR100[0-49]; OOD = CIFAR100[50-99].}
  \end{subfigure}

  \caption{ \small{Accuracy measured while fine-tuning a model
      pretrained on $\sourceset$ (epoch 0 indicates values obtained with the
      initial pretrained weights). The samples in $\labeledtargetood$ are fit first, while
      the model reaches high accuracy on $\labeledtargetid$ much later. We
      fine-tune for at least one epoch and then early stop when the validation
      accuracy starts decreasing.}}

  \label{fig:appendix_training_curves}
\end{figure*}

% #############################################################################


% #############################################################################
\vspace{-0.3cm}
\section{Evolution of disagreement score during fine-tuning}
\label{sec:appendix_score_curves}

In this section we illustrate how the distribution of the disagreement score
changes during fine-tuning for ID and OOD data, for a 5-model ERD ensemble.
Thus, we can further understand why the performance of the $\method$ ensembles
is impacted by near OOD data.

Figure~\ref{fig:appendix_score_curves} reveals that for far OOD data (the left
column) the disagreement scores computed on OOD samples are well separated from
the disagreement scores on ID data (note that disagreement on OOD data is so
concentrated around the maximum value of $2$ that the boxes are essentially
reduced to a line segment). On the other hand, for near OOD data (the right
column) there is sometimes significant overlap between the disagreement scores
on ID and OOD data, which leads to the slightly lower AUROC values that we
report in Table~\ref{table:main_results}.

The figures also illustrate how the disagreement on the ID data tends to
increase as we fine-tune the ensemble for longer, as a consequence of the models
fitting the arbitrary labels on the unlabeled ID samples. Conversely, in most
instances one epoch suffices for fitting the arbitrary label on the OOD data.

We need to make one important remark: While in the figure we present
disagreement scores for the ensemble obtained after each epoch of fine-tuning,
we stress that the final $\method$ ensemble need not be selected among these. In
particular, since each model for $\method$ is early stopped separately,
potentially at a different iteration, it is likely that the $\method$ ensemble
contains models fine-tuned for a different number of iterations. Since we select
the $\method$ ensembles from a strictly larger set, the final ensemble selected
by our proposed approach will be at least as good at distinguishing ID and
OOD data as the best ensemble depicted in
Figure~\ref{fig:appendix_score_curves}.

\begin{figure*}[t]
  \centering

  \begin{subfigure}[r]{0.49\textwidth}
    \centering
    \includegraphics[width=1\textwidth]{figures/score_curves_svhn_cropped.png}
    \caption{ID = SVHN; OOD = CIFAR10.}
  \end{subfigure}
  \hfill
  \begin{subfigure}[r]{0.49\textwidth}
    \centering
    \includegraphics[width=1\textwidth]{figures/score_curves_svhn_cropped01234.png}
    \caption{ID = SVHN[0-4]; OOD = SVHN[5-9].}
  \end{subfigure}

  \begin{subfigure}[r]{0.49\textwidth}
    \centering
    \includegraphics[width=1\textwidth]{figures/score_curves_cifar10.png}
    \caption{ID = CIFAR10; OOD = SVHN.}
  \end{subfigure}
  \hfill
  \begin{subfigure}[r]{0.49\textwidth}
    \centering
    \includegraphics[width=1\textwidth]{figures/score_curves_cifar1001234.png}
    \caption{ID = CIFAR10[0-4]; OOD = CIFAR10[5-9].}
  \end{subfigure}

  \begin{subfigure}[r]{0.49\textwidth}
    \centering
    \includegraphics[width=1\textwidth]{figures/score_curves_cifar100.png}
    \caption{ID = CIFAR100; OOD = SVHN.}
  \end{subfigure}
  \hfill
  \begin{subfigure}[r]{0.49\textwidth}
    \centering
    \includegraphics[width=1\textwidth]{figures/score_curves_cifar1000-50.png}
    \caption{ID = CIFAR100[0-49]; OOD = CIFAR100[50-99].}
  \end{subfigure}

  \caption{ \small{The distribution of the disagreement score measured during
  fine-tuning on ID and OOD data (blue and orange boxes, respectively). The box
  indicates the lower and upper quartiles of the distribution, while the
  middle line represents the median and the whiskers show the extreme values.
  Notice that the distributions of the scores are easier to distinguish for far
  OOD data (left column), and tend to overlap more for near OOD settings (right
  column).}}

  \label{fig:appendix_score_curves}
\end{figure*}

% #############################################################################



% % #############################################################################
% 
% \section{The importance of finer grained FPR control}
% \label{sec:appendix_num_classes}
% 
% TODOTODO
% 
% \begin{figure*}[t]
%   \begin{center}
%     \includegraphics[width=0.75\textwidth]{figures/auroc_vs_num_classes.png}
%   \end{center}
% 
%   \vspace{-0.3cm}
%   \caption{\small{Running $\method$ with a varying number of training classes on
%   CIFAR100:0-50 vs CIFAR100:50-100. Using more of the information coming from
%   the labels of the training data allows for picking a better early stopping
%   time which in turn leads to improved OOD detection performance, compared to
%   the baselines that pool together all the ID data in one class.} }
% 
%   \label{fig:vary_num_classes}
% 
%   \vspace{-0.3cm}
% \end{figure*}
% 
% % =============================================================================

% % #############################################################################
% \section{OOD detection hardness}\label{sec:appendix_hardness}
% 
% Out of distribution detection benchmarks can be assessed based on their
% difficulty. In what follows, we propose a simple way to evaluate the hardness of
% an OOD detection setting and provide empirical evidence that shows that the
% scenarios we looked at are indeed more complicated than some of the common OOD
% detection benchmarks.
% 
% Consider the task of distinguishing between samples that come from two
% distributions $P, Q$ with disjoint supports $\idsupp, \oodsupp$.  Let us assign
% labels according to the distribution the points are coming from: $\DD=\{ (x_i,
% y_i) : y_i=-1 \iftext x_i \in \idsupp, y_i=1 \iftext x_i \in \oodsupp \}$. We
%   solve the classification problem by searching for a minimizer of the empirical
%   risk inside a function class $\FF$.
% 
% The intuition for our measure of \textit{hardness} is as follows: if it is
% difficult for a binary classifier to separate samples from $P$ and $Q$, then it
% will also be difficult to detect test samples from $Q$ as OOD, when only a
% training set drawn from $P$ is available.
% 
% To quantify the difficulty of the binary classification problem, we use the area
% under the training curve, i.e.\ the curve of the training loss as a function of
% iterations of the optimization algorithm. The larger the area, the more
% iterations it takes to converge, which in turn indicates that the classification
% problem is difficult.
% 
% Formally, we define the \textit{hardness} of the OOD detection task with respect to a
% function class $\FF$ as:
% 
% $$\hardness(\DD; \FF) := \int_{0}^{1} L_n(f_{t}) dt,$$
% \vspace{1mm}
% 
% where $f_t$ is the model after a fraction $t$ of the training epochs are
% finished and $L_n$ is the empirical loss function.
% 
% For our task, we start with a VGG model and train it for 30 epochs. In order to
% approximate the integral, we take the training loss for the whole data set every
% 5 epochs, and we average these losses.
% 
% \vspace{0.5cm}
% \begin{table}[h]
% 
% \caption{OOD detection hardness.}
% \label{table:ood_hardness}
% 
% \begin{center}
% 
% \begin{tabularx}{\textwidth}{lllX}
% \toprule
%  & \textbf{ID data set} & \textbf{OOD data set} &  $\hardness$ \\
% \midrule
%   \multirow{5}{*}{1) Easy OOD} & MNIST & FashionMNIST & 0.01 \\
%                             & FashionMNIST & MNIST & 0.01 \\
%                             & SVHN & CIFAR10 & 0.05 \\
%                             & CIFAR10 & SVHN & 0.05 \\
%                             & CIFAR100 & SVHN & 0.06 \\
% 
% \midrule
% 
%   \multirow{5}{*}{2) Novel classes} & MNIST[0:4] & MNIST[5:9] & 0.15 \\
%                                & FashionMNIST:0,2,3,7,8 & FashionMNIST:1,4,5,6,9 & 0.27 \\
%                                & SVHN[0:4] & SVHN[5:9] & 0.24 \\
%                                & CIFAR10[0:4] & CIFAR10[5:9] & 0.45 \\
%                                & CIFAR100[0:49] & CIFAR100[50:99] & 0.59 \\
% 
%   \midrule
% 
%   \multirow{3}{*}{3) Hard covariate shift} & CIFAR10 & CIFAR10-C sev5 & 0.08 \\
%                                       & CIFAR100 & CIFAR100-C sev5 & 0.10 \\
%                                       & ImageNet & ObjectNet & 0.18 \\
% 
% \midrule
% \midrule
% 
%   \multirow{2}{*}{3') Easy covariate shift} & CIFAR10 & CIFAR10-C sev2 & 0.18 \\
%                                       & CIFAR100 & CIFAR100-C sev2 & 0.19 \\
% \end{tabularx}
% \end{center}
% \end{table}
% 
% Notice that the settings with novel classes and the one with hard covariate
% shift are generally more difficult than the common benchmarks used in the OOD
% detection literature.
% 
% Apart from the three categories of settings that we introduced in
% Section~\ref{sec:experiments}, we also present numbers for CIFAR10-C and
% CIFAR100-C with lower-severity corruptions, i.e.\ severity 2. These scenarios
% are usually easier to solve with domain adaptation techniques. Nevertheless, in
% Appendix~\ref{sec:appendix_more_experiments} we show that $\method$ performs well on
% OOD detection on these settings as well. Even though performing OOD detection is
% redundant here, the good results of our method go to show that it can still work
% well, even in difficult situations.
% 
% Figure~\ref{fig:auroc_vs_hardness} TODOTODOTODO
% 
% \begin{figure*}[t]
%   \centering
%   \hspace*{\fill}%
%   \begin{subfigure}[r]{0.28\textwidth}
%     \centering
%     \includegraphics[width=\textwidth]{figures/transductive_setting.png}
%   \end{subfigure}
%   \hfill
%   \begin{subfigure}[c]{0.60\textwidth}
%     \includegraphics[width=\textwidth]{figures/auroc_vs_hardness.png}
%   \end{subfigure}
%   \hspace*{\fill}%
% 
%   \caption{\small{Performance of $\method$ compared to competitive baselines. The
%       horizontal axis shows various data sets ranked by difficulty (i.e.\ sorted
%       by our method's AUROC). The solid lines show the gap between $\method$ and the
%       next best baseline. The gap is wider for hard OOD detection settings. }}
% 
% \label{fig:auroc_vs_hardness}
% \vspace{-0.4cm}
% \end{figure*}
% 
% % =============================================================================



% % #############################################################################
% \section{Dependence on the test set configuration}
% \label{sec:appendix_test_size}
% 
% In order to bridge the gap between (offline) transductive OOD detection and
% online anomaly detection we investigate the impact of the size of the test set
% on the OOD detection performance. In addition, we also vary the ratio of OOD
% samples in the test set, i.e.\ $\frac{|\targetoodset|}{|\targetidset| +
% |\targetoodset|}$. Our findings suggest that there is a broad spectrum of values
% for which $\method$ maintains a good performance. In
% Figures~\ref{fig:vary_target_pretrained} and \ref{fig:vary_target} we show the
% AUROC of $\method$ and vanilla ensembles for various configurations of the test set,
% when the $\method$ method is trained from pre-trained weights or random
% initializations.  For $\method$, we use a smaller batch size of 32. The reason is
% that, for settings where there is almost no OOD data, larger batch sizes (like
% 128) will `drown out' the signal from OOD samples, since there will be so few of
% them per batch, compared to the ID samples.
% 
% However, when there are only a few very diverse OOD test samples, their
% contribution to the gradient is small. Moreover, if the number of ID test
% samples is large, fitting a single arbitrary label on them is much easier. As a
% consequence, the arbitrary label can take longer to fit on the (few) OOD
% samples than on the (many) test ID samples, leading to false positives, i.e.\
% ID samples incorrectly flagged as OOD. This loss in efficacy can be mitigated
% either by splitting the test set into smaller batches or by using a different
% labeling scheme for the test set, the details of which we leave as future work.
% Alternatively, one can use both an inductive and a transductive method to do OOD
% detection: the transductive method ensures high detection accuracy, if there are
% enough OOD samples in the test batch, while the inductive method can be used as
% a fallback solution.
% 
% 
% \begin{figure}[h]
%   \centering
%   \begin{subfigure}[b]{\textwidth}
%     \centering
%     \includegraphics[width=0.75\textwidth]{figures/vary_target_heatmap_pretrained.png}
%     \caption{Initialization from pre-trained weights.}
%     \label{fig:vary_target_pretrained}
%   \end{subfigure}
% 
%   \begin{subfigure}[b]{\textwidth}
%     \centering
%     \includegraphics[width=0.75\textwidth]{figures/vary_target_heatmap_scratch.png}
%     \caption{Initialization from random weights.}
%     \label{fig:vary_target}
% 
%   \end{subfigure}
% 
% 
%   \caption{ AUROCs obtained with an ensemble of ResNet20 models as the
%     composition of the test set is changed. Only settings with at least 5 OOD
%     samples have been considered. The ID samples are from CIFAR10, while the OOD
%     samples are from CIFAR10-C/snow5. For comparison, we provide in parentheses
%     AUROC values for a vanilla ensemble trained once on the training set and
%     evaluated on each test set configuration.
%     (a) The models are initialized from pre-trained weights. (b) The models are
%     initialized with random weights.
%   }
% 
% \end{figure}
% 
% % =============================================================================



% % #############################################################################
% \section{Regularized training/test discriminator for transductive OOD detection}
% 
% \at{TODOTODO delete this section probably}
% 
% In \cite{scott08}, the authors suggest that training a binary classifier with
% bounded false positive rate to distinguish between the training set $\sourceset$
% and the test set $\targetset$ can successfully separate the OOD samples from
% the ID samples in the test set. However, this approach does not fall in the
% category of predictive uncertainty-based OOD detection methods, since it does
% not also provide a good classifier for the labeled training set.
% 
% We present in what follows a set of experiments run to check if a similar
% technique works for the data sets we considered. Early stopping with respect to
% a validation set that contains only ID samples is enough to obtain good OOD
% detection performance.
% 
% For the corruption data sets, the table shows the average of the AUROC
% taken over all corruptions (A), and the value for the worst-case setting
% (W).
% 
% \begin{table}[h]
%   \begin{center}
% 
%   \caption{AUROC for an early-stopped binary classifier trained to separate the
%   training set from the test set.}
% 
%   \input{tables/binary_classifier}
% 
% \end{center}
% \end{table}
% % =============================================================================
% 
% 
% % #############################################################################
% \section{Generalization to hold-out test set}
% \label{sec:appendix_semi_supervised}
% 
% \at{TODOTODO delete this section probably}
% 
% In this section we present experiments which show that after
% training/fine-tuning on a test set with ID and OOD samples, one can also use our
% method to detect OOD samples from the same distribution, that have not been seen
% during training. Concretely, we use a test set of 5000 ID and 5000 OOD samples
% to train $\method$ ensembles (we reiterate that we do not have access to which
% samples are indeed OOD in the test set). For evaluation, we compute the metrics
% on a separate data set, with 5000 ID and 5000 OOD samples, where the OOD samples
% come from the same distribution as the samples seen during training. As revealed
% in Table~\ref{table:generalization}, the performance does not change
% substantially when evaluating on the hold-out test set. For the corruption
% data sets, we report for each metric the average taken over all corruptions (A),
% and the value for the worst-case setting (W).
% 
% \begin{table}[H]
% \tiny
% 
% \caption{Generalization to the hold-out test set.}
% \label{table:generalization}
% 
% \begin{center}
% 
% \input{tables/holdout}
% 
% \end{center}
% \end{table}
% 
% % =============================================================================

\newpage
\bibliography{appendix}
\end{document}

%%% Local Variables:
%%% mode: latex
%%% TeX-master: t
%%% End:
