\section{Proposed method}
\label{sec:method}


% \vspace{-0.1cm}
\begin{figure*}[t] 
  \begin{minipage}[t]{0.50\linewidth}
    \removelatexerror
        \setlength{\algomargin}{-0.0cm}
        \begin{algorithm}[H]
        \DontPrintSemicolon
        \small
 
     \SetKwInOut{KwInput}{Input}

     \KwInput{Train set $\sourceset$, ID Validation set $\validset$, Unlabeled set
     $\targetset$, Model $\tilde{f}$ pretrained on $\sourceset$, Ensemble size $K$}

     \KwResult{$\method$ ensemble $\{\fhat_{y_i}\}_{i=1}^K$}

     Sample $K$ different labels $\{y_1, ..., y_K\}$ from $\YY$

     \For(\tcp*[h]{fine-tune $K$ models}){$c \in \{ y_1, ..., y_K\}$} {
       $\fhat_c \gets \textit{Initialize}(\tilde{f})$\;
       $\labeledtarget \gets \{ (x, c) : x \in \targetset \}$\;
       $\fhat_c \gets \textit{EarlyStoppedFineTuning}\left(\fhat_c, \sourceset \cup \labeledtarget;
       \validset \right)$\;
     }

     \KwRet $\{\fhat_{y_i}\}_{i=1}^K$ \;

     \caption{Obtaining $\method$ ensemble via early stopping}
     \label{algo:reto_training}

    \end{algorithm}
  \end{minipage}
  \hspace{0.2cm}
  \begin{minipage}[t]{0.48\textwidth}
    \removelatexerror
        \setlength{\algomargin}{0.2cm}
        \begin{algorithm}[H]
        \DontPrintSemicolon
        \small

     \SetKwInOut{KwInput}{Input}

     \KwInput{Ensemble $\{\fhat_{y_i}\}_{i=1}^K$, Test set $\testset$, $\outputs
       = \emptyset$, Threshold~$\thresh$,
   Disagreement metric $\dis$}

     \KwResult{$\outputs$, i.e.\ the novel-class samples from $\testset$}
     \vspace{0.075in}

     \For(\tcp*[h]{run hypothesis test}){$x \in \testset$} {
     \vspace{0.075in}
         \If{$\Tdis(\fhat_{y_1}(x), ..., \fhat_{y_{K}}(x)) > \thresh$} {
         \vspace{0.05in}
         %\left(x; \fhat_{y_1},\dots,\fhat_{y_K}\right)
          $\outputs \gets \outputs \cup \{x\}$\;
        }
     }
     \vspace{0.075in}
     \KwRet $\outputs$ \;

     \caption{Novelty detection using $\method$}
     \label{algo:reto_detection}

    \end{algorithm}
  \end{minipage}

%   \vspace{-0.4cm}
\end{figure*}


% 
% \fy{I'm actually a bit confused - is ERD the method/algorithm? isn't it only
% part of the detection method ... in particular, its the output of algo 1 and
% not the algo itself ... make sure this naming is correct} 
% 
% In this section we introduce our proposed algorithm, $\method$, and provide a
% principled justification for the key ingredients that lead to the improved
% performance of our method.  % We then contrast our approach with some related
% methods and discuss their % limitations.
In this section we first introduce our proposed method to obtain Ensembles with
Regularized Disagreement ($\method$) and then describe how they can be used for
novelty detection.
% We then provide the theoretical motivation for it and argue why it is
% well-suited for novelty detection.

\subsection{Training ensembles with regularized disagreement ($\method$)}
\label{sec:RETOproc}

Recall from Figure~\ref{fig:practical_sketch} that we have access to both a
labeled training set $\sourceset = \{(x_i, y_i)\}_{i=1}^{n} \sim \idjoint$, with
covariates $x_i \in \idsupp$ and discrete labels $y_i \in \YY$, and an unlabeled
set $\targetset$, which contains both ID and unknown OOD samples. Moreover, we
initialize the models of the ensemble using the weights of a predictor with good
in-distribution performance, pretrained on $\sourceset$. In the scenarios we
consider, such a well-performing pretrained classifier is readily available, as
it solves Task~I in Figure~\ref{fig:practical_sketch}.
% \footnote{In the appendix we also present a version of $\method$ trained for
% more epochs from random initializations, i.e.\ $\method$++.}
  
The entire training procedure is described in
Algorithm~\ref{algo:reto_training}.  For training a single model in the
ensemble, we assign a label $c \in \YY$ to all the unlabeled points in
$\targetset$, resulting in the $\targetlabel$-labeled set that we denote as
$\labeledtarget \defn \{(x,c): x\in \targetset\}$. We then fine-tune a
classifier $\fhat_c$ on the union $\sourceset \cup \labeledtarget$ of the
correctly-labeled training set $\sourceset$ and the artificially labeled set
$\labeledtarget$.  In particular, we choose an early stopping time at which
% early stop training such that
validation accuracy is high and training error on $\sourceset \cup
\labeledtarget$ is low.
% In practice the early stopping time can be chosen post-hoc. 
%% The model $f_c$ that we output is regularized to have high
%% accuracy on the ID validation set while still achieving a low training error on
%% $\sourceset \cup \labeledtarget$. \fy{mention ES here?}
%% namely we pick a model at an intermediate epoch,
%% before the accuracy on a holdout ID validation set $\validset$ starts
%% to decrease.
We create a diverse ensemble of $K$ classifiers $\fhat_c$ by choosing a
different artificial label $c \in \YY$ for every model.


%% use this ensemble to flag as OOD all the points for which an aggregate
%% disagreement measure (described in Section~\ref{sec:disagreement})
%% surpasses a threshold value $t_0$.

Intuitively, encouraging each model in the ensemble to fit different labels to
the unlabeled set $\targetset$ promotes disagreement, as shown in
Figure~\ref{fig:setting_ensemble}.  In the next sections, we elaborate on how to
use diverse ensembles for novelty detection.


%% for OOD detection
%% \citep{Jain2020} or for better predictive performance \citep{Bennett2002,
%% Zhang2010} The maximum discrepancy method (MCD) \citep{mcd_ood}, the
%% only candidate that can be used with complex models like neural
%% networks, tends to result in ensembles that do not disagree enough on
%% OOD data, leading to subpar novelty detection performance (see
%% Figure~\ref{fig:scores_mcd_es} in
%% Appendix~\ref{sec:appendix_statistic}).

% Intuitively, the training procedure encourages the models to produce different
% predictions on the OOD samples in $\targetset$, while regularization prevents
% them from fitting the incorrect label $\targetlabel$ and hence disagreeing on
% the ID points.

%% \fy{isn't it that we illustrate how to use ERD}
%% Next, we explain how to use ERD for detection and argue why regularized
%% disagreement is essential for OOD detection.
%% and in Section~\ref{sec:earlystopping} we justify why
%% we can find such models using early stopping.
%% and that, indeed, the resulting ensemble
%% disagrees only on the OOD samples in $\targetset$.
%We emphasize again that we do not require any labeled OOD data during training
%as opposed to many other OOD . \fy{really need?}
%% and
%% compared to vanilla ensembles, the additional computational costs are solely
%% due to fitting the additional small set $(U,c)$ in every epoch which.
%% , however,
%% may be outweighed by the time savings due to early stopping.
%% \fy{another problem is that if OOD is very hard to fit this'll be long}
%% \at{in a way, if OOD is hard to fit, it just means our algorithm has low power.
%% ID always takes as much to start being fit, $\le 10$ epochs on our data sets.
%% if OOD cannot be fit before that, we won't detect it with our algorithm}


\begin{figure*}[t]
  \centering

    \includegraphics[width=0.85\textwidth]{figures/regularization_effect_new.pdf}

%     \caption{\small{a) Two models trained on $\sourceset \cup \labeledtarget$
%         form an ensemble that disagrees on both ID and OOD data. b) Regularization
%         prevents individual models from fitting $\wronglylabeledtargetid$,
%         limiting disagreement to only OOD samples. c) In contrast,
%     ensembles with too little disagreement fail to detect OOD samples.}}

    \caption{
%       \small{
        a) Ensembles with too little disagreement fail to detect OOD
        samples. b) An ensemble of two models trained on $\sourceset \cup
        \labeledtarget$ disagrees on both ID and OOD data. c) Regularization
        prevents models from fitting $\wronglylabeledtargetid$,
    limiting disagreement to only OOD samples.
% }
}

%    \label{fig:regularization_effect}
   \label{fig:disagreement_types}
%   \vspace{-0.4cm}
 \end{figure*}

% \vspace{-0.2cm}
\subsection{Ensemble disagreement for novelty detection}
\label{sec:disagreement}
% \vspace{-0.1cm}

We now discuss how we can use ensembles with disagreement to detect OOD
samples and why the right amount of diversity is crucial. Note that we
can cast the novelty detection problem as a hypothesis test with the null
hypothesis $H_0: x \in \idsupp$.

%% During test time we run Algorithm~\ref{algo:reto_detection}, that flags all samples
%% as OOD that have an aggregate disagreement metric (described in
%% Section~\ref{sec:disagreement}) larger than a threshold value $t_0$.
%% are flagged as OOD while on the remaining samples we can output
%% predictions using the initial model trained on $\sourceset$.
% \fy{we do sell our thing as dedicated OOD detector right? cause
% used to have: predict on the rest...}
% \at{yes, OOD detector; one can do prediction with the model pretrained on S}

As usual, we test the null hypothesis by comparing a test statistic with a
threshold $\thresh$: 
% using an $\method$ ensemble with a disagreement metric: We propose
% Algorithm~\ref{algo:reto_detection} to test the null hypothesis by using an
% disagreement metric using the \method:
The null hypothesis is \emph{rejected} and we report $x$ as OOD
(\emph{positive}) if the test statistic is larger than $\thresh$
(Section~\ref{sec:erd_eval} elaborates on the choice of $\thresh$).
% \fy{say sth like this is normal usage of diverse ensemble and cite or sth
% unless we want to make it a contribution - pick the battles}
In particular, we use as test statistic the following disagreement score, which computes
% To quantify disagreement we use
the average distance between the softmax outputs of the $K$ models in the
ensemble:

\vspace{-0.5cm}
\begin{align*}
%   T_{\text{avg-TV}}(x) := \frac{1}{K(K-1)} \sum_{i\neq j} \TV \left(f_i(x),
%   f_j(x)\right),
  \Tdis(f_1(x), \dots, f_K(x)):=\frac{2\sum_{i< j} \dis \left(f_i(x),
  f_j(x)\right)}{K(K-1)},
\end{align*}

\vspace{-0.2cm}
\noindent where $\dis$ is a measure of disagreement between the softmax outputs
of two predictors, for example the total variation distance
$\dis_{\text{TV}}(f_i(x), f_j(x))=\frac{1}{2} \|f_i(x) - f_j(x) \|_1$ used in
our experiments\footnote{We also expect other distance metrics to be similarly
effective.}. We provide a thorough discussion on the soundness of this
test statistic for disagreeing models and compare it with previous metrics in
Appendix~\ref{sec:appendix_statistic}.

Even though previous work like \citet{mcd_ood} used a similar disagreement
score, their detection performance is notably worse. The reason lies in the lack
of diversity in their trained ensemble (see Figure~\ref{fig:scores_mcd_es} in
Appendix~\ref{sec:appendix_statistic}). On the other hand,
Algorithm~\ref{algo:reto_training} without early stopping would lead to an
overly diverse ensemble that also disagrees on ID points and hence has a high
false positive rate (see Appendix~\ref{sec:appendix_score_curves}). In the next
section, we explain why novelty detection with this test statistic crucially
relies on the right amount of ensemble diversity and how ensembles may achieve
this goal if they are trained to have regularized disagreement.

% \vspace{-0.3cm}
\subsection{Desired ensemble diversity via regularized disagreement}
\label{sec:regularized_disagreement}
% \vspace{-0.1cm}

% using ensembles with regularized disagreement is crucial
For simplicity of illustration, let us first assume a training set with binary labels
and a semi-supervised novelty detection setting as depicted in
Figure~\ref{fig:setting_ensemble}~a).
% First, we illustrate how we use the disagreement of models in an ensemble for
% OOD detection. For two different models in an ensemble as in
For an ensemble with two models, like in Figure~\ref{fig:setting_ensemble}~b),
the model predictions \emph{agree} on the blue and red areas and \emph{disagree}
on the gray area depicted in Figure~\ref{fig:setting_ensemble}~c).  Note that
the two models in Figure~\ref{fig:setting_ensemble} are \emph{just diverse
enough} to obtain both high power (flag true OOD as OOD) and low false positive
rate (avoid flagging true ID as OOD) at the same time. 
%% if we flag as OOD all the samples in the region where
%% the models disagree, we achieve high power and a low false positive
%% rate.
% In order to achieve %Ideally, the test should have high power (flag true OOD
% as OOD) and low false positive rate (avoid flagging true ID as OOD) with this
% test.

% Importantly, despite employing a similar disagreement-based test statistic, the
% method of \citet{mcd_ood} fails to produce diverse ensembles, and hence,
% performs poorly on novelty detection tasks.

Previous methods that try to leverage unlabeled data to obtain more
diverse ensembles either do not work with deep neural networks
\citep{Bennett2002,Zhang2010,Jain2020} or do not disagree enough on OOD data
\citep{mcd_ood}, leading to subpar novelty detection performance (see
Figure~\ref{fig:scores_mcd_es} in Appendix~\ref{sec:appendix_statistic}).
% The failure of previous ensemble methods can be primarily traced back to
% either too little disagreement as in Figure~\ref{fig:disagreement_types}~a),
% resulting in low power, or too much diversity, resulting in high false
% positive rate as in Figure~\ref{fig:disagreement_types}~c).

To obtain the right amount of diversity, it is crucial to train ensembles with
\emph{regularized disagreement} on the unlabeled set: The models
should disagree on the unlabeled OOD samples, but \emph{agree} on the unlabeled
ID points (Figure~\ref{fig:disagreement_types}c).
%% Next, we argue that we can get ensembles with the right amount of diversity with
%% $\method$, by \emph{regularizing disagreement} on the data used to train the
%% ensemble: While the models disagree on the unlabeled OOD samples, the key is
%% that they \emph{agree} on the unlabeled ID points
%% (Figure~\ref{fig:disagreement_types}c).
Thus, we avoid having too little disagreement as in
Figure~\ref{fig:disagreement_types}a), which results in low power, or too much
diversity, resulting in high false positive rate as in
Figure~\ref{fig:disagreement_types}b). In particular, if models $f_c$ predict
the correct label on ID points and the label $c$ on OOD data, we can effectively
use disagreement to detect novel-class samples.  Since classifiers with good ID
generalization need to be smooth, we expect the model predictions on holdout OOD
data from the same distributions to be in line with the predictions on the
unlabeled set.
% \at{NEW: transductive is necessary for semi-supervised to work} In contrast,
% training $\method$ ensembles finds the ``sweet spot'' in
% Figure~\ref{fig:disagreement_types}~b) where the models have \emph{regularized
% disagreement}: While the models disagree on the unlabeled OOD samples, the key
% is that they \emph{agree} on the unlabeled ID points. In particular, if models
% $f_c$ predict the correct label on ID points and the label $c$ on OOD data, we
% can effectively use disagreement to detect novel-class samples. Since
% regularized classifiers are smooth, we expect the model predictions on holdout
% ID and OOD data from the same distributions to be in line with the predictions
% on the unlabeled set. Therefore, novelty detection performance is governed by
% the following trade-off: A low training error on $\sourceset \cup
% \labeledtarget$ increases power, while maintaining a high validation accuracy
% limits the false positive rate.

%% In Sections~\ref{sec:earlystopping} and \ref{sec:experiments} we show
%% theoretically and empirically that we can achieve this regularized disagreement
%% using early stopping like in Algorithm~\ref{algo:reto_training}.

In Section~\ref{sec:earlystopping} we argue that the training procedure in
Algorithm~\ref{algo:reto_training} successfully induces
% combined with early stopping regularization 
\emph{regularized disagreement} and prove it in a synthetic setting.
% \fy{in Section 3 we show why ES allows us to get the right amount of
% disagreement} \fy{should this be here? maybe to early stopping section?
% actually you could probably prove sth re mahdi some special type of
% regularization as we once discussed with reinhard, but unsure that it would be
% easy to translate to other algorithm... } \at{I think it fits better here than
% in exp section} Finally, we note that instead of early stopping, one could
% also explicitly In particular, we prove for a synthetic setting that early
% stopping leads to
%ensembles with regularized disagreement.
Our experiments in Section~\ref{sec:experiments} further corroborate our
theoretical statements. Finally, we note that one could also use other
regularization techniques like dropout or weight decay. However, running a grid
search to select the right hyperparameters can be more computationally expensive
than simply using one run of the training process to select the optimal stopping
time.

% \fy{also rephrase above}

%% early-stopped fine-tuning
%% on the artificially labeled unlabeled set.


% \begin{figure}[!h]
%   \centering
% 
%     \includegraphics[width=\columnwidth]{figures/disagreement_types_unpadded.pdf}
% 
%     \caption{\small{Varying degrees of ensemble disagreement and how that
%     influences what regions are flagged OOD.}}
% 
%    \label{fig:disagreement_types}
% \end{figure}


% \vspace{-0.5cm}
\section{Provable regularized disagreement via early stopping}
% \section{Early stopping for provable regularized disagreement}
\label{sec:earlystopping}
% \vspace{-0.2cm}

% \fy{we could already make the transductive switch here. in particular,  and
% say: we now show how early stopping prevents fitting the artificial labels on
% the unlabeled in-distribution data.}
In this section, we show how using early stopping in
Algorithm~\ref{algo:reto_training} prevents fitting the incorrect artificial
label on the unlabeled ID samples.
% leads to ensembles with the right amount of disagreement.
%We now make this intuition rigorous in Theorem~\ref{}.
Albeit for a simplified setting, this result provides a rigorous proof of
concept and intuition for why $\method$ ensembles achieve the right amount of
diversity necessary for good novelty detection.


%% We base our claims on two insightful observations from the literature. Firstly,
%% empirical evidence shows that neural networks can fit arbitrary labels perfectly
%% \citep{zhang2016}. On the other hand, noisy samples with incorrect labels are
%% often fit later during training, after the correctly labeled samples
%% \cite{Yilmaz2019,mahdi,song2020,liu2020,xia2021}. This regularizing effect of
%% early stopping is also demonstrated in theoretical works which show that it
%% implicitly restricts model complexity \citep{Yao07, Raskutti13, fanny}.
% We now make this intuition rigorous in Theorem~\ref{}.  Albeit for a
% simplified setting, this result provides a rigorous proof of concept and
% intuition for why ERD ensembles work well.

% \vspace{-0.2cm}
\subsection{Preliminary definitions}

We first introduce necessary definitions to prepare the mathematical
statement. Recall that in our approach, in addition to the correct
labels of the ID training set $\sourceset$, each member of the
ensemble tries to fit one label $\targetlabel$ to the entire unlabeled
set $\targetset$. The resulting artificially labeled set can be partitioned as

\vspace{-0.6cm}
\begin{align*}
  \labeledtarget &= \labeledtargetid \cup \labeledtargetood \\
                 &= \{ (x, c) : x \in
  \targetidset \} \cup \{ (x, c) : x \in \targetoodset \},
\end{align*}
\vspace{-0.6cm}

where $\targetidset := \targetset \cap \idsupp$ and $\targetoodset := \targetset
\setminus \targetidset$. Moreover, assuming that the label of an ID input $x$ is
deterministically given by $\ystar(x)$,
%given by a deterministic function $\ystar: \XX \to \YY$,
we can partition the set $\labeledtargetid$ (see
Figure~\ref{fig:disagreement_types}b) into a subset of effectively ``correctly labeled'' samples $\correctlylabeledtargetid$ and ``incorrectly labeled'' samples $\wronglylabeledtargetid$:
%% samples whose ground truth label differs from $\targetlabel$ and are,
%% thus, incorrectly labeled with $\targetlabel$, and the subset whose
%% correct label is indeed $\targetlabel$:

\vspace{-0.6cm}
\begin{align*}
  \wronglylabeledtargetid &:= \{ (x, c) : x \in \targetidset \text{ with }
  \ystar(x) \neq \targetlabel \}, \\
  \correctlylabeledtargetid &:= \{ (x, c) : x \in \targetidset \text{ with }
  \ystar(x) = \targetlabel \}.
\end{align*}
\vspace{-0.6cm}

Note that $\wronglylabeledtargetid$ can be viewed as the subset of noisy samples
from the entire training set $\sourceset \cup \labeledtarget$.
%Assuming the OOD samples are representative,

%% If the models fit $\labeledtargetood$ perfectly to different artificial labels
%% $\targetlabel$, then they disagree on the OOD data in the labeled set, as
%% well as on unseen similar OOD samples, leading to a diverse ensemble. On the
%% other hand, if the models also fit the samples in $\wronglylabeledtargetid$, the
%% ensemble becomes too diverse, as shown in
%% Figure~\ref{fig:regularization_effect}a). Fortunately, as noted before, early
%% stopping can prevent models from fitting the training samples with
%% label noise, having an effect similar to the one depicted in
%% Figure~\ref{fig:regularization_effect}b).
% , which, in our case, amounts to the incorrectly labeled subset
% $\wronglylabeledtargetid$. 



% \vspace{-0.2cm}
\subsection{Main result}

We now prove that there indeed exists an optimal stopping time at which a
two-layer neural network trained with gradient descent does not fit the
incorrectly labeled subset $\wronglylabeledtargetid$, under mild distributional
assumptions.

% \at{NEW: assumptions and statement}
For the formal statement, we assume that the
artificially labeled set $\sourceset \cup \labeledtarget$ is \emph{clusterable},
i.e.\ the points can be grouped in $K$ clusters of similar sizes. Each class may
comprise several clusters, but every cluster contains only samples from one
class. Any cluster may include at most a fraction $\noise \in [0, 1]$ of samples
with label noise, e.g.\ $\wronglylabeledtargetid$.  We denote by $c_1, ..., c_K$
the cluster centers and define the matrix $C:=[c_1, ..., c_K]^T \in \RR^{K
\times d}$.  Further, let $\separability$ be a measure of how well a
randomly-initialized two-layer neural network can separate the cluster centers.
We provide the formal definition of $\separability$ in
Appendix~\ref{sec:appendix_theory}. Intuitively, $\separability$ is large if the
cluster centers are well-separated and it vanishes if $c_i = c_j$ for some
$i \neq j$.  Under these assumptions we have the following:
% Further, let $\separability$ be a measure of how well a two-layer neural
% network can separate the cluster centers ($\separability=0$ if $c_i = c_j$ for
% some $i, j \le K$).

\begin{proposition}[informal]
  \label{proposition_informal}
%   Assume that $\sourceset \cup \labeledtarget$ is \emph{clusterable} and can
  %   be partitioned into $K$ clusters with centers $c_1, ..., c_K$.  $\|C\|$ be
  %   its spectral norm.  that $\rho \le \delta / 8$ and $\eps \le \alpha \delta
  %   \lambda_{min}(\Sigma)^2 / K^2$, where $\delta$ is a constant such that
  %   $\delta \le \frac{2}{|\YY - 1|}$ and $\alpha$ is a constant that depends
  %   on $\Gamma$.
  It holds with high probability
%   $1 - 3 / K^{100} - Ke^{-100d}$
  over the initialization of the weights that a two-layer neural network
  trained on $\sourceset \cup \labeledtarget$ perfectly fits $\sourceset$,
  $\correctlylabeledtargetid$ and $\labeledtargetood$, but not
  $\wronglylabeledtargetid$, after $T\simeq\frac{\|C\|^2}{\separability}$
  iterations.
  %   where $a$ is a constant and $\separability$ measures how well a two-layer
  %   neural network can separate the cluster centers.  predicts the cluster
  %   label on all inputs in an $\eps$-neighborhood of the cluster center for
  %   all clusters, for an appropriately chosen stopping time.
\end{proposition}

% \fy{make statement formal, keep assumptions vague but maybe clusterability a
% bit like we had before - whatever is necessary to get the stopping time
% definition}
% \vspace{-0.2cm}
The precise assumptions for the proposition can be found in
Appendix~\ref{sec:appendix_theory}. On a high level, the reasoning follows from
two simple insights: (i) When the artificial label is not equal to the
true label, the ID samples in the unlabeled set can be seen as
noisy samples in the set $\sourceset \cup \labeledtarget$.  (ii) It is well known that early
stopping prevents models from fitting incorrect labels since noisy samples with
incorrect labels are often fit later during training (see, e.g., the theoretical
and empirical evidence in~\cite{Yilmaz2019,mahdi,song2020,liu2020}).  In
particular, our proof heavily relies on Theorem~2.2 of \citet{mahdi} which shows
that early stopped predictors are robust to label noise.

% \fy{see previous discussion - maybe cut this here:} Since regularized
% predictors are smooth, a consequence of Proposition~\ref{proposition_informal}
% is that the ensembles also disagree on similar, holdout OOD samples not
% included in $\targetoodset$, and predict the true label on ID data.

Proposition~\ref{proposition_informal} gives a flavor of the theoretical
guarantees that $\method$ enjoys.  Albeit simple, the clusterable data
model actually includes data with non-linear decision boundaries.  On the other
hand, the requirement that the clusters are balanced seems rather restrictive.
In our experiments we show that this condition is in fact more
stringent than necessary.  In particular, our method still works when the
number of OOD samples $|\targetoodset|$ is considerably smaller than the number
of ID samples from any given class, as we show in Section~\ref{sec:ablations}.
% \fy{Hence, necessarily, blablabla early
% stopping still works.  More details on the ablation study can be found in
% Section~\ref{sec:ablations}}
%% Nevertheless, for reliable novelty detection, the unlabeled set
%% should include a sufficient (albeit fairly low) number of OOD samples, as we
%% elaborate in Section~\ref{sec:ablations}.

%% some of the conditions are more stringent than they ought to be.
%% In particular, the requirement that the clusters are balanced can be relaxed
%% significantly, as shown in our experiments where the number of OOD samples
%% $|\targetoodset|$ is considerably smaller than the number of ID samples from any
%% given class. Nevertheless, for reliable novelty detection, the unlabeled set
%% should include a sufficient (albeit fairly low) number of OOD samples, as we
%% elaborate in Section~\ref{sec:ablations}.

%% \begin{proposition}[informal]
%%   \label{proposition}

%%   For a fixed label $\targetlabel\in\YY$ assume that the set $\sourceset \cup
%%   \labeledtarget$ is clusterable
%%   and each cluster $C_i$ only includes a few noisy samples from
%%   $\wronglylabeledtargetid$, namely $\frac{|C_i
%%   \cap\wronglylabeledtargetid|}{|C_i|} \le \rho$. If $\rho \lesssim
%%   \frac{1}{|\YY|}$, then it holds with high probability over the initialization
%%   of the weights that a two-layer neural network trained on $\sourceset \cup
%%   \labeledtarget$ perfectly fits $\sourceset$, $\correctlylabeledtargetid$ and
%%   $\labeledtargetood$, but not $\wronglylabeledtargetid$, for an appropriately
%%   chosen stopping time.

%% \end{proposition}


%% The precise conditions and statement of this proposition, which is a
%% straight-forward extension of Theorem~2.2 in \cite{mahdi}, can be found in
%% Appendix~\ref{sec:appendix_theory}. As a direct consequence of this result, our
%% method satisfies the nice property that it can correctly detect the OOD samples
%% in $\targetset$.


% \vspace{-0.3cm}
\subsection{Choosing the early stopping time}
% \vspace{-0.3cm}

In practice, we avoid computing the exact value of $T$ and instead use a
heuristic: we stop at the iteration with the highest validation
accuracy (indicated by the vertical line in Figure~\ref{fig:training_curves}).
As shown in the figure, the model fits the noisy training points, i.e.\
$\wronglylabeledtargetid$, late during fine-tuning, which causes the validation
accuracy to decrease, since the model will also predict the incorrect label
$\targetlabel$ on some validation ID samples. In
Appendix~\ref{sec:appendix_learning_curves} we show that the trend in
Figure~\ref{fig:training_curves} is consistent across data sets.

% \vspace{-0.1cm}
\begin{figure}[h!]
  \begin{center}
    \includegraphics[width=\columnwidth]{figures/training_curves_pretrained_svhn_cropped01234.png}
  \end{center}

%   \vspace{-0.4cm}
  \caption{
%     \small{
      Accuracy during fine-tuning a model
      pretrained on $\sourceset$ (epoch 0 indicates values obtained with the
      initial pretrained weights). The samples in $\labeledtargetood$ are fit
      first, while the model reaches high accuracy on $\labeledtargetid$ much
  later. We fine-tune for at least one epoch and then early stop when the
  validation accuracy starts decreasing after 7 epochs (vertical line). The
  model is trained on SVHN[0:4] as ID and SVHN[5:9] as OOD. 
%   }
}
%   \vspace{-0.3cm} \caption{ \small{Accuracy during fine-tuning on $\sourceset$
  %   (epoch 0 corresponds to the initial pretrained weights). The samples in
  %   $\labeledtargetood$ are fit first, while high accuracy on
  %   $\labeledtargetid$ comes much later. We fine-tune for at least one epoch
  %   and then early stop when the validation accuracy starts decreasing after 7
  %   epochs
%   (vertical line). ID=SVHN[0:4], OOD=SVHN[5:9].  }} \vspace{-0.3cm}

  \label{fig:training_curves}
\end{figure}



% \fy{rephrase: In practice, we cannot use $T=$ and we need a procedure
%   to choose the stopping time. As we noted earlier, $U_ID^c$ is
%   basically noisy data and is fit later.}  As these noisy training
% points are fit, validation accuracy likely decreases because it also
% incorrectly predicts the label $\targetlabel$ on some validation ID
% points.  Therefore, we can use the heuristic of picking the iteration
% with the largest validation accuracy (indicated by the vertical line
% in the figure). In Appendix~\ref{sec:appendix_learning_curves} we show
% that the trend in Figure~\ref{fig:training_curves} also persists for
% other data sets.

%% To find the best stopping time in practice, we use a validation set of
%% labeled ID points to select an intermediate checkpoint before
%% convergence.  As a model starts to fit $\wronglylabeledtargetid$,
%% i.e.\ the wrongly labeled ID samples in $\targetidset$, it also
%% incorrectly predicts the label $\targetlabel$ on some validation ID
%% points, leading to a decrease in validation accuracy, as shown in
%% Figure~\ref{fig:training_curves}. In our experiments, we wait for one
%% epoch to allow for the fine-tuning to have any effect at all, and then
%% pick the iteration with the largest validation accuracy (indicated by
%% the vertical line in the figure). In
%% Appendix~\ref{sec:appendix_learning_curves} we show that the trend in
%% Figure~\ref{fig:training_curves} also persists for other data sets.


