%\documentclass{uai2022} % for initial submission
 \documentclass[accepted]{uai2022} % after acceptance, for a revised
                                    % version; also before submission to
                                    % see how the non-anonymous paper
                                    % would look like
%% There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2022} % ptmx math instead of Computer
                                         % Modern (has noticable issues)
% \documentclass[mathfont=newtx]{uai2022} % newtx fonts (improves upon
                                          % ptmx; less tested, no support)
% NOTE: Only keep *one* line above as appropriate, as it will be replaced
%       automatically for papers to be published. Do not make any other
%       change above this note for an accepted version.

%% Choose your variant of English; be consistent
\usepackage[american]{babel}


\usepackage{natbib} % has a nice set of citation styles and commands
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}

\usepackage{import}
%%%%% start contents of packages.tex
% ICML Recommended, but optional, packages for figures and better typesetting:
\usepackage{microtype}
\usepackage{graphicx}
% \usepackage{tikz}
%\usepackage{subfigure} % Switched to floatrow + subfig
%\usepackage{booktabs} % for professional tables
% END ICML Reccomended

\usepackage{xspace} % need it for the etal/etc/eg macros. CVPR includes, ICML 
% doesn't

% \usepackage[dvipsnames]{xcolor}
\usepackage{amsmath,amssymb,amsfonts}
\usepackage{etoolbox}

\usepackage[thmmarks, amsmath, thref]{ntheorem}
\usepackage{bbm}
\usepackage{bm}

%\usepackage{ulem} % for strikethrough \sout
\usepackage[normalem]{ulem} % for \sout w/o the annoying underline in \emph

\usepackage{times}
\usepackage{epsfig}
\usepackage{graphicx}
% \usepackage{multirow}
% \usepackage{multicol}
\usepackage{xcolor}
% Subfigures
%\usepackage{subfig}
%\usepackage{floatrow}
\usepackage{subcaption}
%\floatsetup[figure]{subcapbesideposition=top}
%\floatsetup[table]{capposition=top}


% \usepackage[linesnumbered,algoruled,boxed,lined]{algorithm2e}
%  \usepackage[linesnumbered,algoruled,noend,noline]{algorithm2e}
% \usepackage[algoruled,noend,noline]{algorithm2e} % a bit shorter with no line numbers


% \usepackage{slashbox}
\usepackage{diagbox}


\usepackage{lipsum}  

\usepackage{threeparttable}
\usepackage[singlelinecheck=true,justification=centering]{caption}
\captionsetup[table]{skip=10pt}
\usepackage{makecell}
\usepackage{multirow}
\usepackage{color,soul}
\usepackage{xcolor}  

% \definecolor{battleshipgrey}{rgb}{0.52, 0.52, 0.51}
\definecolor{dimgray}{rgb}{0.35, 0.35, 0.35}

\usepackage[linesnumbered,algoruled,noend,noline]{algorithm2e}

%%%%% end contents of packages.tex

%%%%% start contents of macros.tex
 %--------------------------------------------
% Abbreviation helpers (e.g., i.e., et al., ...).
% Add a period to the end of an abbreviation unless there's one
% already, then \xspace.
\makeatletter
% \futurelet stores the token that follows \onedot in \@let@token without
% consuming it, then hands control to \@onedot, which inspects that token.
\DeclareRobustCommand\onedot{\futurelet\@let@token\@onedot}
% \def\@onedot{\ifx\@let@token.\else.\null\fi\xspace}
% If the next token is a period, nothing is inserted; otherwise ".\null" is.
% NOTE(review): the \null is presumably there so the inserted period is not
% followed by end-of-sentence spacing -- confirm.
\def\@onedot{\ifx\@let@token.\else.\null\fi\xspace}

% In the macros below the abbreviation is written WITHOUT its final period;
% \onedot supplies it. Only the abbreviation itself is inside \emph.
% These use \def, so an existing macro of the same name is overwritten
% without any error.
\def\eg{\emph{e.g}\onedot} \def\Eg{\emph{E.g}\onedot}
\def\ie{\emph{i.e}\onedot} \def\Ie{\emph{I.e}\onedot}
\def\cf{\emph{cf}\onedot} \def\Cf{\emph{Cf}\onedot}
\def\etc{\emph{etc}\onedot} \def\vs{\emph{vs}\onedot}
% \wrt and \dof are typeset upright (no \emph), unlike the macros above.
\def\wrt{w.r.t\onedot} 
% \def\wrt{with respect to }  % The extra space at the end means \wrt blah would 
% not end up as ``with respect toblah''. 
 
\def\dof{d.o.f\onedot}
% \def\etal{\emph{et al}\onedot}
\def\etal{\emph{et al}\onedot}
% \def\etal{\emph{et al}\onedot}
%
\def\iid{\emph{i.i.d}\onedot}
% \def\pdf{\emph{p.d.f}\onedot}



\makeatother
%--------------------------------------------------

% Prefixes for figure/equation references. Each one already ends with a
% non-breaking space (~), so the reference follows directly, e.g. \FIG\ref{...}.
\newcommand{\FIG}{Fig.~}
\newcommand{\FIGS}{Figs.~}
\newcommand{\EQN}{Eq.~}
% \newcommand{\EQN}{Equation }
\newcommand{\EQNS}{Eqs.~}
% \newcommand{\EQNS}{Equations }

%\newcommand{\SEC}{Sec.~}.  Use \autoref or ~\autoref instead.



% Number sets in blackboard bold. Every macro is wrapped in \ensuremath, so it
% can be used both inside and outside math mode.
% integers and reals
\newcommand{\ZZ}{\ensuremath{\mathbb{Z}}}
\newcommand{\RR}{\ensuremath{\mathbb{R}}}
% Euclidean spaces of fixed dimension
\newcommand{\Rtwo}{\ensuremath{\RR^2}}
\newcommand{\Rthree}{\ensuremath{\RR^3}}
\newcommand{\Rfive}{\ensuremath{\RR^5}}

\newcommand{\Rsix}{\ensuremath{\RR^6}}

% Euclidean spaces of symbolic dimension (n, d, D, k)
\newcommand{\Rn}{\ensuremath{\RR^n}}
\newcommand{\Rd}{\ensuremath{\RR^d}}
\newcommand{\RD}{\ensuremath{\RR^D}}
\newcommand{\Rk}{\ensuremath{\RR^k}}

% 
%positive integers
\newcommand{\Zplus}{\ensuremath{\ZZ^+}}
%positive reals
% \newcommand{\Rplus}{\ensuremath{\RR^+}}

% non-negative reals and strictly positive reals (subscript notation)
\newcommand{\Rnonneg}{\ensuremath{\RR_{\ge0}}}
\newcommand{\Rpos}{\ensuremath{\RR_{>0}}}


% set notation
% \set{...}        : argument in fixed-size braces { ... }
% \tuple{...}      : argument in fixed-size parentheses ( ... )
% \tupleLarge{...} : same as \tuple, but the parentheses use \left/\right
% \set, \tuple and \tupleLarge are wrapped in \ensuremath, so they also work
% outside math mode.
% \newcommand{\set}[1]{\ensuremath{{\left\{#1\right\}}}}
\newcommand{\set}[1]{\ensuremath{{\{#1\}}}}
\newcommand{\tuple}[1]{\ensuremath{{(#1)}}}
\newcommand{\tupleLarge}[1]{\ensuremath{{\left(#1\right)}}}
% \argmin{x} / \argmax{x}: the mandatory argument becomes the subscript of the
% operator. The starred \operatorname* is used, i.e. the subscript is handled
% like a limit.
\newcommand{\argmin}[1]{\ensuremath{\operatorname*{arg\,min}_{#1}}}
\newcommand{\argmax}[1]{\ensuremath{\operatorname*{arg\,max}_{#1}}}



% Inner product <a,b> with auto-sized angle brackets. Math mode only
% (no \ensuremath wrapper).
\newcommand{\InnerProduct}[2]{\left\langle #1,#2 \right\rangle}

% \norm{x}: auto-sized double bars around x.
% \sign{x}: upright "sign" followed by x in auto-sized parentheses.
\newcommand{\norm}[1]{{{\left\|#1\right\|}}}
\newcommand{\sign}[1]{{\mathrm{sign}\left(#1\right)}}

% Symbols for the l2 / l1 norms; \ellTwoNorm{x} is \norm{x} with an l2 subscript.
\newcommand{\ellTwo}{\ell_2}
\newcommand{\ellTwoNorm}[1]{\norm{#1}_{\ellTwo}}
\newcommand{\ellOne}{\ell_1}




% \MATRIX[<column spec>]{<array body>}: an array enclosed in auto-sized square
% brackets. The optional column specification defaults to a long run of
% centered (c) columns, so it can usually be omitted.
\newcommand{\MATRIX}[2][cccccccccccccccccccc]{\left[
 \begin{array}{#1}
 #2
 \end{array}
\right]}

% \Mod{n}: typesets " (mod n)" with explicit spaces before the parenthesis and
% after "mod".
\newcommand{\Mod}[1]{\ (\mathrm{mod}\ #1)}



% homogeneous coordinates
% (\bx itself is defined further down via \bmdefine; this is fine because the
% body is only expanded when \bxh is used.)
\newcommand{\bxh}{\widetilde{\bx}}

% Colored-text helpers. The surrounding brace groups keep the color change
% local to the argument.
\newcommand{\RED}[1]{{{\color{red}{#1}}}}  
\newcommand{\BLUE}[1]{{{\color{blue}{#1}}}}
\newcommand{\GREEN}[1]{{{\color{green}{#1}}}}
\definecolor{orcgreen}{RGB}{100,150,100}
\newcommand{\ORCGREEN}[1]{{{\color{orcgreen}{#1}}}}
\newcommand{\MAGENTA}[1]{{{\color{magenta}{#1}}}}  

% NOTE: xcolor already provides a base color named "brown"; this line
% replaces it with the RGB value below.
\definecolor{brown}{RGB}{165,42,42}
\newcommand{\BROWN}[1]{{{\color{brown}{#1}}}}



% Per-author review annotations: bold, colored, prefixed with the author's name.
\newcommand{\FREIFELD}[1]{\textbf{\MAGENTA{[FREIFELD says: #1]}}}
\newcommand{\OR}[1]{\textbf{\ORCGREEN{[OR says: #1]}}}
% Make your own

% Draft markers:
%   \TBD            -> red "[TBD]"
%   \TODO{text}     -> red "[TODO: text]"
%   \REPLACE{a}{b}  -> "REPLACE. a WITH. b" (labels in red, a and b in blue)
\newcommand{\TBD}{\RED{[TBD]}}
\newcommand{\TODO}[1]{\RED{[TODO: #1]}}
\newcommand{\REPLACE}[2]{\RED{REPLACE. } \BLUE{#1} \RED{WITH. } \BLUE{#2}}



% Bold math letters: \ba ... \bz and \bA ... \bZ are the bold versions of the
% corresponding Latin letters, pre-built with \bmdefine from the bm package.
% The list was generated by the following snippet.
% In python: print_iterable(['\\bmdefine\\b{0}'.format(x)+'{'+x+'}' for x in 
% string.ascii_letters])

\bmdefine\ba{a}
\bmdefine\bb{b}
\bmdefine\bc{c}
\bmdefine\bd{d}
\bmdefine\be{e}
% \bmdefine\bf{f}  # Clashes with a standard command
\bmdefine\boldf{f}
\bmdefine\bg{g}
\bmdefine\bh{h}
\bmdefine\bi{i}
\bmdefine\bj{j}
\bmdefine\bk{k}
\bmdefine\bl{l}
% NOTE(review): the next line redefines \bm, which is the bm package's own
% bold-math command. After this point \bm means "bold m", so \bm{...} can no
% longer be used to embolden an arbitrary symbol. This is the same kind of
% clash that was avoided for \bf above; consider renaming to \boldm once it is
% confirmed that nothing in the paper relies on \bm meaning bold m.
\bmdefine\bm{m}
\bmdefine\bn{n}
\bmdefine\bo{o}
\bmdefine\bp{p}
\bmdefine\bq{q}
\bmdefine\br{r}
\bmdefine\bs{s}
\bmdefine\bt{t}
\bmdefine\bu{u}
\bmdefine\bv{v}
\bmdefine\bw{w}
\bmdefine\bx{x}
\bmdefine\by{y}
\bmdefine\bz{z}
\bmdefine\bA{A}
\bmdefine\bB{B}
\bmdefine\bC{C}
\bmdefine\bD{D}
\bmdefine\bE{E}
\bmdefine\bF{F}
\bmdefine\bG{G}
\bmdefine\bH{H}
\bmdefine\bI{I}
\bmdefine\bJ{J}
\bmdefine\bK{K}
\bmdefine\bL{L}
\bmdefine\bM{M}
\bmdefine\bN{N}
\bmdefine\bO{O}
\bmdefine\bP{P}
\bmdefine\bQ{Q}
\bmdefine\bR{R}
\bmdefine\bS{S}
\bmdefine\bT{T}
\bmdefine\bU{U}
\bmdefine\bV{V}
\bmdefine\bW{W}
\bmdefine\bX{X}
\bmdefine\bY{Y}
\bmdefine\bZ{Z}


% Bold Greek letters, named \b<letter> (e.g. \bmu is bold \mu).
% Exception: bold \eta is \boldeta, not \beta, because \bbeta/\beta-style
% names would otherwise collide (\bbeta below is bold \beta).
\bmdefine\balpha{\alpha}
\bmdefine\bbeta{\beta}
\bmdefine\bgamma{\gamma}

\bmdefine\bdelta{\delta}
\bmdefine\btheta{\theta}
\bmdefine\blambda{\lambda}
\bmdefine\bphi{\phi}
\bmdefine\bxi{\xi}
\bmdefine\bzeta{\zeta}
\bmdefine\boldeta{\eta}
\bmdefine\bpi{\pi}

\bmdefine\bmu{\mu}
\bmdefine\brho{\rho}
\bmdefine\bomega{\omega}
\bmdefine\bOmega{\Omega}
\bmdefine\bPi{\Pi}

\bmdefine\bvarepsilon{\varepsilon}
\bmdefine\bepsilon{\epsilon}


\bmdefine\bDelta{\Delta}

\bmdefine\bTheta{\Theta}

\bmdefine\bsigma{\sigma}
\bmdefine\bSigma{\Sigma}
\bmdefine\bPsi{\Psi}
\bmdefine\bLambda{\Lambda}



% Bold constants: 0, 1 and the infinity symbol.
\bmdefine\bzero{0}
\bmdefine\bone{1}
\bmdefine\binfty{\infty}

% Indicator-function symbol: a "1" in the \mathbbm font (bbm package).
 \newcommand{\indicator}{\mathbbm{1}}


% Calligraphic capitals: \Acal ... \Zcal expand to \mathcal{A} ... \mathcal{Z}.
% They are math-mode only (no \ensuremath wrapper). The list was generated by
% the following snippet.
% In python 
% print_iterable(['\\newcommand{\\'+'{0}cal'.format(x)+'}{\\mathcal{'+x+'}}' 
% for x in string.ascii_uppercase])
\newcommand{\Acal}{\mathcal{A}}
\newcommand{\Bcal}{\mathcal{B}}
\newcommand{\Ccal}{\mathcal{C}}
\newcommand{\Dcal}{\mathcal{D}}
\newcommand{\Ecal}{\mathcal{E}}
\newcommand{\Fcal}{\mathcal{F}}
\newcommand{\Gcal}{\mathcal{G}}
\newcommand{\Hcal}{\mathcal{H}}
\newcommand{\Ical}{\mathcal{I}}
\newcommand{\Jcal}{\mathcal{J}}
\newcommand{\Kcal}{\mathcal{K}}
\newcommand{\Lcal}{\mathcal{L}}
\newcommand{\Mcal}{\mathcal{M}}
\newcommand{\Ncal}{\mathcal{N}}
\newcommand{\Ocal}{\mathcal{O}}
\newcommand{\Pcal}{\mathcal{P}}
\newcommand{\Qcal}{\mathcal{Q}}
\newcommand{\Rcal}{\mathcal{R}}
\newcommand{\Scal}{\mathcal{S}}
\newcommand{\Tcal}{\mathcal{T}}
\newcommand{\Ucal}{\mathcal{U}}
\newcommand{\Vcal}{\mathcal{V}}
\newcommand{\Wcal}{\mathcal{W}}
\newcommand{\Xcal}{\mathcal{X}}
\newcommand{\Ycal}{\mathcal{Y}}
\newcommand{\Zcal}{\mathcal{Z}}


% Upright labels "DP" and "Dir" for use in math mode.
% NOTE(review): presumably Dirichlet process and Dirichlet distribution -- confirm.
 \newcommand{\DP}{\mathrm{DP}}
 \newcommand{\DIR}{\mathrm{Dir}}

%%%%% end contents of macros.tex


\usepackage{booktabs}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%5
% Oren says:
% Load hyperref as the last package. You will live longer (and will have fewer warnings)
% \usepackage[colorlinks=true,    % declare we want color links
% 	    bookmarksopen=true,  % expanded bookmarks
% 	    pagebackref=false,        % or backref
% 	    linkcolor=blue,
% 	    urlcolor  = blue
% 	    ]{hyperref}
% \usepackage[backref, colorlinks,citecolor=blue]{hyperref}
% \usepackage[backref]{hyperref}
% \hypersetup{urlcolor=blue, colorlinks=true} 
% hyperref configuration.
% NOTE(review): no active \usepackage{hyperref} appears in this preamble (all
% such lines above are commented out); presumably the uai2022 class loads
% hyperref itself -- confirm before changing the package order.
\hypersetup{
  colorlinks   = true, % Colours links instead of ugly boxes
  urlcolor     = blue, % Colour for external hyperlinks
  linkcolor    = blue, % Colour of internal links
  citecolor   = blue, % Colour of citations
}

% \hypersetup{citecolor=blue}

% \usepackage{capt-of}
% \hypersetup{citecolor=blue}   
\usepackage{ellipsis} % The comment above regarding hyperref is more of a guideline than a rule... 
                      %``Note that ellipsis must be loaded after hyperref. 
                      %(The ellipsis documentation doesn't mention this, but the hyperref README does.)''

 \hypersetup{final}  % The things you learn the hard way: when draft mode is on (in \documentclass)
                     % then the links created by hyperref are turned off, even if draft=false and final=true are passed to hyperref.
                     % The \hypersetup{final} makes sure we have links active even in draft mode.
%   \hypersetup{draft}
               
% To have short pointers for sections and subsections. Oren Freifeld
% With these, \autoref on any sectioning level prints the section sign
% followed by the number.
\renewcommand{\sectionautorefname}{\S} 
\renewcommand{\subsectionautorefname}{\S} 
\renewcommand{\subsubsectionautorefname}{\S} 

% \autoref name for algorithms, and a new one for a counter called "Example".
% (\renewcommand requires \algorithmautorefname to exist already, whereas
% \newcommand requires \Exampleautorefname to be undefined.)
\renewcommand{\algorithmautorefname}{Algorithm}
% \renewcommand{\algorithmautorefname}{Alg.~}
\newcommand{\Exampleautorefname}{Example}



\title{Variational- and Metric-based Deep Latent Space \\ for Out-of-Distribution Detection}


% The standard author block has changed for UAI 2022 to provide
% more space for long author lists and allow for complex affiliations
%
% All author information is automatically removed by the class for the
% anonymous submission version of your paper, so you can already add your
% information below.
%
% Add authors
\author[1]{\href{mailto:dinari@post.bgu.ac.il?Subject=Your UAI 2022 paper}{Or Dinari}{}}
\author[1]{\href{mailto:orenfr@cs.bgu.ac.il?Subject=Your UAI 2022 paper}{Oren Freifeld}{}}
% Add affiliations after the authors
\affil[1]{%
    The Department of Computer Science,
    Ben-Gurion University of the Negev\\
    Be'er Sheva, Israel
}



\begin{document}


\maketitle

\begin{abstract}
One popular deep-learning approach for the task of Out-Of-Distribution (OOD) detection is based on thresholding the values of per-class Gaussian likelihood of deep features. However, two issues arise with that approach: first, the distributions are often far from being Gaussian; second, many OOD data points 
fall within the effective support of the known classes' Gaussians. Thus, either way it is hard to find a good threshold. In contrast, our proposed solution for OOD detection is based on a new latent space where: 1) each known class is well captured by a nearly-isotropic Gaussian; 2) those Gaussians are far from each other and from the origin of the space (together, these properties effectively leave the area around the origin free for OOD data). Concretely, given a (possibly-trained) backbone deep net of choice, we use it to train a conditional variational model via a Kullback-Leibler loss, a triplet loss, and a new distancing loss that pushes classes away from each other.  During inference, the class-dependent log-likelihood values of a deep feature ensemble of the test point are also weighted based on reconstruction errors, further improving the decision rule. Experiments on popular benchmarks show that our method yields state-of-the-art results, a feat achieved despite the fact that, unlike some competitors, we make no use of OOD data for training or hyperparameter tuning. Our code is available at~\url{https://github.com/BGU-CS-VIL/vmdls}.
\end{abstract}

\begin{figure}[!t]
    % \centering
    \subcaptionbox{Known classes: MNIST. OOD: Omniglot.}%[.45\linewidth]    
    {\includegraphics[height=4.5cm,trim=1.5cm 1.1cm 1.5cm 1.2cm,clip]{mnist_tsne.png}}
    \newline
    \subcaptionbox{Known classes: CIFAR-10. OOD: ImageNet-resize.}%[.45\linewidth]
    {\includegraphics[height=4.5cm,trim=1.5cm 1.1cm 0cm 1.2cm,clip]{cifar_tsne.png}}
\captionsetup{justification=justified, singlelinecheck=false}
    \caption{The t-SNE visualizations of feature ensembles, obtained using the proposed method, of test data in two different experiments (see text for details). The known classes are well separated from each other and from 
    the OOD data.  
}
     \label{Fig:intro}
\end{figure}



\section{Introduction}\label{Sec:Intro}

Out-of-Distribution (OOD) detection is the following classification task. 
During training, there are labeled data points where each label is associated with one out of $C$ classes.
The latter are referred to as \emph{known classes} while the points from all the $C$ classes are collectively called 
\emph{in-distribution train data}. 
At test time, in addition to previously-unseen (and unlabeled) points from the $C$ known classes 
-- such points are called \emph{in-distribution test data} --
there are also (unlabeled) points that belong to none of 
the known classes -- such points are called \emph{OOD data}. 
OOD detection is then the binary classification between in-distribution (test) data and
OOD data. 
Note that in general, 
unlike the case of in-distribution data, the availability of OOD data during the training cannot be assumed.


In Deep Learning (DL)~\citep{Lecun:Nature:2015:DL}, OOD detection is especially important: \eg, when Deep Neural Nets (DNNs) misclassify data, \emph{they often do this with high confidence}~\citep{Goodfellow:arxiv:2014:explaining,Nguyen:TPAMI:2015:deep,Hendrycks:ICLR:2016:baseline,Moosavi:CVPR:2017:universal}. 
 As such mistakes often stem from OOD data, it is beneficial to detect the latter.

Attempts to solve classification tasks, OOD detection included, 
are often based on trying to learn a low-dimensional latent space
wherein classes are well separated. 
Many existing OOD-detection methods try to do so using 
metric learning (typically based on a contrastive loss) or a variational loss.
Then, at test time, they use a decision rule based on thresholding the values of 
per-class Gaussian likelihoods. 
However,  
the problem with that Gaussian-based approach, 
at least when used naively, is
twofold: 1) the distribution of each known class is usually 
not well captured by a Gaussian; 2) many of the OOD points tend to fall within the effective support of one or more of the known classes' Gaussians. 
Consequently, %with either the softmax or the Gaussian approach, 
finding a good threshold is difficult. 


Our proposed method eliminates these issues (see~\autoref{Fig:intro}), 
a feat achieved partly by leveraging in a synergistic way some of the aforementioned ideas 
and partly by introducing new ideas. 

For example, part of our method is based on using both a variational term and a metric loss. 
The variational term is a class-conditioned Gaussian Kullback-Leibler (KL) divergence loss~\citep{Sun:CVPR:2020:conditional}.
However, rather than using a contrastive loss, our metric loss consists of two terms: a triplet loss~\citep{Schroff:CVPR:2015:facenet} and a new loss called \emph{distancing}.
The distancing loss helps achieve a much better inter-class separation than what the triplet loss can do by itself. 
The key insight in this context is that without the distancing loss, the combination of the variational and triplet losses, which may work well in other classification tasks, 
 tends to be less effective here (in OOD detection)
 as each loss pushes in a different direction, leading to insufficient separation and much of the OOD data ending up within the known classes' support.
 Adding the distancing term solves this, implicitly helping to form, between the known classes, an empty area wherein most of the OOD data ends up.   
%

Another key difference between the learning stage of our method 
and others is that we measure our metric loss on the output of a \emph{stochastic generator},
and not, as is usually done with such losses, on deterministic representations.

Our overall loss results in a new latent space, coined a \emph{Variational- and Metric-based Deep Latent Space (VMDLS)}, wherein:  
1) the distribution of each known class is (a nearly-isotropic) Gaussian;
2) these Gaussians are well separated from each other;
3) the in-distribution (train and test) points are well separated from the OOD points.


After the DNN training and before 
the inference, we partially follow~\citep{Lee:NIPS:2018:simple}
in that we extract a deep feature ensemble from our trained model for each of the training points,
and fit, using a new implicit scheme, a (usually-anisotropic) Gaussian to the feature ensembles of each class. 
During inference, we compute for the feature ensemble of each test point its negative log-likelihood (ll) according to each of those Gaussians. 
Then, our method has two versions. 
In the basic one, denoted by \textbf{VMDLS}$_\bb$, we compare the best negative ll value against an application-specific threshold.
As we will show,  \textbf{VMDLS}$_\bb$ is already a good OOD method --
it has state-of-the-art (SOTA) results or close to it in terms of the F1 score (which takes into account both the classifier accuracy and OOD detection) --
and achieves fairly-decent results in binary OOD detection. 
However, a better version, denoted by \textbf{VMDLS}, is obtained
{with the help of
a reconstruction-based weighting scheme (see \autoref{Sec:Method:Subsec:Weighting}).} 
\textbf{VMDLS} yields results that are uniformly better than \textbf{VMDLS}$_\bb$ and sets new SOTA results on several popular benchmarks in OOD detection (at least among methods that do not need  OOD data
for training/tuning).  
%
 In~\autoref{Fig:intro} we show, using t-SNE visualizations~\citep{Van:JMLR:2008:tsne},
 example results of the latent space associated with the learned feature ensemble in two different experiments: in one, the in-distribution data is MNIST~\citep{Lecun:1998:mnist} and the OOD data is Omniglot~\citep{Lake:Science:2015human}; in the second, the in-distribution data is CIFAR-10~\citep{Krizhevsky:Citeseer:2009:Cifar} and the OOD data is ImageNet-resize~\citep{Liang:ICLR:2018:ODIN}. 
 All of the data shown in~\autoref{Fig:intro} is test data. 
 

\textbf{To summarize, our main contributions are: }
1) A new learning scheme of a novel latent space, where classes are well separated
from each other and from OOD data based on a stochastic generator
as well as a synergy between KL and metric losses via the help of a new metric-loss term; 
2) A new inference scheme using that latent space, together with a reconstruction
of low-level features, 
for OOD detection. 

Together, these contributions give rise to a new end-to-end solution for OOD detection that
yields SOTA results in multiple benchmarks and, importantly and unlike some existing methods, \emph{requires no OOD data for training/tuning}. 

Finally, all our experiments are reproducible, as the reader can verify by running our publicly-available code.


\section{Related Work}\label{Sec:Related}
\cite{Hendrycks:ICLR:2016:baseline} proposed a popular OOD-detection baseline
that uses a threshold on the maximal SoftMax score of a classifier to determine whether
a point is OOD or not. 
ODIN~\citep{Liang:ICLR:2018:ODIN} is a method that is applied to a previously-trained DNN and, by using small perturbations on the input and temperature scaling,
calibrates the classifier's SoftMax scores,
improving the separation between in-distribution and OOD data.
\cite{Lee:NIPS:2018:simple} proposed a method, called Mahalanobis, that extracts a feature ensemble from a trained DNN for all the known classes, and then fits class-conditioned Gaussian distributions (an idea we use too). Given a test point, they compute the Mahalanobis distance according to each of the Gaussians. Next, the decision (OOD or not) is made based on that distance, with the aid of a logistic-regression detector that they tune on a small validation set, which, in addition 
to in-distribution points, also \emph{includes  some OOD data}.   
%
\cite{Masana:BMVC:2018:metric} proposed using metric learning for OOD detection. Specifically, 
they use a contrastive loss~\citep{Chen:ICML:2020:contrastive} with OOD mining, 
incorporating an OOD set (different from the set they use at testing) during training.
%
In~\citep{Winkens:Arxiv:2020contrastive},
 a contrastive loss is used in several self-supervision tasks,
together with a SoftMax-based classifier, for learning meaningful latent representations.
In CSI~\citep{Tack:NIPS:2020:csi} a contrastive loss is used as well, with shifting instances.
Two other smart methods that use 
OOD data during training are Outlier Exposure (OE)~\citep{Hendrycks:ICLR:2019:deep}
and the energy-based model in~\citep{Liu:NIPS:2020:energy}.

More generally, note that the dependency of the elegant methods in~\citep{Hadsell:CVPR:2006dimensionality,Hendrycks:ICLR:2019:deep,Lee:NIPS:2018:simple,Liang:ICLR:2018:ODIN,Liu:NIPS:2020:energy}
on OOD data for either tuning or training is a limitation. Moreover, that dependency also makes a comparison between these methods and ones that do not rely on OOD data (like ours) unfair to the latter.

\cite{Hsu:CVPR:2020:generalized} proposed Generalized ODIN,
an ODIN variant that requires no hyperparameter tuning on OOD data. 
Recently, \cite{Zaeemzadeh:CVPR:2021:out} proposed using 1-dimensional subspaces for OOD detection; similarly to Generalized ODIN, 
their method requires no tuning on OOD data. 
%
\cite{Yoshihashi:CVPR:2019:classification} proposed CROSR,
where instead of using only the DNN's output,
they also used a latent representation extracted from the DNN. They have also introduced DHRNet, a DNN which assists the learning of meaningful representations via reconstruction.
%

While methods that rely on trained classifiers (\eg,~\citep{Liang:ICLR:2018:ODIN,Lee:NIPS:2018:simple,Hendrycks:ICLR:2016:baseline}) have proven to be successful, they are,
as was noted in~\citep{Lee:ICLR:2018:training}, limited by the trained classifier.
In contrast, rather than relying on such trained DNNs, our method is based on a novel training scheme.
% 
Our method may be viewed as belonging to the hybrid discriminative-generative camp, an approach adopted by several recent works such 
as OpenHybrid~\citep{Zhang:ECCV:2020:hybrid},
OSAD~\citep{Shao:ECCV:2020:open} (which is used for adversarial defense) and SSD~\citep{Sehwag:ICLR:2020:ssd} (which utilizes unlabeled in-distribution samples).
 
A reconstruction-based method was proposed by~\cite{Perera:CVPR:2020generative} who trained a generative model for the known classes, 
and augmented the input with representations obtained from that model for training a classifier. They also used self-supervision to learn more informative features. 
\cite{Chen:ECCV:2020:learning} used Reciprocal Point Learning (RPL) to learn compact and discriminative representations.
More relevant to our work is a method called CGDL~\citep{Sun:CVPR:2020:conditional}
which trains a Variational Auto Encoder (VAE) with a class-conditioned Gaussian distribution. Particularly, their model learns a different Gaussian for each class and uses a ladder architecture to extract high-level features,
a SoftMax-based classifier, and a detector of unknown classes in the learned latent space. From their work we have adopted (and adapted) a single idea and that is the usage of class-conditioned Gaussians.
Another recent work employing a conditional VAE is CVAECapOSR~\citep{Guo:ICCV:2021:conditional} which is based on fitting to each class a predefined Gaussian.
One key difference between our usage of a Conditional VAE and~\citep{Sun:CVPR:2020:conditional,Guo:ICCV:2021:conditional} is that while they use predefined target Gaussians~\citep{Guo:ICCV:2021:conditional},
    or explicitly learn the target Gaussians~\citep{Sun:CVPR:2020:conditional}, we learn them implicitly by the metric losses. This helps us achieve better separation between the classes.



\section{The Proposed Method}\label{Sec:Method} 
We design our model to have a deep latent space with two desiderata: %desired properties in mind:
1) That the empirical distribution of each class will be well approximated by a Gaussian (this simplifies inference and implies that the level sets of each class-dependent empirical distribution are nearly convex). 
2) That each such Gaussian will be far from the Gaussians of all of the other classes. 
Once these properties are achieved, a likelihood-based decision whether a point belongs to a known class or not is easily 
made, as we explain below. 
% \subsection{Model} \label{section:model}

\begin{figure*}[t]
    \centering
        \includegraphics[width=\linewidth,trim=0cm .23cm 0cm .53cm,clip]{network.pdf} 
    \caption{The proposed model with a ResNet backbone.
%     Solid lines (green and red) are used in the training phase;
%     green lines (solid and dashed) are used in the testing phase.
%     Solid green lines are used during both training and testing.
%     Solid red lines are used only during training. 
%     Dashed lines are used only during testing. 
    }
     \label{Fig:DNN}
\end{figure*}


Let $D$ be the dimension of the  data,
let $\bx\in \RD$ denote a data point, 
and let $d<D$ be 
the dimension of the sought-after latent space. 
Our model is an encoder (an $\RD\to\Rd$ function) that consists of a backbone DNN denoted by $\bg:\RD\to\Rk$ (where $d<k<D$) and a stochastic generator, $\boldf:\Rk\to\Rd$, that generates $d$-dimensional samples and whose functionality is closely related to the sampling step in a VAE~\citep{Kingma:2014:AutoEncodingVB}. 
% let $\bg(\bx)\in \RR^{k}$ (\ie, $\bg:\RD\to\Rk$) denote the corresponding backbone's output, and let $d<D$ denote the dimension of the latent space. 
More concretely, let  
\begin{align}
   &\boldf(\bg(\bx)) = \bmu(\bg(\bx))+(\bsigma(\bg(\bx)) \odot \bepsilon) \in \Rd
\nonumber \\ 
%  \quad
 &\bmu =(\mu_1,\ldots,\mu_d):\Rk\to\Rd\,%,
%     \quad 
\nonumber \\ 
% &
% \RED{\bsigma\circ \bg:\RD\to\Rd_{+}\,,} \nonumber\\
&\bsigma=(\sigma_1,\ldots,\sigma_d):\Rk\to\Rd_{>0}\,%,
    \nonumber \\ 
    & \bepsilon\sim\Ncal(\bzero_{d\times 1},\bI_{d\times d})
\end{align}
where
% and
$\odot$ is the Hadamard (element-wise) product.
Each of the  functions $\bmu$ and $\bsigma$ 
is defined via its own fully-connected layer whose input is $\bg(\bx)$ (the positivity of $\bsigma$ is ensured by exponentiating the output of its layer). 
A difference from a standard VAE 
% here, $\boldf(\bg(\bx))$ is in $\Rd$, not $\RD$; 
is that here $\boldf(\bg(\bx))\in\Rd$ is never transformed back to $\RD$;  
\ie, there is no ``decoding'' as we do not try to reconstruct
$\bx$. The role of the latent space will become apparent when we discuss our loss function. 
As for $\bg$, in principle any backbone could be used within our method. In our experiments
we explored several types: ResNet18~\citep{He:CVPR:2016deep};
ResNet34~\citep{He:CVPR:2016deep}; 
DenseNet-BC100~\citep{Huang:2017:densely}; 
WideResNet28~\citep{Zagoruyko:arxiv:2016:wide}; VGG~\citep{simonyan:ICLR:2015:VGG}; a simple Convolutional Neural Net (CNN).
The
values of $k$ and $d$ depend on the backbone and dataset; for the concrete values used in our experiments see our 
Supplemental Material (\textbf{SupMat}).  
The model's architecture (with a ResNet backbone) is shown in~\autoref{Fig:DNN},
where (as we explain in the next sections) the red lines are used only during training, the solid blue lines are used during both training and testing, and the dashed blue lines are used only during testing. 




\subsection{Training}

Recall that $C$ is the number of known classes. Let $N$ be the number of training points and
let $y_i\in\set{1,\ldots,C}$ be the class label of a training point $\bx_i\in \RD$.   
We split the training data
into batches using the following standard mining strategy.
For each batch we choose  $B_C$ classes at random, where
$B_C\in\set{2,\ldots, C}$ (\ie, there must be representatives from at least two classes in each batch), 
and then, from each such randomly-chosen class, we choose $B_N$ examples at random. These examples constitute the batch. 
The values of $B_C$ and $B_N$ depend on both the backbone and the dataset
(the concrete values used in our experiments appear in the \textbf{SupMat}).
%
Our overall loss combines three loss types. 
The first loss is the popular triplet loss~\citep{Schroff:CVPR:2015:facenet} 
which is based on three input examples (from the same batch) at a time: 
an anchor example, $\bx_i^a$;
a positive example, $\bx_i^p$;
a negative example, $\bx_i^n$. 
This loss aims to 
make (in the latent space) the squared $\ellTwo$ distance between 
$\bx_i^a$
and 
$\bx_i^p$ smaller than the squared $\ellTwo$ distance between 
$\bx_i^a$
and 
$\bx_i^n$
by, at least, a user-defined margin $M_t$:
\begin{align}
  \Lcal_t(\bx_i^a,\bx_i^p,\bx_i^n)
    \triangleq&\max(0,M_t+\norm{\boldf(\bg(\bx_i^a))-\boldf(\bg(\bx_i^p))}^2
      \nonumber \\ & -
                \norm{\boldf(\bg(\bx_i^a))-\boldf(\bg(\bx_i^n))}^2) \, .
\end{align}
The anchor and the positive examples are chosen from the same class,
while the negative example is from some other class. 
The rationale is to push examples of different classes away from each other while drawing same-class examples closer to each other. While this is desired in our context as well, 
it turns out that it is not enough for pushing the classes \emph{sufficiently far} from each other. In other words, we want to push the anchor away from the negative examples independently of the current distance between $\bx_i^a$ and $\bx_i^p$.
We thus propose adding a new simple metric loss term, called \emph{distancing}. It uses the same $\bx_i^a$ and $\bx_i^n$ from the nominal triplet used in $\Lcal_t$ (and ignores $\bx_i^p$) and has 
its own user-defined margin parameter, $M_d$:
\begin{align}
\hspace{-2.5mm}
\Lcal_d(\bx_i^a,\bx_i^n)\hspace{-0.6mm}\triangleq\hspace{-0.6mm}\max(0,M_d
\hspace{-0.8mm}-\hspace{-0.8mm}
\norm{\boldf(\bg(\bx_i^a))
\hspace{-0.7mm}-\hspace{-0.7mm}
\boldf(\bg(\bx_i^n))}^2)\hspace{-0.2mm}\,\hspace{-0.2mm}.\hspace{-0.4mm}
\end{align}
The idea here is to further push the anchor away from the negative examples, without affecting its distances from the positive ones. 
As an aside, the addition of the $M_d$ hyperparameter does not increase the number of knobs to tweak; 
as it turns out, the presence of the $\Lcal_d$ term lets us fix the value of $M_t$ (the margin from the standard triplet loss)
to $M_t=0.1$ in all our experiments on all the datasets (changing the value 
of $M_t$ is equivalent to scaling all the other hyperparameters
accordingly). As for the value of $M_d$, we usually use $M_d=d$. 
Lastly, our combined \emph{metric loss} for triplet $(\bx_i^a,\bx_i^p,\bx_i^n)$,
\emph{measured on the output of the stochastic generator,} $\boldf$, is:
\begin{align}
    &\Lcal_m(\bx_i^a,\bx_i^p,\bx_i^n)= \Lcal_t(\bx_i^a,\bx_i^p,\bx_i^n)+
    \Lcal_d(\bx_i^a,\bx_i^n) \, . 
\end{align}
The third loss term is a Gaussian class-conditioned KL divergence loss,
proposed by~\citep{Sun:CVPR:2020:conditional} (based on~\citep{Kingma:2014:AutoEncodingVB}).
Its purpose is to make the intra-class distribution in the latent space as close as possible
to an isotropic Gaussian. 
The KL-divergence loss (\ie, the negated KL divergence) for $\bx_i$ is 
\begin{align}
    \Lcal_\mathrm{KL}(\bx_i) &= -D_{\mathrm{KL}}(q(\cdot|\bg(\bx_i))||p(\cdot|y_i)) %\\ \nonumber
%      &= \sum_{j=1}^d -\log\left(\frac{s}{\sigma_j(\bg(\bx))}\right)-\frac{\sigma_j(\bg(\bx))+(\bmu_j(\bg(\bx))-(m_{y_i})_j)^2}{2s^2}+0.5 \, .
\end{align}
where 
{$q(\cdot|\bg(\bx_i))$} is a $d$-dimensional Gaussian probability density function
(pdf) with a  mean vector $\bmu(\bg(\bx_i))$ and a diagonal covariance matrix
whose $(j,j)$ entry is $\sigma_j^2(\bg(\bx_i))$
while 
{$p(\cdot|y_i)$} is an isotropic $d$-dimensional Gaussian pdf, associated with class $y_i$, with a mean vector $\bm(y_i)=(m_1(y_i),\ldots,m_d(y_i))$ and 
variance $s^2$ (we use $s^2=0.1$ in all classes and in all our experiments; changing $s^2$ to another constant 
will only change the scaling of the other hyper-parameters). 
We now explain how $(\bm(c))_{c=1}^C$ are defined. 
While in~\citep{Guo:ICCV:2021:conditional} the means are predefined, and in~\citep{Sun:CVPR:2020:conditional} the means were learned at the same time with the rest of the model, we 
{compute them} based on the previous epoch. That is, at epoch $t$, for each class $c$ we set $\bm(c) = \frac{1}{N_c}\sum_{\bx_i:y_i=c}\boldf_{t-1}(\bg(\bx_i))$,
where  $N_c=|\set{i:y_i=c}|$ is the number of training points in class $c$, 
and $\boldf_{t-1}(\bg(\bx_i))$ is the previous epoch's output for $\bx_i$. 

The reason for using the fixed results from the previous epoch 
is that since the \textit{distancing loss} pushes the classes away from each other, learning the target means from the previous epoch gradually pushes the target Gaussians away from each other,
and does so without the added complexity of learning them separately (as was done in~\citep{Sun:CVPR:2020:conditional}).
This encourages the classes 
to have Gaussian distributions of similar shape and (small) size in the latent space,
simplifying learning and inference. 
%\REPLACE{It follows}
{Based on the above, it can be shown (\textbf{SupMat})} that 
$\Lcal_\mathrm{KL}(\bx_i)$ equals
\begin{align}
\hspace{-2.6mm}
 \sum\limits_{j=1}^d 
  \hspace{-0.4mm}-\hspace{-0.4mm}\log\tfrac{s}{\sigma_j(\bg(\bx_i))}
  \hspace{-0.5mm}-\hspace{-0.5mm}\tfrac{\sigma_j^2(\bg(\bx_i))+(\mu_j(\bg(\bx_i))-m_j(y_i))^2}{2s^2}
  \hspace{-0.6mm}+\hspace{-0.6mm}\tfrac{1}{2}\, \hspace{-.4mm}
 . \hspace{-.4mm}
\end{align}
Lastly, $\Lcal(\bx_i)$, the overall loss of $\bx_i$,
and $\Lcal((\bx_i)_{i=1}^N)$, the loss across the entire training data,
are 
\begin{align}
    \Lcal(\bx_i) = 
    &\Lcal_{\mathrm{KL}}(\bx_i)+\tfrac{1}{|\Tcal(\bx_i)|}\sum\limits_{\bx_i^p,\bx_i^n\in \Tcal(\bx_i)}\Lcal_m(\bx_i,\bx_i^p,\bx_i^n)
    \nonumber \\ 
    & \text{ and } \Lcal((\bx_i)_{i=1}^N)= \tfrac{1}{N}\sum\nolimits_{i=1}^N \Lcal(\bx_i)
\end{align}
where $\Tcal(\bx_i)$ consists of all of the possible triplets for $\bx_i$ in the batch. 

At this point, the reader may wonder about the role of the stochastic generator
as seemingly the metric loss can be measured  
on the deterministic $\bmu(\bg(\cdot))$ instead of the stochastic 
$\boldf(\bg(\cdot))$. However, empirically, 
the deterministic option rarely works well (and even then it is no better
than the stochastic one) while in most cases it led to unstable optimization
and poor results. We did not encounter these issues
with the stochastic version. A plausible theoretical explanation
(similar to one used in the classical VAE)
for this phenomenon is that the sampling
leads, during the back propagation, to unbiased estimates of the true gradient. 

Importantly, the effect of the overall loss is not only creating small isotropic
Gaussians that are far from each other but also creating a large
empty area between them (for an intuitive explanation, see \textbf{SupMat}).
Empirically, this is where the OOD data ends up during test time. 



\textbf{Learning $C$ Anisotropic Gaussians over Deep Feature Ensembles.} 
After the training,
% Once the model is trained, 
  following an idea from~\citep{Lee:NIPS:2018:simple}, we run a forward pass on the training set (with no augmentations) and for each $\bx_i$ collect a feature ensemble, $\bt_i$:
\begin{align}
\hspace{-0.25cm}
 \bt_i \triangleq \bt(\bx_i)= (\bt^1(\bx_i),\bt^2(\bx_i),\bt^3(\bx_i), \bg(\bx_i), \bmu(\bg(\bx_i)))
 \label{Eqn:Ensemble}
\end{align}
where $(\bt^1,\bt^2,\bt^3)$ are the outputs (after performing average pooling) of 3 different layers from the backbone.
The choice of those layers depends on the backbone's architecture. In block-based architectures (\eg, ResNet or DenseNet) we use the outputs of the first 3 blocks; see, \eg, \autoref{Fig:DNN}. In other architectures, various choices may be made. See \textbf{SupMat} for examples in our experiments.
% \RED{
Next, for each class $c$, we fit 
% (using classical maximum-likelihood estimation in a Gaussian model)
a multivariate \emph{anisotropic} Gaussian to $(\bt_i)_{i:y_i=c}$, the feature ensembles associated with that class. 
\subsection{Testing}\label{Sec:Method:Subsec:Testing}
    Given a test point $\bx\in \RD$ we extract its feature ensemble $\bt$ and calculate its log-likelihood values according to each of the $C$ Gaussians from~\autoref{Sec:Method}. % , $(\Gcal_c)_{c=1}^C$. 
Let $(l_c)_{c=1}^C$
 denote these  values
and let $l=\max_c l_c$ and $c_\mathrm{max}
=\argmax{c} l_c$. 
The t-SNE visualizations (\autoref{Fig:intro}) of the feature ensembles for the \emph{test data}
% (\ie, OOD data as well as previously-unseen data from the known classes) 
in two benchmarks show that 
the known classes are well separated from each other 
as well as from the OOD data; \ie, the properties 
of the last latent layer propagate to the other latent layers, hence also to the additional features in the ensemble. 
Moreover, likelihood values of the OOD points are in general
much lower than those of the in-distribution test points. 
In principle we can stop here, and use the following rule
to decide
whether $\bx$ belongs to class $c_\mathrm{max}$
or if it is an OOD point:
$
 \widehat{y} = c_\mathrm{max} $ if 
             $l > \lambda$
             and 
             $\widehat{y}=\mathrm{OOD}$ otherwise,
%
where $\widehat{y}$ is the predicted label of $\bx$. Here, 
$\lambda$ is a user-defined threshold whose application-specific
value can be determined based on either the train set or a small validation set (neither of these sets contains OOD data) according to the allowed False Negative Rate (FNR) while taking into account the model accuracy (if interested in classification as well as OOD detection). 
That is, $\lambda$ is chosen according to the relative importance the user gives to False Positives (FP) or False Negatives (FN).
% \ie in certain domains (such as medical), a single FN could be fatal, while in other domains we would prefer a FN over a FP. }
%
%
We refer to this basic version of our method as \textbf{VMDLS}$_{\bb}$. 
As we show in~\autoref{Sec:Results}, \textbf{VMDLS}$_{\bb}$ already achieves SOTA results when benchmarking F1 score (taking into account both classification and OOD) or close to it, and has good results in binary OOD detection. 
That said, a simple change improves results even further; see~\autoref{Sec:Method:Subsec:Weighting}.
% ; see~\autoref{Sec:Method:Subsec:Weighting}. 
%
\subsection{Reconstruction-based Weighting}\label{Sec:Method:Subsec:Weighting}
 To improve results we use and adapt a popular technique~\citep{Sun:CVPR:2020:conditional,Sun:arXiv:2020:open,Yoshihashi:CVPR:2019:classification} based on test-point reconstruction.
 Concretely, we train, on the same training data (but independently of our model) a simple (non-conditional, non-variational) autoencoder (see \textbf{SupMat} for details) with an $\ellTwo$ reconstruction loss.
Let $\widehat{\bx}$ denote the reconstruction of test point $\bx$ using that AE. 
We run a forward pass of our model on both $\bx$ and $\widehat{\bx}$. 
Let $\bt^1$ and $\widehat{\bt^1}$
denote the first features (see \EQN\eqref{Eqn:Ensemble})
associated with $\bx$ and $\widehat{\bx}$, respectively, and let %\begin{align}
$ w =         ||\bt^1-\widehat{\bt^1}||_{\ell_2}^2$.
If $\bx$ is in-distribution, then $w$ is usually small. 
% Otherwise, $w$ might be either large or small. 
We now modify $l$ (from~\autoref{Sec:Method:Subsec:Testing}) by $w$: \ie, $l_\mathrm{new} \triangleq w\cdot l$.
Within the modified values, the separation of 
OOD points from the rest is better than before, a fact becoming clearer
when the negative log-likelihood values
are normalized to lie in the unit interval and a similar procedure is applied to their modified version: while most of the OOD values are
hardly affected by the modification,
the in-distribution values are vastly reduced; see \autoref{Fig:ll}.
This leads to a new rule:
$
 \widehat{y} = c_\mathrm{max} $ if 
             $l_\mathrm{new} > \lambda$
             and 
             $\widehat{y}=\mathrm{OOD}$ otherwise. 
 We refer to our method, when using this rule, as \textbf{VMDLS}. 
 
 One difference in how we use the reconstruction-based weighting from how 
 such schemes are commonly used is that we rely on the reconstruction
 of the (relatively) shallow feature. The reason is that in the deeper features,
 the differences between the OOD and in-distribution points are (empirically) less noticeable as many fine details get washed out. 
 %
%  \RED
 For some intuition, suppose that in-distribution points are cars and the OOD points are animals. 
 When a dog image enters the simple AE (trained on cars) its reconstruction is poor;
 \eg, it might look like a blurry/unrecognizable car. Both the dog image and its poor reconstruction are fed to our model.
 The low-level features (\eg, edges) in both cases will still be fairly-well captured but will be very different across the two images of the dog and its poorly-reconstructed version. 
 That disparity, however, becomes smaller when moving deeper into the model.
 This is because the (true) high-level features of the dog image (\eg, the dog's legs) are poorly captured by our model (learned on cars) and will appear in neither the dog's image nor its reconstructed version. 
 Thus, our reconstruction-based weighting is based on the shallow features.
 See \textbf{SupMat} for empirical comparison with deeper features. 

\begin{figure}[t]
    \centering
    \subcaptionbox{Unmodified values (\textbf{VMDLS}$_\bb$)}[.49\linewidth]{\includegraphics[height=1.45cm,trim=1.5cm 0.0cm 0.0cm 0.7cm,clip]{pre_ae_features.png}} 
      \subcaptionbox{Modified values (\textbf{VMDLS})}[.49\linewidth]{\includegraphics[height=1.45cm,trim=1.5cm 0.0cm 0.0cm 0.7cm,clip]{post_ae_features.png}} 
    \caption{Negative log-likelihood values (normalized to the unit interval) without (a) and with (b) the modification. 
    In-distribution: CIFAR-10 (blue). OOD: ImageNet-resize (orange).
    Note the better separation in (b).}
     \label{Fig:ll}
\end{figure}


\section{Experiments and Results}\label{Sec:Results}
We evaluated our method on several OOD-detection benchmarks and compared it with several key relevant methods, focusing on methods whose authors made their code available
and/or published results on those benchmarks.
% (naturally, we could not cover all works
% that did so but we included recent SOTA methods).
We excluded methods that use OOD data during training (\eg, OE~\citep{Hendrycks:ICLR:2019:deep}) 
as such methods enjoy an unfair advantage. We did include, however, two methods that use OOD data for hyperparameter tuning (ODIN~\citep{Liang:ICLR:2018:ODIN} and Mahalanobis~\citep{Lee:NIPS:2018:simple}).  
%
In our experiments, we followed testing methodologies employed in recent works~\citep{Sun:CVPR:2020:conditional,Sun:arXiv:2020:open,Yoshihashi:CVPR:2019:classification,Oza:CVPR:2019:c2ae,Liang:ICLR:2018:ODIN,Lee:NIPS:2018:simple,Hsu:CVPR:2020:generalized}. 
We split the experiments into two types.
In the first, the binary OOD detection benchmark, 
we used two widely-used metrics:
1) The Area Under the Receiver Operating Characteristic curve (AUROC),
which is a threshold-independent metric (unlike, \eg, the F1 score). 
2) The true-negative rate at a true-positive rate of $0.95$, denoted as TNR@TPR95.
For the second, we evaluated our method on the popular Open Set Recognition
(OSR)~\citep{Scheirer:TPAMI:2012:toward} task,
in which one measures 
how well a method predicts (in the test data) the $C+1$ labels, 
where the $(C+1)$-th label represents
the OOD data. To that aim, and as is common, we computed the F1 score
per class, and then computed the macro-average across those $C+1$ values.  


% \subsection{Implementation Details}
We experimented with several backbones: ResNet18;
ResNet34; WideResNet28; DenseNet-BC100; a modified version of VGG (as defined in~\citep{Yoshihashi:CVPR:2019:classification}); 
a plain CNN (as defined in~\citep{Yoshihashi:CVPR:2019:classification}).
For each model and dataset we used a suitable (and fairly-standard) training strategy: in the CIFAR-100 DenseNet and the CIFAR-10 WideResNet28 experiments we fine-tuned a network pretrained on the data with a standard SoftMax classifier, while in all the other experiments we trained the models from scratch; see \textbf{SupMat} for details.
%Our code and model checkpoints are included in the submission.
% \subsection{Benchmarks}

\textbf{Benchmarks.}  
For \textbf{Binary OOD detection}, we used the benchmarks proposed in~\citep{Liang:ICLR:2018:ODIN}:
using CIFAR-10~\citep{Krizhevsky:Citeseer:2009:Cifar} or CIFAR-100~\citep{Krizhevsky:Citeseer:2009:Cifar} for the known classes, and one of the following 4 datasets as OOD: ImageNet-crop; ImageNet-resize; LSUN-crop; LSUN-resize.
Thus, there were 8 experiments (one per combination).
%
ImageNet-crop and ImageNet-resize are subsets of ImageNet~\citep{Deng:CVPR:2009:imagenet},
either cropped or resized to the appropriate size for CIFAR. Likewise, LSUN-crop and LSUN-resize
are cropped/resized subsets of LSUN~\citep{Yu:arxiv:2015:lsun}.
Each of the OOD sets contains $10K$ images, curated by~\citep{Liang:ICLR:2018:ODIN}.
%
%
For \textbf{OSR}, in one type of experiments we used CIFAR-10 as the known classes, and the same aforementioned 4 OOD sets
(so there were 4 such experiments). 
We also did three experiments using MNIST~\citep{Lecun:1998:mnist} as the known classes, and one of the following three OOD sets each time: 
Omniglot's test set~\citep{Lake:Science:2015human},
which contains different handwritten characters from multiple alphabets; 
MNIST-noise, a version of MNIST with random uniform noise added to it;
NOISE, where each observation is a random uniform noise.
Each OOD set contains $10K$ images.

\textbf{Binary OOD Detection. } 
On the binary OOD-detection benchmark described above, we compared our method against 5 other methods: 
ODIN~\citep{Liang:ICLR:2018:ODIN}; Mahalanobis~\citep{Lee:NIPS:2018:simple}; DeConf-C~\citep{Hsu:CVPR:2020:generalized}; CSI~\citep{Tack:NIPS:2020:csi}; SubSpaces~\citep{Zaeemzadeh:CVPR:2021:out}.
Note this comparison is biased in favor of ODIN and Mahalanobis; unlike the other methods (ours included)
which require no tuning on OOD data, ODIN requires tuning on a different OOD set,
while Mahalanobis uses $\frac{1}{10}$ of both the test sets of the known classes and the OOD to tune its hyperparameters. To account for that,  
we also include the results for versions of those methods which do not depend on OOD sets for tuning,
as described by~\cite{Hsu:CVPR:2020:generalized}. We mark them as ODIN* and Mahalanobis*. 
We report the results in~\autoref{Tab:cifar:ood} (the numbers for ODIN*, Mahalanobis* and DeConf-C are taken from~\citep{Hsu:CVPR:2020:generalized}), 
where we show the macro-average results across the 4 OOD sets (full results are in the \textbf{SupMat}).
It is observable that in most cases \textbf{VMDLS} outperforms the others. 
The exception is the AUROC score for CIFAR-10 where Mahalanobis has a slightly higher score than \textbf{VMDLS};
however, once its usage of OOD sets is denied (\ie, Mahalanobis*), its results drop below ours.

\begin{table}[t]
    \centering  
%     \captionsetup{justification=justified, singlelinecheck=false}
    \caption{OOD-detection results on CIFAR-10 and CIFAR-100. Results: macro-averages over
    4 OOD datasets: ImageNet-resize; ImageNet-crop;  LSUN-crop; LSUN-resize. Backbones: ResNet18 for CSI; WideResNet28 for SubSpaces; DenseNet-BC100 for the rest.}
    
    \label{Tab:cifar:ood}  
    \resizebox{\columnwidth*1}{!}{
    
    \begin{threeparttable}

       
        \centering

        \setlength\tabcolsep{1.5pt}
        \begin{tabular}{@{}lllll@{}}
            \toprule
            & \multicolumn{2}{c}{CIFAR-10} & \multicolumn{2}{c}{CIFAR-100} \\ \cmidrule(l){2-5} 
            Method                                          &AUROC&TNR@TPR95&AUROC&TNR@TPR95\\ \midrule  
            ODIN\tnote{1}     &0.987&0.945    &0.910&0.599    \\
            Mahalanobis\tnote{2}&\textbf{0.993}&0.979&0.986&0.940\\
            DeConf-C     &0.989&0.946    &0.820&0.556    \\
            Mahalanobis* &0.962&0.820    &0.916&0.649    \\
            ODIN*        &0.904&0.556    &0.911&0.581    \\
            CSI          &0.981&0.897    &0.913&0.582 \\
            SubSpaces      &0.988&0.943    &0.930&0.675 \\
            VMDLS$_\bb$  (Ours)       &0.977&0.866    &0.975&0.878    \\
            VMDLS (Ours)                                       &0.990&\textbf{0.982}&\textbf{0.995}&\textbf{0.980}\\ \bottomrule            
        \end{tabular}
    \begin{tablenotes}
        \item[1] Uses an OOD set (though not the OOD test set) for %hyperparameter 
        tuning.
        \item[2] Uses $1/10$ of the test and OOD sets for %hyperparameter 
        tuning and training a linear regressor during inference. 
        % \item[3] Results reported in~\citep{Hsu:CVPR:2020:generalized}.
    \end{tablenotes}
   \end{threeparttable} 
    }   
\end{table}


\textbf{OSR. }
On the \textbf{CIFAR-10-as-in-distribution benchmark} described above, we compared with the following methods:
SoftMax; OpenMax~\citep{Bendale:CVPR:2016:towards}; CROSR~\citep{Yoshihashi:CVPR:2019:classification};
C2AE~\citep{Oza:CVPR:2019:c2ae}; ODIN~\citep{Liang:ICLR:2018:ODIN}; CGDL~\citep{Sun:CVPR:2020:conditional}; CPGM-AAE~\citep{Sun:arXiv:2020:open};
RPL~\citep{Chen:ECCV:2020:learning}; CVAECapOSR~\citep{Guo:ICCV:2021:conditional}.
Unlike in the previous benchmark (where most methods used DenseNet-BC100), here some methods use different backbones
and, in addition, ODIN uses a different OOD set for hyperparameter tuning,
giving it an advantage.
For a fair comparison, we have evaluated our method with several different backbones
used by the other methods: 1) the same modified VGG used in~\citep{Yoshihashi:CVPR:2019:classification,Sun:CVPR:2020:conditional,Sun:arXiv:2020:open};
2) 
ResNet34~\citep{He:CVPR:2016deep} (used in~\citep{Guo:ICCV:2021:conditional});
3) DenseNet-BC100 (used in~\citep{Liang:ICLR:2018:ODIN});
4) WideResNet28 (used in~\citep{Zaeemzadeh:CVPR:2021:out});
5) For completeness, we also used the popular ResNet18~\citep{He:CVPR:2016deep}.
% We report our results %\textsuperscript{\ref{note1}} 
% \autoref{Tab:F1_cifar} summarizes the results.
On the \textbf{MNIST-as-in-distribution benchmark} described above, we compared our method 
with 6 others: 
A standard SoftMax; OpenMax~\citep{Bendale:CVPR:2016:towards}; CROSR~\citep{Yoshihashi:CVPR:2019:classification}; 
CGDL~\citep{Sun:CVPR:2020:conditional}; CPGM-AAE~\citep{Sun:arXiv:2020:open}; CVAECapOSR~\citep{Guo:ICCV:2021:conditional}.
For a fair comparison, as the backbone for our method we used the same CNN specified in~\citep{Yoshihashi:CVPR:2019:classification}.
\autoref{Tab:F1_cifar} and~\autoref{Tab:F1_mnist} summarize the results. 
The results in~\autoref{Tab:F1_cifar} show that, for a given backbone, \textbf{VMDLS}
outperforms the other methods that use that backbone. 
Moreover, even with ResNet18 \textbf{VMDLS} outperforms the DenseNet-based ODIN (despite
ODIN's usage of tuning on OOD data).
Likewise, the results in~\autoref{Tab:F1_mnist} show that \textbf{VMDLS} outperforms the others 
by a large margin. 
Finally, note that in~\autoref{Tab:F1_cifar}, after \textbf{VMDLS}, \textbf{VMDLS}$_\bb$ is almost always the runner-up. 



\begin{table}[t]
    \centering
    \caption{Macro-average F1-scores of 11 classes (10 known; 1 unknown) on CIFAR-10 as the known classes
    and 4 different OOD sets.}
            \label{Tab:F1_cifar}  
    \resizebox{1.01\columnwidth}{!}{
    \hspace*{-0.2cm}
    \begin{threeparttable}
%         \caption{OSR results on CIFAR-10 with various outliers, we report macro-average F1-scores of 11 classes (10 known and an additional unknown class).}

        \centering
        \setlength\tabcolsep{1.5pt}
\begin{tabular}{@{}llllll@{}}
    \toprule
    Method  & Backbone & ImageNet-c & ImageNet-r & LSUN-c & LSUN-r\\ \midrule
    SoftMax & VGG      & 0.639         & 0.653           & 0.642     & 0.647       \\ 
    OpenMax&   VGG       &  0.660             &   0.684              &   0.657        &    0.668         \\
    CROSR&VGG  & 0.721              &   0.735          & 0.720   & 0.749      \\
    C2AE&CNN  & 0.837              &   0.826          & 0.783   & 0.801      \\
    CGDL&VGG  & 0.840   &   0.832  & 0.806 & 0.812      \\
    CPGM-AAE&VGG  & 0.793              &   0.830          & 0.820   & 0.865      \\
    RPL& WResNet& 0.811&0.810 & 0.846 & 0.820 \\
    CVAECapOSR&ResNet34 & 0.857 & 0.834 & 0.868 & 0.882 \\
    ODIN\tnote{1}&DenseNet  & 0.910              &   0.904          & 0.898   & 0.911      \\ \midrule
    VMDLS$_\bb$ (Ours)&VGG  & 0.850              &   0.845          & 0.849   & 0.844      \\
    VMDLS$_\bb$ (Ours)&ResNet18  & 0.889              &   0.875          & 0.888   & 0.882      \\
    VMDLS$_\bb$ (Ours)&DenseNet  & 0.913              &   0.908          & 0.914   & 0.911      \\
    VMDLS$_\bb$ (Ours)&WResNet &0.934 & 0.901 & 0.935 & 0.910 \\
    VMDLS (Ours)&VGG  & 0.907              &   0.861          & 0.900   & 0.886      \\
    VMDLS (Ours)&ResNet18  & 0.926              &  0.906         & 0.904  & 0.924    \\
         VMDLS (Ours)&ResNet34 &{0.930} & {0.911} & {0.928} & {0.907}
    \\
    VMDLS (Ours)&DenseNet  & 0.939              &   0.927          & 0.941   & 0.934 \\
    VMDLS (Ours)&WResNet &\textbf{0.945} & \textbf{0.930} & \textbf{0.942} & \textbf{0.936}   \\
    \bottomrule
    \end{tabular}
    \begin{tablenotes}
        % \item[1] Results reported in~~\citep{Sun:arXiv:2020:open,Yoshihashi:CVPR:2019:classification}, excluding ours and ODIN (reproduced).
        \item[1] Uses an OOD set (though not the OOD test set) for 
        %hyperparameter 
        tuning.
      \end{tablenotes}
   \end{threeparttable}
    }
\end{table}


\begin{table}[b]
    \centering
    \caption{Macro-average F1-scores of 11 classes (10 known; 1 unknown) on MNIST as the known classes
    and 3 different OOD sets.}
% \hspace{-.2cm}
\resizebox{\columnwidth*1}{!}{
        \label{Tab:F1_mnist}  
        
\begin{tabular}{@{}llll@{}}
    \toprule
    Method  & Omniglot & MNIST-noise & Noise \\ \midrule
    SoftMax  & 0.595           & 0.801     & 0.829       \\ 
    OpenMax&   0.780              &   0.816        &    0.826         \\
    CROSR&   0.793          & 0.827   & 0.826      \\
    CGDL&   0.850  & 0.887 & 0.859      \\
    CPGM-AAE&   0.872    & 0.865   &0.872    \\ 
    CVAECapOSR &0.971 & 0.982 & 0.982 \\
    \midrule
    VMDLS$_\bb$ (Ours)&   0.969    & 0.961   &0.963 \\
    VMDLS (Ours)& \textbf{0.974}              &   \textbf{0.984}          & \textbf{0.984}   \\ \bottomrule
    \end{tabular}}
    % \begin{tablenotes}
    %     \item[1] Results reported in~\citep{Sun:arXiv:2020:open,Sun:CVPR:2020:conditional}, excluding ours.
    %   \end{tablenotes}
\end{table}  


\textbf{Remarks.}
\textbf{1)} As ODIN's published code (targeting binary OOD detection) was easy to adapt for OSR we included 
it in these experiments. 
%
\textbf{2)} The F1-score is threshold-dependent. For the purpose of benchmarking, the F1-score values in this paper were computed (as was done, \eg,  in~\citep{Sun:CVPR:2020:conditional}) using a threshold that aims to recognize 95\% of the {train set} as belonging to known classes.
%
\textbf{3)} Except our own results and ODIN's results (which we computed ourselves), the rest of the results in~\autoref{Tab:F1_cifar} and~\autoref{Tab:F1_mnist} were reported in~\citep{Sun:arXiv:2020:open,Sun:CVPR:2020:conditional,Guo:ICCV:2021:conditional,Chen:ECCV:2020:learning}. 
\textbf{4)}  As the tables show,
\textbf{VMDLS} always outperforms \textbf{VMDLS}$_\bb$. This is also true for individual OOD sets,
not just the averages. Thus, even if sometimes the improvement from \textbf{VMDLS}$_\bb$ to \textbf{VMDLS} is small,  
we always recommend the latter. 


\subsection{Ablation Study}\label{Sec:Results:Subsec:Ablation}

\begin{figure}[t]
    \centering
    \newcommand{\MySpace}{}
    \subcaptionbox{$\Lcal_t$\label{Fig:MNIST:ABLATION_LOSSES:a}}[0.32\linewidth]{\includegraphics[height=2cm,trim=0.25cm 0.15cm 0.0cm 0.1cm]{mnist_just_trip.png}\MySpace}
    \subcaptionbox{$\Lcal_t+\Lcal_d$, $M_d=32$\label{Fig:MNIST:ABLATION_LOSSES:b}}[0.32\linewidth]{\includegraphics[height=2cm,trim=0.25cm 0.15cm 0.0cm 0.1cm]{mnist_nokl.png}\MySpace}
    \subcaptionbox{$\Lcal_{\mathrm{KL}}+\Lcal_t$\label{Fig:MNIST:ABLATION_LOSSES:c}}[0.32\linewidth]{\includegraphics[height=2cm,trim=0.25cm 0.15cm 0.0cm 0.1cm]{mnist_tirplets.png}\MySpace}\newline
    \subcaptionbox{$\Lcal_{\mathrm{KL}}+\Lcal_t+\Lcal_d$, $M_d=1$\label{Fig:MNIST:ABLATION_LOSSES:d}}[0.45\linewidth]{\includegraphics[height=2cm,trim=0.25cm 0.15cm 0.0cm 0.1cm]{mnist_mar_1.png}\MySpace}
    \subcaptionbox{$\Lcal_{\mathrm{KL}}+\Lcal_t+\Lcal_d$, $M_d=32$\label{Fig:MNIST:ABLATION_LOSSES:e}}[0.45\linewidth]{\includegraphics[height=2cm,trim=0.25cm 0.15cm 0.0cm 0.1cm]{mnist_mar_32.png}\MySpace}
    \caption{The effect of the  losses 
    on $(\bmu(\bg(\bx_i)))_{i=1}^N$  in MNIST's train data with $d=2$
    (\ie, this is \emph{not} a t-SNE plot but the actual features).
    Each digit has its own color.    }
     \label{Fig:MNIST:ABLATION_LOSSES}
\end{figure}


 
\textbf{Losses.} We trained the model on MNIST several times,
each time using a different loss:
(a) $\Lcal_t$;
(b) $\Lcal_t+\Lcal_d$ with $M_d=32$;
(c) $\Lcal_{\mathrm{KL}}+\Lcal_t$;
(d) $\Lcal_{\mathrm{KL}}+\Lcal_t+\Lcal_d$, $M_d=1$;
(e) $\Lcal_{\mathrm{KL}}+\Lcal_t+\Lcal_d$, $M_d=32$.
The last two options are the proposed loss (with different $M_d$ values).
In this ablation study, we chose $d=2$ to let us visualize $\bmu$ directly, \emph{without having to resort to a t-SNE visualization which might distort the structure}. 
\autoref{Fig:MNIST:ABLATION_LOSSES} depicts the values of $(\bmu(\bg(\bx_i)))_{i=1}^N$.
Using only $\Lcal_t$ (\autoref{Fig:MNIST:ABLATION_LOSSES:a})
or using $\Lcal_t+\Lcal_d$ without $\Lcal_\mathrm{KL}$ (\autoref{Fig:MNIST:ABLATION_LOSSES:b})
leads to small distances between the classes whose clusters also differ from each other 
drastically in shape and size. Using $\Lcal_\mathrm{KL}+\Lcal_t$ (\autoref{Fig:MNIST:ABLATION_LOSSES:c})
leads to better results, but the clusters are far from being isotropic and, worse, 
are still close to each other in the vicinity of the origin. 
Using the proposed loss with either $M_d=1$ (\autoref{Fig:MNIST:ABLATION_LOSSES:d}) or, even more so, $M_d=32$
(\autoref{Fig:MNIST:ABLATION_LOSSES:e}), leads to nearly-isotropic clusters of similar sizes that are also much further away from each other. 
%
% This also translates to quantitative differences:
\autoref{Tab:ab_mnist} summarizes the quantitative results obtained 
with \textbf{VMDLS}$_\bb$ using the different losses on the MNIST-as-in-distribution benchmark,
again with $d=2$. We avoided here using \textbf{VMDLS} (which got slightly-better results
-- which we omit -- than \textbf{VMDLS}$_\bb$) to focus on the loss aspect.
\autoref{Tab:ab_mnist} also includes the (inferior) result of using $\Lcal_t$ with $M_t=1$ (higher $M_t$ values were even worse).

\begin{table}[t]
    \centering
\caption{Macro-average F1 scores with several losses.  
    Known classes: MNIST. Unknown: Omniglot. 
    $M_t=0.1$ unless stated otherwise. 
% %     Unlike in the other tables,     here $d=2$. 
%     $d=2$. 
    %Results are macro-average F1-scores.
    }
    \label{Tab:ab_mnist}
% \hspace{0.0cm}
\begin{tabular}{@{}llll@{}}
\toprule
Losses  & F1-score \\ \midrule
% $KL+M_d=32$ & 0.9524 \\
% $KL+M_d=1$ & 0.9429 \\
% $KL+Triplets$ & 0.8795\\
% $Triplets+M_d=32$ & 0.938 \\
% $Triplets$ & 0.871 \\
    $\Lcal_\mathrm{KL}$ + $\Lcal_t$ + $\Lcal_d$ ($M_d=32$) & \textbf{0.969} \\
    $\Lcal_\mathrm{KL}$ + $\Lcal_t$ + $\Lcal_d$ ($M_d=1$) & 0.941 \\
    $\Lcal_\mathrm{KL}$ + $\Lcal_t$   & 0.860\\
    $\Lcal_t$ + $\Lcal_d$ ($M_d=32$) & 0.935 \\
    $\Lcal_t$  & 0.858 \\
    $\Lcal_t$ ($M_t=1$)  & 0.829 \\
\bottomrule   \end{tabular}
\end{table}






\textbf{Feature Ensemble.}  
To study the effect of the feature ensemble,
%$\bt=(\bt^1,\bt^2,\bt^3,\bg,\bmu)$, 
we compared \textbf{VMDLS} with different versions of itself where each version uses a different subset
of the features, and evaluated the F1 performance of the different models
in the OOD task where CIFAR-10 served as the in-distribution data while ImageNet-resize 
acted as the OOD data. As~\autoref{Table:Ablation:Ensemble} shows, 
while the results were reasonably-good even when merely using $\bmu$, the added features helped and the best results were achieved when using the entire ensemble. 


\begin{table}[h]
    \centering
    \resizebox{1.01\columnwidth}{!}{
    \setlength\tabcolsep{1.0pt}  
    \begin{threeparttable}
        \caption{The effect of the feature ensemble.  \\  In-distribution data: CIFAR-10. OOD data: ImageNet-resize.}
      \label{Table:Ablation:Ensemble}
        \centering
\begin{tabular}{@{}l@{\hskip 0.5cm}ccccc@{}}
    \toprule
    \emph{Features}  & $(\bt^1,\bt^2,\bt^3,\bg,\bmu)$ & $(\bt^2,\bt^3,\bg,\bmu)$ & $(\bt^3,\bg,\bmu)$ & $(\bg,\bmu)$ & $\bmu$ \\ \midrule
    \emph{F1-score} & \textbf{0.927}      & 0.925         & 0.919           & 0.918     & 0.906       \\ 
   \bottomrule
    \end{tabular}
   \end{threeparttable}
    }
\end{table}


\section{Conclusion}\label{Sec:Conclusion}
We proposed an end-to-end SOTA method, for OOD detection, which requires no OOD data for learning/tuning. 
% Our experiments and results are reproducible.
Its success lies in the good inter-class separation in the latent space, which also translates to a large empty area between the known classes where the OOD points end up. 
Our method has 2 main limitations. While some methods 
use a pre-trained DNN as is,
ours requires either full DNN training 
or fine-tuning a pretrained DNN. However, besides the fact that this is also the case with most recent methods, our SOTA results justify the training effort. 
The second limitation is that our training is about 4x slower
than the training of a simple SoftMax-based classifier with the same backbone.
However, this limitation can be mitigated considerably by using a pre-trained backbone (as we did in a few of the experiments in~\autoref{Sec:Results}) followed by fine tuning. 

An interesting future direction which we have not explored in this work is how to incorporate 
uncertainty decomposition within the proposed method
in order to improve the OOD detection, 
as was done in Bayesian neural networks and/or ensemble approaches~\citep{vadera2020ursabench,malinin2019ensemble,vadera2020generalized,depeweg2018decomposition}.

\begin{acknowledgements} 
  This work was supported by the Lynn
and William Frankel Center at BGU CS, by the Israeli Council for
Higher Education via the BGU Data Science Research Center, and
by Israel Science Foundation Personal Grant \#360/21. O.D. was
also funded by the Jabotinsky Scholarship from Israel's Ministry
of Technology and Science, and by BGU's Hi-Tech Scholarship.
\end{acknowledgements}


%\clearpage
% \small
% \bibliographystyle{ieee}
%\bibliographystyle{ieee_fullname}
% \bibliographystyle{abbrvnat}
\bibliography{refs}
%\bibliography{my_refs}


\end{document}
