% \documentclass{uai2023} % for initial submission
\documentclass[accepted]{uai2023} % after acceptance, for a revised
                                    % version; also before submission to
                                    % see how the non-anonymous paper
                                    % would look like
%% There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2023} % ptmx math instead of Computer
                                         % Modern (has noticable issues)
% \documentclass[mathfont=newtx]{uai2023} % newtx fonts (improves upon
                                          % ptmx; less tested, no support)
% NOTE: Only keep *one* line above as appropriate, as it will be replaced
%       automatically for papers to be published. Do not make any other
%       change above this note for an accepted version.

%% Choose your variant of English; be consistent
\usepackage[american]{babel}
% \usepackage[british]{babel}

%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}

% \usepackage{algorithm}
% \usepackage{algorithmic}

\usepackage{mathtools} % amsmath with fixes and additions
% \usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{tikz} % nice language for creating drawings and diagrams
% \newtheorem{theorem}{Theorem}
% \usepackage{thmtools} 
% \usepackage{thm-restate}

% \declaretheorem[name=Theorem]{theorem}
%% Provided macros
% \smaller: Because the class footnote size is essentially LaTeX's \small,
%           redefining \footnotesize, we provide the original \footnotesize
%           using this macro.
%           (Use only sparingly, e.g., in drawings, as it is quite small.)

%% Self-defined macros
% \newcommand{\swap}[3][-]{#3#1#2} % just an example

\usepackage{amsfonts}
% \usepackage{booktabs} % commands to create good-looking tables
\usepackage{subfigure}

\usepackage{float}

%\newcommand{\theHalgorithm}{\arabic{algorithm}}

% Fixed-size font switches for captions, tables and other tight spots.
% Each switch selects a font size (first argument of \fontsize, in pt) with a
% baseline skip of the same value (second argument).
% The % directly after each opening brace stops the line end from becoming a
% space token inside the definition; without it, that space can be typeset
% wherever the switch is used in running text (e.g. at the start of a caption).
\newcommand{\csize}{%
  \fontsize{8}{8}\selectfont
}

\newcommand{\csizenine}{%
  \fontsize{9}{9}\selectfont
}

\newcommand{\csizenineplus}{%
  \fontsize{9.5}{9.5}\selectfont
}

\newcommand{\csizeten}{%
  \fontsize{10}{10}\selectfont
}

\newcommand{\tabsize}{%
  \fontsize{7}{7}\selectfont
}

\newcommand{\tsize}{%
  \fontsize{6.5}{6.5}\selectfont
}

% proofof: proof of a named statement.
%   #1 -- text naming the statement being proved (e.g. a \thmref{...} call).
% Prints a bold "Proof of #1." lead-in and closes with a right-flushed box.
% \normalfont\bfseries is the NFSS spelling of the obsolete \bf switch.
\newenvironment{proofof}[1]{{\normalfont\bfseries Proof of #1.  }}{\hfill$\Box$}

% \renewcommand\algorithmiccomment[1]{
%   {
%   	{
% % 	\csizenine    
%   	{\textit{\%\ #1}}
%   	}
%   }
% }

% ---- Calligraphic letters: \cX typesets \mathcal{X} ------------------------
\newcommand*{\cA}{{\mathcal{A}}}
\newcommand*{\cB}{{\mathcal{B}}}
\newcommand*{\cC}{{\mathcal{C}}}
\newcommand*{\cD}{{\mathcal{D}}}
\newcommand*{\cG}{{\mathcal{G}}}
\newcommand*{\cH}{{\mathcal{H}}}
\newcommand*{\cI}{{\mathcal{I}}}
\newcommand*{\cK}{{\mathcal{K}}}
\newcommand*{\cM}{{\mathcal{M}}}
\newcommand*{\cN}{{\mathcal{N}}}
\newcommand*{\cO}{{\mathcal{O}}}
\newcommand*{\cP}{{\mathcal{P}}}
\newcommand*{\cR}{{\mathcal{R}}}
\newcommand*{\cS}{{\mathcal{S}}}
\newcommand*{\cT}{{\mathcal{T}}}
\newcommand*{\cU}{{\mathcal{U}}}
\newcommand*{\cV}{{\mathcal{V}}}
\newcommand*{\cX}{{\mathcal{X}}}
\newcommand*{\cY}{{\mathcal{Y}}}
\newcommand*{\cZ}{{\mathcal{Z}}}

% ---- Upright bold symbols ---------------------------------------------------
\newcommand*{\bP}{{\mathbf{P}}}
\newcommand*{\ba}{{\mathbf{a}}}
\newcommand*{\bb}{{\mathbf{b}}}
\newcommand*{\bu}{{\mathbf{u}}}
\newcommand*{\bx}{{\mathbf{x}}}
\newcommand*{\NP}{{\mathbf{NP}}}

% ---- One-argument bold wrappers ---------------------------------------------
% \bs{#1}: \boldsymbol{#1};  \mb{#1}: \mathbf{#1}
\newcommand{\bs}[1]{\boldsymbol{#1}}
\newcommand{\mb}[1]{\mathbf{#1}}

% ---- Composite shorthands built from the macros above -----------------------
% \newsetminus is a minus sign with a negative thin space on each side.
\newcommand*{\newsetminus}{{\!-\!}}
\newcommand*{\cVmA}{{\cV\newsetminus\cA}}
\newcommand*{\cs}{s}
\newcommand*{\cVms}{{\cV-\cs}}
\newcommand*{\resid}{\cR}
\newcommand*{\mhk}{\cM^h_k}

% \DeclareMathOperator{\MIF}{MI} 

% Cross-reference helpers: each takes a label (#1) and prints the object's
% name followed by its number, joined by a non-breaking space so the name and
% the number never split across lines.
\newcommand{\thmref}[1]{Theorem~\ref{#1}}
\newcommand{\tabref}[1]{Table~\ref{#1}}
\newcommand{\figref}[1]{Fig.~\ref{#1}}
% Equations use amsmath's \eqref so the printed reference carries the same
% parentheses as the equation tag, e.g. "Eq. (1)" instead of "Eq. 1".
\newcommand{\eqnref}[1]{Eq.~\eqref{#1}}
\newcommand{\secref}[1]{Sec.~\ref{#1}}
\newcommand{\appref}[1]{Appendix~\ref{#1}}
\newcommand{\prcref}[1]{Procedure~\ref{#1}}
\newcommand{\assmref}[1]{Assumption~\ref{#1}}
\newcommand{\crlref}[1]{Corollary~\ref{#1}}
\newcommand{\algoref}[1]{Alg.~\ref{#1}}
\newcommand{\prpref}[1]{Proposition~\ref{#1}}
\newcommand{\cnjref}[1]{Conjecture~\ref{#1}}
\newcommand{\axmref}[1]{Axiom~\ref{#1}}
\newcommand{\lmaref}[1]{Lemma~\ref{#1}}

% Theorem-like environments.
% "lemma" and "theorem" each own a separate counter.
\newtheorem{lemma}{Lemma}
\newtheorem{theorem}{Theorem}
% The optional [lemma] argument makes the following environments share the
% lemma counter, so they are numbered in one common sequence with lemmas.
\newtheorem{corollary}[lemma]{Corollary}
\newtheorem{procedure}[lemma]{Procedure}
\newtheorem{assumption}[lemma]{Assumption}
\newtheorem{claim}[lemma]{Claim}
\newtheorem{conclusion}[lemma]{Conclusion}
\newtheorem{proposition}[lemma]{Proposition}
\newtheorem{conjecture}[lemma]{Conjecture}
\newtheorem{axiom}[lemma]{Axiom}
\newtheorem{algo}[lemma]{Algorithm}
% "definition" and "remark" are numbered independently of everything above.
\newtheorem{definition}{Definition}
\newtheorem{remark}{Remark}

% %additions suggested by Sahil
% \newcommand{\s}[1]{\textcolor{magenta}{#1}}

% % %deletions suggested by Sahil
% % \newcommand{\sd}[1]{\textcolor{orange}{#1}}

% %additions suggested by Sahil
% \newcommand{\todo}[1]{\textcolor{blue}{Sahil's todo: #1}}

% % \newcommand{\uai}[1]{\textcolor{brown}{#1}}

% \newcommand{\te}{TE }
% \newcommand{\tes}{TE}

% %additions suggested by Mina
% \newcommand{\mina}[1]{\textcolor{purple}{Mina says: #1}}
    
% \definecolor{shadecolor}{gray}{0.95}
% \newcommand{\algshade}[1]{
%     \hspace*{-\fboxsep}
%     %\vspace*{-\fboxsep}
%     \colorbox{shadecolor}{
%         \parbox{\linewidth}{#1}
%     }
% }

\title{In- or Out-of-Distribution Detection via Dual Divergence Estimation}

% The standard author block has changed for UAI 2023 to provide
% more space for long author lists and allow for complex affiliations
%
% All author information is automatically removed by the class for the
% anonymous submission version of your paper, so you can already add your
% information below.
%
% Add authors
\author[1,*]{Sahil Garg}
\author[2]{Sanghamitra Dutta}
\author[1]{Mina Dalirrooyfard}
\author[1]{Anderson Schneider}
\author[1]{Yuriy Nevmyvaka}

% Add affiliations after the authors
\affil[1]{%
    Dept. of Machine Learning Research\\
    Morgan Stanley\\
    New York, New York, USA
}
\affil[2]{%
    Dept. of Electrical and Computer Engineering\\
    University of Maryland\\
    College Park, Maryland, USA
}
\affil[*]{%
Corresponding Author: sahil.garg@morganstanley.com, sahil.garg.cs@gmail.com
}

  
\begin{document}
\maketitle

\begin{abstract}
% 
Detecting \emph{out}-of-distribution~(OOD) samples is a problem of practical importance for a reliable use of deep neural networks (DNNs) in production settings. The corollary to this problem is the detection of \emph{in}-distribution~(ID) samples, which is applicable to domain adaptation scenarios for augmenting a training set with ID samples from other data sets, or to continual learning for replay from the past.
% 
For both ID and OOD detection, we propose a principled yet simple approach of (empirically) estimating KL-Divergence, in its \emph{dual form}, for a given test set w.r.t. a known set of ID samples in order to quantify the contribution of each test sample individually towards the divergence measure and accordingly detect it as OOD or ID. Our approach is compute-efficient and enjoys strong theoretical guarantees.
% 
For WideResnet101 and ViT-L-16, by considering ImageNet-1k dataset as the ID benchmark, we evaluate the proposed OOD detector on 51 test (OOD) datasets, and observe drastically and consistently lower false positive rates w.r.t. all the competitive methods. Moreover, the proposed ID detector is evaluated, using ECG and stock price datasets, for the task of data augmentation in domain adaptation and continual learning settings, and we observe higher efficacy compared to relevant baselines.
\end{abstract}

\section{Introduction}
% 
Despite the great success of deep neural nets, there are important challenges that remain to be addressed in continual lifelong learning settings~\citep{lopez2017gradient,riemer2018learning,parisi2019continual,rao2019continual,lesort2021understanding}. In continual learning settings, due to the inherent nonstationarity of a domain, it is typical to observe samples in a test setting which are Out-Of-Distribution~(OOD) w.r.t. the training set. A DNN must be capable of detecting such OOD samples and of acknowledging that it is not knowledgeable enough to produce high-confidence outputs on such inputs~\citep{hendrycks2016baseline,liu2020energy,hendrycks2019scaling}.
% 
\begin{figure}
\centering
\includegraphics[width=0.8\columnwidth]{dual_cut.png}
\caption{
\csizeten
The dark blue dots are ID samples and all the other dots~(with gray circles) are test samples. For detecting OOD samples in a test set, we propose to first empirically estimate KL-Divergence~(KL-D), in its dual form, between the underlying distribution of test samples and of the ID samples. For estimating the KL-D in its dual form, ID and test samples are mapped to 1-D space by a dual function, $f(.)$, and the dual space is optimized by minimizing the log sum exp~(\emph{smooth max}) of the dark blue dots~(ID samples) while maximizing the mean of the rest of the dots~(test samples). In the second sketch, where the dual space is nearly-optimized, ID~(light blue) and OOD~(red and orange) samples within the test set are well separated.}
\label{fig:dv_sampling}
\end{figure}
% 
In the extreme scenarios where a majority of test samples are OOD w.r.t. the training set, it is natural to retrain the network on observations from the most recent past as representative of the (OOD) test setting. In such a case of continual learning or domain adaptation, for data augmentation, one can detect all the samples from the history of the same domain or even other domains which are ID w.r.t. the representative set. The problems of ID and OOD detection are related in theory, two sides of the same coin, yet differ in their utilities for lifelong learning.
% 
We approach this problem from the perspective of estimating divergence between two distributions. In a continual learning setting, if one encounters a test set of samples which are all known to be OOD w.r.t. the known set of ID samples, we should expect high divergence between the underlying distributions of the two sets. In contrast, if a test contains only ID samples, divergence between the respective distributions should be close to zero. Of course, in the real world settings, a test set may contain a mixture of both ID and OOD samples.
% 
In such case, we wish to split the test set into two parts, OOD vs ID samples. The aforementioned intuitions about pure ID vs pure OOD set should apply to the two parts of the test set as well.   
    
Specifically, we propose a \emph{novel} approach for ID vs OOD detection based upon the concept of dual divergence estimation. As illustrated in \figref{fig:dv_sampling}, we propose to empirically estimate KL-Divergence of a given test set of samples w.r.t. a set of known ID samples. The key idea is that by estimating KL-Divergence in its dual form due to \cite{donsker1983asymptotic}, we obtain the individual contribution of each sample in the test set towards the divergence measure so as to detect OOD and ID samples within the test set.
% 
Our principled approach enables OOD detection algorithms that enjoy linear time complexity and \emph{theoretical guarantees}.
     
For the problem of OOD detection in pre-trained deep neural nets, Imagenet has been the most challenging and well known benchmarked ID dataset. We specifically consider the task of OOD detection in a WideResnet and a Vision Transformer~(ViT) pretrained on Imagenet. For OOD datasets, we consider 51 datasets from diverse domains including all four previously benchmarked OOD datasets. Our extensive empirical analysis shows that the proposed OOD detector is consistently and drastically superior w.r.t. all the competitive methods.
    
For the evaluation of ID detector, we consider the problem of timeseries forecasting. We augment the training dataset for a given timeseries with ID samples detected from the past of the same timeseries as well as of other timeseries. Our empirical analysis of data sets of US stock prices and ECG demonstrates the competitiveness of our proposed ID detector w.r.t. relevant baselines of contextual replay from continual learning and of domain adaptation.
    
\paragraph{Contributions}
% 
Our contributions are: (i) a novel principled information theoretic approach for OOD detection which enjoys theoretical guarantees; (ii) extensive empirical evaluation demonstrating the superiority of our approach while also establishing new benchmarks on 47 new OOD datasets considering Imagenet-1k as the ID dataset; (iii) an application of the proposed approach to ID detection for augmenting data sets in continual learning or domain adaptation settings, as demonstrated using multivariate timeseries datasets from the diverse domains of finance and healthcare; (iv) a public codebase at \href{https://github.com/morganstanley/MSML/tree/main/papers/OOD_Detection_via_Dual_Divergence_Estimation}{github.com/morganstanley/MSML/tree/main/papers}.
                     
% \section{Related Works}
% \label{sec:related_works}
% Next we discuss the related works in detail.
    
\subsection{Related Works for OOD Detection
% in Pretrained DNNs
}
\label{sec:rw_ood}
Detecting OOD samples in a pre-trained deep neural network~(DNN) is a problem of high practical importance. The intuition behind this body of work is that the representations from the top hidden layer of a DNN~(referred to as logits) are informative of all the hierarchical features relevant for distinguishing between ID classes as well as for differentiating OOD vs ID samples. Various heuristics have been proposed in regards to what kind of information in the logits may be relevant for detecting OOD samples. \cite{hendrycks2016baseline} propose maximum softmax probability~(MSP) as the criterion for OOD detection. However, in practice, it is observed that DNNs tend to assign a high probability to one of the ID classes even for OOD samples. \cite{liang2018enhancing} propose ODIN which attempts to fix this issue by temperature scaling and adding small perturbations to the inputs. 
    
\cite{liu2020energy} obtain energy score from the logits as the criterion for OOD detection. Intuitively, since the energy score~(EBO) is the $\log$ \emph{sum} $\exp$ function of the logits, which is a well known approximation~(upper bound) for the $\max$ function, it accounts for all the high values in the vector of logits rather than relying upon the highest value only, effectively a smooth version of the $\max$ function. For the same reason, it is known to be consistently superior to ODIN and its predecessors, as we also observe in our extensive evaluation. \cite{hendrycks2019scaling} propose to simply use the maximum value of the logits as the criterion for OOD detection. Their intuition about the superiority of raw logits over their normalization as in MSP or ODIN is shown to be empirically valid, though their approach is not as effective as energy scores~(``smooth'' max of the logits) for the reasons mentioned above. \cite{hendrycks2019scaling} also introduce the KL-Matching method, which computes relative entropies of a softmax distribution w.r.t. the respective per-class distribution templates.
    
\cite{huang2021importance} propose to use the gradient norm of KL-Divergence of the softmax distribution w.r.t. its respective uniform distribution, relying upon the intuition that the gradients should be of higher norm for ID samples in contrast to OOD samples. There is no solid evidence supporting this intuition; it may be applicable only to certain OOD scenarios. \cite{sun2021information} propose a simple yet highly effective technique of rectifying activations~(ReAct), i.e., truncating activations above a certain threshold, in the penultimate layer of a DNN. The authors suggest that ReAct is particularly suitable when OOD activations are chaotic and positively skewed in comparison to ID activations.
        
\cite{wangwatermarking} propose to robustify existing OOD detection methods by watermarking ID patterns through reprogramming of the neural nets. Specifically, a static pattern is learned as a watermark to be added to any given input for its detection as OOD. Another technique for robustifying OOD methods is to sparsify \emph{weights} at inference time~\citep{sun2022dice}. \cite{djurisic2022extremely} instead propose to sparsify \emph{representations} in the top layers of a DNN which is superior to sparsifying weights as per our analysis. 
            
Another body of work is based on non-parametric modeling of the logits, such as deep Gaussian Mixtures~\citep{morteza2022provable}, or deep k Nearest Neighbors~\citep{sun2022out}.
% 
\cite{gomesigeood2022} propose to employ Fisher-Rao distances between normalized logits~(softmax distribution) for obtaining centroids as representative of ID classes. 
% 
These nonparametric methods, as also acknowledged by \cite{sun2022out}, are inefficient if the number of ID classes is large~(1000) as in Imagenet dataset.
        
Our approach is generic enough to be applicable to other various flavors of OOD detection problem settings such as retraining strategies or regularization techniques, advanced techniques for generating OOD samples, mixing raw features and representations from different layers, generative modeling or autoencoding, out-of-model-scope detection, etc.~\citep{ren2019likelihood,teney2020value,bitterwolf2020certifiably,mahmoodmultiscale,morningstar2021density,fort2021exploring,zhou2021contrastive,ming2022delving,ming2022poem,ming2022impact,du2022siren,yang2022openood,lin2022out,wang2022vim,wei2022mitigating,liu2022diffusion,fan2022simple,jiang2022read,guerin2022out,wu2022towards,wangout2022,huangdensity2022,zhangood2022,wilson2023hyperdimensional,wang2023out}, though we restrict our analysis in this paper to the above discussed settings of OOD detection in pretrained DNNs for its high practical importance and simplicity. 
    
\subsection{Related Works for ID Detection
% for Continual Learning
}
\label{sec:id_rw}
% 
In a general continual learning~(CL) scenario, the goal is to ensure that a neural net does not catastrophically forget what it has learned in the past when it learns from the present episodes~\citep{riemer2018learning,mccloskey1989catastrophic}.
% 
One of the simplest, most effective, brain-inspired, and generic non-intrusive approaches for continual learning is \emph{replay} of the past memory episodes~\citep{rolnick2019experience,van2020brain,deja2021binplay}.
    
\cite{shin2017continual} introduced the idea of deep generative replay, popularly known as DGR.
% 
To alleviate the problem of catastrophic forgetting, \citet{van2018generative} propose generative replay via distillation (i.e., employing class probabilities as ``soft targets'').
    % 
\citet{aljundi2019online} address the issue of forgetting by formulating a controlled sampling criterion for both generative and experience replay settings.
% 
\cite{buzzega2020dark} propose ``dark experience replay'' as a simple yet powerful baseline that mixes rehearsal with knowledge distillation and regularization.
% 
The problem of contextual replay also has connections to the classical field of active learning, and it has been pursued in the recent works of continual learning as well~\citep{tang2020graph,sun2021information}. \cite{sun2021information} propose to quantify informativeness of samples via criteria of surprise and learnability.
    
The CL setting considered in this paper is related to the above but different in the sense that we are interested in leveraging all the knowledge~(observations) from the past of a given domain or of other domains for training a model representative of the present only. While we are interested only in detecting ID samples and not generating novel ones or adapting existing ones, domain adaptation techniques such as optimal transport or domain discrimination are also tangentially related~\citep{ganin2016domain,balaji2020robust}.
    
\section{DDE for OOD or ID Detection}
\label{sec:dde_ood}
We discuss our novel approach for OOD detection based on dual divergence estimation, and its applicability for ID detection in continual learning settings.
        
\paragraph{Problem Settings}
For the problem of OOD detection, we assume the availability of an ID set of $N$ samples, $\mb{X}^{in} = \{ \mb{x}^{in}_i\}_{i=1}^N$. For OOD detection in pretrained networks, $\mb{X}^{in}$ is assumed to be a set of the representations of ID inputs from the top hidden layer of a DNN, also referred to as logits. Given a test set of $M$ inputs or the respective representations, $\mb{X}=\{ \mb{x}_j \}_{j=1}^M$, we detect if $\mb{x}_j$ is out-of-distribution w.r.t. the underlying (unknown) distribution $\cX^{in}$ of $\mb{X}^{in}$.
    
In our approach for OOD detection, we make no assumptions about the underlying distribution, nor do we advocate learning the corresponding density function. Moreover, our approach is not to learn an OOD detector prior to encountering a test set, but rather to estimate \emph{empirical} KL-Divergence~(KL-D) between a test set and the ID set on the fly.
% 
The key idea is that estimating the divergence measure in its \textit{dual form} is naturally informative about the subset of samples in the test set that are OOD.
    
Mathematically, KL-D between the (unknown) underlying distribution of a test set $\mb{\cX}$ and the (unknown) representative distribution $\mb{\cX}^{in}$ is expressible as, 
% 
\begin{align}
D(\mb{\cX} \| \mb{\cX}^{in})
& 
=
\mathbb{E}_{\mathbf{x} \sim \mb{\cX}}
\log
\frac{P(\mb{x})}
{P^{in}(\mb{x})}.
% 
\label{eqn:kld_standard}
\end{align}
% 
Here, $P(.)$ and $P^{in}(.)$ are density functions corresponding to the distributions $\cX$ and $\cX^{in}$. As such, even for the empirical estimate of KL-D in \eqnref{eqn:kld_standard}, one has to rely upon the knowledge of the density functions which are unknown. Estimating a density function is a hard problem on its own and can be avoided if estimating the divergence in its dual form by \cite{donsker1983asymptotic}, as expressed below.
% 
\begin{align}
D(\mb{\cX} \| \mb{\cX}^{in})
=
\max_{f(.)}
\mathbb{E}_{\mb{x} \sim \mb{\cX}}
f(\mb{x})
-
\log \mathbb{E}_{\mb{x}^{in} \sim \mb{\cX}^{in}}
e^{f(\mb{x}^{in})}
\nonumber
% \label{eqn:kld_dual}
\end{align}
% 
Herein, $f(.)$, referred to as the \emph{dual function}, can be any function such that the expectations are finite.
% 
As we observe above, for estimating KL-D in its dual form, we only need samples from the distributions $\mb{\cX}$ and $\mb{\cX}^{in}$, not the density functions. This form is particularly suitable for an \emph{empirical} estimate of KL-D between $\mb{\cX}$ and $\mb{\cX}^{in}$ using a test set $\mb{X}$ and the ID set $\mb{X}^{in}$ as below.
% 
\begin{align}
\hat{D}(\mb{X} \| \mb{X}^{in})
=
\max_{\hat{f}(.) \in \cH}
\sum_{\mb{x}_j \in \mb{X}}
\frac{\hat{f}(\mb{x}_j)}{M}
-
\log
\sum_{\mb{x}^{in}_i \in \mb{X}^{in}}
\frac{e^{\hat{f}(\mb{x}^{in}_i)}}{N}
\nonumber
% \label{eqn:kld_dual_emp}
\end{align}
% 
Here, the maximization is performed over the fixed class of functions $\cH$, e.g., the class of functions that can be learned by DNNs~\citep{belghazi2018mutual}.
% 
As we also illustrate in \figref{fig:dv_sampling}, all the samples from both sets are mapped into the 1-D dual functional space that is optimized such that expected value of the test samples is maximized whereas the $\log$ sum $\exp$ i.e. smooth max of the ID samples is minimized. 
    
In the optimal (1-D) dual space~($f^*$) as shown in \figref{fig:cut_point_choices}, test samples~(light blue, orange, and red dots) and ID samples~(dark blue dots) are separated as much as possible. The red dots from the test set are located on the r.h.s. in the optimal dual space contributing to the KL-D measure. On the other hand, despite the objective of maximizing the value of light blue dots~(test samples) while minimizing smooth max of the ID samples, we find the light blue dots interspersed between the dark blue ones~(ID samples), clearly separated from red dots. This is simply because the optimized dual function fails to distinguish such samples from the ID set. Therefore, intuitively, these light blue test samples should be detected as ID samples whereas the red dots can be detected as OOD samples. Later, we support these intuitions with theoretical analysis. 
% 
The optimized dual space provides us a nice geometric interpretation of ID vs OOD samples. On this note, in \figref{fig:cut_point_choices}, it is also interesting to observe that orange dots~(test samples) lie on the soft boundary between the detected ID and OOD samples in the test set.  
    
\begin{figure}
\centering
\includegraphics[width=0.8\columnwidth]{dual_cut_in_details.png}
\caption{
\csizeten
Our three choices for cut points in the optimized 1-D dual functional space for OOD detection. The dark blue dots refer to samples from the ID set whereas the rest of the dots are points from a given test set. From a theoretical standpoint, we propose to compute the cut point as smooth max~(\emph{Log Sum Exp}) of the dark blue dots. One interpretable choice of cut point for OOD detection is the maximum of the dark blue dots~(\emph{Max}); this cut point detects test points on the boundary of the ID set as OOD as well~(orange dots). In a scenario where the ID set is corrupted by OOD samples~(for example, see the dark blue dot with orange circle), it would make sense to find a cut point by tuning within the range of dark blue dots (\emph{Tune}), thus deeming even some of the known ID samples as OOD.}
\label{fig:cut_point_choices}
\end{figure}
        
In essence, estimating KL-D of a test set w.r.t. the ID set in its dual form naturally splits it into two subsets, detected ID~(light blue dots) vs detected OOD samples~(red dots). Our approach has the advantage of treating the OOD detection problem purely as an optimization problem~(optionally solvable using deep learning as we propose) while enjoying information theoretic guarantees.
    
Although, in practice, one can choose any point in the optimized (1-D) dual space as a cut point~(threshold) for OOD detection, we approach this problem more formally.
% 
Intuitively, the smooth max of the dark blue dots could serve as a cut point for OOD detection as we also illustrate in \figref{fig:cut_point_choices}. In the following, we present some theoretical insights which confirm this intuition.
 % 
First, we establish that for detecting OOD samples in a test set, there is no cut point required on the left side of ID samples in the optimized (1-D) dual space, i.e. no test samples exist beyond the left boundary of ID samples~(dark blue dots) in \figref{fig:cut_point_choices}.
% 
\begin{theorem}
\label{thm:min}
% 
Given an ID set $\mb{X}^{in}$ and a test set $\mb{X}$, from estimating KL-D of $\mb{X}$ w.r.t. $\mb{X}^{in}$ in its dual form as,
% 
\begin{align}
\hat{D}(\mb{X} \| \mb{X}^{in})
=
\max_{\hat{f}(.)\in \cH}
\sum_{\mb{x}_j \in \mb{X}}
\frac{\hat{f}(\mb{x}_j)}{M}
-
\log \sum_{\mb{x}^{in}_i \in \mb{X}^{in}}
\frac{e^{\hat{f}(\mb{x}^{in}_i)}}{N},
\nonumber
\end{align}
% 
we obtain the optimal dual function $\hat{f}^*(.)$. The optimal dual function $\hat{f}^*(.)$ satisfies the following:
% 
\begin{align}
% 
\forall{\mb{x}_j \in \mb{X}},\ \ 
\min_{\mb{x}_i^{in} \in \mb{X}^{in}}
\hat{f}^*(\mb{x}^{in}_i) \leq \hat{f}^*(\mb{x}_j).
\end{align}
% 
\end{theorem}
% 
Our next result establishes that OOD samples, if any, lie only on the right side of the ID samples~(dark blue dots) in the optimized (1-D) dual space.
% 
\begin{theorem}
\label{thm:max}
Given an ID set $\mb{X}^{in}$ and a test set $\mb{X}$, the optimal dual function $\hat{f}^*(.)$ which maximizes the estimate of KL-D as defined in Theorem~\ref{thm:min}, satisfies the following:
% 
\begin{align}
\max_{\mb{x}_i^{in} \in \mb{X}^{in}}
\hat{f}^*(\mb{x}^{in}_i) \leq \max_{\mb{x}_j \in \mb{X}}
\hat{f}^*(\mb{x}_j).
\end{align}
\end{theorem}
% 
These results provide critical intuition on the role of the optimal dual function $\hat{f}^*$ in distinguishing ID and OOD samples. As discussed above, essentially, the function $\hat{f}^*$ attempts to find a representation where the ID and OOD samples are maximally separated. So, points that lie in $\mb{X}^{in}$ are assigned a lower value, and points in  $\mb{X}$ are assigned values based on how similar they are to $\mb{X}^{in}$. Thus, test ID samples are assigned somewhat lower values as well and test OOD samples are assigned higher values.
In Theorem~\ref{thm:max}, a strict inequality holds when there exists a cut point such that some points $\mb{{P}} \subseteq \mb{X}$ that are possibly quite dissimilar to $\mb{X}^{in}$ can be separated from $\mb{X}^{in} \cup (\mb{X} \backslash \mb{{P}})$ using $\hat{f}^*$. When equality holds, it can be interpreted as: $\mb{X}$ and 
$\mb{X}^{in}$ are already quite similar. As per the intuition mentioned above, we \emph{propose to use the smooth max of the ID samples~(dark blue dots) as a cut point for OOD detection}, with theoretical guarantees as presented below in \thmref{thm:smoothmax_cut}.
% 
\begin{theorem}
\label{thm:smoothmax_cut}
% 
Given an ID set $\mb{X}^{in}$ and a test set $\mb{X}$, let $\hat{f}^*(.)$ be the optimal dual function which maximizes the estimate of KL-D (as defined in Theorem~\ref{thm:min}). Then, for the subset of the test set deemed as OOD,
% 
\begin{align}
\mb{X}^{ood} = \{ \mb{x}_j: \mb{x}_j \in  \mb{X}, \hat{f}^*(\mb{x}_j) > \log \sum_{\mb{x}^{in}_i \in \mb{X}^{in}}
\exp \hat{f}^*(\mb{x}^{in}_i) \},
\nonumber
\end{align}
% 
its KL-D w.r.t. the ID set is lower bounded as,
\begin{align}
\hat{D}(\mb{X}^{ood} \| \mb{X}^{in}) > \log(N).
\nonumber
\end{align}
\end{theorem}
% 
The above result suggests that OOD detection in DNNs can be improved by increasing the sample size in training~(ID) sets.
% 
Another way to interpret the lower bound is in its relation to the entropy of the ID set. Since all the empirical realizations in the ID set are equally probable, we obtain the maximum possible value of the entropy as the lower bound.
% 
Furthermore, this result is applicable for any subset of the detected OOD samples as well. One alternative, as shown in \figref{fig:cut_point_choices}, is to use $\max$ instead of the smooth max for finding the cut point.
% 
Having a cut point any further left of the above choices can be problematic, though relevant if the ID set consists of OOD samples as noise.
        
See the supplement for details on how to employ DNNs as the dual function approximators.
        
\subsection{ID Detection}
    
\begin{figure}
\centering
\includegraphics[width=0.8\columnwidth]{id_sampling.png}
\caption{
\csizeten
ID sampling from the 1-D dual space. The dark blue dots refer to samples from the ID set whereas the rest of the dots are points from a given test set. All the data points are binned using histograms of constant width $d$ in the optimal (1-D) dual space. We propose to take test samples from bins $B_1$, $B_2$, $B_3$, $B_4$ as ID samples, the ones with light blue color.}
\label{fig:id_sampling}
\end{figure}
% 
Intuitively, the problem of ID detection is similar to the problem of detecting OOD samples though its potential use cases are different.
% 
For this problem, $\mb{X}^{in}$ denotes the representative set of samples such as observations from the present~(recent past) of a domain of interest. On the other hand, $\mb{X}$ refers to a large set of observations from the historical past of the same domain or other related domains. 
        
Similar to OOD detection, we estimate divergence of $\mb{X}$ w.r.t. $\mb{X}^{in}$ in its dual form. The key difference is that we perform histogram binning of all the samples from both the sets in the 1-D dual functional space~\citep{freedman1981histogram}. The bins which contain samples only from $\mb{X}$ and not from $\mb{X}^{in}$ are discarded, and we select all the samples of $\mb{X}$ from the rest of the bins as ID detections w.r.t. $\mb{X}^{in}$. We also introduce the following notation for the evaluation of the divergence estimate at any function $f(\cdot)\in \cH$:
% 
\begin{align}
\hat{D}_{f}(\mb{X}_a \| \mb{X}_b)  := \sum_{\mb{x}_j \in \mb{X}_a} 
\frac{f(\mb{x}_j)}{|\mb{X}_a|}
-
\log \sum_{\mb{x}_i \in \mb{X}_b}
\frac{e^{f(\mb{x}_i)}}{|\mb{X}_b|}.
\nonumber
\end{align}
% 
\begin{theorem}
\label{thm:replay-pres-same-dist}
Given a representative ID set $\mb{X}^{in}$ and a set of observations from the historical past $\mb{X}$, from estimating KL-D of $\mb{X}$ w.r.t. $\mb{X}^{in}$ in its dual form as,
% 
\begin{align}
\hat{D}(\mb{X} \| \mb{X}^{in})
=&
\max_{\hat{f}(.)\in \cH}
\sum_{\mb{x}_j \in \mb{X}}
\frac{\hat{f}(\mb{x}_j)}{M}
-
\log \sum_{\mb{x}^{in}_i \in \mb{X}^{in}}
\frac{e^{\hat{f}(\mb{x}^{in}_i)}}{N},
\nonumber
\end{align}
% 
we obtain the optimal dual function, $\hat{f}^*(.)$. Then, from histogram binning in the 1-D dual functional space of both the sets, we select the samples of $\mb{X}$ with respect to the distribution of $\mb{X}^{in}$ in the bins, denoted as $\bar{\mb{X}}$. For uniform width $d$ of histogram bins, we have:
% 
\begin{align}
\hat{D}_{\hat{f}^*}(\bar{\mb{X}} \| \mb{X}^{in}) \le O(d).
\end{align}
\end{theorem}
% 
Here $\hat{f}^*$ is the optimal dual function that maximizes the divergence estimate between $\mb{X}$ and $\mb{X}^{in}$. We show that the divergence estimate computed between the selected ID samples $\bar{\mb{X}}$ and $\mb{X}^{in}$ (evaluated using the same $\hat{f}^*$) is quite small and in fact bounded by the histogram width. We note that while we show this result for $\hat{D}_{\hat{f}^*}(\bar{\mb{X}} \| \mb{X}^{in})$ (which may in principle be different from $\hat{D}(\bar{\mb{X}} \| \mb{X}^{in})$ since the maximum may be attained at a function other than $\hat{f}^*$), we are also able to show that under certain restrictions on the model class $\cH$, $\hat{D}(\bar{\mb{X}} \| \mb{X}^{in})\approx \hat{D}_{\hat{f}^*}(\bar{\mb{X}} \| \mb{X}^{in})$ (see Theorem~\ref{thm:appendix}).

The significance of Theorem~\ref{thm:replay-pres-same-dist} is that the divergence of the detected ID samples from $\mb{X}$ w.r.t. the given ID set $\mb{X}^{in}$ directly depends upon the expressiveness of the histogram model itself. Note that it is not desirable to reduce the upper bound to zero by employing a very small bin width, as it would lead to selecting only those samples from $\mb{X}$ which are highly similar to the samples in $\mb{X}^{in}$, an obvious case of overfitting in sampling. Conversely, choosing too large a value for the bin width is also not advisable.
    
Compute cost for dual divergence estimation and binning is linear in sample size whereas the compute complexity for sampling from the bins is constant.

Lastly, for completeness, we include another result (Theorem~\ref{thm:appendix}) which demonstrates that computing the KL-D estimate using the dual optimal function $\hat{f}^*$ suffices in a lot of scenarios, particularly when using subsets of $\mb{X}$ separable in the dual space.
% 
\begin{theorem}
\label{thm:appendix}
Let $\cH$ be a class of functions such that each $f\!\in\! \cH$ satisfies $|f(x)| \! <\infty \ \forall \ x \in \mathbb{R}^k$. Furthermore, if $f_1(x),f_2(x),g(x) \in \mathcal{H}$, then functions of the form: $f(x)=f_1(x)I(g(x)\geq \tau)+f_2(x)I(g(x)<\tau)$ (which are essentially derived entirely from functions in $\mathcal{H}$) also lie in $\mathcal{H}$ for any constant $\tau$ and indicator function $I(\cdot)$.
 Consider $\hat{f}_1 \in \mathcal{H}$ such that $\hat{D}(\mb{X} \| \mb{X}^{in})=\max_{f \in \mathcal{H}} \hat{D}_{f}(\mb{X} \| \mb{X}^{in})=\hat{D}_{\hat{f}_1}(\mb{X} \| \mb{X}^{in})$.
Then, for a subset $\mb{\bar{X}} \subseteq \mb{X}$ such that $\hat{f}_1(x)>\tau$ for $x \in \mb{X}\backslash \mb{\bar{X}}$ and $\hat{f}_1(x)\leq\tau$ for $x \in \mb{X}^{in}\cup \mb{\bar{X}}$, we have $\hat{D}(\mb{\bar{X}} \| \mb{X}^{in})=\hat{D}_{\hat{f}_1}(\mb{\bar{X}} \| \mb{X}^{in})$.
\end{theorem}
% 
Intuitively, what Theorem~\ref{thm:appendix} demonstrates is that if there is a better function $\hat{f_0} \in \cH$ such that $\hat{D}(\bar{\mb{X}} \| \mb{X}^{in}) = \hat{D}_{\hat{f_0}}(\bar{\mb{X}} \| \mb{X}^{in}) > \hat{D}_{\hat{f}^*}(\bar{\mb{X}} \| \mb{X}^{in}) $, then one might be able to leverage this $\hat{f_0}$ to design an even better dual function for the original divergence estimate, i.e., there would exist an $\hat{f_0^*}$ such that $\hat{D}_{\hat{f_0^*}}(\mb{X} \| \mb{X}^{in}) > \hat{D}_{\hat{f}^*}(\mb{X} \| \mb{X}^{in}) = \hat{D}(\mb{X} \| \mb{X}^{in})$ which leads to a contradiction.  
        
\section{Empirical Evaluation}
\label{sec:experiments_more_baselines}
    
\begin{table*}[tp!]
\centering
\tabsize
\renewcommand{\arraystretch}{0.85}
% \renewcommand{\arraystretch}{1.0}
\renewcommand{\tabcolsep}{2.2pt}
\begin{tabular}{lllllllllllllllll}
\toprule
\textbf{Dataset}&\textbf{MSP}&\textbf{MLS}&\textbf{ODIN}&\textbf{EBO}&\textbf{GN}&\textbf{ReAct}&\textbf{GM}&\textbf{kNN}&\textbf{DICE}&\textbf{ASH}&\textbf{WM}&\textbf{KL-M}&\textbf{CIDER}&\textbf{IGE}&\textbf{DDE*}&\textbf{DDE-SM*}\\
% 
\toprule
ID Test $\uparrow$
&93&93&92&93&95&87&\textbf{98}&95&91&93&93&94&\underline{96}&92&95&94\\
\midrule
OOD Val. 
&50&48&49&47&69&\underline{40}&98&84&49&48&44&58&65&46&\textbf{31}&42\\
\toprule
SUN&65&60&63&61&47&47&98&72&35&22&\textbf{12}&75&75&57&\underline{18}&24\\
\midrule
Places&68&63&66&63&61&34&97&72&47&34&57&77&77&60&\textbf{10}&\underline{21}\\
\midrule
iNaturalist&55&62&63&61&50&20&96&65&26&\underline{12}&60&74&73&57&\textbf{11}&25\\
\midrule
Textures&68&96&81&81&61&47&43&69&32&\textbf{12}&61&95&84&96&\underline{15}&30\\
\toprule
Agriculture Crop&\underline{1}
&\textbf{0}&\underline{1}&2&82&2&100&81&9&\textbf{0}&2&16&72&\textbf{0}&\textbf{0}&\textbf{0}\\
\midrule
Animation
&42&33&40&30&94&21&100&66&38&30&29&59&69&27&\textbf{6}&\underline{19}\\
\midrule
Brain Tumors&36&26&34&21&99&16&100&69&31&20&14&54&74&20&\textbf{3}&\underline{4}\\
\midrule
Chest Xray
&22&15&20&13&67&11&100&71&21&13&\underline{7}&42&71&10&\textbf{4}&\underline{7}\\
\midrule
Faces in the Wild
&39&29&37&26&97&19&100&68&36&26&24&57&72&23&\textbf{9}&\underline{16}\\
\midrule
Fastfood
&70&64&68&62&91&47&97&70&64&60&59&79&77&60&\textbf{10}&\underline{18}\\
\midrule
Gemstone
&66&59&65&54&97&39&97&63&54&52&50&77&71&52&\textbf{4}&\underline{18}\\
\midrule
LEGO&11
&4&10&\underline{2}&97&3&100&73&11&3&\underline{2}&32&76&3&\textbf{0}&\textbf{0}\\
\midrule
Plant Diseases&27&20&26&18&95&15&100&67&30&18&17&49&72&14&\textbf{2}&\underline{3}\\
\midrule
USPS&38&27&36&18&97&12&100&62&26&18&12&55&69&18&\textbf{1}&\underline{3}\\
\midrule
Alzheimer's
&22&14&21&8&100&5&100&67&18&8&4&40&67&7&\textbf{1}&\underline{2}\\
\midrule
Blood Cells&16&11&14&13&79&13&100&72&22&10&6&37&70&9&\textbf{1}&\underline{2}\\
\midrule
Brand Logos&\textbf{0}&\textbf{0}&\textbf{0}&\textbf{0}&95&\textbf{0}&100&94&\textbf{0}&\textbf{0}&\textbf{0}&\underline{1}&74&\textbf{0}&\textbf{0}&\textbf{0}\\
\midrule
Captcha&\textbf{0}&\textbf{0}&\textbf{0}&\textbf{0}&100&\textbf{0}&100&100&\textbf{0}&\textbf{0}&\textbf{0}&\textbf{0}&100&\textbf{0}&\textbf{0}&\textbf{0}\\
\midrule
Cards&77&74&76&73&88&59&86&67&71&70&67&83&78&71&\textbf{11}&\underline{14}\\
\midrule
Arabic Handwritten Char. & 34&25&33&15&66&10&99&55&18&15&\textbf{4}&47&64&17&\textbf{4}&\underline{6}\\
\midrule
Chess Pieces&26&16&24&12&95&12&100&69&23&10&9&49&77&10&\textbf{1}&\underline{2}\\
\midrule
Chinese Fine Art&5&\underline{2}&4&4&89&6&100&79&15&\textbf{1}&3&28&77&\textbf{1}&\textbf{1}&\textbf{1}\\
\midrule
Coffee Beans&26&17&25&11&99&10&100&65&22&11&10&45&70&11&\textbf{1}&\underline{2}\\
\midrule
Colonoscopy&4&\textbf{1}&3&\underline{2}&97&\underline{2}&100&71&7&\textbf{1}&\textbf{1}&23&67&\textbf{1}&\textbf{1}&\underline{2}\\
\midrule
Covid CT Scans&26&18&24&14&95&\underline{11}&100&69&25&15&\underline{11}&48&70&12&\textbf{3}&\textbf{3}\\
\midrule
Diamonds&47&40&45&39&97&31&100&76&46&39&36&62&78&35&\textbf{3}&\underline{5}\\
\midrule
Emotional Faces&34&25&32&21&87&\underline{15}&100&68&30&21&16&52&71&18&\textbf{5}&16\\
\midrule
Human Eyes&39&31&37&27&97&20&100&66&35&26&24&57&69&24&\textbf{5}&\underline{9}\\
\midrule
Fire \& Smoke&\textbf{0}&\textbf{0}&\textbf{0}&\textbf{0}&81&\textbf{0}&100&91&\textbf{0}&\textbf{0}&\textbf{0}&\textbf{0}&\underline{72}&\textbf{0}&\textbf{0}&\textbf{0}\\
\midrule
English Handwritten Char.&26&18&25&10&69&8&99&55&16&9&9&39&60&11&\textbf{2}&\underline{3}\\
\midrule
Excavation&3&\underline{1}&2&\underline{1}&99&\underline{1}&100&79&6&\textbf{0}&\textbf{0}&17&68&\underline{1}&\textbf{0}&\textbf{0}\\
\midrule
Eyes&33&25&32&24&88&19&100&71&31&22&11&52&72&20&\textbf{3}&\underline{4}\\
\midrule
Handwritten Math Symbols&34&24&33&15&74&10&99&53&19&13&11&48&62&15&\textbf{1}&\underline{2}\\
\midrule
Bart and Homer&\textbf{0}&\textbf{0}&\textbf{0}&\textbf{0}&100&\textbf{0}&100&78&\underline{1}&\textbf{0}&\textbf{0}&11&72&\textbf{0}&\textbf{0}&\textbf{0}\\
\midrule
Indian Food&67&62&65&61&93&49&98&68&64&56&56&79&76&58&\textbf{13}&\underline{27}\\
\midrule
LEGO Minifigures&8&4&8&2&97&3&100&74&12&2&2&26&72&\underline{1}&\textbf{0}&\textbf{0}\\
\midrule
Licence Plates&\textbf{0}&\textbf{0}&\textbf{0}&\textbf{0}&100&\textbf{0}&100&94&\textbf{0}&\textbf{0}&\textbf{0}&\underline{2}&68&\textbf{0}&\textbf{0}&\textbf{0}\\
\midrule
Meat Quality&\underline{1}&\textbf{0}&\underline{1}&\textbf{0}&98&\textbf{0}&100&78&4&\underline{1}&\textbf{0}&10&57&\textbf{0}&\textbf{0}&\textbf{0}\\
\midrule
Monkeypox&67&64&65&64&77&50&96&65&65&62&52&77&74&61&\textbf{8}&\underline{12}\\
\midrule
Movie Posters&57&51&55&49&94&37&100&69&55&47&48&71&76&45&\textbf{14}&\underline{24}\\
\midrule
Ornamental Plants&20&13&18&14&84&14&100&75&26&12&14&40&69&10&\textbf{0}&\underline{1}\\
\midrule
Paintings&6&3&6&5&77&5&100&69&9&3&3&28&66&\underline{2}&\textbf{1}&4\\
\midrule
Pollen Grain&25&17&23&16&94&16&100&68&30&13&\underline{12}&50&73&13&\textbf{1}&\textbf{1}\\
\midrule
QR Codes&22&13&20&9&98&7&100&71&20&10&5&41&71&8&\textbf{1}&\underline{2}\\
\midrule
Railway Tracks&2&\underline{1}&2&\underline{1}&82&2&100&74&6&\textbf{0}&\underline{1}&20&68&\underline{1}&\underline{1}&\underline{1}\\
\midrule
Weed Crops&42&34&40&32&94&\underline{26}&100&72&40&31&\underline{26}&58&69&28&\textbf{4}&\textbf{4}\\
\midrule
YouTube Thumbnails&54&47&52&47&91&40&100&76&54&46&44&70&80&43&\textbf{5}&\underline{19}\\
\midrule
Weather&75&72&73&73&91&58&95&78&72&73&66&80&80&70&\textbf{14}&\underline{36}\\
\midrule
Sign Language&30&20&29&13&100&10&100&62&23&12&11&48&65&13&\textbf{1}&\underline{2}\\
\midrule
Stairs&\textbf{0}&\textbf{0}&\textbf{0}&\textbf{0}&69&\textbf{0}&100&88&\textbf{0}&\textbf{0}&\textbf{0}&\underline{1}&64&\textbf{0}&\textbf{0}&\textbf{0}\\
\midrule
Shells or Pebbles&77&74&75&74&83&59&91&69&72&71&71&83&76&71&\textbf{22}&\underline{33}\\
\toprule
\textbf{Summary Statistics}&32$\pm$25&27$\pm$25&31$\pm$25&25$\pm$25&87$\pm$13&18$\pm$18&98$\pm$8&72$\pm$9&28$\pm$21&20$\pm$21&20$\pm$22&46$\pm$25&72$\pm$7&23$\pm$25&\textbf{4$\pm$5}&\underline{8$\pm$10}\\
\toprule
\end{tabular}
\caption{
\csizeten
Evaluation results for OOD detection in WideResnet101 pretrained on Imagenet-1k using the metric FPR95~($\downarrow$). Best scores are shown in bold and the second best scores are underlined.}
\label{tab:wideresnet_ood}
\end{table*}

In the following, we present our empirical analysis for both the problem of OOD and ID detection.
    
\begin{figure*}
\centering
% 
\subfigure[Test sets vs ID set]{
\includegraphics[width=0.47\columnwidth]{ood_vary_ood_detection_rate_vs_num_iters.pdf}
\label{fig:ablation_test_sets}
}
% 
\subfigure[Vary Batch Size]{
\includegraphics[width=0.47\columnwidth]{ood_vary_ood_detection_rate_vs_num_iters__batch_size.pdf}
\label{fig:ablation_batch_size}
}
% 
\subfigure[Vary No. of Hidden Units]{
\includegraphics[width=0.47\columnwidth]{ood_vary_ood_detection_rate_vs_num_iters__hidden_size.pdf}
\label{fig:ablation_no_of_hidden_units}
}
% 
\subfigure[Vary Learning Rate]{
\includegraphics[width=0.47\columnwidth]{ood_vary_ood_detection_rate_vs_num_iters__lr.pdf}
\label{fig:ablation_lr}
}
\caption{
\csizeten
Ablation study for WideResnet101. OOD detection rate w.r.t. the number of batch updates performed for estimating KL-D is analyzed. Variation of OOD detection across all the test sets, and w.r.t. change in batch size, number of hidden units, learning rate is presented. FPR95 shown in each of the plots are for the default configuration only~(b=10k, h=512, lr=5e-4).}
\label{fig:ablation_hp}
\end{figure*}
    
\subsection{Out-of-Distribution Detection}
% 
\paragraph{Datasets \& Evaluation Settings}
We perform extensive empirical analysis for the problem of OOD detection in deep neural networks, WideResnet101 and ViT-L-16, pretrained on Imagenet-1k. Since there are various possible scenarios for observing OOD samples, we use an extensive list of 51 image datasets which are OOD w.r.t. Imagenet, including the previously benchmarked four datasets: SUN, Places, iNaturalist~(species), and Textures. All the images are rescaled to size $224\times224$, following the standard procedure for preprocessing as considered in the previous works. Following the procedure of simple perturbations proposed by \cite{hendrycksdeep2019}, we generate a validation set of OOD samples from ID samples in Imagenet. While a validation OOD set is optional for many of the methods for OOD detection including ours, we find it useful for reproducibility purposes~(reporting the validation accuracy) and for scenarios where multiple hyperparameter configurations of a method are equivalent if one were to only consider the standard criterion of 5\% ID samples being falsely identified as OOD~(corresponding to the evaluation metric FPR95) for hyperparameter tuning. See more details in the supplement.
    
\paragraph{Competitive Methods}
We compare our proposed approach of dual divergence estimation~(\emph{\textbf{DDE*}}) w.r.t. a comprehensive list of methods for OOD detection in pretrained DNNs~(see \secref{sec:rw_ood} for more details): (i) maximum softmax probability~(\emph{MSP}); (ii) maximum logit score~(\emph{MLS}); (iii) \emph{ODIN}; (iv) energy scores~(\emph{EBO}); (v) gradient norms~(\emph{GN}); (vi) Reactivation of representations~(\emph{ReAct}); (vii) Gaussian mixtures~(\emph{GM}); (viii) k-Nearest Neighbors~(\emph{kNN}); (ix) sparsifying weights~(\emph{DICE}); (x) sparsifying representations~(\emph{ASH}); (xi) watermarking~(\emph{WM}); (xii) KL-Matching~(\emph{KL-M}); (xiii) hyperspherical embeddings~(\emph{CIDER}); (xiv) information geometric approach of computing Fisher Rao distances between softmax distributions~(\emph{IGE}). Note that \emph{DDE*} by default employs the $\max$ function based cut point in the dual functional space whereas \emph{DDE-SM*} refers to the \emph{smooth max} function for the cut point, as detailed in \secref{sec:dde_ood}.
    
\subsubsection{Empirical Results}
% 
In Table~\ref{tab:wideresnet_ood}, we compare all the methods across 51 OOD test datasets for OOD detection in WideResnet101, using the standard and the most relevant evaluation metric, FPR95~($\downarrow$). We observe that the OOD detection rate varies highly across the methods and the datasets. While our methods, \emph{DDE*} and \emph{DDE-SM*}, manifest drastically lower FPR95 rates w.r.t. all the other methods, simple baselines such as \emph{ReAct} and \emph{WM} perform competitively. See the supplement for our analysis on OOD detection in ViT-L-16.
    
Furthermore, see \figref{fig:tpr_in_vs_ood_wideresnet} for the analysis on detecting OOD samples at the cost of falsely detecting ID samples as OOD. Note that in our approach \emph{DDE*}, unlike the other methods, the cut point~(threshold) for detecting OODs is fixed, and it gives a very low false detection rate in the ID set as such~(as desired). For obtaining a higher false detection rate with our method as required solely for the purpose of the analysis presented in \figref{fig:tpr_in_vs_ood_wideresnet}, we have to overfit the neural dual function by performing a very large number of batch updates~(which is not required for practical use of the detector). For the same reason, the curve for \emph{DDE*} hardly changes beyond a false detection rate of 0.06 on the X-axis. In contrast, in all the other methods, the threshold for the OOD detection score is manually tuned, since there is a trade-off between detecting OODs in ID vs OOD sets.
    
\begin{figure}
\centering
\includegraphics[width=0.9\columnwidth]{ood_vary_ood_detection_rate_id_vs_ood_sets.pdf}
\caption{
\csizeten
OOD detection rate is analyzed between ID set vs OOD sets, for WideResnet101. The ID set in the figure is the Imagenet test set and, for the OOD set, we take the average across all the 51 OOD test sets. Detection rate in OOD sets should be high at the minimal cost of falsely detecting samples in the ID set as OOD. This plot demonstrates the superiority of \emph{DDE*}~(our approach) w.r.t. all the competitive methods: it achieves an OOD detection rate of 0.6 while not falsely detecting any of the ID samples as OOD, and the detection rate in OOD sets increases sharply for a very small increase in the false detection of OODs within the ID set.}
\label{fig:tpr_in_vs_ood_wideresnet}
\end{figure}
        
In \figref{fig:ablation_hp},  we present the analysis from an extensive ablation study for our approach. First, in \figref{fig:ablation_test_sets}, we analyze how OOD detection rate increases in the ID set vs OOD test sets as we increase the number of batch updates of the weights of a neural KL-D estimator. As we discussed previously, a few hundred batch updates suffice in practice~(considering a large batch size of 10k) for convergence of the KL-D estimates whereas performing a large number of batch updates (in thousands) can force a neural estimator to start distinguishing even between similar samples leading to divergence of the KL-D estimates. Correspondingly, in \figref{fig:ablation_test_sets}, we see that OOD detection rate within the ID set remains close to zero for the first few hundred iterations of batch updates. It is only if we keep on increasing the number of batch updates that the estimator starts detecting OOD samples even within the ID set, with 5\% OOD detection rate in the ID set corresponding to the metric FPR95. In contrast, OOD detection rate across all the test OOD sets increases at a faster pace as it should be. 
    
\begin{table}[tp!]
\centering
\csize
\begin{tabular}{ll}
\toprule
\textbf{Methods}&Summary Statistics from All Test Sets\\
\toprule
% MSP&32$\pm$25\\
% \midrule
% MLS&27$\pm$25\\
% \midrule
% ODIN&31$\pm$25\\
% \midrule
% EBO&25$\pm$25\\
% \midrule
% GN&87$\pm$13\\
% \midrule
ReAct&18$\pm$18\\
\midrule
% GM&98$\pm$8\\
% \midrule
% kNN&72$\pm$9\\
% \midrule
% DICE&28$\pm$21\\
% \midrule
ASH&20$\pm$21\\
\midrule
WM&20$\pm$22\\
% \midrule
% KL-M&46$\pm$25\\
% \midrule
% CIDER&72$\pm$7\\
% \midrule
% IGE&23$\pm$25\\
% \midrule
\toprule
DDE*&4$\pm$5\\
\midrule
DDE-Online&5$\pm$6\\
\midrule
DDE-Mixed&10$\pm$12\\
\toprule
DDEv&12$\pm$17\\
\midrule
DDEvt10&9$\pm$11\\
\midrule
DDEvt20&7$\pm$8\\
\toprule
DDE-N30k&9$\pm$9\\
\midrule
DDE-N10k&14$\pm$11\\
% \midrule
% DDEN3k&13$\pm$10\\
\midrule
DDE-N1k&$15\pm 10$\\
\midrule
DDE-N100&31$\pm$14\\
\toprule
\end{tabular}
\caption{Evaluation of the variants of DDE* for OOD detection in WideResnet101 pretrained on Imagenet-1k using the metric FPR95~($\downarrow$).}
\label{tab:wideresnet_summary_stats}
\end{table}
    
In \figref{fig:ablation_batch_size}, we analyze mean OOD detection across the test sets w.r.t. the number of batch updates, while varying the batch size. We find that using a small batch size requires a larger number of batch updates. The curves for batch sizes 3k, 5k, 10k, and 30k are all alike, in contrast to lower batch sizes. From a theoretical standpoint, a larger batch size is advantageous for achieving lower variance in the estimation of KL-D~(note the zigzag in the curves for lower batch sizes). Note that the number of batch updates corresponding to FPR95 is different across the batch sizes and we only show FPR95 for the default batch size of 10k. In \figref{fig:ablation_no_of_hidden_units}, we vary the number of hidden units. For all the three hidden sizes, 512, 1024, 2048, our approach performs similarly. The results for varying the learning rate~(\figref{fig:ablation_lr}) are not surprising.
% 
Overall, this suggests that the effect of a hyperparameter on detection performance is intuitive and smooth.

In \tabref{tab:wideresnet_summary_stats}, we demonstrate competitiveness of the variants of DDE* w.r.t. the best of the baselines (ReAct, ASH, WM). ``DDE-Online'' refers to batch inference on a test set. ``DDE-Mixed'' is for the evaluation setting of augmenting each OOD test set with (3000) ID test samples.
% 
For analyzing generalization of the estimator, we optimize the dual function for estimating KL-divergence between the ID training set and the OOD validation set. Using this dual function, we perform OOD detection across all the OOD test sets. This highly compute efficient variant of our method is referred to as ``DDEv''. Optionally, we fine tune for a given test set using 10\% or 20\% of the original compute cost of our method (``DDEvt10'' and ``DDEvt20''). 
% 
We perform a new ablation study for our method (DDE*) by varying the sample size ($N$) on the Imagenet (ID) dataset; see ``DDE-N30k'', \dots, ``DDE-N100''. See the supplement for more details.
    
\subsection{In-Distribution Detection}

\begin{figure}[pt!]
\centering
\includegraphics[width=\columnwidth]{ap_legend.pdf}
\subfigure[ECG Activity]{
\includegraphics[width=0.85\columnwidth]{ap_ecg.pdf}
\label{fig:ecg_cl}}
\subfigure[US Security Prices Activity]{
\includegraphics[width=0.85\columnwidth]{ap_us.pdf}
\label{fig:stocks_cl}}
\caption{
\csizeten
On the x-axis in each plot, we index individual timeseries within a dataset. For each timeseries, all the methods are compared in terms of Average Precision metric~($\uparrow$).}
\label{fig:cl}
\end{figure}
    
\paragraph{Datasets and Evaluation Settings}
For the problem of ID detection, we consider the task of timeseries forecasting using two datasets, ECG Activity and US Security Price Activity. In the Security Price dataset of 1000 most liquid securities, given each of the 1000 securities, we augment the training set from the same security with (ID) samples from the historical past of the same security and of the other 999 securities.~(Results reported for only the first 100 least liquid securities.) Same applies to ECG dataset of 50 timeseries. We preprocess each timeseries to obtain \% change in activity w.r.t. the previous timestep. The task is of forecasting if the absolute value of \% change is beyond a certain threshold~(mean absolute value) given the knowledge of \% change in the previous 100 timesteps. Evaluation metric is Average Precision~(AP). See the supplement for more details.
    
\paragraph{Competitive Methods}
% 
Various (contextual) replay techniques from the literature of continual learning are relevant as baselines for our method \emph{DDE*}~(see \secref{sec:id_rw} for details): (i) no transfer of knowledge via data augmentation~(\emph{NT}); (ii) random selection~(\emph{Random}); (iii) Learning without Forgetting~(\emph{LwF} and \emph{LwF-Distill}); (iv) Dark Experience Replay~(\emph{DER} and \emph{DER++}); (v) Deep Generative Replay~(\emph{DGR}); (vi) Maximally Interfered Retrieval~(\emph{MIR}); (vii) Memorable Information Criterion~(\emph{MIC}).
% 
Besides, from the literature of domain adaptation on invariant representation learning, we compare to (viii) domain discrimination~(\emph{DA-DC}), and (ix) neural optimal transport~(\emph{DA-OT}).
        
\paragraph{Empirical Results}
% 
In the plots in \figref{fig:cl}, the X-axis represents indices of the individual timeseries in a given dataset. For each timeseries, we compare all the methods in terms of AP~($\uparrow$). For the ECG activity dataset, due to high interference between samples from different timeseries and high temporal dynamics within a single timeseries, we observe a very significant contrast between the methods. While \emph{DDE*}~(ours) provides consistently the highest AP across almost all the timeseries in the dataset, some of the other methods such as \emph{DER++} and \emph{MIR} are also competitive. For the dataset of US security price activity as well, \emph{DDE*} obtains the highest AP across all the timeseries, though the difference of AP across the methods is not as drastic as observed for the ECG activity dataset. Another interesting aspect is that \emph{DA-DC} is effective for price activity in contrast to the ECG dataset.
    
\section{Conclusions}
% 
In this paper, we tackle the highly impactful problem of OOD detection in pretrained DNNs. Our approach of OOD detection via dual divergence estimation is novel, principled, and highly efficient in practice. It enjoys theoretical guarantees owing to its foundations in information theory. While the approach is generic, one can employ a lightweight deep neural net as a dual function approximator for divergence estimation. Our extensive experimental evaluation shows that our approach is drastically superior to all the competitive methods. We also establish benchmarks for a large number of new OOD test datasets. 
% In addition, we observe that Vision Transformers, in contrast to Resnets, are poor at OOD detection. 
Moreover, we show that OOD detection is theoretically similar to ID detection, an underexplored problem with applications to continual learning and domain adaptation. 
% 
For this problem as well, we provide theoretical guarantees and show its competitiveness w.r.t. many baselines on datasets from the healthcare and finance domains.

% \subsubsection*{Author Contributions}

% \subsubsection*{Acknowledgements}
% The authors would like to thank, Irina Rish, Yikai Zhang, Songzhu Zheng, Umang Gupta, Guillermo Cecchi, Yeshaya Adler, Antonio Musumeci, for providing their valuable feedback.
    
\bibliography{references}
    
\end{document}
