% \documentclass{uai2023} % for initial submission
\documentclass[accepted]{uai2023} % after acceptance, for a revised
                                    % version; also before submission to
                                    % see how the non-anonymous paper
                                    % would look like
%% There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2023} % ptmx math instead of Computer
                                         % Modern (has noticable issues)
% \documentclass[mathfont=newtx]{uai2023} % newtx fonts (improves upon
                                          % ptmx; less tested, no support)
% NOTE: Only keep *one* line above as appropriate, as it will be replaced
%       automatically for papers to be published. Do not make any other
%       change above this note for an accepted version.

%% Choose your variant of English; be consistent
\usepackage[american]{babel}
% \usepackage[british]{babel}

%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}

% \usepackage{algorithm}
% \usepackage{algorithmic}

\usepackage{mathtools} % amsmath with fixes and additions
% \usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{tikz} % nice language for creating drawings and diagrams
% \newtheorem{theorem}{Theorem}
% \usepackage{thmtools} 
% \usepackage{thm-restate}

% \declaretheorem[name=Theorem]{theorem}
%% Provided macros
% \smaller: Because the class footnote size is essentially LaTeX's \small,
%           redefining \footnotesize, we provide the original \footnotesize
%           using this macro.
%           (Use only sparingly, e.g., in drawings, as it is quite small.)

%% Self-defined macros
% \newcommand{\swap}[3][-]{#3#1#2} % just an example

\usepackage{amsfonts}
% \usepackage{booktabs} % commands to create good-looking tables
\usepackage{subfigure}

\usepackage{float}

%\newcommand{\theHalgorithm}{\arabic{algorithm}}

% Font-size switches: each selects a fixed size with the baseline skip set
% equal to the font size (no extra leading).
% The % right after the opening brace stops the line end from becoming a
% space token inside the macro body, which would otherwise be typeset
% whenever the switch is used in running text.

% 8pt font on an 8pt baseline.
\newcommand{\csize}{%
  \fontsize{8}{8}\selectfont
}

% 9pt font on a 9pt baseline.
\newcommand{\csizenine}{%
  \fontsize{9}{9}\selectfont
}
% proofof{<result>}: proof environment with a bold run-in heading
% "Proof of <result>." and a right-flushed \Box as the end-of-proof mark.
% \textbf replaces the obsolete {\bf ...} font switch.
\newenvironment{proofof}[1]{\textbf{Proof of #1. }}{\hfill$\Box$}

% Further font-size switches; as above, the baseline skip equals the font
% size. The % after each opening brace keeps the line end from adding a
% space token to the macro body.

% 9.5pt font on a 9.5pt baseline.
\newcommand{\csizenineplus}{%
  \fontsize{9.5}{9.5}\selectfont
}

% 10pt font on a 10pt baseline.
\newcommand{\csizeten}{%
  \fontsize{10}{10}\selectfont
}

% 7pt font on a 7pt baseline.
\newcommand{\tabsize}{%
  \fontsize{7}{7}\selectfont
}

% 6.5pt font on a 6.5pt baseline.
\newcommand{\tsize}{%
  \fontsize{6.5}{6.5}\selectfont
}

% Calligraphic capitals, \cX -> \mathcal{X}. The extra brace pair makes each
% shorthand a single group, so it can be used directly as a sub-/superscript.
\newcommand*{\cA}{{\mathcal{A}}}
\newcommand*{\cB}{{\mathcal{B}}}
\newcommand*{\cC}{{\mathcal{C}}}
\newcommand*{\cD}{{\mathcal{D}}}
\newcommand*{\cG}{{\mathcal{G}}}
\newcommand*{\cH}{{\mathcal{H}}}
\newcommand*{\cI}{{\mathcal{I}}}
\newcommand*{\cK}{{\mathcal{K}}}
\newcommand*{\cM}{{\mathcal{M}}}
\newcommand*{\cN}{{\mathcal{N}}}
\newcommand*{\cO}{{\mathcal{O}}}
\newcommand*{\cP}{{\mathcal{P}}}
\newcommand*{\cR}{{\mathcal{R}}}
\newcommand*{\cS}{{\mathcal{S}}}
\newcommand*{\cT}{{\mathcal{T}}}
\newcommand*{\cU}{{\mathcal{U}}}
\newcommand*{\cV}{{\mathcal{V}}}
\newcommand*{\cX}{{\mathcal{X}}}
\newcommand*{\cY}{{\mathcal{Y}}}
\newcommand*{\cZ}{{\mathcal{Z}}}

% Bold P.
\newcommand*{\bP}{{\mathbf{P}}}

% Tightly spaced minus sign, and the plain symbol s.
\newcommand*{\newsetminus}{{\!-\!}}
\newcommand*{\cs}{s}

% Compound expressions: \cV with \cA removed (tight minus) and \cV with
% \cs removed (ordinary minus).
\newcommand*{\cVmA}{{\cV\newsetminus\cA}}
\newcommand*{\cVms}{{\cV-\cs}}

% Bold lowercase symbols set with \mathbf.
\newcommand{\ba}{{\mathbf{a}}}
\newcommand{\bb}{{\mathbf{b}}}
\newcommand{\bu}{{\mathbf{u}}}
\newcommand{\bx}{{\mathbf{x}}}
% \resid is an alias for \cR.
\newcommand{\resid}{\cR}

% Bold "NP".
\newcommand{\NP}{{\mathbf{NP}}}

% \DeclareMathOperator{\MIF}{MI} 

% One-argument bold wrappers: \bs{...} applies \boldsymbol, \mb{...} applies
% \mathbf.
\newcommand{\bs}[1]{\boldsymbol{#1}}
\newcommand{\mb}[1]{\mathbf{#1}}

% \cM with superscript h and subscript k.
\newcommand{\mhk}{\cM^h_k}

% Cross-reference helpers: each takes a label and prints the kind of object
% followed by a non-breaking space and its number.
% \eqnref uses \eqref, so the equation number is printed in parentheses,
% matching how displayed equations are tagged.
\newcommand*{\thmref}[1]{Theorem~\ref{#1}}
\newcommand*{\tabref}[1]{Table~\ref{#1}}
\newcommand*{\figref}[1]{Fig.~\ref{#1}}
\newcommand*{\eqnref}[1]{Eq.~\eqref{#1}}
\newcommand*{\secref}[1]{Sec.~\ref{#1}}
\newcommand*{\appref}[1]{Appendix~\ref{#1}}
\newcommand*{\prcref}[1]{Procedure~\ref{#1}}
\newcommand*{\assmref}[1]{Assumption~\ref{#1}}
\newcommand*{\crlref}[1]{Corollary~\ref{#1}}
\newcommand*{\algoref}[1]{Alg.~\ref{#1}}
\newcommand*{\prpref}[1]{Proposition~\ref{#1}}
\newcommand*{\cnjref}[1]{Conjecture~\ref{#1}}
\newcommand*{\axmref}[1]{Axiom~\ref{#1}}
\newcommand*{\lmaref}[1]{Lemma~\ref{#1}}

% Theorem-like environments.
% lemma and theorem each have their own counter.
\newtheorem{lemma}{Lemma}
\newtheorem{theorem}{Theorem}
% The following environments are numbered on the lemma counter, so they
% share one running sequence with lemmas.
\newtheorem{corollary}[lemma]{Corollary}
\newtheorem{procedure}[lemma]{Procedure}
\newtheorem{assumption}[lemma]{Assumption}
\newtheorem{claim}[lemma]{Claim}
\newtheorem{conclusion}[lemma]{Conclusion}
\newtheorem{proposition}[lemma]{Proposition}
\newtheorem{conjecture}[lemma]{Conjecture}
\newtheorem{axiom}[lemma]{Axiom}
\newtheorem{algo}[lemma]{Algorithm}
% definition and remark each have their own counter.
\newtheorem{definition}{Definition}
\newtheorem{remark}{Remark}

\title{In- or Out-of-Distribution Detection via Dual Divergence Estimation (Supplementary Material)}

% The standard author block has changed for UAI 2023 to provide
% more space for long author lists and allow for complex affiliations
%
% All author information is automatically removed by the class for the
% anonymous submission version of your paper, so you can already add your
% information below.
%
% Add authors
\author[1,*]{Sahil Garg}
\author[2]{Sanghamitra Dutta}
\author[1]{Mina Dalirrooyfard}
\author[1]{Anderson Schneider}
\author[1]{Yuriy Nevmyvaka}

% Add affiliations after the authors
\affil[1]{%
    Dept. of Machine Learning Research\\
    Morgan Stanley\\
    New York, New York, USA
}
\affil[2]{%
    Dept. of Electrical and Computer Engineering\\
    University of Maryland\\
    College Park, Maryland, USA
}
\affil[*]{%
Corresponding Author: sahil.garg@morganstanley.com, sahil.garg.cs@gmail.com
}

  
\begin{document}

\onecolumn
\maketitle

\section{Proofs}
\label{sec:proofs}

We first recall the definition of $\hat{D}(\mb{X} \| \mb{X}^{in})$ here.
\begin{align}\label{eq:dhat}
\hat{D}(\mb{X} \| \mb{X}^{in})
=
\max_{\hat{f}(\cdot) \in \cH}
\frac{1}{m}\sum_{\mb{x}_j \in \mb{X}}
\hat{f}(\mb{x}_j)
-
\log \sum_{\mb{x}^{in}_i \in \mb{X}^{in}}\!
e^{\hat{f}(\mb{x}^{in}_i)}
+ \log N.
% 
\end{align}
We define the value inside the max expression for any particular $\hat{f}$ as $\hat{D}_{\hat{f}}(\mb{X} \| \mb{X}^{in})$:
\begin{align}\label{eq:dhatforf}
\hat{D}_{\hat{f}}(\mb{X} \| \mb{X}^{in})
=
\frac{1}{m}\sum_{\mb{x}_j \in \mb{X}}
\hat{f}(\mb{x}_j)
-
\log \sum_{\mb{x}^{in}_i \in \mb{X}^{in}}\!
e^{\hat{f}(\mb{x}^{in}_i)}
+ \log N.
% 
\end{align}

% We first note the notation we are going to use for our proofs. Note that to compute the KL-D,  we can assign sets of data points to the distributions and hence compute the expectations w.r.t. distributions by evaluating the functions on data points from the corresponding sets. Considering this, the KL-D expression can be written as below for two subsets of data $P$ and $Q$. Let $n_P$ and $n_Q$ be the size of $P$ and $Q$ respectively. 
% \begin{equation}
% % 
% \hat{\cD}(P||Q) 
% = \max_{f(.)}
% \frac{1}{n_P} \sum_{i \in P} f(\mathbf{x}_i) - \log \frac{1}{n_Q} \sum_{j \in Q} \exp {f(\mathbf{x}_j)}\label{eq:obj-exact}
% % 
% \end{equation}
% For simplicity let $\overline{f^P}=\frac{1}{n_P} \sum_{i \in P} f(\mathbf{x}_i)$ and $\overline{e^{f^Q}}=\frac{1}{n_Q} \sum_{j \in Q} \exp {f(\mathbf{x}_j)}$ for any real-valued function $f$ and any subsets $P$ and $Q$. 

%\subsection{}

% \textbf{Reminder of Theorem \ref{thm:replay-pres-same-dist}}
% \textit{Consider the DV representation of the present data $(\boldsymbol{\cX}^{p}, \cY^p)$ and the historical past $(\boldsymbol{\cX}^{h}, \cY^h)$, and consider a histogram of width $d$ on this representation. The number of present data points in each bin forms a distribution.
% % 
% % \todo{which distribution, present or past, Mina?}
% % 
% If we sample past data from each bin with respect to this distribution to obtain the replay sample $(\boldsymbol{\cX}^{r}, \cY^r)$, then the KL-D between present data and replay sample is $O(d)$ where $d$ is the width of the histogram bins.
% % 
% More formally, }

% \begin{align}
% \hat{\cD}_{kl}((\boldsymbol{\cX}^{p}, \cY^p)||(\boldsymbol{\cX}^{r}, \cY^r))\le O(d)
% \\
% % 
% \nonumber
% % 
% \hat{\cD}_{kl}((\boldsymbol{\cX}^{r}, \cY^r)||(\boldsymbol{\cX}^{p}, \cY^p))\le O(d)
% \end{align}


% 
% \todo{KL-D is supposed to be positive, why absolute value symbol, Mina?}
% 
%Sampling from the histograms bins in DV space, the ones which have non-zero density for the present, is good representative of desired distribution.
% \todo{Talk to Mina}
% 
% \todo{What is good distribution, present?}
% 
% Our approach for this proof is to provide some sort of upper bound KL-D between samples and the present distribution.
% 
%Since KL-D is expressed in terms of DV representation itself, and DV is optimized in this case, it should be straightforward to compute KL-D between selected samples for replay and the present observations.
% 




\subsection{}

% 
% \textbf{Reminder of Lemma \ref{thm:min}}
% \textit{
% % 
% Consider the DV representation of the present $(\boldsymbol{\cX}^{p}, \cY^p)$ and past data $(\boldsymbol{\cX}^{h}, \cY^h)$ estimating  ${\cD}_{kl}((\boldsymbol{\cX}^{h}, \cY^h)||(\boldsymbol{\cX}^{p}, \cY^p))$.
% % is maximized for any episode $(\boldsymbol{\cX}^{h}, \cY^h)$ of the past data.
% There is no past data point with value lower than all the present data points in the DV representation.
% %
% %Suppose that the minimum value of present data and past data in this representation is $m_p$ and $m_h$ respectively. Then we must have $m_p\le m_h$.
% % 
% }

\begin{proofof}{Theorem 1}

Let $m = \min_{\mb{x}_i^{in} \in \mb{X}^{in}}
\hat{f}^*(\mb{x}^{in}_i)$. By way of contradiction, assume that there is $\mb{x}_j \in \mb{X}$ such that
$\hat{f}^*(\mb{x}_j)
<
m$. Define the function $\bar{f}$ as follows. For any $\mb{x}_i\in \mb{X}$,
\[
\bar{f}(\mb{x}_i) =
\begin{cases}
	\hat{f}^*(\mb{x}_i) & \text{if } \hat{f}^*(\mb{x}_i) \geq m, \\
	(\hat{f}^*(\mb{x}_i)+m)/2 & \text{otherwise.}
\end{cases}
\]
We show that $\hat{D}_{\bar{f}}(\mb{X}\|\mb{X}^{in})>\hat{D}_{\hat{f}^*}(\mb{X}\|\mb{X}^{in})$. To see this, first note that for any $\mb{x}_i^{in}\in \mb{X}^{in}$, $\bar{f}(\mb{x}_i^{in})=\hat{f}^*(\mb{x}_i^{in})$. So we have 
\begin{equation}\label{eq:helper1}
    \log \sum_{\mb{x}^{in}_i \in \mb{X}^{in}}
\frac{e^{\bar{f}(\mb{x}^{in}_i)}}{|\mb{X}^{in}|} = \log \sum_{\mb{x}^{in}_i \in \mb{X}^{in}}
\frac{e^{\hat{f}^*(\mb{x}^{in}_i)}}{|\mb{X}^{in}|}.
\end{equation}
Moreover, we have that for any $\mb{x}_i\in \mb{X}$, $\bar{f}(\mb{x}_i)\ge \hat{f}^*(\mb{x}_i)$. This is because if $\hat{f}^*(\mb{x}_i)\ge m$, then $\bar{f}(\mb{x}_i)= \hat{f}^*(\mb{x}_i)$ and if $\hat{f}^*(\mb{x}_i)< m$, then $\bar{f}(\mb{x}_i)= (m+\hat{f}^*(\mb{x}_i))/2 > \hat{f}^*(\mb{x}_i)$. Since there is at least one $\mb{x}_j$ such that $\hat{f}^*(\mb{x}_j)< m$, we have that 
\begin{equation}\label{eq:helper2}
\sum_{\mb{x}_j \in \mb{X}}
\frac{\bar{f}(\mb{x}_j)}{|\mb{X}|}>\sum_{\mb{x}_j \in \mb{X}}
\frac{\hat{f}^*(\mb{x}_j)}{|\mb{X}|}.\end{equation}
By Eqs.~\eqref{eq:helper1} and~\eqref{eq:helper2} and the definition of $\hat{D}_f(\mb{X}\|\mb{X}^{in})$, we have that $\hat{D}_{\bar{f}}(\mb{X}\|\mb{X}^{in})>\hat{D}_{\hat{f}^*}(\mb{X}\|\mb{X}^{in})$. Since $\hat{D}_{\hat{f}^*}(\mb{X}\|\mb{X}^{in}) = \hat{D}(\mb{X}\|\mb{X}^{in})=\max_{f \in \cH}\hat{D}_f(\mb{X}\|\mb{X}^{in})$, this is a contradiction. So for all $\mb{x}_j\in \mb{X}$, we have $\hat{f}^*(\mb{x}_j)\ge m$.
%
% Let $\boldsymbol{f}:f_{h\to p}$ be the neural function defining the DV representation and hence optimizing %$\cD_{kl}((\boldsymbol{\cX}^{h}, \cY^h)||(\boldsymbol{\cX}^{p}, \cY^p))$.
% $\hat{D}(H||P)$, where $H$ is the set of historical data and $P$ is the set of present data (recall that distributions wrt which KL-D is obtained are empirical distributions of these sets).
% This means that
% %$$
% %\cD_{kl}((\boldsymbol{\cX}^{h}, \cY^h)||(\boldsymbol{\cX}^{p}, \cY^p)) = \sup_f \overline{f^h} - \log \overline{e^{f^p}} = \overline{\boldsymbol{f}^h} - \log \overline{e^{\boldsymbol{f}^p}}
% %$$ 
% $$
% \hat{D}_{kl}(H||P) = \sup_f \overline{f^H} - \log \overline{e^{f^P}} = \overline{\boldsymbol{f}^H} - \log \overline{e^{\boldsymbol{f}^P}}
% $$ 
% Let $m_H$ be the minimum value $\boldsymbol{f}$ assigns to a historical data point and let $m_P$ be the minimum value $\boldsymbol{f}$ assigns to a present data point. If $m_H<m_P$, then we show that we can find a continuous function $g$ such that $\overline{\boldsymbol{f}^P} - \log \overline{e^{\boldsymbol{f}^H}}<\overline{g^P} - \log \overline{e^{g^H}}$, which is a contradiction.
%
% Define $g$ as follows: 
% $$
% g(x) =
% \left\{
% 	\begin{array}{ll}
% 		\boldsymbol{f}(x)  & \mbox{if } f(x) \geq m_P \\
% 		(\boldsymbol{f}(x)+m_P)/2 & \mbox{otherwise}
% 	\end{array}
% \right.
% $$
%
% We have that $\overline{e^{\boldsymbol{f}^P}}=\overline{e^{g^P}}$ and $\overline{g^H} > \overline{\boldsymbol{f}^H}$, so $\overline{\boldsymbol{f}^P} - \log \overline{e^{\boldsymbol{f}^H}}<\overline{g^P} - \log \overline{e^{g^H}}$. Thus we must have that $m_H\ge m_P$.
\end{proofof}


\subsection{}

\begin{proofof}{Theorem 2}
By way of contradiction assume that 
$\max_{\mb{x}_j \in \mb{X}}
\hat{f}^*(\mb{x}_j) < \max_{\mb{x}_i^{in}\in \mb{X}^{in}}
\hat{f}^*(\mb{x}^{in}_i)$. Let $m= \max_{\mb{x}_j \in \mb{X}}
\hat{f}^*(\mb{x}_j)$. Define $f_0(x)=\min(\hat{f}^*(x),m)$. Thus, $f_0(x)\leq \hat{f}^*(x)$ for all $x$, with strict inequality $f_0(\mb{x}_i^{in}) < \hat{f}^*(\mb{x}_i^{in})$ for at least one $\mb{x}_i^{in}\in \mb{X}^{in}$.

We will now show that $\hat{D}_{f_0}(\mb{X}\|\mb{X}^{in})>\hat{D}_{\hat{f}^*}(\mb{X}\|\mb{X}^{in})$. To see this, note that:
\begin{equation}
    \log \sum_{\mb{x}^{in}_i \in \mb{X}^{in}}
\frac{e^{f_0(\mb{x}^{in}_i)}}{|\mb{X}^{in}|} < \log \sum_{\mb{x}^{in}_i \in \mb{X}^{in}}
\frac{e^{\hat{f}^*(\mb{x}^{in}_i)}}{|\mb{X}^{in}|}.
\end{equation}
This leads to:
\begin{align}
\hat{D}_{f_0}(\mb{X}\|\mb{X}^{in}) & = \sum_{\mb{x}_j \in \mb{X}} \frac{f_0(\mb{x}_j)}{|\mb{X}|} - \log \sum_{\mb{x}^{in}_i \in \mb{X}^{in}}
\frac{e^{f_0(\mb{x}^{in}_i)}}{|\mb{X}^{in}|} \nonumber\\
&> \sum_{\mb{x}_j \in \mb{X}} \frac{f_0(\mb{x}_j)}{|\mb{X}|} - \log \sum_{\mb{x}^{in}_i \in \mb{X}^{in}}
\frac{e^{\hat{f}^*(\mb{x}^{in}_i)}}{|\mb{X}^{in}|} \nonumber \\
& = \sum_{\mb{x}_j \in \mb{X}} \frac{\hat{f}^*(\mb{x}_j)}{|\mb{X}|} - \log \sum_{\mb{x}^{in}_i \in \mb{X}^{in}}
\frac{e^{\hat{f}^*(\mb{x}^{in}_i)}}{|\mb{X}^{in}|} \text{  since $\hat{f}^*(\mb{x}_j)\leq m$ for all $\mb{x}_j \in \mb{X}$ making $\hat{f}^*(\mb{x}_j)=f_0(\mb{x}_j)$ } \nonumber \\
& = \hat{D}_{\hat{f}^*}(\mb{X}\|\mb{X}^{in}).
\end{align}
%
This is a contradiction since $\hat{D}_{\hat{f}^*}(\mb{X}\|\mb{X}^{in}) = \hat{D}(\mb{X}\|\mb{X}^{in})=\max_{f \in \cH}\hat{D}_f(\mb{X}\|\mb{X}^{in})$. So, we have $\max_{\mb{x}_j \in \mb{X}}
\hat{f}^*(\mb{x}_j) \geq \max_{\mb{x}_i^{in}\in \mb{X}^{in}}
\hat{f}^*(\mb{x}^{in}_i)$.
\end{proofof}

\subsection{}

\begin{proofof}{Theorem 3}
By the definition of $\mb{X}^{ood}$, since for any $\mb{x}_j\in \mb{X}^{ood}$ we have $\hat{f}^*(\mb{x}_j) > \log \sum_{\mb{x}_i^{in} \in \mb{X}^{in}} e^{\hat{f}^*(\mb{x}_i^{in})}$, we obtain $\frac{1}{m}\sum_{\mb{x}_j \in \mb{X}^{ood}}
\hat{f}^*(\mb{x}_j) > \log \sum_{\mb{x}_i^{in} \in \mb{X}^{in}} e^{\hat{f}^*(\mb{x}_i^{in})}$,
and thus by Eq.~\eqref{eq:dhatforf} we get that $\hat{D}_{\hat{f}^*}(\mb{X}^{ood}\|\mb{X}^{in}) >\log{N}$. Now since $\hat{D}(\mb{X}^{ood}\|\mb{X}^{in}) = \max_{\hat{f} \in \cH} \hat{D}_{\hat{f}}(\mb{X}^{ood}\|\mb{X}^{in})$, we have that $\hat{D}(\mb{X}^{ood}\|\mb{X}^{in})\ge \hat{D}_{\hat{f}^*}(\mb{X}^{ood}\|\mb{X}^{in})>\log{N}$. Note that the function attaining the maximum in $\hat{D}(\mb{X}^{ood}\|\mb{X}^{in})$ is not necessarily $\hat{f}^*$, and we do not make such an assumption.

\end{proofof}
\subsection{}

% \begin{proofof}{Theorem \ref{thm:replay-pres-same-dist}}
% % 
% % \todo{SM and Anant are working on it to complete the proof while Mina took care of it if f were assumed to be fixed. Our intuition for the proof is that it is OOD samples in the test set which push f towards the right hand side as much as possible. Having removed OOD samples, there is no reason that f(.) can increased any further for detected ID samples.}
% % 

% Let $\hat{f}^*$ be the function that attains maximum in eq \ref{eq:dhat}, i.e. $\hat{D}(\mb{X}\|\mb{X}^{in})=\max\hat{D}_{\hat{f}^*}(\mb{X}\|\mb{X}^{in})$. By Lemma \ref{lem:helperthm} we know that $\hat{D}(\mb{\bar{X}}\|\mb{X}^{in})=\hat{D}_{\hat{f}^*}(\mb{\bar{X}}\|\mb{X}^{in})$. First we show that $\hat{D}_{\hat{f}^*}(\mb{\bar{X}}\|\mb{X}^{in})-\hat{D}_{\hat{f}^*}(\mb{X}^{in}\|\mb{X}^{in})\le d$. Then since $\hat{D}_{\hat{f}^*}(\mb{X}^{in}\|\mb{X}^{in})\le \hat{D}(\mb{X}^{in}\|\mb{X}^{in})$, we obtain that $\hat{D}(\mb{\bar{X}}\|\mb{X}^{in})-\hat{D}(\mb{X}^{in}\|\mb{X}^{in})\le d$. Since $\hat{D}(\mb{X}^{in}\|\mb{X}^{in})=0$, we have $\hat{D}(\mb{\bar{X}}\|\mb{X}^{in})\le d$.

% By the way we sample $\mb{\bar{X}}$, there is $\alpha>0$ such that for each bin $B$, if $n_B$ is the number of data points $\mb{x}^{in}\in \mb{X}^{in}$ in such that $\hat{f}^*(\mb{x}^{in})\in B$, then we select $\alpha n_B$ points $\mb{x}\in \mb{X}\setminus \mb{X}^{in}$ with $\hat{f}^*(\mb{x})\in B$. %For any point $\mb{x}\in \mb{X}$ where $\hat{f}^*(\mb{x})\in B$, we say that $x$ is in bin $B$. 
% Note that any two points $\mb{x},\mb{x'}$ with $\hat{f}^*$ values in the same bin, we have $|\hat{f}^*(\mb{x})-\hat{f}^*(\mb{x'})|\le d$. So we have that for each bin $B$ and for any points $\mb{x}\in \mb{\bar{X}},\mb{x}^{in}\in \mb{X}^{in}$ such that $\hat{f}^*(\mb{x}),\hat{f}^*(\mb{x}^{in})\in B$, we have $\hat{f}^*(\mb{x})\le \hat{f}^*(\mb{x}^{in})+d$. So  
% $\sum_{\hat{f}^*(\mb{x})\in B}\hat{f}^*(\mb{x})\le \alpha \sum_{\hat{f}^*(\mb{x}^{in})\in B}(\hat{f}^*(\mb{x}^{in})+d)$. Using the fact that $\alpha|\mb{X}^{in}|= |\mb{\bar{X}}|$ and the above inequality, we have
% $$
% \frac{1}{|\mb{\bar{X}}|}\sum_{\mb{x}\in \mb{\bar{X}}}\hat{f}^*(\mb{x}) = \frac{1}{|\mb{\bar{X}}|}\sum_B \sum_{\hat{f}^*(\mb{x})\in B}\hat{f}^*(\mb{x}) \le \frac{1}{|\mb{\bar{X}}|}\sum_B \alpha\sum_{\hat{f}^*(\mb{x}^{in})\in B} (\hat{f}^*(\mb{x}^{in})+d) = d+\frac{1}{|\mb{{X}}^{in}|}\sum_{\mb{x}^{in}\in \mb{{X}}^{in}}\hat{f}^*(\mb{x}^{in})
% $$
% So from the definition of $\hat{D}_{\hat{f}^*}(\mb{\bar{X}}\|\mb{X}^{in})$ and $\hat{D}_{\hat{f}^*}(\mb{X}^{in}\|\mb{X}^{in})$, we have that $\hat{D}_{\hat{f}^*}(\mb{\bar{X}}\|\mb{X}^{in}) \le \hat{D}_{\hat{f}^*}(\mb{X}^{in}\|\mb{X}^{in})+d$ and this finishes the proof.
% %
% % First we consider present $\to$ past direction in computing KL-D. We show present data points as a set $P$ and historical past data points as $H$. For any subset $T$ of the historical past, we show the KL divergence between present and this subset of the past by $\hat{D}_{kl}(P||T)$.
% % Recall that we can assume there is a neural net function $\boldsymbol{f}:f_{P\to T}$ optimizing %$\cD_{kl}((\boldsymbol{\cX}^{p}, \cY^p)||(\boldsymbol{\cX}^{t}, \cY^t))$
% % $\hat{D}_{kl}(P||T)$
% % for episodes %$(\boldsymbol{\cX}^{t}, \cY^t)\subseteq (\boldsymbol{\cX}^{h}, \cY^h)$ 
% % $T\subseteq H$ of the past data. Formally we assume that
% % %$$
% % %\cD_{kl}((\boldsymbol{\cX}^{p}, \cY^p)||(\boldsymbol{\cX}^{t}, \cY^t)) = \overline{\boldsymbol{f}^p} - \log \overline{e^{\boldsymbol{f}^t}}
% % %$$
% % $$
% % \hat{D}_{kl}(P||T) = \overline{\boldsymbol{f}^P} - \log \overline{e^{\boldsymbol{f}^T}}
% % $$
% % For $T=P$, we have that $\hat{D}_{kl}(P||P)$ is near zero since the KL-D between $P$ and $P$ is zero. So we have 
% % %
% % %0=\cD_{kl}((\boldsymbol{\cX}^{p}, \cY^p)||(\boldsymbol{\cX}^{p}, \cY^p)) = \overline{\boldsymbol{f}^p} - \log \overline{e^{\boldsymbol{f}^p}}
% % %$$
% % $$
% % \hat{D}_{kl}(P||P) = \overline{\boldsymbol{f}^P} - \log \overline{e^{\boldsymbol{f}^P}} = O(1)
% % $$
% % So $\overline{\boldsymbol{f}^P} \le \log \overline{e^{\boldsymbol{f}^P}} + O(1)$ .
% % %be the function DV representation of the  

% % Now using the fact that the replay samples $R$ are taken with respect to the present bins distribution,
% % we show that $|\log \overline{e^{\boldsymbol{f}^R}}-\log \overline{e^{\boldsymbol{f}^P}}|\le d$. There is $\alpha>0$ such that for each present bin $B$, if $B$ has $n_B$ present data points we sample $\alpha n_B$ past points in $B$. Moreover for any present data $x_P$ and replay data $x_R$ in $B$ we have $x_P-d\le x_R\le x_P+d$. Let $B^R$ be the set of replay samples in $B$ and $B^P$ be the set of present samples in $B$. So 

% % $$
% % \alpha e^{-d}\sum_{x\in B^P}e^{\boldsymbol{f}(x)} \le \alpha \sum_{x\in B^P}e^{\boldsymbol{f}(x-d)} \le
% % \sum_{x\in B^R}e^{\boldsymbol{f}(x)} \le 
% % \alpha \sum_{x\in B^P}e^{\boldsymbol{f}(x+d)} \le \alpha e^d \sum_{x\in B^P}e^{\boldsymbol{f}(x)} 
% % $$
% % So we have 
% % $
% % \overline{e^{\boldsymbol{f}^P}}\cdot e^{-d}\le \overline{e^{\boldsymbol{f}^R}} \le \overline{e^{\boldsymbol{f}^P}}\cdot e^d.
% % $
% % And so $|\log \overline{e^{\boldsymbol{f}^R}}-\log \overline{e^{\boldsymbol{f}^P}}|\le d$. This means that 
% % %$$0\le \cD_{kl}((\boldsymbol{\cX}^{p}, \cY^p)||(\boldsymbol{\cX}^{r}, \cY^r))= \overline{\boldsymbol{f}^p} - \log \overline{e^{\boldsymbol{f}^r}} \le \overline{\boldsymbol{f}^p} - \log \overline{e^{\boldsymbol{f}^p}}+d\le d
% % %$$
% % $$0\le \hat{D}_{kl}(P||R)= \overline{\boldsymbol{f}^P} - \log \overline{e^{\boldsymbol{f}^R}} \le \overline{\boldsymbol{f}^P} - \log \overline{e^{\boldsymbol{f}^P}}+d+O(1)\le O(d)
% % $$
% % The replay $\to$ present direction is very similar, we note it here for completeness.

% % Again there is a neural net function $\boldsymbol{g}: f_{R\to P}$ optimizing $\hat{D}(T||P)$ for episodes $T\subseteq H$ of the past data. This means:
% % $$
% % \hat{D}_{kl}(T||P) = \overline{\boldsymbol{g}^T} - \log \overline{e^{\boldsymbol{g}^P}}
% % $$
% % For $T=P$, we have 
% % $$
% % \hat{D}_{kl}(P||P) = \overline{\boldsymbol{g}^P} - \log \overline{e^{\boldsymbol{g}^P}} = O(1)
% % $$
% % So $\overline{\boldsymbol{g}^P} \le \log \overline{e^{\boldsymbol{g}^P}}+O(1)$. For every bin $B$, we have that the difference in value between any present sample and replay sample is at most $d$, and hence 
% % $
% % |\overline{\boldsymbol{g}^P}-\overline{\boldsymbol{g}^R}|\le d
% % $
% % So we have 
% % $$0\le \hat{D}_{kl}(R||P)= \overline{\boldsymbol{g}^R} - \log \overline{e^{\boldsymbol{g}^P}} \le \overline{\boldsymbol{g}^P} - \log \overline{e^{\boldsymbol{g}^P}}+d+O(1)\le O(d)
% % $$
% % %
% % %and $|\boldsymbol{f^{p}}-\boldsymbol{f^{r}}|\le d$. This is because  
% % %
% \end{proofof}

% \begin{theorem}
% \label{thm:replay-pres-same-dist}
% % 
% Consider the DV representation of the present data $(\boldsymbol{\cX}^{p}, \cY^p)$ and the historical past $(\boldsymbol{\cX}^{h}, \cY^h)$, and consider a histogram of width $d$ on this representation. The number of present data points in each bin forms a distribution.
% % 
% If we sample past data from each bin with respect to this distribution to obtain the replay sample $(\boldsymbol{\cX}^{r}, \cY^r)$, then the KL-D between present data and replay sample is $O(d)$ where $d$ is the width of the histogram bins.
% % 
% More formally, }

% \begin{align}
% \hat{\cD}_{kl}((\boldsymbol{\cX}^{p}, \cY^p)||(\boldsymbol{\cX}^{r}, \cY^r))\le O(d)
% \\
% % 
% \nonumber
% % 
% \hat{\cD}_{kl}((\boldsymbol{\cX}^{r}, \cY^r)||(\boldsymbol{\cX}^{p}, \cY^p))\le O(d)
% \end{align}
% % 
% \end{theorem}
% 
\begin{proofof}{Theorem 4}
% 
First we consider the present $\to$ past direction in computing the KL-D. We denote the present data points by a set $P$ and the historical past data points by $H$. For any subset $T$ of the historical past, we denote the KL divergence between the present and this subset of the past by $\hat{D}_{kl}(P||T)$.
Recall that we can assume there is a neural net function $\boldsymbol{f}:f_{P\to T}$ optimizing %$\cD_{kl}((\boldsymbol{\cX}^{p}, \cY^p)||(\boldsymbol{\cX}^{t}, \cY^t))$
$\hat{D}_{kl}(P||T)$
for episodes %$(\boldsymbol{\cX}^{t}, \cY^t)\subseteq (\boldsymbol{\cX}^{h}, \cY^h)$ 
$T\subseteq H$ of the past data. Formally we assume that
%$$
%\cD_{kl}((\boldsymbol{\cX}^{p}, \cY^p)||(\boldsymbol{\cX}^{t}, \cY^t)) = \overline{\boldsymbol{f}^p} - \log \overline{e^{\boldsymbol{f}^t}}
%$$
$$
\hat{D}_{kl}(P||T) = \overline{\boldsymbol{f}^P} - \log \overline{e^{\boldsymbol{f}^T}}
$$
For $T=P$, we have that $\hat{D}_{kl}(P||P)$ is near zero since the KL-D between $P$ and $P$ is zero. So we have 
%
%0=\cD_{kl}((\boldsymbol{\cX}^{p}, \cY^p)||(\boldsymbol{\cX}^{p}, \cY^p)) = \overline{\boldsymbol{f}^p} - \log \overline{e^{\boldsymbol{f}^p}}
%$$
$$
\hat{D}_{kl}(P||P) = \overline{\boldsymbol{f}^P} - \log \overline{e^{\boldsymbol{f}^P}} = O(1)
$$
So $\overline{\boldsymbol{f}^P} \le \log \overline{e^{\boldsymbol{f}^P}} + O(1)$.
    
Now using the fact that the replay samples $R$ are taken with respect to the present bins distribution,
we show that $|\log \overline{e^{\boldsymbol{f}^R}}-\log \overline{e^{\boldsymbol{f}^P}}|\le d$. There is $\alpha>0$ such that for each present bin $B$, if $B$ has $n_B$ present data points we sample $\alpha n_B$ past points in $B$. Moreover for any present data $x_P$ and replay data $x_R$ in $B$ we have $x_P-d\le x_R\le x_P+d$. Let $B^R$ be the set of replay samples in $B$ and $B^P$ be the set of present samples in $B$. So 

$$
\alpha e^{-d}\sum_{x\in B^P}e^{\boldsymbol{f}(x)} \le \alpha \sum_{x\in B^P}e^{\boldsymbol{f}(x-d)} \le
\sum_{x\in B^R}e^{\boldsymbol{f}(x)} \le 
\alpha \sum_{x\in B^P}e^{\boldsymbol{f}(x+d)} \le \alpha e^d \sum_{x\in B^P}e^{\boldsymbol{f}(x)} 
$$
So we have 
$
\overline{e^{\boldsymbol{f}^P}}\cdot e^{-d}\le \overline{e^{\boldsymbol{f}^R}} \le \overline{e^{\boldsymbol{f}^P}}\cdot e^d.
$
And so $|\log \overline{e^{\boldsymbol{f}^R}}-\log \overline{e^{\boldsymbol{f}^P}}|\le d$. This means that 
%$$0\le \cD_{kl}((\boldsymbol{\cX}^{p}, \cY^p)||(\boldsymbol{\cX}^{r}, \cY^r))= \overline{\boldsymbol{f}^p} - \log \overline{e^{\boldsymbol{f}^r}} \le \overline{\boldsymbol{f}^p} - \log \overline{e^{\boldsymbol{f}^p}}+d\le d
%$$
$$0\le \hat{D}_{kl}(P||R)= \overline{\boldsymbol{f}^P} - \log \overline{e^{\boldsymbol{f}^R}} \le \overline{\boldsymbol{f}^P} - \log \overline{e^{\boldsymbol{f}^P}}+d+O(1)\le O(d)
$$
The replay $\to$ present direction is very similar; we include it here for completeness.

Again there is a neural net function $\boldsymbol{g}: f_{R\to P}$ optimizing $\hat{D}_{kl}(T||P)$ for episodes $T\subseteq H$ of the past data. This means:
$$
\hat{D}_{kl}(T||P) = \overline{\boldsymbol{g}^T} - \log \overline{e^{\boldsymbol{g}^P}}
$$
For $T=P$, we have 
$$
\hat{D}_{kl}(P||P) = \overline{\boldsymbol{g}^P} - \log \overline{e^{\boldsymbol{g}^P}} = O(1)
$$
So $\overline{\boldsymbol{g}^P} \le \log \overline{e^{\boldsymbol{g}^P}}+O(1)$. For every bin $B$, we have that the difference in value between any present sample and replay sample is at most $d$, and hence 
$
|\overline{\boldsymbol{g}^P}-\overline{\boldsymbol{g}^R}|\le d.
$
So we have 
$$0\le \hat{D}_{kl}(R||P)= \overline{\boldsymbol{g}^R} - \log \overline{e^{\boldsymbol{g}^P}} \le \overline{\boldsymbol{g}^P} - \log \overline{e^{\boldsymbol{g}^P}}+d+O(1)\le O(d)
$$
%
%and $|\boldsymbol{f^{p}}-\boldsymbol{f^{r}}|\le d$. This is because  
%
\end{proofof}

\begin{figure}
    \centering
    \includegraphics{histogram_bin_ex.eps}
    \caption{DV representation of data points in one dimension. The grey points represent past data and the red points represent present data. Observe that if we remove all the points in $B_4,\ldots,B_8$, the mean value of past data points will decrease.}
    \label{fig:hist_bin_ex}
\end{figure}



% %\textcolor{blue}{Lemma added by SD to help prove Theorem 3}
% \textcolor{red}{SD: As an afterthought, I feel that we allowed $\tilde{f}$ to be arbitrary in previous Lemma 2. If we are allowing for arbitrary functions, then $\hat{D}(X_a||X_b)$ can become infinity when $X_a$ and $X_b$ are disjoint by choosing an arbitrary function that is high on $X_a$ and low on $X_b$. When working with the empirical estimate of $D$, we should also be restricted to a set of possible functions we can choose that I would like to call: ``Learnable functions'' $H$. I am now working on incorporating a definition of learnable functions in the proof of the previous Lemma 2. So, when we make statements like $f^*=\arg\max_{f} (expression)$, it would be replaced by $f^*=\arg \max_{f \in H} (expression)$. Otherwise, $f$ is really arbitrary and $\hat{D}(X_a||X_b)$ can become infinity.  }


% \begin{lemma}\label{lem:helperthm}
% Consider $\hat{f}_1$ such that $\hat{D}(\mb{X} \| \mb{X}^{in})=\hat{D}_{\hat{f}_1}(\mb{X} \| \mb{X}^{in})$.
%and $\hat{D}(\mb{\bar{X}} \| \mb{X}^{in})=\hat{D}_{\hat{f}_2}(\mb{X} \| \mb{\bar{X}}^{in})$. 
% Then if $\hat{f}_1'$ is the restriction of $\hat{f}_1$ to $\bar{X}\cup X^{in}$, we have $\hat{D}(\mb{\bar{X}} \| \mb{X}^{in})=\hat{D}_{\hat{f}_1'}(\mb{\bar{X}} \| \mb{X}^{in})$, when the elements of $X\backslash \bar{X} $, $\bar{X}$ and $X_{in}$ are distinct.
% \end{lemma}

% \begin{proof}
% Recall that 
% $\hat{D}(\mb{X} \| \mb{X}^{in})
% =
% \max_{\hat{f}(.)}
% \frac{1}{|\mb{X}|}\sum_{\mb{x}_j \in \mb{X}}
% \hat{f}(\mb{x}_j)
% -
% \log{ \frac{1}{|\mb{X}^{in}|}\sum_{\mb{x}^{in}_i \in \mb{X}^{in}}}
% e^{\hat{f}(\mb{x}^{in}_i)}$ and 
% $\hat{D}(\mb{\bar{X}} \| \mb{X}^{in})
% =
% \max_{\hat{f}(.)}
% \frac{1}{|\mb{\bar{X}}|}\sum_{\mb{x}_j \in \mb{\bar{X}}}
% \hat{f}(\mb{x}_j)
% -
% \log{ \frac{1}{|\mb{X}^{in}|}\sum_{\mb{x}^{in}_i \in \mb{X}^{in}}}
% e^{\hat{f}(\mb{x}^{in}_i)}$.
% We also have:

% $\hat{f}_1=\arg\max_{\hat{f}(.)}
% \frac{1}{|\mb{X}|}\sum_{\mb{x}_j \in \mb{X}}
% \hat{f}(\mb{x}_j)
% -
% \log{ \frac{1}{|\mb{X}^{in}|}\sum_{\mb{x}^{in}_i \in \mb{X}^{in}}}
% e^{\hat{f}(\mb{x}^{in}_i)}$ 
% and

% $\hat{f}_2=\arg\max_{\hat{f}(.)}
% \frac{1}{|\mb{\bar{X}}|}\sum_{\mb{x}_j \in \mb{\bar{X}}}
% \hat{f}(\mb{x}_j)
% -
% \log{ \frac{1}{|\mb{X}^{in}|}\sum_{\mb{x}^{in}_i \in \mb{X}^{in}}}
% e^{\hat{f}(\mb{x}^{in}_i)}$.

% We show that: $\hat{D}(\mb{\bar{X}} \| \mb{X}^{in})= \frac{1}{|\mb{\bar{X}}|}\sum_{\mb{x}_j \in \mb{\bar{X}}}
% \hat{f}_1(\mb{x}_j)
% -
% \log{ \frac{1}{|\mb{X}^{in}|}\sum_{\mb{x}^{in}_i \in \mb{X}^{in}}}
% e^{\hat{f}_1(\mb{x}^{in}_i)}$ when the elements of $X\backslash \bar{X} $, $\bar{X}$ and $X_{in}$ are distinct.

% We can show that $\hat{D}(\mb{\bar{X}} \| \mb{X}^{in})= \max_{\hat{f}(.)}
% \frac{1}{|\mb{\bar{X}}|}\sum_{\mb{x}_j \in \mb{\bar{X}}}
% \hat{f}(\mb{x}_j)
% -
% \log{ \frac{1}{|\mb{X}^{in}|}\sum_{\mb{x}^{in}_i \in \mb{X}^{in}}}
% e^{\hat{f}(\mb{x}^{in}_i)} \\
% \geq \frac{1}{|\mb{\bar{X}}|}\sum_{\mb{x}_j \in \mb{\bar{X}}}
% \hat{f}_1(\mb{x}_j)
% -
% \log{ \frac{1}{|\mb{X}^{in}|}\sum_{\mb{x}^{in}_i \in \mb{X}^{in}}}
% e^{\hat{f}_1(\mb{x}^{in}_i)}$. Now, we will prove the other way round, i.e., $\hat{D}(\mb{\bar{X}}\| \mb{X}^{in})  \leq \frac{1}{|\mb{\bar{X}}|}\sum_{\mb{x}_j \in \mb{\bar{X}}}
% \hat{f}_1(\mb{x}_j)
% -
% \log{ \frac{1}{|\mb{X}^{in}|}\sum_{\mb{x}^{in}_i \in \mb{X}^{in}}}
% e^{\hat{f}_1(\mb{x}^{in}_i)}$.

% Let us suppose that we assume the following strict inequality (to contradict later): 
% $\hat{D}(\mb{\bar{X}}\| \mb{X}^{in})  > \frac{1}{|\mb{\bar{X}}|}\sum_{\mb{x}_j \in \mb{\bar{X}}}
% \hat{f}_1(\mb{x}_j)
% -
% \log{ \frac{1}{|\mb{X}^{in}|}\sum_{\mb{x}^{in}_i \in \mb{X}^{in}}}
% e^{\hat{f}_1(\mb{x}^{in}_i)}$. 

% This implies,

% $\frac{1}{|\mb{\bar{X}}|}\sum_{\mb{x}_j \in \mb{\bar{X}}}
% \hat{f}_2(\mb{x}_j)
% -
% \log{ \frac{1}{|\mb{X}^{in}|}\sum_{\mb{x}^{in}_i \in \mb{X}^{in}}}
% e^{\hat{f}_2(\mb{x}^{in}_i)} > \frac{1}{|\mb{\bar{X}}|}\sum_{\mb{x}_j \in \mb{\bar{X}}}
% \hat{f}_1(\mb{x}_j)
% -
% \log{ \frac{1}{|\mb{X}^{in}|}\sum_{\mb{x}^{in}_i \in \mb{X}^{in}}}
% e^{\hat{f}_1(\mb{x}^{in}_i)}$.

% Or,

% $\frac{1}{|\mb{\bar{X}}|}\sum_{\mb{x}_j \in \mb{\bar{X}}}
% \hat{f}_2(\mb{x}_j)
% + \frac{1}{|\mb{X}|}\sum_{\mb{x}_j \in \mb{X}}
% \hat{f}_1(\mb{x}_j) - \frac{1}{|\mb{\bar{X}}|}\sum_{\mb{x}_j \in \mb{\bar{X}}}
% \hat{f}_1(\mb{x}_j)
% -\log{ \frac{1}{|\mb{X}^{in}|}\sum_{\mb{x}^{in}_i \in \mb{X}^{in}}}
% e^{\hat{f}_2(\mb{x}^{in}_i)} \\
% > \frac{1}{|\mb{X}|}\sum_{\mb{x}_j \in \mb{X}}
% \hat{f}_1(\mb{x}_j)
% -
% \log{ \frac{1}{|\mb{X}^{in}|}\sum_{\mb{x}^{in}_i \in \mb{X}^{in}}}
% e^{\hat{f}_1(\mb{x}^{in}_i)} = \hat{D}(\mb{X} \| \mb{X}^{in})$.

% But this would mean that there exists a function $\tilde{f}(x)$ as follows:

% $\tilde{f}(x)= \begin{cases}
% & \frac{|\mb{X}|}{|\mb{\bar{X}}|}\hat{f}_2(x) + (1-\frac{|\mb{X}|}{|\mb{\bar{X}}|})\hat{f}_1(x), \ \ \  x \in  \mb{\bar{X}}\\
% & \hat{f}_1(x), \ \ \ x \in \mb{X} \backslash \mb{\bar{X}} \\
% & \hat{f}_2(x), \ \ \ x \in \mb{X}^{in}, 
% \end{cases}$

% such that $\frac{1}{|\mb{X}|}\sum_{\mb{x}_j \in \mb{X}}
% \tilde{f}(\mb{x}_j)
% -
% \log{ \frac{1}{|\mb{X}^{in}|}\sum_{\mb{x}^{in}_i \in \mb{X}^{in}}}
% e^{\tilde{f}(\mb{x}^{in}_i)} > \hat{D}(\mb{X} \| \mb{X}^{in})$ which is a contradiction. Thus, the strict inequality does not hold, and we have:
% $\hat{D}(\mb{\bar{X}}\| \mb{X}^{in})  =\frac{1}{|\mb{\bar{X}}|}\sum_{\mb{x}_j \in \mb{\bar{X}}}
% \hat{f}_1(\mb{x}_j)
% -
% \log{ \frac{1}{|\mb{X}^{in}|}\sum_{\mb{x}^{in}_i \in \mb{X}^{in}}}
% e^{\hat{f}_1(\mb{x}^{in}_i)}$. 




% \textcolor{red}{Adding some new results updating the previous lemma 2 and making it more rigorous. See Theorem 4 below which will use this Lemma 2}

\subsection{Proof of Theorem 5}

\begin{lemma}
\label{lem:offset}
The value of the function $\hat{D}_{f}(\mb{X_a} \| \mb{X_b}) := \frac{1}{|\mb{X_a}|}\sum_{\mb{x}_j \in \mb{X_a}} f(\mb{x}_j)
-
\log{ \frac{1}{|\mb{X_b}|}\sum_{\mb{x}_i \in \mb{X_b}}}
e^{f(\mb{x}_i)}$ is unchanged if we replace $f(x)$ with $\tilde{f}(x)=f(x)+c$ for some constant $c$ for all $x\in R^k$, given any two sets $\mb{X_a}$ and $\mb{X_b}\subseteq R^k$. 
\end{lemma}

\begin{proofof}{Lemma~\ref{lem:offset}}

\begin{align}
\hat{D}_{\tilde{f}}(\mb{X_a} \| \mb{X_b}) & = \frac{1}{|\mb{X_a}|}\sum_{\mb{x}_j \in \mb{X_a}} \tilde{f}(\mb{x}_j)
-
\log{ \frac{1}{|\mb{X_b}|}\sum_{\mb{x}_i \in \mb{X_b}}}
e^{\tilde{f}(\mb{x}_i)} \nonumber \\
& =\frac{1}{|\mb{X_a}|}\sum_{\mb{x}_j \in \mb{X_a}} 
(f(\mb{x}_j)+c)-
\log{ \frac{1}{|\mb{X_b}|}\sum_{\mb{x}_i \in \mb{X_b}}}
e^{f(\mb{x}_i)+c} \nonumber \\
& = c + \frac{1}{|\mb{X_a}|}\sum_{\mb{x}_j \in \mb{X_a}} 
f(\mb{x}_j)    -
\log{ \frac{e^c}{|\mb{X_b}|}\sum_{\mb{x}_i \in \mb{X_b}}}
e^{f(\mb{x}_i)} \nonumber \\
& =c + \frac{1}{|\mb{X_a}|}\sum_{\mb{x}_j \in \mb{X_a}} 
f(\mb{x}_j) - \log{e^c} - \log{ \frac{1}{|\mb{X_b}|}\sum_{\mb{x}_i \in \mb{X_b}}}
e^{f(\mb{x}_i)} \nonumber \\
& =  \frac{1}{|\mb{X_a}|}\sum_{\mb{x}_j \in \mb{X_a}} 
f(\mb{x}_j)  - \log{ \frac{1}{|\mb{X_b}|}\sum_{\mb{x}_i \in \mb{X_b}}}
e^{f(\mb{x}_i)} = \hat{D}_{f}(\mb{X_a} \| \mb{X_b}).
\end{align}

\end{proofof}

Next, we prove Theorem~5.
We define:
$\hat{D}_{f}(\mb{X_a} \| \mb{X_b}) := \frac{1}{|\mb{X_a}|}\sum_{\mb{x}_j \in \mb{X_a}} f(\mb{x}_j)
-
\log{ \frac{1}{|\mb{X_b}|}\sum_{\mb{x}_i \in \mb{X_b}}}
e^{f(\mb{x}_i)}$ for any $\mb{X_a}, \mb{X_b} \subseteq R^k$.
Then,
$\hat{D}(\mb{X_a} \| \mb{X_b}) = \max_{f \in \mathcal{H}} \hat{D}_{f}(\mb{X_a} \| \mb{X_b})$ where $\mathcal{H}$ is the set of all learnable functions defined as follows:
$\mathcal{H}\subseteq\{f:R^k\rightarrow R \} $ such that (i) $-\infty < f(x) < \infty$ for all $x \in R^k$ and (ii) if $f_1,f_2,g \in \mathcal{H}$, then functions of the form $f_1(x)I(g(x)\geq \tau)+f_2(x)I(g(x)<\tau)$ (which are essentially entirely derived from functions in $\mathcal{H}$) also lie in $\mathcal{H}$ for any constant $\tau$. Here $I(\cdot)$ is the indicator function. This stems from the intuition that if we are able to learn some functions $R^k\rightarrow R$, then a function that is entirely derived from those functions should also be learnable.

% \begin{theorem}
% \label{thm:appendix}
% Let $\hat{D}(\mb{X_a} \| \mb{X_b}) = \max_{f \in \mathcal{H}} \hat{D}_{f}(\mb{X_a} \| \mb{X_b})$ where $\mathcal{H}$ is a set of functions as follows: $\mathcal{H}\subseteq\{f:R^k\rightarrow R \} $ such that: (i) $-\infty < f(x) < \infty$ for all $x \in R^k$; and (ii) If $f_1(x),f_2(x),g(x) \in \mathcal{H}$, then functions of the form $f_1(x)I(g(x)\geq \tau)+f_2(x)I(g(x)<\tau)$ (which are essentially derived entirely from functions in $\mathcal{H}$) also lie in $\mathcal{H}$ for any constant $\tau$.
%  Consider $\hat{f}_1 \in \mathcal{H}$ such that $\hat{D}(\mb{X} \| \mb{X}^{in})=\hat{D}_{\hat{f}_1}(\mb{X} \| \mb{X}^{in})$.
% Then, for a subset $\mb{\bar{X}} \subseteq \mb{X}$, we have $\hat{D}(\mb{\bar{X}} \| \mb{X}^{in})=\hat{D}_{\hat{f}_1}(\mb{\bar{X}} \| \mb{X}^{in})$, when the sets $\mb{\bar{X}}$, $\mb{X}\backslash \mb{\bar{X}} $ and $\mb{X_{in}}$ are such that $\hat{f}_1(x)>\tau$ for $x \in \mb{X}\backslash \mb{\bar{X}}$ and $\hat{f}_1(x)\leq\tau$ for $x \in \mb{X}^{in}\cup \mb{\bar{X}}$.
% \end{theorem}


\begin{proofof}{Theorem 5}
We have
$\hat{D}(\mb{X} \| \mb{X}^{in})
=\max_{f \in \mathcal{H}}
\hat{D}_{f}(\mb{X} \| \mb{X}^{in})$ and 
$\hat{D}(\mb{\bar{X}} \| \mb{X}^{in})
=
\max_{f\in \mathcal{H}}\hat{D}_f(\mb{\bar{X}} \| \mb{X}^{in}) $. Now,  $\hat{f}_1 \in \mathcal{H}$ is such that $\hat{D}(\mb{X} \| \mb{X}^{in})=\hat{D}_{\hat{f}_1}(\mb{X} \| \mb{X}^{in})$. We also let $\hat{f}_2 \in \mathcal{H}$ be a function such that $\hat{D}(\mb{\bar{X}} \| \mb{X}^{in})=\hat{D}_{\hat{f}_2}(\mb{\bar{X}} \| \mb{X}^{in})$.

Observe that, $\hat{D}(\mb{\bar{X}} \| \mb{X}^{in})= \max_{f \in \mathcal{H}}
\hat{D}_{f}(\mb{\bar{X}} \| \mb{X}^{in})
\geq \frac{1}{|\mb{\bar{X}}|}\sum_{\mb{x}_j \in \mb{\bar{X}}}
\hat{f}_1(\mb{x}_j)
-
\log{ \frac{1}{|\mb{X}^{in}|}\sum_{\mb{x}^{in}_i \in \mb{X}^{in}}}
e^{\hat{f}_1(\mb{x}^{in}_i)}= \hat{D}_{\hat{f}_1}(\mb{\bar{X}}\| \mb{X}^{in})$. 

By way of contradiction, let us assume strict inequality: 
$\hat{D}_{\hat{f}_1}(\mb{\bar{X}}\| \mb{X}^{in}) < \hat{D}(\mb{\bar{X}}\| \mb{X}^{in})$. 

Then, plugging in $\hat{f}_2$, we get,

\begin{equation} \frac{1}{|\mb{\bar{X}}|}\sum_{\mb{x}_j \in \mb{\bar{X}}}
\hat{f}_1(\mb{x}_j)
-
\log{ \frac{1}{|\mb{X}^{in}|}\sum_{\mb{x}^{in}_i \in \mb{X}^{in}}}
e^{\hat{f}_1(\mb{x}^{in}_i)} < \frac{1}{|\mb{\bar{X}}|}\sum_{\mb{x}_j \in \mb{\bar{X}}}
\hat{f}_2(\mb{x}_j)
-
\log{ \frac{1}{|\mb{X}^{in}|}\sum_{\mb{x}^{in}_i \in \mb{X}^{in}}}
e^{\hat{f}_2(\mb{x}^{in}_i)}.
\end{equation}
Or,

\begin{align}
\hat{D}(\mb{X} \| \mb{X}^{in}) & = \frac{1}{|\mb{X}|}\sum_{\mb{x}_j \in \mb{X}}
\hat{f}_1(\mb{x}_j)
-
\log{ \frac{1}{|\mb{X}^{in}|}\sum_{\mb{x}^{in}_i \in \mb{X}^{in}}}
e^{\hat{f}_1(\mb{x}^{in}_i)} \nonumber \\
& < \frac{1}{|\mb{\bar{X}}|}\sum_{\mb{x}_j \in \mb{\bar{X}}}
\hat{f}_2(\mb{x}_j)
+ \frac{1}{|\mb{X}|}\sum_{\mb{x}_j \in \mb{X}}
\hat{f}_1(\mb{x}_j) - \frac{1}{|\mb{\bar{X}}|}\sum_{\mb{x}_j \in \mb{\bar{X}}}
\hat{f}_1(\mb{x}_j)
-\log{ \frac{1}{|\mb{X}^{in}|}\sum_{\mb{x}^{in}_i \in \mb{X}^{in}}}
e^{\hat{f}_2(\mb{x}^{in}_i)} \nonumber \\
& = \frac{1}{|\mb{\bar{X}}|}\sum_{\mb{x}_j \in \mb{\bar{X}}}
\hat{f}_2(\mb{x}_j)
+ \frac{1}{|\mb{X}|} \left(\sum_{\mb{x}_j \in \mb{\bar{X}}}
\hat{f}_1(\mb{x}_j) + \sum_{\mb{x}_j \in \mb{X}\backslash\mb{\bar{X}}} \hat{f}_1(\mb{x}_j)\right) - \frac{1}{|\mb{\bar{X}}|}\sum_{\mb{x}_j \in \mb{\bar{X}}}
\hat{f}_1(\mb{x}_j)
-\log{ \frac{1}{|\mb{X}^{in}|}\sum_{\mb{x}^{in}_i \in \mb{X}^{in}}}
e^{\hat{f}_2(\mb{x}^{in}_i)} \nonumber \\
& = \frac{1}{|\mb{X}|} \left(\sum_{\mb{x}_j \in \mb{\bar{X}}}
\left[\frac{|\mb{X}|}{|\mb{\bar{X}}|}\hat{f}_2(\mb{x}_j) + \left(1-\frac{|\mb{X}|}{|\mb{\bar{X}}|}\right)\hat{f}_1(\mb{x}_j)\right] + \sum_{\mb{x}_j \in \mb{X}\backslash\mb{\bar{X}}} \hat{f}_1(\mb{x}_j)\right) 
-\log{ \frac{1}{|\mb{X}^{in}|}\sum_{\mb{x}^{in}_i \in \mb{X}^{in}}}
e^{\hat{f}_2(\mb{x}^{in}_i)} \nonumber \\
& \leq \frac{1}{|\mb{{X}}|}\left( \sum_{\mb{x}_j \in \mb{\bar{X}}}
\hat{f}_2(\mb{x}_j) + \sum_{\mb{x}_j \in \mb{X} \backslash \mb{\bar{X}}}
\hat{f}_1(\mb{x}_j) \right)
-\log{ \frac{1}{|\mb{X}^{in}|}\sum_{\mb{x}^{in}_i \in \mb{X}^{in}}}
e^{\hat{f}_2(\mb{x}^{in}_i)},
\end{align}
where the last line holds because $|\mb{X}| \geq |\mb{\bar{X}}|$ and, without loss of generality, we can assume that $\hat{f}_2(x) \leq \hat{f}_1(x)$. This is because, if $\hat{f}_2(x)$ happens to be greater than $\hat{f}_1(x)$ at some values of $x$, then using Lemma~\ref{lem:offset}, we can always redefine another $\hat{f}_2(x) \in \mathcal{H}$ as the old $\hat{f}_2(x) - c$, where the constant $c$ is an offset that is chosen appropriately, e.g., $c=\max_{x\in R^k}|\hat{f}_1(x) - \text{old }\hat{f}_2(x)| < \infty$ since $\hat{f}_1$ and the old $\hat{f}_2$ also belong to $\mathcal{H}$.

Let us now define a function $\tilde{f}(x)$ as follows:
$\tilde{f}(x)= \hat{f}_1(x) I(\hat{f}_1(x)> \tau) + \hat{f}_2(x)I(\hat{f}_1(x) \leq \tau)$. This function attains the following values over the subsets  $ \mb{\bar{X}}$, $\mb{X} \backslash \mb{\bar{X}}$ and $\mb{X}^{in}$:
$\tilde{f}(x)= \begin{cases}
& \hat{f}_1(x), \ \ \ x \in \mb{X} \backslash \mb{\bar{X}} \\
& \hat{f}_2(x), \ \ \ x \in \mb{X}^{in} \cup \mb{\bar{X}},
\end{cases}$
since $\hat{f}_1(x)>\tau$ for $x \in \mb{X}\backslash \mb{\bar{X}}$ and $\hat{f}_1(x)\leq\tau$ for $x \in \mb{X}^{in}\cup \mb{\bar{X}}$. The function $\tilde{f}$ also belongs to 
$\mathcal{H}$ because of its form that is entirely derived from other functions in $\mathcal{H}$.

But this means that we now have a function $\tilde{f}(x) \in \mathcal{H}$, 
such that $\frac{1}{|\mb{X}|}\sum_{\mb{x}_j \in \mb{X}}
\tilde{f}(\mb{x}_j)
-
\log{ \frac{1}{|\mb{X}^{in}|}\sum_{\mb{x}^{in}_i \in \mb{X}^{in}}}
e^{\tilde{f}(\mb{x}^{in}_i)} = \hat{D}_{\tilde{f}}(\mb{X} \| \mb{X}^{in}) > \hat{D}(\mb{X} \| \mb{X}^{in})$ which is a contradiction since $\hat{D}(\mb{X} \| \mb{X}^{in})=\max_{f\in \mathcal{H}}\hat{D}_{f}(\mb{X} \| \mb{X}^{in})$. 

Thus, the strict inequality ($\hat{D}_{\hat{f}_1}(\mb{\bar{X}}\| \mb{X}^{in}) < \hat{D}(\mb{\bar{X}}\| \mb{X}^{in}) $) does not hold, and we have:
\[\hat{D}(\mb{\bar{X}}\| \mb{X}^{in})  =\hat{D}_{\hat{f}_1}(\mb{\bar{X}} \| \mb{X}^{in})=\frac{1}{|\mb{\bar{X}}|}\sum_{\mb{x}_j \in \mb{\bar{X}}}
\hat{f}_1(\mb{x}_j)
-
\log{ \frac{1}{|\mb{X}^{in}|}\sum_{\mb{x}^{in}_i \in \mb{X}^{in}}}
e^{\hat{f}_1(\mb{x}^{in}_i)}.\]
\end{proofof}


% \textbf{Reminder of Theorem \ref{thm:replay-pres-diff-dist}}
% \textit{
% Consider the one dimensional DV representation of the present data $(\boldsymbol{\cX}^{p}, \cY^p)$ 
% and the historical past $(\boldsymbol{\cX}^{h}, \cY^h)$
%  estimating $\cD_{kl}((\boldsymbol{\cX}^{h}, \cY^h)||(\boldsymbol{\cX}^{p}, \cY^p))$, and consider a histogram on this representation. Consider the histogram bin that contains the largest value present data points get. By removing all of the bins from the past data to the right of this bin, the KL-Divergence $\cD_{kl}((\boldsymbol{\cX}^{h}, \cY^h)||(\boldsymbol{\cX}^{p}, \cY^p))$ decreases. 
% }


% \begin{proofof}{Theorem \ref{thm:replay-pres-diff-dist}} %\mina{add a figure}
% %
% %\mina{In the statement we say if we remove any bin the kl-d will decrease though the correct statement is if we remove all the bins of high value this happens}
% %\mina{todo: add in the proof: if we further remove bins from the past it reduces the kl-d but the kl-d estimate might not be that accurate then.}
% %First we clarify the statement of the Theorem. Consider the histogram bin that contains the largest value present data points get in the DV representation. We show that if by removing ``all" the bins to the right of this present bin from the past data, the KL-D between present and past data decreases. 
% To better visualize the theorem statement see \figref{fig:hist_bin_ex}: If the red points represent present data and the grey points represent past data, we remove all the past data from bins $B_4,\ldots,B_8$, and show that this would decrease the KL-D between present and past data.

% Let $\boldsymbol{f}:f_{H\to P}$ be the neural function defining the DV representation and hence optimizing %$\cD_{kl}((\boldsymbol{\cX}^{h}, \cY^h)||(\boldsymbol{\cX}^{p}, \cY^p))$.
% $\hat{D}_{kl}(H||P)$, where $H$ is the set of historical data and $P$ is the set of present data points. This means that
% $$
% \hat{D}_{kl}(H||P) = \sup_f \overline{f^H} - \log \overline{e^{f^P}} = \overline{\boldsymbol{f}^H} - \log \overline{e^{\boldsymbol{f}^P}}
% $$ 
% Let $M$ be the largest value $\boldsymbol{f}$ assigns to any data point in present, and let the bin containing the DV representation of this data point be $B_M$. Let $S$ be the set of bins to the right of $B_M$ on the one dimensional line, i.e. all the values in bins in $S$ are larger than values in $B_M$. Let $R$ be the subset of historical data obtained by removing the bins in $S$. We have $\overline{\boldsymbol{f}^R}\le \overline{\boldsymbol{f}^H}$, and so $\hat{D}_{kl}(R||P)\le\hat{D}_{kl}(H||P)$.

% One might suggest that if we keep removing bins $\overline{\boldsymbol{f}^R}$ will decrease anyway. While this is  mathematically correct, the $\hat{D}_{kl}(R||P)$ estimate would become less accurate as we remove bins.
% %
% %
% \end{proofof}


\section{More on Experimental Analysis}
\subsection{Visualizations}

%\begin{figure}[pt!]
\begin{figure}[H]
% 
\centering
% 
\subfigure[SUN]{
\includegraphics[width=0.25\columnwidth]{sun.pdf}
}
% 
\subfigure[Places]{
\includegraphics[width=0.25\columnwidth]{places.pdf}
}
% 
\subfigure[iNaturalist]{
\includegraphics[width=0.25\columnwidth]{iNaturalist.pdf}
}
% 
\subfigure[Textures]{
\includegraphics[width=0.25\columnwidth]{textures.pdf}
}
% 
\subfigure[Animation]{
\includegraphics[width=0.25\columnwidth]{animation_faces.pdf}
}
% 
\subfigure[Arabic]{
\includegraphics[width=0.25\columnwidth]{arabic_handwritten_characters.pdf}
}
% 
\subfigure[Tumors]{
\includegraphics[width=0.25\columnwidth]{brainMRITumorTypes.pdf}
}
% 
\subfigure[Chest Xray]{
\includegraphics[width=0.25\columnwidth]{chest_xray_pneumonia.pdf}
}
% % 
% \subfigure[Cards]{
% \includegraphics[width=0.25\columnwidth]{cards.pdf}
% }
% 
\subfigure[M. Pox]{
\includegraphics[width=0.25\columnwidth]{monkeypox.pdf}
}
% 
\subfigure[M. Posters]{
\includegraphics[width=0.25\columnwidth]{movie_posters.pdf}
}
% 
\subfigure[YouTube T.]{
\includegraphics[width=0.25\columnwidth]{outube_thumbnail.pdf}
}
% 
\subfigure[Shells P.]{
\includegraphics[width=0.25\columnwidth]{shells_pebbles.pdf}
}
% 
\caption{With our method \emph{DDE*} for OOD detection in WideResnet101, Imagenet dataset~(ID set in blue) vs OOD test sets~(in red) are shown to be separated in the respective dual functional spaces.}
\label{fig:f_ood_id}
\end{figure}

\subsection{Analysis for ViTs}

In Table~\ref{tab:ood_vit}, we present results for OOD detection in ViT-L-16. Note that OOD detection in pretrained Vision Transformers is underexplored. The results suggest that all the methods are fundamentally limited in their capability for OOD detection in ViTs. It is only for a few OOD datasets, such as USPS, Alzheimer's, Arabic Characters, Sign Language, and Shells Pebbles, that we observe good performance across a majority of the methods. While our methods, \emph{DDE*} and \emph{DDE-SM*}, are significantly superior w.r.t.\ all the methods for ViT-L-16, the ViT does limit even our proposed OOD detectors in comparison to the WideResnet. 

\begin{table}[tp!]
\centering
\tabsize
\renewcommand{\arraystretch}{0.6}
% \renewcommand{\tabcolsep}{0.55pt}
\begin{tabular}{llllllllllllllllll}
\toprule
\textbf{Dataset}&\textbf{msp}&\textbf{mls}&\textbf{odin}&\textbf{ebo}&\textbf{gn}&\textbf{react}&\textbf{gm}&\textbf{knn}&\textbf{dice}&\textbf{ash}&\textbf{wm}&\textbf{klm}&\textbf{cider}&\textbf{ige}&\textbf{dde*}&\textbf{dde-sm*}\\
% 
\toprule
ID Test$\uparrow$&\underline{95}&94&93&94&\underline{95}&93&92&\underline{95}&\underline{95}&93&94&93&94&94&\textbf{96}&\underline{95}\\
\midrule
OOD Val.&77&74&75&68&68&67&67&79&68&66&83&73&79&68&\textbf{54}&\underline{59}\\
\toprule
% 
SUN&96&96&\underline{94}&98&97&98&98&99&99&97&--&96&96&98&\textbf{91}&\textbf{91}\\
\midrule
Places&96&\underline{95}&\textbf{94}&97&96&96&\underline{95}&98&98&96&--&96&95&97&96&\textbf{94}\\
\midrule
iNaturalist&93&93&90&95&94&94&98&99&96&92&--&93&93&95&\underline{76}&\textbf{66}\\
\midrule
Textures&95&93&94&90&91&88&89&94&90&85&--&94&95&90&\underline{71}&\textbf{58}\\
\toprule
Agr. Crop&85&89&78&98&96&97&98&98&98&98&--&96&85&97&\underline{40}&\textbf{33}\\
\midrule
Animation&--&99&99&--&--&--&--&--&--&--&--&99&--&--&\textbf{18}&\underline{19}\\
\midrule
B. Tumors &91&88&86&91&93&89&\textbf{13}&\underline{30}&92&66&--&96&67&92&31&31\\
\midrule
C. Xray &90&91&84&--&--&--&80&--&--&--&--&90&93&--&\textbf{4}&\underline{7}\\
% \midrule
% COVID CT Scan &98.0&\underline{97.0}&\underline{97.0}&99.5&99.5&98.5&97.5&--&--&\underline{97.0}&--&94.6&\textbf{14.9}\\
\midrule
Faces in W.&91&88&87&93&95&90&59&83&91&90&--&92&88&93&\textbf{30}&\underline{35}\\
\midrule
Fastfood &98&98&97&98&98&97&96&98&96&97&--&97&98&97&\underline{62}&\textbf{50}\\
\midrule
Gemstone&--&--&99&--&--&--&97&99&--&99&--&--&99&--&\underline{78}&\textbf{63}\\
\midrule
Lego &--&--&99&--&--&99&\underline{98}&99&--&99&--&\underline{98}&97&--&\textbf{66}&\textbf{66}\\
\midrule
% Oxford F. &91.9&91.0&88.0&95.1&94.7&93.3&\underline{80.5}&84.3&91.1&94.7&--&95.6&\textbf{53.0}\\
% \midrule
Plant D.&99&--&99&--&--&--&--&--&--&--&--&--&--&--&\underline{27}&\textbf{20}\\
\midrule
USPS &\textbf{0}&\textbf{0}&\textbf{0}&\textbf{0}&--&--&--&\textbf{0}&--&--&--&\textbf{0}&\textbf{0}&--&\textbf{0}&\textbf{0}\\
\midrule
Alzheimer's &17&3&7&4&14&\underline{1}&2&98&53&\textbf{0}&--&68&76&8&\textbf{0}&\textbf{0}\\
\midrule
B. Cells &--&--&--&--&--&--&--&--&--&--&--&--&--&--&\textbf{11}&\underline{12}\\
\midrule
B. Logos &98&98&98&98&98&98&95&98&99&98&--&98&98&98&\textbf{23}&\underline{25}\\
\midrule
Captcha &--&--&--&--&--&--&--&--&--&--&--&--&--&--&\textbf{0}&\textbf{0}\\
\midrule
Cards &98&98&98&96&97&96&94&96&97&94&--&97&97&96&\underline{47}&\textbf{36}\\
\midrule
Arabic&48&36&43&19&15&18&\textbf{0}&\underline{1}&2&\underline{1}&29&66&12&24&\textbf{0}&\textbf{0}\\
\midrule
Chess&94&93&93&90&91&90&82&86&90&87&95&96&90&90&\textbf{74}&\underline{80}\\
\midrule
C. Fine Art&99&99&99&98&98&98&97&98&97&97&--&99&99&98&\underline{68}&\textbf{48}\\
\midrule
Coffee B. &--&--&--&--&--&--&--&--&--&--&--&--&--&--&\textbf{3}&\underline{5}\\
\midrule
Colon S. &--&--&--&--&--&--&\underline{84}&94&--&--&--&--&--&--&\textbf{0}&\textbf{0}\\
\midrule
Covid CT S.&--&--&--&--&--&--&--&--&--&--&--&--&--&--&\textbf{6}&\underline{7}\\
\midrule
Diamonds &--&--&--&--&--&--&--&--&--&--&--&--&--&--&\textbf{12}&\underline{15}\\
\midrule
E. Faces&--&--&99&--&--&--&90&98&--&--&--&--&99&--&\textbf{10}&\underline{14}\\
\midrule
H. Eyes &--&--&--&--&--&--&99&--&--&--&--&--&--&--&\textbf{13}&\underline{15}\\
\midrule
Fire \& S.&--&--&--&98&--&98&68&69&90&98&95&--&82&99&\textbf{57}&\underline{63}\\
\midrule
H.W. Eng. &--&--&--&--&--&--&--&--&--&--&--&--&--&--&\textbf{0}&\textbf{0}\\
\midrule
Excavation &--&--&99&99&99&97&--&--&99&99&--&93&--&99&\underline{23}&\textbf{22}\\
\midrule
Eyes &--&--&--&--&--&--&--&--&--&--&--&--&--&--&\textbf{29}&31\\
\midrule
H.W. Math&--&--&--&--&--&--&--&--&--&--&--&--&--&--&\textbf{8}&\textbf{8}\\
\midrule
H. \& B. &99&99&98&99&99&99&99&--&--&99&--&96&--&99&\textbf{8}&\underline{10}\\
\midrule
I. Food &97&96&96&95&96&94&92&96&93&94&--&97&96&95&\underline{75}&\textbf{56}\\
\midrule
Lego M. F.&--&--&99&--&--&99&98&99&--&99&--&98&97&--&\underline{66}&\textbf{61}\\
\midrule
Licence P. &81&81&79&91&96&91&66&81&94&89&--&94&64&92&\textbf{26}&\underline{34}\\
\midrule
Meat Q.&--&--&--&--&--&\underline{99}&--&--&--&--&--&--&--&--&\textbf{0}&\textbf{0}\\
\midrule
M. Pox&--&--&99&--&99&99&83&99&98&99&--&99&--&99&\underline{55}&\textbf{52}\\
\midrule
M. Posters &87&82&83&75&78&72&75&81&74&71&--&88&84&75&\textbf{43}&\underline{52}\\
\midrule
Orna. P. &--&--&99&--&--&--&--&--&--&99&--&98&--&--&\textbf{11}&\underline{15}\\
\midrule
Paintings &96&96&95&98&98&97&\textbf{41}&\underline{43}&98&97&67&--&88&98&51&51\\
\midrule
Pollen G. &--&--&--&--&--&--&--&--&--&--&--&--&--&--&\textbf{6}&\underline{8}\\
\midrule
QR C.&86&83&77&--&--&97&75&--&--&\underline{47}&--&95&99&--&\textbf{0}&\textbf{0}\\
\midrule
Railway T.&--&--&--&--&--&--&--&--&--&--&--&99&--&--&\underline{21}&\textbf{19}\\
\midrule
Weed C. &76&72&72&71&71&70&63&85&76&68&--&87&78&71&\textbf{33}&\underline{40}\\
\midrule
YouTube T.&84&78&79&76&77&72&75&83&79&72&--&83&83&75&\underline{59}&\textbf{51}\\
\midrule
Weather &99&98&98&98&98&98&95&97&99&97&--&98&95&98&\textbf{87}&\underline{89}\\
\midrule
Sign L.&34&25&30&6&11&4&\textbf{0}&2&\underline{1}&\underline{1}&98&65&38&8&\textbf{0}&1\\
\midrule
Stairs &98&98&98&99&99&99&98&98&99&98&99&99&97&99&\textbf{51}&\underline{71}\\
\midrule
Shells P. &\textbf{0}&\textbf{0}&\textbf{0}&\textbf{0}&93&90&--&\textbf{0}&93&\underline{85}&--&\textbf{0}&\textbf{0}&--&\textbf{0}&\textbf{0}\\
\toprule
\end{tabular}
\caption{Evaluation results for OOD detection in ViT-L-16 pretrained on Imagenet-1k using the metric FPR95~($\downarrow$). Due to space constraints, we display method names in lower case and use ``--'' wherever FPR95 is 100. Best scores are shown in bold and the second best scores are underlined.}
\label{tab:ood_vit}
\end{table}

\subsection{Varying Sample Size in Imagenet (ID) Dataset}

We perform a new ablation study for our method (DDE*) by varying the sample size ($N$) on the Imagenet (ID) dataset. Following the same experimental setup as in Table~1 in the paper, we present results in the table below. ``All'' refers to using all the samples in the Imagenet dataset for OOD detection, which is the same as column ``DDE*'' in Table~1 of the paper. For a given sample size, we randomly select samples from the Imagenet dataset and use only those samples for the entire experiment, including tuning the hyperparameters. We perform 10 random trials, and correspondingly report the mean and standard deviation of the FPR95 scores for each of the 51 test OOD sets. We observe that, for many of the OOD test sets, even a small sample size of a few thousand ($N=3000$) suffices to achieve a high OOD detection rate. However, on the extreme end, using a sample size of 100 is clearly not enough. Note that the numbers from this study should not be compared to other methods in the paper, since the latter use the entire Imagenet dataset.

\begin{table}[tp!]
\centering
\tabsize
% \renewcommand{\arraystretch}{0.85}
\renewcommand{\tabcolsep}{4.5pt}
\begin{tabular}{llllllll}
\toprule
\textbf{Dataset}&All&N=30000&N=10000&N=3000&N=1000&N=100\\
\toprule
ID Test $\uparrow$&95&95$\pm$1&94$\pm$1&94$\pm$1&95$\pm$2&96$\pm$4\\
\midrule
OOD Validation &31&46$\pm$6&47$\pm$5&42$\pm$5&35$\pm$5&33$\pm$22\\
\toprule
SUN&18&29$\pm$8&33$\pm$6&32$\pm$9&33$\pm$23&44$\pm$35\\
\midrule
Places&10&32$\pm$12&37$\pm$8&34$\pm$9&34$\pm$23&45$\pm$35\\
\midrule
iNaturalist&11&22$\pm$9&28$\pm$9&24$\pm$6&28$\pm$17&39$\pm$31\\
\midrule
Textures&15&27$\pm$10&38$\pm$13&32$\pm$8&34$\pm$12&61$\pm$37\\
\toprule
Agriculture Crop&0&3$\pm$3&9$\pm$7&8$\pm$4&17$\pm$28&19$\pm$21\\
\midrule
Animation&6&14$\pm$7&20$\pm$7&18$\pm$6&19$\pm$11&33$\pm$29\\
\midrule
Brain Tumors&3&8$\pm$5&11$\pm$5&12$\pm$4&11$\pm$2&35$\pm$33\\
\midrule
Chest Xray&4&9$\pm$5&14$\pm$6&14$\pm$6&12$\pm$4&44$\pm$37\\
\midrule
Faces in the Wild&9&16$\pm$8&23$\pm$8&19$\pm$8&24$\pm$26&37$\pm$33\\
\midrule
Fastfood&10&27$\pm$8&35$\pm$9&33$\pm$9&27$\pm$7&44$\pm$35\\
\midrule
Gemstones&4&10$\pm$6&17$\pm$9&18$\pm$6&16$\pm$6&40$\pm$33\\
\midrule
LEGO&0&3$\pm$3&6$\pm$5&6$\pm$3&15$\pm$28&40$\pm$37\\
\midrule
Plant Diseases&2&8$\pm$5&13$\pm$5&13$\pm$5&15$\pm$13&47$\pm$35\\
\midrule
USPS&1&5$\pm$3&9$\pm$5&8$\pm$4&7$\pm$2&29$\pm$31\\
\midrule
Alzheimer's&1&4$\pm$4&6$\pm$4&6$\pm$3&5$\pm$2&28$\pm$36\\
\midrule
Blood Cells&1&5$\pm$4&9$\pm$6&9$\pm$4&17$\pm$24&21$\pm$25\\
\midrule
Brand Logos&0&0$\pm$0&1$\pm$2&1$\pm$1&1$\pm$1&7$\pm$16\\
\midrule
Captcha&0&0$\pm$0&0$\pm$0&0$\pm$0&0$\pm$0&0$\pm$0\\
\midrule
Cards&11&17$\pm$7&23$\pm$12&19$\pm$7&21$\pm$9&45$\pm$32\\
\midrule
Arabic Handwritten Characters&4&7$\pm$4&10$\pm$5&9$\pm$3&8$\pm$2&18$\pm$20\\
\midrule
Chess&1&5$\pm$4&10$\pm$5&11$\pm$5&11$\pm$4&32$\pm$34\\
\midrule
Chinese Fine Art&1&4$\pm$4&9$\pm$5&9$\pm$5&17$\pm$28&44$\pm$36\\
\midrule
Coffee Beans&1&4$\pm$3&6$\pm$5&6$\pm$3&5$\pm$3&18$\pm$24\\
\midrule
Colonoscopy&1&3$\pm$2&6$\pm$4&5$\pm$2&5$\pm$2&22$\pm$28\\
\midrule
Covid CT Scan&3&7$\pm$5&11$\pm$5&11$\pm$5&12$\pm$6&24$\pm$27\\
\midrule
Diamonds&3&5$\pm$3&9$\pm$5&7$\pm$3&7$\pm$2&35$\pm$33\\
\midrule
Emotional Faces&5&13$\pm$7&20$\pm$10&17$\pm$7&22$\pm$26&39$\pm$34\\
\midrule
Human Eyes&5&11$\pm$6&19$\pm$7&17$\pm$9&15$\pm$5&28$\pm$30\\
\midrule
Fire \& Smoke&0&0$\pm$0&1$\pm$1&0$\pm$1&10$\pm$30&11$\pm$20\\
\midrule
English Handwritten Characters&2&4$\pm$3&7$\pm$4&6$\pm$3&6$\pm$2&15$\pm$18\\
\midrule
Excavation&0&2$\pm$2&4$\pm$3&4$\pm$2&13$\pm$29&41$\pm$40\\
\midrule
Eyes&3&5$\pm$3&7$\pm$4&6$\pm$3&9$\pm$10&35$\pm$34\\ 
\midrule
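Handwritten Math Symbols&1&3$\pm$3&$\pm$4&6$\pm$3&6$\pm$2&15$\pm$19\\ % TODO(review): the mean is missing before \pm in the N=10000 column; restore it from the experiment logs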
\midrule
Bart and Homer&0&1$\pm$1&3$\pm$3&3$\pm$2&13$\pm$29&16$\pm$20\\
\midrule
Indian Food&13&23$\pm$11&29$\pm$9&30$\pm$8&32$\pm$19&41$\pm$33\\
\midrule
Lego Minifigures&0&3$\pm$3&7$\pm$4&6$\pm$4&15$\pm$29&32$\pm$35\\
\midrule
Licence Plates&0&0$\pm$0&0$\pm$1&0$\pm$1&0$\pm$1&20$\pm$38\\
\midrule
Meat Quality&0&0$\pm$1&2$\pm$3&1$\pm$1&1$\pm$2&11$\pm$26\\
\midrule
Monkeypox&8&14$\pm$7&21$\pm$7&21$\pm$6&29$\pm$25&44$\pm$35\\
\midrule
Movie Posters&14&26$\pm$10&35$\pm$9&31$\pm$9&26$\pm$10&37$\pm$30\\
\midrule
Ornamental Plants&0&3$\pm$4&6$\pm$5&6$\pm$3&14$\pm$29&21$\pm$26\\
\midrule
Paintings&1&5$\pm$4&9$\pm$5&8$\pm$4&16$\pm$28&21$\pm$26\\
\midrule
Pollen Grain&1&4$\pm$4&9$\pm$6&11$\pm$5&9$\pm$3&25$\pm$27\\
\midrule
QR Codes&1&2$\pm$2&4$\pm$3&3$\pm$2&3$\pm$2&12$\pm$16\\
\midrule
Railway Tracks&1&2$\pm$2&6$\pm$5&6$\pm$4&14$\pm$29&31$\pm$34\\
\midrule
Weed Crop&4&6$\pm$4&9$\pm$6&9$\pm$4&17$\pm$28&27$\pm$25\\
\midrule
YouTube Thumbnail&5&22$\pm$11&31$\pm$8&27$\pm$7&31$\pm$24&49$\pm$33\\
\midrule
Weather&14&27$\pm$8&33$\pm$12&29$\pm$7&31$\pm$20&51$\pm$40\\
\midrule
Sign Language&1&3$\pm$4&6$\pm$5&6$\pm$3&4$\pm$1&22$\pm$24\\
\midrule
Stairs&0&0$\pm$0&1$\pm$2&1$\pm$1&0$\pm$1&9$\pm$22\\
\midrule
Shells or Pebbles&22&26$\pm$6&31$\pm$9&31$\pm$8&26$\pm$9&53$\pm$35\\
\toprule
\end{tabular}
\caption{Evaluation results for OOD detection in WideResnet101 pretrained on Imagenet-1k using the metric FPR95~($\downarrow$).}
\end{table}
    
\subsection{Batch Inference on Test Set}

Following the same experimental setup as in Table~1 in the paper, we perform an ablation study of OOD detection in a test set, splitting it into (100) small ordered batches of equal sizes; the batch size across the OOD test sets varies from 2 to 430, with a median value of 28. We believe batch inference of a test set to strongly resemble real-world scenarios of continual lifelong learning. For attaining a reasonable sample size in a test set (though not necessary), we augment each batch of test samples with (300) randomly selected samples from the ID training set (i.e., the Imagenet dataset) and (300) samples from the OOD validation set (same as discussed in the paper, generated from ID samples in Imagenet by simple perturbations proposed by Hendrycks et al. [2019]). We perform 10 trials to account for randomness in selecting the samples for augmentation. In the table below, referring to this batch-inference based online version of our method as ``DDE-Online'', we present the mean and standard deviation of FPR95 scores (from 10 trials) for each of the OOD test sets. In addition, for a comparison, we present the original results for our method (``DDE*'') as well as the best of all the baselines (which is different for each OOD test set) from Table~1 in the paper. It is interesting to note that the standard deviation of FPR95 scores is low and that it performs even better than ``DDE*'' for many test sets. Even for most of the other cases, ``DDE-Online'' has lower FPR95 than the best of the baselines.

\begin{table}[tp!]
\centering
\tabsize
\renewcommand{\tabcolsep}{4.5pt}
\begin{tabular}{lllll}
\toprule
\textbf{Dataset}&Best of the Baselines&DDE*&\textbf{DDE-Online}\\
% &DDE-B-O\\
\toprule
SUN&12&18&14$\pm$1\\
% &6\\
\midrule
Places&34&10&14$\pm$0\\
% &7\\
\midrule
iNaturalist&12&11&8$\pm$1\\
% &3\\
\midrule
Textures&12&15&10$\pm$1\\
% &4\\
\toprule
Agriculture Crop&0&0&3$\pm$1\\
% &0\\
\midrule
Animation&21&6&4$\pm$1\\
% &4\\
\midrule
Brain Tumors&14&3&6$\pm$1\\
% &2\\
\midrule
Chest Xray&7&4&4$\pm$0\\
% &2\\
\midrule
Faces in the Wild&19&9&5$\pm$1\\
% &3\\
\midrule
Fastfood&47&10&14$\pm$1\\
% &8\\
\midrule
Gemstone&39&4&11$\pm$1\\
% &2\\
\midrule
LEGO&2&0&4$\pm$1\\
% &1\\
\midrule
Plant Diseases&14&2&6$\pm$1\\
% &2\\
\midrule
USPS&12&1&1$\pm$0\\
% &0\\
\midrule
Alzheimer's&4&1&1$\pm$0\\
% &0\\
\midrule
Blood Cells&6&1&6$\pm$1\\
% &1\\
\midrule
Brand Logos&0&0&0$\pm$0\\
% &0\\
\midrule
Captcha&0&0&0$\pm$0\\
% &0\\
\midrule
Cards&59&11&9$\pm$1\\
% &4\\
\midrule
Arabic Handwritten Characters&4&4&0$\pm$0\\
% &1\\
\midrule
Chess Pieces&9&1&7$\pm$1\\
% &2\\
\midrule
Chinese Fine Art&2&1&7$\pm$1\\
% &2\\
\midrule
Coffee Beans&10&1&4$\pm$1\\
% &1\\
\midrule
Colonoscopy&1&1&2$\pm$1\\
% &0\\
\midrule
Covid CT Scans&11&3&6$\pm$1\\
% &1\\
\midrule
Diamonds&31&3&5$\pm$1\\
% &2\\
\midrule
Emotional Faces&15&5&4$\pm$0\\
% &4\\
\midrule
Human Eyes&20&5&5$\pm$1\\
% &2\\
\midrule
Fire \& Smoke&0&0&0$\pm$0\\
% &0\\
\midrule
English Handwritten Characters&8&2&2$\pm$1\\
% &0\\
\midrule
Excavation&1&0&2$\pm$1\\
% &0\\
\midrule
Eyes&11&3&5$\pm$1\\
% &1\\ 
\midrule
Handwritten Math Symbols&10&1&1$\pm$1\\
% &0\\
\midrule
Bart and Homer&0&0&1$\pm$0\\
% &0\\
\midrule
Indian Food&49&13&14$\pm$1\\
% &4\\
\midrule
LEGO Minifigures&1&0&4$\pm$1\\
% &1\\
\midrule
Licence Plates&0&0&0$\pm$0\\
% &0\\
\midrule
Meat Quality&0&0&0$\pm$0\\
% &0\\
\midrule
Monkeypox&50&8&9$\pm$1\\
% &3\\
\midrule
Movie Posters&37&14&13$\pm$1\\
% &6\\
\midrule
Ornamental Plants&10&0&3$\pm$2\\
% &0\\
\midrule
Paintings&2&1&5$\pm$1\\
% &1\\
\midrule
Pollen Grain&12&1&6$\pm$2\\
% &2\\
\midrule
QR Codes&5&1&0$\pm$0\\
% &0\\
\midrule
Railway Tracks&1&1&2$\pm$1\\
% &0\\
\midrule
Weed Crops&26&4&7$\pm$1\\
% &2\\
\midrule
YouTube Thumbnails&40&5&17$\pm$2\\
% &7\\
\midrule
Weather&58&14&16$\pm$1\\
% &7\\
\midrule
Sign Language&10&1&2$\pm$1\\
% &0\\
\midrule
Stairs&0&0&0$\pm$0\\
% &0\\
\midrule
Shells or Pebbles&59&22&14$\pm$1\\
% &5\\
\toprule
\end{tabular}
\caption{Evaluation results for OOD detection in WideResnet101 pretrained on Imagenet-1k using the metric FPR95~($\downarrow$).}
\end{table}

\subsection{Fixed estimator tuned for test sets}

As per the reviewer's suggestion, in the table below, we present results from an ablation study on generalization of the estimator. We optimize the dual function for estimating KL-divergence between the ID training set and the OOD validation set. Using this dual function, we perform OOD detection across all the OOD test sets. This highly compute-efficient variant of our method is referred to as ``DDEv''. Optionally, we fine-tune for a given test set using 10\% or 20\% of the original compute cost of our method (DDEvt10 and DDEvt20). For a comparison, we also present results for the default version of our method DDE* and the best of the baselines from Table~1 in the paper. FPR95 scores in the table below suggest that the estimator does generalize to many OOD test sets, and it further benefits from fine-tuning.
    
\begin{table}[tp!]
\centering
\tabsize
\renewcommand{\tabcolsep}{4.5pt}
\begin{tabular}{llllllll}
\toprule
\textbf{Dataset}&Best of the Baselines&DDE*&DDEv&DDEvt10&DDEvt20\\
\toprule
SUN&12&18&33&22&21\\
% &6\\
\midrule
Places&34&10&32&23&16\\
% &7\\
\midrule
iNaturalist&12&11&29&15&15\\
% &3\\
\midrule
Textures&12&15&82&65&43\\
% &4\\
\toprule
Agriculture Crop&0&0&0&0&0\\
% &0\\
\midrule
Animation&21&6&6&6&6\\
% &4\\
\midrule
Brain Tumors&14&3&5&6&6\\
% &2\\
\midrule
Chest Xray&7&4&3&7&7\\
% &2\\
\midrule
Faces in the Wild&19&9&7&6&6\\
% &3\\
\midrule
Fastfood&47&10&35&24&17\\
% &8\\
\midrule
Gemstone&39&4&26&10&9\\
% &2\\
\midrule
LEGO&2&0&1&3&3\\
% &1\\
\midrule
Plant Diseases&14&2&4&10&10\\
% &2\\
\midrule
USPS&12&1&2&4&4\\
% &0\\
\midrule
Alzheimer's&4&1&1&2&2\\
% &0\\
\midrule
Blood Cells&6&1&2&7&7\\
% &1\\
\midrule
Brand Logos&0&0&0&0&0\\
% &0\\
\midrule
Captcha&0&0&0&0&0\\
% &0\\
\midrule
Cards&59&11&50&21&15\\
% &4\\
\midrule
Arabic Handwritten Characters&4&4&3&5&5\\
% &1\\
\midrule
Chess Pieces&9&1&5&6&6\\
% &2\\
\midrule
Chinese Fine Art&2&1&2&11&11\\
% &2\\
\midrule
Coffee Beans&10&1&2&3&3\\
% &1\\
\midrule
Colonoscopy&1&1&0&0&0\\
% &0\\
\midrule
Covid CT Scans&11&3&3&4&4\\
% &1\\
\midrule
Diamonds&31&3&16&6&6\\
% &2\\
\midrule
Emotional Faces&15&5&4&6&6\\
% &4\\
\midrule
Human Eyes&20&5&6&6&6\\
% &2\\
\midrule
Fire \& Smoke&0&0&0&0&0\\
% &0\\
\midrule
English Handwritten Characters&8&2&0&1&1\\
% &0\\
\midrule
Excavation&1&0&0&1&1\\
% &0\\
\midrule
Eyes&11&3&8&5&5\\
% &1\\ 
\midrule
Handwritten Math Symbols&10&1&1&2&2\\
% &0\\
\midrule
Bart and Homer&0&0&0&0&0\\
% &0\\
\midrule
Indian Food&49&13&31&18&13\\
% &4\\
\midrule
LEGO Minifigures&1&0&0&3&3\\
% &1\\
\midrule
Licence Plates&0&0&0&0&0\\
% &0\\
\midrule
Meat Quality&0&0&0&0&0\\
% &0\\
\midrule
Monkeypox&50&8&31&14&11\\
% &3\\
\midrule
Movie Posters&37&14&23&15&12\\
% &6\\
\midrule
Ornamental Plants&10&0&2&4&4\\
% &0\\
\midrule
Paintings&2&1&2&4&4\\
% &1\\
\midrule
Pollen Grain&12&1&6&7&7\\
% &2\\
\midrule
QR Codes&5&1&0&3&3\\
% &0\\
\midrule
Railway Tracks&1&1&0&1&1\\
% &0\\
\midrule
Weed Crops&26&4&10&5&5\\
% &2\\
\midrule
YouTube Thumbnails&40&5&27&18&14\\
% &7\\
\midrule
Weather&58&14&51&29&21\\
% &7\\
\midrule
Sign Language&10&1&2&4&4\\
% &0\\
\midrule
Stairs&0&0&0&0&0\\
% &0\\
\midrule
Shells or Pebbles&59&22&44&28&20\\
% &5\\
\bottomrule
\end{tabular}
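\caption{Ablation on the generalization of the estimator: FPR95~($\downarrow$) for OOD detection in WideResnet101 pretrained on Imagenet-1k, comparing the best of the baselines, the default DDE*, the estimator tuned once on the OOD validation set (DDEv), and its variants fine-tuned per test set with 10\% and 20\% of the original compute cost (DDEvt10 and DDEvt20).}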
\end{table}

\subsection{Mixture of ID \& OOD Samples in Test Set}

We evaluate our approach for OOD detection on test sets containing both ID and OOD samples. We augment each OOD test set with 3000 ID test samples. Besides this change, the evaluation setup is the same as for Table 1 in the paper. Results for this setting are denoted as ``DDE-mixed''. In the table below, for each test set, we report FPR95 scores for OOD samples, as well as for ID samples in parentheses. In addition, for comparison, we present the results of ``DDE*'' as well as the best of all the baselines from Table 1 in the paper. Our method detects ID samples in each test set with very high accuracy (FPR95 $\geq 94$). As for detecting OOD samples, for many of the test sets, our method achieves lower FPR95 scores (as desired) than the best of the baselines.

\begin{table}[tp!]
\centering
\tabsize
\renewcommand{\tabcolsep}{4.5pt}
\begin{tabular}{lllll}
\toprule
\textbf{Dataset}&Best of the Baselines&DDE*&DDE-mixed\\
\toprule
SUN&12&18&37 (99)\\
\midrule
Places&34&10&36 (98)\\
\midrule
iNaturalist&12&11&17 (98)\\
\midrule
Textures&12&15&20 (98)\\
\toprule
Agriculture Crop&0&0&25 (95)\\
\midrule
Animation&21&6&10 (98)\\
\midrule
Brain Tumors&14&3&1 (96)\\
\midrule
Chest Xray&7&4&1 (96)\\
\midrule
Faces in the Wild&19&9&7 (96)\\
\midrule
Fastfood&47&10&23 (97)\\
\midrule
Gemstone&39&4&5 (98)\\
% &2\\
\midrule
LEGO&2&0&14 (95)\\
% &1\\
\midrule
Plant Diseases&14&2&5 (95)\\
% &2\\
\midrule
USPS&12&1&0 (98)\\
% &0\\
\midrule
Alzheimer's&4&1&0 (96)\\
% &0\\
\midrule
Blood Cells&6&1&11 (98)\\
% &1\\
\midrule
Brand Logos&0&0&1 (94)\\
% &0\\
\midrule
Captcha&0&0&0 (94)\\
% &0\\
\midrule
Cards&59&11&9 (98)\\
% &4\\
\midrule
Arabic Handwritten Characters&4&4&2 (98)\\
% &1\\
\midrule
Chess Pieces&9&1&16 (94)\\
% &2\\
\midrule
Chinese Fine Art&2&1&28 (95)\\
% &2\\
\midrule
Coffee Beans&10&1&0 (98)\\
% &1\\
\midrule
Colonoscopy&1&1&4 (94)\\
% &0\\
\midrule
Covid CT Scans&11&3&0 (96)\\
% &1\\
\midrule
Diamonds&31&3&0 (98)\\
% &2\\
\midrule
Emotional Faces&15&5&10 (99)\\
% &4\\
\midrule
Human Eyes&20&5&3 (97)\\
% &2\\
\midrule
Fire \& Smoke&0&0&2 (94)\\
% &0\\
\midrule
English Handwritten Characters&8&2&0 (98)\\
% &0\\
\midrule
Excavation&1&0&9 (95)\\
% &0\\
\midrule
Eyes&11&3&1 (98)\\
% &1\\ 
\midrule
Handwritten Math Symbols&10&1&0 (97)\\
% &0\\
\midrule
Bart and Homer&0&0&6 (94)\\
% &0\\
\midrule
Indian Food&49&13&20 (97)\\
% &4\\
\midrule
LEGO Minifigures&1&0&11 (94)\\
% &1\\
\midrule
Licence Plates&0&0&1 (94)\\
% &0\\
\midrule
Meat Quality&0&0&0 (97)\\
% &0\\
\midrule
Monkeypox&50&8&5 (97)\\
% &3\\
\midrule
Movie Posters&37&14&35 (98)\\
% &6\\
\midrule
Ornamental Plants&10&0&0 (94)\\
% &0\\
\midrule
Paintings&2&1&21 (95)\\
% &1\\
\midrule
Pollen Grain&12&1&7 (96)\\
% &2\\
\midrule
QR Codes&5&1&0 (98)\\
% &0\\
\midrule
Railway Tracks&1&1&9 (94)\\
% &0\\
\midrule
Weed Crops&26&4&2 (98)\\
% &2\\
\midrule
YouTube Thumbnails&40&5&38 (97)\\
% &7\\
\midrule
Weather&58&14&35 (99)\\
% &7\\
\midrule
Sign Language&10&1&0 (96)\\
% &0\\
\midrule
Stairs&0&0&12 (94)\\
% &0\\
\midrule
Shells or Pebbles&59&22&31 (98)\\
% &5\\
\bottomrule
\end{tabular}
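\caption{Mixed ID and OOD test sets: FPR95~($\downarrow$) for OOD detection in WideResnet101 pretrained on Imagenet-1k when each OOD test set is augmented with 3000 ID test samples (DDE-mixed), compared with the best of the baselines and DDE*. For DDE-mixed, the score for ID samples is given in parentheses.}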
\end{table}
         
\subsection{Details on Dual Divergence Estimation via Deep Neural Networks}
% 
As explored in previous works on dual divergence estimation~\citep{belghazi2018mutual}, we employ a lightweight deep neural net, independent of the pretrained DNN, as a dual function approximator. The neural dual function is optimized by maximizing the divergence measure w.r.t.\ the weight parameters. A large batch size~(10k in our experiments) is recommended to avoid otherwise high variance in estimating the measure~\citep{song2019understanding}.
    
Besides, DNNs present the challenge of overfitting. In the context of divergence estimation, this means that if we perform a very large number of batch updates, the estimate can eventually diverge. In practice, a few hundred batch updates with a low learning rate~($5 \times 10^{-4}$ in our experiments) suffice for convergence, whereas divergence may only start to take place after a few thousand batch updates.
    
The neural architecture, along with hyperparameters such as the learning rate and the number of batch updates, can be automatically tuned such that 5\% of the samples in the ID set are identified as OOD, as is standard practice in all previous works on OOD detection in pretrained networks~(corresponding to the metric FPR95). Furthermore, as suggested in previous works, one can also minimize false positive rates on a validation set of OOD samples generated via various kinds of perturbations performed on ID samples~\citep{hendrycksdeep2019}.
    
It is also worth noting that, while the architecture and all the hyperparameters are fixed after tuning on the ID set, the weight parameters of the DNN are optimized independently for each test set. This is because the dual function is unique to the problem of dual divergence estimation between a given test set and the ID set. Despite this, we find in our experimental analysis that the average compute time for OOD detection in a test set~(of a few thousand samples) is on the order of seconds.
% on a GPU.

\subsection{Datasets for ID Detection}

% \subsubsection{Datasets}

\paragraph{US stocks prices.}
% 
We started with the 1000 stocks that have the highest liquidity among the constituents of the Russell 3000 index. This dataset is publicly available, though it is too large to be released as a single file.
% 
After performing the necessary preprocessing and data quality checks, we use 982 of those stocks. The returns are evaluated every 5 minutes for the period from May 2021 to May 2022, i.e., 7800 timesteps.

The ECG dataset is available on Kaggle.\footnote{\url{https://www.kaggle.com/datasets/shayanfazeli/heartbeat?select=ptbdb_abnormal.csv}}
 
% Different neural timeseries forecasting models were explored, including feedforward networks, LSTMs, Transformers, NBeats.
% 
% As such, there is no budget for the number of samples to replay. Rather, the problem is of leveraging all the information from the historical past of same domain and of others for inferring in the present context window of short length. We let the baselines models select or generate same number of samples as our sampling algorithm selects from the past, which may be a large subset of the historical past.

% Since this is a large scale analysis, it is not possible to describe all the details here.
% 
% Notebooks will be released as part of the codebase upon publication. 

\subsection{Test Datasets for OOD Detection}
% 
All the new datasets are available on Kaggle. For the previously benchmarked OOD test datasets, we obtained the preprocessed versions from the respective sources.

\bibliography{references}    
\end{document}
