% \documentclass{uai2023} % for initial submission
\documentclass[accepted]{uai2023} % after acceptance, for a revised
                                    % version; also before submission to
                                    % see how the non-anonymous paper
                                    % would look like

%% There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2023} % ptmx math instead of Computer
% Modern (has noticable issues)
% \documentclass[mathfont=newtx]{uai2023} % newtx fonts (improves upon
 % ptmx; less tested, no support)
% NOTE: Only keep *one* line above as appropriate, as it will be replaced
%       automatically for papers to be published. Do not make any other
%       change above this note for an accepted version.

%% Choose your variant of English; be consistent
\usepackage[american]{babel}
% \usepackage[british]{babel}

%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} % amsmath with fixes and additions
% \usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{tikz} % nice language for creating drawings and diagrams

% for cross referencing the main text
% PLEASE ONLY USE xr IN THE SUPPLEMENTARY MATERIAL. 
% In the main paper, hard code any cross-reference to the supplementary material. 
\usepackage{xr} 

\usepackage{algorithm}
\usepackage{algorithmic}
\usepackage[algo2e, ruled, vlined]{algorithm2e}
\usepackage{colortbl}
\usepackage{amsthm}
\usepackage{subfigure}
\input{math_commands.tex}


%% Provided macros
% \smaller: Because the class footnote size is essentially LaTeX's \small,
%           redefining \footnotesize, we provide the original \footnotesize
%           using this macro.
%           (Use only sparingly, e.g., in drawings, as it is quite small.)

%% Self-defined macros
% \swap[<sep>]{<a>}{<b>} typesets <b><sep><a>, i.e. the two mandatory
% arguments in reverse order with the optional separator between them.
% <sep> defaults to "-", so \swap{a}{b} expands to b-a.
% Template example; it is not used anywhere in this file.
\newcommand{\swap}[3][-]{#3#1#2} % just an example

% Keep a copy of the current definition of \nl so it can be restored later.
\let\oldnl\nl
% \nonl temporarily replaces \nl: the next time \nl is executed it does nothing
% except restore the saved definition (\oldnl), so exactly one \nl is skipped.
% NOTE(review): presumably intended to suppress algorithm2e's line number on
% the line that follows \nonl -- confirm against the algorithm2e documentation.
\newcommand{\nonl}{\renewcommand{\nl}{\let\nl\oldnl}}


\title{Vacant Holes for Unsupervised Detection of the Outliers in \\ Compact Latent Representation\\(Supplementary Material)}

% The standard author block has changed for UAI 2023 to provide
% more space for long author lists and allow for complex affiliations
%
% All author information is authomatically removed by the class for the
% anonymous submission version of your paper, so you can already add your
% information below.
%
% Add authors
\author[1]{\href{mailto:<m.glazunov@tudelft.nl>?Subject=Your UAI 2023 paper}{Misha~Glazunov}{}}
\author[2]{Apostolis~Zarras}

% Add affiliations after the authors
\affil[1]{%
    Delft University of Technology\\
    the Netherlands
}
\affil[2]{%
    University of Piraeus\\
    Greece\\
}
  
\begin{document}
  
\onecolumn %% Turn this off if single column is desired for the supplement
\maketitle

\appendix





\section{Preservation of compactness under continuous mapping}
$\textbf{Lemma:}$ Let $f:\mathcal{X} \to \mathcal{Y}$ be a continuous mapping from a topological space $\mathcal{X}$ to a topological space $\mathcal{Y}$. If $\mathcal{X}$ is compact then its image $f[\mathcal{X}]$ is also compact.

$\textbf{Proof:}$\footnote{Adapted from: \url{http://mathonline.wikidot.com/preservation-of-compactness-under-continuous-maps}} Let $\mathcal C = \{ U_i \}_{i \in I}$ be any open covering of $f[\mathcal{X}]$ in $\mathcal{Y}$. Then: $f[\mathcal{X}] \subseteq \bigcup_{i \in I} U_i $

% \begin{equation}
% f[\mathcal{X}] \subseteq \bigcup_{i \in I} U_i    
% \end{equation}

Now take the preimage of both sides:

\begin{align} 
\mathcal{X} \subseteq f^{-1} \left ( \bigcup_{i \in I} U_i \right ) \\  \mathcal{X} \subseteq\bigcup_{i \in I} f^{-1}(U_i)
\end{align}
Since $f$ is continuous and $U_i$ is open in $\mathcal{Y}$ for all $i \in I$ we have that $f^{-1}(U_i)$ is open in $\mathcal{X}$ for all $i \in I$. From above, we see that then $\{ f^{-1}(U_i) \}_{i \in I}$ is an open cover of $\mathcal{X}$. Since $\mathcal{X}$ is compact, this open cover has a finite subcover, say $\{ f^{-1}(U_{i_1}), f^{-1}(U_{i_2}), \dots, f^{-1}(U_{i_n}) \}$ with $i_1, \dots, i_n \in I$, such that:
\begin{align} \quad \mathcal{X} \subseteq \bigcup_{k=1}^{n} f^{-1}(U_{i_k}) \end{align}

Taking the image of both sides above, we have that:
\begin{align} \quad f[\mathcal{X}] \subseteq f \left ( \bigcup_{k=1}^{n} f^{-1}(U_{i_k}) \right ) \\ \quad f[\mathcal{X}] \subseteq\bigcup_{k=1}^{n} f(f^{-1}(U_{i_k})) \\ \quad f[\mathcal{X}] \subseteq \bigcup_{k=1}^{n} U_{i_k}
\end{align}

Thus $\mathcal C^* = \{ U_{i_1}, U_{i_2}, \dots, U_{i_n} \}$ is a finite subcover of $\mathcal C$. Hence $f[\mathcal{X}]$ is compact in $\mathcal{Y}$.\qed

% $\rule{0.7em}{0.7em}$
\begin{figure*}[ht]
  \centering
  \includegraphics[width=1\linewidth]{./figures/SurfaceAreaCollapse-crop.pdf}
  \caption{The problem of surface area collapse.}
  \label{fig:surfaceAreaCollapse}
\end{figure*}
\section{Sphere is compact}
$\textbf{Lemma:}$ Let $\mathcal{S}^{n} := \{ \mathbf{x} \in \reals^{n+1} : \lVert\mathbf{x}\rVert = 1 \}$ be a hypersphere with radius $r=1$ centered at $\mathbf{0}$ and embedded in $\reals^{n+1}$. Then $\mathcal{S}^{n}$ is compact.

$\textbf{Proof:}$ First note that $\mathcal{S}^n$ is obviously bounded. Next, observe that the norm $\lVert\mathbf{x}\rVert = \sqrt{\sum_{i=1}^{n+1} x_i^2}$ is a continuous mapping and that $\mathcal{S}^n$ is the preimage of the closed set $\{ 1 \}$ under it; therefore $\mathcal{S}^n$ is closed. It follows that $\mathcal{S}^n$ is both closed and bounded, hence by the Heine--Borel theorem it is compact.

\section{Surface area collapse of the sphere}
As can be observed from Figure~\ref{fig:surfaceAreaCollapse}, the surface area grows up to approximately seven dimensions and then decreases, collapsing completely in cases with more than twenty dimensions. This issue makes it infeasible to use a compact hyperspherical latent space in high-dimensional configurations.


\section{DNN architectures used}
For MNIST and FashionMNIST datasets with a single channel we used the following architectures for baseline experiments.

\begin{table}[H]
\centering
\caption{Encoder CNN for MNIST and FashionMNIST}
\label{table:FMNISTEncoderCNN}
\begin{tabular}{cccc}
\hline
 Operation &  Kernel & Strides & Feature Maps  \\ \hline
 Convolution &  3 x 3 & 1 x 1 &  32\\
 Convolution & 3 x 3 & 1 x 1 & 16 \\
 Max pooling 2D & 2 x 2 & 2 x 2 & \textemdash \\
 Linear for $\boldsymbol{\mu}$ & \textemdash & \textemdash & 10 \\
 Linear for $\log \boldsymbol{\sigma}$ & \textemdash & \textemdash & 10  
\end{tabular}
\end{table}

\begin{table}[H]
\centering
\caption{Decoder CNN for MNIST and FashionMNIST}
\label{table:FMNISTDecoderCNN}
\begin{tabular}{cccc}
\hline
 Operation &  Kernel & Strides & Feature Maps  \\ \hline
 Linear for sampled $\mathbf{z}$ & \textemdash & \textemdash & 2306 \\
 Upsampling nearest 2D & \textemdash & \textemdash & \textemdash \\
 Max pooling 2D & 2 x 2 & 2 x 2 & \textemdash \\
 Transposed Convolution &  3 x 3 & 1 x 1 &  32\\
 Transposed Convolution & 3 x 3 & 1 x 1 & 1
\end{tabular}
\end{table}

For CIFAR10 dataset with three channels we used the following architectures with additional padding = 1 and no bias for every convolutional layer. Latent dimensionality = 70.

\begin{table}[H]
\centering
\caption{Encoder CNN for SVHN and CIFAR10}
\label{table:SVHNEncoderCNN}
\begin{tabular}{cccc}
\hline
 Operation &  Kernel & Strides & Feature Maps  \\ \hline
 Convolution &  3 x 3 & 1 x 1 &  16\\
 Convolution & 3 x 3 & 2 x 2 & 32  \\
 Convolution & 3 x 3 & 1 x 1 & 32  \\
 Convolution & 3 x 3 & 2 x 2 & 16  \\
  Linear & \textemdash & \textemdash & 512 \\
  Linear for $\boldsymbol{\mu}$ & \textemdash & \textemdash & 70 \\
 Linear for $\log \boldsymbol{\sigma}$ & \textemdash & \textemdash & 70  
\end{tabular}
\end{table}

\begin{table}[H]
\centering
\caption{Decoder CNN for SVHN and CIFAR10}
\label{table:SVHNDecoderCNN}
\begin{tabular}{cccc}
\hline
 Operation &  Kernel & Strides & Feature Maps  \\ \hline
 Linear for sampled $\mathbf{z}$ &  \textemdash & \textemdash &  512\\
 Linear & \textemdash & \textemdash &  1024\\
 Transposed Convolution & 3 x 3 & 2 x 2 & 32  \\
 Transposed Convolution & 3 x 3 & 1 x 1 & 32  \\
 Transposed Convolution & 3 x 3 & 2 x 2 & 16  \\
 Transposed Convolution & 3 x 3 & 1 x 1 & 3  \\ 
\end{tabular}
\end{table}


For all architectures we used ReLU as the non-linearity in the case of the classical VAE. For the Lipschitz encoder we used GroupSort. In addition, all pixels of the images have been normalized to the $[0,1]$ range for each channel for both training and testing phases. For HVAE we used the same architectures as in the original implementation\footnote{We used the official implementation available at \url{https://github.com/nicola-decao/s-vae-pytorch}.}, i.e., two hidden linear layers for the encoder with dimensionality 256 and 128, respectively, and two hidden linear layers for the decoder with dimensionality 128 and 256. For the Lipschitz VAE we also used two hidden linear layers for both encoder and decoder, with doubled dimensionality for each corresponding hidden layer.

\newpage
\section{Forward pass enforcing the Lipschitz constant}
\begin{algorithm2e}[h]
\DontPrintSemicolon
\SetKwInOut{Input}{Input}
\SetKwInOut{Output}{Output}
\SetKwInOut{Result}{Result}
\SetKwInOut{Requires}{Requires}
\SetKwFunction{LInfBallProj}{LInfBallProjection}
\SetKwBlock{Forward}{Forward pass}{Forward pass}
\SetKwBlock{ProjectBlock}{LInfBallProjection}{LInfBallProjection}
\ProjectBlock{
\Input{$\rvy \in \reals^N$}
\Output{$\rvx \in \reals^N$}
Sort $\rvy$ into $\rvu$: $u_1 \geq \ldots \geq u_N $ \;
% \BlankLine
Set $K := \max_{1 \leq k \leq N}\{k \mid (\sum_{r=1}^{k} u_r - 1) / k < u_k \}$ \;
Set $\tau := (\sum_{k=1}^{K}u_k - 1)/K$ \;
\For{$n=1, \ldots, N$}{
    Set $x_n := \max\{y_n - \tau, 0\}$ \;
}
}
\Input{Data point $\rvx$} 
\Result{Network output $\mathbf{h}_L$}
\Requires{Lipschitz constant $M$}
\nonl
\Forward{
\nonl $\mathbf{h}_0 \leftarrow \rvx$ \;

\nonl
\For{$l=1, \ldots, L$}{
\nonl $\mathbf{W}_l \leftarrow$ \LInfBallProj{$\mathbf{W}_l$}\; 
pre-activation $\leftarrow$ $M^{\frac{1}{L}} \mathbf{W}_l\mathbf{h}_{l-1}$ \; 
$\mathbf{h}_l \leftarrow \mathrm{GroupSort}(\text{pre-activation})$\;
}}
\caption{Ensuring Lipschitz constant in a DNN mapping}
\label{alg: training_forward_pass}
\end{algorithm2e}

\section{Further experiments with hyperspherical VAE}

\begin{table*}[h]
\centering
\caption{Scoring values (means and $99.9\%$ confidence intervals) for toy experiments with $\mathcal{S}^2$ for MNIST vs. held-out digits and MNIST vs. Fashion-MNIST, and with $\mathcal{S}^3$ for Fashion-MNIST vs. MNIST. The held-out outliers are all digits except 0's and 1's. Note that Vanilla VAEs in the experiments are equipped with the same low-dimensional latent space as the surface of the corresponding $\mathcal{S}$-VAE.}
\label{table:ToyExperiments}
\resizebox{\textwidth}{!}{
\begin{tabular}{lccccccccc}
\toprule
& \multicolumn{3}{c}{\textbf{MNIST held-out}} & \multicolumn{3}{c}{\textbf{MNIST vs. Fashion-MNIST}} & \multicolumn{3}{c}{\textbf{Fashion-MNIST vs. MNIST}} \\
\cmidrule(lr){2-4}\cmidrule(lr){5-7}\cmidrule(lr){8-10}
& \textbf{ROC AUC$\uparrow$} & \textbf{AUPRC$\uparrow$} & \textbf{FPR80$\downarrow$} & \textbf{ROC AUC$\uparrow$} & \textbf{AUPRC$\uparrow$} & \textbf{FPR80$\downarrow$}  &
\textbf{ROC AUC$\uparrow$} & \textbf{AUPRC$\uparrow$} & \textbf{FPR80$\downarrow$}
\\
% \cmidrule{2-10}
\midrule
& \multicolumn{9}{c}{\textit{\textbf{Vanilla VAE}}} \\
\midrule
\textbf{Log likelihood} &  96.84 (±0.07) &  98.50 (±0.04) &  4.43 (±0.27) & 99.85 (±0.02) & 99.86 (±0.01) & 0.00 (±0) & 45.13 (±0.1) & 43.75 (±0.05) & 75.60 (±0.27) \\
\textbf{Input complexity} & 42.98 (±0.86) & 45.28 (±0.52) & 81.82 (±0) &  18.27 (±2.12) &  37.18 (±0.8) & 100 (±0) & \cellcolor[rgb]{0.9,0.9,0.9} 94.96 (±1.18) & \cellcolor[rgb]{0.9,0.9,0.9} 95.57 (±1.12) & \cellcolor[rgb]{0.9,0.9,0.9} 10.91 (±5.68) \\
\textbf{Typicality test} & \cellcolor[rgb]{0.9,0.9,0.9} 96.84 (±0.05) & 98.50 (±0.04) & \cellcolor[rgb]{0.9,0.9,0.9} 4.24 (±0.25) & \cellcolor[rgb]{0.9,0.9,0.9} 99.86 (±0.01) & \cellcolor[rgb]{0.9,0.9,0.9} 99.87 (±0.01) & \cellcolor[rgb]{0.9,0.9,0.9} 0.00 (±0) & 45.16 (±0.1) & 43.76 (±0.06) & 75.60 (±0.35) \\
% \cmidrule{2-10}
\midrule
& \multicolumn{9}{c}{\textit{\textbf{$\mathcal{S}$-VAE}}} \\
% \cmidrule{2-10}
\midrule
\textbf{Log likelihood} &  97.07 (±0.05) &  98.62 (±0.06) &  4.34 (±0.24) & 99.85 (±0.02) & 99.87 (±0.01)  & 0.01 (±0.01) & 45.25 (±0.07) & 44.45 (±0.05) & 76.21 (±0.26) \\
\textbf{Input complexity} & 41.74 (±1.11) & 44.67 (±0.44) & 80.00 (±5.68) &  17.54 (±2.45) &  37.02 (±0.83) & 100 (±0) & \cellcolor[rgb]{0.9,0.9,0.9} 94.79 (±1.63) & \cellcolor[rgb]{0.9,0.9,0.9} 95.45 (±1.39) & \cellcolor[rgb]{0.9,0.9,0.9} 12.73 (±7.57) \\
\textbf{Typicality test} & \cellcolor[rgb]{0.9,0.9,0.9} 97.04 (±0.05) & 98.59 (±0.05) & \cellcolor[rgb]{0.9,0.9,0.9} 4.34 (±0.25) & \cellcolor[rgb]{0.9,0.9,0.9} 99.86 (±0.02) & \cellcolor[rgb]{0.9,0.9,0.9} 99.87 (±0.02)  & \cellcolor[rgb]{0.9,0.9,0.9} 0.00 (±0) & 45.25 (±0.08) & 44.45 (±0.09) & 76.17 (±0.24) \\
\textbf{Hole indicator (ours)} & \textbf{89.05 (±0.25)} & \cellcolor[rgb]{0.9,0.9,0.9} \textbf{99.38 (±0.02)} & \textbf{16.1 (±0.72)} &  \textbf{94.54 (±0.09)} & \cellcolor[rgb]{0.9,0.9,0.9} \textbf{99.01 (±0.02)} & \textbf{5.60 (±0.2)} & \textbf{87.37 (±0.16)} & \textbf{88.86 (±0.15)} &  \textbf{19.25 (±0.46)}\\
\bottomrule
\end{tabular}
}
{\scriptsize The most robust scores are in bold. The highest values are in gray.\\
\textbf{*} 0's in FPR80 are possible since it is a value for false-positive rate at 80\% of true-positive rate}
\end{table*}

As can be observed in Table~\ref{table:ToyExperiments}, the most robust scores are the hole indicators, which achieve the most consistent results across all used datasets.
% \bibliography{uai2023-template}


\section{Scores}
\subsection{Stds of LLs}
 Recall that \emph{importance sampling} is used to estimate the marginal likelihood of the input under the trained VAE, namely:
%
\begin{footnotesize}
\begin{equation}\label{eq_importance_sampling}
p_{\boldsymbol{\theta}}(\mathbf{x}) \simeq \frac{1}{N} \sum_{i=1}^{N} \frac{p_{\boldsymbol{\theta}}(\rvx, \rvz_{\textmd{ \emph{(i)}}})}{q_{\boldsymbol{\phi}}(\rvz_{\textmd{ \emph{(i)}}} | \rvx)},\quad\textrm{where}\quad\rvz_{\textmd{ \emph{(i)}}} \sim q_{\boldsymbol{\phi}}(\rvz | \rvx)
\end{equation}
\end{footnotesize}

where $\boldsymbol{\phi}$ represents the variational parameters of the encoder responsible for the variational approximation of the posterior $q_{\boldsymbol{\phi}}$ over the latent variable $\mathbf{z}$, and $\boldsymbol{\theta}$ stands for the generative parameters of the decoder responsible for the parametrization of the likelihood of the input $p_{\boldsymbol{\theta}}(\mathbf{x}|\mathbf{z})$.
Hence, it is possible to compute the sample standard deviation of the marginal likelihood under \emph{importance sampling} by computing the sample standard deviation of the terms within the given sum. This constitutes the essence of the Stds of LLs score.

\subsection{Hole indicator score}
For this score we sample the approximated posterior $q_{\boldsymbol{\phi}}(\rvz|\rvx)$ with several latent codes $\rvz$ under a particular input $\rvx$ and compute the sample standard deviation of the log-likelihoods $\log p(\rvx|\rvz)$:
\begin{equation}
\Sigma_{\rvz}[\rvx] = \sqrt{\frac{1}{N-1} \sum_{{\rvz}} \left(\log p(\rvx|\rvz) - \overline{\log p(\rvx|\rvz)}\right)^2}\end{equation}

\subsection{Typicality}
The test for typicality treats all input sequences as inliers if their entropy is sufficiently close to the entropy of the model, i.e., if the following holds for small $\epsilon$ then the given input is in-distribution: 
\begin{footnotesize}
\begin{equation}
\begin{split}
\left|-\log p\left({\rvx^*}\right)+\frac{1}{\lvert\mathcal{D}\rvert}\sum_{\rvx \in \mathcal{D}}\log p(\rvx)\right| \leq \epsilon
\end{split}
\end{equation}
\end{footnotesize}
This score is applied to one-element sequences in our work since it is the most realistic scenario in practical applications of outlier detection.


\subsection{Input complexity}
First, we compute the complexity estimate $L(\rvx)$ by compressing the input $\rvx$ with JPEG2000. The result represents a string of bits: $C(\rvx)$. After that we apply the normalization of the length of the resulting string by dimensionality $d$:
\begin{equation*}
    L(\rvx) = \frac{\left|C(\rvx)\right|}{d} .
\end{equation*}

Subsequently the input complexity score is calculated in the following way (in bits per dimension):
\begin{equation}
    S(\rvx) = -\log p(\rvx) - L(\rvx)
\end{equation}
The higher the $S$ score, the more indicative it is that the current input is an outlier.

\section{Compactness ablation}
Since the placement of the outliers within the unconstrained compact space with Vanilla VAEs is basically arbitrary, it can be the case that some outliers will still be successfully detected via the hole indicator when these outliers are mapped within the same space as the inliers. Hence, in order to make an appropriate ablation study of compactness alone, we conducted the following experiments. We gradually increase the pixel intensity of the images from one to higher values by multiplying it with a scalar. We calculate the hole indicator for each intensity step for both the Lipschitz VAE and the Vanilla VAE. The corresponding results can be observed in Table~\ref{table:CompactnessAblation}.

\begin{table*}[h]
\centering
\caption{Ablation of compactness with hole indicator.} 
\label{table:CompactnessAblation}
% \resizebox{\textwidth}{!}{
\begin{tabular}{lcccccccc}
\toprule
& \textbf{1x} & \textbf{3x} & \textbf{5x}  & \textbf{7x} & \textbf{9x} & \textbf{11x} 
& \textbf{13x} & \textbf{15x}
\\
% \cmidrule{2-10}
\midrule
\textbf{Vanilla VAE} 
&  100.0 &  100.0  &  100.0 
& 99.69 & 53.40 & 0.00 
& 0.00 & 0.03\\
\textbf{Lipschitz VAE} 
& 100.0 & 100.0 & 100.0 
&  99.48 &  99.36 & 95.32

& 99.48  & 95.70 \\
% \cmidrule{2-10}
\bottomrule
\end{tabular}
% }
\end{table*}

As can be seen from the obtained values, there is a clear transition from detectable to non-detectable outliers through the latent holes in the case of the Vanilla VAE, and no degradation of the results in the case of the Lipschitz VAEs.



\section{Samples from Lipschitz VAEs}
\begin{figure*}[ht]
\centering 
\subfigure{\includegraphics[width=0.33\columnwidth]{./figures/mnist_data_lpinf_z10_lipenc5_ep102_reconstructed2.pdf}}
\hfill
\subfigure{\includegraphics[width=0.33\columnwidth]{./figures/fmnist_data_lpinf_z10_lipenc5_ep102_reconstructed10.pdf}}
\hfill
\subfigure{\includegraphics[width=0.33\columnwidth]{./figures/cifar10_data_lpinf_z70_lipenc12_ep115_reconstructed2.pdf}}
\hfill


\caption{Random samples from the Lipschitz VAEs trained on MNIST, Fashion-MNIST, and CIFAR-10.}
\label{fig:LipschitzVAESamples}
\end{figure*}

\end{document}
