% \documentclass{uai2023} % for initial submission
\documentclass[accepted]{uai2023} % after acceptance, for a revised
                                    % version; also before submission to
                                    % see how the non-anonymous paper
                                    % would look like

%% There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2023} % ptmx math instead of Computer
% Modern (has noticable issues)
% \documentclass[mathfont=newtx]{uai2023} % newtx fonts (improves upon
 % ptmx; less tested, no support)
% NOTE: Only keep *one* line above as appropriate, as it will be replaced
%       automatically for papers to be published. Do not make any other
%       change above this note for an accepted version.

%% Choose your variant of English; be consistent
\usepackage[american]{babel}
% \usepackage[british]{babel}

%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} % amsmath with fixes and additions
% \usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{tikz} % nice language for creating drawings and diagrams

\usepackage{graphicx}
\usepackage[ruled,noline,linesnumbered]{algorithm2e}
\usepackage{algorithmic}

\usepackage{amsthm}
\usepackage{amsmath, amssymb}
\usepackage{bbold}

\usepackage{url}

\newtheorem{definition}{Definition}
\newtheorem{corollary}{Corollary}
\newtheorem{proposition}{Proposition}
\newtheorem{lemma}{Lemma}
\newtheorem{theorem}{Theorem}
\newtheorem{remark}{Remark}


\newcommand{\floor}[1]{\left\lfloor #1 \right\rfloor}
\newcommand{\ceil}[1]{\left\lceil #1 \right\rceil}
\newcommand*{\comb}[2]{{}^{#1}C_{#2}}%


\DeclareMathOperator*{\minimize}{minimize}
\DeclareMathOperator*{\maximize}{maximize}
\DeclareMathOperator*{\argmin}{arg\,min}
\DeclareMathOperator*{\argmax}{arg\,max}
\DeclareMathOperator{\E}{\mathbb{E}}
\DeclareMathOperator{\V}{\textup{Var}}
\DeclareMathOperator{\C}{\textup{Cov}}
\DeclareMathOperator{\MSE}{\textup{MSE}}

\DeclareMathOperator{\rank}{\textup{rank}}
\newcommand{\tr}{\textup{tr}}

\SetKwInput{KwInput}{Input}
\SetKwInput{KwOutput}{Output}

% for cross referencing the main text
% PLEASE ONLY USE xr IN THE SUPPLEMENTARY MATERIAL. 
% In the main paper, hard code any cross-reference to the supplementary material. 
% \usepackage{xr} 
% \externaldocument{uai2023-template}

%% Provided macros
% \smaller: Because the class footnote size is essentially LaTeX's \small,
%           redefining \footnotesize, we provide the original \footnotesize
%           using this macro.
%           (Use only sparingly, e.g., in drawings, as it is quite small.)

%% Self-defined macros
\newcommand{\swap}[3][-]{#3#1#2} % just an example

\title{Robust Gaussian Process Regression with the Trimmed Marginal Likelihood\\(Supplementary Material)}

% The standard author block has changed for UAI 2023 to provide
% more space for long author lists and allow for complex affiliations
%
% All author information is automatically removed by the class for the
% anonymous submission version of your paper, so you can already add your
% information below.
%
% Add authors
\author[1]{\href{mailto:<andrade@hiroshima-u.ac.jp>?Subject=Your UAI 2023 paper}{Daniel Andrade}{}}
\author[2,3]{Akiko Takeda}

% Add affiliations after the authors
\affil[1]{%
	Education and Research Center for Artificial Intelligence and Data Innovation\\
	Hiroshima University\\
	Hiroshima, Japan
}
\affil[2]{%
	Department of Mathematical Informatics\\ 
	The University of Tokyo\\
	Tokyo, Japan
}
\affil[3]{%
	Center for Advanced Intelligence Project\\
	RIKEN\\
	Tokyo, Japan
}
  
\begin{document}
  
\onecolumn %% Turn this off if single column is desired for the supplement
\maketitle

% This Supplementary Material should be submitted as a separate file. Please do not append the Supplementary Material to the main paper. 
% Fig. \ref{fig:pitt} and Eq \ref{eq:example} in the main paper can be cross referenced using \texttt{xr}. 

\appendix

\section{Proofs}

\subsection{Convergence Guarantee of the Proposed Projected Gradient Descent Method}

Optimization problem (P1) from the main paper is given by
\begin{equation}
	\min_{\mathbf{b}} f(\mathbf{b}) \mbox{ s.t. } \| \mathbf{b} \|_0 = n-m.
	\label{supp_prob1}
\end{equation}
This problem can be expressed as an unconstrained optimization problem by using the indicator function\footnote{The indicator function is defined as 
	$\delta_C(\mathbf{b}) := \begin{cases}
		0 & \text{if } \mathbf{b} \in C \, , \\
		\infty & \text{else}.
	\end{cases}$}
as follows:
\begin{equation}
	\min_{\mathbf{b}} F(\mathbf{b}) \, ,
	\label{supp_prob2}
\end{equation}
with
\begin{equation*}
	F(\mathbf{b}) := f(\mathbf{b}) + \delta_C(\mathbf{b}), \mbox{ where } C=\{\mathbf{b} \in \mathbb{R}^n \, | \,   \|\mathbf{b}\|_0 = n-m \} \, .
\end{equation*}

Recently, for analyzing the convergence rate of first-order methods for nonconvex objective functions, the so-called Kurdyka--{\L}ojasiewicz (KL) property is often used.
If the objective function $F(\mathbf{b})$ satisfies the KL property with an exponent of $\alpha = 1/2$ and the sequence $\{b_k\}$ generated by
the proximal gradient algorithm is bounded, then it was proven that $\{b_k\}$ converges locally and linearly to a stationary point of $F$ (see, for example, \cite{Attouch_etal10, Attouch_etal13, KLfunc_Li2018}).
Therefore, here, we only need to prove that $F(\mathbf{b})$ is a KL function with exponent $1/2$.

The definition of KL functions encompasses broad classes of functions, and it is known that a proper closed semi-algebraic function is a KL function with a suitable exponent $\alpha \in [0, 1)$. The above function $F$ is also a KL function.

\begin{theorem}
	Any sequence $\{b_k\}$ generated by the projected gradient algorithm for Problem~\eqref{supp_prob1} converges globally to a stationary point with a locally linear convergence rate.
\end{theorem}
\begin{proof}
	First, we show global convergence. \cite{Bolte_etal14} implies that the objective function $F$ of \eqref{supp_prob2} is a proper lower semi-continuous KL function. Considering that  $F$ is lower bounded and 
	$\nabla f$ is Lipschitz continuous, we can confirm the global convergence of the proximal gradient method from
	\cite[Theorem 5.1 and Remark 5.2]{Attouch_etal13}. Now for proving the convergence rate, we will check the
	KL exponent of $F$. $F$ can be further rewritten as
	\[
	F(\mathbf{b}) = \min_{S \subseteq \{1,\ldots,n\}, |S|=m } f(\mathbf{b})+\delta_{\Omega_{S}}(\mathbf{b})  \, ,
	\]
	where $\Omega_{S} := \{\mathbf{b}  \in \mathbb{R}^n  \, | \, b_{i}=0, \forall i \in S \}$. Here, for all possible $S$, $\delta_{\Omega_{S}}(\mathbf{b})$
	are proper closed polyhedral functions. Then  \cite[Corollary 5.2]{KLfunc_Li2018}  implies that $F(\mathbf{b})$ is a KL function with an 
	exponent of $1/2$. From this, and the boundedness of $\{b_k\}$, \cite[Proposition 5.1]{KLfunc_Li2018} implies that $\{b_k\}$ achieves linear convergence locally.
\end{proof}


\subsection{Proof of Asymptotically Correct Outlier Rejection}
Here we prove Proposition 1. 
Note that ignoring constants, we may write the negative marginal log-likelihood (NLL) as
\begin{align*}
	\text{NLL}(\sigma^2, \eta, \mathbf{l}) &:= - 2 \log p(\mathbf{y} | X, \sigma^2, \eta, \mathbf{l}) -  n \log 2 \pi \\
	&=  \mathbf{y}^T (K_{\eta, \mathbf{l}} + \sigma^2 I)^{-1} \mathbf{y}  +  \log | K_{\eta, \mathbf{l}} + \sigma^2 I | \\
	&=  \frac{1}{\eta} \mathbf{y}^T (K + \frac{\sigma^2}{\eta}I)^{-1} \mathbf{y}  +  \log ( \eta^n | K +  \frac{\sigma^2}{\eta} I | )\, ,
\end{align*}
where $K := K_{1, \mathbf{l}}$ (that means $K$ is $K_{\eta, \mathbf{l}}$, with $\eta$ being set to 1).

First, we establish a lower bound on NLL.
Let $\lambda_0$ denote the smallest possible eigenvalue of $K_{1, \mathbf{l}}$, i.e. 
\begin{align*} 
	\lambda_0 := \min_{\mathbf{l} \in \mathbb{D}} \lambda_{\min}(K_{1, \mathbf{l}}) \, ,
\end{align*}
where $\lambda_{\min}( A )$ denotes the smallest eigenvalue of a matrix $A$. Note that $1 \geq \lambda_0 > 0$.
Analogously, let  $\lambda_1$ denote the largest possible eigenvalue of $K_{1, \mathbf{l}}$, i.e. 
\begin{align*} 
	\lambda_1 := \max_{\mathbf{l} \in \mathbb{D}} \lambda_{\max}(K_{1, \mathbf{l}}) \, ,
\end{align*}
where $\lambda_{\max}( A )$ denotes the largest eigenvalue of a matrix $A$. Note that $1 \leq \lambda_1  < n$. Therefore, for any $\mathbf{l} \in \mathbb{D}$, all eigenvalues of $K$ are bounded. 
In particular, we have 
\begin{align*} 
	\lambda_{\min}\Big(K +  \frac{\sigma^2}{\eta} I \Big) \geq  \lambda_0 +  \frac{\sigma^2}{\eta} \, ,
\end{align*}
and 
\begin{align*} 
	\lambda_{\min}\Big((K +  \frac{\sigma^2}{\eta} I)^{-1} \Big) \geq  (\lambda_1 +  \frac{\sigma^2}{\eta})^{-1} \, .
\end{align*}
%
Define 
\begin{align*} 
	g_2(\sigma^2, \eta) := \frac{1}{\eta}  (\lambda_1 +  \frac{\sigma^2}{\eta})^{-1}  ||\mathbf{y} ||^2_2 +  \log ( \eta^n ( \lambda_0 +  \frac{\sigma^2}{\eta} )^n ) \, ,
\end{align*}
then we have 
\begin{align*} 
	g_2(\sigma^2, \eta) \leq \text{NLL}(\sigma^2, \eta, \mathbf{l}) \, .
\end{align*}

Since the function $g_2$ is still slightly difficult to analyze, we establish another lower bounding function $g_1$.

First note that $g_2$ can be written as follows
\begin{align*} 
	g_2(\sigma^2, \eta) = (\eta \lambda_1 +  \sigma^2)^{-1}  ||\mathbf{y} ||^2_2 +  n \log ( \eta \lambda_0 +  \sigma^2 )  \, .
\end{align*}


Noting that 
\begin{align*} 
	n \log ( \lambda_0 ) + n \log ( {\eta} + {\sigma}^2)  
	&= n \log ( \lambda_0 {\eta}  + \lambda_0 {\sigma}^2)  \\
	&\leq  n \log ( {\eta} \lambda_0 + {\sigma}^2)   \, ,
\end{align*}
and
\begin{align*} 
	\lambda_1^{-1} ( {\eta} + {\sigma}^2)^{-1}
	&= (\lambda_1 {\eta} + \lambda_1 {\sigma}^2)^{-1}  \\
	&\leq  (\lambda_1 {\eta} + {\sigma}^2)^{-1}   \, ,
\end{align*}
we have 
\begin{align*} 
	g_1(\sigma^2, \eta) \leq g_2({\sigma}^2, {\eta}) \, ,
\end{align*}
where we defined
\begin{align*} 
	g_1(\sigma^2, \eta) := \lambda_1^{-1} ( {\eta} + {\sigma}^2)^{-1} ||\mathbf{y} ||^2_2 + n \log ( \lambda_0 ) + n \log ( {\eta} + {\sigma}^2)  \, .
\end{align*}
%
Therefore, we have
\begin{align} \label{eq:supp_inequalities}
	\min_{\sigma^2, \eta} g_1(\sigma^2, \eta)  \leq \min_{\sigma^2, \eta} g_2(\sigma^2, \eta) \leq \min_{\sigma^2, \eta, \mathbf{l}}  \text{NLL}(\sigma^2, \eta, \mathbf{l}) \, .
\end{align}

Next, we will show that, if $||\mathbf{y}||^2_2 \rightarrow \infty$, 
then 
\begin{align*} 
	\min_{\sigma^2, \eta} g_1(\sigma^2, \eta) \rightarrow \infty \, .
\end{align*}

First, note that $g_1$ depends only on the sum ${\eta} + {\sigma}^2$, rather than the individual values.
Therefore, we can re-parameterize $g_1$ as follows
\begin{align*} 
	% g_{1*}(z) := \lambda_1^{-1} ( {\eta} + {\sigma}^2)^{-1} ||\mathbf{y} ||^2_2 + n \log ( \lambda_0 ) + n \log ( {\eta} + {\sigma}^2)  \, . \\
	g_{1*}(z) := \lambda_1^{-1} z ||\mathbf{y} ||^2_2 + n \log ( \lambda_0 ) - n \log z  \, ,
\end{align*}
where $z :=  ( {\eta} + {\sigma}^2)^{-1}$, and we have 
\begin{align*} 
	\min_{z} g_{1*}(z) = \min_{\sigma^2, \eta} g_1(\sigma^2, \eta) \, .
\end{align*}
Since $g_{1*}$ is a convex function, the minimum value of $g_{1*}$ is attained for $\hat{z}$ with
\begin{align*} 
	\frac{\partial g_{1*}}{\partial z}(\hat{z}) = \frac{ ||\mathbf{y} ||^2_2}{\lambda_1} - \frac{n}{\hat{z}}  = 0\, ,
\end{align*}
and therefore 
\begin{align*} 
	\hat{z} = n \frac{\lambda_1} { ||\mathbf{y} ||^2_2} \, ,
\end{align*}
and 
\begin{align*} 
	\min_{z} g_{1*}(z) = n + n \log ( \lambda_0 ) - n \log (\lambda_1 n) + n \log ( ||\mathbf{y} ||^2_2 ) \, .
\end{align*}
Therefore, if $||\mathbf{y}||^2_2 \rightarrow \infty$, 
\begin{align*} 
	\min_{z} g_{1*}(z)  \rightarrow \infty \, , 
\end{align*}
and as a consequence, from Inequalities \eqref{eq:supp_inequalities}, we have
\begin{align*} 
	\min_{\sigma^2, \eta, \mathbf{l}}  \text{NLL}(\sigma^2, \eta, \mathbf{l})   \rightarrow \infty \, .
\end{align*}
Therefore, as long as one or more observations belonging to $V$ are selected, we must have that $\min_{\sigma^2, \eta, \mathbf{l}}  \text{NLL}(\sigma^2, \eta, \mathbf{l}) \rightarrow \infty$. Since $\text{NLL}(\sigma^2, \eta, \mathbf{l})$ is bounded from above for observations belonging to $U$, the trimmed marginal likelihood GP will select only observations from $U$.


\subsection{Asymptotic bias correction for $\sigma^2$}
Here, we explain the asymptotic correction for estimating the noise variance for Algorithm 2 in the main paper.
% $\sigma^2$ for the student-t distribution. 
% Note that this estimate is not necessary for ranking the outliers, since two different $\sigma^2$ will produce the same ranking. 
% However, an estimate of $\sigma^2$  is important for defining a meaningful threshold for distinguishing outliers from inliers based on the residuals. 
% For example, this allows for plotting the standardized residuals that are used for outlier analysis, as shown in this supplement material in Section

The derivation presented here generalizes the derivation of the correction for least median of squares regression~\citep{rousseeuw1984least}.
% that we apply for the variance which is used for the 
Let $Q_f$ denote the quantile function of a distribution $f$,
and let $Q_{\{r_i^2\}_{i = 1}^n}$ denote the empirical quantile function of the observed squared residuals $r_i^2$.
We define $Q_{\{r_i^2\}_{i = 1}^n}(p) = r_{(\floor{pn})}^2$, where $r_{(1)}^2 \leq r_{(2)}^2 \leq \ldots \leq r_{(n)}^2$.
Let $\nu$ be the user-set maximum outlier-ratio, i.e. $ 1 - \nu = \frac{m}{n}$.
Furthermore, note that each $r_i^2$ is distributed according to $\sigma^2 \chi^2(1)$, where $\chi^2(1)$ is the $\chi^2$ distribution with 1 degree of freedom.
For $n \rightarrow \infty$, we have, see e.g. \citep{walker1968note}, 
\begin{align*}
	Q_{\{r_i^2\}_{i = 1}^n} (1 - \nu) \;  \stackrel{p}{\longrightarrow} \; Q_{\sigma^2 \chi^2(1)} (1 - \nu) \, .
\end{align*} 
Therefore, for sufficiently large $n$, we have that 
\begin{align*}
	Q_{\{r_i^2\}_{i = 1}^n} (1 - \nu) 
	&\approx Q_{\sigma^2 \chi^2(1)} (1 - \nu)  \\
	&= \sigma^2  Q_{\chi^2(1)} (1 - \nu) \, .
\end{align*} 
The last line follows from properties of the quantile function (see for example Lemma 1 in this supplementary material).
Therefore, we set
\begin{align*}
	\sigma^2 = \frac{r_{(\floor{(1 - \nu) n})}^2}{Q_{\chi^2(1)} (1 - \nu)}  \, .
\end{align*}


\begin{lemma}
	Let $Q_X$ be the quantile function of a real valued random variable $X$, and 
	define $Y := \alpha X$, where $\alpha > 0$. 
	Then the following holds
	\begin{align*}
		Q_Y = \alpha  Q_X \,  .
	\end{align*}
\end{lemma}

\begin{proof}
	First note that 
	\begin{align*}
		P(Y \leq y) &= P(X \alpha \leq y) \\
		&= P(X \leq \frac{y}{\alpha} ) \, .
	\end{align*}
	For any $u \in ]0, 1[$, we have
	\begin{align*}
		Q_Y(u) &= \inf\{ y  \in \mathbb{R} \, |  \, u \leq P(Y \leq y)\} \\
		&= \inf\{ y  \in \mathbb{R} \, | \,  u \leq P(X \leq \frac{y}{\alpha})\} \\
		&= \alpha \inf\{ \frac{y}{\alpha}  \in \mathbb{R} \, | \,  u \leq P(X \leq \frac{y}{\alpha} )\} \\
		&= \alpha \inf\{ x \in \mathbb{R} \, | \,  u \leq P(X \leq x)\} \\
		&= \alpha Q_X(u) \, . 
	\end{align*}
\end{proof}

\section{Details of Greedy Method}
The function starts with the index set of all data points $S := \{1, 2, \ldots, n\}$, and then removes 
the data point $i_*$ which leads to the largest marginal likelihood, i.e.
\begin{equation} \label{eq:supp_greedy_remove}
	i_* :=  \argmax_{i \in S}  \Big(  \log p(\mathbf{y}_{S \setminus \{i\}} | X_{S \setminus \{i\}}, \boldsymbol{\theta}) \Big) \, .
\end{equation}
This is repeated until $|S| = \ceil{(1 - \nu) n}$. Naively solving the optimization in Equation \eqref{eq:supp_greedy_remove} is in $O(n^4)$, since 
we need to repeat $n$ times the calculation of the determinant and inverse of $K_{S \setminus \{i\}}$, where $K_{S \setminus \{i\}}$ denotes the covariance matrix (plus $\sigma^2 I$) of the data points in  $S \setminus \{i\}$.
However, using the block matrix inversion lemma (together with the Woodbury formula) and the cofactor representation of the determinant, we can solve it in $O(n^3)$ as follows.
Without loss of generality assume that sample $i$ corresponds to the last row and column of $K_S$ and write 
\begin{align*}
	K_S =: 
	\begin{pmatrix}
		A & \mathbf{b} \\
		\mathbf{b}^T & c 
	\end{pmatrix} , \text{and} \quad 
	K_S^{-1} =: 
	\begin{pmatrix}
		U & \mathbf{v} \\
		\mathbf{v}^T & w 
	\end{pmatrix} \, .
\end{align*}
Using the block matrix inversion lemma, we have 
\begin{align*}
	U &= A^{-1} + A^{-1} \mathbf{b} (- \mathbf{v}^T) \\
	&= A^{-1} (I - \mathbf{b} \mathbf{v}^T) \, ,
\end{align*}
and therefore
\begin{align*}
	A^{-1} &= U (I - \mathbf{b} \mathbf{v}^T)^{-1} \\
	&= U (I + \mathbf{b} \mathbf{v}^T \frac{1}{1-\mathbf{v}^T \mathbf{b} })\, ,
\end{align*}
where in the last line we used the Woodbury formula. Since $A = K_{S \setminus \{i\}}$, this allows for an efficient calculation of $K_{S \setminus \{i\}}^{-1}$.
Finally, the determinant $| K_{S \setminus \{i\}}|$ can also be efficiently calculated as follows.
Denote the cofactor matrix of $K_{S}$ as $C$, therefore we have $C_{nn} = |A|$.
Using the cofactor representation of the inverse, we have
\begin{align*}
	K_{S}^{-1} = \frac{1}{|K_{S}|} C  \, ,
\end{align*}
and therefore 
\begin{align*}
	|A| &= C_{nn} \\
	&=  |K_{S}|   (K_{S}^{-1})_{nn} \, .
\end{align*}

\section{Comment on Bias Model from Previous Works}
%The method in \citep{park2021robust} introduces a bias vector $\boldsymbol{\delta} \in \mathbb{R}^n$, where $n$ is the number of samples.
%If and only if $\delta_i \neq 0$, then sample $i$ is considered an outlier. They propose to learn $\boldsymbol{\delta}$ using the $\ell_1$-penalty. 
%However, we can prove the following proposition, showing that the method in \citep{park2021robust} has a breakdown point of 0:
%\begin{proposition}
%If there is only one outlier with $y_{i_*} \rightarrow \infty$, then $\forall i: \delta_i \neq 0$, meaning all samples are considered as outliers.
%\end{proposition}

The method in \citep{park2021robust} (``Constant Bias Model'', Section 3.1) introduces a bias vector $\boldsymbol{\delta} \in \mathbb{R}^n$, where $n$ is the number of samples.
If $\delta_i \neq 0$, then sample $i$ is considered an outlier. Furthermore, introducing a Laplace prior on each $\delta_i$, with common scale $\lambda$, they propose to jointly estimate $\boldsymbol{\delta}$ and $\lambda$ as follows:
\begin{align*}
	\hat{\boldsymbol{\delta}}, \hat{\lambda} = \argmin_{\boldsymbol{\delta}, \lambda} \frac{1}{2} (\mathbf{y} - \boldsymbol{\delta})^T A^{-1} (\mathbf{y} - \boldsymbol{\delta}) + \lambda || \boldsymbol{\delta} ||_1 - \log \lambda\, ,
\end{align*}
for some positive definite matrix $A$, and responses $\mathbf{y} \in \mathbb{R}^n$.\footnote{The term $\lambda || \boldsymbol{\delta} ||_1 - \log \lambda$ is supposed to correspond to a Laplace prior on each component $\delta_i$ of $\boldsymbol{\delta}$. However, note that the resulting penalty on $\lambda$ should be $- n \log \lambda$ rather than $-  \log \lambda$.} They suggest alternating between the optimization of $\boldsymbol{\delta}$ and $\lambda$.
However, even a single outlier can lead to a $\hat{\boldsymbol{\delta}}$ which has no zero entry, that is, all samples are treated as outliers.
To see this, first consider the optimization of $\boldsymbol{\delta}$, leaving $\lambda$ fixed. Assume that sample $i_*$ is an outlier with $y_{i_*} \rightarrow \infty$, then we have $| \delta_{i_*} |  \rightarrow \infty$. (On the other hand, if $| \delta_{i_*} |$ were bounded, then $y_{i_*}$ would have an arbitrarily large influence on the marginal likelihood.)
Next, consider the optimization of $\lambda$, leaving $\boldsymbol{\delta}$ fixed: the problem is convex with the unique minimum at 
\begin{align*}
	\hat{\lambda} = \frac{1}{ || \boldsymbol{\delta} ||_1 }  \, .
\end{align*}
Note that $\frac{1}{ || \boldsymbol{\delta} ||_1 } \leq  \frac{1}{ |\delta_{i_*} | }$. Since $| \delta_{i_*} |  \rightarrow \infty$, we have that $\hat{\lambda} \rightarrow 0$. However, if $\hat{\lambda}$ is close to $0$, the penalty
$ \lambda || \boldsymbol{\delta} ||_1$ will in effect be switched off, leading to $\hat{\boldsymbol{\delta}} = \mathbf{y}$.

\section{Additional Details and Experiments}
For all methods, we initialize all hyper-parameters $\boldsymbol{\theta}$ to $\log 2$, except the variance $\sigma^2$ which is initialized to 10.
For all data, we standardize the response and covariates using the median and the interquartile range (IQR).
For all experiments, we used an Nvidia DGX-2. %   Mac mini with ARM architecture M1 and 16GB memory.
For the real datasets, for evaluating the predictive performance of all methods, we randomly split the data into training (90\%) and test data (10\%). %  10-fold cross-validation.

\subsection{Additional Results}

\begin{table}
	\centering
	\caption{Estimated upper bound on outlier ratio $\nu$. Except ``no extra outliers'', the true ratio of added outliers is $0.1$.} \label{tab:supp_analysis_nu}
	\footnotesize
	\begin{tabular}{rlllll}
		% 
		\toprule % from booktabs package
		&  \bfseries  no extra outliers  & \bfseries  uniform  & \bfseries  focused  & \bfseries asym \\
		\midrule 
		bow & 0.02 (0.01) & 0.08 (0.0) & 0.09 (0.02) & 0.07 (0.0) \\
		F100 & 0.03 (0.01) & 0.07 (0.01) & 0.08 (0.03) & 0.08 (0.01) \\
		F400 & 0.02 (0.0) & 0.07 (0.0) & 0.1 (0.0) & 0.07 (0.0) \\
		body & 0.02 (0.0) & 0.06 (0.01) & 0.06 (0.02) & 0.07 (0.01) \\
		house & 0.02 (0.0) & 0.06 (0.0) & 0.06 (0.02) & 0.06 (0.0) \\
		spacega & 0.03 (0.0) & 0.07 (0.0) & 0.08 (0.0) & 0.07 (0.0) \\
		\bottomrule % from booktabs package
	\end{tabular}
\end{table}

\begin{table}
	\centering
	\caption{Runtime in minutes of each GP regression method.}  \label{tab:supp_analysis_runtime_GP_methods}
	\footnotesize
	\begin{tabular}{rllll}
		% 
		\toprule % from booktabs package
		\multicolumn{5}{c}{no extra added outliers} \\
		\midrule 
		& \bfseries  GP &  \bfseries  $\gamma$-GP  & \bfseries  $t$-GP  & \bfseries  $\nu$-GP   \\
		\midrule % from booktabs package
		bow & \textbf{0.06} (0.0) & 0.1 (0.0) & 0.1 (0.0) & 5.93 (0.76) \\
		F100 & \textbf{0.09} (0.01) & 0.13 (0.0) & 0.17 (0.0) & 4.33 (3.42) \\
		F400 & \textbf{0.1} (0.01) & 0.25 (0.02) & 0.31 (0.03) & 2.88 (0.6) \\
		body & \textbf{0.1} (0.0) & 0.27 (0.0) & 0.23 (0.0) & 67.3 (0.0) \\
		house & \textbf{0.12} (0.0) & 0.25 (0.0) & 0.36 (0.0) & 17.85 (0.0) \\
		spacega & \textbf{1.02} (0.0) & 8.88 (0.0) & 8.79 (0.0) & 9.05 (0.0) \\
		\midrule
		\multicolumn{5}{c}{uniform outliers} \\
		\midrule 
		bow & \textbf{0.06} (0.0) & 0.1 (0.0) & 0.1 (0.0) & 3.44 (0.44) \\
		F100 & \textbf{0.09} (0.0) & 0.13 (0.01) & 0.17 (0.0) & 2.53 (1.32) \\
		F400 & \textbf{0.11} (0.01) & 0.24 (0.01) & 0.15 (0.01) & 3.04 (1.23) \\
		body & 0.77 (0.48) & 0.25 (0.01) & \textbf{0.22} (0.0) & 29.44 (15.93) \\
		house & 0.41 (0.38) & \textbf{0.24} (0.02) & \textbf{0.24} (0.03) & 25.76 (29.33) \\
		spacega & \textbf{0.76} (0.02) & 8.8 (0.06) & 8.78 (0.07) & 9.07 (0.17) \\
		\midrule
		\multicolumn{5}{c}{focused outliers} \\
		\midrule
		bow & \textbf{0.06} (0.0) & 0.1 (0.0) & 0.1 (0.0) & 3.51 (0.53) \\
		F100 & \textbf{0.09} (0.01) & 0.13 (0.0) & 0.17 (0.01) & 3.24 (2.37) \\
		F400 & \textbf{0.1} (0.0) & 0.23 (0.0) & 0.12 (0.02) & 4.71 (1.13) \\
		body & \textbf{0.1} (0.0) & 0.24 (0.01) & 0.22 (0.0) & 55.5 (43.44) \\
		house & \textbf{0.11} (0.0) & 0.23 (0.01) & 0.28 (0.01) & 20.09 (4.42) \\
		spacega & \textbf{0.84} (0.01) & 8.74 (0.11) & 8.67 (0.09) & 23.81 (3.73) \\
		\midrule
		\multicolumn{5}{c}{asymmetric outliers} \\
		\midrule
		bow & \textbf{0.06} (0.0) & 0.1 (0.0) & 0.1 (0.01) & 3.33 (0.36) \\
		F100 & \textbf{0.09} (0.0) & 0.13 (0.01) & 0.17 (0.0) & 3.66 (3.32) \\
		F400 & \textbf{0.12} (0.02) & 0.23 (0.01) & 0.15 (0.02) & 2.68 (0.46) \\
		body & 0.46 (0.42) & 0.24 (0.03) & \textbf{0.22} (0.0) & 26.23 (14.72) \\
		house & 0.3 (0.38) & 0.24 (0.01) & \textbf{0.23} (0.01) & 9.58 (4.56) \\
		spacega & \textbf{0.76} (0.02) & 8.8 (0.06) & 8.78 (0.08) & 8.92 (0.23) \\
		\bottomrule % from booktabs package
	\end{tabular}
\end{table}


\begin{table}
	\centering
	\caption{Runtime in minutes of each optimization method.}  \label{tab:supp_analysis_runtime_optimization_methods}
	\footnotesize
	\begin{tabular}{rlll}
		% 
		\toprule % from booktabs package
		\multicolumn{4}{c}{no extra added outliers} \\
		\midrule
		&  \bfseries  PGD  & \bfseries  Greedy (batch)  & \bfseries  Greedy (1-by-1)   \\
		\midrule 
		bow & \textbf{0.2} (0.02) & 10.37 (7.07) & 169.51 (32.26) \\
		F100 & \textbf{0.14} (0.12) & 8.86 (7.98) & 5.01 (3.68) \\
		F400 & \textbf{0.12} (0.05) & 10.89 (9.67) & 173.58 (52.01) \\
		body & \textbf{1.49} (0.0) & 3.4 (0.0) & 27.17 (0.0) \\
		house & \textbf{0.27} (0.0) & 7.29 (0.0) & 76.35 (0.0) \\
		spacega & \textbf{0.82} (0.0) & 23.8 (0.0) & - \\
		\midrule
		\multicolumn{4}{c}{uniform outliers} \\
		\midrule 
		bow & \textbf{0.14} (0.04) & 2.37 (0.29) & 160.39 (3.15) \\
		F100 & \textbf{0.13} (0.15) & 1.74 (1.85) & 7.76 (5.66) \\
		F400 & \textbf{0.15} (0.06) & 2.59 (1.44) & 42.53 (4.65) \\
		body & \textbf{0.79} (0.75) & 5.17 (3.97) & 65.61 (59.16) \\
		house & \textbf{0.21} (0.26) & 2.82 (2.64) & 150.36 (107.69) \\
		spacega & \textbf{0.6} (0.15) & 8.52 (0.11) & - \\
		\midrule
		\multicolumn{4}{c}{focused outliers} \\
		\midrule
		bow & \textbf{0.17} (0.01) & 3.49 (0.78) & 170.7 (26.81) \\
		F100 & \textbf{0.14} (0.18) & 1.37 (1.06) & 8.13 (4.43) \\
		F400 & \textbf{0.13} (0.0) & 2.94 (0.62) & 139.74 (19.06) \\
		body & \textbf{0.21} (0.24) & 2.03 (0.82) & 33.37 (37.42) \\
		house & \textbf{0.71} (1.19) & 6.12 (7.69) & 227.69 (209.95) \\
		spacega & \textbf{0.9} (0.07) & 9.09 (1.44) & - \\
		\midrule
		\multicolumn{4}{c}{asymmetric outliers} \\
		\midrule
		bow & \textbf{0.09} (0.0) & 2.2 (0.07) & 48.24 (1.08) \\
		F100 & \textbf{0.13} (0.15) & 2.61 (3.26) & 5.23 (3.56) \\
		F400 & \textbf{0.13} (0.0) & 2.13 (1.34) & 42.8 (5.34) \\
		body & \textbf{0.41} (0.48) & 3.18 (4.07) & 42.09 (37.34) \\
		house & \textbf{0.15} (0.1) & 1.3 (1.0) & 73.9 (69.2) \\
		spacega & \textbf{0.54} (0.01) & 8.47 (0.35) & - \\
		\bottomrule % from booktabs package
	\end{tabular}
\end{table}

\begin{table}
	\centering
	\caption{Marginal likelihood of solution found by different optimization methods.}  \label{tab:supp_analysis_marginal_likelihood}
	\footnotesize
	\begin{tabular}{rlll}
		% 
		\toprule % from booktabs package
		\multicolumn{4}{c}{no extra added outliers} \\
		\midrule
		&  \bfseries  PGD  & \bfseries  Greedy (batch)  & \bfseries  Greedy (1-by-1)   \\
		\midrule 
		bow & \textbf{1.76} (0.09) & 1.75 (0.09) & \textbf{1.76} (0.08) \\
		F100 & \textbf{0.07} (0.12) & -0.06 (0.24) & -0.0 (0.46) \\
		F400 & 0.34 (0.2) & 0.36 (0.23) & \textbf{0.42} (0.24) \\
		body & \textbf{3.35} (0.0) & 3.11 (0.0) & 3.23 (0.0) \\
		house & 0.11 (0.0) & 0.09 (0.0) & \textbf{0.18} (0.0) \\
		spacega & -0.31 (0.0) & \textbf{0.38} (0.0) & - \\
		\midrule
		\multicolumn{4}{c}{uniform outliers} \\
		\midrule 
		bow & \textbf{1.7} (0.07) & 1.54 (0.08) & \textbf{1.7} (0.07) \\
		F100 & 0.01 (0.18) & -0.1 (0.14) & \textbf{0.1} (0.17) \\
		F400 & 0.19 (0.12) & 0.07 (0.19) & \textbf{0.2} (0.12) \\
		body & \textbf{-1.34} (2.33) & -1.5 (2.02) & \textbf{-1.34} (2.3) \\
		house & -1.99 (1.13) & -2.0 (1.11) & \textbf{-1.96} (1.16) \\
		spacega & -0.26 (0.03) & \textbf{0.05} (0.07) & - \\
		\midrule
		\multicolumn{4}{c}{focused outliers} \\
		\midrule
		bow & \textbf{1.8} (0.05) & 1.57 (0.05) & \textbf{1.8} (0.05) \\
		F100 & 0.13 (0.13) & -0.08 (0.25) & \textbf{0.22} (0.13) \\
		F400 & 0.15 (0.04) & -0.0 (0.05) & \textbf{0.22} (0.16) \\
		body & 0.72 (1.19) & 0.46 (0.91) & \textbf{0.74} (1.25) \\
		house & 0.27 (0.18) & 0.15 (0.26) & \textbf{0.32} (0.25) \\
		spacega & -0.26 (0.01) & \textbf{-0.02} (0.14) & - \\
		\midrule
		\multicolumn{4}{c}{asymmetric outliers} \\
		\midrule
		bow & \textbf{1.67} (0.1) & 1.49 (0.11) & \textbf{1.67} (0.1) \\
		F100 & \textbf{0.15} (0.13) & -0.13 (0.21) & 0.14 (0.32) \\
		F400 & 0.17 (0.07) & 0.03 (0.14) & \textbf{0.23} (0.13) \\
		body & -1.17 (2.25) & -1.56 (1.5) & \textbf{-1.14} (2.27) \\
		house & \textbf{-1.23} (0.96) & -1.29 (0.92) & \textbf{-1.23} (0.96) \\
		spacega & -0.25 (0.02) & \textbf{-0.07} (0.09) & - \\
		\bottomrule % from booktabs package
	\end{tabular}
\end{table}

\begin{table}
	\centering
	\caption{Outlier ranking performance (R-precision) of different optimization methods.}  \label{tab:supp_analysis_r_precision}
	\footnotesize
	\begin{tabular}{rlll}
		% 
		\toprule % from booktabs package
		\multicolumn{4}{c}{uniform outliers} \\
		\midrule
		&  \bfseries  PGD  & \bfseries  Greedy (batch)  & \bfseries  Greedy (1-by-1)   \\
		\midrule 
		bow & \textbf{1.0} (0.0) & \textbf{1.0} (0.0) & \textbf{1.0} (0.0) \\
		F100 & \textbf{1.0} (0.0) & \textbf{1.0} (0.0) & \textbf{1.0} (0.0) \\
		F400 & \textbf{1.0} (0.0) & \textbf{1.0} (0.0) & \textbf{1.0} (0.0) \\
		body & \textbf{0.87} (0.06) & 0.86 (0.06) & 0.86 (0.06) \\
		house & \textbf{0.86} (0.06) & 0.85 (0.06) & \textbf{0.86} (0.05) \\
		spacega & 0.98 (0.0) & \textbf{0.99} (0.01) & - \\
		\midrule
		\multicolumn{4}{c}{focused outliers} \\
		\midrule
		bow & \textbf{1.0} (0.0) & \textbf{1.0} (0.0) & \textbf{1.0} (0.0) \\
		F100 & \textbf{1.0} (0.0) & \textbf{1.0} (0.0) & \textbf{1.0} (0.0) \\
		F400 & \textbf{1.0} (0.0) & \textbf{1.0} (0.0) & \textbf{1.0} (0.0) \\
		body & \textbf{1.0} (0.01) & 0.95 (0.11) & 0.98 (0.05) \\
		house & \textbf{0.91} (0.16) & 0.55 (0.24) & 0.71 (0.32) \\
		spacega & \textbf{0.97} (0.0) & 0.31 (0.3) & - \\
		\midrule
		\multicolumn{4}{c}{asymmetric outliers} \\
		\midrule
		bow & \textbf{1.0} (0.0) & \textbf{1.0} (0.0) & \textbf{1.0} (0.0) \\
		F100 & \textbf{1.0} (0.0) & 0.99 (0.03) & \textbf{1.0} (0.0) \\
		F400 & \textbf{1.0} (0.0) & \textbf{1.0} (0.0) & \textbf{1.0} (0.0) \\
		body & \textbf{0.86} (0.06) & \textbf{0.86} (0.06) & \textbf{0.86} (0.06) \\
		house & \textbf{0.85} (0.05) & \textbf{0.85} (0.05) & \textbf{0.85} (0.05) \\
		spacega & 0.98 (0.0) & \textbf{0.99} (0.0) & - \\
		\bottomrule % from booktabs package
	\end{tabular}
\end{table}

\begin{table}
	\centering
	\caption{Root mean squared error (RMSE) on test data of different optimization methods.}  \label{tab:supp_analysis_rmse}
	\footnotesize
	\begin{tabular}{rlll}
		% 
		\toprule % from booktabs package
		\multicolumn{4}{c}{no extra added outliers} \\
		\midrule
		&  \bfseries  PGD  & \bfseries  Greedy (batch)  & \bfseries  Greedy (1-by-1)   \\
		\midrule 
		bow & \textbf{0.06} (0.0) & \textbf{0.06} (0.0) & \textbf{0.06} (0.0) \\
		F100 & \textbf{0.32} (0.05) & 0.34 (0.08) & 0.42 (0.19) \\
		F400 & 0.25 (0.05) & \textbf{0.23} (0.06) & 0.24 (0.05) \\
		body & 0.08 (0.1) & \textbf{0.05} (0.08) & 0.08 (0.09) \\
		house & 0.55 (0.12) & \textbf{0.46} (0.11) & 0.54 (0.13) \\
		spacega & 0.49 (0.03) & \textbf{0.37} (0.02) & - \\
		\midrule
		\multicolumn{4}{c}{uniform outliers} \\
		\midrule 
		bow & \textbf{0.05} (0.0) & 0.06 (0.0) & \textbf{0.05} (0.0) \\
		F100 & 0.31 (0.06) & 0.31 (0.07) & \textbf{0.29} (0.07) \\
		F400 & 0.25 (0.03) & \textbf{0.23} (0.05) & 0.24 (0.03) \\
		body & \textbf{0.05} (0.08) & \textbf{0.05} (0.07) & \textbf{0.05} (0.07) \\
		house & 0.4 (0.14) & \textbf{0.37} (0.12) & 0.4 (0.13) \\
		spacega & 0.4 (0.02) & \textbf{0.36} (0.01) & - \\
		\midrule
		\multicolumn{4}{c}{focused outliers} \\
		\midrule
		bow & \textbf{0.05} (0.0) & \textbf{0.05} (0.0) & \textbf{0.05} (0.0) \\
		F100 & 0.26 (0.06) & 0.3 (0.14) & \textbf{0.25} (0.05) \\
		F400 & 0.25 (0.01) & 0.25 (0.01) & \textbf{0.24} (0.03) \\
		body & \textbf{0.07} (0.08) & 0.1 (0.09) & 0.08 (0.08) \\
		house & 0.4 (0.07) & \textbf{0.34} (0.06) & 0.39 (0.09) \\
		spacega & \textbf{0.41} (0.06) & 0.43 (0.04) & - \\
		\midrule
		\multicolumn{4}{c}{asymmetric outliers} \\
		\midrule
		bow & \textbf{0.06} (0.0) & \textbf{0.06} (0.0) & \textbf{0.06} (0.0) \\
		F100 & \textbf{0.26} (0.05) & 0.33 (0.09) & 0.3 (0.12) \\
		F400 & 0.25 (0.02) & \textbf{0.24} (0.04) & \textbf{0.24} (0.03) \\
		body & \textbf{0.12} (0.11) & 0.15 (0.11) & \textbf{0.12} (0.12) \\
		house & 0.35 (0.13) & \textbf{0.33} (0.09) & 0.34 (0.12) \\
		spacega & 0.4 (0.02) & \textbf{0.37} (0.02) & - \\
		\bottomrule % from booktabs package
	\end{tabular}
\end{table}

\bibliography{../../../all_papers_bibliography_extended}

\end{document}
