% \documentclass{uai2023} % for initial submission
\documentclass[accepted]{uai2023} % after acceptance, for a revised
                                    % version; also before submission to
                                    % see how the non-anonymous paper
                                    % would look like
%% There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2023} % ptmx math instead of Computer
                                         % Modern (has noticable issues)
% \documentclass[mathfont=newtx]{uai2023} % newtx fonts (improves upon
                                          % ptmx; less tested, no support)
% NOTE: Only keep *one* line above as appropriate, as it will be replaced
%       automatically for papers to be published. Do not make any other
%       change above this note for an accepted version.

%% Choose your variant of English; be consistent
\usepackage[american]{babel}
% \usepackage[british]{babel}

%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} % amsmath with fixes and additions
% \usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{tikz} % nice language for creating drawings and diagrams



\usepackage{graphicx}
% \usepackage[ruled,noline,linesnumbered]{algorithm2e}
% \usepackage{algorithmic}

\usepackage{amsthm}
\usepackage{amsmath, amssymb}
\usepackage{bbold}

\usepackage{url}

\newtheorem{definition}{Definition}
\newtheorem{corollary}{Corollary}
\newtheorem{proposition}{Proposition}
\newtheorem{lemma}{Lemma}
\newtheorem{theorem}{Theorem}
\newtheorem{remark}{Remark}


\newcommand{\floor}[1]{\left\lfloor #1 \right\rfloor}
\newcommand{\ceil}[1]{\left\lceil #1 \right\rceil}
\newcommand*{\comb}[2]{{}^{#1}C_{#2}}%


\DeclareMathOperator*{\minimize}{minimize}
\DeclareMathOperator*{\maximize}{maximize}
\DeclareMathOperator*{\argmin}{arg\,min}
\DeclareMathOperator*{\argmax}{arg\,max}
\DeclareMathOperator{\E}{\mathbb{E}}
\DeclareMathOperator{\V}{\textup{Var}}
\DeclareMathOperator{\C}{\textup{Cov}}
\DeclareMathOperator{\MSE}{\textup{MSE}}

\DeclareMathOperator{\rank}{\textup{rank}}
\newcommand{\tr}{\textup{tr}}

\usepackage[ruled,noline,linesnumbered]{algorithm2e}
\SetKwInput{KwInput}{Input}
\SetKwInput{KwOutput}{Output}
\SetKw{KwIn}{in}
\SetEndCharOfAlgoLine{}

\usepackage{xr} 
\externaldocument{uai2023-supplement_robust_GP}

%\SetKwInput{KwInput}{Input}
%\SetKwInput{KwOutput}{Output}

%% Provided macros
% \smaller: Because the class footnote size is essentially LaTeX's \small,
%           redefining \footnotesize, we provide the original \footnotesize
%           using this macro.
%           (Use only sparingly, e.g., in drawings, as it is quite small.)

%% Self-defined macros
\newcommand{\swap}[3][-]{#3#1#2} % just an example

\title{Robust Gaussian Process Regression with the Trimmed Marginal Likelihood}

% The standard author block has changed for UAI 2023 to provide
% more space for long author lists and allow for complex affiliations
%
% All author information is authomatically removed by the class for the
% anonymous submission version of your paper, so you can already add your
% information below.
%
% Add authors
\author[1]{\href{mailto:<andrade@hiroshima-u.ac.jp>?Subject=Your UAI 2023 paper}{Daniel Andrade}{}}
\author[2,3]{Akiko Takeda}

% Add affiliations after the authors
\affil[1]{%
	Education and Research Center for Artificial Intelligence and Data Innovation\\
	Hiroshima University\\
   Hiroshima, Japan
}
\affil[2]{%
    Department of Mathematical Informatics\\ 
    The University of Tokyo\\
    Tokyo, Japan
}
\affil[3]{%
   Center for Advanced Intelligence Project\\
   RIKEN\\
   Tokyo, Japan
  }
  
  \begin{document}
\maketitle

\begin{abstract}
	Accurate outlier detection is not only a necessary preprocessing step, but can itself give important insights into the data. However, especially for non-linear regression, the detection of outliers is non-trivial and can in fact be ambiguous.
	We propose a new method that identifies outliers by finding a subset of data points $T$ such that the marginal likelihood of all remaining data points $S$ is maximized. Though the idea is more general, it is particularly appealing for Gaussian process regression, where the marginal likelihood has an analytic solution.
	While maximizing the marginal likelihood for hyper-parameter optimization is a well established non-convex optimization problem, optimizing the set of data points $S$ is not.
	Indeed, even a greedy approximation is computationally challenging due to the high cost of evaluating the marginal likelihood. As a remedy, we propose an efficient projected gradient descent method with provable convergence guarantees. Moreover, we also establish the breakdown point when jointly optimizing hyper-parameters and $S$.
	For various datasets and types of outliers, our experiments demonstrate that the proposed method can improve outlier detection and robustness when compared with several popular alternatives like the student-t likelihood. 
\end{abstract}

\section{Introduction}\label{sec:intro}

Many real world data sets contain outliers, i.e. data points that are not representative of the majority of samples. For example, the output of a broken sensor might lead to an outlier observation. 
It is well known that estimating the parameters of a statistical model from data that contain outliers can often lead to arbitrarily bad estimates, and therefore various robust learning techniques have been proposed \citep{rousseeuw2005robust,basu1998robust,fujisawa2008robust}.

Once the model has been robustly trained, we can detect outliers by ranking them according to the absolute value of the residuals, or remove some of the outliers in order to improve predictive performance.
% using the framework proposed in \citep{rice2006simple}.
However, the success hinges on choosing the correct hyper-parameters for the robust training procedure.

%\begin{enumerate}
%	\item Based on the learned model, determine the p-values of all training data samples.
%	\item Given a desired false discovery rate (FDR) $\alpha$, use the Benjamini–Hochberg procedure  \citep{benjamini1995controlling} to classify the outliers.
%\end{enumerate}
%
%The framework is appealing, since $\alpha$ becomes a meaningful threshold for distinguishing between outliers and inliers. 
%In particular, our goal is to guarantee that, in expectation, the actual FDR is upper bounded by the nominal FDR $\alpha$ which is set by the user.
%
%However, the success of this approach hinges on basically the following two issues:
%\begin{itemize}
%	\item Choosing the correct hyper-parameters for the robust training procedure. % , which can typically not be learned using empirical Bayes.
%	\item Ensuring that the dispersion parameter is not under-estimated (for the goal of controlling FDR).
%\end{itemize}

Here in this work, we address the issue by proposing the use of the trimmed marginal likelihood.
Let $M$ be some probabilistic model, and denote by $p(\mathbf{y}_S | M)$ the marginal likelihood of the data samples indexed by $S$.
Let $\Omega = \{1, 2, \ldots, n\}$ denote the index set of all training samples.
Given some trimming factor $\nu$, we propose to find the set $T$, such that $p(\mathbf{y}_S | M)$ is maximized, with $S = \Omega \setminus T$,
and subject to $|T| = \floor{\nu  n}$.

% Conceptually, the proposed trimmed marginal likelihood is appealing since all parameters are integrated out, leaving only the hyper-parameter $\nu$.
The trimmed marginal likelihood is particularly attractive for Gaussian process (GP) regression where the marginal likelihood has an analytic solution.
In particular, we focus here on the non-parametric regression model: % with Gaussian noise:
\begin{align*}
	y = f(\mathbf{x}) + \epsilon \, , 
\end{align*}
where $y$ and $\mathbf{x}$ are the response and covariates, respectively; $f$ is sampled from a GP, and $\epsilon$ is some random noise, for example, 
$\epsilon \sim N(0, \sigma^2)$. 

For GP regression, $\nu$ can be easily specified, since, as we prove in Section \ref{sec:theoreticQuarantee}, $\nu$ corresponds to the breakdown point of our proposed method.
%, even when jointly optimizing the hyper-parameters of the GP and set $S$.  
% $\nu$ can be interpreted as the upper bound on the ratio of outliers that we expect to be contained in the data. 
In cases where knowledge about the upper bound on the ratio of outliers is not available, we propose an iterative procedure for estimating $\nu$ (see Section \ref{sec:estimating_nu}).

However, the optimization over the set of data points $S$ is NP-hard, and even a greedy approximation is computationally challenging.
As a remedy, we propose an efficient projected gradient descent method with provable convergence guarantees (see Section \ref{sec:PGD}). 

Our experiments on various datasets and types of outliers demonstrate that the proposed method improves outlier detection and robustness when compared to several popular alternatives. Building on GPyTorch \citep{gardner2018gpytorch}, we also provide a computationally efficient implementation of our proposed method: \url{https://github.com/andrade-stats/TrimmedMarginalLikelihoodGP}

% Since our proposed method  like the student-t likelihood
 
% that also improves runtime over.
%Notably, we observe improvements, even when $\nu$ is assumed to be unknown (and therefore estimated with Algorithm in Section \re])
%In constrast to robust Gaussian process regression with the student-t likelihood, it is standard practise to optimizize the 
%Compare this to the student-$t$ likelihood, where imposing a meaning full prior on the degrees of freedom is non-trivial.
%Therefore, for robust Gaussian process regression with the student-t likelihood, it is standard practise to optimizize the 

%In the next section, we review previous related works, followed by Section \ref{sec:proposedMethod} which explains in detail the proposed method. The proposed method, which we call $\nu$-trimmed marginal likelihood GP, involves a joint optimization problem over covariance function hyper-parameters and the set $S$. In Section \ref{sec:optimization}, we propose a new projected gradient method for efficiently selecting the set $S$. In Section \ref{sec:selectionOfNu}, we show how it is possible to get an estimate on the outlier ratio $\nu$. 
%Finally, in Section \ref{sec:experiments}, we show on several synthetic and real data sets that the proposed method can lead to a better outlier detection accuracy than previous methods, while also providing a useful conservative estimate of the outlier ratio $\nu$.
%We summarize our findings in Section \ref{sec:conclusions}.

\section{Related Work}

Using the marginal likelihood for outlier detection has been proposed in \cite{Shotwell2011}. However, in contrast to their work, we use the \emph{trimmed} marginal likelihood, which has the advantage that we do not require any probabilistic model for the outliers.

Our proposed method is related to the trimmed likelihood approach for linear regression \citep{rousseeuw2005robust,Rousseeuw2006} (also known as trimmed least squares).
Extending the trimmed likelihood approach beyond linear regression was explored in \cite{muller2003breakdown}, though they did not consider non-parametric models.

It is well known that the  trimmed least squares method tends to underestimate the true variance, and therefore asymptotic correction factors \citep{rousseeuw2005robust}  and correction factors based on simulations \citep{pison2002small} were previously proposed. 
%However, our proposed finite-sample bias correction method does not need any estimate of the true noise variance $\sigma^2_0$.
%Furthermore, for non-parametric regression, extensive simulations as in \citep{pison2002small} are not feasible.

Another general approach for robust parameter estimation is to replace the Kullback-Leibler divergence, underlying the maximum likelihood estimate, by the $\beta$- or $\gamma$-divergence \citep{basu1998robust,fujisawa2008robust}. This approach has also been extended to Bayesian inference in general \citep{nakagawa2020robust,futami2018variational}, and Gaussian processes \citep{knoblauch2019generalized} in particular.
However, how to specify the hyper-parameters of these methods is less clear \citep{nakagawa2020robust}. %, and often default values 
%  due to interpretation of  , and often done using cross-validation \cite{futami2018variational}.

The most popular method for robust GP regression is to replace the Gaussian likelihood function by a student-t distribution \citep{jylanki2011robust}.
However, the student-t distribution assumes that outliers are symmetric, i.e. an approximately equal number of unusually large and unusually small values.
Furthermore, when combined with a GP prior, the marginal likelihood is not analytically tractable anymore.

Recently, several other methods for robust GP regression have also been proposed, which
can roughly be categorized into likelihood robustification methods and residual-based methods. 

\textbf{Likelihood Robustification Methods} The methods in \citep{daemi2019identification,daemi2019gaussian}
propose to use a mixture of two normal distributions for noise: one for modeling inliers and one for modeling outliers. % and also assume symmetric outliers.
\citep{lindfors2020robust} proposes to use a G-confluent distribution which generalizes the $t$-distribution,
but still assumes symmetric outliers.
In contrast, the works in \citep{alodat2020gaussian} and \citep{benavoli2021unified} propose to use the skew-normal distribution
instead of the normal likelihood. However, all of the above methods make a particular assumption on the type of noise/outliers through the choice of the likelihood function.

\textbf{Residual-based Methods}
The method in \citep{li2021robust} proposes to first train an ordinary GP regression model and then remove the data points with the largest residuals. Afterwards the GP regression model is trained again on the smaller set of data points, and the procedure of removing and retraining is repeated for a pre-defined number of steps.
However, it is not difficult to see that their proposed method has a breakdown point of 1, meaning that a single data point can have an arbitrarily large impact on the posterior distribution: consider one outlier with $y_{i_*} \rightarrow \infty$, then the residual of the outlier $i_*$ will always be smaller than the residuals of all other data points, which will lead to $i_*$ never being removed. \\
%
Similarly, \citep{ramirez2021robust} proposes to assign weights to each observation, based on the distance of the response to other neighboring data points.
However, the method is sensitive to the choice of the neighborhood. \\
%
The method in \citep{park2021robust} introduces a bias vector $\boldsymbol{\delta} \in \mathbb{R}^n$, where $n$ is the number of samples.
If and only if $\delta_i \neq 0$, then sample $i$ is considered an outlier. They propose to learn $\boldsymbol{\delta}$ using the $\ell_1$-penalty. 
However, it can be shown that if there is even only one outlier with $y_{i_*} \rightarrow \infty$, then $\forall i: \delta_i \neq 0$, meaning all samples are considered as outliers (see supplementary material for details).

\section{Proposed Method} \label{sec:proposedMethod}

Let $\Omega := \{1, ..., n\}$ denote the indices of all observations.
Let $M$ denote some probabilistic model (likelihood + prior), and $ \log p(\mathbf{y}_S | M) $ the log-marginal likelihood of a given subset $S \subseteq \Omega$ of observations.
For detecting a set of outliers $T \subseteq \Omega$, with $|T| = \floor{\nu n}$, we propose to use the $\nu$-trimmed marginal likelihood given as follows
% approach selects set of inliers $S$ using
\begin{align*}
	\hat{S}  := \argmax_{S \subseteq \Omega} \log p(\mathbf{y}_S | M)  , \;  \text{subject to $|S| = \ceil{(1 - \nu) n}$}  \, ,
	%	| X, \eta, \mathbf{l}, \sigma^2) = - \frac{1}{2} \mathbf{y}^T (K_{\eta, \mathbf{l}} + \sigma^2 I)^{-1} \mathbf{y}  - \frac{1}{2} \log | K_{\eta, \mathbf{l}} + \sigma^2 I | - \frac{n}{2} \log 2 \pi \, .
\end{align*}
where $\hat{T} := \Omega \setminus \hat{S}$ is the set of potential outliers. % \footnote{}
This is a natural way to define the set of outliers and  inliers, since the set $\hat{S}$ contains the samples that are best explained given model $M$.

In particular, for our model, we assume a zero-mean GP prior with covariance function $k$, and a Gaussian likelihood, that is
\begin{align*}
	f &\sim GP(0, k) \, , \\
	y &\sim N(f(\mathbf{x}), \sigma^2) \, .
\end{align*}

For our analysis and experiments we consider the scaled squared exponential covariance function, i.e.
\begin{align} \label{eq:employedCovarianceFunction}
	k_{\eta, \mathbf{l}}(\mathbf{x}_{i_1}, \mathbf{x}_{i_2} )  = \eta e^{- \sum_{j = 1}^d \frac{1}{2l_j}  (\mathbf{x}_{i_1}(j) -  \mathbf{x}_{i_2}(j))^2} \, ,
\end{align}
where $\eta$ is the variance of the signal, and $l_j$ are the length-scale parameters which control the change in correlation when the data points differ in dimension $j$. We assume that $\mathbf{l} = (l_1, \ldots, l_d) \in \mathbb{D}$, where $\mathbb{D}$ is a compact subset of $\mathbb{R}_+^d$.\footnote{We denote by $\mathbb{R}_+$ the set of positive reals which excludes 0 and $\infty$.}
% Note that we require the scope of $\mathbf{l}$ to be compact set excluding }
Furthermore, we assume $\sigma^2 \in \mathbb{R}_+$ and $\eta \in \mathbb{R}_+$.

Let $K_{\eta, \mathbf{l}} \in \mathbb{R}^{n \times n}$ denote the covariance matrix of all training data points, when using the covariance function from Equation \eqref{eq:employedCovarianceFunction}. We assume that $K_{\eta, \mathbf{l}}$ is a positive definite matrix for all  $\mathbf{l} \in \mathbb{D}$.

The log marginal likelihood $\log p(\mathbf{y} | X, \eta, \mathbf{l}, \sigma^2)$ is therefore given by
\footnotesize
\begin{align*} 
- \frac{1}{2} \mathbf{y}^T (K_{\eta, \mathbf{l}} + \sigma^2 I)^{-1} \mathbf{y}  - \frac{1}{2} \log | K_{\eta, \mathbf{l}} + \sigma^2 I | - \frac{n}{2} \log 2 \pi \, .
\end{align*}
\normalsize
%
Since a fully Bayesian approach, i.e. integrating out the hyper-parameters $\boldsymbol{\theta} := (\eta, \mathbf{l}, \sigma^2)$, is computationally too expensive, we use empirical Bayes. For $S \subseteq \Omega$, let $(\mathbf{y}_S, X_S)$ denote the corresponding subset of the data.
%, with 
% sample $i$ belonging to the subset iff $\gamma_i = 1$.
% We define the $v$-trimmed empirical Bayes (EB) marginal log-likelihood as 
We define the $\nu$-trimmed marginal likelihood GP by
\begin{align} \label{eq:trimmedGP_definition}
\maximize_{S, \boldsymbol{\theta}}   \;  \log p(\mathbf{y}_S | X_S, \boldsymbol{\theta}) \;  ,  \text{subject to $|S| = \ceil{(1 - \nu) n}$}  \, .
\end{align}
%The maximization is taken over the scope of all hyper-parameters ($\eta, \mathbf{l}, \sigma^2$), and $S$.
%In the following, we will always assume that optimizations with respect to $\boldsymbol{\gamma}$ are subject to $\sum_{i=1}^n \gamma_i = n - v$, and do not state the condition explicitly anymore in order to simplify exposition.

\subsection{Asymptotically Correct Outlier Rejection} \label{sec:theoreticQuarantee} % Outlier Prone Model (Breakdown Point)}

Similar in spirit to the definition of an outlier-prone model \citep{o1979outlier}, we define an outlier rejection method as asymptotically correct, if the set of observations with $y_i \rightarrow \infty$, or $y_i \rightarrow - \infty$ are detected as outliers.\footnote{The original definition of outlier-prone is only applicable to parametric models.} The following proposition ensures asymptotic correctness.

\begin{proposition} Assume the covariance function from Equation \eqref{eq:employedCovarianceFunction}. 
Let $V$ denote the true set of outliers, with
$y_i \rightarrow \infty$, or $y_i \rightarrow -\infty$, for $i \in V$.
Let $U$ denote the true set of inliers, with $y_i$ being bounded, for $i \in U$.
Then, eventually (i.e. for $i \in V$, $|y_i|$ being large enough), we have 
\begin{align*}
S \subseteq U \, ,
\end{align*}
where $S$ is the set of observations selected by the $\nu$-trimmed marginal likelihood GP, with $\floor{\nu n} \geq |V|$.   % m = \nu n
\label{prop:Asymptotically_Correct_Outlier_Rejection}
\end{proposition} 

We defer the proof to the supplementary material. Note that the proof would be trivial if the hyper-parameters $\boldsymbol{\theta}$ were fixed.
However, since $S$ and $\boldsymbol{\theta}$ are jointly optimized, a careful, non-trivial proof is required.

Also note that Proposition \ref{prop:Asymptotically_Correct_Outlier_Rejection} expresses that the $\nu$-trimmed marginal likelihood GP has a breakdown point of $\nu$, in the sense that $\nu$ is the minimal ratio of data points that need to be contaminated in order to lead to an arbitrarily bad posterior.\footnote{For a more formal definition of the classical concept of breakdown point see \citep{rousseeuw2005robust}, which should be read by replacing ``parameters'' with ``hyper-parameters''.}
% , before an outlier is included in $S$, and therefore can have arbitriy effect on the learned hyper-parameters $\hat{\boldsymbol{\theta})}$.


\section{Optimization} \label{sec:optimization}

Though conceptually easy, the $\nu$-trimmed marginal likelihood GP,
as defined in Equation \eqref{eq:trimmedGP_definition}, is a computationally difficult optimization problem.
Even if the hyper-parameters $\boldsymbol{\theta}$ were fixed, the remaining discrete optimization problem over $S \subseteq \Omega$ is still NP-hard. 

In the following let $m :=  \ceil{(1 - \nu) n}$.
After initializing all hyper-parameters $\boldsymbol{\theta}$, we iterate between the optimization of $\boldsymbol{\theta}$ and $S$, as follows:
%\footnote{We use the softplus transformation. This means all parameter defined on the positive real line are initialized to $\log 2$ after transformation. Note that  all parameters are initialized 
	\begin{enumerate}
		\item For fixed $\boldsymbol{\theta}$, find the set $S$ that approximately maximizes the marginal likelihood, subject to the constraint $|S| = m$.
		\item For fixed $S$, optimize $\boldsymbol{\theta}$ using one gradient descent step.
	\end{enumerate}	
	We repeat Step 1 and Step 2 till the marginal likelihood is not improved anymore.
	Step 2 is equal to the typical hyper-parameter optimization for GPs. % which is a non-convex optimization problem. % For setting the learning rate in Step 2, we use Adam \citep{kingma2015adam}.
	
%	\begin{align*}   \tag{P1} \label{pFinal}
%		\minimize_{S \subseteq  \{1, 2, \ldots, n\}}  \quad \mathbf{y}_S^T [ (K + \sigma^2 I)^{-1}]_S \mathbf{y}_S  \, , \\
%		\text{subject to} \quad |S| = m \,.
%	\end{align*}
	
	The complete algorithm is shown in Algorithm \ref{alg:fullOptimization}.
	When the step size $\xi^{(t)}$ is set small enough to ensure that $\ell^{(t)}$ decreases, Algorithm \ref{alg:fullOptimization} is guaranteed to converge.
	In our implementation, we set step size $\xi^{(t)}$ and search direction $\Delta \boldsymbol{\theta}^{(t)}$ using Adam \citep{kingma2015adam} as Optimizer $\mathcal{O}$.

	% CHECKED 
	\begin{algorithm}
		\caption{Trimmed-GP (Joint Optimization)}\label{alg:fullOptimization} % (Joint Optimization of $S$ and $\boldsymbol{\theta}$)}\label{alg:fullOptimization}
		\KwInput{$X, \mathbf{y}, \nu$}
		\KwOutput{set of inliers $S^{(t)}$, hyperparameters $\boldsymbol{\theta}^{(t)}$}
		$m := \lceil(1 - \nu) n \rceil$  \\ % \tcp*[f]{$n$ is the number of data samples in $X$} \\
		$t  := 1 ;  \;  \ell^{(t)} := \infty$  \\
		initialize $\boldsymbol{\theta}^{(t)}$. \\
		initialize optimizer $\mathcal{O}$ with global learning rate $\xi_0$. \\
		\Repeat{$\ell^{(t-1)}  < \ell^{(t)}$} { % \tcp*[f]{convergence criterion}}{ 
			%\tcp*[f]{Step 1: maximize $\log p(\mathbf{y}_S | X_S, \boldsymbol{\theta}^{(t)})$ with respect to $S$  } \\
			% \tcp*[f]{Step 1: maximize $\log p(\mathbf{y}_S | X_S, \boldsymbol{\theta}^{(t)})$ with respect to $S$  } \\
			  \tcp{Step 1: Optimize S  with PGD or Greedy}
			 % \tcp*[f]{Step 1: Optimize S  with PGD/Greedy}  \\ % $\quad \quad \quad \quad$ (solve optimiation approximately)} \\
			% calculate new covariance matrix $K^{(t)}$ using $\boldsymbol{\theta}^{(t)}$. \\
			$\displaystyle S' := \argmax_{S \subseteq \Omega, \, |S| = m}  \;  \log p(\mathbf{y}_S | X_S, \boldsymbol{\theta}^{(t)})$ \\ % \;  ,  \text{subject to $|S| = m$.}$ \\
			% $S' := \argmin_{S}  \;  \mathbf{y}_S^T [ (K^{(t)} + \sigma^2 I)^{-1}]_S \mathbf{y}_S  \; ,   \text{subject to $|S| = m$.}$ \\
			\eIf{\scalebox{0.92}{$\log p(\mathbf{y}_{S'} | X_{S'}, \boldsymbol{\theta}^{(t)}) > \log p(\mathbf{y}_{S^{(t)}} | X_{S^{(t)}}, \boldsymbol{\theta}^{(t)})$}}{
				$S^{(t+1)} := S'$   \\ 
				reset history of optimizer $\mathcal{O}$. 
			}{
				$S^{(t+1)} := S^{(t)}$  % \tcp*[f]{leave set of inliers unchanged}
			}
			%\tcp*[f]{Step 2: increase $\log p(\mathbf{y}_{S^{(t+1)}} | X_{S^{(t+1)}}, \boldsymbol{\theta})$ by updating $\boldsymbol{\theta}$} \\
		    % \tcp{Step 2: Optimize  $\boldsymbol{\theta}$ with $\mathcal{O}$}  
		    \tcp{Step 2: increase $\log p(\mathbf{y}_{S^{(t+1)}} | X_{S^{(t+1)}}, \boldsymbol{\theta})$ by updating $\boldsymbol{\theta}$}
			find step size $\xi^{(t)}$ and direction $\Delta \boldsymbol{\theta}^{(t)}$ with $\mathcal{O}$. \\
			$\boldsymbol{\theta}^{(t+1)} := \boldsymbol{\theta}^{(t)} + \xi^{(t)} \Delta \boldsymbol{\theta}^{(t)} $  \\
			$\ell^{(t+1)} := - \frac{1}{m} \log p(\mathbf{y}_{S^{(t+1)}} | X_{S^{(t+1)}}, \boldsymbol{\theta}^{(t+1)})$ \\
			$t  := t + 1$ \\
		}  
\end{algorithm}

The optimization problem in Step 1 can be expressed as follows. Find the set of samples $S \subseteq \{1, 2, \ldots, n\}$, with $|S| =  m$, that maximize the marginal likelihood
\begin{align} \label{eq:marginalLikelihood_forS_optimization}
- \frac{1}{2} \mathbf{y}_S^T (K_S + \sigma^2 I)^{-1} \mathbf{y}_S  - \frac{1}{2} \log | K_S + \sigma^2 I | - \frac{m}{2} \log 2 \pi \, ,
\end{align}
%
where $K_S \in \mathbb{R}^{m \times m}$ is a sub-matrix of the positive-definite matrix $K \in \mathbb{R}^{n \times n}$,
such that $K_S$ contains the rows and columns of $K$ indexed by $S$.
Step 1 is challenging, since even a greedy search algorithm is computationally expensive due to the need for the repeated evaluation of the marginal likelihood.
%  which leads to a total complexity of $O(m^4)$ for one greedy update.\footnote{Using the block matrix inversion lemma, this can be reduced to $O(m^3)$, which is still too slow.}

\subsection{Projected Gradient Descent (PGD)} \label{sec:PGD}

For finding a computationally feasible solution to Step 1, we proceed as follows.
Assuming that the outliers are in the responses $ \mathbf{y}$, and not in the covariates, we can ignore the term $ \log | K_S + \sigma^2 I |$ in Equation \eqref{eq:marginalLikelihood_forS_optimization}.
%\footnote{This might sound like a strong assumption, but note that }
This reduces the problem to the maximization of 
% To facilitate the optimization, we propose to maximize the following lower bound:
\begin{align} \label{eq:firstSimplification}
- \frac{1}{2} \mathbf{y}_S^T (K_S + \sigma^2 I)^{-1} \mathbf{y}_S  \, ,
\end{align}
subject to the constraint that $|S| = m$.

Since we assume that there are no outliers in the covariates, we can re-express this as
%
%
%Further, assuming that 
%\begin{align}  \label{eq:approximationWhenIsItValid}
%(K_S + \sigma^2 I)^{-1} \approx  [ (K + \sigma^2 I)^{-1}]_S \, ,
%\end{align}
%we arrive at the following optimization problem
%\begin{align*}   \tag{P1} \label{pFinal}
%\minimize_{S \subseteq  \{1, 2, \ldots, n\}}  \quad \mathbf{y}_S^T [ (K + \sigma^2 I)^{-1}]_S \mathbf{y}_S  \, , \\
%\text{subject to} \quad |S| = m \,.
%\end{align*}
%The resulting problem is still NP-hard  (see e.g. \citep{Chen2019}), but can be solved approximately using the following projected gradient method.
% We can express \eqref{pFinal} as follows
%
\begin{align*}   \tag{P1} \label{pPGM}
	\minimize_{\mathbf{b}}  \quad f(\mathbf{b})   \, ,
	\quad \text{subject to} \quad \| \mathbf{b} \|_0 = n - m \,,
\end{align*}
where we defined $f(\mathbf{b}) := (\mathbf{y} + \mathbf{b})^T (K + \sigma^2 I)^{-1} (\mathbf{y} + \mathbf{b})$, 
and $ \| \cdot \|_0$ counts the number of non-zero entries.

The auxiliary variables $(b_1, \ldots, b_n) = \mathbf{b}^T$ can be interpreted as corrections to the original responses $\mathbf{y}$ such that Equation \eqref{eq:firstSimplification} is maximized.
In particular, if $b_i = 0$, then this means that no correction for sample $i$ is needed, suggesting that $y_i$ is no outlier. Therefore, the constraint $|| \mathbf{b} ||_0 = n - m$ says that we assume that there are $m$ inliers, which corresponds to the constraint $|S| = m$. 

Problem~\eqref{pPGM} can be solved (approximately) with the following projected gradient descent algorithm.
Denote by $c$ a Lipschitz constant of $\nabla f(\mathbf{b})$, i.e. 
\begin{align*}  
	\forall \mathbf{b}_1, \mathbf{b}_2:   \| \nabla f(\mathbf{b}_1) -  \nabla f(\mathbf{b}_2) \|_2  \leq c \| \mathbf{b}_1 - \mathbf{b}_2 \|_2 \, .
\end{align*}
Here, the smallest Lipschitz constant of $\nabla f$ is given by
\begin{align*}  
	\max_{\mathbf{x}, \|\mathbf{x}\|_2 = 1}	\| 2 (K + \sigma^2 I)^{-1} \mathbf{x} \|_2 = 2 \frac{1} {\lambda_{\min}(K + \sigma^2 I)} \, .
\end{align*}

A local minimum can then be found by iterating
\begin{align*}  
	\mathbf{b}_{k+1} = \text{proj}_C \Big[ \mathbf{b}_k - \frac{1}{c} \nabla f (\mathbf{b}_k) \Big] \, ,
\end{align*}
where 
\begin{align*}  
	\nabla f(\mathbf{b}) = 2  (K + \sigma^2 I)^{-1} (\mathbf{y} + \mathbf{b}) \, , 
\end{align*}
and $ \text{proj}_C$ denotes the projection onto the set $C := \{ \mathbf{x} \in \mathbb{R}^n \; |  \;  \| \mathbf{x} \|_0 \leq n - m\}$, which is given by
\begin{align*}  
	\text{proj}_C \big[ \mathbf{b} \big] = \argmin_{\mathbf{x}, \| \mathbf{x} \|_0 \leq n - m} \| \mathbf{b} - \mathbf{x} \|_2^2 \, .
\end{align*}

Note that, though the constraint $|| \mathbf{b} ||_0 = n - m$ is not convex, we can prove that the proposed projected gradient algorithm is guaranteed to converge to a stationary point:
\begin{theorem} \label{thm:pgdConvergence}
	Any sequence $\{\mathbf{b}_k\}$ generated by the projected gradient descent algorithm for Problem~\eqref{pPGM} globally converges to a stationary point with locally linear convergence rate.
\end{theorem}
The proof is in the supplementary material.
% \citep{Attouch_etal13,KLfunc_Li2018}.\footnote{Details on convergence are described in the supplement material.}
We note that since $(K + \sigma^2 I)^{-1}$ is fixed, each iteration involves only one matrix-vector multiplication which is in $O(n^2)$ and can be efficiently computed with GPUs.

\subsection{Greedy Methods} \label{sec:Greedy}

Recall that our goal is to maximize the marginal likelihood, Equation \eqref{eq:marginalLikelihood_forS_optimization}. However, the projected gradient descent method described in the previous section optimizes the simplified objective in Problem~\eqref{pPGM}. Therefore, we also compare to a greedy method that directly optimizes Equation \eqref{eq:marginalLikelihood_forS_optimization}.

%The optimization in Line 8 of Algorithm \ref{alg:fullOptimization} 
%The projected Gradient Descent method described before needs to 

The greedy method starts with the index set of all data points $S := \{1, 2, \ldots, n\}$, and then removes 
the data point $i_*$ that leads to the largest marginal likelihood, i.e.
\begin{equation} \label{eq:greedy_remove}
	i_* :=  \argmax_{i \in S}  \Big(  \log p(\mathbf{y}_{S \setminus \{i\}} | X_{S \setminus \{i\}}, \boldsymbol{\theta}) \Big) \, .
\end{equation}
This is repeated until $|S| = \ceil{(1 - \nu) n}$. Naively solving the optimization in Equation \eqref{eq:greedy_remove} is in $O(n^4)$, since 
we need to repeat $n$ times the calculation of the determinant and inverse of $K_{S \setminus \{i\}}$, where $K_{S \setminus \{i\}}$ denotes the covariance matrix (plus $\sigma^2 I$) of the data points in  $S \setminus \{i\}$.
However, using the block matrix inversion lemma (together with the Woodbury formula) and the cofactor representation of the determinant, we can solve it in $O(n^3)$ (details in supplement material). 
Since the computation needs to be repeated $\floor{\nu n}$ times, the greedy algorithm can still be too computationally expensive. 
Therefore, we also propose a batched version:  first evaluate the leave-one-out (loo) estimate $\log p(\mathbf{y}_{S \setminus \{i\}} | X_{S \setminus \{i\}}, \boldsymbol{\theta})$ for all $i \in \{1, 2, \ldots, n\}$, and, second, remove at once the $\floor{\nu n}$ samples with the highest loo estimate. We call the original greedy method Greedy (1-by-1), and the batched version Greedy (batch).

\section{Improved $\nu$ estimate}   \label{sec:estimating_nu}

The upper bound $\nu$ on the ratio of the number of outliers might be too conservative, and as a consequence can lead to statistical inefficiency.
Therefore, we propose the following procedure to improve upon the initial upper bound $\nu$:
\begin{enumerate}
	\item Using $k$-fold cross-validation, we estimate the residuals $\mathbf{r}$ of all data points. 
	\item Based on the residuals $\mathbf{r}$, we calculate a robust estimate of the noise variance $\sigma^2$.
	\item We count the number of data points whose residuals $\mathbf{r}$ are within two standard deviations $\sigma$,
	and use this number to get a new estimate for $\nu$. 
\end{enumerate}
In Step 3, if the new estimate is  smaller than the original $\nu$, we repeat the above procedure.
The details of the algorithm are shown in Algorithm \ref{alg:improvingNuEstimate}, where $k$ denotes the number of folds,
and $(train, test)$ denotes the training and test indices of one fold. For our experiments we use $k = 10$, i.e. 10-fold cross-validation.
Furthermore, note that within the cross-validation, we use $\nu_*$ (defined in line 5) instead of $\nu^{(t)}$ due to a possibly uneven split of outliers in $(train, test)$.
In line 7, $\mathbb{E}[\hat{\mathbf{y}}_{test} | X_{test}, X_S, \mathbf{y}_S, \boldsymbol{\theta}]$ denotes the predicted mean response at data points $X_{test}$ using the GP with training data points $(X_S, \mathbf{y}_S)$ and covariance function hyperparameters $\boldsymbol{\theta}$.
Note that in line 9, $r_{(w)}^2$ denotes the $w$-th smallest squared residual, and  $Q_{\chi^2(1)} $ denotes the quantile function for the $\chi^2$ distribution with 1 degree of freedom. The robust variance estimator, in line 9, is a generalization of the estimator proposed in \citep{rousseeuw1984least} and is explained in more detail in the supplement material. 

\begin{algorithm}
	\caption{Improved $\nu$ estimate}\label{alg:improvingNuEstimate}
	\KwInput{$X, \mathbf{y}, \nu$}
	\KwOutput{new upper bound on outlier ratio $\nu^{(t)}$}
	$t  := 1$ \\
	$\nu^{(t)} := \nu$   \tcp*[f]{set to initial estimate of number of outliers} \\
	\Repeat{$\nu^{(t)}  \geq \nu^{(t-1)}$ }{ 
		\For{(train, test) \KwIn $k$-$\text{Fold}(n)$} {
			\tcp{use $\nu_*$ instead of $\nu^{(t)}$ due to possibly uneven split of outliers }
			$\nu_* := \nu^{(t)} / (1 - \frac{1}{k}) $   \\	
			$S, \boldsymbol{\theta}$ = Trimmed-GP($X_{train}, \mathbf{y}_{train}, \nu_*$) \\
			\tcp{residuals at test points} 
			$\mathbf{r}_{test} := \mathbf{y}_{test} - \mathbb{E}[\hat{\mathbf{y}}_{test} | X_{test}, X_S, \mathbf{y}_S, \boldsymbol{\theta}] $ \\
		}
		$\sigma^2 := r_{(\floor{(1 - \nu^{(t)}) n})}^2 / Q_{\chi^2(1)} (1 - \nu^{(t)})$  \\ 
		$\nu^{(t+1)} := \#( | \mathbf{r} |  > 2 \sigma) / n $  \tcp*[f]{count samples not within two std}  \\
		$t  := t + 1$ \\		
	}  
\end{algorithm}

Algorithm \ref{alg:improvingNuEstimate} is inspired by the iterative procedure for least trimmed squares described in the book \citep{rousseeuw2005robust} (pages 132ff). However, the difference is that, since \citet{rousseeuw2005robust} only use a linear model, they ignore possible over-fitting and estimate the residuals without any cross-validation procedure.

\section{Experiments}\label{sec:experiments}

In this section, we evaluate the proposed method and several baselines on the task of correctly identifying outliers and in terms of predictive performance.

\paragraph{Baselines and Implementations}
We compare to a GP with student-$t$ likelihood for estimating $\mathbb{E}[y | \mathbf{x}]$, denoted as $t$-GP. 
Note that the student-$t$ likelihood does not explicitly distinguish between inliers and outliers. Therefore it is essentially a noise model for both the inliers and outliers. 
% In order to get an estimate for the noise variance $\sigma^2$ of the inliers, we use the median of the residuals with an asymptotic correction for getting the correct variance in case of no outliers \citep{rousseeuw1984least}.\footnote{In the supplement material we also show residual plots of the baselines that are normalized by the estimate for $\sigma^2$.}
%\footnote{The residuals for $t$-GP and $\gamma$-GP were calculated using 10-fold cross-validation, since a leave-one-out estimation as in Equation \eqref{eq:residualCalculation} is too computationally expensive. The standard GP and the proposed method learn the hyper-parameters using the full data, and then uses Equation \eqref{eq:residualCalculation} for estimating the residuals.}
We also compare our method to a standard GP trained by minimizing the KL-divergence (GP), and one trained by minimizing the $\gamma$-divergence ($\gamma$-GP). 
% For GP and $\gamma$-GP, we estimate the noise variance $\sigma^2$ using the empirical Bayes estimate.
All hyper-parameters are estimated with empirical Bayes using the complete data set $(X, \mathbf{y})$.

% For all methods we estimate the p-values by assuming that the residuals follow a Gaussian distribution with zero mean and corresponding estimate of $\sigma^2$. 
All methods were implemented using GPyTorch \citep{gardner2018gpytorch}, and the full dataset (no inducing points) was used. For the proposed method ($\nu$-GP) we set $\nu = 0.5$ and use Algorithm \ref{alg:improvingNuEstimate}. 
Note that $0.5$ is also the breakdown point of the student-$t$ distribution.
% For fair comparison, we chose $\nu = 0.5$ since this also corresponds to the breakdown point of the student-$t$ distribution, but note that in practice often a less conservative upper bound is known. %  (which we expect to be benificial for )
In Algorithm \ref{alg:fullOptimization} (line 6) we use the proposed projected gradient descent (PGD) method (if not mentioned otherwise).

We released the source code of the proposed method and all baselines at \url{https://github.com/andrade-stats/TrimmedMarginalLikelihoodGP}.

\paragraph{Synthetic Datasets}
For illustration of the differences between each method, we created a simple one-dimensional bow-shaped dataset, shown in Figure \ref{fig:syntheticSimpleSin_noNoise} with $n = 400$ (bow). %;  true noise variance $\sigma^2_0 = 0.01$.
Furthermore, we use the Friedman data set as in \citep{friedman1991multivariate,naish2007robust} with $d = 10$, and $n = 100$ (F100), and $n = 400$ (F400). %;  true noise variance $\sigma^2_0 = 1.0$.

\begin{figure}
\centering
\includegraphics[width=0.7\linewidth,page=1]{../../all_plots/syntheticSimpleSin_noNoise_onlyData.pdf}
\caption{Blue dots show the samples from the synthetic bow-shaped data ($\sigma^2 = 0.01$). Black line shows true function. } \label{fig:syntheticSimpleSin_noNoise}
\end{figure}

\paragraph{Real Datasets}
We also evaluated all methods on three commonly used regression datasets: bodyfat ($d = 14, n = 252$), housing ($d = 13, n = 506$) and spacega ($d = 6, n = 3107$) that are available from the LIBSVM archive.\footnote{\url{https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/regression.html}}
% cadata ($d = 8, n = 800$)\footnote{For cadata we took only a random subsample of 800.} 

\paragraph{Outlier Types}
A random subset of data points is replaced by outliers of one of the following three types.
\begin{itemize}
\item \textbf{uniform}
The position (=covariates) of the outliers is unchanged, but the response is changed by randomly adding or subtracting a value which is uniformly drawn between 3 and 9 standard deviations of the original response.
\item \textbf{focused}
The position of the outliers is the median of each dimension plus some jitter. The response is set to the original response minus 3 times the standard deviation of the original response plus some jitter.%\footnote{For more than one dimension the original response is the median of the median of each dimension's response.}
\item \textbf{asymmetric}
Same as \emph{uniform}, but the responses, corresponding to the outliers, are changed by either always adding or always subtracting a uniformly drawn positive number.
\end{itemize}
In all cases, we change $10\%$ of the existing data points to outliers. 
For all experiments we report the average over 10 repetitions of randomly adding outliers (with the standard deviation in brackets).
Additional details on data preprocessing and hyper-parameter initialization are provided in the supplement material.

\subsection{Results}
The results for the bow-shaped data are shown in Figure \ref{fig:syntheticSimpleSin_noise}.
First, looking at the results for uniform outliers, we observe that all methods approximately infer the true underlying function, while only the standard GP shows a few deviations. As a consequence, all methods correctly identify all outliers.
However, for the focused outliers, the situation is quite different: $t$-GP and $\gamma$-GP assume that the focused outliers are part of the true function, and the top of the bow are the outliers, while our proposed method $\nu$-GP infers the opposite. Both results are plausible, and show that $\nu$-GP can detect different types of outliers than the popular $t$-GP.
Finally, for the asymmetric outliers all robust GP methods are able to infer the correct function, while only the standard GP is influenced by the outliers.

While we used the bow-shaped data to show the qualitative differences between the GP methods, we next evaluate all methods also quantitatively on the more challenging datasets F100, F400, and the three real datasets (bodyfat, housing, spacega).
We investigate each method's performance in terms of ranking the set of outliers correctly using the residuals.
Since we know the total number of outliers, we use R-precision for evaluation.\footnote{At least for the synthetic data, bow, F100 and F400;  for the real data the true number of outliers is unknown, but assumed to be at least the number of extra added outliers.} 
Let $r$ be the total number of outliers, then R-precision is defined as the number of true outliers within the top-$r$ largest residuals divided by $r$.

The results, summarized in Table~\ref{tab:rprecision_comparison}, show that the proposed method is better at identifying outliers than other robust GP methods. 
As can be seen in Table~\ref{tab:rmse_comparison} this also leads to better prediction performance at test time. 
Notably, for all real datasets we achieve considerable improvements in root mean squared error (RMSE)  when compared to other robust GP methods.
In terms of runtime, our proposed method is slower, but still of the same order of magnitude as other robust methods for the largest dataset (details in supplement material).
% Overall, we also find that our proposed method $\nu$-GP is faster than other robust GP methods (see Table \ref{tab:runtime_GP_methods}).

\begin{figure}
\centering
\includegraphics[width=1.0\linewidth,page=1]{../../all_plots/syntheticSimpleSin_uniform_all.pdf} \\
\vspace{0.5cm}
\includegraphics[width=1.0\linewidth,page=1]{../../all_plots/syntheticSimpleSin_focused_all.pdf} \\
\vspace{0.5cm}
\includegraphics[width=1.0\linewidth,page=1]{../../all_plots/syntheticSimpleSin_asymmetric_all.pdf}
\caption{Shows uniform (top), focused (middle) and asymmetric (bottom) outliers for the synthetic bow-shaped data. Note that here focused outliers (middle) are at around position (0, -2). Red shows the predicted function of each method. Pink and blue dots are the true outliers and inliers, respectively. Black line shows true function. } \label{fig:syntheticSimpleSin_noise}
\end{figure}

\begin{table}
\centering
\caption{Evaluation of all methods in terms of outlier ranking performance (R-precision). $10\%$ of data points are outliers. }\label{tab:rprecision_comparison}
\footnotesize
\begin{tabular}{rllll}
% 
\toprule % from booktabs package
\multicolumn{5}{c}{uniform outliers} \\
\midrule 
& \bfseries  GP &  \bfseries  $\gamma$-GP  & \bfseries  $t$-GP  & \bfseries  $\nu$-GP   \\
\midrule
bow & \textbf{1.0} (0.0) & \textbf{1.0} (0.0) & \textbf{1.0} (0.0) & \textbf{1.0} (0.0) \\
F100 & 0.92 (0.17) & 0.86 (0.22) & \textbf{1.0} (0.0) & \textbf{1.0} (0.0) \\
F400 & \textbf{1.0} (0.0) & 0.97 (0.02) & 0.98 (0.02) & \textbf{1.0} (0.0) \\
body & 0.84 (0.05) & \textbf{0.86} (0.05) & \textbf{0.86} (0.05) & \textbf{0.86} (0.06) \\
house & \textbf{0.85} (0.06) & 0.84 (0.04) & 0.84 (0.04) & \textbf{0.85} (0.06) \\
spacega & \textbf{0.99} (0.0) & 0.87 (0.07) & 0.95 (0.01) & 0.98 (0.0) \\
\midrule % from booktabs package
\multicolumn{5}{c}{focused outliers} \\
\midrule
bow & 0.57 (0.06) & 0.18 (0.1) & 0.2 (0.06) & \textbf{0.97} (0.1) \\
F100 & 0.59 (0.16) & 0.44 (0.21) & 0.39 (0.14) & \textbf{0.72} (0.43) \\
F400 & 0.41 (0.08) & 0.47 (0.17) & 0.64 (0.34) & \textbf{1.0} (0.0) \\
body & 0.54 (0.13) & 0.54 (0.09) & 0.56 (0.18) & \textbf{0.78} (0.24) \\
house & 0.34 (0.26) & 0.46 (0.12) & 0.46 (0.13) & \textbf{0.64} (0.28) \\
spacega & 0.23 (0.02) & 0.18 (0.02) & 0.17 (0.01) & \textbf{0.97} (0.01) \\
\midrule
\multicolumn{5}{c}{asymmetric outliers} \\
\midrule
bow & \textbf{1.0} (0.0) & \textbf{1.0} (0.0) & \textbf{1.0} (0.0) & \textbf{1.0} (0.0) \\
F100 & 0.8 (0.17) & 0.75 (0.21) & \textbf{1.0} (0.0) & \textbf{1.0} (0.0) \\
F400 & 0.96 (0.03) & 0.95 (0.03) & 0.95 (0.02) & \textbf{1.0} (0.0) \\
body & 0.81 (0.07) & \textbf{0.86} (0.05) & \textbf{0.86} (0.05) & \textbf{0.86} (0.06) \\
house & 0.82 (0.05) & 0.85 (0.04) & 0.85 (0.03) & \textbf{0.87} (0.05) \\
spacega & 0.96 (0.01) & 0.85 (0.02) & 0.94 (0.01) & \textbf{0.98} (0.0) \\
\bottomrule % from booktabs package
\end{tabular}
\end{table}

\begin{table}
	\centering
	\caption{Root mean squared error (RMSE) of predictions on test data.}\label{tab:rmse_comparison}
	\footnotesize
	\begin{tabular}{rllll}
		% 
		\toprule % from booktabs package
		\multicolumn{5}{c}{no extra added outliers} \\
		\midrule 
		& \bfseries  GP &  \bfseries  $\gamma$-GP  & \bfseries  $t$-GP  & \bfseries  $\nu$-GP   \\
		\midrule % from booktabs package
		bow & \textbf{0.06} (0.0) & \textbf{0.06} (0.0) & \textbf{0.06} (0.0) & \textbf{0.06} (0.0) \\
		F100 & 0.23 (0.04) & 0.25 (0.05) & \textbf{0.22} (0.04) & 0.31 (0.07) \\
		F400 & \textbf{0.15} (0.01) & 0.61 (0.2) & 0.61 (0.19) & 0.27 (0.01) \\
		body & 0.11 (0.09) & 0.22 (0.11) & 0.56 (0.23) & \textbf{0.06} (0.08) \\
		house & \textbf{0.35} (0.07) & 0.83 (0.39) & 0.99 (0.29) & 0.48 (0.13) \\
		spacega & 0.41 (0.03) & 0.48 (0.04) & 0.49 (0.03) & \textbf{0.39} (0.02) \\
		\midrule
		\multicolumn{5}{c}{uniform outliers} \\
		\midrule 
		bow & 0.12 (0.04) & \textbf{0.06} (0.0) & \textbf{0.06} (0.0) & \textbf{0.06} (0.0) \\
		F100 & 0.66 (0.18) & 0.47 (0.25) & \textbf{0.29} (0.1) & 0.32 (0.06) \\
		F400 & 0.38 (0.05) & 0.64 (0.05) & 0.64 (0.05) & \textbf{0.26} (0.02) \\
		body & 0.27 (0.15) & 0.57 (0.1) & 0.58 (0.08) & \textbf{0.1} (0.06) \\
		house & 0.65 (0.22) & 0.85 (0.15) & 0.86 (0.14) & \textbf{0.38} (0.11) \\
		spacega & \textbf{0.4} (0.02) & 0.68 (0.05) & 0.53 (0.04) & 0.41 (0.02) \\
		\midrule
		\multicolumn{5}{c}{focused outliers} \\
		\midrule
		bow & 0.2 (0.01) & 0.26 (0.03) & 0.27 (0.03) & \textbf{0.07} (0.07) \\
		F100 & 0.44 (0.05) & 0.46 (0.05) & 0.44 (0.05) & \textbf{0.28} (0.05) \\
		F400 & 0.3 (0.04) & 0.4 (0.14) & 0.46 (0.15) & \textbf{0.2} (0.05) \\
		body & 0.41 (0.08) & 0.5 (0.06) & 0.46 (0.08) & \textbf{0.1} (0.09) \\
		house & \textbf{0.34} (0.05) & 0.44 (0.11) & 0.51 (0.12) & 0.37 (0.12) \\
		spacega & 0.44 (0.09) & 0.51 (0.09) & 0.51 (0.09) & \textbf{0.41} (0.06) \\
		\midrule
		\multicolumn{5}{c}{asymmetric outliers} \\
		\midrule
		bow & 0.34 (0.04) & \textbf{0.06} (0.01) & \textbf{0.06} (0.0) & 0.07 (0.01) \\
		F100 & 0.74 (0.13) & 0.61 (0.2) & \textbf{0.23} (0.02) & 0.34 (0.02) \\
		F400 & 0.54 (0.04) & 0.57 (0.14) & 0.63 (0.04) & \textbf{0.26} (0.02) \\
		body & 0.42 (0.05) & 0.57 (0.15) & 0.64 (0.06) & \textbf{0.16} (0.08) \\
		house & 0.65 (0.07) & 0.76 (0.17) & 0.81 (0.15) & \textbf{0.31} (0.08) \\
		spacega & 0.56 (0.02) & 0.73 (0.05) & 0.55 (0.02) & \textbf{0.42} (0.02) \\
		\bottomrule % from booktabs package
	\end{tabular}
\end{table}


\section{Analysis}
Here we investigate several aspects of the proposed $\nu$-GP. 

\subsection{Estimation of $\nu$}
The values of $\nu$ estimated with Algorithm \ref{alg:improvingNuEstimate} were around $2\%$ for the datasets without added outliers, and around $8\%$ for the datasets with added outliers. %(more details in supplement material). %  are in Table \ref{tab:supp_analysis_nu} of the 
That means, the estimated values of $\nu$ were considerably smaller than the initial value of $50\%$, but also slightly lower than the true ratio of outliers (which is $10\%$). This might be because some of the outliers do not conflict with the smoothness properties of the covariance function and thus cannot be distinguished from inliers.
% a correct upper bound on the true number of outliers (at least for the synthetic data, bow, F100 and F400;  for the real data the true number of outliers is unknown).



\subsection{Marginal Likelihood Optimization}
In order to optimize the marginal likelihood in Equation \eqref{eq:trimmedGP_definition}, we proposed Algorithm \ref{alg:fullOptimization} either with a projected gradient descent (PGD) method (Section \ref{sec:PGD}) or a greedy method (Section \ref{sec:Greedy}). % Algorithm \ref{alg:fullOptimization}
Here, we compare the solutions of these different optimization methods with respect to the marginal likelihood,  outlier detection, and prediction on test data.\footnote{Due to the long runtime of the greedy methods, here, we fix $\nu$ to $0.2$ for all methods.} 
For conciseness, we report here the average results over all outlier types; detailed results for each outlier type (no, uniform, focused, asymmetric) can be found in the supplement material. 
As we can see in Table \ref{tab:analysis_marginal_likelihood}, except for spacega, the PGD method often provides better solutions to the combinatorial optimization problem than greedy (batch), but worse than greedy (1-by-1). However, as can be seen in Table \ref{tab:analysis_runtime_optimization_methods}, the runtime of PGD is considerably shorter than that of all greedy methods, and, for the larger dataset spacega, Greedy (1-by-1) was actually infeasible.
Comparing Table \ref{tab:analysis_marginal_likelihood} and \ref{tab:analysis_r_precision}, we see that in most cases better marginal likelihood translates into better outlier detection.
However, the relation between the marginal likelihood and prediction on test data, Table \ref{tab:analysis_rmse}, is slightly mixed---a result that is in line with recent discussions about the optimization of the marginal likelihood for improving test performance \citep{Lotfi2022}.

\begin{table}
\centering
\caption{Average marginal likelihood of solution found by different optimization methods.} \label{tab:analysis_marginal_likelihood}
\footnotesize
\begin{tabular}{rlll}
	% 
	\toprule % from booktabs package
	&  \bfseries  PGD  & \bfseries  Greedy (batch)  & \bfseries  Greedy (1-by-1)   \\
	\midrule 
bow & \textbf{1.73} (0.09) & 1.59 (0.13) & \textbf{1.73} (0.09) \\
F100 & 0.09 (0.15) & -0.09 (0.22) & \textbf{0.12} (0.31) \\
F400 & 0.21 (0.14) & 0.11 (0.22) & \textbf{0.27} (0.19) \\
body & \textbf{0.39} (2.56) & 0.13 (2.33) & 0.37 (2.53) \\
house & -0.71 (1.2) & -0.77 (1.17) & \textbf{-0.67} (1.23) \\
spacega & -0.27 (0.03) & \textbf{0.08} (0.2) & - \\
	\bottomrule % from booktabs package
\end{tabular}
\end{table}

\begin{table}
	\centering
	\caption{Average outlier ranking performance (R-precision) of different optimization methods.} \label{tab:analysis_r_precision}
	\footnotesize
	\begin{tabular}{rlll}
		\toprule % from booktabs package
		&  \bfseries  PGD  & \bfseries  Greedy (batch)  & \bfseries  Greedy (1-by-1)   \\
		\midrule 
bow & \textbf{1.0} (0.0) & \textbf{1.0} (0.0) & \textbf{1.0} (0.0) \\
F100 & \textbf{1.0} (0.0) & \textbf{1.0} (0.02) & \textbf{1.0} (0.0) \\
F400 & \textbf{1.0} (0.0) & \textbf{1.0} (0.0) & \textbf{1.0} (0.0) \\
body & \textbf{0.91} (0.08) & 0.89 (0.09) & 0.9 (0.08) \\
house & \textbf{0.87} (0.11) & 0.75 (0.21) & 0.81 (0.2) \\
spacega & \textbf{0.97} (0.0) & 0.76 (0.36) & - \\
		\bottomrule % from booktabs package
	\end{tabular}
\end{table}

\begin{table}
	\centering
	\caption{Average root mean squared error (RMSE) on test data of different optimization methods.} \label{tab:analysis_rmse}
	\footnotesize
	\begin{tabular}{rlll}
		\toprule % from booktabs package
		&  \bfseries  PGD  & \bfseries  Greedy (batch)  & \bfseries  Greedy (1-by-1)   \\
		\midrule 
bow & \textbf{0.05} (0.01) & 0.06 (0.01) & \textbf{0.05} (0.01) \\
F100 & \textbf{0.29} (0.06) & 0.32 (0.1) & 0.32 (0.14) \\
F400 & 0.25 (0.03) & \textbf{0.24} (0.04) & \textbf{0.24} (0.04) \\
body & \textbf{0.08} (0.1) & 0.09 (0.1) & \textbf{0.08} (0.09) \\
house & 0.42 (0.14) & \textbf{0.38} (0.11) & 0.42 (0.14) \\
spacega & 0.43 (0.05) & \textbf{0.38} (0.04) & - \\
		\bottomrule % from booktabs package
	\end{tabular}
\end{table}

\begin{table}
	\centering
	\caption{Average runtime in minutes of each optimization method.} \label{tab:analysis_runtime_optimization_methods}
	\footnotesize
	\begin{tabular}{rlll}
		\toprule % from booktabs package
		&  \bfseries  PGD  & \bfseries  Greedy (batch)  & \bfseries  Greedy (1-by-1)   \\
		\midrule 
bow & \textbf{0.15} (0.05) & 4.61 (4.9) & 137.21 (55.65) \\
F100 & \textbf{0.13} (0.15) & 3.64 (5.38) & 6.53 (4.64) \\
F400 & \textbf{0.13} (0.04) & 4.64 (6.13) & 99.66 (64.59) \\
body & \textbf{0.72} (0.67) & 3.44 (3.08) & 42.06 (42.27) \\
house & \textbf{0.34} (0.65) & 4.38 (4.76) & 132.08 (138.23) \\
spacega & \textbf{0.71} (0.17) & 12.47 (6.59) & - \\
		\bottomrule % from booktabs package
	\end{tabular}
\end{table}


\subsection{Higher number of outliers}
Finally, we compare the performance of all methods under higher contamination, setting the ratio of outliers to $\{0.2, 0.3, 0.4\}$.
For these experiments, we fixed $\nu$ to $0.5$, meaning that we expect up to 50\% of all data points to be outliers. 
The average outlier detection performance is shown in Table \ref{tab:rprecision_comparison_high_contamination}, suggesting that the proposed $\nu$-GP is also suited for outlier detection with a higher number of outliers.
%
\begin{table}
	\centering
	\caption{Evaluation in terms of outlier ranking performance (R-precision) with different ratios of outliers; average over the outlier types ``uniform'', ``focused'', and ``asymmetric''.}\label{tab:rprecision_comparison_high_contamination}
	\footnotesize
	\begin{tabular}{rllll}
		% 
		\toprule % from booktabs package
		\multicolumn{5}{c}{20\% outliers} \\
		\midrule 
		& \bfseries  GP &  \bfseries  $\gamma$-GP  & \bfseries  $t$-GP  & \bfseries  $\nu$-GP   \\
		\midrule
		bow & 0.87 (0.17) & 0.72 (0.4) & 0.73 (0.38) & \textbf{0.99} (0.02) \\
		F100 & 0.78 (0.12) & 0.66 (0.16) & 0.81 (0.21) & \textbf{0.96} (0.05) \\
		F400 & 0.8 (0.2) & 0.87 (0.15) & 0.77 (0.35) & \textbf{0.99} (0.02) \\
		body & 0.8 (0.23) & 0.76 (0.17) & 0.77 (0.26) & \textbf{0.99} (0.01) \\
		house & 0.67 (0.39) & 0.74 (0.24) & 0.83 (0.17) & \textbf{0.94} (0.03) \\
		spacega & 0.7 (0.34) & 0.7 (0.23) & 0.69 (0.3) & \textbf{0.97} (0.01) \\
		\midrule % from booktabs package
		\multicolumn{5}{c}{30\% outliers} \\
		\midrule
		bow & 0.77 (0.19) & 0.66 (0.39) & 0.72 (0.34) & \textbf{0.8} (0.37) \\
		F100 & 0.75 (0.11) & 0.6 (0.15) & 0.65 (0.18) & \textbf{0.98} (0.03) \\
		F400 & 0.77 (0.17) & 0.72 (0.28) & 0.86 (0.12) & \textbf{1.0} (0.01) \\
		body & 0.76 (0.18) & 0.71 (0.18) & 0.72 (0.27) & \textbf{1.0} (0.01) \\
		house & 0.67 (0.32) & 0.85 (0.08) & 0.86 (0.1) & \textbf{0.96} (0.02) \\
		spacega & 0.7 (0.28) & 0.61 (0.22) & 0.58 (0.26) & \textbf{0.98} (0.01) \\
		\midrule
		\multicolumn{5}{c}{40\% outliers} \\
		\midrule
		bow & \textbf{0.71} (0.2) & 0.61 (0.35) & 0.6 (0.35) & 0.66 (0.46) \\
		F100 & 0.75 (0.12) & 0.6 (0.15) & 0.66 (0.14) & \textbf{0.93} (0.22) \\
		F400 & 0.76 (0.15) & 0.69 (0.2) & 0.77 (0.16) & \textbf{0.93} (0.24) \\
		body & 0.74 (0.16) & 0.67 (0.12) & 0.66 (0.2) & \textbf{1.0} (0.0) \\
		house & 0.68 (0.25) & 0.83 (0.13) & 0.84 (0.08) & \textbf{0.98} (0.01) \\
		spacega & 0.73 (0.19) & 0.55 (0.25) & 0.57 (0.24) & \textbf{0.98} (0.01) \\
		\bottomrule % from booktabs package
	\end{tabular}
\end{table}
%
\section{Conclusions} \label{sec:conclusions}

%We proposed the trimmed marginal likelihood for robust Bayesian learning, and studied in detail its theoretical and empirical performance for Gaussian process regression. 

The $\nu$-trimmed marginal likelihood ($\nu$-GP) approach is a natural extension of the empirical Bayes framework to robust Gaussian Process (GP) regression.
While for GP regression it is common to optimize the covariance function parameters by maximizing the marginal likelihood,  here, we additionally proposed to optimize (= select) the subset of data points that maximize the marginal likelihood.
We showed that the trimming ratio $\nu$ is an intuitive hyper-parameter since it corresponds to an upper bound on the outlier ratio and has the theoretic guarantee of controlling the breakdown point.
Note that this is in contrast to the hyper-parameters of commonly used robust methods like the student-$t$ likelihood and the $\gamma$-divergence, which are difficult to interpret.
In case no prior knowledge about an upper bound on the outlier ratio is available, we proposed to iteratively refine a conservative estimate of $\nu = 0.5$, which is 
the same breakdown point as that of the student-$t$ likelihood.

In practice, the success of $\nu$-GP hinges on an efficient method for optimizing the subset of inliers. 
For that purpose, we proposed a projected gradient descent (PGD) method, proved its theoretic convergence guarantees, and showed empirically that the quality of the optimization is on par with greedy methods, while being computationally much more efficient.
Finally, the resulting $\nu$-GP with PGD compared favorably against common robust GP methods in terms of outlier detection and test prediction.

\section{Limitations and Future Work}
Due to the cross-validation, the computational costs of Algorithm \ref{alg:improvingNuEstimate} can be too high for large datasets. Moreover, for non-parametric regression there is an inherent ambiguity in whether a group of samples should be considered as outliers or as samples from the inlier distribution. Therefore, our future work aims to identify not only one partition of outliers and inliers, but different plausible partitions, similar in spirit to the works in \citep{Riani2014}.


\begin{contributions}
	The first author Daniel Andrade conceived the idea, created all code and wrote the paper.
	The coauthor Akiko Takeda suggested the projected gradient descent method and proved Theorem \ref{thm:pgdConvergence}.
\end{contributions}

\begin{acknowledgements}
	This work was supported by JSPS KAKENHI Grant Numbers 19H04069 and 23H03351.
	We also thank NEC Corporation and Yuzuru Okajima who helped with fruitful discussions about related preliminary work.
	We are also very grateful for the constructive comments of the anonymous reviewers, which helped to improve this work.
\end{acknowledgements}


%We proposed a new method for outlier detection in non-linear regression that is based only on the marginal likelihood. 
%Our proposed method has the advantages of 
%\begin{itemize}
%\item not imposing a particular outlier model (as for example the student-t GP);
%\item only has one intuitive hyper-parameter $\nu$ with the theoretic guarantee of controlling the breakdown point;
%\item a bias correction method that allows for automatic determination of $\nu$, and also for intuitive plots that can be used for visually inspecting the number of outliers.
%\end{itemize}
%
%Furthermore, we proposed a new projected gradient descent method that allows for a computationally efficient solution to the combinatorial problem of selecting the best subset $S$. %  subject to the constraint on its size.

% References
\bibliography{../../../all_papers_bibliography_extended}
\end{document}
