\documentclass[accepted]{uai2024} % for initial submission
%\documentclass[accepted]{uai2024} % after acceptance, for a revised version; 
% also before submission to see how the non-anonymous paper would look like 
                        
%% There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2024} % ptmx math instead of Computer
                                         % Modern (has noticeable issues)
% \documentclass[mathfont=newtx]{uai2024} % newtx fonts (improves upon
                                          % ptmx; less tested, no support)
% NOTE: Only keep *one* line above as appropriate, as it will be replaced
%       automatically for papers to be published. Do not make any other
%       change above this note for an accepted version.

%% Choose your variant of English; be consistent
\usepackage[american]{babel}
% \usepackage[british]{babel}

%% Some suggested packages, as needed:
\usepackage[numbers]{natbib} % has a nice set of citation styles and commands
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} % amsmath with fixes and additions
% \usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{tikz} % nice language for creating drawings and diagrams
\usepackage{wrapfig}
\usepackage{mathptmx}
\usepackage{bm} 
\usepackage{soul} 
\usepackage{enumitem} 
\setlist[itemize]{noitemsep, topsep=0pt,leftmargin=9pt}
\usepackage{amsmath,amsthm,amsfonts,amssymb}
\usepackage[linesnumbered,ruled,vlined]{algorithm2e}



% Theorem-like environments. No shared-counter argument is given to any of the
% \newtheorem calls below, so each environment is numbered with its own counter.
\newtheorem{theorem}{Theorem}
\newtheorem{corollary}{Corollary}
\newtheorem{lemma}{Lemma}
\newtheorem{proposition}{Proposition}
%% User-defined notation macros
% \bl[<sym>]: typesets <sym> in bold using the old-style {\bf ...} switch.
% NOTE(review): the `[1]` sits INSIDE the first brace group, so this is not the
% usual \newcommand{\bl}[1]{...}. It appears to be parsed as
% \newcommand{\bl}[1][0]{...}, i.e. ONE OPTIONAL argument with default `0`,
% which would explain why the calls visible here use brackets (\bl[z], \bl[a]).
% Confirm before normalising this definition: \bl{z} is not equivalent to \bl[z].
\newcommand{\bl[1]}{{\bf #1}}
% Bold z (built on \bl).
\newcommand{\bz}{\bl[z]}
% Bold Greek xi and gamma; \bm (package bm) is used because these are Greek letters.
\newcommand{\bxi}{\bm{\xi}}
\newcommand{\bgamma}{\bm{\gamma}}
% Bold x, written directly with {\bf ...} rather than through \bl.
\newcommand{\bx}{{\bf x}}
% \Dmat currently expands to the plain letter W.
\newcommand{\Dmat}{W}
% \Domega expands to a bold omega; the commented line below is an alternative
% definition (D_\Omega) that is not active.
\newcommand{\Domega}{\bm{\omega}}
%\newcommand{\Domega}{D_\Omega}
% Bold v and bold u (built on \bl).
\newcommand{\bv}{\bl[v]}
\newcommand{\bu}{\bl[u]}

% \RN{<n>}: upright upper-case Roman numeral for the number <n>
% (e.g. \RN{2} gives II). The \expandafter makes \romannumeral expand to the
% lower-case numeral before \uppercase is applied to it.
\newcommand{\RN}[1]{%
  \textup{\uppercase\expandafter{\romannumeral#1}}%
}
%% Provided macros
% \smaller: Because the class footnote size is essentially LaTeX's \small,
%           redefining \footnotesize, we provide the original \footnotesize
%           using this macro.
%           (Use only sparingly, e.g., in drawings, as it is quite small.)

%% Self-defined macros
% \swap[<sep>]{<a>}{<b>}: typesets <b><sep><a>, i.e. the two mandatory
% arguments in reverse order around the optional separator (default `-`).
% NOTE(review): marked as an example by its author; no use of \swap is visible
% in this part of the file.
\newcommand{\swap}[3][-]{#3#1#2} % just an example

\title{Linearly Constrained Gaussian Processes are SkewGPs: application to Monotonic Preference Learning and Desirability}

% The standard author block has changed for UAI 2024 to provide
% more space for long author lists and allow for complex affiliations
%
% All author information is authomatically removed by the class for the
% anonymous submission version of your paper, so you can already add your
% information below.
%
% Add authors
\author[1]{Alessio Benavoli}
\author[2]{Dario Azzimonti}

% Add affiliations after the authors
\affil[1]{%
    School of Computer Science and Statistics\\
    Trinity College Dublin, Ireland
}
\affil[2]{%
    Dalle Molle Institute for Artificial Intelligence (IDSIA)\\
    USI/SUPSI\\
    Lugano, Switzerland
\vspace{0cm}}
  
  \begin{document}
\maketitle

\begin{abstract}
We show that existing approaches to Linearly Constrained Gaussian Processes (LCGP) for regression, based on imposing constraints on a finite set of operational points, can be seen as Skew Gaussian Processes (SkewGPs). In particular, focusing on inequality constraints and building upon a recent unification of regression, classification, and preference learning through SkewGPs, we extend LCGP to handle monotonic preference learning and desirability, crucial for  understanding and predicting human decision making.  We demonstrate the efficacy of
the proposed model on simulated and real data.
\end{abstract}

\section{Introduction}\label{sec:intro}
Preference learning \citep{furnkranz2010preference} aims at learning predictive preference models from data.  Unlike regression or classification, where the target variable is a scalar, preference data is in the form of pairwise comparisons, which express a  subject's preference between alternative options. Applications of preference learning are ubiquitous in recommendation systems across diverse domains, such as e-commerce, social media, and entertainment platforms. Consider a  set $\mathcal{X} \subset \mathbb{R}^n$ and a binary relation $R$ on $\mathcal{X}$ expressed by a subject (that is, $R$ is a subset of $\mathcal{X} \times \mathcal{X}$). Mathematically, a \textit{strict preference}\footnote{This paper focuses on strict preference rather than weak preference. Learning weak preference would result in a zero denominator in Bayes' rule when using continuous distributions like Gaussian Processes (GPs), as opposed to discrete distributions. In strict preference, we could incorporate a `just noticeable difference' threshold to model situations where a subject judges two options equivalent because the difference in their utility is small (below a threshold).} is a binary relation, denoted by $\succ$, which is asymmetric and  negatively transitive \citep[Ch.\ 2]{kreps1990course}.\footnote{\textit{Asymmetric}: $\forall {\bf x}, {\bf y} \in \mathcal{X}$ if ${\bf x} R  {\bf y}$ then not ${\bf y} R {\bf x}$. \textit{Negatively transitive}: if ${\bf x} R {\bf y}$ then for any other element ${\bf z} \in \mathcal{X}$ either ${\bf x} R {\bf z}$ or ${\bf z} R {\bf y}$ or both.} For instance, imagine you are planning a trip from destination A to destination B and have three train options. What is your preference?
$$
\begin{aligned}
&\text{Option~1:~ cost=5€, travel-time=15min},\\
&\text{Option~2:~ cost=7€, travel-time=10min},\\
&\text{Option~3:~ cost=3€, travel-time=20min}.
\end{aligned}
$$
In this case $\mathcal{X}=\mathbb{R}^2$ and ${\bf x}_1=[5,15]^\top$, ${\bf x}_2=[7,10]^\top$, ${\bf x}_3=[3,20]^\top \in \mathcal{X}$. Stating that Option 1 is preferred to Option 2 is denoted as ${\bf x}_1 \succ {\bf x}_2$. Then, asymmetry implies that if  ${\bf x}_1 \succ {\bf x}_2$ then  ${\bf x}_2 \nsucc {\bf x}_1$. Negative transitivity  implies that if  ${\bf x}_1 \succ {\bf x}_2$  then either  ${\bf x}_1 \succ {\bf x}_3$  or  ${\bf x}_3 \succ {\bf x}_2$  or both. These are the minimum \textit{consistency} properties defining a strict preference relation. However, in many applications, it is reasonable to assume further properties. For instance, in the above example, it  seems natural to assume that any subject should prefer Option 5 to Option 4:
$$
\begin{aligned}
&\text{Option~4:~ cost=3€, travel-time=9min},\\
&\text{Option~5:~ cost=2€, travel-time=4min}.
\end{aligned}
$$
This property is called \textit{strict monotonicity}: if ${\bf x},{\bf y} \in \mathcal{X}$ and ${\bf x}\leq {\bf y}$, ${\bf x}\neq {\bf y}$ then  ${\bf x}\succ {\bf y}$ (where $\leq$ means that every component of ${\bf x}$ is no larger than the corresponding component of ${\bf y}$).\footnote{Depending on the application, we can obviously define strict monotonicity in the opposite direction: if ${\bf x}\geq {\bf y}$, ${\bf x}\neq {\bf y}$ then  ${\bf x}\succ {\bf y}$.} Our objective is to learn strictly monotonic preferences from pairwise data. 
Assuming also continuity of the preference relation \citep[Ch.\ 2]{kreps1990course}, one can prove that any strictly monotonic preference is representable by a strictly monotone utility function $f$. Therefore, learning a preference can be formulated as the problem of learning a monotonic utility function that represents it.\footnote{This representation is not unique. Utility functions are invariant under increasing transformations. Given a utility function $u$, we can define a new utility function $g(u(x))$ for any  increasing function $g$, that is
$x \succ y ~\textit{ iff }~ u (x) > u (y) ~\textit{ iff }~g(u (x)) > g(u (y))$. So $u$ and $g(u)$ represent the same strict preference relation. We will come back to this point later in the paper.}
In real-world scenarios, individuals, when expressing their preferences, often deviate from these consistency properties for different reasons.  Accurately representing erroneous preferences requires modelling errors through tools like random utility models \citep{mcfadden1974,mcfadden1978}, i.e., a subject's preference is determined by a noisy utility function. This is crucial for learning because it requires us to define a likelihood function for preference data that accounts for these errors.


A powerful way to learn unknown functions is through  \textit{Gaussian Processes} (GP)~\citep{o1978curve,rasmussen2006gaussian}, which are priors over functions. A GP-based method for preference learning was first proposed in \citep{ChuGhahramani_preference2005,houlsby2011bayesian} with a \textit{probit} likelihood (to account for errors).\footnote{Due to the probit likelihood, in the preference $x \succ y ~\textit{ iff }~ u (x) > u (y)$, now also  the magnitude of their utility difference $u (x)-u(y)$ is important (and not only the relative ranking of two items $u (x) > u (y)$). The reason is that we are assuming that the probability of errors decreases as $|u (x) - u (y)|$ grows, that is, it is more difficult for a subject to express a preference between two options with close utility. This makes the scale of the utility function identifiable.} This approach offers two advantages: a nonlinear utility in the covariates and the representation of uncertainty through the posterior. Since the posterior is not Gaussian, \citet{ChuGhahramani_preference2005} proposed Laplace's approximation for inference. Other approximations were considered in \cite{houlsby2011bayesian}.  More recently, \cite{benavoli2020preferential,Benavoli2021} showed that the posterior has a closed form, called SkewGP, and exploited this relationship to efficiently sample from the posterior. Applications of preference learning for active learning and Bayesian optimisation have been investigated by \citep{shahriari2015taking,gonzalez2017preferential,siivola2021preferential,benavoli2021preferential,nguyen2021top,benavoli2023d}.

Many recent works \cite{riihimaki2010gaussian,wang2016estimating,agrell2019,da2020gaussian,golchi2015monotone,maatouk2017gaussian,lopez2018finite,lopez2022high,maatoukhal-04084865,lin2014bayesian,andersen2018non} developed GP models for regression that satisfy monotonicity or, more generally, linear inequality constraints. In particular, \cite{riihimaki2010gaussian,wang2016estimating,agrell2019,da2020gaussian,golchi2015monotone}  enforce monotonicity constraints by imposing them on a finite set of operational points. The works in \citep{maatouk2017gaussian,lopez2018finite,lopez2022high,maatoukhal-04084865} exploit a finite-dimensional kernel to extend the monotonicity constraint to the whole domain. Other approaches  \citep{lin2014bayesian,andersen2018non} impose shape constraints through squared Gaussian process derivatives and series expansions. Finally, \textit{monotonic-GP-flow} \citep{ustyuzhaninov2020monotonic} imposes monotonicity on GPs  based on numerical solutions of stochastic differential equations. We point the reader to \citep{swiler2020survey} for a comprehensive survey of LCGP, including bound constraints,  monotonicity and linear partial differential operator constraints. In this work we aim to bring the recent advances in modelling monotonicity constraints with GPs for regression to the preference learning setting. 

The contributions of this work are the following:  
\begin{itemize}
    \item %The imposition of monotonic constraints, more in general linear constraint, in GP for regression  has been investigated by  \cite{riihimaki2010gaussian,wang2016estimating,agrell2019,da2020gaussian,golchi2015monotone,maatouk2017gaussian,lopez2018finite,lopez2022high,maatoukhal-04084865,lin2014bayesian,andersen2018non}. In these works, the constraints are imposed on a finite set of operational points. 
    We show that linearly constrained Gaussian Processes (LCGP) that impose monotonicity constraints with a finite set of operational points are SkewGPs. 
    \item Exploiting the conjugacy of SkewGPs with the normal and probit likelihood (and their product)  \citep{Benavoli2021}, we extend LCGP models to preference learning and classification tasks, deriving a novel nonparametric model for monotonic preference learning and desirability learning (which is equivalent to a monotonic classification problem \citep{Benavoli2023e,CASANOVA20231}).
    \item We compare our SkewGP-formulation of monotonic regression and preference learning against \textit{monotonic-GP-flow} on % \citep{ustyuzhaninov2020monotonic}, which imposes monotonicty on GPs  based on numerical solutions of stochastic differential equations. Using 
     7 1D benchmark functions. Our SkewGP outperforms monotonic-GP-flow in both accuracy and uncertainty quantification.
    \item We apply SkewGP to two preference datasets demonstrating that models without monotonicity constraints can produce wrong predictions, thus highlighting the importance of incorporating monotonicity constraints.
\end{itemize}

In this work, we focus on monotonicity constraints; we leave the extension to general linear inequality constraints for future work.

%In sec.~\ref{sec: Background} we provide the necessary background on SkewGPs, sec.~\ref{sec:LCGPasSkewGP} shows how to build a skewGP that satisfies linear constraints and sec.~\ref{sec:monGP} links this model with existing models. Sec.~\ref{sec:Desirability} applies the proposed model to desirability learning and sec.~\ref{sec:experiments} benchmarks SkewGP against the state-of-the-art and showcases the method on two real-world preference learning problems. 

%\hl{Dario, I removed the summary of the paper to save space. I will see later whether I can put it back}




\section{Skew-normal distribution and  skew-Gaussian processes} \label{sec: Background}
The unified skew-normal (SUN) distribution \citep{arellano2006unification,azzalini2013skew,durante2018conjugate,alodat2020gaussian}  generalises the normal distribution by allowing for non-zero skewness. A vector ${\bf z}$ distributed as skew-normal can be constructed from a multivariate normal which is truncated in some of its components, see \cite[Ch.7]{azzalini2013skew}. Consider two vectors ${\bf z}_0\in \mathbb{R}^{s},{\bf z}_1\in \mathbb{R}^{p}$ such that:
\begin{align}
\label{eq:jointnormal}
\begin{bmatrix}
{\bf z}_1\\
{\bf z}_0
\end{bmatrix}\sim N({\bf 0}_{s+p}, M), ~M=\begin{bmatrix}
\Omega  & \Delta\\
\Delta^\top & \Gamma
\end{bmatrix},
\end{align}
where $M$ is a full-rank covariance matrix. Define $\boldsymbol{\zeta}$ to be distributed as $N({\bf z}_1 |{\bf z}_0 + \bgamma > {\bf 0}_{s})$, where $\bgamma \in \mathbb{R}^s$  and the inequality
${\bf z}_0 + \bgamma > {\bf 0}_{s}$ holds component-wise.\footnote{Note that, in the standard construction of the SUN distribution \cite[Ch.7]{azzalini2013skew}, the matrix $M$ is a correlation matrix. However, we can obtain the standard construction from \eqref{eq:jointnormal} by a change of variables.}
Then, given a location vector $\bxi \in \mathbb{R}^p$, ${\bf z}=\bxi+\boldsymbol{\zeta}\in \mathbb{R}^{p}$ is distributed as a \textit{multivariate
unified skew-normal distribution} with latent skewness dimension $s$. We denote $ \bz \sim \text{SUN}_{p,s}(\bxi,\Omega,\Delta,\bgamma,\Gamma )$ and its Probability Density Function (PDF) is given by:
\begin{align}
\nonumber
p(\bz) &= \phi_{p}(\bz-\bxi;\Omega)\\
\label{eq:sun}
& \frac{\Phi_s\left(\bgamma+\Delta^\top{\Omega}^{-1}(\bz-\bxi);\Gamma-\Delta^\top{\Omega}^{-1}\Delta\right)}{\Phi_s\left(\bgamma;\Gamma \right)}, 
\end{align}
where $\phi_p(\bz-\bxi;\Omega)$ denotes the PDF of a multivariate normal distribution with mean $\bxi \in \mathbb{R}^{p}$ and covariance $\Omega\in \mathbb{R}^{p\times p}$, and $\Phi_s(\bl[a];M)$ represents the Cumulative Distribution Function (CDF) of an $s$-dimensional multivariate normal distribution with zero mean and covariance matrix $M$ evaluated at $\bl[a]\in \mathbb{R}^s$. 
The parameters $\bgamma \in \mathbb{R}^s, \Gamma \in \mathbb{R}^{s\times s},\Delta \in \mathbb{R}^{p \times s}$ control the skewness of the distribution, in particular $\Delta$ is called \textit{skewness matrix}. When $\Delta=0$, eq.~\eqref{eq:sun}  reduces  to  $\phi_p(\bz-\bxi;\Omega)$, i.e. a skew-normal with zero skewness matrix is a normal distribution. Moreover, we assume that $\Phi_0(\cdot)=1$, so that, for $s=0$, eq.~\eqref{eq:sun} becomes a multivariate normal distribution. Figure~\ref{fig:SUN1d} shows the density of a univariate SUN distribution with latent dimensions $s=1$ and $s=2$.  %\hl{Dario, riesci a rifare le immagii con i fonts piu' grandi}

	\begin{figure}[htp]
	\centering
	\begin{tabular}{cc}
		\includegraphics[width=0.48\linewidth]{figs/SUN_p1s1_jmrl.pdf} &
\includegraphics[width=.48\linewidth]{figs/SUN_p1s2_jmlr.pdf} \\
		\small $s=1$, $\Gamma=1$  & \small $s=2$, $\Gamma=\operatorname{diag}([0.8,0.8])$
	\end{tabular}
	\caption{Density plots for $\text{SUN}_{1,s}(0,1,\Delta,\gamma,\Gamma)$. For all plots $\Gamma$ is a correlation matrix, $\gamma = 0$, dashed lines are the contour plots of $y \sim N_1(0,1)$.}
	\label{fig:SUN1d}
\end{figure}

%Figure~\ref{fig:SUN1d} shows the density of a univariate SUN distribution with latent dimensions $s=1$ (left) and $s=2$ (right). 

A SkewGP \citep{benavoli2020skew, Benavoli2021} is a generalisation of a SUN distribution to a stochastic process which becomes a GP when skewness is zero, see Appendix~\ref{app:gp} for a quick recap on GPs. To define a SkewGP, we consider here a location function $\xi: \mathbb{R}^d \rightarrow \mathbb{R}$, a scale (kernel) function $\Omega: \mathbb{R}^d \times \mathbb{R}^d \rightarrow \mathbb{R}$, a skewness vector function $\Delta: \mathbb{R}^d \rightarrow \mathbb{R}^{s}$ and the parameters $\bgamma \in \mathbb{R}^s, \Gamma \in \mathbb{R}^{s \times s}$. 
A real function $f: \mathbb{R}^d \rightarrow \mathbb{R}$ is SkewGP-distributed with latent dimension $s$, if for any sequence of $n$ points $\bx_1, \ldots, \bx_n \in \mathbb{R}^d$, the vector $[f(\bx_1), \ldots, f(\bx_n)]^\top \in \mathbb{R}^n$ is SUN distributed with parameters $\bgamma, \Gamma$ and location, scale and skewness matrices,
respectively, given by
\begin{equation}
\begin{array}{rl}
\xi(X)&=\left[\begin{smallmatrix}
\xi(\bx_1), \xi(\bx_2),\dots, \xi(\bx_n)
\end{smallmatrix}\right]^\top,\vspace{0.1cm}\\
\Omega(X,X)&=\left[
\begin{smallmatrix}
\Omega(\bx_1,\bx_1) & \Omega(\bx_1,\bx_2) &\dots & \Omega(\bx_1,\bx_n)\\
\Omega(\bx_2,\bx_1) & \Omega(\bx_2,\bx_2) &\dots & \Omega(\bx_2,\bx_n)\\
\vdots & \vdots &\dots & \vdots\\
\Omega(\bx_n,\bx_1) & \Omega(\bx_n,\bx_2) &\dots & \Omega(\bx_n,\bx_n)\\
\end{smallmatrix}\right],\vspace{0.1cm}\\
\Delta(X)&=\left[\begin{smallmatrix}\Delta(\bx_1),\Delta(\bx_2),\dots, \Delta(\bx_n)\\
\end{smallmatrix}\right],
\end{array}
\end{equation}
where $X=[\bx_1, \bx_2, \dots, \bx_n]^\top$.  In this case, we write $f({\bf x}) \sim \text{SkewGP}_s(\xi({\bf x}), \Omega({\bf x},{\bf x}),\Delta({\bf x}),\bgamma, \Gamma)$.
SkewGPs are conjugate with both the normal and affine probit likelihood and, more generally, with their product. This allows us  to derive their posterior for nonparametric regression, classification, preference learning and mixed problems.

In particular,  consider the affine-probit-normal product  likelihood:  
\begin{equation}
\begin{aligned}
p(Y,Z,W \mid f(X)) &= \Phi_{m_a}(Z+Wf(X); \Sigma)\\
 &\cdot \phi_{m_r}(Y-Cf(X);R),
\end{aligned}
\label{eq:mixedlike}
\end{equation}
where $m_r$ (the subscript $r$ stands for regression) denotes the number of regression-type observations  and $m_a$ the number of binary/preference-type observations (the subscript $a$ stands for affine). Therefore, we have that
$Y \in \mathbb{R}^{m_r}$, $C \in \mathbb{R}^{m_r \times n}$, $W \in \mathbb{R}^{m_a \times n}$, $Z \in \mathbb{R}^{m_a \times 1}$. The matrices $ \Sigma \in \mathbb{R}^{m_a \times m_a}$, $R \in  \mathbb{R}^{m_r \times m_r}$ are covariance matrices.  
This likelihood encompasses all the standard likelihood functions used in regression, classification and preference-learning. 
For instance, a standard regression  is obtained by setting $C=I_{m_r}$, $R=\sigma^2 I_{m_r}$ and $m_a=0$; classification is obtained for $W=\operatorname{diag}(2Y-1)$, $Z={\bf 0}_{m_a}$, $\Sigma=I_{m_a}$ and $m_r=0$, where $Y$ is the vector containing the observed class values $Y_i \in\{0,1\}$. 
Preference learning is obtained with $Z={\bf 0}_{m_a}$, $\Sigma=I_{m_a}$, $m_r=0$ and $W \in \mathbb{R}^{m_a \times n}$ whose $k$-th row is all zero apart from $W_{ki}=1,W_{kj}=-1$ if the $k$-th preference in the data is ${\bf x}_i\succ {\bf x}_j$. %(the remaining matrices are the same as in classification).

We  now report this result from \cite[Theorem 3]{Benavoli2021}.

\begin{proposition}
	\label{prop:postmixed}	
	Let us assume a SkewGP prior 	$f({\bf x}) \sim \text{SkewGP}_s(\xi({\bf x}), \Omega({\bf x},{\bf x}),\Delta({\bf x}),\bgamma, \Gamma)$, the likelihood \eqref{eq:mixedlike}, then a-posteriori $f({\bf x})$ is SkewGP with mean, covariance and skewness functions:
 \vspace{-0.15cm}
   \begin{align}
   \nonumber
	\tilde{\bxi}({\bf x})  &=\bxi({\bf x})\\
    \nonumber
 &+\Omega({\bf x},X) C^T(C\Omega(X,X) C^T+R)^{-1}(Y-C\xi(X)),\\
 \nonumber
	\tilde{\Omega}({\bf x},{\bf x}) &= \Omega({\bf x},{\bf x})\\
 \nonumber
 &-\Omega({\bf x},X) C^T(C\Omega(X,X) C^T+R)^{-1}C\Omega(X,{\bf x}),\\
	\nonumber
\tilde{\Delta}({\bf x}) &=\begin{bmatrix}
\Delta(\bx)~ & \Omega(\bx,X)W^T
\end{bmatrix}\\
\nonumber
	&-\Omega({\bf x},X)C^T (C\Omega(X,X) C^T+R)^{-1} C \\
 \nonumber
 &\cdot \begin{bmatrix}
\Delta(X)~ & \Omega(X,X)W^T
\end{bmatrix},\\
\nonumber
\tilde{\bgamma} &= \bgamma_p+\begin{bmatrix}
\Delta(X)~ & \Omega(X,X)W^T
\end{bmatrix}^T\\
\nonumber
&\cdot {\Omega}(X,X)^{-1}(\tilde{\bxi}(X)-\bxi(X))\\
\nonumber
\tilde{\Gamma} &=  \Gamma_p- \begin{bmatrix}
\Delta(X)~ & \Omega(X,X)W^T
\end{bmatrix}^T\\
\nonumber
&\Omega^{-1}(X,X)\begin{bmatrix}
\Delta(X)~ &\Omega(X,X)W^T
\end{bmatrix}\\
\nonumber
&+\Delta_p^T\tilde{\Omega}(X,X)^{-1}\Delta_p,\\
\nonumber
\Delta_p&=\tilde{\Omega}(X,X){\Omega}^{-1}(X,X)\\
\nonumber
&\cdot\begin{bmatrix}
\Delta(X)~ & \Omega(X,X)W^T
\end{bmatrix},\\
\nonumber
\bgamma_p &=[\bgamma,~~Z+W\xi(X)]^T, \\
\nonumber
\Gamma_p&=\begin{bmatrix}
	\Gamma & ~~\Delta(X)^T  W^T \\
	W  \Delta(X) & ~~(W \Omega(X,X) W^T + \Sigma) \end{bmatrix}.
\end{align}
\end{proposition}
The computation of predictive inference (posterior mean, credible intervals etc.) can be achieved by sampling the posterior SkewGP. 
 Recall \cite[Ch.7]{azzalini2013skew} that $\mathbf{z} \sim \operatorname{SUN}_{p,s}(\bxi,\Omega, \Delta, \bgamma, \Gamma)$ can be written as $\mathbf{z} = \bxi + \mathbf{r}_0 + \Delta \Gamma^{-1}\mathbf{r}_{1,-\gamma}$, where $\mathbf{r}_0\sim \phi_p(0; \Omega-\Delta \Gamma^{-1}\Delta^T)$ and $\mathbf{r}_{1,-\gamma}$ is the truncation below $-\bgamma$ of $\mathbf{r}_{1} \sim \phi_s(0;\Gamma)$ (that is, $\mathbf{r}_{1}$ conditioned on $\mathbf{r}_{1} > -\bgamma$ component-wise). Note that sampling $\mathbf{r}_0$ can be achieved efficiently with standard methods, and  $\mathbf{r}_{1,-\gamma}$  can be obtained efficiently using methods such as Gibbs sampling \citep{taylor2016restrictedmvn}, linear elliptical slice sampler \citep{gessner2019integrals},  minimax tilting method accept-reject sampler \cite{botev2017normal} and Hamiltonian Monte-Carlo \citep{pakman2014exact}.

Similarly to GPs, the functions and matrices defining a SkewGP, $\text{SkewGP}_s(\xi({\bf x}), \Omega({\bf x},{\bf x}),\Delta({\bf x}),\bgamma, \Gamma)$ may depend on hyperparameters  ${\boldsymbol \theta}$. These parameters  are chosen by maximising the marginal likelihood, which is equal to:
 \begin{equation}
  \label{eq:ml_normal_mix}
  p(Y)=\phi_{m_r}(Y-C\xi(X);C\Omega(X,X)C^T+R)\frac{ \Phi_{s+m_a}(\tilde{\bgamma};~\tilde{\Gamma})}{\Phi_{s}(\bgamma;~\Gamma)},
 \end{equation}
 where $\tilde{\bgamma},\tilde{\Gamma}$ are defined in  Proposition~\ref{prop:postmixed}. This involves the computation of the high-dimensional multivariate CDFs $ \Phi_{s+m_a}(\cdot), \Phi_{s}(\cdot)$. We use a variational inference technique to approximate the posterior distribution with a Gaussian distribution. This provides a lower bound for \eqref{eq:ml_normal_mix}, which we maximise to find the hyperparameters.




\section{A linearly constrained  GP is a SkewGP}
\label{sec:LCGPasSkewGP}
We recall \citep[Sec.\ 9.4]{rasmussen2006gaussian} that if $f:\mathbb{R}^D \rightarrow \mathbb{R}$ is GP distributed, that is $f \sim \text{GP}(0,k)$ with kernel $k$, then its first derivative $\tfrac{\partial f_i}{\partial x_{ik}}$
%and second   $\tfrac{\partial^2 f_{i}}{\partial x_{ik}^2}$ derivative 
is also GP-distributed with covariance (kernel):
\begin{align}
 k^I(\bx_i,\bx_j) &:=\text{cov}\left(f_i,\tfrac{\partial f_j}{\partial x_{jl}}\right)=\frac{\partial k(\bx_i,\bx_j)}{\partial x_{jl}},\\
%\text{cov}\left(f_i,\tfrac{\partial^2 f_j}{\partial x_{jl}^2}\right)&=\frac{\partial^2 k(\bx_i,\bx_j)}{\partial x_{jl}^2},\\
k^{II}(\bx_i,\bx_j)&:=\text{cov}\left(\tfrac{\partial f_i}{\partial x_{il}},\tfrac{\partial f_j}{\partial x_{je}}\right)=\frac{\partial^2 k(\bx_i,\bx_j)}{\partial x_{il}\partial x_{je}},
%\text{cov}\left(\tfrac{\partial f_i}{\partial x_{ik}},\tfrac{\partial^2 f_j}{\partial x_{jj}^2}\right)&=\frac{\partial^3 k(\bx_i,\bx_j)}{\partial x_{ik}\partial x_{jj}^2},\\
%\text{cov}\left(\tfrac{\partial^2 f_i}{\partial x_{ii}},\tfrac{\partial^2 f_j}{\partial x_{jj}^2}\right)&=\frac{\partial^4 k(\bx_i,\bx_j)}{\partial x_{ii}^2\partial x_{jj}^2}.
\end{align}
where $f_i:=f(\bx_i)$, for each pair of inputs $\bx_i,\bx_j$ and each $l,e \in \{1,2,\dots,D\}$. 

We introduce a vector $U=[\bu_1,\bu_2,\dots,\bu_{r}]^\top$, with $\bu_{i} \in \mathbb{R}^D$, of \textit{operational points} and define
 ${\bf f}'(\bu_i)=[\tfrac{\partial}{\partial u_{i1}}f(\bu_i),\dots,\tfrac{\partial}{\partial u_{iD}}f(\bu_i)]^\top$. 
 %and ${\bf f}''(\bu_i)=[\tfrac{\partial^2}{\partial u_{i1}^2}f(\bu_i),\dots,\tfrac{\partial^2}{\partial u_{id}^2}f(\bu_i)]^\top$ 
 We assume that the vector $[f(\bx_1), \dots, f(\bx_n), { f}(\bu_1),\dots, { f}(\bu_r),
{\bf f}'(\bu_1)^\top, \dots, {\bf f}'(\bu_r)^\top]^\top$ %\ {\bf f}''(\bu_1), \dots, {\bf f}''(\bu_r) 
is GP distributed with zero-mean and covariance matrix
\begin{equation}
\label{eq:covder}
%\resizebox{\hsize}{!}{$
M=\left[\begin{matrix}
 K(X,X)  &  K(X,U)  &  K^\RN{1}(X,U)  \\ %&   K^\RN{2}(X,U) \Domega_3^{-1}\\
 K(X,U)^\top   & K(U,U)  & K^\RN{1}(U,U) \\ % &  K^\RN{2}(U,U)\Domega_3^{-1}\\
 K^\RN{1}(X,U)^\top  &  K^\RN{1}(U,U)^\top &  K^\RN{2}(U,U) \\ %&  K^\RN{3}(U,U) \Domega_3^{-1}\\
 %\Domega_3^{-1}K^\RN{2}(X,U)^\top  &  \Domega_3^{-1}K^\RN{2}(U,U)^\top  &  \Domega_3^{-1} K^\RN{3}(U,U)  &  \Domega_3^{-1}K^\RN{4}(U,U) \Domega_3^{-1}\\
\end{matrix}\right].%}
\end{equation}
%\hl{Alessio, do the other LCGPs also have components $K(X,U)$, $K^\RN{1}(X,U)$ that fix the dependence between operational and training points?}
%\hl{Dario, I checked: yes!}
We  define a linearly constrained GP by imposing:
\begin{align}
\label{eq:constr}
L\left[\begin{smallmatrix}
{ f}(\bu_1)\\
\vdots \\
{ f}(\bu_r)\\
{\bf f}'(\bu_1)\\
\vdots \\
{\bf f}'(\bu_r)\\
%{\bf f}''(\bu_1)\\
%\vdots \\
%{\bf f}''(\bu_r)
\end{smallmatrix}\right] + \bgamma >0.
\end{align}
It is immediate to verify that, by suitably selecting $L,\bgamma$, \eqref{eq:constr} allows us to impose bound and monotonicity constraints on $f$ at $U$. We could similarly impose constraints on the second derivative, integral of $f$ or on other affine operators, which preserve Gaussianity.

\begin{theorem} \label{th:linconstr} Assume that \\ $[f(\bx), { f}(\bu_1),\dots, { f}(\bu_r),
{\bf f}'(\bu_1)^\top, \dots, {\bf f}'(\bu_r)^\top]^\top$ is GP distributed with zero-mean and covariance matrix \eqref{eq:covder}. Then, subject to 
 the constraint \eqref{eq:constr}, $f(\bx)+\bxi(\bx)$ is SkewGP distributed with parameters $\bgamma, \bxi(\bx)$, 
\begin{align}
\label{eq:gammaConstr}
\Gamma &= 
\left[\begin{matrix}
LK(U,U)L^T  & L K^\RN{1}(U,U)L^\top \\
  LK^\RN{1}(U,U)L^\top &  LK^\RN{2}(U,U)L^\top\\
\end{matrix}\right],\\
\label{eq:deltaConstr}
\Delta(\bx) &=  
\left[\begin{matrix}
 K(\bx,U)L^\top  &  K^\RN{1}(\bx,U)L^\top\\
\end{matrix}\right],
\end{align}
and scale function $\Omega(\bx,\bx)=k(\bx,\bx)$.
\end{theorem}
%Therefore,  a linearly constrained GP can be viewed as a SkewGP with the parameters defined as before.
The proofs of this and the following theorems are in Appendix~\ref{app:proofs}.
This result allows us to leverage the SkewGP as a prior distribution over functions and compute posteriors for regression, classification, and preference learning tasks (as shown in Proposition \ref{prop:postmixed}). These posteriors are guaranteed to satisfy the specified linear constraint at all operational points.
In the next section, we will illustrate this theorem with a concrete example related to monotonicity.




\section{Monotonic GP}
\label{sec:monGP}
In this section, we demonstrate how SkewGPs offer a unified approach to imposing monotonicity constraints on GPs. We achieve this by showing how SkewGPs can encompass various existing methods from the literature. This unification, combined with the conjugacy property of SkewGPs established in Proposition \ref{prop:postmixed} and Theorem \ref{th:linconstr}, enables us to extend these approaches beyond regression, which has been the primary focus of previous work on linearly constrained GPs.% and apply it to monotonic preference learning.
\setlength{\intextsep}{0pt}%
\setlength{\columnsep}{0pt}%
\begin{wrapfigure}{r}{0.29\textwidth}
\centering
\includegraphics[width=0.25\textwidth,trim={0.25cm 0.05cm 0 0},clip]{figs/1dexample.pdf}
\vspace{-0.2cm}
\caption{}
\label{fig:ex1d}
\end{wrapfigure} As a simple illustration of imposing monotonicity constraints in  regression and preference learning, we will consider the function $f(x) = 3/(1+\exp(-20x+10))$ for $x \in [0,1]$ shown in Figure \ref{fig:ex1d}. We generated a regression and a preference learning dataset as follows. We first evaluated $f$ at $50$ equally spaced points in $[0,0.45]$ and at $50$ equally spaced points in $[0.75,1]$. The $100$ points $\{x_l\}_{l=1}^{100}$ were used to generate regression data with $y_i=f(x_i)+\epsilon_i$ where $\epsilon_i\sim N(0,0.0225)$. From $\{x_l\}_{l=1}^{100}$ we also generated $200$ pairwise preferences as follows: $x_i \succ x_j$ if $f(x_i)+\epsilon_i>f(x_j)+\epsilon_j$ with $\epsilon_i,\epsilon_j\sim N(0,0.0225)$, for randomly selected $x_i,x_j \in \{x_l\}_{l=1}^{100}$.\footnote{$\int I_{f(x_i)+\epsilon_i>f(x_j)+\epsilon_j}(\epsilon_i,\epsilon_j)dN(\epsilon_i,0,\sigma^2)dN(\epsilon_j,0,\sigma^2)=\Phi_1(\tfrac{f(x_i)-f(x_j)}{\sqrt{2}\sigma})$, which gives rise to the probit likelihood in preference learning.} %We will use these datasets to plot the posterior SkewGP under monotonicity constraints.


%We have evaluated $f$ in 50 equally space values in the interval $[0,0.45]$ and, respectively, $[0.75,1]$. We have then used $\{x_l\}_{l=1}^{100}$ to generate regression-type data $y_i=f(x_i)+\epsilon_i$ with $\epsilon_1\sin N(0,0.0225)$ and $200$ pairwise preferences as follows: $x_i \succ x_j$ if $f(x_i)+\epsilon_i>f(x_j)+\epsilon_j$  for randomly selected $x_i,x_j \in \{x_l\}_{l=1}^{100}$.\footnote{$\int I_{f(x_i)+\epsilon_i>f(x_j)+\epsilon_j}(\epsilon_i,\epsilon_j)dN(\epsilon_i,0,\sigma^2)dN(\epsilon_j,0,\sigma^2)=\Phi_1(\tfrac{f(x_i)-f(x_j)}{\sqrt{2}\sigma})$, which gives rise to the probit likelihood in preference learning.} We will use these datasets to plot the posterior SkewGP under monotonicity constraints.



\subsection{Infinite-dimensional kernel}

Existing approaches \cite{riihimaki2010gaussian,wang2016estimating,agrell2019,da2020gaussian,golchi2015monotone}  enforce monotonicity constraints by holding them at specific operational points, denoted as $U$. These approaches can be applied with any kernel, including infinite-dimensional ones, but in general, they guarantee global monotonicity only with high probability. Specific methods for imposing constraints and inference vary across these works. For a detailed comparison, please refer to  \citet[Table 1]{agrell2019}. Our SkewGP-based approach incorporates methods like  \cite{wang2016estimating,da2020gaussian} that perfectly enforce monotonicity constraints at operational points. Note that, for both regression and preference learning, all computations are performed analytically as described in Proposition \ref{prop:postmixed}. By leveraging the analytical derivations, we efficiently obtain the posterior samples (and so mean and credible region) using tailored  MCMC methods as described in Section \ref{sec: Background}. This translates to fast inference.  It is worth noting that methods using soft constraints with indicators replaced by probit function (or Normal likelihood) can also be formulated as SkewGP with different parameters, which will reduce to the frameworks in \citep{riihimaki2010gaussian,agrell2019,golchi2015monotone}.

In order to apply these methods with SkewGPs therefore we only need to define $k,k^I,k^{II}$. For instance, the $D$-dimensional squared-exponential (SE) kernel is
\begin{align}
\nonumber
&k({\bf x}_i,{\bf x}_j)=\sigma_0^2 \exp\left( - \sum_{d=1}^D  \frac{(x_{id} - x_{jd})^2}{2\ell_d^{2}} \right), \\
\nonumber
&k^I({\bf x}_i,{\bf x}_j) = - \sigma_0^2 \exp\left( - \sum_{d=1}^D  \frac{(x_{id} - x_{jd})^2}{2\ell_d^{2}} \right) \ell_l^{-2}  (x_{il} - x_{jl}),\\
\nonumber
&k^{II}({\bf x}_i,{\bf x}_j)  = \sigma_0^2 \exp\left( - \sum_{d=1}^D  \frac{(x_{id} - x_{jd})^2}{2\ell_d^{2}} \right) \\
\nonumber
& \ell_l^{-2} \left( \delta_{lh} - \ell_h^{-2} (x_{il} - x_{jl}) (x_{ih} - x_{jh}) \right) ,
\end{align}
respectively, where $\delta_{lh} = 1$ if $l=h$ and 0 otherwise and $\sigma_0,\ell_d$ for $d=1,\dots,D$ are the hyperparameters of the kernel.
Figure \ref{fig:rbf1D} reports the sampled posterior SkewGP for both regression and preference learning using the dataset generated from the function in Figure \ref{fig:ex1d}. We used a SE kernel with $\ell=0.15$,  $\sigma_0=1$ for regression and $\sigma_0=90$ for preference and imposed the constraints on equally spaced operational points. Note that this approach can be applied directly to multi-dimensional functions. Figure \ref{fig:rbf1D} shows that the posterior inference improves in the constrained case (given the original function $f$ is monotonic) compared to the unconstrained case. In the constrained case, however, the samples do not preserve monotonicity globally. This is a known drawback of the approaches \cite{riihimaki2010gaussian,wang2016estimating,agrell2019,da2020gaussian,golchi2015monotone}. 
Several techniques exist for selecting the location of operational points $U$. We refer to \cite{agrell2019} for a review of these techniques. Appendix~\ref{sec:hyp} details how they are selected in the experimental section.



\begin{figure*}
\begin{tabular}{cccc}
regression unconstr. & regression constr. & preference unconstr. & preference constr. \\
\includegraphics[width=4cm]{figs/1dexample_RBF_constrFalse_regression.pdf} &
\includegraphics[width=4cm]{figs/1dexample_RBF_constrTrue_regression.pdf}&
\includegraphics[width=4cm]{figs/1dexample_RBF_constrFalse_preference.pdf}&
\includegraphics[width=4cm]{figs/1dexample_RBF_constrTrue_preference.pdf}
\end{tabular}
\caption{SkewGP with RBF kernel, $\ell=0.15$,  $\sigma_0=1$ for regression and $\sigma_0=90$ for preference. The thick red line shows the posterior mean, and the shaded region represents the 95\% credible interval. Ten sampled functions are also included to illustrate the uncertainty. Vertical lines denote the operational points where the monotonicity constraint is enforced.}
\label{fig:rbf1D}
\end{figure*}

\subsection{Finite-dimensional kernel}
\label{sec:Mspline}
A way to impose constraints in the whole domain was proposed by  
\citep{maatouk2017gaussian,lopez2018finite,lopez2022high,maatoukhal-04084865}. They achieve this through finite-dimensional
approximations of the GP that converge uniformly as the number of knots increases. Here we follow \citep{lopez2022high} and consider degree 2 monotone splines (M-splines, \citep{ramsay1988monotone}). To define an M-spline of degree $2$, we consider $l+2$ grid points (knots) $(t_{0},\dots,t_{l+1})$ such that $t_0<t_1 < \dots < t_{l}<t_{l+1}$. M-splines are piecewise polynomials defined as:
\begin{align}
\label{eq:mspline}
M_i(x)=\left\{\begin{array}{ll}
\frac{x-t_{i-1}}{t_{i}-t_{i-1}} & t_{i-1} \leq x \leq t_{i}\\
\frac{t_{i+1}-x}{t_{i+1}-t_{i}} & t_{i} \leq x \leq t_{i+1}\\
\end{array}
\right.
\end{align}
for $i=1,\dots,l$. Figure~\ref{fig:mspline} shows the polynomials for $l=8$ and $\{t_i\}_{i=1}^{l}$ equally spaced in $[0,1]$ (and $t_0=-1$, $t_{l+1}=2$).

\setlength{\intextsep}{0pt}%
\setlength{\columnsep}{0pt}%
\begin{wrapfigure}{r}{0.28\textwidth}
\centering
\includegraphics[width=4cm]{figs/msplines.pdf} 
\vspace{-0.25cm}
\caption{M-spline.}
\vspace{-0.45cm}
\label{fig:mspline}
\end{wrapfigure}
Then, the finite-dimensional GP is defined as
\begin{equation}
    f(x)=\sum_{i=1}^l \beta_i M_i(x),
\end{equation}
where the $\beta_i$ are jointly Gaussian distributed with zero mean and covariance $E[\beta_i\beta_j]=\check{k}(t_i,t_j)$, where $\check{k}$ is a kernel. In the rest of the paper, we assume that $\check{k}$ is the SE kernel.
It is then immediate to verify that 
$f$ is GP distributed with zero-mean and covariance kernel
\begin{equation}
\label{eq:kernelMSP}
    k(x,x')=\sum_{i=1}^l\sum_{j=1}^l M_i(x)\check{k}(t_i,t_j)M_j(x').
\end{equation}
We call $k$ in eq.~\eqref{eq:kernelMSP} the `MSP' kernel.
We now show how we can impose monotonicity using SkewGP.
\begin{theorem}
\label{th:mspline}
Consider $l+1$ operational points $[u_1,\dots,u_{l+1}]$ defined as $u_i=(t_{i}+t_{i-1})/2$, then the SkewGP obtained from Theorem  \ref{th:linconstr} with $L=diag([{\bf 0}_r,I_{r}])$ and kernel defined as in \eqref{eq:kernelMSP} is monotone increasing in $[t_1,t_l]$.
\end{theorem}
Therefore, we can also include the approaches \citep{maatouk2017gaussian,lopez2018finite,lopez2022high,maatoukhal-04084865} into the SkewGP framework exploiting Proposition \ref{prop:postmixed} and Theorem \ref{th:linconstr}. As before, we only need to compute $k^I(x,x'),k^{II}(x,x')$. Figure \ref{fig:MSPline1D} shows the posterior SkewGP obtained with the kernel \eqref{eq:kernelMSP}. Compared to Figure \ref{fig:rbf1D}, it can be noticed that the mean and trajectories are piecewise linear and, more importantly, the monotonicity constraint holds globally in the interval $[0,1]$, i.e., all sampled trajectories are monotonic in $[0,1]$.

In \citep{lopez2022high}, the extension to the multidimensional case ${\bf x}=[x_1,\dots,x_D]^\top \in \mathbb{R}^D$ is obtained by considering an additive model
$f({\bf x})=\sum_{d=1}^D f_d(x_d)$ and, therefore, an additive kernel  $k({\bf x},{\bf x}')=\sum_{d=1}^D\sum_{i=1}^l\sum_{j=1}^l M_{di}(x_d)\check{k}(t_{di},t_{dj})M_{dj}(x_d')$. This is the approach we will follow in the rest of the paper.
Note that, it is also possible to use the product kernel, similar to \citep{maatouk2017gaussian} or the ANOVA kernel (including both sums and products). The additive kernel holds the advantage of scaling more effectively to high dimensions.


\begin{figure*}
\begin{tabular}{cccc}
regression unconstr. & regression constr. & preference unconstr. & preference constr. \\
\includegraphics[width=4cm]{figs/1dexample_MSPline_constrFalse_regression.pdf} &
\includegraphics[width=4cm]{figs/1dexample_MSPline_constrTrue_regression.pdf}&
\includegraphics[width=4cm]{figs/1dexample_MSPline_constrFalse_preference.pdf}&
\includegraphics[width=4cm]{figs/1dexample_MSPline_constrTrue_preference.pdf}
\end{tabular}
\caption{SkewGP with MSP kernel, $\ell=0.1$,  $\sigma_0=1$ for regression and $\sigma_0=90$ for preference. The thick red line shows the posterior mean, and the shaded region represents the 95\% credible interval. Ten sampled functions are also included to illustrate the uncertainty. Vertical lines denote the operational points where the monotonicity constraint is enforced.}
\label{fig:MSPline1D}
\end{figure*}

\subsection{Transformed GP}
The works \citep{lin2014bayesian,andersen2018non} designed methods for imposing shape constraints on functions through squared Gaussian process derivatives and basis expansions. 
In particular, they approximate the kernel with a  basis expansion 
$k(x,x')\approx Cov(h(x),h(x'))$ with $h(x)=\sum_{i=1}^m \beta_i \phi_i(x)$ where $\beta_i$ are independent Gaussian distributed variables and $\phi_i$ are basis functions derived from the eigenfunctions of the  Laplace operator. Then they build a monotonic function as 
\begin{equation}
\label{eq:quadratic}
  h^+(x)=\int_{-\infty}^x \left(\sum_{i=1}^m \beta_i \phi_i(z)\right)^2dz, 
\end{equation}
 which is equal to $h^+(x)=\sum_{i=1}^m \sum_{j=1}^m \beta_i \beta_j \int_{-\infty}^x \phi_i(z)\phi_j(z)dz$. Note  the nonlinearity introduced by the multiplication between the coefficients $\beta_i$. This  breaks the connection with SkewGP and, therefore, the conjugacy with normal and probit-affine likelihoods.
 Moreover, the  basis function $\int_{-\infty}^x \phi_i(z)\phi_j(z)dz$ loses interpretability. 
  We will show next that we can build on the same ideas proposed in \citep{lin2014bayesian,andersen2018non}, while preserving linearity and interpretability. 

First, we note that $M_i(x)$ in \eqref{eq:mspline} is an unnormalised triangular distribution and, therefore, nonnegative. We can integrate it over $[t_{i-1},x]$, for $x \in [t_{i-1},t_{i+1}]$, to get a monotone-increasing function (an unnormalised CDF) $\mathcal{I}_i(x)=  \int_{t_{i-1}}^{x}M_i(z) dz$:
\begin{equation}
  \mathcal{I}_i(x)=  \left\{\begin{array}{ll}
\frac{(x-t_{i-1})^2}{2(t_{i}-t_{i-1})} & t_{i-1} \leq x \leq t_{i},\\
\frac{t_{i+1}-t_{i-1}}{2}-\frac{(t_{i+1}-x)^2}{2(t_{i+1}-t_{i})} & t_{i} \leq x \leq t_{i+1}.\\
\end{array}
\right.
\end{equation}
These are so-called I-splines \citep{ramsay1988monotone}. Note the quadratic polynomials which play a similar role to the quadratic transformation in \eqref{eq:quadratic}. The difference is that we do not transform the coefficients $\beta_i$, i.e., we still consider $f(x)=\sum_{i=1}^l \beta_i  \mathcal{I}_i(x)$ thus preserving linearity. Therefore, we  define the  kernel
\begin{equation}
\label{eq:kernelIntMSP}
    k(x,x')=\sum_{i=1}^l\sum_{j=1}^l \check{k}(t_i,t_j) \mathcal{I}_i(x) \mathcal{I}_j(x').
\end{equation}
\begin{theorem}
\label{th:Ispline}
Consider $l$ operational points $[u_1,\dots,u_{l}]$ defined as $u_i=t_{i}$, then the SkewGP obtained from Theorem  \ref{th:linconstr} with $L=diag([I_{r},{\bf 0}_r])$ and kernel defined as in \eqref{eq:kernelIntMSP} is monotone increasing in $[t_1,t_l]$. \vspace{-0.35cm}
\end{theorem}

It is worth noticing that in this case we are imposing the monotonicity constraint through $[{ f}(\bu_1),
\dots, { f}(\bu_r)] >0$ which does not involve the derivatives. This is due to the choice of the I-spline basis function. 
This approach can also be applied to the multivariate case  by using the same techniques discussed at the end of  the previous section. 

\section{Desirability as classification}
\label{sec:Desirability}
 In desirability theory \citep{walley1991statistical,quaeghebeur2015accept,augustin2014introduction}, decision making under uncertainty can be viewed as
a choice between gambles. Formally, a gamble is a real-valued function on the possibility space: it represents a positive or negative pay-off that is uncertain in the sense that it depends on the unknown outcome. For instance, consider a simple coin toss, where the possible outcomes are Heads (H) and Tails (T). We can represent a gamble, $g$, as a two-dimensional vector, i.e. $g=[1, -2]$. This means you win $1$ unit if it lands on Heads and lose $2$ units if it lands on Tails. By choosing to accept or reject such gambles, a subject reveals their beliefs about the outcomes of the uncertain event. Consider buying a call-put option in finance as a tangible example of accepting a gamble.

Assume a subject has accepted  the gambles 
$\mathcal{A}=\{g_1=[1,0],g_2=[0,1]\}$ and rejected $\mathcal{R}=\{g_3=[-1,2],g_4=[2,-1],g_5=[-0.5,3.5],g_6=[3.5,-0.5]\}$, are they willing to accept the gamble $g_7=[-1,0]$?

This prediction task can be cast as a classification problem where we aim to predict the subject's acceptance (class 1) or rejection (class 0) of the gamble $g_7$. Here, consistency (rationality of the gambler) means that if the subject accepts the gamble $g=[g_a,g_b]$ they should also accept any gamble $g+h$ where $h>0$ element-wise.   Similarly, a subject should always reject gambles $g\leq0$, because they are not favourable. These additional consistency constraints can be satisfied by finding a monotonic classifier that separates the augmented sets 
$\mathcal{A}'=\mathcal{A} \cup \{[\epsilon,\epsilon]\}$ and $\mathcal{R}'=\mathcal{R} \cup \{[0,0]\}$ for some small $\epsilon>0$. In linear desirability theory, we consider linear classifiers. It is well-known  that linearity \citep{zaffalon2021desirability,miranda2023nonlinear,pmlr-v215-de-bock23a} is a strong assumption, being violated for instance in domains with budget constraints, problems with lack of liquidity, wealth effects and risk-aversion \cite{nau1992indeterminate,pelessoni2005uncertainty, pelessoni2016,wheeler21a}. We can then consider a more general nonlinear classifier and learn the subject's behaviour.
Figure \ref{fig:desirability} shows in blue the region classified as 1 (accepted) for two nonlinear classifiers (we used the MSP kernel). It can be noted that the left one violates consistency: it implies the subject would accept  negative gambles (third orthant) and reject  positive gambles (first orthant). The right figure shows the accepted region after imposing monotonicity, which now satisfies consistency.\footnote{Technically, consistency holds within an error margin of $\epsilon$. SkewGPs are continuous models. In the example, this means that around the origin $(0,0)$ the classifier may exclude some positive gambles, like $(\epsilon/2,\epsilon/2)$ for instance.}
Note that, while this example employs the MSP kernel with an additive combination across the dimensions of the gamble, a product kernel could  be used to capture interactions between the two dimensions.

\begin{figure}[htp]
	\centering
 \setlength{\tabcolsep}{1pt}
	\begin{tabular}{cc}
		\includegraphics[width=.48\linewidth,trim={1.33cm 0.05cm 1.5cm 0},clip]{figs/desirability_unconsr.pdf} &
\includegraphics[width=.48\linewidth,trim={1.25cm 0.05cm 1.5cm 0},clip]{figs/desirability_consr.pdf} \\
		\small SkewGP unconstr.,   & \small SkewGP constr.
	\end{tabular}
	\caption{Blue region:   set of desirable gambles implied by an unconstrained classifier (left) versus a constrained classifier (right). The blue circles are the gambles in $\mathcal{A}$ and the red triangles  those in $\mathcal{R}$.}
	\label{fig:desirability}
\end{figure}


\section{Numerical results}
\label{sec:experiments}
We assess the performance of our SkewGP formulation of monotonic constrained GP in simulated and real datasets. We will use the M-spline kernel defined in Section \ref{sec:Mspline}. Hyperparameters estimation is discussed in Appendix~\ref{sec:hyp}. Appendix~\ref{sec:predictive_posterior} provides an algorithmic description of the predictive posterior computation.

\vspace{-0.2cm}
\subsection{1D monotonic benchmark functions}
To assess the performance of our SkewGP formulation for both regression and preference learning tasks, we leverage 7 established benchmark functions from prior works \citep{lin2014bayesian,maatoukhal-04084865,shively2009bayesian,ustyuzhaninov2020monotonic}, reported in table~\ref{tab:benchFuns}. As our method includes  several previous approaches for imposing monotonicity (details in Section \ref{sec:monGP}), we only compare it with monotonic-GP-flow (MF, \citep{ustyuzhaninov2020monotonic}), which uses the numerical solution of a particular stochastic differential equation to impose monotonicity. Notably, we extend their model to preference learning by employing a probit likelihood.

\begin{table}
    \centering
    \footnotesize
    \caption{1-D monotonic benchmark functions.}\label{tab:benchFuns}
    \setlength{\tabcolsep}{3pt}
    \begin{tabular}{l l}
    \toprule
    $g_1(x)=0.32(10x+\sin(10x))$ &  $g_2(x)=3(x<0.8)+6(x\geq 0.8)$ \\
    $g_3(x)=3x$ & $g_4(x)= 0.15e^{6x-3}$ \\
    $g_5(x)= 3/(1+e^{-20x+10})$ & $g_6(x)=5 x^2$ \\ $g_7(x)=10(x+1)$ & \\
    \bottomrule
    \end{tabular}
\end{table}


% $g_1(x)=0.32(10x+\sin(10x))$, $g_2(x)=3(x<0.8)+6(x\geq 0.8)$ , $g_3(x)=3x$ , $g_4(x)= 0.15e^{6x-3}$, $g_5(x)= 3/(1+e^{-20x+10})$, $g_6(x)=5 x^2$, $g_7(x)=10(x+1)$.

For the regression task, the training data is generated by evaluating these functions at $100$ randomly generated points in $[0,1]$ and adding independent Gaussian noise with variance $\sigma^2$ calculated using SNR=$\{10,30\}$, that is: $\sigma^2 = \text{signal variance} / SNR$. We generated $400$ testing data from each $g_i$ to evaluate the performance of the models using the root-mean-square-error
(RMSE) and the continuous-ranked-probability-score (CRPS) for the evaluation of probabilistic predictions. Table~\ref{tab:benchFunsResultsReg} shows the performance on test data evaluated with CRPS (lower is better) in the case SNR=10. SkewGPc denotes a monotonically constrained model while SkewGPu is the  unconstrained one. \\


\begin{table}[h]
    \centering
    \footnotesize
    \caption{Results on regression task (CRPS).}\label{tab:benchFunsResultsReg}
\setlength{\tabcolsep}{3pt}
\begin{tabular}{lccc}
\toprule
% & \multicolumn{3}{c}{CRPS regression} \\
fun &  MF & SkewGPu & SkewGPc  \\
\midrule
$g_1$ & 0.36$\pm$0.14 & 0.25$\pm$ 0.03 & \bf 0.15$\pm$ 0.01 \\
$g_2$ & 0.82$\pm$0.12 & 0.74$\pm$ 0.14 & \bf 0.54$\pm$0.03 \\
$g_3$ & 0.28$\pm$0.17 & 0.19$\pm$ 0.02 & \bf 0.1$\pm$0.01 \\
$g_4$ & 0.44$\pm$0.11 & 0.19$\pm$ 0.02 & \bf 0.20$\pm$0.01 \\
$g_5$ & 0.62$\pm$0.16 & 0.29$\pm$ 0.03 & \bf 0.30$\pm$0.02 \\
$g_6$ & 0.56$\pm$0.31 & 0.31$\pm$ 0.04 & \bf 0.21$\pm$0.02 \\
$g_7$ & 1.80$\pm$0.44 & 1.96$\pm$ 0.37 & \bf 0.74$\pm$ 0.21 \\
\bottomrule
\end{tabular}
\end{table}

For the preference task, the training data is generated by evaluating these functions at $50$ randomly generated points in $[0,1]$ and then generating preferences as $x_i \succ x_j$ if $g_l(x_i)+\epsilon_i >g_l(x_j)+\epsilon_j$ where $\epsilon_i,\epsilon_j$ are independent Gaussian noises with the same variance as in the regression task. We generated $100$ pairwise comparisons between randomly selected $x_i$ in the training data. We also generated an additional $400$ pairwise comparisons for testing and used the logarithmic-score (LogP) for the evaluation of probabilistic predictions. The definitions of CRPS and LogP are provided in Appendix \ref{app:additional1d} together with additional details about the numerical experiments.  Table~\ref{tab:benchFunsResultsPref} shows the performances evaluated with LogP (higher is better) for the preference learning task with SNR=10. SkewGPc denotes a monotonically constrained model, SkewGPu the unconstrained one. \\


%The following table reports the CRPS for regression (lower is better) and LogP (higher is better) for preference learning  for SNR=10, where SkewGPc denotes a monotonically constrained model and SkewGPu the  unconstrained one.

% \begin{table}
%     \centering
%     \footnotesize
%     \caption{Results on monotonic benchmark functions.}\label{tab:benchFunsResults}
% {\scriptsize
% \setlength{\tabcolsep}{3pt}
% \begin{tabular}{l|ccc|ccc|}
%  & \multicolumn{3}{c|}{CRPS regression} & \multicolumn{3}{c|}{LogP preference}\\
% fun &  MF & SkewGPu & SkewGPc  &  MF & SkewGPu & SkewGPc\\
% \hline
% $g_1$ & 0.36$\pm$0.14  & 0.25$\pm$ 0.03  & \bf 0.15$\pm$ 0.01  & -0.50$\pm$ 0.02 & -1.04$\pm$ 0.50 &  -0.48$\pm$ 0.09 \\
% $g_2$ &  0.82$\pm$0.12 & 0.74$\pm$ 0.14  & \bf 0.54$\pm$0.03  & -0.63$\pm$ 0.04 & -0.88$\pm$ 0.11 & -0.62$\pm$ 0.05\\
% $g_3$ &  0.28$\pm$0.17 & 0.19$\pm$ 0.02  & \bf 0.1$\pm$0.01  & -0.50$\pm$ 0.03  & -1.04$\pm$ 0.50 & \bf -0.45$\pm$ 0.03\\
% $g_4$ & 0.44$\pm$0.11  & 0.19$\pm$ 0.02  & \bf 0.20$\pm$0.01  &  -0.52$\pm$ 0.02 & -1.13$\pm$ 0.06  & \bf -0.44$\pm$ 0.04  \\
% $g_5$ &  0.62$\pm$0.16 & 0.29$\pm$ 0.03  & \bf 0.30$\pm$0.02  & -0.5$\pm$ 0.02  & -0.92$\pm$ 0.52 & \bf -0.40$\pm$ 0.03 \\
% $g_6$ & 0.56$\pm$0.31  & 0.31$\pm$ 0.04  & \bf 0.21$\pm$0.02   & -0.47$\pm$ 0.03 & -0.84$\pm$ 0.37 & \bf -0.38$\pm$ 0.08\\
% $g_7$ & 1.80$\pm$0.44  & 1.96$\pm$ 0.37  & \bf 0.74$\pm$ 0.21  & -0.62$\pm$ 0.02 & -0.96$\pm$ 0.20 & -0.63$\pm$ 0.03 \\
% \end{tabular}}
% \end{table}



\begin{table}[h]
    \centering
    \footnotesize
    \caption{Results on preference task (LogP).}\label{tab:benchFunsResultsPref}
    \setlength{\tabcolsep}{3pt}
    \begin{tabular}{lccc}
    \toprule
%        & \multicolumn{3}{c}{LogP preference}\\
    fun &  MF & SkewGPu & SkewGPc \\
    \midrule
    $g_1$ & -0.50$\pm$ 0.02 & -1.04$\pm$ 0.50 &  -0.48$\pm$ 0.09 \\
    $g_2$ & -0.63$\pm$ 0.04 & -0.88$\pm$ 0.11 & -0.62$\pm$ 0.05\\
    $g_3$ & -0.50$\pm$ 0.03  & -1.04$\pm$ 0.50 & \bf -0.45$\pm$ 0.03\\
    $g_4$ & -0.52$\pm$ 0.02 & -1.13$\pm$ 0.06  & \bf -0.44$\pm$ 0.04  \\
    $g_5$ & -0.50$\pm$ 0.02  & -0.92$\pm$ 0.52 & \bf -0.40$\pm$ 0.03 \\
    $g_6$ & -0.47$\pm$ 0.03 & -0.84$\pm$ 0.37 & \bf -0.38$\pm$ 0.08\\
    $g_7$ & -0.62$\pm$ 0.02 & -0.96$\pm$ 0.20 & -0.63$\pm$ 0.03 \\
    \bottomrule
    \end{tabular}
\end{table}

In both regression and preference learning, it can be noticed that  SkewGPc outperforms MF in probabilistic predictions. This is not fully surprising, because of the conjugacy of SkewGPs with both the normal and probit-affine likelihood. In Appendix \ref{app:additional1d}, we reported the timings for the algorithms and the  results for SNR=30 and the RMSE.

\vspace{-0.2cm}
\subsection{Swiss Route Choice Data}
In stated preference surveys, participants choose between options with trade-offs (like cost, time, or reliability), revealing their preferences in hypothetical scenarios. This approach is widely used in transportation for understanding how people value different features. We consider a dataset that includes the choices made by subjects regarding their preferred railway connections/routes in Switzerland.  Each scenario includes two alternatives described in terms of \textit{travel time} (tt), \textit{cost} (tc), \textit{headway} (hw) and \textit{number of interchanges} (ch) \citep{vrtic2002impact}. 
 There are also  subject specific variables: \textit{household income}, \textit{car-availability} (binary) and \textit{purpose of the trip} (commute, shopping, business, leisure). Table \ref{table:swiss} in Appendix shows a subset of the dataset.  An example of a scenario where the subjects were asked to state their preference is:
$$
\begin{array}{rl}
 Option1:& tt=14,~tc=3,~hw=15,~ch=0,\\
  Option2:& tt=15,~tc=4,~hw=15,~ch=0.\\
\end{array}
$$
It is clear that Option1 should be preferable to Option2. The dataset includes  3,492 pairwise preferences expressed by 388 individuals. 
In this type of analysis, it is common to learn a preference model for each group. For instance, hereafter we focus on  commuters with car availability and compare an unconstrained SkewGP versus a constrained SkewGP, where we impose monotonicity (less is better) on all the covariates.
We used 10-fold CV to compare the two models and we assessed the LogP score. \\


\begin{table}[h]
    \centering
    \footnotesize
    \caption{Swiss route choice data (LogP).}\label{tab:SwissRouteChoice}
%{\centering
%{\footnotesize
\setlength{\tabcolsep}{3pt}
\begin{tabular}{lcc}
\toprule
LogP &   SkewGPu & SkewGPc  \\
\midrule
other-options & $-0.53$ & $-0.53$\\
monotone-options &  $-0.30$ & $-0.15$ \\
\bottomrule
\end{tabular}%}\par}
\end{table}


Focusing on the options where one option is monotonically better than the other, SkewGPu achieves a worse average LogP value of $-0.30$ compared to $-0.15$ for SkewGPc, as shown in table~\ref{tab:SwissRouteChoice}. This is due to the uncertainty as shown in Figure \ref{fig:swsspair} for the two options above. When one option is monotonically better than the other, SkewGPu often exhibits high uncertainty, predicting a utility difference near zero. In contrast, SkewGPc  predicts the correct preference  with high probability (the utility of the monotonically better option is always higher).
As expected, the two models perform similarly for pairwise comparisons that are not monotonically dominated (LogP around $-0.53$).



 

\begin{figure}[htp]
	\centering
 \setlength{\tabcolsep}{1pt}
	\begin{tabular}{cc}
		\includegraphics[width=.48\linewidth,trim={1.cm 0.05cm 1.5cm 0},clip]{figs/swiss_unconstr.pdf} &
\includegraphics[width=.48\linewidth,trim={1.cm 0.05cm 1.5cm 0},clip]{figs/swiss_constr.pdf}  \\
		\small SkewGP unconstr.,   & \small SkewGP constr.
	\end{tabular} 
	\caption{Posterior distribution of the utility difference between the two options. A positive difference denotes the correct  prediction.}
	\label{fig:swsspair}
\end{figure} 

\subsection{Risky choice dataset}
\label{sec:risky}
Understanding and predicting human decision-making becomes increasingly crucial as automated systems interact more closely with people. Building on this need, \citep{peterson2021using} collected a large dataset (10,000 preference-pairs) of human decisions. Each problem involved choosing between two gambles with distinct payoff-probability combinations.
$$
\begin{array}{rl}
 Option1:& g_1=[26, -1],~p_1=[0.95, 0.05],\\
  Option2:& g_2=[21, ~23],~p_2=[0.95, 0.05].\\
\end{array}
$$
For this choice-problem, 10 out of 15 subjects (67\%) chose  Option2. Expected Utility (EU) theory dictates that Option1 is preferable to Option2 because $26 \cdot 0.95-1\cdot 0.05=24.65$ is higher than $21 \cdot0.95+23\cdot 0.05=21.1$. However, in Option2, we never lose money, so it is preferable in the worst case. There are other aspects to consider such as the way the payouts $g_{ij}$ are viewed  by the decision-makers, if they use linearity when combining payoffs and probabilities, if they evaluate each gamble separately or jointly. However, also in this case, assuming monotonicity on $g$ seems to be reasonable:  for instance, Option3  $g=[27, -0.5],~p=[0.95, 0.05]$ should be preferable to Option1. Note that, this choice problem is related to desirability discussed in Section \ref{sec:Desirability} - in desirability the probabilities are not given explicitly. We will use the dataset to learn a model to predict preferences for options by using 
$g_1,p_1,g_2,p_2,g_1\cdot p_1,g_2 \cdot p_2$ as covariates. This will allow us to understand in which way human choices deviate from EU theory. We will compare SkewGPu versus SkewGPc to understand the effect of monotonicity. 
We used 10-fold CV and assessed the LogP score; the results are shown in Table~\ref{tab:RiskyChoiceSkew}. We can reach similar conclusions to the ones for the Swiss route data: SkewGPc provides better estimates of the probability of the preference for monotone options. \\

\begin{table}[h]
    \centering
    \footnotesize
    \caption{Risky choice data (LogP).}\label{tab:RiskyChoiceSkew}
%{\centering
%{\footnotesize
\setlength{\tabcolsep}{3pt}
\begin{tabular}{lcc}
\toprule
LogP &   SkewGPu & SkewGPc  \\
\midrule
other-options & $-0.31$ & $-0.31$\\
monotone-options &  $-0.17$ & $-0.13$ \\
\bottomrule
\end{tabular}%}\par}
\end{table}

Table~\ref{tab:RiskyChoiceEU} shows that SkewGPc  outperforms the EU model in terms of accuracy. This suggests that the underlying preferences in the dataset deviate from the linear assumptions of the EU model, and a nonlinear model like SkewGPc is more appropriate for capturing these preferences. \\


\begin{table}[h]
    \centering
    \footnotesize
    \caption{Risky choice data, comparison with EU (accuracy).}\label{tab:RiskyChoiceEU}
%{\centering
%{\footnotesize
\setlength{\tabcolsep}{3pt}
\begin{tabular}{lcc}
\toprule
Accuracy &   EU & SkewGPc  \\
\midrule
other-options & $0.75$ & $0.83$\\
monotone-options &  $0.97$ & $0.99$ \\
\bottomrule
\end{tabular}%}\par}
\end{table}

\section{Conclusions}
We derived a unified framework for linearly constrained Gaussian Processes (GPs) by using Skew GPs which includes regression, classification, and preference learning. Our unified framework demonstrated strong performance in both preference learning and modelling human decision-making. As future work, we aim to apply this approach to active learning and Bayesian optimisation, while including a larger class of linear constraints, beyond monotonicity. For human-decision making under risk, we plan to derive application-specific basis functions and kernels and impose constraints that are usually assumed in decision making, such as both monotonicity and convexity.

 


\begin{acknowledgements} % will be removed in pdf for initial .
For the first author, this publication has emanated from research conducted with the financial support
of the EU Commission Recovery and Resilience Facility under the Science Foundation Ireland Future Digital Challenge Grant Number 22/NCF/FD/10827. The second author acknowledges support
from the SNSF grant number 200021\_212164.
\end{acknowledgements}

% References
\bibliography{biblio}

\newpage

\onecolumn

\title{Linearly Constrained Gaussian Processes are SkewGPs: application to Monotonic Preference Learning and Desirability\\(Supplementary Material)}
\maketitle

\appendix

\section{Gaussian Processes}
\label{app:gp}
Gaussian Processes (GPs) are priors over functions \citep{o1978curve,rasmussen2006gaussian} that have attractive advantages over parametric models (including neural networks).\footnote{GPs can be seen as single-layer neural networks  with an infinite number of hidden units \citep{williams1996computing}.} They have a small number of tunable hyperparameters (and so they can be trained on small datasets), and give a measure of prediction uncertainty. Moreover, by being kernel based, they provide a framework to learn utility functions defined on any domain $\mathcal{X}$ on which we can define a kernel function.

To define a prior over a function $f: \mathcal{X} \rightarrow \mathbb{R}$, a GP assumes that, for every $n$, $p(f({\bf x}_1),\dots, f({\bf x}_n))$ is jointly Gaussian, with mean  $[\mu({\bf x}_1),\dots, \mu({\bf x}_n)]$ and covariance  $Cov(f({\bf x}_i),f({\bf x}_j)) = k({\bf x}_i,{\bf x}_j)$, for $i,j =1, \ldots, n$.
The functions $\mu({\bf x})$ and $k({\bf x},{\bf x}')$ are the mean function and, respectively, the (positive definite) kernel function of the GP.  A GP is usually parameterised with a zero mean function $\mu({\bf x})=0$ and a covariance kernel $k_{\boldsymbol{\theta}}({\bf x},{\bf x}')$ which depends on hyperparameters  $\boldsymbol{\theta} \in \Theta$. A typical example is the automatic relevance determination (ARD) squared-exponential kernel on $\mathbb{R}^c$, $c \in \mathbb{N}$. For ${\bf x},{\bf x}' \in \mathbb{R}^c$ it is defined as
\begin{equation}
\label{eq:RBF}
k_{\boldsymbol{\theta}}({\bf x},{\bf x}')= \displaystyle{\sigma_0^2 \,\exp\left(-\sum_{i=1}^c\frac{ (x_i-x_i')^2}{2 \ell_i^2}\right)},
\end{equation}
where $\boldsymbol{\theta}=[\ell_1,\dots,\ell_c,\sigma_0^2]$ includes the lengthscale hyperparameters $\ell_i$ (one for each dimension) and the scale parameter $\sigma_0^2$.
GPs have a natural Bayesian interpretation that makes them ideal for regression problems. If we assume that the observed values are the sum of a true function evaluated at some inputs plus Gaussian noise, i.e. $y_i = f({\bf x}_i) + \varepsilon_i$ with $\varepsilon_i\sim N(0, \sigma^2)$ for $i=1, \ldots,n$, then we can analytically compute the posterior distribution of $f$. We can write the observation model more compactly as  the likelihood
$$p(y_1, \ldots, y_n | f({\bf x}_1), \ldots, f({\bf x}_n))=N(\mathbf{y}_n| {\bf f}(X), \sigma^2I_n),$$
where $\mathbf{y}_n=[y_1,\dots,y_n]^\top$,  $X=[{\bf x}_1,\dots,{\bf x}_n]^\top$ and $I_n$ is the identity matrix of dimension $n$. In particular, the predictive posterior at a new test point ${\bf x}^*\in \mathcal{X}$ is $GP(\mu_p,k_p)$, with mean and covariance kernel given by:

\begin{align}
    \mu_p({\bf x}^*) &= K_{\boldsymbol{\theta}}({\bf x}^*,X) (K_{\boldsymbol{\theta}}(X,X)+\sigma^2I_n)^{-1}\mathbf{y}_n \\
    k_p({\bf x}^*, {\bf x}^*) &= K_{\boldsymbol{\theta}}({\bf x}^*,{\bf x}^*) - K_{\boldsymbol{\theta}}({\bf x}^*,X)  (K_{\boldsymbol{\theta}}(X,X)+\sigma^2I_n)^{-1}K_{\boldsymbol{\theta}}(X,{\bf x}^*),
\end{align}
where $K_{\boldsymbol{\theta}}(X,X)$ is a matrix whose $ij$-th element is defined as $(K_{\boldsymbol{\theta}}(X,X))_{ij}=k_{\boldsymbol{\theta}}({\bf x}_i,{\bf x}_j)$ (similarly for $K_{\boldsymbol{\theta}}({\bf x}^*,X)$). Note that the variance of the likelihood $\sigma^2$ is also considered to be a hyperparameter. The hyperparameters $\boldsymbol{\theta},\sigma^2$ are commonly estimated by maximising the marginal likelihood:
$$
p(\mathbf{y}_n|X,\boldsymbol{\theta},\sigma^2)= N(\mathbf{y}_n| \mathbf{0}, K_{\boldsymbol{\theta}}(X,X)+\sigma^2 I_n).
$$

In tasks with likelihoods different from the Gaussian, the posterior is not a GP. For  Probit  (classification/preference learning) and Skew-Normal   likelihoods, the posterior is a Skew GP \citep{Benavoli2021}. For other likelihoods, in general the posterior does not have a closed-form and  is approximated with a GP using three main approaches: (i) Laplace Approximation (LP) \citep{mackay1996bayesian,williams1998bayesian}; (ii) Expectation Propagation (EP) \citep{minka2001family}; (iii) Kullback-Leibler divergence (KL) minimization \citep{opper2009variational}, comprising Variational Bounding (VB) \citep{gibbs2000variational} as a particular case. 


\section{Proofs}
\label{app:proofs}
The proofs are straightforward.

\paragraph{Theorem \ref{th:linconstr}}
By denoting with ${\bf z}_1=f(\bx)$ and ${\bf z}_0=[{\bf f}(\bu_1),\dots, {\bf f}(\bu_r),
{\bf f}'(\bu_1), \dots, {\bf f}'(\bu_r)]^\top$, we can rewrite the constraint \eqref{eq:constr} as $L{\bf z}_0+\bgamma>0$. Therefore, 
the distribution of ${\bf z}_1+\bxi(\bx)=f(\bx)+\bxi(\bx)$ conditioned on $L{\bf z}_0+\bgamma>0$ is a SkewGP, as derived in Section~\ref{sec: Background}. We just need to consider a change of variables to take into account the matrix $L$.

\paragraph{Theorem \ref{th:mspline}}
Consider $ f(x)=\sum_{i=1}^l \beta_i M_i(x)$ and $x' \in (t_{i-1},t_{i})$ and observe that 
$f(x')=\beta_{i-1} (t_{i}-x')/(t_{i}-t_{i-1})+\beta_i (x'-t_{i-1})/(t_i-t_{i-1})$. Therefore, we have that
$$
\frac{d}{dx} f(u_i)= \frac{\beta_i-\beta_{i-1}}{t_{i}-t_{i-1}}.
$$
Since $t_i>t_{i-1}$, the condition $\tfrac{d}{dx} f(u_i)>0$ implies that $\beta_i-\beta_{i-1}>0$ for $i=1,\dots,l$, which is equivalent to the constraint
\citep[Eq.\ (7)]{lopez2022high}.

\paragraph{Theorem \ref{th:Ispline}}
Consider $ f(x)=\sum_{i=1}^l \beta_i \mathcal{I}_i(x)$ and $x' \in (t_{i},t_{i+1})$ and observe that 
$$
\begin{aligned}
f(x')&=\beta_{i} \left(\frac{t_{i+2}-t_{i}}{2}-\frac{(t_{i+1}-x')^2}{2(t_{i+1}-t_{i})}\right)\\
&+\beta_{i+1} \frac{(x'-t_{i})^2}{2(t_{i+1}-t_{i})}
\end{aligned}
$$
Therefore, we have that $f(u_i)= \beta_{i-1} \left(\frac{t_{i+2}-t_{i}}{2}-\frac{t_{i+1}-t_{i}}{2}\right)$. Therefore, we have that $ f(u_i)>0$ implies that $\beta_{i}>0$ for $i=1,\dots,l$ which ensures monotonicity.


\section{Hyperparameters' estimation}
\label{sec:hyp}
We use the implementation of Variational Inference in \textit{GPytorch} \citep{gardner2018gpytorch} to estimate the kernel hyperparameters. This is based on  \citep{hensman2015scalable} although in our case the inducing points are equal to the set of the covariates $X$  plus the operational points (we perform a full variational inference). 
We apply the variational inference considering as prior the Multivariate Normal in \eqref{eq:jointnormal} and we include the constraint
$L\tilde{{\bf f}} + \bgamma >0$ with
\begin{align}
\tilde{{\bf f}}=\left[\begin{smallmatrix}
{\bf f}(\bu_1)\\
\vdots \\
{\bf f}(\bu_r)\\
{\bf f}'(\bu_1)\\
\vdots \\
{\bf f}'(\bu_r)\\
%{\bf f}''(\bu_1)\\
%\vdots \\
%{\bf f}''(\bu_r)
\end{smallmatrix}\right],
\end{align}
into the likelihood through a probit
$\Phi(\frac{1}{\tau}(L\tilde{{\bf f}}+\bgamma))$ so as to make the gradient continuous. Here, $\tau$ is a constant. It is well known that for $\tau \rightarrow 0$ the Gaussian CDF converges to an indicator function for its argument being positive, that is $L\tilde{{\bf f}}+\bgamma>0$. Therefore, we initially choose $\tau=10^{-3}$ and we decrease it during the maximisation of the ELBO in order to get even closer to the indicator function (from $\tau=10^{-3}$ down to $\tau=10^{-6}$). Note that we use this approximation of the constraint only for estimating the kernel hyperparameters. The samples from the posterior are computed through the SkewGP derivations in Proposition~\ref{prop:postmixed} and Theorem~\ref{th:linconstr}.

We fix the operational points for the SE kernel and knots for the MSP kernel to $n$ percentiles of the data and we do not change them during hyperparameter optimisation. Approaches to optimally placing the operational points have been discussed in previous literature~\cite{riihimaki2010gaussian,wang2016estimating,agrell2019,da2020gaussian,golchi2015monotone,maatouk2017gaussian,lopez2018finite,lopez2022high,maatoukhal-04084865}.


\section{Sample from the constrained predictive posterior}
\label{sec:predictive_posterior}

Algorithm~\ref{algo:predictivePosterior} details how samples from the predictive posterior of SkewGPc are obtained. Note that the posterior parameters (lines 1 and 2) and the truncated normal sampling (line 3) are computed once and for all, as they do not depend on ${\bf x^*}$. The steps at lines 4--5 consist of sampling from a multivariate Gaussian and matrix-vector multiplications, which are fast operations.
 
\begin{algorithm}
\caption{Predictive posterior for SkewGPc \label{algo:predictivePosterior}}
\KwData{$k$ kernel function, $U \in \mathbb{R}^{r \times D}$ matrix of operational points, $L$ matrix specifying monotonicity constraints, $W \in \mathbb{R}^{m_a \times n}$ matrix of preference data, $m$ number of posterior samples, ${\bf x}^*$ new input.}
%\KwResult{}
Compute the prior constrained parameters $\Gamma$, $\Delta({\bf x})$ by using eqs.~\eqref{eq:gammaConstr} and \eqref{eq:deltaConstr}\;
Compute the posterior functions and parameters $\tilde{\bxi}({\bf x})$, $\tilde{\Omega}({\bf x}, {\bf x^\prime})$, $\tilde{\Delta}({\bf x})$, $\tilde{\bgamma}$, $\tilde{\Gamma}$ as in Prop.~\ref{prop:postmixed} \;
Sample ${\bf r_{1,-\tilde{\bgamma}}^*}$ from the multivariate normal $N(0, \tilde{\Gamma})$ truncated below $\tilde{\bgamma}$ by using \emph{lin-ess} \;
At the predictive input point ${\bf x}^*$, sample ${\bf r_0^*}$ from the multivariate normal $N(0, \overline{\tilde{\Omega}}({\bf x^*},{\bf x^*})- \tilde{\Delta}({\bf x^*})\tilde{\Gamma}^{-1}\tilde{\Delta}({\bf x^*})^T)$ \;
Compute ${\bf \tilde{z}} = \tilde{\bxi}({\bf x^*}) + {\bf r_0^*}({\bf x^*}) + \tilde{\Delta}({\bf x^*}) \tilde{\Gamma}^{-1} {\bf r_{1,-\tilde{\bgamma}}^*}$ \;
% \While{$b \neq 0$}{
%     $r \leftarrow a \mod b$\;
%     $a \leftarrow b$\;
%     $b \leftarrow r$\;
% }
\Return{Samples ${\bf \tilde{z}}$}\;
\end{algorithm}

\section{Numerical results}
\label{sec:additional}

\subsection{1D simulations}
\label{app:additional1d}
The logarithmic score is used to evaluate probabilistic prediction for binary observations.
Consider a variable $y$ with possible values 1 or 0, denote the probability of $y=1$ with $p$, then one can write the logarithmic scoring rule as 
$LogP(p)=y \ln(p) + (1 - y) \ln(1 - p)$. Since we are comparing Bayesian methods, we computed the average logarithmic score  by averaging over the $S$ samples from the posterior
$$
LogP(y,p_1,\dots,p_S)=\frac{1}{S}\sum_{i=1}^S \left[ y \ln(p_i) + (1 - y) \ln(1 - p_i)\right].
$$
The continuous ranked probability score (CRPS) is a strictly proper scoring rule widely used to assess probabilistic predictions for continuous variables. It is defined as
$$
{\displaystyle CRPS(F,y)=\int _{\mathbb {R} }(F(x)-H(x\geq y))^{2}dx}
$$
where 
${\displaystyle F}$ is the predicted cumulative distribution function,  ${\displaystyle H}$ is the Heaviside step function and ${\displaystyle y\in \mathbb {R} }$ is the observation. We computed the CRPS using the empirical CDF computed from the posterior samples and we used as $y$ the true value of the function (and not the noisy one).

For both monotonic-flow and SkewGP we used 20 inducing points and, respectively, knots. We initialised them with the percentiles of the data.  For monotonic-flow, we used the SE kernel and $T=1$. For both models, in regression, we standardised the $y$s before `training'.



Table~\ref{tab:benchFunSNR30} reports the results with SNR=30 for the regression task (CRPS, lower is better) and the preference task (LogP, higher is better).

\begin{table}[h!]
    \centering
    %\footnotesize
    \caption{1-D benchmark functions results with SNR=30.}\label{tab:benchFunSNR30}
%\begin{center}
%{
%{
%\centering
\setlength{\tabcolsep}{3pt}
\begin{tabular}{lccc|ccc}
\toprule
 & \multicolumn{3}{c|}{CRPS regression} & \multicolumn{3}{c}{LogP preference}\\
fun &  MF & SkewGPu & SkewGPc  &  MF & SkewGPu & SkewGPc\\
\midrule
$g_1$ & 0.59$\pm$0.04  & 0.19$\pm$ 0.01  & \bf 0.21$\pm$ 0.06  & -0.46$\pm$ 0.04 & -0.82$\pm$ 0.40 & \bf -0.34$\pm$ 0.03 \\
$g_2$ &  0.89$\pm$0.08 & 0.58$\pm$ 0.10  & \bf 0.53$\pm$0.06  & -0.62$\pm$ 0.04 & -0.88$\pm$ 0.21 & -0.58$\pm$ 0.08\\
$g_3$ &  0.50$\pm$0.05 & 0.14$\pm$ 0.01  & \bf 0.16$\pm$0.05  & -0.44$\pm$ 0.04  & -0.91$\pm$ 0.40 & \bf -0.33$\pm$ 0.05\\
$g_4$ & 0.50$\pm$0.05  & 0.18$\pm$ 0.01  & \bf 0.20$\pm$0.03  &  -0.49$\pm$ 0.03 & -0.95$\pm$ 0.46  & \bf -0.40$\pm$ 0.06  \\
$g_5$ &  0.79$\pm$0.06 & 0.25$\pm$ 0.02  & \bf 0.34$\pm$0.11  & -0.49$\pm$ 0.03  & -0.83$\pm$ 0.34 & \bf -0.36$\pm$ 0.04 \\
$g_6$ & 0.99$\pm$0.07  & 0.25$\pm$ 0.02  & \bf 0.34$\pm$0.14   & -0.43$\pm$ 0.04 & -0.94$\pm$ 0.48 & \bf -0.32$\pm$ 0.08\\
$g_7$ & 1.70$\pm$0.33  & 0.95$\pm$ 0.2  & \bf 0.57$\pm$ 0.2  & -0.55$\pm$ 0.02 & -0.88$\pm$ 0.20 & -0.53$\pm$ 0.04 \\
\bottomrule
\end{tabular}%}
%}
%\end{center}
\end{table}

In regression, it can be noted that SkewGPu and SkewGPc have a similar performance (the performance of the latter has higher variability).  This is mainly due to the high SNR (with little noise, SkewGPu can learn the monotonicity of the functions from the data) and to the variability of the numerical optimisation.

For preference learning, to compute the kernel hyperparameters and predict 2000 posterior samples, the running (wall-clock) time is $330$s for MF and $166$s for SkewGPc. 

% \begin{center}
% \begin{tabular}{|l|l|}
% \hline
% MF &330s\\
% \hline
% SkewGPc &166s\\
% \hline
% \end{tabular}
% \end{center}
For fixed hyperparameters, to predict 2000 posterior samples, the running (wall-clock) time is $180$s for MF and $5$s for SkewGPc. 
% \begin{center}
% \begin{tabular}{|l|l|}
% \hline
% MF &180s\\
% \hline
% SkewGPc &5s\\
% \hline
% \end{tabular}
% \end{center}
MF is slower at prediction time because sampling from it requires solving a stochastic differential equation numerically.

Finally, Table~\ref{tab:benchFunRMSE} reports the RMSE $\sqrt{\frac{1}{n}\sum_{i=1}^n (g_j(x_i) - \hat{g}_j(x_i))^2}$,
 where $\hat{g}_j$ is the posterior mean, for $j=1,\dots,7$, for both the SNR 10 and SNR 30 cases.

 \begin{table}[h!]
    \centering
    %\footnotesize
    \caption{Regression benchmark functions RMSE results.}\label{tab:benchFunRMSE}
%\begin{center}
\setlength{\tabcolsep}{3pt}
\begin{tabular}{lccc|ccc}
\toprule
 & \multicolumn{3}{c|}{RMSE SNR 10} & \multicolumn{3}{c}{RMSE SNR 30}\\
fun &  MF & SkewGPu & SkewGPc  &  MF & SkewGPu & SkewGPc\\
\midrule
$g_1$ & 0.55$\pm$0.17 & 0.43$\pm$0.05  & \bf 0.25$\pm$0.02 & 0.78$\pm$0.06 & \bf 0.33$\pm$0.026 & 0.35$\pm$0.1\\
$g_2$ & 1.1$\pm$0.1 & 1.14$\pm$0.11  & \bf 0.88$\pm$0.05 &  1.11$\pm$0.17 & 1.0$\pm$0.14 & \bf 0.86$\pm$0.04\\
$g_3$ &  0.44$\pm$0.24  & 0.35$\pm$0.04 & \bf 0.11$\pm$0.05 & 0.71$\pm$0.05  & \bf 0.25$\pm$0.03 & \bf 0.25$\pm$0.1\\
$g_4$ & 0.6$\pm$0.1 & 0.41$\pm$0.05 & \bf 0.35$\pm$0.02 & 0.64$\pm$0.04 & \bf 0.38$\pm$0.04 & \bf 0.38$\pm$0.06\\
$g_5$ & 0.92$\pm$0.21 & \bf 0.49$\pm$0.06 & 0.53$\pm$0.05 & 1.1$\pm$0.09 & \bf 0.41$\pm$0.03 & 0.67$\pm$0.12\\
$g_6$ & 0.82$\pm$0.37 & 0.57$\pm$0.09  & \bf 0.35$\pm$0.05 & 1.28$\pm$0.09 & \bf 0.46$\pm$0.06 & 0.55$\pm$0.22\\
$g_7$ & 2.67$\pm$0.54 & 2.7$\pm$0.3 & \bf 1.30$\pm$0.46 & 2.5$\pm$0.4 & 1.65$\pm$0.3 & \bf 0.90$\pm$0.48\\
\bottomrule
\end{tabular}
%\end{center}
\end{table}

SkewGPu outperforms MF. Note that the RMSE for SNR=10 is sometimes lower than for SNR=30. This counterintuitive result occurs because both RMSE and CRPS were calculated on the noise-free function values.

Our simulation results align with \citep{ustyuzhaninov2020monotonic}, where the method \citep{andersen2018non} achieved similar or even better performance than MF on some benchmark functions. As previously discussed, MSP kernels based on M-splines or MSP kernels based on I-splines provide an improvement over \citep{andersen2018non} by preserving conjugacy with both the normal and affine-probit likelihoods.





 



\subsection{Swiss route choice data}
In the MSP kernel, we used 10 knots per covariate.

\begin{table}[h!]
\centering
\tiny
\begin{tabular}{lrrrrrrrrrrrrrrr}
\toprule
 & choice & tt1 & tc1 & hw1 & ch1 & tt2 & tc2 & hw2 & ch2 & hh\_inc\_abs & car\_availability & commute & shopping & business & leisure \\
ID &  &  &  &  &  &  &  &  &  &  &  &  &  &  &  \\
\midrule
2439 & 2 & 58 & 7 & 30 & 1 & 50 & 8 & 30 & 0 & 50000 & 1 & 1 & 0 & 0 & 0 \\
2439 & 1 & 30 & 8 & 60 & 0 & 41 & 7 & 15 & 2 & 50000 & 1 & 1 & 0 & 0 & 0 \\
2439 & 1 & 41 & 7 & 30 & 0 & 34 & 8 & 15 & 2 & 50000 & 1 & 1 & 0 & 0 & 0 \\
2439 & 1 & 44 & 10 & 60 & 1 & 52 & 9 & 60 & 2 & 50000 & 1 & 1 & 0 & 0 & 0 \\
2439 & 2 & 43 & 9 & 60 & 0 & 34 & 10 & 30 & 0 & 50000 & 1 & 1 & 0 & 0 & 0 \\
2439 & 2 & 36 & 8 & 60 & 1 & 43 & 7 & 15 & 1 & 50000 & 1 & 1 & 0 & 0 & 0 \\
2439 & 2 & 30 & 8 & 60 & 0 & 43 & 7 & 15 & 0 & 50000 & 1 & 1 & 0 & 0 & 0 \\
2439 & 1 & 43 & 8 & 30 & 1 & 30 & 9 & 60 & 0 & 50000 & 1 & 1 & 0 & 0 & 0 \\
2439 & 1 & 41 & 8 & 30 & 2 & 58 & 7 & 60 & 0 & 50000 & 1 & 1 & 0 & 0 & 0 \\
5641 & 1 & 77 & 19 & 15 & 1 & 110 & 16 & 60 & 1 & 10000 & 0 & 0 & 0 & 0 & 1 \\
5641 & 2 & 94 & 23 & 60 & 1 & 125 & 18 & 15 & 0 & 10000 & 0 & 0 & 0 & 0 & 1 \\
5641 & 2 & 82 & 18 & 60 & 2 & 91 & 15 & 30 & 0 & 10000 & 0 & 0 & 0 & 0 & 1 \\
5641 & 2 & 101 & 15 & 60 & 0 & 86 & 20 & 15 & 0 & 10000 & 0 & 0 & 0 & 0 & 1 \\
5641 & 1 & 99 & 18 & 15 & 0 & 110 & 16 & 15 & 0 & 10000 & 0 & 0 & 0 & 0 & 1 \\
5641 & 1 & 91 & 18 & 30 & 1 & 101 & 16 & 15 & 0 & 10000 & 0 & 0 & 0 & 0 & 1 \\
\bottomrule
\end{tabular}
\caption{15 pairwise options: each row is a different scenario, tt1, tc1, hw1, ch1 against tt2, tc2, hw2, ch2. \\Choice denotes the option selected by the user ID.}
\label{table:swiss}
\end{table}

\subsection{Risky choice data}
We only focused on choice-pairs whose gambles have only two components (two-dimensional), as in the example in Section \ref{sec:risky}: a total of 5347 choices.


\end{document}
