% \documentclass{uai2022} % for initial submission
\documentclass[accepted]{uai2022} 
%% In your camera-ready you should use the 'accepted' parameter. This shows the authors and how an accepted paper will look like. The footer is 'Acccepted for X'. In the final version, the proceedings chairs will add the page numbers for PMLR and the final footer will be 'Proceedings of X'.
%
%% There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2022} % ptmx math instead of Computer
                                         % Modern (has noticable issues)
% \documentclass[mathfont=newtx]{uai2022} % newtx fonts (improves upon
                                          % ptmx; less tested, no support)
% NOTE: Only keep *one* line above as appropriate, as it will be replaced
%       automatically for papers to be published. Do not make any other
%       change above this note for an accepted version.

%% Choose your variant of English; be consistent
\usepackage[american]{babel}
% \usepackage[british]{babel}

%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} % amsmath with fixes and additions
% \usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{tikz} % nice language for creating drawings and diagrams

\usepackage{graphicx}
\usepackage{xcolor}
\usepackage{ dsfont }
\usepackage{float}
\usepackage{ulem}
\usepackage{multirow}

%% Provided macros
% \smaller: Because the class footnote size is essentially LaTeX's \small,
%           redefining \footnotesize, we provide the original \footnotesize
%           using this macro.
%           (Use only sparingly, e.g., in drawings, as it is quite small.)

%% Self-defined macros
\newcommand{\swap}[3][-]{#3#1#2} % just an example

\DeclareMathOperator*{\diag}{diag}
\DeclareMathOperator*{\tr}{tr}
\DeclareMathOperator*{\argmin}{arg\,min}
\DeclareMathOperator*{\argmax}{arg\,max}
\DeclareMathOperator*{\EI}{EI}
\DeclareMathOperator*{\KL}{kl}
\newtheorem{theorem}{Theorem}

\title{Bayesian Quantile and Expectile Optimisation - Supplementary material}

% The standard author block has changed for UAI 2022 to provide
% more space for long author lists and allow for complex affiliations
%
% All author information is automatically removed by the class for the
% anonymous submission version of your paper, so you can already add your
% information below.
% 
% Important: in case of equal contributions, we strongly recommend to NOT show it in this part of the paper, but rather describe it in the appropriate section at the end of the paper "Author Contribution", where you have more space to describe how each author contributed.
%
% Add authors
% Remember to use the order convention "First/Given name" "Last/Family name", e.g. John Smith, Hanako Yamada, Marco Rossi, Wei Zhang
\author[1]{\href{mailto:<victor@secondmind.ai>?Subject=Your UAI 2022 paper}{Victor Picheny}{}}
\author[1]{Henry Moss}
\author[2]{L\'eonard Torossian}
\author[1]{Nicolas Durrande}

% Add affiliations after the authors
\affil[1]{%
    Secondmind Labs\\
    Cambridge, UK
}
\affil[2]{%
    Inria\\
    Universit\'e C\^ote d'Azur, France
}
  
  \begin{document}
\maketitle


\section{Supplementary material: Calculation of Q-GIBBON}
\label{appendix:GIBBON}
We derive here the analytical form of our proposed Q-GIBBON acquisition function.
For simplicity, we focus on the quantile setting, but the expectile case only requires a
straightforward modification of the following derivation. 

Recall that Q-GIBBON is defined as
\begin{align}
    \alpha_n^{\textrm{Q-GIBBON}} = \frac{1}{2}\log|C| &-\frac{1}{2M}\sum_{g^*\in\mathcal{M}_n}\sum_{i=1}^B\log V_i(g^*) , \nonumber
\end{align}
where $|C|$ is the determinant of the $B\times B$ predictive covariance matrix with elements $C_{i,j} =\textrm{Cov}(y_{{x}_i},y_{{x}_j}|\mathcal D_n)$ and $V_i(g^*) = \textrm{Var}(y_{{x}_i}|g^*,\mathcal D_n)$ denotes the conditional variance of the $i$-th observation. Therefore, calculating Q-GIBBON boils down to being able to calculate $V_i(g^*)$ and $C_{i,j}$ across any candidate batch of points (i.e. for all $i,j\in\{1,\dots,B\}$). We now derive closed-form expressions for $V_i(g^*)$ and $C_{i,j}$.

\subsection{Required Predictive Quantities}

For ease of notation, we will consider just a single pair of input values ${x}_1$ and ${x}_2$ and show how to calculate $V_1(g^*)$ and $C_{1,2}$. Denote the quantiles, scales and (noisy) observations at these two locations as $g_1=g({x}_1)|\mathcal D_n$, $g_2=g({x}_2)|\mathcal D_n$, $\sigma_1=\sigma({x}_1)|\mathcal D_n$, $\sigma_2=\sigma({x}_2)|\mathcal D_n$, $y_1 = y({x}_1)|\mathcal D_n$ and $y_2=y({x}_2)|\mathcal D_n$, respectively. Then, from our underlying GP models we can extract our
current beliefs about these random variables: 
\begin{align*}
\begin{pmatrix}g_1\\
g_2
\end{pmatrix} & \sim N\left[\begin{pmatrix}
\mu^g_{1}\\
\mu^g_{2}
\end{pmatrix},\begin{pmatrix}
(\sigma^g_{1})^2 & \Sigma^g_{1,2} \\
\Sigma^g_{1,2} & (\sigma^g_{2})^2
\end{pmatrix}\right],
\\
\begin{pmatrix}\log(\sigma_1)\\
\log(\sigma_2)
\end{pmatrix} & \sim N\left[\begin{pmatrix}
\mu^{\sigma}_{1}\\
\mu^{\sigma}_{2}
\end{pmatrix},\begin{pmatrix}
(\sigma^{\sigma}_{1})^2 & \Sigma^{\sigma}_{1,2} \\
\Sigma^{\sigma}_{1,2} & (\sigma^{\sigma}_{2})^2
\end{pmatrix}\right].
\end{align*}
For closed-form expressions of $\mu^g_{1}$, $\sigma^g_{1}$, etc., see any GP textbook, e.g.~\cite{rasmussen2003gaussian}.

Before deriving expressions for $V_1(g^*)$ and $C_{1,2}$, it is convenient to write the conditional mean and variance of our noisy observations $y_1$ and $y_2$. Following \cite{yu2001bayesian}, we have
\begin{align}
 \mathds{E}[y_1|g_1,\sigma_1] &= g_1 + \frac{1-2\tau}{\tau(1-\tau)}\sigma_1,\label{AL_mean}\\
 \mathrm{Var}(y_1|g_1,\sigma_1) &= \frac{1-2\tau+2\tau^2}{\tau^2(1-\tau)^2}\sigma_1^2\label{AL_var},
\end{align}
with similar expressions for the moments of $y_2|g_2,\sigma_2$.

\subsection{Calculating the conditional variance V}
We now have all the quantities required to calculate $V_1(g^*)=\textrm{Var}(y_1|g^*)$. Recall that $g^*$ denotes the maximal value attained by the quantile function (i.e. $g^*=\max_{x} g({x})$). First, we use the law of total variance to decompose $V_1$ into two terms:
\begin{align}
    V_1 =& \nonumber \textrm{Var}_{g_1,\sigma_1|g^*}\left(\mathds{E}[y_1|g_1,\sigma_1,g^*]\right) \\&+ \mathds{E}_{g_1,\sigma_1|g^*}\left[\textrm{Var}\left(y_1|g_1,\sigma_1,g^*\right)\right] \label{step_1}.
\end{align}

Note that conditioning on $g_1,\sigma_1,g^*$ is equivalent to conditioning on $g_1,\sigma_1$ only, as knowing that $g^*=\max g({x})$ does not provide additional information over knowing $g_1$ itself. Therefore, we can insert our expressions for the moments of the asymmetric Laplace (\ref{AL_mean}) and (\ref{AL_var}) into (\ref{step_1}) which, after simple manipulation using the moments of the log-Gaussian variable $\sigma_1$, provides:
\begin{align}
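    % With a = (1-2\tau)/(\tau(1-\tau)) and b = (1-2\tau+2\tau^2)/(\tau^2(1-\tau)^2):
    % V_1 = Var(g_1|g^*) + a^2 Var(\sigma_1) + b E[\sigma_1^2], where for log-Gaussian \sigma_1
    % E[\sigma_1^2] = e^{2\mu+2s^2} and Var(\sigma_1) = e^{2\mu+2s^2} - e^{2\mu+s^2}.
    V_1(g^*) = \nonumber \textrm{Var}_{g_1|g^*}(g_1)&+ \frac{3(1-2\tau)^2+1}{2\tau^2(1-\tau)^2}\textrm{e}^{2(\mu_1^{\sigma}+(\sigma_1^{\sigma})^2)}\\&- \frac{(1-2\tau)^2}{\tau^2(1-\tau)^2}\textrm{e}^{2\mu_1^{\sigma}+(\sigma_1^{\sigma})^2} \label{step_2}.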
\end{align}
All that remains for the calculation of $V_1(g^*)$ is an expression for $\textrm{Var}_{g_1|g^*}(g_1)$. Fortunately, as shown by \cite{wang2017max}, $g_1|g^*$ is simply an upper truncated Gaussian variable. Therefore, using the well-known expression for the variance of a truncated Gaussian, we have
\begin{align}
    \textrm{Var}_{g_1|g^*}(g_1) = (\sigma_1^g)^2\left(1 - \frac{\phi(\gamma_{g^*})}{\Psi(\gamma_{g^*})}\left(\gamma_{g^*} + \frac{\phi(\gamma_{g^*})}{\Psi(\gamma_{g^*})}\right)\right), \label{var_TG}
\end{align}
where $\gamma_{g^*} = \frac{g^*-\mu_1^g}{\sigma_1^g}$, and $\phi$ and $\Psi$ are the probability density function and cumulative distribution function of a standard Gaussian variable, respectively.

Finally, inserting (\ref{var_TG}) into (\ref{step_2}) yields a closed-form expression for $V_1(g^*)$.

\subsection{Calculating the predictive covariance C}
 
 Just like when calculating the conditional variance $V_1$, we begin our decomposition of $C_{1,2}=\textrm{Cov}(y_1,y_2)$ by applying the law of total covariance to get the following two-term expansion:
 \begin{align}
     C_{1,2}=& \textrm{Cov}_{g_1,g_2,\sigma_1,\sigma_2}\left(\mathds{E}\left[y_1|g_1,\sigma_1\right],\mathds{E}\left[y_2|g_2,\sigma_2\right]\right)\nonumber\\
     &+\mathds{E}_{g_1,g_2,\sigma_1,\sigma_2}\left[\textrm{Cov}(y_1,y_2|g_1,g_2,\sigma_1,\sigma_2)\right].\label{C_step_1}
 \end{align}
 
 Now, as $y_1|g_1,\sigma_1$ and  $y_2|g_2,\sigma_2$ are conditionally independent (all that remains after this conditioning is observation noise), the second term of (\ref{C_step_1}) is in fact zero (at least for distinct ${x}_1$ and ${x}_2$).
 
To calculate the first term of (\ref{C_step_1}), we insert the expression for the first moment of $y|g,\sigma$ (i.e. Equation (\ref{AL_mean})) which, after recalling the independence of $g$ and $\sigma$, yields
\begin{align}
    C_{1,2}=\ &\textrm{Cov}_{g_1,g_2}(g_1,g_2)  \nonumber \\ &+\frac{(1-2\tau)^2}{\tau^2(1-\tau)^2}\textrm{Cov}_{\sigma_1,\sigma_2}(\sigma_1,\sigma_2). \label{C_step_2}
\end{align}
 
Finally, we can extract $\textrm{Cov}(g_1,g_2)$ and $\textrm{Cov}(\sigma_1,\sigma_2)$ from our underlying GP models as $\Sigma^g_{1,2}$ and $\textrm{e}^{\mu_1^\sigma+\mu_2^\sigma+0.5((\sigma_1^\sigma)^2+(\sigma_2^\sigma)^2)}(\textrm{e}^{\Sigma^{\sigma}_{1,2}}-1)$ (using the formula for the covariance of jointly log-Gaussian variables). Inserting these two covariances into (\ref{C_step_2}) provides a closed-form expression for $C_{1,2}$.

\section{Supplementary material: RFF for Mat\'ern kernels}
\label{sec:supp:RFF}
%RFFs then consists in approximating this expectation using a Monte-Carlo estimate:
%\begin{equation}
%    k(x,x')\approx \phi(x)^T\phi(x')\label{approx}
%\end{equation}
%with $\phi(x)$ a $m$-dimensional feature such that $\phi_i(x)~=~\sqrt{2\alpha/m}\cos(w_i^T x+b_i)$ where $w_i$ and $b_i$ are i.i.d. samples from $p(\omega)$ and $p(b)$. 

%Such methodology has been classically used to approximate the squared-exponential kernel because it is self conjugated \citep[see][]{hernandez2014predictive}. 
We present in this section how to use RFFs to generate samples from $d$-dimensional Mat\'ern kernels with regularity $\nu$, variance $\sigma^2$ and lengthscales $\theta\in\mathds{R}^d$. First of all, we start from the spectral density of a Mat\'ern kernel:
\begin{equation*}
s(w)=\sigma^2 |\Lambda|^{1/2}\dfrac{\Gamma(\frac{d}{2}+\nu)}{\Gamma(\nu)}\dfrac{(2\sqrt{\pi})^d}{(1+w^T\Lambda w)^{\frac{d}{2}+\nu}},
\end{equation*}
where $\Lambda= \diag(\theta_1,\dots,\theta_d)$ is the diagonal matrix containing the lengthscale hyperparameters. Using the change of variable $\Lambda'= 2\nu\times\Lambda$ and introducing the rescaling factor $\sigma^2(\sqrt{2} \pi)^d$, one can recognise here the probability density function of the \textit{multivariate t-distribution}:
\begin{equation*}
p(w)=|\Lambda|^{1/2}\dfrac{\Gamma(\frac{d}{2}+\nu)}{\Gamma(\nu)\pi^{d/2}\nu^{d/2}}\dfrac{1}{(1+\frac{1}{2\nu}w^T\Lambda w)^{\frac{d}{2}+\nu}}.
\end{equation*}
As a consequence, prior samples can be generated by computing
\begin{equation*}
g(x) = \sigma \sqrt{2 (\sqrt{2} \pi)^{d} / m} \sum_{i=1}^m \omega_i \cos(w_i^T x + b_i),
\end{equation*}
where $\omega_i \sim \mathcal{N}(0, 1)$, $w_i \sim p$, $b_i \sim \mathcal{U}(0, 2 \pi)$, 
and $m$ is the number of features.

\section{Supplementary material: description of the GLD synthetic case}
Several formulations of the generalised lambda distribution (GLD) exist; here we use the parameterisation of \cite{freimer1988study}.
The GLD is defined by its quantile function:
\begin{equation}
 Q(u) = \lambda_0 + \lambda_1 \left(T_1 - T_2\right),\label{eq:quantile}
\end{equation}
with:
\begin{align*}
 T_1 &= \begin{cases}
           \frac{u^{\lambda_2} - 1}{\lambda_2} & \text{if } \lambda_2 \neq 0, \\
           \log(u) & \text{if } \lambda_2 = 0,
          \end{cases} \\
  T_2 &= \begin{cases}
           \frac{(1-u)^{\lambda_3} - 1}{\lambda_3} & \text{if } \lambda_3 \neq 0, \\
           \log(1-u) & \text{if } \lambda_3 = 0.
          \end{cases}
\end{align*}
Here, the only constraint for the parameter values is $\lambda_1 > 0$.

To define an experiment, each $\lambda_j$ is a realisation of a GP, except for $\lambda_1$ for which we 
use a softplus transform to ensure positivity:
\begin{align*}
 \lambda_j(x) & \sim \mathcal{GP} \big(0, k(\cdot, \cdot)\big), \quad j \in \{0, 2, 3\},\\
 \phi( \lambda_1(x)) & \sim  \mathcal{GP} \big(0, k(\cdot, \cdot)\big),
\end{align*}
with $\phi^{-1}(w) = \log(1 + e^w)$. All GPs have a Mat\'ern 5/2 kernel $k$ with unit variance.
We add to $\lambda_0(x)$ a small quadratic mean function 
to avoid having the optimum located on the edges of the domain.
We use a lengthscale of 0.5 in dimension 3 and 1.0 in dimension 6. These settings ensure that the 6-dimensional test cases do not have too many local optima.


\bibliography{picheny_522}

\end{document}
