\documentclass[accepted]{uai2023}
\usepackage[british]{babel}

% for cross referencing the main text
% PLEASE ONLY USE xr IN THE SUPPLEMENTARY MATERIAL. 
% In the main paper, hard code any cross-reference to the supplementary material. 
\usepackage{xr} 

% making xr work on overleaf
\makeatletter
% \addFileDependency{<file name with extension>}
% Records <file> as a dependency of this document:
%   - writes "(<file>)" to the log/terminal so that latexmk can pick it up,
%   - appends <file> to the kernel's file list via \@addtofilelist,
%   - prints "No file <file>." if the file cannot be found.
% Every line of the body ends in % so that line ends do not become space
% tokens inside the macro (harmless in the preamble, visible in running text).
\newcommand*{\addFileDependency}[1]{% argument=file name and extension
  \typeout{(#1)}% latexmk will find this if $recorder=0
  \@addtofilelist{#1}%
  \IfFileExists{#1}{}{\typeout{No file #1.}}%
}
\makeatother

% \myexternaldocument{<base name>}{<prefix>}
% Calls xr's \externaldocument on <base name>, passing <prefix> as its optional
% argument, and then registers <base name>.tex and <base name>.aux as file
% dependencies through \addFileDependency.
\newcommand*{\myexternaldocument}[2]{%
\externaldocument[#2]{#1}%
\addFileDependency{#1.tex}%
\addFileDependency{#1.aux}%
}

\usepackage{natbib}
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} 
% amsthm must be loaded before cleveref; otherwise environments declared with
% \newtheorem are not set up for \cref. mathtools has already loaded amsmath.
\usepackage{amsthm, amsmath, amssymb} % Mathematical typesetting
\usepackage{booktabs}
\usepackage[capitalize]{cleveref}
\usepackage{algorithm}
\usepackage{algorithmic}
\usepackage{xcolor}
\usepackage{tikz}
\usepackage{placeins}


%% MACROS
% Bold symbol for vectors/matrices.
% NOTE: this replaces the existing definition of \v, so the previous meaning of
% \v is no longer available after this line.
\renewcommand{\v}[1]{\boldsymbol{\mathbf{#1}}}

% Wrap the argument in an \mbox.
\newcommand{\nolb}[1]{\mbox{#1}}
% \bcket{<left delim>}{<right delim>}{<content>}: content between \left/\right delimiters.
\newcommand{\bcket}[3]{\left#1 #3 \right#2}
% \mbcket{<left>}{<middle>}{<right>}{<content 1>}{<content 2>}:
% two pieces of content separated by a \middle delimiter.
\newcommand{\mbcket}[5]{\left#1 #4 \middle#2 #5 \right#3}
% The shorthands below pre-fill the delimiters only; each still consumes the
% content argument(s) that follow it, e.g. \b{x}, \bc{x}{y}, \sqb{x}.
% NOTE: \b replaces the existing definition of \b.
\renewcommand{\b}{\bcket{(}{)}}
\newcommand{\bc}{\mbcket{(}{\vert}{)}}
\newcommand{\sqb}{\bcket{[}{]}}
\newcommand{\abs}{\bcket{\lvert}{\rvert}}
\newcommand{\cb}{\bcket{\{}{\}}}

% --- Bold matrix symbols ---
\newcommand*{\K}{\mathbf{K}}
% --- Calligraphic names of the (generalised) Wishart families ---
\newcommand*{\W}{\mathcal{W}}
\newcommand*{\GW}{\mathcal{GW}}
\newcommand*{\AGW}{\mathrm{A}\text{-}\mathcal{GW}}
\newcommand*{\ABGW}{\mathrm{AB}\text{-}\mathcal{GW}}
% --- More bold matrices, and the blackboard-bold R ---
\newcommand*{\G}{\mathbf{G}}
\newcommand*{\V}{\mathbf{V}}
\newcommand*{\R}{\mathbb{R}}
\newcommand*{\Rm}{\mathbf{R}}
\renewcommand*{\L}{\mathbf{L}}
% --- Upright differentials ---
\newcommand*{\dx}{\mathrm{d}x}
\newcommand*{\dy}{\mathrm{d}y}
\newcommand*{\md}{\mathrm{d}}
\newcommand*{\dG}{\mathrm{d}G}
\newcommand*{\dlam}{\mathrm{d}\Lambda}
% --- Miscellaneous ---
\newcommand*{\wM}{\mathbf W} %Wishart Matrix
\newcommand*{\nD}{P} %Number of datapoints
\newcommand*{\E}{\mathbb{E}}

% Density shorthands: an upright operator with an optional subscript, followed
% by \b, which brackets the next argument, e.g. \qd{\T} or \P[1]{x}.
% NOTE: \P replaces the existing definition of \P.
\renewcommand{\P}[1][]{\operatorname{P}_{#1}\b}
\newcommand{\Qd}[1][]{\operatorname{Q}_{#1}\b}

% Same shape as above, with the subscript wrapped in an extra brace group.
\newcommand{\qd}[1][]{\operatorname{Q}_{{#1}}\b}
\newcommand{\pd}[1][]{\operatorname{P}_{{#1}}\b}
% Bare upright P operator, with no bracketed argument attached.
\newcommand{\pt}{\operatorname{P}}

% Cholesky symbol.
\newcommand*{\chol}{\mathrm{chol}}
% Distribution names; the variants ending in \b bracket the argument that follows.
\newcommand*{\normal}{\mathcal{N}\b}
\newcommand*{\gammad}{\operatorname{Gamma}\b}
\newcommand*{\gammat}{\operatorname{Gamma}}
% "Distributed as" relations with iid / ind subscripts.
\newcommand*{\siid}{\sim_{\mathrm{iid}}}
\newcommand*{\sind}{\sim_{\mathrm{ind}}}

% Q operator subscripted by each of the three Wishart families.
\newcommand*{\QGW}{\operatorname{Q}_{\GW}}
\newcommand*{\QAGW}{\operatorname{Q}_{\AGW}}
\newcommand*{\QABGW}{\operatorname{Q}_{\ABGW}}

% Dataset names, typeset in small caps.
\newcommand*{\boston}{\textsc{Boston}}
\newcommand*{\concrete}{\textsc{Concrete}}
\newcommand*{\energy}{\textsc{Energy}}
\newcommand*{\kinnm}{\textsc{Kin8nm}}
\newcommand*{\naval}{\textsc{Naval}}
\newcommand*{\power}{\textsc{Power}}
\newcommand*{\protein}{\textsc{Protein}}
\newcommand*{\wine}{\textsc{Wine}}
\newcommand*{\yacht}{\textsc{Yacht}}

% to make seb's appendix work:
% Wishart distribution with bracketed parameters, e.g. \Wish{\S, \nu}.
\newcommand{\Wish}[1]{\mathcal{W}\b{#1}}
% Bold symbols for vectors and matrices.
\newcommand{\f}{\v f}
\newcommand{\F}{\mathbf{F}}
\renewcommand{\H}{\mathbf{H}}
\newcommand{\0}{{\mathbf{0}}}
% Normal distribution; \b brackets the argument that follows.
\newcommand{\N}{\mathcal{N}\b}
\newcommand{\I}{\mathbf{I}}
\newcommand{\transpose}[1]{{#1}^T}
% \L is already set to \mathbf{L} earlier in the preamble, so it is not redefined here.
\renewcommand{\S}{\mathbf{S}}
\newcommand{\reals}{\mathbb{R}}
% Inverse Wishart and generic density p; \b brackets the argument that follows.
\newcommand{\iWish}{\mathcal{W}^{-1}\b}
\newcommand{\p}{\mathrm{p}\b}
\newtheorem{defi}{Definition}
% etr is an ordinary operator (no limits), so the unstarred form is used;
% operator names are already set in the upright operator font.
\DeclareMathOperator{\etr}{etr}
\newcommand{\T}{\mathbf{T}}
% \bsc{<a>}{<b>}: "(a ; b)" with auto-sized parentheses.
\newcommand{\bsc}[2]{\left( #1 ; #2 \right)}
% Normal density written as N(x ; parameters), e.g. \Nc{T_{jk}}{0, 1}.
\newcommand{\Nc}{\mathcal{N}\bsc}
\newcommand{\y}{\mathbf{y}}
\newcommand{\A}{\mathbf{A}}
\newcommand{\x}{\mathbf{x}}
\newcommand{\Y}{\mathbf{Y}}
\newcommand{\X}{\mathbf{X}}
\newcommand{\La}{\mathbf{\Lambda}}
\newcommand{\nt}{{\tilde{\nu}}}
\newcommand{\B}{\mathbf{B}}
\newcommand{\C}{\mathbf{C}}
\newcommand{\D}{\mathbf{D}}
% \dd[<numerator>]{<denominator>}: partial-derivative fraction.
\newcommand{\dd}[2][]{\frac{\partial #1}{\partial #2}}
\newcommand{\q}{\mathrm{q}\b}

% macro to make algorithm work
\newcommand{\hF}{\mathbf{\hat{F}}}
% \L is already set to \mathbf{L} earlier in the preamble, so it is not redefined here.
% Q operator with optional subscript; \b brackets the argument that follows.
\newcommand{\Q}[1][]{\operatorname{Q}_{#1}\b}
% Matrix normal distribution; \b brackets the argument that follows.
\newcommand{\MN}{\mathcal{MN}\b}




%if overleaf, use this:
\myexternaldocument{ober_402}{}

%else:
%\externaldocument{uai2023-template}

\author[1]{Sebastian W. Ober}
\author[2]{Ben Anson\thanks{These authors contributed equally to this work.}}
\author[2]{Edward Milsom$^*$}
\author[2]{Laurence Aitchison}
\affil[1]{%
    University of Cambridge
}
\affil[2]{%
    University of Bristol
}
  
\begin{document}
% \input{appendix}
\onecolumn
\appendix
\title{An Improved Variational Approximate Posterior for the Deep Wishart Process\\(Supplementary Material)}
\maketitle
\section{Derivation of $\AGW$ and $\ABGW$ densities}\label{app:densities_deriv}
We briefly provide further background on the Wishart distribution and the Bartlett decomposition, and discuss how to derive Jacobians for matrix transformations. Then we use this machinery to derive densities for the $\AGW$ and $\ABGW$ distributions.
\subsection{The Wishart distribution}
The Wishart distribution, $\Wish{\mathbf{S}, \nu}$, is a distribution over positive semi-definite $P \times P$ matrices, where  $\S\in\R^{P\times P}$ is a positive definite covariance matrix, and $\nu > 0$ is an integer-valued degrees-of-freedom parameter. The Wishart distribution is most straightforwardly interpreted as a sum of outer products of multivariate Gaussian random variables. That is, if
we define a random matrix $\wM$ such that,
\begin{align}
  \v f_\lambda &\siid \N{\0, \S},\,\lambda\in\{1,\ldots,\nu\},\\
  \wM &= \sum_{\lambda=1}^\nu \v f_\lambda \v{f}^T_\lambda,
\end{align}
then we say that $\wM$ is Wishart distributed, and write $\wM \sim \Wish{\S, \nu}$. Equivalently, $\wM = \F \F^T $, where $\F\in\R^{P\times \nu}$ is defined by stacking the vectors $\v f_\lambda$, $\F = \b{\v f_1\quad \cdots\quad\v f_\nu}$. We say that $\wM$ is standard Wishart distributed if $\S = \I$.

It is easy to generate Wishart random matrices from only standard Gaussian samples. Take $\L = \chol(\S)$ to be the Cholesky of $\S$ and $\v \xi_\lambda \siid\N{\0, \I}$, then  $\L\v\xi_\lambda \siid \N{\0, \S}$. It follows that,
\begin{align}\label{eq:wishart_as_std_normal}
\L\b{\sum_{\lambda=1}^\nu \v \xi_\lambda \v \xi_\lambda^T} \L^T = \L \v \Xi \v \Xi^T \L^T \sim \Wish{\S, \nu},
\end{align}
where $\v \Xi$ is the matrix of stacked vectors $\v \xi_\lambda$ such that $\v \Xi = \b{\v \xi_1\quad\cdots\quad \v \xi_\nu}$. From~\eqref{eq:wishart_as_std_normal}, it can be observed that
${\E\sqb{\wM} = \L (\nu \I)\L^T = \nu\S}$. Additionally, $\H=\v \Xi \v \Xi^T$ is standard Wishart distributed, therefore~\eqref{eq:wishart_as_std_normal} also
gives us a way to transform a standard Wishart into a Wishart with covariance parameter $\S$: $\H \sim \Wish{\I, \nu}\implies \L \H \L^T\sim\Wish{\S, \nu}$.

Finally, note that the density of the Wishart distribution is
given by,
\begin{align}
    \label{eq:dwp:wishdens}
    \pt\b{\wM} = \frac{\pi^{\nu (\nt - P)/2}}{2^{\nu P/2}\abs{\S}^{\nu/2} \Gamma_\nt\b{\tfrac{\nu}{2}}} \abs{\wM_{:\nt, :\nt}}^{\b{\nu-P-1}/2} \etr \b{-\S^{-1}\wM/2},
\end{align}
where $\nt = \min(\nu, P)$, and $\Gamma_\nt$ is the multivariate gamma function~\citep{srivastava2003singularwishart}.
\subsection{The Bartlett decomposition and some generalisations}
Suppose $\wM\sim \Wish{\I,\nu}$, and $\nu\geq P$, then 
the Bartlett decomposition \citep{bartlett1933on} allows for efficient sampling of $\wM$ (the constraint $\nu\geq P$ ensures that $\wM$ almost surely has full rank). Rather than sampling $\nu P$ Gaussian random variables to construct $\wM$ (which can become prohibitively costly when $\nu$ is large),
the Bartlett decomposition allows us to sample only $P(P-1)/2$ Gaussian random variables, and $P$ Gamma random variables. In particular, if $\T$ is a random matrix distributed according to,
\begin{subequations}\label{eq:standard_bartlett}
\begin{align}
\T &= \begin{pmatrix}
  T_{11} & \dotsm & 0 \\
  \vdots & \ddots & \vdots \\
  T_{P1} & \dotsm & T_{PP}
\end{pmatrix},\\
\pt\b{T_{jj}^2} &= \gammat\b{T_{jj}^2;\, \tfrac{\nu-j+1}{2}, \tfrac{1}{2}},\\
\pt\b{T_{j > k}} &= \Nc{T_{jk}}{\,0, 1},
\end{align}
\end{subequations}
then $\T\T^T \sim \Wish{\I,\nu}$.
The utility of~\eqref{eq:standard_bartlett} can be extended in two ways. Firstly, we can use~\eqref{eq:standard_bartlett} to sample from non-standard Wisharts, since $\L (\T \T^T) \L^T \sim\Wish{\L \L^T, \nu}$. Secondly,~\cite{srivastava2003singularwishart} extends the Bartlett decomposition to allow for sampling of singular Wisharts. Suppose $\nu < P$, and take $\T$ to be distributed according to,
\begin{subequations}\label{eq:nonsingular_bartlett}
\begin{align}
  \T &= \begin{pmatrix}
    T_{11}    & \dotsm & 0          \\ 
    \vdots    & \ddots & \vdots     \\ 
    T_{\nu 1} & \dotsm & T_{\nu \nu} \\ 
    \vdots    & \ddots & \vdots     \\ 
    T_{\nD 1}   & \dotsm & T_{\nD \nu}
  \end{pmatrix},\\
  \pt\b{T_{ii}^2} &= \gammad{T_{ii}^2;\, \tfrac{\nu-i+1}{2}, \tfrac{1}{2}},\,  i \in\{1,\ldots,\nu\},\\
  \pt\b{T_{i > j}} &=\normal{T_{i>j};\,0, 1},
\end{align}
\end{subequations}
then $\T \T^T \sim \Wish{\I, \nu}$.

We arrive at the A- and AB-generalised (singular) Wishart distributions by generalising the (singular) Bartlett decomposition in~\eqref{eq:nonsingular_bartlett}.
Concretely, we borrow the form of~\eqref{eq:nonsingular_bartlett}, but allow the parameters of the Gaussian and gamma distributions to be arbitrary,
\begin{subequations}\label{eq:generalised_bartlett2}
\begin{align}
  \T &= \begin{pmatrix}
    T_{11}    & \dotsm & 0          \\ 
    \vdots    & \ddots & \vdots     \\ 
    T_{\nu 1} & \dotsm & T_{\nu \nu} \\ 
    \vdots    & \ddots & \vdots     \\ 
    T_{\nD 1}   & \dotsm & T_{\nD \nu}
  \end{pmatrix},\\
  \pt\b{T_{ii}^2} &= \gammad{T_{ii}^2;\, \alpha_i,\beta_i},\,  i \in\{1,\ldots,\nu\},\\
  \pt\b{T_{i > j}} &=\normal{T_{i>j};\,\mu_{ij}, \sigma^2_{ij}}.
\end{align}
\end{subequations}
For any invertible matrix $\A\in\R^{P\times P}$ and any invertible lower triangular $\B \in \R^{\nu \times \nu}$, we write ${\A \T \T^T \A^T\sim \AGW\b{\A, \nu, \v \alpha, \v \beta, \v \mu, \v \sigma}}$ and $\A \T \B \B^T \T^T \A^T\sim \ABGW\b{\A, \B, \nu, \v \alpha, \v \beta, \v \mu, \v \sigma}$.
Given the necessary parameters, it is straightforward to sample
matrices from the $\AGW$ and $\ABGW$ families using~\eqref{eq:generalised_bartlett2}. However it is non-trivial to write down the corresponding densities --- the rest of this section is dedicated to this task.
\subsection{Jacobians for matrix transformations}\label{sec:app:jac:deriving}
We want to obtain the densities of $\wM_A := \A \T \T^T \A^T$ and $\wM_{AB} := \A \T \B \B^T \T^T\A^T$, where we know the density of $\T$.
Ultimately, we will use the change of variables formula,
\begin{align}
    \qd{\wM} = \qd{\T}\abs{\dd[\T]{\wM}},
\end{align}
where $\abs{\partial{\T} / \partial{\wM}}$ is the Jacobian determinant of the transformation.

For a vector-vector transformation $\v y = \v f(\v x)$, where $\v x\in\R^n$ and $\v y \in \R^{m}$, the Jacobian $\partial \v y/\partial \v x$ can be calculated by
evaluating $\partial y_i/\partial x_j$ for $i\in\{1,\ldots,m\}$, and $j\in\{1,\ldots,n\}$. It is less simple to calculate the Jacobian for matrix-matrix transformations, but it can be done by stacking the columns of our matrices into a long vector, and then calculating the associated vector-vector Jacobian. We demonstrate this
with a simple example for $2\times 2$ matrices. Consider,
\begin{align}
    \underbrace{\begin{pmatrix} Y_{11} & Y_{12} \\ Y_{21} & Y_{22} \end{pmatrix}}_{\Y} &=
    \underbrace{\begin{pmatrix} A_{11} & A_{12} \\ A_{21} & A_{22} \end{pmatrix}}_{\mathbf{A}}
    \underbrace{\begin{pmatrix} X_{11} & X_{12} \\ X_{21} & X_{22} \end{pmatrix}}_{\X}.
\end{align}
We `vectorise' $\Y$ and $\X$ to obtain,
\begin{align}
    \begin{pmatrix}
      Y_{11} \\
      Y_{21} \\
      Y_{12} \\
      Y_{22}
    \end{pmatrix}
    &=
    \underbrace{\begin{pmatrix}
      A_{11} & A_{12} & 0 & 0 \\
      A_{21} & A_{22} & 0 & 0 \\
      0 & 0 & A_{11} & A_{12} \\
      0 & 0 & A_{21} & A_{22}
    \end{pmatrix}}_{\mathbf{A}^*}
    \begin{pmatrix}
      X_{11} \\
      X_{21} \\
      X_{12} \\
      X_{22}
    \end{pmatrix}.
\end{align}
The Jacobian of this transformation is clearly,
\begin{align}
\dd[\Y]{\X} = \A^*,
\end{align}
and the associated Jacobian determinant is therefore,
\begin{align}
\abs{\dd[\Y]{\X}} = \abs{\A}^2.
\end{align}
We now consider how to calculate some Jacobian determinants that are relevant in calculating the $\AGW$ and $\ABGW$ densities.
\subsection{Jacobian for the product of a lower triangular matrix with itself}\label{sec:jac:J_LLT}
Consider the transformation $\G = \La \La^T$, where $\La \in \reals^{P \times P}$, and $\La$ is lower triangular. \cite{ober2021vardwp} showed that the Jacobian determinant is,
\begin{align}
  \label{eq:jac:J_LaLaT}
  \abs{\dd[\G]{\La}} = \prod_{i=1}^P 2 \Lambda_{ii}^{P - i + 1}.
\end{align}
They also showed that the same transformation, $\G = \v \La \La^T$, but in the case $\La\in\R^{P\times \nu}$
has Jacobian determinant,
\begin{align}\label{eq:jac:J_LaLaTsing}
  \abs{\dd[\G]{\La}} &= \prod_{i=1}^\nt 2 \Lambda_{ii}^{P - i + 1},
\end{align}
where $\tilde\nu = \min\{P, \nu\}$.
\subsection{Jacobian for the product of two different lower triangular matrices}\label{sec:jac:J_LA}
Consider the transformation $\T \mapsto \La = \L \T$, where $\T \in \reals^{P \times \nu}$ and is lower triangular, and $\L\in\R^{P\times P}$ is also lower triangular.
\cite{ober2021vardwp} showed that the Jacobian determinant is,
\begin{align}
\label{eq:jac:J_LA}
  \abs{\dd[\v \La]{\T}} = \prod_{i=1}^P L_{ii}^{\min(i, \nu)}.
\end{align}
We also need the Jacobian determinant for a right linear transformation. Therefore, consider also the transformation $\T\mapsto\La = \T\B$, where again $\T\in\reals^{P\times \nu}$,
but $\B\in\reals^{\nu\times \nu}$ and is invertible lower triangular. It is helpful to write down the matrices explicitly,
\begin{align}
  \label{eq:jac:AL}
  \begin{pmatrix}
    \Lambda_{11}    & \dotsm & 0          \\ 
    \vdots          & \ddots & \vdots     \\ 
    \Lambda_{\nu 1} & \dotsm & \Lambda_{\nu \nu} \\ 
    \vdots          & \vdots & \vdots     \\ 
    \Lambda_{P 1}   & \dotsm & \Lambda_{P \nu}
  \end{pmatrix} = 
    \begin{pmatrix}
    T_{11}    & \dotsm & 0          \\ 
    \vdots          & \ddots & \vdots     \\ 
    T_{\nu 1} & \dotsm & T_{\nu \nu} \\ 
    \vdots          & \vdots & \vdots     \\ 
    T_{P 1}   & \dotsm & T_{P \nu}
  \end{pmatrix}
  \begin{pmatrix}
  B_{11} & \dotsm & 0 \\
  \vdots & \ddots & \vdots \\
  B_{\nu 1} & \dotsm & B_{\nu\nu}
  \end{pmatrix},
\end{align}
and consider the rows of $\La$. For the first row, we have
\begin{align*}
    \begin{pmatrix}
      \Lambda_{11} 
    \end{pmatrix}
    = 
    \begin{pmatrix}
      T_{11}
    \end{pmatrix}
    \begin{pmatrix}
      B_{11}
    \end{pmatrix},
\end{align*}
or equivalently,
\begin{align*}
    \La_{1, :1} = \T_{1, :1}\B_{:1, :1}.
\end{align*}
Similarly, for rows up to the $\nu^\text{th}$ row, i.e.\ for $i \leq \nu$, we have,
\begin{align*}
    \begin{pmatrix}
      \Lambda_{i1} & \dotsm & \Lambda_{ii}
    \end{pmatrix}
    =
    \begin{pmatrix}
      T_{i1} & \dotsm & T_{ii}
    \end{pmatrix}
    \begin{pmatrix}
      B_{11} & \dotsm & 0 \\
      \vdots & \ddots & \vdots \\
      B_{i1} & \dotsm & B_{ii}
    \end{pmatrix},
\end{align*}
which can be written as
\begin{align*}
    \La_{i, :i} = \T_{i, :i}\B_{:i, :i}.
\end{align*}
For rows beyond the $\nu^\text{th}$ row, i.e., $i > \nu$, the expression becomes,
\begin{align*}
    \begin{pmatrix}
      \Lambda_{i1} & \dotsm & \Lambda_{i\nu}
    \end{pmatrix}
    =
    \begin{pmatrix}
      T_{i1} & \dotsm & T_{i\nu}
    \end{pmatrix}
    \begin{pmatrix}
      B_{11} & \dotsm & 0 \\
      \vdots & \ddots & \vdots \\
      B_{\nu1} & \dotsm & B_{\nu \nu}
    \end{pmatrix},
\end{align*}
which again can be written as,
\begin{align*}
    \La_{i, :\nu} = \T_{i, :\nu} \B_{:\nu, :\nu} = \T_{i, :} \B.
\end{align*}
To calculate the Jacobian, we proceed by taking the transpose of each of the rows and stacking them, giving,
\begin{align*}
    \begin{pmatrix}
      \La_{1, :1}^\top \\
      \La_{2, :2}^\top \\
      \vdots \\
      \La_{\nu, :\nu}^\top \\
      \La_{\nu + 1, :\nu}^\top \\
      \vdots \\
      \La_{P, :\nu}^\top
    \end{pmatrix}
    =
    \begin{pmatrix}
      \B_{:1, :1}^\top & \0 & \dotsm & \0 & \0 & \dotsm & \0 \\
      \0 & \B_{:2, :2}^\top & \dotsm & \0 & \0 & \dotsm & \0 \\
      \vdots & \vdots & \ddots & \vdots & \vdots & \ddots & \vdots \\
      \0 & \0 & \dotsm & \B^\top & \0 & \dotsm & \0 \\
      \0 & \0 & \dotsm & \0 & \B^\top & \dotsm & \0 \\
      \vdots & \vdots & \ddots & \vdots & \vdots & \ddots & \vdots \\
      \0 & \0 & \dotsm & \0 & \0 & \dotsm & \B^\top \\
    \end{pmatrix}
    \begin{pmatrix}
      \T_{1, :1}^\top \\
      \T_{2, :2}^\top \\
      \vdots \\
      \T_{\nu, :\nu}^\top \\
      \T_{\nu + 1, :\nu}^\top \\
      \vdots \\
      \T_{P, :\nu}^\top
    \end{pmatrix}.
\end{align*}
Since the square matrix is upper triangular, its determinant is simply the product of the elements of its diagonal. This gives the Jacobian determinant,
\begin{align}\label{eq:jac:TB}
\abs{\dd[\La]{\T}} =  \prod_{i=1}^{\nt} B_{ii}^{P - i + 1}.
\end{align}
\subsection{Jacobian for $\C = \La\La^\top \mapsto \D = \A\C\transpose{\A}$, where $\A$ is an invertible matrix}
Now consider the transformation $\C = \La\La^\top\mapsto \D = \A\C\A^T$, where $\La \in \reals^{P\times \nu}$ is lower triangular with rank $\nu$, and $\A\in\R^{P\times P}$ is invertible. This Jacobian is difficult to derive from scratch; however, we can obtain it using the density of the singular Wishart.
In particular, if $\C \sim \mathcal{W}(\I_P, \nu)$ then $\D = \A\C\A^T \sim \mathcal{W}(\S, \nu)$ with $\S = \A\A^T$, and the probability density function of $\D$ is given by,
\begin{align*}
    \pt_{1}(\D) = \frac{\pi^{\nu(\nt - P)/2}}{2^{\nu P/2}\abs{\S}^{\nu/2}\Gamma_{\nt}\b{\tfrac{\nu}{2}}} \abs{\D_{:\nt, :\nt}}^{(\nu - P - 1)/2}\etr \b{-\S^{-1}\D /2},
\end{align*}
where $\nt = \min \b{\nu, P}$ as before. 
Note that $\D_{:\nt, :\nt}$ is almost surely full rank.
For $\C \sim \mathcal{W}(\I_P, \nu)$, this simplifies to,
\begin{align*}
    \pt_{2}(\C) = \frac{\pi^{\nu(\nt-P)/2}}{2^{\nu P/2}\Gamma_{\nt}\b{\tfrac{\nu}{2}}}\abs{\C_{:\nt,:\nt}}^{(\nu-P-1)/2}\etr\b {-\C/2}.
\end{align*}
Using these densities, we can use the identity,
\begin{align*}
    \pt_1(\D) = \pt_2\b{\C} \abs{\dd[\C]{\D}},
\end{align*}
to obtain the desired Jacobian determinant,
\begin{align}\label{eq:jac:J_AXAT}
    \abs{\dd[\D]{\C}} = \pt_2\b{\C}/\pt_1(\D) = 
    \abs{\A \A^T}^{\nu/2} \frac{\abs{\C_{:\nt,:\nt}}^{(\nu-P-1)/2}}{\abs{\D_{:\nt,:\nt}}^{(\nu - P - 1)/2}} = \abs{\A}^{\nu} \frac{\abs{\C_{:\nt,:\nt}}^{(\nu-P-1)/2}}{\abs{\D_{:\nt,:\nt}}^{(\nu - P- 1)/2}}.
\end{align}
We can now put these Jacobian determinant results together to derive the densities for the A- and AB-generalised (singular) Wishart distributions.
\subsection{The A-generalised (singular) Wishart density}\label{appendix:deriving_densities}
In Section~\ref{sec:a_ab_gswd} we said that $\G = \A \T (\A \v T)^T\sim\AGW\b{\A, \nu, \v \alpha, \v \beta, \v \mu, \v \sigma}$ if
$\A\in\R^{P\times P}$ is invertible and $\T\in\R^{P\times \nu}$ is distributed according to~\eqref{eq:generalised_bartlett}. If we define $\C$ such that $\G = \A \C \A^T = \A \T \T^T \A^T$, then by the change of variables formula for probability densities,
\begin{align}
    \qd{\G} = \qd{\T}\abs{\dd[\T]{\C}}\abs{\dd[\C]{\G}}.
\end{align}
By combining the density of $\T$,
\begin{align}
\qd{\T} = 2^{\nt}
\prod_{j=1}^\nt T_{jj}\gammat\b{T_{jj}^2;\,\alpha_j, \beta_j}\prod_{i=j+1}^P \Nc{T_{ij}}{\mu_{ij}, \sigma_{ij}^2},
\end{align}
the result from~\eqref{eq:jac:J_LaLaTsing},
\begin{align}
  \abs{\dd[\C]{\T}} &= 2^\nt\prod_{j=1}^\nt  T_{jj}^{P - j + 1},
\end{align}
and the result from~\eqref{eq:jac:J_AXAT},
\begin{align}
    \abs{\dd[\G]{\C}} = 
     \abs{\A}^{\nu} \frac{\abs{\C_{:\nt,:\nt}}^{(\nu-P-1)/2}}{\abs{\G_{:\nt,:\nt}}^{(\nu - P - 1)/2}},
\end{align}
we obtain the A-generalised (singular) Wishart density,
\begin{align}
    \qd{\G} &= \frac{\abs{\G_{:\nt, :\nt}}^{(\nu - P - 1)/2}}{\abs{\A}^\nu\abs{\C_{:\nt, :\nt}}^{(\nu - P- 1)/2}} 
    \prod_{j=1}^\nt \frac{\gammat\b{T_{jj}^2;\,\alpha_j, \beta_j}}{T_{jj}^{P-j}}\prod_{i=j+1}^P \Nc{T_{ij}}{\mu_{ij}, \sigma_{ij}^2}.
\end{align}
\subsection{The AB-generalised (singular) Wishart density}
The derivation for the AB-generalised (singular) Wishart is similar to that of the A-generalised (singular) Wishart, with the addition of one extra step.
Namely, as the AB-generalised (singular) Wishart defines $\G = \A\T\B\transpose{\b{\A\T\B}}$, we define $\La = \T\B$ and $\C = \La\transpose{\La}$, so that,
\begin{align*}
    \qd{\G} = \qd{\T}\abs{\dd[\T]{\La}}\abs{\dd[\La]{\C}}\abs{\dd[\C]{\G}}.
\end{align*}
This first Jacobian determinant can be obtained using~\eqref{eq:jac:TB}, 
\begin{align*}
    \abs{\dd[\T]{\La}} = \prod_{i=1}^\nt \frac{1}{B_{ii}^{P - i + 1}},
\end{align*}
whereas the second,
\begin{align*}
    \abs{\dd[\La]{\C}} &= \frac{1}{2^\nt}\prod_{i=1}^{\nt} \frac{1}{\Lambda_{ii}^{P - i + 1}}= \frac{1}{2^\nt}\prod_{i=1}^{\nt} \frac{1}{T_{ii}^{P - i + 1}B_{ii}^{P - i + 1}},
\end{align*}
arises from~\eqref{eq:jac:J_LaLaTsing}.
The remaining factors, $\qd{\T}$ and $\abs{\dd[\C]{\G}}$, are unchanged in form from the A-generalised case, so that our final density is given by,
\begin{align}
    \qd{\G} &= \frac{\abs{\G_{:\nt, :\nt}}^{(\nu - P - 1)/2}}{\abs{\A}^\nu\abs{\C_{:\nt, :\nt}}^{(\nu - P - 1)/2}} 
    \prod_{j=1}^\nt \frac{\gammat\b{T_{jj}^2;\,\alpha_j, \beta_j}}{T_{jj}^{P-j}B_{jj}^{2(P-j+1)}}\prod_{i=j+1}^P \Nc{T_{ij}}{\mu_{ij}, \sigma_{ij}^2}.
\end{align}
\section{Detailed Experimental Results}\label{appendix:detailed_results}
All models were trained on the UCI splits from~\cite{pmlr-v48-gal16}, of which there are 20 for each dataset apart from \protein. The datasets and the splits are available at~\url{https://github.com/yaringal/DropoutUncertaintyExps/tree/master/UCI_Datasets}. Deep Wishart processes
with the three kinds of approximate posterior ($\GW$, $\AGW$, and $\ABGW$)  were trained, with number of layers $\ell\in\{2,\ldots,5\}$, and width $\nu_\ell$ fixed to the number of input features. We applied the squared exponential kernel as a non-linearity at each layer,
with automatic relevance determination (ARD,~\cite{williams2006gaussian}) in the first layer only. The DGPs trained reflected this architecture, with each GP layer returning features with dimension equal to the number of input features. In particular the DGPs were trained using global inducing point methods~\citep{ober21globalinducing}. The final layer of the DWP also uses
a global inducing approximate posterior~\citep{ober21globalinducing}.


All models were trained using the same scheme. $20\,000$ gradient steps were used to train each model, with the ADAM optimizer~\citep{kingma14adam}.
We began with an initial learning rate of $10^{-2}$, and then stepped the learning rate down to $10^{-3}$ after $10\,000$ gradient steps. The KL was annealed using a factor increasing linearly from $0$ to $1$ over the first $1\,000$ gradient steps. No pre-processing of the data was performed, other than normalizing inputs and outputs. To train, $10$ samples were drawn from the approximate posterior, and to test $100$ samples were drawn. For the smaller datasets (\boston, \concrete, \energy, \wine, \yacht), training was performed on a CPU (Intel Core i9-10900X), and for the other (larger) datasets, an internal cluster of machines was used, with NVIDIA GeForce 2080 Ti GPUs.
\subsection{Tables}
\cref{tab:dwp:uci_elbos1,tab:dwp:uci_elbos2,tab:dwp:uci_lls,tab:dwp:uci_rmses} report the ELBOs, ELBO differences, test log likelihoods, and RMSEs from our UCI experiments, respectively. In all cases, we give the mean of each metric (plus or minus one standard error), and highlight the model with the best mean value in bold for each configuration (unless all are equal).
\begin{table}[ht]
\footnotesize
  \caption{ELBOs per datapoint. We report mean plus or minus one standard error over the splits. Bold numbers correspond to the best models overall.}
  \label{tab:dwp:uci_elbos1}
  \centering
  \begin{tabular}{rcccc}
    \toprule
& & & DWP & \\
\{Dataset\}-\{Depth\} & DGP & $\QGW$ & $\QAGW$ & $\QABGW$ \\
\midrule  
\textsc{\boston} - 2 & -0.38 $\pm$ 0.01 & -0.33 $\pm$ 0.00 & \textbf{-0.32 $\pm$ 0.01} & \textbf{-0.32 $\pm$ 0.00} \\ 
3 & -0.40 $\pm$ 0.00 & -0.34 $\pm$ 0.01 & \textbf{-0.33 $\pm$ 0.00} & \textbf{-0.33 $\pm$ 0.01} \\ 
4 & -0.43 $\pm$ 0.00 & \textbf{-0.35 $\pm$ 0.00} & \textbf{-0.34 $\pm$ 0.01} & \textbf{-0.34 $\pm$ 0.01} \\ 
5 & -0.45 $\pm$ 0.00 & \textbf{-0.37 $\pm$ 0.01} & \textbf{-0.36 $\pm$ 0.00} & \textbf{-0.36 $\pm$ 0.00} \\ 
\midrule 
\textsc{\concrete} - 2 & -0.45 $\pm$ 0.00 & -0.42 $\pm$ 0.00 & -0.40 $\pm$ 0.00 & \textbf{-0.39 $\pm$ 0.00} \\ 
3 & -0.47 $\pm$ 0.00 & -0.43 $\pm$ 0.00 & \textbf{-0.41 $\pm$ 0.00} & \textbf{-0.41 $\pm$ 0.00} \\ 
4 & -0.49 $\pm$ 0.00 & -0.46 $\pm$ 0.00 & \textbf{-0.43 $\pm$ 0.00} & \textbf{-0.43 $\pm$ 0.00} \\ 
5 & -0.50 $\pm$ 0.00 & -0.49 $\pm$ 0.00 & \textbf{-0.45 $\pm$ 0.00} & \textbf{-0.45 $\pm$ 0.00} \\ 
\midrule 
\textsc{\energy} - 2 & 1.43 $\pm$ 0.00 & \textbf{1.46 $\pm$ 0.00} & \textbf{1.46 $\pm$ 0.00} & \textbf{1.46 $\pm$ 0.00} \\ 
3 & 1.42 $\pm$ 0.00 & 1.44 $\pm$ 0.00 & \textbf{1.45 $\pm$ 0.00} & \textbf{1.45 $\pm$ 0.00} \\ 
4 & 1.40 $\pm$ 0.00 & 1.42 $\pm$ 0.00 & \textbf{1.43 $\pm$ 0.00} & \textbf{1.43 $\pm$ 0.00} \\ 
5 & 1.38 $\pm$ 0.00 & 1.40 $\pm$ 0.00 & \textbf{1.42 $\pm$ 0.00} & 1.41 $\pm$ 0.00 \\ 
\midrule 
\textsc{\kinnm} - 2 & -0.15 $\pm$ 0.00 & -0.16 $\pm$ 0.00 & \textbf{-0.14 $\pm$ 0.00} & \textbf{-0.14 $\pm$ 0.00} \\ 
3 & -0.14 $\pm$ 0.00 & -0.15 $\pm$ 0.00 & \textbf{-0.13 $\pm$ 0.00} & \textbf{-0.13 $\pm$ 0.00} \\ 
4 & -0.14 $\pm$ 0.00 & -0.14 $\pm$ 0.00 & \textbf{-0.11 $\pm$ 0.00} & \textbf{-0.11 $\pm$ 0.00} \\ 
5 & -0.14 $\pm$ 0.00 & -0.14 $\pm$ 0.00 & \textbf{-0.11 $\pm$ 0.00} & \textbf{-0.11 $\pm$ 0.00} \\ 
\midrule 
\textsc{\naval} - 2 & 3.93 $\pm$ 0.05 & 3.82 $\pm$ 0.09 & 3.80 $\pm$ 0.13 & 3.84 $\pm$ 0.10 \\ 
3 & 3.83 $\pm$ 0.06 & 3.71 $\pm$ 0.12 & 3.86 $\pm$ 0.06 & \textbf{3.99 $\pm$ 0.04} \\ 
4 & \textbf{3.91 $\pm$ 0.05} & 3.66 $\pm$ 0.13 & \textbf{3.75 $\pm$ 0.11} & \textbf{3.85 $\pm$ 0.09} \\ 
5 & \textbf{3.92 $\pm$ 0.04} & 3.59 $\pm$ 0.12 & \textbf{3.97 $\pm$ 0.02} & 3.63 $\pm$ 0.22 \\ 
\midrule 
\textsc{\power} - 2 & 0.03 $\pm$ 0.00 & 0.03 $\pm$ 0.00 & \textbf{0.04 $\pm$ 0.00} & \textbf{0.04 $\pm$ 0.00} \\ 
3 & 0.03 $\pm$ 0.00 & 0.03 $\pm$ 0.00 & 0.03 $\pm$ 0.00 & 0.03 $\pm$ 0.00 \\ 
4 & 0.03 $\pm$ 0.00 & 0.03 $\pm$ 0.00 & 0.03 $\pm$ 0.00 & 0.03 $\pm$ 0.00 \\ 
5 & \textbf{0.03 $\pm$ 0.00} & 0.02 $\pm$ 0.00 & \textbf{0.03 $\pm$ 0.00} & \textbf{0.03 $\pm$ 0.00} \\ 
\midrule 
\textsc{\protein} - 2 & \textbf{-1.06 $\pm$ 0.00} & -1.07 $\pm$ 0.00 & \textbf{-1.06 $\pm$ 0.00} & \textbf{-1.06 $\pm$ 0.00} \\ 
3 & -1.04 $\pm$ 0.00 & -1.04 $\pm$ 0.00 & \textbf{-1.03 $\pm$ 0.00} & \textbf{-1.03 $\pm$ 0.00} \\ 
4 & -1.02 $\pm$ 0.00 & -1.02 $\pm$ 0.00 & \textbf{-1.00 $\pm$ 0.00} & -1.01 $\pm$ 0.00 \\ 
5 & \textbf{-1.00 $\pm$ 0.00} & -1.01 $\pm$ 0.00 & \textbf{-1.00 $\pm$ 0.00} & \textbf{-1.00 $\pm$ 0.00} \\ 
\midrule 
\textsc{\wine} - 2 & -1.18 $\pm$ 0.00 & -1.18 $\pm$ 0.00 & \textbf{-1.18 $\pm$ 0.00} & \textbf{-1.18 $\pm$ 0.00} \\ 
3 & -1.19 $\pm$ 0.00 & \textbf{-1.18 $\pm$ 0.00} & \textbf{-1.18 $\pm$ 0.00} & \textbf{-1.18 $\pm$ 0.00} \\ 
4 & -1.19 $\pm$ 0.00 & \textbf{-1.18 $\pm$ 0.00} & \textbf{-1.18 $\pm$ 0.00} & \textbf{-1.18 $\pm$ 0.00} \\ 
5 & -1.19 $\pm$ 0.00 & -1.19 $\pm$ 0.00 & -1.19 $\pm$ 0.00 & -1.19 $\pm$ 0.00 \\ 
\midrule 
\textsc{\yacht} - 2 & 1.88 $\pm$ 0.03 & 2.02 $\pm$ 0.01 & \textbf{2.07 $\pm$ 0.01} & \textbf{2.07 $\pm$ 0.01} \\ 
3 & 1.62 $\pm$ 0.01 & 1.86 $\pm$ 0.02 & \textbf{2.02 $\pm$ 0.01} & \textbf{2.03 $\pm$ 0.01} \\ 
4 & 1.47 $\pm$ 0.02 & 1.73 $\pm$ 0.02 & \textbf{1.93 $\pm$ 0.01} & 1.91 $\pm$ 0.01 \\ 
5 & 1.46 $\pm$ 0.02 & 1.59 $\pm$ 0.02 & \textbf{1.79 $\pm$ 0.02} & \textbf{1.79 $\pm$ 0.02} \\ 
\bottomrule
  \end{tabular}
\end{table}

\begin{table}[ht]
\footnotesize
  \caption{ELBO differences per datapoint. We report mean differences plus or minus one standard error over the splits.}
  \label{tab:dwp:uci_elbos2}
  \centering
  \begin{tabular}{rccc}
    \toprule
\{Dataset\}-\{Depth\}  & $\QAGW - \QGW$ & $\QABGW - \QGW$ & $\QAGW - \QABGW$ \\
\midrule  
\textsc{\boston} - 2   &   0.01 $\pm$ 0.01 &  0.01 $\pm$ 0.00 &   0.00 $\pm$ 0.01 \\
3                      &   0.01 $\pm$ 0.01 &  0.01 $\pm$ 0.01 &   0.00 $\pm$ 0.01 \\
4                      &   0.01 $\pm$ 0.01 &  0.01 $\pm$ 0.01 &   0.00 $\pm$ 0.01 \\
5                      &   0.01 $\pm$ 0.01 &  0.01 $\pm$ 0.01 &   0.00 $\pm$ 0.00 \\
\midrule 
\textsc{\concrete} - 2 &   0.02 $\pm$ 0.00 &  0.03 $\pm$ 0.00 &  -0.01 $\pm$ 0.00 \\
3                      &   0.02 $\pm$ 0.00 &  0.02 $\pm$ 0.00 &   0.00 $\pm$ 0.00 \\
4                      &   0.03 $\pm$ 0.00 &  0.03 $\pm$ 0.00 &   0.00 $\pm$ 0.00 \\
5                      &   0.04 $\pm$ 0.00 &  0.04 $\pm$ 0.00 &   0.00 $\pm$ 0.00 \\
\midrule 
\textsc{\energy} - 2   &   0.00 $\pm$ 0.00 &  0.00 $\pm$ 0.00 &   0.00 $\pm$ 0.00 \\
3                      &   0.01 $\pm$ 0.00 &  0.01 $\pm$ 0.00 &   0.00 $\pm$ 0.00 \\
4                      &   0.01 $\pm$ 0.00 &  0.01 $\pm$ 0.00 &   0.00 $\pm$ 0.00 \\
5                      &   0.02 $\pm$ 0.00 &  0.01 $\pm$ 0.00 &   0.01 $\pm$ 0.00 \\
\midrule 
\textsc{\kinnm} - 2    &   0.02 $\pm$ 0.00 &  0.02 $\pm$ 0.00 &   0.00 $\pm$ 0.00 \\
3                      &   0.02 $\pm$ 0.00 &  0.02 $\pm$ 0.00 &   0.00 $\pm$ 0.00 \\
4                      &   0.03 $\pm$ 0.00 &  0.03 $\pm$ 0.00 &   0.00 $\pm$ 0.00 \\
5                      &   0.03 $\pm$ 0.00 &  0.03 $\pm$ 0.00 &   0.00 $\pm$ 0.00 \\
\midrule 
\textsc{\naval} - 2    &  -0.02 $\pm$ 0.16 &  0.02 $\pm$ 0.13 &  -0.04 $\pm$ 0.16 \\
3                      &   0.15 $\pm$ 0.13 &  0.28 $\pm$ 0.13 &  -0.13 $\pm$ 0.07 \\
4                      &   0.09 $\pm$ 0.17 &  0.19 $\pm$ 0.16 &  -0.10 $\pm$ 0.14 \\
5                      &   0.38 $\pm$ 0.12 &  0.04 $\pm$ 0.25 &   0.34 $\pm$ 0.22 \\
\midrule 
\textsc{\power} - 2    &   0.01 $\pm$ 0.00 &  0.01 $\pm$ 0.00 &   0.00 $\pm$ 0.00 \\
3                      &   0.00 $\pm$ 0.00 &  0.00 $\pm$ 0.00 &   0.00 $\pm$ 0.00 \\
4                      &   0.00 $\pm$ 0.00 &  0.00 $\pm$ 0.00 &   0.00 $\pm$ 0.00 \\
5                      &   0.01 $\pm$ 0.00 &  0.01 $\pm$ 0.00 &   0.00 $\pm$ 0.00 \\
\midrule 
\textsc{\protein} - 2  &   0.01 $\pm$ 0.00 &  0.01 $\pm$ 0.00 &   0.00 $\pm$ 0.00 \\
3                      &   0.01 $\pm$ 0.00 &  0.01 $\pm$ 0.00 &   0.00 $\pm$ 0.00 \\
4                      &   0.02 $\pm$ 0.00 &  0.01 $\pm$ 0.00 &   0.01 $\pm$ 0.00 \\
5                      &   0.01 $\pm$ 0.00 &  0.01 $\pm$ 0.00 &   0.00 $\pm$ 0.00 \\
\midrule 
\textsc{\wine} - 2     &   0.00 $\pm$ 0.00 &  0.00 $\pm$ 0.00 &   0.00 $\pm$ 0.00 \\
3                      &   0.00 $\pm$ 0.00 &  0.00 $\pm$ 0.00 &   0.00 $\pm$ 0.00 \\
4                      &   0.00 $\pm$ 0.00 &  0.00 $\pm$ 0.00 &   0.00 $\pm$ 0.00 \\
5                      &   0.00 $\pm$ 0.00 &  0.00 $\pm$ 0.00 &   0.00 $\pm$ 0.00 \\
\midrule 
\textsc{\yacht} - 2    &   0.05 $\pm$ 0.01 &  0.05 $\pm$ 0.01 &   0.00 $\pm$ 0.01 \\
3                      &   0.16 $\pm$ 0.02 &  0.17 $\pm$ 0.02 &  -0.01 $\pm$ 0.01 \\
4                      &   0.20 $\pm$ 0.02 &  0.18 $\pm$ 0.02 &   0.02 $\pm$ 0.01 \\
5                      &   0.20 $\pm$ 0.03 &  0.20 $\pm$ 0.03 &   0.00 $\pm$ 0.03 \\
\bottomrule
  \end{tabular}
\end{table}

\begin{table}[ht]
\footnotesize
  \caption{Average test log likelihoods. We report mean plus or minus one standard error over the splits. Bold numbers correspond to the best models overall.}
  \label{tab:dwp:uci_lls}
  \centering
  \begin{tabular}{rcccc}
    \toprule
& & & DWP & \\
\{Dataset\}-\{Depth\} & DGP & $\QGW$ & $\QAGW$ & $\QABGW$ \\
\midrule   
\textsc{\boston} - 2 & -2.43 $\pm$ 0.05 & -2.40 $\pm$ 0.05 & \textbf{-2.37 $\pm$ 0.05} & \textbf{-2.37 $\pm$ 0.05}  \\ 
3 & -2.39 $\pm$ 0.04 & -2.38 $\pm$ 0.05 & \textbf{-2.35 $\pm$ 0.04} & \textbf{-2.35 $\pm$ 0.04} \\ 
4 & -2.41 $\pm$ 0.04 & -2.38 $\pm$ 0.04 & \textbf{-2.37 $\pm$ 0.04} & \textbf{-2.37 $\pm$ 0.04} \\ 
5 & -2.43 $\pm$ 0.04 & \textbf{-2.38 $\pm$ 0.04} & -2.39 $\pm$ 0.05 & \textbf{-2.38 $\pm$ 0.04} \\ 
\midrule 
\textsc{\concrete} - 2 & -3.10 $\pm$ 0.02 & -3.12 $\pm$ 0.02 & \textbf{-3.08 $\pm$ 0.02} & \textbf{-3.08 $\pm$ 0.02} \\ 
3 & -3.08 $\pm$ 0.02 & -3.10 $\pm$ 0.02 & \textbf{-3.06 $\pm$ 0.02} & -3.07 $\pm$ 0.02 \\ 
4 & -3.13 $\pm$ 0.02 & -3.12 $\pm$ 0.02 & \textbf{-3.07 $\pm$ 0.02} & \textbf{-3.07 $\pm$ 0.02}  \\ 
5 & -3.13 $\pm$ 0.02 & -3.13 $\pm$ 0.02 & \textbf{-3.07 $\pm$ 0.02} & -3.08 $\pm$ 0.02  \\ 
\midrule 
\textsc{\energy} - 2 & -0.70 $\pm$ 0.03 & -0.70 $\pm$ 0.03 & -0.70 $\pm$ 0.03 & -0.70 $\pm$ 0.03  \\ 
3 & -0.70 $\pm$ 0.03 & -0.70 $\pm$ 0.03 & -0.70 $\pm$ 0.03 & -0.70 $\pm$ 0.03  \\ 
4 & -0.70 $\pm$ 0.03 & -0.70 $\pm$ 0.03 & -0.70 $\pm$ 0.03 & -0.70 $\pm$ 0.03 \\ 
5 & -0.71 $\pm$ 0.03 & -0.71 $\pm$ 0.03 & \textbf{-0.70 $\pm$ 0.03} & \textbf{-0.70 $\pm$ 0.03}  \\ 
\midrule 
\textsc{\kinnm} - 2 & 1.35 $\pm$ 0.00 & 1.35 $\pm$ 0.00 & \textbf{1.36 $\pm$ 0.00} & \textbf{1.36 $\pm$ 0.00} \\ 
3 & 1.37 $\pm$ 0.00 & 1.37 $\pm$ 0.00 & \textbf{1.38 $\pm$ 0.00} & \textbf{1.38 $\pm$ 0.00}  \\ 
4 & 1.38 $\pm$ 0.00 & 1.39 $\pm$ 0.01 & \textbf{1.40 $\pm$ 0.00} & \textbf{1.40 $\pm$ 0.00}  \\ 
5 & 1.38 $\pm$ 0.00 & 1.40 $\pm$ 0.01 & \textbf{1.41 $\pm$ 0.01} & \textbf{1.41 $\pm$ 0.01}\\ 
\midrule 
\textsc{\naval} - 2 & \textbf{8.24 $\pm$ 0.06} & 8.23 $\pm$ 0.08 & 8.18 $\pm$ 0.11 & 8.18 $\pm$ 0.13  \\ 
3 & 8.15 $\pm$ 0.06 & 8.18 $\pm$ 0.07 & 8.27 $\pm$ 0.05 & \textbf{8.38 $\pm$ 0.03} \\ 
4 & 8.28 $\pm$ 0.04 & 8.17 $\pm$ 0.11 & 8.14 $\pm$ 0.13 & \textbf{8.32 $\pm$ 0.06} \\ 
5 & 8.28 $\pm$ 0.04 & 8.17 $\pm$ 0.07 & \textbf{8.40 $\pm$ 0.02} & 8.10 $\pm$ 0.19 \\ 
\midrule 
\textsc{\power} - 2 & -2.78 $\pm$ 0.01 & -2.77 $\pm$ 0.01 & \textbf{-2.76 $\pm$ 0.01} & \textbf{-2.76 $\pm$ 0.01} \\ 
3 & -2.77 $\pm$ 0.01 & \textbf{-2.76 $\pm$ 0.01} & \textbf{-2.76 $\pm$ 0.01} & \textbf{-2.76 $\pm$ 0.01}  \\ 
4 & -2.78 $\pm$ 0.01 & -2.77 $\pm$ 0.01 & \textbf{-2.75 $\pm$ 0.01} & \textbf{-2.75 $\pm$ 0.01}  \\ 
5 & -2.78 $\pm$ 0.01 & -2.77 $\pm$ 0.01 & \textbf{-2.76 $\pm$ 0.01} & \textbf{-2.76 $\pm$ 0.01} \\ 
\midrule 
\textsc{\protein} - 2 & -2.82 $\pm$ 0.00 & \textbf{-2.81 $\pm$ 0.00} & \textbf{-2.81 $\pm$ 0.00} & \textbf{-2.81 $\pm$ 0.00} \\ 
3 & -2.78 $\pm$ 0.00 & -2.77 $\pm$ 0.00 & \textbf{-2.76 $\pm$ 0.00} & \textbf{-2.76 $\pm$ 0.00}  \\ 
4 & -2.75 $\pm$ 0.00 & -2.73 $\pm$ 0.00 & \textbf{-2.72 $\pm$ 0.00} & -2.73 $\pm$ 0.01  \\ 
5 & -2.73 $\pm$ 0.01 & -2.72 $\pm$ 0.01 & -2.71 $\pm$ 0.01 & \textbf{-2.70 $\pm$ 0.00} \\ 
\midrule 
\textsc{\wine} - 2 & -0.96 $\pm$ 0.01 & -0.96 $\pm$ 0.01 & -0.96 $\pm$ 0.01 & -0.96 $\pm$ 0.01 \\ 
3 & -0.96 $\pm$ 0.01 & -0.96 $\pm$ 0.01 & -0.96 $\pm$ 0.01 & -0.96 $\pm$ 0.01 \\ 
4 & -0.96 $\pm$ 0.01 & -0.96 $\pm$ 0.01 & -0.96 $\pm$ 0.01 & -0.96 $\pm$ 0.01\\ 
5 & -0.96 $\pm$ 0.01 & -0.96 $\pm$ 0.01 & -0.96 $\pm$ 0.01 & -0.96 $\pm$ 0.01  \\ 
\midrule 
\textsc{\yacht} - 2 & -0.29 $\pm$ 0.12 & \textbf{-0.04 $\pm$ 0.10} & \textbf{-0.04 $\pm$ 0.08} & -0.08 $\pm$ 0.10  \\ 
3 & -0.63 $\pm$ 0.04 & -0.13 $\pm$ 0.07 & 0.12 $\pm$ 0.07 & \textbf{0.14 $\pm$ 0.06} \\ 
4 & -0.77 $\pm$ 0.07 & -0.26 $\pm$ 0.07 & \textbf{-0.04 $\pm$ 0.09} & \textbf{-0.04 $\pm$ 0.09}  \\ 
5 & -0.73 $\pm$ 0.07 & -0.58 $\pm$ 0.06 & -0.22 $\pm$ 0.09 & \textbf{-0.18 $\pm$ 0.07} \\ 
\bottomrule
  \end{tabular}
\end{table}


\begin{table}[ht]
\footnotesize
  \caption{Root mean square error (lower is better). We report the mean plus or minus one standard error over the splits. Bold numbers mark the best model(s) in each row.}
  \label{tab:dwp:uci_rmses}
  \centering
  \begin{tabular}{rcccc}
    \toprule
& & \multicolumn{3}{c}{DWP} \\
\cmidrule(lr){3-5}
\{Dataset\}-\{Depth\} & DGP & $\QGW$ & $\QAGW$ & $\QABGW$ \\
\midrule  
\textsc{\boston} - 2 & 2.72 $\pm$ 0.14 & 2.67 $\pm$ 0.14 & 2.60 $\pm$ 0.12 & \textbf{2.59 $\pm$ 0.13} \\ 
3 & 2.73 $\pm$ 0.14 & 2.66 $\pm$ 0.13 & \textbf{2.62 $\pm$ 0.13} & 2.63 $\pm$ 0.13 \\ 
4 & 2.76 $\pm$ 0.14 & 2.74 $\pm$ 0.15 & 2.71 $\pm$ 0.14 & \textbf{2.68 $\pm$ 0.14} \\ 
5 & 2.81 $\pm$ 0.14 & 2.82 $\pm$ 0.17 & \textbf{2.77 $\pm$ 0.16} & 2.81 $\pm$ 0.17 \\ 
\midrule 
\textsc{\concrete} - 2 & 5.41 $\pm$ 0.10 & 5.50 $\pm$ 0.12 & \textbf{5.29 $\pm$ 0.12} & 5.30 $\pm$ 0.12 \\ 
3 & 5.31 $\pm$ 0.11 & 5.32 $\pm$ 0.10 & \textbf{5.22 $\pm$ 0.12} & 5.23 $\pm$ 0.12 \\ 
4 & 5.54 $\pm$ 0.10 & 5.43 $\pm$ 0.11 & 5.24 $\pm$ 0.13 & \textbf{5.22 $\pm$ 0.13} \\ 
5 & 5.49 $\pm$ 0.10 & 5.53 $\pm$ 0.10 & 5.26 $\pm$ 0.11 & \textbf{5.24 $\pm$ 0.11} \\ 
\midrule 
\textsc{\energy} - 2 & 0.48 $\pm$ 0.01 & 0.48 $\pm$ 0.01 & 0.48 $\pm$ 0.01 & 0.48 $\pm$ 0.01 \\ 
3 & 0.48 $\pm$ 0.01 & 0.48 $\pm$ 0.01 & 0.48 $\pm$ 0.01 & 0.48 $\pm$ 0.01 \\ 
4 & 0.48 $\pm$ 0.01 & 0.48 $\pm$ 0.01 & 0.48 $\pm$ 0.01 & 0.48 $\pm$ 0.01 \\ 
5 & 0.49 $\pm$ 0.01 & \textbf{0.48 $\pm$ 0.01} & \textbf{0.48 $\pm$ 0.01} & \textbf{0.48 $\pm$ 0.01} \\ 
\midrule 
\textsc{\kinnm} - 2 & 0.06 $\pm$ 0.01 & 0.06 $\pm$ 0.01 & 0.06 $\pm$ 0.00 & 0.06 $\pm$ 0.00 \\ 
3 & 0.06 $\pm$ 0.01 & 0.06 $\pm$ 0.01 & 0.06 $\pm$ 0.00 & 0.06 $\pm$ 0.00 \\ 
4 & 0.06 $\pm$ 0.01 & 0.06 $\pm$ 0.01 & 0.06 $\pm$ 0.00 & 0.06 $\pm$ 0.00 \\ 
5 & 0.06 $\pm$ 0.01 & 0.06 $\pm$ 0.01 & 0.06 $\pm$ 0.00 & 0.06 $\pm$ 0.00 \\ 
\midrule 
\textsc{\naval} - 2 & 0.00 $\pm$ 0.00 & 0.00 $\pm$ 0.00 & 0.00 $\pm$ 0.00 & 0.00 $\pm$ 0.00 \\ 
3 & 0.00 $\pm$ 0.00 & 0.00 $\pm$ 0.00 & 0.00 $\pm$ 0.00 & 0.00 $\pm$ 0.00 \\ 
4 & 0.00 $\pm$ 0.00 & 0.00 $\pm$ 0.00 & 0.00 $\pm$ 0.00 & 0.00 $\pm$ 0.00 \\ 
5 & 0.00 $\pm$ 0.00 & 0.00 $\pm$ 0.00 & 0.00 $\pm$ 0.00 & 0.00 $\pm$ 0.00 \\ 
\midrule 
\textsc{\power} - 2 & 3.87 $\pm$ 0.04 & 3.83 $\pm$ 0.04 & 3.82 $\pm$ 0.04 & \textbf{3.81 $\pm$ 0.04} \\ 
3 & 3.87 $\pm$ 0.03 & 3.82 $\pm$ 0.04 & \textbf{3.81 $\pm$ 0.04} & \textbf{3.81 $\pm$ 0.04} \\ 
4 & 3.89 $\pm$ 0.04 & 3.84 $\pm$ 0.04 & \textbf{3.78 $\pm$ 0.04} & \textbf{3.78 $\pm$ 0.04} \\ 
5 & 3.88 $\pm$ 0.04 & 3.84 $\pm$ 0.04 & \textbf{3.80 $\pm$ 0.04} & \textbf{3.80 $\pm$ 0.04} \\ 
\midrule 
\textsc{\protein} - 2 & 4.08 $\pm$ 0.01 & 4.06 $\pm$ 0.01 & \textbf{4.05 $\pm$ 0.02} & \textbf{4.05 $\pm$ 0.01} \\ 
3 & 3.92 $\pm$ 0.02 & 3.90 $\pm$ 0.01 & 3.88 $\pm$ 0.01 & \textbf{3.87 $\pm$ 0.01} \\ 
4 & 3.82 $\pm$ 0.01 & 3.79 $\pm$ 0.01 & \textbf{3.75 $\pm$ 0.01} & 3.79 $\pm$ 0.02 \\ 
5 & 3.77 $\pm$ 0.02 & 3.76 $\pm$ 0.02 & 3.73 $\pm$ 0.02 & \textbf{3.70 $\pm$ 0.01} \\ 
\midrule 
\textsc{\wine} - 2 & 0.63 $\pm$ 0.01 & 0.63 $\pm$ 0.01 & 0.63 $\pm$ 0.01 & 0.63 $\pm$ 0.01 \\ 
3 & 0.63 $\pm$ 0.01 & 0.63 $\pm$ 0.01 & 0.63 $\pm$ 0.01 & 0.63 $\pm$ 0.01 \\ 
4 & 0.63 $\pm$ 0.01 & 0.63 $\pm$ 0.01 & 0.63 $\pm$ 0.01 & 0.63 $\pm$ 0.01 \\ 
5 & 0.63 $\pm$ 0.01 & 0.63 $\pm$ 0.01 & 0.63 $\pm$ 0.01 & 0.63 $\pm$ 0.01 \\ 
\midrule 
\textsc{\yacht} - 2 & 0.41 $\pm$ 0.04 & \textbf{0.33 $\pm$ 0.03} & \textbf{0.33 $\pm$ 0.03} & \textbf{0.33 $\pm$ 0.03} \\ 
3 & 0.53 $\pm$ 0.03 & 0.35 $\pm$ 0.03 & 0.31 $\pm$ 0.03 & \textbf{0.30 $\pm$ 0.03} \\ 
4 & 0.58 $\pm$ 0.05 & 0.41 $\pm$ 0.04 & \textbf{0.33 $\pm$ 0.03} & \textbf{0.33 $\pm$ 0.03} \\ 
5 & 0.57 $\pm$ 0.05 & 0.50 $\pm$ 0.04 & \textbf{0.37 $\pm$ 0.03} & 0.38 $\pm$ 0.03 \\ 
\bottomrule
  \end{tabular}
\end{table}

\FloatBarrier
\bibliography{ober_402}
% \bibliography
\end{document}
