%\documentclass{uai2023} % for initial submission
\documentclass[accepted]{uai2023} % after acceptance, for a revised
                                    % version; also before submission to
                                    % see how the non-anonymous paper
                                    % would look like

%% There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2023} % ptmx math instead of Computer
% Modern (has noticable issues)
% \documentclass[mathfont=newtx]{uai2023} % newtx fonts (improves upon
 % ptmx; less tested, no support)
% NOTE: Only keep *one* line above as appropriate, as it will be replaced
%       automatically for papers to be published. Do not make any other
%       change above this note for an accepted version.

%% Choose your variant of English; be consistent
\usepackage[american]{babel}
% \usepackage[british]{babel}

%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} % amsmath with fixes and additions
% \usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{tikz} % nice language for creating drawings and diagrams

\usepackage{subfigure}
\usepackage{algorithm}

% \usepackage{hyperref}
% \usepackage{nameref}
% \usepackage{zref-xr}
% \zxrsetup{toltxlabel}
% % uai_submission/
% \zexternaldocument*{achituve_219}

\usepackage{xr}
\makeatletter

% \addFileDependency{<file name with extension>}
% Announces <file> in the log (and in the \listfiles list) so that latexmk
% can pick it up as a dependency of this document.
% Every code line ends with `%' so the macro expands without spurious spaces.
\newcommand*{\addFileDependency}[1]{%
  % latexmk will find this if $recorder=0; however, in that case it will
  % ignore #1 if it is a .aux or .pdf file etc. and it exists. If it does not
  % exist, it will appear in the list of dependents regardless.
  \typeout{(#1)}%
  % Make the file appear in \listfiles -- not really necessary, and latexmk
  % does not use this.
  \@addtofilelist{#1}%
  % latexmk will find this message if #1 does not exist (yet).
  \IfFileExists{#1}{}{\typeout{No file #1.}}%
}
\makeatother

% \myexternaldocument{<base name without extension>}
% Imports the labels of <base name> via xr's \externaldocument and announces
% both <base name>.tex and <base name>.aux through \addFileDependency.
\newcommand*{\myexternaldocument}[1]{%
  \externaldocument{#1}%
  \addFileDependency{#1.tex}%
  \addFileDependency{#1.aux}%
}
%------------End of helper code--------------

% put all the external documents here!
% for cross referencing the main text
% PLEASE ONLY USE xr IN THE SUPPLEMENTARY MATERIAL. 
% In the main paper, hard code any cross-reference to the supplementary material. 
% The main document is imported exactly once: importing the same .aux file
% twice makes LaTeX report every imported label as multiply defined.
% \myexternaldocument is used so the .tex/.aux files are also announced to latexmk.
\myexternaldocument{achituve_219}

\input{math_commands}

%% Provided macros
% \smaller: Because the class footnote size is essentially LaTeX's \small,
%           redefining \footnotesize, we provide the original \footnotesize
%           using this macro.
%           (Use only sparingly, e.g., in drawings, as it is quite small.)

%% Self-defined macros

\title{Guided Deep Kernel Learning - Supplementary Material}

% The standard author block has changed for UAI 2023 to provide
% more space for long author lists and allow for complex affiliations
%
% All author information is automatically removed by the class for the
% anonymous submission version of your paper, so you can already add your
% information below.
%
% Add authors
\author[1]{\href{mailto:<Idan.Achituve@biu.ac.il>?Subject=GDKL on UAI2023}{Idan Achituve}}
\author[2, 3]{Gal Chechik}
\author[1]{Ethan Fetaya}
% Add affiliations after the authors
\affil[1]{%
    Faculty of Engineering\\
    Bar-Ilan University\\
    Israel
}
\affil[2]{%
    Computer Science Dept.\\
    Bar-Ilan University\\
    Israel
}
\affil[3]{%
    NVIDIA\\
    Israel
  }

\begin{document}
  
\onecolumn %% Turn this off if single column is desired for the supplement
\maketitle

\appendix
\section{Experimental Details}
\label{app:exp_details}
All experiments were done with GPyTorch \citep{gardner2018gpytorch} on NVIDIA GeForce RTX 2080 Ti having 11GB of memory. To compute the kernel of the NNGP we used the Neural Tangents library \citep{NovakXHLASS20}.

%\subsection{Datasets}
\textbf{Toy Dataset.} To construct the toy example we define the following target function (following \citealp{leclercq2018bayesian}): $f(x) = 0.6 - e^{-(x - 2)^2} - e^{-\frac{1}{10}(x - 6)^2} - \frac{1}{x^2+1}$. We sample uniformly at random $800$ points in $[-2, 12]$, evaluate the function on them and add observation noise of $\sigma_n^2 = 0.05$. Then we partition the dataset by random sampling into two subsets of $400$ points each, namely $\gD_1$ and $\gD_2$. We remove from $\gD_1$ all the points from the domain $[4, 8]$, and fit a GP with an RBF kernel to this dataset. We learn the hyper-parameters of this kernel using the ADAM optimizer \citep{KingmaB14} with the log marginal likelihood. Then, we evaluate $p(f_* | x_*, \gD_1)$ and $p(f_* | x_*, y_*, \gD_1)$ for all $(x_*, y_*) \in \gD_2$. 

\textbf{UCI.} We followed most of the training protocol suggested by \citet{ober2021promises}. 
To download and manipulate the datasets we used the Bayesian benchmarks git repository: {\href{https://github.com/hughsalimbeni/bayesian_benchmarks}{[https://github.com/hughsalimbeni/bayesian\_benchmarks]}}. 
To train the models, on Boston, Concrete, and Energy we perform 10-fold cross-validation using random seeds according to $90\%$--$10\%$ train-test splits. On Buzz and CTSlice we perform 3-fold cross-validation. We computed the normalization statistics (i.e., mean and std) based on the train split only and normalized all the data using them for model fitting. However, the results shown in the paper are on the unnormalized target values (i.e., the original values) which differ only in the scale. In all experiments, we used a fully connected network with the following architecture $[d, 100, 100, 100, 20]$ and ReLU activations. We used the same number of layers and activation for the infinite-width network. We initialized the variance of the observation noise to $\sim 0.02$ and learned it along with the model parameters. We used a weight decay of $10^{-4}$ for the DKL model only (no weight decay for GDKL), and we set the variance of the weights and biases of the NNGP to $1.6$ and $0.2$, respectively, as we found these values to work well on several toy examples. We trained all baseline models for $8000$ iterations. In GDKL we first pre-train the NNGP model observation noise and output scale of the kernel for $1000$ iterations, and then we train the DKL model for another $7000$ iterations in order to be comparable in the number of gradient steps used by the baseline methods. Also, on Buzz and CTSlice we used $\beta = 1.2$ as we found it to work slightly better than $1$ on a predefined validation set. We used a learning rate of $10^{-2}$ which drops by a factor of $10$ after $60\%$ and $80\%$ of the training (not including the pre-train stage of GDKL).

%To perform hyper-parameter search for all methods we sampled two different random seeds and evaluated the metrics on their test set. For the DKL model we searched over the weight decay and two normalization schemes to the network: scaling the features, and L2 normalization. We found that a weight decay of $1e-4$ generated good results, and that both normalization schemes degrade the results compared to no normalization. So, we didn't use normalization. For the NNGP model we searched over the weight and bias variances under the constraint that the weight variance is larger than the bias variance. The results were similar for all, so we fixed the weight variance to $1.6$ and the bias variance to $0.2$. beta = 1.2 on Buzz and CTSlice. No ARD.

\textbf{CIFAR-10/100.} CIFAR-10 and CIFAR-100 \citep{krizhevsky2009learning} contain 60K images each with 10 and 100 distinct classes respectively. We used the default train-test split of 50K--10K. To perform a hyper-parameter search, we allocated 5K examples from the training set for validation. To report the results in \Secref{sec:large_dataset} in the main text, we use all of the training data (i.e., training set and validation set). In all experiments on these datasets we used Wide Residual Networks \citep{ZagoruykoK16} with a widening factor $k = 5$ so that it fits in the GPU memory. We used the features obtained in the last layer, after applying average pooling, as the input to the GP layer for DKL, DUE, and GDKL. As for the DLVKL baselines, we used an additional linear layer of size $100$ which we split into two halves for the mean and variance vectors of the Gaussian. For the NNGP model we used the same network, but without the average pooling layer as it imposed a large computational burden. We note that this step may harm the performance of the NNGP model \citep{NovakXBLYHAPS19}. On these datasets, for all DKL-based methods, we used a dropout rate of $0.3$.

In the CIFAR-10 experiments of \Secref{sec:high_dim_data} we leverage the Dirichlet likelihood function suggested by \citet{milios2018dirichlet} with $\alpha_\epsilon = 0.01$. To make predictions with this likelihood one needs to sample from the posterior of $f_*$. Hence during test time, we sampled $1024$ values. During the training of GDKL, we sampled $256$ values as it uses the predictive distribution to train the model. We train all methods for a total of $7000$ gradient steps. For GDKL, we use the first $1000$ iterations to pre-train the NNGP model hyper-parameters and then train the DKL model for another $6000$ iterations. We used SGD with momentum of $0.9$, and an initial learning rate of $10^{-2}$ that drops by a factor of $10$ after $60\%$ and $80\%$ of the training (not including the pre-training stage for GDKL). We used a weight decay of $5 \times 10^{-4}$ in all DKL-based methods except for GDKL, for which it was set to $0$.
We did a grid search to select the best hyper-parameters for each method based on the validation set. The DLVKL objective has two KL divergences. We applied a grid search over their coefficients in $\{0.01, 0.1, 1.0\}$. Since DLVKL is based on variational inference, we set the number of inducing points to be the minimum value between the number of examples and $200$. The inducing locations were initialized using k-means. As in the official code of this baseline, we used a prior over the latent variable $z$ having a unit variance and a mean value that corresponds to the PCA projection of the input data. For the DUE baseline, we searched over the normalization coefficient and number of power iterations in $\{1, 3\}$, and for the NNGP model we searched over the weight variance in $\{1, 3, 5\}$ while keeping the variance of the bias fixed at $0.2$. We found that performance was similar for all values and picked the value $5$. For GDKL we also searched over the parameter $\beta \in \{0.1, 1.0\}$ and we found that using $\beta=1.0$ generated better results. Note that since this is a multi-output learning setup the KL divergence in the GDKL objective results in a summation over the classes. To make it invariant to the number of classes we take an average instead of a sum. This is effectively the same as scaling the KL divergence term by another factor that equals $0.1$. We did not perform data augmentations in these experiments.

%As the objective the DLVKL baseline involves two sampling processes, one for the number of $z's$ and another for the number of likelihood samples, we split .... No ARD.

In the experiments of \Secref{sec:large_dataset}, for the most part, we followed the protocol suggested by \citet{van2021feature}. Here, we used the Softmax likelihood function for training the deep kernels of all methods. To train the models we used $16$ samples from the latent GP when computing the likelihood, and on novel test points, we used $320$ samples. On CIFAR-10 we set the number of inducing points to $10$, and on CIFAR-100 to $200$ for all methods. All inducing locations were initialized using k-means. We set the number of training epochs to $200$ with a batch size of 256. To train GDKL we initially sampled $5\%$ of the data to train the hyper-parameters of the NNGP model for $1000$ iterations which correspond to $\sim 5$ epochs with a batch size of 256. Then we train the DKL model for an additional $194$ epochs. We used SGD with momentum of $0.9$, and an initial learning rate of $10^{-1}$ that drops by a factor of $10$ after $50\%$ and $75\%$ of the training (not including the pre-training stage for GDKL). We used a weight decay of $5 \times 10^{-4}$ for all methods. Here we used $\beta=0.1$ for GDKL. Also, as in the exact setting of GDKL, we approximate the objective in Eq.~10 with MC samples. Note, however, that unlike Eq.~7 where $\gD_1$ appears in all terms of the objective, here the corresponding element $\gB_1$ appears only in the posterior distribution of the NNGP model.  Thus, to be more data efficient, we use two samples. One with $\gB_1$ as the ``observed data'' and another one with $\gB_2$ as the ``observed data''. In DUE and DKL, we also searched over the coefficient of the KL divergence in the variational ELBO objective in $\{0.1, 1.0\}$. We used random cropping and random horizontal flip for data augmentation.

\section{The GDKL Objective}
\label{app_sec:gdkl_objective}

In \Secref{sec:GDKL} we presented the following objective:
\begin{equation}
\label{app_eq:hyb_obj}
    \E_{\rvx_*,y_*\sim\gD_2} D_{KL} [q_\theta(f_{*} | \rvx_*, \gD_1) || p(f_{*} |y_*, \rvx_*, \gD_1)].
\end{equation}
We now show that it is equivalent to the objective of Eq. 6 in the main text.

\begin{equation}
\label{app_eq:hyb_obj_derivation}
\begin{aligned}
    &\E_{\rvx_*,y_*\sim\gD_2}D_{KL} [q_\theta(f_{*} | \rvx_*, \gD_1) || p(f_{*} |\rvx_*, y_*, \gD_1)] \\
    &=\E_{\rvx_*,y_*\sim\gD_2} \E_{  q_\theta(f_{*} | \rvx_*, \gD_1)} [\log~\frac{q_\theta(f_{*} | \rvx_*, \gD_1)}{p(f_{*} |\rvx_*, y_*, \gD_1)}] \\
    &= \E_{\rvx_*,y_*\sim\gD_2} \E_{  q_\theta(f_{*} | \rvx_*, \gD_1)}[\log~q_\theta(f_{*} | \rvx_*, \gD_1) - \log~\frac{p(y_* |f_{*}) p(f_{*}| \rvx_*, \gD_1)}{p(y_* | \rvx_*, \gD_1)}] \\
    &= \E_{\rvx_*,y_*\sim\gD_2} \E_{  q_\theta(f_{*} | \rvx_*, \gD_1)}[\log~q_\theta(f_{*} | \rvx_*, \gD_1) - \log~p(y_* |f_{*}) - \log~p(f_{*}| \rvx_*, \gD_1) + \log~p(y_* | \rvx_*, \gD_1)]\\
    &= \E_{\rvx_*,y_*\sim\gD_2} \E_{  q_\theta(f_{*} | \rvx_*, \gD_1)}[- \log~p(y_* |f_{*})] + D_{KL}[q_\theta(f_{*} | \rvx_*, \gD_1) || p(f_{*}| \rvx_*, \gD_1)] + \text{const}.
\end{aligned}
\end{equation}
Where, in the third step we used Bayes rule, and in the last step the term $\log~p(y_* | \rvx_*, \gD_1)$ was absorbed into the constant, since it does not depend on $\theta$ and therefore does not affect the optimization process.
\\~\\
For a Gaussian likelihood, the posterior predictive distributions can be derived using standard Gaussian algebra \citep{gp_book}. For instance,
\begin{equation}
    \begin{aligned} 
    & p(f_*|\rvx_*, \gD_1)=\mathcal{N}(\mu_*^p, (\sigma_*^p)^2),\\
    &\mu_*^p=\rvk_{*}^T(\rmK+\sigma^2_n \bld{I})^{-1}\rvy,\\
    &(\sigma_*^p)^2 = k_{**} - \rvk_*^T(\rmK+\sigma^2_n\bld{I})^{-1}\rvk_*.
    \end{aligned} 
\end{equation}
Where, $K_{ij}=k(\rvx_i,\rvx_j)$, $k_{**} = k(\rvx_*, \rvx_*)$, and $\rvk_*[i]=k(\rvx_i, \rvx_*)$.
In a similar fashion $q_{\theta}(f_{*}| \rvx_*, \gD_1) = \gN(f_{*} | \mu_*^q, (\sigma_*^q)^2)$ can be obtained.

Now, the $D_{KL}$ term has the following closed-form solution:
\begin{equation}
\label{app_eq:kl}
\begin{aligned}
D_{KL}[q_\theta(f_{*} | \rvx_*, \gD_1) || p(f_{*}| \rvx_*, \gD_1)] = \log~\frac{\sigma_*^p}{\sigma_*^q} + \frac{(\sigma_*^q)^2 + (\mu_*^q - \mu_*^p)^2}{2 (\sigma_*^p)^2} - \frac{1}{2}.
\end{aligned}
\end{equation}

Similarly, the expected log-likelihood term can be computed analytically using the following:
\begin{equation}
\label{app_eq:ell}
\begin{aligned}
\E_{  q_\theta(f_{*} | \rvx_*, \gD_1)}[- \log~p(y_* |f_{*})] = \frac{1}{2}(\log~2\pi + \log~\sigma_n^2 + \frac{(y_* - \mu_*^q)^2 + (\sigma_*^q)^2}{\sigma_n^2}).
\end{aligned}
\end{equation}

\section{Computational Considerations}
\label{app_sec:comp_cons}
In this section we address the computational complexity of GDKL from two aspects: (1) the scaling limitations imposed by the NNGP kernel, and (2) comparison to baseline methods.

\textbf{Scaling limitations imposed by the NNGP kernel.} GDKL leverages NNGP kernels to learn the model. One may wonder if that may pose a limit to GDKL as computing the NNGP kernel can be costly. To address this concern we provide two important computational aspects to showcase that it is not an issue for GDKL. Furthermore, we argue that GDKL possesses computational advantages over using NNGPs directly, aside from the added benefit in performance.

\begin{itemize}
    \item First, as with standard NNGPs, GDKL inherits the flexibility in choosing the architecture of the NNGP and other design choices, such as the data resolution on which this kernel is computed. However, unlike traditional NNGPs where the posterior is heavily influenced by the NNGP kernel, in GDKL one may choose simpler kernels that possibly can be computed more efficiently without it resulting in a significant performance drop. This is because the NNGP basically serves as a prior in our model, aimed at calibrating the uncertainties of the DKL model. Hence, as we see it, GDKL can provide practitioners with the freedom to choose an NNGP model based on their hardware constraints, without compromising on model performance significantly. To validate that point, in \tblref{app_tab:shallower_NNGP} we present the effect of using only one residual block in each group, instead of four, in the kernel obtained by the wide-residual network architecture which we used throughout. The table shows the results in terms of test accuracy under the setup of \Secref{sec:high_dim_data}, but a similar trend was observed in terms of log-likelihood as well. The results in the table indicate a clear advantage, albeit small but still statistically significant, to the deeper NNGP model compared to the shallower one. However, when using the shallower NNGP model in the training process of GDKL, it has almost no effect on the test accuracy of GDKL.


    \begin{table*}[!h]
    \centering
    \caption{Effect of infinite network depth - test accuracy on CIFAR-10 with $\{50, 100, 200, 400, 800\}$ training examples.}
    %\vskip 0.1in
    \scalebox{.85}{
        \begin{tabular}{l c ccccc}
        \toprule
        && 50 & 100 & 200 & 400 & 800\\
        \midrule
        NNGP - One Res. Block &&  18.80 $\pm$ 0.11 & 21.10 $\pm$ 0.11 & 27.71 $\pm$ 0.13 & 32.18 $\pm$ 0.11 & 36.34 $\pm$ 0.09\\
        NNGP - Four Res. Blocks &&  18.87 $\pm$ 0.16 & 22.47 $\pm$ 0.16 & 28.94 $\pm$ 0.13 & 34.39 $\pm$ 0.09 & 38.73 $\pm$ 0.07\\
        \midrule
        GDKL - One Res. Block && 19.41 $\pm$ 0.05 & 26.70 $\pm$ 1.11 & 36.52 $\pm$ 0.94 & 42.60 $\pm$ 0.53 & 49.34 $\pm$ 0.52\\
        GDKL - Four Res. Blocks && 19.35 $\pm$ 0.65 & 26.53 $\pm$ 0.98 & 36.45 $\pm$ 0.72 & 42.97 $\pm$ 0.71 & 49.75 $\pm$ 0.94\\
        % GDKL && \textbf{-2.49 $\pm$ 0.20} && \textbf{3.03 $\pm$ 0.54} && \textbf{-0.50 $\pm$ 0.07} && \textbf{0.32 $\pm$ 0.06} && \textbf{-2.93 $\pm$ 0.10} && 4.58 $\pm$ 0.57 \\
        \bottomrule
        \end{tabular}
    }
    \label{app_tab:shallower_NNGP}
    \end{table*}

    \item Second, in terms of computation of the kernel. When dealing with small to medium-sized datasets, computing the NNGP kernel is not typically expensive and can be done offline before training. However, for larger datasets, scalability becomes more challenging. In these cases, usually one will need to deal with scalability issues of GPs in general, and the common practice is to use inducing point (IP) methods such as the one we proposed in the paper.  
    In the IP variant of GDKL, batches are sampled during training and the NNGP kernel depends only on the examples in them. Therefore, one option is to compute the NNGP kernel online based on the examples in each batch. Since the batch size is usually small (e.g., 256), efficient optimization packages (e.g., \citep{NovakXHLASS20}) can be utilized to compute these kernel matrices in a relatively efficient manner. However, this approach may still be slower than training standard DKLs. Therefore, a further improvement can be done with proper engineering work. One can pre-compute the kernel of the examples in each batch offline before training by taking into account the stochasticity in forming batches during training. These pre-calculated kernels can then be used during training of GDKL. This is an advantage of GDKL over the standard NNGP model, which requires computation of the full kernel matrix.  
\end{itemize}

Finally, we would like to highlight two important points. First, although the training of GDKL can be slower compared to DKL, when making predictions the models are equivalent. This is unlike the NNGP model which scales linearly with the number of training points. Second, there are ongoing efforts to scale NNGP models to larger datasets, as evident by recent studies such as \citep{adlam2023kernel}. These advancements in scaling NNGP models could potentially be leveraged in GDKL as well, if needed, to further improve its scalability and applicability to larger datasets.

\textbf{Comparison to baseline methods.}
The GDKL objective consists of two components in the loss function: a predictive distribution term and a KL divergence term. The most computationally intensive factor in calculating both terms is the inversion of the DKL and the NNGP kernel matrices, each with a complexity of $\mathcal{O}((n / 2)^3)$ in the exact case. The division by 2 is due to GDKL partitioning the data into two halves at each iteration, using one half for making predictions on the other half.  
When employing $m$ inducing points, the complexity of the computation involves the inverse of the kernel over the inducing locations and the inverse of the NNGP kernel over half of the examples in the batch, resulting in a complexity of $\mathcal{O}(m^3 + (\mathcal{B} / 2)^3)$, where $\mathcal{B}$ is the batch size. A potential speedup can be achieved by pre-calculating the inverse of the NNGP kernel offline before training, and using it during training. This way, during training the model's computational speed would be similar to standard DKL models.  

To estimate the training time difference between the methods we measured the average time per-iteration in seconds on CIFAR-10 based on $100$ iterations five times. In \tblref{app_tab:calc_time} we present these timing along with the constant time taken for computing the NNGP kernel before training.  According to the data presented in the table, GDKL tends to exhibit slightly slower performance compared to DKL and DUE. However, it's worth noting that the current implementation of the code is not highly optimized, and there are several aspects that can be improved, such as the computation time for the NNGP kernel and the average iteration time.

\begin{table*}[!h]
    \centering
    \caption{Average run time (Sec.) on CIFAR-10 with $\{50, 100, 200, 400, 800\}$ training examples. Results are based on $100$ iterations done $5$ times.}
    %\vskip 0.1in
    \scalebox{.85}{
        \begin{tabular}{l c ccccc}
        \toprule
        && 50 & 100 & 200 & 400 & 800\\
        \midrule
        NNGP (One Time) &&  5.44 $\pm$ 0.32 & 5.51 $\pm$ 0.07 & 5.91 $\pm$ 0.10 & 8.12 $\pm$ 0.23 & 16.70 $\pm$ 0.16\\
        \midrule
        DKL &&  0.06 $\pm$ 0.00 & 0.12 $\pm$ 0.00 & 0.22 $\pm$ 0.00 & 0.44 $\pm$ 0.00 & 0.65 $\pm$ 0.00\\
        DLVKL && 0.10 $\pm$ 0.01 & 0.18 $\pm$ 0.00 & 0.29 $\pm$ 0.00 & 0.56 $\pm$ 0.00 & 0.84 $\pm$ 0.00\\
        DUE && 0.08 $\pm$ 0.00 & 0.14 $\pm$ 0.00 & 0.23 $\pm$ 0.00 & 0.45 $\pm$ 0.00 & 0.67 $\pm$ 0.00\\
        GDKL && 0.07 $\pm$ 0.00 & 0.14 $\pm$ 0.00 & 0.24 $\pm$ 0.00 & 0.48 $\pm$ 0.00 & 0.76 $\pm$ 0.04\\
        \bottomrule
        \end{tabular}
    }
    \label{app_tab:calc_time}
    \end{table*}






\section{Additional Experiments}
\label{app_sec:add_exp}

\subsection{Objective Functions Analysis}
\label{app_sec:obj_fun_analysis}
\begin{table*}[!h]
\centering
\caption{Ablation on objective functions - test results on the UCI datasets based on ten random splits.}
%\vskip 0.1in
\scalebox{.85}{
    \begin{tabular}{l c cccc cccc cccc}
    \toprule
    && \multicolumn{3}{c}{Boston} && \multicolumn{3}{c}{Energy} && 
    \multicolumn{3}{c}{Concrete}\\
    \cmidrule(l){3-5}  \cmidrule(l){7-9} \cmidrule(l){11-13}
    && LL ($\uparrow$) && RMSE ($\downarrow$) && LL ($\uparrow$) && RMSE ($\downarrow$) && LL ($\uparrow$) && RMSE ($\downarrow$)\\
    \midrule
    NNGP && -2.49 $\pm$ 0.16 && 3.19 $\pm$ 0.70 && -1.07 $\pm$ 0.04 && 0.69 $\pm$ 0.02 && -3.15 $\pm$ 0.04 && 4.71 $\pm$ 0.49 \\
    \midrule
    $\ell_{dist}$ && \textbf{-2.51 $\pm$ 0.13} && 3.32 $\pm$ 0.52 && -1.23 $\pm$ 0.04 && 0.72 $\pm$ 0.08 && -3.03 $\pm$ 0.14 && 5.02 $\pm$ 0.58 \\
    $\ell_{pred}$ && -499. $\pm$ 229. && 3.27 $\pm$ 0.92 && -1.78 $\pm$ 1.10 && \textbf{0.28 $\pm$ 0.07} && -6.72 $\pm$ 1.52 && \textbf{4.21 $\pm$ 0.90} \\
    \midrule
    GDKL && \textbf{-2.49 $\pm$ 0.20} && \textbf{3.03 $\pm$ 0.54} && \textbf{-0.50 $\pm$ 0.07} && \textbf{0.32 $\pm$ 0.06} && \textbf{-2.93 $\pm$ 0.10} && 4.58 $\pm$ 0.57 \\
    \bottomrule
    \end{tabular}
}
\label{app_tab:objectives}
\end{table*}

Here we study the effect of using the GDKL objective vs $\ell_{dist}$ and $\ell_{pred}$ which were presented in \Secref{sec:GDKL}. We evaluated all methods on the UCI datasets Boston, Concrete, and Energy according to the setup described in \Secref{sec:small_sized_data}. The results are presented in \tblref{app_tab:objectives}. The table shows that the results are in agreement with our intuition. First, the behavior of $\ell_{dist}$ is similar to that of the NNGP, the model that it tries to distill. Second, $\ell_{pred}$ clearly overfits as indicated by the log-likelihood values, yet according to the RMSE it is able to maintain a good mean prediction. And lastly, our proposed approach balances well between these two extremes: it presents the best results on both metrics in almost all cases.

\subsection{Comparison to a standard Neural Network}
\label{sec:comparison_to_nn}

\begin{table*}[!h]
\centering
\caption{Comparison to a standard NN - test results on CIFAR-10 with $\{50, 100, 200, 400, 800\}$ training examples.}
%\vskip 0.1in
\scalebox{.65}{
    \begin{tabular}{l c ccccc c ccccc}
    \toprule
    && \multicolumn{5}{c} {Log-Likelihood} && \multicolumn{5}{c}{Accuracy} \\
    \cmidrule(l){3-7}  \cmidrule(l){9-13}
    && 50 & 100 & 200 & 400 & 800 && 50 & 100 & 200 & 400 & 800\\
    \midrule
    NN && -4.55 $\pm$ 0.17 & -4.63 $\pm$ 0.27 & -4.20 $\pm$ 0.26 & -3.76 $\pm$ 0.34 & -3.28 $\pm$ 0.21 &&  19.44 $\pm$ 1.17 & 20.94 $\pm$ 1.90 & 27.82 $\pm$ 1.67 & 34.96 $\pm$ 2.48 & 42.81 $\pm$ 2.06\\
    GDKL (Ours) && \textbf{-2.29 $\pm$ 0.01} & \textbf{-2.08 $\pm$ 0.03} & \textbf{-1.83 $\pm$ 0.01} & \textbf{-1.66 $\pm$ 0.01} & \textbf{-1.49 $\pm$ 0.01} &&  19.35 $\pm$ 0.65 & \textbf{26.53 $\pm$ 0.98} & \textbf{36.45 $\pm$ 0.72} & \textbf{42.97 $\pm$ 0.71} & \textbf{49.75 $\pm$ 0.94}\\
    % GDKL && \textbf{-2.49 $\pm$ 0.20} && \textbf{3.03 $\pm$ 0.54} && \textbf{-0.50 $\pm$ 0.07} && \textbf{0.32 $\pm$ 0.06} && \textbf{-2.93 $\pm$ 0.10} && 4.58 $\pm$ 0.57 \\
    \bottomrule
    \end{tabular}
}
\label{app_tab:comparison_nn}
\end{table*}
Here we compare GDKL to a standard NN on the CIFAR-10 dataset under the setup outlined in \Secref{sec:high_dim_data}. We present the test results when varying the number of training examples from $50$ to $800$ based on ten random seeds in \tblref{app_tab:comparison_nn}. From the table, GDKL demonstrates superior performance compared to a standard NN in terms of both log-likelihood and accuracy in almost all cases. Furthermore, when cross-referencing these results with those in \Figref{fig:high_dim_data}, in general, GP-based methods outperform standard NNs in these experiments, which were conducted in the low-data regime.


\subsection{Reliability diagrams}
\label{app_sec:reliability_diagrams}
Here we quantify the confidence through calibration for GDKL and baseline methods on the CIFAR-10 dataset in the setting described in \Secref{sec:high_dim_data}. We use reliability diagrams and the following metrics \citep{brier1950verification, guo2017calibration}: (1) Expected Calibration Error (ECE), which measures the weighted average distance between the classifier confidence and accuracy; (2) Maximum Calibration Error (MCE) which measures the maximum distance between the classifier confidence and accuracy; and (3) Brier score (BRI) which measures the average squared error between the prediction probabilities and the actual labels. \Figref{app_fig:calibration} shows that GDKL is best calibrated across all metrics in all cases when $n \geq 200$, and on smaller dataset sizes only the NNGP model is better. We note that temperature scaling can improve calibration, yet finding the right temperature requires having an additional validation set.


\begin{figure*}[!b]
    \centering
    \includegraphics[width=1.\textwidth]{figures/calibration.png}
    \caption{Reliability diagrams on CIFAR-10 test set for the experiments in \Secref{sec:high_dim_data} with training examples ranging from $50$ (top row) to $800$ (bottom row) examples.}
    \label{app_fig:calibration}
\end{figure*}

\subsection{Full Results}
In this section, we provide full numerical results for the experiments described in Sections~\ref{sec:small_sized_data} and~\ref{sec:high_dim_data}. 

\begin{table*}[!h]
\centering
\caption{Train results on small UCI datasets based on ten random splits.}
%\vskip 0.1in
\scalebox{.85}{
    \begin{tabular}{l c cccc cccc cccc}
    \toprule
    && \multicolumn{3}{c}{Boston} && \multicolumn{3}{c}{Energy} && 
    \multicolumn{3}{c}{Concrete}\\
    \cmidrule(l){3-5}  \cmidrule(l){7-9} \cmidrule(l){11-13}
    && LL ($\uparrow$) && RMSE ($\downarrow$) && LL ($\uparrow$) && RMSE ($\downarrow$) && LL ($\uparrow$) && RMSE ($\downarrow$)\\
    \midrule
    DKL && 1.45 $\pm$ 0.01 && 0.00 $\pm$ 0.00 && 1.36 $\pm$ 0.00 && 0.00 $\pm$ 0.00 && -1.89 $\pm$ 0.47 && 1.28 $\pm$ 0.03 \\
    NNGP && -2.07 $\pm$ 0.05 && 1.66 $\pm$ 0.09 && 1.00 $\pm$ 0.01 && 0.01 $\pm$ 0.00 && -2.96 $\pm$ 0.01 && 2.56 $\pm$ 0.07 \\
    GP-RBF && -1.85 $\pm$ 0.09 && 1.31 $\pm$ 0.16 && -0.18 $\pm$ 0.03 && 0.27 $\pm$ 0.01 && -2.15 $\pm$ 0.05 && 1.70 $\pm$ 0.01 \\
    \midrule
    GDKL && -1.91 $\pm$ 0.04 && 1.04 $\pm$ 0.06 && -0.08 $\pm$ 0.04 && 0.03 $\pm$ 0.00 && -2.64 $\pm$ 0.02 && 2.71 $\pm$ 0.07 \\
    \bottomrule
    \end{tabular}
}
\label{app_tab:fig2_train}
\end{table*}

\begin{table*}[!h]
\centering
\caption{Test results on small UCI datasets based on ten random splits.}
%\vskip 0.1in
\scalebox{.85}{
    \begin{tabular}{l c cccc cccc cccc}
    \toprule
    && \multicolumn{3}{c}{Boston} && \multicolumn{3}{c}{Energy} && 
    \multicolumn{3}{c}{Concrete}\\
    \cmidrule(l){3-5}  \cmidrule(l){7-9} \cmidrule(l){11-13}
    && LL ($\uparrow$) && RMSE ($\downarrow$) && LL ($\uparrow$) && RMSE ($\downarrow$) && LL ($\uparrow$) && RMSE ($\downarrow$)\\
    \midrule
    DKL && -553. $\pm$ 265. && 3.12 $\pm$ 0.71 && -3.59 $\pm$ 2.09 && 0.32 $\pm$ 0.07 && -3.89 $\pm$ 0.66 && 3.96 $\pm$ 0.63 \\
    NNGP && -2.49 $\pm$ 0.16 && 3.19 $\pm$ 0.70 && -1.07 $\pm$ 0.04 && 0.69 $\pm$ 0.02 && -3.15 $\pm$ 0.04 && 4.71 $\pm$ 0.49 \\
    GP-RBF && -2.39 $\pm$ 0.21 && 2.82 $\pm$ 0.64 && -0.51 $\pm$ 0.13 && 0.40 $\pm$ 0.05 && -2.90 $\pm$ 0.23 && 5.59 $\pm$ 0.80 \\
    \midrule
    GDKL && -2.47 $\pm$ 0.14 && 3.03 $\pm$ 0.49 && -0.31 $\pm$ 0.06 && 0.28 $\pm$ 0.03 && -2.92 $\pm$ 0.09 && 4.58 $\pm$ 0.60 \\
    \bottomrule
    \end{tabular}
}
\label{app_tab:fig2_test}
\end{table*}

\begin{table*}[!h]
\centering
\caption{Test results on Buzz, CTSlice, and CIFAR-10 - 50 examples.}
%\vskip 0.1in
\scalebox{.85}{
    \begin{tabular}{l c cccc cccc cccc}
    \toprule
    && \multicolumn{3}{c}{Buzz} && \multicolumn{3}{c}{CTSlice} && 
    \multicolumn{3}{c}{CIFAR-10}\\
    \cmidrule(l){3-5}  \cmidrule(l){7-9} \cmidrule(l){11-13}
    && LL ($\uparrow$) && RMSE ($\downarrow$) && LL ($\uparrow$) && RMSE ($\downarrow$) && LL ($\uparrow$) && Acc. ($\uparrow$)\\
    \midrule
    DKL && -421. $\pm$ 208. && 1.61 $\pm$ 0.26 && -283. $\pm$ 577. && 20.07 $\pm$ 3.54 && -2.61 $\pm$ 0.05 && 20.26 $\pm$ 0.97 \\
    NNGP && -1.29 $\pm$ 0.04 && 1.06 $\pm$ 0.13 && -3.94 $\pm$ 0.05 && 12.35 $\pm$ 0.93 && -2.23 $\pm$ 0.00 && 18.87 $\pm$ 0.16 \\
    GP-RBF && -1.61 $\pm$ 0.27 && 1.30 $\pm$ 0.16 && -4.53 $\pm$ 0.00 && 22.34 $\pm$ 0.15 && -- $\pm$ -- && -- $\pm$ -- \\
    DLVKL && -- $\pm$ -- && -- $\pm$ -- && -- $\pm$ -- && -- $\pm$ -- && -2.42 $\pm$ 0.04 && 18.79 $\pm$ 0.86 \\
    DUE && -- $\pm$ -- && -- $\pm$ -- && -- $\pm$ -- && -- $\pm$ -- && -2.64 $\pm$ 0.07 && 20.18 $\pm$ 0.85 \\
    \midrule
    GDKL && -1.23 $\pm$ 0.09 && 0.86 $\pm$ 0.08 && -3.92 $\pm$ 0.11 && 11.73 $\pm$ 1.10 && -2.29 $\pm$ 0.02 && 19.35 $\pm$ 0.65 \\
    \bottomrule
    \end{tabular}
}
\label{app_tab:fig3_50}
\end{table*}

\begin{table*}[!h]
\centering
\caption{Test results on Buzz, CTSlice, and CIFAR-10 (100 examples).}
%\vskip 0.1in
\scalebox{.85}{
    \begin{tabular}{l c cccc cccc cccc}
    \toprule
    && \multicolumn{3}{c}{Buzz} && \multicolumn{3}{c}{CTSlice} && 
    \multicolumn{3}{c}{CIFAR-10}\\
    \cmidrule(l){3-5}  \cmidrule(l){7-9} \cmidrule(l){11-13}
    && LL ($\uparrow$) && RMSE ($\downarrow$) && LL ($\uparrow$) && RMSE ($\downarrow$) && LL ($\uparrow$) && Acc. ($\uparrow$)\\
    \midrule
    DKL && -336. $\pm$ 185. && 1.38 $\pm$ 0.25 && -518. $\pm$ 350. && 13.10 $\pm$ 5.09 && -2.73 $\pm$ 0.04 && 24.67 $\pm$ 1.26 \\
    NNGP && -1.18 $\pm$ 0.04 && 0.98 $\pm$ 0.14 && -3.75 $\pm$ 0.03 && 10.25 $\pm$ 0.46 && -2.14 $\pm$ 0.00 && 22.47 $\pm$ 0.17 \\
    GP-RBF && -1.36 $\pm$ 0.27 && 1.10 $\pm$ 0.09 && -4.52 $\pm$ 0.00 && 22.33 $\pm$ 0.14 && -- $\pm$ -- && -- $\pm$ -- \\
    DLVKL && -- $\pm$ -- && -- $\pm$ -- && -- $\pm$ -- && -- $\pm$ -- && -2.40 $\pm$ 0.07 && 22.81 $\pm$ 1.68 \\
    DUE && -- $\pm$ -- && -- $\pm$ -- && -- $\pm$ -- && -- $\pm$ -- && -2.77 $\pm$ 0.14 && 24.32 $\pm$ 1.53 \\
    \midrule
    GDKL && -1.14 $\pm$ 0.05 && 0.77 $\pm$ 0.06 && -3.64 $\pm$ 0.08 && 9.05 $\pm$ 0.77 && -2.08 $\pm$ 0.03 && 26.53 $\pm$ 0.98 \\
    \bottomrule
    \end{tabular}
}
\label{app_tab:fig3_100}
\end{table*}



\begin{table*}[!h]
\centering
\caption{Test results on Buzz, CTSlice, and CIFAR-10 (200 examples).}
%\vskip 0.1in
\scalebox{.85}{
    \begin{tabular}{l c cccc cccc cccc}
    \toprule
    && \multicolumn{3}{c}{Buzz} && \multicolumn{3}{c}{CTSlice} && 
    \multicolumn{3}{c}{CIFAR-10}\\
    \cmidrule(l){3-5}  \cmidrule(l){7-9} \cmidrule(l){11-13}
    && LL ($\uparrow$) && RMSE ($\downarrow$) && LL ($\uparrow$) && RMSE ($\downarrow$) && LL ($\uparrow$) && Acc. ($\uparrow$)\\
    \midrule
    DKL && -178. $\pm$ 127. && 1.39 $\pm$ 0.21 && -459. $\pm$ 210. && 8.18 $\pm$ 0.74 && -2.58 $\pm$ 0.04 && 33.65 $\pm$ 1.24 \\
    NNGP && -1.09 $\pm$ 0.02 && 0.91 $\pm$ 0.10 && -3.58 $\pm$ 0.05 && 8.89 $\pm$ 0.36 && -2.02 $\pm$ 0.00 && 28.94 $\pm$ 0.13 \\
    GP-RBF && -1.09 $\pm$ 0.08 && 0.90 $\pm$ 0.11 && -4.42 $\pm$ 0.19 && 20.21 $\pm$ 3.41 && -- $\pm$ -- && -- $\pm$ -- \\
    DLVKL && -- $\pm$ -- && -- $\pm$ -- && -- $\pm$ -- && -- $\pm$ -- && -2.42 $\pm$ 0.09 && 27.59 $\pm$ 1.68 \\
    DUE && -- $\pm$ -- && -- $\pm$ -- && -- $\pm$ -- && -- $\pm$ -- && -2.56 $\pm$ 0.21 && 33.10 $\pm$ 1.18 \\
    \midrule
    GDKL && -1.08 $\pm$ 0.04 && 0.77 $\pm$ 0.06 && -3.45 $\pm$ 0.07 && 7.73 $\pm$ 0.66 && -1.83 $\pm$ 0.01 && 36.45 $\pm$ 0.72 \\
    \bottomrule
    \end{tabular}
}
\label{app_tab:fig3_200}
\end{table*}

\begin{table*}[!h]
\centering
\caption{Test results on Buzz, CTSlice, and CIFAR-10 (400 examples).}
%\vskip 0.1in
\scalebox{.85}{
    \begin{tabular}{l c cccc cccc cccc}
    \toprule
    && \multicolumn{3}{c}{Buzz} && \multicolumn{3}{c}{CTSlice} && 
    \multicolumn{3}{c}{CIFAR-10}\\
    \cmidrule(l){3-5}  \cmidrule(l){7-9} \cmidrule(l){11-13}
    && LL ($\uparrow$) && RMSE ($\downarrow$) && LL ($\uparrow$) && RMSE ($\downarrow$) && LL ($\uparrow$) && Acc. ($\uparrow$)\\
    \midrule
    DKL && -136. $\pm$ 82.2 && 1.34 $\pm$ 0.35 && -285. $\pm$ 130. && 7.22 $\pm$ 2.23 && -2.36 $\pm$ 0.10 && 40.96 $\pm$ 0.89 \\
    NNGP && -1.00 $\pm$ 0.01 && 0.86 $\pm$ 0.09 && -3.39 $\pm$ 0.03 && 7.57 $\pm$ 0.24 && -1.92 $\pm$ 0.00 && 34.39 $\pm$ 0.09 \\
    GP-RBF && -0.97 $\pm$ 0.03 && 0.79 $\pm$ 0.05 && -3.77 $\pm$ 0.29 && 10.84 $\pm$ 0.86 && -- $\pm$ -- && -- $\pm$ -- \\
    DLVKL && -- $\pm$ -- && -- $\pm$ -- && -- $\pm$ -- && -- $\pm$ -- && -2.43 $\pm$ 0.09 && 33.48 $\pm$ 1.54 \\
    DUE && -- $\pm$ -- && -- $\pm$ -- && -- $\pm$ -- && -- $\pm$ -- && -2.20 $\pm$ 0.21 && 40.33 $\pm$ 0.76 \\
    \midrule
    GDKL && -1.01 $\pm$ 0.03 && 0.71 $\pm$ 0.04 && -3.20 $\pm$ 0.05 && 5.98 $\pm$ 0.42 && -1.66 $\pm$ 0.01 && 42.97 $\pm$ 0.71 \\
    \bottomrule
    \end{tabular}
}
\label{app_tab:fig3_400}
\end{table*}

\begin{table*}[!h]
\centering
\caption{Test results on Buzz, CTSlice, and CIFAR-10 (800 examples).}
%\vskip 0.1in
\scalebox{.85}{
    \begin{tabular}{l c cccc cccc cccc}
    \toprule
    && \multicolumn{3}{c}{Buzz} && \multicolumn{3}{c}{CTSlice} && 
    \multicolumn{3}{c}{CIFAR-10}\\
    \cmidrule(l){3-5}  \cmidrule(l){7-9} \cmidrule(l){11-13}
    && LL ($\uparrow$) && RMSE ($\downarrow$) && LL ($\uparrow$) && RMSE ($\downarrow$) && LL ($\uparrow$) && Acc. ($\uparrow$)\\
    \midrule
    DKL && -114. $\pm$ 94.1 && 1.09 $\pm$ 0.12 && -246. $\pm$ 110. && 5.70 $\pm$ 1.47 && -2.13 $\pm$ 0.10 && 48.57 $\pm$ 0.59 \\
    NNGP && -0.94 $\pm$ 0.01 && 0.81 $\pm$ 0.13 && -3.15 $\pm$ 0.02 && 6.25 $\pm$ 0.19 && -1.80 $\pm$ 0.06 && 38.73 $\pm$ 0.07 \\
    GP-RBF && -0.89 $\pm$ 0.01 && 0.72 $\pm$ 0.01 && -3.14 $\pm$ 0.11 && 7.61 $\pm$ 0.56 && -- $\pm$ -- && -- $\pm$ -- \\
    DLVKL && -- $\pm$ -- && -- $\pm$ -- && -- $\pm$ -- && -- $\pm$ -- && -2.22 $\pm$ 0.06 && 43.52 $\pm$ 0.75 \\
    DUE && -- $\pm$ -- && -- $\pm$ -- && -- $\pm$ -- && -- $\pm$ -- && -1.91 $\pm$ 0.08 && 49.22 $\pm$ 0.79 \\
    \midrule
    GDKL && -0.95 $\pm$ 0.02 && 0.67 $\pm$ 0.03 && -2.99 $\pm$ 0.06 && 4.87 $\pm$ 0.45 && -1.49 $\pm$ 0.02 && 49.75 $\pm$ 0.94 \\
    \bottomrule
    \end{tabular}
}
\label{app_tab:fig3_800}
\end{table*}

\clearpage
\bibliography{achituve_219}


\end{document}