% \documentclass{uai2023} % for initial submission
\documentclass[accepted]{uai2023} % after acceptance, for a revised
                                    % version; also before submission to
                                    % see how the non-anonymous paper
                                    % would look like

%% There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2023} % ptmx math instead of Computer
% Modern (has noticable issues)
% \documentclass[mathfont=newtx]{uai2023} % newtx fonts (improves upon
 % ptmx; less tested, no support)
% NOTE: Only keep *one* line above as appropriate, as it will be replaced
%       automatically for papers to be published. Do not make any other
%       change above this note for an accepted version.

%% Choose your variant of English; be consistent
\usepackage[american]{babel}
% \usepackage[british]{babel}

%% Some suggested packages, as needed:
% NOTE: this block used to appear twice (template copy + project copy).
%       It is now loaded once; the project copy's choices are kept.
\usepackage{natbib} % has a nice set of citation styles and commands
  % Declare exactly one bibliography style: every \bibliographystyle writes a
  % \bibstyle line to the .aux file, and BibTeX rejects a second one.
  \bibliographystyle{abbrvnat}
  \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} % amsmath with fixes and additions
\usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{tikz} % nice language for creating drawings and diagrams

% for cross referencing the main text
% PLEASE ONLY USE xr IN THE SUPPLEMENTARY MATERIAL.
% In the main paper, hard code any cross-reference to the supplementary material.
% Labels imported from the main document are referenced with the prefix `main`
% prepended to the original label name.
\usepackage{xr}
\externaldocument[main]{li_470}

% packages from NNGP_Prior
% \addbibresource{references.bib}
% \usepackage[percent]{overpic}
\usepackage{wrapfig}

\usepackage[ruled]{algorithm2e}

\usepackage{flafter}
\usepackage[utf8]{inputenc}
\usepackage{amsmath}
\usepackage{amssymb}
\usepackage{amsfonts}

%\usepackage[hidelinks]{hyperref}
\usepackage{graphicx}
% adjustbox must be loaded: the figures in this document call
% \adjincludegraphics with its Clip key, which is undefined without it.
\usepackage[export]{adjustbox}
\usepackage[colorinlistoftodos]{todonotes}

% Default caption fonts (small captions, footnotesize sub-captions, bold labels).
\usepackage{caption}
\usepackage{subcaption}
\captionsetup{font=small,labelfont={bf}}
\captionsetup[sub]{font=footnotesize,labelfont={bf}}
\linepenalty=1000  % Fewer side effects.


% Colour palette. The RGB triples follow Paul Tol's "vibrant" and "muted"
% qualitative colour schemes.
\definecolor{vibrant-blue}{RGB}{0,119,187}
\definecolor{vibrant-cyan}{RGB}{51,187,238}
\definecolor{vibrant-teal}{RGB}{0,153,136}
\definecolor{vibrant-orange}{RGB}{238,119,51}
\definecolor{vibrant-red}{RGB}{204,51,17}
\definecolor{vibrant-magenta}{RGB}{238,51,119}
\definecolor{vibrant-grey}{RGB}{187,187,187}

\definecolor{muted-indigo}{RGB}{51,34,136}
\definecolor{muted-cyan}{RGB}{136,204,238}
\definecolor{muted-teal}{RGB}{68,170,153}
\definecolor{muted-green}{RGB}{17,119,51}
\definecolor{muted-olive}{RGB}{153,153,51}
\definecolor{muted-sand}{RGB}{221,204,119}
% Rose is #CC6677 in the muted scheme, i.e. red channel 204 (was 221, the
% red channel of the neighbouring "sand" entry).
\definecolor{muted-rose}{RGB}{204,102,119}
\definecolor{muted-wine}{RGB}{136,34,85}
\definecolor{muted-purple}{RGB}{170,68,153}

% In-text references.
% cleveref is loaded with capitalized reference names and with the name
% included in the hyperlink.
\usepackage[capitalize,nameinlink]{cleveref}
% Register an `objective` cross-reference type, aliased to `equation`, with
% identical lower-case (\crefname) and sentence-initial (\Crefname) names.
\crefalias{objective}{equation}
\crefname{objective}{Objective}{Objectives}
\Crefname{objective}{Objective}{Objectives}
% Default label format: the label (#1) is set in bold muted-indigo and the
% whole label is wrapped in parentheses; #2/#3 mark the hyperlink start/end.
\crefdefaultlabelformat{(#2\color{muted-indigo}\textbf{#1}#3)}

% Figure captions.
% Caption formats with explicit font sizes: 8pt on 10pt leading for captions,
% 7pt on 8pt leading for sub-captions (#1 label, #2 separator, #3 text).
\DeclareCaptionFormat{captionsize}{\fontsize{8}{10}\selectfont#1#2#3}
\DeclareCaptionFormat{subcaptionsize}{\fontsize{7}{8}\selectfont#1#2#3}
% Apply the formats and colour the bold labels in muted-indigo.
% NOTE(review): an earlier \captionsetup in this preamble also sets
% font/labelfont; these later calls are applied on top -- confirm the
% combination is intended.
\captionsetup{format=captionsize,labelfont={bf,color=muted-indigo}}
\captionsetup[sub]{format=subcaptionsize,labelfont={bf,color=muted-indigo}}

% Project jargon.
% Abbreviations are defined once here and used in the text via \gls / \glspl
% (and the capitalized \Gls / \Glspl variants).
\usepackage{glossaries-extra}
\setabbreviationstyle[long]{long}             % do not italicize the long form
\setabbreviationstyle[long-noshort]{long-em}  % italicize the long form
\newabbreviation[category=long-short]{nn}{NN}{neural network}
\newabbreviation[category=long-short]{mlp}{MLP}{multi-layer perceptron}
% Explicit long plural: the default (append "s") would give "Gaussian processs".
\newabbreviation[category=long-short,longplural={Gaussian processes}]{gp}{GP}{Gaussian process}
\newabbreviation[category=long-short]{smk}{SMK}{spectral mixture kernel}
\newabbreviation[category=long-short]{mk}{MK}{Mat\'{e}rn kernel}
\newabbreviation[category=long-short]{sgd}{SGD}{stochastic gradient descent}
\newabbreviation[category=long-short]{relu}{ReLU}{rectified linear unit}
\newabbreviation[category=long-short]{relu-nn}{rectifier NN}{rectifier neural network}
\newabbreviation[category=long-short]{gelu}{GELU}{Gaussian error linear unit}
\newabbreviation[category=long-short]{silu}{SiLU}{sigmoid-weighted linear unit}
\newabbreviation[category=long-short]{sin}{$\sin$}{sine}
\newabbreviation[category=long-short]{sin-nn}{sinusoidal NN}{sinusoidal neural network}
\newabbreviation[category=long-short]{tanh}{$\tanh$}{hyperbolic tangent}
% \erf must exist as a math operator before it is used as a short form.
\DeclareMathOperator{\erf}{erf}
\newabbreviation[category=long-short]{erf}{$\erf$}{Gauss error function}
% "Adam" derives from "adaptive moment estimation" (not "adaptive momentum").
\newabbreviation[category=long-short]{adam}{Adam}{adaptive moment estimation}
\newabbreviation[category=long-short]{ntk}{NTK}{neural tangent kernel}
\newabbreviation[category=long-short]{ard}{ARD}{automatic relevance determination}
\newabbreviation[category=long-short]{uci}{UCI}{UC Irvine Machine Learning Repository}
\newabbreviation[category=long-short]{mse}{MSE}{mean-squared error}
\newabbreviation[category=long-short]{ml}{ML}{marginal likelihood}
\newabbreviation[category=long-short]{mll}{MLL}{marginal log-likelihood}
\newabbreviation[category=long-short]{mcmc}{MCMC}{Markov chain Monte Carlo}

% Glossary references are not turned into hyperlinks.
\glsdisablehyper

%% Math symbols.
% \input{front-matter/math}
% Bold upright symbols.
\newcommand*{\X}{\mathbf{X}}
\newcommand*{\y}{\mathbf{y}}

% Model-related symbols: family, single model, running index, final index.
\newcommand*{\modelfamily}{\mathfrak{F}}
\newcommand*{\model}{\mathbf{g}}
\newcommand*{\modelindex}{r}
\newcommand*{\modelindexfinal}{R}
% Dataset-related symbols: family, single dataset, running index, final index.
\newcommand*{\datasetfamily}{\mathfrak{D}}
\newcommand*{\dataset}{\mathcal{D}}
\newcommand*{\datasetindex}{s}
\newcommand*{\datasetindexfinal}{S}
\newcommand*{\nbpairs}{T}

% Math / text macros.
% \norm{x}: auto-sized double vertical bars around the argument.
\newcommand{\norm}[1]{\left\lVert#1\right\rVert}
% \Tau: upright capital T.
\newcommand*{\Tau}{\mathrm{T}}


% Latin abbreviations for running text.
% \etal ends in a single explicit inter-word space (`\ `): TeX swallows the
% source space after a control word, and `\ ` also avoids end-of-sentence
% spacing after the period. (The former `~\ ` printed two spaces.)
\newcommand{\etal}{\textit{et~al}.\ }
% \eg, \ie, \cf end in a tie so the abbreviation is never separated from the
% word that follows it.
\newcommand{\eg}{\textit{e.g.,}~}
\newcommand{\ie}{\textit{i.e.,}~}
% "cf." abbreviates the single word "confer": one period, no trailing comma.
\newcommand{\cf}{\textit{cf.}~}

%% Provided macros
% \smaller: Because the class footnote size is essentially LaTeX's \small,
%           redefining \footnotesize, we provide the original \footnotesize
%           using this macro.
%           (Use only sparingly, e.g., in drawings, as it is quite small.)
%
% Citation aliases (not part of the class): \parencite and \textcite are
% mapped onto natbib's \citep and \citet, respectively.
\newcommand{\parencite}{\citep}
\newcommand{\textcite}{\citet}

%% Self-defined macros
% \swap[sep]{a}{b} typesets the two mandatory arguments in reverse order with
% the optional separator (default `-`) between them: \swap{a}{b} -> b-a.
\newcommand{\swap}[3][-]{#3#1#2} % just an example

% Title of the supplement. (A second, plain \title used to precede this one;
% it was overridden by this declaration and has been removed.)
\title{Gaussian Process Surrogate Models for Neural Networks \\(Supplementary Material)}

% The standard author block has changed for UAI 2023 to provide
% more space for long author lists and allow for complex affiliations
%
% All author information is automatically removed by the class for the
% anonymous submission version of your paper, so you can already add your
% information below.
%
% Add authors
\author[1]{\href{mailto:<michaelyli@stanford.edu>?Subject=Your UAI 2023 paper}{Michael~Y.~Li}{}}
\author[2]{Erin~Grant}
\author[3]{Thomas~L.~Griffiths}
% \author[3]{Further~Coauthor}
% \author[1]{Further~Coauthor}
% \author[3]{Further~Coauthor}
% \author[3,1]{Further~Coauthor}
% Add affiliations after the authors
\affil[1]{%
    Department of Computer Science\\
    Stanford University\\
    Stanford, California, USA 
}
\affil[2]{%
    Gatsby Computational Neuroscience Unit\\
    University College London\\
    London, UK
}
\affil[3]{%
    Departments of Psychology and Computer Science\\
    Princeton University\\
    Princeton, NJ, USA
}
  
  
\begin{document}
  
\onecolumn %% Single-column layout for the supplement; remove this line to keep the class default
\maketitle

% This Supplementary Material should be submitted as a separate file. Please do not append the Supplementary Material to the main paper. 

% Fig. \ref{fig:pitt} and Eq \ref{eq:example} in the main paper can be cross referenced using \texttt{xr}. 

\appendix
\section{Additional experimental results}
\subsection{Ranking NN generalization with the GP marginal likelihood}
In previous sections, we demonstrated that \gls{gp} surrogate models could yield insight into \gls{nn} behavior.
The benefits of \glspl{gp} extend beyond this.
Since the \gls{gp} marginal likelihood has a closed-form expression, many have advocated for using the marginal likelihood in model selection and as an indicator of expected generalization performance \parencite{Mackay1992APB}.
In this section, we leverage the learned \gls{gp} surrogate to \textit{rank NNs by their generalization error with the GP marginal likelihood}.
In particular, we learn \gls{gp} surrogates from different \glspl{nn} at random initialization, and
we then study if the marginal likelihood of the surrogates can rank the \glspl{nn} by test error after training.
In the following experiments with varying classes of \gls{nn} families, we find that we can indeed predict test error using the marginal likelihood  of the training set under the learned surrogate \gls{gp}.

\subsubsection{The idealized case: Large-width \glspl{nn}}
\label{sec:gen-rank-large-width}

Before we consider arbitrary \gls{nn} families, we check that the marginal likelihood is predictive in an idealized setting.
In particular, we consider large-width \glspl{nn} whose infinite-width analogs are equivalent to \glspl{gp} \parencite{leedeep}.
If the marginal likelihood is not predictive in this case, in which the kernel function can be analytically determined, it is unlikely to be useful in a general setting where the kernel is learned and \glspl{gp} approximate \gls{nn} priors but are not equivalent.

\paragraph{\Gls{nn} hyperparameters.}
We consider \glspl{nn} with \gls{sin} or \gls{erf}\footnote{
Here, \gls{erf} is defined as $a \,\erf(bx)+c$, where $\erf(x) = \frac {2}{\sqrt {\pi }} \int _{0}^{x}e^{-t^{2}}\,\mathrm{d}t$.
}
activations and 2 hidden layers of 1024 units each.
We randomly initialize the weights about zero with weight variance $\sigma_w^2=1.5$ and bias variance $\sigma_b^2=0.05$. 
We train an ensemble of 50 randomly initialized \glspl{nn} from each family using full-batch (vanilla) gradient descent with learning rates of $\eta \in \{0.01, 0.1\}$.

% \input{figures/05_model_selection}
\begin{figure}
  \centering
  \adjincludegraphics
  [Clip={.0\width} {.0\height} {0.48\width} {.0\height},
    height=110pt]
  {figures/plots/predictability_sanity.pdf}
  \adjincludegraphics
  [Clip={.58\width} {.0\height} {0.0\width} {.0\height},
    height=110pt]
  {figures/plots/predictability_sanity.pdf}
  \caption{
    \textbf{Ranking generalization from MLL in large-width NNs.}
    Mean and standard error of the test MSE
    of large-width
    sinusoidal and \acrshort{erf} \glspl{nn}
    trained with 
    learning rates
    $\eta = 0.01$ \textbf{(left)} and
     $\eta = 0.1$ \textbf{(right)}
    on the target function of \cref{sec:gen-rank-large-width}.
    The \acrshort{mll} of the target function under the surrogate model corresponding to the limiting kernel for each model family is shown in the legend. 
    Consistent with expectations, the model family whose surrogate assigns higher MLL to the target function achieves lower test error for both values of $\eta$.
  }
  \label{fig:predictabilitysanity}
\end{figure}
\paragraph{Target function.}
The target function is $ \sin(0.5x)$.

\paragraph{\Gls{gp} surrogate.}
We do not learn a kernel from \gls{nn} predictions as in previous sections.
Instead, we use the kernels corresponding to the infinite-width analogs of the \glspl{nn}, computed with the neural-tangents package
\parencite{neuraltangents2020}.

\paragraph{Results.}
\cref{fig:predictabilitysanity} compares the performance of these \gls{nn} families along with the marginal likelihood of the target function under the surrogate model.
The performance (\gls{mse} on the test set) is averaged across each ensemble of \glspl{nn}.
The \gls{mll} of the target function is higher for the better-performing \gls{nn} family.

\subsubsection{Small-width neural networks and learning the kernel}
\label{sec:gen-rank-small-width}

In the previous experiment, we showed that the marginal likelihood could be predictive when we consider large-width \glspl{nn} and when we use a corresponding, analytically derived kernel.
Is the marginal likelihood predictive when we consider
smaller-width \glspl{nn} and when we learn the kernel empirically?

\paragraph{\Gls{nn} hyperparameters.}
We consider ensembles of width 16, depth 4 \glspl{nn} from two families: \glspl{nn} with \gls{sin} activations and \glspl{nn} with \gls{relu} activations.
We randomly initialize weights about zero with weight variance $\sigma_w^2=1.5$ and bias variance $\sigma_b^2=0.05$. 
We train an ensemble of 50 randomly initialized \glspl{nn} from each family on the target functions using full-batch gradient descent with a learning rate of $\eta = 0.1$.

\paragraph{Target function.}
The target function families mirror the \gls{nn} model families: We collect predictions from randomly initialized, width 16, depth 4 \glspl{nn} with \gls{sin} or \gls{relu} activations.
These target functions are a useful sanity check, as the inductive biases of the model families are perfectly suited for a target function family.

% \input{figures/06_model_selection}
\begin{figure}
  \centering
  \begin{subfigure}{0.5\textwidth}
    \centering
    {
      \includegraphics[scale=0.9]{figures/plots/predictability_toy_Sin.pdf}
    }
    \label{fig:predictabilityexp1}
  \end{subfigure}
  \begin{subfigure}{0.5\textwidth}
    \centering
    {
      \includegraphics[scale=0.9]{figures/plots/predictability_toy_Relu.pdf}
    }
    \label{fig:predictabilityexp2}
  \end{subfigure}


  \caption{
    \textbf{Ranking generalization from MLL in small-width NNs.}
    Mean and standard error of test MSE %(across ensemble)
    \textbf{(left)}
    of small-width sinusoidal and rectifier \gls{nn} ensembles on \gls{sin} \textbf{(top)} and \gls{relu} \textbf{(bottom)} target function families,
    with the target function MLL under the surrogate learned from each model family in the legend.
    Covariance \textbf{(right)}
    of surrogate kernels alongside
    data kernels learned from the \gls{sin} \textbf{(top)} and \gls{relu} \textbf{(bottom)} target function families.
    Even in the small-width regime and when the kernel is learned, the model family whose surrogate assigns a higher MLL to the target function attains lower error
    \textbf{(left)};
    the surrogate kernel learned from the better-performing model family better matches the data kernel
    \textbf{(right)}.
  }
  \label{fig:predictabilityexp12}
\end{figure}

\paragraph{\Gls{gp} surrogate.}
For each ensemble, we learn the hyperparameters of an \gls{smk} with $Q=5$ mixture components by optimizing the marginal likelihood across the ensemble.
To optimize, we randomly initialize the kernel hyperparameters and run Adam for 250 iterations with a learning rate of $\eta = 0.1$.
We initialize the frequency parameters by sampling from a uniform distribution whose upper limit is the Nyquist frequency.
We choose the kernel hyperparameters with the highest objective value across three random initializations.

\paragraph{Results.}
In \cref{fig:predictabilityexp12}, we compare the performances of the two \gls{nn} families on the two target function families.
We also display the kernels learned
from \gls{nn} behavior (\emph{sin surrogate kernel} or \emph{\gls{relu} surrogate kernel})
and
learned from the target function family (\emph{data kernel}) directly.
Across both experiments, the \gls{mll} averaged across the target function family of the better-performing \gls{nn} family is higher.
In general, the structure of a learned kernel reflects the properties of the learned \gls{gp} prior,
and so we can compare kernels to assess similarity between target function and \gls{nn} families.
We see that the data kernel provides a better qualitative match to the kernel of the better-performing model family.

\subsubsection{Systematic study of various learning rates and architectures}
\label{sec:gen-rank-systematic}
% \input{figures/07_model_selection}
\begin{figure}[t]
  \centering
  {
    \includegraphics[scale=1.0]{figures/plots/predictability_sine_1layer.pdf}

    \includegraphics[scale=1.0]{figures/plots/predictability_sine_3layer.pdf}
  }
\caption{\textbf{Ranking generalization performance from MLL across different learning algorithms and architectures.}
Each panel displays mean and standard error of test MSE of an NN family trained on the target function $\sin(0.5x)$ with noise; legend displays \acrshort{mll} of the training data under the surrogate for one of two NN families:
1-layer (256 hidden units) sinusoidal or rectifier \glspl{nn} \textbf{(top)};
3-layer (256 hidden units) sinusoidal or rectifier \glspl{nn} \textbf{(bottom)}.
\Glspl{nn} are trained with batch gradient descent with Adam (learning rates $\eta= 0.003$, $\eta = 0.0003$) or vanilla batch gradient descent ($\eta = 0.01$).
Across architectures and learning algorithms, the \gls{nn} family whose surrogate assigns higher MLL to the target function achieves lower test error.
}
\label{fig:predictability_panel}
\end{figure}

In this last experiment on ranking generalization performance, we establish that Gaussian process surrogates reliably rank performance across a range of learning rates and gradient descent algorithms.

\paragraph{\Gls{nn} hyperparameters.}
We consider ensembles of randomly initialized \glspl{nn} with \gls{sin} or \gls{relu} activations and 1 or 3 hidden layers with 256 hidden units in each layer.
We randomly initialize the weights about zero with weight variance $\sigma_w^2=1.5$ and bias variance $\sigma_b^2=0.05$. 
We train 50 randomly initialized \glspl{nn} from each family using either vanilla full-batch gradient descent with a constant learning rate of $\eta = 0.01$, or Adam~\parencite{kingma:adam} using learning rates of $\eta \in \{0.0003, 0.003\}$.

\paragraph{Target function.}
We consider a target function of $\sin(0.5x)$.
\paragraph{\Gls{gp} surrogate.}
For each ensemble, we learn the hyperparameters of an \gls{smk} with $Q=5$ mixture components by optimizing  the marginal likelihood across the ensemble.
To optimize, we randomly initialize the kernel hyperparameters and run \gls{adam} for 250 iterations with a learning rate of $\eta = 0.1$.
We choose the kernel hyperparameters with the highest objective value across three random initializations.
To randomly initialize the frequency parameters, we uniformly sample from the real-valued interval $(0, 25]$.


\paragraph{Results.}
In \cref{fig:predictability_panel},  we find that the marginal likelihood of the better-performing \gls{nn} family is higher.
The marginal likelihood depends on the diagonal noise $\sigma_n^2$ added to the Gram matrix.
We find that our results are robust across three levels of this diagonal noise ($10^{-3}, 10^{-4}, 10^{-5}$).
These results suggest we can rank these \gls{nn} families when they are not in the asymptotic regime and when we learn the kernel, in contrast to \cref{sec:gen-rank-large-width}, as well as when \emph{a priori} no model family should perform better, unlike \cref{sec:gen-rank-small-width}.


% \input{figures/12_corr_sensitivity}
\begin{figure}[ht!]
  \centering
  {\includegraphics{figures/plots/gen_gap_lengthscale_corr_sensitivity.pdf}}
  \caption{
  \textbf{Sensitivity analysis of generalization gap and lengthscale profile relationship.}
  Each panel shows a histogram and the mean (red line) of the correlations obtained by recomputing the correlation between lengthscale profile correlation and generalization gap after removing each UCI dataset.
  Across datasets and architectures, even when a single dataset is removed, there remains a negative correlation between generalization gap and lengthscale profile correlation.
  Therefore, the inverse relationship between generalization gap and lengthscale profile correlation demonstrated in 
  Section 4.3.1 is robust to outlier datasets.  
 }
  \label{fig:corr_sensitivity}
\end{figure}

\subsection{Correlation sensitivity}
\label{sec:corr-sensitivity}
We present some additional results to supplement our analysis from Section 4.3.1, where we demonstrated that the discrepancy in lengthscale profiles between the data and the neural network predicts the generalization gap.
Correlation can be sensitive to outliers.
Does any single dataset account for the negative correlations?
To answer this, we characterize how the correlation changes as a result of dropping each dataset.
Specifically, for each UCI dataset, we remove that dataset and then compute the correlation between lengthscale profile correlation and generalization gap for the remaining datasets.
We plot the resulting distribution of correlations in \cref{fig:corr_sensitivity}.
We find there is a tight spread around the correlation computed from all the UCI datasets.
Importantly, when we remove any UCI dataset, we still see moderate to high negative correlations between lengthscale profile correlation and generalization gap.

% \input{figures/01_b_prior_samples}
\begin{figure}[ht!]
  \centering
  {\includegraphics[scale=0.5]{figures/plots/sm_kernel_hparam_vary.pdf}}
  {\includegraphics[scale = 0.5]{figures/plots/matern_kernel_hparam_vary.pdf}}
  \caption{
   \textbf{Illustrating the effect of \gls{gp} kernel hyperparameters on the \gls{gp} prior.}
(\textbf{Left}) Samples from a GP prior with \gls{smk} with varying mixture weights $w$, mixture scale $\tau$, and mixture means $\mu$.
(\textbf{Right}) Samples from a GP prior with a Mat\'{e}rn kernel with varying $\nu$ and $\theta$ (lengthscale).
\Glspl{gp} are flexible models whose properties can be controlled through hyperparameters.
}
  \label{fig:gp_examples}
\end{figure}

\subsection{Properties of the Spectral Mixture Kernel and the Mat\'{e}rn Kernel}
\label{sec:app-kernels}

We describe how the various hyperparameters of the \acrshort{smk} and the \acrshort{mk} affect the \gls{gp} prior.
We begin with the spectral mixture kernel.
The mixture weights ($w$) are signal variances and control the scale of the function values.
The mixture means ($\mu$) encode periodic behavior.
The variances ($\tau$) are (inverse) lengthscales, which control the smoothness.
The (ARD) \gls{mk} has lengthscales $\theta$, which control the smoothness of the function with respect to each dimension.
A further hyperparameter, $\nu$, also modulates smoothness, and the Mat\'{e}rn covariance function admits a simple expression when $\nu$ is a half-integer.
Setting $\nu = 2.5$ corresponds to twice-differentiable functions and $\nu = 1.5$ corresponds to once-differentiable functions.

In \cref{fig:gp_examples}, we vary the hyperparameters of the \gls{smk} ($w, \mu, \tau$) and Mat\'{e}rn kernels ($\nu, \theta$) and illustrate how they impact the prior over functions.

\clearpage

\bibliography{uai2023-template}

\end{document}
