\RequirePackage[hyphens]{url}
\documentclass[accepted]{uai2022} 
\usepackage{xr-hyper}
\usepackage{hyperref}
\usepackage[dvipsnames]{xcolor}
\definecolor{c1}{RGB}{41,140,190}
% Inline review notes.
% \xxcomment{colour}{X}{Y}{text} typesets "[<X over Y> text]" in the given
% colour; X and Y are set in small caps as a superscript/subscript pair.
\newcommand{\xxcomment}[4]{%
	\textcolor{#1}{[$^{\textsc{#2}}_{\textsc{#3}}$ #4]}%
}
% Per-author shortcuts: each one fixes the colour and the two tag letters
% and forwards its single argument as the note text.
\newcommand{\wesley}[1]{%
	\xxcomment{purple}{W}{M}{#1}%
}
\newcommand{\ap}[1]{%
	\xxcomment{c1}{A}{P}{#1}%
}
\newcommand{\agw}[1]{%
	\xxcomment{c1}{A}{W}{#1}%
}

\usepackage[american]{babel}
\usepackage{microtype}
\usepackage{graphicx}
\usepackage{booktabs}
\usepackage{algorithmic}
\usepackage{algorithm}
\usepackage{natbib} 
\bibliographystyle{abbrvnat}
\renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} 
\usepackage{booktabs} 
\usepackage{tikz} 

% \swap[sep]{a}{b} typesets b, then sep, then a; sep defaults to "-".
\newcommand{\swap}[3][-]{%
	#3#1#2%
}
\usepackage{amsmath}
\usepackage{amssymb}
\usepackage{amsfonts}
\usepackage{amsthm}
\usepackage{bm}
% \mbf{x}: wrap x in \mathbf and then in \boldsymbol (extra braces keep it grouped).
\newcommand{\mbf}[1]{{\boldsymbol{\mathbf{#1}}}}
% Redirect \bm to \mbf, so every \bm{...} in this file expands to \mbf{...}
% instead of the \bm provided by the bm package loaded just above.
% NOTE(review): this relies on \boldsymbol not expanding back to the redefined
% \bm -- confirm against the bm package before reordering these lines.
\renewcommand{\bm}{\mbf}

% --- Small notation helpers ---
% \norm{x}: x between auto-sized double vertical bars.
\newcommand{\norm}[1]{%
	\left\lVert #1\right\rVert%
}
% \set{x}: x between literal curly braces.
\newcommand{\set}[1]{%
	\{#1\}%
}
% \dotprod{x}: x between angle brackets.
\newcommand{\dotprod}[1]{%
	\langle #1\rangle%
}
% \tr: math operator printed as "Tr".
\DeclareMathOperator{\tr}{Tr}
% \ceil{x}: x between auto-sized ceiling brackets.
\newcommand{\ceil}[1]{%
	\left\lceil #1\right\rceil%
}
% \vn{x}: x in bold text.
\newcommand{\vn}[1]{%
	\textbf{#1}%
}
\usepackage{nicefrac}
\usepackage{comment}
\usepackage{subcaption}

\title{Supplementary Materials for \\ 
	Low Precision Arithmetic for Fast Gaussian Processes}

\author[1,*]{Wesley J. Maddox}
\author[1,*]{Andres Potapczynski}
\author[1]{Andrew Gordon Wilson}
\affil[1]{%
	Center for Data Science\\
	New York University
}
\affil[*]{Equal contribution.}

\usepackage{xcolor}
\definecolor{dark-blue}{rgb}{0.15,0.15,0.4}
\definecolor{medium-blue}{rgb}{0,0,0.5}
\hypersetup{
	colorlinks, linkcolor={dark-blue},
	citecolor={dark-blue}, urlcolor={medium-blue}
}


% In your preamble

\makeatletter
% \addFileDependency{<file name with extension>}
% Announces the file as "(<file>)" via \typeout, hands the name to
% \@addtofilelist, and emits "No file <file>." via \typeout when the file
% cannot be found.
% NOTE(review): the "(<file>)" message is presumably there so that build tools
% which scan the log (e.g. latexmk) register the dependency -- confirm.
% Every line inside the body ends in % so that the line breaks do not turn
% into space tokens when the macro is expanded.
\newcommand*{\addFileDependency}[1]{%
	\typeout{(#1)}%
	\@addtofilelist{#1}%
	\IfFileExists{#1}{}{\typeout{No file #1.}}%
}
\makeatother

% \myexternaldocument{<basename>}
% Calls \externaldocument on <basename>, then registers both <basename>.tex
% and <basename>.aux through \addFileDependency.
\newcommand*{\myexternaldocument}[1]{%
	\externaldocument{#1}%
	\addFileDependency{#1.tex}\addFileDependency{#1.aux}%
}

%change name when we rename
\myexternaldocument{maddox_652}

\begin{document}
	\onecolumn
	\appendix
	\maketitle
	
	\renewcommand\thefigure{A.\arabic{figure}}
	\renewcommand\thetable{A.\arabic{table}}
	\setcounter{figure}{0}  
	
	
	Our Appendix is structured as follows: 
	\begin{itemize}
		\item In Appendix \ref{app:relwork}, we further describe related work, including on conjugate gradients.
		\item In Appendix \ref{app:experiments}, we show several other experiments on both the properties of half precision kernel matrices and half precision conjugate gradients.
		\item In Appendix \ref{sec:exp_details}, we outline experimental details for all of our experiments.
		\item In Appendix \ref{app:theory}, we give some detailed theoretical analysis of half precision kernel matrices, focusing on the quantized effective dimension and the effect of finite precision on the support of the kernel.
	\end{itemize}
	
	\section{Extended Related Work}\label{app:relwork}
	
	\paragraph{Conjugate Gradients: } 
	A description of the preconditioned conjugate gradients algorithm \citep{nocedal2006,golub2018matrix} is given in Alg.~\ref{alg:CG_basic}.
	\citet{gardner2018gpytorch} propose a variant of conjugate gradients that they call modified batched CG (mBCG) which we use in our work. 
	The primary difference between mBCG and CG is that mBCG enables solving several linear systems at once by performing all computations in batch mode so that linear operators such as $\bm K (\bm v)$ are actually matrix-matrix multiplications rather than matrix-vector products.
	Then, an individual set of learning rates $\alpha_k$ and $\beta_k$ is used for each system.
	Our stable CG implementation (Alg. \ref{alg:rCG}) is actually based off of mBCG, but for didactic purposes we display only the standard CG version.
	
	\begin{algorithm}[!ht]
		\caption{CG }
		\label{alg:CG_basic}
		\begin{algorithmic}[1]
			\STATE \textbf{Input:} MVM function $\bm{K}\left(\cdot\right)$, initial solution guess $\bm{x}_{0}$, linear system right hand side $\bm{b}$, 
			tolerance $\epsilon$, preconditioner function $\bm{P}\left(\cdot\right)$
			\STATE \textbf{Initialize:} $k \leftarrow 0$, $\bm{r}_{0} \leftarrow \bm{K}\left(\bm{x}_{0}\right) - \bm{b}$,
			$\bm{z}_{0} \leftarrow \bm{P}\left(\bm{r}_{0}\right)$, $\bm{d}_{0} \leftarrow -\bm{z}_{0}$, and
			$\gamma_{0} \leftarrow \bm{r}_{0}^{T} \bm{z}_{0}$.
			\vspace{0.5em}
			\WHILE{$\norm{\bm{r}_{k}}_{2} > \epsilon$}
			\STATE $\alpha_{k} = \frac{\gamma_{k}}{\bm{d}_{k}^{T} \bm{K}\left(\bm{d}_{k}\right)}$
			\STATE $\bm{x}_{k+1} = \bm{x}_{k} + \alpha_{k} \bm{d}_{k}$
			\STATE $\bm{r}_{k+1} = \bm{r}_{k} + \alpha_{k} \bm{K}\left(\bm{d}_{k}\right)$
			\STATE $\bm{z}_{k+1} = \bm{P}\left(\bm{r}_{k+1}\right)$
			\STATE $\gamma_{k+1} = \bm{r}_{k+1}^{T} \bm{z}_{k+1}$
			\STATE $\beta_{k+1} = \frac{\gamma_{k+1}}{\gamma_{k}}$
			\STATE $\bm{d}_{k+1} = -\bm{z}_{k+1} + \beta_{k+1} \bm{d}_{k}$
			\STATE $k \leftarrow k + 1$
			\ENDWHILE
		\end{algorithmic}
	\end{algorithm}
	
	\paragraph{Other Scalable Gaussian Processes}
	
	We note that our matrix-free schemes can be used to scale up approximate kernel methods such as Nystr\"om-style approximations \citep{smola2000sparse,williams2000using}; 
	indeed, \citet{meanti2020kernel} use Nystr\"om approximations and KeOps for their kernel ridge regression approach.
	However, \citet{pmlr-v89-zhang19f} found limited speedups when quantizing (which is slightly distinct from half precision) Nystr\"om approximations.
	Similarly, \citet{pleiss2020fast} used iterative methods (in their case MINRES) to speed up variational Gaussian processes \citep{titsias2009variational,hensman2013gaussian}, and we hope to speed up their approach as well.
	\citet{chen2013parallel,nguyen2019exact} proposed parallel direct Cholesky based GP schemes for more scalable GP regression; however, their approaches will probably perform poorly in lower precision, as we demonstrate is the case for pivoted Cholesky based solves in Section \ref{sec:pc_exp}.
	
	Finally, kernels with compact support have been previously studied from a kernel approximation point of view \citep{genton2001classes,gneiting2002compactly}.
	However, these works focused on developing new techniques to approximate an infinitely supported kernel with a kernel that has demonstrated compact support, rather than using floating point precision to develop an approximate kernel with compact support.
	
	\section{Extended Experiments}\label{app:experiments}
	
	\paragraph{Summation Approaches}
	
	In Figure \ref{fig:mm_full_times}, we display the different times of block summation across precisions, as well as Kahan summation and floating point accumulation of float kernel matrix MVMs, finding that all half precision accumulation mechanisms behave similarly, with Kahan summation being slightly slower than the other two.
	This plot is inspired by the study performed by the authors of KeOps \citep{charlier2021kernel}, available at \url{https://www.kernel-operations.io/keops/_auto_benchmarks/plot_accuracy.html}.
	Due to these results, we use block summation, casting each block's summation up to float before down-casting to half, as is the default in KeOps.
	
	\begin{figure}[!ht]
		\centering
		\begin{subfigure}{0.32\textwidth}
			\centering
			\includegraphics[height=3.5cm]{./figs/mm_times_full.pdf}
			\caption{Summations.}
			\label{fig:mm_full_times}
		\end{subfigure}
		\begin{subfigure}{0.32\textwidth}
			\centering
			\includegraphics[height=3.5cm]{figs/matern05_maxdistrep.pdf}
			\caption{Mat\'ern-$1/2.$}
			\label{fig:max_distances_matern}
		\end{subfigure}
		\begin{subfigure}{0.32\textwidth}
			\centering
			\includegraphics[height=3.5cm]{figs/sparsity_error_per_mm.pdf}
			\caption{Error of sparsified MVMs.}
			\label{fig:sparsity_error}
		\end{subfigure}
		\caption{\textbf{(a)} Block summation in floating point adds negligible overhead compared with block summation in half precision, while being as accurate as Kahan summation.
			\textbf{(b)} Maximum distance representable for Mat\'ern-$1/2$ kernel; note the similar trend to Figure \ref{fig:max_distances_rbf}.
			\textbf{(c)} Error of the truncated MVM is zero as expected. To produce the sparsified MVM, we truncated any data points that had kernel entries that were un-representable in half precision.
		}
	\end{figure}
	\begin{figure}
		\centering
		\begin{subfigure}{0.32\textwidth}
			\centering
			\includegraphics[height=3.5cm]{figs/lambda_sqerr_matern05.pdf}
			\caption{Difference of $\lambda_{max}.$}
			\label{fig:eval_diff}
		\end{subfigure}
		\begin{subfigure}{0.32\textwidth}
			\centering
			\includegraphics[height=3.5cm]{figs/effdim_rbf.pdf}
			\caption{ED, RBF.}
		\end{subfigure}
		\begin{subfigure}{0.32\textwidth}
			\centering
			\includegraphics[height=3.5cm]{figs/espectrum_rbf.pdf}
			\caption{Spectrum, RBF.}
		\end{subfigure}
		\caption{
			\textbf{(a)} Difference of $\lambda_{\text{max}}$ across precisions for Mat\'ern-$1/2$ kernel. Other kernels have similar eigenvalue differences.
			\textbf{(b)} Effective dimension (ED) for RBF kernels, the trend is similar to that of the Mat\'ern-$1/2$ kernel because the eigenvalue spectrum \textbf{(c)} has a similar bunched up pattern in half.}
		\label{fig:other_spectra}
	\end{figure}
	
	\begin{figure}[!ht]
		\centering
		\includegraphics[width=0.66\linewidth]{./figs/rbf_kernel_inverses.pdf}
		\caption{Matrix inverses of an RBF kernel in double (left), float (middle), and half (right) precisions. The kernel is evaluated in the indicated precision, while the inversion itself is performed in double precision. The half precision inverse is qualitatively distinct from the other two, indicating a slightly distinct spectrum.}
		\label{fig:rbf_kernels_inv}
	\end{figure}
	
	\paragraph{Properties of Half Precision Kernel Matrices}
	
	In Figure \ref{fig:max_distances_matern}, we display the maximum distance representable for Mat\'ern-$1/2$ kernels across varying lengthscales, as we show for RBF and rational quadratic kernels in Figures \ref{fig:max_distances_rbf} and \ref{fig:max_distances_rq}.
	The trend for the Mat\'ern family is similar to that of the RBF kernels, except that larger distances are representable.
	
	Finally, in Figure \ref{fig:sparsity_error}, we show the error of sparsified MVMs (which is zero) across increasing dataset size for the data reduction experiment in Section \ref{sec:support}.
	
	The difference of the largest eigenvalue of a Mat\'ern-$1/2$ kernel is shown in Figure \ref{fig:eval_diff} in float and half as compared to double precision (which we use as a proxy for infinite precision).
	Note the extremely small relative differences for these largest eigenvalues.
	
	In Figure \ref{fig:other_spectra}\textbf{(b)}, we show ED for RBF kernels with the associated spectrum in \textbf{(c)}.
	
	In Figure \ref{fig:rbf_kernels_inv}, we display $(\bm K + 0.01)^{-1}$ for RBF kernels with lengthscale $1$ and $50$ data points in $[-3, 3]$ across double (left), float (middle), and half (right) precisions.
	We first evaluate the kernel to a lower precision and then pass into double precision before using a Cholesky factorization to invert the kernel matrix, finding that the half precision kernel inverse has a distinct pattern (larger magnitude off-diagonal values) compared to the float and double inverse matrices.
	
	\paragraph{Benchmarking Half Precision CG}
	
	\begin{figure}[ht]
		\centering
		\begin{subfigure}{0.23\textwidth}
			\includegraphics[width=\linewidth]{figs/buzz_types.pdf}
			\caption{Buzz}
			\label{fig:buzz_types}
		\end{subfigure}
		\begin{subfigure}{0.23\textwidth}
			\centering
			\includegraphics[width=\linewidth]{figs/elevators_residual_norm_precond.pdf}
			\caption{Elevators}
			\label{fig:elev_precond}
		\end{subfigure}
		\begin{subfigure}{0.23\textwidth}
			\includegraphics[width=\linewidth]{figs/keggd_residual_norm_precond.pdf}
			\caption{KeggD}
			\label{fig:kegg_precond}
		\end{subfigure}
		\begin{subfigure}{0.23\textwidth}
			\includegraphics[width=\linewidth]{figs/pol_kernels_noard.pdf}
			\caption{PoleTele}
			\label{fig:pol_precond}
		\end{subfigure}
		\caption{\textbf{(a)} Residual norms across solvers on buzz. \textbf{(b)} Residual norm for stable CG on elevators. Here no preconditioning also converges. \textbf{(c)} Residual norm for stable CG on Kegg Directed. \textbf{(d)} Residual norm for no ARD on Pol.}
		\label{fig:preconds_residual_tols}
	\end{figure}
	
	In Figure \ref{fig:buzz_types}, we display how CG in half diverges, but our stable CG converges as does CG in float.
	In Figures \ref{fig:elev_precond} and \ref{fig:kegg_precond}, we display the effect of preconditioning on solves, finding again that larger preconditioners tend to converge very slightly faster.
	
	In Figure \ref{fig:3droad_comp}, we display the optimization trajectory on \emph{3droad} finding that there are clearer divergences in terms of the outputscale; however, each parameter converges to similar values by the end of training.
	
	\begin{figure}[!ht]
		\centering
		\includegraphics[height=4cm]{./figs/3droad_comp.pdf}
		\caption{Optimization trajectory on \emph{3droad}. }
		\label{fig:3droad_comp}
	\end{figure}
	
	\paragraph{Additional Benchmark Results}
	
	In Table \ref{tab:rbf_nll}, we display NLLs across five seeds on UCI datasets for float, half, and SVGPs, analogous to our RMSE and timing results in Table \ref{tab:rbf_ard}.
	Compilation times for these results are shown in Table \ref{tab:compilation}.
	
	In Table \ref{tab:matern5_ard}, we display RMSEs, times, and NLLs for Mat\'ern-$5/2$ ARD kernels for both single and half precisions.
	In Table \ref{tab:matern3_ard}, we display RMSEs, times, and NLLs for Mat\'ern-$3/2$ ARD kernels in half precision.
	In Table \ref{tab:matern1_ard}, we display RMSEs, times, and NLLs for Mat\'ern-$1/2$ ARD kernels in half precision.
	
	\begin{table*}[!ht]
		\caption{Times with compilation breakdown across $3$ seeds on a suite of UCI tasks.}
		\label{tab:compilation}
		\centering
		\scriptsize{
			\begin{tabular}{cccccc}
				\midrule
				\multicolumn{2}{c}{\textbf{}} & 
				\multicolumn{2}{c}{\textbf{Half}} & 
				\multicolumn{2}{c}{\textbf{Single}} \\
				Dataset & $(N , d)$ & Optimization Time & Compilation Time & Optimization Time & Compilation Time
				\\ \hline
				PoleTele & (13.5K, 26) 
				& $5 \pm  0.024$ & $73 \pm 0.035$ & $   23.6 \pm 0.5$ & $  34 \pm 0.066$ \\
				Protein & (41.1K, 9)  
				&  $  11 \pm 2.795$ & $  73 \pm 0.018$ & $  35.9 \pm 5.7$ & $  35 \pm 2.846$ \\
				3droad & (391.4K, 3) 
				&  $  912.2 \pm 0.96 $ & $  91 \pm 0.197$ & $  1,210 \pm 35.0 $ & $  50 \pm 0.220$ \\
			\end{tabular}
		}
	\end{table*}
	
	\begin{table}[!ht]
		\centering
		\caption{Test time NLLs across $5$ seeds on a suite of UCI tasks for float, half, and SVGPs with RBF ARD kernels.}
		\label{tab:rbf_nll}
		\begin{tabular}{ccccc}
			\midrule
			\multicolumn{1}{c}{\textbf{Dataset}} & \multicolumn{1}{c}{\textit{(N, d)}} & \multicolumn{1}{c}{\textbf{Single}} & \multicolumn{1}{c}{\textbf{Half}} & \multicolumn{1}{c}{\textbf{SVGP}} \\ 
			\hline
			PoleTele                    & (13.5K, 26)                & $-0.349 \pm 0.004$                 & $-0.316 \pm 0.004$               & $-0.513 \pm 0.011$               \\
			Elevators                   & (14.9K, 18)                & $0.515\pm 0.0195$                  & $0.663 \pm 0.024$                & $0.437 \pm 0.012$                \\
			Bike                        & (15.6K, 17)                & $-0.3714 \pm 0.0066$               & $-0.413 \pm 0.008$               & $-1.020 \pm 0.044$               \\
			Kin40K                      & (36K, 8)                   & $0.2352 \pm 0.005$                 & $0.241 \pm 0.005$                & $-0.327 \pm 0.007$               \\
			Protein                     & (41.1K, 9)                 & $0.9802 \pm 0.0115$               & $1.412 \pm 0.001$                & $0.964 \pm 0.015$                \\
			3droad                      & (391.4K, 3)                & $1.249 \pm 0.0129$                & $1.201 \pm 0.005$                & $0.537 \pm 0.025$                \\
			Song                        & (463.8K, 90)               & $1.146 \pm 0.0043$                 & $1.765 \pm 0.819$                & $1.418 \pm 0.002$                \\
			Buzz                        & (524.9K, 77)               & $-0.424 \pm 0.18$                  & $0.898 \pm 0.714$                & $-0.071 \pm 0.010$    \\
			Houseelectric & (1844.3K,9) & $-0.72 \pm 0.002$ & $-0.439 \pm 0.084$ & $-$
		\end{tabular}
	\end{table}
	
	\begin{table*}[!ht]
		\caption{RMSEs, NLL, and training time across $3$ seeds on a suite of UCI tasks. Here, we use Mat\'ern-$5/2$ ARD kernels with $50$ CG iterations and $50$ optimization steps.}
		\label{tab:matern5_ard}
		\centering
		\scriptsize{
			\begin{tabular}{ccccccccc}
				\midrule
				\multicolumn{2}{c}{\textbf{}} & 
				\multicolumn{2}{c}{\textbf{RMSE}} & 
				\multicolumn{2}{c}{\textbf{NLL}} & 
				\multicolumn{2}{c}{\textbf{Time}} & 
				\\
				\multicolumn{1}{c}{\textbf{Dataset}} & 
				\multicolumn{1}{c}{$(N , d)$} & 
				\multicolumn{1}{c}{\textbf{Single}} & 
				\multicolumn{1}{l}{\textbf{Half}} &
				\multicolumn{1}{c}{\textbf{Single}} & 
				\multicolumn{1}{l}{\textbf{Half}} &
				\multicolumn{1}{c}{\textbf{Single}} & 
				\multicolumn{1}{l}{\textbf{Half}} &
				\\ \hline
				PoleTele & (13.5K, 26) 
				&  $0.098 \pm 0.002$ & $0.102$ & $-0.454 \pm 0.004$ & $-0.447$ & $49.1 \pm 1.74$ & $97$\\
				Protein & (41.1K, 9) 
				& $0.498 \pm 0.007$ & $0.509 \pm 0.005$ & $1.07 \pm 0.01$ & $1.04 \pm 0.03$  & $69.17 \pm 1.7$ & $110 \pm 10$\\
				3droad & (391.4K, 3) 
				& $0.254 \pm 0.006$ & $0.231$ & $0.812 \pm 0.014$ & $0.899$ & $1666 \pm 214$ & $1501$\\
			\end{tabular}
		}
	\end{table*}
	
	\begin{table*}[!ht]
		\caption{RMSEs, NLL, and training time across $3$ seeds on a suite of UCI tasks. Here, we use Mat\'ern kernel $1/2$ with $50$ CG iterations and $50$ optimization steps.}
		\label{tab:matern1_ard}
		\centering
		\scriptsize{
			\begin{tabular}{cccccc}
				\midrule
				\multicolumn{2}{c}{\textbf{}} & 
				\multicolumn{1}{c}{\textbf{RMSE}} & 
				\multicolumn{1}{c}{\textbf{NLL}} & 
				\multicolumn{1}{c}{\textbf{Time}} & 
				\\
				\multicolumn{1}{c}{\textbf{Dataset}} & 
				\multicolumn{1}{c}{$(N , d)$} & 
				\multicolumn{1}{l}{\textbf{Half}} &
				\multicolumn{1}{l}{\textbf{Half}} &
				\multicolumn{1}{l}{\textbf{Half}} &
				\\ \hline
				PoleTele & (13.5K, 26) 
				& $0.108 \pm 0.003$ & $0.0478 \pm 0.006$ & $85 \pm 2.4$  \\
				Elevators & (14.9K, 18) 
				& $0.365 \pm 0.002$ & $-0.432 \pm 0.010$ & $83 \pm 1.9$  \\
				Bike & (15.6K, 17) 
				& $0.096 \pm 0.003$ & $0.527 \pm 0.004$ & $84 \pm 0.4$  \\
				Kin40K &(36K, 8) 
				& $0.136 \pm 0.002$ & $0.194 \pm 0.005$ & $87 \pm 0.1$\\
				Protein & (41.1K, 9) 
				& $0.481 \pm 0.003$ & $-0.762 \pm 0.013 $ & $98 \pm 0.1$ \\
				3droad & (391.4K, 3) 
				& $0.083 \pm 0.001$ & $-0.045 \pm 0.020$ & $1285 \pm 2.3$\\
			\end{tabular}
		}
	\end{table*}
	
	\begin{table*}[!ht]
		\caption{RMSEs, NLL, and training time across $3$ seeds on a suite of UCI tasks. Here, we use Mat\'ern kernel $3/2$ with $50$ CG iterations and $50$ optimization steps.}
		\label{tab:matern3_ard}
		\centering
		\scriptsize{
			\begin{tabular}{cccccc}
				\midrule
				\multicolumn{2}{c}{\textbf{}} & 
				\multicolumn{1}{c}{\textbf{RMSE}} & 
				\multicolumn{1}{c}{\textbf{NLL}} & 
				\multicolumn{1}{c}{\textbf{Time}} & 
				\\
				\multicolumn{1}{c}{\textbf{Dataset}} & 
				\multicolumn{1}{c}{$(N , d)$} & 
				\multicolumn{1}{l}{\textbf{Half}} &
				\multicolumn{1}{l}{\textbf{Half}} &
				\multicolumn{1}{l}{\textbf{Half}} &
				\\ \hline
				PoleTele & (13.5K, 26) 
				& $0.101 \pm 0.003$ &$0.475 \pm 0.005$ &$  90 \pm  2.2$ \\
				Elevators & (14.9K, 18) 
				& $0.498 \pm 0.112$ &$-0.990 \pm 0.423$ &$  86 \pm  3.5$ \\
				Bike & (15.6K, 17) 
				& $0.086 \pm 0.003$ &$0.567 \pm 0.006$ &$  88 \pm  0.1$ \\
				Kin40K &(36K, 8) 
				& $0.097 \pm 0.001$ &$0.186 \pm 0.004$ &$  96 \pm  0.1$ \\
				Protein & (41.1K, 9) 
				& $0.497 \pm 0.003$ &$-0.850 \pm 0.008$ &$ 107 \pm  0.1$ \\
				3droad & (391.4K, 3) 
				& $0.165 \pm 0.003$ &$-0.700 \pm 0.024$ &$1532 \pm  3.4$ \\
			\end{tabular}
		}
	\end{table*}
	
	\begin{table*}[!ht]
		\caption{RMSEs, NLL, and training time across $3$ seeds on a suite of UCI tasks. Here, we use Mat\'ern-$5/2$ kernels with $50$ CG iterations and $50$ optimization steps.}
		\label{tab:rbf_ard_app}
		\centering
		\scriptsize{
			\begin{tabular}{cccccc}
				\midrule
				\multicolumn{2}{c}{\textbf{}} & 
				\multicolumn{1}{c}{\textbf{RMSE}} & 
				\multicolumn{1}{c}{\textbf{NLL}} & 
				\multicolumn{1}{c}{\textbf{Time}} & 
				\\
				\multicolumn{1}{c}{\textbf{Dataset}} & 
				\multicolumn{1}{c}{$(N , d)$} & 
				\multicolumn{1}{l}{\textbf{Half}} &
				\multicolumn{1}{l}{\textbf{Half}} &
				\multicolumn{1}{l}{\textbf{Half}} &
				\\ \hline
				PoleTele & (13.5K, 26) 
				& $0.102 \pm 0.002$ &$0.437 \pm 0.011$ &$  96 \pm  2.1$ \\
				Elevators & (14.9K, 18) 
				& $0.520 \pm 0.209$ &$-0.967 \pm 0.571$ &$  94 \pm  5.9$ \\
				Bike & (15.6K, 17) 
				& $0.082 \pm 0.001$ &$0.568 \pm 0.008$ &$  93 \pm  0.1$ \\
				Kin40K &(36K, 8) 
				& $0.088 \pm 0.001$ &$0.078 \pm 0.002$ &$ 106 \pm  0.1$ \\
				Protein & (41.1K, 9) 
				& $0.512 \pm 0.008$ &$-1.048 \pm 0.008$ &$ 113 \pm 11.3$ \\
				3droad & (391.4K, 3) 
				& $0.226 \pm 0.005$ &$-0.905 \pm 0.007$ &$1486 \pm  4.5$ \\
			\end{tabular}
		}
	\end{table*}
	
	\section{Experimental Details}\label{sec:exp_details}
	
	\subsection{Maximum Distance Representable in finite precision}
	
	To compute these distances, we consider four separate stationary kernels \citep{rasmussen_gaussian_2008} with distance $d = |x  - x'|$ and lengthscale $l$, focusing on determining the distances at which they drop below a given $\epsilon$.
	For Mat\'ern-$1/2$ kernels, we need $\exp\{-d / l\} < \epsilon$ and solving gives $d > -l \log \epsilon$.
	For other Mat\'ern kernels (e.g. $3/2$ and $5/2$), there is no straightforward closed form solution, but empirical investigations showed that the maximum distance representable is somewhere between Mat\'ern-$1/2$ and RBF.
	
	For RBF kernels, we have $\exp\{-d^2 / (2 l^2)\} < \epsilon$ and solving gives $d > (-2 \log \epsilon)^{1/2} l$.
	For rational quadratic kernels, we have $(1 + \frac{d^2}{2 \alpha l^2})^{-\alpha} < \epsilon$
	and solving for $d$ gives $d > (2 \alpha (\epsilon^{-1/\alpha} - 1))^{1/2} l.$
	We found that for $\alpha = 2,3$ the size of the support was much larger, and so showed only $\alpha = 5.$
	For periodic kernels, we have
	%\begin{align*}
	$    -\frac{2}{\lambda}\sin^2\left(\frac{\pi}{p}|d|\right) < \log \epsilon$
	and solving gets
	$    |d| > \frac{p}{\pi}\arcsin\left(\sqrt{-\frac{\lambda}{2} \log \epsilon}\right)$
	%\end{align*}
	which will only have solutions when $-\log \epsilon \frac{\lambda}{2} \leq 1$.
	
	\subsection{Experimental Setup}
	
	All timing based experiments were performed using single NVIDIA V$100$ GPUs with $32$GB of memory on a shared supercomputing cluster. 
	Non-timing experiments also used NVIDIA RTX GPUs with either $24$ or $48$ GB of memory on either the same cluster or on a private internal server.
	
	We used GPyTorch \citep{gardner2018gpytorch} with the default parameter settings from BoTorch's single task GP model\footnote{\url{https://botorch.org/api/models.html\#module-botorch.models.gp\_regression}.} which are constraining the noise to be greater than $0.0001$ and a Gamma$(1.1, 0.05)$ prior on the noise with initialization to $2$, and Gamma$(2.0, 0.15)$ prior on the outputscale and a Gamma$(3.0, 6.0)$ prior on the lengthscale(s). 
	We fit using Adam for either $50$ or $100$ iterations unless otherwise documented and used a tolerance of $1.0$ for the CG iterations unless otherwise stated.
	We used KeOps $1.5$ for our experiments, noting that preliminary experiments with KeOps $2.0$ produced significantly faster compilation times \citep{charlier2021kernel}.
	
	For the datasets, we used the bayesian benchmarks package of \url{https://github.com/hughsalimbeni/bayesian_benchmarks/}, following their default training and testing splits.
	
	At test time, we converted the models back to float precision; however, our experiments found that this actually had limited impact on the RMSEs.
	
	\section{Theoretical Analysis} \label{app:theory}
	
	\subsection{Effect of finite precision on support}
	
	Following \citet[Chapter 4.3 of ][]{rasmussen_gaussian_2008} we can express the eigenvalues of an RBF kernel as 
	$\lambda_{k} = \sqrt{\frac{2 a}{A}} B ^{k}$ for some positive constants $a$, $A$ and $B \in \left(0, 1\right)$ that depend on the hyperparameters of the RBF kernel. 
	In infinite-precision an RBF kernel has support over the whole space as $\lambda_{k} > 0$. 
	However, if 
	\begin{equation*}
		\begin{split}
			k \geq \frac{\log \delta + \frac{1}{2}\log \left(\frac{A}{2a}\right)}{\log B} = \mathcal{O}(\log \delta)
		\end{split}
	\end{equation*}
	then $\lambda_{k} =0$ in finite-precision, where $\delta$ represents the round-off error. This means that the support of the kernel gets cut off. This is similar to the support of a Gaussian distribution $\mathcal{N}\left(\mu, \sigma^{2}\right)$ being the whole line $\mathbb{R}$; however, computing the probability of a sample being several standard deviations from the mean gets cut off to zero due to the sharp decay of the tails. Thus, our results focus on the empirical support of the kernel, not on the theoretical one.
	
	\subsection{Effect of finite-precision on generalization}
	Following \citet{li2019dimension}, we assume that moving from infinite precision to finite precision and using stochastic rounding means that our finite precision number, $Q(a) \sim U(a - \delta, a + \delta)$ for some $\delta$ that depends on our quantization (e.g. our precision based scheme). 
	As in Eqs. 11--15 of \citet{opper1998general} (see also Thm 1 of \citet{dicker2017kernel}, Thm 4.1 of \citet{zhang2005learning} and Thm 4 of \citet{caponnetto_optimal_2007}), generalization bounds often depend on the effective dimension of the training kernel matrix. Recall that the effective dimension is computed from the eigenvalues as $\sum_{i=1}^N \frac{\lambda_i}{\lambda_i + s}$ for some value $s$. 
	We will compute our finite precision approximation by computing the expected value of the effective dimension under the stochastic rounding scheme.
	
	Furthermore, we assume that each eigenvalue is quantized independently. To compute the expected effective dimension, we need to compute $N$ integrals of the following form, where $p(x) = U(a - \delta, a + \delta)$:
	\begin{align}
		\mathbb{E}_{p(x)}\frac{x}{x+s} = \int_{a-\delta}^{a + \delta} &\frac{x}{x+s} \frac{1}{a + \delta - (a - \delta)} dx = \frac{1}{2\delta} (x - s \log (x + s))|_{a-\delta}^{a + \delta} \nonumber \\
		&=\frac{1}{2\delta}\left(a + \delta - s \log(a + \delta + s) - (a - \delta - s \log(a - \delta + s))\right) \nonumber \\
		&=1 + \frac{s}{2 \delta}\log\frac{a + s - \delta}{a + s + \delta} = 1 + \frac{s}{2 \delta}\log\left(1 - \frac{2\delta}{a + s + \delta}\right) \nonumber \\
		&=1 - \frac{s}{2\delta}\left(\frac{2\delta}{a + s + \delta} + \frac{4\delta^2}{2 (a + s + \delta)^2} + \frac{8\delta^3}{3 (a + s + \delta)^3} + \mathcal{O}(\delta^4)\right) \nonumber \\
		&=1 - \frac{s}{a + s + \delta} - \frac{\delta s}{(a + s + \delta)^2} - \frac{4 \delta^2 s}{3(a + s + \delta)^3} - \mathcal{O}(\delta^3) \\
		&\geq 1 - \frac{s}{a + s + \delta} - \frac{\delta s}{(a + s + \delta)^2}
	\end{align}
	Now, putting this into our expectation over the quantized eigenvalues:
	\begin{align}
		\mathbb{E} \left( \sum_{i=1}^N \frac{Q( \lambda_i)}{ Q(\lambda_i) + s} \right) &\geq \sum_{i=1}^N 1 - \frac{s}{\lambda_i + s + \delta} - \frac{\delta s}{(\lambda_i + s + \delta)^2} \\
		&= N - \sum_{i=1}^N \frac{s(\lambda_i + s + \delta) - \delta s}{(\lambda_i + s + \delta)^2} = N - \sum_{i=1}^N \frac{s(\lambda_i + s)  }{(\lambda_i + s + \delta)^2} \nonumber \\
		&\geq N - \sum_{i=1}^N \frac{s}{\lambda_i + s} \nonumber \\
		&=N_{\text{eff}}(K, s) \nonumber
	\end{align}
	Note that as $\delta \rightarrow 0$ all of these inequalities become tight as expected. What this shows is that in finite precision, the expected effective dimensionality can only be higher than the effective dimensionality in infinite precision.
	
	In general, bounds such as Eqs. 11, 16 of \citet{opper1998general} (also Eq. 7.26 in \citet{rasmussen_gaussian_2008}) tend to depend on the expected eigenvalues rather than those estimated empirically (e.g. in finite precision). However, they are related as $\frac{1}{N} \lambda_i^{\text{emp}} \rightarrow \lambda_i$ \citep[see][Section 4.3.2]{rasmussen_gaussian_2008} and so we estimate $N_{\text{eff}}(K, \sigma^2 / n)$ with $N_{\text{eff}}(K^{\text{emp}}, \sigma^2)$. 
	Plugging in our finite precision estimate to something like Thm. 4.1 of \citet{zhang2005learning} then suggests that the generalization error in (any) finite precision will tend to be higher than for infinite precision.
	Roughly, these bounds state that the generalization error of a kernel ridge regressor is upper bounded by the sum of approximation error terms (relating to the fit of the kernel to the function) plus $N_{\text{eff}}(K, \lambda) / n$ for a regularization term $\lambda$ (similar to the noise value in the GP setting).
	
	Finally, our analysis shows that a larger $\delta$ (e.g. a lower precision estimate) will tend to further increase the generalization error.
	This tends to confirm our experimental study on the effective dimension in Figure \ref{fig:matern_spectrum}.
	
\bibliography{ref.bib}

\end{document}
