\PassOptionsToPackage{colorlinks=true, linkcolor=teal, citecolor=teal, pdfborder={0 0 0}}{hyperref}
\documentclass[accepted]{uai2024} % for initial submission
% \documentclass[accepted]{uai2024} % after acceptance, for a revised version; 
% also before submission to see how the non-anonymous paper would look like 
                        
%% There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2024} % ptmx math instead of Computer
                                         % Modern (has noticeable issues)
% \documentclass[mathfont=newtx]{uai2024} % newtx fonts (improves upon
                                          % ptmx; less tested, no support)
% NOTE: Only keep *one* line above as appropriate, as it will be replaced
%       automatically for papers to be published. Do not make any other
%       change above this note for an accepted version.

%% Choose your variant of English; be consistent
\usepackage[american]{babel}
% \usepackage[british]{babel}

%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}

%% math
\usepackage{amssymb}
\usepackage{amsthm}
\usepackage{bm}
\usepackage{amsfonts}
\usepackage{mathtools} % amsmath with fixes and additions
\usepackage{thmtools}
\DeclareMathOperator*{\argmin}{arg\,min}
\usepackage{algorithm}
\usepackage{algorithmic}
\input{notation}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% THEOREMS
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\theoremstyle{plain}
\newtheorem{theorem}{Theorem}
\newtheorem{proposition}[theorem]{Proposition}
\newtheorem{lemma}[theorem]{Lemma}
\newtheorem{corollary}[theorem]{Corollary}
\theoremstyle{definition}
\newtheorem{definition}[theorem]{Definition}
\newtheorem{assumption}[theorem]{Assumption}
\theoremstyle{remark}
\newtheorem{remark}[theorem]{Remark}
% \usepackage{hyperref}

\def\definitionautorefname{Definition}
\def\subsectionautorefname{Section}

% \usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{tikz} % nice language for creating drawings and diagrams
\usepackage{xspace}

%% Provided macros
% \smaller: Because the class footnote size is essentially LaTeX's \small,
%           redefining \footnotesize, we provide the original \footnotesize
%           using this macro.
%           (Use only sparingly, e.g., in drawings, as it is quite small.)



%% Self-defined macros

\title{Stein Random Feature Regression}

% The standard author block has changed for UAI 2024 to provide
% more space for long author lists and allow for complex affiliations
%
% All author information is automatically removed by the class for the
% anonymous submission version of your paper, so you can already add your
% information below.
%
% Add authors
\author[1]{\href{mailto:<houston.warren@sydney.edu.au>?Subject=Your UAI 2024 paper}{Houston Warren}{}}
\author[2]{Rafael Oliveira}
\author[1,3]{Fabio Ramos}
% Add affiliations after the authors
\affil[1]{%
    School of Computer Science\\
    The University of Sydney\\
    Sydney, Australia
}
\affil[2]{%
    DATA61\\
    CSIRO\\
    Sydney, Australia\\
}
\affil[3]{%
    NVIDIA\\
    USA\\
  }
  
\begin{document}
\maketitle

\begin{abstract}
    In large-scale regression problems, random Fourier features (RFFs) have significantly enhanced the computational scalability and flexibility of Gaussian processes (GPs) by defining kernels through their spectral density, from which a finite set of Monte Carlo samples can be used to form an approximate low-rank GP. However, the efficacy of RFFs in kernel approximation and Bayesian kernel learning depends on the ability to tractably sample the kernel spectral measure and the quality of the generated samples. We introduce Stein random features (SRFs), leveraging Stein variational gradient descent, which can be used both to generate high-quality RFF samples of known spectral densities and to flexibly and efficiently approximate traditionally non-analytical spectral measure posteriors. SRFs require only the evaluation of log-probability gradients to perform both kernel approximation and Bayesian kernel learning, resulting in superior performance over traditional approaches. We empirically validate the effectiveness of SRFs by comparing them to baselines on kernel approximation and well-known GP regression problems.
\end{abstract}

% ====================================================================================== INTRO

\section{Introduction}
\label{intro}
Gaussian Processes (GPs) are highly regarded in machine learning for their nonparametric regression capabilities. Their sustained prominence, despite the emergence of competitive alternative regression frameworks, stems from their principled approach to modeling uncertainty and the flexibility to incorporate domain-specific inductive biases via kernel covariance functions.

Despite their strengths, the Achilles' heel of GPs is their $\bigO{N^3}$ computational complexity with respect to the number of data points $N$. To address this, numerous low-rank and sparse methodologies have been developed that seek to preserve GP advantages while mitigating their computational footprint. Notably, random Fourier features (RFFs) \citep{rahimi_random_2008} and their use in sparse spectrum GPs (SSGPs) \citep{lazaro-gredilla_sparse_2010} represent leading efforts in this domain.

Applying Bochner's theorem \citep{rudin_fourier_2011}, RFF methods model stationary kernels as expectations under a spectral density $\sm(\w)$:
\begin{equation}\label{eq:bochner}
    k(\x, \xp) = \int_{\R^\xdim} \sm(\w) e^{-i\w^\transpose(\x-\xp)} d\w , \quad \x, \xp \in \R^\xdim\,.
\end{equation}
Given a spectral density $\sm$, $k$ can then be approximated using a finite set of $R \ll N$ Monte Carlo samples $\w\sim\sm(\w)$, thereby enabling efficient low-rank GP inference. 

Approximation of prevalent kernels with known spectral distributions (for example, the radial basis function (RBF) kernel, whose spectral measure is Gaussian) hinges on the spectral distribution's sampling method. Quasi-Monte Carlo (QMC) sampling, noted for its superior approximation accuracy, is effective when $\sm(\w)$ has an accessible inverse CDF, a condition not met by many common kernels.

RFFs also enable a flexible kernel learning scheme through direct optimization of the $R$ finite samples of $\sm(\w)$ as hyperparameters, offering a pathway to empirically approximate optimal stationary kernels directly from data. However, such schemes are generally susceptible to overfitting \citep{tan_variational_2016}. 

A natural remedy is to instead learn a Bayesian posterior over frequencies $p(\w | D)$ or spectral measure $\Pm(\sm | D)$, but this approach in general does not yield tractable inference. Recent advances have explored MCMC and mean-field variational inference (VI) for approximate kernel posterior inference \citep{hensman_variational_2018, miller_bayesian_2022}, which respectively offer downsides in computational expense and restricted prior selection on the spectral measure.

Separate to the rise of the RFF and SSGP paradigms has been the growth of particle-based sampling techniques, the bellwether for which is Stein variational gradient descent (SVGD) \citep{liu_stein_2016}, which blends the strengths of MC and VI methodologies. SVGD iteratively refines a set of particles to more closely approximate a target distribution $p$ through gradient descent on the Kullback-Leibler divergence. Crucially, SVGD leverages only gradient evaluations of a target's unnormalized log density. This facilitates efficient sampling from complex Bayesian posteriors previously deemed intractable.

For RFFs, where kernels are approximated through particle samples $\w$ of a spectral measure $\sm(\w)$, the application of SVGD presents a novel intersection of ideas. Despite their intuitive relationship, the combination of these techniques has received limited attention in the literature. In this paper, we make an initial step towards fusing these fields with the presentation of Stein random features (SRFs), which leverage SVGD for fitting, learning, and performing approximate posterior inference on RFF spectral measures and their corresponding kernels. This approach offers novel flexibility and performance advantages, and a significant motivation for this work is to inspire further investigation into the confluence of these methods. We list our contributions as follows:
\paragraph{Contributions}
\begin{itemize}
    \item \textbf{SVGD Inference for RFFs:} We propose a novel application of Stein variational gradient descent to improve the accuracy of low-rank kernel approximations by utilizing only gradient evaluations of the kernel's spectral measure.
    \item \textbf{Mixture Stein Random Feature Regression (M-SRFR):} Extending beyond kernel approximation, we introduce a Bayesian inference framework which uses SVGD to efficiently generate diversified approximate posterior samples of empirical kernel spectral measures.
    \item \textbf{Empirical Benchmarks:} We provide evaluations on common benchmarks in order to demonstrate the flexibility and efficacy of our methods.    
\end{itemize}
% \item Finally, we present our SVGD-based scheme Mixture Stein Random Features (M-SRFR) for Bayesian inference over kernel spectral measures, and demonstrate how SVGD enables efficient and diversified samples of functional posteriors over kernel spectral measures while offering broad flexibility to incorporate domain-informed spectral measure prior specifications.
% Include here a repeat of \eqref{eq:srfm} to emphasize the contribution.
% \begin{itemize}
%     \item we additionally extend our method to the kernel learning domain in Gaussian process regression, where we formulate a functional kernel learning problem and leverage our method to tractably and efficiently sample functional over kernel spectral measures that enable the learning of diversified kernel function ensembles. Such ensembles improve the predictive and uncertainty calibration performance of sparse spectrum GPs without significantly increasing computational cost, and aditionally enable Bayesian inference over traditionally intractable kernel functions priors without the use of mean-field variational inference assumptions.
%     \item we demonstrate the value of our method through a series of kernel approximation and regression benchmarks on well-kown GP regression problems.
% \end{itemize}
% ====================================================================================== Preliminaries
\section{Preliminaries}
\label{prelim}
This section outlines the necessary preliminaries used in the derivation of Stein random features and mixture Stein random feature regression, assuming a baseline familiarity with Gaussian processes (GPs) and kernel covariances. For a thorough review, refer to~\citet{rasmussen_gaussian_2006}.

\subsection{Gaussian Processes}
Gaussian processes (GPs) \citep{rasmussen_gaussian_2006} are a Bayesian non-parametric regression method that defines a distribution over functions. A zero-mean GP prior $f \sim \gp(0, k_{\vec\theta})$ is uniquely defined by its covariance function $k_{\vec\theta}$, which is specified by its own hyperparameters. Given observations $\y = f(\X) + \vec\epsilon$,
at a set of inputs $\X = \{\x_i\}_{i=1}^N \subset\R^\xdim$, assuming $\vec\epsilon \sim \normal(0, \sigma^2\eye)$, a GP model predicts $\vec f_* := f(\X_*) \sim \normal(\vec\mu_*, \mat\Sigma_*)$ at any $\X_*$ as:
\begin{align}\label{eq:gpmean}
    \vec\mu_* &= \K_{*\x}(\K_{\x\x} + \sigma^2 \mathbf{I})^{-1}\y,\\
    \mat\Sigma_* &= \K_{**} - \K_{*\x} (\K_{\x\x} + \sigma^2 \mathbf{I})^{-1}\K_{\x*}, \label{eq:gpcov}
\end{align}
where $\K_{\x\x} = k_{\vec\theta}(\x, \xp), \, \forall \, \x, \xp \in \X$. Kernel and GP hyperparameters $\bm{\theta}$ are usually estimated by minimising the negative log-marginal likelihood (NLL)\footnote{In NLL equations, $\pi$ denotes the usual irrational constant arising from the entropy of Gaussian distributions.}:
\begin{equation}\label{eq:gpnll}
    \begin{split}
        \mathcal{L}(\bm{\theta}) = \frac{1}{2} \log |\K_{\mathbf{xx}} + \sigma^2 \mathbf{I}| + \frac{1}{2} \mathbf{y}^{\top} &(\K_{\mathbf{xx}} + \sigma^2 \mathbf{I})^{-1} \mathbf{y} \\
        &+ \frac{N}{2} \log(2\pi)\,.
    \end{split}
\end{equation}
The critical limitation of GPs is the $\bigO{N^3}$ complexity due to Gram matrix $\K$ inversion, which for large $N$ grows computationally intractable.

\subsection{Random Fourier Features and Sparse Spectrum GPs}
The computational disadvantages of GPs on large datasets have led to significant interest in low-rank approximations, among which random Fourier features (RFFs) \citep{rahimi_random_2008} and sparse spectrum Gaussian processes (SSGPs) \citep{lazaro-gredilla_sparse_2010} have been significant developments. These approaches derive from Bochner's theorem, which establishes the connection between shift-invariant kernels and non-negative spectral measures. The formulation used here follows the presentation given in \citet{warren_generalized_2022}:
\begin{theorem}[Bochner's theorem \citep{rudin_fourier_2011}]\label{boch}
    A shift-invariant kernel $k(\x, \x^\prime) = k(\x - \x^\prime)$ is positive-definite if and only if it is the Fourier transform of a non-negative measure.
\end{theorem}

Bochner's theorem implies that kernels can be uniquely defined through probability measures such that kernel learning can be reframed as learning spectral measures.

\paragraph{Random Fourier Features:}
RFFs propose that we can form finite rank approximations to kernels using Monte Carlo samples of their spectral measure $\sm(\w)$:
\begin{equation}\label{eq:rff}
    \begin{split}
        k(\x - \x^\prime) & = \int_{\R^d} \sm(\bm{\omega}) e^{i\bm{\omega}^\transpose(\x - \x^\prime)} \, d\bm{\omega} \\
        & = \int_{\R^d} \sm(\bm{\omega}) \cos(\bm{\omega}^\transpose(\x - \x^\prime)) \, d\bm{\omega} \\
        & \approx \frac{1}{R} \sum_{r=1}^R \cos(\bm{\omega}_r^\transpose(\x - \x^\prime)) \,.
    \end{split}
\end{equation}

An alternative representation is as the dot-product between trigonometric basis functions $k(\x - \x^\prime) \approx \Phi(\x)^\transpose \Phi(\xp)$:
\begin{equation}\label{eq:rffphi}
    \Phi(\x) = \frac{\sqrt{2}}{\sqrt{2R}} \begin{bmatrix}
           \cos(\w_1^\transpose \x) \\
           \sin(\w_1^\transpose \x) \\
           \vdots \\
           \cos(\w_R^\transpose \x) \\
           \sin(\w_R^\transpose \x)
         \end{bmatrix} \,.
\end{equation}

\paragraph{Sparse Spectrum Gaussian Processes:}
SSGPs leverage the RFFs to form a low-rank GP approximation, where the GP predictive equations and log-likelihood are given by:
\begin{align}
    \vec\mu_* &= \Phi(\X_*)^\transpose \mathbf{A}^{-1} \Phi(\X) \y, \label{eq:rff_mu}\\  
    \mat\Sigma_* &= \sigma^2\Phi(\X_*)^\transpose\mathbf{A}^{-1}\Phi(\X_*), \label{eq:rff_var}\\
    \begin{split}\label{eq:rff_nll}
        \log p(\y|\bm{\theta}) = -\frac{1}{2\sigma^2} &\left[ \y^\transpose \y - \y^\transpose \Phi(\X)^\transpose \A^{-1} \Phi(\X)\y  \right] \\
        &- \frac{1}{2} \log | \A | + R \log \sigma^2 \\ % since |\Phi^T\Phi + \sigma^2 I_N| = \sigma^{2(N-2R)} |A| for the 2R x 2R matrix A
        &- \frac{N}{2} \log (2 \pi \sigma^2) ,
    \end{split}
\end{align}
where $\Phi(\X) := [\Phi(\x_1),\dots,\Phi(\x_N)]$ is defined as in Equation \ref{eq:rffphi} and $\mathbf{A}$ is a $2R \times 2R$ matrix defined by:
\begin{equation}
    \mathbf{A} = \Phi(\X)\Phi(\X)^\transpose + \sigma^2 \mathbf{I},
\end{equation}
thus reducing the computational complexity of GP inference to $\bigO{R^3}$, where $R\ll N$ is the number of Fourier features. 

% The authors additionally propose that SSGPs can be leveraged in a kernel learning scheme where the $R$ spectral measure samples $\w_i$ are treated as kernel hyperparameters. This approach, while prone to overfitting, does not assign a distributional family to $\sm(\w)$ by instead opting for a finite empirical representation. This naturally aligns with the empirical particle distributions of Stein variational gradient descent.

\subsection{Stein Variational Gradient Descent} \label{sec:stein}
Stein variational gradient descent (SVGD) \citep{liu_stein_2016} represents a means of sampling complex distributions that unifies the strengths of MC methods and variational inference (VI) frameworks via a particle-based, gradient-oriented strategy. VI \citep{bishop_pattern_2006} seeks to approximate an intractable target distribution $p$ from a family of tractable distributions $q \in \mathcal{Q}$ through minimization of the Kullback-Leibler (KL) divergence between $q$ and $p$:
\begin{equation}\label{eq:KL}
    q^* \in \argmin_{q \in \mathcal{Q}} \kl{q}{p}.
\end{equation}
VI's success hinges on the choice of approximation family $\mathcal{Q}$, which must offer straightforward inference while being adequately flexible to represent an arbitrarily complex $p$.

SVGD differs from VI in that it instead proposes to apply a sequence of transformations $\tf_{\vhfunction_t}(\w) = \w + \vhfunction_t(\w)$ to particles $\w$ sampled from an initial distribution $\w \sim q_0$, steering them towards $p$ by following gradient flows of $\kl{q}{p}$ \citep{liu_understanding_2019}. Crucially, the variational approximation $q$ is non-parametric, and therefore not confined to a specific family $\mathcal{Q}$. Assuming $\vhfunction_t$ is a member of a reproducing kernel Hilbert space $\Hspace_\kappa^\xdim$, the optimal transformation $\tf_{\vhfunction_t}$ has an analytic expression, resulting in a tractable algorithm for variational inference:
\begin{equation}
    \w_i^{t+1} = \w_i^{t} + \epsilon \vhfunction_t(\w_i^{t}),
\end{equation}
where $\epsilon$ is a small step-size parameter and $\vhfunction_t$ is given by:
\begin{equation}\label{eq:svgd}
    \begin{split}
        \vhfunction_t(\w) = \frac{1}{R} \sum_{j=1}^R \big[ \kappa(\w, \w_j^t) \nabla_{\w_j^t} &\log p(\w_j^t)
        + \nabla_{\w_j^t} \kappa(\w, \w_j^t) \big],
    \end{split}
\end{equation}
in which $\kappa: \R^\xdim\times\R^\xdim\to\R$ is a positive-definite kernel function, and $\w_i^0 \sim q_0$, for a given base distribution $q_0$. The first term in Equation \ref{eq:svgd} serves as an attractive force for particles $\w_i$ to converge on high-density regions of $p$ while the second term serves as a repulsive force that encourages diversity between particles and avoids mode collapse.

SVGD's advantage over VI is that it relies only on the specification of a kernel function $\kappa$ and gradient evaluations of a target's (unnormalized) log probability $\log p(\w)$. Such advantages make SVGD applicable in the approximation of posterior distributions for which it could be challenging to find a suitable variational family of parametric distributions.

\subsection{Functional Kernel Learning}\label{sec:fkl}
In the GP framework, optimization of kernel hyperparameters $\bm{\theta}$ focuses on the GP negative log-marginal likelihood in Equation \ref{eq:gpnll}, aiming to find:
\begin{equation}\label{eq:gpobjective}
    \begin{split}
        \tta^* &\in \argmin_{\tta} \, -\log p(\y | \tta) \\
        &= \argmin_{\tta} \, \mathcal{L}(\tta)
    \end{split}
\end{equation}

By adopting Bochner's theorem, which allows for expressing kernels via their spectrum, one may shift towards optimizing the kernel directly via its spectral measure $\sm(\w)$. This transforms GP optimization into a \textit{functional} objective over the space $\Pspace(\R^\xdim)$ of probability measures on $\R^\xdim$:
\begin{equation}\label{eq:functional_obj}
    \sm^* \in \argmin_{\sm \in \Pspace(\R^\xdim)} \mathcal{L}[\sm]
\end{equation}
For a finite sample $\mat\Omega_0 := \{\w_{i,0}\}_{i=1}^R$, where $\w_{i,0} \sim \sm_0$, one approach is to follow the gradient of the GP NLL $\nll[\hat\sm_t] = \nll(\mat\Omega_t) = -\log p(\y|\mat\Omega_t)$, updating:
\begin{equation}
    \mat\Omega_{t+1} = \mat\Omega_t - \epsilon \nabla_{\mat\Omega_t} \nll(\mat\Omega_t)\,.
\end{equation}
Note that $\mat\Omega$ can be simply seen as a matrix, so that $\nabla_{\mat\Omega} \nll(\mat\Omega)$ is also a matrix.
This approach was proposed in the original sparse spectrum Gaussian processes work by \citet{lazaro-gredilla_sparse_2010}, which also included the other GP hyperparameters in the same optimization loop. As a result, one obtains a maximum likelihood estimate (MLE) of the kernel and its empirical spectral measure $\hat\sm := \frac{1}{R}\sum_{i=1}^R \Dirac_{\w_i}$, where $\Dirac_\w$ represents the Dirac measure at $\w\in\R^\xdim$.

% ====================================================================================== Related Work
\section{Related Work}\label{related}
\paragraph{Kernel Approximation with RFFs}
There has been significant focus on improving the quality of the RFF kernel approximation presented in Equation \ref{eq:rff}. One avenue considers enhancing the quality of MC and QMC samples $\w_i$ through post-hoc adjustments to reduce variance and bolster approximation quality \citep{Le2013Fastfood, yu_orthogonal_2016, chang_data-driven_2017}. Additionally, alternative quadrature techniques, including numerical and Bayesian quadrature, have been proposed as alternative means for the integral approximation of Equation \ref{eq:rff} \citep{ohagan_bayeshermite_1991, mutny_efficient_2018}. A thorough review is available in \citet{liu_random_2021}. We posit that our SVGD-based approach for kernel approximation, detailed in Section \ref{sec:kernelapprox}, offers distinct benefits. It simplifies implementation across spectral measures using gradient evaluations and is underpinned by SVGD's robust theory.

\paragraph{Spectral Kernel Learning}
Bochner's Theorem \ref{boch} has motivated a plethora of techniques that conceptualize kernel learning as probabilistic inference on spectral measures. Beyond RFFs, spectral mixture kernels (SMKs) \citep{wilson_gaussian_2013} represent kernel spectral measures as Gaussian mixture models. Recent advances have generalized and extended the SMK approach to introduce nonstationarity, scalability, and variational inference \citep{samo_generalized_2015, remes_non-stationary_2017, shen_harmonizable_2019, jung_efficient_2022}. 

Concurrently, RFFs have evolved through integration with advanced kernel learning and GP architectures, including deep kernels \citep{xie_deep_2019, xue_deep_2019, mallick_deep_2021}, generative adversarial networks \citep{li_implicit_2019}, and deep Gaussian processes \citep{cutajar_random_2017}. To avoid overfitting, Bayesian inference over frequencies $\w$ using MCMC \citep{miller_bayesian_2022} and variational inference \citep{hensman_variational_2018, zhen_learning_2020, cheema_integrated_2023} has also been proposed.

The methodology we propose complements many of these advances. M-SRFR can be viewed as simply adding a mixture dimension $M$ to a point estimate of kernel spectral measure parameters $\w$, with access to gradients of the score function -- which nearly all of the aforementioned architectures calculate in training -- being the only requirement for implementation.

\paragraph{Statistical Inference over Functions}
Statistical inference over functions, within kernel learning and broader contexts, is an area of active research. Others have considered performing functional inference on kernels like we do here, but differ in that they assign distributional families to the spectral measure priors as GPs \citep{benton_function-space_2019} or Gaussian mixtures \citep{hamid_marginalising_2022}.

Functional inference is in general a popular research topic within the context of Bayesian inference and Bayesian neural networks \citep{wang_nonlinear_2019, sun_functional_2019, Ma2021, dangelo_stein_2021, pielok_approximate_2022}. A notable commonality between these approaches is their use of SVGD or other particle-based techniques for inference. These works help to inspire the method we now present, which is a more focused application of the functional inference problem to kernel learning in GPs.

% ====================================================================================== SRF
\begin{figure*}[!htb]
  \centering
  \includegraphics[width=\linewidth]{figures/co2.pdf}
  \caption{Comparison of traditional RFF Kernel Learning to an M-SRFR posterior with $M = 8$ components}\label{fig:co2}
\end{figure*}

\section{Stein Random Features}
\label{srfsection}
We now derive our proposed methodologies for operationalizing the theoretical and intuitive connections between RFFs and SVGD. We specifically propose two promising routes:
\begin{enumerate}
    \item Using SVGD as a sampling mechanism for forming RFF approximations to kernels with known spectral measures.
    \item Extending the results of Section \ref{sec:fkl} to propose mixture Stein random feature regression (M-SRFR) for posterior inference over kernel spectral measures.
\end{enumerate}

The former item we leave to Section \ref{sec:kernelapprox}, as it represents a straightforward, though nonetheless novel, application of SVGD sampling routines for generating RFF kernel frequencies. Instead, we focus here on motivating the use of SVGD in kernel learning from a theoretical perspective, and subsequently extending to posterior inference.

\subsection{Recovering SVGD Through Functional Kernel Learning}
% We now form concrete theoretical connections between SVGD and kernel learning in the spectral domain. We specifically review the connections between functional kernel learning \ref{sec:fkl} and SVGD.
% We first note that \citet{rahimi_random_2008} elucidate that kernel matrix entries $k(\x, \xp)$ correspond to expectations under the spectral measure:
% \begin{equation}\label{eq:rffkernelapprox}
%     \begin{split}
%         k(\x - \xp) &= \int \sm(\w) e^{-i \w (\x - \xp)} d\w \\
%         &= E_{\w\sim\sm} \left[e^{-i \w \x} e^{-i \w \xp} \right].
%     \end{split}
% \end{equation}
\citet{mallick_deep_2021}, deriving a result for deep probabilistic kernel learning applicable in the RFF context, show that with kernel matrices defined in the form of Equation \ref{eq:bochner}, the optimal $\sm^*$ for the functional objective of Equation \ref{eq:functional_obj} can be approximated as a non-parametric particle approximation $q^*$ with gradients:
\begin{equation}\label{eq:mallick_grad}
    \begin{split}
        \nabla_{\sm} \mathcal{L} \left[ \sm \right] &\approx \nabla_q \mathcal{L} \left[ q \right] \\
        &\approx \sum_{r=1}^R \kappa(\w_r, \cdot) \nabla_{\w_r} \mathcal{L}(\w) \,.
    \end{split}
\end{equation}
We can observe that the gradient step defined in \eqref{eq:mallick_grad} equates to a particle update similar to the SVGD update in \eqref{eq:svgd}, but without a particle repulsion term $\nabla_{\w_j} \kappa(\w, \w_j)$. To directly recover a variant of SVGD through incorporation of the repulsive term, we can use the results of \citet{liu_understanding_2019} who show that particle repulsion can be derived from the addition of an entropy regularization term $H\left[ q \right]$ to the functional marginal log-likelihood:
\begin{equation}\label{eq:svgd_functional}
    \sm^* \approx q^* = \argmin_q \mathcal{L} \left[ q \right] -  H\left[ q \right].
\end{equation}
% where $\w_r$ are a finite set of particles sampled from the base distribution $q_0(\w)$, and $\kappa$ is a positive definite kernel between samples $\w_r$. 

The result of Equation \ref{eq:svgd_functional} is a kernel learning scheme with near equivalence to SVGD, thereby signifying a theoretical convergence between kernel learning on RFFs and SVGD that matches their intuitive connections. Detailed derivations of these properties are provided in the supplement (see \autoref{app:functional-kl}).

\subsection{Posteriors Over Spectral Measures}
Equation \ref{eq:svgd_functional} represents entropy-regularized maximum-likelihood inference over a kernel spectral measure $\sm(\w)$, which results in a single estimate. We propose to extend these results to instead perform Bayesian inference over spectral measures, leading to a posterior over kernels. 
% However, maximum-likelihood fitting of RFFs is prone to overfitting, and our experiments show the addition of functional entropy term $H\left[ q \right]$ does not improve empirical performance. Reformulating the problem to posterior inference can addresses both of these concerns.

We begin by formulating the posterior over kernel spectral densities $\sm(\w)$. The GP likelihood of observing data $D$ under a kernel $k$ characterized by spectral measure $\sm$ is denoted as $p(D|\sm)$. Keeping aside measure-theoretic formalities and regularity conditions for now, assume we have a prior $\Pm$ over the space of probability distributions $\Pspace(\R^d)$. We can then formulate a posterior over a kernel's spectral measure $\sm$ as:
\begin{equation}
    \Pm(\sm|D) \propto p(D | \sm)\Pm(\sm) \,.
    \label{eq:sm_posterior}
\end{equation}
% where $p(\sm)$ represents a probability measure over spectral measures $\mathcal{P}(\mathbb{R}^d)$. 
Taking a similar functional kernel learning approach,~\citet{benton_function-space_2019} choose to represent $\Pm(\sm)$ by placing a GP prior on $\log \sm(\w)$ and applying Markov chain Monte Carlo for inference over the latent process \citep{murray2010slice}. We however adopt a particle-based VI approach via SVGD.

We introduce a variational approximation $\Qm(\sm) \approx \Pm(\sm |D)$, which will be characterized by an empirical particle distribution. The variational objective is to minimize the KL divergence between the approximate and the true posterior:
\begin{equation}\label{eq:srfr_objective}
    \Qm^* \in \argmin_{\Qm} \kl{\Qm(\sm)}{\Pm(\sm | D)}.
\end{equation}
Given initial particles $\sm_m \sim \Qm_0(\sm)$, our goal is to identify a transformation $\tf$ which through minor adjustments $\epsilon$ guides particles towards minimizing the KL divergence. \citet{wang_nonlinear_2019} demonstrate that, even if $p(D | \sm)$ is a nonlinear functional of $\sm$, the optimal transformation of particles $\sm_m$ can still be given by a Stein update rule similar to \eqref{eq:svgd}.

The GP likelihood in \eqref{eq:sm_posterior} is a nonlinear functional of the spectral measure $\sm(\w)$ via the kernel \eqref{eq:bochner}, which appears in nonlinear relations in \eqref{eq:gpnll}. We can thus apply SVGD to iteratively update particles $\sm_m$ to minimize \eqref{eq:srfr_objective} and form an empirical posterior approximation $\Qm^*$. The difference in this setting versus traditional SVGD is that we view individual particles as probability measures $\sm_m$ themselves, rather than samples of a probability measure over individual points.

\subsection{Inference in the space of measures}
Now we describe a more formal treatment of the inference problem at hand. To precisely define a prior over the space of probability measures $\Pspace(\R^\xdim)$, we consider transport maps and their pushforwards, instead of the individual measures directly. Similar to traditional SVGD, let $\tf_\vhfunction(\w) := \w + \vhfunction(\w)$, but now assume that $\vhfunction$ follows a stochastic process $\SP$, which defines a prior $\Pm_{\set{F}}$ over the space of functions $\set{F}(\R^\xdim)$ mapping $\R^\xdim$ to itself. For instance, we can have a vector-valued Gaussian process as a prior $\vhfunction \sim \gp(\vec 0, \mat\Sigma)$, defined by a matrix-valued kernel, e.g., $\mat\Sigma(\w, \w') := \kappa_\vhfunction(\w,\w')\eye$ \citep{alvarez2012kernels}. Given a base measure $\sm_0$, each realisation of the pushforward\footnote{The pushforward of a measure $P$ on $\set{X}$ by a measurable map $\tf:\set{X}\to\set{Y}$ is defined as the measure $Q := \tf\# P$ on $\set{Y}$ such that $Q(\set{A}) = P(\{x\in\set{X}\mid\tf(x)\in\set{A}\})$ for any measurable $\set{A}\subset \set{Y}$.} $\sm_\vhfunction := \tf_\vhfunction\#\sm_0$ defines a probability measure in $\Pspace(\R^\xdim)$. Therefore, the stochastic process $\vhfunction\sim\SP$ defines a prior over $\Pspace(\R^\xdim)$ via the corresponding transport maps $\tf_\vhfunction$.

Now we formulate an SVGD perspective over the space of vector-valued functions $\Fh := \set{F}(\R^\xdim)$. Given $\vgfunction:\R^\xdim\to\R^\xdim$, let $\tf_\vgfunction:\set{F}\to\set{F}$ define a transform on $\set{F}$ such that $\tf_\vgfunction(\vhfunction)(\w) = \vhfunction(\w) + \vgfunction(\vhfunction(\w))$, for all $\w\in \R^d$. Given a base measure $\Qm_0$ over $\set{F}$, associated with a stochastic process $\vhfunction_0 \sim \Qm_0$, we aim to apply a sequence of transformations $\tf_{\vgfunction_t}$ to $\Qm_0$, so that the pushforward $\Qm_t := \tf_{\vgfunction_t} \# \Qm_{t-1}$ converges to a target measure $\Pm_*$ on $\Fh$ as $t\to\infty$. To do so, we follow the gradient flow of the KL divergence:
\begin{equation}
    \kl{\Qm_t}{\Pm_*} = \expectation_{\vhfunction\sim\Qm_t} \left[\log \frac{\diff\Qm_t}{\diff\Pm_*}(\vhfunction)\right]\,,
\end{equation}
where $\frac{\diff\Qm_t}{\diff\Pm_*}$ is the Radon--Nikodym derivative of $\Qm_t$ with respect to $\Pm_*$ \citep{Bauer1981}.
Assuming $\Qm$ is absolutely continuous w.r.t. $\Pm_*$, the derivative $\frac{\diff\Qm}{\diff\Pm_*}$ is well defined. As shown in previous works in the literature of function-space VI \citep{Ma2021}, the KL divergence between two stochastic processes is equivalent to:
\begin{equation}
    \kl{\Qm_t}{\Pm_*} = \sup_{n, \mat\Omega_n} \kl{q_t(\mhfunction_n)}{p_*(\mhfunction_n)}\,,
    \label{eq:kl-sup}
\end{equation}
where $\mhfunction_n := \vhfunction(\mat\Omega_n) = [\vhfunction(\w_1), \dots, \vhfunction(\w_n)]^\transpose$, and $\mat\Omega_n$ is an $n$-element subset of $\R^\xdim$, or equivalently an $n$-by-$\xdim$ matrix, and $q_t$ and $p_*$ are the joint probability measures on $\R^{n\times\xdim}$ associated with the stochastic processes defined by $\Qm_t$ and $\Pm_*$, respectively.
Given the above, we can now state our theoretical result, which we prove in \autoref{app:distributional-svgd}.

\begin{theorem}
    \label{thr:kl-grad}
    Let $\tf_\vgfunction$ be as above, where $\vgfunction:\R^d\to\R^d$ is an element of the vector-valued reproducing kernel Hilbert space $\Hspace_\kappa^d$ associated with a positive-definite kernel $\kappa$. Given a probability measure $\Pm_*$ on $\Fh$, the direction of steepest descent in the KL divergence $\kl{\Qm_\vgfunction}{\Pm_*}$ is given by:
    \begin{equation}
    \begin{split}
        &\nabla_\vgfunction \kl{\Qm_\vgfunction}{\Pm_*}\big|_{\vgfunction=0} =\\
        &\quad- \expectation_{\vhfunction\sim\Qm}[ \kappa(\cdot, \mhfunction^*)\nabla_{\mhfunction^*}\log p_*(\mhfunction^*) + \nabla_{\mhfunction^*}\kappa(\cdot, \mhfunction^*)],
    \end{split}
    \end{equation}
    where $\mhfunction^* := \vhfunction(\mat\Omega_{n^*}^*)$, assuming the supremum is reached at $n^*$ and $\mat\Omega_{n^*}^*$ in \autoref{eq:kl-sup}.
    % are such that $\kl{\Qm}{\Pm_*} = \kl{q(\vhfunction(\mat\Omega_{n^*}^*))}{p_*(\vhfunction(\mat\Omega_{n^*}^*))}$. 
    In particular, for an empirical base measure $\hat\sm := \frac{1}{R}\sum_{i=1}^R\Dirac_{\w_i}$ supported on $\mat\Omega_R = \{\w_i\}_{i=1}^R$, we have that:
    \begin{equation}
    \begin{split}
        &\nabla_\vgfunction \kl{\Qm_\vgfunction}{\Pm_*}\big|_{\vgfunction=0} =\\
        &- \expectation_{\mhfunction\sim q(\mhfunction)}[ \kappa(\cdot, \mhfunction)\nabla_{\mhfunction}\log p_*(\mhfunction) + \nabla_{\mhfunction}\kappa(\cdot, \mhfunction)]\,,
    \end{split}
    \end{equation}
    where $\mhfunction := \vhfunction(\mat\Omega_R) \in \R^{R\times\xdim}$.
\end{theorem}
This result allows us to apply SVGD steps in the space of measures $\Pspace(\R^\xdim)$ by following the gradient flow of the KL divergence between stochastic processes. We can then identify $\Pm_*$ with $\Pm(\sm|D)$ in the previous section to learn an approximation to the posterior distribution over spectral measures. Moreover, this result also shows us that we can treat inference in the space of measures as inference over matrices, when restricted to empirical measures. Further discussion about the theoretical result in \autoref{thr:kl-grad} and its application to our problem formulation is deferred to \autoref{sec:thr-discussion} in the appendix.

%TODO: We might need to expand on how we can identify the posterior stochastic processes with the posterior distribution over spectral measures. And also we need to explain how in the end it all boils down to transforms over matrices. The main idea is that each \Omega_R^i = h_i(\Omega_R) is a random matrix realisation. For the initial transform, these can be completely random when \kappa is universal, since we can always find a transform in H_\kappa between two given points. The subsequent steps will all be operating on these transformed matrices.

\subsection{Mixture Stein Random Feature Regression}\label{sec:msrfr}

\begin{algorithm}
\caption{Mixture Stein Random Feature Regression (M-SRFR)}
\begin{algorithmic}[1]
\REQUIRE Dataset $D$, GP kernel $k$ parametrized by $M$ RFF frequency matrices $\{\mat\Omega_m\}_{m=1}^M$, SVGD kernel $\kappa$, particle prior $p(\mat\Omega)$, step size $\epsilon$, hyperparameter $\alpha$, and number of iterations $T$.
\FOR{$t = 1$ to $T$}
    \STATE Compute gradient $\nabla_{\mat\Omega_m} \log p(D | \mat\Omega_m) p(\mat\Omega_m)$ with $p(D | \mat\Omega_m)$ from \eqref{eq:rff_nll} for all $m \in \{1, \dots, M\}$.
    
    \FOR{each $\mat\Omega_m, m \in \{1, \dots, M\}$}
        \FOR{each $\mat\Omega_j, j \in \{1, \dots, M\}$}
            \STATE Compute the kernel value $\kappa(\mat\Omega_m, \mat\Omega_j)$ from \eqref{eq:matrix_kernel} and gradient $\nabla_{\mat\Omega_j} \kappa(\mat\Omega_m, \mat\Omega_j)$ from \eqref{eq:matrix_kernel_grad}
        \ENDFOR
        \STATE Apply M-SRFR update rule $\mat\Omega_m^{t+1} \leftarrow \mat\Omega_m^t$ according to \eqref{eq:msrfr}
    \ENDFOR
\ENDFOR
\STATE \textbf{Output:} Learned kernel $k$ with frequencies $\{\mat\Omega_m^T\}_{m=1}^M$
\end{algorithmic}
\label{alg:msrfr}
\end{algorithm}

% Transitioning from a theoretical discourse to practical implementation, we seek an operative representation for particles $\sm_m$ to facilitate function SVGD in practice. One option might be to treat each $\sm_m$ as a flexible parametric distribution such as a Gaussian mixture, an approach taken by \citet{hamid_marginalising_2022}. We opt for a broader strategy that underscores our empirical particle distribution approach.

% Given the unbiased approximation capability of the RFF basis function projection $\Phi(\x)$, as detailed in Equation \ref{eq:rffphi}, and its utilization within SSGPs, we circumvent the necessity of predetermining a distributional family for $\sm(\w)$. Instead, we leverage an empirical representation given RFF's ability to approximate $k_{\sm(\w)}$ with exponential convergence \citep{rahimi}.
As above, let $\mat\Omega_m$ represent a set of samples $\{\w_{i, m}\}_{i=1}^R$ drawn from a spectral measure $\sm_m$. In the finite-particle setting, we can associate the spectral measure posterior in \eqref{eq:sm_posterior} with a posterior over the empirical representation of $\sm$ by a frequency matrix $\mat\Omega$:
\begin{equation}\label{eq:approx_sm_posterior}
    \begin{split}
        \Pm(\sm | D) &\approx \Pm(\mat\Omega | D) ,\\
        p(\mat\Omega | D) &\propto p(D | \mat\Omega) p(\mat\Omega).
    \end{split}
\end{equation}
Now we have a prior over matrices $p(\mat\Omega)$, which can be constructed by applying priors over the individual row vectors in it, each representing a spectral frequency. For example, standard Gaussians and mixtures of them can be trivially extended to the matrix-variate setting. In any case, the methodology we derive is agnostic to the choice of prior, as long as it is differentiable with respect to $\mat\Omega$, making it flexible to incorporate a variety of prior knowledge or smoothness assumptions on the spectral distribution.
% If $p(\mat\Omega)$ is Gaussian, it facilitates the treatment of $p(\mat\Omega | D)$ within the classical Bayesian posterior framework. However, embracing more informed priors necessitates a departure from strict Bayesian inference principles. Given the value of domain-derived input to GP methodologies, we accommodate non-Gaussian priors by redefining
% \begin{equation}
%     p(\Omega) \approx \sum_{i=1}^R p(\w_{i}),
% \end{equation}
% where $p(\w_{i})$ is prior over frequencies. This adjustment in the non-Gaussian setting shifts the model's conceptualization from a purely Bayesian posterior to a mixture model framework influenced by Bayesian methods. However, our empirical results demonstrate that Gaussian priors offer compelling performance, with the benefit of maintaining theoretical rigour.

With the theoretical underpinnings of Theorem \ref{thr:kl-grad}, we can substitute $\sm$ with $\mat\Omega$ into the Stein update rules in Equation \ref{eq:svgd}. This forms the key component of our proposed method, \emph{Mixture Stein Random Feature Regression} (M-SRFR).
\begin{definition}[Mixture Stein Random Feature Regression] \label{def:msrfr}
    Given an initial set of $M$ frequency matrices $\mat\Omega_m \in \mathbb{R}^{R \times d}$, where each $\w_{i, m} \sim q_0(\w)$, M-SRFR defines an update:
    \begin{equation}\label{eq:msrfr}
    \begin{split}
        \mat\Omega_m^{t+1} = \mat\Omega_m^t + \frac{\epsilon}{M} \sum_{j=1}^M [ \kappa(&\mat\Omega_m, \mat\Omega_j) \nabla_{\mat\Omega_j} \log p(\mat\Omega_j | D) \\
        &+ \alpha \nabla_{\mat\Omega_j} \kappa(\mat\Omega_m, \mat\Omega_j) ],
    \end{split}
    \end{equation}
    where $\alpha$ is a temperature parameter, and the score gradient $\nabla_{\mat\Omega_m} \log p(\mat\Omega_m | D)$ can be separated into:
    \begin{equation}
        \nabla_{\mat\Omega_m} \log p(D | \mat\Omega_m) + \sum_{i=1}^R \nabla_{\w_{i, m}} \log p(\w_{i, m}),
    \end{equation}
    which comprises the gradient of the SSGP log-likelihood \eqref{eq:rff_nll} given the RFF frequency matrix $\mat\Omega_m$ and the sum of the gradients of the log frequency prior $\log p(\w)$ over the component frequencies $\{\w_{i, m}\}_{i=1}^R = \mat\Omega_m$. The inter-particle kernel is given by the kernel between the rows of each matrix of frequencies:
    \begin{align}
    \kappa(\mat\Omega, \mat\Omega') &= \begin{bmatrix}
        \kappa(\w_1, \w_1') &\dots &\kappa(\w_1, \w_R')\\
        \vdots &\ddots &\vdots\\
        \kappa(\w_R, \w_1') &\dots &\kappa(\w_R, \w_R')
    \end{bmatrix} .
    \end{align}
    % \kappa(\mat\Omega_m, \mat\Omega_j) = \exp{\frac{-d(\mat\Omega_m, \mat\Omega_j)^2}{2 \lambda^2}} ,
    % where $d(\mat\Omega_m, \mat\Omega_j)$ is the maximum mean discrepancy between the empirical frequency distributions given by each particle set \citep{Muandet2012}.
\end{definition}
M-SRFR is summarized in Algorithm \ref{alg:msrfr}, and we provide further implementation and gradient calculation details in Appendix \ref{sec:msrfr_update}. The construction of $\mat\Omega_m$ via frequencies $\{\w_{i, m}\}_{i=1}^R$ implies that gradient updates to $\mat\Omega_m$ directly modify its constituent frequencies. Collectively, the parameters of the M-SRFR model form a tensor with dimensions $M\times R\times d$. An example comparison of M-SRFR and traditional RFF learning is presented in Figure \ref{fig:co2}.

M-SRFR performs approximate Bayesian posterior inference over $P(\sm)$ when the temperature parameter $\alpha = 1$. However, when working with a small number of mixture components $M$, we observed empirical benefit to including $\alpha$ in the hyperparameter training routine to regulate the strength of the repulsive force. 

For making predictions with M-SRFR, summarized in Algorithm \ref{alg:msrfr_preds}, we combine the $M$ individual mixture predictive means and covariances into a single predictive distribution. We do so using properties of Gaussian mixtures, which allows for calculation of an overall mean and covariance as a uniformly-weighted aggregate of the mixture components.
\begin{algorithm}
\caption{M-SRFR Prediction on New Inputs}
\begin{algorithmic}[1]
\REQUIRE Dataset $D$, new inputs $\X^*$, and trained M-SRFR GP kernel $k$ parametrized by $M$ RFF frequency matrices $\{\mat\Omega_m\}_{m=1}^M$.
\FOR{each frequency matrix $\mat\Omega_m, m \in \{1, \dots, M\}$}
    \STATE Given inputs $\X^*$, compute SSGP prediction mean $\vec\mu_m^*$ from \eqref{eq:rff_mu} and covariance $\mat\Sigma_m^*$ from \eqref{eq:rff_var} using an RFF kernel defined by frequencies $\mat\Omega_m$
\ENDFOR
\STATE Calculate $\vec\mu^*$ and $\mat\Sigma^*$ with:
\begin{align*}
    \vec\mu^* &= \frac{1}{M} \sum_{m=1}^M \vec\mu_m^* \\
    \mat\Sigma^* &= \frac{1}{M} \sum_{m=1}^M \left[ \mat\Sigma_m^* + (\vec\mu_m^* - \vec\mu^*)(\vec\mu_m^* - \vec\mu^*)^\transpose \right]
\end{align*}
\STATE \textbf{Output:} predictions $\y^* \mid \X^*, D \sim \mathcal{N}(\vec\mu^*, \mat\Sigma^*)$
\end{algorithmic}
\label{alg:msrfr_preds}
\end{algorithm}

\subsection{Complexity and Extensibility}
Conceptually, M-SRFR orchestrates an ensemble of $M$ SSGPs, promoting diversity through the kernel repulsion term $\nabla_{\mat\Omega_j} \kappa(\mat\Omega_m, \mat\Omega_j)$ and preventing mode collapse. The sparsity of SSGPs ensures computational feasibility, with an increase in complexity from $\bigO{R^3}$ to $\bigO{MR^3}$, and in practice we find that performance increases are present even when using a small number $M \ll R$ of mixture components. Additionally, empirical evidence in Section~\ref{exp} suggests that the mixture approach of M-SRFR outperforms RFFs with an equivalent complexity using $R^* = \sqrt[3]{MR^3}$ features.

M-SRFR's versatility extends to a broad spectrum of kernel learning applications, including nonstationary Fourier features \citep{ton_spatial_2018}, spectral kernel learning \citep{wilson_gaussian_2013}, and GPs with non-Gaussian likelihoods. This adaptability stems from SVGD's sole reliance on gradient evaluations of the score function, readily facilitated by widely used auto-differentiation tools.

% ===================================================================================== Experiments
\begin{figure}[!tb]
  \centering
  \includegraphics[width=0.85\linewidth]{figures/kernel.pdf}
  \caption{Kernel approximation error and standard deviations over 10 random seeds.}\label{fig:kernel}
\end{figure}

\begin{table*}[h]
    \centering
    \caption{UCI Regression Benchmarks RMSE and NLPD with standard deviations over 10 random seeds.}\label{tab:data}
    \begin{tabular}{lllll}
    \toprule
    & airfoil & concrete & energy & wine \\
    & $R=100, M = 6$ & $R=100, M=6$ & $R=50, M = 10$ & $R=100, M = 10$ \\
    \midrule
    \multicolumn{5}{c}{\textit{RMSE}} \\
    % \midrule
    SVGP & 2.36 $\pm$ 0.24 & 6.35 $\pm$ 0.69 & 2.72 $\pm$ 0.17 & 0.62 $\pm$ 0.04 \\
    SSGP-RBF & 2.90 $\pm$ 0.60 & 5.74 $\pm$ 0.58 & 0.48 $\pm$ 0.03 & 0.82 $\pm$ 0.08 \\
    SSGP & 2.41 $\pm$ 0.53 & 5.03 $\pm$ 0.74 & 0.37 $\pm$ 0.05 & 0.87 $\pm$ 0.04 \\
    SSGP-$R^*$ & 2.54 $\pm$ 1.09 & 4.88 $\pm$ 0.65 & 0.36 $\pm$ 0.03 & 0.69 $\pm$ 0.06 \\
    SSGP-SVGD & 2.50 $\pm$ 0.59 & 5.51 $\pm$ 0.54 & 0.40 $\pm$ 0.09 & 0.76 $\pm$ 0.06 \\
    M-SRFR (Ours) & \textbf{1.88 $\pm$ 0.27} & \textbf{4.13 $\pm$ 0.72} & \textbf{0.29 $\pm$ 0.04} & \textbf{0.59 $\pm$ 0.04} \\
    \midrule
    \multicolumn{5}{c}{\textit{Negative Log Predictive Density (NLPD)}} \\
    % \midrule
    SVGP & 487.0 $\pm$ 203.4 & 272.8 $\pm$ 120.3 & 993.6 $\pm$ 205.3 & 2334.9 $\pm$ 457.0 \\
    SSGP-RBF & 780.5 $\pm$ 457.0 & 36.3 $\pm$ 11.8 & -249.4 $\pm$ 8.7 & 791.1 $\pm$ 160.7 \\
    SSGP & 216.0 $\pm$ 159.9 & 23.1 $\pm$ 14.0 & -288.0 $\pm$ 23.9 & 783.9 $\pm$ 82.3 \\
    SSGP-$R^*$ & 213.4 $\pm$ 369.6 & 20.2 $\pm$ 12.0 & -293.9 $\pm$ 17.0 & 404.0 $\pm$ 74.3 \\
    SSGP-SVGD & 4166.6 $\pm$ 2885.5 & 29.4 $\pm$ 10.0 & -261.7 $\pm$ 58.8 & 16924.9 $\pm$ 2615.7 \\
    M-SRFR (Ours) & 454.3 $\pm$ 134.5 & 113.9 $\pm$ 77.3 & -283.7 $\pm$ 38.4 & 1882.5 $\pm$ 205.3 \\
    \bottomrule
    \end{tabular} \label{tab:uci}
\end{table*}

\section{Experiments}\label{exp}
We now demonstrate the efficacy of both the SVGD approach to approximating kernels with known spectral measures as well as the performance of our M-SRFR method on common GP regression UCI benchmarks \citep{Dua2019}. Code has been made available\footnote{\scriptsize{\url{https://github.com/houstonwarren/m-srfr/}}}.

\subsection{Stein Random Features for Kernel Approximation}\label{sec:kernelapprox}
A notable yet less-emphasized contribution we introduce is leveraging SVGD-generated samples as frequencies $\w$ in RFFs for accurately reconstructing kernels with known spectral distributions. While QMC \citep{morokoff_quasi-monte_1995} sampling typically yields high-quality reconstructions, it requires tractable inverse CDFs of kernel spectral measures -- a requirement not met by many common kernels. SVGD circumvents this by merely requiring the spectral measure's score gradient.

In a comparative experiment focused on the Gaussian (RBF) kernel, we evaluate the approximation quality of randomized kernel Gram matrices $\K$ using RFFs with varied sampling techniques as well as other common low-rank kernel approximation methods. Specifically, we benchmark Matrix-SVGD \citep{wang_stein_2019} sampling against MC, QMC, orthogonal random features (ORF) \citep{yu_orthogonal_2016}, and Nystr\"{o}m approximation (NYS) \citep{yang_nystrom_2012}. Results are shown in Figure~\ref{fig:kernel}, where we use as a metric the relative Frobenius-norm error $\frac{\lVert\K - \hat{\K}\rVert_F}{\lVert\K\rVert_F}$ between the Gram approximation $\hat{\K}$ and the true Gram matrix $\K$.

The results underscore SVGD's strong approximation capabilities over other sampling techniques across different ranks $R$, and notably, SVGD outperforms data-dependent methods like Nystr\"{o}m as $R$ increases.  Across data dimensionality $d$, SVGD scales better than existing sampling based approaches, though not as efficiently as Nystr\"{o}m. Nonetheless, given the challenges kernel methods face in high-dimensional spaces, practitioners likely resort to dimensionality reduction before applying kernel techniques in such high-dimensional settings.
% Furthermore, recent work (CITE) demonstrate RFF's potential in approximating Spectral Mixture Kernels (SMKs), facilitating a sparser representation of a popular kernel learning approach.  SVGD can also be used within this domain to improve sample quality of the Gaussian mixture which parametrizes SMK. We provide a proof of concept in Figure FIG, which also serves as a usual visual aide that delineates the differential impacts of SVGD on sample quality.

\subsection{UCI Regression Benchmarks}
\begin{figure*}[!ht]
  \centering
  \includegraphics[width=0.9\linewidth]{figures/ocean.pdf}
  \caption{\textit{AUSWAVE} dataset and error with contributed methods labeled in \textcolor{blue}{blue}.}\label{fig:ocean}
\end{figure*}
We evaluate M-SRFR on a variety of regression problems from the UCI data repository \citep{Dua2019}, with baselines of sparse variational Gaussian processes \citep{hensman_scalable_2015} (SVGP), SSGPs with an RFF Gaussian kernel (SSGP-RBF), an SSGP with frequencies as hyperparameters, and an SSGP trained using the entropy-regularized functional kernel learning approach described in Section \ref{sec:fkl} (SSGP-SVGD), for which more details can be found in \citet{wang_nonlinear_2019}. Additionally we introduce SSGP-$R^*$, which has $R^* = \sqrt[3]{MR^3}$ frequency samples, matching M-SRFR's computational complexity for $M$ mixture components. 

All models and baselines are given a proper hyperparameter optimization treatment. The results in Table \ref{tab:uci}, where $M$ represents the number of mixture components used in the M-SRFR model, highlight M-SRFR's superior RMSE performance,  particularly when contrasted with SSGP-$R^*$, underscoring the mixture approach's advantage over simply enhancing RFF feature count. We employed wide Gaussian priors on the M-SRFR frequencies, and posit that specialized priors tailored to data characteristics may further enhance performance.

\paragraph{Limitations in Uncertainty Calibration} Results for NLPD, which measures uncertainty calibration, vary. All models exhibit high variance across seeds, hence we do not bold results. M-SRFR is competitive in NLPD on many datasets but is outperformed by simpler baselines that tend to make wider and less mean-accurate predictions, as we demonstrate in Appendix \ref{sec:viz}. The drop-off in performance from RMSE to NLPD for M-SRFR may be due to the non-standard predictive methodology defined in Algorithm \ref{alg:msrfr_preds}, suggesting potential refinements for future investigations.

\subsection{Large-Scale Ocean Modeling}
Lastly, we evaluate our methods on a real-world problem using public data sourced from the \textit{AUSWAVE} physics model produced by the \cite{auswave}. The task is to predict significant wave height across the spatial domain, shown in Figure \ref{fig:ocean}, using $N = 5000$ randomly sampled locations and an input dimension $d=8$  consisting of the spatial coordinates with additional physical model covariates. 

We chose this setting as it is an inherently non-stationary domain, both due to the complexity of oceanographic modeling, as well as the fact that the distribution is not supported over the entire spatial domain. As such, we demonstrate the flexibility of M-SRFR to adapt to alternative kernel learning methodologies by introducing a non-stationary M-SRFR variant through the use of deep kernels \citep{wilsonDeep2016}. Specifically, we jointly train a neural network with 3 hidden layers and 32 activations per layer to first project the data before an M-SRFR kernel mixture of RFFs is applied. The neural network is trained jointly with the M-SRFR RFF parameters, but does not receive the same ``mixture'' treatment -- i.e., all $M$ M-SRFR kernels share the same input network.

In this sense, we are measuring M-SRFR's ability to slot in as a modular component to alternative kernel learning schemes, and whether the benefits of the mixture approach extend to such a setting. We include an SSGP with a deep kernel, SSGP-Deep, as an additional baseline in order to differentiate the effect of the M-SRFR mixture from the neural network projection.

The results, shown in Figure \ref{fig:ocean} (right), show that M-SRFR's flexibility offers significant benefit. Most of the stationary baselines, including traditional M-SRFR, have difficulty adjusting to the non-stationary domain. However, the deep M-SRFR variant significantly outperforms even the SSGP-Deep variant, as well as all baselines, demonstrating that there is unique value to the mixture approach. These results highlight that with little change of methodology, M-SRFR can be injected into alternative kernel learning schemes to improve performance and flexibility.

% ====================================================================================== Conclusion
\section{Conclusion}
\label{conc}
This study introduces Stein variational gradient descent to kernel approximation and Bayesian inference over spectral measures with random Fourier features and sparse spectrum Gaussian processes. We establish a theoretical framework linking these areas through functional inference, highlighting their coherence as particle-based methods. We derive a method for approximate inference of kernel spectral measures using only gradient evaluations of mixture components, which is straightforward to extend to many spectrum-based kernel learning methods. Empirical evaluations showcase the potential of integrating these methodologies to augment kernel approximation and sparse GP regression.

This work paves the way for future research exploring the integration of RFFs and SVGD. One such avenue is the implementation of M-SRFR into other spectral kernel learning techniques of Section \ref{related} to address challenges involving nonstationary processes, high-dimensional data, and non-Gaussian likelihoods.
Future theoretical work will also focus on error and convergence analysis of our methods, which depends on further development of theoretical frameworks for the analysis of SVGD over infinite-dimensional spaces.
%, as well as investigation into how M-SRFR can be used for theoretically sound Bayesian inference with non-uniform spectral measure priors.

% ====================================================================================== References
\bibliography{references2}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% APPENDIX
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\newpage
\onecolumn
\title{Stein Random Feature Regression\\(Supplementary Material)}
\maketitle

\appendix

\section{Functional kernel learning with entropy regularisation} %TODO: Draft to be turned into a proof later on
\label{app:functional-kl}
We can derive functional gradients from the definition of Fr\'{e}chet derivatives and an analogue of Taylor's theorem for Hilbert spaces. Let $\Hspace_\kappa^\xdim := \bigotimes_{i=1}^\xdim \Hspace_\kappa$ be the vector-valued reproducing kernel Hilbert space defined by a positive semi-definite kernel $\kappa: \Omega\times\Omega\to\R$, for $\Omega\subseteq\R^\xdim$, which has the following reproducing property:
\begin{equation}
    \forall \vec{\hfunction} \in \Hspace_\kappa^\xdim, \quad \forall \anyvector \in \R^\xdim, \qquad \inner{\vec\hfunction, \kappa(\cdot, \w) \anyvector}_\kappa = \vec\hfunction(\w)^\transpose\anyvector\,,
\end{equation}
where $\inner{\cdot,\cdot}_\kappa$ denotes the inner product in $\Hspace_\kappa^\xdim$. Following the SVGD setup, at each time step $\iterIdx$, we consider a given variational probability distribution $\variational_\iterIdx$ on $\R^\xdim$ and apply a smooth transformation $\tf_{\vec\hfunction_\iterIdx}:\R^\xdim\to\R^\xdim$ to its samples $\w_\iterIdx \sim \variational_\iterIdx$:
\begin{equation}
    \w_{\iterIdx+1} = \tf_{\vec\hfunction_\iterIdx}(\w_\iterIdx) = \w_\iterIdx + \vec\hfunction_\iterIdx(\w_\iterIdx)\,,
\end{equation}
where $\vec\hfunction_\iterIdx \in \Hspace_\kappa^\xdim$, so that the next variational distribution is the pushforward of the former, i.e., $\variational_{\iterIdx+1} := \variational_{\vec\hfunction_\iterIdx} := \tf_{\vec\hfunction_\iterIdx} \# \variational_\iterIdx$.

We want to transform the GP kernel in its Fourier domain in order to minimise the GP negative log marginal likelihood. At the same time, to prevent overfitting, we may include an entropy-regularisation term into our objective, which allows for modeling uncertainty about the optimal $\variational$ when data is limited. This leads us to the following functional objective for the transform of the variational frequency distribution $\variational_{\vec\hfunction} = \tf_{\vec\hfunction}\#\variational$:
\begin{equation}
    \functional{f}[\vec\hfunction] := \functional{l}[\variational_{\vec\hfunction}] - \regf\entropy[\variational_\vhfunction]\,,
\end{equation}
for $\regf > 0$. Applying an SVGD-inspired approach, the optimal $\variational$ results from taking a sequence of optimal transformations in the RKHS $\Hspace_\kappa^\xdim$. The direction of steepest descent in $\Hspace_\kappa^\xdim$ is given by the functional gradient $\nabla_{\vec\hfunction}\functional{f}[\vec\hfunction]$, which is such that:
\begin{definition}[Functional gradient]
\label{def:func-grad}
The gradient of a functional $\functional{f}:\Hspace^\xdim\to\R$ at a point $\vhfunction\in\Hspace^\xdim$, defined over a Hilbert space $\Hspace^\xdim$ equipped with inner product $\inner{\cdot,\cdot}$, is the vector $\nabla_\vhfunction \functional{f}[\vhfunction] \in \Hspace^\xdim$ such that:
\begin{equation}
    \functional{f}[\vec\hfunction + \stepsize\vec\otherf] = \functional{f}[\vec\hfunction] + \stepsize\inner{\nabla_{\vec\hfunction}\functional{f}[\vec\hfunction], \vec\otherf} + \bigO{\stepsize^2}\,.
    \label{eq:fgrad}
\end{equation}
\end{definition}
Given an initial $\variational$, applying an infinitesimal step in the direction of steepest descent means we only need to know $\nabla_{\vec\hfunction}\functional{f}[\vec\hfunction]$ at the limit when $\vec\hfunction \to 0$.

\subsection{Stationary covariance functions}
To calculate the functional gradient, we will follow the steps of \citet{mallick_deep_2021} in the derivation of their functional gradient for the GP NLL. We start by noticing that, by the chain rule, we have:
\begin{equation}
    \nabla_{\vec\hfunction}\functional{l}[\variational_{\vec\hfunction}] = \sum_{i,j=1}^n \frac{\partial \functional{l}}{\partial K_{ij}} \nabla_{\vec\hfunction} K_{ij}[\vec\hfunction]\,,
\end{equation}
where:
\begin{align}
    K_{ij}[\vec\hfunction] &:= k_{\vec\hfunction}(\x_i, \x_j), \qquad i,j \in \{1,\dots,n\}\\
    k_{\vec\hfunction}(\x, \x') &:= \int_{\R^\xdim} \variational_{\vec\hfunction}(\w)e^{\iota\w^\transpose(\x - \x')}\diff\w, \quad \iota:=\sqrt{-1}
\end{align}
Let $\rho_{ij}(\w) := e^{\iota\w^\transpose(\x_i - \x_j)}$, for $i,j\in\{1,\dots,n\}$. Applying Taylor's expansion and the reproducing property leads us to:
\begin{equation}
\begin{split}
    \rho_{ij}(\w + \vhfunction(\w) + \stepsize\othervf(\w)) &= \rho_{ij}(\w + \vhfunction(\w)) + \stepsize \nabla \rho_{ij} (\w+\vhfunction(\w)) \cdot \othervf(\w) + \bigO{\stepsize^2\norm{\othervf(\w)}^2_2}\\
    &=\rho_{ij}(\w + \vhfunction(\w)) + \stepsize \inner{\kappa(\cdot, \w)\nabla \rho_{ij} (\w+\vhfunction(\w)), \othervf}_\kappa + \bigO{\stepsize^2}\,,
\end{split}
\end{equation}
noting that $\bigO{\stepsize^2\norm{\othervf(\w)}^2_2}$ is $\bigO{\stepsize^2}$, given that $\norm{\othervf(\w)}_2 \leq \norm{\othervf}_\kappa\sqrt{\kappa(\w,\w)}$ is bounded for any $\w\in\R^\xdim$, assuming a bounded kernel. Since $K_{ij}[\vhfunction] = \expectation_{\variational_{\vhfunction}(\w)}[\rho_{ij}(\w)] = \expectation_{\variational(\w)}[\rho_{ij}(\w + \vhfunction(\w))]$, we have that:
\begin{equation}
    \begin{split}
        K_{ij}[\vhfunction + \stepsize\othervf] - K_{ij}[\vhfunction] &= \expectation_{\variational(\w)} [\rho_{ij}(\w + \vhfunction(\w) + \stepsize\othervf(\w)) - \rho_{ij}(\w + \vhfunction(\w))]\\
        &= \stepsize \expectation_{\variational(\w)}[\inner{\kappa(\cdot, \w)\nabla \rho_{ij} (\w+\vhfunction(\w)), \othervf}_\kappa] + \bigO{\stepsize^2}\\
        &= \stepsize\inner{\expectation_{\variational(\w)}[\kappa(\cdot, \w)\nabla \rho_{ij} (\w+\vhfunction(\w))], \othervf}_\kappa + \bigO{\stepsize^2}
    \end{split}
\end{equation}
Applying \autoref{eq:fgrad} to $K_{ij}[\vhfunction]$, the functional gradient of the kernel is then given by:
\begin{equation}
    \nabla_\vhfunction K_{ij}[\vhfunction]\big|_{\vhfunction=\vec 0} = \expectation_{\variational(\w)}[\kappa(\cdot, \w)\nabla_{\w} \rho_{ij} (\w)]\,,
\end{equation}
and, for the NLL, we have:
\begin{equation}
    \begin{split}
        \nabla_{\vec\hfunction}\functional{l}[\variational_{\vec\hfunction}]\big|_{\vhfunction=\vec 0} &= \sum_{i,j=1}^n \frac{\partial \functional{l}}{\partial K_{ij}} \expectation_{\variational(\w)}[\kappa(\cdot, \w)\nabla_{\w} \rho_{ij} (\w)]\\
        &= \expectation_{\variational(\w)}\left[ \kappa(\cdot, \w) \sum_{i,j=1}^n \frac{\partial \functional{l}}{\partial K_{ij}} \nabla_{\w} \rho_{ij} (\w) \right]\,.
    \end{split}
\end{equation}
For a particle-based approximation of the kernel $\hat{K}_{ij} \approx \frac{1}{R}\sum_{r=1}^R \cos(2\pi\w^\transpose_r(\x_i - \x_j))$, the equations above simplify to:
\begin{equation}
    \begin{split}
        \nabla_\vhfunction K_{ij}[\vhfunction]\big|_{\vhfunction=\vec 0} \approx \nabla_\vhfunction \hat{K}_{ij}[\vhfunction]\big|_{\vhfunction=\vec 0} &= \frac{1}{R}\sum_{r=1}^R\kappa(\cdot,\w_r)\nabla_{\w_r}\cos(2\pi\w_r^\transpose(\x_i-\x_j))\\
        &= \sum_{r=1}^R \kappa(\cdot,\w_r)\nabla_{\w_r} \hat{K}_{ij}\,,
    \end{split}
\end{equation}
and, for the NLL with the particle-based kernel, we have:
\begin{equation}
    \begin{split}
        \nabla_{\vec\hfunction}\functional{l}[\variational_{\vec\hfunction}]\big|_{\vhfunction=\vec 0} \approx \nabla_{\vec\hfunction}\hat{\functional{l}}[\variational_{\vec\hfunction}]\big|_{\vhfunction=\vec 0} &= \sum_{r=1}^R \kappa(\cdot,\w_r) \sum_{i,j=1}^n \frac{\partial \hat{\functional{l}}}{\partial \hat{K}_{ij}}\nabla_{\w_r} \hat{K}_{ij}\\
        &= \sum_{r=1}^R\kappa(\cdot,\w_r) \nabla_{\w_r} \hat{\functional{l}}[\variational]\,,
    \end{split}
\end{equation}
which follows by another application of the chain rule.

Now, for the entropy-regularisation term, we have:
\begin{equation}
    \begin{split}
        \entropy[\variational_\vhfunction] &= \expectation_{\variational_\vhfunction(\w)}[-\log \variational_\vhfunction(\w)]\\
        &= \expectation_{\variational(\w)}[-\log \variational_\vhfunction(\w + \vhfunction(\w))]\,.
    \end{split}
\end{equation}
By the change-of-variable formula for $\variational_{\vhfunction} = \tf_\vhfunction\#\variational$, we also have:
\begin{equation}
    \log \variational_{\vhfunction}(\w + \vhfunction(\w)) = \log \variational(\w) - \log \lvert \det(\eye + \nabla_{\w} \vhfunction(\w)) \rvert\,,
\end{equation}
where $\nabla_{\w} \vhfunction(\w)$ denotes the Jacobian matrix of $\vhfunction$. Applying the functional gradient formula (\autoref{eq:fgrad}) then yields:
\begin{equation}
    \begin{split}
        \entropy[\variational_{\vhfunction+\stepsize\othervf}] &= \expectation_{\variational_{\vhfunction + \stepsize\othervf}(\w)}[- \log\variational_{\vhfunction + \stepsize\othervf}(\w)]\\
        &= \expectation_{\variational_\vhfunction(\w)}[- \log\variational_{\vhfunction + \stepsize\othervf}(\w + \stepsize\othervf(\w))] \\
        &= \expectation_{\variational_\vhfunction(\w)}[- \log\variational_{\vhfunction}(\w) + \log|\det(\eye + \stepsize\nabla_{\w}\othervf(\w))|]\\
        &= \entropy[\variational_\vhfunction] + \expectation_{\variational_\vhfunction(\w)}[\log|\det(\eye + \stepsize\nabla_{\w}\othervf(\w))|]\\
        &= \entropy[\variational_\vhfunction] + \stepsize \expectation_{\variational_\vhfunction(\w)}\left[\tr\left(\frac{\partial\log|\det\mat{M}|}{\partial\mat{M}}\Bigg|_{\mat{M}=\eye} \nabla_{\w}\othervf(\w) \right)\right] + \bigO{\stepsize^2}\\
        &= \entropy[\variational_\vhfunction] + \stepsize \expectation_{\variational_\vhfunction(\w)}[\tr\left(\nabla_{\w}\othervf(\w) \right)] + \bigO{\stepsize^2}\\
        &= \entropy[\variational_\vhfunction] + \stepsize \expectation_{\variational_\vhfunction(\w)}[\nabla_{\w}\cdot\othervf(\w)] + \bigO{\stepsize^2}\\
        &= \entropy[\variational_\vhfunction] + \stepsize \inner{\expectation_{\variational_\vhfunction(\w)}[\nabla_{\w}\kappa(\cdot,\w)], \othervf}_\kappa + \bigO{\stepsize^2}\,,
    \end{split}
\end{equation}
where we applied a Taylor expansion to the log-determinant term around the identity matrix $\eye$ and then the reproducing property of the kernel to extract the kernel gradient out of the divergence of $\othervf$. As a result, we have:
\begin{equation}
    \nabla_\vhfunction\entropy[\variational_\vhfunction]\big|_{\vhfunction=\vec 0} = \expectation_{\variational(\w)}[\nabla_{\w}\kappa(\cdot,\w)] \approx \frac{1}{R}\sum_{r=1}^R \nabla_{\w_r}\kappa(\cdot,\w_r)\,.
\end{equation}

Combining the results above finally yields:
\begin{equation}
    \begin{split}
        \nabla_{\vhfunction}\functional{f}[\vhfunction] \big|_{\vhfunction=\vec 0} &= \expectation_{\variational(\w)}\left[ \kappa(\cdot, \w) \sum_{i,j=1}^n \frac{\partial \functional{l}}{\partial K_{ij}} \nabla_{\w} \rho_{ij} (\w) - \regf\nabla_{\w}\kappa(\cdot,\w)\right]\\
        &\approx \sum_{r=1}^R \left( \kappa(\cdot,\w_r) \nabla_{\w_r} \hat{\functional{l}}[\variational] -  \frac{\regf}{R}\nabla_{\w_r}\kappa(\cdot,\w_r) \right)\\
        &=- \sum_{r=1}^R \left( \kappa(\cdot,\w_r) \nabla_{\w_r} \log p(\y|\w_1, \dots, \w_R) +  \frac{\regf}{R}\nabla_{\w_r}\kappa(\cdot,\w_r) \right)\,.
    \end{split}
\end{equation}
Note that the factor $\frac{1}{R}$ can further be absorbed into the regularisation factor $\regf$, which is a hyper-parameter of the algorithm.

\paragraph{Frequency update steps.} Given the functional gradient formulation above, the resulting update steps are given by:
\begin{equation}
    \begin{split}
        \w_i^{(t+1)} &= \w_i^{(t)} - \stepsize \nabla_\vhfunction \functional{f}[\vhfunction](\w_i^{(t)})\big|_{\vhfunction=\vec 0}\\
        &\approx \w_i^{(t)} + \stepsize \sum_{r=1}^R \left( \kappa(\w_i^{(t)},\w_r) \nabla_{\w_r} \log p(\y|\w_1, \dots, \w_R) +  \frac{\regf}{R}\nabla_{\w_r}\kappa(\w_i^{(t)},\w_r) \right)\,,
    \end{split}
\end{equation}
for $i \in \{1, \dots, R\}$.

% \subsection{Non-stationary covariance functions}
% The formulation above can be generalised to the case of non-stationary covariance functions. In fact, by Yaglom's theorem %TODO: Add REF
% any positive semi-definite kernel can be described in terms of its inverse Fourier transform as:
% \begin{equation}
%     k(\x, \x') = \int_{\Omega\times\Omega} e^{i(\w\cdot \x - \w'\cdot\x')}\diff P(\w, \w')\,,
% \end{equation}
% where $P$ is a measure associated with a positive semi-definite density over the product space $\Omega\times\Omega$. Note that stationary kernels can be recovered from the same formulation whenever is $P$ concentrates its mass along the diagonal $\w=\w'$.

% Given a general probability density function $\variational$, one way to build a positive semi-definite p.d.f. $p_\variational$ out of it is by letting: %TODO: Cite non-stationary GPs for spatial modeling paper
% \begin{equation}
%     p_\variational(\w, \w') := \frac{1}{4}(\variational(\w, \w') + \variational(\w', \w) + \variational(\w, \w) + \variational(\w',\w'))\,.
% \end{equation}
% To write a general formulation for the kernel, we further introduce the following notations:
% \begin{align}
%     \overline{\w} &:= \begin{bmatrix}
%         \w\\
%         \w'
%     \end{bmatrix}\\
%     \mat{\Pi} &:= \begin{bmatrix}
%         \eye &\vec 0
%     \end{bmatrix}\\
%     \mat{\Pi'} &:= \begin{bmatrix}
%         \vec 0 &\eye
%     \end{bmatrix}
% \end{align}

% The kernel formulation can then be rewritten as:
% \begin{equation}
%     k(\x, \x') = \frac{1}{4} \sum_{\mat M, \mat M' \in \{\mat\Pi, \mat{\Pi'}\}} \int_{\Omega^2} e^{i(\x \cdot \mat M\overline\w - \x' \cdot \mat M'\overline\w)}\variational(\overline\w)\diff\overline{\w},,
% \end{equation}
% where with a slight abuse of notation we redefined $\variational(\overline{\w}) := \variational(\w, \w')$.
% Now we can apply the results of the previous section to derive functional gradients for SVGD-like transforms to the unconstrained joint measure $\variational$ over the product space.

% Firstly, we derive the kernel functional gradient for a transform:
% \begin{equation}
%     \tf_\vhfunction(\overline{\w}) := \overline{\w} + \vhfunction(\overline{\w})\,,
% \end{equation}
% which will be applied to $\variational(\overline{\w})$, yielding 
% $\variational_\vhfunction := \tf_\vhfunction \# \variational$. Letting $\rho_{ij}(\w, \w') := e^{\w\cdot\x_i - \w'\cdot\x_j}$, for $i, j \in \{1,\dots, \nObs\}$, we have:
% \begin{equation}
%     \begin{split}
%         K_{ij}[\vhfunction] &:= \frac{1}{4} \sum_{\mat M, \mat M' \in \{\mat\Pi, \mat{\Pi'}\}} \expectation_{\variational_\vhfunction(\overline{\w})}[\rho_{ij}(\mat M \overline{\w}, \mat M' \overline{\w})]\\
%         &=\frac{1}{4} \sum_{\mat M, \mat M' \in \{\mat\Pi, \mat{\Pi'}\}} \expectation_{\variational(\overline{\w})}[\rho_{ij}(\mat M (\overline{\w} + \vhfunction(\overline{\w})), \mat M'(\overline{\w} + \vhfunction(\overline{\w}))]
%     \end{split}
% \end{equation}

\section{Distributional gradient}
\label{app:distributional-svgd}
In this section, we present a proof for \autoref{thr:kl-grad} and a discussion on how we go from the theorem's result to operations over matrices of spectral frequencies.

\subsection{Auxiliary notation}
We make use of notation shortcuts to express our main result in a compact form. We consider an RKHS $\Hspace_{\mat\Sigma}$ of vector-valued functions associated with a positive-definite matrix-valued kernel $\mat\Sigma: \Omega\times\Omega\to\set{B}(\Omega)$, where $\set{B}(\Omega)$ denotes the space of bounded linear operators mapping $\Omega$ to $\Omega$. When $\Omega\subseteq\R^\xdim$, this simplifies to $\set{B}(\Omega) = \R^{\xdim\times\xdim}$, i.e., the space of $\xdim$-by-$\xdim$ real-valued matrices. Furthermore, we will focus on the case of $\mat\Sigma(\w,\w') := \kappa(\w,\w')\eye$, where $\kappa:\Omega\times\Omega\to\R$ is a positive-definite scalar-valued kernel, for all $\w, \w'\in\Omega$. In this case, it is not hard to show that $\Hspace_{\mat\Sigma} = \Hspace_\kappa^\xdim := \prod_{i=1}^\xdim \Hspace_\kappa$, i.e., the Cartesian product of $\xdim$ copies of the scalar-valued RKHS $\Hspace_\kappa$.

\paragraph{Point evaluation and inner product.} The reproducing property of a kernel $\mat\Sigma$ associated with a vector-valued RKHS states that \citep{alvarez2012kernels}:
\begin{align}
    \forall\vsfunction\in\Hspace_{\mat\Sigma}, \quad \inner{\vsfunction, \mat\Sigma(\cdot, \w)\anyvector}_{\mat\Sigma} = \inner{\vsfunction(\w), \anyvector}_2 = \vsfunction(\w)^\transpose \anyvector\,, \quad \forall \anyvector\in \R^\xdim\,,
\end{align}
where $\inner{\cdot,\cdot}_2$ denotes the inner product associated with the 2-norm, i.e., the dot product in this case. If $\mat\Sigma(\w,\w') := \kappa(\w,\w')\eye$, $\forall\w,\w'\in\Omega$, we then have that  $\vsfunction(\cdot)^\transpose \anyvector:\Omega\to\R$ is an element of $\Hspace_\kappa$, so that $\vsfunction(\w)^\transpose \anyvector=\inner{\vsfunction(\cdot)^\transpose \anyvector, \kappa(\cdot, \w)}_\kappa$. Therefore, we will denote inner products in this vector-valued RKHS with the same notation subscript as for the scalar-valued case:
\begin{equation}
    \begin{split}
         \inner{\vsfunction, \kappa(\cdot, \w)\anyvector}_\kappa := \inner{\vsfunction, \kappa(\cdot, \w)\anyvector}_{\mat\Sigma} = \inner{\vsfunction, \mat\Sigma(\cdot, \w)\anyvector}_{\mat\Sigma}\,.
    \end{split}
\end{equation}

\paragraph{Matrix-valued evaluations.} Evaluating a function $\vsfunction\in\Hspace_\kappa^\xdim$ on a matrix of inputs $\mat\Omega_n := [\w_1, \dots, \w_n]^\transpose \in \R^{n\times\xdim}$ yields $\vsfunction(\mat\Omega_n) := [\vsfunction(\w_1),\dots,\vsfunction(\w_n)]^\transpose \in \R^{n\times\xdim}$. By the reproducing property of the kernel, we also have that:
\begin{equation}
    \begin{split}
        \forall \mat M := [\vec m_1, \dots, \vec m_n]^\transpose \in \R^{n\times\xdim}, \quad \inner{\vsfunction(\mat\Omega_n), \mat M}_2 &= \tr(\vsfunction(\mat\Omega_n)^\transpose \mat M)\\
        &= \tr\left(\sum_{i=1}^n \vsfunction(\w_i) \vec{m}_i^\transpose \right)\\
        &= \sum_{i=1}^n \vsfunction(\w_i)^\transpose \vec{m}_i\\
        &= \sum_{i=1}^n \inner{\vsfunction, \kappa(\cdot, \w_i)\vec{m}_i}_\kappa\\
        &= \flexinner{\vsfunction, \sum_{i=1}^n \kappa(\cdot, \w_i)\vec{m}_i}_\kappa\,,
    \end{split}
\end{equation}
where $\inner{\cdot, \cdot}_2$ corresponds to the Frobenius inner product when applied to matrices.
We therefore denote $\kappa(\cdot, \mat\Omega_n)$ as the operator mapping matrices in $\R^{n\times\xdim}$ to functions in $\Hspace_\kappa^\xdim$ which is such that:
\begin{align}
    \kappa(\cdot, \mat\Omega_n)\mat M &= \sum_{i=1}^n \kappa(\cdot, \w_i)\vec{m}_i \in \Hspace_\kappa^\xdim\\
    \kappa(\mat\Omega_m', \mat\Omega_n)\mat M &= \begin{bmatrix}
        \kappa(\w_1', \w_1) &\dots &\kappa(\w_1', \w_n)\\
        \vdots &\ddots &\vdots\\
        \kappa(\w_m', \w_1) &\dots &\kappa(\w_m', \w_n)
    \end{bmatrix} \mat M
\end{align}
for any $\mat\Omega_m' := [\w_1', \dots, \w_m']^\transpose \in \R^{m\times\xdim}$.

\paragraph{Jacobians.} The reproducing property also allows us to express Jacobians of a function in terms of kernel gradients as:
\begin{equation}
    \forall \vsfunction \in \Hspace_\kappa^\xdim, \quad \nabla_\anyvector \vsfunction(\anyvector) = \inner{\vsfunction, \nabla_\anyvector \kappa(\cdot, \anyvector)}_\kappa\,, \quad \forall \anyvector\in\R^\xdim\,.
\end{equation}
For matrix-valued transformations $\mat\Omega_n \mapsto \vsfunction(\mat\Omega_n)$ \citep[Ch. 1]{Gupta1999}, we further have that:
\begin{equation}
    \nabla_{\mat\Omega_n} \vsfunction(\mat\Omega_n) := \begin{bmatrix}
        \nabla_{\w_1} \vsfunction(\w_1) &\dots &\nabla_{\w_1} \vsfunction(\w_n)\\
        \vdots &\ddots &\vdots\\
        \nabla_{\w_n} \vsfunction(\w_1) &\dots &\nabla_{\w_n} \vsfunction(\w_n)
    \end{bmatrix}
    = \begin{bmatrix}
        \nabla_{\w_1} \vsfunction(\w_1) &\mat 0 &\dots & \mat 0\\
        \mat 0 &\nabla_{\w_2} \vsfunction(\w_2) &\dots &\mat 0\\
        \vdots &\vdots &\ddots &\vdots\\
        \mat 0 &\mat 0 &\dots & \nabla_{\w_n} \vsfunction(\w_n)
    \end{bmatrix}\,,
\end{equation}
so that the following holds for the trace and the determinant of the Jacobian:
\begin{align}
    \tr(\nabla_{\mat\Omega_n} \vsfunction(\mat\Omega_n)) &= \sum_{i=1}^n \tr(\nabla_{\w_i} \vsfunction(\w_i))\\
    |\nabla_{\mat\Omega_n} \vsfunction(\mat\Omega_n)| &= \prod_{i=1}^n |\nabla_{\w_i} \vsfunction(\w_i)|\,,
\end{align}
where $|\cdot|$ denotes the absolute value of the determinant. Considering the reproducing property of the kernel, we have that:
\begin{equation}
    \begin{split}
        \tr(\nabla_{\mat\Omega_n} \vsfunction(\mat\Omega_n)) = \sum_{i=1}^n \tr(\nabla_{\w_i} \vsfunction(\w_i)) = \flexinner{ \vsfunction, \sum_{i=1}^n \nabla_{\w_i} \kappa(\cdot, \w_i)}_\kappa = \inner{\vsfunction, \nabla_{\mat\Omega_n}\kappa(\cdot, \mat\Omega_n)}_\kappa \,.
    \end{split}
\end{equation}

\subsection{Proof of main result}
\begin{proof}[Proof of \autoref{thr:kl-grad}]
    Let $\vgfunction, \vsfunction \in \Hspace_\kappa^\xdim$, where $\kappa:\R^d\times\R^d\to\R$ is a positive-definite kernel over $\R^d$. Define a transform $\tf_\vgfunction:\Fh\to\Fh$ as a mapping such that $\tf_\vgfunction(\vhfunction)(\w) = \vhfunction(\w) + \vgfunction(\vhfunction(\w))$, for all $\w\in \R^d$. Considering the definition of functional gradient (\autoref{def:func-grad}) and the KL divergence between the measures $\Qm_\vgfunction := \tf_\vgfunction \# \Qm$ and $\Pm_*$, we have:
    \begin{equation}
        \begin{split}
            \kl{\Qm_{\vgfunction + \epsilon\vsfunction}}{\Pm_*} &= \sup_{n, \mat\Omega_n} \kl{q_{\vgfunction+\epsilon\vsfunction}(\mhfunction_n)}{p_*(\mhfunction_n)}\\
            &=\kl{q_{\vgfunction+\epsilon\vsfunction}(\mhfunction^*)}{p_*(\mhfunction^*)}\\
            &= \expectation_{\vhfunction\sim \Qm_\vgfunction} \left[ \log q_{\vgfunction + \epsilon\vsfunction} (\mhfunction^* + \epsilon\vsfunction(\mhfunction^*)) - \log p_* (\mhfunction^* + \epsilon\vsfunction(\mhfunction^*))\right]\,,
        \end{split}
    \end{equation}
    where $\mhfunction^* := \vhfunction(\mat\Omega_{n^*}^*)$, as defined in the theorem statement, assuming the supremum is achieved at a finite $n^*$ and that $\mat\Omega_{n^*}^*$ exists.
    Now applying the change-of-variable formula and a Taylor expansion on the resulting log-determinant, for the first term in the supremum, taking any $n\in\N$ and $\mat\Omega_n \subset \Omega$, with $\mhfunction_n := \vhfunction(\mat\Omega_n)$, $\vhfunction \sim \Qm_\vgfunction$, we have:
    \begin{equation}
    \begin{split}
        \log q_{\vgfunction + \epsilon\vsfunction} (\mhfunction_n + \epsilon\vsfunction(\mhfunction_n)) &= \log q_{\vgfunction} (\mhfunction_n) - \log |\eye + \epsilon\nabla_{\mhfunction_n}\vsfunction(\mhfunction_n)|\\
        &=\log q_{\vgfunction} (\mhfunction_n) - \log |\eye| - \epsilon\flexinner{\nabla_{\mat M}\log|\mat M|\bigg|_{\mat M = \eye}, \nabla_{\mhfunction_n}\vsfunction(\mhfunction_n)}_2 + \bigO{\epsilon^2}\\
        &= \log q_{\vgfunction} (\mhfunction_n) -\epsilon\tr(\nabla_{\mhfunction_n}\vsfunction(\mhfunction_n)) + \bigO{\epsilon^2}\\
        &= \log q_{\vgfunction} (\mhfunction_n) -\epsilon\inner{\vsfunction,\nabla_{\mhfunction_n}\kappa(\cdot, \mhfunction_n)}_\kappa + \bigO{\epsilon^2}\,,
    \end{split}
    \end{equation}
    where $\inner{\cdot, \cdot}_2$ here denotes the Frobenius inner product between matrices, and we applied the reproducing property of $\kappa$ to derive the last term.

    For the second term in the supremum, also applying Taylor's theorem and the reproducing property yields:
    \begin{equation}
    \begin{split}
        \log p_* (\mhfunction_n + \epsilon\vsfunction(\mhfunction_n)) &= \log p_* (\mhfunction_n) + \epsilon\inner{\nabla_{\mhfunction_n}\log p(\mhfunction_n), \vsfunction(\mhfunction_n)}_2 + \bigO{\epsilon^2}\\
        &= \log p_* (\mhfunction_n) + \epsilon\inner{\kappa(\cdot, \mhfunction_n)\nabla_{\mhfunction_n}\log p(\mhfunction_n), \vsfunction}_\kappa + \bigO{\epsilon^2}\,.
    \end{split}
    \end{equation}
    Combining the two equations above into the KL divergence, and applying the definition of functional gradient leads us to:
    \begin{equation}
        \nabla_{\vgfunction}\kl{\Qm_{\vgfunction}}{\Pm_*} = - \expectation_{\vhfunction\sim \Qm_\vgfunction}[\kappa(\cdot, \mhfunction^*)\nabla_{\mhfunction^*}\log p(\mhfunction^*) + \nabla_{\mhfunction^*}\kappa(\cdot, \mhfunction^*)]\,,
        \label{eq:kl-g-grad}
    \end{equation}
    which yields the first result in \autoref{thr:kl-grad} by letting $\vgfunction \to 0$.
    
    When applied to transforms over a finite set $\mat\Omega_R = \{\w_i\}_{i=1}^R \subset \R^\xdim$, the KL divergence simplifies to:
    \begin{equation}
        \kl{\Qm}{\Pm_*} = \sup_{n \leq R, \mat\Omega_n \subset\mat\Omega_R} \kl{q(\vhfunction(\mat\Omega_n))}{p_*(\vhfunction(\mat\Omega_n))} = \kl{q(\vhfunction(\mat\Omega_R))}{p_*(\vhfunction(\mat\Omega_R))} \,.
    \end{equation}
    The second result in \autoref{thr:kl-grad} then arises by setting $\mhfunction^* := \vhfunction(\mat\Omega_R)$ in \autoref{eq:kl-g-grad} and letting $\vgfunction \to 0$.
\end{proof}

\subsection{From stochastic processes to distributions over matrices}
\label{sec:thr-discussion}
As a note, we here discuss how \autoref{thr:kl-grad} gives rise to our algorithmic setting, which operates over matrices of frequencies, representing empirical spectral measures. For any fixed $\mat\Omega_R \in \R^{R\times\xdim}$ and \iid\ samples $\{\vhfunction_i\}_{i=1}^M \overset{\iid}{\sim}\Qm$, note that $\mat\Omega_R^{(i)} := \vhfunction_i(\mat\Omega_R)$ corresponds to the realisation of a random matrix. The corresponding matrix distribution is given by $q_R := \operator{E}_{\mat\Omega_R}\#\Qm$, where $\operator{E}_{\mat\Omega_R}$ is the matrix-valued evaluation operator defined as:
\begin{equation}
    \begin{split}
        \operator{E}_{\mat\Omega_R}: \Hspace_\kappa^\xdim &\to \R^{R\times\xdim}\\
        \vhfunction &\mapsto \vhfunction(\mat\Omega_R)\,,
    \end{split}
\end{equation}
recalling that $\vhfunction(\mat\Omega_R) := [\vhfunction(\w_1), \dots, \vhfunction(\w_R)]^\transpose \in \R^{R\times\xdim}$. Therefore, we can rewrite an expectation over $\Qm$ as an expectation over $q_R$ when it takes the following form:
\begin{equation}
    \expectation_{\vhfunction\sim\Qm}[f(\vhfunction(\mat\Omega_R))] = \expectation_{\mat\Omega \sim q_R}[f(\mat\Omega)]\,,
\end{equation}
for any integrable $f:\R^{R\times\xdim}\to\R$. The initial distribution is arbitrary, and subsequent SVGD steps operate directly on the samples. Hence, we can replace $\mhfunction$ in \autoref{thr:kl-grad} with the matrix particles in the empirical approximations used by SVGD.

\subsection{SVGD update}\label{sec:msrfr_update}
\autoref{thr:kl-grad} results in the following SVGD $M$-particle update rule:
\begin{equation}
    \begin{split}
        \mat\Omega_m^{t+1} &= \mat\Omega_m^t + \frac{\epsilon}{M}\sum_{j=1}^M \left( \kappa(\mat\Omega_m^t, \mat\Omega_j^t)\nabla_{\mat\Omega_j^t}\log p(\mat\Omega_j^t) + \nabla_{\mat\Omega_j^t}\kappa(\mat\Omega_m^t, \mat\Omega_j^t) \right)\,,
    \end{split}
\end{equation}
where, according to the notation for \autoref{thr:kl-grad}, for any $\mat\Omega, \mat\Omega' \in \R^{R\times \xdim}$, we have the kernel matrix as:
\begin{align}\label{eq:matrix_kernel}
    \kappa(\mat\Omega, \mat\Omega') &= \begin{bmatrix}
        \kappa(\w_1, \w_1') &\dots &\kappa(\w_1, \w_R')\\
        \vdots &\ddots &\vdots\\
        \kappa(\w_R, \w_1') &\dots &\kappa(\w_R, \w_R')
    \end{bmatrix}
    \in \R^{R \times R},
\end{align}
and the kernel matrix-valued gradient is given by:
\begin{align}\label{eq:matrix_kernel_grad}
    \nabla_{\mat\Omega'}\kappa(\mat\Omega, \mat\Omega') &= \begin{bmatrix}
        \sum_{j=1}^R\nabla_{\w_j'}\kappa(\w_1, \w_j')^\transpose\\
        \vdots\\
        \sum_{j=1}^R\nabla_{\w_j'}\kappa(\w_R, \w_j')^\transpose
    \end{bmatrix}
    =
    \sum_{j=1}^R \nabla_{\w_j'}
    \begin{pmatrix}
        \kappa(\w_1, \w_j')\\
        \vdots\\
        \kappa(\w_R, \w_j')
    \end{pmatrix}
    = \sum_{j=1}^R \nabla_{\w_j'} {\kappa}(\mat\Omega, \w_j') \in \R^{R \times \xdim},
\end{align}
which is the sum of Jacobian matrices of the vector-valued map ${\kappa}(\mat\Omega, \cdot):\w\mapsto\kappa(\mat\Omega,\w) := [\kappa(\w_1, \w), \dots, \kappa(\w_R,\w)]^\transpose \in \R^R$.

% \newpage
\section{Experimental Details}
All experiments were performed on a single desktop using an AMD Ryzen 7 5800X CPU, 32\,GB of RAM, and an NVIDIA 3080 Ti GPU. Dataset sizes are as follows: 
\begin{table}[h]
\centering
\begin{tabular}{lcccc}
\toprule
Dataset  & $N$     & $d$ & $N_{\mathrm{train}}$ & $N_{\mathrm{test}}$ \\
\midrule
airfoil  & 1503    & 5   & 1353        & 150        \\
concrete & 1030    & 8   & 824         & 206        \\
energy   & 768     & 16  & 615         & 153        \\
wine     & 1599    & 11  & 1440        & 159        \\
AUSWAVE  & 1787594 & 8   & 5000        & 1000       \\
\bottomrule
\end{tabular}
\end{table}

\subsection{Hyperparameter Training}
We perform the following number of hyperparameter runs for all experiments, increasing the number of runs as the hyperparameter count grows:
\begin{table}[h]
\centering
\begin{tabular}{lcc}
\toprule
Model           & Runs & \# of Hyperparameters \\
\midrule
SVGP            & 30                  & 1                              \\
SSGP-RBF        & 30                  & 2                              \\
SSGP/SSGP-$R^*$ & 30                  & 2                              \\
SSGP-SVGD       & 50                  & 5                              \\
M-SRFR          & 75                  & 6                               \\
\bottomrule
\end{tabular}
\end{table}

\section{Visualizations of Mixture Kernels and Predictions}\label{sec:viz}
We provide a visualization of the learned M-SRFR kernels, as well as a selection of the predictive mixture and combined distributions, for the UCI datasets.

\begin{figure*}
  \centering
  \includegraphics[width=0.9\linewidth]{figures/airfoil_kernel.pdf}
  \caption{\textit{airfoil} Learned Kernels by Dimension.}
\end{figure*}

\begin{figure*}
  \centering
  \includegraphics[width=0.9\linewidth]{figures/airfoil_preds.pdf}
  \caption{Selection of Single SSGP and M-SRFR Predictive Distributions for \textit{airfoil} Test Points.}
\end{figure*}

\begin{figure*}
  \centering
  \includegraphics[width=0.9\linewidth]{figures/concrete_kernel.pdf}
  \caption{\textit{concrete} Learned Kernels by Dimension.}
\end{figure*}

\begin{figure*}
  \centering
  \includegraphics[width=0.9\linewidth]{figures/concrete_preds.pdf}
  \caption{Selection of Single SSGP and M-SRFR Predictive Distributions for \textit{concrete} Test Points.}
\end{figure*}

\begin{figure*}
  \centering
  \includegraphics[width=0.9\linewidth]{figures/energy_kernel.pdf}
  \caption{\textit{energy} Learned Kernels by Dimension.}
\end{figure*}

\begin{figure*}
  \centering
  \includegraphics[width=0.9\linewidth]{figures/energy_preds.pdf}
  \caption{Selection of Single SSGP and M-SRFR Predictive Distributions for \textit{energy} Test Points.}
\end{figure*}

\begin{figure*}
  \centering
  \includegraphics[width=0.9\linewidth]{figures/wine_kernel.pdf}
  \caption{\textit{wine} Learned Kernels by Dimension.}
\end{figure*}

\begin{figure*}
  \centering
  \includegraphics[width=0.9\linewidth]{figures/wine_preds.pdf}
  \caption{Selection of Single SSGP and M-SRFR Predictive Distributions for \textit{wine} Test Points.}
\end{figure*}

% \subsection{UCI Dataset Information}
% - airfoil: $N = 1503, d = 6$
% - concrete: $N = 1030, d = 8$
% - energy: $N = 768, d = 16$
% - wine: $N = 1599, d = 11$

% \section{Additional Experimental Results}

%TODO: Add details on experiment settings

% Let $\tf_H: \Pspace \to \Pspace$ be such that $\tf_H(\Pi) = \Pi + H(\Pi)$, where $H: \Pspace\to\Pspace$ is a member of an RKHS $\Hspace$ with an operator valued kernel $K(\Pi,\Pi') = \kappa(\Pi,\Pi') \identity$, where $\identity: \Pspace\to\Pspace$ here stands for the identity operator, i.e., $\identity(\Pi) = \Pi$. Let $F(\Pi_H)$ be a (possibly non-linear) functional of the probability measure $\Pi_H := \tf_H\#\Pi$. We aim to derive $\nabla_H F(\Pi_H)$ to obtain an SVGD update rule. % TODO: We need to equip the space P with some proper structure, maybe the one imposed by using the MMD (of a universal kernel) as a metric, since that's the one used by the distributional kernel.

% \section{Experimental Details}
% CUT
% Owing to Bochner's Theorem (\ref{boch}), in the sparse spectrum regression setting the kernel choice $k_{p(\w)}$ is uniquely defined by distribution $p(\w)$ from which we draw Monte Carlo samples $\{ \w_r \}_{r=1}^R$ for use in the trigonometric basis functions in Equation \ref{eq:rffphi}. Consequently,  We can now 


% In practice, M-SRFR defines a flexible scheme for posterior inference over kernel spectral measures that encourages diversity and imposes minimal assumptions on distributional families. The spectral measure prior $p(\Omega)$, as a result of the SVGD formulation, and unlike mean-field variational inference, offers broad flexibility for incorporation of domain-specific kernel priors regardless of conjugacy. 

% As a result, and motivated by the connections drawn between SVGD and kernel learning in the previous section, we propose a Bayesian treatment of the functional kernel learning problem. This approach reduces the chance of overfitting while also leveraging the unique advantages SVGD in distributional flexibility SVGD bring to Bayesian inference.

% We now present our novel kernel learning methodology for sparse spectrum Gaussian process regression, which we title Stein random feature regression (SRFR). Our method leverages the natural compatibility between the RFF methodology and the particle-based sampling approach of SVGD. We focus on the formulation of our methodology in the regression setting in this section , but refer the reader to section (REF) for a treatment of kernel approximation using known spectral densities this approach.
% The challenge then turns to the manner in which we choose to approximate $p \approx q$ such that $q$ remains a flexible and accurate representation, while also ensuring that we can tractably minimize the objective \eqref{eq:functional_obj} w.r.t to this approximation. We posit that particle-based variational inference presented in Section \ref{sec:stein} is a dual solution to both of these challenges. 

% The support this conjecture, we first note  

% \cite{mallick_deep_2021} demonstrate that in a GP where the kernel covariance matrix $\K$ has entries of the form of \eqref{eq:rffkernelapprox}, we can approximate $p^*$ as a non-parametric particle approximation $q^*$. In this setting, minimizing the gradient of the functional NLL \eqref{eq:functional_obj} can be approximated as:
% \begin{equation}\label{eq:mallick_grad}
%     \begin{split}
%         \nabla_p \mathcal{L} \left[ p \right] &\approx \nabla_q \mathcal{L} \left[ q \right] \\
%         &\approx \sum_{r=1}^R \kappa(\w_r, \cdot) \nabla_{\w_r} \mathcal{L}(\w) ,
%     \end{split}
% \end{equation}


% This reformulation offers a subtle but important difference between the traditional GP hyperparameter training objective or those represented in other RFF kernel learning works (SMKs CITE), in that we are not imposing any constraint on the form of $p$, thus allowing for greater flexibility for $p$ to assume to the form of any stationary kernel in which kernel Hilbert space $\mathcal{H}_k$ contains the function $f$ under study. 

% Note here about now substituting SSGP loss for full GP loss.
% Considering that the low-rank SSGP kernel matrix projection EQN expands to a full-rank kernel matrix with entries defined as the Monte Carlo approximation to Equation \ref{eq:rffkernelapprox}, we produce the following proposition:
% \begin{proposition}\label{prop:particle_ssgp_opt}
%     Gradient-based optimization on frequencies $\w$ in the SSGP formulation REF, using the gradients of Equation \ref{eq:mallick_grad}, produces a low-rank GP with kernel $k_{q^*}$ that approximates a full-rank GP with kernel $k_{p^*}$, where $p^* = \argmin_{p \in \mathcal{P}} \mathcal{L} \left[ p \right]$
% \end{proposition}
% Equation \ref{eq:mallick_grad} exchanges the traditional GP kernel hyperparameter inference scheme of Equation \ref{eq:gpobjective} for one which in which we directly optimize the kernel spectral measure $p_k$. 
% The calculation of the gradient of \eqref{eq:srfr_objective} is trivial \citep{liu_understanding_2019}, as the gradients of each term are linearly separable. This leads us to propose a modification to Proposition \ref{prop:particle_ssgp_opt} which serves as an intermediate definition for our methodology.
% \begin{definition}[Stein random feature regression]\label{def:srfr}
%     Update step defined by stein, approximates a GP with kernel $k_{q^*}$, etc. 
% \end{definition}
% and can do so using theoretically sound and tractable approximations that are simple to implement in practice. This yields greater flexibility than a traditional hyperparameter optimization scheme by imposing fewer restrictions on the form of optimal kernel $k^*$.
% However, previous approaches that have sought to directly optimize SSGP and RFF frequencies $\w$ in a manner such as that defined in Proposition \ref{prop:particle_ssgp_opt} have been very prone to overfitting. We propose it is necessary to include a regularization term in the particle-based objective $\argmin_{\w} \mathcal{L}(\w)$ to account for these shortcomings, and we propose that the kernel repulsion term of SVGD (REF) is an ideal candidate given the particle-based scheme of the method thus far.
% The resulting theoretical updates are simple to incorporate with the existing framework.  Considering this, we propose a modified objective \eqref{eq:functional_obj} with the addition of an entropy regularization functional:
% It's trivial to extend this setup to many kernel learning schemes, as kernel distribution parameters can be jointly optimized with other hyperparameters, or  kernels optimized simultaneously. Ie. nonstationarity and deep kernels.

\end{document}
