\documentclass[accepted]{uai2022} % for initial submission
% \documentclass[accepted]{uai2022} % after acceptance, for a revised
                                    % version; also before submission to
                                    % see how the non-anonymous paper
                                    % would look like
%% There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2022} % ptmx math instead of Computer
                                         % Modern (has noticable issues)
% \documentclass[mathfont=newtx]{uai2022} % newtx fonts (improves upon
                                          % ptmx; less tested, no support)
% NOTE: Only keep *one* line above as appropriate, as it will be replaced
%       automatically for papers to be published. Do not make any other
%       change above this note for an accepted version.

% \usepackage{xr}

% % In your preamble
% \makeatletter
% \newcommand*{\addFileDependency}[1]{% argument=file name and extension
%   \typeout{(#1)}
%   \@addtofilelist{#1}
%   \IfFileExists{#1}{}{\typeout{No file #1.}}
% }
% \makeatother

% \newcommand*{\myexternaldocument}[1]{%
%     \externaldocument{#1}%
%     \addFileDependency{#1.tex}%
%     \addFileDependency{#1.aux}%
% }

% \myexternaldocument{falk_456/latex/falk_456}

%% Choose your variant of English; be consistent
\usepackage[american]{babel}
% \usepackage[british]{babel}
\usepackage{hyperref}       % hyperlinks

%\usepackage[sort&compress,numbers]{natbib}
\usepackage[sort&compress,numbers]{natbib}

%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} % amsmath with fixes and additions
\usepackage{siunitx} % for proper typesetting of numbers and units
\sisetup{separate-uncertainty}
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{tikz} % nice language for creating drawings and diagrams
\usepackage{makecell}


%% Self-defined macros
% \swap[<sep>]{<a>}{<b>} typesets <b><sep><a>; <sep> defaults to "-".
\newcommand{\swap}[3][-]{#3#1#2} % just an example

% Inline annotation macros: each one typesets its argument in a fixed color.
% NOTE(review): the names look like co-author initials -- confirm.
\newcommand{\MP}[1]{{\color{red} #1}}
\newcommand{\JF}[1]{{\color{green} #1}}
\newcommand{\CC}[1]{{\color{blue} #1}}


%%%%
\usepackage[utf8]{inputenc} % allow utf-8 input
\usepackage[T1]{fontenc}    % use 8-bit T1 fonts

\setcitestyle{authoryear,square}
\usepackage{url}            % simple URL typesetting
\usepackage{booktabs}       % professional-quality tables
\usepackage{amsfonts}       % blackboard math symbols
\usepackage{nicefrac}       % compact symbols for 1/2, etc.
\usepackage{microtype}      % microtypography
\usepackage{xcolor}         % colors
\usepackage{graphicx}
\usepackage{graphbox}
\usepackage{subfig}

\usepackage{amsmath}
\usepackage{amssymb}
\usepackage{mathrsfs}
\usepackage{bm}
\usepackage{algorithm}
\usepackage{algorithmic}
\usepackage{amsthm}
%\usepackage{subcaption}
\usepackage[toc,page]{appendix}
\usepackage{enumitem}
\captionsetup[subfigure]{labelformat=empty}

% Write differential operators and equations
\usepackage[ISO]{diffcoeff}

% Additional inline annotation macros: each one typesets its argument in a
% fixed color.
\newcommand{\rf}[1]{{\color{blue} #1}}
\newcommand{\ros}[1]{{\color{red} #1}}
\newcommand{\rosanna}[1]{{\color{green} #1}}
\newcommand{\ar}[1]{{\color{orange} #1}}
\newcommand{\mass}[1]{{\color{brown} #1}}

% NOTE(review): despite its name, \red typesets its argument in blue;
% presumably the highlight color was changed on purpose -- confirm.
\newcommand{\red}[1]{\textcolor{blue}{#1}}
% \hypersetup{
%      colorlinks = true,
%      linkcolor = blue,
%      anchorcolor = blue,
%      citecolor = blue,
%      filecolor = blue,
%      urlcolor = blue
%      }

%%%%


\usepackage{thmtools}

% Generic symbols: indicator function and pushforward operator.
\newcommand{\indic}{\chi}
\newcommand{\pushforward}{\#}

% Spaces. Macros wrapped in \ensuremath can be used in text and math mode;
% the ones without it (\hypothesisspace, \algorithmspace) are math-mode only.
\newcommand{\R}{\ensuremath{\mathbb{R}}}
\newcommand{\inputspace}{\ensuremath{\mathcal{X}}}
\newcommand{\outputspace}{\ensuremath{\mathcal{Y}}}
\newcommand{\dataspace}{\ensuremath{\inputspace\times\outputspace}}
% \newcommand{\dataspace}{\ensuremath{\mathcal{Z}}}
\newcommand{\hypothesisspace}{\mathcal{H}}
\newcommand{\algorithmspace}{\Theta}
\newcommand{\latentspace}{\ensuremath{\mathcal{Z}}}
\newcommand{\Lip}{\ensuremath{\mathrm{Lip}}}

% "Distributed as" relation, calligraphic shorthand and domain operator.
\newcommand{\distas}{\sim}
\newcommand{\mc}[1]{\mathcal{#1}}
\newcommand{\domain}{\ensuremath{\mathrm{dom}}}

% Expectation symbol (math mode only).
\newcommand{\E}{\mathbb{E}}

%%% Meta-learning
% Distributions
% \metadistribution is the distribution over tasks; \taskdistribution is the
% distribution of a single task.
\newcommand{\metadistribution}{\rho}
\newcommand{\taskdistribution}{\mu}

% Datasets
% A task dataset and its train / validation parts.
\newcommand{\taskdataset}{D}
\newcommand{\tasktrain}{D^{\operatorname{tr}}}
\newcommand{\taskval}{D^{\operatorname{val}}}

% Sizes of the train, validation and test sets.
\newcommand{\ntr}{n_{\rm tr}}
\newcommand{\nva}{n_{\rm val}}
\newcommand{\nts}{n_{\rm ts}}

% Meta-datasets
% Meta-train, meta-validation and meta-test sets.
\newcommand{\metatrain}{M}
\newcommand{\metaval}{M_{\operatorname{val}}}
\newcommand{\metatest}{M_{\operatorname{te}}}

% Algorithms
\newcommand{\inneralgorithm}{A}
\newcommand{\metaalgorithm}{\tilde{A}}

% Losses
% \risk, \innerloss and \metarisk are declared as math operators, so they
% receive operator spacing; the remaining symbols are ordinary macros.
\DeclareMathOperator{\risk}{\mathcal{R}}
\newcommand{\emprisk}{\hat{\mathcal{R}}}
\DeclareMathOperator{\innerloss}{\ell}
\newcommand{\metaloss}{L}
\newcommand{\randmetaloss}{\tilde{L}}
\DeclareMathOperator{\metarisk}{\mathcal{E}}
\newcommand{\empmetarisk}{\hat{\mathcal{E}}}

% Absolute value: \abs{x}, \abs*{x} (auto-sized) or \abs[\big]{x}.
\DeclarePairedDelimiter{\abs}{\lvert}{\rvert}
\newcommand{\diam}{\ensuremath{\mathrm{diam}}}
\newcommand{\Tr}{\mathrm{Tr}}

% Matrices are set in upright bold. \mathbf replaces the obsolete {\bf ...}
% font switch; \ensuremath keeps the macro usable outside math mode as well.
\newcommand*{\matr}[1]{\ensuremath{\mathbf{#1}}}
%% Conjugation and transposition
% 2nd answer
% https://tex.stackexchange.com/questions/30619/what-is-the-best-symbol-for-vector-matrix-transpose
% the starred command disallows arguments
% with \par or newlines
% https://tex.stackexchange.com/questions/1050/whats-the-difference-between-newcommand-and-newcommand
%\newcommand*{\tran}{^{\mkern-1.5mu\mathsf{T}}}
\newcommand*{\tran}{^\top}
\newcommand{\idmat}{\matr{I}}
\newcommand{\diag}{\mathrm{diag}}

%% Kernels
% \kernel typesets its argument unchanged, \rkhs sets it in calligraphic and
% \kernelmatr sets it as a matrix via \matr.
\newcommand{\kernel}[1]{#1}
\newcommand{\rkhs}[1]{\mathcal{#1}}
\newcommand{\kernelmatr}[1]{\matr{#1}}

%% Linear Algebra
% Inner product \scal{a}{b} -> <a, b> and norm \norm{v} -> ||v||; both accept
% the starred (auto-sized) and [\big]-style size variants from mathtools.
\DeclarePairedDelimiterX{\scal}[2]{\langle}{\rangle}{#1, #2}
\DeclarePairedDelimiter{\norm}{\lVert}{\rVert}

% Simple set building (from mathtools manual, page 28)
% Make sure \given command exists
% Usage: \Set{x \given x > 0}; the starred form \Set*{...} auto-sizes the
% braces and the separating bar.
% \given is defined empty here and only gets its meaning inside \Set.
\providecommand\given{}
% Create symbol
\newcommand\SetSymbol[1][]{%
	\nonscript\:#1\vert
	\allowbreak
	\nonscript\:
	\mathopen{}}
% Create command
% Inside \Set, \given becomes the separating bar scaled with the delimiters.
\DeclarePairedDelimiterX{\Set}[1]\{\}{%
  \renewcommand\given{\SetSymbol[\delimsize]}#1}

% arg min operator; the starred declaration places limits below the operator
% in display style. The operator name is already typeset in math mode, so no
% \ensuremath is needed; \, gives the conventional thin space.
\DeclareMathOperator*{\argmin}{arg\,min}

%%% Theorem environments
% Add numberwithin=section to the declaration in order to have the section number as
% prefix to the theorem number.
% Environments declared with sibling=theorem share the theorem counter; the
% others (remark, definition, axiom, assumption, example) are numbered
% independently. refname gives the name used when referencing the environment.
\declaretheorem[name=Theorem,refname=Thm.]{theorem}
\declaretheorem[name=Lemma,sibling=theorem]{lemma}
\declaretheorem[name=Fact,sibling=theorem]{fact}
\declaretheorem[name=Proposition,refname=Prop.,sibling=theorem]{proposition}
\declaretheorem[name=Remark]{remark}
\declaretheorem[name=Corollary,refname=Cor.,sibling=theorem]{corollary}
\declaretheorem[name=Definition,refname=Def.]{definition}
\declaretheorem[name=Conjecture,sibling=theorem]{conjecture}
\declaretheorem[name=Axiom]{axiom}
\declaretheorem[name=Assumption,refname=Asm.]{assumption}
\declaretheorem[name=Example]{example}
% etoolbox provides \AfterEndEnvironment, used below to remove the paragraph
% indentation after theorem-like environments
% (see https://www.overleaf.com/project/614aed333c3b7721ea2b68a3).
% If these hooks are removed, \noindent has to be added by hand after every
% restatable environment instead.
\usepackage{etoolbox}
% placeins provides \FloatBarrier.
\usepackage{placeins}

% NOTE(review): only the environments listed here are hooked; proposition,
% corollary, definition, fact, etc. keep the default indentation -- confirm
% this is intended.
\AfterEndEnvironment{restatable}{\noindent\ignorespacesafterend}
\AfterEndEnvironment{theorem}{\noindent\ignorespacesafterend}
\AfterEndEnvironment{remark}{\noindent\ignorespacesafterend}
\AfterEndEnvironment{example}{\noindent\ignorespacesafterend}
\AfterEndEnvironment{assumption}{\noindent\ignorespacesafterend}
\AfterEndEnvironment{lemma}{\noindent\ignorespacesafterend}

\usepackage{cleveref}
% Reference names used by \cref: \crefname{<type>}{<singular>}{<plural>}.
% The plural for assumptions is spelled out so that it matches the singular
% form (it was previously abbreviated as "Ass.").
\crefname{assumption}{Assumption}{Assumptions}
% Equations are referenced by their number only (empty name).
\crefname{equation}{}{}
\crefname{figure}{Fig.}{Fig.}
\crefname{table}{Tab.}{Tables}
\crefname{section}{Sec.}{Sec.}
\crefname{theorem}{Thm.}{Thm.}
\crefname{fact}{Fact}{Facts}
\crefname{lemma}{Lemma}{Lemmas}
\crefname{corollary}{Cor.}{Cor.}
\crefname{example}{Example}{Examples}
\crefname{remark}{Remark}{Remarks}
\crefname{algorithm}{Alg.}{Algorithms}
\crefname{appendix}{App.}{Appendices}

\begin{document}

\title{Implicit Kernel Meta-Learning Using Kernel Integral Forms (Supplementary Material)}

\author[1, 2]{\href{mailto:<ucabitf@ucl.ac.uk>?Subject=IKML (UAI)}{John~Isak~Texas~Falk}{}}
\author[1]{Carlo~Ciliberto}
\author[1, 2]{Massimiliano~Pontil}
% Add affiliations after the authors
\affil[1]{%
    Dept. of Computer Science\\
    University College London\\
    U.K. 
}
\affil[2]{%
    CSML\\
    Italian Institute of Technology\\
    Genoa, Italy
}

\onecolumn

\maketitle

The supplementary material is organized as follows. In \cref{app:glossary} we introduce a glossary of terms used in the main body. In \cref{app:closed-form-pf-gaussian} we derive the closed form of the  stochastic kernel of the affine pushforward kernel. In \cref{app:bound} we derive the detailed bounds presented in Theorem 1. In \cref{app:datasets} we elaborate on the creation of the Air Quality (\ref{app:air-quality-dataset}) and the Gas Sensor (\ref{app:gas-sensor-dataset}) datasets. In \cref{app:exp-results} we include the information on the numerical experiment presented in the main body. Finally, in  \cref{app:computational-complexity} we comment on the computational complexity of IKML and compare it against that of R2D2 since they both rely on KRR as the inner algorithm.

\newpage

\section{Glossary}
\label{app:glossary}

\FloatBarrier

\begin{table*}[h!]
  \centering
  \begin{tabular}{cl}
    Notation & Description\\
    \midrule
    $\mc{X}$ & input space\\
    $\mc{Y}$ & output space\\
    $\mc{Z}$ & data space / latent space\\
    $\mc{P}(\mc{Z})$ & set of distributions with support on $\mc{Z}$\\
    $\ell$ & inner loss\\
    $\mc{R}_{\mu}(f)$ & risk of estimator $f$ with respect to distribution $\mu$\\
    $\hat{\mc{R}}(f, D)$ & empirical risk of $f$ on dataset $D$\\
    $D^{\text{tr}}$ & train set\\
    $D^{\text{val}}$ & validation set\\
    $A(\theta, D)$ & inner algorithm with hyperparameter $\theta$ evaluated on dataset $D$\\
    $L(\theta, D)$ & meta-loss of $\theta$ on $D = D^{\text{tr}} \cup D^{\text{val}}$\\
    $K$ & kernel\\
    $\mc{H}, \mc{H}_{K}$ & Hilbert space with corresponding kernel $K$\\
    $\scal{\cdot}{\cdot}_K, \scal{\cdot}{\cdot}_{\mc{H}_K}$ & RKHS inner product of RKHS $\mc{H}_{K}$\\
    $K_{\tau}$ & Bochner kernel with measure $\tau$\\
    $\omega$ & frequency in random feature kernel\\
    $\psi_{\theta}$ & pushforward function parameterized by $\theta$\\
    $\mc{N}$ & latent distribution\\
    $\tau_{\theta}$ & pushforward $\psi_{\theta} \# \mc{N}$\\
    $K_{\tau_{\theta}}$ & Bochner kernel using pushforward $\tau_{\theta}$\\
    $K_{\hat{\tau}_{\theta S}}$ & random feature kernel using sample $S \sim \tau_{\theta}^M$\\ 
    $A_{\text{KRR}}(K, D)$ & estimator of KRR on dataset $D$ using kernel $K$\\
    $L(\theta, S, D)$ & meta-loss when using inner algorithm $A_{\text{KRR}}(K_{\hat{\tau}_{\theta S}}, \cdot)$ on the dataset $D = D^{\text{tr}} \cup D^{\text{val}}$\\
    $\tilde{L}(\theta, S, D)$ & train error on $D$ when using KRR with random feature kernel $K_{\hat{\tau}_{\theta S}}$\\
    $\rho$ & meta-distribution\\
    $\mc{E}(\theta)$ & transfer risk of hyperparameter $\theta$\\
    $\mc{E}_M(\theta) / \mc{E}(\theta, S)$ & transfer risk of $\theta$ when using an $M$-sample random feature kernel $K_{\hat{\tau}_{\theta S}}$ and KRR averaged over $S$\\
    $\hat{\mc{E}}(\theta, S)$ & estimation error for future task\\
    $\hat{\mc{E}}_T(\theta, S)$ & average train error (multitask empirical risk) on $(D_t)_{t=1}^T$ when using KRR with random feature kernel  $K_{\hat{\tau}_{\theta S}}$\\
    $\hat{\theta}$ & multitask empirical risk minimizer when using Bochner kernel\\
    $n, M, T$ & dataset size, number of random features, number of datasets\\
    $\lambda$ & regularization strength in KRR\\
    $R_{n, M, T}$ & Rademacher complexity $\E_{\epsilon} \sup_{\theta \in \Theta} \epsilon_{i,j,t} \langle \psi_{\theta}(s_{j}), x_{i}^{t} \rangle$\\
    $G^*_n$ & complexity term $\E_{\mu \sim \rho}\E_{D \sim \mu^{n}}\norm{(K_{\tau_{\theta^*}}(x_i,x_j))_{i,j=1}^n}_{\infty}$ measuring alignment of kernel $K_{\tau_{\theta^*}}$ with $\rho$
  \end{tabular}
\end{table*}

\FloatBarrier

\section{Kernel for Affine Pushforward and Gaussian Latent}
\label{app:closed-form-pf-gaussian}
In this section we give the closed form of the kernel when the distribution \(\tau\) is the affine pushforward of a standard Gaussian.

We use the following trick to find the closed form kernel. We can rewrite the kernel in Bochner's theorem as
\begin{align}
    K(x, x') = \int \cos(\scal{\omega}{x - x'}) \dl \tau(\omega) & = \int \Re (\cos(\scal{\omega}{x - x'}) + i \sin(\scal{\omega}{x - x'})) \dl \tau(\omega) \\
                 & = \int \Re \exp(i \scal{\omega}{x - x'})\dl \tau(\omega) \\
                 & = \Re \int \exp(i \scal{\omega}{x - x'}) \dl \tau(\omega) \label{eq:real-cf-kernel-form}
\end{align}
so finding the kernel is the same as finding the real part of the characteristic function (CF) of \(\tau\). For a Gaussian the CF is well-known and we give it below.

\begin{lemma}
  \label{lem:gauss-cf}
  Let \(\omega \distas \tau = \mc{N}(\mu, \Sigma)\) where \(\Sigma\) is pd, then for any \(\Delta \in \R^d\)
  \begin{equation}
    \int_{\R^{d}}\exp(i \omega \tran \Delta) \dl \tau(\omega) = \exp(i \mu\tran \Delta - \frac{1}{2}\Delta \tran \Sigma \Delta).
  \end{equation}
\end{lemma}
\begin{proof}
  The pdf of \(\omega\) is
  \(f(\omega) = (2\pi)^{-d/2}\abs{\det(\Sigma)}^{-1/2}\exp(-\frac{1}{2}(\omega - \mu)\tran\Sigma^{-1}(\omega - \mu))\).
  We make the change of variable \(\phi = \Sigma^{-1/2}(\omega - \mu)\) so
  \(\omega = \Sigma^{1/2}\phi + \mu\) where
  \(\Sigma^{1/2}\) and \(\Sigma^{-1/2}\) exist due to \(\Sigma\) being pd. This means that
  \(\dl \omega = \abs{\det(\Sigma)}^{1/2} \dl \phi\) so that we have
  \begin{align}
  \nonumber
    \int_{\R^{d}}\exp(i \omega \tran \Delta) \dl \tau(\omega) & = \int_{\R^{d}}\exp(i \omega\tran \Delta)f(\omega) \dl \omega \\
    \nonumber
                                                                 & = (2\pi)^{-d/2}\int_{\R^{d}}\exp(i (\Sigma^{1/2}\phi + \mu)\tran \Delta)\exp(-\frac{1}{2}\phi\tran\phi) \dl \phi \\
                                                                 \nonumber
                                                                 & = (2\pi)^{-d/2}\exp(i \mu \tran \Delta)\int_{\R^{d}}\exp(i \phi\tran \Sigma^{1/2} \Delta)\exp(-\frac{1}{2}\phi\tran\phi) \dl \phi \\
                                                                 \nonumber
                                                                 & = (2\pi)^{-d/2}(2\pi)^{d/2}\exp(i \mu \tran \Delta - \frac{1}{2}\Delta\tran \Sigma \Delta)  = \exp(i \mu \tran \Delta - \frac{1}{2}\Delta\tran \Sigma \Delta).
  \end{align}
\end{proof}

    Now we parameterize \(\tau\) using \(S \distas \mc{N}\) and $\theta = (Q,b)$ with $Q \in \mathbb{R}^{d \times d}$ and $b\in \mathbb{R}^d$ so that \(\tau = \psi_{(Q, b)}\pushforward \mc{N}\), where \(\psi_{(Q, b)}(s) = Qs + b\). An affine transformation of a Gaussian random variable is again Gaussian, and in this particular case it's easy to show that \(\tau = \mc{N}(b, Q Q\tran)\). Combining \cref{eq:real-cf-kernel-form} and \cref{lem:gauss-cf} we have
    \begin{align}
        \kernel{K}(x, x') & = \Re \int \exp(i \scal{\omega}{x - x'}) \dl \tau(\omega) \\
        & = \Re \exp(i b \tran (x - x') - \frac{1}{2} (x - x')\tran Q Q\tran (x - x')) \\
        & = \cos(b \tran (x - x'))\exp( - \frac{1}{2} (x - x')\tran Q Q\tran (x - x')) \\
        & = \cos(b \tran (x - x'))\exp( - \frac{1}{2} \norm{Q\tran(x - x')}^2).
    \end{align}

\section{Error Decomposition}
\label{app:bound}
\subsection{Setup}
We follow the notation of \citet{maurer09_trans_bound_linear_featur_learn}
with some modifications and note that this differs in places from the notation used in the main body of the paper. We recall the meta-learning setting. There is some
meta-distribution \(\rho\) which generates tasks \(\mu\), and from each \(\mu\) we are
given a train set \(\bm{z} = (\bm{x}, \bm{y}) \sim \mu^{n}\), where
\((x, y) \in \mc{X} \times \mc{Y} \subseteq \R^{d} \times [0, 1]\). We consider the
kernel ridge regression (KRR) algorithm with a fixed regularization parameter
\(\lambda > 0\) and an RKHS with corresponding kernel indexed by
\(\theta \in \Theta\), where \(\Theta \subseteq \R^{D}\) is compact. We write this family of RKHSs as \(\mc{H}_{\Theta}\) and the family of
kernels as \(\mc{K}_{\Theta}\). For a kernel \(K_{\theta}\) let
\(\phi_{\theta}(x) = K_{\theta}(x, \cdot)\) be the canonical feature map and \(\mc{H}_{\theta}\) the corresponding RKHS.
The KRR solution is
\begin{equation}
  \label{eq:2}
  \omega_{\theta}(\bm{z}) = \argmin_{w \in \mc{H}_{\theta}}\left(\frac{1}{n}\sum_{i=1}^{n}(\scal{w}{\phi_{\theta}(x_{i})}_{\theta} - y_{i})^{2} + \lambda\norm{w}_{\theta}^{2}\right),
\end{equation}
where we use \(\scal{\cdot}{\cdot}_{\theta}\) and \(\norm{\cdot}_{\theta}\) to
denote the inner product and norm in RKHS \(\mc{H}_{\theta}\). We will drop
\(\theta\) when it's clear what RKHS we are referring to. Given a weight vector \(w \in \mc{H}_{\theta}\), a
prediction on a new datapoint \(x\) is given by \(\scal{w}{\phi_{\theta}(x)}\).

The transfer risk of the algorithm \(\omega_{\theta}\) and a loss
\(\ell: \mc{Y} \times \mc{Y} \to \R_{+}\) is defined to be
\begin{equation}
  \label{eq:3}
  \mc{E}(\theta) = \E_{\mu \sim \rho}\E_{\bm{z} \sim \mu^{n}}\E_{(x, y) \sim \mu} \ell(\scal{\omega_{\theta}(\bm{z})}{\phi_{\theta}(x)}, y).
\end{equation}
We have access to \(T\) datasets from tasks by sampling
\((\mu_{t})_{t=1}^{T} \sim \rho^{T}\) which gives rise to datasets
\(\bm{z}^{t} = (\bm{x}^{t}, \bm{y}^{t}) \sim \mu_{t}^{n}\). For the meta-dataset
\(\bm{Z} = (\bm{z}^{t})_{t=1}^{T}\) sampled by first sampling
\((\mu_{t})_{t=1}^{T} \sim \rho^{T}\) and then \(\bm{z}^{t} \sim \mu_{t}^{n}\), we
denote this sampling process by \(\bm{Z} \sim \hat{\rho}^{T}\). Using the KRR
algorithm \(\omega_{\theta}\) we let
\begin{equation}
  \label{eq:4}
  \hat{\ell}_{\theta}(\bm{z}^{t}) = \frac{1}{n}\sum_{i=1}^{n}
  \ell(\scal{\omega_{\theta}(\bm{z}^{t})}{\phi_{\theta}(x^{t}_{i})}, y^{t}_{i}),
\end{equation}
which is the training error of task \(t\) using \(\omega_{\theta}\). For a
sample of latent variables \(S = (s_{k})_{k=1}^{M} \sim \mc{N}^{M}\) so that the random features
\(\psi_{\theta}(s_{k}) \sim \tau_{\theta}\) (that is
\(\tau_{\theta} = \psi_{\theta} \pushforward \mc{N}\)), in which case we define \(K_{\theta} = K_{\tau_{\theta}}\) and \(K_{\theta, S} = K_{\hat{\tau}_{\theta S}}\), we let
\begin{equation}
  \label{eq:emprical-risk-with-S}
  \hat{\ell}_{\theta}(\bm{z}^{t}, S) = \frac{1}{n}\sum_{i=1}^{n}
  \ell(\scal{\omega_{\theta, S}(\bm{z}^{t})}{\phi_{\theta, S}(x^{t}_{i})}, y^{t}_{i}),
\end{equation}
where \(\omega_{\theta, S}\) is the same as \cref{eq:2} where we replace the
kernel \(K_{\theta}\) by the random feature kernel \(K_{\theta, S}\) and the
corresponding RKHS, see \cref{sec:kernel-family}. When the algorithm \(\omega\) is clear from context we simply write
\(\hat{\ell}(\bm{z})\) and \(\hat{\ell}(\bm{z}, S)\). We opt to select \(\theta\) using ERM, letting
\begin{equation}
  \label{eq:5}
  \hat{\theta} = \argmin_{\theta \in \Theta}\left\{ \hat{\mc{E}}_{T}(\theta) = \frac{1}{T}\sum_{t=1}^{T}\hat{\ell}_{\theta}(\bm{z}^{t})\right\}.
\end{equation}
As the problem of \cref{eq:5} is non-convex we cannot solve it in general. We let
\(\tilde{\theta}\) be the output of an optimization procedure
\(\tilde{\theta} = \textrm{Alg}(\hat{\mc{E}}_{T})\) and encode this optimization
discrepancy through the term
\(\hat{\mc{E}}_{T}(\tilde{\theta}) - \hat{\mc{E}}_{T}(\hat{\theta})\).

\subsection{Kernel Family}
\label{sec:kernel-family}
Let \(\mc{H}\) be an RKHS defined by Bochner's theorem through the
kernel defined by any probability measure \(\tau \in M_{1}(\mc{X})\),
\begin{equation}
  \label{eq:1}
  K(x, x') = \int \xi(x; v)\overline{\xi}(x'; v)\dl \tau(v).
\end{equation}
We will assume that \(\xi(x; v) = \exp(i v\tran x)\), but the analysis should
generalize to the more general setting. For a real-valued kernel
\(K: \mc{X} \times \mc{X} \to \R\), it can be shown
that any such kernel satisfying \cref{eq:1} can be rewritten as
\begin{equation}
  \label{eq:real-translation-invariant-kernel}
  K(x, x') = \int_{(-\pi/2, \pi/2]^{d}} \cos(\scal{v}{x - x'})\dl \tau'(v),
\end{equation}
for some measure \(\tau'\) with support on \((-\pi/2, \pi/2]^{d}\).
For IKML we parameterise a class of measures by
\(\psi_{\theta}\pushforward \mc{N}\) where \(\psi_{\theta}\) is an MLP with weights
\(\theta\) and we denote the kernel and RKHS by \(K_{\theta}\) and
\(\mc{H}_{\theta}\).

Given a dataset of inputs \(\bm{x} = (x_{i})_{i=1}^{n}\), denote the kernel matrix
\(\bm{G}_{\theta}(\bm{x})\) so that
\(\bm{G}_{\theta}(\bm{x})_{ij} = K_{\theta}(x_{i}, x_{j})\) and let
\(\bm{G}_{\theta, \lambda}(\bm{x}) = \bm{G}_{\theta}(\bm{x}) + n\lambda I\).
Similarly for a set of latents  \(S \sim \mc{N}^{M}\) we denote
the respective matrices \(\bm{G}_{\theta}(\bm{x}, S)\) and
\(\bm{G}_{\theta, \lambda}(\bm{x}, S)\) where we replace every instance of
\begin{equation}
  \label{eq:41}
  K_{\theta}(x, x') = \int \cos(\scal{\psi_{\theta}(s)}{x - x'})\dl \mc{N}(s)
\end{equation}
by the empirical mean

\begin{equation}
  \label{eq:42}
  K_{\theta, S}(x, x') = \frac{1}{M}\sum_{j=1}^{M}\cos(\scal{\psi_{\theta}(s_{j})}{x - x'}) = \phi_{\theta, S}(x)\tran\phi_{\theta, S}(x'),
\end{equation}
where \(\phi_{\theta, S}(x) = \frac{1}{\sqrt{M}}(\sin(\psi_{\theta}(s_1)\tran x), \cos(\psi_{\theta}(s_1)\tran x), \dots, \sin(\psi_{\theta}(s_M)\tran x), \cos(\psi_{\theta}(s_M)\tran x))\tran \in \R^{2M}\).
We will omit \(\theta\) and \(\bm{x}\) from
\(\bm{G}_{\theta}(\bm{x})\) when clear from context. Similarly we let
\(\hat{\ell}_{\theta}(\bm{x}, \bm{y}, S)\) be the train loss when trained on
\(\bm{x}, \bm{y}\) with random features induced by \(S\) and we omit \(\theta\) when clear from context.

\subsection{Auxiliary Results}
Let \(\norm{\cdot}_{\infty}\) be the operator norm and \(\norm{\cdot}_{F}\) the
Frobenius norm. For an algorithm \(\omega\) and a dataset \(\bm{z}\), let \(\hat{\ell}(\bm{z})\) be the training error of \(\omega\) on \(\bm{z}\) using loss \(\ell\).
\begin{definition}
  Given any \(\bm{z} = (\bm{x}, \bm{y})\) or two input sets
  \(\bm{x}_{1}, \bm{x}_{2}\) of size \(n\), where \(x \in \mc{X}\) and \(y \in [0, 1]\), relative to a fixed loss function \(\ell\), an algorithm \(\omega\) taking
  values in an RKHS \(\mc{H}\) is said to
  \begin{itemize}
    \item be \(\beta\)-bounded if \(\norm{\omega(\bm{z})} \leq \beta\) and
          \(\hat{\ell}(\bm{z}) \leq \beta\).
    \item have kernel stability \(L\) if
          \begin{equation}
            \label{eq:7}
            \hat{\ell}(\bm{x}_{1}, \bm{y}) - \hat{\ell}(\bm{x}_{2}, \bm{y}) \leq \frac{L}{n}\norm{\bm{G}(\bm{x}_{1}) - \bm{G}(\bm{x}_{2})}_{F}
          \end{equation}
    \item have random feature stability \(L\) if
          \begin{equation}
            \hat{\ell}(\bm{x}, \bm{y}, S) - \hat{\ell}(\bm{x}, \bm{y}) \leq \frac{L}{n}\norm{\bm{G}(\bm{x}, S) - \bm{G}(\bm{x})}_{F}.
          \end{equation}
  \end{itemize}
\end{definition}

\begin{lemma}[\citep{maurer09_trans_bound_linear_featur_learn}, Lemma 3]
  \label{lem:maurer09-psd-operator-facts}
  Let \(G_{1}\) and \(G_{2}\) be positive semidefinite operators on any Hilbert
  space and \(\lambda > 0\), then
  \begin{enumerate}
    \item \(G_{i} + \lambda I\) is invertible,
    \item \(\norm{(G_{i} + \lambda I)^{-1}}_{\infty} \leq \frac{1}{\lambda}\) and
    \item we have
          \begin{equation}
            \label{eq:9}
            \norm{(G_{1} + \lambda I)^{-1} - (G_{2} + \lambda I)^{-1}}_{\infty} \leq \frac{1}{\lambda^{2}}\norm{G_{1} - G_{2}}_{\infty}.
          \end{equation}
    \item Let \(\phi_{1}, \phi_{2}\) satisfy \((G_{i} + \lambda I)\phi_{i} = y\).
          Then
          \begin{equation}
            \label{eq:10}
            \abs{\norm{\phi_{1}}^{2} - \norm{\phi_{2}}^{2}} \leq 2\lambda^{-3}\norm{G_{1} - G_{2}}_{\infty}\norm{y}^{2}
          \end{equation}
  \end{enumerate}
\end{lemma}

For any dataset \(\bm{z} = (\bm{x}, \bm{y})\) of size \(n\), kernel \(K\) with
RKHS \(\mc{H}\) and feature map \(\phi\), and
corresponding KRR algorithm \(\omega\), we define the following quantities, following
\citet{maurer09_trans_bound_linear_featur_learn},
\begin{align}
  \omega(\bm{z}) & = \argmin_{w \in \mc{H}}\left(\frac{1}{n}\sum_{i=1}^{n}(\scal{w}{\phi(x_{i})} - y_{i})^{2} + \lambda\norm{w}^{2}\right),\\
  \hat{\ell}_{\omega}(\bm{z}) & = \frac{1}{n}\sum_{i=1}^{n}(\scal{\omega(\bm{z})}{\phi(x_{i})} - y_{i})^{2},\\
  \xi_{\omega}(\bm{z}) & = \min_{w \in \mc{H}}\left(\frac{1}{n}\sum_{i=1}^{n}(\scal{w}{\phi(x_{i})} - y_{i})^{2} + \lambda\norm{w}^{2}\right) = \hat{\ell}_{\omega}(\bm{z}) + \lambda\norm{\omega(\bm{z})}^{2}
\end{align}

\begin{proposition}
  \label{prop:bounding-task-loss-terms}
  For any kernel \(K\) of the form \cref{eq:real-translation-invariant-kernel},
  for any dataset \(\bm{z} = (\bm{x}, \bm{y})\) or two input sets
  \(\bm{x}_{1}, \bm{x}_{2}\), where \(x \in \mc{X}, y \in [0, 1]\), of size \(n\) and a sample of random
  features \(S \sim \mc{N}^{M}\) we have
  that
  \begin{enumerate}
    \item \(\hat{\ell}_{\omega}(\bm{z}) \leq 1\),
          \(\norm{\omega(\bm{z})} \leq \lambda^{-1/2}\), \(\xi_{\omega}(\bm{z}) \leq 1\),
    \item
          \(\abs{\hat{\ell}_{\omega}(\bm{x}_{1}, \bm{y}) - \hat{\ell}_{\omega}(\bm{x}_{2}, \bm{y})} \leq \frac{2\lambda^{-1}}{n}\norm{\bm{G}(\bm{x}_{1}) - \bm{G}(\bm{x}_{2})}_{F}\),
    \item
          \(\abs{\hat{\ell}_{\omega}(\bm{x}, \bm{y}) - \hat{\ell}_{\omega, S}(\bm{x}, \bm{y})} \leq \frac{2\lambda^{-1}}{n}\norm{\bm{G}(\bm{x}) - \bm{G}(\bm{x}, S)}_{F}\),
    \item
          \(\abs{\xi_{\omega}(\bm{x}_{1}, \bm{y}) - \xi_{\omega}(\bm{x}_{2}, \bm{y})} \leq \frac{\lambda^{-1}}{n}\norm{\bm{G}(\bm{x}_{1}) - \bm{G}(\bm{x}_{2})}_{F}\),
    \item
          \(\abs{\xi_{\omega}(\bm{x}, \bm{y}) - \xi_{\omega, S}(\bm{x}, \bm{y})} \leq \frac{\lambda^{-1}}{n}\norm{\bm{G}(\bm{x}) - \bm{G}(\bm{x}, S)}_{F}\)
  \end{enumerate}
  where \(\bm{G}(\bm{x}, S)\) is the kernel matrix of \(\bm{x}\) using random
  features induced by \(S\).
\end{proposition}

\begin{proof}
  We simply note that
  \begin{align}
%    \label{eq:6}
    \hat{\ell}_{\omega}(\bm{z}) + \lambda\norm{\omega(\bm{z})}^{2} = \xi_{\omega}(\bm{z}) & = \min_{w \in \mc{H}}\left(\frac{1}{n}\sum_{i=1}^{n}(\scal{w}{\phi(x_{i})} - y_{i})^{2} + \lambda\norm{w}^{2}\right) \leq \frac{1}{n}\sum_{i=1}^{n}(\scal{0}{\phi(x_{i})} - y_{i})^{2} + \lambda\norm{0}^{2}\leq 1.
    \nonumber
  \end{align}
  Since both \(\hat{\ell}_{\omega}(\bm{z})\) and
  \(\lambda\norm{\omega(\bm{z})}^{2}\) are nonnegative and their sum is at most \(1\),
  we have that \(\hat{\ell}_{\omega}(\bm{z}) \leq 1\) and
  \(\lambda\norm{\omega(\bm{z})}^{2} \leq 1\) which implies that
  \(\norm{\omega(\bm{z})} \leq \lambda^{-1/2}\).

  For the second point, using the dual formulation
  \(\bm{G}_{\lambda}(\bm{x})\alpha = \bm{y}\) and
  \(\scal{\omega(\bm{z})}{\phi(x_{i})} = (\bm{G}(\bm{x})\alpha)_{i}\),
  \begin{align}
    \hat{\ell}_{\omega}(\bm{z}) = \frac{1}{n}\norm{\bm{G}(\bm{x})\alpha - \bm{y}}^{2} = \frac{1}{n}\norm{\bm{G}_{\lambda}(\bm{x})\alpha - \bm{y} - \lambda n \alpha}^{2} = \frac{1}{n}\norm{\lambda n \alpha}^{2}= \lambda^{2}n\norm{\alpha}^{2}.
  \end{align}
  Using this and the fact that \(\norm{\omega(\bm{z})}^{2} = \alpha\tran \bm{G}(\bm{x})\alpha\) in \(\xi_{\omega}\),
  \begin{align}
    \xi_{\omega}(\bm{z}) & = \hat{\ell}_{\omega}(\bm{z}) + \lambda\norm{\omega(\bm{z})}^{2}\\
                         & = \lambda^{2}n\norm{\alpha}^{2} + \lambda \alpha\tran \bm{G}(\bm{x})\alpha\\
                         & = \lambda(\lambda n \alpha\tran \alpha + \alpha\tran \bm{G}(\bm{x})\alpha)= \lambda(\alpha\tran \bm{G}_{\lambda}(\bm{x}) \alpha) = \lambda(\bm{y}\tran \bm{G}_{\lambda}(\bm{x})^{-1} \bm{y}).
  \end{align}

  Thus
  \begin{align}
    \abs{\hat{\ell}_{\omega}(\bm{x}_{1}, \bm{y}) - \hat{\ell}_{\omega}(\bm{x}_{2}, \bm{y})} & = \lambda^{2}n\abs{\norm{\bm{G}_{\lambda}(\bm{x}_{1})^{-1}\bm{y}}^{2} - \norm{\bm{G}_{\lambda}(\bm{x}_{2})^{-1}\bm{y}}^{2}}\\
                                                                                            & \leq (\lambda^{2}n) 2(\lambda n)^{-3}\norm{\bm{G}(\bm{x}_{1}) - \bm{G}(\bm{x}_{2})}_{\infty}\norm{\bm{y}}^{2}\\
                                                                                            & \leq 2\lambda^{-1} n^{-2}\norm{\bm{G}(\bm{x}_{1}) - \bm{G}(\bm{x}_{2})}_{\infty}\norm{\bm{y}}^{2} \leq \frac{2\lambda^{-1}}{n}\norm{\bm{G}(\bm{x}_{1}) - \bm{G}(\bm{x}_{2})}_{F},
  \end{align}
  where we have used point 4 in \cref{lem:maurer09-psd-operator-facts} and the fact that \(\norm{\bm{y}}^{2} = \sum_{i=1}^{n}y_{i}^{2} \leq n\) as \(y_{i} \in [0, 1]\) for any \(i \in [n]\). Then
  \begin{align}
    \abs{\xi_{\omega}(\bm{x}_{1}, \bm{y}) - \xi_{\omega}(\bm{x}_{2}, \bm{y})} & \leq \lambda\abs{\bm{y}\tran \bm{G}_{\lambda}(\bm{x}_{1})^{-1} \bm{y} - \bm{y}\tran \bm{G}_{\lambda}(\bm{x}_{2})^{-1} \bm{y}}\\
                                                                              & \leq \lambda\abs{\bm{y}\tran( \bm{G}_{\lambda}(\bm{x}_{1})^{-1} -  \bm{G}_{\lambda}(\bm{x}_{2})^{-1}) \bm{y}}\\
                                                                              & \leq \lambda (\lambda n)^{-2}\norm{\bm{G}(\bm{x}_{1}) - \bm{G}(\bm{x}_{2})}_{\infty}\norm{\bm{y}}^{2}\\
                                                                              & \leq \lambda n (\lambda n)^{-2}\norm{\bm{G}(\bm{x}_{1}) - \bm{G}(\bm{x}_{2})}_{F}\\
                                                                              & \leq \frac{\lambda^{-1}}{n}\norm{\bm{G}(\bm{x}_{1}) - \bm{G}(\bm{x}_{2})}_{F}.
  \end{align}
   For the third point and fifth point, the proof is the same as above, replacing
  \(\bm{G}_{\lambda}(\bm{x_{1}})\) and \(\bm{G}_{\lambda}(\bm{x_{2}})\) with
  \(\bm{G}_{\lambda}(\bm{x})\) and \(\bm{G}_{\lambda}(\bm{x}, S)\) respectively.
  Thus all of the results follow.
\end{proof}

\begin{definition}[Complexities]
  Let \((\sigma_{i})_{i=1}^{k}\) denote a sequence of independent Rademacher
  variables (uniformly distributed on \(\{-1, 1\}\)). For a set
  \(A \subseteq \R^{k}\), the Rademacher complexity is defined
  to be
  \begin{equation}
    \label{eq:16}
    \mc{R}(A) = \E_{\sigma}\sup_{\bm{x} \in A} \frac{2}{k}\sum_{i=1}^{k}\sigma_{i}x_{i}.
  \end{equation}

  If \(\mc{F}\) is a class of real functions on a space \(\mc{X}\) and
  \(\bm{x} \in \mc{X}^{k}\), we write
  \begin{equation}
    \label{eq:17}
    \mc{F}(\bm{x}) = \{(f(x_{1}), \dots, f(x_{k}))\: : \: f \in \mc{F}\} \subseteq \R^{k}.
  \end{equation}
  The empirical Rademacher complexity of \(\mc{F}\) on \(\bm{x}\)
  is \(\mc{R}(\mc{F}(\bm{x}))\). If
  \(\mu \in M_{1}(\mc{X})\) is a probability measure on \(\mc{X}\) then the
  corresponding expected complexity is
  \(\E_{\bm{x} \sim \mu^{k}}\mc{R}(\mc{F}(\bm{x}))\).
\end{definition}

% REMOVE: Can appeal to Maurere
\begin{theorem}[\citep{maurer09_trans_bound_linear_featur_learn}, Thm. 4]
  \label{thm:rad-gauss-complexity-results}
  Let \(\mc{F}\) be a real-valued function class on a space \(\mc{X}\) and
  \(\mu \in M_{1}(\mc{X})\). For
  \(\bm{x} = (x_{1}, \dots, x_{k}) \in \mc{X}^{k}\) define
  \begin{equation}
    \label{eq:15}
    \Phi(\bm{x}) = \sup_{f \in \mc{F}}\left( \E_{x \sim \mu}f(x) - \frac{1}{k}\sum_{i=1}^{k}f(x_{i}) \right).
  \end{equation}
  Then
  \begin{enumerate}
    \item
    \(\E_{\bm{x} \sim \mu^{k}}\Phi(\bm{x}) \leq \E_{\bm{x} \sim \mu^{k}}\mc{R}(\mc{F}(\bm{x}))\),
    \item if \(\mc{F}\) is \([0, 1]\)-valued, then for any \(\delta > 0\) we have
    with probability greater than \(1 - \delta\) in \(\bm{x} \sim \mu^{k}\)
    that
    \begin{equation}
      \label{eq:19}
      \Phi(\bm{x}) \leq \E_{\bm{x} \sim \mu^{k}}\mc{R}(\mc{F}(\bm{x})) + \sqrt{\frac{\log(1/\delta)}{2k}}.
    \end{equation}
  \end{enumerate}
\end{theorem}

\begin{corollary}[\citep{maurer09_trans_bound_linear_featur_learn}, Corollary 1]
  \label{cor:gauss-complexity-lipschitz-composition}
  Let \(A \subseteq \R^{k}\) and \(\phi_{1}, \dots, \phi_{k}\) be real
  functions, each with Lipschitz constant \(L\). Denote
  \(\phi \circ A = \{(\phi_{1}(x_{1}), \dots, \phi_{k}(x_{k}))\: : \: (x_{1}, \dots, x_{k}) \in A\}\).
  Then \(\mc{R}(\phi \circ A) \leq L \mc{R}(A)\).
\end{corollary}

\subsection{Decomposition}
We want to control the excess meta-risk
\(\E_{S \sim \mc{N}^M}[\mc{E}(\hat{\theta}, S) - \mc{E}(\theta^{*})]\), where \(\theta^{*} = \argmin_{\theta \in \Theta}\mc{E}(\theta
)\). We introduce the following terms
\begin{equation}
  \label{eq:population-train-error}
  \hat{\mc{E}}(\theta) = \E_{\bm{Z} \sim \rho^{T}}\hat{\mc{E}}_{T}(\theta)
\end{equation}
and the corresponding term \(\hat{\mc{E}}(\theta, S)\) where we replace the kernel \(K_{\theta}\) by \(K_{\theta, S}\). We decompose the excess meta-risk as follows
\begin{align}
  \label{eq:14}
  \E_{S \sim \mc{N}^M}[\mc{E}(\hat{\theta}, S) - \mc{E}(\theta^{*})] & =  \E_{S \sim \mc{N}^M}[\underbrace{\mc{E}(\hat{\theta}, S) - \hat{\mc{E}}(\hat{\theta}, S)}_{(A)} + \underbrace{\hat{\mc{E}}(\hat{\theta}, S) - \hat{\mc{E}}_{T}(\hat{\theta}, S)}_{(B)} + \underbrace{\hat{\mc{E}}_{T}(\hat{\theta}, S) - \hat{\mc{E}}_{T}(\theta^{*}, S)}_{(C)} \\
                                               & + \underbrace{\hat{\mc{E}}_{T}(\theta^{*}, S) - \hat{\mc{E}}(\theta^{*}, S)}_{(D)} + \underbrace{\hat{\mc{E}}(\theta^{*}, S) - \mc{E}(\theta^{*}, S)}_{(E)} + \underbrace{\mc{E}(\theta^{*}, S) - \mc{E}(\theta^{*})}_{(F)}]
\end{align}
We bound each of the terms.

\subsection{Bounding the estimation error for the future task}
This follows \cite[Sec. 4.1]{maurer09_trans_bound_linear_featur_learn}, but we
present the results in the order that they are needed. This argument bounds both \((A)\)
and \((E)\).

\begin{theorem}[Upper bound of estimation error for future task]
  For any \(\theta \in \Theta\), any loss \(\ell\) such that for all \(y \in [0, 1]\) \(\ell(\cdot, y) : [-L, L] \to \R_{+}\) has Lipschitz constant \(\Lip(L)\), with \(\omega\) being KRR with regularization parameter \(\lambda > 0\) and RKHS induced by \(K_{\theta}\)
  \begin{equation}
    \label{eq:upper-bound-of-estimation-error}
    \E_{S \sim \mc{N}^M}[\mc{E}(\theta, S) - \hat{\mc{E}}(\theta, S)] \leq \Lip(\lambda^{-1/2})\E_{S \sim \mc{N}^{M}}\E_{\bm{z} \sim \hat{\rho}}\mc{R}(\mc{G}(\bm{z})),
  \end{equation}
  where \(\mc{G} = \{z = (x, y) \mapsto \lambda^{-1/2}\scal{v}{\phi_{\theta, S}(x)}_{\theta, S} \: : \: \norm{v}_{\theta, S} \leq 1\}\). Furthermore, we also have the upper bound
  \begin{equation}
    \label{eq:upper-bound-of-estimation-error-full}
    \E_{S \sim \mc{N}^M}[\mc{E}(\theta, S) - \hat{\mc{E}}(\theta, S)] \leq \frac{2\lambda^{-1/2}\Lip(\lambda^{-1/2})}{\sqrt{n}}.
  \end{equation}
\end{theorem}

\begin{proof}
  We may rewrite term \(\E_{S \sim \mc{N}^M}[\mc{E}(\theta, S) - \hat{\mc{E}}(\theta, S)]\) as
  \begin{equation}
    \label{eq:upper-bound-of-estimation-error-full-first-eq}
    \E_{S \sim \mc{N}^M}\E_{\mu \sim \rho}\E_{(\bm{x}, \bm{y}) \sim \mu^{n}}\left( \E_{(x, y) \sim \mu}\ell(\scal{\omega_{\theta, S}(\bm{x}, \bm{y})}{\phi_{\theta, S}(x)}, y) - \hat{\ell}_{\theta}(\bm{x}, \bm{y}, S) \right).
  \end{equation}
  This is bounded in \cite[Thm. 6]{maurer09_trans_bound_linear_featur_learn}, and
  we follow similarly. For a fixed \(\theta \in \Theta\) and any sample \(S\), let
  \(\mc{W} = \{w : \norm{w}_{\theta, S} \leq \lambda^{-1/2}\}\). By
  \cref{prop:bounding-task-loss-terms}, we have that for any dataset \(\bm{z}\) of
  size \(n\) generated according to our assumptions, for any
  \(\theta \in \Theta\), \(\norm{\omega_{\theta, S}(\bm{z})}_{\theta, S} \leq \lambda^{-1/2}\).
  Thus, for any \(\mu \in M_{1}(\mc{X} \times [0, 1])\),
  \begin{align}
    \E_{(\bm{x}, \bm{y}) \sim \mu^{n}} & \left( \E_{(x, y) \sim \mu}\ell(\scal{\omega_{\theta, S}(\bm{x}, \bm{y})}{\phi_{\theta, S}(x)}, y) - \hat{\ell}_{\theta}(\bm{x}, \bm{y}, S) \right) \label{eq:upper-bound-of-estimation-error-full-inner-term} \\
                                       & \leq \E_{(\bm{x}, \bm{y}) \sim \mu^{n}}\sup_{w \in \mc{W}}\left( \E_{(x, y) \sim \mu}\ell(\scal{w}{\phi_{\theta, S}(x)}, y) - \frac{1}{n}\sum_{i=1}^{n}\ell(\scal{w}{\phi_{\theta, S}(x_{i})}, y_{i})\right)\\
                                       & = \E_{(\bm{x}, \bm{y}) \sim \mu^{n}}\sup_{\norm{w}_{\theta, S} \leq \lambda^{-1/2}}\left( \E_{(x, y) \sim \mu}\ell(\scal{w}{\phi_{\theta, S}(x)}, y) - \frac{1}{n}\sum_{i=1}^{n}\ell(\scal{w}{\phi_{\theta, S}(x_{i})}, y_{i})\right)\\
                                       & = \E_{(\bm{x}, \bm{y}) \sim \mu^{n}}\sup_{v\: : \: \norm{v}_{\theta, S} \leq 1}\left( \E_{(x, y) \sim \mu}\ell(\lambda^{-1/2}\scal{v}{\phi_{\theta, S}(x)}, y) - \frac{1}{n}\sum_{i=1}^{n}\ell(\lambda^{-1/2}\scal{v}{\phi_{\theta, S}(x_{i})}, y_{i})\right)\\
                                       & = \E_{(\bm{x}, \bm{y}) \sim \mu^{n}}\sup_{f \in \mc{F}}\left( \E_{(x, y) \sim \mu}f(z) - \frac{1}{n}\sum_{i=1}^{n}f(z_{i})\right)
  \end{align}
  where we have the family of functions
  \(\mc{F} = \{z = (x, y) \mapsto \ell(\lambda^{-1/2}\scal{v}{\phi_{\theta, S}(x)}, y) \:  : \: \norm{v}_{\theta, S} \leq 1\}\).
  By \cref{thm:rad-gauss-complexity-results} we can upper bound this by the
  Rademacher complexity, getting the upper bound
  \begin{equation}
    \label{eq:20}
    \E_{(\bm{x}, \bm{y}) \sim \mu^{n}}\sup_{f \in \mc{F}}\left( \E_{(x, y) \sim \mu}f(z) - \frac{1}{n}\sum_{i=1}^{n}f(z_{i})\right) \leq \E_{\bm{z} \sim \mu^{n}}\mc{R}(\mc{F}(\bm{z}))
  \end{equation}

  Furthermore, by assumption \(y \in [0, 1]\) and \(\ell(\cdot, y)\) has a Lipschitz
  constant upper bounded by \(\Lip(L)\) when we consider
  \(\domain(\ell(\cdot, y)) = [-L, L]\). By Cauchy--Schwarz and
  \(\norm{v}_{\theta, S} \leq 1\), we
  have that
  \(\lambda^{-1/2}\scal{v}{\phi_{\theta, S}(x)}_{\theta, S} \in [-\lambda^{-1/2}, \lambda^{-1/2}]\)
  and so \(L = \lambda^{-1/2}\). Letting \(\phi_{i}: t \mapsto \ell(t, y_{i})\)
  with domain \([-\lambda^{-1/2}, \lambda^{-1/2}]\) then the Lipschitz constant is
  \(\Lip(\lambda^{-1/2})\). We let
  \(\mc{G} = \{z = (x, y) \mapsto \lambda^{-1/2}\scal{v}{\phi_{\theta, S}(x)}_{\theta, S} \: : \: \norm{v}_{\theta, S} \leq 1\}\).

  Since \(\mc{F} = \phi \circ \mc{G}\) we have by
  \cref{cor:gauss-complexity-lipschitz-composition} that
  \begin{align}
    \label{eq:21}
    \E_{(\bm{x}, \bm{y}) \sim \mu^{n}} \left( \E_{(x, y) \sim \mu}\ell(\scal{\omega_{\theta, S}(\bm{x}, \bm{y})}{\phi_{\theta, S}(x)}, y) - \hat{\ell}_{\omega_{\theta, S}}(\bm{x}, \bm{y}) \right) & \leq \E_{\bm{z} \sim \mu^{n}}\mc{R}(\phi \circ \mc{G}(\bm{z}))\\                                                                          & \leq \Lip(\lambda^{-1/2})\E_{\bm{z} \sim \mu^{n}}\mc{R}(\mc{G}(\bm{z})).
  \end{align}
  We can further bound \(\E_{\bm{z} \sim \mu^{n}}\mc{R}(\mc{G}(\bm{z}))\) using a standard RKHS Rademacher complexity argument.
  
  By standard arguments of Rademacher complexity of kernels such that \(K(x, x) = 1\) we have the bound
  \begin{equation}
    \label{eq:22}
    \E_{\bm{z} \sim \mu^{n}}\mc{R}(\mc{G}(\bm{z})) \leq \frac{2\lambda^{-1/2}}{\sqrt{n}}
  \end{equation}

Substituting the upper bounds \(\Lip(\lambda^{-1/2})\E_{\bm{z} \sim \mu^{n}}\mc{R}(\mc{G}(\bm{z}))\) or \(\frac{2\lambda^{-1/2}\Lip(\lambda^{-1/2})}{\sqrt{n}}\) of \cref{eq:upper-bound-of-estimation-error-full-inner-term} and combining everything we have
  \begin{align}
    \E_{S \sim \mc{N}^M}[\mc{E}(\hat{\theta}, S) - \hat{\mc{E}}(\hat{\theta}, S)] & \leq \Lip(\lambda^{-1/2})\E_{S \sim \mc{N}^{M}}\E_{\bm{z} \sim \hat{\rho}}\mc{R}(\mc{G}(\bm{z})) \\
    \E_{S \sim \mc{N}^M}[\mc{E}(\hat{\theta}, S) - \hat{\mc{E}}(\hat{\theta}, S)] & \leq \frac{2\lambda^{-1/2}\Lip(\lambda^{-1/2})}{\sqrt{n}} \label{eq:bound-on-A/E-full}
  \end{align}
\end{proof}

We note the following about the bound above. The bound \(\E_{\bm{z} \sim \hat{\rho}}\mc{R}(\mc{G}(\bm{z})) \leq \frac{2\lambda^{-1/2}}{\sqrt{n}}\) is standard and applies to all kernels such that \(K(x, x) = 1\).
However, in the benign case that \(\Lip(\lambda^{-1/2})\E_{S \sim \mc{N}^M}\E_{\bm{z} \sim \hat{\rho}}\mc{R}(\mc{G}(\bm{z})) \ll \frac{2\lambda^{-1/2}\Lip(\lambda^{-1/2})}{\sqrt{n}}\), using IKML would lead to a term much smaller than fixing a kernel and learning the tasks independently.

\subsection{Predicting the empirical error for the future task}
In this section we focus on the terms \((B), (D)\), each of the form
\(\E_{S \sim \mc{N}^M}[\hat{\mc{E}}_{T}(\theta, S) - \hat{\mc{E}}(\theta, S)]\) where
\(\theta \in \Theta\). To control this term we use \cite[Sec. 4.2]{maurer16}. We want to control the term \(\hat{\mc{E}}_{T}(\theta, S) - \hat{\mc{E}}(\theta, S)\) using a uniform bound of the form
\begin{equation}
    \hat{\mc{E}}_{T}(\theta, S) - \hat{\mc{E}}(\theta, S) \leq \sup_{\theta' \in \Theta}\left(\hat{\mc{E}}_{T}(\theta', S) - \hat{\mc{E}}(\theta', S)\right) = \sup_{\theta' \in \Theta}\left(\frac{1}{T}\sum_{t=1}^T \hat{\ell}_{\theta'}(\bm{z}^t, S) - \E_{\bm{z} \sim \hat{\rho}}\hat{\ell}_{\theta'}(\bm{z}, S)\right),
\end{equation}
where \(\bm{z}^t = (\bm{x}^t, \bm{y}^t) \sim \mu_t^n\) and we let \(\bm{Z} = (\bm{z}^t)_{t=1}^T \sim \hat{\rho}^T\).
This enables us to control both \((B)\) involving the ERM parameter \(\hat{\theta}\) and \((D)\) involving \(\theta^*\). We switch the order of the terms to get the standard form \(\sup_{f \in \mc{F}}\E f(x) - \frac{1}{n}\sum_{i=1}^n f(x_i)\). We define the loss class \(\mc{F} = \{f : \mc{Z}^n \to \R_{\geq 0} , \: f(\bm{z}) = \hat{\ell}_{\theta}(\bm{z}, S), \: \forall \theta \in \Theta\}\) and note that it implicitly depends on the random feature sample \(S\).

\begin{theorem}
  For any \(\theta \in \Theta\), squared error \(\ell(y, \hat{y}) = (y - \hat{y})^2\) such that for all \(y \in [0, 1]\), \(\ell(\cdot, y) : [-L, L] \to \R_{+}\) has Lipschitz constant \(\Lip(L)\), with \(\omega\) being KRR with regularization parameter \(\lambda > 0\) and RKHS induced by \(K_{\theta}\), with probability greater than \(1 - \delta\) over the choice of meta-train set \(\overline{\bm{Z}} \in (\mc{Z}^n)^T\) we have
  \begin{equation}
    \label{eq:12}
    \E_{S \sim \mc{N}^M}[\hat{\mc{E}}_{T}(\theta, S) - \hat{\mc{E}}(\theta, S)] \leq \E_{S \sim \mc{N}^M}\E_{\bm{Z} \sim \hat{\rho}^T} \mc{R}(\mc{F}(\bm{Z})) + \sqrt{\frac{\log(1/\delta)}{2T}}.
  \end{equation}
\end{theorem}

\begin{proof}
  Relating our setting to \cite{maurer16}, for some \(S\) we let the loss function class be
  \begin{equation}
    \mc{F} = \{f : \mc{Z}^n \to \R_{\geq 0} , \: f(\bm{z}) = \hat{\ell}_{\theta}(\bm{z}, S), \: \forall \theta \in \Theta\},
  \end{equation}
  and for \(\overline{\bm{Z}} \in (\mc{Z}^n)^T\), we denote \(\Phi(\overline{\bm{Z}}) = \sup_{f \in \mc{F}}\left(\E_{\bm{z} \sim \hat{\rho}} f(\bm{z}) - \frac{1}{T}\sum_{t=1}^T f(\overline{\bm{Z}}_t)\right)\). By item 2 of \cref{thm:rad-gauss-complexity-results}, since \(\hat{\ell}_{\theta}(\bm{z}, S) \in [0, 1]\), for any \(\delta > 0\) we have with probability greater than \(1 - \delta\) over \(\overline{\bm{Z}} \sim \hat{\rho}^T\) that
  \begin{equation}
    \Phi(\overline{\bm{Z}}) \leq \E_{\bm{Z} \sim \hat{\rho}^T} \mc{R}(\mc{F}(\bm{Z})) + \sqrt{\frac{\log(1/\delta)}{2T}},
  \end{equation}
  so we focus on controlling the Rademacher complexity \(\E_{\bm{Z} \sim \hat{\rho}^T}\mc{R}(\mc{F}(\bm{Z}))\). We follow \cite[Sec. 3.3]{maurer16}; since our notation differs, the translation is as follows (\cite[Sec. 3.3]{maurer16} \(\to\) this work): \(\mc{H} \to \{\psi_{\theta, S}(\cdot), \: \forall \theta \in \Theta\}\), \(\psi_t(\cdot) \to \hat{\ell}_{(\cdot), S}(\bm{z}^t)\), \(\E_{t \sim \rho}[\psi_{t}(h)] \to \hat{\mc{E}}(\theta)\), \(\frac{2}{n}R(\mc{H}, \bar{\bm{x}}) \to \mc{R}(\mc{F}(\overline{\bm{Z}}))\), \(\phi_t(h) \to \Phi_{\theta, S}\), where we will specify \(\Phi_{\theta, S}\) below.
  To apply \cite[Thm. 2]{maurer16} we need to find a ``Lipschitz'' constant \(L\) such that
  \begin{equation}
    \hat{\ell}_{\theta, S}(\bm{z}^t) - \hat{\ell}_{\theta', S}(\bm{z}^t) \leq \frac{L}{\sqrt{n}}\norm{\Phi_{\theta, S} - \Phi_{\theta', S}}_{F},
  \end{equation}
  where \((\Phi_{\theta, S})_{:, j} = \phi_{\theta, S}(x_j)\) and \(\Phi_{\theta, S} \in \R^{2M \times n}\), i.e., \(\Phi_{\theta, S}\) is the feature matrix of the kernel \(\bm{G}_{\theta}(\bm{x}, S)\) so that \(\bm{G}_{\theta}(\bm{x}, S) = \Phi_{\theta, S}\tran \Phi_{\theta, S}\). We proceed as follows: assume that for loss \(\ell(\cdot, \cdot)\) we have kernel stability
  \begin{equation}
    \hat{\ell}_{\theta, S}(\bm{z}) - \hat{\ell}_{\theta', S}(\bm{z}) \leq \frac{L_{\ell}}{n}\norm{\bm{G}_{\theta}(\bm{x}, S) - \bm{G}_{\theta'}(\bm{x}, S)}_{F},
  \end{equation}
  this holds true for the least squares loss with \(L_{\ell} = 2\lambda^{-1}\) following similarly from the proof of \cref{prop:bounding-task-loss-terms}. Since \(\Phi_{\theta, S}\tran \Phi_{\theta, S} = \bm{G}_{\theta}(\bm{x}, S)\) for \(\theta\) and similarly for \(\theta'\) we can write
  \begin{equation}
    \norm{\bm{G}_{\theta}(\bm{x}, S) - \bm{G}_{\theta'}(\bm{x}, S)}_{F} = \norm{\Phi_{\theta, S}\tran \Phi_{\theta, S} - \Phi_{\theta', S}\tran \Phi_{\theta', S}}_{F}
  \end{equation}
  and letting \(M_{\max} = \max(\norm{\Phi_{\theta, S}}_{F}, \norm{\Phi_{\theta', S}}_{F})\), using the matrix identity \(A\tran A - B\tran B = A\tran(A - B) + (A - B)\tran B\), we can upper bound it
  \begin{align}
    \norm{\Phi_{\theta, S}\tran \Phi_{\theta, S} - \Phi_{\theta', S}\tran \Phi_{\theta', S}}_{F} & \leq \norm{\Phi_{\theta, S}}_{F} \norm{\Phi_{\theta, S} - \Phi_{\theta', S}}_{F} + \norm{\Phi_{\theta', S}}_{F} \norm{\Phi_{\theta', S} - \Phi_{\theta, S}}_{F} \leq 2M_{\max}\norm{\Phi_{\theta', S} - \Phi_{\theta, S}}_{F}.
  \end{align}
  Now
  \begin{align}
    \norm{\Phi_{\theta, S}}_{F}^2 = \frac{1}{M}\sum_{k=1}^{M}\sum_{j=1}^{n}(\sin(\psi_{\theta}(s_k)\tran x_j)^2 + \cos(\psi_{\theta}(s_k)\tran x_j)^2) = \frac{1}{M}\sum_{k=1}^M\sum_{j=1}^n 1 = n,
  \end{align}
  which means that \(M_{\max} = \sqrt{n}\) and thus we have the sought Lipschitz property with \(L = 2L_{\ell}\),
  \begin{equation}
    \hat{\ell}_{\theta, S}(\bm{z}^t) - \hat{\ell}_{\theta', S}(\bm{z}^t) \leq \frac{L_{\ell}}{n} \cdot 2\sqrt{n}\,\norm{\Phi_{\theta', S} - \Phi_{\theta, S}}_{F} = \frac{2L_{\ell}}{\sqrt{n}}\norm{\Phi_{\theta', S} - \Phi_{\theta, S}}_{F}.
  \end{equation}
  From this we have that
  \begin{equation}
    \label{eq:mtl-rademacher}
    \mc{R}(\mc{F}(\overline{\bm{Z}})) \leq \frac{2L_{\ell}}{T\sqrt{nM}}\left(\E_{\epsilon}\sup_{\theta \in \Theta}\sum_{t, k, i}^{T, M, n}\epsilon_{tki}(\sin(\psi_{\theta}(s_k)\tran x_i^t) + \cos(\psi_{\theta}(s_k)\tran x_i^t))\right).
  \end{equation}
\end{proof}

\subsection{Random feature error}
In this section we show how to control the term \((F)\) in the bound, the term
\begin{equation}
  \label{eq:m1}
  \E_{S \sim \mc{N}^M}[\mc{E}(\theta^{*}, S) - \mc{E}(\theta^{*})].
\end{equation}
Remember that for a dataset \(\bm{x} \in \mc{X}^{n}\), random features \(S\) and \(\theta \in \Theta\), we let \(\bm{g}(x) = (K_{\theta}(x_{i}, x))_{i=1}^{n}\) and similarly \(\bm{g}(x, S) = (K_{\theta, S}(x_{i}, x))_{i=1}^{n}\).

We first need some additional results
\begin{theorem}[\citep{tropp19_matrix_concen_comput_linear_algeb}, Theorem 2.1]
  \label{thm:tropp19-rff}
  Let \(A\) be a self-adjoint matrix of size \(n\). Given an iid sample \((R_{k})_{k=1}^{M}\) of self-adjoint matrices such that \(\E R_{1} = A\) and \(\norm{R_{1}}_{\infty} \leq B\), let \(m_{2}(R_{1}) = \norm{\E R_{1}^{2}}_{\infty}\) and \(\bar{R}_{M} = \frac{1}{M}\sum_{k=1}^{M}R_{k}\). Then
  \begin{equation}
    \label{eq:sampling-expectation-bound}
    \E\norm{\bar{R}_{M} - A} \leq \sqrt{\frac{2m_{2}(R_{1})\log(2n)}{M}} + \frac{2B\log(2n)}{3M}.
  \end{equation}
\end{theorem}

\begin{lemma}
  \label{lem:g-bound}
  For any \(\bm{x} \in \mc{X}^{n}\), any \(\theta \in \Theta\), \(\E_{S \sim \mc{N}^M}\norm{\bm{g}(x) - \bm{g}(x, S)}_{2} \leq 2n^{1/2}M^{-1/2}\)
\end{lemma}

\begin{proof}
We have that
  \begin{align}
    \label{eq:8}
    \E_{S \sim \mc{N}^M}\norm{\bm{g}(x) - \bm{g}(x, S)}_{2} & \leq \sqrt{\E_{S \sim \mc{N}^M}\norm{\bm{g}(x) - \bm{g}(x, S)}_{2}^{2}}  = \sqrt{\sum_{i=1}^{n}\E_{S\sim \mc{N}^{M}}(\bm{g}(x)_{i} - \frac{1}{M}\sum_{k=1}^{M}\bm{g}(x, s_{k})_{i})^{2}}.
  \end{align}
  Define \(T_{i,k} = K_{\theta}(x, x_{i}) - K_{\theta, s_{k}}(x, x_{i})\), then \(\E(T_{i, k}) = 0\), for any \(i \in [n]\), \((T_{i, k})_{k=1}^{M}\) is an iid sample of random variables and finally \(\abs{T_{i, k}} \leq \abs{K_{\theta}(x, x_{i})} + \abs{K_{\theta, s_{k}}(x, x_{i})} \leq 2\). We can then express
  \begin{align}
    \label{eq:11}
    \sum_{i=1}^{n}\E_{S \sim \mc{N}^{M}}(\bm{g}(x)_{i} - \frac{1}{M}\sum_{k=1}^{M}\bm{g}(x, s_{k})_{i})^{2} & = \sum_{i=1}^{n}\E_{S \sim \mc{N}^{M}}\left(\frac{1}{M}\sum_{k=1}^{M}T_{i, k}\right)^{2}= M^{-2}\sum_{i=1}^{n}\E_{S \sim \mc{N}^{M}}\left(\sum_{k=1}^{M}T_{i, k}\right)^{2}.
  \end{align}
  For arbitrary \(i \in [n] \), we see that
  \begin{align}
    \label{eq:24}
    \E_{S \sim \mc{N}^{M}}\left(\sum_{k=1}^{M}T_{i, k}\right)^{2} & \leq \sum_{k=1}^{M}\E_{s_{k} \sim \mc{N}}T_{i, k}^{2} + 2\sum_{k < l}\E_{s_{k}, s_{l} \sim \mc{N}}T_{i, k}T_{i, l}\\
                                                                  & = \sum_{k=1}^{M}\E_{s_{k} \sim \mc{N}}T_{i, k}^{2} + 2\sum_{k < l}\E_{s_{k} \sim \mc{N}}T_{i, k}\E_{s_{l} \sim \mc{N}}T_{i, l} = \sum_{k=1}^{M}\E_{s_{k} \sim \mc{N}}T_{i, k}^{2},
  \end{align}
  where we used the fact that \(T_{i, k}\) is zero-mean and for fixed \(i \in [n]\), \(T_{i, k}, T_{i, l}\) are independent. Since \(\abs{T_{i, k}} \leq 2\) we see that \(\abs{T_{i, k}}^{2} \leq 4\), hence
  \begin{equation}
    \label{eq:25}
    \E_{S \sim \mc{N}^{M}}\left(\sum_{k=1}^{M}T_{i, k}\right)^{2} \leq 4M.
  \end{equation}
  Thus we see that
  \begin{equation}
    \label{eq:26}
    \E_{S \sim \mc{N}^M}\norm{\bm{g}(x) - \bm{g}(x, S)}_{2} \leq \sqrt{\E_{S \sim \mc{N}^M}\norm{\bm{g}(x) - \bm{g}(x, S)}_{2}^{2}} \leq 2n^{1/2}M^{-1/2}.
  \end{equation}
\end{proof}

Following \cite[Section 2.2]{tropp19_matrix_concen_comput_linear_algeb} we have the following result
\begin{lemma}
  \label{lem:G-bound}
  For any \(\bm{x} \in \mc{X}^{n}\), any \(\theta \in \Theta\), \(\E_{S \sim \mc{N}^M}\norm{\bm{G}(\bm{x}) - \bm{G}(\bm{x}, S)}_{\infty} \leq \sqrt{\frac{4\norm{\bm{G}(\bm{x})}_{\infty}n\log(2n)}{M}} + \frac{2n\log(2n)}{4M}\)
\end{lemma}
\begin{proof}
  Note that due to the identity \(\cos(x - y) = \sin(x)\sin(y) + \cos(x)\cos(y)\) we have \(K_{\theta}(x, y) = \E_{s \sim \mc{N}}\cos(\scal{x}{\psi_{\theta}(s)} - \scal{y}{\psi_{\theta}(s)}) = \E_{s \sim \mc{N}}\phi_{\theta, s}(x)\tran \phi_{\theta, s}(y)\). For the sample \(\bm{x} = (x_{i})_{i=1}^{n}\) let \(Z_{k, i, :} = \phi_{\theta, s_k}(x_{i})\) be the matrix of features corresponding to sample \(s_{k}\), and let \(Z_{k}^{l}\) be the \(l\)'th column of \(Z_{k}\). Thus we have that \(\E Z_{k}Z_{k}\tran = \bm{G}(\bm{x})\) and \(\bm{G}(\bm{x}, S) = \frac{1}{M}\sum_{k=1}^{M}Z_{k}Z_{k}\tran\).

  To put this in the notation of \cref{thm:tropp19-rff} we let \(\bar{R}_{M} = \bm{G}(\bm{x}, S)\) and \(A = \bm{G}(\bm{x})\). To invoke \cref{thm:tropp19-rff} we need to upper bound the quantities \(\norm{Z_{k}Z_{k}\tran}_{\infty}\) and \(m_{2}(Z_{k}Z_{k}\tran)\). For the first term
  \begin{equation}
    \label{eq:27}
    \norm{Z_{k}Z_{k}\tran}_{\infty} \leq \norm{Z_{k}}_{\infty}\norm{Z_{k}}_{\infty} \leq \norm{Z_{k}}_{F}^{2} = \sum_{i=1}^{n}\left(\cos(\scal{x_i}{\psi_{\theta}(s_{k})})^{2} + \sin(\scal{x_i}{\psi_{\theta}(s_{k})})^{2}\right) = n.
  \end{equation}
  For the second term, consider \(\E(Z_{k}Z_{k}\tran)^{2}\). We can rewrite this in the form \(\E Z_{k}CZ_{k}\tran\) where \(C = Z_{k}\tran Z_{k}\) hence symmetric and psd. We can write this as a sum
  \begin{equation}
    \label{eq:28}
    Z_{k}CZ_{k}\tran = C_{11}Z_{k}^{1}(Z_{k}^{1})\tran + C_{22}Z_{k}^{2}(Z_{k}^{2})\tran + C_{12}(Z_{k}^{1}(Z_{k}^{2})\tran + Z_{k}^{2}(Z_{k}^{1})\tran).
  \end{equation}
  We can bound \(0 \leq C_{11} = \norm{Z_{k}^{1}}^{2}_{2} \leq n, 0 \leq C_{22} = \norm{Z_{k}^{2}}^{2}_{2} \leq n\) and \(\abs{C_{12}} = \abs{\scal{Z_{k}^{1}}{Z_{k}^{2}}} \leq n\). Using the identity \(a b\tran + b a\tran = \frac{1}{2}((a + b)(a + b)\tran - (a - b)(a - b)\tran)\) we can then express \(Z_{k}CZ_{k}\tran\) as a sum of four psd matrices (with possibly negative coefficients)
  \begin{equation}
    \label{eq:29}
    Z_{k}CZ_{k}\tran = C_{11}Z_{k}^{1}(Z_{k}^{1})\tran + C_{22}Z_{k}^{2}(Z_{k}^{2})\tran + \frac{C_{12}}{2}(Z_{k}^{1} + Z_{k}^{2})(Z_{k}^{1} + Z_{k}^{2})\tran - \frac{C_{12}}{2}(Z_{k}^{1} - Z_{k}^{2})(Z_{k}^{1} - Z_{k}^{2})\tran,
  \end{equation}
  then we see that we can majorize \(Z_{k}CZ_{k}\tran\) by the matrix \(nZ_{k}^{1}(Z_{k}^{1})\tran + nZ_{k}^{2}(Z_{k}^{2})\tran + \frac{n}{2}(Z_{k}^{1} + Z_{k}^{2})(Z_{k}^{1} + Z_{k}^{2})\tran + \frac{n}{2}(Z_{k}^{1} - Z_{k}^{2})(Z_{k}^{1} - Z_{k}^{2})\tran\) in the Loewner order and expanding this majorant we see that \((Z_{k}Z_{k}\tran)^{2} \preceq 2n Z_{k}Z_{k}\tran\) where we let \(\preceq\) be the Loewner order on psd matrices. It follows that \(m_{2}(Z_{k}Z_{k}\tran) = \norm{\E(Z_{k}Z_{k}\tran)^{2}}_{\infty} \leq 2n\norm{\E Z_{k}Z_{k}\tran}_{\infty} = 2n \norm{\bm{G}(\bm{x})}_{\infty}\).
\end{proof}

We are now ready to state the theorem of the random feature error
\begin{theorem}[Random feature error]
  For any \(\theta \in \Theta\), any loss \(\ell\) such that for all \(y \in [0, 1]\), \(\ell(\cdot, y) : [-L, L] \to \R_{+}\) has Lipschitz constant \(\Lip(L)\), with \(\omega\) being KRR with regularization parameter \(\lambda > 0\) and RKHS induced by \(K_{\theta}\) we have that
  \begin{align}
    \label{eq:rff-error}
    \E_{S \sim \mc{N}^M}[\mc{E}(\theta, S) - \mc{E}(\theta)] & \leq 2\Lip(\lambda^{-1/2})\lambda^{-1}M^{-1/2}\\
    & + 2\Lip(\lambda^{-1/2})\lambda^{-2}M^{-1/2}n^{-1/2}\sqrt{\log(2n)\E_{\mu \sim \rho}\E_{\bm{x} \sim \mu^{n}}\norm{\bm{G}(\bm{x})}_{\infty}}\\
    & + \frac{1}{2}\Lip(\lambda^{-1/2})\lambda^{-2}M^{-1}n^{-1/2}\log(2n)
  \end{align}
\end{theorem}

\begin{proof}
  For any \(\theta\) we can bound
  \begin{align}
  \nonumber
    \E_{S \sim \mc{N}^M}\mc{E}(\theta, S) - \mc{E}(\theta) & = \E_{\mu \sim \rho}\E_{(\bm{x}, \bm{y}) \sim \mu^{n}}\E_{(x, y) \sim \mu}\E_{S \sim \mc{N}^M}(\ell(\scal{\omega_{\theta, S}(\bm{x}, \bm{y})}{\phi_{\theta, S}(x)}, y) - \ell(\scal{\omega_{\theta}(\bm{x}, \bm{y})}{\phi_{\theta}(x)}, y)) \\  \nonumber
                                                           & \leq \Lip(\lambda^{-1/2})\E_{\mu \sim \rho}\E_{(\bm{x}, \bm{y}) \sim \mu^{n}}\E_{x \sim \mu_{\mc{X}}}\E_{S \sim \mc{N}^M}\abs{\scal{\omega_{\theta, S}(\bm{x}, \bm{y})}{\phi_{\theta, S}(x)} - \scal{\omega_{\theta}(\bm{x}, \bm{y})}{\phi_{\theta}(x)}}.
  \end{align}
  Then
  \begin{align}
    \E_{S \sim \mc{N}^M}\abs{\scal{\omega_{\theta, S}(\bm{x}, \bm{y})}{\phi_{\theta, S}(x)} - \scal{\omega_{\theta}(\bm{x}, \bm{y})}{\phi_{\theta}(x)}} & = \E_{S \sim \mc{N}^M}\abs{\bm{y}\tran(\bm{G}_{\lambda}(\bm{x})^{-1}\bm{g}(x) - \bm{G}_{\lambda}(\bm{x}, S)^{-1}\bm{g}(x, S))}\\
                                                                                            & \leq \E_{S \sim \mc{N}^M}\norm{\bm{y}}\norm{\bm{G}_{\lambda}(\bm{x})^{-1}\bm{g}(x) - \bm{G}_{\lambda}(\bm{x}, S)^{-1}\bm{g}(x, S)}\\
                                                                                            & \leq \sqrt{n}\E_{S \sim \mc{N}^M}\norm{\bm{G}_{\lambda}(\bm{x})^{-1}\bm{g}(x) - \bm{G}_{\lambda}(\bm{x}, S)^{-1}\bm{g}(x, S)}.
  \end{align}
  Using the matrix identity \(AB - CD = A(B - D) + (A - C)D\), the
  triangle inequality, together with  \cref{lem:maurer09-psd-operator-facts} we get
  \begin{align}
\nonumber \norm{\bm{G}_{\lambda}(\bm{x})^{-1}\bm{g}(x) {-} \bm{G}_{\lambda}(\bm{x}, S)^{-1}\bm{g}(x, S)} &\leq \norm{\bm{G}_{\lambda}(\bm{x})^{-1}}_{\infty}\norm{\bm{g}(x) {-} \bm{g}(x, S)}_{2}+ \norm{\bm{G}_{\lambda}(\bm{x})^{-1} {-} \bm{G}_{\lambda}(\bm{x}, S)^{-1}}_{\infty}\norm{\bm{g}(x, S)}_{2}\\ & \leq  (n \lambda)^{-1}\norm{\bm{g}(x) - \bm{g}(x, S)}_{2} + (n\lambda)^{-2}\norm{\bm{G}(\bm{x}) - \bm{G}(\bm{x}, S)}_{\infty}\sqrt{n}
    \label{eq:rff-semi-bound}
  \end{align}
  and so we consider the terms \(\E_{S \sim \mc{N}^M}\norm{\bm{g}(x) {-} \bm{g}(x, S)}_{2}\) and \(\E_{S \sim \mc{N}^M}\norm{\bm{G}(\bm{x}) {-} \bm{G}(\bm{x}, S)}_{\infty}\). Using \cref{lem:g-bound} and \cref{lem:G-bound} we can upper bound \cref{eq:rff-semi-bound} (together with factor \(\sqrt{n}\)) as
  \begin{align}
    \label{eq:31}
    \sqrt{n}\E_{S \sim \mc{N}^M}\norm{\bm{G}_{\lambda}(\bm{x})^{-1}\bm{g}(x) - \bm{G}_{\lambda}(\bm{x}, S)^{-1}\bm{g}(x, S)} & \leq 2\lambda^{-1}M^{-1/2}
    + 2\lambda^{-2}M^{-1/2}n^{-1/2}\sqrt{\log(2n)\norm{\bm{G}(\bm{x})}_{\infty}}\\
    & + \frac{1}{2}\lambda^{-2}M^{-1}n^{-1/2}\log(2n)
  \end{align}

  The final bound follows by pulling \(\E_{\mu \sim \rho}\E_{\bm{z} \sim \mu^{n}}\) into the square root using Jensen and multiplying by \(\Lip(\lambda^{-1/2})\).

\end{proof}

Combining the above we have that
\begin{theorem}[IKML Excess risk bound]
  Assume that \(\mc{X} \times \mc{Y} \subseteq \R^d \times [0, 1]\) and \(\ell(y, \hat{y}) = (y - \hat{y})^2\). Let \(\mc{G}_{future} = \{z = (x, y) \mapsto \lambda^{-1/2}\scal{v}{\phi_{\theta, S}(x)}_{\theta, S} \: : \: \norm{v}_{\theta, S} \leq 1\}\) and \(\mc{F} = \{f : \mc{Z}^n \to \R_{\geq 0} , \: f(\bm{z}) = \hat{\ell}_{\theta}(\bm{z}, S), \: \forall \theta \in \Theta\}\), and let \(\overline{\bm{Z}} = (\bm{x}^{t}, \bm{y}^{t})_{t=1}^{T} \sim \hat{\rho}^{T}\) then with probability greater than \(1 - \delta\) over the sampling of \(\overline{\bm{Z}}\)
  \begin{align}
    \E_{S \sim \mc{N}^M}[\mc{E}(\hat{\theta}, S) - \mc{E}(\theta^{*})] & \leq 2\Lip(\lambda^{-1/2})\E_{S \sim \mc{N}^{M}}\E_{\bm{z} \sim \hat{\rho}}\mc{R}(\mc{G}_{future}(\bm{z})) \label{eq:final-1}\\
                                                                       & + \E_{S \sim \mc{N}^M}\E_{\bm{Z} \sim \hat{\rho}^T} \mc{R}(\mc{F}(\bm{Z})) + \sqrt{\frac{\log(1/\delta)}{2T}} \label{eq:final-2}\\
                                                                       & + \E_{S \sim \mc{N}^M}(\hat{\mc{E}}_{T}(\hat{\theta}, S) - \hat{\mc{E}}_{T}(\theta^{*}, S)) \label{eq:final-3}\\
                                                                       & + 2\Lip(\lambda^{-1/2})\lambda^{-1}M^{-1/2} \label{eq:final-4}\\
                                                                       & + 2\Lip(\lambda^{-1/2})\lambda^{-2}M^{-1/2}n^{-1/2}\sqrt{\log(2n)\E_{\mu \sim \rho}\E_{\bm{x} \sim \mu^{n}}\norm{\bm{G}(\bm{x})}_{\infty}} \label{eq:final-5}\\
                                                                       & + \frac{1}{2}\Lip(\lambda^{-1/2})\lambda^{-2}M^{-1}n^{-1/2}\log(2n) \label{eq:final-6}
  \end{align}
\end{theorem}

We note the following
\begin{itemize}
  \item \cref{eq:final-1} can be replaced by the looser bound \(4\Lip(\lambda^{-1/2})\lambda^{-1/2}n^{-1/2}\), obtained by applying \cref{eq:22}.
  \item We hypothesise that the term \(\E_{S \sim \mc{N}^M}\E_{\bm{Z} \sim \hat{\rho}^T} \mc{R}(\mc{F}(\bm{Z}))\) in  \cref{eq:final-2} is \(O(1/\sqrt{T})\) due to the standard form of Rademacher complexities of bounded balls in an RKHS.
  \item \cref{eq:final-3} is the optimization error and we assume that this is negligible.
  \item Since we are using the squared loss, \(\Lip(L) = 2(L + 1)\), and thus in all terms \(\Lip(\lambda^{-1/2}) = 2(\lambda^{-1/2} + 1) = O(\lambda^{-1/2})\).
\end{itemize}

\section{Datasets}
\label{app:datasets}

\subsection{Beijing Air Quality}
\label{app:air-quality-dataset}
The Beijing Air Quality dataset~\citep{zhang2017cautionary} is a time-series dataset
measuring air-quality and meteorological factors at 12 air-quality monitoring
sites. The meteorological data for each site is matched with the closest of
the available weather stations. The data was collected hourly over the period
March 1st, 2013 to February 28th, 2017.

Each datum consists of a timestamp, the site name and various features. We use
the feature of interest, ``PM2.5'', the fine particulate matter (PM) concentration, as the output and remove
the features ``PM10'', ``wd'' and ``WSPM'': ``PM10'' correlates heavily
with ``PM2.5'', ``wd'' is the direction of the wind
and thus categorical, and ``WSPM'' is the wind speed in that direction.
This leaves us with 9 features and one output feature.

The dataset was created as follows:
\begin{enumerate}
	\item Remove any rows with missing entries.
	\item	For each station, split the time-series into 3 sub-series of
	      \(64 / 16 / 20 \%\) starting at the beginning forming the meta-train,
	      validation and test sets.
\end{enumerate}

Tasks are sampled as follows:
\begin{enumerate}
	\item Sample a station uniformly at random.
	\item Given a train and validation size \(n = n_{\rm tr} + n_{\rm  val}\) sample a
	      contiguous sequence of size \(n\) at random from the available starting
	      points. We add a temporal feature \(t\) which is just an index from
	      \(1\) to \(n\) to encode temporal dependency local to the task.
	\item From this contiguous sequence randomly assign \(n_{\rm tr}\) datapoints to the train
	      set and \(n_{\rm  val}\) datapoints to the validation set.
\end{enumerate}

\subsection{Gas Sensor}
\label{app:gas-sensor-dataset}
The Gas Sensor Modulation
dataset \citep{burgues18_estim_limit_detec_semic_gas} is a collection of
multivariate timeseries collected in a controlled environment using MOX sensors
for CO detection. The sensors output voltage recordings sampled at a frequency
of 3.5 Hz.

Each timeseries can be chunked up into contiguous subseries corresponding to
experiments by looking at the heating cycle: the end of a cycle marks a
new experiment. We let each subsequence correspond to one task distribution from
which we sample \(n = n_{\rm tr} + n_{\rm  val}\) datapoints and permute the indices to
make the task into a supervised regression task. The output was chosen to be the
2nd feature 3 timesteps into the future, as this was seen to vary over the tasks and is not directly inferable from
one of the other features. In total there are 13 csv files with experiments. Each such file has a set number of experiments after preprocessing; we split each file's experiments into \(64 / 16 / 20 \%\) meta-train, validation and test splits.

The dataset was created as follows:
\begin{enumerate}
	\item All subsequences were extracted by locating the start and end of a
	      heating cycle.
	\item All extra features which were not gas sensors were dropped.
	\item Output feature isolated and lagged.
\end{enumerate}

Tasks are sampled as follows:
\begin{enumerate}
	\item For a new task we first sample one of the csv files uniformly at random.
	\item From the experiments in this csv file we sample a subsequence uniformly at random which
	      is the task-distribution.
	\item Add ``\(t\)'' to the features.
	\item From this subsequence we sample \(n = n_{\rm tr} + n_{\rm  val}\) points at
	      random which form our train and validation sets.
\end{enumerate}

\section{Experimental Results}
\label{app:exp-results}
\subsection{Hardware}
\label{app:exp-hardware}
All of the experiments were run on a single computer with specifications
\begin{description}
    \item[CPU] AMD Ryzen 7 3700X 8-Core Processor
    \item[GPU] NVIDIA GeForce RTX 2060 SUPER
    \item[RAM] 2x16GB DDR4 Vengeance
\end{description}

\subsection{Algorithms}
\label{app:exp-algorithms}
In this section we elaborate on the algorithms used.

MAML \citep{finn17_model} parameterizes a set of functions \(f_{\theta}: \mc{X} \to \mc{Y}\), typically a family of neural networks. For a new task \(\taskdataset\) it optimizes the objective \(\argmin_{\theta}\hat{\mc{E}}(f_{\theta}, \tasktrain)\) using gradient descent starting from the hyperparameter \(\theta_0\) so that the fine-tuned weight vector is a function of \(\theta_0\), \(\theta(\theta_0)\). In the outer loop it optimizes the objective \(\argmin_{\theta_0}\hat{\mc{E}}(f_{\theta(\theta_0)}, \taskval)\) using gradient descent.

R2D2 \citep{bertinetto18_meta_learn_with_differ_closed_form_solver} parameterizes a feature map \(\phi_{\theta}: \mc{X} \to \R^d\) which gives rise to a kernel \(\kernel{K}_{\theta}(x, x') = \scal{\phi_{\theta}(x)}{\phi_{\theta}(x')}\). The inner algorithm is KRR with \(\kernel{K}_{\theta}\). For a task \(\taskdataset\) it first does KRR in the inner loop giving the KRR estimator \(f_{\theta} = A_{\rm KRR}(\kernel{K}_{\theta}, \tasktrain)\) and in the outer loop it optimizes \(\argmin_{\theta}\hat{\mc{E}}(f_{\theta}, \taskval)\) using gradient descent.

LS Biased Regularization
\citep{denevi2019learning} performs biased ridge regression where the functions are given by \(f_{\theta}(x) = \scal{\theta}{x}\). For the inner algorithm it solves the biased ridge regression problem \(\argmin_{\theta} \frac{1}{n}\norm{X \theta - y}^2 + \lambda \norm{\theta - \theta_0}^2\) which has a closed form, see \citep{denevi2019learning}. For a task \(\taskdataset\) the algorithm first finds \(\theta(\theta_0)\) from \(\tasktrain\) using biased RR and in the outer loop it optimizes \(\argmin_{\theta_0}\hat{\mc{E}}(f_{\theta(\theta_0)}, \taskval)\) using gradient descent.

IKML-MLP is the same as IKML but uses the general random features representation of the kernel \(K(x, x') = \int_{\Omega}\varphi(x, \omega)\tran \varphi(x', \omega) \dl \tau(\omega)\). In this case, we let \(\varphi(\cdot, \omega): \R^d \to \R^o\) be an MLP with ReLU activation functions, some fixed hidden dimension \(h\) and an output dimension \(o\). Let \(D\) be the size of \(\omega\). In this case the feature map is complicated, so we opt for a simpler pushforward to make it easier to train. In particular, we let the pushforward take on a ``variational form'' by letting the pushforward \(\psi_{\theta}(s) = \psi_{\mu, \sigma}(s) = \mu + \sigma \odot s\) where \(s, \mu, \sigma \in \R^D\) and for two matrices \(A, B\), \((A \odot B)_{ij} = A_{ij}B_{ij}\) is the Hadamard product. We train using the same procedure as in Alg. 1. Of note is that this can be seen as an ensemble method over R2D2 where instead of ensembling over the learned functions we ensemble over kernel functions.

\subsection{Toy Regression}
\label{app:synthetic-experiment}

\par{\bf Setup}
We create a synthetic high-dimensional meta-learning regression setting where each task is sampled from an RKHS
\(\rkhs{H}\) with a ``complicated'' kernel \(\kernel{K}^o\). In particular, we choose
\(\kernel{K}^o\) to be the kernel given by Bochner's theorem and a
pushforward of a 3-layers Multi-Layer Perceptron (MLP) with 32 hidden units per layer, ReLU activation
functions and a 16-dimensional latent Gaussian distribution. The network was initialized with weights given by the PyTorch \citep{paszke2019pytorch} default initialization scaled by 100. Since this kernel lacks an analytic form, we sample 10000 frequencies and use the random features kernel in its place.

The tasks are generated in \(\rkhs{H}\) by independently sampling $R$ points $(x_r)_{r=1}^R$ with \(x_r \distas U_{[0, 0.2]^{d}}\) and \(R\) coefficients \((\alpha_{r})_{r=1}^{R}\) with
\(\alpha_{r} \distas U_{[0, 1]}\). Together they model the target regressor as
\(f(x) = \sum_{r=1}^{R}\alpha_{r}\kernel{K}^o(x, x_{r})\). We set $R=3$. The task
dataset is created by independently sampling \(n = n_{\rm tr} + n_{\rm val} = 50 + 50\) datapoints
\((x_{i},y_i)_{i=1}^{n}\) with \(x_{i} \distas U_{[0, 0.2]^{d}}\) and
\(y_{i} = f(x_{i})\).

\par{\bf Initial and Learned Kernel} For the same setup of the environment as in the synthetic experiments we look at how the initial and learned kernels differ from the true kernel. We do this for the algorithms \textit{IKML} and \textit{Gaussian MKL meta-KRR}. These algorithms were chosen since they define translation invariant kernels and are easy to visualize. We let all of the algorithms (R2D2, MAML, IKML) be parameterized by a 3-layer MLP while varying the dimensionality of \(x\). We also tried using 1 and 2-layer MLPs for the parameterization but the results were almost identical.

For an experiment with dimension \(d\) we visualize the kernels of Gaussian MKL meta-KRR and IKML by sampling 5 directions \((v_i)_{i=1}^5\) from the unit ball in \(\R^d\). For a direction \(v_i\) we plot the value of the kernel on the line with direction \(v_i\) where on the \(x\)-axis we have \(t\) from \(-0.4\) to \(0.4\) and on the \(y\)-axis the value of \(\kernel{K}(0, t \cdot v_i)\). We plot the result of the first run for each experiment, other runs look similar.

We plot the learning curves and kernel for each dimension 1, 5, 10, 20. For each row in \cref{fig:synthetic-3-layer-MLP} corresponding to a dimension \(d\) the \(i\)'th column plots kernels in the direction of \(v_i\) with the first row of the subplot corresponding to the kernels at initialization and the second row the kernels after training. The sample \((v_i)_{i=1}^5\) is resampled for each dimension. For IKML we sample 10000 frequencies once and fix them before plotting.

\subsection{Learning curves}
\label{sec:learning-curves}
We show the behaviour of the optimization trajectory of the algorithms R2D2, IKML and ANP. See \cref{fig:beijing-air-quality-learning-curves} and \cref{fig:gas-sensor-learning-curves}.

\begin{figure}[h!] \subfloat[][(a)]{\includegraphics[width=0.5\linewidth,height=0.4\linewidth]{images/air_quality/post_cross_validation/learning_curves_valid_new.png}\label{fig:beijing-air-quality-learning-curves}}
  \hfill \subfloat[][(b)]{\includegraphics[width=0.5\linewidth,height=0.4\linewidth]{images/gas_sensor/post_cross_validation/learning_curves_valid_new.png}\label{fig:gas-sensor-learning-curves}}
  \caption{Learning curves of meta-validation RMSE for the algorithms IKML, R2D2 and ANP for (a)
    Beijing Air Quality (25-shot), (b) Gas Sensor dataset (20-shot)
    over 5 runs (mean \(\pm\) 1 std). R2D2 and ANP were chosen due to their recency and performance as few-shot learning algorithms compared to all other algorithms evaluated.}
\end{figure}

\subsection{Cross-validation for Real-World Datasets}
\label{sec:cross-val}
We cross-validated R2D2, IKML and ANP on the 25-shot Air Quality and 20-shot Gas Sensor dataset
where we do a grid search over the number of hidden layers in an MLP with ReLU
activation function and the meta-learning rate. For IKML and R2D2 the number of hidden layers are
in \(\{1, 2, 3, 4, 6, 8, 10, 15, 20\}\) while for ANP we use the same
architecture for encoder and decoder and use \(\{1, 2, 3, 4\}\) layers. The hidden dimension is fixed to 64 and the meta-learning rate is in
\(\{10^{-4}, 10^{-5}\}\). The
training setup is the same as in the main body and the
metric is the RMSE on a holdout-set sampled from the meta-validation split of
the best model from the snapshots during the 30000 iterations. The results can be seen in \cref{tab:valid-rmse-for-r2d2-ikml-anp}.

\begin{table*}[b]
  \caption{Validation results for meta-hyperparameter configurations for IKML,
R2D2 \citep{bertinetto18_meta_learn_with_differ_closed_form_solver} and ANP
\citep{kim19_atten_neural_proces} on 25-shot Air Quality dataset and 20-shot Gas
Sensor. Best set of parameters in \textbf{bold}. We run the algorithms for 30000
iterations and evaluate it on the validation set at 250 intervals. We get the
validation RMSE on a holdout set (3000 tasks) using the model with the lowest
evaluation validation error.}
  \centering
  \begin{tabular}{llccccccccc}
	& & & \multicolumn{3}{c}{$25$-shot Air Quality} & \phantom{abc} &
																	  \multicolumn{3}{c}{$20$-shot Gas Sensor}\\
	\cmidrule{4-6} \cmidrule{8-10}
	Meta-lr & Layers & & IKML & R2D2 & ANP & & IKML & R2D2 & ANP \\
	\midrule
	$10^{-4}$ & 1 & & 101.65 & 8861.31 & 1390.14 & & 2.16 & 2.64 & 2.38 \\
	& 2 & & 98.37 & 13761.01 & 38.61 & & 2.14 & 1.85 & 1.72 \\
	& 3 & & 98.07 & 205.06 & \textbf{38.37} & & 2.14 & 1.65 & \textbf{1.53} \\
	& 4 & & 21.45 & 508.55 & 36.32 & & 2.11 & 1.49 & 1.57 \\
	& 6 & & 24.24 & 21.57 & -- & & 2.13 & \textbf{1.46} & -- \\
	& 8 & & 23.88 & 21.96 & -- & & \textbf{2.06} & 1.53 & -- \\
	& 10 & & 27.30 & \textbf{21.32} & -- & & 2.06 & 1.49 & -- \\
	& 15 & & 27.57 & 22.85 & -- & & 2.12 & 1.48 & -- \\
	& 20 & & 40.57 & 25.01 & -- & & 7.20 & 1.50 & -- \\
	\midrule
	$10^{-5}$ & 1 & & 125.75 & 3237.35 & 76.50 & & 19.53 & 6.45 & 7.83 \\
	& 2 & & 110.01 & 1233.41 & 41.75 & & 2.70 & 3.20 & 8.43 \\
	& 3 & & 76.58 & 431.61 & 47.24 & & 2.50 & 2.34 & 7.42 \\
	& 4 & & \textbf{19.05} & 57.37 & 43.71 & & 2.41 & 1.86 & 6.35 \\
	& 6 & & 20.52 & 22.68 & -- & & 2.43 & 1.59 & -- \\
	& 8 & & 23.86 & 21.98 & -- & & 2.35 & 1.55 & -- \\
	& 10 & & 134.89 & 22.44 & -- & & 2.45 & 1.53 & -- \\
	& 15 & & 28.40 & 24.80 & -- & & 2.46 & 1.56 & -- \\
	& 20 & & 135.18 & 26.62 & -- & & 2.45 & 1.55 & -- \\
	\bottomrule
  \end{tabular}
  \label{tab:valid-rmse-for-r2d2-ikml-anp}
\end{table*}

\subsection{More shots}
\label{sec:more-shots}
We report further test-RMSE for various numbers of shots for Air Quality (\cref{tab:more-shots-air-quality}) and Gas Sensor (\cref{tab:more-shots-gas-sensor}). We benchmark LS Biased
Regularization, IKML, R2D2, ANP for both Air Quality and Gas Sensor and additionally IKML-MLP
for Gas Sensor. We reuse the cross-validated
models for IKML, R2D2 and ANP and the hyperparameters used for the other models.
We get 5 test-RMSE scores for Air Quality and 2 for Gas Sensor and report the
mean and standard deviation for Air Quality and mean for Gas Sensor. The low number of shots in Gas Sensor is due to
many of the underlying time series from which each task is generated having as few as
\(40\) points.
\begin{table*}[b]
  \caption{Test-RMSE (mean \(\pm\) 1 std) for IKML, R2D2, ANP and LSBR over 5
	independent runs on Air Quality for various shots. Same tasks for all
	algorithms over each run. Best result for each shot in \textbf{bold}.}
  \centering
  \begin{tabular}{lccccc}
	& & \multicolumn{4}{c}{Air Quality (shots)}\\
	\cmidrule{3-6}
	Model & & 10 & 25 & 50 & 100 \\
	\midrule
	IKML & & $24.32\pm5.21$ & $\mathbf{19.14\pm0.93}$ & $\mathbf{19.36\pm1.02}$ & $\mathbf{18.88\pm0.51}$\\
	R2D2 & & $\mathbf{21.21\pm0.28}$ & $20.23\pm0.55$ & $23.42\pm3.44$ & $20.75\pm0.79$\\
	ANP & & $31.05\pm0.90$ & $33.77\pm0.70$ & $37.30\pm0.94$ & $41.08\pm1.07$\\
	LSBR & & $21.49\pm0.40$ & $21.68\pm0.29$ & $23.69\pm0.47$ & $27.32\pm0.16$\\
	\bottomrule
  \end{tabular}
  \label{tab:more-shots-air-quality}
\end{table*}

\begin{table*}
  \caption{Test-RMSE (mean, standard deviation left out due to low number of runs) for IKML, R2D2, ANP, LSBR and IKML-MLP over 2
	independent runs on Gas Sensor for various shots. Same tasks for all
	algorithms over each run. Best result for each shot in \textbf{bold}.}
  \centering
  \begin{tabular}{lccccc}
	& & \multicolumn{4}{c}{Gas Sensor (shots)}\\
	\cmidrule{3-6}
	Model & & 5 & 10 & 15 & 20 \\
	\midrule
	IKML & & 10.04 & 4.57 & 3.42 & 2.80 \\
	R2D2 & & 6.00 & \textbf{2.44} & 2.12 & 1.95 \\
	ANP & & \textbf{2.57} & \textbf{2.44} & \textbf{2.10} & 2.12 \\
	LSBR & & 13.97 & 12.21 & 11.12 & 12.44 \\
	IKML-MLP & & 4.03 & 2.64 & 2.23 & \textbf{1.94} \\
	\bottomrule
  \end{tabular}
  \label{tab:more-shots-gas-sensor}
\end{table*}

%E2: 3-layer MLP
\begin{figure}[htbp]
  \centering
  \subfloat[Learning curves for \(d = 1\)]{\includegraphics[align=c,width=0.29\linewidth]{images/toy_regression/signal_recovery/bochner/3_layer_mlp/learning_curves-d=1.png}}%
  \subfloat[Initial and learned kernels for \(d = 1\)]{\includegraphics[align=c,width=0.58\linewidth]{images/toy_regression/signal_recovery/bochner/3_layer_mlp/kernels-d=1.png}}\\
  \subfloat[Learning curves for \(d = 5\)]{\includegraphics[align=c,width=0.29\linewidth]{images/toy_regression/signal_recovery/bochner/3_layer_mlp/learning_curves-d=5.png}}%
  \subfloat[Initial and learned kernels for \(d = 5\)]{\includegraphics[align=c,width=0.58\linewidth]{images/toy_regression/signal_recovery/bochner/3_layer_mlp/kernels-d=5.png}}\\
  \subfloat[Learning curves for \(d = 10\)]{\includegraphics[align=c,width=0.29\linewidth]{images/toy_regression/signal_recovery/bochner/3_layer_mlp/learning_curves-d=10.png}}%
  \subfloat[Initial and learned kernels for \(d = 10\)]{\includegraphics[align=c,width=0.58\linewidth]{images/toy_regression/signal_recovery/bochner/3_layer_mlp/kernels-d=10.png}}\\
  \subfloat[Learning curves for \(d = 20\)]{\includegraphics[align=c,width=0.29\linewidth]{images/toy_regression/signal_recovery/bochner/3_layer_mlp/learning_curves-d=20.png}}%
  \subfloat[Initial and learned kernels for \(d = 20\)]{\includegraphics[align=c,width=0.58\linewidth]{images/toy_regression/signal_recovery/bochner/3_layer_mlp/kernels-d=20.png}}\\
  \caption{Parameterization using a 3-layer MLP: Learning curves and initial vs learned kernels for different input dimension on synthetic dataset (all algorithms using a 3-layer MLP). \textbf{Column 1}: learning curves (meta-test RMSE) over 3 runs. \textbf{Column 2}: \textbf{Sub-row 1} kernel before training, \textbf{Sub-row 2} kernel at test time. Each column plots the kernel in a random direction drawn from the unit ball. }
  \label{fig:synthetic-3-layer-MLP}
\end{figure}

\FloatBarrier

\subsection{Sensitivity of R2D2 and IKML-MLP}
We highlight the qualitative difference between R2D2 and IKML-MLP. We compare the learning curves and holdout meta-valid and test RMSE. Note that the test-split is used to assess out-of-sample few-shot performance since we train and choose the best model using the train and validation set respectively. In this case we let the feature map \(\varphi(x, \omega)\) be an MLP with ReLU activation functions, with number of layers and number of hidden units varied. We perform this analysis on the Air Quality and Gas Sensor datasets with the settings as given in the main body unless specified and compare the results.

\paragraph{Air Quality}
We run IKML-MLP and R2D2 for 10000 iterations using Adam with meta-learning rate \(3 \cdot 10^{-4}\) and vary the number of layers and the number of hidden units in isolation. Both of the algorithms share the same base feature map \(\varphi(\cdot, \omega)\) but IKML-MLP calculates the kernel \(K(x, x') = \int_{\Omega}\varphi(x, \omega)\tran \varphi(x', \omega) \dl \tau(\omega)\) by sampling while R2D2 has a fixed feature map yielding the kernel \(K(x, x') = \varphi(x, w)\tran \varphi(x', w)\) for a fixed weight \(w\). We use a feature dimension of \(8\). The only difference to the setup in the main body is that we use 10000 iterations instead of 30000.

When we fix the number of layers to be 2, we can see from \cref{fig:r2d2-vs-ikml-mlp-air-quality-num-hidden} that IKML-MLP dominates R2D2 in terms of performance both on the validation and test set. In contrast, when we fix the number of hidden units to be 64 and vary the number of layers, we can see from \cref{fig:r2d2-vs-ikml-mlp-air-quality-num-layers} that R2D2 performs equally well as IKML-MLP. As the network becomes deeper we noticed, for this dataset, that IKML-MLP requires more random features to train well (in contrast to the Gas Sensor case). We hypothesize that for noisy tasks, like in the Air Quality dataset, the number of random features required to get accurate gradients to be able to train deeper networks increases quickly with depth. However, on this dataset we see that the number of layers is not required to be very deep to reach good performance, so in this case it does not pose a problem.

% Gas Sensor
\paragraph{Gas Sensor}
We run IKML-MLP and R2D2 for 10000 iterations using Adam with meta-learning rate \(3 \cdot 10^{-4}\) and vary the number of layers and the number of hidden units in isolation. Both of the algorithms share the same base feature map \(\varphi(\cdot, \omega)\) but IKML-MLP calculates the kernel \(K(x, x') = \int_{\Omega}\varphi(x, \omega)\tran \varphi(x', \omega) \dl \tau(\omega)\) by sampling while R2D2 has a fixed feature map yielding the kernel \(K(x, x') = \varphi(x, w)\tran \varphi(x', w)\) for a fixed weight \(w\). We use a feature dimension of \(4\). The only difference to the setup in the main body is that we use 10000 iterations instead of 30000.

Compared to the Air Quality figures (\cref{fig:r2d2-vs-ikml-mlp-air-quality-num-hidden,fig:r2d2-vs-ikml-mlp-air-quality-num-layers}), training is much more stable due to the dataset being much less noisy than the Air Quality dataset. R2D2 and IKML-MLP both train well and have good performance, although IKML-MLP overfits less to the meta-split as can be seen on the holdout performance plots in \cref{fig:r2d2-vs-ikml-mlp-gas-sensor-num-hidden,fig:r2d2-vs-ikml-mlp-gas-sensor-num-layers}. In this case 100 random features were enough for the training to be successful for IKML-MLP which supports the hypothesis given in the previous section on Air Quality.

% Air Quality
%% Num Hidden
\begin{figure}[t!]
  \centering
  \subfloat[Learning curves]{\includegraphics[width=0.9\linewidth]{images/air_quality/r2d2_vs_ikml_mlp_num_hidden/learning_curves.png}}%
  \\
  \subfloat[Holdout Performance]{\includegraphics[width=0.4\linewidth]{images/air_quality/r2d2_vs_ikml_mlp_num_hidden/performance_against_hidden_units.png}}%
  \caption{Learning curves (above) and performance plots (below) of R2D2 vs IKML-MLP on the Air Quality dataset when varying the number of hidden units. IKML-MLP is more robust to hyperparameter settings than R2D2. We fix the number of hidden layers to 2 and feature dimension to be 8. For IKML-MLP we fix the number of random features to be 100. Note that performance plot is log-scaled due to large range of reported numbers.}
  \label{fig:r2d2-vs-ikml-mlp-air-quality-num-hidden}
  \centering
  \subfloat[Learning curves: we optimize the models using the train split ]{\includegraphics[width=.9\linewidth]{images/air_quality/r2d2_vs_ikml_mlp_num_layers/learning_curves.png}}%
  \\
  \subfloat[Holdout Performance]{\includegraphics[width=0.4\linewidth]{images/air_quality/r2d2_vs_ikml_mlp_num_layers/performance_against_num_layers.png}}%
  \caption{Learning curves (above) and performance plots (below) of R2D2 vs IKML-MLP on the Air Quality dataset when varying the number of layers. IKML-MLP stabilizes training up to a point but for deeper networks we found that IKML-MLP requires more random features. We fix the number of hidden units to 64 and feature dimension to be 8. For IKML-MLP we fix the number of random features to be 400.}
  \label{fig:r2d2-vs-ikml-mlp-air-quality-num-layers}
\end{figure}

\FloatBarrier

\begin{figure}[t!]
  \centering
  \subfloat[Learning curves]{\includegraphics[width=.86\linewidth]{images/gas_sensor/r2d2_vs_ikml_mlp_num_hidden/learning_curves.png}}%
  \\
  \subfloat[Holdout Performance]{\includegraphics[width=0.39\linewidth]{images/gas_sensor/r2d2_vs_ikml_mlp_num_hidden/performance_against_hidden_units.png}}%
  \caption{Learning curves (above) and performance plots (below) of R2D2 vs IKML-MLP on the Gas Sensor dataset when varying the number of hidden units. IKML-MLP is more robust to hyperparameter settings than R2D2. We fix the number of hidden layers to 2 and feature dimension to be 8. For IKML-MLP we fix the number of random features to be 100.}
  \label{fig:r2d2-vs-ikml-mlp-gas-sensor-num-hidden}
  \centering
  \subfloat[Learning curves: we optimize the models using the train split ]{\includegraphics[width=.86\linewidth]{images/gas_sensor/r2d2_vs_ikml_mlp_num_layers/learning_curves.png}}%
  \\
  \subfloat[Holdout Performance]{\includegraphics[width=0.39\linewidth]{images/gas_sensor/r2d2_vs_ikml_mlp_num_layers/performance_against_num_layers.png}}%
  \caption{Learning curves (above) and performance plots (below) of R2D2 vs IKML-MLP on the Gas Sensor dataset when varying the number of layers. IKML-MLP stabilizes training up to a point but for deeper networks we found that IKML-MLP requires more random features. We fix the number of hidden units to 64 and feature dimension to be 8. For IKML-MLP we fix the number of random features to be 100.}
  \label{fig:r2d2-vs-ikml-mlp-gas-sensor-num-layers}
\end{figure}

\FloatBarrier

\section{Computational Complexity and Walltime Table}
\label{app:computational-complexity}
In this section we show the computational complexity using big-\(O\) notation of IKML / IKML-MLP and compare it against that of R2D2 since they both rely on KRR as the inner algorithm. In addition we measure the performance in practice through a wall-time table of IKML, IKML-MLP, MAML and R2D2.
%
%\subsection{Computational Complexity}
We first recall the complexity of training and validation of KRR in the dual
form when we have a train set \(D^{\text{tr}} = (x_i, y_i)_{i=1}^{\ntr}\) and a validation set \(D^{\text{val}} = (x_j, y_j)_{j=1}^{\nva}\). We focus on the dual formulation since generally data set sizes are
small in meta-learning while the feature space dimension is large, which means
that the dual form is more efficient than the primal form.

Assume that we have a kernel \(K: \inputspace \times \inputspace \to \R\) that
can be evaluated in \(O(\kappa)\). For training we need to calculate the dual
coefficients \(\alpha = (G + \ntr \lambda I)^{-1}\matr{y}\) where \(\matr{y}\) is
the output vector. This means we first need to calculate the regularized kernel
matrix of the train set, \(G + \ntr \lambda I \in \R^{\ntr \times \ntr}\), which can be
calculated in \(O(\kappa \ntr^2 + \ntr)\) since \(\ntr \lambda I\) is a diagonal matrix, then invert this matrix which can be
calculated in \(O(\ntr^3)\) and finally perform the matrix-vector multiplication
which is \(O(\ntr^2)\). Summing all of the steps gives a final complexity of
\(O(\kappa \ntr^2 + \ntr^3)\). Prediction on the validation set \(D^{\text{val}}\)
means first calculating the matrix
\((M_{ls})_{l,s = 1}^{\nva, \ntr} = (K(x_l, x_s))_{l,s = 1}^{\nva, \ntr}\) between the
validation and train set which is done in \(O(\kappa \ntr \nva)\). After calculating
\(M\), calculating
\(\hat{\matr{y}} = M\alpha\) can be done in \(O(\ntr \nva)\) which means that the total
number of operations is \(O((\kappa + 1)\ntr\nva)\).

The complexity for both training and validation when using KRR depends
implicitly on \(\kappa\) which will depend on the meta-learning algorithm. For
IKML with Bochner kernel using $M$ random features we first need to sample
$M$ features. This can be done in \(O(CM)\) where \(C\) is the time it takes
to evaluate the pushforward neural network. Note that this is a one-time cost
before training and validation. In practice we use batches so that for \(B\)
tasks we sample $M$ features once, which reduces this cost further by a factor
of the number of tasks in a batch. Letting \(W \in \R^{M \times d}\) be the
matrix of random features stacked horizontally then the feature map
\(\phi: \inputspace \to \R^M\) is \(\phi(x) = \cos(Wx + b)\) where \(b\) is a
vector of iid entries sampled uniformly from \([0, 2\pi]\), sampled at the same
time as \(W\). Evaluating \(\phi\) once is done in \(O((d + 1)M)\). For one
task, we first calculate the $M$ features of each input in \(O((d + 1)M)\), so that the kernel can be evaluated in \(\kappa = O(dM)\), and then perform training and prediction with KRR as above.
This means that training and prediction for IKML cost \(O(dM\ntr^2 + \ntr^{3})\) and \(O(dM\ntr\nva)\) respectively, both of which are linear in $M$.

For R2D2 the feature map \(\phi: \inputspace \to \R^h\) where \(h\) is the
dimension of the feature space is a neural network. Assuming that \(\phi\) takes
the form of an \(L\)-layer MLP with weights and biases
\((W_{i}, b_{i})_{i=1}^{L}\) such that
\(W_{1} \in \R^{h_{1} \times d}, b_{1} \in \R^{h_{1}}\), and for \(1 < l < L\),
\(W_{l} \in \R^{h_{l} \times h_{l-1}}, b_{l} \in \R^{h_{l}}\) and finally
\(W_{L} \in \R^{h \times h_{L-1}}, b_{L} \in \R^{h}\) with nonlinearity \(\sigma\)
which can be evaluated in constant time \(A\), then evaluating \(\phi(x)\)
is done in
\(O(\prod_{l=1}^{L}h_{l}h_{l-1} + \sum_{l=1}^{L-1}(1 + A)h_{l} + h_{L}) = O(\prod_{l=1}^{L}h_{l}h_{l-1})\), where \(h_{0} = d\). Running IKML-MLP, if \(h_{l} = h\) for all \(l \geq 1\) we get \(O(dh^{2L-1})\). Except for the extra factor
of \(h^{2L-1}\) the same conclusion as for Bochner holds in this case.

\begin{table}[t!]
\caption{\emph{Time (seconds) to solve one batch of tasks for IKML, IKML-MLP, R2D2 and MAML.} We measure the time required for solving one batch of tasks: training, calculating the meta-loss, and updating the hyperparameters. We use the Air Quality \((d = 10)\) dataset with 25 train and 25 validation points per task, meta-batch size of 4. All algorithms use a 4-layer MLP with 64 hidden units. For IKML we let \(M = 2 \cdot 10^4\), while for IKML-MLP we let \(M = 100\). We run each algorithm for 5000 steps and normalize the total time by dividing by 5000. We repeat this 3 times and report the mean and standard deviation.}
  \centering
  %\footnotesize
  \begin{tabular}{ll}
	\toprule
    	{Algorithm}                     & {Seconds for one batch (mean \(\pm\) 1 std)} \\
	\midrule
    IKML & \(0.017 \pm 0.00004\) \\ % 20:05-21:33 (88), 21:35-22:59 (84), 22:59-24:23 (84)
    IKML-MLP & \(0.075 \pm 0.002\) \\
    R2D2 & \(0.031 \pm 0.001\) \\ % 24:25-27:02 (147), 27:02-29:42 (160), 29:42-32:14 (152)
    MAML & \(0.022 \pm 0.001\) \\ % (105), (115), (105)
	\bottomrule
  \end{tabular}
  \label{tab:timings}
\end{table}

{
\bibliography{falk_456/latex/bibliography.bib}
}

\end{document}
