\documentclass[accepted]{uai2022} % for initial submission
% \documentclass[accepted]{uai2022} % after acceptance, for a revised
                                    % version; also before submission to
                                    % see how the non-anonymous paper
                                    % would look like
%% There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2022} % ptmx math instead of Computer
                                         % Modern (has noticable issues)
% \documentclass[mathfont=newtx]{uai2022} % newtx fonts (improves upon
                                          % ptmx; less tested, no support)
% NOTE: Only keep *one* line above as appropriate, as it will be replaced
%       automatically for papers to be published. Do not make any other
%       change above this note for an accepted version.

% \usepackage{xr}

% % In your preamble
% \makeatletter
% \newcommand*{\addFileDependency}[1]{% argument=file name and extension
%   \typeout{(#1)}
%   \@addtofilelist{#1}
%   \IfFileExists{#1}{}{\typeout{No file #1.}}
% }
% \makeatother

% \newcommand*{\myexternaldocument}[1]{%
%     \externaldocument{#1}%
%     \addFileDependency{#1.tex}%
%     \addFileDependency{#1.aux}%
% }

% \myexternaldocument{falk_456/latex/falk_456-supp}

%% Choose your variant of English; be consistent
\usepackage[american]{babel}
% \usepackage[british]{babel}
\usepackage{hyperref}       % hyperlinks

%\usepackage[sort&compress,numbers]{natbib}
\usepackage[sort&compress,numbers]{natbib}

%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} % amsmath with fixes and additions
\usepackage{siunitx} % for proper typesetting of numbers and units
\sisetup{separate-uncertainty}
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{tikz} % nice language for creating drawings and diagrams
\usepackage{makecell}


%% Self-defined macros
\newcommand{\swap}[3][-]{#3#1#2} % just an example

\newcommand{\MP}[1]{{\color{red} #1}}
\newcommand{\JF}[1]{{\color{green} #1}}
\newcommand{\CC}[1]{{\color{blue} #1}}


%%%%
\usepackage[utf8]{inputenc} % allow utf-8 input
\usepackage[T1]{fontenc}    % use 8-bit T1 fonts

\setcitestyle{authoryear,square}
\usepackage{url}            % simple URL typesetting
\usepackage{booktabs}       % professional-quality tables
\usepackage{amsfonts}       % blackboard math symbols
\usepackage{nicefrac}       % compact symbols for 1/2, etc.
\usepackage{microtype}      % microtypography
\usepackage{xcolor}         % colors
\usepackage{graphicx}
\usepackage{graphbox}
\usepackage{subfig}

\usepackage{amsmath}
\usepackage{amssymb}
\usepackage{mathrsfs}
\usepackage{bm}
\usepackage{algorithm}
\usepackage{algorithmic}
\usepackage{amsthm}
%\usepackage{subcaption}
\usepackage[toc,page]{appendix}
\usepackage{enumitem}
%\captionsetup[subfigure]{labelformat=empty}

% Write differential operators and equations
\usepackage[ISO]{diffcoeff}

\newcommand{\rf}[1]{{\color{blue} #1}}
\newcommand{\ros}[1]{{\color{red} #1}}
\newcommand{\rosanna}[1]{{\color{green} #1}}
\newcommand{\ar}[1]{{\color{orange} #1}}
\newcommand{\mass}[1]{{\color{brown} #1}}

% \usepackage[textwidth=2.0cm, textsize=tiny]{todonotes} % for writting
% \newcommand{\rt}[2][noinline]{\todo[color=red!20,#1]{{\bf Ros:} #2}}
% \newcommand{\remi}[2][noinline]{\todo[color=blue!20,#1]{{\bf Remi:} #2}}
% \newcommand{\gino}[2][noinline]{\todo[color=yellow!20,#1]{{\bf Gino:} #2}}
% \newcommand{\mas}[2][noinline]{\todo[color=brown!20,#1]{{\bf Mas:} #2}}
% \newcommand{\alain}[2][noinline]{\todo[color=orange!20,#1]{{\bf Alain:} #2}}
% \newcommand{\leo}[2][noinline]{\todo[color=black!20,#1]{{\bf Leo:} #2}}

\newcommand{\red}[1]{\textcolor{blue}{#1}}
% \hypersetup{
%      colorlinks = true,
%      linkcolor = blue,
%      anchorcolor = blue,
%      citecolor = blue,
%      filecolor = blue,
%      urlcolor = blue
%      }

%%%%


\usepackage{thmtools}

\newcommand{\indic}{\chi}
\newcommand{\pushforward}{\#}

\newcommand{\R}{\ensuremath{\mathbb{R}}}
\newcommand{\inputspace}{\ensuremath{\mathcal{X}}}
\newcommand{\outputspace}{\ensuremath{\mathcal{Y}}}
\newcommand{\dataspace}{\ensuremath{\inputspace\times\outputspace}}
% \newcommand{\dataspace}{\ensuremath{\mathcal{Z}}}
\newcommand{\hypothesisspace}{\mathcal{H}}
\newcommand{\algorithmspace}{\Theta}
\newcommand{\latentspace}{\ensuremath{\mathcal{Z}}}
\newcommand{\Lip}{\ensuremath{\mathrm{Lip}}}

\newcommand{\distas}{\sim}
\newcommand{\mc}[1]{\mathcal{#1}}
\newcommand{\domain}{\ensuremath{\mathrm{dom}}}

\newcommand{\E}{\mathbb{E}}

%%% Meta-learning
% Distributions
\newcommand{\metadistribution}{\rho}
\newcommand{\taskdistribution}{\mu}

% Datasets
\newcommand{\taskdataset}{D}
\newcommand{\tasktrain}{D^{\operatorname{tr}}}
\newcommand{\taskval}{D^{\operatorname{val}}}

\newcommand{\ntr}{n_{\rm tr}}
\newcommand{\nva}{n_{\rm val}}
\newcommand{\nts}{n_{\rm ts}}

% Meta-datasets
\newcommand{\metatrain}{M}
\newcommand{\metaval}{M_{\operatorname{val}}}
\newcommand{\metatest}{M_{\operatorname{te}}}

% Algorithms
\newcommand{\inneralgorithm}{A}
\newcommand{\metaalgorithm}{\tilde{A}}

% Losses
\DeclareMathOperator{\risk}{\mathcal{R}}
\newcommand{\emprisk}{\hat{\mathcal{R}}}
\DeclareMathOperator{\innerloss}{\ell}
\newcommand{\metaloss}{L}
\newcommand{\randmetaloss}{\tilde{L}}
\DeclareMathOperator{\metarisk}{\mathcal{E}}
\newcommand{\empmetarisk}{\hat{\mathcal{E}}}

\DeclarePairedDelimiter{\abs}{\lvert}{\rvert}
\newcommand{\diam}{\ensuremath{\mathrm{diam}}}
\newcommand{\Tr}{\mathrm{Tr}}

\newcommand*{\matr}[1]{{\bf #1}}
%% Conjugation and transposition
% 2nd answer
% https://tex.stackexchange.com/questions/30619/what-is-the-best-symbol-for-vector-matrix-transpose
% the starred command disallows arguments
% with \par or newlines
% https://tex.stackexchange.com/questions/1050/whats-the-difference-between-newcommand-and-newcommand
%\newcommand*{\tran}{^{\mkern-1.5mu\mathsf{T}}}
\newcommand*{\tran}{^\top}
\newcommand{\idmat}{\matr{I}}
\newcommand{\diag}{\mathrm{diag}}

%% Kernels
\newcommand{\kernel}[1]{#1}
\newcommand{\rkhs}[1]{\mathcal{#1}}
\newcommand{\kernelmatr}[1]{\matr{#1}}

%% Linear Algebra
\DeclarePairedDelimiterX{\scal}[2]{\langle}{\rangle}{#1, #2}
\DeclarePairedDelimiter{\norm}{\lVert}{\rVert}

% Simple set building (from mathtools manual, page 28)
% Make sure \given command exists
\providecommand\given{}
% Create symbol
\newcommand\SetSymbol[1][]{%
	\nonscript\:#1\vert
	\allowbreak
	\nonscript\:
	\mathopen{}}
% Create command
\DeclarePairedDelimiterX{\Set}[1]\{\}{%
  \renewcommand\given{\SetSymbol[\delimsize]}#1}

\DeclareMathOperator*{\argmin}{\ensuremath{argmin}}

%%% Theorem environments
% Add numberwithin=section to the declaration in order to have the section number as
% prefix to the theorem number.
\declaretheorem[name=Theorem,refname=Thm.]{theorem}
\declaretheorem[name=Lemma,sibling=theorem]{lemma}
\declaretheorem[name=Fact,sibling=theorem]{fact}
\declaretheorem[name=Proposition,refname=Prop.,sibling=theorem]{proposition}
\declaretheorem[name=Remark]{remark}
\declaretheorem[name=Corollary,refname=Cor.,sibling=theorem]{corollary}
\declaretheorem[name=Definition,refname=Def.]{definition}
\declaretheorem[name=Conjecture,sibling=theorem]{conjecture}
\declaretheorem[name=Axiom]{axiom}
\declaretheorem[name=Assumption,refname=Asm.]{assumption}
\declaretheorem[name=Example]{example}
% used for AfterEndEnvironment
% to remove the indentation after theorem environmentshttps://www.overleaf.com/project/614aed333c3b7721ea2b68a3
% if we remove it we simply need to add \noindent after any
% restatable environment
\usepackage{etoolbox}
\usepackage{placeins}

\AfterEndEnvironment{restatable}{\noindent\ignorespacesafterend}
\AfterEndEnvironment{theorem}{\noindent\ignorespacesafterend}
\AfterEndEnvironment{remark}{\noindent\ignorespacesafterend}
\AfterEndEnvironment{example}{\noindent\ignorespacesafterend}
\AfterEndEnvironment{assumption}{\noindent\ignorespacesafterend}
\AfterEndEnvironment{lemma}{\noindent\ignorespacesafterend}

\usepackage{cleveref}
\crefname{assumption}{Assumption}{Ass.}
\crefname{equation}{}{}
\crefname{figure}{Fig.}{Fig.}
\crefname{table}{Tab.}{Tables}
\crefname{section}{Sec.}{Sec.}
\crefname{theorem}{Thm.}{Thm.}
\crefname{fact}{Fact}{Facts}
\crefname{lemma}{Lemma}{Lemmas}
\crefname{corollary}{Cor.}{Cor.}
\crefname{example}{Example}{Examples}
\crefname{remark}{Remark}{Remarks}
\crefname{algorithm}{Alg.}{Algorithms}
\crefname{appendix}{App.}{Appendices}

\title{Implicit Kernel Meta-Learning Using Kernel Integral Forms}

\author[1, 2]{\href{mailto:<ucabitf@ucl.ac.uk>?Subject=IKML (UAI)}{John~Isak~Texas~Falk}{}}
\author[1]{Carlo~Ciliberto}
\author[1, 2]{Massimiliano~Pontil}
% Add affiliations after the authors
\affil[1]{%
    Dept. of Computer Science\\
    University College London\\
    U.K. 
}
\affil[2]{%
    CSML\\
    Italian Institute of Technology\\
    Genoa, Italy
}
  \begin{document}
\maketitle

\begin{abstract}
Meta-learning algorithms have made significant progress in the
context of meta-learning for image classification but less attention has been given to the regression setting. In this paper we propose to learn the probability distribution representing a
random feature kernel that we wish to use within kernel ridge regression (KRR).
We introduce two instances of this meta-learning framework, learning a neural network pushforward for a translation-invariant kernel and an affine pushforward for a neural network random feature kernel, both mapping from a Gaussian latent distribution. We learn the parameters of the pushforward by minimizing a meta-loss associated to the KRR objective. Since the resulting kernel does not admit an analytical form, we adopt a random feature sampling approach to approximate it.
We call the resulting method Implicit Kernel Meta-Learning (IKML). We derive a meta-learning bound for IKML, which shows the role played by the number of tasks $T$, the task sample size $n$, and the number of random features $M$. In particular, the bound implies that $M$ can be chosen independently of $T$ and depends only mildly on $n$. We introduce one synthetic and two real-world meta-learning regression benchmark datasets. Experiments on these datasets show that IKML
performs best or close to best when compared against competitive meta-learning methods.
\end{abstract}

\section{Introduction}

A common scenario in the real world is learning from similar tasks in order to transfer this knowledge to new tasks.
In machine learning this setting is called meta-learning
or learning-to-learn \citep{baxter2000model, thrun2012learning}, where we assume that a set of tasks are sampled
from a meta-distribution on supervised learning problems. The goal is to design \emph{meta-algorithms} which from a set of tasks output a learning algorithm. This algorithm should perform well on average with respect to the tasks sampled from the meta-distribution, analogous to having low risk in supervised learning.

Meta-learning operates on top of an inner algorithm, tuning it to
perform better on new tasks.
%In other words,
The meta-algorithm acts at an outer level, relying on the inner algorithm to compute a meta-loss and corresponding meta-gradient, based on which a meta-parameter associated to the inner algorithm is updated \citep[see e.g.][]{franceschi18_bilev_progr_hyper_optim_meta_learn}. For example, in regression settings, a common choice of inner
algorithm is ridge regression and the meta-parameter is a representation or embedding shared across the tasks that we wish to meta-learn~\citep{bertinetto18_meta_learn_with_differ_closed_form_solver}.

There has been considerable interest in meta-learning
for few-shot image classification \citep{vinyals16_match, finn17_model, snell17_protot, rusu18_meta_learn_with_laten_embed_optim, ren18_meta_learn_semi_super_few_shot_class, li17_meta_sgd, koch15_siames} but less attention has been given to designing meta-learning algorithms for regression. Most few-shot
regression benchmarks consist of interpolating sinusoids or variants thereof
\citep{finn17_model, oreshkin18_tadam, finn18_probab}, which lack many aspects of real-world regression problems such as being multivariate and noisy. This
highlights the importance of more realistic meta-learning regression datasets and how to design meta-learning algorithms in this setting. In this paper we aim to close this gap.

Meta-learning algorithms employ a variety of different kinds of base algorithms, ranging from metric based to optimization based, and to black-box ones. A common theme is to learn a shared representation which leads to faster adaptation of a base learning algorithm to new tasks. Often the representation is modeled by a neural network. Indeed, recently \citet{tian20_rethin_few_shot_image_class, raghu19_rapid_learn_or_featur_reuse} observed that the representation is the most important part of meta-learning algorithms.

In this paper we extend this thinking further, in that we implicitly learn the representation via a kernel function from a large class of kernels defined by a random feature form. This kernel is in turn implicitly parametrized by a neural network pushforward which is learned by a meta-algorithm. When using the random feature family of translation-invariant kernels this has two main advantages:
since kernel algorithms can be expressed in terms of inner products of
features which are simple to compute we don’t have to work
with this high-dimensional feature space directly. We show that modeling the kernel directly leads to improved performance in the meta-learning regression case. A second advantage is that translation invariant kernels might be used as ``plug-in'' representations. We also experiment with using a neural network random feature representation, effectively combining ensembling with random features.

\par{\bf Contributions~} The principal contribution of this paper is a method for meta-learning regression together with a bound on the excess risk which highlights how problem-specific quantities impact the number of random features needed to generalize. In particular, our method can be used to learn within a family of translation invariant kernels that is well-suited when using kernel ridge regression as the class of base learning algorithms. According to Bochner's Theorem~\citep[see e.g.][]{rahimi2007random}, these kernels are parameterized by a distribution in the frequency space. In line with \citep{li19_implic_kernel_learn}, we parametrize this distribution as a neural network pushforward. The weights of the network are learned from a sequence of datasets within a meta-learning setting. Although we focus on distributions in the context of Bochner's theorem, our framework extends directly to radial kernels using Schoenberg's theorem~\citep{schoenberg38_metric_spaces_compl_monot_funct}. Additionally we experiment with using a neural network random feature kernel, an extension of R2D2 \citep{bertinetto18_meta_learn_with_differ_closed_form_solver}, and show competitive performance.

Finally, we introduce three novel meta-learning regression benchmark datasets, one synthetic and two real-world and show that our algorithm ranks at the top or close to competing meta-learning regression algorithms. We believe these results, including the theoretical guarantees together with the flexibility and ease of our method, make it a
competitive candidate to be used as a plug-in meta-learning algorithm in general contexts.

\par{\bf Related Work~} Learning-to-learn or meta-learning can be traced back to at least \citep{schmidhuber1987evolutionary} with one seminal work being \citep{baxter2000model}. Well-developed theory exists in the batch case~\citep{maurer2005algorithmic, maurer2016benefit} and lately similar results have been developed in the online setting \citep{denevi2019learning, balcan2019provable}.

Recent advances in the image few-shot classification setting \citep{fei2006one, lake2011one}, starting with the work of \citet{finn17_model, snell17_protot, vinyals16_match}, have led to renewed interest in meta-learning, notably from the deep learning community by formulating it as an optimization problem \citep{ravi16_optim_as_model_few_shot_learn}. While classification has received a lot of interest, regression has been given less attention. Some examples are given by \citet{tossou19_adapt_deep_kernel_learn, titsias2020information, patacchiola2020bayesian}, who apply Gaussian processes \citep{williams06_gauss} together with deep kernel learning \citep{wilson2016deep} to regression. From the ridge regression point of view, \citet{kong2020robust} investigate theoretically the meta-mixed linear regression setting while \citet{nguyen2021dataset} applied kernel ridge regression (KRR) to meta-learn dataset compression.

Our work can be traced directly to ideas from \citep{zhen20_learn_to_learn_kernel_with, sinha2016learning, li19_implic_kernel_learn} to leverage the characterization provided by Bochner's theorem for kernel learning \citep{ong2005learning, cristianini2006kernel}. In \citep{sinha2016learning} they fine-tune a convex combination of sampled kernels in a supervised learning setting using kernel target alignment \citep{cristianini2006kernel}. We also mention the work \citep{zhen20_learn_to_learn_kernel_with} which apply variational inference to optimize a latent variable model for few-shot learning, and \citep{li19_implic_kernel_learn} where they learn an implicit kernel using a pushforward in the case that the learning objective is linear in the kernel evaluations. 

\par{\bf Organization~} \cref{sec:metaR} introduces the meta-learning setting. We describe our proposed method in \cref{sec:implicit-kernel-meta-learning}, analyze it in \cref{sec:genbound} and benchmark it in \cref{sec:experiments}. We discuss our findings in \cref{sec:conclusion}.


\section{Meta-Learning Problem}
\label{sec:metaR}

In this section we introduce the main elements of the meta-learning setting and introduce the notion of stochastic meta-learning algorithm. To this end, we first recall the standard notion of supervised learning problem.


\par{\bf Supervised Learning~} Given an input $\inputspace$ and output $\outputspace$ set, a supervised learning problem is characterized by a data generating distribution
\(\taskdistribution \in \mc{P}(\dataspace)\) on the joint space $\dataspace$ and a loss function
\(\ell:\outputspace\times\outputspace\to\R\) measuring prediction errors. The goal of a supervised learning problem is to find a map $f:\inputspace\to\outputspace$ minimizing the {\itshape risk}
%
\begin{equation}\label{eq:expected_risk}
    \mc{R}_{\taskdistribution}(f) = \E_{(x, y) \distas \taskdistribution}~\ell(f(x), y).
\end{equation}
%
In practice, the data generating distribution is unknown and only a finite number $\ntr$ of examples $\tasktrain = (x_i,y_i)_{i=1}^{\ntr}$ independently sampled from $\taskdistribution$ are available (denoted here by \(\tasktrain \distas \taskdistribution^{n_{\rm tr}}\)). A learning algorithm is a function mapping datasets into candidate solutions to \cref{eq:expected_risk}.

\par{\bf Learning Algorithm~} Let $\outputspace^\inputspace = \{f:\inputspace\to\outputspace\}$ be the space of all functions from $\inputspace$ to $\outputspace$ and $\mathcal{D}$  the space of all datasets of any size on $\dataspace$. Then, a learning algorithm (referred to as {\itshape inner algorithm} in meta-learning) is a function \(\inneralgorithm: \mathcal{D} \to \outputspace^\inputspace\), mapping datasets $D\in\mathcal{D}$ to functions $f:\inputspace\to\outputspace$. Typically, learning algorithms are parametrized as $\inneralgorithm(\cdot) = \inneralgorithm(\theta,\cdot)$, by a set of so-called hyperparameters (here referred to as {\itshape meta-parameters}) $\theta\in\Theta$, that allow to adapt the algorithm to the specific problem. Typical examples of hyperparameters include the regularizer in Tikhonov regularization
%\citep{de2005learning, tikhonov1977solutions}
or the number of iterations of an early-stopping procedure. 
%~\citep[see e.g.][]{yao2007early}. 
Ideally, we aim to find the best meta-parameter for a given task, namely the $\theta$ minimizing the expected risk $\mc{R}_{\taskdistribution}(\inneralgorithm(\theta,\tasktrain))$. We do not have access to $\taskdistribution$ but we can sample a {\itshape validation} set $\taskval\distas\taskdistribution^{\nva}$ and consider the empirical risk
\begin{equation}
    \hat{\mc{R}}(f,\taskval) = \frac{1}{n_{\rm val}}\sum_{(x, y) \in \taskval}\ell(f(x), y),
\label{eq:meta-loss1}
\end{equation}
as a suitable proxy. Since $\taskval$ and $\tasktrain$ are sampled independently, $\hat{\mc{R}}(\inneralgorithm(\theta,\tasktrain),\taskval)$ is an unbiased estimator of \(\mc{R}_{\taskdistribution}(\inneralgorithm(\theta,\tasktrain))\). Given a train and validation set $D = (\tasktrain,\taskval)$, the process of minimizing the meta-loss
\begin{equation}\label{eq:meta-loss}
\metaloss(\theta,D) = \hat{\mc{R}}(\inneralgorithm(\theta,\tasktrain),\taskval),
\end{equation}
with respect to $\theta$ is known as {\itshape cross-validation}.


\par{\bf Meta-Learning~} The meta-learning paradigm lifts the notion of cross-validation to the level of multiple tasks: assuming that we have access to many supervised learning problems (or tasks) sharing some form of similarity, meta-learning aims to find a single set of meta-parameters $\theta$ that works well across all tasks. More formally, we assume that the tasks are sampled from a \emph{meta-distribution} \(\metadistribution\). From each $\taskdistribution\in\mc{P}(\dataspace)$ sampled from $\metadistribution$, we then sample a pair of datasets $D = (\tasktrain,\taskval)\distas\taskdistribution^{n}$ with $n = \ntr + \nva$
(even though in the following we assume $\ntr$ and $\nva$ to be fixed for simplicity, our discussion can be extended to more general settings). Then, meta-learning is formulated as the problem of finding the meta-parameters $\theta\in\Theta$ minimizing the {\itshape transfer risk} \citep{denevi18_learn}
\begin{equation}
  \label{eq:meta-risk}
  \metarisk(\theta) = \E_{\taskdistribution \distas \metadistribution} \E_{\taskdataset \distas \taskdistribution^{n}}~ \metaloss(\theta, \taskdataset).
\end{equation}
%
If $L(\cdot,D)$ is (sub)differentiable, we can adopt a standard stochastic first-order method (e.g. SGD or Adam \citep{kingma15_adam}) to approximate the optimal meta-parameters. This consists in iteratively sampling a task $\taskdistribution_t\distas\rho$ and a train-val split $D_t\distas\taskdistribution_t^{n}$ at each time step $t=1,\dots,T$, and then updating the meta-parameters, e.g. via the SGD rule $\theta_{t+1} = \theta_t - \eta\nabla_\theta L(\theta_t,D_t)$. We refer to \cref{alg:IKML-online-optimisation} for a concrete example in the setting discussed in this work.

%\par{\bf Previous work}
%
\par{\bf Meta Representation Learning~} In practice, the above approach might pose computational challenges since, by the chain rule, differentiating $L$ requires computing $\nabla_\theta \inneralgorithm(\theta,D)$. Depending on the inner algorithm $\inneralgorithm$, its gradient with respect to the meta-parameters $\theta$ might be hard to compute or not even exist. In the literature, a wide range of meta-learning strategies have been proposed, considering different choices of inner algorithm $\inneralgorithm$ and meta-parameters $\theta$. For example, \citet{bertinetto18_meta_learn_with_differ_closed_form_solver} considered the case that $\inneralgorithm$ performs ridge regression (see \cref{sec:kernels}) and $\theta$ parameterizes the weights of a feature map $\phi_\theta:\inputspace\to\R^d$ (e.g. a neural network). Leveraging the closed-form solution of the ridge regression estimator, this allows us to efficiently compute the gradient $\nabla_\theta \inneralgorithm(\theta,D)$. In settings where $\inneralgorithm$ is minimizing the empirical risk but with a loss function that does not admit a closed form, we can adopt a bi-level optimization perspective \citep{franceschi18_bilev_progr_hyper_optim_meta_learn}. This amounts to interpreting $\inneralgorithm$ as returning the $T$-th iteration of an iterative optimization algorithm. This allows us to access $\nabla_\theta \inneralgorithm(\theta,D)$ by recursively differentiating along the iterates. This approach is related to the well-known MAML algorithm \citep{finn17_model}, which proposed to perform fine-tuning of a shared starting network $f_\theta:\inputspace\to\outputspace$ with weights $\theta$, that is adapted by $A(\theta,D) = f_{\theta'}$ to each new task by performing a step of gradient descent $\theta' = \theta - \eta \nabla_\theta \hat{\mathcal{R}}(f_\theta,D)$, to fit the training data.
In the following we introduce the family of inner algorithms (and corresponding parameters) proposed in this work to tackle the meta-learning problem.


\section{Implicit Kernel Meta-Learning}
\label{sec:implicit-kernel-meta-learning}

We now introduce the proposed meta-learning strategy. While most previous work focused on learning a shared data representation or feature map \citep{finn17_model,bertinetto18_meta_learn_with_differ_closed_form_solver,franceschi18_bilev_progr_hyper_optim_meta_learn} across tasks, here we propose the dual approach of learning a shared kernel function.

\subsection{Reproducing Kernels and Feature Maps}
\label{sec:kernels}
%
Reproducing kernels are a well-established tool in machine learning, at the root of most non-parametric algorithms \citep{scholkopf2002}. They consist of positive definite functions  $K:\inputspace\times\inputspace\to\R$ that may be interpreted as
a similarity between data points. A fundamental result dating back to Moore and Aronszajn~\citep[see e.g.][and references therein]{aronszajn50_theor_reprod_kernel,CS02,scholkopf2002} establishes that a kernel is in one-to-one correspondence with a (possibly infinite dimensional) Hilbert space $\hypothesisspace_K$ of real-valued functions on $\inputspace$, such that for every $x \in \inputspace$ and $f \in \hypothesisspace_K$, the function $K(x,\cdot) \in \hypothesisspace_K$ and $\langle f, K(x,\cdot) \rangle_K = f(x)$, where $\langle \cdot,\cdot\rangle_K$ denotes the inner product in $\hypothesisspace_K$. A kernel is in duality with the notion of feature map: given a mapping $\phi:\inputspace\to{\mathbb H}$ into a Hilbert space ${\mathbb H}$ with inner product $\langle \cdot,\cdot\rangle$, the function $K_\phi(x,x') \equiv \scal{\phi(x)}{\phi(x')}$ is a reproducing kernel. The converse is also true, namely for any kernel $K$ there exists a Hilbert space $\mathbb{H}$ and feature map $\phi_K:\inputspace\to{\mathbb H}$ such that $K(x,x') = \scal{\phi_K(x)}{\phi_K(x')}$ \citep{aronszajn50_theor_reprod_kernel}; when $\inputspace$ is compact we can choose ${\mathbb H}= \ell_2$, the space of square summable sequences. A key practical advantage of kernels is that they allow us to learn functions parametrized as $f(x) = \scal{f}{\phi_K(x)}$ even when $\hypothesisspace_K$ is infinite dimensional (namely the feature vector $\phi_K(x)$ has infinitely many entries). As a concrete example we recall the case of kernel ridge regression \citep[see e.g.][]{caponnetto07_optim_rates_regul_least_squar_algor,steinwart2008}, which we will also use as plug-in inner algorithm for the proposed meta-learning approach in this work.

\par{\bf Kernel Ridge Regression~} Kernel ridge regression (KRR) performs Tikhonov regularization using the least-square loss function over the space of hypotheses associated to a reproducing kernel \citep[see e.g.][]{scholkopf2002}. More precisely, assume $\outputspace\subset\R$. Given a dataset $\tasktrain = (x_i,y_i)$, and a kernel function $K:\inputspace\times\inputspace\to\R$, KRR is the algorithm
\begin{equation}\label{eq:krr}
    \inneralgorithm_{\rm KRR}(K,\tasktrain) = \argmin_{f\in\hypothesisspace_K}~ \hat\risk(f,\tasktrain) + \lambda \|f\|_{K}^2,
\end{equation}
with $\lambda>0$ a regularization parameter. Thanks to the reproducing property of the kernel, \cref{eq:krr} can be solved in closed form. For any $x\in\inputspace$ we have that
\begin{equation}
\label{eq:krr-solution}
\inneralgorithm_{\rm KRR}(K,\tasktrain)(x) = \sum_{i=1}^{\ntr} \alpha_i K(x_i,x),
\end{equation}
with $\alpha {=} ({G} + \lambda \ntr I)^{-1}{y}$,
where ${G}{=}(K(x_i,x_j))_{i,j=1}^{\ntr}$ is the $\ntr\times \ntr$ kernel (Gram) matrix, $I$ the $\ntr\times \ntr$ identity matrix, and with some abuse of notation we let ${y} = (y_1,\dots,y_{\ntr})^\top\in\R^{\ntr}$ be the vector of output examples. Notice that we highlighted the dependency of KRR with respect to the kernel $K$. This suggests that in meta-learning settings one might be interested in learning the kernel as a meta-parameter.

% \par{\bf Learning translation invariant kernels}
\subsection{Learning Translation Invariant Kernels}
%
The definition of positive definite function underlying the notion of reproducing kernel is very general. Therefore, to formulate the problem of meta-learning a kernel, we need first to identify a suitable family. \citet{rudi2017generalization} introduce a ``recipe'' for random feature kernels defined by a random feature map \(\varphi: \mc{X} \times \Omega \to \R^o\) and a distribution \(\tau\) so that any kernel in this family has the form
\begin{equation}
\label{eq:random-feature-kernel}
    K(x, x') = \int_{\Omega} \varphi(x, \omega)\tran \varphi(x', \omega) \dl \tau(\omega).
\end{equation}

Given the focus of this work towards regression settings, we first consider the class of {\itshape translation invariant} kernels, which are particularly suited to deal with such settings and are \emph{interpretable} (see e.g.\ Fig.~2 in
the appendix). Let $\inputspace=\R^d$. A kernel $K$ is called {\em translation invariant} if $K(x,x') = g(x - x')$ for some function $g:\R^d\to\R$; a well-known example is the Gaussian $K(x,x') = e^{-\|x-x'\|^2/\sigma^2}$ with $\sigma>0$. A famous theorem by Bochner \citep[see e.g.][]{rahimi2007random,rudin62_fourier,sriperumbudur2015optimal}, adapted here to real-valued kernels, establishes that any properly re-scaled continuous bounded translation invariant function $K:\mathbb{R}^d \times \mathbb{R}^d\to\R$ is a kernel if and only if there exists a  probability measure \(\tau\in\mc{P}(\R^d)\) such that
\begin{equation}
\label{eq:bochners-theorem}
K(x,x') = K_\tau(x,x') \equiv \int\cos( \scal{\omega}{x-x'}) \dl \tau(\omega),
\end{equation}
which can be written in the form of~\eqref{eq:random-feature-kernel} by expanding the cosine using the trigonometric identity $\cos(x - y) = \cos(x)\cos(y) + \sin(x)\sin(y)$. We call any kernel that can be written in the form of \cref{eq:bochners-theorem} a \emph{Bochner kernel}.
%$o=2$, see e.g \citep{rahimi2007random}. 
Eq. \cref{eq:bochners-theorem} implies that we can represent the class of translation invariant kernels as $\mc{T}=\{K_\tau~|~ \tau\in\mc{P}(\R^d)\}$. Thus we can translate the problem of learning a kernel to that of learning a probability distribution. This perspective is in line with the implicit kernel learning approach devised in \citep{li19_implic_kernel_learn} for generative modeling and single task settings. The second type of kernel is inspired by the success of using neural network to extract features and is given by letting \(\varphi(x, \omega)\) be a neural network with \(\omega\) the weights and \(\tau\) a distribution over \(\omega\).

\par{\bf Pushforward Models~} To learn the underlying distribution $\tau$ we consider a parametrization in terms of a pushforward model. More formally, let $\mc{N}$ be the unit Gaussian distribution over a latent space $\latentspace$ and let $\psi_{\theta}:\latentspace\to\R^d$ be a vector-valued function parameterized by a vector $\theta\in\Theta$ (e.g. a neural network with weights $\theta$). We denote by $\tau_\theta = \psi_\theta\pushforward\mc{N}$ the probability distribution such that, the process of sampling $\omega\sim\tau_\theta$ is equivalent to first sampling $z\sim\mc{N}$ and then taking $\psi_\theta(z) = \omega$.\footnote{Formally, for any $B \subseteq \mathbb{R}^d$, $\tau_\theta(B) = {\cal N}(\{z ~|~ \psi_\theta(z) \in B\})$.} This is the strategy adopted to model the generator distribution in generative adversarial networks (GAN) settings and in the implicit kernel learning approach of \citep{li19_implic_kernel_learn}. Several alternatives for the latent distribution $\mc{N}$ are possible (e.g. uniform). Under the notation above, we adopt as inner algorithm,
\begin{equation}\label{eq:ideal-meta-krr}
    \inneralgorithm(\theta,D) = \inneralgorithm_{\rm KRR}(K_{\tau_\theta},D),
\end{equation}
namely KRR trained with a translation invariant kernel $K_{\tau_\theta}$ meta-parametrized by the pushforward distribution $\tau_\theta$. Below we give an example where a parametrization of $\tau_\theta$ yields an analytic form for the corresponding $K_{\tau_\theta}$; see the appendix for a derivation.
\begin{example}[Affine Pushforward Maps]\label{ex:affine-pushfowrward}
Let $\theta = (Q,b)$ with $Q \in \mathbb{R}^{d \times d}$ and $b\in \mathbb{R}^d$ and consider the affine pushforward map
\(\psi_{(Q,b)}(s) = {Q}s + b\). In these settings, the kernel $K_{\tau_{(Q,b)}}$ can be expressed analytically as
\begin{equation}\label{eq:affine-kernel}
  \kernel{K}_{\tau_{(Q,b)}}(x, x') = \cos(\scal{b}{x - x'}) e^{-\norm{Q\tran(x - x')}^{2}/2}.
\end{equation}
\end{example}
The example above identifies a relevant family of kernels that are particularly amenable to meta-learning. Thanks to the analytic form of affine pushforward kernels, we can easily compute meta-gradients and thus directly minimize the transfer risk $\metarisk(\theta)$. On the other hand, if we consider more expressive maps \(\psi_{\theta}\), we will hardly be able to obtain \(\kernel{K}_{\tau_{\theta}}\) in analytic form. Still, this may be well worth the effort: while for large training sets the difference between \cref{eq:affine-kernel} and a more sophisticated kernel may be less severe since any universal kernel is optimal
\citep{caponnetto07_optim_rates_regul_least_squar_algor}, in the few-shot learning setting (where we have small training sets) the inductive bias plays an important role and being able to modify the kernel in a flexible way is key.

\subsection{Stochastic Meta-Learning}
\label{sec:method}
%
The discussion above highlighted that except for a few special cases (see e.g. \cref{ex:affine-pushfowrward}), given a distribution $\tau_\theta$ it is not possible to compute the kernel $K_{\tau_\theta}$ (and its gradient with respect to $\theta$) analytically. In principle, this might prevent us from applying meta-learning algorithms of the form in \cref{eq:ideal-meta-krr}. To circumvent this issue, we consider a strategy based on random features \citep{rahimi2007random,rudi2017generalization}. Rather than evaluating $K_{\tau_\theta}$, we sample a set $S = (s_j)_{j=1}^M$ from $\mc{N}$ and then approximate the ideal Bochner kernel by the {\itshape random features kernel}
\begin{equation}\label{eq:random-features-kernel}
    K_{\hat{\tau}_{\theta S}}(x,x') = \frac{1}{M}\sum_{j=1}^M \cos(\scal{\psi_\theta(s_j)}{x-x'}),
\end{equation}
where $\hat\tau_{\theta S} = \frac{1}{M}\sum_{j=1}^M \delta_{\psi_\theta(s_j)}$ is an empirical distribution associated to $\tau_\theta$ and $\delta_{\omega}$ denotes a Dirac's delta centered in $\omega\in\R^d$ which we call \emph{frequency}. Thanks to the characterization of $K_{\tau_\theta}$ as an expectation in \cref{eq:bochners-theorem}, we have that
\begin{equation}
    K_{\tau_\theta}(x,x') = \mathbb{E}_{S\distas\mc{N}^M} ~K_{\hat\tau_{\theta S}}(x,x'),
\end{equation}
namely $K_{\hat\tau_{\theta S}}$ is an unbiased estimator of $K_{\tau_\theta}$. It is also possible to prove non-asymptotic results bounding the distance between the two kernels in sup norm \citep{rahimi2007random}.

\par{\bf Stochastic Meta-Learning~} We now introduce a stochastic variant to the meta-learning approach from \cref{sec:metaR}, by defining the meta-loss associated to a set of random features
\begin{equation}\label{eq:stochastic-meta-loss}
   \metaloss(\theta,S,D) = \hat{\mc{R}}(\inneralgorithm_{\rm KRR}(K_{\hat{\tau}_{\theta S}},\tasktrain),\taskval),
\end{equation}
and the corresponding transfer risk
\begin{equation}\label{eq:stochastic-meta-risk}
    \metarisk_M(\theta) = \E_{\taskdistribution \distas \metadistribution} \E_{\taskdataset \distas \taskdistribution^{n}}\mathbb{E}_{S\sim\mc{N}^M}~ \metaloss(\theta,S, \taskdataset),
\end{equation}
which we will also denote \(\metarisk(\theta, S)\) when wanting to highlight the dependence on \(S\) explicitly.

In this work we propose to address the stochastic meta-learning problem
\begin{equation}
    \min_{\theta\in\Theta}~\metarisk_M(\theta).
\end{equation}
\Cref{alg:IKML-online-optimisation} provides the pseudocode for a (meta) stochastic gradient descent algorithm applied to this problem. At each iteration $t=1,\dots,T$, we sample a new task $\taskdistribution_t$ and datasets $D_t=(\tasktrain_t,\taskval_t)\distas\taskdistribution_t^n$ and a set of random features $S_t\distas\mc{N}^M$ and then perform a gradient descent step in the direction of $\nabla_\theta L(\theta_t,S_t,D_t)$. Note that the gradient can be computed by means of automatic differentiation (\textsc{AutoGrad}) \citep[see e.g.][]{baydin2018automatic}. Many other strategies to perform this optimization step are available, such as Adam \citep{kingma15_adam}. While using a large number of random features may seem expensive, both training and prediction time 
are linear in $M$; see the section on computational complexity in the appendix. We refer to this method as {\itshape Implicit Kernel Meta-Learning (IKML)}.

\begin{algorithm}[t]
  \caption{Implicit Kernel Meta-Learning}
  \label{alg:IKML-online-optimisation}
  \begin{algorithmic}
    \STATE \textbf{Input: } meta-distribution \(\metadistribution\), step-sizes
      \((\gamma_{t})_{t=1}^{\infty}\), number of random features $M$,
      initial meta-parameters $\theta_1$, total number of iterations $T$.
    \STATE {\bfseries For} $t=1,\dots,T$
    \STATE\quad Sample a task/dataset $D = (\tasktrain,\taskval)$ from \(\metadistribution\)
    \STATE\quad Sample \(M\) random features \(S\) from \(\mc{N}\)
    \STATE\quad Form $K_{\hat{\tau}_{\theta_t S}}$ and compute $L(\theta_t,S,D)$ as in \cref{eq:stochastic-meta-loss}
    \STATE\quad Get $\nabla_\theta L(\theta_t,S,D) = $\textsc{AutoGrad}$(L(\cdot,S,D),\theta_t)$
    \STATE\quad Update $\theta_{t+1} \gets \theta_t - \gamma_t \nabla_\theta L(\theta_t,S,D)$
    \STATE \textbf{Return} $\theta_{T+1}$
  \end{algorithmic}
\end{algorithm}

\section{Generalization Bound}
\label{sec:genbound}
We now study the generalization ability of the proposed meta-learning method. In particular, our goal is 
to study the effect of the number of random features on the performance of the meta algorithm.  
To present our observations we focus for simplicity on the case that the meta loss uses the task dataset for both training and validation, that is we use the empirical risk
\begin{equation}
    \tilde{L}(\theta, S, D) = \hat{\mc{R}}(A_{\text{KRR}}(K_{\hat{\tau}_{\theta S}}, D), D)
\end{equation}
which is the empirical error of KRR with kernel \eqref{eq:random-features-kernel} on the dataset $D$ instead of \cref{eq:stochastic-meta-loss}. For a collection of datasets $(D_t)_{t=1}^T$ and a sample $S = (s_j)_{j=1}^M$ from $\mc{N}$, define the multitask empirical risk
\begin{equation}
   \hat{\mc{E}}_T(\theta, S) = \frac{1}{T}\sum_{t=1}^T \tilde{L}(\theta, S, D_t). 
\end{equation}
We aim to bound the excess transfer risk 
%defined as 
\begin{equation}
    \mc{E}_M({\hat \theta}) - \mc{E}(\theta^*)
\end{equation}

where $\theta^* \in \Theta$ is such that $\mc{E}(\theta^*) = \min_\theta \mc{E}(\theta)$ and ${\hat \theta}$ is the minimizer of the multitask empirical risk, which we call the multitask empirical risk minimizer (MERM) and which in practice we approximate by the solution returned by \cref{alg:IKML-online-optimisation}.
\begin{theorem}
\label{thm:main}
Assume that \(\mc{Z} = \mc{X} \times \mc{Y} \subseteq \R^d \times [0, 1]\), $\rho$ is a meta-distribution on \(\mc{Z}\), the loss \(\ell(y, \hat{y}) = (y - \hat{y})^2\) and kernel family $\mc{K} = \{K_{\tau_{\theta}}~|~\theta \in \Theta\}$ is a family of Bochner kernels parameterized by some latent distribution \(\mc{N}\) with support on \(\R^l\) and a family of measurable functions \(\{\psi_{\theta} : \R^l \to \R^d ~|~\theta \in \Theta\}\). For any \(n, M, T \in \mathbb{N}\) let the training task datasets $D_1,\dots,D_T$ be given by 
iteratively sampling a task $\taskdistribution_t\distas\rho$ and  $D_t\distas\taskdistribution_t^{n}$ and $S \sim \mc{N}^M$, the family of inner algorithms being KRR with kernels $K_{\tau_{\theta}} \in \mc{K}$ and fixed regularization parameter $\lambda > 0$ and $\hat{\theta}$ being the MERM over the task datasets and random features. Then, for $\delta\in (0,1)$, with probability at least $1-\delta$ over the datasets and random features
\begin{align}
\mc{E}_M(\hat{\theta}) - \mc{E}(\theta^*) \leq{} & O\left(\frac{\sqrt{M}R_{n,M,T}}{T \lambda \sqrt{n}} + \sqrt{\frac{\log \frac{1}{\delta}}{T}}\right) + {} \label{1rr}\\
& O\left(\frac{1}{\lambda \sqrt{n}}\right) + {} \label{2rr}\\
& O\left(\frac{1}{\sqrt{M \lambda^3}} \bigg(1+ \sqrt{\frac{G^*_n \log n}{\lambda^2 n}} \bigg) \right),
\label{3rr}
\end{align}
where 
\begin{equation}
    R_{n,M,T}= \E_{(D_t)_{t=1}^T \sim \hat{\rho}^T}\E_{S, \epsilon} \sup_{\theta \in \Theta}\sum_{i, j, t}^{n, M, T} \epsilon_{i,j,t} \langle \psi_{\theta}(s_{j}), x_{i}^{t} \rangle,
\end{equation}
the random variables $\epsilon_{i,j,t}$ being i.i.d.\ Rademacher and $D \sim \hat{\rho}$ means first sampling \(\mu \sim \rho\) and then \(D \sim \mu^n\), and $G_n^* = \E_{\mu \sim \rho}\E_{D \sim \mu^{n}}\norm{(K_{\tau_{\theta^*}}(x_i,x_j))_{i,j=1}^n}_{\infty}$. 
\end{theorem}
\begin{proof}[Proof Sketch]
We discuss the key elements of the proof and present the full details in the appendix. We write 
$\mc{E}_M(\hat{\theta}) - \mc{E}(\theta^{*}) = \E_S [ \mc{E}(\hat{\theta},S) - \mc{E}(\theta^{*})]$ and decompose the term inside the expectation as
\begin{align}
\nonumber
 \underbrace{\mc{E}\hspace{-.05truecm}(\hat{\theta}, S) {-} \hat{\mc{E}}\hspace{-.05truecm}(\hat{\theta}{,}\hspace{.03truecm}{S})}_{(A)} {+} \underbrace{\hat{\mc{E}}\hspace{-.05truecm}(\hat{\theta}{,}\hspace{.03truecm}{S}) {-} \hat{\mc{E}}_{T}(\hat{\theta}{,} \hspace{.03truecm}S)}_{(B)} {+} \underbrace{\hat{\mc{E}}_{T}(\hat{\theta}, S) {-} \hat{\mc{E}}_{T}({\theta^*}\hspace{-.1truecm}{,}\hspace{.03truecm}S)}_{(C)} \\\nonumber
                                               {+} \underbrace{\hat{\mc{E}}_{T}(\theta^{*}\hspace{-.1truecm}{,}\hspace{.03truecm}{S}) {-} \hat{\mc{E}}(\theta^{*}\hspace{-.1truecm}{,} \hspace{.03truecm}S)}_{(D)} + \underbrace{\hat{\mc{E}}(\theta^{*}\hspace{-.1truecm}{,} \hspace{.03truecm}S) {-} \mc{E}(\theta^{*}\hspace{-.1truecm}{,} \hspace{.03truecm}S)}_{(E)} {+} \underbrace{\mc{E}({\theta^*}\hspace{-.1truecm}{,}\hspace{.03truecm}S) {-} \mc{E}(\theta^*)}_{(F)}
\end{align}
where $\hat{\mc{E}}({\theta},{S})$ and $\hat{\mc{E}}_{T}(\theta,S)$ are the average empirical error and the multitask empirical error, for the meta-parameter $\theta$ and random features $S$; see the section on the bound in the appendix. Bounding the terms (A) and (E) leads to \eqref{1rr} while bounding the terms (B) and (D) leads to \eqref{2rr}. The term (C) is the optimization error and is non-positive if we can minimize the empirical risk objective. Finally the term (F) is bounded using \citep[][Theorem 2.1]{tropp19_matrix_concen_comput_linear_algeb} and auxiliary results presented in the appendix.
\end{proof}

We now comment on the implications of the above theorem. The first term in the r.h.s. of \eqref{1rr} contains the unnormalized Rademacher complexity $R_{n, M, T}$ of the set $\{ (\langle \psi_{\theta}(s_{j}), x_{i}^{t} \rangle)_{i,j,t=1}^{n,M,T} : \theta \in \Theta \} \subseteq \mathbb{R}^{n \times M \times T}$. This is a measure of the capacity of the RKHSs we consider as part of using the kernel family $\mc{K}$ and quantifies the kernel family's ability to fit random noise. While this quantity requires a case-by-case analysis it is often of order $\sqrt{T}$. Since in meta-learning the number of tasks is very large this term is negligible in many practical scenarios. For example following the reasoning in \citep{nips2020mmd} we obtain that $R_{n, M, T}= O(\sqrt{n M T})$. The number of random features should then be chosen so that the quantity \eqref{3rr} is smaller than \eqref{2rr}. The quantity $G^*_n$ represents the size of the best RKHS needed to explain the data averaged over the possible datasets sampled from the environment. In some sense it represents the degrees of freedom of the best model \(\theta^*\) given the meta-distribution. A direct computation gives the condition
\[
M > O\bigg(\frac{n}{\lambda} + \frac{G_n^* \log n}{\lambda^3}\bigg).
\]
Since $G_n^* \in [1,n]$, we conclude that the number of random features needed by the algorithm in order to be competitive with meta-learning without random feature approximation is {\em independent of the number of tasks} and only mildly dependent on $n$. For example, assuming $\lambda= 1/\sqrt{n}$ we obtain that $M = \Omega(n^\frac{3}{2} \log n)$ or $M=\Omega(n^{\frac{5}{2}} \log n)$ when $G^*_n = 1$ or $G^*_n = n$, respectively.
The case that $G^*_n = O(n)$ requiring more random features corresponds to a low rank Gram matrix, meaning that the tasks are strongly related. This is however worth the effort since in this case the optimal risk $\mc{E}(\theta^*)$ we compare to will be very small, because the optimal low rank kernel makes learning very easy. Finally we note that \(\lambda\) being in the denominator of all terms is an artifact due to comparing to the best KRR algorithm 
\(\theta^*\) instead of the quantity \(\mc{E}^* = \E_{\mu \sim \rho} \mc{R}_{\mu}(f_{\mu})\) where \(f_{\mu} = \E[y | \cdot]\) is the optimal predictor for the distribution \(\mu\) \citep[see][for a discussion]{denevi2019learning}.

\section{Experimental Results}
\label{sec:experiments}

We evaluate the performance of the proposed meta-learning strategy on both synthetic and real experiments against several baselines. We make all datasets and code available as a GitHub repository.\footnote{\url{https://github.com/IsakFalk/IKML}}

\subsection{Synthetic Multivariate Regression}
\label{sec:synthetic}
For IKML to be effective in realistic meta-learning regression scenarios it is important that it can approximate non-trivial functions defined on \(\R^{d}\) where
\(d \gg 1\). To investigate this we create a synthetic high-dimensional meta-learning regression setting where each task is sampled from an RKHS
\(\rkhs{H}\) with a ``complicated'' kernel \(\kernel{K}^o\). In particular, we choose
\(\kernel{K}^o\) to be the kernel given by Bochner's theorem and a
pushforward of a 3-layer Multi-Layer Perceptron (MLP) with 32 hidden units per layer, ReLU activation functions and a 16-dimensional latent Gaussian distribution. The network was initialized with weights given by the PyTorch \citep{paszke2019pytorch} default initialization scaled by 100. Since this kernel lacks an analytic form, we sample 10000 frequencies and use the random features kernel from \cref{eq:random-features-kernel} in its place. The tasks are generated from a distribution on \(f \in \rkhs{H}\) and a marginal distribution on inputs fixed across all tasks. For each task we sample \(n = \ntr + \nva = 50 + 50\) inputs \((x_i)_{i=1}^n\), a function \(f\) and create the task \((x_i, f(x_i))_{i=1}^n\); for more details see Sec. 5 in the appendix.

We compare the following meta-learning algorithms:

{\itshape IKML.} \cref{alg:IKML-online-optimisation} parameterizing the pushforward $\psi_\theta$ for the measure $\tau_\theta$ with a three-layer MLP with hidden dimension set to $32$ and the dimension of the latent space $\latentspace=\R^{16}$. The number of random features is set to \(M = 10^4\).

{\itshape Gaussian MKL meta-KRR (GMKL).} Multiple Kernel Learning (MKL) with KRR as inner algorithm. The meta-algorithm consists in learning the weights of a kernel \(\kernel{K} = \sum_{j=1}^{k}\lambda_{j}\kernel{K}_{j}\) that is a convex combination of Gaussian kernels \(\kernel{K}_{j}(x, x') = \exp(-\frac{1}{2\sigma_{j}^{2}}\norm{x - x'}^{2})\) with lengthscale $\sigma_j$ taken from a
log-equidistant grid from $10^{-3}$ to $10^{3}$. The meta-learning algorithm learns the weights $\lambda$ parameterized in terms of the vector $z\in\R^k$ as
\(\lambda_{j} = \frac{\exp(z_{j})}{\sum_{i=1}^k\exp(z_{i})}\).

{\itshape MAML \citep{finn17_model}.} Optimizing through inner gradient descent with MLP to learn a good initialization in the outer loop. We use a three-layer MLP with 32 hidden units and ReLU activation functions.

{\itshape R2D2 \citep{bertinetto18_meta_learn_with_differ_closed_form_solver}.} Ridge regression as inner algorithm, learning a shared feature map in the outer loop. We use a three-layer MLP with 32 hidden units and ReLU activation functions.

%{\itshape \(K^o\) ITL KRR Oracle.}
{\itshape Oracle.}~Running a separate instance of KRR on each task, with the same kernel $K^o$ used to generate the tasks, and finding \(\lambda\) by cross validation on the test set.

\begin{figure*}[t!]
  \centering
  \includegraphics[width=0.24\textwidth]{images/toy_regression/signal_recovery/bochner/3_layer_mlp/learning_curves-d=1.png}
  \includegraphics[width=0.24\textwidth]{images/toy_regression/signal_recovery/bochner/3_layer_mlp/learning_curves-d=5.png}
  \includegraphics[width=0.24\textwidth]{images/toy_regression/signal_recovery/bochner/3_layer_mlp/learning_curves-d=10.png}
  \includegraphics[width=0.24\textwidth]{images/toy_regression/signal_recovery/bochner/3_layer_mlp/learning_curves-d=20.png}
%   \vspace{-2\baselineskip}
  \caption{Learning curves of meta-test RMSE over three runs (mean
		  \(\pm\) 1 std) of Gaussian MKL meta-KRR, MAML, R2D2 and IKML together with the KRR Oracle on the synthetic meta-learning problem introduced in \cref{sec:synthetic} for $d=1, 5, 10, 20$. We generate \(K^o\) once for each experiment and resample tasks for each run. Note that for low dimensions, MKL and R2D2 perform comparably to IKML. As the dimension increases, IKML outperforms all algorithms with performance on par with Oracle.} \label{fig:bochner-synthetic-regression}
\end{figure*}


\subsection{Real-World Data Experiments}
%Meta-learning Regression}
We evaluate the proposed approach on two new real-world meta-learning regression datasets adapted to the meta-learning setting from the UCI repository \citep{Dua:2019}. Apart from IKML and Gaussian MKL meta-KRR, we used the following algorithms in our experiments:

{\itshape LS Biased Regularization \citep{denevi2019learning} (LSBR).} Running linear ridge regression with biased regularization $\lambda \|f - \theta\|^2$ in the inner algorithm, learning the bias $\theta$ in the outer loop.

{\itshape ANP \citep{kim19_atten_neural_proces}.} Learns to map datasets to
	stochastic processes over functions using neural networks to do meta-learning. Predictor is the conditional mean
	of the stochastic process.

{\itshape Gaussian Oracle KRR (GO).} Gaussian KRR addressing each task as a separate learning problem but cross-validating the kernel bandwidth and regularization parameters \(\sigma^2\) and \(\lambda\) on the average validation error directly on the meta-test set.

We chose the baselines from landmark papers in the few-shot learning (MAML, R2D2, ANP, LSBR) and multiple-kernel learning (GMKL, GO) literature applicable to regression. We think these are natural baselines to compare against.

\label{par:training-procedure}
For both meta-learning datasets, we run the
algorithms above in an online fashion where we use a meta-batch of 4 tasks per
iteration sampled from the meta-train set. For IKML we fix the number of random features to 20000 which is on the order of \(\Omega(n^{5/2}\log(n))\) if we had pooled the train and validation sets of \(25\) datapoints each into one train set of size \(50\). Note however that further experiments show that in practice we can get away with as few as 2500 random features while maintaining performance. Every 250
steps we sample 1000 tasks from the meta-validation set, evaluate the average meta-loss for each algorithm and save the model parameters. After training we sample 3000 tasks from the
meta-test set. For the meta-test evaluation, for all algorithms, we use the
meta-parameters with the lowest meta-validation error and get the test performance for all algorithms. We measure performance in terms of the root mean square error (RMSE). This procedure was run 5 times over
different random seeds in order to get learning curves and results on the meta-test set. Below we describe the datasets and comment on the empirical evidence.

\par{\bf Air Quality~} The Beijing Air Quality dataset
\label{par:air-quality}
~\citep{zhang2017cautionary} is a time-series dataset
measuring air-quality and meteorological factors at 12 air-quality monitoring
sites. The meteorological data for each site is matched with the closest
available weather station. The data was collected hourly over the period
March 1st, 2013 to February 28th, 2017. Further details are given in Sec. 4.1 in the appendix. 

We generate a task of train and validation size \(n_{\rm tr}, n_{\rm val}\) by randomly picking a station and picking a contiguous subsequence of size \(n = n_{\rm tr} + n_{\rm val}\) at random from the split. We append the feature ``$t$'' which is the local order of data points and then randomly assign \(n_{\rm tr}\) of the \(n\) points to the train set and the rest to the validation set. This can be seen as a reconstruction problem: given data from sensors, some of which have failed, we want to infer the output given an input at some points in time. 
%
We choose to use \(n_{\rm tr} = n_{\rm val} = 25\). 

After experimenting we use the following configuration of the algorithms. For Gaussian MKL meta-KRR we use 20 Gaussian kernels with lengthscale sampled geometrically from $1$ to $10^{12}$ and learn the coefficients and regularization parameter using the same parameterization as in the synthetic experiment with Adam and a meta-learning rate of 0.001. For LS Biased Regularization we learn the bias and regularization parameter using Adam with meta-learning rate 0.01. We parameterized MAML with a 2-layer MLP with 64 hidden
dimensions and with inner learning rate \(10^{-7}\) and one
adaptation step, learning the initialization using Adam with meta-learning rate of 0.001. We found that using a very small inner learning rate and few steps was important to get MAML to converge. For R2D2, IKML and ANP we cross-validated to find the best set of hyperparameters, see Sec. 5 and Tab. 1 in the appendix for more information. For Gaussian meta-KRR we learn the lengthscale and regularization parameter using Adam with a meta-learning rate of 0.001. We benchmark a neural network IKML, called IKML-MLP, in which we use a 4-layer MLP with 64 hidden units, 8 output features and 500 random features trained using Adam with learning rate of \(3 \cdot 10^{-4}\), see Sec. 5 in the appendix.

From \cref{tab:beijing-air-quality-and-gas-sensor-experiment} we can see that IKML performs best with R2D2 and IKML-MLP close seconds.


\par{\bf Gas Sensor~} The Gas Sensor Modulation dataset
\label{par:gas-sensor}
\citep{burgues18_estim_limit_detec_semic_gas} is a collection of
multivariate timeseries collected in a controlled environment using MOX sensors
for CO detection sampled at 3.5 Hz. Each task corresponds to a subsampled
time-series from an experiment. As noted in \citep{burgues18_estim_limit_detec_semic_gas} the regression tasks are
hard due to being heteroscedastic, non-normal and non-linear as a function of time but with tasks
sharing a lot of structure, making it suitable as a meta-learning regression dataset. Further details in Sec. 4.2 in the appendix.

\begin{table}[t!]
  \caption{Test RMSE on Beijing Air Quality and Gas Sensor. Best results in \textbf{bold}.
  }
\centering
\begin{tabular}{c|ll}
    	{Model}                     & \thead{Air Quality \\ RMSE} & \thead{Gas Sensor \\ RMSE} \\
    	\toprule
      GMKL      & 23.27 $\pm$ 0.16 & 9.61 $\pm$ 0.07 \\
      LSBR    & 21.68 $\pm$ 0.29 & 12.44 $\pm$ 0.14\\
      MAML 						  & 34.96 $\pm$ 3.58 & 2.81 $\pm$ 0.12\\
      R2D2                        & 20.23 $\pm$ 0.55 & {\bfseries 1.95 $\pm$ 0.06}\\
      Gaussian meta-KRR           & 25.08 $\pm$ 0.48 & 9.80 $\pm$ 0.09\\
      GO         & 25.94 $\pm$ 0.91 & 12.78 $\pm$ 0.10\\
      IKML                        & {\bfseries 19.14 $\pm$ 0.93} & 2.80 $\pm$ 0.10\\
      IKML-MLP                    & 20.77 $\pm$ 0.57 & 2.06 $\pm$ 0.09\\
      ANP                         & 33.77 \(\pm\) 0.70 & 2.12 \(\pm\) 0.09\\
      \bottomrule
    \end{tabular}
    \label{tab:beijing-air-quality-and-gas-sensor-experiment}
\end{table}

\begin{table*}[t]
  \caption{Test RMSE / MAE / SMAPE on Beijing Air Quality and Gas Sensor datasets for R2D2, IKML and ANP.}
  \centering
  \begin{tabular}{lccccccccc}
	& & \multicolumn{3}{c}{Air Quality} & \phantom{abc} &
																	  \multicolumn{3}{c}{Gas Sensor}\\
	\cmidrule{3-5} \cmidrule{7-9}
	Model & & RMSE & MAE & SMAPE & & RMSE & MAE & SMAPE \\
	\toprule
	R2D2 & & 20.23 $\pm$ 0.55 & 11.67 $\pm$ 0.40 & 0.24 $\pm$ 0.01 & & {\bfseries 1.95 $\pm$ 0.06} & {\bfseries 0.94 $\pm$ 0.09} & 0.18 $\pm$ 0.05 \\
	IKML & & {\bfseries 19.14 $\pm$ 0.93} & {\bfseries 10.62 $\pm$ 0.19} & {\bfseries 0.22 $\pm$ 0.00} & & 2.80 $\pm$ 0.10 & 1.61 $\pm$ 0.26 & 0.24 $\pm$ 0.03 \\
	ANP & & 33.77 \(\pm\) 0.70 & 21.08 $\pm$ 0.40 & 0.35 $\pm$ 0.01 & & 2.12 \(\pm\) 0.09 & 1.06 $\pm$ 0.06 & {\bfseries 0.09 $\pm$ 0.01} \\
	\bottomrule
  \end{tabular}
  \label{tab:additional-metrics}
\end{table*}

We benchmark the algorithms for \(n_{\rm tr} = n_{\rm val} = 20\). After experimenting we use the following configuration of
the algorithms; For Gaussian MKL meta-KRR we use 20 Gaussian kernels with
lengthscale chosen geometrically from $1$ to $10^{8}$ and learn the
coefficients and regularization parameter using the same parameterization as in
the synthetic experiment with Adam and a meta-learning rate of 0.001. For LS
Biased Regularization we learn the bias and regularization parameter using Adam
with meta-learning rate 0.01.

We parameterized MAML with a 4-layer MLP with 64 hidden
units with inner learning rate \(10^{-4}\) and one
adaptation step, learning the initialization using Adam with meta-learning rate of \(10^{-4}\). For R2D2, IKML and ANP we cross-validated to find the best set of hyperparameters, see Sec. 5 and Tab. 1 in the appendix for more information. For Gaussian meta-KRR we learn the lengthscale and regularization parameter using Adam with a meta-learning rate of 0.001. IKML-MLP is as for Air Quality, but with 2 layers and 100 random features.

As can be seen from \cref{tab:beijing-air-quality-and-gas-sensor-experiment},
IKML and MAML achieve a low meta-test error, after R2D2 and IKML-MLP.

\par{\bf Additional Metrics~}
For the algorithms R2D2, IKML and ANP, with the same setting and training strategy as outlined for Air Quality and Gas Sensor datasets, we evaluate them on two additional metrics: mean absolute error (MAE) and symmetric mean absolute percentage error (SMAPE)\footnote{Note that we present this as a ratio instead of as a percentage.}~\citep{chicco2021coefficient}. From \cref{tab:additional-metrics} we see that IKML performs the best on all metrics on the Air Quality dataset with ANP performing poorly. For Gas Sensor R2D2 performs best except for SMAPE where ANP performs much better.

\section{Conclusion and Future Work}
\label{sec:conclusion}

We introduced a framework for implicit kernel meta-learning (IKML) in the context of translation-invariant and deep random kernel families. Our approach focuses on problems where data does not present a clear input structure (in contrast e.g. to image classification settings) and using a plug-in translation invariant kernel might be a safer strategy. Our approach leverages the characterization of random feature kernels, in particular the translation invariant kernels granted by Bochner's theorem and ideas from the random features literature to learn it in practice. We derive a novel bound on the excess transfer risk shedding light on how to choose the number of random features. To validate our method we introduced two real-world meta-learning regression datasets.

IKML achieves the best or close-to-best performance on all of the datasets against state-of-the-art methods designed for few-shot image classification. We hypothesize that when the data does not have enough structure (e.g. in most regression settings), learning a deep representation -- as done by state-of-the-art methods such as MAML or R2D2 -- may be less effective. We leave further investigation of this question to future work.

We close by mentioning three relevant directions for future research: {\em i) Conditional meta-learning} Is it possible to extend the framework to conditional meta-learning? One way would be to use KTA similar to \citep{sinha2016learning} and adjusting the initial starting kernel similar to MAML; {\em ii) Theoretical guarantees} Can we show that IKML converges to a stationary point for benign settings? This would require understanding the bias-variance decomposition of the gradient; {\em iii) Alternative Kernel Classes} Can we extend IKML to other kernel families? An example is dot-product kernels \citep{kar2012random}.

\section{Acknowledgements}
The authors would like to thank the anonymous reviewers for their comments, which helped improve the paper. John Isak Texas Falk is supported through the Department of Computer Science at UCL, and this work was partially carried out while he was a visiting doctoral student at Istituto Italiano di Tecnologia, Genoa. Carlo Ciliberto acknowledges the
support of the Royal Society (grant SPREM RGS\textbackslash{}R1\textbackslash{}201149) and Amazon.com Inc. (Amazon
Research Award – ARA).% MASSI ADD HERE! Massimiliano Pontil were supported in part by EPSRC Grant N. EP/P009069/1 and SAP SE.

\bibliography{./bibliography.bib}

\end{document}
