\documentclass[accepted]{uai2023}
\usepackage[american]{babel}
\usepackage{natbib}
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} % amsmath with fixes and additions
% \usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{tikz}
%=======================================================================
%=======================================================================
%=======================================================================
%=======================================================================
%=======================================================================
%=======================================================================
%%%%% NEW MATH DEFINITIONS %%%%%
\usepackage{amsmath, amsthm,amssymb, amsfonts, xcolor, tikz, float, url, algorithm,algpseudocode, bm, bbm, mathtools}
\usepackage{thmtools}
\usepackage{algorithm}
\usepackage{algpseudocode}
%\usepackage{autonum}
\usepackage{caption}
\usepackage{subcaption}
\usepackage{lipsum}
\usepackage{wrapfig}
\usepackage{xpatch}
%\usepackage{nomencl}
\usepackage{xspace}
\usepackage{bookmark}
\usepackage{hyperref}
\usepackage[capitalise]{cleveref}


\usepackage{xr}
\makeatletter
% (\makeatletter is needed because the macro below calls \@addtofilelist.)

% \addFileDependency{<file>}: announce <file> (name including extension) as a
% dependency of this document by writing it to the log and to the file list,
% and emit a "No file" message when it does not exist yet.
\newcommand*{\addFileDependency}[1]{% argument=file name and extension
\typeout{(#1)}% latexmk will find this if $recorder=0
% however, in that case, it will ignore #1 if it is a .aux or
% .pdf file etc and it exists! If it doesn't exist, it will appear
% in the list of dependents regardless)
%
% Write the following if you want it to appear in \listfiles
% --- although not really necessary and latexmk doesn't use this
%
\@addtofilelist{#1}
%
% latexmk will find this message if #1 doesn't exist (yet)
\IfFileExists{#1}{}{\typeout{No file #1.}}
}\makeatother

% \myexternaldocument{<basename>}: import the labels of <basename> through
% xr's \externaldocument and register both <basename>.tex and <basename>.aux
% as file dependencies via \addFileDependency.
\newcommand*{\myexternaldocument}[1]{%
\externaldocument{#1}%
\addFileDependency{#1.tex}%
\addFileDependency{#1.aux}%
}
% Pull in the labels of schur_138-supp (presumably the supplementary
% material) so they can be referenced from this document.
\myexternaldocument{schur_138-supp}


% Link colour used for internal links, URLs and citations.
\definecolor{hanblue}{rgb}{0.27, 0.42, 0.81}
% NOTE(review): `hidelinks` and `colorlinks=true` ask for opposite things.
% The later `colorlinks=true` presumably takes effect, since the keys are
% given in this order -- confirm, then drop whichever one is unintended.
\hypersetup{
hidelinks,
    colorlinks=true,
    linkcolor=hanblue,
    urlcolor=hanblue,
    citecolor=hanblue,
    anchorcolor=black}
\usetikzlibrary{decorations, calligraphy, positioning}
\usepackage{multirow}

% \norm{<expr>}: double-bar norm of <expr>.
% Uses amsmath's \lVert/\rVert so the bars are typeset as opening/closing
% delimiters. With a bare \| on both sides, a leading sign such as in
% \norm{-x} would be spaced as a binary operator.
\newcommand\norm[1]{\lVert #1 \rVert}


% Theorem-like environments. All of them share the `theorem` counter, which is
% numbered within sections.
\newtheorem{theorem}{Theorem}[section]
\newtheorem{remark}[theorem]{Remark}
\newtheorem{assumption}[theorem]{Assumption}
\newtheorem{proposition}[theorem]{Proposition}
\newtheorem{definition}[theorem]{Definition}
\newtheorem{corollary}[theorem]{Corollary}
\newtheorem{lemma}[theorem]{Lemma}
% cleveref has no built-in name for the `assumption` environment, so give it
% one; otherwise \cref{ass:...} has no reference format to print.
\crefname{assumption}{Assumption}{Assumptions}
\Crefname{assumption}{Assumption}{Assumptions}

% Theorem with a caller-supplied number: \begin{mythm}{<number>} ... \end{mythm}
% typesets "Theorem <number>". `innercustomthm` is the underlying theorem
% environment; `mythm` overrides its printed counter with the argument.
\newtheorem{innercustomthm}{Theorem}
\newenvironment{mythm}[1]
  {\renewcommand\theinnercustomthm{#1}\innercustomthm}
  {\endinnercustomthm}
  
% Upright operator names with operator spacing.
% Note: \trace prints "tr"; a separate \Tr printing "Tr" is declared further
% below in this preamble.
\DeclareMathOperator{\diag}{diag}
\DeclareMathOperator{\trace}{tr}
\DeclareMathOperator{\Lap}{Lap}
\DeclareMathOperator{\asvec}{vec}
\DeclareMathOperator{\asmatrix}{mat}

% Named colours (presumably used by the figures and tables; their uses are
% not visible in this part of the file).
% The trailing comments on the last two lines keep earlier colour values.
\definecolor{parnian}{rgb}{0.36, 0.54, 0.66}
\definecolor{oracle}{HTML}{4f7992}
\definecolor{naive}{HTML}{d33f49}
\definecolor{meta}{HTML}{419d78}
\definecolor{federated}{HTML}{f9a620}
\definecolor{grey}{rgb}{0.7, 0.75, 0.71}
\definecolor{optimalrate}{HTML}{69C498}%{rgb}{0.67, 0.88, 0.69}
\definecolor{notopt}{HTML}{d33f49}%{ff0a47}%{CD515F}

% Highlight a newly defined term in bold.
% Uses \textbf rather than the legacy {\bf ...} switch: this preamble redefines
% \bf as the bold math letter f (see the vector macros), so {\bf #1} would no
% longer produce bold text.
\newcommand{\newterm}[1]{\textbf{#1}}


% Cross-reference helpers. Each macro prints the kind of object followed by
% its number; a tie (~) keeps the number on the same line as the preceding word.

% Figure reference, lower-case.
\def\figref#1{figure~\ref{#1}}
% Figure reference, capital. For start of sentence
\def\Figref#1{Figure~\ref{#1}}
% Reference to two / four figures.
\def\twofigref#1#2{figures~\ref{#1} and~\ref{#2}}
\def\quadfigref#1#2#3#4{figures~\ref{#1}, \ref{#2}, \ref{#3} and~\ref{#4}}
% Section reference, lower-case.
\def\secref#1{section~\ref{#1}}
% Section reference, capital.
\def\Secref#1{Section~\ref{#1}}
% Reference to two sections.
\def\twosecrefs#1#2{sections~\ref{#1} and~\ref{#2}}
% Reference to three sections.
\def\secrefs#1#2#3{sections~\ref{#1}, \ref{#2} and~\ref{#3}}
% Reference to an equation, lower-case.
% NOTE(review): this overrides amsmath's \eqref (which prints the number in
% parentheses); kept as is because callers may rely on the "equation N" form.
\def\eqref#1{equation~\ref{#1}}
% Reference to an equation, upper case
\def\Eqref#1{Equation~\ref{#1}}
% A raw reference to an equation---avoid using if possible
\def\plaineqref#1{\ref{#1}}
% Reference to a chapter, lower-case.
\def\chapref#1{chapter~\ref{#1}}
% Reference to a chapter, upper case.
\def\Chapref#1{Chapter~\ref{#1}}
% Reference to a range of chapters
\def\rangechapref#1#2{chapters~\ref{#1}--\ref{#2}}
% Reference to an algorithm, lower-case.
\def\algref#1{algorithm~\ref{#1}}
% Reference to an algorithm, upper case.
\def\Algref#1{Algorithm~\ref{#1}}
% Reference to two algorithms, lower-case / upper case.
\def\twoalgref#1#2{algorithms~\ref{#1} and~\ref{#2}}
\def\Twoalgref#1#2{Algorithms~\ref{#1} and~\ref{#2}}
% Reference to a part, lower case
\def\partref#1{part~\ref{#1}}
% Reference to a part, upper case
\def\Partref#1{Part~\ref{#1}}
% Reference to two parts.
\def\twopartref#1#2{parts~\ref{#1} and~\ref{#2}}

% Ceiling and floor brackets around the argument.
\def\ceil#1{\lceil #1 \rceil}
\def\floor#1{\lfloor #1 \rfloor}
% Bold 1.
\def\1{\bm{1}}
% Calligraphic D, plain and with upright "valid"/"test" subscripts.
% The subscript is attached outside \mathcal so that only the letter D is
% set in the calligraphic alphabet.
\newcommand{\train}{\mathcal{D}}
\newcommand{\valid}{\mathcal{D}_{\mathrm{valid}}}
\newcommand{\test}{\mathcal{D}_{\mathrm{test}}}

% Shorthand for \epsilon.
\def\eps{{\epsilon}}

% Vectors: \b<letter> typesets the bold (\bm) version of <letter>.
% There is no entry for the letter m in this list.
% NOTE(review): \bf below shadows LaTeX's legacy \bf font switch, so any
% {\bf ...} used after this point expands to a bold math f instead.
\def\bzero{{\bm{0}}}
\def\bone{{\bm{1}}}
\def\bmu{{\bm{\mu}}}
\def\btheta{{\bm{\theta}}}
\def\ba{{\bm{a}}}
\def\bb{{\bm{b}}}
\def\bc{{\bm{c}}}
\def\bd{{\bm{d}}}
\def\be{{\bm{e}}}
\def\bf{{\bm{f}}}
\def\bg{{\bm{g}}}
\def\bh{{\bm{h}}}
\def\bi{{\bm{i}}}
\def\bj{{\bm{j}}}
\def\bk{{\bm{k}}}
\def\bl{{\bm{l}}}
\def\bn{{\bm{n}}}
\def\bo{{\bm{o}}}
\def\bp{{\bm{p}}}
\def\bq{{\bm{q}}}
\def\br{{\bm{r}}}
\def\bs{{\bm{s}}}
\def\bt{{\bm{t}}}
\def\bu{{\bm{u}}}
\def\bv{{\bm{v}}}
\def\bw{{\bm{w}}}
\def\bx{{\bm{x}}}
\def\by{{\bm{y}}}
\def\bz{{\bm{z}}}




% Matrices: \m<Letter> typesets the bold (\bm) upper-case <Letter>;
% the last four entries are bold Greek letters.
\def\mA{{\bm{A}}}
\def\mB{{\bm{B}}}
\def\mC{{\bm{C}}}
\def\mD{{\bm{D}}}
\def\mE{{\bm{E}}}
\def\mF{{\bm{F}}}
\def\mG{{\bm{G}}}
\def\mH{{\bm{H}}}
\def\mI{{\bm{I}}}
\def\mJ{{\bm{J}}}
\def\mK{{\bm{K}}}
\def\mL{{\bm{L}}}
\def\mM{{\bm{M}}}
\def\mN{{\bm{N}}}
\def\mO{{\bm{O}}}
\def\mP{{\bm{P}}}
\def\mQ{{\bm{Q}}}
\def\mR{{\bm{R}}}
\def\mS{{\bm{S}}}
\def\mT{{\bm{T}}}
\def\mU{{\bm{U}}}
\def\mV{{\bm{V}}}
\def\mW{{\bm{W}}}
\def\mX{{\bm{X}}}
\def\mY{{\bm{Y}}}
\def\mZ{{\bm{Z}}}
\def\mBeta{{\bm{\beta}}}
\def\mPhi{{\bm{\Phi}}}
\def\mLambda{{\bm{\Lambda}}}
\def\mSigma{{\bm{\Sigma}}}




% Calligraphic letters: \cal<Letter> typesets \mathcal{<Letter>}.
% (This group was originally headed "Graph".)
\def\calA{{\mathcal{A}}}
\def\calB{{\mathcal{B}}}
\def\calC{{\mathcal{C}}}
\def\calD{{\mathcal{D}}}
\def\calE{{\mathcal{E}}}
\def\calF{{\mathcal{F}}}
\def\calG{{\mathcal{G}}}
\def\calH{{\mathcal{H}}}
\def\calI{{\mathcal{I}}}
\def\calJ{{\mathcal{J}}}
\def\calK{{\mathcal{K}}}
\def\calL{{\mathcal{L}}}
\def\calM{{\mathcal{M}}}
\def\calN{{\mathcal{N}}}
\def\calO{{\mathcal{O}}}
\def\calP{{\mathcal{P}}}
\def\calQ{{\mathcal{Q}}}
\def\calR{{\mathcal{R}}}
\def\calS{{\mathcal{S}}}
\def\calT{{\mathcal{T}}}
\def\calU{{\mathcal{U}}}
\def\calV{{\mathcal{V}}}
\def\calW{{\mathcal{W}}}
\def\calX{{\mathcal{X}}}
\def\calY{{\mathcal{Y}}}
\def\calZ{{\mathcal{Z}}}

% Sets: \s<Letter> typesets the blackboard-bold \mathbb{<Letter>}.
\def\sA{{\mathbb{A}}}
\def\sB{{\mathbb{B}}}
\def\sC{{\mathbb{C}}}
\def\sD{{\mathbb{D}}}
% Don't use a set called E, because this would be the same as our symbol
% for expectation.
\def\sF{{\mathbb{F}}}
\def\sG{{\mathbb{G}}}
\def\sH{{\mathbb{H}}}
\def\sI{{\mathbb{I}}}
\def\sJ{{\mathbb{J}}}
\def\sK{{\mathbb{K}}}
\def\sL{{\mathbb{L}}}
\def\sM{{\mathbb{M}}}
\def\sN{{\mathbb{N}}}
\def\sO{{\mathbb{O}}}
\def\sP{{\mathbb{P}}}
\def\sQ{{\mathbb{Q}}}
\def\sR{{\mathbb{R}}}
\def\sS{{\mathbb{S}}}
\def\sT{{\mathbb{T}}}
\def\sU{{\mathbb{U}}}
\def\sV{{\mathbb{V}}}
\def\sW{{\mathbb{W}}}
\def\sX{{\mathbb{X}}}
\def\sY{{\mathbb{Y}}}
\def\sZ{{\mathbb{Z}}}





% Distribution symbols. Subscript labels are set upright with \mathrm{...};
% the legacy \rm switch does not take an argument and is obsolete in LaTeX2e.

% The true underlying data generating distribution
\newcommand{\pdata}{p_{\mathrm{data}}}
% The empirical distribution defined by the training set
\newcommand{\ptrain}{\hat{p}_{\mathrm{data}}}
\newcommand{\Ptrain}{\hat{P}_{\mathrm{data}}}
% The model distribution
\newcommand{\pmodel}{p_{\mathrm{model}}}
\newcommand{\Pmodel}{P_{\mathrm{model}}}
\newcommand{\ptildemodel}{\tilde{p}_{\mathrm{model}}}
% Stochastic autoencoder distributions
\newcommand{\pencode}{p_{\mathrm{encoder}}}
\newcommand{\pdecode}{p_{\mathrm{decoder}}}
\newcommand{\precons}{p_{\mathrm{reconstruct}}}

\newcommand{\laplace}{\mathrm{Laplace}} % Laplace distribution

% \ubar{<math>}: puts a bar under <math> using the text-mode \b accent.
\newcommand{\ubar}[1]{\text{\b{$#1$}}}

% Short names for frequently used symbols. The meanings suggested by the
% macro names (e.g. \lr, \reg) are not confirmed by this part of the file.
\newcommand{\E}{\mathbb{E}}
\newcommand{\Ls}{\mathcal{L}}
\newcommand{\R}{\mathbb{R}}
\newcommand{\emp}{\tilde{p}}
\newcommand{\lr}{\alpha}
\newcommand{\reg}{\lambda}
\newcommand{\rect}{\mathrm{rectifier}}
\newcommand{\softmax}{\mathrm{softmax}}
\newcommand{\sigmoid}{\sigma}
\newcommand{\softplus}{\zeta}
\newcommand{\KL}{D_{\mathrm{KL}}}
\newcommand{\Var}{\mathrm{Var}}
\newcommand{\standarderror}{\mathrm{SE}}
\newcommand{\Cov}{\mathrm{Cov}}
% Names of norms, written as L^<p>.
% Wolfram Mathworld says $L^2$ is for function spaces and $\ell^2$ is for vectors
% But then they seem to use $L^2$ for vectors throughout the site, and so does
% wikipedia.
\newcommand{\normlzero}{L^0}
\newcommand{\normlone}{L^1}
\newcommand{\normltwo}{L^2}
\newcommand{\normlp}{L^p}
\newcommand{\normmax}{L^\infty}

\newcommand{\parents}{Pa} % See usage in notation.tex. Chosen to match Daphne's book.

% Starred declarations: sub/superscripts are placed as limits in display math.
\DeclareMathOperator*{\argmax}{arg\,max}
\DeclareMathOperator*{\argmin}{arg\,min}

\DeclareMathOperator{\sign}{sign}
\DeclareMathOperator{\Tr}{Tr}
% \ab is a short alias for \allowbreak.
\let\ab\allowbreak


% Algo names: each macro prints the name in small caps; \xspace supplies the
% following space when needed, so the macros can be used without a trailing {}.
\def\algoff{{\textsc{Meta-KGL}}\xspace}
\def\algon{{\textsc{LiBO}}\xspace}
\def\falgoff{{\textsc{F-Meta-KGL}}\xspace}
\def\falgon{{\textsc{F-LiBO}}\xspace}
\def\gpucb{{\textsc{GP-UCB}}\xspace}
\def\bba{{\textsc{BaseBO}}\xspace}

% Variable names. A star marks true quantities, a hat marks estimates.
% \gj and \fed expand to a bare superscript, so they must directly follow the
% symbol they decorate (e.g. \vbeta\gj).
\def\tH{{\calH_{k^\star}}}
\def\Hhat{{\calH_{\hat{k}}}}
% Script arguments are braced explicitly; an unbraced _\mathrm{...} only works
% by accident of how \mathrm expands.
% NOTE(review): \Hfull writes "full" as a subscript of k while \kfull below
% writes it as a superscript -- confirm which is intended.
\def\Hfull{{\calH_{k_{\mathrm{full}}}}}
\def\tk{{k^\star}}
\def\tbeta{\beta^\star}
\def\tvbeta{\bm{\beta}^\star}
\def\khat{{\hat{k}}}
\def\betahat{{\hat{\beta}}}
\def\vbetahat{\hat{\bm{\beta}}}
\def\vvarepsilon{{\bm{\varepsilon}}}
\def\vbeta{{\bm{\beta}}}
\def\vphi{{\bm{\phi}}}
\def\gj{{^{(j)}}}
\def\tJ{{J^\star}}
\def\Jhat{{\hat{J}}}
\def\dmax{{d_{\mathrm{max}}}}
\def\kfull{{k^{\mathrm{full}}}}
\def\Roracle{{R^\star}} %_\mathrm{oracle}}}
\def\Dexp{\calD^{\mathrm{exp}}}
\def\fed{{^{(\mathrm{fed})}}}

% delta value for the differential privacy analysis
\def\ddp{{\delta_{\mathrm{dp}}}}
% epsilon value for the differential privacy analysis
\def\edp{{\epsilon_{\mathrm{dp}}}}
% sigma value for the differential privacy analysis
\def\sdp{{\sigma_{\mathrm{dp}}}}
% known upper bound on s^*
\def\cdp{{c_{s^\star}}}

% \munderbar{<math>}: underline <math> after setting the depth of its box to
% zero, so the rule is drawn at the baseline rather than below descenders.
% \tw@ (box register 2) and \z@ (zero length) need \makeatletter.
\makeatletter
\def\munderbar#1{\underline{\sbox\tw@{$#1$}\dp\tw@\z@\box\tw@}}
\makeatother

\usepackage{multicol}
% \myalign{<colspec>}{<text>}: a single table cell typeset with column
% specification <colspec>, overriding the alignment of its column.
\newcommand*{\myalign}[2]{\multicolumn{1}{#1}{#2}}

\usepackage{colortbl}

\usepackage{pifont}
% Check mark and cross mark from the pifont dingbat font.
\newcommand{\cmark}{\ding{51}}%
\newcommand{\xmark}{\ding{55}}%
\usepackage{makecell}

% \printfnsymbol{<n>}: print the <n>-th footnote symbol as a superscript
% without creating a footnote. It is used in the author list to repeat the
% mark produced by \thanks.
\makeatletter
\newcommand{\printfnsymbol}[1]{%
  \textsuperscript{\@fnsymbol{#1}}%
}
\makeatother

%=======================================================================
%=======================================================================
%=======================================================================
%=======================================================================
%=======================================================================
%=======================================================================

\title{Lifelong Bandit Optimization: No Prior and No Regret}

\author[1]{Felix~Schur\thanks{Equal contribution.}}
\author[1]{Parnian~Kassraie\printfnsymbol{1}}
\author[1]{Jonas~Rothfuss}
\author[1]{Andreas~Krause}
\affil[1]{%
    ETH Zurich\\
    Switzerland
}
  
  \begin{document}
\maketitle

\begin{abstract}
\looseness -1 Machine learning algorithms are often applied repeatedly to problems with similar structure. We focus on solving a sequence of bandit optimization tasks and develop \algon, an algorithm which {\em adapts} to the environment by learning from past experience and becomes more sample-efficient in the process.
We assume a kernelized structure where the kernel is {\em unknown} but {\em shared} across all tasks.
\algon sequentially meta-learns a kernel that approximates the true kernel and solves the incoming tasks with the latest kernel estimate. Our algorithm can be paired with {\em any} kernelized or linear bandit algorithm and guarantees {\em oracle optimal} performance, meaning that as more tasks are solved, the regret of \algon on each task converges to the regret of the bandit algorithm with oracle knowledge of the true kernel. Naturally, if paired with a sublinear bandit algorithm, \algon yields a sublinear lifelong regret.  We also show that direct access to the data from each task is not necessary for attaining sublinear regret. We propose \falgon, which solves the lifelong problem in a federated manner. \looseness-1
\end{abstract}
%=====================================================================================
%=====================================================================================
%=====================================================================================
%=====================================================================================
\section{Introduction} \label{sec:intro}
A key aspect of human intelligence is our ability to harness previous experience and quickly improve when repeatedly solving similar problems. 
In this paper, we study how to solve a {\em sequence of learning problems, on related instances}, and become more efficient in the process.  
In particular we focus on problems which are solved through Bayesian Optimization, a.k.a.~kernelized bandit algorithms (BO), where the kernel captures regularity structure of the tasks. 
Motivating applications are AutoML systems, which perform hyper-parameter tuning for the same model on different datasets, or different models on the same dataset. We expect that the more tasks our machine learning system solves, the better the system becomes at solving the next one.

\looseness -1 We model this as {\em lifelong learning},
where an agent sequentially faces kernelized bandit problems with different unknown reward functions. 
While prior work assumes the kernel to be known (e.g., hand-designed), we consider the kernel $\tk$ to be {\em unknown}, but {\em shared} between the problem instances.
After each bandit task, we use the previously collected data to {\em meta-learn a kernel function} $\hat{k}$ as a proxy for the unknown $\tk$. 
We transfer knowledge across tasks by sequentially updating the meta-learned kernel and using it to solve the next task. This way, we adapt to the environment and gradually improve the bandit performance.
Ideally, we would like to reach the oracle-optimal performance, i.e., the performance of a bandit algorithm with complete knowledge of the environment.

Lifelong bandit optimization is a delicate problem for two reasons. First, the success of each round of BO depends on the validity of the meta-learned kernel: We only have guaranteed convergence and sublinear regret if the reproducing kernel Hilbert space (RKHS), induced by the estimated kernel $\hat{k}$, contains the reward functions. Second, the data that is used for meta-learning is collected at the previous BO tasks. Thus, during each BO round, we not only have to quickly find reward maximizing actions, but also have to collect exploratory data that is sufficiently informative for successful meta-learning of the kernel.

We address these challenges when the true kernel is a sparse convex combination of a large number of candidate kernels.
We propose an approach for meta-learning a provably consistent estimator of the true kernel, given data from previous tasks (\cref{thm:offline_main_consitency}).
To ensure that this data is sufficiently informative, we interlace the queries of the BO agent with purely exploratory queries.
Combining these two key ideas, we design our main algorithm, the {\em Lifelong Bandit Optimizer (\algon)}.
This algorithm is versatile since it is {\em agnostic to the bandit policy}, i.e. it can be wrapped around {\em any} kernelized or linear base bandit algorithm to influence its policy and satisfy lifelong guarantees. 
We prove that it is {\em oracle-optimal}, i.e. that by using \algon, we can eventually achieve the {\em same} worst-case performance as the base bandit algorithm which has oracle knowledge of the true kernel (\cref{thm:lifelong}). 
We {\em do not make assumptions} about the base bandit algorithm, and our convergence guarantees hold for many bandit solvers such as OFUL \citep{abbasi2011improved}, GP-UCB \citep{srinivas2009gaussian} or GP-TS \citep{chowdhury2017kernelized}. 
Additionally, we consider a federated setting where each BO task is performed by a client node in a network and the data ought not to be exchanged with the server node due to privacy concerns.
We propose the {\em Federated Lifelong Bandit Optimizer (\falgon)}, 
and show that it satisfies a guarantee similar to \algon (\cref{thm:lifelong_federated}).
If we take \gpucb as base bandit solver,
\algon and \falgon have the {\em same worst-case regret bound rates} as the \gpucb solver when given {\em oracle knowledge} of the true kernel (\cref{cor:ll_ucb_decreasing_main_text} and \ref{cor:ucb_federated}).
In \cref{sec:experiments} we support our theoretical findings by experiments on synthetic and real-world data in the AutoML context. 
Lastly, we discuss related works in \cref{sec:related_work}. \looseness-1
%=====================================================================================
%=====================================================================================
%=====================================================================================
%=====================================================================================
\section{Problem Statement} 
\label{sec:problem}
\looseness -1 We consider a lifelong optimization setting, where an agent interacts with a sequence of black-box optimization problems, arriving one after another. Throughout the sequence of optimization tasks, the agent can adapt to the environment based on the previously collected data and improve its performance on the succeeding tasks.
Formally, the agent iteratively faces bandit problems with unknown reward functions $f_1, \dots, f_m$ residing in an RKHS $\tH$ that corresponds to an {\em unknown} kernel function $\tk$.
To impose regularity, we assume that each reward function $f_s$ has a bounded kernel norm $\norm{f_s}_\tk \leq B$ and that the domain $\calX \subset \sR^{d_0}$ is compact.
The agent interacts with each task $f_s$ for $n$ time steps. For each task $s =1,\dots,m$, at time step $i =1, \dots, n$, the agent selects an action $\bx_{s,i} \in \calX$ and receives a stochastic reward via
$y_{s,i} = f_{s}(\bx_{s,i}) + \varepsilon_{s,i}$. Here, $\varepsilon_{s,i}$ are i.i.d.~samples from a zero-mean sub-Gaussian noise with variance proxy $\sigma^2$.
The goal of the agent is to maximize its rewards across all tasks. This can be formalized as minimizing the {\em lifelong regret} over $m$ tasks of size $n$, defined as
\[
R(m,n) \coloneqq \sum_{s=1}^m \sum_{i=1}^n \bigl( f_s(\bx_s^\star) - f_s(\bx_{s,i}) \bigr),
\]
where $\bx_s^\star$ is a global maximum of $f_s$.
If \smash{$\frac{R(m,n)}{mn} \rightarrow 0$} as $m,n \rightarrow \infty$ then the agent eventually converges to the global optimum of each upcoming optimization task. This property is commonly referred to as \emph{sublinearity} of the regret.
\looseness -1 To attain a small regret, the agent maintains an estimate of the unknown reward function $f_s$ based on its history.
Typically, a kernelized regression oracle (e.g., kernel ridge regression or Gaussian Processes) is employed for this task. The choice of the kernel function plays a key role in the success and data-efficiency of the bandit optimization. If the hypothesis space $\calH_k$ induced by the kernel $k$ is too restrictive and does not contain the true reward functions $f_s$, the agent will likely never find reward maximizing actions. To prevent this, most practitioners pick a conservatively complex kernel with a large hypothesis space that is very likely to contain $\tH$. However, the larger $\calH_k$, the more observations it takes to form a good reward estimate, making finding an optimal solution less efficient.

\looseness -1 We take a data-driven approach to select the kernel. In particular, we aim to sequentially meta-learn a kernel $\hat{k}$ which approximates the true kernel $\tk$, using the data from previous bandit tasks.
Let $\calD_s \coloneqq \{ (\bx_{s,i}, y_{s,i})_{i \leq n} \}$ be the data corresponding to task $s$, and $\calD_{1:s} \coloneqq \calD_1 \cup \dots \cup \calD_s$ be the collection of datasets from the first $s$ tasks.
Then once the agent solves task $s$, we pass $\calD_{1:s}$ to the {\em meta-agent}, who meta-learns a kernel $\khat_s$. This kernel is then provided to the agent who uses it for solving the next BO task. 
Our meta-learning algorithm can be paired with any kernelized bandit algorithm and achieves sublinear lifelong regret, if the bandit algorithm achieves sublinear single-task regret given oracle knowledge of the kernel. 
%=====================================================================================
%=====================================================================================
%=====================================================================================
%=====================================================================================
\section{Meta-Learning Kernels}
\label{sec:metalearn}

We first present {\em Meta Kernelized Group Lasso (\algoff)}, our approach to estimate the kernel $\tk$, given data from previous tasks.
We consider a large set of eligible known base kernels $\{k_1, \dots, k_p\}$ where $k_j: \calX \times \calX \to \sR$ for all $j = 1, \dots, p$ and $k_j(\bx,\bx^\prime) \leq 1$ for all $\bx,\bx^\prime \in \calX$ without loss of generality.
We assume that while $\tk$ is unknown, it is a sparse
linear combination of kernels selected from this set, i.e., 
there exists $J^\star \subset \{1,\dots,p\}$ and $\alpha_1, \dots, \alpha_p \in \mathbb{R}$ such that
\[
    \tk(\bx,\bx') = \sum_{j \in J^\star} \alpha_j k_j(\bx,\bx^\prime),
\]
where $|\tJ| \ll p$. The set $\{k_1, \ldots, k_p\}$ can be very large, since we prove that the cost of finding $\tJ$ depends only logarithmically on $p$.
We further assume that each $k_j$ corresponds to a $d_j$-dimensional feature map, i.e., $k_j(\bx,\bx^\prime) = \vphi_j(\bx)^T\vphi_j(\bx^\prime)$, where $\vphi_j \in \sR^{d_j}$ and $d_j < \infty$.
This setting generalizes the common linear bandit assumption, to also account for higher-order terms and interaction between coordinates of the input.
Let $\vphi(\bx)$ denote the concatenated $d$-dimensional feature map, where $d \coloneqq  \sum_{j=1}^pd_j$ and
$
\vphi(\bx) \coloneqq [\vphi_j^T(\bx)]_{j \leq p}.
$
Then for $s = 1, \dots, m$ the reward functions can be written as 
$f_s(\cdot) = \sum_{j=1}^p \vphi_j^\top(\cdot)\tvbeta_s\gj$ such that
$\tvbeta_s\gj = 0$ for all $j \notin \tJ$. Moreover, the RKHS norm of $f_s$ will be equal to $\norm{f_s}^2_\tk = \sum_{j=1}^p \norm{\tvbeta_s\gj}_2^2$. 
This kernel model is inspired by \citet{kassraie2022metalearning}, who assume $\tk$ lies in the convex cone of the base kernels.  

\looseness -1 In the lifelong setting, we sequentially form kernel estimates $\khat_s$ based on $\calD_{1:s}$ for $s=1, \dots, m$. In this section, we consider one snapshot of this process for $s=m$, where we have fixed meta-training data $\calD_{1:m}$ and meta-learn $\khat \coloneqq \khat_m$.
We assume without loss of generality (c.f. \cref{app:wlog}),
\begin{align*}
    \tk(\bx,\bx') = \frac{1}{|\tJ|} \sum_{j \in J^\star} k_j(\bx,\bx^\prime),
\end{align*}
and minimize a sparsity inducing loss which allows us to discard kernels that do not appear in the above formulation.
\algoff first minimizes  $\Ls(\vbeta; \calD_{1:m})$ over $\vbeta \in \sR^{md}$. 
\begin{align}
\label{eq:meta_loss}
\Ls\left(\vbeta; \calD_{1:m} \right) \coloneqq & \frac{1}{\vert \calD_{1:m}\vert} \norm{ \by - \mPhi \vbeta}_2^2  + \lambda \sum_{j=1}^{p} \|\vbeta\gj\|_2 \\
 =& \frac{1}{mn}  \sum_{s=1}^m\sum_{i=1}^n \big(y_{s,i} - \sum_{j=1}^{p} \vphi_j^T(\bx_{s,i})\vbeta_s\gj\big)^2 \nonumber\\
& \,\,+ \lambda \sum_{j=1}^{p} \sqrt{\sum_{s=1}^m\|\vbeta_s\gj\|^2_2}\nonumber
\end{align}
The vectorized formulation uses the following notation
\begin{align*}
    \by & \coloneqq \left[\left[y_{1, i}\right]_{i\leq n}, \dots,  \left[y_{m, i}\right]_{i\leq n}\right] \in \sR^{mn},\\
    \vbeta & \coloneqq \left[ [\vbeta_1\gj]_{j \leq p}, \dots, [\vbeta_m\gj]_{j \leq p} \right] \in \sR^{md},\\
    \vbeta\gj & \coloneqq \left[ \vbeta_1\gj, \dots, \vbeta_m\gj \right] \in \sR^{md_j},\\
    \mPhi & \coloneqq \diag(\mPhi_1, \dots, \mPhi_m) \in \sR^{mn \times dm}.
\end{align*}
Here $\mPhi_s \coloneqq (\vphi^\top(\bx_{s,1}), \dots, \vphi^\top(\bx_{s, n}))^\top$ is the $n \times d$ feature matrix of a task $s$, and therefore $\mPhi$ denotes a block diagonal matrix which gathers the features across all tasks. 
This meta-loss function is convex, and is equivalent to the well-known Group Lasso objective \citep{lounici2011oracle}. Therefore, it can be efficiently optimized using Group Lasso solvers \citep[e.g.,][]{celer2018} and enjoys the statistical properties of the Group Lasso, e.g., consistency and variable selection.
Let $\vbetahat \coloneqq \argmin \Ls(\vbeta; \calD_{1:m})$. 
The first term in \cref{eq:meta_loss} represents the squared prediction error of $\vbetahat$, while the second is a regularization term that induces group sparsity in $\vbetahat$.
Mainly, the solutions $\vbetahat = (\vbetahat^{(1)}, \dots, \vbetahat^{(p)})$ to this problem are group sparse, i.e. $\vbetahat\gj=0$ for many of the indices $j\in \{1, \dots, p\}$.
\algoff then constructs the set of plausible kernels $\Jhat$, by thresholding $\norm{\vbetahat\gj}_2$ and discarding the kernels that do not appear to be influencing the data, i.e.,
\begin{align*}
    \Jhat \coloneqq \left\{ j ~|~ j \in \{1,\dots,p\}\ \text{s.t.}\ \norm{\vbetahat\gj}_2 > \omega \sqrt{m} \right\},
\end{align*}
where $\omega > 0$ is a hyperparameter of the algorithm. We then construct the estimated kernel as
\[
\khat(\bx,\bx^\prime) \coloneqq \frac{1}{|\Jhat|} \sum_{j \in \Jhat} k_j(\bx,\bx^\prime).
\]
\algoff is summarized in Algorithm \ref{alg:meta_learning}. 
Under mild assumptions on the dataset, we can show that $\khat$ converges to the true kernel $\tk$ in probability. 
Our first assumption ensures that if $j \in \tJ$, i.e., $k_j$ is active in the true kernel, then the contribution of $k_j$ to the data is large enough to be statistically detectable under noise.
\begin{assumption}[Beta-min]
\label{ass:betamin}
    There exists $c_1 > 0$ such that for all $j \in J^\star$,
    \begin{equation*}
        \norm{\tvbeta\gj}_2 \geq c_1 \sqrt{m}.
    \end{equation*}
\end{assumption}
This assumption is commonly used in the high-dimensional statistics literature \citep{buhlmann2011statistics,bunea2013group,zhao2006model}.
Our second assumption requires that the meta-training data is sufficiently diverse. In \cref{prop:forced_exp}, we propose a policy which provably satisfies this assumption.
\begin{assumption}[Sufficiently Informative Data]
\label{ass:compatibility}
The feature matrix $\mPhi\in \sR^{mn \times md}$ is sufficiently informative if
there exists a constant $c_\kappa>0$ such that $\kappa(\mPhi) \geq c_\kappa$ where
\begin{align*}
\kappa(\mPhi) \coloneqq & \inf_{(J,\bb)} \frac{1}{\sqrt{n}} \frac{\|\mPhi \bb\|_2}{ \sum_{j \in J} \|\bb\gj\|_2}\\
 \text{s.t. }&  \ \bb \in \sR^{md}\backslash \{0\},\ \sum_{j \notin J} \|\bb^{(j)}\|_2 \leq 3 \sum_{j \in J} \|\bb^{(j)}\|_2,\\
 \,\, & J \subset \{1, \dots, p\}, \ \vert J \vert \leq \vert \tJ \vert.
\end{align*}
\end{assumption}
\looseness -1 Intuitively, $\kappa(\mPhi)$ measures the quality of the data: data points that are almost identical decrease $\kappa(\mPhi)$, and $\kappa(\mPhi)$ is large when data points are diverse.
If the minimum eigenvalue of $\mPhi$ is positive, Assumption \ref{ass:compatibility} is automatically fulfilled.
This type of assumption is common in the literature on representation/meta-learning for sequential decision-making \citep{yang2021impact, cella2021multi,kassraie2022metalearning} and sparse linear bandits \citep{bastani2020online,hao2020high, kim2019doubly}. It is also known in the Lasso literature as the compatibility condition \citep{buhlmann2011statistics}. 
Given these assumptions, we show that \algoff recovers the true kernel with high probability.
\begin{theorem}[Consistency of \algoff]
\label{thm:offline_main_consitency}
    Suppose $\calD_{1:m}$ satisfies \cref{ass:betamin}~and~\ref{ass:compatibility} with constants $c_1$ and $c_\kappa$ respectively.
    Set $\omega \in (0, c_1)$ and define $\bar \omega = \min \{\omega, c_1-\omega\}$.
    Choose $\lambda = \bar{\omega} c_{\kappa}^2/(8 \sqrt{m})$. 
    Then for \smash{$ \sqrt{n}> 32 \sigma /(\bar \omega c_{\kappa}^2)$}, \algoff satisfies
    \begin{align*}
        \mathbb{P} \left[\Jhat = J^\star \right] \geq 1 - p \exp\left( -m \left(\tfrac{\bar{\omega} c_{\kappa}^2 \sqrt{n}}{32 \sigma} - 1 \right)^2 \right).  
    \end{align*}
    In particular, $\Jhat$ is a consistent estimator both in $n$ and $m$,
    \begin{align*}
        \lim_{n \to \infty} \mathbb{P}\left[ \Jhat = J^\star \right] = 1, \ \text{and} \ \lim_{m \to \infty} \mathbb{P}\left[ \Jhat = J^\star \right] = 1.
    \end{align*}
\end{theorem}
\cref{app:offline} presents the proof of Theorem \ref{thm:offline_main_consitency}.
This theorem shows that our meta-learned kernel converges to $\tk$ as the number of meta-training tasks increases. First, this implies that the meta-learned hypothesis space includes the unknown reward functions
allowing downstream bandit algorithms to provably converge to the optimum.
Second, all candidate kernels $k_j$ that are not active in $k^\star$ are eventually excluded from $\khat$. By excluding all $k_j$ with $j \notin J^\star$ which are not necessary for estimating $f_s \in \calH_{k^\star}$, we effectively shrink the size of the hypothesis space, thereby reducing the uncertainty of the reward function estimates during bandit optimization.
Compared to \smash{$\kfull \coloneqq \tfrac{1}{p}\sum_{j=1}^p k_j$}, which naively uses all kernels, this leads to significant improvements in the query efficiency and performance of the bandit optimization.

\textbf{Comparison with Prior Work.}
\citet{kassraie2022metalearning} propose \textsc{Meta-KeL}, a Lasso-equivalent loss for meta-learning a sparse kernel, given i.i.d. offline data from i.i.d. tasks. 
We emphasize that it is not possible to achieve lifelong guarantees by sequentially applying this algorithm.
\algoff differs from \textsc{Meta-KeL} in key points, and satisfies stronger consistency guarantees: 1) It converges to $k^\star$ as either $n$, the number of samples per task, or $m$, the number of tasks, grows. In contrast, \textsc{Meta-KeL} converges in $m$ only.
2) \algoff satisfies the exact recovery guarantee for $k^\star$ since $\tJ = \Jhat$ with high probability,
whereas \textsc{Meta-KeL} only guarantees that $\tJ \subset \Jhat$. This is not sufficient to show that meta-learning improves upon the trivial kernel choice $\kfull$.
Both of these properties are required in the lifelong analysis.  \looseness -1
%=====================================================================================
%=====================================================================================
%=====================================================================================
%=====================================================================================
%=====================================================================================
\section{Lifelong Bandit Optimization}
\label{sec:lifelong}
We now use \algoff as a building block to develop the {\em Lifelong Bandit Optimizer} (\algon), an algorithm for lifelong bandit or Bayesian optimization. 
%
\algon is paired with a \bba agent which can be instantiated by any kernelized bandit algorithm, e.g., \gpucb \citep{srinivas2009gaussian} or \textsc{GP-TS} \citep{chowdhury2017kernelized}. 
For each task $f_s$, the \bba agent is given the kernel $\khat_{s-1}$ meta-learned on the $s-1$ first tasks. Equipped with the kernel, \bba interacts with the current bandit environment, aiming to optimize its payoff by balancing exploration and exploitation. 

In the lifelong setting, we not only have to explore for the sake of optimizing the current reward function $f_s$, but we also need to make sure that the sequence of action-reward pairs will be sufficiently informative (in the sense of Assumption~\ref{ass:compatibility}) for meta-learning $\hat{k}_s$ in the next stage.
To this end, \algon forces the base agent to select purely exploratory actions for the first $n_s$ steps of the task, by i.i.d.\ sampling from the uniform distribution on $\calX$. 
Following \citet{basu2021no}, we refer to this as {\em forced exploration}
and use $\Dexp_s \coloneqq \{(\bx_{s,i}, y_{s,i}), i\leq n_s\}$ to refer to the collected exploratory data of task $f_s$. We use a decreasing sequence $(n_1, \dots, n_m)$ as detailed below, since less exploration by \bba will be required once more multi-task data is collected. For steps $i > n_s$, \bba selects actions according to its normal bandit policy. 
After the agent has interacted with the current task for $n$ steps, we pass the exploratory data $\Dexp_{1:s}$ to \algoff to meta-learn $\khat_s$. We then announce this new kernel estimate to the \bba agent for solving the next task $s+1$. Figure~\ref{fig:lifelong_algo} visualizes this process and \cref{alg:lifelong} summarizes \algon.

\begin{figure}
    \centering
    \resizebox{\columnwidth}{!}{
    \input{Figures/Fig_lifelong}
    }
    \caption{Overview of \algon.\looseness-1}
    \label{fig:lifelong_algo}
\end{figure}

\subsection{Regret Bounds}
\looseness-1 Let $\Roracle(n)$ be the worst-case regret of \bba with oracle knowledge of the true kernel $\tk$ on single tasks when the reward resides in $\tH$. When employed sequentially on $m$ bandit tasks, the worst-case lifelong regret $R(m,n)$ will be of the order $m \Roracle(n)$ with high probability. We refer to this as oracle regret, since \bba has access to the true kernel $k^*$, which does not hold in practice. Since our meta-learned kernels $\hat{k}_s$ are approximations of $k^*$, the oracle regret is a natural lower bound on the regret of \algon.

\looseness -1 In the following, we show that if $\Roracle(n)$, the single-task oracle regret of the base bandit algorithm, is sublinear (e.g., as for \gpucb or \textsc{GP-TS}), then so is the lifelong regret $R(m,n)$ of \algon. 
Importantly, $R(m,n)$ is not only sublinear in $n$, but also converges with high probability to $\Roracle(m,n)$.
\cref{thm:lifelong} presents this guarantee, assuming that the forced exploration datasets $\Dexp_s$ satisfy \cref{ass:compatibility}, which \algoff requires to yield a provably consistent estimator of $\tk$. Later in \cref{prop:forced_exp}, we show that exploration by i.i.d.\ sampling from a uniform distribution over $\calX$ will guarantee this assumption.
\begin{theorem}
\label{thm:lifelong}
     For all tasks $s=1, \dots, m$, assume that the reward function $f_s \in \calH_{\tk}$ has bounded RKHS norm $\norm{f_s}_{\tk} \leq B$. Set the number of forced exploration actions as $n_s = \frac{\sqrt{n}}{s^{1/4}}$, and assume that \cref{ass:betamin} and \ref{ass:compatibility} hold for the data $\Dexp_{1:s}$ for all $s = 1, \dots, m$. Suppose, with probability greater than $1-\delta/2$, \bba has worst-case oracle regret $\Roracle(m,n)$.
    Then, the lifelong regret of \algon satisfies
    \begin{equation*}
    \resizebox{\columnwidth}{!}{
    $
        R(m,n) - \Roracle(m,n) = \calO \Big(  \underbrace{Bm^{3/4}\sqrt{n}}_{\text{forced exp.}} + \underbrace{ B (nm)^{1/3} \log^{3/4}(mp/\delta) }_{\text{kernel mismatch} }\Big) 
    $
    }        
    \end{equation*}
    with probability greater than $1-\delta$.
\end{theorem}
The explicit inequality without the $\calO$-notation can be found in Appendix \ref{app:lifelong}, together with the proof.
In the following, we give a sketch of the proof, aiming to explain the source of each term in the bound.
For every forced exploration step, in the worst-case, we suffer regret of $2B$. When accumulated over a total of $\sum_{s=1}^m n_s$ such steps, this gives the first term in the bound.
If $\khat_s \neq \tk$, it is possible to suffer from linear regret in the worst case. 
To account for this, we calculate the smallest integer $m_0$, for which, with high probability, $\khat_{s} = \tk$ for all $m_0 < s \leq m$. Based on \cref{thm:offline_main_consitency}, we show that $m_0 = \calO((m/n^2)^{1/3}\log^{3/4}(mp/\delta))$.
For every task $s \leq m_0$, we suffer a linear regret of at most $2Bn$ in the worst case, i.e., at most $2Bnm_0$ in total. This is upper bounded by the second term in \cref{thm:lifelong}, which can be regarded as the cost of learning $\tJ$. Notably, it grows only logarithmically with $p$, the number of considered features/kernels, offering a significant improvement over the polynomial rates given by prior works \citep[e.g.,][]{yang2021impact, hong2022hierarchical}. \cref{tab:litreview1} and \cref{tab:litreview2} present a comprehensive list of the related regret bounds. \looseness-1

We highlight that the excess regret of \algon in \cref{thm:lifelong} is sublinear in both $m$ and $n$. This implies that the algorithm is {\em oracle optimal}, meaning that as $m\rightarrow \infty$, the single-task regret {\em without} knowledge of $\tk$, eventually approaches the oracle single-task regret. Recall that $\Roracle(m,n) = m \Roracle(n)$ and therefore, $R(m,n)/m \rightarrow \Roracle(n)$.
This guarantee is stronger than that of \citet{basu2021no, pmlr-v151-peleg22a}, where the excess regret depends linearly on $m$ due to excessive forced exploration. By decreasing $n_s \propto s^{-1/4}$, the number of exploratory steps vanishes throughout the sequence of tasks.

As an example, we analyze the performance if \gpucb\footnote{\cref{app:gp_ucb}  provides a background on \gpucb.} \citep{srinivas2009gaussian} is used as the \bba algorithm. In this case, we demonstrate that the worst-case lifelong regret of \algon is of the same rate as the corresponding oracle regret.
To highlight the benefit of this oracle optimality we compare to a naive baseline which uses $\khat_s = \kfull=  \smash{\sum_{j=1}^p \frac{1}{p} k_j} $ for all tasks instead of meta-learning $\khat_s$ sequentially. 
In particular, we consider solving a sequence of $m$ tasks in three scenarios: 1) running \algon paired with \gpucb 2) repeatedly running \gpucb with oracle access to $k^*$, and 3) repeatedly running \gpucb with $\kfull$.
The following corollary shows that the worst-case upper bounds for the first two scenarios match in $\calO$-notation. \cref{app:lifelong_ucb} presents the proof.

\begin{corollary}[Lifelong \gpucb]
\label{cor:ll_ucb_decreasing_main_text}
    Consider the setting of \cref{thm:lifelong} with \gpucb as \bba agent. 
    Then, with probability at least $1-\delta$, the lifelong regret of \algon paired with \gpucb satisfies
    \begin{align*}
        R(m,n) & = \calO\big(\Roracle(m,n)\big)\\
        & = \calO \left( Bm d^\star \sqrt{n}\log\tfrac{n}{d^\star} + m\sqrt{nd^\star \log \tfrac{n}{d^\star}\log\tfrac{ 1}{\delta}}\right)
    \end{align*}
    where $\smash{d^\star \coloneqq \sum_{j \in \tJ} d_j}$. 
\end{corollary}
In the third scenario, we conservatively set $\khat_s = \kfull$ for all $s=1, \dots, m$. While this is sufficient for attaining a lifelong regret that is sublinear in $n$, the performance will \emph{not} be oracle optimal. In particular, this algorithm suffers from a regret of \looseness-1
    \begin{equation*}
    \resizebox{\columnwidth}{!}{
    $
        R(m,n) = \calO\left( B \tfrac{m dp}{\vert \tJ\vert} \sqrt{n}\log\tfrac{n}{d} + m\sqrt{nd \log \tfrac{n}{d}\log \tfrac{1}{\delta}}  \right)
        $
        }
    \end{equation*}
where $d = \sum_{j=1}^p d_j \gg d^\star$ and $\smash{p/\vert \tJ\vert}$ can be very large. Our experiments confirm that the performance of the naive approach is significantly worse than the other variants. This is due to the fact that confidence bounds constructed using $\kfull$ tend to contract slower than the ones constructed with the sparse meta-learned $\khat_s$.


\subsection{Forced Exploration} \label{sec:forced_exp}

Our forced exploration scheme ensures that the collected data is sufficiently informative to guarantee successful meta-learning. From a technical perspective, it ensures that \cref{ass:compatibility} is met and allows for a consistent estimator of $\tk$.
The cost of this exploration in the regret of each task is smaller in rate than the minimax regret bound \citep{lattimore2020bandit}. Therefore it has only a negligible effect on the overall performance guarantees of \algon (see \cref{cor:ll_ucb_decreasing_main_text}).
We show that by uniformly drawing actions from the domain, the collected data satisfies this assumption:
%
\begin{proposition}
\label{prop:forced_exp}
    Assume that $\phi_j \in L^2(\calX)$, $j \in \{1,\dots,p\}$ are orthonormal and let $d_j=1$.
    Draw $\bx_{1}, \dots, \bx_{n_1}$ independently and uniformly from $\calX$, and repeatedly use them to construct $\Dexp_{1:s}$.
    Then with probability at least $1-\delta$, 
     $\Dexp_{1:s}$ satisfies \cref{ass:compatibility},
    for $s = 1,\dots, m$.
\end{proposition}
\looseness -1 The proof can be found in Appendix \ref{app:kappa_bounds}.
 The $d_j=1$ condition is met without loss of generality, by splitting the higher dimensional feature maps and introducing more base features, which will increase $p$. Moreover, the orthonormality condition is met by orthogonalizing and re-scaling the feature maps. Basis functions such as Legendre polynomials and Fourier features \citep{rahimi2007random} satisfy these conditions.

\looseness -1 Generally, it is natural to require \bba to explore more in the lifelong setting compared to when it is used in isolation and with a known kernel. We observe in our experiments that \algon has a better empirical performance with forced exploration (i.e., $n_s>0$) than without. 
This additional exploration is also required in the Representation Learning \citep{yang2021impact, yang2022nearly, cella2021multi, cella2022meta} and hierarchical Bayesian bandit literature \citep{basu2021no, pmlr-v151-peleg22a, hong2022hierarchical}, where it is assumed that either the context distribution or the chosen actions are diverse enough. In the case of contextual bandits, if there is sufficient randomness, the \bba can be greedy and yet sample diverse enough actions \citep{bastani2021mostly}. \cref{tab:litreview3} gives a detailed overview of how the related works rely on uniform exploration.
%=====================================================================================
%=====================================================================================
%=====================================================================================
%=====================================================================================
%=====================================================================================
\section{Federated \algon}
\label{sec:federated}

We consider a federated extension of the lifelong learning problem. Here, each BO task is performed by a peer in a network and the corresponding data is not exchanged due to privacy concerns, limited bandwidth, etc. 
Operations are mainly done at the client level, and the central server only performs light computations.
This setting formalizes problems such as optimizing the user experience of a software product on each user's device, e.g., for making better recommendations. 
Limiting the client-server communication 
reduces the transmit overhead time and motivates faster federated computation.
Moreover, sending detailed data on user preferences and interaction patterns to the central server may jeopardize the user's privacy. 
However, we want to harness the statistical patterns across the user pool to improve the automated tailoring of the software product to new users. 
We interpret such a federated learning problem \citep{kairouz2021federated} as a client-server adaptation of our lifelong setting as described in Section \ref{sec:problem}. 
The meta-agent represents the server and BO tasks arise sequentially at a client node $s=1, \dots, m$ with a client-specific reward function $f_s$.  

We propose the {\em Federated Lifelong Bandit Optimizer} (\falgon) to solve this problem without directly sharing $\calD_s$ the data corresponding to each client with the server. 
\falgon pairs the clients and the server as follows.  
First, the client node $s$ receives $\khat_{s-1}$, the most recent estimate of the true kernel, and the required number of forced exploration queries $n_s$ from the server.
After taking some exploratory steps, the client performs actions according to its \bba policy.
In contrast to \algon, once the task is over after $n$ steps, the client keeps $\Dexp_s$ to itself, instead of passing it back to the server. The client node optimizes for a local loss
\begin{align*}
\vbetahat_s^{(\mathrm{client})} & \coloneqq \argmin_{\vbeta_s \in \sR^{d}} \Ls\left(\vbeta_s; \Dexp_{s} \right) \\
& = \argmin_{\vbeta_s \in \sR^{d}} \frac{1}{n_s} \norm{ \by_s - \mPhi_s \vbeta_s}_2^2  + \lambda \sum_{j=1}^{p} \|\vbeta_s\gj\|_2,
\end{align*}
and calculates a local estimate of $\tJ$ by thresholding $\vbetahat_s^{(\mathrm{client})} $ with the hyperparameter $\omega >0$
\[
\hat J_s^{(\mathrm{client})} \coloneqq \left\{ j \in \{1,\dots,p\}\ \text{s.t.}\ \norm{\vbetahat_s^{(\mathrm{client})}\gj}_2 > \omega \right\}.
\]
It then sends {\em only} the indices $\Jhat_s^{(\mathrm{client})}$ back to the server. 
This leaves the server with the simple task of taking an $\alpha$-majority vote among the $s$ first clients, to decide which base kernels to include in $\hat{k}_s$. Formally, the server chooses 
\[
\khat_{s}(\bx, \bx^\prime)\coloneqq \frac{1}{\vert \hat J_s\vert } \sum_{j \in \hat J_{s}} k_j(\bx, \bx^\prime)
\]
where for $\alpha \in [0,1]$,
\[
\hat J_{s} \coloneqq \left\{  j \in \{1,\dots,p\}\ \text{s.t.}\  \sum_{r =1}^s \mathbbm{1}(j \in \hat J_r^{(\mathrm{client})}) \geq s\alpha\right\}.
\]
In other words, after client $s$ finishes its job, the server includes the $j$-th kernel in its updated estimate $\hat{J}_s$ if and only if at least $s\alpha$ of the clients so far believe that it should be included.
Figure~\ref{fig:lifelong_algo_fed} in the appendix visualizes this process and \cref{alg:lifelong_fed} presents the pseudo-code to \falgon.
Similar to \algon, we show that if $\Roracle(n)$, the worst-case oracle regret of the base bandit algorithm, is sublinear in $n$, then so is the lifelong regret $R(m,n)$ of \falgon:
\begin{theorem}
\label{thm:lifelong_federated}
For all tasks $s=1, \dots, m$, assume that the reward function $f_s \in \calH_{\tk}$ has bounded RKHS norm $\norm{f_s}_{\tk} \leq B$. Set the number of forced exploration actions as $n_s = \sqrt{n}$, and assume that \cref{ass:betamin} and \ref{ass:compatibility} hold for the data $\Dexp_s$. 
    Suppose, with probability $>$ $1-\delta/2$, that \bba has worst-case oracle regret $\Roracle(m,n)$. Then the lifelong regret of \falgon satisfies
    \begin{equation*}
    \resizebox{\columnwidth}{!}{
    $
        R(m,n) - \Roracle(m,n) = \calO \Big(Bm\sqrt{n} + B\sqrt{n} \log(mp/\delta)  \Big) 
    $
    }        
    \end{equation*}
    with probability greater than $1-\delta$.
\end{theorem}
\looseness -1 See \cref{app:federated_lifelong} for the proof.  
\cref{thm:lifelong_federated} demonstrates that even without direct access to the data, the lifelong regret of $\falgon$ will be sublinear in $n$. 
This theorem does not imply oracle optimality, since  $R(m,n)/m-\Roracle(n) \not\to 0$ for $m \rightarrow \infty$. 
This is due to the linear dependency of the first term on $m$, which arises from forced exploration. In the federated setting, we require all clients to take a fixed number of exploratory actions \smash{$n_s =\sqrt{n}$}, so that they have equal resources for estimating $\tJ$ and the server's majority vote is fair.
We conjecture that with simple modifications, \falgon can become provably differentially private. Replacing the majority voting step with GNMax Aggregator \citep{papernot2018scalable} or PRIME \citep{liu2021robust} yields a differentially private voting mechanism to select $\hat{J}_s$, while preserving the lifelong regret guarantee.

\begin{figure*}
    \centering
    \begin{subfigure}{0.33\textwidth}
    \centering
         \includegraphics[width=\textwidth]{Figures/plot_ol_cosine_1.pdf}
    \end{subfigure}
    \begin{subfigure}{0.32\textwidth}
        \centering
        \includegraphics[width=\textwidth]{Figures/plot_ll_cosine_1_0.pdf}
    \end{subfigure}
    \begin{subfigure}{0.32\textwidth}
        \centering
        \includegraphics[width=\textwidth]{Figures/plot_glmnet_legendre_1.pdf}
    \end{subfigure}
    \begin{minipage}[t]{0.37\textwidth}
               \caption{Single-task cumulative regret of \gpucb with meta-learned kernel $\khat_m$ on an increasing number of meta-training tasks $m$. 
    \label{fig:offline}}
       \end{minipage}
       \hfill
    \begin{minipage}[t]{0.6\textwidth}
               \caption{Lifelong cumulative regret of \gpucb for synthetic tasks (left) and \textsc{GLMNET} hyperparameter tuning (right). Vertical lines indicate the beginning of a new task. \label{fig:online}}    
    \end{minipage}
\end{figure*}

\looseness -1 Consider an example where we instantiate \falgon with \gpucb as \bba. 
The worst-case regret bound of \falgon, which neither has knowledge of $\tk$ nor direct access to $\Dexp_s$, matches the worst-case regret of the oracle \gpucb in $\calO$-notation.
\cref{cor:ucb_federated} formalizes this claim.
Here, $\Roracle(m,n)$ is the same as in \cref{cor:ll_ucb_decreasing_main_text}.
\begin{corollary}[Federated Lifelong \gpucb]
\label{cor:ucb_federated}
    Consider the setting of \cref{thm:lifelong_federated} with \gpucb as \bba. 
    Then, with probability at least $1-\delta$, \falgon paired with \gpucb satisfies 
    \begin{align*}
        R(m,n) & = \calO\big(\Roracle(m,n)\big).
    \end{align*}
\end{corollary}
%=====================================================================================
%=====================================================================================
%=====================================================================================
%=====================================================================================
%=====================================================================================
\section{Experiments} 
\label{sec:experiments}

In all experiments, we use \gpucb as the \bba.
We repeat all experiments with 20 random seeds and report the corresponding mean outcome with standard error.
To evaluate the proposed algorithms, we use a synthetic as well as a hyper-parameter tuning environment.

\textbf{Synthetic environment.} The synthetic environment is based on our data model from Section \ref{sec:problem}.
We choose $\mathcal{X} = [0, 1]$ as the domain and use the first $p=50$ cosine basis functions $\phi_j(x) = \cos(j \pi x)$ as feature maps which form the kernels $k_j(x, x') = \phi_j(x)^\top\phi_j(x^\prime)$ for $j \in \{1, ..., 50\}$. 
The active kernel indices $\tJ$ are sampled uniformly from the set of 5-element subsets of $\{1, \dots, 50\}$, i.e., $ |J^\star| = 5$. We sample the reward functions $f_s$ independently and uniformly from $\tH$ such that $\norm{f_s}_{\tk} \leq 10$ and the beta-min condition with $c_1 \geq 0.5$ holds. To the function evaluations we add i.i.d.~Gaussian noise with a standard deviation of $\sigma=0.1$.

\textbf{AutoML data.}
\looseness -1 A common application of Bayesian Optimization is AutoML, i.e., optimizing the hyper-parameters of machine learning algorithms. In this setting, $\calX$ is the learning algorithm's hyper-parameter space, and $f_s$ represents the test performance of the machine learning system. 
In our experiments, we consider a realistic lifelong AutoML setting where we face a sequence of hyper-parameter optimization problems. Here, each task corresponds to tuning the hyper-parameters of the \textsc{GLMNET} learning algorithm \citep{friedman2010regularization} for a different dataset.
%
Following previous work \citep[e.g.][]{perrone2018scalable, rothfuss2021meta}, we replace the evaluation step by a table lookup based on a large number of hyper-parameter evaluations \citep{kuhn2018automatic} on 38 classification datasets from the OpenML platform \citep{Bischl2017OpenMLBS}.

\subsection{Experiment with Offline Data}
\label{sec:offline_exps}
\looseness -1 We investigate how meta-learning kernels with \algoff and its federated variant (\falgoff) affects the performance on test tasks. In particular, we use meta-training data that was generated offline, based on the synthetic environment. 
We create data for $m=30$ synthetic tasks, each of size $n=10$, to be used as offline meta-data. The tasks are generated according to our synthetic environment and the details can be found in \cref{app:exp_offline_detail}. Note that $n \ll p$, i.e., we are in the overparameterized setting.
We meta-learn a kernel with \algoff and \falgoff using the meta-training data $\calD_{1:s}$ for $s=1, ..., m$,
and evaluate the estimated kernel $\khat$ by running \gpucb, equipped with $\khat$, for $n=70$ iterations. Figure \ref{fig:offline} illustrates the corresponding single-task regrets in response to increasing the number of meta-training tasks in the offline data.
We report the performance of the agent that uses $\kfull$ (red) as a naive baseline, and the performance of an oracle agent that uses the true kernel (blue) as a natural lower bound for the achievable regret. 
Figure \ref{fig:offline} shows that the regret of both meta-learned agents quickly converges to the regret of the oracle agent as the number of meta-training tasks increases. 
\algoff performs slightly better than \falgoff, since it has direct access to all the data while the federated algorithm loses information during the voting mechanism. 
In \cref{app:experiment_results}, we evaluate \algoff and \falgoff with other choices of base kernels and higher dimensional action domains. Similar to Figure \ref{fig:offline}, we observe fast convergence to the oracle regret. Further details about the experiments are provided in \cref{app:exp_offline_detail}.

\subsection{Lifelong Experiments}
\label{exp:lifelong}
\looseness -1 We return to the lifelong setting where the tasks arrive sequentially and evaluate our algorithms.
We consider both the synthetic and AutoML environments. The horizon of each task is set to $n=100$ time steps. To solve the synthetic problem, we consider the $p=50$ first 1-dimensional cosine bases as the candidate feature maps. Since \textsc{GLMNET} has two hyper-parameters to tune, i.e., $\calX \subset \R^2$, here we use the $p=100$ first 2-dimensional cosine bases, i.e. $\phi_{i,j}(\bx) = \cos(i \pi x_1) \cos(j \pi x_2)$ for $i,j=1, ..., 10$. 

Figure \ref{fig:online} illustrates the cumulative lifelong regrets achieved by \algon, \falgon, the baseline \gpucb with $\kfull$, and oracle \gpucb with access to $\tk$. Note that in the AutoML environment, we do not know the true kernel $\tk$ and thus, cannot report the oracle performance. 
As we would expect, \algon and \falgon initially suffer the same regret as the naive actor with $\kfull$ since no meta-learning data is available yet. 
However, as more tasks are attempted, the estimated kernel is improved and in turn, the base algorithm becomes more sample efficient on future tasks.
In the case of \algon (green), over time, the forced exploration decreases and the estimated kernel converges to the true kernel. As a result, the behavior of the actor paired with \algon becomes indistinguishable from the actor using the oracle kernel, reflected by the same slope of the regret curves.
Compared to the naive actor (red), our lifelong BO methods significantly improve the efficiency of the base agent as they accumulate more experience. In the AutoML setting, this means that we can find good hyper-parameters with fewer costly function evaluations.
This showcases how incorporating knowledge transfer into deployed machine learning systems
can yield significant performance gains and cost savings.
%=====================================================================================
%=====================================================================================
%=====================================================================================
%=====================================================================================
%=====================================================================================
\begin{table}[t]
\centering
\setlength\tabcolsep{2.5pt}
\begin{tabular}{l|c  |c | c | c | c} 
&   \makecell{oracle\\ optimal} &  \makecell{policy\\agnostic} & \makecell{learns \\ sparsity}  &  \makecell{meta \\cost} & \makecell{tasks}  \\[0.6ex]
 \hline
  \citeauthor{hu2021near} & \xmark & \xmark & \xmark & $\mathrm{poly}\, d$ &   conc\\[0.6ex] 
 \hline
 \citeauthor{yang2021impact} &  \cmark & \xmark &\xmark & $\mathrm{poly}\, d$ & conc\\[0.6ex] 
 \hline
 \hline
\citeauthor{pmlr-v151-peleg22a} & \xmark & \xmark & \xmark & $\mathrm{poly}\, d$ &  seq \\[0.6ex] 
\hline
 \multirow{ 2}{*}{\citeauthor{hong2022hierarchical}} & \cmark & \xmark &  \xmark & $\mathrm{poly}\, d$  & seq  \\[0.6ex] 
 \cline{2-6}
& \xmark &\xmark &  \xmark & $\mathrm{poly}\, d$  & conc \\[0.6ex] 
\hline
\hline
\algon  &   \cmark & \cmark& \cmark & $\log\, d$  & seq 
\end{tabular}
\caption{Related work (\cref{tab:litreview3} gives a comprehensive list.)\looseness-1\label{tab:litreview_main}}
\end{table} 

\section{Related Work}
\label{sec:related_work}

 \looseness -1 The lifelong bandit optimization problem addresses key shortcomings of classic kernelized bandits and Bayesian optimization. 
Early approaches assume that the agent knows the true kernel
\citep{srinivas2009gaussian,valko2013finite,chowdhury2017kernelized}, which is often not the case in practice. 
Recent work addresses this problem, either by studying the implications of misspecified kernels \citep{dylan2020miss,simchowitz2021bayesian, bogunovic2021misspecified,camilleri2021high} or proposing methods for adapting kernel parameters during the optimization \citep{wang2014theoretical, berkenkamp2019no}. 
Alternatively, the appropriate kernel can be learned from related data.
To this end, a number of algorithms are developed for meta-learning a kernelized Gaussian process (GP) prior \citep{harrison2018meta, perrone2018scalable, rothfuss2021pacoh, rothfuss2021meta, rothfuss2022meta}.
However, they come without theoretical guarantees. 


Theory of knowledge transfer between concurrent or sequential linear bandits has received recent attention from multiple perspectives.
Representation Learning literature \citep{yang2021impact,hu2021near,yang2022nearly,cella2022meta} assumes existence of a shared low-dimensional linear representation for the reward function, i.e. $f_s(\bx) = \langle \btheta_s, \mB^T\bx\rangle$ where $\mB \in \sR^{d\times d^\star}$ is shared by the tasks. 
This matrix is unknown, however $d^\star$ is known and $d^\star \ll d$.
Feature selection \citep{cella2021multi} takes a similar approach by assuming that $f_s(\bx) = \langle \btheta_s, \mS^T\bx \rangle$, where the unknown matrix $\mS$ screens the relevant features \smash{$\{\bx_j,\,j\in \tJ\}$}. The elements of this matrix are $0$ or $1$, but contrary to representation learning, $d^\star = \vert \tJ \vert$ is unknown. 
Alternatively, works on Bayesian Prior learning  assume existence of a shared Gaussian prior over the parameter vector, i.e. $f_s(\bx) = \langle \btheta_s, \bx\rangle$, where $\btheta_s \sim \calN(\mu, \Sigma)$.  This formulation does not aim for a low-dimensional solution.
Following this model, \citet{basu2021no} and \citet{hong2022hierarchical} assume that $\Sigma$ is known and learn the distribution of $\mu$. \citet{pmlr-v151-peleg22a} estimate both the mean and the covariance. We consider a more relaxed setup where the mean is not shared, and meta-learn a shared covariance function. \cref{app:litreview} goes into more depth to formally compare the mentioned work. \looseness-1
 
\looseness-1 We compare \algon with prior algorithms based on the following properties.
In the context of meta-learning for BO, a desirable method is 1) oracle optimal, i.e., attains the regret guarantee of the oracle solver as $m$ grows, 2) able to utilize any BO algorithm, 3) sample efficient, i.e., pays a small cost for meta-learning the prior/relevant features and 4) recovers low-dimensional solutions, since the effective dimension influences the sample efficiency of the base algorithm. \cref{tab:litreview_main} compares \algon with previous work applicable to infinite action domains. Works limited to finite action sets are considered in \cref{tab:litreview3}.
\algon is the only oracle optimal algorithm that learns the effective dimension $d^\star$, while paying a cost that scales only logarithmically with the Euclidean dimension $d$. This is an exponential improvement compared to the polynomial dependency of prior work; moreover, it also applies to reward functions that are linear combinations of non-linear features $f_s(\bx) = \langle \btheta_s, \bm{\phi}(\bx)\rangle$. Further, it can be wrapped around any linear or kernelized bandit algorithm, while earlier works require a specific bandit policy.

\falgon contributes to recent literature on federated learning which studies how agents can cooperate to solve a single bandit task \citep{dubey2020differentially, shi2021federated, huang2021federated, dai2022federated}. In federated lifelong learning, each agent interacts with a different environment, but collaborates with others to learn relevant features. 

 \looseness -1 Our work builds on ideas from Multiple-Kernel Learning \citep{cristianini2001kernel, bach2004multiple, ong2005learning, xu2010simple, gonen2011multiple} and Multi-Task Lasso \citep{obozinski2006multi, argyriou2006multi, lounici2011oracle} which address consistency of model selection for offline supervised learning. 
Our contribution is lifelong uncertainty quantification, using a meta-learned kernel. \looseness-1
%=====================================================================================
%=====================================================================================
%=====================================================================================
%=====================================================================================
%=====================================================================================
\section{Conclusion}
\looseness -1 We introduce \algon, an algorithm which allows for lifelong knowledge transfer across BO tasks through meta-learned kernels. We show theoretically and empirically that, if paired with \algon, the performance of a base bandit algorithm improves as more experience is gained on previous tasks.
%
In particular, we prove that \algon is oracle optimal in the limit.
With \falgon, the federated variant of our main algorithm, we establish that sublinear knowledge transfer is possible even without direct access to the bandit data. 

\looseness -1 This work opens up directions of future research such as quantifying the cost of privacy in Lifelong Learning, understanding the necessity of exploration in the lifelong setting, or using large neural networks to extract relevant features from prior tasks instead of working with pre-determined features. 
%=====================================================================================
\nocite{li2019nearly}
\nocite{li2021tight}
\nocite{thompson1933likelihood}
\nocite{vershynin2018high}
\nocite{russo2014learning}
\nocite{vakili2021information}
%=====================================================================================
\begin{acknowledgements}
 This research was supported by the European Research Council (ERC) under the European Union’s Horizon 2020 research and innovation program grant agreement no. 815943. Jonas Rothfuss was supported by the Apple Scholars in AI/ML fellowship. 
 \end{acknowledgements}
%=====================================================================================
%=====================================================================================
%=====================================================================================
%=====================================================================================
%=====================================================================================

\bibliography{schur_138}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%% SUPPLEMENT (OPTIONAL) %%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

% \appendix
% \numberwithin{equation}{section}


% \onecolumn
% \thispagestyle{empty}
% \title{Lifelong Bandit Optimization: No Prior and No Regret\\
% (Supplementary Materials)}
% %\input{Appendix/0_notation}
% \input{Appendix/0_algorithms}
% \input{Appendix/1_litreview.tex}
% \input{Appendix/0.5_without_loss_of_generality}
% \input{Appendix/2_metalearn_app}
% \input{Appendix/3_lifelong}
% % \input{Appendix/4_federated}
% \input{Appendix/4_federated_with_differential_privacy}
% \input{Appendix/5_experiments}
% % \input{Appendix/appendix} 

\end{document}