\documentclass[accepted]{uai2023}
\usepackage[american]{babel}
\usepackage{natbib}
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} % amsmath with fixes and additions
% \usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{tikz}
%=======================================================================
%=======================================================================
%=======================================================================
%=======================================================================
%=======================================================================
%=======================================================================
%%%%% NEW MATH DEFINITIONS %%%%%
\usepackage{amsmath, amsthm,amssymb, amsfonts, xcolor, tikz, float, url, algorithm,algpseudocode, bm, bbm, mathtools}
\usepackage{thmtools}
\usepackage{algorithm}
\usepackage{algpseudocode}
%\usepackage{autonum}
\usepackage{caption}
\usepackage{subcaption}
\usepackage{lipsum}
\usepackage{wrapfig}
\usepackage{xpatch}
%\usepackage{nomencl}
\usepackage{xspace}
\usepackage{bookmark}
\usepackage{hyperref}
\usepackage[capitalise]{cleveref}


\usepackage{xr}
\makeatletter

% \addFileDependency{<file>}: register <file> (name including extension) as a
% dependency of this document so that latexmk notices it.
% Every line inside the body ends with `%' so that the macro inserts no
% spurious spaces if it is ever called outside the preamble.
\newcommand*{\addFileDependency}[1]{% argument=file name and extension
  % latexmk will find this if $recorder=0; however, in that case it will
  % ignore #1 if it is a .aux or .pdf file etc. and it exists. If it doesn't
  % exist, it will appear in the list of dependents regardless.
  \typeout{(#1)}%
  % Make the file appear in \listfiles as well --- not really necessary,
  % and latexmk doesn't use this.
  \@addtofilelist{#1}%
  % latexmk will find this message if #1 doesn't exist (yet).
  \IfFileExists{#1}{}{\typeout{No file #1.}}%
}
\makeatother

% \myexternaldocument{<basename>}: pass <basename> to \externaldocument (xr)
% and register both <basename>.tex and <basename>.aux as file dependencies
% via \addFileDependency.
\newcommand*{\myexternaldocument}[1]{%
\externaldocument{#1}%
\addFileDependency{#1.tex}%
\addFileDependency{#1.aux}%
}
\myexternaldocument{schur_138}


\definecolor{hanblue}{rgb}{0.27, 0.42, 0.81}
\hypersetup{
hidelinks,
    colorlinks=true,
    linkcolor=hanblue,
    urlcolor=hanblue,
    citecolor=hanblue,
    anchorcolor=black}
\usetikzlibrary{decorations, calligraphy, positioning}
\usepackage{multirow}

\newcommand\norm[1]{\|#1 \|}


\newtheorem{theorem}{Theorem}[section]
\newtheorem{remark}[theorem]{Remark}
\newtheorem{assumption}[theorem]{Assumption}
\newtheorem{proposition}[theorem]{Proposition}
\newtheorem{definition}[theorem]{Definition}
\newtheorem{corollary}[theorem]{Corollary}
\newtheorem{lemma}[theorem]{Lemma}

% mythm: a ``Theorem'' environment whose printed number is given by hand,
% e.g. \begin{mythm}{3.1} ... \end{mythm}. It redefines the printed form of
% the innercustomthm counter to the argument and then opens innercustomthm.
\newtheorem{innercustomthm}{Theorem}
\newenvironment{mythm}[1]
  {\renewcommand\theinnercustomthm{#1}\innercustomthm}
  {\endinnercustomthm}
  
\DeclareMathOperator{\diag}{diag}
\DeclareMathOperator{\trace}{tr}
\DeclareMathOperator{\Lap}{Lap}
\DeclareMathOperator{\asvec}{vec}
\DeclareMathOperator{\asmatrix}{mat}

% colors
\definecolor{parnian}{rgb}{0.36, 0.54, 0.66}
\definecolor{oracle}{HTML}{4f7992}
\definecolor{naive}{HTML}{d33f49}
\definecolor{meta}{HTML}{419d78}
\definecolor{federated}{HTML}{f9a620}
\definecolor{grey}{rgb}{0.7, 0.75, 0.71}
\definecolor{optimalrate}{HTML}{69C498}%{rgb}{0.67, 0.88, 0.69}
\definecolor{notopt}{HTML}{d33f49}%{ff0a47}%{CD515F}

% Highlight a newly defined term.
% \textbf is used rather than the old font switch {\bf ...}: this preamble
% later redefines \bf (\def\bf{{\bm{f}}}), so {\bf #1} would no longer
% produce bold text.
\newcommand{\newterm}[1]{\textbf{#1}}


% Figure reference, lower-case.
\def\figref#1{figure~\ref{#1}}
% Figure reference, capital. For start of sentence
\def\Figref#1{Figure~\ref{#1}}
\def\twofigref#1#2{figures \ref{#1} and \ref{#2}}
\def\quadfigref#1#2#3#4{figures \ref{#1}, \ref{#2}, \ref{#3} and \ref{#4}}
% Section reference, lower-case.
\def\secref#1{section~\ref{#1}}
% Section reference, capital.
\def\Secref#1{Section~\ref{#1}}
% Reference to two sections.
\def\twosecrefs#1#2{sections \ref{#1} and \ref{#2}}
% Reference to three sections.
\def\secrefs#1#2#3{sections \ref{#1}, \ref{#2} and \ref{#3}}
% Reference to an equation, lower-case.
% NOTE(review): amsmath/mathtools are loaded earlier in this preamble, so this
% \def silently replaces amsmath's \eqref; \eqref{...} then prints
% "equation N" without parentheses --- confirm this is intended.
\def\eqref#1{equation~\ref{#1}}
% Reference to an equation, upper case
\def\Eqref#1{Equation~\ref{#1}}
% A raw reference to an equation---avoid using if possible
\def\plaineqref#1{\ref{#1}}
% Reference to a chapter, lower-case.
\def\chapref#1{chapter~\ref{#1}}
% Reference to a chapter, upper case.
\def\Chapref#1{Chapter~\ref{#1}}
% Reference to a range of chapters (the tie keeps "chapters" and the first
% number together and supplies the space that was previously missing).
\def\rangechapref#1#2{chapters~\ref{#1}--\ref{#2}}
% Reference to an algorithm, lower-case.
\def\algref#1{algorithm~\ref{#1}}
% Reference to an algorithm, upper case.
\def\Algref#1{Algorithm~\ref{#1}}
\def\twoalgref#1#2{algorithms \ref{#1} and \ref{#2}}
\def\Twoalgref#1#2{Algorithms \ref{#1} and \ref{#2}}
% Reference to a part, lower case
\def\partref#1{part~\ref{#1}}
% Reference to a part, upper case
\def\Partref#1{Part~\ref{#1}}
\def\twopartref#1#2{parts \ref{#1} and \ref{#2}}

\def\ceil#1{\lceil #1 \rceil}
\def\floor#1{\lfloor #1 \rfloor}
\def\1{\bm{1}}
% Calligraphic D symbols: plain, and with upright "valid" / "test" subscripts.
% The subscript is attached outside \mathcal; the outer brace group keeps each
% symbol a single atom, so a following sub- or superscript still works.
\newcommand{\train}{\mathcal{D}}
\newcommand{\valid}{{\mathcal{D}_{\mathrm{valid}}}}
\newcommand{\test}{{\mathcal{D}_{\mathrm{test}}}}

\def\eps{{\epsilon}}

% Vectors
\def\bzero{{\bm{0}}}
\def\bone{{\bm{1}}}
\def\bmu{{\bm{\mu}}}
\def\btheta{{\bm{\theta}}}
\def\ba{{\bm{a}}}
\def\bb{{\bm{b}}}
\def\bc{{\bm{c}}}
\def\bd{{\bm{d}}}
\def\be{{\bm{e}}}
% NOTE(review): \def overwrites any existing \bf (the old bold font switch,
% if the class provides it); from here on {\bf ...} yields the bold math
% symbol f instead of bold text --- confirm nothing relies on the old meaning.
\def\bf{{\bm{f}}}
\def\bg{{\bm{g}}}
\def\bh{{\bm{h}}}
\def\bi{{\bm{i}}}
\def\bj{{\bm{j}}}
\def\bk{{\bm{k}}}
\def\bl{{\bm{l}}}
\def\bn{{\bm{n}}}
\def\bo{{\bm{o}}}
\def\bp{{\bm{p}}}
\def\bq{{\bm{q}}}
\def\br{{\bm{r}}}
\def\bs{{\bm{s}}}
\def\bt{{\bm{t}}}
\def\bu{{\bm{u}}}
\def\bv{{\bm{v}}}
\def\bw{{\bm{w}}}
\def\bx{{\bm{x}}}
\def\by{{\bm{y}}}
\def\bz{{\bm{z}}}




% Matrix
\def\mA{{\bm{A}}}
\def\mB{{\bm{B}}}
\def\mC{{\bm{C}}}
\def\mD{{\bm{D}}}
\def\mE{{\bm{E}}}
\def\mF{{\bm{F}}}
\def\mG{{\bm{G}}}
\def\mH{{\bm{H}}}
\def\mI{{\bm{I}}}
\def\mJ{{\bm{J}}}
\def\mK{{\bm{K}}}
\def\mL{{\bm{L}}}
\def\mM{{\bm{M}}}
\def\mN{{\bm{N}}}
\def\mO{{\bm{O}}}
\def\mP{{\bm{P}}}
\def\mQ{{\bm{Q}}}
\def\mR{{\bm{R}}}
\def\mS{{\bm{S}}}
\def\mT{{\bm{T}}}
\def\mU{{\bm{U}}}
\def\mV{{\bm{V}}}
\def\mW{{\bm{W}}}
\def\mX{{\bm{X}}}
\def\mY{{\bm{Y}}}
\def\mZ{{\bm{Z}}}
\def\mBeta{{\bm{\beta}}}
\def\mPhi{{\bm{\Phi}}}
\def\mLambda{{\bm{\Lambda}}}
\def\mSigma{{\bm{\Sigma}}}




% Graph
\def\calA{{\mathcal{A}}}
\def\calB{{\mathcal{B}}}
\def\calC{{\mathcal{C}}}
\def\calD{{\mathcal{D}}}
\def\calE{{\mathcal{E}}}
\def\calF{{\mathcal{F}}}
\def\calG{{\mathcal{G}}}
\def\calH{{\mathcal{H}}}
\def\calI{{\mathcal{I}}}
\def\calJ{{\mathcal{J}}}
\def\calK{{\mathcal{K}}}
\def\calL{{\mathcal{L}}}
\def\calM{{\mathcal{M}}}
\def\calN{{\mathcal{N}}}
\def\calO{{\mathcal{O}}}
\def\calP{{\mathcal{P}}}
\def\calQ{{\mathcal{Q}}}
\def\calR{{\mathcal{R}}}
\def\calS{{\mathcal{S}}}
\def\calT{{\mathcal{T}}}
\def\calU{{\mathcal{U}}}
\def\calV{{\mathcal{V}}}
\def\calW{{\mathcal{W}}}
\def\calX{{\mathcal{X}}}
\def\calY{{\mathcal{Y}}}
\def\calZ{{\mathcal{Z}}}

% Sets
\def\sA{{\mathbb{A}}}
\def\sB{{\mathbb{B}}}
\def\sC{{\mathbb{C}}}
\def\sD{{\mathbb{D}}}
% Don't use a set called E, because this would be the same as our symbol
% for expectation.
\def\sF{{\mathbb{F}}}
\def\sG{{\mathbb{G}}}
\def\sH{{\mathbb{H}}}
\def\sI{{\mathbb{I}}}
\def\sJ{{\mathbb{J}}}
\def\sK{{\mathbb{K}}}
\def\sL{{\mathbb{L}}}
\def\sM{{\mathbb{M}}}
\def\sN{{\mathbb{N}}}
\def\sO{{\mathbb{O}}}
\def\sP{{\mathbb{P}}}
\def\sQ{{\mathbb{Q}}}
\def\sR{{\mathbb{R}}}
\def\sS{{\mathbb{S}}}
\def\sT{{\mathbb{T}}}
\def\sU{{\mathbb{U}}}
\def\sV{{\mathbb{V}}}
\def\sW{{\mathbb{W}}}
\def\sX{{\mathbb{X}}}
\def\sY{{\mathbb{Y}}}
\def\sZ{{\mathbb{Z}}}





% The true underlying data generating distribution
\newcommand{\pdata}{p_{\rm{data}}}
% The empirical distribution defined by the training set
\newcommand{\ptrain}{\hat{p}_{\rm{data}}}
\newcommand{\Ptrain}{\hat{P}_{\rm{data}}}
% The model distribution
\newcommand{\pmodel}{p_{\rm{model}}}
\newcommand{\Pmodel}{P_{\rm{model}}}
\newcommand{\ptildemodel}{\tilde{p}_{\rm{model}}}
% Stochastic autoencoder distributions
\newcommand{\pencode}{p_{\rm{encoder}}}
\newcommand{\pdecode}{p_{\rm{decoder}}}
\newcommand{\precons}{p_{\rm{reconstruct}}}

\newcommand{\laplace}{\mathrm{Laplace}} % Laplace distribution

\newcommand{\ubar}[1]{\text{\b{$#1$}}}

\newcommand{\E}{\mathbb{E}}
\newcommand{\Ls}{\mathcal{L}}
\newcommand{\R}{\mathbb{R}}
\newcommand{\emp}{\tilde{p}}
\newcommand{\lr}{\alpha}
\newcommand{\reg}{\lambda}
\newcommand{\rect}{\mathrm{rectifier}}
\newcommand{\softmax}{\mathrm{softmax}}
\newcommand{\sigmoid}{\sigma}
\newcommand{\softplus}{\zeta}
\newcommand{\KL}{D_{\mathrm{KL}}}
\newcommand{\Var}{\mathrm{Var}}
\newcommand{\standarderror}{\mathrm{SE}}
\newcommand{\Cov}{\mathrm{Cov}}
% Wolfram Mathworld says $L^2$ is for function spaces and $\ell^2$ is for vectors
% But then they seem to use $L^2$ for vectors throughout the site, and so does
% wikipedia.
\newcommand{\normlzero}{L^0}
\newcommand{\normlone}{L^1}
\newcommand{\normltwo}{L^2}
\newcommand{\normlp}{L^p}
\newcommand{\normmax}{L^\infty}

\newcommand{\parents}{Pa} % See usage in notation.tex. Chosen to match Daphne's book.

\DeclareMathOperator*{\argmax}{arg\,max}
\DeclareMathOperator*{\argmin}{arg\,min}

\DeclareMathOperator{\sign}{sign}
\DeclareMathOperator{\Tr}{Tr}
\let\ab\allowbreak


% Algo names
\def\algoff{{\textsc{Meta-KGL}}\xspace}
\def\algon{{\textsc{LiBO}}\xspace}
\def\falgoff{{\textsc{F-Meta-KGL}}\xspace}
\def\falgon{{\textsc{F-LiBO}}\xspace}
\def\gpucb{{\textsc{GP-UCB}}\xspace}
\def\bba{{\textsc{BaseBO}}\xspace}

% Variable names
\def\tH{{\calH_{k^\star}}}
\def\Hhat{{\calH_{\hat{k}}}}
\def\Hfull{{\calH_{k_\mathrm{full}}}}
\def\tk{{k^\star}}
\def\tbeta{\beta^\star}
\def\tvbeta{\bm{\beta}^\star}
\def\khat{{\hat{k}}}
\def\betahat{{\hat{\beta}}}
\def\vbetahat{\hat{\bm{\beta}}}
\def\vvarepsilon{{\bm{\varepsilon}}}
\def\vbeta{{\bm{\beta}}}
\def\vphi{{\bm{\phi}}}
\def\gj{{^{(j)}}}
\def\tJ{{J^\star}}
\def\Jhat{{\hat{J}}}
\def\dmax{{d_{\mathrm{max}}}}
\def\kfull{{k^\mathrm{full}}}
\def\Roracle{{R^\star}} %_\mathrm{oracle}}}
\def\Dexp{\calD^{\mathrm{exp}}}
\def\fed{{^{(\mathrm{fed})}}}

% delta value for the differntial privacy analysis
\def\ddp{{\delta_{\mathrm{dp}}}}
% epsilon value for the differntial privacy analysis
\def\edp{{\epsilon_{\mathrm{dp}}}}
% sigma value for the differntial privacy analysis
\def\sdp{{\sigma_{\mathrm{dp}}}}
% known upper bound on s^*
\def\cdp{{c_{s^\star}}}

\makeatletter
% \munderbar{<math>}: underline <math>. The argument is set in box 2 and the
% box depth is forced to zero before it is handed to \underline, so
% descenders in <math> do not push the rule down.
\def\munderbar#1{\underline{\sbox\tw@{$#1$}\dp\tw@\z@\box\tw@}}
\makeatother

\usepackage{multicol}
\newcommand*{\myalign}[2]{\multicolumn{1}{#1}{#2}}

\usepackage{colortbl}

\usepackage{pifont}
\newcommand{\cmark}{\ding{51}}%
\newcommand{\xmark}{\ding{55}}%
\usepackage{makecell}

\makeatletter
% \printfnsymbol{<n>}: print the <n>-th footnote symbol as a superscript
% without creating a footnote (used in the author list to repeat the mark
% of an earlier \thanks).
\newcommand{\printfnsymbol}[1]{%
  \textsuperscript{\@fnsymbol{#1}}%
}
\makeatother

%=======================================================================
%=======================================================================
%=======================================================================
%=======================================================================
%=======================================================================
%=======================================================================
\title{Lifelong Bandit Optimization: No Prior and No Regret}

\author[1]{Felix~Schur\thanks{ Equal contribution.}}
\author[1]{Parnian~Kassraie\printfnsymbol{1}}
\author[1]{Jonas~Rothfuss}
\author[1]{Andreas~Krause}
\affil[1]{%
    ETH Zurich\\
    Switzerland
}
  
  \begin{document}
  \onecolumn
\maketitle

\appendix
\numberwithin{equation}{section}
%=======================================================================
%=======================================================================
%=======================================================================
%=======================================================================
%=======================================================================
\section{Pseudo-codes to Algorithms}
\begin{algorithm}[ht]
\caption{\algoff}
\label{alg:meta_learning}
\begin{algorithmic}
\Require Data from previous tasks $\Dexp_{1:s}$, threshold parameter $\omega > 0$
\State $\hat{\vbeta} \gets \argmin_{\vbeta \in \sR^{sd}} \Ls\left(\vbeta ; \Dexp_{1:s}\right)$ \Comment{solves \cref{eq:meta_loss}}
\State $\hat{J} \gets \{ j \leq p \ \vert \ \norm{\hat{\vbeta}^{(j)}}_2 \geq \omega \sqrt{s}\}$
\State $\hat{k}_s \gets \frac{1}{|\hat{J}|} \sum_{j \in \hat{J}} k_j$
\end{algorithmic}
\end{algorithm}

\begin{algorithm}
\caption{\algon}
\label{alg:lifelong}
\begin{algorithmic}
\Require $n,m \in \mathbb{N}$, $0 < \omega < c_1$, \bba %, distribution $\nu_s$ on $\calX$
\State $\hat{k}_0 \gets \sum_{j=1}^p \frac{1}{p} k_j$

\For{$s \in \{1, \dots,m\}$}
    %\State $l \gets 0$
    \State $\Dexp_s \gets \emptyset$ \Comment{Dataset for kernel prediction}
    \State $\calD_s \gets \emptyset$ \Comment{Dataset for the base bandit algorithm}
    \For{$i \in \{1, \dots, n\}$}
        \If{$i \leq \frac{\sqrt{n}}{s^{1/4}}$} \Comment{Forced exploration with rate $\sqrt{n}/s^{1/4}$}
            \State Sample $\bx_{s,i}$ uniformly from $\calX$
            \State Play action $\bx_{s,i}$ and observe $y_{s,i}$
            \State $\Dexp_s \gets \Dexp_s \cup \{(\bx_{s,i}, y_{s,i})\}$ \Comment{Add to kernel prediction dataset}
        \Else 
            \State $\bx_{s,i} \gets \text{\bba}(\hat{k}_{s-1})$ \Comment{Select action using base bandit algorithm}
            \State Play action $\bx_{s,i}$ and observe $y_{s,i}$
        \EndIf
        \State $\calD_s \gets \calD_s \cup \{(\bx_{s,i}, y_{s,i})\}$
        \State Update \bba using $\calD_s$ \Comment{Update base bandit algorithm}
    \EndFor
    \State $\khat_{s} \gets \text{\algoff}(\Dexp_{1:s}, \omega)$\Comment{Update $\hat{k}$ using \algoff and $\Dexp_s$}
\EndFor

\end{algorithmic}
\end{algorithm}


\begin{algorithm}[ht]
\caption{\falgon}
\label{alg:lifelong_fed}
\begin{algorithmic}
\Require $n,m \in \mathbb{N}$, $0 < \omega < c_1$, \bba
\State $\hat{k}_0 \gets \sum_{j=1}^p \frac{1}{p} k_j$

\For{$s \in \{1, \dots,m\}$}
    %\State $l \gets 0$
    \State $\Dexp_s \gets \emptyset$ \Comment{Dataset for kernel prediction}
    \State $\calD_s \gets \emptyset$ \Comment{Dataset for the base bandit algorithm}
    \For{$i \in \{1, \dots, n\}$}
        \If{$i \leq \sqrt{n}$} \Comment{Forced exploration with rate $\sqrt{n}$}
            \State Sample $\bx_{s,i}$ uniformly from $\calX$
            \State Play action $\bx_{s,i}$ and observe $y_{s,i}$
            \State $\Dexp_s \gets \Dexp_s \cup \{(\bx_{s,i}, y_{s,i})\}$ \Comment{Add to kernel prediction dataset}
        \Else 
            \State $\bx_{s,i} \gets \text{\bba}(\hat{k}_{s-1})$ \Comment{Select action using base bandit algorithm}
            \State Play action $\bx_{s,i}$ and observe $y_{s,i}$
        \EndIf
        \State $\calD_s \gets \calD_s \cup \{(\bx_{s,i}, y_{s,i})\}$
        \State Update \bba using $\calD_s$ \Comment{Update base bandit algorithm}
    \EndFor
    \State $\khat_{s} \gets \text{\falgoff}(\Dexp_{1:s}, \omega)$\Comment{Update $\hat{k}$ using \falgoff and $\Dexp_s$}
\EndFor

\end{algorithmic}
\end{algorithm}

\begin{algorithm}[H]
\caption{\falgoff}
\label{alg:offline_federated}
\begin{algorithmic}
\Require $n,m \in \mathbb{N}$, data for each task $\Dexp_s$, $\omega > 0$, $\alpha \in [0,1]$
\State $\mathrm{count}_1, \dots, \mathrm{count}_p \gets 0$
\For{$s \in \{1, \dots,m\}$}
    \State $\hat{\vbeta}_s \gets \argmin_{\vbeta \in \sR^{d}} \Ls\left(\vbeta ; \Dexp_{s}\right)$ 
    \For{$j \in \{1, \dots,p\}$}
        \State $\mathrm{count}_j \gets \mathrm{count}_j + \mathbbm{1}{\{ \norm{\hat{\vbeta}^{(j)}_s}_2 \geq \omega \}}$
    \EndFor
\EndFor
\State $\hat{J}\gets \{ j \leq p \ \vert \ \mathrm{count}_j \geq  m\alpha\}$
\State $\hat{k} \gets \frac{1}{|\hat{J}|} \sum_{j \in \hat{J}} k_j$
\end{algorithmic}
\end{algorithm}
%=======================================================================
%=======================================================================
%=======================================================================
%=======================================================================
%=======================================================================

\section{Extended Literature Review}\label{app:litreview}
In this section, we present an overview of works that consider learning a potentially low-dimensional reward function by leveraging data of similar bandit tasks.

\paragraph{Linear Contextual Bandits with Shared Representation.} The common assumption here is that, for all $s \in [m]$, the reward function is linear, i.e., $f_s(\bx) = \langle \btheta_s, \bx\rangle$ where $\btheta_s = \mB\bw_s$. The matrix $\mB \in \sR^{d\times r}$ is a shared representation matrix and $r\ll d$ is an intrinsic dimension. This assumption becomes more intuitive if we re-write the reward as $f_s(\bx) = \langle \bw_s, \mB^T\bx\rangle$, which implies that there exists a mapping $\mB^T: \sR^d \rightarrow \sR^r$ that produces a low-dimensional representation of the actions.
Our reward assumption implies that there exists a sparse matrix $\mS \in \sR^{d\times d^\star}$ which satisfies $f_s(\bx) = \langle \btheta_s, \mS^T\vphi(\bx)\rangle$ and screens the relevant features $\vphi_j$ with $j \in \tJ$. 
The intrinsic dimension $r$ then corresponds to $\vert \tJ \vert$. 

Recent works on shared representation learning often consider the contextual setting, where at every step of the bandit problem, actions may only be chosen from a set $\calA_{s, t}$. Once the action is chosen, a noisy reward is observed. Regarding the occurrence of the tasks, two scenarios are often studied: the multi-task setting, where all the tasks are solved concurrently, and the lifelong setting, where the tasks arrive consecutively. \cref{tab:litreview1} summarizes these efforts in terms of the obtained regret bounds. Here, $\tilde \calO$ hides polylogarithmic factors.
With the exception of \citep{hu2021near}, these works either 1) require forced exploration to fulfill sufficient exploration assumptions (SE) similar to \cref{ass:compatibility}, or 2) design a greedy algorithm assuming that the actions in set $\calA_{s,t}$ are sampled from a diverse context distribution (DC) which gives free exploration \citep[cf.][]{bastani2021mostly}. This suggests that for minimax optimality, either the algorithm has to explore, or the presented context should induce exploration for free.
To better understand the tightness of the results in \cref{tab:litreview1}, we recall that the oracle solver which has knowledge of the representation matrix $\mB$, has a lower-bound of $R^\star(n)= \Omega(\sqrt{rn\log n \log k})$, when $|\calA_{s,t}|=k$ \citep{li2019nearly}. If $\calA_{s,t}$ is an ellipsoid, the lower-bound achievable by the oracle is $\Omega(r\sqrt{n})$ \citep{li2021tight}.  Clearly, for $r \ll d$, the algorithms of \cite{hu2021near}, \cite{cella2021multi}, and \cite{cella2022meta} do not converge to the oracle solver as $m\rightarrow \infty$, since $R(m,n)/m \not\to R^\star(n)$. 

\begin{table}[ht]
\centering
\begin{tabular}{l| l| l |l |l |c | c} 
  & \myalign{c|}{$\calA_{s,t}$} &  \myalign{c|}{$r$} & \myalign{c|}{Tasks} & \myalign{c|}{Expected Lifelong Regret} & Base Policy & Assumptions \\ [0.6ex] 
 \hline
 \multirow{ 2}{*}{\cite{yang2021impact} }
& finite & known & conc.& $\tilde \calO \left(m\sqrt{rn} + \sqrt{rdnm}\right)$ & Greedy & DC\\ [0.6ex] 
& ellipsoid & known & conc.& $\tilde \calO \Big(mr\sqrt{n} + d^{3/2}r\sqrt{nm}\Big)$ & ETC & SE\\ [0.6ex] 
\hline
\cite{hu2021near} & compact & known & conc.& \cellcolor{notopt!70} $\tilde \calO \Big(m\sqrt{drn} + d\sqrt{rnm}\Big)$ & OFUL & - \\[0.6ex] 
 \hline
\cite{cella2022meta} & finite &  unknown & seq. & \cellcolor{notopt!70}$\tilde \calO \left(mr\sqrt{n} + r\sqrt{dnm}\right)$ & Greedy & DC \\[0.6ex] 
 \hline
\cite{cella2021multi} & finite & unknown & seq./conc. & \cellcolor{notopt!70} $\tilde \calO \Big(mr\sqrt{n}\Big)$ & Greedy & DC \& SE \\[0.6ex]
 \hline 
\cite{yang2022nearly} & compact &  known & seq. & $\tilde \calO \Big(mr\sqrt{n} + d\sqrt{rnm}\Big)$ & ETC & SE\\[0.6ex] 
 \hline
\algon (Ours)  & compact &  unknown & seq. & $\tilde \calO \Big(mr\sqrt{n} + m^{3/4}\sqrt{n} \Big)$ & any & SE\\[0.6ex] 
\end{tabular}
\caption{Overview of recent work on representation learning for contextual linear bandits. Oracle lower-bound is $R^\star(m,n)=\Omega\left(mr\sqrt{n}\right)$ for infinite action set, and $R^\star(m,n)=\Omega\left(m\sqrt{rn\log n\log k}\right)$ for finite set. Polylog terms are not included.\label{tab:litreview1}}
\end{table}

\paragraph{Bayesian Bandits with Shared Prior Distributions.} Alternatively, some works consider a Bayesian reward model, but without any assumption on sparsity, or low-dimensional representations. Let $f_s(\bx) = \langle \btheta_s, \bx\rangle$ where $\btheta_s$ are i.i.d.~from $\calN(\bm{\mu}, \Sigma)$ and the parameters $(\bm{\mu}, \Sigma)$ are shared across all tasks.
\cite{pmlr-v151-peleg22a} assume that $(\bm{\mu}, \Sigma)$ are unknown, and estimate them using the exploratory action-reward pairs collected during the first $m_0$ tasks. The suggested meta-algorithm can be wrapped around any Quasi-Bayesian base policy, such as Thompson Sampling \citep{thompson1933likelihood} or Information Directed Sampling \citep{russo2014learning}; however, the resulting algorithm over-explores as indicated by the $\tilde \calO(md)$ term in the regret bound (see \cref{tab:litreview2}).\looseness -1

Taking a hierarchically Bayesian approach, \cite{basu2021no} and \cite{hong2022hierarchical} further assume that $\bm{\mu} \sim \calN(\bm{\mu}_0, \Sigma_0)$ where $\bm{\mu}_0$ is unknown, but both covariance matrices $\Sigma$ and $\Sigma_0$ are known. The prior distribution of $\bm{\mu}$ is updated after each task, according to the evidence collected during the task. Both papers design a meta-algorithm with Thompson Sampling as the base solver. While \cite{basu2021no} suffers from over-exploration, \cite{hong2022hierarchical} does not require any exploration. Indeed, if $\Sigma$, the covariance matrix between the actions, is known, it helps with inferring rewards of other actions, and reduces the need for uniform exploration.

An overview is given in \cref{tab:litreview2}; here $R^\star(m,n)$ indicates the Bayesian lifelong regret of the oracle agent who has knowledge of $(\bm{\mu}, \Sigma)$. Note that \cref{thm:lifelong} gives a slightly stronger result, which is a high-probability bound over the regret. Here, we have taken the average to make it comparable with the Bayes regret reported in other works. As $m$ grows, the average single-task regret is upper bounded by $R(m,n)/m$, implying that only \cite{hong2022hierarchical} and \algon can converge to the oracle solver.

\begin{table}[ht]
\centering
\begin{tabular}{l| c| c |l |l |c | c} 
  & $\Sigma \slash\Sigma_0$ &  sparse & \myalign{c|}{Tasks} & \myalign{c|}{Bayesian Regret} & Policy & Exp \\[0.6ex] 
 \hline
\cite{basu2021no} & known & no & seq. & $ \cellcolor{notopt!70}\tilde \calO \left(R^\star(m,n) + \sqrt{dnm} + md \right)$ & TS & yes \\[0.6ex] 
 \hline
\cite{pmlr-v151-peleg22a} & unknown &  no & seq. & $ \cellcolor{notopt!70}\tilde \calO \Big((1+d^3/\sqrt{m}) R^\star(m,n)+ md\Big)$ & any QB & yes\\[0.6ex] 
\hline
 \multirow{ 2}{*}{\cite{hong2022hierarchical}} & known & no & seq. &  $\tilde \calO \left(R^\star(m,n) + \sqrt{dmn} + d^{3/2} \right)$ & TS & no\\[0.6ex] 
& known & no & conc.& \cellcolor{notopt!70} $\tilde \calO \left(R^\star(m,n) + \sqrt{dmn} + md^{3/2}\right)$ & TS & yes   \\
\hline
\algon (Ours) & - & yes & seq. & $\tilde \calO \Big(R^\star(m,n) + m^{3/4}\sqrt{n} + (mn)^{1/3}\log (md)\Big)$ & any & yes \\[0.6ex] 
\end{tabular}
\caption{Overview of recent work on meta-learning Bayesian priors for linear bandits. All works consider a compact action set, except for \cite{basu2021no}, which requires a finite set of actions selected from $\sR^d$. The regret of the oracle solver is denoted by $R^\star(m,n)$.\label{tab:litreview2}}
\end{table}

\paragraph{Overall Landscape of Research.} \label{par:overview}We merge the two lines of work in \cref{tab:litreview3}, to give an overview of ongoing efforts on meta-learning for linear bandits and the important properties of each method.
Column $\vert \calA_t \vert$ shows if the model holds for compact action sets, or only finite ones.
Columns ``sparse'' and ``learns $r$'' denote whether the model aims for a sparse solution, and whether it requires knowledge of the true sparsity $r$ or, preferably, learns it.
Column ``learns $\Sigma/\Sigma_0$'' only applies to Bayesian methods, where some assume the covariance matrix of $\btheta_s$ is known, and some estimate it from data.
Column ``Tasks'' shows if the method considers simultaneous or sequentially appearing bandit tasks.
Column ``O-opt'' refers to oracle optimality, and has a checkmark only if $R(m,n)/m \rightarrow R^\star(n)$.
Column ``MS Cost'' shows the cost of model selection/meta-learning. In particular, it shows whether the additional regret, incurred due to not knowing the true representation/features, is logarithmic or polynomial in the dimension $d$.
Column ``Policy'' shows the base BO solvers that can be paired with the meta-learning algorithm, ``any'' indicates that the method can work with any (linear) policy, and ``any QB'' refers to Quasi-Bayesian methods.
Column ``Ass.'' shows the assumptions of the method on diversity of data, SE indicates Sufficient Exploration type assumptions, and DC refers to Diverse Context assumptions.
Column ``Has no uniform draws'' shows whether the algorithm requires forced exploration or not.

\begin{table}[ht]
\centering
\begin{tabular}{l| c| c |c |c |c | c | c | c|c|c} 
 & $\vert\calA_t\vert$ & Sparse &\makecell{Learns \\ $r$} & \makecell{Learns\\ $\Sigma \slash\Sigma_0$} &  Tasks & O-opt & MS cost & Policy & Ass. & \makecell{ Has no \\unif. \\ draws} \\[0.6ex] 
 \hline
 \multirow{ 2}{*}{\cite{yang2021impact} } & $k$ & \cmark & \xmark & - & conc & \cmark & $\mathrm{poly}(d)$ & greedy & DC & \xmark \\[0.6ex] 
 \cline{2-11}
 & $\infty$ & \cmark & \xmark & - & conc &\cmark & $\mathrm{poly}(d)$ & ETC & SE & \xmark \\[0.6ex] 
 \hline
 \cite{hu2021near} & $\infty$ & \cmark & \xmark & - & conc & \xmark & $\mathrm{poly}(d)$ &  OFUL & - & \cmark \\[0.6ex] 
 \hline
 \cite{cella2021multi} & $k$ & \cmark &  \cmark &- & both & \xmark & $\log(d)$ & greedy & \makecell{DC\\SE} & \xmark \\[0.6ex] 
 \hline
 \cite{cella2022meta} & $k$ & \cmark &  \cmark &- & seq & \xmark & $\mathrm{poly}(d)$ & greedy & DC &  \cmark\\[0.6ex] 
 \hline
 \cite{yang2022nearly} & $\infty$ & \cmark & \xmark & - & conc & \cmark & $\mathrm{poly}(d)$ & ETC & SE & \xmark \\[0.6ex] 
 \hline
 \hline
\cite{basu2021no} & $k$ & \xmark & - &  \xmark & seq &\xmark & $\mathrm{poly}(d)$ & TS & SE & \xmark \\[0.6ex] 
 \hline
\cite{pmlr-v151-peleg22a} & $\infty$ & \xmark & - & \cmark & seq & \xmark & $\mathrm{poly}(d)$ & any QB & SE & \xmark\\[0.6ex] 
\hline
 \multirow{ 2}{*}{\cite{hong2022hierarchical}} & $\infty$ & \xmark & - &  \xmark & seq & \cmark & $\mathrm{poly}(d)$ & TS & - & \cmark \\[0.6ex] 
 \cline{2-11}
& $\infty$ & \xmark &   - & \xmark & conc & \xmark & $\mathrm{poly}(d)$ & TS & 
 SE & \xmark \\
\hline
\hline
\algon (Ours) &  $\infty$ & \cmark &  \cmark & - & seq & \cmark & $\log(d)$ & any & SE & \xmark\\[0.6ex] \cline{2-11}
\falgon (Ours) & $\infty$ & \cmark &  \cmark & - & seq & \cmark & $\log(d)$ & any & SE & \xmark
\end{tabular}
\caption{Collective pros and cons of related works: \algon gives an overall Pareto-optimal solution. Refer to the corresponding paragraph in \cref{par:overview} for information on the meaning of each column. \cref{tab:litreview_main} presents a concise version. \label{tab:litreview3}}
\end{table}
%=======================================================================
%=======================================================================
%=======================================================================
%=======================================================================
%=======================================================================
\section{Generality of the Kernel Assumption}\label{app:wlog}
In \cref{sec:problem}, we claim that the average-of-kernels formulation, i.e.,
\[
\tk(\cdot, \cdot) =\frac{1}{\vert \tJ\vert} \sum_{j \in \tJ} k_j(\cdot, \cdot)
\]
is without loss of generality equivalent to assuming a linear combination,
\[
\tk(\cdot, \cdot) = \sum_{j \in \tJ}\alpha_j k_j(\cdot, \cdot).
\]
Here, we formally show this claim.
Assume there exist $\alpha_1, \dots, \alpha_p > 0$ and kernels $k_1,\dots, k_p$ such that
\begin{align*}
    \tk(\bx,\bx^\prime) = \sum_{j \in J^\star} \alpha_j k_j(\bx,\bx^\prime), \quad \forall \bx, \bx^\prime \in \calX.
\end{align*}

Let $f \in \mathcal{H}_{\tk}$, then there exist $\beta_1 \in \mathbb{R}^{d_1}, \dots, \beta_p \in \mathbb{R}^{d_p}$ such that for all $\bx \in \mathcal{X}$
\begin{align*}
    f(\bx) = \sum_{j \in J^\star} \sqrt{\alpha_j} \beta_j^{\top} \phi_j(\bx).
\end{align*}

Define $m_j \coloneqq \max\{k_j(\bx, \bx) \, \mid \, \bx \in \mathcal{X} \}$, $\Tilde{\beta}_j \coloneqq p m_j \beta_j \sqrt{\alpha_j}$ and $\Tilde{k}_j \coloneqq k_j / m_j$ for all $j \in \{1,\dots,p\}$, then
\begin{align*}
    f(\bx) = \frac{1}{p} \sum_{j \in J^\star} \Tilde{\beta}_j^{\top} \phi_j(\bx)
\end{align*}
and therefore $f \in \mathcal{H}_{\Tilde{k}^\star}$ for
\begin{align*}
    \Tilde{k}^\star(\bx,\bx') = \frac{1}{|J^\star|} \sum_{j \in J^\star} \Tilde{k}_j(\bx,\bx^\prime).
\end{align*}

This shows that the corresponding Reproducing Kernel Hilbert Spaces are equivalent, i.e. the same functions reside in both, while the norm is scaled. Therefore, we can assume, without loss of generality, that the base kernels are normalized and that the true kernel is an average of base kernels.
%=======================================================================
%=======================================================================
%=======================================================================
%=======================================================================
%=======================================================================
\section{Consistency of \algoff (Proof of \texorpdfstring{Theorem~\ref{thm:offline_main_consitency}}{})} \label{app:offline}
We start by proving the necessary lemmas. Throughout this section we assume a slightly more general setting. More precisely, we assume that we have $n_s \leq n$ samples in task $s$, which means that the total sample size of the meta-dataset is $N = N_m \coloneqq \vert \Dexp \vert = \sum_{s=1}^m n_s$.

\begin{definition}[sub-Gaussian random variables]
    Let $X$ be a random variable. We call $X$ a \emph{$\sigma$ sub-Gaussian random variable} if $\mathbb{E}[X] = 0$ and
    \begin{equation}
        \mathbb{P}[|X| \geq t] \leq 2\exp\left(-\frac{t^2}{2\sigma^2}\right).
    \end{equation}
\end{definition}

\begin{lemma}[Theorem 6.3.2 of \citet{vershynin2018high}]
\label{thm:concentration}
    Let $\epsilon_1, \dots, \epsilon_n$ be independent, zero mean, unit variance sub-Gaussian random variables. Define $\epsilon = (\epsilon_1, \dots, \epsilon_n)$. Let $A \in \sR^{m \times n}$ and $t \geq 0$. Then
    \begin{align*}
        \mathbb{P}\left( \Big \vert \norm{A\epsilon}_2 - \norm{A}_F \Big \vert \geq t \right) \leq \exp\left( -\frac{t^2}{2\norm{A}^2_2} \right).
    \end{align*}
\end{lemma}


\begin{corollary}
\label{cor:concentration}
    Let $\epsilon_1, \dots, \epsilon_n$ be $i.i.d.$~$\sigma$ sub-Gaussian random variables and define $\epsilon = (\epsilon_1, \dots, \epsilon_n)$. Let $A \in \sR^{m \times n}$ and $t \geq \sigma \sqrt{\trace(AA^T)}$. Then
    \begin{align*}
        \mathbb{P}\left( \norm{A\epsilon}_2 \geq t \right) \leq \exp\left( -\frac{\left(t/\sigma - \sqrt{\trace(AA^T)}\right)^2}{2\norm{AA^T}_2} \right).
    \end{align*}
\end{corollary}

\begin{proof}
    The standard deviation of a $\sigma$ sub-Gaussian random variable is at most $\sigma$. Therefore
    \begin{align*}
        \mathbb{P}\left( \norm{A\epsilon}_2 \geq t \right)
        & = \mathbb{P}\left( \norm{A\epsilon}_2 / \sqrt{\Var(\epsilon_1)}\geq t/ \sqrt{\Var(\epsilon_1)} \right)\\
        & \leq \mathbb{P}\left( \norm{A\epsilon}_2 / \sqrt{\Var(\epsilon_1)}\geq t/ \sigma \right).
    \end{align*}
    It holds that $\norm{A}_F = \sqrt{\trace(AA^T)}$. Define $\tilde{\epsilon} = \epsilon / \sqrt{Var(\epsilon_1)}$. We have
    \begin{align*}
        \mathbb{P}\left( \norm{A\epsilon}_2 \geq t \right)
        & \leq \mathbb{P}\left( \norm{A\tilde{\epsilon}}_2 - \norm{A}_F \geq t/\sigma - \sqrt{\trace(AA^T)}\right)\\
        & \leq \mathbb{P}\left( \vert \norm{A\tilde{\epsilon}}_2 - \norm{A}_F \vert \geq t/\sigma - \sqrt{\trace(AA^T)}\right).
    \end{align*}
    Using \cref{thm:concentration} and noting that $\norm{A}^2_2 = \norm{AA^T}_2$ yields the desired result.
\end{proof}


\begin{lemma}
\label{lem:3_1}
    Consider the model in \cref{eq:meta_loss} with $\sigma$ sub-Gaussian $i.i.d.$~noise.
    Then, for $\frac{\lambda N}{4\sigma} > \sqrt{\trace(\mPhi^{(j)} (\mPhi^{(j)})^T)}$ with probability at least
    \begin{align*}
        1 - \sum_{j=1}^p \exp\left( -\frac{\left(\frac{\lambda N}{4\sigma} - \sqrt{\trace(\mPhi^{(j)} (\mPhi^{(j)})^T)}\right)^2}{2\norm{\mPhi^{(j)}(\mPhi^{(j)})^T}_2} \right)    
    \end{align*}
    we have for any solution $\hat{\vbeta}$ of \cref{eq:meta_loss}
    \begin{align*}
        \frac{1}{N} \norm{\mPhi (\hat{\vbeta} - \vbeta^\star)}_2^2 + &\frac{\lambda}{2} \sum_{j=1}^p \norm{\hat{\vbeta}^{(j)} - \vbeta^{\star(j)}}_2 \leq \\
        &2\lambda \sum_{j \in J^\star} \min \left( \norm{\hat{\vbeta}^{(j)} - \vbeta^{\star(j)}}_2 ,\norm{\vbeta^{\star(j)}}_2 \right).
    \end{align*}
\end{lemma}


\begin{proof}
    This proof is inspired by the proof of Lemma 3.1 in \citet{lounici2011oracle}.\\
    For all solutions $\hat{\vbeta}$ of \cref{eq:meta_loss}
    \begin{align*}
        \hat{\vbeta} = \argmin_{\vbeta \in \sR^{dm}} \frac{1}{N} \norm{\mPhi \vbeta^\star + \epsilon - \mPhi \vbeta}_2^2 + \lambda \sum_{j=1}^p \norm{\vbeta^{(j)}}_2.
    \end{align*}
    Therefore for all $\vbeta \in \sR^{dm}$
    \begin{align*}
        \frac{1}{N} \norm{\mPhi \vbeta^\star + \epsilon - \mPhi \hat{\vbeta}}_2^2 + \lambda \sum_{j=1}^p \norm{\hat{\vbeta}^{(j)}}_2 \leq \frac{1}{N} \norm{\mPhi \vbeta^\star + \epsilon - \mPhi \vbeta}_2^2 + \lambda \sum_{j=1}^p \norm{\vbeta^{(j)}}_2.
    \end{align*}
    This yields 
    \begin{align*}
        \frac{1}{N} \norm{\mPhi (\hat{\vbeta} - \vbeta)}_2^2 \leq&  \frac{1}{N} \norm{\mPhi(\vbeta - \vbeta^\star)}_2^2 +\frac{2}{N} \epsilon^T \mPhi(\hat{\vbeta} - \vbeta) \\
        &+ \lambda \sum_{j=1}^p \left( \norm{\vbeta^{(j)}}_2 - \norm{\hat{\vbeta}^{(j)}}_2 \right).
    \end{align*}
    And in particular if $\vbeta = \vbeta^\star$, then,
    \begin{align*}
        \frac{1}{N} \norm{\mPhi (\hat{\vbeta} - \vbeta^\star)}_2^2 \leq& \frac{2}{N} \epsilon^T \mPhi(\hat{\vbeta} - \vbeta^\star) \\
        &+ \lambda \sum_{j=1}^p \left( \norm{\vbeta^{\star(j)}}_2 - \norm{\hat{\vbeta}^{(j)}}_2 \right).
    \end{align*}
    By \cref{cor:concentration} and union bound we have jointly for all $j \leq p$ with probability at least 
    \begin{align*}
        1 - \sum_{j=1}^p \exp\left( -\frac{\left(\frac{\lambda N}{4\sigma} - \sqrt{\trace(\mPhi^{(j)} (\mPhi^{(j)})^T)}\right)^2}{2\norm{\mPhi^{(j)}(\mPhi^{(j)})^T}_2} \right)
    \end{align*}
    that
    \begin{align*}
    \label{eq:conc}
        \norm{(\epsilon^T \mPhi)^{(j)}}_2 \leq \frac{\lambda N}{4}.
    \end{align*}
    Therefore, by Cauchy-Schwarz,
    \begin{align*}
        \epsilon^T \mPhi(\hat{\vbeta} - \vbeta^\star) &\leq \sum_{j=1}^p \norm{(\epsilon^T \mPhi)^{(j)}}_2 \norm{\hat{\vbeta}^{(j)} - \vbeta^{\star(j)}}_2\\
        & \leq \frac{\lambda N}{4} \sum_{j=1}^p \norm{\hat{\vbeta}^{(j)} - \vbeta^{\star(j)}}_2.
    \end{align*}
    This implies that
    \begin{align*}
        \frac{1}{N} \norm{\mPhi (\hat{\vbeta} - \vbeta^\star)}_2^2 \leq \frac{\lambda}{2} \sum_{j=1}^p \norm{\hat{\vbeta}^{(j)} - \vbeta^{\star(j)}}_2  + \lambda \sum_{j=1}^p \left( \norm{\vbeta^{\star(j)}}_2 - \norm{\hat{\vbeta}^{(j)}}_2 \right).
    \end{align*}
    Therefore
    \begin{align*}
        \frac{1}{N} \norm{\mPhi (\hat{\vbeta} - \vbeta^\star)}_2^2 + &\frac{\lambda}{2} \sum_{j=1}^p \norm{\hat{\vbeta}^{(j)} - \vbeta^{\star(j)}}_2 \leq \\
        &\lambda \sum_{j=1}^p \left( \norm{\hat{\vbeta}^{(j)} - \vbeta^{\star(j)}}_2 + \norm{\vbeta^{\star(j)}}_2 - \norm{\hat{\vbeta}^{(j)}}_2 \right)
    \end{align*}
    and since $\vbeta^{\star(j)} = 0$ for all $j \notin J^\star$
    \begin{align*}
        \frac{1}{N} \norm{\mPhi (\hat{\vbeta} - \vbeta^\star)}_2^2 + &\frac{\lambda}{2} \sum_{j=1}^p \norm{\hat{\vbeta}^{(j)} - \vbeta^{\star(j)}}_2 \leq \\
        &2\lambda \sum_{j \in J^\star} \min \left( \norm{\hat{\vbeta}^{(j)} - \vbeta^{\star(j)}}_2 ,\norm{\vbeta^{\star(j)}}_2 \right).
    \end{align*}
    This proves the statement.
\end{proof}

\begin{lemma}
\label{thm:meta_kel_general}
    Let Assumption \ref{ass:betamin} hold. If $\hat{\vbeta}$ is a solution of \cref{eq:meta_loss} then
    we have for $\frac{\lambda N}{4\sigma} > \sqrt{\trace(\mPhi^{(j)} (\mPhi^{(j)})^T)}$ with probability at least
    \begin{align*}
        1 - \sum_{j=1}^p \exp\left( -\frac{\left(\frac{\lambda N}{4\sigma} - \sqrt{\trace(\mPhi^{(j)} (\mPhi^{(j)})^T)}\right)^2}{2\norm{\mPhi^{(j)}(\mPhi^{(j)})^T}_2} \right)    
    \end{align*}
    that
    \begin{align*}
        \sum_{j \notin J^\star} \norm{\hat{\vbeta}^{(j)} - \vbeta^{\star(j)}}_2 \leq 3 \sum_{j \in J^\star} \norm{\hat{\vbeta}^{(j)} - \vbeta^{\star(j)}}_2,
    \end{align*}
    and
    \begin{equation}
        \sum_{j=1}^p \norm{\hat{\vbeta}^{(j)} - \vbeta^{\star(j)}}_2 \leq \frac{8 \lambda m}{\bar{\kappa}^2 },
    \end{equation}
    where
    \begin{equation}
        \bar{\kappa} \coloneqq \frac{\sqrt{m}}{\sqrt{N}} \frac{\norm{\mPhi (\hat{\vbeta} - \vbeta^\star)}_2}{\sum_{j \in J^\star} \norm{\hat{\vbeta}^{(j)} - \vbeta^{\star(j)}}_2}.
    \end{equation}
\end{lemma}

\begin{proof}
    Lemma \ref{lem:3_1} implies
    \begin{align}
    \label{eq_uufg}
        \sum_{j=1}^p \norm{\hat{\vbeta}^{(j)} - \vbeta^{\star(j)}}_2 \leq 4 \sum_{j \in J^\star} \norm{\hat{\vbeta}^{(j)} - \vbeta^{\star(j)}}_2
    \end{align}
    and therefore
    \begin{align*}
        \sum_{j \notin J^\star} \norm{\hat{\vbeta}^{(j)} - \vbeta^{\star(j)}}_2 \leq 3 \sum_{j \in J^\star} \norm{\hat{\vbeta}^{(j)} - \vbeta^{\star(j)}}_2,
    \end{align*}
    which yields the first statement of  \cref{thm:meta_kel_general}.\\
    Again, by the first statement of  \cref{lem:3_1}, we have with probability at least
    \begin{align*}
        1 - \sum_{j=1}^p \exp\left( -\frac{\left(\frac{\lambda N}{4\sigma} - \sqrt{\trace(\mPhi^{(j)} (\mPhi^{(j)})^T)}\right)^2}{2\norm{\mPhi^{(j)}(\mPhi^{(j)})^T}_2} \right)    
    \end{align*}
    that
    \begin{equation}
        \norm{\mPhi (\hat{\vbeta} - \vbeta^\star)}_2 \leq \sqrt{2\lambda N} \sqrt{\sum_{j \in J^\star} \norm{\hat{\vbeta}^{(j)} - \vbeta^{\star(j)}}_2}.
    \end{equation}
    Therefore
    \begin{align*}
        \sum_{j \in J^\star} \norm{\hat{\vbeta}^{(j)} - \vbeta^{\star(j)}}_2
        &\leq \frac{\sum_{j \in J^\star} \norm{\hat{\vbeta}^{(j)} - \vbeta^{\star(j)}}_2}{\norm{\mPhi (\hat{\vbeta} - \vbeta^\star)}_2} \norm{\mPhi (\hat{\vbeta} - \vbeta^\star)}_2\\
        & \leq \frac{\sum_{j \in J^\star} \norm{\hat{\vbeta}^{(j)} - \vbeta^{\star(j)}}_2}{\norm{\mPhi (\hat{\vbeta} - \vbeta^\star)}_2} \sqrt{2\lambda N} \sqrt{\sum_{j \in J^\star} \norm{\hat{\vbeta}^{(j)} - \vbeta^{\star(j)}}_2}.
    \end{align*}
    Solving this yields
    \begin{align*}
        \sum_{j \in J^\star} \norm{\hat{\vbeta}^{(j)} - \vbeta^{\star(j)}}_2
        \leq \frac{2\lambda m}{\bar{\kappa}^2},
    \end{align*}
    and by \cref{eq_uufg} we have
    \begin{align*}
        \sum_{j=1}^p \norm{\hat{\vbeta}^{(j)} - \vbeta^{\star(j)}}_2 &\leq 4 \sum_{j \in J^\star} \norm{\hat{\vbeta}^{(j)} - \vbeta^{\star(j)}}_2\\
        & \leq \frac{8 \lambda m}{\bar{\kappa}^2 }.
    \end{align*}
    
\end{proof}

\begin{definition}[compatibility variable]
\label{def:comvar_app}
    Let
    \begin{align*}
        S \coloneqq \Big\{&(J,b) \in \mathcal{P}(\{1,\dots,p\}) \times (\sR^d \backslash \{0\}) \ \Big\vert \ |J| \leq s^\star, \sum_{j \notin J} \|b^{(j)}\|_2 \leq 3 \sum_{j \in J} \|b^{(j)}\|_2 \Big\}.
    \end{align*}
    For $n,d \in \mathbb{N}$ and $\mPhi \in \sR^{N\times d}$ we define $\kappa(\mPhi)$ by
    \begin{align*}
        \kappa(\mPhi) \coloneqq \inf_{(J,b) \in S} \frac{\sqrt{m}}{\sqrt{N}} \frac{\|\mPhi b\|_2}{ \sum_{j \in J} \|b^{(j)}\|_2}
    \end{align*}
    and call it the compatibility variable of $\mPhi$.
\end{definition}

\begin{remark}
\label{rem:kappa}
    It holds that $\kappa \leq \bar{\kappa}$.
\end{remark}

\begin{corollary}
\label{cor:main}
    Let $0< \omega < c_1$ and let $\hat{\vbeta}$ be a solution of \cref{eq:meta_loss} with
    \begin{align*}
        \lambda \leq \frac{\bar{\omega} \kappa^2 }{8 \sqrt{m}},
    \end{align*}
    where $\bar{\omega} \coloneqq \min \{ \omega, c_1 - \omega\}$.
    Then we have for $\frac{\lambda N}{4\sigma} > \sqrt{\trace(\mPhi^{(j)} (\mPhi^{(j)})^T)}$ with probability at least
    \begin{align*}
        1 - \sum_{j=1}^p \exp\left( -\frac{\left(\frac{\lambda N}{4\sigma} - \sqrt{\trace(\mPhi^{(j)} (\mPhi^{(j)})^T)}\right)^2}{2\norm{\mPhi^{(j)}(\mPhi^{(j)})^T}_2} \right) 
    \end{align*}
    that
    \begin{equation}
        \frac{1}{\sqrt{m}} \sum_{j=1}^p \norm{\hat{\vbeta}^{(j)} - \vbeta^{\star(j)}}_2 \leq \bar{\omega}.
    \end{equation}
    If additionally Assumption \ref{ass:betamin} holds, then we have with the same probability for
    \begin{align*}
        \hat{J} \coloneqq \left\{ j \in \{1,\dots, p\} \ \Big\vert \ \norm{\hat{\vbeta}^{(j)}}_2 > \omega \sqrt{m} \right\}
    \end{align*}
    that
    \begin{align*}
        \hat{J} = J^\star.
    \end{align*}
\end{corollary}

\begin{proof}
    The first statement follows directly from \cref{thm:meta_kel_general}.\\
    Assume $j \in J^\star$. Then by Assumption \ref{ass:betamin}
    \begin{align*}
        \norm{\hat{\vbeta}^{(j)}}_2 > \sqrt{m} \Big( c_1 - \norm{\hat{\vbeta}^{(j)} - \vbeta^{\star(j)}}_2 \Big)\geq \sqrt{m} (c_1 - \bar{\omega}) \geq\omega \sqrt{m},
    \end{align*}
    which implies $J^\star \subset \hat{J}$.
    Assume $j \notin J^\star$, then
    \begin{align*}
        \norm{\hat{\vbeta}^{(j)}}_2 \leq \norm{\hat{\vbeta}^{(j)} - \vbeta^{\star(j)}}_2 +  \norm{\vbeta^{\star(j)}}_2 \leq \bar{\omega}\sqrt{m} \leq \omega \sqrt{m},
    \end{align*}
    which implies $\hat{J} \subset J^\star$.
\end{proof}

\begin{remark}
    Choosing $\omega$ optimally yields $\omega = \bar{\omega} = c_1/2$.
\end{remark}

\begin{proof}[\textbf{Proof of \cref{thm:offline_main_consitency}}]
    Note that $\mPhi^{(j)} \in \sR^{N \times md_j}$ is block-diagonal. Since by assumption $k_j(x,x') \leq 1, \forall j \leq p$ we have
    \begin{align*}
        \trace\left((\mPhi^{(j)})^T \mPhi^{(j)}\right) = \trace\left(\mPhi^{(j)} (\mPhi^{(j)})^T\right) = \sum_{s=1}^m \sum_{i=1}^{n_s} k_j\left(x^{(s)}_i, x^{(s)}_i\right) \leq N,
    \end{align*}
    and
    \begin{align*}
        \norm{(\mPhi^{(j)})^T \mPhi^{(j)}}_2
        &= \norm{\mPhi^{(j)} (\mPhi^{(j)})^T}_2 \\
        &\leq \max_{s\leq m} \trace\left(\mPhi_s^{(j)} (\mPhi_s^{(j)})^T\right)\\
        &\leq \max_{s\leq m} \sum_{i=1}^{n_s}  k_j\left(x^{(s)}_i, x^{(s)}_i\right)\\
        &\leq n.
    \end{align*}
    Corollary \ref{cor:main} yields the result.
\end{proof}

%=======================================================================
%=======================================================================
%=======================================================================
%=======================================================================
%=======================================================================
\section{Lifelong Analysis (Proof of \texorpdfstring{Theorem~\ref{thm:lifelong}}{})} \label{app:lifelong}

We start by proving a generic variant of \cref{thm:lifelong}, from which we can obtain the theorem in the main text as a corollary.

\begin{theorem}
\label{thm:lifelong_main}
    Assume that the true reward functions $f_1, \dots, f_m$ satisfy $\norm{f_i}_{\tH} \leq B$ for some constant $B > 0$. Assume $\{n_s\}_{s\in\mathbb{N}}$ is a non-increasing sequence with $n_s \leq n, \forall s$. Define $N_m \coloneqq \sum_{s=1}^m n_s$. Let $\nu$ be a distribution on $\calX^{N}$ independent of $\bm{\epsilon}_1, \dots, \bm{\epsilon}_m$. Let $V \sim \nu$ be the random vector used for forced exploration. Let $\tilde{\mPhi}_s \in \sR^{N_s \times md}$ be the data matrix obtained by forced exploration. Assume the forced exploration distribution $\nu$ and $\{k_j\}_{j\leq p}$ are such that, with probability at least $1-\delta/4$, there exists $c_{\kappa} > 0$ such that $\kappa(\tilde{\mPhi}_s) \geq c_{\kappa}, \forall s \leq m$.
    Assume further that \bba using the true kernel function for $m$ tasks with $n$ interactions with independent noise achieves with probability at least $1-\delta/2$ cumulative regret lower than $\Roracle(m, n)$ in the worst-case. Then, for $m_0 \in \mathbb{N}$ and $0 < \delta < 1$, if
    \begin{align*}
        N_{m_0} \geq \frac{2n_1 \log(4mp/\delta)}{(\sqrt{N_m/m} \frac{c_1c_{\kappa}^2}{32\sigma} - 1)^2},
    \end{align*}
    with probability at least $1-\delta$ \algon achieves
    \begin{align*}
        R(m,n) \leq 2B m_0 n + 2B N_m + \Roracle(m, n).
    \end{align*}
\end{theorem}

\begin{proof}
    Denote by $C \coloneqq \{v \ \vert \ \kappa(\tilde{\mPhi}_s(v)) \geq c_{\kappa}, \forall s \leq m \}$ the set of data points such that $\kappa$ is lower bounded by $c_{\kappa}$. By assumption we have $\mathbb{P}[V \in C] \geq 1 -\delta/4$.
    Denote by $\hat{J}_{s} \subset \{1,\dots,p\}$ the sparsity structure predicted by \algon after the first $s$ tasks. 
    Note that $\tilde{\mPhi}^{(j)}_s \in \sR^{n_s m \times md_j}$ is block-diagonal. Since by assumption $k_j(x,x') \leq 1, \forall j \leq p$ we have
    \begin{align*}
        \trace((\tilde{\mPhi}^{(j)}_s)^T \tilde{\mPhi}^{(j)}_s) = \trace(\tilde{\mPhi}^{(j)}_s (\tilde{\mPhi}_s^{(j)})^T) = \sum_{s=1}^m \sum_{i=1}^{n_s} k_j(x^{(s)}_i, x^{(s)}_i) \leq N,
    \end{align*}
    and
    \begin{align*}
        \norm{(\tilde{\mPhi}^{(j)}_s)^T \tilde{\mPhi}^{(j)}_s}_2
        &= \norm{\tilde{\mPhi}^{(j)}_s (\tilde{\mPhi}^{(j)}_s)^T}_2 \\
        &\leq \max_{s\leq m} \trace(\mPhi_s^{(j)} (\mPhi_s^{(j)})^T)\\
        &\leq \max_{s\leq m} \sum_{i=1}^{n_s}  k_j(x, x)\\
        &\leq \max_{s\leq m} n_s\\
        &= n_1.
    \end{align*}
    Since $V$ is independent of $\bm{\epsilon}_1, \dots, \bm{\epsilon}_m$, we have by Corollary \ref{cor:main} for all $s$ and $v' \in C$
    \begin{align*}
        \mathbb{P}\left[  \hat{J}_{s} = J^\star \mid V = v' \right] \geq 1 - p\exp\left(- \frac{N_s}{2n_1} \left( \sqrt{\frac{N_s}{s}}\frac{c_1 c_{\kappa}^2}{32 \sigma} -1\right)^2 \right).
    \end{align*}
    By union bound and since $\frac{N_s}{s}$ is non-increasing by assumption we have for $m_0 \leq m$
    \begin{align*}
        \mathbb{P}\left[ \forall m \geq s \geq m_0, \hat{J}_s = J^\star \mid V = v'  \right] &\geq 1 - \sum_{s=m_0}^m p\exp\left(- \frac{N_s}{2n_1} \left( \sqrt{\frac{N_s}{s}}\frac{c_1 c_{\kappa}^2}{32 \sigma} -1\right)^2 \right)\\
        & \geq 1 - m p\exp\left(- \frac{N_{m_0}}{2n_1} \left( \sqrt{\frac{N_m}{m}}\frac{c_1 c_{\kappa}^2}{32 \sigma} -1\right)^2 \right),
    \end{align*}
    where we defined $N_s \coloneqq \sum_{i=1}^s n_i$.
    If $m_0 \in \mathbb{N}$ is large enough such that
    \begin{align*}
        N_{m_0} \geq \frac{2n_1 \log(4mp/\delta)}{(\sqrt{N_m/m} \frac{c_1c_{\kappa}^2}{32\sigma} - 1)^2},
    \end{align*}
    then for all $v' \in C$
    \begin{align*}
        \mathbb{P}\left[ \forall m \geq s \geq m_0, \hat{J}_s = J^\star \mid V = v'  \right] &\geq 1 - \delta/4.
    \end{align*}
    By assumption we have
    \begin{align*}
        \mathbb{P}\left[ V \in C \right] \geq 1 - \delta/4.
    \end{align*}
    Because $V$ is independent of the noise
    \begin{align*}
        \mathbb{P}\left[  \exists m \geq s \geq m_0 , \hat{J}_s \neq J^\star \right] &= \int \mathbb{P}\left[ \exists m \geq s \geq m_0 ,\hat{J}_s \neq J^\star \mid V=v\right] p_V(v)dv\\
        &= \int_C \mathbb{P}\left[ \exists m \geq s \geq m_0,\hat{J}_s \neq J^\star \mid V=v\right] p_V(v)dv +\\
        & \quad \int_{C^c} \mathbb{P}\left[ \exists m \geq s \geq m_0,\hat{J}_s \neq J^\star \mid V=v\right] p_V(v)dv\\
        & \leq \mathbb{P} [V \in C^c] + \mathbb{P} [V \in C] \delta/4\\
        & \leq \delta/4 + \delta/4 = \delta/2.
    \end{align*}
    For all tasks that happen after task $m_0$ we have jointly with probability at least $1-\delta/2$ that $\hat{J}_s = J^\star$. \\
    Denote by $r(k, s)$ the regret that the base bandit algorithm achieves after $n$ interactions with kernel $k$ in task $s$. By assumption $\mathbb{P}[\sum_{s=m_0}^m r(\tk, s) \leq \Roracle(m-m_0, n)] \geq 1 - \delta/2$. Denote by $\hat{k}_s$ the predicted kernel for task $s$. By union bound
    \begin{align*}
        \mathbb{P} \left[ \sum_{s=m_0}^m r(\hat{k}_s, s) \leq \calO(\Roracle(m,n)) \right] &\geq \mathbb{P} \Bigg[\forall m_0 \leq s \leq m, \hat{k}_s = \tk \text{ and } \\
        & \qquad \sum_{s=m_0}^m r(\hat{k}_s, s) \leq \calO(\Roracle(m-m_0, n)) \Bigg]\\
        &\geq 1 - \mathbb{P} \left[\sum_{s=m_0}^m r(\tk, s) > \calO(\Roracle(m-m_0, n))  \right] \\
        & \quad - \mathbb{P} \left[\exists m_0 \leq s \leq m, \hat{k}_s \neq \tk \right]\\
        &\geq 1- \delta
    \end{align*}
    
    Therefore it holds with probability at least $1 - \delta$
    \begin{align*}
        R(m,n) \leq  m_0 n L +  L N_m + \Roracle(m-m_0, n) .
    \end{align*}
    Here, the first term is an upper bound on the regret in the first $m_0$ tasks. The other terms are an upper bound on the regret for the other $m-m_0$ tasks. They can be divided into the regret obtained by forced exploration and the regret obtained by the base bandit task. By Lemma \ref{lem_jjsj} we know that the maximum instantaneous regret $L$ is bounded by $2B$. Therefore
    \begin{align*}
        R(m,n) &\leq  m_0 n L +  L N_m + \Roracle(m-m_0, n) \\
        &\leq 2B m_0 n + 2B N_m + \Roracle(m, n) .
    \end{align*}
\end{proof}

\begin{lemma}
\label{lem_jjsj}
    Let $k$ be a kernel with $k(x,x') \leq 1, \forall x,x' \in \mathcal{X}$ and let $f \in \calH_k$ with $\norm{f}_{k} \leq B$, then for all $\bx \in \calX$
    \begin{align*}
        |f(\bx)| \leq B.
    \end{align*}
\end{lemma}

\begin{proof}
    By the reproducing property, we have
    \begin{align*}
        |f(\bx)| &= | \langle f(\cdot), k(\bx,\cdot)\rangle_k |\\
        &\leq \norm{f}_{\mathcal{H}_{k}} \sqrt{k(\bx,\bx)}\\
        & \leq B.
    \end{align*}
\end{proof}


A clarification is due, regarding the exact number of exploratory steps taken. 
In the algorithm design and in the main text, we require that during every task $s$, purely exploratory actions are taken at every step $i$ where $i \leq n_s$. 
The number of exploratory steps has to be an integer, while the proposed rate of $n_s = \sqrt{n}/s^{1/4}$ may not be an integer. 
Therefore, the $i \leq n_s$ condition implies that only the first $\floor{n_s}$ steps will be exploratory.
In our proofs so far, we have assumed that at least a total of $N_{m_0} = \sum_{s=1}^{m_0} n_s$ exploratory actions are chosen, which may well be larger than $\sum_{s=1}^{m_0} \floor{n_s}$.
To resolve this gap, we accumulate the non-integer remainder $n_s - \floor{n_s}$ in a variable $r$. Whenever $r$ becomes larger than $1$, we increase the number of forced exploration queries by $1$ to $\tilde n_s = \floor{n_s} + \floor{r}$. At every task $s$, we force exactly $\tilde n_s \in \sN$ exploratory actions, where $(\tilde n_1, \dots, \tilde n_s)$ is calculated as described in \cref{alg:forced_exp}.
Then, to ensure that $N_{m_0}$ exploratory datapoints are available, we calculate the smallest $\tilde m_0$ which satisfies:
\[
\sum_{s=1}^{\tilde m_0} \tilde n_s \geq N_{m_0}
\]
It is straightforward to show that by construction of \cref{alg:forced_exp}, $m_0 \leq \tilde m_0 \leq m_0 + 1$.
In other words, by taking exploratory actions according to $\tilde n_s$ (which is an integer) we require at most 1 additional task to fulfill the lower bound on the total number of required exploratory actions.
In the next two corollaries we give a lower bound on the $\tilde m_0$ which satisfies the required dataset size $N_{m_0}$. 

\begin{algorithm}[ht]
\caption{Forced Exploration Rate to Integer number of Exploratory Steps}
\label{alg:forced_exp}
\begin{algorithmic}
\Require The sequence of $(n_1, \dots, n_m)$
\State $r \gets 0 $ \Comment{r is the sum of fractional residue}
\For{$s \in \{1, \dots, m\}$}
\State $r \gets r +  n_s - \floor{n_s}$ \Comment{Add the fractional part of $n_s$ to the residue sum}
\State $\tilde n_s \gets \floor{n_s} + \floor{r}$ \Comment{If the residue sum is over $1$, then add 1 to $\floor{n_s}$}
\State $r \gets r - \floor{r}$
\EndFor\\
\textbf{Output:} $(\tilde n_1, \dots, \tilde n_m)$
\end{algorithmic}
\end{algorithm}

\begin{corollary}
\label{cor:ll_const}
    Assume the setting of \cref{thm:lifelong_main}. Set the rate
    \begin{align*}
        n_s = \sqrt{n}
    \end{align*}
    for all $s \in \mathbb{N}$, and choose the integer number of forced exploration steps according to \cref{alg:forced_exp}. Then, for all $0 < \delta < 1$, with probability at least $1-\delta$
    \begin{equation*}
        R(m,n) \leq \calO\left( B\log(mp/\delta) \sqrt{n} \right) + 2mB \sqrt{n} + \Roracle(m,n).
    \end{equation*}
\end{corollary}

\begin{proof}
    Taking actions at a $n_s = \sqrt{n}$ rate via \cref{alg:forced_exp}, we can ensure that after $\tilde m_0$ many tasks the condition of \cref{thm:lifelong_main} on  $N_{m_0}$ is met, where 
    \begin{align*}
       \frac{2\log(4mp/\delta)}{( n^{1/4} \frac{c_1c_{\kappa}^2}{32\sigma} - 1)^2} \leq \tilde m_0 \leq \frac{2\log(4mp/\delta)}{( n^{1/4} \frac{c_1c_{\kappa}^2}{32\sigma} - 1)^2} + 1.
    \end{align*}
    Then the proof follows directly from \cref{thm:lifelong_main} with $n_1 = \sqrt{n} $, $N_m = m\sqrt{n} $.
\end{proof}

\begin{corollary}
\label{cor:ll_decreasing}
    Assume the setting of \cref{thm:lifelong_main}. Set the rate
    \begin{align*}
        n_s =  \frac{\sqrt{n}}{s^{1/4}} 
    \end{align*}
     for all $s \in \mathbb{N}$, and choose the explicit integer number of forced exploration steps according to \cref{alg:forced_exp}.
    Then, for all $0 < \delta < 1$, with probability at least $1-\delta$
    \begin{align*}
        R(m,n) \leq \calO\left( B n^{1/3} \log^{3/4}(mp/\delta) m^{1/3} +  B\sqrt{n} m^{3/4}\right) +  \Roracle(m,n).
    \end{align*}
\end{corollary}

\begin{proof}
    We have
    \begin{align*}
        N_m = \sum_{s=1}^m \frac{\sqrt{n}}{s^{1/4}} = \Theta\left( \sqrt{n}m^{3/4} \right)
    \end{align*}
    Choose
    \begin{align*}
        \tilde m_0 = \Theta \left(\frac{ \log(4mp/\delta)}{\left(\sqrt{\frac{\sqrt{n}}{m^{1/4}}}\frac{c_1c_{\kappa}^2}{32\sigma} -1 \right)^2}\right)^{4/3}
    \end{align*}
    and take exploratory actions according to \cref{alg:forced_exp} then, 
    \begin{align*}
        N_{m_0} \geq \frac{2n_1 \log(4mp/\delta)}{(\sqrt{N_m/m} \frac{c_1c_{\kappa}^2}{32\sigma} - 1)^2}.
    \end{align*}
    By \cref{thm:lifelong_main}, since $n_1 = \sqrt{n}$, for all $0 < \delta < 1$, with probability at least $1-\delta$ \algon achieves
    \begin{align*}
        R(m,n) 
        &\leq 2B m_0 n + 2B N_m + \Roracle(m,n) \\
        &\leq \calO\left( B n^{1/3} \log^{3/4}(mp/\delta) m^{1/3} + B\sqrt{n} m^{3/4} \right) + \Roracle(m,n).
    \end{align*}
\end{proof}
\subsection{Background on \gpucb}\label{app:gp_ucb}

To solve task $s$, \gpucb first constructs {\em confidence sets} for $f_s(\bx)$ based on the history $\{ (\bx_{s,t}, y_{s,t})_{t \leq i}\}$ to balance exploration and exploitation at any step $i$.
 For any $\bx \in \calX$, the set $\calC_{i-1}(\bx)$ defines an interval to which $f(\bx)$ belongs with high probability such that,
 \[\sP\left(\forall\bx\in\calX:f(\bx) \in \calC_{i-1}(\bx)\right)\geq 1-\delta.\]
Given a kernel $k$, \gpucb builds sets of the form 
\begin{align*}
    \calC_{i-1}(k;\bx) = [&\mu_{i-1}(k;\bx)-\nu_i\sigma_{i-1}(k;\bx), \,\, \mu_{i-1}(k;\bx)+\nu_i\sigma_{i-1}(k;\bx)]
\end{align*}
where the exploration coefficient $\nu_i$ depends on the desired confidence level $1-\delta$, and is often treated as a hyper-parameter of the algorithm. The functions $\mu_{i-1}$ and $\sigma_{i-1}$ set the center and width of the confidence set as
\begin{align*}
    \mu_{i-1} (k;\bx)& = {\bm{k}}_{i-1}^T(\bx)({\bm K}_{i-1}+\lambda^2_{\mathrm{ucb}}\bm{I})^{-1}\by_{i-1}  \\
     \sigma^2_{i-1}(k; \bx) & =  k(\bx, \bx) - {\bm k}^T_{i-1}(\bx)({\bm K}_{i-1}+\lambda^2_{\mathrm{ucb}}\bm{I})^{-1}{\bm k}_{i-1}(\bx)
\end{align*}
where $\lambda_{\mathrm{ucb}}$ is a regularizer, $\by_{i-1} = [y_{s,t}]_{t < i}$ is the vector of observed values, $\bm{k}_{i-1}(\bx) = [ k(\bx, \bx_{s,t})]_{t < i}$, and ${\bm K}_{i-1} = [ k(\bx_{s,t}, \bx_{s,t^\prime})]_{t,t^\prime < i}$ is the kernel matrix. 
\gpucb then chooses an action that maximizes the upper confidence bound, i.e.
\begin{equation*}\label{eq:UCB_policy}
\bx_{s,i} = \argmax_{\bx \in \mathcal{X}} \mu_{i-1}(\bx) + \nu_i\sigma_{i-1}(\bx).
\end{equation*}
The acquisition function balances exploring uncertain actions and exploiting the gained information via parameter $\nu_i$.
\citet{chowdhury2017kernelized} show that following this policy, and using $\tk$ as the kernel function, yields a regret of 
\[
\Roracle(n) = \calO \left( B d^\star \sqrt{n}\log\tfrac{n}{d^\star} + \sqrt{nd^\star \log \tfrac{n}{d^\star}\log\tfrac{ 1}{\delta}}\right).
\]
\subsection{Lifelong Regret of \gpucb Paired with \algon (Proof of \texorpdfstring{Corollary~\ref{cor:ll_ucb_decreasing_main_text}}{})}\label{app:lifelong_ucb}


\begin{definition}[maximum information gain]
    The maximum information gain after $t$ observations of GP-UCB with kernel $k$ and parameter $\lambda_{\mathrm{ucb}}$ is defined by
    \begin{equation*}
        \gamma_t(k) = \max_{\bx_1, \dots, \bx_t \in \calX} \frac{1}{2} \log \det \left( I + \lambda_{\mathrm{ucb}}^{-2} K_t\right),
    \end{equation*}
    where
    \begin{equation*}
        (K_t)_{ij} = (\mPhi \mPhi^T)_{ij} = k(\bx_i, \bx_j).
    \end{equation*}
\end{definition}


\begin{theorem}[Theorem 3 of \citet{chowdhury2017kernelized}]
\label{thm_cvgf_old}
    Let $k$ be a kernel and $f \in \calH_k$, where $\calH_k$ is the RKHS corresponding to kernel $k$. Let $\delta \in (0,1)$, $\norm{f}_{k} \leq B$ and assume the errors $\epsilon_t$ are conditionally $\sigma$-sub-Gaussian. Running GP-UCB with $\lambda_{\mathrm{ucb}} = 1 + 2/n$ for $n$ steps we have with probability at least $1-\delta$ that
    \begin{align*}
        R(n) &= \calO(B \sqrt{n \gamma_n(k)} + \sqrt{n \gamma_n(k)(\gamma_n(k) + \log(1/\delta))})
    \end{align*}
\end{theorem}

\begin{corollary}
\label{thm_cvgf}
    Let $k_s$ be kernels and $f_s \in \calH_{k_s}$, where $\calH_{k_s}$ is the RKHS corresponding to kernel $k_s$. Let $\delta \in (0,1)$, $\norm{f_s}_{k_s} \leq B$ and assume the errors $\epsilon_{s,t}$ are $i.i.d.$~$\sigma$-sub-Gaussian. Assume further that $k_s$ are $\sigma(\bm{\epsilon}_1,\dots,\bm{\epsilon}_{s-1})$-measurable. Running GP-UCB with $\lambda_{\mathrm{ucb}} = 1 + 2/n$ for $m$ tasks, each with $n$ steps, we have with probability at least $1-\delta$ that jointly for all $s \in \{1,\dots,m\}$
    \begin{align*}
        R_s(n) &= \calO(B \sqrt{n \gamma_n(k)} + \sqrt{n \gamma_n(k)(\gamma_n(k) + \log(1/\delta))})
    \end{align*}
    where $R_s(n)$ denotes the regret in task $s$ after $n$ interactions.
    In particular
    \begin{align*}
        \Roracle(m,n) \leq \calO \left( m\sqrt{n \gamma_n(k)}\left(B + \sqrt{\gamma_n(k) + \log(1/\delta)} \right) \right).
    \end{align*}
\end{corollary}

\begin{proof}
    We will adapt the proof of Theorem 1 in \citet{chowdhury2017kernelized}.\\
    Let $\epsilon_{s,1}, \dots, \epsilon_{s,n}$ be the noise of task $s$. Define a function
    \begin{align*}
        s(t) = \sum_{j=1}^m j \mathbbm{1}_{\{ (j-1)n+1 \leq t \leq jn\}}
    \end{align*}
    and a filtration on $\{1,\dots,mn\}$
    \begin{align*}
        \mathcal{F}_t = \sigma(\epsilon_{1,1}, \dots, \epsilon_{1,n}, \epsilon_{2,1}, \dots, \epsilon_{2,n}, \dots, \epsilon_{s(t),1}, \dots, \epsilon_{s(t),t-(s(t)-1)n}).
    \end{align*}
    Further define for task $s$ a filtration on $\{1,\dots,n\}$
    \begin{align*}
        \mathcal{F}^s_t = \sigma(\epsilon_{s,1}, \dots, \epsilon_{s,t}).
    \end{align*}
    Similar to the proof of Theorem 1 in \citet{chowdhury2017kernelized} define for $t \in \{1,\dots,n\}$, $g:\calX \to \sR$ and $l_1,\dots,l_n \in \sR$
    \begin{align*}
        M_t^{g,l}(s) = \exp \left( (\epsilon_{s,1:t})^T g_{1:t,l} - \frac{\sigma^2}{2} \norm{g_{1:t,l}}_2^2\right)
    \end{align*}
    where
    \begin{align*}
        g_{1:t,l} := [g(\bx_1) + l_1, \dots,g(\bx_t) + l_t]^T.
    \end{align*}
    Further let $N_1,\dots,N_n$ be i.i.d. with distribution $\mathcal{N}(0, \kappa)$ and independent of $\mathcal{F}_n^s$ and let $h_s$ be a random function distributed according to the Gaussian Process measure $GP_{\calX}(0,k_s)$ and independent of $\mathcal{F}^s_n$ and $N_1,\dots,N_n$. Define
    \begin{align*}
        M_t(s) = \mathbb{E}[ M_t^{h_s,N}(s) \mid \mathcal{F}^s_n].
    \end{align*}
    Now by the proof of Theorem 1 of \citet{chowdhury2017kernelized} we have that for all $s \in \{1,\dots,m\}$, $t\in\{1,\dots,n\}$ and all stopping times $\tau_s$ with respect to the filtration $\mathcal{F}^s_t$
    \begin{align}
    \label{eq_kjdfkjdfks}
        \mathbb{E}[M_{\tau_s}(s)] \leq 1.
    \end{align}
    Given stopping times $\tau_1,\dots,\tau_m$ on $\mathcal{F}^1_t,\dots,\mathcal{F}^m_t$ we construct a stopping time $\tau$ on $\mathcal{F}_t$
    \begin{align}
    \label{eq_kjdgjkgdf}
        \tau(\omega) = \min\{mn \geq t \geq 1 \mid \tau_{s(t)}(\omega) = t-(s(t)-1)n  \}.
    \end{align}
    We need to show that $\tau$ is a stopping time with respect to the filtration $\mathcal{F}_t$. We have
    \begin{align*}
        \{\omega\mid \tau(\omega) = t\} = \left(\bigcap_{s < s(t)} \{\omega \mid \tau_s(\omega) > n\} \right) \cap \{\omega \mid \tau_{s(t)}(\omega) = t-(s(t)-1)n \}.
    \end{align*}
    It holds that $\{\omega \mid \tau_s(\omega) > n\} = \{\omega \mid \tau_s(\omega) \leq n\}^c \in \mathcal{F}^s_n \subset \mathcal{F}_{sn}$ and $\{\omega \mid \tau_{s(t)}(\omega) = t-(s(t)-1)n \} \in \mathcal{F}^{s(t)}_{t-(s(t)-1)n} \subset \mathcal{F}_{t}$. This implies that $\{\omega\mid \tau(\omega) = t\} \in \mathcal{F}_t$ and therefore $\tau$ is a stopping time with respect to $\mathcal{F}_t$.
    Define
    \begin{equation*}
        M_t = M_{t-(s(t)-1)n}(s(t)).
    \end{equation*}
    We have that $M_t = M_{t-(s(t)-1)n}(s(t))$ is measurable with respect to $\mathcal{F}^{s(t)}_{t-(s(t)-1)n} \subset \mathcal{F}_t$, which means $M_t$ is $\mathcal{F}_t$-adapted. Let $\tau$ be a stopping time constructed as in Equation \ref{eq_kjdgjkgdf}. Then by Equation \ref{eq_kjdfkjdfks}
    \begin{align*}
        \mathbb{E}[M_{\tau}] \leq 1.
    \end{align*}
    Define for $t \in \{1,\dots,n\}$ and $s\in\{1,\dots,m\}$
    \begin{align*}
        B_t^s(\delta) = \left\{ \omega \mid \norm{\epsilon_{s,1:t}}^2_{((K^s_t +\kappa I)^{-1} + I)^{-1}} > 2 \log\left(\sqrt{\det((1+\kappa)I + K^s_t)}/\delta\right) \right\},
    \end{align*}
    where $K^s_t$ is the design matrix for task $s$.
    Further define
    \begin{align*}
        \tau_s(\omega) = \min\{t \in \{1,\dots,n\} \mid \omega \in B^s_t(\delta)\}
    \end{align*}
    and let $\tau$ be the corresponding stopping time on $\mathcal{F}_t$. It holds by the proof of Theorem 1 of \citet{chowdhury2017kernelized} that
    \begin{align*}
        M_t(s) = \frac{\exp \left(\frac{1}{2} \norm{\epsilon_{s,1:t}}^2_{((K^s_{t} +\kappa I)^{-1} + I)^{-1}}\right)}{\sqrt{\det((1+\kappa)I + K^s_{t})}}
    \end{align*}
    and therefore
    \begin{align*}
        M_t = \frac{\exp \left(\frac{1}{2} \norm{\epsilon_{s(t),1:t-(s(t)-1)n}}^2_{((K^{s(t)}_{t-(s(t)-1)n} +\kappa I)^{-1} + I)^{-1}}\right)}{\sqrt{\det((1+\kappa)I + K^{s(t)}_{t-(s(t)-1)n})}}.
    \end{align*}
    Putting things together yields
    \begin{align*}
        \mathbb{P}\left[\bigcup_{s \leq m, t \leq n} B_t^s(\delta) \right] &= \mathbb{P}\left[\tau \leq mn \right]\\
        &= \mathbb{P}\Bigg[\tau \leq mn, \norm{\epsilon_{s(\tau),1:\tau-(s(\tau)-1)n}}^2_{((K^{s(\tau)}_{\tau-(s(\tau)-1)n} +\kappa I)^{-1} + I)^{-1}} >\\
        & \qquad 2 \log\left(\sqrt{\det((1+\kappa)I + K^{s(\tau)}_{\tau-(s(\tau)-1)n})}/\delta\right) \Bigg]\\
        &= \mathbb{P}\left[\tau \leq mn, M_{\tau} > 1/\delta \right]\\
        &\leq \mathbb{P}\left[M_{\tau} > 1/\delta \right]\\
        &\leq \mathbb{E}[M_{\tau}]\delta \leq \delta.
    \end{align*}
    Now follow the steps of the proof of Theorem 2 of \cite{chowdhury2017kernelized} and the claim follows.
\end{proof}


\begin{lemma}
\label{lem_cvdfg}
    Let $k: \calX \times \calX \to \sR$ be a kernel with $d^{(k)} \in \mathbb{N}$ dimensional feature map and assume $k(x,x') = \bm{\phi}(\bx)^T \bm{\phi}(x') \leq 1, \forall x,x' \in \calX$. Then the maximum information gain of GP-UCB with kernel $k$ and regularization parameter $\lambda_{\mathrm{ucb}}$ satisfies
    \begin{equation*}
        \gamma_n(k) \leq \frac{1}{2} d^{(k)} \log(1 + \frac{\lambda_{\mathrm{ucb}}^{-2}n}{d^{(k)}}).
    \end{equation*}
\end{lemma}

\begin{proof}
    This proof follows the arguments of \citet{vakili2021information} and \citet{kassraie2022metalearning}. We have that $K_n = \mPhi_n \mPhi_n^T$ and by the Weinstein-Aronszajn identity
    \begin{align*}
        \frac{1}{2} \log \det (I_n + \lambda_{\mathrm{ucb}}^{-2} K_n) &= \frac{1}{2} \log \det (I_{d^{(k)}} + \lambda_{\mathrm{ucb}}^{-2} \mPhi_n^T \mPhi_n) \\
        &\leq \frac{1}{2} d^{(k)} \log\left(\trace(I + \lambda_{\mathrm{ucb}}^{-2} \mPhi_n^T \mPhi_n)/d^{(k)}\right)\\
        &\leq \frac{1}{2} d^{(k)} \log\left(1 + \frac{\lambda_{\mathrm{ucb}}^{-2}}{d^{(k)}}\trace(\mPhi_n^T \mPhi_n)\right).
    \end{align*}
    Now
    \begin{align*}
        \trace( \mPhi_n^T \mPhi_n) &= \sum_{i=1}^{n}  \trace (\bm{\phi}(\bx_i) \bm{\phi}(\bx_i)^T)\\
        &= \sum_{i=1}^{n}  \trace (\bm{\phi}(\bx_i)^T \bm{\phi}(\bx_i))\\
        &\leq n
    \end{align*}
    and therefore
    \begin{align*}
        \frac{1}{2} \log \det (I_n + \lambda_{\mathrm{ucb}}^{-2} K_n) &\leq \frac{1}{2} d^{(k)} \log\left(1 + \frac{\lambda_{\mathrm{ucb}}^{-2}n}{d^{(k)}}\right).
    \end{align*}
\end{proof}


\begin{corollary}
\label{cor:ll_ucb_const}
    Assume we are in the setting of Corollary \ref{cor:ll_const} with GP-UCB as the base bandit algorithm and $\lambda_{\mathrm{ucb}} = 1 + 2/n$. Then, for all $0<\delta<1$, with probability at least $1-\delta$,
    \begin{align*}
        R(m,n) = \calO \left(  Bm d^\star \sqrt{n}\log\tfrac{n}{d^\star} + m\sqrt{nd^\star \log \tfrac{n}{d^\star}\log\tfrac{ 1}{\delta}} + B\sqrt{n}(m+\log(mp/\delta))\right).
    \end{align*}
\end{corollary}

\begin{proof}
    By Corollary \ref{thm_cvgf} and Lemma \ref{lem_cvdfg} we have with high probability 
    \begin{align*}
        \Roracle(m, n) &= \calO \left( m\sqrt{n} \left((B+ \log(1/\delta))\sqrt{\frac{1}{2} d^\star \log\left(1 + \frac{n^3}{d^\star(n+2)^2}\right)} + \frac{1}{2} d^\star \log\left(1 + \frac{n^3}{d^\star(n+2)^2}\right)\right) \right)\\
        & = \calO \left( Bm d^\star \sqrt{n}\log\tfrac{n}{d^\star} + m\sqrt{nd^\star \log \tfrac{n}{d^\star}\log\tfrac{ 1}{\delta}}\right)
    \end{align*}
    where $d^\star := d^{(\tk)} = \sum_{j \in \tJ} d_j$.
    And using Corollary \ref{cor:ll_const} we have with high probability
    \begin{align*}
        R(m,n) &\leq \calO\left( B\log(mp/\delta) \sqrt{n} + mB \sqrt{n}\right) + \Roracle(m,n)\\
        &= \calO \left(  Bm d^\star \sqrt{n}\log\tfrac{n}{d^\star} + m\sqrt{nd^\star \log \tfrac{n}{d^\star}\log\tfrac{ 1}{\delta}} + B\sqrt{n}(m+\log(mp/\delta))\right).
    \end{align*}
\end{proof}


\begin{corollary}
\label{cor:ll_ucb_decreasing}
    Assume we are in the setting of Corollary \ref{cor:ll_decreasing} with GP-UCB as the base bandit algorithm and $\lambda_{\mathrm{ucb}} = 1 + 2/n$. Then, for all $0<\delta<1$, with probability at least $1-\delta$,
    \begin{align*}
        R(m,n) = \calO \left( Bm d^\star \sqrt{n}\log\tfrac{n}{d^\star} + m\sqrt{nd^\star \log \tfrac{n}{d^\star}\log\tfrac{ 1}{\delta}} + B n^{1/3} \log^{3/4}(mp/\delta) m^{1/3} + B\sqrt{n} m^{3/4} \right).
    \end{align*}
\end{corollary}

\begin{proof}
    The proof is the same as the proof for Corollary \ref{cor:ll_ucb_const}, except that we use Corollary \ref{cor:ll_decreasing} in place of Corollary \ref{cor:ll_const}.
\end{proof}

\begin{remark}
    Compare the results of Theorem \ref{thm:lifelong} with the default alternative: not learning $k$ and just setting $\hat{k} = \sum_{j=1}^p \frac{1}{p} k_j$. We would then only get a bound of the form
    \begin{equation*}
        R(m,n) \leq \calO\left( m \hat{B} \sqrt{n}\log(n) d  \right),
    \end{equation*}
    where $\hat{B} = \norm{f}_{\hat{k}} = \frac{p}{s^\star} B$ and $d = \sum_{j=1}^p d_j \geq n$, which is not sublinear in $n$.
\end{remark}

\subsection{Forced Exploration Lower Bound (Proof of \texorpdfstring{Proposition~\ref{prop:forced_exp}}{})} \label{app:kappa_bounds}

\begin{assumption}
\label{ass:der}
    Assume there exist $c_d, c_{od} > 0$ such that
    \begin{align*}
        \frac{m}{N} (\mPhi^T \mPhi)_{i,i} \geq c_d, \qquad \forall i \in \{1,\dots, md\}
    \end{align*}
    and 
    \begin{align*}
        \frac{m}{N} (\mPhi^T \mPhi)_{i,j} < c_{od}, \qquad \forall i \neq j \in \{1,\dots, md\}.
    \end{align*}
\end{assumption}

\begin{lemma}
    Let Assumption \ref{ass:der} be satisfied. Then $\kappa \geq \sqrt{c_{d}/s^\star - 5 c_{od}}$.
\end{lemma}

\begin{proof}
    Let $(b, J) \in S$. We have by definition of $S$
    \begin{align*}
        \left( \frac{\sqrt{m}}{\sqrt{N}} \frac{\|\mPhi \bb\|_2}{ \sum_{j \in J} \|b^{(j)}\|_2}\right)^2
        &=  \frac{m}{N} \frac{b^T (\mPhi^T \mPhi )  b}{\left(\sum_{j \in J} \|b^{(j)}\|_2\right)^2}\\
        & \geq \frac{m}{N} \frac{\sum_{s=1}^m \sum_{i,j \in J} \bb_s^{(i)}(\mPhi^T \mPhi)_{i,j} \bb_s^{(j)}}{\left(\sum_{j \in J} \sqrt{\sum_{s=1}^m (\bb_s^{(j)})^2}\right)^2} \\
        &\quad - 4 \frac{m}{N} \frac{\sum_{s=1}^m \sum_{i \in J, j \in J^c} |\bb_s^{(i)}(\mPhi^T \mPhi)_{i,j} \bb_s^{(j)}|}{\left(\sum_{j \in J} \sqrt{\sum_{s=1}^m (\bb_s^{(j)})^2}\right)\left(\sum_{j \notin J} \sqrt{\sum_{s=1}^m(\bb_s^{(j)})^2}\right)}
    \end{align*}
    By Assumption \ref{ass:der}
    \begin{align*}
        \frac{m}{N} \frac{\sum_{s=1}^m \sum_{i,j \in J} \bb_s^{(i)}(\mPhi^T \mPhi)_{i,j} \bb_s^{(j)}}{\left(\sum_{j \in J} \sqrt{\sum_{s=1}^m (\bb_s^{(j)})^2}\right)^2}
        & \geq \sum_{s=1}^m \sum_{i \in J} \frac{ \bb_s^{(i)} c_d \bb_s^{(i)}}{\left(\sum_{j \in J} \sqrt{\sum_{s=1}^m (\bb_s^{(j)})^2}\right)^2} \\
        & \quad - \frac{\sum_{s=1}^m \sum_{i \neq j, i,j \in J} |\bb_s^{(i)} c_{od} \bb_s^{(j)}|}{\left(\sum_{j \in J} \sqrt{\sum_{s=1}^m (\bb_s^{(j)})^2}\right)^2}.
    \end{align*}
    Since for $q > 0$ using $\norm{\cdot}_1 \leq \sqrt{s} \norm{\cdot}_2$
    \begin{align*}
        \sum_{s=1}^m \sum_{i \in J} \frac{ \bb_s^{(i)} c_d \bb_s^{(i)}}{\left(\sum_{j \in J} \sqrt{\sum_{s=1}^m (\bb_s^{(j)})^2}\right)^2} \geq c_d \sum_{s=1}^m \sum_{i \in J} \frac{ (\bb_s^{(i)})^2}{s^\star\left(\sqrt{\sum_{j \in J} \sum_{s=1}^m (\bb_s^{(j)})^2}\right)^2} = \frac{c_d}{s^\star}
    \end{align*}
    and using Cauchy-Schwarz to prove
    \begin{equation*}
        \sum_{k,l} (x_k y_l)^2 \geq \sum_{k,l} (x_k y_l)(y_k x_l)
    \end{equation*}
    which implies
    \begin{equation*}
        \sqrt{\sum_{k,l} (x_k y_l)^2} \geq \sum_{l} |x_l y_l|
    \end{equation*}
    we get 
    \begin{align*}
        \frac{\sum_{s=1}^m \sum_{i \neq j, i,j \in J} |\bb_s^{(i)} c_{od} \bb_s^{(j)}|}{\left(\sum_{j \in J} \sqrt{\sum_{s=1}^m (\bb_s^{(j)})^2}\right)^2} 
        &= c_{od} \frac{\sum_{s=1}^m \sum_{i \neq j, i,j \in J} |\bb_s^{(i)}| |\bb_s^{(j)}|}{\sum_{i,j \in J} \sqrt{\sum_{k,l} (\bb_l^{(i)}\bb_k^{(j)})^2}} \\
        &\leq c_{od} \frac{\sum_{s=1}^m \sum_{i \neq j, i,j \in J} |\bb_s^{(i)}| |\bb_s^{(j)}|}{\sum_{i,j \in J} \sum_{s=1}^m |\bb_s^{(i)}| |\bb_s^{(j)}|} \\
        & \leq c_{od}.
    \end{align*}
    Also
    \begin{align*}
        &\frac{\sum_{s=1}^m\sum_{i \in J, j \in J^c} |\bb_s^{(i)} c_{od} \bb_s^{(j)}|}{\left(\sum_{j \in J} \sqrt{\sum_{s=1}^m (\bb_s^{(j)})^2}\right)\left(\sum_{j \notin J} \sqrt{\sum_{s=1}^m(\bb_s^{(j)})^2}\right)}\\
        & = c_{od} \frac{\sum_{s=1}^m \sum_{i \in J, j \in J^c} |\bb_s^{(i)} | | \bb_s^{(j)}|}{\sum_{i \in J, j \in J^c} \sqrt{\sum_{k,l} (\bb_l^{(i)}\bb_k^{(j)})^2}}\\
        & \leq c_{od} \frac{\sum_{s=1}^m \sum_{i \in J, j \in J^c} |\bb_s^{(i)} | | \bb_s^{(j)}|}{\sum_{i \in J, j \in J^c} \sum_{s=1}^m |\bb_s^{(i)}\bb_s^{(j)}|}\\
        & = c_{od}.
    \end{align*}
    Therefore    
    \begin{align*}
        \frac{\sqrt{m}}{\sqrt{N}} \frac{\|\mPhi \bb\|_2}{ \sum_{j \in J} \|b^{(j)}\|_2}
        &\geq \sqrt{c_d/s^\star - 5 c_{od}}.
    \end{align*}
\end{proof}

\begin{proposition}
\label{prop:kappa}
    Let $\mu$ be the Lebesgue measure and $d=p$. Assume that $\bm{\phi}_i \in L_{\mu}^2(\calX)$, $i \in \{1,\dots,p\}$ are orthogonal and satisfy $\|\bm{\phi}_i\|^2_{L_{\mu}^2(\calX)} / \text{Vol}(\calX) \geq z$, for all $i \in \{1,\dots,p\}$. Assume also that $k_i(\bx,\bx) = \bm{\phi}_i(\bx)^2 \leq 1$ for all $\bx \in \calX$.
    Choose $\bx_1, \dots, \bx_{n}$ i.i.d.~uniformly from $\calX$ and let
    \begin{align*}
        \mPhi_s \coloneqq
        \begin{bmatrix}
            \bm{\phi}(\bx_1) & \dots & \bm{\phi}(\bx_n)
        \end{bmatrix}^T
        \in \sR^{n \times d} \qquad \forall s \leq m.
    \end{align*}
    Then with probability at least $1-\delta$ Assumption \ref{ass:der} is satisfied with
    \begin{align*}
        c_d = \left(z - \sqrt{\frac{1}{2n}\log(4d/\delta)}\right)
    \end{align*}
    and
    \begin{align*}
        c_{od} = \sqrt{\frac{2}{n} \log\left(\frac{4d^2}{\delta}\right)}.
    \end{align*}
\end{proposition}

\begin{proof}
        We first bound the diagonal entries. Let $\bx$ be a random variable uniformly distributed on $\calX$ and denote by $v_i \coloneqq \bm{\phi}_i(\bx_{1:n})$ the $i$th column of $\mPhi_s$. It holds that
    \begin{align*}
        \mathbb{E}[\bm{\phi}_i(\bx)^2] = \frac{1}{\text{Vol}(\calX)}\int_{\calX} \bm{\phi}_i(\bx)^2 \text{d} \mu(\bx) \geq z, \quad \forall i \leq d.
    \end{align*}
    Therefore
    \begin{equation*}
        \mathbb{E}[\norm{v_i}_2^2] = \mathbb{E}\left[\sum_{l=1}^n \bm{\phi}_i(\bx_l)^2 \right] \geq nz.
    \end{equation*}
    By union bound and Höffding's inequality
    \begin{align*}
        \mathbb{P}\left[\exists i \leq d, \left| \norm{v_i}^2_2 - \mathbb{E}\left[\norm{v_i}^2_2\right] \right| \geq \epsilon \right] \leq 2d\exp(-\frac{2\epsilon^2}{n})
    \end{align*}
    or
    \begin{align*}
        \mathbb{P}\left[\exists i \leq d, \left| \norm{v_i}^2_2 - \mathbb{E}\left[\norm{v_i}^2_2\right] \right| \geq \sqrt{\frac{n}{2} \log(\frac{4d}{\delta})} \right] \leq \delta/2.
    \end{align*}
    Therefore with probability at least $1-\delta/2$ for all $i \leq d$
    \begin{align*}
        \norm{v_i}_2^2 \geq \mathbb{E}\left[\norm{v_i}^2_2\right] - \sqrt{\frac{n}{2} \log(\frac{4d}{\delta})} \geq nz - \sqrt{\frac{n}{2} \log(\frac{4d}{\delta})}.
    \end{align*}
    Further, for $i \neq j$
    \begin{align*}
        \mathbb{E}[\bm{\phi}_i(\bx)\bm{\phi}_j(\bx)] = \frac{1}{\text{Vol}(\calX)}\int_{\calX} \bm{\phi}_i(\bx)\bm{\phi}_j(\bx) \text{d} \mu(\bx) = 0,
    \end{align*}
    since $\bm{\phi}_i$ and $\bm{\phi}_j$ are orthogonal in $L_{\mu}^2(\calX)$.
    By assumption $|\bm{\phi}_i(\bx)| \leq 1, \forall i \leq d, \forall \bx \in \calX$ and by Höffding's inequality
    \begin{align*}
        \mathbb{P}[|\langle v_i, v_j \rangle| \geq \epsilon] \leq 2\exp(-\frac{\epsilon^2}{2n}),
    \end{align*}
    and therefore for $0 \leq \delta \leq 1$
    \begin{align*}
        \mathbb{P}\left[\exists i \neq j, |\langle v_i, v_j \rangle| \geq \sqrt{2n \log\left(\frac{4d^2}{\delta}\right)}\right] \leq \delta/2.
    \end{align*}
    We derived that with probability at least $1-\delta$
    \begin{align*}
        (\mPhi^T \mPhi)_{ii} / n\geq c_d = \left(z - \sqrt{\log(4d/\delta)/(2n)}\right)
    \end{align*}
    and for $i \neq j$
    \begin{align*}
        (\mPhi^T \mPhi)_{ij} / n < \sqrt{\frac{2}{n} \log\left(\frac{4d^2}{\delta}\right)}.
    \end{align*}
\end{proof}

\begin{corollary}
    Assume the setting of Proposition \ref{prop:kappa}. Then, with probability at least $1-\delta$,
    \begin{align*}
        \kappa \geq \sqrt{\frac{z}{s^\star} - \frac{1}{s^\star}\sqrt{\frac{\log(4d/\delta)}{2n}} - 5\sqrt{\frac{2\log(4d^2/\delta)}{n}}} = \Omega(1).
    \end{align*}
\end{corollary}
%=======================================================================
%=======================================================================
%=======================================================================
%=======================================================================
%=======================================================================
\section{Federated Analysis (Proof of \texorpdfstring{Theorem~\ref{thm:lifelong_federated}}{})}\label{app:federated}

\begin{figure}
    \centering
    \resizebox{0.8\columnwidth}{!}{
    \input{Figures/Fig_federated}
    }
    \caption{\falgon visualized. The yellow boxes correspond to modules of \falgoff.\looseness-1}
    \label{fig:lifelong_algo_fed}
\end{figure}
 
 
 Recall that in the federated setting, each client minimizes the following loss locally.
 \begin{align}
 \label{eq:meta_loss_federated}
\vbetahat_{s, \mathrm{prvt}} & \coloneqq \argmin_{\vbeta_s \in \sR^{d}} \Ls\left(\vbeta_s; \Dexp_{s} \right) \\
& = \argmin_{\vbeta_s \in \sR^{d}} \frac{1}{n_s} \norm{ \by_s - \mPhi_s \vbeta_s}_2^2  + \lambda \sum_{j=1}^{p} \|\vbeta_s\gj\|_2 .\notag
\end{align}
In this section, for simplicity we refer to the solution as $\vbetahat_s$. We may further omit the subscript $s$, whenever it can be determined from the context.
For our federated analysis, we require a slightly stronger version of the Beta-min assumption.

\begin{assumption}[Beta-min federated]\label{ass:betamin_federated}
    Assume there exists $c_{1,{\mathrm{f}}} > 0$ such that for all $s \leq m$ and $j \in J^\star$
    \begin{align*}
        \norm{\vbeta_s^{\star(j)}}_2 \geq c_{1,{\mathrm{f}}}.
    \end{align*}
\end{assumption}

\begin{remark}
    Note that Assumption \ref{ass:betamin_federated} implies Assumption \ref{ass:betamin}.
\end{remark}

 \subsection{Consistency of the Meta-learned Kernel}
 \label{app:federated_offline}

In this section we prove the equivalent of \cref{thm:offline_main_consitency} in the federated setting. 
 \begin{theorem}[Consistency of \falgoff]
\label{thm:federated_offline}
    Let $\omega \in (0, c_{1, \mathrm{f}})$ and $\bar{\omega} \coloneqq \min \{ \omega, c_{1, \mathrm{f}} - \omega\}$. Let Assumption \ref{ass:betamin_federated} hold.
    Let $n_s = \ubar{n}, \forall s \leq m$ and assume $\ubar{n}$ is large enough to satisfy \smash{$\bar{\omega} > (\sqrt{\log(p/\bar{\alpha})} + 1)32 \sigma/(\sqrt{\ubar{n}} c_{\kappa}^2)$}, where $\bar{\alpha} \coloneqq \max\{\alpha,1-\alpha\}$.
    Assume that $\mPhi_s \in \mathbb{R}^{\ubar{n} \times d}$ satisfy \cref{ass:compatibility} with $c_\kappa$ for $s=1, \dots, m$.
    Let \smash{$\hat{\vbeta}$} be a solution of \Eqref{eq:meta_loss_federated} with regularization parameter \smash{$\lambda = \bar{\omega} c_{\kappa}^2/8$}.
    Then \smash{$\hat{J}_{\mathrm{f}}$} is a consistent estimator in $\ubar{n}$ and $m$, that is
    \begin{align*}
        \lim_{\ubar{n} \to \infty} \mathbb{P}\left[ \hat{J}_{\mathrm{f}} = J^\star \right] = 1 \qquad \text{and} \qquad \lim_{m \to \infty} \mathbb{P}\left[ \hat{J}_{\mathrm{f}} = J^\star \right] = 1.
    \end{align*}
\end{theorem}

We start by proving the necessary lemmas.

\begin{lemma}
\label{lem:federal}
    Let $\hat{\vbeta}$ be a solution of \eqref{eq:meta_loss_federated} and
    \begin{align*}
        \lambda \leq \frac{\bar{\omega} \kappa^2 }{8},
    \end{align*}
    where $\bar{\omega} \coloneqq \min \{ \omega, c_{1,{\mathrm{f}}} - \omega\}$ for $0 < \omega < c_{1,{\mathrm{f}}}$.
    Then we have for $\frac{\lambda n_s}{4 \sigma} > \sqrt{\trace(\Phi_s^{(j)} (\Phi_s^{(j)})^T)}$ with probability at least
    \begin{align*}
        1 - p \max_{j \leq p} \exp\left( -\frac{\left(\frac{\lambda n_s}{4 \sigma} - \sqrt{\trace(\Phi_s^{(j)} (\Phi_s^{(j)})^T)}\right)^2}{2\norm{\Phi_s^{(j)}(\Phi_s^{(j)})^T}_2} \right) 
    \end{align*}
    that
    \begin{equation}
        \sum_{j=1}^p \norm{\hat{\vbeta}_s^{(j)} - \vbeta_s^{\star(j)}}_2 \leq \bar{\omega}.
    \end{equation}
    If additionally Assumption \ref{ass:betamin_federated} holds, then we have with the same probability for
    \begin{align*}
        \hat{J}_{s, \mathrm{f}} = \left\{ j \in \{1,\dots, p\} \ \Big\vert \ \norm{\hat{\vbeta}_s^{(j)}}_2 > \omega \right\}
    \end{align*}
    that
    \begin{align*}
        \hat{J}_{s, \mathrm{f}} = J^\star.
    \end{align*}
\end{lemma}

\begin{proof}
    Follows directly from Corollary \ref{cor:main} with $m=1$.
\end{proof}

\begin{lemma}[Chernoff-Höffding bound]
\label{lem_chbound}
    Let $X_1, \dots, X_n$ be independent Bernoulli random variables with $\mathbb{E}[X_i] = p_i$. Define $p\coloneqq \frac{1}{n} \sum_{i=1}^n p_i$, then for $t < np$,
    \begin{align*}
        \mathbb{P}\left[\sum_{i=1}^n X_i \leq t\right] &\leq \exp\left(-n \left( \frac{t}{n} \log\left(\frac{t}{np}\right) + (1-t/n) \log\left(\frac{1-t/n}{1-p}\right) \right)\right)\\
        &\leq \exp\left(-n  \frac{(p - t/n)^2}{2p(1-p)}\right).
    \end{align*}
\end{lemma}


\begin{lemma}
\label{thm:main_federated}
    Let $0 < \omega < c_{1,{\mathrm{f}}}$, $\bar{\omega} \coloneqq \min \{ \omega, c_{1,{\mathrm{f}}} - \omega\}$, and let $\hat{\vbeta}_s$ be the solution of \eqref{eq:meta_loss_federated} for tasks $s \leq m$ and $\lambda \leq \frac{\bar{\omega} \kappa^2 }{8}$. Define for $j \in \{1,\dots,p\}$
    \begin{align*}
        Q_j = \big\{ s \in \{1,\dots, m\} \ \big\vert \ \norm{\hat{\vbeta}^{(j)}_s}_2 > \omega\big\}
    \end{align*}
    and for $\alpha > 0$
    \begin{equation}
        \hat{J}_{\mathrm{f}} = \big\{ j \in \{1,\dots,p\}  \ \big\vert \ |Q_j| > m\alpha  \big\}.
    \end{equation}
    Define for $s \in \{1,\dots,m\}$
    \begin{align*}
        v_s &\coloneqq 1 - p \max_{j \leq p} \exp\left( -\frac{\left(\frac{\lambda n_s}{4 \sigma} - \sqrt{\trace(\Phi_s^{(j)} (\Phi_s^{(j)})^T)}\right)^2}{2\norm{\Phi_s^{(j)}(\Phi_s^{(j)})^T}_2} \right)
    \end{align*}
    and
    \begin{align*}
        v \coloneqq \frac{1}{m} \sum_{s=1}^m v_s.
    \end{align*}
    Assume that $\frac{\lambda n_s}{4 \sigma} > \sqrt{\trace(\Phi_s^{(j)} (\Phi_s^{(j)})^T)}, \forall s \leq m$ and $v > \bar{\alpha} \coloneqq \max\{\alpha, 1-\alpha\}$. Then
    \begin{align*}
        \mathbb{P}\left[ J^\star = \hat{J}_{\mathrm{f}} \right] \geq 1 - p\exp\left( -m \frac{(v-\bar{\alpha})^2}{2v(1-v)} \right).
    \end{align*}
\end{lemma}

\begin{proof}
Recall that $\bm{\epsilon}_s = [\epsilon_{s, i}]_{i=1}^n$.
    Since $\bm{\epsilon}_1, \dots, \bm{\epsilon}_m$ are independent,
    \begin{align*}
        \mathbbm{1}_{\left\{ \sum_{j=1}^p \norm{\hat{\vbeta}^{(j)}_1 - \vbeta^{\star(j)}_1}_2 \leq \bar{\omega} \right\}}, \dots, \mathbbm{1}_{\left\{ \sum_{j=1}^p \norm{\hat{\vbeta}^{(j)}_m - \vbeta^{\star(j)}_m}_2 \leq \bar{\omega} \right\}}
    \end{align*}
    are independent and Bernoulli distributed with coefficient 
    \begin{align*}
        \mathbb{P}\left[\sum_{j=1}^p \norm{\hat{\vbeta}^{(j)}_s - \vbeta^{\star(j)}_s}_2 \leq \bar{\omega} \right] \geq v_s, 
    \end{align*}
    where we used Lemma \ref{lem:federal} and set $\bar{\omega} \coloneqq \min \{ \omega, c_{1,{\mathrm{f}}} - \omega\}$.
    If $j \in J^\star$ and $\norm{\hat{\vbeta}^{(j)}_s - \vbeta^{\star(j)}_s}_2 < \bar{\omega}$, then by Assumption \ref{ass:betamin_federated}
    \begin{align*}
        \norm{\hat{\vbeta}_s^{(j)}}_2 \geq c_{1,{\mathrm{f}}} - \norm{\hat{\vbeta}_s^{(j)} - \vbeta_s^{\star(j)}}_2  > c_{1,{\mathrm{f}}} - \bar{\omega} \geq \omega,
    \end{align*}
    which implies $J^\star \subset \hat{J}$.
    If $j \notin J^\star$ and $\norm{\hat{\vbeta}^{(j)}_s - \vbeta^{\star(j)}_s}_2 < \bar{\omega}$, then
    \begin{align*}
        \norm{\hat{\vbeta}_s^{(j)}}_2 \leq \norm{\hat{\vbeta}_s^{(j)} - \vbeta_s^{\star(j)}}_2 +  \norm{\vbeta_s^{\star(j)}}_2 \leq \bar{\omega} \leq \omega.
    \end{align*}   
    We have by Lemma \ref{lem_chbound} and for $v > \bar{\alpha}$,
    \begin{equation}
    \label{eq:main_federated}
    \begin{split}
        \mathbb{P}\left[  \hat{J}_{\mathrm{f}} = J^\star \right] &\geq
        \mathbb{P}\left[ \forall j \in J^\star, |Q_j| > m\alpha; \forall j \notin J^\star, |Q_j| \leq m\alpha \right]\\
        &\geq \mathbb{P}\Bigg[ \forall j \notin J^\star, \sum_{s=1}^m \mathbbm{1}_{\left\{ \norm{\hat{\vbeta}^{(j)}_s}_2 \leq \omega \right\}} \geq m(1-\alpha); \forall j \in J^\star, \sum_{s=1}^m \mathbbm{1}_{\left\{ \norm{\hat{\vbeta}^{(j)}_s}_2 > \omega \right\}} > m\alpha \Bigg]\\ 
        &\geq \mathbb{P}\left[ \forall j \in \{1, \dots,p\}, \sum_{s=1}^m  \mathbbm{1}_{\left\{ \norm{\hat{\vbeta}^{(j)}_s - \vbeta^{\star(j)}_s}_2 \leq \bar{\omega} \right\}} > m\max\{\alpha, 1 - \alpha\} \right]\\ 
        &\geq \mathbb{P}\left[\sum_{s=1}^m \mathbbm{1}_{\left\{ \sum_{j=1}^p \norm{\hat{\vbeta}^{(j)}_s - \vbeta^{\star(j)}_s}_2 \leq \bar{\omega} \right\}}  > m \bar{\alpha} \right]\\
        & \geq 1- \exp\left(-m \left( \bar{\alpha} \log\left(\frac{\bar{\alpha}}{v}\right) + (1-\bar{\alpha}) \log\left(\frac{1-\bar{\alpha}}{1-v}\right) \right)\right)\\
        & \geq 1 - \exp\left( -m \frac{(v-\bar{\alpha})^2}{2v(1-v)} \right).
        \end{split}
    \end{equation}
\end{proof}

\begin{proof}[\textbf{Proof of \cref{thm:federated_offline}}]
Assume the setting of \cref{thm:main_federated} and that there exists $c_{\kappa}> 0$ such that $\kappa \geq c_{\kappa}$. Set
    $\lambda = \frac{\bar{\omega} c_{\kappa}^2 }{8}$, $n_s = \ubar{n}, \forall s\leq m$ and assume
    $\frac{\lambda \sqrt{\ubar{n}}}{4 \sigma} > 1$ and $v = 1 - p\exp(-(\lambda \sqrt{\ubar{n}} / 4 \sigma - 1)^2/2)  > \bar{\alpha}$. 
    
    Note that $\Phi_s^{(j)} \in \sR^{N \times md_j}$ is block-diagonal. Since by assumption $k_j(x,x') \leq 1, \forall j \leq p$, we have
    \begin{align*}
        \norm{(\Phi_s^{(j)})^T \Phi_s^{(j)}}_2 \leq \trace((\Phi_s^{(j)})^T \Phi_s^{(j)}) = \trace(\Phi_s^{(j)} (\Phi_s^{(j)})^T) = \sum_{i=1}^{\ubar{n}} k_j\left(x^{(s)}_i, x^{(s)}_i\right) \leq \ubar{n}.
    \end{align*}
     \cref{thm:main_federated} yields the result.
\end{proof}

 \subsection{Lifelong Regret of \falgon (Proof of \texorpdfstring{Theorem~\ref{thm:lifelong_federated}}{})} \label{app:federated_lifelong}
 
 We start by stating \cref{thm:lifelong_federated} more rigorously. 
 
\begin{theorem}
\label{thm:lifelong_main_federated}
    Assume that the true reward functions $f_1, \dots, f_m$ satisfy $\norm{f_i}_{\tH} \leq B$ for some constant $B > 0$. Let $\bar{n}$ be the number of times forced exploration is used in each task. Let $\nu$ be a distribution on $\calX^{\bar{n}m}$ independent of $\bm{\epsilon}_1, \dots, \bm{\epsilon}_m$. Let $V \sim \nu$ be the random vector used for forced exploration. Let $\tilde{\Phi}_s \in \sR^{\bar{n} \times md}$ be the data matrix obtained by forced exploration in task $s$. Set $\lambda = \bar{\omega} c_{\kappa}^2/8$. Assume the forced exploration distribution $\nu$ and $\{k_j\}_{j\leq p}$ are such that, with probability at least $1-\delta/4$, there exists $c_{\kappa} > 0$ such that $\kappa(\tilde{\Phi}_s) \geq c_{\kappa}, \forall s \leq m$.
    Assume further that the base bandit algorithm using the true kernel function achieves on $m$ tasks with independent noise with probability at least $1-\delta/2$ cumulative regret lower than $\Roracle(n, m)$. Define
    \begin{align*}
        v &\coloneqq 1 - p\exp\left( -\frac{1}{2} \left(\frac{\bar{\omega} c_{\kappa}^2\sqrt{\bar{n}}}{32 \sigma} -1 \right)^2 \right).
    \end{align*}
    and assume for all $s \leq m$
    \begin{equation}
        v \geq \bar{\alpha}, \qquad \qquad \qquad \frac{\bar{\omega} c_{\kappa}^2\sqrt{\bar{n}}}{32 \sigma} > 1.
    \end{equation}
    Then with probability at least $1-\delta$, \algon (using \falgoff to predict the kernel) achieves
    \begin{align*}
        R(m,n) \leq \calO\left( B n \log(mp/\delta) / \bar{n} + B m \bar{n}  \right)+ \Roracle(n, m).
    \end{align*}
\end{theorem}


\begin{proof}
    Similar to the proof of Theorem \ref{thm:lifelong_main} we have by  \Eqref{eq:main_federated} for all $s$ and $v' \in C$
    \begin{align*}
        \mathbb{P}\left[  \hat{J}_{s} = J^\star \mid V = v' \right] \geq 1 - p\exp\left(-s \left( \bar{\alpha} \log\left(\frac{\bar{\alpha}}{v}\right) + (1-\bar{\alpha}) \log\left(\frac{1-\bar{\alpha}}{1-v}\right) \right)\right).
    \end{align*}
    By union bound we have for $m_0 \leq m$
    \begin{align*}
        \mathbb{P}\Big[ \forall m \geq s \geq m_0, \hat{J}_s = J^\star &\mid V = v'  \Big] \\
        &\geq 1 - \sum_{s=m_0}^m p\exp\left(-s \left( \bar{\alpha} \log\left(\frac{\bar{\alpha}}{v}\right) + (1-\bar{\alpha}) \log\left(\frac{1-\bar{\alpha}}{1-v}\right) \right)\right)\\
        & \geq 1 - m p\exp\left(-m_0 \left( q - (1-\bar{\alpha}) \log\left(1-v \right) \right)\right),
    \end{align*}
    where $q \coloneqq \bar{\alpha} \log\left(\bar{\alpha}\right) + (1-\bar{\alpha}) \log\left(1-\bar{\alpha} \right)$.
    Set
    \begin{align*}
        m_0 = \left\lceil \frac{\log(4mp/\delta)}{\bar{q} + (1-\bar{\alpha})(\bar{\omega} c_{\kappa}^2 \sqrt{\bar{n}}/(32 \sigma)-1)^2/2 } \right\rceil,
    \end{align*}
    where $\bar{q} \coloneqq q  - (1-\bar{\alpha}) \log(p)$.
    Following the same steps as in the proof of Theorem \ref{thm:lifelong_main} we get
    \begin{align*}
        R(m,n) &\leq \calO\left(m_0 n L +  L m \bar{n} \right)+ \Roracle(n, m - m_0)\\
        &\leq \calO\left(2B m_0 n + 2B m \bar{n} \right)+ \Roracle(n, m) \\
        & \leq \calO\left( B n \log(mp/\delta) / \bar{n} + B m \bar{n}\right) + \Roracle(n, m) .
    \end{align*}
\end{proof}

\begin{corollary}
\label{cor:main_ll_federated}
    Assume the setting of Theorem \ref{thm:lifelong_main_federated} and set $\bar{n} =  \sqrt{n} $. Then with probability at least $1-\delta$ we have
    \begin{align*}
        R(m,n) \leq \calO\left( B \sqrt{n} (\log(mp/\delta) +m) \right) + \Roracle(n, m) .
    \end{align*}
\end{corollary}

 \subsection{Performance of \gpucb paired with \falgon} \label{app:federated_ucb}
 
 \begin{corollary}
\label{cor:ll_ucb_federated}
    Assume we are in the setting of Corollary \ref{cor:main_ll_federated} with GP-UCB as the base bandit algorithm and $\lambda_{\mathrm{ucb}} = 1 + 2/n$. Then, for all $0<\delta<1$, with probability at least $1-\delta$,
    \begin{align*}
  R(m,n) = \calO \left(  Bm d^\star \sqrt{n}\log\tfrac{n}{d^\star} + m\sqrt{nd^\star \log \tfrac{n}{d^\star}\log\tfrac{ 1}{\delta}} + B\sqrt{n}(m+\log(mp/\delta))\right).
    \end{align*}
\end{corollary}

\begin{proof}
    The proof is the same as the proof for Corollary \ref{cor:ll_ucb_const}, except that we use Corollary \ref{cor:main_ll_federated} in place of Corollary \ref{cor:ll_const}.
\end{proof}
%=======================================================================
%=======================================================================
%=======================================================================
%=======================================================================
%=======================================================================
\section{Experiment Details}
\label{app:experiment_details}

For the synthetic experiments, we initiate the algorithms with $\omega = c_1/2$.
For all experiments, the exploration coefficient of the \gpucb algorithm is set to $\nu_i = 10$ and $\lambda_{\mathrm{ucb}} = 0.1$. 
Experiments are all repeated $20$ times for different random seeds, and the plots show the corresponding standard error. 
The remaining experiment settings are detailed in the following subsections.

\subsection{Offline Data experiments}\label{app:exp_offline_detail}
We generate the reward functions $f_1, \dots, f_{30}$ from the synthetic environment. Corresponding to each $f_s$, we generate a data set $\mathcal{D}_s$ of size $n=10$ by sampling points $\bx_{s,1}, \dots, \bx_{s,n}$ i.i.d.\ from a uniform distribution $\calU(\calX)$ over the domain $\calX = [0,1]$ and collecting the corresponding noisy function values $y_{s,i} = f_s(\bx_{s,i}) + \epsilon$, where the noise is sampled from $\calN(0, \sigma^2=0.01)$.
We initiate \algoff with the lasso regularization parameter of $\lambda = 0.25$ and $\falgoff$ with $\lambda = 0.015$. For \falgoff, we set the majority vote threshold to $\alpha = 0.25$.

\subsection{Lifelong Data Experiments}\label{app:exp_lifelong_detail}
For experiments using synthetic data, we set $n=100$, and for the experiments on \textsc{GLMNET} data, there are $n=144$ BO steps in each task.
To run \algon on the synthetic environment we set $\lambda = 0.5$ and for \falgon we set $\lambda = 0.2$.
On the \textsc{GLMNET} environment, we instantiate \algon with $\omega=0.25$ and $\lambda=0.015$, and \falgoff with $\alpha = 0.25$, $\omega=10^{-6}$, $\lambda= 2.6 \times 10^{-6}$.


\subsection{Further Experiments with Synthetic Data}
\label{app:experiment_results}

\begin{figure}[ht]
    \centering
    \begin{subfigure}{0.45\textwidth}
        \centering
        \includegraphics[width=\textwidth]{Figures/plots_appendix/plot_ol_cosine_2.pdf}
    \end{subfigure}
    \hfill
    \begin{subfigure}{0.45\textwidth}
        \centering
        \includegraphics[width=\textwidth]{Figures/plots_appendix/plot_ol_legendre_1.pdf}
    \end{subfigure}
    \caption{Single task cumulative regret of GP-UCB with meta-learned kernel $\hat{k}$ on an increasing number of meta-training tasks. Left: base kernels constructed with 2-dimensional cosine basis. Right: base kernels constructed with 1-dimensional Legendre polynomials. The BO performance with meta-learned kernels quickly approaches oracle performance as the number of meta-training tasks increases.\label{fig:offline_more}}
\end{figure}


\paragraph{Offline Data}  Analogous to the offline data experiments in Section \ref{sec:offline_exps}, we provide additional results for a two-dimensional domain and Legendre polynomials instead of cosine bases in Figure \ref{fig:offline_more}. In particular, the left plot corresponds to $\mathcal{X} = [0, 1]^2$ as the domain and the first $50$ 2-dimensional cosine basis functions, i.e., $\phi_{i,j}(x) = \cos(i \pi x_1)\cos(j \pi x_2), \forall x \in \mathcal{X}$, as the feature maps. For the right plot we choose $\mathcal{X} = [-1, 1]$ as the domain and use the first $50$ Legendre polynomials as the feature maps. 

Figure \ref{fig:offline_more} shows that both meta-learners converge with increasing number of tasks to the oracle kernel. This holds for different sets of base kernels and kernels with more than $1$ input dimension. This empirically validates the theoretical findings of \cref{thm:offline_main_consitency} and \cref{thm:federated_offline}. Somewhat peculiar is that we can observe oscillating behavior for the federated algorithm (yellow). This is a result of the discrete nature of the voting system. When the total number of tasks is a multiple of $\alpha$, the value $\vert \Jhat_s \vert$ is large, while for points directly after that $\vert \Jhat_s \vert$ is small. With increasing number of tasks the discretization has a lesser impact on the kernel estimation and the amplitude of the oscillations decreases. 

\begin{figure}[ht]
    \centering
    \begin{subfigure}{0.45\textwidth}
        \centering
        \includegraphics[width=\textwidth]{Figures/plots_appendix/plot_ll_cosine_1_0.pdf}
        \caption{}
    \end{subfigure}
    \hfill
    \begin{subfigure}{0.45\textwidth}
        \centering
        \includegraphics[width=\textwidth]{Figures/plots_appendix/plot_ll_cosine_1_1.pdf}
        \caption{}
    \end{subfigure}
        \begin{subfigure}{0.45\textwidth}
        \centering
        \includegraphics[width=\textwidth]{Figures/plots_appendix/plot_ll_legendre_1_0.pdf}
        \caption{}
    \end{subfigure}
    \hfill
    \begin{subfigure}{0.45\textwidth}
        \centering
        \includegraphics[width=\textwidth]{Figures/plots_appendix/plot_ll_legendre_1_1.pdf}
        \caption{}
    \end{subfigure}
    \caption{Lifelong regret with cosine basis (\textbf{a} \& \textbf{b}) and Legendre polynomials as feature maps (\textbf{c} \& \textbf{d}).
    In the plots \textbf{(a \& c)} on the left, we use only the forced exploration data $\Dexp_{1:s}$ for meta-learning the kernel. For the plots \textbf{(b \& d)} on the right, $D_{1:s}$, the data from all previous bandit interactions, is used. We observe that convergence is much faster when all interaction data is used in \algon and \falgon. \label{fig:online_more}}
\end{figure}

\paragraph{Lifelong Data} We now present modifications of the lifelong BO experiments in Section \ref{exp:lifelong}. In particular, we consider other base kernels as well as a modification of \algon where we use all collected data for meta-learning $\hat{k}$ instead of only the forced exploration data $\Dexp_{1:s}$. The results are depicted in Figure \ref{fig:online_more}. Figure \textbf{(a)} and \textbf{(b)} correspond to $50$ cosine basis functions as feature maps for the base kernels. For Figure \textbf{(c)} and \textbf{(d)}, we use the first $50$ Legendre polynomials as feature maps. The plots on the left (i.e. Fig. a, c) are generated with \algon and \falgon, as presented in Algorithms \ref{alg:lifelong} and \ref{alg:lifelong_fed}, where only the forced exploration data is used for meta-learning. The plots on the right (i.e. Fig. b, d) correspond to a modified version of \algon and \falgon where we use $D_{1:s}$, i.e., all previous bandit interactions, to meta-learn the kernel.

Generally, we observe that \algon and \falgon substantially outperform the naive method which uses all base kernels.
The gray vertical lines in Figure \ref{fig:online_more} indicate the beginning of a new task. We see that for every new task all algorithms initially experience high regret, but, over time, as reward estimation improves, the cumulative regret flattens. As the rate of single-task convergence is dependent on the kernel, we see that differences in the performance between the algorithms emerge.
When running \algon, over time, forced exploration decreases and the estimated kernel converges to the true kernel.
This means that, over time, the behavior of the agent using the \algon estimator becomes indistinguishable from the agent using the oracle kernel.
This is evident from Figure~\ref{fig:online_more} \textbf{(a)} as the slope of the single-task cumulative regret of the meta-agent (green) becomes the same as for the oracle agent (blue).
In the federated case (yellow), while the estimated kernel also converges to the true kernel, the more restrictive setting forces us to use a constant exploration rate (see \cref{alg:lifelong_fed} and \cref{sec:federated}) which means that the behavior of the federated meta-learner is always slightly sub-optimal.
This can be observed by noting that the slope of the single-task cumulative regret of the federated meta-learner (yellow) is higher compared to the oracle agent even after the estimated kernel converges to the true kernel.

When we adjust \algon and \falgon to use all available data to predict the kernel instead of only using $\Dexp_{1:s}$, the lifelong regret decreases.
As we would expect, using more data for meta-learning the kernel speeds up the convergence of $\hat{k}$ to $k^*$ which, in turn, makes the BO runs more efficient. In practice, using the data from all interactions, not just the ones obtained by forced exploration, seems to be the best choice. From a theoretical perspective, this comes with additional technical challenges, as we point out in \cref{sec:forced_exp}.

%=======================================================================
%=======================================================================
%=======================================================================
%=======================================================================
%=======================================================================
% \bibliography{refs}
\end{document}