% \documentclass{uai2023} % for initial submission
\documentclass[accepted]{uai2023} % after acceptance, for a revised
                                    % version; also before submission to
                                    % see how the non-anonymous paper
                                    % would look like

%% There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2023} % ptmx math instead of Computer
% Modern (has noticeable issues)
% \documentclass[mathfont=newtx]{uai2023} % newtx fonts (improves upon
 % ptmx; less tested, no support)
% NOTE: Only keep *one* line above as appropriate, as it will be replaced
%       automatically for papers to be published. Do not make any other
%       change above this note for an accepted version.

%% Choose your variant of English; be consistent
\usepackage[american]{babel}
% \usepackage[british]{babel}

%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} % amsmath with fixes and additions
% \usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{tikz} % nice language for creating drawings and diagrams


%% Provided macros
% \smaller: Because the class footnote size is essentially LaTeX's \small,
%           redefining \footnotesize, we provide the original \footnotesize
%           using this macro.
%           (Use only sparingly, e.g., in drawings, as it is quite small.)

%% Self-defined macros
% \newcommand{\swap}[3][-]{#3#1#2} % just an example


\usepackage{hyperref}
\usepackage{natbib}

\usepackage[utf8]{inputenc} 
\usepackage[T1]{fontenc}    
\usepackage{hyperref}       
\usepackage{url}            
\usepackage{booktabs}       
\usepackage{amsfonts}       
\usepackage{nicefrac}      
\usepackage{microtype}     
\usepackage{bm}


\usepackage{microtype}
\usepackage{graphicx}
\usepackage{subfigure}
\usepackage{booktabs} 



\newcommand{\theHalgorithm}{\arabic{algorithm}}


\usepackage{geometry}
\geometry{a4paper,scale=0.72}

%%  
\usepackage{amsbsy}
\usepackage{amsmath}
\usepackage{amsthm}
\usepackage{amsfonts}      
\usepackage{xcolor}
\usepackage{amssymb}

\usepackage{hyperref}
\usepackage{cleveref}


 
\renewcommand{\arraystretch}{1.5}

\newtheorem{theorem}{Theorem}[section] % for theorems
\newtheorem{corollary}{Corollary}[theorem]
\newtheorem{lemma}[theorem]{Lemma}
\newtheorem{definition}{Definition}
\newtheorem{assumption}{Assumption}[section]
\newtheorem{remark}{Remark}
\newtheorem{prop}{Proposition}[section]
\newenvironment{talign*}
 {\let\displaystyle\textstyle\csname align*\endcsname}
 {\endalign}
\newenvironment{talign}
 {\let\displaystyle\textstyle\csname align\endcsname}
 {\endalign}
\crefname{talign}{}{}
\crefname{equation}{}{}

\usepackage{enumitem} 



\usepackage{xcolor}  % for algorithms
\usepackage[linesnumbered,ruled,vlined]{algorithm2e}
\newcommand\mycommfont[1]{\footnotesize\ttfamily\textcolor{blue}{#1}}
\SetCommentSty{mycommfont}
\SetKwInput{KwInput}{Input} 
\SetKwInput{KwOutput}{Output}   

\usepackage[acronym]{glossaries}
\newacronym{RKHS}{RKHS}{reproducing kernel Hilbert space}
\newacronym{MC}{MC}{Monte Carlo}
\newacronym{MCMC}{MCMC}{Markov chain Monte Carlo}
\newacronym{CLT}{CLT}{central limit theorem}
\newacronym{CF}{CF}{\emph{control functionals}}
\newacronym{IID}{IID}{independent and identically distributed}
\newacronym{CV}{CV}{\emph{control variate}}
\newacronym{vvCV}{vv-CV}{\emph{vector-valued control variate}}
\newacronym{vvRKHS}{vv-RKHS}{\emph{reproducing kernel Hilbert space of vector-valued functions}}
\newacronym{mvkernel}{mv-kernel}{\emph{matrix-valued reproducing kernel}}
\newacronym{vvfunctions}{vv-functions}{\emph{vector-valued functions}}
\newacronym{TI}{TI}{\emph{thermodynamic integration}}
\newacronym{BQ}{BQ}{Bayesian quadrature}


\newacronym{GBML}{GBML}{\emph{Gradient-based Meta-learning}}
\newacronym{MAML}{MAML}{\emph{Model-Agnostic Meta-Learning}}

\setkeys{glslink}{hyper=false}








\DeclareMathOperator*{\argmin}{arg\,min}
\DeclareMathOperator*{\arginf}{arg\,inf}
\DeclareMathOperator*{\argmax}{arg\,max}
\DeclareMathOperator*{\argsup}{arg\,sup}


\def\S{\mathcal{S}}
\def\Svv{\mathcal{S}^{\text{vv}}}
\def\Ssv{\mathcal{S}^{\text{sv}}}
\def\Jvv{J^{\text{vv}}}


\def\J{\mathcal{J}}
\def\P{\mathbb{P}}
\def\Pset{\mathcal{P}}
\def\E{\mathbb{E}}
\def\Q{\mathbb{Q}}
\def\D{\mathcal{D}}
\def\F{\mathcal{F}}
\def\G{\mathcal{G}}
\def\V{\mathbb{V}}
\def\U{\mathcal{U}}
\def\H{\mathcal{H}}
\def\Lvv{L^{\text{vv}}}
\def\L{\mathcal{L}}
\def\M{\mathcal{M}}
\def\N{\mathbb{N}}
\def\Nplus{\mathbb{N}_+}
\def\SL{\S_{\text{SL}}}
\def\R{\mathbb{R}}
\def\X{\mathbb{R}^d}
\def\O{\mathcal{O}}


\def\MC{\hat{\Pi}^{\text{MC}}}
\def\CV{\hat{\Pi}^{\text{CV}}}
\def\CF{\hat{\Pi}^{\text{CF}}}
\def\nepoch{I_{\text{tr}}}


\newcommand{\metric}[2]{\left< #1, #2 \right>} 
\def\defn{\equiv}


\def\Ttrain{T_{\text{train}}}
\def\Ttest{T_{\text{test}}}










\title{Meta-learning Control Variates: Variance Reduction with Limited Data\\(Supplementary Material)}

% The standard author block has changed for UAI 2023 to provide
% more space for long author lists and allow for complex affiliations
%
% All author information is automatically removed by the class for the
% anonymous submission version of your paper, so you can already add your
% information below.
%



% Add authors
\author[1,3]{\href{mailto:<zhuo.sun.19@ucl.ac.uk>?Subject=Your UAI 2023 paper}{Zhuo Sun}{}}
\author[2,3]{Chris J. Oates}
\author[1,3]{Fran\c{c}ois-Xavier Briol}
% \author[1,2]{Further~Coauthor}
% \author[1]{Further~Coauthor}
% \author[3]{Further~Coauthor}
% \author[3,1]{Further~Coauthor}
% Add affiliations after the authors
\affil[1]{%
    Department of Statistical Science\\
    University College London, London, UK
}
\affil[2]{%
    School of Mathematics, Statistics \& Physics\\
    Newcastle University, UK
  }
\affil[3]{%
    The Alan Turing Institute, London, UK
}







% for cross referencing the main text
% PLEASE ONLY USE xr IN THE SUPPLEMENTARY MATERIAL. 
% In the main paper, hard code any cross-reference to the supplementary material. 
\usepackage{xr}
\makeatletter
\newcommand*{\addFileDependency}[1]{% argument=file name and extension
\typeout{(#1)}% latexmk will find this if $recorder=0
% however, in that case, it will ignore #1 if it is a .aux or 
% .pdf file etc and it exists! If it doesn't exist, it will appear 
% in the list of dependents regardless)
%
% Write the following if you want it to appear in \listfiles 
% --- although not really necessary and latexmk doesn't use this
%
\@addtofilelist{#1}
%
% latexmk will find this message if #1 doesn't exist (yet)
\IfFileExists{#1}{}{\typeout{No file #1.}}
}\makeatother

\newcommand*{\myexternaldocument}[1]{%
\externaldocument{#1}%
\addFileDependency{#1.tex}%
\addFileDependency{#1.aux}%
}
%------------End of helper code--------------

% put all the external documents here!
\myexternaldocument{sun_447}



\begin{document}
  
\onecolumn %% Turn this off if single column is desired for the supplement
\maketitle



\appendix




In \Cref{appdx:proofs}, we provide the proofs of the theoretical results stated in the main text. In \Cref{appdx:experimental_details}, we provide more details on the implementation of Neural-CVs and Meta-CVs, together with the full experimental protocol.



 

\section{Proof of Theorems}
\label{appdx:proofs}

In this section, we first review the assumptions and theorems of \citet{ji2022theoretical_ms_gbml} in \Cref{appdx:previous_results}, since our proofs build on their results. We then give the proof of \Cref{theoCV:cvepsionexits} in \Cref{appdx:proof_thereom1} and the proof of \Cref{theoCV:from_r_to_ri} in \Cref{appdx:proof_thereom2}.





\subsection{Convergence of Model-Agnostic Meta-Learning}
\label{appdx:previous_results}



\citet{ji2022theoretical_ms_gbml} analysed the convergence of model-agnostic meta-learning, and we will adapt their results to the training of CVs.
Letting $O_t$ be either $S_t$ or $Q_t$, and phrasing in terms of the notation and setting used in this work, the assumptions of \citet{ji2022theoretical_ms_gbml} are:
\begin{enumerate}
\item[(A1)] $\min_t \inf_{\gamma} J_{O_t}(\gamma) >-\infty ;$
\item[(A2)] $ \chi := \max_t \sup_{\gamma \neq \zeta} \frac{ \|\nabla_\gamma J_{O_t}(\gamma) - \nabla_\zeta J_{O_t}(\zeta) \|_2 }{ \|\gamma - \zeta \|_2 } < \infty ;$
\item[(A3)] $ \rho := \max_t \sup_{\gamma \neq \zeta} \frac{ \|\nabla_\gamma^2 J_{O_t}(\gamma) - \nabla_\zeta^2 J_{O_t}(\zeta) \|_2 }{ \|\gamma - \zeta \|_2 } < \infty ;$
\item[(A4)] $ \sigma^2 := \max_t \sup_\gamma \| \nabla_\gamma J_{O_t}(\gamma) \|^2_2 < \infty ;$
\item[(A5)] $ b_t := \sup_\gamma \|J_{S_t}(\gamma) - J_{Q_t}(\gamma)\|_2 < \infty .$
\end{enumerate}

















\begin{theorem}[Theorem 9 and Corollary 10 of \citet{ji2022theoretical_ms_gbml}]
\label{theo:theo9_coro10_in_conver_gbml}
 Let the above assumptions (A1) to (A5) hold. Then, with a meta step-size $\eta_i =\frac{1}{80\chi_{\eta_i}}$ for $i = 1, \ldots, \nepoch$ and $\alpha = \frac{1}{8 \chi L}$ in Algorithm~\ref{alg:meta_neural_cv_train}, we attain a solution $\hat{\gamma}_{\text{\normalfont meta}}$ such that
 \begin{talign*}
     \E \|\E_t [ \nabla \J_t( \hat{\gamma}_{\text{\normalfont meta}} ) ] \|_2 = \O\left( \frac{1}{\nepoch} + \frac{\sigma^2}{B} + \sqrt{\frac{1}{\nepoch} + \frac{\sigma^2}{B}} \right),
 \end{talign*}
 where $\chi_{\eta_i} = (1+\alpha \chi)^{2L} + C_b b + C_\chi \E_t[\|\nabla J_{Q_t}( \hat{\gamma}_{\text{\normalfont meta}} )\|_2]$,
with $b = \mathbb{E}_{t}[b_t]$ and
$C_b =C_\chi =(\alpha \rho+ \nicefrac{\rho}{\chi}(1+\alpha \chi)^{L-1})(1+\alpha \chi)^{2L}$.
\end{theorem}



\begin{lemma}[Lemma 19 of \citet{ji2022theoretical_ms_gbml}]
\label{lemma:lemma19_in_conver_gbml}
Under assumptions (A1) - (A5), for any $t$ and any $\gamma \in \R^{p+1}$, we have 
\begin{talign*}
    \| \E_{t} [ \nabla J_{Q_{t}}(\gamma) ] \|_2 \leq\frac{1}{C_1'}\|\E_{t} [ \nabla \J_t(\gamma) ] \|_2 + \frac{C_2'}{C_1'},
\end{talign*}
where $C_1'> 0$ and $C_2' > 0$ are constants given by $C_1' = 2-(1+\alpha \chi)^{2L}$ and $C_2' = ((1+\alpha \chi)^{2L} - 1)\sigma + (1+\alpha \chi)^{L}((1+\alpha \chi)^L-1)b$.
\end{lemma}



\subsection{Proof of Theorem~\ref{theoCV:cvepsionexits}}
\label{appdx:proof_thereom1}
To prove \Cref{theoCV:cvepsionexits}, we first derive three useful propositions (P1-P3) based on our \Cref{assum:cv_new1} and \Cref{assum:cv_new2} in \Cref{sec:theory}, and then give the proof based on the above results from \citet{ji2022theoretical_ms_gbml}.


For each task $t$, we claim that
\begin{enumerate}
\item[(P1)] $ \sup_{\gamma \neq \zeta} \frac{ \|\nabla_\gamma J_{O_t}(\gamma) - \nabla_\zeta J_{O_t}(\zeta) \|_2 }{ \|\gamma - \zeta \|_2 } < \infty ;$
\item[(P2)] $ \sup_{\gamma \neq \zeta} \frac{ \|\nabla_\gamma^2 J_{O_t}(\gamma) - \nabla_\zeta^2 J_{O_t}(\zeta) \|_2 }{ \|\gamma - \zeta \|_2 } < \infty ;$
\item[(P3)] $ \sup_\gamma \| \nabla_\gamma J_{O_t}(\gamma) \|_2 < \infty $,
\end{enumerate}
for both $O_t \in \{S_t,Q_t\}$.



\begin{proof}[Proof of P1-P3]
Denote the additive contribution of a single sample to the loss function as $\ell_t(x;\gamma) = (f_t(x) - g(x;\gamma))^2$.
First we will show that, under \Cref{assum:cv_new1} and \Cref{assum:cv_new2}, for each $t$ and $x \in D_t$, the function $\gamma \mapsto \nabla_\gamma \ell_t(x ; \gamma)$ is bounded and Lipschitz, and the function $\gamma \mapsto \nabla_\gamma^2 \ell_t(x ; \gamma)$ is Lipschitz.
Then (P1-P3) follow immediately as $J_{Q_t}(\gamma)= \frac{1}{\vert Q_t \vert}\sum_{x\in Q_t}\ell_t(x; \gamma)$ and $J_{S_t}(\gamma)= \frac{1}{\vert S_t \vert}\sum_{x\in S_t}\ell_t(x; \gamma)$.

From direct calculation, we have:
\begin{align*}
\nabla_\gamma \ell_t(x ; \gamma) & = - 2 (f_t(x) - g(x ; \gamma)) \nabla_\gamma g(x;\gamma) , \\
\nabla_\gamma^2 \ell_t(x ; \gamma) & = 2 \nabla_\gamma g(x;\gamma) \nabla_\gamma g(x;\gamma)^\top  - 2 (f_t(x) - g(x ; \gamma)) \nabla_\gamma^2 g(x;\gamma) ,
\end{align*}
and taking differences:
\begin{align*}
\| \nabla_\gamma \ell_t(x ; \gamma) - \nabla_\zeta \ell_t(x ; \zeta) \|_2 & = \| - 2 (f_t(x) - g(x ; \gamma)) \nabla_\gamma g(x;\gamma) + 2 (f_t(x) - g(x ; \zeta)) \nabla_\zeta g(x;\zeta) \|_2 \\
& \leq 2 |f_t(x)| \| \nabla_\gamma g(x ; \gamma) - \nabla_\zeta g(x ; \zeta) \|_2 \\
& \qquad + 2 \| g(x ; \gamma) \nabla_\gamma g(x;\gamma) - g(x ; \zeta) \nabla_\zeta g(x;\zeta) \|_2 \\
& \leq 2 |f_t(x)| \| \nabla_\gamma g(x ; \gamma) - \nabla_\zeta g(x ; \zeta) \|_2 \\
& \qquad + 2 | g(x ; \gamma) | \| \nabla_\gamma g(x;\gamma) - \nabla_\zeta g(x ; \zeta) \|_2 + 2 \| \nabla_\zeta g(x;\zeta)\|_2  |g(x;\gamma) - g(x;\zeta)| .
\end{align*}
So, for each $t$ and $x \in D_t$, the function $\gamma \mapsto \nabla_\gamma \ell_t(x ; \gamma)$ is bounded and Lipschitz when the functions $\gamma \mapsto g(x;\gamma)$ and $\gamma \mapsto \nabla_\gamma g(x;\gamma)$ are bounded and Lipschitz (i.e. \Cref{assum:cv_new1}).

Then taking differences and bounding terms in a similar manner, we have
\begin{align*}
\| \nabla_\gamma^2 \ell_t(x ; \gamma) - \nabla_\zeta^2 \ell_t(x ; \zeta) \|_2 & \leq 2 \| \nabla_\gamma g(x;\gamma) \nabla_\gamma g(x;\gamma)^\top - \nabla_\zeta g(x;\zeta) \nabla_\zeta g(x;\zeta)^\top \|_2 \\
& \qquad + 2 |f_t(x)| \| \nabla_\gamma^2 g(x;\gamma) - \nabla_\zeta^2 g(x;\zeta) \|_2 \\
& \qquad + 2 |g(x;\gamma)| \| \nabla_\gamma^2 g(x;\gamma) - \nabla_\zeta^2 g(x;\zeta) \|_2 \\
& \qquad + 2 \| \nabla_\zeta^2 g(x;\zeta) \|_2 |g(x;\gamma) - g(x;\zeta)| .
\end{align*}
So for each $t$ and $x \in D_t$, the function $\gamma \mapsto \nabla_\gamma^2 \ell_t(x ; \gamma)$ is Lipschitz when $\gamma \mapsto g(x;\gamma)$ is bounded and Lipschitz and the functions $\gamma \mapsto \nabla_\gamma g(x;\gamma) \nabla_\gamma g(x;\gamma)^\top$ and $\gamma \mapsto \nabla_\gamma^2 g(x;\gamma)$ are bounded and Lipschitz. The conditions on $g$ and on $\nabla_\gamma g \nabla_\gamma g^\top$ follow from \Cref{assum:cv_new1}, since a product of bounded Lipschitz functions is bounded and Lipschitz. The condition on $\nabla_\gamma^2 g$ then follows because $\nabla_\gamma^2 g(x;\gamma) = \nabla_\gamma g(x;\gamma) \nabla_\gamma g(x;\gamma)^\top - \left[ \nabla_\gamma g(x;\gamma) \nabla_\gamma g(x;\gamma)^\top - \nabla_\gamma^2 g(x;\gamma) \right]$, where the functions $\gamma \mapsto \nabla_\gamma g(x;\gamma) \nabla_\gamma g(x;\gamma)^\top - \nabla_\gamma^2 g(x;\gamma)$ are bounded and Lipschitz (i.e. \Cref{assum:cv_new2}).
\end{proof}





\begin{proof}[Proof of \Cref{theoCV:cvepsionexits}]
    Assumption (A1) is automatically satisfied, since $J_{O_t}$ is non-negative.  (P1) and (P2) above imply (A2) and (A3). (P3) above implies (A4).

    Note that \Cref{assum:cv_new1} implies (A5). This is because, for each $t$ and $x\in D_t$, we have $\sup_\gamma \ell_t(x; \gamma) = \sup_\gamma(f_t(x) - g(x;\gamma))^2 < \infty$, as we assume that $\gamma \mapsto g(x;\gamma)$ is bounded and $f_t(x)$ is constant in $\gamma$. Thus, $\sup_\gamma J_{O_t}(\gamma) = \sup_\gamma \frac{1}{|O_t|} \sum_{x \in O_t} \ell_t(x;\gamma)<\infty$ where $O_t$ can be either $S_t$ or $Q_t$. So $\sup_\gamma \|J_{S_t}(\gamma) - J_{Q_t}(\gamma)\|_2 \leq \sup_\gamma J_{S_t}(\gamma) + \sup_\gamma J_{Q_t}(\gamma) <\infty$.

    Then, \Cref{theoCV:cvepsionexits} follows from the conclusion of \Cref{theo:theo9_coro10_in_conver_gbml}.
\end{proof}






\subsection{Proof of Corollary~\ref{theoCV:from_r_to_ri}}
\label{appdx:proof_thereom2}


\begin{proof} Since \Cref{assum:cv_new1} and \Cref{assum:cv_new2} imply (A1) to (A5) in \Cref{appdx:previous_results}, we will use the constants defined earlier in \Cref{appdx:previous_results} here as well.
Firstly, note that given $\hat{\gamma}_{\epsilon}$, with 
\begin{talign*}
    \alpha < \frac{\exp(\frac{\log 2}{2 L})-1}{\chi} = \frac{2^{\frac{1}{2L}}-1}{\chi} ,
\end{talign*}
which ensures that $C_1' = 2-(1+\alpha \chi)^{2L} > 0$, we have
$\E\| \E_{t} [ \nabla J_{Q_{t}}(\hat{\gamma}_{\epsilon}) ] \|_2 \leq\frac{1}{C_1'}\epsilon + \frac{C_2'}{C_1'}$ by taking $\gamma = \hat{\gamma}_{\epsilon}$ in \Cref{lemma:lemma19_in_conver_gbml}.


If then additionally $ \nabla^2 J_{Q_t}(\gamma) \succeq \mu I_{p+1}$ holds, by (9.11) in \cite{boyd2004convex} we have,
\begin{talign*}
    \|\gamma -\gamma_t^* \|_2 \leq \frac{2}{\mu} \|\nabla 
    J_{Q_t}(\gamma)\|_2.
\end{talign*}


Taking the expectation of both sides, we then have
\begin{talign*}
         \E_{t} [ \|\gamma - \gamma_t^*\|_2 ] &\leq \frac{2}{\mu} \E_{t}[\|\nabla J_{Q_t}(\gamma)\|_2] \\
         &\overset{(i)}{\leq} \frac{2}{\mu} (\|\E_{t }[\nabla J_{Q_t}(\gamma)] \|_2 +\sigma ) ,
\end{talign*}
where $(i)$ follows from \citet[Page 35, Line 8]{ji2022theoretical_ms_gbml}. Take $\gamma = \hat{\gamma}_{\epsilon}$ and take the expectation of both sides. Then by \Cref{theoCV:cvepsionexits},
\begin{talign*}
     \E[\E_{t} [\|\hat{\gamma}_{\epsilon} - \gamma_t^*\|_2 ]] 
     &\leq \frac{2}{\mu} \E [ \|\E_{t}[\nabla J_{Q_t}(\hat{\gamma}_{\epsilon})] \|_2 ] + \frac{2\sigma }{\mu} \\
     &\leq \frac{2}{\mu} \left(\frac{1}{C_1'}\epsilon + \frac{C_2'}{C_1'}\right)+\frac{2\sigma }{\mu} \\
     &= \frac{2}{\mu C_1'}\epsilon + \frac{2(\sigma C_1' + C_2')}{\mu C_1'} \\
     &=\frac{C_{1}}{\mu}\epsilon + \frac{C_{2}}{\mu} ,
\end{talign*}
where $C_{1} = \frac{2}{C_1'}$ and $C_{2} =\frac{2(\sigma C_1' + C_2')}{C_1'} $.
\end{proof}





























%%%%%%
\section{Experimental Details}
\label{appdx:experimental_details}

In this section, we provide more experimental details and implementation details of Neural-CVs and Meta-CVs. Details of the synthetic example are presented in \Cref{appdx:experiments_oscillatory}. Details of the boundary-value ODE are provided in \Cref{appdx:boundary_value_ODEs}. Details of Bayesian inference for the Lotka--Volterra system are provided in \Cref{appdx:experiments_lotka}. Details of the Sarcos robot arm are presented in \Cref{appdx:sarcos}. 








    


\subsection{Experiment: Oscillatory Family of Functions}
\label{appdx:experiments_oscillatory}

Our environment $\rho$ consists of independent distributions on each element of $a$. For $a_1$, we select a $\textsf{Unif}(0.4, 0.6)$ distribution, whilst for all other parameters we select a $\textsf{Unif}(4,6)$ distribution. Each task is of the form $\mathcal{T}_t = \{f_t(x;a_t), \pi_t\}$ where $a_t := (a_{t,1}, a_{t,2:d+1})^\top$ is a sample from $\rho$. This creates a potentially infinite number of integral estimation tasks as $a$ is continuous. The target distributions are $\pi_1(x) = \ldots =\pi_T(x) = \textsf{Unif}(0,1)^d$ where $d$ is the dimension of $x$.


For all experiments of this example, we use the same neural network architecture for both Meta-CVs and Neural-CVs: a fully connected neural network with two hidden layers. Each hidden layer has $80$ neurons while the output layer has $1$ neuron (the output is then multiplied by an identity matrix $I_d$ to be used as $\tilde{u}$, where $d$ is the dimension of the input $x$). The total number of parameters of the neural network is $p = 80d + 6641$. The activation function is the sigmoid function. The neural network serves as $\tilde{u}$ and we apply the Langevin Stein operator to $\tilde{u}(x)\delta(x)$, where $\delta(x) = \prod_{j=1}^d x_j(1-x_j)$, to satisfy the assumptions in \citet{oates2019convergence}. We use Adam as the $\textsc{Update}$ rule in this example and the penalty constant $\lambda$ is set to be $5\times 10^{-6}$.

\paragraph{2-dimensional Oscillatory Family of Functions}
\begin{itemize}
    \item For Meta-CVs: The inner step size is $\alpha = 0.01$. The number of inner gradient steps is $L=1$. The meta step size is $\eta = 0.002$ for all meta iterations. The number of meta iterations $\nepoch$ is set to be $4{,}000$. The meta batch size of tasks $B$ is set to be $5$.
    \item  For Neural-CVs: The step size (learning rate) is $0.002$. The number of training epochs for each task is set to be $20$ with batch size $5$.

   \item For Control functionals: We use the radial basis function kernel $k(x,x') =  \exp( - \frac{\|x-x'\|_2^2}{2v})$ with kernel hyper-parameter $v >0$ as the base kernel for control functionals. The hyper-parameter $v$ is tuned by maximising the marginal likelihood of the Stein kernel on $S_t$ for each task. Optimal control functionals are selected by using $S_t$ and then unbiased control functional estimators are constructed by using $Q_t$ of each task.

\end{itemize}


\paragraph{Impact of the Number of Inner Updates $L$}
\begin{itemize}
    \item  For Meta-CVs: The inner step size is $\alpha = \frac{0.01}{50 \times L}$ for $L \in \{1,3,5,7,10\}$. The meta step size is $\eta = 0.002$ for all meta iterations. The number of meta iterations $\nepoch$ is set to be $4{,}000$. The meta batch size of tasks $B$ is set to be $5$.
\end{itemize}


\paragraph{Impact of Dimensions}
\begin{itemize}
    \item For Meta-CVs: The inner step size is $\alpha = 0.01$. The number of inner gradient steps is $L=1$. The meta step size is $\eta = 0.002$ for all meta iterations. The number of meta iterations $\nepoch$ is set to be $4{,}000$. The meta batch size of tasks $B$ is set to be $5$.
    \item  For Neural-CVs: The step size (learning rate) is $0.002$. The number of training epochs for each task is set to be $20$ with batch size $5$.

   \item For Control functionals: We use the radial basis function kernel $k(x,x') =  \exp(-\frac{\|x-x'\|_2^2}{2v})$ with kernel hyper-parameter $v >0$ as the base kernel for control functionals. The hyper-parameter $v$ is tuned by maximising the marginal likelihood of the Stein kernel on $S_t$ for each task. Optimal control functionals are selected by using $S_t$ and then unbiased control functional estimators are constructed by using $Q_t$ of each task.

\end{itemize}


\paragraph{Impact of $B$ and $\nepoch$ of Meta-CVs}
\begin{itemize}
    \item The inner step size $\alpha =0.01$. The number of inner gradient steps is $L=1$. The meta step size is $\eta= 0.002$ for all meta iterations.
\end{itemize}


\subsection{Experiment: Boundary Value ODEs}
\label{appdx:boundary_value_ODEs}
For all experiments of this example, we use the same neural network architecture for both Meta-CVs and Neural-CVs: a fully connected neural network with three hidden layers. Each hidden layer has $80$ neurons while the output layer has $1$ neuron. The total number of parameters of the neural network is $p = 13{,}201$. The activation function is the sigmoid function. We use Adam as the $\textsc{Update}$ rule in this example and the penalty constant $\lambda$ is set to be $5\times 10^{-6}$.

\begin{itemize}
    \item For Meta-CVs: The inner step size is $\alpha = 0.01$ and the meta step size is $\eta = 0.002$ for all meta iterations. The number of inner updates is $L=1$. The number of meta iterations $\nepoch$ is set to be $2{,}000$. The meta batch size of tasks is set to be $5$.
    \item For Neural-CVs: The step size (learning rate) is $0.002$. The number of training epochs for each task is set to be $20$ with batch size $5$.
\end{itemize}








\subsection{Experiment: Bayesian Inference of Lotka--Volterra System}
\label{appdx:experiments_lotka}

The $\log$-$\exp$ transform is used on the model parameters $x$ to avoid working directly with constrained parameters in the ODE. We reparameterised the Lotka--Volterra system as
\begin{talign*}
    \frac{\mathrm{d}u_1(s)}{\mathrm{d}s} &= \tilde{x}_1 u_1(s)- \tilde{x}_2 u_1(s) u_2(s)\\
    \frac{\mathrm{d}u_2(s)}{\mathrm{d}s} &= \tilde{x}_3 u_1(s) u_2(s)- \tilde{x}_4 u_2(s),
\end{talign*}
where 
\begin{talign*}
    &\tilde{x}_1 =\exp(x_1), \tilde{x}_2  = \exp(x_2) , \\
    &\tilde{x}_3 = \exp(x_3),  \tilde{x}_4 = \exp(x_4),
\end{talign*}
and where $u_1$ and $u_2$ represent the numbers of prey and predators, respectively.


\noindent The model is,
\begin{talign*}
    y_{1}(0) \sim \text{Log-Normal}(\log \tilde{x}_5, \tilde{x}_7)\\
    y_{2}(0) \sim \text{Log-Normal}(\log \tilde{x}_6, \tilde{x}_8) \\
    y_{1}(s) \sim \text{Log-Normal}(\log u_1(s), \tilde{x}_7) \\
    y_{2}(s) \sim \text{Log-Normal}(\log u_2(s), \tilde{x}_8) 
\end{talign*}
where 
\begin{talign*}
    &\tilde{x}_5 := \exp(x_5), \tilde{x}_6 := \exp(x_6), \\
    &\tilde{x}_{7}:=\exp(x_7), \tilde{x}_{8}:=\exp(x_8).
\end{talign*}


\noindent By doing so, $x$ takes values in the whole of $\R^8$. As a result, the prior distribution $\pi(x)$ is defined on $\R^8$ and Stan will return the scores of these parameters directly, as the $8$ parameters $x$ are themselves unconstrained after the manual reparameterisation.


\noindent Priors are,
\begin{talign*}
  x_1 ,x_4 &\sim \text{Normal}(0, 0.5^2) \\
  x_2 , x_3 &\sim \text{Normal}(-3, 0.5^2) \\
  x_5, x_6 &\sim \text{Normal}(\log 10,1^2) \\
  x_7, x_8 &\sim \text{Normal}(-1,1^2)
\end{talign*}



\paragraph{Inference of $x_1$ and $x_2$} 
\begin{itemize}
    \item For both Meta-CVs and Neural-CVs: We use a fully connected neural network with $3$ hidden layers. Each hidden layer has $5$ neurons while the output layer has $8$ neurons. The total number of parameters of the neural network is $p = 153$.  The activation function is the tanh function. All parameters of the neural networks are initialised from a Gaussian distribution with zero mean and standard deviation $0.01$, except for $\gamma_{t,0}$, which is initialised at the Monte Carlo estimator of each task. We use Adam as the $\textsc{Update}$ rule in this example and the penalty constant $\lambda$ is set to be $5\times 10^{-5}$.

    \item For Meta-CVs: The inner step size is $\alpha = 0.0001$. The number of inner gradient steps is $L=1$. The meta step size was initialised at $0.001$ with a step size decay ($\eta_{i+10} = 0.9 \eta_{i} $) every $10$ meta iterations. The number of meta iterations $\nepoch$ is set to be $2{,}000$. The meta batch size of tasks $B$ is set to be $5$. We only use 100 tasks (sub-populations) for learning the Meta-CVs. For each of these 100 tasks, we have more than $N_t$ data points (since the MCMC sampler returns more than $N_t$ samples, we reuse all of them) such that we can learn the Meta-CV with $\nepoch = 2{,}000$ and $B=5$.
    
    \item For Neural-CVs: The step size (learning rate) is $0.001$. The number of training epochs for each task is set to be $20$ with batch size $5$.
\end{itemize}


\paragraph{Inference of $x_3$ and $x_4$} 
\begin{itemize}
    \item For both Meta-CVs and Neural-CVs: We use a fully connected neural network with $3$ hidden layers. Each hidden layer has $3$ neurons while the output layer has $8$ neurons. The total number of parameters of the neural network is $p = 83$.  The activation function is the tanh function. All parameters of the neural networks are initialised from a Gaussian distribution with zero mean and standard deviation $0.01$, except for $\gamma_{t,0}$, which is initialised at the Monte Carlo estimator of each task. We use Adam as the $\textsc{Update}$ rule in this example and the penalty constant $\lambda$ is set to be $5\times 10^{-5}$.

    \item For Meta-CVs:  The inner step size is $\alpha = 0.001$. The number of inner gradient steps is $L=1$. The meta step size was initialised at $0.001$ with a step size decay ($\eta_{i+10} = 0.9 \eta_{i} $) every $10$ meta iterations. The number of meta iterations $\nepoch$ is set to be $2{,}000$. The meta batch size of tasks $B$ is set to be $5$. We only use 100 tasks (sub-populations) for learning the Meta-CVs. For each of these 100 tasks, we have more than $N_t$ data points (since the MCMC sampler returns more than $N_t$ samples, we reuse all of them) such that we can learn the Meta-CV with $\nepoch = 2{,}000$ and $B=5$.
    
    \item For Neural-CVs: The step size (learning rate) is $0.001$. The number of training epochs for each task is set to be $20$ with batch size $5$.
\end{itemize}



\subsection{Experiment: Sarcos Robot Arm}
\label{appdx:sarcos}




\paragraph{Approximate Inference of Full Bayesian Gaussian Process Regression} We learn full Bayesian hierarchical Gaussian processes by variational inference \citep{kucukelbir2017automatic_VL, lalchand2020approximate_FB_GP}.



We set $\sigma=0.1$, $\pi(x_1) = \textsf{Gamma}(25,25)$ and $\pi(x_2)= \textsf{Gamma}(25,25)$, which is the prior used in \citet{oates2017_CF_for_MonteCarloIntegration}.
We transform the kernel hyper-parameters $x \in \R_{+}^{2}$ to $\eta = g(x)= \log x$ such that we can learn a variational distribution $q_{\phi}(\eta)$ of $\eta$ in $\R^2$ and then transform back to $q(x)$. We use a full-rank approximation, which means the variational family takes the following form:
\begin{talign*}
    q_{\phi}(\eta) = \textsf{N}(\mu, VV^\top) ,
\end{talign*}
with variational parameter $\phi:= \{\mu, V\} \in \R^{p+p(p+1)/2}$ where $\mu$ is a column vector and $V$ is a lower triangular matrix. The objective of variational inference is to maximise the evidence lower bound (ELBO) with respect to $\phi$, which is given by
\begin{talign*}
    \textsf{ELBO}(\phi) &= \E_{q_\phi} [\log p(y_{1:q}, e^{\eta}) + \log \vert \textsf{Jacobian}_{g^{-1}}(\eta) \vert] - \E_{q_{\phi}}[\log q_{\phi}(\eta)] \\
    &=  \E_{q_\phi} [\log p(y_{1:q}|e^{\eta}) +\log \pi(e^{\eta}) + \log \vert \textsf{Jacobian}_{g^{-1}}(\eta) \vert] - \E_{q_{\phi}}[\log q_{\phi}(\eta)]
\end{talign*}
The expectations involved in $\textsf{ELBO}(\phi)$ are approximated by Monte Carlo estimators and we use the reparameterisation trick \citep{kingma2013_VAE} to learn $\phi$. \Cref{fig:prior_post_kernelparams} shows the prior and the corresponding posterior of the kernel hyper-parameters $x=(x_1, x_2)$ (in the form of two-dimensional histograms).

\begin{figure}[ht]
    \centering
   \includegraphics[width=0.6\columnwidth]{sarcos/posterior_theta_gammaprior.pdf}
    \caption{Priors and Posteriors of Kernel Hyper-parameters $x$.}
\label{fig:prior_post_kernelparams}
\end{figure}



\paragraph{Settings}
\begin{itemize}
   % NOTE(review): 5 hidden layers of 20 neurons with 2 inputs and 2 outputs gives 1,782 parameters, not 10,401;
   % p = 10,401 would match 5 hidden layers of 50 neurons with a single output neuron -- confirm against the code.
   \item For both Meta-CVs and Neural-CVs: We use a fully connected neural network with $5$ hidden layers. Each hidden layer has $20$ neurons while the output layer has $2$ neurons (the output is then multiplied by an identity matrix $I_2$ to be used as $u$, since $2$ is the dimension of the input $x$). The total number of parameters of the neural network is $p = 10{,}401$. The activation function is the sigmoid function.  All parameters of the neural networks are initialised from a Gaussian distribution with zero mean and standard deviation $0.001$. We use Adam as the $\textsc{Update}$ rule in this example and the penalty constant $\lambda$ is set to be $1\times 10^{-10}$.
    \item For Meta-CVs: The inner step size is $\alpha = 0.01$. The meta step size was initialised at $0.001$ with a step size decay ($\eta_{i+10} = 0.9 \eta_{i} $) every $10$ meta iterations. The number of meta iterations $\nepoch$ is set to be $1{,}000$. The meta batch size of tasks $B$ is set to be $1$.
    \item For Neural-CVs: The step size (learning rate) is $0.001$. The number of training epochs for each task is set to be $20$ with batch size $5$.
   \item For Control functionals: We use the radial basis function kernel $k(x,x') =  \exp(-\frac{\|x-x'\|_2^2}{2v})$ with kernel hyper-parameter $v >0$ as the base kernel for control functionals. The hyper-parameter $v$ is tuned by maximising the marginal likelihood with the Stein kernel on $S_t$ for each task. Optimal control functionals are selected by using $S_t$ and then unbiased control functional estimators are constructed by using $Q_t$ of each task.

\end{itemize}


\paragraph{Extra Experiments} In addition, we test the performance of Meta-CVs on the same tasks used for learning the Meta-CV. Under the same setting described above, the comparisons between Meta-CVs and other methods are presented in \Cref{fig:fb_Sarcos_gammaprior_investinsteps_sameMetaTrainTest}.


\begin{figure}[ht]
    \centering
   \includegraphics[width=0.6\columnwidth]{sarcos/fb_Sarcos_gammaprior_investinsteps_sameMetaTrainTest.pdf}
    \caption{Estimated absolute errors over the same training states (which are used for learning the Meta-CV) of the Sarcos anthropomorphic robot arm (CF: Control functionals; NCV: Neural-CVs; MCV-L: Meta-CVs with L inner steps).}
\label{fig:fb_Sarcos_gammaprior_investinsteps_sameMetaTrainTest}
\end{figure}


\bibliography{sun_447}

\end{document}
