%\documentclass[table, xcdraw, numbered]{arxiv} 
\documentclass[accepted]{uai2023}

\usepackage{xr-hyper}
\usepackage[table]{xcolor}
\usepackage[american]{babel}
\usepackage{natbib} 
\bibliographystyle{plainnat}
\renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} % amsmath with fixes and additions
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{tikz} % nice language for creating drawings and diagrams
\usepackage{amsthm}

%% Self-defined macros
\newcommand{\swap}[3][-]{#3#1#2} % just an example
%% Contribution
\newcommand*\samethanks[1][\value{footnote}]{\footnotemark[#1]}

\title{Functional Causal Bayesian Optimization \\ (Supplementary material)}

\author[1,\thanks{Equal contribution.}]{Limor Gultchin}
\author[2,\samethanks]{Virginia Aglietti}
\author[2]{Alexis Bellot}
\author[2]{Silvia Chiappa}
\affil[1]{University of Oxford, The Alan Turing Institute, Work done at DeepMind, London, UK}
\affil[2]{DeepMind, London, UK}

\usepackage[capitalize,noabbrev]{cleveref}
\usepackage{graphicx}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% THEOREMS
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% Unnumbered theorem-like environment whose heading is supplied by the caller:
%   \begin{namedthm*}{<heading>} ... \end{namedthm*}
% The heading is stored in \thistheoremname, which the unnumbered
% genericthm* environment uses as its title.
\theoremstyle{plain} % just in case the style had changed
\newcommand{\thistheoremname}{}
\newtheorem*{genericthm*}{\thistheoremname}
\newenvironment{namedthm*}[1]
  {\renewcommand{\thistheoremname}{#1}%
  \begin{genericthm*}}
  {\end{genericthm*}}

\newtheorem{theorem}{Theorem}[section]
\newtheorem{proposition}[theorem]{Proposition}
\newtheorem{lemma}[theorem]{Lemma}
\newtheorem{corollary}[theorem]{Corollary}
\theoremstyle{definition}
\newtheorem{definition}[theorem]{Definition}
\newtheorem{assumption}[theorem]{Assumption}
\theoremstyle{remark}
\newtheorem{remark}[theorem]{Remark}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\usepackage{amsfonts,dsfont}
\usepackage[]{xcolor}
\definecolor{darkPurple}{HTML}{423C70}
\definecolor{darkGreen}{HTML}{00544B}
\definecolor{cboGreen}{HTML}{045D04}
\definecolor{lightGreen}{HTML}{ADDC37}
\definecolor{brightBlue}{HTML}{47A6FB}
\definecolor{cocaBOPurple}{HTML}{95A5A6}
\definecolor{mcbosoftpurple}{HTML}{BB8FCE}

\usepackage{wrapfig}
\usepackage{tikz}
\usepackage[colorinlistoftodos, textwidth=26mm, shadow, color=blue!30!white, textsize=tiny]{todonotes}
\setlength{\marginparwidth}{2.5cm}%for the todo package
\usepackage{cancel} 
\usetikzlibrary{arrows,shapes,backgrounds,through,shadows}
\usetikzlibrary{decorations.pathmorphing,calc}

\usepackage{mathrsfs}
\usepackage{bbm}
\usepackage{bm}

% Shared TikZ styles for the graph drawings.
\tikzset{
  % Outlined circular node with no inner padding and a minimum size of 4mm.
  dot node/.style={
    shape=circle,
    % fill=white,
    draw,
    inner sep=+0pt,
    minimum size=+4.mm
  },
  % A `dot node` with an empty gray circular label (minimum size 2mm) placed at its centre.
  % NOTE(review): declared with `.style 2 args`, but neither argument is used
  % in the body -- confirm whether a plain `.style` was intended.
  dotdot node/.style 2 args={
    dot node,
    label={[shape=circle,fill=gray,outer sep=+0pt,inner sep=+0pt,minimum size=+2.mm]center:}
  },
  % Line with `|<->|` tips at both ends; the negative `shorten` values extend
  % each end by half the current line width.
  arc style/.style={
    |<->|,
    shorten >=+-.5\pgflinewidth,
    shorten <=+-.5\pgflinewidth,
  }
}

\usepackage{array,multirow,graphicx}
\usepackage{float}
\usepackage[algo2e]{algorithm2e} 
\usepackage{algorithm, algorithmic}
\usepackage{enumitem}
\usepackage{tikz-network}
\usepackage{amssymb}
% TikZ libraries. `arrows.meta` provides the `Triangle[...]` arrow tip used by
% \arr below; it is requested explicitly instead of relying on another package
% having loaded it.
\usetikzlibrary{automata,positioning,arrows.meta}
% Styles are declared with \tikzset (\tikzstyle is deprecated).
\tikzset{
  dot/.style={circle,fill,inner sep=2.5pt},% filled circular node
  dgraph/.style={->, line width=1.5pt}% default style for directed graphs
}
% Edge specification: a line ending in a 2mm x 2mm triangular tip.
\newcommand{\arr}{-{Triangle[length=2mm, width=2mm]}}
% Independence symbol: \models rotated by 90 degrees (uses graphicx's \rotatebox).
\newcommand{\indep}{\rotatebox[origin=c]{90}{$\models$}}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% NOTATION and MACROS
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% Cross-reference helpers. The tie (~) keeps the label on the same line as its
% number and prevents end-of-sentence spacing after the abbreviation's period.
\renewcommand{\eqref}[1]{Eq.~(\ref{#1})}
\newcommand{\figref}[1]{Fig.~\ref{#1}}
\newcommand{\secref}[1]{Section~\ref{#1}}
\newcommand{\mat}[1]{\mathbf{#1}}
\newcommand{\vecS}[1]{\boldsymbol{ #1 }  } % this for boldsymbols
\newcommand{\expectation}[2]{\mathbb{E}_{#1}[#2]}
\DeclareMathOperator*{\argmin}{arg\,min}
\DeclareMathOperator*{\argmax}{arg\,max}
\DeclareBoldMathCommand{\A}{A}
\DeclareBoldMathCommand{\X}{X}
\DeclareBoldMathCommand{\x}{x}
\DeclareBoldMathCommand{\Z}{Z}
\DeclareBoldMathCommand{\z}{z}
\DeclareBoldMathCommand{\I}{I}
\DeclareBoldMathCommand{\U}{U}
\DeclareBoldMathCommand{\V}{V}
\DeclareBoldMathCommand{\F}{F}
\DeclareBoldMathCommand{\C}{C}
\DeclareBoldMathCommand{\c}{c}
\newcommand{\datai}{\mathcal{D}^I}
\newcommand{\acronospace}[1]{\textsc{#1}}
\newcommand{\gpstext}{\acronospace{gp}s}
\newcommand{\graph}{\mathcal{G}}
\newcommand{\boldtheta}{\bm{\theta}}
\newcommand{\boldomega}{\bm{\omega}}
\newcommand{\range}{\mathcal{R}}
\newcommand{\cond}{\,|\,}

% \xspace (used by the text macros below) comes from the xspace package; load
% it explicitly rather than depending on another package pulling it in.
\usepackage{xspace}
% Small-caps acronym followed by a context-sensitive space.
\newcommand{\acro}[1]{\textsc{#1}\xspace}
% Latin abbreviations. \@ after the final period resets the space factor so the
% following space is a normal inter-word space, not an end-of-sentence space.
\newcommand{\ie}{i.e.\@\xspace}
\newcommand{\eg}{e.g.\@\xspace}
\newcommand{\doi}{\text{do}}
\newcommand{\scm}{\acro{scm}}
\newcommand{\scms}{\acronospace{scm}s}
\newcommand{\gp}{\mathcal{GP}}
\newcommand{\gptext}{\acro{gp}}
\newcommand{\rbf}{\acro{rbf}}
\newcommand{\psa}{\acro{psa}}
\newcommand{\bmi}{\acro{bmi}}
\newcommand{\bmr}{\acro{bmr}}
\newcommand{\rkhs}{\acro{rkhs}}
\newcommand{\mps}{\acro{mps}}
\newcommand{\dmp}{\acro{dmp}}
\newcommand{\mdp}{\acro{mdp}}
\newcommand{\ci}{\acro{ci}}
\newcommand{\Age}{\text{Age}}
\newcommand{\BMI}{\text{BMI}}
\newcommand{\DAG}{\acro{dag}}
\newcommand{\Aspirin}{\text{Aspirin}}
\newcommand{\Statin}{\text{Statin}}
\newcommand{\Height}{\text{Height}}
\newcommand{\health}{\acro{health}}
\newcommand{\echain}{\acro{chain}}
\newcommand{\st}{*}
\newcommand{\ngroup}{\acronospace{ng}roup }
\newcommand{\pgroup}{\acronospace{pg}roup }

% MPS
\newcommand{\calF}{\mathcal{F}}
\newcommand{\Sset}{\mathcal{S}}
\newcommand{\piS}{\mathbf{\pi}_{\Sset}}
\newcommand{\mupistarS}{\mu^Y_{\pi^\st_{\Sset}}}
\newcommand{\mupistarX}{\mu^Y_{\pi^\st_{X}}}
\newcommand{\PiS}{\Pi_{\Sset}}
\newcommand{\svalue}{\textit{s}}
\newcommand{\cvalue}{\mathbf{c}}

% Power set of MPS
\newcommand{\powersetmps}{\Sigma}

% MIS among the MPS
\newcommand{\mismps}{{\mathbb{M}}_{\Sigma}}

% bold Greek letters
\newcommand{\boldpi}{\boldsymbol{\pi}}
\newcommand{\boldalpha}{\boldsymbol{\alpha}}
\newcommand{\boldbeta}{\boldsymbol{\beta}}

% methods acronym
\newcommand{\cgo}{\acro{cgo}}
\newcommand{\fcgo}{f\acro{cgo}}
\newcommand{\fgo}{\acro{fgo}}
\newcommand{\cbo}{\acro{cbo}}
\newcommand{\fcbo}{f\acro{cbo}}
\newcommand{\cocabo}{\acronospace{c}o\acronospace{c}a-\bo}
\newcommand{\fbo}{\acro{bfo}}
\newcommand{\bo}{\acro{bo}}
\newcommand{\mcbo}{\acro{mcbo}}
\newcommand{\fei}{f\acro{ei}}

% subscript for hard and soft
\newcommand{\hardsubscript}{\text{hard}}
\newcommand{\softsubscript}{\text{func}}
\newcommand{\functsubscript}{\text{func}}

%cost notation
\newcommand{\cost}{\texttt{Co}}

% Couple in the Mixed policy scope
\newcommand{\langerangle}[2]{\langle #1, #2 \rangle}

\newcommand{\pa}{\text{pa}}
\newcommand{\opa}{\text{pa}}
\newcommand{\an}{\text{an}}
\newcommand{\de}{\text{de}}
\newcommand{\spo}{\text{sp}}
\def\*#1{\mathbf{#1}}
\def\1#1{\mathcal{#1}}
\def\2#1{\mathscr{#1}}
\def\3#1{\mathbb{#1}}

\newcommand{\kappapar}{\xi}

\usepackage{multicol,multirow} \usepackage{hhline} 
\newcommand{\nrmpsreduce}{\texttt{NRMPSReduce}}
\usepackage{subfig}
\usepackage{graphicx}
\usepackage[most]{tcolorbox}

\usepackage{xr-hyper}
\makeatletter

% Register the file #1 (name including extension) as a dependency of this
% document so that build tools such as latexmk can track it.
% Every code line ends with % so that the expansion contains no stray spaces.
\newcommand*{\addFileDependency}[1]{% argument=file name and extension
  \typeout{(#1)}% latexmk will find this if $recorder=0
  % however, in that case, it will ignore #1 if it is a .aux or
  % .pdf file etc and it exists! If it doesn't exist, it will appear
  % in the list of dependents regardless)
  %
  % Write the following if you want it to appear in \listfiles
  % --- although not really necessary and latexmk doesn't use this
  %
  \@addtofilelist{#1}%
  %
  % latexmk will find this message if #1 doesn't exist (yet)
  \IfFileExists{#1}{}{\typeout{No file #1.}}%
}\makeatother

% Declare #1 (base file name, no extension) as an external document via
% \externaldocument, and register both #1.tex and #1.aux as file dependencies
% through \addFileDependency.
\newcommand*{\myexternaldocument}[1]{%
\externaldocument{#1}%
\addFileDependency{#1.tex}%
\addFileDependency{#1.aux}%
}
%------------End of helper code--------------

% put all the external documents here!
\myexternaldocument{gultchin_486}


\begin{document}
\onecolumn
\maketitle

\section{Proofs}\label{sec:app:proofs_31}
\begin{namedthm*}{Proposition \ref{prop:hard_suboptimality}}
Let $\graph$ be a causal graph such that (i) $\exists C\in \pa_{\graph}(Y)$ with $C\notin \I$; or (ii) $\exists C\in \text{sp}_{\graph}(Y)$. If $\exists X \in \an_{\graph}(Y) \cap \I$ such that $\{\langle X, C \rangle\}$ is an \mps, then there exists at least one \scm compatible with $\graph$ for which $\min_{\Sset \in \Sigma_{\hardsubscript}, \piS \in \PiS} \mu^Y_{\piS}>\min_{\Sset \in \Sigma, \piS \in \PiS} \mu^Y_{\piS}$.
\end{namedthm*}

\begin{proof}
\emph{Case (i)}: 
Assume that there exists $C\in \pa_{\graph}(Y)$ with $C\notin \I$ and $X \in \an_{\graph}(Y) \cap \I$ such that $\{\langle X, C \rangle\}$ is an \mps. As $X \in \an_{\graph}(Y)$, there exists a directed path from $X$ to $Y$, say $X \rightarrow X_i \rightarrow X_{i-1} \rightarrow \cdots \rightarrow X_1 \rightarrow Y$ without loss of generality. Let ${\cal M} = \langle \V, \U, \calF, p(\U) \rangle$ be an \scm such that
\begin{align*}
&C = U_{C}, \, U_{C} \sim \mathcal N(0,1), \\
&X_i = X, \, X_{i-1} = X_i,\, \dots,\, X_1 = X_2, \\
&Y = X_1 C U_Y, \, U_Y \sim \mathcal N(1,1).
\end{align*}    
${\cal M}$ is compatible with $\graph$. 
In this \scm, any \dmp 
$\pi_\Sset$ with $\Sset \in \Sigma_{\hardsubscript}$ would give  $\mu_{\pi_\Sset}^Y=\mathbb{E}_{\pi_\Sset}[Y]=0$.
In contrast, a \dmp $\pi_\Sset$ including the functional intervention $\pi_{X|C}(C)= -1/C$ would result in $Y = - U_Y$ and therefore  $\mu_{\pi_\Sset}^Y = -1$, giving $\min_{\Sset \in \Sigma_{\hardsubscript}, \piS \in \PiS} \mu^Y_{\piS}=0>-1\geq \min_{\Sset \in \Sigma, \piS \in \PiS} \mu^Y_{\piS}$.

\emph{Case (ii)}: 
Assume that there exists $C\in \text{sp}_{\graph}(Y)$ and $X \in \an_{\graph}(Y) \cap \I$ such that $\{\langle X, C \rangle\}$ is an \mps. As $X \in \an_{\graph}(Y)$, there exists a directed path from $X$ to $Y$, say $X \rightarrow X_{i} \rightarrow X_{i-1} \rightarrow \cdots \rightarrow X_1 \rightarrow Y$.

Let ${\cal M} = \langle \V, \U, \calF, p(\U) \rangle$ be an \scm such that
\begin{align*}
&C = U_{CY},\, U_{CY} \sim \mathcal N(0,1), \\
&X_i = X, \, X_{i-1} = X_i,\, \dots,\, X_1 = X_2, \\
&Y = X_1 U_{CY} U_Y, \, U_Y \sim \mathcal N(1,1).
\end{align*}    
${\cal M}$ is compatible with $\graph$. In this \scm, any \dmp $\pi_\Sset$ with $\Sset \in \Sigma_{\hardsubscript}$ would give
$\mu_{\pi_\Sset}^Y=\mathbb{E}_{\pi_\Sset}[Y]=0$. In contrast, a \dmp $\pi_\Sset$ containing the functional intervention $\pi_{X|C}(C)= -1/C$, would result in $Y = - U_Y$ and therefore $\mu_{\pi_\Sset}^Y = -1$, giving $\min_{\Sset \in \Sigma_{\hardsubscript}, \piS \in \PiS} \mu^Y_{\piS}=0>-1\geq \min_{\Sset \in \Sigma, \piS \in \PiS} \mu^Y_{\piS}$. 
\end{proof}

In the following proposition we use the notation $\graph_{\underline{\X}}$ to indicate the modification of $\graph$ obtained by removing the outgoing edges from $\X$.

\begin{namedthm*}{Proposition \ref{prop:hard_optimality}}
In a causal graph $\graph$, if $\pa_{\graph}(Y)\subseteq \I$ and $\text{sp}_{\graph}(Y)=\emptyset$, there exists a \dmp compatible with \mps $\Sset=\{\langerangle{X}{\emptyset}: X \in \pa_{\graph}(Y)\}$ that solves the \fcgo problem.
\end{namedthm*}

\begin{proof}
Consider \mps $\Sset\in\Sigma$ for $\graph$ and \dmp $\pi_{\mathcal S}$ compatible with $\Sset$. Let $\Z = \pa_{\graph}(Y) \backslash ((\X_{\Sset}\cup \C_{\Sset}) \cap \pa_{\graph}(Y))$. As $\pa_{\graph}(Y)\subseteq \I$, 
we can define the \mps $\Sset_\pa = \{\langerangle{X}{\emptyset}: \forall X \in \pa_{\graph}(Y)\}$. Denote by $p_{\pi^*_{\Sset_{\pa}}}(Y)$ the distribution of $Y$ induced by an optimal \dmp $\pi^*_{\Sset_{\pa}}$ compatible with $\Sset_{\text{pa}}$, \ie such that 
$\int_{\range_Y} Y p_{\pi^\st_{\Sset_\pa}}(Y)dY\leq \int_{\range_Y} Y p_{\pi_{\Sset_\text{pa}}}(Y)dY$, for every \dmp $\pi_{\Sset_\text{pa}}$ compatible with $\Sset_{\pa}$, and let $\range = \range_Y \times \range_{\X_{\Sset}\cup \C_{\Sset}}\times \range_{\Z}$. Exploiting the rules of do-calculus \citep{pearl2000causality} and $\sigma$-calculus \citep{correa2020calculus} we obtain
\begin{align*}
\mu_{\pi_{\Sset}}^Y 
&= \int_{\range} Y p_{\pi_{\Sset}}(Y \cond \X_{\Sset}\cup \C_{\Sset} \cup \Z) \underbrace{p_{\pi_{\Sset}}(\X_{\Sset}\cup \C_{\Sset} \cup \Z)d\X_{\Sset}\cup\C_{\Sset} d\Z dY }_{{\cal A}}\\
&= \int_{\range} Y p_{\pi_{\Sset}}(Y \cond \pa_{\graph}(Y)) {\cal A}  \hskip2.8cm (\text{rule 1 }\sigma\text{-calculus}) \,\, \begin{small}Y \indep_{\graph_{\Sset}} (\X_{\Sset}\cup \C_{\Sset} \cup \Z)\backslash \pa_{\graph}(Y) \cond \pa_{\graph}(Y) \end{small}\\
&= \int_{\range} Y p(Y \cond \pa_{\graph}(Y)) {\cal A} \hskip3.2cm (\text{rule 2 }\sigma\text{-calculus}) \,\, \begin{small}Y \indep_{\graph_{\Sset, \underline{\X_{\Sset}}}, \graph_{ \underline{\X_{\Sset}}}} \X_{\Sset} \cond (\pa_{\graph}(Y)\backslash(\pa_{\graph}(Y) \cap \X_{\Sset})) \end{small} \\
&= \int_{\range} Y p(Y \cond \doi(\pa_{\graph}(Y))) {\cal A} \hskip2.6cm (\text{rule 2 } \text{do-calculus})\,\, \begin{small} Y \indep_{\graph_{\underline{\pa_{\graph}(Y)}}} \pa_{\graph}(Y)  \end{small} 
\\
&= \int_{\range} Y p_{\pi_{\Sset_{\pa}}}(Y) {\cal A} 
\geq \int_{\range} Y p_{\pi^\st_{\Sset_{\pa}}}(Y) {\cal A} 
= \mu^Y_{\pi^\st_{\Sset_{\text{pa}}}},
\end{align*}
where $\indep_{\graph_{\Sset, \underline{\X_{\Sset}}}, \graph_{ \underline{\X_{\Sset}}}}$ denotes d-separation in both $\graph_{\Sset, \underline{\X_{\Sset}}}$ and $\graph_{\underline{\X_{\Sset}}}$.
\end{proof}

\begin{namedthm*}{Proposition~\ref{prop:soft_opt}}
If  $\Sset^\st, \pi^{\st}_{\Sset^\st}=\argmin_{\Sset \in \Sigma, \piS\in\PiS}\mu^Y_{\piS}$, 
then $\Sset^\st, \pi^{\st}_{\Sset^\st}=\argmin_{\Sset \in \Sigma^{\C}, \piS\in\PiS}\mu^Y_{\piS,\C=\c}$ $\forall \C \subset \V \backslash Y$ 
such that $\C \cap \text{de}_{\graph}(\I)=\emptyset$ and $\forall\c \in \range_{\C}$ with $\Sigma^{\C} = \{\Sset \in \Sigma: \X_{\Sset} = \X_{\Sset^\st} \text{ and } \{\langerangle{X}{\C_X^{\Sset^\st} \cup \C_X^{\Sset} \cup \C}:X \in \X_{\Sset^\st}\} \text{ is an } \mps\}$.
\end{namedthm*}

\begin{proof}
Assume, by contradiction, that $(\Sset^\st, \pi^{\st}_{\Sset^\st})$, with $\pi^{\st}_{\Sset^\st}=\left\{ \pi_{X|\C_X^{\Sset^\st}}^{\Sset^\st} \right\}_{X\in \X_{\Sset^\st}}$, is a solution to the \fcgo problem but there exist $\C \subset \V \backslash Y$ and a value $\c \in \range_\C$ such that the tuple $(\Sset^1, \pi_{\Sset^1})$ with $\Sset^1 \in \Sigma^\C$ and $\pi_{\Sset^1}=\left\{ \pi_{X|\C_X^{\Sset^1}}^{\Sset^1} \right\}_{X\in \X_{\Sset^1}}\in \PiS$ satisfies $\mu^Y_{\pi_{\Sset^1}, \C=\c} < \mu^Y_{\pi^{\st}_{\Sset^\st},\C=\c}$. As $\Sset^1 \in \Sigma^\C$, we can construct \mps $\Sset^2 = \{\langerangle{X}{\C_X^{\Sset^\st} \cup \C_X^{\Sset^1} \cup \C}: X \in \X_{\Sset^\st}\}$ and the compatible  $\pi_{\Sset^2}=\left\{\pi^{\Sset^2}_{X|\C_X^{\Sset^\st} \cup \C_X^{\Sset^1} \cup \C}\right\}_{X \in \X_{\Sset^\st}}$ with 
\begin{align*}
\pi^{\Sset^2}_{X|\C_X^{\Sset^\st} \cup \C_X^{\Sset^1} \cup \C}=
\begin{cases}
& \pi_{X|\C_X^{\Sset^1}}^{\Sset^1} \text{ if } \C \in [\c-\delta,\c+\delta] \\
& \pi_{X|\C_X^{\Sset^\st}}^{\Sset^\st}  \text{ otherwise},
\end{cases}
\end{align*} 
for a small enough $\delta>0$. As $\C \cap \de_{\graph}(\I) =\emptyset$, variables in $\C$ are not affected by interventions on variables in $\X_{\Sset^\st}$, and therefore $p_{\pi^\st_{\Sset^\st}}(\C)= p_{\pi_{\Sset^1}}(\C)=p(\C)$. Thus we obtain:
\begin{align*}
    \mu^Y_{\pi_{\Sset^2}} &= \int_{\range_\C} \mu^Y_{\pi_{\Sset^2}, \C=\c'} \; p_{\pi_{\Sset^2}}(\C=\c')d\c'\\
    &= \int_{[\c-\delta,\c+\delta]} \mu^Y_{\pi_{\Sset^2}, \C=\c'}\; p_{\pi_{\Sset^2}}(\C=\c')d\c' +  \int_{\range_\C\backslash [\c-\delta,\c+\delta]} \mu^Y_{\pi_{\Sset^2}, \C=\c'}\; p_{\pi_{\Sset^2}}(\C=\c')d\c'\\
    &= \int_{[\c-\delta,\c+\delta]} \mu^Y_{\pi_{\Sset^1}, \C=\c'}\; p_{\pi_{\Sset^1}}(\C=\c')d\c' + \int_{\range_\C\backslash [\c-\delta,\c+\delta]} \mu^Y_{\pi^{\st}_{\Sset^\st}, \C=\c'} \; p_{\pi^{\st}_{\Sset^\st}}(\C=\c')d\c'\\
    &< \int_{[\c-\delta,\c+\delta]} \mu^Y_{\pi^{\st}_{\Sset^\st}, \C=\c'}\; p_{\pi^{\st}_{\Sset^\st}}(\C=\c')d\c' + \int_{\range_\C\backslash [\c-\delta,\c+\delta]} \mu^Y_{\pi^{\st}_{\Sset^\st}, \C=\c'} \; p_{\pi^{\st}_{\Sset^\st}}(\C=\c')d\c'\\
    &=\mu^Y_{\pi^{\st}_{\Sset^\st}},
\end{align*}
which contradicts the assumption that $(\Sset^\st, \pi^{\st}_{\Sset^\st})$ is a solution to the \fcgo problem.
\end{proof}

\section{Alternative kernel construction}\label{sec:app:alternate_kernels}
The kernel function $\kappa_{\Sset}^\kappapar$ introduced in \secref{sec:gpsurrogate} sets the covariance between the elements in the vector $\pi_{\softsubscript}$ associated to a \dmp $\pi_{\Sset}$ to 0, thus restricting the type of functions that can be selected during optimization\footnote{Notice that, for hard interventions, this corresponds to limiting the range of values that can be set when intervening.}.

\begin{wrapfigure}[4]{r}{0.18\textwidth}
\vskip-0.5cm
\scalebox{0.9}{
\begin{tikzpicture}[dgraph]
\node[dot] (c1) [fill=darkGreen!70,label=north:$C_1$] at (-0.8, 2) {};
\node[dot] (c2) [fill=darkGreen!70,label=north:$C_2$] at (0.8, 2) {};
\node[dotdot node] (x) [fill=brightBlue!70,label=north:$X$] at (0, 1.4) {};
\node[dotdot node] (z)[fill=brightBlue!70,label=north:$Z$] at (1.6,1.4) {};
\node[dot] (y) [fill=red!70,label=north:$Y$] at (0.8,0.8) {};
\draw[line width=0.6pt, brightBlue, \arr](c1)--(x);
\draw[line width=0.6pt, brightBlue, \arr](c2)--(x);
\draw[line width=0.6pt, brightBlue, \arr](c2)--(z);
\draw[line width=0.6pt, \arr](x)--(y);
\draw[line width=0.6pt, \arr](z)--(y);
\end{tikzpicture}}
\end{wrapfigure}

For instance, consider the graph on the right with $\Sset = \{\langerangle{X}{(C_1, C_2)}, \langerangle{Z}{C_2}\}$ and $\piS = \{\pi_{X|\{C_1, C_2\}}, \pi_{Z|C_2}\}$. The proposed kernel function would set $\text{Cov}(\pi_{X|\{C_1, C_2\}}, \pi_{Z|C_2}) = 0$. While a study of the effect of choosing different covariance structures on the optimal target effect goes beyond the scope of this paper, in this section we provide alternative kernel constructions that relax this constraint.

Given a \dmp $\pi_\Sset$, one can define the correlation between elements in $\pi_{\softsubscript}$ by introducing a $|\C_\Sset|$-dimensional vector $\boldomega$ of parameters for each function $\pi_{X|\C_X}$ in $\pi_{\softsubscript}$ such that the $j$-th term $\omega_j=1$ if the $j$-th term in $\C_\Sset$ is in $\C_{X}$ and $\omega_j=0$ otherwise. For instance, for $\piS = \{\pi_{X|\{C_1, C_2\}}, \pi_{Z|C_2}\}=\pi_{\functsubscript}$, we have $\omega_1 = \omega_2 = 1$ for $\pi_{X|\{C_1, C_2\}}$ as both variables in $\C_{\Sset} = \{C_1, C_2\}$ are in $\C_X$, while $\omega_1 = 0$ and $\omega_2 = 1$ for $\pi_{Z|C_2}$ as only $C_2$ is in $\C_Z$. 

We can then redefine $\kappa_{\Sset}^\kappapar$ to be an \rbf kernel on an input space given by the product between the context variables and the $\boldomega$ parameters. Denote by $\boldomega^i, \boldomega^j$ two possible values for the $\boldomega$ vector, for instance we could have $\boldomega^i=[1, 1]^\top$ and $\boldomega^j=[0, 1]^\top$ in the example above; and by $\c^i = [c_1^i, \dots, c_{|\C_{\Sset}|}^i]^\top$ and $\c^j = [c_1^j, \dots, c_{|\C_{\Sset}|}^j]^\top$ two vectors of values for $\C_{\Sset}$. We can define $\kappa_{\Sset}^\kappapar : (\range_{\C_{\Sset}} \times \Omega) \times (\range_{\C_{\Sset}} \times \Omega) \to \mathbb{R}^{|\Sset_{\functsubscript}| \times |\Sset_{\functsubscript}|}$ where $\Omega$ is the space of values for each vector $\boldomega$ and $\kappa_{\Sset}^\kappapar((\c, \boldomega)^i, (\c, \boldomega)^j) = \kappa_{\Sset}^\kappapar((\c^i)^\top \boldomega^i, (\c^j)^\top \boldomega^j) = \gamma \exp(-0.5/l^2 \sum_{n=1}^{|\C_{\Sset}|}(c^i_n\omega^i_n - c^j_n\omega^j_n)^2)$ where $\kappapar = \{\gamma, l\}$. For the example above, we can write $\kappa_{\Sset}^\kappapar((\c^i)^\top \boldomega^i, (\c^j)^\top \boldomega^j) = \gamma \exp(-0.5/l^2 [(c_1^i\omega_1^i - c_1^j\omega_1^j)^2 + (c_2^i\omega_2^i - c_2^j\omega_2^j)^2])$. When $\gamma\neq0$, $\boldomega^i=[1, 1]^\top$ and $\boldomega^j=[0, 1]^\top$, this kernel would return a covariance between $\pi_{X| C_1, C_2}$ and $\pi_{Z| C_2}$ equal to $\kappa_{\Sset}^\kappapar((\c^i)^\top \boldomega^i, (\c^j)^\top \boldomega^j) = \gamma \exp(-0.5/l^2 [(c_1^i)^2 + (c_2^i - c_2^j)^2])$. The covariance would thus depend on the context values in the overlapping part of the context variables space and a correction term $(c_1^i)^2$. 
Instead of fixing the values in $\boldomega$ to either zero or one based on the graph structure, one could think about optimizing the values that are different from zero so as to achieve a higher flexibility in terms of allowed covariance while still imposing structure via the zero values.

As a more general kernel construction, given a \mps $\Sset$, a vector of parameter values $\boldomega^i$ and a vector of context values $\c^i = [c_1^i, \dots, c^i_{|\C_{\Sset}|}]^\top$, one could define the augmented input vector $\c^i_{\text{aug}} = [(\c^i)^\top \boldomega^i, (\c^i)^\top \boldomega^i, t]^\top$ (and similarly for two alternative vectors of values $\c^j$ and $\boldomega^j$) given by the concatenation of two $|\C_\Sset|$-dimensional vectors obtained by $(\c^i)^\top \boldomega^i$ and a task index $t$ that gives the index of the function in $\pi_{\Sset_{\functsubscript}}$, similarly to what was introduced in \secref{sec:gpsurrogate}. 

For an augmented vector of hyper-parameters $\kappapar = [\gamma, l, \tilde{\gamma}, \tilde{l}]$, one could then define the following kernel:
\begin{align}
    \kappa_{\Sset}^{\kappapar}(\c_\text{aug}^i, \c_\text{aug}^j) &= \mathbb{I}_{t = t'} \gamma^2 \exp\left(-\frac{0.5}{l^2} \sum_{n=1}^{|\C_{\Sset}|} (\c^i_{\text{aug},n} - \c^j_{\text{aug}, n})^2\right)
    + \mathbb{I}_{t\neq t'} \tilde{\gamma}^2 \exp\left(-\frac{0.5}{\tilde{l}^2} \sum_{n=|\C_{\Sset}|+1}^{2|\C_{\Sset}|} (\c^i_{\text{aug},n} - \c^j_{\text{aug}, n})^2\right) \nonumber\\
    %
    &=\mathbb{I}_{t = t'} \gamma^2 \exp\left(-\frac{0.5}{l^2} \sum_{n=1}^{|\C_{\Sset}|} (c^i_n\omega_n - c^j_n\omega'_n)^2\right)
    + \mathbb{I}_{t\neq t'} \tilde{\gamma}^2 \exp\left(-\frac{0.5}{\tilde{l}^2} \sum_{n=|\C_{\Sset}|+1}^{2|\C_{\Sset}|} (c^i_n\omega_n - c^j_n\omega'_n)^2\right),
    \label{eq:complex_kernel}
\end{align}
where $c^i_n$ is the $n$-th term of the $\c^i$ vector (similarly for $\c^j$ and $\boldomega^i$), and  $\mathbb{I}_{t=t'}$ is an indicator function equal to one if $t=t'$ and zero otherwise. The first term in \eqref{eq:complex_kernel} represents an \acro{rbf} kernel capturing the covariance structure \emph{within} the $t$-th function in $\pi_{\functsubscript}$ while the second term is again an \acro{rbf} kernel that captures the covariance \emph{across} functions in $\pi_{\functsubscript}$. Differently from the kernel described above, we now have two sets of hyper-parameters: $\gamma, l$ for the first \rbf kernel and $\tilde{\gamma}, \tilde{l}$ for the second. This gives higher flexibility in terms of the functional interventions we can learn and thus the target effect values we can achieve. As in the previous kernel, we can let the parameters in $\boldomega$, as well as in $\kappapar$, change to capture different levels of correlation or set them equal to one and zero depending on the structure of the graph. In the latter case and for the example introduced above, we would have $\omega_1 = \omega_2 = 1$ for $\pi_{X|C_1, C_2}$ which would lead to a standard \rbf kernel for the first term in \eqref{eq:complex_kernel}. We could then set $\tilde{\gamma}=0$ to have a zero covariance across functions or finally vary $\omega_3$ and $\omega_4$ for both $\pi_{X|C_1, C_2}$ and $\pi_{Z|C_2}$ to allow for increasing levels of correlation.

\section{Chain Experiments}\label{sec:app:echain}
For the \echain experiments we use the following \scm: 
\begin{align*} 
& X = U_X, \hskip0.1cm W = U_W, \hskip0.1cm Z = -0.5X + U_Z, \hskip0.1cm Y = -W -3ZX + U_Y, \hskip0.1cm \text{with } U_X, U_W, U_Z, U_Y \sim {\cal N}(0,1).
\end{align*}
We set the range for hard interventions on both $Z$ and $W$ to $[-1, 1]$. The set of non-redundant \mps{s} is  $\mismps = \{\{\langerangle{Z}{\emptyset}\}, \{\langerangle{W}{\emptyset}\}, \{\langerangle{Z}{\emptyset}, \langerangle{W}{\emptyset}\}, \{\langerangle{Z}{\{X\}}\}, \{\langerangle{Z}{\{X\}}, \langerangle{W}{\emptyset}\}\}$. 

We set $\texttt{GridSize} = 10$ and represent each functional intervention with $N_{\alpha}=N_{\beta}=10$ samples for the context variables. We sample the coefficients $\boldalpha_i$ (for $i=1,\dots, N_\alpha$) and $\boldbeta_j$ (for $j=1, \dots, N_\beta$) uniformly in the interval $[-0.27, 0.27]$, in order to keep the range of values obtained for the intervened variables following a functional intervention similar to the ranges set for the hard interventions. For each $\Sset \in \mismps$, we initialize the linear kernel $\kappa^{\kappapar}_\Sset$ with $\kappapar = 1$. Exploration is hard to achieve when the \gptext models for $\Sset$ including functional interventions are initialized with \rbf $K^{\theta}_\Sset$ and hyper-parameters $\theta = (\ell, \sigma^2_f) = (1, 1)$. We thus perform a hyper-parameter search exploring continuous values $\sigma^2_f \in [1, 10000]$ and $\ell \in [1, 30]$, which results in selecting $\sigma_f^2 = 7000$, and $\ell = 20$ for both \fcbo and \fbo.
For \cbo and \bo, which consider only hard interventions and thus do not suffer from exploration issues, we initialize $K^{\theta}_\Sset$ with $\theta = (1, 1)$. For \mcbo we use the default setting (Mat\'ern $5/2$ kernel), as it is not possible to tune the kernel and corresponding hyper-parameters. In order to run \mcbo with contextual interventions, we use the augmented \scm  with action variables $X = U_X$, $W= U_W + A_W$, $Z = -0.5X + U_Z + A_Z$, $Y = -W -3ZX + U_Y$. 
In this setting, the average \acro{cpu} execution time for a single \fcbo run is $\sim$ 6 minutes, while for a single \mcbo run it is $\sim$ 14 minutes.

\section{Health Experiments}\label{sec:app:health}
For the \health experiments, we use the \scm from \cite{ferro2015use}:
\begin{equation*}
\begin{split}
    &\Age = U_{\Age}, \ci = U_{\ci},\bmr = 1500 + 10 \times U_{\bmr}, \\
    &\text{Height} = 175 + 10 \times U_{\text{Height}}, \\
    &\text{Weight} = \frac{\bmr + 6.8 \times \text{Age} - 5 \times  \Height}{13.7 + \ci \times 150/7716}, \\
    &\bmi = \text{Weight} / (\text{Height}/100)^2, \\
    &\Aspirin = \sigma(-8 + 0.1 \times \Age + 0.03 \times \bmi), \\
    &\Statin = \sigma(-13 + 0.1 \times \Age + 0.2 \times \bmi), \\
    &\psa = 6.8 + 0.04 \times \Age - 0.15 \times \bmi - 0.6 \times \Statin + 0.55 \times \Aspirin \\
    & \hskip0.9cm+\sigma(2.2 - 0.05 \times \Age + 0.01\times \bmi - 0.04 \times \Statin + 0.02 \times \Aspirin) + U_{\psa}, 
\end{split}
\end{equation*}
with $U_{\text{Age}} \sim \mathcal{U}(55, 75)$,  $U_{\text{\acro{ci}}} \sim \mathcal{U}(-100, 100)$, $U_{\text{\acro{bmr}}} \sim  t\mathcal{N}(-1, 2)$, $U_{\text{Height}}\sim t\mathcal{N}(-0.5, 0.5)$, $U_{\acro{psa}} \sim \mathcal{N}(0, 0.4)$, where $\mathcal{U}(\cdot, \cdot)$ denotes a uniform distribution, $t\mathcal{N}(a, b)$ a standard Gaussian distribution truncated between $a$ and $b$, and $\sigma(\cdot)$ the sigmoidal transformation defined as $\sigma(x) = \frac{1}{1 + \exp(-x)}$. 

We set the ranges for hard interventions on Aspirin, Statin, and CI to $[0.1, 1]$.
The set of non-redundant \mps{s} is $\mismps =$ \{\{$\langerangle{\Aspirin}{\emptyset}$\},
\{$\langerangle{\Statin}{\emptyset}$\}, \{$\langerangle{\ci}{\emptyset}$\},
\{$\langerangle{\Aspirin}{\emptyset}$, $\langerangle{\Statin}{\emptyset}$\}, \{$\langerangle{\Aspirin}{\emptyset}$, $\langerangle{\ci}{\emptyset}$\}, \{$\langerangle{\Statin}{\emptyset}$, $\langerangle{\ci}{\emptyset}$\}, \{$\langerangle{\Aspirin}{\emptyset}$, $\langerangle{\Statin}{\emptyset}$, $\langerangle{\ci}{\emptyset}$\}, \{$\langerangle{\Aspirin}{\{\Age, \bmi\}}$\}, \{$\langerangle{\Statin}{\{\Age, \bmi\}}$\}, \{$\langerangle{\Aspirin}{\{\Age,\bmi\}}$, $\langerangle{\Statin}{\{\Age, \bmi\}}$\}, \{$\langerangle{\Aspirin}{\{\Age, \bmi\}}$, $\langerangle{\Statin}{\emptyset}$\}, \{$\langerangle{\Aspirin}{\emptyset}$, 
$\langerangle{\Statin}{\{\Age, \bmi\}}$\},\{$\langerangle{\Aspirin}{\{\Age, \bmi\}}$, $\langerangle{\ci}{\emptyset}$\},\{$\langerangle{\Statin}{\{\Age, \bmi\}}$, $\langerangle{\ci}{\emptyset}$\}, \{$\langerangle{\Aspirin}{\{\Age, \bmi\}}$, $\langerangle{\Statin}{\{\Age, \bmi\}}$, $\langerangle{\ci}{\emptyset}$\}, \{$\langerangle{\Aspirin}{\emptyset}$, $\langerangle{\Statin}{\{\Age, \bmi\}}$, $\langerangle{\ci}{\emptyset}$\}, \{$\langerangle{\Aspirin}{\{\Age, \bmi\}}$, $\langerangle{\Statin}{\emptyset}$, $\langerangle{\ci}{\emptyset}$\}\}. 

We represent each functional intervention with $N_{\alpha}=N_{\beta}=10$ samples for the context variables. We sample the coefficients $\boldalpha_i$ (for $i=1,\dots, N_\alpha$) and $\boldbeta_j$ (for $j=1, \dots, N_\beta$) uniformly in the interval $[0, 3.3]$, in order to keep the total cost of functional interventions and hard interventions comparable. The \rbf kernels $K^{\theta}_\Sset$ and $\kappa_{\Sset}^{\kappapar}$ are initialized with $\theta = (1, 1)$ and $\kappapar = (1, 1)$ for each $\Sset \in \mismps$. In this setting, the average \acro{cpu} execution time for a single \fcbo run is $\sim$ 3 hours and 20 minutes, while for a single \mcbo run it is $\sim$ 10 hours.
\end{document}