%\documentclass{uai2023} % for initial submission
 \documentclass[accepted]{uai2023} % after acceptance, for a revised
                                    % version; also before submission to
                                    % see how the non-anonymous paper
                                    % would look like

%% There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2023} % ptmx math instead of Computer
% Modern (has noticable issues)
% \documentclass[mathfont=newtx]{uai2023} % newtx fonts (improves upon
 % ptmx; less tested, no support)
% NOTE: Only keep *one* line above as appropriate, as it will be replaced
%       automatically for papers to be published. Do not make any other
%       change above this note for an accepted version.

%% Choose your variant of English; be consistent
\usepackage[american]{babel}
% \usepackage[british]{babel}

%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} % amsmath with fixes and additions
% \usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{tikz} % nice language for creating drawings and diagrams

\usepackage{amsmath}
\usepackage{amssymb}
\usepackage{mathtools}
\usepackage{amsthm}
\usepackage[capitalize,noabbrev]{cleveref}
 \usepackage{relsize}

\theoremstyle{plain}
\newtheorem{theorem}{Theorem}[section]
\newtheorem{proposition}[theorem]{Proposition}
\newtheorem{lemma}[theorem]{Lemma}
\newtheorem{corollary}[theorem]{Corollary}
\theoremstyle{definition}
\newtheorem{definition}[theorem]{Definition}
\newtheorem{assumption}[theorem]{Assumption}
\theoremstyle{remark}
\newtheorem{remark}[theorem]{Remark}


\newtheorem*{theorem1}{\textbf{Theorem~\ref{thm:lb_epehe}}}
\newtheorem*{theorem2}{\textbf{Theorem~\ref{thm:smiple_d1}}}
\newtheorem*{theorem3}{\textbf{Theorem~\ref{thm:simple_ipm}}}
\newtheorem*{lemma2}{\textbf{Lemma~\ref{lemma2}}}
\newtheorem*{theorem5}{\textbf{Theorem~\ref{main_theorem_1}}}

\newcommand{\ind}{\perp\!\!\!\!\perp} 

% for cross referencing the main text
% PLEASE ONLY USE xr IN THE SUPPLEMENTARY MATERIAL. 
% In the main paper, hard code any cross-reference to the supplementary material. 

% \usepackage{xr-hyper}


%%%%%%% Helper Code for xr %%%%%%%%%%
\usepackage{xr} 

\makeatletter
\newcommand*{\addFileDependency}[1]{% argument=file name and extension
  \typeout{(#1)}
  \@addtofilelist{#1}
  \IfFileExists{#1}{}{\typeout{No file #1.}}
}
\makeatother

\newcommand*{\myexternaldocument}[1]{%
    \externaldocument{#1}%
    \addFileDependency{#1.tex}%
    \addFileDependency{#1.aux}%
}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\myexternaldocument{aloui_189}



%% Provided macros
% \smaller: Because the class footnote size is essentially LaTeX's \small,
%           redefining \footnotesize, we provide the original \footnotesize
%           using this macro.
%           (Use only sparingly, e.g., in drawings, as it is quite small.)

%% Self-defined macros
\newcommand{\swap}[3][-]{#3#1#2} % just an example

\title{Transfer Learning for Individual Treatment Effect Estimation\\(Supplementary material)}

% The standard author block has changed for UAI 2023 to provide
% more space for long author lists and allow for complex affiliations
%
% All author information is authomatically removed by the class for the
% anonymous submission version of your paper, so you can already add your
% information below.
%
% Add authors
\author{Ahmed Aloui$^*$}
\author{Juncheng Dong$^*$}
\author{Cat P. Le}
\author{Vahid Tarokh}
% Add affiliations after the authors
\affil{
Department of Electrical and Computer Engineering, Duke University
%Durham, North Carolina, USA
}


\begin{document}
\onecolumn  
\maketitle
\def\thefootnote{*}\footnotetext{Equal Contribution.}


%\onecolumn %% Turn this off if single column is desired for the supplement
%\maketitle
%\textbf{\huge{Appendix}}
%\appendix
\section{Reproducibility Statement}
The supplementary material includes the implementation codes for our proposed framework, TARNet, and CITA.

\section{Causal Inference: An Example}
Let $X \in \mathcal{X}$ be the features (e.g., age, height, weight), the treatment assignment $A \in \{0, 1\}$ be the indicator representing if the subject received vaccine $0$ or $1$. The mortality outcome is denoted by $Y\in\mathcal{Y}$. 
% (Without loss of generality, we can create one-to-one mapping between any finite set of values of $T$ and the set of integers ${0,...,M-1}$ where $M$ is the total number of values of $T$.)

The main challenge of causal inference arises from the absence of counterfactual observations. We do not observe the outcomes of individuals upon receiving treatment $1$ if they have received treatment $0$ and vice versa. The subjects who received vaccine $1$ may differ significantly from those who received vaccine $0$. This issue is called selection bias. For instance, older people are more likely to receive the treatment than young people. Thus, estimating the counterfactual effects is challenging due to the imbalance between the treatment groups.

Let $\hat{f}(x,a)$ be a hypothesis modeling the outcome for an individual $x$ if he/she received treatment $a$. The factual loss is defined as follows:
\begin{equation}\epsilon_{F}(\hat{f}) = \int_{\mathcal{X}\times\{0,1\}\times\mathcal{Y}} l_{\hat{f}}(x,a,y)\; p(x,a,y)\, dx\, da\, dy\end{equation}
By Bayes rule, we can write the factual loss as
\begin{equation*}
\begin{aligned}
&\epsilon_{F}(\hat{f}) \\
&= \int_{\mathcal{X}\times\mathcal{Y}} l_{\hat{f}}(x,a=0,y)\; p(x,y|A=0) p(A=0) dxdy +\\
&\int_{\mathcal{X}\times\mathcal{Y}}l_{\hat{f}}(x,a=1,y)\; p(x,y|A=1) p(A=1) dxdy \\
&= p(A=0) \int_{\mathcal{X}\times\mathcal{Y}} l_{\hat{f}}(x,a=0,y)\; p(x,y|A=0) dxdy +\\
&\left(1-p(A=0)\right)\int_{\mathcal{X}\times\mathcal{Y}}l_{\hat{f}}(x,a=1,y)\; p(x,y|A=1) dxdy \\
& = p(A=0) \epsilon_{F}^{A=0}(\hat{f}) + \left(1-p(A=0)\right) \epsilon_{F}^{A=1}(\hat{f})
\end{aligned}
\end{equation*}

We define the factual loss for the group who received vaccine $0$ as follows:

\begin{equation}
\epsilon_{F}^{A=0}(\hat{f}) = \int_{\mathcal{X}\times\mathcal{Y}} l_{\hat{f}}(x,a=0,y)\; p(x,y|A=0) dxdy
\end{equation}

Similarly, the factual loss for the group who received vaccine $1$ is described as: 

\begin{equation}
\epsilon_{F}^{A=1}(\hat{f}) = \int_{\mathcal{X}\times\mathcal{Y}} l_{\hat{f}}(x,a=1,y)\; p(x,y|A=1) dxdy
\end{equation}

Consider a parallel universe where the treatment assignments are flipped (i.e., those who received vaccine $1$ receive vaccine $0$ and vice versa). The performance of our hypothesis $\hat{f}$ in this scenario is the counterfactual loss, defined as follows:

\begin{equation}
\epsilon_{CF}(\hat{f}) = \int_{\mathcal{X} \times\{0,1\}  \times\mathcal{Y}} l_{\hat{f}}(x,a,y)\; p(x,1-a,y) dx da dy
\end{equation}


%-----------------------------------------------------------------------------

\section{Datasets and Experiments Descriptions}
\subsection{Datasets}
\label{datasets}
\paragraph{IHDP} 
The IHDP dataset was first introduced by \cite{hill} based on real covariates available from the Infant Health and Development Program (IHDP), studying the effect of development programs on children. The features in this dataset come from a Randomized Control Trial. The potential outcomes were simulated using Setting B. The dataset consists of $747$ individuals (e.g., $139$ in the treatment group and $608$ in the control group), each with $25$ features. The potential outcomes are generated as follows:
$$Y_{0} \sim \mathcal{N}(\exp(\beta^{T}\cdot(X + W)), 1)$$
and
$$Y_{1} \sim \mathcal{N}(\beta^{T}(X+W) - \omega,1)$$
where $W$ has the same dimension as $X$ with all entries equal $0.5$ and $\omega=4$. The regression coefficient $\beta$, a vector of length $25$, is randomly sampled from a categorical distribution with the support $(0, 0.1, 0.2, 0.3, 0.4)$ and the respective probabilities $\mu = (0.6, 0.1, 0.1, 0.1,0.1)$. The dataset generated according to these parameters is referred to as the \textit{base} dataset.

Additionally, we generate $9$ additional datasets by introducing $9$ new settings. These settings, which are constructed by varying $\mu$ and $\omega$, are shown in
Table~\ref{ihdp_table}. Each of these generated datasets consists of $747$ individuals (e.g., $139$ in the treatment group and $608$ in the control group).

%These values of $\mu$ and $\omega$ for the different IHDP datasets are given below in the Table.

\begin{table}[t]
\begin{center}
\caption{The settings to generate IHDP datasets}
\label{ihdp_table}
\begin{tabular}{l|lc}
\hline
\multicolumn{1}{l}{\bf Dataset} &\multicolumn{1}{c}{\bf $\mu$} &\multicolumn{1}{c}{\bf $\omega$} \\
\hline
IHDP (\textit{Base})&(0.6, 0.1, 0.1, 0.1, 0.1)&4\\
IHDP 1&(0.61, 0.09, 0.1, 0.1, 0.1) &4.1 \\
IHDP 2&(0.62, 0.08, 0.1, 0.1, 0.1) &4.2 \\
IHDP 3&(0.63, 0.07, 0.1, 0.1, 0.1) &4.3 \\
IHDP 4&(0.64, 0.06, 0.1, 0.1, 0.1) &4.4 \\
IHDP 5&(0.65, 0.05, 0.1, 0.1, 0.1) &4.5 \\
IHDP 6&(0.66, 0.04, 0.1, 0.1, 0.1) &4.6 \\
IHDP 7&(0.67, 0.03, 0.1, 0.1, 0.1) &4.7 \\
IHDP 8&(0.68, 0.02, 0.1, 0.1, 0.1) &4.8 \\
IHDP 9&(0.69, 0.01, 0.1, 0.1, 0.1) &4.9 \\
\hline
\end{tabular}
\end{center}
\end{table}


 
\paragraph{Jobs} The Jobs dataset~\citep{JobsDataset} consists of $619$ observations. In this experiment, the causal inference task aims to learn the effect of participation in a specific professional training program on landing a job in the following three years. Here, we generate a family of related datasets by randomly reverting the original treatment assignments (i.e., $0 \leftrightarrow 1$) with the probability $p \in \{0 = 0/9, 1/9, 2/9, 3/9, 4/9, 5/9, \cdots, 9/9=1\}$. The dataset corresponding to $p=0$ is considered the original dataset, and the dataset with $p=1$ has all treatment assignments reversed. We select the original Jobs dataset, introduced in~\citep{JobsDataset} as the \textit{base} dataset for our experiments.    


\paragraph{Twins}
The Twins dataset~\citep{cevae} is based on the collected birth data of twins born in the United States from 1989 to 1991. It is assumed that twins share significant parts of their features. Consider the scenario where one of the twins was born heavier than the other as the treatment assignment. The outcome is whether the baby died in infancy (i.e., mortality). Here, the twins are divided into two groups: the treatment and the control groups. The treatment group consists of heavier babies from the twins. On the other hand, the control group consists of lighter babies from the twins. All given observations from this dataset are considered factual.

We first construct a \textit{base} dataset by selecting a set of $2000$ pairs of twins from the original dataset~\citep{cevae}. Each individual is assigned to the treatment group according to a Bernoulli experiment with the probability of $q = 0.75$. In an analogous manner to that of the Jobs dataset, we generate a family of related datasets by randomly reverting the treatment assignments of the \textit{base} dataset (i.e., $0 \leftrightarrow 1$) with corresponding probabilities $p \in \{0, 0.1, 0.2, 0.3, 0.4, 0.5, \cdots, 1\} $. For instance, to generate dataset $i=1,2, \cdots, 11$, we revert the individual treatment assignments in the base dataset using the Bernoulli experiment with the probability of $p_i = (i-1)/10$. In particular, $p=0$ corresponds to the original dataset, while $p=1$ corresponds to all treatment assignments reverted. 


% We choose $11$ different $p$ values ranging from 0 to 1 such that the $p$ values are evenly spaced (e.g., \texttt{numpy.linspace(0,1,11)}
% We choose the \textit{base} dataset as the \textit{base} dataset for our experiments~\ref{exp_detail}.  

% We generate multiple Twins datasets by flipping the treatment labels with probability $=p$ ranging from $0$ to $1$. This assures that we are changing both the selection bias and the potential outcomes functions. 

\paragraph{RKHS}
In this experiment, we generate $100$ Reproducing Kernel Hilbert Space (RKHS) datasets, each having $2000$ data points. Next, we generate the treatment and the control populations $X_{1},X_{0} \in \mathbb{R}^{4}$ respectively from Gaussian distributions $\mathcal{N}(\mu_{1},I_{4})$ and $\mathcal{N}(\mu_{0},I_{4})$ for each dataset. We sample $\mu_{1} \in \mathbb{R}^{4}$ and $\mu_{0}\in \mathbb{R}^{4}$ respectively according to Gaussian distributions $\mathcal{N}(\pmb{e},I_{4})$ and $\mathcal{N}(-\pmb{e},I_{4})$ where $\pmb{e}=[1,1,1,1]^{T}$.

Subsequently, we generate the potential outcome functions $f_{0}$ and $f_{1}$ with a Radial Basis Function (RBF) kernel $K(\cdot,\cdot)$, described as follows:

%Let $\gamma_{0}\in \mathbb{R}^{4}$ be a vector with its components sampled individually from the Gaussian distribution $\mathcal{N}(7,1)$. Let $\gamma_{1} \in \mathbb{R}^{4}$ be another vector with its components sampled individually from the Gaussian distribution $\mathcal{N}(9,1)$. \\
Let $\gamma_{0},\gamma_{1}\in \mathbb{R}^{4}$ be two vectors sampled  from $\mathcal{N}(7\pmb{e},I_{4})$ and $\mathcal{N}(9\pmb{e},I_{4})$, respectively. Let $\lambda \in \mathbb{N}$ be sampled uniformly from $\{10,11,\ldots,99,100\}$. For $j \in \{0,1\}$:
\begin{enumerate}
    \item We sample $m_{j} \in \mathbb{N}$ according to the Poisson distribution with parameter $\lambda$ (i.e., $m_{j} \sim \text{Pois}(\lambda)$)
    \item For every $i\in\{1,\ldots,m_{j}\}$, we sample $x_{j}^i$ according to  $\mathcal{N}(\gamma_{j},I_{4}) $
    \item The potential outcome functions $f_j, j=0,1$ are constructed as $f_{j}(\cdot) = \sum_{i=1}^{m_j}K(x_{j}^i,\cdot)$
\end{enumerate}
Given the potential outcome functions $f_j, j \in \{0,1\}$, the corresponding potential outcomes $Y_0$ and $Y_1$ are generated by:
$$
Y_0(x)=f_0(x), \; \text{for every}\; x\in \mathbb{R}^{4},
$$
and
$$
Y_1(x)=f_1(x), \; \text{for every}\; x\in \mathbb{R}^{4}.
$$
% Clearly, all the generated potential outcomes functions are in the same RKHS. 
 %We choose the \textit{base} dataset as the \textit{base} dataset for our experiments~\ref{exp_detail}.
We will refer to the first constructed dataset above as the \textit{base} dataset. Here, all the generated potential outcome functions are in the same RKHS.


\paragraph{Heat (Physics)} Consider a hot object left to cool off over time in a room with temperature $T(0)$. A person will likely suffer a burn if he/she touches the object at time $u$.

The causal inference task of interest is the effect of room temperature $T(0)$ on the probability of suffering a burn. This family consists of $20$ datasets; each includes $4000$ observations (e.g., $2000$ in the control group and $2000$ in the treatment group). The treatment in our setting is $a=1$ when $T(0) = 5$, and $a=0$ when $T(0) = 25$. The touching times of the treatment and control groups are sampled from two Chi-squared distributions $\chi^{2}(5)$ and $\chi^{2}(2)$, respectively, to introduce artificial bias. 

From the solution to Newton's Heat Equation~\citep{heat_equation}, the underlying causal structure is governed by the following equation:
$$T(u) = C \cdot\exp(-ku) + T(0)$$
where $T(u)$ is the temperature at time $u$ and $C, k$ are constants. Let $T(0) = 25, C = 75$ for the control groups and $T(0) = 5, C = 95$ for the treatment groups in the datasets. We choose $20$ values of $k \in \{0.5, \ldots, 2\}$ uniformly spaced in $[0.5, 2]$. For each value of $k$, we generate a new dataset. The dataset corresponding to $k=0.5$ is referred to as the \textit{base} dataset.
 
Let $T^0(u)$ and $T^1(u)$ denote the temperature at time $u$ for the control and treatment groups, respectively. The potential outcomes $Y_0(u)$ and $Y_1(u)$ corresponding to the probability of suffering a burn at time $u$ for the control and treatment groups are described as follows:
$$
Y_j(u) = \max\left(\frac{1}{75}(T^j(u)-25),0\right)
$$
 


\paragraph{Movement (Physics)} Consider a free-falling object encountering air resistance. Opening the parachute can change the air resistance and control the descent velocity. The causal inference task of interest is the effect of the air resistance (e.g., with $a=1$ or without parachute $a=0$)  on the object's velocity at different times.

In this experiment, we generate a family of $12$ datasets. Each dataset includes $4000$ observations (i.e., $2000$ in the treatment group and $2000$ in the control group). The covariate is the time $u$. The outcome is the velocity at time $u$. The times of the treatment and control groups are sampled from two Chi-squared distributions $\chi^{2}(2)$ and $\chi^{2}(5)$, respectively, to create artificial bias. 

The underlying causal structure is governed by an ordinary differential equation (ODE) with the following analytical solution describing the velocity of a person at time $u$:
\begin{align}
\label{movement_equation}
    v(u) = \frac{g}{C}+(v(0)-\frac{g}{C})e^{-Cu}
\end{align}

where $g= 10$ is the earth's gravitational constant, $ C = k/m$, and $m, k$ are the mass and the air resistance constant, respectively. We assume that $v(0) = 0$ corresponds to a free-falling object without initial velocity.

For the control group, $m=k=C=1$ and the potential outcome is calculated as $Y_0(u)= v(u) = 10-10e^{-u}$.  We use different sets of $(m,k)$ to generate the treatment groups for each dataset. The values of $(m,k)$ used in this experiment are as follows:
$(5,1)$, $(5,5)$, $(5,10)$, $(5,20)$, $(10,5)$, $(10,10)$, $(10,20)$, $(20,5)$, $(20,10)$, $(20,20)$, $(50,10)$, $(50,20)$.
The potential outcome function $Y_1 (u)$ is calculated from Equation~\ref{movement_equation} with the values of $m, k$ shown above.
We choose the dataset corresponding to $(m,k)=(5,1)$ as the \textit{base} dataset.  


\subsubsection{Details of Experiments}
\label{exp_detail}

In this paper, we first create a number of causal inference tasks from the above families of datasets. For each family of datasets (e.g., IHDP, Jobs, Twins), the \textbf{base} task is created from its \textit{base} dataset. Similarly, we construct the other tasks from the remaining datasets in that family. In order to study the effects of transfer learning on causal inference, we define the source tasks and the target tasks as follows:
\begin{itemize}
    \item In the first experiment in Section~\ref{exp1}, we choose the \textit{base} task to be the source task and the other tasks to be the target tasks.
    \item In the second experiment in Section~\ref{exp2}, we choose the \textit{base} task to be the target task and the other tasks to be the source tasks.
\end{itemize}


\section{Proofs of Theorems}\label{proof}

\begin{theorem1}
Let $\hat{f}^{S}$ be a model trained on a source task, then
\begin{align*}
    \epsilon^{T}_{F}(\hat{f}^{S}) + u \epsilon^{T,a=0}_{CF}(\hat{f}^{S}) \leq \varepsilon_{PEHE}^{T}(\hat{f}^{S})
\end{align*}
where $u = p^{T}_{F}(a=1) $.
\end{theorem1}

\begin{proof}[\textbf{Proof of Theorem~\ref{thm:lb_epehe}}]
\label{pf:risk}

We have: 
\begin{align}\label{eq1}
\begin{aligned}
& \varepsilon^{T}_{PEHE}(\hat{f}^{S})\\ & = \int_{\mathcal{X}} \big[(\hat{f}^{S}(x,1)  - \hat{f}^{S}(x,0)) - (f^{T}(x,1) - f^{T}(x,0))\big]^{2}\\ & \quad p^{T}_F(x) dx \\
& = \int_{\mathcal{X}} \big[(\hat{f}^{S}(x,1)  - f^{T}(x,1)) + (f^{T}(x,0) - \hat{f}^{S}(x,0))\big]^{2} \\ & \quad p^{T}_F(x) dx \\
& = \int_{\mathcal{X}} (\hat{f}^{S}(x,1)  - f^{T}(x,1))^{2} p^{T}_{F}(x) dx \\
& \quad + \int_{\mathcal{X}} (\hat{f}^{S}(x,0)  - f^{T}(x,0))^{2} p^{T}_{F}(x) dx \\
& \quad + 2 \int_{\mathcal{X}} (\hat{f}^{S}(x,1)  - f^{T}(x,1))(f^{T}(x,0)- \hat{f}^{S}(x,0))\\
&\quad \; p^{T}_F(x) dx \\
\end{aligned}
\end{align}

First, we have the following properties of the factual and counterfactual distributions:
$$
\begin{aligned}
1. \; &\forall x \in \mathcal{X},\; p_F(x) = p_{CF}(x) \\
2. \; & \forall x \in \mathcal{X},\forall a \in \{0,1\},\; p_F(x,a) = p_{CF}(x,1-a)
\end{aligned}
$$

Applying these properties, the squared-error term of Equation~(\ref{eq1}) for the control outcome ($a=0$) can be expressed as:
$$
\begin{aligned}
& \int_{\mathcal{X}} (\hat{f}^{S}(x,0)  - f^{T}(x,0))^{2} p^{T}_{F}(x) dx \\
& =  u \int_{\mathcal{X}} (\hat{f}^{S}(x,0)  - f^{T}(x,0))^{2} p^{T}_{F}(x|a=1) dx \\
&\; + (1-u) \int_{\mathcal{X}} (\hat{f}^{S}(x,0)  - f^{T}(x,0))^{2} p^{T}_{F}(x|a=0) dx \\
& =  u \int_{\mathcal{X}} (\hat{f}^{S}(x,0)  - f^{T}(x,0))^{2} p^{T}_{CF}(x|a=0) dx \\
&\; + (1-u) \int_{\mathcal{X}} (\hat{f}^{S}(x,0)  - f^{T}(x,0))^{2} p^{T}_{F}(x|a=0) dx \\
& = u \epsilon^{T,a=0}_{CF}(\hat{f}^{S}) + (1-u)\; \epsilon^{T,a=0}_{F}(\hat{f}^{S}) 
\end{aligned}
$$
Similarly, the squared-error term of Equation~(\ref{eq1}) for the treated outcome ($a=1$) can be expressed as:
$$
\begin{aligned}
& \int_{\mathcal{X}} (\hat{f}^{S}(x,1)  - f^{T}(x,1))^{2} p^{T}_{F}(x) dx \\
& =  (1-u) \epsilon^{T,a=1}_{CF}(\hat{f}^{S}) + u\; \epsilon^{T,a=1}_{F}(\hat{f}^{S})
\end{aligned}
$$

The potential outcomes are conditionally independent given the features, i.e., $Y_1 \ind Y_0 \mid X$, due to unconfoundedness. Hence, the third term of Equation~(\ref{eq1}) can be expressed as:
$$
\begin{aligned}
    & \mathbb{E}\big[(\hat{f}^{S}(X,1)  - f^{T}(X,1))(f^{T}(X,0)- \hat{f}^{S}(X,0))\big] \\
    & = \mathbb{E}_x\Bigg[\mathbb{E}\Big[(\hat{f}^{S}(x,1)  - Y^{T}_1)(Y^{T}_0- \hat{f}^{S}(x,0)) \mid X=x\Big]\Bigg] \\
    & = 0
\end{aligned}
$$

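The factual and counterfactual losses of the treatment and control groups are nonnegative. In particular, dropping the nonnegative term $(1-u)\,\epsilon^{T,a=1}_{CF}(\hat{f}^{S})$ from the decomposition of $\varepsilon_{PEHE}^{T}(\hat{f}^{S})$ above, we have: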

\begin{align*}
\begin{aligned}
    & u \epsilon^{T,a=1}_{F}(\hat{f}^{S}) + (1-u) \epsilon^{T,a=0}_{F}(\hat{f}^{S}) + u \epsilon^{T,a=0}_{CF}(\hat{f}^{S})\\
    & = \epsilon^{T}_{F}(\hat{f}^{S}) + u \epsilon^{T,a=0}_{CF}(\hat{f}^{S}) \\
    & \leq \varepsilon_{PEHE}^{T}(\hat{f}^{S})
\end{aligned}
\end{align*}
\end{proof}


\begin{theorem2}
For any hypothesis $\hat{f}$, we have:
\begin{align}
    \begin{aligned}
        \epsilon^{T}_{CF}(\hat{f}) \leq & \epsilon^{S}_{F}(\hat{f}) + 
        V(p^{T}_{F},p^{S}_{F}) + V(p^{T}_{F},p^{T}_{CF}) \\ & + \mathbb{E}_{p^{S}_{F}}[|f^{S}(x,a) - f^{T}(x,a)|]  
    \end{aligned}
\end{align}
and
\begin{align}
    \begin{aligned}
        \varepsilon^{T}_{PEHE}(\hat{f}) \leq & 4 \epsilon^{S}_{F}(\hat{f}) + 
        4 V( p^{T}_{F},p^{S}_{F}) + 2 V(  p^{T}_{F},p^{T}_{CF}) \\ & + 4 \mathbb{E}_{p^{S}_{F}}[|f^{S}(x,a) - f^{T}(x,a)|]  
    \end{aligned}
\end{align}
\end{theorem2}

\begin{proof}[\textbf{Proof of Theorem~\ref{thm:smiple_d1}}]
\label{pf:simple_td}
Adapting the first theorem in \cite{Ben-David2010} to our setting, we have the following two inequalities: 
$$
\epsilon^{T}_{CF}(\hat{f}) \leq \epsilon^{T}_{F}(\hat{f}) + V(p^{T}_{F},p^{T}_{CF})  
$$
and
$$
\epsilon^{T}_{F}(\hat{f}) \leq \epsilon^{S}_{F}(\hat{f}) + V(p^{T}_{F},p^{S}_{F}) + \mathbb{E}_{p^{S}_{F}}[|f^{S}(x,a) - f^{T}(x,a)|]   
$$
Therefore, we have:
$$
    \begin{aligned}
        \epsilon^{T}_{CF}(\hat{f}) \leq & \epsilon^{S}_{F}(\hat{f}) + 
        V(p^{T}_{F},p^{S}_{F}) + V(p^{T}_{F},p^{T}_{CF}) \\ & + \mathbb{E}_{p^{S}_{F}}[|f^{S}(x,a) - f^{T}(x,a)|]  
    \end{aligned}
$$
From \cite{shalit}, we have: 
$$
\varepsilon_{PEHE}^{T}(\hat{f}) \leq 2 \epsilon_F^{T}(\hat{f}) + 2 \epsilon_{CF}^{T}(\hat{f})
$$

Therefore, we have:
$$
\begin{aligned}
        \varepsilon^{T}_{PEHE}(\hat{f}) \leq & 4 \epsilon^{S}_{F}(\hat{f}) + 
        4 V( p^{T}_{F},p^{S}_{F}) + 2 V(  p^{T}_{F},p^{T}_{CF}) \\ & + 4 \mathbb{E}_{p^{S}_{F}}[|f^{S}(x,a) - f^{T}(x,a)|]  
    \end{aligned}
$$
\end{proof}


\begin{theorem3}
Suppose that the function class $G$ is stable under addition and multiplication and $\hat f, f^{T} \in G$, then
\begin{align}
    \begin{aligned}
        \epsilon^{T}_{CF}(\hat{f}) \leq & \epsilon^{S}_{F}(\hat{f}) + 
        \underset{G}{\text{IPM}}(p^{T}_{F},p^{S}_{F}) + \underset{G}{\text{IPM}}(p^{T}_{F},p^{T}_{CF}) \\ & + \mathbb{E}_{p^{S}_{F}}[|f^{S}(x,a) - f^{T}(x,a)|]  
    \end{aligned}
\end{align}
and 
\begin{align}
    \begin{aligned}
        \varepsilon^{T}_{PEHE}(\hat{f}) \leq & 4 \epsilon^{S}_{F}(\hat{f}) + 
        4 \underset{G}{\text{IPM}}( p^{T}_{F},p^{S}_{F}) + 2 \underset{G}{\text{IPM}}(  p^{T}_{F},p^{T}_{CF}) \\ & + 4 \mathbb{E}_{p^{S}_{F}}[|f^{S}(x,a) - f^{T}(x,a)|]  
    \end{aligned}
\end{align}
\end{theorem3}

\begin{proof}[\textbf{Proof of Theorem~\ref{thm:simple_ipm}}]
We have that:
\begin{equation*}
\begin{aligned}
\epsilon^{T}_{CF}(\hat{f}) \leq & \; \epsilon^{T}_{F}(\hat{f}) + \|\int (f^{T}(x,a)-\hat{f}(x,a))^2\\ & \; (p_F^{T}(x,a) - p_{CF}^{T}(x,a)) da dx  \| \\
 \leq & \epsilon^{T}_{F}(\hat{f}) + \underset{g\in G}{\sup}\|\int g(x,a)\\ & \quad \;(p_F^{T}(x,a) - p_{CF}^{T}(x,a)) da dx  \| 
\end{aligned}
\end{equation*}

Hence, we have:
$$
\begin{aligned}
\epsilon^{T}_{CF}(\hat{f}) \leq \epsilon^{T}_{F}(\hat{f}) + \underset{G}{\text{IPM}}(  p^{T}_{F},p^{T}_{CF})  
\end{aligned}
$$

Similarly, we have:
$$
\begin{aligned}
    & \epsilon^{T}_{F}(\hat{f})\\& \leq \epsilon^{S}_{F}(\hat{f}) + \mathbb{E}_{p^{S}_{F}}[|f^{S}(x,a) - f^{T}(x,a)|] \\ 
    & + \|\int (f^{T}(x,a)-\hat{f}(x,a))^2(p_F^{T}(x,a) - p_{F}^{S}(x,a)) da dx  \|  \\
    & \leq \epsilon^{S}_{F}(\hat{f}) + \mathbb{E}_{p^{S}_{F}}[|f^{S}(x,a) - f^{T}(x,a)|] + \underset{G}{\text{IPM}}( p^{T}_{F},p^{S}_{F})
\end{aligned}
$$

Thus, we have:
$$
\begin{aligned}
&\epsilon^{T}_{F}(\hat{f}) \\
&\leq \epsilon^{S}_{F}(\hat{f}) + \mathbb{E}_{p^{S}_{F}}[|f^{S}(x,a) - f^{T}(x,a)|] + \underset{G}{\text{IPM}}( p^{T}_{F},p^{S}_{F})
\end{aligned}
$$
Therefore, we have:
$$
    \begin{aligned}
        \epsilon^{T}_{CF}(\hat{f}) \leq & \epsilon^{S}_{F}(\hat{f}) + 
        \underset{G}{\text{IPM}}(p^{T}_{F},p^{S}_{F}) + \underset{G}{\text{IPM}}(p^{T}_{F},p^{T}_{CF}) \\ & + \mathbb{E}_{p^{S}_{F}}[|f^{S}(x,a) - f^{T}(x,a)|]  
    \end{aligned}
$$
From \cite{shalit}, we have: 
$$
\varepsilon_{PEHE}^{T}(\hat{f}) \leq 2 \epsilon_F^{T}(\hat{f}) + 2 \epsilon_{CF}^{T}(\hat{f})
$$

Therefore, we have:
$$
\begin{aligned}
        \varepsilon^{T}_{PEHE}(\hat{f}) \leq & 4 \epsilon^{S}_{F}(\hat{f}) + 
        4 \underset{G}{\text{IPM}}( p^{T}_{F},p^{S}_{F}) + 2 \underset{G}{\text{IPM}}(  p^{T}_{F},p^{T}_{CF}) \\ & + 4 \mathbb{E}_{p^{S}_{F}}[|f^{S}(x,a) - f^{T}(x,a)|]  
    \end{aligned}
$$
\end{proof}




Next, we will use the following results from~\cite{shalit} for causal inference. For notational simplicity, for $x\in \mathcal{X}, a\in\{0,1\}$, we define:
$$
L_{\Phi,h}^{T}(x,a) = \int_{Y}l_{\Phi,h}(x,a,y)P(Y^{T}_a = y|x)dy.
$$

\begin{theorem}[Bounding The Counterfactual Loss]
\label{bounding_cfl}
Let $\Phi$ be an invertible representation with inverse $\Psi$. 
Let $p_{\Phi}^{a=i} = p_{\Phi}(r|a=i), i\in\{0,1\}$.
Let $h: \mathcal{R} \times\{0,1\} \rightarrow \mathcal{Y}$ be a hypothesis. Assume that for $a=0,1$, the function $r\mapsto L_{\Phi,h}(\Psi(r), a) \in G$ then:
\begin{align}
\begin{aligned}
&\epsilon_{C F}(\Phi,h) \leq \\
&(1-u) \epsilon_{F}^{a=1}(\Phi,h)+u \epsilon_{F}^{a=0}(\Phi,h)+ \\
& \underset{G}{\text{IPM}}\left(p_{\Phi}^{a=1}, p_{\Phi}^{a=0}\right).
\end{aligned}
\end{align}
\end{theorem}

\begin{theorem}[Bounding the $\varepsilon_{PEHE}$]
\label{bounding_epehe}
The Expected Precision in Estimating Heterogeneous Treatment Effect $\varepsilon_{PEHE}$ satisfies
\begin{align}
\begin{aligned}
&\varepsilon_{PEHE}(\Phi,h) \\
& \leq 2\left(\epsilon_{C F}(\Phi,h)+\epsilon_F(\Phi,h)\right)   \\
&\leq 2\left(\epsilon_F^{a=0}(\Phi,h)+\epsilon_F^{a=1}(\Phi,h)+\underset{G}{\text{IPM}}\left(p_{\Phi}^{a=1}, p_{\Phi}^{a=0}\right)\right)
\end{aligned}
\end{align}
\end{theorem}


In the next section, the performance of target task $\epsilon_F^{T,a=0}(\Phi,h)$ is related to that of a source task $\epsilon_F^{S,a=0}(\Phi,h)$. Without loss of generality, we present the proof for the case when $a=0$. %Proof of Lemma 1 requires the most work, Lemma 2 and Theorem 1 result from direct application of Lemma 1. 


First, we make the following assumptions:
\begin{itemize}
    \item \textbf{A1}: $\Phi$ is injective (Thus, $\Psi = \Phi^{-1}$ exists on $\text{Im}(\Phi)$).
    \item \textbf{A2}: There exists a real function space $G$ on $\text{Im}(\Phi)$ such that the function $r \mapsto L^{T}_{\Phi,h}(\Psi(r), a) \in G$ for $a \in \{0,1\}$.
    % \item\label{transferability_assumption}\textbf{Assumption 3}: \textbf{Causal Knowledge Transferability Assumption}: There exists a function class $G'$ on $\mathcal{Y}$ such that $y\mapsto \ell_{\Phi,h}(x,t,y) \in G'$ and  $\mathbb{E}\left[\underset{G'}{\text{IPM}}(P(Y_t^{S}|x), P(Y_t^{T}|x))\right]\le\delta$ for $t\in \{0,1\}$.
    \item \textbf{A3}: There exists a function class $G'$ on $\mathcal{Y}$ such that $y\mapsto \ell_{\Phi,h}(x,a,y) \in G'$. 
    %and almost surely on $\mathcal{X}$ with respect to $P(X^{Sr})$.
\end{itemize}
The measure of the fundamental difference between two causal inference tasks is defined as follows:
\begin{equation*}\label{transferability_assumption}
    \gamma^* = \mathbb{E}_{x \sim P(X^S)}\left[\underset{G'}{\text{IPM}}(P(Y_a^{S}|x), P(Y_a^{T}|x))\right]
\end{equation*}


\begin{lemma}
\label{lemma1}
Suppose that Assumptions A1--A3 hold. The factual losses of any model $(\Phi,h)$ on the source and target tasks satisfy, for every $a \in \{0,1\}$,
\begin{equation*}
    \begin{aligned}
        &\epsilon_{F}^{T,a}(\Phi,h) \le \\&\epsilon_{F}^{S,a}(\Phi,h) +  \underset{G}{\text{IPM}}(P(\Phi(X_a^{T})),P(\Phi(X_a^{S}))) + \gamma^{*} 
    \end{aligned}
\end{equation*}
\end{lemma}

\begin{proof}[\textbf{Proof of Lemma~\ref{lemma1}}] 
\begin{align*}
\begin{aligned}
&\epsilon_{F}^{T,a=0}(\Phi,h) - \epsilon_{F}^{S,a=0}(\Phi,h) \\
& = \int_{\mathcal{X}} L_{\Phi,h}^{T}(x,0)P(X_0^{T}=x)-L_{\Phi,h}^{S}(x,0)P(X_0^{S}=x)dx \\
& = \int_{\mathcal{X}} L_{\Phi,h}^{T}(x,0)P(X_0^{T}=x)-L_{\Phi,h}^{T}(x,0)P(X_0^{S}=x)\\ & + L_{\Phi,h}^{T}(x,0)P(X_0^{S}=x) - L_{\Phi,h}^{S}(x,0)P(X_0^{S}=x)dx \\
& = \underbrace{\int_{\mathcal{X}} L_{\Phi,h}^{T}(x,0)P(X_0^{T}=x)-L_{\Phi,h}^{T}(x,0)P(X_{0}^{S}=x)dx}_{\mathlarger{\Gamma}}\\
&+\underbrace{\int_{\mathcal{X}}\left(L_{\Phi,h}^{T}(x,0)-L_{\Phi,h}^{S}(x,0)\right)P(X_0^{S}=x)dx}_{\mathlarger{\Theta}}
\end{aligned}
\end{align*}

To bound $\Theta$, we use the following inequality:
\begin{align*}
\begin{aligned}
& L_{\Phi,h}^{T}(x,a)-L_{\Phi,h}^{S}(x,a) \\
&= \int_{Y}\ell_{\Phi,h}(x,a,y)\left(P(Y^{T}_a = y|x)-P(Y^{S}_a = y|x)\right)dy \\
&\le \max_{f \in G'}\Bigg|\int_{Y}f(y)\left(P(Y^{T}_a = y|x)-P(Y^{S}_a = y|x)\right)dy\Bigg| \\
&= \underset{G'}{\text{IPM}}\big(P(Y^{T}_a = y|x), P(Y^{S}_a = y|x)\big) 
\end{aligned}
\end{align*}


From the above inequality, we have:
\begin{align*}
\begin{aligned}
\Theta &= \int_{\mathcal{X}}\left(L_{\Phi,h}^{T}(x,0)-L_{\Phi,h}^{S}(x,0)\right)P(X_0^{S}=x)dx \\
& \leq \mathbb{E}_{x \sim P(X^S)}\left[\underset{G'}{\text{IPM}}(P(Y_a^{S}|x), P(Y_a^{T}|x))\right]\\
& = \gamma^{*}
\end{aligned}
\end{align*}

To bound $\Gamma$, we use the change of variable formula:
$$
\begin{aligned}
 \Gamma  &= \int_{\mathcal{X}} L_{\Phi,h}^{T}(x,0)P(X_0^{T}=x) - \\
& \quad L_{\Phi,h}^{T}(x,0)P(X_0^{S}=x)dx \\
&= \int_{\mathcal{R}} L_{\Phi,h}^{T}\big(\Psi(r),0\big)P\big(\Phi(X^{T}_{0})=r\big) - \\
& \quad L_{\Phi,h}^{T}\big(\Psi(r),0\big)P\big(\Phi(X^{S}_{0}) =r\big)dr\\
&\le \max_{g\in G} \Bigg|\int g(r) \Big(P\big(\Phi(X^{T}_{0})=r\big)-\\
& \quad P\big(\Phi(X^{S}_{0}) =r\big)\Big)dr\Bigg|\\
&= \underset{G}{\text{IPM}}\Big(P\big(\Phi(X^{T}_0)\big),P\big(\Phi(X^{S}_0)\big)\Big)
\end{aligned}
$$
Combining the above upper bounds for $\Gamma$ and $\Theta$, we have:
$$
\begin{aligned}
&\epsilon_{F}^{T,a=0}(\Phi,h) - \epsilon_{F}^{S,a=0}(\Phi,h) \\
& \le  \underset{G}{\text{IPM}}\Big(P\big(\Phi(X^{T}_0)\big),P\big(\Phi(X^{S}_0)\big)\Big) + \gamma^{*} 
\end{aligned}
$$

Thus, we conclude that:
\begin{equation*}
\begin{aligned}
&\epsilon_{F}^{T,a=0}(\Phi,h)\\
&\le \epsilon_{F}^{S,a=0}(\Phi,h) + \underset{G}{\text{IPM}}\Big(P\big(\Phi(X^{T}_0)\big),P\big(\Phi(X^{S}_0)\big)\Big) + \gamma^{*}
\end{aligned}
\end{equation*}
\end{proof}


\begin{lemma2}
Suppose that Assumptions A1, A2, A3 hold. Then the counterfactual loss of any model $(\Phi,h)$ on the target task satisfies:
\begin{equation*}
    \begin{aligned}
        \epsilon_{CF}^{T}(\Phi,h) \le &\epsilon_F^{S,a=1}(\Phi,h) + \epsilon_F^{S,a=0}(\Phi,h)\\ 
                               & + \underset{G}{\text{IPM}}(P(\Phi(X_1^{T})),P(\Phi(X_1^{S}))) \\
                               & + \underset{G}{\text{IPM}}(P(\Phi(X_0^{T})),P(\Phi(X_0^{S}))) \\
                               & + \underset{G}{\text{IPM}}(P(\Phi(X_0^{T})),P(\Phi(X_1^{T})))+2\gamma^*\\
    \end{aligned}
\end{equation*}
where 
\begin{equation}
%\label{transferability_assumption}
    \gamma^* = \underset{{x \sim P(X^S)}}{\mathbb{E}}\left[\underset{G'}{\text{IPM}}(P(Y_a^{S}|x), P(Y_a^{T}|x))\right]
\end{equation}
measures the fundamental difference between two causal inference tasks.
\end{lemma2}


\begin{proof}[\textbf{Proof of Lemma~\ref{lemma2}}] 
Theorem \ref{bounding_cfl} is applied to establish an upper bound for the counterfactual loss of the target task. Subsequently, we apply Lemma~\ref{lemma1}.

$$
\begin{aligned}
&\epsilon^{T}_{CF}(\Phi,h) \\
& \leq \epsilon_{F}^{T,a=1}(\Phi,h)+\epsilon_{F}^{T, a=0}(\Phi,h)+ \underset{G}{\text{IPM}}\Big(P\big(\Phi(X^{T}_0)\big), P\big(\Phi(X^{T}_1)\big)\Big)
\end{aligned}
$$
Therefore,
\begin{equation*}
\begin{aligned}
\epsilon^{T}_{CF}(\Phi,h)&\leq \epsilon_F^{S,a=1}(\Phi,h) + \epsilon_F^{S,a=0}(\Phi,h) +2\gamma^{*} \\ & 
+\underset{G}{\text{IPM}} \Big(P\big(\Phi(X_1^{T})\big), P\big(\Phi(X_1^{S})\big)\Big) \\
&+ \underset{G}{\text{IPM}}\Big(P\big(\Phi(X_0^{T})\big),P\big(\Phi(X_0^{S})\big)\Big) \\
&+ \underset{G}{\text{IPM}}\Big(P\big(\Phi(X_0^{T})\big),P\big(\Phi(X_1^{T})\big)\Big)\\
\end{aligned}
\end{equation*}
\end{proof}

\begin{theorem5}{(Transferability of Causal Knowledge)} 
Suppose that Assumptions A1, A2, A3 hold. The performance of the source model on the target task, i.e., $\varepsilon^{T}_{PEHE}(\Phi,h)$, is upper bounded by:
\begin{equation*}
\begin{aligned}
    \varepsilon^{T}_{PEHE}(\Phi, h) \le &2(\epsilon_F^{S,a=1}(\Phi,h) + \epsilon_F^{S,a=0}(\Phi,h)\\ 
    &+\underset{G}{\text{IPM}}(P(\Phi(X_1^{T})),P(\Phi(X_1^{S})))\\
    & + \underset{G}{\text{IPM}}(P(\Phi(X_0^{T})),P(\Phi(X_0^{S}))) \\
    & + \underset{G}{\text{IPM}}(P(\Phi(X_0^{T})),P(\Phi(X_1^{T})))+2\gamma^*) 
\end{aligned}
\end{equation*}
\end{theorem5}


\begin{proof}[\textbf{Proof of Theorem~\ref{main_theorem_1}}]
By applying Theorem \ref{bounding_epehe}, we get 
\begin{align*}
\begin{aligned}
&\varepsilon^{T}_{P E H E}(\Phi,h) \\
&\leq 
2\Big(\epsilon_F^{T,a=0}(\Phi,h) + \epsilon_F^{T,a=1}(\Phi,h) \\&+ \underset{G}{\text{IPM}}\left(P\left(\Phi(X^{T}_0)\right), P\left(\Phi(X^{T}_1)\right)\right)\Big) \\
\end{aligned}
\end{align*}
After applying Lemma~\ref{lemma1} to the first and second terms of the above inequality, we have:
\begin{equation*}
\begin{aligned}
    \varepsilon^{T}_{PEHE}(\Phi, h) \le & \; 2 \; (\epsilon_F^{S,a=1}(\Phi,h) + \epsilon_F^{S,a=0}(\Phi,h)\\ 
    &+\underset{G}{\text{IPM}}(P(\Phi(X_1^{T})),P(\Phi(X_1^{S})))\\
    & + \underset{G}{\text{IPM}}(P(\Phi(X_0^{T})),P(\Phi(X_0^{S}))) \\
    & + \underset{G}{\text{IPM}}(P(\Phi(X_0^{T})),P(\Phi(X_1^{T})))+2\gamma^*) 
\end{aligned}
\end{equation*}
\end{proof}

\section{Baseline: Data Bundling} 
In many causal inference scenarios, we only have access to the trained model, and the corresponding data is unavailable. This situation could be the case in medical applications due to privacy reasons. Consequently, bundling the datasets of source tasks with the target task is not feasible. In contrast, the data may be available for some specific applications. In this case, we create another baseline referred to as data bundling.

In data bundling, we create the bundled dataset by combining the datasets of source tasks and the target task. Here, we compare our approach with data bundling for the IHDP and the Movement (Physics) datasets. For data bundling, we report the model's best performance (i.e., $\varepsilon_{P E H E}$) achieved by hyper-parameter search. For our approach, we only report the model's performance with the lowest training error. This setup gives more advantage to the data bundling baseline. The results are illustrated in Figure~\ref{fig:data_bundling}. Even with the aforementioned advantage, the data bundling method achieves poorer performance than our approach. This is due to data imbalance, lack of precision in determining similarity from propensity score, and \textbf{differences in outcome functions}.

\begin{figure}[t]
\centering
    \centering
    \includegraphics[width=0.45\textwidth]{figures/tl_db_movement.png}
    \centering
    \includegraphics[width=0.45\textwidth]{figures/bundling_ihdp.png}
    \caption{Performance comparison between data bundling and our approach. Our approach (red horizontal line) significantly outperforms data bundling. An increase in the size of training data doesn't improve the performance of data bundling.}
    \label{fig:data_bundling}
\end{figure}




\section{Causal Inference Task Affinity}
\label{epsnn}
Let $\mathcal{P}_{N_{\theta}}(T, D^{te})\in [0,1]$ be a function that measures the performance of a given model $N_{\theta}$ parameterized by $\theta\in\mathbb{R}^d$ on the test set $D^{te}$ of the causal task $T$.

\begin{definition}[$\varepsilon$-approximation Network]
A model $N_{\theta}$ is called an $\varepsilon$-approximation network for a task-dataset pair $(T,D)$ if it is trained using the training data $D^{tr}$ such that $\mathcal{P}_{N_{\theta}}(T, D^{te}) \geq 1 - \varepsilon$, for a given $0 < \varepsilon < 1$. 
\end{definition}

\begin{definition}[Fisher Information Matrix]
For a neural network $N_{\theta_{s}}$ with weights $\theta_{s}$ trained on data $D_{s}$, a given test dataset $D_t$ and the negative log-likelihood loss function $L(\theta,D)$, the Fisher Information matrix is defined as:
\begin{align}
    F_{s,t} &=\mathbb{E}_{D\sim D_t}\Big[\nabla_{\theta} L(\theta_{s},D)\nabla_{\theta} L(\theta_{s},D)^T\Big] \\
    &= \mathbb{E}_{D\sim D_{t}}\Big[\mathbf{H}\big(L(\theta_{s},D)\big)\Big],
\end{align}
\end{definition} 
where $\mathbf{H}$ is the Hessian matrix, i.e., $\mathbf{H}\big(L(\theta,D)\big)= \nabla_{\theta}^2L(\theta,D)$, and expectation is taken w.r.t.\ the data. It is proven that the Fisher Information Matrix is asymptotically well-defined \citep{9766163}.
In practice, we approximate the above with the empirical Fisher Information matrix:
\begin{align}\label{emprical_fisher}
    \hat{F}_{s,t} = \frac{1}{|D_{t}|}\sum_{x\in D_{t}} \nabla_{\theta} L(\theta_{s},x)\nabla_{\theta} L(\theta_{s},x)^T.
\end{align}
Here, the empirical Fisher Information Matrix is positive semi-definite because it is the summation of positive semi-definite terms, regardless of the number of samples.


% \subsubsection{Comparison between Unsymmetrized task affinity and CITA}
% We compare the nonsymmetrized task affinity~\citep{le2022task} and symmetrized task affinity (CITA) on the Jobs and the Twins datasets. Figure~\ref{fig:td_sym_plot} shows that CITA has successfully captured the symmetries within causal inference tasks. The x-axis $p$  denotes the probability of flipping treatment assignments of the original dataset. The proposed symmetrized task affinity shows that the datasets corresponding to $p=1$ (i.e., the flipped treatments dataset) and $p=0$ (i.e., the original dataset) are the closest tasks to the original task. The dataset with $p=0.5$ is the furthest dataset. We observe that the computed symmetrized task affinitys (CITA) have the y-axis symmetry at $p=0.5$, indicating the symmetry of the causal inference tasks.  In contrast, the nonsymmetrized task affinity fails to capture this symmetry property.


\subsection{Task Affinity Between Counterfactual Tasks}
\label{task_cf}
In the following section, we denote the task-dataset pair $a=(T_{a}, D_{a})$  by $a_{F}=(T_{a_{F}}, D_{a_{F}})$ where $D_{a_F}$ is sampled from the factual distribution. Similarly, $a_{CF} = (T_{a_{CF}}, D_{a_{CF}})$ denotes the counterfactual task-dataset pair, where $D_{a_{CF}}$ is sampled from   the counterfactual distribution. We refer to $(T_{a_{F}}, D_{a_{F}})$ and $(T_{a_{CF}}, D_{a_{CF}})$ as the corresponding factual and counterfactual tasks. 

The following theorem proves that the order of proximity of tasks is preserved even if we observe the counterfactual tasks instead. In other words, a task, which is more similar to the target task when measured using factual data, remains more similar to the target task even when measured using counterfactual data. 
% To ensure good performance on both the target factual and counterfactual distributions, we need both distances between factual and counterfactual tasks to be small. 
%Our theorem states that a small factual distance implies a small counterfactual distance. Hence, the causal knowledge can be transferred efficiently.

%then, we have all the sufficient conditions for good causal knowledge transfer performance. 


\begin{theorem}
\label{counterctual_order}
Let $\mathbb{T}$ be the set of tasks and
let $a_{F} = (T_{a_{F}},D_{a_{F}})$, $b_{F} = (T_{b_{F}},D_{b_{F}})$, and $c_{F} = (T_{c_{F}},D_{c_{F}})$ be three factual tasks and  $a_{CF} = (T_{a_{CF}},D_{a_{CF}})$, $b_{CF} = (T_{b_{CF}},D_{b_{CF}})$, and $c_{CF} = (T_{c_{CF}},D_{c_{CF}})$ their corresponding counterfactual tasks. 

Suppose that there exists a class of neural networks (well-trained causal inference neural networks) 
$\mathcal{N} = \{N_{\theta}\}_{\theta \in \Theta}$ for which:
\begin{align}
    \forall a,b,c \in \mathbb{T}, \: d[a,b] \leq d[a,c] + d[c,b]
\end{align}
and the task affinity between the factual and the counterfactual can be arbitrarily small, described as follows:
\begin{align}    
    \forall \epsilon>0, \exists N_{\theta} \in \mathcal{N}, \; d[a_{F},a_{CF}]<\epsilon
\end{align}

We have the following result:
\begin{align}
    d[a_{F},b_{F}]\leq d[a_{F},c_{F}] \implies d[a_{CF},b_{CF}] \leq d[a_{CF},c_{CF}]
\end{align}
\end{theorem}

\begin{proof}[\textbf{Proof of Theorem~\ref{counterctual_order}}]
Suppose $d[a_{F},b_{F}]\leq d[a_{F},c_{F}]$. For every $\epsilon >0$, we have:
\begin{align*}
\begin{aligned}
d[a_{CF},b_{CF}] & \leq d[a_{CF},a_{F}] + d[a_{F},b_{F}] + d[b_{F},b_{CF}]\\
             & \leq \epsilon + d[a_{F},c_{F}] + \epsilon\\
             & \leq d[a_{F},a_{CF}] + d[a_{CF},c_{CF}] + d[c_{F},c_{CF}] \\ & \; +2\epsilon \\
             &\leq d[a_{CF},c_{CF}] + 4\epsilon
\end{aligned}
\end{align*}
Therefore,
$d[a_{CF},b_{CF}]\leq d[a_{CF},c_{CF}]$ as $\epsilon \to 0$.
\end{proof}

%\newpage
\bibliography{aloui_189}

\end{document}
