\documentclass[accepted]{uai2024} %
                        

\usepackage[american]{babel}

\usepackage{natbib} %
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} %
\usepackage{booktabs} %
\usepackage{tikz} %


\usepackage{amsmath}
\usepackage{amssymb}
\usepackage{mathtools}
\usepackage{amsthm}

\usepackage{framed}
\usepackage{makecell}
\usepackage{nicefrac}
\usepackage{neuralnetwork}
\usepackage{algorithm2e}
\usepackage{caption}
\usepackage{subcaption}

\usepackage[capitalize,noabbrev]{cleveref}

\theoremstyle{plain}
\newtheorem{theorem}{Theorem}[section]
\newtheorem{proposition}[theorem]{Proposition}
\newtheorem{lemma}[theorem]{Lemma}
\newtheorem{corollary}[theorem]{Corollary}
\theoremstyle{definition}
\newtheorem{problem}[theorem]{Problem}
\newtheorem{definition}[theorem]{Definition}
\newtheorem{assumption}[theorem]{Assumption}
\theoremstyle{remark}
\newtheorem{remark}[theorem]{Remark}


\newcommand{\rep}{\phi}
\newcommand{\Rep}{\Phi}
\newcommand{\repval}{\varphi}

\newcommand{\Pdata}{P^\text{data}}
\newcommand{\pdata}{p^\text{data}}
\newcommand{\Y}{\tilde{Y}}
\newcommand{\E}{\mathbb{E}}
\newcommand{\trueweights}[2]{{\frac{\text{d}#2}{\text{d}#1}}}
\newcommand{\proj}[3]{{{#1}^{#2,#3}}}
\newcommand{\red}[1]{\textcolor{red}{#1}}
\newcommand{\indep}{\perp \!\!\! \perp}
\allowdisplaybreaks
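% \newcommand% NOTE(review): truncated definition (no name/body); commented out so the preamble compiles -- restore if needed



% \newcommand% NOTE(review): truncated definition; see above
% \newcommand% NOTE(review): truncated definition; see above
% \newcommand% NOTE(review): truncated definition; see above
% \newcommand% NOTE(review): truncated definition; see above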


\newcommand{\ourtitle}{Towards Representation Learning for Weighting Problems in Design-Based Causal Inference}


\newcommand{\swap}[3][-]{#3#1#2} %

\title{\ourtitle}

\author[1]{\href{mailto:<oscar.clivio@stats.ox.ac.uk>?Subject=Your UAI 2024 paper}{Oscar Clivio}{}}
\author[2]{Avi Feller}
\author[1]{Chris Holmes}
\affil[1]{%
    Department of Statistics\\
    University of Oxford
}
\affil[2]{%
    Goldman School of Public Policy and Department of Statistics\\
    University of California, Berkeley
}
  
  
  \begin{document}
\maketitle

\begin{abstract}
Reweighting a distribution to minimize a distance to a target distribution is a powerful and flexible strategy for estimating a wide range of causal effects, but can be challenging in practice because optimal weights typically depend on knowledge of the underlying data generating process. In this paper, we focus on design-based weights, which do not incorporate outcome information; prominent examples include prospective cohort studies, survey weighting, and the weighting portion of augmented weighting estimators. In such applications, we explore the central role of representation learning in finding desirable weights in practice. Unlike the common approach of assuming a well-specified representation, we highlight the error due to the choice of a representation and outline a general framework for finding suitable representations that minimize this error. Building on recent work that combines balancing weights and neural networks, we propose an end-to-end estimation procedure that learns a flexible representation, while retaining promising theoretical properties. We show that this approach is competitive in a range of common causal inference tasks.
\end{abstract}

\section{Introduction}
Estimating causal effects is a fundamental task in multiple fields such as epidemiology \citep{westreich2017totruioosw}, medicine \citep{rosenbaum2012omoaocsios}, public policy \citep{benmichael2022plwau} or economics \citep{sekhon2012ammficbicea}. Some challenges include removing the influence of confounders \citep{pearl2016ciisap} or generalizing a treatment effect estimated on a randomized control trial (RCT) to a target observational population \citep{degtiar2023arogat, colnet2022rtrfgfseavs}. 
Weighting approaches, which target a causal effect as an expectation under a reweighting of the original distribution, can address many of these problems \citep{benmichael2021tbaici, colnet2022rtrfgfseavs, johansson2022gbarlfeopoace}. 

In this paper, we focus on finding so-called \emph{design-based weights}, which do not incorporate any outcome information, either out of principle or out of necessity; as such, we cannot apply existing approaches involving outcomes off the shelf. Most prominently, design-based weights arise in the classical literature on the design of observational studies, which stresses the importance of separating the ``design'' and ``analysis'' phases of a non-randomized study \citep{rubin2008focidta}, and therefore stresses the importance of estimating weights without using the outcome. Such weights also arise in \emph{prospective cohort studies} \citep{song2010oscaccs} and in \emph{survey design} \citep{lohr2021sdaa}, in which researchers have not yet collected outcomes, as well as in applications in which it is useful to develop a single set of outcome-agnostic weights, such as in analyses with multiple outcomes of interest \citep{benmichael2024mcwfsd}.  Finally, in doubly robust methods that combine outcome and weighting models, such as in Automatic Debiased Machine Learning (AutoDML) \citep{chernozhukov2022admlocase} or augmented balancing weights \citep{benmichael2021tbaici}, the weights are typically estimated without using outcomes. 


Such methods for finding design-based weights generally rely on minimizing a probability distance between the weighted distribution and a reference distribution. The optimal distance, however, typically depends on the unknown data generating process (DGP). This has led to a large literature on learning an adequate \textit{representation}, a mapping of the covariate space to another manifold, that retains important properties of the DGP. Standard representations include balancing scores \citep{rosenbaum1983tcrotpsiosfce}, sufficient dimension reduction \citep{luo2020musdrfci}, and variable selection \citep{brookhart2006vsfpsm}. The correctness of these representations typically relies on unverifiable assumptions and the analyst is left without guarantees on the bias of the weighting estimator if they are not met, leading to poor performance in practice \citep{kang2007demystifying}. More recent approaches learn a representation implicitly by, for example, modelling weights directly as neural networks; however, they only provide guarantees on the bias for specific DGPs, e.g.\ when the outcome model is piece-wise constant \citep{ozeryflato2018abfci} or follows a neural network architecture \citep{kallus2020dbdcrfciuat}. Despite these advances, there are currently no principled procedures to directly assess and control the quality of a representation and its impact on the bias of the weighted estimator for any possible data generating process.










Filling this gap constitutes our two main contributions. (1) We quantify the information lost by using a weighted estimator based on a representation, rather than on the original covariates, through a ``confounding bias'' and a ``balancing score error'', and give guarantees on the resulting bias of the estimator for any (posited class of the) outcome model.
(2) We develop a method inspired by DeepMatch \citep{kallus2020dbdcrfciuat} and RieszNet \citep{chernozhukov2022rafadmlwnnarf} that learns such representations from data. Unlike the original RieszNet application, however, we do not incorporate outcome information. We show promising performance of this approach on benchmark datasets in treatment effect estimation. This learnt representation can serve as the input of any weighting method, making it a generic pre-processing method.

\section{Background}
\subsection{Setup and Notation}


Let $P(X,\Y)$ be a \textbf{source} distribution on \textit{covariates} $X$ and some \textit{pseudo-outcomes} $\Y$, and $Q$ be a \textbf{target} distribution on covariates. For any distribution $R$ and random variable $Z$, denote $R_Z$ the law $R(Z)$. We assume that we have access to (not necessarily disjoint) i.i.d. samples $\mathcal{P}$ from $P$ and $\mathcal{Q}$ from $Q$. Let $\E_R[Z]$ be the expectation of a random variable $Z$ under the distribution $R$. We call a \textbf{weight function wrt $P$} or \textbf{weights wrt $P$} any measurable $P_X$-a.s. non-negative function $w(x)$ of covariates such that $\E_P[w(X)] = 1$. Any weight function $w$ wrt $P$ induces a distribution $P^w$ such that $\trueweights{P_X}{P^w_X}(x) = w(x)$ and $P^w(\Y|X) = P(\Y|X)$, where we say that $P$ is \textbf{reweighted} by $w(X)$, with $\E_{P^w}[f(X)] = \E_{P}[w(X)f(X)]$ for any function $f$. Let $\E_P[\Y | X=x]$ be a function of interest, which we call \textbf{the outcome model}. We are interested in the \textbf{target estimand} $\E_{Q}[\E_P[\Y | X]]$. In general, we do not have access to either the outcome model or the target estimand. That said, for any weight function $w(x)$ wrt $P$, $\hat{\tau}_w := \frac{1}{|\mathcal{P}|} \sum_{i \in \mathcal{P}} w(X_i) \Y_i$ is an unbiased estimator of $\E_{P^w}[\E_P[\Y | X]]$ as soon as $\E_P[w(X)\Y]$ is well-defined. All of this motivates our problem statement.
    
\begin{problem} \label{prob:gwp}
    Find a weight function $w(X)$ wrt $P$ such that
    \begin{align*}
        \E_{P^w}\left[\E_P[\Y | X]\right] = \E_{Q}\left[\E_P[\Y | X]\right].
    \end{align*}
\end{problem}

This generalizes many weighting problems in causal inference. Generally, let $A$ denote the treatment variable, and $Y$ denote the outcome. We assume that the values of $A$ belong to a finite space $\mathcal{A}$. For $a \in \mathcal{A}$, we denote $Y(a)$ the potential outcome wrt $a$, which is the realized outcome if the subject were to receive treatment $a$. 
In the context of transportability, we also introduce a binary indicator $S$ for membership in an RCT population, thus $A \indep X | S=1$ and $(Y(1), Y(0)) \indep A | S=1$. Let $\Pdata(X,Y,S,A,(Y(a))_{a \in \mathcal{A}})$ be the true data distribution. In the absence of subscript, we assume that the expectation operator is that wrt $\Pdata$, that is $\E := \E_{\Pdata}$. Then, Problem \ref{prob:gwp} can be applied to the following weighting problems (details in Appendix \ref{suppl:problems}):
\begin{itemize}
    \item \emph{Average Treatment Effect on the Treated (ATT).} 
     The pseudo-outcome is $Y$;  the source and target distributions are $\Pdata(X,Y|A=0)$ and $\Pdata(X|A=1)$, respectively; the outcome model is $\E[Y(0) | X=x]$; the estimand is $\E[Y(0) | A=1]$.
    \item \emph{Average Treatment Effect (ATE).} Let $a \in \mathcal{A}$ be fixed. The pseudo-outcome is $Y$;  the source and target distributions are $\Pdata(X,Y|A=a)$ and $\Pdata(X)$, respectively; the outcome model is $\E[Y | A=a, X=x]$; the estimand is $\E[Y(a)]$.
    \item  \emph{Transportability.} The pseudo-outcome is
    \begin{align*}
        \Y := \frac{AY}{\Pdata(A=1|S=1)} - \frac{(1-A)Y}{\Pdata(A=0|S=1)};
    \end{align*}
    the source distribution is the joint covariate and pseudo-outcome distribution in the RCT $\Pdata(X,\Y|S=1)$ ; the target distribution is the covariate distribution in the target population $\Pdata(X|S=0)$; the outcome model is the conditional average treatment effect (CATE) $\E[Y(1) - Y(0) | X=x]$ ; the estimand is the ATE on the target population, $\E[Y(1) - Y(0) | S=0]$. 
    
\end{itemize}

One solution to these problems has the following form:
\begin{definition}
We call \textbf{true weights between $P$ and $Q$} the Radon-Nikodym derivative $\trueweights{P_X}{Q_X}$, which is a weight function wrt $P$.
\end{definition}

These weights are also known as \emph{inverse probability weights} or the \emph{Riesz representer} \citep{hirshberg2017amle, chernozhukov2022admlocase}. They are uniquely defined \citep{benmichael2021tbaici} by, for any measurable function $f$, 
\begin{align*}
    \E_{P}\left[\trueweights{P_X}{Q_X}(X)f(X)\right] = \E_Q[f(X)].
\end{align*}
In particular, this holds for $f(x) = \E_P[\Y | x]$, which solves Problem \ref{prob:gwp}.
In practice, the true weights $\trueweights{P_X}{Q_X}$ are unknown; we turn to estimating them and more generally obtaining solution weights in the next section. 

Finally, to ensure that true weights are well-defined, we make the following assumption, which is equivalent to \emph{overlap} in ATE estimation \citep{brunssmith2023abwalr} and \emph{support inclusion} \citep{colnet2022rtrfgfseavs} in transportability.  
\begin{assumption}
\label{ass:ac} $Q_X$ is absolutely continuous wrt $P_X$.
\end{assumption}

As we discuss in the introduction, we are in the setting where outcomes $Y_i$ and pseudo-outcomes $\Y_i$ for $i \in \mathcal{P}$ are not observed and cannot be used when trying to find weights solving Problem \ref{prob:gwp}, and are only available for computing the final estimate $\hat{\tau}_w$ \textit{after} weights have been found. 

\subsection{Common Methods in Weighting}
\label{sec:lit_review_weighting}

In ATT/ATE estimation and transportability, true weights are proportional to the inverse of one of the propensity scores $\Pdata(A=a|X=x)$ \citep{benmichael2021tbaici} or $\Pdata(S=1|X=x)$ \citep{cole2010gefrctttptat}. Thus, an inverse probability weighting estimator $\widehat{w}$ of $\trueweights{P_X}{Q_X}$ is obtained by fitting a model for the indicated propensity score and inverting it, leading to potentially outsize errors due to misspecification \citep{zubizarreta2015swtbcfewiod}. An alternative used in the automatic debiased machine learning (AutoDML) literature is to minimize the mean squared error between $\trueweights{P_X}{Q_X}$ and $\widehat{w}$, which can actually be estimated without exactly knowing the true weights $\trueweights{P_X}{Q_X}$ \citep{chernozhukov2022admlocase, chernozhukov2022rafadmlwnnarf, newey2023admlfcs}. Another family of methods \citep{hainmueller2012ebfceamrmtpbsios, fong2018cbpsfactatteopa} relies on imposing that weights $w$ verify \textbf{balance} in some moments $r$, i.e. $\E_{P^w}[r(X)] = \E_{Q}[r(X)]$. Then one minimizes some dispersion measure of weights under these constraints. However, balancing $r(X)$ does not guarantee balancing the unknown $\E_P[\Y|X]$ and the solution might not be feasible if $r$ has too many moments \citep{wainstein2022tfb}. Similar methods enforce such balance approximately through a generalized method of moments \citep{imai2014cbps, fong2018cbpsfactatteopa}.

Finally, another family of methods \citep{benmichael2021tbaici} aims at finding weights $w$ minimizing $|\text{Bias}_{P,Q}(w)|$ where we refer to
\begin{align*}
    \text{Bias}_{P,Q}(w) = \E_{P^w}[\E_P[\Y|X]] - \E_{Q}[\E_P[\Y|X]]
\end{align*} as the \textbf{``bias''} of weights $w$, measuring how short they fall of solving Problem \ref{prob:gwp} and which is also equal to the bias of the estimator $\hat{\tau}_w$ wrt the target estimand. It is usually assumed that $\E_P[\Y|x]$ belongs to a class of functions $\mathcal{M}$ which leads to the bound
\begin{align*}
|\text{Bias}_{P,Q}(w)|
&\leq \text{IPM}_{\mathcal{M}}(P^w_X, Q_X) \\
&:= \sup_{\bar{m} \in \mathcal{M}} |\E_{P^w}[\bar{m}(X)] - \E_{Q}[\bar{m}(X)]|
\end{align*}
where the RHS is an integral probability metric (IPM) \citep{sriperumbudur2012oteeoipm} on the class $\mathcal{M}$ and generally corresponds to a known probability discrepancy; for example the Wasserstein distance when $\mathcal{M}$ is the set of Lipschitz functions or the maximum mean discrepancy (MMD) wrt kernel $k$ when $\mathcal{M}$ is the RKHS of $k$. Thus, adding a term to control the variance of the weighting estimator \citep{kallus2020gommfci, benmichael2021tbaici}, we obtain a solution $w$ by solving
\begin{align}
    \min_w \ \ \text{IPM}_{\mathcal{M}}(P^w_X, Q_X)^2 + \sigma^2 \cdot ||w(X)||_{L_2(P)}^2 \label{eq:balancingweightsoptim}
\end{align} for a chosen $\sigma > 0$ that controls a bias-variance trade-off \citep{brunssmith2022oaadtfbw}. A key challenge is that, as we do not know the outcome model $\E_P[\Y|x]$, we do not know the model class $\mathcal{M}$, and thus do not know which probability discrepancy is adequate to minimize. In practice, one resorts to trying a specific discrepancy, thus making an implicit assumption on the function space $\mathcal{M}$ which can then be inadequate wrt the outcome model $\E_P[\Y|x]$ at stake. Recognizing this, directions in the literature include finding a data-driven tailored function class $\mathcal{M}$ \citep{kallus2020dbdcrfciuat, wainstein2022tfb} or finding guarantees when the function class is misspecified \citep{brunssmith2022oaadtfbw}.

\subsection{Choosing a Distance via a Representation}
\label{sec:background_reps}
Many methods minimize a probability discrepancy measure or more generally find weights that only depend on covariates $x$ via a vector-valued function $\rep(x)$ known as a \textbf{representation} \citep{kallus2020dbdcrfciuat, xue2023acdfsatvdlr}. %
Indeed, assuming any function class $\mathcal{M}$ implicitly assumes that any function linearly depends on a representation $\rep(x)$, e.g. the first-order moment $x$ for linear functions, the kernel feature spaces $k(.,x)$ for the RKHS of kernel $k$ \citep{hazlettNAkbafnpwpfece, kallus2020dbdcrfciuat}, and more generally $(m(x))_{m \in \mathcal{M}}$ for any class $\mathcal{M}$ (note that such a representation is not unique). In turn, every representation defines a function class. Thus, choosing a function class $\mathcal{M}$ means \textit{implicitly} choosing a representation $\rep(x)$ and assuming that the true outcome model $\E_P[\Y|x]$ linearly depends on it.%

Further, it is also common practice to \textit{explicitly} define a representation $\rep(x)$ (on which the outcome model need \textit{not} depend linearly) and apply a weighting method using it. Notable examples include propensity scores and balancing scores \citep{rosenbaum1983tcrotpsiosfce}, prognostic scores \citep{hansen2008tpaotps} or variable selection \citep{brookhart2006vsfpsm, colnet2022rtrfgfseavs}. One motivation to do so is that a low-dimensional representation can mitigate undesirable effects of high dimensions in causal inference \citep{ning2020reocevahdcbps, damour2021oioswhdc} or probability distances \citep{dudley1969tsomgcc, ramdas2015otdpokadbnhtihd} and improve efficiency by selecting essential covariate information wrt the DGP.

The question then becomes how to obtain suitable representations $\rep(x)$. It is well-known that weighting on the true outcome model, the propensity score or a representation predicting either \citep{rosenbaum1983tcrotpsiosfce, hansen2008tpaotps} is a sensible choice as these representations preserve unconfoundedness. However, we do not have access to these true models or representations predicting them. Methods based on sufficient dimension reduction attempt to find a linear representation under the constraint that it predicts either model \citep{cook2009rgifsrtg, luo2020musdrfci},
while others extract representations from a learnt model for the outcome, the treatment or the RCT indicator \citep{rosenbaum1983astaubciaoswbo, hansen2008tpaotps, cole2010gefrctttptat}. However, to the best of our knowledge, there are no guarantees on the bias when any posited model is misspecified or more generally when any underlying assumption is violated, while they are critical as one cannot verify such assumptions. In particular, classification-based learning of propensity scores does not optimize for covariate balance but for prediction of the treatment or the RCT indicator, while (near-)deterministic prediction of either will violate (strict \citep{damour2021oioswhdc}) overlap, leading to poor matching or weighting performance in practice \citep{alam2019sapsmbstuoepfca, king2019wpssnbufm}. In addition, many such methods learn the representation using outcomes, which is done before weighting, thus is not permitted in an actual design-based setting. More recent works learn implicit representations by positing a rich parametric class $\mathcal{M}$ \citep{ozeryflato2018abfci, kallus2020dbdcrfciuat}, as a result bias can be controlled but only for outcome models belonging to this class.

Thus, one might wonder whether guarantees on the bias can be provided when using \textit{any} representation $\rep$ and \textit{any} class $\mathcal{M}$, without using outcome information and without relying on rigid well-specification assumptions. This is the main contribution of our paper, which we develop next.



\section{Theory and Method}

\subsection{Quantifying the Information Loss}
\label{sec:decomposition}
Choosing a representation $\rep(X)$ introduces many trade-offs.
At one extreme, oracle %
representations, such as balancing scores or prognostic scores, perfectly preserve unconfoundedness; that is, unconfoundedness given $X$ implies unconfoundedness given $\rep(X)$. These are largely unknown, however. At the other extreme, degenerate representations, such as a \emph{constant} $\rep(X)$, will destroy all the information in the original $X$.
We now characterize representations that minimize the information lost relative to $X$.

Indeed, we first make technical assumptions ensuring that all expectations are well-defined. For any distribution $R$, random variable $Z$ and integer $p \geq 1$, let 
\begin{align*}
||Z||_{L_p(R)} := \left(\E_R[|Z|^p]\right)^{\frac{1}{p}},
\end{align*}
and write $Z \in L_p(R)$ iff $||Z||_{L_p(R)} < \infty$. Notably, for a measurable function $f$ of values of $Z$,
\begin{align*}
    ||f||_{L_p(R_Z)} = \left(\E_R[|f(Z)|^p]\right)^{\frac{1}{p}} = ||f(Z)||_{L_p(R)}.
\end{align*}
We then make the following assumptions.

\begin{assumption} \label{ass:rndL2}
    $\trueweights{P_X}{Q_X}(X) \in L_2(P)$
\end{assumption}
\begin{assumption} \label{ass:yL2}
    $\Y \in L_2(P)$
\end{assumption}

Then, under Assumptions \ref{ass:ac}, \ref{ass:rndL2} and \ref{ass:yL2}, for any weights $w$ wrt $P$ that are in $L_2(P_X)$ and for any measurable mapping $\rep(x)$ of covariates, the bias can be decomposed as 
\begin{align}
  &\text{Bias}_{P,Q}(w) = \E_{P^w}\left[\E_{P}[\Y|X]\right] - \E_{Q}\left[\E_P[\Y|X]\right] \nonumber \\
  &= \underbrace{\E_{P^w}\left[\E_P[\Y|\rep(X)]\right] - \E_Q\left[\E_P[\Y|\rep(X)]\right]}_{\text{Bias wrt the representation}} \nonumber \\
  &+ \underbrace{\E_{P^w}\left[\E_{P}[\Y|X] - \E_P[\Y|\rep(X)]\right]}_{\text{Chosen weights bias}} \nonumber \\
  &+ \underbrace{\E_{Q}\left[\E_P[\Y|\rep(X)] - \E_P[\Y|X]\right]}_{\text{Confounding bias}}.\label{eq:decomposition}
\end{align}




We now explain each term in the RHS. First, if the weights $w(X)$ are a function of the representation $\rep(X)$, the \textit{bias wrt the representation} would be the bias if we replaced $X$ with $\rep(X)$ in the equality of Problem \ref{prob:gwp}. This interpretation still holds for general weights $w(X)$ since, by the tower property applied to $\E_{P^w}[\E_P[\Y|\rep(X)]]$, they can be replaced in this term with $\E_P[w(X)|\rep(X)]$, which is an $L_2(P_X)$ weight function wrt $P$ and is a function of $\rep(X)$.
As in Section \ref{sec:lit_review_weighting}, we can directly bound the bias wrt the representation via an IPM of the form
\begin{align*}
    \text{IPM}_{\mathcal{G}}(P^w_{\rep(X)}, Q_{\rep(X)}), 
\end{align*}
where for example, for a class $\mathcal{M}$ such that $\E_P[\Y | x] \in \mathcal{M}$, the class $\mathcal{G}$ can contain
\begin{align}
      \rep(\mathcal{M}, P) := \{z \mapsto \E_P[m(X) | \rep(X) = z],\ \ m \in \mathcal{M} \}. \label{eq:phiMP}
\end{align}
Second, the \emph{chosen weights bias} measures how much ``chosen'' weights $w(x)$ do not depend on $\rep(x)$. It turns out that this quantity is zero for weights $\hat{w}(x)$ that solve the canonical minimization in Equation \ref{eq:balancingweightsoptim} with the aforementioned IPM; as we show next, these weights only depend on $\rep(x)$.

\begin{proposition}
\label{prop:sols_rep}
Let $\rep(x)$ be a measurable mapping with values in a space $\Rep$.
\begin{enumerate}
    \item Under Assumptions \ref{ass:ac}, \ref{ass:rndL2}, if $\mathcal{G}$ is a class of $L_2(P_{\rep(X)})$ functions on $\Rep$, $\sigma > 0$, there is a unique solution $\hat{w}(x)$ to the problem
\begin{align*}
    \min_{\substack{ w \text{ weight}\\ \text{ function} \\ \text{wrt } P}} \text{IPM}_{\mathcal{G}}(P^w_{\rep(X)}, Q_{\rep(X)})^2 + \sigma^2 \cdot ||w(X)||_{L_2(P)}^2  
\end{align*}
and it is a function of $\rep(x)$ $P_X$-almost surely, i.e. there exists $\bar{w} : \Rep \rightarrow \mathbb{R}$ such that $\hat{w}(x) = \bar{w}(\rep(x))$ for $P_X$-almost every $x$; and $\hat{w}(X) \in L_2(P)$.
    \item Under Assumption \ref{ass:yL2}, for any $L_2(P_X)$ weight function $w(x)$ wrt $P$ that is a function of $\rep(x) \ P_X$-a.s., the chosen weights bias is zero.
\end{enumerate}

\end{proposition}

Finally, the \textbf{confounding bias} is the most important term of this decomposition, as it characterizes the information lost in $\rep(X)$ relative to $X$ --- and thus can be seen as the bias \textit{of} $\rep$, rather than the bias \textit{wrt} $\rep$ that is applied to weights.

When the target is $\E[Y(a)]$, this quantity is the difference between $\E\left[\E[Y \mid A = a, \rep(X)]\right]$ and $\E\left[\E[Y \mid A = a, X]\right]$, measuring how much $\rep(X)$ preserves unconfoundedness \citep{damour2021dsfrfceewwo, melnychuk2023boricbftee}.
More generally, for solution weights $\hat{w}$ of Equation \ref{eq:balancingweightsoptim} with an IPM depending on $\rep(X)$, it is exactly the difference between the biases of $\hat{w}$ wrt original covariates $X$ and their representation $\rep(X)$, as shown by Equation \ref{eq:decomposition}. Thus, if $\hat{w}$ has a small (resp. zero) bias wrt $\rep$, then it will also have a small (resp. zero) bias overall.


To the best of our knowledge, this is the first extension of the confounding bias for the $\E[Y(a)]$ target estimand to more general weighting problems in causal inference. It has a similar form to the \textit{excess target information loss} of \citet{johansson2019saiidir}, which measures the loss of information induced by a representation in domain adaptation. We further provide a characterisation for it that will prove useful.
\begin{proposition} \label{prop:confounding_bias}
Under Assumption \ref{ass:ac}, for any measurable $\rep(x)$, $Q_{\rep(X)}$ is absolutely continuous wrt $P_{\rep(X)}$, with 
\begin{align*}
    \trueweights{P_{\rep(X)}}{Q_{\rep(X)}}(\rep(X)) = \E_P\left[ \trueweights{P_X}{Q_X}(X) \middle| \rep(X) \right] \ \ P\text{-a.s.}
\end{align*}
and under the additional Assumptions \ref{ass:rndL2} and \ref{ass:yL2}, the confounding bias is equal to both
\begin{align}
    \E_P\bigg[&\left(\E_P[\Y|\rep(X)] - \E_P[\Y|X] \right) \nonumber \\
    &\times \left(\trueweights{P_X}{Q_X}(X) - \trueweights{P_{\rep(X)}}{Q_{\rep(X)}}(\rep(X))  \right)\bigg] \label{eq:confounding_bias}
\end{align}
and
\begin{align}
    -\E_P\left[\E_P[\Y|X] \left(\trueweights{P_X}{Q_X}(X) - \trueweights{P_{\rep(X)}}{Q_{\rep(X)}}(\rep(X))  \right) \right] \label{eq:confounding_bias_2}
\end{align}
\end{proposition}

When the confounding bias is zero, $\rep$ is known as a \textit{deconfounding score} \citep{damour2021dsfrfceewwo}, and the overall bias is simply equal to the bias wrt $\rep$. In particular, from Equation \ref{eq:confounding_bias}, the confounding bias will be zero in two special cases :
\begin{itemize}
    \item When $\E_P[\Y|X] = \E_P[\Y|\rep(X)]$ $P$-a.s., that is
    \begin{align*}
    \E_P[\Y|X] = \E_P\left[\E_P[\Y|X] \middle|\rep(X)\right] \ P\text{-a.s.}
    \end{align*}
    from the tower property. This is equivalent to $\E_P[\Y|x]$ being a function of $\rep(x)$ $P_X$-a.s., i.e. $\rep(X)$ a prognostic score \citep{hansen2008tpaotps}.
    \item When $\trueweights{P_X}{Q_X}(X) = \trueweights{P_{\rep(X)}}{Q_{\rep(X)}}(\rep(X)) \ P$-a.s., that is 
    \begin{align*}
    \trueweights{P_X}{Q_X}(X) = \E_P\left[\trueweights{P_X}{Q_X}(X) \middle|\rep(X)\right] \ P\text{-a.s.}
    \end{align*}
    from Proposition \ref{prop:confounding_bias}. This is equivalent to $\trueweights{P_X}{Q_X}(x)$ being a function of $\rep(x)$ $P_X$-a.s., i.e. $\rep(X)$ a balancing score \citep{rosenbaum1983tcrotpsiosfce}.
\end{itemize}
We make a more rigorous connection between the confounding bias and canonical scores from the literature as well as notions from transportability in Appendix \ref{sec:scores}.

Further, the confounding bias and its role in the decomposition of Equation \ref{eq:decomposition} allow us to extend the idea of a deconfounding score to hold approximately, rather than exactly. Indeed, if the confounding bias of $\rep$ is not zero but remains small, then we can expect that a small bias wrt $\rep$ obtained by solving the problem of Proposition \ref{prop:sols_rep} will still yield a small overall bias. This gives us more flexibility than relying on well-specified models, where any guarantee on the bias is lost in case of misspecification. In contrast, the confounding bias directly quantifies the misspecification itself.

Thus, one might wonder whether we can minimize directly said misspecification to find an \textit{approximate} deconfounding score $\rep$. However Equation \ref{eq:confounding_bias} involves ground-truth models we do not have access to like $\E_P[\Y|x]$, $\trueweights{P_X}{Q_X}(x)$ as well as their projections on $\rep(x)$. Further, we do not observe any outcomes at this stage, precluding any estimation of $\E_P[\Y|x]$. To address all of this, note that a direct application of the Cauchy-Schwarz inequality to Equation \ref{eq:confounding_bias_2} yields
\begin{align}
    |\text{Confounding bias}| \leq ||\E_P[\Y|X]||_{L_2(P)} \cdot \text{BSE}_{P,Q}(\rep)
\end{align}
where we further have $||\E_P[\Y|X]||_{L_2(P)} \leq ||\Y||_{L_2(P)}$ from Jensen's inequality, and we call
\begin{align}
\text{BSE}_{P,Q}(\rep) := \bigg|\bigg|\trueweights{P_X}{Q_X}(X) - \trueweights{P_{\rep(X)}}{Q_{\rep(X)}}(\rep(X))\bigg|\bigg|_{L_2(P)}\label{eq:bse_bounds_cb}
\end{align}
the \textbf{balancing score error} (BSE). This name is justified as from Proposition \ref{prop:confounding_bias}, this quantity is equal to
\begin{align*}
    \bigg|\bigg|\trueweights{P_X}{Q_X}(X) - \E_P\left[ \trueweights{P_X}{Q_X}(X) \middle| \rep(X) \right]\bigg|\bigg|_{L_2(P)},
\end{align*}
that is the root mean-squared error between $\trueweights{P_X}{Q_X}(X)$ and its projection on $\rep(X)$, i.e. its best predictor from $\rep(X)$ in $L_2(P)$. In other words, it measures the extent to which $\trueweights{P_X}{Q_X}(x)$ is not a function of $\rep(x)$ $P_X$-a.s., and therefore the extent to which $\rep(x)$ is not a balancing score. Importantly, it does not depend on the pseudo-outcome $\Y$, only on the representation $\rep$ and the covariate marginals $P_X$ and $Q_X$. Note that the confounding bias can be zero and the balancing score error positive, even potentially arbitrarily large, for many representations $\rep(x)$ that contain information on the outcome model $\E_P[\Y|x]$. Concrete examples include prognostic scores from \citet{hansen2008tpaotps}, or the deconfounding scores in the example of Section 5 in \citet{damour2021dsfrfceewwo}. Our setup excludes such representations as it assumes that we do not observe outcomes at this stage. Alternatively, if one had access to outcomes, then, similarly to the balancing score error, one could bound the confounding bias with a ``prognostic score error''.

On the other hand, note that the balancing score error allows us to control the resulting bias with only mild assumptions on the outcome model. We formalize this next.
\begin{proposition} \label{prop:bse}
Under Assumptions \ref{ass:ac}, \ref{ass:rndL2}, \ref{ass:yL2}, for any set $\mathcal{M}$ of $L_2(P_X)$ functions such that $\E_P[\Y | x] \in \mathcal{M}$, for any measurable representation $\rep$, and for any $L_2(P_X)$ weights $w$ wrt $P$ depending on $\rep(x)$ $P_X$-a.s., defining $\rep(\mathcal{M}, P)$ as in Equation \ref{eq:phiMP},
\begin{align*}
    |\text{Bias}_{P,Q}(w)| \leq& \ \text{IPM}_{\rep(\mathcal{M}, P)}(P^w_{\rep(X)}, Q_{\rep(X)})\\
    &+ ||\Y||_{L_2(P)} \cdot \text{BSE}_{P,Q}(\rep).
\end{align*}
\end{proposition}
We note that the bound of Proposition \ref{prop:bse} is ``sharp'' in the sense that when we replace the IPM and the BSE by the (unknown) terms they bound, namely the bias wrt the representation and the confounding bias, the inequality becomes an equality. It further suggests a two-step approach to minimize the overall bias on the LHS. First, learn a representation $\rep$ that minimizes the BSE, i.e. the second term of the RHS. Second, plug this learnt representation $\rep$ into an IPM and find weights minimizing it, or in other words minimizing the first term of the RHS. To learn the representation, the BSE could be used in addition to, or in place of, the traditional likelihood to learn propensity score models. Importantly, we can bypass propensity score estimation altogether and posit more general representations, including multivariate functions. We turn to this in the next sections.%

\subsection{Operationalizing and Minimizing Information Loss}

While we have avoided the need to specify an outcome model $\E_P[\Y|x]$, a key bottleneck remains for the balancing score error: we do not have access to the true weights $\trueweights{P_X}{Q_X}(X)$ or their projection $\trueweights{P_{\rep(X)}}{Q_{\rep(X)}}(\rep(X))$. One possible workaround is to first remove the projection by using the definition of a conditional expectation: for any function $g$ on the image space of $\rep$,
\begin{align} 
\text{BSE}_{P,Q}(\rep) \leq \bigg|\bigg|\trueweights{P_X}{Q_X}(X) - g(\rep(X))\bigg|\bigg|_{L^2(P)}. \label{eq:bse_workaround}
\end{align}
In particular, for $\epsilon>0$, if there exists \textit{any} function $g$ on the image space of $\rep$ such that the RHS of Equation \ref{eq:bse_workaround} is below $\nicefrac{\large \epsilon}{||\Y||_{L_2(P)}}$, then $\rep$ has an absolute confounding bias at most $\epsilon$. This gives us more flexibility than working with the true projection of $\trueweights{P_X}{Q_X}$, and motivates finding a $g$ and $\rep$ minimizing the RHS. 

This approach, however, is insufficient since we still do not have access to $\trueweights{P_X}{Q_X}$. A key result from the covariate shift literature \citep{kanamori2009alsatdie}, notably exploited in the AutoDML literature \citep{chernozhukov2022rafadmlwnnarf, chernozhukov2022admlocase}, helps us remove $\trueweights{P_X}{Q_X}$ from the minimization entirely : for any distributions $P,Q$ verifying Assumption \ref{ass:ac}, and for any function $v$, $\big|\big|\trueweights{P_X}{Q_X}(X) - v(X)\big|\big|^2_{L^2(P)}$ is equal to $\mathcal{L}_{P,Q}(v)$ up to an additive constant wrt $v$, where we refer to
\begin{align*}
\mathcal{L}_{P,Q}(v) := \E_P[v(X)^2] - 2 \cdot \E_Q[v(X)]
\end{align*}
as the \textit{AutoDML loss}. In particular, $\mathcal{L}_{P,Q}(v)$ can be estimated in finite samples for any known $v$, as 
\begin{align*}
    &\mathcal{L}_{\mathcal{P}, \mathcal{Q}}(v) = \frac{1}{|\mathcal{P}|}\sum_{i \in \mathcal{P}} v(X_i)^2 - \frac{2}{|\mathcal{Q}|}\sum_{i \in \mathcal{Q}} v(X_i)
\end{align*}
This motivates an approach to \textit{learn a representation} $\rep$. We posit a parameterized representation $\rep(x; \theta_\rep)$ with values in a space $\Rep$, and a scalar parameterized function $g(.; \theta_g)$ on $\Rep$. Then we minimize $\mathcal{L}_{\mathcal{P},\mathcal{Q}}(g(\rep(.; \theta_\rep); \theta_g))$ wrt $\theta_\rep, \theta_g$. Due to the compositionality of neural networks, we parameterize $g$ and $\rep$ jointly in a neural network which is plugged into the AutoDML loss, similarly to the Riesz representer component of RieszNet \citep{chernozhukov2022rafadmlwnnarf}, and where a pre-specified, potentially low-dimensional hidden layer is later used as the representation $\rep$  \citep{clivio2022nsmfhdci}. This is illustrated in Figure \ref{fig:intuition}. Unlike RieszNet, we do not use any outcome information and we do not use the final Riesz representer head as the solution weight function, but instead plug the representation into a probability distance to obtain such a solution, as we will see shortly. We later show that this yields lower biases in our experiments.



\begin{figure}[t]
\makebox[\columnwidth][c]{
\scalebox{1.0}{
    \begin{neuralnetwork}[height=4, layerspacing=17mm]
    \newcommand{\xnn}[2]{$x_#2$}
    \newcommand{\ynn}[2]{$\hat{y}_#2$}
    \newcommand{\hfirst}[2]{\small $h^{(1)}_#2$}
    \newcommand{\hsecond}[2]{\small $h^{(2)}_#2$}
    \newcommand{\hthird}[2]{\small $h^{(3)}_#2$}
    \inputlayer[count=3, bias=false, title=\small $x$, text=\xnn]
    \hiddenlayer[count=4, bias=false, title=\small Hidden\\layer, text=\hfirst] \linklayers
    \hiddenlayer[count=2, bias=false, title=\small $\rep(x)$, text=\hsecond] \linklayers
    \hiddenlayer[count=4, bias=false, title=\small Hidden\\layer, text=\hthird] \linklayers
    \outputlayer[count=1, title=\small  $g(\rep(x))$, text=\ynn] \linklayers
\end{neuralnetwork}
}
}
    \caption{Neural Network to Learn a Representation $\rep$.}
    \label{fig:intuition}
\end{figure}


\subsection{Extension to Simultaneous Weightings}
\label{sec:simultaneous}
In ATE estimation, one aims at estimating all $\mu(a) := \E[Y(a)]$ for all $a \in \mathcal{A}$ simultaneously. This can be done \citep{martinet2020abwffetceogt} by finding a function $f(a)$ minimizing
\begin{align*}
    \E[(\mu(A) - f(A))^2]
\end{align*}
over functions $f$ defined by
\begin{align*}
    f(a) = \E[w_a(X)\E[Y | X, A=a] \ | \ A=a]
\end{align*} where $w_a(X)$ is a weight function wrt $\Pdata_{X|A=a}$. This is equivalent to minimizing
\begin{align*}
    \E[\text{Bias}^2_{\Pdata(.|A),\Pdata(.)}(w_A)],
\end{align*}
which is a special case of minimizing the \textbf{joint squared bias}
\begin{align*} 
    \text{Bias}^2_{P^\Lambda,Q^\Lambda,p_\Lambda}(w^\Lambda) := \E_{p_\Lambda(\alpha)}[\text{Bias}^2_{P^\alpha,Q^\alpha}(w^\alpha)]
\end{align*}
where $\alpha$ belongs to a set $\Lambda$ endowed with a probability distribution $p_\Lambda(\alpha)$, $h^\Lambda := (h^\alpha)_{\alpha \in \Lambda}$ for any $h$, and $P^\alpha, Q^\alpha, w^\alpha$ are a source distribution, a target distribution, a weight function indexed by $\alpha \in \Lambda$, respectively. The following corollary extends previous results on the balancing score error to the setting of simultaneous weighting problems.
\begin{corollary}
\label{prop:bse_multiple}
Let $\Lambda$ be a set endowed with a distribution $p_\Lambda(\alpha)$, $P^\Lambda,Q^\Lambda$ be mappings from $\Lambda$ to distributions such that for any $\alpha \in \Lambda$, $P^\alpha, Q^\alpha$ satisfy Assumptions \ref{ass:ac}, \ref{ass:rndL2}, \ref{ass:yL2}. Then for any $\mathcal{M}^\Lambda$ such that $\forall \alpha \in \Lambda, \ \ \E_{P^\alpha}[\Y | x] \in \mathcal{M}^\alpha$ where $\mathcal{M}^\alpha$ is a set of $L_2(P^\alpha_X)$ functions, for any mapping $\rep^\Lambda$ from $\Lambda$ to measurable representations, for any $w^\Lambda$ such that each $w^\alpha(x)$ is an $L_2(P^\alpha_X)$ weight function wrt $P^\alpha$ depending on $\rep^\alpha(x)$,
\begin{align*}
    &\frac{1}{2} \cdot \text{Bias}^2_{P^\Lambda,Q^\Lambda,p_\Lambda}(w^\Lambda)\\
    &\leq \E_{p_\Lambda(\alpha)}\Big[\text{IPM}^2_{\rep^\alpha(\mathcal{M}^\alpha, P^\alpha)}(P^{\alpha, w^\alpha}_{\rep^\alpha(X)}, Q^{\alpha}_{\rep^\alpha(X)})\Big] \\
    &\ \ \ \ + \left(\sup_{\alpha \in \Lambda}||\Y||^2_{L_2(P^\alpha)}\right) \cdot \text{BSE}^2_{P^\Lambda,Q^\Lambda,p_\Lambda}(\rep^\Lambda).
\end{align*}
where we call
\begin{align*}
    \text{BSE}^2_{P^\Lambda,Q^\Lambda,p_\Lambda}(\rep^\Lambda) := \E_{p_\Lambda(\alpha)}[\text{BSE}^2_{P^\alpha,Q^\alpha}(\rep^\alpha)]
\end{align*}
the \textbf{joint squared balancing score error}.
\end{corollary}

We also note that this framework is identical to Problem \ref{prob:gwp} when $\Lambda$ is of cardinality 1. Finally, we can extend the previous section to simultaneous weights, where we now find an indexed representation $\rep^\Lambda$ that minimizes the joint squared balancing score error. We do so by first positing a parameterized representation $\rep(x, \alpha; \theta_\rep)$ belonging to some space $\Rep$ and a scalar parameterized function $g(\repval, \alpha; \theta_g)$ on the $\Rep \times \Lambda$ space, and then minimizing 
\begin{align*}
 \mathcal{L}_{\mathcal{P}^\Lambda,\mathcal{Q}^\Lambda}^{g, \rep, p_\Lambda}( \theta) = \E_{p_\Lambda(\alpha)}\Big[\mathcal{L}_{\mathcal{P}^\alpha,\mathcal{Q}^\alpha}(g(\rep(., \alpha; \theta_\rep), \alpha ; \theta_g))\Big]
\end{align*}
wrt $\theta_\rep, \theta_g$, where $\mathcal{P}^\alpha,\mathcal{Q}^\alpha$ are samples from $P^\alpha,Q^\alpha$.

If desired, we can separate the problem of minimizing the joint squared bias into independent weighting problems, minimizing each individual bias separately, especially when $\Lambda$ is finite with few elements. However, we can also share parameters or dependencies between individual problems, e.g. use the same representation for all problems, i.e. $\rep^\alpha := \rep$ for some $\rep$ for all $\alpha \in \Lambda$, or share parameters $\theta_\rep, \theta_g$ between problems $\alpha \in \Lambda$, notably when there are few samples for every $\mathcal{P}^\alpha$ or $\mathcal{Q}^\alpha$ as in ATE estimation with high-cardinality $\mathcal{A}$.

For completeness, we now give examples of $\Lambda, \mathcal{P}^\Lambda, \mathcal{Q}^\Lambda$. In ATE estimation, we have access to samples $\{(x_i, a_i, y_i)\}_{i = 1, \cdots, n}$ of $\Pdata(X,A,Y)$. Then, $\Lambda = \mathcal{A}$ and for each $\alpha = a \in \mathcal{A}$, $\mathcal{P}^a = \{(x_i,y_i)\}_{i : a_i = a}$, $\mathcal{Q}^a = \mathcal{Q}^0 := \{x_i\}_{i = 1, \cdots, n}$. In ATT estimation, where $\mathcal{A}$ is binary, $\Lambda = \{0\}$, $\mathcal{P}^0 = \{(x_i,y_i)\}_{i : a_i = 0}$, $\mathcal{Q}^0 = \{x_i\}_{i : a_i = 1}$. In transportability, we have access to samples $\{(x_i,a_i,y_i)\}_{i = 1, \cdots, n}$ of the RCT distribution $\Pdata(X,A,Y|S=1)$, samples $\{(x_i)\}_{i = n+1, \cdots, n+m}$ of some observational data distribution $\Pdata(X|S=0)$, and $\pi = \Pdata(A=1|S=1)$, so $\Lambda = \{0\}$, $\mathcal{P}^0 = \{(x_i, \tilde{y}_i = \frac{a_iy_i}{\pi} - \frac{(1-a_i)y_i}{1-\pi})\}_{i = 1, \cdots, n}$, $\mathcal{Q}^0 = \{x_i\}_{i = n+1, \cdots, n+m}$.




\subsection{Weighting and Algorithm}

Learning a representation by minimizing a bound of the BSE helped us minimize the second term of the RHS of Proposition \ref{prop:bse}. We now turn to minimizing the first term, that is  \textbf{finding weights}. In finite samples, we aim to find discrete weights $w_i = w(X_i)$ for $i \in \mathcal{P}$, with constraints $\forall i \in \mathcal{P}, \  w_i \geq 0$ and $\frac{1}{|\mathcal{P}|}\sum_{i \in \mathcal{P}} w_i = 1$.

In line with Proposition \ref{prop:sols_rep}, we would ideally obtain $\hat{w}$ by solving Equation \ref{eq:balancingweightsoptim} with $\text{IPM}_{\rep(\mathcal{M}, P)}(\mathcal{P}^w_{\rep(X)}, \mathcal{Q}_{\rep(X)})$ where $\mathcal{P}^w$ is the empirical distribution over $\mathcal{P}$ with probabilities $\nicefrac{w_i}{|\mathcal{P}|}$. However, as $\mathcal{M}$ is unknown, $\text{IPM}_{\rep(\mathcal{M}, P)}$ will remain unknown. Proposition 9 of \citet{clivio2022nsmfhdci} suggests that if $\mathcal{M}$ is the set of $L$-Lipschitz functions and $\rep(x)$ is a neural network with invertible and bi-Lipschitz activation functions, then $\rep(\mathcal{M},P)$ is contained in the class of $L'$-Lipschitz functions for some $L'$ that depends on the weights and bi-Lipschitz constant of $\rep$ and might be significantly larger than $L$.

For computational simplicity, we work with a canonical IPM and choose the maximum mean discrepancy (MMD) wrt some kernel $k$ \citep{gretton2012aktst}, following common practice in the literature \citep{kallus2020gommfci, huling2024ebocd}. More generally, minimizing such an MMD under the above weight constraints is referred to as \textit{kernel optimal matching} (KOM) with simplex weights \citep{kallus2020gommfci} in causal inference, where we change the setting from treated and control distributions to source and target distributions, or empirical \textit{kernel mean matching} (KMM) \citep{huang2006cssbbud} in covariate shift, where we add $L_2$ regularization. This minimization amounts to solving a quadratic program (QP) with linear constraints, which can be done using any off-the-shelf QP solver. The $\sigma^2$ hyperparameter for regularization can either be set to a fixed value (e.g. $0$ as in \citet{huling2024ebocd}) or selected by a principled procedure \citep{kallus2020gommfci}. In the case of simultaneous weightings, this procedure can be repeated for each problem indexed $\alpha = 1, \cdots, \ell$. Our exact implementation of kernel optimal/mean matching for this purpose is given in Appendix \ref{sec:kom}.




\begin{algorithm}[t]
\caption{Representation Learning and Weighting.}
\label{algo:joint}
\hline \\[1ex]
\textbf{Input :} Distribution $p_\Lambda(\alpha)$ over $\alpha \in \Lambda$, model $g(\rep(., \alpha; \theta_\rep), \alpha ; \theta_g)$, for each $\alpha$: samples $\mathcal{P}^\alpha, \mathcal{Q}^\alpha$, kernel $k^\alpha$, hyperparameter $\sigma^\alpha \geq 0$. \ \\[0.5ex]
\hline \\[0.5ex]
Initialize $\theta := (\theta_\rep, \theta_g)$\; 
\While{$\theta$ not converged}{
    Move $\theta$ in direction $-\nabla_\theta \mathcal{L}_{\mathcal{P}^\Lambda,\mathcal{Q}^\Lambda}^{g, \rep, p_\Lambda}( \theta)$\;
}
\For{$\alpha \in \Lambda$}{
    $\rep^\alpha(x) \gets \rep(x, \alpha; \theta_\rep)$\;
    $\tilde{k}^\alpha(x,x') \gets k^\alpha(\rep^\alpha(x), \rep^\alpha(x'))$\;
    $\hat{w}^\alpha \gets$ kernel optimal matching with simplex weights, kernel $\tilde{k}^\alpha$ and regularization hyperparameter $(\sigma^\alpha)^2$ \;
}
\KwResult{$\hat{w}^\Lambda$}\\[1ex]
\hline \\[2ex]

\end{algorithm}

We summarize all the previous steps in Algorithm \ref{algo:joint}.  Each component $\hat{w}^\alpha$ of its result $\hat{w}^\Lambda$ is then plugged in an estimator $\hat{\tau}^\alpha_{\hat{w}^\alpha}$ of $\E_{\mathcal{P}^\alpha_{\hat{w}^\alpha}}[\E_{P^\alpha}[\Y|X]]$ as
\begin{align*}
    \hat{\tau}^\alpha_{\hat{w}^\alpha} = \frac{1}{|\mathcal{P}^\alpha|} \sum_{i \in \mathcal{P}^\alpha} \hat{w}^\alpha_i \Y_i.
\end{align*}
This estimator could be analyzed theoretically (e.g. for consistency, error rates, ...) by inspecting, for each $\alpha \in \Lambda$, two separate terms : (i) the confounding bias of the learnt representation $\rep^\alpha$, and (ii) the difference between the estimator and the representation-wise estimand $\E_{Q^\alpha}[\E_{P^\alpha}[\Y|{\hat{\rep}}^\alpha(X)]]$. As the representation is learnt using the same loss as Equation 2.6 of \citet{chernozhukov2024admlvrr} and the confounding bias of the learnt representation is bounded by its balancing score error, itself bounded by the Riesz representer error in Theorem 2.1 of \citet{chernozhukov2024admlvrr}, we can resort to their results. Then, the difference between estimator and representation-wise estimand can be analyzed using previous work on analysis of KOM or KMM, such as \citet{kallus2020gommfci} or \citet{yu2012aokmmucs}.

\section{Related Work}

\textbf{Generalization bounds.} An adjacent line of work to ours is generalization bounds in domain adaptation, where one aims to bound the risk of a model on a target domain using the risk on a source domain. Typically, this involves a representation and the bound includes an IPM analogous to ours \citep{bendavid2006aorfda, zhao2017msdawatonn, zhao2018amsda, li2023mufrfdg}. In extensions of such bounds to causal inference, a counterfactual risk is bounded using a factual risk and an IPM as before, but the representation is usually assumed to be invertible \citep{shalit2017eitegbaa, bellot2022gbaafecateod, johansson2022gbarlfeopoace, kazemi2024abrfctee}, precluding the study of misspecified or confounded representations. Thus, usually no term quantifying the ``misspecification'' of the representation is added. Notable exceptions are \citet{johansson2019saiidir} in domain generalization and \citet{curth2021slhteftted} in causal inference, which include an \textit{information loss} without actively trying to minimize it. The information loss from \citet{johansson2019saiidir} can be shown to be identical to our confounding bias with the outcome replaced by the loss function. A balancing score error analogous to ours will bound this information loss if the loss function is bounded above by a constant, and our AutoDML loss-based approach can be used too.

\textbf{Confounding bias, balancing score error.} \citet{damour2021dsfrfceewwo} also define a confounding bias and their Proposition 2 can be shown to be a special case of our Proposition \ref{prop:confounding_bias} for ATE estimation, which they only compute in a restricted case with Gaussian covariates and generalized linear models for outcome and propensity models. \citet{melnychuk2023boricbftee} define a conditional confounding bias and estimate bounds of it for a \textit{fixed} representation instead of learning it using their bounds, which does not seem trivial as their estimation relies on two different neural network fitting steps \textit{after} fitting the representation. \citet{clivio2022nsmfhdci} provide an alternative error on how much the representation is not a balancing score but they mention that it is difficult to compute and do not use it to learn the representation, which relies on assuming a propensity score model. Further, note that approaches to sensitivity analysis generally derive or bound the confounding bias induced by not including unobserved confounders in the adjustment set \citep{imbens2003steaipe, tan2006adafciups, hartman2024safsw}, although this is done by making assumptions on the relationship between unobserved confounders and other variables in the data generating process; in contrast, the aforementioned methods and our work pertain to a setting without such unobserved confounders.


\textbf{Learning representations for treatment effects. } For weighting, besides points developed in Section \ref{sec:background_reps}, DeepMatch \citep{kallus2020dbdcrfciuat} requires a grid search involving multiple neural network trainings (50 in the experiments) and other models \citep{averitt2020tccgfcciohd, kitazawa2022gbwvdnn} take an $f$-divergence as the discrepancy measure but do not provide bounds on the bias, which is likely inherent to the non-intersection of IPMs and $f$-divergences \citep{sriperumbudur2012oteeoipm}. Other methods learn representations using outcome regression, alone or with weights \citep{shalit2017eitegbaa, johansson2022gbarlfeopoace, chernozhukov2022rafadmlwnnarf}.

\textbf{Outcome-based weights and representations.} Some methods use outcomes to derive (i) the outcome function class $\mathcal{M}$, e.g. as a confidence interval around a regressed outcome model as in \citet{wainstein2022tfb}; (ii) the representation $\rep$ as in the canonical prognostic scores \citep{hansen2008tpaotps} or more recent and more general deconfounding scores \citep{damour2021dsfrfceewwo}; or (iii) the weights more generally, e.g. by directly estimating the density ratio between the source and target distributions of the outcome \citep{taufiq2024mdrfopeicb}. Finally, many standard outcome modeling approaches, such as (kernel ridge) regression, are implicitly weighting estimators, so one could use such approaches to derive weights; see, for example, \citet{brunssmith2023abwalr}.

\section{Numerical Results}



%
We now evaluate our method and alternatives on the IHDP \citep{hill2011bnmfci} and News datasets \citep{johansson2016lrfci} for ATE estimation and a Traumatic Brain Injury (TBI) dataset \citep{colnet2022rtrfgfseavs} for transportability. 

For weighting, we focus on KOM for two kernels, 1) the \emph{energy distance} kernel $k(x,x') = -||x - x'||_2$; KOM with this kernel is known as \emph{energy balancing} \citep{huling2024ebocd}; 2) the linear kernel $k(x,x') = x^Tx'$. We evaluate these two methods with original covariates (``Energy'' and ``Linear''), a representation learned according to our approach (``Ours + Energy'' and ``Ours + Linear''), one through the canonical Principal Component Analysis (PCA,  \citet{hotelling1933aoacosvipc}) approach (``PCA + Energy'' and ``PCA + Linear''), the propensity score model vector ($(\hat{p}(a|x))_{a \in \mathcal{A}}$ for ATE estimation, $(\hat{p}(s|x))_{s = 0,1}$ for transportability) learnt with a gradient boosting classifier (``PS + Energy'' and ``PS + Linear''), and representations from a layer of a neural network model of such propensity score models as in neural score matching (NSM, \citet{clivio2022nsmfhdci}) (``NSM + Energy'' and ``NSM + Linear''). We also check IPW with the same propensity scores (``IPW''), entropy balancing with first-order moments (``Entropy''), the weights head of the neural network used to train our representation (``NN Head''), and uniform weights (``Unweighted''). Weights from ``IPW'' and ``NN Head'' were normalized to prevent outsize errors, while those from other methods were already normalized by design.

 On energy balancing or linear kernel methods, we take $\sigma^\alpha = 0.01$. KOM was performed using the \texttt{osqp} library in Python \citep{stellato2020oaossfqp}, in line with the implementation of energy balancing in the \texttt{weightit} package \citep{greifer2024weightit}. All representations are 10-dimensional, and we always use a common representation for all treatment arms. The neural network first has a 200-unit layer, a 10-unit layer corresponding to the representation, a second 200-unit layer, and finally the scalar head. Neural network implementation was performed in PyTorch \citep{paszke2019paishpdll}. Adam \citep{kingma2014aamfso} was used to optimize the loss with a learning rate of $0.01$ and early stopping with a patience of 3 epochs, and all other hyperparameters at their default PyTorch values. We average results over 50 random seeds for IHDP and News, 100 for TBI. We show standard errors in parentheses.


\begin{table}[h]
  \caption{Joint Bias on the IHDP, News and TBI Datasets}
  \label{tab:bias}
  \centering
  \begin{tabular}{lccc}
    \toprule
    Method     & IHDP & News     & TBI \\
    \midrule
    Ours + Energy & \makecell{0.079\\(0.011)} & \makecell{0.128\\(0.014)} & \makecell{5.00\\(0.37)} \\
    \midrule
    NSM + Energy & \makecell{0.167\\(0.040)} & \makecell{0.070\\(0.013)} & \makecell{5.40\\(0.53)} \\
    \midrule
    PS + Energy & \makecell{0.096\\(0.012)} &  \makecell{0.381\\(0.026)} & \makecell{7.79\\(0.53)} \\
    \midrule
    PCA + Energy &\makecell{0.080\\(0.014)} &  \makecell{0.314\\(0.020)} & \makecell{10.65\\(0.82)} \\
    \midrule
    Energy & \makecell{0.078\\(0.014)} & \makecell{0.397\\(0.027)} & \makecell{10.69\\(0.83)} \\
    \midrule
    Ours + Linear & \makecell{0.087\\(0.009)} & \makecell{0.122 \\(0.013)} & \makecell{18.50\\(1.61)} \\
    \midrule
    NSM + Linear & \makecell{0.183\\(0.043)} & \makecell{0.113\\(0.018)} & \makecell{19.20\\(1.71)} \\
    \midrule
    PS + Linear & \makecell{0.105\\(0.017)} &  \makecell{0.499\\(0.036)} & \makecell{13.22\\(1.15)} \\
    \midrule
    PCA + Linear &\makecell{0.077\\(0.013)} &  \makecell{0.321\\(0.023)} & \makecell{63.86\\(2.29)} \\
    \midrule
    Linear & \makecell{0.076\\(0.011)} & \makecell{0.168\\(0.011)} & \makecell{22.71\\(1.75)} \\
    \midrule
    Entropy & \makecell{0.087\\(0.013)} & \makecell{0.221\\(0.020)} & \makecell{7.63\\(0.60)} \\
    \midrule
    IPW & \makecell{0.114\\(0.024)} & \makecell{0.280\\(0.018)} & \makecell{2.28\\(0.18)} \\
    \midrule
    NN Head & \makecell{0.181\\(0.031)} & \makecell{0.746\\(0.121)} & \makecell{59.71\\(2.52)} \\
    \midrule
    Unweighted & \makecell{0.195\\(0.050)} & \makecell{0.611\\(0.053)} & \makecell{7.67\\(0.15)} \\
    \bottomrule
  \end{tabular}
\end{table}

As a metric, we consider the joint bias (JB), which is the square-root of the joint squared bias where the target estimand is replaced by the average (known) outcome model over the empirical target distribution,
\begin{align*}
    \sqrt{\sum_{\alpha \in \Lambda} p_\Lambda(\alpha) \left( \frac{\sum_{i \in \mathcal{P}^\alpha} w^\alpha_i\Y_i}{|\mathcal{P}^\alpha|} - \frac{\sum_{i \in \mathcal{Q}^\alpha} \E_{\mathcal{P}^\alpha}[\Y|x_i]}{|\mathcal{Q}^\alpha|}  \right)^2}.
\end{align*}
Results are shown in Table \ref{tab:bias}. For either energy or linear KOM, our representation typically outperforms all other representations; exceptions are NSM on News for both kernels, the propensity score on TBI with the linear kernel, and, on IHDP, original covariates for both kernels and PCA for the linear kernel. It further outperforms baselines not relying on KOM on IHDP and News for both kernels and on TBI for the energy balancing kernel, except entropy balancing for the linear kernel on IHDP and IPW on TBI. We note that the linear kernel yields generally degraded performance on TBI compared to the energy balancing kernel, but not on other datasets. On IHDP, each KOM method performs better using original covariates than using a representation, which suggests that dimensionality reduction in any form is not necessarily beneficial on such a dataset where 16 out of 25 covariates are binary. Notably, on all datasets, using our representation with any KOM outperforms the Riesz representer head of the same neural network used to train the representation. Further, on the 3477-dimensional News dataset, energy balancing was significantly sped up when using a lower-dimensional representation instead of original covariates.

On TBI, high biases are due to a wide range of pseudo-outcomes (e.g. from $-8.37$ to $174.18$, with a target estimand at $56.89$ on seed $5$), and the highest biases to weights with most of their mass on a single point with a pseudo-outcome far away from the target estimand (e.g. $97\%$ of the mass on a pseudo-outcome of $145.40$ for NN Head, compared to at most $5\%$ on a pseudo-outcome of $117.97$ for entropy balancing, still on seed $5$).



\section{Limitations and Conclusion}

We have shown the importance of the confounding bias and the balancing score error (BSE) in learning representations for weighting, and have outlined a method to minimize the BSE. Experimental results suggest that representations obtained from the method might help improve performance for common optimization-based weighting approaches. The method could notably be applied to multimodal data involving tabular, text and image covariates \citep{klaassen2024deocewmd}.

One concern could be that the functions $g,\rep$ are generally not uniquely identifiable by minimizing the AutoDML loss. Without restrictions, many different $(g, \phi)$ tuples will indeed share the same value of the AutoDML loss, e.g. any $(g_h, \phi_h) = (g \circ h, h^{-1}(\phi))$ where $h$ is invertible. However, restricting $g$ and $\phi$ to be components of a neural network with a given architecture will exclude many possible invertible $h$'s. Some $h$'s will remain though, such as $h(z) = \lambda \odot z$ where $\odot$ is the Hadamard product and $\lambda_i \neq 0 \ \forall i$, which means that the returned $\phi$ might have arbitrary amplitude or smoothness. A workaround could be to add some regularization of $\phi$ to the AutoDML loss, e.g. through weight decay. We do not perform weight decay and still obtain competitive performance in our experiments, which suggests that Adam optimization might choose an appropriate $\phi$ in practice. 

Directions for future work to address limitations of our current approach include: (1) check whether such quantification of the representation's quality can also be done for augmented estimators, (2) evaluate the currently unknown gaps between the confounding bias and the BSE, and between the BSE and the AutoDML objective ; the latter provides a worst-case error but can be overly conservative ; (3) characterize the function class of the projection of the outcome model on the representation, depending on the class of the original outcome model or that of the representation, instead of assuming a canonical RKHS as we do now ; (4) develop a more thorough theoretical analysis of the estimator than the strategy presented in this paper.


\begin{acknowledgements}

We sincerely thank David Bruns-Smith, Sam Pimentel, Erin Hartman and anonymous reviewers for valuable feedback. O.C. was supported by the EPSRC Centre for Doctoral Training in Modern Statistics and Statistical Machine Learning (EP/S023151/1). A.F. was supported in part by the Institute of Education Sciences, U.S. Department of Education, through Grant R305D200010. C.H. was supported by the EPSRC Bayes4Health programme grant and The Alan Turing Institute, UK.

\end{acknowledgements}

\newpage


\bibliography{uai2024cameraready/references_uai_camera_ready}

\newpage

\onecolumn

\title{\ourtitle\\(Supplementary Material)}
\maketitle

\appendix


\section{Details on Problems in Causal Inference}
\label{suppl:problems}

Under the assumptions of \textit{no interference} and \textit{consistency}, $A = a$ implies $Y = Y(a)$, which can be written as $Y = \sum_{a \in \mathcal{A}} 1_{\{A=a\}} Y(a)$ or, more compactly, $Y = Y(A)$. Further, under \textit{unconfoundedness} and \textit{overlap} we have that $\E[Y(a) | X] = \E[Y | A=a, X]$, helping identify causal effects of interest which we detail below.

In \textbf{ATT estimation} \citep{benmichael2021tbaici}, we are interested in the effect of a binary treatment on the population receiving it, that is $\E[Y(1) - Y(0) | A=1]$. Thanks to consistency and no interference, $\E[Y(1) | A=1]$ is accessible as the average of outcomes on the treated distribution, so the challenging part is estimating $\E[Y(0) | A=1]$. The weighting approach is then to reweight the control distribution, on which $Y(0) = Y$, that is to find a function $w(x)$ such that 
\begin{align*}    
\E[Y(0) | A=1] = \E[w(X)Y | A=0] \approx \frac{1}{|\{i : A_i = 0\}|}\sum_{i : A_i = 0} w(X_i)Y_i.
\end{align*}
In \textbf{average potential outcome estimation} \citep{huling2024ebocd}, for a fixed $a \in \mathcal{A}$, we are interested in the marginal effect of the potential outcome wrt $a$, that is $\E[Y(a)]$. The weighting approach is then to reweight the distribution of the population for which $A = a$, implying $Y(a) = Y$, i.e. find a function $w_a(x)$ such that
\begin{align*}
\E[Y(a)] = \E[w_a(X)Y | A=a] \approx \frac{1}{|\{i : A_i = a\}|}\sum_{i : A_i = a} w_a(X_i)Y_i.
\end{align*}
We note that the closely related goal of \textbf{ATE estimation}, that is when $\mathcal{A}$ is binary and we want $\E[Y(1) - Y(0)]$, can be solved by average potential outcome estimation for both $a = 1$ and $a = 0$ separately. With some abuse of notation, we use the two names of average potential outcome estimation and ATE estimation interchangeably.

In \textbf{generalizability} and  \textbf{transportability} \citep{colnet2022rtrfgfseavs, degtiar2023arogat}, $A$ is binary again and we have another binary variable $S$ such that $S = 1$ denotes membership in the RCT population, that is $A \indep X | S=1$ and $(Y(1),Y(0)) \indep A | S=1$. We are interested in $\E[Y(1) - Y(0)]$ for generalizability and $\E[Y(1) - Y(0) | S=0]$ for transportability. What motivates weighting here is that we do not have access to $A,Y$ when $S=0$. Under the \textit{transportability assumption}, the \textit{conditional average treatment effect} is identical between RCT and non-RCT populations, i.e. for any $x$, $\text{CATE}(x) := \E[Y(1) - Y(0) | x]$ is equal to both $\E[Y(1) - Y(0) | x, S=1]$ and $\E[Y(1) - Y(0) | x, S=0]$. In addition, the CATE is identified on the RCT population as $\text{CATE}(x) = \E\left[ \frac{AY}{P(A=1|S=1)} - \frac{(1-A)Y}{P(A=0|S=1)} \middle | X=x, S=1\right]$. Then, defining $\pi = P(A=1|S=1)$, the weighting approach is to reweight the distribution of the RCT population, i.e. find weights $w$ such that
\begin{align*}
\E[Y(1) - Y(0)] = \E[w(X) \cdot \text{CATE}(X) | S=1] \approx \frac{1}{|\{i : S_i = 1\} |} \sum_{i : S_i = 1} w(X_i)\left(\frac{A_iY_i}{\pi} - \frac{(1-A_i)Y_i}{1 - \pi}\right)
\end{align*}
in generalizability or such that
\begin{align*}
\E[Y(1) - Y(0) | S=0] = \E[w(X) \cdot \text{CATE}(X) | S=1] \approx \frac{1}{|\{i : S_i = 1\} |} \sum_{i : S_i = 1} w(X_i)\left(\frac{A_iY_i}{\pi} - \frac{(1-A_i)Y_i}{1 - \pi}\right)
\end{align*}
in transportability. Due to the similarity of both frameworks, without loss of generality, we focus on transportability as in \citet{colnet2022rtrfgfseavs} and \citet{egami2021csfgeratalsdpiu} which study variable selection in this setting.

Note that the framework of Problem \ref{prob:gwp} generally does not allow for \textbf{CATE estimation}, as the CATE is a function and the target $\E_Q[\E_P[\Y|X]]$ is a scalar. Alternatively, one can perform simultaneous weightings as in Section \ref{sec:simultaneous}, where for every problem we fix a covariate value $x_0$ and a treatment value $a$ and take the target estimand to be $\E[Y|A=a, X=x_0]$. We can take the pseudo-outcome to be $Y$, the source distribution to be $\Pdata(X,Y|A=a)$ and the target distribution to be $\Pdata(X|X=x_0)$. However, this choice of target distribution would be a spike at $X=x_0$, potentially violating Assumption \ref{ass:ac} in many widely-applicable situations, e.g. if the source distribution of covariates has a density wrt the Lebesgue measure. As in \citet{benmichael2021tbaici}, such a problem with spikes could be mitigated with smoothing. If Assumption \ref{ass:ac} does hold, e.g. if the source distribution of covariates is discrete and has a non-zero mass at $x_0$, then it could actually be possible to perform weighting, although we are not aware of such an approach in the previous literature.

\section{Generalization of Former ``Score'' Notions}
\label{sec:scores}

More rigorously, the confounding bias is zero in three important cases:
\begin{enumerate}
\item $\E_P[\Y|x]$ is a function of $\rep(x)$ $P_{X}-$a.s., where we call $\rep(x)$ a ``generalized prognostic score'' ;
\item $\trueweights{P}{Q}(x)$ is a function of $\rep(x)$  $P_{X}-$a.s., where we call $\rep(x)$ a ``generalized balancing score'' ;
\item The confounding bias is zero without $\rep$ necessarily being a generalized prognostic or balancing score, where we call $\rep(x)$ a ``generalized deconfounding score''.
\end{enumerate}
The following result connects these notions to previous literature.


\begin{proposition}
\label{prop:scores}
In ATT/ATE estimation, a) balancing scores \citep{rosenbaum1983tcrotpsiosfce} are equivalent to generalized balancing scores . In ATE estimation, b) deconfounding scores \citep{damour2021dsfrfceewwo} are equivalent to generalized deconfounding scores, c) prognostic scores \citep{hansen2008tpaotps} are generalized prognostic scores, and the converse is true if
\begin{align*}
\forall a \in \mathcal{A}, Y(a) \indep X \ | \ \E[Y|X,A=a].
\end{align*}
In transportability \citep{egami2021csfgeratalsdpiu}, assuming that the transportability assumption holds for $X$, d) heterogeneity sets are generalized prognostic scores, e) sampling sets are generalized balancing scores, f) separating sets are generalized deconfounding scores.
\end{proposition} 



Thus, these ``generalized'' scores extend existing notions of prognostic, balancing and deconfounding scores from the literature to the more general framework from Problem \ref{prob:gwp} and connect them to the confounding bias, refining our understanding of why these scores are well-suited for weighting. They also connect notions used for variable selection in transportability  to the score notions from weighting for the ATT or the ATE.

We might say that the generalized notions clearly outline the ``proper'' definitions of their original counterparts, in a sense that they are either equivalent to them, or weaker than them while preserving properties required for deconfounding, as illustrated by generalized prognostic scores.
Hence, for the remainder of the paper, we omit the ``generalized'' adjective from all notions of scores.


\section{Representation Selection}
\label{sec:repselection}

To \textbf{select between two representations} $\rep_1$ and $\rep_2$, one can choose the representation with the lowest BSE. This is equivalent to comparing $\min_{g_1} \mathcal{L}_{P,Q}(g_1(\rep_1(.)))$ and $\min_{g_2} \mathcal{L}_{P,Q}(g_2(\rep_2(.)))$, where each minimization is taken over all functions. These are inaccessible, but we can instead perform each minimization under a rich parameterized class of functions. Particularly, this would help select between two fitted propensity score models and we expect that the one with the best prediction performance might not necessarily be selected.

Further, we note that the AutoDML loss makes us lose the ability of evaluating how approximately deconfounding is \textit{one} representation, instead of comparing different representations. Flexible density ratio estimators \citep{arbour2021pw} could be plugged into the balancing score error, especially as both the true weights and their expectation conditional on the representation are density ratios from Assumption 1 and Proposition \ref{prop:confounding_bias}.

\section{Details on Experiments}
\label{sec:xp_details}

\paragraph{Code} Our code is available at \url{https://github.com/oscarclivio/representations_weighting}.

\paragraph{Origin of datasets} We extracted IHDP \citep{hill2011bnmfci} from the GitHub repository for Dragonnet \citep{shi2019annfteote} at \url{https://github.com/claudiashi57/dragonnet/tree/master/dat/ihdp/csv}, News \citep{johansson2016lrfci} from \url{https://www.fredjo.com/files/NEWS_csv.zip} and TBI \citep{colnet2022rtrfgfseavs} from \url{https://github.com/BenedicteColnet/IPSW-categorical}. In addition, TBI is covered by a MIT license, and the original data source for News \citep{newman2008bow} by a CC BY 4.0 license.

\paragraph{Infrastructure} We ran experiments on a laptop with a GeForce GTX 1070 GPU with Max-Q Design and 12 CPU cores. We used our own Python implementation for datasets (after downloading the data), weighting methods and representation learning techniques, including propensity score modelling and neural network fitting.

\paragraph{Choice of hyperparameters} We tried different sets of hyperparameters for neural networks, and first chose a set such that our approach had good performance (outperformed by at most one other method) on two different datasets each in a separate task under the energy balancing. Several sets verified this property, however performance of individual methods and individual hyperparameters was generally unequal among datasets. For ATE estimation, our method had the same ranking as in the paper for many hyperparameters on News, but was outperformed by standard kernel balancing on IHDP \citep{hill2011bnmfci, shalit2017eitegbaa} and ACIC 2016 \citep{dorie2019avdiymfcillfadac}. For ATT estimation, at the time of writing, we did not find such ``good'' hyperparameters on News and ACIC2016, but did so on IHDP and Jobs \citep{lalonde1986eteeotpwed, johansson2016lrfci}. This generally shows that different hyperparameters should be tested, especially for neural network-based methods. Defining a principled way (that does not use ground-truth target estimands) to select them for weighting has to be addressed in future work.

\paragraph{Misspecified outcome classes} We note that in our experiments, the outcome class $\mathcal{M}$ is  often not correctly specified. Indeed, note that in our datasets
\begin{itemize}
    \item The control outcome model in IHDP in its setting B \citep{hill2011bnmfci}, as used to generate the data \citep{shi2019annfteote}, linearly depends on an exponential $e^{\beta . x}$ function for some $\beta$.
    \item The outcome models for News linearly depend on the vector of topic probabilities $z(x)$ in \citet{johansson2016lrfci}, where we note that weights in this linear relationship are further positive. Noting $k$ the number of topics, we then have for any $x$, $z_i(x) \geq \frac{1}{k}$ for at least one $i$, and noting $w_0$ the minimal weight we obtain that $\forall a = 0, 1, \ \E[Y|x,A=a] \geq \frac{w_0}{k}$, thus either treated or control outcome model is bounded away from 0.
    \item The outcome model for TBI is quadratic in $x$ \citep{colnet2022rtrfgfseavs}.
\end{itemize}

Thus, outcome functions on IHDP, News and TBI are clearly not linear functions, thus misspecified for linear kernel optimal matching. For energy balancing, none of the outcome functions above is square-integrable (here \textit{without} a probability measure; IHDP and TBI due to their functional forms, News due to being bounded away from $0$), thus none of them is Sobolev of any order. As the covariate space for IHDP and News has an odd dimensionality, these functions are misspecified outcome functions for the class corresponding to the energy distance according to page 12 of \citet{Mak2018support}. This is less clear for the outcome model of the TBI dataset which has even dimensionality ; we conjecture that this outcome model is misspecified too, as its outcome model is not Sobolev of any order and Sobolev spaces up to a certain order are invoked as canonical members of the outcome class corresponding in \citet{huling2024ebocd}.



\section{Details on our implementation of kernel optimal or mean matching}
\label{sec:kom}

When applied to the representation $\phi$ composed through a kernel $k$, the square of the MMD is:
\begin{align*}
    \text{MMD}^2_k(\mathcal{P}_w, \mathcal{Q}) =& \ \frac{1}{|\mathcal{P}|^2}\sum_{i, j \in \mathcal{P}} w_i w_j k(\rep(x_i),\rep(x_j)) \\
    &- \frac{2}{|\mathcal{P}||\mathcal{Q}|}\sum_{i \in \mathcal{P}, j \in \mathcal{Q}} w_i k(\rep(x_i),\rep(x_j))\\
    &+ \frac{1}{|\mathcal{Q}|^2}\sum_{i, j \in \mathcal{Q}} k(\rep(x_i),\rep(x_j)).
\end{align*}
Thus, its minimization with regularization is a quadratic problem (QP)
\begin{align}
    &\min_{w}  \frac{1}{2} w^T S w + v^T w \text{ subject to } l \leq A w \leq u
    \label{eq:osqp_obj}
\end{align}
that can be solved with any off-the-shelf solver $\texttt{solver}(S, v, l, A, u)$ (e.g. \citet{stellato2020oaossfqp}). Noting $I_{\mathcal{P}}$ the identity matrix over $\mathcal{P}$, we have
\begin{align*}
    &S = S^{k,\rep,\sigma}_{\mathcal{P},\mathcal{Q}} := (\nicefrac{2}{|\mathcal{P}|^2} \cdot k(\rep(x_i),\rep(x_j)) + 2\sigma^2 \cdot I_{\mathcal{P}})_{i, j \in \mathcal{P}}, \\
    & v := v^{k,\rep}_{\mathcal{P},\mathcal{Q}} = (-\nicefrac{2}{|\mathcal{P}||\mathcal{Q}|} \cdot \sum_{j \in \mathcal{Q}} k(\rep(x_i),\rep(x_j)))_{i \in \mathcal{P}}, \\
    &A := A_{\mathcal{P}} = \left(\begin{array}{c} I_{\mathcal{P}} \\ \hline 1 \cdots 1  \end{array}\right), \ l := l_{\mathcal{P}} = (\underbrace{0, \cdots, 0}_{|\mathcal{P}| \text{ times}}, |\mathcal{P}|)^T,\\
    &u = u_{\mathcal{P}} = (\underbrace{+\infty, \cdots, +\infty}_{|\mathcal{P}| \text{ times}}, |\mathcal{P}|)^T
\end{align*}
For the joint squared bias with a finite $\Lambda = \{1, \cdots, \ell \}$, we sum all objectives from Proposition \ref{prop:sols_rep} over each $\alpha = 1, \cdots, \ell$ with $\mathcal{P}^\alpha, \mathcal{Q}^\alpha, \rep^\alpha$, $\sigma^\alpha$ and with kernel $k^\alpha$, giving
\begin{align}
    &S = S^{k^\Lambda,\rep^\Lambda,\sigma^\Lambda}_{\mathcal{P}^\Lambda,\mathcal{Q}^\Lambda} := \text{diag} \left((S^{k^i,\rep^i,\sigma^i}_{\mathcal{P}^i,\mathcal{Q}^i})_{i = 1, \cdots, \ell}\right)  \label{eq:osqp_params} \\
    &v = v^{k^\Lambda,\rep^\Lambda}_{\mathcal{P}^\Lambda,\mathcal{Q}^\Lambda} := \begingroup \setlength\arraycolsep{2pt} \left(\begin{array}{c}v^{k^1,\rep^1}_{\mathcal{P}^1,\mathcal{Q}^1} \\ \vdots \\ v^{k^\ell,\rep^\ell}_{\mathcal{P}^\ell,\mathcal{Q}^\ell}\end{array}\right) \endgroup, \ \psi = \psi_{\mathcal{P}^\Lambda} := \begingroup \setlength\arraycolsep{2pt} \left(\begin{array}{c}\psi_{\mathcal{P}^1} \\ \vdots \\ \psi_{\mathcal{P}^\ell}\end{array}\right) \endgroup, \nonumber
\end{align}
for $\psi \in \{A, l, u\}$. This step is agnostic to how $\sigma^\alpha$ is selected, either with a fixed value (e.g. $0$ as in \citet{huling2024ebocd}) or from a principled procedure \citep{kallus2020gommfci}.



\section{Proof of Results}
\label{suppl:proofs}


\subsection{Proof of Proposition \ref{prop:sols_rep}}

\subsubsection{Item 1}

First, note that we can restrict our attention to weight functions $w$ in $L_2(P_X)$, that is such that $w(X) \in L_2(P)$, as the objective will be $\infty$ for weight functions not in $L_2(P_X)$. For any $L_2(P_X)$ weight function and function $g \in \mathcal{G}$, we have
\begin{align*}
     \Big|\E_{P^w_X}[g \circ \rep] - \E_{Q_X}[g \circ \rep]\Big|
    &= \Big|\E_{P^w}[(g \circ \rep)(X)] - \E_{Q}[(g \circ \rep)(X)]\Big| \\
    &= \Big|\E_{P^w}[g(\rep(X))] - \E_{Q}[g(\rep(X))]\Big|  \\
    &= \Big|\E_{P^w_{\rep(X)}}[g] - \E_{Q_{\rep(X)}}[g]\Big|.
\end{align*}
where all integrals are well-defined, as $g(\rep(X)) \in L_2(P)$ by assumption in the Proposition and $\trueweights{P_X}{Q_X}(X) \in L_2(P)$ from Assumptions \ref{ass:ac} and \ref{ass:rndL2}. Taking the supremum over $g \in \mathcal{G}$, we have
\begin{align*}
    \text{IPM}_{\mathcal{M}}(P^w_X, Q_X)= \text{IPM}_{\mathcal{G}}(P^w_{\rep(X)}, Q_{\rep(X)})
\end{align*}
where $\mathcal{M} = \{x \rightarrow (g \circ \rep)(x), \ g \in \mathcal{G}\}$. Note that $\mathcal{M} \subseteq L_2(P_X)$, and that all of this also justifies the claim that the bias wrt $\rep$ is bounded by $\text{IPM}_{\mathcal{G}}(P^w_{\rep(X)}, Q_{\rep(X)})$. Thus, we are solving 
\begin{align*}
    \min_{w \in A} J(w)
\end{align*}
where
\begin{align*}
    A &:= \{w \in L_2(P_X) \ | \ w \geq 0 \ P_X\text{-a.s.},\E_P[w(X)] = 1\} \\
    J(w) &:= I_\mathcal{M}(w)^2 + \sigma^2 \cdot S(w) \\
    I_\mathcal{M}(w) &:= \text{IPM}_{\mathcal{M}}(P^w_X, Q_X) \\
    S(w) &:= \E_P[w(X)^2]
\end{align*}
where functions in $L_2(P_X)$ are identified $P_X$-a.s.. We note that $\inf_{w \in A} J(w)$ is finite, as $\trueweights{P_X}{Q_X} \in A$ from Assumption \ref{ass:rndL2} and $J(\trueweights{P_X}{Q_X}) = \sigma^2 \cdot \E_P\left[\left(\trueweights{P_X}{Q_X}(X)\right)^2\right] < \infty$. \\

We prove the first item of the Proposition in three parts :
\begin{enumerate}
    \item There is at most one solution.
    \item There is at least one solution.
    \item Any solution is a function of $\rep(x)$ $P_X$-a.s.
\end{enumerate}
Note that (i) only the third part uses the fact that functions in $\mathcal{M}$ are functions of $\rep(x)$ $P_X$-a.s., (ii) under stronger assumptions on the class $\mathcal{G}$, the result also follows directly from Theorem 4.1 of \citet{brunssmith2022oaadtfbw}, while the following analysis presents more relaxed assumptions over $\mathcal{G}$. \\

\textbf{Part 1 : There is at most one solution} $A$ is clearly a convex subset of $L_2(P_X)$, and $J$ is strictly convex. Indeed, for any $t \in [0,1], w_1, w_2 \in A$, $m \in \mathcal{M}$, letting $w_t = tw_1 + (1-t)w_2$
\begin{align*}
&\left|\E_{P^{w_t}}[m(X)] - \E_Q[m(X)]\right| \\ 
&= \left|t\left(\E_{P^{w_1}}[m(X)] - \E_Q[m(X)]\right) + (1-t)\left(\E_{P^{w_2}}[m(X)] - \E_Q[m(X)]\right)\right| \\
&\leq t\left|\E_{P^{w_1}}[m(X)] - \E_Q[m(X)]\right| + (1-t)\left|\E_{P^{w_2}}[m(X)] - \E_Q[m(X)]\right| \text{ from the triangle inequality} \\
&\leq tI_{\mathcal{M}}(w_1) + (1-t)I_{\mathcal{M}}(w_2) \text{ taking the supremum wrt $m$ on each term on the RHS}
\end{align*}
so taking the supremum wrt $m$ on the LHS, $I_\mathcal{M}(w_t) \leq tI_{\mathcal{M}}(w_1) + (1-t)I_{\mathcal{M}}(w_2)$ ; thus $I_\mathcal{M}$ is convex. As $u \mapsto u^2$ is convex non-decreasing, $I_\mathcal{M}^2$ is convex. Also, for any $t \in [0,1], w_1, w_2 \in A$, $m \in \mathcal{M}$, again letting $w_t = tw_1 + (1-t)w_2$,
\begin{align*}
    &tS(w_1) + (1-t)S(w_2)- S(w_t) \\
    &= t\E_P[w_1(X)^2] + (1-t)\E_P[w_2(X)^2] - \E_P[(tw_1(X) + (1-t)w_2(X) )^2] \\
    &= t\E_P[w_1(X)^2] + (1-t)\E_P[w_2(X)^2] - t^2\E_P[w_1(X)^2] - (1-t)^2\E_P[w_2(X)^2] - 2t(1-t)\E_P[w_1(X)w_2(X)] \\
    &= t(1-t)\E_P[(w_1(X) - w_2(X))^2]
\end{align*}
which is non-negative, and zero iff $t = 0$, $t = 1$ or $w_1 = w_2 \ P_X-$a.s.. Thus, $S$ is strictly convex. Thus, the sum of $I_\mathcal{M}^2$ and $\sigma^2 \cdot S$, that is $J$, is strictly convex.

As $A$ is a convex subset of $L_2(P_X)$ and $J$ is strictly convex, there is at most one minimizer of $J$ in $A$. \\

\textbf{Part 2 : There is at least one solution.} From e.g. Theorem 2 of \url{https://www.math.umd.edu/~yanir/742/742-5-6.pdf}, the existence of a minimizer of $J$ in $A$ is guaranteed if $A$ is weakly closed and $J$ is coercive and sequentially weakly lower semi-continuous.

First, we show that $A$ is weakly closed. Let $w_n \in A^\mathbb{N}$ weakly converging to some $w_* \in L_2(P_X)$, that is such that
\begin{align*}
    \forall h \in L_2(P_X), \ \ \E_P[w_n(X)h(X)] \xrightarrow[n\to\infty]{} \E_P[w_*(X)h(X)].
\end{align*}
Then taking $h = 1$, we have $1 = \E_P[w_n(X)] \xrightarrow[n\to\infty]{} \E_P[w_*(X)]$, thus $\E_P[w_*(X)] = 1$.

Further, for $k \in \mathbb{N}^*$, let $B_k := \{w_*(X) \leq -\frac{1}{k}\}$. Then, with $h := 1_{B_k}$, as $w_n \geq 0 \ P_X-$a.s. for each $n \in \mathbb{N}$
\begin{align*}
    0 \leq \E_P[1_{B_k}(X)w_n(X)] \xrightarrow[n\to\infty]{} \E_P[w_*(X)h(X)] \leq -\frac{P(B_k)}{k}
\end{align*}
which leads to $0 \leq \E_P[w_*(X)h(X)] \leq -\frac{P(B_k)}{k}$, which is not contradictory only if $P(B_k) = 0$. Then, as $\{w_*(X) < 0\} = \{\cup_{k \in \mathbb{N}^*} B_k\}$,
\begin{align*}
    P(w_*(X) < 0) &= P\left(\cup_{k \in \mathbb{N}^*} B_k\right) \\
    &\leq \sum_{k \in \mathbb{N}^*} P(B_k) \\
    &= 0
\end{align*}
Thus, $w_* \geq 0 \ P_X$-a.s.. As a result, $w_* \in A$, so $A$ is weakly closed. We note that $J$ is coercive, as $S$ is clearly coercive and $I_\mathcal{M}$ is non-negative. What is left to prove in this part is then that $J$ is sequentially weakly lower semi-continuous. Let $w_n \in L_2(P_X)^\mathbb{N}$ weakly converging to some $w_* \in L_2(P_X)$. We want to show that
\begin{align*}
    \liminf\limits_{n\rightarrow\infty} J(w_n) \geq J(w_*).
\end{align*}
Indeed,
\begin{align*}
    \liminf\limits_{n\rightarrow\infty} I_\mathcal{M}(w_n)^2
    &= \liminf\limits_{n\rightarrow\infty} \sup_{m \in \mathcal{M}} \left|\E_P[w_n(X)m(X)] - \E_Q[m(X)]\right|^2 \\
     &\geq \sup_{m \in \mathcal{M}} \liminf\limits_{n\rightarrow\infty}  \underbrace{\left|\E_P[w_n(X)m(X)] - \E_Q[m(X)]\right|^2}_{\substack{\xrightarrow[n\to\infty]{} \left|\E_P[w_*(X)m(X)] - \E_Q[m(X)]\right|^2 \\ \text{as } m \in L_2(P_X)}} \\
     &= \sup_{m \in \mathcal{M}} \left|\E_P[w_*(X)m(X)] - \E_Q[m(X)]\right|^2 \\
     &=  I_\mathcal{M}(w_*)^2
\end{align*}
and by convexity of $u \mapsto u^2$,
\begin{align*}
    \forall x, \ w_n(x)^2 \geq  w_*(x)^2 + 2w_*(x)\left(w_n(x) - w_*(x)\right)
\end{align*}
so
\begin{align*}
    \liminf\limits_{n\rightarrow\infty} S(w_n)
    &= \liminf\limits_{n\rightarrow\infty} \E_P[w_n(X)^2]  \\
    &\geq \liminf\limits_{n\rightarrow\infty} \E_P[w_*(X)^2] + 2\big(\underbrace{\E_P[w_n(X)w_*(X)] - \E_P[w_*(X)^2]}_{\xrightarrow[n\to\infty]{} 0}\big)  \\
    &= \E_P[w_*(X)^2] \\
    &= S(w_*)
\end{align*}
and
\begin{align*}
   \liminf\limits_{n\rightarrow\infty} I_\mathcal{M}(w_n)^2 + \sigma^2 S(w_n) &\geq \liminf\limits_{n\rightarrow\infty} I_\mathcal{M}(w_n)^2 + \liminf\limits_{n\rightarrow\infty} \sigma^2 S(w_n)  \\
   &\geq I_\mathcal{M}(w_*)^2 + \sigma^2 S(w_*) \text{ from the above}.
\end{align*}
All of this shows that $J$ is sequentially weakly lower semi-continuous, concluding this part of the proof. \\

\textbf{Part 3 : Any solution is a function of $\rep(x)$}. For any $w \in L_2(P_X)$, let $\bar{w}(z) = \E_P[w(X) | \rep(X) = z]$. If $w \in A$, then $\bar{w}(\rep(.)) \in A$. Indeed, the conditional expectation of any $L_2(P)$ random variable is also $L_2(P)$, so $\bar{w}(\rep(.)) \in L_2(P_X)$. Further, the conditional expectation of any almost surely non-negative random variable is also almost surely non-negative, so $\bar{w}(\rep(.)) \geq 0 \ P_X$-a.s.. Finally, the tower property shows that
\begin{align*}
    \E_P[\bar{w}(\rep(X))] = \E_P[\E_P[w(X) | \rep(X)]] = \E_P[w(X)] = 1.
\end{align*}
Thus, $\bar{w}(\rep(.)) \in A$. It actually turns out that $J(\bar{w}(\rep(.))) \leq J(w)$, with equality iff $w = \bar{w}(\rep(.))$. This concludes the proof, as a minimizer of $J$ in $A$ has to be a function of $\rep(x)$, as otherwise we can construct a weight function in $A$ that realises a strictly lower objective, which is contradictory.

First, 
\begin{align*}
    \forall g \in \mathcal{G}, \ \ \E_{P}[w(X)g(\rep(X))] &= \E_P[\E_P[w(X)g(\rep(X)) | \rep(X)]] \text{ from the tower property}\\
    &= \E_P[\E_P[w(X) | \rep(X)] g(\rep(X)) ]\\
    &= \E_P[\bar{w}(\rep(X))g(\rep(X))] 
\end{align*}
so $I_\mathcal{M}(\bar{w}(\rep(.))) = I_\mathcal{M}(w)$. Further,
\begin{align*}
    S(w) - S(\bar{w}(\rep(.)))
    &= \E_P[w(X)^2] - \E_P[\E_P[w(X) | \rep(X)]^2] \\
    &= \E_P[\E_P[w(X)^2 | \rep(X)]] - \E_P[\E_P[w(X) | \rep(X)]^2] \text{ from the tower property}\\
    &= \E_P\left[\E_P[w(X)^2 | \rep(X)] - \E_P[w(X) | \rep(X)]^2\right] \\
    &= \E_P[\text{Var}(w(X) | \rep(X))] \\
    &= \E_P[\E_P[(w(X) - \bar{w}(\rep(X)))^2 | \rep(X)]] \\
    &= \E_P[(w(X) - \bar{w}(\rep(X)))^2] \text{ from the tower property}.
\end{align*}
Taken all together, $J(w) \geq J(\bar{w}(\rep(.)))$ with equality iff $\E_P[(w(X) - \bar{w}(\rep(X)))^2] = 0$, that is $w = \bar{w}(\rep(.)) \ P_X$-a.s.. This concludes the proof.

\subsubsection{Item 2}

Let $w$ be an $L_2(P_X)$ weight function such that $w =  \bar{w}(\rep(.)) \ P_X$-a.s. for some $\bar{w}$. Then,
\begin{align*}
    \text{Chosen weights bias of }w &= \E_{P^w}\left[\E_{P}[\Y|X] - \E_P[\Y|\rep(X)]\right] \\
    &= \E_{P}\left[w(X)\E_{P}[\Y|X] - w(X)\E_P[\Y|\rep(X)]\right] \\
    &= \E_{P}\left[\bar{w}(\rep(X))\E_{P}[\Y|X] - \bar{w}(\rep(X))\E_P[\Y|\rep(X)]\right] \\
    &= \E_{P}\left[\bar{w}(\rep(X))\E_{P}[\Y|X]\right] - \E_P\left[\bar{w}(\rep(X))\E_P[\Y|\rep(X)]\right] \\
    &= \E_{P}\left[\E_{P}[\bar{w}(\rep(X))\Y|X]\right] - \E_P\left[\E_P[\bar{w}(\rep(X))\Y|\rep(X)]\right] \\
    &= \E_{P}[\bar{w}(\rep(X))\Y] - \E_P[\bar{w}(\rep(X))\Y] \text{ from the tower property} \\
    &= 0
\end{align*}


\subsection{Proof of Proposition \ref{prop:confounding_bias}}

Let $\Sigma_Z$ denote the $\sigma$-algebra of the space of values taken by random variable $Z$.

Let $B \in \Sigma_{\rep(X)}$ such that $P_{\rep(X)}(B) = 0$. Then $0 = P_{\rep(X)}(B) =  P_X(\rep^{-1}(B))$ where $\rep^{-1}(B) \in \Sigma_X$ as $\rep$ is measurable. By Assumption \ref{ass:ac}, $Q_X(\rep^{-1}(B)) = 0$. Then $0 = Q_X(\rep^{-1}(B)) = Q_{\rep(X)}(B)$. Thus, $Q_{\rep(X)}$ is absolutely continuous wrt $P_{\rep(X)}$.

Notably, from the Radon-Nikodym theorem, $\trueweights{P_{\rep(X)}}{Q_{\rep(X)}}$ exists. Then for any $B \in \Sigma_{\rep(X)}$,
\begin{align*}
    &\E_P\left[\trueweights{P_{\rep(X)}}{Q_{\rep(X)}}(\rep(X)) \cdot 1_B(\rep(X))\right] \\
    &= \E_Q[1_B(\rep(X))] \\
    &= \E_P\left[\trueweights{P_X}{Q_X}(X) \cdot 1_B(\rep(X))\right] \text{ by taking the Radon-Nikodym derivative wrt }X \\
    &= \E_P\left[\E_P\left[\trueweights{P_X}{Q_X}(X) \cdot 1_B(\rep(X)) \middle| \rep(X)\right]\right] \text{ from the tower property} \\
    &= \E_P\left[\E_P\left[\trueweights{P_X}{Q_X}(X) \middle| \rep(X)\right] \cdot 1_B(\rep(X))\right]
\end{align*}
where all integrals are well-defined as the Radon-Nikodym derivative is measurable and $L_1(P_X)$, and its conditional expectation is also $L_1(P_{\rep(X)})$ as any conditional expectation of any $L_1(P)$ random variable is also $L_1(P)$.

Thus we have shown that $\forall B \in \Sigma_{\rep(X)}, \ \ \int h \cdot 1_B \text{d}P_{\rep(X)} = 0$ where $h(z) = \trueweights{P_{\rep(X)}}{Q_{\rep(X)}}(z) - \E_P\left[\trueweights{P_X}{Q_X}(X) \middle| \rep(X) = z\right]$. We now show that $h = 0$, which concludes the proof for the first part of the Proposition. Note that $h$ is measurable as any Radon-Nikodym derivative is measurable, and any conditional expectation is measurable. Notably, as $\mathbb{R}_+$ and  $\mathbb{R}_-$ are in the Borel $\sigma$-algebra, $B_+ = h^{-1}(\mathbb{R}_+)$ and $B_- = h^{-1}(\mathbb{R}_-)$ are in $\Sigma_{\rep(X)}$. Thus,
\begin{align*}
    &0 = \int_\mathcal{Z} h \cdot 1_{B_+} \text{d}P_{\rep(X)} = \int_\mathcal{Z} h_+ \text{d}P_{\rep(X)} \\
    &0 = \int_\mathcal{Z} h \cdot 1_{B_-} \text{d}P_{\rep(X)} = -\int_\mathcal{Z} h_- \text{d}P_{\rep(X)}
\end{align*}
which implies that $h_+ = 0$ and $h_- = 0$, both $P_{\rep(X)}$-\text{a.s.}, as these two functions are non-negative. Thus, $h = 0$ $P_{\rep(X)}$-\text{a.s.}, which concludes the first part of the proof.  \\[2ex]

Now we further assume Assumptions \ref{ass:rndL2} and \ref{ass:yL2}. Then, we note that the confounding bias is equal to $-\E_P\left[\trueweights{P_X}{Q_X}(X)\left(\E_{P}[\Y|X] - \E_P[\Y|\rep(X)]\right)\right]$. As $\trueweights{P_X}{Q_X}$ is now a $L_2(P_X)$ weight function wrt $P$, and using that $\trueweights{P_{\rep(X)}}{Q_{\rep(X)}} = \E_P\left[\trueweights{P_X}{Q_X}(X) \middle| \rep(X) = . \right] \ P_{\rep(X)}$-a.s., identical computations as in the proof of item 2 in Proposition \ref{prop:sols_rep} show that $\trueweights{P_{\rep(X)}}{Q_{\rep(X)}}(\rep(.))$ is also a $L_2(P_X)$ weight function wrt $P$, while being a function of $\rep(x)$. Applying Proposition \ref{prop:sols_rep}, item 2, to $\trueweights{P_{\rep(X)}}{Q_{\rep(X)}}(\rep(.))$ leads to $\E_P\left[\trueweights{P_{\rep(X)}}{Q_{\rep(X)}}(\rep(X))\left(\E_{P}[\Y|X] - \E_P[\Y|\rep(X)]\right)\right] = 0$. Summing this to the confounding bias leads to
\begin{align*}
    \text{Confounding bias} = -\E_P\left[\left(\trueweights{P_X}{Q_X}(X) - \trueweights{P_{\rep(X)}}{Q_{\rep(X)}}(\rep(X))\right) \cdot \left(\E_{P}[\Y|X] - \E_P[\Y|\rep(X)]\right)\right].
\end{align*}
Finally,
\begin{align*}
    &\E_P\left[\left(\trueweights{P_X}{Q_X}(X) - \trueweights{P_{\rep(X)}}{Q_{\rep(X)}}(\rep(X))\right) \E_P[\Y|\rep(X)]\right] \\
    &= \E_P\left[\left(\trueweights{P_X}{Q_X}(X) - \E_P\left[\trueweights{P_{X}}{Q_{X}}(X) \middle| \rep(X)\right]\right) \E_P[\Y|\rep(X)]\right] \text{ from the first part of the Proposition}\\
    &= \E_P\left[\trueweights{P_X}{Q_X}(X)\E_P[\Y|\rep(X)]\right] - \E_P\left[\E_P\left[\trueweights{P_{X}}{Q_{X}}(X) \middle| \rep(X)\right]\E_P[\Y|\rep(X)]\right] \\
    &= \E_P\left[\trueweights{P_X}{Q_X}(X)\E_P[\Y|\rep(X)]\right] - \E_P\left[\E_P\left[\trueweights{P_{X}}{Q_{X}}(X)\E_P[\Y|\rep(X)] \middle| \rep(X)\right]\right] \\
    &= \E_P\left[\trueweights{P_X}{Q_X}(X)\E_P[\Y|\rep(X)]\right] - \E_P\left[\trueweights{P_{X}}{Q_{X}}(X)\E_P[\Y|\rep(X)]\right] \text{ from the tower property} \\
    &= 0
\end{align*}
Thus,
\begin{align*}
    \text{Confounding bias} = -\E_P\left[\left(\trueweights{P_X}{Q_X}(X) - \trueweights{P_{\rep(X)}}{Q_{\rep(X)}}(\rep(X))\right) \cdot \E_{P}[\Y|X]\right].
\end{align*}
\subsection{Proof of Corollary \ref{prop:bse}}
Note that from the tower property,
\begin{align}
    \E_P[\Y | \rep(X)] = \E_P[\E_P[\Y | X, \rep(X)] | \rep(X)] =  \E_P[\E_P[\Y | X] \ | \ \rep(X)] \label{eq:epyrepx}
\end{align}

From Proposition \ref{prop:sols_rep}, for any $w$ depending on $\rep$ $P_X$-a.s., the chosen weights bias is zero. Thus,
\begin{align*}
    |\text{Bias}_{P,Q}(w)| &\leq \Big|\E_{P^w}[\E[\Y|\rep(X)]] - \E_Q[\E[\Y|\rep(X)]]\Big| + |\text{Confounding bias}| \\
    & \ \ \ \ \ \ \ \ \ \ \text{ where } \E_P[\Y|x] \in \mathcal{M} \text{ so from Equation  \ref{eq:epyrepx}, } \E_P[\Y|\rep(x)] \in \rep(\mathcal{M},P)\\
    &\leq \text{IPM}_{\rep(\mathcal{M},P)}(P^w_{\rep(X)}, Q_{\rep(X)}) + |\text{Confounding bias}| \text{ by definition of an IPM} \\
    &\leq \text{IPM}_{\rep(\mathcal{M},P)}(P^w_{\rep(X)}, Q_{\rep(X)}) + ||\Y||_{L_2(P)} \cdot \text{BSE}_{P,Q}(\rep) \text{ from Equation \ref{eq:bse_bounds_cb}}
\end{align*}

\subsection{Proof of Corollary \ref{prop:bse_multiple}}

From Corollary \ref{prop:bse}, for any $\alpha \in \Lambda$,
\begin{align*}
    & \text{Bias}^2_{P^\alpha,Q^\alpha}(w^\alpha)\\
    &\leq \left(\text{IPM}_{\rep^\alpha(\mathcal{M}^\alpha, P^\alpha)}(P^{\alpha, w^\alpha}_{\rep^\alpha(X)}, Q^{\alpha}_{\rep^\alpha(X)}) + ||\Y||_{L_2(P^\alpha)} \cdot \text{BSE}_{P^\alpha,Q^\alpha}(\rep^\alpha)\right)^2.
\end{align*}
Noting that $\forall a, b, (a+b)^2 \leq 2(a^2 + b^2)$ and taking the expectation wrt $p_\Lambda(\alpha)$ gives
\begin{align*}
    \frac{1}{2} \cdot \text{Bias}^2_{P^\Lambda,Q^\Lambda}(w^\Lambda)
    &\leq \E_{p_\Lambda(\alpha)}\Big[\text{IPM}^2_{\rep^\alpha(\mathcal{M}^\alpha, P^\alpha)}(P^{\alpha, w^\alpha}_{\rep^\alpha(X)}, Q^{\alpha}_{\rep^\alpha(X)})\Big] +  \E_{p_\Lambda(\alpha)}\left[||\Y||^2_{L_2(P^\alpha)} \cdot \text{BSE}^2_{P^\alpha,Q^\alpha}(\rep^\alpha)\right]
\end{align*}
Taking $||\Y||^2_{L_2(P^\alpha)} \leq \sup_{\alpha \in \Lambda} ||\Y||^2_{L_2(P^\alpha)}$ in the expectation with the BSE's leads to the result.


\subsection{Proof of Proposition \ref{prop:scores}}

First, let's note two useful properties :

\begin{itemize}
    \item For any distribution $R$ and random variable $Z$,
    \begin{align}
    \forall x, \ \ \E_R[\E_R[Z | X] \ | \  \rep(X) = \rep(x)] = \E_R[Z | \rep(X) = \rep(x)] \label{eq:Y_cond_rep}.
    \end{align}
    \item For any distribution $R$ and function $f$,
    \begin{align}
    \Big(\exists g, \ \forall x \ R_X\text{-a.s.}, \ f(x) = g(\rep(x))\Big) \Leftrightarrow \ \forall x \ R_X\text{-a.s.}, \ f(x) = \E_R[f(X) \ | \ \rep(X) = \rep(x)] \label{eq:collapse}.
    \end{align}
\end{itemize}


\textbf{Proof of a), ATT case :} Let $e(x) := \Pdata(A=1|X=x)$
\begin{align*}
    &\rep \text{ is a balancing score} \\
    &\Leftrightarrow \exists g, \ \ e(x) = g(\rep(x)) \ \forall x \ \Pdata_X\text{-a.s.} \text{ from \citep{rosenbaum1983tcrotpsiosfce}}\footnotemark \\
        &\Leftrightarrow \exists g, \ \ e(x) = g(\rep(x)) \ \forall x \ \Pdata_{X|A=0}\text{-a.s.} \text{ from the overlap assumption} \\
    &\Leftrightarrow \exists g, \ \ \trueweights{\Pdata_{X|A=0}}{\Pdata_{X|A=1}}(x) = g(\rep(x)) \ \forall x \  \Pdata_{X|A=0}\text{-a.s.}\text{ as } \trueweights{\Pdata_{X|A=0}}{\Pdata_{X|A=1}}(x) \text{ is a bijective function of } e(x)\text{ from Bayes' rule} \\
    &\Leftrightarrow \rep(x) \text{ is a generalized balancing score}.
\end{align*}

\stepcounter{footnote}
\footnotetext{While the original statement in \citet{rosenbaum1983tcrotpsiosfce} is not $\Pdata_X$-a.s., we note that it can be relaxed to $\Pdata_X$-a.s. as it pertains to the adjustment formula that involves an expectation wrt $\Pdata_X$}

\textbf{Proof of a), ATE case} : we fix $a \in \mathcal{A}$ and work with the following definition \citep{imbens2000trotpsiedrf} of a balancing score for non-binary treatments : $1_{\{A = a\}} \indep X | \rep(X)$. Indeed, as the problem is arm-specific, the definitions of generalized deconfounding, balancing and prognostic scores are arm-specific \textit{a priori}. An extension to an alternative definition $A \indep X | \rep(X)$ is straightforward by replacing a fixed $a \in \mathcal{A}$ with $\forall a \in \mathcal{A}$ at the start of each of the following statements involving $a$. Then,
\begin{align*}
    &\rep \text{ is a balancing score} \\
    &\Leftrightarrow \Pdata(a|x) = \Pdata(a|\rep(x)) \ \forall x \ \Pdata_X\text{-a.s.} \\
    &\Leftrightarrow \Pdata(a|x) = \E[\Pdata(a|X) | \rep(X) = \rep(x)] \ \forall x \ \Pdata_X\text{-a.s.}  \text{ using \ref{eq:Y_cond_rep} with } Z = 1_{\{A = a\}} \\
    &\Leftrightarrow \exists g_a, \ \ \Pdata(a|x) = g_a(\rep(x)) \ \forall x \ \Pdata_X\text{-a.s.}  \text{ from \ref{eq:collapse}} \\
    &\Leftrightarrow \exists g_a, \ \ \trueweights{\Pdata_{X|A=a}}{\Pdata_X}(x) = g_a(\rep(x)) \ \forall x \ \Pdata_X\text{-a.s.} 
 \\
    & \ \ \ \ \ \ \text{ where } \ \trueweights{\Pdata_{X|A=a}}{\Pdata_X}(x) \text{ is the true weights and is a bijective function of } \Pdata(a|x)\text{ from Bayes' rule} \\
    &\Leftrightarrow \exists g_a, \ \ \trueweights{\Pdata_{X|A=a}}{\Pdata_X}(x) = g_a(\rep(x)) \ \forall x \ \Pdata_{X|A=a}\text{-a.s. from the overlap assumption} \\
    &\Leftrightarrow \rep(x) \text{ is a generalized balancing score}.
\end{align*}

\textbf{Proof of b)} : we slightly change the definition of deconfounding scores \citep{damour2021dsfrfceewwo} to $\forall a \in \mathcal{A}, \ \ \E[\E[Y | \rep(X), A=a]] = \E[Y(a)]$, where the representation $\rep$ is now shared across treatment arms, in the spirit of \citet{damour2021dsfrfceewwo}. To this aim, it is sufficient to show that, in Problem \ref{prob:gwp} applied to estimation of $\E[Y(a)]$, the confounding bias is equal to $\E[\E[Y | \rep(X), A=a]] - \E[Y(a)]$. From the original definition of the confounding bias, this simplifies further to $\E[Y(a)] = \E[\E[Y | X, A=a]]$. This follows from the canonical unconfoundedness, overlap and SUTVA assumptions.

\textbf{Proof of c)} : again, $a \in \mathcal{A}$ is fixed. Assume $\rep(x)$ is a prognostic score for $Y(a)$, that is $Y(a) \indep X | \rep(X)$. Then,
\begin{align*}
    \forall x \ \Pdata_{X|A=a}\text{-a.s.}, \E[Y|x,A=a]  &:= \E[Y(a) | x] \\
    &= \E[Y(a) | x, \rep(x)] \\
    &= \E[Y(a) | \rep(x)] \text{ by application of the definition of a prognostic score},
\end{align*}
so $\E[Y|x,A=a]$ is a function of $\rep(x) \ \Pdata_X\text{-a.s.}$, thus it is so $\Pdata_{X|A=a}\text{-a.s.}$ from the overlap assumption, making the latter a generalized prognostic score.

Now assume that $\E[Y|x,A=a]$ itself is a prognostic score, that is $Y(a) \indep X \ | \ \E[Y|X,A=a]$. Then, $\pdata(Y(a) | x) = \pdata(Y(a) | \E[Y|x,A=a])  \ \forall x \ \Pdata_{X}\text{-a.s.}$, where $\Pdata_{X}\text{-a.s.}$ can be replaced with $\Pdata_{X|A=a}\text{-a.s.}$ thanks to the overlap assumption. Let $\rep(X)$ be a generalized prognostic score. Then, there exists a function $g_a$ such that $\E[Y|x,A=a] = g_a(\rep(x)) \ \forall x \ \Pdata_{X|A=a}\text{-a.s.}$. In particular, as $\pdata(Y(a)|x)$ is already a function of $\E[Y|x,A=a]$ $\Pdata_{X|A=a}\text{-a.s.}$, it is also a function of $\rep(x)$  $\Pdata_{X|A=a}\text{-a.s.}$. So there exists a function $h_a$ such that $\pdata(Y(a)|x) = h_a(\rep(x)) \ \forall x \ \Pdata_{X|A=a}\text{-a.s.}$. In particular, by application of \ref{eq:collapse}, $\pdata(Y(a)|x) = \E[\pdata(Y(a) | X) | \rep(X) = \rep(x)] \ \forall x \ \Pdata_{X|A=a}\text{-a.s.}$ and by application of \ref{eq:Y_cond_rep} to $Z = 1_{\{Y(a) = .\}}$,  $\pdata(Y(a)|x) = \pdata(Y(a)|\rep(x)) \ \forall x \ \Pdata_{X|A=a}\text{-a.s.}$, which can be replaced with $\Pdata_{X}\text{-a.s.}$ from the overlap assumption. Thus, $\rep(x)$ is a prognostic score.

\textbf{Proof of d)} : let $X_\mathcal{I}$ be covariates selected according to indices $\mathcal{I}$ and  $X_{-\mathcal{I}}$ be their complement. We also use this notation for e) and f). 

If $x_{\mathcal{I}}$ is a heterogeneity set, i.e.\ $Y(1) - Y(0) \indep (S, X_{-\mathcal{I}}) | X_{\mathcal{I}}$, then
\begin{align*}
    \forall x \ \Pdata_X\text{-a.s.}, \E_P[\Y|x]
    &= \text{CATE}(x) \text{ (under the transportability assumption)} \\
    &= \E[Y(1) - Y(0) | x] \\
    &= \E[Y(1) - Y(0) | x_{-\mathcal{I}}, x_{\mathcal{I}}] \\
    &= \E[Y(1) - Y(0) | x_{\mathcal{I}}] \text{ by definition of a heterogeneity set}
\end{align*}
where $\Pdata_X$-a.s.\ is equivalent to $\Pdata_{X|S=1}$-a.s.\ under the support inclusion (i.e.\ overlap) assumption, so $\E_P[\Y|x]$ is a function of $x_{\mathcal{I}} \ \Pdata_{X|S=1}$-a.s., making $x_{\mathcal{I}}$ a generalized prognostic score.

\textbf{Proof of e)} : If  $x_{\mathcal{I}}$ is a sampling set, that is $Y(1), Y(0), S \indep X_{-\mathcal{I}} | X_{\mathcal{I}}$, then
\begin{align*}
    \forall x \ \Pdata_X\text{-a.s.}, \trueweights{\Pdata_{X|S=1}}{\Pdata_{X|S=0}}(x) &= \frac{\pdata(x|S=0)}{\pdata(x|S=1)} \\
    &= \frac{\Pdata(S=1)}{\Pdata(S=0)} \frac{\pdata(S=0|x)}{\pdata(S=1|x)} \text{ from Bayes' rule} \\
     &= \frac{\Pdata(S=1)}{\Pdata(S=0)} \frac{\pdata(S=0|x_{\mathcal{I}},x_{-\mathcal{I}})}{\pdata(S=1|x_{\mathcal{I}},x_{-\mathcal{I}})} \\
  &= \frac{\Pdata(S=1)}{\Pdata(S=0)} \frac{\pdata(S=0|x_{\mathcal{I}})}{\pdata(S=1|x_{\mathcal{I}})} \text{ as }x_{\mathcal{I}}\text{ is a sampling set} \\
  &= \frac{\pdata(x_{\mathcal{I}}|S=0)}{\pdata(x_{\mathcal{I}}|S=1)} \\
  &= \trueweights{\Pdata_{X_{\mathcal{I}}|S=1}}{\Pdata_{X_{\mathcal{I}}|S=0}}(x_{\mathcal{I}}) 
\end{align*}
thus $\trueweights{\Pdata_{X|S=1}}{\Pdata_{X|S=0}}(x)$ depends on $x$ only through $x_{\mathcal{I}} \ \forall x \ \Pdata_X\text{-a.s.}$, which is equivalent to $\Pdata_{X|S=1}$-a.s.\ under the support inclusion (i.e.\ overlap) assumption, and the last two lines illustrate the fact that, in this case, the density ratio with respect to $X$ is equal to that with respect to the representation a.s.\ under the source distribution.

\textbf{Proof of f)} : If  $x_{\mathcal{I}}$ is a separating set, that is $Y(1) - Y(0) \indep S | X_{\mathcal{I}}$, then
\begin{align*}
    \Pdata\text{-a.s.}, \ \E_P[\Y | X_{\mathcal{I}}]
    &= \E_P[ \E_P[\Y|X] | X_{\mathcal{I}}] \\
    &= \E_P[ \text{CATE}(X) | X_{\mathcal{I}}] \\
    &= \E[ \text{CATE}(X) | X_{\mathcal{I}}, S=1] \\
    &= \E[\E[Y(1) - Y(0) | X, S=1]| X_{\mathcal{I}}, S=1] \text{ under the transportability assumption} \\
    &= \E[\E[Y(1) - Y(0) | X, X_{\mathcal{I}}, S=1] | X_{\mathcal{I}}, S=1] \\
    &= \E[Y(1) - Y(0) | X_{\mathcal{I}}, S=1] \text{ under the tower property}\\
    &= \E[Y(1) - Y(0) | X_{\mathcal{I}}, S=0] \text{ by definition of a separating set.}
\end{align*}
where $\Pdata\text{-a.s.}$ implies $\Pdata(\cdot|S=0)\text{-a.s.}$, thus
\begin{align*}
&\text{Confounding bias of }x_{\mathcal{I}} \\
&= \E_Q\Big[\E_P[\Y | X_{\mathcal{I}}] - \E_P[\Y | X]\Big] \\
&= \E\Big[\E[Y(1) - Y(0) | X_{\mathcal{I}}, S=0] - \E_P[\Y|X] \Big| S = 0\Big] \text{ from the above} \\
&= \E\Big[\E[Y(1) - Y(0) | X_{\mathcal{I}}, S=0] - \text{CATE}(X) \Big| S = 0\Big] \\
&= \E\Big[\E[Y(1) - Y(0) | X_{\mathcal{I}}, S=0] - \E[Y(1) - Y(0) | X, S=0] \Big| S = 0\Big] \text{ from the transportability assumption} \\
&=   \E\Big[\E[Y(1) - Y(0) | X_{\mathcal{I}}, S=0]\Big| S = 0\Big]  - \E\Big[\E[Y(1) - Y(0) | X, S=0] \Big| S = 0\Big] \\
&= \E[Y(1) - Y(0) | S=0] - \E[Y(1) - Y(0) | S=0] 
 \text{ under the tower property}\\
&= 0,
\end{align*}
so $x_{\mathcal{I}}$ is a generalized deconfounding score. 




\end{document}