
We begin by introducing our setting based on the Rubin causal model. We have access to two datasets: one of size $\nrct$ from a randomized trial~(rct) and one of size $\nobs$ from an observational study~(os), each containing tuples $Z := (X,Y,T)$ of features $X \in \mathbb R^\xdim$, observed outcomes $Y\in \mathbb R$ or $\{0, 1\}$, and a binary variable $T \in \{0,1\}$ indicating whether a treatment has been assigned ($T=1$) or not ($T=0$). We assume that the data is drawn i.i.d.~from the marginal distributions $\p^\diamond $ of  $\pfull^\diamond$ for $\diamond \in \{\rct, \obs\}$. We model the full distributions $\pfull^\diamond$ 
over $\left(X, U, Y(0), Y(1),Y, T\right)$, with $U\in \mathbb R^\udim$ the unobserved confounders and $\left(Y(0), Y(1)\right)$ the potential outcomes. 
% and  $Y$ is the observed outcome, and $T \in \{0,1\}$ is a binary treatment 
% indicator. However, the confounder $U$ and the potential outcomes are never observed, that is, we can only sample from the marginal distribution $\p^\diamond $ of $Z := (X,Y,T)$ under $\pfull^\diamond$ for $\diamond \in \{\rct, \obs\}$. We denote with $Z^\rct = \{$


% \kd{modify this paragraph}
%Let $\pfulljoint$ be the joint distributions of the two studies over  $\left(X, U, Y(0), Y(1),Y, T, S\right)$, where $S \sim \ber(\pis)$ is a binary random variable indicating whether a sample comes from os ($S=1$) or rct ($S=0$), and we have that  $$\pfulljoint \mid S=0 = \pfullrct, \quad \pfulljoint \mid S=1 = \pfullobs.$$
% then define the joint distribution of the two studies \begin{equation}
    %\pfulljoint = \pfullobs \pis + \pfullrct (1-\pis).
%\end{equation} 
% We assume that we  observe $n$ i.i.d. samples $D =\{Z_i\}_{i=1}^{n}$, with $Z_i = (X_i,Y_i,T_i,S_i)$ drawn from the marginal of $\pfulljoint$.

\label{sec:ass}
Our goal is to detect bias in the treatment effect estimates derived from the observational study. To this end, we 
need to assume that the randomized~trial is unbiased and contained in the support of the observational study, which is captured by the following standard assumptions.


\begin{assumption}
    \label{asm:internalvalid}
We impose the following assumptions on $\pfullrct$:

 \textit{1) consistency} 
 \begin{equation}
 Y = Y(T)\quad \pfullrct\text{-a.s.}
 \end{equation}
  \textit{2) internal validity via randomized treatment assignment} 
  \begin{equation}
      T \ind (Y(0),Y(1),X,U) \quad \text{and}\quad \mathbb P(T =1 )= \pi\in(0,1)
  \end{equation}
     % \textit{3) support inclusion} $$\supp(\prct_X) \subseteq \supp(\pobs_X).$$
    % \pfull^{\diamond} &= \underbrace{\p_{Y  \mid Y(1),Y(0),T}}_{\defeq \pdet}~\underbrace{\p_{Y(1),Y(0)  \mid X,U}}_{\defeq \pinv}~\underbrace{\p^{\diamond}_{ X,T, U}}_{\defeq \pconf^{\diamond}},\label{eq:factor}
% The treatment is assigned independent of the covariates and the potential outcomes, that is, 
% \begin{align*}
% \pconfrct  =\prct_{T} \prct_{X, U}, \quad \mathrm{with} \quad \prct_T(T=1) = \pi \in (0,1). 
% \end{align*}
\end{assumption} 
Both consistency and internal validity hold by design in a completely randomized experiment, allowing for an unbiased estimation of the treatment effect. 
Moreover, the assumption of support inclusion is strictly weaker than the positivity of trial participation; see, for example,  \cite{stuart2011use, hartman2015sample, andrews2017weighting, nie2021covariate, colnet2022reweighting}. It is also expected to hold in our setting, as it aligns with the design of observational studies by regulatory agencies for drug monitoring~\citep{franklin2019evaluating, schurman2019framework, he2020clinical}.

\paragraph{Remark on complete randomization} We remark that the results in this paper also apply to the case where we use data from an unconfounded observational study instead of a completely randomized trial. That is, for some $\epsilon >0$, we can replace Point 2 with 
\begin{equation*}
      T \ind (Y(0),Y(1),U)\vert X \quad \text{and}\quad \mathbb P(T =1 \vert X) \in (\epsilon,1-\epsilon)~~\prct_X\text{-a.s.}
\end{equation*}
In this case, the propensity score
$\mathbb P(T =1 \vert X)$ is generally not known and needs to be estimated.



% %Observational studies, on the other hand, can have arbitrary confounding structures reflected in $\pconf$, i.e. $\pconfobs  =\pobs_{ T \vert X,U} ~\pobs_{X,U}$.



% % \paragraph{Decomposition of $\pfull^\rct$}
% % We assume that we can factorize the full distribution as follows for rct and os\footnote{We mention that technically, the results presented in this paper only require this decomposition to hold for $\pfull^\rct$.} 
% % \begin{align*}
% % \pfull^{\diamond} &= \underbrace{\p_{Y  \mid Y(1),Y(0),T}}_{\defeq \pdet}~\underbrace{\p_{Y(1),Y(0)  \mid X,U}}_{\defeq \pinv}~\underbrace{\p^{\diamond}_{ X,T, U}}_{\defeq \pconf^{\diamond}},\label{eq:factor}
% % \end{align*}
% % where $\pdet$ is deterministically given by the consistency relation $Y=Y(T)$, $\pinv$ is invariant across studies, and $\pconf^\diamond$ differs for $\diamond \in \{\rct,\obs\}$.

% \paragraph{Internal validity and consistency} Second, we assume that the randomized trial provides valid estimates. This is crucial as we will compare observational estimates with those of the rct to detect bias.\begin{assumption}[Internal validity] 
% \label{asm:internalvalid}
% The treatment is assigned independent of the covariates and the potential outcomes, that is, 
% \begin{align*}
% \pconfrct  =\prct_{T} \prct_{X, U}, \quad \mathrm{with} \quad \prct_T(T=1) = \pi \in (0,1). 
% \end{align*}
% \end{assumption} 

% \paragraph{Support inclusion} Third, the population in the observational study includes the population in the trial.
% \begin{assumption}[Support inclusion] 
% \label{asm:nested_support}
% The support of the randomized trial is included in the support of the observational study. More formally,  
% $$\supp(\prct_X) \subseteq \supp(\pobs_X).$$
% \end{assumption}


 
 











We first discuss the main challenges of estimating treatment effects in observational data. Then, we introduce existing tests that target this problem, along with technical background on the relevant test statistics.
\subsection{Treatment effect estimation}
\label{subsec:oldtests}
The conditional average treatment effect~(CATE) measures the average impact of treatment on the outcome $Y$ conditioned on all the observed features $X$, given by 
\begin{align*}
\yone^\diamond (X) &\defeq \EE_{\pfull^{\diamond}}\left[ Y(1) - Y(0) \mid X \right], \quad \diamond \in \{\rct, \obs\}.
\end{align*}
Under~\Cref{asm:internalvalid},  the CATE in the randomized trial is identifiable by the  IPW estimand, i.e. 
\begin{align*}
 \estimandrct(X) = \EE_{\prct}\left[Y \left(\frac{T}{\pi}-\frac{1-T}{1-\pi}\right) \bigg\vert X\right] =\caterct (X).
\end{align*}
 In contrast, the CATE in the observational study is generally not identifiable due to potential dependencies of $T$ on the unobserved confounder $U$, i.e. we can have \begin{align*}
\estimandobs (X) = \EE_{\pobs}\left[Y \left(\frac{T}{e(X)}-\frac{1-T}{1-e(X)}\right) \bigg\vert X\right] \neq \cateobs(X),
\end{align*}
where $e(X) = \pobs(T=1 \mid X)$ is the propensity score. 

\paragraph{Statistical tests for ATE} Given the challenges associated with estimating treatment effects with non-randomized data, several papers have proposed to detect bias in observational studies using randomized trials~\citep{viele2014use,yangelastic, morucci2023double}. In particular, they introduce statistical tests for the null hypothesis 
$$
\hnullate: \EE_{\pxrct}[\estimandobs (X)] =  \EE_{\pxrct}[\estimandrct(X)].
$$
In our setting, rejecting $\hnullate$ implies that either (i) unobserved confounding is present in the observational study or (ii) the transportability assumption is violated, i.e.,  $\caterct \neq \cateobs$, due to unobserved distribution shifts in $U$. However, even in infinite samples, these tests reject observational studies with negligible bias. In real-world settings, where some degree of bias is likely present, such tests can be too restrictive. 

Another line of work incorporates some tolerance into bias tests~\citep{ de2023hidden}. More specifically, given some user-specified tolerance functions $\lbobs$ and $\ubobs$, they test the null hypothesis
\begin{align}
  \hnull: \EE_{ \pxrct}[\estimandrct(X)] \in \left[ \EE_{ \pxrct}[ \lbobs(X)], \EE_{\pxrct}[\ubobs(X)]\right],
\end{align}
where the tolerance functions satisfy $\lbobs(X) \leq \estimandobs(X) \leq \ubobs(X)$ point-wise. For instance, \citet{de2023hidden} choose sensitivity analysis bounds as tolerance functions and test for unobserved confounding. Similarly, the tolerance functions can be instantiated as the transportability bounds introduced by~\citet{dahabreh2023sensitivity}.

 
\paragraph{Statistical tests for CATE} A crucial limitation of the tests for ATE bias is that they can only detect confounding on average (as illustrated in Section~\ref{sec:exp}), and thus may fail 
to detect bias only present in finer-grained subpopulations. 
 To address this shortcoming without facing multiple testing issues, \citet{hussain2023falsification} propose a kernel test for the null hypothesis \begin{align}
\label{eq:catenull}
    \hnull: \estimandobs (X) = \estimandrct(X) \quad  \pxrct-\mathrm{almost\;surely}.
\end{align}
% The main advantage of this test is that it 
Importantly, this test checks across all (potentially unknown) subpopulations of covariates $X$ for disagreement between the CATE functions.
Outside the context of treatment effect estimation, the problem of testing equality of conditional expectations has been extensively studied~\citep{delgado1993testing,neumeyer2003nonparametric,racine2006testing,luedtke2019omnibus,muandet2020kernel}. In this paper, we focus on the approach proposed by~\citet{luedtke2019omnibus,muandet2020kernel}, which is an extension of the celebrated two-sample kernel test from~\citet{gretton2012kernel}. In essence, these tests reduce to a relatively simple U-statistic, which we will introduce in the following section.
\subsection{U-Statistics}
\label{subsec:ustat}
U-statistics \citep{serfling2009approximation} are widely used in the testing literature and are, for instance, the backbone of kernelized tests \citep{gretton2012kernel}. Given a dataset $D = \{Z_i\}_{i=1}^n$ and a kernel function $h$, the U-statistic is defined by
\begin{equation}
\label{eq:ustat}
    \frac{1}{n(n-1)} \sum_{i\neq j} h(Z_i, Z_j).
\end{equation}
Assuming that the samples are drawn i.i.d.~and that $\mathbb E_{z'} h(z,z') = 0$ for almost every $z$ (which in particular implies $\mathbb E_{z,z'} h(z,z') = 0$), we call this a degenerate U-statistic. As shown in the seminal work by \citet{serfling2009approximation}, the degenerate U-statistic, scaled by $n$, converges in distribution to a weighted sum of centered $\chi^2$ random variables, and its quantiles can be consistently estimated via bootstrapping~\citep{huskova1993consistency}. 

In this paper, we will use a modification of the degenerate U-statistic, called the cross U-statistic, given by
\begin{equation}
\label{eq:crossu}
       \bar U := \frac{4}{n^2} \sum_{i\in \mathcal I_1} \sum_{j \in \mathcal I_2} h(Z_i, Z_j),
\end{equation}
where $\mathcal I_1$ and $\mathcal I_2$ are two disjoint sets of indices of size $n/2$.  The advantage of this statistic is that, under mild conditions \citep{kim2024dimension}, a CLT applies and thus, suitably rescaled, it converges in distribution to a normal distribution $\mathcal{N}(0, \sigma^2)$ with variance
\begin{equation}
\label{eq:variance}
\sigma^2 = \mathbb E_z \left(\frac{2}{n} \sum_{j \in \mathcal I_2} h(z, Z_j)\right)^2 - \left(\mathbb E_z \frac{2}{n} \sum_{j \in \mathcal I_2} h(z, Z_j) \right)^2. 
\end{equation}
In particular, we therefore have, under mild assumptions, that  $\frac{\sqrt{n/2}\, \bar U}{\hat\sigma }\to \mathcal N(0,1)$
with 
\begin{equation}
\label{eq:hatvar_def}
    \hat \sigma^2 = 
    \frac{2}{n} \underset{i\in\mathcal I_1}{\sum}
    \left(\frac{2}{n}\underset{j\in\mathcal I_2}{\sum}  h(Z_i, Z_j)\right)^2 -\bar U^2.
\end{equation}



% Recently, \cite{kim2024dimension} propose a powerful alternative to degenerated U-statistics that is asymptotically normal and referred to as the cross U-statistics. 
% We only focus on its application to the kernel conditional moment test setting. 

% The idea is to split the dataset $\mathcal D = \{Z, X\}_{i=1}^{2n}$ of size $2n$ equally into two  datasets $\mathcal D_1$ and $\mathcal D_2$  with index sets $I_1$ and $I_2$, both of size $n$. We then define 
% $U(Z,X ) = \frac{1}{n} \sum_{j \in \mathcal I_2} \psi(Z) k(X, X_j)  \psi(Z_j)$ and
% $$
% \bar U = \frac{1}{n}\sum_{i \in I_1} U(Z_i)
% $$
% Note that $\bar U$ is a sum of $n$ i.i.d. samples, and thus under weak assumptions \kd{cite the theorem in \citep{kim2024dimension}},  the paper shows that a central limit theorem holds:
% $$
% \sqrt{n} (\bar U - \mathbb E \bar U) \to \mathcal N(0, \sigma^2(\beta))
% $$
% with $\sigma^2 = \mathbb E_Z U(Z)^2 -  (\mathbb E_Z U(Z))^2 $. Since $\mathbb E ~\bar U =\mathbb H^2 =0$ under the Null Hypothesis, \cite{kim2024dimension} propose to use the following test statistics, which asymptotically converges to a standard normal distribution under weak assumptions
% $$
% \frac{\sqrt{n} \bar U}{\sqrt{\frac{1}{n} \sum_{i \in I_1} U(Z_i)^2 - \bar U^2  } } \to \mathcal N(0, 1).  
% $$



\subsection{Kernel conditional moment tests}
\label{subsec:conditionalmoment}
Given two random variables $Z$ and $X$ and a function $\psi$, conditional moment tests aim to test the Null Hypothesis
\begin{align}\label{eq:conditional_moment_null}
        H_0:&\quad \quad \EE\left[\psi(Z) \vert X\right]=0, \quad \mathbb P_X-a.s.
\end{align}
We refer to~\Cref{sec:related_work} for a discussion of different conditional moment test statistics. In this paper, we build upon the kernel conditional moment test introduced by \citet{muandet2020kernel}, which is an extension of the two-sample kernel test by \citet{gretton2012kernel}. 
% has been introduced in \citep{muandet2020kernel} and can be viewed as an extension of the celebrated two sample kernel test \citep{gretton2012kernel}.  The aim is to provide an asymptotically valid test for the Null hypothesis that
% \begin{equation}
% \label{eq:kernel_null}
%      H_0: ~~~\quad\EE\left[\psi(Z) f(X)\right]=0, ~~ \forall f~\in \mathcal F, 
% \end{equation}
% where we choose the function class $\FF$ to be the unit ball in a reproducing kernel hilbert space (RKHS) (see e.g., asdflk). Clearly, rejecting this Null Hypothesis allows us also to reject the Null Hypothesis from Equation~\eqref{eq:conditional_moment_null} that
% $\mathbb E_{Z} \left[\psi(Z) \vert X\right]=0, \mathbb P_X-a.s.$. 
To test the Null Hypothesis, \citet{muandet2020kernel} propose to use a U-statistic to estimate the quantity
\begin{align*}
\tstat^2 &\defeq \left(\sup _{\|f\|_\FF \leq 1}\EE\left[\psi(Z) f(X)\right]\right)^2=  \|\EE\left[\psi(Z) k(X,\cdot)\right] \|_\FF^2\nonumber\\
&= \EE_{(z,x),(z',x')}[\psi(z) k(x,x') \psi(z')], 
\end{align*}
with $k$ the reproducing kernel of an RKHS $\FF$. Note that under the Null Hypothesis, we have that $\tstat^2 =0$, and thus the U-statistic is degenerate and, suitably scaled, converges in distribution to a weighted sum of $\chi^2$ random variables. Computing the quantiles using bootstrapping then gives an asymptotically valid test. 



