\documentclass[accepted]{uai2023} % for initial submission
% \documentclass[accepted]{uai2023} % after acceptance, for a revised
                                    % version; also before submission to
                                    % see how the non-anonymous paper
                                    % would look like
%% There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2023} % ptmx math instead of Computer
                                         % Modern (has noticable issues)
% \documentclass[mathfont=newtx]{uai2023} % newtx fonts (improves upon
                                          % ptmx; less tested, no support)
% NOTE: Only keep *one* line above as appropriate, as it will be replaced
%       automatically for papers to be published. Do not make any other
%       change above this note for an accepted version.

%% Choose your variant of English; be consistent
\usepackage[american]{babel}
% \usepackage[british]{babel}

%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} % amsmath with fixes and additions
% \usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{tikz} % nice language for creating drawings and diagrams
\usepackage{comment}
\usepackage{amsmath}
\usepackage{amssymb}
%% Provided macros
% \smaller: Because the class footnote size is essentially LaTeX's \small,
%           redefining \footnotesize, we provide the original \footnotesize
%           using this macro.
%           (Use only sparingly, e.g., in drawings, as it is quite small.)

%% Self-defined macros


\title{
%Double Penalty Integration Estimator for Combining Randomized Experiments
%and External Controls
Enhancing Treatment Effect Estimation: A Model Robust Approach Integrating Randomized Experiments and External Controls using the Double Penalty Integration Estimator
}

% The standard author block has changed for UAI 2023 to provide
% more space for long author lists and allow for complex affiliations
%
% All author information is automatically removed by the class for the
% anonymous submission version of your paper, so you can already add your
% information below.
%
% Add authors
\author[1]{\href{mailto:<ycheng26@ncsu.edu>?Subject=Your UAI 2023 paper}{Yuwen Cheng}{}}
\author[2]{\href{mailto:<liliwu@microsoft.com>?Subject=Your UAI 2023 paper}{Lili Wu}{}}
\author[3]{\href{mailto:<syang24@ncsu.edu>?Subject=Your UAI 2023 paper}{Shu Yang}{}}

% Add affiliations after the authors
\affil[1]{%
    Statistics Dept.\\
    North Carolina State University
}
\affil[2]{%
    Microsoft Research NYC
}
\affil[3]{%
    Statistics Dept.\\
    North Carolina State University
  }
  % Draft variant of \lbl: sets \label{#1} and also prints the key in a tiny framed superscript.
  \newcommand{\lbl}[1]{\label{#1}{\ensuremath{^{\fbox{\tiny\upshape#1}}}}}
% Final-copy variant: \lbl is a plain \label. This override is currently active;
% comment it out to show the label keys again while drafting.
\renewcommand{\lbl}[1]{\label{#1}}
% Theorem-like environments. Each one has its own counter;
% only `definition' is numbered within the section.
\newtheorem{lemma}{Lemma}
\newtheorem{theorem}{Theorem}
\newtheorem{assumption}{Assumption}
\newtheorem{remark}{Remark}
\newtheorem{corollary}{Corollary}
\newtheorem{example}{Example}
\newtheorem{definition}{Definition}[section]
% NOTE(review): `proof' is declared as a numbered theorem-like environment here;
% this would clash with amsthm's proof environment if amsthm is ever loaded -- confirm.
\newtheorem{proof}{Proof}
\newtheorem{condition}{Condition}
\newtheorem{step}{Step}
% Bold x.
\newcommand{\bx}{\mathbf{x}}
% --- Bold (\mathbf) letter shorthands ---
\newcommand{\bz}{\mathbf{z}}
\newcommand{\bR}{\mathbf{R}}
\newcommand{\bw}{\mathbf{w}}
% --- Hatted and bold Greek symbols ---
\newcommand{\mhat}{\hat{\mu}}                     % mu with a hat
\newcommand{\bmhat}{\mbox{\boldmath$\hat{\mu}$}}  % bold mu with a hat (built with \mbox + \boldmath)
\newcommand{\bs}{\mbox{\boldmath$\sigma$}}        % bold lowercase sigma
\newcommand{\bS}{\mbox{\boldmath$\Sigma$}}        % bold uppercase Sigma
%\newcommand{\vtheta}{\hat{V}}
% Calligraphic F (the macro \F defined further down has the same expansion).
\newcommand{\ch}{{\mathcal{F}}}
% --- Shorthands that open/close environments (equation, eqnarray, array) ---
\newcommand{\be}{\begin{equation}}
\newcommand{\en}{\end{equation}}
\newcommand{\bea}{\begin{eqnarray}}
\newcommand{\ena}{\end{eqnarray}}
\newcommand{\ba}{\begin{array}}
\newcommand{\percent}{\%}
\newcommand{\ea}{\end{array}}
\newcommand{\dis}{\displaystyle}
% --- Manual layout helpers ---
% \Pf prints an underlined italic "Proof" run-in heading with fixed spacing.
% It uses \no, which is only defined further down; that is fine because the
% body of a \newcommand is expanded at use time, not at definition time.
\newcommand{\Pf}{\vspace{0.3 cm}\no\underline{\it Proof}\hspace{0.7 cm}}
\newcommand{\vs}{\vspace{0.6 cm}}
\newcommand{\hs}{\hspace{1 cm}}
% --- Symbol shorthands (sample means, sums, estimators) ---
% NOTE(review): most of the macros in this group are not referenced in the
% portion of the manuscript reviewed here -- confirm they are still needed
% before relying on or removing them.
\newcommand{\adrsq}{(1-\frac{1}{r})S_r^2}
\newcommand{\bym}{\bar{y}_m^*}
\newcommand{\byr}{\bar{y}_r}
\newcommand{\sumh}{\sum_{c=1}^C}
\newcommand{\hyi}{\hat{Y}_I}
\newcommand{\A}{A}
\newcommand{\rr}{A_R\,}
\newcommand{\hnv}{\hat{N}_v}
\newcommand{\hrv}{\hat{R}_v}
\newcommand{\byn}{\bar{y}_n}
\newcommand{\bxm}{\bar{x}_m^*}
\newcommand{\byi}{\hat{\mu}_I}
\newcommand{\bxr}{\bar{x}_r}
\newcommand{\bxn}{\bar{x}_n}
\newcommand{\bxi}{\bar{x}_I}
\newcommand{\byrv}{\bar{y}_r^v}
\newcommand{\byii}{\hat{\mu}_{I(-i)}}
\newcommand{\sumi}{\sum_{i=1}^{n}}
\newcommand{\vnaive}{V_{JK}^I}
% Short alias for \noindent (used by \Pf above).
\newcommand{\no}{\noindent}
\newcommand{\R}{{\mathcal{R}}}
\newcommand{\pop}{{\mathbf{Y}}}
% Blackboard-bold P (\mP and \bP further down expand to the same symbol).
\newcommand{\pr}{\mathbb{P} }
% Independence symbol: two \perp glyphs pulled together with negative thin spaces.
\newcommand{\indep}{\perp \!\!\! \perp}
% Calligraphic H. Defined with \def, so an existing \mH would be overwritten silently.
\def\mH{\mathcal{H}}


% --- Math-mode label and symbol shorthands ---
% \T is the upright, script-script-sized transpose mark; in the text it appears as a superscript.
\newcommand{\T}{\mathrm{\scriptscriptstyle T}}
% Upright (roman) multi-letter labels and operator names. Most bodies start
% with a space and carry an extra brace group; the bodies are left exactly as written.
\newcommand{\mis}{ {\mathrm{mis}}} 
\newcommand{\ATT}{ {\mathrm{ATT}}} 
\newcommand{\adj}{ {\mathrm{adj}}} 
\newcommand{\mat}{ {\mathrm{mat}}} 
\newcommand{\obs}{ {\mathrm{obs}}} 
\newcommand{\var}{ {\mathrm{var}}} 
\newcommand{\cov}{ {\mathrm{cov}}} 
\newcommand{\dsm}{ {\mathrm{dsm}}} 
\newcommand{\psm}{ {\mathrm{psm}}} 
\newcommand{\prog}{ {\mathrm{prog}}} 
\newcommand{\J}{ {\mathcal{J}}} 
\newcommand{\mP}{ {\mathbb{P}}} % same symbol as \pr and \bP
\newcommand{\plim}{ {\mathrm{plim}}} 
\newcommand{\F}{ {\mathcal{F}}} % same symbol as \ch
\newcommand{\rep}{ {\mathrm{rep}}} 
\newcommand{\reg}{ {\mathrm{REG}}} 
\newcommand{\nni}{ {\mathrm{NNI}}} 
\newcommand{\nnri}{ {\mathrm{NNRI}}}
\newcommand{\HT}{ {\mathrm{HT}}} 
\newcommand{\N}{ {\mathcal{N}}} 
\newcommand{\I}{ {\tau}} % NB: expands to \tau, not to an "I"-shaped symbol
\newcommand{\It}{ \mathcal{I}} % calligraphic I
\newcommand{\logit}{ {\mathrm{logit}}} 
\newcommand{\de}{ {\mathrm{d}}} % upright d
\newcommand{\mx}{ {\mathrm{m.x}}} 
\newcommand{\dm}{ {d_{V}}} 
% Blackboard-bold E, P and V (the text uses \E for conditional expectations).
\newcommand{\E}{ {\mathbb{E}} } 
\newcommand{\bP}{ {\mathbb{P}}} 
\newcommand{\V}{ {\mathbb{V}}} 
\newcommand{\bone}{\mathbf{1}}
\newcommand{\cU}{ {\mathcal{U}}} 





% Label first-level enumerate items with a lowercase letter and a closing parenthesis: a), b), c), ...
\renewcommand{\labelenumi}{\alph{enumi})}

  \begin{document}
\maketitle
\begin{abstract}
Randomized experiments (REs) are the cornerstone for treatment effect evaluation. However, due to practical considerations, REs may encounter difficulty recruiting sufficient patients. External controls (ECs) can supplement REs to boost estimation efficiency. Yet, there may be incomparability between ECs and concurrent controls (CCs), resulting in misleading treatment effect evaluation. We introduce a novel bias function to measure the difference in the outcome mean functions between ECs and CCs. We show that the ANCOVA model augmented by the bias function for ECs renders a consistent estimator of the average treatment effect, regardless of whether or not the ANCOVA model is correct.  To accommodate possibly different structures of the ANCOVA model and the bias function, we propose a double penalty integration estimator (DPIE) with different penalization terms for the two functions. With an appropriate choice of penalty parameters, our DPIE ensures consistency, oracle property, and asymptotic normality even in the presence of model misspecification. DPIE is at least as efficient as the estimator derived from REs alone, validated through theoretical and experimental results.
\end{abstract}



\section{Introduction}\label{sec:intro}
Randomized experiments (REs), which allow researchers to scientifically quantify the impact of an intervention on a particular outcome of interest, are widely employed in a variety of areas. To make informed decisions, technology businesses always conduct A/B testing to evaluate new technologies, using a randomized experiment to compare the performance of each new software implementation with the previous version. Meanwhile, in the medical domain, randomized clinical trials (RCTs) ensuring no systematic differences between treatment groups are the cornerstone of treatment effect evaluation. When analyzing data from REs, analysis of covariance (ANCOVA) is a popular method that can provide consistent results, even if the model is misspecified. REs often require the use of external data to analyze treatment effects better: for example, A/B testing is time-consuming and requires a reasonably high number of users; thus, it is crucial to do a preliminary offline evaluation of external data to implement new interventions more efficiently and eliminate ineffective ones in advance \citep{gilotte2018offline}; meanwhile, if data from earlier clinical stages (Phase I or II) indicate that the product under investigation has a favorable benefit-risk profile in a disease area with unmet healthcare needs, then it is possible to design an RCT with a larger treatment group and a relatively smaller concurrent control (CC) group. Because the small CC group cannot provide sufficient power to the trial, it is reasonable to augment RCTs with external controls (ECs) from earlier trials \citep{yuan2019design}. In this paper, we propose a new method that combines the ECs with CCs to improve average treatment effect (ATE) estimation.

Since \citet{pocock1976combination}, who first introduced historical controls to incorporate external data into analysis, numerous statistical methods have been developed. Specifying a set of covariates in advance and then calculating the propensity score for matching, stratification, or weighting \citep{greenland1999causal,rubin1996matching,rubin2007design,hernan2016using}
is typical. However, these methods rely on the exchangeability assumption that there are no unmeasured confounders between ECs and CCs, which is unlikely in real-world applications. Additionally, Bayesian approaches \citep{spiegelhalter2004incorporating,hobbs2013adaptive,schmidli2014robust,ibrahim2000power,hobbs2012commensurate,neuenschwander2009note} can handle datasets combining both ECs and CCs: appropriate priors can be selected for incorporating the ECs after evaluating the relationship between the ECs and the CCs. Nevertheless, these methods can result in type I error inflation \citep{viele2014use}. Building upon the work of \citet{stuart2008matching} and \citet{yang2022improved}, who introduced a parametric bias function to adjust for the outcome heterogeneity between the control groups due to unmeasured confounders, \citet{wu2022integrative} advanced the idea by using the sieve estimation approach \citep{chen2007large} to estimate the unknown outcome model and bias function. The bias function in their approach measures the difference between experimental data and observational data. However, \citet{wu2022integrative} did not fully leverage the advantage of REs \citep{wang2021model} since these prior works were based on the assumption that the outcome model was correctly specified. In this paper, we extend the concept of the bias function to handle cases of possible model misspecification. Our bias function measures the difference between the EC outcome mean function and the working model in REs and guarantees consistency even when the outcome mean model is not correctly specified, providing a robust solution to the challenge of model misspecification. This is of great practical significance as simple models such as ANCOVA are commonly used despite the possibility of misspecification. 

We adopt the nonparametric sieve estimation approach \citep{chen2007large} to accurately estimate the bias function; it is therefore important to utilize feature selection techniques. These techniques tackle the high-dimensional aspect of the basis functions used in sieve estimation and address the possibility of the working outcome mean model containing irrelevant covariates. To this end, penalty terms with regularization parameters can be added to the objective function for each parameter. However, using the same regularization terms for both the REs and ECs can cause issues due to their differing levels of sparsity and magnitude. Therefore, we implement different penalty terms for the unknown bias function and the working outcome mean model, considering their distinct levels of sparsity and magnitude.

Multiple methods have employed different penalties for different goals. However, they focused on decomposing one function into different parts and applying different penalties to those parts. \citet{chernozhukov2017lava} proposed the Lava estimator by decomposing the signals into a dense part and a sparse part, and then applying different penalties to each component; \citet{buhlmann2020deconfounding} proposed a spectral deconfounding approach to estimate sparse parameters given hidden variables, and demonstrated the Lava method \citep{chernozhukov2017lava} as one of their special cases; \citet{xing2021adaptive}, further, focused on the estimation of multivariate regression with hidden variables, and demonstrated that their method can be viewed as the multivariate generalization of the Lava approach \citep{chernozhukov2017lava}. In addition to decomposing the parameters into sparse and dense parts, \citet{wang2019double} decomposed the function into an easy-to-interpret part and an uninterpretable part, and then applied double penalties. Our approach makes use of double penalties to deal with the possibly different structures of the working outcome mean function and bias function to consistently select useful terms and enhance the efficiency of the ATE estimator. 

Different bias functions can utilize varying penalties, depending on their structure. Our study focuses on the use of the Smoothly Clipped Absolute Deviation (SCAD) penalty \citep{fan2001variable,fan2004nonconcave} to illustrate the theorem in the context of variable selection. This is because SCAD offers both oracle properties and asymptotic normality by selecting the appropriate regularization parameters. Nevertheless, the existing results are limited to situations where the models are correctly specified. To overcome this limitation, we present a novel proof for the SCAD penalty that extends its desirable properties to scenarios involving potential misspecification of models.


Our main contributions can be summarized as follows:
\begin{enumerate}
\item We present a novel bias function to combine REs and ECs and use sieve estimation \citep{chen2007large} to provide a flexible and computationally feasible way of estimating the unknown bias function. Our ATE estimator
for REs is consistent regardless of the specification of the working
outcome mean model.
\item We introduce the Double Penalty Integration Estimator (DPIE), which
employs different penalized terms for the unknown bias function and
the working outcome mean model to differentiate their different levels
of sparsity and magnitude. We prove that DPIE guarantees consistency
for the parameters that minimize the least squares loss
%the Kullback-Leibler Information Criterion
% (KLIC) between the true model and the working model \citep{white1982maximum}
and has the oracle property of only selecting non-zero parameters
and exhibiting asymptotic normality under the SCAD penalties.
\item We demonstrate that combining different data sources results in a
more efficient estimated ATE than using only REs, as long as the number
of basis function terms in the bias function is fewer than that of
the working outcome mean function. The oracle property of DPIE ensures the selection of relevant basis terms when the outcome mean function is more complex and less smooth than the bias function, leading to improved efficiency. On the other hand, using single penalties may result in a loss of the oracle property and failure to select useful basis terms, leading to decreased efficiency.
%that it can select non-zero basis terms when the outcome mean function
%contains more terms and is less smooth than the bias function, thereby
%improving efficiency. In contrast, single penalties may lose the oracle
%property, fail to select useful basis terms, and result in lost efficiency.
\end{enumerate}
The rest of the paper is organized as follows. We introduce the basic
idea in Section~\ref{sec:Problem-Setup}. Section~\ref{sec:Method}
introduces the proposed DPIE estimator and derives the theorem. We
conduct simulations for comparison in Section~\ref{sec:Simulation}.
Section~\ref{sec:Real-Data-Analysis} applies the proposed estimators
to an observational study from the National Supported Work (NSW) and
Current Population Survey (CPS). Finally, we conclude the paper with
a discussion in Section~\ref{sec:Discussion}.




\section{PROBLEM SETUP}\label{sec:Problem-Setup}
Denote $X\in\mathcal{X}\subset\mathbb{R}^{d}$ as the vector of pre-treatment covariates, $A\in\{0,1\}$ as the binary treatment, and $Y\in\mathbb{R}$ as the outcome of interest. Following the potential outcomes framework \citep{neyman1923application,rubin1974estimating}, let $Y(a)$ be the potential outcome for the subject given the treatment $a$, for $a=0,1$.

In real life, one can use previous trials or real-world data as ECs to supplement the REs. Assume that two data sources are accessible: the RE data source, which has $n$ independent and identically distributed (i.i.d.) subjects $\left\{ \left(X_{i},A_{i},Y_{i}\right):i\in{\rm \mathcal{I}_{{\rm RE}}}\right\} $ with $n_{1}$ concurrent treatments $\left\{ \left(X_{i},1,Y_{i}\right):i\in{\rm \mathcal{I}_{{\rm CT}}}\right\} $ and $n_{0}$ concurrent controls $\left\{ \left(X_{i},0,Y_{i}\right):i\in{\rm \mathcal{I}_{{\rm CC}}}\right\} $, and the EC data source, which has $m$ i.i.d.\ subjects $\left\{ \left(X_{i},0,Y_{i}\right):i\in{\rm \mathcal{I}_{{\rm EC}}}\right\} $.
Let $N=n+m$ be the total sample size. Define $S$ as the indicator of the subject in the REs: $S_{i}=1$ for $i\in{\rm \mathcal{I}_{{\rm RE}}}$ and $S_{i}=0$ for $i\in{\rm \mathcal{I}_{{\rm EC}}}$. Then the ATE is $\tau=\mathbb{E}\left\{ Y\left(1\right)-Y\left(0\right)\mid S=1\right\} $. Further, let $e\left(X\right)=\mathbb{P}\left(A=1\mid X,S=1\right)$ be the propensity score and also define the conditional outcome mean function as $\mu_{a,s}(X)=\E\left(Y\mid X,A=a,S=s\right )$ %with $\hat{\mu}_{a,s}(X)$ as the corresponding estimator of $\mu_{a,s}(X)$ 
for $a=0,1$ and $s=0,1$.

One of the fundamental challenges to identifying the ATE is that $Y(1)$ and $Y(0)$ cannot be observed simultaneously. To overcome this issue, we make the following three common assumptions in the causal inference literature \citep{rubin1978bayesian}:

\begin{assumption}\label{asump-ignorable} $\{Y(0),Y(1)\}\indep A\mid X,S=1$ almost surely, where $\indep$ means ``independent of''. \end{assumption}
\begin{assumption}\label{consistency} $Y=Y(1)A+Y(0)(1-A)$. \end{assumption}
\begin{assumption}\label{asump-overlap}There exist constants $c_{1}$ and $c_{2}$ such that $0<c_{1}\leq e(X)\leq c_{2}<1$ almost surely.
\end{assumption} 

Assumption \ref{asump-ignorable} states that the treatment assignment is unconfounded in the REs. 
Assumption \ref{consistency} ensures observed outcomes correspond to potential outcomes given the received treatments.
Under Assumptions \ref{asump-ignorable} and \ref{consistency},
%with $X=x$ , we have $\tau(x)=\E\left\{ Y\left(1\right)-Y\left(0\right)\mid X=x,S=1\right\} =\E(Y\mid X=x,A=1,S=1)-\E(Y\mid X=x,A=0,S=1)$, and 
the conditional outcome mean function in REs is $\mu_{a,1}\left(X\right)=\E\left\{ Y\left(a\right)\mid X,S=1\right\} =\E( Y\mid X,A=a,S=1) $.
Assumption \ref{asump-overlap} implies a sufficient overlap of the covariate distribution between the treatment groups, so that averaging the treatment effect over the distribution of $X$ is feasible; thus,
the ATE is $\tau =\E\left\{ \mu_{1,1}\left(X\right)-\mu_{0,1}\left(X\right)\mid S=1\right\} $.
%$\tau=\E\left\{ \tau(X)\mid S=1\right\} =\E\{ \E(Y\mid X,A=1,S=1)-\E(Y\mid X,A=0,S=1)\mid S=1\} =\E\left\{ \mu_{1,1}\left(X\right)-\mu_{0,1}\left(X\right)\mid S=1\right\} $.

The Analysis of covariance (ANCOVA) model is a powerful tool for estimating the ATE in REs. The randomization design  allows for the ATE estimator $\hat{\tau}$ to be
consistent and asymptotically normal under arbitrary misspecification of the ANCOVA model \citep{wang2021model}. Following the common practice, we use the ANCOVA model as the working model in REs. To enhance the model's generality,
we incorporate a $k_{1}$-dimensional basis function of $X$, $p_{\mu}(X)=\left\{ p_{\mu,1}(X),\ldots,p_{\mu,k_{1}}(X)\right\} ^{{\T}}$, into the ANCOVA model as $\bar{\mu}_{A,1}(X;\beta)=\beta_{\rm {int}}+\beta_{A}A+\beta_{X}^{{\T}}p_{\mu}(X)$, where $\beta$ is a $ K_1=(k_1+2)$-dimensional parameter $\left(\beta_{\rm{int}},\beta_{A},\beta_{X}^{{\T}}\right)^\T$. Under the ANCOVA model, it is common to utilize ordinary least squares estimators for parameter estimation. Denote $\beta_*=\left(\beta_{\rm{int*}},\beta_{A*},\beta_{X*}^{{\T}}\right)^\T$ as the minimizer of $\E [\{Y-\bar{\mu}_{A,1}(X;\beta)\}^{2} \mid S=1]$. Importantly, $\beta_{A*}$ is the ATE $\tau$ regardless of the correctness of the working model. 

% Denote $\beta_*=\left(\beta_{\rm{int*}},\beta_{A*},\beta_{X*}^{{\T}}\right)^\T$ as the minimizer 
% of $\E [\{Y-\bar{\mu}_{A,1}(X;\beta)\}^{2} \mid S=1 ]$. 
% %$:=\beta^\T p_\mu$. %, where $\beta^\T=\left(\beta_{\rm{int}},\beta_{A},\beta_{X}^{{\T}}\right)$ and $p_{\mu}^\T=\left\{ 1,A,p_{\mu}^{{\T}}(X)\right\}$ with dimension $K_{1}=k_{1}+2$. 
% Importantly, for $S=1$, $\beta_A$ is the ATE $\tau$ regardless of the correctness of the working model.

To utilize  ECs to augment REs, we consider the following assumption such that the EC covariate distribution is nested in the RE covariate distribution.
\begin{assumption}\label{asump-HCccoverlap}
%$0<\mathbb{P}(S = 1|X) <1$ almost surely.
$f(X\mid S=0)/f(X\mid S=1)<\infty$ almost surely. 
\end{assumption} 

To use ECs to supplement CCs, it is crucial to remove biases of EC data due to possible incomparability between ECs and CCs. We define the bias function as 
\begin{align*}
b_{0}(X)=\mathbb{E}(Y\mid X,A=0,S=0) -\bar{\mu}_{0,1}(X;\beta).
\end{align*}
We omit the dependence of $b_0(X)$ on $\beta$ for clarity. 
Assumption \ref{asump-HCccoverlap} ensures that $b_0(X)$ is well-defined for all $X$ such that $f(X\mid S=0)>0$. 
If the working model $\bar{\mu}_{0,1}(X;\beta)$ is correctly specified, the bias function at $\beta_*$ reduces to $\mathbb{E}\left( Y\mid X,A=0,S=0\right) -\mathbb{E}\left( Y\mid X,A=0,S=1\right)$, which measures the difference of the conditional mean of the control outcome given $X$ between ECs and CCs. In this case, if $X$ captures all confounders of $S$ and $Y$, then $\E\left( Y\mid X,A=0,S=1\right) =\E\left( Y\mid X,A=0,S=0\right) $, and thus $b_{0}(X)\equiv0$; otherwise, $b_{0}(X)\neq0$. This special case is discussed in \citet{wu2022integrative}, but their analysis requires the outcome model to be correctly specified. In contrast, our setup does not necessitate a correctly specified outcome model. 

For the combined data, let the ANCOVA model augmented by the  bias function $b_{0}(X)$ be $$\bar{\mu}_{A,S}(X;\beta)=\beta_{\rm{int}}+\beta_{A}A+\beta_{X}^{{\T}}p_{\mu}(X)+(1-S)b_{0}(X),$$ then
\begin{align*}
&\bar{\mu}_{0,0}(X;\beta)=\beta_{\rm{int}}+\beta_{X}^{{\T}}p_{\mu}(X)+b_{0}(X)\\
  =&\bar{\mu}_{0,1}(X;\beta)+\mathbb{E}(Y\mid X,A=0,S=0)-\bar{\mu}_{0,1}(X;\beta)\\
  =&\mathbb{E}(Y\mid X,A=0,S=0).
\end{align*}
An important implication is that even if the outcome working model is misspecified, incorporating the bias function $b_{0}(X)$ ensures that $\bar{\mu}_{0,0}(X;\beta)$ recovers the true outcome mean under ECs. 
%This helps to account for any unmeasured confounding factors and ensures that the estimated treatment effect $\hat{\tau}$ is consistent and can be more efficient by leveraging additional information in ECs. 
Denote $\beta_*=\left(\beta_{\rm{int*}},\beta_{A*},\beta_{X*}^{{\T}}\right)^\T$ as the minimizer of $\E [\{Y-\bar{\mu}_{A,S}(X;\beta)\}^{2}]$. 
The following theorem demonstrates that  $\beta_{A*}$ still identifies the ATE $\tau$ in the combined RE and EC data.

\begin{theorem} \label{iden}(Identification)  
%The ANCOVA model is
%\[
%Y=\beta_{\rm{int}}+\beta_{A}A+\beta_{X}^{{\T}}p_{\mu}(X)+(1-S)b_{0}(X)+\epsilon,\ \E(\epsilon)=0.
%\]
Under Assumptions \ref{asump-ignorable}--\ref{asump-HCccoverlap} and the augmented ANCOVA model, we have $\beta_{A*}=\tau.$ 
\end{theorem}

Theorem \ref{iden} provides a vehicle to integrate REs and ECs for robust estimation of the ATE.  Consistent estimation of ${\tau}$ still depends on an accurate approximation  of unknown $b_{0}(X).$ 
Thus, we adopt the method of sieves \citep{chen2007large}. Denote $p_{b}(X)$ as the $K_{2}$-dimensional basis functions. Based on Theorem S1 in the Supplementary Material, there exists a $K_{2}$-vector $\delta_{*}$
such that the uniform convergence 
$p_{b}^{{\T}}(X)\delta_{*}\rightarrow b_{0}(X)$ and therefore the uniform convergence 
\begin{align*}
&\beta_{\rm{int}}+\beta_{A}A+\beta_{X}^{{\T}}p_{\mu}(X)+(1-S)p_{b}^{{\T}}(X)\delta_{*}\\
\rightarrow & \beta_{\rm{int}}+\beta_{A}A+\beta_{X}^{{\T}}p_{\mu}(X)+(1-S)b_{0}(X)
\end{align*} 
hold as $K_{2}\rightarrow\infty$. 
Then our final working model becomes 
\begin{equation*}
\bar\mu_{A,S}(X;\beta,\delta) =\beta_{\rm{int}}+\beta_{A}A+\beta_{X}^{{\T}}p_{\mu}(X)+(1-S)\delta^{{\T}}p_{b}(X).
\end{equation*}
%We will clarify the conditions under which combining ECs can improve in Theorem \ref{theorem 4}, detailed in Section \ref{sec:Method}.
% Denote 
% $\beta^\T=\left(\beta_{\rm{int}},\beta_{A},\beta_{X}^{{\T}}\right)$ with dimension $K_1=k_1+2$, and $\theta^{{\T}}=(\beta^{{\T}},\delta^{{\T}})$ and $p=\{ 1,A,p_{\mu}^{{\T}}(X),(1-S)p_{b}^{{\T}}(X)\} ^{{\T}}$ with dimension $K=K_{1}+K_{2}$. %, where $K_{1}$ and $K_{2}$ increase with sample size $N$ such that under REs $p^{{\T}}\theta=p_{\mu}^{{\T}}\beta$ and
% %under ECs $p^{{\T}}\theta=p_{\mu}^{{\T}}\beta+p_{b}^{{\T}}\delta.$ 
% %Additionally, to handle the case of model misspecification, 
% We assume that the true density function for $(p,Y)$ is $g(p,Y)$. Since $g$
% is an unknown function, we select a family of distribution functions
% $f(p,Y,\theta)$, which may or may not contain the true structure of
% $g$. Denote $\theta_{*}=(\beta_{*}^{{\T}},\delta_{*}^{{\T}})^{{\T}}$ as the parameter vector which minimizes
% the Kullback Leibler Information Criterion (KLIC), $KLIC(g:f,\theta)=\mathbb{E}\left[\log\left\{ g(p,Y)/f(p,Y,\theta)\right\} \right]$.
% Under appropriate assumptions on $f$ and $g$, the maximum likelihood
% estimator $\hat{\theta}$ is a consistent estimator of $\theta_{*}$
% \citep{white1982maximum}. If the model is correctly specified, meaning
% $f(p,Y,\theta_{0})=g(p,Y)$ for some $\theta_{0}$ in the parameter
% space $\Theta$, then $\theta_{*}=\theta_{0}$.
%Furthermore, %according to Theorem S2 in the Supplementary Material, 
%denote $(\beta_*,\delta_*)$ as the minimizer of $\E \{Y-\bar{\mu}_{A,S}(X;\beta,\delta)\}^{2}$. 
% For consistency in notation, we hereafter denote $\delta_0$ as $\delta$, so that $(\beta_*,\delta_*)$ represents the minimizer of the least squares $\E \{Y-\bar{\mu}_{A,S}(X;\beta,\delta)\}^{2}$.
%The objective function for estimating $\beta_*$ and $\delta_*$ using least squares is as follows:
%\begin{align*}
%\left(\hat{\beta},\hat{\delta}\right) & =\underset{\beta,\delta}{\rm argmin} \sum_{i=1}^{N}\{Y_i-\bar{\mu}_{A_i,S_i}(X_i;\beta,\delta)\}^{2}.
%\end{align*}

% Let $L(\beta,\delta)$
% be the quasi-log-likelihood function of the observations $\left(p_{1},Y_{1}\right),\ldots,\left(p_{N},Y_{N}\right)$,
% then the corresponding maximum likelihood estimator is 
% \begin{align*}
% \left(\hat{\beta},\hat{\delta}\right) & =\underset{\beta,\delta}{\rm argmax} \left\{ L\left(\beta,\delta\right)\right\} ,\\
%  & =\underset{\beta,\delta}{\rm argmax} \left[\sum_{i=1}^{N}\left\{ \ln f\left(Y_{i},p_{i,}\beta,\delta\right)\right\} \right].
% \end{align*}
We consider using the least squares loss function to obtain estimators for $\beta_*$ and $\delta_*$. 
To overcome the risk of overfitting in sieve estimation, where high-dimensional
basis functions are used, it is necessary to add regularizers on $\delta_*$ %$\Lambda_{\delta}$
to the loss function. Additionally, as the working models $\bar{\mu}_{A,1}(X;\beta)$
may contain irrelevant covariates, adding regularizers on $\beta_*$ %$\Lambda_{\beta}$ 
is recommended to select proper covariates. 
In the subsequent section, we thoroughly explore the significance of the structural properties of regularizers to effectively accommodate the inherent characteristics of the problem at hand.

\begin{comment}
There are various methods one can choose from to achieve this, such
as penalized regression, like Lasso \citep{tibshirani1996regression},
Smoothly Clipped Absolute Deviation (SCAD) \citep{fan2001variable,fan2004nonconcave}
penalties, or use black-box methods like the random forest. In this context,
we will focus on using the SCAD penalty to illustrate the theorem
for variable selection. It is worth mentioning that our setup can
also be developed for different penalty functions, but we will not
be discussing those in this study. 

It is important to note that $\beta_{*}$ and $\delta_{*}$ may have
distinct complexities, and it is beneficial to apply different penalties, instead of the same penalty, to both parameters. A toy numerical experiment in the Supplemental Materials demonstrates that utilizing different penalties for $\beta_{*}$ and $\delta_{*}$ yields superior performance compared to using a single penalty, particularly when $\beta_{*}$ and $\delta_{*}$ have different magnitudes.
%instead of the same penalty, to both parameters. To elaborate this,
%consider the case $y=x^{{\T}}\beta+(1-s)x^{{\T}}\delta+\epsilon,\ x=\left(x_{1},\ldots,x_{50}\right)^{{\T}}\in\mathbb{R}^{50},\ \beta^{{\T}}=(1,\ldots,50)/10,\ \delta^{{\T}}=(1,\ldots,50)\times30,\epsilon\sim\mathcal{N}(0,1)$,
%where $s$ denotes the zero-one indicator variable that determines
%whether the observation belongs to the REs, for simplicity, we assume
%$x^{{\T}}\beta$ is the correct outcome mean function of the REs,
%and $x^{{\T}}\delta$ is the bias function reflecting the difference
%between ECs and REs, i.e., if $\delta=0$, then the observed covariates
%capture all confounders in the ECs and REs and thus the exchangeability
%assumption is valid. For didactic purposes, the magnitude of $\delta$
%is much larger than the magnitude of $\beta$. Using the same regularization
%parameter appears to assign the same weight for $\beta$ and $\delta$,
%thus any penalty regularization methods tend to omit small signals
%$\beta$ and only pick up big signals $\delta.$ Therefore, in order
%to make penalizations between different parameters comparable, it
%is crucial to add regularizations to $\beta$ and $\delta$ separately.
%Figure \ref{example2-1} shows the smoothed linear regression between
%$\hat{\beta}$ and $\beta$ after applying the single-penalty regularization
%(denoted as ``Single'') and the double-penalty regularizations (denoted
%as ``Double'') to select variables and refitting the model using
%selected variables, where double penalties make $\hat{\beta}$ more
%accurate than the single penalties. 
%
%\begin{figure}[h]
%\center{} \includegraphics[scale=0.15]{\string"/Users/yuwencheng/Dropbox/Project2HC@ yuwen/manuscript/example2\string".png}
%
%\caption{\label{example2-1}The smoothed linear regression between $\hat{\beta}$
%and $\beta$ with the 95\% confidence intervals as the shaded area
%and $(\beta,\hat{\beta})$ as the points.}
%\end{figure}

Hence, the penalized least squares estimator using double penalties is 
\begin{equation*}
\begin{split}
\left(\hat{\beta},\hat{\delta}\right)&=\underset{\beta,\delta}{\rm argmin} 
\Biggl[
\sum_{i=1}^{N}\{Y_i-\bar{\mu}_{A_i,S_i}(X_i;\beta,\delta)\}^{2}\\
&+\Lambda_{\beta,\lambda_{1}}+\Lambda_{\delta,\lambda_{2}}
\Biggl].
\end{split}
\end{equation*}
where $\Lambda_{\beta,\lambda_{1}}$ and $\Lambda_{\delta,\lambda_{2}}$
are non-convex penalty functions on $\beta$ and $\delta$ with regularization
parameters $\lambda_{1}$ and $\lambda_{2}$, separately. 
The ATE estimator is $\hat\tau=\hat\beta_A$. 
%The goal is to estimate $\beta_{A}$.
\end{comment}

\section{A DOUBLE PENALTY REGULARIZATION METHOD}\label{sec:Method}
\subsection{Main Idea}\label{subsec:Main-Idea}

The common regularization methods use the same regularization parameter %$\lambda$ 
for the two penalty functions on $\beta_*$ and $\delta_*$. %$\Lambda_{\beta,\lambda}$ and $\Lambda_{\delta,\lambda}$. 
However, it is important to note that $\beta_{*}$ and $\delta_{*}$ may have distinct complexities (magnitude and/or sparsity), and it is beneficial to apply different penalties, instead of the same penalty, to both parameters.

To begin with, we denote $P_{\lambda}\left(\gamma\right)=\lambda P(\gamma)$
as the penalty function with regularization parameter $\lambda$ for
any parameter $\gamma$ and any penalization $P(\cdot)$. 
There are various choices for the penalty function, such as the Lasso \citep{tibshirani1996regression}
and the Smoothly Clipped Absolute Deviation (SCAD) \citep{fan2001variable,fan2004nonconcave}
penalties; alternatively, one can use black-box methods such as random forests. 
To overcome the limitations caused by adding the same penalty to all
parameters, we set up different penalties for $\beta$ and $\delta$
separately, and propose the double penalty integration estimator (DPIE). Specifically, the proposed DPIE under the least squares loss is 
\begin{equation*}
\begin{split}
\left(\hat{\beta},\hat{\delta}\right)&=\underset{\beta,\delta}{\rm argmin} 
\Biggl[
\sum_{i=1}^{N}\{Y_i-\bar{\mu}_{A_i,S_i}(X_i;\beta,\delta)\}^{2}\\
&+N\sum_{j=1}^{K_{1}}P_{\lambda_{1,j}}\left(\vert\beta_{j}\vert\right)+N\sum_{j=1}^{K_{2}}P_{\lambda_{2,j}}\left(\vert\delta_{j}\vert\right)
\Biggr].
\end{split}
\end{equation*}
There are multiple ways to search for the two regularization parameters $\lambda_{1}$
and $\lambda_{2}$. One simple way is to define a scaling tuning parameter
$sc$ as $sc=\lambda_{2}/\lambda_{1}$; then one can use cross-validation
to choose $sc$ given a particular search range and, within each $sc$
value, one can also use cross-validation to choose $\lambda_{2}$.
Both cross-validation steps can use the \texttt{cv.ncvreg}
function in the R package \texttt{ncvreg}, which finds the tuning
parameter based on the minimum cross-validated error.

If we only use the RE data, we have 
\begin{equation*}
\begin{split}
\hat{\beta}_{\rm {RE}}&=\underset{\beta}{\rm argmin} 
\Biggl[
\sum_{i=1}^{N} S_{i} \{Y_i-\bar{\mu}_{A_i,S_i}(X_i;\beta,\delta)\}^{2}\\&+n\sum_{j=1}^{K_{1}}P_{\lambda_{1,j}}\left(\vert\beta_{j}\vert\right)
\Biggr].
\end{split}
\end{equation*}

\subsection{Theoretical Analysis}\label{subsec:Theoretical-Analysis}

The goal of this subsection is to derive the statistical properties of the DPIE. More interestingly, we aim to show the DPIE is at least as efficient as the ATE estimator based only on the REs. 

For concreteness, we will focus on using the SCAD penalty for illustration. It is worth mentioning that our setup can also be developed for different penalty functions, but we will not
be discussing those in this study.  
When the working function $\bar\mu_{A,S}(X;\beta,\delta)$ is correctly specified, % and the  density function of $(Y,A,S,X)$ is known, 
the penalized maximum likelihood estimator with the SCAD penalty has both oracle properties as well as asymptotic normality by
selecting the appropriate regularization parameters $\lambda$ \citep{fan2001variable,fan2004nonconcave}.  
In the following, we will show the DPIE derived under the penalized least squares loss function with double SCAD penalties also has the oracle property and asymptotic normality under the working model $\bar\mu_{A,S}(X;\beta,\delta)$. 

Following the framework in \citet{fan2001variable,fan2004nonconcave}, we rewrite the working model as
\begin{align*}
&\bar\mu_{A,S}(X;\beta,\delta)\\=&\beta_{\rm{int}}+\beta_{A}A+\beta_{X}^{{\T}}p_{\mu}(X)+(1-S)\delta^{{\T}}p_{b}(X)\\=&p^{\T}\theta,
\end{align*}
where $\theta^{{\T}}=(\beta^{{\T}},\delta^{{\T}})$ and $p=\{ 1,A,p_{\mu}^{{\T}}(X),(1-S)p_{b}^{{\T}}(X)\} ^{{\T}}$ with dimension $K=K_{1}+K_{2}$. 
%where $K_{1}$ and $K_{2}$ increase with sample size $N$ such that 
In REs, we have $p^{{\T}}\theta=p_{\mu}^{{\T}}\beta$, and in ECs, we have $p^{{\T}}\theta=p_{\mu}^{{\T}}\beta+p_{b}^{{\T}}\delta.$ 
Denote $g$ as the unknown true density function of $(Y,p)$ and $f$ as the working density function such that minimizing the least squares loss is equivalent to maximizing the quasi-log-likelihood function; the Gaussian distribution is one such example. Denote $P_{\lambda}\left(\theta\right)$
as the SCAD penalty function with the first-order derivative 
\[
P_{\lambda}^{'}(\theta)=\lambda\left\{ \bone\left(\theta\leq\lambda\right)+\frac{(a\lambda-\theta)_{+}}{\left(a-1\right)\lambda}\bone\left(\theta>\lambda\right)\right\} 
\]
for some $a>2$ and $\theta>0$. 
%The goal of the theoretical analysis part is to guarantee the consistency
%and the oracle property of the parameters $\beta_{*},\delta_{*}$,
%further, we also show using the combined data is at least as efficient
%as using only the REs. 
Then one can rewrite the penalized least squares estimator as 
the penalized quasi-likelihood estimator 
\begin{equation*}
\begin{split}
\left(\hat{\beta},\hat{\delta}\right)&=\underset{\beta,\delta}{\rm argmax} \;Q(\beta,\delta)\\
&=\underset{\beta,\delta}{\rm argmax} 
\Biggl[
\sum_{i=1}^{N}\left\{ \ln f\left(Y_{i},p_{i},\beta,\delta\right)\right\}\\
&-N\sum_{j=1}^{K_{1}}P_{\lambda_{1,j}}\left(\vert\beta_{j}\vert\right)-N\sum_{j=1}^{K_{2}}P_{\lambda_{2,j}}\left(\vert\delta_{j}\vert\right)
\Biggr].
\end{split}
\end{equation*}

Let 
\begin{align*}
\alpha_{N} & =\max_{1\leq j_{1}\leq K_{1},1\leq j_{2}\leq K_{2}}\left\{ 
\begin{aligned}
&P_{\lambda_{1}}^{'}\left(\vert\beta_{*,j_{1}}\vert\right),P_{\lambda_{2}}^{'}\left(\vert\delta_{*,j_{2}}\vert\right),\\&\beta_{*,j_{1}}\neq0,\delta_{*,j_{2}}\neq0
\end{aligned}
\right\} ,\\
b_{N} & =\max_{1\leq j_{1}\leq K_{1},1\leq j_{2}\leq K_{2}}\left\{ 
\begin{aligned}
& P_{\lambda_{1}}^{''}\left(\vert\beta_{*,j_{1}}\vert\right),
P_{\lambda_{2}}^{''}\left(\vert\delta_{*,j_{2}}\vert\right),\\&\beta_{*,j_{1}}\neq0,\delta_{*,j_{2}}\neq0
\end{aligned}
\right\} ,
\end{align*}
where $P_{\lambda}^{''}\left(\theta\right)$ is the second-order derivative
of $P_{\lambda}(\theta).$ We present the regularity conditions on
the penalty functions given by \citet{fan2004nonconcave}:

\begin{assumption}\label{assumption3}Let the values of $\beta_{*,1},\ldots,\beta_{*,s_{1}}$
be nonzero and $\beta_{*,s_{1}+1},\ldots,\beta_{*,K_{1}}$ be zero.
Similarly, let the values of $\delta_{*,1},\ldots,\delta_{*,s_{2}}$
be nonzero and $\delta_{*,s_{2}+1},\ldots,\delta_{*,K_{2}}$ be zero.
Then $\beta_{*},\delta_{*}$ satisfy: 
\begin{align*}
\min_{1\leq j\leq s_{1}}\vert\beta_{*,j}\vert/\lambda_{1}\rightarrow\infty,\  & \min_{1\leq j\leq s_{2}}\vert\delta_{*,j}\vert/\lambda_{2}\rightarrow\infty,\\
\max_{s_{1}+1\leq j\leq K_{1}}\vert\beta_{*,j}\vert/\lambda_{1}\rightarrow0,\  & \max_{s_{2}+1\leq j\leq K_{2}}\vert\delta_{*,j}\vert/\lambda_{2}\rightarrow0,
\end{align*}

as $N\rightarrow\infty.$

\end{assumption}

\citet{fan2004nonconcave} showed under Assumption \ref{assumption3},
the SCAD penalties have $\alpha_{N}=0$ and $b_{N}=0$ when $N$ is large enough, where the former 
% guarantees the unbiasedness property in the
% asymptotic normality property and 
ensures the existence of a root-$(N/K)$-consistent
penalized likelihood estimator, and the latter ensures that the penalty function does not have much influence on the penalized likelihood
function, so that the penalized estimator has the same efficiency as
the maximum likelihood estimator.  
% Assumption S10 ensures that, upon selecting an appropriate working density function $f$, the OLS estimator will converge to the value which minimizes the $KLIC$ distance between the working density function $f$ and the true density function $g$.
%Regularity conditions S1--S8 on
%likelihood function and density functions are in the Supplementary Material. 

Denote $\|v\|_{p}$
as the $\mathcal{L}_{p}$-norm of a vector $v$. Based on these assumptions,
we can provide the consistency and the asymptotic normality of the
estimated parameters.
\begin{theorem}\label{theorem1}Suppose that the working density function $f\left(Y,p,\beta,\delta\right)$ and the true density function $g(p,Y)$
satisfy Assumptions S1--S10 in the Supplementary Material, and the
SCAD penalty functions $P_{\lambda_{1}}\left(\cdot\right),P_{\lambda_{2}}\left(\cdot\right)$
satisfy Assumption \ref{assumption3}. If $K^{4}/N\rightarrow0$ as
$N\rightarrow\infty$, then there is a local maximizer $\left(\hat{\beta},\hat{\delta}\right)$
of $Q(\beta,\delta)$ such that $\|\hat{\beta}-\beta_{*}\|_{2}=O_{p}\left\{ \left(K/N\right)^{1/2}\right\} ,\|\hat{\delta}-\delta_{*}\|_{2}=O_{p}\left\{ \left(K/N\right)^{1/2}\right\} .$

\end{theorem}

Denote $\beta_*=(\beta_{*1}^{\T},\beta_{*2}^{\T})^{\T}$, where $\beta_{*1}\neq0$ with dimension $s_{1}$ and $\beta_{*2}=0$.
Similarly, denote  $\delta_{*}=(\delta_{*1}^{\T}, \delta_{*2}^{\T})^{\T}$, $\theta_{*}=(\theta_{*1}^{\T}, \theta_{*2}^{\T})^{\T}$
where $\delta_{*1}\neq0$ with dimension $s_{2}$ and $\delta_{*2}=0,$
and $\theta_{*1}\neq0$ with dimension $s=s_{1}+s_{2}$  and $\theta_{*2}=0$.
% Denote 
% \begin{align*}
% &A\left(\theta\right)=-\E\left[\frac{\partial^{2}\log f\left(Y_{1},p_{1},\theta\right)}{\partial\theta_{j}\partial\theta_{k}}\right],\\
% &B(\theta)=\E\left[\left\{ \frac{\partial\log f\left(Y_{1},p_{1},\theta\right)}{\partial\theta}\right\} \left\{ \frac{\partial\log f\left(Y_{1},p_{1},\theta\right)}{\partial\theta}\right\} ^{{\T}}\right],
% \end{align*}
% and also denote $I(\theta_{0})$ be the Fisher information matrix,
% and let $I(\theta_{01})=I(\theta_{01},0)$ be the Fisher information
% matrix knowing $\theta_{02}=0$. Note, if the density function $f$
% is correctly specified, then $A(\theta_{0})=B(\theta_{0})=I(\theta_{0}).$
Then we have the following theorem. 
\begin{theorem}\label{theorem2}
Under Assumption \ref{assumption3} and Assumptions S1--S10 in the
Supplementary Material, if $\lambda_{1},\lambda_{2}\rightarrow0$,
$\sqrt{N/K}\lambda_{1}\rightarrow\infty,\sqrt{N/K}\lambda_{2}\rightarrow\infty$
and $K^{5}/N\rightarrow0$ as $N\rightarrow\infty$, then with probability
tending to 1, $\hat{\beta},\hat{\delta}$ in Theorem \ref{theorem1}
must satisfy 
\begin{enumerate}
\item (Sparsity) $\hat{\beta}_{2}=0,\hat{\delta}_{2}=0$. 
\item (Asymptotic normality) 
\begin{align*}
&\sqrt{N}WA^{1/2}\left(\theta_{*1}\right)\left(\hat{\theta}_{1}-\theta_{*1}\right)\\
&\rightarrow\mathcal{N}\left(0,WA^{-1/2}(\theta_{*1})B(\theta_{*1})A^{-1/2}(\theta_{*1})W^{{\T}}\right)
\end{align*}
in distribution, where $W$ is a $q\times s$ matrix such that $WW^{{\T}}\rightarrow G$,
and $G$ is a $q\times q$ nonnegative symmetric matrix. For simplicity, the specific forms of $W$, $A(\theta)$ and 
 $B(\theta)$ are deferred to the Supplementary Material.
\end{enumerate}
\end{theorem}
Theorems~\ref{theorem1}--\ref{theorem2} demonstrate
that under proper selection of tuning parameters, the estimator $\hat{\theta}$ is consistent for $\theta_{*}$ and asymptotically normal. % If the model
% is specified correctly, i.e., $g(Y,p_{i})=f(Y,p_{i},\theta)$ for
% some $\theta\in\Theta$, then $\theta_{0}=\theta_{*}$, and 
% \[\sqrt{N}WI^{1/2}\left(\theta_{01}\right)\left(\hat{\theta}_{1}-\theta_{01}\right)\rightarrow\mathcal{N}\left(0,WW^{{\T}}\right)
% \]
% in distribution. 

On the other hand, using a single $\lambda$ may fail to achieve such desirable results in Theorem \ref{theorem2}. To emphasize
the importance of adding different penalties, we provide a simple analytical calculation:
let $\beta_{1*}=O_{p}(N^{-1/2}),\beta_{2*}=O_{p}(N^{-1}),\delta_{1*}=O_{p}\left(N^{-1/10}\right),\delta_{2*}=O_{p}\left(N^{-1/3}\right).$
Assuming $\lambda_{1}=N^{\epsilon}$ and $\lambda_{2}=N^{\gamma}$
such that $\beta_{1*},\beta_{2*},\delta_{1*}$ and $\delta_{2*}$
all satisfy Assumption \ref{assumption3}: 
\begin{align*}
\frac{N^{-1/2}}{N^{\epsilon}}\rightarrow\infty,\  & \frac{N^{-1}}{N^{\epsilon}}\rightarrow0,\\
\frac{N^{-1/10}}{N^{\gamma}}\rightarrow\infty,\  & \frac{N^{-1/3}}{N^{\gamma}}\rightarrow0.
\end{align*}
Hence, we have $-1<\epsilon<-1/2$ and $-1/3<\gamma<-1/10$, which
means we cannot find a common $\lambda$ for $\beta_{*}$ and $\delta_{*}$.
If the magnitudes of $\beta_*$ and $\delta_*$ differ substantially, a single penalty cannot satisfy the requirements for consistency and the oracle property. 
A toy numerical experiment in the Supplementary Material demonstrates that utilizing different penalties for $\beta_{*}$ and $\delta_{*}$ yields superior performance compared to using a single penalty, particularly when $\beta_{*}$ and $\delta_{*}$ have different magnitudes.
We also illustrate this in the Simulation. 

It should be noted that Theorems~\ref{theorem1}--\ref{theorem2} can be extended to incorporate different working density functions $f$, not just the current form corresponding to the least squares loss. This means we can also adopt other losses beyond the least squares loss. If $f$ takes other forms, under certain regularity conditions on $f$ and $g$, the estimator $\hat\theta$ that maximizes the penalized quasi-log-likelihood function $Q(\theta)$ converges to $\tilde\theta_*$, where $\tilde\theta_*$ minimizes the Kullback--Leibler Information Criterion (KLIC) between $f$ and $g$, $\mathrm{KLIC}(g:f,\theta)=\mathbb{E}\left[\log\left\{ g(Y,p)/f(Y,p,\theta)\right\} \right]$. Detailed discussions are included in the Supplementary Material. However, the identification strategy for $\tau$ from $f(Y,p,\tilde\theta_*)$ may change, depending on the specific form of $f(Y,p,\tilde\theta_*)$, similar to \citet{wang2021model}. 


% Theorem \ref{theorem2}  establishes asymptotic normality for a general estimate $\hat\beta$ that converges to $\beta_*$, where $\beta_*$ in conjunction with $\delta_*$ minimizes the $KLIC$ distance between working density $f$ and the true density $g$. In the ensuing Theorem, our focus is on the OLS estimate $\hat\beta$, demonstrating that under the OLS corresponding working density $f$, $\beta_{A*}$ equals $\tau$.

\subsection{Comparison between the DPIE and the RE-only estimator}
We now show the advantage of the DPIE based on the combined data compared with the estimator based only on the REs. For ease of comparison, we assume homoscedasticity of the residual error $\epsilon$ in the equation $Y=\bar\mu_{A,S}(X)+\epsilon$, where $\V(\epsilon)=\sigma^{2}$.

\begin{theorem}\label{theorem 4}
Under Assumptions \ref{iden}--\ref{assumption3} and Assumptions S1--S10 in
the Supplementary Material, if $\lambda_{1},\lambda_{2}\rightarrow0$,
$\sqrt{N/K}\lambda_{1}\rightarrow\infty,\sqrt{N/K}\lambda_{2}\rightarrow\infty$
and $K^{5}/N\rightarrow0$ as $N\rightarrow\infty$, then for the least squares estimates $\hat\beta$ and $\hat\delta$, 
%     \item
% $\delta_{*}=\delta_{0},\beta_{*}=\left(\beta_{0*},\beta_{A*},\beta_{X*}^{{\T}}\right)^{{\T}}$ satisfies $\beta_{A*}=\tau.$
\begin{enumerate}
    \item in combined data, 
$\hat{\tau}-\tau\rightarrow\mathcal{N}\left\{ 0,\V(\hat{\tau})\right\} $ in distribution;
\item in the RE data, 
$\hat{\tau}_{{\rm RE}}-\tau\rightarrow\mathcal{N}\left\{ 0,\V(\hat{\tau}_{{\rm RE}})\right\} $
 in distribution; and
 \item $\V(\hat{\tau}_{{\rm RE}})\geq\V(\hat{\tau})$,
and the equality holds if and only if $p_{\mu}(X)=Mp_{b}(X)$ for some matrix
$M$.
\end{enumerate}
% the combined data has
% $\hat{\tau}-\tau\rightarrow\mathcal{N}\left\{ 0,\V(\hat{\tau})\right\} $ in distribution, and the RE data has 
% $\hat{\tau}_{{\rm RE}}-\tau\rightarrow\mathcal{N}\left\{ 0,\V(\hat{\tau}_{{\rm RE}})\right\} $
%  in distribution, where $\V(\hat{\tau}_{{\rm RE}})\geq\V(\hat{\tau})$
% and the inequality holds iff $p_{\mu}=Mp_{b}(X)$ for some matrix
% $M$.

\end{theorem}
Theorem \ref{theorem 4} shows that the estimate of $\tau$ will remain
accurate, regardless of the validity of the ANCOVA working model.
When incorporating ECs, the estimator is no less efficient than the estimator obtained from the REs alone. 
%Additionally, adding double penalties in SCAD ensures the accurate estimation of $\delta_{*}$, which in turn guarantees the accuracy of the bias function. As demonstrated in Theorem 1, having an accurate bias function is essential for achieving consistency in the ATE estimation, even when the ANCOVA working model is misspecified. This highlights the significance of using different penalties for the parameters $\beta_{*}$ and $\delta_{*}$. 


\section{SIMULATION}\label{sec:Simulation}

In this section, first, we illustrate the importance of adding different penalties for different data sources, then we compare the proposed estimator of $\tau$ with existing competitors combining the REs and the ECs.

\subsection{Simulation Study 1}

We generate two data sources with sample sizes $n=m=1000$. % and total sample size  $N=2000$. 
Covariates $X\in\mathbb{R}^{50}$ are generated by $X_{d}\sim{\rm Uniform}\left[1-\sqrt{3},1+\sqrt{3}\right],d=1,\ldots,50$, and 
outcome is generated by $Y=X^{{\T}}\beta_{0}+\left(1-S\right)X^{{\T}}\delta_{0}+\epsilon,$
where $\epsilon\sim\mathcal{N}\left(0,1\right)$. We run $T=100$
Monte Carlo replications, and specify the true $\beta_{0}=(1,\ldots,50)^{{\T}}/50.$

To examine the instances where various penalties are required and validate the argument presented in Section \ref{sec:Method}, we specify three cases for $\delta_{0}$: 
\begin{enumerate}
\item $\|\delta_{0}\|_{1}\geq \|\beta_{0}\|_{1}$ and half of the parameters in $\delta_{0}$ equal to zero: $c\|\beta_{0}\|_{1}=\|\delta_{0}\|_{1}$ and $c=1,3,5,7,9.$ 
\item $\|\delta_{0}\|_{1}<\|\beta_{0}\|_{1}$ and half of the parameters
in $\delta_{0}$ equal to zero: $c\|\beta_{0}\|_{1}=\|\delta_{0}\|_{1}$ and $c=0.1,0.3,0.5,0.7,0.9.$ 
\item Vary the sparsity level of $\delta_{0}$ while ensuring that its magnitude satisfies $\|\delta_{0}\|_{1}=\|\beta_{0}\|_{1}$:
%the number of variables in $\delta_0$ equal to zero one by one. 
the number of entries of $\delta_0$ that equal zero gradually changes, increasing from $2$ to $50$ with a step size of three.
\end{enumerate}
In each case, we compare the following three estimators:
\begin{enumerate}
\item The estimator based on the combined data with the double
SCAD penalty (denoted as
``DPIE''). 
\item The estimator based on the combined data with a single
SCAD penalty (denoted as ``SPIE''). 
\item The estimator based only on the RE data (denoted as ``RE''). 
\end{enumerate}
\begin{figure*}
\centering{}\includegraphics[scale=0.25]{\string"large\string".png}
\caption{\label{large}Simulation results based on 100 Monte Carlo replications. The
left panel shows the MSE versus the magnitude ratio between $\delta_{0}$
and $\beta_{0}$. The right panel shows the percentages of incorrectly
selecting more and fewer parameters, respectively.}
\end{figure*}
All results are based on re-fitting the models with the parameters chosen
by each method. We compare the methods based on the mean squared
error $\mathrm{MSE}=\sqrt{d^{-1}\sum_{i=1}^{d}\left(\hat{\beta}_{i}-\beta_{0,i}\right)^{2}}$
and the percentages of incorrectly selecting more (denoted as ``Over-select'') and fewer (denoted as ``Under-select'') parameters. Figure~\ref{large} shows the MSE results and the percentages of Under-select and Over-select in Case a).
When the magnitudes of two parameters differ, using different penalties
improves accuracy when compared to using the same penalties for all
parameters. Moreover, the gained accuracy improves as the magnitude
difference increases. The right panel of Figure \ref{large} shows
the percentage of incorrectly selecting more or fewer variables. When $\|\delta_{0}\|_{1}>\|\beta_{0}\|_{1}$,
using the same penalties makes it difficult to select $\beta_{0}$, resulting in a large MSE. These findings are consistent
with the theoretical results in Section \ref{sec:Method}. 
Case b) for $\|\delta_{0}\|_{1}<\|\beta_{0}\|_{1}$ shows a similar phenomenon, and thus the results are deferred to the Supplementary
Material.

In contrast, in Case c), where we vary the sparsity level of $\delta_{0}$ while keeping the magnitude the same ($\|\delta_{0}\|_1=\|\beta_{0}\|_1$), the DPIE and SPIE methods demonstrate similar performance. Refer to the figure in the Supplementary Material for a visual representation. This finding also aligns with the theoretical result in
Section \ref{sec:Method}, where we only need to restrict the magnitude
of different parameters to guarantee consistency and oracle properties.



\subsection{Simulation Study 2}

We now compare the proposed estimator of $\tau$ with existing competitors combining the REs and the ECs. 
We generate REs and ECs with sample sizes $n=m=1000$. 
%We set up $T=500$ simulation times. Let the total sample size $N=2000,$
%where the RE sample size $n=1000$, and the EC sample size $m=1000$.
Covariates $X\in\mathbb{R}^{2}$ are generated by $X_{d}\sim{\rm Uniform}\left[-1.5,1.5\right],d=1,2$.
The treatments $A$ in the REs are generated by ${\rm Bernoulli}(0.5)$.
We consider two settings for generating outcomes: 
\begin{enumerate}
\item[S1]  $Y=-1.5X_{1}^{2}-1.5X_{2}+2A+(1-S)(10X_{1}^{2}+4X_{2}^{3})+\epsilon$,
where $\epsilon\sim\mathcal{N}(0,1)$; 
\item[S2]  $Y=-1.5X_{1}^{2}-1.5e^{X_{2}}+2A+(1-S)(10X_{1}^{2}+4X_{2}^{3})+\epsilon$,
where $\epsilon\sim\mathcal{N}(0,1)$. 
%
\end{enumerate}
In each case, we approximate the $\bar{\mu}_{1,1}(X;\beta),\ \bar{\mu}_{0,1}(X;\beta)$
and $b_{0}(X)$ using the power series basis functions with the
power up to three. We use the double  SCAD penalty method to select
important features of $\bar{\mu}_{1,1}(X;\beta),\ \bar{\mu}_{0,1}(X;\beta)$ and
$b_{0}(X)$, where, in Setting S1, the working models are correct,
while in Setting S2, the working models are misspecified for $\bar{\mu}_{1,1}(X;\beta),\ \bar{\mu}_{0,1}(X;\beta)$.
After selecting parameters, we estimate the variance
using the linear regression estimated variance of $\hat{\tau}$. We
compare our method with the power prior Bayesian method \citep{lin2019propensity} and the Matching procedure \citep{stuart2008matching}.

% \begin{table}[h!]\centering{} \caption{\label{table1}The absolute bias, estimated variance, the true variance,
% MSE and the 95\% Wald confidence intervals of the $\hat{\tau}$ ,
% compared with the Bayesian methods (denoted as $\vert\hat{\tau}-\tau\vert_{B}$) in two settings (denoted as S1 , S2) .}
%  \setlength{\tabcolsep}{0.4mm}{
% \begin{tabular}{cccccccc}
% \hline 
%  & $\times10^{-3}$ & $\vert\hat{\tau}-\tau\vert$ & $\vert\hat{\tau}-\tau\vert_{B}$  & $v$ & $\hat{v}$ & MSE & CI\tabularnewline
% \hline 
% EC+RE & S1 & 2.43 & 110.7 & 3.33 & 3.27 & 3.34 & 94.6\%\tabularnewline
% \hline 
% RE &  & 4.05 &  & 3.94 & 4.00 & 3.96 & 95.6\%\tabularnewline
% \hline 
% EC+RE & S2 & 4.42 & 111.5 & 3.33 & 3.30 & 3.35 & 95\%\tabularnewline
% \hline 
% RE &  & 4.67 &  & 3.97 & 4.01 & 3.99 & 95.8\%\tabularnewline
% \hline 
% \end{tabular}
% }
% \end{table}

% \begin{table}[h!]
% \centering{} \caption{\label{table11}The absolute bias, estimated variance, the true variance,
% MSE and the 95\% Wald confidence intervals of the $\hat{\tau}$ ,
% compared with the Bayesian methods (denoted as $\vert\hat{\tau}-\tau\vert_{B}$) and the Matching method (denoted as $\vert\hat{\tau}-\tau\vert_{M}$ and $v_M$) in two settings (denoted as S1, S2).}
%  \setlength{\tabcolsep}{0.3mm}{
% \begin{tabular}{cccccccccc}
% \hline 
%  & $\times10^{-3}$ & $\vert\hat{\tau}-\tau\vert$ & $\vert\hat{\tau}-\tau\vert_{B}$ & $\vert\hat{\tau}-\tau\vert_{M}$ &$v_M$ &$v$ & $\hat{v}$ & MSE & CI\tabularnewline
% \hline 
% EC+RE & S1 & 2.43 & 110.7 &646& 457& 3.33 & 3.27 & 3.34 & 94.6\%\tabularnewline
% \hline 
% RE &  & 4.05 & && & 3.94 & 4.00 & 3.96 & 95.6\%\tabularnewline
% \hline 
% EC+RE & S2 & 4.42 & 111.5 &645& 457& 3.33 & 3.30 & 3.35 & 95\%\tabularnewline
% \hline 
% RE &  & 4.67 & && & 3.97 & 4.01 & 3.99 & 95.8\%\tabularnewline
% \hline 
% \end{tabular}}
% \end{table}

\begin{table}[ht]
\centering{} \caption{\label{table1}The absolute bias, the true variance, the estimated variance,
the MSE, and the coverage rate of the 95\% Wald confidence intervals (CI) of $\hat{\tau}$,
compared with the Bayesian method (denoted as $\vert\hat{\tau}-\tau\vert_{B}$ and $v_B$) and the Matching method (denoted as $\vert\hat{\tau}-\tau\vert_{M}$ and $v_M$) in two settings (denoted as S1, S2).}
\begin{tabular}{ccccc}
\hline 
$\times10^{-3}$ & \multicolumn{2}{c}{S1} & \multicolumn{2}{c}{S2}\tabularnewline
\hline 
 & EC+RE & RE & EC+RE & RE\tabularnewline
\hline 
$\vert\hat{\tau}-\tau\vert$ & 2.43 & 4.05 & 4.42 & 4.67\tabularnewline
\hline 
$\vert\hat{\tau}-\tau\vert_{B}$ & 110.7 &  & 111.5 & \tabularnewline
\hline 
$\vert\hat{\tau}-\tau\vert_{M}$ & 646 &  & 645 & \tabularnewline
\hline 
$v_{B}$ & 326 &  & 328 & \tabularnewline
\hline 
$v_{M}$ & 457 &  & 457 & \tabularnewline
\hline 
$v$ & 3.33 & 3.94 & 3.33 & 3.97\tabularnewline
\hline 
$\hat{v}$ & 3.27 & 4.00 & 3.30 & 4.01\tabularnewline
\hline 
MSE & 3.34 & 3.96 & 3.35 & 3.99\tabularnewline
\hline 
CI & 94.6\% & 95.6\% & 95\% & 95.8\%\tabularnewline
\hline 
\end{tabular}
\end{table}

Table \ref{table1} shows the absolute bias of the estimated ATE $\hat{\tau}$,
true variances $v$, estimated variances $\hat{v}$, MSE, and 95\%
Wald confidence intervals. In both settings, combining EC and RE data
improves accuracy and efficiency. The Bayesian method uses
the estimated probability of trial inclusion $\mathbb{P}(S=1\mid X)$
to adjust the EC, which borrows less information from ECs than correctly
estimating the bias function, resulting in worse results. The matching procedure in \citet{stuart2008matching} measures the difference between the ECs and CCs in two stages: first, the ECs are matched to the CCs, balancing covariates between the ECs and CCs in this process; then, the bias value, $\delta$, between the ECs and CCs is determined using the matched groups of CCs and ECs. This $\delta$ is constant for all 
$X$ and may not be accurate. In contrast, our method uses a bias function, $b_0(X)$, which adapts to different $X$ values and accounts for all differences, irrespective of whether they arise from covariates or the outcome. Consequently, our approach is more effective than the two-stage method proposed by \citet{stuart2008matching}. As shown in Table~\ref{table1}, the approach of \citet{stuart2008matching} has a larger bias compared to our method and the Bayesian method due to the less accurate bias term, $\delta$.

\section{REAL DATA ANALYSIS}\label{sec:Real-Data-Analysis}

We apply the proposed DPIE estimator as well as other
methods in Section \ref{sec:Simulation} to the data from the National
Supported Work (NSW) study. This study aims at evaluating the effect of a job training program on future earnings, containing an experimental sample from a randomized evaluation of the NSW program, and a nonexperimental
sample from the Current Population Survey (CPS) program. $15992$
external control units are included in the original CPS dataset,
whereas 260 random control units are included in the NSW dataset.
We use the Matching procedure \citep{abadie2006large} to match each
random control unit with $2$ external control units without replacement,
therefore, we use $520$ external control units (ECs) and $260$
random control units (CCs) as the control group, and $260$ random
treatment units as the treatment group.

This analysis includes the eight original covariates from the NSW
and CPS datasets (age, education, Black, Hispanic, married, having
no college degree (denoted as ``nodeg''), real earnings in 1974
(denoted as ``re74''), and real earnings in 1975 (denoted as ``re75''))
as well as their 2-way interactions. The outcome of interest is the
real earnings in 1978 (denoted as ``re78''). For a better regression,
we divide all the real earnings (re74, re75, re78) by 1000, scale
all covariates between 0 and 1, and omit variables with the same value
across observations. There are 86 covariates in total, with 43 covariates
in the bias function and 43 covariates in the outcome mean function.
Accordingly, we use the mean of real earnings in 1978 in the REs as
the true value, $1.794$.

Table~\ref{tab:real data} shows the estimated control mean $\hat{\tau}$
(reported as ``Est'') in the random group, i.e., $S=1$, along with
its standard error (reported as ``se'') and 95\% Wald confidence
intervals using the proposed DPIE estimator, the SPIE estimator and
the SCAD estimator based only on the RE data. The numbers of variables
selected in the outcome mean model (reported as ``\#var\_$\bar{\mu}_{A,S}$'') and
in the bias function (reported as ``\#var\_$b_{0}$'') are also reported.
Even though the bias function is as complex as the outcome mean model, the
DPIE improves efficiency by increasing the sample size, resulting
in the standard error of DPIE being smaller than that of RE in this
real data scenario. On the other hand, the SPIE estimator has a larger
bias than the DPIE, because the magnitude of the outcome mean function
is much smaller than that of the bias function, leading to a biased
estimate compared with the DPIE, which is consistent with our simulation
results shown in Figure \ref{large}. Based on the DPIE estimator,
the estimated average treatment effect is $1.704$.

\begin{table}[ht]
\centering{}\caption{\label{tab:real data}The first panel shows estimated $\hat{\tau}$
and corresponding standard error, bias, 95\% Wald confidence interval
and the number of selected variables in the outcome mean model. The
second panel shows estimated variables in the outcome mean model $\mu_{0,1}(X)$
and the bias function $b_{0}(X)$ based on the DPIE estimator.}
\setlength{\tabcolsep}{0.5mm}{ %
\begin{tabular}{cccccc}
\hline 
 & Est  & se  & bias & \#var\_$\bar{\mu}_{A,S}$ & \#var\_$b_{0}$\tabularnewline
\hline 
DPIE  & 1.857 (0.746 , 2.969)  & 0.567 & 0.063 & 4 & 4\tabularnewline
SPIE  & 1.626 (0.582 , 2.671)  & 0.533  & 0.168 & 5 & 1\tabularnewline
RE  & 1.698 (0.455, 2.941)  & 0.634  & 0.097 & 4 & /\tabularnewline
\hline 
\end{tabular}} 
\end{table}


\section{DISCUSSION}\label{sec:Discussion}

We introduce a bias function to measure the discrepancy between the ECs and the working model in the REs and use sieve estimation and feature selection techniques to handle the high-dimensional nature of the basis functions and to prevent irrelevant covariates from being included in the outcome mean model. We propose a double penalty integration estimator (DPIE) that takes advantage of the different levels of smoothness of the outcome mean and bias functions. Our results demonstrate that the DPIE is consistent, has the oracle property, and is asymptotically normal when the penalty parameters are selected appropriately. Moreover, our estimator is robust to model misspecification and is at least as efficient as the estimator based on the REs alone.

We provide a general framework with a broad class of choices for
combining multiple datasets and employing flexible penalized regression
procedures. Combining several treatments for a more accurate estimation
of the value functions in policy evaluation and individual treatment
regimes is a direct extension of our method. In addition, our outcome
$Y$ can be extended to multiple types, including survival \citep{https://doi.org/10.48550/arxiv.2201.06595}
and zero-inflation outcomes \citep{yu2021multiplicative}. In lieu
of better estimating the outcome mean function to enhance the ATE
estimate, one may directly combine the bias function and the heterogeneous
treatment effects (HTEs; \citet{yang2022elastic}), which are the causal effects of a treatment
given the characteristics of the subjects, to obtain a more accurate
estimate of the HTEs. Evaluating the HTEs is the primary question
in many domains, including precision medicine and tailored policy
recommendations \citep{https://doi.org/10.48550/arxiv.2011.08047, chu2022targeted}.
Finally, we exclusively consider the SCAD penalty in our theoretical
study. The SCAD penalty addresses consistency, the oracle property, and asymptotic normality of some local minimizer of the penalized loss. However, it does not ensure the uniqueness of the solution or provide methods for identifying the specific local minimizer with the desired properties among a large pool of potential local minimizers \citep{zhang2010nearly}. This gap between theory and practice presents an interesting avenue for future research. To address this concern, we propose several potential approaches: \citet{fan2014strong} introduced a general procedure based on the local linear approximation (LLA) algorithm and derived a lower bound on the probability that a specific local solution exactly matches the oracle estimator, which could be applicable in real-world scenarios; \citet{kim2012global} provided conditions for determining the uniqueness of a local minimizer. Additionally, we recommend varying the initial values in R's \texttt{ncvfit} function and selecting the estimate that minimizes the error. Alternatively, using the unpenalized coefficient estimates as initial values can be considered.
Moreover, 
%the SCAD penalty encounters the computational difficulty of the non-convex optimization problem \citep{hesterberg2008least}.
a general theoretical framework for multiple penalties, such as the adaptive Lasso \citep{zou2006adaptive} and minimax concave penalty \citep{zhang2010nearly} of double penalty selection, would therefore be desirable.





% References

\bibliography{cheng_477}
\end{document}
