\documentclass[accepted]{uai2023} % for initial submission
% \documentclass[accepted]{uai2023} % after acceptance, for a revised
                                    % version; also before submission to
                                    % see how the non-anonymous paper
                                    % would look like

%% There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2023} % ptmx math instead of Computer
% Modern (has noticable issues)
% \documentclass[mathfont=newtx]{uai2023} % newtx fonts (improves upon
 % ptmx; less tested, no support)
% NOTE: Only keep *one* line above as appropriate, as it will be replaced
%       automatically for papers to be published. Do not make any other
%       change above this note for an accepted version.

%% Choose your variant of English; be consistent
\usepackage[american]{babel}
% \usepackage[british]{babel}
\usepackage{comment}
%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} % amsmath with fixes and additions
% \usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{tikz} % nice language for creating drawings and diagrams
\usepackage{amsfonts}
\usepackage{comment}
% for cross referencing the main text
% PLEASE ONLY USE xr IN THE SUPPLEMENTARY MATERIAL. 
% In the main paper, hard code any cross-reference to the supplementary material. 
\usepackage{xr} 
\usepackage{array}
\usepackage{multirow}
\usepackage{hyperref}

%\externaldocument[main]{cheng_447}

%% Provided macros
% \smaller: Because the class footnote size is essentially LaTeX's \small,
%           redefining \footnotesize, we provide the original \footnotesize
%           using this macro.
%           (Use only sparingly, e.g., in drawings, as it is quite small.)

%% Self-defined macros
%\newcommand{\swap}[3][-]{#3#1#2} % just an example

%\title{Supplementary Material}
\title{Enhancing Treatment Effect Estimation: A Model Robust Approach Integrating Randomized Experiments and External Controls using the Double Penalty Integration Estimator (Supplementary Material)}

% The standard author block has changed for UAI 2023 to provide
% more space for long author lists and allow for complex affiliations
%
% All author information is authomatically removed by the class for the
% anonymous submission version of your paper, so you can already add your
% information below.
%
% Add authors
\author[1]{\href{mailto:<ycheng26@ncsu.edu>?Subject=Your UAI 2023 paper}{Yuwen Cheng}{}}
\author[2]{\href{mailto:<liliwu@microsoft.com>?Subject=Your UAI 2023 paper}{Lili Wu}{}}
\author[3]{\href{mailto:<syang24@ncsu.edu>?Subject=Your UAI 2023 paper}{Shu Yang}{}}

% Add affiliations after the authors
\affil[1]{%
    Statistics Dept. North Carolina State University
}
\affil[2]{%
    Microsoft Research NYC
}
\affil[3]{%
    Statistics Dept. North Carolina State University
  }
  

  









  \begin{document}
  
\onecolumn %% Turn this off if single column is desired for the supplement
\maketitle
%\hbadness=10000 \tolerance=10000 \hyphenation{en-vi-ron-ment
%in-ven-tory e-num-er-ate char-ac-ter-is-tic}
    %\renewcommand{\theequation}{\thesubsection.\arabic{equation}}
    %Wayne wants simple eq numbers so they don't have to be redone
    %use renewcommand for appendix equation numbering (B.1) etc.





%\usepackage{hangpar}
\newcommand{\lbl}[1]{\label{#1}{\ensuremath{^{\fbox{\tiny\upshape#1}}}}}
% remove % from next line for final copy
\renewcommand{\lbl}[1]{\label{#1}}
\newtheorem{lemma}{Lemma}\newtheorem{theorem}{Theorem}\newtheorem{assumption}{Assumption}\newtheorem{remark}{Remark}\newtheorem{corollary}{Corollary}\newtheorem{example}{Example}\newtheorem{definition}{Definition}[section]
\newtheorem{proof}{Proof}\newtheorem{condition}{Condition}\newtheorem{step}{Step}\newcommand{\bx}{\mathbf{x}}
\newcommand{\bz}{\mathbf{z}}
\newcommand{\bR}{\mathbf{R}}
\newcommand{\bw}{\mathbf{w}}
\newcommand{\mhat}{\hat{\mu}}                     %mu hat
\newcommand{\bmhat}{\mbox{\boldmath$\hat{\mu}$}}  %bold mu hat
\newcommand{\bs}{\mbox{\boldmath$\sigma$}}        %bold sigma
\newcommand{\bS}{\mbox{\boldmath$\Sigma$}}        %bold Sigma
%\newcommand{\vtheta}{\hat{V}}
\newcommand{\ch}{{\mathcal{F}}}
\newcommand{\be}{\begin{equation}}
\newcommand{\en}{\end{equation}}
\newcommand{\bea}{\begin{eqnarray}}
\newcommand{\ena}{\end{eqnarray}}
\newcommand{\ba}{\begin{array}}
\newcommand{\percent}{\%}
\newcommand{\ea}{\end{array}}
\newcommand{\dis}{\displaystyle}
\newcommand{\Pf}{\vspace{0.3 cm}\no\underline{\it Proof}\hspace{0.7 cm}}
\newcommand{\vs}{\vspace{0.6 cm}}
\newcommand{\hs}{\hspace{1 cm}}
\newcommand{\adrsq}{(1-\frac{1}{r})S_r^2}
\newcommand{\bym}{\bar{y}_m^*}
\newcommand{\byr}{\bar{y}_r}
\newcommand{\sumh}{\sum_{c=1}^C}
\newcommand{\hyi}{\hat{Y}_I}
\newcommand{\A}{A}
\newcommand{\rr}{A_R\,}
\newcommand{\hnv}{\hat{N}_v}
\newcommand{\hrv}{\hat{R}_v}
\newcommand{\byn}{\bar{y}_n}
\newcommand{\bxm}{\bar{x}_m^*}
\newcommand{\byi}{\hat{\mu}_I}
\newcommand{\bxr}{\bar{x}_r}
\newcommand{\bxn}{\bar{x}_n}
\newcommand{\bxi}{\bar{x}_I}
\newcommand{\byrv}{\bar{y}_r^v}
\newcommand{\byii}{\hat{\mu}_{I(-i)}}
\newcommand{\sumi}{\sum_{i=1}^{n}}
\newcommand{\vnaive}{V_{JK}^I}
\newcommand{\no}{\noindent}
\newcommand{\R}{{\mathcal{R}}}
\newcommand{\pop}{{\mathbf{Y}}}
\newcommand{\pr}{\mathbb{P} }
\newcommand{\indep}{\perp \!\!\! \perp}
\def\mH{\mathcal{H}}


\newcommand{\T}{\mathrm{\scriptscriptstyle T}}
\newcommand{\mis}{ {\mathrm{mis}}} 
\newcommand{\ATT}{ {\mathrm{ATT}}} 
\newcommand{\adj}{ {\mathrm{adj}}} 
\newcommand{\mat}{ {\mathrm{mat}}} 
\newcommand{\obs}{ {\mathrm{obs}}} 
\newcommand{\var}{ {\mathrm{var}}} 
\newcommand{\cov}{ {\mathrm{cov}}} 
\newcommand{\dsm}{ {\mathrm{dsm}}} 
\newcommand{\psm}{ {\mathrm{psm}}} 
\newcommand{\prog}{ {\mathrm{prog}}} 
\newcommand{\J}{ {\mathcal{J}}} 
\newcommand{\mP}{ {\mathbb{P}}} 
\newcommand{\plim}{ {\mathrm{plim}}} 
\newcommand{\F}{ {\mathcal{F}}} 
\newcommand{\rep}{ {\mathrm{rep}}} 
\newcommand{\reg}{ {\mathrm{REG}}} 
\newcommand{\nni}{ {\mathrm{NNI}}} 
\newcommand{\nnri}{ {\mathrm{NNRI}}}
\newcommand{\HT}{ {\mathrm{HT}}} 
\newcommand{\N}{ {\mathcal{N}}} 
\newcommand{\I}{ {\tau}} 
\newcommand{\It}{ \mathcal{I}} 
\newcommand{\logit}{ {\mathrm{logit}}} 
\newcommand{\de}{ {\mathrm{d}}} 
\newcommand{\mx}{ {\mathrm{m.x}}} 
\newcommand{\dm}{ {d_{V}}} 
\newcommand{\E}{ {\mathbb{E}} } 
\newcommand{\bP}{ {\mathbb{P}}} 
\newcommand{\V}{ {\mathbb{V}}} 
\newcommand{\bone}{\mathbf{1}}
\newcommand{\cU}{ {\mathcal{U}}} 
\newcommand{\sgn}{ {\mathrm{sgn}}} 


\global\long\def\theequation{S\arabic{equation}}
\setcounter{equation}{0}
\global\long\def\thefigure{S\arabic{figure}}
\setcounter{figure}{0}
\global\long\def\thesection{S\arabic{section}}
\setcounter{section}{0}
%\newcommand{\thefigure}{S\arabic{figure}}
%\newcommand{\thesection}{S\arabic{section}}
\global\long\def\thetheorem{S\arabic{theorem}}
\setcounter{theorem}{0}
\global\long\def\thecondition{S\arabic{condition}}
\setcounter{condition}{0}
\global\long\def\theremark{S\arabic{remark}}
\setcounter{remark}{0}
\global\long\def\thestep{S\arabic{step}}
\setcounter{step}{0}
\global\long\def\theassumption{S\arabic{assumption}}
\setcounter{assumption}{0}


\renewcommand{\labelenumi}{\alph{enumi})}




% If your paper is accepted, change the options for the package
% aistats2022 as follows:
%
%\usepackage[accepted]{aistats2022}
%
% This option will print headings for the title of your paper and
% headings for the authors names, plus a copyright note at the end of
% the first column of the first page.

% If you set papersize explicitly, activate the following three lines:
%\special{papersize = 8.5in, 11in}
%\setlength{\pdfpageheight}{11in}
%\setlength{\pdfpagewidth}{8.5in}

% If you use natbib package, activate the following three lines:
%\usepackage[round]{natbib}
%\renewcommand{\bibname}{References}
%\renewcommand{\bibsection}{\subsubsection*{\bibname}}

% If you use BibTeX in apalike style, activate the following line:
%\bibliographystyle{apalike}


The supplementary material is structured as follows:
Section \ref{subsec:Regularity-conditions} includes additional assumptions for the likelihood and density functions.
Section \ref{subsec:Proof} provides proofs for the main theorems.
Section \ref{sec:Toy-example} presents the toy example mentioned in Section 2.
Section \ref{plots} displays additional figures for the first simulation study. The simulation code is available at \url{https://github.com/yuwen997/simulation-codes}.

\section{Regularity conditions}\label{subsec:Regularity-conditions}

In this section, we state the same assumptions on the likelihood and density functions as those in
\citet{fan2004nonconcave,white1982maximum}. To aid understanding, we first give an informal summary of Assumptions S1--S9. Assumptions S1--S7 align with A1--A7 in \citet{white1982maximum} and ensure consistency and asymptotic normality of the maximum likelihood estimator under both misspecified and correctly specified models. Assumptions S8--S9 resemble conditions F--G in \citet{fan2004nonconcave} and bound the moments of the derivatives of $\log f$.
\begin{assumption}\label{f1}The
independent random vectors $(p_{i},Y_{i}),$ $i=1,\ldots,N,$ have
common joint distribution function $G$ on $\Upsilon$, a measurable
Euclidean space, with measurable Radon-Nikodym density $g=dG/d\mu.$ 

\end{assumption}

\begin{assumption}\label{f1-1}The family of distribution functions
$F(Y_{1},p,\theta)$ has Radon-Nikodym densities $f(y,p,\theta)=dF(y,p,\theta)/d\mu$
which are measurable in $(y,p)$ for every $\theta$ in $\Theta,$
a compact subset of Euclidean space, and continuous in $\theta$
for every $(y,p)$ in $\Upsilon.$

\end{assumption}

\begin{assumption}\label{f1-2} (a) $\E\{\log g(Y,p)\}$ exists and $\vert\log f(y,p,\theta)\vert\leq m(y,p)$
for all $\theta$ in $\Theta$, where $m$ is integrable with respect
to $G$; (b) $KLIC(g:f,\theta)$ has a unique minimum at $\theta_{*}$
in $\Theta$.

\end{assumption}

\begin{assumption}\label{f1-3} $\partial\log f(y,p,\theta)/\partial\theta_{j},j=1,\ldots,K,$
are measurable functions of $(y,p)$ for each $\theta$ in $\Theta$
and continuously differentiable functions of $\theta$ for each $(y,p)$
in $\Upsilon.$ \end{assumption}

\begin{assumption}\label{f1-4} $\vert\partial^{2}\log f(y,p,\theta)/\partial\theta_{i}\partial\theta_{j}\vert$
and $\vert\partial\log f(y,p,\theta)/\partial\theta_{i}\cdot\partial\log f(y,p,\theta)/\partial\theta_{j}\vert$,
$i,j=1,\ldots,K$ are dominated by functions integrable with respect
to $G$ for each $\theta$ in $\Theta$ and $(y,p)$ in $\Upsilon.$
\end{assumption}

\begin{assumption}\label{f1-5} Define matrix 
\begin{align*}
A\left(\theta\right) & =-{\mathbb{E}}\left\{\frac{\partial^{2}\log f\left(Y_{1},p_{1},\theta\right)}{\partial\theta_{j}\partial\theta_{k}}\right\}>0,\\
B(\theta) & =\E\left[\left\{ \frac{\partial\log f\left(Y_{1},p_{1},\theta\right)}{\partial\theta}\right\} \left\{ \frac{\partial\log f\left(Y_{1},p_{1},\theta\right)}{\partial\theta}\right\} ^{{\rm T}}\right],
\end{align*}
and (a) $\theta_{*}$ is interior to $\Theta$; (b) $B(\theta_{*})$
is nonsingular; (c) $\theta_{*}$ is a regular point of $A(\theta).$

\end{assumption}

\begin{assumption}\label{f1-6} $\vert\partial\left\{ \text{\ensuremath{\partial f(y,p,\theta)/\partial\theta_{i}\cdot f(y,p,\theta)}}\right\} /\partial\theta_{j}\vert$,
$i,j=1,\ldots,K$ are dominated by functions integrable with respect
to $\mu$ for all $\theta$ in $\Theta$ and the minimal support of
$f(y,p,\theta)$ does not depend on $\theta.$ \end{assumption}


\begin{comment}
For every $N$ the observations $\left(p_{1},Y_{1}\right),\ldots,\left(p_{N},Y_{N}\right)$
are i.i.d. with the probability density $f\left(Y_{1},p_{1},\theta\right)$,
which has a common support, and the model is identifiable. Furthermore,
the first derivatives of the likelihood function satisfies the equation
\[
\E_{\theta}\left\{ \frac{\partial\log f\left(Y_{1},p_{1},\theta\right)}{\partial\theta_{j}}\right\} =0\ {\rm for}\ j=1,...,K.
\]
\end{comment}

\begin{assumption}\label{f2}Define matrix 
\begin{align*}
C(\theta) & =A(\theta)B^{-1}(\theta)A(\theta).
\end{align*}

Assume matrix $A(\theta)$ and $B(\theta)$ satisfy conditions 
\begin{align*}
0 & <C_{1}<\lambda_{\min}\left\{ A\left(\theta\right)\right\} \leq\lambda_{\max}\left\{ A\left(\theta\right)\right\} <C_{2}<\infty\ \ {\rm for}\ {\rm all}\ N,\\
0 & <C_{1}^{*}<\lambda_{\min}\left\{ B\left(\theta\right)\right\} \leq\lambda_{\max}\left\{ B\left(\theta\right)\right\} <C_{2}^{*}<\infty\ \ {\rm for}\ {\rm all}\ N,
\end{align*}

and for $j,k=1,\ldots,K,$ 
\[
\E_{\theta}\left\{ \frac{\partial\log f\left(Y_{1},p_{1},\theta\right)}{\partial\theta_{j}}\frac{\partial\log f\left(Y_{1},p_{1},\theta\right)}{\partial\theta_{k}}\right\} ^{2}<C_{3}<\infty
\]

and 
\[
\E_{\theta}\left\{ \frac{\partial^{2}\log f\left(Y_{1},p_{1},\theta\right)}{\partial\theta_{j}\partial\theta_{k}}\right\} ^{2}<C_{4}<\infty.
\]

\end{assumption}

\begin{assumption}\label{f3}
There is a large enough open subset $\omega_{N}$ of $\Theta\subset\mathbb{R}^{K}$
which contains the parameter point $\theta_{*}$, such that for almost
all $\left(p_{i},Y_{i}\right)$ the density admits all third derivatives
$\partial^{3}f(Y_{i},p_{i},\theta)/\partial\theta_{j}\partial\theta_{k}\partial\theta_{l}$
for all $\theta\in\omega_{N}$. Furthermore, there are functions $M_{jkl}$
such that 
\[
\left\vert\frac{\partial^{3}\log f\left(Y_{i},p_{i},\theta\right)}{\partial\theta_{j}\partial\theta_{k}\partial\theta_{l}}\right\vert\leq M_{jkl}\left(Y_{i},p_{i}\right)
\]
for all $\theta\in\omega_{N},$ and 
\[
\E_{\theta}\left\{ M_{jkl}^{2}\left(Y_{i},p_{i}\right)\right\} <C_{5}<\infty
\]
for all $K,N,j,k$ and $l$.

\end{assumption}
\begin{assumption}\label{f4}

Define $h(A,X,S\mid\beta,\delta)=\beta_{{\rm int}}+\beta_{A}A+\beta_{X}^{{\rm T}}p_{\mu}(X)+(1-S)\delta^{{\rm T}}p_{b}(X).$
$f,g$ satisfy that the minimizers $\beta_{*},\delta_{*}$ of $\E\{Y-h(A,X,S\mid\beta,\delta)\}^{2}$
are also the minimizers $\beta_{*},\delta_{*}$ of $KLIC(g:f,\beta,\delta)=\mathbb{E}\left[\log\left\{ g(Y,p)/f(Y,p,\beta,\delta)\right\} \right].$

\end{assumption}

\section{Proof}\label{subsec:Proof}

In this section, we provide proofs of Theorems 1--4. Define $L\left(\theta\right)=L(\beta,\delta)=\sum_{i=1}^{N}\left\{ \ln f\left(Y_{i},p_{i},\beta,\delta\right)\right\} .$
The penalized likelihood function is then $Q(\theta)=L(\theta)-N\sum_{i=1}^{K_{1}}P_{\lambda_{1,i}}\left(\vert\beta_{i}\vert\right)-N\sum_{i=1}^{K_{2}}P_{\lambda_{2,i}}\left(\vert\delta_{i}\vert\right).$
Assume $\beta_{*}=\begin{pmatrix}\beta_{*1}\\
\beta_{*2}
\end{pmatrix},\ \delta_{*}=\begin{pmatrix}\delta_{*1}\\
\delta_{*2}
\end{pmatrix}$ where $\beta_{*1}\neq0$ with $s_{1}$ dimensions, $\delta_{*1}\neq0$
with $s_{2}$ dimensions, $\beta_{*2}=0$ with $K_{1}-s_{1}$ dimensions
and $\delta_{*2}=0$ with $K_{2}-s_{2}$ dimensions. Further, let
$\theta_{*}=\begin{pmatrix}\theta_{*1}\\
\theta_{*2}
\end{pmatrix}$ , where $\theta_{*1}=\begin{pmatrix}\beta_{*1}\\
\delta_{*1}
\end{pmatrix}\neq0$ with $s=s_{1}+s_{2}$ dimensions and $\theta_{*2}=\begin{pmatrix}\beta_{*2}\\
\delta_{*2}
\end{pmatrix}=0$ with $K-s$ dimensions.

\subsection{Theorem S1}

Theorem S1 was previously demonstrated in \citet{lorentz1966approximation}
and \citet{chen2007large}, and we restate it here.

\begin{theorem}\label{theorem1}Let $f\colon\mathbb{R}^{d}\rightarrow\mathbb{R}$ be an unknown function,
and assume that $f\left(\cdot\right)$ is $t$ times continuously
differentiable. Let $K=(q+1)^{d}$ where $x_{1},\ldots,x_{d}$ are
at least up to power $q$, and let $r^{K}(x)$ be the $K$-dimension
power series basis function, $R\left(x\right)=A_{K}r^{K}\left(x\right)$
where $A_{K}$ is the matrix such that $\E\left\{ R\left(X\right)R^{{\rm T}}\left(X\right)\right\} =\mathcal{I}$
where $\mathcal{I}$ is the identity matrix. Then there is a $K$-vector
$\theta$ such that on the compact set $\mathcal{X}$, $\sup_{x\in\mathcal{X}}\vert f\left(x\right) -R^{{\rm T}}\left(x\right)\theta\vert=O\left(K^{-t/d}\right)$.
\end{theorem}


\subsection{Proof for Theorem 1}

Under the combined dataset, the ANCOVA working model is 
\begin{align*}
Y & =\beta_{{\rm int}}+\beta_{A}A+\beta_{X}^{{\rm T}}p_{\mu}(X)+(1-S)b_{0}(X).
\end{align*}

We can rewrite the models as 
\begin{align*}
\tilde{Y} & =\beta_{{\rm int}}+\beta_{A}A+\beta_{X}^{{\rm T}}p_{\mu}(X)\\
 & =\beta^{{\rm T}}p_{\mu}\\
 & =\bar{\mu}_{A,1}(X;\beta),
\end{align*}
where $\tilde{Y}:=Y-(1-S)b_{0}(X).$ Note, $\bar{\mu}_{A,1}(X;\beta)=\beta^{{\rm T}}p_{\mu}$.
Recall that $\bar{\mu}_{A,1}(X)=\beta_{{\rm int}}+\beta_{A}A+\beta_{X}^{{\rm T}}p_{\mu}(X).$
As in the proof of the theorem for linear models in \citet{rosenblum2009using},
the ordinary least squares estimate of $\beta$ is asymptotically
normal and converges in probability to the minimizer $\beta_{*}$
of $\E\{\tilde{Y}-\bar{\mu}_{A,1}(X;\beta)\}^{2}$. Then 
\begin{align*}
\E\{\tilde{Y}-\bar{\mu}_{A,1}(X;\beta)\}^{2} & =\E\{\tilde{Y}-\E\left(\tilde{Y}\mid A,X,S\right)+\E\left(\tilde{Y}\mid A,X,S\right)-\bar{\mu}_{A,1}(X;\beta)\}^{2}\\
 & =\E\{\tilde{Y}-\E\left(\tilde{Y}\mid A,X,S\right)\}^{2}+\E\left\{ \E\left(\tilde{Y}\mid A,X,S\right)-\bar{\mu}_{A,1}(X;\beta)\right\} ^{2}\\
 & =\E\{\tilde{Y}-\E\left(\tilde{Y}\mid A,X,S\right)\}^{2}+\E\left(\E\left[\left\{ \E\left(\tilde{Y}\mid A,X,S\right)-\bar{\mu}_{A,1}(X;\beta)\right\} ^{2}\mid S\right]\right)\\
 & =\E\{\tilde{Y}-\E\left(\tilde{Y}\mid A,X,S\right)\}^{2}+\E \left[\left\{ \E\left(\tilde{Y}\mid A,X,S=1\right)-\bar{\mu}_{A,1}(X;\beta)\right\} ^{2}\mid S=1 \right] \mathbb{P}(S=1)\\
 & +\E\left[ \left\{ \E\left(\tilde{Y}\mid A=0,X,S=0\right)-\bar{\mu}_{0,1}(X;\beta)\right\} ^{2} \mid S=0 \right]\mathbb{P}(S=0).
\end{align*}

Under HCs, by the definition of $b_{0}(X),$ we have 
\begin{align*}
\E\left(\tilde{Y}\mid A=0,X,S=0\right)-\bar{\mu}_{0,1}(X;\beta) & =\E\left\{ Y-b_{0}(X)\mid A=0,X,S=0\right\} -\bar{\mu}_{0,1}(X;\beta)\\
 & =\E\left\{ Y-\E\left(Y\mid A=0,X,S=0\right)+\bar{\mu}_{0,1}(X;\beta)\mid A=0,X,S=0\right\} -\bar{\mu}_{0,1}(X;\beta)\\
 & =\E\left(Y\mid A=0,X,S=0\right)-\E\left(Y\mid A=0,X,S=0\right)+\bar{\mu}_{0,1}(X;\beta)-\bar{\mu}_{0,1}(X;\beta)\\
 & =0.
\end{align*}
Similarly to \citet{wang2021model}, $\beta_{*}$ minimizes 
\begin{align*}
    \E\left[ \left\{ \E\left(\tilde{Y}\mid A,X,S=1\right)-\bar{\mu}_{A,1}(X;\beta)\right\} ^{2}\mid S=1\right]	&=\E\left[\left\{ \E\left(Y\mid A,X,S=1\right)-\bar{\mu}_{A,1}(X;\beta)\right\} ^{2}\mid S=1\right]\\
	&=\E\left[\left\{ \mu_{A,1}(X)-\bar{\mu}_{A,1}(X;\beta)\right\} ^{2}\mid S=1\right].
\end{align*}

% On the other hand, $\beta_{*}$ minimizes 
% \begin{align*}
%     \E[\{Y-\bar{\mu}_{A,1}(X;\beta)\}^{2}\mid S=1]	&=\E[\{Y-\mu_{A,1}(X)+\mu_{A,1}(X)-\bar{\mu}_{A,1}(X;\beta)\}^{2}\mid S=1]\\
% 	&=\E[\{Y-\mu_{A,1}(X)\}^{2}\mid S=1]+\E[\{\mu_{A,1}(X)-\bar{\mu}_{A,1}(X;\beta)\}^{2}\mid S=1]\\
% 	&=\E[\{Y-\mu_{A,1}(X)\}^{2}\mid S=1]+\E\{\mu_{A,1}(X)-\bar{\mu}_{A,1}(X;\beta)\}^{2}.
% \end{align*} Therefore $\tilde{\beta}_{*}=\beta_{*}$,which both minimizes $\E\left\{ \mu_{A,1}(X)-\bar{\mu}_{A,1}(X;\beta)\right\} ^{2}$. Similar in  \citep{wang2021model}, 
Setting the first derivative of
$\E\left[\left\{ \mu_{A,1}(X)-\bar{\mu}_{A,1}(X;\beta)\right\} ^{2} \mid S=1\right]$ with respect to $\beta$ to zero, the first of the resulting equations shows that $\beta_{*}$
satisfies $\E\left\{ \mu_{A,1}(X)-\bar{\mu}_{A,1}(X;\beta_{*})\mid S=1\right \} =0.$
That is, $\beta_{*}$ satisfies $\tau=\E\left\{ \mu_{1,1}(X)-\mu_{0,1}(X)\mid S=1 \right\}  =\E\left\{ \bar{\mu}_{1,1}(X)-\bar{\mu}_{0,1}(X)\mid S=1 \right\}  =\beta_{A*}.$
Similarly, in the REs, we have $\tau=\beta_{A*}.$



%$\E\left\{ \left(\tilde{Y}-\beta^{{\rm T}}p_{\mu}\right)^{{\rm T}}p_{\mu}\right\} =0$.
%On the other hand, the first equation on
%\E\left\{ \left(\tilde{Y}-\beta^{{\rm T}}p_{\mu}\right)^{{\rm T}}p_{\mu}\right\} $ is


% \begin{align*}
% \E\left(\tilde{Y}-\beta^{{\rm T}}p_{\mu}\right) & =\E\left\{ \E\left(\tilde{Y}-\beta^{{\rm T}}p_{\mu}\mid A,X,S\right)\right\} \\
%  & =\E\left[ \E\left\{Y-(1-S)b_{0}(X)-\bar{\mu}_{A,1}(X)\mid A,X,S\right\}\right] \\
%  & =\E\left(\E\left[ \E\left\{Y-(1-S)b_{0}(X)-\bar{\mu}_{A,1}(X)\mid A,X,S\right\}\mid S\right]\right)\\
%  & =\E\left\{ \E\left(Y\mid A,X,S=1\right)-\bar{\mu}_{A,1}(X)\mid S=1\right\} \mathbb{P}(S=1)\\
%  &+\E\left\{ \E\left(Y-b_{0}(X)-\bar{\mu}_{A,1}(X)\mid A,X,S\right)\mid S=0\right\} \mathbb{P}(S=0)\\
%  & =\E\left\{ \E\left(Y\mid A,X,S=1\right)-\bar{\mu}_{A,1}(X)\mid S=1\right\} \mathbb{P}(S=1)\\
%  &+\E\left[ \E\left\{Y-b_{0}(X)-\bar{\mu}_{0,1}(X)\mid A=0,X,S=0\right\}\mid S=0\right] \mathbb{P}(S=0)\\
%  & =\E\left\{ \E\left(Y\mid A,X,S=1\right)-\bar{\mu}_{A,1}(X)\mid S=1\right\} \mathbb{P}(S=1)\\
%  & +\E\left\{ \E\left(Y\mid A=0,X,S=0\right)-\E\left(Y\mid A=0,X,S=0\right)+\bar{\mu}_{0,1}(X)-\bar{\mu}_{0,1}(X)\mid S=0\right\} \mathbb{P}(S=0)\\
%  & =\E\left\{ \E\left(Y\mid A,X,S=1\right)-\bar{\mu}_{A,1}(X)\mid S=1\right\} \mathbb{P}(S=1)\\
%  & =0.
% \end{align*}
% Therefore, we have $\E\{ \E\left(Y\mid A,X,S=1\right)\mid S=1\} =\E\left\{ \bar{\mu}_{A,1}(X)\mid S=1 \right\} ,$
% and therefore $\beta_{A}=\E\{ \bar{\mu}_{1,1}(X)-\bar{\mu}_{0,1}(X)\mid S=1 \} =\E\left\{ \E\left(Y\mid A=1,X,S=1\right)-\E\left(Y\mid A=0,X,S=1\right)\mid S=1 \right\} =\tau.$

% \subsection{Theorem S2}
% \begin{theorem}
%     Define $h(A,X,S\mid\beta,\delta)=\beta_{\rm{int}}+\beta_{A}A+\beta_{X}^{{\rm T}}p_{\mu}(X)+(1-S)\delta^{{\rm T}}p_{b}(X).$ Assume $(\tilde{\beta}_*, \tilde{\delta}_*)$ are the minimizers of 
%     $\E\{Y-h(A,X,S\mid\beta,\delta)\}^{2}$, and the ANCOVA model is \begin{equation*}
%         Y=\beta_{0}+\beta_{A}A+\beta_{X}^{{\rm T}}p_{\mu}(X)+(1-S)\delta^{{\rm T}}p_{b}(X)+\epsilon.
%     \end{equation*}

% Then $\tilde{\beta_*}=\beta_{*}$,  $\tilde{\delta}_*=\delta_{0}$.
% \end{theorem}
% \textbf{Proof}: Under the combined dataset, the ANCOVA working model is 
% \begin{align*}
% Y & =\beta_{{\rm int}}+\beta_{A}A+\beta_{X}^{{\rm T}}p_{\mu}(X)+(1-S)\delta^{{\rm T}}p_{b}(X)+\epsilon.
% \end{align*}
% Similarly in Proof of Theorem for Linear Models in \citet{rosenblum2009using},
% based on Assumption \ref{f4}, the least squares estimates $(\hat{\beta},\hat{\delta})$
% are asymptotically normal and converge in probability to the minimizer
% $(\tilde{\beta}_{*},\tilde{\delta}_{*})$ of $\E\{Y-h(A,X,S\mid\beta,\delta)\}^{2}$
% . Then 
% \begin{align*}
% \E\{Y-h(A,X,S\mid\beta,\delta)\}^{2} & =\E\{Y-\E\left(Y\mid A,X,S\right)+\E\left(Y\mid A,X,S\right)-h(A,X,S\mid\beta,\delta)\}^{2}\\
%  & =\E\{Y-\E\left(Y\mid A,X,S\right)\}^{2}+\E\{\E\left(Y\mid A,X,S\right)-h(A,X,S\mid\beta,\delta)\}^{2}\\
%  & =\E\{Y-\E\left(Y\mid A,X,S\right)\}^{2}+\E\left[\E\left\{ \E\left(Y\mid A,X,S\right)-h(A,X,S\mid\beta,\delta)\right\} ^{2}\mid S\right]\\
%  & =\E\{Y-\E\left(Y\mid A,X,S\right)\}^{2}+\E\left\{ \E\left(Y\mid A,X,S=1\right)-h(A,X,S=1\mid\beta,\delta)\right\} ^{2}\mathbb{P}(S=1)\\
%  & +\E\left\{ \E\left(Y\mid A=0,X,S=0\right)-h(A=0,X,S=0\mid\beta,\delta)\right\} ^{2}\mathbb{P}(S=0).
% \end{align*}

% By the definition of $b_{0}(X),$ $\E\left\{ \E\left(Y\mid A=0,X,S=0\right)-h(A=0,X,S=0\mid\beta,\delta)\right\} ^{2}$
% is minimized iff $\tilde{\delta}_{*}$ as $\delta_{*},$ then we have 
% \begin{align*}
% h(A=0,X,S=0\mid\beta,\delta) & =\beta_{\rm{int}}+\beta_{X}^{{\rm T}}p_{\mu}(X)+\delta_{*}^{{\rm T}}p_{b}(X)\\
%  & =\bar{\mu}_{0,1}(X;\beta)+\mathbb{E}(Y\mid X,A=0,S=0)-\bar{\mu}_{0,1}(X;\beta)\\
%  & =\mathbb{E}(Y\mid X,A=0,S=0).
% \end{align*}
% and $\E\left\{ \E\left(Y\mid A=0,X,S=0\right)-h(A=0,X,S=0\mid\beta,\delta)\right\} ^{2}=0.$
% Similar in \citet{wang2021model}, $\tilde{\beta}_{*}$ minimizes 
% \begin{align*}
% \E\left\{ \E\left(Y\mid A,X,S=1\right)-h(A,X,S=1\mid\beta)\mid S=1\right\} ^{2} & =\E\left\{ \mu_{A,1}(X)-\bar{\mu}_{A,1}(X;\beta)\right\} ^{2}.
% \end{align*}
% Therefore, $\tilde{\beta}_{*}={\beta}_{*}$, which both minimize $\E\left\{ \mu_{A,1}(X)-\bar{\mu}_{A,1}(X;\beta)\right\} ^{2}$. 
% % By the first formula of taking the first derivative of
% % $\E\left\{ \mu_{A,1}(X)-\bar{\mu}_{A,1}(X;\beta)\right\} ^{2},$ $\beta_{*}$
% % satisfies $\E\left\{ \mu_{A,1}(X)-\bar{\mu}_{A,1}(X;\beta)\right\} =0.$
% % That is, $\beta_{*}$ satisfies $\tau=\E\left\{ \mu_{1,1}(X)-\mu_{0,1}(X)\right\} =\E\left\{ \bar{\mu}_{1,1}(X;\beta)-\bar{\mu}_{0,1}(X;\beta)\right\} =\beta_{A*}.$
% % Similarly, in the REs, we have $\tau=\beta_{A*}.$ 

\subsection{Proof for Theorem 2}

Assume $g(Y,p)$ is the true density function, $f(Y,p,\theta)$ is
our working density function.
By Assumption \ref{f4}, the selected $f$ makes minimizing the KLIC equivalent to minimizing the least-squares criterion $\E\{Y-h(A,X,S\mid\beta,\delta)\}^{2}$. Therefore, $\theta_{*}=(\beta_*^{\rm{T}},\delta_*^{\rm{T}})^{\rm{T}}$ is also the parameter which
minimizes the Kullback-Leibler Information Criterion (KLIC), 
\[
KLIC(g:f,\theta)=\mathbb{E}\left[\log\left\{ g(Y,p)/f(Y,p,\theta)\right\} \right].
\]

We follow a proof similar to that in \citet{fan2004nonconcave}. Let $a_{N}=\sqrt{K}\left(N^{-1/2}+\alpha_{N}\right)$
and set $\|\mathbf{u}\|_{2}=C$, where $C$ is a large enough constant.
Our aim is to show that for any given $\epsilon>0$ there is a large
constant $C$ such that, for large $N$, we have 
\[
\mathbb{P}\left\{ \sup_{\|\mathbf{u}\|_{2}=C}Q(\theta_{*}+a_{N}\mathbf{u})<Q(\theta_{*})\right\} \geq1-\epsilon.
\]

This implies that with probability tending to $1$ there is a local
maximum $\hat{\theta}$ in the ball $\left\{ \theta_{*}+a_{N}\mathbf{u}:\|\mathbf{u}\|_{2}\leq C\right\} $
such that $\|\hat{\theta}-\theta_{*}\|_{2}=O_{p}\left(a_{N}\right).$
Because $P_{\lambda_{1}}(0)=P_{\lambda_{2}}(0)=0,$ we have 
\begin{align*}
D(\mathbf{u}) & =Q(\theta_{*}+a_{N}\mathbf{u})-Q(\theta_{*})\\
 & \leq\underbrace{L(\theta_{*}+a_{N}\mathbf{u})-L(\theta_{*})}_{\left(I\right)}\\
 & \underbrace{-N\sum_{j=1}^{s_{1}}\left\{ P_{\lambda_{1}}\left(\vert\beta_{*,j}+a_{N}u_{1j}\vert\right)-P_{\lambda_{1}}\left(\vert\beta_{*,j}\vert\right)\right\} }_{\left(II\right)}\\
 & \underbrace{-N\sum_{j=1}^{s_{2}}\left\{ P_{\lambda_{2}}\left(\vert\delta_{*,j}+a_{N}u_{2j}\vert\right)-P_{\lambda_{2}}\left(\vert\delta_{*,j}\vert\right)\right\} }_{\left(III\right)}\\
 & :=\left(I\right)+\left(II\right)+\left(III\right),
\end{align*}
where $\mathbf{u}^{{\rm T}}=(\mathbf{u}_{1}^{{\rm T}},\mathbf{u}_{2}^{{\rm T}})$
with $\mathbf{u}_{1}$ of dimension $K_{1}$
and $\mathbf{u}_{2}$ of dimension $K_{2}$.
First for $(II)$ we have 
\begin{align*}
(II) & =-\sum_{j=1}^{s_{1}}\left[Na_{N}P_{\lambda_{1}}^{'}\left(\vert\beta_{*,j}\vert\right)\sgn(\beta_{*,j})u_{1j}+Na_{N}^{2}P_{\lambda_{1}}^{''}\left(\vert\beta_{*,j}\vert\right)u_{1j}^{2}\left\{ 1+o(1)\right\} \right]\\
 & :=I_{1}+I_{2},\\
\vert I_{1}\vert & \leq\sum_{j=1}^{s_{1}}\vert Na_{N}P_{\lambda_{1}}^{'}\left(\vert\beta_{*,j}\vert\right)\sgn(\beta_{*,j})u_{1j}\vert\leq\sqrt{s_{1}}Na_{N}\alpha_{N}\|\mathbf{u}_{1}\|_{2}\leq Na_{N}^{2}\|\mathbf{u}\|_{2},\\
\vert I_{2}\vert & =\sum_{j=1}^{s_{1}}Na_{N}^{2}P_{\lambda_{1}}^{''}(\vert\beta_{*,j}\vert)u_{1j}^{2}\left\{ 1+o(1)\right\} \leq2\max_{1\leq j\leq s_{1}}P_{\lambda_{1}}^{''}(\vert\beta_{*,j}\vert)Na_{N}^{2}\|\mathbf{u}\|_{2}^{2}.
\end{align*}
Similarly for $(III).$ Then for $(I)$ we have 
\begin{align*}
(I) & =a_{N}\text{\ensuremath{\nabla^{{\rm T}}L(\theta_{*})\mathbf{u}+\frac{1}{2}\mathbf{u}^{{\rm T}}\nabla^{2}L(\theta_{*})\mathbf{u}a_{N}^{2}+\frac{1}{6}\nabla^{{\rm T}}\left\{ \mathbf{u}^{{\rm T}}\nabla^{2}L(\theta^{*})\mathbf{u}\right\} \mathbf{u}a_{N}^{3}}}\\
 & :=I_{3}+I_{4}+I_{5},
\end{align*}
and, following the proof of Theorem 1 in \citet{fan2004nonconcave}, by
Assumption \ref{f2}, we have 
\begin{align*}
\vert I_{3}\vert & =\vert a_{N}\nabla^{{\rm T}}L(\theta_{*})\mathbf{u}\vert\leq a_{N}\|\nabla^{{\rm T}}L(\theta_{*})\|_{2}\|\mathbf{u}\|_{2}=O_{p}(a_{N}^{2}N)\|\mathbf{u}\|_{2}.\\
I_{4} & =\frac{1}{2}\mathbf{u}^{{\rm T}}\left\{ \frac{1}{N}\left(\left[\nabla^{2}L(\theta_{*})-\E\left\{ \nabla^{2}L(\theta_{*})\right\} \right]\right)\right\} \mathbf{u}Na_{N}^{2}\\
 & -\frac{1}{2}\mathbf{u}^{{\rm T}}A(\theta_{*})\mathbf{u}Na_{N}^{2}\\
 & =-\frac{Na_{N}^{2}}{2}\mathbf{u}^{{\rm T}}A(\theta_{*})\mathbf{u}+o_{p}(1)Na_{N}^{2}\|\mathbf{u}\|_{2}^{2}.
\end{align*}
By Assumption \ref{f3} and $K^{4}/N\rightarrow0$ and $K^{2}\alpha_{N}\rightarrow0$
as $N\rightarrow\infty$, we have 
\begin{align*}
\vert I_{5}\vert & =\left\vert\frac{1}{6}\sum_{i,j,k=1}^{K}\frac{\partial^{3}L(\theta^{*})}{\partial\theta_{i}\partial\theta_{j}\partial\theta_{k}}u_{i}u_{j}u_{k}a_{N}^{3}\right\vert\\
 & \leq\frac{1}{6}\sum_{l=1}^{N}\left\{ \sum_{i,j,k=1}^{K}M_{ijk}^{2}(Y_{l},p_{l})\right\} ^{1/2}\|\mathbf{u}\|_{2}^{3}a_{N}^{3}\\
 & =o_{p}(Na_{N}^{2})\|\mathbf{u}\|_{2}^{2}.
\end{align*}
Therefore, by Assumption 5 and allowing $\|\mathbf{u}\|_{2}$ to be
large enough, all of $I_{1},I_{2},I_{3},I_{5}$ and $(III)$ are dominated
by $I_{4}$, which is negative. This proves $\|\hat{\theta}-\theta_{*}\|_{2}=O_{p}\left\{ \sqrt{K}\left(N^{-1/2}+\alpha_{N}\right)\right\} .$
Further, we have $\max\left\{ \|\hat{\beta}-\beta_{*}\|_{2},\|\hat{\delta}-\delta_{*}\|_{2}\right\} \leq\|\hat{\theta}-\theta_{*}\|_{2}=O_{p}\left\{ \sqrt{K}\left(N^{-1/2}+\alpha_{N}\right)\right\} .$
For the SCAD penalty, it is clear that $\alpha_{N}=O_{p}\left(N^{-1/2}\right)$,
so root-$\left(N/K\right)$-consistent
penalized likelihood estimators $\hat{\beta}$ and $\hat{\delta}$ exist with probability tending to
1, and no requirements are imposed on the convergence rate of $\lambda_{1}$
and $\lambda_{2}$.

\subsection{Proof for Theorem 3}

We follow a proof similar to that in \citet{fan2004nonconcave}. We first
show that the nonconcave penalized estimator possesses the sparsity
property $\hat{\theta}_{2}=0$ by the following lemma.

\begin{lemma}\label{lemma1}Assume that Assumption 5 and Assumptions \ref{f1}--\ref{f3}
are satisfied. If $\lambda_{1},\lambda_{2}\rightarrow0,$ $\sqrt{N/K}\lambda_{1}\rightarrow\infty,$
$\sqrt{N/K}\lambda_{2}\rightarrow\infty$, and $K^{5}/N\rightarrow0$
as $N\rightarrow\infty,$ then, with probability tending
to $1$, for any given $\theta_{1}$ satisfying $\|\theta_{1}-\theta_{*1}\|_{2}=O_{p}\left(\sqrt{K/N}\right)$
and any constant $C,$ 
\[
Q\left\{ (\theta_{1}^{{\rm T}},0)^{{\rm T}}\right\} =\max_{\|\theta_{2}\|_{2}\leq C(K/N)^{1/2}}Q\left\{ (\theta_{1}^{{\rm T}},\theta_{2}^{{\rm T}})^{{\rm T}}\right\} .
\]

\end{lemma}

Proof: Let $\epsilon=C\sqrt{K/N}.$ It is sufficient to show that
with probability tending to $1$ as $N\rightarrow\infty,$ for any
$\theta_{1}-\theta_{*1}=O_{p}\left(\sqrt{K/N}\right)$ we have for
$j=s+1,\ldots,K,$

\[
\begin{array}{ccc}
\frac{\partial Q(\theta)}{\partial\theta_{j}} & <0 & {\rm for}\ 0<\theta_{j}<\epsilon,\\
\frac{\partial Q(\theta)}{\partial\theta_{j}} & >0 & {\rm for}\ -\epsilon<\theta_{j}<0.
\end{array}
\]

By Taylor expansion,

\begin{align*}
\frac{\partial Q(\theta)}{\partial\theta_{j}} & =\frac{\partial L(\theta)}{\partial\theta_{j}}-NP_{\lambda}^{'}(\vert\theta_{j}\vert)\sgn\left(\theta_{j}\right)\\
 & =\frac{\partial L(\theta_{*})}{\partial\theta_{j}}+\sum_{l=1}^{K}\frac{\partial^{2}L(\theta_{*})}{\partial\theta_{j}\partial\theta_{l}}(\theta_{l}-\theta_{*,l})\\
 & +\sum_{l,k=1}^{K}\frac{\partial^{3}L(\theta^{*})}{\partial\theta_{j}\partial\theta_{l}\partial\theta_{k}}(\theta_{l}-\theta_{*,l})\left(\theta_{k}-\theta_{*,k}\right)\\
 & -NP_{\lambda}^{'}(\vert\theta_{j}\vert)\sgn\left(\theta_{j}\right)\\
 & :=I_{1}+I_{2}+I_{3}+I_{4},
\end{align*}
where $\theta^{*}$ lies between $\theta$ and $\theta_{*}$, and
$P_{\lambda}^{'}(\vert\theta_{j}\vert)=P_{\lambda_{1}}^{'}(\vert\beta_{j}\vert)$
for $j=s+1,\ldots,K_{1}-s_{1}+s,$ and $P_{\lambda}^{'}(\vert\theta_{j}\vert)=P_{\lambda_{2}}^{'}(\vert\delta_{j}\vert)$
for $j=K_{1}-s_{1}+s+1,\ldots,K.$

Following the same proof in \citet{fan2004nonconcave}, we prove $I_{1}+I_{2}+I_{3}=O_{p}\left(\sqrt{NK}\right).$
First, $I_{1}=O_{p}\left(\sqrt{N}\right)=O_{p}\left(\sqrt{NK}\right).$
Also, 
\begin{align*}
I_{2} & =\sum_{l=1}^{K}\left(\frac{\partial^{2}L(\theta_{*})}{\partial\theta_{j}\partial\theta_{l}}-\E\left\{ \frac{\partial^{2}L(\theta_{*})}{\partial\theta_{j}\partial\theta_{l}}\right\} \right)\left(\theta_{l}-\theta_{*,l}\right)\\
 & +\sum_{l=1}^{K}\E\left\{ \frac{\partial^{2}L(\theta_{*})}{\partial\theta_{j}\partial\theta_{l}}\right\} \left(\theta_{l}-\theta_{*,l}\right)\\
 & :=S_{1}+S_{2}.
\end{align*}

Using the Cauchy--Schwarz inequality and $\|\theta-\theta_{*}\|_{2}=O_{p}\left(\sqrt{K/N}\right),$ we
have 
\begin{align*}
\vert S_{2}\vert & =\vert N\sum_{l=1}^{K}A(\theta_{*})(j,l)(\theta_{l}-\theta_{*,l})\vert\\
 & \leq NO_{p}\left(\sqrt{\frac{K}{N}}\right)\left\{ \sum_{l=1}^{K}A^{2}(\theta_{*})(j,l)\right\} ^{1/2}.
\end{align*}
By Assumption \ref{f2}, as the eigenvalues of the $A(\theta)$ are
bounded, we have $S_{2}=O_{p}\left(\sqrt{NK}\right).$ On the other
hand, 
\[
\vert S_{1}\vert\leq\|\theta-\theta_{*}\|_{2}\left(\sum_{l=1}^{K}\left[\frac{\partial^{2}L(\theta_{*})}{\partial\theta_{j}\partial\theta_{l}}-\E\left\{ \frac{\partial^{2}L(\theta_{*})}{\partial\theta_{j}\partial\theta_{l}}\right\} \right]^{2}\right)^{1/2}.
\]
By Assumption \ref{f2}, we have 
\[
\left(\sum_{l=1}^{K}\left[\frac{\partial^{2}L(\theta_{*})}{\partial\theta_{j}\partial\theta_{l}}-\E\left\{ \frac{\partial^{2}L(\theta_{*})}{\partial\theta_{j}\partial\theta_{l}}\right\} \right]^{2}\right)^{1/2}=O_{p}\left(\sqrt{NK}\right).
\]
Therefore $S_{1}=O_{p}\left(\sqrt{NK}\right)$ and $I_{2}=O_{p}\left(\sqrt{NK}\right)$.
Further, 
\begin{align*}
I_{3} & =\sum_{l,k=1}^{K}\left[\frac{\partial^{3}L(\theta^{*})}{\partial\theta_{j}\partial\theta_{l}\partial\theta_{k}}-\E\left\{ \frac{\partial^{3}L(\theta^{*})}{\partial\theta_{j}\partial\theta_{l}\partial\theta_{k}}\right\} \right](\theta_{l}-\theta_{*,l})\left(\theta_{k}-\theta_{*,k}\right)\\
 & +\sum_{l,k=1}^{K}\E\left\{ \frac{\partial^{3}L(\theta^{*})}{\partial\theta_{j}\partial\theta_{l}\partial\theta_{k}}\right\} (\theta_{l}-\theta_{*,l})\left(\theta_{k}-\theta_{*,k}\right)\\
 & :=S_{3}+S_{4}.
\end{align*}

By Assumption \ref{f3}, $\vert S_{4}\vert\leq C_{5}^{1/2}NK\|\theta-\theta_{*}\|_{2}^{2}=O_{p}(K^{2})=o_{p}\left(\sqrt{NK}\right).$
Further, 
\[
S_{3}^{2}\leq\sum_{l,k=1}^{K}\left[\frac{\partial^{3}L(\theta^{*})}{\partial\theta_{j}\partial\theta_{l}\partial\theta_{k}}-\E\left\{ \frac{\partial^{3}L(\theta^{*})}{\partial\theta_{j}\partial\theta_{l}\partial\theta_{k}}\right\} \right]^{2}\|\theta-\theta_{*}\|_{2}^{4},
\]
where under the Assumption \ref{f3} and Assumption 5, $S_{3}=O_{p}\left\{ \left(NK^{2}\frac{K^{2}}{N^{2}}\right)^{1/2}\right\} =o_{p}\left(\sqrt{NK}\right).$
Then 
\[
I_{1}+I_{2}+I_{3}=O_{p}\left(\sqrt{NK}\right).
\]

Because we focus on the SCAD penalty, \citet{fan2004nonconcave} illustrates
that under Assumption 5, the SCAD penalty satisfies that 
\begin{align*}
\liminf_{N\rightarrow+\infty}\liminf_{\beta\rightarrow0+}P_{\lambda_{1}}^{'}(\beta)/\lambda_{1} & >0,\\
\liminf_{N\rightarrow+\infty}\liminf_{\delta\rightarrow0+}P_{\lambda_{2}}^{'}(\delta)/\lambda_{2} & >0,
\end{align*}
therefore from 
\[
\frac{\partial Q(\theta)}{\partial\theta_{j}}=N\lambda\left\{ -\frac{P_{\lambda}^{'}\left(\vert\theta_{j}\vert\right)}{\lambda}\sgn({\theta_{j}})+O_{p}\left(\sqrt{\frac{K}{N}}/\lambda\right)\right\} ,
\]
where $\lambda=\lambda_{1}$ if $j=s+1,\ldots,K_{1}-s_{1}+s$ and
$\lambda=\lambda_{2}$ if $j=K_{1}-s_{1}+s+1,\ldots,K,$ and $P_{\lambda}^{'}(\vert\theta_{j}\vert)=P_{\lambda_{1}}^{'}(\vert\beta_{j}\vert)$
for $j=s+1,\ldots,K_{1}-s_{1}+s,$ and $P_{\lambda}^{'}(\vert\theta_{j}\vert)=P_{\lambda_{2}}^{'}(\vert\delta_{j}\vert)$
for $j=K_{1}-s_{1}+s+1,\ldots,K,$ the sign of $\theta_{j}$ completely
determines the sign of $\partial Q(\theta)/\partial\theta_{j}.$ We
complete the proof of Lemma \ref{lemma1}.

By Lemma \ref{lemma1}, we obtain $\hat{\theta}_{2}=\begin{pmatrix}\hat{\beta}_{2}\\
\hat{\delta}_{2}
\end{pmatrix}=0$. Next, we prove part 2.

Let 
\begin{align*}
\Sigma & ={\rm diag}\left\{ P_{\lambda}^{''}\left(\theta_{*,1}\right),\ldots,P_{\lambda}^{''}\left(\theta_{*,s}\right)\right\} \\
 & ={\rm diag}\left\{ P_{\lambda_{1}}^{''}\left(\beta_{*,1}\right),\ldots,P_{\lambda_{1}}^{''}\left(\beta_{*,s_{1}}\right),P_{\lambda_{2}}^{''}\left(\delta_{*,1}\right),\ldots,\ P_{\lambda_{2}}^{''}\left(\delta_{*,s_{2}}\right)\right\} 
\end{align*}
and 
\begin{align*}
b & =\left\{ P_{\lambda}^{'}\left(\vert\theta_{*,1}\vert\right)\sgn\left(\theta_{*,1}\right),\ldots,P_{\lambda}^{'}\left(\vert\theta_{*,s}\vert\right)\sgn\left(\theta_{*,s}\right)\right\} ^{{\rm T}}\\
 & =\left\{ P_{\lambda_{1}}^{'}\left(\vert\beta_{*,1}\vert\right)\sgn\left(\beta_{*,1}\right),\ldots,P_{\lambda_{1}}^{'}\left(\vert\beta_{*,s_{1}}\vert\right)\sgn\left(\beta_{*,s_{1}}\right),\ P_{\lambda_{2}}^{'}\left(\vert\delta_{*,1}\vert\right)\sgn\left(\delta_{*,1}\right),\ldots,P_{\lambda_{2}}^{'}\left(\vert\delta_{*,s_{2}}\vert\right)\sgn\left(\delta_{*,s_{2}}\right)\right\} ^{{\rm T}}.
\end{align*}

If we can show that 
\[
\left\{ A(\theta_{*1})+\Sigma\right\} \left(\hat{\theta}_{1}-\theta_{*1}\right)+b=\frac{1}{N}\nabla L(\theta_{*1})+o_{p}(N^{-1/2}),
\]
then 
\begin{align*}
 & \sqrt{N}WA^{-1/2}(\theta_{*1})\left\{ A(\theta_{*1})+\Sigma\right\} \left[\hat{\theta}_{1}-\theta_{*1}+\left\{ A(\theta_{*1})+\Sigma\right\} ^{-1}b\right]\\
= & \frac{1}{\sqrt{N}}WA^{-1/2}(\theta_{*1})\nabla L(\theta_{*1})+o_{p}\left\{ WA^{-1/2}(\theta_{*1})\right\} \\
= & \frac{1}{\sqrt{N}}WA^{-1/2}(\theta_{*1})\nabla L(\theta_{*1})+o_{p}(1).
\end{align*}

Let $R_{i}=\frac{1}{\sqrt{N}}WA^{-1/2}(\theta_{*1})\nabla L_{i}(\theta_{*1}),$
$i=1,\ldots,N.$ Following the same proof in \citet{fan2004nonconcave},
for any $\epsilon,$ we have 
\begin{align*}
\sum_{i=1}^{N}\E\|R_{i}\|_{2}^{2}\bone\left\{ \|R_{i}\|_{2}>\epsilon\right\}  & =N\mathbb{E}\|R_{1}\|_{2}^{2}\bone\left\{ \|R_{1}\|_{2}>\epsilon\right\} \\
 & \leq N(\mathbb{E}\|R_{1}\|_{2}^{4})^{1/2}\left\{ \mathbb{P}\left(\|R_{1}\|_{2}>\epsilon\right)\right\} ^{1/2}.
\end{align*}

By Assumption \ref{f2} and $WW^{{\rm T}}\rightarrow G,$ we obtain
\[
\mathbb{P}\left(\|R_{1}\|_{2}>\epsilon\right)\leq\frac{\mathbb{E}\|WA^{-1/2}(\theta_{*1})\nabla L_{1}(\theta_{*1})\|_{2}^{2}}{N\epsilon^{2}}=O(N^{-1})
\]
 and 
\begin{align*}
\mathbb{E}\|R_{1}\|_{2}^{4} & =\frac{1}{N^{2}}\mathbb{E}\|WA^{-1/2}(\theta_{*1})\nabla L_{1}(\theta_{*1})\|_{2}^{4}\\
 & \leq\frac{1}{N^{2}}\lambda_{{\rm max}}(WW^{{\rm T}})\lambda_{{\rm max}}\left\{ A^{-1}(\theta_{*1})\right\} \mathbb{E}\|\nabla^{{\rm T}}L_{1}(\theta_{*1})\nabla L_{1}(\theta_{*1})\|_{2}^{2}\\
 & =O\left(\frac{K^{2}}{N^{2}}\right).
\end{align*}

Thus, we have 
\[
\sum_{i=1}^{N}\E\|R_{i}\|_{2}^{2}\bone\left\{ \|R_{i}\|_{2}>\epsilon\right\} =O\left(N\frac{K}{N}\frac{1}{\sqrt{N}}\right)=o(1),
\]
and
\[
\sum_{i=1}^{N}\cov(R_{i})=\cov\left\{ WA^{-1/2}(\theta_{*1})\nabla L_{1}(\theta_{*1})\right\} =WA^{-1/2}(\theta_{*1})B(\theta_{*1})A^{-1/2}(\theta_{*1})W^{{\rm T}},
\]
so that the $R_{i}$ satisfies the conditions of the Lindeberg-Feller
central limit theorem. Further, using the Taylor expansion on $\nabla Q(\hat{\theta}_{1})$
at the point $\theta_{*1}$, we have 
\begin{align*}
 & \frac{1}{N}\left[\left\{ \nabla^{2}L(\theta_{*1})-\nabla^{2}P_{\lambda}(\theta_{1}^{**})\right\} \left(\hat{\theta}_{1}-\theta_{*1}\right)-\nabla P_{\lambda}\left(\theta_{*1}\right)\right]\\
= & -\frac{1}{N}\left[\nabla L(\theta_{*1})+\frac{1}{2}\left(\hat{\theta}_{1}-\theta_{*1}\right)^{{\rm T}}\nabla^{2}\left\{ \nabla L\left(\theta_{1}^{*}\right)\right\} \left(\hat{\theta}_{1}-\theta_{*1}\right)\right],
\end{align*}
where $\theta_{1}^{*}$ and $\theta_{1}^{**}$ lie between $\hat{\theta}_{1}$
and $\theta_{*1}.$ Now define 
\[
\mathcal{L}:=\nabla^{2}L(\theta_{*1})-\nabla^{2}P_{\lambda}(\theta_{1}^{**})
\]
and 
\[
\mathcal{C}:=\frac{1}{2}\left(\hat{\theta}_{1}-\theta_{*1}\right)^{{\rm T}}\nabla^{2}\left\{ \nabla L\left(\theta_{1}^{*}\right)\right\} \left(\hat{\theta}_{1}-\theta_{*1}\right).
\]
Following the proof in \citet{fan2004nonconcave}, under Assumption
\ref{f3} and Assumption 5 and by the Cauchy--Schwarz inequality,
we have $\|1/N\mathcal{C}\|_{2}^{2}=o_{p}\left(1/N\right).$  Further, 
we have 
\[
\lambda_{i}\left\{ \frac{1}{N}\mathcal{L}+A(\theta_{*1})+\Sigma\right\} =o_{p}\left(\frac{1}{\sqrt{K}}\right),i=1,\ldots,s,
\]
where $\lambda_{i}(M)$ is the $i$th eigenvalue of a symmetric matrix
$M$. Therefore, 
\[
\left\{ \frac{1}{N}\mathcal{L}+A(\theta_{*1})+\Sigma\right\} \left(\hat{\theta}_{1}-\theta_{*1}\right)=o_{p}\left(\frac{1}{\sqrt{N}}\right).
\]
Then, we have $\left\{ A(\theta_{*1})+\Sigma\right\} \left(\hat{\theta}_{1}-\theta_{*1}\right)+b=\frac{1}{N}\nabla L(\theta_{*1})+o_{p}(N^{-1/2}),$
and finally we have 
\begin{align*}
 & \sqrt{N}WA^{-1/2}(\theta_{*1})\left\{ A(\theta_{*1})+\Sigma\right\} \left[\hat{\theta}_{1}-\theta_{*1}+\left\{ A(\theta_{*1})+\Sigma\right\} ^{-1}b\right]\\
 & \rightarrow\mathcal{N}\left(0,WA^{-1/2}(\theta_{*1})B(\theta_{*1})A^{-1/2}(\theta_{*1})W^{{\rm T}}\right).
\end{align*}
Further, based on the SCAD penalty, $\Sigma=0$ and $b=0$, therefore,
we have 
\[
\sqrt{N}WA^{1/2}\left(\theta_{*1}\right)\left(\hat{\theta}_{1}-\theta_{*1}\right)\rightarrow\mathcal{N}\left(0,WA^{-1/2}(\theta_{*1})B(\theta_{*1})A^{-1/2}(\theta_{*1})W^{{\rm T}}\right).
\]
 If the model is correctly specified, i.e., $g(Y,p_{i})=f(Y,p_{i},\theta)$
for some $\theta\in\Theta$, then $\theta_{0}=\theta_{*}$, and 
\[
\sqrt{N}WI^{1/2}\left(\theta_{01}\right)\left(\hat{\theta}_{1}-\theta_{01}\right)\rightarrow\mathcal{N}\left(0,WW^{{\rm T}}\right).
\]

This completes the proof of the second part.

\subsection{Comments on Theorem 2 and Theorem 3}

It is important to clarify that if our focus is not primarily on the least squares loss, we can relax the assumptions on $f$ and $g$ from Assumptions \ref{f1}--\ref{f4} to only Assumptions \ref{f1}--\ref{f3} for Theorem 2 and Theorem 3. The critical point is that for any other $f$ and $g$ fulfilling Assumptions \ref{f1}--\ref{f3}, a local maximizer $\hat{\theta}$ of $Q(\theta)$ exists such that $\hat{\theta}$ converges to $\theta_*$, where $\theta_*$ minimizes the KLIC between $f$ and $g$, and $\hat{\theta}$ also exhibits the oracle property and asymptotic normality. However, in the main paper, our primary concern is the least squares loss, and we further constrain $f$ to meet Assumption \ref{f4} in addition to Assumptions \ref{f1}--\ref{f3}. Assumption \ref{f4} implies that the parameters minimizing the KLIC also minimize a specific form: the least squares form. This is because in the main paper, we are particularly interested in ANCOVA least squares estimates.



\subsection{Proof for Theorem 4}

% Under the combined dataset, the ANCOVA working model is 
% \begin{align*}
% Y & =\beta_{{\rm int}}+\beta_{A}A+\beta_{X}^{{\rm T}}p_{\mu}(X)+(1-S)\delta^{{\rm T}}p_{b}(X)+\epsilon,\ \E(\epsilon)=0.
% \end{align*}
%  Define $h(A,X,S\mid\beta,\delta)=\beta_{\rm{int}}+\beta_{A}A+\beta_{X}^{{\rm T}}p_{\mu}(X)+(1-S)\delta^{{\rm T}}p_{b}(X).$
% Similarly in Proof of Theorem for Linear Models in \citet{rosenblum2009using},
% based on Assumption \ref{f4}, the least squares estimates $(\hat{\beta},\hat{\delta})$
% are asymptotically normal and converge in probability to the minimizer
% $(\tilde{\beta}_{*},\delta_{*})$ of $\E\{Y-h(A,X,S\mid\beta,\delta)\}^{2}$
% . Then 
% \begin{align*}
% \E\{Y-h(A,X,S\mid\beta,\delta)\}^{2} & =\E\{Y-\E\left(Y\mid A,X,S\right)+\E\left(Y\mid A,X,S\right)-h(A,X,S\mid\beta,\delta)\}^{2}\\
%  & =\E\{Y-\E\left(Y\mid A,X,S\right)\}^{2}+\E\{\E\left(Y\mid A,X,S\right)-h(A,X,S\mid\beta,\delta)\}^{2}\\
%  & =\E\{Y-\E\left(Y\mid A,X,S\right)\}^{2}+\E\left[\E\left\{ \E\left(Y\mid A,X,S\right)-h(A,X,S\mid\beta,\delta)\right\} ^{2}\mid S\right]\\
%  & =\E\{Y-\E\left(Y\mid A,X,S\right)\}^{2}+\E\left\{ \E\left(Y\mid A,X,S=1\right)-h(A,X,S=1\mid\beta,\delta)\right\} ^{2}\mathbb{P}(S=1)\\
%  & +\E\left\{ \E\left(Y\mid A=0,X,S=0\right)-h(A=0,X,S=0\mid\beta,\delta)\right\} ^{2}\mathbb{P}(S=0).
% \end{align*}

% By the definition of $b_{0}(X),$ $\E\left\{ \E\left(Y\mid A=0,X,S=0\right)-h(A=0,X,S=0\mid\beta,\delta)\right\} ^{2}$
% is minimized iff $\delta_{*}$ as $\delta_{0},$ then we have 
% \begin{align*}
% h(A=0,X,S=0\mid\beta,\delta) & =\beta_{\rm{int}}+\beta_{X}^{{\rm T}}p_{\mu}(X)+\delta_{0}^{{\rm T}}p_{b}(X)\\
%  & =\bar{\mu}_{0,1}(X;\beta)+\mathbb{E}(Y\mid X,A=0,S=0)-\bar{\mu}_{0,1}(X;\beta)\\
%  & =\mathbb{E}(Y\mid X,A=0,S=0).
% \end{align*}
% and $\E\left\{ \E\left(Y\mid A=0,X,S=0\right)-h(A=0,X,S=0\mid\beta,\delta)\right\} ^{2}=0.$
% Similar in \citet{wang2021model}, $\tilde{\beta}_{*}$ minimizes 
% \begin{align*}
% \E\left\{ \E\left(Y\mid A,X,S=1\right)-h(A,X,S=1\mid\beta)\mid S=1\right\} ^{2} & =\E\left\{ \mu_{A,1}(X)-\bar{\mu}_{A,1}(X;\beta)\right\} ^{2}.
% \end{align*}
% Therefore, $\tilde{\beta}_{*}={\beta}_{*}$, which both minimize $\E\left\{ \mu_{A,1}(X)-\bar{\mu}_{A,1}(X;\beta)\right\} ^{2}$. 
% % By the first formula of taking the first derivative of
% % $\E\left\{ \mu_{A,1}(X)-\bar{\mu}_{A,1}(X;\beta)\right\} ^{2},$ $\beta_{*}$
% % satisfies $\E\left\{ \mu_{A,1}(X)-\bar{\mu}_{A,1}(X;\beta)\right\} =0.$
% % That is, $\beta_{*}$ satisfies $\tau=\E\left\{ \mu_{1,1}(X)-\mu_{0,1}(X)\right\} =\E\left\{ \bar{\mu}_{1,1}(X;\beta)-\bar{\mu}_{0,1}(X;\beta)\right\} =\beta_{A*}.$
% % Similarly, in the REs, we have $\tau=\beta_{A*}.$ 
% Therefore, from
% Theorem 4, we have $\hat{\beta}_{A}$ is asymptotically normal and
% converges in probability to $\tau$ in combined dataset and in REs. 
The consistency and the asymptotic normality of the estimator $\hat\tau$ follow from Theorem 2 and Theorem 3. Here we only derive the variance. 
If $\V(\epsilon)=\sigma^{2},$
then the influence function of $\theta$ is 
\[
\phi_{\theta}(x,y,s)=\E\left\{ \begin{pmatrix}p_{\mu}\\
(1-S)p_{b}^{{\rm }}(X)
\end{pmatrix}\begin{pmatrix}p_{\mu}^{{\rm T}}, & (1-S)p_{b}^{{\rm T}}(X)\end{pmatrix}\right\} ^{-1}\begin{pmatrix}p_{\mu}\\
(1-s)p_{b}^{{\rm }}(x)
\end{pmatrix}(y-p^{{\rm T}}\theta).
\]
Then the asymptotic variance of $\hat{\beta}$ is
\begin{align*}
\V(\hat{\beta}) & =\E\left\{ \phi_{\theta}(X,Y,S)\phi_{\theta}^{{\rm T}}(X,Y,S)\right\} _{11}\\
 & =\sigma^{2}\E\left\{ \begin{pmatrix}p_{\mu}p_{\mu}^{{\rm T}} & (1-S)p_{\mu}p_{b}^{{\rm T}}(X)\\
(1-S)p_{b}(X)p_{\mu}^{{\rm T}} & (1-S)p_{b}(X)p_{b}^{{\rm T}}(X)
\end{pmatrix}\right\} _{11}^{-1}\\
 & =\sigma^{2}\E\left[Sp_{\mu}^{{\rm }}p_{\mu}^{{\rm T}}+(1-S)p_{\mu}^{{\rm }}p_{\mu}^{{\rm T}}-(1-S)\left\{ p_{\mu}^{{\rm }}p_{b}^{{\rm T}}(X)\right\} \left\{ p_{b}^{{\rm }}(X)p_{b}^{{\rm T}}(X)\right\} ^{-1}\left\{ p_{b}(X)p_{\mu}^{{\rm T}}\right\} \right]^{-1}.
\end{align*}

On the other hand, under the RE data, the ANCOVA working model is
\begin{align*}
Y & =\beta_{{\rm int}}+\beta_{A}A+\beta_{X}^{{\rm T}}p_{\mu}(X)+\epsilon,\ \E(\epsilon)=0,\ \V(\epsilon)=\sigma^{2}.
\end{align*}
Similarly, the asymptotic variance of $\hat{\beta}_{{\rm RE}}$ is
\begin{align*}
\V(\hat{\beta}_{{\rm RE}}) & =\sigma^{2}\E\left(Sp_{\mu}p_{\mu}^{{\rm T}}\right)^{-1}.
\end{align*}
Then, by H\"older's inequality, 
\begin{align*}
 & \E\left\{ (1-S)p_{\mu}^{{\rm }}p_{\mu}^{{\rm T}}\right\} \\
 & -\E\left[(1-S)\left\{ p_{\mu}^{{\rm }}p_{b}^{{\rm T}}(X)\right\} \left\{ p_{b}^{{\rm }}(X)p_{b}^{{\rm T}}(X)\right\} ^{-1}\left\{ p_{b}(X)p_{\mu}^{{\rm T}}\right\} \right]\geq0,
\end{align*}
where the equality holds iff $p_{\mu}=Mp_{b}(X)$ for some matrix
$M$. Therefore, we have $\V\left\{ \hat{\beta}\right\} \leq\V\left\{ \hat{\beta}_{{\rm RE}}\right\} ,$
and the equality holds iff $p_{\mu}=Mp_{b}(X)$ for some matrix
$M$. Therefore $\V(\hat{\tau}_{{\rm RE}})\geq\V(\hat{\tau}).$


\section{Toy example} \label{sec:Toy-example}

Consider the case $y=x^{{\rm T}}\beta+(1-s)x^{{\rm T}}\delta+\epsilon,\ x=\left(x_{1},\ldots,x_{50}\right)^{{\rm T}}\in\mathbb{R}^{50},\ \beta^{{\rm T}}=(1,\ldots,50)/10,\ \delta^{{\rm T}}=(1,\ldots,50)\times30,\epsilon\sim\mathcal{N}(0,1)$,
where $s$ denotes the zero-one indicator variable that determines
whether the observation belongs to the REs. For simplicity, we assume
$x^{{\rm T}}\beta$ is the correct outcome mean function of the REs,
and $x^{{\rm T}}\delta$ is the bias function reflecting the difference
between ECs and REs, i.e., if $\delta=0$, then the observed covariates
capture all confounders in the ECs and REs and thus the exchangeability
assumption is valid. For didactic purposes, the magnitude of $\delta$
is much larger than the magnitude of $\beta$. Using the same regularization
parameter appears to assign the same weight to $\beta$ and $\delta$;
thus, any penalized regularization method tends to omit the small signals
in $\beta$ and pick up only the large signals in $\delta.$ Therefore, in order
to make penalizations between different parameters comparable, it
is crucial to add regularizations to $\beta$ and $\delta$ separately.
Figure \ref{example2-1} shows the smoothed linear regression between
$\hat{\beta}$ and $\beta$ after applying the single-penalty regularization
(denoted as ``Single'') and the double-penalty regularizations (denoted
as ``Double'') to select variables and refitting the model using
selected variables, where double penalties make $\hat{\beta}$ more
accurate than the single penalties. 

\begin{figure}[h]
\centering \includegraphics[scale=0.15]{\string"example2\string".png}


\caption{\label{example2-1}The smoothed linear regression between $\hat{\beta}$
and $\beta$ with the 95\% confidence intervals as the shaded area
and $(\beta,\hat{\beta})$ as the points.}
\end{figure}


\section{Plots}\label{plots}

We present the extra figures for the first simulation study in this
section. Figure \ref{small} shows the results for the case $\|\beta_{0}\|_{1}\geq \|\delta_{0}\|_{1}$
when setting half of parameters in $\delta_{0}$ equal to zero: $\|\beta_{0}\|_{1}=c\|\delta_{0}\|_{1},c=1,3,5,7,9;$
and Figure \ref{sparsity} shows the results for varying the sparsity
level of $\delta_{0}$ when setting $\|\beta_{0}\|_{1}=\|\delta_{0}\|_{1}$
with the x axis as the ratio of the number of variables in $\delta_{0}$ equal
to zero. Each figure shows the MSE results and the percentage of Under-select
and Over-select. Figure \ref{small} shows the SPIE has a larger MSE
compared to the DPIE, which is consistent with the theoretical results.
On the other hand, changes in the sparsity of $\delta_{0}$ have
little effect on the results. This finding is also consistent with the theoretical
result, where we only need to restrain the magnitude of different
parameters to guarantee the consistency and oracle properties. 
\begin{figure}[h]
\centering\includegraphics[scale=0.35]{\string"small\string".png}
\caption{\label{small}Simulation results based on 100 Monte Carlo replications. The
left panel shows the MSE versus the magnitude ratio between $\delta$
and $\beta$. The right panel shows the percentages of runs that wrongly select
too many and too few parameters, respectively.}
\end{figure}

\begin{figure}[h]
\centering\includegraphics[scale=0.35]{\string"sparsity\string".png}
\caption{\label{sparsity}Simulation results based on 100 Monte Carlo replications.
The left panel shows the MSE versus the sparsity level in $\delta$.
The right panel shows the percentages of runs that wrongly select too many and
too few parameters, respectively.}
\end{figure}

\bibliography{cheng_477-supp}

\end{document}