%\documentclass{uai2022} % for initial submission
\documentclass[accepted]{uai2022} % after acceptance, for a revised
                                    % version; also before submission to
                                    % see how the non-anonymous paper
                                    % would look like
%% There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2022} % ptmx math instead of Computer
                                         % Modern (has noticable issues)
% \documentclass[mathfont=newtx]{uai2022} % newtx fonts (improves upon
                                          % ptmx; less tested, no support)
% NOTE: Only keep *one* line above as appropriate, as it will be replaced
%       automatically for papers to be published. Do not make any other
%       change above this note for an accepted version.

%% Choose your variant of English; be consistent
\usepackage[american]{babel}
% \usepackage[british]{babel}

%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} % amsmath with fixes and additions
% \usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{tikz} % nice language for creating drawings and diagrams


%%additional packages%%
\usepackage{bm}
\usepackage{latexsym}
\usepackage{algorithm,algorithmic,lscape}
\usepackage{graphics}
\usepackage{url}
\usepackage{multicol}
\usepackage{multirow}
\usepackage{color}
\usepackage{lscape}
\usepackage{caption}
% Color switches for marking up text.  Each macro takes no argument: it
% issues \color{...}, so the color stays in effect until the end of the
% enclosing group, e.g. {\red revised text}.
\newcommand{\red}{\color{red}}
\newcommand{\blue}{\color{blue}}
\newcommand{\green}{\color{green}}

% Set the typeface to Times Roman
\usepackage{times}

\usepackage{amsmath}
\usepackage{amsfonts}
%\usepackage{tikz}
\usepackage{pgfplots}
%\pgfplotsset{compat=1.12}
%\usetikzlibrary{positioning}
\usepackage{booktabs}
\usepackage{lmodern}
\usepackage{empheq}


\usepackage{scalerel}
% Debugging aid (not invoked in the visible part of this file):
% \MnSymbolGlyphs{<suffix>} loads the MnSymbol and fonttable packages and, at
% \begin{document}, calls \fonttable on font MnSymbol<suffix>10 so that the
% slot number of a wanted glyph can be looked up.  Preamble-only, because it
% calls \usepackage.
\def\MnSymbolGlyphs#1{% IF ONE NEEDS TO LOCATE GLYPHS
  \usepackage{MnSymbol,fonttable}%
\AtBeginDocument{\fonttable{MnSymbol#110}}%
}

\allowdisplaybreaks

% \ImportFromMnSymbol{<suffix>}: makes the font MnSymbol<suffix> usable as a
% math symbol font without a \usepackage{MnSymbol} call.  It declares the
% font family U/MnSymbol<suffix>, gives it a medium (m/n) and a bold (b/n)
% shape whose size ranges are mapped onto the design sizes 5, 6, 7, 8, 9, 10
% and 12, and registers the medium shape as the symbol font MnSy<suffix>.
\def\ImportFromMnSymbol#1{%
\DeclareFontFamily{U} {MnSymbol#1}{}
\DeclareFontShape{U}{MnSymbol#1}{m}{n}{
<-6> MnSymbol#15
<6-7> MnSymbol#16
<7-8> MnSymbol#17
<8-9> MnSymbol#18
<9-10> MnSymbol#19
<10-12> MnSymbol#110
<12-> MnSymbol#112}{}
\DeclareFontShape{U}{MnSymbol#1}{b}{n}{
<-6> MnSymbol#1-Bold5
<6-7> MnSymbol#1-Bold6
<7-8> MnSymbol#1-Bold7
<8-9> MnSymbol#1-Bold8
<9-10> MnSymbol#1-Bold9
<10-12> MnSymbol#1-Bold10
<12-> MnSymbol#1-Bold12}{}
\DeclareSymbolFont{MnSy#1} {U} {MnSymbol#1}{m}{n}
}
% \DeclareMnSymbol{<cmd>}{<math class>}{<suffix>}{<slot>}: defines <cmd> as
% the glyph in slot <slot> of the symbol font MnSy<suffix>, with the given
% math class.
\newcommand\DeclareMnSymbol[4]{\DeclareMathSymbol{#1}{#2}{MnSy#3}{#4}}
% Import font MnSymbolA only and take its slot 225 as a relation symbol.
\ImportFromMnSymbol{A}
\DeclareMnSymbol{\ConIndepNat}{\mathrel}{A}{225}
% \ConIndep: \ConIndepNat passed through \scalerel* with reference object X,
% kept as a relation.  NOTE(review): judging by the name, slot 225 is
% presumably the conditional-independence sign -- confirm with \MnSymbolGlyphs{A}.
\def\ConIndep{\mathrel{\scalerel*{\ConIndepNat}{X}}}

% Theorem-like environments.  No shared-counter or numbered-within argument
% is given, so each environment keeps its own counter (Theorem 1, Lemma 1,
% Proposition 1, ... are numbered separately).
\newtheorem{theorem}{Theorem}
\newtheorem{condition}{Condition}
\newtheorem{lemma}{Lemma}
\newtheorem{corollary}{Corollary}
\newtheorem{definition}{Definition}
\newtheorem{proposition}{Proposition}

\usepackage{subcaption}


\title{Partially Adaptive Regularized Regression for Estimating Linear Causal Effects: Supplementary Material}

% The standard author block has changed for UAI 2021 to provide
% more space for long author lists and allow for complex affiliations
%
% All author information is authomatically removed by the class for the
% anonymous submission version of your paper, so you can already add your
% information below.

% Add authors in order of decreasing contribution
\author[1]{\href{mailto:Hisayoshi Nanmo <nanmohisayoshi@gmail.com>?Subject=Partially Adaptive Regularized Multiple Regression Analysis for Estimating Linear Causal Effects}{Hisayoshi Nanmo}{}} % Lead author
\author[2]{\href{mailto:Manabu Kuroki <kuroki-manabu-zm@ynu.ac.jp>?Subject=Partially Adaptive Regularized Multiple Regression Analysis for Estimating Linear Causal Effects}{Manabu Kuroki}{}}


% Add affiliations after the authors
\affil[1]{%
Chugai Pharmaceutical Co., Ltd.\\
Nihonbashi Muromachi, Chuo-ku, Tokyo, Japan 
}
\affil[2]{%
Yokohama National University      \\
Tokiwadai, Hodogaya-ku, Yokohama, Japan 
}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\begin{document}
\onecolumn
\maketitle
% Supplementary-material numbering: sections are labelled A, B, ... and
% equation numbers are prefixed with the section label, e.g. (A.1).
% These lines only change how the numbers are printed; the equation counter
% itself is reset by hand with \setcounter{equation}{0} in a later section.
\renewcommand{\thesection}{\Alph{section}}
\renewcommand{\theequation}{\thesection.\arabic{equation}}


\section{The Proof of Theorem 3}
\subsection{Basic Theory}

In this section, we provide a brief review of the basic theory of optimization, which is used to prove Theorem 3.
Readers who are familiar with optimization theory can skip this section. For details, also refer to, for example, \cite{Beck17}.

Throughout the Supplementary Material, let $f(\mbox{\boldmath $x$})$ be a proper, closed, and convex function.
Here, $f(\mbox{\boldmath $x$})$ is called proper when the domain of $f(\mbox{\boldmath $x$})$, $\mbox{dom}(f)=\{\mbox{\boldmath $x$}|f(\mbox{\boldmath $x$})<\infty\}$, is not empty, and $f(\mbox{\boldmath $x$})$ takes values on the extended real number line---i.e., $(-\infty,\infty]$.
In addition, $f(\mbox{\boldmath $x$})$ is called closed when ${\displaystyle \liminf _{\bm{x}\to \bm{x}_{0}}f(\mbox{\boldmath $x$})\geq f(\mbox{\boldmath $x$}_{0})}$ holds for any $\mbox{\boldmath $x$}_{0}$, where ``$\liminf$'' is the {limit inferior} (of ${\displaystyle f}$ at point ${\displaystyle \mbox{\boldmath $x$}_{0}}$).
Furthermore, $f(\mbox{\boldmath $x$})$ is called $\sigma$-strongly convex for a given $\sigma >0$ if $\mbox{dom}(f)$ is convex, and the following inequality holds for any $\mbox{\boldmath $x$},\mbox{\boldmath $y$}\in \mbox{dom}(f)$ and $\lambda \in [0,1]$:
\begin{equation}
f(\lambda \mbox{\boldmath $x$} + (1-\lambda)\mbox{\boldmath $y$}) \leq \lambda f(\mbox{\boldmath $x$}) + (1- \lambda)f(\mbox{\boldmath $y$})-\frac{\displaystyle \sigma}{\displaystyle 2}
\lambda (1- \lambda)||\mbox{\boldmath $x$}-\mbox{\boldmath $y$}||^2_2.\label{a1}
\end{equation}
In particular, $f(\mbox{\boldmath $x$})$ is called convex if equation (\ref{a1}) holds for $\sigma=0$.
In addition, a set $C$ is called convex if it holds that
$\lambda \mbox{\boldmath $x$}+(1-\lambda)\mbox{\boldmath $y$}\in C$ for any $\mbox{\boldmath $x$}$, $\mbox{\boldmath $y$}\in C$ and $\lambda \in [0, 1]$.
Throughout the Supplementary Material, the function $g(\mbox{\boldmath $x$})$ is also a proper, closed, and convex function.

The function $f(\mbox{\boldmath $x$})$ is also assumed to satisfy the following conditions: (i) ${\rm dom}(f)$ is convex, (ii) ${\rm dom}(g)\subset{\rm int}({\rm dom}(f))$ and (iii) $f(\mbox{\boldmath $x$})$ is $l_{f}$-smooth over ${\rm int}({\rm dom}(f))$.
Here, $f(\mbox{\boldmath $x$})$ is called an $l_{f}$-smooth function when $(\partial/\partial \mbox{\boldmath $x$})f$ is a Lipschitz continuous function with Lipschitz constant $l_{f}$.
For a set $A$, ${\rm int}(A)$ is a set of all {interior points} of $A$.
The function $h(\mbox{\boldmath $x$})$ is called Lipschitz continuous if there exists a positive real constant $K$ such that $
{\displaystyle |h(\mbox{\boldmath $x$}_{1})-h(\mbox{\boldmath $x$}_{2})|\leq K||\mbox{\boldmath $x$}_{1}-\mbox{\boldmath $x$}_{2}||_2}$ for any $\mbox{\boldmath $x$}_1, \mbox{\boldmath $x$}_2\in \mbox{dom}(h)$ and such a $K$ is called a Lipschitz constant.
Finally, a minimizer of $f(\mbox{\boldmath $x$})$ is a point $\mbox{\boldmath $a$}$ for which
$f(\mbox{\boldmath $x$})\geq f(\mbox{\boldmath $a$})$ holds for any $\mbox{\boldmath $x$}$ in a neighborhood of $\mbox{\boldmath $a$}$.

For a $p$-dimensional vector $\mbox{\boldmath $x$}\in \mathbb{R}^{p}$, consider a problem that finds the minimizer of
\begin{equation}
{\displaystyle F(\mbox{\boldmath $x$})=f(\mbox{\boldmath $x$})+g(\mbox{\boldmath $x$})},
\end{equation}
where we assume that the optimal set of ${\displaystyle \mathop{\mbox{argmin}}_{x}\left( f(\mbox{\boldmath $x$})+g(\mbox{\boldmath $x$})\right)}$ is nonempty in this Supplementary Material.

Under the preparation above, we introduce the following propositions.

\begin{proposition}{\bf (Convergence rate of the proximal gradient method \citep{Beck17})}
For the sequence $\{\mbox{\boldmath $x$}[k]\}_{k\geq 0}$ defined by
\begin{eqnarray}
\bm{x}[k+1]&=&\underset{x\in \mathbb{R}^{p}}{{\rm argmin}}\left(f(\bm{x}[k])+\left\langle\frac{\partial}{\partial \bm{x}} f(\bm{x})_{x=x[k]},\bm{x}-\bm{x}[k]\right\rangle+g(\bm{x})+\frac{1}{2t}||\bm{x}-\bm{x}[k] ||^{2}_{2} \right)\label{a3}\\
&=&\underset{x\in \mathbb{R}^{p}}{{\rm argmin}}\left(tg(\bm{x})+\frac{1}{2}||\bm{x}-(\bm{x}[k]-t\frac{\partial}{\partial \bm{x}} f(\bm{x})_{x=x[k]} ) ||^{2}_{2} \right) 
\end{eqnarray}
%Editor: Please ensure that the intended meaning has been maintained in the following edit.
for $t>0$ and an initial vector $\bm{x}[0]\in \mbox{dom}(F)$, and for a minimizer $\bm{x}^{*}$ of $F(\mbox{\boldmath $x$})$, we have
\begin{equation}
F(\bm{x}[k])-F(\bm{x}^{*})\leq \frac{\displaystyle l_f}{2k}||\bm{x}[0]-\bm{x}^{*}||^{2}_2
\end{equation}
for $k \geq 1$ and {$t\leq 1/l_f$}.
\end{proposition}

\begin{proposition}{\bf \citep{Beck17}}
Let $D$ be a Euclidean space, and let $f: D\rightarrow (-\infty, \infty]$ be a proper, closed, and $\sigma$-strongly convex function $(\sigma > 0)$.
Then,

(a) $f(\mbox{\boldmath $x$})$ has a unique minimizer $\mbox{\boldmath $x$}^*$ in dom$(f)$,

(b) for all $\mbox{\boldmath $x$}\in \mbox{dom}(f)$, $$f(\mbox{\boldmath $x$})-f(\mbox{\boldmath $x$}^*) \geq \frac{\sigma}{2}
||\mbox{\boldmath $x$}-\mbox{\boldmath $x$}^*||^2_2.$$
\end{proposition}
\begin{proposition}{\bf \citep{Beck17}}
Let $D$ be a Euclidean space. Then, $f: D\rightarrow (-\infty,\infty]$ is a $\sigma$-strongly convex function if and only if the function ${\displaystyle f(\mbox{\boldmath $x$})-\frac{\sigma}{2}||\mbox{\boldmath $x$}||^2_2}$ is convex. \end{proposition}


\subsection{Preparation}
\quad~For Section 3, let $\mbox{\boldmath $w$}_i$ be an $n$-dimensional observation vector of the $i$-th explanatory variable $W_i$ of $\mbox{\boldmath $W$}$ $(W_i\in \mbox{\boldmath $W$}: i=1,2,...,q)$.
In addition, based on the weight vector $\mbox{\boldmath $\gamma$}$ of equations (6) and (7), we define the $n\times q$ matrix $\mbox{\boldmath $w$}^{\sharp}$ and $B^{\sharp}_{yw{\cdot}xz}$ as
\begin{equation}
\mbox{\boldmath $w$}^{\sharp}=\left(\frac{\mbox{\boldmath $w$}_{1}}{\gamma_{1}};\frac{\mbox{\boldmath $w$}_{2}}{\gamma_{2}};\ldots;\frac{\mbox{\boldmath $w$}_{q}}{\gamma_{q}}\right)
\end{equation}
and $\bm{\gamma}\odot B_{yw{\cdot}xz}$, respectively.
Then, for $p=1$, equation (5) is reformulated as
\begin{equation}
L^{\sharp}_1(\beta_{yx{\cdot}zw},B_{yz{\cdot}xw},B^{\sharp}_{yw{\cdot}xz})=\frac{1}{2}||\mbox{\boldmath $y$}-\mbox{\boldmath $x$}\beta_{yx{\cdot}zw}-\mbox{\boldmath $z$}B_{yz{\cdot}xw}-\mbox{\boldmath $w$}^{\sharp} B^{\sharp}_{yw{\cdot}xz}||^2_2+\lambda_{1}||B^{\sharp}_{yw{\cdot}xz}||^1_{1}.\label{a7}
\end{equation}
Then, to solve our problem, we adopt the idea of the block-coordinate-relaxation method \citep{Sardy00}. Intuitively, in the block-coordinate-relaxation method, a whole set of variables is divided into several blocks, and the original optimization problem is iteratively solved as a sequential optimization problem regarding some blocks under the assumption that the remaining blocks are constant.
Based on this idea, first, we divide equation (\ref{a7}) into the following two kinds of functions:
\begin{equation}
f^{\sharp}(\beta_{yx{\cdot}zw},B_{yz{\cdot}xw},B^{\sharp}_{yw{\cdot}xz})=\frac{1}{2}||\mbox{\boldmath $y$}-\mbox{\boldmath $x$}\beta_{yx{\cdot}zw}-\mbox{\boldmath $z$}B_{yz{\cdot}xw}-\mbox{\boldmath $w$}^{\sharp} B^{\sharp}_{yw{\cdot}xz}||^2_2
\end{equation}
\begin{equation}
g(B^{\sharp}_{yw{\cdot}xz})=\lambda_{1}||B^{\sharp}_{yw{\cdot}xz}||^1_{1}.
\end{equation}
Then, {when we divide a whole set of variables} into $\{X\}\cup\mbox{\boldmath $Z$}$ and $\mbox{\boldmath $W$}$ according to the block-coordinate-relaxation method, the minimization of equation (\ref{a7}) consists of the following two substep minimization procedures in the $(k+1)$-th step $(k\geq 0)$:
\begin{equation}
\left.
\begin{array}{l}
\displaystyle B^{\sharp}_{yw{\cdot}xz}[k+1]=\mathop{\mbox{argmin}}_{B}\left( L^{\sharp}_1(\beta_{yx{\cdot}zw}[k],B_{yz{\cdot}xw}[k],B)\right)\\
\displaystyle \left( \beta_{yx{\cdot}zw}[k+1], B_{yz{\cdot}xw}[k+1]^{T}\right)^{T}=\mathop{\mbox{argmin}}_{b,B}\left( L^{\sharp}_1(\bm{b},B,B^{\sharp}_{yw{\cdot}xz}[k+1])\right)
\end{array}\right\},\label{a10}
\end{equation}
where
\begin{displaymath}
\beta_{yx{\cdot}zw}[0]=\hat{\beta}_{yx{\cdot}z},\,\,\,
B_{yz{\cdot}xw}[0]=\hat{B}_{yz{\cdot}x}
\end{displaymath}
\begin{displaymath}
B^{\sharp}_{yw{\cdot}xz}[0]=\mathop{\mbox{argmin}}_{B}\left(
\frac{1}{2}|| \mbox{\boldmath $y$}-
\mbox{\boldmath $x$}\hat{\beta}_{yx{\cdot}z}-\mbox{\boldmath $z$}\hat{B}_{yz{\cdot}x}-
\mbox{\boldmath $w$}^{\sharp} B||^{2}_{2}+\lambda_1||B||^1_{1}\right).
\end{displaymath}
{First, from equation (\ref{a3}), $B^{\sharp}_{yw{\cdot}xz}[k+1]$ can be expressed as follows:
\begin{eqnarray}
\lefteqn{B^{\sharp}_{yw{\cdot}xz}[k+1]=\mathop{\mbox{argmin}}_{B}\left(f^{\sharp}\left(\beta_{yx{\cdot}zw}, B_{yz{\cdot}xw}, B^{\sharp}_{yw{\cdot}xz}[k]\right)\right.}\nonumber\\
&&\left.+\left\langle \frac{\partial}{ \partial  B}f^{\sharp}(\beta_{yx{\cdot}zw}, B_{yz{\cdot}xw}, B)_{B=B^{\sharp}_{yw{\cdot}xz}[k]}, B-B^{\sharp}_{yw{\cdot}xz}[k]\right\rangle+g(B)+\frac{l_{f}}{2}||B-B^{\sharp}_{yw{\cdot}xz}[k]||_{2}^{2} \right),\label{a11}
\end{eqnarray}
} where $l_{f}$ is a Lipschitz constant with respect to the partial derivative function
\begin{eqnarray}
\frac{\partial}{ \partial  B}f^{\sharp}(\beta_{yx{\cdot}zw}, B_{yz{\cdot}xw}, B).
\end{eqnarray}
Here, {through the partial derivative of convex function (\ref{a11}) with respect to $B$}, equation (\ref{a11}) can also be rewritten as
\begin{equation}
\hspace*{-5mm}B^{\sharp}_{yw{\cdot}xz}[k+1]={\rm prox}_{\frac{1}{l_{f}}g}\left(B^{\sharp}_{yw{\cdot}xz}[k]-\frac{1}{l_{f}}\frac{\partial}{ \partial  B}f^{\sharp}(\beta_{yx{\cdot}zw}, B_{yz{\cdot}xw}, B)_{B=B^{\sharp}_{yw{\cdot}xz}[k]}\right)\left(=T_{l_{f}}\left(B^{\sharp}_{yw{\cdot}xz}[k]\right)\right),\label{a13}
\end{equation}
where
\begin{eqnarray}
\mbox{prox}_{a}(b)=
{\begin{cases}b-a&:\ b\ge a\\
0&:\ -a< b<a\\
b+a&:\ b\le -a
\end{cases}}
\end{eqnarray}
and we define
\begin{eqnarray}
T_{l_{f}}(B^{\prime})={\rm prox}_{\frac{1}{l_{f}}g}\left( B^{\prime}-\frac{1}{l_{f}}\frac{\partial}{ \partial  B}f^{\sharp}(\beta_{yx{\cdot}zw}, B_{yz{\cdot}xw}, B)_{B=B^{\prime}}\right)
\end{eqnarray}
for any fixed $\beta_{yx{\cdot}zw}$ and $B_{yz{\cdot}xw}$.
Then, we obtain equation (25) by replacing
$\beta_{yx{\cdot}zw}$ and $B_{yz{\cdot}xw}$ with $\beta_{yx{\cdot}zw}[k]$ and $B_{yz{\cdot}xw}[k]$, respectively.

Second, since the sum of squares matrix of $X$ and $\mbox{\boldmath $Z$}$ is invertible in the paper, clearly, the solution of $( \beta_{yx{\cdot}zw}[k+1],$
$ B_{yz{\cdot}xw}^{T}[k+1])^{T}$ given $B^{\sharp}_{yw{\cdot}xz}[k+1]$ can be derived as the least squares estimators of $(b,B)^T$:
\begin{eqnarray}
\left(
\begin{array}{c}
\beta_{yx{\cdot}zw}[k+1]\\
B_{yz{\cdot}xw}[k+1]
\end{array}
\right)=\left(
\begin{array}{cc}
s_{xx}&S_{xz}\\
S^T_{xz}&S_{zz}
\end{array}\right)^{-1}
\left(
\begin{array}{c}
\mbox{\boldmath $x$}^T\\
\mbox{\boldmath $z$}^T
\end{array}\right)
\left(\mbox{\boldmath $y$}-\mbox{\boldmath $w$}B_{yw{\cdot}xz}[k+1]\right).\label{a16}
\end{eqnarray}
Thus, letting $\{(\beta_{yx{\cdot}zw}[l], B_{yz{\cdot}xw}^{T}[l])^T\}_{l\ge 0}$ and $\{B_{yw{\cdot}xz}^{\sharp}[k]\}_{k\ge 0}$ be the sequence generated by procedure (\ref{a10}) for solving the minimization problem with respect to loss function (\ref{a7}), $L^{\sharp}_1(\beta_{yx{\cdot}zw}[l],B_{yz{\cdot}xw}[l],B^{\sharp}_{yw{\cdot}xz}[k])$ is a monotonically decreasing function of $l$ and $k$.

\subsection{Proof}
Under the preparation in Section A.2, we prove the following lemmas to prove Theorem 3.
\begin{lemma}
For a given $\beta_{yx{\cdot}zw}$ and $B_{yz{\cdot}xw}$ in equation (\ref{a7}), we have
\begin{eqnarray}
\lefteqn{L_{1}^{\sharp}\left(\beta_{yx{\cdot}zw}, B_{yz{\cdot}xw}, B_{1}\right)-L_{1}^{\sharp}\left(\beta_{yx{\cdot}zw}, B_{yz{\cdot}xw}, T_{l_{f}}(B_{2})\right)}\nonumber \\
&\ge&\frac{ l_{f}}{2}||B_{1}-T_{l_{f}}(B_{2})||_{2}^{2}-\frac{l_{f}}{2}||B_{1}-B_{2}||_{2}^{2}+d_{f}(B_{1},B_{2})
\end{eqnarray}
for any $B_{1}$ and $B_{2}$, where $d_{f}(B_{1},B_{2})$ is the Bregman distance with $f^{\sharp}(\beta_{yx{\cdot}zw}, B_{yz{\cdot}xw},B )$ between $B_1$ and $B_2$; i.e.,
\begin{eqnarray}
 \lefteqn{d_{f}(B_{1},B_{2})=f^{\sharp}\left(\beta_{yx{\cdot}zw}, B_{yz{\cdot}xw}, B_{1}\right)-f^{\sharp}\left(\beta_{yx{\cdot}zw}, B_{yz{\cdot}xw}, B_{2}\right)}\nonumber \\
 &&-\left\langle \frac{\partial}{ \partial  B}f^{\sharp}(\beta_{yx{\cdot}zw}, B_{yz{\cdot}xw}, B)_{B=B_{2}}, B_{1}-B_{2}\right\rangle,
\end{eqnarray}
and $\left\langle \mbox{\boldmath $a$}, \mbox{\boldmath $b$}\right\rangle$ is an inner product between vectors $\mbox{\boldmath $a$}$ and $\mbox{\boldmath $b$}$,
i.e., $\left\langle \mbox{\boldmath $a$}, \mbox{\boldmath $b$}\right\rangle=\mbox{\boldmath $a$}^T\mbox{\boldmath $b$}$.
\end{lemma}


{\noindent}{\bf Proof of Lemma 1:}
Letting
\begin{eqnarray}
\psi(\bm{b})=f^{\sharp}\left(\beta_{yx{\cdot}zw}, B_{yz{\cdot}xw}, B_{2}\right)+\left\langle \frac{\partial}{ \partial  B}f^{\sharp}(\beta_{yx{\cdot}zw}, B_{yz{\cdot}xw}, B)_{B=B_{2}}, \bm{b}-B_{2}\right\rangle+g(\bm{b})+\frac{l_{f}}{2}||\bm{b}-B_{2}||_{2}^{2},\label{a19}
\end{eqnarray}
$\psi$ is {an $l_{f}$-strongly convex function} from Proposition 3. Referring to equations (\ref{a11}) and (\ref{a13}), we have
\begin{eqnarray}
\lefteqn{{\rm prox}_{\frac{1}{l_{f}}g}\left(B_{2}-\frac{1}{l_{f}}\frac{\partial}{ \partial  B}f^{\sharp}(\beta_{yx{\cdot}zw}, B_{yz{\cdot}xw}, B)_{B=B_{2}}\right)}\\
&&=\underset{b}{\mbox{argmin}}\left(f^{\sharp}\left(\beta_{yx{\cdot}zw}, B_{yz{\cdot}xw}, B_{2}\right)+\left\langle \frac{\partial}{ \partial  B}f^{\sharp}(\beta_{yx{\cdot}zw}, B_{yz{\cdot}xw}, B)_{B=B_{2}}, \bm{b}-B_{2}\right\rangle+g(\bm{b})+\frac{l_{f}}{2}||\bm{b}-B_{2}||_{2}^{2}\right)\nonumber
\end{eqnarray}
and $T_{l_{f}}(B_{2})=\underset{b}{\mbox{argmin}}\psi(\bm{b})$.
Thus, from Proposition 2, we have
\begin{eqnarray}
\psi(B_{1})-\psi(T_{l_{f}}(B_{2}))\ge\frac{l_{f}}{2}||B_{1}-T_{l_{f}}(B_{2})||^2_2.\label{a21}
\end{eqnarray}
Here, letting $\lambda_{\mbox{max}} (A)$ be the maximum eigenvalue of a $p\times p$ symmetric matrix $A$, when we define
$${\displaystyle \|A\|_{op}=\sup _{x\neq 0}\frac{\displaystyle\mbox{\boldmath $x$}^T A\mbox{\boldmath $x$}}{\displaystyle\mbox{\boldmath $x$}^T\mbox{\boldmath $x$}}=\lambda_{\mbox{max}} (A),}$$
for all $B^{\prime}$ and $B^{\prime\prime}$, we have
\begin{eqnarray}
&&||\frac{\displaystyle \partial}{\displaystyle \partial B} f^{\sharp}(\beta_{yx{\cdot}zw},B_{yz{\cdot}xw}, B)_{B=B^{\prime}}-\frac{\displaystyle \partial}{\displaystyle \partial B} f^{\sharp}(\beta_{yx{\cdot}zw},B_{yz{\cdot}xw}, B)_{B=B^{\prime\prime}}||_{2}=||(\mbox{\boldmath $w$}^{\sharp})^{T}(\mbox{\boldmath $w$}^{\sharp} B^{\prime\prime}-\mbox{\boldmath $w$}^{\sharp} B^{\prime})||_{2}\nonumber\\
&&\hspace{1cm}\le ||(\mbox{\boldmath $w$}^{\sharp})^{T}{\mbox{\boldmath $w$}^{\sharp}||_{op}}||B^{\prime\prime}-B^{\prime}||_{2}\le\lambda_{{\rm max}}(S_{ww}^{\sharp})||B^{\prime\prime}-B^{\prime}||_{2}, \label{a22}
\end{eqnarray}
so that $f^{\sharp}(\beta_{yx{\cdot}zw},B_{yz{\cdot}xw}, B)$ is a $\lambda_{{\rm max}}(S_{ww}^{\sharp})$($=l_{f}$)-smooth function with respect to $B$.
In addition, from the 
%Editor: Please ensure that the intended meaning has been maintained in the following edit.
Cauchy--Schwarz inequality 
and equation (\ref{a22}),
\begin{eqnarray}
\lefteqn{\left\langle\frac{\displaystyle \partial}{\displaystyle \partial B} f^{\sharp}(\beta_{yx{\cdot}zw},B_{yz{\cdot}xw}, B)_{B=B^{\prime}}-\frac{\displaystyle \partial}{\displaystyle \partial B} f^{\sharp}(\beta_{yx{\cdot}zw},B_{yz{\cdot}xw}, B)_{B=B^{\prime\prime}},B^{\prime}-B^{\prime\prime} \right\rangle}\nonumber\\
&&\le||\frac{\displaystyle \partial}{\displaystyle \partial B} f^{\sharp}(\beta_{yx{\cdot}zw},B_{yz{\cdot}xw}, B)_{B=B^{\prime}}-\frac{\displaystyle \partial}{\displaystyle \partial B} f^{\sharp}(\beta_{yx{\cdot}zw},B_{yz{\cdot}xw}, B)_{B=B^{\prime\prime}}||_{2} ||B^{\prime}-B^{\prime\prime}||_{2}\nonumber\\
&&\le l_{f} ||B^{\prime}-B^{\prime\prime}||_{2}^{2}.
\end{eqnarray}
Letting
\begin{eqnarray}
h(\beta_{yx{\cdot}zw},B_{yz{\cdot}xw},B)=\frac{l_{f}}{2}||B||_{2}^{2}-f^{\sharp}(\beta_{yx{\cdot}zw},B_{yz{\cdot}xw}, B),\label{a24}
\end{eqnarray}
from equation (\ref{a24}), we have
\begin{eqnarray}
\lefteqn{\left\langle\frac{\displaystyle \partial}{\displaystyle \partial B} h(\beta_{yx{\cdot}zw},B_{yz{\cdot}xw}, B)_{B=B^{\prime}}-\frac{\displaystyle \partial}{\displaystyle \partial B} h(\beta_{yx{\cdot}zw},B_{yz{\cdot}xw}, B)_{B=B^{\prime\prime}},B^{\prime}-B^{\prime\prime} \right\rangle}\nonumber\\
&&=\left\langle l_{f}(B^{\prime}-B^{\prime\prime})-\left(\frac{\displaystyle \partial}{\displaystyle \partial B} f^{\sharp}(\beta_{yx{\cdot}zw},B_{yz{\cdot}xw}, B)_{B=B^{\prime}}-\frac{\displaystyle \partial}{\displaystyle \partial B} f^{\sharp}(\beta_{yx{\cdot}zw},B_{yz{\cdot}xw}, B)_{B=B^{\prime\prime}}\right),B^{\prime}-B^{\prime\prime} \right\rangle\nonumber\\
&&\ge 0\label{a25}
\end{eqnarray}
for any $B'$ and $B''$.
From equation (\ref{a25}), since $h(\beta_{yx{\cdot}zw},B_{yz{\cdot}xw},B)$ is a convex function with respect to $B$ and satisfies the first-order condition, we obtain
\begin{eqnarray}
&&{h(\beta_{yx{\cdot}zw},B_{yz{\cdot}xw},T_{l_{f}}(B_{2}))\ge h(\beta_{yx{\cdot}zw},B_{yz{\cdot}xw},B_{2})+\left\langle\frac{\displaystyle \partial}{\displaystyle \partial B} h(\beta_{yx{\cdot}zw},B_{yz{\cdot}xw}, B)_{B=B_{2}},T_{l_{f}}(B_{2})-B_{2} \right\rangle}\nonumber\\
&\Leftrightarrow&\frac{l_{f}}{2}||T_{l_{f}}(B_{2})||_{2}^{2}-f^{\sharp}(\beta_{yx{\cdot}zw},B_{yz{\cdot}xw}, T_{l_{f}}(B_{2}))\ge\frac{l_{f}}{2}||B_{2}||_{2}^{2}-f^{\sharp}(\beta_{yx{\cdot}zw},B_{yz{\cdot}xw}, B_{2})\nonumber\\
&&+\left\langle l_{f}B_{2}-\frac{\displaystyle \partial}{\displaystyle \partial B} f^{\sharp}(\beta_{yx{\cdot}zw},B_{yz{\cdot}xw}, B)_{B=B_{2}},T_{l_{f}}(B_{2})-B_{2} \right\rangle\nonumber\\
&\Leftrightarrow&f^{\sharp}\left(\beta_{yx{\cdot}zw}, B_{yz{\cdot}xw},T_{l_{f}}(B_{2})\right)\le f^{\sharp}\left(\beta_{yx{\cdot}zw}, B_{yz{\cdot}xw},B_{2}\right)\nonumber\\
&&+\left\langle \frac{\partial}{ \partial  B}f^{\sharp}(\beta_{yx{\cdot}zw}, B_{yz{\cdot}xw}, B)_{B=B_{2}}, T_{l_{f}}(B_{2})-B_{2}\right\rangle+\frac{l_{f}}{2}||T_{l_{f}}(B_{2})-B_{2}||_{2}^{2}\nonumber\\
&\Leftrightarrow&L_{1}^{\sharp}(\beta_{yx{\cdot}zw}, B_{yz{\cdot}xw},T_{l_{f}}(B_{2}))\le \psi(T_{l_{f}}(B_{2})).\label{a26}
\end{eqnarray}
From equation (\ref{a26}) together with equation (\ref{a21}), we derive
\begin{eqnarray}
\psi(B_{1})-L_{1}^{\sharp}(\beta_{yx{\cdot}zw}, B_{yz{\cdot}xw},T_{l_{f}}(B_{2}))\ge \frac{l_{f}}{2}||B_{1}-T_{l_{f}}(B_{2})||_{2}^{2},\label{a27}
\end{eqnarray}
for any fixed $\beta_{yx{\cdot}zw}$ and $B_{yz{\cdot}xw}$ and any $B_{1}$.

From equations (\ref{a19}) and (\ref{a27}), we obtain
\begin{eqnarray}
\lefteqn{f^{\sharp}\left(\beta_{yx{\cdot}zw}, B_{yz{\cdot}xw}, B_{2}\right)+\left\langle \frac{\partial}{ \partial  B}f^{\sharp}(\beta_{yx{\cdot}zw}, B_{yz{\cdot}xw}, B)_{B=B_{2}}, B_{1}-B_{2}\right\rangle+g(B_{1})}\nonumber\\
&&+\frac{l_{f}}{2}||B_{1}-B_{2}||_{2}^{2}-L_{1}^{\sharp}(\beta_{yx{\cdot}zw}, B_{yz{\cdot}xw},T_{l_{f}}(B_{2}))\ge \frac{l_{f}}{2}||B_{1}-T_{l_{f}}(B_{2})||_{2}^{2},
\end{eqnarray}
and adding $f^{\sharp}\left(\beta_{yx{\cdot}zw}, B_{yz{\cdot}xw}, B_{1}\right)$ 
%Editor: Please ensure that the intended meaning has been maintained in the following edit.
to both sides of the above inequality and rearranging,
we obtain
\begin{eqnarray}
\lefteqn{L_{1}^{\sharp}\left(\beta_{yx{\cdot}zw}, B_{yz{\cdot}xw}, B_{1}\right)-L_{1}^{\sharp}\left(\beta_{yx{\cdot}zw}, B_{yz{\cdot}xw}, T_{l_{f}}(B_{2})\right)}\nonumber \\
&\ge&\frac{l_{f}}{2}||B_{1}-T_{l_{f}}(B_{2})||_{2}^{2}-\frac{l_{f}}{2}||B_{1}-B_{2}||_{2}^{2}+f^{\sharp}\left(\beta_{yx{\cdot}zw}, B_{yz{\cdot}xw}, B_{1}\right)-f^{\sharp}\left(\beta_{yx{\cdot}zw}, B_{yz{\cdot}xw}, B_{2}\right)\nonumber\\
&&-\left\langle \frac{\partial}{ \partial  B}f^{\sharp}(\beta_{yx{\cdot}zw}, B_{yz{\cdot}xw}, B)_{B=B_{2}}, B_{1}-B_{2}\right\rangle
\end{eqnarray}
which completes the proof.
\hspace*{\fill}$\Box$
\begin{lemma}
Let $\{B_{yw{\cdot}xz}^{\sharp}[k]\}_{k\ge 0}$ be the sequence generated by the sequential minimization of equation (\ref{a10}) given $\beta_{yx{\cdot}zw}$ and $B_{yz{\cdot}xw}$.
Then, for the optimal solution $B_{yw{\cdot}xz}^{\sharp*}$ and any $\epsilon>0$, there exists some natural number $K$ such that
\begin{eqnarray}
||B_{yw{\cdot}xz}^{\sharp*}-B_{yw{\cdot}xz}^{\sharp}[k+1]||_{2}^{2}<\epsilon
\end{eqnarray}
for all $k\ge K$.
\end{lemma}
{\noindent}{\bf Proof of Lemma 2: }Letting $B_{1}=B_{yw{\cdot}xz}^{\sharp*}$ and $B_{2}=B_{yw{\cdot}xz}^{\sharp}[k]$ in Lemma 1, from $B_{yw{\cdot}xz}^{\sharp}[k+1]=T_{l_{f}}(B_{yw{\cdot}xz}^{\sharp}[k])$ and
the nonnegativity of the Bregman distance regarding the convex functions, we have
\begin{eqnarray}
\lefteqn{\frac{2}{l_{f}}(L_{1}^{\sharp}\left(\beta_{yx{\cdot}zw}, B_{yz{\cdot}xw}, B_{yw{\cdot}xz}^{\sharp*}\right)-L_{1}^{\sharp}\left(\beta_{yx{\cdot}zw}, B_{yz{\cdot}xw}, B_{yw{\cdot}xz}^{\sharp}[k+1]\right))}\nonumber\\
&&\ge ||B_{yw{\cdot}xz}^{\sharp*}-B_{yw{\cdot}xz}^{\sharp}[k+1]||_{2}^{2}-||B_{yw{\cdot}xz}^{\sharp*}-B_{yw{\cdot}xz}^{\sharp}[k]||_{2}^{2}+\frac{2}{l_{f}}d_{f}(B_{yw{\cdot}xz}^{\sharp*},B_{yw{\cdot}xz}^{\sharp}[k])\nonumber\\
&&\ge ||B_{yw{\cdot}xz}^{\sharp*}-B_{yw{\cdot}xz}^{\sharp}[k+1]||_{2}^{2}-||B_{yw{\cdot}xz}^{\sharp*}-B_{yw{\cdot}xz}^{\sharp}[k]||_{2}^{2}.
\end{eqnarray}
Thus, noting that $\left\{L_{1}^{\sharp}\left(\beta_{yx{\cdot}zw}, B_{yz{\cdot}xw}, B_{yw{\cdot}xz}^{\sharp}[k+1]\right)\right\}_{k\geq 0}$ is a monotonically decreasing sequence with respect to $k$, we have
\begin{eqnarray}
0\le ||B_{yw{\cdot}xz}^{\sharp*}-B_{yw{\cdot}xz}^{\sharp}[k+1]||_{2}^{2}\le ||B_{yw{\cdot}xz}^{\sharp*}-B_{yw{\cdot}xz}^{\sharp}[k]||_{2}^{2},
\end{eqnarray}
i.e., $\{||B_{yw{\cdot}xz}^{\sharp*}-B_{yw{\cdot}xz}^{\sharp}[k]||_{2}^{2}\}_{k\geq 0}$ is also a monotonically decreasing sequence with respect to $k$.
Noting that the formulation of $B_{yw{\cdot}xz}^{\sharp}[k]$ is a restatement of equation (\ref{a3}) in Proposition 1 given $\beta_{yx{\cdot}zw}$ and $B_{yz{\cdot}xw}$, $B_{yw{\cdot}xz}^{\sharp}[k]$ converges to $B_{yw{\cdot}xz}^{\sharp*}$ for $k\rightarrow \infty$.
In other words, for any $\epsilon>0$, there exists some natural number $K$ such that
\begin{eqnarray}
||B_{yw{\cdot}xz}^{\sharp*}-B_{yw{\cdot}xz}^{\sharp}[k+1]||_{2}^{2}<\epsilon
\end{eqnarray}
for all $k\ge K$.\hspace*{\fill}$\Box$

\begin{lemma}
Let $\{B_{yw{\cdot}xz}^{\sharp}[k]\}_{k\ge 0}$ be the sequence generated by the sequential minimization of equation (\ref{a10}) given an optimal solution $\beta_{yx{\cdot}zw}^{*}$ and $B_{yz{\cdot}xw}^{*}$.
Then, for optimal solution $B_{yw{\cdot}xz}^{\sharp*}$,
\begin{eqnarray}
\lefteqn{\hspace*{-2cm}L_{1}^{\sharp}\left(\beta_{yx{\cdot}zw}^{*}, B_{yz{\cdot}xw}^{*}, B_{yw{\cdot}xz}^{\sharp}[k+1]\right)-L_{1}^{\sharp}\left(\beta_{yx{\cdot}zw}^{*}, B_{yz{\cdot}xw}^{*}, B_{yw{\cdot}xz}^{\sharp*}\right)}\nonumber \\
&\le& \frac{\lambda_{{\rm max}}(S^{\sharp}_{ww})}{2k}||B_{yw{\cdot}xz}^{\sharp*}-B_{yw{\cdot}xz}^{\sharp}[0]||_{2}^{2}
\end{eqnarray}
holds for all $k\ge 1$.
\end{lemma}
{\noindent}{\bf Proof of Lemma 3: }
For any $i\ge 0$, letting $B_{1}=B_{yw{\cdot}xz}^{\sharp*}$ and $B_{2}=B_{yw{\cdot}xz}^{\sharp}[i]$ in Lemma 1, since we have
\begin{eqnarray}
\lefteqn{\frac{2}{l_{f}}(L_{1}^{\sharp}\left(\beta_{yx{\cdot}zw}^{*}, B_{yz{\cdot}xw}^{*}, B_{yw{\cdot}xz}^{\sharp*}\right)-L_{1}^{\sharp}\left(\beta_{yx{\cdot}zw}^{*}, B_{yz{\cdot}xw}^{*}, B_{yw{\cdot}xz}^{\sharp}[i+1]\right))}\nonumber \\
&&\ge||B_{yw{\cdot}xz}^{\sharp*}-B_{yw{\cdot}xz}^{\sharp}[i+1]||_{2}^{2}-||B_{yw{\cdot}xz}^{\sharp*}-B_{yw{\cdot}xz}^{\sharp}[i]||_{2}^{2}+ \frac{2}{l_{f}}d_{f}(B_{yw{\cdot}xz}^{\sharp*},B_{yw{\cdot}xz}^{\sharp}[i])\nonumber \\
&&\ge||B_{yw{\cdot}xz}^{\sharp*}-B_{yw{\cdot}xz}^{\sharp}[i+1]||_{2}^{2}-||B_{yw{\cdot}xz}^{\sharp*}-B_{yw{\cdot}xz}^{\sharp}[i]||_{2}^{2},
\end{eqnarray}
we obtain
\begin{eqnarray}
\lefteqn{\frac{2}{l_{f}}\sum_{i=0}^{k-1}(L_{1}^{\sharp}\left(\beta_{yx{\cdot}zw}^{*}, B_{yz{\cdot}xw}^{*}, B_{yw{\cdot}xz}^{\sharp*}\right)-L_{1}^{\sharp}\left(\beta_{yx{\cdot}zw}^{*}, B_{yz{\cdot}xw}^{*}, B_{yw{\cdot}xz}^{\sharp}[i+1]\right))}\nonumber\\
&&\ge||B_{yw{\cdot}xz}^{\sharp*}-B_{yw{\cdot}xz}^{\sharp}[k]||_{2}^{2}-||B_{yw{\cdot}xz}^{\sharp*}-B_{yw{\cdot}xz}^{\sharp}[0]||_{2}^{2}\geq -||B_{yw{\cdot}xz}^{\sharp*}-B_{yw{\cdot}xz}^{\sharp}[0]||_{2}^{2}.
\end{eqnarray}
Here, noting that $\left\{
L_{1}^{\sharp}\left(\beta_{yx{\cdot}zw}^{*}, B_{yz{\cdot}xw}^{*}, B_{yw{\cdot}xz}^{\sharp}[i+1]\right)
\right\}_{i\geq 0}$ is a monotonically decreasing sequence with respect to $i\geq 0$,
we derive
\begin{eqnarray}
\lefteqn{k(L_{1}^{\sharp}\left(\beta_{yx{\cdot}zw}^{*}, B_{yz{\cdot}xw}^{*}, B_{yw{\cdot}xz}^{\sharp}[k]\right)-L_{1}^{\sharp}\left(\beta_{yx{\cdot}zw}^{*}, B_{yz{\cdot}xw}^{*}, B_{yw{\cdot}xz}^{\sharp*}\right))}\nonumber\\
&\le&\sum_{i=0}^{k-1}(L_{1}^{\sharp}\left(\beta_{yx{\cdot}zw}^{*}, B_{yz{\cdot}xw}^{*}, B_{yw{\cdot}xz}^{\sharp}[i+1]\right)-L_{1}^{\sharp}\left(\beta_{yx{\cdot}zw}^{*}, B_{yz{\cdot}xw}^{*}, B_{yw{\cdot}xz}^{\sharp*}\right))\nonumber \\
&\le&\frac{l_{f}}{2}||B_{yw{\cdot}xz}^{\sharp*}-B_{yw{\cdot}xz}^{\sharp}[0]||_{2}^{2}.
\end{eqnarray}
Finally, {noting that $l_{f}=\lambda_{{\rm max}}(S_{ww}^{\sharp})$}, we derive Lemma 3.
\hspace*{\fill}$\Box$

From Lemma 3, according to equation (\ref{a16}), we provide the optimal solution $\beta_{yx{\cdot}zw}^{*}$ and $B_{yz{\cdot}xw}^{*}$ given $B^{*}_{yw{\cdot}xz}$ as
\begin{eqnarray}
\left(
\begin{array}{c}
\beta^{*}_{yx{\cdot}zw}\\
B^{*}_{yz{\cdot}xw}
\end{array}
\right)=\left(
\begin{array}{cc}
s_{xx}&S_{xz}\\
S^T_{xz}&S_{zz}
\end{array}\right)^{-1}
\left(
\begin{array}{c}
\mbox{\boldmath $x$}^T\\
\mbox{\boldmath $z$}^T
\end{array}\right)
\left(\mbox{\boldmath $y$}-\mbox{\boldmath $w$}B^{*}_{yw{\cdot}xz}\right).\label{a38}
\end{eqnarray}
Then, the following lemma is obtained.
\begin{lemma}Let $\{\beta_{yx{\cdot}zw}[k]\}_{k\ge 0}$, $\{B_{yz{\cdot}xw}[k]\}_{k\ge 0}$ and $\{B_{yw{\cdot}xz}^{\sharp}[k]\}_{k\ge 0}$ be the sequences generated by i-PROGLES, and let $\mbox{\boldmath $u$}=(\mbox{\boldmath $x$},\mbox{\boldmath $z$})$. Then,
for the optimal solution $\beta_{yx{\cdot}zw}^{*}$, $B_{yz{\cdot}xw}^{*}$ and any $\epsilon>0$,
there exists some natural number $K$ such that
\begin{eqnarray}
\lefteqn{L_{1}^{\sharp}\left(\beta_{yx{\cdot}zw}[k+1], B_{yz{\cdot}xw}[k+1], B_{yw{\cdot}xz}^{\sharp}[k+1]\right)}\nonumber\\
&&-L_{1}^{\sharp}\left(\beta_{yx{\cdot}zw}^{*}, B_{yz{\cdot}xw}^{*}, B_{yw{\cdot}xz}^{\sharp}[k+1]\right)\le\frac{\lambda_{{\rm max}}(S_{uu})}{2}\lambda_{{\rm max}}(S_{wu}^{\sharp}S_{uu}^{-2}S_{uw}^{\sharp})\epsilon
\end{eqnarray}
for all $k\ge K$.
\end{lemma}
{\noindent}{\bf Proof of Lemma 4: }
For all $k\ge 0$, we obtain
{\begin{eqnarray}
\lefteqn{\hspace*{-1cm}L_{1}^{\sharp}\left(\beta_{yx{\cdot}zw}[k+1], B_{yz{\cdot}xw}[k+1], B_{yw{\cdot}xz}^{\sharp}[k+1]\right)-L_{1}^{\sharp}\left(\beta_{yx{\cdot}zw}^{*}, B_{yz{\cdot}xw}^{*}, B_{yw{\cdot}xz}^{\sharp}[k+1]\right)}\nonumber\\
&&\le\frac{1}{2}||\bm{u}((\beta_{yx{\cdot}zw}[k+1], B_{yz{\cdot}xw}[k+1])^{T}-(\beta_{yx{\cdot}zw}^{*}, B_{yz{\cdot}xw}^{*})^{T})||_{2}^{2} \nonumber \\
&&\le\frac{1}{2}||\bm{u}||_{op}^{2}||(\beta_{yx{\cdot}zw}[k+1], B_{yz{\cdot}xw}[k+1])^{T}-(\beta_{yx{\cdot}zw}^{*}, B_{yz{\cdot}xw}^{*})^{T}||_{2}^{2} \nonumber \\
&&\le\frac{\lambda_{{\rm max}}(S_{uu})}{2}||(\beta_{yx{\cdot}zw}[k+1], B_{yz{\cdot}xw}[k+1])^{T}-(\beta_{yx{\cdot}zw}^{*}, B_{yz{\cdot}xw}^{*})^{T}||_{2}^{2}.\label{a40}
\end{eqnarray}}
From equations (\ref{a16}) and (\ref{a38}), we obtain
\begin{eqnarray}
\lefteqn{||(\beta_{yx{\cdot}zw}[k+1], B_{yz{\cdot}xw}[k+1])^{T}-(\beta_{yx{\cdot}zw}^{*}, B_{yz{\cdot}xw}^{*})^{T}||_{2}^{2}}\nonumber\\
&&=||\left(  \mbox{\boldmath $u$}^{T} \mbox{\boldmath $u$}\right)^{-1} \mbox{\boldmath $u$}^{T}\mbox{\boldmath $w$}^{\sharp}\left( B_{yw{\cdot}xz}^{\sharp*}- B_{yw{\cdot}xz}^{\sharp}[k+1]\right)||_{2}^{2}\nonumber\\
&&\le||\left(  \mbox{\boldmath $u$}^{T} \mbox{\boldmath $u$}\right)^{-1} \mbox{\boldmath $u$}^{T}\mbox{\boldmath $w$}^{\sharp}||_{op}^{2}||\left( B_{yw{\cdot}xz}^{\sharp*}- B_{yw{\cdot}xz}^{\sharp}[k+1]\right)||_{2}^{2}.
\end{eqnarray}
Here, there exists a maximum eigenvalue of $S_{wu}^{\sharp}S_{uu}^{-2}S_{uw}^{\sharp}$ because the sum of squares matrix of $\mbox{\boldmath $x$}$ and $\mbox{\boldmath $z$}$ is invertible.
Thus, from Lemma 2, for any $\epsilon>0$, there exists some natural number $K$ such that
\begin{eqnarray}
||(\beta_{yx{\cdot}zw}[k+1], B_{yz{\cdot}xw}[k+1])^{T}-(\beta_{yx{\cdot}zw}^{*}, B_{yz{\cdot}xw}^{*})^{T}||_{2}^{2}\le \lambda_{{\rm max}}(S_{wu}^{\sharp}S_{uu}^{-2}S_{uw}^{\sharp})\epsilon\label{a42}
\end{eqnarray}
for all $k\ge K$.
From equations (\ref{a40}) and (\ref{a42}), we obtain Lemma 4.\hspace*{\fill}$\Box$

\setcounter{theorem}{2}
\begin{theorem}
Let $\{\beta_{yx{\cdot}zw}[k]\}_{k\ge 0}$, $\{B_{yz{\cdot}xw}[k]\}_{k\ge 0}$ and $\{B_{yw{\cdot}xz}[k]\}_{k\ge 0}$ be the sequences of $\beta_{yx{\cdot}zw}$, $B_{yz{\cdot}xw}$ and $B_{yw{\cdot}xz}$, respectively, generated by i-PROGLES, and let $\mbox{\boldmath $u$}=(\mbox{\boldmath $x$},\mbox{\boldmath $z$})$.
When $\beta_{yx{\cdot}zw}^{*}$, $B_{yz{\cdot}xw}^{*}$ and $B^{ *}_{yw{\cdot}xz}$ minimize equation (19) regarding $\beta_{yx{\cdot}zw}$, $B_{yz{\cdot}xw}$ and $B_{yw{\cdot}xz}$, respectively, for any $\epsilon>0$, there exists a natural number $K$ such that
\begin{eqnarray}
\lefteqn{L_{1}\left(\beta_{yx{\cdot}zw}^{*}, B_{yz{\cdot}xw}^{*}, B_{yw{\cdot}xz}^{*}\right)-L_{1}\left(\beta_{yx{\cdot}zw}[k+1], B_{yz{\cdot}xw}[k+1], B_{yw{\cdot}xz}[k+1]\right)}\nonumber\\
&&\le \frac{\lambda_{{\rm max}}(S^{\sharp}_{ww})}{2k}||B_{yw{\cdot}xz}^{\sharp}[0]-B_{yw{\cdot}xz}^{\sharp*}||_{2}^{2}+\frac{\lambda_{{\rm max}}(S_{uu})}{2}\lambda_{{\rm max}}(S_{wu}^{\sharp}S_{uu}^{-2}S_{uw}^{\sharp})\epsilon
\end{eqnarray}
holds for any $k\ge K$, where $B^{\sharp}_{yw{\cdot}xz}[k]=\mbox{\boldmath $\gamma$}\odot B_{yw{\cdot}xz}[k]$ and $B^{\sharp *}_{yw{\cdot}xz}=\mbox{\boldmath $\gamma$}\odot B_{yw{\cdot}xz}^{*}$.
\end{theorem}
{\noindent}{\bf Proof of Theorem 3: }
Noting that
\begin{eqnarray}
\lefteqn{L_{1}\left(\beta_{yx{\cdot}zw}^{*}, B_{yz{\cdot}xw}^{*}, B_{yw{\cdot}xz}^{*}\right)-L_{1}\left(\beta_{yx{\cdot}zw}[k+1], B_{yz{\cdot}xw}[k+1], B_{yw{\cdot}xz}[k+1]\right)}\nonumber\\
&&=L_{1}^{\sharp}\left(\beta_{yx{\cdot}zw}^{*}, B_{yz{\cdot}xw}^{*}, B_{yw{\cdot}xz}^{\sharp*}\right)-L_{1}^{\sharp}\left(\beta_{yx{\cdot}zw}[k+1], B_{yz{\cdot}xw}[k+1], B_{yw{\cdot}xz}^{\sharp}[k+1]\right)\nonumber\\
&&=L_{1}^{\sharp}\left(\beta_{yx{\cdot}zw}^{*}, B_{yz{\cdot}xw}^{*}, B_{yw{\cdot}xz}^{\sharp*}\right)-L_{1}^{\sharp}\left(\beta_{yx{\cdot}zw}^{*}, B_{yz{\cdot}xw}^{*}, B_{yw{\cdot}xz}^{\sharp}[k+1]\right)\nonumber\\
&&+L_{1}^{\sharp}\left(\beta_{yx{\cdot}zw}^{*}, B_{yz{\cdot}xw}^{*}, B_{yw{\cdot}xz}^{\sharp}[k+1]\right)-L_{1}^{\sharp}\left(\beta_{yx{\cdot}zw}[k+1], B_{yz{\cdot}xw}[k+1], B_{yw{\cdot}xz}^{\sharp}[k+1]\right),
\end{eqnarray}
from Lemmas 3 and 4, we have
\begin{eqnarray}
\lefteqn{L_{1}\left(\beta_{yx{\cdot}zw}^{*}, B_{yz{\cdot}xw}^{*}, B_{yw{\cdot}xz}^{*}\right)-L_{1}\left(\beta_{yx{\cdot}zw}[k+1], B_{yz{\cdot}xw}[k+1], B_{yw{\cdot}xz}[k+1]\right)}\nonumber\\
&&\le \frac{\lambda_{{\rm max}}(S^{\sharp}_{ww})}{2k}||B_{yw{\cdot}xz}^{\sharp}[0]-B_{yw{\cdot}xz}^{\sharp*}||_{2}^{2}+\frac{\lambda_{{\rm max}}(S_{uu})}{2}\lambda_{{\rm max}}(S_{wu}^{\sharp}S_{uu}^{-2}S_{uw}^{\sharp})\epsilon.
\end{eqnarray}
\hspace*{\fill}$\Box$.

\clearpage
\section{Numerical Experiments}

\quad~In this section, we conduct numerical experiments to compare the performance of LASSO, adaptive LASSO, elastic net, SCAD, MCP, OLS and PAL$_1$MA.

\subsection{Loss Functions}

\setcounter{equation}{0}

For an $r$-dimensional regression vector $B_{yz{\cdot}xw}$ and a $q$-dimensional regression vector $B_{yw{\cdot}xz}$, 
let $B_y=(\beta_{yx{\cdot}zw},B^T_{yz{\cdot}xw}, B^T_{yw{\cdot}xz})^T=(\beta_{1},\beta_{2},...,\beta_{q+r+1})^T$ and $\lambda,\lambda_1, \lambda_2\geq 0$.
First, the loss function of adaptive LASSO \citep{Zou06} is defined as
\begin{equation}
\frac{1}{2}||\mbox{\boldmath $y$}-\mbox{\boldmath $x$}\beta_{yx{\cdot}zw}-\mbox{\boldmath $z$}B_{yz{\cdot}xw}-\mbox{\boldmath $w$} B_{yw{\cdot}xz}||^2_2+\lambda||\bm{\gamma}\odot B_y||^1_{1},\label{b1}
\end{equation}
where $\mbox{\boldmath $\gamma$}=(\gamma_{1},\gamma_{2},...,\gamma_{q+r+1})^T$ is a weight vector such that
\begin{equation}
\mbox{\boldmath $\gamma$}=\left(\frac{1}{|\tilde{\beta}_{1}|^{\xi}},\frac{1}{|\tilde{\beta}_{2}|^{\xi}},\ldots,\frac{1}{|\tilde{\beta}_{q+r+1}|^{\xi}}\right)^{T}
\end{equation}
for the non-invertible sum of squares matrix of the explanatory variables with tuning parameter $\xi\geq 0$ and
\begin{equation}
\mbox{\boldmath $\gamma$}=\left(\frac{1}{|\hat{\beta}_{1}|^{\xi}},\frac{1}{|\hat{\beta}_{2}|^{\xi}},\ldots,\frac{1}{|\hat{\beta}_{q+r+1}|^{\xi}}\right)^{T}
\end{equation}
for the invertible sum of squares matrix of the explanatory variables with tuning parameter $\xi\geq 0$.
In particular, equation (\ref{b1}) is the loss function of the standard LASSO \citep{Tibshirani96} when $\xi=0$ and the loss function of OLS regression when $\lambda=0$.

Second, for $0\leq \phi \leq 1$, the loss function of the elastic net \citep{Zou05} is given by \begin{eqnarray}
\frac{1}{2}||\mbox{\boldmath $y$}-\mbox{\boldmath $x$}\beta_{yx{\cdot}zw}-\mbox{\boldmath $z$}B_{yz{\cdot}xw}-\mbox{\boldmath $w$} B_{yw{\cdot}xz}||^2_2+\lambda \left((1-\phi)||B_y||^2_{2}+\phi||B_y||^1_{1}\right). 
\end{eqnarray}

Third, consider the following type of loss function:
\begin{eqnarray}
\frac{1}{2}||\mbox{\boldmath $y$}-\mbox{\boldmath $x$}\beta_{yx{\cdot}zw}-\mbox{\boldmath $z$}B_{yz{\cdot}xw}-\mbox{\boldmath $w$} B_{yw{\cdot}xz}||^2_2+\sum_{j=1}^{q+r+1}p_{\lambda,\xi}(\beta_{j}).\label{b5}
\end{eqnarray}
Then, for $\xi>1$, the loss function of MCP \citep{Zhang10} is given by defining the function $p_{\lambda,\xi}$ in equation (\ref{b5}) as follows:
\begin{eqnarray}
p_{\lambda,\xi}(x)=
\begin{cases}\displaystyle \lambda |x|-\frac{|x|^{2}}{2\xi}&:\ |x|\le\xi\lambda\\
\displaystyle\frac{1}{2}\xi\lambda^{2}&:\ |x|>\xi\lambda
\end{cases}. 
\end{eqnarray}
In addition, for $\xi>2$, the loss function of SCAD \citep{Fan01} is given by defining the function $p_{\lambda,\xi}$ in equation (\ref{b5}) as follows:
\begin{eqnarray}
p_{\lambda,\xi}(x)=
\begin{cases}\displaystyle \lambda |x|&:\ |x|\le\lambda\\
\displaystyle\frac{\xi\lambda |x|-0.5(|x|^{2}+\lambda^{2})}{\xi-1}&:\ \lambda<|x|\le\xi\lambda\\
\displaystyle\frac{\lambda^{2}(\xi^{2}-1)}{2(\xi-1)}&:\ |x|>\xi\lambda
\end{cases}.
\end{eqnarray}
In this paper, we use the ``glmnet'' package (version 4.0.2) \citep{Friedman10} to perform LASSO, adaptive LASSO and elastic net, and the ``ncvreg'' package \citep{Breheny11} to conduct SCAD and MCP.
The ``glmnet'' and ``ncvreg'' packages are available from \url{https://glmnet.stanford.edu/} and
\url{http://pbreheny.github.io/ncvreg/}, respectively.

\subsection{Parameter settings}
\begin{table}[!ttt]
\begin{center}
Table A. Path coefficients
\vspace*{3mm}

\begin{tabular}{ccccccc} 
 & \multicolumn{5}{c}{$(a)$ $Z$ satisfies the back-door criterion} &  \\\cline{2-6}
 & Fig.~A (a) & $\alpha_{yz}$ & $\alpha_{xz}$ & $\alpha_{yx}$ & $A_{yw}$ &  \\\cline{2-6}
 & $(a_1)$ & 0.5 & 0.5 & $U([-3,3])$ & $U([-3,3])$ &  \\
 & $(a_2)$ & 0.5 & 2.5 & $U([-3,3])$ & $U([-3,3])$ &  \\
 & $(a_3)$ & 2.5 & 0.5 & $U([-3,3])$ & $U([-3,3])$ &  \\
 & $(a_4)$ & 2.5 & 2.5 & $U([-3,3])$ & $U([-3,3])$ &  \\\cline{2-6}
 &  &  &  &  &  &  \\
\multicolumn{7}{c}{$(b)$ $\{Z_1,Z_2\}$ satisfies the back-door criterion} \\\hline
Fig.~A (b) & $\alpha_{yz_1}$ & $\alpha_{xz_1}$ & $\alpha_{yz_2}$ & $\alpha_{xz_2}$ & $\alpha_{yx}$ & $A_{yw}$ \\\hline
$(b_1)$ & 0.5 & 0.5 & 0.5 & 0.5 & $U([-3,3])$ & $U([-3,3])$ \\
$(b_2)$ & 0.5 & 0.5 & 0.5 & 2.5 & $U([-3,3])$ & $U([-3,3])$ \\
$(b_3)$ & 0.5 & 0.5 & 2.5 & 0.5 & $U([-3,3])$ & $U([-3,3])$ \\
$(b_4)$ & 0.5 & 0.5 & 2.5 & 2.5 & $U([-3,3])$ & $U([-3,3])$ \\
$(b_5)$ & 0.5 & 2.5 & 0.5 & 0.5 & $U([-3,3])$ & $U([-3,3])$ \\
$(b_6)$ & 0.5 & 2.5 & 0.5 & 2.5 & $U([-3,3])$ & $U([-3,3])$ \\
$(b_7)$ & 0.5 & 2.5 & 2.5 & 0.5 & $U([-3,3])$ & $U([-3,3])$ \\
$(b_8)$ & 0.5 & 2.5 & 2.5 & 2.5 & $U([-3,3])$ & $U([-3,3])$ \\
$(b_9)$ & 2.5 & 0.5 & 0.5 & 0.5 & $U([-3,3])$ & $U([-3,3])$ \\
$(b_{10})$ & 2.5 & 0.5 & 0.5 & 2.5 & $U([-3,3])$ & $U([-3,3])$ \\
$(b_{11})$ & 2.5 & 0.5 & 2.5 & 0.5 & $U([-3,3])$ & $U([-3,3])$ \\
$(b_{12})$ & 2.5 & 0.5 & 2.5 & 2.5 & $U([-3,3])$ & $U([-3,3])$ \\
$(b_{13})$ & 2.5 & 2.5 & 0.5 & 0.5 & $U([-3,3])$ & $U([-3,3])$ \\
$(b_{14})$ & 2.5 & 2.5 & 0.5 & 2.5 & $U([-3,3])$ & $U([-3,3])$ \\
$(b_{15})$ & 2.5 & 2.5 & 2.5 & 0.5 & $U([-3,3])$ & $U([-3,3])$ \\
$(b_{16})$ & 2.5 & 2.5 & 2.5 & 2.5 & $U([-3,3])$ & $U([-3,3])$ \\\hline
\end{tabular}
\end{center}
$U([-3,3])$: path coefficients drawn at random from the uniform distribution on the interval $[-3,3]$.
\end{table}

\begin{figure}[bbb]
\vspace*{-3mm}
%\begin{center}

\hspace*{\fill}\hspace*{\fill}\includegraphics[width=5.2cm,clip]{Fig1(a).jpg}\hspace*{\fill}
\includegraphics[width=5.2cm,clip]{Fig1(b).jpg}\hspace*{\fill}
%\vspace*{2mm}

\hspace*{2.7cm}\hspace*{\fill}(a)\hspace*{\fill}\hspace*{\fill} \hspace*{\fill}(b)\hspace*{\fill}\hspace*{\fill}
\vspace*{1mm}

\hspace*{\fill}Fig. A. Causal diagram\hspace*{\fill}
%\end{center}
%\vspace*{-5mm}

\end{figure}

For simplicity, letting $X$ and $Y$ be the treatment variable and the response variable, respectively, consider the linear SCMs with 42 explanatory variables for $Y$ in the form of
\begin{equation}
\left.
\begin{array}{l}
Y=\alpha_{yx}X+\alpha_{yz}Z+A_{yw}\mbox{\boldmath $W$}+\epsilon_{y}\\
X=\alpha_{xz}Z+\epsilon_{x}
\end{array}\right\}
\end{equation}
for Fig.~A (a) ($\mbox{\boldmath $W$}$ includes 40 variables), and
\begin{equation}
\hspace*{-5.0mm}\left.
\begin{array}{l}
Y=\alpha_{yx}X+\alpha_{yz_1}Z_1+\alpha_{yz_2}Z_2+A_{yw}\mbox{\boldmath $W$}+\epsilon_{y}\\
X=\alpha_{xz_1}Z_1+\alpha_{xz_2}Z_2+\epsilon_{x}
\end{array}
\hspace*{-1mm}
\right\}
\end{equation}
for Fig.~A (b) ($\mbox{\boldmath $W$}$ includes 39 variables).
Fig.~A (a) shows that (i) $Z$ satisfies the back-door criterion relative to $(X,Y)$, and (ii) the path coefficients of $\mbox{\boldmath $W$}$ on $Y$ are regularized, but that of $Z$ is not.
Fig.~A (b) shows that (i) $\{Z_1,Z_2\}$ satisfies the back-door criterion relative to $(X,Y)$, and (ii) the path coefficients of $\{Z_2\}\cup\mbox{\boldmath $W$}$ on $Y$ are regularized, but that of $Z_1$ is not.
Theorem 1 holds in Fig.~A (a), so $\mbox{\boldmath $W$}$ is collapsible.
However, $\{Z_2\}\cup\mbox{\boldmath $W$}$ is not collapsible in Fig.~A (b); thus, the estimated total effect may be biased.

\quad~To construct the population variance-covariance matrix, first, we assigned one of $0.5$ and $2.5$ to $\alpha_{yz}$ and $\alpha_{xz}$, depending on Fig.~A (a), and $\alpha_{yz_1}$, $\alpha_{yz_2}$, $\alpha_{xz_1}$ and $\alpha_{xz_2}$, depending on Fig.~A (b).
Multicollinearity may occur between $X$ and the covariates satisfying the back-door criterion when we assign $2.5$ to the path coefficients on $X$ but may not occur when we assign $0.5$ to the path coefficients on $X$.
Other path coefficients were randomly and independently generated according to the uniform distribution on the interval $[-3,3]$.
These parameter settings are shown in Table A.
In addition, the population variance-covariance matrices of the covariates $\{Z\}\cup \mbox{\boldmath $W$}$ in Fig.~A (a) and $\{Z_1,Z_2\}\cup \mbox{\boldmath $W$}$ in Fig.~A (b) are also randomly generated using the ``randcorr'' package (available from \url{https://www.rdocumentation.org/packages/randcorr/versions/1.0/topics/}\url{randcorr-package}) according to \cite{Pourahmadi15}.
Furthermore, we assume that (i) the random disturbances $\epsilon_{x}$ and $\epsilon_{y}$ independently follow normal distributions with mean zero and variance one, and (ii) the random disturbances are also independent of their non-descendants.

Regarding tuning the regularization parameter $\lambda$,
the ``glmnet'' package was utilized for LASSO, adaptive LASSO and elastic net.
Here, the search ranges were set to $\xi\in\{0.1,0.2,0.3,...,2.9,3.0\}$ for the tuning parameter $\xi$ of adaptive LASSO and $\phi\in\{0.01,0.02,0.03,...,0.98,0.99\}$ for the mixing parameter $\phi$ of elastic net.
For MCP and SCAD, the ``ncvreg'' package was applied to determine the regularized parameter $\lambda$.
Here, the search ranges were set to $\xi\in\{1.5,2.0,2.5,...,19.5,20.0\}$ for the tuning parameter $\xi$ of MCP and
$\xi\in\{2.5,...,19.5,20.0\}$ for the tuning parameter $\xi$ of SCAD.
In contrast, in PAL$_1$MA, we conducted all possible selections based on three-fold cross-validation to determine the regularization parameter $\lambda_1$ from the search range $\lambda_1\in\{ 0.01,0.011,...,0.049,0.050\}$ and the tuning parameter $\xi_1$ from the search range $\xi_1\in\{0.1,0.2,0.3,0.4,0.5\}$.
Similarly, bias correction was also conducted through all possible selections to determine the regularization parameter $\lambda_2$ from the search range $\lambda_2\in\{0.00, 0.01,0.02,0.03\}$ and the tuning parameter $\xi_2$ from the search range $\xi_2\in\{0.00, 0.01,0.02,0.03\}$.
Note that such parameter settings of PAL$_1$MA in this paper are somewhat empirical; i.e., they may not be optimally determined compared to other regularized regression analyses.
The development of optimal parameter tuning for PAL$_1$MA is saved for future work.
The parameter tuning results are shown in Table B.

\subsection{Analysis}

\quad~For 5000 replications, we generated 30 random samples of 42 variables from a multivariate normal distribution with a zero mean vector and the population variance-covariance matrix generated by the above procedure.
Tables C and C' show the numerical results by LASSO, adaptive LASSO, elastic net, SCAD, MCP, PAL$_1$MA and OLS based on Table B.
Here, for OLS, we select a set of covariates based on prior causal knowledge; i.e., $Z$ and $\{Z_1,Z_2\}$ are selected in Figs.~A (a) and (b), respectively.

From Figs.~B and B' and Tables C and C', we make the following observations:

\begin{enumerate}
\item When the total effect is close to zero, the coincidence rates between the signs of the estimated total effects and the true total effects are low for each regression analysis, but those of PAL$_1$MA are still higher than those of the other regression analyses.

\item When the true total effect is far from zero, the coincidence rates are high for each regression analysis.

\item When there is high spurious correlation,
the coincidence rates for PAL$_1$MA are lower than those for elastic net, but the differences are not significant.
This situation may occur because the variances of the estimated total effects from PAL$_1$MA are larger than those from the other regularized regression analyses.

\item Except for Case $(b_8)$, PAL$_1$MA provides less biased estimates than the other regularized regression analyses.
In Case $(b_8)$, PAL$_1$MA provides more biased estimates than SCAD and MCP but higher coincidence rates than these regularized regression analyses.

\item The variances of the estimated total effects from PAL$_1$MA are larger than those from the other regularized regression analyses but smaller than those from OLS regression for most cases.

\item From Figs.~B and B', the interquartile ranges of PAL$_1$MA include the true value of the total effects in all cases, but those of the other regularized regression analyses do not include this value in most cases.

\item {The running time of i-PROGLES is slightly longer than those of the other regularized regression analyses.}
%In addition, OLS and PAL$_1$MA estimators are distributed symmetrically around the sample means, but the other regularized regression estimators are not, instead skewed to zeros.
%Thus, the results from the asymptotic normal distribution may not be applicable for our simulation experiments.

\end{enumerate}

Overall, the coincidence rates between the signs of the estimated total effects and the true total effect from PAL$_1$MA seem equal to or higher than those from the other regression analyses.
In addition, PAL$_1$MA can provide less biased estimators than the other regularized regression analyses in most cases.
In some cases of Fig.~A (b), PAL$_1$MA does not select the set $\{Z_1,Z_2\}$ of covariates that satisfies the back-door criterion, and such a missing covariate ($Z_2$) leads to biased estimates of the total effects.
However, since the regression coefficient of $Z_2$ takes a small value in such cases, PAL$_1$MA seems not to reverse the direction of the regression coefficient in most cases.
Here, as seen from the following section, note that such a drawback can be eliminated by selecting smaller values of the regularization parameters based on the whole set of covariates{, although the estimated total effects may not be stable in some situations.}
These results imply that, compared to standard regularized regression analysis, the estimation of the total effect by PAL$_1$MA is less likely to lead to a misleading qualitative interpretation.

\begin{landscape}
\begin{table*}[!ttt]
\hspace*{\fill}Table B. Parameter settings\hspace*{\fill}
\vspace*{2mm}

%\hspace*{10mm}
{ \begin{tabular}{c|c|cc|cc|cc|cc|cccc|c} 
\multicolumn{15}{c}{(a) $Z$ satisfies the back-door criterion} \\\hline
\multirow{2}{*}{Fig.~A (a)}  & LASSO & \multicolumn{2}{c|}{adaptive LASSO} & \multicolumn{2}{c|}{Elastic net} & \multicolumn{2}{c|}{MCP} & \multicolumn{2}{c|}{SCAD} & \multicolumn{4}{c|}{PAL$_{1}$MA}&Total effect \\\cline{2-15}
 & $\lambda$ & $\xi$ & $\lambda$ & $\phi$ & $\lambda$ & $\xi$ & $\lambda$ & $\xi$ & $\lambda$ & $\xi_1$ & $\lambda_1$ & $\xi_2$ & $\lambda_2$ & $\tau_{yx}$ \\\hline
$(a_{1})$ &  0.0080 & 1.8000 & 1.1510 & 0.2500 & 0.0250 & 11.5000 & 0.0470 & 16.0000 & 0.0380 & 0.3000 & 0.0100 & 0.0000 & 0.0000 & -0.1520  \\
$(a_{2})$ &  0.0240 & 0.9000 & 1.9240 & 0.6600 & 0.0640 & 5.0000 & 0.0340 & 16.5000 & 0.0720 & 0.2000 & 0.0100 & 0.0001 & 0.0001 & 0.1290  \\
$(a_{3})$ &  0.0070 & 0.7000 & 0.0460 & 0.0400 & 0.1680 & 15.0000 & 0.0350 & 12.0000 & 0.0900 & 0.1000 & 0.0100 & 0.0013 & 0.0014 & -0.0200  \\
$(a_{4})$ &0.0150 & 0.8000 & 0.0440 & 0.1100 & 0.0500 & 19.0000 & 0.0320 & 12.5000 & 0.0810 & 0.3000 & 0.0100 & 0.0000 & 0.0000 & 0.3770  \\\hline
\multicolumn{15}{c}{}  \\
\multicolumn{15}{c}{(b) $\{Z_1,Z_2\}$ satisfies the back-door criterion} \\\hline
\multirow{2}{*}{Fig.~A (b)}  & LASSO & \multicolumn{2}{c|}{adaptive LASSO} & \multicolumn{2}{c|}{Elastic net} & \multicolumn{2}{c|}{MCP} & \multicolumn{2}{c|}{SCAD} & \multicolumn{4}{c|}{PAL$_{1}$MA}&Total effect \\\cline{2-15} & $\lambda$ & $\xi$ & $\lambda$ & $\phi$ & $\lambda$ & $\xi$ & $\lambda$ & $\xi$ & $\lambda$ & $\xi_1$ & $\lambda_1$ & $\xi_2$ & $\lambda_2$ & $\tau_{yx}$ \\\hline
$(b_{1})$ & 0.0110 & 1.2000 & 0.1340 & 0.2400 & 0.0290 & 20.0000 & 0.0370 & 16.5000 & 0.0440 & 0.2000 & 0.0100 & 0.0005 & 0.0005 & -0.1520  \\
$(b_{2})$ & 0.0560 & 1.2000 & 0.4380 & 0.9700 & 0.0550 & 20.0000 & 0.0880 & 13.0000 & 0.0960 & 0.1000 & 0.0100 & 0.0005 & 0.0006 & -0.6700   \\
$(b_{3})$ &0.0070 & 1.3000 & 0.8200 & 0.7500 & 0.0360 & 13.0000 & 0.0400 & 5.0000 & 0.0450 & 0.1000 & 0.0100 & 0.0024 & 0.0026 & -0.1840  \\
$(b_{4})$ & 0.0850 & 1.1000 & 0.7270 & 0.2400 & 0.0320 & 14.0000 & 0.1510 & 16.5000 & 0.0650 & 0.1000 & 0.0100 & 0.0011 & 0.0012 & -0.7780 \\
$(b_{5})$ &0.0630 & 0.8000 & 0.0570 & 0.7900 & 0.0100 & 16.5000 & 0.0730 & 13.0000 & 0.0750 & 0.4000 & 0.0100 & 0.0000 & 0.0000 & 0.3520 \\
$(b_{6})$ &0.0090 & 0.3000 & 0.0520 & 0.7500 & 0.0320 & 12.5000 & 0.1170 & 15.5000 & 0.0550 & 0.3000 & 0.0100 & 0.0000 & 0.0000 & -0.2630 \\
$(b_{7})$ &0.0060 & 1.0000 & 0.0840 & 0.2800 & 0.0230 & 17.0000 & 0.0320 & 20.0000 & 0.1410 & 0.5000 & 0.0100 & 0.0000 & 0.0000 & 0.2980 \\
$(b_{8})$ &  0.1060 & 1.2000 & 0.3590 & 0.8400 & 0.0150 & 10.0000 & 0.0620 & 9.5000 & 0.0720 & 0.1000 & 0.0100 & 0.0005 & 0.0006 & -0.8240  \\
$(b_{9})$ & 0.0360 & 0.8000 & 0.0910 & 0.7900 & 0.0250 & 16.5000 & 0.0770 & 3.5000 & 0.1510 & 0.4000 & 0.0100 & 0.0000 & 0.0000 & 0.1580 \\
$(b_{10})$ & 0.0070 & 1.2000 & 0.1280 & 0.5500 & 0.0240 & 13.5000 & 0.1240 & 15.0000 & 0.0760 & 0.3000 & 0.0100 & 0.0000 & 0.0000 & -0.1850   \\
$(b_{11})$ &0.0750 & 1.5000 & 0.6240 & 0.5700 & 0.0490 & 4.0000 & 0.0960 & 17.0000 & 0.1260 & 0.3000 & 0.0100 & 0.0000 & 0.0000 & -0.4260   \\
$(b_{12})$ & 0.0330 & 1.5000 & 1.5240 & 0.4300 & 0.0890 & 11.0000 & 0.1020 & 16.0000 & 0.1150 & 0.1000 & 0.0300 & 0.0000 & 0.0000 & -0.0840   \\
$(b_{13})$ &0.0980 & 2.3000 & 2.0180 & 0.7100 & 0.0570 & 5.0000 & 0.1580 & 3.0000 & 0.1320 & 0.2000 & 0.0100 & 0.0001 & 0.0001 & -0.8140   \\
$(b_{14})$ & 0.0630 & 1.2000 & 0.2350 & 0.5500 & 0.0250 & 19.0000 & 0.0610 & 15.0000 & 0.0710 & 0.3000 & 0.0100 & 0.0000 & 0.0000 & -0.2580 \\
$(b_{15})$ &0.0330 & 1.4000 & 0.6870 & 0.9900 & 0.0500 & 12.0000 & 0.0840 & 10.5000 & 0.0540 & 0.3000 & 0.0100 & 0.0000 & 0.0000 & -0.1950 \\
$(b_{16})$ &0.0860 & 1.2000 & 0.8960 & 0.8400 & 0.0180 & 6.0000 & 0.0490 & 17.0000 & 0.0420 & 0.1000 & 0.0100 & 0.0008 & 0.0009 & -0.9100 
  \\\hline
\end{tabular}}
\vspace*{1mm}

{\hspace*{11mm}
$\lambda$, $\lambda_1$, $\lambda_2$: regularization parameters; $\xi$, $\xi_1$, $\xi_2$: tuning parameters; $\phi$: mixing parameter; $\tau_{yx}$: total effect of $X$ on $Y$.  
}
\end{table*}

\end{landscape}



\begin{table*}[!ttt]
\begin{center}
Table C. Results based on cross-validation.
\vspace*{2mm}

(a) $Z$ satisfies the back-door criterion
\end{center}
\vspace*{2mm}

\hspace{-7.5mm}\begin{tabular}{cccccccccccc} 
 & \multicolumn{5}{c}{$(a_1)$} &  & \multicolumn{5}{c}{$(a_2)$} \\\cline{2-6}\cline{8-12}
 & mean & bias & mse & sd & sign &  & mean & bias & mse & sd & sign \\\cline{1-6}\cline{8-12}
LASSO & -0.0971 & 0.0550 & 0.0098 & 0.0824 & 0.8282   &  & 0.0079 & 0.1216 & \bf{0.0163} & 0.0385 & 0.1482  \\
adaptive LASSO & -0.0185 & 0.1336 & 0.0200 & \bf{0.0463} & 0.2332   &  & -0.0024 & 0.1319 & 0.0187 & 0.0366 & 0.0708  \\
Elastic net &  -0.0983 & 0.0538 & \bf{0.0081} & 0.0720 & 0.8904   &  &  0.0054 & 0.1241 & \bf{0.0163} & \bf{0.0304} & 0.1174 \\
MCP &-0.0619 & 0.0902 & 0.0176 & 0.0971 & 0.4794  &  &  0.0275 & 0.1020 & 0.0278 & 0.1320 & 0.1642   \\
SCAD &-0.0635 & 0.0886 & 0.0163 & 0.0919 & 0.5440 &  & 0.0034 & 0.1261 & 0.0168 & 0.0309 & 0.0626  \\
PAL$_1$MA &-0.1552 & \bf{0.0031} & 0.0136 & 0.1165 & \bf{0.9190}  &  & 0.1333 & \bf{0.0038} & 0.0787 & 0.2804 & \bf{0.6908}  \\
OLS &-0.1480 & 0.0041 & 0.0426 & 0.2063 & 0.7630 &  & 0.1293 & 0.0002 & 0.2708 & 0.5204 & 0.6048 \\\cline{1-6}\cline{8-12}
 &  &  &  &  &  &  &  &  &  &  &  \\
 & \multicolumn{5}{c}{$(a_3)$} &  & \multicolumn{5}{c}{$(a_4)$} \\\cline{2-6}\cline{8-12}
  & mean & bias & mse & sd & sign &  & mean & bias & mse & sd & sign \\\cline{1-6}\cline{8-12}
LASSO &  0.0036 & 0.0232 & 0.0038 & 0.0572 & 0.2456  &  & 0.2959 & 0.0808 & 0.0309 & 0.1562 & 0.9390 \\
adaptive LASSO &  0.0022 & 0.0218 & \bf{0.0012} & \bf{0.0277} & 0.0720&  &  0.3328 & 0.0439 & 0.0285 & 0.1632 & 0.9554  \\
    Elastic net &  0.0137 & 0.0333 & 0.0032 & 0.0459 & 0.2820   &  &0.2297 & 0.1469 & \bf{0.0254} & \bf{0.0617} & \bf{0.9998}  \\
MCP & 0.0052 & 0.0248 & 0.0044 & 0.0616 & 0.1130 &  &  0.4180 & 0.0413 & 0.0474 & 0.2137 & 0.8942  \\
SCAD & 0.0083 & 0.0279 & 0.0023 & 0.0385 & 0.0214 &  & 0.3059 & 0.0708 & 0.0469 & 0.2047 & 0.8070   \\
PAL$_1$MA &  -0.0210 & \bf{0.0013} & 0.0140 & 0.1184 & \bf{0.5688}  &  &0.3897 & \bf{0.0131} & 0.0572 & 0.2389 & 0.9564   \\
OLS & -0.0189 & 0.0007 & 0.0372 & 0.1929 & 0.5398  &  & 0.3840 & 0.0073 & 0.1974 & 0.4442 & 0.8100\\\cline{1-6}\cline{8-12}
\end{tabular}
\vspace{1mm}

mean: sample mean; bias: bias between the true value and the sample mean; mse: mean squared error; sd: standard deviation; sign: coincidence rate between the signs of the true value and the estimates. {The best results for each column are highlighted in boldface.}
\end{table*}

\begin{table*}[!ttt]
\begin{center}
Table C'. Results based on cross-validation.
\vspace*{2mm}

(b) $\{Z_1,Z_2\}$ satisfies the back-door criterion
\end{center}
\vspace*{2mm}

\hspace{-7.5mm}\begin{tabular}{cccccccccccc} 
  & \multicolumn{5}{c}{$(b_1)$} &  & \multicolumn{5}{c}{$(b_2)$} \\\cline{2-6}\cline{8-12}
 & mean & bias & mse & sd & sign &  & mean & bias & mse & sd & sign \\\cline{1-6}\cline{8-12}
LASSO & -0.0756 & 0.0761 & 0.0129 & 0.0844 & 0.6876  &  & -0.5308 & 0.1390 & 0.0451 & 0.1606 & 0.9920\\
adaptive LASSO &-0.0001 & 0.1517 & 0.0230 & \bf{0.0021} & 0.0032   &  & -0.6033 & 0.0664 & 0.0355 & 0.1763 & 0.9948  \\
Elastic net & -0.0816 & 0.0702 & \bf{0.0103} & 0.0732 & 0.8074  &  &  -0.5291 & 0.1407 & 0.0448 & \bf{0.1583} & 0.9936  \\
MCP &-0.0522 & 0.0995 & 0.0178 & 0.0888 & 0.4406  &  &  -0.5979 & 0.0719 & 0.0427 & 0.1936 & 0.9458 \\
SCAD &-0.0465 & 0.1052 & 0.0185 & 0.0861 & 0.4128  &  & -0.6378 & 0.0319 & 0.0328 & 0.1783 & 0.9602   \\
PAL$_1$MA & -0.1485 & \bf{0.0032} & 0.0192 & 0.1386 & \bf{0.8712}  &  &-0.6984 & \bf{0.0287} & \bf{0.0273} & 0.1628 & \bf{0.9990}  \\
OLS & -0.1528 & 0.0010 & 0.0542 & 0.2329 & 0.7470 &  &  -0.6697 & 0.0001 & 0.1126 & 0.3356 & 0.9718 \\\cline{1-6}\cline{8-12}
 &  &  &  &  &  &  &  &  &  &  &  \\
  & \multicolumn{5}{c}{$(b_3)$} &  & \multicolumn{5}{c}{$(b_4)$} \\\cline{2-6}\cline{8-12}
 & mean & bias & mse & sd & sign &  & mean & bias & mse & sd & sign \\\cline{1-6}\cline{8-12}
LASSO & -0.0957 & 0.0879 & \bf{0.0166} & 0.0941 & 0.7560 &  & -0.4448 & 0.3337 & 0.1313 & 0.1414 & 0.9904  \\
adaptive LASSO & 0.0000 & 0.1835 & 0.0337 & \bf{0.0000} & 0.0000 &  & -0.5685 & 0.2099 & 0.0672 & 0.1521 & 0.9964   \\
Elastic net &  -0.0560 & 0.1276 & 0.0222 & 0.0772 & 0.5674 &  & -0.4267 & 0.3517 & 0.1349 & \bf{0.1055} & \bf{1.0000}  \\
MCP & -0.0568 & 0.1267 & 0.0277 & 0.1081 & 0.4088 &  & -0.4490 & 0.3294 & 0.1342 & 0.1603 & 0.9626  \\
SCAD &  -0.0766 & 0.1069 & 0.0325 & 0.1451 & 0.3810 &  & -0.5625 & 0.2159 & 0.0759 & 0.1711 & 0.9824   \\
PAL$_1$MA & -0.1686 & \bf{0.0149} & 0.0221 & 0.1480 & \bf{0.8948} &  & -0.6766 & \bf{0.1018} & \bf{0.0601} & 0.2231 & 0.9976  \\
OLS & -0.1860 & 0.0025 & 0.0522 & 0.2284 & 0.7936 &  & -0.7760 & 0.0025 & 0.1495 & 0.3867 & 0.9718 \\\cline{1-6}\cline{8-12}
 &  &  &  &  &  &  &  &  &  &  &  \\
  & \multicolumn{5}{c}{$(b_5)$} &  & \multicolumn{5}{c}{$(b_6)$} \\\cline{2-6}\cline{8-12}
 & mean & bias & mse & sd & sign &  & mean & bias & mse & sd & sign \\\cline{1-6}\cline{8-12}
LASSO & 0.1219 & 0.2297 & 0.0646 & \bf{0.1087} & 0.7640 &  & -0.1216 & 0.1412 & 0.0311 & 0.1058 & 0.7900 \\
adaptive LASSO & 0.2227 & 0.1290 & 0.0312 & 0.1205 & 0.9406 &  & -0.1227 & 0.1401 & \bf{0.0306} & 0.1049 & 0.7932 \\
Elastic net & 0.2277 & 0.1239 & \bf{0.0300} & 0.1211 & 0.9482 &  & -0.1020 & 0.1608 & 0.0346 & 0.0937 & 0.7646   \\
MCP & 0.1233 & 0.2283 & 0.0689 & 0.1294 & 0.6744 &  & -0.0515 & 0.2114 & 0.0522 & \bf{0.0867} & 0.3956 \\
SCAD & 0.1167 & 0.2350 & 0.0726 & 0.1317 & 0.6618 &  & -0.0793 & 0.1835 & 0.0455 & 0.1085 & 0.5452  \\
PAL$_1$MA & 0.3811 & \bf{0.0295} & 0.0552 & 0.2331 & \bf{0.9586} &  & -0.2976 & \bf{0.0348} & 0.0493 & 0.2193 & \bf{0.9320} \\
OLS &0.3498 & 0.0018 & 0.2565 & 0.5064 & 0.7704 &  & -0.2560 & 0.0068 & 0.5220 & 0.7225 & 0.6364 \\\cline{1-6}\cline{8-12}
 &  &  &  &  &  &  &  &  &  &  &  \\
  & \multicolumn{5}{c}{$(b_7)$} &  & \multicolumn{5}{c}{$(b_8)$} \\\cline{2-6}\cline{8-12}
 & mean & bias & mse & sd & sign &  & mean & bias & mse & sd & sign \\\cline{1-6}\cline{8-12}
LASSO &0.1124 & 0.2296 & 0.0644 & 0.1078 & 0.7414 &  & -0.5468 & 0.2768 & 0.0887 & \bf{0.1099} & 0.9998  \\
adaptive LASSO &0.1189 & 0.2232 & 0.0615 & 0.1082 & 0.7696 &  & -0.6804 & 0.1432 & 0.0334 & 0.1137 & \bf{1.0000}  \\
Elastic net & 0.1487 & 0.1934 & \bf{0.0463} & \bf{0.0942} & 0.9154 &  & -0.6009 & 0.2227 & 0.0656 & 0.1263 & \bf{1.0000}  \\
MCP & 0.1530 & 0.1891 & 0.0653 & 0.1718 & 0.6476 &  & -0.7764 & \bf{0.0472} & \bf{0.0196} & 0.1319 & 0.9976  \\
SCAD & 0.0993 & 0.2428 & 0.0773 & 0.1353 & 0.5856 &  & -0.7703 & 0.0533 & \bf{0.0196} & 0.1296 & 0.9974 \\
PAL$_1$MA & 0.4603 & \bf{0.1183} & 0.0829 & 0.2625 & \bf{0.9686} &  & -0.7105 & 0.1131 & 0.0577 & 0.2120 & 0.9994 \\
OLS & 0.3408 & 0.0012 & 0.2443 & 0.4943 & 0.7704 &  & -0.8224 & 0.0012 & 0.1699 & 0.4122 & 0.9718\\\cline{1-6}\cline{8-12}
\end{tabular}
\vspace{1mm}

mean: sample mean; bias: bias between the true value and the sample mean; mse: mean squared error; sd: standard deviation; sign: coincidence rate between the signs of the true value and the estimates. {The best results for each column are highlighted in boldface.}
\end{table*}

\begin{table*}[!ttt]
\begin{center}
Table C' (continued). Results based on cross-validation.
\vspace*{2mm}

(b) $\{Z_1,Z_2\}$ satisfies the back-door criterion
\end{center}
\vspace*{2mm}


\hspace{-7.5mm}\begin{tabular}{cccccccccccc} 
  & \multicolumn{5}{c}{$(b_9)$} &  & \multicolumn{5}{c}{$(b_{10})$} \\\cline{2-6}\cline{8-12}
 & mean & bias & mse & sd & sign &  & mean & bias & mse & sd & sign \\\cline{1-6}\cline{8-12}
LASSO &0.1000 & 0.0583 & 0.0115 & 0.0898 & 0.7886 &  & -0.0626 & 0.1219 & \bf{0.0223} & 0.0863 & 0.5282   \\
adaptive LASSO &  0.0762 & 0.0822 & 0.0131 & \bf{0.0797} & 0.7044 &  & -0.0445 & 0.1400 & 0.0254 & 0.0758 & 0.4102  \\
Elastic net & 0.1136 & 0.0448 & \bf{0.0098} & 0.0881 & 0.8576 &  & -0.0493 & 0.1352 & 0.0232 & 0.0703 & 0.5136   \\
MCP &  0.0776 & 0.0807 & 0.0156 & 0.0954 & 0.6076 &  & -0.0124 & 0.1721 & 0.0316 & \bf{0.0446} & 0.1178  \\
SCAD &0.0442 & 0.1142 & 0.0229 & 0.0994 & 0.3606 &  & -0.0201 & 0.1644 & 0.0302 & 0.0568 & 0.1910   \\
PAL$_1$MA &  0.1706 & \bf{0.0122} & 0.0106 & 0.1024 & \bf{0.9592} &  & -0.1923 & \bf{0.0078} & 0.0225 & 0.1499 & \bf{0.9208}   \\
OLS & 0.1573 & 0.0010 & 0.0496 & 0.2227 & 0.7704 &  & -0.1805 & 0.0040 & 0.2569 & 0.5068 & 0.6364 \\\cline{1-6}\cline{8-12}
 &  &  &  &  &  &  &  &  &  &  &  \\
  & \multicolumn{5}{c}{$(b_{11})$} &  & \multicolumn{5}{c}{$(b_{12})$} \\\cline{2-6}\cline{8-12}
 & mean & bias & mse & sd & sign &  & mean & bias & mse & sd & sign \\\cline{1-6}\cline{8-12}
LASSO &-0.1674 & 0.2586 & 0.0837 & 0.1296 & 0.8384 &  & 0.0063 & 0.0901 & 0.0107 & 0.0504 & 0.1188 \\
adaptive LASSO &-0.3059 & 0.1201 & 0.0365 & 0.1486 & 0.9726 &  & 0.0000 & 0.0838 & \bf{0.0070} & \bf{0.0000} & 0.0000  \\
Elastic net & -0.2310 & 0.1949 & 0.0546 & 0.1287 & 0.9476 &  & 0.0100 & 0.0938 & 0.0109 & 0.0461 & 0.1050  \\
MCP & -0.2607 & 0.1653 & 0.0827 & 0.2353 & 0.7000 &  & 0.0098 & 0.0936 & 0.0109 & 0.0463 & 0.0162 \\
SCAD & -0.1207 & 0.3053 & 0.1089 & \bf{0.1251} & 0.7076 &  & 0.0098 & 0.0937 & 0.0104 & 0.0407 & 0.0140  \\
PAL$_1$MA & -0.4059 & \bf{0.0201} & \bf{0.029} & 0.1703 & \bf{0.9926} &  & -0.0396 & \bf{0.0443} & 0.0223 & 0.1427 & \bf{0.6152}   \\
OLS & -0.4230 & 0.0030 & 0.0424 & 0.2060 & 0.9718 &  & -0.0821 & 0.0017 & 0.0509 & 0.2256 & 0.6364 \\\cline{1-6}\cline{8-12}
 &  &  &  &  &  &  &  &  &  &  &  \\
  & \multicolumn{5}{c}{$(b_{13})$} &  & \multicolumn{5}{c}{$(b_{14})$} \\\cline{2-6}\cline{8-12}
 & mean & bias & mse & sd & sign &  & mean & bias & mse & sd & sign \\\cline{1-6}\cline{8-12}
LASSO &  -0.3901 & 0.4240 & 0.1979 & 0.1347 & 0.9904 &  & -0.0045 & 0.2535 & 0.0648 & 0.0235 & 0.0820  \\
adaptive LASSO & -0.6497 & 0.1643 & \bf{0.0631} & 0.1899 & \bf{0.9996} &  & -0.0004 & 0.2577 & 0.0664 & \bf{0.0061} & 0.0072   \\
Elastic net & -0.4351 & 0.3789 & 0.1597 & \bf{0.1269} & 0.9980 &  & -0.0154 & 0.2426 & 0.0606 & 0.0415 & 0.2374 \\
MCP &  -0.4831 & 0.3310 & 0.1517 & 0.2054 & 0.9704 &  & -0.0047 & 0.2534 & 0.0650 & 0.0276 & 0.0620  \\
SCAD & -0.5735 & 0.2405 & 0.1021 & 0.2103 & 0.9662 &  & -0.0039 & 0.2542 & 0.0652 & 0.0250 & 0.0588 \\
PAL$_1$MA & -0.8383 & \bf{0.0243} & 0.1000 & 0.3153 & 0.9968 &  & -0.2747 & \bf{0.0167} & \bf{0.0478} & 0.2180 & \bf{0.9206}  \\
OLS &-0.8114 & 0.0026 & 0.1640 & 0.4050 & 0.9718 &  & -0.2532 & 0.0049 & 0.5067 & 0.7118 & 0.6364 \\\cline{1-6}\cline{8-12}
 &  &  &  &  &  &  &  &  &  &  &  \\
  & \multicolumn{5}{c}{$(b_{15})$} &  & \multicolumn{5}{c}{$(b_{16})$} \\\cline{2-6}\cline{8-12}
   & mean & bias & mse & sd & sign &  & mean & bias & mse & sd & sign \\\cline{1-6}\cline{8-12}
LASSO & 0.0193 & 0.2141 & 0.0486 & 0.0524 & 0.0164 &  & -0.5053 & 0.4043 & 0.1776 & \bf{0.1191} & 0.9992  \\
adaptive LASSO &0.0000 & 0.1948 & \bf{0.0380} & \bf{0.0000} & 0.0000 &  & -0.6185 & 0.2910 & 0.0992 & 0.1205 & 0.9998  \\
Elastic net &  0.0190 & 0.2138 & 0.0484 & 0.0513 & 0.0104 &  & -0.5709 & 0.3387 & 0.1305 & 0.1259 & \bf{1.0000}  \\
MCP &  0.0220 & 0.2168 & 0.0513 & 0.0651 & 0.0030 &  & -0.7268 & 0.1827 & \bf{0.0632} & 0.1725 & 0.9976 \\
SCAD & 0.0313 & 0.2261 & 0.0591 & 0.0895 & 0.0076 &  & -0.7094 & 0.2001 & 0.0673 & 0.1650 & 0.9966  \\
PAL$_1$MA & -0.1051 & \bf{0.0898} & 0.1163 & 0.3291 & \bf{0.6364} &  & -0.7830 & \bf{0.1266} & 0.0778 & 0.2485 & 0.9990  \\
OLS &-0.1905 & 0.0043 & 0.2857 & 0.5345 & 0.6364 &  & -0.9069 & 0.0027 & 0.2047 & 0.4525 & 0.9718 \\\cline{1-6}\cline{8-12}
\end{tabular}
\vspace{1mm}

mean: sample mean; bias: bias between the true value and the sample mean; mse: mean squared error; sd: standard deviation; sign: coincidence rate between the signs of the true value and the estimates. {The best results for each column are highlighted in boldface.}
\end{table*}

\begin{figure*}[!ttt]
%\vspace*{-2cm}

\begin{center}
\hspace*{\fill}\resizebox{7.0cm}{!}{\includegraphics{a1.png}}\hspace*{\fill}
\hspace*{\fill}\resizebox{7.0cm}{!}{\includegraphics{a2.png}}\hspace*{\fill}

\hspace*{\fill}$(a_1)$\hspace*{\fill}\hspace*{\fill}$(a_2)$\hspace*{\fill}

\vspace{1cm}
\hspace*{\fill}\resizebox{7.0cm}{!}{\includegraphics{a3.png}}\hspace*{\fill}
\hspace*{\fill}\resizebox{7.0cm}{!}{\includegraphics{a4.png}}\hspace*{\fill}

\hspace*{\fill}$(a_3)$\hspace*{\fill}\hspace*{\fill}$(a_4)$\hspace*{\fill}

(a) $Z$ satisfies the back-door criterion.
\end{center}
Fig.~B. Boxplots of the estimated total effects based on 5000 replications from the numerical experiments. The dashed lines show the true total effects.
\vspace*{-5mm}

\end{figure*}
\begin{figure*}[!ttt]
\vspace*{-2cm}
\begin{center}
\hspace*{\fill}\resizebox{7.0cm}{!}{\includegraphics{b1.png}}\hspace*{\fill}
\hspace*{\fill}\resizebox{7.0cm}{!}{\includegraphics{b2.png}}\hspace*{\fill}

\hspace*{\fill}$(b_1)$\hspace*{\fill}\hspace*{\fill}$(b_2)$\hspace*{\fill}


\vspace{0.5cm}
\hspace*{\fill}\resizebox{7.0cm}{!}{\includegraphics{b3.png}}\hspace*{\fill}
\hspace*{\fill}\resizebox{7.0cm}{!}{\includegraphics{b4.png}}\hspace*{\fill}

\hspace*{\fill}$(b_3)$\hspace*{\fill}\hspace*{\fill}$(b_4)$\hspace*{\fill}


\vspace{0.5cm}
\hspace*{\fill}\resizebox{7.0cm}{!}{\includegraphics{b5.png}}\hspace*{\fill}
\hspace*{\fill}\resizebox{7.0cm}{!}{\includegraphics{b6.png}}\hspace*{\fill}

\hspace*{\fill}$(b_5)$\hspace*{\fill}\hspace*{\fill}$(b_6)$\hspace*{\fill}


\vspace{0.5cm}
\hspace*{\fill}\resizebox{7.0cm}{!}{\includegraphics{b7.png}}\hspace*{\fill}
\hspace*{\fill}\resizebox{7.0cm}{!}{\includegraphics{b8.png}}\hspace*{\fill}

\hspace*{\fill}$(b_7)$\hspace*{\fill}\hspace*{\fill}$(b_8)$\hspace*{\fill}

(b) $\{Z_1,Z_2\}$ satisfies the back-door criterion.
\end{center}
Fig.~B'. Boxplots of the estimated total effects based on 5000 replications from the numerical experiments. The dashed lines show the true total effects.
\vspace*{-5mm}

\end{figure*}
\begin{figure*}[!ttt]
\vspace*{-2cm}
\begin{center}
\hspace*{\fill}\resizebox{7.0cm}{!}{\includegraphics{b9.png}}\hspace*{\fill}
\hspace*{\fill}\resizebox{7.0cm}{!}{\includegraphics{b10.png}}\hspace*{\fill}

\hspace*{\fill}$(b_{9})$\hspace*{\fill}\hspace*{\fill}$(b_{10})$\hspace*{\fill}


\vspace{0.5cm}
\hspace*{\fill}\resizebox{7.0cm}{!}{\includegraphics{b11.png}}\hspace*{\fill}
\hspace*{\fill}\resizebox{7.0cm}{!}{\includegraphics{b12.png}}\hspace*{\fill}

\hspace*{\fill}$(b_{11})$\hspace*{\fill}\hspace*{\fill}$(b_{12})$\hspace*{\fill}


\vspace{0.5cm}
\hspace*{\fill}\resizebox{7.0cm}{!}{\includegraphics{b13.png}}\hspace*{\fill}
\hspace*{\fill}\resizebox{7.0cm}{!}{\includegraphics{b14.png}}\hspace*{\fill}

\hspace*{\fill}$(b_{13})$\hspace*{\fill}\hspace*{\fill}$(b_{14})$\hspace*{\fill}


\vspace{0.5cm}
\hspace*{\fill}\resizebox{7.0cm}{!}{\includegraphics{b15.png}}\hspace*{\fill}
\hspace*{\fill}\resizebox{7.0cm}{!}{\includegraphics{b16.png}}\hspace*{\fill}

\hspace*{\fill}$(b_{15})$\hspace*{\fill}\hspace*{\fill}$(b_{16})$\hspace*{\fill}

(b) $\{Z_1,Z_2\}$ satisfies the back-door criterion.
\end{center}
Fig.~B'. Boxplots of the estimated total effects based on 5000 replications from the numerical experiments. The dashed lines show the true total effects.

\end{figure*}
\clearpage
\section{Case Study}

\subsection{Background}

\begin{figure*}[!hhh]
\hspace*{\fill}\resizebox{10.0cm}{!}{\includegraphics{2.jpg}}\hspace*{\fill}


\hspace*{\fill}Fig.~C. Causal diagram \citep{Kuroki12}\hspace*{\fill}
\end{figure*}

\begin{table}[!bbb]
\begin{center}
Table D. Randomly selected data from the paper by \cite{Okuno86}. 
\vspace*{3mm}

\begin{tabular}{c|cccccccccc|c} \hline
No. & $X_{1}$ & $X_{2}$ & $X_{3}$ & $X_{4}$ & $X_{5}$ & $X_{6}$ & $X_{7}$ & $X_{8}$ & $X_{9}$ & $X_{10}$ & $Y$ \\\hline
1&33 & 28.3  & 6.7  & 40.0  & 3.0  & 5.0  & 208.0  & 20.0  & 19.0  & 30.0  & 19.3  \\
2&16.7 & 35.0  & 5.0  & 40.0  & 5.0  & 5.5  & 108.0  & 25.0  & 10.5  & 39.0  & 7.3  \\
3&16.7 & 35.0  & 8.3  & 30.0  & 2.1  & 3.0  & 112.0  & 25.0  & 20.0  & 25.0  & 35.2  \\
4&33 & 25.0  & 8.3  & 40.0  & 4.0  & 4.1  & 240.0  & 34.0  & 22.5  & 25.0  & 18.4  \\
5&44 & 29.5  & 6.5  & 30.0  & 2.1  & 5.0  & 120.0  & 6.7  & 7.0  & 30.0  & 21.7  \\
6&16.7 & 35.0  & 4.9  & 40.0  & 5.0  & 3.9  & 168.0  & 25.0  & 20.0  & 25.0  & 28.7  \\
7&44 & 29.5  & 8.3  & 40.0  & 2.1  & 2.2  & 200.0  & 7.0  & 7.0  & 30.0  & 37.8  \\
8&44 & 25.8  & 6.7  & 40.0  & 4.1  & 5.0  & 132.0  & 22.0  & 8.2  & 46.0  & 13.4  \\
9&33 & 25.5  & 6.5  & 40.0  & 4.0  & 4.0  & 276.0  & 20.0  & 22.5  & 25.0  & 17.8  \\
\hline
\end{tabular}
\end{center}

\end{table}

In this section, we apply LASSO, adaptive LASSO, elastic net, SCAD, MCP, PAL$_1$MA and OLS to a case study of setting up coating conditions for car bodies, reported by \cite{Okuno86} and reanalyzed by \cite{Kuroki12}.

According to \cite{Okuno86}, car bodies are coated to increase both the rust protection quality and the visual appearance.
A certain coating thickness must be ensured in the coating process.
At the time of the study, this process was conducted by operators who sprayed the car bodies with paint, which depended on the operators' skills and could cause low transfer efficiency.
\cite{Okuno86} collected nonexperimental data on the coating process to examine the process conditions and to increase the transfer efficiency.
The sample size is 38, and the dataset is available from \cite{Okuno86}.
In addition, the observed variables of interest are as follows:

\vspace*{1mm}
\hspace*{\fill}Process condition \hspace*{\fill}\\
{\noindent}The dilution ratio $(X_{1})$, degree of viscosity $(X_{2})$,
gun speed $(X_{3})$, spray distance $(X_{4})$, air pressure $(X_{5})$, pattern width $(X_{6})$, fluid output $(X_{7})$, temperature of the paint $(X_{8})$, temperature $(X_{9})$, and degree of moisture $(X_{10})$

\hspace*{\fill}Response\hspace*{\fill}

\hspace*{\fill}The transfer efficiency $(Y)$.\hspace*{\fill}

\vspace*{1mm}
{\noindent}Table D shows the randomly selected data from the whole dataset given by \cite{Okuno86}.
Here, note that our discussion is based on Table D to consider a situation where OLS and the all-variable selection procedure cannot be applied.

According to \cite{Okuno86}, there is some difference among these variables in terms of the controllability level: $X_{1}$, $X_{2}$, $\ldots$, $X_{6}$ can be controlled; $X_{7}$ and $X_{8}$ result from other factors and are difficult to control; and $X_{9}$ and $X_{10}$ are environmental conditions that cannot be controlled.
In addition, \cite{Kuroki12} assumed that the cause-effect relationships in the coating process are as shown in Fig.~C.
From Fig.~C, $\{X_{8},X_{10}\}$ satisfies the back-door criterion relative to $(X_2,Y)$.
For details on this case study, refer to \cite{Okuno86} and \cite{Kuroki12}.

\subsection{Analysis}

\quad~In this section, we are concerned with the evaluation of the total effect of $X_2$ on $Y$ because similar observations can be derived regarding other controllable variables.
Table E shows the results obtained by each regression analysis.
Here, parameter tuning was conducted by the same procedure as in Section B.

First, according to \cite{Okuno86}, it is well known that the viscosity $(X_2)$ is an important factor that increases both the rust protection quality and visual appearance.
However, from Table E, the total effect of $X_2$ on $Y$ is estimated as zero by MCP, which is problematic because it leads to the misleading interpretation that controlling $X_2$ is of no use in achieving the aim.

Second, OLS regression provides the unbiased estimator of the total effect through a set $\{X_8,X_{10}\}$ that satisfies the back-door criterion.
Given this finding, it is desirable that the estimators from the regularized regression analysis be close to the OLS estimate.
From the viewpoint of this observation, the estimates from PAL$_1$MA are close to the OLS estimates for each selected variable, but those from the other regularized regression analyses are not close to these estimates.

Third, when the regression coefficient of $X_8$ is regularized, for PAL$_1$MA, $X_8$ is not selected, but $\{X_5,X_6\}$ is selected.
This phenomenon may occur because the OLS estimate of the regression coefficient of $X_8$ is very small $(-0.083)$ in the regression model of $Y$ on $X_2$, $X_5, X_6, X_8$ and $X_{10}$.
However, even if a set of sufficient confounders is not available by PAL$_1$MA, by checking the solution paths shown in Fig.~D, we can verify that missing sufficient confounders do not interfere with the qualitative interpretation of the total effects by PAL$_1$MA for any $\lambda$.

Fourth, from Fig.~E, the sample ranges of elastic net, PAL$_1$MA and OLS do not include zero, but those of the other regression analyses include zero.
From this observation, it is judged that $X_2$ would have a positive effect on $Y$ from elastic net, PAL$_1$MA and OLS, but the other regression analyses may not result in the rejection of the hypothesis that $X_2$ has no effect on $Y$.

\begin{table*}[!ttt]

\begin{center}
Table E. Results based on cross-validation. 
\vspace*{3mm}

\begin{tabular}{cccccccc} \hline
\multicolumn{1}{c}{Method} & \multicolumn{1}{c}{non-regularized} & \multicolumn{1}{c}{estimate} & \multicolumn{1}{c}{sd} & \multicolumn{1}{c}{selected variables} & \multicolumn{3}{c}{parameters} \\
\multicolumn{1}{c}{} & \multicolumn{1}{c}{variables} & \multicolumn{1}{c}{} & \multicolumn{1}{c}{} & \multicolumn{1}{c}{} & $\lambda$ & $\xi$ & $\phi$ \\\hline
LASSO & $-$ & 0.0470  & 0.0914 & $X_2$,$X_5$,$X_6$ & 0.1640 & $-$ & $-$ \\
adaptive LASSO & $-$ & 0.0759  & 0.0862  & $X_2$,$X_5$,$X_6$,$X_{10}$ & 0.2160 & 0.5000 & $-$ \\
Elastic net & $-$ & 0.1395  & 0.0963  & $X_2$,$X_5$,$X_6$,$X_8$,$X_{10}$ & 0.1350 & $-$ & 0.5500 \\
MCP & $-$ & 0.0000  & 0.0621  & $X_6$ & 0.3020  & 4.0000  & $-$ \\
SCAD & $-$ & 0.1357  & 0.1124  & $X_2$,$X_4$,$X_5$,$X_6$,$X_{10}$ & 0.0820  & 17.5000  & $-$ \\
PAL$_1$MA & $X_8$,$X_{10}$ & 0.2221  & 0.1111  & $X_2$,$X_5$,$X_6$,$X_8$,$X_{10}$ & 0.00005 & 0.1000 & $-$ \\
PAL$_1$MA & $X_8$ & 0.2242  & 0.0950  & $X_2$,$X_5$,$X_6$,$X_8$,$X_{10}$ & 0.00003 & 0.5000 & $-$ \\
PAL$_1$MA & $X_{10}$ & 0.2251  & 0.0854  & $X_2$,$X_5$,$X_6$,$X_{10}$ & 0.00003 & 0.5000 & $-$ \\
PAL$_1$MA & $-$ & 0.2297  & 0.0795  & $X_2$,$X_5$,$X_6$,$X_{10}$ & 0.00002 & 0.5000 & $-$ \\
OLS & $-$ & 0.2455  & 0.1314  & $X_2$,$X_8$,$X_{10}$ & $-$ & $-$ & $-$ \\\hline
\end{tabular}
\end{center}

estimate: estimates of the total effect with $n=9$; sd: standard deviation based on the leave-one-out method; selected variables: explanatory variables selected by variable selection; 
parameters: regularization, tuning and mixing parameters.
Here, $\lambda_2$ and $\xi_2$ of PAL$_1$MA were selected as zero by the leave-one-out method. 
\end{table*}


\begin{figure*}[!ttt]

\hspace*{\fill}\resizebox{17.0cm}{!}{\includegraphics{pathUAI1.png}}\hspace*{\fill}

\vspace*{-2mm}

\hspace*{\fill}\hspace*{1.5cm}(a) LASSO\hspace*{\fill}
\hspace*{\fill}\hspace*{1.5cm}(b) adaptive LASSO\hspace*{\fill}
\hspace*{\fill}\hspace*{\fill}(c) Elastic net\hspace*{\fill}

\vspace*{2mm}

\hspace*{\fill}\resizebox{11.25cm}{!}{\includegraphics{pathUAI2.png}}\hspace*{\fill}

\vspace*{-2mm}

\hspace*{\fill}\hspace*{\fill}\hspace*{0.5cm}(d) MCP
\hspace*{1.5cm}
\hspace*{\fill}(e) SCAD\hspace*{\fill}\hspace*{\fill}

\hspace*{\fill}\resizebox{11.25cm}{!}{\includegraphics{palmapath1.png}}\hspace*{\fill}

\vspace*{-1mm}

\hspace*{3.8cm}(f) PAL$_1$MA $(\{X_8,X_{10}\})$ \hspace{2cm}
(g) PAL$_1$MA $(X_8)$\hspace*{\fill}\hspace*{\fill}

\vspace*{2mm}

\hspace*{\fill}\resizebox{11.5cm}{!}{\includegraphics{palmapath2.png}}\hspace*{\fill}

\vspace*{-2mm}

\hspace*{4.5cm}(h) PAL$_1$MA $(X_{10})$ \hspace*{2.7cm}(i) PAL$_1$MA $(\phi)$\hspace*{\fill}\hspace*{\fill}

\vspace*{4mm}

Fig.~D. Solution paths of the regularization parameter $\lambda$ when both $\xi$ and $\phi$ are fixed to the value in Table E.
Here, the dashed horizontal lines and the dashed vertical lines show the value of $\lambda$ from Table E.
The bold solid line: the regression coefficient of $X_2$;
the dot-dashed line: the regression coefficient of $X_{8}$;
the dashed line: the regression coefficient of $X_{10}$;
the thin solid line: the regression coefficients of the other covariates.

\end{figure*}
\clearpage



\begin{figure*}[!ttt]
\hspace*{\fill}\resizebox{7.0cm}{!}{\includegraphics{boxUAI810.png}}\hspace*{\fill}
\hspace*{\fill}\resizebox{7.0cm}{!}{\includegraphics{boxUAI8.png}}\hspace*{\fill}

\hspace*{2.5cm}(a) PAL$_1$MA $(\{X_8,X_{10}\})$ \hspace*{4.5cm}(b) PAL$_1$MA $(X_8)$\hspace*{\fill}\hspace*{\fill}

\vspace*{3mm}

\hspace*{\fill}\resizebox{7.0cm}{!}{\includegraphics{boxUAI10.png}}\hspace*{\fill}
\hspace*{\fill}\resizebox{7.0cm}{!}{\includegraphics{boxUAIna.png}}\hspace*{\fill}

\hspace*{3cm}(c) PAL$_1$MA $(X_{10})$ \hspace*{5cm}(d) PAL$_1$MA $(\phi)$\hspace*{\fill}\hspace*{\fill}
\vspace*{3mm}


\hspace*{\fill}Fig.~E. Boxplots of the case study for setting up the coating conditions for car bodies\hspace*{\fill}

\end{figure*}

\begin{figure*}[!ttt]
\begin{center}
\hspace*{\fill}\resizebox{7.0cm}{!}{\includegraphics{boxfix10.png}}\hspace*{\fill}
\hspace*{\fill}\resizebox{7.0cm}{!}{\includegraphics{{boxfixna.png}}}\hspace*{\fill}

\hspace*{3cm}(a) PAL$_1$MA $(X_{10})$ \hspace*{5cm}(b) PAL$_1$MA $(\phi)$\hspace*{\fill}\hspace*{\fill}
\vspace*{3mm}


\hspace*{\fill}Fig.~F. Boxplots of the case study for setting up the coating conditions for car bodies\hspace*{\fill}
\end{center}
\end{figure*}

\clearpage
Here, Table F also shows the results obtained by conducting parameter tuning to select $\{X_8,X_{10}\}$ satisfying the back-door criterion relative to $(X_2,Y)$ with the best prediction accuracy possible.
To select $\{X_8,X_{10}\}$, in Table F, the regularization parameters have been set to smaller values than those in Table E.
First, both the estimates and the standard deviations of LASSO, adaptive LASSO, MCP and SCAD in Table F are larger than those in Table E, but there seems to be no significant change in those of PAL$_1$MA between Tables E and F.
Second, compared to Table E, covariates other than $X_8$ and $X_{10}$ are selected in Table F.
Especially for MCP, an intermediate variable $X_7$ is also selected against the back-door criterion to select $X_8$ and $X_{10}$, which may be problematic in the context of statistical causal inference.
Third, from Figs.~F (a) and (b), although the sample ranges of LASSO, adaptive LASSO, MCP and SCAD include zero, those of OLS and PAL$_1$MA do not include zero.
From this observation, it is judged that $X_2$ would have a positive effect on $Y$ from elastic net, PAL$_1$MA and OLS, but the other regression analyses may not result in the rejection of the hypothesis that $X_2$ has no effect on $Y$.
\begin{table*}[!ttt]

\hspace*{\fill}Table F. Results\hspace*{\fill}
\vspace*{3mm}

\hspace{-7.5mm}{\begin{tabular}{cccccccc} \hline
\multicolumn{1}{c}{Method} & \multicolumn{1}{c}{non-regularized} & \multicolumn{1}{c}{estimate} & \multicolumn{1}{c}{sd} & \multicolumn{1}{c}{selected variables} & \multicolumn{3}{c}{parameters} \\
\multicolumn{1}{c}{} & \multicolumn{1}{c}{variables} & \multicolumn{1}{c}{} & \multicolumn{1}{c}{} & \multicolumn{1}{c}{} & $\lambda$ & $\xi$ & $\phi$ \\\hline
LASSO & $-$ & 0.1453  & 0.1070 & $X_2$,$X_5$,$X_6$,$X_8$,$X_{10}$ & 0.0752 & $-$ & $-$ \\
adaptive LASSO & $-$ & 0.1438  & 0.1856  & $X_2$,$X_3$,$X_4$,$X_5$,$X_6$,$X_8$,$X_{10}$ & 0.0170 & 0.5000 & $-$ \\
Elastic net & $-$ & 0.1395  & 0.0963  & $X_2$,$X_5$,$X_6$,$X_8$,$X_{10}$ & 0.1350 & $-$ & 0.5500 \\
MCP & $-$ & 0.4254  & 0.2473  & $X_1$,$X_2$,$X_4$,$X_5$,$X_6$,$X_7$,$X_8$,$X_9$,$X_{10}$ & 0.0140  & 4.0000  & $-$ \\
SCAD & $-$ & 0.1568  & 0.1158  & $X_2$,$X_4$,$X_5$,$X_6$,$X_8$,$X_{10}$ & 0.0680  & 17.5000  & $-$ \\
PAL$_1$MA & $X_{10}$ & 0.2254  & 0.0835  & $X_2$,$X_5$,$X_6$,$X_8$,$X_{10}$ & 0.00002 & 0.5000 & $-$ \\
PAL$_1$MA & $-$ & 0.2276  & 0.0816  & $X_2$,$X_5$,$X_6$,$X_8$,$X_{10}$ & 0.00001 & 0.5000 & $-$ \\
OLS & $-$ & 0.2455  & 0.1314  & $X_2$,$X_8$,$X_{10}$ & $-$ & $-$ & $-$ \\\hline
\end{tabular}}


estimate: estimates of the total effect with $n=9$; sd: standard deviation based on the leave-one-out method; selected variables: explanatory variables selected by the variable selection; 
parameters: regularization, tuning and mixing parameters.
Here, $\lambda_2$ and $\xi_2$ of PAL$_1$MA were selected as zero by three-fold cross-validation.
\end{table*}
\clearpage
\begin{thebibliography}{99}
%\setlength{\itemindent}{-\leftmargin}
%\makeatletter\renewcommand{\@biblabel}[1]{}\makeatother

\bibitem[\protect\citeauthoryear{Beck}{2017}]{Beck17}
Beck, A.  
{\it First-order Methods in Optimization}. 
Society for Industrial and Applied Mathematics, 2017. 

\bibitem[\protect\citeauthoryear{Breheny and Huang}{2011}]{Breheny11}
Breheny, P. and Huang, J.  
Coordinate descent algorithms for nonconvex penalized regression, with applications to biological feature selection. 
{\it Annals of Applied Statistics}, {\bf 5}:232--253, 2011.

\bibitem[\protect\citeauthoryear{Fan and Li}{2001}]{Fan01}
Fan, J. and Li, R.  
Variable selection via nonconcave penalized likelihood and its oracle properties. 
{\it Journal of the American Statistical Association}, {\bf 96}:1348--1360, 2001.

\bibitem[\protect\citeauthoryear{Friedman et al.}{2010}]{Friedman10}
Friedman, J., Hastie, T., and Tibshirani, R.  
Regularization paths for generalized linear models via coordinate descent. 
{\it Journal of Statistical Software}, {\bf 33}:1--22, 2010.

\bibitem[\protect\citeauthoryear{Kuroki}{2012}]{Kuroki12}
\textsc{Kuroki, M.}  
Optimizing an external intervention using a structural equation model with an application to statistical process analysis. 
{\it Journal of Applied Statistics}, {\bf 39}:673--694, 2012.

\bibitem[\protect\citeauthoryear{Okuno et al.}{1986}]{Okuno86}
\textsc{Okuno, T., Katayama, Z., Kamigori, N., Itoh, T., Irikura, N., and Fujiwara, N.}  
{\it Multivariate Data Analysis in Industry}, JUSE Press, 1986. 

\bibitem[\protect\citeauthoryear{Pourahmadi and Wang}{2015}]{Pourahmadi15}
Pourahmadi, M. and Wang, X.  
Distribution of random correlation matrices: Hyperspherical parameterization of the Cholesky factor. {\it Statistics and Probability Letters}, {\bf 106}:5--12, 2015.

\bibitem[\protect\citeauthoryear{Sardy et al.}{2000}]{Sardy00}
Sardy, S., Bruce, A. G., and Tseng, P.  
Block coordinate relaxation methods for nonparametric wavelet denoising. 
{\it Journal of Computational and Graphical Statistics}, {\bf 9}:361--379, 2000.

\bibitem[\protect\citeauthoryear{Tibshirani}{1996}]{Tibshirani96}
Tibshirani, R.  
Regression shrinkage and selection via the lasso, 
{\it Journal of the Royal Statistical Society: Series B}, {\bf 58}:267--288, 1996.

\bibitem[\protect\citeauthoryear{Zhang}{2010}]{Zhang10}
Zhang, C. H.  
Nearly unbiased variable selection under minimax concave penalty. 
{\it Annals of Statistics}, {\bf 38}:894--942, 2010.

\bibitem[\protect\citeauthoryear{Zou}{2006}]{Zou06}
Zou, H. 
The adaptive LASSO and its oracle properties. {\it Journal of the American Statistical Association}, {\bf 101}:1418--1429, 2006.

\bibitem[\protect\citeauthoryear{Zou and Hastie}{2005}]{Zou05}
Zou, H. and Hastie, T.  
Regularization and variable selection via the Elastic net. 
{\it Journal of the Royal Statistical Society: Series B}, {\bf 67}:301--320, 2005.

\end{thebibliography}

\end{document}

