%\documentclass{uai2022} % for initial submission
\documentclass[accepted]{uai2022} % after acceptance, for a revised
                                    % version; also before submission to
                                    % see how the non-anonymous paper
                                    % would look like
%% There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2022} % ptmx math instead of Computer
                                         % Modern (has noticable issues)
% \documentclass[mathfont=newtx]{uai2022} % newtx fonts (improves upon
                                          % ptmx; less tested, no support)
% NOTE: Only keep *one* line above as appropriate, as it will be replaced
%       automatically for papers to be published. Do not make any other
%       change above this note for an accepted version.

%% Choose your variant of English; be consistent
\usepackage[american]{babel}
% \usepackage[british]{babel}

%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} % amsmath with fixes and additions
% \usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{tikz} % nice language for creating drawings and diagrams
%%additional packages%%
\usepackage{bm}
\usepackage{latexsym}
\usepackage{algorithm,algorithmic,lscape}
\usepackage{graphics}
\usepackage{times}
\usepackage{amsfonts}
\usepackage{lmodern}
\usepackage{empheq}
\usepackage{scalerel}
\usepackage{subcaption}
\usepackage{graphicx}
\usepackage{color}
\usepackage{caption}
%\usepackage{amsmath}
%%additional packages%%

%%additional new commands/theorems%%
% \red and \blue are colour *switches* (declarations taking no argument):
% they change the text colour for the remainder of the current group, so
% they are meant to be used as {\red some text}.
\newcommand{\red}{\color{red}}
\newcommand{\blue}{\color{blue}}
%\newcommand{\green}{\color{green}}
% Theorem-like environments. No shared-counter optional argument is given,
% so each environment below is numbered with its own independent counter.
\newtheorem{theorem}{Theorem}
\newtheorem{condition}{Condition}
\newtheorem{lemma}{Lemma}
\newtheorem{corollary}{Corollary}
\newtheorem{definition}{Definition}
%%additional new commands/theorems%%


%%???%%
% \MnSymbolGlyphs{<suffix>}: debugging helper for looking up glyph slots.
% It loads the MnSymbol and fonttable packages and, at \begin{document},
% prints the font table of the font "MnSymbol<suffix>10" (the text "#110"
% below is parameter #1 immediately followed by the characters "10").
% Because it issues \usepackage, it can only be called in the preamble.
% It is not invoked anywhere in the visible part of this file.
\def\MnSymbolGlyphs#1{% IF ONE NEEDS TO LOCATE GLYPHS
  \usepackage{MnSymbol,fonttable}%
  \AtBeginDocument{\fonttable{MnSymbol#110}}%
}

% amsmath (loaded here through mathtools): permit page/column breaks inside
% multi-line display environments such as align.
\allowdisplaybreaks

% \ImportFromMnSymbol{<suffix>}: make the font "MnSymbol<suffix>" available
% as a math symbol font without issuing \usepackage{MnSymbol}.
%   * Declares the NFSS family U/MnSymbol<suffix>.
%   * Declares a medium (m/n) and a bold (b/n) shape; each maps size ranges
%     to the design sizes 5, 6, 7, 8, 9, 10 and 12 of the font files
%     (e.g. "MnSymbol#15" is parameter #1 followed by the size "5").
%   * Registers the math symbol font "MnSy<suffix>" using the medium shape.
% NOTE(review): presumably done this way to pick single glyphs without the
% symbol redefinitions that loading the full MnSymbol package would bring
% -- confirm.
\def\ImportFromMnSymbol#1{%
  \DeclareFontFamily{U} {MnSymbol#1}{}
  \DeclareFontShape{U}{MnSymbol#1}{m}{n}{
   <-6> MnSymbol#15
   <6-7> MnSymbol#16
   <7-8> MnSymbol#17
   <8-9> MnSymbol#18
   <9-10> MnSymbol#19
   <10-12> MnSymbol#110
   <12-> MnSymbol#112}{}
  \DeclareFontShape{U}{MnSymbol#1}{b}{n}{
   <-6> MnSymbol#1-Bold5
   <6-7> MnSymbol#1-Bold6
   <7-8> MnSymbol#1-Bold7
   <8-9> MnSymbol#1-Bold8
   <9-10> MnSymbol#1-Bold9
   <10-12> MnSymbol#1-Bold10
   <12-> MnSymbol#1-Bold12}{}
  \DeclareSymbolFont{MnSy#1} {U} {MnSymbol#1}{m}{n}
}
% \DeclareMnSymbol{<cmd>}{<math class>}{<suffix>}{<slot>}: thin wrapper
% around \DeclareMathSymbol that takes the glyph from the symbol font
% "MnSy<suffix>" set up by \ImportFromMnSymbol{<suffix>}.
\newcommand\DeclareMnSymbol[4]{\DeclareMathSymbol{#1}{#2}{MnSy#3}{#4}}
% Import the "A" font and define \ConIndepNat as slot 225 of it, typeset as
% a relation.
% NOTE(review): judging by the name, slot 225 is the conditional-independence
% symbol -- verify with \MnSymbolGlyphs{A}.
\ImportFromMnSymbol{A}
\DeclareMnSymbol{\ConIndepNat}{\mathrel}{A}{225}
% \ConIndep: \ConIndepNat rescaled (scalerel's \scalerel*) to the vertical
% size of the letter "X", again with relation spacing.
\def\ConIndep{\mathrel{\scalerel*{\ConIndepNat}{X}}}
%%???%%

%% Provided macros
% \smaller: Because the class footnote size is essentially LaTeX's \small,
%           redefining \footnotesize, we provide the original \footnotesize
%           using this macro.
%           (Use only sparingly, e.g., in drawings, as it is quite small.)

%% Self-defined macros
% \swap[<sep>]{<a>}{<b>} typesets <b>, then <sep> (default "-"), then <a>.
% Example macro inherited from the class template; it is not used in the
% visible part of this file -- TODO confirm before removing.
\newcommand{\swap}[3][-]{#3#1#2} % just an example

\title{Partially Adaptive Regularized Multiple Regression Analysis for Estimating Linear Causal Effects}

% The standard author block has changed for UAI 2021 to provide
% more space for long author lists and allow for complex affiliations
%
% All author information is automatically removed by the class for the
% anonymous submission version of your paper, so you can already add your
% information below.

% Add authors in order of decreasing contribution
\author[1]{\href{mailto:Hisayoshi Nanmo <nanmohisayoshi@gmail.com>?Subject=Partially Adaptive Regularized Multiple Regression Analysis for Estimating Linear Causal Effects}{Hisayoshi Nanmo}{}} % Lead author
\author[2]{\href{mailto:Manabu Kuroki <kuroki-manabu-zm@ynu.ac.jp>?Subject=Partially Adaptive Regularized Multiple Regression Analysis for Estimating Linear Causal Effects}{Manabu Kuroki}{}}


% Add affiliations after the authors
\affil[1]{%
{Chugai Pharmaceutical Co., Ltd.\\
Nihonbashi Muromachi, Chuo-ku, Tokyo, Japan} 
}
\affil[2]{%
{Yokohama National University}      \\
Tokiwadai, Hodogaya-ku, Yokohama, Japan 
}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

  \begin{document}
\maketitle

\setcitestyle{authoryear,open={},close={}}

\begin{abstract}
This paper assumes that cause-effect relationships among variables can be described with a linear structural equation model.
Then, a situation is considered where a set of observed covariates satisfies the back-door criterion but the ordinary least squares method cannot be applied to estimate linear causal effects because of multicollinearity/high-dimensional data problems. In this situation, we propose a novel regression approach, the ``partially adaptive L$_p$-regularized multiple regression analysis'' (PAL$_p$MA) method for estimating the total effects.
Different from standard regularized regression analysis, PAL$_p$MA provides a consistent or less-biased estimator of the linear causal effect.
PAL$_p$MA is also applicable to evaluating direct effects through the single-door criterion. 
Given space constraints, the proofs, some numerical experiments, and an industrial case study on setting up painting conditions of car bodies are provided in the Supplementary Material.
\end{abstract}

\section{Introduction}

\subsection{Background}

\quad~The multicollinearity problem [\citep{Frisch34}], which occurs when two or more explanatory variables are highly correlated, is an important issue in regression analysis.
If multicollinearity exists, because the performance of least squares/maximum likelihood estimators of regression coefficients is inadequate, valid results may not be obtained.
The high-dimensional data problem occurs in the framework of regression analysis when the sample size is smaller than the number of explanatory variables.
High-dimensional data analysis also suffers from multicollinearity, which causes overfitting and interferes with obtaining admissible solutions for regression coefficients.
Recently, owing to technological advances that make it possible to collect data on a large number of variables to better understand a given phenomenon of interest, multicollinearity/high-dimensional data problems have become serious in many domains.
To overcome this difficulty, numerous kinds of variable selection techniques based on regularized regression analysis, for example, the least absolute shrinkage and selection operator (LASSO), elastic net, smoothly clipped absolute deviation (SCAD) and minimax concave penalty (MCP) methods, have been proposed by many
statistical and AI researchers and practitioners [\citep{Buhlmann11}; \citep{efron04}; \citep{fan01}; \citep{Hoerl70a}; {\citep{KM2018,KM2019,KM2020}}; \citep{Tibshirani96}; \citep{geer14}; \citep{zhang10}; \citep{Zou06}; \citep{Zou05}].
%Although these methods have their advantages and disadvantages, there is no doubt that regression analysis has played a major role in avoiding the high-dimensional/multicollinearity data problem in regression techniques aimed at improving the prediction accuracy.

\quad~Currently, the role of regression analysis is not limited to the prediction of a response variable by explanatory variables; 
it also plays an important role in evaluating the linear causal effects of the treatment variable on the response variable.
In particular, the total effect, which is one of the representative linear causal effects and the main interest of this paper, is the change in the expected value of the response variable when the treatment variable is changed by one unit through an external intervention [\citep{Pearl}].
As has often been noted in the framework of statistical causal inference, to derive the consistent estimator of the total effect, in addition to the treatment variable, confounders must be included as explanatory variables in the regression model.
However, there are many confounders that have an effect on both the treatment variable and the response variable and that are highly correlated in reality.
This situation leads to the multicollinearity problem, which degrades the estimation accuracy of the total effects and can result in unreliable plans that prevent appropriate policy decision making.
On the other hand, the present countermeasures against the multicollinearity problem are formulated independently of the confounding problem.
Thus, although stable results of regression analysis may be derived by these countermeasures from the viewpoint of the prediction, they may yield a highly biased estimate of the linear causal effect.

\subsection{Contributions}

\quad~In this paper, when the cause-effect relationships among variables can be described with a linear structural equation model, we consider a situation where a set of observed covariates satisfies the back-door criterion but the ordinary least squares (OLS) method cannot be applied to estimate the total effects because of the multicollinearity/high-dimensional data problem.
In this situation, to evaluate the total effect, we propose a novel regression approach, the ``Partially Adaptive $L_p$-regularized Multiple regression Analysis'' (PAL$_p$MA) method for $p=1,2$. In particular, PAL$_1$MA has the following desirable properties:


{\noindent}(1) In statistical causal inference, it is important not to remove a treatment variable or confounders from the regression model when estimating the total effects.
However, even if some covariates are guaranteed to be important confounders from qualitative causal knowledge, standard regularized regression analysis may remove them and the treatment variable from the model, depending on the value of the regularization parameter.
In contrast, PAL$_1$MA enables us to include both the treatment variable and such covariates in the regression model, regardless of the value taken by the regularization parameter.
In particular, when we know that a set of covariates satisfies the back-door criterion, the solution path with such information can be utilized as a criterion for parameter tuning when estimating the total effects.


{\noindent}(2) Regarding PAL$_p$MA for $p=1,2$, we can derive a collapsibility condition, i.e., a sufficient condition that the $L_p$-regularized estimator of the regression coefficient of interest is consistent with the OLS estimator regardless of the value taken by the regularization parameter, and thus leads to the consistent estimator of the total effects under the condition. The collapsibility problem in regression analysis has been discussed by many researchers [\citep{Clogg92}; \citep{geng93}; \citep{guo95}; \citep{Wermuth}].
However, to the best of our knowledge, there has been much less discussion of the collapsibility problem in the context of regularized regression analysis.

{\noindent}(3) Compared to standard regularized regression analysis, PAL$_1$MA can reduce the bias or provide higher coincidence rates for the signs of the OLS estimator, even when the collapsibility conditions are violated.
In contrast, in standard regularized regression analysis, the regression coefficients can flip from positive to negative values and from negative to positive values as they shrink toward zero, depending on the value of the regularization parameter.
This phenomenon implies that standard regularized regression analysis may provide misleading qualitative results regarding the total effects compared to PAL$_1$MA.
%This also leads to the concept of the "qualitative c-equivalence" between two sets of covariates, the whole set of covariates and the selected subset of covariates by PARL-LASSO.


\quad~From these properties, PAL$_1$MA contributes to solving the multicollinearity/high-dimensional data problems of evaluating linear causal effects in the context of statistical causal inference.
Given space constraints, the proofs, some numerical experiments and an industrial case study on setting up painting conditions of car bodies [\citep{Kuroki12}] are provided in the Supplementary Material.

\section{Linear Structural Causal Model}

\quad~In the context of statistical causal inference, a directed acyclic graph that represents cause-effect relationships is called a causal diagram.
A directed graph is a pair $G=(\bm{V}, \bm{E})$, where $\bm{V}$ is a finite set of vertices and the set $\bm{E}$ of directed arrows is a subset of the set $\bm{V}{\times}\bm{V}$ of ordered pairs of distinct vertices.
In this paper, we refer to vertices in the directed acyclic graph and random variables of the linear structural equation model interchangeably.
\setcitestyle{authoryear,open={[},close={]}}
In addition, for the graph theoretic terminology used in this paper, we refer readers to \cite{Pearl09}.
\setcitestyle{authoryear,open={},close={}}

\begin{definition}\label{DEFINITION 1} (Linear Structural Causal Model)
Suppose a directed acyclic graph $G=(\bm{V}, \bm{E})$ with set $\bm{V}=\{V_{1},V_{2},\cdots,V_{m}\}$ of variables is given.
The graph $G$ is called a causal diagram when each child-parent family in the graph $G$ represents a linear structural equation model
\begin{align}
V_{i}=\mu_{v_i}+\sum_{V_{j}{\in}\mbox{pa}(V_{i})}\alpha_{v_{i}v_{j}}V_{j}+\epsilon_{v_{i}},\,\,\,i=1, 2, \ldots, m
\label{1}
\end{align}
as the data generating process, where $\mbox{pa}(V_{i})$ denotes a set of parents of $V_{i}$ in $G$ and random disturbances $\epsilon_{v_{1}},\epsilon_{v_{2}}, \ldots, \epsilon_{v_{m}}$ are assumed to be independent and identically distributed with mean $0$.
In addition, $\mu_{v_i}$ is an intercept, and $\alpha_{v_{i}v_{j}}({\neq}0)$ is called a path coefficient or a direct effect of $V_j$ on $V_i$ $(i,j=1,2,\ldots,m\,;\, i\neq j)$.
Then, equation (\ref{1}) is called a linear structural causal model (SCM) in this paper.
\end{definition}

\quad~To proceed with our discussion, we define some notation.
For univariates $X$ and $Y$ and a set of variables $\bm{Z}$, let $\sigma_{xy{\cdot}z}$ be the conditional covariance between $X$ and $Y$ given $\bm{Z}=\bm{z}$, and let $\sigma_{xx{\cdot}z}$ be the conditional variance of $X$ given $\bm{Z}=\bm{z}$.
The regression coefficient of $X$ in the regression model of $Y$ on $X$ and $\bm{Z}$ is denoted by $\beta_{yx{\cdot}z}=\sigma_{xy{\cdot}z}/\sigma_{xx{\cdot}z}$.
For sets of variables $\bm{X}$, $\bm{Y}$, and $\bm{Z}$ ($\bm{Y}$ can be univariate), let $\Sigma_{xy{\cdot}z}$ be the conditional cross-covariance matrix between $\bm{X}$ and $\bm{Y}$ given $\bm{Z}=\bm{z}$, and let $\Sigma_{xx{\cdot}z}$ be the conditional variance-covariance matrix of $\bm{X}$ given $\bm{Z}=\bm{z}$.
In addition, let $B_{yx{\cdot}z}=\Sigma^{-1}_{xx{\cdot}z}\Sigma_{xy{\cdot}z}$ denote the regression coefficient vector of $\bm{X}$ in the regression model of $Y$ on $\bm{X}$ and $\bm{Z}$.
The set of variables $\bm{Z}$ is omitted from these arguments if it is an empty set.
Similar notation is used for the remaining statistical parameters.
Furthermore, letting $\bm{X}=\{X_1,X_2,...,X_q\}$,
the $i$-th element of $B_{yx{\cdot}z}$ is denoted by $\beta_{yx_i{\cdot}x_{(i)}z}$, where $\bm{X}_{(i)}=\bm{X}\backslash \{X_i\}$ $(i=1,2,...,q)$. $\bm{0}_q$ is a $q$-dimensional zero vector. Similar notation is used for other sets of variables.

\quad~The main purpose of this paper is to estimate total effects from observed data.
The total effect $\tau_{yx}$ of $X$ on $Y$ is defined as the total sum of the products of the path coefficients on the sequence of directed arrows along all the directed paths from $X$ to $Y$.
To achieve our aim, we introduce the back-door criterion [\citep{Pearl09}] as one of the representative identifiability criteria for the total effects.
Here, when a linear causal effect can be determined uniquely from the variance/covariance parameters of the observed variables, it is said to be identifiable, that is, it can be estimated consistently.


\begin{definition}\label{DEFINITION 2} (Back-Door Criterion)
Let $\{X,Y\}$ and $\bm{Z}$ be disjoint subsets of
$\bm{V}$ in a directed acyclic graph $G$.
If a set $\bm{Z}$ of vertices satisfies the following conditions relative to an ordered pair $(X, Y)$, then $\bm{Z}$ is said to satisfy the back-door criterion relative to $(X, Y)$.
\begin{enumerate}
\item No vertex in $\bm{Z}$ is a descendant of $X$, and
\item $\bm{Z}$ d-separates $X$ from $Y$ in the graph obtained by deleting all the directed arrows emerging from $X$ from graph $G$.
\end{enumerate}


\end{definition}


If a set $\bm{Z}$ of observed variables satisfies the back-door criterion relative to $(X,Y)$ in a causal diagram $G$, 
then, the total effect $\tau_{yx}$ is identifiable and is given by the formula $\beta_{yx{\cdot}z}$ [\citep{Pearl09}].\setcitestyle{authoryear,open={[},close={]}}
For other identification conditions of linear causal effects, refer to, for example, \cite{Brito04}, {\cite{Cai08}, \cite{Chan10}}, \cite{Chen17}, \cite{Chen2017}, \cite{KP2014}, \cite{Pearl09}, \cite{Stanghellini04}, \cite{Stanghellini15} and \cite{Tian}. 
 
\quad~Here, a covariate is defined as an element of non-descendants of $X$ and $Y$.
In addition, covariates in a minimal set of variables that satisfy the back-door criterion are called confounders.
Note that such a minimal set is not unique and whether or not a certain covariate is considered a confounder depends on the selected minimal set.
Furthermore, a set of covariates satisfying the back-door criterion is also called a sufficient set of confounders; otherwise, it is called an insufficient set of confounders.
For details on the SCM, refer to the paper by \cite{Pearl09}.\setcitestyle{authoryear,open={},close={}}
Finally, the direct effect is also known as one of the representative linear causal effects.
However, we are concerned with the evaluation of the total effects because the direct effect can also be discussed in the framework of regression analysis through the ``single-door criterion'' [\citep{Pearl09}].
Thus, the total effects are identified with linear causal effects in this paper.

\section{PAL$_p$MA}

\subsection{Setup}
\quad~Let $X$, $Y$, $\bm{Z}$ and $\bm{W}$ be a treatment variable (and an explanatory variable), a response variable, an $r$-dimensional vector of explanatory variables ($\bm{Z}$ can be empty) and a $q$-dimensional vector of explanatory variables ($\bm{W}$ can be empty), respectively.
For a sample size of $n$, consider the linear regression model of $Y$ on $X$, 
$\bm{Z}$ and $\bm{W}$
\begin{align}
\bm{y}=\bm{x}\beta_{yx{\cdot}zw}+\bm{z}B_{yz{\cdot}xw}+\bm{w}B_{yw{\cdot}xz}+\bm{\epsilon}_{y{\cdot}xzw},\label{2}
\end{align}
where $\bm{x}$ and $\bm{y}$ represent $n$-dimensional observation vectors of $X$ and $Y$, respectively.
In addition, $\bm{z}$ and $\bm{w}$ are an $n\times r$ observation matrix of $\bm{Z}$ and an $n\times q$ observation matrix of $\bm{W}$, respectively.
Furthermore, $\beta_{yx{\cdot}zw}$, $B_{yz\cdot xw}$ and $B_{yw\cdot xz}$ are the regression coefficient of $X$, the regression vector of $\bm{Z}$ and the regression vector of $\bm{W}$ in equation (\ref{2}), respectively. $\bm{\epsilon}_{y{\cdot}xzw}$ is an $n$-dimensional vector of error variables.
Here, we assume that elements of $\bm{\epsilon}_{y{\cdot}xzw}$ are independent and identically distributed with mean zero and variance $\sigma_{yy{\cdot}xzw}<\infty$.
In this paper, we also assume that a treatment variable, a response variable and explanatory variables are standardized to a sample mean of zero and a variance of one in advance.
Here, we consider a situation where (i) $\bm{Z}\cup\bm{W}$ is a set of covariates satisfying the back-door criterion relative to $(X,Y)$, (ii) $\bm{Z}$ is a subset of confounders selected from prior causal knowledge (possibly an empty set, a sufficient set of confounders, or an insufficient set of confounders), and (iii) $\bm{W}$ is a set of covariates for which it is uncertain which covariate should be added to $\bm{Z}$ as a confounder, or we know that a given set of covariates satisfies the back-door criterion but the OLS method is not applicable to estimating total effects using such a set because of the multicollinearity/high-dimensional data problem.

\quad~Then, for a smaller subset of $\bm{Z}\cup\bm{W}$, if the signs of the regression coefficients of $X$ are equivalent between the regression models using $\bm{Z}\cup\bm{W}$ and a selected smaller set, the regression model using such a subset will not provide misleading qualitative results regarding the total effects.
Under the above setting, the aim of this paper is to derive a consistent or less-biased estimator of the total effect.

\quad~This paper mainly focuses on a situation where
the sum of squares matrix of $\{X\}\cup\bm{Z}$ is invertible but that of $\{X\}\cup\bm{Z}\cup\bm{W}$ is not, because if the latter is also invertible, then the total effect is estimable by the OLS method [\citep{Pearl09}].

\subsection{PAL$_p$MA estimator}
\quad~We let $s_{xy}$, $S_{zw}$ and $S_{xz}$ be the sum of cross-products between $X$ and $Y$,
the sum of the cross-product matrix between $\bm{Z}$ and $\bm{W}$ and the sum of the cross-product vectors between $X$ and $\bm{Z}$, respectively.
In addition, we let $s_{xx}$, $S_{zz}$ and $I_{q,q}$ be the sum of squares of $X$, the sum of squares matrix of $\bm{Z}$ and a $q\times q$ identity matrix, respectively.
Furthermore, $s_{xx{\cdot}zw}$, $S_{xw{\cdot}z}$ and $S_{ww{\cdot}z}$ are the conditional sum of squares of $X$ given $\bm{Z}$ and $\bm{W}$, the conditional sum of the cross-product vector between $X$ and $\bm{W}$ given $\bm{Z}$ and the conditional sum of squares matrix of $\bm{W}$ given $\bm{Z}$, respectively.
Similar notation is used for the remaining sum of squares/cross-products.
Then, the proposed method, PAL$_p$MA, is formulated as follows:

\quad~Let {
\begin{align}
\hspace*{-5mm}\left(
\begin{array}{c}
\hat{\beta}_{yx{\cdot}zw}\\ 
\hat{B}_{yz{\cdot}xw}\\
\hat{B}_{yw{\cdot}xz}
\end{array}
\right)
=\left(
\begin{array}{ccc}
s_{xx} & S_{xz} & S_{xw} \\
S^T_{xz} & S_{zz}& S_{zw}\\  
S^T_{xw} & S^T_{zw}& S_{ww} 
\end{array}
\right)^{-1}
\hspace*{-2mm}
\left(
\begin{array}{c}
s_{xy}\\ 
S_{zy}\\
S_{wy}
\end{array}
\right)\label{3}
\end{align}
}when the sum of squares matrix of the explanatory variables is invertible, and {
\begin{align}
\left(
\begin{array}{c}
\tilde{\beta}_{yx{\cdot}zw}\\ 
\tilde{B}_{yz{\cdot}xw}\\
\tilde{B}_{yw{\cdot}xz}
\end{array}
\right)
=
\left(\hspace*{-1mm}
\begin{array}{ccc}
s_{xx} & S_{xz} & S_{xw} \\
S^T_{xz} & S_{zz}& S_{zw}\\  
S^T_{xw} & S^T_{zw}& \lambda I_{q,q}+S_{ww} 
\end{array}
\hspace*{-1mm}\right)^{-1}
\hspace*{-3mm}\left(
\begin{array}{c}
s_{xy}\\ 
S_{zy}\\
S_{wy}
\end{array}
\right),\label{4}
\end{align}
}for $\lambda>0$ when the sum of squares matrix of the explanatory variables is not invertible.
Then, for $p=1,2$, consider the loss function
\begin{align}
\lefteqn{L_p(\beta_{yx{\cdot}zw},B_{yz{\cdot}xw},B_{yw{\cdot}xz})}\nonumber \\
&=&\hspace{-5mm}\frac{1}{2}||\bm{y}-\bm{x}\beta_{yx{\cdot}zw}-\bm{z}B_{yz{\cdot}xw}-\bm{w}B_{yw{\cdot}xz}||^2_2 \nonumber \\
&&+\lambda_{p}||\bm{\gamma}\odot B_{yw{\cdot}xz}||_{p}^{{p}},\label{5}
\end{align}
where $\bm{\gamma}=(\gamma_{1},\gamma_{2},...,\gamma_{q})^T$ is a weight vector such that
{\begin{align}
\bm{\gamma}=\left(\frac{1}{|\tilde{\beta}_{yw_{1}{\cdot}xzw_{(1)}}|^{\xi}},\ldots,\frac{1}{|\tilde{\beta}_{yw_{q}{\cdot}xzw_{(q)}}|^{\xi}}\right)^{T}\label{6}
\end{align}}
for the non-invertible sum of squares matrix of the explanatory variables with tuning parameter $\xi\geq 0$, and
{\begin{align}
\bm{\gamma}=\left(\frac{1}{|\hat{\beta}_{yw_{1}{\cdot}xzw_{(1)}}|^{\xi}},\ldots,\frac{1}{|\hat{\beta}_{yw_{q}{\cdot}xzw_{(q)}}|^{\xi}}\right)^{T}\label{7}
\end{align}}
for the invertible sum of squares matrix of the explanatory variables with tuning parameter $\xi\geq 0$, where the superscript ``$T$'' stands for a transposed vector/matrix.
In addition, $\odot$ refers to the Hadamard product. $||\cdot||_p$ denotes the $L_p$ norm, and $\lambda_p$ is called a regularization parameter corresponding to the $L_p$ norm ($\lambda_p\geq 0$).
$|\cdot |$ stands for the absolute value.
The loss function (equation (\ref{5})) is different from standard $L_p$-regularized loss functions in the sense that the regularization parameter $\lambda_p$ is not assigned to
$\beta_{yx{\cdot}zw}$ or $B_{yz{\cdot}xw}$.
In this sense, equation (\ref{5}) is called a partially adaptive $L_p$-regularized loss function in this paper.
Here, under the assumption that the sum of squares matrix of explanatory variables $\{X\}\cup\bm{Z}\cup\bm{W}$ is invertible, letting $\lambda_p=0$,
${\beta}_{yx{\cdot}zw}$, ${B}_{yz{\cdot}xw}$ and ${B}_{yw{\cdot}xz}$ that minimize equation (\ref{5}) yield equation (\ref{3}), i.e., the OLS estimators $\hat{\beta}_{yx{\cdot}zw}$, $\hat{B}_{yz{\cdot}xw}$ and $\hat{B}_{yw{\cdot}xz}$ of equation (\ref{2}), respectively.
Letting $\lambda_2=\lambda$ and $\xi=0$,
${\beta}_{yx{\cdot}zw}$, ${B}_{yz{\cdot}xw}$ and ${B}_{yw{\cdot}xz}$ that minimize equation (\ref{5}) yield equation (\ref{4}), i.e., the ridge-type estimators $\tilde{\beta}_{yx{\cdot}zw}$, $\tilde{B}_{yz{\cdot}xw}$ and $\tilde{B}_{yw{\cdot}xz}$ of equation (\ref{2}), respectively.


\quad~For $p=1$ and $\lambda_1>0$, ${\beta}_{yx{\cdot}zw}$, ${B}_{yz{\cdot}xw}$ and ${B}_{yw{\cdot}xz}$ that minimize equation (\ref{5}) are called PAL$_1$MA estimators, denoted by $\check{\beta}^{\dagger}_{yx{\cdot}zw}$, $\check{B}^{\dagger}_{yz{\cdot}xw}$ and $\check{B}^{\dagger}_{yw{\cdot}xz}$, respectively.
{If $\bm{W}$ is an active set for a given $\lambda_1>0$}, that is, a subset of explanatory variables with nonzero regression coefficients, but does not include any elements of $\{X\}\cup\bm{Z}$ (i.e., no element of $\check{B}^{\dagger}_{yw{\cdot}xz}$ takes the value zero for the given $\lambda_1>0$),
and letting $q$ be the number of explanatory variables in the active set $\bm{W}$,  then under the assumption that the sum of squares matrix of explanatory variables $\{X\}\cup\bm{Z}\cup\bm{W}$ is invertible,
$\check{\beta}^{\dagger}_{yx{\cdot}zw}$ is given by
\begin{align}
{\check{\beta}^{\dagger}_{yx{\cdot}zw}
=\hat{\beta}_{yx{\cdot}zw}+
\frac{\displaystyle \lambda_1}{\displaystyle s_{xx{\cdot}zw}} \hat{B}^T_{xw{\cdot}z}\bm{\gamma}\odot\mbox{sign}(\check{B}^{\dagger}_{yw{\cdot}xz}).}\label{8}
\end{align}
Here, $\hat{B}_{xw{\cdot}z}$ is given by $\hat{B}_{xw{\cdot}z}=S^{-1}_{ww{\cdot}z}S_{wx{\cdot}z}$.
In addition, for a $q$-dimensional vector $\bm{a}=(a_1,a_2,...,$
$a_q)^T$, sign$(\bm{a})=(\mbox{sign}(a_1),\mbox{sign}(a_2),...,\mbox{sign}(a_q))^T$, where {\begin{align}
\mbox{sign} (a_i)=
{\begin{cases}1&:\ a_i>0\\
0&:\ a_i=0\\
-1&:\ a_i<0
\end{cases}}\label{9}
\end{align}
for $i=1,2,...,q$.}

\quad~For $p=2$ and $\lambda_2>0$, ${\beta}_{yx{\cdot}zw}$, ${B}_{yz{\cdot}xw}$ and ${B}_{yw{\cdot}xz}$ that minimize equation (\ref{5}) are called PAL$_2$MA estimators, denoted by 
$\tilde{\beta}^{\dagger}_{yx{\cdot}zw}$, $\tilde{B}^{\dagger}_{yz{\cdot}xw}$ and $\tilde{B}^{\dagger}_{yw{\cdot}xz}$, respectively.
Then, $\tilde{\beta}^{\dagger}_{yx{\cdot}zw}$ is given by
\begin{align}
\lefteqn{\hspace*{-3mm}\tilde{\beta}^{\dagger}_{yx{\cdot}zw}=\hat{\beta}_{yx{\cdot}zw}}\nonumber \\
&&\hspace*{-9mm}+
\frac{\lambda_2  \hat{B}^T_{yw{\cdot}xz}(S_{ww{\cdot}z}+\lambda_2 \mbox{diag}(\bm{\gamma}))^{-1}S_{wx{\cdot}z}}{s_{xx{\cdot}z}-S_{xw{\cdot}z}(S_{ww{\cdot}z}+\lambda_2 \mbox{diag}(\bm{\gamma}))^{-1}S_{wx{\cdot}z}},\label{10}
\end{align}
where $\mbox{diag}(\bm{\gamma})$ is a diagonal matrix whose $(i,i)$ element corresponds to the $i$-th element of $\bm{\gamma}$ $(i=1,2,...,q)$.

\subsection{L$_p$ collapsibility}

\quad~In this section, we extend the concept of collapsibility from the framework of traditional regression analysis to regularized regression analysis as follows:


\begin{definition} ($L_p$ Collapsibility)\label{DEFINITION 3}
For a given $p$, $\bm{W}$ is said to be $L_p$ collapsible with the regression coefficient of $X$ on $Y$ in regression model (\ref{2}) when the coefficient does not depend on $\bm{W}$ or the regularization parameter $\lambda_p$.
In particular, when
$\bm{W}$ is $L_p$ collapsible with the regression coefficient of $X$ on $Y$ in regression model (\ref{2}) for $p=1,2$,
$\bm{W}$ is said to be collapsible with the regression coefficient of $X$ on $Y$ in regression model (\ref{2}).


\end{definition}

From equations (\ref{8}) and (\ref{10}), the following theorem is derived immediately:


\begin{theorem}\label{THEOREM 1}
For $p=1,2$, when the sum of squares matrix of $\bm{Z}\cup\bm{W}$ is invertible, if $S_{xw{\cdot}z}=\bm{0}_q$ holds, $\bm{W}$ is collapsible with the regression coefficient of $X$ on $Y$ in regression model (\ref{2}), i.e., we have
\begin{align}
\check{\beta}^{\dagger}_{yx{\cdot}zw}=\tilde{\beta}^{\dagger}_{yx{\cdot}zw}=\hat{\beta}_{yx{\cdot}zw}=\hat{\beta}_{yx{\cdot}z}.\label{11}
\end{align}
Particularly, if $X$ is conditionally independent of $\bm{W}$ given $\bm{Z}$, we have
\begin{align}
\hspace*{-2mm}E(\check{\beta}^{\dagger}_{yx{\cdot}zw})=E(\tilde{\beta}^{\dagger}_{yx{\cdot}zw})=E(\hat{\beta}_{yx{\cdot}zw})=E(\hat{\beta}_{yx{\cdot}z}).\label{12}
\end{align}


\end{theorem}
Note that $\bm{W}$ is assumed to be an active set for $p=1$ in Theorem \ref{THEOREM 1}.


\begin{theorem}\label{THEOREM 2}
For $p=2$, when the sum of squares matrix of $\{X\}\cup\bm{Z}\cup\bm{W}$ is invertible, if $S_{yw{\cdot}xz}=\bm{0}_q$ holds, 
$\bm{W}$ is $L_2$ collapsible with the regression coefficient of $X$ on $Y$ in regression model (\ref{2}), i.e., we have
\begin{align}
\tilde{\beta}^{\dagger}_{yx{\cdot}zw}=\hat{\beta}_{yx{\cdot}zw}=\hat{\beta}_{yx{\cdot}z}. \label{13}
\end{align}
Particularly, if $Y$ is conditionally independent of $\bm{W}$ given $X$ and $\bm{Z}$, we have
\begin{align}
E(\tilde{\beta}^{\dagger}_{yx{\cdot}zw})=E(\hat{\beta}_{yx{\cdot}zw})=E(\hat{\beta}_{yx{\cdot}z}).\label{14}
\end{align}

\end{theorem}

\quad~Generally, standard regularized regression analysis does not provide consistent estimators of the regression coefficients.
In contrast, from Theorem \ref{THEOREM 1}, for $p=1,2$, PAL$_p$MA provides the consistent estimator of the regression coefficient of $X$ on $Y$ if $X$ and $\bm{W}$ are conditionally independent given $\bm{Z}$, regardless of the regularization parameter.
In other words, when $\bm{W}$ is $L_p$ collapsible with the regression coefficient of $X$ on $Y$ and $\bm{Z}$ satisfies the back-door criterion relative to $(X,Y)$ in regression model (\ref{2}),
PAL$_p$MA can provide a consistent estimator of the total effect.
On the other hand, when $X$ is not conditionally independent of $\bm{W}$ given $\bm{Z}$, PAL$_p$MA may provide a biased estimator of the regression coefficient of $X$ on $Y$.

\quad~To reduce the bias, consider a partially adaptive $L_2$-regularized loss function with a weight vector $\bm{\gamma}^*$ and a tuning parameter $\xi^*$ such that $\bm{x}$ and
$\bm{y}$ are replaced by an empty set and $\bm{x}$ in equation (\ref{5}), respectively.
Letting $\tilde{B}_{xw{\cdot}z}^{\dagger}$ and $\tilde{B}^{\dagger}_{xz{\cdot}w}$ be PAL$_2$MA estimators of $B_{xw{\cdot}z}$ and $B_{xz{\cdot}w}$ derived from such a loss function, respectively, from equation (\ref{8}), we formulate the modified PAL$_1$MA estimator of $\beta_{yx{\cdot}zw}$ as
\begin{align}
%\lefteqn{
\hspace*{-3mm}
\check{\beta}^*_{yx{\cdot}zw}
%}\nonumber \\
%&&\hspace*{-6mm}
=\check{\beta}^{\dagger}_{yx{\cdot}zw}-\frac{\lambda_1}{\tilde{s}^{\dagger}_{xx{\cdot}zw}}\tilde{B}_{xw{\cdot}z}^{\dagger T}\bm{\gamma}\odot\mbox{sign}(\check{B}_{yw{\cdot}xz}^{\dagger}),\label{15}\\
%\end{align}\\
\hspace*{-20mm}
\tilde{s}^{\dagger}_{xx{\cdot}zw}
=||\bm{x}-\bm{z}\tilde{B}^{\dagger}_{xz{\cdot}w}-\bm{w}\tilde{B}^{\dagger}_{xw{\cdot}z}||^2_2\label{16}
\end{align}
for an active set $\bm{W}$. When the sum of squares matrix of $\{X\}\cup \bm{Z}\cup \bm{W}$ is invertible, we have
%\begin{equation}
\begin{align}
\lefteqn{\hspace*{-1.5cm}\check{\beta}^*_{yx{\cdot}zw}=\check{\beta}^{\dagger}_{yx{\cdot}zw}-\frac{\lambda_1}{\tilde{s}^{\dagger}_{xx{\cdot}zw}}\tilde{B}_{xw{\cdot}z}^{\dagger T}\bm{\gamma}\odot\mbox{sign}(\check{B}_{yw{\cdot}xz}^{\dagger})}\nonumber \\
\lefteqn{\hspace*{-1cm}=\hat{\beta}_{yx{\cdot}zw}+\lambda_1 \left(
\frac{\displaystyle 1}{\displaystyle s_{xx{\cdot}zw}}\hat{B}_{xw{\cdot}z}-\frac{1}{\tilde{s}_{xx{\cdot}zw}^{\dagger}}\tilde{B}_{xw{\cdot}z}^{\dagger}\right)^T}\nonumber \\
&&\times \bm{\gamma}\odot\mbox{sign}(\check{B}_{yw{\cdot}xz}^{\dagger}).
\label{17}
\end{align}
%\end{equation}
Thus, when $\bm{Z}\cup\bm{W}$ satisfies the back-door criterion, if $\hat{B}_{xw{\cdot}z}=\tilde{B}^{\dagger}_{xw{\cdot}z}$ and $s_{xx{\cdot}zw}=\tilde{s}^{\dagger}_{xx{\cdot}zw}$ hold (i.e., these estimators do not depend on the regularization parameter), then the total effect is estimated by $\check{\beta}^{*}_{yx{\cdot}zw}$.
In addition, since we have
\begin{align}
\lefteqn{\check{\beta}^*_{yx{\cdot}zw}=\hat{\beta}_{yx{\cdot}zw}+\lambda_1 \hat{B}^T_{xw{\cdot}z}}\nonumber \\
&&\hspace*{-5mm}\times
\left(
\frac{I_{q, q}}{s_{xx{\cdot}zw}}
-\frac{S_{ww{\cdot}z}(S_{ww{\cdot}z}+\lambda_2 \mbox{diag}(\bm{\gamma}^*))^{-1}}{\tilde{s}_{xx{\cdot}zw}^{\dagger}}
\right)\nonumber \\
&&\times \bm{\gamma}\odot\mbox{sign}(\check{B}_{yw{\cdot}xz}^{\dagger}),\label{18}
%
%+\frac{\lambda_1  \lambda^2_2}{s_{xx{\cdot}zw}\tilde{s}_{xx{\cdot}zw}^{\dagger}} \hat{B}_{xw{\cdot}z}
%(S_{ww{\cdot}z}+\lambda_2 \mbox{diag}(\bm{\gamma}))^{-1}\nonumber \\
%&&\times S_{wx{\cdot}z}\hat{B}_{xw{\cdot}z}(S_{ww{\cdot}z}+\lambda_2 \mbox{diag}(\bm{\gamma}))^{-1}\nonumber \\
%&&\times \mbox{\boldmath
\end{align}
if {$S_{xw{\cdot}z}=\bm{0}_{q}$}, the total effect is also estimated by $\check{\beta}^{*}_{yx{\cdot}zw}$.

\quad~On the other hand, even when the sum of squares matrix of $\{X\}\cup \bm{Z}\cup \bm{W}$ is not invertible, by taking a small value of $\lambda_2>0$ such that $S_{ww{\cdot}z}+\lambda_2 \mbox{diag}(\bm{\gamma}^{*})$ is invertible in equation (\ref{18}), the modified PAL$_1$MA can provide a less-biased estimator of the total effects.
Hereafter, the modified PAL$_1$MA estimator is merely called the PAL$_1$MA estimator.

\subsection{i-PROGLES}

\quad~Similar to standard regularized regression analysis such as LASSO, adaptive LASSO and elastic net, it is difficult to provide the explicit formula of the PAL$_1$MA estimator of the regression coefficient of $X$ on $Y$, since equation (\ref{5}) includes the non-differentiable term $||\bm{\gamma}\odot B_{yw{\cdot}xz}||^1_1$; the optimization algorithm is needed to derive the PAL$_1$MA estimator.
Here, note that standard LASSO algorithms such as least angle regression [\citep{efron04}] and generalized path seeking [\citep{friedman12}] are not applicable to achieve our aim since neither $\beta_{yx{\cdot}zw}$ nor $B_{yz{\cdot}xw}$ is regularized in equation (\ref{5}).

\quad~To derive the PAL$_1$MA estimator $\check{\beta}^{*}_{yx{\cdot}zw}$, we propose a novel optimization algorithm that adopts the idea of the block coordinate relaxation method [\citep{sardya00}]: ``integrated algorithm of PROximal Gradient method and LEast Squares method'' (i-PROGLES).
i-PROGLES, which is shown in Algorithm \ref{ALGORITHM 1}, can be considered an integrated iterative algorithm of the proximal gradient method [\citep{Daubechies}] and the OLS method. i-PROGLES enables us to include both the treatment variable and some of the important confounders in the regression model, regardless of the values taken by the regularization parameters.
In addition, if we know that a set of covariates satisfies the back-door criterion, the solution path with such information can be utilized as the criteria of parameter tuning of i-PROGLES to include the set of covariates. 

\begin{figure*}[!t]


\begin{center}
\begin{minipage}{15cm}
\begin{algorithm}[H]
%\baselineskip 5.5mm
\caption{\label{ALGORITHM 1}
%integrated algorithm of PROximal Gradient Method and LEast Squares Method (
: i-PROGLES\hspace*{\fill}(both $\lambda_{2}$ and $\xi_2$ are used to derive $\tilde{B}^{\dagger}_{xw{\cdot}z}$ and $\tilde{s}^{\dagger}_{xx{\cdot}zw}$)
%)
}
{\begin{algorithmic}[1]
 \renewcommand{\algorithmicrequire}{\textbf{Input:}}
 \renewcommand{\algorithmicensure}{\textbf{Output:}}

\REQUIRE
$\bm{x}$, $\bm{y}$, $\bm{z}$ and $\bm{w}$, $k^*>0$, $\lambda_1\geq 0$,
$\lambda_2\geq 0$, $\xi_1 >0$, $\xi_2 >0$

%\ENSURE  $\beta_{yx{\cdot}zw},B_{yz{\cdot}xw}$ and $B^{\sharp}_{yw{\cdot}xz}$\\ \textit{Initialization} :Set
\begin{displaymath}
\beta_{yx{\cdot}zw}[0]=\hat{\beta}_{yx{\cdot}z},\,\,\,
B_{yz{\cdot}xw}[0]=\hat{B}_{yz{\cdot}x}
\end{displaymath}
\begin{displaymath}
B^{\sharp}_{yw{\cdot}xz}[0]=\mathop{\mbox{argmin}}_{B}\left(
\frac{1}{2}|| \bm{y}-
\bm{x}\hat{\beta}_{yx{\cdot}z}-\bm{z}\hat{B}_{yz{\cdot}x}-
\bm{w}^{\sharp} B||^{2}_{2}+\lambda_1||B||_{1}\right)
\end{displaymath}
%\textit{
Calculate the weight vector:
%} :
If the sum of squares matrix of the explanatory variables is not invertible, set
{\begin{displaymath}
\bm{\gamma}=\left(\frac{1}{|\tilde{\beta}_{yw_{1}{\cdot}xzw_{(1)}}|^{\xi_1}},\frac{1}{|\tilde{\beta}_{yw_{2}{\cdot}xzw_{(2)}}|^{\xi_1}},\ldots,\frac{1}{|\tilde{\beta}_{yw_{q}{\cdot}xzw_{(q)}}|^{\xi_1}}\right)^{T}
\end{displaymath}}
If the sum of squares matrix of the explanatory variables is invertible, set
{\begin{displaymath}
\bm{\gamma}=\left(\frac{1}{|\hat{\beta}_{yw_{1}{\cdot}xzw_{(1)}}|^{\xi_1}},\frac{1}{|\hat{\beta}_{yw_{2}{\cdot}xzw_{(2)}}|^{\xi_1}},\ldots,\frac{1}{|\hat{\beta}_{yw_{q}{\cdot}xzw_{(q)}}|^{\xi_1}}\right)^{T}
\end{displaymath}}
\FOR {$k=0$ to\ $k^{*}$}
\STATE Set
\begin{displaymath}
\eta\leq (\lambda_{\mbox{max}}(S_{ww}^{\sharp}))^{-1}
\end{displaymath}
\begin{displaymath}
B^{\sharp}_{yw{\cdot}xz}[k+1]=\mbox{prox}_{\eta\lambda_{1}}(B^{\sharp}_{yw{\cdot}xz}[k]-\eta(
S^{\sharp}_{wx} \beta_{yx{\cdot}zw}[k]+S^{\sharp}_{wz} B_{yz{\cdot}xw}[k]+ S^{\sharp}_{ww}B^{\sharp}_{yw{\cdot}xz}[k]-S^{\sharp}_{wy}))
\end{displaymath}
%q\STATE  Set $q$ to the number of explanatory variables in an active set $\bm{W}$
\STATE Set
\begin{displaymath}
B_{yw{\cdot}xz}[k+1]=\left(\gamma_{1}^{-1}\beta_{yw_{1}{\cdot}xzw_{(1)}}^{\sharp}[k+1],\gamma_{2}^{-1}\beta_{yw_{2}{\cdot}xzw_{(2)}}^{\sharp}[k+1],\ldots,\gamma_{q}^{-1}\beta^{\sharp}_{yw_{q}{\cdot}xzw_{(q)}}[k+1]\right)^T
\end{displaymath}
%for an active set $\bm{W}$.
\STATE Set
\begin{displaymath}
\beta_{yx{\cdot}zw}[k+1]=
\hat{\beta}_{yx{\cdot}z}-\hat{B}_{wx{\cdot}z}B_{yw{\cdot}xz}[k+1],\,\,\,\,\,\,
%\end{displaymath}
%\begin{displaymath}
B_{yz{\cdot}xw}[k+1]=
\hat{B}_{yz{\cdot}x}-\hat{B}_{wz{\cdot}x}B_{yw{\cdot}xz}[k+1]
\end{displaymath}
%  \IF {($i \ne 0$)}
%  \STATE statement..
%  \ENDIF
\ENDFOR
  \STATE Set
\begin{displaymath}
\check{\beta}^*_{yx{\cdot}zw}=\beta_{yx{\cdot}zw}[k^{*}+1]-\frac{\lambda_1}{\tilde{s}^{\dagger}_{xx{\cdot}zw}}\tilde{B}_{xw{\cdot}z}^{\dagger T}\bm{\gamma}\odot\mbox{sign}(B_{yw{\cdot}xz}[k^{*}+1])
\end{displaymath}
 \RETURN  $\check{\beta}^*_{yx{\cdot}zw}$
\end{algorithmic}}
\end{algorithm}
\end{minipage}
\end{center}


\end{figure*}

\quad~To construct i-PROGLES, let $\bm{w}_i$ be an $n$-dimensional observation vector of the $i$-th explanatory variable $W_i$ of $\bm{W}$ $(W_i\in \bm{W}: i=1,2,...,q)$.
In addition, based on the weight vector $\bm{\gamma}$ from equations (\ref{6}) and (\ref{7}), we define the $n\times q$ matrix $\bm{w}^{\sharp}$ and the $q$-dimensional vector $B^{\sharp}_{yw{\cdot}xz}$ as $\bm{w}^{\sharp}=\left(\gamma_{1}^{-1}\bm{w}_{1},\gamma_{2}^{-1}\bm{w}_{2},\ldots,\gamma_{q}^{-1}\bm{w}_{q}\right)$ and $B^{\sharp}_{yw{\cdot}xz}=\bm{\gamma}\odot B_{yw{\cdot}xz}$, respectively.
Then, for $p=1$, equation (\ref{5}) is reformulated as
\begin{align}
\lefteqn{L^{\sharp}_1(\beta_{yx{\cdot}zw},B_{yz{\cdot}xw},B^{\sharp}_{yw{\cdot}xz})}\nonumber\\
&=&\frac{1}{2}||\bm{y}-\bm{x}\beta_{yx{\cdot}zw}-\bm{z}B_{yz{\cdot}xw}-\bm{w}^{\sharp} B^{\sharp}_{yw{\cdot}xz}||^2_2 \nonumber \\
&&+\lambda_{1}||B^{\sharp}_{yw{\cdot}xz}||^1_{1}.\label{19}
\end{align}
Here, $B^{\sharp}_{yw{\cdot}xz}[0]$ is defined as the solution of equation (\ref{19}) given $\beta_{yx{\cdot}zw}=\hat{\beta}_{yx{\cdot}z}(=\beta_{yx{\cdot}zw}[0])$ and ${B}_{yz{\cdot}xw}=\hat{B}_{yz{\cdot}x}(={B}_{yz{\cdot}xw}[0])$.
Based on equation (\ref{19}), in the first substep of the $k+1$-th step $(k\geq 0)$, we evaluate $B^{\sharp}_{yw{\cdot}xz}$ as the solution of the naive LASSO given $\beta_{yx{\cdot}zw}={\beta}_{yx{\cdot}zw}[k]$ and ${B}_{yz{\cdot}xw}=B_{yz{\cdot}xw}[k]$:
\begin{align}
\lefteqn{
\hspace*{-5mm}B^{\sharp}_{yw{\cdot}xz}[k+1]}\nonumber\\
&&\hspace*{-11mm}=\mathop{\mbox{argmin}}_{B}\left( L^{\sharp}_1(\beta_{yx{\cdot}zw}[k],B_{yz{\cdot}xw}[k],B)\right).\label{20}
\end{align}
Here, letting $S_{ww}^{\sharp}$, $S_{wy}^{\sharp}$, $S_{wx}^{\sharp}$ and $S_{wz}^{\sharp}$ be the sum of squares matrix of $\bm{W}^{\sharp}$, the sum of cross-products vector between $\bm{W}^{\sharp}$ and $Y$, the sum of cross-products vector between $\bm{W}^{\sharp}$ and $X$ and the sum of cross-products matrix between $\bm{W}^{\sharp}$ and $\bm{Z}$, respectively, and
\begin{align}
\lefteqn{\hspace*{-5mm}f^{\sharp}(\beta_{yx{\cdot}zw},B_{yz{\cdot}xw},B_{yw{\cdot}xz})}\nonumber \\
&&\hspace*{-10mm}=\frac{1}{2}||\bm{y}-\bm{x}\beta_{yx{\cdot}zw}-\bm{z}B_{yz{\cdot}xw}-\bm{w}^{\sharp}B_{yw{\cdot}xz}||^2_2, \label{21}
\end{align}
%In Algorithm 1, note that the proximal gradient method (Daubechies et al, 2004) is also utilized to derive equation (18).Thus
$B^{\sharp}_{yw{\cdot}xz}[k+1]$ is formulated by
\begin{align}
\lefteqn{B^{\sharp}_{yw{\cdot}xz}[k+1]=\mbox{prox}_{\eta\lambda_1}\left(
\frac{}{}B^{\sharp}_{yw{\cdot}xz}[k]-\eta\right.}\label{22}\\
&&\hspace{-8mm}\left. \times\frac{\displaystyle \partial}{\displaystyle \partial  B}f^{\sharp}\left(\beta_{yx{\cdot}zw}[k],B_{yz{\cdot}xw}[k],B\right)_{B=B^{\sharp}_{yw{\cdot}xz}[k]}\right), \nonumber 
\end{align}
which is straightforward from equation (\ref{20}) through the proximal gradient method [\citep{Daubechies}] given $\beta_{yx{\cdot}zw}[k]$ and $B_{yz{\cdot}xw}[k]$.
{In this paper, $\mbox{prox}_{a}(b)$ is defined as
\begin{align}
\mbox{prox}_{a}(b)=
{\begin{cases}b-a&:\ b\ge a\\
0&:\ -a< b<a\\
b+a&:\ b\le -a
\end{cases}}.\label{23}
\end{align}}
In addition, noting
\begin{align}
\lefteqn{\frac{\displaystyle \partial}{\displaystyle \partial  B}f^{\sharp}\left(\beta_{yx{\cdot}zw}[k],B_{yz{\cdot}xw}[k],B\right)_{B=B^{\sharp}_{yw{\cdot}xz}[k]}}\label{24}\\
&=&\hspace{-3mm}S^{\sharp}_{wx} \beta_{yx{\cdot}zw}[k]+S^{\sharp}_{wz} B_{yz{\cdot}xw}[k]+S^{\sharp}_{ww}B^{\sharp}_{yw{\cdot}xz}[k]-S^{\sharp}_{wy},\nonumber
\end{align}
we have
\begin{align}
\lefteqn{\hspace*{-9mm}B^{\sharp}_{yw{\cdot}xz}[k+1]=\mbox{prox}_{\eta\lambda_1}(B^{\sharp}_{yw{\cdot}xz}[k]-\eta\,\,(S^{\sharp}_{wx} \beta_{yx{\cdot}zw}[k]}
\nonumber \\
&&\hspace*{-13mm}
+
S^{\sharp}_{wz} B_{yz{\cdot}xw}[k]+ S^{\sharp}_{ww}B^{\sharp}_{yw{\cdot}xz}[k]-S^{\sharp}_{wy})),\label{25}
\end{align}
where $\eta$ satisfies $\eta\leq (\lambda_{\mbox{max}}(S_{ww}^{\sharp}))^{-1}$. Here, 
$\lambda_{\mbox{max}}(S_{ww}^{\sharp})$, which is the maximum eigenvalue of $S_{ww}^{\sharp}$, corresponds to the Lipschitz constant with respect to $(\partial/\partial B_{yw{\cdot}xz})f^{\sharp}$.

%Here, note that $B_{yw{\cdot}xz}[k]$ is recoverable from $B^{\sharp}_{yw{\cdot}xz}[k]$ because $\bm{\gamma}$ is a given constant vector.

\quad~In the second substep of the $k+1$-th step, we evaluate $\beta_{yx{\cdot}zw}[k+1]$ and $B_{yz{\cdot}xw}[k+1]$ by the OLS method given $B_{yw{\cdot}xz}=B_{yw{\cdot}xz}[k+1]$:
\begin{align}
\lefteqn{\left( \beta_{yx{\cdot}zw}[k+1], B^T_{yz{\cdot}xw}[k+1]\right)^{T}}\nonumber \\
&&=\mathop{\mbox{argmin}}_{b,B}\left(f^{\sharp}(b,B,B^{\sharp}_{yw{\cdot}xz}[k+1])\right)\label{26}\\
&&\hspace{-9.5mm}=\left(
\begin{array}{cc}
s_{xx}&S_{xz}\\
S^T_{xz}&S_{zz}
\end{array}\right)^{-1}
\left(
\begin{array}{c}
\bm{x}^T\\
\bm{z}^T
\end{array}\right)
\left(\bm{y}-\bm{w}B_{yw{\cdot}xz}[k+1]\right).\nonumber 
\end{align}
Regarding the convergence of i-PROGLES, the following theorem can be derived:
\begin{theorem}\label{THEOREM 3}
Let $\{\beta_{yx{\cdot}zw}[k]\}_{k\ge 0}$, $\{B_{yz{\cdot}xw}[k]\}_{k\ge 0}$ and $\{B_{yw{\cdot}xz}[k]\}_{k\ge 0}$ be the sequences of $\beta_{yx{\cdot}zw}$, $B_{yz{\cdot}xw}$ and $B_{yw{\cdot}xz}$, respectively, generated by i-PROGLES, and let $\bm{u}=(\bm{x},\bm{z})$.
When $\beta_{yx{\cdot}zw}^{*}$, $B_{yz{\cdot}xw}^{*}$ and $B^{*}_{yw{\cdot}xz}$ minimize equation (\ref{19}) with respect to $\beta_{yx{\cdot}zw}$, $B_{yz{\cdot}xw}$ and $B_{yw{\cdot}xz}$, respectively, for any $\epsilon>0$, there exists a natural number $K$ such that
\begin{align}
\lefteqn{L_{1}\left(\beta_{yx{\cdot}zw}[k+1], B_{yz{\cdot}xw}[k+1], B_{yw{\cdot}xz}[k+1]\right)}\nonumber \\
&&-L_{1}\left(\beta_{yx{\cdot}zw}^{*}, B_{yz{\cdot}xw}^{*}, B_{yw{\cdot}xz}^{*}\right)\nonumber\\
\lefteqn{\le \frac{\lambda_{\mbox{max}}(S^{\sharp}_{ww})}{2k}||B_{yw{\cdot}xz}^{\sharp}[0]-B_{yw{\cdot}xz}^{\sharp*}||_{2}^{2}}\nonumber \\
\lefteqn{+\frac{\lambda_{\mbox{max}}(S_{uu})}{2}\lambda_{\mbox{max}}(S_{wu}^{\sharp}S_{uu}^{-2}S_{uw}^{\sharp})\epsilon}\label{27}
\end{align}
holds for any $k\ge K$, where $B^{\sharp}_{yw{\cdot}xz}[k]=\bm{\gamma}\odot B_{yw{\cdot}xz}[k]$ and $B^{\sharp *}_{yw{\cdot}xz}=\bm{\gamma}\odot B_{yw{\cdot}xz}^{*}$.
%and $l_{f}$ is Lipschitz constant with respect to
%\begin{displaymath}
%\frac{\displaystyle \partial}{\displaystyle \partial  B_{yw{\cdot}xz}}f
%\end{displaymath}
%of equation (23).
\end{theorem}

The proof is given in the Supplementary Material.
%From Theorem 3, the required number of iterations in i-PROGLES to obtain an $\epsilon$-optimal solution is at most
%\begin{align}
%\left[\frac{\lambda_{\mbox{max}}(S_{ww}^{\sharp})||B_{yw{\cdot}xz}^{\sharp}[0]-B_{yw{\cdot}xz}^{*\sharp}||_{2}^{2}}{2\epsilon}\right],
%\end{align}
%where $[\cdot]$ stands for the ceiling function.

\section{Numerical Experiment}
\begin{table*}[!ttt]
\begin{center}

Table 1. Results based on cross-validation.


{ \begin{center}
\begin{tabular}{ccccccccccc} \cline{1-11}
 & \multicolumn{5}{c}{(a) $\tau_{yx}=0.474$} & \multicolumn{5}{c}{parameter settings} \\\cline{2-11}
 & mean & bias & mse & sd & sign & $\lambda$ & $\xi$ & $\phi$ & $\lambda_1$ & $\xi_1$ \\\cline{1-11}
LASSO & 0.1812 & 0.2929 & 0.1012 & 0.1238 & 0.8824& 0.0830 & - & - & - & -  \\
adaptive LASSO & 0.2736 & 0.2006 & 0.0776 & 0.1934 & 0.8932& 3.4300 &1.7000 &-& - & - \\
Elastic net & 0.2101 & 0.2641 & 0.0807 & 0.1047 & 0.9664  & 0.0780 & - & 0.5500 & - & -   \\
MCP & 0.2290 & 0.2451 & 0.0862 & 0.1617 & 0.8462  & 0.0600 & 19.5000 & - &- & -  \\
SCAD & 0.1909 & 0.2832 & 0.1032 & 0.1517 & 0.8216 & 0.0860 & 15.5000 & - & - & - \\
PAL$_1$MA &0.4486 & 0.0256 & 0.0640 & 0.2516 & 0.9746 & - & - & - & 0.0100 & 0.1000   \\
OLS & 0.4717 & 0.0025 & 0.2961 & 0.5441 & 0.8154 & - & - & - & - &-  \\\cline{1-11}
\end{tabular}
\end{center}

}

\end{center}
{mean: sample mean; bias: bias between the true value and the sample mean; mse: mean squared error; sd: standard deviation; sign: coincidence rate between the signs of the true value and the estimates; $\lambda$, $\lambda_1$: regularization parameters; $\xi$, $\xi_1$: tuning parameters; $\phi$: mixing parameter. The regularization parameter $\lambda_2$ and tuning parameter $\xi_2$ are selected as $\lambda_{2}=0.0014$, $\xi_{2}=0.0013$. Refer to the Supplementary Material for the selection of these parameters.}     \captionsetup{labelformat=empty,labelsep=none}\caption{\label{t1}}
\end{table*}
\quad~In this section, we present a numerical experiment to compare the performance of LASSO, adaptive LASSO, elastic net, SCAD, MCP, OLS and PAL$_1$MA.
For simplicity, letting $X$ and $Y$ be the treatment variable and the response variable, respectively, consider the linear SCM with 42 explanatory variables for $Y$ in the form of
\begin{align}
\hspace*{-5.0mm}\left.
\begin{array}{l}
Y=\alpha_{yx}X+\alpha_{yz_1}Z_1+\alpha_{yz_2}Z_2+A_{yw}\mbox{\boldmath $W$}+\epsilon_{y}\\
X=\alpha_{xz_1}Z_1+\alpha_{xz_2}Z_2+\epsilon_{x}
\end{array}
\hspace*{-1mm}
\right\}\label{28}
\end{align}
for {Fig.~\ref{f1} ($\mbox{\boldmath $W$}$ includes 39 variables)}.
In this setting, we assume that $\{Z_1,Z_2\}$ satisfies the back-door criterion relative to $(X,Y)$ and that the path coefficients of $\{Z_2\}\cup\mbox{\boldmath $W$}$ on $Y$ are regularized but that of $Z_1$ is not. Then, Theorem \ref{THEOREM 1} does not hold, and the estimated total effect may be biased.

\quad~To set up a simulation, we first construct the population variance-covariance matrix.
To eliminate arbitrariness, the true values of the path coefficients $\alpha_{yx}$, $\alpha_{yz_{1}}$, $\alpha_{yz_{2}}$, $A_{yw}=(\alpha_{yw_1},\ldots,\alpha_{yw_{39}})$, $\alpha_{xz_1}$ and $\alpha_{xz_2}$ are randomly and independently determined according to the uniform distribution on the interval $[-3,3]$.
In addition, we assume that (i) the random disturbances $\epsilon_{x}$ and $\epsilon_{y}$ independently follow the normal distribution with mean zero and variance one, and (ii) the random disturbances are also independent of their nondescendants.\setcitestyle{authoryear,open={[},close={]}}
Furthermore, the population variance-covariance matrices of $\{Z_1,Z_2\}\cup \mbox{\boldmath $W$}$ are randomly determined according to \cite{Pourahmadi15}.\setcitestyle{authoryear,open={},close={}}
\begin{figure}[hhh]
\begin{center}

\hspace*{20mm}\includegraphics[width=5.5cm,clip]{Fig1(b).jpg}\hspace*{\fill}


\end{center}
\hspace*{\fill}Fig. 1. Causal diagram \hspace*{\fill}
\captionsetup{labelformat=empty,labelsep=none}\caption{\label{f1}}
\end{figure}


\quad~We generated 30 random samples of 42 variables from
a multivariate normal distribution with a zero mean vector and the above variance-covariance matrix for 5000 replications.
Table \ref{t1} shows the basic statistics of the total effects estimated by LASSO, adaptive LASSO, elastic net, SCAD, MCP, OLS and PAL$_1$MA based on the given sample size of 30 for each parameter setting.
Regarding the parameter tuning for regularized regression analysis, see the Supplementary Material.
Here, for the OLS method, we select a set of covariates based on prior causal knowledge; i.e., $\{Z_1,Z_2\}$ are selected.

\quad~From Table \ref{t1}, both the PAL$_1$MA estimators and the OLS estimators are almost consistent with the true values of the total effects, but the other regularized regression methods yield highly biased estimators.
In addition, the coincidence rates between the signs of the estimated total effects and the true total effects for PAL$_1$MA are better than those for the other regression methods.
From {Fig.~\ref{f2}}, the interquartile ranges of both PAL$_1$MA and OLS include the true value of the total effects, but those of the other regularized regression methods do not.
For further discussion on the simulation experiments, see the Supplementary Material.

\begin{figure}[hhh]
%\begin{center}

\hspace*{\fill}\includegraphics[width=7.5cm,clip]{b.png}\hspace*{\fill}


%\end{center}
\hspace*{\fill}Fig. 2. Boxplots of the estimated total effects. The dashed lines show the true total effects. \hspace*{\fill}
\captionsetup{labelformat=empty,labelsep=none}\caption{\label{f2}}
\end{figure}

\section{Conclusion}

\quad~In current situations where advanced artificial intelligence (AI) technology enables us to collect large datasets, it would not be so difficult to observe a large number of covariates.
In such situations, it would be reasonable to consider that such a set of covariates satisfies the back-door criterion to estimate the total effects.
However, when multicollinearity/high-dimensional data problems occur in even this situation, it is difficult to evaluate the linear causal effects reliably.
To solve this problem, we established PAL$_p$MA to provide a consistent or less-biased estimator of the total effects.
%This establishment has been achieved by constructing the unifying approach of (i) the collapsibility condition of the OLS estimator, (ii) the oracle properties of adaptive LASSO, and (iii) the bias correction of LASSO estimator by adaptive Ridge (OLS) regression.
In addition, through numerical experiments and a case study in the Supplementary Material, we confirmed that PAL$_1$MA is superior to other estimation methods.
The results of this paper are applicable to evaluating the direct effect in the framework of regression models through the ``single-door criterion'' [\citep{Pearl09}]. {The results of this paper would also help us to obtain a reliable evaluation of the mean of the response variable when conducting an external intervention (e.g., \cite{KN20}, \cite{NK21}) from multicollinearity/high-dimensional data.}

\quad~Finally, although PAL$_p$MA is formulated based on linear regression models, 
it would be interesting to extend our approach to a wide variety of statistical models, including generalized linear models, generalized estimating equations and proportional hazards models.
Such an extension would be straightforward --- the objective function would be replaced with a more general form. This extension will be left for future work.

\subsubsection*{Acknowledgement}
{This research was financially supported by Grant-in-Aid for Scientific Research (B) Grant Number 21H03504 and Scientific Research (C) Grant Number 19K11856.
}

\begin{thebibliography}{99}

\bibitem[\protect\citeauthoryear{Brito}{2004}]{Brito04}
Brito, C. 
Graphical methods for identification in structural equation models. Computer Science Department, UCLA, PhD Thesis, 2004.

\bibitem[\protect\citeauthoryear{B\"{u}hlmann and van de Geer}{2011}]{Buhlmann11}
B\"{u}hlmann, P. and van de Geer, S.  
{\textit{Statistics for High-dimensional Data: Methods, Theory and Applications}}, Springer Science and Business Media, 2011.


\bibitem[\protect\citeauthoryear{Cai and Kuroki}{2008}]{Cai08}
Cai, Z. and Kuroki, M. 
On identifying total effects in the presence of latent variables and selection bias. 
{\textit{Proceedings of the 24th Conference on Uncertainty in Artificial Intelligence}}, 62-69, 2008.

\bibitem[\protect\citeauthoryear{Chan and Kuroki}{2010}]{Chan10}
{Chan, H. and Kuroki, M. 
Using descendants as instrumental variables for the identification of direct causal effects in linear SEMs. 
{\textit{Proceedings of the 13th International Conference on Artificial Intelligence and Statistics}}, 73-80, 2010.}

\bibitem[\protect\citeauthoryear{Chen}{2017}]{Chen17}
Chen, B. R.  
Graphical methods for linear structural equation modeling. Computer Science Department, UCLA, PhD Thesis, 2017.

\bibitem[\protect\citeauthoryear{Chen et al}{2017}]{Chen2017}
Chen, B., Kumor, D. and Bareinboim, E.  
Identification and model testing in linear structural equation models using auxiliary variables, 
{\textit{Proceedings of the 34th International Conference on Machine Learning}}, 757–766, 2017.

\bibitem[\protect\citeauthoryear{Clogg et al}{1992}]{Clogg92}
Clogg, C.C., Petkova, E. and Shihadeh, E. S. 
Statistical methods for analyzing collapsibility in regression models. 
{\textit{Journal of Educational Statistics}}, {\textbf{17}}:51--74, 1992.

\bibitem[\protect\citeauthoryear{Daubechies et al}{2004}]{Daubechies}
Daubechies, I., Defrise, M. and De Mol, C. 
An iterative thresholding algorithm for linear inverse problems with a sparsity constraint. {\textit{Communications on Pure and Applied Mathematics}}, {\textbf{57}}:1413--1457, 2004.

\bibitem[\protect\citeauthoryear{Efron et al}{2004}]{efron04}
Efron, B., Hastie, T., Johnstone, I. and Tibshirani, R.  
Least angle regression. 
{\textit{Annals of Statistics}}, {\textbf{32}}:407--499, 2004.

\bibitem[\protect\citeauthoryear{Fan and Li}{2001}]{fan01}
Fan, J. and Li, R.  
Variable selection via nonconcave penalized likelihood and its oracle properties. 
{\textit{Journal of the American Statistical Association}}, {\textbf{96}}:1348--1360, 2001.

\bibitem[\protect\citeauthoryear{Frisch}{1934}]{Frisch34}
Frisch, R.  
{\textit{Statistical Confluence Analysis by Means of Complete Regression Systems}}, University Institute of Economics, 1934.

\bibitem[\protect\citeauthoryear{Friedman}{2012}]{friedman12}
Friedman, J. H.  
Fast sparse regression and classification. 
{\textit{International Journal of Forecasting}}, {\textbf{28}}:722-738, 2012.

\bibitem[\protect\citeauthoryear{Geng and Asano}{1993}]{geng93}
Geng, Z. and Asano, C.  
Strong collapsibility of association measures in linear models.
{\textit{Journal of the Royal Statistical Society:  Series B}}, {\textbf{55}}:741--747, 1993.

\bibitem[\protect\citeauthoryear{Guo and Geng}{1995}]{guo95}
Guo, J. H. and Geng, Z.  
Collapsibility of logistic regression coefficients.
{\textit{Journal of the Royal Statistical Society:  Series B}}, {\textbf{57}}:263--267, 1995.

\bibitem[\protect\citeauthoryear{Hoerl and Kennard}{1970}]{Hoerl70a}
Hoerl, A. E. and Kennard, R. W.  
Ridge regression: Biased estimation for nonorthogonal problems. {\textit{Technometrics}}, {\textbf{12}}:55--67, 1970. 

\bibitem[\protect\citeauthoryear{Kuroki}{2012}]{Kuroki12}
Kuroki, M.
Optimizing an external intervention using a structural equation model with an application to statistical process analysis. 
{\textit{Journal of Applied Statistics}}, {\textbf{39}}:673-694, 2012.

\bibitem[\protect\citeauthoryear{Kuroki and Matsuura}{2018}]{KM2018}
{Kuroki, M. and Matsuura, S. 
Predictive principal variable selection for linear regression analysis. 
{\textit{Journal of the Japanese Society for Quality Control}}, {\textbf{48}}:90-104, 2018. }

\bibitem[\protect\citeauthoryear{Kuroki and Matsuura}{2019}]{KM2019}
{Kuroki, M. and Matsuura, S. 
Predictive principal variable selection criteria for linear regression analysis with applications to statistical quality control-Basic idea-. {\textit{Journal of the Japanese Society for Quality Control}}, {\textbf{49}}:293-298, 2019. }

\bibitem[\protect\citeauthoryear{Kuroki and Matsuura}{2020}]{KM2020}
{Kuroki, M. and Matsuura, S. 
Predictive principal variable selection criteria for linear regression analysis with applications to statistical quality control-Case studies-. 
{\textit{Journal of the Japanese Society for Quality Control}}, {\textbf{50}}:4-11, 2020. }

\bibitem[\protect\citeauthoryear{Kuroki and Nanmo}{2020}]{KN20}
{Kuroki, M. and Nanmo, H.
Variance formulas for estimated mean response and predicted response with external intervention based on the back-door criterion in linear structural equation models. 
{\textit{AStA Advances in Statistical Analysis}}, 
{\textbf{104}}:667-685, 2020. }

\bibitem[\protect\citeauthoryear{Kuroki and Pearl}{2014}]{KP2014}
Kuroki, M. and Pearl, J.  
Measurement bias and effect restoration in causal inference. 
{\textit{Biometrika}}, {\textbf{101}}:423–437, 2014. 

\bibitem[\protect\citeauthoryear{Nanmo and Kuroki}{2021}]{NK21}
{Nanmo, H. and Kuroki, M. 
Exact variance formula for the estimated mean outcome with external intervention based on the front-door criterion in Gaussian linear structural equation models. {\textit{Journal of Multivariate Analysis}}, 
{\textbf{185}}:104766, 2021. }




%\bibitem{}\label{Kallus18}
%Kallus, N., Puli, A. M. and Shalit, U. (2018).
%Removing hidden confounding by experimental grounding.
%{\textit{Proceedings of the 32nd International Conference on Neural Information Processing Systems}, 10911-10920.

\bibitem[\protect\citeauthoryear{Pearl}{2009}]{Pearl09}
Pearl, J.  
{\textit{Causality: Models, Reasoning, and Inference, 2nd edition}}, Cambridge University Press, 2009.

\bibitem[\protect\citeauthoryear{Pearl}{2009, 2013, 2017}]{Pearl}
Pearl, J.  
Linear models: {A} useful ``microscope'' for causal analysis. 
{\textit{Journal of Causal Inference}}, {\textbf{1}}:155--170, 2013. 

\bibitem[\protect\citeauthoryear{Pearl}{2017}]{Pearl17}  
Pearl, J.  
A linear `microscope' for interventions and counterfactuals. 
{\textit{Journal of Causal Inference}}, {\textbf{5}}:1--15, 2017. 

%\bibitem{}\label{rida15}
%Rida, I., Jiang, X., \& Marcialis, G. L. (2015). Human body part selection by group lasso of motion for model-free gait recognition. {\textit{IEEE Signal Processing Letters}, {\textbf 23}, 154--158.

\bibitem[\protect\citeauthoryear{Pourahmadi and Wang}{2015}]{Pourahmadi15}
Pourahmadi, M. and Wang, X.  
Distribution of random correlation matrices: Hyperspherical parameterization of the Cholesky factor. 
{\textit{Statistics and Probability Letters}}, {\textbf{106}}:5-12, 2015.

\bibitem[\protect\citeauthoryear{Sardy et al}{2000}]{sardya00}
Sardy, S., Bruce, A. G. and Tseng, P.  
Block coordinate relaxation methods for nonparametric wavelet denoising. 
{\textit{Journal of Computational and Graphical Statistics}}, {\textbf{9}}:361-379, 2000.

\bibitem[\protect\citeauthoryear{Stanghellini}{2004}]{Stanghellini04}
Stanghellini, E.  
Instrumental variables in Gaussian directed acyclic graph models with an unobserved confounder. 
{\textit{Environmetrics}}, {\textbf{15}}:463-469, 2004.

\bibitem[\protect\citeauthoryear{Stanghellini and Pakpahan}{2015}]{Stanghellini15}
Stanghellini, E. and Pakpahan, E.  
Identification of causal effects in linear models: beyond instrumental variables. 
{\textit{Test}}, {\textbf{24}}:489–509, 2015. 

\bibitem[\protect\citeauthoryear{Tian}{2004, 2007ab}]{Tian}
Tian, J.  
Identifying linear causal effects. 
{\textit{Proceedings of the 19th National Conference on Artificial Intelligence}}, 104-111, 2004.

\bibitem[\protect\citeauthoryear{Tian}{2007a}]{Tian2007a}
Tian, J.  
A criterion for parameter identification in structural equation models. 
{\textit{Proceedings of the 23rd Conference on Uncertainty in Artificial Intelligence}}, 392-399, 2007a. 

\bibitem[\protect\citeauthoryear{Tian}{2007b}]{Tian2007b}
Tian, J.  
On the identification of a class of linear models. 
{\textit{Proceedings of the 22nd National Conference on Artificial Intelligence}}, 1284-1289, 2007b. 

\bibitem[\protect\citeauthoryear{Tibshirani}{1996}]{Tibshirani96}
Tibshirani, R.  
Regression shrinkage and selection via the lasso.  
{\textit{Journal of the Royal Statistical Society: Series B}}, {\textbf{58}}:267--288, 1996.

\bibitem[\protect\citeauthoryear{van de Geer et al}{2014}]{geer14}
van de Geer, S., B\"{u}hlmann, P., Ritov, Y. A. and Dezeure, R.  
On asymptotically optimal confidence regions and tests for high-dimensional models. {\textit{Annals of Statistics}}, {\textbf{42}}:1166--1202, 2014.

\bibitem[\protect\citeauthoryear{Wermuth}{1989ab}]{Wermuth}
Wermuth, N.  
Moderating effects of subgroups in linear models.  {\textit{Biometrika}}, {\textbf{76}}:81--92, 1989a.

\bibitem[\protect\citeauthoryear{Wermuth}{1989b}]{Wermuth1989b}
Wermuth, N.   
Moderating effects in multivariate normal distributions. {\textit{Methodika}}, {\textbf{3}}:74--93, 1989b.

\bibitem[\protect\citeauthoryear{Zhang}{2010}]{zhang10}
Zhang, C. H.  
Nearly unbiased variable selection under minimax concave penalty.  
{\textit{Annals of Statistics}}, {\textbf{38}}:894-942, 2010.

\bibitem[\protect\citeauthoryear{Zou}{2006}]{Zou06}
Zou, H.  
The adaptive lasso and its oracle properties. {\textit{Journal of the American Statistical Association}}, {\textbf{101}}:1418-1429, 2006.

\bibitem[\protect\citeauthoryear{Zou and Hastie}{2005}]{Zou05}
Zou, H. and Hastie, T.  
Regularization and variable selection via the Elastic net. 
{\textit{Journal of the Royal Statistical Society: Series B}}, {\textbf{67}}:301-320, 2005.

%\bibitem{}\label{Zou12}
%Zou, C., Ning, X. and Tsung, F. (2012). LASSO-based multivariate linear profile monitoring. {\textit{Annals of Operations Research}, {\textbf 192}, 3-19.

\end{thebibliography}


\end{document}
