

\begin{table*}[!t]
\renewcommand{\arraystretch}{1.2}
%\small
\centering
\caption{Means of estimated coefficients by PTSLS and P-CAPCE estimators in setting (A).}
%\vspace{-5pt}
\label{tab:TAB1}
\renewcommand{\arraystretch}{1.1}
\begin{tabular}{l|lll|lll}
\hline
Estimated coefficients & \multicolumn{3}{c|}{$N=1000$}& \multicolumn{3}{c}{$N=10000$}\\
 \hline
Terms      & 1 & $W$      & $X$ & 1 & $W$      & $X$ \\
                           \hline \hline
PTSLS                      & 1.248     & 50.032 & 27.862               & 1.101      & 51.181 & 19.763               \\
P-CAPCE & -1.651    & 10.383 & 19.293               & 1.226      & 0.963  & 19.971 \\
\hdashline
True Coefficients          & 1         & 1      & 20                   & 1          & 1      & 20                   \\
\hline
\end{tabular}
\vspace{-0cm}
\end{table*}

\begin{table*}[t]
\renewcommand{\arraystretch}{1.2}
%\small
\centering
\caption{MSE and run time of estimators in settings (A) and (B).}
%\vspace{-5pt}
\label{tab:TAB2}
%\renewcommand{\arraystretch}{1}
\begin{tabular}{l|lll:lll}
\hline
   \multicolumn{1}{c|}{MSE}      & PTSLS &NTSLS & Kernel IV & P-CAPCE & S-CAPCE & RKHS CAPCE \\
        \hline \hline
(A) $N=1000$  & 925.139 & 418.396 & 548.821 & 104.990 & 203.079 & {\bf 87.853} \\
Time (second) & 0.126 & 0.361 & 6.105 & 0.132 & 0.596 & 6.410 \\
(A) $N=10000$ & 817.074 & 357.777 & 495.742 & {\bf 69.185}  & 185.056 &    71.276    \\
Time (second) & 0.372 & 1.127 & 2814.018 & 0.452 & 1.883 & 4530.765\\
\hline
(B) $N=1000$  & 290.340 & 46.405  & 45.734  & 202.313 & {\bf 8.600}   & 11.612 \\
Time (second) & 0.127 & 0.356 & 6.019 & 0.143 & 0.454 & 6.540 \\
(B) $N=10000$ & 265.400 & 20.990  & 51.470  & 54.124  & {\bf 3.579}   & 8.985   \\ 
Time (second) & 0.367& 1.031 & 2951.841 & 0.485 & 1.836 & 4360.991\\
\hline
\end{tabular}
\end{table*}




\section{Experiments}


In this section, we present numerical experiments to demonstrate the performance of the proposed P-CAPCE, sieve CAPCE, and RKHS CAPCE estimators. {Detailed settings are given in Appendix~\ref{appG}.} 
All experiments were performed on an Apple M1 (16\,GB). 


{\bf Baselines.} We compare against the most widely used methods: PTSLS (parametric), NTSLS (sieve), and Kernel IV. These methods estimate $\mathbb{E}[Y_{x}|{w}]$, which we differentiate with respect to $x$ to obtain the CAPCE $\mathbb{E}[\partial_x Y_{x}|{w}]$.
%Additional information shown in Appendix.
%\subsection{Parametric Estimation}
%First, we compare the PTSLS and P-CAPCE estimator when $g_0$ is null in Eq. (\ref{EQ1}).\\



{\bf SCM Settings.} We consider the following two SCMs:  $W:=H+E_1, X:=Z+W+H+E_2$, and 
%\vspace{-0.1cm}
%{%\small
\begin{equation}
    \begin{aligned}
\label{eq-scm}
\left\{
\begin{array}{l}
Y:=10X^2+WX+X+W+50 f(W)H+E_3\ \hfill\text{(A)}\\
Y:=\exp(X)\exp(W)+25 f(W)H+E_3\hfill\text{(B)}
\end{array}
\right.
\end{aligned}
\end{equation}
%\normalsize}
where $f(W)=W^5+W^4+W^3+W^2$. Both SCMs satisfy the separability condition in Assumption~\ref{AS2} but not the separability assumption (\ref{eq-sep}). 
We use setting (A) as a parametric setting and setting (B) as a nonparametric setting. 
The values of $Z$, $H$, $E_1$, $E_2$, and $E_3$ are sampled i.i.d.\ from the uniform distribution on $[-1,1]$.
%$U[-1,1]$.
The true CAPCE is $20x+w+1$ in setting (A) and $\exp(x)\exp(w)$ in setting (B).
%\yuta{The sample sizes are $N=1000$ and $N=10000$. We choose the parameters of each method using test errors from candidates shown in Appendix G.}

{\bf Setting of P-CAPCE  and PTSLS.} %We learn the conditional expectations of basis functions
%$\mathbb{E}[Y|Z=z]$, $\mathbb{E}[X|Z=z]$, $\mathbb{E}[WX|Z=z]$ and $\mathbb{E}[X^2|Z=z]$
%by the nonlinear model, 
%\begin{eqnarray}
%    $b_0+ b_1Z+b_2Z^2$.
%\end{eqnarray}
We used the basis terms $\{1,W,X\}$ for P-CAPCE and $\{1,W,X,WX,X^2\}$ for PTSLS, which match setting (A). 
%and let $z_0=-1$.
%We regularize the matrix $\displaystyle \hat{\bf G}^T \hat{\bf G}$ by adding $0.001 {\bf I}$ for PTSLS estimator and $0.1 {\bf I}$ for P-CAPCE estimator, where ${\bf I}$ is an identity matrix of size $M$.
%Regularize value is determined by test MSE from $\{1,10^{-1},10^{-2},10^{-3}\}$.
%The results of the test errors are shown in Table 1.



{\bf Setting of NTSLS and sieve CAPCE.} %We learn the conditional expectations by the nonlinear model, 
%\begin{eqnarray}
%    $b_0+ b_1Z+b_2Z^2+b_3Z^3$,
%\end{eqnarray}
We consider the basis terms $h_p(X)h_q(W)$ for $p=0,1,2$ and $q=0,1,2$, where $h_p$ denotes the $p$-th Hermite polynomial: $h_0(t)=1$, $h_1(t)=t$, $h_2(t)=t^2-1$, and $h_3(t)=t^3-3t$. 
%and let $z_0=-1$.
%Let $\kappa=2$, and we calculate $\hat{\Lambda}$ by Monte Carlo integration using uniform distribution $(x,w)=(U(-4,4),U(-2,2))$, where $\Omega_X \subseteq [-4,4]$ and $\Omega_X \subseteq [-2,2]$. Regularize value is determined by test MSE from $\{1,10^{-1},10^{-2},10^{-3}\}$.\\
%We regularize the matrix $\displaystyle \hat{\bf G}^T \hat{\bf G}$ by adding $10^{-2} \hat{\Lambda}$.
%In addition, we give an experiment using multivariate linear basis function $\{1,W,X\}$, which is a minimal basis function to build CAPCE.
%Results of the test errors are shown in Table 5.
%We estimate CAPCE via differentiating estimated $\mathbb{E}[Y_{x}|{W}={w}]$.\\
%{\bf Setting of sieve NTSLS estimator.} We learn $\mathbb{E}[Y|Z=z], \mathbb{E}[h_p(X)h_q(W)|Z=z]$ for any $p=0,1,2,3$, $q=0,1,2,3$ and $q=0,1$ by the nonlinear model, 
%\begin{eqnarray}
%    $b_0+ b_1Z+b_2Z^2+b_3Z^3$.
%\end{eqnarray}
%where $h_0(t)=1$, $h_1(t)=t$, $h_2(t)=t^2-1$ and $h_3(t)=t^3-3t$.
%Multivariate linear basis function are $\{1,W,X,WX,X^2\}$.
%In this situation, the function $f_Y^2$ is mis-specified.
%Let $\kappa=2$, and we calculate $\hat{\Lambda}$ by Monte Carlo integration using uniform distridution $(x,w)=(U(-4,4),U(-2,2))$.
%Regularize value is determined by test error from $\{1,10^{-1},10^{-2},10^{-3},\ldots\}$.
%We regularize the matrix $\displaystyle \hat{\bf G}^T \hat{\bf G}$ by adding $10^{-3} \hat{\Lambda}$.
%We estimate CAPCE via differentiating estimated $\mathbb{E}[Y_{x}|{W}={w}]$. \jin{Explain why NTSLS uses different settings than S-CAPCE.}
%Results of the test errors are shown in Table 6.\\




{\bf Setting of Kernel IV and RKHS CAPCE.} We use the polynomial kernel functions $k_Z(z,z')=(z^Tz'+C_1)^{C_2}$ and $k_{X,W}(x,w,x',w')=((x,w)^T(x',w')+C_3)^{C_4}$. 
%We select the kernel parameters $(C_1,C_2)$ from $\{1,2,3,4,5\} \times \{1,2,3,4,5\}$.
%, and determined $(\zeta_1,\zeta_2)=(4,5)$ by test MSE.
%We select the regularize values $\lambda_1,\lambda_2$ from $\{1,10^{-1},10^{-2},10^{-3}\}$, respectively, and $(\lambda_3,\xi)$ is from $\{100,10,1\} \times \{100,10,1\}$. \\
%Then, we determine $(\lambda_1,\lambda_2,\lambda_3,\xi)=(0.01,0.01,1,100)$.\\
%{\bf Setting of PTSLS estimator.} We learn $\mathbb{E}[Y|Z=z]$, $\mathbb{E}[W|Z=z]$, $\mathbb{E}[X|Z=z]$, $\mathbb{E}[WX|Z=z]$ and $\mathbb{E}[X^2|Z=z]$ by the nonlinear model, 
%\begin{eqnarray}
%    $b_0+ b_1Z+b_2Z^2$.
%\end{eqnarray}
%We consider the following terms, $\{1,W,X,WX,X^2\}$ to build model of $\mathbb{E}[Y_{x}|{W}={w}]$.
%We regularize the matrix $\displaystyle \hat{\bf G}^T \hat{\bf G}$ by adding $0.1 {\bf I}$.
%Regularize value is determined by test error from $\{1,10^{-1},10^{-2},10^{-3},\ldots\}$.
%We estimate CAPCE via differentiating estimated $\mathbb{E}[Y_{x}|{W}={w}]$. \jin{The settings have a lot of overlap, no need to repeat. Write a single Settings paragraph.} 
%Results of the test errors are shown in Table 3.\\
%{\bf Results.}


\input{Fig2_01}

%\textbf{Results: Parametric setting (A).}
\textbf{Results.} The means of estimated coefficients by %$100$ time simulations of 
PTSLS and P-CAPCE in the parametric setting (A)  are shown in Table \ref{tab:TAB1}. % \ref{tab:PNUM_EXMP1} and \ref{tab:PNUM_EXMP2}.
%We see that the estimated coefficients of P-CAPCE are converging to the true values when the sample size $N=10000$, while one of the PTSLS estimates is still biased.
%The means of coefficients of P-CAPCE estimators are closer to true coefficients than PTSLS; on the other hand, PTSLS is biased largely due to the violation of the separability when $N=10000$.
%Both P-CAPCE and PTSLS estimates have large standard deviations (SD) when $N=1000$ (shown in Appendix \ref{appG}). 
{We observe that, when $N=1000$, both the P-CAPCE and PTSLS estimates have large standard deviations (SD; shown in Appendix~\ref{appG}), so the differences in the estimated values are not statistically significant. When the sample size is $N=10000$, the estimated coefficients of P-CAPCE approach the true values, while the coefficient for $W$ estimated by PTSLS is still biased.} 
We plot the true and estimated CAPCE curves given $W=1$ in Figure~\ref{fig:FIG02}(a). The curve estimated by P-CAPCE is clearly much closer to the true curve than that estimated by PTSLS. The true and estimated CAPCE surfaces over $(X, W)$ are shown in Appendix~\ref{appG}.

%\textbf{Results: Nonparametric setting (B).}
We computed the mean squared error (MSE) between the estimated and true CAPCE values for each estimator, where the MSE is computed as $\displaystyle \frac{1}{N_1'}\sum_{i=1}^{N_1'}\{\hat{g}(x_i^{(1)'},w_i^{(1)'})-g(x_i^{(1)'},w_i^{(1)'})\}^2$ on the test dataset $\mathcal{D}^{(1)'}$; the results are shown in Table~\ref{tab:TAB2}.
%The mean of test MSE of each estimator by $100$ time simulations are shown in Table \ref{tab:TAB2}.
%We note that TSLS (PTSLS and NTSLS) and CAPCE estimators (parametric, sieve and RKHS) have different risk functions.
We observe that (i) our sieve and RKHS CAPCE estimators are superior to the existing methods; (ii) the sieve and RKHS CAPCE estimators are superior to P-CAPCE in the nonparametric setting (B); and (iii) the kernel-based methods are much slower than the other methods.  
We plot the true and estimated CAPCE curves given $W=1$ in Figure~\ref{fig:FIG02}(b), which shows that the curves estimated by sieve and RKHS CAPCE are much closer to the true curve than those estimated by NTSLS and Kernel IV. 
The true and estimated CAPCE surfaces over $(X, W)$ are shown in Appendix \ref{appG}.
%\yuta{We have performed additional experiments in settings (C) and (D) under the separability assumption (\ref{eq-sep}) in Appendix G.}

%, $f_Y(X,{\boldsymbol W},{\boldsymbol H},{\boldsymbol u}_Y)=f_Y^1(X,{\boldsymbol W},{\boldsymbol u}_Y)+f_Y^2({\boldsymbol H},{\boldsymbol u}_Y)$.
Overall, the results of settings (A) and (B) show that our proposed methods (P-CAPCE, sieve CAPCE,  RKHS CAPCE) are superior to the previous works (PTSLS, NTSLS, Kernel IV). % in terms of unbiasedness.
%In the setting (A), all CAPCE estimators are almost the same MSE; on the other hand, sieve and RKHS CAPCE estimators are superior to P-CAPCE estimator in the setting (B).
{The advantage of our proposed methods stems from the fact that the underlying models (A) and (B) do not satisfy the separability assumption (\ref{eq-sep}) required by the existing methods. Indeed, we have performed experiments in settings where the interaction between the covariates $W$ and the unobserved confounders $H$ (the $f(W)H$ term in (\ref{eq-scm})) is absent, and the results (presented in Appendix~\ref{appG}) show that the performance of the existing methods (PTSLS, NTSLS, and Kernel IV) is comparable to that of our proposed methods in this situation.} 
%\yuta{The run time of P-CAPCE, S-CAPCE, and RKHS CAPCE is slightly larger than PTSLS, NTSLS, and Kernel IV, respectively.}
 %Sieve CAPCE may require many  basis functions if ${\boldsymbol W}$ is high dimensional. 
%\yuta{In addition, from the results of setting (C) and (D) shown in Appendix G, the MSE of all estimators are almost the same under separability assumption (\ref{eq-sep}) and parametric setting (setting (C)). The MSE of NTSLS, Kernel IV, S-CAPCE and RKHS CAPCE are almost the same under separability assumption (\ref{eq-sep}) and nonparametric setting (setting (D)). On the other hand, the MSE of PTSLS and P-CAPCE in setting (D) are worse than the others due to model misspecification.}
%Finally, 
Among the three proposed methods, the performance of P-CAPCE relies on a correctly specified parametric model, while RKHS CAPCE is computationally expensive and requires tuning many regularization parameters.


%\input{Fig2_01}


\begin{comment}
%\subsection{Nonparametric Estimation}
%Next, we compare the sieve NTSLS, sieve and RKHS CAPCE estimator when ${\boldsymbol \beta}$ is null in Eq. (\ref{EQ1}).\\
%{\bf SCM Settings.} We consider the following two SCM:
%\begin{eqnarray}
%\label{eq-scm}
%\left\{
%\begin{array}{l}
%    W:=U+E_1,\ \ \ X:=Z+W+U+E_2\\
%   Y:=\text{exp}(X)\text{exp}%(W)+25(W^5+W^4+W^3+W^2)U+E_3
%\end{array}
%\right..
%\end{eqnarray}
%Each realized the value of $Z$, $U$, $E_1$, $E_2$ and $E_3$ are i.i.d. and sampled from a uniform distribution $U[-1,1]$.
True CAPCE $\mathbb{E}[Y_{x}|{W}={w}]$ is $\text{exp}(x)\text{exp}(w)$, and the sample size are $N=10000$.\\
%{\bf Setting of P-CAPCE estimator and PTSLS estimator.} We learn $\mathbb{E}[Y|Z=z]$, $\mathbb{E}[X|Z=z]$, $\mathbb{E}[WX|Z=z]$ and $\mathbb{E}[X^2|Z=z]$ by the nonlinear model, 
%\begin{eqnarray}
%    $b_0+ b_1Z+b_2Z^2$.
%\end{eqnarray}
%We consider the following basis terms, $\{1,W,X\}$ for P-CAPCE model, $\{1,W,X,WX,X^2\}$ for PTSLS model , and let $z_0=-1$.
%We regularize the matrix $\displaystyle \hat{\bf G}^T \hat{\bf G}$ by adding $0.001 {\bf I}$ for PTSLS estimator and $0.1 {\bf I}$ for P-CAPCE estimator, where ${\bf I}$ is an identity matrix of size $M$.
%Regularize value is determined by test error from $\{1,10^{-1},10^{-2},10^{-3},\ldots\}$.
%The results of the test errors are shown in Table 1.
%We estimate CAPCE via differentiating estimated $\mathbb{E}[Y_{x}|{W}={w}]$.\\
{\bf Setting of sieve CAPCE estimator.} We learn $\mathbb{E}[Y|Z=z], \mathbb{E}[h_p(X)h_q(W)|Z=z]$ for any $p=0,1,2$ and $q=0,1,2$ by the nonlinear model, 
%\begin{eqnarray}
    $b_0+ b_1Z+b_2Z^2+b_3Z^3$,
%\end{eqnarray}
where $h_p$ is Hermite polynomial functions ($h_0(t)=1$, $h_1(t)=t$, $h_2(t)=t^2-1$ and $h_3(t)=t^3-3t$), and let $z_0=-1$.
Let $\kappa=2$, and we calculate $\hat{\Lambda}$ by Monte Carlo integration \citep{Kroese2011} using the uniform distribution $(x,w)=(U(-4,4),U(-2,2))$, where $\Omega_X \subseteq [-4,4]$ and $\Omega_W \subseteq [-2,2]$.
Regularize value is determined by test error from $\{1,10^{-1},10^{-2},10^{-3},\ldots\}$.
We regularize the matrix $\displaystyle \hat{\bf G}^T \hat{\bf G}$ by adding $10^{-2} \hat{\Lambda}$.
%In addition, we give an experiment using multivariate linear basis function $\{1,W,X\}$, which is a minimal basis function to build CAPCE.
%Results of the test errors are shown in Table 5.
We estimate CAPCE via differentiating estimated $\mathbb{E}[Y_{x}|{W}={w}]$.\\
%{\bf Setting of sieve NTSLS estimator.} We learn $\mathbb{E}[Y|Z=z], \mathbb{E}[h_p(X)h_q(W)|Z=z]$ for any $p=0,1,2,3$, $q=0,1,2,3$ and $q=0,1$ by the nonlinear model, 
%\begin{eqnarray}
%    $b_0+ b_1Z+b_2Z^2+b_3Z^3$.
%\end{eqnarray}
%where $h_0(t)=1$, $h_1(t)=t$, $h_2(t)=t^2-1$ and $h_3(t)=t^3-3t$.
%Multivariate linear basis function are $\{1,W,X,WX,X^2\}$.
%In this situation, the function $f_Y^2$ is mis-specified.
%Let $\kappa=2$, and we calculate $\hat{\Lambda}$ by Monte Carlo integration using uniform distridution $(x,w)=(U(-4,4),U(-2,2))$.
%Regularize value is determined by test error from $\{1,10^{-1},10^{-2},10^{-3},\ldots\}$.
%We regularize the matrix $\displaystyle \hat{\bf G}^T \hat{\bf G}$ by adding $10^{-3} \hat{\Lambda}$.
%We estimate CAPCE via differentiating estimated $\mathbb{E}[Y_{x}|{W}={w}]$. \jin{Explain why NTSLS uses different settings than S-CAPCE.}
%Results of the test errors are shown in Table 6.\\
{\bf Setting of RKHS CAPCE estimator.} We use polynomial kernel function $k_Z(z,z')=(z^Tz'+\zeta_1)^{\zeta_2}$, and we determined $(\zeta_1,\zeta_2)=(4,5)$.
We select the regularize values $\lambda_1,\lambda_2$ from $\{1,0.1,0.01,\ldots\}$, respectively, and $(\lambda_3,\xi)$ is from cartesian product $\{100,10,1\} \times \{100,10,1\}$ by Algorithm \ref{ALG2}. 
Then, we determine $(\lambda_1,\lambda_2,\lambda_3,\xi)=(0.01,0.01,1,100)$.\\

{\bf Results.} 
The true surface and estimated surfaces of the sieve NTSLS estimator, sieve CAPCE estimator, and RKHS estimator are shown in Figure \ref{fig:FIG22}.
We plotted the surfaces in the range of $-2 \leq x \leq 2$ and $-1 \leq w \leq 1$ since the edge of the domain of all nonparametric estimators is unstable.
The sieve and RKHS CAPCE surfaces are closer to the true CAPCE surface, and the sieve NTSLS surface is largely biased. \jin{Is there a numerical performance measure for function estimation that can be computed, e.g. root mean squared error (RMSE) or variants used in Machine learning for regression performance? Then a summary results table could be presented. }

\end{comment}

