
In this section, we develop three families of methods for estimating CAPCE from  data based on Theorem \ref{TEO2}.  We do not need 
%i.i.d. 
samples from the joint $\mathfrak{p}(Z,X,Y,{\boldsymbol W})$, %single dataset ${\cal D} = \{y_i,x_i,{\boldsymbol w}_i,z_i\}_{i=1}^N$, 
but rather two datasets ${\cal D}^{(1)} = \{x^{(1)}_i,{\boldsymbol w}^{(1)}_i,z^{(1)}_i\}_{i=1}^{N_1}$ and ${\cal D}^{(2)} = \{y^{(2)}_i,z^{(2)}_i\}_{i=1}^{N_2}$, a setting known as two-sample IV estimation \citep{Singh2019,Angrist1992}.
%The integral equation (\ref{IE6}) holds for each individual IV $Z$ in the $M$-dimensional IV $\boldsymbol{Z} = \{Z^1,\ldots, Z^M\}$.  We note that   using many valid  IVs can improve the precision of estimation \citep{Hansen2008}. We will usually derive results for an individual IV $Z$ 

%\yuta{We note that using valid many IVs can improve precision of estimation \citep{Hansen2008}.}
%While only one IV is sufficient for CAPCE identification, multiple IVs can be available for CAPCE estimation. 
%\jin{Unclear what "can be available" means. Multiple IVs are more helpful for estimation?} \jin{We assume multiple IVs $Z^1,\ldots,Z$ such that for $m=1, \dots, M$, (2) holds for $Z=Z$ - Your vector $Z$ notation gives the impression of replacing Z in (2) with a vector Z.} \yuta{[I've fixed Theorem 1 by replacing Z in (2) with a vector Z.]}

%\jin{Why not just stay with a single IV variable? Section 4.1 is much easier to read with the index m dropped. In fact, you could derive the result with a single IV, then extend the results to multiple IVs in the end.}




\subsection{Sieve CAPCE Estimator}
%Next, we introduce the \yuta{sieve CAPCE estimator}.
Sieve estimators are a class of non-parametric estimators that use progressively more complex models to estimate an unknown function as more data becomes available \citep{Geman1982}. 
%In this section we use $W^{l,2}$ norm $(0\leq l \leq \infty)$.
%, which is introduced by \citep{Whitney2003} and \citep{Ai2003}
%, and we can estimate CAPCE directly.
%We consider the following model:
%\footnotesize
%\setlength{\abovedisplayskip}{0pt}
%\begin{equation}
%\label{EQ1}
%\mathbb{E}[\partial_x Y_{x,{\boldsymbol w}}]=g(x,{\boldsymbol w},{\boldsymbol \pi})
%\end{equation}

{\bf Approximation by Orthonormal Basis Functions.} We approximate the CAPCE $\mathbb{E}[\partial_x Y_{x}|{\boldsymbol w}]$ by a set of orthonormal basis functions, such as Hermite polynomial functions \citep{Hermite2009}. Specifically, 
%we approximate CAPCE $g_0 \in {\cal G}$ by
%Let $L$ dimensional vector $x,{\boldsymbol w}=(x,{\boldsymbol w}^T)^T$.
%\footnotesize
%\setlength{\abovedisplayskip}{0pt}
%\vspace{-0.6cm}
\begin{align}
\label{eq-orth-basis}
  %\mathbb{E}[\partial_x Y_{x,{\boldsymbol w}}] =  \sum_{j=1}^J\beta_{j} \psi_j(x,{\boldsymbol w})+g_0(x,{\boldsymbol w}) =  \sum_{j=1}^J\beta_{j} \psi_j(x,{\boldsymbol w})+\sum_{k=1}^{\infty}\gamma_{k} \sigma_k(x,{\boldsymbol w}),
   \mathbb{E}[\partial_x Y_{x}|{\boldsymbol w}] \equiv g_0(x,{\boldsymbol w}) \approx g(x,{\boldsymbol w})= \sum_{j=1}^{J}\beta_{j} \phi_j(x,{\boldsymbol w}),
\end{align}
where %${\cal G}$ is a function space, and 
$\displaystyle \{\phi_j(x,{\boldsymbol w})\}_{j=1}^{\infty}$ is an infinite set of basis functions that satisfy the following conditions, where the Sobolev norm $W^{l,2}$ $(0\leq l \leq \infty)$ is used:
\begin{assumption}
\label{B1}
The basis functions $\displaystyle \{\phi_j(x,{\boldsymbol w})\}_{j=1}^{\infty}$ are orthonormal basis functions, and satisfy $\|\phi_j(x,{\boldsymbol w})\|_{W^{l,2}} < \infty$ for all $j=1,2,\ldots.$
\end{assumption}
%Let parameters be ${\boldsymbol \pi}=({\boldsymbol \beta}^T,g_0)$, where ${\boldsymbol \pi} \in {\boldsymbol \pi}$.
%This model is nonparametric if ${\boldsymbol \beta}$ is null; on the other hand, this model is parametric if $g_0$ is null.
%We call CAPCE estimator a \textbf{parametric CAPCE (P-CAPCE) estimator} if $g_0$ is null, and a (nonparametric) \textbf{sieve CAPCE (S-CAPCE) estimator} if ${\boldsymbol \beta}$ is null.

%We approximate CAPCE by $\sum_{j=1}^J\beta_{j} \phi_j(x,{\boldsymbol w})$. Let ${\boldsymbol \beta} =(\beta_1,\ldots,\beta_J)^T$.
%\normalsize
%We assume the following condition: 
\begin{assumption}
\label{AS3}
$\displaystyle \sum_{j=1}^J\beta_j\phi_j(x,{\boldsymbol w})$ converges uniformly to $g_0(x,{\boldsymbol w})$ as $J \rightarrow \infty$. 
\end{assumption}
%In this section we use Sobolev norm $W^{l,2}$ norm $(0\leq l \leq \infty)$.
%\yuta{From Assumption \ref{AS3}, the interchange of integration and limit
%\footnotesize
%\begin{eqnarray}
%\int_{\Omega_{\boldsymbol W}}\int_{\Omega_X}k(z,x,{\boldsymbol w})\lim_{J \rightarrow \infty}\sum_{j=1}^J\beta_{j}\phi_j(x,{\boldsymbol w}) dxd{\boldsymbol w}=\lim_{J \rightarrow \infty}\sum_{j=1}^J\gamma_{j}\int_{\Omega_{\boldsymbol W}}\int_{\Omega_X}k(z,x,{\boldsymbol w})\phi_j(x,{\boldsymbol w}) dxd{\boldsymbol w} 
%\end{eqnarray}
%\normalsize
%holds.} 
We note that Hermite polynomial functions satisfy Assumption~\ref{AS3} for any bounded and continuous function $g_0$ \citep{Damelin2001}.

%converge uniformly to any $g_0$ if $J \rightarrow \infty$ \citep{Damelin2001}.
%Hermite polynomial functions are $\sigma_k(x,{\boldsymbol w})=exp(-x,{\boldsymbol w}^Tx,{\boldsymbol w})x,{\boldsymbol w}^{{\boldsymbol \kappa}(k)}$, where $\|{\boldsymbol \kappa}(k)\|$ is increasing in $k$.
%Here, ${\boldsymbol \kappa}$ is a vector of non-negative integers and $\|{\boldsymbol \kappa}\|=\sum_{l=1}^L \kappa_l$ and $x,{\boldsymbol w}^{\boldsymbol \kappa}=\prod_{i=1}^L x^{\kappa_i}$.
%We denote parameter space ${\boldsymbol \pi}_K=({\boldsymbol \beta}^T,\sum_{k=1}^K\gamma_{k} \sigma_k(x,{\boldsymbol w}))$, \jin{why $\sigma_k$ here?} which is a subspace of ${\boldsymbol \pi}$.
%using the basis functions $\{\phi_i(x,{\boldsymbol w})\}_{p=1,\ldots,P}$ \citep{Bishop2006}, where ${\boldsymbol \beta}=\{\beta_{1},\ldots,\beta_{i}\}$ are the model parameters to be estimated from data. 
%For example, $\phi_j(x,{w})=\beta_0+ \beta_1 x +\beta_2w +\beta_3 x^2+ \beta_4xw+ \beta_5 w^2+\cdots.$
%This is a generalization of a linear regression that replaces each explanatory variable with some appropriate functions.

%The integral equation (\ref{IE6}) holds for each individual IV $Z$ in the $M$-dimensional IV $\boldsymbol{Z} = \{Z^1,\ldots, Z^M\}$. We note that   using many valid  IVs can improve the precision of estimation \citep{Hansen2008}. 
%{\bf Restriction for Compactness.} The restriction on ${\boldsymbol \beta}$ and ${\boldsymbol \gamma}$ are used to imposed compactness.
%\jin{Should "u" be "c"? What happened to $\gamma$, or should $\beta$ be $\pi$? }



{\bf Compactness Restriction.} 
The integral equation (\ref{IE6}), known as a ``Fredholm Integral Equation of the First Kind'' %with $k$ called an integral kernel 
\citep{Bocher1926}, 
 is ill-posed  since the integral operator ${\cal K}$,  where $\displaystyle {\cal K}(f)(z)=\int_{\Omega_{\boldsymbol W}}\int_{\Omega_X} k(z,x,{\boldsymbol w})f(x,{\boldsymbol w}) dxd{\boldsymbol w}$, is not guaranteed to be compact. 
Problems where one or more of the three properties---existence, uniqueness, and stability of the solution---do not hold are called ill-posed problems \citep{Tikhonov1995} and lead to severe estimation difficulties. {To relieve the issue, we put restrictions on the functional space of $g_0(x,{\boldsymbol w})$.}
%\yuta{We restrict the integral operator ${\cal K}$ to be in the compact set since the integral equation (\ref{IE6}) is ill-posed.} \citep{Whitney2003} introduced nonparametric compact restriction using Sobolev norm \citep{Gallant1987}. 
Let $\displaystyle \mathfrak{g}(X,{\boldsymbol W})=\int_{\Omega_{\boldsymbol W}}\int_{\Omega_X}\{\mathbbm{1}_{X\leq x,{\boldsymbol W}={\boldsymbol w}}-\mathbb{E}[\mathbbm{1}_{X\leq x,{\boldsymbol W}={\boldsymbol w}}|Z=z_0]\}g(x,{\boldsymbol w})dxd{\boldsymbol w}$, and define the regularized Sobolev norm $\tilde{W}^{l,2}$, which is called the ``consistency norm'' by \citet{Gallant1987}, as follows
\begin{equation}
\begin{aligned}
\label{RES1}
    &\left\|\mathfrak{g}(x,{\boldsymbol w})\right\|_{\tilde{W}^{l,2}}^2=\sum_{|\lambda|\leq l} \int \left\{D^{\lambda}\mathfrak{g}(x,{\boldsymbol w})\right\}^2\\
    &\hspace{1.5cm}\times \{1+(x,{\boldsymbol w}^T)(x,{\boldsymbol w}^T)^T\}^{\kappa}  dxd{\boldsymbol w},  
    \end{aligned}
\end{equation}
where $l\ge 1$ is an integer and $\kappa$ is a constant satisfying $\kappa>(1+d)/2$ where $d$ is the dimension of ${\boldsymbol W}$.
We make  the following assumption:
\begin{assumption}
\label{COM}
Given a positive regularization parameter $B_S$, $g_0(x,{\boldsymbol w})$ is in the functional space ${\cal G}_{B_S}=\{g:\|\mathfrak{g}(x,{\boldsymbol w})\|_{\tilde{W}^{l,2}}^2 \leq B_S\}$.    
\end{assumption}

%The relationship ${\boldsymbol \pi}_B \subset \{{\boldsymbol \pi}:{\boldsymbol \beta}^T{\boldsymbol \beta}\leq B_{\beta},\|\mathfrak{g}_0(x,{\boldsymbol w})\|_{\tilde{W}^{l,2}}^2 \leq B_{\gamma}\}$ holds for all $m=1,\ldots,M$.

Using the approximation in (\ref{eq-orth-basis}), equation (\ref{IE6}) reduces to 

\vspace{-0.6cm}
%\footnotesize
\begin{align}
%\mu(z)=\sum_{j=1}^J\beta_{j}\int_{\Omega_{\boldsymbol W}}\int_{\Omega_X}k(z,x,{\boldsymbol w})\phi_j(x,{\boldsymbol w}) dxd{\boldsymbol w}+\sum_{k=1}^K\gamma_{k}\int_{\Omega_{\boldsymbol W}}\int_{\Omega_X}k(z,x,{\boldsymbol w})\sigma_k(x,{\boldsymbol w}) dxd{\boldsymbol w}
\mu(z)=\sum_{j=1}^J\beta_{j}\int_{\Omega_{\boldsymbol W}}\int_{\Omega_X}k(z,x,{\boldsymbol w})\phi_j(x,{\boldsymbol w}) dxd{\boldsymbol w}.
\end{align}
\normalsize
Let the anti-derivative of the basis functions be $\displaystyle \varphi_j(x,{\boldsymbol w})=\int \phi_j(x,{\boldsymbol w})dx$.% for $j=1,\ldots,J$.
\footnote{We will simply write the antiderivative $\displaystyle \varphi_j(x,{\boldsymbol w})=\int_{-\infty}^x \phi_j(x',{\boldsymbol w})dx'$ as $\varphi_j(x,{\boldsymbol w})=\int \phi_j(x,{\boldsymbol w})dx$ in the paper.} % because the constant of integration is irrelevant since we take the difference between the antiderivatives.}  % where constant of integration is $0$.
Then, the equation becomes
\begin{equation}
\label{EQ7}
\begin{aligned}
&\mathbb{E}[Y|Z=z]-\mathbb{E}[Y|Z=z_0]=\\
&\sum_{j=1}^J\beta_{j}\{\mathbb{E}[\varphi_j(X,{\boldsymbol W})|Z=z]-\mathbb{E}[\varphi_j(X,{\boldsymbol W})|Z=z_0]\}%\\
%&&\hspace{4cm}+\sum_{k=1}^K\gamma_{k}\{\mathbb{E}[\pi_k(X,{\boldsymbol W})|Z=z]-\mathbb{E}[\pi_k(X,{\boldsymbol W})|Z=z_0]\}.
\end{aligned}
\end{equation}
%Next, we show that the estimation problem reduces to a linear equation. 
Let 
%\begin{eqnarray}
%\begin{array}{l}
${c}=\mathbb{E}[Y|Z=z]-\mathbb{E}[Y|Z=z_0]$, ${\boldsymbol \beta} =(\beta_1,\ldots,\beta_J)^T$, 
and ${\boldsymbol d}=({d}^{1},\ldots,{d}^{J})^T$ where 
${d}^{j}=\mathbb{E}[\varphi_j(X,{\boldsymbol W})|Z=z]-\mathbb{E}[\varphi_j(X,{\boldsymbol W})|Z=z_0]$. %and ${e}^{k}=\mathbb{E}[\pi_k(X,{\boldsymbol W})|Z=z]-\mathbb{E}[\pi_k(X,{\boldsymbol W})|Z=z_0]$
%\end{array}
%\end{eqnarray}
%for $j=1,\ldots,J$. %, $k=1,\ldots,K$ and $m=1,\ldots,M$.
%Furthermore, denote %${\boldsymbol f}=({d}^{m,1},\ldots,{d}^{j},{e}^{m,1},\ldots,{e}^{k})^T$
%${\boldsymbol d}=({d}^{1},\ldots,{d}^{J})^T$.
Then, the integral equation (\ref{IE6}) finally reduces to a linear equation $c={\boldsymbol \beta}^T{\boldsymbol d}$.



{\bf Sieve CAPCE (S-CAPCE) estimator.} Given datasets ${\cal D}^{(1)} = \{x^{(1)}_i,{\boldsymbol w}^{(1)}_i,z^{(1)}_i\}_{i=1}^{N_1}$ and ${\cal D}^{(2)} = \{y^{(2)}_i,z^{(2)}_i\}_{i=1}^{N_2}$, 
%We explain CAPCE estimation method from observations. 
%We do not need single dataset ${\cal D} = \{y_i,x_i,{\boldsymbol w}_i,z_i\}_{i=1}^N$, but rather two datasets ${\cal D}^{(1)} = \{x^{(1)}_i,{\boldsymbol w}^{(1)}_i,z^{(1)}_i\}_{i=1}^{N_1}$ and ${\cal D}^{(2)} = \{y^{(2)}_i,z^{(2)}_i\}_{i=1}^{N_2}$ as two-samples IV estimator \citep{Singh2019,Angrist1992}.
%, where $z^{(1)}_i=(z^{(1)}_1,\ldots,z^{(1)}_1)$.
%let $N=N_1+N_2$ and  $(z_1,\ldots,z_N)=(z_1^{(1)},\ldots,z_{N_1}^{(1)},z_1^{(2)},\ldots,z_{N_2}^{(2)})$. \\ %for $m=1,\ldots,M$, 
our S-CAPCE estimator consists of two stages. 
{In Stage 1, we learn models $\hat{\mathbb{E}}[Y|Z=z]$ and $\hat{\mathbb{E}}[\varphi_j(X,{\boldsymbol W})|Z=z]$  from the datasets by regression. 
Then in Stage 2, we estimate parameters ${\boldsymbol \beta}$ by solving Eq. (\ref{EQ7})}.
%$\tildex,{\boldsymbol w}=\hat{\Sigma}^{1/2}(x,{\boldsymbol w}-\overlinex,{\boldsymbol w})$, where $\hat{\Sigma}$ and $\overlinex,{\boldsymbol w}$ are the variance and mean of $x,{\boldsymbol w}$.
%There are four steps in Algorithm \ref{alg1}.



\noindent{\bf Stage 1.} We learn  prediction models %based on covariates 
$\hat{\mathbb{E}}[Y|Z=z]$
%for $m=1,\ldots,M$ 
using ${\cal D}^{(2)}$ and
%Explanatory variables are only IV, $Z$, and response variables are the outcome, $Y$.
%create prediction models 
$\hat{\mathbb{E}}[\varphi_j(X,{\boldsymbol W})|Z=z]$ for $j=1,\ldots,J$
%and $m=1,\ldots,M$, and $\hat{\mathbb{E}}[\varsigma_k(X,{\boldsymbol W})|Z=z]$ for $k=1,\ldots,K$ and $m=1,\ldots,M$
using ${\cal D}^{(1)}$. Any regression method can be used. We select an IV value $z_0$. 
Denote $\hat{c}_i=\hat{\mathbb{E}}[Y|Z=z_i]-\hat{\mathbb{E}}[Y|Z=z_0]$ and $\hat{d}_i^{j}=\hat{\mathbb{E}}[\varphi_j(X,{\boldsymbol W})|Z=z_i]-\hat{\mathbb{E}}[\varphi_j(X,{\boldsymbol W})|Z=z_0]$. % for $i=1,\ldots,N$ and $j=1,\ldots,J$.
%for each IV $Z \in \{Z^1,\ldots,Z^M\}$.
%We can use any machine learning method to estimate conditional expectations.

Specifically, we perform the regression using the power series basis functions in this paper. %\yuta{such as power series or splines}. \jin{What basis functions are actually used?}
%For each IV $Z \in \{Z^1,\ldots,Z^M\}$, 
Let basis functions be ${\boldsymbol q}(z)=(q_1(z),q_2(z),\ldots,q_P(z))^T$, and consider the model $\hat{\mathbb{E}}[Y|Z=z]=\sum_{p=1}^P \omega_p q_p(z)$, $\hat{\mathbb{E}}[\varphi_j(X,{\boldsymbol W})|Z=z]=\sum_{p=1}^P \nu_p^j q_p(z)$ for $j=1,\ldots,J$. Denote ${\boldsymbol \omega}=(\omega_1,\ldots,\omega_P)^T$ and  ${\boldsymbol \nu}^j=(\nu_1^j,\ldots,\nu_P^j)^T$.
Then, we minimize the error functions below:
\begin{equation}
\begin{aligned}
    &Q_1({\boldsymbol \nu}^j;{\cal D}^{(1)})\\
    &\hspace{0.5cm}=\frac{1}{N_1}\sum_{i=1}^{N_1}(\varphi_j(x_i^{(1)},{\boldsymbol w}^{(1)}_i)-{\boldsymbol q}(z_i^{(1)})^T{\boldsymbol \nu}^j)^2,
    \end{aligned}
\end{equation}
\begin{equation}
\begin{aligned}
    &Q_2({\boldsymbol \omega};{\cal D}^{(2)})=\frac{1}{N_2}\sum_{i=1}^{N_2}(y_i^{(2)}-{\boldsymbol q}(z_i^{(2)})^T{\boldsymbol \omega})^2.
\end{aligned}
\end{equation}
Let variance-covariance matrices be $\hat{\bf M}^{(1)}=\sum_{i=1}^{N_1} N_1^{-1}{\boldsymbol q}(z^{(1)}_i){\boldsymbol q}(z^{(1)}_i)^T$ and  $\hat{\bf M}^{(2)}=\sum_{i=1}^{N_2} N_2^{-1}{\boldsymbol q}(z^{(2)}_i){\boldsymbol q}(z^{(2)}_i)^T$. 
%for $m=1,\ldots,M$.
We obtain %the following prediction values %$\hat{c}_i=\hat{\mathbb{E}}[Y|Z=z_i]-\hat{\mathbb{E}}[Y|Z=z_0]$, $\hat{d}_i^{j}=\hat{\mathbb{E}}[\varphi_j(X,{\boldsymbol W})|Z=z_i]-\hat{\mathbb{E}}[\varphi_j(X,{\boldsymbol W})|Z=z_0]$ for $i=1,\ldots,N$ and $j=1,\ldots,J$.
%and $\hat{e}_i^{k}=\hat{\mathbb{E}}[\varsigma_k(X,{\boldsymbol W})|Z=z_i]-\hat{\mathbb{E}}[\varsigma_k(X,{\boldsymbol W})|Z=z_0]$ for $i=1,\ldots,N$, $m=1,\ldots,M$, $j=1,\ldots,J$, and $k=1,\ldots,L$. These values can be calculated as below:
%\small
\begin{equation}
\label{eq-pred}
\left\{
\begin{array}{l}
\renewcommand{\arraystretch}{1}
    %&&c_i=\hat{\mathbb{E}}[Y|Z=z_i]-\hat{\mathbb{E}}[Y|Z=z_0]=({\boldsymbol q}(z_i)-{\boldsymbol q}(z_0))^T\hat{\bf M}^{-}\sum_{l=1}^N \frac{1}{N} {\boldsymbol q}(z_l)y_l\\
    \hat{c}_i%=\hat{\mathbb{E}}[Y|Z=z_i]-\hat{\mathbb{E}}[Y|Z=z_0]
    =({\boldsymbol q}(z_i)-{\boldsymbol q}(z_0))^T\hat{\bf M}^{(2)-}\sum_{l=1}^{N_2} \frac{1}{N_2} {\boldsymbol q}(z^{(2)}_l)y^{(2)}_l\\
    \hat{d}_i^{j}%=\hat{\mathbb{E}}[\varphi_j(X,{\boldsymbol W})|Z=z_i]-\hat{\mathbb{E}}[\varphi_j(X,{\boldsymbol W})|Z=z_0]
    =({\boldsymbol q}(z_i)-{\boldsymbol q}(z_0))^T\hat{\bf M}^{(1)-}\\
    \hspace{2cm}\times\sum_{l=1}^{N_1} \frac{1}{N_1} {\boldsymbol q}(z^{(1)}_l)\varphi_j(x^{(1)}_l,{\boldsymbol w}^{(1)}_l)\\
    %\hat{e}_i^{k}%=\hat{\mathbb{E}}[\varsigma_k(X,{\boldsymbol W})|Z=z_i]-\hat{\mathbb{E}}[\varsigma_k(X,{\boldsymbol W})|Z=z_0]
    %=({\boldsymbol q}(z_i)-{\boldsymbol q}(z_0))^T\hat{\bf M}^{(1)-}\sum_{l=1}^{N_1} \frac{1}{N_1} {\boldsymbol q}(z^{(1)}_l)\varsigma_k(x^{(1)}_l,{\boldsymbol w}^{(1)}_l)\nonumber
\end{array}
\right.
\end{equation}
for $j=1,\ldots,J$,
%, $m=1,\ldots,M$, $j=1,\ldots,J$, and $k=1,\ldots,L$
where $\hat{\bf M}^{-}$ denotes the generalized inverse that satisfies $\hat{\bf M}\hat{\bf M}^{-}\hat{\bf M}=\hat{\bf M}$.
{Let $N=N_1+N_2$ and  $(z_1,\ldots,z_N)=(z_1^{(1)},\ldots,z_{N_1}^{(1)},z_1^{(2)},\ldots,z_{N_2}^{(2)})$. We will compute predicted values in (\ref{eq-pred})  for all $i=1,\ldots,N$. }


\noindent{\bf Stage 2.} 
%Consider multiple IVs $Z=(Z^1,\ldots,Z^M)$.
Estimate parameters ${\boldsymbol \beta}$ based on the  linear equation $c={\boldsymbol \beta}^T{\boldsymbol d}$. 
 Let $\hat{\boldsymbol c}=(\hat{c}_1,\ldots,\hat{c}_N)^T$, $\hat{\boldsymbol d}_i=(\hat{d}^{1}_i,\ldots,\hat{d}^{J}_i)^T$, 
%and $\hat{\boldsymbol e}^k_i=(\hat{e}^{1,j}_i,\ldots,\hat{e}^{j}_i)^T$.
$\hat{\bf D}=(\hat{\boldsymbol d}_1,\ldots,\hat{\boldsymbol d}_N)^T$, and
%$\hat{\bf E}_i=(\hat{\boldsymbol e}^{1}_i,\ldots,\hat{\boldsymbol e}^{K}_i)$, and $\hat{\bf F}_i=(\hat{\bf D}_i,\hat{\bf E}_i)$.\\
%We denote ${\boldsymbol \delta}=({\boldsymbol \beta}^T,{\boldsymbol \gamma}^T)^T$ and 
the empirical risk be 
\begin{eqnarray}
\label{Q1}
    {Q}_3({\boldsymbol \beta}\ ;{\cal D}^{(1)},{\cal D}^{(2)})=\frac{1}{N}\sum_{i=1}^N(\hat{c}_i-\hat{\boldsymbol d}_i^T{\boldsymbol \beta})^2.
\end{eqnarray}
Under Assumption \ref{COM}, our estimator $\hat{\boldsymbol \beta}$ is given  by the optimization problem below: 
\begin{eqnarray}
\label{OPT1}
    \min_{\boldsymbol \beta}{Q}_3({\boldsymbol \beta}\ ;{\cal D}^{(1)},{\cal D}^{(2)})\text{ subject to } {\boldsymbol \beta}^T{\boldsymbol \Lambda}{\boldsymbol \beta}\leq B_S,
\end{eqnarray}
where %\jin{Is $\Lambda$ something easy to compute? Do you need to describe how to compute $\Lambda$?}
%\vspace{-0.8cm}
\begin{equation}
\begin{aligned}
    &{\Lambda}_{i,j}=\sum_{|\lambda|\leq l} \int \left\{D^{\lambda}{\varphi}_i(x,{\boldsymbol w})-D^{\lambda}\mathbb{E}[{\varphi}_i(X,{\boldsymbol W})|Z=z_0]\right\}\\    &\hspace{0.8cm}\times\left\{D^{\lambda}{\varphi}_j(x,{\boldsymbol w})-D^{\lambda}\mathbb{E}[{\varphi}_j(X,{\boldsymbol W})|Z=z_0]\right\}\\
    &\hspace{3cm}\times\{1+\|(x,{\boldsymbol w}^T)\|^2\}^{\kappa}dxd{\boldsymbol w}
\end{aligned}
\end{equation}
\noindent for $i,j=1,\ldots,J$, and ${\boldsymbol \Lambda}=\{{\Lambda}_{i,j}\}_{i,j=1}^{J}$.
%and ${\boldsymbol \varphi}(x,{\boldsymbol w})=(\varphi_1(x,{\boldsymbol w}),\ldots,\varphi_J(x,{\boldsymbol w}))^T$.
${\boldsymbol \Lambda}$ can be approximated by a Monte Carlo integration estimate $\hat{\boldsymbol \Lambda}$ \citep{Kroese2011}. %if ${\Lambda}$ is hard to calculate directly.}\jin{How was $\Lambda$ computed in the experiments?}
%The regularization restriction is defined in \citep{Whitney2003}, which are used to impose compactness of ${\boldsymbol \pi}$.
%$\hat{\bf A}_3$ is a positive definite matrix and is involved in the estimator's variance. 
%In this paper, we do not discuss it. 
%\jin{You must specify what $\hat{\bf A}$ is.}
%\yuta{We follow the procedure proposed in \citep{Ai2003} to determine $\hat{\bf A}_3$; (i) compute identity weighted estimator using ${Q}_3({\boldsymbol \beta};{\cal D})=\sum_{i=1}^N \frac{1}{N}[\hat{\boldsymbol c}_i-\hat{\bf D}_i{\boldsymbol \beta}]^T[\hat{\boldsymbol c}_i-\hat{\bf D}_i{\boldsymbol \beta}]$; (ii) compute the $(i,j)$-th element of $\hat{\bf A}_{3}$ by the covariance of residuals given $Z=z_i$ and residuals given $Z=z_j$ for $i,j=1,\ldots,N$.}
The optimization problem (\ref{OPT1}) can be solved by a ridge regression method with the following solution {\citep{Hilt1977}}: 
%The optimization (\ref{OPT1}) have a ridge regression form when $\hat{\bf A}_3$ is an identity matrix \yuta{\citep{Whitney2003}}.
%and ${\bf S}$ be a diagonal matrix whose diagonal elements are $\lambda\text{diag}[{\boldsymbol \Lambda}]$, where ${\boldsymbol 1}_J$ is $J$-th vector $(1,1,\ldots,1)$ and $\text{diag}[{\boldsymbol \Lambda}]$ is a diagonal vector of ${\boldsymbol \Lambda}$. 
%Then, the estimator is 
%\jin{Are you assuming $\hat{\bf A}$ is an identity matrix here? Are you saying (10) is a ridge regression problem? Citation for the following conclusion? }
\begin{eqnarray}
    \hat{\boldsymbol \beta}=(\hat{\bf D}^T\hat{\bf D}+\zeta_S\text{diag}[{\boldsymbol \Lambda}])^{-1}\hat{\bf D}^T\hat{\boldsymbol c}, 
\end{eqnarray}
where  $\zeta_S$ is a regularization parameter
(the Lagrange multiplier).  
%This is a ridge regression form. \jin{Where does this $\beta$ come from? No clue what you are talking about here.}
%Then, estimator of the parameter $\hat{\boldsymbol \pi}$ is given as $(\hat{\boldsymbol \beta},g(\hat{\boldsymbol \gamma}))$. \jin{What is $g(\gamma)$?}
Then, our proposed sieve CAPCE estimator  is given by $\displaystyle \hat{\mathbb{E}}[\partial_x{Y}_{x}|{\boldsymbol w}]=\sum_{j=1}^{J}\hat{\beta}_{j} \phi_j(x,{\boldsymbol w})$.


{\bf Model Selection.} %Finally, we explain the performance metric of the trained models by Algorithm \ref{alg1}. We have the trained parameters $\hat{\boldsymbol \pi}$ and available test observations ${\cal D'}=\{z'^{(i)},x'^{(i)},y'^{(i)}\}_{i=1}^{N'}$.
The model selection in Stage 1 is a standard regression problem, and we presume the models in Stage 1 have been selected appropriately according to standard machine learning methods. 
We can use the empirical risk in equation~(\ref{Q1}) as a performance metric of the trained model in Stage 2 with parameters $\hat{\boldsymbol \beta}$ if given  separate test datasets %${\cal D'}=\{x'_i,y'_i,z'_i,{\boldsymbol w}'_i\}_{i=1}^{N'}$ or 
${\cal D}^{(1)'}=\{x^{(1)'}_i,z^{(1)'}_i,{\boldsymbol w}^{(1)'}_i\}_{i=1}^{N_1'}$  and ${\cal D}^{(2)'}=\{y^{(2)'}_i,z^{(2)'}_i\}_{i=1}^{N_2'}$.
Let $N'= N_1' + N_2'$. Assume $\hat{c}_i'$ and $\hat{\boldsymbol d}_i'$ for $i=1,\ldots,N'$ are computed using ${\cal D}^{(1)'}$ and ${\cal D}^{(2)'}$.
%By substituting ${\cal D}$ to ${\cal D'}$, we obtain $\hat{\boldsymbol H}'$ and $\hat{\bf D}'$.
Then, we can evaluate the trained model by the  test error  % mean squared error (MSE):
%\begin{eqnarray}
%\label{Q3}
$\displaystyle \hat{Q}_3(\hat{\boldsymbol \beta}\ ;{\cal D}^{(1)'},{\cal D}^{(2)'})=\frac{1}{N'}\sum_{i=1}^{N'}(\hat{c}'_i-\hat{\boldsymbol d}_i^{'T}\hat{\boldsymbol \beta})^2$. 
%\end{eqnarray}
Given separate datasets, this performance metric can  be used for model selection from various candidate  basis functions or the number $J$ or $P$ of basis terms.



{\bf Property of sieve CAPCE estimator.} 
We show that the sieve CAPCE estimator is consistent under assumptions similar to those of sieve NTSLS \citep{Whitney2003}.
%\jin{citation?} %, shown in Appendix.
%The rate of convergence is given under similar assumptions in \citep{Xiaohong2018}, shown in Appendix.
Assumptions B.1--B.4 are given in Appendix~\ref{appB}. 

\begin{theorem}[Consistency]
\label{STHEO1}
    Under SCM ${\cal M}_{IV}$ and Assumptions \ref{AS1}, \ref{AS2},  \ref{B1}, \ref{AS3}, \ref{COM}, \ref{A1}, \ref{A2}, \ref{A3}, and \ref{A6}, 
    %B.1, B.2, B.3, and B.4,
    letting $P \rightarrow \infty$ and $J\rightarrow \infty$, then $\|\hat{g}-g_0\|_{W^{l,\infty}}\xrightarrow{p} 0$.
\end{theorem}
\begin{theorem}[Rate of Convergence]
\label{STHEO2}
    Under SCM ${\cal M}_{IV}$ and Assumptions \ref{AS1}, \ref{AS2},  \ref{B1}, \ref{AS3}, \ref{COM}, \ref{RA1}, \ref{RA2}, \ref{RA4}, \ref{RA5}, \ref{RA6}, \ref{RA7}, and \ref{RA8},
    %C.1, C.2, C.3, C.4, C.5, C.6, and C.7,
    setting $N=N_1=N_2$, then $\|\hat{g}-{g}_0\|_{A}={o}_p(N^{-1/4})$.
\end{theorem}
{Assumptions C.1--C.7 and the norm $\|\cdot\|_A$ are defined in Appendix~\ref{appC}.} %$o_p$ is order in probability notation \citep{Bishop1975}.

\subsection{Parametric CAPCE estimator}
Next, we develop a parametric CAPCE (P-CAPCE) estimator.
We consider the setting that the CAPCE $\mathbb{E}[\partial_x Y_{x}|{\boldsymbol w}]$ takes the form of  the following parametric model: 
\begin{equation}
 \mathbb{E}[\partial_x Y_{x}|{\boldsymbol w}]=   \sum_{k=1}^K\gamma_k\theta_k(x,{\boldsymbol w}), 
\end{equation}
where $\displaystyle\{\theta_k(x,{\boldsymbol w})\}_{k=1}^{K}$ are a set of known  functions, and  ${\boldsymbol \gamma}=(\gamma_1,\ldots,\gamma_K)^T$ are unknown model parameters to be estimated from data. 
%Let ${\boldsymbol \gamma}_0$ be the parameters which satisfies $\mathbb{E}[\partial_x Y_{x}|{\boldsymbol w}]=\sum_{k=1}^K\gamma_{0 k}\theta_k(x,{\boldsymbol w})$. \jin{Why are you introducing $\gamma_0$? The distinction between $\gamma$ and $\hat{\gamma}$ is not enough?}

The derivation of the P-CAPCE estimator is very similar to that of the sieve CAPCE estimator, so we skip the details in the following. Denote the anti-derivatives $\displaystyle \vartheta_k(x,{\boldsymbol w})=\int\theta_k(x,{\boldsymbol w})dx$ for $k=1,\ldots,K$. 
%The integral equation (\ref{IE6}) becomes 
%\begin{eqnarray}
%\mathbb{E}[Y|Z=z]-\mathbb{E}[Y|Z=z_0]=\sum_{k=1}^K\gamma_{k}\{\mathbb{E}[\vartheta_k(X,{\boldsymbol W})|Z=z]-\mathbb{E}[\vartheta_k(X,{\boldsymbol W})|Z=z_0]\}%\nonumber\\
%&&\hspace{4cm}+\sum_{k=1}^K\gamma_{k}\{\mathbb{E}[\pi_k(X,{\boldsymbol W})|Z=z]-\mathbb{E}[\pi_k(X,{\boldsymbol W})|Z=z_0]\}.
%\end{eqnarray}
%Next, we show that the estimation problem reduces to a linear equation. 
Let 
%\begin{eqnarray}
%\begin{array}{l}
${c}=\mathbb{E}[Y|Z=z]-\mathbb{E}[Y|Z=z_0]$, 
and ${\boldsymbol e}=({e}^{1},\ldots,{e}^{K})^T$ where 
${e}^{k}=\mathbb{E}[\vartheta_k(X,{\boldsymbol W})|Z=z]-\mathbb{E}[\vartheta_k(X,{\boldsymbol W})|Z=z_0]$. %and ${e}^{k}=\mathbb{E}[\pi_k(X,{\boldsymbol W})|Z=z]-\mathbb{E}[\pi_k(X,{\boldsymbol W})|Z=z_0]$
%\end{array}
%\end{eqnarray}
%for $j=1,\ldots,J$. %, $k=1,\ldots,K$ and $m=1,\ldots,M$.
%Furthermore, denote %${\boldsymbol f}=({d}^{m,1},\ldots,{d}^{j},{e}^{m,1},\ldots,{e}^{k})^T$
%${\boldsymbol d}=({d}^{1},\ldots,{d}^{J})^T$.
Then, %the integral 
equation (\ref{IE6})  reduces to a linear equation $c={\boldsymbol \gamma}^T{\boldsymbol e}$.



{\bf P-CAPCE estimator.} Given datasets ${\cal D}^{(1)} = \{x^{(1)}_i,{\boldsymbol w}^{(1)}_i,z^{(1)}_i\}_{i=1}^{N_1}$ and ${\cal D}^{(2)} = \{y^{(2)}_i,z^{(2)}_i\}_{i=1}^{N_2}$, our P-CAPCE estimator consists of two stages. 



\noindent{\bf Stage 1.} %We learn prediction models $\hat{\mathbb{E}}[Y|Z=z]$ using ${\cal D}^{(2)}$ and $\hat{\mathbb{E}}[\vartheta_k(X,{\boldsymbol W})|Z=z]$ for $k=1,\ldots,K$ using ${\cal D}^{(1)}$.
%We perform the regression using the power series basis functions. 
Let basis functions be ${\boldsymbol q}(z)=(q_1(z),q_2(z),\ldots,q_P(z))^T$. % and consider the model $\hat{\mathbb{E}}[Y|Z=z]=\sum_{p=1}^P \omega_p q_p(z)$, $\hat{\mathbb{E}}[\vartheta_k(X,{\boldsymbol W})|Z=z]=\sum_{p=1}^P \nu_p^k q_p(z)$ for $k=1,\ldots,K$. Denote ${\boldsymbol \omega}=(\omega_1,\ldots,\omega_P)^T$ and  ${\boldsymbol \nu}^k=(\nu_1^k,\ldots,\nu_P^k)^T$.
%We select an IV value $z_0$. 
Denote $\hat{c}_i=\hat{\mathbb{E}}[Y|Z=z_i]-\hat{\mathbb{E}}[Y|Z=z_0]$ and $\hat{e}_i^{k}=\hat{\mathbb{E}}[\vartheta_k(X,{\boldsymbol W})|Z=z_i]-\hat{\mathbb{E}}[\vartheta_k(X,{\boldsymbol W})|Z=z_0]$. 
Let variance-covariance matrices be $\hat{\bf M}^{(1)}=\sum_{i=1}^{N_1} N_1^{-1}{\boldsymbol q}(z^{(1)}_i){\boldsymbol q}(z^{(1)}_i)^T$ and  $\hat{\bf M}^{(2)}=\sum_{i=1}^{N_2} N_2^{-1}{\boldsymbol q}(z^{(2)}_i){\boldsymbol q}(z^{(2)}_i)^T$.
%for $m=1,\ldots,M$.
We obtain the following predicted values %$\hat{c}_i=\hat{\mathbb{E}}[Y|Z=z_i]-\hat{\mathbb{E}}[Y|Z=z_0]$, $\hat{e}_i^{k}=\hat{\mathbb{E}}[\vartheta_k(X,{\boldsymbol W})|Z=z_i]-\hat{\mathbb{E}}[\vartheta_k(X,{\boldsymbol W})|Z=z_0]$ for $i=1,\ldots,N$ and $k=1,\ldots,K$.
%and $\hat{e}_i^{k}=\hat{\mathbb{E}}[\varsigma_k(X,{\boldsymbol W})|Z=z_i]-\hat{\mathbb{E}}[\varsigma_k(X,{\boldsymbol W})|Z=z_0]$ for $i=1,\ldots,N$, $m=1,\ldots,M$, $j=1,\ldots,J$, and $k=1,\ldots,L$. These values can be calculated as below:
\begin{equation}
\label{eq-pred2}
\left\{
\begin{array}{l}
%\renewcommand{\arraystretch}{1}
    %&&c_i=\hat{\mathbb{E}}[Y|Z=z_i]-\hat{\mathbb{E}}[Y|Z=z_0]=({\boldsymbol q}(z_i)-{\boldsymbol q}(z_0))^T\hat{\bf M}^{-}\sum_{l=1}^N \frac{1}{N} {\boldsymbol q}(z_l)y_l\\
    \hat{c}_i%=\hat{\mathbb{E}}[Y|Z=z_i]-\hat{\mathbb{E}}[Y|Z=z_0]
    =({\boldsymbol q}(z_i)-{\boldsymbol q}(z_0))^T\hat{\bf M}^{(2)-}\sum_{l=1}^{N_2} \frac{1}{N_2} {\boldsymbol q}(z^{(2)}_l)y^{(2)}_l\\
    \hat{e}_i^{k}%=\hat{\mathbb{E}}[\varphi_j(X,{\boldsymbol W})|Z=z_i]-\hat{\mathbb{E}}[\varphi_j(X,{\boldsymbol W})|Z=z_0]
    =({\boldsymbol q}(z_i)-{\boldsymbol q}(z_0))^T\hat{\bf M}^{(1)-}\\
 \hspace{2cm}\times\sum_{l=1}^{N_1} \frac{1}{N_1} {\boldsymbol q}(z^{(1)}_l)\vartheta_k(x^{(1)}_l,{\boldsymbol w}^{(1)}_l)\\
    %\hat{e}_i^{k}%=\hat{\mathbb{E}}[\varsigma_k(X,{\boldsymbol W})|Z=z_i]-\hat{\mathbb{E}}[\varsigma_k(X,{\boldsymbol W})|Z=z_0]
    %=({\boldsymbol q}(z_i)-{\boldsymbol q}(z_0))^T\hat{\bf M}^{(1)-}\sum_{l=1}^{N_1} \frac{1}{N_1} {\boldsymbol q}(z^{(1)}_l)\varsigma_k(x^{(1)}_l,{\boldsymbol w}^{(1)}_l)\nonumber
\end{array}
\right.
\end{equation}
for  $k=1,\ldots,K$. 
%, $m=1,\ldots,M$, $j=1,\ldots,J$, and $k=1,\ldots,L$,where $\hat{\bf M}^{-}$ denotes a generalized inverse, which satisfies $\hat{\bf M}\hat{\bf M}^{-}\hat{\bf M}=\hat{\bf M}$.
Let $N=N_1+N_2$ and  $(z_1,\ldots,z_N)=(z_1^{(1)},\ldots,z_{N_1}^{(1)},z_1^{(2)},\ldots,z_{N_2}^{(2)})$. We will compute predicted values in (\ref{eq-pred2}) for all $i=1,\ldots,N$. 


\noindent{\bf Stage 2.} 
%Consider multiple IVs $Z=(Z^1,\ldots,Z^M)$.
Estimate parameters ${\boldsymbol \gamma}$ based on the linear equation $c={\boldsymbol \gamma}^T{\boldsymbol e}$.
Let $\hat{\boldsymbol c}=(\hat{c}_1,\ldots,\hat{c}_{N})^T$, $\hat{\boldsymbol e}_i=(\hat{e}^{1}_i,\ldots,\hat{e}^{K}_i)^T$,
%and $\hat{\boldsymbol e}^k_i=(\hat{e}^{1,j}_i,\ldots,\hat{e}^{j}_i)^T$.
$\hat{\bf E}=(\hat{\boldsymbol e}_1,\ldots,\hat{\boldsymbol e}_N)^T$, and
%$\hat{\bf E}_i=(\hat{\boldsymbol e}^{1}_i,\ldots,\hat{\boldsymbol e}^{K}_i)$, and $\hat{\bf F}_i=(\hat{\bf D}_i,\hat{\bf E}_i)$.\\
%We denote ${\boldsymbol \delta}=({\boldsymbol \beta}^T,{\boldsymbol \gamma}^T)^T$ and 
the empirical risk be
\begin{eqnarray}
\label{Q-P}
    {Q}_4({\boldsymbol \gamma}\ ;{\cal D}^{(1)},{\cal D}^{(2)})=\sum_{i=1}^N \frac{1}{N}(\hat{c}_i-\hat{\boldsymbol e}_i^T{\boldsymbol \gamma})^2.
\end{eqnarray}
We make the following assumption:
\begin{assumption}
\label{COM2}
Given a positive regularization parameter $B_P$, ${\boldsymbol \gamma}$ satisfies ${\boldsymbol \gamma}^T{\boldsymbol \gamma} \leq B_P$.    
\end{assumption}
Under Assumption \ref{COM2}, our estimator $\hat{\boldsymbol \gamma}$ is given by the optimization problem below:
\begin{eqnarray}
\label{OPT-P}
    %\hat{\boldsymbol \gamma}=\arg
    \min_{\boldsymbol \gamma}{Q_4}({\boldsymbol \gamma}\ ;{\cal D}^{(1)},{\cal D}^{(2)})\text{ subject to } {\boldsymbol \gamma}^T{\boldsymbol \gamma}\leq B_P.
\end{eqnarray}
This problem can be solved by the ridge regression method with the following solution {\citep{Hilt1977}}: 
%and ${\bf S}$ be a diagonal matrix whose diagonal elements are $\lambda\text{diag}[{\boldsymbol \Lambda}]$, where ${\boldsymbol 1}_J$ is $J$-th vector $(1,1,\ldots,1)$ and $\text{diag}[{\boldsymbol \Lambda}]$ is a diagonal vector of ${\boldsymbol \Lambda}$. 
\begin{eqnarray}
    \hat{\boldsymbol \gamma}=(\hat{\bf E}^T\hat{\bf E}+\zeta_P{\bf I}_K)^{-1}\hat{\bf E}^T\hat{\boldsymbol c},
\end{eqnarray}
where $\zeta_P$ is a regularization parameter, and ${\bf I}_K$ is a $K \times K$ identity matrix. Then, our proposed P-CAPCE estimator  is given by $\displaystyle \hat{\mathbb{E}}[\partial_x{Y}_{x}|{\boldsymbol w}]=\sum_{k=1}^{K}\hat{\gamma}_{k} \theta_k(x,{\boldsymbol w})$.
%Then, estimator of the parameter $\hat{\boldsymbol \pi}$ is given as $(\hat{\boldsymbol \beta},g(\hat{\boldsymbol \gamma}))$. \jin{What is $g(\gamma)$?}



{\bf Model Selection.} %Finally, we explain the performance metric of the trained models by Algorithm \ref{alg1}. We have the trained parameters $\hat{\boldsymbol \pi}$ and available test observations ${\cal D'}=\{z'^{(i)},x'^{(i)},y'^{(i)}\}_{i=1}^{N'}$.
We presume the models in Stage 1 have been selected appropriately. We can use the empirical risk in equation~(\ref{Q-P}) as a performance metric of the trained model in Stage 2 with parameters $\hat{\boldsymbol \gamma}$ if given separate test datasets %${\cal D'}=\{x'_i,y'_i,z'_i,{\boldsymbol w}'_i\}_{i=1}^{N'}$ or 
${\cal D}^{(1)'}=\{x^{(1)'}_i,z^{(1)'}_i,{\boldsymbol w}^{(1)'}_i\}_{i=1}^{N_1'}$  and ${\cal D}^{(2)'}=\{y^{(2)'}_i,z^{(2)'}_i\}_{i=1}^{N_2'}$. 
Let $N'= N_1' + N_2'$. Assume $\hat{c}'_i$ and $\hat{\boldsymbol e}'_i$ for $i=1,\ldots,N'$ are computed using ${\cal D}^{(1)'}$ and ${\cal D}^{(2)'}$.
%By substituting ${\cal D}$ to ${\cal D'}$, we obtain $\hat{\boldsymbol H}'$ and $\hat{\bf D}'$.
Then, we can evaluate the trained model by the  test error %mean squared error (MSE):
%\begin{eqnarray}
%\label{Q2}
$\displaystyle \hat{Q}_4(\hat{\boldsymbol \gamma}\ ;{\cal D}^{(1)'},{\cal D}^{(2)'})=\frac{1}{N'}\sum_{i=1}^{N'} (\hat{c}'_i-\hat{\boldsymbol e}_i^{'T}\hat{\boldsymbol \gamma})^2$. 
%\end{eqnarray}
Given a separate dataset, this performance metric can be used for model selection from various candidate  basis functions or the number $K$ or $P$ of basis terms.



{\bf Properties of the P-CAPCE estimator.} 
We show that the P-CAPCE estimator is consistent and derive its rate of convergence.
\begin{theorem}[Consistency]
\label{PTHEO1}
    Under SCM ${\cal M}_{IV}$ and Assumptions \ref{AS1}, \ref{AS2},  \ref{COM2},
    \ref{PA1}, \ref{PA2}, \ref{PA3}, and \ref{PA6},
    %D.1, D.2, D.3, and D.4,
    letting $P \rightarrow \infty$, then $\|\hat{\boldsymbol \gamma}-{\boldsymbol \gamma}\|\xrightarrow{p} 0$.
\end{theorem}
\begin{theorem}[Rate of Convergence]
\label{PTHEO2}
   Under SCM ${\cal M}_{IV}$ and Assumptions \ref{AS1}, 
 \ref{AS2}, \ref{COM2},
 \ref{PRA1}, \ref{PRA2}, \ref{PRA4}, \ref{PRA5}, and \ref{PRA8},
 %E.1, E.2, E.3, E.4, and E.5,
 setting $N=N_1=N_2$, then $\|\hat{\boldsymbol \gamma}-{\boldsymbol \gamma}\|={o}_p(N^{-1/4})$.
\end{theorem}
%The rate of convergence is given under similar assumptions in \citep{Xiaohong2018}, shown in Appendix. %\jin{These appeared copied from the Sieve estimator. They needs updates.}
{Assumptions D.1--D.4 are given in Appendix~\ref{appD}, and Assumptions E.1--E.5 are given in Appendix~\ref{appE}.}


 
%We do not need Assumption \ref{B1}, \ref{AS3} and \ref{COM} for consistency since it is always satisfied when $K<\infty$. 
%We can estimate CAPCE under weaker assumptions than PTSLS \citep{Wooldridge2010}.





\subsection{RKHS CAPCE estimator}

Finally, we develop a reproducing kernel Hilbert space (RKHS) CAPCE estimator. 
RKHS models are popular and widely used in nonparametric regression \citep{Theodoridis2006,Scholkopf2013}. 

% $\hat{\mathbb{E}}[\partial_x{Y}_{x}|{\boldsymbol W}={\boldsymbol w}]$. % following  \citep{Singh2019}.
%We consider two datasets ${\cal D}^{(1)} = \{x^{(1)}_i,{\boldsymbol w}^{(1)}_i,z^{(1)}_i\}_{i=1}^{N_1}$ and ${\cal D}^{(2)} = \{y^{(2)}_i,z^{(2)}_i\}_{i=1}^{N_2}$.

{\bf RKHS model.} Let $k_{X,{\boldsymbol W}}: \Omega_{X,{\boldsymbol W}} \times \Omega_{X,{\boldsymbol W}} \rightarrow \mathbb{R}$ and $k_Z: \Omega_Z \times \Omega_Z \rightarrow \mathbb{R}$ be measurable positive definite kernels corresponding to RKHSs ${\cal H}_{X,{\boldsymbol W}}$ and ${\cal H}_Z$.
%, and denote $\left<\cdot,\cdot\right>_{{\cal H}_{X,{\boldsymbol W}}}$ and $\left<\cdot,\cdot\right>_{{\cal H}_{Z}}$ be their inner products. %\citep{Taylor2004}:
%\begin{definition}[Positive definitive kernels]
    A symmetric function $k: \Omega \times \Omega \rightarrow \mathbb{R}$ is called a positive-definite kernel if
    %\begin{eqnarray}
    $\displaystyle \sum_{i=1}^n\sum_{j=1}^n c_ic_j k({\boldsymbol a}_i,{\boldsymbol a}_j)\geq 0$
    %\end{eqnarray}
    for all ${\boldsymbol a}_1,\ldots,{\boldsymbol a}_n \in \Omega$ given any $n \in \mathbb{N}$ and $c_1,\ldots,c_n \in \mathbb{R}$ \citep{Taylor2004}.
%\end{definition}
Denote the feature map $\eta: \Omega_{X,{\boldsymbol W}}  \rightarrow {\cal H}_{X,{\boldsymbol W}}$, $(x,{\boldsymbol w}) \mapsto k'_{X,{\boldsymbol W}}(x,{\boldsymbol w},\cdot,\cdot)$ and $\psi: \Omega_Z  \rightarrow {\cal H}_Z$, $z \mapsto k_Z(z,\cdot).$
{In addition, we denote the antiderivative feature function $\pi: \Omega_{X,{\boldsymbol W}}  \rightarrow {\cal H}_{X,{\boldsymbol W}}, (x,{\boldsymbol w}) \mapsto k_{X,{\boldsymbol W}}(x,{\boldsymbol w},\cdot,\cdot)$ with $\displaystyle \pi(x,{\boldsymbol w})=-\int \eta(x,{\boldsymbol w})dx$ and the antiderivative kernel function $\displaystyle k_{X,{\boldsymbol W}}(x,{\boldsymbol w},x',{\boldsymbol w}')=\int k'_{X,{\boldsymbol W}}(x,{\boldsymbol w},x',{\boldsymbol w}')dxdx'$. 
%Denote the feature map
%\begin{eqnarray}
%$\pi: \Omega_{X,{\boldsymbol W}}  \rightarrow {\cal H}_{X,{\boldsymbol W}}, (x,{\boldsymbol w}) \mapsto k_{X,{\boldsymbol W}}(x,{\boldsymbol w},\cdot,\cdot)$ and $\psi: \Omega_Z  \rightarrow {\cal H}_Z, \boldsymbol z \mapsto k_Z(z,\cdot).$
Assume that the CAPCE takes the form 
%\jin{Do the following notation need to be changed given that you have used a different notation in describing the two stagtes?}
\begin{equation}
 \mathbb{E}[\partial_x Y_{x}|{\boldsymbol w}]= H(\pi(x,{\boldsymbol w}))
\end{equation}
for some operator $H\in {\cal L}_2({\cal H}_{X,{\boldsymbol W}},\Omega_Y)$, where ${\cal L}_2(\Omega_1,\Omega_2)$ is the ${\cal L}_2$ measurable function space from $\Omega_1$ to $\Omega_2$, and $H(\pi(x,{\boldsymbol w}))$ is a composition function $H \circ \pi: \Omega_{X,{\boldsymbol W}}  \rightarrow \Omega_Y$.}
%\yuta{The, the RKHS estimator is given by $\left<\hat{H},\pi(x,{\boldsymbol w}) \right>$ for $x \in \Omega_X$ and ${\boldsymbol w} \in {\boldsymbol W}$.}
%\end{eqnarray}
%\jin{The notation $\pi$ and $\pi$ have been used in the last section. Use different notation.}
%\begin{eqnarray}
%    {\bf K}_{Z^{(1)}Z^{(=1=
%    \left(
%    \begin{array}{ccc}
%     k_Z(z_1^{(1)},z_1^{(1)})    & \cdots & k_Z(z_1^{(1)},z_{N_1}^{(1)}) \\
%        \vdots & \ddots & \vdots\\
%    k_Z(z_{N_1}^{(1)},z_1^{(1)}) & \cdots & k_Z(z_{N_1}^{(1)},z_{N_1}^{(1)})
%    \end{array}
%    \right).
%\end{eqnarray}
Our RKHS CAPCE estimator consists of two stages (a detailed derivation is provided in Appendix \ref{appA2}). 
%\yuta{We introduce $G_1, G_2 \in {\cal H}_{Z}$ and $H \in {\cal H}_{X,{\boldsymbol W}}$ for learning.}\\

\noindent{\bf Stage 1.} We learn an operator $G_1\in {\cal L}_2({\cal H}_{Z},{\cal H}_{X,{\boldsymbol W}})$  
%\jin{why "linear" operator?} \yuta{[Comment: This is because it is define by the inner product $G_1(\psi(z))=<\psi(z),g_1>$ with some $g_1 \in {\cal H}_Z$.  It is linear about the input $\psi(z)$. But I will delete ``linear" since it is confusing.]}
that satisfies $\mathbb{E}[\pi(X,{\boldsymbol W})|Z=z]=G_1 (\psi(z))$, 
%\yuta{[Comment: $G_1 (\psi(z))$ is a composition function $G_1 \circ \psi: \Omega_Z  \rightarrow {\cal H}_{X,{\boldsymbol W}}$ of $\psi: \Omega_Z  \rightarrow {\cal H}_Z$ and $G_1:{\cal H}_{Z} \rightarrow {\cal H}_{X,{\boldsymbol W}}$.[]}
%where $\pi(X,{\boldsymbol W})=-\int_{-\infty}^X \pi(x,{\boldsymbol w})dx$.
%, and we do not specify the form of $\psi(x)$.\jin{Isn't $\pi()$  a function of X and W? "we don't specify $\psi$"???}
and learn an operator $G_2\in {\cal L}_2({\cal H}_{Z},\Omega_Y)$ that satisfies $\mathbb{E}[Y|Z=z]=G_2(\psi(z))$. % in the first stage.



\noindent{\bf Stage 2.} 
%Then, in the second stage,
We learn an operator $H \in {\cal L}_2({\cal H}_{X,{\boldsymbol W}},\Omega_Y)$ that satisfies $\hat{\mathbb{E}}[Y|Z=z]-\hat{\mathbb{E}}[Y|Z=z_0]=H(\hat{\mathbb{E}}[\pi(X,{\boldsymbol W})|Z=z]-\hat{\mathbb{E}}[\pi(X,{\boldsymbol W})|Z=z_0]) \Leftrightarrow \hat{G}_2(\psi(z)-\psi(z_0))=H(\hat{G}_1(\psi(z)-\psi(z_0)))$, where $\hat{G}_1$ and $\hat{G}_2$ are learned in Stage 1. 


We learn $\hat{G}_1$, $\hat{G}_2$, and $\hat{H}$ by the following optimization problems using datasets ${\cal D}^{(1)}$ and ${\cal D}^{(2)}$:
%\vspace{-0.6cm}
\begin{equation}
\begin{aligned}
    &\min_{G_1%\in {\cal L}_2({\cal H}_{Z},{\cal H}_{X,{\boldsymbol W}})
    } \frac{1}{N_1}\sum_{i=1}^{N_1}\left\|\pi(x_i^{(1)},{\boldsymbol w}_i^{(1)})-G_1(\psi(z_i^{(1)}))\right\|^2_{{\cal H}_{X,{\boldsymbol W}}}\\
    &\hspace{3.5cm}+\lambda_1\left\|G_1\right\|^2_{{\cal L}_2({\cal H}_Z,{\cal H}_{X,{\boldsymbol W}})},
  \end{aligned}
\end{equation}
\begin{equation}
\begin{aligned}
&\min_{G_2%\in {\cal L}_2({\cal H}_{Z},\Omega_Y)
} \frac{1}{N_2}\sum_{i=1}^{N_2}\left\|y_i^{(2)}-G_2(\psi(z_i^{(2)}))\right\|^2\\
&\hspace{4cm}+\lambda_2\left\|G_2\right\|^2_{{\cal L}_2({\cal H}_{Z},\Omega_Y)},
  \end{aligned}
\end{equation}
\begin{equation}
\begin{aligned}
  &\min_{H% \in {\cal L}_2(\Omega_Y,{\cal H}_{X,{\boldsymbol W}})
  } \frac{1}{N_2}\sum_{i=1}^{N_2}\Big\|\hat{G}_2(\psi(z_i^{(2)})-\psi(z_0))\\
  &\hspace{3cm}-H(\hat{G}_1( \psi(z_i^{(2)})-\psi(z_0)))\Big\|^2\\
  &\hspace{0.6cm}+\xi\left\|H\right\|^2_{{\cal L}_2({\cal H}_{X,{\boldsymbol W}},\Omega_Y)}+\lambda_3\left\|H\circ \hat{G}_1\right\|^2_{{\cal L}_2({\cal H}_Z,\Omega_Y)},
  \end{aligned}
\end{equation}
\noindent where $(\lambda_1,\lambda_2,\lambda_3,\xi)$ are regularization parameters. From the representer theorem \citep{Schlkopf2001}, the optimal $G_1$ exists in $\text{span}\{\psi(z_1^{(1)}),\ldots,\psi(z_{N_1}^{(1)})\}$, and the optimal $G_2$ and $H$ exist in $\text{span}\{\psi(z_1^{(2)}),\ldots,\psi(z_{N_2}^{(2)})\}$.
%\yuta{The, the RKHS estimator is given by $\left<\hat{H},\pi(x,{\boldsymbol w}) \right>$ for $x \in \Omega_X$ and ${\boldsymbol w} \in {\boldsymbol W}$.}


We denote %${\bf K}$ is a 
the Gram matrix ${\bf K}_{Z^{(1)}Z^{(1)}}=\{ k_Z(z_i^{(1)},z_j^{(1)}) \}_{i,j=1}^{N_1}$; 
${\bf K}_{Z^{(1)}z_0}$ is the $N_1 \times N_2$ matrix whose $(i,j)$ entry is $k_Z(z_i^{(1)},z_0)$ for every $j$; and ${\bf K}_{(X,{\boldsymbol W})^{(1)}(x,{\boldsymbol w})}$ is the $N_1$-dimensional vector $\{ k_{X,{\boldsymbol W}}(x_i^{(1)},{\boldsymbol w}_i^{(1)},x,{\boldsymbol w}) \}_{i=1}^{N_1}$. The remaining Gram matrices are defined analogously.
%We give a closed form of RKHS CAPCE estimator as below: 
%\begin{alg}
Then, the RKHS CAPCE estimator is given by 
\begin{align}\label{eq-rkhs}
\hat{\mathbb{E}}[\partial_x{Y}_{x}|{\boldsymbol w}]=\hat{\boldsymbol \alpha}^T{\bf K}_{(X,{\boldsymbol W})^{(1)}(x,{\boldsymbol w})}, 
\end{align} 
where
%\vspace{-0.8cm}
\begin{equation}\label{eq-rkhs-inverse}
\begin{aligned}
&\hat{\boldsymbol \alpha}=(\hat{\bf O}\hat{\bf O}^T+N_2\xi {\bf K}_{(X,{\boldsymbol W})^{(1)}(X,{\boldsymbol W})^{(1)}}+N_2\lambda_3 {\bf I}_{N_1})^{-1}\\
&\hspace{1cm}\times\hat{\bf O}\{{\boldsymbol y}^{(2)T}({\bf K}_{Z^{(2)}Z^{(2)}}+N_2\lambda_2 {\bf I}_{N_2})^{-1}\\
&\hspace{3cm}\times({\bf K}_{Z^{(2)}Z^{(2)}}-{\bf K}_{Z^{(2)}z_0})\},
\end{aligned}
\end{equation}
\begin{equation}
\begin{aligned}
&\hat{\bf O}={\bf K}_{(X,{\boldsymbol W})^{(1)}(X,{\boldsymbol W})^{(1)}}({\bf K}_{Z^{(1)}Z^{(1)}}+N_1\lambda_1 {\bf I}_{N_1})^{-1}\\
&\hspace{2.5cm}\times({\bf K}_{Z^{(1)}Z^{(2)}}-{\bf K}_{Z^{(1)}z_0}),
\end{aligned}
\end{equation}
 and ${\bf I}_N$ is the $N \times N$ identity matrix.
%\end{alg}

{\bf Model Selection.} 
We presume the models in Stage 1 have been selected appropriately, and introduce a model selection method in Stage 2 following \citet{Singh2019}.  
Assume we have separate datasets ${\cal D}^{(1)'} = \{x^{(1)'}_i,{\boldsymbol w}^{(1)'}_i,z^{(1)'}_i\}_{i=1}^{N'_1}$ and ${\cal D}^{(2)'} = \{y^{(2)'}_i,z^{(2)'}_i\}_{i=1}^{N'_2}$.
%Train stage 1 estimator $\lambda_1$ on stage 1 observations $\{x_i^{(1)},{\boldsymbol w}^{(1)}_i,z_i^{(1)}\}$ then select stage 1 regularization parameter value $\lambda_1^*$ to minimize out-of-sample loss, calculated from observations $\{x_i^{(2)},{\boldsymbol w}^{(2)}_i,z_i^{(2)}\}$. 
%Also, train stage 1 estimator $\lambda_2$ on stage 1 observations $\{x_i^{(2)},{\boldsymbol w}^{(2)}_i,z_i^{(2)}\}$ then select stage 1 regularization parameter value $\lambda_2^*$ to minimize out-of-sample loss, calculated from observations$\{x_i^{(1)},{\boldsymbol w}^{(1)}_i,z_i^{(1)}\}$. 
%Train stage 2 estimator $\hat{H}$ on stage 2 observations $\{y_i^{(2)},{\boldsymbol w}^{(2)}_i,z_i^{(2)}\}$ then select $\xi$ stage 2 regularization parameter value $\xi^*$ to minimize out-of-sample loss, calculated from observations $\{y_i^{(1)},{\boldsymbol w}^{(1)}_i,z_i^{(1)}\}$.
%We introduce the evaluation algorithm of RKHS CAPCE estimator below:
%\begin{alg}
%\label{ALG2}
We determine the optimal $\lambda_1^*$ by minimizing
\begin{equation}
\begin{aligned}
&L_1(\lambda_1)=\frac{1}{N'_1}\text{Trace}\Big[{\bf K}_{(X,{\boldsymbol W})^{(1)'}(X,{\boldsymbol W})^{(1)'}}\\
&\hspace{0cm}-2{\bf K}_{(X,{\boldsymbol W})^{(1)'}(X,{\boldsymbol W})^{(1)}}{\bf P}_1+{\bf P}_1^T {\bf K}_{(X,{\boldsymbol W})^{(1)}(X,{\boldsymbol W})^{(1)}}{\bf P}_1 \Big],
\end{aligned}
\end{equation}
where ${\bf P}_1=({\bf K}_{Z^{(1)}Z^{(1)}}+N_1\lambda_1{\bf I}_{N_1})^{-1}{\bf K}_{Z^{(1)}Z^{(1)'}}$. We determine the optimal $\lambda_2^*$ by minimizing
\begin{equation}
\begin{aligned}
&L_2(\lambda_2)=\frac{1}{N'_2}\text{Trace}\Big[{\boldsymbol y}^{(2)'}{\boldsymbol y}^{(2)'T}\\
&\hspace{1.5cm}-2{\boldsymbol y}^{(2)'}{\boldsymbol y}^{(2)T}{\bf P}_2+{\bf P}_2^T {\boldsymbol y}^{(2)}{\boldsymbol y}^{(2)T}{\bf P}_2 \Big],
\end{aligned}
\end{equation}
where %${\boldsymbol y}^{(2)'}=(y_1^{(2)'},\ldots,y_{N'_2}^{(2)'})^T$, ${\boldsymbol y}^{(2)}=(y_1^{(2)},\ldots,y_{N_2}^{(2)})^T$ and
${\bf P}_2=({\bf K}_{Z^{(2)}Z^{(2)}}+N_2\lambda_2{\bf I}_{N_2})^{-1}{\bf K}_{Z^{(2)}Z^{(2)'}}$.
Finally, we determine the optimal $\xi^*$ and $\lambda_3^*$ by minimizing the test error
%\begin{eqnarray}
$\displaystyle L(\lambda_3,\xi)=\frac{1}{N'_2}\sum_{i=1}^{N'_2}\|{\boldsymbol y}^{(2)'T}({\bf K}_{Z^{(2)'}Z^{(2)'}}+N'_2\lambda_2^* {\bf I}_{N'_2})^{-1}({\bf K}_{Z^{(2)'}Z^{(2)'}}-{\bf K}_{Z^{(2)'}z_0})-\hat{H}_{\lambda_3,\xi}(x_i^{(1)'},{\boldsymbol w}_i^{(1)'})\|^2$,
%\end{eqnarray}
where $\hat{H}_{\lambda_3,\xi}$ is learned with  $\lambda_1=\lambda_1^*$ and $\lambda_2=\lambda_2^*$ using ${\cal D}^{(1)}$  and ${\cal D}^{(2)}$.
%\end{alg}

{\bf Properties of RKHS CAPCE estimator.}  The RKHS CAPCE estimator requires ${\cal O}(N_1^3)+{\cal O}(N_2^3)$ time \citep{Saunders1998}.
%and ${\cal O}(N_1^2)+{\cal O}(N_2^2)$ memory. 
%When $\lambda_3$ is 0, the consistency of RKHS CAPCE estimator is guaranteed by similar assumptions in \citep{Singh2019}, shown in Appendix. \jin{Can you write a formal consistency theorem here?}
%Furthermore, the following theorem holds under Assumptions F.1-8 shown in Appendix F.
{We show that the RKHS CAPCE estimator is consistent under assumptions similar to those of Kernel IV \citep{Singh2019}. Assumptions F.1--F.8 are given in Appendix~\ref{appF}.}
\begin{theorem}[Consistency]
\label{RTEO1}
    Under SCM ${\cal M}_{IV}$ and Assumptions \ref{AS1}, \ref{AS2},  
    \ref{RAS1}, \ref{RAS2}, \ref{RAS3}, \ref{RAS4}, \ref{RAS5}, \ref{RAS6}, \ref{RAS7} and \ref{RAS8},
    %F.1, F.2, F.3, F.4, F.5, F.6, F.7, and F.8,
   the  RKHS CAPCE estimator in (\ref{eq-rkhs}) %$\hat{\mathbb{E}}[\partial_x{Y}_{x}|{\boldsymbol W}={\boldsymbol w}]=\hat{\boldsymbol \alpha}^T{\bf K}_{(X,{\boldsymbol W})^{(1)}(x,{\boldsymbol w})}$ 
    converges pointwise to CAPCE %$\mathbb{E}[\partial_xY_{x}|{\boldsymbol W}={\boldsymbol w}]$ 
    when $\lambda_3=0$.
\end{theorem}
When $\lambda_3=0$, inverting the matrix $\hat{\bf O}\hat{\bf O}^T+N_2 \xi  {\bf K}_{(X,{\boldsymbol W})^{(1)}(X,{\boldsymbol W})^{(1)}}$ in Eq.~(\ref{eq-rkhs-inverse}) can be numerically unstable. In practice, choosing $\lambda_3>0$ stabilizes the inversion at the cost of introducing bias, so $\lambda_3$ should be selected with the bias--variance trade-off in mind.
%Assumptions C.1 $\sim$ C.8 are shown in Appendix.
%\jin{is a pointwise consistent estimator of CAPCE (almost everywhere?) or converge pointwise to CAPCE (almost everywhere?)}