

\section{Consistency of Sieve CAPCE Estimator}
\label{appB}

In this section, we show that the sieve CAPCE estimator is consistent under
assumptions similar to those guaranteeing the consistency of sieve NTSLS \citep{Whitney2003}.


\subsection*{Notations}
We introduce the notations for the assumptions. % of Theorem 4.1.
%First, we note that the combination of sieve CAPCE estimation and parametric CAPCE estimation is possible, and give the consistency theorem simultaneously.
%We denote the parametric and sieve paramator $(g,{\boldsymbol \gamma})=g$, and parameter space be ${\cal G}$. 
%$g$ is a functional parameter shown in sieve estimator, and ${\boldsymbol \gamma}$ is a vector parameter shown in parametric estimator.
%\begin{align}
  %\mathbb{E}[\partial_x Y_{x,{\boldsymbol w}}] =  \sum\nolimits_{j=1}^J\beta_{j} \psi_j(x,{\boldsymbol w})+g_0(x,{\boldsymbol w}) =  \sum\nolimits_{j=1}^J\beta_{j} \psi_j(x,{\boldsymbol w})+\sum\nolimits_{k=1}^{\infty}\gamma_{k} \sigma_k(x,{\boldsymbol w}),
  %\mathbb{E}[\partial_x Y_{x}|{\boldsymbol W}={\boldsymbol w}] = g_0 (x,{\boldsymbol w})+\sum_{k=1}^K\gamma_k\theta_k(x,{\boldsymbol w})=\sum_{j=1}^{\infty}\beta_{j} \phi_j(x,{\boldsymbol w})+\sum_{k=1}^K\gamma_k\theta_k(x,{\boldsymbol w}),
%\end{align}
%$g_0 \in {\cal G}$ is the true value of parameters.


{\bf Conditional Moment Restrictions.}
The estimation problem reduces to a problem with conditional moment restrictions, %\jin{Confusing writing. Do you mean "We will use the conditional moment restrictions method to solve the estimation problem"? }
for which the properties of the estimators are well studied \citep{Whitney2003,Ai2003} and which is widely used in machine learning \citep{Kato2022}.
Since $\mathbb{E}[Y|Z=z_0]-\mathbb{E}[Y|Z]=\mathbb{E}[\mathbb{E}[Y|Z=z_0]-Y|Z]$ and 
$\mathbb{E}[\mathbbm{1}_{X\leq x,{\boldsymbol W}={\boldsymbol w}}|Z=z]-\mathbb{E}[\mathbbm{1}_{X\leq x,{\boldsymbol W}={\boldsymbol w}}|Z=z_0]=\mathbb{E}[\mathbbm{1}_{X\leq x,{\boldsymbol W}={\boldsymbol w}}-\mathbb{E}[\mathbbm{1}_{X\leq x,{\boldsymbol W}={\boldsymbol w}}|Z=z_0]|Z=z]$
, Theorem 3.1 reduces to
\begin{eqnarray}
\mathbb{E}\Big[(Y_{X_{z_0}}-Y)-\mathfrak{g}(X,X_{z_0},{\boldsymbol W},g)\Big|Z=z\Big]=0
\end{eqnarray}
where $\displaystyle \mathfrak{g}(X,X_{z_0},{\boldsymbol W},g)=\int_{\Omega_{\boldsymbol W}}\int_{\Omega_X}\{\mathbbm{1}_{X\leq x,{\boldsymbol W}={\boldsymbol w}}-\mathbbm{1}_{X_{z_0}\leq x,{\boldsymbol W}={\boldsymbol w}}\}g(x,{\boldsymbol w})dxd{\boldsymbol w}$.
%\jin{It's $g(... X_{z0},...)$ in (37)?} 
%where $\mathfrak{g}(X,X_{z_0},{\boldsymbol W},g)=\int_{\Omega_{\boldsymbol W}}\int_{\Omega_{X,{\boldsymbol W}}}\{\mathbbm{1}_{X\leq x,{\boldsymbol W}={\boldsymbol w}}-\mathbbm{1}_{X_{z_0}\leq x,{\boldsymbol W}={\boldsymbol w}}\}g(x,{\boldsymbol w})dxd{\boldsymbol w}$ for $z \in \Omega_{Z}$ and $m=1,\ldots,M$. 
We denote the residual function by $\rho(Y,Y_{X_{z_0}},X,X_{z_0},{\boldsymbol W},g)=(Y_{X_{z_0}}-Y)-\mathfrak{g}(X,X_{z_0},{\boldsymbol W},g)$.
%${\rho}(Y,Y_{X_{z_0}},X,X_{z_0},{\boldsymbol W},g)=(\rho^1(Y,Y_{X_{z_0}},X,X_{z_0},{\boldsymbol W},g),\ldots,\rho^M(Y,Y_{X_{z_0}},X,X_{z_0},{\boldsymbol W},g))^T$ and $\mathfrak{g}(X,X_{z_0},{\boldsymbol W},g)=(\mathfrak{g}^1(X,X_{z_0},{\boldsymbol W},g),\ldots,\mathfrak{g}^M(X,X_{z_0},{\boldsymbol W},g))^T$.
Then, the integral equation can be represented by $\mathbb{E}[{\rho}(Y,Y_{X_{z_0}},X,X_{z_0},{\boldsymbol W},g)|Z]=0$. 
%In sieve NTSLS, the residuals is defined as $\rho(Y,X,{\boldsymbol W},g)=Y-g(X,{\boldsymbol W})$.




{\bf Consistency of Sieve CAPCE Estimator.} %\jin{Delete Parametric here?} 
First, we show consistency without compactness restriction. {The Sieve} CAPCE estimator reduces to the general form of the conditional moment restrictions method, which is well-studied in \citep{Whitney2003}, as below:
\begin{eqnarray}
    \hat{g}=\arg\min_{g \in {\cal G}}\sum_{i=1}^N\frac{1}{N}\hat{\rho}(z_i,g)^2,
\end{eqnarray}
where $\hat{\rho}(z_i,g)=\hat{c}_i-\hat{\boldsymbol d}_i{\boldsymbol \beta}$, and $\hat{\rho}(z_i,g)$ can be considered as the estimators of ${\mathbb{E}}[{\rho}(Y,Y_{X_{z_0}},X,X_{z_0},{\boldsymbol W},g)|Z=z_i]$. 



\subsection*{Assumptions}
We make the following consistency assumptions introduced in \citep{Whitney2003}. % (All proof is in this paper).
We denote ${\cal G}_S=\{g \in{\cal G}: \|\mathfrak{g}_0(x,{\boldsymbol w})\|_{\tilde{W}^{l,2}}^2 \leq B_{S}\}$, and $\overline{{\cal G}_S}$ is the closure of ${\cal G}_S$.
\begin{assumption}[Uniqueness of $g$]
\label{A1}
    $g_0 \in {{\cal G}_S}$ is the only $g \in {{\cal G}_S}$ satisfying ${\mathbb{E}}[{\rho}(Y,Y_{X_{z_0}},X,X_{z_0},{\boldsymbol W},g)|Z=z]={0}$.
\end{assumption}
\begin{assumption}[Completeness of {\bf Stage 1.}]
\label{A2}
Taking limits $P\rightarrow \infty$, $N \rightarrow \infty$ with $P/N\rightarrow 0$,
     there {exists} ${\boldsymbol \pi}_P$ with $\mathbb{E}[\{b(z)-{\boldsymbol q}(z)^T{\boldsymbol \pi}_P\}^2]\rightarrow 0$
     for any $b(z)$ with $\mathbb{E}[b(z)^2]<\infty$. 
    %Also $\hat{\boldsymbol A}\xrightarrow{p} {\boldsymbol A}$, and ${\boldsymbol A}$ is positive definite and constant.
\end{assumption}
The above assumption
%\jin{Unclear what "this" refers to, use "The above assumption" or Assumption B.2"} 
is for the completeness of parameter space used in Stage 1.
\begin{assumption}[Boundedness of ${\rho}$]
\label{A3}
    $\mathbb{E}[\|{\rho}(Y,Y_{X_{z_0}},X,X_{z_0},{\boldsymbol W},g)\|^2|Z]$ is bounded and there exists $M(Y,Y_{X_{z_0}},X,X_{z_0},{\boldsymbol W})$, $\nu>0$ such that for all $\tilde{g},g \in \overline{{\cal G}_S}$, $\|{\rho}(Y,Y_{X_{z_0}},X,X_{z_0},{\boldsymbol W},\tilde{g})-{\rho}(Y,Y_{X_{z_0}},X,X_{z_0},{\boldsymbol W},g)\|\leq M(Y,Y_{X_{z_0}},X,X_{z_0},{\boldsymbol W})\|\tilde{g}- g\|_{W^{l,2}}^{\nu}$ and $\mathbb{E}[M(Y,Y_{X_{z_0}},X,X_{z_0},{\boldsymbol W})^2|Z]$ is bounded.
\end{assumption}
The above assumption is for the boundedness of the parameters used in Stage 2.
%\begin{assumption}[Compactness of $g$]
%\label{A4}
%    $g \in {\cal G}$ is compact for the norm $\|g\|_{W^{l,2}}$.
%\end{assumption}
%We denote a sub-space ${\cal G}_J=\{{\sum_{j=1}^J\beta_i\phi_j(x,{\boldsymbol w})},{\boldsymbol \gamma}\}$.
%\begin{assumption}[Completeness of ${\cal G}_J$]
%\label{A5}
%    For any $g \in {\cal G}$ there exists $g_J \in {\cal G}_J$ such that $\lim_{J \rightarrow \infty} \|g_J-g\|_{W^{l,2}}=0$.
%\end{assumption}

%\begin{theorem}
%    If Assumptions \ref{AS2}, \ref{AS1}, \ref{AS3}, \ref{A1}, \ref{A2}, \ref{A3}, \ref{A4} and \ref{A5} and $J \rightarrow \infty$, then $\|\hat{g}-g\|_{W^{l,2}}\xrightarrow{p} 0$.
%\end{theorem}

%We show that our Assumptions are weaker than their assumptions.
%\citep{Whitney2003} assume the separability $f_Y(X,{\boldsymbol W},{\boldsymbol U})=f_{Y_1}(X,{\boldsymbol W},{\boldsymbol \epsilon_Y})+f_{Y_2}({\boldsymbol U},{\boldsymbol \epsilon_Y})$, and let $\mathbb{E}[Y_{x,{\boldsymbol w}}]=f_{Y_1}(X,{\boldsymbol W};g)$. 
%We only assume the model $\mathbb{E}[\partial_x Y_{x,{\boldsymbol w}}]=g(X,{\boldsymbol W};g)$.

%Next, we show the consistency under compactness restriction. 
Let ${\cal W}$ denote the domain of $\mathfrak{g}(x,{\boldsymbol w},g)$.
\begin{assumption}[Openness and Convexness of Restricted Parameter Space]
\label{A6}
    %$g_B$ such that ${\boldsymbol \beta} \in \{{\boldsymbol \beta}^T{\boldsymbol \beta}\leq B_{\beta}\}$ and $g({\boldsymbol \gamma})\in \{ \sum_{m=1}^M\|\mathfrak{g}_0(x,{\boldsymbol w})\|_{\tilde{W}^{l,2}}^2 \leq B_{\gamma}\}$, 
    ${\cal W}$ is open and convex.
\end{assumption}

%\begin{theorem}[Consistency]
%    If Assumption \ref{AS2}, \ref{AS1}, \ref{B1}, \ref{AS3}, \ref{A1}, \ref{A2}, \ref{A3} are satisfied for $g \in \overline{{\cal G}_S}$, Assumption \ref{A6} is satisfied, and $J\rightarrow \infty$, then $\|\hat{g}-g\|_{W^{l,\infty}}\xrightarrow{p} 0$.
%\end{theorem}
%This theorem implies the uniform convergence of $\hat{g}_0$.
%$g=(B,\overline{\cal G})$ is satisfies if $B_{\beta}$ and $B_{\gamma}$ is large enough, or $\alpha$ is small enough.
%As mentioned in \citep{Newey2013}, ``{\it the bigger $\alpha$ is a the more weight the penalty has and so the less the variance and the larger the bias, with a $\alpha$ shrinking to zero as the sample size grows to ensure consistency."}
%\subsection*{Proof of Theorem 4.1.}

The following lemma is shown in \citep{Whitney2003}: 
\begin{lemma}
\label{COMLEM1}
If (i) $\Theta$ is a compact subset of a space with norm $\|\theta\|$; (ii) $\hat{Q}(\theta) \rightarrow_p Q(\theta)$ for all $\theta \in \Theta$; (iii) there is $v >0$ and $B_n = O_p(1)$ such that for all $\tilde{\theta}, \theta \in \Theta$, $|\hat{Q}(\tilde{\theta})-\hat{Q}(\theta)| \leq B_n \|\tilde{\theta}-\theta\|^v$, then $Q(\theta)$ is continuous and $\sup_{\theta \in \Theta} |\hat{Q}(\theta)-Q(\theta)| \rightarrow_p 0$.

\end{lemma}


{\it
{\bf Theorem \ref{STHEO1}.}
Under SCM ${\cal M}_{IV}$ and Assumptions %3.1, 3.2, 4.1, 4.2, 4.3,
\ref{AS1}, \ref{AS2},  \ref{B1}, \ref{AS3}, \ref{COM}, \ref{A1}, \ref{A2}, \ref{A3}, and \ref{A6}, 
%    B.1, B.2, B.3, and B.4,
    letting $P \rightarrow \infty$ and $J\rightarrow \infty$, then $\|\hat{g}-g_0\|_{W^{l,\infty}}\xrightarrow{p} 0$.
}

\begin{proof}




From Assumptions~\ref{A2} and~\ref{A6}, the parameter space is a compact subset.
From Assumption~\ref{A3}, the following relation is satisfied:
\begin{eqnarray}
    &&|{\rho}(Y,Y_{X_{z_0}},X,X_{z_0},{\boldsymbol W},\tilde{g})-{\rho}(Y,Y_{X_{z_0}},X,X_{z_0},{\boldsymbol W},g)|\\
    &&\leq M(Y,Y_{X_{z_0}},X,X_{z_0},{\boldsymbol W})\|\tilde{g}- g\|_{W^{l,2}}^{\nu}
\end{eqnarray}
From Lemma~\ref{COMLEM1},
\begin{eqnarray}
\|\tilde{g}- g\|_{W^{l,\infty}}  \rightarrow_p 0.
\end{eqnarray}
From Assumption~\ref{A1}, the limit of $\tilde{g}$ is $g_0$.
\end{proof}
From the definition of ${W^{l,\infty}}$, this theorem means uniform convergence.


\section{Rate of Convergence of Sieve CAPCE Estimator}
%\citep{Ai2003} introduce theorem for the rate of convergence.
\label{appC}
\subsection*{Notations}

In this section, we explain the notations used in the assumptions for Theorems~\ref{STHEO2} and~\ref{PTHEO2}.
%We introduce more detailed notations for rate of convergence.
Denote the estimation problem 
\begin{eqnarray}
    \inf_{g \in {\cal G}}\mathbb{E}\left[\mathfrak{g}(X,X_{z_0},{\boldsymbol W},g)^2\right]
\end{eqnarray}
and  introduce norm $\|\cdot\|_A$ as below:
\begin{eqnarray}
    \|g_1-g_2\|_A=\sqrt{\mathbb{E}\left[\left(\frac{d\mathfrak{g}(X,X_{z_0},{\boldsymbol W},g_0)}{dg}[g_1-g_2]\right)^2\right]}
\end{eqnarray}
where
\begin{eqnarray}
    \frac{d{\rho}(Z,g_0)}{dg}[g-g_0]=\frac{d{\rho}(Z,(1-\tau)g_0+\tau g)}{d\tau}\Big|_{\tau=0} \text{ a.s. } Z
\end{eqnarray}
\begin{eqnarray}
    \frac{d{\rho}(Z,g_0)}{dg}[g_1-g_2]=\frac{d{\rho}(Z,g_0)}{dg}[g_1-g_0]-\frac{d{\rho}(Z,g_0)}{dg}[g_2-g_0]
\end{eqnarray}
\begin{eqnarray}
    \frac{d\mathfrak{g}(X,X_{z_0},{\boldsymbol W},g_0)}{dg}[g_1-g_2]=\mathbb{E}\left[\frac{d{\rho}(Z,g_0)}{dg}[g_1-g_2] \Big|\{X,X_{z_0},{\boldsymbol W}\} \right].
\end{eqnarray}
These derivatives are called ``pathwise derivatives." See \citep{Ai2003} for details.

To evaluate the rate of convergence, we let the numbers of basis functions depend on the sample size and denote them by $J_N$ and $P_N$.
Note that $N \rightarrow \infty$ implies $J_N \rightarrow \infty$ and $P_N\rightarrow \infty$.
In Stage 1, we use more basis functions, ${\boldsymbol q}^{P_N}=(q^1,\ldots,q^{P_N})$, as the sample size grows.

\subsection*{Assumptions}
We make the following assumptions. 
\begin{assumption}[Compactness of Domain]
\label{RA1}
$\Omega_{(X,X_{z_0},{\boldsymbol W})}$ is compact with nonempty interior.
\end{assumption}
\begin{assumption}[Order of Convergence of Stage 1]
\label{RA2}
For any $h \in {\cal G}_S$ with $\kappa>(1+d)/2$, there exists ${\boldsymbol q}^{P_N}(X,X_{z_0},{\boldsymbol W})^T{\boldsymbol \pi}_{P_N} \in {\cal G}_S$, where ${\boldsymbol \pi}_{P_N}$ is a $P_N$-dimensional vector, such that $\sup_{(X,X_{z_0},{\boldsymbol W}) \in \Omega_{(X,X_{z_0},{\boldsymbol W})}}|h(X,X_{z_0},{\boldsymbol W})-{\boldsymbol q}^{P_N}(X,X_{z_0},{\boldsymbol W})^T{\boldsymbol \pi}_{P_N}|={\cal O}(P_N^{-\kappa/(1+d)})$ and $P_N^{-\kappa/(1+d)}={o}(N^{-1/4})$.
\end{assumption}
The above assumption guarantees the order of convergence of regression (basis functions) used in Stage 1.
%\begin{assumption}[Oder of Weights]
%\label{RA3}
%$\hat{\Sigma}(X,X_{z_0},{\boldsymbol W})={\Sigma}(X,X_{z_0},{\boldsymbol W})+o_p({N^{-1/4}})$ uniformly over $(X,X_{z_0},{\boldsymbol W})\in \Omega_{(X,X_{z_0},{\boldsymbol W})}$.
%\end{assumption}
\begin{assumption}[Order of Convergence of Stage 2]
\label{RA4}
There is a constant $\mu_1>0$ such that for any $g \in {\cal G}$, there is $\Pi g \in {\cal G}$ satisfying $\|\Pi g-g\|={\cal O}(J_N^{-\kappa/(1+d)})$ and $J_N^{-\kappa/(1+d)}={o}(N^{-1/4})$. Here, $\Pi$ denotes the projection onto ${\cal G}$.
\end{assumption}
The above assumption guarantees the order of convergence of regression (basis functions) used in Stage 2.
\begin{assumption}[Envelope condition]
\label{RA5}
Each element of ${\rho}(Z,g)$ satisfies the envelope condition in $g \in {\cal G}$;  and, each element of ${\rho}(Z,g) \in {\cal G}_S$ with $\kappa>(1+d)/2$.
\end{assumption}
The envelope condition is shown in \citep{Milgrom2002}. 
%\jin{"Envelope" or "Envelop"?}


Denote $\xi_N=\sup_{(X,X_{z_0},{\boldsymbol W})}\|{\boldsymbol q}^{P_N}(X,X_{z_0},{\boldsymbol W})\|$.
\begin{assumption}[Condition of $J_N$]
\label{RA6}
$J_N\times \ln(N)\times \xi_N\times N^{-1/2}=o(1)$.
\end{assumption}
We denote by $N(\epsilon^{1/k},{\cal G},\|\cdot\|_{W^{l,2}})$ the minimal number of balls of radius $\epsilon^{1/k}$ (with respect to the norm $\|\cdot\|_{W^{l,2}}$) needed to cover ${\cal G}$.
\begin{assumption}[Condition of $J_N$]
\label{RA7}
$\ln[N(\epsilon^{1/k},{\cal G},\|\cdot\|_{W^{l,2}})] \leq \text{const.} \times J_N \times \ln(J_N/\epsilon)$.
\end{assumption}
These assumptions specify how the complexity of the models may grow with the sample size.


\begin{assumption}[Convexness of Parameter Space]
\label{RA8}
${\cal G}$ is convex in $g$, and ${\rho}(Z,g)$ is pathwise differentiable at $g$; and, for some $c_1,c_2>0$,
\begin{eqnarray}
    c_1\mathbb{E}[\hat{\rho}(Z,g)^2]\leq \|\hat{g}-g\|^2 \leq c_2\mathbb{E}[\hat{\rho}(Z,g)^2]
\end{eqnarray}
holds for all $\hat{g} \in {\cal G}$ with $\|\hat{g}-g\|_{W^{l,2}}^2=o(1)$.
\end{assumption}

%\begin{theorem}[Rate of Convergence]
%    Give SCM ${\cal M}_{IV}$ and Assumption \ref{AS2}, \ref{AS1}, \ref{B1}, \ref{AS3}, \ref{RA1}, \ref{RA2}, \ref{RA3}, \ref{RA4}, \ref{RA5}, \ref{RA6}, \ref{RA7}, and \ref{RA8}, then $\|\hat{g}-{g}_0\|_{W^{l,2}}={o}_p(N^{-1/4})$.
%\end{theorem}

%\subsection*{Proof of Theorem 4.2.}

\begin{comment}
\begin{lemma}
(A) Suppose Assumptions \ref{RA1} and \ref{RA2}, and the following are satisfied:\\
(i) there exists a constant $c_{1n}$ and a measurable function $c_1(Z): \Omega_Z \rightarrow [0,\infty)$ with $\mathbb{E}[c_1(Z)^p]<\infty$ for some $p \geq 4$ such that $|g(Z,\alpha)|\leq c_{1n}c_1(Z)$ for all $\alpha\in A_n$ and $Z \in \Omega_Z$.\\
(ii) there exists a constant $\kappa \in (0,1]$ and a measurable function $c_2(Z): \Omega_Z \rightarrow [0,\infty)$ with $\mathbb{E}[c_2(Z)^2] < \infty$ such that $|g(Z,\alpha_1)-g(Z,\alpha_2)|\leq c_2(Z) \|\alpha_1-\alpha_2\|^{\kappa}_s$ holds for all $Z \in \Omega_Z$ and $\alpha_1, \alpha_2 \in A_n$.\\
(iii) there exists a positive value $\delta_{1n}=o(1)$ such that
\begin{eqnarray}
    \frac{n\delta_{1n}^2}{ln[(\frac{\xi_{1n}c_{1n}}{\delta_{1n}})^{d_x}N((\frac{\delta_{1n}}{\xi_{0n}})^{1/\kappa},A_n,\|\cdot\|_s)]\times \max\{\xi_{0n}^2,c_{1n}^2,\xi_{on}^{2+2/p},\delta_{1n}^{1-2/p},c_{1n}^{1+2/p}\}}\rightarrow + \infty
\end{eqnarray}
Then $p^{k_n}(X)^T(P^TP)^{-1}P^T\epsilon(\alpha)=o_p(\delta_{1n})$ uniformly over $(X,\alpha) \in \Omega_{X,{\boldsymbol W}} \times A_n$.\\


(B) Suppose Assumptions \ref{RA1} and \ref{RA2}, and the following are satisfied:\\
(i) there exists a positive value $\delta_{2n}=o(1)$ and coefficients $\pi(\alpha)$ such that $\mathbb{E}[g(Z,\alpha)|X]=p^{k_n}(X)^T\pi(\alpha)$ holds for all $X \in \Omega_{X,{\boldsymbol W}}$ and $\alpha \in A_n$.\\
Then $(1/n)\sum_{i=1}^n (p^{k_n}(X)^T(P^TP)^{-1}P^T\sum_{j=1}^np^{k_n}(X_j)\mathbb{E}[g(Z,\alpha)|X_j]-\mathbb{E}[g(Z,\alpha)|X_i])^2=o_p(\delta_{2n}^2)$ uniformly over $\alpha \in A_n$.\\

(C) Suppose Assumptions \ref{RA1} and \ref{RA2}, and the following are satisfied:\\
(v) For fixed $\overline{\alpha_n} \in \overline{A}$, and $\mathbb{E}[\epsilon(Z,\overline{\alpha_n})]\leq const$ for all $n \leq 1$ and $X \in \Omega_{X,{\boldsymbol W}}$.\\
Then $(1/n)\sum_{i=1}^n\{p^{k_n}(X)^T(P^TP)^{-1}P^T\sum_{j=1}^np^{k_n}(X_j)\epsilon(Z,\overline{\alpha_n})\}^2=O_p(k_n/n)$.
\end{lemma}
\end{comment}

The following lemma holds \citep{Ai2003}: %\jin{the lemma needs a proof or citation?}
\begin{lemma}
\label{LEM2}
Under Assumptions \ref{RA1}, \ref{RA2}, \ref{RA4}, \ref{RA5}, \ref{RA6}, \ref{RA7}, and \ref{RA8},
(i) $\hat{L}_N(g)-L_N(g)=o_p(N^{-1/4})$ uniformly over $g \in {\cal G}$; and (ii) $\hat{L}_N(g)-\hat{L}_N(g_0)-\{L_N(g)-L_N(g_0)\}=o_p(\tau_N N^{-1/4})$ uniformly over $g \in {\cal G}$ with $\|g-g_0\|\leq o(\tau_N)$, where $\tau_N=N^{-\tau}$ with $\tau\leq 1/4$.
\end{lemma}


{\it
{\bf Theorem \ref{STHEO2}.}
Under SCM ${\cal M}_{IV}$ and Assumptions %3.1, 3.2, 4.1, 4.2, 4.3,
\ref{AS1}, \ref{AS2},  \ref{B1}, \ref{AS3}, \ref{COM}, 
\ref{RA1}, \ref{RA2}, \ref{RA4}, \ref{RA5}, \ref{RA6}, \ref{RA7}, and \ref{RA8},
    %C.1, C.2, C.3, C.4, C.5, C.6, and C.7, 
    setting $N=N_1=N_2$, then $\|\hat{g}-{g}_0\|_{A}={o}_p(N^{-1/4})$.
}
\begin{proof}
Let 
\begin{eqnarray}
    \hat{L}_{N}(g)=-\frac{1}{2N}\hat{\mathfrak{g}}(X,X_{z_0},{\boldsymbol W},g)^2,\ \ \  {L}_{N}(g)=-\frac{1}{2N}{\mathfrak{g}}(X,X_{z_0},{\boldsymbol W},g)^2.
\end{eqnarray}
%then the problem reduces to the problem shown in \citep{Ai2003}. \jin{ don't refer to some problem in another place, }

Then, Lemma \ref{LEM2} implies 
\begin{eqnarray}
    \hat{L}_{N}(g)-\hat{L}_{N}(g_0)-\{{L}_{N}(g)-{L}_{N}(g_0)\}=o_p(N^{-1/4})
\end{eqnarray}
and this proves
\begin{eqnarray}
    \|\hat{g}-g_0\|_{A}=o_p(N^{-1/4}).
\end{eqnarray}
\end{proof}

\section{Consistency of Parametric CAPCE Estimator}
\label{appD}

In this section, we show the consistency property of parametric CAPCE estimator.
We let the function space ${\cal G}$ be $\{g \in {\cal G}: g(x,{\boldsymbol w})=\sum_{k=1}^K \gamma_k \theta_k(x,{\boldsymbol w})\}$.



{\bf Consistency of Parametric CAPCE Estimator.} %\jin{Delete Sieve} 
First, we show consistency without compactness restriction. {The Parametric} CAPCE estimator reduces to the general form of the conditional moment restrictions method, which is well-studied in \citep{Whitney2003}, as below:
\begin{eqnarray}
    \hat{{\boldsymbol \gamma}}=\arg\min_{{\boldsymbol \gamma}}\sum_{i=1}^N\frac{1}{N}\hat{\rho}(z_i,{\boldsymbol \gamma})^2,
\end{eqnarray}
where $\hat{\rho}(z_i,{\boldsymbol \gamma})=\hat{c}_i-\hat{\boldsymbol e}_i{\boldsymbol \gamma}$.
$\hat{\rho}(z_i,{\boldsymbol \gamma})$ can be considered as the estimators of ${\mathbb{E}}[{\rho}(Y,Y_{X_{z_0}},X,X_{z_0},{\boldsymbol W},{\boldsymbol \gamma})|Z=z_i]$. 



\subsection*{Assumptions}
We make the following  assumptions introduced in \citep{Whitney2003}. % (All proof is in this paper).
We denote ${\cal G}_P=\{{\boldsymbol \gamma}^T{\boldsymbol \gamma} \leq B_{P}\}$, and $\overline{{\cal G}_P}$ is the closure of ${\cal G}_P$.
\begin{assumption}[Uniqueness of $g$]
\label{PA1}
    ${\boldsymbol \gamma}_0 \in {{\cal G}_P}$ is the only ${\boldsymbol \gamma} \in {{\cal G}_P}$ satisfying ${\mathbb{E}}[{\rho}(Y,Y_{X_{z_0}},X,X_{z_0},{\boldsymbol W},{\boldsymbol \gamma})|Z=z]={\boldsymbol 0}$.
\end{assumption}
\begin{assumption}[Completeness of ${\boldsymbol q}$]
\label{PA2}
Taking limits $P\rightarrow \infty$, $N \rightarrow \infty$ with $P/N\rightarrow 0$,
     there exists ${\boldsymbol \pi}_P$ with $\mathbb{E}[\{b(z)-{\boldsymbol q}(z)^T{\boldsymbol \pi}_P\}^2]\rightarrow 0$
     for any $b(z)$ with $\mathbb{E}[b(z)^2]<\infty$. %Also $\hat{\boldsymbol A}\xrightarrow{p} {\boldsymbol A}$, and ${\boldsymbol A}$ is positive definite and constant.
\end{assumption}
\begin{assumption}[Boundedness of ${\rho}$]
\label{PA3}
    $\mathbb{E}[\|{\rho}(Y,Y_{X_{z_0}},X,X_{z_0},{\boldsymbol W},{\boldsymbol \gamma})\|^2|Z]$ is bounded and there exists $M(Y,Y_{X_{z_0}},X,X_{z_0},{\boldsymbol W})$, $\nu>0$ such that for all $\tilde{{\boldsymbol \gamma}},{\boldsymbol \gamma} \in \overline{{\cal G}_P}$, $\|{\rho}(Y,Y_{X_{z_0}},X,X_{z_0},{\boldsymbol W},\tilde{{\boldsymbol \gamma}})-{\rho}(Y,Y_{X_{z_0}},X,X_{z_0},{\boldsymbol W},{\boldsymbol \gamma})\|\leq M(Y,Y_{X_{z_0}},X,X_{z_0},{\boldsymbol W})\|\tilde{{\boldsymbol \gamma}}- {\boldsymbol \gamma}\|^{\nu}$ and $\mathbb{E}[M(Y,Y_{X_{z_0}},X,X_{z_0},{\boldsymbol W})^2|Z]$ is bounded.
\end{assumption}
%\begin{assumption}[Compactness of $g$]
%\label{A4}
%    $g \in {\cal G}$ is compact for the norm $\|g\|_{W^{l,2}}$.
%\end{assumption}
%We denote a sub-space ${\cal G}_J=\{{\sum_{j=1}^J\beta_i\phi_j(x,{\boldsymbol w})},{\boldsymbol \gamma}\}$.
%\begin{assumption}[Completeness of ${\cal G}_J$]
%\label{A5}
%    For any $g \in {\cal G}$ there exists $g_J \in {\cal G}_J$ such that $\lim_{J \rightarrow \infty} \|g_J-g\|_{W^{l,2}}=0$.
%\end{assumption}

%\begin{theorem}
%    If Assumptions \ref{AS2}, \ref{AS1}, \ref{AS3}, \ref{A1}, \ref{A2}, \ref{A3}, \ref{A4} and \ref{A5} and $J \rightarrow \infty$, then $\|\hat{g}-g\|_{W^{l,2}}\xrightarrow{p} 0$.
%\end{theorem}

%We show that our Assumptions are weaker than their assumptions.
%\citep{Whitney2003} assume the separability $f_Y(X,{\boldsymbol W},{\boldsymbol U})=f_{Y_1}(X,{\boldsymbol W},{\boldsymbol \epsilon_Y})+f_{Y_2}({\boldsymbol U},{\boldsymbol \epsilon_Y})$, and let $\mathbb{E}[Y_{x,{\boldsymbol w}}]=f_{Y_1}(X,{\boldsymbol W};g)$. 
%We only assume the model $\mathbb{E}[\partial_x Y_{x,{\boldsymbol w}}]=g(X,{\boldsymbol W};g)$.

%Next, we show the consistency under compactness restriction. 
Let ${\cal W}$ denote the domain of $\mathfrak{g}(x,{\boldsymbol w},{\boldsymbol \gamma})$.
\begin{assumption}[Openness and Convexness of Restricted Parameter Space]
\label{PA6}
    %$g_B$ such that ${\boldsymbol \beta} \in \{{\boldsymbol \beta}^T{\boldsymbol \beta}\leq B_{\beta}\}$ and $g({\boldsymbol \gamma})\in \{ \sum_{m=1}^M\|\mathfrak{g}_0(x,{\boldsymbol w})\|_{\tilde{W}^{l,2}}^2 \leq B_{\gamma}\}$, 
    ${\cal W}$ is open and convex.
\end{assumption}

%\begin{theorem}[Consistency]
%    If Assumption \ref{AS2}, \ref{AS1}, \ref{B1}, \ref{AS3}, \ref{A1}, \ref{A2}, \ref{A3} are satisfied for $g \in \overline{{\cal G}_S}$, Assumption \ref{A6} is satisfied, and $J\rightarrow \infty$, then $\|\hat{g}-g\|_{W^{l,\infty}}\xrightarrow{p} 0$.
%\end{theorem}
%This theorem implies the uniform convergence of $\hat{g}_0$.
%$g=(B,\overline{\cal G})$ is satisfies if $B_{\beta}$ and $B_{\gamma}$ is large enough, or $\alpha$ is small enough.
%As mentioned in \citep{Newey2013}, ``{\it the bigger $\alpha$ is a the more weight the penalty has and so the less the variance and the larger the bias, with a $\alpha$ shrinking to zero as the sample size grows to ensure consistency."}

%\subsection*{Proof of Theorem 4.3.}

{\it
{\bf Theorem \ref{PTHEO1}.}
    Under SCM ${\cal M}_{IV}$ and Assumptions %3.1, 3.2, 4.4,
    \ref{AS1}, \ref{AS2},  \ref{COM2},
    \ref{PA1}, \ref{PA2}, \ref{PA3}, and \ref{PA6},
    %D.1, D.2, D.3, and D.4,
    letting $P \rightarrow \infty$, then $\|\hat{\boldsymbol \gamma}-{\boldsymbol \gamma}_0\|\xrightarrow{p} 0$.
}

\begin{proof}

From Assumptions~\ref{PA2} and~\ref{PA6}, the parameter space is a compact subset.
From Assumption~\ref{PA3}, the following relation is satisfied:
\begin{eqnarray}
    &&|{\rho}(Y,Y_{X_{z_0}},X,X_{z_0},{\boldsymbol W},\tilde{{\boldsymbol \gamma}})-{\rho}(Y,Y_{X_{z_0}},X,X_{z_0},{\boldsymbol W},{\boldsymbol \gamma})|\\
    &&\leq M(Y,Y_{X_{z_0}},X,X_{z_0},{\boldsymbol W})\|\tilde{{\boldsymbol \gamma}}- {\boldsymbol \gamma}\|^{\nu}
\end{eqnarray}
From Lemma \ref{COMLEM1},
\begin{eqnarray}
\|\tilde{{\boldsymbol \gamma}}- {\boldsymbol \gamma}\|  \rightarrow_p 0.
\end{eqnarray}
From Assumption~\ref{PA1}, the limit is ${\boldsymbol \gamma}_0$.
\end{proof}


\section{Rate of Convergence of Parametric CAPCE Estimator}
%\citep{Ai2003} introduce theorem for the rate of convergence.
\label{appE}
\begin{comment}
\subsection*{Notations}
\jin{Can this Notations section be deleted? Is it repeating the corresponding section in Appendix C? Only introduce new needed notations.}

In this section, we explain the notations for the assumptions of Theorem 4.2 and 4.4.
We introduce more detailed notations for rate of convergence.
Denote the estimation problem 
\begin{eqnarray}
    \inf_{{\boldsymbol \gamma} \in {\cal G}}\mathbb{E}\left[\mathfrak{g}(X,X_{z_0},{\boldsymbol W},{\boldsymbol \gamma})^2\right]
\end{eqnarray}
and  introduce norm $\|\cdot\|_A$ as below:
\begin{eqnarray}
    \|{\boldsymbol \gamma}_1-{\boldsymbol \gamma}_0\|_A=\sqrt{\mathbb{E}\left[\left(\frac{d\mathfrak{g}(X,X_{z_0},{\boldsymbol W},{\boldsymbol \gamma}_0)}{d{\boldsymbol \gamma}}\right)^2\right]}
\end{eqnarray}
where
\begin{eqnarray}
    \frac{d{\rho}(Z,{\boldsymbol \gamma}_0)}{d{\boldsymbol \gamma}}[{\boldsymbol \gamma}-{\boldsymbol \gamma}_0]=\frac{d{\rho}(Z,(1-\tau){\boldsymbol \gamma}_0+\tau {\boldsymbol \gamma})}{d\tau} \text{ a.s. } Z
\end{eqnarray}
\begin{eqnarray}
    \frac{d{\rho}(Z,{\boldsymbol \gamma}_0)}{d{\boldsymbol \gamma}}[{\boldsymbol \gamma}_1-{\boldsymbol \gamma}_2]=\frac{d{\rho}(Z,{\boldsymbol \gamma}_0)}{d{\boldsymbol \gamma}}[{\boldsymbol \gamma}_1-{\boldsymbol \gamma}_0]-\frac{d{\rho}(Z,{\boldsymbol \gamma}_0)}{d{\boldsymbol \gamma}}[{\boldsymbol \gamma}_2-{\boldsymbol \gamma}_0]
\end{eqnarray}
\begin{eqnarray}
    \frac{d\mathfrak{g}(X,X_{z_0},{\boldsymbol W},{\boldsymbol \gamma}_0)}{d{\boldsymbol \gamma}}=\mathbb{E}\left[\frac{d{\rho}(Z,{\boldsymbol \gamma}_0)}{d{\boldsymbol \gamma}}[{\boldsymbol \gamma}_1-{\boldsymbol \gamma}_2] \Big|\{X,X_{z_0},{\boldsymbol W}\} \right].
\end{eqnarray}
These derivatives are called ``pathwise derivatives."

%To evaluate the rate of convergence, we denote the number of the basis function depends on sample size be $J_N$ and $P_N$. $N \rightarrow \infty$ implies $J_N \rightarrow \infty$ and $P_N\rightarrow \infty$.
\end{comment}

\subsection*{Assumptions}
We make the following assumptions. 
\begin{assumption}[Compactness of Domain]
\label{PRA1}
$\Omega_{(X,X_{z_0},{\boldsymbol W})}$ is compact with nonempty interior.
\end{assumption}
\begin{assumption}[Order of Convergence of Stage 1]
\label{PRA2}
For any $h \in {\cal G}_P$ with $\kappa>(1+d)/2$, there exists ${\boldsymbol q}^{P_N}(X,X_{z_0},{\boldsymbol W})^T{\boldsymbol \pi}_{P_N} \in {\cal G}_P$, where ${\boldsymbol \pi}_{P_N}$ is a $P_N$-dimensional vector, such that $\sup_{(X,X_{z_0},{\boldsymbol W}) \in \Omega_{(X,X_{z_0},{\boldsymbol W})}}|h(X,X_{z_0},{\boldsymbol W})-{\boldsymbol q}^{P_N}(X,X_{z_0},{\boldsymbol W})^T{\boldsymbol \pi}_{P_N}|={\cal O}(P_N^{-\kappa/(1+d)})$ and $P_N^{-\kappa/(1+d)}={o}(N^{-1/4})$.
\end{assumption}
\begin{assumption}[Order of Convergence of Stage 2]
\label{PRA4}
There is a constant $\mu_1>0$ such  that for any ${\boldsymbol \gamma} \in {\cal G}_P$, there is $\Pi {\boldsymbol \gamma} \in {\cal G}_P$ 
%\jin{What is ${\cal G}$?}
satisfying $\|\Pi {\boldsymbol \gamma}-{\boldsymbol \gamma}\|={\cal O}(1)$.% and $J_N^{-B/(1+d)}={o}(N^{-1/4})$.
\end{assumption}
\begin{assumption}[Envelope condition]
\label{PRA5}
Each element of ${\rho}(Z,{\boldsymbol \gamma})$ satisfies the envelope condition in ${\boldsymbol \gamma} \in {\cal G}_P$; and, each element of ${\rho}(Z,{\boldsymbol \gamma}) \in {\cal G}_P$ with $\kappa>(1+d)/2$, for all ${\boldsymbol \gamma} \in {\cal G}_P$.
\end{assumption}
The envelope condition is shown in \citep{Milgrom2002}. %\jin{${\cal G}_P$ in the above and following?} 
%Denote $\xi_N=\sup_{(X,X_{z_0},{\boldsymbol W})}\|{\boldsymbol q}^{P_N}(X,X_{z_0},{\boldsymbol W})\|$.
%\begin{assumption}[Condition of $J_N$]
%\label{PRA6}
%$J_N\times ln(N)\times \xi\times N^{-1/2}=o(1)$
%\end{assumption}
%We denote $N(\epsilon^{1/k},{\cal G},\|\cdot\|_{W^{l,2}})$ as the minimal number of radius $\delta$ covering ball of ${\cal G}$. 
%\begin{assumption}[Condition of $J_N$]
%\label{PRA7}
%$ln[N(\epsilon^{1/k},{\cal G},\|\cdot\|_{W^{l,2}})] \leq const. \times J_N \times ln(J_N/\epsilon)$
%\end{assumption}
\begin{assumption}[Convexness of Parameter Space]
\label{PRA8}
${\cal G}_P$ is convex in ${\boldsymbol \gamma}$, and ${\rho}(Z,{\boldsymbol \gamma})$ is pathwise differentiable at ${\boldsymbol \gamma}$; and, for some $c_1,c_2>0$,
\begin{eqnarray}
    c_1\mathbb{E}[\hat{\rho}(Z,{\boldsymbol \gamma})^2]\leq \|\hat{{\boldsymbol \gamma}}-{\boldsymbol \gamma}\|^2 \leq c_2\mathbb{E}[\hat{\rho}(Z,{\boldsymbol \gamma})^2]
\end{eqnarray}
holds for all $\hat{{\boldsymbol \gamma}} \in {\cal G}_P$ with $\|\hat{{\boldsymbol \gamma}}-{\boldsymbol \gamma}\|^2=o(1)$.
\end{assumption}

%\begin{theorem}[Rate of Convergence]
%    Give SCM ${\cal M}_{IV}$ and Assumption \ref{AS2}, \ref{AS1}, \ref{B1}, \ref{AS3}, \ref{RA1}, \ref{RA2}, \ref{RA3}, \ref{RA4}, \ref{RA5}, \ref{RA6}, \ref{RA7}, and \ref{RA8}, then $\|\hat{g}-{g}_0\|_{W^{l,2}}={o}_p(N^{-1/4})$.
%\end{theorem}

%\subsection*{Proof of Theorem 4.4.}


The following lemma holds \citep{Ai2003}: %\jin{Proof or citation}
\begin{lemma}
\label{LEM3}
Under Assumptions \ref{PRA1}, \ref{PRA2}, \ref{PRA4}, \ref{PRA5}, and \ref{PRA8},
(i) $\hat{L}_N({\boldsymbol \gamma})-L_N({\boldsymbol \gamma})=o_p(N^{-1/4})$ uniformly over ${\boldsymbol \gamma}$; and (ii) $\hat{L}_N({\boldsymbol \gamma})-\hat{L}_N({\boldsymbol \gamma}_0)-\{L_N({\boldsymbol \gamma})-L_N({\boldsymbol \gamma}_0)\}=o_p(\tau_N N^{-1/4})$ uniformly over ${\boldsymbol \gamma}$ with $\|{\boldsymbol \gamma}-{\boldsymbol \gamma}_0\|\leq o(\tau_N)$, where $\tau_N=N^{-\tau}$ with $\tau\leq 1/4$.
\end{lemma}


{\it
{\bf Theorem \ref{PTHEO2}.}
Under SCM ${\cal M}_{IV}$ and Assumptions %3.1, 3.2, 4.4,
\ref{AS1}, \ref{AS2}, \ref{COM2},
 \ref{PRA1}, \ref{PRA2}, \ref{PRA4}, \ref{PRA5}, and \ref{PRA8},
% E.1, E.2, E.3, E.4, and E.5,
 setting $N=N_1=N_2$, then $\|\hat{\boldsymbol \gamma}-{\boldsymbol \gamma}_0\|={o}_p(N^{-1/4})$.
}



\begin{proof}

Let 
\begin{eqnarray}
    \hat{L}_{N}({\boldsymbol \gamma})=-\frac{1}{2N}\hat{\mathfrak{g}}(X,X_{z_0},{\boldsymbol W},{\boldsymbol \gamma})^2, \ \ \ {L}_{N}({\boldsymbol \gamma})=-\frac{1}{2N}{\mathfrak{g}}(X,X_{z_0},{\boldsymbol W},{\boldsymbol \gamma})^2.
\end{eqnarray}
%then the problem reduces to the problem in \citep{Ai2003}. \jin{Revise this proof.}
Then, Lemma \ref{LEM3} implies %\jin{Lemma 3?}
\begin{eqnarray}
    \hat{L}_{N}({\boldsymbol \gamma})-\hat{L}_{N}({\boldsymbol \gamma}_0)-\{{L}_{N}({\boldsymbol \gamma})-{L}_{N}({\boldsymbol \gamma}_0)\}=o_p(N^{-1/4})
\end{eqnarray}
and this proves
\begin{eqnarray}
    \|\hat{{\boldsymbol \gamma}}-{\boldsymbol \gamma}_0\|=o_p(N^{-1/4}).
\end{eqnarray}

\end{proof}
