
\section{Properties of RKHS CAPCE Estimator}
\label{appF}
We show the consistency and rate of convergence of the RKHS CAPCE estimator, following \citep{Singh2019}, when $\lambda_3=0$.

\subsection*{Notations}
We use the integral operator notations from the kernel methods literature. ${\cal L}_2(\Omega_Z,\mathfrak{p}_{Z})$ denotes the space of square-integrable functions from $\Omega_{Z}$ to $\Omega_Y$ with respect to the measure $\mathfrak{p}_{Z}$.
\begin{definition}
The stage 1 operators are
\begin{eqnarray}
    &&S^*_1: {\cal H}_{Z} \rightarrow {\cal L}_2(\Omega_Z,\mathfrak{p}_{Z}), l \mapsto \left<l,\psi(\cdot)\right>_{{\cal H}_{\boldsymbol Z}}\\ 
    &&S_1: {\cal L}_2(\Omega_Z,\mathfrak{p}_{Z}) \rightarrow {\cal H}_{Z}, \tilde{l} \mapsto \int \psi(z) \tilde{l}(z) \mathfrak{p}_{Z}(z) dz
\end{eqnarray}
and $T_1=S_1 \circ S^*_1$ is the uncentered covariance operator.
The details of the theory of vector-valued RKHS are shown in \citep{Singh2019}.
\end{definition}

In addition, we denote
\begin{definition}
\begin{eqnarray}
{G_1}_{\rho}=\arg \min {\cal E}_1 ({G_1}), {\cal E}_1=\mathbb{E}[\pi(X,{\boldsymbol W})-{G_1}(\psi(Z))]_{{\cal H}_{X,{\boldsymbol W}}}^2,
\end{eqnarray}
\begin{eqnarray}
{G_1}_{\lambda}=\arg \min {\cal E}_1 ({G_1}), {\cal E}_1=\mathbb{E}[\pi(X,{\boldsymbol W})-{G_1}(\psi(Z))]_{{\cal H}_{X,{\boldsymbol W}}}^2+\lambda \|G_1\|_{{\cal L}_2({\cal H}_Z,{\cal H}_{X,{\boldsymbol W}})}^2,
\end{eqnarray}
\begin{eqnarray}
\hat{{G_1}}_{\lambda}=\arg \min {\cal E}_1 ({G_1}), {\cal E}_1=\hat{\mathbb{E}}[\pi(X,{\boldsymbol W})-{G_1}(\psi(Z))]_{{\cal H}_{X,{\boldsymbol W}}}^2+\lambda \|G_1\|_{{\cal L}_2({\cal H}_Z,{\cal H}_{X,{\boldsymbol W}})}^2,
\end{eqnarray}
\begin{eqnarray}
{G_2}_{\rho}=\arg \min {\cal E}_1 ({G_2}), {\cal E}_1=\mathbb{E}[Y-{G_2}(\psi(Z))]^2,
\end{eqnarray}
\begin{eqnarray}
{G_2}_{\lambda}=\arg \min {\cal E}_1 ({G_2}), {\cal E}_1=\mathbb{E}[Y-{G_2}(\psi(Z))]^2+\lambda \|G_2\|_{{\cal L}_2({\cal H}_Z,\Omega_Y)}^2,
\end{eqnarray}
\begin{eqnarray}
\hat{{G_2}}_{\lambda}=\arg \min {\cal E}_1 ({G_2}), {\cal E}_1=\hat{\mathbb{E}}[Y-{G_2}(\psi(Z))]^2+\lambda \|G_2\|_{{\cal L}_2({\cal H}_Z,\Omega_Y)}^2.
\end{eqnarray}

\end{definition}





\begin{definition}
The stage 2 operators are
\begin{eqnarray}
    &&S_2^*: {\cal L}_2({\cal H}_Z,{\cal H}_{X,{\boldsymbol W}}) \rightarrow {\cal L}_2({\cal H}_{X,{\boldsymbol W}},\mathfrak{p}_{{\cal H}_{X,{\boldsymbol W}}}),H \mapsto \Omega^*_{(\cdot)}H \\
    &&S_2: {\cal L}_2({\cal H}_{X,{\boldsymbol W}},\mathfrak{p}_{{\cal H}_{X,{\boldsymbol W}}}) \rightarrow {\cal L}_2({\cal H}_Z,{\cal H}_{X,{\boldsymbol W}}),\\
    &&\tilde{H} \mapsto \int \Omega_{\mu(z)-\mu(z_0)}\circ \tilde{H}\{\mu(z)-\mu(z_0)\} \mathfrak{p}_{{\cal H}_{X,{\boldsymbol W}}}(\mu(z))
\end{eqnarray}
 and $T_2=S_2 \circ S^*_2$ is the uncentered covariance operator.
\end{definition}


\begin{definition}
    
We denote
\begin{eqnarray}
H_{\rho}=\arg \min {\cal E} (H), {\cal E}(H)=\mathbb{E}[Y-\mu_2(z_0)-H(\mu(Z)-\mu(z_0))]_{{\cal H}_{X,{\boldsymbol W}}}^2,
\end{eqnarray}
\begin{eqnarray}
&&H_{\xi}=\arg \min {\cal E}_{\xi} (H),\\
&&{\cal E}_{\xi}(H)=\mathbb{E}[Y-\mu_2(z_0)-H(\mu(Z)-\mu(z_0))]_{{\cal H}_{X,{\boldsymbol W}}}^2+\xi \|H\|_{{\cal L}_2({\cal H}_{X,{\boldsymbol W}},\Omega_Y)}^2,
\end{eqnarray}
\begin{eqnarray}
&&\hat{H}_{\xi}=\arg \min \hat{\cal E}_{\xi} (H),\\
&&\hat{\cal E}_{\xi}(H)=\hat{\mathbb{E}}[Y-\mu_2(z_0)-H(\mu(Z)-\mu(z_0))]_{{\cal H}_{X,{\boldsymbol W}}}^2+\xi \|H\|_{{\cal L}_2({\cal H}_{X,{\boldsymbol W}},\Omega_Y)}^2.
\end{eqnarray}
\end{definition}


\subsection*{Assumptions}

Next, we state the assumptions for Theorem~\ref{RTEO1}.
\begin{assumption}[Restriction for the domains]
\label{RAS1}
Suppose that $\Omega_{X,{\boldsymbol W}}$ and $\Omega_Z$ are Polish spaces, i.e., separable and completely metrizable topological spaces.
\end{assumption}

\begin{assumption}[Restriction for the feature functions]
\label{RAS2}
Suppose that 
\begin{enumerate}
    \item $k_{X,{\boldsymbol W}}$ and $k_{\boldsymbol Z}$ are continuous and bounded: $\sup_{(x,{\boldsymbol w}) \in \Omega_{X,{\boldsymbol W}}} \|\pi(x,{\boldsymbol w})\|_{{\cal H}_{X,{\boldsymbol W}}} \leq Q$ and $\sup_{z \in \Omega_Z} \|\psi(z)\|_{{\cal H}_{\boldsymbol Z}} \leq \kappa$.
    \item $\pi$ and $\psi$ are measurable.
    \item $k_{X,{\boldsymbol W}}$ is characteristic.    
\end{enumerate}
\end{assumption}

\begin{assumption}[Uniqueness]
\label{RAS3}
Suppose that ${G_1}_{\rho} \in {\cal L}_2({\cal H}_{Z},{\cal H}_{X,{\boldsymbol W}})$. Then ${\cal E}_1({G_1}_{\rho})=\inf_{{G_1} \in {\cal L}_2({\cal H}_{Z},{\cal H}_{X,{\boldsymbol W}})}{\cal E}_1({G_1})$.
Furthermore, suppose that ${G_2}_{\rho} \in {\cal L}_2({\cal H}_{Z},\Omega_Y)$. Then ${\cal E}_1({G_2}_{\rho})=\inf_{{G_2} \in {\cal L}_2({\cal H}_{Z},\Omega_Y)}{\cal E}_1({G_2})$.
\end{assumption}

\begin{assumption}[Boundedness of stage 1]
\label{RAS4}
    Fix $\zeta_1, \zeta_2 < \infty$. For given $c_1,c_2 \in (1,2]$, define the prior ${\cal P}(\zeta_1,c_1)$ and ${\cal P}(\zeta_2,c_2)$ as the set of the probability distributions on $\Omega_{X,{\boldsymbol W}} \times \Omega_Z$ such that a range space assumption is satisfied: $\exists C_1 \in {\cal L}_2({\cal H}_Z,{\cal H}_{X,{\boldsymbol W}})$ such that $G_{1\rho}=T_1^{\frac{c_1-1}{2}} \circ C_1$ and $\|C_1\|_{{\cal L}_2({\cal H}_Z,{\cal H}_{X,{\boldsymbol W}})}^2 \leq \zeta_1$, and 
    $\exists C_2 \in {\cal L}_2({\cal H}_Z,\Omega_Y)$ such that $G_{2\rho}=T_1^{\frac{c_2-1}{2}} \circ C_2$ and $\|C_2\|_{{\cal L}_2({\cal H}_Z,\Omega_Y)}^2 \leq \zeta_2$.
\end{assumption}

\begin{lemma}[Rate of convergence of stage 1 (A)]
Make Assumptions \ref{RAS1}, \ref{RAS2}, \ref{RAS3} and \ref{RAS4}. For all $\delta \in (0,1)$, the following holds w.p. $1-\delta$:
\begin{eqnarray}
&&    \|\hat{{G_1}}_{\lambda}-{G_1}_{\rho}\|_{{\cal L}_2({\cal H}_Z,{\cal H}_{X,{\boldsymbol W}})} \nonumber\\
&&\leq \frac{\sqrt{\zeta_1}(c_1+1)}{4^{\frac{1}{c_1+1}}}\left( \frac{4\kappa(Q+\kappa\|{G_1}_{\rho}\|_{{\cal L}_2({\cal H}_Z,{\cal H}_{X,{\boldsymbol W}})})\ln(2/\delta)}{\sqrt{n\zeta_1}(c_1-1)}\right)^{\frac{c_1-1}{c_1+1}}
\end{eqnarray}
\end{lemma}

\begin{lemma}[Rate of convergence of stage 1 (B)]
Make Assumptions \ref{RAS1}, \ref{RAS2}, \ref{RAS3} and \ref{RAS4}. For all $\delta \in (0,1)$, the following holds w.p. $1-\delta$:
\begin{eqnarray}
&&    \|\hat{{G_2}}_{\lambda}-{G_2}_{\rho}\|_{{\cal L}_2({\cal H}_Z,\Omega_Y)} \nonumber\\
&&\leq \frac{\sqrt{\zeta_2}(c_2+1)}{4^{\frac{1}{c_2+1}}}\left( \frac{4\kappa(Q+\kappa\|{G_2}_{\rho}\|_{{\cal L}_2({\cal H}_Z,\Omega_Y)})\ln(2/\delta)}{\sqrt{n\zeta_2}(c_2-1)}\right)^{\frac{c_2-1}{c_2+1}}
\end{eqnarray}
\end{lemma}
The proofs of both lemmas are given in \citep{Singh2019}.
The above lemmas imply consistency of Stage 1 (A) and Stage 1 (B), respectively.
%\jin{What happens to Stage 1(B)?}

\begin{assumption}[Restriction of domain]
\label{RAS5}
    Suppose that $\Omega_Y$ is a Polish space, i.e., a separable and completely metrizable topological space.
\end{assumption}

\begin{assumption}[Boundedness of stage 2]
\label{RAS6}
    Suppose that
\begin{enumerate}
    \item The $\{\Psi_{\mu(z)-\mu(z_0)}\}$ operator family is uniformly bounded in Hilbert-Schmidt norm: $\exists B$ such that $\forall \mu(z)$, $\|\Psi_{\mu(z)-\mu(z_0)}\|^2_{{\cal L}_2(\Omega_Z,{\cal L}_2({\cal H}_Z,{\cal H}_{X,{\boldsymbol W}}))}=Tr(\Psi^*_{\mu(z)-\mu(z_0)} \circ \Psi_{\mu(z)-\mu(z_0)}) \leq B$.
    \item The $\{\Psi_{\mu(z)-\mu(z_0)}\}$ operator family is H\"{o}lder continuous in operator norm: $\exists L > 0$, $\iota \in (0,1]$ such that $\forall \mu(z), \mu(z')$, $\|\Psi_{\mu(z)-\mu(z_0)}-\Psi_{\mu(z')-\mu(z_0)}\|_{L(\Omega_Z,{\cal L}_2({\cal H}_Z,{\cal H}_{X,{\boldsymbol W}}))} \leq L \|\mu(z)-\mu(z')\|^{\iota}_{{\cal H}_{X,{\boldsymbol W}}}$.
\end{enumerate}
\end{assumption}

\begin{assumption}[Boundedness of stage 2]
\label{RAS7}
Suppose that
\begin{enumerate}
    \item $\left<H_{\rho},\cdot\ \right> \in {\cal L}_2({\cal H}_{X,{\boldsymbol W}},\Omega_Y)$. Then, ${\cal E}(H_{\rho})=\inf_{H \in {\cal H}_{X,{\boldsymbol W}}}{\cal E}(H)$.
    \item $Y$ is bounded, i.e. $\exists C < \infty$ such that $\|Y\| \leq C$ almost surely.
\end{enumerate}
\end{assumption}


\begin{assumption}[Boundedness of stage 2]
\label{RAS8}
    Fix $\zeta < \infty$. For given $b \in (1,\infty]$ and $c \in (1,2]$, define the prior ${\cal P}(\zeta,b,c)$ as the set of probability distributions $\mathfrak{p}$ on ${\cal H}_{X,{\boldsymbol W}} \times \Omega_Y$ such that
    \begin{enumerate}
        \item A range space assumption is satisfied: $\exists C \in {\cal L}_2({\cal H}_{X,{\boldsymbol W}},\Omega_Y)$ such that $H_{\rho}=T_2^{\frac{c-1}{2}}\circ C$ and $\|C\|^2_{{\cal L}_2({\cal H}_{X,{\boldsymbol W}},\Omega_Y)} \leq \zeta$.
        \item In the spectral decomposition $T_2=\sum_{k=1}^{\infty}\lambda_ke_k\left< \cdot, e_k \right>_{{\cal H}_{X,{\boldsymbol W}}}$, where $\{e_k\}_{k=1}^\infty$ is a basis of $Ker(T_2)^{\perp}$, the eigenvalues satisfy $\alpha \leq k^b\lambda_k\leq \beta$ for some $\alpha, \beta>0$.
    \end{enumerate}
\end{assumption}
These assumptions are for the boundedness of {\bf Stage 2}.

\begin{lemma}
\label{LEMKER}
    Make Assumptions \ref{RAS1}, \ref{RAS2}, \ref{RAS3}, \ref{RAS4}, \ref{RAS5}, \ref{RAS6}, \ref{RAS7} and \ref{RAS8}. Let $\lambda=N_1^{-\frac{1}{c_1+1}}$, $N_1=N_2^{\frac{a(c_1+1)}{\iota(c_1-1)}}$, $a>0$, and $\lambda_3=0$. We have 
    \begin{enumerate}
        \item if $a \leq \frac{b(c+1)}{bc+1}$ then ${\cal E}(\hat{H}_{\xi})-{\cal E}({H}_{\rho})={\cal O}_p(N_2^{-\frac{ac}{c+1}})$ with $\xi=N_2^{-\frac{a}{c+1}}$.
        \item if $a \geq \frac{b(c+1)}{bc+1}$ then ${\cal E}(\hat{H}_{\xi})-{\cal E}({H}_{\rho})={\cal O}_p(N_2^{-\frac{bc}{bc+1}})$ with $\xi=N_2^{-\frac{b}{bc+1}}$.
    \end{enumerate}
\end{lemma}
Lemma~\ref{LEMKER} follows from the proof of Theorem 4 
in \citep{Singh2019} by replacing $\mu(z)$ with $\mu(z)-\mu(z_0)$.

%\begin{theorem}
%    Make Assumptions \ref{RAS1}, \ref{RAS2}, \ref{RAS3}, \ref{RAS4}, \ref{RAS5}, \ref{RAS6}, \ref{RAS7} and \ref{RAS8}, then RKHS CAPCE estimator is consistent to CAPCE.
%\end{theorem}

%\subsection*{Proof of Theorem 4.5.}


{\it
{\bf Theorem \ref{RTEO1}.}
    Under SCM ${\cal M}_{IV}$ and Assumptions %3.1, 3.2,
    \ref{AS1}, \ref{AS2},  
    \ref{RAS1}, \ref{RAS2}, \ref{RAS3}, \ref{RAS4}, \ref{RAS5}, \ref{RAS6}, \ref{RAS7} and \ref{RAS8},
    %F.1, F.2, F.3, F.4, F.5, F.6, F.7, and F.8,
   the  RKHS CAPCE estimator in (25) %$\hat{\mathbb{E}}[\partial_x{Y}_{x}|{\boldsymbol W}={\boldsymbol w}]=\hat{\boldsymbol \alpha}^T{\bf K}_{(X,{\boldsymbol W})^{(1)}(x,{\boldsymbol w})}$ 
    converges pointwise to CAPCE %$\mathbb{E}[\partial_xY_{x}|{\boldsymbol W}={\boldsymbol w}]$ 
    when $\lambda_3=0$.
}
\begin{proof}
Lemma \ref{LEMKER} implies consistency of the RKHS CAPCE estimator by taking the limit $N_2 \rightarrow \infty$.
\end{proof}

