%In this section, we give proofs of Theorems in the body of our paper.
\label{appA}
\begin{comment}
{\bf Proof of Proposition 1.}% \ref{TEO2}} 
We give proof of Proposition 1.

{\it
%\begin{theorem}
{\bf Proposition 1.}
    Under Assumption \ref{AS2}, conditional APCE is equal to the potential APCE, $\mathbb{E}[\partial_x Y_{x}|{\boldsymbol W}={\boldsymbol w}]=\mathbb{E}[\partial_x Y_{x}]$.
%\end{theorem}
}

\begin{proof}
Since $f_Y(X,{\boldsymbol W},{\boldsymbol H},{\boldsymbol u}_Y)=f_Y^1(X,{\boldsymbol W},{\boldsymbol u}_Y)+f_Y^2({\boldsymbol W},{\boldsymbol H},{\boldsymbol u}_Y)$, $\partial_x Y_{x} \indep {\boldsymbol W}$ holds. Then,
\begin{eqnarray}
    \mathbb{E}[\partial_x Y_{x}]=\mathbb{E}[\partial_x Y_{x}|{\boldsymbol W}={\boldsymbol w}]
\end{eqnarray}
holds, and 
\begin{eqnarray}
    \mathbb{E}[\partial_x Y_{x}|{\boldsymbol W}={\boldsymbol w}]=\mathbb{E}[\partial_x Y_{x}|{\boldsymbol W}={\boldsymbol w}]
\end{eqnarray}
holds from the counterfactual consistency.
\end{proof}

\end{comment}



\subsection{Proof of Theorem \ref{TEO2}}% \ref{TEO2}} 
We give the proof of Theorem \ref{TEO2}.

%\begin{theorem}
{\it
{\bf Theorem \ref{TEO2}.} (Identification of CAPCE).
%\label{TEO2}
Under SCM ${\cal M}_{IV}$ and Assumptions \ref{AS1} and \ref{AS2}, CAPCE $\mathbb{E}[\partial_x Y_{x}|{\boldsymbol w}]$ is identifiable from distributions $\mathbb{P}(X, {\boldsymbol W}|Z)$ and $\mathbb{P}( Y |Z)$ via the  integral equation:
%\footnotesize
\begin{eqnarray}
\mu(z)=\int_{\Omega_{\boldsymbol W}}\int_{\Omega_X} k(z,x,{\boldsymbol w})\mathbb{E}[\partial_x Y_{x}|{\boldsymbol w}] dxd{\boldsymbol w},
\end{eqnarray}
%\normalsize
where  $\mu(z)=\mathbb{E}[Y|Z=z_0]-\mathbb{E}[Y|Z=z], k(z,x,{\boldsymbol w})=\mathfrak{p}(X\leq x,{\boldsymbol W}={\boldsymbol w}|Z=z)-\mathfrak{p}(X\leq x,{\boldsymbol W}={\boldsymbol w}|Z=z_0)$, and $z_0$ is a fixed value.
%\end{theorem}
}




\begin{proof}
%From the result of \citep{Wong2022}, the following integral equation holds under Assumptions 3.1 and 3.2:
%\jin
{First, we show the following integral equation holds under Assumptions \ref{AS1} and \ref{AS2} following the idea in \citep{Wong2022}:}
\begin{eqnarray}
&&\mathbb{E}[Y|Z=z,{\boldsymbol W}={\boldsymbol w}]-\mathbb{E}[Y|Z=z_0,{\boldsymbol W}={\boldsymbol w}]\\
&&=- \int_{\Omega_X}\{\mathbb{P}(X\leq x|Z=z,{\boldsymbol W}={\boldsymbol w})-\mathbb{P}(X\leq x|Z=z_0,{\boldsymbol W}={\boldsymbol w})\}
%\\&&\hspace{6cm}\times 
\mathbb{E}[\partial_xY_{x}|{\boldsymbol w}]dx.\nonumber
\end{eqnarray} 
%Here, we give the sketch of the proof as below.
From the setting of the IV, the following integral equation holds:
\begin{eqnarray}
Y_{X_{z}}= \int_{\Omega_X} \mathbbm{1}_{X_{z}=x}Y_{x}dx,
\end{eqnarray}
given ${\boldsymbol W}={\boldsymbol w}$ for each subject, where $\mathbbm{1}_{\cdot}$ is a delta function or indicator function. 
This equation means $X_{z}=x \Rightarrow Y_{X_{z}}=Y_{x}$ from the definition of delta function.
Subtracting the integral equation $\displaystyle Y_{X_{z_0}}= \int_{\Omega_X} \mathbbm{1}_{X_{z_0}=x}Y_{x}dx$ from $\displaystyle Y_{X_{z}}=\int_{\Omega_X} \mathbbm{1}_{X_{z}=x}Y_{x}dx$, we see that
\begin{eqnarray}
Y_{X_{z}}-Y_{X_{z_0}}= \int_{\Omega_X} \{\mathbbm{1}_{X_{z}=x}-\mathbbm{1}_{X_{z_0}=x}\}Y_{x}dx
\end{eqnarray}
holds. Since the Heaviside step function is the integration of the delta function, integration by parts gives
\begin{eqnarray}
&&Y_{X_{z}}-Y_{X_{z_0}}=\left[\{\mathbbm{I}_{X_{z}\leq x}-\mathbbm{I}_{X_{z_0}\leq x}\}Y_{x}\right]_{-\infty}^{\infty}- \int_{\Omega_X} \{\mathbbm{I}_{X_{z}\leq x}-\mathbbm{I}_{X_{z_0}\leq x}\}\partial_xY_{x}dx.
\end{eqnarray}
Because $\mathbbm{I}_{X_{z}\leq x}-\mathbbm{I}_{X_{z_0}\leq x}=0$ for all $x<\min\{X_{z},X_{z_0}\}$ and for all $x\geq\max\{X_{z},X_{z_0}\}$, and because $Y_{x}$ and $\partial_xY_{x}$ are finite for all $x \in \Omega_X$, $\displaystyle \left[ \{\mathbbm{I}_{X_{z}\leq x}-\mathbbm{I}_{X_{z_0}\leq x}\}Y_{x}\right]_{-\infty}^{\infty}=0$ holds.
Then, the integral equation becomes
\begin{eqnarray}
Y_{X_{z}}-Y_{X_{z_0}}=-\int_{\Omega_X} \{\mathbbm{I}_{X_{z}\leq x}-\mathbbm{I}_{X_{z_0}\leq x}\}\partial_xY_{x}dx.
\end{eqnarray}
From the separability with covariate $f_Y(X,{\boldsymbol W},{\boldsymbol H},{\boldsymbol u}_Y)=f_Y^1(X,{\boldsymbol W},{\boldsymbol u}_Y)+f_Y^2({\boldsymbol W},{\boldsymbol H},{\boldsymbol u}_Y)$, random variables $\mathbbm{I}_{X_{z}\leq x}-\mathbbm{I}_{X_{z_0}\leq x}$ and $\partial_xY_{x}$ are independent given ${\boldsymbol W}={\boldsymbol w}$.
Thus, we take expectations on both sides:
\begin{eqnarray}
&&\mathbb{E}[Y_{X_{z}}|{\boldsymbol W}={\boldsymbol w}]-\mathbb{E}[Y_{X_{z_0}}|{\boldsymbol W}={\boldsymbol w}]\\
&&=- \int_{\Omega_X} \mathbb{E}[\{\mathbbm{I}_{X_{z}\leq x}-\mathbbm{I}_{X_{z_0}\leq x}\}\partial_xY_{x}|{\boldsymbol W}={\boldsymbol w}]dx\\
&&=- \int_{\Omega_X}\{\mathbb{E}[\mathbbm{I}_{X_{z}\leq x}|{\boldsymbol W}={\boldsymbol w}]-\mathbb{E}[\mathbbm{I}_{X_{z_0}\leq x}|{\boldsymbol W}={\boldsymbol w}]\}\mathbb{E}[\partial_xY_{x}|{\boldsymbol w}]dx.
\end{eqnarray}
Then, the integral equation becomes
\begin{eqnarray}
\label{c1}
&&\mathbb{E}[Y|Z=z,{\boldsymbol W}={\boldsymbol w}]-\mathbb{E}[Y|Z=z_0,{\boldsymbol W}={\boldsymbol w}]\\
&&=-\int_{\Omega_X}\{\mathbb{P}(X\leq x|Z=z,{\boldsymbol W}={\boldsymbol w})-\mathbb{P}(X\leq x|Z=z_0,{\boldsymbol W}={\boldsymbol w})\}\mathbb{E}[\partial_xY_{x}|{\boldsymbol w}]dx.\nonumber
\end{eqnarray}


Next, %The uniqueness holds from the completeness of random variables $X_{z}$.
the integral equation is obtained by multiplying both sides by $\mathfrak{p}({\boldsymbol W}={\boldsymbol w}|Z=z)$ and marginalizing over ${\boldsymbol W}$:
\begin{eqnarray}
%&&\mathbb{E}[Y|Z=z,{\boldsymbol W}={\boldsymbol w}]=\int_{\Omega_X}\mathbb{P}(X \leq x|Z=z,{\boldsymbol W}={\boldsymbol w}) \mathbb{E}[\partial_x Y_{x}]dx\\
%&\Leftrightarrow&
&&\mathbb{E}_{\boldsymbol W}[\mathbb{E}[Y|Z=z,{\boldsymbol W}={\boldsymbol w}]]=\\
&&\int_{\Omega_X}\int_{\Omega_{\boldsymbol W}}\mathbb{P}(X \leq x|Z=z,{\boldsymbol W}={\boldsymbol w})\mathfrak{p}({\boldsymbol W}={\boldsymbol w}|Z=z) \mathbb{E}[\partial_x Y_{x}|{\boldsymbol w}]d{\boldsymbol w}dx\\
&\Leftrightarrow&\mathbb{E}[Y|Z=z]=\int_{\Omega_X}\int_{\Omega_{\boldsymbol W}}\mathfrak{p}(X \leq x,{\boldsymbol W}={\boldsymbol w}|Z=z)\mathbb{E}[\partial_x Y_{x}|{\boldsymbol w}]d{\boldsymbol w}dx.
\end{eqnarray}

Finally, we show the uniqueness of the solution. 
Since $X_z$ is a nontrivial function, there does not exist a nonzero function $\delta$ which satisfies $\mathbb{E}[\delta(X)|Z=z,{\boldsymbol W}={\boldsymbol w}]=0$ for any $z \in \Omega_Z$ and ${\boldsymbol w} \in \Omega_{\boldsymbol W}$.
Since $\mathbb{E}[\delta(X),{\boldsymbol W}={\boldsymbol w}|Z=z]=\mathbb{E}[\delta(X)|Z=z,{\boldsymbol W}={\boldsymbol w}]\mathfrak{p}({\boldsymbol W}={\boldsymbol w}|Z=z)$ with $\mathfrak{p}({\boldsymbol W}={\boldsymbol w}|Z=z)>0$, there exists a nonzero function which satisfies $\mathbb{E}[\delta(X)|Z=z,{\boldsymbol W}={\boldsymbol w}]=0$ for any $z \in \Omega_Z$ and ${\boldsymbol w} \in \Omega_{\boldsymbol W}$ if there exists a nonzero function which satisfies $\mathbb{E}[\delta(X),{\boldsymbol W}={\boldsymbol w}|Z=z]=0$ for any $z \in \Omega_Z$ and ${\boldsymbol w} \in \Omega_{\boldsymbol W}$.
Taking the contraposition, there does not exist a nonzero function which satisfies $\mathbb{E}[\delta(X),{\boldsymbol W}={\boldsymbol w}|Z=z]=0$ for any $z \in \Omega_Z$ and ${\boldsymbol w} \in \Omega_{\boldsymbol W}$.
\end{proof}



%\begin{wrapfigure}{r}[1pt]{0.47\textwidth}
\begin{figure}
    \centering
    %\vspace{-1cm}
    \scalebox{1}{
\begin{tikzpicture}
    % x node set with absolute coordinates
    \node[mynode] (x) at (0,0) {$X$};
    \node[mynode] (y) at (3,0) {$Y$};
    \node[mynode] (z) at (-3,0) {$Z$};
    \node[myfillnode] (u) at (3,2) {${\boldsymbol H}$};
    \node[mynode] (w) at (0,2) {${\boldsymbol W}$};
    %\node[mynode] (d) at (-1.5,2) {${\boldsymbol u}_X$};
    %\node[mynode] (e) at (4.5,2) {${\boldsymbol u}_Y$};

    % Directed edge
    \path (x) edge[->] (y);
    \path (z) edge[->]  (x);


    \path (u) edge[->] (y);
    \path (u) edge[->]  (x);
    \path (u) edge[->]  (w);

    \path (w) edge[->] (y);
    \path (w) edge[->]  (x);

    \path (w) edge[->]  (z);
    
    %\path (e) edge[->] (y);
    %\path (d) edge[->]  (x);

\end{tikzpicture}
}
\vspace{-0cm}
    \caption{A causal graph representing the IV setting with covariates when there is an edge ${\boldsymbol W} \rightarrow Z$.}% Causal graph and two types of non-separability in the IV setting, ${\cal M}_{IV}$.}
    \label{DAG1d}
    
\vspace{-0cm}
\end{figure}
%\end{wrapfigure} 

\subsection{Identification theorem under IV model in Fig.~\ref{DAG1d}}
\label{appA2}
We consider the IV model with covariates represented by the causal graph in Fig.~\ref{DAG1d}, with the following SCM ${\cal M}_{IV}'$ over ${\boldsymbol V}=\{Z,X,Y,{\boldsymbol W}\}$ and ${\boldsymbol U}=\{{\boldsymbol H},{\boldsymbol u}_X,{\boldsymbol u}_Y,{\boldsymbol u}_Z,{\boldsymbol u}_{\boldsymbol W}\}$: %\jin{Do the results hold with Z = f(W, ) or an edge from W to Z in Figure 1?} \yuta{[If there is an edge from W to Z in Figure 1, the distribution $P(X,W|Z)$, say $P(X|Z)$, is biased.]}
%The SCM of the IV model, ${\cal M}_{IV}$, is defined as
%\begin{eqnarray}
%\vspace{-0.9cm}
\begin{equation}
%\small
\begin{gathered}
Y:=f_Y(X,{\boldsymbol W},{\boldsymbol H},{\boldsymbol u}_Y),\  X:=f_X(Z,{\boldsymbol W},{\boldsymbol H},{\boldsymbol u}_X),
{\boldsymbol W}:=f_{\boldsymbol W}({\boldsymbol H},{\boldsymbol u}_{\boldsymbol W}),\  
Z:=f_Z({\boldsymbol W},{\boldsymbol u}_Z),
%\left\{
%\begin{array}{l}
%    Y:=f_Y(X,{\boldsymbol W},{\boldsymbol H},{\boldsymbol u}_Y)\\
%    X:=f_X(Z,{\boldsymbol W},{\boldsymbol H},{\boldsymbol u}_X)\\
%    {\boldsymbol W}:=f_X({\boldsymbol H},{\boldsymbol u}_{\boldsymbol W})
%\end{array}
%\right.
%\end{eqnarray}
\end{gathered}
\end{equation}
%\normalsize
%with the conditional joint distribution $\mathbb{P}_{\{X,Y\}|Z}$.
where %$f_Y$, $f_X$, and $f_Z$ are scalar functions, and 
$f_{\boldsymbol W}$  is a vector function.  % ${\boldsymbol U}=\{{\boldsymbol H},{\boldsymbol u}_X,{\boldsymbol u}_Y,{\boldsymbol u}_Z,{\boldsymbol u}_{\boldsymbol W}\}$, and .
We assume all variables are continuous, %$Z$ are $M$-dimensional instrumental variables, 
${\boldsymbol W}$ are $d$-dimensional pre-treatment covariates, and 
 ${\boldsymbol H}$ stands for unmeasured confounders. 
 We show a similar identification result to Theorem \ref{TEO2}. 

{\bf Theorem \ref{TEO2}'.}
{\it Under SCM ${\cal M}_{IV}'$ and Assumptions \ref{AS1} and \ref{AS2}, CAPCE $\mathbb{E}[\partial_x Y_{x}|{\boldsymbol w}]$ is identifiable from distributions $\mathbb{P}(X|Z,{\boldsymbol W})$ and $\mathbb{P}(Y|Z,{\boldsymbol W})$ via the integral equation:
\begin{eqnarray}
\label{d1}
\mu(z,{\boldsymbol w})=\int_{\Omega_X} k(z,x,{\boldsymbol w})\mathbb{E}[\partial_x Y_{x}|{\boldsymbol w}] dx,
\end{eqnarray}
where $\mu(z,{\boldsymbol w})=\mathbb{E}[Y|Z=z_0,{\boldsymbol W}={\boldsymbol w}]-\mathbb{E}[Y|Z=z,{\boldsymbol W}={\boldsymbol w}], k(z,x,{\boldsymbol w})=\mathfrak{p}(X\leq x|Z=z,{\boldsymbol W}={\boldsymbol w})-\mathfrak{p}(X\leq x|Z=z_0,{\boldsymbol W}={\boldsymbol w})$, and $z_0$ is a fixed value.}

\begin{proof}
    Eq. (\ref{d1}) is guaranteed by Eq. (\ref{c1}), which appears in the proof of Theorem \ref{TEO2}.
\end{proof}
Based on Theorem \ref{TEO2}', we have to learn $\mathbb{E}[\partial_x Y_{x}|{\boldsymbol w}]$ as a function of $x$ for each ${\boldsymbol w} \in \Omega_{\boldsymbol W}$ respectively. In contrast, based on Theorem \ref{TEO2},  we can learn $\mathbb{E}[\partial_x Y_{x}|{\boldsymbol w}]$ directly as a function of $x$ and ${\boldsymbol w}$. 
%{Theorem 3.1' assumes the separabilty as Assumption 3.1 and 3.2. The disadvantage of Theorem 3.1' is that we have to learn $\mathbb{E}[\partial_x Y_{x}|{\boldsymbol w}]$ by functions on $x$ for each ${\boldsymbol w} \in \Omega_{\boldsymbol W}$ respectively. The advantage of Theorem 3.1 is that we can learn $\mathbb{E}[\partial_x Y_{x}|{\boldsymbol w}]$ by one function on $x$ and ${\boldsymbol w}$.}

{We perform experiments on estimating CAPCE based on Theorem \ref{TEO2}' in Appendix~\ref{sec-prime}.}

\subsection{Derivation of RKHS CAPCE estimator}
%We give proof RKHS estimator following \citep{Singh2019}.
We show the detailed steps of deriving the RKHS CAPCE estimator.

{\bf RKHS estimator.} 
RKHS CAPCE estimator is given as $\hat{\mathbb{E}}[\partial_x{Y}_{x}|{\boldsymbol w}]=\hat{\boldsymbol \alpha}^T{\bf K}_{(X,{\boldsymbol W})^{(1)}(x,{\boldsymbol w})}$ where
\begin{eqnarray}
&&\hat{\boldsymbol \alpha}=(\hat{\bf O}\hat{\bf O}^T+N_2\xi {\bf K}_{(X,{\boldsymbol W})^{(1)}(X,{\boldsymbol W})^{(1)}}+N_2\lambda_3 {\bf I}_{N_1})^{-1}\hat{\bf O}\nonumber\\
&&\hspace{2cm}\times\{{\boldsymbol y}^{(2)T}({\bf K}_{Z^{(2)}Z^{(2)}}+N_2\lambda_2 {\bf I}_{N_2})^{-1}({\bf K}_{Z^{(2)}Z^{(2)}}-{\bf K}_{Z^{(2)}z_0})\}\nonumber\\
&&\hat{\bf O}={\bf K}_{(X,{\boldsymbol W})^{(1)}(X,{\boldsymbol W})^{(1)}}({\bf K}_{Z^{(1)}Z^{(1)}}+N_1\lambda_1 {\bf I}_{N_1})^{-1}({\bf K}_{Z^{(1)}Z^{(2)}}-{\bf K}_{Z^{(1)}z_0}),
\end{eqnarray}
$(\lambda_1,\lambda_2,\lambda_3,\xi)$ are regularization parameters, and ${\bf I}_N$ is an $N \times N$ identity matrix.




\begin{proof}
There are three optimization problems in the RKHS estimator: {\bf Stage 1 (A)} {learning linear operator $G_1$}, {\bf Stage 1 (B)} {learning linear operator $G_2$}, and {\bf Stage 2} {learning linear operator $H$}.  We explain them in turn.

{\bf Stage 1 (A).}
We denote the feature maps by $\psi(z)$ and $\pi(x,{\boldsymbol w})$, where $\pi(x,{\boldsymbol w})=-\int_{-\infty}^x \eta(x',{\boldsymbol w}) dx'$ for some feature function $\eta(x',{\boldsymbol w})$.
The optimization problem in {\bf Stage 1 (A)}  becomes
\begin{eqnarray}
    \min_{G_1 \in {\cal L}_2({\cal H}_Z,{\cal H}_{X,{\boldsymbol W}})} {N_1}^{-1}\sum\nolimits_{i=1}^{N_1}\left\|\pi(x_i^{(1)},{\boldsymbol w}_i^{(1)})-G_1(\psi(z_i^{(1)}))\right\|^2_{{\cal H}_{X,{\boldsymbol W}}}+\lambda_1\|G_1\|^2_{{\cal L}_2({\cal H}_Z,{\cal H}_{X,{\boldsymbol W}})}.
\end{eqnarray}
using ${\cal D}^{(1)}$. Then, the estimator $\hat{G_1}$ becomes
\begin{eqnarray}
\hat{G_1}(\cdot)=\left<{\pi_{X^{(1)},{\boldsymbol W}^{(1)}}}({\bf K}_{Z^{(1)}Z^{(1)}}+N_1 \lambda_1 {\bf I})^{-1}\psi_Z^{(1)T},\cdot\right>
\end{eqnarray}
where ${\bf K}_{Z^{(1)}Z^{(1)}}$ and ${\bf K}_{(X,{\boldsymbol W})^{(1)}(X,{\boldsymbol W})^{(1)}}$ are the empirical kernel matrices, the $i$-th column of ${\pi_{X^{(1)},{\boldsymbol W}^{(1)}}}$ is $\displaystyle -\int_{-\infty}^{x_i^{(1)}}\eta(x,{\boldsymbol w}_i^{(1)})dx$, and the $i$-th column of ${\psi_Z}^{(1)}$ is $\psi(z_i^{(1)})$. 
The prediction values are
\begin{eqnarray}
    d_{0}(z)={\pi_{X^{(1)},{\boldsymbol W}^{(1)}}}({\bf K}_{Z^{(1)}Z^{(1)}}+N_1 \lambda_1 {\bf I})^{-1}\psi_Z^{(1)T}\psi(z)=-\sum_{i=1}^{N_1}\gamma_i(z)\int_{-\infty}^{x_i^{(1)}}\eta(x,{\boldsymbol w}_i^{(1)})dx
\end{eqnarray}
where $\gamma(z)=({\bf K}_{Z^{(1)}Z^{(1)}}+N_1 \lambda_1 {\bf I})^{-1}\psi_Z^{(1)T}\psi(z)=({\bf K}_{Z^{(1)}Z^{(1)}}+N_1 \lambda_1 {\bf I})^{-1}{\bf K}_{Z^{(1)}z}$.
Furthermore, the difference in the prediction values is
\begin{eqnarray}
    d(z)=d_{0}(z)-d_{0}(z_0)=-\sum_{i=1}^{N_1}\{\gamma_i(z)-\gamma_i(z_0)\}\int_{-\infty}^{x_i^{(1)}}\eta(x,{\boldsymbol w}_i^{(1)})dx
\end{eqnarray}
 and $\gamma(z)-\gamma(z_0)=({\bf K}_{Z^{(1)}Z^{(1)}}+N_1 \lambda_1 {\bf I})^{-1}\psi_Z^{(1)T}\{\psi(z)-\psi(z_0)\}=({\bf K}_{Z^{(1)}Z^{(1)}}+N_1 \lambda_1 {\bf I})^{-1}({\bf K}_{Z^{(1)}z}-{\bf K}_{Z^{(1)}z_0})$ holds.
Let $\hat{G_1}=\sum_{j=1}^{N_1}\alpha_j \eta(x_j^{(1)},{\boldsymbol w}_j^{(1)})$, since the optimal $\hat{G_1}$ lies in ${\text{span}}(\{\eta(x_j^{(1)},{\boldsymbol w}_j^{(1)})\}_{j=1}^{N_1})$ by the representer theorem \citep{Schlkopf2001}. 
Then the functional form of $d(z)$ is restricted by
\begin{eqnarray}
d(z)
    &=&-\left<\sum_{j=1}^{N_1}\alpha_j \int_{-\infty}^{x_j^{(1)}}\eta(x,{\boldsymbol w}_j^{(1)})dx,-\sum_{i=1}^{N_1}\{\gamma_i(z)-\gamma_i(z_0)\}\int_{-\infty}^{x_i^{(1)}}\eta(x,{\boldsymbol w}_i^{(1)})dx\right>\\
    &=&\sum_{i=1}^{N_1}\sum_{j=1}^{N_1}\alpha_j\{\gamma_i(z)-\gamma_i(z_0)\}\left< -\int_{-\infty}^{x_i^{(1)}}\eta(x,{\boldsymbol w}_i^{(1)})dx,-\int_{-\infty}^{x_j^{(1)}}\eta(x,{\boldsymbol w}_j^{(1)})dx\right>\\
    &=&\sum_{i=1}^{N_1}\sum_{j=1}^{N_1}\alpha_j\{\gamma_i(z)-\gamma_i(z_0)\}\left< \pi(x_i^{(1)},{\boldsymbol w}_i^{(1)}),\pi(x_j^{(1)},{\boldsymbol w}_j^{(1)})\right>.
    \end{eqnarray}
From the kernel trick, it becomes
    \begin{eqnarray}
    &=&\sum_{i=1}^{N_1}\sum_{j=1}^{N_1}\alpha_j\{\gamma_i(z)-\gamma_i(z_0)\}k((x_i^{(1)},{\boldsymbol w}_i^{(1)}),(x_j^{(1)},{\boldsymbol w}_j^{(1)}))\\
    &=& {\boldsymbol \alpha}^T w(z)
\end{eqnarray}
where $w(z)={\bf K}_{(X,{\boldsymbol W})^{(1)}(X,{\boldsymbol W})^{(1)}}({\bf K}_{Z^{(1)}Z^{(1)}}+N_1\lambda_1 {\bf I})^{-1}({\bf K}_{Z^{(1)}z}-{\bf K}_{Z^{(1)}z_0})$.
Note that ${\boldsymbol \alpha}$ will be estimated in {\bf Stage 2}.


{\bf Stage 1 (B).}
The optimization problem in {\bf Stage 1 (B)} is %\jin{should $G_2 \in H_Z$ in the following?}
\begin{eqnarray}
\min_{G_2 \in {\cal L}_2({\cal H}_{{Z}},\Omega_Y)} {N_2}^{-1}\sum\nolimits_{i=1}^{N_2}\left\|y_i^{(2)}-G_2(\psi(z_i^{(2)}))\right\|^2+\lambda_2\|G_2\|^2_{{\cal L}_2({\cal H}_{{Z}},\Omega_Y)}
\end{eqnarray}
using ${\cal D}^{(2)}$. As in {\bf Stage 1 (A)}, the estimator of $G_2$, $\hat{G_2}$, becomes
\begin{eqnarray}
\hat{G_2}(\cdot)=\left<{\boldsymbol y}^{(2)T}({\bf K}_{Z^{(2)}Z^{(2)}}+N_2 \lambda_2 {\bf I})^{-1}\psi_Z^{(2)T},\cdot\right>
\end{eqnarray}
where ${\bf K}_{Z^{(2)}Z^{(2)}}$ is the Gram matrix and the $i$-th element of ${\boldsymbol y}^{(2)}$ is $y_i^{(2)}$. The prediction values are
\begin{eqnarray}
    u_{0}(z)={\boldsymbol y}^{(2)T}({\bf K}_{Z^{(2)}Z^{(2)}}+N_2 \lambda_2 {\bf I})^{-1}\psi_Z^{(2)T}\psi(z)=\sum_{i=1}^{N_2}\gamma_i(z)y_i^{(2)}
\end{eqnarray}
where $\gamma(z)=({\bf K}_{Z^{(2)}Z^{(2)}}+N_2 \lambda_2 {\bf I})^{-1}\psi_Z^{(2)T}\psi(z)=({\bf K}_{Z^{(2)}Z^{(2)}}+N_2 \lambda_2 {\bf I})^{-1}{\bf K}_{Z^{(2)}z}$. Then,
\begin{eqnarray}
    u_{0}(z_0)={\boldsymbol y}^{(2)T}({\bf K}_{Z^{(2)}Z^{(2)}}+N_2\lambda_2 {\bf I})^{-1}{\bf K}_{Z^{(2)}z_0}
\end{eqnarray}
and the difference of the prediction values is 
\begin{eqnarray}
    u(z)=u_0(z)-u_0(z_0)={\boldsymbol y}^{(2)T}({\bf K}_{Z^{(2)}Z^{(2)}}+N_2\lambda_2 {\bf I})^{-1}({\bf K}_{Z^{(2)}z}-{\bf K}_{Z^{(2)}z_0}).
\end{eqnarray}
This is the estimator of $\mathbb{E}[Y|Z=z]-\mathbb{E}[Y|Z=z_0]$.






{\bf Stage 2.} 
The optimization problem in {\bf Stage 2} using ${\cal D}^{(2)}$ is 
%\jin{But you are using ${\cal D}^{(2)}$ in the following?}
\begin{eqnarray}
  &&\min_{H \in {\cal L}_2({\cal H}_{X,{\boldsymbol W}},\Omega_Y)} {N_2}^{-1}\sum\nolimits_{i=1}^{N_2}\left\|\hat{G}_2(\psi(z_i^{(2)})-\psi(z_0))-H(\hat{G}_1( \psi(z_i^{(2)})-\psi(z_0)))\right\|^2\nonumber\\
  &&\hspace{2cm}+\xi\|H\|^2_{{\cal L}_2({\cal H}_{X,{\boldsymbol W}},\Omega_Y)}+\lambda_3\|H\circ \hat{G}_1\|^2_{{\cal L}_2({\cal H}_Z,\Omega_Y)}.
\end{eqnarray}
Then, the estimation problem reduces to
\begin{eqnarray}
    &&\frac{1}{N_2}\sum_{i=1}^{N_2}(y_i^{(2)}-u_{0}(z_0)-{\boldsymbol \alpha}^T w(z_i^{(2)}))^2+\xi {\boldsymbol \alpha}^T{\bf K}_{(X,{\boldsymbol W})^{(1)}(X,{\boldsymbol W})^{(1)}}{\boldsymbol \alpha}+\lambda_3  {\boldsymbol \alpha}^T {\boldsymbol \alpha}\\
    &=&\frac{1}{N_2}\|{\boldsymbol y}^{(2)}-{\boldsymbol y}^{(2)T}({\bf K}_{Z^{(2)}Z^{(2)}}+N_2\lambda_2 {\bf I})^{-1}{\bf K}_{Z^{(2)}z_0}\\
    &&\hspace{0cm}-({\bf K}_{(X,{\boldsymbol W})^{(1)}(X,{\boldsymbol W})^{(1)}}({\bf K}_{Z^{(1)}Z^{(1)}}+N_1\lambda_1 {\bf I})^{-1}({\bf K}_{Z^{(1)}Z^{(2)}}-{\bf K}_{Z^{(1)}z_0}))^T{\boldsymbol \alpha}\|^2\\
    &&+\xi {\boldsymbol \alpha}^T{\bf K}_{(X,{\boldsymbol W})^{(1)}(X,{\boldsymbol W})^{(1)}}{\boldsymbol \alpha}+\lambda_3  {\boldsymbol \alpha}^T {\boldsymbol \alpha},
\end{eqnarray}
and the solution to this optimization problem can be represented as
\begin{eqnarray}
&&\hat{\boldsymbol \alpha}=(\hat{\bf O}\hat{\bf O}^T+N_2\xi {\bf K}_{(X,{\boldsymbol W})^{(1)}(X,{\boldsymbol W})^{(1)}}+N_2\lambda_3{\bf I})^{-1}\hat{\bf O}\\
&&\hspace{3cm}\times ({\boldsymbol y}^{(2)}-{\boldsymbol y}^{(2)T}({\bf K}_{Z^{(2)}Z^{(2)}}+N_2\lambda_2 {\bf I})^{-1}{\bf K}_{Z^{(2)}z_0})\\
&&\hat{\bf O}={\bf K}_{(X,{\boldsymbol W})^{(1)}(X,{\boldsymbol W})^{(1)}}({\bf K}_{Z^{(1)}Z^{(1)}}+N_1\lambda_1 {\bf I})^{-1}({\bf K}_{Z^{(1)}Z^{(2)}}-{\bf K}_{Z^{(1)}z_0}).
\end{eqnarray}
Finally, the RKHS CAPCE estimator at $(x,{\boldsymbol w})$ becomes
$\hat{\mathbb{E}}[\partial_x{Y}_{x}|{\boldsymbol w}]=\hat{\boldsymbol \alpha}^T{\bf K}_{(X,{\boldsymbol W})^{(1)}(x,{\boldsymbol w})}$.
\end{proof}



