%\begin{wrapfigure}{r}[1pt]{0.47\textwidth}
\begin{figure}
    \centering
    %\vspace{-1cm}
    \scalebox{1}{
\begin{tikzpicture}
    % x node set with absolute coordinates
    \node[mynode] (x) at (0,0) {$X$};
    \node[mynode] (y) at (3,0) {$Y$};
    \node[mynode] (z) at (-3,0) {$Z$};
    \node[myfillnode] (u) at (3,2) {${\boldsymbol H}$};
    \node[mynode] (w) at (0,2) {${\boldsymbol W}$};
    %\node[mynode] (d) at (-1.5,2) {${\boldsymbol u}_X$};
    %\node[mynode] (e) at (4.5,2) {${\boldsymbol u}_Y$};

    % Directed edge
    \path (x) edge[->] (y);
    \path (z) edge[->]  (x);


    \path (u) edge[->] (y);
    \path (u) edge[->]  (x);
    \path (u) edge[->]  (w);

    \path (w) edge[->] (y);
    \path (w) edge[->]  (x);
    
    %\path (e) edge[->] (y);
    %\path (d) edge[->]  (x);

\end{tikzpicture}
}
\vspace{0.25cm}
    \caption{A causal graph representing the IV model.}% setting with covariates.}% Causal graph and two types of non-separability in the IV setting, ${\cal M}_{IV}$.}
    \label{DAG1}    
\vspace{-0cm}
\end{figure}
%\end{wrapfigure} 

%In this section, we introduce the basic notations and definitions in this paper and explain the research background.
We represent each variable with a capital letter $(X)$ and its realized value with a small letter $(x)$.
%Let $\mathbbm{1}_{\Omega}(x)$ be an indicator function, which is $1$ if $x \in \Omega$; and $0$ if $x \notin \Omega$.
Let $\Omega_X$ be the domain of $X$, $\mathbb{E}[Y]$ be the expectation of $Y$, $\mathbb{P}(X\leq x)$ be the cumulative distribution function (CDF) of $X$, and $\mathfrak{p}(X=x)$ be the probability density function (PDF) of $X$.
%In addition, $\mathbb{E}[Y|X=x]$, $\mathbb{P}(X\leq x|Z=z)$, and $\mathfrak{p}(X = x|Z=z)$ denote the conditional expectation of $Y$ given $X=x$, the conditional CDF and PDF of $X$ given $Z=z$.
%, and $\mathbb{V}(Y|X=x)$ be the conditional variance of $Y$ given $X=x$.
%We write $g(x)={\cal O}(h(x))$ as $x \rightarrow \infty$ if there exists a positive real number $M$ and a real number $\delta$ such that $|g(x)| \leq Mh(x)$ for all $x \geq \delta$.
%In contrast, we write $g(x)={\cal O}(h(x))$ as $x \rightarrow 0$ if there exists a positive real number $M$ and a real number $\delta$ such that $|g(x)| \leq Mh(x)$ for all $0 \leq |x| \leq \delta$.
{
A metric space $\left<\Omega,d\right>$, where the distance function $d(x,y)$ is defined by a given norm $\|x-y\|$ for $x,y \in \Omega$, is compact if every sequence in $\Omega$ has a convergent subsequence whose limit is in $\Omega$. }
If every Cauchy sequence of points in $\Omega$ has a limit in $\Omega$, $\Omega$ is called complete.

%{\bf Sobolev Space.} We define 
{\bf Sobolev norm} \citep{Gallant1987,Leoni2009}. 
Let ${\boldsymbol \lambda}$ be a $(d+1)$-dimensional vector of non-negative integers, $\displaystyle|{\boldsymbol \lambda}|=\sum_{l=1}^{d+1}{\lambda}_l$, and $\displaystyle D^{{\boldsymbol \lambda}}f(x,{\boldsymbol w})=\partial^{|{\boldsymbol \lambda}|}f(x,{\boldsymbol w})/\partial x^{{\lambda}_1}\partial {w_1}^{{\lambda}_2}\cdots\partial {w_d}^{{\lambda}_{d+1}}$.
The Sobolev norm is defined as follows:
%\begin{eqnarray}
%\label{RES1}
    $\displaystyle\|f\|_{W^{l,p}}=\left\{\sum_{|{\boldsymbol \lambda}|\leq l} \int \left|D^{{\boldsymbol \lambda}}f(x,{\boldsymbol w})\right|^p dxd{\boldsymbol w} \right\}^{1/p}$
%\end{eqnarray}
for $1\leq p < \infty$, and
%\begin{eqnarray}
%\label{RES1}
    $\|f\|_{W^{l,\infty}}=\max_{|{\boldsymbol \lambda}|\leq l}\sup_{(x,{\boldsymbol w})} \left|D^{{\boldsymbol \lambda}}f(x,{\boldsymbol w})\right|$.
%\end{eqnarray}
Note that the $W^{0,p}$ norm coincides with the $L_p$ norm for $1\leq p \leq \infty$.


{\bf Structural Causal Models (SCM).} %We use the language of Structural Causal Models (SCM) as our basic semantic and inferential framework \citep{Pearl09}.
We use SCM as our framework \citep{Pearl09}.
An SCM ${\cal M}$ is a tuple $\left<{\boldsymbol V},{\boldsymbol U}, {\cal F}, \mathbb{P}_{\boldsymbol U} \right>$, where ${\boldsymbol U}$ is a set of exogenous (unobserved) variables following a joint distribution $\mathbb{P}_{\boldsymbol U}$, and ${\boldsymbol V}$ is a set of endogenous (observable) variables whose values are determined by structural functions ${\cal F}=\{f_{V_i}\}_{V_i \in {\boldsymbol V}}$ such that $v_i:= f_{V_i}({\mathbf{pa}}_{V_i},{\boldsymbol u}_{V_i})$ where ${\mathbf{PA}}_{V_i} \subseteq {\boldsymbol V}$ and ${\boldsymbol U}_{V_i} \subseteq {\boldsymbol U}$. 
%The values of ${\boldsymbol H}$ are drawn from the distribution $\mathbb{P}_{\boldsymbol U,\epsilon_Y}$.
Each SCM ${\cal M}$ induces an observational distribution $\mathbb{P}_{\boldsymbol V}$ over ${\boldsymbol V}$, and a causal graph $G({\cal M})$ over ${\boldsymbol V}$ in which there exists a directed edge from every variable in ${\mathbf{PA}}_{V_i}$ to $V_i$.
%An intervention of setting a set of endogenous variables ${\boldsymbol X}$ to constants ${\boldsymbol x}$, denoted by $do({\boldsymbol x})$, 
An intervention $do({\boldsymbol x})$ of setting  endogenous variables ${\boldsymbol X}$ to constants ${\boldsymbol x}$ 
replaces the functions of ${\boldsymbol X}$
 by the constants ${\boldsymbol x}$ and induces a \textit{sub-model}  ${\cal M}_{{\boldsymbol x}}$.
We denote the potential outcome $Y$ under intervention $do({\boldsymbol x})$ by $Y_{{\boldsymbol x}}({\boldsymbol u})$, which is the solution of $Y$ in the sub-model ${\cal M}_{{\boldsymbol x}}$ given ${\boldsymbol U}={\boldsymbol u}$. 


%{\bf Average Partial Causal Effect (APCE).} Considering a continuous treatment $X$, the APCE $\mathbb{E}[\partial_x Y_x]:=\mathbb{E}_{\boldsymbol U}[\frac{\partial}{\partial x}Y_x({\boldsymbol U})]$ have been introduced by \citep{Chamberlain1984,Wooldridge2005,Graham2012}. APCE is a function from $x \in \Omega_X$ to $\mathbb{R}$, and a natural generalization of the average causal effect of a binary treatment $\mathbb{E}[Y_{1}]-\mathbb{E}[Y_{0}]$. % for $x,x' \in \Omega_X$. $\mathbb{E}[\partial_x Y_x]$ enables us to evaluate a popular target, the average causal effect of changing treatments from $x'$ to $x''$, by $\mathbb{E}[Y_{x'}]-\mathbb{E}[Y_{x''}]=\int_{x''}^{x'} \mathbb{E}[\partial_x Y_x] dx$. It is sufficient to evaluate APCE to reveal causal relationships.
\begin{comment}
\textbf{Conditional average causal effect (CACE).} 
%Heterogeneity of causal effect is the nonrandom and explainable variability in causal effects across subsets of the population \citep{Varadhan2013}.
CACE $\mathbb{E}[Y_1-Y_0|{\boldsymbol w}]:=\mathbb{E}_{\boldsymbol U}[Y_1({\boldsymbol U})-Y_0({\boldsymbol U})|{\boldsymbol W}={\boldsymbol w}]$  is the most common quantity for evaluating heterogeneous causal effects of  a binary treatment across subsets of the population where ${\boldsymbol W}$ is a set of covariates \citep{Athey2016,Ding2016,Kunzel2019,Wager2018}.
%We define the CACE below:
%\begin{definition}[Conditional average causal effect (CACE)]
%$\mathbb{E}[Y_1-Y_0|{\boldsymbol W}={\boldsymbol w}]$
%\end{definition}
%Here, ${\boldsymbol W}$ is the subjects covariates.
\end{comment}

{\bf Instrumental Variable (IV) Model with Covariates.}
We consider the IV model represented by the causal graph in Fig.~\ref{DAG1}, with the following SCM ${\cal M}_{IV}$ over ${\boldsymbol V}=\{Z,X,Y,{\boldsymbol W}\}$ and ${\boldsymbol U}=\{{\boldsymbol H},{\boldsymbol u}_X,{\boldsymbol u}_Y,{\boldsymbol u}_Z,{\boldsymbol u}_{\boldsymbol W}\}$: %\jin{Do the results hold with Z = f(W, ) or an edge from W to Z in Figure 1?} \yuta{[If there is an edge from W to Z in Figure 1, the distribution $P(X,W|Z)$, say $P(X|Z)$, is biased.]}
%The SCM of the IV model, ${\cal M}_{IV}$, is defined as
%\begin{eqnarray}
%\vspace{-0.9cm}
%{%\small
\begin{equation}
\begin{gathered}
Y:=f_Y(X,{\boldsymbol W},{\boldsymbol H},{\boldsymbol u}_Y),\  X:=f_X(Z,{\boldsymbol W},{\boldsymbol H},{\boldsymbol u}_X),\\
{\boldsymbol W}:=f_{\boldsymbol W}({\boldsymbol H},{\boldsymbol u}_{\boldsymbol W}),\  
Z:=f_Z({\boldsymbol u}_Z),
%\left\{
%\begin{array}{l}
%    Y:=f_Y(X,{\boldsymbol W},{\boldsymbol H},{\boldsymbol u}_Y)\\
%    X:=f_X(Z,{\boldsymbol W},{\boldsymbol H},{\boldsymbol u}_X)\\
%    {\boldsymbol W}:=f_X({\boldsymbol H},{\boldsymbol u}_{\boldsymbol W})
%\end{array}
%\right.
%\end{eqnarray}
\end{gathered}
\end{equation}
%\normalsize}
%with the conditional joint distribution $\mathbb{P}_{\{X,Y\}|Z}$.
where %$f_Y$, $f_X$, and $f_Z$ are scalar functions, and 
$f_{\boldsymbol W}$  is a vector function.  % ${\boldsymbol U}=\{{\boldsymbol H},{\boldsymbol u}_X,{\boldsymbol u}_Y,{\boldsymbol u}_Z,{\boldsymbol u}_{\boldsymbol W}\}$, and .
We assume all variables are continuous, %$Z$ are $M$-dimensional instrumental variables,  
${\boldsymbol W}$ are $d$-dimensional pre-treatment covariates, and ${\boldsymbol H}$ stands for unmeasured confounders. %and $d$-dimensional pre-treatment covariates ${\boldsymbol W}$ are generated from ${\boldsymbol H}$. 
{This IV model has been studied in e.g., \citep{Hartford2017,Klein2020}. We further consider an IV model with an additional edge ${\boldsymbol W} \rightarrow Z$ in Appendix \ref{appA2}. }
%This is the same SCM as \citep{Hartford2017}.
%In practice, we normally do not know the details of the structural functions, and causal effect identification concerns with whether a causal quantity measuring the effect of  treatment $X$ on outcome $Y$ is identified from the observational distribution $P(\boldsymbol{V})$. In general, further assumptions are needed. 
%We denote the potential outcomes 
%$Y_{x,{\boldsymbol w}}=f_Y(x,{\boldsymbol w},{\boldsymbol H},{\boldsymbol u}_Y)$, $X_{z,{\boldsymbol w}}=f_X(z,{\boldsymbol w},{\boldsymbol H},{\boldsymbol u}_X)$,  $Y_{X_{z,{\boldsymbol w}},{\boldsymbol w}}=f_Y(f_X(z,{\boldsymbol w},{\boldsymbol H},{\boldsymbol u}_X),{\boldsymbol w},{\boldsymbol H},{\boldsymbol u}_Y)$, 
%\yuta{$Y_{x}=f_Y(x,{\boldsymbol W},{\boldsymbol H},{\boldsymbol u}_Y)$, $X_{z}=f_X(z,{\boldsymbol W},{\boldsymbol H},{\boldsymbol u}_X)$, and $Y_{X_{z}}=f_Y(f_X(z,{\boldsymbol W},{\boldsymbol H},{\boldsymbol u}_X),{\boldsymbol W},{\boldsymbol H},{\boldsymbol u}_Y)$}.
%$\mathbb{E}[X_{z,{\boldsymbol w}}]=\mathbb{E}[X|{Z=z,{\boldsymbol W}={\boldsymbol w}}]$ and $\mathbb{E}[Y_{X_{z,{\boldsymbol w}},{\boldsymbol w}}]=\mathbb{E}[Y|{Z=z,{\boldsymbol W}={\boldsymbol w}}]$ holds since $Z$ is independent of ${\boldsymbol H}$. \jin{do not hold}
%\yuta{$\mathbb{E}[X_{z}|{\boldsymbol W}={\boldsymbol w}]=\mathbb{E}[X|{Z=z,{\boldsymbol W}={\boldsymbol w}}]$ and $\mathbb{E}[Y_{X_{z}}|{\boldsymbol W}={\boldsymbol w}]=\mathbb{E}[Y|{Z=z,{\boldsymbol W}={\boldsymbol w}}]$ holds since $Z$ is independent of ${\boldsymbol H}$ given ${\boldsymbol W}$.}
%$X$ is a treatment variable, $Y$ is an outcome variable, and $Z$ is an instrumental variable \citep{Pearl1995}.
%We are interested in inferring the APCE $\mathbb{E}[\partial_x Y_{x}]$ under this SCM.
%Since $Z$ is independent of ${\boldsymbol H}$, $\mathbb{E}[X_z]$ is equal to $\mathbb{E}[X|Z=z]$ and $\mathbb{E}[Y_{X_z}]$ is equal to $\mathbb{E}[Y|Z=z]$ for $z \in Z$. 
%This property is referred to as consistency \citep{Robins89,Pearl09}. \jin{Why do we need this? can be deleted?}


%{\bf Background.} 
{\bf Related work.} 
%With covariates ${\boldsymbol W}$ and IVs $Z$, 
Under the IV model, 
\citet{Whitney2003} introduced %sieve nonparametric two-stage least square (
sieve NTSLS for identifying and estimating $\mathbb{E}[Y_{x}|{\boldsymbol w}]$ via an integral equation, $\displaystyle \mathbb{E}[Y|Z=z]=\int_{\Omega_{\boldsymbol W}}\int_{\Omega_X} \mathfrak{p}(X=x,{\boldsymbol W}={\boldsymbol w}|Z=z)\mathbb{E}[Y_{x}|{\boldsymbol w}]dxd{\boldsymbol w}$, under the following assumption called \emph{separability}: 
%\vspace{-0.9cm}
\begin{equation}
\begin{gathered}
%\begin{eqnarray}
\label{eq-sep}
%\text{Separability:}
f_Y(X,{\boldsymbol W},{\boldsymbol H},{\boldsymbol u}_Y)=f_Y^1(X,{\boldsymbol W},{\boldsymbol u}_Y)+f_Y^2({\boldsymbol H}, {\boldsymbol u}_Y),\\
\mathbb{E}[f_Y^2({\boldsymbol H},{\boldsymbol u}_Y)|{\boldsymbol W}]=0, 
%\end{eqnarray}
\end{gathered}
\end{equation}
which says the function $f_Y(X,{\boldsymbol W},{\boldsymbol H},{\boldsymbol u}_Y)$ is in the form of a summation of two functions, one over $(X,{\boldsymbol W})$ and one over ${\boldsymbol H}$.   {Parametric PTSLS \citep{Angrist2009,Wooldridge2010} and Kernel IV \citep{Singh2019} methods for estimating $\mathbb{E}[Y_{x}|{\boldsymbol w}]$ have also been developed under the separability assumption.} 

%separability $f_Y(X,{\boldsymbol W},{\boldsymbol H},{\boldsymbol u}_Y)=f_Y^1(X,{\boldsymbol W},{\boldsymbol u}_Y)+f_Y^2({\boldsymbol H},{\boldsymbol u}_Y)$ and $\mathbb{E}[f_Y^2({\boldsymbol H},{\boldsymbol u}_Y)|Z]=0$. This separability is a strong assumption since all covariates and the treatment must be separable from all unmeasured confounders.
%This integral equation is widely used in the field of machine learning \citep{Hartford2017,Singh2019,Muandet2020}.
%Previously, two integral equations have been introduced when ${\boldsymbol W}$ is an empty set.
%First, an integral equation for estimating $\mathbb{E}[Y_x]$ have been introduced by \citep{Whitney2003}, which is $$\mathbb{E}[Y|Z=z]=\int_{\Omega_X} \mathbb{P}(X=x|Z=z)\mathbb{E}[Y_x]dx$$ 
%under assumptions that $f_Y(X,{\boldsymbol H})=f_Y^1(X)+f_Y^2({\boldsymbol H})$ and $\mathbb{E}[f_Y^2({\boldsymbol H})|Z]=0$, where $\mathbb{P}(X=x|Z=z)$ is a conditional density function, which is widely used in machine learning fields \citep{Hartford2017,Singh2019,Muandet2020}. 
Recently, \citet{Wong2022} introduced an integral equation for identifying APCE $\mathbb{E}[\partial_x Y_x]:=\mathbb{E}_{\boldsymbol U}[\partial_x Y_x({\boldsymbol U})]$ under the IV model with no covariates ${\boldsymbol W}$: %using an IV $Z$:
$\displaystyle \mathbb{E}[Y|Z=z]-\mathbb{E}[Y|Z=z_0]=-\int_{\Omega_X}\{\mathbb{P}(X\leq x|Z=z)-\mathbb{P}(X\leq x|Z=z_0)\}\mathbb{E}[\partial_x Y_x]dx$. 
\citet{Kawakami2023} developed parametric (P-APCE) and Picard iteration-based (N-APCE) estimators for APCE. In this paper, we extend their results and develop three families of methods for estimating CAPCE $\mathbb{E}[\partial_x Y_{x}|{\boldsymbol w}]$. Our parametric estimator reduces to P-APCE when $\boldsymbol{W}$ is empty. The sieve and RKHS estimators in this paper were not provided by \citet{Kawakami2023}. We note that the Picard-iteration estimator of \citet{Kawakami2023} is not suitable here because equation~(\ref{IE6}) uses a PDF in the integral kernel, whereas the corresponding equation for APCE uses a CDF.

%due to the use of a density function in the integral kernel instead of a cumulative distribution function (CDF).
%Especially, sieve and RKHS-based estimators in this paper are not in \citep{Kawakami2023}, even for estimating APCE.
%without any subject's covariates, ${\boldsymbol W}$.
%APCE is sufficient if we compare outcomes of the two treatments $\mathbb{E}[Y_{x''}]-\mathbb{E}[Y_{x'}]$.
%This paper will introduce conditions for directly identifying average partial causal effects (CAPCE) with the subject's covariates under weaker assumption than \citep{Whitney2003}. 

%Due to space constraints, all the proofs are provided in the Appendix in the supplementary material. 
%All the proofs are provided in the Appendix.















