
%In this section, we introduce the notation and background used throughout this paper.
We represent each variable with a capital letter $(X)$ and its realized value with a lowercase letter $(x)$. Let $\mathbb{I}(x)$ denote the indicator function, which takes the value $1$ if $x$ is true and $0$ otherwise.
We denote $\Omega_Y$ as the domain of $Y$, $\mathbb{E}[Y]$ as the expectation of $Y$, and $\mathbb{P}(Y < y)$ as the cumulative distribution function (CDF). % of the continuous variable $Y$. 
Additionally, we denote the $m$-fold Cartesian product of a domain $\Omega$ with itself as $\Omega^m$, i.e., $\Omega^m = \Omega \times \Omega \times \dots \times \Omega$ (repeated $m$ times).



{\bf Moments of random variables.}
Moments are measures related to the shape of the probability distribution of a random variable. 
For each $m \geq 1$,
the $m$-th moment of a random variable $Y$ is defined by the expectation of the $m$-th power of $Y$, i.e., 
$\mathbb{E}[Y^m]$, and the $m$-th central moment of $Y$ is defined by
$C_m\defeq\mathbb{E}[(Y-\mathbb{E}[Y])^m]$.
Common statistics involving moments include mean $\mathbb{E}[Y]$, 
variance $C_2$, standard deviation $\displaystyle\sqrt{C_2}$, skewness $\displaystyle C_3/C_2^{3/2}$, and kurtosis $C_4/C_2^2$. 
%variance $\mathbb{E}[(Y-\mathbb{E}[Y])^2]$, standard deviation $\displaystyle\sqrt{\mathbb{E}[(Y-\mathbb{E}[Y])^2]}$, skewness $\displaystyle \frac{\mathbb{E}\left[(Y-\mathbb{E}[Y])^3\right]}{\mathbb{E}\left[(Y-\mathbb{E}[Y])^2\right]^{3/2}}$, and kurtosis $\displaystyle \frac{\mathbb{E}\left[(Y-\mathbb{E}[Y])^4\right]}{\mathbb{E}\left[(Y-\mathbb{E}[Y])^2\right]^{2}}$ 
%\citep{Pearson1896,Joanes1998,Doane2011,Hippel2005,Westfall2014}.
They are essential statistics to capture the shape of the probability distribution of $Y$.
The product moment of the random variables $X$ and $Y$ is given by $\mathbb{E}[XY]$, and their covariance is $\mathbb{E}[(X-\mathbb{E}[X])(Y-\mathbb{E}[Y])]$, which is used to define the Pearson correlation coefficient $\frac{\mathbb{E}[(X-\mathbb{E}[X])(Y-\mathbb{E}[Y])]}{\sqrt{\mathbb{E}[(X-\mathbb{E}[X])^2]}\sqrt{\mathbb{E}[(Y-\mathbb{E}[Y])^2]}}$.  %\citep{Pearson1905}.
%\yuta{The product moment of causal effects reveals the association between the two random variables. When $\mathbb{E}[XY]$ is positive, subjects with larger $X$ tend to have larger $Y$. When $\mathbb{E}[XY]$ is negative, subjects with larger $X$ tend to have smaller $Y$. When $\mathbb{E}[XY]$ is zero, there is no linear relationship between $X$ and $Y$.}







{\bf Structural causal models.} 
We use the language of Structural Causal Models (SCM) as our basic semantic and inferential framework \citep{Pearl09}.
An SCM ${\cal M}$ is a tuple $\left<{\boldsymbol U},{\boldsymbol V}, {\cal F}, \mathbb{P}_{\boldsymbol U} \right>$, where ${\boldsymbol U}$ is a set of exogenous (unobserved) variables following a joint distribution $\mathbb{P}_{\boldsymbol U}$, and ${\boldsymbol V}$ is a set of endogenous (observable) variables whose values are determined by structural functions ${\cal F}=\{f_{V_i}\}_{V_i \in {\boldsymbol V}}$ such that $v_i:= f_{V_i}({\mathbf{pa}}_{V_i},{\boldsymbol u}^{V_i})$, where ${\mathbf{PA}}_{V_i} \subseteq {\boldsymbol V}$ and ${\boldsymbol U}^{V_i} \subseteq {\boldsymbol U}$. 
Each SCM ${\cal M}$ induces an observational distribution $\mathbb{P}_{\boldsymbol V}$ over ${\boldsymbol V}$. % and a causal graph $G({\cal M})$ over ${\boldsymbol V}$ in which there exists a directed edge from every variable in ${\mathbf{PA}}_{V_i}$ to $V_i$.
An atomic intervention of setting a set of endogenous variables ${\boldsymbol X}$ to constants ${\boldsymbol x}$, denoted by $do({\boldsymbol x})$, replaces the original equations of ${\boldsymbol X}$ by ${\boldsymbol X} :={\boldsymbol x}$ and induces a \textit{sub-model}  ${\cal M}_{\boldsymbol x}$.
We denote the potential outcome $Y$ under intervention $do({x})$ by $Y_{{x}}({\boldsymbol u})$, which is the solution of $Y$  in the sub-model ${\cal M}_{x}$ given ${\boldsymbol U}={\boldsymbol u}$. 


{\bf Causal effects.}
Researchers usually consider the following SCM, denoted as ${\cal M}$:
\begin{equation}
\begin{gathered}
Y:=f_Y(X,U^Y),\ \  X:=f_X(U^X),
\end{gathered}
\end{equation}
where $U^Y$ and $U^X$ are latent exogenous variables.
%$Y$ is a continuous variable and $X$ is a binary, discrete, or continuous variable.
The individual causal effect (ICE) is defined as $\text{\normalfont ICE}({\boldsymbol u})\defeq Y_1({\boldsymbol u})-Y_0({\boldsymbol u})$.
%\end{definition}
%ICE cannot be directly observed due to the fundamental problem of causal inference \citep{Holland1986, Hernan2024}.
%\begin{definition}[ACE]
The average causal effect (ACE) is defined as $\text{\normalfont ACE}\defeq\mathbb{E}[Y_1-Y_0]$. 
%\jina{ \citet{Ju2010} defined the distibutional causal effect (DCE) as $\text{\normalfont DCE}(y)\defeq\mathbb{P}(Y_0<y)-\mathbb{P}(Y_1<y)$.} \jin{How is this work related to the moments of $Y_1-Y_0$?}
\citet{Heckman1997} showed the identification of ICE under the rank invariance assumption stating that 
%\yuta{``$\mathbb{P}(Y_0<y_0)=\mathbb{P}(Y_1<y_1)$ holds  for almost every subject whose potential outcomes are $(Y_1(u),Y_0(u))=(y_1,y_0)$"}
%\jin{What does "$\mathbb{P}(Y_0<y_0)=\mathbb{P}(Y_1<y_1)$ holds  for almost every subject" mean? It's independent of $u$. Do you mean "it holds for any $y_0, y_1$?}
%\yuta{[Not ``for any $y_0, y_1$", the values of $(y_0, y_1)$ are restricted to those taken by $(Y_1(u),Y_0(u))$ for all $u \in \Omega_U$.]}, 
{``for almost every subject whose potential outcomes are $(Y_1,Y_0)=(y_1,y_0)$, $\mathbb{P}(Y_0<y_0)=\mathbb{P}(Y_1<y_1)$ holds'',} 
which is a strong assumption. %in the case of a continuous outcome. 
They identified the variance of causal effects by identifying ICE. %Their results are restricted to the case of continuous outcomes.
%For a continuous outcome, several studies \citep{DiNardo1996,Kennedy2023d} aim to estimate the PDF of $Y_x$, i.e., $\mathfrak{p}_{Y_x}$. However, identifying the moments of causal effects requires the joint PDF of $(Y_0, Y_1)$, i.e., $\mathfrak{p}_{(Y_0,Y_1)}$. $\mu^{(m)}$ is given by $\int_{\Omega_Y}(y_1-y_0)^m\mathfrak{p}_{(Y_0,Y_1)}(y_0,y_1)dy_0dy_1$.
\citet{Hoshino2020} studied the identification of the joint PDF of $(Y_0,Y_1)$ under various parametric specifications, whereas this work considers a nonparametric setting. 


{\bf Joint distribution of potential outcomes.}
Joint distributions of potential outcomes, fundamental to this work, have been employed in the framework of probabilities of causation (PoC) \citep{Pearl1999,Tian2000,Li2024}. %particularly in relation to the probability of necessity and sufficiency (PNS).
PoC are a family of probabilities quantifying whether one event was the real cause of another. % in a given scenario.
Recently, \citet{Kawakami2024} defined the probability of necessity and sufficiency (PNS) for continuous treatment and outcome as $\mathbb{P}(Y_{x_0}< y \leq Y_{x_1})$, and {showed that it is identified from $\mathbb{P}(X,Y)$ if there are no unmeasured confounders and the function $f_Y(x,U^Y)$ satisfies a monotonicity assumption:}  
%They also established the identification assumptions for the probability of necessity and sufficiency (PNS) in the context of continuous treatment and outcome, without incorporating subjects’ covariates, as outlined below:
\begin{assumption}[Exogeneity]
\label{ASEXO2}
$Y_x\indep X$ for all $x \in \Omega_X$.
\end{assumption}
%If $X$ is binary, i.e., $\Omega_X=\{0,1\}$, Assumption \ref{ASEXO2} is simply $Y_1 \indep X$ and $Y_0 \indep X$.
%\begin{assumption}[Monotonicity over $Y_{x}$]
%\label{MONO2}
%The potential outcomes $Y_{x}$ satisfy:  for any $x_0,x_1 \in \Omega_X$ and $y \in \Omega_Y$, either $\mathbb{P}(Y_{x_0}< y \leq Y_{x_1})=0$ or $\mathbb{P}(Y_{x_1}< y \leq Y_{x_0})=0$.
%\end{assumption}
\begin{assumption}[Monotonicity over $f_Y$]
\label{MONO2}
{The function $f_Y(x,U^Y)$ is either (i) monotonically increasing in $U^Y$
%with $\leq$
for all $x \in \Omega_X$ almost surely w.r.t.\ $\mathbb{P}_{U^Y}$, or (ii) monotonically decreasing in $U^Y$
%with $\leq$ 
for all $x \in \Omega_X$
almost surely w.r.t.\ $\mathbb{P}_{U^Y}$.} 
\end{assumption}
%\yuta{This assumption entails certain restrictions.
%\begin{assumption}[Homogeneity]
%\label{homo}
%${U^Y}_{1}={U^Y}_{0}(=U^Y)$ hold.
%\end{assumption}
%Homogeneity means that the exogenous variables $U^Y$ influencing the outcome remain invariant under interventions on $X$ \citep{Wu2025}.}
{We will use the above assumptions to  identify moments of causal effects in this paper.}




%\begin{assumption}
%\label{SUP2}
%Potential outcome $Y_x$ has conditional PDF $p_{Y_x|W=w}$ given $W=w$ for each $x \in \Omega_X$ and $w \in \Omega_W$, and its support $\{y \in \Omega_Y: p_{Y_x|W=w}(y) \ne0 \}$ is the same
%for each $x \in \Omega_X$ and $w \in \Omega_W$.
%\end{assumption}
%\citet{Kawakami2024} show Assumptions \ref{MONO2} or \ref{AS2} are equivalent under Assumption \ref{SUP2}.
%\citet{Kawakami2024} provided another Assumption (monotonicity over $Y_x$), which is equivalent to Assumption \ref{MONO2} under Assumption 3.6 in \citep{Kawakami2024}.
%Under SCM ${\cal M}$, and Assumptions \ref{ASEXO2}  and \ref{MONO2}, the joint distribution of potential outcomes $\mathbb{P}(Y_{x_0}< y \leq Y_{x_1})$ and $\mathbb{P}(Y_{x_1}< y \leq Y_{x_0})$ are identifiable by
%$\max\{\mathbb{P}(Y<y|X=x_0)-\mathbb{P}(Y<y|X=x_1),0\}$ and $\max\{\mathbb{P}(Y<y|X=x_1)-\mathbb{P}(Y<y|X=x_0),0\}$, respectively, for any $w \in \Omega_W$ and $y \in \Omega_Y$.
