\label{sec3}
For ease of understanding, we will start with a single treatment variable $X$ and a single outcome $Y$. 
We extend the binary PoC to continuous variables, extend the monotonicity Assumption~\ref{BMONO} to continuous settings, and provide an identification theorem.

%In this section, we set up the problem of PoC with scalar $Y$ and $X$ using a scalar structural causal model and give an identification theorem of PNS, PN and PS.

%\subsection{Problem Setup}
\subsection{PoC definition}

%{\bf Scalar Structural Causal Model.}


%\jin{I wonder whether it's necessary to define such an SCM. It implies strong Exogeneity, maybe you just need weak Exogeneity? The original PNS is identified either under strong Exogeneity or weak Exogeneity + Monotonicity. Is the newly defined PNS identifiable under strong Exogeneity alone? }

Let $X$ be a continuous or discrete treatment variable, and $Y$ be a continuous or discrete outcome variable.
We assume the following SCM ${\cal M}_{S}$: 
\begin{equation}
    Y:=f_Y(X,U),\  X:=f_X(\epsilon_X),
    %\ \text{where}\ \epsilon_X \indep U.
\end{equation}
%The functions $f_Y$ and $f_X$ are  real-valued functions. Potential outcome $Y_x(U)$ is defined by $f_Y(x,U)$ for any $x \in \Omega_X$.
where $U$ and $\epsilon_X$ are latent exogenous variables. %, and $\epsilon_X$ is irrelevant to potential outcome $Y_x(U)$.
%We name ${\cal M}_{S}$ scalar SCM.
%$U$ satisfies the following condition:
%\begin{assumption}[Absolute Completeness]
%\label{TOT}
%The probability distribution of $U$ is absolutely continuous.
    %$\Omega_U$ is totally ordered set, and 
%    $\sup_{u \in \Omega_U}\mathfrak{p}(u)<0$.
%\end{assumption}
%Assumption \ref{TOT} means CDF of $U$ is continuous and guarantees the existence of PDF of $U$.
%Assumption \ref{TOT} also means the probability of two subject have the same value of $U$ is zero, i.e., $\mathbb{P}(u_0=u_1)=0$ for any $u_0, u_1 \in \Omega_U$.
%$\mathfrak{p}(u)$ means the probability density function of $U$.
%\citet{Heckman1999,Heckman2005} also make this assumption, and consider $U$ is a uniform distribution on $[0,1]$.
%
%The domains of $X,Y,U$, $\Omega_X, \Omega_Y, \Omega_U$, are subsets of $\mathbb{N}, \mathbb{Z}$ or $\mathbb{R}$.

%Under SCM ${\cal M}_S$, 
%the potential outcome $Y_x$ is given as $f_{Y}(x,U)$, and 
%we assume
%\begin{assumption}[Exogeneity]
%\label{EXO}
%    $X$ is independent of $U$.
%\end{assumption}

We make the following assumption.
\begin{assumption}[Exogeneity]
\label{ASEXO}
%   Under SCM ${\cal M}_S$, 
%$X$ is independent of $U$.
$Y_x\indep X$ for all $x \in \Omega_X$.
\end{assumption}
We note that exogeneity holds if $\epsilon_X \indep U$, and that randomized controlled trials (RCTs) on $X$ ensure exogeneity. Exogeneity implies $\mathbb{P}(Y_x < y)=\mathbb{P}(Y < y|X=x)$. 

%\yuta{[Comment: I fixed the assumption slightly.]}

We define PoC for continuous or discrete $X$ and $Y$ as a generalization of Definition \ref{def1}.
\begin{definition}[Probabilities of causation]
\label{def2}
For any $x_0,x_1 \in \Omega_X$ and $y \in \Omega_Y$, we define three types of PoC  as below:\footnote{We can equally define PNS as $\text{PNS}(y;x_0,x_1)\defeq\mathbb{P}(Y_{x_0} \leq y < Y_{x_1})$. We will stay with Definition~\ref{def2} in this paper.   
%However, we only discuss the definition of $\text{PNS}(y;x_0,x_1)\defeq\mathbb{P}(Y_{x_0} < y \leq Y_{x_1})$ in this paper. \yuta{I will change the definition $\text{PNS}(y;x_0,x_1)\defeq\mathbb{P}(Y_{x_0} < y \leq Y_{x_1})$ to $\text{PNS}(y;x_0,x_1)\defeq\mathbb{P}(Y_{x_0} < y \leq Y_{x_1}).$}}
}
\begin{equation}
\begin{aligned}
    &\text{PNS}(y;x_0,x_1)\defeq\mathbb{P}(Y_{x_0} < y \leq Y_{x_1}),\\
    &\text{PN}(y;x_0,x_1)\defeq\mathbb{P}(Y_{x_0} < y |y \leq Y, X=x_1),\\
    &\text{PS}(y;x_0,x_1)\defeq\mathbb{P}(y \leq Y_{x_1} |Y < y,X=x_0).
\end{aligned}
\end{equation}
\end{definition}
%\begin{comment}
%\jin{Can PNS be expressed in terms of PN and PS as in the binary case?} 
PNS, PN, and PS are connected  in the special case of binary $X$: 
\begin{lemma} 
\label{LEM41}
If $X$ is binary, we have
\begin{equation}
\begin{aligned}
 \text{PNS}(y;x_0,x_1)
    &=\text{PN}(y;x_0,x_1)\mathbb{P}(y \leq Y, X=x_1)\\
    &\ \ +\text{PS}(y;x_0,x_1)\mathbb{P}(Y < y,X=x_0).
    \end{aligned}
\end{equation}
\end{lemma}

%\end{comment}
\paragraph{Remark on the connection of Def. \ref{def2} with the binary PoC in Def. \ref{def1}:} 
If $Y$ is binary with values $y_0 < y_1$, then Def.~\ref{def2} with $y=y_1$ reduces to Def.~\ref{def1}. 
In general, the value of $y$ in Def.~\ref{def2} can be interpreted as an outcome threshold, such as the  passing score for a test or a diagnostic threshold for blood pressure or blood glucose levels. 
%\citet{Hannart2018} determined the value of $y$ by maximizing $\text{PNS}(y;x_0,x_1)$, i.e., $y^*= \argmax_{y \in \Omega_Y} \text{PNS}(y;x_0,x_1)$.
Def. \ref{def2} focuses on the necessity/sufficiency of treatment $x_1$ w.r.t. $x_0$ to produce the event $Y\geq y$. 
We may introduce a binary outcome variable $O=\mathbb{I}(Y \geq y)$. Then PNS$(y;x_0,x_1) = \mathbb{P}(O_{x_0}=0,O_{x_1}=1)$, PN$(y;x_0,x_1) = \mathbb{P}(O_{x_0}=0|O=1, X=x_1)$, and PS$(y;x_0,x_1) = \mathbb{P}(O_{x_1}=1| O=0, X=x_0)$. Therefore, Def. \ref{def2} reduces to the standard definition of binary PoC over $X$ and $O$. We note that this interpretation of PNS matches the use of PNS in \citep{Hannart2018}.

{Although Def.~\ref{def2} can be interpreted in terms of a binarized outcome $O=\mathbb{I}(Y \geq y)$, it is more natural and consistent to have a formulation in terms of the original variable $Y$ rather than in terms of $O$. A major benefit of the proposed formulation is that it can be naturally extended to study more complex variants of PoC in Section~\ref{sec-variant} that are difficult to formulate in terms of a binarized outcome.}


%The definition of PNS in Def. \ref{def2} appeared in \citep{Hannart2018}. \jin{It doesn't look to me that \citep{Hannart2018} gave the same definition. They essentially introduced a binary variable by defining an event as $E= Y\geq y$. Could you use this idea to provide a justification/interpretation to your definition?}
%\yuta{They define PNS $\mathbb{P}(E_{x_0}=0,E_{x_1}=1)$ through binarized outcome $E=\mathbb{I}(Y \geq y)$, and $E_{x_1}$ and $E_{x_0}$ mean $\mathbb{I}(Y_{x_1} \geq y)$ and $\mathbb{I}(Y_{x_0} \geq y)$. Thus, $\mathbb{P}(E_{x_0}=0,E_{x_1}=1)=\mathbb{P}(\mathbb{I}(Y_{x_0} \geq y)=0,\mathbb{I}(Y_{x_1} \geq y)=1)=\mathbb{P}(\mathbb{I}(Y_{x_0} < y)=1,\mathbb{I}(Y_{x_1} \geq y)=1)=\mathbb{P}(Y_{x_1} \geq y,Y_{x_0} < y)=\mathbb{P}(Y_{x_0} < y\leq Y_{x_1})$.}




When $X$ and $Y$ are discrete variables taking values $\{x_1,\ldots,x_P\}$ and $\{y_1,\ldots,y_Q\}$,
\citet{Li2022} defined PNS by $\mathbb{P}(Y_{x_{i_1}}=y_{j_1},Y_{x_{i_2}}=y_{j_2})$ ($1\leq i_1, i_2 \leq P$, $1\leq j_1, j_2 \leq Q$, $i_1 \ne i_2$ and $j_1 \ne j_2$).
However, their definition is  %always takes $0$ if $Y$ is a continuous variable with bounded PDF and 
not suitable for a continuous outcome $Y$.

%\jin{Is this PoC definition totally novel? Does there exist prior work on extending PoC definition to non-binary discrete or continuous $Y$? These prior works must be discussed. From what is written in the above paragraph, it sounds like this PoC definition is based on some definition from \citet{Hannart2018}. Please clarify the connection of this PoC definition with prior works (e.g. \citep{Li2022}?). If it's largely novel, please justify why this is a reasonable definition and good extension of the binary case. }

%\jin{You can't just throw out a bunch of arbitrary mathematical definitions and call them PoC. People have some intuition about what these concepts mean. Justify why this is a reasonable definition matching people's intuition. Discuss what they mean. What information does PNS$(y;x_0,x_1)$ capture? Why are these definitions  useful in practice? Maybe show an example of how to use these definitions.}

\begin{comment}
    
\yuta{$\text{PNS}(y;x_0,x_1)$ stands for the probability that $Y\geq y$ would happen had he taken $x_1$ and $Y\geq y$ would not happen had he taken $x_0$, and the measure of the sufficiency and necessity of $x_1$ w.r.t. $x_0$ to produce $Y\geq y$.
$\text{PN}(y;x_0,x_1)$ stands for the probability that $Y\geq y$ would not happen had he taken $x_0$ when he takes $x_1$ and  $Y\geq y$ happen, and the measure of the necessity of $x_1$ w.r.t. $x_0$ to produce $Y\geq y$.
$\text{PS}(y;x_0,x_1)$ stands for the probability that $Y\geq y$ would happen had he taken $x_1$ when he takes $x_0$ and $Y\geq y$ does not happen, and the measure of the sufficiency of $x_1$ w.r.t. $x_0$ to produce $Y\geq y$.}

\yuta{For the example of the blood sugar level and the amount of insulin in the previous section, $\text{PNS}(y;x_0,x_1)=\mathbb{P}(Y_{x_0} > y \geq Y_{x_1})$, which is the other definition, means the probability of (i) one would be over the upper bound $y_1$ had he taken $x_0$ and (ii) would not be over the upper bound $y_1$ had he taken $x_1$.
$\text{PN}(y;x_0,x_1)=\mathbb{P}(Y_{x_0} > y |X=x_1,y \geq Y)$ means the probability of one would be over the upper bound $y_1$ had he taken $x_0$ when he is over the upper bound $y_1$ and takes $x_1$.
$\text{PS}(y;x_0,x_1)=\mathbb{P}(y \geq Y_{x_1} |X=x_0,Y > y)$ means the probability of one would not be over the upper bound $y_1$ had he taken $x_1$ when he is over the upper bound $y_1$ and takes $x_0$.
}
\jin{Why are talking about the other definition???}
\end{comment}

\begin{example} \label{ex-1}
Consider the dose-response relationship between the blood sugar level ($Y$) and the amount of insulin ($X$). 
Let $y$ be a blood sugar threshold, and $x_0,x_1$ be two insulin amounts ($x_0>x_1$). 
A doctor may want to know the probability (PNS) that the patient's  blood sugar level would be greater than or equal to the  threshold $y$ had they taken $x_1$ amount of insulin, and would be less than $y$ had they taken $x_0$ insulin.
PN represents the probability that the patient's  blood sugar level would be less than $y$ had they taken $x_0$ insulin when the patient took $x_1$ insulin with sugar level greater than or equal to $y$.
PS represents the probability that the patient's  blood sugar level would be greater than or equal to $y$ had they taken $x_1$ insulin when the patient took $x_0$ insulin with sugar level less than $y$.
\end{example}

\subsection{Identification Assumptions}

\citet{Tian2000} used the monotonicity Assumption~\ref{BMONO} for binary treatment and outcome to identify binary PoC. Here we generalize this assumption to continuous and discrete cases, and discuss connections with several commonly used assumptions in the literature.

%Next, we explain four nonparametric assumptions for identifying three types of PoC.

%($\mathrm{\bf I}$). {\bf Monotonicity w.r.t.  $X$.}
($\mathrm{\bf I}$). {\bf Monotonicity over  $Y_x$.}
%First, monotonicity on $X$ for a binary treatment, i.e., %$\mathbb{P}(Y_{x_0}=0, Y_{x_1}=1)=0$  or 
%$\mathbb{P}(Y_{x_0}=1, Y_{x_1}=0)=0$,
%$\mathbb{P}(Y_{x_0}\leq Y_{x_1})=0$  or $\mathbb{P}(Y_{x_0}\geq Y_{x_1})=0$,
% have  appeared in the studies \citep{Balke1997,Tian2000,Imbens1994,Angrist1996}.
We first propose the following assumption:
\begin{assumption}[Strong Monotonicity over $Y_x$]
\label{mono-S}
    The potential outcomes $Y_x$ satisfy, for any $x_0,x_1 \in \Omega_X$, either ``$Y_{x_0}(u)\leq Y_{x_1}(u)$ for $\mathbb{P}_U$-almost every $u \in \Omega_U$'' or ``$Y_{x_1}(u)\leq Y_{x_0}(u)$ for $\mathbb{P}_U$-almost every $u \in \Omega_U$''.
\end{assumption}
Note that we allow both monotonic increasing and decreasing cases. It turns out that the PoC in Def.~\ref{def2} can be identified under a weaker assumption:\footnote{In the assumption below, which of the two probabilities vanishes may depend on $(x_0,x_1,y)$; that is, the assumption is not ``$\mathbb{P}(Y_{x_0}< y \leq Y_{x_1})=0$ for any $x_0,x_1 \in \Omega_X$ and $y \in \Omega_Y$'' or ``$\mathbb{P}(Y_{x_1}< y \leq Y_{x_0})=0$ for any $x_0,x_1 \in \Omega_X$ and $y \in \Omega_Y$.''
}
\begin{assumption}[Monotonicity over $Y_x$]
\label{MONO_A}
    The potential outcomes $Y_x$ satisfy,  for any $x_0,x_1 \in \Omega_X$ and $y \in \Omega_Y$,  ``either $\mathbb{P}(Y_{x_0}< y \leq Y_{x_1})=0$ or $\mathbb{P}(Y_{x_1}< y \leq Y_{x_0})=0$''.
%    \begin{equation}
%     \text{either } \  \mathbb{P}(Y_{x_0}< y \leq Y_{x_1})=0\text{ or }\mathbb{P}(Y_{x_1}< y \leq Y_{x_0})=0
%    \end{equation}   
\end{assumption}
Introducing a binarized outcome $O=\mathbb{I}(Y \geq y)$, Assumption~\ref{MONO_A} becomes ``$\mathbb{P}(O_{x_0}=0,O_{x_1}=1)=0$ or $\mathbb{P}(O_{x_0}=1,O_{x_1}=0)=0$''. Assumption~\ref{MONO_A} is weaker than Assumption~\ref{mono-S} since $\mathbb{P}(Y_{x_0}< Y_{x_1})=0$ implies $\mathbb{P}(Y_{x_0}< y \leq Y_{x_1})=0$ but not vice versa.
%\yuta{$\mathbb{P}(Y_{x_0}< y \leq Y_{x_1})=0$ or $\mathbb{P}(Y_{x_1}< y \leq Y_{x_0})=0$ can be represented as $\mathbb{P}(O_{x_0}=0,O_{x_1}=1)=0$ or $\mathbb{P}(O_{x_0}=1,O_{x_1}=0)=0$ through binarized outcome $O=\mathbb{I}(Y \geq y)$ for any $y \in \Omega_Y$.}
%\jin{Why need $y$ here? Why not just require $\mathbb{P}(Y_{x_0}\geq Y_{x_1})=0$? You can't just throw out an arbitrary mathematical definition and call it Monotonicity w.r.t $X$. In what sense does this condition stand for monotonicity? Use defining an event as $E= Y\geq y$ interpretation? People have some intuition about what monotonicity mean. Justify why this is a reasonable definition matching people's intuition.}

\begin{comment}
\yuta{
Another monotonicity assumption
\begin{assumption}[Another Monotonicity on $X$]
    The potential outcomes $Y_x$ satisfy,  for any $x_0,x_1 \in \Omega_X$,  
    \begin{equation}
     \text{either } \  \mathbb{P}(Y_{x_0}< Y_{x_1})=0\text{ or }\mathbb{P}(Y_{x_1}< Y_{x_0})=0
    \end{equation}   
\end{assumption}
is more strict assumption than Assumption \ref{MONO_A}.
We have $\mathbb{P}(Y_{x_0}< y \leq Y_{x_1})=\mathbb{P}(Y_{x_0}< Y_{x_1},Y_{x_0}< y \leq Y_{x_1})$ since $Y_{x_0}< y \leq Y_{x_1} \Rightarrow Y_{x_0}< Y_{x_1}$.
Then, $\mathbb{P}(Y_{x_0}< Y_{x_1})=0$ for any $x_0,x_1 \in \Omega_X$ implies $\mathbb{P}(Y_{x_0}< y \leq Y_{x_1})=0$ for any $x_0,x_1 \in \Omega_X$ and $y \in \Omega_Y$.}

\yuta{Assumption 4.3. can be re-written as follows: 
     The potential outcomes $Y_x$ satisfy,  for any $x_0,x_1 \in \Omega_X$,  either "$Y_{x_0}(u)\leq Y_{x_1}(u)$ for almost every $u \in \Omega_U$" or "$Y_{x_1}(u)\leq Y_{x_0}(u)$ for almost every $u \in \Omega_U$".}
\end{comment}
     




\begin{comment}
We can also define it as below.
\begin{assumption}[Monotonicity on potential outcomes]
\label{MONO}
    The potential outcomes satisfy either $\mathbb{P}(Y_{x_0}< Y_{x_1})=0$ or $\mathbb{P}(Y_{x_1}<  Y_{x_0})=0$ for any $x_0,x_1 \in \Omega_X$.
\end{assumption}
We have the following corollary.
\begin{corollary}
\label{COR1}
    Under SCM ${\cal M}_{S}$, Assumption \ref{MONO_A} is equivalent to Assumption \ref{MONO}.
\end{corollary}
Thus, we consider only Assumption \ref{MONO_A} in this paper.
\end{comment}

%These assumptions come up with from the relationship $\mathbb{P}(Y_{x_0}< y \leq Y_{x_1})=\mathbb{P}(Y_{x_0}\leq y )-\mathbb{P}(Y_{x_1}\leq y)+\mathbb{P}(Y_{x_1}\leq y <  Y_{x_0})=\mathbb{P}(Y\leq y|X=x_0)-\mathbb{P}(Y\leq y|X=x_1)+\mathbb{P}(Y_{x_1}\leq y <  Y_{x_0})$.
%Supposed one of the $\mathbb{P}(Y_{x_0}< y \leq Y_{x_1})$ or $\mathbb{P}(Y_{x_1}< y \leq Y_{x_0})$ is equal to $0$, then the other is identifiable from observed distribution. 

Next, we discuss several related assumptions used in the literature for various identification purposes.

($\mathrm{\bf I}\hspace{-1.2pt}\mathrm{\bf I}$). 
%{\bf Monotonicity w.r.t. $U$.}
{\bf Monotonicity over $f_Y$.}
Monotonicity on $U$ over the structural function $f_Y(x,U)$ has appeared in the instrumental variable (IV) literature
\citep[e.g.,][]{Vytlacil2002,Heckman1999,Heckman2005}.
%\yuta{We denote the structural function from $X$ to $Y$ by $f_Y(X,U)$.}
%They defined monotonicity on $U$ as below.
\begin{assumption}[Monotonicity over $f_Y$]
\label{AS1}
The function $f_Y(x,U)$ is either monotonic increasing on $U$ for all $x \in \Omega_X$ or monotonic decreasing on $U$ for all $x \in \Omega_X$ almost surely w.r.t. $\mathbb{P}_U$.
%The function $f_Y(x,U)$ is either monotonic increasing or monotonic decreasing on $U$ for all $x \in \Omega_X$ almost surely w.r.t. $\mathbb{P}_U$. 
%\jin{Do you mean: The function $f_Y(x,U)$ is either monotonic increasing on $U$ for all $x \in \Omega_X$ or monotonic decreasing on $U$ for all $x \in \Omega_X$? Otherwise, your proof argument doesn't look right. }
\end{assumption}
%\yuta{"Almost surely w.r.t. $\mathbb{P}_U$" means the probability of the violation of the above conditions is $0$ with respect to $\mathbb{P}_U$.}
%\jin{Does this mean $f_Y(x,U)$ may not be monotonic over some measure zero number of U values? } 
%\yuta{Yes, I do not requires $f_Y(x,U)$ is monotonic for all U values.}
%
%This assumption implies that two trajectories of potential outcome, i.e., $\{(x,Y_x(u_0)) \in \Omega_X \times \Omega_Y;\forall x \in \Omega_X\}$ and $\{(x,Y_x(u_1))\in \Omega_X \times \Omega_Y;\forall x \in \Omega_X\}$, do not cross over each other for $\mathbb{P}_U$-almost every $u_0, u_1 \in \Omega_U$.
%
For example, \citet{Heckman2005} introduced the latent index model  $Y:=\mathbb{I}[f_Y(X) \geq U]$ for a binary outcome,  
which satisfies the above assumption.



%\jin{I found the names of 'Monotonicity on Potential Outcomes' and  'monotonicity on SCM' misleading, because (I) is about monotonicity with respect to $X$ while (II) is about monotonicity with respect to $U$. Both kinds of monotonicity could be described in terms of potential outcomes or SCMs. In other words, the difference between these two kinds of monotonicity is NOT about potential outcomes or SCMs.   }


($\mathrm{\bf I}\hspace{-1.2pt}\mathrm{\bf I}\hspace{-1.2pt}\mathrm{\bf I}$). 
{\bf Strict monotonicity over $f_Y$.}
%Third, strict monotonicity on $U$ or structural function also have been appeared in the IV literature, such as
The following stronger monotonicity assumption has also been used 
\citep{Chesher2003,Chernozhukov2005,Chernozhukov2007,Imbens2009}.
%They defined strict monotonicity on $U$ as below.
\begin{assumption}[Strict monotonicity over $f_Y$]
\label{SAS1}
The function $f_Y(x,U)$ is either strictly monotonic increasing on $U$ for all $x \in \Omega_X$ or strictly monotonic decreasing on $U$ for all $x \in \Omega_X$ almost surely w.r.t. $\mathbb{P}_U$ with $\sup_{u \in \Omega_U}\mathfrak{p}(u)<\infty$. 
\end{assumption}
The condition $\sup_{u \in \Omega_U}\mathfrak{p}(u)<\infty$, where $\mathfrak{p}$ denotes the PDF of $U$, means that $U$ has a continuous distribution. %\jin{Do you mean "$U$ must be continuous"? What if $U$ is discrete?} \yuta{If $U$ is the discrete variable, PDF of $U$ can be represented by delta functions, and take $\infty$ for some $u \in \Omega_U$.} \jin{If $U$ is discrete, then the above Assumption can not be defined? Why? } \yuta{If $U$ is discrete, we use Assumption \ref{AS1}.}
%This assumption is stronger than monotonicity on $U$ because strong monotonicity implies monotonicity, but not the other way around.
For example, the widely used additive noise model $Y=f_Y(X)+U$ \citep{Whitney2003,Singh2019,Hartford2017,Xu2021,Kawakami2023} satisfies this assumption.
%We can easily calculate the value of $U$ by $U=f_Y(X)-Y$ under additive noise model.\\
%Under Assumption \ref{SAS1} with monotonic increasing, $U \leq u$ is equal to $Y_x=f_Y(x,U)\leq f_Y(x,u)=y$ as Eq. (2.7) in \citep{Chernozhukov2005}, and we can observe the CDF of $U$ through the conditional CDF of $Y$ given $X=x$.\\


\begin{comment}
($\mathrm{\bf I}\hspace{-1.2pt}\mathrm{\bf V}$). 
{\bf Rank Preservation.}
\citet[p.186]{Hernan2023} stated:  
%\vspace{-0.2cm}
%\begin{center}
    {\it ``Suppose for a second that we could actually rank everybody according to $Y^{a=0}$ and also according to $Y^{a=1}$. We would then have two lists of individuals ordered from larger to smaller value of the corresponding counterfactual outcome. If both lists are in identical order we say that there is rank preservation.''}
%\end{center} 
%\vspace{-0.2cm}
Note that $Y^{a=0}$ and $Y^{a=1}$ stands for $Y_{x_0}$ and $Y_{x_1}$ in our notation.
%Formally, for a binary treatment, \emph{rank preservation assumption} means that the rank of the $Y_{x_0}$ is preserved to the rank of the $Y_{x_1}$, i.e., either ``$Y_{x_0}(u_0) < Y_{x_0}(u_1)$ and $Y_{x_1}(u_0) < Y_{x_1}(u_1)$'' or ``$Y_{x_0}(u_0) > Y_{x_0}(u_1)$ and $Y_{x_1}(u_0) > Y_{x_1}(u_1)$'' holds for $\mathbb{P}_U$-almost every $u_0,u_1 \in \Omega_U$ ($u_0 \ne u_1$) \citep{Robins1989,Robins1991,Have2007,Vansteelandt2014,Bothmann2023,Hernan2023}.
We %make this assumption formal and 
generalize this assumption  to continuous and discrete cases as follows:
\begin{assumption}[Rank preservation]
\label{RP1}
The potential outcomes $Y_x(u)$ satisfy (i) the uniqueness of ranking, i.e., $Y_{x}(u_0) \ne Y_{x}(u_1)$ for all $x \in \Omega_X$ and $\mathbb{P}_U$-almost  every $u_0,u_1 \in \Omega_U$ such that $u_0 \ne u_1$, and (ii)  the rank preservation, i.e., both
\begin{equation}
\begin{aligned}
\label{RPE1}
&Y_{x_0}(u_0) < Y_{x_0}(u_1) \Rightarrow  Y_{x_1}(u_0) < Y_{x_1}(u_1),\\
&Y_{x_0}(u_0) > Y_{x_0}(u_1) \Rightarrow  Y_{x_1}(u_0) > Y_{x_1}(u_1) 
\end{aligned}
\end{equation}
hold for all $x_0,x_1 \in \Omega_X$ and $\mathbb{P}_U$-almost {surely for} every $u_0,u_1 \in \Omega_U$ such that $u_0 \ne u_1$. 
\end{assumption}
%\yuta{"$\mathbb{P}_U$-almost every $u_0,u_1 \in \Omega_U$" means the probability of the violation of the above conditions is $0$ with respect to $\mathbb{P}_U$.} 
%The first assumption guarantees the uniqueness of ranking, and the second assumption guarantees the preservation of ranking.
%Sometimes, researchers consider the non-unique ranking; however, we do not consider it following \citep[p.186]{Hernan2023}.
We  follow \citep[p.186]{Hernan2023} and do not consider non-unique ranking settings.
%This assumption is stronger than \ref{AS1} since, if $f_Y$ satisfies $f_Y(x_0,u_0)<f_Y(x_0,u_1)$ and $f_Y(x_1,u_0)=f_Y(x_1,u_1)$ for some $u_0 < u_1$, it does not violate Assumption \ref{AS1}; however it violates Assumption \ref{RP1}.\\
\end{comment}



{\bf Relationship between the three assumptions.}
%Then, we explain the relationship of the above assumptions. 
We obtain that our proposed monotonicity Assumption~\ref{MONO_A} for continuous and discrete cases is equivalent to the monotonicity Assumption~\ref{AS1} over structural function $f_Y(x,U)$ 
under the following assumption: 
\begin{assumption}
\label{SUP1}
Potential outcome $Y_x$ has PDF $p_{Y_x}$ for each $x \in \Omega_{X}$, and its support $\{y \in \Omega_Y: p_{Y_x}(y) \ne0 \}$ is
the same
%$[-\infty,\infty]$ 
for each $x \in \Omega_{X}$.
\end{assumption}
This assumption is reasonable for continuous variables. 
For example, the 
%multivariate 
linear regression model with Gaussian noise in \citep{Hannart2018} satisfies this assumption.

%, and our proposed rank preservation Assumption~\ref{RP1} for continuous and discrete cases is equivalent to the strict monotonicity Assumption~\ref{SAS1}.  
%\jin{Is the condition "Under SCM ${\cal M}_S$" required for the theorem?} 
%\yuta{[I think it is needed since we use function $f_Y$ in the definition of Asuumption \ref{AS1} and \ref{SAS1}.]}
%\yuta{[Deleted]}
%First, we show the equivalence of the first and second assumptions. 
%We have the following lemma.
%\begin{lemma}
%\label{LEM31}
  %  Under SCM ${\cal M}_{S}$ and Assumption \ref{TOT}, Assumption \ref{MONO} implies Assumption \ref{AS1}.
%\end{lemma}
%Next, we have the following lemma.
%\begin{lemma}
 %   Under SCM ${\cal M}_{S}$ and Assumption \ref{TOT}, Assumption \ref{AS1} implies Assumption \ref{MONO}.
%\end{lemma}
%We have the following theorem.
\begin{theorem}%[Equivalence of three assumptions]
\label{E12}
    Under SCM ${\cal M}_S$ and Assumption \ref{SUP1}, 
    Assumptions \ref{MONO_A} and \ref{AS1} are equivalent, and
    %and Assumptions \ref{SAS1} and \ref{RP1} are equivalent. 
    {Assumption \ref{SAS1} is a strictly stronger requirement than \ref{AS1}.}
\end{theorem}
%\jin{What is the intuition here? How can monotonicity with respect to $X$ be equivalent to monotonicity with respect to $U$? Intuitively, can't we easily construct a SCM that is monotonic in one but not monotonic in the other? Similarly, I can imagine we can easily construct a SCM that is rank preservation but not monotonic with respect to $U$.}

\begin{comment}
\begin{figure}
    \centering
    \scalebox{1}{
    \begin{tikzpicture}

  \draw (0,-0.7) .. controls (1,-0.25) and (2,0.4) .. (3,0.8);
 \draw (-1,-0.9) .. controls (-0.5,-0.88) .. (0,-0.7);
  \draw (3,0.8) .. controls (3.5,1) .. (4,1.25)
    node[anchor=west] {$Y_x(u_{\rho(y;x_0)})$};

      \draw[black,dotted] (0,0.7) .. controls (1,0.6) and (2,0.5) .. (3,0);
 \draw[black,dotted] (-1,0.75) .. controls (-0.5,0.725) .. (0,0.7);
  \draw[black,dotted] (3,0) .. controls (3.5,-0.25) .. (4,-0.3)
    node[anchor=west] {$Y_x(u')$};

    \draw[black,dotted] (-1, -0.7) -- (4, -0.7)
      node[anchor=west] {$Y=y$};

        \draw[black,dotted] (0, -0.7) -- (0, -1.5)
      node[anchor=west] {$X=x_0$};
      \draw[black] (0, -0.7) -- (0, 1.5);
      
   \draw[black,dotted] (3, 0.8) -- (3, -1.5)
      node[anchor=west] {$X=x_1$};
 \draw[black] (3, 0.8) -- (3, 1.5);
      
      
    % y-axis
    \draw[thick, black, ->] (-0.5, -1.5) -- (-0.5, 1.5)
      node[anchor=south] {$Y$};

      % x-axis
    \draw[thick, black, ->] (-1, -1) -- (4, -1)
      node[anchor=west] {$X$};
      
    \draw[black,dotted] (-1, 0.8) -- (4, 0.8)
      node[anchor=west] {$Y=y'$};

          \draw[black,dotted] (-1, 0.25) -- (4, 0.25)
      node[anchor=west] {$Y=y''$};
      
\node at (0,-0.7)[circle,fill,inner sep=1pt]{};
\node at (3,0.8)[circle,fill,inner sep=1pt]{};
      
\end{tikzpicture}
}
    \caption{Sketch of Proof}
    \label{fig:0}
\end{figure}
\end{comment}

\begin{comment}
    
\yuta{
\begin{proof}[(Sketch of Proof)]
%{\bf Sketch of Proof.}\\
(Assumption \ref{MONO_A} $\Rightarrow$ Assumption \ref{AS1})
We assume the negation of Assumption \ref{AS1} that there exists a set ${\cal U}$, whose probability is not zero, and
%a set ${\cal U}$ such that $0<\mathbb{P}({\cal U})<1$, and 
$f_Y(x_0,u_0)\geq y> f_Y(x_0,u_1) \land f_Y(x_1,u_0)< y \leq f_Y(x_1,u_1)$ for some $x_0, x_1 \in \Omega_X$ and $y \in \Omega_Y$ and for any $u_0,u_1 \in {\cal U}$ such that $u_0< u_1$.
Then, we have $f_Y(x_0,u_0)\geq y > f_Y(x_1,u_0) \land f_Y(x_0,u_1)< y \leq f_Y(x_1,u_1)$ for some $x_0, x_1 \in \Omega_X$ and $y \in \Omega_Y$ and for any $u_0, u_1 \in {\cal U}$ such that $u_0 < u_1$.
It implies $f_Y(x_0,u)\geq y > f_Y(x_1,u)$ and $f_Y(x_0,u)< y \leq f_Y(x_1,u)$ for some $x_0, x_1 \in \Omega_X$ and $y \in \Omega_Y$ and for any $u \in {\cal U}$, and $\mathbb{P}(Y_{x_0}< y \leq Y_{x_1})> 0$ and $\mathbb{P}(Y_{x_1}< y \leq Y_{x_0})> 0$ for some $x_0, x_1 \in \Omega_X$ and $y \in \Omega_Y$. 
This is contradicted to Assumption \ref{MONO_A}.\\
%since $f_Y(x_0,u)> y \geq f_Y(x_1,u) \Leftrightarrow Y_{x_0}(u)> y \geq Y_{x_1}(u)$ and $f_Y(x_0,u)\leq y < f_Y(x_1,u) \Leftrightarrow Y_{x_1}(u)> y \geq Y_{x_0}(u)$.\\
(Assumption \ref{MONO_A} $\Leftarrow$ Assumption \ref{AS1})
We denote $u_{\rho(y;x_0)}=\sup\{u;f_Y(x_0,u)\leq y\}$ for any $x_0 \in \Omega_X$ and $y \in \Omega_Y$.
%Here, we restrict both Assumptions \ref{MONO_A} and \ref{AS1} to monotonic increasing on $X$ and $U$, respectively, for simplicity.
Supposed the function $f_Y(x,U)$ is monotonic increasing on $U$ and $y \leq y'$ for simplicity and denote $y'=Y_{x_1}(u_{\rho(y;x_0)})$ for any $x_1 \in \Omega_X$ ($x_0 <x_1$),
Assumption \ref{AS1} implies $Y_{x_0} \geq y \Leftrightarrow f_Y(x_0,u)\geq f_Y(x_0,u_{\rho(y;x_0)}) \Leftrightarrow u \geq u_{\rho(y;x_0)} \Leftrightarrow f_Y(x_1,u)\geq f_Y(x_1,u_{\rho(y;x_0)}) \Leftrightarrow Y_{x_1} \geq y'$ for any $x_0,x_1 \in \Omega_X$ ($x_0<x_1$) and $y\in\Omega_Y$ almost surely w.r.t. $\mathbb{P}_U$.
This means $\mathbb{P}(Y_{x_1}< y \leq Y_{x_0})=0$ for any $x_0,x_1 \in \Omega_X$ ($x_0<x_1$) and $y \in\Omega_Y$.\\
(Assumption \ref{SAS1} $\Rightarrow$ Assumption \ref{RP1})
From Assumption \ref{SAS1}, we have $f_Y(x,u_0) \ne f_Y(x,u_1)\Leftrightarrow Y_{x}(u_0) \ne Y_{x}(u_1)$ for all $x \in \Omega_X$ and $\mathbb{P}_U$-almost every $u_0,u_1 \in \Omega_U$ such that $u_0 \ne u_1$.
In addition, if the function $f_Y$ is strictly monotonic increasing and $u_0 < u_1$ for simplicity, then we have both $f_Y(x_0,u_0) < f_Y(x_0,u_1) \Leftrightarrow Y_{x_0}(u_0)< Y_{x_0}(u_1)$ and $f_Y(x_1,u_0) < f_Y(x_1,u_1)\Leftrightarrow Y_{x_1}(u_0)< Y_{x_1}(u_1)$ for any $u_0,u_1 \in \Omega_U$ ($u_0 \ne u_1$) and $x_0,x_1 \in \Omega_X$ almost surely w.r.t $\mathbb{P}_U$ from Assumption \ref{SAS1}.
This implies Assumption \ref{RP1}.\\
(Assumption \ref{SAS1} $\Leftarrow$ Assumption \ref{RP1})
From Assumption \ref{RP1}, we have either $Y_x(u_0)< Y_x(u_1) \Leftrightarrow f_Y(x,u_0) < f_Y(x,u_1)$ or $Y_x(u_0)> Y_x(u_1) \Leftrightarrow f_Y(x,u_0) > f_Y(x,u_1)$ for any $x \in \Omega_X$ and $\mathbb{P}_U$-almost every $u_0,u_1 \in \Omega_U$ such that $u_0 \ne u_1$.
This implies Assumption \ref{SAS1}.
\end{proof}
}
\end{comment}



%Next, we show the equivalence of the third and fourth assumptions. 
%We have the following lemma.
%\begin{lemma}
 %   Under SCM ${\cal M}_S$ and Assumption \ref{TOT}, Assumption \ref{SAS1} implies Assumption \ref{RP1}.
%\end{lemma}
%Next, we have the following lemma.
%\begin{lemma}
%    Under SCM ${\cal M}_S$ and Assumption \ref{TOT}, Assumption \ref{RP1} implies Assumption \ref{SAS1}.
%\end{lemma}
%We have the following theorem.
%\begin{theorem}[Equivalence of third and fourth assumptions]
%\label{E34}
%    Under SCM ${\cal M}_S$ and Assumption \ref{TOT}, Assumptions \ref{SAS1} and \ref{RP1} are equivalent.
%\end{theorem}


%In summary, the first and second assumptions are equivalent, the third assumption is stronger than the second assumption, and the fourth assumption is equivalent to the third assumption.

\begin{comment}
    
First and second assumptions (Assumptions \ref{MONO_A} and \ref{AS1}) are often used for binary or discrete outcome.
\yuta{This is because structural functions of them consist of $\mathbb{I}(\cdot)$, and $\mathbb{I}(\cdot)$ is not strictly monotonic function but monotonic function.}
\jin{What is the logic here?} 
On the other hand, third and fourth assumptions (Assumptions \ref{SAS1} and \ref{RP1}) are often used for continuous outcome.
First and forth assumptions (Assumptions \ref{MONO_A} and \ref{RP1}) are related potential outcomes, and second and third assumptions (Assumptions \ref{AS1} and \ref{SAS1}) are related to the structural function.

Assumptions \ref{SAS1} and \ref{RP1} imply that the trajectory of potential outcome, i.e., $\{(x,Y_x(u)) \in \Omega_X \times \Omega_Y;\forall x \in \Omega_X\}$ is uniquely determined given the realized values $(X=x',Y=y')$. 
\end{comment}


\begin{figure}
    \centering
    \scalebox{1}{
    \begin{tikzpicture}

  \draw (0,0) .. controls (1,0.25) and (2,0.5) .. (3,0.8);
 \draw (-1,-0.5) .. controls (-0.5,-0.25) .. (0,0);
  \draw (3,0.8) .. controls (3.5,1) .. (4,1.25)
    node[anchor=west] 
    {Trajectory (1)};
    %{$Y_x(u_{\rho(y;x_0)})$};

  \draw (0,-1) .. controls (1,-0.5) and (2,-0.5) .. (3,0);
 \draw (-1,-1.25) .. controls (-0.5,-1.25) .. (0,-1);
  \draw (3,0) .. controls (3.5,0.5) .. (4,0.5)
  node[anchor=west] 
    {Trajectory (2)};
    %{$Y_x(u_{\rho(y;x_1)})$};

    \draw[black,dotted] (-1, 0) -- (4, 0)
      node[anchor=west] {$Y_x=y$};

        \draw[black,dotted] (0, 1.5) -- (0, -1.5)
      node[anchor=west] {$X=x_0$};
      
   \draw[black,dotted] (3, 1.5) -- (3, -1.5)
      node[anchor=west] {$X=x_1$};

      
    % y-axis
    \draw[thick, black, ->] (-0.5, -1.5) -- (-0.5, 1.5)
      node[anchor=south] {$Y_x$};

      % x-axis
    \draw[thick, black, ->] (-1, -1) -- (4, -1)
      node[anchor=west] {$X$};

      
\node at (0,0)[circle,fill,inner sep=1pt]{};
\node at (3,0)[circle,fill,inner sep=1pt]{};
      
\end{tikzpicture}
}
    \caption{Trajectories for (1) $Y_x(u_{\rho(y;x_0)})$ and (2) $Y_x(u_{\rho(y;x_1)})$. }
    %$Y_x(u_{\rho(y;x_0)})$ and $Y_x(u_{\rho(y;x_1)})$.}
    \label{fig:1}
\end{figure}


\subsection{Identification Theorem}
Next, we present an identification theorem.
We denote the conditional CDF of $Y$ given $X=x$, defined with a strict inequality, by
\begin{equation}
    \rho(y;x)\defeq\mathbb{P}(Y< y|X=x).%=\mathbb{P}(Y\leq y|X=x)
\end{equation}
%for any $y \in \Omega_Y$ and $x \in \Omega_X$.
%, and we have $\rho(y;x)=\mathbb{P}(Y < y|X=x)$ for any $y \in \Omega_Y$  and $x \in \Omega_X$. 
%\jin{This only holds assuming Exogeneity or under Under SCM ${\cal M}_S$.  Do you want $\rho(y;x)$ to denote $\mathbb{P}(Y_x\leq y)$ or $\mathbb{P}(Y\leq y|X=x)$? can't be both! This is confusing. Given its application in the following theorem, why not just define $\rho(y;x)=\mathbb{P}(Y\leq y|X=x)$?}\yuta{[Fixed]}
%We assume
%\begin{assumption}[Positivity]
%\label{POS}
%    $\rho(y;x_1)<1$ and $0<\rho(y;x_0)$ hold for any $x_0, x_1 \in \Omega_X$ and $y \in \Omega_Y$.
%\end{assumption}
%We do not assume the positivity of the boundary $\Omega_Y \setminus \Omega_Y$, e.g., the minimum or maximum values of $\Omega_Y$.
%Then, we have the following theorem. %\jin{Is the condition "Under SCM ${\cal M}_S$" really required for the theorem? Can it be replaced by Lemma 3.1 - that is, just assuming Exogeneity?}\yuta{[Fixed]}
\begin{theorem}[Identification of PoC]
\label{THEO1}
Under SCM ${\cal M}_{S}$ and Assumptions \ref{ASEXO}, \ref{MONO_A} (or \ref{AS1}, \ref{SAS1}), and \ref{SUP1}, 
PNS, PN, and PS are identifiable by
\begin{equation}
    \begin{aligned}
    &\text{PNS}(y;x_0,x_1)=\max\{\rho(y;x_0)-\rho(y;x_1),0\},\\
    &\text{PN}(y;x_0,x_1)=\max\left\{\frac{\rho(y;x_0)-\rho(y;x_1)}{1-\rho(y;x_1)},0\right\},\\
    &\text{PS}(y;x_0,x_1)=\max\left\{\frac{\rho(y;x_0)-\rho(y;x_1)}{\rho(y;x_0)},0\right\}
    \end{aligned}
\end{equation}
for any  $x_0,x_1 \in \Omega_X$ and $y \in \Omega_Y$ such that $\rho(y;x_1)<1$ and $\rho(y;x_0)>0$.
\end{theorem}
%This theorem consists of conditional CDF.
%If $Y$ is a binary outcome, Theorem \ref{THEO1} coincide with theorem 4 in \citep{Tian2000}. % or Proposition 4.2. \citep{Galhotra2021}. 
%Note that our theorem does not require the following positivity assumption, $\mathbb{P}(X=x_0)>0$ and $\mathbb{P}(X=x_1)>0$, since we do not use linear programming formulation as \citep{Tian2000,Galhotra2021}.
%
{We can use the trajectories of potential outcomes to visualize and explain the above identification result for PNS. 
The trajectory $\{(x,Y_x(u)) \in \Omega_X \times \Omega_Y;\forall x \in \Omega_X\}$ traces the potential outcome $Y_x(u)$ as a function of the treatment value $x$ for the subject with $U=u$.
\emph{Under Assumptions \ref{MONO_A} (or \ref{AS1}, \ref{SAS1}), the subjects' trajectories do not cross over each other} (they may overlap). 
We denote $u_{\rho(y;x)}=\sup\{u: f_Y(x,u) < y\}$ for any $x \in \Omega_X$ and $y \in \Omega_Y$, and $Y_x(u_{\rho(y;x)})$ is the potential outcome for subject $u_{\rho(y;x)}$. 
Consider the two trajectories shown in Figure \ref{fig:1}. Trajectory (1) $\{(x,Y_x(u_{\rho(y;x_0)})) \in \Omega_X \times \Omega_Y;\forall x \in \Omega_X\}$ goes through the point $(x_0,y)$, and Trajectory (2) $\{(x,Y_x(u_{\rho(y;x_1)})) \in \Omega_X \times \Omega_Y;\forall x \in \Omega_X\}$ goes through the point $(x_1,y)$. 
The trajectory of subject $u$ lies in the region between Trajectories (1) and (2) if and only if $Y_{x_0}(u) < y \leq Y_{x_1}(u)$.
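Thus, when $\mathbb{P}(Y_{x_0}< y)\geq\mathbb{P}(Y_{x_1}< y)$, as depicted in Figure \ref{fig:1}, we have $\text{PNS}(y;x_0,x_1)=\mathbb{P}(Y_{x_0}< y \leq Y_{x_1})=\mathbb{P}(Y_{x_0}< y)-\mathbb{P}(Y_{x_1}< y)$, where $\mathbb{P}(Y_{x_0}< y)$ represents the probability of a subject's trajectory being below Trajectory (1) and $\mathbb{P}(Y_{x_1}< y)$ represents the probability of a subject's trajectory being below Trajectory (2); otherwise, the event $Y_{x_0}< y \leq Y_{x_1}$ has probability zero, which accounts for the $\max\{\cdot,0\}$ in Theorem \ref{THEO1}.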
}
 
\begin{comment}
\yuta{We use trajectories of potential outcomes to explain and visualize the interpretation of theorems and corollaries in this paper.
The trajectory $\{(x,Y_x(u)) \in \Omega_X \times \Omega_Y;\forall x \in \Omega_X\}$ represents a set of all potential outcome $Y_x(u)$ for the subjects $U=u$.
Each subject has a unique trajectory.
Assumption \ref{AS1} guarantees the subject's trajectory does not cross over each other.}

\yuta{We have two Trajectories (1) and (2) in Figure \ref{fig:1}. 
Trajectory (1) $\{(x,Y_x(u_{\rho(y;x_0)})) \in \Omega_X \times \Omega_Y;\forall x \in \Omega_X\}$ goes through the point $(x_0,y)$, and Trajectory (2) $\{(x,Y_x(u_{\rho(y;x_1)})) \in \Omega_X \times \Omega_Y;\forall x \in \Omega_X\}$ goes through the point $(x_1,y)$.
The trajectories of the subjects always exist in the region between Trajectories (1) and (2) if and only if they satisfy $Y_{x_0}< y \leq Y_{x_1}$.
Thus, we have $\mathbb{P}(Y_{x_0}< y \leq Y_{x_1})=\mathbb{P}(Y_{x_0}< y)-\mathbb{P}(Y_{x_1}< y)$, where $\mathbb{P}(Y_{x_0}< y)$ means the probability of subject's trajectory exists below Trajectory (1) and $\mathbb{P}(Y_{x_1}< y)$ means the probability of subject's trajectory exists below Trajectory (2).}
    





{\bf Remarks.}
\yuta{We have 
\begin{equation}
    \begin{aligned}
        &\mathbb{P}(Y_{x_0} < y,Y_{x_1} < y)\\
        &=\mathbb{P}(Y_{x_0} < y)-\mathbb{P}(Y_{x_0} < y \leq Y_{x_1})\\
        &=\mathbb{P}(Y_{x_0} < y)-\max\{\mathbb{P}(Y_{x_0} < y)-\mathbb{P}(Y_{x_1}< y),0\}\\
        &=\min\{\mathbb{P}(Y_{x_0}< y),\mathbb{P}(Y_{x_1} < y)\}
    \end{aligned}
\end{equation} from Theorem \ref{THEO1}, and Assumptions \ref{MONO_A} (or \ref{AS1}, \ref{SAS1}).} \jin{How this claim is obtained is not obvious.}
Thus, the potential outcomes $(Y_{x_0},Y_{x_1})$ are ``\emph{comonotonic}.'' \jin{I don't see how the previous claim leads to comonotonic.}
\yuta{Two variables $(A,B)$ are said to be comonotonic if $\mathbb{P}(A<a,B<b)=\min\{\mathbb{P}(A<a),\mathbb{P}(B<b)\}$ for all $(a,b) \in \Omega_A \times \Omega_B$ \citep{Puccetti2010}.}
The concept of comonotonicity has been studied in financial risk management and actuarial science \citep{Dhaene2002,Jouini2004,Puccetti2010}. \jin{What is the point of this remark?}

\end{comment}








%If Assumption \ref{MONO_A} is violated, PNS, PN and PS are never identifiable since we only have $\mathbb{P}(Y_{x_0}< y \leq Y_{x_1})+\mathbb{P}(Y\leq y|X=x_1)=\mathbb{P}(Y_{x_1}\leq y <  Y_{x_0})+\mathbb{P}(Y\leq y|X=x_0)$. Thus, Assumptions \ref{MONO_A} and \ref{AS1} are the \emph{weakest} assumptions for the identification of PNS, PN, and PS from the distributions $\mathbb{P}(Y|X)$. \jin{If this is true, then why not present Theorem \ref{THEO1} as a `if and only if' result? }
