%\documentclass{uai2022} % for initial submission
\documentclass[accepted]{uai2022} % after acceptance, for a revised
                                    % version; also before submission to
                                    % see how the non-anonymous paper
                                    % would look like
%% There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2022} % ptmx math instead of Computer
                                         % Modern (has noticable issues)
% \documentclass[mathfont=newtx]{uai2022} % newtx fonts (improves upon
                                          % ptmx; less tested, no support)
% NOTE: Only keep *one* line above as appropriate, as it will be replaced
%       automatically for papers to be published. Do not make any other
%       change above this note for an accepted version.

%% Choose your variant of English; be consistent
\usepackage[american]{babel}
% \usepackage[british]{babel}

%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} % amsmath with fixes and additions
% \usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{tikz} % nice language for creating drawings and diagrams


%%%%%My Preambles%%%%%%%
\usepackage{hyperref}  
\usepackage{url}         
\usepackage{graphicx}
\usepackage{caption}
\usepackage{subcaption}
\usepackage{bm}
\usepackage{color}
\usepackage{amsmath,amssymb,amsfonts}
\input{Ni_macros}
\usepackage{algorithm}
\usepackage{algorithmic}
% \usepackage{tikz}
\usetikzlibrary{shapes,decorations,arrows,calc,arrows.meta,fit,positioning}
\tikzset{
    -Latex,auto,node distance =1 cm and 1 cm,semithick,
    state/.style ={ellipse, draw, minimum width = 0.7 cm},
    point/.style = {circle, draw, inner sep=0.04cm,fill,node contents={}},
    bidirected/.style={Latex-Latex,dashed},
    el/.style = {inner sep=2pt, align=left, sloped}
}

%%%%%End My Preambles%%%%%%%

%% Provided macros
% \smaller: Because the class footnote size is essentially LaTeX's \small,
%           redefining \footnotesize, we provide the original \footnotesize
%           using this macro.
%           (Use only sparingly, e.g., in drawings, as it is quite small.)

%% Self-defined macros
\newcommand{\swap}[3][-]{#3#1#2} % just an example

\title{Ordinal Causal Discovery: Supplementary Materials}

% The standard author block has changed for UAI 2022 to provide
% more space for long author lists and allow for complex affiliations
%
% All author information is automatically removed by the class for the
% anonymous submission version of your paper, so you can already add your
% information below.
%
% Add authors
\author[1]{\href{mailto:<yni@stat.tamu.edu>?Subject=Your UAI 2022 paper}{Yang~Ni}{}}
\author[1]{Bani~Mallick}
% Add affiliations after the authors
\affil[1]{%
    Department of Statistics\\
    Texas A\&M University\\
    College Station, Texas, USA
}

  \begin{document}
  \onecolumn \maketitle

\section{Proof of Theorem 1}

We need the notion of a \emph{real analytic function}. 

\textbf{Definition (Real Analytic Function)} \emph{A real function is said to be analytic if it is infinitely differentiable and matches its Taylor series in a neighborhood of every point.}


Suppose $X\in \{1,\dots,S\}$ and $Y\in \{1,\dots,L\}$. 
Consider two competing causal models $p_{X\rightarrow Y}(X,Y| \pib,\betab,\gammab)$ and $p_{Y\rightarrow X}(Y,X|\rhob,\alphab,\etab)$. We will show that these two causal models are in general not equivalent, i.e., $P_{X\rightarrow Y}(X=s,Y=\ell| \pib,\betab,\gammab)\neq P_{Y\rightarrow X}(X=s,Y=\ell|\rhob,\alphab,\etab)$ for some $s\in \{1,\dots,S\}$ and $\ell\in\{1,\dots,L\}$, where $S,L> 2$. We prove it by contradiction. Suppose for any $s\in \{1,\dots,S\}$ and $\ell\in\{1,\dots,L\}$, 
\begin{align}\label{eq:eq}
    P_{X\rightarrow Y}(X=s,Y=\ell| \pib,\betab,\gammab)=P_{Y\rightarrow X}(X=s,Y=\ell|\rhob,\alphab,\etab).
\end{align}
The left-hand side of \eqref{eq:eq} is given by 
\begin{align*}
    P_{X\rightarrow Y}(X=s,Y=\ell| \pib,\betab,\gammab)&=P_{X\rightarrow Y}(Y=\ell|X=s, \betab,\gammab)P_{X\rightarrow Y}(X=s|\pib)\\
    &=[P_{X\rightarrow Y}(Y\leq\ell|X=s, \betab,\gammab)-P_{X\rightarrow Y}(Y\leq\ell-1|X=s, \betab,\gammab)]P_{X\rightarrow Y}(X=s|\pib)\\
    &=[F(\gamma_\ell -\beta_s)-F(\gamma_{\ell-1} -\beta_s)]\pi_s,
\end{align*}
where $F(x)$ is the logistic link function $F(x)=\frac{e^x}{1+e^x}$ or the probit link function $F(x)=\Phi(x)$  with $\Phi(x)$ being the standard normal cumulative distribution function. Similarly, the right-hand side of \eqref{eq:eq} is given by 
\begin{align*}
    P_{Y\rightarrow X}(X=s,Y=\ell|\rhob,\alphab,\etab)=P_{Y\rightarrow X}(X=s|Y=\ell,\alphab,\etab)P_{Y\rightarrow X}(Y=\ell|\rhob)=[F(\eta_s-\alpha_\ell)-F(\eta_{s-1}-\alpha_\ell)]\rho_\ell.
\end{align*}
Therefore, \eqref{eq:eq} leads to 
\begin{align}\label{eq:diff}
    [F(\gamma_\ell -\beta_s)-F(\gamma_{\ell-1} -\beta_s)]\pi_s=[F(\eta_s-\alpha_\ell)-F(\eta_{s-1}-\alpha_\ell)]\rho_\ell.
\end{align}
Note that the right-hand side of \eqref{eq:diff} telescopes when summed over $s$. Hence, summing both sides of \eqref{eq:diff} over $s$ from 1 to $S$, we have 
\begin{align}\label{eq:rho}
    \sum_{s=1}^S[F(\gamma_\ell -\beta_s)-F(\gamma_{\ell-1} -\beta_s)]\pi_s&=[F(\eta_S-\alpha_\ell)-F(\eta_0-\alpha_\ell)]\rho_\ell\nonumber\\
    &=\rho_\ell.
\end{align}
The last equality holds because $\eta_S=\infty$ and $\eta_0=-\infty$, and hence $F(\eta_S-\alpha_\ell)=1$ and $F(\eta_0-\alpha_\ell)=0$. 
Plugging \eqref{eq:rho} into \eqref{eq:diff} gives
\begin{align*}
    [F(\gamma_\ell -\beta_s)-F(\gamma_{\ell-1} -\beta_s)]\pi_s=[F(\eta_s-\alpha_\ell)-F(\eta_{s-1}-\alpha_\ell)]\sum_{s'=1}^S[F(\gamma_\ell -\beta_{s'})-F(\gamma_{\ell-1} -\beta_{s'})]\pi_{s'},
\end{align*}
and hence
\begin{align}\label{eq:FF}
    \frac{[F(\gamma_\ell -\beta_s)-F(\gamma_{\ell-1} -\beta_s)]\pi_s}{\sum_{s'=1}^S[F(\gamma_\ell -\beta_{s'})-F(\gamma_{\ell-1} -\beta_{s'})]\pi_{s'}}=F(\eta_s-\alpha_\ell)-F(\eta_{s-1}-\alpha_\ell).
\end{align}
Now, consider $s=1$ in \eqref{eq:FF} and note $\eta_0=-\infty$ and $\eta_1=0$,
\begin{align*}
    \frac{[F(\gamma_\ell -\beta_1)-F(\gamma_{\ell-1} -\beta_1)]\pi_1}{\sum_{s=1}^S[F(\gamma_\ell -\beta_s)-F(\gamma_{\ell-1} -\beta_s)]\pi_s}&=F(\eta_1-\alpha_\ell)-F(\eta_0-\alpha_\ell)\\
    &=F(-\alpha_\ell).
\end{align*}
Therefore,
\begin{align}\label{eq:alpha}
    \alpha_\ell=-F^{-1}\left\{\frac{[F(\gamma_\ell -\beta_1)-F(\gamma_{\ell-1} -\beta_1)]\pi_1}{\sum_{s=1}^S[F(\gamma_\ell -\beta_s)-F(\gamma_{\ell-1} -\beta_s)]\pi_s}\right\}.
\end{align}

Summing \eqref{eq:FF} over $s=1,\dots,s^*$, so that its right-hand side telescopes to $F(\eta_{s^*}-\alpha_\ell)$, and plugging in \eqref{eq:alpha}, we obtain, for $s^*=2,\dots,S-1$ (note that there is at least one such $s^*$, namely $s^*=2$, because $S>2$),
\begin{align}\label{eq:eta}
    \eta_{s^*}=F^{-1}\left\{\frac{\sum_{s=1}^{s^*}[F(\gamma_\ell -\beta_s)-F(\gamma_{\ell-1} -\beta_s)]\pi_s}{\sum_{s=1}^S[F(\gamma_\ell -\beta_s)-F(\gamma_{\ell-1} -\beta_s)]\pi_s}\right\}-F^{-1}\left\{\frac{[F(\gamma_\ell -\beta_1)-F(\gamma_{\ell-1} -\beta_1)]\pi_1}{\sum_{s=1}^S[F(\gamma_\ell -\beta_s)-F(\gamma_{\ell-1} -\beta_s)]\pi_s}\right\}.
\end{align}
Because the left-hand side of \eqref{eq:eta} does not depend on $\ell$, the right-hand side of \eqref{eq:eta} must take the same value for every $\ell$; that is,
\begin{align}\label{eq:af}
  F^{-1}\left\{\frac{\sum_{s=1}^{s^*}[F(\gamma_\ell -\beta_s)-F(\gamma_{\ell-1} -\beta_s)]\pi_s}{\sum_{s=1}^S[F(\gamma_\ell -\beta_s)-F(\gamma_{\ell-1} -\beta_s)]\pi_s}\right\}-F^{-1}\left\{\frac{[F(\gamma_\ell -\beta_1)-F(\gamma_{\ell-1} -\beta_1)]\pi_1}{\sum_{s=1}^S[F(\gamma_\ell -\beta_s)-F(\gamma_{\ell-1} -\beta_s)]\pi_s}\right\}\nonumber\\
  -F^{-1}\left\{\frac{\sum_{s=1}^{s^*}[F(\gamma_{\ell^*} -\beta_s)-F(\gamma_{\ell^*-1} -\beta_s)]\pi_s}{\sum_{s=1}^S[F(\gamma_{\ell^*} -\beta_s)-F(\gamma_{\ell^*-1} -\beta_s)]\pi_s}\right\}+F^{-1}\left\{\frac{[F(\gamma_{\ell^*} -\beta_1)-F(\gamma_{\ell^*-1} -\beta_1)]\pi_1}{\sum_{s=1}^S[F(\gamma_{\ell^*} -\beta_s)-F(\gamma_{\ell^*-1} -\beta_s)]\pi_s}\right\}=0,
\end{align}
for any $\ell,\ell^*\in\{1,\dots,L\}$ with $\ell\neq\ell^*$. 
The link function $F$ is an analytic function: (i) the logistic link $F(x)=\frac{e^x}{1+e^x}$ is a composition of elementary functions and hence is analytic; and (ii) the probit link $F(x)=\Phi(x)$ is analytic because the error function $\operatorname{erf}$ is analytic. Since $F'(x)$ is nowhere zero in either case, $F^{-1}$ is analytic as well. 
Since the left-hand side of \eqref{eq:af} is a composition of $F$, $F^{-1}$, sums, products, and reciprocals of $\gamma_\ell,\gamma_{\ell^*},\gamma_{\ell-1},\gamma_{\ell^*-1},\beta_1,\dots,\beta_S,\pi_1,\dots,\pi_S$, it is an analytic function of these parameters \citep{krantz2002primer}, and therefore its zero set has Lebesgue measure zero, provided the function is not identically zero \citep{mityagin2015zero}. In summary, we have proven that the two causal models are not equivalent for almost all $(\pib,\betab,\gammab)$ with respect to the Lebesgue measure. Note that although our proof is stated for the logistic and probit links, it generalizes to other link functions as long as they are analytic and their derivatives are nowhere zero.


% \section{Proof of Theorem 1}

% Suppose $X\in \{1,\dots,S\}$ and $Y\in \{1,\dots,L\}$. 
% Consider two competing causal models $p_{X\rightarrow Y}(X,Y| \pib,\betab,\gammab)$ and $p_{Y\rightarrow X}(Y,X|\rhob,\alphab,\etab)$. We will show that these two causal models are in general not equivalent, i.e., $P_{X\rightarrow Y}(X=s,Y=\ell| \pib,\betab,\gammab)\neq P_{Y\rightarrow X}(X=s,Y=\ell|\rhob,\alphab,\etab)$ for some $s\in \{1,\dots,S\}$ and $\ell\in\{1,\dots,L\}$, where $S,L\geq 2$ and $\max\{S,L\}\geq 3$. Without loss of generality, assume $S\geq 3$. We prove it by contradiction. Suppose for any $s\in \{1,\dots,S\}$ and $\ell\in\{1,\dots,L\}$, 
% \begin{align}\label{eq:eq}
%     P_{X\rightarrow Y}(X=s,Y=\ell| \pib,\betab,\gammab)=P_{Y\rightarrow X}(X=s,Y=\ell|\rhob,\alphab,\etab).
% \end{align}
% The left-hand side of \eqref{eq:eq} is given by 
% \begin{align*}
%     P_{X\rightarrow Y}(X=s,Y=\ell| \pib,\betab,\gammab)&=P_{X\rightarrow Y}(Y=\ell|X=s, \betab,\gammab)P_{X\rightarrow Y}(X=s|\pib)\\
%     &=[P_{X\rightarrow Y}(Y\leq\ell|X=s, \betab,\gammab)-P_{X\rightarrow Y}(Y\leq\ell-1|X=s, \betab,\gammab)]P_{X\rightarrow Y}(X=s|\pib)\\
%     &=[F(\gamma_\ell -\beta_s)-F(\gamma_{\ell-1} -\beta_s)]\pi_s,
% \end{align*}
% where $F(x)=\frac{e^x}{1+e^x}$ is the logistic link function. Similarly, the right-hand side of \eqref{eq:eq} is given by 
% \begin{align*}
%     P_{Y\rightarrow X}(X=s,Y=\ell|\rhob,\alphab,\etab)=P_{Y\rightarrow X}(X=s|Y=\ell,\alphab,\etab)P_{Y\rightarrow X}(Y=\ell|\rhob)=[F(\eta_s-\alpha_\ell)-F(\eta_{s-1}-\alpha_\ell)]\rho_\ell.
% \end{align*}
% Therefore, \eqref{eq:eq} leads to 
% \begin{align}\label{eq:diff}
%     [F(\gamma_\ell -\beta_s)-F(\gamma_{\ell-1} -\beta_s)]\pi_s=[F(\eta_s-\alpha_\ell)-F(\eta_{s-1}-\alpha_\ell)]\rho_\ell
% \end{align}
% Note that the right-hand side of  \eqref{eq:diff} is a telescoping series in $s$. Hence, summing up both sides of \eqref{eq:diff} over $s$ from 1 to $S$, we have 
% \begin{align}\label{eq:rho}
%     \sum_{s=1}^S[F(\gamma_\ell -\beta_s)-F(\gamma_{\ell-1} -\beta_s)]\pi_s&=[F(\eta_S-\alpha_\ell)-F(\eta_0-\alpha_\ell)]\rho_\ell\nonumber\\
%     &=\rho_\ell.
% \end{align}
% The last equation is because $\eta_S=\infty$ and $\eta_0=-\infty$ and hence $F(\eta_S-\alpha_\ell)=1$ and $F(\eta_0-\alpha_\ell)=0$. 
% Plug \eqref{eq:rho} into \eqref{eq:diff}, 
% \begin{align*}
%     [F(\gamma_\ell -\beta_s)-F(\gamma_{\ell-1} -\beta_s)]\pi_s=[F(\eta_s-\alpha_\ell)-F(\eta_{s-1}-\alpha_\ell)]\sum_{s=1}^S[F(\gamma_\ell -\beta_s)-F(\gamma_{\ell-1} -\beta_s)]\pi_s
% \end{align*}
% and hence
% \begin{align}\label{eq:FF}
%     \frac{[F(\gamma_\ell -\beta_s)-F(\gamma_{\ell-1} -\beta_s)]\pi_s}{\sum_{s=1}^S[F(\gamma_\ell -\beta_s)-F(\gamma_{\ell-1} -\beta_s)]\pi_s}=F(\eta_s-\alpha_\ell)-F(\eta_{s-1}-\alpha_\ell)
% \end{align}
% Now, consider $s=1$ in \eqref{eq:FF} and note $\eta_0=-\infty$ and $\eta_1=0$,
% \begin{align*}
%     \frac{[F(\gamma_\ell -\beta_1)-F(\gamma_{\ell-1} -\beta_1)]\pi_1}{\sum_{s=1}^S[F(\gamma_\ell -\beta_s)-F(\gamma_{\ell-1} -\beta_s)]\pi_s}&=F(\eta_1-\alpha_\ell)-F(\eta_0-\alpha_\ell)\\
%     &=F(-\alpha_\ell).
% \end{align*}
% Therefore,
% \begin{align}\label{eq:alpha}
%     \alpha_\ell=-F^{-1}\left\{\frac{[F(\gamma_\ell -\beta_1)-F(\gamma_{\ell-1} -\beta_1)]\pi_1}{\sum_{s=1}^S[F(\gamma_\ell -\beta_s)-F(\gamma_{\ell-1} -\beta_s)]\pi_s}\right\}
% \end{align}

% Sequentially plug \eqref{eq:alpha} into \eqref{eq:FF} for $s^*=2,\dots,S-1$ (note that one can at least plug in once for $s^*=2$ because $S\geq 3$), 
% \begin{align}\label{eq:eta}
%     \eta_{s^*}=F^{-1}\left\{\frac{\sum_{s=1}^{s^*}[F(\gamma_\ell -\beta_s)-F(\gamma_{\ell-1} -\beta_s)]\pi_s}{\sum_{s=1}^S[F(\gamma_\ell -\beta_s)-F(\gamma_{\ell-1} -\beta_s)]\pi_s}\right\}-F^{-1}\left\{\frac{[F(\gamma_\ell -\beta_1)-F(\gamma_{\ell-1} -\beta_1)]\pi_1}{\sum_{s=1}^S[F(\gamma_\ell -\beta_s)-F(\gamma_{\ell-1} -\beta_s)]\pi_s}\right\},
% \end{align}
% Because the left-hand side of \eqref{eq:eta} is independent of $\ell$ whereas the right-hand side of \eqref{eq:eta} depends on $\ell$, we have,
% \begin{align*}
%   F^{-1}\left\{\frac{\sum_{s=1}^{s^*}[F(\gamma_\ell -\beta_s)-F(\gamma_{\ell-1} -\beta_s)]\pi_s}{\sum_{s=1}^S[F(\gamma_\ell -\beta_s)-F(\gamma_{\ell-1} -\beta_s)]\pi_s}\right\}-F^{-1}\left\{\frac{[F(\gamma_\ell -\beta_1)-F(\gamma_{\ell-1} -\beta_1)]\pi_1}{\sum_{s=1}^S[F(\gamma_\ell -\beta_s)-F(\gamma_{\ell-1} -\beta_s)]\pi_s}\right\}\\
%   =F^{-1}\left\{\frac{\sum_{s=1}^{s^*}[F(\gamma_{\ell^*} -\beta_s)-F(\gamma_{\ell^*-1} -\beta_s)]\pi_s}{\sum_{s=1}^S[F(\gamma_{\ell^*} -\beta_s)-F(\gamma_{\ell^*-1} -\beta_s)]\pi_s}\right\}-F^{-1}\left\{\frac{[F(\gamma_{\ell^*} -\beta_1)-F(\gamma_{\ell^*-1} -\beta_1)]\pi_1}{\sum_{s=1}^S[F(\gamma_{\ell^*} -\beta_s)-F(\gamma_{\ell^*-1} -\beta_s)]\pi_s}\right\},
% \end{align*}
% for any $\ell,\ell^*\in\{1,\dots,L\}$ and $\ell\neq\ell^*$ (note that one can always find $\ell\neq\ell^*$ because $L\geq 2)$. Since $F^{-1}(x)=\log(\frac{x}{1-x})$, we have
% \begin{align}\label{eq:poly}
%     \frac{\left\{\sum_{s=1}^{s^*}\left[F(\gamma_\ell -\beta_s)-F(\gamma_{\ell-1} -\beta_s)\right]\pi_s\right\}\left\{\sum_{s=2}^{S}\left[F(\gamma_\ell -\beta_s)-F(\gamma_{\ell-1} -\beta_s)\right]\pi_s\right\}}{\left\{\left[F(\gamma_\ell -\beta_1)-F(\gamma_{\ell-1} -\beta_1)\right]\pi_1\right\}\left\{\sum_{s=s^*+1}^{S}\left[F(\gamma_\ell -\beta_s)-F(\gamma_{\ell-1} -\beta_s)\right]\pi_s\right\}}\nonumber\\
%     =\frac{\left\{\sum_{s=1}^{s^*}\left[F(\gamma_{\ell^*} -\beta_s)-F(\gamma_{\ell^*-1} -\beta_s)\right]\pi_s\right\}\left\{\sum_{s=2}^{S}\left[F(\gamma_{\ell^*} -\beta_s)-F(\gamma_{\ell^*-1} -\beta_s)\right]\pi_s\right\}}{\left\{\left[F(\gamma_{\ell^*} -\beta_1)-F(\gamma_{\ell^*-1} -\beta_1)\right]\pi_1\right\}\left\{\sum_{s=s^*+1}^{S}\left[F(\gamma_{\ell^*} -\beta_s)-F(\gamma_{\ell^*-1} -\beta_s)\right]\pi_s\right\}}
% \end{align}
% for $s^*=2,\dots,S-1$ and $\ell\neq \ell^*$. Let $r_\ell=e^{\gamma_\ell}$, $b_s=e^{-\beta_s}$ and hence $F(\gamma_\ell-\beta_s)=\frac{r_\ell b_s}{1+r_\ell b_s}$. Therefore, \eqref{eq:poly} is a polynomial equation in $r_\ell,r_{\ell^*},r_{\ell-1},r_{\ell^*-1},b_1,\dots,b_S,\pi_1,\dots,\pi_S$, which has at most a finite set of solutions and therefore have Lebesgue measure zero. In summary, we have proven that the two causal models are not equivalent for almost all $(\pib,\betab,\gammab)$ with respect to the Lebesgue measure.

% \clearpage




% \section{Proof of Theorem 1}
% We compute the conditional  $p(Y|X)$ and  marginal $p(X)$ distributions under $p_{Y\rightarrow X}(Y,X|\rhob,\alphab,\etab)$ and equate them to those under $p_{X\rightarrow Y}(X,Y| \pib,\betab,\gammab)$ for all configurations of $(X,Y)$,
% \begin{align*}
%     (a)~~&F(-\beta_1)=\frac{\rho_1F(-\alpha_1)}{\rho_1F(-\alpha_1)+\rho_2F(-\alpha_2)+\rho_3F(-\alpha_3)},\\
%     (b)~~&F(\gamma-\beta_1)-F(-\beta_1)=\frac{\rho_2F(-\alpha_2)}{\rho_1F(-\alpha_1)+\rho_2F(-\alpha_2)+\rho_3F(-\alpha_3)},\\
%     (c)~~&F(-\beta_2)=\frac{\rho_1[F(\eta-\alpha_1)-F(-\alpha_1)]}{\rho_1[F(\eta-\alpha_1)-F(-\alpha_1)]+\rho_2[F(\eta-\alpha_2)-F(-\alpha_2)]+\rho_3[F(\eta-\alpha_3)-F(-\alpha_3)]},\\
%     (d)~~&F(\gamma-\beta_2)-F(-\beta_2)=\frac{\rho_2[F(\eta-\alpha_2)-F(-\alpha_2)]}{\rho_1[F(\eta-\alpha_1)-F(-\alpha_1)]+\rho_2[F(\eta-\alpha_2)-F(-\alpha_2)]+\rho_3[F(\eta-\alpha_3)-F(-\alpha_3)]},\\
%     (e)~~&F(-\beta_3)=\frac{\rho_1F(\alpha_1-\eta)}{\rho_1F(\alpha_1-\eta)+\rho_2F(\alpha_2-\eta)+\rho_3F(\alpha_3-\eta)},\\
%     (f)~~&F(\gamma-\beta_3)-F(-\beta_3)=\frac{\rho_2F(\alpha_2-\eta)}{\rho_1F(\alpha_1-\eta)+\rho_2F(\alpha_2-\eta)+\rho_3F(\alpha_3-\eta)},\\
%     (g)~~&\pi_1=\rho_1F(-\alpha_1)+\rho_2F(-\alpha_2)+\rho_3F(-\alpha_3),\\
%      (h)~~&\pi_3=\rho_1F(\alpha_1-\eta)+\rho_2F(\alpha_2-\eta)+\rho_3F(\alpha_3-\eta).\\
% \end{align*}
% % We solve for $\rhob,\alphab,
% % \etab$ from Equations (a)-(h). Since there are more equations than variables, we arrive at the Equation \eqref{eq:ic}. 
% Eliminating $\rhob,\alphab,
% \etab$ from Equations (a)-(h), we arrive at two equations of  $\pib,\betab,\gammab$,
% \begin{align}\label{eq:ic}
%     \frac{C_1C_3}{(C_1+C_2)(C_2+C_3)}=\frac{D_1D_3}{(D_1+D_2)(D_2+D_3)}=\frac{(\pi_1-E_1)(\pi_3-E_3)}{(1-\pi_1-E_2-E_3)(1-\pi_3-E_1-E_2)},
% \end{align}
% with $C_\ell=\pi_\ell F(-\beta_\ell)$, $E_\ell=\pi_\ell F(\gamma-\beta_\ell)$, and $D_\ell=E_\ell-C_\ell$, for $\ell=1,2,3$. Solving \eqref{eq:ic} for $(\pib,\betab,\gammab)$ is equivalent to finding roots of polynomial functions after monotone transformations  $F(\gamma-\beta_\ell)=r_\ell$ and $F(-\beta_\ell)=b_\ell$, $\ell=1,2,3$, and a non-zero polynomial function is non-zero almost everywhere \citep{okamoto1973distinctness}. Therefore, $p(X,Y)\neq p_{Y\rightarrow X}(Y,X|\rhob,\alphab,\etab)$ except for a parameter set of Lebesgue measure zero (those satisfying the non-identifiable condition \eqref{eq:ic}). 

\section{Additional Experiment Results}




\subsection{Synthetic Data}

\paragraph{Number of Categories $L=3$} We investigate scenarios where the number of categories $L=3$.
The data are generated as in the main text ($n=500, q=10$) except that the number of categories is now set to $L=3$. 
%Note that $L=2$ would yield nonidentifiable model according to our causal identifiability theory. 
Six scenarios with different levels of signal strength are considered, $\sigma=0.25,0.5,0.75,1,1.25,1.5$. 
We report the SHD of OCD, BIC+, BIC, and BDe in Table~\ref{tab:l3}, which shows that OCD significantly outperforms the competing methods.

\begin{table}[h]
    \centering
    \begin{tabular}{lllllll}
    &\multicolumn{6}{c}{Signal Strength $\sigma$}\\\cline{2-7}
       & 0.25 & 0.5 & 0.75 & 1 & 1.25 & 1.5 \\\hline
    OCD & 5.2  & 1.6  & 1 & 0.8 & 0.2 & 0.2\\
    BIC+ &6.4 &5&3.6&3.8&3.8&3.6\\
    BIC &7 &5.8&4.6&4&3.2&3.8\\
    BDe&7&6.8&6.2&5.2&5&4.6
    \end{tabular}
    \caption{Structural Hamming distance between the true graph and the estimated graphs from OCD, BIC+, BIC, and BDe. The data are generated as in the main text with different levels of signal strength except that the number of categories is set to $L=3$.}
    \label{tab:l3}
\end{table}





\paragraph{Higher-Dimensional Synthetic Data} 
As shown in Figure~\ref{fig:scale3}, for all tested signal strengths $\sigma\in\{0.25,0.5,0.75,1\}$ and numbers of nodes $q=10,\dots,100$, the SHD and SID of OCD are uniformly better than those of the competing methods. In general, OCD is quite stable as $q$ increases when the signal strength is at least moderate ($\sigma\geq 0.5$), whereas the competing methods quickly deteriorate with $q$ regardless of the signal strength.

The CPU times of OCD in the synthetic data are shown in Figure~\ref{fig:scale}; they appear to scale linearly in $n$ and $L$, and quadratically in $q$. 

\begin{figure}[ht]
     \centering
     
  
     
     
     \begin{subfigure}[b]{.24\textwidth}
         \centering
\includegraphics[width=\linewidth]{sim_sig1_SID_dim.pdf}
    \caption{SID in $q$ ($\sigma=0.25$)}
     \end{subfigure}
    %  \hfill
     \begin{subfigure}[b]{.24\textwidth}
         \centering
\includegraphics[width=\linewidth]{sim_sig2_SID_dim.pdf}
    \caption{SID in $q$ ($\sigma=0.5$)}
     \end{subfigure}
        %   \hfill
      \begin{subfigure}[b]{.24\textwidth}
         \centering
\includegraphics[width=\linewidth]{sim_sig3_SID_dim.pdf}
    \caption{SID in $q$ ($\sigma=0.75$)}
     \end{subfigure}
    %   \hfill
     \begin{subfigure}[b]{.24\textwidth}
         \centering
\includegraphics[width=\linewidth]{sim_sig4_SID_dim.pdf}
    \caption{SID in $q$ ($\sigma=1$)}
     \end{subfigure}
     
     \caption{SID for OCD, BDe, BIC, and BIC+ as functions of $q$ in the synthetic ordinal data with the sample size fixed at $n=500$ and different signal strengths $\sigma\in\{0.25,0.5,0.75,1\}$.}
    \label{fig:scale3}
\end{figure}

\begin{figure}[ht]
     \centering
     
    %  \begin{subfigure}[b]{.3\textwidth}
    %      \centering
    %          \includegraphics[width=\textwidth]{true_DAG.pdf}
    % \caption{True DAG}
    %  \end{subfigure}
    %       \begin{subfigure}[b]{.34\textwidth}
    %      \centering
    %     \includegraphics[width=\textwidth]{sim_SHD.pdf}
    % \caption{SHD}
    % \label{fig:undisc}
    %  \end{subfigure}
    % %  \hfill
    %  \begin{subfigure}[b]{.34\textwidth}
    %      \centering
    %      \includegraphics[width=\textwidth]{sim_SID.pdf}
    % \caption{SID}
    % \label{fig:sn}
    %  \end{subfigure}
    %  ~~~~~~~~~~~~~~~~~~
    %  \begin{subfigure}[b]{.3\textwidth}
    %      \centering
    %      \includegraphics[width=\textwidth]{true_CPDAG.pdf}
    % \caption{True CPDAG}
    %  \end{subfigure}

     \begin{subfigure}[b]{.25\textwidth}
         \centering
             \includegraphics[width=\textwidth]{sim_oBN_CPU_n.pdf}
    \caption{CPU time in $n$}
     \end{subfigure}
    %  \hfill
     \begin{subfigure}[b]{.25\textwidth}
         \centering
         \includegraphics[width=\textwidth]{sim_oBN_CPU_L.pdf}
    \caption{CPU time in $L$}
     \end{subfigure}
        %   \hfill
     \begin{subfigure}[b]{.25\textwidth}
         \centering
         \includegraphics[width=\textwidth]{sim_oBN_CPU_q.pdf}
    \caption{CPU time in $q$}
     \end{subfigure}
     \caption{CPU times of OCD as functions of $n$, $L$, and $q$ in the synthetic ordinal data.}
    \label{fig:scale}
\end{figure}




\paragraph{Synthetic Data with Denser Graphs} We consider a scenario with $n=500$ observations, $q=50$ nodes, and $L=5$ categories. We randomly generate graphs with 25, 50, and 100 edges (Figure~\ref{fig:2550100}). We report the structural Hamming distance (SHD) between the true graph and the estimated graphs from OCD, BIC+, BIC, and BDe in Table~\ref{tab:deg}. We find a very minor decrease in the performance of OCD as the graphs become denser, whereas the competing methods perform substantially worse and deteriorate much faster.


\begin{figure}[h]
     \centering
     
  
     \begin{subfigure}[b]{.5\textwidth}
         \centering
\includegraphics[width=\linewidth]{sim_den_p25_truegraph.pdf}
    \caption{25 edges}
     \end{subfigure}
    %  \hfill
     \begin{subfigure}[b]{.5\textwidth}
         \centering
\includegraphics[width=\linewidth]{sim_den_p50_truegraph.pdf}
    \caption{50 edges}
     \end{subfigure}
        %   \hfill
      \begin{subfigure}[b]{.5\textwidth}
         \centering
\includegraphics[width=\linewidth]{sim_den_p100_truegraph.pdf}
    \caption{100 edges}
     \end{subfigure}
    %   \hfill
     
    
     \caption{Simulation truth of denser graphs.}
    \label{fig:2550100}
\end{figure}



\begin{table}[]
    \centering
    \begin{tabular}{llll}
    &\multicolumn{3}{c}{\# of Edges }\\\cline{2-4}
       & 25 & 50 & 100 \\\hline
    OCD & 0  & 0  & 1\\
    BIC+ &13 &-&-\\
    BIC &25 &48&92\\
    BDe&23&40&75
    \end{tabular}
    \caption{Structural Hamming distance between the true graph and the estimated graphs from OCD, BIC+, BIC, and BDe. The true graphs are generated randomly with 25, 50, and 100 edges. BIC+ is not applicable for 50 and 100 edges as it takes 150\,GB of memory.}
    \label{tab:deg}
\end{table}

\subsection{Real Data}


\paragraph{Sensitivity to Small Added Noise to CEP Data}  Following the idea in \citet{mooij2016distinguishing}, we test the sensitivity of the best-performing causal discovery methods (OCD with $L=15$, SLOPE, and bQCD) to small added noise. Specifically, we add independent centered Gaussian noise to $X$ and $Y$ with standard deviation $\tau\in\{10^{-8},10^{-7},\dots,10^{-1}\}$. We repeat the simulation of the noise 5 times under each noise level, and the average ACC and AUC are shown
in Figure~\ref{fig:sens}. All methods have stable performance for $\tau$ between $10^{-8}$ and $10^{-5}$. The performance of SLOPE starts deteriorating at $\tau=10^{-4}$, whereas OCD and bQCD are much more robust (significant drop in ACC at $\tau=10^{-1}$). The robustness of OCD is expected because small added noise will not significantly affect data discretization.

\begin{figure*}[h]
     \centering
     \begin{subfigure}[b]{.25\textwidth}
         \centering
             \includegraphics[width=\textwidth]{CEP_sensitivity.pdf}
    \caption{ACC}
    \label{fig:up}
     \end{subfigure}
    %  \hfill
     \begin{subfigure}[b]{.25\textwidth}
         \centering
         \includegraphics[width=\textwidth]{CEP_sensitivity2.pdf}
    \caption{AUC}
    \label{fig:disc}
     \end{subfigure}
        \caption{Sensitivity to small added noise for the CEP data.}
    \label{fig:sens}
\end{figure*}

% \vfill



% \begin{table}[htb]
%     \centering
%       \caption{CEP data.}
%     \scalebox{.9}{\begin{tabular}{lcccc}
%         \hline
%         & SLOPPY  & EMD  & GR-AN  & GPI  \\ \hline
%         ACC & 0.59 & 0.55 & 0.4 & 0.6 \\
%         AUC & 0.67 & 0.53 & 0.47 & 0.61  \\
%         CPU & 1.3m & 4.6d &  NA & 30d  \\\hline
%         & PNL-MLP  & ANM  & CURE  & RECI  \\\hline
%         ACC &0.75 & 0.6 & 0.6 & 0.63  \\
%         AUC & 0.7& 0.45 & 0.61 & 0.68 \\
%         CPU & 8.3h & 3.2d& NA & 1.2h  \\\hline
%     \end{tabular}
%     }
    
%     \label{tab:cep}
% \end{table}

% \newpage

% %SHD (old simulations)
% \begin{table}[h]
%     \centering
%     \begin{tabular}{lllllll}
%     &\multicolumn{6}{c}{Signal Strength $\sigma$}\\\cline{2-7}
%       & 0.25 & 0.5 & 0.75 & 1 & 1.25 & 1.5 \\\hline
%         OCD & 4.6 & 1.2 & 0.6 & 0.2 & 0 & 0 \\ 
%         BDe & 7 & 7 & 7 & 7 & 6.6 & 5.4 \\ 
%         BIC & 7 & 7 & 7 & 7 & 7 & 7 \\ 
%         BIC+ & 7 & 6.2 & 5.6 & 5.2 & 3.8 & 3.8 \\ 
%     \end{tabular}
% \end{table}
% %SID (old simulations)
% \begin{table}[h]
%     \centering
%     \begin{tabular}{lllllll}
%     &\multicolumn{6}{c}{Signal Strength $\sigma$}\\\cline{2-7}
%       & 0.25 & 0.5 & 0.75 & 1 & 1.25 & 1.5 \\\hline
%         OCD & 8.2 & 1.8 & 1 & 0.2 & 0 & 0 \\ 
%         BDe & 13 & 12 & 7.6 & 7.8 & 5.8 & 3.8 \\ 
%         BIC & 13 & 9.4 & 7.6 & 8.2 & 8.2 & 9.4 \\ 
%         BIC+ & 13 & 13.6 & 15.6 & 14.8 & 11.2 & 11.2 \\ 
%     \end{tabular}
% \end{table}
% %SHD (new simulations: different G)
% \begin{table}[h]
%     \centering
%     \begin{tabular}{lllllll}
%     &\multicolumn{6}{c}{Signal Strength $\sigma$}\\\cline{2-7}
%       & 0.25 & 0.5 & 0.75 & 1 & 1.25 & 1.5 \\\hline
%         OCD & 6 & 2.4 & 0.6 & 0.4 & 0.4 & 0.4 \\ 
%         BDe & 7.8 & 7.8 & 7.8 & 7.6 & 7.6 & 7 \\ 
%         BIC & 7.8 & 7.8 & 7.8 & 7.8 & 7.8 & 7.4 \\ 
%         BIC+ & 7.8 & 6.4 & 5.4 & 5.6 & 5.2 & 4.6 \\ 
%     \end{tabular}
% \end{table}
% %SID (new simulations: different G)
% \begin{table}[h]
%     \centering
%     \begin{tabular}{lllllll}
%     &\multicolumn{6}{c}{Signal Strength $\sigma$}\\\cline{2-7}
%       & 0.25 & 0.5 & 0.75 & 1 & 1.25 & 1.5 \\\hline
%         OCD & 20.2 & 8 & 1.6 & 0.8 & 0.8 & 0.8 \\ 
%         BDe & 25.6 & 23.8 & 24.2 & 22.4 & 20.8 & 20 \\ 
%         BIC & 25.6 & 24.2 & 22.2 & 18.2 & 18.2 & 18.6 \\ 
%         BIC+ & 25.6 & 23.6 & 24.2 & 26.4 & 26.6 & 22.4 \\ 
%     \end{tabular}
% \end{table}
\clearpage
\bibliography{ref_short}

\vfill


\end{document}
