%\documentclass{uai2022} % for initial submission
 \documentclass[accepted]{uai2022} % after acceptance, for a revised
                                    % version; also before submission to
                                    % see how the non-anonymous paper
                                    % would look like
%% There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2022} % ptmx math instead of Computer
                                         % Modern (has noticable issues)
% \documentclass[mathfont=newtx]{uai2022} % newtx fonts (improves upon
                                          % ptmx; less tested, no support)
% NOTE: Only keep *one* line above as appropriate, as it will be replaced
%       automatically for papers to be published. Do not make any other
%       change above this note for an accepted version.

%% Choose your variant of English; be consistent
\usepackage[american]{babel}
% \usepackage[british]{babel}




%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} % amsmath with fixes and additions
\usepackage[overload,ntheorem]{empheq}
\usepackage{amsfonts}
\usepackage{amsthm}
\usepackage{algorithm}
\usepackage[noend]{algpseudocode}
\usepackage{subfig}
\usepackage{comment}

\usepackage{float}

\newtheorem{theorem}{Theorem}[section]
\newtheorem{definition}[theorem]{Definition}
\newtheorem{proposition}[theorem]{Proposition}
\newtheorem{example}[theorem]{Example}
\newtheorem{lemma}[theorem]{Lemma}
\newtheorem{remark}[theorem]{Remark}
\newtheorem{corollary}[theorem]{Corollary}

\DeclareMathOperator{\pa}{pa}
\DeclareMathOperator{\ch}{ch}
\DeclareMathOperator{\an}{an}
\DeclareMathOperator{\de}{de}
\DeclareMathOperator{\cum}{cum}
\DeclareMathOperator{\ttop}{top}
\DeclareMathOperator{\argmax}{arg\,max}
\DeclareMathOperator{\argmin}{arg\,min}
\DeclareMathOperator{\var}{var}
\DeclareMathOperator{\rank}{rank}
\DeclareMathOperator{\thr}{thr}


%\usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{tikz} % nice language for creating drawings and diagrams
\usetikzlibrary{arrows.meta}

%% Provided macros
% \smaller: Because the class footnote size is essentially LaTeX's \small,
%           redefining \footnotesize, we provide the original \footnotesize
%           using this macro.
%           (Use only sparingly, e.g., in drawings, as it is quite small.)

%% Self-defined macros
\newcommand{\swap}[3][-]{#3#1#2} % just an example
\newcommand{\indep}{\perp \!\!\! \perp}
\numberwithin{equation}{section}

\title{Learning Linear Non-Gaussian Polytree Models---Supplementary Materials}

% The standard author block has changed for UAI 2022 to provide
% more space for long author lists and allow for complex affiliations
%
% All author information is automatically removed by the class for the
% anonymous submission version of your paper, so you can already add your
% information below.
%
% Add authors
\author[1]{Daniele Tramontano}
\author[2]{Anthea Monod}
\author[1]{\href{mailto:<mathias.drton@tum.edu>?Subject=Your UAI 2022 paper}{Mathias Drton}{}}


% Add affiliations after the authors
\affil[1]{%
Department of Mathematics and Munich Data Science Institute\\
Technical University of Munich\\
Germany
}
\affil[2]{%
Department of Mathematics\\
Imperial College London\\
UK
}

%%% HELPER CODE FOR DEALING WITH EXTERNAL REFERENCES
\usepackage{xr}
\makeatletter
\newcommand*{\addFileDependency}[1]{
  \typeout{(#1)}
  \@addtofilelist{#1}
  \IfFileExists{#1}{}{\typeout{No file #1.}}
}
\makeatother

\newcommand*{\myexternaldocument}[1]{
    \externaldocument{#1}
    \addFileDependency{#1.tex}
    \addFileDependency{#1.aux}
}
%%% END HELPER CODE
\myexternaldocument{tramontano_678}

\begin{document}
\maketitle
\appendix
\section{Lemmas and Proofs}
\label{app:proofs}
\subsection{Section 2}
\begin{example}
\label{ex:two:vert}
Assume we have two random variables $X_1$ and $X_2$, coming from an unknown xSEM, and we wish to find the DAG $G$ that generates the model.  Assume further that $X_1$ and $X_2$ are correlated, leaving us with the two options
$G_1=([2],\{1\to 2\})$ and $G_2=([2],\{2\to 1\})$.  Under the model given by $G_1$, we have $X_1=\varepsilon_1$ and $X_2=\lambda_{1 2}X_1+\varepsilon_2$, and the covariance matrix from \eqref{eq:Sigma} becomes
\begin{equation}
    \Sigma=
        \begin{bmatrix}
            \varepsilon^{(2)}_1 & \lambda_{1 2}\varepsilon^{(2)}_1\\
            \lambda_{1 2}\varepsilon^{(2)}_1 & \lambda^2_{1 2}\varepsilon^{(2)}_1+\varepsilon^{(2)}_2
        \end{bmatrix}.
\end{equation}
Observe that any positive definite $2\times 2$ matrix $\Sigma$ can be written in this way;  set $\varepsilon^{(2)}_1=\Sigma_{1 1}>0$, $\lambda_{1 2}=\Sigma_{2 1}/\Sigma_{1 1}$ and $\varepsilon^{(2)}_2=\Sigma_{2 2}-\Sigma_{1 2}^2/\Sigma_{1 1}>0$.  By symmetry, the model given by $G_2$ also allows its covariance matrices to be any positive definite $2\times 2$ matrix.  Hence, the sets of covariance matrices resulting from $G_1$ versus $G_2$ are the same, and nothing can be said about the graph on the basis of covariances alone.

Now we see how to solve this identifiability issue using higher order cumulants. The multi-trek rule yields that
\begin{equation*}
    \begin{aligned}
            \mathcal{C}^{(3)}_{111}&=\varepsilon^{(3)}_1, & \mathcal{C}^{(3)}_{112}&=\varepsilon^{(3)}_1\lambda_{12},\\
                \mathcal{C}^{(3)}_{122}&=\varepsilon^{(3)}_1\lambda_{12}^2,
            &\mathcal{C}^{(3)}_{222}&=\varepsilon^{(3)}_1\lambda_{12}^3+\varepsilon^{(3)}_2.
    \end{aligned}
\end{equation*} 
We observe a set of simple relations that imply that the matrix 
\begin{equation*}
    A^{1\to 2,3}=
        \begin{bmatrix}
            \Sigma_{11} & \mathcal{C}^{(3)}_{111} & \mathcal{C}^{(3)}_{112}\\
            \Sigma_{12} & \mathcal{C}^{(3)}_{112} & \mathcal{C}^{(3)}_{122}
        \end{bmatrix}
\end{equation*}
has rank 1; see also \cite{wang:drton:2020} where the first two columns of the matrix are considered.  Indeed, the second row of the matrix equals $\lambda_{12}$ times the first row.  
However, this rank constraint generally does not hold in cumulants $(\tilde\Sigma,\tilde{\mathcal{C}}^{(3)})\in\mathcal{M}^{(\le 3)}(G_2)$, where $G_2$ is the DAG with edge $1\xleftarrow{}2$.  A straightforward calculation confirms that the rank of $A^{1\to 2,3}(\tilde\Sigma,\tilde{\mathcal{C}}^{(3)})$ drops to one iff $\lambda_{21}=0$ or $\varepsilon^{(3)}_1=\varepsilon^{(3)}_2=0$.  In other words, for correlated variables $X_1$ and $X_2$ generated with at least one of the errors non-Gaussian with nonzero third moment, the rank constraint on $A^{1\to 2,3}$ discriminates $G_1$ and $G_2$.
In short, these relations do not hold, in general, when the model is generated from the other graph, so checking whether or not they hold gives us a way to identify the right graph, up to a measure-zero set of parameters given by the intersection of $\mathcal{M}^{(\leq3)}(G_1)$ and $\mathcal{M}^{(\leq3)}(G_2)$.
\end{example}



\begin{proof}[Proof of Corollary~\ref{cor:simple-trek-rule}]
    We say that a $k$-trek $T=(P_1,\dots,P_k)$ factorizes through $k$-trek $S=(Q_1,\dots,Q_k)$ if $Q_j\subset P_j$ for all $j$.   Indeed, it is easy to see that $\lambda^T=\lambda^{S}\lambda^{T-S}$, where  $T-S=(P_1-Q_1,\dots,P_k-Q_k)\in\mathcal{T}^S=\mathcal{T}(\ttop(S),\dots,\ttop(S))$ with $P_j-Q_j$ being the directed path from $\ttop(T)$ to $\ttop(S)$ that remains after removing the edges in $Q_j$ from $P_j$.

For the sake of readability, when the considered set of vertices is clear from the context, we denote the set of treks (and simple treks) by only $\mathcal{T}$ (and $\mathcal{S}$).  Now note that every trek factorizes along one and only one simple trek.  Hence, we can partition $\mathcal{T}=\dot{\bigcup}_{S\in \mathcal{S}}\mathcal{T}^{S}$, according to the simple trek through which the factorization occurs.  The expression in~\eqref{eq:trek} may thus be rewritten as
\begin{equation*}
\begin{aligned}
        \mathcal{C}^{(k)}_{i_1,\dots,i_k}(G)=\displaystyle\sum_{S\in \mathcal{S}}\lambda^{S}\left(\displaystyle\sum_{T\in\mathcal{T}^{S}}\varepsilon^{(k)}_{\ttop(T)}\lambda^{T-S}\right).
\end{aligned}
\end{equation*}
By the multi-trek rule, the term in parentheses is  $\mathcal{C}^{(k)}_{\ttop(S)}(G)$.
\end{proof}

\begin{proof}[Proof of Corollary~\ref{cor:simple_trek_2}]
Since the graph is acyclic, the only trek in $\mathcal{T}(i,\dots,i)$ that has $i$ as its top is the trivial trek, from which comes the $\varepsilon_i^{(k)}$ term. All the other treks in $\mathcal{T}(i,\dots,i)$ factorize through a set of distinct parents of $i$, so we can write the sum in~\eqref{eq:trek} as
\begin{equation*}
    \sum_{p_1,\dots,p_k\in \pa(i)}\lambda_{p_1, i}\cdots\lambda_{p_k,i}\Bigg(\sum_{T\in\mathcal{T}(p_1,\dots,p_k)}\lambda^T\varepsilon^{(k)}_{\ttop(T)}\Bigg)+\varepsilon_i^{(k)},
\end{equation*}
and the term inside the parentheses is $\mathcal{C}^{(k)}_{p_1,\dots,p_k}(G)$.

\end{proof}

\begin{proof}[Proof of Proposition~\ref{prop:rank}]
  By the simple multi-trek rule for polytrees 
  %discussed previously, 
%  \[
$
  c^{e,k}_m =\lambda_{ij}^{k-m}\mathcal{C}_{i}^{(k)}.
$
%  \]    
  Therefore, $c^{e,k}_m=\lambda_{ij}c^{e,k}_{m+1}$ so that the second row of $A^{e,K}$ equals $\lambda_{ij}$ times the first row.
\end{proof}

\subsection{Section 3}
\begin{proof}[Proof of Proposition~\ref{prop:cl:corr}]
As noted, $\mathcal{M}(R)$ may be computed using Kruskal's algorithm, which considers all edges in decreasing order of their weights and adds them to the spanning tree if their presence does not create an (undirected) cycle.

Let $\mathcal{S}(G)$ be the skeleton of $G$.  For a contradiction, assume that $\mathcal{M}(R)\not=\mathcal{S}(G)$.  Since both graphs are trees, we have $L:=\mathcal{M}(R)\setminus\mathcal{S}(G)\neq\emptyset$. Take $\tilde{e}=\{i,j\}=\argmax_{e\in L}|\rho_{e}|$.  Then $|\rho_{\tilde{e}}|\neq0$ and, thus, $\mathcal{T}(i,j)\neq\emptyset$.   The unique trek in $G$ that connects $i$ and $j$ must contain an edge $e$ that is not in $\mathcal{M}(R)$; otherwise we would have two paths between $i$ and $j$ in $\mathcal{M}(R)$ which cannot occur as $\mathcal{M}(R)$ is a tree.  Moreover, Wright's formula from Lemma~\ref{lem:wright} and the assumption made on the correlation coefficients imply that $|\rho_{\tilde{e}}|<|\rho_{e}|$.  But then $e$ would appear before $\tilde{e}$ in Kruskal's algorithm and $e$ would be added to $\mathcal{M}(R)$ since all the edges in $\mathcal{M}(R)$ with a weight higher than the weight of $\tilde{e}$ are correctly classified and so the presence of $e$ could not create a loop. We have arrived at a contradiction.
\end{proof}

\begin{proof}[Proof of Theorem~\ref{theo:generic_cum}]
The first claim is a restatement of Proposition~\ref{prop:rank}. To prove the second claim, we need to show that for generic error cumulants, at least one of the $2\times 2$ subdeterminants of $A^{j\to i,K}$ is nonzero. Since these minors are polynomials in the cumulants, it is enough to prove that they are not identically zero; see, e.g., the lemma in \cite{okamoto:1973}.

By the simple multi-trek rule (Corollary~\ref{cor:simple-trek-rule}),
\begin{align*}
    &c^{j\to i,2}_1=\Sigma_{i,j}=\lambda_{i,j}\Sigma_{i,i}, \\
    &c^{j\to i,k}_m=\lambda^m_{i,j}\mathcal{C}^{(k)}_i \quad\forall m<k\leq K.
\end{align*}
%we have $c^{j\to i,2}_i=\Sigma_{i,j}=\lambda_{i,j}\Sigma_{i,i}$, and $c^{j\to i,s}_m=\lambda^m_{i,j}\mathcal{C}^{(s)}_i$ for every $m<s\leq K$, while from 
By Corollary~\ref{cor:simple_trek_2},
\begin{align*}
    c^{j\to i,2}_2=\Sigma_{j,j}&=\sum_{p,q\in\pa(j)}\lambda_{p,j}\lambda_{q,j}\Sigma_{p,q}+\varepsilon^{(2)}_j\\
    &=\sum_{p\in\pa(j)}\lambda_{p,j}^2\Sigma_{p,p}+\varepsilon_j^{(2)},
\end{align*}
%we have $c^{j\to i,2}_2=\Sigma_{j,j}=\sum_{p,q\in\pa(i)}\lambda_{p,j}\lambda_{q,j}\Sigma_{p,q}+\varepsilon^{(2)}_j$. 
because in a polytree any two distinct parents $p$ and $q$ have $\mathcal{T}(p,q)=\emptyset$ and, thus, $\Sigma_{p,q}=0$ by Lemma~\ref{lem:wright}.  

Consider now a minor involving the first column together with any other column with $m<k$.  This minor is
\begin{equation*}
    \begin{aligned}
        &\Sigma_{j,j}c^{j\to i,k}_{m-1}-\Sigma_{i,j}c^{j\to i,k}_{m}\\
        &=\Big[\sum_{p\in\pa(j)}\lambda_{p,j}^2\Sigma_{p,p}+\varepsilon_j^{(2)}\Big]\lambda^{m-1}_{i,j}\mathcal{C}^{(k)}_i-\lambda_{i,j}\Sigma_{i,i}\lambda^{m}_{i,j}\mathcal{C}^{(k)}_i\\
        &=\lambda^{m-1}_{i,j}\mathcal{C}^{(k)}_i\Big[\sum_{p\in\pa(j)}\lambda_{p,j}^2\Sigma_{p,p}+\varepsilon_j^{(2)}-\lambda^2_{i,j}\Sigma_{i,i}\Big]\\
        &=\lambda^{m-1}_{i,j}\mathcal{C}^{(k)}_i\Big[\sum_{p\in\pa(j)\setminus\{i\}}\lambda_{p,j}^2\Sigma_{p,p}+\varepsilon_j^{(2)}\Big].
    \end{aligned}
\end{equation*}
The term in brackets is always positive, while the front factor is nonzero provided $\lambda_{i,j}\not=0$ and the error cumulants are chosen such that $\mathcal{C}^{(k)}_i\not=0$ (e.g., take any distribution with $\varepsilon_l^{(k)}=0$ for every $l\neq i$, and $\varepsilon^{(k)}_i\neq0$).
\end{proof}

\begin{proof}[Proof of Proposition~\ref{prop:MEC}]
The claim follows from Lemma~\ref{lem:wright} as $i\to j\xleftarrow{}l$ is the only case in which there are no treks between $i,l$.
\end{proof}

\begin{proof}
From Theorem~\ref{theo:generic_cum} we derive the correctness of the rank condition and thus of the entire algorithm  \textit{PairwiseOrientation\_Pop} as well as the relevant parts of the other two algorithms.  Proposition~\ref{prop:MEC} yields the correctness of the remaining parts of the other two algorithms. 
\end{proof}

\subsection{Section 4}

\begin{lemma}
\label{lemma:conc:ineq}
Consider a degree $K$ polynomial $f(X)=f(X_1,\dots,X_m)$, where $X_1,\dots,X_m$ are possibly dependent random variables with a log-concave joint distribution on $\mathbb{R}^m$. Then there exists a constant $L>0$ such that for all $\delta$ with
\begin{equation*}
    \frac{2}{L}\left(\frac{\delta}{e\sqrt{\var[f(X)]}}\right)^{\frac{1}{K}}>2,
\end{equation*}
we have
\begin{equation*}
    \begin{aligned}
            &\mathbb{P}[|f(X)-\mathbb{E}[f(X)]|>\delta] \\    &\leq\exp\left\{-\frac{2}{L}\left(\frac{\delta}{\sqrt{\var[f(X)]}}\right)^{\frac{1}{K}}\right\}.
    \end{aligned}
\end{equation*}
\end{lemma}
\begin{proof}[Proof of Corollary~\ref{cor:log:cum}]
The result follows by bounding the variance of $\hat{c}^{(i,j),k}_m$.  To this end, we may express the variance as a linear combination of products of moments based on the definition of cumulants.  We may then bound each product of moments by a power of $M_K$ and note that no weight may exceed $(K-1)!$.  Applying Stirling's approximation for the factorial and bounding the Bell numbers that count the number of summands gives the result.
\end{proof}

\begin{proof}[Proof of Theorem~\ref{theo:chow:cons}]
Let $F$ be the event defined in Lemma~\ref{lemma:lower:bound}.  Then 
\begin{equation*}
    \begin{aligned}
    &\mathbb{P}(\mathcal{M}_n(G)=\mathcal{S}(G))\geq\mathbb{P}(F)\\
    &\geq 1-\sum_{i\le j} \mathbb{P}(|\hat{\rho}_{i,j}-\rho_{i,j}|>\gamma),
    \end{aligned}
\end{equation*}
where the last inequality comes from the union bound. 

Correlations are scale-invariant, thus the value of $\rho_{i,j}$ and the distribution of $\hat\rho_{i,j}$ do not change under rescaling of the observed variables to $\tilde{X}_i=X_i/\lambda$.  Now, we apply Lemma~\ref{lemma:sym:matrices} with $A=\Tilde{\Sigma}$ and $B=\Tilde{\Sigma}^*$ as the new population and sample covariance matrices, respectively. Let $||\Tilde{\epsilon}||_{\infty}=||\Tilde{\Sigma}- \Tilde{\Sigma}^*||_{\infty}$. Then $||\Tilde{\epsilon}||_{\infty}\leq\frac{||\epsilon^{(i,j),2}||_{\infty}}{\lambda}$, where we indicate with $\epsilon^{(i,j),2}$ the vector containing the errors $\epsilon^{(i,j),2}_m$, for $m=0,1,2$.  This allows us to write
\begin{equation*}
    \begin{aligned}
        \mathbb{P}(|\hat{\rho}_{i,j}-\rho_{i,j}|>\gamma)&\leq\mathbb{P}(||\Tilde{\epsilon}||_\infty>\frac{\gamma}{2+\gamma})\\
        &\leq\mathbb{P}(||\epsilon^{(i,j),2}||_\infty>\frac{\lambda\gamma}{2+\gamma})\\ &\leq3\exp\left\{-\frac{1}{2L\sqrt{M_2}}\left(\frac{\lambda\gamma\sqrt{n}}{2+\gamma}\right)^{\frac{1}{2}}\right\},
    \end{aligned}
\end{equation*}
where the last inequality comes from a union bound and Corollary~\ref{cor:log:cum} with $K=2$.
\end{proof}

\begin{proof}[Proof of Lemma~\ref{lem:taylor}]
\label{Taylor}
By Taylor expansion, we have
\begin{equation}
    \begin{aligned}
    f(c+\epsilon)-f(c)=\nabla(f)_{|c}\cdot\epsilon+\frac{\epsilon^t\cdot H(f)_{|c}\cdot\epsilon}{2},
    \end{aligned}
\end{equation}
where $H(f)_{|c}$ is the Hessian matrix of $f$ computed in $c$.  For the special quadratic polynomial $f$, the Hessian is constant with entries $\pm 1$, and the gradient contains entries of $c$, possibly negated.  The result follows by triangle inequality.
\end{proof}

\begin{proof}[Proof of Theorem~\ref{theo:po:cons}]
The event of correctly reconstructing the polytree $G$ is the intersection of the two events of correctly recovering the skeleton and correctly orienting all the edges.  The probability of this intersection is bounded from below by the sum of the probabilities of the two events minus one.  Theorem~\ref{theo:chow:cons} and Lemma~\ref{lem:po:lower:bound} imply the result.
\end{proof}


\begin{proof}[Proof of Lemma~\ref{lem:po:lower:bound}]
Given that we start with the correct skeleton, a mistake in the orientation appears if and only if there is an edge $e$ in the skeleton for which $||\hat{v}_{r}(e)||\geq||\hat{v}_{w}(e)||$, so the union bound leads to the following lower bound:
\begin{equation*}
    \begin{aligned}
        &\mathbb{P}(\mathcal{A}_n^{PO}(E_{\mathcal{S}(G)},K)=G)\\
        &=\mathbb{P}(||\hat{v}_{r}(e)||<||\hat{v}_{w}(e)||, \forall e\in\mathcal{S}(G))\\
        &\geq1-(p-1)\mathbb{P}(||\hat{v}_{r}(e)||\geq||\hat{v}_{w}(e)||).
    \end{aligned}
\end{equation*}
Now we need an upper bound on $\mathbb{P}(||\hat{v}_{r}(e)||\geq||\hat{v}_{w}(e)||)$. Adding and subtracting $||v_{w}(e)||$ on the right-hand side of the inequality and using $||v_{w}(e)||>\delta$ leads to
\begin{equation*}
    \begin{aligned}
    &\mathbb{P}(||\hat{v}_{r}(e)||\geq||\hat{v}_{w}(e)||)\\
    &\leq\mathbb{P}(||\hat{v}_{r}(e)||+(||v_{w}(e)||-||\hat{v}_w(e)||)\geq\delta)\\
    &\leq\mathbb{P}(||\hat{v}_{r}(e)||+(||v_{w}(e)-\hat{v}_w(e)||)\geq\delta)\\
    &\leq\mathbb{P}(||\hat{v}_{r}(e)||\geq\tfrac{\delta}{2})+\mathbb{P}(||v_{w}(e)-\hat{v}_w(e)||\geq\tfrac{\delta}{2})\\
    &=\mathbb{P}(||\hat{v}_r(e)||^2\geq\tfrac{\delta^2}{4})+\mathbb{P}(||v_{w}(e)-\hat{v}_w(e)||^2\geq\tfrac{\delta^2}{4}).
    \end{aligned}
\end{equation*}
Now, using that $v_r(e)=0$ and applying Lemma~\ref{lem:taylor}, we obtain that
\begin{equation*}
    \begin{aligned}
    &\mathbb{P}(||\hat{v}_r(e)||^2\geq\tfrac{\delta^2}{4})+\mathbb{P}(||v_{w}(e)-\hat{v}_w(e)||^2\geq\tfrac{\delta^2}{4})\\
    &\leq\mathbb{P}(||\hat{v}^i_r(e)||^2\geq\tfrac{\delta^2}{4B(K)})+\\
    &\qquad+\mathbb{P}(||v^i_{w}(e)-\hat{v}^i_w(e)||^2\geq\tfrac{\delta^2}{4B(K)})\\
    &\leq2\mathbb{P}(4M_K||\epsilon||_{\infty}+2||\epsilon||_{\infty}^2\geq\tfrac{\delta}{2\sqrt{B(K)}}),
    \end{aligned}
\end{equation*}
where $v^i$ is one of the components of the vector. Finally, we obtain that
\begin{equation*}
    \begin{aligned}
        &\mathbb{P}(4M_K||\epsilon||_{\infty}+2||\epsilon||_{\infty}^2\geq\tfrac{\delta}{2\sqrt{B(K)}})\\
        &\leq\mathbb{P}(||\epsilon||_{\infty}\geq\tfrac{\delta}{4M_K\sqrt{B(K)}})+\mathbb{P}(||\epsilon||_{\infty}\geq\tfrac{\sqrt{\delta}}{\sqrt[4]{4B(K)}}).
    \end{aligned}
\end{equation*}
With $\delta':=\min\{\frac{\delta}{4M_K\sqrt{B(K)}},\frac{\sqrt{\delta}}{\sqrt[4]{4B(K)}}\}$ we can bound the right hand side of the last inequality by
\begin{equation*}
    \begin{aligned}
        2\mathbb{P}(||\epsilon||_\infty\geq\delta')\leq 2B(K)\exp\left\{-\tfrac{2}{LK^2\sqrt{M_K}}\left(\delta'\sqrt{n}\right)^{\frac{1}{K}}\right\}
    \end{aligned}
\end{equation*}
whenever $\frac{2}{LK^2\sqrt{M_K}}\left(\frac{\delta'\sqrt{n}}{e}\right)^{\frac{1}{K}}>2$ holds, and this concludes the proof.
\end{proof}



\begin{proof}[Proof of Lemma~\ref{lem:rho_crit}]
It suffices to show that under the event $\tilde{F}=\bigcap\{|\hat{\rho}_{i,j}-\rho_{i,j}|<\tilde{\gamma}\}$ all independence tests give the correct decision.  This claim was proven in \citet[Theorem~3.5]{lou:2021}.
The rest of the proof is the same as for Theorem~\ref{theo:chow:cons}.
\end{proof}

\begin{proof}[Proof of Theorem~\ref{theo:pt_tp:cons}]
Having $\tilde{\gamma}\leq\gamma$ from Lemma~\ref{lemma:lower:bound} and Lemma~\ref{lem:rho_crit}, the probability that the skeleton is recovered correctly and that all the independence tests give the right answer is bounded from below by the probability of the event considered in Lemma~\ref{lemma:lower:bound}. Hence, it only remains to bound the probability that all cumulant tests correctly orient their respective edge. This can be done as in Lemma~\ref{lem:po:lower:bound}. Finally, carrying over the arguments from the proof of Theorem~\ref{theo:po:cons} gives the result. Notice that in the case of Algorithm~\ref{alg:samp:trip_pair}, the constant $\alpha^*<p-1-\textit{CP}$, where $\textit{CP}$ is the number of directed edges in the CPAG associated to $G$. 
\end{proof}


\section{Sample Version of the Algorithms}
\label{app:sec:samp:alg}

\begin{algorithm}[H]\caption{PairwiseOrientation$(E,K)$}
\label{alg:samp:pair}
    \begin{algorithmic}[1]
        \State{$O\gets\emptyset$}
        \For{$e=\{i,j\}\in E$} 
            \If{$||\hat{v}_{i\to j}(\{i,j\})||<||\hat{v}_{j\to i}(\{i,j\})||$}
                \State{$O\gets O\cup\{i\to j\}$}
                \Else\,  \State{$O\gets O\cup\{j\to i\}$}
            \EndIf
        \EndFor
    \State \Return $O$
    \end{algorithmic}
\end{algorithm}

\begin{algorithm}[H]\caption{PTO$(E,K,\rho_{\theta})$}
\label{alg:samp:pair_trip}
    \begin{algorithmic}[1]
        \State{$O\gets\emptyset$}
        \For{$i-j-k\in E$}
            \If{$|\hat{\rho}_{i,k}|<\rho_{\theta}$}
                \State{$E\gets E\setminus\{\{i,j\},\{j,k\}\}$}
                \State{$O\gets O\cup\{i\to j, k\to j\}$}
            \EndIf
        \EndFor
        \For{$i\to j\in O$}
            \For{$j-l\in E$}
                \State{$E\gets E\setminus\{j-l\}$}
                \State{$O\gets O\cup\{j\to l\}$}
            \EndFor
        \EndFor
        \For{$\{i,j\}\in E$}
            \If{$||\hat{v}_{i\to j}(\{i,j\})||<||\hat{v}_{j\to i}(\{i,j\})||$}
                \State{$O\gets O\cup\{i\to j\}$}
                \For{$j-l\in E$}
                    \State{$E\gets E\setminus\{j-l\}$}
                    \State{$O\gets O\cup\{j\to l\}$}
                \EndFor
            \Else
                \State{$O\gets O\cup\{j\to i\}$}
                \For{$i-l\in E$}
                    \State{$E\gets E\setminus\{i-l\}$}
                    \State{$O\gets O\cup\{i\to l\}$}
                \EndFor
            \EndIf
        \EndFor
        \State \Return $O$
    \end{algorithmic}
\end{algorithm}
\vfill\mbox{ }

\begin{algorithm}[H]\caption{TPO$(E,K,O,o,\rho_{\theta})$}
\label{alg:samp:trip_pair}
    \begin{algorithmic}[1]
        \If{$E\neq\emptyset$}
            \If{$o=\emptyset$}
                \State{$\{i,j\}\gets E[1]$}
                \If{$||\hat{v}_{i\to j}(\{i,j\})||<||\hat{v}_{j\to i}(\{i,j\})||$}
                    \State{$o\gets (i\to j)$}
                    \State{$O\gets O\cup\{o\}$}
                    \Else  
                        \State{$o\gets (j\to i)$}
                        \State{$O\gets O\cup\{o\}$}
                \EndIf
            \EndIf
            \State{$E_o\gets E\cap t(o)$}
            \If{$E_o\neq\emptyset$}
                \State{$E\gets E\setminus E_o$}
                \For{$t(o)-k\in E_o$}
                    \If{$|\hat{\rho}_{s(o),k}|<\rho_{\theta}$}
                        \State{$O\gets O\cup\{k\to t(o)\}$}
                    \Else
                        \State{$O\gets O\cup\{t(o)\to k\}$}
                        \State{$o\gets(t(o)\to k)$}
                        \State{$O,E\gets \mathrm{TPO}(E,K,O,o,\rho_{\theta})$}
                    \EndIf
                \EndFor
            \EndIf
            \State{$O,E\gets \mathrm{TPO}(E,K,O,\emptyset,\rho_{\theta})$}
        \EndIf
        \State \Return $O,E$
    \end{algorithmic}
\end{algorithm}

\bibliography{tramontano_678}

\end{document}
