
\subsection{Proof of Theorem~\ref{theorem:bivariate_identifiability}}

\begin{proof}
  If \(\mathcal{G}_0\) is the empty graph, we have \(\mathbf{X}_1 \indep \mathbf{X}_2\). If the graph is not empty and we have \(\mathbf{X}_1 \indep \mathbf{X}_2\), causal minimality is violated. Hence, we assume that the graph is not empty, that \(\mathbf{X}_1 \not\indep \mathbf{X}_2\), and, without loss of generality, that \(\mathbf{X}_1 \to \mathbf{X}_2\) in \(\mathcal{G}_0\). The joint density then has the following form
  \begin{equation*}
    p_{\mathbf{X}_1,\mathbf{X}_2}(\mathbf{x}_1, \mathbf{x}_2) = p_{\mathbf{X}_1}(\mathbf{x}_1)p_{\mathbf{N}_2}({\mathbf{x}_2} - f_2(\mathbf{x}_1)).
  \end{equation*}
  Let us assume that \(\mathcal{G}_0\) is not identifiable from \(P_\mathbf{X}\) alone, i.e., that there exists a backward model of the same form
  \begin{equation*}
    p_{\mathbf{X}_1,\mathbf{X}_2}(\mathbf{x}_1, \mathbf{x}_2) = p_{{\mathbf{X}_2}}({\mathbf{x}_2})p_{\mathbf{N}_1}(\mathbf{x}_1-f_1({\mathbf{x}_2})).
  \end{equation*}
  Define
  \begin{equation}\label{eq:forwards_model}
    \pi_1(\mathbf{x}_1,\mathbf{x}_2) \coloneqq \nu({\mathbf{x}_2}-f_2(\mathbf{x}_1)) + \xi(\mathbf{x}_1)
  \end{equation}
  and
  \begin{equation}\label{eq:backwards_model}
    \pi_2(\mathbf{x}_1,\mathbf{x}_2) \coloneqq \tilde\nu(\mathbf{x}_1-f_1({\mathbf{x}_2})) + \eta({\mathbf{x}_2}),
  \end{equation}
  where \(\nu \coloneqq \log p_{\mathbf{N}_2}\), \(\xi \coloneqq \log p_{\mathbf{X}_1}\), \(\tilde{\nu} \coloneqq \log p_{\mathbf{N}_1}\), and \(\eta \coloneqq \log p_{\mathbf{X}_2}\). Clearly, we have that \(\pi_1(\mathbf{x}_1,\mathbf{x}_2) = \pi_2(\mathbf{x}_1,\mathbf{x}_2) = \log p_{\mathbf{X}_1,\mathbf{X}_2}(\mathbf{x}_1,\mathbf{x}_2)\).

  Considering first the backward model (Eq.~\eqref{eq:backwards_model}), we derive the gradient \(\nabla_{\mathbf{x}_1} \pi_2(\mathbf{x}_1,\mathbf{x}_2)\) with respect to \(\mathbf{x}_1\), i.e.,
  \begin{equation}
    \nabla_{\mathbf{x}_1} \pi_2(\mathbf{x}_1,\mathbf{x}_2) = \nabla \tilde\nu(\tilde{\mathbf{u}}),
  \end{equation}
  where \(\tilde{\mathbf{u}} \coloneqq \tilde{\mathbf{u}}(\mathbf{x}_1,\mathbf{x}_2) \coloneqq \mathbf{x}_1- f_1({\mathbf{x}_2})\).
  Then, the second derivatives take the following form
  \begin{equation}
    D_{\mathbf{x}_1\mathbf{x}_2}\pi_2(\mathbf{x}_1,\mathbf{x}_2) = -\mathbf{J}_{f_1}({\mathbf{x}_2})^\top \mathbf{H}_{\tilde\nu}(\tilde{\mathbf{u}}),
  \end{equation}
  and
  \begin{equation}
    D_{\mathbf{x}_1\mathbf{x}_1} \pi_2(\mathbf{x}_1,\mathbf{x}_2) = \mathbf{H}_{\tilde\nu}(\tilde{\mathbf{u}}),
  \end{equation}
  with Jacobian \(\mathbf{J}_{f_1} \in \mathbb{R}^{d_{x_1} \times d_{x_2}}\) and Hessian \(\mathbf{H}_{\tilde\nu} \in \mathbb{R}^{d_{x_1} \times d_{x_1}}\). Since we have assumed that \(f_1\) and \(f_2\) are three times continuously differentiable, the Hessian \(\mathbf{H}_{\tilde\nu}\) is symmetric (Schwarz's theorem) and invertible such that
  \begin{equation*}
    \begin{split}
      {Q}_2(\mathbf{x}_1,\mathbf{x}_2) &\coloneqq \left(D_{\mathbf{x}_1\mathbf{x}_1} \pi_2(\mathbf{x}_1,\mathbf{x}_2)\right)^{-1} \left(D_{\mathbf{x}_1\mathbf{x}_2}\pi_2(\mathbf{x}_1,\mathbf{x}_2)\right)^\top  \\
      &= \mathbf{H}_{\tilde\nu}(\tilde{\mathbf{u}})^{-1}(-\mathbf{J}_{f_1}({\mathbf{x}_2})^\top \mathbf{H}_{\tilde\nu}(\tilde{\mathbf{u}}))^{\top} \\
      &= -\mathbf{J}_{f_1}({\mathbf{x}_2}),
    \end{split}
  \end{equation*}
  which does not depend on \(\mathbf{x}_1\).

  Now, we repeat the above steps for Eq.~\eqref{eq:forwards_model}, i.e.,
  \begin{equation}
    \nabla_{\mathbf{x}_1} \pi_1(\mathbf{x}_1,\mathbf{x}_2) = -\mathbf{J}_{f_2}(\mathbf{x}_1)^\top \nabla \nu\bigl(\mathbf{u}\bigr) + \nabla \xi(\mathbf{x}_1),
  \end{equation}
  and
  \begin{equation}
    D_{\mathbf{x}_1\mathbf{x}_2}\pi_1(\mathbf{x}_1,\mathbf{x}_2) = -\mathbf{J}_{f_2}(\mathbf{x}_1)^\top \mathbf{H}_{\nu}\bigl(\mathbf{u}\bigr),
  \end{equation}
  where \(\mathbf{u} \coloneqq \mathbf{u}(\mathbf{x}_1,\mathbf{x}_2) \coloneqq \mathbf{x}_2 - f_2(\mathbf{x}_1)\), \(\mathbf{J}_{f_2}(\mathbf{x}_1) \in \mathbb{R}^{d_{x_2} \times d_{x_1}}\), \(\nabla \xi(\mathbf{x}_1) \in \mathbb{R}^{d_{x_1}}\), and \(\mathbf{H}_{\nu}\bigl(\mathbf{u}\bigr) \in \mathbb{R}^{d_{x_2} \times d_{x_2}}\). Finally,
  \begin{align}
    D_{\mathbf{x}_1\mathbf{x}_1}\pi_1(\mathbf{x}_1,\mathbf{x}_2) &= \mathbf{H}_{\xi}(\mathbf{x}_1)
    - \mathbf{H}_{f_2}(\mathbf{x}_1)[\nabla \nu\bigl(\mathbf{u}\bigr)] \\
    &+ \mathbf{J}_{f_2}(\mathbf{x}_1)^\top \mathbf{H}_{\nu}\bigl(\mathbf{u}\bigr) \mathbf{J}_{f_2}(\mathbf{x}_1),
  \end{align}
  where \(\mathbf{H}_{\xi}(\mathbf{x}_1) \in \mathbb{R}^{d_{x_1} \times d_{x_1}}\) and the Hessian \(\mathbf{H}_{f_2} \in \mathbb{R}^{d_{x_2} \times d_{x_1} \times d_{x_1}}\) is a third-order tensor such that the contraction with the vector \(\nabla \nu\bigl(\mathbf{u}\bigr)\) may be written as \(\mathbf{H}_{f_2}[\nabla \nu\bigl(\mathbf{u}\bigr)]\), i.e.
  \begin{equation*}
    \left( \mathbf{H}_{f_2}(\mathbf{x}_1)[\nabla \nu\bigl(\mathbf{u}\bigr)] \right)_{ik}
    =\sum_{j=1}^{d_{x_2}} \frac{\partial^2 f_{2,j}(\mathbf{x}_1)}{\partial x_{1i} \partial x_{1k}}\,\left[\nabla \nu\bigl(\mathbf{u}\bigr)\right]_j.
  \end{equation*}
  We find
  \begin{align*}
    Q_1(\mathbf{x}_1, \mathbf{x}_2) &\coloneqq (D_{\mathbf{x}_1\mathbf{x}_1}\pi_1(\mathbf{x}_1,\mathbf{x}_2))^{-1}(D_{\mathbf{x}_1\mathbf{x}_2}\pi_1(\mathbf{x}_1,\mathbf{x}_2)) \\
    &= \bigg[\mathbf{H}_{\xi}(\mathbf{x}_1)
      - \mathbf{H}_{f_2}(\mathbf{x}_1)[\nabla \nu(\mathbf{u})]\\
    &+ \mathbf{J}_{f_2}(\mathbf{x}_1)^\top \mathbf{H}_{\nu}\bigl(\mathbf{u}\bigr) \mathbf{J}_{f_2}(\mathbf{x}_1)\bigg]^{-1} \\
    &\ [-\mathbf{J}_{f_2}(\mathbf{x}_1)^\top \mathbf{H}_{\nu}(\mathbf{u})].
  \end{align*}

  Recall that \(Q_2(\mathbf{x}_1, \mathbf{x}_2) = Q_2(\mathbf{x}_2)\) is essentially a function only of \({\mathbf{x}_2}\) such that \(D_{\mathbf{x}_1} Q_2(\mathbf{x}_1, \mathbf{x}_2) = \mathbf{0} \in \mathbb{R}^{d_{x_1}\times d_{x_1} \times d_{x_2}}\). Since \(Q_1(\mathbf{x}_1, \mathbf{x}_2) = Q_2(\mathbf{x}_1, \mathbf{x}_2)\), taking the derivative with respect to \(\mathbf{x}_1\) yields
  \begin{align*}
    D_{\mathbf{x}_1}  Q_1(\mathbf{x}_1, \mathbf{x}_2) &= D_{\mathbf{x}_1}  \big[(D_{\mathbf{x}_1\mathbf{x}_1}\pi_1(\mathbf{x}_1,\mathbf{x}_2))^{-1} \\
    & \quad (D_{\mathbf{x}_1\mathbf{x}_2}\pi_1(\mathbf{x}_1,\mathbf{x}_2))\big] \\
    &= \mathbf{0} \in \mathbb{R}^{d_{x_1}\times d_{x_1} \times d_{x_2}}.
  \end{align*}
  Note that each slice \(\partial Q_1(\mathbf{x}_1, \mathbf{x}_2)/\partial x_{1k}\) is matrix-valued, and \(d_{x_1}\)-many such slices exist. Thus, we apply the product rule component-wise for each slice of the tensor and subsequently stack them together, i.e.
  \begin{align*}
    D_{\mathbf{x}_1} Q_1(\mathbf{x}_1, \mathbf{x}_2) &= D_{\mathbf{x}_1}(D_{\mathbf{x}_1\mathbf{x}_1}\pi_1(\mathbf{x}_1,\mathbf{x}_2)^{-1})\\
    &\quad D_{\mathbf{x}_1\mathbf{x}_2}\pi_1(\mathbf{x}_1,\mathbf{x}_2) \\
    &+ (D_{\mathbf{x}_1\mathbf{x}_1}\pi_1(\mathbf{x}_1,\mathbf{x}_2))^{-1} \\
    &\quad D_{\mathbf{x}_1}(D_{\mathbf{x}_1\mathbf{x}_2}\pi_1(\mathbf{x}_1,\mathbf{x}_2)).
  \end{align*}
  Using the identity
  \begin{equation*}
    D_{\mathbf{x}_1}(\mathbf{A})^{-1}
    = - (\mathbf{A})^{-1}
    D_{\mathbf{x}_1}\mathbf{A} (\mathbf{A})^{-1},
  \end{equation*}
  for \(\mathbf{A} \coloneqq \mathbf{A}(\mathbf{x}_1,\mathbf{x}_2) \in \mathbb{R}^{d_{x_1}\times d_{x_1}}\) invertible, together with \(D_{\mathbf{x}_1} Q_1(\mathbf{x}_1, \mathbf{x}_2) = \mathbf{0}\), we obtain
  \begin{multline*}
    -(D_{\mathbf{x}_1\mathbf{x}_1}\pi_1)^{-1}
    D_{\mathbf{x}_1}(D_{\mathbf{x}_1\mathbf{x}_1}\pi_1) (D_{\mathbf{x}_1\mathbf{x}_1}\pi_1)^{-1} D_{\mathbf{x}_1\mathbf{x}_2}\pi_1 \\
    = -(D_{\mathbf{x}_1\mathbf{x}_1}\pi_1)^{-1}
    D_{\mathbf{x}_1} (D_{\mathbf{x}_1\mathbf{x}_2}\pi_1),
  \end{multline*}
  where we drop the arguments of \(\pi_1(\mathbf{x}_1,\mathbf{x}_2)\), i.e., \(\pi_1 \coloneqq \pi_1(\mathbf{x}_1,\mathbf{x}_2)\) to improve readability.
  Making sure that all of the matrix-tensor products act on the appropriate indices of the tensors, the expression simplifies to
  \begin{equation*}
    D_{\mathbf{x}_1}(D_{\mathbf{x}_1\mathbf{x}_1}\pi_1) (D_{\mathbf{x}_1\mathbf{x}_1}\pi_1)^{-1} D_{\mathbf{x}_1\mathbf{x}_2}\pi_1 = D_{\mathbf{x}_1} (D_{\mathbf{x}_1\mathbf{x}_2}\pi_1).
  \end{equation*}
  Note that only \(D_{\mathbf{x}_1}(D_{\mathbf{x}_1\mathbf{x}_1}\pi_1)\) contains third order derivatives of the log marginal \(\xi\). Let us state this more explicitly.
  \begin{align*}
    D_{\mathbf{x}_1}(D_{\mathbf{x}_1\mathbf{x}_1}\pi_1) &= D_{\mathbf{x}_1}\Big[\mathbf{H}_\xi(\mathbf{x}_1) - \mathbf{H}_{f_2}(\mathbf{x}_1)[\nabla \nu(\mathbf{u})] \\
    &+ \mathbf{J}_{f_2}(\mathbf{x}_1)^\top \mathbf{H}_{\nu}(\mathbf{u}) \mathbf{J}_{f_2}(\mathbf{x}_1) \Big] \\
    &= D_{\mathbf{x}_1}\mathbf{H}_\xi(\mathbf{x}_1) - D_{\mathbf{x}_1}(\mathbf{H}_{f_2}(\mathbf{x}_1)[\nabla \nu(\mathbf{u})]) \\
    &+ D_{\mathbf{x}_1}(\mathbf{J}_{f_2}(\mathbf{x}_1)^\top \mathbf{H}_{\nu}(\mathbf{u}) \mathbf{J}_{f_2}(\mathbf{x}_1)).
  \end{align*}
  And finally we have
  \begin{multline}\label{app_eq:tensor_differential_eq}
    D_{\mathbf{x}_1}\mathbf{H}_\xi(\mathbf{x}_1) (D_{\mathbf{x}_1\mathbf{x}_1}\pi_1)^{-1} D_{\mathbf{x}_1\mathbf{x}_2}\pi_1 \\
    \begin{aligned}
      = {} &D_{\mathbf{x}_1} (D_{\mathbf{x}_1\mathbf{x}_2}\pi_1) + \Big[
        D_{\mathbf{x}_1}\big( \mathbf{H}_{f_2}(\mathbf{x}_1)[\nabla \nu(\mathbf{u})] \big) \\
        &- D_{\mathbf{x}_1}\big( \mathbf{J}_{f_2}(\mathbf{x}_1)^\top \mathbf{H}_{\nu}(\mathbf{u}) \mathbf{J}_{f_2}(\mathbf{x}_1) \big)
      \Big] \\
      &(D_{\mathbf{x}_1\mathbf{x}_1}\pi_1)^{-1} D_{\mathbf{x}_1\mathbf{x}_2}\pi_1,
    \end{aligned}
  \end{multline}
  where the remaining second order derivatives of the log marginal \(\xi\) are contained in the expression for \(D_{\mathbf{x}_1\mathbf{x}_1}\pi_1\).
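  Hence, if a backward model exists, the forward model necessarily satisfies the differential equation in Eq.~\eqref{app_eq:tensor_differential_eq}. This contradicts the assumption that \(P_\mathbf{X}\) is generated from an \emph{identifiable bivariate} GANM.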
\end{proof}

\subsection{Proof of Corollary~\ref{corollary:multivariate_identifiability}}

\begin{proof}
  Suppose there are two identifiable GANMs that both induce the distribution \(P_\mathbf{X}\) with DAGs \(\mathcal{G}_0\) and \(\mathcal{G}_0^\prime\), respectively. For any two groups \(\mathbf{X}_Q, \mathbf{X}_R\) that satisfy Proposition 29 in~\citet{Peters2014} we consider the set of parents without \(R\) in \(\mathcal{G}_0\), i.e., \({pa}_Q \coloneqq pa_{\mathcal{G}_0}({Q}) \setminus R\), and the set of parents without \(Q\) in \(\mathcal{G}_0^\prime\), i.e., \({pa}_R \coloneqq pa_{\mathcal{G}_0^\prime}({R}) \setminus Q\). Denote their union by \(S \coloneqq {pa}_Q \cup {pa}_R\). For any \(s = ({q},{r})\) we write \(\mathbf{X}^*_{Q} = \mathbf{X}_{Q} \mid_{S=s}\) and \(\mathbf{X}^*_{R} = \mathbf{X}_{R} \mid_{S=s}\). From Lemma~\ref{lemma:ancestor_independence}, we have that \(\mathbf{N}_Q \indep (\mathbf{X}_R, \mathbf{X}_S)\) and \(\mathbf{N}_R \indep (\mathbf{X}_Q, \mathbf{X}_S)\). Thus, by Lemma 2 in~\citet{Peters2011}, we have the following bivariate GANM in \(\mathcal{G}_0\)
  \begin{equation*}
    \mathbf{X}^*_{Q} = f_{Q}(\mathbf{X}_q, \mathbf{X}^*_{R}) + \mathbf{N}_Q, \quad \mathbf{N}_Q \indep \mathbf{X}^*_{R},
  \end{equation*}
  and in \(\mathcal{G}_0^\prime\)
  \begin{equation*}
    \mathbf{X}^*_{R} = f_{R}(\mathbf{X}_r, \mathbf{X}^*_{Q}) + \mathbf{N}_R, \quad \mathbf{N}_R \indep \mathbf{X}^*_{Q}.
  \end{equation*}
  However, this is a contradiction, since in Corollary~\ref{corollary:multivariate_identifiability} we can choose any \(s=(q,r)\) that ensures that the bivariate GANMs are identifiable.
\end{proof}

\subsection{Proof of Lemma~\ref{lemma:ancestor_independence}}

\begin{proof}
  Write \(\mathbf{S} = (S_1, \dots, S_k)\) such that \(\mathbf{S} = (f_{S_1}(\mathbf{X}_{pa(S_1)}, \mathbf{N}_{S_1}), \ldots, f_{S_k}(\mathbf{X}_{pa(S_k)}, \mathbf{N}_{S_k}) )\). It takes finitely many steps to recursively substitute the parents of \(S_i, i \in [k]\) by the corresponding structural equation such that \(\mathbf{S} = f(\mathbf{N}_{A_1}, \ldots, \mathbf{N}_{A_l})\) with \(\{A_1, \ldots, A_l\}\) the set of all ancestors of nodes in \(\mathbf{S}\) that do not contain the node \(g\). The statement follows from the joint independence of the noise variables across groups.
\end{proof}

% \begin{lemma}
%   Consider the setting of Theorem~\ref{theorem:bivariate_identifiability}. If \(\mathbf{X}\) and \(\mathbf{N}_{\mathbf{Y}}\) are both multivariate Gaussian and the triple \((f, P(\mathbf{X}), P(\mathbf{N}_{\mathbf{Y}}))\) solves the differential equation in Condition~\ref{cond:identifiability}, then \(f\) must be a linear function.
% \end{lemma}
% \begin{proof}
%   We start by noticing that
%   \begin{multline*}
%     D_{\mathbf{xy}}\pi_i(\mathbf{x},\mathbf{y})\cdot D_\mathbf{x}\left(D_{\mathbf{xx}}\pi_i(\mathbf{x},\mathbf{y})\right) \\
%     = D_{\mathbf{xx}}\pi_i(\mathbf{x},\mathbf{y})\cdot D_\mathbf{x}\left(D_{\mathbf{xy}}\pi_i(\mathbf{x},\mathbf{y})\right),
%   \end{multline*}
%   for \(i \in \{1, 2\}\). Plugging in yields
%   \begin{equation}\label{eq:gaussian_identity}
%     \mathbf{B}_{\mathbf{x}\mathbf{y}} D_\mathbf{x}(\mathbf{A}_{\mathbf{x}\mathbf{x}}) = \mathbf{A}_{\mathbf{x}\mathbf{x}} D_\mathbf{x}(\mathbf{B}_{\mathbf{x}\mathbf{y}}).
%   \end{equation}
%   Since \(P(\mathbf{X})\) and \(P(\mathbf{N}_{\mathbf{Y}})\) are both multivariate Guassian, the third derivatives of the corresponding log-densities vanish and the second derivative is constant everywhere, i.e. \(D_\mathbf{x}\mathbf{H}_{\nu}(\mathbf{u}) = \mathbf{0}\), \(D_{\mathbf{x}}\mathbf{H}_\xi(\mathbf{x}) = \mathbf{0}\), \(\mathbf{H}_\nu(\mathbf{u}) = \mathbf{C}_\nu \in \mathbb{R}^{d_y \times d_y}\), and \(\mathbf{H}_\xi(\mathbf{x}) = \mathbf{C}_\xi \in \mathbb{R}^{d_x \times d_x}\) such that
%   \begin{align*}
%     D_\mathbf{x}(D_{\mathbf{xy}}\pi_1(\mathbf{x},\mathbf{y})) &= D_\mathbf{x}(-\mathbf{J}_f(\mathbf{x})^\top \mathbf{H}_{\nu}(\mathbf{u})) \\
%     &= -\mathbf{H}_f(\mathbf{x})\mathbf{C}_\nu,
%   \end{align*}
%   and
%   \begin{align*}
%     D_\mathbf{x}(D_{\mathbf{xx}}\pi_1(\mathbf{x},\mathbf{y})) &= D_\mathbf{x}(\mathbf{H}_{\xi}(\mathbf{x})
%       - \mathbf{H}_f(\mathbf{x})[\nabla \nu\bigl(\mathbf{u}\bigr)] \\
%     &+ \mathbf{J}_f(\mathbf{x})^\top \mathbf{H}_{\nu}\bigl(\mathbf{u}\bigr) \mathbf{J}_f(\mathbf{x})), \\
%     &= D_\mathbf{x}(\mathbf{H}_{\xi}(\mathbf{x})) - D_\mathbf{x}(\mathbf{H}_f(\mathbf{x})[\nabla \nu\bigl(\mathbf{u}\bigr)]) \\
%     &+ D_\mathbf{x}(\mathbf{J}_f(\mathbf{x})^\top \mathbf{H}_{\nu}\bigl(\mathbf{u}\bigr) \mathbf{J}_f(\mathbf{x})) \\
%     &= -D_\mathbf{x}(\mathbf{H}_f(\mathbf{x})[\nabla \nu\bigl(\mathbf{u}\bigr)]) \\
%     &+ D_\mathbf{x}(\mathbf{J}_f(\mathbf{x})^\top \mathbf{C}_\nu \mathbf{J}_f(\mathbf{x})).
%   \end{align*}
%   Then, Eq.~\ref{eq:gaussian_identity} implies that
%   \begin{multline*}
%     \mathbf{J}_f(\mathbf{x})^\top \mathbf{C}_\nu\Big[D_\mathbf{x}(\mathbf{H}_f(\mathbf{x})[\nabla \nu\bigl(\mathbf{u}\bigr)]) \\
%     + D_\mathbf{x}(\mathbf{J}_f(\mathbf{x})^\top \mathbf{C}_\nu \mathbf{J}_f(\mathbf{x}))\Big] \\
%     = \Big[\mathbf{C}_{\xi}
%       - \mathbf{H}_f(\mathbf{x})[\nabla \nu\bigl(\mathbf{u}\bigr)]\\
%     + \mathbf{J}_f(\mathbf{x})^\top \mathbf{C}_{\nu} \mathbf{J}_f(\mathbf{x})\Big] (-\mathbf{H}_f(\mathbf{x})\mathbf{C}_\nu).
%   \end{multline*}
%   Note that
%   \begin{align*}
%     D_\mathbf{x}(\mathbf{J}_f(\mathbf{x})^\top \mathbf{C}_\nu \mathbf{J}_f(\mathbf{x})) = 2\mathbf{H}_f(\mathbf{x})\mathbf{C}_\nu\mathbf{J}_f(\mathbf{x}).
%   \end{align*}
%   Due to the Gaussian log-density, we have
%   \begin{equation*}
%     \nabla \nu(\mathbf{u}) = \mathbf{C}_\nu(\mathbf{u} - \mathbf{\mu}),
%   \end{equation*}
%   Thus,
%   \begin{align*}
%     D_{\mathbf{x}}\bigl(\mathbf{H}_f(\mathbf{x})[\nabla\nu(\mathbf{u})]\bigr)_{ijr}
%     &= \sum_{k=1}^{d_y}\frac{\partial^3 f_k(\mathbf{x})}{\partial x_i\partial x_j\partial x_r}[\mathbf{C}_\nu(\mathbf{u}-\mathbf{\mu})]_k\\[6pt]
%     &\quad -\sum_{k=1}^{d_y}\frac{\partial^2 f_k(\mathbf{x})}{\partial x_i\partial x_j}[\mathbf{C}_\nu \mathbf{J}_f(\mathbf{x})]_{kr},
%   \end{align*}
%   for \(i,j,r \in [d_x]\).
%   % Thus, explicitly,
%   % \begin{align*}
%   %   D_{\mathbf{x}}\bigl(\mathbf{H}_f(\mathbf{x})[\nabla\nu(\mathbf{u})]\bigr)
%   %   = [D_{\mathbf{x}}\mathbf{H}_f(\mathbf{x})]\times_1 [\mathbf{C}_\nu(\mathbf{u}-b)] \\
%   %   - \mathbf{H}_f(\mathbf{x})\times_1 [\mathbf{C}_\nu\mathbf{J}_f(\mathbf{x})].
%   % \end{align*}
%   Now, since \(\mathbf{C}_\nu\) is negative definite there must exist \(\mathbf{\alpha}\) such that \(\nabla\nu(\mathbf{\alpha}) = \mathbf{0}\). Similar to~\citep{Hoyer2009}, we restrict ourselves to the submanifold \(\{(x,y) \in \mathbb{R}^{d_x \times d_y} : \mathbf{y} - f(\mathbf{x}) = \alpha\}\) on which \(\nabla\nu(\mathbf{\alpha}) = \mathbf{0}\).
%   Isolating all terms involving \(\nabla\nu(\mathbf{\mathbf{u}})\) on one side, and factorizing out the common term \(\mathbf{H}_f(\mathbf{x})\) on the other side, we obtain the condition that \(\mathbf{J}_f(\mathbf{x})\) is constant, such that \(\mathbf{H}_f(\mathbf{x})\) needs to vanish everywhere.
% \end{proof}
