\newpage
% Add contents to table of contents
\addtocontents{toc}{\protect\setcounter{tocdepth}{2}}

\onecolumn 

\setcounter{figure}{0}
\setcounter{table}{0}
\setcounter{equation}{0}
\setcounter{thm}{0}
\setcounter{proposition}{0}
\setcounter{observation}{0}
\setcounter{assumption}{0}

\title{
Geodesic Slice Sampler for Multimodal Distributions with Strong Curvature
\\(Supplementary Material)}
% \maketitle
\begin{center}
    {\huge \bfseries Appendix}
\end{center}

\tableofcontents 
% \newpage

\section{\ournamefull } \label{app:agss}

\subsection{Meta-\ournamefull } 
The \metaourmethod\ found in Algorithm~\ref{alg:metaagss} is the combination of \ourmethod\ for $K$ steps followed by a local MCMC sampler for $L$ steps.
\begin{algorithm}[H]
    \caption{\metaourmethod}
    \label{alg:metaagss}
    \textbf{Input:} Initial position $\bx^{[0]}$ and metric components $\bG(\bx)$. 
    Parameters $m\in \mathbb{N}$, $w > 0$, $K$ sweeps, $L$ steps of local MCMC sampler. \\
    \textbf{Output:} $N$ samples $\bx^{[n]}$.
    \begin{algorithmic}[1]
        \For{$n \leftarrow 1, \dots, N$}
            \State Let $\bx \gets \bx^{[n-1]}$
            \For{$k \leftarrow 1, \dots, K$}
                \State Update $\bx$ by \ourmethod\ with initial position $\bx$
            \EndFor
            \For{$l \leftarrow 1, \dots, L$}
                \State Update $\bx$ by local MCMC with initial position at $\bx$.
            \EndFor
            \State Set $\bx^{[n]} \gets  \bx$
        \EndFor
    \end{algorithmic}
\end{algorithm}


\subsection{Step-out and Shrinkage procedures}
The stepping-out and shrinkage procedures are given in Algorithm~\ref{alg:stepout} and Algorithm~\ref{alg:shrink}, respectively; both are taken from \citet{Durmus2023}. Our code implementation of the step-out procedure vectorizes both while loops in Algorithm~\ref{alg:stepout} by evaluating the log density at all possible step-out points at once. The code implementation of the shrinkage procedure (Algorithm~\ref{alg:shrink}) caps the while loop at a maximum of $100$ iterations; if this cap is exceeded, the sampler defaults back to the previous point of the chain. In the algorithm boxes, we use the notation $\gamma_{(x,v)}(t)$ for the exponential map. JAX is used to handle automatic differentiation and the samplers are coded in the style of Blackjax \citep{Jax2018, Cabezas2024b}.


\begin{algorithm}[H]
\caption{Stepping-out procedure. Call it $\text{Step-out}_{w,m}(s, \gamma_{(\bx, \bbv)})$}\label{alg:stepout}
\textbf{Input:} point $x \in \mathcal{M}$, direction $v \in \mathbb{S}^{d-1}_x$, level $s \in (0, p(x))$, hyperparameters $w \in (0, \infty)$ and $m \in \mathbb{N}$\\
\textbf{Output:} two points $\ell, r \in \mathbb{R}$ such that $\ell < 0 < r$
\begin{algorithmic}[1]
\State Draw $u \sim \text{Unif}([0, w])$.
\State Set $\ell := -u$ and $r := \ell + w$.
\State Draw $\iota \sim \text{Unif}(\{1, \dots, m\})$.
\State Set $i = 2$ and $j = 2$.
\While{$i \leq \iota$ and $p_{\cH}(\gamma_{(x,v)}(\ell)) > s$}
    \State Set $\ell = \ell - w$.
    \State Update $i = i + 1$.
\EndWhile
\While{$j \leq m + 1 - \iota$ and $p_{\cH}(\gamma_{(x,v)}(r)) > s$}
    \State Set $r = r + w$.
    \State Update $j = j + 1$.
\EndWhile
\State \textbf{return} $(\ell, r)$
\end{algorithmic}
\end{algorithm}

\begin{algorithm}[H]
\caption{Shrinkage procedure. Call as $\text{Shrink}_{\ell,r}(s, \gamma_{(\bx, \bbv)})$}\label{alg:shrink}
\textbf{Input:} point $x \in \mathcal{M}$, direction $v \in \mathbb{S}^{d-1}_x$, level $s \in (0, p(x))$ and parameters $\ell < 0 < r$\\
\textbf{Output:} point $\theta \in L(x,v,s) \cap [\ell, r]$
\begin{algorithmic}[1]
\State Draw $\theta_h \sim \text{Unif}((0, r - \ell))$.
\State Set $\theta := \theta_h - 1_{\{\theta_h > r\}}(r - \ell)$.
\State Set $\theta_{\min} := \theta_h$.
\State Set $\theta_{\max} := \theta_h$.
\While{$p_{\cH}(\gamma_{(x,v)}(\theta)) \leq s$}
    \If{$\theta_h \in [\theta_{\min}, r - \ell]$}
        \State Set $\theta_{\min} = \theta_h$.
    \Else
        \State Set $\theta_{\max} = \theta_h$.
    \EndIf
    \State Draw $\theta_h \sim \text{Unif}((0, \theta_{\max}) \cup [\theta_{\min}, r - \ell))$.
    \State Set $\theta = \theta_h - 1_{\{\theta_h > r\}}(r - \ell)$.
\EndWhile
\State \textbf{return} $\theta$.
\end{algorithmic}
\end{algorithm}

% \subsection{The Geodesic Equations} \label{app:geoeq}
% We provide only the necessary background on Riemannian geometry. 

% A Riemannian metric tensor is the map $g(\cdot, \cdot):T_{\bx}\man \cross T_{\bx}\man\to \R$ where $T_{\bx}\man$ is the tangent space at position $\bx$ in $\man$. The map must satisfy  positive definiteness for every $\bx \in \man$. Namely, the components $\bG(\bx)$ are such that for all vectors $\bbv, \boldsymbol{u} \in T_{\bx}\man$, then $g(\bbv, \boldsymbol{u}) = \boldsymbol{v}^\top \bG(\bx) \boldsymbol{u} > 0$ (that is, $\bG(\bx)$ is positive definite). 

% Geodesic equations generalize straight lines to curved manifolds. Let $\man$ be a Riemannian manifold with metric tensor $\bG$. The tangent space at a point $\bx$ is denoted by $T_{\bx}\man$. A geodesic curve is determined by solving a second-order differential equation with initial conditions $\bx_0 \in \man$ and $\bbv_0 \in T_{\bx}\man$. The metric components are given by $g_{ij} = \bG(\bx)_{ij}$, and their inverse by $g^{km} = \bG^{-1}(\bx)_{km}$. The geodesic equations are:
% \begin{align*}
%     \dot \bx_k &= \bbv_k, \nonumber \\
%     \dot \bbv_k &= - \norm{\bbv}^2_{\Gamma^k}, \quad \mathrm{for}\ k = 1, \ldots, D. 
% \end{align*}
% The Christoffel symbols, using Einstein summation convention, are: $\Gamma^k_{ij} = \tfrac{1}{2} g^{km} ( \partial_i g_{m j} +  \partial_j g_{i m} - \partial_m g_{i j})$.

\subsection{The Geodesic Equations} \label{app:geoeq}
A Riemannian metric is a smooth, symmetric, and positive-definite tensor $g: T_{\bx}\man \times T_{\bx}\man \to \mathbb{R}$ for each point $\bx \in \man$. In coordinates, the metric is represented by a positive-definite matrix $\bG(\bx)$ such that for all $\bbv, \boldsymbol{u} \in T_{\bx}\man$, 
\begin{equation*}
g(\bbv, \boldsymbol{u}) = \bbv^\top \bG(\bx) \boldsymbol{u}.    
\end{equation*}
Geodesics are curves $\gamma(t)$ on $\man$ that locally minimize distance and generalize straight lines to curved spaces. They solve the geodesic equation, a second-order ODE determined by the metric. Given initial conditions $\gamma(0) = \bx_0 \in \man$ and $\dot{\gamma}(0) = \bbv_0 \in T_{\bx_0}\man$, the geodesic equation in local coordinates is
\begin{equation*}
\ddot{\gamma}^k(t) + \sum_{i,j=1}^D \Gamma^k_{ij}(\gamma(t)) \dot{\gamma}^i(t) \dot{\gamma}^j(t) = 0, \quad \text{for } k = 1, \ldots, D,
\end{equation*}
where $\Gamma^k_{ij}$ are the Christoffel symbols of the second kind, given by
\begin{equation*}
\Gamma^k_{ij} = \tfrac{1}{2} \sum_{m=1}^D g^{km} \left( \partial_i g_{mj} + \partial_j g_{im} - \partial_m g_{ij} \right),
\end{equation*}
with $g_{ij} = \bG(\bx)_{ij}$ and $g^{km} = (\bG^{-1}(\bx))_{km}$.
Alternatively, defining the position-velocity system with $\bx = \gamma(t)$ and $\bbv = \dot{\gamma}(t)$, the geodesic equations can be expressed as a first-order system (Equation~\ref{eq:geoeqs}):
\begin{align*}
    \dot \bx_k &= \bbv_k, \\
    \dot \bbv_k &= - \norm{\bbv}^2_{\Gamma^k}\quad \mathrm{for}\ k = 1, \ldots, D,
\end{align*}
where $\norm{\bbv}^2_{\Gamma^k}=\sum_{i,j=1}^D \Gamma^k_{ij}(\bx) \bbv_i \bbv_j$.

% The paper introduces the concept of Hausdorff density to account for the change in measure from Euclidean space to the manifold. Can you provide more intuitive explanation on why this adjustment is important for maintaining proper sampling behavior?

\subsection{Sampling Uniformly from the Unit Tangent Sphere}

Recall the unit tangent sphere is defined by
\begin{equation*}
    \dS^{D-1}_g(\bx):=\{\bbv\in \R^D: \ \norm{\bbv}_g^2=1\}.
\end{equation*}
\citet[Appendix C.4]{Durmus2023} justify the existence of the uniform distribution over $\dS^{D-1}_g(\bx)$. One method for producing samples from the uniform distribution on the unit tangent sphere is:
% 
\begin{enumerate}
    \item Sample $ \boldsymbol{z} \sim \mathcal{N}(0, \bI) $.
    \item Transform $ \boldsymbol{v} \gets \boldsymbol{G}^{-\tfrac{1}{2}}(\boldsymbol{x}) \boldsymbol{z}$, so that $\bbv$ is distributed according to $\mathcal{N}(0, \boldsymbol{G}^{-1}(\boldsymbol{x}))$.
    \item Compute the Riemannian norm $ \|\boldsymbol{v}\|_g = \sqrt{\boldsymbol{v}^\top \bG(\bx) \boldsymbol{v}} $.
    \item Project to the boundary  $\boldsymbol{v} \gets \frac{\boldsymbol{v}}{\|\boldsymbol{v}\|_g}$. 
\end{enumerate}

% \paragraph{Rational behind the transformation}
% The initial vector is $\boldsymbol{z} \sim N(0, \boldsymbol{I})$, then the transformation $ \boldsymbol{v} = \boldsymbol{G}^{-\tfrac{1}{2}}(\boldsymbol{x}) \boldsymbol{z}$ is such that the norm in the tangent space is,
% \begin{align*}
%     \norm{\boldsymbol{v}}_{\boldsymbol{G}(\boldsymbol{x})} &=  \boldsymbol{v}^\top \boldsymbol{G}(\boldsymbol{x}) \boldsymbol{v} \\
%                     &=  (\boldsymbol{G}^{-\tfrac{1}{2}}(\boldsymbol{x}) 
%                      \boldsymbol{z}
%                      )^\top \boldsymbol{G}(\boldsymbol{x}) 
%                      (\boldsymbol{G}^{-\tfrac{1}{2}}(\boldsymbol{x}) \boldsymbol{z}  )\\
%                      &= \boldsymbol{z}^\top \boldsymbol{z}. \\
% \end{align*}
% That is, the metric from the original space is preserved. 


% \subsection{The Hausdorff measure}  \label{app:hausdorff}

% The volume form of a Riemannian Manifold with metric $\bG(\bx)$ is defined as $V(\dbx) := \sqrt{\det\bG(\bx)}\dbx$.
% For technical details about the volume form, interested readers can consult Proposition 2.41 in \citet{Lee2018}.
% The volume element gives the natural measure on the manifold, analogous to the Lebesgue measure in the Euclidean space \citep{Durmus2023}. The Hausdorff density is defined as the density which 
% integrates to one with respect to the volume element
% \begin{equation*}
%     p_{\cH}(\bx) = \frac{ p(\bx)}{\sqrt{ \det \bG(\bx)}}  .
% \end{equation*}

\subsection{The Hausdorff measure}  \label{app:hausdorff}

The volume form of a Riemannian manifold with metric $\bG(\bx)$ is defined as $V(\dbx) := \sqrt{\det\bG(\bx)}\dbx$.
For technical details about the volume form, interested readers can consult Proposition 2.41 in \citet{Lee2018}.
The volume element gives the natural measure on the manifold, analogous to the Lebesgue measure in Euclidean space \citep{Durmus2023}. The Hausdorff density is defined as the density which 
integrates to one with respect to the volume element:
\begin{equation*}
    p_{\cH}(\bx) = \frac{ p(\bx)}{\sqrt{ \det \bG(\bx)}}  .
\end{equation*}

The volume element can be understood intuitively through the change-of-variables formula in Euclidean space. When transforming coordinates via a diffeomorphism $\phi: \mathbb{R}^D \to \mathbb{R}^D$ with Jacobian $J$, the standard density must be adjusted by the Jacobian determinant to preserve probability mass. That is, $|\det J|$ accounts for local volume distortion.

When $\phi:\mathbb{R}^D \to \mathbb{R}^d$ with $d>D$ maps from a lower-dimensional Euclidean space onto a manifold embedded in higher dimensions, the Jacobian $J \in \mathbb{R}^{d \times D}$ is rectangular. In this case, the induced Riemannian metric (pullback metric) on the manifold is $\bG(\bx) = J^\top J$, and the volume change is given by $\sqrt{\det \bG(\bx)}$, which generalizes $|\det J|$.

Thus, the Hausdorff density $p_{\cH}$ adjusts the density $p$ to be properly normalized on the manifold with respect to the intrinsic geometry. This adjustment ensures correct sampling and integration as seen in Proposition~\ref{prop:prop_haus}.





\subsection{Observations and Proposition } \label{app:proof_props}






Denote by $\bG_{IM}(\bx)$ the inverse Monge metric and by $\bG_{Ig}(\bx)$ the inverse generative metric.
The metrics are defined as:
\begin{equation*}
    \bG_{Ig}(\bx) = \left(\frac{p(\bx) + \lambda}{p_0 + \lambda}\right)^2 \bI, \quad
    \bG_{IM}(\bx) = \bI - \frac{\alpha^2}{1 + \alpha^2 \norm{\nabla \ell(\bx)}^2} \nabla \ell(\bx) \nabla \ell(\bx)^\top.
\end{equation*}

\begin{observation}
\emph{
    Let $p(\bx)$ be a smooth density function. Let $(\bx_t, \bbv_t)$ be the geodesic flow with respect to the Inverse Generative metric with initial conditions $(\bx_0,\bbv_0)$ such that $p(\bx_0)>0$. Then, for $t$ such that $p(\bx_t) \to 0$ we have $\norm{\bbv_t}_{2}>\norm{\bbv_0}_{2}$.
    } 
\end{observation}

\paragraph{Analysis for the Inverse Generative metric}
Assume a geodesic curve starting at $(\bx_0,\bbv_0)$ satisfies $p(\bx_0) \geq p(\bx_t)$ for all $t \geq 0$ and $p(\bx_t) \to 0$. Recall that along a geodesic curve, the magnitude of the velocity with respect to the metric remains constant:
\begin{equation*}
\norm{\bbv_t}^2_{\bG_{Ig}} = \norm{\bbv_0}^2_{\bG_{Ig}} \quad \forall t.    
\end{equation*}
Thus, the equality holds:
\begin{align*}
    \norm{\bbv_0}^2_{\bG_{Ig}} &= \norm{\bbv_t}^2_{\bG_{Ig}} \\
    \left(\frac{p(\bx_0) + \lambda}{p_0 + \lambda}\right)^2 \norm{\bbv_0}_2^2 &= \left(\frac{p(\bx_t) + \lambda}{p_0 + \lambda}\right)^2 \norm{\bbv_t}_2^2 \\
    \left(\frac{p(\bx_0) + \lambda}{p(\bx_t) + \lambda}\right)^2 \norm{\bbv_0}_2^2 &= \norm{\bbv_t}_2^2.
\end{align*}

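Since $p(\bx_0) \geq p(\bx_t)$, it follows that $\norm{\bbv_t}_2^2 \geq \norm{\bbv_0}_2^2$. Moreover, as $p(\bx_t)\to 0$ the ratio $\norm{\bbv_t}_2^2 / \norm{\bbv_0}_2^2$ approaches $\left(\frac{p(\bx_0) + \lambda}{\lambda}\right)^2$, which can be made arbitrarily large by choosing $\lambda$ small.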

\begin{observation}
\emph{
    Let $p(\bx)$ be a smooth density function. Let $(\bx_t, \bbv_t)$ be the geodesic flow with initial conditions $(\bx_0,\bbv_0)$  with respect to the Inverse Monge metric, such that $\bx_0$ is a local maximum.
    Then $\norm{\bbv_t}_{2}\geq \norm{\bbv_0}_{2}$ 
    for all $t\neq 0$.
    } 
\end{observation}

% {\color{red} Disclaimer: In progress}

% \paragraph{Analysis for the Inverse Monge Metric}
% Assume $t\in I$ as described before. Denote by $\theta_t = \arccos \tfrac{\inp{\bbv_t}{\nabla\ell(\bx_t)}}{\norm{\bbv_t}_2\norm{\nabla\ell(\bx_t)}_2}$.
% The Riemannian norm over the geodesic trajectory is preserved, thus:
% \begin{align*}
%     \norm{\bbv_0}_{ \bG_{IM} }^2 &= \norm{\bbv_t}^2_{ \bG_{IM} } \\
%     \norm{\bbv_0}_2^2 - \frac{\alpha^2}{1 + \alpha^2 \norm{\nabla \ell(\bx_0)}^2} \inp{\nabla \ell(\bx_0)}{\bbv_0}^2 &= \norm{\bbv_t}_2^2 - \frac{\alpha^2}{1 + \alpha^2 \norm{\nabla \ell(\bx_t)}^2}_2 \inp{\nabla \ell(\bx_t)}{\bbv_t}^2 \\
%     \norm{\bbv_0}_2^2 \left(1-\frac{\alpha^2 \norm{\nabla\ell(\bx_0)}^2 \cos^2 \theta_0}{1 + \alpha^2 \norm{\nabla \ell(\bx_0)}^2}  \right)&= \norm{\bbv_t}_2^2\left(1-\frac{\alpha^2 \norm{\nabla\ell(\bx_t)}^2 \cos^2 \theta_t}{1 + \alpha^2 \norm{\nabla \ell(\bx_t)}^2}  \right).
% \end{align*}

% Using the equality $\cos^2 \theta_t < \cos^2 \theta_0$ then $-\cos^2 \theta_0 < -\cos^2 \theta_t$. And using $\norm{\nabla\ell(\bx_t)}_2 > \norm{\nabla\ell(\bx_0)}_2$ then 
% \begin{equation*}
% \frac{\alpha^2 \norm{\nabla\ell(\bx_t)}^2}{1 + \alpha^2 \norm{\nabla\ell(\bx_t)}^2} <    
% \end{equation*}




% Therefore in regions where $v_t$ aligns with the gradient $\cos\theta_t$ approaches $1$ and for sufficiently large $\norm{\bbv_t}_2$ the right hand term approaches zero, then  $\norm{\bbv_t}_2^2 > \norm{\bbv_0}_2^2$. 


\paragraph{Analysis for the Inverse Monge metric}

We consider geodesics emanating from a mode. Let $\bx_0$ be a mode, meaning that $\nabla \ell(\bx_0) = 0$. For a geodesic emanating from $\bx_0$ with velocity $\bbv_0$, conservation of the Riemannian norm along the geodesic gives
\begin{align*}
\norm{\bbv_0}_2^2 - \frac{\alpha^2}{1 + \alpha^2 \norm{\nabla \ell(\bx_0)}_2^2} \inp{\nabla \ell(\bx_0)}{\bbv_0}^2 &= \norm{\bbv_t}_2^2 - \frac{\alpha^2}{1 + \alpha^2 \norm{\nabla \ell(\bx_t)}_2^2} \inp{\nabla \ell(\bx_t)}{\bbv_t}^2,
\end{align*}
or
\begin{align*}
    \norm{\bbv_0}_2^2+\frac{\alpha^2}{1 + \alpha^2 \norm{\nabla \ell(\bx_t)}_2^2} \inp{\nabla \ell(\bx_t)}{\bbv_t}^2 = \norm{\bbv_t}_2^2.
\end{align*}
We see that $\Vert \bbv_t \Vert^2 \geq \Vert \bbv_0\Vert^2$, so any geodesic starting from $\bx_0$ always has a ``shrinkage'' behavior. This metric therefore helps bring the entire space close to $\bx_0$ along geodesics, as it shrinks the space towards (multiple) modes. Also note that the collapsing towards modes depends on how flat the region is and how well-aligned the velocity and the gradient of $\ell$ are. For complicated distributions, the behavior should not be expected to depend monotonically on $\alpha$.


\textbf{Note:} For a multimodal distribution of dimension $\geq 2$, Observation~\ref{obs:invmonge} and Observation~\ref{obs:invgen} guarantee that in low-density/increasing-gradient regions the speed increases, but we do not have the guarantee that the geodesics given by the inverse metrics will reach the other modes. The geodesic could twist before reaching the other modes, which could negate the ``teleport/move fast'' effect.

% TODO: Give a detailed explanation here
% Note that this analysis also holds for the inverse Monge metric when the geodesic curve approaches a local maximum of the distribution, which explains the behavior observed in Figure~\ref{fig:hauspdf_inv} around the local maximum values when $\alpha=0.1$.


\begin{figure}[t]
    \centering
    \includegraphics[width=0.4\linewidth]{figs/figure1/inverse_monge_reparametrization.png}
    \includegraphics[width=0.4\linewidth]{figs/figure1/inverse_generative_reparametrization.png}    
    \caption{
    The Hausdorff density of a mixture of two Gaussian distributions evaluated 
    along the geodesic, namely $t\mapsto p_{\cH}(\hat{\gamma}_{(\bx, \bbv)}(t))$ for different values of $\lambda$ and $\alpha^2$. 
    Left: inverse Monge metric. Right: inverse Generative metric}
    \label{fig:hauspdf_inv}
\end{figure}



\begin{proposition} \label{prop:prop_haus}
\emph{
     An MCMC sampler targeting the Hausdorff density on a Riemannian manifold $\mathcal{M}$ with metric tensor $\bG(\bx)$ also targets the correct distribution on the Euclidean space.
     }
\end{proposition}

For a proof of the proposition in a general setting, consult Section XII.1, Proposition 1.5 in \citet{Amann2005}.

The volume element on the manifold is defined as $V(\dbx) = \sqrt{\det \bG(\bx)} \dbx$, where $\bG(\bx)$ is the Riemannian metric tensor. Let $\bX$ be a random variable on $\mathcal{M}$ whose law is $p_{\mathcal{H}}$ where $p_{\mathcal{H}}(\bx)$ is the Hausdorff density. Let $B \in \mathcal{B}(\mathcal{M})$ be a Borel set on the manifold $\mathcal{M}$. The probability of $\bX$ being in $B$, under the Hausdorff target density, is given by 
%
\begin{align} 
    \label{eq:pH}
    \mathbb{P}(\bX \in B) &= \int_{B} p_{\mathcal{H}}(\bx) V(\dbx).
\end{align}
%
Substituting $p_{\mathcal{H}}(\bx) = \frac{p(\bx)}{\sqrt{\det \bG(\bx)}}$,
%
\begin{align}
    \label{eq:pE}
    \mathbb{P}(\bX \in B) &= 
    \int_{B} \frac{p(\bx)}{\sqrt{\det \bG(\bx)}} \sqrt{\det \bG(\bx)} \dbx 
    \nonumber
    \\
    &= \int_{B} p(\bx) \dbx.
\end{align}
%
Thus, the integral of the Hausdorff density with respect to the volume element on the manifold coincides with the integral of the Euclidean density $p(\bx)$ over the same set $B$.

Since the probabilities computed for any $B \in \mathcal{B}(\mathcal{M})$ are identical whether using \eqref{eq:pH} or \eqref{eq:pE}, the corresponding estimators for the probabilities also coincide. Consequently, an MCMC sampler on the manifold targeting the Hausdorff density $p_{\mathcal{H}}(\bx)$ correctly targets the Euclidean density $p(\bx)$ in $\mathbb{R}^D$.






% \section{Reversibility of the Method} \label{app:proof}


% In Sectio~\ref{app:detbal} we state Detailed Balance for our method. 
% In Section~\ref{prop:timerev} we study the sources of numerical error.
% We prove that in step-out and shrinkage kernels follow detailed balance if the solver is time reversible, and we identify  the sources of numerical error when we use the geodesic approximations. 



% \subsection{Detailed Balance} \label{app:detbal}

% \begin{assumption} \label{assm:1}
% \emph{
%     Let $\man$ be  $D$-dimensional, smooth, connected Riemannian manifold with metric tensor $g$ and let $\man$ be geodesically complete. 
%     }
% \end{assumption}

% \begin{assumption} \label{assm:2}
% \emph{
%     The unnormalized Hausdorff density $p_{\cH}:\man \to \R$ is lower semi-continuous function such that $\int_{\man} p_{\cH}(x)  V(\dd x) \in (0, \infty)$.
%     }
% \end{assumption}

% \mainthm

% \begin{proof}
%     The proof presented in \citet{Durmus2023} guarantees detailed balance of our method up to the numerical error given by the numerical solution of the geodesic equations.     
% \end{proof}


% \subsection{Sources of Numerical Error} \label{app:numerical_error}


% \paragraph{Time Reversible Numerical Solver} \label{app:time_rev}

% First we will prove that if the numerical solver is time reversible, then the shrinkage and step-out kernels satisfy detailed balance. This is formalized in the proposition~\ref{prop:timerev}.
% Let us define what is a  time-reversible ODE solver.
% \begin{definition}
%     \emph{
% A time reversible ODE solver $\hat\gamma_{z_0}(t)$ with initial condition $z_0$ is time reversible if for any $t\neq 0$ then 
%     \begin{equation}
%         z_0 = \hat\gamma_{z_t}(-t), \quad z_t = \hat\gamma_{z_0}(t).
%     \end{equation}
%     }
% \end{definition}



% \begin{proposition} \label{prop:timerev}
% \emph{
% Under the same setup as Theorem~\ref{thm:detailed_balance} let us denote a  numerical solution of the geodesic equations by $\hat{\gamma}_{(\bx, \bbv)}(t)$.
% If $\hat{\gamma}_{(\bx, \bbv)}(t)$ is time reversible, then the Step-out kernel and the Shrinkage kernel associated with \ourmethod\ satisfy detailed balance.
%      }
% \end{proposition}
     
% \begin{proof}
% Let us prove by contradiction. Assume the composition step-out kernel and shrinkage kernel  \textbf{does not} satisfy detailed balance.

% We adopt the  notation from \citet{Neal2003}. To reduce clutter we define $z:=(x,v)$.
% The detailed balance condition is:
% \begin{multline*}
%     P(\text{next state} = z_1, \text{intermediate choices} = r \mid \text{current state} = z_0) = \\
%     P(\text{next state} = z_0, \text{intermediate choices} = \pi(r) \mid \text{current state} = z_1).
% \end{multline*}    
% Where $r$ denotes the intermediate choices of the step-out and shrinkage procedures.  The intermediate choices $r$ are all the random variables obtained during the step-out and shrinkage procedures. For example the step-out procedure has the random location of the initial left and right sides of the interval $\ell$, $r$, and all values of $\ell$ and $r$ until the final value.
% $\pi(r)$  is a one-to-one unit Jacobian mapping which maps from the original intermediate choices to  intermediate choices of the reverse trajectory. 

% The mapping $\pi$ is constructed in section 4.3 of \citet{Neal2003} and  by Lemmas 11, 12, 15 and 16 in \citet{Durmus2023}.

% Denote the numerical flow of the geodesic equations by $\phi_{\theta}(z_0, r) := 
% \hat{\gamma}_{z}(\theta)$. The composition of the step out and shrinkage procedure outputs a value $\theta^*$ such that $\phi_{\theta^*}(z_0, r) = z_1$.

% By our contradiction assumption, the step-out kernel and shrinkage kernel  \textbf{do not} satisfy detailed balance.
% \begin{multline*}
%     1=P(\text{next state} = z_1, \text{intermediate choices} = r \mid \text{current state} = z_0) \neq \\
%     P(\text{next state} = z_0, \text{intermediate choices} = \pi(r) \mid \text{current state} = z_1)=0.
% \end{multline*}    
% Hence $\phi_{-\theta^*}(z_1, \pi(r)) \neq z_0$. Implying the integrator is not time reversible.  
% We have reached a contradiction. We conclude $\phi_{\theta}(z, r)$ must be time reversible. 
    
% \end{proof}

% \textbf{Note on implementation: }
% The code implementation ensures detailed balance for the step-out and shrinkage procedures. This is done by fixing the starting conditions at  $x_0$, $v_0$ for every right and left point if the step-out procedure and for every intermediate  point generated by the shrinkage method. This guarantees detailed balance on every intermediate point generated except the for the new state. For the new state the error generated by the numerical solver will deviate from detailed balance, but we do not observe any major significance in the experimental results obtained. 



% \paragraph{Steps which Entroduce Error: }
% We identify the parts of the proof of \citet{Durmus2023} where the numerical solution of the geodesic equations introduces error. 

% \textbf{In remark 17:} Define the geodesic flow $\phi_\theta: (x,v)\mapsto \left(\gamma_{(x, v)}(\theta), \dv[]{}{\theta}\gamma_{(x, v)}(\theta)\right)$ and the flip operator $\mathcal{J}: (x,v)\mapsto (x, -v)$. Denote the composition $T(\theta) := \mathcal{J} \circ \phi_\theta$.
% The rescaling property of geodesics and chain rule give:
% \begin{align*}
%     (\mathcal{J} \circ \phi_\theta)(x, v) 
%     &= \left(\gamma_{(x, v)}(\theta), -\frac{\mathrm{d} \gamma_{(x, v)}}{\mathrm{d}t} \bigg|_\theta \right) \\
%     &= \left(\gamma_{(x, -v)}(-\theta), \frac{\mathrm{d} \gamma_{(x, -v)}}{\mathrm{d}t} \bigg|_{-\theta} \right) \\
%     &= (\phi_{-\theta} \circ \mathcal{J})(x, v).
% \end{align*}
% These equalities are true for geodesics, and for the numerical approximation holt up to the numerical error of the approximation. 

% \textbf{In section 4.3}: Let $\nu_g(\mathrm{d}x)$ be the measure on the manifold (the volume form). 
% Let $\sigma_{D-1}^{(x)}$ as measure of the manifold  over $\mathbb{S}^{D-1}$  as constructed in {appendix C.4} in \citet{Durmus2023}. Under the map $T(\theta)$ and measurable $f$ the proof relies on the equality:
% \begin{align*}
%     \int_{\mathcal{M}} \int_{S_{x}^{D-1}} f\left(T^{(\theta)}(x, v)\right) \sigma_{d-1}^{(x)}(\mathrm{d}v) \nu_g(\mathrm{d}x)     
%     &= \int_{\mathcal{M}} \int_{S_{x}^{D-1}} f(x, v) \sigma_{D-1}^{(x)}(\mathrm{d}v) \nu_g(\mathrm{d}x),
% \end{align*}
% This equality again relies on the preservation of the measure $\sigma_{D-1}^{(x)}(\mathrm{d}v) \nu_g(\mathrm{d}x)$ for geodesics which is conserved up to the integration error for the numerical approximation. 




\section{Additional Experimental Results} \label{app:exp}

\subsection{Logistic regression} \label{app:logreg}
Here we denote by $\btheta$ the random variable of interest and by $\bx$ the input data.
The logistic regression model \citep{Girolami2011} is
\begin{equation*}
    p(\by_i | \btheta, \bx_i) = \textrm{Bernoulli}(\by_i|s( \bx_i^\top\btheta )), \quad
    p(\btheta) = \cN(\btheta |0,\alpha \bI_D ), \quad i=1,\dots,N,
\end{equation*}
where $\alpha=100$ and $s(\cdot)$ is the sigmoid function. The Fisher Information Metric for this probabilistic model, including the addition of the Hessian of the prior, is $\bG(\btheta) = \bX^{\top}\bLambda\bX + \alpha^{-1}\bI$, where $\bX$ is the covariate matrix and $\bLambda$ is a diagonal matrix with entries $\bLambda_{ii} = s(\bx_i^\top \btheta)\big(1-s(\bx_i^\top \btheta)\big)$.

Reference samples used for computing the Wasserstein distance are obtained with HMC-NUTS. The samples obtained with the Euclidean and Fisher metrics are equally close to the reference samples, but Fisher and Monge have higher effective sample size (ESS) and use fewer shrinkage iterations than the Euclidean metric (see Table~\ref{tab:logreg}). The Monge metric uses the parameter $\alpha=1$. In the table, $\mathcal{W}$ denotes the earth mover's distance. The notation used is $\mathrm{mean} \pm \mathrm{std}$ over 5 runs with different seeds.

\subsection{Numerical integrators}

The numerical solvers we consider are part of the \texttt{diffrax} package 
\citep{Kidger2021}. We consider three groups of solvers: simple solvers (euler, tsit, dopri), implicit solvers (kv), and reversible solvers (revheun). The solvers have the following characteristics:
\begin{itemize}    
    \item euler: The Euler solver can only be used with a fixed step-size.
    \item tsit: Tsitouras' 5/4 method can be used with both fixed and adaptive step-size.
    \item dopri5: Dormand-Prince's 5/4 method can be used with both fixed and adaptive step-size.
    \item dopri8: Dormand-Prince's 8/7 method can be used with both fixed and adaptive step-size. 
    \item kv3: Kvaerno's 3/2 method is an implicit solver that can only be used with adaptive step-size.
    \item kv5: Kvaerno's 5/4 method is an implicit solver that can only be used with adaptive step-size.
    \item revheun: Reversible Heun method can be used with both fixed and adaptive step-size. 
\end{itemize}

% \begin{table}[t]
%     \centering
% \begin{tabular}{llllllll}
% \toprule
% model & metr & Wass & min ESS & avg ESS & avg step-out & avg shrinkage & t(s) \\
% \midrule
% aus & euclidean & [0.74, 0.12] & [18, 5] & [228, 12] & [0.14, 0.0] & [3.33, 0.01] & 8.2 \\
%  & fisher & [0.58, 0.01] & [177, 9] & [270, 13] & [0.95, 0.0] & [1.0, 0.01] & 3298.6 \\
% ger & euclidean & [0.49, 0.01] & [35, 10] & [119, 9] & [0.08, 0.0] & [4.19, 0.02] & 15.8 \\
%  & monge & [0.75, 0.02] & [80, 5] & [4673, 113] & [3.97, 0.03] & [0.28, 0.01] & 4665.4 \\
%  & fisher & [0.49, 0.0] & [85, 24] & [169, 5] & [0.95, 0.0] & [0.99, 0.01] & 19684.4 \\
% hrt & euclidean & [0.63, 0.01] & [137, 26] & [254, 16] & [0.19, 0.01] & [2.72, 0.02] & 7.3 \\
%  & monge & [0.74, 0.03] & [394, 55] & [2979, 266] & [2.99, 0.04] & [0.38, 0.02] & 637.4 \\
%  & fisher & [0.64, 0.01] & [232, 14] & [311, 8] & [0.92, 0.01] & [1.01, 0.01] & 1694.8 \\
% pim & euclidean & [0.21, 0.0] & [266, 41] & [445, 45] & [0.11, 0.0] & [3.59, 0.03] & 6.8 \\
%  & monge & [0.29, 0.01] & [515, 72] & [4656, 269] & [2.38, 0.03] & [0.53, 0.02] & 1711.7 \\
%  & fisher & [0.21, 0.0] & [427, 47] & [547, 30] & [0.93, 0.0] & [0.98, 0.01] & 425.4 \\
% rip & euclidean & [0.09, 0.01] & [829, 112] & [1499, 56] & [0.24, 0.01] & [2.46, 0.02] & 4.2 \\
%  & monge & [0.15, 0.02] & [1042, 211] & [3123, 685] & [1.38, 0.02] & [0.97, 0.01] & 593.9 \\
%  & fisher & [0.09, 0.0] & [1623, 113] & [1753, 92] & [0.91, 0.0] & [0.96, 0.01] & 204.7 \\
% \bottomrule
% \end{tabular}
%     \caption{Bayesian Logistic Regression, the notation is $[\mathrm{mean}, \mathrm{std}]$.}
% \label{tab:logreg}
% \end{table} 

\begin{table}[t]
    \centering
\begin{tabular}{llllllll}
\toprule
model & metr & $\mathcal{W}$ & min ESS & avg ESS & avg step-out & avg shrinkage & t(s) \\
\midrule
aus & euclidean & 0.74 $\pm$ 0.12 & 18 $\pm$ 5 & 228 $\pm$ 12 & 0.14 $\pm$ 0.0 & 3.33 $\pm$ 0.01 & 8.2 \\
 & fisher & 0.58 $\pm$ 0.01 & 177 $\pm$ 9 & 270 $\pm$ 13 & 0.95 $\pm$ 0.0 & 1.0 $\pm$ 0.01 & 3298.6 \\
ger & euclidean & 0.49 $\pm$ 0.01 & 35 $\pm$ 10 & 119 $\pm$ 9 & 0.08 $\pm$ 0.0 & 4.19 $\pm$ 0.02 & 15.8 \\
 & monge & 0.75 $\pm$ 0.02 & 80 $\pm$ 5 & 4673 $\pm$ 113 & 3.97 $\pm$ 0.03 & 0.28 $\pm$ 0.01 & 4665.4 \\
 & fisher & 0.49 $\pm$ 0.0 & 85 $\pm$ 24 & 169 $\pm$ 5 & 0.95 $\pm$ 0.0 & 0.99 $\pm$ 0.01 & 19684.4 \\
hrt & euclidean & 0.63 $\pm$ 0.01 & 137 $\pm$ 26 & 254 $\pm$ 16 & 0.19 $\pm$ 0.01 & 2.72 $\pm$ 0.02 & 7.3 \\
 & monge & 0.74 $\pm$ 0.03 & 394 $\pm$ 55 & 2979 $\pm$ 266 & 2.99 $\pm$ 0.04 & 0.38 $\pm$ 0.02 & 637.4 \\
 & fisher & 0.64 $\pm$ 0.01 & 232 $\pm$ 14 & 311 $\pm$ 8 & 0.92 $\pm$ 0.01 & 1.01 $\pm$ 0.01 & 1694.8 \\
pim & euclidean & 0.21 $\pm$ 0.0 & 266 $\pm$ 41 & 445 $\pm$ 45 & 0.11 $\pm$ 0.0 & 3.59 $\pm$ 0.03 & 6.8 \\
 & monge & 0.29 $\pm$ 0.01 & 515 $\pm$ 72 & 4656 $\pm$ 269 & 2.38 $\pm$ 0.03 & 0.53 $\pm$ 0.02 & 1711.7 \\
 & fisher & 0.21 $\pm$ 0.0 & 427 $\pm$ 47 & 547 $\pm$ 30 & 0.93 $\pm$ 0.0 & 0.98 $\pm$ 0.01 & 425.4 \\
rip & euclidean & 0.09 $\pm$ 0.01 & 829 $\pm$ 112 & 1499 $\pm$ 56 & 0.24 $\pm$ 0.01 & 2.46 $\pm$ 0.02 & 4.2 \\
 & monge & 0.15 $\pm$ 0.02 & 1042 $\pm$ 211 & 3123 $\pm$ 685 & 1.38 $\pm$ 0.02 & 0.97 $\pm$ 0.01 & 593.9 \\
 & fisher & 0.09 $\pm$ 0.0 & 1623 $\pm$ 113 & 1753 $\pm$ 92 & 0.91 $\pm$ 0.0 & 0.96 $\pm$ 0.01 & 204.7 \\
\bottomrule
\end{tabular}
    \caption{Bayesian Logistic Regression. Entries are reported as $\mathrm{mean} \pm \mathrm{std}$.}
\label{tab:logreg}
\end{table}



\section{Mathematical  Derivations} \label{app:math}

\subsection{The Generative and Inverse Generative metrics}

The Generative and Inverse Generative metrics read
\begin{equation*}
    G(x) = f(x) I = \exp (\log f(x) )I,
\end{equation*}
where the scalar factor is $f(x) = \left( \frac{p_0 + \lambda}{p(x) + \lambda} \right)^2$ for the Generative metric and  $f(x) = \left( \frac{p(x) + \lambda}{p_0 + \lambda} \right)^2$ for the Inverse Generative metric.

\paragraph{Square root and inverse square root}
The quantities are given by:
\begin{align*}
        G^{\tfrac{1}{2}}(x) &= \exp\left(\tfrac{1}{2} \log f(x)\right) I,\\
        G^{-\tfrac{1}{2}}(x) &= \exp\left(-\tfrac{1}{2} \log f(x)\right) I, \\
        \log |\det G(\bx)|&=  D \log f(x).
\end{align*}


\paragraph{Christoffel symbols derivation}
Given the Riemannian metric $G(x) = f(x) I$, the tensor entries are:
\begin{equation*}
G_{ij}(x) = f(x) \delta_{ij}.    
\end{equation*}

The Christoffel symbols for this metric are given by:
\begin{align*}
\Gamma^k_{ij} &= \frac{1}{2 f(x)} \left( \delta_{jk} \partial_i f(x) + \delta_{ik} \partial_j f(x) - \delta_{ij} \partial_k f(x) \right) \\
&= \frac{1}{2} \left( \delta_{jk} \partial_i \log f(x) + \delta_{ik} \partial_j \log f(x) - \delta_{ij} \partial_k \log f(x) \right).
\end{align*}
Denote by $e_k$ the standard basis vectors, the Christoffel symbols in matrix notation are: 
\begin{equation*}
\Gamma^k = \frac{1}{2} \left( \nabla \log f(x) \, e_k^\top + e_k \, \nabla \log f(x)^\top - \partial_k \log f(x) I \right).
\end{equation*}
We compute $\norm{\bbv}^2_{\Gamma^k}=\sum_{i,j=1}^D \Gamma^k_{ij}(\bx) \bbv_i \bbv_j$ which appears in the geodesic equations,
\begin{equation*}
    \norm{\bbv}^2_{\Gamma^k} = \inp{\bbv}{\nabla\log f} \bbv_k - \tfrac{1}{2} \norm{\bbv}^2 \partial_k \log f.
\end{equation*}
The geodesic equations read:
\begin{align*}
    \dot \bx &= \bbv, \\
    \dot \bbv &= \tfrac{1}{2} \norm{\bbv}^2 \nabla \log f-\inp{\bbv}{\nabla\log f} \bbv.
\end{align*}

% \paragraph{ Computation  $\nabla \log f$}
% First let us consider the Inverse Generative metric, recall $f= \left( \frac{p(x) + \lambda}{p_0 + \lambda} \right)^2$.
% The partial derivative of $f(x)$   with respect to $x_i$ is:
% \begin{equation*}
% \partial_i f(x) = \frac{d}{dx_i} \left( \frac{p(x) + \lambda}{p_0 + \lambda} \right)^2 = 2 \left( \frac{p(x) + \lambda}{p_0 + \lambda} \right)    
% \end{equation*}





\subsection{The Monge and Inverse Monge metrics}

The Monge metric and the Inverse Monge metric are:
\begin{equation*}    
    G(x) = I + \alpha^2 \nabla \ell\nabla \ell^\top,  \quad
    G^{-1}(x) = 
    I - \frac{\alpha^2}{1+\alpha^2\norm{\nabla \ell}^2} \nabla \ell\nabla \ell^\top.
\end{equation*}

\paragraph{Square root and inverse square root}
Define the quantity $L_\alpha := 1+\alpha^2 \norm{\nabla\ell}^2$, we list the quantities derived from the matrix and present later their derivation,
\begin{align*}
    G^{1/2}(x) &= I + 
    \frac{\alpha^2}{1+\sqrt{L_\alpha}}
   \nabla\ell(\bx)\nabla\ell(\bx)^\top, \\
    G^{-1/2}(x) &= 
    I-\frac{\alpha^2}{L_\alpha+  \sqrt{L_\alpha}} \nabla\ell(x) \nabla\ell(x)^\top, \\
    \log |\det G(\bx)| &= \log (1+\alpha^2 \norm{\nabla\ell}^2).
\end{align*}
% 
We now derive the quantities.
\citet{Hartmann2022}  gave
\begin{equation}
 G^{-\tfrac{1}{2}}(x) = I +\frac{1}{\norm{\nabla\ell}^2}\left(
\frac{1}{\sqrt{L_\alpha}}-1\right)\nabla\ell\nabla\ell^\top   
\label{eq:monge_1/2_hartmann}
\end{equation}
Note that  if $\norm{\nabla\ell}^2\to 0$ then Equation~\ref{eq:monge_1/2_hartmann} is undefined.
We find a more numerically stable form of $G^{-\tfrac{1}{2}}(x)$.
Multiply the scalar $\frac{1}{\norm{\nabla\ell}^2}\left(
\frac{1}{\sqrt{L_\alpha}}-1\right)$ by its conjugate
\begin{align*}
  \frac{1}{\norm{\nabla\ell}^2}\left(
\frac{1}{\sqrt{L_\alpha}} -1
\right)   = \frac{1}{\norm{\nabla\ell}^2}\left(
\frac{1-\sqrt{L_\alpha}}{\sqrt{L_\alpha}}\right)
\left(\frac{1+ \sqrt{L_\alpha}}{1+ \sqrt{L_\alpha}} 
\right) = \frac{-\alpha^2}{  L_\alpha + \sqrt{L_\alpha}}.
\end{align*}
Plugging the scalar $\frac{-\alpha^2}{  L_\alpha + \sqrt{L_\alpha}}$ into $G^{-\tfrac{1}{2}}(\bx)$ yields an expression that remains numerically stable as $\norm{\nabla\ell}^2\to 0$.
% 
\paragraph{The computation of $G^{\tfrac{1}{2}}(x)$} 
For convenience take $y = \alpha\nabla\ell(x)$. Then the metric is 
$G(y) = I + y y^\top$. Let us assume the square root is of the form
 $G^{\tfrac{1}{2}}(y) = I + \lambda y y^\top$. Let us formulate a quadratic equation for $\lambda$: 
\begin{align*}
    G^{\tfrac{1}{2}}(y) G^{\tfrac{1}{2}}(y) &=  I + y y^\top\\
    I + 2 \lambda yy^\top + \lambda^2 \norm{y}^2yy^\top &= I + y y^\top \\
    0 &= \left(
        1-2 \lambda - \lambda^2 \norm{y}^2
    \right)  y y^\top.
\end{align*}
The solutions of the quadratic equation are
\begin{equation*}
\lambda = \frac{-1 \pm \sqrt{1 + \|y\|^2}}{\|y\|^2}.
\end{equation*}
Let us simplify $\frac{-1 + \sqrt{1 + \|y\|^2}}{\|y\|^2}$, multiply by its conjugate
\begin{align*}
    \frac{  \sqrt{1 + \|y\|^2}-1}{\|y\|^2}
    \left(\frac{  \sqrt{1 + \|y\|^2}+1}{\sqrt{1 + \|y\|^2}+1}\right)  = 
    \frac{\norm{y}^2}{\norm{y}^2\left(\sqrt{1 + \|y\|^2} + 1\right)} =
    \frac{1}{\sqrt{1 + \|y\|^2} + 1} .
\end{align*}
Substitute $y = \alpha\nabla\ell(\bx)$ and we obtain the result
\begin{equation*}
    G^{1/2}(x)= I + \frac{\alpha^2}{\sqrt{1 + \alpha^2\|\nabla\ell(\bx)\|^2} + 1} \nabla\ell\nabla\ell^\top.
\end{equation*}

\paragraph{Christoffel symbols of the Monge metric}
The Christoffel symbols associated to the Monge metric (derivation in Section~\ref{sec:mon} and \citet{Hartmann2022}) are
\begin{align*}
    \Gamma^k(x) &= \frac{\alpha^2}{1+\alpha^2\norm{\nabla\ell}^2}\nabla^2\ell \partial_k \ell,
\end{align*}
and the geodesic equations read
\begin{align*}
    \dot \bx &= \bbv, \\
    \dot \bbv &= -\frac{\alpha^2}{L_\alpha}\norm{\bbv}^2_{\nabla^2\ell} \nabla\ell.
\end{align*}

\paragraph{Christoffel symbols of the Inverse Monge metric} The Christoffel symbols associated to the inverse Monge metric (derivation in Section~\ref{sec:invmon}), written with the auxiliary function $f(x) = -\frac{1}{L_\alpha}$, are
\begin{equation*}
    \Gamma^k = \frac{\alpha^2}{2} \Big[
L_\alpha \left( \nabla f \nabla \ell^\top + \nabla \ell \nabla f^\top + 2f \nabla^2 \ell \right) \partial_k \ell
-  \nabla \ell \nabla \ell^\top \partial_k f
- \alpha^2 \inp{\nabla \ell}{\nabla f} \nabla \ell \nabla \ell^\top \partial_k \ell 
\Big],
\end{equation*}
and the geodesic equations read
\begin{align*}
    \dot \bx &= \bbv, \\
    \dot \bbv &= - \frac{\alpha^2}{2} \Big[
2L_\alpha \left(         
        \inp{v}{\nabla f} \inp{\nabla\ell}{v} +         
        f\norm{v}^2_{\nabla^2\ell}
    \right) 
        \nabla \ell
- \inp{\nabla\ell}{v}^2 \nabla f
- \alpha^2 \inp{\nabla \ell}{\nabla f} \inp{\nabla\ell}{v}^2
\nabla \ell 
\Big].
\end{align*}

\subsection{Monge metric: Christoffel symbols derivation} \label{sec:mon}
For completeness let us do an alternative derivation of the Christoffel symbols from the one found in \citet{Hartmann2022}. 
Take the auxiliary function $f(x) = -\frac{1}{L_\alpha}$, where $L_\alpha = 1 + \alpha^2 \norm{\nabla\ell}^2$. The metric and inverse components are:
\begin{align*}
    g_{ij} &= \delta_{ij} + \alpha^2 \partial_i \ell\partial_j \ell,\\
    g^{km} &= \delta_{km} + \alpha^2 f(x) \partial_k \ell\partial_m \ell.
\end{align*}
The derivatives of the metric are:
\begin{align*}
    \partial_i g_{mj} &= \alpha^2\left( 
    \partial_{im} \ell\partial_j \ell +  \partial_{m} \ell\partial_{ij} \ell \right),\\
    \partial_j g_{im} &= \alpha^2\left( 
     \partial_{ij} \ell\partial_m \ell +  \partial_{i} \ell\partial_{jm} \ell     \right),\\ 
    \partial_m g_{ij} &= \alpha^2\left( 
     \partial_{im} \ell\partial_j \ell + \partial_{i} \ell\partial_{jm} \ell     \right).
\end{align*}
The Christoffel symbols read, 
\begin{align*}
    \Gamma^k_{ij} &= \frac{1}{2}g^{km}\left(
        \partial_i g_{mj} + 
        \partial_j g_{im} -
        \partial_m g_{ij}
    \right) \\
    &= \frac{\alpha^2}{2}g^{km}\left(             
             2  \partial_{m} \ell\partial_{ij} \ell
    \right) \\
    &= \alpha^2 \sum_m \left(
        \delta_{km} + \alpha^2 f(x)  \partial_k \ell\partial_m \ell
    \right)        
               \partial_{m} \ell\partial_{ij} \ell
     \\
    & = \alpha^2 \left(
        \partial_{k} \ell\partial_{ij} \ell
         + \alpha^2 f(x) \sum_m (\partial_m \ell)^2 \partial_k \ell \partial_{ij} \ell
    \right)\\
    &=  \alpha^2 \partial_{k} \ell\partial_{ij} \ell \left(
    1 -  \tfrac{\alpha^2 \norm{\nabla\ell}^2}{1+\alpha^2 \norm{\nabla\ell}^2}
        \right)\\
    &= \frac{\alpha^2}{1+\alpha^2 \norm{\nabla\ell}^2} \partial_{k} \ell\partial_{ij} \ell.
\end{align*}
Thus, the Christoffel symbols are $\Gamma^k_{ij} = \frac{\alpha^2}{L_\alpha} \partial_{k} \ell\partial_{ij} \ell$.
Writing in matrix form $\Gamma^k$ of size $D\times D$ with components $[\Gamma^k]_{ij} = \Gamma^k_{ij}$,
\begin{equation*}
\Gamma^k = \frac{\alpha^2}{1+\alpha^2 \norm{\nabla\ell}^2} \nabla^2\ell \partial_{k} \ell.
\end{equation*}
Let us compute $\norm{\bbv}^2_{\Gamma^k}$, which appears in the geodesic equations,
\begin{align*}
    v^\top \Gamma^k v &= 
    \frac{\alpha^2}{1+\alpha^2 \norm{\nabla\ell}^2} \norm{v}^2_{\nabla^2\ell }\partial_{k}\ell. 
\end{align*}
The geodesic equations read:
\begin{align*}
    \dot \bx &= \bbv, \\
    \dot \bbv &= - \frac{\alpha^2}{L_\alpha} \norm{\bbv}^2_{\nabla^2\ell }\nabla \ell.  
\end{align*}


\subsection{Inverse Monge metric: Christoffel symbols derivation} \label{sec:invmon}
Again the auxiliary function is $f(x) = -\frac{1}{L_\alpha}$, where $L_\alpha = 1 + \alpha^2 \norm{\nabla\ell}^2$; the metric and inverse components are
\begin{align*}
    g_{ij} &= \delta_{ij} + f(x)\alpha^2 \partial_i \ell\partial_j \ell,\\
    g^{km} &= \delta_{km} + \alpha^2 \partial_k \ell\partial_m \ell.
\end{align*}
The derivatives of the metric are (we mark with the same color repeating terms)
\begin{align*}
    \partial_i g_{mj} &= \alpha^2\left(\partial_i f \partial_m \ell\partial_j \ell + 
    \textcolor{blue}{f \partial_{im} \ell\partial_j \ell} + \textcolor{magenta}{f \partial_{m} \ell\partial_{ij} \ell} \right),\\
    \partial_j g_{im} &= \alpha^2\left(\partial_j f \partial_i \ell\partial_m \ell + 
    \textcolor{magenta}{f \partial_{ij} \ell\partial_m \ell} + \textcolor{red}{f \partial_{i} \ell\partial_{jm} \ell}     \right),\\ 
    \partial_m g_{ij} &= \alpha^2\left(\partial_m f \partial_i \ell\partial_j \ell + 
    \textcolor{blue}{f \partial_{mi} \ell\partial_j \ell} + \textcolor{red}{f \partial_{i} \ell\partial_{mj} \ell  }   \right).
\end{align*}
Let us compute the Christoffel symbols of the first kind (blue and red terms cancel out, magenta terms add to each other)
\begin{align*}
   \Gamma_{mij} &= \frac{1}{2}\left( \partial_i g_{mj} + 
        \partial_j g_{im} -
        \partial_m g_{ij} \right) \\
        &=
        \frac{\alpha^2}{2}\left(\partial_i f \partial_m \ell\partial_j \ell + 
            \partial_j f \partial_i \ell\partial_m \ell - 
             \partial_m f \partial_i \ell\partial_j \ell + 
             \textcolor{magenta}{2 f \partial_{ij} \ell\partial_{m} \ell}
             \right).
\end{align*}
% 
The Christoffel symbols of the second kind read
\begin{align*}
    \Gamma^k_{ij} &= \frac{1}{2}g^{km}\left(
        \partial_i g_{mj} + 
        \partial_j g_{im} -
        \partial_m g_{ij}
    \right) \\
    &= \frac{\alpha^2}{2} \sum_m \left(
        \delta_{km} + \alpha^2 \partial_k \ell\partial_m \ell
    \right)
    \left(
            \partial_i f \partial_m \ell\partial_j \ell + 
            \partial_j f \partial_i \ell\partial_m \ell -
             \partial_m f \partial_i \ell\partial_j \ell + 
             2 f  \partial_{ij} \ell \partial_{m} \ell
    \right) \\    
    & = \frac{\alpha^2}{2} \bigg(
        \partial_i f \partial_k \ell\partial_j \ell + 
        \partial_j f \partial_i \ell\partial_k \ell -
         \partial_k f \partial_i \ell\partial_j \ell + 
         2 f  \partial_{ij} \ell \partial_{k} \ell\\
         & \quad+ \alpha^2 \partial_k \ell 
            \left(
                \partial_i f\norm{\nabla\ell}^2\partial_j \ell + 
                \partial_j f \partial_i\ell \norm{\nabla\ell}^2  -
                \inp{\nabla f}{ \nabla \ell} \partial_i \ell\partial_j \ell + 
                2 f  \partial_{ij} \ell\norm{\nabla\ell}^2 
            \right)
    \bigg)\\
    & = \frac{\alpha^2}{2}   \bigg( 
            \partial_k \ell \left(
        L_\alpha  \partial_i f \partial_j \ell
        + L_\alpha \partial_i \ell \partial_j f
        -  \alpha^2 \inp{\nabla f}{ \nabla \ell}
        \partial_i \ell\partial_j \ell
        + 2  L_\alpha f(x) \partial_{ij} \ell 
        \right) - \partial_k f\partial_i \ell\partial_j \ell
    \bigg).
\end{align*}
Thus, the Christoffel symbols are:
\begin{equation*}
  \Gamma^k_{ij} = \frac{\alpha^2}{2}  \Big[
    \partial_k \ell \left(
L_\alpha 
    \left(  
           \partial_i f \partial_j \ell
           +\partial_i \ell \partial_j f
           +2f(x) \partial_{ij} \ell 
    \right) 
 - 
  \alpha^2 \inp{\nabla f}{ \nabla \ell}\partial_i \ell\partial_j \ell
    \right)
    - \partial_k f\partial_i \ell\partial_j \ell
\Big].  
\end{equation*}
Written in matrix form
\begin{equation*}
  \Gamma^k = \frac{\alpha^2}{2}
      \Big[
      \partial_k \ell  \left(
        L_\alpha 
        \left(  
               \nabla f \nabla \ell^\top
               +\nabla \ell \nabla f^\top
               +2f(x) \nabla^2 \ell 
        \right) 
     - 
      \alpha^2 \inp{\nabla f}{ \nabla \ell}\nabla \ell \nabla\ell ^\top
      \right)
      - \partial_k f\nabla \ell \nabla\ell ^\top
    \Big].
\end{equation*}
Let us compute $\norm{v}^2_{\Gamma^k}$ which appears in the geodesic equations
\begin{align*}
    v^\top \Gamma^k v &= 
    \frac{\alpha^2}{2}   \Big[
    \partial_k\ell\left(
2L_\alpha \left(         
        \inp{v}{\nabla f} \inp{\nabla\ell}{v} +         
        f\norm{v}^2_{\nabla^2\ell}
    \right) 
        -  \alpha^2 \inp{\nabla f}{ \nabla \ell}
        \inp{\nabla\ell}{v}^2
        \right)
    - \partial_k f \inp{\nabla\ell}{v}^2
\Big].
\end{align*}
Then the geodesic equations read,
\begin{align*}
    \dot \bx &= \bbv, \\
    \dot \bbv &= - \frac{\alpha^2}{2} \Big[
    \left(
2L_\alpha \left(         
        \inp{v}{\nabla f} \inp{\nabla\ell}{v} +         
        f\norm{v}^2_{\nabla^2\ell}
    \right) 
        - \alpha^2 \inp{\nabla f}{ \nabla \ell}
        \inp{\nabla\ell}{v}^2
        \right)\nabla \ell 
         -\inp{\nabla\ell}{v}^2\nabla f
\Big].
\end{align*}
Where the gradient of $f$ is:
\begin{equation*}
    \nabla f = \frac{2\alpha^2}{L^2_\alpha}\nabla^2\ell \nabla\ell.
\end{equation*}

\subsection{Target Distributions} \label{app:toydist}
The Funnel, Squiggle and Rosenbrock distributions are smooth bijective transformations from a $Z\sim \cN(\mu, \Sigma)$ to $X = f(Z)$. We use the shorthand notation $x = x(z) $ and $z =z(x)$.
\paragraph{The Funnel distribution}
$p(x)  = \mathcal{N}(x_D|0,\sigma^2) \mathcal{N}(x_{1:{D-1}}|\mu, e^{x_D}I_{D-1} )$.  In this case $Z\sim \cN(0,I)$. The choice of parameters is $\sigma=3$ and $\mu=0$,
\begin{equation*}
    x = \begin{bmatrix}
    e^{\sigma z_D /2} z_{1:D-1}\\
    \sigma z_D
    \end{bmatrix},
    \quad
    \pdv{x}{z} = \begin{bmatrix}
    e^{\sigma z_D /2} I_{D-1} & \frac{\sigma}{2} e^{\sigma z_D /2} z_{1:D-1} \\
    0 & \sigma
\end{bmatrix},
\quad 
    \pdv{z}{x} = 
    \begin{bmatrix}
        e^{- x_D /2} I_{D-1}  & -\tfrac{1}{2}e^{- x_D /2}x_{1:D-1} \\
        0 &  \tfrac{1}{\sigma}
    \end{bmatrix}.
\end{equation*}
The log determinant of the inverse Jacobian is $\log \det(\pdv{z}{x}) =  -(D-1)x_D/2 - \log\sigma$.


\paragraph{The hybrid Rosenbrock distribution}
For simplicity here we show the two dimensional case, the full distribution can be consulted in \citet{Pagani2022}.
The two dimensional density is: $ p(x) = \cN(x_{1}|a, \frac{1}{2})
\cN(x_{2}|x_{1}^2,\frac{1}{2b})$.  In this case $Z\sim \cN(0,I)$. The choice of parameters is $a=1$, $b=100$ and block size of $3$ and $\lfloor \tfrac{D-1}{3}\rfloor$ total blocks,
\begin{equation*}
    x = 
    \begin{bmatrix}
    a + \tfrac{1}{\sqrt{2}}z_1 \\
     (a + \tfrac{1}{\sqrt{2}}z_1)^2 + \tfrac{1}{\sqrt{2b}}z_2
    \end{bmatrix},
    \quad
    \pdv{x}{z} = 
    \begin{bmatrix}
    \frac{1}{\sqrt{2}} & 0 \\
      \sqrt{2} a + z_1 & \frac{1}{\sqrt{2b}}
    \end{bmatrix},
    \quad
    \pdv{z}{x} = 
    \begin{bmatrix}
    \sqrt{2} & 0 \\
    -2\sqrt{2b}x_1 & \sqrt{2b}
    \end{bmatrix}.
\end{equation*}

\paragraph{The Squiggle distribution}
The density is $p(x) = \cN(z(x)|\mu, \Sigma)\,|\det \pdv{z}{x}|$, where $Z\sim \cN(\mu,\Sigma)$. 
The choice of parameters is $a = 1.5$, $\mu=0$, $\Sigma = \diag(5,\tfrac{1}{2},\ldots,\tfrac{1}{2})$,
\begin{equation*}
    x = 
    \begin{bmatrix}
    z_1 \\
     z_{2:D} -\sin(a z_1)
     \end{bmatrix} \quad 
     \pdv{x}{z} =
    \begin{bmatrix}
    1 & 0 \\
      -a \cos(a z_1) & I
    \end{bmatrix},
    \quad
    \pdv{z}{x} = 
    \begin{bmatrix}
    1 & 0 \\
    a \cos(a x_1) & I
    \end{bmatrix}.
\end{equation*}
The log determinant of the inverse Jacobian is $\log \det(\pdv{z}{x}) = 0$.

For these three toy problems the Fisher Information follows from the transformation rule of Riemannian metrics
\begin{equation*}
    G(x) = \pdv{z}{x}^\top  \Sigma^{-1} \pdv{z}{x}.    
\end{equation*}


\paragraph{Location and Scale parameters for Complex Distributions}
In Experiment~\ref{sec:exp3} we consider the mixture of two complex distributions. We introduce a location and scale parameter for each components of the mixture. 
The Funnel, Squiggle and Rosenbrock distributions are smooth bijective transformations from a $Z\sim \cN(\mu, \Sigma)$ to $Y = f(X)$.
Let us add a location and scale parameters by an additional transformation $g(Y) = X$, where $g(y) = \Sigma_{y} y + \mu_{y}$
\begin{equation*} 
     Z \overset{f}{\xmapsto{\hspace{0.6cm}}} Y \overset{g}{\xmapsto{\hspace{0.6cm}}} X.
\end{equation*}
The change of variable formula for the composition $g\circ f$ gives 
\begin{align*}
    p_X(x) &= p_Z \left( (g\circ f)^{-1}(x) \right) \left| \det\pdv{x}{z}\right| \\
        &=  p_Z \left( (f^{-1} \circ g^{-1})(x) \right) \left| \det \pdv{x}{y}\right|\left| \det\pdv{y}{z}\right|
\end{align*}
Plug in $g^{-1}(x) = \Sigma_y^{-1/2} (x-\mu_y) $, $\det \pdv{y}{z} = \det \Sigma^{-1/2}$, and $p_Z(z) = \cN(z|\mu, \Sigma)$, we obtain the expression of the density 
% with a location and scale parameters $(\mu_y, \Sigma_y)$:
\begin{equation*}
    p_X(x) = \cN\left( f^{-1} (\Sigma^{1/2} (x-\mu_y)) \bigg\vert \mu, \Sigma \right) \left| \det \pdv{x}{y}\right|\left| \det \Sigma_y^{-1/2} \right|.
\end{equation*}
Where $(\mu_y, \Sigma_y)$ are the location and scale parameters of the component of the mixture distribution.
% \paragraph{The  Allen-Cahn Field System}
% We consider the stochastic Allen–Cahn model \citep{Berglund2017} used as a benchmark in  \citet{Cabezas2024}. The log-density is:
% \begin{equation}
%     \log p(x) = -\beta \left( \frac{a}{2\Delta s} \sum_{i=1}^{D+1} (x_i - x_{i-1})^2 + \frac{b \Delta s}{4} \sum_{i=1}^{D} (1 - x_i^2)^2 \right). \label{eq:fieldsystem}
% \end{equation}
%  Similar to previous work we consider parameters $\Delta s = \tfrac{1}{D}$, and boundary conditions $x_0=x_{D=1}=0$. The values $a=0.1$ and $b=\tfrac{1}{a}$ are chosen to ensure bimodality for each $x_i$. We set the dimension $D=16$. 
 
%  An inspection of the density of the model (Eq.~\eqref{eq:fieldsystem}) reveals that the global maxima are $(1,..,1)$ and $(-1,..,-1)$, at these values the first terms in the sum  cancel out and the second term is zero. 
%  Note that the second term induces bimodality for $x_i = \pm 1$. Then combinations of values of the form
%  $(\pm 1,..,\pm 1)$ are local maxima, since the second term is zero but the first terms of the sum do not cancel out. Thus the problem has a total of $2^D$ maxima. 


\paragraph{The Allen-Cahn Field System}
We consider the stochastic Allen–Cahn model \citep{Berglund2017} used as a benchmark in \citet{Cabezas2024}. The log-density is:
\begin{equation}
    \log p(x) = -\beta \left( \frac{a}{2\Delta s} \sum_{i=1}^{D+1} (x_i - x_{i-1})^2 + \frac{b \Delta s}{4} \sum_{i=1}^{D} (1 - x_i^2)^2 \right). \label{eq:fieldsystem}
\end{equation}
We adopt the  parameter choices  $\Delta s = \tfrac{1}{D}$ and boundary conditions $x_0 = x_{D+1} = 0$, and the constants $a=0.1$ and $b=\tfrac{1}{a}$ ensure that the double-well potential induces bimodality in each component $x_i$, and we fix $D = 16$.

\paragraph{Analysis of multimodality}
To understand the maxima of this density, we analyze the two terms in the log-density function (Eq.~\ref{eq:fieldsystem}):
\begin{enumerate}
    \item The first term, $ 
    \sum_{i=1}^{D+1} (x_i - x_{i-1})^2$, penalizes differences between adjacent components, encouraging all components to have similar values.
    \item The second term, $\sum_{i=1}^{D} (1 - x_i^2)^2$, is minimized when $x_i = \pm 1$.
\end{enumerate}

The global maxima occur at $(1,\ldots,1)$ and $(-1,\ldots,-1)$ because these configurations minimize both terms simultaneously: all components have the same value (satisfying the first term) and each component equals $\pm 1$ (satisfying the second term).
    
Local maxima occur at all other combinations of $\pm 1$ values (i.e., at points $(\pm 1,\ldots,\pm 1)$ with mixed signs) because these configurations still satisfy the second term perfectly, but incur penalties from the first term due to sign changes between adjacent components.

This creates $2^D$ local maxima, making the problem highly multimodal, with the two homogeneous configurations being global maxima.


\paragraph{Kernel Stein Discrepancy}
Let $\pi$ and $\nu$ be two probability measures. We estimate the Kernel Stein Discrepancy with the biased but non-negative V-estimator. Given a sample $x_i\sim \nu$ for $i=1,\ldots,n$,
\begin{equation*}
    \widehat{\mathrm{KSD}}^2_{k, V}(\pi, \nu) = \frac{1}{n^2} \sum_{i=1}^n \sum_{j=1}^n k_\pi(x_i, x_j).
\end{equation*}
Denote by $p(x)$ the density of the measure $\pi$, then,
\begin{equation*}
k_\pi(x, x') = \nabla_x \cdot \nabla_{x'} k(x, x') + \nabla_x k(x, x') \cdot \nabla_{x'} \log p(x') + \nabla_{x'} k(x, x') \cdot \nabla_x \log p(x) + k(x, x') \nabla_x \log p(x) \cdot \nabla_{x'} \log p(x'),
\end{equation*}
where we choose the inverse multiquadric kernel $k(x,x') = (1+(x-x')^\top(x-x'))^\beta$ for $\beta=-\tfrac{1}{2}$, following the choices made by \citet{Cabezas2024}.

