\documentclass[accepted]{uai2022} % after acceptance, for a revised
                                    % version; also before submission to
                                    % see how the non-anonymous paper
                                    % would look like
%% There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2022} % ptmx math instead of Computer
                                         % Modern (has noticable issues)
% \documentclass[mathfont=newtx]{uai2022} % newtx fonts (improves upon
                                          % ptmx; less tested, no support)
% NOTE: Only keep *one* line above as appropriate, as it will be replaced
%       automatically for papers to be published. Do not make any other
%       change above this note for an accepted version.

%% Choose your variant of English; be consistent
\usepackage[american]{babel}
% \usepackage[british]{babel}

%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
\usepackage{mathtools} % amsmath with fixes and additions
% \usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{tikz} % nice language for creating drawings and diagrams


\usepackage{microtype}
\usepackage{graphicx}
\usepackage{subfigure}
\usepackage{booktabs} % for professional tables

% hyperref makes hyperlinks in the resulting PDF.
% If your build breaks (sometimes temporarily if a hyperlink spans a page)
% please comment out the following usepackage line and replace
% \usepackage{icml2022} with \usepackage[nohyperref]{icml2022} above.
%\usepackage{hyperref}

% For theorems and such
\usepackage{amsmath}
\usepackage{amssymb}
\usepackage{mathtools}
\usepackage{amsthm}

% if you use cleveref..
\usepackage[capitalize,noabbrev]{cleveref}
\newcommand{\RR}{\mathbb{R}}
\newcommand{\CC}{\mathbb{C}}
\newcommand{\br}{\boldsymbol{r}}
\newcommand{\bx}{\boldsymbol{x}}
\newcommand{\bz}{\boldsymbol{z}}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% THEOREMS
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\theoremstyle{plain}
\newtheorem{theorem}{Theorem}[section]
\newtheorem{proposition}[theorem]{Proposition}
\newtheorem{lemma}[theorem]{Lemma}
\newtheorem{corollary}[theorem]{Corollary}
\theoremstyle{definition}
\newtheorem{definition}[theorem]{Definition}
\newtheorem{assumption}[theorem]{Assumption}
\theoremstyle{remark}
\newtheorem{remark}[theorem]{Remark}

% Todonotes is useful during development; simply uncomment the next line
%    and comment out the line below the next line to turn off comments
%\usepackage[disable,textsize=tiny]{todonotes}
\usepackage[textsize=tiny]{todonotes}

\usepackage{xr}
%\zxrsetup{tozreflabel=false, toltxlabel=true, verbose}
\externaldocument{liu_286}

\title{PathFlow: A Normalizing Flow Generator that Finds Transition Paths\\(Supplementary Material)}

% The standard author block has changed for UAI 2022 to provide
% more space for long author lists and allow for complex affiliations
%
% All author information is automatically removed by the class for the
% anonymous submission version of your paper, so you can already add your
% information below.
%
% Add authors
\author{Tianyi~Liu}
\author{Weihao~Gao}
\author{Zhirui~Wang}
\author{Chong Wang}
% Add affiliations after the authors
\affil{%
   ByteDance Inc.
}

\begin{document}
\onecolumn
\maketitle
\appendix
\section{Proof of Theorem~\ref{thm:rate}}\label{appendix:thm_rate}
For any function $F \in \{M_{ij}, \nabla_i U, \nabla_l M_{ij}, \nabla^2_{ij} U\}$, the error of the estimator can be decomposed by the triangle inequality as follows,
\begin{align*}
    |F^{(T,k)}(\bz) - F(\bz)| \leq |F^{(k)}(\bz) - F(\bz)| + |F^{(T,k)}(\bz) - F^{(k)}(\bz)|.
\end{align*}
The first term is introduced by the finiteness of $k$. The second term is introduced by finiteness of the time $T$ of restrained dynamics. The following two lemmas provide upper bounds for finite-$k$ error and finite-$T$ error respectively.

\begin{lemma}[Error by finite $k$]\label{lem:k}
For any function $g(\br)$, consider the two functionals $\mathcal{I}[g]$ and $\mathcal{I}^{(k)}[g]$ defined as follows
\begin{align*}
    \mathcal{I}[g](\bz) &= \int_{\mathbb{R}^{3D}} g(\br) e^{-\beta V(\br)} \prod_{j=1}^N \delta(x_j(\br) - z_j) d\br, \\
    \mathcal{I}^{(k)}[g](\bz) &= (\frac{\beta k}{2\pi})^{N/2} \int_{\mathbb{R}^{3D}} g(\br) e^{-\beta (V(\br) + \frac{k}{2} \sum_{j=1}^N (x_j(\br)-z_j)^2)}  d\br.
\end{align*}
Their difference is bounded by
\begin{align*}
    |\mathcal{I}[g](\bz) - \mathcal{I}^{(k)}[g](\bz)| \leq \frac{1}{2\beta k} Tr[\nabla_z^2 \mathcal{I}[g](\bz)].
\end{align*}
Moreover, the difference of the derivatives of $\mathcal{I}[g](\bz)$ and $\mathcal{I}^{(k)}[g](\bz)$ is bounded by
\begin{align*}
    |\frac{\partial^p}{\partial z_{i_1}, \dots, \partial z_{i_p}} \mathcal{I}[g](\bz) - \frac{\partial^p}{\partial z_{i_1}, \dots, \partial z_{i_p}} \mathcal{I}^{(k)}[g](\bz)| \leq \frac{1}{2\beta k} \frac{\partial^p}{\partial z_{i_1}, \dots, \partial z_{i_p}} Tr[\nabla_z^2 \mathcal{I}[g](\bz)].
\end{align*}
\end{lemma}

\begin{proof}
The proof of the upper bound of $|\mathcal{I}[g](\bz) - \mathcal{I}^{(k)}[g](\bz)|$ follows~\cite{maragliano2006string}. We generalize the proof to the upper bound of the derivatives. Consider the Fourier transform of $\mathcal{I}[g](\bz)$ as follows
\begin{align*}
    \hat{G}(\zeta) &= \int_{\CC^N} e^{-i \zeta \cdot \bz} \int_{\RR^{3D}} g(\br) e^{-\beta V(\br)} \prod_{j=1}^N \delta(x_j(\br) - z_j) d\br d\bz = \int_{\RR^{3D}} g(\br) e^{-\beta V(\br)} \int_{\CC^N} e^{-i \zeta \cdot \bz} \prod_{j=1}^N \delta(x_j(\br) - z_j) d\bz d\br \\
    &= \int_{\RR^{3D}} g(\br) e^{-\beta V(\br)} e^{-i \zeta \cdot \bx(\br)} d\br.
\end{align*}
The Fourier transform of $\mathcal{I}^{(k)}[g](\bz)$ is
\begin{align*}
    \hat{G^{(k)}}(\zeta) &= (\frac{\beta k}{2\pi})^{N/2} \int_{\CC^N} e^{-i \zeta \cdot \bz} \int_{\RR^{3D}} g(\br) e^{-\beta (V(\br)  + \frac{k}{2} \sum_{j=1}^N (x_j(\br)-z_j)^2)} d\br d\bz \\
    &= (\frac{\beta k}{2\pi})^{N/2} \int_{\RR^{3D}} g(\br) e^{-\beta V(\br)} \int_{\CC^N} \exp\{-i \zeta \cdot \bz - \frac{\beta k}{2} \sum_{j=1}^N (x_j(\br)-z_j)^2\}  d\bz d\br \\
    &= (\frac{\beta k}{2\pi})^{N/2} \int_{\RR^{3D}} g(\br) e^{-\beta V(\br)} \int_{\CC^N} \exp\{- \frac{\beta k}{2} \sum_{j=1}^N (z_j - x_j(\br) + \frac{i}{\beta k} \zeta_j)^2 - i \zeta \cdot \bx(\br) - \frac{|\zeta|^2}{2\beta k}\}  d\bz d\br \\
    &= \int_{\RR^{3D}} g(\br) e^{-\beta V(\br)} e^{-i \zeta \cdot \bx(\br) - \frac{|\zeta|^2}{2 \beta k}} d\br = e^{-\frac{|\zeta|^2}{2\beta k}} \hat{G}(\zeta).
\end{align*}
By applying the inverse Fourier transform, we have
\begin{align*}
    &|\mathcal{I}[g](\bz) - \mathcal{I}^{(k)}[g](\bz)| = \big| \int_{\CC^N} e^{i \zeta \cdot \bz} (\hat{G}(\zeta) - \hat{G^{(k)}}(\zeta)) d \zeta \big| \leq \int_{\CC^N} e^{i \zeta \cdot \bz} \hat{G}(\zeta) |1 - e^{-\frac{|\zeta|^2}{2\beta k}}| d \zeta \\ 
    & \leq \int_{\CC^N} e^{i \zeta \cdot \bz} \hat{G}(\zeta) \frac{|\zeta|^2}{2\beta k} d \zeta = \frac{1}{2 \beta k} Tr[\nabla_z^2 \mathcal{I}[g](\bz)].
\end{align*}
To generalize the upper bound to the derivatives, notice that the Fourier transforms of the derivatives $\frac{\partial^p}{\partial z_{i_1}, \dots, \partial z_{i_p}} \mathcal{I}[g](\bz)$ and $\frac{\partial^p}{\partial z_{i_1}, \dots, \partial z_{i_p}} \mathcal{I}^{(k)}[g](\bz)$ are $i^p \zeta_{i_1} \cdots \zeta_{i_p} \hat{G}(\zeta)$ and $i^p \zeta_{i_1} \cdots \zeta_{i_p} \hat{G^{(k)}}(\zeta)$, respectively.
% \begin{align*}
%   ((\hat{\frac{\partial^p}{\partial z_{i_1}, \dots, \partial z_{i_p}} G})(\zeta) &= i^p \zeta_{i_1} \dots \zeta_{i_p} \hat{G}(\zeta) \\
%     (\hat{\frac{\partial^p}{\partial z_{i_1}, \dots, \partial z_{i_p}} G^{(k)}})(\zeta) &= i^p \zeta_{i_1} \dots \zeta_{i_p} \hat{G^{(k)}}(\zeta)
% \end{align*}
% We still have the similar relation
% \begin{align*}
%     (\hat{\frac{\partial^p}{\partial z_{i_1}, \dots, \partial z_{i_p}} G^{(k)}})(\zeta) = e^{-\frac{|\zeta|^2}{2 \beta k}} (\hat{\frac{\partial^p}{\partial z_{i_1}, \dots, \partial z_{i_p}} G})(\zeta)
% \end{align*}
Similarly, applying the inverse Fourier transform, we obtain
\begin{align*}
    &|\frac{\partial^p}{\partial z_{i_1}, \dots, \partial z_{i_p}} \mathcal{I}[g](\bz) - \frac{\partial^p}{\partial z_{i_1}, \dots, \partial z_{i_p}} \mathcal{I}^{(k)}[g](\bz)| \leq \int_{\CC^N} e^{i \zeta \cdot \bz} i^p \zeta_{i_1} \cdots \zeta_{i_p} \hat{G}(\zeta) |1 - e^{-\frac{|\zeta|^2}{2 \beta k}}| d \zeta \\
    &\leq \int_{\CC^N} e^{i \zeta \cdot \bz} i^p \zeta_{i_1} \cdots \zeta_{i_p} \hat{G}(\zeta) \frac{|\zeta|^2}{2 \beta k} d \zeta = \frac{1}{2 \beta k}\frac{\partial^p}{\partial z_{i_1}, \dots, \partial z_{i_p}} Tr[\nabla^2_z \mathcal{I}[g](\bz)].
\end{align*}
\end{proof}

\begin{lemma}[Error by finite $T$~\citep{maragliano2006string}]\label{lem:T}
For any function $f(\br, \bz)$, consider the true average functional and the time average estimator defined as follows
\begin{align*}
    \mathcal{A}^{(k)}[f](\bz) = \int_{\mathbb{R}^{3N}} f(\br, \bz) p_k(\br, \bz) d\br, \,\\
    \mathcal{A}^{(T, k)}[f](\bz) = \frac{1}{T} \int_0^T f(\br(t), \bz) dt.
\end{align*}
As $T \to \infty$, their difference is given by 
\begin{align*}
     \mathcal{A}^{(T, k)}[f](\bz) - \mathcal{A}^{(k)}[f](\bz) \to \sqrt{\frac{\tau_k[f](\bz)}{T}} \xi_k[f](\bz),
\end{align*}
where $\xi_k[f](\bz)$ is a Gaussian variable with mean zero and variance 
\begin{align*}
    Var[\xi_k[f](\bz)] = \int_{\mathbb{R}^{3N}} (f(\br, \bz) - \mathcal{A}^{(k)}[f](\bz))^2 p_k(\br, \bz) d\br,
\end{align*}
and $\tau_k[f](\bz)$ is given by
\begin{align*}
    \tau_k[f](\bz) = \frac{1}{Var[\xi_k[f](\bz)]} \int_{t=0}^T \int_{\mathbb{R}^{3N}} \mathbb{E}[f(\br(t), \bz) - \mathcal{A}^{(k)}[f](\bz)](f(\br, \bz) - \mathcal{A}^{(k)}[f](\bz))^2 p_k(\br, \bz) d\br dt.
\end{align*}
Moreover, as $k$ goes to infinity, $Var[\xi_k[f](\bz)] = Var[\xi[f](\bz)] + O(1/k) $ and $\tau_k[f](\bz) = \tau[f](\bz) + O(1/\sqrt{k})$, where $\xi[f]$ and $\tau[f]$ are defined by replacing $\mathcal{A}^{(k)}[f]$ by its limiting functional
\begin{align*}
    \mathcal{A}[f](\bz) = Z^{-1} e^{\beta U(\bz)} \int_{\mathbb{R}^{3N}} f(\br, \bz) e^{-\beta V(\br)} \prod_{j=1}^N \delta(z_j - x_j(\br)) d\br.
\end{align*}
\end{lemma}

Now we are ready to give upper bounds for the errors of the estimators of $\nabla U(\bz)$, $M(\bz)$, $\nabla M(\bz)$ and $\nabla^2 U(\bz)$ by applying Lemmas~\ref{lem:k} and~\ref{lem:T}, respectively.

\subsection{Error of $M(\bz)$}
Define $f_{ij}(\br) = \sum_k \frac{\partial x_i(\br(\alpha))}{\partial r_k} \frac{\partial x_j(\br(\alpha))}{\partial r_k}$, and $\mathbf{1}(\br) = 1$. Then $M_{ij}(\bz)$ can be written as
\begin{align*}
    M_{ij}(\bz) &= Z^{-1} e^{\beta U(\bz)} \int_{\RR^{3D}} \sum_k \frac{\partial x_i(\br(\alpha))}{\partial r_k} \frac{\partial x_j(\br(\alpha))}{\partial r_k} e^{-\beta V(\br)} \prod_{l=1}^N \delta(z_l - x_l(\br)) d\br \,\\
    &= \frac{\int_{\RR^{3D}}  f_{ij}(\br) e^{-\beta V(\br)} \prod_{l=1}^N \delta(z_l - x_l(\br)) d\br}{\int_{\RR^{3D}}   e^{-\beta V(\br)} \prod_{l=1}^N \delta(z_l - x_l(\br)) d\br} = \frac{\mathcal{I}[f_{ij}](\bz)}{\mathcal{I}[\mathbf{1}](\bz)}.
\end{align*}
Therefore, the finite-$k$ error $|M_{ij}(\bz) - M^{(k)}_{ij}(\bz)|$ is bounded by
\begin{align*}
    &|M_{ij}(\bz) - M^{(k)}_{ij}(\bz)| = |\frac{\mathcal{I}[f_{ij}](\bz)}{\mathcal{I}[\mathbf{1}](\bz)} - \frac{\mathcal{I}^{(k)}[f_{ij}](\bz)}{\mathcal{I}^{(k)}[\mathbf{1}](\bz)}| \,\\
    &\leq \frac{1}{2\beta k} \frac{\mathcal{I}[f_{ij}](\bz) Tr[\nabla^2 \mathcal{I}[\mathbf{1}](\bz)] + Tr[\nabla^2 \mathcal{I}[f_{ij}](\bz)] \mathcal{I}[\mathbf{1}](\bz)}{(\mathcal{I}[\mathbf{1}](\bz))^2} + O(\frac{1}{k^2}).
\end{align*}
The finite-$T$ error is bounded by
\begin{align*}
     &|M^{(T, k)}_{ij}(\bz) - M^{(k)}_{ij}(\bz)| = | \frac{1}{T} \int_0^T f_{ij}(\br(t)) dt - \int_{\RR^{3D}}  f_{ij}(\br) p_k(\br, \bz) d\br | \,\\
     &= |\mathcal{A}^{(T, k)}[f_{ij}](\bz) - \mathcal{A}^{(k)}[f_{ij}](\bz) |\to  \sqrt{\frac{\tau_k[f_{ij}](\bz)}{T}} \xi_k[f_{ij}](\bz).
\end{align*}





\subsection{Error of $\nabla U(\bz)$}
$\nabla_i U(\bz)$ can be written as
\begin{align*}
    \nabla_i U(\bz) &= -\beta^{-1} \nabla_i \ln\big( Z^{-1} \int_{\RR^{3D}}  \exp(-\beta V(\br)) \prod_{j=1}^N \delta(x_j(\br) - z_j) d\br\big) \,\\
    &= -\beta^{-1} \nabla_i \ln \mathcal{I}[\mathbf{1}](\bz) = -\beta^{-1} \frac{\nabla_i \mathcal{I}[\mathbf{1}](\bz)}{\mathcal{I}[\mathbf{1}](\bz)}.
\end{align*}
The finite-$k$ error is bounded by
\begin{align*}
    &|\nabla_i U(\bz) - \nabla_i U^{(k)}(\bz)| = \beta^{-1} | \frac{\nabla_i \mathcal{I}[\mathbf{1}](\bz)}{\mathcal{I}[\mathbf{1}](\bz)} - \frac{\nabla_i \mathcal{I}^{(k)}[\mathbf{1}](\bz)}{\mathcal{I}^{(k)}[\mathbf{1}](\bz)}| \,\\
    &\leq \frac{1}{2\beta^2 k} \frac{|\nabla_i \mathcal{I}[\mathbf{1}](\bz)| Tr[\nabla^2 \mathcal{I}[\mathbf{1}](\bz)] + |\nabla_i Tr[\nabla^2 \mathcal{I}[\mathbf{1}](\bz)]|\mathcal{I}[\mathbf{1}](\bz)}{(\mathcal{I}[\mathbf{1}](\bz))^2} + O(\frac{1}{k^2}).
\end{align*}
The finite-$T$ error is bounded by
\begin{align*}
    &|\nabla_i U^{(T, k)}(\bz) - \nabla_i U^{(k)}(\bz)| = | \frac{k}{T} \int_0^T (z_i - x_i(\br(t))) dt - \int_{\RR^{3D}}  k(z_i-x_i(\br)) p_k(\br, \bz) d\br | \\
    &= k |\mathcal{A}^{(T, k)}[z_i-x_i(\br)](\bz) - \mathcal{A}^{(k)}[z_i-x_i(\br)](\bz) |\to k \sqrt{\frac{\tau_k[z_i-x_i(\br)](\bz)}{T}} \xi_k[z_i-x_i(\br)](\bz).
\end{align*}





\subsection{Error of $\nabla M(\bz)$}
$\nabla_l M_{ij}(\bz)$ can be written as
\begin{align*}
    \nabla_l M_{ij}(\bz) = \nabla_l \left( \frac{\mathcal{I}[f_{ij}](\bz)}{\mathcal{I}[\mathbf{1}](\bz)}\right) = \frac{\nabla_l \mathcal{I}[f_{ij}](\bz) \mathcal{I}[\mathbf{1}](\bz) - \mathcal{I}[f_{ij}](\bz) \nabla_l \mathcal{I}[\mathbf{1}](\bz) }{(\mathcal{I}[\mathbf{1}](\bz))^2}.
\end{align*}
Therefore, the finite-$k$ error is bounded by
\begin{align*}
    &|\nabla_l M_{ij}(\bz) - \nabla_l M^{(k)}_{ij}(\bz)| \,\\
    &= |\frac{\nabla_l \mathcal{I}[f_{ij}](\bz) \mathcal{I}[\mathbf{1}](\bz) - \mathcal{I}[f_{ij}](\bz) \nabla_l \mathcal{I}[\mathbf{1}](\bz) }{(\mathcal{I}[\mathbf{1}](\bz))^2} - \frac{\nabla_l \mathcal{I}^{(k)}[f_{ij}](\bz) \mathcal{I}^{(k)}[\mathbf{1}](\bz) - \mathcal{I}^{(k)}[f_{ij}](\bz) \nabla_l \mathcal{I}^{(k)}[\mathbf{1}](\bz) }{(\mathcal{I}^{(k)}[\mathbf{1}](\bz))^2} | \,\\
    &\leq |\frac{\nabla_l \mathcal{I}[f_{ij}](\bz)}{\mathcal{I}[\mathbf{1}](\bz)} - \frac{\nabla_l \mathcal{I}^{(k)}[f_{ij}](\bz)}{\mathcal{I}^{(k)}[\mathbf{1}](\bz)}| + |\frac{\mathcal{I}[f_{ij}](\bz) \nabla_l \mathcal{I}[\mathbf{1}](\bz)}{(\mathcal{I}[\mathbf{1}](\bz))^2} - \frac{\mathcal{I}^{(k)}[f_{ij}](\bz) \nabla_l \mathcal{I}^{(k)}[\mathbf{1}](\bz)}{(\mathcal{I}^{(k)}[\mathbf{1}](\bz))^2}| \,\\
    &\leq  \frac{1}{2\beta k} \Big( \frac{|\nabla_l \mathcal{I}[f_{ij}](\bz)| Tr[\nabla^2 \mathcal{I}[\mathbf{1}](\bz)] + |\nabla_l Tr[\nabla^2 \mathcal{I}[f_{ij}](\bz)]| \mathcal{I}[\mathbf{1}](\bz)}{(\mathcal{I}[\mathbf{1}](\bz))^2} \,\\
    &+  \frac{\mathcal{I}[f_{ij}](\bz) |\nabla_l Tr[ \nabla^2 \mathcal{I}[\mathbf{1}](\bz)]| + Tr[ \nabla^2 \mathcal{I}[f_{ij}](\bz)] |\nabla_l \mathcal{I}[\mathbf{1}](\bz)| }{(\mathcal{I}[\mathbf{1}](\bz))^2} + \frac{\mathcal{I}[f_{ij}](\bz) |\nabla_l \mathcal{I}[\mathbf{1}](\bz)| Tr[\nabla^2 \mathcal{I}[\mathbf{1}](\bz) ]}{(\mathcal{I}[\mathbf{1}](\bz))^3}\Big) + O(\frac{1}{k^2}).
\end{align*}
The finite-$T$ error is bounded by
\begin{align*}
    &|\nabla_l M^{(T, k)}_{ij}(\bz) - \nabla_l M^{(k)}_{ij}(\bz)| 
    % \leq k|\frac{1}{T} \int_{t=0}^T \frac{\partial f_{ij}(\bz)}{\partial z_l} dt - \int \frac{\partial f_ij(\bz)}{\partial z_l} p_k(\br, \bz) d\br| \,\\
    % &+ \beta k^2|\frac{1}{T} \int_{t=0}^T f_{ij}(\bz)(z_l - x_l(\br(t))) dt - \int f_{ij}(\bz)(z_l - x_l(\br)) p_k(\br, \bz) d\br| \,\\
    % &+ \beta k^2|\frac{1}{T} \int_{t=0}^T f_{ij}(\bz)dt \frac{1}{T} \int_{t=0}^T (z_l - x_l(\br(t))) dt - \int f_{ij}(\bz) p_k(\br, \bz) d\br \int (z_l - x_l(\br)) p_k(\br, \bz) d\br|\,\\
    \leq |\mathcal{A}^{(T, k)}[\frac{\partial f_{ij}(\bz)}{\partial z_l}](\bz) - \mathcal{A}^{(k)}[\frac{\partial f_{ij}(\bz)}{\partial z_l}](\bz)| \,\\
    &+ \beta k |\mathcal{A}^{(T, k)}[f_{ij}(\bz)(z_l-x_l(\br))](\bz) - \mathcal{A}^{(k)}[f_{ij}(\bz)(z_l-x_l(\br))](\bz)| \,\\
    &+ \beta k |\mathcal{A}^{(T, k)}[f_{ij}(\bz)](\bz)\mathcal{A}^{(T, k)}[(z_l - x_l(\br))](\bz) - \mathcal{A}^{(k)}[f_{ij}](\bz)\mathcal{A}^{(k)}[(z_l - x_l(\br))](\bz)| \,\\
    \to \,\, & \sqrt{\frac{\tau[\frac{\partial f_{ij}(\bz)}{\partial z_l}](\bz)}{T}} \xi[\frac{\partial f_{ij}(\bz)}{\partial z_l}](\bz) + \beta k \sqrt{\frac{\tau[f_{ij}(\bz)(z_l-x_l(\br))](\bz)}{T}} \xi[f_{ij}(\bz)(z_l-x_l(\br))](\bz)  \,\\
    &+ \beta k \sqrt{\frac{\tau[f_{ij}(\bz)](\bz)}{T}}\mathcal{A}^{(k)}[(z_l - x_l(\br))](\bz) \xi[f_{ij}(\bz)](\bz)  \,\\
    &+ \beta k\sqrt{\frac{\tau[(z_l - x_l(\br))](\bz)}{T}} \mathcal{A}^{(k)}[f_{ij}](\bz) \xi[(z_l - x_l(\br))](\bz)  + O(\frac{1}{T}).
\end{align*}





\subsection{Error of $\nabla^2 U(\bz)$}
$\nabla^2_{ij} U(\bz)$ can be written as
\begin{align*}
    \nabla^2_{ij} U(\bz) = \nabla_j \left( -\beta^{-1} \frac{\nabla_i \mathcal{I}[\mathbf{1}](\bz) }{\mathcal{I}[\mathbf{1}](\bz)} \right) = -\beta^{-1} \frac{\nabla^2_{ij}\mathcal{I}[\mathbf{1}](\bz) \mathcal{I}[\mathbf{1}](\bz) - \nabla_i \mathcal{I}[\mathbf{1}](\bz) \nabla_j \mathcal{I}[\mathbf{1}](\bz)}{(\mathcal{I}[\mathbf{1}](\bz))^2}.
\end{align*}
Therefore, the finite-$k$ error is bounded by
\begin{align*}
    &|\nabla^2_{ij} U(\bz) - \nabla^2_{ij} U^{(k)}(\bz)| \,\\
    &= \beta^{-1} | \frac{\nabla^2_{ij}\mathcal{I}[\mathbf{1}](\bz) \mathcal{I}[\mathbf{1}](\bz) - \nabla_i \mathcal{I}[\mathbf{1}](\bz) \nabla_j \mathcal{I}[\mathbf{1}](\bz)}{(\mathcal{I}[\mathbf{1}](\bz))^2} - \frac{\nabla^2_{ij}\mathcal{I}^{(k)}[\mathbf{1}](\bz) \mathcal{I}^{(k)}[\mathbf{1}](\bz) - \nabla_i \mathcal{I}^{(k)}[\mathbf{1}](\bz) \nabla_j \mathcal{I}^{(k)}[\mathbf{1}](\bz)}{(\mathcal{I}^{(k)}[\mathbf{1}](\bz))^2}| \,\\
    &\leq \beta^{-1} \left( |\frac{\nabla^2_{ij} \mathcal{I}[\mathbf{1}](\bz)}{\mathcal{I}[\mathbf{1}](\bz)} - \frac{\nabla^2_{ij} \mathcal{I}^{(k)}[\mathbf{1}](\bz)}{\mathcal{I}^{(k)}[\mathbf{1}](\bz)}| + |\frac{\nabla_i \mathcal{I}[\mathbf{1}](\bz) \nabla_j \mathcal{I}[\mathbf{1}](\bz)}{(\mathcal{I}[\mathbf{1}](\bz))^2} - \frac{\nabla_i \mathcal{I}^{(k)}[\mathbf{1}](\bz) \nabla_j \mathcal{I}^{(k)}[\mathbf{1}](\bz)}{(\mathcal{I}^{(k)}[\mathbf{1}](\bz))^2}| \right) \,\\
    &\leq \frac{1}{2\beta^2 k} \Big( \frac{|\nabla^2_{ij} \mathcal{I}[\mathbf{1}](\bz)| Tr[\nabla^2 \mathcal{I}[\mathbf{1}](\bz)] + |\nabla^2_{ij} Tr[\nabla^2 \mathcal{I}[\mathbf{1}](\bz)]|\mathcal{I}[\mathbf{1}](\bz)}{(\mathcal{I}[\mathbf{1}](\bz))^2} \,\\
    &+ \frac{|\nabla_i \mathcal{I}[\mathbf{1}](\bz) \nabla_j Tr[\nabla^2 \mathcal{I}[\mathbf{1}](\bz)] | + |\nabla_i Tr[\nabla^2 \mathcal{I}[\mathbf{1}](\bz)] \nabla_j \mathcal{I}[\mathbf{1}](\bz)|}{(\mathcal{I}[\mathbf{1}](\bz))^2} + \frac{|\nabla_i \mathcal{I}[\mathbf{1}](\bz) \nabla_j \mathcal{I}[\mathbf{1}](\bz)| Tr[\nabla^2 \mathcal{I}[\mathbf{1}](\bz)] }{(\mathcal{I}[\mathbf{1}](\bz))^3}\Big) + O(\frac{1}{k^2}).
\end{align*}
The finite-$T$ error is bounded by
\begin{align*}
    &|\nabla^2_{ij} U^{(T, k)}(\bz) - \nabla^2_{ij} U^{(k)}(\bz)| 
    % \leq k|\frac{1}{T} \int_{t=0}^T \frac{\partial (z_j - x_j(\br(t)))}{\partial z_i} dt - \int \frac{\partial (z_j - x_j(\br))}{\partial z_i} p_k(\br, \bz) d\br| \,\\
    % &+ \beta k^2|\frac{1}{T} \int_{t=0}^T (z_j - x_j(\br(t)))(z_i - x_i(\br(t))) dt - \int (z_j - x_j(\br))(z_i - x_i(\br)) p_k(\br, \bz) d\br| \,\\
    % &+ \beta k^2|\frac{1}{T} \int_{t=0}^T (z_j - x_j(\br(t)))dt \frac{1}{T} \int_{t=0}^T (z_i - x_i(\br(t))) dt - \int (z_j - x_j(\br)) p_k(\br, \bz) d\br \int (z_i - x_i(\br)) p_k(\br, \bz) d\br|\,\\
    \leq k |\mathcal{A}^{(T, k)}[\frac{\partial(z_j - x_j(\br))}{\partial z_i}](\bz) - \mathcal{A}^{(k)}[\frac{\partial(z_j - x_j(\br))}{\partial z_i}](\bz)| \,\\
    &+ \beta k^2 |\mathcal{A}^{(T, k)}[(z_j - x_j(\br))(z_i-x_i(\br))](\bz) - \mathcal{A}^{(k)}[(z_j - x_j(\br))(z_i-x_i(\br))](\bz)| \,\\
    &+ \beta k^2 |\mathcal{A}^{(T, k)}[(z_j - x_j(\br))](\bz)\mathcal{A}^{(T, k)}[(z_i - x_i(\br))](\bz) - \mathcal{A}^{(k)}[(z_j - x_j(\br))](\bz)\mathcal{A}^{(k)}[(z_i - x_i(\br))](\bz)| \,\\
    \to \,\, & k \sqrt{\frac{\tau[\frac{\partial(z_j - x_j(\br))}{\partial z_i}](\bz)}{T}} \xi[\frac{\partial(z_j - x_j(\br))}{\partial z_i}](\bz) + \beta k^2 \sqrt{\frac{\tau[(z_j - x_j(\br))(z_i-x_i(\br))](\bz)}{T}} \xi[(z_j - x_j(\br))(z_i-x_i(\br))](\bz)  \,\\
    &+ \beta k^2 \sqrt{\frac{\tau[(z_i - x_i(\br))](\bz)}{T}}\mathcal{A}^{(k)}[(z_j - x_j(\br))](\bz) \xi[(z_i - x_i(\br))](\bz)  \,\\
    &+ \beta k^2\sqrt{\frac{\tau[(z_j - x_j(\br))](\bz)}{T}} \mathcal{A}^{(k)}[(z_i - x_i(\br))](\bz) \xi[(z_j - x_j(\br))](\bz)  + O(\frac{1}{T}).
\end{align*}










\section{Convergence rate of Hessian-vector product estimator}\label{appendix:hession}
Here we discuss the convergence rate of the Hessian-vector product estimator \[\frac{\nabla U(\bz+\delta v) - \nabla U(\bz - \delta v)}{2 \delta},\] compared to the direct estimator $\nabla^2 U v$. Here $v = \frac{M^T M \nabla U}{\|M \nabla U\|}$. By Theorem~\ref{thm:rate}, we know that
\begin{itemize}
  \item For any $\bz$, the error of estimating $\nabla U$ is bounded by $|\nabla_i U(\bz) - \nabla_i U^{(T,k)}(\bz)| \leq O(\frac{1}{k} + \frac{k}{\sqrt{T}} )$,
  \item For any $\bz$, the error of estimating $v = \frac{M^T M \nabla U}{\|M \nabla U\|}$ is bounded by $|v_i(\bz) - v^{(T,k)}_i(\bz)| \leq O(\frac{1}{k} + \frac{k}{\sqrt{T}})$,
  \item For any $\bz$, the error of estimating $\nabla^2 U$ is bounded by $|\nabla^2_{ij} U(\bz) - \nabla^2_{ij} U^{(T,k)}(\bz)| \leq O(\frac{1}{k} + \frac{k^2}{\sqrt{T}} )$.
\end{itemize}
First we consider the error of directly estimating $\nabla^2 U v$.
\begin{align*}
    &|(\nabla^2 U v)_i - (\nabla^2 U^{(T,k)} v^{(T,k)})_i| = |\sum_j \big( \nabla^2_{ij} U(\bz) v_j(\bz) - \nabla^2_{ij} U^{(T,k)}(\bz) v^{(T,k)}_j(\bz) \big)| \,\\
    &\leq \sum_j |\nabla^2_{ij} U(\bz) v_j(\bz) - \nabla^2_{ij} U^{(T,k)}(\bz) v^{(T,k)}_j(\bz)| \,\\
    & \leq \sum_j \left( |\nabla^2_{ij} U(\bz) - \nabla^2_{ij} U^{(T,k)}(\bz)| |v_j(\bz)| + |v_j(\bz) - v^{(T,k)}_j(\bz)| |\nabla^2_{ij} U(\bz)| + |\nabla^2_{ij} U(\bz) - \nabla^2_{ij} U^{(T,k)}(\bz)| |v_j(\bz) - v^{(T,k)}_j(\bz)| \right) \,\\
    &\leq O(\frac{1}{k} + \frac{k^2}{\sqrt{T}}).
\end{align*}
So the convergence rate of the error is $O(\frac{1}{k} + \frac{k^2}{\sqrt{T}})$, due to the contribution of $|\nabla^2_{ij} U(\bz) - \nabla^2_{ij} U^{(T,k)}(\bz)|$.


Now let's consider the Hessian-vector product estimator. The error can be decomposed into three terms as follows
\begin{align*}
    &|\frac{\nabla_i U^{(T,k)}(\bz+\delta v^{(T,k)}) - \nabla_i U^{(T,k)}(\bz-\delta v^{(T,k)})}{2 \delta} - (\nabla^2 U v)_i| \,\\
    & \leq |\frac{\nabla_i U^{(T,k)}(\bz+\delta v^{(T,k)}) - \nabla_i U^{(T,k)}(\bz-\delta v^{(T,k)})}{2 \delta} - \frac{\nabla_i U(\bz+\delta v^{(T,k)}) - \nabla_i U(\bz-\delta v^{(T,k)})}{2 \delta}| \,\\
    &+ |\frac{\nabla_i U(\bz+\delta v^{(T,k)}) - \nabla_i U(\bz-\delta v^{(T,k)})}{2 \delta} - (\nabla^2 U v^{(T,k)})_i| \,\\
    &+ |(\nabla^2 U v^{(T,k)})_i - (\nabla^2 U v)_i|.
\end{align*}
The first term comes from the error of estimating $\nabla U$, which can be upper bounded by
\begin{align*}
    &|\frac{\nabla_i U^{(T,k)}(\bz+\delta v^{(T,k)}) - \nabla_i U^{(T,k)}(\bz-\delta v^{(T,k)})}{2 \delta} - \frac{\nabla_i U(\bz+\delta v^{(T,k)}) - \nabla_i U(\bz-\delta v^{(T,k)})}{2 \delta}| \,\\
    &\leq \frac{1}{2\delta} \left(|\nabla_i U^{(T,k)}(\bz+\delta v^{(T,k)}) - \nabla_i U(\bz+\delta v^{(T,k)}) | + |\nabla_i U^{(T,k)}(\bz-\delta v^{(T,k)}) - \nabla_i U(\bz-\delta v^{(T,k)})| \right) \,\\
    & \leq O(\frac{1}{\delta} (\frac{1}{k} + \frac{k}{\sqrt{T}})).
\end{align*}
The second term comes from finite difference estimate of $\nabla^2 U$. Using Taylor expansion,
\begin{align*}
    &\nabla_i U(\bz+\delta v^{(T,k)}) - \nabla_i U(\bz-\delta v^{(T,k)}) \,\\
    &= \left( \nabla_i U(\bz) + \delta (v^{(T,k)})^T \nabla \nabla_i U(\bz) + \frac{\delta^2}{2} (v^{(T,k)})^T \nabla^2 \nabla_i U(\bz) v^{(T,k)} \right) \,\\
    &- \left( \nabla_i U(\bz) - \delta (v^{(T,k)})^T \nabla \nabla_i U(\bz) + \frac{\delta^2}{2} (v^{(T,k)})^T \nabla^2 \nabla_i U(\bz) v^{(T,k)} \right) + O(\delta^3) \,\\
    &= 2 \delta (\nabla^2 U v^{(T,k)})_i + O(\delta^3).
\end{align*}
Therefore, the error is bounded by
\begin{align*}
    &|\frac{\nabla_i U(\bz+\delta v^{(T,k)}) - \nabla_i U(\bz-\delta v^{(T,k)})}{2 \delta} - (\nabla^2 U v^{(T,k)})_i|
    % & = |\frac{\left(\nabla U_i(\bz) + \delta v^{(T,k)} \nabla \nabla_i U(\bz) + \frac{\delta^2}{2} v^{(T,k)}^T \nabla^2 \nabla_i U(\bz) v^{(T,k)} + O(\delta^3) \right) - \left(\nabla U_i(\bz) - \delta v^{(T,k)} \nabla \nabla_i U(\bz) + \frac{\delta^2}{2} v^{(T,k)}^T \nabla^2 \nabla_i U(\bz) v^{(T,k)} + O(\delta^3) \right)}{2 \delta} - (\nabla^2 U v^{(T,k)})_i| \,\\
    = O(\delta^2).
\end{align*}
The third term comes from the error of estimating $v$, which can be upper bounded by
\begin{align*}
    |(\nabla^2 U v^{(T,k)})_i - (\nabla^2 U v)_i| \leq \sum_j |\nabla^2_{ij} U(\bz)| |v_j(\bz) - v^{(T,k)}_j(\bz)| \leq O(\frac{1}{k} + \frac{k}{\sqrt{T}}).
\end{align*}
The combined convergence rate of the total estimation error is bounded by
\begin{align*}
    &|\frac{\nabla_i U^{(T,k)}(\bz+\delta v^{(T,k)}) - \nabla_i U^{(T,k)}(\bz-\delta v^{(T,k)})}{2 \delta} - (\nabla^2 U v)_i| \,\\
    &\leq O(\frac{1}{\delta}(\frac{1}{k} + \frac{k}{\sqrt{T}})) + O(\delta^2) + O(\frac{1}{k} + \frac{k}{\sqrt{T}}).
\end{align*}

The upper bound contains two terms that depend on $\delta$, namely $O(\frac{1}{\delta}(\frac{1}{k} + \frac{k}{\sqrt{T}}))$ and $O(\delta^2)$. The optimal $\delta$ is $O((\frac{1}{k} + \frac{k}{\sqrt{T}})^{1/3})$, which gives an error rate of $O((\frac{1}{k} + \frac{k}{\sqrt{T}})^{2/3})$. The third term is never the leading term.

\textit{Remark:} The benefit of the Hessian-vector product estimator depends on how fast $T$ grows as $k$ grows. Let $\alpha = \lim_{k \to \infty} \frac{\log{T}}{\log{k}}$.
\begin{itemize}
    \item If $\alpha \leq 2$, neither direct estimator nor Hessian-vector product estimator converges.
    \item If $2 < \alpha \leq 4$, direct estimator does not converge, but Hessian-vector product estimator converges at a rate of $O(k^{-(\alpha-2)/3})$.
    \item If $4 < \alpha \leq 6$, both estimators converge. The direct estimator converges at a rate of $O(k^{-(\alpha-4)/2})$ and the Hessian-vector product estimator converges at a rate of $O(k^{-2/3})$. If $\alpha \leq 16/3$, the Hessian-vector product estimator converges faster and if $\alpha > 16/3$, the direct estimator converges faster.
    \item If $\alpha > 6$, both estimators converge. The direct estimator converges at a rate of $O(k^{-1})$ and the Hessian-vector product estimator converges at a rate of $O(k^{-2/3})$. The direct estimator converges faster.
\end{itemize}
The relation between the convergence rate and $\alpha$ is shown in Figure~\ref{fig:rate}.

\begin{figure}[!t]
    \centering
    \includegraphics[width=0.5\linewidth]{figure/rate.png}
    \caption{Relation between the convergence rate $\beta = -\lim_{k \to \infty} \frac{\log |\nabla^2 U^{(T, k)} v^{(T, k)} - \nabla^2 U v|}{\log k}$ and the rate $\alpha = \lim_{k \to \infty} \frac{\log T}{\log k}$.}
    \label{fig:rate}
\end{figure}

\section{M\"{u}ller Potential Parameters}\label{appendix:muller}
This section provides the detailed parameters in Eq.~\eqref{muller_potential}.
\begin{align*}
    &A= (-200,-100,-170,15),~~a=(-1,-1,-6.5,0.7),\\
    & b=(0,0,11,0.6),~~c=(-10,-10,-6.5,0.7),\\
    &x^0=(1,0,-0.5,-1),~~y^0=(0,0.5,1.5,1).
\end{align*}

\end{document}