\documentclass[accepted]{uai2022}


\usepackage[round]{natbib}
\bibliographystyle{plainnat}
\renewcommand{\bibsection}{\subsubsection*{References}}
    
\usepackage[utf8]{inputenc}
\usepackage[T1]{fontenc}
\usepackage{url,booktabs,nicefrac,microtype}

\usepackage{xcolor}
\definecolor{linkblue}{rgb}{0.1,0.4,0.7} 

\usepackage{multicol}
\usepackage{float}
\usepackage{cancel}
\usepackage{physics}
\usepackage{amsfonts,amsmath,amsthm,amssymb}
\usepackage{mathtools}
\usepackage{wrapfig}
\usepackage{pifont}
\usepackage{tikz}
\usetikzlibrary{bayesnet}
\usetikzlibrary{arrows}

\newcommand{\cmark}{ \textcolor{green!60!black}{\ding{51}} }
\newcommand{\xmark}{ \textcolor{red!60!black}{\ding{55}} }
\usepackage{wrapfig}

\usepackage{colortbl}
\usepackage{tabularx}
\usepackage{graphbox}
\usepackage{comment}
\usepackage{multirow}
\usepackage{booktabs}
\usepackage{algorithm, algpseudocode, algcompatible}

\usepackage{caption}
\usepackage{subcaption}
\input{math.tex}

% Title
\title{Variational multiple shooting for Bayesian ODEs with Gaussian processes}


% Add authors
\author[1]{\href{mailto:<pashupati.hegde@aalto.fi>?Subject=Your UAI 2022 paper}{Pashupati~Hegde}{}}
\author[1]{\c{C}a\u{g}atay~Y{\i}ld{\i}z}
\author[1]{Harri~L{\"a}hdesm{\"a}ki}
\author[1]{Samuel~Kaski}
\author[1]{Markus~Heinonen}

% Add affiliations after the authors
\affil[1]{
    Department of Computer Science\\
    Aalto University\\
    Finland
}

\begin{document}
\maketitle

\begin{abstract}
Recent machine learning advances have proposed black-box estimation of \textit{unknown continuous-time system dynamics} directly from data. However, earlier works are based on approximative solutions or point estimates. We propose a novel Bayesian nonparametric model that uses Gaussian processes to infer posteriors of unknown ODE systems directly from data. We derive sparse variational inference with decoupled functional sampling to represent vector field posteriors. We also introduce a probabilistic shooting augmentation to enable efficient inference from arbitrarily long trajectories. The method demonstrates the benefit of computing vector field posteriors, with predictive uncertainty scores outperforming alternative methods on multiple ODE learning tasks.
\end{abstract}

\section{Introduction} \label{section:intro}

Ordinary differential equations (ODEs) are powerful models for continuous-time non-stochastic systems, which are ubiquitous from physical and life sciences to engineering \citep{hirsch2012differential}. In this work, we consider non-linear ODE systems
\begin{align}
    \x(t) &= \x_0 + \int_0^t \f( \x(\tau)) d\tau \label{eq:odeproblemtraj} \\
    \dot{\x}(t) &:= \frac{d\x(t)}{dt} = \f(\x(t)), \label{eq:odeproblem}
\end{align}
where the state vector $\x(t) \in \R^D$ evolves over time $t \in \R_+$ from an initial state $\x_0$ following its time derivative $\dot{\x}(t)$, and $\tau$ is an auxiliary time variable. Our goal is to learn the differential function $\f \colon \R^D \to \R^D$ from state observations, when the functional form of $\f$ is unknown.


\begin{figure*}[t]
\centering
\includegraphics[width=0.8\textwidth]{plots/illustration_model.pdf}
\caption{Illustration of GPODE: The model learns a GP posterior (a) of a vector field. Valid ODE trajectories are sampled from the posterior process as shown in (b) and (c).}
\label{fig:gpode_illustration}
\end{figure*}

The conventional mechanistic approach involves manually defining the equations of dynamics and optimizing their parameters \citep{butcher2008numerical}, or inferring their posteriors \citep{girolami2008bayesian} from data. However, the equations are unknown or ambiguous for many systems, such as human motion \citep{4359316}. Some early works explored fitting unknown ODEs with splines \citep{henderson2014network}, Gaussian processes \citep{aijo2009learning} or kernel methods \citep{heinonen2014learning} by resorting to less accurate gradient matching approximations \citep{varah1982spline}. Recently, \citet{heinonen2018learning} proposed estimation of free-form non-linear dynamics using Gaussian processes without gradient matching. However, the approach is restricted to learning point estimates of the dynamics, limiting the uncertainty characterization and generalization.  \citet{chen2018neural} proposed modeling ODEs with neural networks and adjoints, which was later extended to the Bayesian setting by \citet{dandekar2020bayesian}. However, the gradient descent training in such approaches can be ill-suited for complex or long-horizon ODEs with typically highly non-linear integration maps \citep{diehl2017}.


In this work, we introduce efficient Bayesian learning of unknown, non-linear ODEs. Our contributions are:
\begin{itemize}
    \item We introduce a way of learning posteriors of vector fields using Gaussian processes as flexible priors over differentials $\f$, and thereby build on the work by \citet{heinonen2018learning}. We adapt decoupled functional sampling to simulate ODEs from vector field posteriors.
    \item For the difficult problem of gradient optimizations of ODEs, we introduce a novel probabilistic shooting method. It is motivated by the canonical shooting methods from optimal control and makes inference stable and efficient on long trajectories.
    \item We empirically show the effectiveness of the proposed method even while learning from a limited number of observations. We  demonstrate the ability to infer arbitrarily long trajectories efficiently with the shooting extension.
\end{itemize}


\section{Related Works}
\paragraph{Mechanistic ODE models.} In mechanistic modelling the equation $\f_\bt$ is predefined with a set of coefficients $\bt$ to be fitted \citep{butcher2008numerical}. Several works have proposed embedding mechanistic models within Bayesian or Gaussian process models \citep{calderhead2008,dondelinger2013ode,wenk2020odin}. Recently both Julia and Stan have introduced support for Bayesian analysis of parametric ODEs \citep{rackauckas2017differentialequations,stan}. Since this line of work assumes a known dynamics model, we do not consider these methods in the experiments. 

\paragraph{Free-form ODE models.} Multiple works have proposed fitting unknown, non-linear and free-form ODE differentials with gradient matching using splines \citep{ramsay2007parameter}, Gaussian processes \citep{aijo2009learning} or kernel methods \citep{heinonen2014learning}. Recently, \citet{heinonen2018learning} proposed accurate \textit{maximum a posteriori} (MAP) optimisation of vector fields with sensitivity equation gradients \citep{kokotovic1967direct}. Neural ODEs \citep{chen2018neural} introduced adjoint gradients \citep{pontryagin1962mathematical} along with flexible black-box neural network vector fields. Several extensions to learning latent ODEs have been proposed \citep{yildiz2019ode2vae,rubanova2019}. 

\paragraph{Discrete-time state-space models.} There is a large literature on Markovian state-space models that operate over discrete time increments \citep{wang2005,turner2010state, frigola2014variational}. Typically nonlinear state transition functions are modeled with Gaussian processes and applied to latent state estimation or system identification problems with dynamical systems \citep{eleftheriadis2017identification, doerr2018probabilistic, ialongo2019overcoming}. In this paper, we focus strictly on continuous-time models and leave the study of discrete vs. continuous formulations for future work.

\paragraph{Stochastic differential equations.} As an alternative formulation of inferring unknown dynamics from observational data, one can assume stochastic transitions and learn models of stochastic differential equations (SDEs). Existing works have utilized Gaussian processes \citep{archambeau2007gaussian, duncker2019learning, jorgensen2020stochastic} and neural networks \citep{tzen2019neural, li2020scalable} to model non-linear SDEs. However, since they assume a different model (i.e. deterministic transitions vs stochastic transitions), we will restrict the experimental comparisons to other ODE-based approaches.

\section{Methods} \label{section:methods}

We consider the problem of learning ODEs \eqref{eq:odeproblem} with GPs and propose a Bayesian model to infer posteriors over the differential $\f(\cdot)$. 

\subsection{Bayesian modeling of ODEs using GPs}
We assume a sequence of $N$ observations $\Y = (\y_1, \y_2, \ldots \y_N)^T \in \R^{N \cross D}$ along a trajectory, with $\y_i \in \R^D$ representing the noisy observation of the unknown state $\x(t_i) \in \R^D$ at time $t_i$. Similar to \cite{heinonen2018learning}, we assume a zero mean vector-valued Gaussian process prior over $\f$,
\begin{align}
    \f(\x) &\sim \GP(\0, K(\x, \x')),
\end{align}
which defines a distribution of differentials $\f(\x)$ with covariance $\cov[\f(\x), \f(\x')] = K(\x, \x')$, where $K(\x, \x') \in \R^{D \cross D}$ is a stationary matrix-valued kernel. We follow the commonly used sparse inference framework for GPs using inducing variables \citep{titsias2009variational}, and augment the full model with inducing values $\mathbf{U} = (\mathbf{u}_1, \ldots ,\mathbf{u}_M)^T \in \mathbb{R}^{M \times D}$ and inducing locations $\mathbf{Z} = (\mathbf{z}_1, \ldots, \mathbf{z}_M )^T \in \mathbb{R}^{M \times D}$ such that $\mathbf{u}_m = \f(\mathbf{z}_m)$. The inducing variables are trainable `landmark' state-differential pairs, from which the rest of the differential field is interpolated (See Figure \ref{fig:gpode_illustration}, where arrow locations are the $\mathbf{z}_m$ and arrow end-points are the $\mathbf{u}_m$). The inducing augmentation leads to the following prior and conditionals \citep{hensman2013gaussian}:
\begin{align}
    p(\U) &= \N(\U | \0, \K_{\Z\Z}), \\
    p(\f | \U; \Z) &= \N(\f | \A \mathrm{vec}(\U), \K_{\X \X} - \A \K_{\Z \Z} \A^T),
\end{align}
where $\X = (\x_1, \x_2, \ldots \x_{N'})^T \in \R^{N' \times D}$ collects all the intermediate state evaluations $\x(t_i)$ encountered along a numerical approximation of the true continuous ODE integral \eqref{eq:odeproblemtraj}, $\f = (\f(\x_1)^T, \ldots, \f(\x_{N'})^T)^T \in \mathbb{R}^{N'D \times 1}$, $\K_{\X\X}$ is a block-partitioned matrix of size $N'D \times N'D$ with $D \times D$ blocks, so that block $(\K_{\X\X})_{i,j} = K(\x_i, \x_j)$, and $\A = \K_{\X\Z} \K^{-1}_{\Z\Z}$. For notational simplicity, we assume that  the measurement time points are among the time points of the intermediate state evaluations of a numerical ODE solver.

The joint probability distribution follows
\begin{align}
    p(\Y, \f, \U, \x_0) &= p(\Y|\f,\x_0)p(\f,\U)p(\x_0) \\
    &\hspace{-10mm} = \prod\limits_{i=1}^{N}p(\y_i|\f, \x_0) p(\f|\U) p(\U) p(\x_0),
\end{align}
where the conditional distribution $p(\y_i| \f, \x_0) = p(\y_i|\x_i)$ computes the likelihood over ODE state solutions $\x_i = \x_0 + \int_0^{t_i} \f( \x(\tau)) d\tau$. 


\subsection{Variational inference for GP-ODEs}
In contrast to the earlier approach that estimates MAP solutions \citep{heinonen2018learning}, our goal is to infer the posterior distribution $p(\f,\x_0 | \Y)$ of the vector field $\f$ and initial value $\x_0$ from observations $\Y$. The posterior is intractable due to the non-linear integration map $\x_0 \overset{\f}{\mapsto} \x(t)$. 

We use the stochastic variational inference (SVI) formulation for sparse GPs \citep{hensman2013gaussian} in this work. We introduce a factorized Gaussian posterior approximation  for the inducing variables across state dimensions $q(\U) = \prod_{d=1}^{D}\N(\bu_d|\m_d, \Q_d), \bu_d \in \R^M$ where $\m_d \in \R^M, \Q_d \in \R^{M\times M}$ are the mean and covariance parameters of the variational Gaussian posterior approximation for the inducing variables. We treat the inducing locations $\Z$ as optimized hyperparameters. The posterior distribution for the variational approximation can be written as
\begin{align}
    q(\f) &= \int p(\f|\U) q(\U) d\U \\
     &\hspace{-8mm}= \int \N\left(\f | \A \mathrm{vec}(\U), \K_{\X\X} - \A \K_{\Z\Z}\A^T \right) q(\U) d\U. \label{eq:inducing_posterior_gp}
     \raisetag{2\normalbaselineskip}
\end{align}

The posterior inference goal then translates to estimating the posterior $p(\f, \U, \x_0 | \Y)$ of the inducing points $\U$ and initial state $\x_0$. Under variational inference this learning objective
\begin{align}
    \argmin_{q} \: \KL\big[ \, q(\f,\U,\x_0) \, || \, p(\f,\U,\x_0|\Y) \, \big]
\end{align}
translates into maximizing the evidence lower bound (ELBO), 
\begin{align}
    \log p(\Y) &\ge \sum_{i=1}^N \overbrace{\E_{q(\f, \x_0)} \log p(\y_i | \f, \x_0)}^{\text{variational likelihood}}  - \overbrace{\KL[ q(\U) || p(\U)]}^\text{inducing KL} \notag \\
    &\quad - \underbrace{\KL[ q(\x_0) || p(\x_0)]}_\text{initial state KL},
\end{align}
where we also assume a variational approximation $q(\x_0) = \N(\a_0, \bS_0)$ for the initial state $\x_0$. See supplementary section 1.1 for detailed derivations of the above equations.

\begin{figure*}[!h]
    \centering
    \begin{subfigure}[b]{0.9\columnwidth}
    \centering
    \includegraphics[width=1.0\textwidth]{plots/illustration_full.pdf}
    \caption{The full model formulation}
    \end{subfigure}
    \qquad
    \begin{subfigure}[b]{0.9\columnwidth}
    \centering
    \includegraphics[width=1.0\textwidth]{plots/illustration_shooting.pdf}
    \caption{Shooting augmentation}
    \end{subfigure}
    \caption{Illustrations of GPODE formulations: the full model formulation (a) follows the long trajectory integration, whereas the shooting version (b) splits the long trajectory into multiple short subintervals.}
    \label{fig:shooting_illustration}
\end{figure*}

\subsection{Sampling ODEs from Gaussian processes}

The Picard-Lindel\"{o}f theorem \citep{lindelof} ensures valid ODE systems define unique solutions to the initial value problem (IVP) \eqref{eq:odeproblemtraj}. In order to sample valid state trajectories for the IVP, we need to efficiently sample GP functions $\f(\cdot) \sim q(\f)$ \eqref{eq:inducing_posterior_gp}. This way, we can evaluate the sample function $\f(\x(t))$ at arbitrary states $\x(t)$ encountered during ODE forward integration, while accounting for both the inducing and interpolation distributions of Equation \eqref{eq:inducing_posterior_gp}.  Unfortunately, function-space sampling of such GPs has prohibitive cubic complexity \citep{rasmussen2006gaussian,pmlr-v108-ustyuzhaninov20a}, while the more efficient weight-space sampling with random Fourier features cannot accurately express the posterior \eqref{eq:inducing_posterior_gp} \citep{wilson2020efficiently}. 

We use the decoupled sampling that decomposes the posterior into two parts \citep{wilson2020efficiently},
\begin{align}
\label{eq:decoupled_conditional}
    \overbrace{\f(\x)|\U}^\text{posterior} &= \overbrace{\f(\x)}^\text{prior} + \overbrace{K(\x,\Z)K(\Z,\Z)^{-1}(\U - \f_{\Z})}^\text{update} \\
    &\approx \sum_{i=1}^{F} \w_i \bphi_i(\x) + \sum_{j=1}^{M} \bnu_j K(\x, \z_j),
\end{align}
where we use $F$ Fourier bases $\bphi_i(\cdot)$ with $\w_i \sim \N(\0,I)$ \citep{rahimi2007random} to represent the stationary prior, and function basis $K(\cdot,\z_j)$ for the posterior update with $\bnu = K(\Z,\Z)^{-1}(\U - \bPhi \W)$, $\bPhi = \bphi(\Z) \in \R^{M \times F}, \W \in \R^{F \times D}$. By combining these two steps, we can accurately evaluate functions from the posterior \eqref{eq:inducing_posterior_gp} in linear time at arbitrary locations. We refer the reader to the supplementary section 1.2 for more details.  We note that concurrent works by \citet{mikheeva2021aligned} and \citet{ensinger2021symplectic} also utilize the decoupled-sampling to infer ODE posteriors with GPs. 

\begin{figure*}[t]
    \centering
    \includegraphics[width=0.8\textwidth]{plots/vdp_uniform.pdf}
    \caption{Learning the 2D Van der Pol dynamics \textbf{(a)} with alternative methods \textbf{(b-d)}. Column 1 shows the vector fields while columns 2 and 3 show the state trajectories $x_1(t)$ and $x_2(t)$. GPODE learns the posterior accurately.}
    \label{fig:vdp_illustration}
\end{figure*}

\subsection{Augmenting the ODE model with shooting system}
A key bottleneck in ODE modeling is the poor gradient descent performance over long integration times $\x_{0:T}$, which can exhibit vanishing or exploding gradients \citep{haber2017,choromanski2020}. Earlier approaches tackled this issue mainly with more accurate numerical solvers \citep{zhuang2020,zhuang2021mali}. The nonlinearity of the integration map $\x_0 \overset{\f}{\mapsto} \x_t$ motivates us to instead segment the full integration $\x_{0:T}$ into short segments, which are easier to optimize and can be trivially parallelized. This is called the \emph{multiple shooting} method in optimal control literature \citep{osborne1969,bock1984}, in the context of parameter estimation of ODEs \citep{vandomselaar1975nonlinear,bock1983recent}. Recently, \citet{massaroli2021differentiable, turan2021multiple} also introduced a multiple-shooting framework within the context of deterministic neural ODEs. We introduce probabilistic shooting for the Gaussian process posterior inference of ODEs.

We begin by introducing shooting state variables $\mathbf{S} = (\s_0,\s_1,\ldots,\s_{N-1})$, $\s_i \in \R^D$, and segment the continuous state function $\x(t;\x_0)$ \eqref{eq:odeproblemtraj} into $N$ segments $\{(\s_{i-1},\x(t_i; \s_{i-1}))\}_{i=1}^{N}$ that branch from the shooting variables $\s_{i-1}$ (See Figure \ref{fig:shooting_illustration});
\begin{align}
    \x(t_i; \s_{i-1}) &= \s_{i-1} + \int_{t_{i-1}}^{t_i} \f(\x(\tau)) d\tau  \label{eq:shooting_ivps}.
\end{align}
In addition, every shooting variable is approximately matched with the ODE state evolution from the previous shooting state,
\begin{align}
    \s_i &= \x(t_i; \s_{i-1}) + \bxi \label{eq:shooting_constraints},
\end{align}
where $\bxi \in \R^D$ represents the tolerance parameter controlling the shooting approximation. The augmented system is equivalent to the original ODE system in case the constraints $\s_i = \x(t_i; \s_{i-1})$ are satisfied exactly at the limit $\bxi \to \0$. We place a Gaussian prior over the tolerance parameter $\bxi \sim \N(\0, \sigma^2_\xi\eye)$, which translates into the following prior over shooting variables  
\begin{align}
    p(\s_i|\s_{i-1}) &= \N(\s_i|\x(t_i; \s_{i-1}), \sigma^2_\xi \eye). \label{eq:shooting_prior}
\end{align}

Further, the joint probability of the augmented model after placing a GP prior over the vector field $\f$ can be written as 
\begin{align}
p(\Y,\S,\f) &= \prod\limits_{i=1}^{N} p(\y_{i}|\s_{i-1}, \f) \prod\limits_{i=1}^{N-1}p(\s_{i}|\s_{i-1}, \f)p(\s_0) p(\f).
\end{align}

\subsection{Variational inference for the augmented model}
To infer the augmented posterior $p(\f,\U, \S| \Y)$ we introduce a variational approximation for the shooting variables $q(\S) = q(\s_0) \cdots q(\s_{N-1})$, where each distribution $q(\s_i) = \N(\s_i|\a_i,\bS_i)$ is a Gaussian. This results in the joint variational approximation
\begin{align}
    q(\S,\f,\U) &= \prod_{i=0}^{N-1} q(\s_i) p(\f|\U) q(\U),
\end{align}
and the following evidence lower bound for the shooting model,
\begin{align}
    \L_{\mathrm{shooting}}
    &= \sum_{i=1}^N \E_{q(\s_{i-1},\f)} \Big[\log p(\y_i|\s_{i-1}, \f)\Big] \nonumber\\
    &\hspace{-15mm} + \sum_{i=1}^{N-1} \E_{q(\s_{i},\s_{i-1},\f)} \Big[ \log p\left(\s_i | \s_{i-1}, \f \right) \Big] - \E_{q(\s_i)}  \Big[ \log q(\s_{i})  \Big] \nonumber\\
    &\hspace{-15mm}  - \KL[q(\s_0) \, || \, p(\s_0)] - \KL[ q(\U) \, || \, p(\U)]. \label{eq:shooting_elbo}
\end{align}

 The ELBO consists of an expected log-likelihood term, which matches the state evolution \eqref{eq:shooting_ivps} from every shooting variable to the corresponding observation. In addition, the posterior approximation for every shooting variable is also matched with the ODE evolution of the approximated posterior of the previous shooting state, leading to corresponding cross-entropy and entropy terms. 
 
The ELBO for the augmented shooting model requires solving only the short segments \eqref{eq:shooting_ivps} with simpler integration maps, which mitigates problems with vanishing/exploding gradients. Since the involved numerical ODE integration can be done in parallel, the shooting model is also computationally faster than the full model in practice. See supplementary section 1.3 for a plate diagram and detailed derivation of the approach.

\begin{table*}[t]
\centering
\caption{VDP system learning performance on extrapolation task with observations on regular (task 1) and irregular time intervals (task 2). We report mean $\pm$ standard error over 5 runs from different random initialization, the best values bolded. ($\uparrow$): higher is better, ($\downarrow$): lower is better.}
\resizebox{0.8\textwidth}{!}{
\begin{tabular}{l c c c c}
\toprule
 & \multicolumn{2}{c}{Task 1: Regular time-grid} & \multicolumn{2}{c}{Task 2: Irregular time-grid}\\
 \cmidrule(lr){2-3} 
 \cmidrule(lr){4-5} 
 & MNLL ($\downarrow$)& MSE ($\downarrow$)& MNLL ($\downarrow$) & MSE ($\downarrow$) \\
\midrule
Bayesian NeuralODE (HMC) &  $0.82 \pm 0.01$ & $1.45 \pm 0.04$ & $0.88 \pm 0.01$ & $1.68 \pm 0.04$ \\
NeuralODE & - & $0.29 \pm 0.11$ & - & $0.55 \pm 0.07$\\
npODE & $1.47 \pm 0.59 $ & $0.16 \pm 0.05 $ & $8.89 \pm 3.06 $ & $2.08 \pm 0.78 $ \\
% GP gradient matching & $1.47 \pm 0.02$ & $1.27 \pm 0.01$ & - & -\\
GPODE & $\mathbf{0.60 \pm 0.03}$ & $\mathbf{0.13 \pm 0.01}$ & $\mathbf{0.41 \pm 0.18}$ & $\mathbf{0.21 \pm 0.07}$ \\
\bottomrule
\end{tabular}
}
\label{table:vdp_illustration}
\end{table*}

\section{Experiments} \label{section:experiments}
We validate the proposed method on Van der Pol (VDP) and FitzHugh–Nagumo (FHN) systems and on the task of learning human motion dynamics (MoCap). The predictive performance of the proposed GPODE is compared against npODE \citep{heinonen2018learning}, NeuralODE \citep{chen2018neural} and a Bayesian version of NeuralODE \citep{dandekar2020bayesian}. We use 16 inducing points in VDP and FHN experiments and 100 inducing points for the MoCap experiments. Except for the NeuralODE model, we assume a Gaussian observation likelihood, and infer the unknown noise scale parameter from the training data. All the experiments use a squared exponential kernel with automatic relevance determination (ARD) along with 256 Fourier basis functions for decoupled GP sampling. Along with the variational parameters, kernel lengthscales, signal variance, noise scale, and inducing locations are jointly optimized against the model ELBO while training. In addition, for the shooting model, we fix the constraint tolerance parameter to a small value $\sigma^2_\xi = 10^{-6}$ consistently across all the experiments. In all the shooting experiments, we considered the number of shooting segments to be the same as the number of observation segments in the dataset. A codebase for implementing the proposed methods is provided at \url{https://github.com/hegdepashupati/gaussian-process-odes}. 

We use the \texttt{dopri5} solver with tolerance parameters \texttt{rtol}$=10^{-5}$ and \texttt{atol}$=10^{-5}$, and use the adjoint method for computing loss gradients with the \texttt{torchdiffeq}\footnote{\url{https://github.com/rtqichen/torchdiffeq}} package \citep{chen2018neural}. All the experiments are repeated 5 times with random initialization, and means and standard errors are reported over multiple runs. The predictive performance of different models is measured with mean squared error (MSE) and mean negative log likelihood (MNLL) metrics. 

\subsection{Learning Van der Pol dynamics}
We first illustrate the effectiveness of the proposed method by inferring the vector field posterior on a two-dimensional VDP (see Figure \ref{fig:vdp_illustration}),
\begin{align}
    \dot{x}_1 &= x_2, \\ \nonumber
    \dot{x}_2 &= -x_1 + 0.5 x_2 (1-x_1^2).
\end{align}
We simulate a trajectory of 50 states following the true system dynamics from the initial state $\left(x_1(0), x_2(0)\right) = \left(-1.5, 2.5 \right)$, and add Gaussian noise with $\sigma^2=0.05$ to generate the training data. We explore two scenarios with training time interval $t \in [0,7]$ and forecasting interval $t \in [7,14]$: (1) over a regularly sampled time grid, (2) over an irregular grid using uniform random sampling of time points. Task (2) demonstrates one of the key advantages of continuous-time models with the ability to handle irregular data. 

Figure \ref{fig:vdp_illustration}(b) shows that both GPODE and Bayesian NeuralODE learn a vector field posterior whose posterior mean closely matches the ground truth, with low variance (\textcolor{blue}{blue regions}) near the observed data. The posterior variance increases away from the observed data (\textcolor{orange}{orange regions}), indicating a good uncertainty characterization, while the npODE with MAP estimation seems to overfit. NeuralODE learns an appropriate vector field, but requires careful tuning of regularization and hyperparameters for a good fit with a limited number of observations. A quantitative evaluation of the model fits in Table \ref{table:vdp_illustration} indicates the better performance of GPODE as compared to the other methods under comparison. 


\begin{table}
\caption{Imputation results on the FHN system.}
\centering
\resizebox{0.9\columnwidth}{!}{
\begin{tabular}{lcc}
\toprule
 & MNLL ($\downarrow$) & MSE ($\downarrow$)  \\
\midrule
Bayesian NeuralODE (HMC) & $0.77 \pm 0.12$ & $0.24 \pm 0.03$ \\
NeuralODE & - & $0.18 \pm 0.00$ \\
npODE & $6.49 \pm 1.49$ & $\mathbf{0.08 \pm 0.01}$ \\
GPODE & $\mathbf{0.09 \pm 0.05}$ & $\mathbf{0.07 \pm 0.02}$ \\
\bottomrule
\end{tabular}
}
\label{table:fhn_interpolation}
\end{table}

\subsection{Learning with missing observations}
We illustrate the usefulness of learning Bayesian ODE posteriors under missing data with the FHN oscillator
\begin{align}
\dot{x}_1 &= 3(x_1 - x_1^3/3 + x_2), \\ \nonumber
\dot{x}_2 &= (0.2 - 3x_1 - 0.2 x_2)/3.   
\end{align}
We generate a training sequence by simulating 25 regularly-sampled time points from $t \in [0, 5.0]$ with added Gaussian noise with $\sigma^2 = 0.025$. We remove all observations at the quadrant $x_1>0, x_2<0$ and evaluate model accuracy in this region. The interpolation performance for different models is shown in Table \ref{table:fhn_interpolation}. The point estimates of npODE and NeuralODE have biases, while the Bayesian variants
of GPODE and NeuralODE provide good uncertainty estimates corresponding to their better predictive performance. 

\begin{figure}
\centering
\includegraphics[width=0.85\columnwidth]{plots/convergence_runtime.pdf}
\caption{Optimization efficiency with GPODE models.}
\label{fig:gpode_shooting_efficiency}
\end{figure}

\begin{figure*}[t]
    \centering
    \includegraphics[width=0.9\textwidth]{plots/vdp_seqlen.pdf}
    \caption{Varying sequence length and observation noise: shooting formulation makes GPODE feasible for long sequences, outperforming the non-shooting version and competing methods. We report the results for different levels of observation noise and training sequence length on the VDP system.}    
    \label{fig:seqlen_illustration}
\end{figure*}

\subsection{Learning long trajectories with the shooting formulation}
We demonstrate the necessity of the shooting formulation for working with long training trajectories. We use the VDP system with four observations per unit of time for $T = (25,40,55)$ corresponding to $N = (100,160,220)$ observed states. We also vary the observation variance as $\sigma^2 = (0.01,0.05,0.1)$ and test the model for forecasting additional 50 time points.

Figure \ref{fig:seqlen_illustration} demonstrates that vanilla GPODE, NeuralODE, and Bayesian NeuralODE fail to fit the data with long sequences on all noise levels. In contrast, inference for the shooting model is successful in all settings. The npODE is remarkably robust to long trajectories. We believe the robustness of npODE mainly stems from the excellent parameter initialization strategy (see supplementary section 2.2) coupled with the fully deterministic optimization setup (no reparametrization gradients).

Figure \ref{fig:gpode_shooting_efficiency} shows a runtime trace comparison between vanilla GPODE and the shooting variant in wall-clock time for a fixed budget of 15000 optimization steps on the VDP system with $N=100$, $T=25$ and $\sigma^2=0.01$. The shooting model converges approximately 10 times faster. The speedup stems from the parallelization of the shooting ODE solver, since the shooting method splits the full IVP problem into numerous short and less non-linear IVPs. In addition, the shooting method relaxes the inference problem with its auxiliary augmentation.  This experiment was conducted on a system with AMD Ryzen 5 3600 processor and Nvidia GeForce GTX 1660S GPUs.


\begin{figure*}[t]
    \centering
    \includegraphics[width=0.85\textwidth]{plots/mocap_39_100.pdf}
    \caption{Learning the walking dynamics of subject \texttt{39}: The true dynamics and predicted dynamics (mean) for the first three components in PCA space are shown in (a). Corresponding trajectories in the observation space for 6 different sensors are shown in (b) (We do not plot the observation noise variance)}
    \label{fig:mocap_illustration}
\end{figure*}

\begin{table*}
\centering
\caption{Test MNLL and MSE metrics for dynamics prediction task on CMU MoCap dataset.}
\resizebox{0.9\textwidth}{!}{
\begin{tabular}{c l  c c  c c  c c}
\toprule
\multirow{2}*{Metric}  & \multirow{2}*{Method} & \multicolumn{2}{c}{Subject 09}
 & \multicolumn{2}{c}{Subject 35}
 & \multicolumn{2}{c}{Subject 39} \\
 \cmidrule(lr){3-4}
 \cmidrule(lr){5-6}
 \cmidrule(lr){7-8}
 & & short & long
 & short & long
 & short & long \\
\midrule
\multirow{4}*{MNLL($\downarrow$)}
& Bayesian NeuralODE (VI) & $2.03 \pm 0.10 $ & $1.50 \pm 0.05 $ & $1.42 \pm 0.05 $ & $1.37 \pm 0.06 $ & $1.61 \pm 0.07 $ & $1.45 \pm 0.03 $ \\
& npODE & $2.09 \pm 0.01 $ & $1.78 \pm 0.08 $ & $1.67 \pm 0.02 $ & $1.66 \pm 0.04 $ & $2.06 \pm 0.05 $ & $1.78 \pm 0.04 $ \\
& GPODE-vanilla & $1.30 \pm 0.02$ & $1.26 \pm 0.02$ & $1.27 \pm 0.04$ & $1.39 \pm 0.04$ & $1.29 \pm 0.01$ & $\mathbf{1.13 \pm 0.01}$ \\
& GPODE-shooting & $\mathbf{1.19 \pm 0.02}$ & $\mathbf{1.14 \pm 0.02}$ & $\mathbf{1.25 \pm 0.06}$  &  $\mathbf{1.08 \pm 0.02}$ & $\mathbf{1.25 \pm 0.01}$ & $1.36 \pm 0.02$  \\

\midrule
\multirow{5}*{MSE($\downarrow$)} 
& Bayesian NeuralODE (VI) & $25.50 \pm 1.70 $ & $21.32 \pm 2.58 $ & $23.09 \pm 3.95 $ & $20.86 \pm 2.95 $ & $53.34 \pm 5.31 $ & $39.66 \pm 6.82 $ \\
& NeuralODE & $27.53 \pm 2.87 $ & $33.83 \pm 2.46 $ & $36.50 \pm 3.86 $ & $23.54 \pm 0.56 $ & $115.38 \pm 10.96 $ & $53.51 \pm 2.98 $ \\
& npODE & $17.91 \pm 1.62 $ & $19.76 \pm 4.29 $ & $26.24 \pm 2.88 $ & $22.83 \pm 3.91 $ & $92.80 \pm 15.74 $ & $55.94 \pm 4.63 $ \\
& GPODE-vanilla & $15.78 \pm 0.67$ &  $12.62 \pm 1.14$  & $16.14 \pm 0.99$ & $15.53 \pm 0.76$ & $\mathbf{20.71 \pm 1.25}$ & $23.64 \pm 1.94$ \\
& GPODE-shooting  & $\mathbf{9.11 \pm 0.37}$  & $\mathbf{8.38 \pm 1.23}$ & $\mathbf{10.11 \pm 0.79}$ & $\mathbf{11.66 \pm 0.73}$ & $26.72 \pm 0.63$ & $\mathbf{21.17 \pm 2.88}$  \\
\bottomrule
\end{tabular}
}
\label{table:mocap}
\end{table*}

\subsection{Learning human motion dynamics}
We learn the dynamics of human motion from noisy experimental data from CMU MoCap database for three subjects, \texttt{09}, \texttt{35} and \texttt{39}. The dataset consists of 50 sensor readings from different parts of the body while walking or running. We follow the preprocessing of \citet{4359316} and center the data. The dataset was further split into train, test, and validation sequences. We observed that the NeuralODE, the Bayesian NeuralODE version with VI, and npODE models suffer from over-fitting, and we remedy this by applying early stopping by monitoring the validation loss during optimization.

We project the original 50-dimensional data into a 5-dimensional latent space using PCA and learn the dynamics in the latent space. To compute the data likelihood, we project the latent dynamics back to the original data space by inverting the PCA. We divide the experiment into sub-tasks MoCap-short and MoCap-long, based on the length of the sequence considered for model training (see the supplementary section for more details on the dataset and experimental setup). The model predictive performance is measured on unseen test sequences in both tasks. 

Table \ref{table:mocap} indicates that GPODE outperforms the competing npODE and NeuralODE model variants. Figure \ref{fig:mocap_illustration} visualizes the predicted dynamics for a test sequence. The GPODE variants have reasonable posterior uncertainties, while NeuralODE variants and npODE tend to be overconfident and make more mistakes (see Figure \ref{fig:mocap_illustration} (b), sensors \texttt{05}, \texttt{41} and \texttt{47}). We note that some variations in the data space cannot be accurately estimated due to the low-dimensional PCA projection. 

\section{Conclusion and Discussion}
We proposed a novel model for Bayesian inference of ODEs using Gaussian processes. With this approach, one can model unknown ODE systems directly from the observational data and learn posteriors of the continuous-time vector fields. In contrast, earlier works produce point estimate solutions. We believe this to be a significant addition to the data-descriptive ODE modeling methods, especially for applications where uncertainty quantification is critical. Many conventional machine learning algorithms have been interpreted and modeled as continuous-time dynamical systems, with applications to generative modeling \citep{grathwohl2019ffjord} and probabilistic alignment \citep{pmlr-v108-ustyuzhaninov20a}, among others. However, scaling GPs to high-dimensional datasets (such as images) can be a bottleneck. The applicability of the proposed model as a plug-in extension for these applications can be studied as part of future work. 

We also highlighted a problem of learning black-box ODE models on long trajectories and proposed a probabilistic shooting framework enabling efficient inference on such tasks. This framework can be applied to other existing approaches, such as NeuralODEs. However, the proposed shooting augmentation introduces model approximation and involves approximate inference over auxiliary shooting variables. Hence the benefits of the shooting augmentation can be task specific, especially on short sequences. Comprehensive empirical studies across different types of tasks can be considered in future work.
\bibliography{references}

\end{document}
