\documentclass[accepted]{uai2022}

\usepackage[round]{natbib}
\bibliographystyle{plainnat}
\renewcommand{\bibsection}{\subsubsection*{References}}
    
\usepackage[utf8]{inputenc}
\usepackage[T1]{fontenc}
\usepackage{url,booktabs,nicefrac,microtype}

\usepackage{xcolor}
\definecolor{linkblue}{rgb}{0.1,0.4,0.7} 

\usepackage{multicol}
\usepackage{float}
\usepackage{cancel}
\usepackage{physics}
\usepackage{amsfonts,amsmath,amsthm,amssymb}
\usepackage{mathtools}
\usepackage{wrapfig}
\usepackage{pifont}
\usepackage{tikz}
\usetikzlibrary{bayesnet}
\usetikzlibrary{arrows}

\newcommand{\cmark}{ \textcolor{green!60!black}{\ding{51}} }
\newcommand{\xmark}{ \textcolor{red!60!black}{\ding{55}} }
\usepackage{wrapfig}

\usepackage{colortbl}
\usepackage{tabularx}
\usepackage{graphbox}
\usepackage{comment}
\usepackage{multirow}
\usepackage{booktabs}
\usepackage{algorithm, algpseudocode, algcompatible}

\usepackage{caption}
\usepackage{subcaption}
\input{math.tex}

% Title
\title{Variational multiple shooting for Bayesian ODEs with Gaussian processes (Supplementary material)}


% Add authors
\author[1]{\href{mailto:<pashupati.hegde@aalto.fi>?Subject=Your UAI 2022 paper}{Pashupati~Hegde}{}}
\author[1]{\c{C}a\u{g}atay~Y{\i}ld{\i}z}
\author[1]{Harri~L{\"a}hdesm{\"a}ki}
\author[1]{Samuel~Kaski}
\author[1]{Markus~Heinonen}

% Add affiliations after the authors
\affil[1]{
    Department of Computer Science\\
    Aalto University\\
    Finland
}

\begin{document}
\onecolumn 
\maketitle

\section{Detailed Derivations}
\subsection{Inference for the vanilla GPODE model}

\paragraph{The model.}
We consider the problem of inferring an ODE system
\begin{align}
    \y(t) &= \x(t) + \be \\ 
    \x(t) &= \x_0 + \int_0^t \f( \x(\tau)) d\tau \label{eq:supp_odeproblem}
\end{align}
from some noisy observations $\y(t)$ of the true system state $\x(t) \in \R^D$, whose evolution over time $t \in \R_+$ follows a differential equation vector field
\begin{align}
    \dot{\x}(t) &= \frac{d\x(t)}{dt} := \f(\x(t)), \qquad \f : \R^D \to \R^D
\end{align}
starting from an initial state $\x_0 \in \R^D$. Our goal is to learn the underlying ODE vector field $\f$.

We propose a Gaussian process prior to the differential function
\begin{align}
    \f(\x) &\sim \GP(\0, k(\x, \x')).
\end{align}

Following \citet{titsias2009variational} for sparse inference of GPs using inducing variables, we augment the full model with inducing values $\U = (\bu_1, \ldots, \bu_M)^T \in \R^{M \times D}$ and inducing locations $\Z = (\z_1, \ldots, \z_M)^T \in \R^{M \times D}$, which results in a low-rank GP
\begin{align}
    p(\U) &= \N(\U | \0, \K_{\Z \Z}) \\
    p(\f | \U) &= \N(\f | \A \mathrm{vec}(\U), \K_{\X \X} - \A \K_{\Z \Z} \A^T),
\end{align}
where $\X = (\x_1, \x_2, \ldots \x_{N'})^T \in \R^{N' \times D}$ collects all the intermediate state evaluations $\x(t_i)$ encountered along numerical approximation of the true continuous ODE integral \eqref{eq:supp_odeproblem}, $\f = (\f(\x_1)^T, \ldots, \f(\x_{N'})^T)^T \in \mathbb{R}^{N'D \times 1}$, $\K_{\X\X}$ is a block-partitioned matrix of size $N'D \times N'D$ with $D \times D$ blocks, so that block $(\K_{\X\X})_{i,j} = K(\x_i, \x_j)$, and $\A = \K_{\X\Z} \K^{-1}_{\Z\Z}$. 

\paragraph{The joint model} 
The joint probability of the model is
\begin{align}
    p(\Y,\f,\U, \x_0) &= \prod_{i=1}^{N} p(\y_i | \f, \x_0) p(\f|\U)p(\U) p(\x_0)\\
    &=  \prod_{i=1}^{N} \underbrace{p(\y_i | \f, \x_0)}_{\text{likelihood}} \underbrace{p(\f, \U)}_{\text{GP prior}} \underbrace{p(\x_0)}_{\text{initial state prior}},
\end{align}
where we assume a standard Gaussian prior $p(\x_0) = \N(\0, \eye)$ for the unknown initial state $\x_0$.

\begin{figure}[ht]
    \centering
    \begin{subfigure}[b]{0.42\columnwidth}
    \resizebox{1.1\columnwidth}{!}{
    \begin{tikzpicture}
     \node[latent][fill=green] (x0) {$x_0$}; 
     \node[latent,right=of x0] (x1) {$x_1$}; 
     \node[latent,right=of x1] (x2) {$x_2$}; 
     \node[latent,right=of x2] (xn) {$\cdots$}; 
     \node[obs,below=of x1] (y1) {$y_1$}; 
     \node[obs,right=of y1] (y2) {$y_2$}; 
     \node[obs,right=of y2] (yn) {$\cdots$}; 
     \path[->] (x0) edge[dotted,bend right=15] node[fill=white, anchor=center, pos=0.4, draw, circle, solid, scale=0.5] {$f$} (x1);
     \path[->] (x1) edge[dotted,bend right=15] node[fill=white, anchor=center, pos=0.4, draw, circle, solid, scale=0.5] {$f$} (x2);
     \path[->] (x2) edge[dotted,bend right=15] node[fill=white, anchor=center, pos=0.4, draw, circle, solid, scale=0.5] {$f$} (xn);
     \edge {x1} {y1};
     \edge {x2} {y2};
     \edge {xn} {yn};
    \end{tikzpicture}
    }
    \caption{Vanilla GPODE model}
    \end{subfigure}
    \hspace{10mm}
    \begin{subfigure}[b]{0.42\columnwidth}
    \resizebox{1.1\columnwidth}{!}{
    \begin{tikzpicture}
     \node[latent][fill=green]           (s0) {$s_0$}; 
     \node[latent,right=of s0][fill=green] (s1) {$s_1$}; 
     \node[latent,right=of s1][fill=green] (sn) {$\cdots$}; 
     \node[latent,below=of s1] (x1) {$x_1$}; 
     \node[latent,right=of x1] (x2) {$x_2$}; 
     \node[latent,right=of x2] (xn) {$\cdots$}; 
     \node[obs,below=of x1] (y1) {$y_1$}; 
     \node[obs,right=of y1] (y2) {$y_2$}; 
     \node[obs,right=of y2] (yn) {$\cdots$}; 
     \path[->] (s0) edge[dotted,bend right=15] node[fill=white, anchor=center, pos=0.5, draw, circle, solid, scale=0.5] {$f$} (x1);
     \path[->] (s1) edge[dotted,bend right=15] node[fill=white, anchor=center, pos=0.5, draw, circle, solid, scale=0.5] {$f$} (x2);
     \path[->] (sn) edge[dotted,bend right=15] node[fill=white, anchor=center, pos=0.5, draw, circle, solid, scale=0.5] {$f$} (xn);
     \path[->] (x1) edge[dotted,bend right=15](s1);
     \path[->] (x2) edge[dotted,bend right=15](sn);
     \edge {x1} {y1};
     \edge {x2} {y2};
     \edge {xn} {yn};
    \end{tikzpicture}
    }
    \caption{Shooting GPODE model}
    \end{subfigure}
    \hspace{10mm}
    \caption{Plate diagrams: latent random variables that are considered during model inference are shaded in green. The intermediate variables $\x_i$ (unshaded) are defined as deterministic transformations of the inferred variables (conditioned on the vectorfield). In the vanilla GPODE formulation (a), the initial state distribution $\x_0$ is integrated forward in time to match all the observations $\{\y_1, \y_2, \ldots, \y_N\}$ forming a full trajectory. The shooting version (b) splits the full trajectory into multiple subintervals. Every subinterval $i$ starts with an approximated state distribution $\s_i$, which is integrated forward to match the next observation $\y_{i+1}$. In addition, the state evolution from the previous shooting variable is matched to the variational shooting approximation at the current state.}
    \label{fig:plate}
\end{figure}


\paragraph{Inference.}
Our primary goal is to learn the vector field $\f$ by inferring the model posterior $p(\f,\U, \x_0 | \Y)$, which is intractable. We resort to stochastic variational inference \citep{hensman2013gaussian}, and introduce a factorized Gaussian posterior approximation for the inducing variables across state dimensions
\begin{align}
    q(\U) &= \prod_{d=1}^{D}\N(\bu_d|\m_d, \Q_d),
\end{align}
where, $\bu_d \in \R^M$ and  $\m_d \in \R^M, \Q_d \in \R^{M\times M}$ are the mean and covariance parameters of the variational Gaussian posterior approximation for the inducing variables. 
The Gaussian process posterior process with an inducing approximation can be written as
\begin{align}
    q(\f) &= \int p(\f|\U) q(\U) d\U \\
          &= \int \N\left(\f | \A \mathrm{vec}(\U), \K_{\X\X} - \A \K_{\Z\Z}\A^T \right) q(\U) d\U. \label{eq:supp_posterior_gp}
\end{align}
We also introduce a posterior approximation for the initial state variable $\x_0$,
\begin{align}
    q(\x_0) = \N(\x_0 | \m_0, \S_0).
\end{align}
This results in a variational joint posterior approximation
\begin{align}
    q(\f,\U,\x_0) &= q(\f,\U)q(\x_0)\\ 
    &= p(\f|\U) q(\U) q(\x_0).
\end{align}

\paragraph{ELBO.}
With the above model specification, under variational inference of the posterior approximations, the evidence lower bound (ELBO) $\log p(\Y) \ge \L$ can be written as,
\begin{align}
    \L &= \iiint q(\f, \U, \x_0) \log \frac{p(\Y, \f, \U , \x_0)}{q(\f, \U, \x_0)} d\f d\U d\x_0 \\
    &= \iiint q(\f, \U, \x_0) \log \prod_{i=1}^{N} \underbrace{p(\y_i | \f, \x_0)}_{\L_y}  \frac{p(\f|\U)}{p(\f|\U)} \underbrace{\frac{p(\U)}{q(\U)}}_{\L_u}  \underbrace{\frac{p(\x_0)}{q(\x_0)}}_{\L_{\x_0}} d\f d\U d\x_0.
\end{align}
Hence the ELBO decomposes into three additive terms
\begin{align}
    \L &= \L_y + \L_u + \L_{\x_0},
\end{align}
where each term contains the (relevant parts of) expectation over $q(\f,\U,\x_0)$.

\paragraph{Likelihood term.}
The variational likelihood term $\L_y$ is an expectation of the likelihood wrt the variationally marginalized vectorfield posterior $q(\f)$, and the initial state distribution $q(\x_0)$,
\begin{align}
    \L_y &= \iint q(\f,\x_0) \log p(\y | \f, \x_0) d\f d\x_0 \\ 
         &= \sum_{i=1}^N \E_{q(\f, \x_0)} \log p(\y_i|\f, \x_0).
\end{align}
This term computes the likelihood $p(\y_i|\f, \x_0) = p(\y_i|\x_i)$ over ODE state solutions $\x_i = \x_0 + \int_0^{t_i} \f( \x(\tau)) d\tau$ for a single realization of the vector field $\f \sim p(\f)$ and the initial state $\x_0 \sim p(\x_0)$. Because of the non-linear integration $\x_0 \mapsto \x(t)$, we cannot solve this integral analytically. Instead, we resort to Monte Carlo integration by sampling ODE trajectories over different vector field realizations $\f \sim q(\f)$ and initial states $\x_0 \sim q(\x_0)$. In practice, this term can be approximated as  
\begin{align}
    \L_y &\approx \frac{1}{S}\sum_{s=1}^S \sum_{i=1}^N \log p(\y_i|\f^{(s)}, \x^{(s)}_0)
\end{align}
where we sum over $S$ reparameterized samples $\f^{(s)} \sim q(\f)$ and $\x^{(s)}_0 \sim q(\x_0)$.

\paragraph{Inducing KL.} This term corresponds to the KL divergence between variational posterior and the prior distribution of inducing values. This term can be derived analytically as the KL between multivariate Gaussians. 
\begin{align}
\L_u &= \int q(\U) \log \frac{p(\U)}{q(\U)} d\U \\
&= \sum_{d=1}^{D}\int q(\bu_d) \log \frac{p(\bu_d)}{q(\bu_d)} d\bu_d \\
&=- \sum_{d=1}^{D}\KL\left[q(\bu_d)||p(\bu_d)\right]
\end{align}

\paragraph{Initial state KL.} This term corresponds to the KL divergence between variational posterior and the prior distribution of the initial state. With an assumption of Gaussian prior and variational posterior, this term can also be derived analytically, 
\begin{align}
\L_{\x_0} &= \int q(\x_0) \log \frac{p(\x_0)}{q(\x_0)} d\x_0 \\
&= - \KL\left[q(\x_0)||p(\x_0)\right]
\end{align}


\paragraph{Complete ELBO.}
The full ELBO is then
\begin{align}
    \L &= \sum_{i=1}^N \E_{q(\f, \x_0)} \log p(\y_i | \f, \x_0) - \KL[ q(\U) \, || \, p(\U)] - \KL[ q(\x_0) \, || \, p(\x_0) ] \label{eq:supp_full_elbo}
\end{align}

\subsection{Decoupled sampling of GPODEs}
In this section, we provide details for simulating valid ODE trajectories from a GP vector field posterior of the form 
\begin{align}
    q(\bu) &= \N(\m, \Q), \\
    q(\f) &= \int p(\f|\bu) q(\bu) d\bu \\
          &= \int \N\left(\f | \A \bu, \K_{\X\X} - \A \K_{\Z\Z}\A^T \right) q(\bu) d\bu, \label{eq:supp_vf_posterior}
\end{align}
where $\A = \K_{\X\Z} \K^{-1}_{\Z\Z}$ and $\m \in \R^M, \Q \in \R^{M \times M}$ are the variational mean and covariance parameters of the Gaussian posterior approximation for inducing variables. For simplicity, we consider a scalar valued GP, but it is straightforward to extend this approach to vector-valued GPs. 


A sparse GP posterior of the form \eqref{eq:supp_vf_posterior} can be decomposed into two parts using Matheron's rule \citep[Corollary 2]{wilson2020efficiently},
\begin{align}
\label{eq:supp_decoupled_conditional}
    \underbrace{f(\x)|\bu}_\texttt{posterior} &= \underbrace{f(\x)}_\texttt{prior} + \underbrace{k(\x,\Z)K(\Z,\Z)^{-1}(\bu - \f_{\Z})}_\texttt{update}.
\end{align}

\citet{wilson2020efficiently} propose a decoupled sampling from the \texttt{posterior} by using different bases for the \texttt{prior} and \texttt{update} terms. In particular, they propose Fourier basis functions for the \texttt{prior} term and canonical basis for the \texttt{update} term respectively 
\begin{align}
\label{eq:supp_decoupled_bases}
    \underbrace{f(\x)|\bu}_\texttt{posterior} &\approx \underbrace{\sum_{i=1}^{F} w_i \phi_i(\x)}_\texttt{prior} + \underbrace{\sum_{j=1}^{M} \nu_j K(\x, \z_j)}_\texttt{update},
\end{align}
where we use $F$ Fourier bases $\phi_i(\cdot)$ with $w_i \sim \N(0,1)$ \citep{rahimi2007random} to represent the stationary prior, and function basis $K(\cdot,\z_j)$ for the posterior update with $\bnu = K(\Z,\Z)^{-1}(\bu - \bPhi \w)$, $\bPhi = \bphi(\Z) \in \R^{M \times F}, \w \in \R^{F}$. We can evaluate functions from the posterior \eqref{eq:supp_vf_posterior} in linear time at arbitrary locations.

For the experimental results presented in the paper, we use a squared exponential kernel for which we can compute the feature maps $\bphi_i(\x) = \sqrt{\frac{\sigma_f^2}{F}}(\cos \x^T\bo_i, \sin \x^T\bo_i)$ where $\bo_i$ is sampled proportional to the spectral density of the squared exponential kernel $\bo_i \sim \N(\0, \Lambda^{-1})$, $\Lambda$ is a diagonal matrix collecting lengthscale parameters of the kernel $\Lambda = \text{diag}(l_1^2,l_2^2,\ldots, l_D^2)$ and $\sigma_f^2$ is the signal variance parameter. In the case of the squared exponential kernel, this results in $2F$ feature maps $\bphi(\x) \in \R^{2F}$, for which we sample weights $\w \in \R^{2F}$ from the standard Normal  $w_i \sim \N(0,1)$. By fixing random samples of feature maps $\bphi(\cdot)$, corresponding weights $\w$ and inducing values $\bu$ for an ODE integration call, we can sample a unique ODE trajectory from a posterior vector field of the form \eqref{eq:supp_vf_posterior}.

\subsection{Probabilistic shooting formulation for GPODEs}
\paragraph{The model.}
We consider the problem of inferring an ODE system
\begin{align}
    \y(t) &= \x(t) + \be \\ 
    \x(t) &= \s_0 + \int_0^t \f( \x(\tau)) d\tau,
\end{align}
from some noisy observations $\y(t)$ of the true system state $\x(t) \in \R^D$, whose evolution over time $t \in \R_+$ follows a differential equation
\begin{align}
    \dot{\x}(t) &= \frac{d\x(t)}{dt} := \f(\x(t)), \qquad \f : \R^D \to \R^D
\end{align}
starting from the initial state $\s_0 \in \R^D$. Our goal is to learn the underlying ODE vector field $\f$.

\paragraph{Shooting augmentation.}
We propose an augmented `shooting' ODE system
\begin{align}
    \y_i &= \x(t_i; \s_{i-1}) + \be \\
    \x(t_i; \s_{i-1}) &= \s_{i-1} + \int_{t_{i-1}}^{t_i} \f(\x(\tau)) d\tau  \label{eq:supp_shooting_ode} \\
    \s_i &= \x(t_i; \s_{i-1}) + \bxi \label{eq:supp_shooting_tolerence},
\end{align}
where we divide the state function $\x(t)$ into $N$ short segments, with the end state of $i^{\textit{th}}$ segment $\x(t_i; \s_{i-1})$ defining solutions to initial value problems \eqref{eq:supp_shooting_ode} starting from the corresponding shooting variables $\s_{i-1}$. These short shooting segments follow the same differential $\f$ as the original model. The augmented system is equivalent to the original ODE system, in the limit when the tolerance parameter $\bxi \to \0$.

We assume Gaussian distributions on both observation noise and tolerance parameters, resulting in the following distributions,
\begin{align}
    p(\y_i|\s_{i-1}) &=  \N(\y_i|\x(t_i; \s_{i-1}), \sigma^2_y \eye); \qquad \be \sim \N(\0, \sigma^2_y\eye), \label{eq:observation_prior}\\
    p(\s_i|\s_{i-1}) &= \N(\s_i|\x(t_i; \s_{i-1}), \sigma^2_\xi \eye); \qquad \bxi \sim \N(\0, \sigma^2_\xi\eye). \label{eq:supp_shooting_prior}
\end{align}

\paragraph{Gaussian process ODE.}
We propose a Gaussian process prior for the differential function
\begin{align}
    \f(\x) &\sim \GP(\0, k(\x, \x'))
\end{align}
In addition, we augment the full model with inducing values $\U = (\bu_1, \ldots, \bu_M)^T \in \R^{M \times D}$ and inducing locations $\Z = (\z_1, \ldots, \z_M)^T \in \R^{M \times D}$, which results in a low-rank GP
\begin{align}
    p(\U) &= \N(\U | \0, \K_{\Z \Z}) \\
    p(\f | \U) &= \N(\f | \A \mathrm{vec}(\U), \K_{\X \X} - \A \K_{\Z \Z} \A^T),
\end{align}
where $\A = \K_{\X \Z} \K_{\Z \Z}^{-1}$.


\paragraph{The joint model.} 
The joint probability of the model is
\begin{align}
    p(\Y,\S,\f,\U) &= p(\Y|\S,\f)p(\S|\f)p(\f|\U)p(\U) \\
    &=  \prod_{i=1}^{N} \underbrace{p(\y_i | \s_{i-1}, \f)}_{\text{likelihood}} \prod\limits_{i=1}^{N-1}\underbrace{p(\s_{i} | \s_{i-1}, \f)}_{\text{shooting prior}} \underbrace{p(\s_0)}_{\text{initial state}} \underbrace{p(\f|\U)p(\U)}_{\text{GP prior}},
\end{align}
where $\S = (\s_0, \s_1, \ldots, \s_{N-1})^T \in \R^{N \times D}$ collects all shooting variables. 

We also note that observations are at indices $1,\ldots,N$, while the shooting variables are always one behind the observations at $0, \ldots, N-1$ (see the plate diagram in Figure~\ref{fig:plate}(b)).

\paragraph{Inference.}
Our primary goal is to learn the vector field $\f$ by inferring the model posterior $p(\S,\f,\U | \Y)$, which is intractable. Similar to non-shooting GPODEs, we introduce a factorized Gaussian posterior approximation  for the inducing variables across state dimensions 
\begin{align}
    q(\U) &= \prod_{d=1}^{D}\N(\bu_d|\m_d, \Q_d),
\end{align}
where, $\bu_d \in \R^M$ and  $\m_d \in \R^M, \Q_d \in \R^{M\times M}$ are the mean and covariance parameters of the variational Gaussian posterior approximation for the inducing variables. 

The Gaussian process posterior process with an inducing approximation can be written as
\begin{align}
    q(\f) &= \int p(\f|\U) q(\U) d\U \\
          &= \int \N\left(\f | \A \mathrm{vec}(\U), \K_{\X\X} - \A \K_{\Z\Z}\A^T \right) q(\U) d\U.
\end{align}

Next, we introduce factorized Gaussian posterior approximations for the shooting variables $\S$ as well,
\begin{align}
    q(\S) &= \prod\limits_{i=0}^{N-1} q(\s_i) = \prod\limits_{i=0}^{N-1} \N(\s_i | \a_i, \bS_i),
\end{align}
where $\a_i \in \R^D$ and $\bS_i \in \R^{D\times D}$ are the mean and covariance parameters of the variational Gaussian posterior approximation for the shooting variables. 

This results in a variational joint posterior approximation
\begin{align}
    q(\S,\f,\U) &= q(\S) q(\f,\U)\\ 
    % &= \prod\limits_{i=0}^{N-1} q_i(\s_i) p(\f|\U) q(\U).
    &= \prod\limits_{i=0}^{N-1} q(\s_i) p(\f|\U) q(\U).
\end{align}

\paragraph{ELBO.}
Under variational inference the posterior approximations $q$ are optimized to match the true posterior in the KL sense,
\begin{align}
    \argmin_q \: \KL\big[ q(\S,\f,\U) \, || \, p(\S,\f,\U | \Y) \big].
\end{align}
This is equivalent to maximizing the evidence lower bound (ELBO) $\log p(\Y) \ge \L$,
\begin{align}
    \L &= \iiint q(\S, \f, \U) \log \Bigg[ \frac{p(\Y, \S, \f, \U)}{q(\S, \f, \U)}\Bigg] d\S d\f d\U\\
    &= \iiint  q(\S, \f, \U) \log \Bigg[\prod_{i=1}^{N} p(\y_i | \s_{i-1}, \f) \cdot \prod_{i=1}^{N-1} \frac{p(\s_{i} | \s_{i-1}, \f)}{q(\s_{i})}  \cdot \frac{p(\s_0)}{q(\s_0)} \cdot \frac{p(\f,\U)}{q(\f,\U)} \Bigg]d\S d\f d\U \\
    &= \underbrace{\iint  q(\S)q(\f) \log \prod_{i=1}^{N} p(\y_i | \s_{i-1}, \f) d\S d\f}_{\L_y} + \underbrace{\iint q(\S)q(\f) \log \prod_{i=1}^{N-1} p(\s_{i} | \s_{i-1}, \f)d\S d\f}_{\L_{sc}} \nonumber \\
    &\qquad  \underbrace{-  \int q(\S)  \log \prod_{i=1}^{N-1} q(\s_{i}) d\S }_{\L_{se}}+ \underbrace{\int  q(\s_0) \log \frac{p(\s_0)}{q(\s_0)} d\s_0}_{\L_0}  + \underbrace{\int  q(\U) \log \frac{p(\U)}{q(\U)} d\U}_{\L_u},
\end{align}
which results in the ELBO decomposing into five additive terms
\begin{align}
    \L &= \L_y + \L_{sc} + \L_{se} + \L_0 + \L_u,
\end{align}
where each term contains the (relevant parts of) expectation over $q(\S,\f,\U)$.

\paragraph{Likelihood term.}
The variational likelihood term $\L_y$ is an expectation of the likelihood under the posteriors of shooting variables $q(\S)$ and the posterior vectorfield $q(\f)$,
\begin{align}
    \L_y &= \iint  q(\S)q(\f) \log \prod_{i=1}^{N} p(\y_i | \s_{i-1}, \f) d\S d\f\\ 
         &= \sum_{i=1}^N \iint q(\s_{i-1}) q(\f) \log p(\y_i | \s_{i-1}, \f) d\s_{i-1} d\f \\ 
         &= \sum_{i=1}^N \E_{q(\s_{i-1})q(\f)} \Big[\log p(\y_i|\s_{i-1}, \f) \Big].
\end{align}
We can evaluate this term with Monte Carlo integration by taking reparameterized samples from the posteriors $\f^{(s)} \sim q(\f)$ and $\s^{(s)}_{i-1} \sim q(\s_{i-1})$ as below 
\begin{align}
    \L_y &= \sum_{i=1}^N \E_{q(\s_{i-1},\f)} \Big[\log p(\y_i|\s_{i-1}, \f) \Big]\\
    \L_y &\approx \frac{1}{S}\sum_{s=1}^S \sum_{i=1}^N \Big[\log p(\y_i|\x^{(s)}_i)\Big], 
\end{align}
where $\x^{(s)}_i$ is defined as solution to the following initial value problem,
\begin{align}
    \x^{(s)}_i := \x^{(s)}(t_i; \s_{i-1}) &= \s^{(s)}_{i-1} + \int_{t_{i-1}}^{t_i} \f^{(s)}(\x(\tau)) d\tau.
\end{align}


\paragraph{Shooting cross-entropy term.}
This term computes the cross-entropy between the prior specification for the shooting variables under the ODE evolution $p(\s_{i} | \s_{i-1}, \f)$, and the point-wise approximations $q(\s_i)$,
\begin{align}
    \L_{sc} &= \iint q(\S)q(\f) \Big[ \log \prod_{i=1}^{N-1} p(\s_{i} | \s_{i-1}, \f)\Big] d\S d\f \\ 
    &= \iint q(\s_{N-1}) \cdots q(\s_1) q(\s_0) q(\f) \Big[ \log p(\s_{N-1}|\s_{N-2}, \f) \cdots p(\s_1|\s_0, \f)\Big] d\S d\f\\ 
    &= \sum_{i=1}^{N-1} \iiint q(\f) q(\s_i) q(\s_{i-1}) \Big[ \log p(\s_i | \s_{i-1}, \f)\Big] d\s_{i-1} d\s_i d\f \\  
    &= \sum_{i=1}^{N-1} \E_{q(\s_{i}, \s_{i-1},\f)} \Big[ \log p\left(\s_i | \s_{i-1}, \f \right) \Big] .
\end{align}

This term can also be numerically estimated with Monte Carlo integration using posterior samples $\f^{(s)} \sim q(\f)$, $\s^{(s)}_{i-1} \sim q(\s_{i-1})$ and  $\s^{(s)}_{i} \sim q(\s_{i})$
\begin{align}
    \L_{sc} &= \sum_{i=1}^{N-1} \E_{q(\s_{i}, \s_{i-1},\f)} \Big[ \log p\left(\s_i | \s_{i-1}, \f \right) \Big] \\
    &\approx \frac{1}{S}\sum_{s=1}^S \sum_{i=1}^{N-1} \log p\left(\s^{(s)}_i \big| \x^{(s)}_i \right), \\
     \x^{(s)}_i := \x^{(s)}(t_i; \s_{i-1}) &= \s^{(s)}_{i-1} + \int_{t_{i-1}}^{t_i} \f^{(s)}(\x(\tau)) d\tau.
\end{align}

\paragraph{Shooting entropy term.}
This term computes the entropy of the posterior approximations for shooting variables $q(\s_{i})$. Since we assume factorized Gaussian approximations, this term can be simplified analytically as the sum of Gaussian entropy. 
\begin{align}
    \L_{se} &= - \int q(\S)  \log \prod_{i=1}^{N-1} q(\s_{i}) d\S \\ 
    &= - \sum_{i=1}^{N-1} \E_{q(\s_i)}  \Big[ \log q(\s_{i})  \Big].
\end{align}

\paragraph{Initial state KL term.} This term corresponds to the KL divergence between variational posterior and the prior distribution of the initial state. With the assumption of Gaussian prior and variational posterior, this term can also be derived analytically, 
\begin{align}
\L_0 &= \int q(\s_0) \log \frac{p(\s_0)}{q(\s_0)} d\s_0 \\
&= - \KL\left[q(\s_0)||p(\s_0)\right].
\end{align}


\paragraph{Inducing KL term.} This term corresponds to the KL divergence between variational posterior and prior distribution of inducing values. This term can also be derived analytically as the KL between multivariate Gaussians. 
\begin{align}
\L_u &= \int q(\U) \log \frac{p(\U)}{q(\U)} d\U \\
&= \sum_{d=1}^{D}\int q(\bu_d) \log \frac{p(\bu_d)}{q(\bu_d)} d\bu_d \\
&=- \sum_{d=1}^{D}\KL\left[q(\bu_d)||p(\bu_d)\right].
\end{align}


\paragraph{Complete ELBO.}
The full ELBO is then
\begin{align}
    \L &= \L_y + \L_{sc} + \L_{se} + \L_0 + \L_u \\
&= \sum_{i=1}^N \E_{q(\s_{i-1},\f)} \Big[\log p(\y_i|\s_{i-1}, \f) \Big] + \sum_{i=1}^{N-1} \E_{q(\s_{i}, \s_{i-1},\f)} \Big[ \log p\left(\s_i | \s_{i-1}, \f \right) \Big]\nonumber\\
&\quad  - \sum_{i=1}^{N-1} \E_{q(\s_i)}  \Big[ \log q(\s_{i})  \Big] - \KL[q(\s_0) \, || \, p(\s_0)] - \KL[ q(\U) \, || \, p(\U)]
\end{align}
which in practice is numerically estimated with Monte Carlo integration
\begin{align}
\L &\approx \frac{1}{S}\sum_{s=1}^S \sum_{i=1}^N \Big[\log p(\y_i|\x^{(s)}_i)\Big] + \frac{1}{S}\sum_{s=1}^S \sum_{i=1}^{N-1} \log p\left(\s^{(s)}_i \big| \x^{(s)}_i \right) \nonumber\\
 &\quad - \sum_{i=1}^{N-1} \E_{q(\s_i)}  \Big[ \log q(\s_{i})  \Big] - \KL[q(\s_0) \, || \, p(\s_0)] - \KL[ q(\U) \, || \, p(\U)]
\end{align}
where $\f^{(s)} \sim q(\f)$, $\s^{(s)}_{i-1} \sim q(\s_{i-1})$, $\s^{(s)}_{i} \sim q(\s_{i})$ and 
\begin{align}
     \x^{(s)}_i &:= \x^{(s)}(t_i; \s_{i-1}) = \s^{(s)}_{i-1} + \int_{t_{i-1}}^{t_i} \f^{(s)}(\x(\tau)) d\tau.
\end{align}


\section{Experimental Details}
\begin{algorithm}[th]
\caption{GPODEs : Bayesian inference of ODEs using Gaussian processes}
\label{alg:gpode}
\begin{algorithmic}
   \STATE {\bfseries Inputs:} 
   \STATE {} \hspace{5mm}  - Observed states $\mathbf{Y}$, observation time sequence $\mathbf{t}$.
   \STATE {\bfseries Initialize hyperparameters:} 
   \STATE {} \hspace{5mm}  - Kernel parameters $\mathbf{\theta}$, likelihood parameters, inducing locations $\mathbf{Z}$.
   \STATE {\bfseries Initialize variational parameters:} 
   \STATE {} \hspace{5mm}  - Parameters of $q(\mathbf{U}) =  \mathcal{N}(\mathbf{m}, \mathbf{Q})$.
   \STATE {} \hspace{5mm}  - Parameters of $q(\mathbf{x_0}) = \mathcal{N}(\mathbf{a_0}, \mathbf{\Sigma_0})$.
   \STATE {\bfseries Optimization:} 
   \FOR {every optimization step}
    \STATE (1) Sample a function $\mathbf{f}$ from the ODE posterior in \eqref{eq:supp_posterior_gp} by taking following samples:
    \STATE {} \hspace{5mm}  - Parameters of Fourier bases $\mathbf{\omega_\theta}$  proportional to the spectral density of GP kernel,
    \STATE {} \hspace{5mm}  - Weights $\mathbf{w} \sim \mathcal{N}(\mathbf{0}, \textbf{\textrm{I}})$,
    \STATE {} \hspace{5mm}  -  Sample from the inducing posterior $\mathbf{U} \sim \mathcal{N}(\mathbf{m}, \mathbf{Q})$.
    \STATE (2) Sample initial state $\mathbf{x}_0 \sim \mathcal{N}(\mathbf{a_0}, \mathbf{\Sigma_0})$.
    \STATE (3) Compute predicted states $\hat{\mathbf{Y}} = \textrm{ODEsolve}(\mathbf{f}, \mathbf{x_0}, \mathbf{t})$.
    \STATE (4) Compute ELBO from \eqref{eq:supp_full_elbo} : $\textrm{likelihood}(\mathbf{Y}, \hat{\mathbf{Y}})$, 
    $\textrm{KL}[q(\mathbf{U})||p(\mathbf{U})]$, $\textrm{KL}[q(\mathbf{x_0})||p(\mathbf{x_0})]$.
   \STATE (5) Update all parameters with stochastic gradients of ELBO.
   \ENDFOR
\end{algorithmic}
\end{algorithm}

\subsection{Optimization setup}
We use Adam \citep{kingma2014adam} optimizer and jointly train all the variational parameters and hyperparameters.
The complete list of optimized parameters, along with additional method-specific details, are given below.

\paragraph{Vanilla GPODE model.} We use a `whitened' representation for the inducing variables and optimize the following parameters against the evidence lower bound (see Algorithm~\ref{alg:gpode}).
\begin{itemize}[noitemsep,nolistsep]
    \item Variational parameters:
    \begin{itemize}
        \item Inducing variables $q(\U)$,  initial states $q(\x_0)$
    \end{itemize}
    \item Hyperparameters:
    \begin{itemize}
        \item Inducing locations $\Z$
        \item Likelihood parameters: scale parameter for the Gaussian likelihood
        \item Kernel parameters: length scales and signal variance parameters in case of squared exponential kernel
    \end{itemize}
\end{itemize}


\paragraph{Shooting GPODE model.} We use `whitened' representation for inducing variables and optimize the following parameters against the evidence lower bound. 
\begin{itemize}[noitemsep,nolistsep]
    \item Variational parameters:
    \begin{itemize}
        \item Inducing variables $q(\U)$, shooting states $q(\S)$
    \end{itemize}
    \item Hyperparameters:
    \begin{itemize}
        \item Inducing locations $\Z$
        \item Likelihood parameters: scale parameter for the Gaussian likelihood
        \item Kernel parameters: length scales and signal variance parameters in case of squared exponential kernel
    \end{itemize}
\end{itemize}

\paragraph{npODE model.} We use a `whitened' representation for the inducing variables, a maximum a posteriori (MAP) objective, and optimize the following parameters:
\begin{itemize}[noitemsep,nolistsep]
    \item Inducing values $\U$ and locations $\Z$.
    \item Likelihood parameters: scale parameter for the Gaussian likelihood.
    \item Kernel parameters: length scales and signal variance parameters in case of the squared exponential kernel. 
\end{itemize}

\paragraph{NeuralODE model.} We use \texttt{tanh} activation and a fully connected block with one hidden layer having $32$ units in Van der Pol/ Fitz-Hugh Nagumo experiments. In MoCap experiments, we try one/two hidden layers with $64$/$128$ hidden units, and report the best results. All the network parameters were optimized against \texttt{MSE} loss.

\paragraph{Bayesian NeuralODE model.} We utilized the codebase\footnote{\url{https://github.com/RajDandekar/MSML21_BayesianNODE}} provided by \citet{dandekar2020bayesian} for training the Bayesian version of NeuralODEs. We used networks with one hidden layer and $32$ units in the VDP/FHN experiments and performed posterior sampling with HMC. In the experiments with long sequences (shooting illustration on VDP and MoCap), HMC sampling had convergence issues, hence we performed variational inference instead. In the MoCap experiments, we tried networks with two hidden layers and $64$/$128$ hidden units, and performed mean-field variational inference.

\subsection{Additional details on the inducing variables}
\paragraph{`Whitening' the inducing variables.} While performing sparse inference for GPs using inducing variables, it is common practice to use a non-centered parameterization $\mathbf{U} = \mathbf{L_\theta} \tilde{\mathbf{U}}$ (equivalently, $\tilde{\mathbf{U}} = \mathbf{L_\theta}^{-1} \mathbf{U}$), where $\mathbf{L_\theta} \mathbf{L_\theta}^T =\mathbf{K_\theta}(\mathbf{Z},\mathbf{Z})$ \citep{hensman2015mcmc}. Such a reparametrization turns the inference for $\mathbf{U}$ with prior $\mathcal{N}(\mathbf{0}, \mathbf{K_{ZZ}})$ into inference for $\tilde{\mathbf{U}}$ with isotropic Gaussian prior $\mathcal{N}(\mathbf{0}, \textbf{\textrm{I}})$. This generally improves the optimization performance by decorrelating the latent parameters from each other.

\paragraph{Initializing inducing variables using data gradients.} In case of sparse Gaussian process model with inducing variables, we initialize the vector field with empirical gradients from the observed data. We first initialize inducing locations $\Z$ as \texttt{kmeans} cluster centers of observations $\Y$. Next we compute empirical gradient estimates, $\dot{\Y} = (\y_2-\y_1,\y_3-\y_2, \ldots, \y_{N} - \y_{N-1})$ at locations $\tilde{\Y} = (\y_1,\y_2, \ldots, \y_{N-1})$ and initialize inducing values $\U$ as the GP mean interpolation of empirical gradients at inducing locations. 
\begin{align}
    \U &=  \frac{1}{\Delta t} \, K(\Z, \tilde{\Y})K(\tilde{\Y}, \tilde{\Y})^{-1} \dot{\Y},
\end{align}
where $\Delta t$ is the time difference between two consecutive observations in the dataset.  

\subsection{Additional details on the CMU MoCap experiment}
\paragraph{Details on the dataset.} The dataset used in this experiment was obtained from \url{http://mocap.cs.cmu.edu/}. The database consists of sensor recordings of multiple activities for different subjects in \texttt{.amc} files. We selected the three subjects with the largest number of walking or running sequences: subjects \texttt{09}, \texttt{35}, and \texttt{39}. The \texttt{.amc} files considered for train, validation and test purposes are given in Table~\ref{table:supp_mocap_splits}. The training sequences and their lengths were selected to include at least one full cycle of the dynamics while learning the model. The observation sequence lengths for training/test/validation splits are reported in Table~\ref{table:supp_mocap_data}.

\paragraph{Details on the PCA} In the CMU MoCap experiment, we project the data from $D$ dimensional observation-space to $K<D$ dimensional latent-space using eigenvectors corresponding to top-$K$ eigenvalues. The ODE model is then learnt in the latent-space and model predictions are projected back into the observation-space using $K$ eigenvectors. We refer to this as `inverting the PCA' in the main text. 


\begin{table}
\caption{For each subject (a), we report the activity considered for the experiment (b), the data split train/validation/test (c), the number of sequences considered for the corresponding split (d), and the files used in the corresponding split (e).}
\centering
\resizebox{0.8\columnwidth}{!}{
\begin{tabular}{l c c c c}
\toprule
 (a) subject & (b) activity & (c) split & (d) \# sequences & (e) files\\
\midrule
\multirow{ 2}{*}{subject \texttt{09}} & \multirow{ 2}{*}{running} &
train & 6 & \shortstack{\texttt{05.amc}, \texttt{06.amc}, \texttt{07.amc}, \\ \texttt{08.amc}, \texttt{09.amc}, \texttt{11.amc}}\\
 \cmidrule(lr){3-5} 
& & validation & 2 & \texttt{01.amc}, \texttt{02.amc}\\
\cmidrule(lr){3-5} 
& & test & 2 & \texttt{03.amc}, \texttt{04.amc}\\
\cmidrule(lr){1-5} 
\multirow{ 2}{*}{subject \texttt{35}} & \multirow{ 2}{*}{walking} &
train & 16 & \shortstack{\texttt{01.amc}, \texttt{02.amc}, \texttt{03.amc}, \texttt{04.amc}, \\
\texttt{05.amc}, \texttt{06.amc}, \texttt{07.amc}, \texttt{08.amc}, \\
\texttt{09.amc}, \texttt{10.amc}, \texttt{11.amc}, \texttt{12.amc}, \\
\texttt{13.amc}, \texttt{14.amc}, \texttt{15.amc}, \texttt{16.amc} }\\
\cmidrule(lr){3-5} 
& & validation & 3 & \texttt{28.amc}, \texttt{29.amc}, \texttt{30.amc} \\
\cmidrule(lr){3-5} 
& & test & 4 & \texttt{31.amc}, \texttt{32.amc}, \texttt{33.amc}, \texttt{34.amc}\\
\cmidrule(lr){1-5} 
\multirow{ 2}{*}{subject \texttt{39}} & \multirow{ 2}{*}{walking} &
train & 6 & \shortstack{\texttt{01.amc}, \texttt{02.amc}, \texttt{07.amc}, \\
\texttt{08.amc}, \texttt{09.amc}, \texttt{10.amc}}\\
\cmidrule(lr){3-5} 
& & validation & 2 & \texttt{03.amc}, \texttt{04.amc} \\
\cmidrule(lr){3-5} 
& & test & 2 & \texttt{05.amc}, \texttt{06.amc}\\
\bottomrule
\end{tabular}
}
\label{table:supp_mocap_splits}
\end{table}

\begin{table}[H]
\caption{For each subject (a), we report the experiment type (b), the data split train/validation/test (c), and the sequence length, i.e., the number of observations considered for the corresponding split (d).}
\centering
\resizebox{0.5\columnwidth}{!}{
\begin{tabular}{l c c c}
\toprule
 (a) subject & (b) experiment & (c) split & (d) sequence length\\
\midrule
\multirow{ 6}{*}{subject \texttt{09}} & \multirow{ 3}{*}{short} &
train & 50\\
& & validation & 120\\
& & test & 120\\
\cmidrule(lr){2-4}
& \multirow{ 3}{*}{long} &
train & 100\\
& & validation & 120\\
& & test & 120\\
\cmidrule(lr){1-4}
\multirow{ 6}{*}{subject \texttt{35}} & \multirow{ 3}{*}{short} &
train & 50\\
& & validation &300\\
& & test & 300\\
\cmidrule(lr){2-4}
& \multirow{ 3}{*}{long} &
train & 250\\
& & validation & 300\\
& & test & 300\\
\cmidrule(lr){1-4}
\multirow{ 6}{*}{subject \texttt{39}} & \multirow{ 3}{*}{short} &
train & 100\\
& & validation &300\\
& & test & 300\\
\cmidrule(lr){2-4}
& \multirow{ 3}{*}{long} &
train & 250\\
& & validation & 300\\
& & test & 300\\
\bottomrule
\end{tabular}
}
\label{table:supp_mocap_data}
\end{table}

\begin{figure*}[t]
    \centering
    \includegraphics[width=0.8\textwidth]{plots/vdp_uniform.pdf}
    \caption{Learning the 2D Van der Pol dynamics from irregularly sampled observations \textbf{(a)} with alternative methods \textbf{(b--d)}. Column 1 shows the vector fields, while columns 2 and 3 show the state trajectories $x_1(t)$ and $x_2(t)$. GPODE learns the posterior accurately.}
    \label{fig:vdp_illustration_irregular}
\end{figure*}

\begin{figure*}[t]
    \centering
    \begin{subfigure}[b]{0.45\columnwidth}
    \centering
    \includegraphics[width=1.0\textwidth]{plots/convergence_runtime.pdf}
    \caption{Convergence across wall-clock time}
    \end{subfigure}
    \qquad
    \begin{subfigure}[b]{0.45\columnwidth}
    \centering
    \includegraphics[width=1.0\textwidth]{plots/convergence_iters.pdf}
    \caption{Convergence across gradient steps during optimization}
    \end{subfigure}
    \caption{Optimization efficiency with GPODE models.}
\end{figure*}


\bibliography{references}
\end{document}
