%\documentclass{uai2022} % for initial submission
\documentclass[accepted]{uai2022} % after acceptance, for a revised
% version; also before submission to
% see how the non-anonymous paper
% would look like
%% There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2022} % ptmx math instead of Computer
% Modern (has noticable issues)
% \documentclass[mathfont=newtx]{uai2022} % newtx fonts (improves upon
% ptmx; less tested, no support)
% NOTE: Only keep *one* line above as appropriate, as it will be replaced
%       automatically for papers to be published. Do not make any other
%       change above this note for an accepted version.

%% Choose your variant of English; be consistent
\usepackage[american]{babel}
% \usepackage[british]{babel}

%% Some suggested packages, as needed:
\usepackage[round]{natbib} % has a nice set of citation styles and commands
\bibliographystyle{plainnat}
\renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} % amsmath with fixes and additions
% \usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{tikz} % nice language for creating drawings and diagrams

%% Provided macros
% \smaller: Because the class footnote size is essentially LaTeX's \small,
%           redefining \footnotesize, we provide the original \footnotesize
%           using this macro.
%           (Use only sparingly, e.g., in drawings, as it is quite small.)

%% Self-defined macros
\newcommand{\swap}[3][-]{#3#1#2} % just an example

% Additional packages
\usepackage{amsmath}  % core math environments (already loaded via mathtools)
\usepackage{amsfonts}       % blackboard math symbols
\usepackage{booktabs}     % professional-quality tables
\usepackage{color} % to delete
\usepackage{pifont}% http://ctan.org/pkg/pifont
\usepackage[T1]{fontenc} % to delete
\usepackage{arydshln} % \hdashline and \cdashline
\usepackage{subcaption}
%\usepackage[draft]{hyperref}
%  \hypersetup{
%	breaklinks=true,   % splits links across lines
%	colorlinks=true,   % displays links as colored text instead of blocks
%	pdfusetitle=false,  % \title and \author values into pdf metadata
%	% etc.
%	linkcolor={red!50!black},
%	citecolor={blue!50!black},
%	urlcolor={blue!80!black}
%}
\hypersetup{
	colorlinks=false,
	pdfborder={0 0 0},
}

% These are recommended to typeset algorithms but not required. See the subsubsection on algorithms. Remove them if you don't have algorithms in your paper.
\usepackage{algorithm}
\usepackage{algorithmic}
\usepackage{enumitem}

% Additional Commands
\newcommand{\cmark}{\ding{51}}%
\newcommand{\xmark}{\ding{55}}%
\newcommand{\bs}[1]{\boldsymbol{#1}}
\newcommand{\CD}{\mathcal{D}}
\newcommand{\EE}{\mathbb{E}}

\newcommand{\zv}{\boldsymbol{z}}
\newcommand{\thetav}{\boldsymbol{\theta}}
\newcommand{\psiv}{\boldsymbol{\psi}}
\newcommand{\varphiv}{\boldsymbol{\varphi}}
\newcommand{\phiv}{\boldsymbol{\phi}}
\newcommand{\gammav}{\boldsymbol{\gamma}}
\newcommand{\xv}{\boldsymbol{x}}
\newcommand{\uv}{\boldsymbol{u}}
\newcommand{\yv}{\boldsymbol{y}}
\newcommand{\sigmav}{\boldsymbol{\sigma}}
\newcommand{\epsilonv}{\boldsymbol{\epsilon}}
\mathchardef\mhyphen="2D

\newcommand{\bigCI}{\mathrel{\text{\scalebox{1.07}{$\perp\mkern-10mu\perp$}}}}
\newcommand{\rh}[1]{{\color{red}#1}}
\newcommand{\code}[1]{{\color{blue!50!black}#1}}

\colorlet{Mycolor1}{blue!50!black}

\title{Capturing Actionable Dynamics \\ with Structured Latent Ordinary Differential Equations}

% The standard author block has changed for UAI 2022 to provide
% more space for long author lists and allow for complex affiliations
%
% All author information is automatically removed by the class for the
% anonymous submission version of your paper, so you can already add your
% information below.
%
% Add authors
\author[1]{\href{mailto:chapfuwa@stanford.edu?Subject=Structured Latent Ordinary Differential Equations}{Paidamoyo~Chapfuwa}{}}
\author[1]{Sherri~Rose}
\author[2]{Lawrence~Carin}
\author[3]{Edward~Meeds}
\author[4]{Ricardo~Henao}
% Add affiliations after the authors
\affil[1]{%
	Stanford University\\
	USA
}
\affil[2]{%
	KAUST\\
	Saudi Arabia
}
\affil[3]{%
	Microsoft Research\\
	Cambridge\\
	UK
}
\affil[4]{%
	Duke University\\
	USA
}

\begin{document}
	\maketitle
	\begin{abstract}
		End-to-end learning of dynamical systems with black-box models, such as neural ordinary differential equations (ODEs), provides a flexible framework for learning dynamics from data without prescribing a mathematical model for the dynamics. Unfortunately, this flexibility comes at the cost of understanding the dynamical system, for which ODEs are used ubiquitously. Further, experimental data are collected under various conditions (inputs), such as treatments, or grouped in some way, such as into sub-populations. Understanding the effects of these system inputs on system outputs is crucial for any meaningful model of a dynamical system. To that end, we propose a structured latent ODE model that explicitly captures \emph{system input} variations within its latent representation. Building on a static latent variable specification, our model learns (independent) stochastic factors of variation for each input to the system, thus separating the effects of the system inputs in the latent space. This approach provides actionable modeling through the \emph{controlled generation} of time-series data for novel input combinations (or perturbations). Additionally, we propose a flexible approach for quantifying uncertainties, leveraging a quantile regression formulation. Results on challenging biological datasets show consistent improvements over competitive baselines in the controlled generation of observational data and inference of biologically meaningful system inputs.
		%
		% 		End-to-end learning of system dynamics with black-box neural network models, such as neural ODEs, provide a flexible framework for learning dynamics from data without prior knowledge of a prescribed mathematical description of the dynamics.  This comes at the cost of  understanding of the dynamical system for which ODEs are used ubiquitously in science.  Furthermore experimental data are collected under various conditions, such as treatments, or grouped in some way, such as a sub-population age.  Understanding the effects of these system inputs on system outputs is crucial to any meaningful model of a dynamical system.  To that end, we propose a structured latent ODE model that explicitly captures \emph{system input} variations in its latent representation. Following (CCVAE) our model learns (independent) stochastic factors of variation for each input to the system and importantly infers each input using the same input-specific factor, thus disentangling the effects of the system inputs in the latent space.  This approach provides actionable modeling by conditional generation of time-series data for novel input combinations and counterfactual generation (what if we changed an input for this data instance?)  Additionally  we propose a flexible approach for quantifying uncertainties leveraging a quantile regression formulation.
		% 		Experimental results on challenging biological datasets show consistent improvements over competitive baselines in the \emph{controlled} generation of observational data and prediction of biologically meaningful system inputs.
		%\rh{rephrase}
		% 		We leverage the latent representations  
		% 		%\rh{how?} 
		% 		for a mechanistic approach to generating observations governed by nonlinear dynamical systems (\emph{e.g.}, biological, physical, \emph{etc}.) to interpret population, sub-population, and individual trajectories given system inputs.
		% 		%or unseen combinations, \emph{i.e.}, zero-short learning.
		% 		% or predicting unknown system inputs given observational data.
		% 		%\rh{unseen what?}
		% 		Further, we propose a flexible approach for quantifying uncertainties leveraging a quantile regression formulation.
		% 		Experimental results on challenging biological datasets show consistent improvements over competitive baselines in the \emph{controlled} generation of observational data and prediction of biologically meaningful system inputs.
		% \emph{e.g.}, genetic components in synthetic biology.
		%\rh{important how?}
	\end{abstract}
	
	\section{Introduction}
	% Why is it important to disentangle - latent variables for dynamic systems
	% Has that been overlooked?
	% mechanistic models can be leveraged for subsequent
	%use in applications where experiments are either
	%impossible or difficult to achieve, enables extrapolation
	%to predictions about behaviours not present in the
	%original data
	% Simpler implementations or generalize? One dimensional z? 
	% Unseen Perturbations?  X || Y | Z, assume that p(y_2|z, y_1) = p(y_2|z) or extrapolations
	% Generate data from unseen combinations, e.g, Gaussian Kernels 
	% Automated wet labs: Effect of different interventions, hard to do with standard modeling, hard to do with white box model
	% Generate dynamics given individual, group  or population characteristics, specify a representation where they are disentangled consistent with expected dynamics 
	% Base our formulation as a differential equation solver
	% Difficult to disentangle on what is group, population or individual, leveraged recently proposed CCVAE
	% Disentangle? Hierarchical modeling with blackbox models? Multilabel modeling?
	% Improve CCVAE, enforcing consistently of latent codes with class label
	% Flexible approach for capturing uncertainties
	%\section{Dynamical System}
	%\ash{TODO: inputs are \emph{static} $\uv$ instead of $\uv(t)$. We have resolved this inconsistency.}
	Dynamical systems are fundamental models in many scientific domains. Examples include the study of biological processes such as gene regulation \citep{calderhead2009accelerating}, human cardiovascular systems \citep{zenker2007inverse}, epidemiology \citep{siettos2013mathematical}, and synthetic biology \citep{roeder2019efficient}. The evolution of continuous-time dynamical systems is commonly modeled mathematically by ordinary differential equations (ODEs) as
	%
	\begin{align}
		\frac{d\xv}{dt} = f\left(\xv(t), t, \uv(t) \right) \,, \quad   \xv(0) = \xv_{0} \,, \quad t\in[0, T] \,,
	\end{align}
	%
	and are governed by mathematical rules known as \emph{dynamics} $f (\cdot)$, where  $\xv(t) \in \mathbb{R}^D$ is the \emph{state} (snapshot of the process at time $t$) or solution of the ODE system, and $\uv(t)$ are the system \emph{inputs}. Moreover, given a state $\xv_{0}$ as the \emph{initial condition}, the dynamics define a temporal \emph{trajectory} from a starting point at $t=0$. Such systems can be categorized as deterministic {\em vs.} stochastic, or linear {\em vs.} nonlinear. In practice, we are given a set of noisy observations $\yv(t) = m(t, \xv(t))$ at $t=t_0,\ldots,t_T$, where $m (\cdot)$ is the unknown \emph{emission} function, and we typically make assumptions to estimate  functions $\{f(\cdot)$, $\xv(t)$, $m(\cdot)\}$ parametrically or nonparametrically.
	
	%%%%  Paragraph 2: Problems with existing approaches 
	
	Classical \emph{state-space} models, such as the Kalman filter \citep{kalman1960new}, assume a parametric \emph{linear Gaussian state-space model} for the dynamics and emission functions.
	%where $f(\cdot) = A(t) \bs{x}(t) +  \uv(t)$, where $ \uv(t)$ is random Gaussian noise, and $A(t)$ is the \emph{transition matrix}. 
	Because these assumptions are violated in practice and limit model flexibility,  modifications were introduced which can generalize to nonlinear systems \citep{julier1997new,julier2004unscented}.  Recent variants of the \emph{Gaussian state-space model} retain the Markovian structure of hidden Markov models and leverage neural networks for learning nonlinear dynamics and emission functions \citep{krishnan2017structured, fraccaro2017disentangled, miladinovic2019disentangled}.
	%% Next two paragraphs needs refactoring 
	%% - reprocuing Kernel
	%% - autoregressive 
	%Paradigm shift: whereas earlier f was pre-defined/hand-designed according to the domain,
	%here we would like to estimate an f that suits our objective.
	
	While nonlinear systems are flexible, they are difficult to solve  and rarely yield closed-form solutions for $\xv(t)$. Hence, {\em implicit} approximations to numerical integration of system dynamics have been considered, {\em e.g.}, methods that directly solve for $\xv(t)$ for a known $f(\cdot)$, leveraging the adaptive Euler method \citep{runge1895numerische, kutta1901beitrag, alexander1990solving}. Such approaches are computationally imprecise and challenging to scale for complex systems. Several approaches adopt gradient matching using Gaussian processes (GPs) \citep{calderhead2009accelerating, graepel2003solving, rasmussen2003gaussian}, and related approaches based on a reproducing kernel Hilbert space (RKHS) \citep{gonzalez2014reproducing}, primarily to avoid numerical integration. Unfortunately, kernel learning with GPs or RKHS is challenging to scale for large datasets and requires complete observability of $\xv(t)$ \citep{ghosh2021variational}. Alternatively, some methods conveniently presume discrete-time nonlinear dynamical modeling for deterministic and easy-to-evaluate state-space solutions, such as recurrent (or autoregressive) neural networks \citep{valpola2002unsupervised, karl2016deep, yingzhen2018disentangled}, albeit constrained to pre-specified time-horizons.
	
	We further divide methods that learn nonlinear dynamics according to assumptions required for estimating ODE dynamics, where $f(\cdot)$ is modeled as a neural network \citep{chen2018neural}, or more recently, parameterized by a latent variable model \citep{rubanova2019latent}, that leverages amortized variational inference \citep{kingma2013auto, rezende2014stochastic}. While a large body of machine learning approaches assume a known parametric form of the dynamics $f(\cdot)$ \citep{linial2021generative, wan2001unscented, wenk2020odin}, alternative flexible approaches assume that the parametric form of $f(\cdot)$ is unknown  \citep{rubanova2019latent, roeder2019efficient}.  Moreover, several specifications of variational inference for latent variable \emph{state-space} models have been proposed \citep{linial2021generative, rubanova2019latent, karl2016deep, roeder2019efficient, miladinovic2019disentangled, yingzhen2018disentangled, fraccaro2017disentangled}. Of these, only \citet{roeder2019efficient} considers a structured hierarchical latent variable model accounting for both observations and system inputs. So motivated, we adopt a data-driven approach to learn unknown functions $\{f(\cdot), m (\cdot)\}$ parameterized by neural networks. Moreover, we leverage a variational inference approach to learn a structured latent variable model (separating \emph{input-} from \emph{noise-specific} components) given observations $\yv(t)$, as well as \emph{static} system inputs $\uv$, to characterize the unknown dynamics and emission functions.
	
	Closely related to our work are latent variable state-space models focused on features that separate \emph{static from dynamic}  \citep{yingzhen2018disentangled, fraccaro2017disentangled}, \emph{domain-invariant from domain-specific} \citep{miladinovic2019disentangled}, \emph{position from momentum} \citep{yildiz2019ode2vae}, and parameter (system input) estimation \citep{linial2021generative}.
	%restricted to fully observed systems of ODEs. 
	In contrast, our work focuses on synthesizing observational data $\yv(t)$ from dynamical systems given: $(i)$ combinations of previously unseen inputs $\uv$ (also known as \emph{zero-shot} learning), and $(ii)$ a simulated \emph{continuous-time state-space} $\xv(t)$ from an ODE solver. Controlled generation of observations under combinations of \emph{system inputs} is foundational in experimental science for a mechanistic understanding of biological phenomena \citep{roeder2019efficient, yuan2021cellbox}, particularly in scenarios where obtaining experimental data is expensive. Unlike \citet{roeder2019efficient}, we do not impose a hierarchical-latent structure or assume a known Gaussian emission process.  Additionally, our model enables inference of the system inputs $\uv$ given observational data $\yv(t)$, which is not considered by \citet{roeder2019efficient}.
	
	The key contributions of this paper are as follows:
	%
	%to predict behavior in regimes where no data were collected.
	%  A mechanism is a complex entity that generates certain phenomena, after receiving some input. Mechanistic explanations aim at opening the black boxes of mechanisms and at understanding how the different parts work together.
	\begin{itemize}
		[topsep=0pt,itemsep=0pt,parsep=0pt,partopsep=0pt,leftmargin=9pt]
		\item We present a principled statistical framework for integrating structured representation learning from \emph{system inputs} and \emph{observations} with mechanistic models.
		\item We demonstrate that the proposed generative model accurately simulates system outputs (observations) given novel combinations or perturbations of system inputs, \emph{i.e.}, zero-shot learning. 
		\item We formulate a flexible quantile regression approach for quantifying uncertainties in generated observations.
		\item We demonstrate the benefits of integrating a structured latent ODE with a flexible emission function for improved performance over competitive baselines given challenging biological data: 
		% \begin{itemize}
		% \item
		($i$) accurately inferring unknown  \emph{static} system inputs $\uv$ from noisy observations $\yv(t)$,
		%	\item Accurately synthesizing observations $\yv(t)$ from prior latent variable $\zv$ and  novel system input combinations $\uv$, \emph{i.e.}, zero-shot learning.
		% \item
		and ($ii$) improved uncertainty estimates of \emph{observational} noise.
		% \end{itemize}
	\end{itemize}
	%
	% hard, people have done this and this is what I am going to do
	% stochastic vs deterministic
	% Nonlinear dynamical systems
	%Uses of Model:
	%* Prediction (ensemble)
	%* Design/Optim
	%* Control (real time optimisation)
	%* Understanding
	%    * Interpretable (sparse or low-rank)
	%   * Generalize (F= ma); regularising the problem 
	
	%Techniques (Optimisation + Data):
	%* Regression techniques 
	%   * Linear vs. Non-linear (add penalty)
	%   * Kalman filter - linear regression
	%* NN/Deep Learning
	%* Genetic programming
	
	% Challenges:
	%* We don’t know f
	%* Non-linear f
	%  * Linear: 
	%      * dx/dt = A x or dx/dt = \lambda x then x(t) = (e^\lambda * t ) * x(0)
	%     * Solutions super-pose 
	%* High-dimensional: x \in R^{n}
	%* Multi-scale, e.g., the brain
	%   * Structures that matter at every scale and every time scale (cells, organs, DNA)
	%* Chaos
	%* Noise
	%* Latent variables (maybe unmeasured hidden variables)
	
	%%% Paragraphs 3:  What we are going to do and contributions
	% We are giung to do x
	
	\begin{figure}[!t]
		\centering
		\includegraphics[scale=0.74]{Figures/proposed_model}
		\caption{Illustration of the proposed structured latent ODE (SL-ODE) model. Generative: prior $\zv =\{\zv_{ \uv}, \zv_{\epsilonv}\}$ \eqref{eq:prior} is mapped to states $X$ simulated from an ODE solver given \emph{dynamics} $f_{\thetav}$  \eqref{eq:state-time} to generate observations (\emph{system outputs}) $Y$ from the \emph{emission} function $m(\cdot)$. Inference: posterior $q_{\varphiv, \phiv} (\zv|Y,  \uv)$  is decomposed according to $q_{\phiv}(\zv|Y)$ and  $q_{\varphiv} ( \uv| \zv_{\uv})$ \eqref{eq:posterior} where $\uv$ are \emph{system inputs}.
		}
		\label{fig:model}
	\end{figure}
	
	\section{Structured Latent ODE Model (SL-ODE)}
	% Assume dynamics are generated in this way
	% Add figure (Ted's paper)
	% Learn parameters of the generative process
	% Posterior
	% Disentanglement
	% Estimate  uncertainty
	% How to estimate parameters because its hard 
	We propose a mechanistic approach for generating observations governed by nonlinear dynamical systems. Figure~\ref{fig:model} illustrates the proposed approach. Specifically, we leverage an amortized inference framework \citep{kingma2013auto, rezende2014stochastic} to learn a structured latent representation given time-series observational data and static system inputs. Below we present the proposed generative process, including a quantile regression formulation for flexible (asymmetric) uncertainty estimation.
	
	\subsection{Generative Process}
	We assume observations $\CD = \{Y_i,  \uv_i\} _{i=1} ^{N}$, where $Y_i \in \mathbb{R} ^ {K \times T}$ is a matrix of $K$ measurements at $T$ time points and $\uv_i$ are the (auxiliary) \emph{static} inputs (or system conditions) for observation $i=1,\ldots,N$. Dropping the index $i$ for brevity, we propose a generative process that synthesizes $Y$ given $\uv$ as follows
	%
	\begin{align}
		\label{eq:prior}
		\zv_{ \uv} &\sim p_{\psiv} (\zv_{ \uv}| \ \uv) \,, \quad \zv_{\epsilonv} \sim p(\zv_{\epsilonv}) \,, \quad \zv = \{\zv_{ \uv} , \zv_{\epsilonv} \}\\
		\label{eq:dynamics}
		\frac{d\xv}{dt}&= f_{\thetav} (\xv; \zv, t) \\
		\label{eq:state-time}
		X  &= {\rm ODESolve} \left(f_{\thetav}, \bs{x}_0, (t_0, t_1, \ldots, t_T) \right) \\
		\label{eq:observer}
		Y &\sim p\left(Y| m_{\gammav}(X), \sigma, \tau\right) \,,
	\end{align}
	%
	where the functions defining $f_{\thetav}(\cdot)$, $p_{\psiv}(\cdot)$, and $m_{\gammav}(\cdot)$ are specified as neural networks parameterized by $\{\thetav, \psiv, \gammav\}$, respectively. We synthesize $Y$ in \eqref{eq:observer} as governed by black-box dynamics $f_{\thetav} (\cdot)$ in \eqref{eq:dynamics} parameterized by the latent representation $\zv$ (composed of system inputs and process-noise) in \eqref{eq:prior}. Moreover, the ODE solver (ODESolve) in \eqref{eq:state-time} enables recovery of the \emph{state-time} matrix $X$ at $\{t_0,\ldots,t_T\}$ for the corresponding observations $Y$. 
	%\ash{TODO: clarify mapping $\zv \rightarrow \xv_{0}$ in the main text} 
	See Supplementary Material (SM) for detailed formulation of $f_{\thetav} (\cdot)$ and initial state mapping $\zv \rightarrow \xv_{0}$.
	
	\paragraph{Structured Latent-Space Representations}
	To enable controlled generation of system outputs (observations) from novel combinations or perturbations of system inputs, we specify a conditional prior that captures the relationships among heterogeneous system input values. We assign latent variable $\zv$ to be the concatenation of \emph{input-specific} $\zv_{ \uv}$ and \emph{noise-specific} $\zv_{\epsilonv}$ variables, with prior distributions $p_{\psiv}(\zv_{ \uv}| \uv)$ and $p(\zv_{\epsilonv})$, respectively. Moreover, we learn a continuous and smooth representation of the input data in \eqref{eq:prior}. 
	%Without loss of generality
	We conveniently assume a Gaussian distribution:
	%
	\begin{align}
		p_{\psiv} (\zv_{ \uv}| \ \uv)  =  N\left(\bs{\mu}_{\psiv}(\uv), \text{diag} \left(\sigmav ^2_{\psiv} (\uv)\right)\right)\,,
	\end{align}
	%
	where $\bs{\mu}_{\psiv} (\cdot)$ and  $\sigmav ^2_{\psiv} (\cdot)$ are the mean and variance functions of $\uv$, respectively. Further, we assume a standard Gaussian $p(\zv_{\epsilonv}) = N(\bs{0}, \bs{I})$ to model \emph{process noise} affecting the dynamical system $f_{\thetav}(\cdot)$, thus modeling approximations and integration errors.
	Though we assume a Gaussian distribution for convenience, more sophisticated alternative mechanisms for representing $\zv$ can be considered, such as 
	%with a Uniform distribution or 
	normalizing flows \citep{rezende2015variational}.
	
	\paragraph{Black-box Dynamics}
	%% Under which conditions does the solver work as the form of f_theta (differentiable and bounded)
	%% Accuracy vs computational costs
	ODESolve is a solver that simulates the \emph{state-time} matrix $X \in  \mathbb{R} ^{D \times T}$ \eqref{eq:state-time} as the solution to the dynamics \eqref{eq:dynamics} at desired time points  $\{t_0,\ldots,t_{T} \}$ given the initial state $\xv_{0}$. We control the tradeoff between the accuracy of the simulated $X$ and the computational cost with a tolerance hyperparameter. Note that $X$ can be solved at arbitrary time-points, including irregularly sampled observations (see \citet{de2019gru, rubanova2019latent} for details). 
	%To efficiently learn  parameters of  the dynamics $f_{\thetav}$, we follow \citep{chen2018neural}, which efficiently learns gradients through the \emph{adjoint sensitivity method} \citep{pontryagin1987mathematical, stapor2018optimization}. 
	We specify the dynamics $f_{\thetav}(\cdot)$ using a multilayer perceptron (MLP) and, following  \citet{chen2018neural}, we learn the parameters of $f_{\thetav}(\cdot)$ using the \emph{adjoint sensitivity method}. Note that the recently proposed \emph{stochastic} adjoint sensitivity method \citep{li2020scalable} can also be considered for computational efficiency.
	
	\paragraph{Emission Process}
	In practice, observations $Y$ can be either non-negative or have a skewed distribution across a diverse range of applications such as those with biological signals, {\em e.g.}, heart-rate, temperature, blood pressure, \emph{etc}. While non-skewed distributions such as the standard Gaussian are convenient, they are inappropriate for such observations, since they are typically characterized by a symmetric variance.  So motivated, we wish to estimate a flexible (skewed) distribution by synthesizing  observations $Y \sim p(Y| m_{\gammav}(X), \sigma, \tau)$  from an asymmetric Laplace distribution (ALD) \citep{geraci2007quantile}, where  $0 < \tau  < 1$, $\sigma >0$, $-\infty < m_{\gammav}(X) < \infty$, are skew, scale, and location parameters, respectively. The ALD is formulated as:
	%
	\begin{align}
		p_Y( & Y; m_{\gammav}(X), \sigma, \tau)  = \frac{\tau(1-\tau)}{\sigma}  \times \label{eq:ALD}  \\
		&	\exp \Bigg(- \left( \frac{Y-m_{\gammav}(X)}{\sigma}\right) \Bigg.
		\Bigg. \Big[ \tau - I(Y \le m_{\gammav}(X))\Big] \Bigg)\,, \notag
	\end{align}
	%
	where $I \left(\cdot\right)$ is the indicator function.  Note that $m_{\gammav}(\cdot)$ is a transformation that maps the state-time matrix $X$ to observations $Y$, s.t.,  $P\left(Y < m_{\gammav}(X) \right)= \tau$, where $ m_{\gammav}(X)$ is the $\tau$-th quantile of the distribution.  Consequently, learning $\{m_{\gammav}(X) \}_{s=1}^{S}$ that corresponds to a set of $S$ quantiles $\{\tau_s \}_{s=1}^{S}$ provides a flexible approach for asymmetric uncertainty estimation. In our experiments we learn $\sigmav(t) \in \mathbb{R}^K$  and set $\tau = \{0.025, 0.50,  0.975\}$, so $S=3$, thus effectively learning the median and 95\% confidence intervals. However, alternatives such as the interquartile range, for which $\tau = \{0.25, 0.75\}$, are also possible.
	
	\subsection{Learning}
	We aim to maximize the joint marginal log-likelihood:
	%
	\begin{align}\label{eq:mll}
		\max_{\thetav, \psiv, \gammav} & \ \mathbb{E}_{ Y,  \uv \sim \CD }  \log p_{\thetav, \psiv, \gammav}( Y, \uv) =  \notag \\
		& \ 	\max_{\thetav, \psiv, \gammav}  \mathbb{E}_{Y,  \uv \sim \CD } \log \int_{}^{} p_{\thetav, \psiv, \gammav}(Y,  \uv ,\zv) d\zv \,,
	\end{align}
	%
	% w.r.t. neural network parameters $\{\thetav, \psiv, \gammav\}$,
	where we marginalize out the latent variable $\zv$. For high-dimensional datasets and complex generative models such as neural networks, integration over the latent variables in \eqref{eq:mll} is intractable. Therefore, we introduce a variational posterior $q_{\varphiv, \phiv} (\zv|Y,  \uv)$, specified as a neural network with parameters $\{ \varphiv, \phiv\}$, to approximate the true (but intractable) posterior $p(\zv| Y, \uv)$.
	
	\paragraph{Posterior Distribution}
	Several variations for modeling $q_{\varphiv, \phiv} (\zv|Y,  \uv)$  consistent with assumed generative models have been proposed. For instance, \citet{kingma2014semi, siddharth2017learning} assume a latent $\uv$ and the decomposition $q(\zv, \uv |Y) = q(\zv|Y,  \uv) q( \uv| Y)$. However, such assumptions require {\em ad hoc} auxiliary objectives for efficiently learning from $\uv$. Moreover, $q(\zv|Y,  \uv)$ does not capture relationships among the different input values or learn input-specific representations, which is crucial for mechanistic understanding and zero-shot learning. Fortunately, more recently, \citet{joy2020ccvae} formulated a principled inference model that allows capturing input-specific representations by leveraging both Bayes' theorem and conditional independence $Y \bigCI \uv | \zv$ (consistent with our assumed generative graph) via
	%
	\begin{align}
		\label{eq:posterior}
		q_{\varphiv, \phiv} (\zv|Y,  \uv) = \frac{q_{\varphiv} ( \uv| \zv_{\uv}) q_{\phiv}(\zv|Y)}{ q_{\varphiv, \phiv} ( \uv|Y )}\,,
	\end{align}
	%
	where $q_{\phiv}(\zv|Y)$ and $q_{\varphiv} ( \uv| \zv_{\uv})$ are neural networks parameterized by $\phiv$ and $\varphiv$, respectively, and
	\begin{align}
		q_{\varphiv, \phiv} ( \uv|Y ) = \int q_{\varphiv} ( \uv| \zv_{\uv}) q_{\phiv}(\zv|Y) d\zv \,.
	\end{align}
	Moreover, we specify the variational distribution as Gaussian $q_{\phiv}(\zv|Y) = N \left(\bs{\mu}_{\phiv}(Y), \text{diag} \left(\sigmav ^2_{\phiv} (Y)\right)\right)$  and categorical $q_{\varphiv} ( \uv| \zv_{\uv}) =  {\rm Cat} \left(\uv | \pi_{\varphiv} (\zv_{\uv}) \right)$ if $\uv$ is discrete, or Gaussian otherwise. 
	
	\paragraph{Evidence Lower Bound}
	Introducing \eqref{eq:posterior} to approximate the posterior in \eqref{eq:mll} yields a tractable \emph{evidence lower bound} (ELBO) for each observation as derived by \citet{joy2020ccvae}:
	%
	\begin{align}
		\log  & p_{\thetav, \psiv, \gammav}( Y, \uv)  \ge  \log q_{\varphiv, \phiv} ( \uv|Y ) + \log p(\uv)  +  \label{eq:elbo} \\
		& \mathbb{E}_{q_{\phiv}(\zv| Y)}  \Bigg[  \frac{q_{\varphiv} ( \uv| \zv_{\uv}) }{ q_{\varphiv, \phiv} ( \uv|Y )} \Bigg. 
		\Bigg. \log \left(\frac{p_{\thetav, \psiv, \gammav} (Y |\zv) p_{\psiv}(\zv| \uv)}{q_{\varphiv} (\uv| \zv_{ \uv}) q_{\phiv}(\zv|Y)}  \right)\Bigg]  \,, \notag
	\end{align}
	%
	where $\log p(\uv)$ is a constant, $\log q_{\varphiv, \phiv} ( \uv|Y)$ is a classification or regression conditional distribution formulation for $\uv$ discrete or continuous, respectively, and $\frac{q_{\varphiv} (\uv| \zv_{\uv})}{ q_{\varphiv, \phiv} (\uv|Y)}$ are weights for the log-likelihood ratio we seek to maximize. We leverage the simulated \emph{state-time} matrix $X$ \emph{trajectories} from the ODE solver as a means of constraining the mapping $\zv \rightarrow Y$ in $p_{\thetav, \psiv, \gammav} (Y |\zv)$ with learned dynamics $f_{\thetav}(\cdot)$ according to the emission process in \eqref{eq:observer} formulated as an ALD in  \eqref{eq:ALD}. We learn neural network parameters $\{ \thetav, \psiv,  \gammav, \varphiv, \phiv\}$ by maximizing the evidence lower bound (ELBO) in \eqref{eq:elbo} via stochastic gradient descent.
	%(a) De Brouwer, Edward, et al. "GRU-ODE-Bayes: Continuous modeling of sporadically-observed time series."arXiv preprint arXiv:1905.12374 (2019).
	%(b) Yildiz, Cagatay, Markus Heinonen, and Harri L hdesm ki. "ODE2VAE: Deep generative second order ODEswith Bayesian neural networks." (2019).
	%(c) Li, Xuechen, et al. "Scalable gradients for stochastic diff erential equations." International Conference onArtifi cial Intelligence and Statistics. PMLR, 2020.
	
	
	\paragraph{Theoretical Connections}
	Assuming a perfectly disentangled latent space \citep{higgins2018towards}, we propose a generative process that synthesizes observations $Y$ given system-inputs $\uv$, subject to latent variable $\zv = \{\zv_{\uv}, \zv_{\epsilonv}\}$, which is a concatenation of independent sources of variation, \emph{i.e.},  \emph{input-specific} $\zv_{\uv}$ and \emph{noise-specific} $\zv_{\epsilonv}$. However, inferring the independent factors from posterior $q_{\varphiv, \phiv} (\zv|Y,  \uv)$ \eqref{eq:posterior} without supervision is impossible in arbitrary generative models \citep{locatello2019challenging}.  Hence, we leverage the formulation from \citet{joy2020ccvae}, which naturally enables system-input inference $q_{\varphiv, \phiv} ( \uv|Y )$ consistent with our assumed data-generation model (see Figure~\ref{fig:model}), and without requiring additional {\em ad hoc} loss terms.
	
	\begin{table*}[!t]
		\centering
		\caption{Summary of related work. We categorize methods in terms of $(i)$ assumptions  required for estimating $\{f(\cdot), m(\cdot)\}$, the ODE and emission functions, respectively, and $(ii)$ ability to perform tasks essential for the mechanistic understanding of system input effects: inferring of system inputs $\uv$ given observations $Y$ and controlled generation of $Y$ given $\uv$.}
		\label{tb:baselines}
		\resizebox{1\textwidth}{!}{
			\begin{tabular}{lrrrrrr}
				Method & ODE function $f (\cdot)$ &  Emission function $m (\cdot)$ & Predicts $\uv$  & Controlled generation given $\uv$ & Continuous-time & Asymmetric Uncertainty\\
				\toprule
				UKF \citep{wan2001unscented} &required& required & \xmark & \xmark  & \xmark & \xmark\\
				GOKU-net \citep{linial2021generative} & required & learned& \cmark & \xmark  & \cmark & \xmark\\
				Hierarchical-ODE \citep{roeder2019efficient} & learned & required & \xmark &  \cmark & \cmark & \xmark\\
				DMM \citep{krishnan2017structured} & learned & learned & \xmark &  \xmark &  \xmark & \xmark\\
				Latent-ODE \citep{rubanova2019latent} & learned & learned & \xmark &  \xmark  & \cmark & \xmark\\
				\hdashline
				SL-ODE (proposed) & learned & learned& \cmark & \cmark & \cmark & \cmark\\
				\bottomrule
			\end{tabular}
		}
		%	\end{wraptable}
	\end{table*}
	
	\subsection{Inference}
	ODE models are commonly used for observational data imputation, \emph{i.e.}, interpolating or extrapolating tasks \citep{rubanova2019latent, chen2018neural}. For interpolation, ODE models generate an observation conditioned on values from a subset of time points $ T_I \subseteq \{t_0, \dots, t_T \}$ within the full time interval $t \in [0, T]$. Moreover, for extrapolation tasks, the ODE model  generates observations at future  time points $t > T$,  conditioned on values from previous times $t\in [0, T]$. Unlike previous works, here we focus on a deeper understanding of system input effects, namely,  $(i)$ synthesizing observations given latent variable sample $\bs{z}$ from the prior distribution in \eqref{eq:prior}, and $(ii)$  inferring system inputs $\bs{u}$ given observations. Further, we consider the challenging \emph{zero-shot learning} setup for synthesizing data from novel combinations or perturbations of system inputs.
	%% Interpolation
	%% Extrapolation
	%% Zero shot learning
	
	\section{Related Work}
	% Variational inference for dynamical systems
	%\ash{TODO: include discrete-time and continuous-time separation (Table ~\ref{tb:baselines})}
	%\ash{TODO: include additional neural ODE/SDE prior works in our discussion}
	
	\paragraph{Variational Learning} Recent machine learning research in variational inference for latent state-space models has benefited from advances in computational efficiency of integrating mechanistic models with observational data \citep{zenker2007inverse}. For instance, recently proposed neural ODEs \citep{rubanova2019latent} have enabled learning of continuous-time dynamics $f(\cdot)$ at low computational costs. For these latent state-space models, the estimation of model parameters is specified as a maximum-likelihood problem, where the dynamics are set as a constraint \citep{gonzalez2014reproducing}. Most approaches rely on amortized inference \citep{kingma2013auto, rezende2014stochastic} to learn an intractable posterior \citep{linial2021generative, roeder2019efficient, rubanova2019latent}. However, these variational learning methods diverge in two main aspects: $i)$ proposed probabilistic graphical model, and $ii)$ assumptions needed to estimate $\{f(\cdot), m(\cdot)\}$, the dynamics and emission functions, respectively. Unlike existing approaches that assume a Gaussian emission process, the proposed method SL-ODE formulates a flexible quantile regression approach for capturing uncertainties in observational data. See Table~\ref{tb:baselines} for an overview of the various modeling assumptions.
	
	% Latent stete-space and noise
	\paragraph{Structured Latent-Space Representations}
	Structured latent space modeling for nonlinear dynamical systems has been considered in the context of Kalman variational auto-encoders that retain the Markovian structure of hidden Markov models \citep{krishnan2017structured, fraccaro2017disentangled, miladinovic2019disentangled, yingzhen2018disentangled}. Such  latent  state-space models  focus on separating \emph{static from dynamic}  \citep{fraccaro2017disentangled, yingzhen2018disentangled}, \emph{domain-invariant from domain-specific} \citep{miladinovic2019disentangled}, and \emph{position from momentum}  \citep{yildiz2019ode2vae} latent variables. Complementary to these methods, we  do not impose the Markovian structure but instead propose to learn a principled structured variational posterior $q_{\varphiv, \phiv} (\zv|Y,  \uv)$ conditional on both observations $Y$ and system inputs $\uv$, which we decompose according to \eqref{eq:posterior}. Our structured latent-space enables previously overlooked tasks essential for the mechanistic understanding of system input effects on dynamical systems: $(i)$ \emph{controlled} generation of observations given system  inputs,  and $(ii)$ \emph{inference} of system inputs from observations. Variational inference methods rarely account for \emph{system inputs} except for \citet{roeder2019efficient, linial2021generative}. While \citet{roeder2019efficient} enables controlled generation, their formulation does not facilitate system input inference given observations, and though \citet{linial2021generative} enables system input inference, controlled generation is not considered.
	
	\section{Experiments}
	%
	Below we provide details on the baseline methods considered for comparisons, the datasets employed, and the metrics used to evaluate our proposed approach. PyTorch code to replicate all experiments can be found at \code{\url{https://github.com/paidamoyo/structured_latent_ODEs}}. Algorithm~\ref{alg:algorithm} summarizes the SL-ODE training procedure, which is shared across all baseline methods except for the evidence lower bound being optimized. See the SM for comprehensive details of the neural architectures of the baselines and proposed model.
	
	
	\subsection{Baselines}
	%	For fair comparisons, we preserve the assumed baseline data generating graphical models but share the ODE $f(\cdot)$,  emission $m(\cdot)$,  and encoder (maps observations $\yv(t)$ to latent $\zv$)  functions, which we learn across all methods. 
	For fair comparisons, all models use the same neural network architecture to model the ODE $f(\cdot)$,  emission $m(\cdot)$,  and encoder (maps observations $\yv(t)$ to latent $\zv$)  functions. However, we preserve the assumed data generative process for each baseline. Recent state-of-the-art generative models for disentangled representations, \emph{i.e.}, identifying independent factors of variation in data $Y$, leverage amortized inference \citep{locatello2019challenging, kim2018disentangling}. Therefore, we compare to competitive variational ODE-based baselines. We consider the following baselines:
	%
	\begin{itemize}
		[topsep=0pt,itemsep=0pt,parsep=0pt,partopsep=0pt,leftmargin=8pt]
		\item  Latent-ODE: Gaussian latent variable model \citep{rubanova2019latent}.
		\item  GOKU-Net: Gaussian latent variable model accounting for system input inference \citep{linial2021generative}.
		\item Hierarchical-ODE: Hierarchical latent variable model with conditional prior for system inputs \citep{roeder2019efficient}.
	\end{itemize}
	%
	See Table~\ref{tb:baselines} for a summary of the modeling assumptions in the baseline methods. Note that all baseline methods consider a Gaussian emission process, where the \emph{observation} noise $\epsilonv(t)$ is shared across all observations.  In contrast, our work adopts a flexible quantile regression approach formulated as an asymmetric Laplace distribution \eqref{eq:ALD}.
	% The unscented Kalman filter for nonlinear estimation
	% ODIN: ODE-Informed Regression for Parameter and State Inference in Time-Continuous Dynamical Systems
	% Latent ODEs for Irregularly-Sampled Time Series
	% Generative ODE Modeling with Known Unknowns
	%  Efficient Amortised Bayesian Inference for Hierarchical and Nonlinear Dynamical Systems
	% Structured Inference Networks for Nonlinear State Space Models
	
	\subsection{Datasets}
	We perform evaluation on three biological datasets described below: $(i)$ {\sc Cardiovascular System}, $(ii)$ {\sc Synthetic Biology}, and $(iii)$ {\sc Human Viral Challenge}.
	
	\paragraph{Human Viral Challenge} A real-world physiological dataset collected over multiple days from subjects equipped with Empatica E4 wearable wristband devices. On the second day, subjects were inoculated (challenged) with an H3N2 influenza pathogen, causing some to become infected, as clinically determined by viral shedding between 24 and 48 hours after inoculation.
	Moreover, peak symptoms usually occur, on average, 72 hours after inoculation.
	See \citet{she2020adaptive} for additional experimental details. We learn from 35 subjects' noisy time-series observations from four sensors $\yv(t) = [{\rm HR, TEMP, EDA, ACC}]$: heart rate (HR), temperature (TEMP), electrodermal activity (EDA), and accelerometer (ACC). Automated infection detection (\emph{e.g.}, viral shedding) from a healthy baseline, around inoculation time and before shedding, has the potential to improve health awareness and is crucial in implementing effective infection prevention strategies. Hence, we evaluate our model on 5-fold cross-validation (due to small sample size) for subject outcome $\uv = [u_1, u_2]$, where $u_1 \in \{0, 1\}$ and $u_2 \in \{0, 1\}$ indicate symptoms and viral shedding, respectively.
	
	\begin{table*}[!t]	
		\centering
		\caption{Performance comparisons for {\sc Human Viral Challenge} via 5-fold cross-validation. System inputs $\uv$ are binary outcomes indicating viral shedding and symptoms. We report methods without system input inference or controlled prior generation mechanisms as NA (not available).}
		\resizebox{0.85\textwidth}{!}{
			\begin{tabular}{lrrrr}
				Method &  $\uv$ Accuracy (\%)  $\uparrow$& $L_1$ error (posterior, prior)  $\downarrow$  & ELBO $\uparrow$\\
				\toprule
				%	 &&{\bf Dataset:  {\sc Cardiovascular System} }&\\
				Latent-ODE & NA & (108.08, NA)&-362.48 \\
				GOKU-Net & 0.66 & (91.97, NA) &  -477.87\\
				Hierarchical-ODE  & NA & (260.78, 347.97) & -426.43\\
				\hdashline
				SL-ODE-Gaussian (ablation) &  0.63 & (88.86, 110.71)& -355.89 \\
				SL-ODE (proposed) & \textbf{ 0.67}& (\textbf{39.73}, \textbf{40.3})& \textbf{-327.73}\\
				\bottomrule
			\end{tabular}	
		}
		\label{tb:hvc_results}
		% L1 error generation from prior
	\end{table*}
	
	\paragraph{Cardiovascular System}
	In a clinical setting, identification of system inputs $\uv$ and states $\xv(t)$ given noisy patient-specific clinical observations $\yv(t)$ has the potential to improve differential diagnosis and predict responses to therapeutic interventions. As a result, several models for the cardiovascular system have been adapted in critical care environments, including a simplified cardiovascular system ODE model \citep{zenker2007inverse}, also recently considered in \citet{linial2021generative}. 
	%Following \citet{linial2021generative}, we generate a synthetic dataset where we formulate the ODE dynamics as: 
	%%
	%\begin{align*}
	%\frac{dSV(t)}{dt} = I_{\rm external}\\
	%\frac{dP_a(t)}{dt} = \frac{1}{C_a}\left(\frac{P_a(t) - P_v(t)}{R_{\rm TPR}(S)} - SV \cdot f_{\rm HR}(S)\right)\\
	%\frac{dP_v(t)}{dt} = \frac{1}{C_v} \left( -C_a \frac{dP_a(t)}{dt} + I_{\rm external} \right) \\
	%\frac{dS(t)}{dt} = \frac{1}{\tau_{\rm Baro}} \left( 1- \frac{1}{1 + e^{-k_{\rm width} \left(P_a(t) - P_{a_{\rm set}} \right)}} -S \right) \,,
	%\end{align*}
	%%
	%where
	%%
	%\begin{align*}
	%%f_{\rm HR} &(S) = S(t) \left(f_{\rm HR_{\rm Max}} - f_{\rm HR_{\rm Min}} \right) + f_{\rm HR_{\rm Min}}\\
	%R&_{\rm TPR}(S) =\\
	%&S(t) \left( R_{\rm TPR_{\rm Max}} - R_{\rm TPR_{\rm Min}}\right) + R_{\rm TPR_{\rm Min}} + R_{\rm TPR_{\rm Mod}} \\
	%f&_{\rm HR} (S) = S(t) \left(f_{\rm HR_{\rm Max}} - f_{\rm HR_{\rm Min}} \right) + f_{\rm HR_{\rm Min}}
	%\end{align*}
	%
	Following \citet{linial2021generative}, we generate ODE states $\xv(t)= ( SV(t), P_a(t), P_v(t), S(t) )$ representing cardiac stroke volume (amount of blood ejected by the heart), arterial blood pressure, venous blood pressure, and autonomic baroreflex tone (reflex responsible for adapting to perturbations in blood pressure and keeping homeostasis), respectively. We observe noisy sequences $\yv(t) = ( P_a(t), P_v(t), f_{\rm HR} (t) ) + \epsilonv(t)$,  where $f_{\rm HR}(t)$ is the patient's heart rate, and  $\epsilonv(t)$ is the \emph{observation} noise.
	%Note that $\{C_a, C_v,  \tau_{\rm Baro}, -k_{\rm width} , P_{a_{\rm set}}, R_{\rm TPR_{\rm Max}}, R_{\rm TPR_{\rm Min}}, \\ f_{\rm HR_{\rm Min}}, f_{\rm HR_{\rm Max}} \}$ are constants shared across all datasets.
	
	We wish to infer system inputs $\uv = \left( I_{\rm external}, R_{\rm TPR_{\rm Mod}}\right)$ from 1000 time-series observations $\yv(t)$, where $ I_{\rm external}  < 0$ implies a patient is losing blood, while $R_{\rm TPR_{\rm Mod}} < 0$ implies septic shock (\emph{i.e.}, total peripheral resistance is getting low), resulting in four interpretable conditions:
	\begin{itemize}
		[topsep=0pt,itemsep=0pt,parsep=0pt,partopsep=0pt,leftmargin=8pt]
		\item Healthy (both non-negative).
		\item Hemorrhagic shock ($I_{\rm external}  < 0,R_{\rm TPR_{\rm Mod}} \ge 0 $).
		\item  Distributive shock ($I_{\rm external}  \ge 0, R_{\rm TPR_{\rm Mod}} < 0$).
		\item  Combined shock ($I_{\rm external}  < 0, R_{\rm TPR_{\rm Mod}} < 0$).
	\end{itemize}
	
	\paragraph{Synthetic Biology}
	The {\sc Synthetic Biology}  case study is derived from a laboratory experimental dataset. Measurements are collected to model the dynamic behavior of genetically engineered devices in bacterial cell cultures with different combinations of shared genetic components.  Characterization of cell culture  response in genetic components given \emph{experimental conditions}  (or \emph{treatments})  to generate desired responses for diagnostic, therapeutic, biotechnology applications,  \emph{etc.}, is time-intensive and unreliable \citep{nielsen2016genetic}.  Therefore, we wish to learn a structured latent representation of the system inputs and observations to characterize novel devices consisting of combinations from select genetic components, \emph{i.e.}, zero-shot learning, across different treatments. Below we summarize the dataset; see \citet{roeder2019efficient} for a detailed description including ODE dynamics. The system inputs $\uv = [\bs{g},  \bs{c}]$ consist of two variables:
	\begin{itemize}
		[topsep=0pt,itemsep=0pt,parsep=0pt,partopsep=0pt,leftmargin=8pt]
		\item A multi-hot vector representing different combinations of genetic components making up six genetic devices  $\bs{g} \in \{{\rm Pcat \mhyphen Pcat}, RS100 \mhyphen S32, RS100 \mhyphen S34,  R33 \mhyphen S32, R33  \\ \mhyphen S175,  R33 \mhyphen S34\}$.
		\item Different concentrations of  chemicals (or treatments) $ \bs{c} = \{C_6, C_{12}\}$.
	\end{itemize}
	Given the system inputs, we observe $312$ noisy time-series observations captured from four optical devices $\yv(t) = [\rm {OD, RFP, YFP, CFP}]$: optical density (OD), red fluorescent protein (RFP), yellow fluorescent protein (YFP), and cyan fluorescent protein (CFP). We evaluate our model on two tasks: $(i)$ 4-fold cross-validation (due to small sample size) for \emph{multiple device inference}, and $(ii)$ held-out (novel) device inference (\emph{i.e., zero-shot learning}), which we evaluate on observations from $\bs{g} = R33 \mhyphen S34$ and $\bs{g} = R33 \mhyphen S32$.
	
	\begin{table*}[!thb]
		\centering
		\caption{Performance comparisons for {\sc Synthetic Biology} data via $4$-fold cross-validation \emph{multiple device} inference task. System inputs $\uv = [\bs{g}, \bs{c}]$, where $\bs{g}$ are categorical device genetic components and $\bs{c}$ are continuous treatment values. We report methods without system input inference or controlled prior generation mechanisms as NA.}
		\resizebox{0.85\textwidth}{!}{
			\begin{tabular}{lrrrr}
				Method &  $\bs{g}$ Accuracy (\%)  $\uparrow$& $\bs{c}$ MSE  $\downarrow$& $L_1$ error  (post, prior) $\downarrow$  & ELBO $\uparrow$\\
				\toprule
				Latent-ODE & NA &  NA & (17.47, NA)& 880.83\\
				GOKU-Net & 90.71 & 1.34 & (5.08, NA)& 1411.61\\
				Hierarchical-ODE  & NA & NA  & (18.25, 18.17)& 896.07\\
				%	K-Means & & NA \\
				\hdashline
				SL-ODE-Gaussian (ablation) & 91.07 & \textbf{0.87} & (5.58,  14.21) & 1296.11\\
				SL-ODE (proposed) & \textbf{92.95}& 0.98 &  (\textbf{4.95}, \textbf{12.87})& \textbf{1830.89}\\
				\bottomrule
			\end{tabular}	
		}
		\label{tb:sb_results}
		% L1 error generation from prior
	\end{table*}
	
	% Synthetic biology: recover observer process with linear transformation of the states?
	% Challenge dataset?
	% CHIL: Challenge dataset
	% Systems of ODEs with some noise, Max Welling paper 
	% Simple systems of ODEs, synthetic examples that fail
	
	% Extrapolation (outside the range of the observed data)
	% Automated statistician paper - Zubin 
	% Zero shot learning 
	%% when does zero short work? Toggling on/off
	%% Does modeling the quantiles improve disentanglement?
	
	% Baseline: No separation in dimensions
	%Predict dynamic behavior of genetically engineered devices in bacteria
	% Groups (devices): different combinations of S shared components
	%Treatments: experiment conditions (different concentrations of chemicals {C6,C12})
	% Observations: cameras capture OD, RFP, YFP, CFP over time
	
	%	\paragraph{Held-out Device Inference}
	\begin{algorithm}[!t]
		\caption{SL-ODE: Structured Latent ODE Model.}
		\label{alg:algorithm}
		\textbf{Input}: ODE solver, Hyper-parameters\\
		\textbf{Parameter}: Initialize parameters $\{ \thetav, \psiv,  \gammav, \varphiv, \phiv\}$\\
		\textbf{Output}: Maximize ELBO
		\begin{algorithmic}[1] %[1] enables line numbers
			\STATE $ \zv \sim q_{\phiv}(\zv|Y)$ specified as Encoder ($\yv(t); \phiv$)\\
			\STATE $\xv_0 = {\rm InitState} (\zv; \thetav) $\\
			\STATE 	Simulate %$X  = {\rm ODESolve} \left(f_{\thetav}, \bs{x}_0, (t_0, t_1, ..,t_T) \right)$, s.t. $\frac{d\xv}{dt} = f_{\thetav} (\xv; \zv, t)$\\
			\begin{align*}
				X  & = {\rm ODESolve} \left(f_{\thetav}, \bs{x}_0, (t_0, t_1, \dots, t_T) \right) \\
				{\rm s.t.} \ \ \frac{d\xv}{dt} & = f_{\thetav} (\xv; \zv, t) .
			\end{align*}
			\STATE Reconstruct $Y \sim p\left(Y| m_{\gammav}(X), \sigma, \tau\right)$\\
			\STATE Compute ELBO
			\begin{align*}
				&\mathbb{E}_{q_{\phiv}(\zv| Y)}  \Bigg[  \frac{q_{\varphiv} ( \uv| \zv_{\uv}) }{ q_{\varphiv, \phiv} ( \uv|Y )} \Bigg. 
				\Bigg. \log \left(\frac{p_{\thetav, \psiv, \gammav} (Y |\zv) p_{\psiv}(\zv| \uv)}{q_{\varphiv} (\uv| \zv_{ \uv}) q_{\phiv}(\zv|Y)}  \right)\Bigg] \\
				& + \log q_{\varphiv, \phiv} ( \uv|Y ) + \log p(\uv)
			\end{align*}
			\STATE Backpropagate and update $\{ \thetav, \psiv,  \gammav, \varphiv, \phiv\}$
			\STATE \textbf{return} solution
		\end{algorithmic}
	\end{algorithm}
	
	\subsection{Quantitative Analysis}
	Experimental results in Tables \ref{tb:hvc_results}, \ref{tb:sb_results}, and \ref{tb:cs_results} (in SM) demonstrate that the proposed SL-ODE consistently outperforms baseline methods across all evaluation metrics and datasets.
	We evaluate SL-ODE and compare to baseline methods on the following metrics:
	\begin{itemize}
		[topsep=0pt,itemsep=0pt,parsep=0pt,partopsep=0pt,leftmargin=8pt]
		\item System input  inference $\uv$ given observational data $\yv(t)$. We report accuracy and mean squared error (MSE) for categorical and continuous system inputs, respectively.
		\item We compare averaged system input-specific $L_1$ error from posterior or prior predictive distributions against ground truth observations. For the prior distribution, we evaluate  methods capable of \emph{controlled} generation given system inputs $\uv$.
		\item  Estimated evidence lower bound for model fit evaluation.
	\end{itemize}
	
	\paragraph{Evidence Lower Bound (ELBO)}
	As expected, the latent-ODE model has the worst ELBO, since it is the only model that does not account for system inputs when modeling the posterior or prior distributions. Therefore, the model capacity is limited to a simple Gaussian posterior distribution. In contrast, our structured modeling approach has significant benefits over baseline methods in model fit (or ELBO), due to its system input inference \eqref{eq:posterior} and structured conditional prior \eqref{eq:prior}. Though Hierarchical-ODE assumes a conditional prior, it does not consider a system input inference mechanism. Moreover, while GOKU-Net considers a system input inference mechanism, it is constrained by its Gaussian prior.
	
	\paragraph{Posterior and prior predictive distributions $L_1$ error} This metric is formulated as the absolute difference between \emph{input-specific} predictions and ground truth, averaged across observations and system inputs. Our model achieves the lowest posterior and prior distributions $L_1$ error across all datasets. However, we noticed a drop in performance between the {\sc  Synthetic Biology} posterior and prior errors. We attribute the performance decline to the challenge associated with accounting for complex system inputs, \emph{i.e.}, heterogeneous (mixture of categorical and continuous) variables. Note that we do not report the prior $L_1$ error on GOKU-Net and Latent-ODE since these models do not consider controlled generation given system inputs.
	
	\begin{figure}[!t]
		\centering
		(a) SL-ODE
		\includegraphics[width=\columnwidth]{Figures/mechanistic_prior_samples_cr}
		(b) Hierarchical-ODE
		\includegraphics[width=\columnwidth]{Figures/hierarchical_prior_samples_cr}
		\caption{Ground truth (black) \emph{vs.} \emph{controlled} generated observations (colored) given system inputs $\uv$ according to assumed prior for (a) proposed SL-ODE and (b) Hierarchical-ODE models on {\sc Cardiovascular System} data. We average synthesized observational data $\yv(t)$ across all class-specific time series and report the estimated median with 95\% confidence interval (CI).
			% \rh{95\% intervals? thought you were estimating IQRs? please clarify}
		}
		\label{fig:cv-prior}
	\end{figure}
	
	\begin{figure}[!thb]
		\centering
		\subfloat[\centering SL-ODE]{
			{\includegraphics[width=0.48\columnwidth]{Figures/mech_sb_post_95_cr} }
		}%
		%	\qquad
		\subfloat[\centering GOKU-Net]{
			{\includegraphics[width=0.48\columnwidth]{Figures/goku_sb_post_cr} }
		}%
		\caption{Posterior predictive distribution on {\sc Synthetic Biology} data via $4$-fold cross-validation \emph{multiple device} inference task for (a) proposed SL-ODE and (b) GOKU-Net models. For clarity, we plot ground truth (dotted) time-series against median predictions (solid) across three $\bs{c}=[C_{6}, C_{12}]$ treatments (minimum, median, and maximum), \emph{e.g.}, when  $C_6$= minimum, output is averaged across all $C_{12}$. Shaded areas indicate the predicted 95\% CI.}
		\label{fig:mutiple-device}
	\end{figure}
	
	\paragraph{System input  inference}
	We report a competitive advantage over GOKU-Net in  {\sc Synthetic Biology}  and {\sc Human Viral Challenge} system input inference, owing to our structured conditional prior representation \eqref{eq:prior}, which is not considered in GOKU-Net. Note that we do not report results on Hierarchical-ODE and Latent-ODE methods, which do not consider system input inference.
	
	\begin{figure}[!thb]
		\centering
		\subfloat[\centering  SL-ODE: Posterior]{
			{\includegraphics[width=0.48\columnwidth]{Figures/mech_post_heldout_cr} }
		}%
		%	\qquad
		\subfloat[\centering SL-ODE: Prior]{
			{\includegraphics[width=0.48\columnwidth]{Figures/mech_prior_heldout_cr} }
		}%
		\caption{SL-ODE {\sc Synthetic Biology} \emph{held-out device} ($\bs{g}=R33\mhyphen S34$) task. Ground truth \emph{vs.}  (a) posterior predictive distribution and (b) \emph{controlled} generated observations given system inputs $\uv = [\bs{g}, \bs{c}]$ according to assumed prior distribution \eqref{eq:prior}.  We plot the median (circles) with 95\% CI against ground truth observations (crosses) averaged (200 $\zv$ samples) across all observations at the final time-point sweeping all $ \bs{c} =[C_{6}, C_{12}]$ treatments.}
		\label{fig:heldout-device}
	\end{figure}
	
	
	\subsection{Qualitative Analysis}
	%	\ash{I updated the plot function to match vi-hds. The results are worse than in the vi-hds paper because these are based on prior samples instead of the posterior, which is more challenging. I have verified the y-values with the individual device plots for R33-S34 they seem correct. Let me know if I am missing something. Thanks!}
	%	\textcolor{red}{I don't think the results are correct in Figure \ref{fig:heldoutdevice}.  Or at least from what I can see the results are poor and much worse than in the vi-hds paper.  Ash, can you verify that it corresponds to the same experiment in vi-hds?  Can we use the same plotting functions (if not, we need to include much higher resolution figures)?  I also don't understand the y-values.  Are these different from what vi-hds uses?}
	We further compare against the best-performing baseline methods Hierarchical-ODE and GOKU-Net in Figures~\ref{fig:cv-prior} and \ref{fig:mutiple-device}, respectively. Figure~\ref{fig:cv-prior} demonstrates that the controlled generated samples from the assumed prior distribution of SL-ODE match the ground truth class-specific time-series better than samples from  Hierarchical-ODE on the {\sc Cardiovascular System} dataset. Moreover, the estimated 95\% CI of SL-ODE exhibits low-variance predictions. Similarly, in  Figure~\ref{fig:mutiple-device} we present low-variance predictions at earlier times than GOKU-Net  on the {\sc Synthetic Biology} dataset \emph{multiple device} task per reported 95\% CI posterior predictive distributions. See Figure~\ref{fig:mutiple-device_SM} (in SM) for complete \emph{multiple device} inference results across all methods. We observe a similar trend on the {\sc Human Viral Challenge} dataset (see Figures~\ref{fig:hvc_post_00}--\ref{fig:hvc_post_11} in SM), albeit capturing imperfect dynamics limited by the ODE class. This demonstrates that our quantile regression emission formulation \eqref{eq:ALD} has a competitive advantage for capturing flexible and asymmetric uncertainties over the typical choice of a standard Gaussian emission process. Additionally, the ablation study illustrates that the proposed SL-ODE with an \emph{asymmetric} Laplace likelihood \eqref{eq:ALD} has a quantitative competitive advantage over the alternative (SL-ODE-Gaussian) with Gaussian likelihood.
	%\ash{TODO: Additionally, the ablation study in SM illustrates that the proposed SL-ODE-Laplace has a quantitative competitive advantage over the alternative SL-ODE-Gaussian.}
	
	%\ash{We argue that strong inductive bias in the form of system-input prediction makes it possible for SL-ODE to learn independent (stochastic) factors of variation for each input to the system, thus separating the effects of the system inputs in the latent space.  Since Hierarchical-ODE models latent system-input factors without supervision, it is impossible to learn disentangled representations \citep{locatello2019challenging}, hence the  poor conditional generation. }
	% \rh{need to explain why you are only presenting visual results for Hierarchical-ODE and GOKU-Net in Figure 2 and 3, respectively. Also, you need to clarify the 95\% CI issue.}
	
	Finally, Figure~\ref{fig:heldout-device} shows posterior and prior predictive summaries on the challenging {\sc Synthetic Biology} \emph{held-out device} task (so-called zero-shot learning) across all treatment values. Interestingly, except for mid $C_{12}$ treatments from YFP, SL-ODE closely matches ground truth observations for the posterior and prior predictive distributions. Accurately synthesizing data under novel input combinations is crucial for experimental science, where obtaining data is typically expensive and time consuming. We anticipate performance gains with additional training data from an S34 device component known to bind to $C_{12}$ \citep{roeder2019efficient}. See Figure~\ref{fig:heldout-device_SM} (in SM) for additional zero-shot learning results from held-out device $\bs{g} = R33 \mhyphen S32$.
	
	%Moreover, the ability to synthesize data under novel combinations (or perturbations) of system input is crucial for experimental science where obtaining experimental data is expensive.
	%This As expected, we report high variance predictions from prior samples.
	
	
	
	
	\section{Conclusions}
	We have presented a principled statistical framework for integrating mechanistic models with amortized inference. We applied this framework to a constrained maximum-likelihood estimation of time-series observational data and static system inputs. Moreover, we demonstrated the benefits of capturing \emph{system input-specific} variations in the latent space for a deeper understanding of system input effects on dynamical systems. Further, the proposed inference method does not assume known ODE dynamics or emission functions. Unlike prior works that presume a Gaussian emission process, we quantify \emph{observation noise} with quantile regression for flexible (skewed) uncertainty estimation. We presented results on three challenging biological datasets, characterizing human physiological event states, cardiovascular systems, and genetically engineered devices in synthetic biology. We demonstrated significant performance gains over competitive baselines in uncertainty estimation and mechanistic understanding tasks: \emph{controlled} generation of observational data given novel system input combinations, and \emph{inference} of biologically meaningful inputs from observational data. In the future, we plan to extend our structured representation formulation to account for time-varying system inputs, frequently encountered in several dynamical systems, such as gene regulation \citep{calderhead2009accelerating}. Finally, current research aims to account for irregularly sampled observations \citep{de2019gru, rubanova2019latent}; these approaches may also augment the scope of the proposed structured latent ODE model.
	%	Additionally, augmenting the SL-ODE encoder to account for irregularly sampled data by employing techniques in \citet{de2019gru, rubanova2019latent} would be an interesting future direction.
	% Future work, time-varying system inputs?
	
	%	\begin{contributions} % will be removed in pdf for initial submission,
	%		% so you can already fill it to test with the
	%		% ‘accepted’ class option
	%		Briefly list author contributions.
	%		This is a nice way of making clear who did what and to give proper credit.
	%	\end{contributions}
	
	\begin{acknowledgements} % will be removed in pdf for initial submission,
		% so you can already fill it to test with the
		% ‘accepted’ class option
		The authors would like to thank the anonymous reviewers for their insightful comments.
		This research was supported by NIH/NINDS 1R61NS120246, NIH/NIDDK R01-DK123062, and ONR N00014-18-1-2871-P00002-3.
		%		\ash{TODO: Add Funding or organizations}
	\end{acknowledgements}
	
	\bibliography{chapfuwa_194}
	
	\clearpage
	\appendix
	
	\section{Additional Results}
	Figure~\ref{fig:mutiple-device_SM} and Figures~\ref{fig:hvc_post_00}--\ref{fig:hvc_post_11} provide all qualitative visualizations of the posterior predictive distributions across all methods on {\sc Synthetic Biology} and {\sc Human Viral Challenge} datasets. Note that for fair comparisons, Hierarchical-ODE preserves the data generating graphical model of \citet{roeder2019efficient} but deviates in dynamics and emission functions, resulting in significantly worse performance than reported in  \citet{roeder2019efficient}. Additionally, we present results from held-out device posterior predictive distribution and controlled generated observations from novel device $\bs{g}=R33\mhyphen S32$ in Figure~\ref{fig:heldout-device_SM}. See Table~\ref{tb:cs_results} for  {\sc Cardiovascular System} quantitative results.
	
	\section{Experimental Setup}
	Below we provide details of the neural-network architectures, selected hyper-parameters and pseudo-code for the proposed SL-ODE algorithm.
	
	\begin{table*}[!htb]
		\centering
		\caption{Summary of data-specific hyper-parameters.}
		%\resizebox{1\columnwidth}{!}{
		\begin{tabular}{lrrr}
			Hyper-parameter & {\sc Synthetic Biology}  & {\sc Cardiovascular System} & {\sc Human Viral Challenge}\\
			\toprule
			Mini-batch size  & 36 & 128 &  28 \\
			Learning rate & $3 \times 10 ^{-4}$ & $1 \times 10 ^{-3}$ & $1 \times 10 ^{-3}$ \\
			States dimension ($D$) & 8 & 5 & 5 \\
			\bottomrule
		\end{tabular}
		%}
		%	\end{wraptable}
		\label{tb:hyper}
	\end{table*}
	
	\subsection{Neural-Network Architectures}
	In all experiments, SL-ODE (proposed), GOKU-Net, Latent-ODE, and Hierarchical-ODE share the ODE $f(\cdot)$,  emission $m(\cdot)$, and encoder (maps observations $\yv(t)$ to latent $\zv$)  functions, detailed below. In general, we specify two-layer multilayer perceptrons (MLPs) with 25 hidden units and Rectified Linear Unit (ReLU) as activation functions. Additionally, we implement 2-layer MLPs  for  the \emph{system input-specific} distributions:
	\begin{itemize}
		[topsep=0pt,itemsep=0pt,parsep=0pt,partopsep=0pt,leftmargin=8pt]
		\item  Prior distribution $p_{\psiv}(\zv_{ \uv}| \uv)$ used in SL-ODE and Hierarchical-ODE.
		\item Variational distribution $q_{\varphiv} ( \uv| \zv_{\uv})$ used in SL-ODE and GOKU-Net.
	\end{itemize}
	
	\begin{table*}[!thb]	
		\centering
		\caption{Performance comparisons for {\sc Cardiovascular System} on test data. System inputs $\uv$ are interpretable patient states. We report methods without system input inference or controlled prior generation mechanisms as NA.}
		\resizebox{0.85\textwidth}{!}{
			\begin{tabular}{lrrr}
				Method &  $\uv$ Accuracy (\%)  $\uparrow$& $L_1$ error (posterior, prior)  $\downarrow$  & ELBO $\uparrow$\\
				\toprule
				%	 &&{\bf Dataset:  {\sc Cardiovascular System} }&\\
				Latent-ODE & NA & (6.95, NA)& 9.12\\
				GOKU-Net & \textbf{100}& (5.06, NA) & 324.81\\
				Hierarchical-ODE  & NA & (4.25, 4.42) & 374.94\\
				\hdashline
				SL-ODE-Gaussian (ablation) & \textbf{100} & (0.66,  0.67)&  561.29 \\
				SL-ODE (proposed) & \textbf{100}& (\textbf{0.56}, \textbf{0.57})& \textbf{752.23}\\
				\bottomrule
			\end{tabular}	
		}
		\label{tb:cs_results}
		% L1 error generation from prior
	\end{table*}
	
	\paragraph{Encoder} 
	Following \citet{roeder2019efficient}, we apply a 1D CNN to observations $\yv(t)$ $\rightarrow$ average pooling $\rightarrow$ two-layer MLPs $\rightarrow$ latent variable $\zv$ described with mean $\bs{\mu}$ and variance $\text{diag}(\sigmav^2)$. Note that the Hierarchical-ODE model has an additional 2-layer MLP mapping system inputs to an input-specific latent variable.
	
	\paragraph{Black-box Dynamics}
	We leverage the \emph{adjoint solver} \citep{chen2018neural} to simulate the state-time matrix $X$ where the dynamics $f_{\thetav}(\cdot)$ are 2-layer MLPs with \emph{Sigmoid} output-layer activations. Following \citet{roeder2019efficient}, we specify dynamics as
	\begin{align*}
		\frac{d\xv}{dt} &= f_1(\xv, \zv, t; \thetav) - \xv \odot f_2(\xv, \zv, t; \thetav)\,,
	\end{align*}
	where $\odot$ is the Hadamard product. Further, we initialize the initial state $\bs{x}_0$ as $\zv \rightarrow$ 2-layer MLPs with  \emph{Sigmoid} output activation $\rightarrow$  $\bs{x}_0$.
	
	\paragraph{Emission} We map the states $X$ to the observations $Y$ with a 1-layer linear MLP. For all baseline methods, the emission function outputs observation means $\bs{m}(t)$ and variances $\bs{\epsilon}(t)$. In contrast, our proposed approach (SL-ODE) outputs the median $\bs{m}(t)$, upper $\bs{u}(t)$, and lower $\bs{l}(t)$ quantiles according to the specified $\tau$.
	
	%	\begin{figure*}[!t]
	%		\begin{subfigure}{.5\textwidth}
	%			\centering
	%			\includegraphics[scale=0.328]{Figures/mech_sb_post_95}
	%			\caption{SL-ODE}
	%		\end{subfigure}
	%		\begin{subfigure}{.5\textwidth}
	%			\centering
	%			\includegraphics[scale=0.328]{Figures/goku_sb_post}
	%			\caption{GOKU-Net}
	%		\end{subfigure}
	%		\begin{subfigure}{.5\textwidth}	
	%			\centering
	%			\includegraphics[scale=0.328]{Figures/latent_sb_post}
	%			\caption{Latent-ODE}
	%		\end{subfigure}
	%		\begin{subfigure}{.5\textwidth}	
	%			\centering
	%			\includegraphics[scale=0.328]{Figures/hierarchical_sb_post}
	%			\caption{Hierarchical-ODE}
	%		\end{subfigure}
	%		\caption{Posterior predictive distribution on {\sc Synthetic Biology} data via $4$-fold cross-validation \emph{multiple device} inference task for (a) proposed SL-ODE, (b) GOKU-Net, (c) Latent-ODE, and (d) Hierarchical-ODE models. For clarity, we plot ground truth (dotted) time-series against median predictions (solid) across three $\bs{c}=[C_{6}, C_{12}]$ treatments (minimum, median, and maximum). Shaded areas indicate the predicted 95\% confidence interval (CI).}
	%		\label{fig:mutiple-device_SM}
	%	\end{figure*}
	
	\begin{figure*}[!thb]
		\centering
		\subfloat[\centering SL-ODE]{
			{\includegraphics[width=0.23\textwidth]{Figures/mech_sb_post_95_cr} }
		}%
		%	\qquad
		\subfloat[\centering GOKU-Net]{
			{\includegraphics[width=0.23\textwidth]{Figures/goku_sb_post_cr} }
		}%
		\subfloat[\centering Latent-ODE]{
			{\includegraphics[width=0.23\textwidth]{Figures/latent_sb_post_cr} }
		}%
		\subfloat[\centering Hierarchical-ODE]{
			{\includegraphics[width=0.23\textwidth]{Figures/hierarchical_sb_post_cr} }
		}%
		\caption{Posterior predictive distribution on {\sc Synthetic Biology} data via $4$-fold cross-validation \emph{multiple device} inference task for (a) proposed SL-ODE, (b) GOKU-Net, (c) Latent-ODE, and (d) Hierarchical-ODE models. For clarity, we plot ground truth (dotted) time-series against median predictions (solid) across three $\bs{c}=[C_{6}, C_{12}]$ treatments (minimum, median, and maximum), \emph{e.g.}, when $C_6$ is at its minimum, the output is averaged across all $C_{12}$. Shaded areas indicate the predicted 95\% confidence interval (CI).}
		\label{fig:mutiple-device_SM}
	\end{figure*}
	
	
	\begin{figure*}[!thb]
		\centering
		\subfloat[\centering  SL-ODE: Posterior]{
			{\includegraphics[width=0.48\textwidth]{Figures/mech_post_heldout_21_cr} }
		}%
		%	\qquad
		\subfloat[\centering SL-ODE: Prior]{
			{\includegraphics[width=0.48\textwidth]{Figures/mech_prior_heldout_21_cr} }
		}%
		\caption{SL-ODE {\sc Synthetic Biology} \emph{held-out device} ($\bs{g}=R33\mhyphen S32$) task. Ground truth \emph{vs.}  (a) posterior predictive distribution and (b) \emph{controlled} generated observations given system inputs $\uv = [\bs{g}, \bs{c}]$ according to assumed prior distribution.  We plot the median (circles) with 95\% CI against ground truth observations (crosses) averaged (200 $\zv$ samples) across all observations at the final time-point sweeping all $ \bs{c} =[C_{6}, C_{12}]$ treatments.}
		\label{fig:heldout-device_SM}	
	\end{figure*}
	
	
	
	\begin{figure*}[!thb]
		\centering
		\subfloat[\centering SL-ODE]{
			{\includegraphics[width=0.23\textwidth]{Figures/hvc_patient/slode_hvc_post_00_cr} }
		}%
		%	\qquad
		\subfloat[\centering GOKU-Net]{
			{\includegraphics[width=0.23\textwidth]{Figures/hvc_patient/goku_hvc_post_00_cr} }
		}%
		%	\qquad
		\subfloat[\centering Latent-ODE]{
			{\includegraphics[width=0.23\textwidth]{Figures/hvc_patient/latent_hvc_post_00_cr} }
		}%
		%\qquad
		\subfloat[\centering Hierarchical-ODE]{
			{\includegraphics[width=0.23\textwidth]{Figures/hvc_patient/hierarchical_hvc_post_00_cr} }
		}%
		\caption{Posterior predictive distribution on {\sc Human Viral Challenge} for a randomly selected test patient showing one of the four combinations of binary outcomes $\uv$ for viral shedding (sh=0) and symptoms (sx=0) onset for (a) proposed SL-ODE, (b) GOKU-Net, (c) Latent-ODE, and (d) Hierarchical-ODE models. For clarity, we plot ground truth (dotted) time-series against median predictions (solid). We do not show error bars since they are too large due to noisy data.}%
		\label{fig:hvc_post_00}%
	\end{figure*}
	
	\begin{figure*}[!thb]
		\centering
		\subfloat[\centering SL-ODE]{
			{\includegraphics[width=.23\textwidth]{Figures/hvc_patient/slode_hvc_post_01_cr} }
		}%
		%	\qquad
		\subfloat[\centering GOKU-Net]{
			{\includegraphics[width=.23\textwidth]{Figures/hvc_patient/goku_hvc_post_01_cr} }
		}%
		%\qquad
		\subfloat[\centering Latent-ODE]{
			{\includegraphics[width=.23\textwidth]{Figures/hvc_patient/latent_hvc_post_01_cr} }
		}%
		%\qquad
		\subfloat[\centering Hierarchical-ODE]{
			{\includegraphics[width=.23\textwidth]{Figures/hvc_patient/hierarchical_hvc_post_01_cr} }
		}%
		\caption{Posterior predictive distribution on {\sc Human Viral Challenge} for a randomly selected test patient showing one of the four combinations of binary outcomes $\uv$ for viral shedding (sh=0) and symptoms (sx=1) onset for (a) proposed SL-ODE, (b) GOKU-Net, (c) Latent-ODE, and (d) Hierarchical-ODE models. For clarity, we plot ground truth (dotted) time-series against median predictions (solid). We do not show error bars since they are too large due to noisy data.}%
		\label{fig:hvc_post_01}%
	\end{figure*}
	
	
	\begin{figure*}[!thb]
		\centering
		\subfloat[\centering SL-ODE]{
			{\includegraphics[width=.23\textwidth]{Figures/hvc_patient/slode_hvc_post_10_cr} }
		}%
		%	\qquad
		\subfloat[\centering GOKU-Net]{
			{\includegraphics[width=.23\textwidth]{Figures/hvc_patient/goku_hvc_post_10_cr} }
		}%
		%\qquad
		\subfloat[\centering Latent-ODE]{
			{\includegraphics[width=.23\textwidth]{Figures/hvc_patient/latent_hvc_post_10_cr} }
		}%
		%	\qquad
		\subfloat[\centering Hierarchical-ODE]{
			{\includegraphics[width=.23\textwidth]{Figures/hvc_patient/hierarchical_hvc_post_10_cr} }
		}%
		\caption{Posterior predictive distribution on {\sc Human Viral Challenge} for a randomly selected test patient showing one of the four combinations of binary outcomes $\uv$ for viral shedding (sh=1) and symptoms (sx=0) onset for (a) proposed SL-ODE, (b) GOKU-Net, (c) Latent-ODE, and (d) Hierarchical-ODE models. For clarity, we plot ground truth (dotted) time-series against median predictions (solid). We do not show error bars since they are too large due to noisy data.}%
		\label{fig:hvc_post_10}%
	\end{figure*}
	
	\begin{figure*}[!thb]
		\centering
		\subfloat[\centering SL-ODE]{
			{\includegraphics[width=.23\textwidth]{Figures/hvc_patient/slode_hvc_post_11_cr} }
		}%
		%\qquad
		\subfloat[\centering GOKU-Net]{
			{\includegraphics[width=.23\textwidth]{Figures/hvc_patient/goku_hvc_post_11_cr} }
		}%
		%\qquad
		\subfloat[\centering Latent-ODE]{
			{\includegraphics[width=.23\textwidth]{Figures/hvc_patient/latent_hvc_post_11_cr} }
		}%
		%\qquad
		\subfloat[\centering Hierarchical-ODE]{
			{\includegraphics[width=.23\textwidth]{Figures/hvc_patient/hierarchical_hvc_post_11_cr} }
		}%
		\caption{Posterior predictive distribution on {\sc Human Viral Challenge} for a randomly selected test patient showing one of the four combinations of binary outcomes $\uv$ for viral shedding (sh=1) and symptoms (sx=1) onset for (a) proposed SL-ODE, (b) GOKU-Net, (c) Latent-ODE, and (d) Hierarchical-ODE models. For clarity, we plot ground truth (dotted) time-series against median predictions (solid). We do not show error bars since they are too large due to noisy data.}%
		\label{fig:hvc_post_11}%
	\end{figure*}
	
	
	\subsection{Hyper-parameter Selection}
	%
	We use the Adam optimizer \citep{kingma2014adam} with the following hyper-parameters: exponential decay rates of $0.9$ and $0.99$ for the first- and second-moment estimates, respectively, and epsilon $1 \times 10^{-8}$. We train all models using one NVIDIA P100 GPU with 16GB memory. See Table~\ref{tb:hyper} for data-specific hyper-parameters. We split the {\sc Cardiovascular System} data into training, validation, and test sets as 80\%, 10\%, and 10\% partitions, respectively. Further, we use the validation set for early stopping and selecting model hyper-parameters. However, for the {\sc Synthetic Biology} and {\sc Human Viral Challenge} datasets, we perform $k$-fold cross-validation due to the small sample sizes.
	
	%	\subsection{SL-ODE Algorithm}
	%	%
	%	See Algorithm~\ref{alg:algorithm_SM} for a summary of the SL-ODE training procedure, which is shared across all baseline methods except for the optimized evidence lower bound.
	%	
	%	\begin{algorithm}[!htb]
	%		\caption{SL-ODE.}
	%		\label{alg:algorithm_SM}
	%		\textbf{Input}: ODE solver, Hyper-parameters\\
	%		\textbf{Parameter}: Initialize parameters $\{ \thetav, \psiv,  \gammav, \varphiv, \phiv\}$\\
	%		\textbf{Output}: Maximize ELBO
	%		\begin{algorithmic}[1] %[1] enables line numbers
	%			\STATE $ \zv \sim q_{\phi}(\zv|Y)$ specified as Encoder ($\yv(t); \phiv$)\\
	%			\STATE $\xv_0 = {\rm InitState} (\zv; \thetav) $\\
	%			\STATE 	Simulate $X  = {\rm ODESolve} \left(f_{\thetav}, \bs{x}_0, (t_0, t_1, ..,t_T) \right)$, s.t. $\frac{d\xv}{dt} = f_{\thetav} (\xv; \zv, t)$\\
	%			\STATE Reconstruct $Y \sim p\left(Y| m_{\gammav}(X), \sigma, \tau\right)$\\
	%			\STATE Comptute ELBO
	%			\begin{align*}
	%				&\mathbb{E}_{q_{\phiv}(\zv| Y)}  \Bigg[  \frac{q_{\varphiv} ( \uv| \zv_{\uv}) }{ q_{\varphiv, \phiv} ( \uv|Y )} \Bigg. 
	%				\Bigg. \log \left(\frac{p_{\thetav, \psiv, \gammav} (Y |\zv) p_{\psiv}(\zv| \uv)}{q_{\varphiv} (\uv| \zv_{ \uv}) q_{\phiv}(\zv|Y)}  \right)\Bigg] \\
	%				& + \log q_{\varphiv, \phiv} ( \uv|Y ) + \log p(\uv)
	%			\end{align*}
	%			\STATE Backpropagate and update $\{ \thetav, \psiv,  \gammav, \varphiv, \phiv\}$
	%			\STATE \textbf{return} solution
	%		\end{algorithmic}
	%	\end{algorithm}
	%	
\end{document}
