\documentclass[accepted]{uai2022}
%% Choose your variant of English; be consistent
\usepackage[american]{babel}
% \usepackage[british]{babel}
\usepackage[utf8]{inputenc}

%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{booktabs} % commands to create good-looking tables

%% Self-defined macros
\newcommand{\subdir}{subfiles}
\newcommand{\figdir}{img}

%\usepackage{xr-hyper}
\usepackage{nameref,zref-xr}
\zxrsetup{toltxlabel}
\zexternaldocument*{lindinger_112}
%\usepackage{multicol}
%\usepackage{cuted}
%\AfterEndEnvironment{strip}{\leavevmode}
\usepackage{caption}
\usepackage{amsmath}
\usepackage{amssymb}
\usepackage{mathtools}
\usepackage{amsthm}
\usepackage{enumerate}
%\usepackage{lscape}
\usepackage{xcolor}
\usepackage{cancel}
%\usepackage{hyperref}
\usepackage{algorithm}
\usepackage{algpseudocode}
\usepackage{multirow}
\usepackage{sidecap}

\newcommand{\gauss}[3]{\mathcal{N}\left( #1 \vert #2, #3 \right)}
\DeclareMathOperator*{\argmin}{argmin}
\DeclareMathOperator*{\argmax}{argmax}

\newcommand{\christoph}[1]{ \textcolor{orange!75!black}{C: #1}}
\newcommand{\jakob}[1]{ \textcolor{red}{J: #1}}
\newcommand{\barbara}[1]{ \textcolor{green}{B: #1}}
\newcommand{\rebuttal}[1]{\textcolor{black}{#1}}

\newtheorem{remark}{Remark}%
	
%\title[Laplace GPSSMs]{Laplace Approximated Gaussian Process State-Space Models}
\title{Laplace Approximated Gaussian Process State-Space Models (Supplementary Material)}	

\author[1,2,3]{\href{mailto:<jakob.lindinger@sick.de>?Subject=Your UAI 2022 paper}{Jakob~Lindinger}{}}

\author[1]{\href{mailto:<barbara.rakitsch@de.bosch.com>?Subject=Your UAI 2022 paper}{Barbara~Rakitsch}{}}

\author[2,3]{Christoph~Lippert}

\affil[1]{%
	Bosch Center for Artificial Intelligence\\
	Renningen, Germany}
%\postcode{71272} \street{Robert-Bosch-Campus 1},

\affil[2]{%
	Hasso Plattner Institute\\
	Potsdam, Germany}

\affil[3]{%
	University of Potsdam\\
	Germany}

\begin{document}
\onecolumn
\maketitle
\appendix
\setcounter{figure}{4}
\setcounter{table}{2}
\setcounter{equation}{20}

%\begin{strip}
\section{GPSSMs and the FITC approximation}
\label{sec:appx_proof}

In this section we formalize and prove a remark on the properties of variational approximations for GPSSMs that have been used in the literature so far.
More precisely, we show that the ELBO of \cite{ialongo_overcoming_2019} remains unchanged when the FITC approximation \citep{snelson_sparse_2005}, $ p(F_T \mid F_M) \approx \prod_{t=0}^{T-1} p(f_t \mid F_M) $, is used in both the prior and the approximate posterior.
This remark is a direct extension of the equivalent remark of \cite{frigola_variational_2014} about the same property of their model, which can be found in their appendix.

Before we begin, we first reiterate some previous results:
The ELBO of the VCDT method from \cite{ialongo_overcoming_2019} is given as
\begin{align} \nonumber
	\mathcal{L}_\text{VCDT} = &\sum_{t=1}^{T} \mathbb{E}_{q(x_t)}\left[\log p(y_t \mid x_t)\right]
	- KL\left[q(x_0)\parallel p(x_0)\right]	- KL\left[q(F_M)\parallel p(F_M)\right] \\
	\label{eq:appx_elbo_vcdt}
	- &\sum_{t=1}^{T} \mathbb{E}_{q(x_{t-1},f_{t-1},F_M)}\left[KL\left[ q(x_t\mid F_M, x_{t-1})\parallel p(x_t \mid f_{t-1}, x_{t-1}) \right]\right],
\end{align}
where the posterior marginals are given by
\begin{align}\label{eq:appx_vcdt_qx}
	q(x_t) &= \int \left[\prod_{t'=1}^t q(x_{t'}\mid F_M, x_{t'-1})\right] q(x_0)q(F_M) dF_M dx_{0:t-1}, \\
	\label{eq:appx_vcdt_qxf}
	q(x_{t-1},f_{t-1},F_M) &= \int \left[\prod_{t'=1}^{t-1} q(x_{t'}\mid F_M, x_{t'-1})\right] q(x_0)q(F_M)p(f_{t-1}\vert F_M) dx_{0:t-2},
\end{align}
where we use the shorthand $ dx_{i:j} = \prod_{t'=i}^{j}dx_{t'} $.
This can be obtained by using the prior and approximate posterior,
\begin{align}\label{eq:appx_vcdt_p}
	p(Y_T, X_{T_0}, F_T, F_M) &= 
	p(x_0)p(F_T \mid F_M)p(F_M)\prod_{t=1}^{T} p(y_t\mid x_t) p(x_t\mid f_{t-1}, x_{t-1}), \\
	\label{eq:appx_vcdt_q}
	q( X_{T_0}, F_T, F_M) &= 
	q(x_0)p(F_T \mid F_M)q(F_M)\prod_{t=1}^{T} q(x_t\mid F_M, x_{t-1}),
\end{align}
respectively.
To clarify some of the above points, we first of all note that the expectation value is defined as $ \mathbb{E}_{p(a,b)}\left[g(a,b)\right] = \int p(a,b) g(a,b) da db $, and the KL-divergence as  $ KL\left[q(a)\parallel p(a)\right]= \int q(a) \log\frac{ q(a)}{p(a)} da $.
The ELBO in Eq.~\eqref{eq:appx_elbo_vcdt} is the same as in \cite{ialongo_overcoming_2019}, where we simply collected all the terms that are somewhat spread over their paper and use our notation (introduced in Sec.~\ref{sec:background}).
The marginals in Eqs.~\eqref{eq:appx_vcdt_qx} and \eqref{eq:appx_vcdt_qxf} are not written out in as much detail in \cite{ialongo_overcoming_2019}, but can be simply obtained from plugging in Eqs.~\eqref{eq:appx_vcdt_p} and \eqref{eq:appx_vcdt_q} in the general formula for the ELBO in Eq.~\eqref{eq:elbo} and collecting the remaining terms.
This will be almost equivalent to the very similar derivation that we perform as part of the proof below.
Finally, the exact definitions of the distributions $ q $ and $ p $ appearing in Eqs.~\eqref{eq:appx_elbo_vcdt}--\eqref{eq:appx_vcdt_q} \citep[given in][]{ialongo_overcoming_2019} are not very important for our current undertaking, only their conditional (in)dependencies.

After these preliminaries we are ready to formally state our observation about the FITC approximation:

\begin{remark}
	\label{remark}	
	The ELBO in Eq.~\eqref{eq:appx_elbo_vcdt} remains unchanged when the FITC approximation, $ p(F_T \mid F_M) \approx \prod_{t=0}^{T-1} p(f_t \mid F_M) $, is used in	Eqs.~\eqref{eq:appx_vcdt_p} and \eqref{eq:appx_vcdt_q}.
\end{remark}

\begin{proof}[Proof of Remark {\upshape\ref{remark}}]
	The proof is conceptually simple: It relies on altering Eqs.\ \nolinebreak\eqref{eq:appx_vcdt_p} and \eqref{eq:appx_vcdt_q} with the FITC approximation and then calculating the ELBO by using Eq.\ \nolinebreak\eqref{eq:elbo} and showing that this coincides with Eqs.\ \nolinebreak\eqref{eq:appx_elbo_vcdt}--\eqref{eq:appx_vcdt_qxf}.
	The concepts needed in the proof are also very simple:
	We only require the definition of expectation value and KL-divergences from above, rules for the logarithm and the property for arbitrary probability densities $ q(a,b) $ that
	\begin{equation}\label{eq:appx_property}
		\int q(a,b) g(b) da db = \int q(b) g(b)  \left( \int  q(a \mid b) da \right) db = \int q(b)g(b) db,
	\end{equation}
	due to the normalization property of probability distributions.
	
	We start by defining the alternative VCDT-FITC model by plugging in the FITC approximation $ p(F_T \mid F_M) \approx \prod_{t=0}^{T-1} p(f_t \mid F_M) $ in Eqs.\ \nolinebreak\eqref{eq:appx_vcdt_p} and \eqref{eq:appx_vcdt_q}:
	\begin{align}\label{eq:appx_fitc_p}
		p_\text{FITC}(Y_T, X_{T_0}, F_T, F_M) &= 
		p(x_0)p(F_M)\prod_{t=1}^{T} p(f_{t-1} \mid F_M)p(y_t\mid x_t) p(x_t\mid f_{t-1}, x_{t-1}), \\
		\label{eq:appx_fitc_q}
		q_\text{FITC}( X_{T_0}, F_T, F_M) &= 
		q(x_0)q(F_M)\prod_{t=1}^{T} q(x_t\mid F_M, x_{t-1})p(f_{t-1} \mid F_M).
	\end{align}
	Plugging these equations in Eq.\ \nolinebreak\eqref{eq:elbo} yields the formula for the ELBO of this model:
	\begin{equation}\label{eq:appx_fitc_elbo_initial}
		\mathcal{L}_\text{VCDT-FITC} = \mathbb{E}_{q_\text{FITC}( X_{T_0}, F_T, F_M)} \left[\log \frac{p_\text{FITC}(Y_T, X_{T_0}, F_T, F_M)}{q_\text{FITC}( X_{T_0}, F_T, F_M)} \right].
	\end{equation}
	Evaluating the terms inside the logarithm according to Eqs.\ \nolinebreak\eqref{eq:appx_fitc_p} and \eqref{eq:appx_fitc_q} yields
	\begin{equation} \label{eq:appx_fitc_elbo_1}
		\mathcal{L}_\text{VCDT-FITC} = \mathbb{E}_{q_\text{FITC}( X_{T_0}, F_T, F_M)} \left[
		\sum_{t=1}^T \log p(y_t \mid x_t) + \log \frac{p(x_0)}{q(x_0)} 	
		+ \log \frac{p(F_M)}{q(F_M)} + \sum_{t=1}^T \log \frac{p(x_t \mid f_{t-1}, x_{t-1})}{q(x_t \mid F_M, x_{t-1})} \right].
	\end{equation}
	Note that here the identical FITC terms, $ \prod_{t=0}^{T-1} p(f_t \mid F_M) $, in the prior and approximate posterior have canceled each other out inside the logarithm, but this also happens for the terms $ p(F_T\mid F_M) $ in the original VCDT formulation [Eqs.\ \nolinebreak\eqref{eq:appx_vcdt_p} and \eqref{eq:appx_vcdt_q}].
	
	Using the definition of the expectation value, then the property in Eq.\ \nolinebreak\eqref{eq:appx_property} and then the definition of the KL-divergence, we identify two of the KL-terms in Eq.\ \nolinebreak\eqref{eq:appx_elbo_vcdt}:
	\begin{align} \nonumber
		\mathcal{L}_\text{VCDT-FITC} &= \mathbb{E}_{q_\text{FITC}( X_{T_0}, F_T, F_M)} \left[
		\sum_{t=1}^T \log p(y_t \mid x_t) + \sum_{t=1}^T \log \frac{p(x_t \mid f_{t-1}, x_{t-1})}{q(x_t \mid F_M, x_{t-1})} \right] \\
		\label{eq:appx_fitc_elbo_2}
		&- KL\left[q(x_0)\parallel p(x_0)\right] - KL\left[q(F_M)\parallel p(F_M)\right].
	\end{align}
	In the following we carefully treat the remaining two terms inside the expectation: First,
	\begin{align} \nonumber
		&\mathbb{E}_{q_\text{FITC}( X_{T_0}, F_T, F_M)} \left[
		\sum_{t=1}^T \log p(y_t \mid x_t) \right] \\
		\nonumber
		= &\sum_{t=1}^T \int q(x_0)q(F_M) \left[ \prod_{t'=1}^{T} q(x_{t'}\mid F_M, x_{t'-1}) \cancel{p(f_{t'-1} \mid F_M)} \right] \log p(y_t \mid x_t) dF_M dx_{0:T} \cancel{df_{0:T-1}} \\
		\nonumber
		= &\sum_{t=1}^T \int \left( \int q(x_0)q(F_M)\left[\prod_{t'=1}^{t} q(x_{t'}\mid F_M, x_{t'-1}) \right] dF_M  dx_{0:t-1} \right) \log p(y_t \mid x_t) dx_t\\
		\label{eq:appx_fitc_elbo_3}
		= &\sum_{t=1}^{T} \mathbb{E}_{q(x_t)}\left[\log p(y_t \mid x_t)\right],	
	\end{align}
	which we recognize as the first term in Eq.\ \nolinebreak\eqref{eq:appx_elbo_vcdt}.
	Here, we have used Eq.\ \nolinebreak\eqref{eq:appx_property} to get rid of the integrals over all $ f_{t'} $ and over all $ x_{t'} $ for $ t' > t $ in the second step and we have identified $ q(x_t) $ from Eq.\ \nolinebreak\eqref{eq:appx_vcdt_qx} in the last step.
	
	We note that for the term in Eq.\ \nolinebreak\eqref{eq:appx_fitc_elbo_3} as well as for the KL-terms in Eq.\ \nolinebreak\eqref{eq:appx_fitc_elbo_2}, the difference between the FITC and the non-FITC approximate posterior does not play a role since the terms within the expectation value do not depend on any $ f_t $.
	This is different for the remaining term in Eq.\ \nolinebreak\eqref{eq:appx_fitc_elbo_2}, which has a dependence on $ f_t $:
	\begin{align} \nonumber
		&\mathbb{E}_{q_\text{FITC}( X_{T_0}, F_T, F_M)} \left[
		\sum_{t=1}^T \log \frac{p(x_t \mid f_{t-1}, x_{t-1})}{q(x_t \mid F_M, x_{t-1})} \right] \\
		\nonumber
		= &\sum_{t=1}^T \int 
		\left[ q(x_0)q(F_M) \left(  \prod_{t'=1}^{T} p(f_{t'-1} \mid F_M) q(x_{t'}\mid F_M, x_{t'-1})  \right) \log \frac{p(x_t \mid f_{t-1}, x_{t-1})}{q(x_t \mid F_M, x_{t-1})} dF_M dx_{0:T} df_{0:T-1} \right]
		\\
		\nonumber
		= &\sum_{t=1}^T \int 
		\left[ q(x_0)q(F_M)p(f_{t-1} \mid F_M)\prod_{t'=1}^{t-1} q(x_{t'}\mid F_M, x_{t'-1}) dx_{0:t-2} \right.\\
		\nonumber
		&\hphantom{\sum_{t=1}^T \iint} \times \left. \left(\int q(x_{t}\mid F_M, x_{t-1}) \log \frac{p(x_t \mid f_{t-1}, x_{t-1})}{q(x_t \mid F_M, x_{t-1})} dx_t \right) dF_M df_{t-1} dx_{t-1} \right]
		\\
		\label{eq:appx_fitc_elbo_4}
		= &-\sum_{t=1}^{T} \mathbb{E}_{q(x_{t-1},f_{t-1},F_M)}\left[KL\left[ q(x_t\mid F_M, x_{t-1})\parallel p(x_t \mid f_{t-1}, x_{t-1}) \right]\right].	
	\end{align}
	We recognize Eq.\ \nolinebreak\eqref{eq:appx_fitc_elbo_4} as the last term in Eq.\ \nolinebreak\eqref{eq:appx_elbo_vcdt}.
	Here, we have used Eq.\ \nolinebreak\eqref{eq:appx_property} to get rid of the integrals over all $ f_{t'} $ for $ t' \neq t-1 $ and over all $ x_{t'} $ for $ t' > t $ in the second step and additionally did some reordering of the terms.
	In the last step, we have identified $ q(x_{t-1},f_{t-1},F_M) $ from Eq.\ \nolinebreak\eqref{eq:appx_vcdt_qxf} and a (negative) KL-divergence.
	It is interesting to note that at this point the non-FITC variational approximation still would not make a difference:
	Since every summand within the expectation value in the first line of Eq.\ \nolinebreak\eqref{eq:appx_fitc_elbo_4} only depends on a single $ f_t $, the switching of summation and expectation makes both versions, $ p(F_T\mid F_M) $ and $ \prod_{t=0}^{T-1} p(f_t\mid F_M) $, reduce to $ p(f_{t-1}\mid F_M) $ [using Eq.\ \nolinebreak\eqref{eq:appx_property}] in the end.
	
	Together, Eqs.\ \nolinebreak\eqref{eq:appx_fitc_elbo_initial}--\eqref{eq:appx_fitc_elbo_4} show that the ELBO of the VCDT-FITC model matches that of the original VCDT model in Eq.\ \nolinebreak\eqref{eq:appx_elbo_vcdt}, therefore completing the proof.
\end{proof}

We furthermore note that since \cite{eleftheriadis_identification_2017} use the same conditional (in)dependencies in their variational family as \cite{frigola_variational_2014}, the remark in the appendix of the latter work also holds true for the former.
In summary, all practically viable variational treatments of GPSSMs either explicitly use the FITC approximation in both prior and approximate posterior \citep{doerr_probabilistic_2018} or implicitly do so \citep{frigola_variational_2014,eleftheriadis_identification_2017,ialongo_overcoming_2019}, i.e., the latter methods could have used the approximation and would have arrived at the same optimization objective.\footnote{This does not hold for the method described as ``non-factorised non-linear'' in \cite{ialongo_overcoming_2019}, but the $ \mathcal{O}(T^3) $ scaling makes this and similar methods unattractive to use in practice.}
This somewhat weakens the criticism of \cite{ialongo_overcoming_2019} on the choice of \cite{doerr_probabilistic_2018} to use a FITC prior and motivates us to do the same in our work.

\section{Derivation of the Laplace approximation}
\label{sec:appx_laplace}
In this section we provide a more extensive derivation for the Laplace approximation in our notation, see e.g.~\cite{mackay_information_2003,skaug_automatic_2006,kristensen_tmb_2016} for derivations in other notations or contexts.
As explained in Sec.~\ref{sec:laplace}, we use the Laplace approximation to approximate the marginal likelihood [Eq.~\eqref{eq:marginal_lik}],
\begin{equation}\label{eq:appx_marginal_lik}
	p_\theta(Y_T) = \int p_\theta(Y_T, X_{T_0}) dX_{T_0}.
\end{equation}
In some more detail, we do this with the following steps:
We first introduce an exponential and a logarithm that cancel each other out,
\begin{equation}\label{eq:appx_marginal_lik1}
	p_\theta(Y_T) = \int \exp \left[ \log p_\theta(Y_T, X_{T_0})\right] dX_{T_0}.
\end{equation}
Then we find a mode $ \hat{X}_{T_0} $ of $ \log p_\theta(Y_T, X_{T_0}) $,
\begin{equation} \label{eq:appx_lap_mode}
	\hat{X}_{T_0} = {\argmax\ }_{X_{T_0}} \log p_\theta(Y_T, X_{T_0}),
\end{equation}
and perform a second order Taylor expansion of  $ \log p_\theta(Y_T, X_{T_0}) $ around this mode leading to
\begin{equation}\label{eq:appx_marginal_lik2}
	p_\theta(Y_T) \approx \int \exp \left[ \log p_\theta(Y_T, \hat{X}_{T_0}) - \frac{1}{2}
	\left( \hat{X}_{T_0}-X_{T_0}\right)^\top H \left( \hat{X}_{T_0}-X_{T_0}\right)
	\right] dX_{T_0}.
\end{equation}
Here $ H $ is the negative Hessian of $ \log p_\theta(Y_T, X_{T_0}) $ as in Eq.~\eqref{eq:lap_hess} and the first order term vanishes since we are expanding around a mode, where, by definition, the gradient vanishes.
The first term in the exponential in Eq.~\eqref{eq:appx_marginal_lik2} is constant in $ X_{T_0} $ and can be pulled out of the integral:
\begin{equation}\label{eq:appx_marginal_lik3}
	p_\theta(Y_T) \approx p_\theta(Y_T, \hat{X}_{T_0}) \int \exp \left[-\frac{1}{2}
	\left( \hat{X}_{T_0}-X_{T_0}\right)^\top H \left( \hat{X}_{T_0}-X_{T_0}\right)
	\right] dX_{T_0}.
\end{equation}
Remaining in the integral, we recognize the exponential of a negative quadratic form, i.e., an unnormalized multivariate Gaussian, which generally has the density \citep[see e.g.][]{mackay_information_2003},
\begin{equation}\label{eq:appx_gauss}
	\gauss{x}{\mu}{\Sigma} = \det (2\pi \Sigma)^{-\frac{1}{2}} \exp \left[-\frac{1}{2}
	\left( \mu-x\right)^\top\Sigma^{-1} \left(  \mu-x\right) \right],
\end{equation}
with mean $ \mu $ and covariance matrix $ \Sigma $.
We therefore recognize $ \hat{X}_{T_0} $ as the mean and $ H $ as the inverse of the covariance matrix, i.e., the precision matrix, in Eq.~\eqref{eq:appx_marginal_lik3}, leading to
\begin{equation}\label{eq:appx_marginal_lik4}
	p_\theta(Y_T) \approx p_\theta(Y_T, \hat{X}_{T_0}) \int \det (2\pi H^{-1})^{\frac{1}{2}} \gauss{X_{T_0}}{\hat{X}_{T_0}}{H^{-1}} dX_{T_0}.
\end{equation}
This integral is very easy to solve since the factor with the determinant does not depend on $ X_{T_0} $, leaving us with an integral over a normalized probability density which, by definition, evaluates to 1.
Therefore
\begin{align}\nonumber
	p_\theta(Y_T) &\approx p_\theta(Y_T, \hat{X}_{T_0}) \det (2\pi H^{-1})^{\frac{1}{2}} \\
	&= \sqrt{2\pi}^{d_x(T+1)} p_\theta(Y_T, \hat{X}_{T_0}) \det (H)^{-\frac{1}{2}},
\end{align}
where we used standard rules for determinants, using that $ H $ is of dimension $ d_x(T+1) \times d_x(T+1) $.
This is the formula that can be found in Eq.~\eqref{eq:lap_lik}.

\section{Using the Implicit Function Theorem}
\label{sec:appx_ift}
In order to obtain a formula for $ \frac{\partial\hat{X}_{T_0}(\theta)}{\partial \theta } $ we start by defining a function $ h $, that is the Jacobian of the function $ g_{\text{GP}} $ in Eq.~\eqref{eq:g_gpssm}:
\begin{equation}\label{eq:ift_h}
	h(x, \theta, F_M) =  \left. -\frac{\partial \log p_\theta(Y_T, X_{T_0}\mid F_M)}{\partial X_{T_0} } 
	\right\rvert_{X_{T_0} = x}.
\end{equation}
By definition, plugging in the mode $ \hat{X}_{T_0} $ for the fixed value $ \hat{\theta} $ of the parameters $ \theta $ that has been used when obtaining $ \hat{X}_{T_0} $, yields a vanishing Jacobian,
$ h(\hat{X}_{T_0}(\hat{\theta}), \hat{\theta}, F_M) = 0 $.
Here, we made the dependence of the mode on the parameter setting $ \hat{\theta}  $ explicit.
Under mild differentiability assumptions, the IFT, roughly speaking, guarantees that $ \hat{X}_{T_0}(\theta) $ is in fact a function of $ \theta $ that is \emph{implicitly defined} through a vanishing Jacobian in the vicinity of $ \hat{\theta} $, i.e., through the equation
\begin{equation}\label{eq:ift_h0}
	h(\hat{X}_{T_0}(\theta), \theta, F_M) = 0,
\end{equation}
and that is also differentiable at $ \hat{\theta} $ and in its vicinity.
Therefore, we can calculate the total derivative with respect to $ \theta $ on both sides of Eq.~\eqref{eq:ift_h0}, yielding
\begin{equation}\label{eq:ift_total_derivative}
	\frac{\partial h(\hat{X}_{T_0}, \theta, F_M)}{\partial \theta }  +
	\left. \frac{\partial h(x, \theta, F_M)} {\partial x } \right\rvert_{x = \hat{X}_{T_0}}
	\frac{\partial\hat{X}_{T_0}(\theta)}{\partial \theta } = 0,
\end{equation}
where we used the chain rule.
Recognizing
\begin{equation*}
	\left. \frac{\partial h(x, \theta, F_M)} {\partial x } \right\rvert_{x = \hat{X}_{T_0}} = H(\theta, F_M),	
\end{equation*}
i.e., the Hessian $ H $ [cf.~Eq.~\eqref{eq:lap_hess}] of the function $ g_\text{GP} $ in Eq.~\eqref{eq:g_gpssm}, we can solve Eq.~\eqref{eq:ift_total_derivative} for the required derivative, obtaining
\begin{equation}\label{eq:ift_result_appx}
	\frac{\partial\hat{X}_{T_0}(\theta)}{\partial \theta } = -H^{-1}(\theta, F_M)\frac{\partial h(\hat{X}_{T_0}, \theta, F_M)}{\partial \theta }.
\end{equation}
This is Eq.~\eqref{eq:ift_result} appearing in Sec.~\ref{sec:IFT}.

\section{Exploiting Structure and Sparsity of the Hessian}
\label{sec:appx_hessian}
In the following we provide some technical details on how we obtain the blocks of the Hessian in Eq.~\eqref{eq:sparse_structure} in Sec.~\ref{sec:appx_hessian_blocks}
before we show in Sec.~\ref{sec:appx_hessian_det_solve} how these blocks can be used to efficiently calculate the determinant of the Hessian and perform a Hessian solve needed for Eqs.~\eqref{eq:vi_laplace} and \eqref{eq:ift_result}, respectively.

\subsection{Efficiently obtaining the non-zero blocks of the Hessian}
\label{sec:appx_hessian_blocks}
Reverse mode automatic differentiation frameworks such as TensorFlow \citep{abadi_tensorflow:_2016} implement efficient vector-Jacobian products.
As we can think of the Hessian as the Jacobian of the Jacobian of the function $ g_\text{GP} $ in Eq.~\eqref{eq:g_gpssm}, we can naively obtain the $ d_x(T+1) $ columns of the Hessian in Eq.~\eqref{eq:sparse_structure} by considering the vector-Hessian products $ e_j^\top H $ with all $ d_x(T+1) $ unit vectors $ e_j = \{\delta_{jj'}\}_{j' = 0}^{d_x(T+1)-1} \in \mathbb{R}^{d_x(T+1)}$, where $ \delta_{ij} $ is the Kronecker delta.
This would require $ \mathcal{O}(T^2d_x^2) $ storage and computation time.
As in Sec.~\ref{sec:sparsity}, this is wasteful since many unnecessary zeros are being calculated.
Therefore, in order to tackle the problem of obtaining only the non-zero blocks of the Hessian, we propose instead to use only vector-Hessian products with the three block-vectors $ \widetilde{e}_0 $, $ \widetilde{e}_1 $, and $ \widetilde{e}_2 $ defined by $ \widetilde{e}_k = \{\delta_{k,k'\%3}\mathbb{I}_{d_x}\}_{k' = 0}^{T} \in \mathbb{R}^{d_x(T+1)\times d_x}$.
Here $ \% $ denotes the modulo operation and $ \mathbb{I}_{d_x} $ is the identity matrix of size $ d_x \times d_x $.
As an example, $ \widetilde{e}_0 = (\mathbb{I}_{d_x}, 0, 0, \mathbb{I}_{d_x}, \cdots)^\top $ such that
\begin{equation*}
	\widetilde{e}_0^\top H = (A_0, B_1, B_3^\top, A_3, B_4, B_6^\top, \cdots)^\top,
\end{equation*}
which can be implemented as $ d_x $ vector-Hessian products.
Similarly $ \widetilde{e}_1^\top H = (B_1^\top, A_1, B_2, \cdots)^\top $ and $ \widetilde{e}_2^\top H = (0, B_2^\top, A_2, B_3, \cdots)^\top $ such that we can obtain all non-zero elements of the Hessian with only $ 3d_x $ vector-Hessian products,
reducing the memory and time requirements to $ \mathcal{O}(Td_x^2) $.
After some reshaping and transposing this provides us with the quantities $ \{A_t\}_{t=0}^T $ and $ \{B_t\}_{t=1}^T $ which are needed for the further steps in our sparse algorithm in the following section.

\subsection{Efficiently calculating the Hessian determinant and performing Hessian solves}
\label{sec:appx_hessian_det_solve}
We follow e.g.~\cite{koulaei_computing_2007} in noting that the Hessian $ H $ in Eq.~\eqref{eq:sparse_structure} allows the factorization
\begin{equation}\label{eq:sparse_factorization}
	H = (\Lambda + B^\top)\Lambda^{-1}(\Lambda + B),
\end{equation}
where $ B $ is the strictly upper-triangular part of $ H $ (consisting only of the different $ B_t $ blocks) and $ \Lambda $ is the block diagonal matrix of recursively defined blocks
\begin{equation}\label{eq:sparse_lambda}
	\Lambda_0 = A_0, \qquad \Lambda_t = A_t - B_t^\top \Lambda_{t-1}^{-1} B_t, \quad t = 1,\dots,T.
\end{equation}
The factorization in Eq.~\eqref{eq:sparse_factorization} allows us to calculate the determinant of the Hessian as
\begin{equation}\label{eq:sparse_det}
	\det H = \prod_{t=0}^T \det \Lambda_t,
\end{equation}
using $\det (\Lambda + B^\top) = \det (\Lambda + B) = \det \Lambda  $ (since $ \Lambda $ is block diagonal and $ B $ and $ B^\top $ are strictly upper and lower triangular, respectively), and $ \det(CD) =\det C \det D $ \citep[see also][]{salkuyeh_comments_2006}.
The Hessian solve in Eq.~\eqref{eq:ift_result} can be done by exploiting Eq.~\eqref{eq:sparse_factorization} as well, yielding
\begin{equation}\label{eq:sparse_solve}
	H^{-1} = (\Lambda + B)^{-1}\Lambda(\Lambda + B^\top)^{-1}.
\end{equation}
Therefore a solve with a block banded lower triangular matrix  $ (\Lambda + B^\top) $, then a matrix multiplication with a block diagonal matrix $ (\Lambda) $ followed by a solve with a block banded upper triangular matrix  $ (\Lambda + B) $ are equivalent to a solve with $ H $.
Since all of these operations can be performed in $ \mathcal{O}(Td_x^3) $ steps, which is true as well for the calculation of the blocks of $ \Lambda $ in Eq.~\eqref{eq:sparse_lambda} and the determinant in Eq.~\eqref{eq:sparse_det}, we have achieved the desired theoretical speed-ups.

We note that further speed-ups are possible, such as considering the inverse subset algorithm to calculate the derivative of the logarithm of the determinant of the Hessian \citep[see e.g.][]{kristensen_tmb_2016, durrande_banded_2019}.
Furthermore, several inherently sequential parts of the code could be written in C\texttt{++} as is done in \cite{durrande_banded_2019}.

\section{Algorithm and Implementation Details}
\label{sec:pseudocode}

\begin{algorithm}
	\caption{ELBO for GPSSM}
	%\captionof{algorithm}{ELBO for GPSSM}
	\label{algorithm}
	\begin{algorithmic}
		\State Given time series $ Y_T $
		\State Choose number of iterations and samples $ I$ and $N $, latent state dimension $ d_x $
		\State Initialize model parameters $ \theta $, variational parameters $ \psi = \{m,S\} $
		\For{$ i=1\dots I $}
		\Comment{Optimization steps}
		\For{$ n=1\dots N $}
		\Comment{Reparameterized samples}
		\State Sample $ F_M^{(n)} \sim q_\psi(F_M) $
		\State Find $ \hat{X}_{T_0}^{(n)} $ by maximizing $ g_\text{GP}(X_{T_0}, \theta, F_M^{(n)}) $
		\Comment{Eq.~\eqref{eq:g_gpssm}}
		\State Obtain non-zero elements of $ H $ $ (A_t, B_t)$
		\Comment{Appx.~\ref{sec:appx_hessian_blocks}}
		\State Evaluate $ \det H $
		\Comment{Eqs.~\eqref{eq:sparse_lambda} and \eqref{eq:sparse_det}}
		\State Evaluate $ \widetilde{p}_\theta(Y_T \mid F_M^{(n)}) $
		\Comment{Eq.~\eqref{eq:vi_laplace}}
		\EndFor
		\State Evaluate $ \mathcal{L}(\theta, \psi) $
		\Comment{Eqs.~\eqref{eq:objective} and \eqref{eq:objective_sampling}}
		\State Obtain gradients $ \frac{\partial \mathcal{L}}{\partial \theta} $,
		$ \frac{\partial \mathcal{L}}{\partial \psi} $, using custom $ \frac{\partial \hat{X}_{T_0}^{(n)}}{\partial \theta} $,
		$ \frac{\partial \hat{X}_{T_0}^{(n)}}{\partial \psi} $
		\Comment{Eqs.~\eqref{eq:ift_result} and \eqref{eq:sparse_solve}}
		\State Update $ \theta $ and $ \psi $
		\EndFor
	\end{algorithmic}
\end{algorithm}

In Alg.~\ref{algorithm}, we provide the basic algorithm to evaluate and optimize the optimization objective $ \mathcal{L}(\theta, \psi) $ in Eq.~\eqref{eq:objective_sampling}.
Then, in the following we provide some details on our implementation of Alg.~\ref{algorithm}.
We first give general details in Sec.~\ref{sec:appx_exp_implement_general} and then discuss some practical details that are required to make the algorithm work on practical problems in Sec.~\ref{sec:appx_exp_implement_practical}.

\subsection{General implementation details}
\label{sec:appx_exp_implement_general}
We build our implementation on TensorFlow \citep{abadi_tensorflow:_2016} and use the multioutput GP implementation provided by GPflow \citep{van_der_wilk_framework_2020}.
For finding the mode in Eq.~\eqref{eq:appx_lap_mode} required for the Laplace approximation, we use the BFGS algorithm \citep[see e.g.][]{fletcher_practical_1987} provided by TensorFlow-Probability.
For the implementation of the custom derivative coming from the Implicit Function Theorem in Eq.~\eqref{eq:ift_result}, we use the \textit{custom\_gradient} function provided by TensorFlow.

\subsection{Practical implementation details}
\label{sec:appx_exp_implement_practical}
In the following we first tackle the practical details of the three extensions of Alg.~\ref{algorithm} mentioned in Sec.~\ref{sec:algorithm}: Minibatches, multi-dimensional latent states and control inputs.
At the end of this section, we also discuss some practical details related to the optimization required to get the mode $ \hat{X}_{T_0} $ for the Laplace approximation.

\paragraph{Minibatches}
Time series are typically too long to be handled in one batch, such that using minibatches helps to obtain a computationally tractable algorithm.
The first term in our optimization objective [Eq.~\eqref{eq:objective_sampling}] can be written as a sum over the observations $ y_t $ [cf.~Eqs.~\eqref{eq:vi_laplace} and \eqref{eq:gpssm_joint_fitc}].
We can approximate this using minibatches (or rather subsequences) of length $ T_b $, $ Y_{T_b} = \{y_t\}_{t=t_0}^{t_0+T_b-1} $ starting at some (random) time index $ t_0 $.
Note that this is an approximation that ignores the temporal structure of the data and ignores effects coming from observations and transitions before and after the minibatch, which is discussed in more detail in \cite{aicher_stochastic_2019}.
Nevertheless, minibatching nicely integrates with the sampling step in Eq.~\eqref{eq:objective_sampling}, where we can draw a new minibatch for every sample from $  q_\psi(F_M) $, resulting in a new approximation,
\begin{equation}\label{eq:objective_minibatch}
	\int q_\psi(F_M) \log \widetilde{p}_\theta(Y_T \mid F_M) dF_M \approx
	\frac{T}{N T_b}\sum_{n=1}^N \log \widetilde{p}_\theta(Y_{T_b}^{(n)} \mid F_M^{(n)}), \quad F_M^{(n)} \sim q_\psi(F_M).
\end{equation}

On a more technical note, we find that when using minibatches, we do not need a buffer to reduce the effect of the initial distribution as suggested in \cite{aicher_stochastic_2019} and e.g.~used in \cite{longi_traversing_2021}.
This might be the case since we usually normalize the data and use an uninformative initial distribution such as $ p(x_0) = \gauss{x_0}{0}{1} $, such that the Laplace approximation can flexibly find a value for $ \hat{x}_0 $ (or $ \hat{x}_{t_0-1} $ for the minibatches) that has high mass under the initial distribution.

\paragraph{Multi-dimensional latent states}
For challenging problems a one-dimensional latent state is typically not expressive enough \citep{frigola_bayesian_2015} and we require multi-dimensional latent states $ x_t $.
This is problematic in the transition model [Eq.~\eqref{eq:fitc_trans}], where the mean and the covariance of the GP appear, both of which are one-dimensional.
We follow the literature \citep[e.g.][]{doerr_probabilistic_2018,ialongo_overcoming_2019} and choose independent (sparse) GPs for each dimension of the latent state resulting in
\begin{equation*}
	p_\theta(x_t\mid x_{t-1}, F_M^{d_x})
	= \prod_{d=1}^{d_x} \gauss{x_t^{(d)}}{x_{t-1}^{(d)} + \mu^{(d)}(x_{t-1}, F_M^{(d)})}{Q_d + \Sigma^{(d)}(x_{t-1})},
\end{equation*}
where $ F_M^{d_x} = \{F_M^{(d)}\}_{d=1}^{d_x} $ is the collection of all inducing outputs, $ x_t^{(d)}  $ is the $ d $-th dimension of the latent state and $ \mu^{(d)} $ and $ \Sigma^{(d)} $ are the mean and covariance of the sparse GP responsible for the $ d $-th dimension, respectively [cf.~Eqs.~\eqref{eq:gp_mean} and \eqref{eq:gp_cov}].
In order to reduce non-identifiabilities and to simplify training, we choose a diagonal transition noise variance, $ Q = \text{diag}(\{Q_d\}_{d=1}^{d_x}) $, the same inducing points $ X_M $ and kernel hyperparameters for all GPs, thus only leaving the variational distributions $ q^{(d)}_{\psi_d}(F_M^{(d)}) $ as the distinguishing property of the different dimensions of the transition function.

\paragraph{Control inputs}
Lastly, time series prediction problems often come with an additional time series $ U_T = \{u_t\}_{t=1}^T $, $ u_t \in \mathbb{R}^{d_u} $ of control inputs that are applied at time index $ t $ and change the behavior of the system.
In GPSSMs these are modeled as additional (constant) inputs to the GP which are simply concatenated with the latent states $ x_t $ at the same time index.
This leads only to two small changes in the algorithm:
The kernels of the GPs have to accept input pairs coming from $ \mathbb{R}^{d_x +d_u} $ and the inducing points $ X_M $ also have to live in that higher dimensional space.
%\jakob{Summarize parameters of the model again, or is this too much?}

\paragraph{Optimizing the latent states}
In order to obtain the mode $ \hat{X}_{T_0} $ required for the Laplace approximation, we have to perform an optimization in the inner loop of our algorithm (Alg.~\ref{algorithm}).
This requires some fine-tuning to get a numerically stable algorithm:
First, since this is a non-stochastic problem, we use an approximate second-order optimizer for fast convergence and choose the BFGS optimizer \citep[see e.g.][]{fletcher_practical_1987}.
Second, while it is generally no longer considered as state of the art to use the Adam optimizer on all parameters including the variational ones for optimizing models involving sparse GPs \citep[see e.g.][]{salimbeni_natural_2018, adam_dual_2021}, we also empirically found that this approach leads to the BFGS optimizer struggling to find good optima (or even not converging at all).
We therefore opted for the following two strategies: we use the BFGS optimizer to find the mode in the inner loop and then either (i) use the Natural gradient optimizer \citep{salimbeni_natural_2018} on the variational parameter $ \psi $ and the Adam optimizer on all other model parameters, or (ii) use a diagonal covariance for the variational distribution $ q_\psi(F_M) $ and optimize all parameters with the Adam optimizer.

We consider the optimization for finding the mode $ \hat{X}_{T_0}^{(n)} $ (see Alg.~\ref{algorithm}) as failed if (i) the maximum number of iterations has been reached without converging within the tolerance, (ii) the determinant of the Hessian [required for Eq.~\eqref{eq:vi_laplace}] is negative, or (iii) some of the optimal latent states $ \hat{x}_t $ converge to positions far outside of the region where there are inducing points $ X_M $ (which is justified when normalizing the data as we do).
In all cases we simply ignore the contributions of these terms to the optimization objective by removing the $ n $-th term from the sum in Eq.~\eqref{eq:objective_sampling}.
Typically this leads to the removal of well below $ 1\% $ of the optimization runs.

\section{Experimental details}
\subsection{Kink}
\label{sec:appx_exp_detail_kink}
\paragraph{Training data}
For each of the ten different seeds, we generate a (different, noise dependent) sequence of latent states $ X_T $ according to the description in Sec.~\ref{sec:kink}.
Then we add (seed dependent) zero-mean i.i.d.\ Gaussian noise with emission noise variances $ \sigma_y^2 \in \{0.008,0.08,0.8\} $ to each of the sequences of latent states, thus generating $ Y_T $, our training data,
resulting in 30 time series in total.
See also Fig.~\ref{fig:appx_kink_series} for a visualization of the data.

\paragraph{VCDT model}
For the experiment in Sec.~\ref{sec:kink}, the VCDT model from \cite{ialongo_overcoming_2019} uses the same emission model, $ p(y_t \mid x_t) = \gauss{y_t}{x_t}{\sigma_y^2} $, which is fixed to the ground truth.
The initial distribution is chosen slightly more flexible, $ p(x_0) = \gauss{x_0}{m_0}{\sigma_0} $, with learnable parameters $ m_0$ and $\sigma_0 $.
This is important for this model as the sequential sampling in the inference requires a well tuned starting point.
Another difference lies in the GP contribution to the transition function.
Whereas we use a sparse GP with the FITC approximation, resulting in Eq.~\eqref{eq:fitc_trans}, \cite{ialongo_overcoming_2019} use a proper prior on the GP and inducing outputs, resulting in the standard GP transition model $ p(x_t \mid x_{t-1}) = \gauss{x_t}{f_{t-1}}{Q} $, where $ f_{t-1} = f(x_{t-1}) $.
In this experiment a zero-mean GP prior is placed on the function $ f $, as was done in the original work.
See also Appx.~\ref{sec:appx_proof} for a discussion on the role of the FITC prior.

\paragraph{VCDT training}
For the training of the VCDT model we took the code provided in \cite{ialongo_overcoming_2019} and contacted the authors for the full experimental details.
Taking their suggestions into account and after additional fine-tuning, we fix the inducing points to 50 points in the range of $ [-4,2] $ and take an RBF kernel for the GP, where we initialize the trainable variance and lengthscale to 2.
We leave all other model parameters at their standard initialization values except for the Cholesky factor of the covariance matrix of the variational distribution and the transition noise which we scale by a factor of 0.1 and 0.01, respectively.
For training we set the number of samples to be drawn from the posterior to 100 and take the Adam optimizer \citep{kingma_adam:_2015} using a learning rate of $ 10^{-3} $ for $ 10^4 $ iterations.

\paragraph{Laplace training}
We use the same kernel and inducing point settings as for the VCDT model.
We initialize the means of the variational distribution to zero-mean Gaussian noise with standard deviation $ 10^{-3} $ and the Cholesky factor of its covariance matrix to $ 10^{-5} $ times that of the prior.
The transition noise variance is initialized to $ 0.0025 $.
For the training we use $ N=10 $ samples from the variational posterior and a minibatch size of $ T_b = 30 $.
We optimize the parameters using a combination of the Adam optimizer \citep{kingma_adam:_2015} and the Natural gradient optimizer \citep{salimbeni_natural_2018} provided by GPflow \citep{matthews_gpflow:_2017}:
The Natural gradient optimizer is used on the variational parameters with an initial learning rate of $ 0.004 $ and the Adam optimizer on all other parameters with an initial learning rate of $ 0.0008 $.
We train for 7000 iterations where we scale both learning rates with a factor of $ 0.99 $ every 500 steps.
For the first 1000 iterations we scale the KL-divergence in our optimization objective [Eq.~\eqref{eq:objective}] by a factor which we initialize at $ 0 $ and linearly increase to $ 1 $ over these iterations.
We found this to help learning the sparse GP initially, which in turn improved the optimization behavior for finding the mode required for the Laplace approximation.
For finding this mode, we initialize $ X_{T_0} $ to the observations $ Y_T $ adding zero-mean Gaussian noise with standard deviation $ 10^{-3} $.
We set the parameters of the \textit{bfgs\_minimize} function provided by TensorFlow-Probability to 500 for \textit{max\_iterations} and to $ 10^{-8} $ for \textit{tolerance}, counting runs as not converged if this (gradient norm) tolerance is not achieved within the 500 iterations.

\paragraph{Evaluation for Tab.~\ref{tab:kink}}
For the evaluation of the (train) log-likelihoods we proceed as follows:
We take $ J = 70 $ linearly spaced points $ \{x_j\}_{j=1}^J $ between the minimum and maximum latent state of the ground truth time series $ X_{T_0} $ (different for every seed but not for different $ \sigma_y^2 $), since this is the region in which the GP should be able to reasonably learn the ground truth.
For each of the trained sparse GPs we obtain (marginal) predictions for each of these test points $ x_j $, i.e., mean and variance predictions $ \mu(x_j) $ and $ \Sigma(x_j) $, respectively, using Eqs.~\eqref{eq:gp_mean} and \eqref{eq:gp_cov}.
These are then compared with the ground truth, i.e., the \textit{kink} function evaluated at these locations, $ f_k(x_j) $, via the average log-likelihood LL,
\begin{equation*}
	\text{LL} = \frac{1}{J} \sum_{j=1}^{J} \log \gauss{f_k(x_j)}{\mu(x_j)}{\Sigma(x_j)}.
\end{equation*} 

\paragraph{Evaluation for Tab.~\ref{tab:convergence}}
\rebuttal{For tracking the convergence of the parameters of $ q(x_{t+1} \mid x_{t},F_M) $ of the VCDT method, which is given as}
\begin{equation} \label{eq:appx_vcdt_transition}
	q(x_{t+1} \mid x_{t},F_M) = \gauss{x_{t+1}}{A_t\mu(x_t,F_M) + b_t}{S_t +A_t\Sigma(x_t)A_t^\top},
\end{equation}
\rebuttal{where $ \mu(x_t,F_M) $ and $ \Sigma(x_t) $ are as in Eqs.~\eqref{eq:gp_mean} and \eqref{eq:gp_cov}, respectively, we proceed as follows:
	For all of the ten randomly generated \textit{kink} data sets with $ T=120 $ and $ \sigma_y^2=0.08 $ we train the VCDT method and record the values $ A_t^i $, $ b_t^i $, and $ S_t^i $ for $ t\in\{0,40,80,120\} $ and for every tenth iteration $ i $ until $ I =10000 $.
	As expected, the variance parameter $S_t$ decreases towards the end of the optimization, indicating that the GP model can explain larger parts of the data and less transition noise is needed.
	The parameters $A_t$ and $b_t$ converge to some one-dimensional value (due to the one-dimensional latent state chosen for the \textit{kink} data set) that is in general different for every time point $t$ and depends on the realization of the toy data set.
	We define the parameters to be converged when they change only very little until the end of the optimization. 
	More precisely, we define the iteration at which the parameters are converged as}
\begin{equation*}
	I_t^{\text{(con)}} = \max \bigl\{ i : \lvert A_t^i-A_t^I \rvert > 0.03 \bigr\} + 10,
\end{equation*}
\rebuttal{and equivalently for $ b_t $.
	The values reported in Tab.~\ref{tab:convergence} are then the parameters $ A_t^{I_t^\text{(con)}} $ and $ b_t^{I_t^\text{(con)}} $ averaged over the ten different realizations.}

\subsection{System identification}
\label{sec:appx_exp_detail_sys_ident}
\paragraph{Training data}
We prepare the data as follows:
First, we normalize the data (observations and control inputs) to have zero mean and unit variance.
Then we prepare the ten splits of the data:
These are deterministic and depend on the total length of the time series $ T_\text{tot} $, the length of the training sequence $ T_\text{train} $ (which we always choose to be $ T_\text{tot}/2 $) and the length of the test sequence $ T_\text{test} $ on which we want to predict, which we choose to be 120.
For the first data split, the training sequence starts with the first data point in the time series, while for the last data split the test sequence ends with the last data point (meaning, the training sequence starts with data point $ T_\text{tot} - T_\text{tot}/2- T_\text{test} = T_\text{tot}/2- 120$).
All other 8 data splits are chosen such that the initial points of the training sequence are linearly spaced between the extreme cases described above, thus maximizing the diversity between the training tasks.

\paragraph{Laplace training}
We use an RBF kernel with automatic relevance determination and initialize its lengthscales to $ \sqrt{3} $ and its variance to $ 0.5^2 $.
We choose $ M=100 $ trainable inducing points and initialize them with a Sobol sequence in the hypercube of dimension $ d_x + d_u = 3 $ centered around the origin with side lengths equal to the largest value $ \vert y_t\vert  $ in the (normalized) training sequence.
We initialize the means of the variational distributions to zero-mean Gaussian noise with standard deviation $ 5 \times 10^{-3} $ and choose diagonal covariance matrices which we initialize to $ 10^{-6} $ times the identity matrix.
The transition noise variance $ Q $ is initialized to $ 0.01 $ and the trainable parameters of the emission model to $ b=0 $ and $ \Omega = 0.005 $.
For the training we use $ N=10 $ samples from the variational posterior and a minibatch size of $ T_b = 30 $.
As a pretraining step that we found helpful to get the sparse GP and the inner optimization to ``interact nicely with each other'', we train all parameters for 50 iterations with the Adam optimizer and a learning rate of $ 0.003 $.
During this pretraining we scale the KL-divergence in our optimization objective [Eq.~\eqref{eq:objective}] by a factor which we initialize at $ 0 $ and linearly increase to $ 1 $, while exponentially scaling the parameter $ Q $ down to $ 0.001 $.
Then, we optimize all parameters using the Adam optimizer with an initial (dataset dependent) learning rate of $ 0.2/T_\text{tot} $.
We train for 20000 iterations where we scale the learning rate with a factor of $ 0.95 $ every 500 steps.
For finding the mode in the inner optimization, we initialize the first dimension of $ X_{T_0} $ to the observations $ Y_T $ adding zero-mean Gaussian noise with standard deviation $ 10^{-3} $, and we initialize the second dimension to zero-mean Gaussian noise with standard deviation $ 10^{-1} $.
We set the parameters of the \textit{bfgs\_minimize} function provided by TensorFlow-Probability to 500 for \textit{max\_iterations} and to $ 2\times 10^{-7} $ for \textit{f\_relative\_tolerance}, counting runs as not converged if this tolerance (on the relative change of the objective value) is not achieved within the 500 iterations.

\paragraph{VCDT model}
For the VCDT model, we use the one employed in \cite{ialongo_overcoming_2019}, i.e., a four-dimensional latent state $ x_t $ with initial distribution $ p(x_0) = \gauss{x_0}{m_0}{\sigma_0} $, with learnable parameters $ m_0$ and $\sigma_0 $, and the GP transition model from Eq.~\eqref{eq:gpssm_trans}, using a fixed linear mean function (residual).
The emission model is the same that we also use.

\paragraph{VCDT training}
For the training of the VCDT model we took the code provided in \cite{ialongo_overcoming_2019} and, after contact with the authors, employed similar settings as in \cite{doerr_probabilistic_2018} for the following hyperparameters:
We use an RBF kernel with automatic relevance determination and initialize its lengthscales to $ 2 $ and its variance to $ 0.5^2 $.
We choose $ M=100 $ trainable inducing points of dimensionality $ d_x + d_u = 5 $ and initialize them uniformly in the range $ [-2,2] $.
We initialize the means of the variational distributions to zero-mean Gaussian noise with standard deviation $ 0.05 $ and initialize the covariance matrices to $ 10^{-4} $ times the identity matrix.
The transition noise variance $ Q $ is initialized to $ 0.002^2 $, $ \Omega$ to $ 10^{-4} $, and $ \sigma_0 $ to $ 10^{-2} $.
For the training we use 100 latent state samples and the Adam optimizer with an initial learning rate of $ 0.01 $ running for 20000 iterations, where the learning rate is scaled by a factor of $ 0.98 $ every 500 iterations.
For all other settings, we employed the default values provided in the code of \cite{ialongo_overcoming_2019}.

\paragraph{PRSSM model and training}
For the PRSSM model we take the exact model used in \cite{doerr_probabilistic_2018} and use the provided code for training, leaving the recommended settings unchanged.
The only difference is the new data split in training and test sequence as explained above.

\paragraph{Evaluation}
For the evaluation all models are only provided with the test control inputs for $ T_\text{pred} = 120 $ time points after the end of the training series and with those have to predict the future observations $ Y_{T_\text{pred}} $.
This is done by getting the latent state at the end of the training sequence that then becomes $ x_0 $ for the test sequence.
Starting from this, each trained model can then recursively sample future observations from the prior model [Eq.~\eqref{eq:gpssm_joint}, with trained parameters $ \theta $].
We store the means $ \mu_t $ and standard deviations $ \sigma_t $ over multiple recursive sampling runs at each time point $ t=t_0,\dots, t_0 +T_\text{pred} $, where $ t_0 $ is the initial time index of the test sequence.
The way in which $ x_0 $ is obtained and the recursive sampling is performed differs slightly between the models due to the different inference schemes.
For PRSSM we simply follow the procedure outlined in the code.
For VCDT we take $ x_0 $ to be the last state obtained in the last iteration of training, together with the sample from $ q(F_M) $ that was used to obtain it. See \cite{ialongo_overcoming_2019} for more information about the inference scheme. There are 100 such pairs of $ x_0 $ and samples from $ q(F_M) $, as the inference scheme requires multiple latent state samples per iteration. The predictions obtained by recursively sampling from the model with these values are then averaged to obtain $ \mu_t $ and $ \sigma_t $.
For our method we obtain the $ x_0 $ from the converged model as follows:
We take the subsequence of $ Y_T $ of length $ T_b $ that ends at the last observation of the training sequence and obtain $ N $ samples from $ q(F_M) $.
For each of these samples we run the inner loop of Alg.~\ref{algorithm} thus providing us with $ N $ optimal latent state sequences $ \hat{X}_{T_0}^{(n)} $ from which we take the last state as an estimate for $ x_0 $ for the prediction task.
We use the same $ N $ samples from $ q(F_M) $ and, for each of them, sample $ S = 100 $ times from the prior model, thus providing an average over $ SN $ samples, yielding $ \mu_t $ and $ \sigma_t $.
The log-likelihood is then evaluated as
\begin{equation*}
	\text{LL} = \frac{1}{T_\text{test}} \sum_{t=t_0}^{t_0+T_\text{test}-1} \log \gauss{y_t}{\mu_t}{\sigma_t^2}
\end{equation*}
and the RMSE as
\begin{equation*}
	\text{RMSE}^2 = \frac{1}{T_\text{test}} \sum_{t=t_0}^{t_0+T_\text{test}-1} (y_t-\mu_t)^2,
\end{equation*}
for $ T_\text{test} \in \{30,60,90,120\} $, respectively.
Both these quantities are calculated on the unnormalized data, i.e., after reversing the normalization for the predictions.


\section{Additional experiments}
\label{sec:appx_more_exp}

\begin{figure}
	\centering
	\includegraphics[width=0.9\textwidth]{kink_time_series.pdf}
	\caption{Visualization of the time series created by the \textit{kink} transition function in Sec.~\ref{sec:kink}. Shown are the latent states $ x_t $ in gray and the observations $ y_t $ in black for three different emission noise variances $ \sigma_y^2 \in \{0.008, 0.08, 0.8\} $ (top to bottom). Note that the underlying latent states are the same in all plots.}
	\label{fig:appx_kink_series}
\end{figure}

\begin{table}
	\begin{center}
		\caption{Comparison of average mean predictive log-likelihoods (higher is better) and their standard errors for ten repetitions on five different benchmark datasets.
			We evaluate our model (Laplace), VCDT \citep{ialongo_overcoming_2019} and PRSSM \citep{doerr_probabilistic_2018} on predictions for $ T \in \{30,60,90,120\} $ steps in the future. 
			See Appx.~\ref{sec:appx_exp_detail_sys_ident} for more details and Fig.~\ref{fig:main-comp} for a visualization.
		}
		\label{tab:main-comp}
		\begin{tabular}{ c | c | c c c c }
			\toprule
			\textsc{Dataset}  & \textsc{Model} & \textsc{T=30} & \textsc{T=60} & \textsc{T=90} & \textsc{T=120} 
			\tabularnewline
			\midrule
			
			\multirow{3}{*}{ \textsc{Actuator}} 
			& \textsc{Laplace}  & -0.6(0.2) &	-0.8(0.2) &	-1.7(0.7)&	-1.7(0.5) \tabularnewline
			& \textsc{VCDT}  &  -0.6(0.3)&	-0.9(0.3)&	-1.0(0.3)&	-1.0(0.2) \tabularnewline
			& \textsc{PRSSM}  & -1.0(0.3)&	-0.8(0.2)&	-0.8(0.1)&	-0.8(0.2)  \tabularnewline
			
			\midrule
			%\midrule
			
			\multirow{3}{*}{ \textsc{Ballbeam} }
			& \textsc{Laplace}  &	1.6(0.4)&	1.2(0.3)&	\text{0.9(0.3)}&	\text{1.0(0.2)}	\tabularnewline
			& \textsc{VCDT}  & 2.5(0.7)&	0.0(1.6)&	-2.2(2.2)&	-2.2(2.0) \tabularnewline
			& \textsc{PRSSM}  & {-1.8(1.4)}&	-1.6(0.9)&	-2.7(1.4)&	-2.4(1.5)  \tabularnewline
			
			\midrule
			%\midrule
			
			\multirow{3}{*}{ \textsc{Drive}}
			& \textsc{Laplace}  &-1.7(0.5)&	-1.5(0.3)&	-1.4(0.2)&	-1.4(0.1) \tabularnewline
			& \textsc{VCDT}  & -1.2(0.1)&	-1.2(0.0)& %\textbf{-1.200(0.014)}&	\textbf{-1.195(0.011)} \tabularnewline
			\text{-1.2(0.0)}&	\text{-1.2(0.0)} \tabularnewline
			& \textsc{PRSSM}  &{-4.8(1.6)}&	{-3.9(1.0)}&	{-3.4(0.6)}&	{-3.3(0.5)}  \tabularnewline
			
			\midrule
			%\midrule
			
			\multirow{3}{*}{ \textsc{Dryer}} 
			& \textsc{Laplace}  & -0.3(0.1)&	-0.2(0.1)&	-0.3(0.1)&	-0.3(0.1)\tabularnewline
			& \textsc{VCDT}  & {-3.2(0.6)}&{-3.2(0.5)}&	{-2.8(0.4)}&	{-2.9(0.4)} \tabularnewline
			& \textsc{PRSSM}  & -0.9(0.5)&	-0.3(0.2)&	-0.1(0.2)& -0.0(0.2) \tabularnewline
			
			\midrule
			%\midrule
			
			\multirow{3}{*}{ 
				\shortstack[c]{\textsc{Furnace}}}
			& \textsc{Laplace}  &\text{-2.2(0.1)}&	\text{-2.4(0.1)}&	\text{-2.5(0.1)}&	\text{-2.6(0.1)} \tabularnewline
			& \textsc{VCDT}  &  -14.0(1.5)&	{-14.2(0.9)}&	{-15.6(0.8)} & {-21.2(1.0)} \tabularnewline
			& \textsc{PRSSM}  & -10.1(2.9)&	-8.5(2.5)&	-8.4(2.6)&	-11.8(3.6)		\tabularnewline
			
			\bottomrule
			
		\end{tabular}
	\end{center}
\end{table}

\begin{table}
	\begin{center}
		\caption{Comparison of average RMSEs (lower is better) and their standard errors for ten repetitions on five different benchmark datasets.
			We evaluate our model (Laplace), VCDT \citep{ialongo_overcoming_2019} and PRSSM \citep{doerr_probabilistic_2018} on predictions for $ T \in \{30,60,90,120\} $ steps in the future. See Appx.~\ref{sec:appx_exp_detail_sys_ident} for more details.}
		\label{tab:appx-comp-rmse}
		\begin{tabular}{ c | c | c c c c }
			\toprule
			\textsc{Dataset}  & \textsc{Model} & \textsc{T=30} & \textsc{T=60} & \textsc{T=90} & \textsc{T=120} 
			\tabularnewline
			\midrule
			
			\multirow{3}{*}{ \textsc{Actuator}} 
			& \textsc{Laplace}  & 0.39(0.20)&	0.42(0.18)&	0.52(0.24)&	0.55(0.23) \tabularnewline
			& \textsc{VCDT}  &  0.40(0.06)&	0.45(0.07)&	0.52(0.07)&	0.54(0.07) \tabularnewline
			& \textsc{PRSSM}  & 0.53(0.09)&	0.50(0.08)&	0.53(0.05)&	0.52(0.05)  \tabularnewline
			
			\midrule
			%\midrule
			
			\multirow{3}{*}{ \textsc{Ballbeam} }
			& \textsc{Laplace}  &	0.05(0.03)&	0.07(0.05)&	0.08(0.05)&	0.08(0.05) \tabularnewline
			& \textsc{VCDT}  & 0.03(0.01)&	0.07(0.03)&	0.11(0.04)&	0.14(0.04) \tabularnewline
			& \textsc{PRSSM}  & 0.06(0.01)&	0.06(0.01)&	0.07(0.01)&	0.07(0.01)  \tabularnewline
			
			\midrule
			%\midrule
			
			\multirow{3}{*}{ \textsc{Drive}}
			& \textsc{Laplace}  & 0.90(0.21)&	0.88(0.11)&	0.90(0.11)&	0.94(0.20) \tabularnewline
			& \textsc{VCDT}  & 0.76(0.03)&	0.77(0.01)&	0.77(0.01)&	0.77(0.01) \tabularnewline
			& \textsc{PRSSM}  & 0.58(0.07)&	0.58(0.06)&	0.56(0.04)&	0.55(0.03)  \tabularnewline
			
			\midrule
			%\midrule
			
			\multirow{3}{*}{ \textsc{Dryer}} 
			& \textsc{Laplace}  & 0.31(0.07)&	0.33(0.08)&	0.39(0.24)&	0.58(0.55)\tabularnewline
			& \textsc{VCDT}  &  0.22(0.01)&	0.22(0.01)&	0.21(0.01)&	0.22(0.01) \tabularnewline
			& \textsc{PRSSM}  & 0.31(0.07)&	0.26(0.06)&	0.24(0.06)&	0.23(0.06) \tabularnewline
			
			\midrule
			%\midrule
			
			\multirow{3}{*}{ 
				\shortstack[c]{\textsc{Furnace}}}
			& \textsc{Laplace}  &2.15(1.05)&	2.82(1.36)&	3.48(1.64)&	3.60(1.80) \tabularnewline
			& \textsc{VCDT}  &  1.07(0.05)&	1.11(0.02)&	1.15(0.01)&	1.33(0.03) \tabularnewline
			& \textsc{PRSSM}  & 1.40(0.30)&	1.75(0.55)&	2.11(0.82)&	2.34(0.86)		\tabularnewline
			
			\bottomrule
			
		\end{tabular}
	\end{center}
\end{table}

\begin{figure}
	\centering
	\includegraphics[width=0.9\textwidth]{sys_ident_pred_summary.pdf}
	\caption{
		Exemplary figure showing the differences in predictions for the three different methods on the system identification data sets.
		Shown are predictions for the same seeds on the \textit{Actuator}, \textit{Ballbeam}, \textit{Drive}, \textit{Dryer}, and \textit{Gas Furnace} datasets in the different rows (top to bottom). From left to right are the predictions for our method, VCDT \citep{ialongo_overcoming_2019}, PRSSM \citep{doerr_probabilistic_2018}, and the control inputs (all unnormalized).}
	\label{fig:appx_sys_ident_pred}
\end{figure}
%\end{strip}
\bibliography{references}

\end{document}
