% \documentclass{uai2024} % for initial submission
\documentclass[accepted]{uai2024} % after acceptance, for a revised version; 
% also before submission to see how the non-anonymous paper would look like 
                        
%% There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2024} % ptmx math instead of Computer
                                         % Modern (has noticeable issues)
% \documentclass[mathfont=newtx]{uai2024} % newtx fonts (improves upon
                                          % ptmx; less tested, no support)
% NOTE: Only keep *one* line above as appropriate, as it will be replaced
%       automatically for papers to be published. Do not make any other
%       change above this note for an accepted version.

%% Choose your variant of English; be consistent
% \usepackage[american]{babel}
\usepackage[british]{babel}

%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} % amsmath with fixes and additions
% \usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{tikz} % nice language for creating drawings and diagrams
\usepackage[ruled]{algorithm2e}

\usepackage{amsmath, amssymb, amsthm, amsfonts} 
\usepackage{bm} % bold symbols
\usepackage{bbm}
\usepackage{multirow, array}
\newcolumntype{C}{>{\displaystyle}c}
\usepackage{hyperref}

\newcommand{\bx}{\mathbf{x}}
\newcommand{\by}{\mathbf{y}}
\newcommand{\bs}{\mathbf{s}}
\newcommand{\bK}{\mathbf{K}}
\newcommand{\balpha}{\boldsymbol{\alpha}}
\newcommand{\bt}{\Delta t}
\newcommand{\bth}{\bm{\theta}}
\newcommand{\bTh}{\bm{\Theta}}
\newcommand{\bpsi}{\bm{\psi}}
% \newcommand{\sig}{\mathrm{Sig}}
\newcommand{\hatsig}{\hat{\mathrm{Sig}}}
\newcommand{\kxy}{k_{x,y}}
\renewcommand{\d}{\mathrm{d}}
\newcommand{\mesh}{\mathrm{mesh}(0,T)}
\newcommand{\Ds}{\mcal{D}_{\textsc{s}}}
\newcommand{\Dm}{\mcal{D}_{\textsc{m}}}

\setlength\parindent{0pt} %% Do not touch this


\newtheorem{Theorem}{Theorem}
\newtheorem{Proposition}{Proposition}
\newtheorem{Definition}{Definition}
\newtheorem{Remark}{Remark}
\newtheorem{Lemma}{Lemma}
\newtheorem{Example}{Example}
\newtheorem{Corollary}{Corollary}

\usepackage[acronym,nowarn,section,%nogroupskip,
nonumberlist,shortcuts]{glossaries}
\glsdisablehyper
\newacronym{abc}{ABC}{approximate Bayesian computation}
\newacronym{lfi}{LFI}{likelihood-free inference}
\newacronym{gbi}{GBI}{generalised Bayesian inference}
\newacronym{saabc}{SA-ABC}{semi-automatic ABC}
\newacronym{mmd}{MMD}{maximum mean discrepancy}
\newacronym{mcmc}{MCMC}{Markov chain Monte Carlo}
\newacronym{skrr}{SR-ABC}{Signature regression ABC}
\newacronym{swd}{SWD}{sliced Wasserstein distance}
\newacronym{k2abc}{K2-ABC}{double kernel ABC}
\newacronym{rej}{REJ-ABC}{rejection ABC}
\newacronym{gbm}{GBM}{geometric Brownian motion}
\newacronym{mh}{MH}{Metropolis-Hastings}
\newacronym{ma2}{MA(2)}{moving average model of order 2}
\newacronym{wass}{Wass}{Wasserstein distance}
\newacronym{sabc}{S-ABC}{Signature ABC}
\newacronym{iid}{iid}{independent, identically distributed}
\newacronym{pmcmc}{pMCMC}{particle Markov chain Monte Carlo}
\newacronym{rkhs}{RKHS}{reproducing kernel Hilbert space}
\newacronym{wl}{WL}{Weisfeiler-Lehman}
\newglossaryentry{cde}
{
  name={CDE},
  description={controlled differential equation},
  first={\glsentrydesc{cde} (\glsentrytext{cde})},
  plural={CDEs},
  descriptionplural={controlled differential equations},
  firstplural={\glsentrydescplural{cde} (\glsentryplural{cde})}
} 
\newglossaryentry{sde}
{
  name={SDE},
  description={stochastic differential equation},
  first={\glsentrydesc{sde} (\glsentrytext{sde})},
  plural={SDEs},
  descriptionplural={stochastic differential equations},
  firstplural={\glsentrydescplural{sde} (\glsentryplural{sde})}
} 
\newcommand{\bd}[1]{\mathbf{#1}}
\newcommand{\bv}[1]{BV_{\left[0,T\right]}(#1)}
\newcommand{\Pl}[1]{\mathcal{P}_{\left[0,T\right]}(#1)}
\newcommand{\mcal}[1]{\mathcal{#1}}
\newcommand{\sig}[1]{\mathrm{Sig}(#1)}
\newcommand{\kap}[1]{\kappa(#1, \cdot)}
\newcommand{\onevar}[1]{\| #1 \|_{\mathrm{1-var}}}
\newcommand{\norm}[2]{\left\lVert#1\right\rVert_{#2}}
% \newcommand{\norm}[2]{\| #1 \|_{#2}}
\newcommand{\pH}{\prod_{m \geq 0} \mcal{H}^{\otimes m}}
\newcommand{\ip}[3]{\left\langle #1, #2 \right\rangle_{#3}}

%% Provided macros
% \smaller: Because the class footnote size is essentially LaTeX's \small,
%           redefining \footnotesize, we provide the original \footnotesize
%           using this macro.
%           (Use only sparingly, e.g., in drawings, as it is quite small.)

%% Self-defined macros
\newcommand{\swap}[3][-]{#3#1#2} % just an example

\title{Approximate Bayesian Computation with Path Signatures}

% The standard author block has changed for UAI 2024 to provide
% more space for long author lists and allow for complex affiliations
%
% All author information is automatically removed by the class for the
% anonymous submission version of your paper, so you can already add your
% information below.
%
% Add authors
\author[1,2]{\href{mailto:<joel.dyer@cs.ox.ac.uk>?Subject=Your UAI 2024 paper}{Joel Dyer}{}}
\author[3]{Patrick Cannon}
\author[3]{Sebastian M. Schmon}
% Add affiliations after the authors
\affil[1]{%
    Department of Computer Science\\
    University of Oxford
}
\affil[2]{%
    Institute for New Economic Thinking\\
    University of Oxford
}
\affil[3]{%
    No affiliation
  }
  
  \begin{document}
\maketitle

\begin{abstract}
    Simulation models often lack tractable likelihood functions, making likelihood-free inference methods indispensable. Approximate Bayesian computation generates likelihood-free posterior samples by comparing simulated and observed data through some distance measure, but existing approaches are often poorly suited to time series simulators, for example due to an independent and identically distributed data assumption. In this paper, we propose to use path signatures in approximate Bayesian computation to handle the sequential nature of time series. We provide theoretical guarantees on the resultant posteriors and demonstrate competitive Bayesian parameter inference for simulators generating univariate, multivariate, and irregularly spaced sequences of non-\textit{iid} data.
\end{abstract}

\section{Introduction}\label{sec:intro}
Simulation models are an increasingly popular tool in a broad range of scientific disciplines including cosmology \citep{Alsing_2018}, economics \citep{Geanakoplos2012}, and the biological sciences \citep{Christensen2015}. A drawback of such models is that, while they are straightforward to sample from, their complexity typically does not allow for explicit evaluation of the associated likelihood function. Consequently, traditional approaches to statistical inference are infeasible and alternative \gls{lfi} methods are usually adopted.

Many \gls{lfi} approaches have been proposed. One of the most widely used \gls{lfi} methods is \gls{abc} \citep{tavare1997inferring, pritchard1999population, beaumont2002approximate}, in which the Bayesian posterior distribution is approximated by sampling parameters $\bth$ from a prior distribution and synthetic datasets $\bx$ from a stochastic simulator -- with likelihood denoted $p(\bx \mid \bth)$ -- and comparing the output $\bx$ with real data $\by$. If the simulator output is  sufficiently `close' to the observation, then $\bth$ is retained as a sample from the approximate posterior distribution; otherwise, it is discarded.

However, measuring closeness between model outputs is known to be challenging. This is particularly the case for time series data, which can exhibit complex dependency structures and may be multivariate and sampled at irregular time intervals. 
A common approach is to attempt to distil important features of the data using summary statistics and compare these instead \citep[see e.g.][]{prangle2018handbook}.
In practice, informative summary statistics are difficult to craft, which presents a trade-off: a poor choice can materially degrade \gls{abc}-based posterior approximations, yet constructing a sufficiently powerful choice can require substantial domain expertise, problem insight, and costly experimentation \citep[see e.g.][for a recent comparison of methods with and without summaries]{drovandi2021comparison}.

In other approaches the engineering of summary statistics is bypassed altogether in favour of distances on the full dataset \citep[e.g.][]{Park2016, jiang2018approximate, Bernton2019, nguyen2020approximate}. 
However, in many such cases the focus is on \textit{iid} data, with non-\textit{iid} or sequential data appearing as an afterthought. The result of this is that there is a scarcity of automatic approaches to performing approximate Bayesian inference for generic dynamic, stochastic simulation models in the \gls{abc} literature. 
Developing automatic approaches to \gls{abc} that are more tailored to simulators generating sequences of dependent points will thus increase the ease with which \gls{abc} methods can be deployed in a broader range of real-world inference settings. 

In response to this challenge, we present here two novel methods for performing \gls{abc} for time series models that bypass the difficult problem of manually constructing summary statistics for sequential data. Our approach leverages so-called \emph{path signatures}, a key object in the mathematics of rough path theory and the theory of controlled differential equations \citep[see e.g.][]{LyonsT.J2007Dedb, lyons2014rough}, to generate \gls{abc} schemes that place sequential data at centre stage.
Signatures have been employed successfully in a variety of machine learning tasks \citep[see, e.g.,][]{Li2017, Moore2019}, and constitute a natural feature set for multivariate and even irregularly sampled sequential data \citep{salvi2020computing}.
We demonstrate that the path signature can be employed in two different ways to construct useful distance measures for time series data in \gls{abc}: either directly as a summary statistic, or in the context of a regression-based semi-automatic \gls{abc} approach. We further show that such approaches can recover more accurate posteriors than existing techniques.

\section{Background}\label{sec:back}

In this section, we recapitulate some standard approaches to \gls{abc} with an emphasis on time series data, and provide an overview of path signatures. Appendix \ref{app:signatures} expands on this introduction to path signatures for the unfamiliar reader.

\subsection{Approximate Bayesian Computation}\label{sec:abc}
\glsresetall
 
%
Let $\mcal{X}^n$ be the space of length $n$ sequences taking values in a set $\mcal{X}$. Suppose we have time series data $\by = (\bd{y}_{t_1}, \bd{y}_{t_2}, \ldots, \bd{y}_{t_n})$ with each $\bd{y}_{t_i} \in \mcal{X}$, observed at real times $0 = t_1 < t_2 < \ldots < t_n = T$, and assumed to have been drawn from a model with measure $\mu_{\bth}$ parameterised by $\bth = (\bth_1, \ldots, \bth_p) \in \bTh \subseteq \mathbb{R}^p$. We assume that $\mu_{\bth}$ has density $p_{\bth}$ with respect to the Lebesgue measure. Given a prior density $\pi$ (also with respect to the Lebesgue measure) on $\bTh$, the central object in Bayesian inference is the posterior distribution
\begin{equation}\label{eq:Bayes}
    \pi(\bth \mid \by) \propto p_{\bth}(\by) \pi(\bth).
\end{equation}
For simulation models, evaluating the likelihood function $p_{\bth}(\by)$ is commonly intractable, making standard Bayesian approaches to posterior inference such as \gls{mcmc} infeasible. 
    
In such scenarios, an established alternative is \gls{abc} \citep{tavare1997inferring, pritchard1999population, beaumont2002approximate} which allows the user to approximate the true posterior \eqref{eq:Bayes} using only forward simulations. Broadly, the user specifies summary statistics $\bs : \mcal{X}^n \to \mathcal{S}$ (usually $\mathcal{S} = \mathbb{R}^k$ for some $k\geq 1$), and a distance measure $\rho$; $p_{\bth}(\bd{y})$ is then approximated as
\begin{equation}\label{eq:ABC_likelih}
    \tilde{p}_{\varepsilon}\{\bs(\by) \mid \bth\} = \int K_{\varepsilon}\left[\rho\{ \bs(\by), \bs(\bx)\} \right] \, p_{\bth}(\bx)\, \mathrm{d}\bx,
\end{equation}
where $K_{\varepsilon}(\cdot) = K(\cdot/\varepsilon)/\varepsilon$ is a kernel function with bandwidth parameter $\varepsilon$.
%
The resulting \gls{abc} posterior is then
\begin{equation}\label{eq:abc_post}
    \pi_{\varepsilon}\{\bth \mid \bs(\by) \} \propto \tilde{p}_{\varepsilon}\{\bs(\by) \mid \bth \} \pi(\bth).
\end{equation}

The approach as presented above leaves open a plethora of possible choices for $\bs$, $\rho$ and $K_{\varepsilon}(\cdot)$. 
We summarise here some of the most common and well-known choices. 

% \vspace{-0.3cm}
\paragraph{Rejection ABC}
%
The standard \gls{rej} algorithm corresponds to choosing a uniform kernel $K_{\varepsilon}(\cdot) \propto \mathbbm{1}\left(\cdot \leq \varepsilon\right)$. 
%
The choice of threshold $\varepsilon$ is left to the experimenter, and for example may be determined in advance of the inference procedure, or chosen after simulation time such that a certain proportion of the total simulation budget is retained \citep{cornuet2008inferring}.

% \vspace{-0.3cm}
\paragraph{Semi-automatic ABC} \citet{Fearnhead2012} propose \gls{saabc}, in which an estimate of the posterior mean, $\bs(\by) = \mathbb{E}\left( \bth \mid \by \right)$, acts as the summary statistic, and the Euclidean distance is used as $\rho$. 
% 
Given a set of $N$ training data points $\left(\bx^{(i)}, \bth^{(i)}\right) \sim p_{\bth}(\bx)\pi(\bth)$, $i=1, \dots, N$, and a candidate vector $\mathbf{g}(\cdot)$ of $J$ summary statistics, the method performs vector-valued regression from $\mathbf{g}(\bx^{(i)})$ to $\bth^{(i)}$ to estimate $\bs(\by)$. 
% 
A drawback of this method is that it requires the construction of an initial set of candidate summaries, which would need to be informative. Other approaches in this vein include \citet{Nakagome2013}, in which the authors propose the use of \gls{saabc} using kernel ridge regression to exploit the nonlinearities induced by kernel methods. 

% \vspace{-0.3cm}
%
\paragraph{K2-ABC} \citet{Park2016} propose \gls{k2abc}, which bypasses the problem of constructing summary statistics for \textit{iid} data by using the \gls{mmd} between (a) the simulator's distribution $f_{\bth}$, where $\bx = (\bx_1, \dots, \bx_n) \sim p_{\bth}(\bx) = \prod_{i=1}^{n} f_{\bth}(\bx_i)$, and (b) the true density $f^*$ giving rise to the \textit{iid} observations comprising $\by$, respectively. That is, from a kernel $\kappa : \mathcal{X} \times \mathcal{X} \to \mathbb{R}$, the discrepancy between $\bx$ and $\by$ is taken to be
\begin{equation}
    \textsc{mmd}^2 = \|\mathbb{E}_{\bd{z} \sim f_{\bth}}[\kappa(\bd{z}, \cdot)] - \mathbb{E}_{\bd{z}' \sim f^*}[\kappa(\bd{z}', \cdot)] \|_{\mathcal{H}}^2,
\end{equation}
where $\mathcal{H}$ is the \gls{rkhs} associated with $\kappa$. In this way, the choice of summary statistics (e.g. as required in \gls{saabc}) can be seen as being replaced by the choice of kernel $\kappa$. For time series data, the authors suggest that the dependency structure can be ignored, and that the observation $\lbrace{\bd{y}_i : i=1, \dots, n\rbrace}$ and simulation output $\lbrace{\bd{x}_i : i = 1, \dots, m\rbrace} $ can still be treated as \textit{iid} data from the marginal densities $f^*$ and $f_{\bth}$, respectively. 
% 

% \vspace{-0.8cm}
\paragraph{Wasserstein ABC (W-ABC)} \citet{Bernton2019} propose to use as the measure of discrepancy the $p$-Wasserstein distance between the empirical distributions of the observations $\by = (\bd{y}_1, \bd{y}_2, \dots, \bd{y}_n)$, and the simulated data $\bx = (\bd{x}_1, \bd{x}_2, \dots, \bd{x}_m)$, with $\bd{y}_i, \bd{x}_j \in \mathbb{R}^d$. That is, the distance $\rho$ is taken to be
\begin{equation}\label{eq:wass}
    \mathcal{W}_p(\by, \bx)^p = \inf_{\gamma \in \Gamma_{n,m}} \sum_{i =1}^{n} \sum_{j=1}^{m} \rho_0(\bd{y}_i, \bd{x}_j)^p \gamma_{ij}
\end{equation}
where $\rho_0$ is a distance on $\mathbb{R}^d$ and $\Gamma_{n,m}$ is the set of $n\times m$ matrices with non-negative entries, columns summing to $m^{-1}$, and rows summing to $n^{-1}$. 
% 
The authors discuss multiple strategies to account for the structured and ordered nature of time series data, such as the 
%
\emph{Wasserstein curve matching} distance, in which a time augmentation $\bd{y}_{t_i} \mapsto (t_i, \bd{y}_{t_i})$ is applied to the data, and the following ground distance between elements of the sequence used:
\begin{equation}\label{eq:wass_curve}
    \rho_{0}\{(t_i, \bd{y}_{t_i}), (t_j, \bd{x}_{t_j}); \lambda\} = \| \bd{y}_{t_i} - \bd{x}_{t_j} \| + \lambda \vert{ t_i - t_j \vert}.
\end{equation}
In the above, $\lambda > 0$ is a free parameter that interpolates the distance in \eqref{eq:wass} between the sum of Euclidean distances $\sum_{i} \| \bd{y}_{t_i} - \bd{x}_{t_i} \|$ (when $n=m$) and the Wasserstein distance between the empirical marginal distributions of $\by$ and $\bx$. 
% 
Such an approach is however of limited suitability for time series data: the curve matching distance will not in general respect the ordering of the observations in $\bx$ and $\by$, and will ultimately still permit permutations of their elements (see Appendix \ref{app:WassersteinFailure} for a simple example). %  

\subsection{Path Signatures}\label{sec:pathsigs}

Let $\mcal{H}$ be a Hilbert space carrying an inner product $\ip{\cdot}{\cdot}{\mcal{H}}$, and $h : [0,T] \to \mcal{H}$ be an $\mcal{H}$-valued path on the interval $[0,T]$. Further, let $\zeta(0,T) = \{t_1, \dots, t_n\}$ denote a finite partition of the interval $[0,T]$, with $0 = t_1 < \dots < t_n = T$. Throughout, we will consider $\mcal{H}$-valued paths of bounded variation over the interval $[0,T]$, i.e. paths for which
\begin{equation*}
    \onevar{h} := \sup_{\zeta(0,T)} \sum_{i=1}^{n - 1} \norm{h_{t_{i+1}} - h_{t_i}}{\mcal{H}} < \infty,
\end{equation*}
where the supremum is taken over all finite partitions of the domain and $n = \vert{\zeta(0,T)}\vert$. We denote with $\bv{\mcal{H}}$ the space of such paths. By defining the product Hilbert space
\begin{equation}
    \pH := \mathbb{R} \oplus \mcal{H} \oplus \left(\mcal{H} \otimes \mcal{H}\right) \oplus \dots \oplus \mcal{H}^{\otimes m} \oplus \dots,
\end{equation}
endowed with an addition operation, inner product, and norm acting on any $A = (a_0, a_1, \dots)$, $B = (b_0, b_1, \dots) \in \pH$ as, respectively, 
\begin{align}
    A + B &:= (a_0 + b_0, a_1 + b_1, \dots),\\%\quad \quad \quad \text{ and inner product } \quad \quad \quad 
% \end{equation}
% \begin{equation}
\label{eq:inn_prod_ta}
    \ip{A}{B}{} &:= \sum_{m \geq 0} \ip{a_m}{b_m}{\mcal{H}^{\otimes m}}
\end{align}
where 
\begin{equation*}
    \ip{u_1 \otimes \dots \otimes u_m}{v_1 \otimes \dots \otimes v_m}{\mcal{H}^{\otimes m}} = \prod_{j=1}^m \ip{u_j}{v_j}{\mcal{H}},
\end{equation*}
and
\begin{equation}
    \norm{A}{} := \sqrt{\sum_{m \geq 0} \norm{a_m}{\mcal{H}^{\otimes m}}^2},
\end{equation}
the \emph{path signature} \citep[see e.g.][]{LyonsT.J2007Dedb} of $h \in \bv{\mcal{H}}$, denoted $\sig{h}$, maps $h$ to an infinite series of tensors as
\begin{equation}\label{eq:sig}
    %
    h \mapsto \{1, S_1(h), S_2(h), \dots\} \in E \subset \pH.
    % 
\end{equation}
In the above, $E$ is the subspace of $\pH$ consisting of the elements of the product Hilbert space that have finite norm. The terms of the signature are defined as
\begin{equation*}%\label{eq:sig_terms}
    S_m(h) := \int_{0}^T {\d }h^{\otimes m} 
    % 
    = \underset{0 \leq t_1 < \dots < t_m \leq T}{\int \dots \int} {\d }h_{t_1} \otimes \dots \otimes {\d }h_{t_m}.
\end{equation*}
We adopt the convention that $\mcal{H}^{\otimes 0} = \mathbb{R}$. The number $m$ of integrals comprising the terms of the signature is often referred to as the \textit{depth} of that signature term, and we adopt this terminology throughout. The collection of all such integrals at every depth $m \geq 0$ is a set of statistics for path-valued random variables that describe geometric features of the path and behave analogously to monomials.  

The following simple example of a two-dimensional path provides a demonstration of the sort of geometric information captured by the signature:

\begin{Example}[Example 2.3, \citet{Kiraly2019}]\label{ex:sig}
    Let $h_t$ take values in $\mathbb{R}^2$, $h_t = (a_t, b_t)$. Then ${\d }h_{t} = ({\d }a_t, {\d }b_t)$, such that
    \begin{equation*}
        S_1(h) = 
        \left[\begin{array}{CC}
            \int_{t=0}^T {\d }a_t\\[10pt] \int_{t=0}^T {\d }b_t 
        \end{array}\right],
        \ \ \ \ \ 
        \text{and}
    \end{equation*}
    \begin{equation*}
        S_2(h) =         
        \left[\begin{array}{CC}
            \int_{t'=0}^T \int_{t=0}^{t'} {\d }a_{t}{\d }a_{t'} & \int_{t'=0}^T \int_{t=0}^{t'} {\d }a_{t}{\d }b_{t'} \\[10pt]
            \int_{t' = 0}^T \int_{t = 0}^{t'} {\d }b_{t}{\d }a_{t'} & \int_{t'=0}^T \int_{t=0}^{t'} {\d }b_{t}{\d }b_{t'}
        \end{array}\right].
    \end{equation*}
    
    In the above, the variables $t$ and $t'$ are dummy time indices that are being integrated over. These terms can be further interpreted geometrically: the terms in $S_1(h)$ capture the increments along each dimension, while the off-diagonal elements of $S_2(h)$ capture the areas above and below the curve; see Figure \ref{fig:geom}. Higher order terms capture higher order notions of area that are harder to visualise and interpret.
\end{Example}

\begin{figure}
    \centering
    \includegraphics[width=\linewidth]{fig8.png}
    \caption{Geometric interpretation of the signature terms for the two-dimensional path from Example \ref{ex:sig}, shown as the dark green curve. Depth-1 terms correspond to the increments $a_T - a_0$ and $b_T - b_0$, while the depth-2 terms $[S_2(h)]_{21}$ and $[S_2(h)]_{12}$ correspond to the blue and yellow areas, respectively.}\label{fig:geom}
\end{figure}

Signatures have a number of desirable properties; for example, they are a \emph{universal nonlinearity}. This means that for any compact set $\mathcal{K}$ of paths of bounded variation, any continuous, real-valued function $f$ on $\mathcal{K}$ can be approximated uniformly by linear functionals of the signature, i.e. for any $\varepsilon>0$ there exists a linear functional $L$ such that
\begin{equation*}
    \sup_{h\in \mathcal{K}}\Big|f(h) - L\left[\sig{h}\right]\Big| < \varepsilon.
\end{equation*}
Appendix \ref{app:un} provides further details. Further, signatures have the desirable property of being an essentially \textit{injective} map (see Appendix \ref{app:signature_invariances}), an important consequence of which in the context of approximate inference is that the path signature for a sequence of data can be seen as a \textit{sufficient statistic}, since by the Fisher--Neyman factorisation theorem \citep[see, e.g., ][Theorem 2.21]{schervish1995theory} an injective function of a sufficient statistic is also sufficient. 
% 

\subsubsection{The Signature Kernel}

The signature can be kernelised following \citet{Kiraly2019}:

\begin{Definition}[Signature kernel, \citet{Kiraly2019}]\label{def:sig_kern}
    The \emph{signature kernel} acts on $h, g \in \bv{\mcal{H}}$ as
    \begin{equation}\label{eq:sig_kern}
        k : %\bv{\mcal{H}} \times \bv{\mcal{H}} \to \mathbb{R}, \ \ \ 
        (h, g) \mapsto \ip{\sig{h}}{\sig{g}}{} \in \mathbb{R},
    \end{equation}
    where the inner product is defined as in Equation \eqref{eq:inn_prod_ta}.
\end{Definition}
A key insight of \citet{Kiraly2019} was to recognise that evaluation of the signature kernel -- which operates on \textit{paths} in $\mcal{H}$ -- can be performed using only evaluations of an inner product $\kappa$ that operates on \textit{points} in the path, amounting to a kernel trick for the signature kernel. \citet{Kiraly2019} further describe an efficient Horner scheme to evaluate a truncated signature kernel that approximates Equation \eqref{eq:sig_kern}. \citet{salvi2020computing} extend this work by recognising that the signature kernel solves a Goursat partial differential equation, permitting numerical estimation of the signature kernel using finite difference methods. 
% 

\subsubsection{Path Signatures in Practice}

In light of their interesting and useful properties described above, signatures can be seen as a canonical feature transformation for path-valued random variables. 
%
However, there exists an incongruity between our discussion so far and the scenarios faced in real-world settings: in reality and from the output of simulation models, we tend to observe discretely sampled data $\bd{x} = (\bd{x}_{t_1}, \bd{x}_{t_2}, \dots, \bd{x}_{t_n})$ at times $0 = t_1 < t_2 < \dots < t_n = T$, where $\bd{x}_{t} \in \mcal{X}$ for some finite-dimensional space $\mcal{X}$ (for example $\mathbb{R}^d$ for some $d \geq 1$), rather than continuous paths $x \in \bv{\mcal{H}}$. 
%
The incongruity is fixed %
in the following way: 
\begin{enumerate}
    \item[(a)] As noted by \citet{Kiraly2019}, the aforementioned signature kernel trick can be used to introduce nonlinearities and embed the $\mcal{X}$-valued sequence $\bd{x}$ in a Hilbert space. In particular, by choosing a reproducing kernel $\kappa : \mcal{X} \times \mcal{X} \to \mathbb{R}$ with \gls{rkhs} $\mcal{H}$ and canonical feature map $\kap{\bd{x}_{t}} \in \mcal{H}$ as the inner product on the data space $\mcal{X}$, we may implicitly construct a sequence $(\kap{\bd{x}_{t_1}}, \kap{\bd{x}_{t_2}}, \dots, \kap{\bd{x}_{t_n}})$ of points in $\mcal{H}$ from sequences of data in $\mcal{X}$.
    \item[(b)] To construct continuous paths from the discrete sequence above, an interpolation scheme is employed. While many interpolation schemes are possible, the most common is linear interpolation. Indeed, \citet{Kiraly2019} and \citet{salvi2020computing} assume a linear interpolation to construct \textit{discretised} signature kernels operating on sequences of points, and we use this interpolation scheme throughout this work.
\end{enumerate}
By combining these two steps, we progress from a sequence $\bd{x}$ of points in $\mcal{X}$ to an $\mcal{H}$-valued, piecewise linear path $h$, which for $i=1,\ldots,n-1$ and $t\in [t_i, t_{i+1}]$ is given by
\begin{equation}\label{eq:lifted_path}
    h_t := \kap{\bd{x}_{t_{i}}} + \frac{t - t_{i}}{t_{i+1} - t_{i}} \{\kap{\bd{x}_{t_{i+1}}} - \kap{\bd{x}_{t_{i}}}\}. 
\end{equation}
% 
Piecewise linear paths constructed in this way are naturally of bounded variation if, for example, $\kappa$ is a continuous and/or 
uniformly bounded kernel (see Proposition \ref{prop:sig_inj_abc} in Appendix \ref{app:sabc_proof_a}). We will assume this throughout, such that all observed sequences in $\mcal{X}$ lift to piecewise linear paths of bounded variation in $\mcal{H}$ under the feature map corresponding to $\kappa$.


\section{Methods}\label{sec:signatureABC}

Given its unique properties, the path signature and its associated kernel
are natural candidates for feature maps and discrepancy measures in \gls{abc} to handle time series data of different kinds.
In this section, we introduce and investigate two techniques for incorporating signatures into \gls{abc}.
% 
\subsection{Signature ABC}\label{sec:skd}

% \glsreset{rej}

The first approach we consider entails using the signature directly as a summary statistic in ABC. Though signatures are infinite-dimensional objects, we can leverage their kernel representation (see Definition \ref{def:sig_kern}) to compute the distance between two sequences $\bd{x}, \bd{y}$ as the norm induced by the associated signature inner product. That is, for two time series $\bd{x}$ and $\bd{y}$, we can interpret the signature of their lifted paths as a \emph{sufficient} summary statistic, $\bd{s}(\bd{x}) = \sig{\bd{x}}$, and compute
\begin{align}
\label{eq:sig_distance}
    \rho\{\bd{s}(\bd{x}), \bd{s}(\bd{y})\} &:= \| \sig{\bd{x}} - \sig{\bd{y}} \|^2\\\nonumber
    &= k(\bd{x}, \bd{x}) + k(\bd{y}, \bd{y}) - 2\,k(\bd{x}, \bd{y}),
\end{align}
where $k(\bd{x}, \bd{y}) = \ip{\sig{\bd{x}}}{\sig{\bd{y}}}{}$.
The resulting distance can be computed easily using existing software\footnote{E.g., \href{https://github.com/crispitagorico/sigkernel}{\texttt{sigkernel}} or \href{https://github.com/tgcsaba/KSig}{\texttt{KSig}}.} and used to derive an \gls{abc} posterior via Equations \eqref{eq:ABC_likelih}--\eqref{eq:abc_post}. For example, it may be embedded in rejection \gls{abc}, yielding
\begin{equation*}
    \pi_{\varepsilon}(\bth \mid \bd{y}) \propto \pi(\bth) \int \mathbbm{1}\left(\| \sig{\bd{x}} - \sig{\bd{y}} \|^2 \leq \varepsilon\right) \mu_{\bth}( {\d } \bd{x}),
\end{equation*}
as the ABC posterior. We term this approach \gls{sabc}. 
%
Injectivity of the path signature, and continuity of the norm the inner product induces, further guarantee the asymptotic correctness of this \gls{sabc} posterior as $\varepsilon \to 0$ with $n$ fixed:

\begin{Proposition}\label{thm:abc_convergence}
    Let $\mcal{X} := \mathbb{R}^d$, $\bd{y} = (\bd{y}_1, \dots, \bd{y}_n) \in \mcal{X}^n$, and let $\rho$ be as in Equation \eqref{eq:sig_distance}, resulting in the \gls{sabc} posterior $\pi_{\varepsilon}$. Suppose the density function $p_{\bth}(\bd{x})$ satisfies
    \begin{equation*}
        \sup_{\bth \in \bTh \setminus \mcal{N}_{\bTh}} p_{\bth}(\bd{y}) < \infty,
    \end{equation*}
    where $\mcal{N}_{\bTh}$ is a set such that $\pi(\bth) = 0\, \forall \bth \in \mcal{N}_{\bTh}$, and that there exists $\bar{\varepsilon} > 0$ such that
    \begin{equation*}
        \sup_{\bth \in \bTh \setminus \mcal{N}_{\bTh}} \sup_{\bd{z} \in \mcal{A}^{\bar{\varepsilon}}} p_{\bth}(\bd{z}) < \infty,
    \end{equation*}
    where $\mcal{A}^{\bar{\varepsilon}} := \lbrace{\bd{z} \in \mathcal{X}^n : \rho\{\bs(\bd{y}), \bs(\bd{z})\} \leq \bar{\varepsilon} }\rbrace$. Then for any measurable $\bd{B} \subset \bTh$,
    % 
    \begin{equation}
        \lim_{\varepsilon \to 0} \int_{\bd{B}} \pi_{\varepsilon}(\bth \mid \by)\, {\mathrm d}\bth = \int_{\bd{B}} \pi(\bth \mid \by)\, {\mathrm d}\bth.
    \end{equation}
\end{Proposition}

%
The proof is provided in Appendix \ref{app:sabc_proof_a}. % 

\subsection{Signature Regression ABC}

%

% For the case of time series models, 
We consider a second use of path signatures in \gls{abc}, namely in the \gls{saabc} method described by \citet{Fearnhead2012}. Given its status as a universal nonlinearity as discussed in Section \ref{sec:pathsigs}, the path signature provides a natural basis for learning functions on sequences, and a natural set of summary statistics for the regression task required in \gls{saabc}. Regression on the full path signature is of course impossible, since the signature is infinite-dimensional. However, this may once again be circumvented using the signature kernel and corresponding kernel trick (see Definition \ref{def:sig_kern}) in kernel ridge regression \citep{HastieTrevor2001Teos} to implicitly regress parameters onto the \emph{full} signature, which is in a sense equivalent to using the infinitely long path signature as the candidate set of summary statistics in semi-automatic \gls{abc}. That is, using training examples $(\bx^{(i)}, \bth^{(i)}) \sim p_{\bth}(\bx)\, \pi(\bth), i=1,\ldots,R$, we find a function $\hat{\bth}_j$ in the \gls{rkhs} associated with the signature kernel $k$, which by the Representer Theorem has the following form for each component $\bth_j, j = 1, \dots, p$ of the $p$-dimensional parameters $\lbrace{\bth^{(i)}\rbrace}_{i=1}^{R}$:
\begin{equation}
    \hat\bth_{j}(\bx) = \sum_{i=1}^{R} \boldsymbol{\omega}_i^{(j)} k(\bx, \bx^{(i)})
\end{equation}
with $\boldsymbol{\omega}^{(j)} = \left(G + \alpha I_R\right)^{-1}\bpsi^{(j)}$, $\bpsi^{(j)} = \left[\bth_{j}^{(1)}, \bth_{j}^{(2)}, \ldots, \bth_{j}^{(R)}\right]'$ with $'$ denoting matrix transposition, 
% 
$G_{mn} = k(\bx^{(m)}, \bx^{(n)})$, $I_R$ an $R\times R$ identity matrix, and %where %
$\alpha \geq 0$ is a regularisation parameter to be tuned. In this way, path signatures also enable 
% 
the semi-automatic construction of summary statistics in \gls{saabc}. This approach to \gls{abc} is somewhat similar to that of \citet{Nakagome2013}, who employ kernel ridge regression with a Gaussian RBF kernel to perform \gls{saabc}. Our approach differs, however, in that \citet{Nakagome2013} propose the use of hand-crafted summary statistics as input to the kernel ridge regression model, while we use the full data.

Once the data is summarised with this regression model, the discrepancy between simulation and observation is taken as the Euclidean distance between their corresponding outputs from the kernel ridge regression model. We herein refer to this approach as \gls{skrr}. Further technical details are provided in Appendix \ref{app:skabc}. 


% \begin{sidewaysfigure}
\begin{figure*}
    \centering
    \includegraphics[width=\linewidth]{fig1.png}
    \caption{(Ricker model) (\textbf{a}) Wasserstein distances and (\textbf{b}) maximum mean discrepancies between the posteriors recovered from each ABC method and an approximate ground truth from \gls{pmcmc}. (\textbf{c}) Squared distances between the means of the \gls{abc} posteriors and the posterior mean from \gls{pmcmc}. Our methods are shown in blue.}
    \label{fig:ricker_metrics}
% \end{sidewaysfigure}
\end{figure*}

\begin{algorithm}[t]
\SetAlgoLined
\textbf{Input:} prior $\pi$, observation $\by$, distance function $\mcal{D}(\cdot , \cdot)$, number of particles $N$, final sample size $M < N$\;
%
\KwResult{Posterior samples $\lbrace{\bth^{(i)}\rbrace}_{i=1}^M$}
 \For{$i=1,\dots,N$}{
  Sample $\bth^{(i)} \sim \pi$\;
  Simulate $\bx^{(i)} \sim p_{\bth^{(i)}}$\;
  Evaluate distance $\mcal{D}(\bx^{(i)}, \by)$\;
 }
% 
 Retain the $M$ particles $\lbrace{\bth^{(i)}\rbrace}_{i=1}^{M}$ with the lowest distances
 
\caption{Rejection sampling scheme}
\label{alg:Rej}
\end{algorithm}

\section{Experiments}\label{sec:Exp}

In this section, we present experiments comparing the performance of our methods, \gls{sabc} and \gls{skrr}, against the use of the Wasserstein distance \citep{Bernton2019} (W-ABC) and the \gls{mmd} \citep{Park2016} (K2-ABC) as measures of discrepancy in ABC, along with \gls{saabc} \citep{Fearnhead2012}. The models with which we conduct experiments were chosen to cover a range of application domains, namely: ecology, finance, and public health/epidemiology. These models were also chosen due to the fact that approximate ground-truth posteriors are readily available via standard \gls{mcmc} techniques, permitting a proper evaluation of the methods' performance in the posterior inference task. Finally, they were also chosen for the variety of outputs they produce: chaotic, integer-valued time-series in the first example; non-stationary, real-valued sequences in the second; and continuous-time, variable-length sequences of multivariate and irregularly spaced points in the final case. Further details on the experiments we present below, along with additional results, are provided in Appendix \ref{app:exp}.

\subsection{Implementation Details}\label{sec:ExpImpDet}

For all distances, we sample from the \gls{abc} posterior using a simple \gls{rej} scheme as outlined in Algorithm \ref{alg:Rej} and, unless stated otherwise, use $N=10^5$ and $M=10^3$. While other, more sophisticated schemes exist, we choose this to facilitate a simple and transparent comparison of the different distance measures. 
To assess the quality of the recovered posteriors, we compute the 1-Wasserstein distance and an unbiased estimate of the maximum mean discrepancy (MMD) between the approximate ground truth posteriors $\hat{\pi}_{\cdot \mid \by}$ and empirical posteriors $\hat{\pi}_{\mathrm{ABC}}$. In both cases, smaller values indicate a closer match to the approximate ground truth. To estimate the MMD between posteriors, we use a Gaussian RBF kernel with scale parameter chosen with the median heuristic \citep{Briol2019}. For \gls{sabc} and W-ABC, we also report results obtained by applying a (lag-1) \textit{delay} transformation to the time series before distance computations, which acts on a time series $\bd{x}$ as
\begin{align*}
    (\bd{x}_{t_1}, \bd{x}_{t_2},& \dots, \bd{x}_{t_n}) \mapsto\\ 
    &((\bd{x}_{t_1}, \bd{x}_{t_2}), (\bd{x}_{t_2}, \bd{x}_{t_3}), \dots, (\bd{x}_{t_{n-1}}, \bd{x}_{t_n})).
\end{align*}
Such a transformation was considered in \citet{Bernton2019} for time series data, and may improve the accuracy of the \gls{abc} posteriors in practical, non-asymptotic settings. Results obtained with such a transformation are indicated by a ``(delay)'' suffix. All other implementation details are provided in Appendix \ref{app:imp_det}. Code for the experiments is available at \url{https://github.com/joelnmdyer/SignatureABC}.

% \begin{sidewaysfigure}
\begin{figure*}
    \centering
    \includegraphics[width=\linewidth]{fig3.png}
    \caption{(Geometric Brownian motion) (\textbf{a}) Wasserstein distances and (\textbf{b}) maximum mean discrepancies between the posteriors obtained with each ABC method and an approximate ground truth from \gls{mh}. (\textbf{c}) Squared distances between the means of the \gls{abc} posteriors and the posterior mean from \gls{mh}. Our methods are coloured blue.}
    \label{fig:gbm_metrics}
% \end{sidewaysfigure}
\end{figure*}

\subsection{The Ricker Model}

The Ricker model is a simple model of ecological dynamics that exhibits chaotic behaviour and has an intractable likelihood function. The state of the model, which tracks the size $N_t \in \mathbb{R}_{\geq 0}$ of a population over discrete time steps $t = 1, \dots, n$, evolves as
\begin{equation}
    \log{N_{t+1}} = \log{r} + \log{N_{t}} - N_{t} + \sigma\epsilon_t,
\end{equation}
where $r > 0$ is a growth parameter, $\sigma > 0$ scales the process noise, and $\epsilon_t \sim \mathcal{N}(0, 1)$. Following \citet{Wood2010}, we assume Poissonian observations
\begin{equation}
    \bd{y}_t \sim \text{Po}(\phi N_t) \in \mathbb{N},
\end{equation}
where $\phi > 0$ is a scale parameter. We assume the task of recovering the posterior distribution for $\bth = (\log{r}, \phi, \sigma)$ given a time series of length $n=50$, $\by = (\bd{y}_1, \bd{y}_2, \dots, \bd{y}_{n}) \sim p_{\bth^{*}}$ with $\bth^{*} = (4, 10, 0.3)$. We take $N_0 = 1$. We further assume the following independent, uniform priors for each parameter:
\begin{equation}
    \log{r} \sim \mathcal{U}(3,8),\quad 
    \phi \sim \mathcal{U}(0,20),\quad  
    \sigma \sim \mathcal{U}(0,0.6).
\end{equation}

For \gls{saabc}, the hand-crafted summary statistics we use are those proposed in \citet{Wood2010}, and consist of: the autocovariances to lag 5; the mean; the number of zeros in the sequence; the coefficients of the regression $\bd{x}_{t+1}^{0.3} = \beta_1 \bd{x}_{t}^{0.3} + \beta_2 \bd{x}_{t}^{0.6} + \epsilon_t$ for error term $\epsilon_t$; and the coefficients of the cubic regression of the ordered differences $\bd{x}_t - \bd{x}_{t-1}$ on their observed values.

In Figure \ref{fig:ricker_metrics}, we show boxplots for the Wasserstein distances and MMDs between samples from the \gls{abc} posteriors -- denoted with $\hat{\pi}_{\mathrm{ABC}}$ -- and samples from an approximation of the true posterior obtained using \gls{pmcmc} (\citet{andrieu2010particle}; see Appendix \ref{app:mcmc} for details), which we denote with $\hat{\pi}_{\cdot \mid \by}$. We also show boxplots for the Euclidean distances between the \gls{abc} posterior means and the \gls{pmcmc} posterior mean. These boxplots are all obtained by running the \gls{abc} procedure 20 times with the same observed dataset but different seeds for the \gls{abc} procedure. 

From this, we see that the signature-based methods tend to produce better performance across all three metrics considered. In more detail, the estimates of the approximate ground truth posterior obtained with the signature-based methods are more accurate than those of K2-ABC and W-ABC, as reflected in the Wasserstein distances and MMDs. For \gls{sabc}, this performance gap is enhanced with the additional application of the lag-1 delay transformation, which is indicated with suffix ``(delay)'' in Figure \ref{fig:ricker_metrics}. No such improvement is observed when this transformation is applied to competing methods. We note that \gls{saabc} performs particularly well in this example, as a consequence of its use of hand-crafted summary statistics developed specifically for this simulation model. However, the potential power of our signature-based methods is demonstrated by the fact that \gls{skrr} is able to outperform \gls{saabc} in all three metrics, despite the latter using summary statistics carefully engineered by experts. Finally, we %once again
observe more accurate estimates of the true posterior mean using our signature-based methods than using W-ABC and \gls{saabc}. The posterior mean estimates from \gls{sabc} without the delay transformation and \gls{skrr} are also more accurate than those of \gls{mmd}, further evidencing the usefulness of our signature-based methods.

\subsection{Geometric Brownian Motion}

\Gls{gbm} is a stochastic differential equation widely used in mathematical finance to model a stock price $x_t$ evolving with time $t$ according to
\begin{equation}
    \mathrm{d} x_t = \mu x_t \mathrm{d}t + \sigma x_t \mathrm{d} W_t,
\end{equation}
where $\mu$ is the percentage drift, $\sigma$ is the volatility, and $W_t$ is a Brownian motion. With $\epsilon_i \sim \mathcal{N}(0,1)$, this model permits an exact discretisation for $i = 1, 2, \dots, n-1$ as
\begin{equation}\label{eq:GBM_disc}
    \log\left(\bd{x}_{i\Delta t}/\bd{x}_{(i-1)\Delta t}\right) = \left(\mu - \frac{1}{2}\sigma^2\right) \Delta t + \sigma \sqrt{\Delta t}\, \epsilon_i,
\end{equation}
which implicitly defines the model $p_{\bth}$ from which we simulate. We fix $\bd{x}_0 = 10$, $n = 100$, and $\Delta t = 1/(n-1)$, and simulate the dynamics over the interval $[0,1]$.

\glsreset{mh}

We consider the task of recovering the posterior for $\bth = (\mu, \sigma)$ given an observation $\by = (\bd{y}_0, \bd{y}_{\Delta t}, \bd{y}_{2\Delta t}, \dots, \bd{y}_{(n-1)\Delta t}) \sim p_{\bth^{*}}$ with $\bth^{*} = (0.2, 0.5)$. We assume independent, uniform priors $\mu \sim \mathcal{U}(-1,1), \sigma \sim \mathcal{U}(0.2, 2)$. %
Inference is amenable to standard, exact likelihood-based Bayesian techniques such as \gls{mh} sampling using the transition density implied by \eqref{eq:GBM_disc}, enabling a comparison against an approximate ground truth posterior. For \gls{saabc}, we follow \citet{Fearnhead2012} and %once again 
regress the parameters $\bth$ onto the first, second, third, and fourth powers of  summary statistics of the time series. Specifically, we take the first, second, third, and fourth powers of the variance and lag-1 and -2 autocorrelations of the increments of the log time series, $\log{(\bd{x}_{i\Delta t}/\bd{x}_{(i-1)\Delta t})}$, since these are informative of the parameters being inferred. 



\begin{table*}
    \centering
    \caption{Median (1st quartile--3rd quartile) Performance Metrics for Section \ref{sec:gse}. Best Performance in \textbf{Bold}.}\label{tab:gse_metrics}
    \begin{tabular}{c|c|c|c}
      \toprule % from booktabs package
      \bfseries Method & $\mathcal{W}_1(\hat{\pi}_{\mathrm{ABC}}, \hat{\pi}_{\cdot|\bd{y}})\,$ ($\times 10^{-3}$) & $\mathrm{MMD}^2(\hat{\pi}_{\mathrm{ABC}}, \hat{\pi}_{\cdot|\bd{y}})\,$ ($\times 10^{-2}$) & $\|\hat{\bth}_{\mathrm{ABC}} - \hat{\bth}_{\mathrm{True}}\|^2\,$ ($\times 10^{-5}$)\\
      \midrule % from booktabs package
      Signature ABC & \textbf{4.8 (4.3--5.6)} & \textbf{6.1 (4.1--7.8)} & \textbf{0.32 (0.22--0.43)} \\
      Wasserstein ABC & 7.3 (6.4--7.7) & 7.6 (6.8--9.1) & 0.46 (0.29--0.72) \\
      K2-ABC & 520.4 (519.8--521.3) & 34.44 (34.39--34.49) & 27036 (26947--27126) \\
      \bottomrule % from booktabs package
    \end{tabular}
\end{table*}

In Figure \ref{fig:gbm_metrics}, we show boxplots for the Wasserstein distances and MMDs between the different \gls{abc} posteriors and the approximate ground truth posterior obtained with \gls{mh}, in addition to the Euclidean distance between the \gls{abc} posterior means and the \gls{mh} posterior mean. The boxplots were generated by repeating the rejection ABC procedure for each distance measure with 20 different random seeds. We see that the signature-based methods once again produce lower Wasserstein distances and MMDs between their \gls{abc} posteriors and the \gls{mh} posterior. Indeed, \gls{sabc} with the lag-1 delay transformation uniformly dominates the non-signature methods across all three metrics. 
% \vspace{-1em}

\subsection{Irregular, Multivariate Sequences: Generalised Stochastic Epidemics}\label{sec:gse}

The signature method naturally allows for inference with multivariate and/or irregularly spaced time series. To demonstrate this, we consider a generalised stochastic epidemic model \citep{Kypraios2007EfficientBI}, which simulates the spread of an infection through a fixed population of $Z$ individuals. Individuals are initially susceptible, may become infected, and subsequently recover without the possibility of reinfection. The dynamics of the model are determined by parameters $\beta$ and $\gamma$, which control the rate of infection and recovery according to the following transition probabilities:
\begin{align*}
    &P_I := \mathbb{P}\left((\delta X_{t}, \delta Y_t) = (-1, 1) \mid \mathcal{H}_t \right) = \beta X_t Y_t \delta t + o(\delta t),\\
    &P_R := \mathbb{P}\left((\delta X_{t}, \delta Y_{t}) = (0, -1) \mid \mathcal{H}_t \right) = \gamma Y_t \delta t + o(\delta t),
    \\
% \end{align*}
% and
% \begin{equation*}
    &
    \mathbb{P}\left((\delta X_{t}, \delta Y_t) = (0, 0) \mid \mathcal{H}_t \right) = 1 - (P_I + P_R) + o(\delta t),
% \end{equation*}
\end{align*}
where $X_t$ and $Y_t$ are the number of susceptible and infected individuals at time $t \in [0, T]$, respectively, and $\mathcal{H}_t$ is a sigma-algebra generated by the process up until time $t$. These three transition probabilities thus capture infection, recovery, and an absence of activity, respectively. 

We consider the problem of recovering the posterior density for $\bth = (\beta, \gamma)$ given observations of the infections and recoveries occurring in the observation period $[0, 50]$ in a system of $Z=100$ individuals. For every simulation, the epidemic begins with one infected individual at time $t=0$. We generate ``empirical'' data at parameters $\bth^{*} = (10^{-2}, 10^{-1})$, and assume priors 
% \begin{equation}
$\beta \sim \Gamma(\lambda_{\beta}, \nu_{\beta})$ and $\gamma \sim \Gamma(\lambda_{\gamma}, \nu_{\gamma})$,
% \end{equation}
with concentration and rate parameters $\lambda_{\beta} = 0.1$, $\nu_{\beta} = 2$, $\lambda_{\gamma} = 0.2$, and $\nu_{\gamma} = 0.5$. It can be shown \citep{Kypraios2007EfficientBI} that this prior is conjugate for the model, leading to a tractable posterior density; further details are provided in Appendix \ref{app:gse}. Thus, samples can be drawn from the exact posterior for a given dataset simulated by this model. 
%

We simulate the model using the Gillespie algorithm \citep{Gillespie}, such that the lengths of the simulated sequences, and the spacing between points in the sequences, are random. Operationally, the model is simulated as follows: given that an infection/recovery event occurred at time $t$, the time $\Delta t$ until the next event is simulated as $\Delta t \sim \text{Exp}(1/R_t)$ where $R_t = \beta X_t Y_t + \gamma Y_t$, and the event is chosen to be an infection (resp. recovery) event with probability $\beta X_t Y_t / R_t$ (resp. $\gamma Y_t / R_t$).  


We show in Table \ref{tab:gse_metrics} the median and first and third quartiles for the Wasserstein distances and MMDs between samples from W-ABC, \gls{sabc}, and \gls{k2abc} posteriors and samples from the exact posterior. To obtain these approximate posteriors, we run Algorithm \ref{alg:Rej} with the same observed time series with $N=10^5$ and $M=100$ for 20 different seeds for the \gls{abc} procedure. We also show the same for the squared distance between the posterior means and the exact posterior mean. Contour plots obtained by running the inference procedure at these 20 different seeds for the \gls{abc} procedure and pooling the best $M$ distances from each (giving 2000 samples) are shown in Figure \ref{fig:GSE_post}, along with samples from the exact posterior. The \gls{mmd} performed especially poorly in this experiment; we thus omit samples from the \gls{k2abc} posterior in Figure \ref{fig:GSE_post} for clarity.

From this, we see that the natural notion of distance between multivariate and irregularly sampled time series data of different lengths, enabled by the use of path signatures, manifests as better recovery of both the true posterior distribution and the true posterior mean in this example, in which the Wasserstein distances and MMDs between posteriors and Euclidean distances between posterior means for \gls{sabc} are generally lower than those obtained using the Wasserstein distance and the MMD as distances in ABC. 

\subsection{Computational Complexity and Cost}

Evaluating the signature kernel for two streams $\by \in \mcal{X}^n$ and $\bx \in \mcal{X}^m$ with $\mcal{X} = \mathbb{R}^d$ has complexity that is linear in $d$ and linear in the product $nm$ \citep{salvi2020computing}. 
This is likewise the case for \gls{mmd}, which has complexity $\mathcal{O}\left(n^2\right)$ \citep{Park2016}, and compares favourably with 
the Wasserstein distance, which in multivariate settings is known to scale poorly with $n$. \citet{Bernton2019}, for example, note costs of order $n^3$ when the Hungarian algorithm is used to solve the assignment problem. Alternative algorithms with favourable performance are an active area of research.

\begin{figure}
\centering
\includegraphics[width=\columnwidth, trim=20 40 20 60, clip]{fig6.png}
\caption{(Stochastic epidemic) Contour plot of the joint posterior density recovered with the Wasserstein distance (dashed purple lines) and Signature \gls{abc} (solid blue lines), and samples from the true posterior (filled yellow contours).\label{fig:GSE_post}}
\end{figure}

While the complexity of signature evaluations compares favourably to alternative distance measures, we observe in our experiments that our signature-based approaches tended to incur larger computational costs with current implementations; see Table \ref{tab:costs}. These increased costs may not be an inherent feature of signature-based methods, however: research on signature methods in machine learning and computational statistics is active and relatively nascent, and it is plausible that more efficient implementations of signature computations will emerge with time. Additionally, techniques for reducing the computational burden of the signature-based methods we introduce can be employed, such as the Nystr\"{o}m method \citep{williams2000using} or random Fourier features \citep{rahimi2007random, yang2012nystrom} in \gls{skrr}, and the truncated signature kernel \citep{Kiraly2019} in both \gls{sabc} and \gls{skrr}. While we have not experimented with these cost-reduction techniques in the current work, future practical implementations of \gls{sabc} and \gls{skrr} may incorporate such approximations, each of which have been implemented in, e.g., the \href{https://github.com/tgcsaba/KSig}{\texttt{KSig} package}. Such approximations introduce further hyperparameters such as the truncation degree, however, which must be tuned; in the case of \gls{skrr}, this can be done with cross-validation, but it is less clear how this might be done for \gls{sabc}. Furthermore, in the case of \gls{sabc}, too severe a truncation may destroy the asymptotic results presented in Proposition \ref{thm:abc_convergence}. Nevertheless, these are avenues that can be explored in future work in order to reduce the computational burden of these methods.

\section{Conclusion}

In this paper, we introduced two novel approaches -- Signature \gls{abc} and Signature Regression \gls{abc} -- to performing approximate Bayesian computation with time series simulation models. Each method relies on the path signature -- an object that is fundamental to the theory of controlled differential equations and rough paths -- which is associated with the path traversed by a sequence of data points. In particular, we make use of the signature kernel to construct and compute discrepancies between time series data arising in \gls{abc} settings without manually contriving summary statistics. 
%

We show that the natural notion of distance between time series to which such an approach leads generates an \gls{abc} posterior that converges to the ground-truth posterior as the \gls{abc} tolerance parameter reduces to $0$. To illustrate our proposed methods, we present multiple examples of Bayesian inference tasks in which our approaches outperform existing techniques that are common in the approximate Bayesian inference literature; indeed, in each experiment we consider, at least one signature-based method uniformly dominates competing methods across all three of the metrics considered in this paper. We furthermore demonstrate that our methods are applicable to more complex settings than univariate time series, for example simulators generating complex multivariate and irregularly sampled sequences.

While we have compared the different distance measures using a basic rejection algorithm in this paper in the interest of a simple and transparent comparison, we note that our proposed methods can be embedded within other more sophisticated sampling algorithms, for example \gls{mcmc} or sequential Monte Carlo methods. Additionally for the Signature Regression \gls{abc} method, there is the possibility of incorporating mechanisms for generating more accurate regression results, for example using a pilot run to determine regions of non-negligible posterior mass as described in \citet{Fearnhead2012}. This may allow for improved approximations to the true posterior density. With respect to the choice of \gls{sabc} vs. \gls{skrr}: whether one should be preferred over the other depends to a large extent on what is of interest to the experimenter, given that semi-automatic approaches to \gls{abc} were originally motivated by the desire to accurately recover point estimates of interest \citep{Fearnhead2012} while other \gls{abc} methods aim to accurately approximate the full posterior. Additionally, we observe empirically that \gls{skrr} and \gls{saabc} seem to exhibit a somewhat larger variation in performance over non-semi-automatic approaches (see, e.g., Figures \ref{fig:ricker_metrics}, \ref{fig:gbm_metrics}, and \ref{fig:bhn_metrics}), which may be a manifestation of the additional stochasticity introduced in training a regression model prior to posterior construction.

\begin{table*}
    \centering
    \caption{Approximate Average CPU Times (in seconds) for each \gls{abc} Approach. (Simulation Budgets and Hardware Availability are Constant).}\label{tab:costs}
    \begin{tabular}{c c|c|c|c|c|c|c}
      \toprule % from booktabs package
      \bfseries Experiment & \multicolumn{7}{c}{\textbf{Method}} \\
      \cmidrule(l){2-8}
      & \gls{sabc} & \gls{sabc} (delay) & \gls{skrr} & W-ABC & W-ABC (delay) & \gls{saabc} & K2-ABC\\
      \midrule % from booktabs package
      Ricker & $2\times 10^2$ & $2\times 10^2$ & $2\times 10^4$ & $6\times 10^1$ & $8\times 10^1$ & $10^2$ & $4\times 10^1$ \\
      GBM    & $10^3$ & $10^4$ & $6 \times 10^4$ & $4 \times 10^3$ & $4 \times 10^3$ & $9 \times 10^2$ & $4 \times 10^3$ \\
      GSE    & $10^4$ & -- & -- & $2\times 10^2$ & -- & -- & $10^5$ \\
      \bottomrule % from booktabs package
    \end{tabular}
\end{table*}

\newpage

% \section{Floats}\label{sec:floats}
% Floats, such as figures, tables and algorithms, are moving objects and are supposed to float to the nearest convenient location.
% Please do not force them to go in the middle of a paragraph.
% They must respect the column width.

% Two-column floats are possible.
% They appear at the top of the next page, so strategic placement may be necessary.
% For an example, see Figure~\ref{fig:tikz}.
% They may not enter the margins.
% \begin{figure*}
%     \centering
%     \begin{tikzpicture}[xscale=1.5]
%         \coordinate (origin);
%         \draw[->] (origin) -- +(1cm,0) node[below] {$x$};
%         \draw[->] (origin) -- +(0,1cm) node[left] {$y$};
%         \fill[gray] (45:1cm) circle[radius=.2cm];
%     \end{tikzpicture}
%     \caption{A Nice Filled Ellipse with a Pair of Coordinate Axes.}\label{fig:tikz}
% \end{figure*}

% All material in floats should be legible and of good quality.
% So avoid very small or large text and pixelated or fuzzy lines.


\begin{acknowledgements} % will be removed in pdf for initial submission,
						 % (without ‘accepted’ option in \documentclass)
                         % so you can already fill it to test with the
                         % ‘accepted’ class option
The authors are grateful to Horatio Boedihardjo, Lajos Gergely Gyurko, Zacharia Issa, Terry Lyons, James Morrill, Harald Oberhauser, and Cristopher Salvi for their comments, feedback, and helpful discussions. JD was supported by the EPSRC Centre For Doctoral Training in Industrially Focused Mathematical Modelling (EP/L015803/1) in collaboration with Improbable. JD was also supported by the EPSRC grant EP/W002949/1 and The Alan Turing Institute under the EPSRC grant EP/N510129/1.
\end{acknowledgements}

% References
\bibliography{uai2024-template}

\newpage

\onecolumn

\title{Approximate Bayesian Computation with Path Signatures\\(Supplementary Material)}
\maketitle

\appendix

\section{A simple example of the Wasserstein curve matching distance for time series data}\label{app:WassersteinFailure}

In Section \ref{sec:back}, we discuss that the Wasserstein curve matching distance will generally permit permutations of the elements within the two time series that are being compared, and that this limits the suitability of the method for time series data -- a data type for which the ordering of the data is in general of ultimate importance.

As a simple example of this feature of the behaviour of the Wasserstein curve matching distance, consider the following two time series with elements observed at times $t = 0, 1, 2$:
\begin{align*}
    \bx &= (\bx_0, \bx_1, \bx_2) = (1, 3, 2)\\
    \by &= (\by_0, \by_1, \by_2) = (5, 1, 4).
\end{align*}
The matrix of pairwise distances for each element of $\bx$ with each element of $\by$ under the distance in Equation \eqref{eq:wass_curve} is then
\begin{equation}
    M(\bx, \by) = 
    \left[\begin{array}{CCC}
        4 & 0 & 3\\[4pt]
        2 & 2 & 1\\[4pt]
        3 & 1 & 2
    \end{array}\right]
    + \lambda 
    \left[\begin{array}{CCC}
        0 & 1 & 2\\[4pt]
        1 & 0 & 1\\[4pt]
        2 & 1 & 0
    \end{array}\right].
\end{equation}
With the choice of $\lambda = 1$, the minimal-cost assignment of elements in $\bx$ to elements in $\by$ will be to match $\bx_0$ with $\by_1$, $\bx_1$ with $\by_0$, and $\bx_2$ with $\by_2$, with a total associated cost of $2$. This example demonstrates that this approach continues to treat the elements of the time series as fundamentally exchangeable, despite the incorporation of information regarding their ordering in the distance measure used. Such an approach has limited suitability to time series data for this reason.

\section{Path signatures}\label{app:signatures}

\subsection{Further background on path signatures}

In the main text, we introduced path signatures as maps from $h \in \bv{\mathcal{H}}$ to elements of $\pH$ with finite norm, where the $m$th term of the signature takes value
\begin{equation}\label{eq:sig_terms}
    S_m(h) := \int_{0}^T {\d }h^{\otimes m} 
    := \int_{0}^T \int_{0}^t {\d }h^{\otimes (m-1)} \otimes {\d }h_t
    = \underset{0 \leq t_1 < \dots < t_m \leq T}{\int \dots \int} {\d }h_{t_1} \otimes \dots \otimes {\d }h_{t_m}.
\end{equation}
This introduction to path signatures, provided in Section \ref{sec:back}, is quite general. To introduce signatures more completely, it is instructive to consider %
the special case of a smooth, finite-dimensional path $x : [0,T] \to \mcal{H}$ with $\mcal{H} = \mathbb{R}^d$. The depth-$m$ term of the path signature for $x$ can then be written as
\begin{equation}
    S_m(x) = \int_{0}^T {\d }x^{\otimes m} = \underset{0 \leq t_1 < \dots < t_m \leq T}{\int \dots \int} \left( \frac{\d x}{\d t} \bigg\vert_{t_1} \otimes \dots \otimes \frac{\d x}{\d t} \bigg\vert_{t_m} \right){\d t_1} \dots {\d t_m},
\end{equation}
where for $a\in \mathbb{R}^{\alpha_1 \times \dots \times \alpha_k}$ and $b \in \mathbb{R}^{\beta_1 \times \dots \times \beta_l}$ the tensor product $\otimes$ operates as $\left(a_{i_1,\dots,i_k}, b_{j_1,\dots,j_l}\right)\\ \mapsto a_{i_1,\dots,i_k}b_{j_1,\dots,j_l}$ and the integrals can be taken in the Riemann-Stieltjes sense. 

\begin{Remark}
    Since we have assumed our paths to be of bounded variation, the integrals above can be understood as the Riemann-Stieltjes integrals with respect to $h$. When the underlying path is not smooth, the integrals are taken to be stochastic or rough path integrals \citep{chevyrev2018signature}. For example, in the case of Brownian motion in $\mathbb{R}^d$, the integrals are stochastic and can be taken in the Stratonovich sense. For a larger class of stochastic processes, rough path theory \citep{LyonsT.J2007Dedb} provides an integration theory that enables the computation of the terms in the signature. As we will discuss later, this work considers throughout only linear interpolations between points in time series, so all paths considered here are of finite variation.
\end{Remark}

Path signatures are thus infinite sequences of statistics for path-valued random variables capturing information regarding the order of observations along, and the interaction between, different channels of the path. They are grounded in the theory of \glspl{cde} and stochastic analysis, and appear in the solutions of \glspl{cde} and \glspl{sde} as obtained through a procedure analogous to Picard iterations for ordinary differential equations.

To see this, we follow \citet{LyonsT.J2007Dedb} and let $V$ and $W$ be two Banach spaces, $B : V \to \mathbf{L}(W, W)$ be a bounded linear map -- where $\mathbf{L}(W, W)$ denotes the space of bounded linear mappings from $W$ to $W$ -- and $h : [0,T] \to V$ be a continuous path of bounded variation. Consider the following set of linear equations:
\begin{flalign}\label{eq:cde_1}
    && {\d }g_t &= Bg_t\, {\d }h_t,\ \ g_0 \in W &&\\\label{eq:cde_2}
    && {\d }\phi_t &= B\phi_t\, {\d }h_t,\ \ \phi_0 \in \mathbf{L}(W, W). &&
\end{flalign}
Here, $Bg_t\, {\d }h_t$ is taken to mean $\left\{B({\d }h_t)\right\}(g_t)$ while $B\phi_t\, {\d }h_t$ is $B({\d }h_t)\circ \phi_t$. By applying the aforementioned iterative procedure to recover the solution $\phi_t$ to \eqref{eq:cde_2}, we obtain
\begin{equation}\label{eq:cde_sol}
    \phi_t = \sum_{m \geq 0} B^{\otimes m} \int_{0}^t {\d }h^{\otimes m},
\end{equation}
in which we see that the signature terms, Equation \eqref{eq:sig_terms}, appear in the summand. The solution to \eqref{eq:cde_1} is then obtained from the flow $\phi_t$ as $g_t = \phi_t(g_0)$. Similarly, a solution to the following linear \gls{sde} driven by Brownian motion $W$,
\begin{equation*}
    \d Y_t = A(Y_t) \circ \d W_t, \quad Y_0 = y_0
\end{equation*}
for some linear operator $A$, can be obtained as
\begin{equation*}
    Y_t = \sum_{m \geq 0} A^{\otimes m} S_{m,[0,t]}(W)\, y_0,
\end{equation*}
where $S_{m,[0,t]}(W)$ is the order-$m$ tensor in the signature of $W_t$ over interval $[0,t]$ and the integrals are taken in the Stratonovich sense \citep[][Section 3.3.2]{LyonsT.J2007Dedb}. As we have seen here, signatures arise naturally as good approximations to solutions of \glspl{cde} and \glspl{sde}, and accurately describe the response of systems such as that of Equations \eqref{eq:cde_1}-\eqref{eq:cde_2} to an input signal $h$, where the inclusion of terms of increasing order further refines the approximate solution. The above sums, such as in Equation \eqref{eq:cde_sol}, converge as a result of the factorial rate of decay of the terms in the signature:
\begin{Proposition}[Proposition 2.2, \citet{LyonsT.J2007Dedb}]\label{thm:fact_decay}
    Let $V$ be a Banach space and $h \in \bv{V}$. Then, for each $m \geq 0$,
    \begin{equation}
        \norm{\int_{0}^{T} {\d }h^{\otimes m}}{V^{\otimes m}} \leq \frac{\onevar{h}^m}{m!}.
    \end{equation}
\end{Proposition}

\begin{Remark}
The signature of a univariate path consists only of powers of the difference between the final and initial points in the stream \citep[see e.g.][Example 5]{Chevyrev2016}. Therefore in practice one always considers paths in at least two dimensions. This can always be achieved by including the observation time as a channel in the path.
\end{Remark}

\subsection{Invariances}\label{app:signature_invariances}

Further properties of the signature include its translation and reparameterisation invariance (Section 2.2.2, \citet{LyonsT.J2007Dedb}; Theorem 3.4.2, \citet{geng2015a}):
% \vspace{-0.5cm}
\begin{Proposition}
    Let $h \in \bv{\mcal{H}}$, $a \in \mcal{H}$, and $\psi : [0,T] \to [0,T]$ a non-decreasing surjection. Then $\sig{h + a} = \sig{h}$ and $\sig{h \circ \psi} = \sig{h}$. 
\end{Proposition}
\vspace{-0.1cm}
In this way, signatures are able to factor out nuisance and potentially infinite-dimensional symmetries where this is beneficial. However, when such invariances are disadvantageous, they can easily be destroyed with two extremely simple preprocessing techniques: \textit{time-augmentation}, in which the path $(t, h_t)$ is instead considered, and \textit{basepoint augmentation}, in which $h_0 = c$ for some fixed constant $c \in \mcal{H}$ is enforced for all paths under consideration.

Another more interesting invariance property results from the signature's inability to identify regions of the path in which, informally speaking, a retracing of the path occurs \citep{chen1958integration, Hambly_2010, BOEDIHARDJO2016720}; that is, for example, paths of the form $a \star b \star \overleftarrow{b} \star c$ for $a, b, c \in \bv{\mcal{H}}$, where $\star$ denotes concatenation and $\overleftarrow{b}$ is the path $b$ ``run-backwards''. Paths in which such retracings occur are referred to as \emph{tree-like equivalent} to their reduced paths such that, for example, $a \star b \star \overleftarrow{b} \star c \sim_t a \star c$, where $\sim_t$ denotes tree-like equivalence. %
While this phenomenon has previously been studied in more specific cases \citep{chen1958integration, Hambly_2010}, the most general form of this invariance property is provided by \citet{BOEDIHARDJO2016720}, a special case of which may be stated as follows:
% \vspace{-0.2cm}
\begin{Theorem}[\citet{Hambly_2010, BOEDIHARDJO2016720}]\label{thm:sig_inj_tle}
    Let $V$ be a Banach space and $h, g \in \bv{V}$. Then $\sig{h} = \sig{g}$ iff $h \sim_t g$.
\end{Theorem}
% \vspace{-0.2cm}
In the real world, however, tree-like equivalent paths are rare and can straightforwardly be avoided by considering only time-augmented paths $[0,T] \to [0,T] \times \mcal{H},\ t \mapsto (t, h_t)$. Such a transformation ensures that the path is injective, meaning no partial retracing can occur at any point along the path. This, along with their universal nonlinearity property, demonstrates that signatures are powerful and faithful representations of paths and are, essentially, an injective feature map for path-valued random variables. Signatures are therefore an appealing option for performing inference for stochastic process simulators.

\subsection{Shuffle-product property}\label{app:shuffle_sig}

The terms of the path signature exhibit a so-called \textit{shuffle-product} property: 

\begin{Theorem}[Theorem 2.29, \citet{LyonsT.J2007Dedb}]
    Let $h \in \bv{\mcal{H}}$. Then
    \begin{equation*}
        \int_{0}^T {\d }h^{\otimes m} \otimes \int_{0}^T {\d }h^{\otimes m'} = \sum_{\sigma} \sigma\left(\int_{0}^T {\d }h^{\otimes (m+m')} \right),
    \end{equation*}
    where the sum is taken over all order shuffles, defined as
    \begin{equation*}
        \{\sigma: \sigma \text{ is a permutation of } \lbrace{1, \dots, m + m'\rbrace}\\ \text{ with } \sigma(1) < \dots < \sigma(m), \sigma(m+1) < \dots < \sigma(m+m')\}.
    \end{equation*}
    $\sigma$ then acts on $\mcal{H}^{\otimes (m+m')}$ by permuting tensor positions, $\sigma(e_{i_1} \otimes \dots \otimes e_{i_{m+m'}}) = e_{i_{\sigma(1)}} \otimes \dots \otimes e_{i_{\sigma(m+m')}}$.
\end{Theorem}

\subsection{Function approximation capabilities}\label{app:un}

We state informally in the main text that signatures enjoy a universal nonlinearity property. This may be stated more formally as follows:
\begin{Theorem}[Appendix A.2, \citet{Kiraly2019}]\label{thm:un}
    Let $\mcal{K}$ be a compact set of non-tree-like (see Appendix \ref{app:signature_invariances}) paths of bounded variation, and $C(\mcal{K}, \mathbb{R})$ be the space of continuous, real-valued functions on $\mcal{K}$. Then the space of linear functionals on signatures of paths in $\mcal{K}$ is dense in $C(\mcal{K}, \mathbb{R})$; that is, for any $f \in C(\mcal{K}, \mathbb{R})$ and any $\varepsilon>0$, there exists an $L \in \bigoplus_{m\geq 0}\mcal{H}^{\otimes m}$ such that
    \begin{equation*}
        \sup_{h\in \mathcal{K}}\Big|f(h) - L\{\sig{h}\}\Big| < \varepsilon.
    \end{equation*}    
\end{Theorem}

This is a consequence of the \emph{shuffle product} property of signatures (see Appendix \ref{app:shuffle_sig} above) and the Stone-Weierstrass theorem. (An issue that arises in the application of the classical Stone-Weierstrass theorem in this context is that the space of interest to us -- $\bv{\mcal{H}}$ -- is not locally compact. The classical Stone-Weierstrass theorem therefore cannot strictly be applied here. However, \citet{chevyrev2018signature} demonstrate that a Stone-Weierstrass result exists by equipping the space of continuous bounded real-valued functions on $\bv{\mcal{H}}$ with an appropriate topology. See \citet{chevyrev2018signature} for details.)

\subsection{Additional pre-processing}\label{sec:PathTrans}
%
% \vspace{-0.2cm}
Prior to lifting the sequence to a path in $\mcal{H}$, and depending on the nature of the data at hand, it is sometimes appropriate to apply a transformation to the data: %
certain transformations may enable the signature to represent information in the stream more conveniently for the learning task at hand. A large set of such transformations have been proposed in the literature on inference using path signatures; see \citet{Morrill2020} for a recent summary and comparison of many of these. Here, we describe two such pre-signature transformations that we will use in this paper. %

% \vspace{-0.5cm}
\paragraph{Cumulative sum} Recall from Figure \ref{fig:geom} that the depth 1 signature terms correspond to the increment along the path, and that a subset of the depth 2 terms correspond to the areas above and below the curve. For certain data types, for example non-negative binary or spiking data, the data may not be well-characterised by these terms by default. In such cases it can be beneficial to consider instead the cumulative sum of the observations \citep{Kiraly2019}, which can intuitively be thought of as propagating information from earlier in the sequence to later in the stream, more readily exhibiting the structure of the stream. The effect of this can be to shift information into lower order terms in the signature, for example the increments (depth 1 terms).

% \vspace{-0.4cm}
\paragraph{Delay transformation} A transformation that is common in time series analysis is a delay transformation, for example the lag-1 delay transformation:
\begin{equation}
    (\bd{x}_{t_1}, \bd{x}_{t_2}, \dots, \bd{x}_{t_n}) \mapsto ((\bd{x}_{t_1}, \bd{x}_{t_2}), (\bd{x}_{t_2}, \bd{x}_{t_3}), \dots, (\bd{x}_{t_{n-1}}, \bd{x}_{t_n})).
\end{equation}
Applying this transformation before applying the signature may help to encode temporal features of the time series.

% \vspace{-0.3cm}
\subsubsection{Augmentations}
%
As noted previously, two augmentations can be applied to remove the signature's translation and reparameterisation invariance properties:

\paragraph{Time augmentation} In this transformation, the uniformly increasing time index $0 = t_1 < t_2 < \dots < t_n = T$ is added as a channel in the sequence:
\begin{equation}
    (\bd{x}_{t_1}, \bd{x}_{t_2}, \dots, \bd{x}_{t_n}) \mapsto \left((t_1, \bd{x}_{t_1}), (t_2, \bd{x}_{t_2}), \dots, (t_n, \bd{x}_{t_n})\right),
\end{equation}
denoting the times at which the points in the series occurred.

\paragraph{Basepoint augmentation} With this transformation, all sequences are enforced to assume a common but otherwise arbitrary initial value. This can be achieved by simply concatenating an arbitrary constant value to the beginning of each sequence.

\paragraph{Lead-lag transformation} This transformation operates on a sequence $\bd{x} = (\bd{x}_{t_1}, \bd{x}_{t_2}, \dots, \bd{x}_{t_n})$ as follows:
\begin{equation}
    (\bd{x}_{t_1}, \bd{x}_{t_2}, \dots, \bd{x}_{t_n}) \mapsto \left((\bd{x}_{t_1}, \bd{x}_{t_1}), (\bd{x}_{t_1}, \bd{x}_{t_2}), (\bd{x}_{t_2}, \bd{x}_{t_2}), \dots, (\bd{x}_{t_{n-1}}, \bd{x}_{t_n}), (\bd{x}_{t_n}, \bd{x}_{t_n})\right).
\end{equation}
Under this transformation, the number of channels in the sequence doubles, and the sequence length increases from $n$ to $2n-1$. Applying this transformation enables the signature to emphasise certain properties of the path %
such as the quadratic variation %
and the L\'{e}vy area when combined with the cumulative sum \citep{Gyurk2014, Chevyrev2016}. For datasets for which these quantities are believed to be important, applying the lead-lag transformation may be appropriate.

\section{Proof of Proposition \ref{thm:abc_convergence}}\label{app:sabc_proof_a}

In this section, we denote the space of piecewise linear paths of bounded variation in $\mcal{H}$ over time interval $[0,T]$ with $\Pl{\mcal{H}}$. We will furthermore abuse notation slightly by letting $\kap{\bd{x}} \in \Pl{\mcal{H}}$ denote the path in Equation \eqref{eq:lifted_path}, i.e. the linear interpolation of the lifted points $(\kap{\bd{x}_{t_1}}, \kap{\bd{x}_{t_2}}, \dots, \kap{\bd{x}_{t_n}})$, while denoting the feature map for $\bd{x}_{t}$ with $\kap{\bd{x}_t} \in \mcal{H}$. Finally, we will take $k(\bd{x}, \cdot) := \sig{\bd{x}}$ to mean the signature of the piecewise linear, $\mcal{H}$-valued path $\kap{\bd{x}}$, while $\sig{g}$ denotes the signature of a path $g \in \bv{\mcal{H}}$.

We %first 
demonstrate that the discrepancy measure in Equation \eqref{eq:sig_distance} satisfies the conditions specified in Proposition 3.1 of \citet{Bernton2019}, which gives a statement on the convergence of \gls{abc} posteriors to the true posterior under certain regularity conditions on the simulator's likelihood function as $\varepsilon\to 0$. A specific case of the statement is as follows:
\begin{Proposition}[Proposition 3.1, \citet{Bernton2019}]\label{thm:abc_convergence_app}
    Let $\mcal{X} := \mathbb{R}^d$, $\bd{y} = (\bd{y}_1, \dots, \bd{y}_n) \in \mcal{X}^n$, and $\mcal{D} : \mcal{X}^n \times \mcal{X}^n \to \mathbb{R}_{\geq 0}$ be a non-negative distance measure on $\mcal{X}^n$. Suppose $p_{\bth}(\bd{x})$ is the continuous density (with respect to the Lebesgue measure) associated with simulated data $\bd{x} \in \mcal{X}^n$ and that
    \begin{equation*}
        \sup_{\bth \in \bTh \setminus \mcal{N}_{\bTh}} p_{\bth}(\bd{x}) < \infty,
    \end{equation*}
    where $\mcal{N}_{\bTh}$ is a set such that $\pi(\bth) = 0\, \forall \bth \in \mcal{N}_{\bTh}$. Suppose further that there exists $\bar{\varepsilon} > 0$ such that
    \begin{equation*}
        \sup_{\bth \in \bTh \setminus \mcal{N}_{\bTh}} \sup_{\bd{z} \in \mcal{A}^{\bar{\varepsilon}}} p_{\bth}(\bd{z}) < \infty,
    \end{equation*}
    where $\mcal{A}^{\bar{\varepsilon}} := \lbrace{\bd{z} : \mcal{D}(\bd{y}, \bd{z}) \leq \bar{\varepsilon} }\rbrace$. Suppose that $\mcal{D}$ is continuous. If $\mcal{D}(\bd{y}, \bd{z}) = 0$ iff $\bd{y} = \bd{z}$, keeping $\bd{y}$ fixed, then for any measurable $\bd{B} \subset \bTh$,
    % 
    \begin{equation}
        \lim_{\varepsilon \to 0} \int_{\bd{B}} \pi_{\varepsilon}(\bth \mid \by)\, {\mathrm d}\bth = \int_{\bd{B}} \pi(\bth \mid \by)\, {\mathrm d}\bth.
    \end{equation}
\end{Proposition}

Therefore, provided that the stated regularity conditions on the simulator's likelihood function are met, showing that the distance function in Equation \eqref{eq:sig_distance} is continuous and injective is sufficient to show that the \gls{sabc} posterior converges to the true posterior as $\varepsilon \to 0$. These requirements are indeed met under the assumptions of Proposition \ref{thm:abc_convergence_app} and under additional benign conditions:
\begin{Proposition}\label{prop:sig_cont_abc}
    Let $\mcal{X} := \mathbb{R}^d$, $\bd{y} = (\bd{y}_1, \dots, \bd{y}_n) \in \mcal{X}^n$ be the fixed real-world dataset, and $\bd{x}$ be a simulated dataset. Assume both $\bd{y}$ and $\bd{x}$ are time- and basepoint-augmented, and that $\kappa : \mcal{X} \times \mcal{X} \to \mathbb{R}$ is a uniformly bounded kernel with continuous, injective canonical feature map. Let $\mcal{D}(\bd{y}, \cdot)$ be as in Equation \eqref{eq:sig_distance}, i.e.,
    \begin{equation}
        \mcal{D}(\bd{y}, \cdot) := \rho\left\{\sig{\bd{y}}, \cdot\right\} \circ \mathrm{Sig} \circ \kappa : \mcal{X}^n \to \mathbb{R}_{\geq 0},\ \ \ \bd{x} \mapsto \norm{\sig{\bd{y}} - \sig{\bd{x}}}{}^2
    \end{equation}
    consisting of lifting the sequence $\bd{x} \in \mcal{X}^n$ to a piecewise linear path in $\mcal{H}$, before computing the squared distance between its signature and $\sig{\bd{y}}$. Then this map is continuous.
\end{Proposition}

We will proceed by noting that each constituent map in the above operation is a continuous map, and the result follows since compositions of continuous maps are continuous.


\begin{Lemma}\label{lem:one-var}
    Let $\mcal{X}^n$ be the space of length-$n$ basepoint-augmented sequences in $\mcal{X} = \mathbb{R}^d$ and $\bd{x}, \bd{z} \in \mcal{X}^n$. Then the one-variation
    \begin{equation}
        \onevar{\bd{x}} = \sum_{i=1}^{n-1} \norm{\bd{x}_{i+1} - \bd{x}_{i}}{\mcal{X}}
    \end{equation}
    is a norm on $\mcal{X}^n$.
\end{Lemma}
\begin{proof}
    The triangle inequality follows immediately as a result of the triangle inequality for the norm on $\mcal{X}$:
    \begin{align*}
        \onevar{\bd{x} + \bd{z}} &= \sum_{i=1}^{n-1} \norm{(\bd{x}_{i+1} + \bd{z}_{i+1}) - (\bd{x}_i + \bd{z}_i)}{\mcal{X}}\\
        &\leq \sum_{i=1}^{n-1} \norm{\bd{x}_{i+1} - \bd{x}_{i}}{\mcal{X}} + \norm{\bd{z}_{i+1} - \bd{z}_i}{\mcal{X}}\\
        &= \onevar{\bd{x}} + \onevar{\bd{z}}.
    \end{align*}
    Absolute homogeneity is also immediate:
    \begin{equation*}
        \onevar{s\bd{x}} = \sum_{i=1}^{n-1} \norm{s\bd{x}_{i+1} - s\bd{x}_i}{\mcal{X}} = |s| \sum_{i=1}^{n-1} \norm{\bd{x}_{i+1} - \bd{x}_i}{\mcal{X}} = |s| \onevar{\bd{x}}.
    \end{equation*}
    Finally, since the streams are basepoint-augmented, meaning $\bd{x}_1 = 0$ for all $\bd{x} \in \mcal{X}^n$, we have that $\onevar{\bd{x}} = 0$ iff $\bd{x} = (0, 0, \dots, 0)$:
    \begin{equation*}
        \onevar{\bd{x}} = 0\ \Longrightarrow\ \norm{\bd{x}_{i+1} - \bd{x}_i}{\mcal{X}} = 0\ \forall\ i = 1, \dots, n-1\ \Longrightarrow\ \bd{x}_i = \bd{x}_1 = 0\ \forall\ i.
    \end{equation*}
\end{proof}



We next show that lifting length-$n$ basepoint-augmented sequences in $\mcal{X}$ to sequences in $\mcal{H}$ is continuous if the canonical feature map $\phi$ associated with $\kappa$ is itself continuous:

\begin{Lemma}
Let $\mcal{X}^n$ be the space of length-$n$ basepoint-augmented sequences in $\mcal{X} = \mathbb{R}^d$, $\bd{x}, \bd{z} \in \mcal{X}^n$, and $\phi : \mcal{X} \to \mcal{H}$ be the canonical feature map associated with kernel $\kappa$ with \gls{rkhs} $\mcal{H}$. Assume $\phi$ is continuous. Then the map $\bd{x} \mapsto \kap{\bd{x}}$ -- where $\kap{\bd{x}}$ is the linear interpolation of the  points $(\phi(\bd{x}_1), \dots, \phi(\bd{x}_n))$ in $\mcal{H}$ -- is continuous in the one-variation topology.
\end{Lemma}

\begin{proof}
By Lemma \ref{lem:one-var}, the one-variation is a norm on length-$n$ basepoint-augmented sequences in $\mcal{X}$. We will proceed by showing that the one-variation is an equivalent norm to the 1-product norm, defined as
\begin{equation}
    \norm{\bd{x}}{\mcal{X}^n} := \sum_{i=1}^n \norm{\bd{x}_i}{\mcal{X}},
\end{equation}
which induces the product topology on $\mcal{X}^n$. By showing this, we will have the following implications:  from the definition of the 1-product norm,
\begin{equation}
    \norm{\bd{x} - \bd{z}}{\mcal{X}^n} < \tilde{\delta} \Longrightarrow \norm{\bd{x}_i - \bd{z}_i}{\mcal{X}} < \tilde{\delta}\ \text{for each } i = 1, \dots, n;
\end{equation}
by continuity of $\phi$, we have that $\forall\, \tilde{\epsilon} > 0$, $\exists\, \tilde{\delta} > 0$ such that
\begin{equation}
    \norm{\bd{x}_i - \bd{z}_i}{\mcal{X}} < \tilde{\delta} \Longrightarrow \norm{\phi(\bd{x}_i) - \phi(\bd{z}_i)}{\mcal{H}} < \tilde{\epsilon};
\end{equation}
and choosing $\tilde{\epsilon} = \epsilon/\{2(n-1)\}$ for any $\epsilon > 0$, so that $\norm{\phi(\bd{x}_i) - \phi(\bd{z}_i)}{\mcal{H}} < \tilde{\epsilon}$ for all $i$, gives
\begin{align}\nonumber
    \onevar{\kap{\bd{x}} - \kap{\bd{z}}} &= \sum_{i=1}^{n-1} \norm{\{\phi(\bd{x}_{i+1}) - \phi(\bd{z}_{i+1})\} - \{\phi(\bd{x}_i) - \phi(\bd{z}_i)\}}{\mcal{H}} \\\nonumber
    &\leq \sum_{i=1}^{n-1} \norm{\phi(\bd{x}_{i+1}) - \phi(\bd{z}_{i+1})}{\mcal{H}} + \norm{\phi(\bd{x}_i) - \phi(\bd{z}_i)}{\mcal{H}}\\\nonumber
    &< 2(n-1)\tilde{\epsilon}\\
    &= \epsilon.
\end{align}
We therefore have the following chain of implications: for every $\epsilon > 0$ there is a $\tilde{\delta} > 0$ such that
\begin{align}\nonumber
    \norm{\bd{x} - \bd{z}}{\mcal{X}^n} < \tilde{\delta} \Longrightarrow \norm{\bd{x}_i - \bd{z}_i}{\mcal{X}} < \tilde{\delta} \Longrightarrow \norm{\phi(\bd{x}_i) - \phi(\bd{z}_i)}{\mcal{H}} < \tilde{\epsilon}\\ 
    \quad \quad \Longrightarrow \onevar{\kap{\bd{x}} - \kap{\bd{z}}} < \epsilon.
\end{align}
It therefore suffices to show that for any $\tilde{\delta} > 0$ there is a $\delta > 0$ such that $\onevar{\bd{x} - \bd{z}} < \delta \Longrightarrow \norm{\bd{x} - \bd{z}}{\mcal{X}^n} < \tilde{\delta}$, which by this chain of implications would imply that $\forall\, \epsilon > 0$, $\exists\, \delta >0$ such that $\onevar{\bd{x} - \bd{z}} < \delta \Longrightarrow \onevar{\kap{\bd{x}} - \kap{\bd{z}}} < \epsilon$. 
%
% 
This follows immediately, since $\onevar{\cdot}$ and $\norm{\cdot}{\mcal{X}^n}$ are norms on the same finite-dimensional vector space, and are thus equivalent norms. In particular, there exists a constant $c > 0$ such that $\norm{\bd{x}}{\mcal{X}^n} \leq \onevar{\bd{x}}/c$ for all $\bd{x}$, so that for all $\tilde{\delta} > 0$ we have that
\begin{equation}
    \onevar{\bd{x} - \bd{z}} < \delta := c\tilde{\delta}\Longrightarrow \norm{\bd{x} - \bd{z}}{\mcal{X}^n} < \tilde{\delta},
\end{equation}
and so we are done.
\end{proof}


We consider next the continuity of the signature map for piecewise linear paths of bounded variation in $\mcal{H}$. For such paths, the signature truncated at degree $1$ is a multiplicative functional with bounded variation (see \citet[][Section 3.1.2]{lyons2002system}) and, consequently, a special case of \citet[][Theorem 3.1.3]{lyons2002system} applies:
\begin{Lemma}\label{lemma:extension}
    Let $V$ be a Banach space, $x, z \in \bv{V}$ be two bounded variation paths in $V$, and $\tau$ be a constant such that
    \begin{equation*}
        \tau \geq 2 \left\{1 + \sum_{r = 3}^{\infty} \left(\frac{2}{r-2}\right)^2\right\}.
    \end{equation*}
    If $\varphi$ is a constant such that
    \begin{equation*}
        \onevar{x},\, \onevar{z} \leq \frac{\varphi}{\tau}\quad \quad \text{ and }\quad \quad \onevar{x - z} \leq \chi\, \frac{\varphi}{\tau}
    \end{equation*}
    for some $\chi > 0$, then for all $m \geq 1$
    \begin{equation}\label{eq:bound_sig_terms}
        \norm{S_m(x) - S_m(z)}{V^{\otimes m}} \leq \frac{\chi}{\tau} \cdot \frac{\varphi^m}{m!}.
    \end{equation}
\end{Lemma}

An immediate consequence of this is that the signature map is continuous in the 1-variation topology for bounded variation paths in Banach spaces:
\begin{Corollary}
    Let $\mcal{H}$ be a Hilbert space, $x, z \in \bv{\mcal{H}}$ be two bounded variation paths in $\mcal{H}$, and $\tau$ be as in Lemma \ref{lemma:extension}. 
    If $\varphi$ is a constant such that
    \begin{equation*}
        \onevar{x},\, \onevar{z} \leq \frac{\varphi}{\tau}\quad \quad \text{ and }\quad \quad \onevar{x - z} \leq \chi\, \frac{\varphi}{\tau}
    \end{equation*}
    for some $\chi > 0$, then
    \begin{equation*}
        \norm{\sig{x} - \sig{z}}{} \leq \frac{\chi}{\tau} \exp\left({\frac{\varphi^2}{2}}\right).
    \end{equation*}
\end{Corollary}
\begin{proof}
By definition of the norm on $\pH$,
    \begin{flalign*}
        && \norm{\sig{x} - \sig{z}}{} &= \sqrt{\sum_{m \geq 0} \norm{S_m(x) - S_m(z)}{\mcal{H}^{\otimes m}}^2} &&\\
        && &= \sqrt{0 + \sum_{m \geq 1} \norm{S_m(x) - S_m(z)}{\mcal{H}^{\otimes m}}^2} && (S_0(x) = 1\, \forall x \in \bv{\mcal{H}})\\
        && &\leq \sqrt{\sum_{m\geq 1} \frac{\chi^2}{\tau^2} \cdot \left(\frac{\varphi^m}{m!}\right)^2} && \text{(from  \eqref{eq:bound_sig_terms} above)}\\
        && &= \frac{\chi}{\tau} \sqrt{\sum_{m\geq 1} \frac{\left(\varphi^2\right)^m}{\left(m!\right)^2}} &&\\
        && &\leq \frac{\chi}{\tau} \sqrt{\sum_{m\geq 1} \frac{\left(\varphi^2\right)^m}{m!}} && \text{(smaller denominator)}\\
        && &\leq \frac{\chi}{\tau} \exp\left(\frac{\varphi^2}{2}\right). && \text{(convergent series)}
        \end{flalign*}
\end{proof}

We show next that the map $\rho\left(\sig{\bd{y}}, \cdot\right) : \pH \to \mathbb{R}_{\geq 0},\, s \mapsto \norm{\sig{\bd{y}} - s}{}^2$ is continuous. To do so, we make use of the following result:
\begin{Lemma}\label{lem:bounded_sig}
Let $\kappa$ be a uniformly bounded kernel, i.e.\ one for which $\sup_{x \in \mcal{X}} \sqrt{\kappa(x, x)} < \infty$, and let $\kap{\bd{x}} \in \Pl{\mcal{H}}$ be an $\mcal{H}$-valued piecewise linear path with knots at $\kap{\bd{x}_i}, i= 1, \dots, n$, and $\sig{\bd{x}}$ its signature.
Then
\begin{equation}
    \sup_{\bd{x} \in \mcal{X}^n} \norm{\sig{\bd{x}}}{} < \infty.
\end{equation}
\end{Lemma}

\begin{proof}
For all $\bd{x} \in \mcal{X}^{n}$, we have 
\begin{flalign*}
&& \onevar{ \kap{\bd{x}} } &= \sum_{i=1}^{n-1} \norm{\kap{\bd{x}_{i+1}} - \kap{\bd{x}_{i}}}{\mcal{H}} && \text{(piecewise linear)}\\
&& &\leq \sum_{i=1}^{n-1} \norm{\kap{\bd{x}_{i+1}}}{\mcal{H}} + \norm{\kap{\bd{x}_{i}}}{\mcal{H}} && \text{(triangle inequality)}\\
&& &= \sum_{i=1}^{n-1} \sqrt{\kappa(\bd{x}_{i+1}, \bd{x}_{i+1})} + \sqrt{\kappa(\bd{x}_{i}, \bd{x}_{i})} && \text{(reproducing property)}\\
&& &\leq 2(n-1) \sup_{z \in \mcal{X}} \sqrt{\kappa(z,z)}. && \text{($\kappa$ bounded)}
\end{flalign*}
Let $v := 2(n-1) \sup_{z \in \mcal{X}} \sqrt{\kappa(z,z)}$. Then $\forall \bd{x} \in \mcal{X}^n$,
\begin{align*}
        \norm{\sig{\bd{x}}}{} &\leq \left\{\sum_{m=0}^{\infty} \frac{(\onevar{\kap{\bd{x}}}^2)^m}{\left(m!\right)^2} \right\}^{\frac{1}{2}} && \text{(Proposition \ref{thm:fact_decay})}
        \\
        &\leq \left\{\sum_{m=0}^{\infty} \frac{(v^2)^m}{m!} \right\}^{\frac{1}{2}} && \\
        &= e^{\frac{v^2}{2}}, && \text{(exponential series)}
\end{align*}
where in the first inequality we make use of the factorial decay property of signatures. We obtain the result by taking the supremum over $\mcal{X}^n$:
\begin{equation*}
    \sup_{\bd{x} \in \mcal{X}^n} \norm{\sig{\bd{x}}}{} \leq e^{\frac{v^2}{2}} < \infty.
\end{equation*}
\end{proof}

\begin{Lemma}\label{lem:ubov}
Let $\kappa$ be a uniformly bounded kernel i.e. one for which $\sup_{z \in \mcal{X}} \sqrt{\kappa(z, z)} < \infty$, and let $\kap{\bd{y}}\in \Pl{\mcal{H}}$ be the observed $\mcal{H}$-valued piecewise linear path with $\sig{\bd{y}}$ its signature. Denote the signature kernel as 
\begin{equation}
    k(\bd{x}, \bd{z}) = \langle{\sig{\bd{x}}, \sig{\bd{z}}}\rangle
\end{equation}
Then the distance function 
%%
\begin{equation}
\rho\left(\sig{\bd{y}}, \cdot\right) : \prod_{m \geq 0}\mcal{H}^{\otimes m} \to \mathbb{R}_{\geq 0},\ \ \ s \mapsto \| s - \sig{\bd{y}}\|^2
\end{equation}
% 
%%
% 
is Lipschitz continuous in $s$.
\end{Lemma}

\begin{proof}
\begin{flalign*}
&& \big\vert{ \mcal{D}(\bd{y}, \bd{x}) - \mcal{D}(\bd{y}, \bd{z}) }\big\vert &= \Big\vert{ \norm{\sig{\bd{x}} - \sig{\bd{y}}}{}^2 - \norm{\sig{\bd{z}} - \sig{\bd{y}}}{}^2 }\Big\vert &&\\
&& &= \Big\vert{ k(\bd{x}, \bd{x}) - k(\bd{z}, \bd{z}) + 2\left(k(\bd{z}, \bd{y}) - k(\bd{x}, \bd{y})\right) }\Big\vert &&\\
&& &\leq \Big\vert{ k(\bd{x}, \bd{x}) - k(\bd{z}, \bd{z}) }\Big\vert + 2\Big\vert{ k(\bd{z}, \bd{y}) - k(\bd{x}, \bd{y}) }\Big\vert && \text{(triangle inequality)}
\end{flalign*}
Considering the first of these terms and making use of the reproducing property and symmetry of $k$:
\begin{flalign*}
&& \Big\vert{ k(\bd{x}, \bd{x}) - k(\bd{z}, \bd{z}) }\Big\vert &= \Big\vert{ k(\bd{x},\bd{x}) - k(\bd{x},\bd{z}) + k(\bd{z},\bd{x}) 
 - k(\bd{z},\bd{z})\Big\vert} &\\
&& &= \Big\vert{\langle{k(\bd{x},\cdot), k(\bd{x}, \cdot) - k(\bd{z}, \cdot)\rangle} + \langle{k(\bd{z},\cdot), k(\bd{x}, \cdot) - k(\bd{z}, \cdot)\rangle}\Big\vert} && \\
&& &\leq \Big\vert{\langle{k(\bd{x},\cdot), k(\bd{x}, \cdot) - k(\bd{z}, \cdot)\rangle}}\Big\vert + \Big\vert{\langle{k(\bd{z},\cdot), k(\bd{x}, \cdot) - k(\bd{z}, \cdot)\rangle}\Big\vert} && \\
&& &\leq (\norm{\sig{\bd{x}}}{} + \norm{\sig{\bd{z}}}{}) \cdot \norm{\sig{\bd{x}} - \sig{\bd{z}}}{}, &&
\end{flalign*}
where in the penultimate and final lines we use the triangle inequality and the Cauchy-Schwarz inequality twice, respectively. Considering now the second term:
\begin{flalign*}
&& \Big\vert{ k(\bd{z}, \bd{y}) - k(\bd{x}, \bd{y}) }\Big\vert &= \Big\vert{ \langle{\sig{\bd{y}}, \sig{\bd{x}} - \sig{\bd{z}}\rangle} }\Big\vert &&\\
&& &\leq \norm{\sig{\bd{y}}}{} \norm{\sig{\bd{x}} - \sig{\bd{z}}}{}, && \text{(Cauchy--Schwarz)}
\end{flalign*}
where in the first line we use the definition and symmetry of the inner product. Putting the two terms together and using Lemma \ref{lem:bounded_sig}, we have
\begin{align*}
    \big\vert{ \mcal{D}(\bd{y}, \bd{x}) - \mcal{D}(\bd{y}, \bd{z}) }\big\vert &\leq \left(\norm{\sig{\bd{x}}}{} + \norm{\sig{\bd{z}}}{} + 2\norm{\sig{\bd{y}}}{} \right)\norm{\sig{\bd{x}} - \sig{\bd{z}}}{}\\
    &\leq 4 e^{\frac{v^2}{2}}\norm{\sig{\bd{x}} - \sig{\bd{z}}}{}
\end{align*}
where $v$ is as in Lemma \ref{lem:bounded_sig}. Thus $\rho\left(\sig{\bd{y}}, \cdot\right)$ is Lipschitz continuous.
\end{proof}

We finally arrive at the conclusion:

\begin{proof}[Proof of Proposition \ref{prop:sig_cont_abc}]
Compositions of continuous maps are continuous, and each of the constituent maps is continuous by the Lemmas and Corollaries presented above.
\end{proof}

Injectivity of the signature map is also guaranteed under these conditions: 
\begin{Proposition}\label{prop:sig_inj_abc}
    Let $\mcal{X} := \mathbb{R}^d$, $\bd{x}, \bd{y} \in \mcal{X}^n$. Assume both $\bd{x}$ and $\bd{y}$ are time- and basepoint-augmented, and that $\kappa : \mcal{X} \times \mcal{X} \to \mathbb{R}$ is a uniformly bounded kernel with continuous, injective canonical feature map. Then $\sig{\bd{x}} = \sig{\bd{y}}$ iff $\bd{x} = \bd{y}$.
\end{Proposition}

\begin{proof}
    Obtaining a signature from a length-$n$ data stream $\bd{x}$ entails: (1) lifting the points $\bd{x}_i$ in $\bd{x}$ to the \gls{rkhs} $\mcal{H}$ associated with $\kappa$ as $\kap{\bd{x}_i}$; (2) applying a linear interpolation to obtain a piecewise linear $\mcal{H}$-valued path $\kap{\bd{x}}$; and (3) finally taking the signature of $\kap{\bd{x}}$. To show injectivity of this composite map, it suffices to show injectivity of each of these three steps since the composition of injective maps is injective.
    
    (1) is trivially injective, due to the assumed injectivity of the canonical feature map of $\kappa$. (2) is by definition injective for a length-$n$ sequence in $\mcal{H}$. To show injectivity of (3), we note that time-augmentation of the sequences, along with injectivity of the feature map, ensures that the lifted paths are injective, such that no tree-like equivalence is observed between the interpolated paths in $\mcal{H}$. Time-augmentation further makes the signature sensitive to parameterisation, removing its parameterisation invariance property. Uniform boundedness of $\kappa$ ensures that $\kap{\bd{x}}$ is of bounded variation, such that $\kap{\bd{x}} \in \Pl{\mcal{H}}$. To see this, note that for a piecewise linear path $\kap{\bd{x}}$,
    \begin{equation*}
        \onevar{\kap{\bd{x}}} = \sum_{i = 1}^{n-1} \norm{\kap{\bd{x}_{i+1}} - \kap{\bd{x}_{i}}}{\mcal{H}} \leq 2(n-1)\sup_{\bd{z} \in\mcal{X}} \sqrt{\kappa(\bd{z}, \bd{z})} < \infty,
    \end{equation*}
    where we have used the reproducing property of $\kappa$ and the triangle inequality. Finally, since basepoint augmentation makes the signature sensitive to paths that differ only by translations, the desired result follows from Theorem \ref{thm:sig_inj_tle}.
\end{proof}

\section{Further experimental details}\label{app:exp}

\subsection{Signature Regression ABC}\label{app:skabc}

For \gls{skrr}, we proceed as follows: 
\begin{enumerate}
    \item[(a)] fit a kernel ridge regression model using training data $\lbrace{\bx^{(i)}, \bth^{(i)}\rbrace}_{i=1}^{R} \sim p_{\bth}(\bx)\,\pi(\bth)$. This amounts to solving the following optimisation problem for each of the $p$ components $j=1,\dots,p$ of the $\lbrace{\bth^{(i)}\rbrace}_{i=1}^{R}$:
    \begin{equation}
        \min_{\hat{\bth}_{j}\in \mathcal{H}_k} \sum_{i=1}^{R} \left\{\bth_j^{(i)} - \hat{\bth}_{j}\left(\bx^{(i)} \right)\right\}^2 + \alpha \| \hat{\bth}_{j} \|^{2}_{\mathcal{H}_k},
    \end{equation}
    where $k$ is the signature kernel, $\mathcal{H}_k$ is the \gls{rkhs} associated with $k$, $\hat{\bth}_{j}$ is -- by the Representer Theorem -- a function of the form
    \begin{equation}
        \hat{\bth}_{j}(\bx) = \sum_{i=1}^{R} \boldsymbol{\omega}^{(j)}_i k(\bx, \bx^{(i)})
    \end{equation}
    with
    \begin{equation*}
        \boldsymbol{\omega}^{(j)} = \left(G + \alpha I_R\right)^{-1}\bpsi^{(j)},\qquad G_{mn} = k(\bx^{(m)}, \bx^{(n)}),\qquad %\\[1ex] %
        \bpsi^{(j)} = 
            \left[
            \begin{array}{c}
                \bth_{j}^{(1)}\\ \bth_{j}^{(2)}\\ \vdots\\ \bth_{j}^{(R)}
            \end{array}
            \right],
            \quad \quad I_R = \text{diag}(1, 1, \dots, 1) \in \mathbb{R}^{R\times R},
        % 
    \end{equation*}
    and $\alpha \geq 0$ is a regularisation parameter;
    \item[(b)] summarise the observation $\by$ and all future simulations $\bx\sim p_{\bth}$ using this trained kernel ridge regression model, i.e. use
    \begin{equation}
        \bs(\bx) = \left[
        \begin{array}{c}
            \hat{\bth}_{1}\left(\bx\right)\\ \hat{\bth}_{2}\left(\bx\right)\\ 
            \vdots \\ 
            \hat{\bth}_{p}\left(\bx\right)
        \end{array}\right];
    \end{equation}
    \item[(c)] use the squared difference between the summaries of $\by$ and $\bx$ as the measure of discrepancy between simulation and observation,
    \begin{equation}
        \rho\left\{ \bs(\by), \bs(\bx)\right\} = \| \bs(\by) - \bs(\bx) \|^2_2.
    \end{equation}
\end{enumerate}

\subsection{Further implementation details}\label{app:imp_det}

For all signature kernel computations, we use the \texttt{sigkernel} package \citep{salvi2020computing} and we normalise the time series by % 
dividing by the range of the simulation output when this is known or, when this is unknown, by the expected range estimated from the training set of size $R=300$ for \gls{skrr} or from $R=300$ samples from the prior predictive distribution for \gls{sabc}. % 

Unless stated otherwise, we remove the translation invariance and reparameterisation-invariance properties of the signature -- discussed in Appendix \ref{app:signature_invariances} -- by applying basepoint and time-augmentations to all time series in every experiment.

Unless stated otherwise, we take $\kappa$ to be a Gaussian RBF kernel with scale hyperparameter $\sigma$. To tune $\sigma$ and the regularisation hyperparameter for \gls{skrr}, we perform a grid search with 5-fold cross-validation on the training set. For \gls{sabc}, we use the median of all pairwise Euclidean distances between points in the observation $\by$ for $\sigma$, although we note that other approaches could be taken, such as using the same method as for \gls{skrr}. 

Both \gls{saabc} and \gls{skrr} %
require training data; for both we use $R = 300$ training examples $\lbrace \bx^{(j)}, \bth^{(j)} \rbrace_{j=1}^{R} \sim p_{\bth}(\bx)\pi(\bth)$. When $\pi(\cdot)$ has bounded support, we normalise the parameters $\lbrace \bth^{(j)} \rbrace_{j=1}^{R}$ in the training set with the range of the prior in each dimension. We also tune the bandwidth parameter for the Gaussian RBF kernel employed in the \gls{mmd} distance for \gls{k2abc} using the median of the pairwise absolute differences between observations in $\by$, as recommended by \citet{Park2016}. 

In all experiments, ``Wasserstein'' and ``W-ABC'' indicate the use of the 1-Wasserstein distance with curve matching, which as described in Section \ref{sec:back} is a method for using the Wasserstein distance for time series recommended in \citet{Bernton2019}, in the rejection ABC sampling scheme. To determine the $\lambda$ coefficient, we follow the guidance of \citet{thorpe2017transportation} and choose
\begin{equation}
    \lambda \simeq \frac{V}{T},
\end{equation}
where $V$ is the expected vertical range and $T$ is the length of the time interval over which observations are made, in order to balance the effects of vertical and horizontal transport. Where the value of $V$ is not apparent \emph{a priori}, we estimate it using $R=300$ samples from the prior predictive distribution. Distances are computed using the Python Optimal Transport package \citep{flamary2021pot}.


\subsection{Reference Posteriors using MCMC}
\label{app:mcmc}
\glsresetall

\paragraph{Metropolis-Hastings} For the \gls{gbm} and Brock \& Hommes models, we obtain samples from the ground truth posterior using \gls{mh}. We follow the guidelines of \citet{schmon2021optimal} and use a multivariate normal proposal, for which we estimate the covariance matrix using a pilot run. We subsequently tune the \gls{mh} algorithm according to \citet[][Table 1]{schmon2021optimal} and run the \gls{mh} for $10^5$ steps, keeping a thinned subset of $10^3$ samples as our baseline.

\paragraph{Particle MCMC} To obtain samples from the ground truth posterior of the Ricker model we employ \gls{pmcmc} using a simple bootstrap particle filter. 
We follow the guidelines of \citet{SchmonDeligiannidisDoucet2018a}, first estimating the posterior covariance in a shorter pilot run and then tuning the random walk proposal as well as the particle filter. %to follow the \citet[][Table 1]{SchmonDeligiannidisDoucet2018a}. 
\Gls{pmcmc} commonly exhibits worse convergence behaviour than standard \gls{mh} and hence we run the algorithm for $2\times 10^5$ iterations, eventually retaining a thinned subset of $10^3$ samples as our baseline.

\subsection{Further experimental details: The Ricker model}

The time series generated by the Ricker model tend to consist of many zero terms, with occasional spikes. For this reason, we use the cumulative sum pre-signature transformation (see Appendix \ref{sec:PathTrans}) for \gls{sabc}, which is a common transformation for spiking data such as medical data \citep{Morrill2019}. In our experiments, we also found that W-ABC and K2-ABC benefited from this transform and were not competitive without it. We therefore also report the results obtained with W-ABC and K2-ABC with this cumulative sum transform applied.

\subsection{Further experimental results: Geometric Brownian motion}

\begin{figure*}
\centering
\includegraphics[width=\columnwidth]{fig2.png}
\caption{(Geometric Brownian motion) Examples of marginal posterior distributions recovered using each distance function and the approximate ground-truth posterior recovered with a Metropolis-Hastings (\gls{mh}) random walk. Panels \textbf{a} and \textbf{b} show the marginal posteriors recovered using our signature methods (\gls{sabc} and \gls{skrr}) and the approximate ground-truth posterior (\gls{mh}). Panels \textbf{c} and \textbf{d} show the marginal posteriors recovered using the Wasserstein curve matching 
distance (Wasserstein), \gls{k2abc} (\gls{mmd}), and semi-automatic \gls{abc} with powers of the variance and lag-1 and -2 autocorrelations of the increments of the log time series as regressors (\gls{saabc}).\label{fig:GBM_marginals}}
\end{figure*}

We show in Figure % 
%
\ref{fig:GBM_marginals} % 
the marginal posteriors recovered using the \acrfull{mh} approximation (see Appendix \ref{app:mcmc} for details) and the true likelihood function, along with the approximate posteriors obtained using the rejection sampling scheme in Algorithm \ref{alg:Rej} and each of the distance measures considered. The suffix ``(delay)'' once again indicates that the lag-1 delay transformation was applied. 
% Of all methods, 
From this, we see that \gls{skrr} and \gls{sabc} track the shape of the approximate ground-truth marginal posterior generated by \gls{mh} for $\mu$ more closely than all other methods, and that their marginal distributions for $\sigma$ %they %are the most concentrated 
concentrate in the neighbourhood of the approximate ground-truth marginal posterior for $\sigma$. This is in contrast to, for example, the \gls{mmd}, which is overly dispersed and biased for $\sigma$.

In this example, \gls{saabc} has been able to very accurately approximate the marginal density for $\sigma$ as a consequence of the informative set of summary statistics provided to this method. However, \gls{saabc} has experienced difficulty recovering the shape of the marginal density for $\mu$, despite the provided summary statistics also being informative of this parameter. The fact that the signature- and Wasserstein-based methods are able to outperform \gls{saabc}, despite the advantage the latter has been afforded, illustrates the potential power of these methods in cases where the model structure is too complex to easily derive summary statistics that are informative of the parameters.  

\subsection{Further experimental details: generalised stochastic epidemics}\label{app:gse}

For the priors reported in the main text, the posterior density can be written as
\begin{equation}\label{eq:GSE_post}
    \pi(\beta, \gamma \mid \mathbf{I}, \mathbf{R}) \propto \beta^{\lambda_{\beta} + n_{I} - 2} \exp\left\{ -\beta \left(\int_{\phi_1}^{T} X_t Y_t\, {\mathrm d} t + \nu_{\beta}\right) \right\}
    \gamma^{\lambda_{\gamma} + n_{R} - 1 }\exp\left\{ -\gamma \left(\int_{\phi_1}^{T} Y_t\, {\mathrm d} t + \nu_{\gamma}\right) \right\},
\end{equation}
where $\mathbf{I}$ and $\mathbf{R}$ are the infection and recovery times, respectively, $n_I$ and $n_R$ are the total number of individuals in the model that are infected and that recover over the course of the simulation, respectively, and $\phi_1$ is the time of the first infection.

To perform \gls{sabc}, we bring all three channels of the multivariate stream --- the number of infected individuals, number of recovered individuals, and time --- into the range $[0,1]$ by dividing by $Z$, $Z$, and $T$, respectively. For W-ABC (``Wasserstein''), we set $\lambda = 2$, since the expected vertical range is approximately twice that of the horizontal range $T=50$ when $Z=100$.


\subsection{Further experiment: The Brock \& Hommes model}

In this experiment, we consider a heterogeneous agent model proposed by \citet{BROCK19981235} which simulates the dynamics of a set of traders operating under different trading strategies. The system of coupled equations comprising the model may be written succinctly with the following transition density:
\begin{equation*}
    p_{\bth}(\bd{y}_{t+1} \mid \bd{y}_{1:t}) = \mathcal{N}\left\{f(\bd{y}_{t-2:t}, \bth), \frac{\sigma^2}{R^2}\right\},
\end{equation*}
where
\begin{equation*}
    f\left(\bd{y}_{t-2:t}, \bth\right) = \frac{1}{R}\sum_{j=1}^{J} \frac{\exp{\left\{\beta \left(\bd{y}_t - R \bd{y}_{t-1}\right)\left(g_j \bd{y}_{t-2} + b_j - R \bd{y}_{t-1}\right)\right\}}}{\sum_{j' = 1}^{J} \exp{\left\{\beta \left(\bd{y}_t - R \bd{y}_{t-1}\right)\left(g_{j'} \bd{y}_{t-2} + b_{j'} - R \bd{y}_{t-1}\right)\right\}}}\left(g_j \bd{y}_t + b_j\right)
\end{equation*}
and $R, \beta, \sigma$ are parameters. Since this transition density is available in closed form, the likelihood of an observed series is tractable, and we are therefore able to obtain an approximate ground-truth posterior with standard \gls{mcmc} techniques such as \gls{mh}. We follow \citet{PLATT2020103859, dyer2022black} and assume the following parameter values: $J=4, R = 1.0, \sigma = 0.04, \beta = 10, g_1 = b_1 = b_4 = 0$ and $g_4 = 1.01$. 

The parameters $g_j \in \mathbb{R}$ capture the trend-following tendencies of the agents, while the parameters $b_j \in \mathbb{R}$ determine the biases towards different trading strategies. In our experiments, we consider the task of estimating the posterior $\pi\left(\bth \mid \by\right)$, where $\bth = \left(g_2, b_2, g_3, b_3\right)$, $\by = (\bd{y}_1, \dots, \bd{y}_n) \sim p_{\bth^{*}}$ is the pseudo-observation, $n=100$, and $\bth^{*} := (-0.7, -0.4, 0.5, 0.3)$ is the parameter setting used to generate $\by$. 

% \begin{sidewaysfigure}
\begin{figure}
    \centering
    \includegraphics[width=\linewidth]{fig4.png}
    \caption{(Brock \& Hommes) (\textbf{a}) Wasserstein distances between the posteriors recovered from the different distance measures and samples from the exact posterior. (\textbf{b}) Maximum mean discrepancies between the posteriors recovered from the different distance measures and samples from the exact posterior. (\textbf{c}) Squared distances between the means of the \gls{abc} posteriors and the exact posterior mean. Our methods are shown in blue.}
    \label{fig:bhn_metrics}
% \end{sidewaysfigure}
\end{figure}

We show in Figure \ref{fig:bhn_metrics} boxplots for the Wasserstein distance and MMD between the \gls{abc} posteriors, denoted with $\hat{\pi}_{\mathrm{ABC}}$, and the approximate ground-truth posterior obtained with \gls{mh}, denoted with $\hat{\pi}_{\cdot\mid\by}$. We also show boxplots for the Euclidean distance between the \gls{abc} posterior means and the \gls{mh} posterior mean. These boxplots were created by running the \gls{rej} algorithm with the same 20 random seeds. In this experiment, \gls{saabc} uses the first and second powers of $l$ evenly spaced order statistics of the output data $\bx$, as considered in \citet{Fearnhead2012}, where we take $l=10$.

From this, we see that the signature-based methods tend to generate lower values in all three metrics compared to existing methods. In particular, we see that \gls{sabc} with the lag-1 delay transformation once again dominates existing methods uniformly across all three metrics, while the same transformation applied to \gls{wass} does not result in the same improvement. This demonstrates the potential power of our signature-based methods as automatic distance measures for \gls{abc} for dynamic, stochastic simulators.



\end{document}
