\documentclass[accepted]{uai2025} % for initial submission
%\documentclass[accepted]{uai2025} % after acceptance, for a revised version; 
% also before submission to see how the non-anonymous paper would look like 
                  
%% There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2025} % ptmx math instead of Computer
                                         % Modern (has noticeable issues)
% \documentclass[mathfont=newtx]{uai2025} % newtx fonts (improves upon
                                          % ptmx; less tested, no support)
% NOTE: Only keep *one* line above as appropriate, as it will be replaced
%       automatically for papers to be published. Do not make any other
%       change above this note for an accepted version.

%% Choose your variant of English; be consistent
\usepackage[american]{babel}
% \usepackage[british]{babel}

%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} % amsmath with fixes and additions
\usepackage{amssymb}
% \usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{tikz} % nice language for creating drawings and diagrams
\usepackage{algorithm}
\usepackage[noend]{algpseudocode}
\usepackage{subcaption}
%% Provided macros
% \smaller: Because the class footnote size is essentially LaTeX's \small,
%           redefining \footnotesize, we provide the original \footnotesize
%           using this macro.
%           (Use only sparingly, e.g., in drawings, as it is quite small.)

%% Self-defined macros
\newcommand{\swap}[3][-]{#3#1#2} % just an example
\newcommand\Tstrut{\rule{0pt}{2.6ex}}         % = `top' strut
\newcommand\Bstrut{\rule[-0.9ex]{0pt}{0pt}}   % = `bottom' strut
\newcommand*\rot{\rotatebox{90}}
\NewDocumentCommand{\anote}{}{\makebox[0pt][l]{$^*$}}

\title{Offline Changepoint Detection With Gaussian Processes}

% The standard author block has changed for UAI 2025 to provide
% more space for long author lists and allow for complex affiliations
%
% All author information is automatically removed by the class for the
% anonymous submission version of your paper, so you can already add your
% information below.
%
% Add authors
\author[1]{\href{mailto:<janneke.verbeek@ru.nl>?Subject=Your UAI 2025 paper}{Janneke Verbeek}{}}
\author[1]{Tom Heskes}
\author[1]{Yuliya Shapovalova}
% Add affiliations after the authors
\affil[1]{%
    Radboud University Nijmegen\\ 
    Institute for Computing and Information Sciences\\
    Nijmegen, The Netherlands
}
  
  \begin{document}
\maketitle

\begin{abstract}
%why, what, findings (interpretation)
  %Gaussian process regression (GPR) is a flexible modeling framework, yet in the context of offline changepoint detection it is relatively underexplored. 
  This work proposes Segmenting changepoint Gaussian process regression (SegCPGP), an offline changepoint detection method that integrates Gaussian process regression with the changepoint kernel, the likelihood ratio test and binary search. We use the spectral mixture kernel to detect various types of changes without prior knowledge of their type.
  SegCPGP outperforms state-of-the-art methods when detecting various change types in synthetic datasets; in real world changepoint detection datasets, it performs on par with its competitors. While its hypothesis test shows slight miscalibration, we find SegCPGP remains reasonably reliable.
\end{abstract}

\section{Introduction}\label{sec:intro}
Changepoint detection (CPD) refers to the problem of finding and characterizing changes in data generating processes, such as changes in the mean, variance, trend, periodicity, or other properties of the data.
Applications of changepoint detection algorithms include climate data \citep{reeves2007review}, quality control \citep{lai1995sequential}, EEG analysis, network analysis \citep{tartakovsky2012efficient} and finance \citep{andreou2009structural}.

Changepoint detection is an extensively studied problem \citep{truong2020selective, aminikhanghahi2017survey, reeves2007review, van2020evaluation, aue2013structural}; available methods can be divided into \textit{online} methods, which detect changepoints as new data arrives, and \textit{offline} methods, which analyze the entire dataset at once to identify changepoints.

In online changepoint detection, CPD algorithms need to be efficient enough to process a potentially never-ending stream of data. Processing only one window of data at a time is a common strategy for these algorithms \citep{keogh2001online, chen2022high}. For example, a popular Bayesian online changepoint detection (BOCPD) method estimates the run length, which represents the number of time steps since the last changepoint and essentially dynamically detects shifts in the data as new observations arrive \citep{adams2007bayesian}. 
%Bayesian online changepoint detection (BOCPD) models run length or time between changepoints \citep{adams2007bayesian}. 
Several variations on BOCPD (for instance, robust versions \citep{altamirano2023robust, knoblauch2018doubly} and model selection \citep{knoblauch2018spatio}) have been proposed as extensions.

Numerous offline changepoint detection methods exist \citep{killick2012optimal, auger1989algorithms, haynes2017computationally, zou2014nonparametric, celisse2018new} --- for a comprehensive overview, see \citet{truong2020selective}. An offline method of particular interest to this paper is binary segmentation \citep{scott1974cluster, vostrikova1981detecting}, which recursively partitions the signal by selecting split points that optimize a specific metric, such as likelihood or information criterion. Some variations of this algorithm exist \citep{fryzlewicz2014wild, olshen2004circular}.

% Specific methods include  \cite{killick2012optimal, auger1989algorithms}.
% Nonparametric offline methods are also available  \citep{haynes2017computationally, zou2014nonparametric, celisse2018new}. 

% Evaluation of CPD algorithms
Many CPD methods are designed for specific changes (e.g., detecting mean or variance shifts in time series). Gaussian processes (GPs) provide a flexible framework where different types of changes may be incorporated at the same time. CPD methods based on GPs have been widely studied in the online setting \citep{caldarelli2022adaptive, garnett2009sequential, saatcci2010gaussian}, but their application in the offline setting remains underexplored. In offline methods where GPs are used, the focus has primarily been on detecting mean shifts \citep{keshavarz2018optimal, lebarbier2005detecting}.

A Gaussian process is fully determined by its mean and covariance function, also known as the \textit{kernel}, making their selection a crucial step in its application. The choice of kernel reflects prior beliefs about the types of functions the GP should model. In the context of CPD, this is especially important when little is known about the data or the nature of the changes to be detected. Thus, the selection of a suitable kernel may prove crucial to the overall performance of a GP-based CPD method. 

Consequently, our research aims to answer the question: can we devise an offline, Gaussian process based changepoint detection method without the need to devote much attention to kernel selection? 
In the next section, we will proceed with a more detailed discussion of available Gaussian process-based changepoint detection methods.

\section{Related Work} \label{sec:relwork}
Gaussian processes (GPs) are flexible, nonparametric models that are capable of modeling spatiotemporal correlations. 
GPs have found ample application in changepoint detection, particularly in the online setting.
GPTS-CP \citep{saatcci2010gaussian} models temporal correlations in the BOCPD framework, using GPs as an underlying predictive model. However, the BOCPD framework can be highly sensitive to the choice of hyperparameters, which can hinder its performance in real-world settings.

As an alternative to BOCPD, Adaptive Gaussian process change point detection (ADAGA) \citep{caldarelli2022adaptive} is an online changepoint detection method based on statistical hypothesis testing. ADAGA detects changepoints via a window sliding method and tests whether the function values in the subwindow come from the same observational model as the rest of the window. The authors derive theoretical bounds for the probability of Type I and Type II errors in their changepoint detection heuristic. Nevertheless, ADAGA still relies on a prior specification of the kernel for different types of changes.

\citet{garnett2009sequential} exploited the kernel structure of GPs for CPD in the online setting, inspired by work on general linear models \citep{ruanaidh1994recursive}. By using block-diagonal covariance matrices, their approach captures abrupt transitions between regimes governed by different kernels. In this case, the location of the changepoint can then be treated as a kernel parameter. 
In contrast, the changepoint kernel \citep{lloyd2014automatic} parametrizes changepoints via steepness as well as location. This kernel has been proposed in an automatic statistician-type of framework for modeling complex time series behavior, but, to our knowledge, has not been explored in the context of CPD.

The likelihood ratio test has been used in the context of CPD more frequently, for instance in \citet{caldarelli2022adaptive}. For a general overview, see \citet{aminikhanghahi2017survey, truong2020selective}. 

\paragraph{Contributions}
We propose SegCPGP, a flexible offline changepoint detection method based on Gaussian processes that makes no assumptions about the type or nature of changes that might occur in the data. SegCPGP builds upon several components. First, we utilize the changepoint kernel, allowing for both steep and smooth transitions. Second, we use the likelihood ratio test with binary segmentation \citep{scott1974cluster} for sequential detection of multiple changepoints in the data. Finally, we propose incorporating the spectral mixture kernel \citep{wilson2013gaussian} within the changepoint kernel framework, allowing for flexibility beyond mean/variance changes and eliminating the need to specify the nature of changes a priori.
Code for SegCPGP is publicly available\footnote{\url{https://github.com/JVerbeek/segcpgp/}}.

\section{Background}
This section is structured as follows. We begin with an overview of Gaussian processes and Gaussian process regression. Next, we introduce two specific kernels that form the basis of our approach: the spectral mixture kernel, which aims to alleviate the challenge of kernel selection, and the changepoint kernel. 

\subsection{Gaussian Processes}
\paragraph{Gaussian Process (GP)}
A Gaussian process is a collection of random variables, any finite subset of which has a multivariate Gaussian distribution (for an extensive introduction, see \citet{williams2006gaussian}). It is fully defined by its mean function $\mu(t)$ and covariance function $k(t, t')$. For a finite set of input points $t = \{t_1, t_2, \ldots, t_n\}$, a Gaussian process is denoted as
\begin{align*}
f(t) \sim \text{GP}(\mu(t), k(t, t')).
\end{align*}
In this paper, without loss of generality, we assume that $\mu(t)=0$, but the proposed approach can be straightforwardly extended to specific mean functions.

\paragraph{Gaussian Process Regression (GPR)}
Gaussian process regression is a non-parametric Bayesian approach that assigns a Gaussian process prior on the functional relationship between input and output variables. A Gaussian process regression is defined as
\begin{equation}
y(t)=f(t)+\epsilon(t), 
\label{eq:gpr}
\end{equation}
where $\epsilon(t)\sim N(0, \sigma^{2}_{\epsilon})$ is Gaussian noise. Given a set of observed input-output pairs $D = \{(t_1, y_1), (t_2, y_2), \ldots, (t_n, y_n)\}$, the goal is to estimate the function $f(t)$ and make predictions for new, unseen inputs $t^*$.
%\footnote{Note, when we want to obtain in sample model fit we can simply set $t^*=t$.} 
In the case of the Gaussian likelihood, the posterior distribution over $f(t^*)$ is available in closed form \citep{williams2006gaussian}
\begin{equation}
p(f(t^*) | D) = \mathcal{N}(f(t^*) | \mu^*, \Sigma^*), 
\end{equation}
where $\mu^*$ is the predictive mean and $\Sigma^*$ is the predictive covariance. The mean and covariance of the posterior distribution are given by
\begin{equation}
\mu^* = k(t^*, t) [K + \sigma^2_\epsilon I]^{-1} y,
\label{eq:posterior_mean}
\end{equation}
\begin{equation}
\Sigma^* = k(t^*, t^*) - k(t^*, t) [K + \sigma^2_{\epsilon}I]^{-1} k(t, t^*),
\label{eq:posterior_covariance}
\end{equation}
where $k(t^*, t)$ is the covariance matrix between the test inputs $t^*$ and the training inputs $t$, $k(t^*, t^*)$ is the covariance matrix between the test inputs, $K=k(t,t)$ is the covariance matrix for the training inputs, $y$ is the vector of observed outputs, and $\sigma^2_{\epsilon}$ is the noise variance.
%where $k(t^*, t)$ is the covariance vector between the test $t^*$ and the training inputs $t$, $k(t^*, t^*)$ is the covariance scalar for $t^*$, $K$ is the covariance matrix for the training inputs, $y$ is the vector of observed outputs, and $\sigma^2_{\epsilon}$ is the noise variance.

%Adapt notation for this bit, this is sloppy but we do need to mention this.
The kernel hyperparameters of the GP prior and the variance of the noise, denoted together by $\theta$, are inferred by maximizing the marginal log-likelihood, given by
\begin{align}
	\log p(y|t, \theta) ={}& -\frac{1}{2}y^{T}[K + \sigma^2_\epsilon I]^{-1}y \nonumber \\
    & - \frac{1}{2}\log |K + \sigma^2_\epsilon I|- \frac{n}{2}\log 2\pi. \label{eq:log-likelihood}
\end{align}

% Adapt
Equations~\eqref{eq:gpr} through \eqref{eq:log-likelihood} represent the mathematical formulation of Gaussian Process Regression, allowing for the estimation of the posterior distribution over the function values and providing predictions with associated uncertainties.
 
% In this paper, we focus on the Gaussian process regression with periodic and linear trends with the motivation of applying the approach to climate data, such as natural gas emissions (e.g. ethane, CO2). However, the approach can easily be extended to other kernel specifications and other time series applications. 
% Commonly used covariance functions include the squared exponential (Gaussian) kernel, Mat\'{e}rn kernel, and rational quadratic kernel, among others.

\subsection{Spectral Mixture Kernel} % fix notation and clarify
In Equations~\eqref{eq:posterior_mean}-\eqref{eq:posterior_covariance}, the covariance function, also known as a kernel, plays a crucial role in modeling the similarity or correlation between different inputs. The structural form of the kernel directly determines which kinds of functions can be drawn from a Gaussian process prior \citep{williams2006gaussian}. Notable examples of kernel functions include: the squared exponential kernel, for modeling smooth functions without discontinuities or abrupt changes; the Mat\'ern family kernels, which allow for modeling some degree of roughness or discontinuities; and the periodic kernel, which allows for modeling repeating patterns in time series, such as seasonality. For an illustration of these kernel functions, see Appendix~\ref{apx:kernels}.

If there is no prior knowledge about the most suitable kernel function for a given task, the appropriate structural form can be determined through kernel search \citep{duvenaud2013structure} and kernel learning \citep{bach2008exploring}. Kernel search involves exploring the space of possible kernels, which can be computationally expensive. In contrast, kernel learning offers a more efficient alternative, potentially reducing the computational complexity of GPR from cubic to linear \citep{wilson2016deep}. Kernel learning in the context of changepoint detection, however, would likely require training data with labeled changepoints, which may not always be available.

Kernel selection may be sidestepped by using kernels that are sufficiently expressive, such as the spectral mixture (SM) kernel \citep{wilson2013gaussian}. The SM kernel can in theory approximate any stationary covariance kernel as a mixture of Gaussians in the frequency domain. We will further discuss this kernel in the remainder of the section and later apply it in the context of the changepoint detection problem.

According to Bochner's theorem, any stationary covariance function $k(\cdot)$ can be expressed as an integral of the form
\begin{equation}
    k(\tau) = \int_{\mathbb{R}^P} e^{2\pi i s^\top\tau} \psi(ds),
\end{equation}
where we use $\tau = t - t'$ as a notational shorthand similarly to \citet{wilson2013gaussian} and $\psi$ is a positive finite measure.
If $\psi(ds)$ has a spectral density $S(s)$, then $k(\tau)$ and $S(s)$ are Fourier duals
\begin{align*}
k(\tau) &= \int_{\mathbb{R}^P}S(s) e^{2\pi i s^\top\tau} ds, \\
S(s) &= \int_{\mathbb{R}^P}k(\tau)e^{-2\pi i s^\top\tau}d\tau.
\end{align*}

% Extend GMM description.
The spectral density $S(s)$ can be approximated via a Gaussian mixture model (GMM). A GMM models the data as a mixture of $Q$ Gaussian densities with means $\mu_1, \dots, \mu_Q$ and variances $\sigma^2_1, \dots, \sigma^2_Q$ so that $k(\tau)$ has the form 
\begin{equation}
k(\tau) = \sum_{q=1}^{Q} w_q \exp(-2\pi^2\tau^2\sigma^2_q)\cos(2\pi\tau\mu_q).
\label{eqn:spectral}
\end{equation}

The weights $w_q$ specify the relative contribution of each component, and do not necessarily sum to 1 as in a GMM.
For a single Gaussian component, the mean $\mu$ can be interpreted as the frequency captured by the component. 
The inverse of the standard deviation $\sigma$ represents the lengthscale, which determines how smooth or wiggly the function is. A large lengthscale for a spectral mixture component leads to functions that are almost constant, while a small lengthscale may result in a more periodic function. 
Note that since $\tau$ is the difference between $t$ and $t'$, the quantity $-2\pi^2\tau^2\sigma^2$ corresponds to a squared Euclidean norm scaled by lengthscale. 
Provided enough Gaussian mixture components are used, any stationary covariance function can be approximated in this way \citep{wilson2013gaussian}.

To the best of our knowledge, the spectral mixture kernel has not been used in the context of changepoint detection. 
Due to its versatility, we apply the spectral mixture kernel in the context of multiple changepoint detection to detect different types of changepoints with a single kernel. 
Kernel selection is therefore largely bypassed. We use the SM kernel implementation of \citet{leeftink2020spectral}. To initialize the SM kernel hyperparameters, a Lomb-Scargle periodogram is used to approximate the empirical spectrum (as in \citet{leeftink2020spectral}); subsequently, a GMM is fit to this spectrum.

\begin{figure}[ht]
    \centering
\includegraphics[width=1\linewidth]{steepness.png}
    \caption{Left: kernel structure of the changepoint kernel with periodic base kernels for three changepoints at locations 25, 50 and 75. The steepness of the changes is 0.3, 1 and 10 at index 25, 50 and 75, respectively. Right: two samples from the kernel displayed on the left. }
    \label{fig:steepness}
\end{figure}
 
\subsection{Changepoint Kernel}
The changepoint (CP) kernel was first proposed by \citet{lloyd2014automatic} in an automatic statistician-type framework. The kernel specifies a structural change in a signal, in particular a (possibly smooth) transition between two base kernels. Below, we give a definition of the CP kernel.

Let $k_{1}(t,t')$ and $k_{2}(t,t')$ be base kernels (such as RBF/linear/local periodic/spectral mixture). Then the change point kernel is defined as 
\begin{equation}
    \text{cov}(f(t), f(t')) = k_{1}(t,t')\bar{\psi}(t,t')+k_{2}(t,t')\psi(t,t'),
    \label{eq:cp_kernel}
\end{equation}
where $\psi(t,t')=\psi(t)\psi(t')$ and $\bar{\psi}(t,t')=(1-\psi(t))(1-\psi(t'))$. The sigmoid $\psi(t)$ is parametrized by the location $(t_{0})$ and steepness $(s)$ parameters,
$\psi(t) = 1/(1+\exp(-s(t - t_{0})))$.
Besides inferring kernel parameters such as variance, period or lengthscale defined previously, we can also infer the location of the change point $t_{0}$ and steepness of the change $s$. 

As an example, Figure~\ref{fig:steepness} shows a changepoint kernel with changepoints at several locations, as well as two samples from that kernel. The steepness of each of these changepoints is different, leading to smoother or more abrupt transitions from regime to regime.

\section{Methodology}\label{sec:math}
In this section, we define an algorithm based on Gaussian process regression with change point kernel and binary search to detect multiple changepoints. 

\subsection{Model Selection} \label{sec:model-selection}
To determine whether a dataset contains a changepoint, we propose to compare two models with a likelihood-ratio test (LRT): a GPR with a single kernel and one with a changepoint kernel.

%If a Gaussian process regression is equipped with the changepoint kernel, a natural question is whether it models data better than a GPR without the changepoint kernel. In order to make this decision, we compare these models with a likelihood ratio test (LRT). 

Let the Gaussian process regression with a single kernel be the \textit{single GPR}
\begin{align*}
    y(t) &= f(t) + \epsilon(t), \\
    f(t) &\sim \textnormal{GP}(0, k(t, t')).
\end{align*}
Furthermore, let the Gaussian process regression that employs the changepoint kernel be the \textit{changepoint GPR}
\begin{align*}
    y(t) &= f(t) + \epsilon(t), \\
    f(t) &\sim \textnormal{GP}(0, k_1(t, t') \bar \psi(t, t') + k_2(t, t') \psi(t, t')),
\end{align*}
where the Gaussian process is defined by the change point kernel as in~\eqref{eq:cp_kernel}.
The (log-)likelihoods for both these models can be computed using~\eqref{eq:log-likelihood}.

% LRT version
The likelihood ratio test statistic $\mathcal{R}$ is given by
\begin{equation}
\mathcal{R} = -2 (\log{p(y|t, \theta_0)} - \log{p(y|t, \theta_1)}), 
\end{equation}
where $\theta_0$ are hyperparameters of the \textit{single GPR} model and $\theta_1$ are hyperparameters of the \textit{changepoint GPR}. If the models are composite (or nested) --- that is, the parameter space of the null model is in the interior of the parameter space of the alternative model --- then in theory $\mathcal{R}$ follows a $\chi^2_d$-distribution under the null hypothesis, where $d$ is the difference in dimensionality between the two models \citep{wilks1938large}. The $p$-value is then obtained as the probability mass of the $\chi^2_d$-distribution above $\mathcal{R}$, i.e., its upper-tail probability.

We are interested in applying the likelihood ratio test between the single and changepoint GPR; we thus need to reduce the alternative model to the null model.
Placing the constraint $s=\infty$ on the steepness parameter of the changepoint kernel with base kernels $k_1(t, t')$ and $k_2(t, t')$, reduces the changepoint kernel to $k_1(t, t')$ or $k_2(t, t')$ respectively (for a detailed elaboration, see Appendix \ref{apx:steepness}). Thus, when we set the single GPR's kernel equal to $k_1$ or $k_2$, we arrive at the desired model selection. 
%Alternatively, when $s=0$, the changepoint kernel becomes $\text{cov}(f(t), f(t')) = \frac{1}{4} k_1(t, t') + \frac{1}{4} k_2(t, t')$.  
%Starting from the alternative model, there are thus multiple ways to arrive at a null model.  

Setting $s = \infty$ means that the null and alternative models are no longer composite. Since $\infty$ lies on the boundary of the admissible values for $s$, the null model does not lie in the interior of the alternative model's parameter space. Therefore, in practice, the distribution of $\mathcal{R}$ may (slightly) deviate from $\chi^2_d$. We will further discuss this in the Experiments section.

\subsection{SegCPGP}
The model selection described in Section~\ref{sec:model-selection} can be used to detect single changepoints. In real applications, it is often desirable to detect multiple changepoints. The changepoint kernel can be extended to support multiple changepoints (see \eqref{eq:multiple-cp}). In optimization, however, the changepoint locations would then need to be constrained such that each changepoint location parameter estimates a unique location. Consequently, we combine the detection of single changepoints with a sequential search strategy to detect multiple changepoints.

We propose segmenting changepoint Gaussian process regression (SegCPGP) to estimate multiple changepoints at unknown locations. SegCPGP combines binary search with a changepoint GPR that estimates a changepoint at a single location.
 
SegCPGP estimates changepoints sequentially. The procedure is first run on the whole time series to identify a potential changepoint. If a changepoint is found, the time series is divided at that point. The method then repeats this process on each resulting subwindow.
Two GPRs --- the changepoint GPR and the single GPR, as defined in the previous section --- are fit on the full signal by optimizing the log marginal likelihood (LML). Any valid kernel function can be used as base kernels ($k_1(t ,t')$ or $k_2(t, t')$) in the change point kernel, and may be selected to reflect prior beliefs about the change type. We evaluate standard kernel choices as well as the SM kernel that could be adopted in situations when there are no prior beliefs about the types of changes. 

The likelihood in \eqref{eq:log-likelihood} is known to suffer from multiple local optima \citep{williams2006gaussian}. Thus, we apply the standard GPR practice of restarting the optimization multiple times before selecting the highest likelihood model. 

The single and changepoint GPR are compared via the likelihood ratio statistic described in the previous section. As the null distribution, we use the $\chi^2_d$ distribution, setting $d$ equal to the difference in the dimensionality between the single and changepoint GPR. The value of $d$ depends on the number of kernel hyperparameters in $k_1$ and/or $k_2$. Unless otherwise specified, we set the significance level of the LRT to $\alpha=0.1$.

% Binary search
The changepoint detection procedure is sequential.
If the LRT is significant, the value of the changepoint kernel's location parameter is the estimated changepoint, which we denote by $\hat t$. Since $\hat t$ is estimated as a continuous parameter, it is rounded to the nearest integer. Then, the signal is split into two segments.
To avoid detecting the same changepoint multiple times, we remove a margin $\epsilon$ of the signal in the neighborhood of the detected changepoint. 
For a detected changepoint $\hat t$, the signal is therefore split at $\hat t + \epsilon$ and $\hat t - \epsilon$, where we set $\epsilon$ to 5 timesteps in practice. 
The changepoint search stops when $\hat t$ is outside the domain of the signal, when the LRT is not significant, or when only a single time step is left in the signal.

Pseudocode for the above procedure can be found in Algorithm \ref{alg:segcpgp}. 

% Binseg <> SegCPGP.
\section{Experiments}\label{sec:experiments}
Here, we demonstrate the performance of SegCPGP on synthetic and real-world datasets and compare it against several baseline algorithms. We provide an empirical analysis of SegCPGP's and ADAGA's Type I and Type II error rates. 

\paragraph{Evaluation}
Results are reported in terms of the modified $F_1$-score, a commonly used metric in changepoint detection \citep{caldarelli2022adaptive, killick2012optimal, van2020evaluation}. A detailed description of the $F_1$-score is provided in Appendix~\ref{apx:f1}.
An estimated changepoint is considered a true positive (TP) if it falls within a small margin around the true change point. A false positive (FP) is then any estimated changepoint outside of these margins, while a false negative (FN) is any missed changepoint within these margins and a true negative (TN) is the correctly identified absence of a changepoint.
Setting the margin around the true changepoint to 0 skews the accuracy of classification metrics, since changepoints are only a small subset of the total number of datapoints. Thus, the margin is often set to 5 time steps in practice \citep{caldarelli2022adaptive, killick2012optimal, van2020evaluation}. 

Differences in performance between methods in the benchmark dataset of \citet{van2020evaluation} are tested via a Wilcoxon signed-rank test. When ranking two algorithms, one with performance $P$, another with performance $Q$, the null hypothesis of the Wilcoxon signed-rank test is that the distribution $F$ of the differences in performance $F(P - Q)$ is symmetric around $0$, or equivalently, that  $F(Q - P) = F(P - Q)$, meaning the algorithms are effectively interchangeable. 
% NOTE(review): the 10% level stated here differs from the "$p=0.05$" quoted in the table captions; confirm which level was actually used and make them agree.
The Wilcoxon signed-rank test is appropriate for evaluating the pairwise differences between algorithms in our experiments \citep{benavoli2016should, van2020evaluation}. We set the significance level of the test to 10\% (i.e., $\alpha = 0.1$). In order to correct for multiple testing, we apply a Holm correction \citep{demvsar2006statistical}.

\paragraph{Baseline methods}
We use a subset of the methods available in the Turing changepoint detection benchmark of \citet{van2020evaluation} in our experimental evaluation.  In particular, we include the following commonly used methods: BinSeg \citep{scott1974cluster}, PELT \citep{killick2012optimal}, BOCPD \citep{adams2007bayesian} and RBOCPDMS \citep{knoblauch2018spatio}. Additionally, we incorporate kernel-based and Gaussian process-based methods in our comparison, namely KCPA \citep{harchaoui2009regularized} and ADAGA \citep{caldarelli2022adaptive}, as well as nonparametric methods, namely CPNP \citep{haynes2017computationally} and ECP \citep{matteson2014nonparametric}. For these algorithms, their default initializations are used, which corresponds to applying the algorithms without prior knowledge of what reasonable hyperparameter settings might be. This experimental setting is also adopted, and was described as being the most realistic, in \citet{van2020evaluation,caldarelli2022adaptive}.
A ZERO method is included in the evaluation, which corresponds to a method that by definition finds no changepoints. 

We evaluate SegCPGP with four different base kernels: a spectral mixture kernel with 4 mixture components (SegCPGP-SM4), a Mat\'ern kernel with smoothness 5/2 (SegCPGP-Mat52), a squared exponential kernel (SegCPGP-RBF), and a linear kernel (SegCPGP-Lin). The number of mixture components was selected such that the spectral mixture kernel is sufficiently expressive. The other Gaussian process based method, ADAGA, is combined with these same kernels, except for the spectral mixture kernel, due to software version incompatibilities.

\paragraph{Synthetic Data}

\begin{table*}[ht]
    \centering 
    \caption{$F_1$ score per method for synthetic datasets, grouped by change type. Each $F_1$ score is the method's average over that change category, across 10 datasets. Methods that do not perform differently from the best-performing method (\textbf{bold}) according to a Holm-corrected Wilcoxon signed-rank ($p=0.05$) test are indicated with an $^*$. SegCPGP-SM4 performs well overall. Most methods, including the ZERO method, perform equally well to the best performing method on the mean change datasets.}
\resizebox{\textwidth}{!}{
\input{uai2025-template/synth-table}}
    \label{tab:3cp-comparison}
\end{table*}

We evaluate the performance of several CPD methods on mean, variance, periodicity, and trend changes.

By combining the changepoint kernel with various base kernels, we can create changepoint datasets with predefined change locations and transition steepness. Trend and periodicity change datasets are not generated with the changepoint kernel.
 
For each change category, ten 400-point datasets are generated, each containing three change points at index 100, 200 and 300.  
Section \ref{apx:synth-gen} of the appendix provides the exact generative parameters of each of the datasets and examples for each of the change categories.

Table \ref{tab:3cp-comparison} shows $F_1$ scores for a variety of changepoint detection methods for mean, variance, trend and periodicity changes, as well as each method's average. SegCPGP, when combined with the 4-component spectral mixture kernel, achieves particularly strong overall performance. In particular, it is the best performing method for the trend and periodicity change category, although the performance of SegCPGP with the Matern52 or RBF kernels is not significantly different.  
In the mean and variance change categories, multiple methods perform on par with the best-performing methods (ECP for mean changes, BOCPD for variance changes) according to the Wilcoxon signed-rank test. For mean changes, SegCPGP with linear or RBF base kernels performs on par with the best-performing method; the SM4 kernel also leads to good results, but does perform significantly differently from ECP. 

The proposed method, SegCPGP, using either the SM4 or Matern52 kernel, demonstrates strong performance overall and across various change categories, where the spectral mixture kernel may be preferred for the trend and periodicity changes, the RBF kernel may be preferred for mean changes, and the Matern52 kernel may be preferred for variance changes. 

\paragraph{Benchmark Data}

\begin{table}[ht]
    \centering
    
    \caption{Comparison of several changepoint detection methods on benchmark datasets. For each dataset the best performing methods are highlighted in bold; the best overall mean $F_1$ score is also in bold. SegCPGP performs comparably to the best performing methods in the benchmark. Note that none of the methods performs differently from the zero method, according to a Holm-corrected Wilcoxon signed-rank test with $p=0.05$.}
\resizebox{\linewidth}{!}{
\begin{tabular}{l|rrrrr|r}
 & \rot{businv} & \rot{gdp\_argentina} & \rot{gdp\_iran} & \rot{gdp\_japan} & \rot{ozone} & \rot{mean $F_1$} \\
\midrule
adaga (Lin) & 0.630 & 0.824 & 0.713 & 0.471 & 0.966 & 0.720 \\
adaga (Matern52) & \textbf{0.723} & 0.824 & 0.800 & 0.615 & 0.966 & 0.786 \\
adaga (RBF) & 0.681 & 0.824 & 0.800 & 0.615 & 0.776 & 0.739 \\
binseg & 0.370 & 0.889 & 0.492 & 0.615 & 0.650 & 0.603 \\
bocpd & 0.270 & \textbf{0.947} & 0.622 & 0.800 & 0.650 & 0.715 \\
cpnp & 0.304 & 0.818 & 0.330 & 0.667 & 0.750 & 0.574 \\
ecp & 0.301 & 0.824 & 0.652 & \textbf{0.889} & 0.723 & 0.697 \\
kcpa & 0.047 & 0.131 & 0.219 & 0.068 & 0.109 & 0.121 \\
pelt & 0.370 & 0.889 & 0.492 & 0.615 & \textbf{1.000} & 0.673 \\
segcpgp (SM4) & 0.370 &\textbf{ 0.947} & \textbf{0.868} & 0.800 & 0.966 & \textbf{0.790} \\
segcpgp (Lin)& 0.588 & 0.824 & 0.652 & \textbf{0.889} & 0.966 & 0.784 \\
segcpgp (Matern52) & 0.559 & 0.824 & 0.589 & \textbf{0.889} & 0.651 & 0.702 \\
segcpgp (RBF) & 0.588 & 0.824 & 0.673 & \textbf{0.889} & 0.750 & 0.745 \\
zero & 0.588 & 0.824 & 0.652 & \textbf{0.889} & 0.723 & 0.735 \\
\bottomrule
\end{tabular}}
    \label{tab:benchmark}
\end{table}
The performance of SegCPGP is evaluated on the Turing changepoint detection benchmark datasets \citep{van2020evaluation}.  The datasets are annotated by multiple experts.
We incorporate the same datasets as in \cite{caldarelli2022adaptive}: Business Inventories (businv), Ozone (ozone), and GDPs of Japan, Iran and Argentina (gdp\_argentina, gdp\_iran, gdp\_japan). We omit the Run Log dataset, as our method does not yet support multivariate datasets. All datasets are standardized to have zero mean and unit variance. Appendix \ref{apx:benchmark} provides more extensive descriptions of the benchmark datasets. 

Table \ref{tab:benchmark} displays the $F_1$-score for various CPD algorithms on each of the benchmark datasets, as well as each method's average $F_1$-score. SegCPGP-SM4 obtains the highest $F_1$ score on GDP Iran. For the GDP Argentina dataset, SegCPGP-SM4 performs on par with BOCPD, obtaining the highest scores. PELT detects the changepoints perfectly on the ozone dataset, while SegCPGP-Lin and SegCPGP-SM4, as well as ADAGA-Lin and ADAGA-Mat52, achieve the second best score of 0.966.
On both the Business Inventories and GDP Japan dataset, SegCPGP does not outperform the zero method. For the Business Inventories dataset, only the ADAGA-based methods outperform the ZERO method; for the GDP Japan dataset, no method outscores the ZERO method. 

On average, SegCPGP-SM4 obtains the highest $F_1$ score in absolute terms (0.790), closely followed by ADAGA-Matern52 (0.786) and SegCPGP-Lin (0.784), highlighting the utility of Gaussian process-based changepoint detection methods. 




\paragraph{Calibration} \label{sec:emp-analysis}
In this section we provide an empirical analysis of the FPR and FNR for ADAGA and SegCPGP. We generate 3000 random mean-change datasets.
On each dataset, ADAGA is fit with $\delta=0.3$ and $\delta=0.6$.
SegCPGP is fit with $p=0.05$ and $p=0.1$. For each fit and each method the number of true positives (TPs), false positives (FPs), true negatives (TNs) and false negatives (FNs) are computed; we again consider changepoints estimated within 5 time steps of a true changepoint to be TPs. We elaborate on the computation of TPs, FPs, TNs and FNs, as well as the FNR and FPR in Appendix \ref{apx:fprnfnr}. 

As a heuristic to detect changepoints, ADAGA \citep{caldarelli2022adaptive} uses a likelihood ratio test (LRT) at the window level: A Gaussian process regression is fit locally, on a data window $\mathcal{W}$, and another GPR is fit on a subwindow $\mathcal{S}$. If the fit on $\mathcal{W}$ differs from the fit on $\mathcal{S}$ according to the LRT, a changepoint is detected. The threshold $\delta$ for the likelihood ratio statistic is chosen such that the probability of a Type I and Type II error on the window level is at most $\delta$. Since the null and alternative hypotheses are defined for GPRs fit on the window and sub-window only, the bounds derived in \citet{caldarelli2022adaptive} may not hold for the entire signal. 

Table \ref{tab:ee_rates} shows the empirical FNR and FPR for SegCPGP and ADAGA computed across the 3000 random mean-change datasets, with various parameter settings for their hypothesis tests.

The FPR does seem to be bounded by $\delta$. When $\delta=0.3$ the FNR is $0.984$ ($> 0.3$), while when $\delta=0.6$ the FNR (0.245) indeed falls within the bounds. When looking to bound the false positive rate, ADAGA could be a good method to use, but the results for $\delta=0.3$ suggest that ADAGA is sensitive to the setting of the $\delta$ parameter: changes in $\delta$ have a large effect on the trade-off between the FNR and the FPR.

For $p=0.05$, the empirical FPR for SegCPGP is 0.073 while the FNR is $0.258$; for $p=0.1$, the empirical FPR is 0.095 and the FNR is $0.273$. Thus, SegCPGP's empirical FPR closely approximates its nominal significance level $p$. SegCPGP's FNR appears to be less sensitive to changes in its significance level $p$.
Thus, we can conclude SegCPGP is slightly miscalibrated, but remains reasonably reliable.

We further investigate the miscalibration by approximating the distribution of the LR statistic. 
Recall from Section~\ref{sec:model-selection} that the distribution of the LR statistic may not be $\chi^2_d$ under the null hypothesis. We generate 3900 noisy datasets without a changepoint---that is, under the null hypothesis---from a GP with an RBF kernel. Then, we fit a changepoint Gaussian process regression to these data and collect the corresponding LR statistics. 

Figure~\ref{fig:qq-plot} shows a quantile-quantile (Q-Q) plot of the empirical null distribution versus the $\chi^2_d$ distribution, for SegCPGP with RBF base kernels (left) and SegCPGP with 4-component spectral mixture base kernels (right). The Q-Q plot is slightly shifted from the diagonal (in blue), indicating that the empirical distribution has more degrees of freedom than the $\chi^2_d$-distribution. 
To test whether the samples from the empirical null distribution are drawn from a $\chi^2_d$-distribution, we apply a Kolmogorov--Smirnov test: for both kernels, the null hypothesis that the sample comes from the $\chi^2_d$-distribution is firmly rejected ($p<10^{-60}$ for both kernels). The empirical null distribution closely matches a $\chi^2_d$-distribution with a higher number of degrees of freedom. 
%Although this confirms the slight miscalibration, as previously shown in Table~\ref{tab:ee_rates} the discrepancies in the desired and empirical FNR and FPR are quite small.  


\begin{table}[ht]
    \centering
    \caption{Empirical FPR and FNR over 3000 samples of random mean-change datasets, for ADAGA and SegCPGP. In the ADAGA hypothesis test, the probability of Type I/II errors should be at most $\delta$. }
    \begin{tabular}{c|c|c}
       Model & FNR & FPR \\ \hline
       ADAGA, $\delta=0.3$  & 0.984 & 0.002 \\
       ADAGA, $\delta=0.6$  & 0.245 & 0.597 \\
       SegCPGP, $p=0.05$ & 0.258 & 0.073 \\ 
       SegCPGP, $p=0.10$ & 0.273 & 0.095 \\
       %SegCPGP (adjusted $\chi^2$), $p=0.05$  & 0.306 & 0.032 \\
       %SegCPGP (adjusted $\chi^2$), $p=0.10$ & 0.265 & 0.042 \\
    \end{tabular}
    \label{tab:ee_rates}
\end{table}

\begin{figure}
    \centering
\includegraphics[width=1\linewidth]{chisq.png}
    \caption{Quantile-quantile plot of the empirical null distribution versus the $\chi^2_d$ distribution, for changepoint GPRs with an RBF kernel (left) and a 4-component spectral mixture kernel, SM4 (right). If the empirical null distribution and the $\chi^2_d$-distribution are equal, the black scatter should lie along the blue line; here, that is not the case.}
    \label{fig:qq-plot}
\end{figure}
\section{Discussion}
\paragraph{Limitations and possible extensions}
The cubic computational complexity of GPs may be a limiting factor in large-scale CPD problems. Future extensions could address this by incorporating a sparse GP implementation of the proposed framework, for instance, by parameterizing the covariance using locations of pseudoinputs \citep{snelson2005sparse}. In this case, the evidence lower bound (ELBO) could be naturally used for model selection.  
Additionally, an extension of SegCPGP could involve a variational implementation of GPR \citep{hensman2015scalable}, enabling the modeling of data with different likelihoods, thereby further increasing the versatility of the proposed approach.
Furthermore, expanding SegCPGP to multivariate data, for instance via multi-output GPs, would broaden the method's applicability.

Uncertainty quantification over the location and/or number of change points could be another worthwhile extension. Using Markov chain Monte Carlo (MCMC) methods, one can obtain a distribution over the changepoint locations, similar to \cite{green1995reversible}. However, this would require deriving the posterior over changepoint locations, as well as an efficient implementation of MCMC. 

For broader applications it may be desirable to devise an automated procedure for the selection of the number of components in the spectral mixture kernel. A similar problem is considered in Gaussian Mixture Models (GMMs), which could inspire the model selection procedure for the spectral mixture kernel. For example, model compression \citep{chen2024compressing} or a variational solution as proposed by \citet{corduneanu2001variational} could be applied to mitigate overfitting.

Finally, while most online methods can be directly applied in the offline setting, the reverse is not true; adapting SegCPGP's hypothesis testing procedure into an online method might thus be another research direction.

\paragraph{Benchmark annotations}
When testing the pairwise differences between the methods in Table \ref{tab:benchmark} with a Wilcoxon signed-rank test, we found that none of the methods perform significantly differently from the ZERO method. The \textit{Default} experiment of \citet{van2020evaluation} shows a similar result, explaining that this may either be due to the small number of changepoints as compared to the total number of datapoints or be due to each method detecting a large number of false positives. 
Upon closer examination, we found that none of the expert annotators performs differently from the ZERO method (Appendix \ref{apx:table-2}). 

The likely culprit is the inclusion of $t=1$ as a trivial changepoint for all annotators as well as the predictions, which skews the $F_1$-score upwards.
We discuss this in more detail and provide an example in Appendix \ref{apx:trivial}. 

Overall, while the benchmark is certainly useful for comparing changepoint detection methods amongst themselves, we recommend excluding the ZERO method from evaluation or conducting further research to establish metrics that are less beneficial to the ZERO method.
 
\paragraph{Calibration}
Addressing the slight miscalibration requires estimating the null distribution separately for each kernel, but the computational cost may be too high and this estimation is beyond the scope of this work. Alternatively, deriving new statistics, as in finite mixture models \citep{fruhwirth2006finite, chen2004testing}, or approximating the null distribution via Monte Carlo methods \citep{wolfe1971monte, hogg1956distribution} could be promising directions for future research in SegCPGP.


%To address the slight miscalibration, the null distribution would need to be estimated separately for each type of kernel. However, due to the significant computational cost involved, this estimation is beyond the scope of this work. An alternative approach could involve deriving new statistics, as is done in finite mixture models, which face similar challenges to SegCPGP. A lot of work has been devoted to deriving alternative statistics or approximating the null distribution via Monte Carlo methods \citep{wolfe1971monte, hogg1956distribution}. These approaches may offer useful directions for future research in SegCPGP.

\section{Conclusion}
In this work, we introduced SegCPGP, a flexible framework for changepoint detection in the offline setting, based on Gaussian process regression (GPR). We showed that SegCPGP can detect a wide range of changes without requiring prior knowledge of their types. We tested the algorithm with various kernels and compared its performance on simulated and benchmark data sets to state-of-the-art methods. We found that SegCPGP provides better overall performance on simulated data and comparable performance on benchmark data sets.


%SegCPGP is built on a combination of GPR with changepoint kernel, the likelihood ratio test and binary segmentation. SegCPGP can detect a variety of changes, especially when using the Mat\'ern 5/2 and spectral mixture kernel as base kernels. On simulated data, it outperforms its competitors, and on benchmark datasets, it achieves performance comparable to that of other methods. Although its hypothesis test shows minor miscalibration, SegCPGP proves to be a reliable method, with its empirical false positive rate closely matching its significance level. These results highlight SegCPGP's effectiveness as a flexible approach for changepoint detection.

% \begin{contributions} % will be removed in pdf for initial submission 
% 					  % (without ‘accepted’ option in \documentclass)
%                       % so you can already fill it to test with the
%                       % ‘accepted’ class option
%     Briefly list author contributions. 
%     This is a nice way of making clear who did what and to give proper credit.
%     This section is optional.

%     H.~Q.~Bovik conceived the idea and wrote the paper.
%     Coauthor One created the code.
%     Coauthor Two created the figures.
% \end{contributions}

\begin{acknowledgements}
We thank the anonymous reviewers for their helpful feedback and suggestions.
\end{acknowledgements}

% References
\bibliography{uai2025-template}

\newpage

\onecolumn

\title{Revisiting Gaussian Processes For Changepoint Detection (Supplementary Material)}
\maketitle



\appendix

\section{Kernel Types} \label{apx:kernels}
\begin{figure}[ht]
    \centering
    \begin{subfigure}[b]{0.3\textwidth}
        \centering
        \includegraphics[width=\textwidth]{rbf.png}
        \caption{}
    \end{subfigure}
    \hfill
    \begin{subfigure}[b]{0.3\textwidth}
        \centering
        \includegraphics[width=\textwidth]{matern.png}
        \caption{}
    \end{subfigure}
    \hfill
    \begin{subfigure}[b]{0.3\textwidth}
        \centering
        \includegraphics[width=\textwidth]{linear.png}
        \caption{}
    \end{subfigure}
    \caption{Illustration of samples from a squared exponential kernel with $\ell=5, \sigma^2=1$ (a), samples from a Mat\'ern kernel with $\ell=5, \sigma^2=1$ (b), and samples from a linear kernel with $\sigma^2=1$ (c). The squared exponential kernel results in smoother functions than the Mat\'ern kernel. The linear kernel results in straight lines.}
    \label{fig:kernels}
\end{figure}


\paragraph{Squared exponential kernel}
The squared exponential kernel is a stationary kernel --- a kernel dependent on the distance between $t$ and $t'$ scaled by lengthscale $\ell$, $\frac{||t - t'||}{\ell}$ --- given by 
\begin{equation}
    k(t, t') = \sigma^2 \exp\left(-\frac{||t-t'||^2}{2\ell^2}\right),
\end{equation}
where $\sigma^2$ is the variance, $\ell$ is the lengthscale, and $||t-t'||$ is the Euclidean distance between $t$ and $t'$. The squared exponential kernel is also known as the radial basis function (RBF) kernel.
\paragraph{Mat\'ern kernel} % k(r) = σ² (1 + √5r + 5/3r²) exp{-√5 r}
A Mat\'ern kernel with smoothness parameter $\nu = 5/2$ is given by
\begin{equation}
    k(t, t') = \sigma^2 \left(1 + \sqrt{5}\,\frac{||t-t'||}{\ell} + \frac{5}{3}\,\frac{||t-t'||^2}{\ell^2}\right) \exp\left(-\sqrt{5}\,\frac{||t-t'||}{\ell}\right).
\end{equation}

\paragraph{Linear kernel}
The linear kernel is given by 
\begin{equation}
k(t, t') = \sigma^2 t t',
\end{equation}
where $\sigma^2$ is again a variance parameter. The kernel models linear functions. Note that since this kernel does not depend on $||t-t'||$, it is nonstationary. 

Figure~\ref{fig:kernels} illustrates samples from a Gaussian process prior with (a) squared exponential, (b) Mat\'ern and (c) linear kernels.

\section{Steepness in the changepoint kernel} \label{apx:steepness}
Below, we provide a detailed analysis of the effect of setting the steepness parameter in the changepoint kernel to $\infty$.

The specification of the CP kernel is
    $$ k(f(t),f(t')) = k_1(t, t')\psi(t, t') + k_2(t, t')\bar\psi(t, t'), $$
where for a location $t_0$ and steepness $s$,
    $$ \psi(t, t') = \psi(t)\psi(t') = \frac{1}{1 + \exp{(-s(t - t_0))}} \times \frac{1}{1 + \exp{(-s(t' - t_0))}}, $$
and 
    $$ \bar \psi(t, t') = (1 - \psi(t))(1 - \psi(t')) $$
In the case that $s=\infty$, the components $\psi(t, t')$ and $\bar\psi(t, t')$ are driven to 1 and 0, respectively, for all $t, t' > t_0$:
    $$ \psi(t, t') = \psi(t)\psi(t') = \frac{1}{1 + \exp{(-\infty(t - t_0))}} \times \frac{1}{1 + \exp{(-\infty(t' - t_0))}}, $$
    then
    $$ \psi(t, t') = \psi(t)\psi(t') = \frac{1}{1 + 0} \times \frac{1}{1 + 0}, $$
    and thus
    $$ \psi(t, t') = 1;$$
    $$ \bar \psi(t, t') = (1 - \psi(t))(1-\psi(t')) = 0.$$
so we would conclude that then the changepoint kernel becomes equivalent to the first base kernel,
    $$ k(f(t),f(t')) = k_1(t, t').$$

When the location of the changepoint is moved to one of the extremes of the data window (say, $t_0 = 0$), we instead get 
$$ \psi(t, t') = \psi(t)\psi(t') = \frac{1}{1 + \exp{(-s(t-0))}} \times \frac{1}{1 + \exp{(-s(t'-0))}}. $$

We will briefly discuss the difference between setting the steepness to an extreme versus moving the location to an extreme (i.e. the edge of a window).

If the location of the change is at, or even beyond the edge of the window, both kernels can still describe the signal in the window if the steepness is sufficiently low. 
In Figure \ref{fig:low-steepness}, we visualize this in one dimension by plotting the area influenced by each base kernel via sigmoids. The location $t_0$ is plotted with the red dotted line. We have shaded the area influenced by kernel 1 blue and the area influenced by kernel 2 orange.
As can be seen from the figure, even if the location is placed outside the right bound of the data window, the signal in the window would still be modeled by both kernels.
\begin{figure}[h]
    \centering
    \includegraphics[width=0.5\linewidth]{low-steepness.png}
    \caption{Visualization of the effect of setting the steepness parameter to a low value, while the location of the changepoint is outside the window.}
    \label{fig:low-steepness}
\end{figure}
\begin{figure}[h]
    \centering
\includegraphics[width=0.5\linewidth]{high-steepness.png}
    \caption{Visualization of the effect of setting the steepness parameter to 500, while the location of the changepoint is outside the window.}
    \label{fig:high-steepness}
\end{figure}

In practice the steepness can of course be set to some high value, which would result in a similar effect as for $s=\infty$, that is, only one of the kernels will describe the signal in the window. Figure \ref{fig:high-steepness} shows the effect for steepness 500.

In conclusion, the effect of moving the location is different from setting the steepness to infinity.

\section{SegCPGP Algorithm}
Algorithm \ref{alg:segcpgp} shows pseudocode for the SegCPGP procedure. 

% Binseg <> SegCPGP.
\begin{algorithm}
\caption{Segmenting CPGP}\label{alg:segcpgp}
\begin{algorithmic}[1]
\State $\textit{location} \gets []$
\Procedure{SegmentingCPGP}{$X, y, k_1, k_2$}
\State \textit{location} $\gets \min_x X + (\max_x X - \min_x X) / 2$
\State \textit{steepness} $\gets 1$
\State $M_1 := \textsc{Gpr}(X, y, \textsc{ChangePoint}(k_1, k_2, \textit{location}, \textit{steepness}))$
\State $M_0 := \textsc{Gpr}(X, y, k_1)$
\For{$M$ in $[M_0, M_1]$}: 
\State $\hat M \gets \textsc{optimize}(M)$
%\State $\hat M \gets \textsc{Mala}(\hat M)$
\EndFor

\State $df \gets |M_{1}| - |M_{0}|$
\State $\mathcal{R} \gets -2 \left(\log p(y|\hat M_{0}) - \log p(y|\hat M_{1})\right)$
\State $p \gets \chi^2(\mathcal{R}, df)$
\If {$p > r$ } 
\Return
\EndIf
\If {$p \leq r$}
\State \textit{location} $\gets \hat M_{1}.\textit{location} $
\State \textit{steepness} $ \gets \hat M_{1}.\textit{steepness}$
\State $\epsilon \gets 5$
\If  {$\min_x {X} < \textit{location} < \max_x {X}$}
\State $X_{\text{left}}, X_{\text{right}} \gets X[:\textit{location} + \epsilon], X[\textit{location} - \epsilon :]$
\State $y_{\text{left}}, y_{\text{right}} \gets y[:\textit{location} + \epsilon], y[\textit{location} -\epsilon:]$ 
\State \textsc{SegmentingCPGP}$(X_{\text{left}}, y_{\text{left}}, k_1, k_2)$
\State \textsc{SegmentingCPGP}$(X_{\text{right}}, y_{\text{right}}, k_1, k_2)$
\EndIf
\Return
\EndIf
\Return
\EndProcedure
\end{algorithmic}
\end{algorithm}

\section{Synthetic Data Generation} \label{apx:synth-gen}
\begin{figure}[ht]
    \centering
    \begin{subfigure}[b]{0.45\textwidth}
        \centering
        \includegraphics[width=\textwidth]{uai2025-template/trendchange.png}
        \caption{}
    \end{subfigure}
    \hfill
    \begin{subfigure}[b]{0.45\textwidth}
        \centering
        \includegraphics[width=\textwidth]{uai2025-template/varchange.png}
        \caption{}
    \end{subfigure}
    
    \vskip\baselineskip
    
    \begin{subfigure}[b]{0.45\textwidth}
        \centering
        \includegraphics[width=\textwidth]{uai2025-template/meanchange.png}
        \caption{}
    \end{subfigure}
    \hfill
    \begin{subfigure}[b]{0.45\textwidth}
        \centering
        \includegraphics[width=\textwidth]{uai2025-template/perchange.png}
        \caption{}
    \end{subfigure}

    \caption{Examples of synthetic trend (a), variance (b), mean (c) and periodicity (d) changepoint datasets. The datasets include 400 samples, and changepoints at locations 100, 200 and 300. The steepness of the mean and variance changepoints, which were generated from changepoint Gaussian processes, is 1. }
    \label{fig:synthetic-data}
\end{figure}
We describe the generation process for the synthetic datasets. Examples of datasets from each change category are found in Figure~\ref{fig:synthetic-data}.

\paragraph{Multiple changepoints}
The changepoint kernel can be extended to multiple change points.

% %Need to reference http://www.cs.cmu.edu/~andrewgw/changepoints.pdf here
Let $[k_c]_{c=1}^C$ denote a list of kernels.
For $C$ kernels, there are $C-1$ changepoints, and the kernel for multiple changepoints becomes
\begin{equation}\label{eq:multiple-cp}
\text{cov}(f(t), f(t')) = \bar\psi_1(t, t')k_1(t, t') + \sum_{c=1}^{C-2} \psi_c(t, t') \bar\psi_{c+1}(t, t') k_{c+1}(t, t') + \psi_{C-1} (t, t') k_{C} (t, t').\end{equation} 

\paragraph{Mean changes}
Mean change data is sampled from a changepoint Gaussian process with a list of Constant kernels. A constant kernel,  
\begin{equation}\label{eq:kconstant}
    k(t, t') = \sigma^2,
\end{equation} 
has only a variance parameter $\sigma^2$. 
To the mean change data, we add Gaussian noise with mean 0 and variance 0.01.

\paragraph{Variance changes}
Variance change data is sampled from a changepoint Gaussian process with a list of noise (or white) kernels. The noise kernel,  
\begin{equation}\label{eq:knoise}
    k(t_i, t_j) =  \delta_{ij}\sigma^2,
\end{equation} 
where $\delta_{ij} = 1 $ if $i=j$, and 0 otherwise, and $\sigma^2$ is again the variance parameter. To ensure that there exist variance changes, the variance parameter is 1 when the kernel index $c$ is even, and sampled from $[3, 20)$ otherwise. 
As the variance data represents changes in noise, we do not add extra noise to the variance change data.

\paragraph{Trend changes}
Trend changes are generated according to the well-known line equation $f(t) = at + b$, where in each segment the slope $a$ is randomly sampled from $U(0, 2)$, and the sign of the slope switches for each segment. 

Since our objective is to test the ability of each method to detect only one particular type of change, it is crucial that no other types of changes occur in the signal. A bias term $b$ is thus added to the signal, such that there are no jumps in the signal. To the trend change data, we add Gaussian noise with mean 0 and variance 1.

\paragraph{Periodicity changes}
A periodic signal is generated according to a sine wave, $f(t) = \sin(\omega t)$, where the angular frequency $\omega$ is randomly sampled from $[1, 100)$. As we stated earlier, we only want to test the detection capacity of the benchmark models on a single change type. We therefore do not change the amplitude of the signal, since that might be interpreted as a change in variance. 
To the periodicity change data, we add Gaussian noise with mean 0 and variance 0.1.

\section{Description Of Benchmark Datasets} \label{apx:benchmark}
We briefly describe the benchmark datasets used in this paper, which were originally presented in \citet{van2020evaluation}. All datasets used in this paper are univariate.

\paragraph{Business Inventories}
The Business Inventories dataset contains United States monthly total business inventories. The length of the dataset is 330. The minimum amount of changepoints found by annotators is 0; the maximum amount of changepoints found by annotators is 3. 

\paragraph{GDP Argentina}
The GDP Argentina dataset contains the gross domestic product of Argentina, measured from 1960 up to 2019. 
The dataset has length 59. The minimum amount of changepoints found by annotators is 0; the maximum amount of changepoints found by annotators is 3. 

\paragraph{GDP Iran}
The GDP Iran dataset contains the gross domestic product of Iran, measured from 1960 to 2020. 
The dataset has length 58. The minimum amount of changepoints found by annotators is 0; the maximum amount of changepoints found by annotators is 3. 

\paragraph{GDP Japan}
The GDP Japan dataset contains the gross domestic product of Japan measured yearly from 1960 to 2020.
The dataset has length 58. The minimum amount of changepoints found by annotators is 0; the maximum amount of changepoints found by annotators is 1. 

\paragraph{Ozone}
The Ozone dataset contains yearly measurements of the global emissions of ozone-depleting substances. 
The dataset has length 54. The minimum amount of changepoints found by annotators is 0; the maximum amount of changepoints found by annotators is 1. 

\section{Hyperparameters Of Compared Models}
We give an overview of the hyperparameters used for the models in our synthetic data and benchmark experiments. 

\subsection{Gaussian process-based models}
Both ADAGA and SegCPGP use GPflow \citep{matthews2017gpflow}, a Python package implementing Gaussian processes and Gaussian process regression in TensorFlow \citep{tensorflow2015-whitepaper}. The kernels used in both ADAGA and SegCPGP use their default hyperparameters from the GPflow package. 

\paragraph{ADAGA}
For ADAGA, as in \citet{caldarelli2022adaptive}, the minimal window size is set to 15 and the batch size is set to 1. $\delta$ is set to 0.6 by default. 
The version used in the benchmark experiment is the inducing points version; an implementation can be found \href{https://github.com/lasgroup/adaga}{here}.

\paragraph{SegCPGP}
The $p$-value for SegCPGP is set to 0.1 by default. 

\subsection{Turing Changepoint Dataset Benchmark Methods}
We briefly describe some specific hyperparameters used in the Default setting for the Turing Changepoint Dataset Benchmark (\href{https://github.com/alan-turing-institute/TCPDBench}{TCPDBench}). TCPDBench uses methods implemented in Python and R, which can be found \href{https://github.com/alan-turing-institute/TCPDBench/tree/master/execs}{here}. In principle, running this benchmark after cloning the TCPDBench repository should already have the default parameters set correctly. The parameters of the default experiment are also described in \citet{van2020evaluation}; for completeness, we also describe them here.

Where possible, links to the documentation of the original packages are provided.

\paragraph{BinSeg \& PELT}
The implementations of BinSeg and PELT originate from the \href{https://cran.r-project.org/package=changepoint}{changepoint}  R package \citep{killick2014changepoint}.
Both methods by default try to find a change in mean. They both use the Modified Bayesian Information Criterion as penalty. The test statistic used by both methods is the Normal test statistic, which assumes a normal distribution for the errors. 

\paragraph{CPNP}
The documentation for CPNP, a nonparametric version of PELT implemented in R, is found \href{https://cran.r-project.org/package=changepoint.np}{here}. In TCPDBench, the number of quantiles is set to 10.

\paragraph{ECP \& KCPA}
Kernel Change Point Analysis (KCPA), proposed by \citet{harchaoui2009regularized}, combines the kernel trick and dynamic programming to detect changepoints. The constant penalty of KCPA is set to 1.0; the maximum number of changepoints is set to the maximum number possible. 

Energy change points, or ECP, was proposed by \cite{matteson2014nonparametric}. The parameter $\alpha$ of ECP is set to 1. The minimum number of timesteps between changepoints is set to 30; 199 random permutations are used in each permutation test; the significance level is set to 0.05. 

The documentation for both methods can be found \href{https://cran.r-project.org/package=ecp}{here} \citep{james2013ecp}.

\paragraph{BOCPD}
The implementation of Bayesian online changepoint detection (BOCPD) is the one found in the Online ChangePoint (OCP) package. The documentation for the OCP package can be found \href{https://cran.r-project.org/package=ocp}{here}. The prior parameters $a$, $b$ and $k$ are all set to 1. The hazard function intensity \texttt{lambda} is set to 100.

\paragraph{RBOCPDMS}
The authors of \citet{van2020evaluation} also created \href{https://github.com/GjjvdBurg/rbocpdms}{RBOCPDMS} \citep{knoblauch2018doubly}. For the benchmark, the code is run from this repository. In the case of RBOCPDMS, the run length is pruned to the best 100 run lengths; $\alpha_0$ and $\alpha_{rld}$ were both set to 0.5.
The timeout for RBOCPDMS is set to 4 hours by default for the benchmark experiment.

\section{One-Versus-Rest \texorpdfstring{$F_1$}{F₁}-Scores}\label{apx:table-2}

For each of the annotators in the annotations of \citet{van2020evaluation}, we compute their one-versus-rest $F_1$-score for each benchmark dataset. Then, we compute the ZERO-versus-rest $F_1$ score for each benchmark dataset. We compare their pairwise differences using the Wilcoxon signed-rank test described earlier. 
We also added a PERFECT method, which (artificially) obtains an $F_1$-score of 1 on every single dataset.

The one-versus-rest $F_1$-scores for each annotator are found in Table~\ref{tab:ovr-scores}. Both the ZERO and PERFECT methods are included; the ZERO method never returns any changepoints, while the PERFECT method artificially obtains an $F_1$-score of 1 on each dataset.
Unfortunately, none of the annotators, including the PERFECT annotator, performs significantly differently from the ZERO method.

In conclusion, if neither any single expert annotator nor a perfect score performs significantly differently from the ZERO method, we conjecture that any changepoint algorithm set loose on this benchmark is faced with an impossible task.

\begin{table}[ht]
    \centering
    
    \caption{One-versus-rest $F_1$ scores for every annotator versus the rest of the annotators, for each dataset. Not all datasets have been annotated by all annotators; missing values are represented with ---. The ZERO method never returns any changepoints; the PERFECT method artificially returns an $F_1$-score of 1 for every dataset.}
\resizebox{0.48\textwidth}{!}{
       \begin{tabular}{l|rrrrr}
        \toprule
         annot. & businv & gdp\_argentina & gdp\_iran & gdp\_japan & ozone \\
        \midrule
        6 & 1.000 & 0.769 & 0.829 & 0.857 & 0.957 \\
        7 & 0.426 & 0.769 & --- & 0.857 & 0.957 \\
        8 & 0.897 & 0.769 & 0.523 & 0.857 & 0.629 \\
        9 & 1.000 & --- & 0.857 & 1.000 & --- \\
        10 & --- & --- & 0.968 & --- & 0.957 \\
        12 & --- & 1.000 & 0.968 & 1.000 & 0.800 \\
        13 & 1.000 & 1.000 & --- & --- & --- \\
        ZERO & 0.588 & 0.824 & 0.652 & 0.889 & 0.723 \\
        PERFECT & 1.000 & 1.000 & 1.000 & 1.000 & 1.000 \\
        \bottomrule
        \end{tabular}}
    \label{tab:ovr-scores}
\end{table}

\section{Classification Measures} \label{apx:classification}
Changepoint detection can be evaluated as a classification problem, when finding the locations of the changepoints is of interest.
In this section, we give a detailed description of the computation of the $F_1$-score for changepoint detection, as also presented by \citet{van2020evaluation}; then, we highlight a problem with the $F_1$-score when a trivial changepoint is included. Finally, we describe how the false negative and false positive rates are computed in our experiments.

\subsection{The \texorpdfstring{$F_1$}{F₁}-score} \label{apx:f1}
In the context of changepoint detection, a true positive (TP) is any changepoint detected within a certain margin from the true changepoint \citep{van2020evaluation, killick2012optimal, truong2020selective}. 
Let $\mathcal{X}$ denote the predictions of some changepoint detection algorithm on some dataset. Assume there are $K$ annotators, each of whom provides an annotation, so that the set of all annotations is $\mathcal{T} =\{\mathcal{T}_k\}_{k=1}^K$. Since some of the annotators may naturally identify the same changepoints, we also define the set of unique annotations as $\mathcal{T}^*=\bigcup_{k=1}^K \mathcal{T}_k$.

Let $\textnormal{TP}(\mathcal{X}, \mathcal{T}^*)$ be a set-based evaluation of true positives for predictions $\mathcal{X}$ and the set of all unique annotations $\mathcal{T}^*$,
\begin{align*}
\textnormal{TP}(\mathcal{X}, \mathcal{T}^*) = \{\forall t\in \mathcal{X}, \forall \tau \in \mathcal{T}^*: |t-\tau| \leq M\},\end{align*}
and $\textnormal{TP}(\mathcal{X}, \mathcal{T}_k)$ be the true positives with respect to the annotations of annotator $k\in \{1, \dots, K\}$,
\begin{align*}
\textnormal{TP}(\mathcal{X}, \mathcal{T}_k) = \{\forall t\in \mathcal{X}, \forall \tau \in \mathcal{T}_k: |t-\tau| \leq M\},
\end{align*}
where $M$ is some margin around the true changepoint. Generally, $M \geq 0$, but $M$ is usually set to 5 time steps in practice.

The precision ($\textnormal{P}$) is calculated as the proportion of changepoints detected by the algorithm that are true positives,
\begin{align*}
\textnormal{P} = \frac{|\textnormal{TP}(\mathcal{X}, \mathcal{T}^*)|}{|\mathcal{X}|},
\end{align*}
and the recall ($\textnormal{R}$) is calculated as the proportion of annotated changepoints that are true positives, averaged over all annotators,
\begin{align*}
\textnormal{R} = \frac{1}{K}\sum_{k=1}^K\frac{|\textnormal{TP}(\mathcal{X}, \mathcal{T}_k)|}{|\mathcal{T}_k|}.
\end{align*}
The $F_1$-score is then computed as
\begin{align*}F_1 = 2 \cdot \frac{\textnormal{P}\cdot\textnormal{R}}{\textnormal{P} + \textnormal{R}}.
\end{align*}


\subsection{Trivial changepoints} \label{apx:trivial}
The $F_1$-score as defined in \cite{van2020evaluation} and Appendix \ref{apx:f1} adds the trivial changepoint $t=1$ to all annotations, as well as to all predictions.
While necessary to prevent the $F_1$-score from being undefined in case no changepoints are found or annotated, the $F_1$-score behaves strangely when no changepoints are detected by the algorithm. Due to the trivial changepoint, the precision of the ZERO method is always 1, and in cases where not many changepoints are annotated this will lead to unreasonably high $F_1$-scores.

\paragraph{Example}
Consider a dataset where two annotators provide the changepoints $\mathcal{T} = \{[45], [50]\}$, making $\mathcal{T}^*=\{45, 50\}$. Assuming the annotators are experts, it is reasonable to assume there is some unknown true changepoint around $t=45$ to $t=50$.

Now consider a ZERO method, which always gives $\mathcal{X} = \varnothing$ as a prediction. 
In order to compute the $F_1$-score, we add the trivial changepoint 1 to both the predictions and all annotations, so we have the annotations $\{[1, 45], [1, 50]\}$, which makes $\mathcal{T}^*=\{1, 45, 50\}$, and $\mathcal{X} = \{1\}$.

Computing the precision, recall and $F_1$-score then leads to
\begin{align*}
\textnormal{P} = \frac{|\{1\}|}{|\{1\}|} = 1,
\end{align*}
\begin{align*}
\textnormal{R} = \frac{1}{2}\left(\frac{|\{1\}|}{|\{1, 45\}|} + \frac{|\{1\}|}{|\{1, 50\}|}\right) = \frac{1}{2} \left(\frac{1}{2} + \frac{1}{2}\right) = \frac{1}{2},
\end{align*}
\begin{align*}
F_1 = 2\cdot\frac{\textnormal{P}\cdot \textnormal{R}}{\textnormal{P}+\textnormal{R}} = 2 \cdot \frac{1/2}{3/2} = \frac{2}{3} \approx 0.67.
\end{align*}

 It is easy to see from this example that the inclusion of the trivial changepoints means that the ZERO method will always get a precision of $1$ without finding any changepoint. Furthermore, without agreeing with any of the annotators, the ZERO method gets an $F_1$-score of $0.67$. 
 Thus, although it is necessary to include the trivial changepoint to prevent the precision and recall from being undefined, the subsequent results are arguably unreasonable.

 In our synthetic data experiment, the tested methods did mostly manage to perform differently from the ZERO method. If the annotators provide enough unique changepoints (in this case, there were three ground truth changepoints), the recall will be somewhat lower --- though we still think it is unreasonably high --- and the tested methods are actually capable of performing differently from the ZERO method.

\subsection{False positive and false negative rate} \label{apx:fprnfnr}
In order to compute the true and false positives (TP and FP, respectively) and the true and false negatives (TN and FN, respectively), we use a similar method as in Appendix \ref{apx:f1}, except that $\mathcal{X}$ now contains all performed hypothesis tests. We denote a hypothesis test by $h(t)$, which tests some location $t$ and returns
  \begin{align*}
    h(t) =
    \begin{cases}
      H_0, & \text{if the null hypothesis cannot be rejected,} \\
      H_1, & \text{otherwise.}
    \end{cases}
  \end{align*}

A true positive is then a situation where the tested location $t$ is within the margin of the true changepoint, and the test indicates $H_1$,
\begin{align*}
\textnormal{TP}(\mathcal{X}, \mathcal{T}^*) = \{\forall t\in \mathcal{X}, \forall \tau \in \mathcal{T}^*: |t-\tau| \leq M \land h(t) = H_1\},
\end{align*}

whereas a false positive is a situation where the tested location $t$ is outside the margin of the true changepoint, and the test indicates $H_1$,
\begin{align*}
\textnormal{FP}(\mathcal{X}, \mathcal{T}^*) = \{\forall t\in \mathcal{X}, \forall \tau \in \mathcal{T}^*: |t-\tau| > M \land h(t) = H_1\}.
\end{align*}

In contrast, a true negative is a situation where the tested location is outside the margin around the changepoint, and the test indicates $H_0$, 
\begin{align*}
\textnormal{TN}(\mathcal{X}, \mathcal{T}^*) = \{\forall t\in \mathcal{X}, \forall \tau \in \mathcal{T}^*: |t-\tau| > M \land h(t) = H_0\},
\end{align*}
and a false negative is a situation where the tested location $t$ is inside the margin around the changepoint, and the test indicates $H_0$,
\begin{align*}
    \textnormal{FN}(\mathcal{X}, \mathcal{T}^*) = \{\forall t\in \mathcal{X}, \forall \tau \in \mathcal{T}^*: |t-\tau| \leq M \land h(t) = H_0\}.
\end{align*}

The FNR and FPR, as used in Section \ref{sec:emp-analysis} of the main paper, are computed as 
\begin{align*}
\textnormal{FPR} = \frac{\textnormal{FP}}{\textnormal{FP}+\textnormal{TN}},
\end{align*}
and
\begin{align*}
    \textnormal{FNR} = \frac{\textnormal{FN}}{\textnormal{FN}+\textnormal{TP}}.
\end{align*}

\end{document}
