% \documentclass{uai2023} % for initial submission
\documentclass[accepted]{uai2023} % after acceptance, for a revised version; also before submission to see how the non-anonymous paper would look like

%% There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2023} % ptmx math instead of Computer
% Modern (has noticeable issues)
% \documentclass[mathfont=newtx]{uai2023} % newtx fonts (improves upon
 % ptmx; less tested, no support)
% NOTE: Only keep *one* line above as appropriate, as it will be replaced
%       automatically for papers to be published. Do not make any other
%       change above this note for an accepted version.

%% Choose your variant of English; be consistent
\usepackage[american]{babel}
% \usepackage[british]{babel}

%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} % loads amsmath, with fixes and additions
\usepackage{amssymb}
\usepackage{amsthm}

\theoremstyle{plain}
\newtheorem{theorem}{Theorem}[section]
\newtheorem{proposition}[theorem]{Proposition}
\newtheorem{lemma}[theorem]{Lemma}
\newtheorem{corollary}[theorem]{Corollary}
\theoremstyle{definition}
\newtheorem{definition}[theorem]{Definition}
\newtheorem{assumption}[theorem]{Assumption}
\theoremstyle{remark}
\newtheorem{remark}[theorem]{Remark}
\newtheorem{example}{Example}

\usepackage[ruled,vlined]{algorithm2e}
\usepackage{color}
% \usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{tikz} % nice language for creating drawings and diagrams
% for cross referencing the main text
% PLEASE ONLY USE xr IN THE SUPPLEMENTARY MATERIAL. 
% In the main paper, hard code any cross-reference to the supplementary material. 
\usepackage{xr} 
\usepackage{multirow}
\externaldocument{uai2023-main}

%% Provided macros
% \smaller: Because the class footnote size is essentially LaTeX's \small,
%           redefining \footnotesize, we provide the original \footnotesize
%           using this macro.
%           (Use only sparingly, e.g., in drawings, as it is quite small.)

%% Self-defined macros
\DeclareMathOperator*{\argmax}{arg\,max}
\DeclareMathOperator*{\argmin}{arg\,min}

\def\de{\overset{\Delta}{=}}
\newcommand{\Jie}[1]{{\color{green}#1}}

\SetKwInput{KwInput}{Input}
\SetKwInput{KwOutput}{Output}
\SetKwComment{Comment}{/*}{ */}
\let\oldnl\nl% Store \nl in \oldnl
\newcommand{\nonl}{\renewcommand{\nl}{\let\nl\oldnl}}

\title{Robust Quickest Change Detection for Unnormalized Models\\(Supplementary Material)}

% The standard author block has changed for UAI 2023 to provide
% more space for long author lists and allow for complex affiliations
%
% All author information is automatically removed by the class for the
% anonymous submission version of your paper, so you can already add your
% information below.
%
% Add authors
\author[1]{Suya Wu}
\author[1]{Enmao Diao}
\author[2]{Taposh Banerjee}
\author[3]{Jie Ding}
\author[1]{Vahid Tarokh}
% Add affiliations after the authors
\affil[1]{%
    Department of Electrical and Computer Engineering\\
    Duke University\\
    Durham, NC 27708 USA
}
\affil[2]{%
    Department of Industrial Engineering\\
    University of Pittsburgh\\
    Pittsburgh, PA 15213 USA\\
}
\affil[3]{%
    School of Statistics\\
    University of Minnesota Twin Cities\\
    Minneapolis, 
MN 55455 USA
  }
\begin{document}
  
\onecolumn %% Turn this off if single column is desired for the supplement
\maketitle
\appendix
\section{Likelihood Ratio-based Robust CUSUM Algorithm}
\label{subsec:llr-cusum}
% In this section, we review the LLR-based CUSUM algorithm and present our Score-based CUSUM (SCUSUM) algorithm. Following the scheme of CUSUM, the proposed method can be used in a recursive way, which is not too demanding in computational and memory requirements for online implementation. 
\noindent In this section, we review the result in \cite{unnikrishnan2011minimax} on classical robust quickest change detection. Let $p_{\infty}$ and $p_{1}$ be the density functions of pre- and post-change distributions. If the post-change law is known, then given the data stream $\{X_n\}_{n\geq 1}$, the stopping rule of the likelihood ratio-based CUSUM algorithm is defined by
\begin{equation} 
\label{eq:cusumrule}
    T_{\texttt{CUSUM}}=\inf\{n\geq 1:\Lambda(n)\geq \tau\},
\end{equation}
where $\Lambda(n)$ is defined using the recursion
\begin{align}
    &\Lambda(0)=0, \nonumber \\
    &\Lambda(n) \de \biggl(\Lambda(n-1)+\log \frac{p_1(X_n)}{p_{\infty}(X_n)}\biggr)^{+}, \quad \forall n \geq 1, \label{eq:cusum_score1}
 \end{align}
which leads to a computationally efficient stopping scheme (if the densities $p_1$ and $p_{\infty}$ are precisely known). 
In \cite{moustakides1986optimal}, it is shown that the CUSUM algorithm is exactly optimal, for every fixed constraint $\gamma$, for Lorden's problem. As pointed out in \cite{lai1998information}, the algorithm is also asymptotically optimal for Pollak's problem. In \cite{lorden1971procedures} and \cite{lai1998information}, the asymptotic performance of the CUSUM algorithm is also characterized. Specifically, it is shown that, as $\gamma \rightarrow \infty$,
\begin{align}
\label{eq:optimality_cusum}
    \mathcal{L}_{\texttt{WADD}}(T_{\texttt{CUSUM}}) \sim \mathcal{L}_{\texttt{CADD}}(T_{\texttt{CUSUM}})\sim \frac{\log \gamma}{\mathbb{D}_{\texttt{KL}}(P_{1}\|P_{\infty})}.
\end{align}
Here $\mathbb{D}_{\texttt{KL}}(P_{1}\|P_{\infty})$ is the Kullback-Leibler divergence between the post-change density $p_1$ and the pre-change density $p_{\infty}$:
\[
\mathbb{D}_{\texttt{KL}}(P_{1}\|P_{\infty}) = \int_x p_1(x) \log \frac{p_1(x)}{p_\infty(x)} \, dx,
\]
and the notation $g(c)\sim h(c)$ as $c\to c_0$ indicates that $\frac{g(c)}{h(c)} \to 1$ as $c\to c_0$ for any two functions $c\mapsto g(c)$ and $c\mapsto h(c)$.

The CUSUM algorithm can successfully detect a change in law from $p_\infty$ to $p_1$ because 
\begin{equation}
\label{eq:driftCUSUM}
    \begin{split}
        \int_x &\log \frac{p_1(x)}{p_\infty(x)} p_1(x) dx = \mathbb{D}_{\texttt{KL}}(P_{1}\|P_{\infty}) > 0, \\
         \int_x &\log \frac{p_1(x)}{p_\infty(x)} p_\infty(x) dx = -\mathbb{D}_{\texttt{KL}}(P_\infty\|P_1) < 0.
    \end{split}
\end{equation}
Thus, the mean of the increment of $\Lambda(n)$
in \eqref{eq:cusum_score1} before the change is negative, and after the change is positive. 

If the post-change density $p_1$ is not known and assumed to belong to a family $\mathcal{G}_1$, then the test is designed using the least favorable distribution. Specifically, in \cite{unnikrishnan2011minimax}, it is assumed that there is a density $q_1 \in \mathcal{G}_1$ such that for every $p_1 \in \mathcal{G}_1$, 
\begin{equation}
\label{eq:leastfavunni}
    \begin{split}
        \log \frac{q_1(X)}{p_\infty(X)} \bigg|_{X \sim q_1} \; \; \prec \quad \; \; \log \frac{q_1(X)}{p_\infty(X)}\bigg|_{X \sim p_1} . 
    \end{split}
\end{equation}
Here the notation $\prec$ is used to denote stochastic dominance: if $W$ and $Y$ are two random variables, then $W \prec Y$ if
$$
P(Y \geq t) \geq P(W \geq t), \quad \text{for all } t \in (-\infty, \infty). 
$$
If such a density $q_1$ exists in the post-change family, then the robust CUSUM is defined as the CUSUM test with $q_1$ used as the post-change density. Such a test is exactly optimal for the problem of \cite{lorden1971procedures} under additional assumptions on the smoothness of densities, and asymptotically optimal for the problem in \cite{pollak1985optimal}. We refer the reader to \cite{unnikrishnan2011minimax} for a more precise optimality statement. 

We note that in the literature on quickest change detection, the issue of the unknown post-change model has also been addressed by using a generalized likelihood ratio (GLR) test or a mixture-based test. While these tests have strong optimality properties, they are computationally even more expensive than the robust test described above; see \cite{lorden1971procedures, lai1998information, tartakovsky2014sequential}. 

As discussed in the introduction, the robust CUSUM algorithm discussed above may have two major drawbacks: 1) Due to the complicated characterization of the least favorable distribution $q_1$ \eqref{eq:leastfavunni}, it may be hard to identify in high-dimensional models. 2) The robust CUSUM is a likelihood ratio-based test and is thus computationally expensive to implement for high-dimensional models. 

In Section 4 of the main paper, we propose the RSCUSUM algorithm to mitigate these issues. 
\begin{enumerate}
    \item The RSCUSUM algorithm is based on the Hyv\"arinen score~\citep{hyvarinen2005estimation} and is invariant to normalizing constants. This makes it computationally efficient for high-dimensional models which are often only learnable within a normalizing constant. 
    \item We defined the notion of least favorable distribution differently in our paper. For us, the least favorable distribution has the least Fisher divergence with respect to the pre-change model. We also provided an efficient computational method to identify the least favorable distribution. 
\end{enumerate}

% \section{Proper Scoring Rules, Fisher Information, and Hyv\"arinen Score}
% \label{subsec:score_rules}
% \noindent In this section, we review the concept of proper scoring rules and its connection to Fisher information and Hyv\"arinen score. Let $X$ be a random variable with values in $\mathcal{X}\subseteq \mathbb{R}^d$, and let $\mathcal{P}$ be a family of distributions over $\mathcal{X}$. Let $P$ and $Q \in \mathcal{P}$ denote the true data-generating distribution and a postulated distribution, and let $p$ and $q$ respectively denote their corresponding densities. \citet{gneiting2007strictly} studied proper scoring rules as a unified framework to measure the quality of postulated models on observed data.

% \begin{definition}[Proper Scoring Rule] A scoring rule is a function $(X, Q)\mapsto \mathcal{S}(X,Q)$ that measures the quality of $Q$ for modeling data represented by $X$. It is said to be \textit{proper} if for all $P \in \mathcal{P}$, the expected score $\mathbb{E}_{X\sim P}\mathcal{S}[(X, Q)]$ is minimized at $Q=P$, where the minimum is taken over all $Q \in \mathcal{P}$. 
% Moreover, $\mathcal{S}$ is \textit{strictly proper} with respect to $\mathcal{P}$, if for any $Q\in \mathcal{P}$ and $Q\neq P$, $\mathbb{E}_{X\sim P}[\mathcal{S}(X, Q)] > \mathbb{E}_{X\sim P}[\mathcal{S}(X, P)]$. 
% \end{definition}
% %A proper scoring rule penalizes the uncertainty of a belief $Q$ given observations $X$ that adhere to $P$, which is applicable to fairly arbitrary statistical decision problems. 
% The logarithmic scoring rule~\cite{good1992rational} is a well-known and widely applied example of a strictly proper scoring rule. 
% \begin{definition}[Logarithmic Score]
%     The logarithmic scoring rule (also called the log score) is given by
%     \begin{equation*}
%         (X, Q)\mapsto \mathcal{S}_{\texttt{L}}(X, Q) \de -\log q(X).
%     \end{equation*}
% \end{definition}
% Minimizing the log score is associated with maximum likelihood estimation (MLE) and the Kullback-Leibler (KL) divergence
% \begin{equation*}
%     \mathbb{D}_{\texttt{KL}}(P\|Q)\de \mathbb{E}_{X\sim P} \left[\log p(X) - \log q(X)\right].
% \end{equation*} 
% Since $\mathbb{D}_{\texttt{KL}}(P\|Q) > 0$ for any $Q\neq P$, the log score is \textit{strictly proper}. The detection score of LLR-based CUSUM, defined in Equation~(\ref{eq:cusum_score}), can be rewritten by $$
% \log \frac{p_1(X_n)}{p_{\infty}(X_n)} = \mathcal{S}_{\texttt{L}}(X_n, P_{\infty})-\mathcal{S}_{\texttt{L}}(X_n, P_1).$$
% \subsection{Fisher divergence and Hyv\"arinen score}
% \label{subsec:fisher_hyvarinen}
% % Like before, we consider a family of distributions $\mathcal{P}$. However, we assume that any distribution $Q \in \mathcal{P}$ with the PDF $q(x)$ is potentially known only up to a normalizing constant. In other words, instead of $q(x)$, we are given $\tilde{q}(x)$ with
% % \begin{equation*}
% %     q(x) = \frac{\tilde{q}(x)}{\int_{x\in \mathcal{X}} \tilde{q}(x)dx}.
% % \end{equation*}
% % In many cases, the computation of the denominator (also known as the \textit{normalizing factor} or the \textit{partition function}) may be intractable. In fact, the number of points required for approximating the integral in the above may grow exponentially as a function of the dimension of $\mathcal{X}$. 
% \noindent \citet{hyvarinen2005estimation} proposed an estimation procedure for unnormalized statistical models by minimizing the Fisher divergence from $P$ to $Q$, defined by
% \begin{align*}
%     \mathbb{D}_{\texttt{F}} (P \| Q) \de \mathbb{E}_{X\sim P} \left[\left \| \nabla_{\mathbf{x}} \log p(X)- \nabla_{\mathbf{x}} \log q(X)\right \|_2^2 \right],
% \end{align*}
% where $\|\cdot\|_2$ denotes the Euclidean norm. Clearly, $\nabla_{\mathbf{x}} \log p(X)$ and $\nabla_{\mathbf{x}} \log q(X)$ remain invariant if $p$ and $q$ are scaled by any positive constant with respect to $X$. Hence, the Fisher divergence remains \textit{scale-variant} with respect to an arbitrary constant scaling of density functions. Under some mild regularity conditions on $p$ and $q$, \citet{hyvarinen2005estimation} showed that
% \begin{align*}
%     \mathbb{D}_{\texttt{F}} (P \| Q) =\mathbb{E}_{X\sim P} \left[\frac{1}{2}\left \| \nabla_{\mathbf{x}} \log p(X) \right \|_2^2 + \mathcal{S}_{\texttt{H}}(X, Q)\right],
% \end{align*}
% where $\mathcal{S}_{\texttt{H}}(X, Q)$ a \textit{scale-invariant} proper scoring function, referred to as the Hyv\"arinen score in the framework of proper scoring rules~\cite{parry2012proper} (see a precise definition below). Since $\frac{1}{2}\left \| \nabla_{\mathbf{x}} \log p(X) \right \|_2^2$ is a constant in terms of $Q$, then minimizing the Fisher divergence is equivalent to minimizing $\mathcal{S}_{\texttt{H}}(X, Q)$.
% \begin{definition}[Hyv\"arinen Score] The Hyv\"arinen score is a mapping  $(X, Q)\mapsto \mathcal{S}_{\texttt{H}}(X, Q)$ given by 
%     \begin{equation}
%         \label{eq:hyv_score}
%         \mathcal{S}_{\texttt{H}}(X, Q) \de \frac{1}{2} \left \| \nabla_{X} \log q(X) \right \|_2^2 + \Delta_{X} \log q(X)
%     \end{equation}
% whenever it can be well defined. Here, $\nabla_{X}$ and $\Delta_{X} = \sum_{i=1}^d \frac{\partial^2}{\partial x_i^2}$ respectively denote the gradient and the Laplacian operators acting on $X = (x_1, \cdots, x_d)^{\top}$.
% \end{definition}
% $\mathcal{S}_{\texttt{H}}$ is \textit{scale-invariant} inherited from the \textit{scale-invariant} property of Fisher divergence. This property avoids the computation of the normalizing constant for unnormalized models. Specifically, when the knowledge of $Q$ is up to $\tilde{q}(x)$ with
% \begin{equation*}
%     q(x) = \frac{\tilde{q}(x)}{\int_{x\in \mathcal{X}} \tilde{q}(x)dx},
% \end{equation*} 
% it is easy to see that $\mathcal{S}_{\texttt{H}}(X, Q)$ remains invariant by replacing the density $q$ with the associated unnormalized term $\tilde{q}$. Additionally, it is easy to verify that $\mathbb{D}_{\texttt{F}}(P\| Q) > 0$ for $Q \neq P$, thus the Hyv\"arinen score is \textit{strictly proper}.

% For a probability measure $Q$ and a random variable $X$, a scoring rule \cite{gneiting2007strictly} is a function $(X, Q)\mapsto \mathcal{S}(X,Q)$ which intuitively  measures the quality of $Q$ for modeling data represented by $X$. Let $\mathcal{P}$ denote the set of all probability measures. The scoring rule $\mathcal{S}$ is said to be \textit{proper} if for all $P \in \mathcal{P}$, the expected score $\mathbb{E}_{X\sim P}\mathcal{S}[(X, Q)]$ is minimized at $Q=P$, where the minimum is taken over all $Q \in \mathcal{P}$.  Moreover, $\mathcal{S}$ is \textit{strictly proper} with respect to $\mathcal{P}$, if for any $Q\in \mathcal{P}$ and $Q\neq P$, $\mathbb{E}_{X\sim P}[\mathcal{S}(X, Q)] > \mathbb{E}_{X\sim P}[\mathcal{S}(X, P)]$. 

% A well-known example of a strictly proper scoring rule is the logarithmic scoring rule~\cite{good1992rational} given by
%     \begin{equation*}
%         (X, Q)\mapsto \mathcal{S}_{\texttt{L}}(X, Q) \de -\log q(X),
%     \end{equation*}
%     where $q$ is the density of $Q$. 
% It is well-known that minimizing the log score is associated with maximum likelihood estimation (MLE) and the Kullback-Leibler (KL) divergence $\mathbb{D}_{\texttt{KL}}(P\|Q)$ between probability measures $P$ and $Q$. 
% Since $\mathbb{D}_{\texttt{KL}}(P\|Q) > 0$ for any $Q\neq P$, the log score is \textit{strictly proper}.  

% % Like before, we consider a family of distributions $\mathcal{P}$. However, we assume that any distribution $Q \in \mathcal{P}$ with the PDF $q(x)$ is potentially known only up to a normalizing constant. In other words, instead of $q(x)$, we are given $\tilde{q}(x)$ with
% % \begin{equation*}
% %     q(x) = \frac{\tilde{q}(x)}{\int_{x\in \mathcal{X}} \tilde{q}(x)dx}.
% % \end{equation*}
% % In many cases, the computation of the denominator (also known as the \textit{normalizing factor} or the \textit{partition function}) may be intractable. In fact, the number of points required for approximating the integral in the above may grow exponentially as a function of the dimension of $\mathcal{X}$. 

\section{Proofs}
The theoretical analysis of the detection delay and false alarms is analogous to the analysis in \cite{wuetal-aistat-2023}. We give the full proofs here for completeness.

%In this section, we provide proof of those statements 
% \subsection{Proof of Lemma 4.1}
% \begin{proof}
% Consider the set of distributions $\lambda Q_1+(1-\lambda) Q_2 \in \mathcal{G}_1$, for $\lambda \in [0,1]$, with densities $\lambda q_1 + (1-\lambda) q_2$. This convex combination exists in $\mathcal{G}_1$ due to the convexity assumption on $\mathcal{G}_1$. Recall that $p_\infty$ is the pre-change density. Clearly the distance
% $\mathcal{L}(\lambda) = \mathbb{D}_{\texttt{F}} \left(\lambda Q_1+(1-\lambda) Q_2\| P_{\infty}\right)$ is minimized at $\lambda = 1$ and thus $\frac{\partial \mathcal{L}(1^-)}{\partial \lambda}\le 0$.
% Since
%     \begin{equation}
%         \mathcal{L}(\lambda)=\int_x \| \nabla \log \left(\lambda q_1+(1-\lambda) q_2\right)-\nabla \log p_{\infty} \|^2\left(\lambda q_1+(1-\lambda) q_2\right) d x,
%     \end{equation}
% we have 
% \begin{multline}
%         \frac{\partial \mathcal{L}(\lambda)}{\partial \lambda}=\int_x\left(q_1-q_2\right)\left\|\nabla \log \left(\lambda q_1+(1-\lambda) q_2\right)-\nabla \log p_{\infty}\right\|^2 d x\\
%         +\int_r\left(\lambda q_1+(1-\lambda) q_2\right) \cdot 2\left(\nabla\log \left(\lambda Q_1+(1-\lambda) Q_2\right)-\nabla \log P_{\infty}\right)\nabla\left( \frac{Q_1-Q_2}{\lambda Q_1+(1-\lambda) Q_2} \right)d x
%         %transport of the gradient - a vector
% \end{multline}
% Calculating $\mathcal{L}(\lambda)$ takes the minimum at $\lambda=1^-$, 
% \begin{align}
%         &\frac{\partial \mathcal{L}(\lambda)}{\partial \lambda} \mid_{\lambda=1^-}\notag\\
%         &= \int_x\left(Q_1-Q_2\right)\left\|\nabla \log Q_1- \nabla \log P_{\infty}\right\|^2 d x+2 \int_x Q_1 \nabla\left(\frac{Q_1-Q_2}{Q_1}\right)\left(\nabla \log Q_1-\nabla \log P_{\infty}\right) dx\\
%         &= \mathbb{D}_{\texttt{F}}\left(Q_1 \| P_{\infty}\right)-\int_x\underbrace{ Q_2\left\|\nabla\log Q_1-\nabla \log P_{\infty}\right\|^2}_{\text{term 1}}+\underbrace{2 Q_1 \nabla\left(\frac{Q_1-Q_2}{Q_1}\right)\left(\nabla \log Q_1-\nabla\log P_{\infty}\right)}_{\text{term 2}}dx
%         \label{eq:diff}
%         % &= \mathbb{D}_{\texttt{F}}\left(P_1 \| P_{\infty}\right)- \mathbb{D}_{\texttt{F}}\left(P_2 \| P_{\infty}\right)+ \mathbb{D}_{\texttt{F}}\left(P_2\|P_{\infty}\right)-\int_x P_2\left\|\nabla \log P_1- \log P_{\infty}\right\|^2 dx\\
%         % &+2 \int P_1 \nabla\left(1-\frac{P_2}{P_1}\right)\left(\nabla \log P_1-\nabla \log P_{\infty}\right) d x
% \end{align}
% For term 1, we have 
% \begin{align}
%     &\int_x Q_2 \| \nabla \log Q_1-\nabla\log Q_2\|^2 d x\notag \\
%     &=\int_x Q_2\left\|\nabla\log Q_1-\nabla\log Q_2+\nabla\log Q_2-\nabla\log P_{\infty}\right\|^2 d x\\
%     &=\mathbb{D}_{\texttt{F}}\left(Q_2 \| Q_1\right)+\mathbb{D}_{\texttt{F}}\left(Q_2 \| P_{\infty}\right)+\int_x 2 Q_2\left(\nabla \log Q_1-\nabla \log Q_2\right)\left(\nabla \log Q_2-\nabla \log P_{\infty}\right) d x\\
%     &=-\mathbb{D}_{\texttt{F}}\left(Q_2 \| Q_1\right)+\mathbb{D}_{\texttt{F}}\left(Q_2 \| P_{\infty}\right)+\int_x 2 Q_2\left(\nabla \log Q_1-\nabla \log Q_2\right)\left(\nabla \log Q_1-\nabla \log P_{\infty}\right) d x\label{eq:term1}
% \end{align}
% For term 2, we note that 
% \begin{align}
%     \nabla\left( \frac{Q_1-Q_2}{Q_1}\right)&=-\nabla\left(\frac{Q_2}{Q_1}\right)\\
%     &=-\frac{\left(\nabla Q_2 ) Q_1-\left(\nabla Q_1\right) Q_2\right.}{Q_1^2}\\
%     &=-\frac{\nabla Q_2}{Q_1}+\frac{(\nabla Q_1) Q_2}{Q_1^2}\\
%     &=\frac{Q_2}{Q_1}\left(\nabla\log Q_1-\nabla\log Q_2\label{eq:trick}\right)
% \end{align}
% Plugging Equation~(\ref{eq:trick}) into term 2, 
% \begin{align}
%     2 Q_1 \nabla\left(\frac{Q_1-Q_2}{Q_1}\right)\left(\nabla \log Q_1-\nabla\log P_{\infty}\right)=2Q_2\left(\nabla \log Q_1-\nabla\log Q_2\right)\left(\nabla \log Q_1-\nabla\log P_{\infty}\right)
%     \label{eq:term2}
% \end{align}
% Plugging Equations~(\ref{eq:term1}) and~(\ref{eq:term2}) into Equation~(\ref{eq:diff}), \begin{align}
%     \frac{\partial \mathcal{L}(\lambda)}{\partial \lambda} \mid_{\lambda=1^-}=\mathbb{D}_{\texttt{F}}\left(Q_1 \| P_{\infty}\right)+\mathbb{D}_{\texttt{F}}\left(Q_2 \| Q_1\right)-\mathbb{D}_{\texttt{F}}\left(Q_2 \| P_{\infty}\right)
% \end{align}
% The results follows since $\frac{\partial \mathcal{L}(1^-)}{\partial \lambda}\le 0$
% \end{proof}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

% \subsection{Proof of Lemma~\ref{lemma: drifts}  }
% \begin{proof}
% Under some mild regularity conditions, \cite{hyvarinen2005estimation} proved that
% \begin{align*}
%     \mathbb{D}_{\texttt{F}} (P \| Q) =\mathbb{E}_{X\sim P} \left[\frac{1}{2}\left \| \nabla_{X} \log p(X) \right \|_2^2 + \mathcal{S}_{\texttt{H}}( X, Q)\right].
% \end{align*}
% Let $C(P)$ denote $\mathbb{E}_{X\sim P} \left[\frac{1}{2}\left \| \nabla_{X} \log p(X) \right \|_2^2\right]$ for any $P\in \mathcal{P}$, then 
% \begin{equation*}
%      \mathbb{E}_{\infty}[\mathcal{S}_{\texttt{H}}(X, P_{\infty})-\mathcal{S}_{\texttt{H}}(X, Q_1)]=\mathbb{D}_{\texttt{F}} (P_{\infty} \| P_{\infty})-C(P_{\infty})-\mathbb{D}_{\texttt{F}} (P_{\infty} \| Q_1)+C(P_{\infty})=-\mathbb{D}_{\texttt{F}} (P_{\infty} \| Q_1),
% \end{equation*}
% and 
% \begin{equation*}
%      \mathbb{E}_{1}[\mathcal{S}_{\texttt{H}}(X, P_{\infty})-\mathcal{S}_{\texttt{H}}(X, Q_1)]=\mathbb{D}_{\texttt{F}} (P_1 \| P_{\infty})-C(P_{1})-\mathbb{D}_{\texttt{F}} (P_1 \| Q_1)+C(P_{1}) \ge \mathbb{D}_{\texttt{F}} (Q_1 \| P_{\infty}),
% \end{equation*}
% where we applied Lemma \ref{lemma:tech}.

% Since $\lambda>0$, the results follow.
% \end{proof}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\subsection{Proof of Lemma 4.3}
\begin{proof}
Define the function $\lambda \mapsto h(\lambda)$ given by $$h(\lambda)\de\mathbb{E}_{\infty}[\exp (z_{\lambda}(X))]-1.$$ Observe that \begin{equation*}
  h^{\prime}(\lambda)\de \frac{d h}{d\lambda}(\lambda)=\mathbb{E}_{\infty}[(S_{\texttt{H}}(X,P_{\infty})-S_{\texttt{H}}(X,Q_{1}))\exp (z_{\lambda}(X))].
\end{equation*}
Note that $h(0)=0$, and $h^{\prime}(0)=-\mathbb{D}_{\texttt{F}}(P_{\infty}\|Q_1)<0$. 
% Thus, there exists $\lambda>0$ such that $h(\lambda)< 0$, and the condition (see Inequality (10) in the main paper) is satisfied.
Next, we prove that either 1) there exists $\lambda^{\star} \in (0,\infty)$ such that $h(\lambda^{\star}) = 0$, or 2) for all $\lambda>0$ we have $h(\lambda)<0$. 

Observe that
\begin{equation*}
    h''(\lambda)\de\frac{d^2 h}{d \lambda^2}(\lambda)
    =\mathbb{E}_{\infty}[(S_{\texttt{H}}(X,P_{\infty})-S_{\texttt{H}}(X,Q_{1}))^2\exp (z_{\lambda}(X))]\geq 0.
\end{equation*}
We claim that $h(\lambda)$ is \textit{strictly convex}, namely $h''(\lambda) > 0$ for all $\lambda\in [0,\infty)$. Suppose $h''(\lambda) = 0$ for some $\lambda \geq 0$, we must have $S_{\texttt{H}}(X,P_{\infty})-S_{\texttt{H}}(X,Q_1) = 0$ almost surely.  This implies that 
$\mathbb{E}_{\infty}[(S_{\texttt{H}}(X,P_{\infty})-S_{\texttt{H}}(X,Q_1))]
= 0$ which in turn gives $-\mathbb{D}_{\texttt{F}}(P_{\infty}\|Q_1) =0$ and $P_{\infty} = Q_1$ almost everywhere, leading to a contradiction to the assumption $P_{\infty}\notin \mathcal{G}_1$. Thus, $h(\lambda)$ is \textit{strictly convex} and $h^{\prime}(\lambda)$ is \textit{strictly increasing}. 

Here, we recognize two cases: either 1) $h(\lambda)$ attains a (unique) global minimum in $(0, \infty)$, or 2) it is strictly decreasing in $[0,\infty)$. We will show that the second case is degenerate and of no practical interest.
\begin{itemize}
\item \textbf{Case 1:} If the global minimum of $h(\lambda)$ is attained at $a \in (0, \infty)$, then $h^{\prime}(a) = 0$. Since $h^{\prime}(0) < 0$ and $h(0) = 0$, the global minimum $h(a)<0$. 
Since $h^{\prime}(\lambda)$ is \textit{strictly increasing}, we can choose $b > a$ and conclude that $h^{\prime}(\lambda) > h^{\prime}(b) > h^{\prime}(a) = 0$ for all $\lambda > b$. It follows that $\lim_{\lambda \rightarrow \infty} h(\lambda) = +\infty$. Combining this with the continuity of $h(\lambda)$, we conclude that $h(\lambda^*) = 0$ for some $\lambda^* \in (0, \infty)$ and any value of $\lambda \in (0, \lambda^*]$ satisfies Inequality~(10).

Note that in this case, we must have $P_{\infty}\left(S_{\texttt{H}}(X,P_{\infty})-S_{\texttt{H}}(X,Q_1) \ge c\right)>0$, for some $c>0$. Otherwise, we have $P_{\infty}\left(S_{\texttt{H}}(X,P_{\infty})-S_{\texttt{H}}(X,Q_1) \le 0\right)=1$. This implies that $P_{\infty}(z_{\lambda}(X)\le 0)=1$, or equivalently $\mathbb{E}_{\infty}[\exp (z_{\lambda}(X))]< 1$ for all $\lambda > 0$, and therefore leads to Case 2: $h(\lambda)< 0$ for all $\lambda > 0$. Here, $\mathbb{E}_{\infty}[\exp (z_{\lambda}(X))]\neq 1$ since $P_{\infty}(S_{\texttt{H}}(X,P_{\infty})-S_{\texttt{H}}(X,Q_1)=0)<1$; otherwise $P_{\infty}(S_{\texttt{H}}(X,P_{\infty})-S_{\texttt{H}}(X,Q_1)=0)=1$, and then $\mathbb{E}_{\infty}[S_{\texttt{H}}(X,P_{\infty})-S_{\texttt{H}}(X,Q_1)]=-\mathbb{D}_{\texttt{F}}(P_{\infty}\|Q_1)=0$, causing the same contradiction to $P_{\infty}\notin\mathcal{G}_1$.

\item \textbf{Case 2:} If $h(\lambda)$ is strictly decreasing in $(0, \infty)$, then any $\lambda \in (0, \infty)$ satisfies Inequality~(10). As discussed before, in this case, we must have $P_{\infty}\left(S_{\texttt{H}}(X, P_{\infty})-S_{\texttt{H}}(X, Q_1) \le 0\right)=1$. Equivalently, all the increments of the RSCUSUM detection score are non-positive under the pre-change distribution, and $P_{\infty}(Z(n)=0)=1$ for all $n$. Accordingly, $\mathbb{E}_{\infty}[T_{\texttt{RSCUSUM}}]=+\infty$. When a change occurs (under measure $Q_1$), we also observe that RSCUSUM can get close to detecting the change point instantaneously as $\lambda$ is chosen arbitrarily large. Obviously, this case is of no practical interest.
\end{itemize}

\end{proof}

\subsection{Proof of Theorem 4.4}
\begin{proof}
We follow the proof of \citet[Theorem~4]{lai1998information} to conclude the result of Theorem 4.4. A constructed martingale and Doob's submartingale inequality~\citep{doob1953stochastic} are combined to finish the proof. 
\begin{enumerate}
    \item We first construct a non-negative martingale with mean $1$ under the measure $P_{\infty}$. Define a new instantaneous score function $X \mapsto \tilde{z}_{\lambda}(X)$ given by 
\begin{equation*}
    \label{eq:new_instant_z_lambad}
    \tilde{z}_{\lambda}(X)\de z_{\lambda}(X)+\delta,
\end{equation*}
where $$\delta \de -\log \biggl(\mathbb{E}_{\infty}\left[\exp (z_{\lambda}(X))\right]\biggr).$$ Further define the sequence $$\tilde{G}_n\de \exp \biggl(\sum_{k=1}^n\tilde{z}_{\lambda}(X_k)\biggr),\; \forall n\geq 1.$$ 

Suppose $X_1, X_2, \ldots$ are i.i.d.\ according to $P_{\infty}$ (no change occurs). Then,
\begin{align*}
\mathbb{E}_{\infty}\left[\tilde{G}_{n+1}\mid \mathcal{F}_n\right] = \tilde{G}_n\mathbb{E}_{\infty}[\exp(\tilde{z}_{\lambda}(X_{n+1}))]=\tilde{G}_{n}e^{\delta}\mathbb{E}_{\infty}[\exp(z_{\lambda}(X_{n+1}))]=\tilde{G}_{n},
\end{align*}
% Note that $\delta = -\log \left(\mathbb{E}_{\infty}\left[\exp \left(z_\lambda(X_{n+1})\right)\right]\right)$
and
\begin{align*}
    \mathbb{E}_{\infty}[\tilde{G}_n] &= \mathbb{E}_{\infty}\left[\exp\left(\sum_{i=1}^{n}(z_{\lambda}(X_i)+\delta)\right)\right]= e^{n\delta} \prod_{i=1}^n\mathbb{E}_{\infty}[\exp(z_{\lambda}(X_i))]=1.
\end{align*}
Thus, under the measure $P_{\infty}$, $\{\tilde{G}_n\}_{n\geq 1}$ is a non-negative martingale with the mean $\mathbb{E}_{\infty}[\tilde{G}_1]=1$. 

\item We next examine the new stopping rule 
\begin{equation*}
    \tilde{T}_{\texttt{RSCUSUM}} = \inf \left\{n\geq 1: \max_{1\leq k\leq n} \sum_{i=k}^n \tilde{z}_{\lambda}(X_i)\geq \tau \right\},
\end{equation*}
where $\tilde{z}_{\lambda}(X_i) = z_{\lambda}(X_i)+\delta$. By Inequality~(10), we observe that $\delta\geq 0$. By Jensen's inequality,
\begin{equation}
\label{eq:jensen}
    \mathbb{E}_{\infty}[\exp(z_{\lambda}(X))]\geq \exp\left(\mathbb{E}_{\infty}[z_{\lambda}(X)]\right),
\end{equation}
with equality if and only if $z_{\lambda}(X)=c$ almost surely, where $c$ is some constant. Suppose that equality holds in~\eqref{eq:jensen}. Then
\begin{align*}
    -\lambda \mathbb{D}_{\texttt{F}}(P_{\infty}\|Q_1)&=\mathbb{E}_{\infty}[z_{\lambda}(X)]=c=\mathbb{E}_{1}[z_{\lambda}(X)]=\lambda \mathbb{D}_{\texttt{F}}(Q_1\|P_{\infty}).
\end{align*}
It follows that $0\leq \mathbb{D}_{\texttt{F}}(Q_1\|P_{\infty})=-\mathbb{D}_{\texttt{F}}(P_{\infty}\|Q_1)\leq 0$, which implies that $P_{\infty}=Q_1$ almost everywhere. This contradicts the assumption $P_{\infty}\notin \mathcal{G}_1$. Thus, the inequality in~\eqref{eq:jensen} is \textit{strict}, and therefore $\delta<\lambda\mathbb{D}_{\texttt{F}}(P_{\infty}\|Q_1)$. Hence, $\tilde{T}_{\texttt{RSCUSUM}}$ is not trivial.

Define a sequence of stopping times: 
\begin{align*}
    &\eta_0 = 0,\\
    &\eta_1 = \inf \left\{t:\sum_{i=1}^t \tilde{z}_{\lambda}(X_i)<0\right\},\\
    &\eta_{k+1} = \inf \left\{t>\eta_k:\sum_{i=\eta_k+1}^t \tilde{z}_{\lambda}(X_i)<0\right\}, \; \text{for}\;  k\geq 1.
\end{align*}
By previous discussion, $\{\tilde{G}_n\}_{n\geq 1}$ is a nonnegative martingale under $P_{\infty}$ with mean 1. Then, for any $k$ and on $\{\eta_k<\infty\}$,
\begin{equation}
\label{eq:doobs}
P_{\infty}\left(\sum_{i=\eta_k+1}^n\tilde{z}_{\lambda}(X_i)\geq \tau \;\text{for some}\;  n>\eta_k \mid \mathcal{F}_{\eta_k} \right) \leq e^{-\tau},
\end{equation}
by Doob's submartingale inequality~\citep{doob1953stochastic}. Let
\begin{equation}
\label{eq:defm}
    M \de \inf \biggl\{k\geq 0: \eta_k<\infty \;\text{and} \; \sum_{i=\eta_k+1}^n\tilde{z}_{\lambda}(X_i)\geq \tau \; \text{for some}\; n>\eta_k\biggr\}.
\end{equation}
Combining Inequality~(\ref{eq:doobs}) and Definition~(\ref{eq:defm}),
\begin{align}
\label{eq:eq2}
    P_{\infty}(M\geq k+1\mid\mathcal{F}_{\eta_k})= 1-P_{\infty}\left(\sum_{i=\eta_k+1}^n\tilde{z}_{\lambda}(X_i)\geq \tau  \;\text{for some} \; n>\eta_k\mid \mathcal{F}_{\eta_k}\right)\geq 1-e^{-\tau},
\end{align}
and
\begin{equation}
\label{eq:eq1}
    P_{\infty}(M> k)= \mathbb{E}_{\infty} [P_{\infty}(M\geq k+1\mid\mathcal{F}_{\eta_k})\mathbb{I}_{\{M\geq k\}}]=\mathbb{E}_{\infty}[P_{\infty}(M\geq k+1\mid\mathcal{F}_{\eta_k})]P_{\infty}(M> k-1).
\end{equation}
Combining Equations~(\ref{eq:eq1}) and (\ref{eq:eq2}), 
\begin{align*}
    \mathbb{E}_{\infty}[M] = \sum_{k=0}^{\infty}P_{\infty}(M> k)\geq \sum_{k=0}^{\infty}(1-e^{-\tau})^{k}= e^{\tau}.
\end{align*}

Observe that
\begin{align*}
    \tilde{T}_{\texttt{RSCUSUM}}&=\inf \biggl\{n\geq 1:\sum_{i=\eta_k+1}^n\tilde{z}_{\lambda}(X_i)\geq \tau \; \text{for some}\; \eta_k<n \biggr\}\geq M,
\end{align*}
and $\tilde{T}_{\texttt{RSCUSUM}}\leq T_{\texttt{RSCUSUM}}$. We conclude that
$\mathbb{E}_{\infty}[T_{\texttt{RSCUSUM}}]\geq \mathbb{E}_{\infty}[\tilde{T}_{\texttt{RSCUSUM}}]\geq \mathbb{E}_{\infty}[M]\geq e^{\tau}$.
\end{enumerate}
\end{proof}

\subsection{Proof of Theorem 4.5}
We first introduce a technical definition in order to apply~\citet[Corollary 2.2]{woodroofe1982nonlinear} in the proof of Theorem 4.5.
\begin{definition}
A distribution $P$ on the Borel sets of $(-\infty, \infty)$ is said to be \textit{arithmetic} if and only if it concentrates on a set of points of the form $\pm nd$, where $d>0$ and $n=1, 2, \ldots$.
\end{definition}
\begin{remark}
    Any probability measure that is absolutely continuous with respect to the Lebesgue measure is non-arithmetic.
\end{remark}
\begin{proof}
Consider the random walk that is defined by 
\begin{equation*}
    Z^{\prime}(n) = \sum_{i=1}^nz_{\lambda}(X_i), \; \text{for}\; n\geq 1.
\end{equation*}
We examine another stopping time that is given by
\begin{equation*}
     T_{\texttt{RSCUSUM}}^{\prime} \de \inf \{n\geq 1: Z^{\prime}(n) \geq \tau\}.
\end{equation*}
Next, for any $\tau$, define $R_{\tau}$ on $\{T_{\texttt{RSCUSUM}}^{\prime} <\infty\}$ by 
\begin{equation*}
    R_{\tau} \de Z^{\prime}(T_{\texttt{RSCUSUM}}^{\prime}) -\tau.
\end{equation*}
$R_{\tau}$ is the excess of the random walk over a stopping threshold $\tau$ at the stopping time $T_{\texttt{RSCUSUM}}^{\prime}$.
Suppose the change point is $\nu =1$; then $X_1, X_2,\ldots$ are i.i.d.\ following the post-change distribution $P_1$. Let $\mu$ and $\sigma^2$ respectively denote the mean $\mathbb{E}_{1}[z_{\lambda}(X)]$ and the variance $\text{Var}_1[z_{\lambda}(X)]$. Note that 
\begin{equation*}
    \mu =\mathbb{E}_{1}[z_{\lambda}(X)]= \lambda(\mathbb{D}_{\texttt{F}}(P_1\|P_{\infty})-\mathbb{D}_{\texttt{F}}(P_1\|Q_{1}))>0,
\end{equation*}
and \begin{equation*}
    \sigma^2 = \text{Var}_1[z_{\lambda}(X)] = \mathbb{E}_1[z_{\lambda}(X)^2]-\left(\lambda(\mathbb{D}_{\texttt{F}}(P_1\|P_{\infty})-\mathbb{D}_{\texttt{F}}(P_1\|Q_{1}))\right)^2.
\end{equation*}
Under the mild regularity conditions
%\footnote{It assumes that the Fisher divergences and the Fisher entropies of the underlying distributions are finite, which are mild conditions for having well-defined Hyv\"arinen scores and for the empirical score matching procedure to work.}
given by \cite{hyvarinen2005estimation},
\begin{align*}
&\mathbb{E}_{1}[\mathcal{S}_{\texttt{H}}(X, P_{\infty})^2] < \infty,\;\text{and} \\
&\mathbb{E}_{1}[\mathcal{S}_{\texttt{H}}(X, Q_1)^2] < \infty.
\end{align*}
It implies that $\mathbb{E}_1[z_{\lambda}(X)^2]<\infty$ if $\lambda$ is chosen appropriately, e.g., $\lambda$ satisfies Inequality~(14) and is not arbitrarily large. 
Therefore, by \citet[Theorem 1]{lorden1970excess},
\begin{equation*}
    \sup_{\tau \geq 0}\mathbb{E}_1[R_{\tau}]\leq \frac{\mathbb{E}_1[(z_{\lambda}(X)^{+})^2]}{\mathbb{E}_1[z_{\lambda}(X)]}\leq \frac{\mu^2+\sigma^2}{\mu},
\end{equation*}
where $z_{\lambda}(X)^{+} = \max (z_{\lambda}(X), 0)$.
Additionally,  $Q_1$ must be non-arithmetic in order to have Hyv\"arinen scores well-defined. Hence, by \citet[Corollary 2.2]{woodroofe1982nonlinear},
\begin{equation*}
    \mathbb{E}_{1}[T^{\prime}_{\texttt{RSCUSUM}}]=\frac{\tau}{\mu}+\frac{\mathbb{E}_1[{R_{\tau}}]}{\mu}\leq \frac{\tau}{\mu}+\frac{\mu^2+\sigma^2}{\mu^2},\;\forall \tau \geq 0.
\end{equation*}
Observe that for any $n$, $Z^{\prime}(n)\leq Z(n)$, and therefore $T_{\texttt{RSCUSUM}} \leq T_{\texttt{RSCUSUM}}^{\prime}$. Thus, 
\begin{equation}
\label{eq:cadd_result}
    \mathbb{E}_{1}[T_{\texttt{RSCUSUM}}]\leq \mathbb{E}_{1}[T_{\texttt{RSCUSUM}}^{\prime}]\leq \frac{\tau}{\mu}+\frac{\mu^2+\sigma^2}{\mu^2},\;\forall \tau \geq 0.
\end{equation}
By Theorem 4.4, we select $\tau = \log \gamma $ to satisfy the constraint $\mathbb{E}_{\infty}[T_{\texttt{RSCUSUM}}]\geq\gamma>0$. Plugging it back into Equation~(\ref{eq:cadd_result}), we conclude that, as $\gamma \to \infty$,
\begin{equation*}
    \mathbb{E}_{1}[T_{\texttt{RSCUSUM}}] \sim \frac{\log \gamma}{\mu}=\frac{\log \gamma}{\lambda(\mathbb{D}_{\texttt{F}}(P_1\|P_{\infty})-\mathbb{D}_{\texttt{F}}(P_1\|Q_{1}))},
\end{equation*}
to complete the proof.


Due to the stopping scheme of RSCUSUM, the expected time $\mathbb{E}_{\nu}[T_{\texttt{RSCUSUM}}-\nu\mid T_{\texttt{RSCUSUM}}\geq \nu]$ is independent of the change point $\nu$ (this is obvious, and the same property for CUSUM has been shown by~\cite{xie2021sequential}). Letting $\nu = 1$, we have \begin{equation*}
    \mathcal{L}_{\texttt{CADD}}(T_{\texttt{RSCUSUM}}) = \mathbb{E}_{1}[T_{\texttt{RSCUSUM}}]-1.
\end{equation*}
Thus, we conclude that 
\begin{equation*}
    \mathcal{L}_{\texttt{CADD}}(T_{\texttt{RSCUSUM}})\sim \frac{\log \gamma}{\lambda (\mathbb{D}_{\texttt{F}}(P_1\|P_{\infty})-\mathbb{D}_{\texttt{F}}(P_1\|Q_{1}))}.
\end{equation*}
Similar arguments apply to $\mathcal{L}_{\texttt{WADD}}(T_{\texttt{RSCUSUM}})$.
\end{proof}

\subsection{Selection of Appropriate Multiplier}

It is worth noting that our core results hold for a pre-selected $\lambda$ that satisfies the condition discussed in Lemma 4.3. The effect of choosing any other $\lambda^{\prime}$ amounts to scaling all the increments of RSCUSUM by a constant factor of $\lambda^{\prime}/ \lambda$. This means that all of these results still hold after adjusting for this scale factor. For instance, the result of Theorem 4.4 can be modified to be written as 
$$
\mathbb{E}_{\infty}[T_{\texttt{RSCUSUM}}]\geq \exp \left\{\frac{\lambda  \tau}{\max(\lambda, \lambda^{\prime})}\right\},
$$ 
for any $\lambda^{\prime} > 0$. It is easy to see that this scaling will change the statement of Theorem 4.5 accordingly to 
$$
\mathbb{E}_{1}[T_{\texttt{RSCUSUM}}]\sim \frac{\max(\lambda, \lambda^{\prime})}{\lambda }\frac{\log \gamma}{\lambda^{\prime}(\mathbb{D}_{\texttt{F}}(P_1\|P_{\infty})-\mathbb{D}_{\texttt{F}}(P_1\|Q_{1}))},
$$ 
as $\gamma \to \infty$. In order to have the strongest results in Theorems 4.4 and 4.5, we must choose $\lambda$ as close to $\lambda^*$ as possible.

\section{Experimental Details}
\subsection{Synthetic Dataset}
We consider the parametric family $\mathcal{P} = \{G_{\theta}:\;\theta\in\Theta\}$, and a set of basis elements $\mathcal{P}_m=\{P_1,\ldots, P_m\}$ with $P_i\in \mathcal{P}$ for all $i$. We set $m=4$ for synthetic simulations. The uncertainty classes of the post-change and pre-change distributions are respectively given by
\begin{align*}
    &\mathcal{G}_1= \left\{\sum_{i=1}^m \alpha_iP_i:\; \sum_{i=1}^m \alpha_i = 1, \;\forall \;\alpha_i\geq 0\right\},\nonumber\\
    &\mathcal{G}_{\infty} = \{P_{\infty}:\: P_{\infty}\in\mathcal{P},\;P_{\infty}\notin\mathcal{G}_1\}.
\end{align*}

\paragraph{Multivariate Normal Distribution (MVN)} Let $\boldsymbol{\mu}$ and $V$ respectively denote the mean and the covariance matrix. The corresponding score function is calculated by
\begin{equation*}
    S_{\texttt{H}}(X, P) = \frac{1}{2}(X-\boldsymbol{\mu})^{T}V^{-2}(X-\boldsymbol{\mu})-\operatorname{tr}(V^{-1}),
\end{equation*}
where the operator $\operatorname{tr}(\cdot)$ takes the trace of a matrix.

For the scenario of MVN$_m$, we assume that the covariance matrix $V$ is the same for every distribution in the parametric family. The pre-change distribution is $P_{\infty}=\mathcal{N}(\boldsymbol{\mu}_{\star}, V_{\star})$, where \begin{align*}
    \boldsymbol{\mu}_{\star}=(0,0),\quad\text{and}\quad V_{\star} = \left(\begin{matrix}
    1, &0.5\\
    0.5, &1
\end{matrix}\right).
\end{align*}
The set $\mathcal{P}_m=\{\mathcal{N}(\boldsymbol{\mu}_j, V_j), \;j=1,\ldots, m\}$, where 
\begin{align*}
    \boldsymbol{\mu}_j=(\epsilon_j,\epsilon_j),\quad\text{and}\quad V_{j} = \left(\begin{matrix}
    1, &0.5\\
    0.5, &1
\end{matrix}\right).
\end{align*}
We take the value of $\epsilon_1$ ($\epsilon_j$, $j=2,3,4$ respectively) as $0.5$ ($0.6, 0.8, 1.0$ respectively) for $P_1$ ($P_j$, $j=2,3,4$ respectively).

For the scenario of MVN$_c$, we consider both the mean and covariance matrix as the parameter. Again, we consider the pre-change distribution $P_{\infty}=\mathcal{N}(\boldsymbol{\mu}_{\star}, V_{\star})$, and the set $\mathcal{P}_m=\{\mathcal{N}(\boldsymbol{\mu}_j, V_j), \;j=1,\ldots, m\}$. Here,  
\begin{align*}
    \boldsymbol{\mu}_j=(\epsilon_j,\epsilon_j),\quad\text{and}\quad V_{j} = \left(\begin{matrix}
    1, &0.5\\
    0.5, &1
\end{matrix}\right)\circ\exp(\delta_j),
\end{align*}
where $\circ$ denotes the element-wise product and $\delta_j$ denotes the element-wise perturbation of the covariance matrix. We take the value of $\delta_1$ ($\delta_j$, $j=2,3,4$ respectively) as $0.1$ ($0.2, 0.8, 1.0$ respectively) for $P_1$ ($P_j$, $j=2,3,4$ respectively). To make the perturbed covariance matrix positive-definite, we perturb the log of each component of the covariance matrix.

\begin{table}[ht]
\centering
\caption{EDD versus ARL for RSCUSUM and RCUSUM on Multivariate Gaussian Case}
\begin{tabular}{ c| c c c c c c c }
\toprule
Perturbation/ARL &  & 100 & 200 & 400 & 800 & 1500 & 3000 \\
\hline
\multirow{2}{*}{0.5} & RSCUSUM & 11.2552 & 12.6664 & 16.9057 & 20.3400 & 22.7026 & 27.3190 \\
& RCUSUM & 11.4017 & 12.8748 & 16.8437 & 20.2776 & 22.6781 & 27.2831 \\
\hline
\multirow{2}{*}{0.6} & RSCUSUM & 8.5636 & 9.5218 & 13.1102 & 15.2747 & 16.5815 & 19.8648 \\
& RCUSUM & 8.6460 & 9.5817 & 12.9797 & 15.2196 & 16.5526 & 19.7900 \\
\hline
\multirow{2}{*}{1} & RSCUSUM & 4.0894 & 4.5327 & 6.0542 & 7.1984 & 7.8237 & 9.4318 \\
& RCUSUM & 4.1259 & 4.5658 & 6.0447 & 7.1551 &7.8026 & 9.3947 \\
\hline
\multirow{2}{*}{2} & RSCUSUM & 1.4053 & 1.6268 & 2.2620 & 2.7546 & 3.0592 & 3.6752 \\
& RCUSUM & 1.4290 & 1.6393 & 2.2516 & 2.7393 & 3.0481 & 3.6684 \\
\bottomrule
\end{tabular}
\end{table}

\paragraph{Exponential Family (EXP)} We consider the Exponential family with the associated PDF given by
\begin{align}
    p_{\theta}(X) =\frac{1}{Z_{\tau}} \exp\left\{-\tau\left(\sum_{i=1}^d(x_i-\mu)^4+\sum_{1\leq i\leq d, i\leq j\leq d}(x_i-\mu)^2(x_j-\mu)^2\right)\right\},\nonumber
\end{align}
where $\theta = (\tau, \mu)$.
The associated Hyv\"arinen score function is calculated by
\begin{equation*}
    S_{\texttt{H}}(X, P_{\theta}) = \frac{1}{2}\sum_{i=1}^d \left(\frac{\partial}{\partial x_i}\log P_{\theta}(X)\right)^2+\sum_{i=1}^d\frac{\partial^2}{\partial x_i^2}\log P_{\theta}(X),
\end{equation*}
where 
\begin{align*}
    \frac{\partial}{\partial x_i}\log P_{\theta}(X) &= -\tau \left(4(x_i-\mu)^3+2\sum_{1\leq i\leq d, i\leq j\leq d}(x_i-\mu)(x_j-\mu)^2\right), \;\text{and}\\
    \frac{\partial^2}{\partial x_i^2}\log P_{\theta}(X)&=-\tau \left (12(x_i-\mu)^2+2\sum_{1\leq i\leq d, i\leq j\leq d}(x_j-\mu)^2\right).
\end{align*}
We consider the pre-change distribution $P_{\infty}$ with $\tau_{\star} =1$ and $\mu_*=0$. The post-change distribution basis elements are constructed with $\tau =\tau_*+\epsilon_{j}$ and $\mu=\mu_*+\delta_j$. Here, $\epsilon_{j}$ ($\delta_j$ respectively) denotes the perturbations of the scale parameter $\tau$ (the location parameter $\mu$ respectively) for each $P_j$, $j=1,2,3,4$. We take values of $\epsilon_{j}$ as $1.0, 2.0, 8.0, 10.0$, and values of $\delta_j$ as $0.01, 0.02, 0.08, 0.1$.

\paragraph{Gauss-Bernoulli Restricted Boltzmann Machine (RBM)} As introduced in the main paper, we consider the RBM model with the PDF given by $p_{\theta}(X)= \sum_{H\in \{0,1\}^{d_h}}p_{\theta}(X, H) = \frac{1}{Z_{\theta}}\exp\{-F_{\theta}(X)\}$, where $F_{\theta}(X)$ is the free energy given by 
\begin{equation*}
    F_{\theta}(X) = \frac{1}{2}\sum_{i=1}^{d_x} (x_{i}-b_i)^{2}
    -\sum_{j=1}^{d_h} \operatorname{Softplus}\left(\sum_{i=1}^{d_x} W_{i j}x_{i}+c_{j}\right).
\end{equation*}
We compute the corresponding Hyv\"arinen score in a closed form
\begin{equation*}
    S_{\texttt{H}}(X, P_{\theta})= \sum_{i=1}^{d_x}\left[\frac{1}{2}\left(x_{i}-b_{i}-\sum_{j=1}^{d_h} W_{ij} \phi_{j}\right)^2+\sum_{j=1}^{d_h} W_{i j}^{2} \phi_{j}\left(1-\phi_{j}\right)-1\right],
\end{equation*}
where $\phi_{j} \de \operatorname{Sigmoid}(\sum_{i=1}^{d_x} W_{i j}x_{i}+c_{j})$. The $\operatorname{Sigmoid}$ function is defined as $\operatorname{Sigmoid}(y) \de (1+\exp(-y))^{-1}$.

The pre-change distribution $P_{\infty}$ is with the parameters $\mathbf{W} = \mathbf{W}_*$, $\mathbf{b}=\mathbf{b}_*$, and $\mathbf{c}=\mathbf{c}_*$, where each component of $\mathbf{W}_*$, $\mathbf{b}_*$, and $\mathbf{c}_*$ is randomly drawn from the standard Normal distribution $\mathcal{N}(0,1)$. For the post-change distribution basis elements, we assign the parameters $\mathbf{W}_j=\mathbf{W}_*\oplus\epsilon_j$, $\mathbf{b}_j=\mathbf{b}_*$, and $\mathbf{c}_j=\mathbf{c}_*$. Here, we only consider shifts of weight matrix $\mathbf{W}$. We let $\epsilon_j$ take values from $0.001, 0.002, 0.008, 0.01$ for $P_j$, $j=1,2,3,4$.


\bibliography{wu_571}
\end{document}
