\documentclass[accepted]{uai2023} % for initial submission
% \documentclass[accepted]{uai2023} % after acceptance, for a revised
                                    % version; also before submission to
                                    % see how the non-anonymous paper
                                    % would look like
%% There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2023} % ptmx math instead of Computer
                                         % Modern (has noticable issues)
% \documentclass[mathfont=newtx]{uai2023} % newtx fonts (improves upon
                                          % ptmx; less tested, no support)
% NOTE: Only keep *one* line above as appropriate, as it will be replaced
%       automatically for papers to be published. Do not make any other
%       change above this note for an accepted version.

%% Choose your variant of English; be consistent
\usepackage[american]{babel}
\usepackage{xr}
% \usepackage[british]{babel}

%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
    \bibliographystyle{plainnat}

\setlength{\parindent}{0cm}
\usepackage{multicol}
\usepackage[utf8]{inputenc} % allow utf-8 input
\usepackage[T1]{fontenc}    % use 8-bit T1 fonts
\usepackage{hyperref}       % hyperlinks
\usepackage{url}            % simple URL typesetting
\usepackage{booktabs}       % professional-quality tables
\usepackage{amsfonts}       % blackboard math symbols
\usepackage{nicefrac}       % compact symbols for 1/2, etc.
\usepackage{microtype}      % microtypography
\usepackage{xcolor}   
\usepackage{amsmath}
\usepackage{amssymb}
\usepackage{mathtools}
\usepackage{amsthm}
\usepackage{algorithm}
\usepackage{algorithmic}
\usepackage{natbib}
\usepackage{balance}
% \twopartdef{<value 1>}{<condition 1>}{<value 2>}{<condition 2>}
% Typesets a two-branch piecewise definition (math mode only):
%     { <value 1>   if <condition 1>
%     { <value 2>   if <condition 2>
% The word "if" is inserted automatically in front of BOTH conditions, so do
% not pass "otherwise" as <condition 2> (it would print "if otherwise").
% \text (amsmath) is used for the words so they follow the surrounding text
% font and size; line ends are protected with % to keep the body space-free.
\newcommand{\twopartdef}[4]{%
	\left\{%
		\begin{array}{ll}
			#1 & \text{if } #2 \\
			#3 & \text{if } #4
		\end{array}%
	\right.%
}
\usepackage{subcaption}
\usepackage{multirow}
\usepackage{graphicx}
\usepackage{microtype}
\usepackage{xr}

% \blfootnote{<text>}: emits <text> as a footnote whose mark is blank.
\newcommand\blfootnote[1]{%
  % Keep the redefinition of \thefootnote local to this group.
  \begingroup
  % Blank the footnote mark, then emit the footnote itself.
  \renewcommand\thefootnote{}\footnote{#1}%
  % Roll the footnote counter back by one so this unmarked footnote does not
  % consume a number.
  \addtocounter{footnote}{-1}%
  \endgroup
}

\theoremstyle{plain}
\newtheorem{theorem}{Theorem}[section]
\newtheorem{proposition}[theorem]{Proposition}
\newtheorem{lemma}[theorem]{Lemma}
\newtheorem{corollary}[theorem]{Corollary}
\theoremstyle{definition}
\newtheorem{definition}[theorem]{Definition}
\newtheorem{assumption}[theorem]{Assumption}
\theoremstyle{remark}
\newtheorem{remark}[theorem]{Remark}
\newtheorem{notation}[theorem]{Notation}
% colors
\allowdisplaybreaks

\makeatletter
% \addFileDependency{<file name with extension>}
% Announces <file> in the log (and in the \listfiles list) so that build tools
% such as latexmk can pick it up as a dependency of this document.
% Every line of the body ends in % so that no stray space tokens are emitted.
\newcommand*{\addFileDependency}[1]{%
  % Print "(<file>)"; latexmk will find this if $recorder=0.
  % However, in that case, it will ignore the file if it is a .aux or .pdf
  % file etc. and it exists! (If it doesn't exist, it will appear in the list
  % of dependents regardless.)
  \typeout{(#1)}%
  % Add the file to the \listfiles output --- not really necessary, and
  % latexmk doesn't use this.
  \@addtofilelist{#1}%
  % latexmk will find this message if the file doesn't exist (yet).
  \IfFileExists{#1}{}{\typeout{No file #1.}}%
}
\makeatother

% \myexternaldocument{<basename>}: calls \externaldocument{<basename>} and
% additionally registers <basename>.tex and <basename>.aux through
% \addFileDependency.
\newcommand*{\myexternaldocument}[1]{%
\externaldocument{#1}%
\addFileDependency{#1.tex}%
\addFileDependency{#1.aux}%
}
\myexternaldocument{sankararaman_560}

\title{Online Heavy-tailed Change-point detection (Supplementary Materials)}
\date{}

\author[1]{{Abishek Sankararaman\footnote{here}}}
{\author[1]{{Balakrishnan (Murali) Narayanaswamy}}}
\affil[1]{%
AWS AI Labs
}

\begin{document}
\onecolumn 
\maketitle
\appendix 
\section{Change-point Localization}
\label{sec:localization}

% In this section we describe how to modify Algorithm \ref{algo:learn_model} to also output the estimate of the location of change in addition to just detecting the existence of a change. Recall that for every $r \in \mathbb{N}$, $\tau_r^{(\mathcal{A})} \in \mathbb{N} \cup \{ \infty\}$ is the stopping time denoting the $r$th time, Algorithm $\mathcal{A}$ detects a change point. We modify Algorithm \ref{algo:learn_model} by additionally outputting for every $r \in \mathbb{N}$, a time interval $[s_{r;1}^{(\mathcal{A})}, s_{2;r}^{(\mathcal{A})}] \subseteq [\tau_{r-1}^{(\mathcal{A})},\tau_{r}^{(\mathcal{A})}]$ such that this is an interval that contains a change-point $\tau_c$.

%  In order do so, we need an additional definition. For every $r < s < t$ and $\delta \in (0,1)$, denote by $\mathfrak{B}(r,s,t,\delta) \in \{0,1\}$ as the indicator variable that 
% \begin{align}
%     \mathfrak{B}(r,s,t,\delta) = \mathbf{1} \bigg(\|\widehat{\theta}_{r:s} - \widehat{\theta}_{s+1:t}\|_2^2 >  \mathcal{B}\left(s-r,\frac{\delta}{2(t-r)(t-r+1)}\right) +  \mathcal{B}\left(t-s-1, \frac{\delta}{2(t-r)(t-r+1)}\right) \bigg).
%     \label{eqn:defn_mathfrak_B}
% \end{align}

% The estimates of the location of change in a time-interval $[r,t]$ is all those time instants $s \in [r,t]$ such that $\mathfrak{B}(r,s,t,\delta)=1$. We formalize this in Algorithm \ref{algo:learn_model_local} below where the highlighted line shows the difference from Algorithm \ref{algo:learn_model}. The empirical performance of this method is shown in Figure \ref{fig:localization}.
	
	\begin{algorithm*}[htb]
%\DontPrintSemicolon
		\caption{{ Online {\ttfamily Clipped-SGD} Change Point Detection and Localization}}
		\label{algo:learn_model_local}

		\begin{algorithmic}[1]
				\STATE \textbf{Input}: {  $(\eta_t)_{t \geq 1}$,  $\lambda > 0$, $\theta_0 \in \Theta$, $\delta \in (0,1)$ FPR guarantee } 
				\STATE $r \gets 1$
				\STATE $\widehat{\theta}_{t,t-1} \gets \theta_0$, for all $t \geq 1$.
				\STATE Set $ \tau_c^{(0)} \gets 0$
				\STATE Set {\ttfamily Num-change-points }$ \gets 0$
			    \FOR {each time $t = 1, 2, \cdots , $}
			   \STATE Receive sample $X_t$ \\
			   \STATE   $\widehat{\theta}_{s,t} \gets \prod_{\Theta}(\widehat{\theta}_{s,t-1} - \eta_{t-s}\text{clip}(X_{t} - \widehat{\theta}_{s,t-1}, \lambda))$, for every $r \leq s \leq t$.
			    \IF {$\exists s \in (r,t)$ such that $\|\widehat{\theta}_{r:s} - \widehat{\theta}_{s+1:t}\|_2^2 > \mathcal{B}\left(s-r,\frac{\delta}{2(t-r)(t-r+1)}\right) + \mathcal{B}\left(t-s-1, \frac{\delta}{2(t-r)(t-r+1)}\right)$ \COMMENT {$\mathcal{B}(\cdot, \cdot)$ is defined in Equation (\ref{eqn:defn_B})}}
			    \STATE Set \textbf{Restart}$_t$ $\gets 1$ \COMMENT {Change point detected}
			    \STATE Set {\ttfamily Num-change-points }$ \gets ${\ttfamily Num-change-points } $+1$ \COMMENT {Increment number of change-points detected}
			    {\color{red}\STATE Output time interval $[\inf\{ s \in (r,t) \text{ s.t. } \mathfrak{B}(r,s,t,\delta)=1 \}, \sup \{ s \in (r,t) \text{ s.t. } \mathfrak{B}(r,s,t,\delta)=1\}]$ as the location of the change-point \COMMENT {$\mathfrak{B}()$ defined in Equation (\ref{eqn:defn_mathfrak_B})}}
			    \STATE $r \gets t+1$
			    %\ENDIF
			    \ELSE 
			    \STATE Set \textbf{Restart}$_t$ $\gets 0$
			    \ENDIF
			 
			    \ENDFOR 
			 %   \begin{align*}
			 %       \widehat{\theta} \in \arg\min_{\theta \in \Theta} \min_{\substack{S \subset \mathcal{B}, \\ \text{s.t.} |S| > \frac{3}{4}|\mathcal{B}|}} \frac{1}{|S|}\sum_{Y \in S}\mathcal{L}(Y,\theta).
			 %   \end{align*}
	

\end{algorithmic}
	\end{algorithm*}


% \begin{figure}
% \centering
% \begin{subfigure}{0.24\linewidth}
% \includegraphics[width=0.99\linewidth]{plots/refined_plots/pareto_one_d1_localization.pdf}
% \label{fig:local1}
% \caption{Pareto $\Delta=1$}
% \end{subfigure}
% \begin{subfigure}{0.24\linewidth}
% \includegraphics[width=0.99\linewidth]{plots/refined_plots/pareto_one_d32_local.pdf}
% \label{fig:local2}
% \caption{Pareto $d=32, \Delta=1$}
% \end{subfigure}
% \begin{subfigure}{0.24\linewidth}
% \includegraphics[width=0.99\linewidth]{plots/refined_plots/normal_one_d1_local.pdf}
% \label{fig:local3}
% \caption{Normal $\Delta=1$}
% \end{subfigure}
% \begin{subfigure}{0.24\linewidth}
% \includegraphics[width=0.99\linewidth]{plots/refined_plots/normal_one_d32_local.pdf}
% \label{fig:local4}
% \caption{Normal $d=32\Delta=1$}
% \end{subfigure}
% \caption{Plots showing that the estimates of the location of change-points given by Algorithm \ref{algo:learn_model_local} works well across a variety of settings.}
% \label{fig:localization}
% \end{figure}

\section{Proof for Robust Estimation in Theorem \ref{thm:main_mean_est}}
\label{sec:mean_estimation_proofs}

% {\color{red} SCRATCH PAD FOR $\eta_t := \frac{2}{m(t+\gamma)}$ and $\lambda = 2G$ and $\gamma = 320 (\lambda+1) \sigma(\sigma+1)\ln \left(\frac{2T}{\delta}\right))$ }
% \\






We follow the same proof architecture as \citet{tsai2022heavy}.


Fix a time $t \in \mathbb{N}$. We define a sequence of random variables $(\psi_t)_{t \geq 1}$ as follows.
\begin{align*}
    {\psi}_t := \text{clip}( (X_t- \widehat{\theta}_{t-1}), \lambda) - (\theta^* - \widehat{\theta}_{t-1}),
\end{align*}
% and by 
% \begin{align*}
%     \bar{\psi}_t := \text{clip}(\nabla {\mathcal{L}}(X_t, \widehat{\theta}_{t-1}), \lambda) - \nabla \mathcal{R}_t(\widehat{\theta}_{t-1}).
% \end{align*}
%These are random vectors since $Z_t \sim \mathbb{P}_{\theta}$ and $\widehat{\theta}_{t-1}$ is measurable with respect to the sigma algebra generated by $\sigma(Z_1, \cdots, Z_{t-1})$. 
% Clearly, for all times $t \geq 1$, on the event that $C_t = 0$, $\psi_t = \bar{\psi}_t$ holds almost-surely. Furthermore, from triangle inequality almost-surely for all time $t \geq 1$, we have 
% \begin{align}
%     \| \psi_t \|_2^2 \leq \|\bar{\psi}_t \|_2^2 + 2 \lambda^2 \mathbf{1}_{C_t \neq 0}.
%     \label{eqn:psi_psi_bar_norm_mean}
% \end{align}


% \subsubsection*{Expanding the one-step recursion}

Consider any time $t$. We have 
\begin{align}
    \| \theta_{t} - \theta^* \|_2^2 &=  \| \prod_{\Theta}(\widehat{\theta}_{t-1} - \eta_t \text{clip}(X_t - \widehat{\theta}_{t-1}, \lambda)) - \theta^*\|_2^2, \\
    &\stackrel{(a)}{\leq}  \| \widehat{\theta}_{t-1} - \eta_t \text{clip}(X_t - \widehat{\theta}_{t-1}, \lambda) - \theta^* \|_2^2, \\
    &= \|\widehat{\theta}_{t-1} - \eta_t({\psi}_t + (\theta^* - \widehat{\theta}_{t-1})) - \theta^*\|_2^2, \nonumber \\
    &= \|\widehat{\theta}_{t-1} - \theta^*\|_2^2 + \eta_t^2 \| {\psi}_t + (\theta^* - \widehat{\theta}_{t-1})\|_2^2 - 2 \eta_t \langle \widehat{\theta}_{t-1} - \theta^*, {\psi}_t + (\theta^* - \widehat{\theta}_{t-1}) \rangle, \nonumber \\
    &\stackrel{(b)}{\leq} \|\widehat{\theta}_{t-1} - \theta^*\|_2^2 + 2\eta^2_t \| {\psi}_t \|_2^2 + 2 \eta^2_t \|(\theta^* - \widehat{\theta}_{t-1})\|_2^2 - 2 \eta_t \langle \widehat{\theta}_{t-1} - \theta^*, {\psi}_t+ (\theta^* - \widehat{\theta}_{t-1}) \rangle, \label{eqn:proof_decomposition_first_step_mean_estimate}
\end{align}
Step $(a)$ follows since $\Theta$ is a convex set containing $\theta^*$, so that projecting onto $\Theta$ cannot increase the distance to $\theta^*$, i.e., $\|\mathcal{P}_{\Theta}(\theta) - \theta^*\| \leq \| \theta - \theta^*\|$ for every $\theta \in \mathbb{R}^d$. In step $(b)$, we use the fact that $\|a+b\|_2^2 \leq 2\|a\|_2^2 + 2 \|b\|_2^2$, for all $a,b \in \mathbb{R}^d$.
Substituting Equation (\ref{eqn:convexity_bound_inner_prod}) into (\ref{eqn:proof_decomposition_first_step_mean_estimate}), we get that 
\begin{multline*}
    \|\theta^* - \theta_{t} \|_2^2 \leq  \|\widehat{\theta}_{t-1} - \theta^*\|_2^2 + 2\eta^2_t \| {\psi}_t \|_2^2  - 2 \eta_t \langle \widehat{\theta}_{t-1} - \theta^*_{t}, {\psi}_t  \rangle
     \\ + 2 \eta^2_t \left( (M+m) \langle (\theta^* - \widehat{\theta}_{t-1}), \widehat{\theta}_{t-1} - \theta^*_t\rangle - mM \| \widehat{\theta}_{t-1} - \theta^*\|_2^2 \right)- 2\eta_t \langle (\theta^* - \widehat{\theta}_{t-1}), \widehat{\theta}_{t-1} - \theta^*_t \rangle.
\end{multline*}
Re-arranging the equation above yields 
\begin{multline*}
    \|\theta^* - \theta_{t} \|_2^2 \leq (1-2\eta^2_t mM)\|\widehat{\theta}_{t-1} - \theta^*\|_2^2 + 2\eta^2_t \| {\psi}_t \|_2^2    - 2 \eta_t \langle \widehat{\theta}_{t-1} - \theta^*, {\psi}_t \rangle  \\ - 2 \eta_t(1 - \eta_t \left( (M+m) \right)\langle (\theta^* - \widehat{\theta}_{t-1}), \widehat{\theta}_{t-1} - \theta^*\rangle.
\end{multline*}
Further substituting Equation (\ref{eqn:convexity_gradient_bound}) into the display above yields that 
\begin{align*}
    \|\theta^* - \widehat{\theta}_t\|_2^2 &\leq  (1-2\eta_t m + 2 \eta^2_t m^2 ) \| \widehat{\theta}_{t-1} - \theta^* \|_2^2  + 2 \eta^2_t \| {\psi}_t \|_2^2 - 2\eta_t \langle \widehat{\theta}_{t-1} - \theta^*, {\psi}_t \rangle, \\
    &\leq (1-\eta_t m  ) \| \widehat{\theta}_{t-1} - \theta^* \|_2^2  + 2 \eta^2_t \| {\psi}_t \|_2^2 - 2\eta_t \langle \widehat{\theta}_{t-1} - \theta^*, {\psi}_t \rangle,
\end{align*}
where the last inequality uses the fact that $\eta_t m \leq \frac{1}{2} \implies 2 \eta_t m - 2\eta^2_t m^2 \geq \eta_t m$. Thus,
%We simplify the display above using the inequality in Equation (\ref{eqn:psi_psi_bar_norm_mean}) as, 
\begin{align}
        \|\theta^* - \widehat{\theta}_t\|_2^2 \leq  (1-\eta_t m ) \| \widehat{\theta}_{t-1} - \theta^* \|_2^2 +    2 \eta^2_t \| {\psi_t} \|_2^2  - 2\eta_t \langle \widehat{\theta}_{t-1} - \theta^*, {\psi}_t  \rangle.
        \label{eqn:proof_decomp_disp_1_mean_estimate}
\end{align}
% Using the Cauchy-Schartz inequality that $\langle \theta_{t-1} - \theta^*_t, \psi_t - {\psi}_t \rangle \leq \| \theta_{t-1} - \theta^*_t\| \| {\psi}_t - \psi_t\| \leq 2\lambda \| \theta_{t-1} - \theta^*_t\| \mathbf{1}_{C_t \neq 0}$, where the last inequality comes from the fact that for all time $t$, $\|{\psi}_t - \psi_t\| \leq 2 \lambda \mathbf{1}_{C_t \neq 0}$ almost-surely. Plugging this into  Equation (\ref{eqn:proof_decomp_disp_1_mean_estimate}) yields 
% \begin{multline}
%             \|\theta^* - \widehat{\theta}_t\|_2^2 \leq  (1-\eta_t m ) \| \theta_{t-1} - \theta^* \|_2^2 +  2 \eta^2_t \| {\psi_t} \|_2^2 + 4 \eta^2_t \lambda^2\mathbf{1}_{C_t \neq 0} - 2\eta_t \langle \theta_{t-1} - \theta^*, {\psi}_t  \rangle + 4\eta_t \lambda \| \theta_{t-1} - \theta^*\| \mathbf{1}_{C_t \neq 0}.
% \end{multline}
% Using the fact that the diameter of the set $\Theta$ is $G_{\Theta}$ now yields that 
% \begin{multline*}
%                 \|\theta^* - \widehat{\theta}_t\|_2^2 \leq  (1-\eta_t m ) \| \theta_{t-1} - \theta^* \|_2^2 +   + 2 \eta^2_t \| {\psi_t} \|_2^2 + 4 \eta^2_t \lambda^2\mathbf{1}_{C_t \neq 0} - 2\eta_t \langle \theta_{t-1} - \theta^*, {\psi}_t  \rangle + 4\eta_t \lambda G  \mathbf{1}_{C_{t} \neq 0}.
% \end{multline*}

Unrolling the recursion yields, 
\begin{align*}
                \|\theta^* - \widehat{\theta}_t\|_2^2 \leq   \prod_{u=1}^{t}(1-\eta_u m ) \| \theta_1 - \theta^* \|_2^2   +  2 \eta^2_t \sum_{s=1}^{t-1}\prod_{u=1}^{s}(1-\eta_{t-u+1} m ) \| {\psi}_{t-s+1} \|_2^2  \\ - 2\eta_t \sum_{s=1}^{t-1}\prod_{u=1}^{s}(1-\eta_{t-u+1} m ) \langle \theta_{t-s} - \theta^*, {\psi}_{t-s+1}  \rangle .
\end{align*}

Using the fact that $\prod_{u=1}^{s}(1-\eta_{t-u+1} m ) = \frac{(t-s+\gamma-3)(t-s+\gamma-2)}{(t+\gamma)(t+\gamma-1)}$, we get that
\begin{align}
                \|\theta^* - \widehat{\theta}_t\|_2^2 &\leq   \frac{ (\gamma-2)(\gamma-1)\| \theta_1 - \theta^* \|_2^2}{(t+\gamma)(t+\gamma-1)} + 2 \eta^2_t \sum_{s=1}^{t-1}\frac{(t-s+\gamma-3)(t-s+\gamma-2)\| {\psi}_{t-s+1} \|_2^2}{(t+\gamma)(t+\gamma-1)} \nonumber \\
                &\quad - 2\eta_t \sum_{s=1}^{t-1}\frac{(t-s+\gamma-3)(t-s+\gamma-2)\langle \theta_{t-s} - \theta^*, {\psi}_{t-s+1}  \rangle}{(t+\gamma)(t+\gamma-1)}  . \label{eqn:proof_one_step_unrolled_mean_estimate}
\end{align}

We decompose $\psi_t = \psi_t^{(b)} + \psi_t^{(v)}$, where $\psi_t^{(b)} := \mathbb{E}_{Z_t}[ \psi_t \vert \mathcal{F}_{t-1}]$ and $\psi_t^{(v)} := \psi_t - \psi_t^{(b)}$. Using this in the display above and using the fact that $\|a+b\|_2^2 \leq 2 \|a\|_2^2 + 2 \|b\|_2^2$, we get
\begin{align}
                \|\theta^* - \widehat{\theta}_t\|_2^2 &\leq   \frac{ (\gamma-1)(\gamma-2)\| \theta_1 - \theta^* \|_2^2}{(t+\gamma)(t+\gamma-1)}  + 4  \eta^2_t \sum_{s=1}^{t-1}\frac{(t-s+\gamma-3)(t-s+\gamma-2)\left(\| {\psi}_{t-s+1}^{(b)} \|_2^2 + \| {\psi}_{t-s+1}^{(v)} \|_2^2\right)}{(t+\gamma)(t+\gamma-1)} \nonumber \\
                & - 2\eta_t \sum_{s=1}^{t-1}\frac{(t-s+\gamma-3)(t-s+\gamma-2)\langle \theta_{t-s} - \theta^*, {\psi}_{t-s+1}^{(b)}  \rangle}{(t+\gamma)(t+\gamma-1)} \nonumber \\
                & - 2\eta_t \sum_{s=1}^{t-1}\frac{(t-s+\gamma-3)(t-s+\gamma-2)\langle \theta_{t-s} - \theta^*, {\psi}_{t-s+1}^{(v)}  \rangle}{(t+\gamma)(t+\gamma-1)} . \label{eqn:proof_one_step_unrolled_with_bias_variance_mean_estimate}
\end{align}
Further simplifying by adding and subtracting $\mathbb{E}_{Z_t}[\| \psi_t^{(v)}\|_2^2 \vert \mathcal{F}_{t-1}]$ in the above display, we get
\begin{align}
                \|\theta^* - \widehat{\theta}_t\|_2^2 &\leq   \frac{(\gamma-1)(\gamma -2) \| \theta_1 - \theta^* \|_2^2}{(t+\gamma)(t+\gamma-1)}   + 4  \eta^2_t \sum_{s=1}^{t-1}\frac{(t-s+\gamma-3)(t-s+\gamma-2)\| {\psi}_{t-s+1}^{(b)} \|_2^2}{(t+\gamma)(t+\gamma-1)}  \nonumber \\
                &+  4  \eta^2_t \sum_{s=1}^{t-1}\frac{(t-s+\gamma-3)(t-s+\gamma-2)\mathbb{E}_{Z_{t-s+1}}[\| {\psi}_{t-s+1}^{(v)} \|_2^2\vert \mathcal{F}_{t-s}]}{(t+\gamma)(t+\gamma-1)}   \nonumber \\
                & +  4 \eta^2_t \sum_{s=1}^{t-1}\frac{(t-s+\gamma-3)(t-s+\gamma-2) (\| {\psi}_{t-s+1}^{(v)} \|_2^2 - \mathbb{E}_{Z_{t-s+1}}[\| \psi_{t-s+1}^{(v)}\|_2^2 \vert \mathcal{F}_{t-s}])}{(t+\gamma)(t+\gamma-1)}    \nonumber \\
                & - 2\eta_t \sum_{s=1}^{t-1}\frac{ (t-s+\gamma-3)(t-s+\gamma-2)\langle \theta_{t-s} - \theta^*, {\psi}_{t-s+1}^{(b)} \rangle}{(t+\gamma)(t+\gamma-1)} \nonumber \\
                & - 2\eta_t \sum_{s=1}^{t-1}\frac{ (t-s+\gamma-3)(t-s+\gamma-2)\langle \theta_{t-s} - \theta^*, {\psi}_{t-s+1}^{(v)}  \rangle}{(t+\gamma)(t+\gamma-1)}  . \label{eqn:proof_one_step_unrolled_with_bias_variance_martingale_mean_estimate}
\end{align}

% OLD 
% \begin{multline}
%      \|\widehat{\theta}_t - \widehat{\theta}_t^*\|_2^2 \leq (1-2\eta m)^{t-1} \| \theta_1 - \theta^*_1 \|_2^2   + \sum_{s=1}^{t-1}(1-2\eta m)^s \|\theta^*_{t-s} - \theta^*_{t-s+1}\|_2^2 \\ + \sum_{s=1}^{t-1} (1-2\eta m)^{s-1} 2 \eta^2 (\| \bar{\psi}_{t-s+1} \|_2^2 + 2 \lambda^2 ) - 2 \eta \sum_{s=1}^{t-1} \langle \theta_{t-s} - \theta^*_{t-s+1}, \bar{\psi}_t \rangle + 4 \eta \lambda G \mathbf{1}_{C_t \neq 0}
% \end{multline}


\begin{lemma}[Lemma F.5 \citep{gorbunov2020stochastic}]
If $\lambda \geq 2G$, the following inequalities hold almost-surely for all times $t$.
\begin{align}
    \| \psi_t^{(v)}\| &\leq 2 \lambda \mathbf{1}_{\sigma > 0} \label{eqn:bound_on_variance_norm} \\
    \| \psi_t^{(b)} \|_2 &\leq \frac{4\sigma^2}{\lambda} \label{eqn:bound_on_bias_norm} \\
    \mathbb{E}_{Z_t}[ \| \psi_t^{(v)}\|_2^2 \vert \mathcal{F}_{t-1}] &\leq 10\sigma^2 \label{eqn:bound_expected_variance_norm}
\end{align}
\label{lem:bounds_on_norms}
\end{lemma}


Simplifying Equation (\ref{eqn:proof_one_step_unrolled_with_bias_variance_martingale_mean_estimate}) using bounds in Lemma \ref{lem:bounds_on_norms}, along with the fact that for all $1 \leq s \leq t$ and $\gamma \geq 1$, $\frac{(t-s+\gamma-3)(t-s+\gamma-2)}{(t+\gamma)(t+\gamma-1)} \leq \frac{t-s+\gamma}{t+\gamma}$ we get
\begin{align}
                \|\theta^* - \widehat{\theta}_t\|_2^2 &\leq    \frac{ (\gamma-1)(\gamma-2)\| \theta_1 - \theta^* \|_2^2}{(t+\gamma)(t+\gamma-1)}   + \frac{16 \eta^2_t \sigma^2}{\lambda} \sum_{s=1}^{t-1}\frac{t-s+\gamma}{t+\gamma} +  4 \eta^2_t \sigma^2 \sum_{s=1}^{t-1}\frac{t-s+\gamma}{t+\gamma}  \nonumber \\
                &+  4 \eta^2_t \sum_{s=1}^{t-1}\frac{(t-s+\gamma)(\| {\psi}_{t-s+1}^{(v)} \|_2^2 - \mathbb{E}_{Z_{t-s+1}}[\| \psi_{t-s+1}^{(v)}\|_2^2 \vert \mathcal{F}_{t-s}])}{t+\gamma} \nonumber \\
                &  + 2\eta_t \sum_{s=1}^{t-1}\frac{(t-s+\gamma) \| \theta_{t-s} - \theta^*\| \| {\psi}_{t-s+1}^{(b)} \|}{t+\gamma} - 2\eta_t \sum_{s=1}^{t-1}\frac{(t-s+\gamma) \langle \theta_{t-s} - \theta^*, {\psi}_{t-s+1}^{(v)}  \rangle}{t+\gamma}  . \label{eqn:proof_one_step_unrolled_with_bias_variance_martingale_simplified_1_mean_estimate}
\end{align}
Further applying the bound $\| \psi_t^{(b)}\| \leq \frac{4 \sigma^2}{{\lambda}}$ from Lemma \ref{lem:bounds_on_norms}, we get
\begin{align}
                \|\theta^* - \widehat{\theta}_t\|_2^2 &\leq    \frac{ (\gamma-1) (\gamma-2)\| \theta_1 - \theta^* \|_2^2}{(t+\gamma)(t+\gamma-1)}   + \underbrace{\left(\frac{16 \eta^2_t \sigma^2}{\lambda} +  4 \eta^2_t \sigma^2 \right)\sum_{s=1}^{t-1}\frac{t-s+\gamma}{t+\gamma}}_{\text{Term }1}  \nonumber \\
                &+  \underbrace{4 \eta^2_t \sum_{s=1}^{t-1}\frac{(t-s+\gamma)(\| {\psi}_{t-s+1}^{(v)} \|_2^2 - \mathbb{E}_{Z_{t-s+1}}[\| \psi_{t-s+1}^{(v)}\|_2^2 \vert \mathcal{F}_{t-s}])}{t+\gamma}}_{\text{Term }2}   \nonumber \\
                & + \underbrace{\frac{8\sigma^2 \eta_t}{\lambda}  \sum_{s=1}^{t-1}\frac{(t-s+\gamma) \| \theta_{t-s} - \theta^*\| }{t+\gamma}}_{\text{Term }3}  \underbrace{- 2\eta_t \sum_{s=1}^{t-1}\frac{(t-s+\gamma) \langle \theta_{t-s} - \theta^*, {\psi}_{t-s+1}^{(v)}  \rangle}{t+\gamma}}_{\text{Term }4} . \label{eqn:proof_one_step_unrolled_with_bias_variance_martingale_simplified_2_mid_mean_estimate}
\end{align}

\subsection{Probabilistic analysis}

\textbf{Definitions}
\\

For every $t \geq 1$, define the constant
\begin{align}
    C_t  = \max\left(\frac{1024\sigma^4}{G^2m^2\lambda^2}, \frac{8 \lambda \sqrt{\ln \left( \frac{2t^3}{\delta} \right)}}{\gamma^2 G} \right).
    \label{eqn:defn_C}
\end{align}

For $u= 1,\cdots, t$, define the deterministic constant $\xi_u^{(t)}$ by
\begin{align}
    \left(\xi_u^{(t)}\right)^2 := C_t\bigg[ \left(\frac{16  \sigma^2}{\lambda} +  4  \sigma^2 \right) \frac{1}{2m^2(u+1)} + \frac{96 \lambda^2 \ln \left( \frac{2t^3}{\delta}\right)\sigma(\sigma+1) }{m (u+\gamma)\sqrt{u+1}}  \bigg].
    \label{eqn:xi_t_defn}
\end{align}

From the definitions, the following inequalities hold.
\begin{proposition}
For all times $u \in \{1,\cdots, t\}$,
\begin{align}
    \sum_{s=1}^{u-1}(u-s+\gamma)\xi_s^{(t)} &\leq 2(u+\gamma)\sqrt{u+1} \xi_u^{(t)}, \\
    \sum_{s=1}^{u-1}(\xi_s^{(t)})^2 &\leq 2(u+1)\ln(u+1) (\xi_u^{(t)})^2
    \label{eqn:xi_t_sum}
\end{align}
\label{prop:sum_of_xi_squared}
\end{proposition}
\begin{proof}
This follows from the following fact. 
\begin{proposition}
For all $ u \in \mathbb{N}$ and $\gamma \geq 0$, we have 
\begin{align*}
    \sum_{s=1}^{u-1}\frac{u-s+\gamma}{\sqrt{s+1}} \leq 2(u+\gamma)\sqrt{u+1}.
\end{align*}
\end{proposition}
\end{proof}

For each time $u \in \{1,\cdots, t\}$, define the random variable $\nu_u^{(t)}$ by
\begin{align*}
    \nu_u^{(t)} := \begin{cases} \theta_{u} - \theta^* & \text{if } \| \theta_u - \theta^*\|^2 \leq (\xi_u^{(t)})^2 + \frac{C_t\gamma^2 G^2}{(u+1)}, \\ 0 & \text{otherwise.} \end{cases}
\end{align*}

For every $u \in \{1,\cdots,t\}$, define the event $\mathcal{E}^{(t)}_{u;1}$ as
\begin{multline}
 \mathcal{E}^{(t)}_{u;1} \coloneqq \bigg \{   4 \eta^2_u \sum_{s=1}^{u-1}\frac{(u-s+\gamma)(\| {\psi}_{u-s+1}^{(v)} \|_2^2 - \mathbb{E}_{Z_{u-s+1}}[\| \psi_{u-s+1}^{(v)}\|_2^2 \vert \mathcal{F}_{u-s}])}{u+\gamma} \\\leq \frac{96 \lambda^2 \ln \left( \frac{2t^2(t+1)}{\delta}\right)\sigma(\sigma+1) }{m (u+\gamma)\sqrt{u+1}} \bigg\}, \label{eqn:mean_martingale_term_1}
\end{multline}
and $\mathcal{E}^{(t)}_{u;2}$ as 
\begin{align}
   \mathcal{E}^{(t)}_{u;2} \coloneqq \bigg\{ - 2\eta_u \sum_{s=1}^{u-1}\frac{(u-s+\gamma) \langle \nu_{u-s}^{(t)}, {\psi}_{u-s+1}^{(v)}  \rangle}{u+\gamma} &\leq\frac{\xi_u^{(t)}\ln \left( \frac{2t^2(t+1)}{\delta}\right)}{10 \sqrt{u+1}} +  \frac{C_t \gamma^2 G^2}{4(u+1)} \bigg\}. \label{eqn:mean_martingale_term_2}
\end{align}

Define the event $\mathcal{E}^{(t)}$ as
\begin{align}
    \mathcal{E}^{(t)} \coloneqq \bigcap_{u=1}^t \left(\mathcal{E}^{(t)}_{u;1} \cap \mathcal{E}^{(t)}_{u;2} \right).
    \label{eqn:event_e_t}
\end{align}

\begin{lemma}
For all $t\geq 1$,
\begin{align*}
    \mathbb{P}[\mathcal{E}^{(t)}] \geq 1 - \frac{\delta}{t(t+1)}.
\end{align*}

% Fix a time $t$. For each time $u \in \{1,\cdots, t\}$, denote by the random variable $\nu_u^{(t)}$ by 
% \begin{align*}
%     \nu_u^{(t)} := \twopartdef { \theta_{u} - \theta^* } {\| \theta_u - \theta^*\| \leq \xi_u^{(t)} + \frac{\gamma G}{(u+1)}} {0} {\text{otherwise}}
% \end{align*}
% With probability at-least $1- \frac{\delta}{t(t+1)}$, for every time $u \in \{1,\cdots, t\}$, the following two bounds hold
% \begin{align}
%   4 \eta^2_t \sum_{s=1}^{u-1}\frac{(u-s+\gamma)(\| {\psi}_{u-s+1}^{(v)} \|_2^2 - \mathbb{E}_{Z_{u-s+1}}[\| \psi_{u-s+1}^{(v)}\|_2^2 \vert \mathcal{F}_{u-s+1}])}{t+\gamma} &\leq \frac{1600 \lambda^2 \ln \left( \frac{2t^3}{\delta}\right)\sigma(\sigma+1) }{m (u+\gamma)\sqrt{u+1}}. \label{eqn:mean_martingale_term_1}\\
%     - 2\eta_u \sum_{s=1}^{u-1}\frac{(u-s+\gamma) \langle \upsilon_{u-s}, {\psi}_{u-s+1}^{(v)}  \rangle}{t+\gamma} &\leq\frac{\xi_u^{(t)}\ln \left( \frac{2 t^3}{\delta}\right)}{10 \sqrt{u+1}} +  \frac{C \gamma R_1}{10(u+1)}. \label{eqn:mean_martingale_term_2}
% \end{align}
% Denote by the event $\mathcal{E}^{(t)}$ to be the one in which Equations (\ref{eqn:mean_martingale_term_1}) and (\ref{eqn:mean_martingale_term_2}) hold for all $u \in \{1,\cdots, t\}$. The statement of the lemma is then $\mathbb{P}[\mathcal{E}^{(t)}] \geq 1-\frac{\delta}{t(t+1)}$.
% Thus, on the event that Equations (\ref{eqn:induction_hypothesis}) holds for all $s=1,\cdots, t-1$, then
% \begin{align*}
%     - 2\eta_u \sum_{s=1}^{u-1}\frac{(u-s+\gamma) \langle \theta_{u-s} - \theta^*, {\psi}_{u-s+1}^{(v)}  \rangle}{t+\gamma} \leq \frac{\xi_u^{(t)}\ln \left( \frac{2 t^3}{\delta}\right)}{10 \sqrt{u+1}} +  \frac{C \gamma R_1}{10(u+1)}.
% \end{align*}
\label{lem:martingale_mean_estimate}
\end{lemma}

We now prove the following lemma by induction.

\begin{lemma}\label{lem:induction}
For every $t$, on the event $\mathcal{E}^{(t)}$, the following holds.
\begin{align}
    \| \widehat{\theta}_u - \theta^*\|_2^2 \leq \frac{C_t\gamma^2 G^2}{(u+1)^2} + (\xi_u^{(t)})^2  ,
    \label{eqn:induction_hypothesis}
\end{align}
for all $u \in \{1,\cdots, t\}$.
\end{lemma}
\begin{proof}

\begin{proof}[Proof of Lemma \ref{lem:induction}]
We will prove this lemma by induction on $u$ by analyzing Equation (\ref{eqn:proof_one_step_unrolled_with_bias_variance_martingale_simplified_2_mid_mean_estimate}). The base-case of $u=1$ holds trivially with probability $1$ since $C_t > 1$, $\forall t \geq 1$ and $\gamma > 2$.
\\

Now, assume that on the event $\mathcal{E}^{(t)}$, the induction hypothesis in Equation (\ref{eqn:induction_hypothesis}) holds for all times $1,\cdots, u-1$. We prove this by expanding Equation (\ref{eqn:proof_one_step_unrolled_with_bias_variance_martingale_simplified_2_mid_mean_estimate}) and bounding each of the terms. 
\\

\textbf{Term 1} 
\\

It is easy to verify that 
\begin{align*}
    \left(\frac{16 \eta^2_u \sigma^2}{\lambda} +  4 \eta^2_u \sigma^2 \right)\sum_{s=1}^{u-1}\frac{u-s+\gamma}{u+\gamma} &\leq \left(\frac{16  \sigma^2}{\lambda} +  4  \sigma^2 \right)\frac{u}{2m^2(u+\gamma)^2}, \\
    &\leq \frac{\left( \frac{16\sigma^2}{\lambda} + 4 \sigma^2\right)}{2m^2(u+1)}.
\end{align*}
The last inequality follows since $\gamma^2 > 1$.

\textbf{Term 2}
\\

First notice that 
\begin{multline*}
    4 \eta^2_u \sum_{s=1}^{u-1}\frac{(u-s+\gamma)(\| {\psi}_{u-s+1}^{(v)} \|_2^2 - \mathbb{E}_{Z_{u-s+1}}[\| \psi_{u-s+1}^{(v)}\|_2^2 \vert \mathcal{F}_{u-s+1}])}{t+\gamma} \leq \\ \frac{4 \eta_u}{u+\gamma} \sum_{s=1}^{u-1}{(\| {\psi}_{u-s+1}^{(v)} \|_2^2 - \mathbb{E}_{Z_{u-s+1}}[\| \psi_{u-s+1}^{(v)}\|_2^2 \vert \mathcal{F}_{u-s+1}])}
\end{multline*}

From the definition of event $\mathcal{E}^{(t)}$ in Equation (\ref{eqn:event_e_t}), we get that 
\begin{align*}
    \text{Term }2 \leq \frac{96 \lambda^2 \ln \left( \frac{2t^2(t+1)}{\delta}\right)\sigma(\sigma+1) }{m (u+\gamma)\sqrt{u+1}}.
\end{align*}


\textbf{Term 3}
\\

\begin{align*}
    \frac{8\sigma^2 \eta_u}{\lambda}  \sum_{s=1}^{u-1}\frac{(u-s+\gamma) \| \theta_{u-s} - \theta^*\| }{u+\gamma} &\leq \frac{8 \sigma^2 }{m\lambda(u+\gamma)^2} \sum_{s=1}^{u-1}\left((u-s+\gamma){\xi_{u-s}^{(t)}} + \sqrt{C_t}\gamma G\frac{(u-s+\gamma)}{(u-s+1)}\right), \\
    &\stackrel{(\ref{eqn:xi_t_sum})}{\leq} \frac{16 \sigma^2 \sqrt{(u+1)}\xi_u^{(t)}}{m(u+\gamma)} + \frac{8 \sqrt{C_t}\sigma^2\gamma^2 G u }{m\lambda(u+\gamma)^2}, \\
     &\stackrel{}{\leq} \frac{16 \sigma^2 \sqrt{(u+1)}\xi_u^{(t)}}{m(u+\gamma)} + \frac{8 \sqrt{C_t} \sigma^2\gamma^2 G  }{m\lambda(u+\gamma)}, \\
    &\stackrel{(a)}{\leq} \frac{\xi_u^{(t)}}{10\sqrt{u+1}} +  \frac{{\color{black}C_t \gamma^2 G^2} }{4(u+1)}.
\end{align*}
The last inequality follows since $\gamma \geq \frac{320\sigma^2}{m}+1 \implies \frac{8 \sigma^2 (u+1)^{1/2}\log(u+1)}{m(u+\gamma)} \leq \frac{1}{10\sqrt{u+1}}$, for all $u \leq t$ and the fact that  {\color{black}$C_t \geq \frac{1024\sigma^4}{G^2m^2\lambda^2}$}.
\\ 

\textbf{Term 4}
\\

The definition of event $\mathcal{E}^{(t)}$ in Equation (\ref{eqn:event_e_t}) gives that $\text{Term }4 \leq \frac{\xi_u^{(t)}\ln \left( \frac{2t^2(t+1)}{\delta}\right)}{10\sqrt{u+1}} + \frac{C_t \gamma^2 G^2}{4(u+1)}$.
\\

% \\

% \textbf{Term 5}
% \\

% \begin{align*}
%     {4 \eta^2_t \lambda^2}\sum_{s=1}^{t-1}\frac{(t-s+1)\mathbf{1}_{C_{t-s+1} \neq 0}}{t+\gamma} &\leq \frac{4\lambda^2}{m^2(t+\gamma)^2}\Lambda_T, \\
%     &\leq \frac{4\lambda^2 }{20m^2(t+1)}\Lambda_T
% \end{align*}

% \textbf{Term 6}
% \\

% \begin{align*}
%     4\eta_t  G \sum_{s=1}^{t-1} \frac{(t-s+1)\mathbf{1}_{C_{t-s+1} \neq 0}}{t+\gamma} &\leq \frac{4G}{m(t+\gamma)}\Lambda_T, \\
%      &\leq \frac{4G }{m(t+1)}\Lambda_T
% \end{align*}

{\color{black}

Now, adding in the bounds together into Equation (\ref{eqn:proof_one_step_unrolled_with_bias_variance_martingale_simplified_2_mid_mean_estimate}), 
\begin{multline*}
    \| \widehat{\theta}_u - \theta^{*} \|_2^2 \leq \frac{\gamma^2G^2}{u+1} + \frac{\left( \frac{16\sigma^2}{\lambda} + 4 \sigma^2\right)}{2m^2(u+1)} + \frac{\xi_u^{(t)}}{10\sqrt{u+1}} + \frac{96 \lambda^2 \ln \left( \frac{2t^2(t+1)}{\delta}\right)\sigma(\sigma+1) }{m (u+\gamma)\sqrt{u+1}} \\ + \frac{\xi_u^{(t)}\ln \left( \frac{2t^2(t+1)}{\delta}\right)}{10\sqrt{u+1}} + \frac{C_t \gamma^2 G^2}{2(u+1)}.
\end{multline*}
Now using the fact that $\frac{\xi_u^{(t)}\ln \left( \frac{2 t^3}{\delta}\right)}{\sqrt{u+1}} \leq (\xi_u^{(t)})^2$, we get that 
\begin{align*}
    \| \widehat{\theta}_u - \theta^{*} \|_2^2 &\leq \left(1+\frac{C_t}{2} \right)\frac{\gamma^2G^2}{u+1} + \frac{\left( \frac{16\sigma^2}{\lambda} + 4 \sigma^2\right)}{2m^2(u+1)} + \frac{(\xi_u^{(t)})^2}{5} + \frac{96 \lambda^2 \ln \left( \frac{2t^2(t+1)}{\delta}\right)\sigma(\sigma+1) }{m (u+\gamma)\sqrt{u+1}}.
\end{align*}
Substituting the definition of $\xi_u^{(t)}$ from Equation (\ref{eqn:xi_t_defn}), we get that 
\begin{align*}
     \| \widehat{\theta}_u - \theta^{*} \|_2^2 &\leq \left(1 + \frac{C_t}{2} \right)\left[\frac{\gamma^2G^2}{u+1} + \frac{\left( \frac{16\sigma^2}{\lambda} + 4 \sigma^2\right)}{2m^2(u+1)} + \frac{96 \lambda^2 \ln \left( \frac{2t^2(t+1)}{\delta}\right)\sigma(\sigma+1) }{m (u+\gamma)\sqrt{u+1}}   \right] , \\
    &\leq (\xi_u^{(t)})^2 + \frac{C_t \gamma^2 G^2}{u+1}.
\end{align*}
The last inequality follows since $C_t  = \max\left(\frac{1024\sigma^4}{G^2m^2\lambda^2}, \frac{8 \lambda \sqrt{\ln \left( \frac{2t^3}{\delta} \right)}}{\gamma^2 G} \right) \implies C_t \geq 2$. 
}


\end{proof}


\end{proof}
% \clearpage

% Re-arranging the above inequality, we see that if 
% \begin{align}
%     a \geq B \ln \left( \frac{2 T}{\delta} \right) + \sqrt{ \left(B \ln \left( \frac{2 T}{\delta} \right)\right)^2 + 2v \ln \left( \frac{2 T}{\delta} \right)  },
%     \label{eqn:martingale_useful_bound}
% \end{align}
% then the RHS of Equation (\ref{eqn:martingale_diff_thm_bound}) is bounded above by $\frac{\delta}{2}$. 
% \\


\subsection{Proof of Lemma \ref{lem:martingale_mean_estimate}}

We first reproduce a useful result.

\begin{lemma}[Freedman's inequality \citep{victor1999general}]
Suppose $Y_1, \cdots, Y_T$ is a bounded martingale difference sequence with respect to a filtration $(\mathcal{F}_t)_{t=0}^T$, i.e., $\mathbb{E}[Y_t \vert \mathcal{F}_{t-1}] = 0$ and $\mathbb{P}[|Y_t| \leq B] = 1$ for all $t \in \{1,\cdots, T\}$. Let $V_s := \sum_{n=1}^s \text{Var}(Y_n \vert \mathcal{F}_{n-1})$ denote the sum of conditional variances. Then, for every $a, v > 0$,
\begin{align}
    \mathbb{P} \left( \exists n \in [1,T] \text{ such that } \sum_{t=1}^n Y_t \geq a \text{ and } V_n \leq v \right) \leq \exp \left( \frac{-a^2}{2(v+ Ba)} \right).
    \label{eqn:martingale_diff_thm_bound}
\end{align}
\label{lem:martingale_diff}
\end{lemma}
Re-arranging the above inequality, we see that if 
\begin{align}
    a \geq B \ln \left( \frac{2 T}{\delta} \right) + \sqrt{ \left(B \ln \left( \frac{2 T}{\delta} \right)\right)^2 + 2v \ln \left( \frac{2 T}{\delta} \right)  },
    \label{eqn:martingale_useful_bound}
\end{align}
then the RHS of Equation (\ref{eqn:martingale_diff_thm_bound}) is bounded above by $\frac{\delta}{2}$. 

\begin{proof}[Proof of Lemma \ref{lem:martingale_mean_estimate}]
\textbf{Proof of Equation (\ref{eqn:mean_martingale_term_1})}
\\


Fix a $u \in \{1,\cdots, t\}$. For $s \in \{1,\cdots, u-1\}$, define the random variable $Y_s^{(u)} :=  \frac{(u-s+\gamma)}{u+\gamma}(\| {\psi}_{u-s+1}^{(v)} \|_2^2 - \mathbb{E}_{Z_{u-s+1}}[\| \psi_{u-s+1}^{(v)}\|_2^2 \vert \mathcal{F}_{u-s}])$. Thus, 
\begin{align*}
    4 \eta^2_u \sum_{s=1}^{u-1}\frac{(u-s+\gamma)(\| {\psi}_{u-s+1}^{(v)} \|_2^2 - \mathbb{E}_{Z_{u-s+1}}[\| \psi_{u-s+1}^{(v)}\|_2^2 \vert \mathcal{F}_{u-s+1}])}{u+\gamma} \leq 4\eta_u^2 {\sum_{s=1}^{u-1}Y_s^{(u)}}.
\end{align*}
Observe that the sequence $(Y_s^{(u)})_{s=1}^{u-1}$ is a martingale difference sequence with respect to the filtration $(\mathcal{G}_s)_{s=1}^{t-1}$, where $\mathcal{G}_s := \mathcal{F}_{u-s}$. Furthermore, observe that with probability $1$, $| Y_s^{(u)}| \leq 4\lambda^2\mathbf{1}_{\sigma > 0} + 4\lambda^2\mathbf{1}_{\sigma > 0} \leq 8\lambda^2\mathbf{1}_{\sigma > 0}$. We can bound the conditional variance as 
\begin{align*}
    \sum_{s=1}^{u-1}\text{Var}(Y_s^{(u)} \vert \mathcal{G}_s) &\leq  \sum_{s=1}^{u-1}\left(\frac{(u-s+\gamma)}{u+\gamma}\right)^2\mathbb{E}_{Z_{u-s}}[ (\| {\psi}_{u-s+1}^{(v)} \|_2^2 - \mathbb{E}_{Z_{u-s+1}}[\| \psi_{u-s+1}^{(v)}\|_2^2 \vert \mathcal{F}_{u-s}])^2 \vert \mathcal{F}_{u-s} ], \\
    &\stackrel{\ref{eqn:bound_on_variance_norm}}{\leq}  8\lambda^2 \sum_{s=1}^{u-1}\mathbb{E}_{Z_{u-s}}[ |\| {\psi}_{u-s+1}^{(v)} \|_2^2 - \mathbb{E}_{Z_{u-s+1}}[\| \psi_{u-s+1}^{(v)}\|_2^2 \vert \mathcal{F}_{u-s}]| \vert \mathcal{F}_{u-s} ], \\
    &\leq 8  \lambda^2 \sum_{s=1}^{u-1} 2 \mathbb{E}_{Z_{u-s}}[ |\| {\psi}_{u-s+1}^{(v)} \|_2^2 \vert \mathcal{F}_{u-s}], \\
    &\stackrel{\ref{eqn:bound_expected_variance_norm}}{\leq} 160 \lambda^2 \sigma^2 (u-1).
\end{align*}
Now, putting $B := 8  \lambda^2$ and $v = 160  \lambda^2 \sigma^2 u$, we get from Equation (\ref{eqn:martingale_useful_bound}) that with probability at least $1-\delta/(2t^2(t+1))$, 
\begin{align*}
    \sum_{s=1}^{u-1}Y_s^{(u)} &\leq 8  \lambda^2  \ln \left( \frac{2t^2(t+1)}{\delta} \right)\mathbf{1}_{\sigma > 0} + \sqrt{\left(8  \lambda^2  \ln \left( \frac{2t^2(t+1)}{\delta} \right)\mathbf{1}_{\sigma > 0} \right)^2 + 160 \lambda^2 \sigma^2 u \ln \left( \frac{2t^2(t+1)}{\delta} \right)}, \\
    &\stackrel{(a)}{\leq} 32  \lambda^2 \ln \left( \frac{2t^2(t+1)}{\delta} \right)\sigma(\sigma+1)\sqrt{u+1}.
    \end{align*}
Step $(a)$ follows from the fact that $\lambda \geq 1$. Thus, we have with probability at least $1-\frac{\delta}{2t^2(t+1)}$, 
\begin{align*}
  4 \eta^2_u \sum_{s=1}^{u-1}\frac{(u-s+\gamma)(\| {\psi}_{u-s+1}^{(v)} \|_2^2 - \mathbb{E}_{Z_{u-s+1}}[\| \psi_{u-s+1}^{(v)}\|_2^2 \vert \mathcal{F}_{u-s+1}])}{u+\gamma} &\leq 96\eta_u^2   \lambda^2 \ln \left( \frac{2t^2(t+1)}{\delta} \right)\sigma(\sigma+1)\sqrt{u+1}, \\
    &\leq  \frac{96\lambda^2 \ln \left( \frac{2t^2(t+1)}{\delta} \right)\sigma(\sigma+1)\sqrt{u+1}}{m^2(u+\gamma)^2}, \\
    &\leq \frac{96\lambda^2 \ln \left( \frac{2t^2(t+1)}{\delta} \right)\sigma(\sigma+1)}{m^2(u+\gamma)\sqrt{u+1}}.
\end{align*}
Now taking a union bound over all $u \in \{1,\cdots,t\}$ yields that with probability at least $1- \frac{\delta}{2t(t+1)}$, for all times $u \in \{1, \cdots, t\}$, 
\begin{align*}
     4 \eta^2_u \sum_{s=1}^{u-1}\frac{(u-s+\gamma)(\| {\psi}_{u-s+1}^{(v)} \|_2^2 - \mathbb{E}_{Z_{u-s+1}}[\| \psi_{u-s+1}^{(v)}\|_2^2 \vert \mathcal{F}_{u-s+1}])}{u+\gamma} &\leq \frac{96 \lambda^2 \ln \left( \frac{2t^2(t+1)}{\delta}\right)\sigma(\sigma+1) }{m (u+\gamma)\sqrt{u+1}}.
\end{align*}



\textbf{Proof of Equation (\ref{eqn:mean_martingale_term_2})}
\\

{\color{black}
\begin{align*}
    - 2\eta_u \sum_{s=1}^{u-1}\frac{(u-s+\gamma) \langle \upsilon_{u-s}, {\psi}_{u-s+1}^{(v)}  \rangle}{u+\gamma} \leq \frac{2}{m(u+\gamma)^2} \sum_{s=1}^{u-1}{ \langle \theta_{u-s} - \theta^*, {\psi}_{u-s+1}^{(v)}  \rangle}
\end{align*}


Fix a $u \in \{1, \cdots, t\}$ and denote by $Y_s^{(u)} := (u-s+\gamma) \langle \theta_{u-s} - \theta^{*}, \psi_{u-s+1}^{(v)} \rangle$. Since $\theta_{u-s}$ is measurable with respect to the sigma-algebra generated by $\mathcal{F}_{u-s}$, the conditional expectation $\mathbb{E}[Y_s^{(u)} \vert \mathcal{F}_{u-s}] = 0$. Thus, $(Y_s^{(u)})_{s=1}^{u-1}$ is a martingale difference sequence with respect to the filtration $(\mathcal{F}_{u-s})_{s=1}^{u-1}$. Furthermore, we have from Equation (\ref{eqn:bound_on_variance_norm}) that $|Y_s^{(u)}| \leq 2 (u-s+\gamma) \left(\xi_{u-s}^{(t)} + \frac{\gamma R_1}{(u+\gamma - 1)} \right)\lambda  \leq 2\lambda(u+\gamma)\xi_t^{(t)} + 2\lambda \gamma G$. We can now bound the sum of conditional variances as 
\begin{align*}
    \sum_{s=1}^{u-1} \text{Var}(Y_s^{(u)} \vert \mathcal{F}_{u-s}) &\leq \sum_{s=1}^{u-1} 4 (u-s+\gamma)^2 (\xi_{u-s}^{(t)})^2 \lambda^2\sigma^2 + 4\lambda^2 G^2, \\
    &\stackrel{(\ref{eqn:xi_t_sum})}{\leq} 12 \lambda^2 \sigma^2 (u+\gamma)^2(u+1) \log(u+1) (\xi_u^{(t)})^2 + 4\lambda^2\gamma^2G^2 u . 
\end{align*}
Step $(a)$ follows since $\eta m <1$. Now applying the bound in Equation (\ref{eqn:martingale_useful_bound}) with $B := 2\lambda(u+\gamma)\xi_u^{(t)} + 2\lambda G$ and $v = 12 \lambda^2 \sigma^2 (u+\gamma)^2(u+1) \log(u+1) (\xi_u^{(t)})^2 + 4 \lambda^2  \gamma^2 G^2u$, we get that with probability at-least $1-\delta/(2t^2(t+1))$, 
\begin{align*}
   \sum_{s=1}^{u-1}&{(u-s+\gamma) \langle \upsilon_{u-s}, {\psi}_{u-s+1}^{(v)}  \rangle}\leq 2 \lambda \left((u+\gamma)\xi_u^{(t)} + R_1\right) \ln \left( \frac{2 t^2(t+1)}{\delta} \right) +  \bigg[\left(2 \lambda \left((u+\gamma)\xi_u^{(t)} + G\right)  \ln \left( \frac{2 t^2(t+1)}{\delta} \right) \right)^2 \\& +  \left( \lambda^2 \sigma^2 (u+\gamma)^2(u+1) \log(u+1) (\xi_u^{(t)})^2 + 4\lambda^2\gamma^2G^2 (u+1) \right) \ln \left( \frac{2t^2(t+1)}{\delta}\right)\bigg]^{\frac{1}{2}}, \\
    &\stackrel{}{\leq} {6(u+\gamma)\sqrt{u+1} \log(u+1) (\xi_u^{(t)}) \lambda \sigma(\sigma+1) \ln \left( \frac{2 t^2(t+1)}{\delta}\right)} + 2\lambda \gamma G \sqrt{(u+1) \ln \left( \frac{2t^2(t+1)}{\delta} \right)}.
\end{align*}
Thus,

\begin{align*}
   - 2\eta_u  \sum_{s=1}^{u-1}\frac{(u-s+\gamma) \langle \upsilon_{u-s}, {\psi}_{u-s+1}^{(v)}  \rangle}{u+\gamma}  &\leq \frac{12\sqrt{u+1} \log(u+1) (\xi_u^{(t)}) \lambda \sigma(\sigma+1) \ln \left( \frac{2t^2(t+1)}{\delta}\right)}{(u+\gamma)} + \frac{C_t \gamma G}{10(u+1)}, \\
    &\leq \frac{\xi_u^{(t)}\ln \left( \frac{2t^2(t+1)}{\delta}\right)}{10 \sqrt{u+1}} + \frac{C_t G}{10(u+1)}.
\end{align*}
The first inequality follows since $C_t \geq \frac{8 \lambda \sqrt{\ln \left( \frac{2t^3}{\delta} \right)}}{\gamma^2 G}$. The last inequality follows since for all times $u \leq t$, we have 
\begin{align*}
    \frac{12{\sqrt{u+1}} \log(u+1) \lambda \sigma(\sigma+1) \ln \left( \frac{2t^2(t+1)}{\delta}\right)}{(u+\gamma)} \leq \frac{\ln \left( \frac{2t^2(t+1)}{\delta}\right)}{10}
\end{align*}
as a consequence of $\gamma \geq 120 \lambda \sigma(\sigma+1)$.
 }
 





\end{proof}

\section{Proofs from Section \ref{sec:fpr_guarantee}}


\subsection{Proof of Theorem \ref{thm:fpr_main}}
\label{sec:proof_thm_fpr_main}


We bound this probability using the result of Theorem~\ref{thm:main_mean_est} and a simple union bound argument. For any process $\mathfrak{M}$, observe that 
\begin{align}
    \mathbb{P}[\exists t \in [r+1,\tau_c^{(r)}) \text{ s.t. } \mathcal{A}_{t} =1 \vert \mathcal{A}_r = 1] &= \mathbb{P}[\cup_{t=r+1}^{\tau_c-1} \mathcal{A}_{t} = 1 \vert \mathcal{A}_r = 1] \nonumber \\
    &\leq \sum_{t=r+1}^{\tau_c-1} \mathbb{P}[\mathcal{A}_{t} = 1 \vert \mathcal{A}_r = 1]. \label{eqn:fpr_proof_mid_point_1}
\end{align}

We now bound each term in the sum on the right-hand side of the above equation. For any fixed $t \in (r, \tau_c^{(r)})$,
\begin{align}
   \mathbb{P}[ & \mathcal{A}_{t} = 1 \vert \mathcal{A}_r = 1] 
    =  \mathbb{P}\left[ \bigcup_{s=r+1}^{t-1} \| \widehat{\theta}_{r:s} - \widehat{\theta}_{s+1:t} \| \geq \mathcal{B}\left(s-r, \frac{\delta}{2t(t+1)}\right) + \mathcal{B}\left(t-s-1, \frac{\delta}{2t(t+1)}\right)\right], \nonumber\\
    &\leq \sum_{s=r+1}^{t-1} \left(\mathbb{P}\left[ \| \widehat{\theta}_{r:s} - \theta_{c-1} \| \geq \mathcal{B}\left(s-r, \frac{\delta}{2t(t+1)}\right) \right] +  \mathbb{P}\left[ \| \widehat{\theta}_{s+1:t} - \theta_{c-1} \| \geq \mathcal{B}\left(t-s-1, \frac{\delta}{2t(t+1)}\right) \right]\right), \nonumber\\
    &\stackrel{(a)}{\leq} \sum_{s=r+1}^{t-1} \left( \frac{\delta}{2t(t+1)(s-r)(s-r+1)} + \frac{\delta}{2t(t+1)(t-s-1)(t-s)} \right), \nonumber\\
    &= \frac{\delta}{2t(t+1)} \left( \sum_{s=r+1}^{t-1} \frac{1}{(s-r)(s-r+1)} + \sum_{s=r+1}^{t-1} \frac{1}{(t-s-1)(t-s)}\right), \nonumber\\
    &\leq \frac{\delta}{2t(t+1)} \left( \sum_{s=1}^{t-1-r} \frac{1}{s(s+1)} + \sum_{s=1}^{t-1-r}\frac{1}{s(s+1)} \right), \nonumber\\
    &\stackrel{(b)}{\leq} \frac{\delta}{t(t+1)}. \label{eqn:fpr_inter_2}
\end{align}

Since for all $t < \tau_c^{(r)}$, the means of the random variables $X_{r+1}, \cdots, X_t$ are identical and equal to $\theta_{c-1}$ (see notation in Section \ref{sec:problem_formulation}), Theorem \ref{thm:main_mean_est} gives rise to inequality $(a)$. Step $(b)$ follows from the fact that $\sum_{s \geq 1} \frac{1}{s(s+1)} = 1$. Now substituting the bound from Equation (\ref{eqn:fpr_inter_2}) into Equation (\ref{eqn:fpr_proof_mid_point_1}), we get that 
\begin{align*}
      \mathbb{P}[\exists t \in [r+1,\tau_c^{(r)}) \text{ s.t. } \mathcal{A}_{t} =1 \vert \mathcal{A}_r = 1] &\leq \sum_{t=r+1}^{\tau_c-1} \frac{\delta}{t(t+1)}, \\
      &\leq \sum_{t \geq 1} \frac{\delta}{t(t+1)}, \\
      &=\delta. 
\end{align*}

Since the above bound holds for all $r$ and process $\mathfrak{M}$, we have 
\begin{align*}
    \sup_{\mathfrak{M}, r}\mathbb{P}[\exists t \in [r+1,\tau_c^{(r)}) \text{ s.t. } \mathcal{A}_{t}  =1 \vert \mathcal{A}_r = 1] \leq \delta. 
\end{align*}

\subsection{Proof of Lemma \ref{lem:fpr_connection}}
\label{sec:proof_of_fpr_connection}

Recall from the definition that the $r$th detection is false if
\begin{align*}
    \chi_r^{(A)} = \mathbf{1}(\not\exists c \text{ s.t. } \tau_c \in (t_{r-1}^{(A)}, t_r^{(A)}]).
\end{align*}

We will show that $\mathbb{E}[\chi_r^{(A)} ] \leq \delta$. This will then conclude the proof of the lemma. 

\begin{align*}
    \mathbb{E}[\chi_r^{(A)} ] &= \mathbb{P}[\not\exists c \text{ s.t. } \tau_c \in (t_{r-1}^{(A)}, t_r^{(A)}]], \\
    &= \mathbb{E} \left[ \mathbb{P}[\not\exists c \text{ s.t. } \tau_c^{(s)} \in (s, t_r^{(A)}]]\bigg| t_{r-1}^{(A)} = s \right], \\
    &\leq \mathbb{E} \left[ \mathbb{P}[\cup_{t=s+1}^{\infty} \tau_c^{(s)} = t, t_{r}^{(\mathcal{A})} < t]\bigg \vert t_{r-1}^{(A)} = s  \right], \\
     &\leq \mathbb{E} \left[ \mathbb{P}[\exists t \in [s+1, \tau_c^{(s)}), \mathcal{A}_{t} = 1 ]\bigg\vert t_{r-1}^{(A)} = s  \right], \\
      &\stackrel{(a)}{\leq} \mathbb{E} \left[ \mathbb{P}[\exists t \in [s+1, \tau_c^{(s)}), \mathcal{A}_{t} = 1 \vert \mathcal{A}_{s} = 1  ]\bigg\vert t_{r-1}^{(A)} = s  \right], \\
     &\stackrel{(b)}{\leq} \delta.
    % &\leq \mathbb{E} \left[ \mathbb{P}^{(\theta,t)}[ \exists r \in [s+1,t) \text{ s.t. \textbf{Restart}}_{s:r} = 1 ] \bigg| t_r^{(\mathcal{A})} = s\right], \\
    % &= \mathbb{P}[\not\exists c \text { s.t. } \tau_c \in (1, t_1^{(\mathcal{A})})]
\end{align*}
Inequality $(a)$ follows from the fact that on the event $t_{r-1}^{(\mathcal{A})} = s$, $\mathcal{A}_s = 1$. Inequality $(b)$ follows from Theorem \ref{thm:fpr_main}.
\section{Proof of Lemma \ref{lem:detection_delay}}
\label{sec:proof_delay}

% \begin{figure}
%     \centering
%     \includegraphics[width=0.4\linewidth]{plots/detection_delay.pdf}
%     \caption{Plot of $ \mathcal{D}(n, \Delta, \delta')$ for fixed $\Delta=10, \delta=0.1$.}
%     \label{fig:detection_delay}
% \end{figure}

% In Figure \ref{fig:detection_delay}, we plot $\mathcal{D}(n, \Delta, \delta')$ as a function of number of pre-change observations $n$ and confidence parameters of $\delta'$, for a fixed $\Delta$. As expected, for a fixed confidence value $\delta'$, increasing $n$ makes the detection time faster and for a given $n$, increasing $\delta'$ leads to a larger delay. 

The proof follows from a straightforward application of Theorem \ref{thm:main_mean_est} as follows. Let $n \in \mathbb{N}, \Delta > 0$ and $\delta' \in (0,1)$ be arbitrary. 

\begin{align}
    \mathbb{P}[ \mathcal{D}(n, \Delta, \delta') \geq d ] &=  \mathbb{P}[ \cap_{s=1}^{n+d} \mathcal{A}(X_{1:s}) = 0 ], \nonumber \\
    &=\mathbb{P}\left[ \bigcap_{s=1}^{n+d} \| \widehat{\theta}_{1:s} - \widehat{\theta}_{s+1:n+d} \|_2^2 \leq \mathcal{B}\left(s,\frac{\delta}{2(n+d)(n+d+1)}\right) + \mathcal{B}\left(n+d-s-1, \frac{\delta}{2(n+d)(n+d+1)}\right) \right], \nonumber\\
    &\leq \mathbb{P}\left[  \| \widehat{\theta}_{1:n-1} - \widehat{\theta}_{n:n+d} \|_2^2 \leq \mathcal{B}\left(n-1,\frac{\delta}{2(n+d)(n+d+1)}\right) + \mathcal{B}\left(d, \frac{\delta}{2(n+d)(n+d+1)}\right) \right].
    \label{eqn:delay_proof_1}
\end{align}

From the triangle inequality, we know that 
\begin{align}
    \| \widehat{\theta}_{1:n-1} - \widehat{\theta}_{n:n+d} \|_2^2 &\geq \| \theta_1 - \theta_2 \|_2^2 - \| \widehat{\theta}_{1:n-1} - \theta_1 \|_2^2 - \|  \widehat{\theta}_{n:n+d} - \theta_2 \|_2^2, \nonumber\\
    &= \Delta^2 - \| \widehat{\theta}_{1:n-1} - \theta_1 \|_2^2 - \|  \widehat{\theta}_{n:n+d} - \theta_2 \|_2^2. 
    \label{eqn:delay_proof_2}
\end{align}

Thus, substituting Equation (\ref{eqn:delay_proof_2}) into Equation (\ref{eqn:delay_proof_1}), we get that 
\begin{multline*}
  \mathbb{P}[ \mathcal{D}(n, \Delta, \delta') \geq d ] \leq     \mathbb{P}\bigg[ \Delta^2 - \| \widehat{\theta}_{1:n-1} - \theta_1 \|_2^2 - \|  \widehat{\theta}_{n:n+d} - \theta_2 \|_2^2 \leq \\ \mathcal{B}\left(n-1,\frac{\delta}{2(n+d)(n+d+1)}\right) + \mathcal{B}\left(d, \frac{\delta}{2(n+d)(n+d+1)}\right) \bigg].
\end{multline*}

Define the events $\mathcal{E}_i$ for $i \in \{1,2\}$ as
\begin{align*}
    \mathcal{E}_1 &:= \left\{ \| \widehat{\theta}_{1:n-1} - \theta_1 \|_2^2 > \mathcal{B}\left(n-1, \frac{\delta'}{2}\right) \right\}, \\
    \mathcal{E}_2 &:= \left\{ \| \widehat{\theta}_{n:n+d} - \theta_2 \|_2^2 > \mathcal{B}\left(d, \frac{\delta'}{2}\right) \right\}.
\end{align*}
Let $\mathcal{E} := \mathcal{E}_1 \cup \mathcal{E}_2$. Theorem \ref{thm:main_mean_est} gives that $\mathbb{P}[\mathcal{E}_1] \leq \frac{\delta'}{2(n(n+1))} \leq \frac{\delta'}{2}$ and $\mathbb{P}[\mathcal{E}_2] \leq \frac{\delta'}{2d(d+1)} \leq \frac{\delta'}{2}$. Thus, a union bound gives that $\mathbb{P}[\mathcal{E}] \leq \delta'$. Let $d' \in \mathcal{G}$ be arbitrary, where
\begin{align}
   \mathcal{G} := \bigg\{ d \in \mathbb{N} : \Delta^2 \geq \mathcal{B}\left( n-1, \frac{\delta'}{2} \right) + \mathcal{B}\left( d, \frac{\delta'}{2} \right)  + \mathcal{B}\left( n, \frac{\delta}{2(n+d+1)(n+d)} \right) +  \mathcal{B}\left( d,  \frac{\delta}{2(n+d+1)(n+d)} \right) \bigg\}
   \label{eqn:delay_proof_3}
\end{align}

\textbf{Claim}: If the event $\mathcal{E}^{c}$ holds, then $\mathcal{D}(n,\Delta, \delta') \leq d$ for all $d \in \mathcal{G}$. 

 Suppose $d \in \mathcal{G}$ and event $\mathcal{E}^c$ holds. Then, we know by triangle inequality in Equation (\ref{eqn:delay_proof_2}) that 

\begin{align}
    \| \widehat{\theta}_{1:n-1} - \widehat{\theta}_{n:n+d} \|_2^2 &\geq \| \theta_1 - \theta_2 \|_2^2 - \| \widehat{\theta}_{1:n-1} - \theta_1 \|_2^2 - \|  \widehat{\theta}_{n:n+d} - \theta_2 \|_2^2, \nonumber\\
    &= \Delta^2 - \| \widehat{\theta}_{1:n-1} - \theta_1 \|_2^2 - \|  \widehat{\theta}_{n:n+d} - \theta_2 \|_2^2, \\
    &\stackrel{(a)}{\geq} \Delta^2 -  \mathcal{B}\left(n-1, \frac{\delta'}{2}\right) -  \mathcal{B}\left(d, \frac{\delta'}{2}\right), \\
    &\stackrel{(b)}{\geq} \mathcal{B}\left( n, \frac{\delta}{2(n+d+1)(n+d)} \right) +  \mathcal{B}\left( d,  \frac{\delta}{2(n+d+1)(n+d)} \right).
    \label{eqn:delay_proof_4}
\end{align}
Step $(a)$ follows from the definition of event $\mathcal{E}$ and the assumption of the claim that event $\mathcal{E}^c$ holds. Step $(b)$ follows from the fact that $d \in \mathcal{G}$ (cf. Equation (\ref{eqn:delay_proof_3})). By Line $8$ of Algorithm \ref{algo:learn_model}, the last inequality implies that if no detection has been made till time $n+d$, then under the event $\mathcal{E}^c$, time step $d$ is a detection time. Since event $\mathcal{E}^c$ holds with probability at least $1-\delta'$, this concludes the proof. 


\begin{figure}
    \centering
    \includegraphics[width=0.4\linewidth]{plots/detection_delay.pdf}
    \caption{Plot of $ \mathcal{D}(n, \Delta, \delta')$ in Lemma \ref{lem:detection_delay} for fixed $\Delta=10, \delta=0.1$.}
    \label{fig:detection_delay}
\end{figure}


% \begin{figure}
%     \centering
%     \includegraphics[width=0.5\linewidth]{plots/refined_plots/empirical_heat_map.pdf}
%     \caption{Plot of observed delay $ \mathcal{D}(n, \Delta, \delta')$ for Pareto distribution $d=32$. As can be seen, the observed delay is much smaller than the worst case delay shown in Figure \ref{fig:heatmap}.}
%     \label{fig:empirical_heat_map}
% \end{figure}


 \subsection{Useful convexity based inequalities}
 
Let $f : \Theta \to \mathbb{R}$ be a strongly convex function with strong convexity parameters $0 < m \leq M < \infty$. Denote by $\theta^* := \arg\min_{\theta \in \Theta}f(\theta)$. Since $f(\cdot)$ is convex and $\Theta$ is convex and compact, the existence and uniqueness of $\theta^*$ are guaranteed. Strong convexity gives that for any $\widehat{\theta}_{t-1} \in \Theta$,
\begin{align}
   f({\theta}^*) \geq f(\widehat{\theta}_{t-1}) + \langle \nabla f(\widehat{\theta}_{t-1}),  \theta^*-\widehat{\theta}_{t-1}  \rangle + \frac{m}{2} \|  \theta^* -\widehat{\theta}_{t-1} \|_2^2.
\end{align}
Further, since $\theta^* = \arg\min_{\theta \in \Theta}f(\theta)$, we have that 
\begin{align*}
 f(\widehat{\theta}_{t-1}) - f(\theta^*) \geq \frac{m}{2}\| \widehat{\theta}_{t-1} - \theta^*\|_2^2.
\end{align*}
Putting these two together, we see that 
\begin{align}
    \langle \nabla f(\widehat{\theta}_{t-1}), \widehat{\theta}_{t-1} - \theta^* \rangle \geq m \|\widehat{\theta}_{t-1} - \theta^*\|_2^2.
    \label{eqn:convexity_gradient_bound}
\end{align}
We further use the following lemma. 
\begin{lemma}[Lemma $3.11$ from \citep{bubeck2015convex}]
Let $g : \mathbb{R}^d \to \mathbb{R}$ be an $M$-smooth and $m$-strongly convex function. Then for all $x,y \in \mathbb{R}^d$,
\begin{align*}
    \langle \nabla g(x) - \nabla g(y), x-y \rangle \geq \frac{mM}{M+m} \| x-y\|_2^2 + \frac{1}{M + m}\| \nabla g(x) - \nabla g(y) \|_2^2.
\end{align*}
\end{lemma}
By substituting $x = \widehat{\theta}_{t-1}$, $y = \theta^*$ and $g(\cdot) = f(\cdot)$ and by leveraging the fact that $\nabla f(\theta^*) = 0$, we get the inequality that 
\begin{align*}
    \langle \nabla f(\widehat{\theta}_{t-1}), \widehat{\theta}_{t-1} - \theta^* \rangle \geq \frac{mM}{m+M}\|\widehat{\theta}_{t-1} - \theta^* \|_2^2 + \frac{1}{M+m}\|\nabla f(\widehat{\theta}_{t-1}) \|_2^2. 
\end{align*}
Re-arranging, we see that 
\begin{align}
    \|\nabla f(\widehat{\theta}_{t-1}) \|_2^2 \leq (M+m) \langle \nabla f(\widehat{\theta}_{t-1}), \widehat{\theta}_{t-1} - \theta^*\rangle - mM \| \widehat{\theta}_{t-1} - \theta^*\|_2^2.
        \label{eqn:convexity_bound_inner_prod}
\end{align}

\section{Additional Simulations}

In Figure \ref{fig:sample_path}, we plot a sample path of observed data and mark out the true change-points and the time-instants detected by Algorithm \ref{algo:learn_model}. The plots indicate that although visually identifying the change in the means is hard, our change-point detection algorithm is able to consistently detect the changes across a variety of distribution families. 

\begin{figure*}[ht!]
\centering
% \begin{subfigure}{0.32\linewidth}
% \includegraphics[width=0.99\linewidth]{figs/mean_estimation_pareto.pdf}
% %\caption{}
% \end{subfigure}
\begin{subfigure}{0.32\linewidth}
\includegraphics[width=0.99\linewidth]{plots/multiple_normal_1.pdf}
\caption{Unit-variance Gaussian.}
\label{fig:fig1}
\end{subfigure}
\begin{subfigure}{0.32\linewidth}
\includegraphics[width=0.99\linewidth]{plots/multiple_pareto_1.pdf}
\caption{Pareto with $s=2.1$.}
\label{fig:fig2}
\end{subfigure}
\begin{subfigure}{0.32\linewidth}
\includegraphics[width=0.99\linewidth]{plots/multiple_pareto_2.pdf}
\caption{Pareto with $s=2.01$.}
\label{fig:f3}
\end{subfigure}
\begin{subfigure}{0.32\linewidth}
\includegraphics[width=0.99\linewidth]{plots/multiple_normal_pareto_1.pdf}
\caption{Alternate Pareto $s=2.01$ and Gaussian.}
\label{fig:f4}
\end{subfigure}
\begin{subfigure}{0.32\linewidth}
\includegraphics[width=0.99\linewidth]{plots/pareto_normal_2.pdf}
\caption{Alternate Pareto $s=2.01$ and Gaussian}
\label{fig:f5}
\end{subfigure}
\begin{subfigure}{0.32\linewidth}
\includegraphics[width=0.99\linewidth]{plots/normal_pareto_3.pdf}
\caption{Alternate Pareto $s=2.01$ and Gaussian}
\label{fig:f6}
\end{subfigure}
\begin{subfigure}{0.32\linewidth}
\includegraphics[width=0.99\linewidth]{plots/high_dim_pareto_1.pdf}
\caption{Pareto $s=2.01, d=15,\Delta=5$}
\label{fig:f7}
\end{subfigure}
\begin{subfigure}{0.32\linewidth}
\includegraphics[width=0.99\linewidth]{plots/high_dim_pareto_2.pdf}
\caption{Pareto $s=2.01, d=15,\Delta=2$}
\label{fig:f8}
\end{subfigure}
\caption{In all plots, we choose the change-point gap to be $\Delta=0.1$ and $\delta=0.05$, except in (g) and (h), where $\Delta=5$ and $2$, respectively. In plots $(g)$ and $(h)$, we plot the norm of the observed random vector and thus the Y-axis is non-negative. We see missed detections in Figures $(e)$ and $(h)$, with the last change-point on the right being missed. We do not observe false positives in these plots.}
\label{fig:sample_path}
\end{figure*}


\end{document}