% \documentclass{uai2023} % for initial submission
\documentclass[accepted]{uai2023} % after acceptance, for a revised
                        % version; also before submission to
                        % see how the non-anonymous paper
                        % would look like
%% There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2023} % ptmx math instead of Computer
                             % Modern (has noticable issues)
% \documentclass[mathfont=newtx]{uai2023} % newtx fonts (improves upon
                              % ptmx; less tested, no support)
% NOTE: Only keep *one* line above as appropriate, as it will be replaced
%       automatically for papers to be published. Do not make any other
%       change above this note for an accepted version.

%% Choose your variant of English; be consistent
\usepackage[american]{babel}
% \usepackage[british]{babel}

%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
\bibliographystyle{plainnat}
\renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} % amsmath with fixes and additions
% \usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{tikz} % nice language for creating drawings and diagrams

%% Provided macros
% \smaller: Because the class footnote size is essentially LaTeX's \small,
%           redefining \footnotesize, we provide the original \footnotesize
%           using this macro.
%           (Use only sparingly, e.g., in drawings, as it is quite small.)

%% Self-defined macros
\newcommand{\swap}[3][-]{#3#1#2} % just an example

\title{Nonconvex Stochastic Scaled Gradient Descent \\ and Generalized Eigenvector Problems}

% The standard author block has changed for UAI 2023 to provide
% more space for long author lists and allow for complex affiliations
%
% All author information is automatically removed by the class for the
% anonymous submission version of your paper, so you can already add your
% information below.
%
% Add authors
\author[1]{\href{mailto:<junchili@berkeley.edu>?Subject=Your UAI 2023 paper}{Chris Junchi Li}{}}
\author[1,2]{Michael I.~Jordan}
% Add affiliations after the authors
\affil[1]{%
Department of Electrical Engineering and Computer Sciences%
\\
UC Berkeley%
\\
Berkeley, California, USA
}
\affil[2]{%
Department of Statistics%
\\
UC Berkeley%
\\
Berkeley, California, USA
}

\usepackage{mathrsfs}
%\usepackage{enumerate}
\usepackage{algorithm}
\usepackage{algorithmic}
\usepackage{amsmath}
\usepackage{natbib}
\usepackage{amssymb}
\usepackage{bm}
\def\beq{\begin{equation} }\def\eeq{\end{equation} }\def\ep{\varepsilon}\def\1{\mathbf{1}}\def\cov{{\rm cov}}\def\var{{\rm var}}
\newcommand{\uu}{\bm{u}}
\newcommand{\vv}{{\bm{v}}}
\newcommand{\ww}{\bm{w}}
\newcommand{\cC}{\mathcal{C}}
\def\bzeta{\boldsymbol{\zeta}}
\def\bSigma{\boldsymbol{\mathbf{\Sigma}}}
\newcommand{\Exs}{\mathbb{E}}
\newcommand{\Ab}{\mathbf{A}}
\newcommand{\Bb}{\mathbf{B}}
\newcommand{\real}{\mathbb{R}}
\newcommand{\cS}{{\mathcal{S}}}
\newcommand{\cF}{{\mathcal{F}}}
\newcommand{\cH}{{\mathcal{H}}}
\newcommand{\cM}{\mathcal{\mathbf{M}}}
\newcommand{\cT}{{\mathcal{T}}}
\newcommand{\Ib}{\mathbf{I}}
\newcommand{\argmin}{\mathop{\mathrm{argmin}}}
\def\lipr{\delta}
\newcommand{\cV}{\mathcal{V}}
\def\rhoi{\rho}
\def\alphai{\mu}
\def\betai{\beta}
\def\eps{\epsilon}
\newcommand{\PP}{\mathbb{P}}
\newcommand{\bP}{\bm{P}}
\def\diag{\mathop{\mathrm{diag}}}
\def\ivv{\overline{\vv}}
\def\bchi{\boldsymbol{\chi}}
\newcommand{\bS}{\bm{S}}
\def\bxi{\boldsymbol{\xi}}
\newcommand{\bPhi}{\bm{\mathbf{\Phi}}}
\newcommand{\Xb}{\bm{X}}
\newcommand{\Yb}{\bm{Y}}
\def\deltai{\delta}
\newcommand{\cN}{\mathcal{N}}
\newcommand{\bR}{\bm{R}}
\newcommand{\bQ}{\bm{Q}}
\def\tbS{\widetilde{\bS}}
\def\tbP{\widetilde{\bP}}
\def\tDelta{\overline{\bm{\Delta}}}
\def\radius{r}
\newcommand{\ud}{d}
\def\kJ{\mathscr{J}}
\def\gammai{\gamma}
\newcommand{\cA}{\mathcal{A}}
\newcommand{\cI}{\mathcal{I}}
\newcommand{\cL}{\mathcal{L}}
\newcommand{\cO}{\mathcal{O}}
\newcommand{\cQ}{\mathcal{Q}}
\newcommand{\eb}{\mathbf{e}}
\newcommand{\ub}{\mathbf{u}}
\newcommand{\vb}{\mathbf{v}}
\newcommand{\wb}{\mathbf{w}}
\newcommand{\xb}{\mathbf{x}}
\newcommand{\yb}{\mathbf{y}}
\newcommand{\zb}{\mathbf{z}}
\newcommand{\ba}{\bm{a}}
\newcommand{\bb}{\bm{b}}
\newcommand{\bv}{\bm{v}}


\newtheorem{theorem}{Theorem}
\newtheorem{definition}[theorem]{Definition}
\newtheorem{assumption}{Assumption}
\newtheorem{proposition}[theorem]{Proposition}
\newtheorem{corollary}[theorem]{Corollary}
\newtheorem{lemma}[theorem]{Lemma}
\newtheorem{remark}[theorem]{Remark}


\usepackage{graphicx,subfigure}

\newcommand{\jmlrBlackBox}{\rule{1.5ex}{1.5ex}}
\providecommand{\BlackBox}{\jmlrBlackBox}
\newcommand{\jmlrQED}{\hfill\jmlrBlackBox\par\bigskip}
\newenvironment{proof}%
{%
\par\noindent{\bfseries\upshape Proof\ }%
}%
{\jmlrQED}


% %SPACING ORIGINAL
% \setlength{\textfloatsep}{5pt}% Remove \textfloatsep
% \setlength\floatsep{3pt}
% \setlength\intextsep{1pt}
% %\setlength{\bibsep}{0ex}
% %\nipsfinalcopy % Uncomment for camera-ready version
% \setlength{\abovecaptionskip}{1pt}
% \setlength{\belowcaptionskip}{1pt}
% \setlength{\parskip}{0.15em}
% %\usepackage[compact]{titlesec}
% %\titlespacing*{\section}
% %{0pt}{1ex}{1ex}
% %\titlespacing*{\subsection}
% %{0pt}{1ex}{1ex}
% %\titlespacing*{\subsubsection}
% %{0pt}{1ex}{1ex}


% \newtheorem{innercustom}{}
% \newenvironment{custom}[1]
%   {\renewcommand\theinnercustom{#1}\innercustom}
%   {\endinnercustom}

% \usepackage{enumitem}
% %\usepackage[dvipsnames]{xcolor}
% %\def\blue#1{}
% \def\blue#1{\textcolor{cyan}{#1}}
% %\def\green#1{}
% \def\green#1{\colorbox{green}{#1}}

%=Alternating
\def\red#1{}\def\pb{}%\usepackage{fullpage}
%\def\red#1{\textcolor{orange}{#1}}\def\pb{\newpage}\addtolength{\oddsidemargin}{-.5in}\addtolength{\evensidemargin}{-.5in}\addtolength{\textwidth}{1in}\addtolength{\topmargin}{-.5in}\addtolength{\textheight}{-4in}   %\usepackage{refcheck}

\newcommand{\cjlcomment}[1]{{\bf{{\color{cyan}{{Junchi {---} #1}}}}}}

% \onecolumn
\begin{document}








% \pagenumbering{roman}
% %\pb\pb\subsection{\blue{ANNOTATION I}}
% \blue{\textbf{NOTE PAGE}
% %\blue{\pb\textbf{NOTE PAGE}
% \\•
% \\•
% }
% \begin{enumerate}[leftmargin=0mm,label=(\arabic*)]
% \pb\item
% \end{enumerate}











% \blue{\pb\tableofcontents}\newpage%%%%====





%\section to \pb\section
%\subsection to \pb\subsection
%\subsubsection to \pb\subsubsection


\pagenumbering{arabic}
\maketitle

\begin{abstract}
Motivated by the problem of online canonical correlation analysis, we propose the \emph{Stochastic Scaled-Gradient Descent} (SSGD) algorithm for minimizing the expectation of a stochastic function over a generic Riemannian manifold. SSGD generalizes the idea of projected stochastic gradient descent and allows the use of scaled stochastic gradients instead of stochastic gradients. In the special case of a spherical constraint, which arises in generalized eigenvector problems, we establish a nonasymptotic finite-sample bound of $\sqrt{1/T}$, and show that this rate is minimax optimal, up to a polylogarithmic factor of relevant parameters. On the asymptotic side, a novel trajectory-averaging argument allows us to achieve local asymptotic normality with a rate that matches that of Ruppert-Polyak-Juditsky averaging. We bring these ideas together in an application to online canonical correlation analysis, deriving, for the first time in the literature, an optimal one-time-scale algorithm with an explicit rate of local asymptotic convergence to normality. Numerical studies of canonical correlation analysis are also provided for synthetic data.
\end{abstract}




%\paragraph{Keywords:}
%Nonconvex optimization, stochastic scaled-gradient descent, generalized eigenvector problem, canonical correlation analysis, Polyak-Juditsky trajectory averaging











\pb\section{Introduction}\label{sec:intro}
Nonconvex optimization has become the algorithmic engine powering many recent developments in statistics and machine learning. 
Advances in both theoretical understanding and algorithmic implementation have motivated the use of nonconvex optimization formulations with very large datasets, and the striking empirical discovery is that nonconvex models can be successful in this setting, despite the pessimism of classical worst-case analysis.
In this paper, we consider the following general constrained nonconvex optimization problem:
\beq\label{opt_non}
\min_{\vv}~F(\vv)
, \qquad \textnormal{subject to}~
\vv \in \cC
,
\eeq
where $F(\vv)$ is a smooth and possibly nonconvex objective function and $\cC$ is a feasible set. 
The workhorse algorithm in this setting is stochastic gradient descent (SGD) and its variants \citep{robbins1951stochastic, Qian1999On, duchi2011adaptive, kingma2015adam, zhang2016first}. 
Given an unbiased estimate $\widetilde {\nabla} F(\vv;\bzeta)$ of the gradient $\nabla F(\vv)$, SGD performs the following update at the $t$-th step ($t\ge 1$):
\beq\label{SGD}
\vv_t
	=
\Pi_{\cC}\left[ \vv_{t-1} - \eta \widetilde {\nabla} F(\vv_{t-1};\bzeta_t)
\right],
\eeq
where $\eta > 0$ is a step-size and $\Pi_{\cC}$ is a projection operator onto the feasible set $\cC$. 
SGD updates use only a single data point, or a small number of data points, and thus significantly reduce computational and storage complexities compared with offline algorithms, which require storing the full data set and evaluating the full gradient at each iteration.

In many applications, however, we do \textit{not} have access to an unbiased estimate of $\nabla F(\vv)$ when we restrict access to a small number of data points. 
Instead, for each $\vv \in \mathcal{C}$ we have access only to a stochastic vector $\Gamma(\vv;\bzeta)$ which is an unbiased estimate of some \emph{scaled-gradient}:
\beq\label{EEG}
\Exs_{\bzeta} \big[ \Gamma(\vv;\bzeta) \big]
	=
D(\vv) \nabla F(\vv)
,
\eeq
where $D(\vv)$ is a deterministic positive scalar that depends on the current state $\vv$, which we refer to as the~\emph{scaling factor}.
Examples of this setup arise most notably in generalized eigenvector (GEV) computation, which finds its applications in principal component analysis, partial least squares regression, Fisher's linear discriminant analysis, canonical correlation analysis (CCA), etc.
Despite this wide range of applications, and their particular relevance to large-scale machine learning problems, there exist few rigorous general frameworks for SGD-based online learning using such models.

Our approach is a conceptually straightforward extension of SGD. 
We propose to continue to use~\eqref{SGD} but with $\widetilde {\nabla} F(\vv_{t-1};\bzeta_t)$ replaced by $\Gamma(\vv_{t-1};\bzeta_t)$. 
We refer to this algorithm as the \emph{Stochastic Scaled-Gradient Descent} (SSGD) algorithm.
Specifically, at each step, SSGD performs the update:
\beq\label{SSGD}
\vv_t
=
\Pi_{\cC}\left[
\vv_{t-1} - \eta \Gamma(\vv_{t-1};\bzeta_t)
\right].
\eeq
We provide a theoretical analysis of this algorithm. 
While some of our analysis applies to the algorithm in full generality, our most useful results arise when we specialize to the online GEV problem. 
In this case we aim to minimize the generalized Rayleigh quotient given a unit spherical constraint:
\beq\label{GEV}
\min_{\vv}~
-\frac{\vv^\top \Ab \vv}{\vv^\top\Bb\vv}
	, \qquad \textnormal{subject to}~
\vv \in \real^d,\ \|\vv\| = 1
.
\eeq
The first-order derivative of the generalized Rayleigh quotient with respect to $\vv$ is
\beq\label{gradient_R}
\nabla_\vv \left[ -\frac{\vv^\top \Ab \vv}{\vv^\top \Bb \vv} \right]
	=
-\frac{(\vv^\top \Bb \vv) \Ab \vv - (\vv^\top \Ab \vv) \Bb \vv}{(1/2) (\vv^\top \Bb \vv)^2}
.
\eeq
As pointed out by recent works e.g.~\citet{arora2012stochastic}, the major stumbling block in applying SGD to this problem lies in obtaining an unbiased stochastic sample of the gradient~\eqref{gradient_R}, due to the fact that the objective function takes a fractional form of two expectations.
In our approach we circumvent this issue by simply replacing the denominator on the right-hand side of~\eqref{gradient_R} by the constant 1.
At each step we take $\widetilde{\Ab}$ and $\widetilde{\Bb}'$ as mutually independent and unbiased stochastic samples of $\Ab$ and $\Bb$ respectively and proceed with the following update:
\beq\label{eq:gev}
\begin{aligned}
\vv_t
&=
\Pi_{\cS^{d-1}} \left[
\vv_{t-1} + \eta \left(
	(\vv_{t-1}^\top \widetilde \Bb' \vv_{t-1}) \widetilde \Ab \vv_{t-1} \right.\right.
\\& \left.\left.\hspace{1.2in}
- (\vv_{t-1}^\top \widetilde \Ab \vv_{t-1}) \widetilde \Bb' \vv_{t-1} 
	\right)
\right]
.
\end{aligned}
\eeq
We refer to the rule \eqref{eq:gev} as an \textit{online GEV iteration}. 
In the special case where the stochastic sample $\widetilde{\Bb}'$ is taken as $\Ib$, \eqref{eq:gev} essentially reproduces Oja's online PCA algorithm \citep{oja1982simplified} with an incurred $O(\eta^2)$ higher-order error term.



To identify the iterative algorithm in \eqref{eq:gev} as a manifestation of SSGD, we rewrite the term in parentheses in the algorithm as follows (we set $\vv = \vv_{t-1}$ for brevity):
\beq\label{eq:gev2}
\begin{aligned}
&
(\vv^\top \widetilde \Bb' \vv) \widetilde \Ab \vv - (\vv^\top \widetilde \Ab \vv) \widetilde \Bb' \vv
\\&=
\frac{(\vv^\top \Bb \vv)^2}{2}
\cdot
\frac{
(\vv^\top \widetilde \Bb' \vv) \widetilde \Ab \vv - (\vv^\top \widetilde \Ab \vv) \widetilde \Bb' \vv
}{(1/2) (\vv^\top \Bb \vv)^2}
.
\end{aligned}
\eeq
It can be easily seen that the expectation of \eqref{eq:gev2} is a scaled gradient of the generalized Rayleigh quotient, with scaling factor $D(\vv)\equiv (\vv^\top \Bb \vv)^2/2$.
This approach, which has been referred to as \textit{double stochastic sampling} in the setting of kernel methods~\citep{dai2014scalable, dai2017learning}, makes it possible to develop an efficient stochastic approximation algorithm. 
Indeed, $\widetilde{\Ab}$ and $\widetilde{\Bb}'$ are often of rank one, so the computation of the matrix-vector products $\widetilde{\Ab} \vv$ and $\widetilde{\Bb}' \vv$ only involves inner products of vectors and is hence computationally efficient in the face of high dimensionality (i.e.,~when $d$ is large).



Our contributions relative to previous work on nonconvex stochastic optimization are as follows.
First, we propose a novel algorithm---the stochastic scaled-gradient descent (SSGD) algorithm---which generalizes the classical SGD algorithm and has a wider range of applications.
Second, we provide a local convergence analysis for spherical-constraint objective functions that are locally convex.
Starting with a warm initialization, our local convergence rate matches a known information-theoretic lower bound \citep{mei2018landscape}.
Third, by applying SSGD to the GEV problem, we give a positive answer to the question raised by~\citet{arora2012stochastic} regarding the existence of an efficient online GEV algorithm.
Specifically, in the case of CCA, our SSGD algorithm uses as few as two samples at each update and does not incur expensive intermediate computational cost, while achieving a polynomial convergence rate guarantee.













\paragraph{Related Literature}
The generalized eigenvector problem is at the core of many statistical problems such as principal component analysis~\citep{pearson1901lines,hotelling1933analysis}, canonical correlation analysis \citep{hotelling1936relations}, Fisher's linear discriminant analysis \citep{fisher1936use,welling2005fisher}, partial least squares regression~\citep{stone1990continuum}, sufficient dimension reduction \citep{li1991sliced}, mixture models~\citep{balakrishnan2017statistical}, along with their sparse counterparts.
Iterative algorithms for sparse principal component analysis have been proposed by \cite{ma2013sparse} and \cite{yuan2013truncated} as a special case of the eigenvalue problem:
by adding a soft-thresholding step to each power method step their algorithms achieve linear convergence.
In follow-up work, \cite{tan2018sparse} proposed a truncated Rayleigh flow algorithm to estimate the leading sparse generalized eigenvector that also achieves a linear convergence rate.
Additional work on generalized eigenvector computation includes \cite{ge2016efficient,allen2017doubly,yuan2019decomposition,ma2015finding,chaudhuri2009multi}.


Some recent work has focused on developing efficient online procedures for particular instances of generalized eigenvector problems, among which online principal and canonical eigenvectors estimation has been of particular interest.
Oja's online PCA iteration \citep{oja1982simplified}, which can be reproduced from \eqref{eq:gev} as the special case where $\widetilde{\Bb}'$ is taken as $\Ib$, up to an incurred $O(\eta^2)$ error term, has been shown to provably match the minimax information lower bound \citep{jain2016streaming, li2018near, allen2017first}.
There is also a rich literature on stochastic gradient methods for convex and nonconvex minimization that takes place on Riemannian manifolds \citep{ge2015escaping,zhang2016first};
we refer the readers to \citet{hosseini2020recent} for a recent survey study.
More related to our work, procedures for efficient online canonical eigenvectors estimation have been explored \citep{arora2017stochastic,gao2019stochastic,chen2019constrained}.
Among these works, \citet{gao2019stochastic} developed a streaming canonical correlation analysis (CCA) algorithm which involves solving a large linear system at each iteration, and independently~\citet{arora2017stochastic} proposed a different stochastic CCA algorithm which has temporal and spatial complexities that are quadratic in $d$. 
\citet{chen2019constrained} present a landscape analysis of GEV/CCA and provide a continuous-time insight for a class of primal-dual algorithms when the two matrices in GEV commute; the convergence analysis of \citet{chen2019constrained}, however, does \textit{not} directly translate to discrete-time convergence rate bounds and no explicit analysis has been provided when two matrices do \textit{not} commute.




In a recent paper, \citet{bhatia2018gen} studied the CCA problem and proposed a two-time-scale online iteration that they refer to as ``Gen-Oja.''
The notion of two-time-scale analysis has been used widely in stochastic control and reinforcement learning \citep{BORKAR,KUSHNER-YIN}, and the slow process in Gen-Oja is essentially Oja's iteration \citep{oja1982simplified} for online principal component estimation with Markovian noise \citep{shamir2016convergence,jain2016streaming,li2018near,allen2017first}.
\citet{bhatia2018gen} obtained a convergence rate under a bounded sample assumption that achieves the minimax rate $1/\sqrt{N}$ in terms of the sample size $N$.
In comparison, our proposed SSGD algorithm is a single time-scale algorithm with a single step-size and an extra requirement of two (independent) samples per iterate. 
The algorithm is minimax optimal with respect to local convergence and hence theoretically comparable with Gen-Oja.




\paragraph{Organization}
The rest of this paper is organized as follows. \S\ref{ssec:assu} states our settings and assumptions throughout the theoretical analysis of our paper. \S\ref{sec_local} presents our local convergence results under the warm initialization condition.
\S\ref{sec_global} presents our two-phase convergence results for arbitrary initialization.
\S\ref{sec_prj} investigates the asymptotic property of our algorithm.
\S\ref{sec:strictsaddle} uses the example of Canonical Correlation Analysis to demonstrate the practical computation and experimental performance of our algorithm.
\S\ref{sec_summary} summarizes the entire paper.
Due to space limitations, we relegate all of our theoretical analysis and secondary lemmas to the Appendix.
% \S\ref{sec_proof-gev} presents the proofs of .




\paragraph{Notation}
Unless indicated otherwise, $C$ denotes some positive, absolute constant which may change from line to line. 
For two sequences $\{a_n\}$ and $\{b_n\}$ of positive scalars, we denote $a_n \gtrsim b_n$ (resp.~$a_n \lesssim b_n$) if $a_n \ge C b_n$ (resp.~$a_n \le C b_n$) for all $n$, and $a_n \asymp b_n$ if $a_n \gtrsim b_n$ and $a_n \lesssim b_n$ hold simultaneously. 
We also write $a_n = O(b_n), a_n = \Theta(b_n), a_n = \Omega(b_n)$ as $a_n\lesssim b_n, a_n \asymp b_n, a_n \gtrsim b_n$, respectively.
We use $\|\vv\|$ to denote the $\ell_2$-norm of $\vv$. 
Let $\lambda_{\max}(\Ab)$, $\lambda_{\min}(\Ab)$ and $\|\Ab\|$ denote the maximal eigenvalue, the minimal eigenvalue and the operator norm of a real symmetric matrix $\Ab$, respectively.
We will explain other notation at its first appearance.




















\pb\section{Settings and Assumptions}\label{ssec:assu}
In this section, we present the settings and assumptions required by our theoretical analysis of the SSGD algorithm for nonconvex optimization.
To illustrate the core idea we focus on the case of a spherical constraint, $\vv\in\cS^{d-1}$, in which case our proposed SSGD iteration \eqref{SSGD} reduces to the following update:
\beq\label{PSSGD}
\vv_t		=	\Pi_{\cS^{d-1}}\left[ \vv_{t-1} - \eta \Gamma(\vv_{t-1};\bzeta_t) \right]
.
\eeq
Let $\cF_t = \sigma\big( \bzeta_s: s\le t \big)$ be the filtration generated by the stochastic process $\bzeta_t$.
Then, from \eqref{EEG}, we have $\Exs[ \Gamma(\vv_{t-1};\bzeta_t) \mid \cF_{t-1} ] = D(\vv_{t-1}) \nabla F(\vv_{t-1})$.
That is, the conditional expectation is a scaled gradient. 
The ensuing analysis is analogous to that of locally convex SGD given we have appropriate Lipschitz-smoothness of the scalar function $D(\vv)$, but it requires delicate treatment given that SSGD effectively has a varying step-size embodied in the scaling factor.



Following the classical theory of constrained optimization \citep{NOCEDAL-WRIGHT} we introduce a definition of \textit{manifold gradient} and \textit{manifold Hessian} in the presence of a unit spherical constraint, $\cC: c(\vv)= (1/2)(\vv^\top\vv - 1)=0$.\footnote{Here for notational simplicity we incorporate a factor of $1/2$.}
For this equality-constrained optimization problem, we utilize the method of Lagrange multipliers and introduce the following Lagrangian function:
$
L(\vv; \mu) = F(\vv) - \frac{\mu}{2} \left(\|\vv\|^2 - 1 \right)
.
$
We define the manifold gradient:

\vspace{-.13in}
\begin{small}
\beq\label{gvb}
g(\vv) = \nabla L(\vv; \mu)\big|_{\mu = \mu^*(\vv)}
=
\nabla F(\vv) - \frac{\vv^\top \nabla F(\vv)}{\|\vv\|^2} \vv
,
\eeq
\end{small}%
and the manifold Hessian:

\vspace{-.13in}
\begin{small}
\beq\label{Hvb}
\cH(\vv)
=
\nabla^2 L(\vv; \mu)\big|_{\mu = \mu^*(\vv)}
=
\nabla^2 F(\vv) - \frac{\vv^\top \nabla F(\vv)}{\|\vv\|^2} \Ib
,
\eeq
\end{small}%
where $\mu^*(\vv) = \|\vv\|^{-2} \vv^\top \nabla F(\vv)$ is the \textit{optimal Lagrangian multiplier} defined by

\vspace{-.13in}
\begin{small}
$$\begin{aligned}
\frac{\vv^\top \nabla F(\vv)}{\|\vv\|^2}
&=
\argmin_\mu \left\| \nabla L(\vv;\mu) \right\|
=
\argmin_\mu \left\| \nabla F(\vv) - \mu \vv \right\|
.
\end{aligned}$$
\end{small}%
For $\vv \in \cS^{d-1}$, we let $\cT(\vv) = \{\uu: \uu^\top\vv = 0\}$ denote the tangent space of $\cS^{d-1}$ at $\vv$.



To prove our main theoretical result, we need the following definitions and assumptions. 
We first define the Lipschitz continuity for a generic mapping:

\begin{definition}[Lipschitz Continuity]\label{defi:Lipschitz}
Let $\cM$ be a finite-dimensional normed vector space. 
The map $M: \real^d \to \cM$ is called $L_M$-Lipschitz if, for any two points $\vv, \vv' \in \real^d$,
$
\| M(\vv) - M(\vv') \|_{\cM}
\le
L_M \| \vv - \vv' \|,
$
where $\|\cdot\|_\cM$ is any norm properly defined in space $\cM$.
\end{definition}


In addition, we need the following assumption on the state-dependent scalar $D(\vv)$ and covariance matrix $\bSigma(\vv)$.
For a fixed $\vv$, define the state-dependent covariance $\bSigma(\vv)$ to be

\vspace{-.2in}
\begin{small}
\beq\label{eq:Sigma}
\begin{aligned}
&\,
\bSigma(\vv)
= 
\var\left( \Gamma(\vv;\bzeta) \right)
\\&=
\Exs\left[
\big( \Gamma(\vv;\bzeta) - D(\vv) \nabla F(\vv) \big)
%\right.\\&\hspace{1.45in}\left.
\big( \Gamma(\vv;\bzeta) - D(\vv) \nabla F(\vv) \big)^\top
\right]
.
\end{aligned}\eeq
\end{small}%
For the purposes of our analysis, we assume that the state-dependent parameter $D(\vv)$ and the Hessian $\nabla^2 F(\vv)$ are Lipschitz continuous within $\{\vv: \|\vv\| \le 1, \|\vv - \vv^*\| \le \lipr\}$, where $\vv^*$ is a local minimizer of the constrained optimization problem \eqref{GEV} and where $\lipr \in (0, 1]$ is a fixed constant.
Within this convex bounded compact space, we can also show that $F(\vv)$ and $\nabla F(\vv)$ are Lipschitz continuous.
We explicitly specify these constants in the following assumption.

\begin{assumption}[Smoothness Assumption]\label{ass:lda}
For any $\vv \in \{\vv: \|\vv\| \le 1, \|\vv - \vv^*\| \le \lipr\}$, we assume that $D(\vv)$ is $L_D$-Lipschitz, $F(\vv)$ is $L_F$-Lipschitz, $\nabla F(\vv)$ is $L_K$-Lipschitz and $\nabla^2 F(\vv)$ is $L_Q$-Lipschitz, where $L_D, L_F, L_K, L_Q$ are fixed positive constants.
\end{assumption}








We next impose a tail condition on the stochastic vectors $\Gamma(\vv_{t-1}; \bzeta_t), t\ge 1$, requiring them to be \textit{vector $\alpha$-sub-Weibull}, as stated in the following assumption:

\begin{assumption}[Sub-Weibull Tail]\label{ass:se}
For some fixed $\alpha \in (0,2]$ and for all $\vv\in \cC$, we assume that the stochastic vectors $\Gamma(\vv;\bzeta)$ satisfy $
\Exs\exp\left(
\|\Gamma(\vv;\bzeta)\|^\alpha / \cV^\alpha
\right) \le 2
$, where $\cV$ is called the \emph{sub-Weibull parameter} of stochastic vector $\Gamma(\vv;\bzeta)$.
\end{assumption}
Note here the sub-Weibull parameter is in the vector-norm sense instead of the maximal projected scalar sense.
The class of sub-Weibull distributions contains the  sub-Gaussian ($\alpha=2$) and sub-Exponential ($\alpha=1$) distribution classes as special cases \citep{wainwright2019high,kuchibhotla2018moving}.
Background on vector $\alpha$-sub-Weibull distributions (and the associated notion of Orlicz $\psi_\alpha$-norm) is provided in Appendix~\S\ref{sec:orlicz}.




























\pb\section{Local Convergence Analysis}\label{sec_local}
In this section we provide the main local convergence result for our SSGD algorithm.
Our local analysis is inspired by both generic \citep{ge2015escaping} and dynamics-based \citep{li2018near,li2021stochastic} analyses for nonconvex stochastic gradient descent, which we further adapt to our scaled-gradient setup.

For notational simplicity, we denote
\beq\label{rhoi}
\hspace{-.07in}
\begin{aligned}
&
D = D(\vv^*)
,
\\&
\rhoi = D \left( 2L_Q + \frac52 L_F + \frac92 L_K \right)
+
L_D (L_K + 2 L_F)
.
\end{aligned}
\eeq
For our local convergence analysis, we assume that the initialization $\vv_0$ falls into the neighborhood of a local minimizer $\vv^*$ of the constrained optimization problem; that is,
\beq\label{eq:warm}
\|\vv_0 - \vv^*\|
	\le
\min\left\{ \frac{D \alphai}{2^5 \rhoi}, \lipr \right\}
,
\eeq
where $\alphai$ denotes the minimum positive eigenvalue of the manifold Hessian $\cH(\vv^*)$:
$$
\vv_1^\top \cH(\vv^*) \vv_1 \ge \alphai
	,\quad
\forall \vv_1\in \cT(\vv^*) ~\textnormal{and}~\|\vv_1\|=1
.
$$
We note that the initialization condition \eqref{eq:warm} has a constant neighborhood radius that does not depend on dimension $d$.
In the ensuing Theorem \ref{theo_local} on local convergence, we take $\eps \in (0, 1)$ and define the following quantities:
\beq\label{Keta}
K_{\eta, \epsilon}
	\equiv
\left\lceil \log_2 \left\{
\frac{\sqrt{D^3 \alphai^3}}{2^5 \rhoi \cV \log^{\frac{\alpha + 2}{2\alpha}} \eps^{-1} \cdot \eta^{1 / 2}} 
\right\} \right\rceil
+ 1
,
\eeq
and for $\eta < 1/(D\alphai)$, define
\beq\label{Tstareta}
T_\eta^* 
	\equiv
\left\lceil
	\frac{2 \log 2}{-\log ( 1 - D\alphai \eta) }
\right\rceil
.
\eeq
We state our local convergence theorem.


\begin{theorem}[Local Convergence]\label{theo_local}
Given Assumptions \ref{ass:lda} and \ref{ass:se} as well as the initialization condition \eqref{eq:warm}, for any positive constants $\eta, \eps$ that satisfy the scaling condition
\beq\label{eq:eta_scaling}
\eta
\le
\min\left\{
\frac{D^3 \alphai^3}{2^{24} G_\alpha^2 \cV^2 \rhoi^2} \log^{-\frac{\alpha + 2}{\alpha}}\eps^{-1}
,~
\red{\frac12}\frac{1}{D \alphai}
\right\}
,
\eeq
and for any $T \ge K_{\eta, \epsilon} T_\eta^*$, there exists an event $\cH_{\ref{theo_local}}$ with
\beq\label{eq:tailprob}
\PP(\cH_{\ref{theo_local}})
\ge
1 - \left(
14 + 8 \left(\frac{3}{\alpha}\right)^{\frac{2}{\alpha}} \log^{- \frac{\alpha + 2}{\alpha}} \eps^{-1}
\right) T \eps
,
\eeq
such that on event $\cH_{\ref{theo_local}}$ the iterates generated by the SSGD algorithm satisfy for all $t \in [K_{\eta, \epsilon} T_\eta^*, T]$:
$$
\|\vv_t - \vv^*\|
\le
\frac{2^{\frac{17}{2}} G_\alpha \cV}{\sqrt{D \alphai}} \log^{\frac{\alpha + 2}{2\alpha}}\eps^{-1} \cdot \eta^{1 / 2}
,
$$
where $G_\alpha \equiv \log_2^{1 / \alpha}( 1 + e^{1 / \alpha})\left(1 + \log_2^{1 / \alpha}(1 + e^{1 / \alpha}) \right)$ is a positive factor depending on $\alpha$.
\end{theorem}
%
To prove Theorem \ref{theo_local}, we define $\bm{\Delta}_t$ as the projection of $\vv_t - \vv^*$ onto the tangent space $\cT(\vv^*)$, namely
$
\bm{\Delta}_t
=
(\Ib-\vv^*{\vv^*}^\top)(\vv_t-\vv^*)
.
$
We view every $T_\eta^* = \Theta\left( (D \alphai)^{-1} \eta^{-1} \right)$ iterations as one round and interpret $K_{\eta, \epsilon} = \Theta\left( \log \eta^{-1} \right)$ as the number of rounds.  Note that $K_{\eta, \epsilon} T_\eta^*$ can be interpreted as the burn-in time for $\vv_t$ to arrive in an $O(\eta^{1 / 2})$ neighborhood of the local minimizer $\vv^*$.
We present a proposition that provides an upper bound on $\|\bm{\Delta}_t\|$ over $T$ iterations and characterizes the descent in $\|\bm{\Delta}_t\|$ at the end of each round:

\begin{proposition}\label{prop:localconvexity}
Assume Assumptions \ref{ass:lda}, \ref{ass:se} and initialization condition \eqref{eq:warm} hold.
For any positive constants $\eta, \eps$ satisfying the scaling condition \eqref{eq:eta_scaling} and $T \ge 1$, with probability at least
$
1 - \left(
14 + 8 \left(\frac{3}{\alpha}\right)^{\frac{2}{\alpha}} \log^{- \frac{\alpha + 2}{\alpha}} \eps^{-1}
\right) T \eps
,
$
the algorithm iterates satisfy, for all $t \in [0, T]$,
\beq\label{eq:prop-Delta}
\|\bm{\Delta}_t\|
\le
\|\vv_t - \vv^*\|
\le
\sqrt{2} \|\bm{\Delta}_t\|
,
\eeq
and
\beq\label{eq:prop-maintain}
\hspace{-.17in}
\|\bm{\Delta}_t\|
\le
4 \max\left\{
\frac{\|\bm{\Delta}_0\|}{2}
,~
\frac{2^6 G_\alpha \cV}{\sqrt{D \alphai}} \log^{\frac{\alpha + 2}{2\alpha}} \eps^{-1} \cdot \eta^{1 / 2}
\right\}
.
\eeq
Moreover, if $T_\eta^* \in [0, T]$, we have:
\beq\label{eq:prop-halve}
\hspace{-.17in}
\|\bm{\Delta}_{T_\eta^*}\|
\le
\max\left\{
\frac{\|\bm{\Delta}_0\|}{2}
,~
\frac{2^6 G_\alpha \cV}{\sqrt{D \alphai}} \log^{\frac{\alpha + 2}{2\alpha}} \eps^{-1} \cdot \eta^{1 / 2}
\right\}
.
\eeq
\end{proposition}
The proof of Proposition \ref{prop:localconvexity} is provided in \S\ref{sec:localconvexity}.

By choosing an asymptotic regime such that $T \eps \log (1/\eps) \to 0$, Proposition \ref{prop:localconvexity} states that \eqref{eq:prop-Delta}, \eqref{eq:prop-maintain} and \eqref{eq:prop-halve} hold with probability tending to one.
On that high-probability event, \eqref{eq:prop-Delta} indicates that $\|\vv_t - \vv^*\|$ and its projection in the tangent space $\|\bm{\Delta}_t\|$ are bounded by each other up to constant factors,
\eqref{eq:prop-maintain} guarantees that $\|\bm{\Delta}_t\|$ does not exceed $\max\left\{ 2\|\bm{\Delta}_0\|, \Theta(\eta^{1 / 2})\right\}$---that is, $\vv_t$ stays in a neighborhood of the local minimizer $\vv^*$---and \eqref{eq:prop-halve} states that, for $\|\bm{\Delta}_0\| = \Omega(\eta^{1 / 2})$, $\|\bm{\Delta}_t\|$ decreases by half after $T_\eta^*$ iterations: $\|\bm{\Delta}_{T_\eta^*}\| \le \max\left\{ \|\bm{\Delta}_0\| / 2, \Theta(\eta^{1 / 2})\right\}$.


Proposition \ref{prop:localconvexity} studies $\bm{\Delta}_t$ in a single round, i.e., for $T_\eta^*$ iterations.
We are ready to provide the proof of Theorem \ref{theo_local} by applying Proposition \ref{prop:localconvexity} repeatedly for $K_{\eta, \epsilon}$ rounds, detailed as follows:






\paragraph{Proof of Theorem \ref{theo_local}}
\red{
The goal of this section is to prove Theorem \ref{theo_local}, which focuses on the case where initialization lies in some neighborhood of a local minimizer.
We aim to analyze the dynamics of the SSGD algorithm \eqref{PSSGD} that takes place on the unit sphere $\cS^{d-1}$.
Since the iteration admits the Markov property, we first analyze in each round of $\cO(\eta^{-1})$ iterates, and then repeatedly apply such arguments to conclude the theorem.
}
Since the algorithm iteration \eqref{SSGD} can be viewed as a (strong) discrete-time Markov process,
we recall the definition of $K_{\eta, \epsilon}$ in \eqref{Keta} and repeatedly apply Proposition \ref{prop:localconvexity} to the sequence of $\{\bm{\Delta}_t\}$ for $K_{\eta, \epsilon}$ rounds, initializing each round with the output $\bm{\Delta}_{T_\eta^*}$ from the previous round.
We adopt an adaptive argument of shrinkage in multiple rounds.

More specifically, for any $t \in [K_{\eta, \epsilon} T_\eta^*, T]$, we first apply \eqref{eq:prop-halve} in Proposition \ref{prop:localconvexity} for $K_{\eta, \epsilon}$ rounds, then apply \eqref{eq:prop-maintain} for $t - K_{\eta, \epsilon} T_\eta^*$ iterations, and use \eqref{eq:prop-Delta} to conclude that 
\begin{align*}
&
\|\vv_t - \vv^*\|
\le
\sqrt{2} \|\bm{\Delta}_t\|
\\&\le
\sqrt{2} \cdot 4 \max\left\{
\frac{\|\bm{\Delta}_{K_{\eta, \epsilon} T_\eta^*}\|}{2}
,
\frac{2^6 G_\alpha \cV}{\sqrt{D \alphai}} \log^{\frac{\alpha + 2}{2\alpha}} \eps^{-1} \cdot \eta^{1 / 2}
\right\}
\\&\le
4\sqrt{2}
\cdot
\max\left\{\frac{\|\bm{\Delta}_0\|}{2^{K_{\eta, \epsilon}}}
,
\frac{2^6 G_\alpha \cV}{\sqrt{D \alphai}} \log^{\frac{\alpha + 2}{2\alpha}} \eps^{-1} \cdot \eta^{1 / 2}
\right\}
\\&\le
\frac{2^{\frac{17}{2}} G_\alpha \cV}{\sqrt{D \alphai}} \log^{\frac{\alpha + 2}{2\alpha}} \eps^{-1} \cdot \eta^{1 / 2}
, 
\end{align*}
where the last inequality is due to initialization condition \eqref{eq:warm}.
Here $G_\alpha$ is a fixed positive factor depending on $\alpha$, as defined in Theorem~\ref{theo_local}.
By taking a union bound over $K_{\eta, \epsilon}$ rounds and $T - K_{\eta, \epsilon} T_\eta^*$ iterations, we obtain
$$
\PP(\cH_{\ref{theo_local}})
\ge
1 - \left(
14 + 8 \left(\frac{3}{\alpha}\right)^{\frac{2}{\alpha}} \log^{- \frac{\alpha + 2}{\alpha}} \eps^{-1}
\right) T \eps
,
$$
completing the proof of Theorem \ref{theo_local}.
\hfill$\square$







Theorem \ref{theo_local} establishes the local convergence of $\vv_t$ in a neighborhood of $\vv^*$ for a fixed step-size $\eta$ and a number of iterations $T \ge K_{\eta, \epsilon} T_\eta^*$.
The following corollary \red{of Theorem \ref{theo_local}} provides a finite-sample bound:

\begin{corollary}[Finite-Sample]\label{coro:main}
Assume Assumptions \ref{ass:lda} and \ref{ass:se} and the initialization condition \eqref{eq:warm}.
For fixed positive constants $\eps$ and sample size $T$, set the step-size as
$
\eta(T)
=
\Theta\left(
\frac{\log T}{D \alphai T}
\right)
$
satisfying scaling condition
$$
\begin{aligned}
&\eta(T)
\le
\min\left\{
\frac{D^3 \alphai^3}{2^{24} G_\alpha^2 \cV^2 \rhoi^2} \log^{-\frac{\alpha + 2}{\alpha}} \eps^{-1}
,~
\frac{1}{D \alphai}
\right\}
,
\end{aligned}
$$
there exists an event $\cH_{\ref{coro:main}}$ with
$
\PP(\cH_{\ref{coro:main}})
\ge
1 - \left(
14 + 8 \left(\frac{3}{\alpha}\right)^{\frac{2}{\alpha}} \log^{- \frac{\alpha + 2}{\alpha}} \eps^{-1}
\right) T \eps
,
$
such that on the event $\cH_{\ref{coro:main}}$ the iterates generated by the SSGD algorithm satisfy
$$
\|\vv_T - \vv^*\|
\lesssim
\frac{G_\alpha \cV}{D \alphai} \log^{\frac{\alpha + 2}{2\alpha}} \eps^{-1} \sqrt{\frac{\log T}{T}}
.
$$
\end{corollary}


We notice that our Theorem \ref{theo_local} and Corollary \ref{coro:main} provide a \emph{dimension-free} local convergence rate when $\cV$ is $O(1)$.
As we will see later in the example of CCA, the ($\alpha=1/2$) sub-Weibull parameter $\cV$ in that case scales with $\sqrt{d}$ and thus the local rate is the minimax-optimal rate $O(\sqrt{d/T})$ up to a polylogarithmic factor.
























\pb\section{Global Convergence Analysis}\label{sec_global}
In many situations, solving the warm initialization problem itself can be a difficult problem.
We borrow the techniques from \citet{ge2015escaping} and establish a global convergence result for \textit{escaping saddle points} via SSGD.
In this section we consider a variant of SSGD with a unit spherical constraint and equipped with an artificial noise injection step: 
let $\bm{n}_t$ be an independent spherical noise at each step that is independent of $\cF_{t-1}$ and $\bzeta_t$, and let
\beq\label{SSGDglobal}
\vv_t = \Pi_{\cS^{d-1}}\left[ \vv_{t-1} - \eta \widetilde {\nabla} F(\vv_{t-1};\bzeta_t) + \eta \bm{n}_t \right]
.
\eeq
Motivated by recent work on \textit{escaping saddle points} \citep{ge2015escaping,lee2016gradient,jin2019nonconvex}, one can show that the SSGD algorithm equipped with the aforementioned artificial noise injection escapes from all saddle points, and hence the initialization condition \eqref{eq:warm} can be dropped.


First, we generalize Assumption \ref{ass:lda} for local convergence to the following for global convergence:
\begin{assumption}[Global Smoothness and Boundedness]\label{ass_global}
For any $\vv \in \{\vv: \|\vv\| \le 1\}$, we assume that $D(\vv)$ is $L_D$-Lipschitz, $F(\vv)$ is $L_F$-Lipschitz, $\nabla F(\vv)$ is $L_K$-Lipschitz and $\nabla^2 F(\vv)$ is $L_Q$-Lipschitz.
Also, assume there exist $D_-, D_+>0$ such that $D_-\le D(\vv)\le D_+$ for all $\vv$.
\end{assumption}


\begin{definition}[Strict-Saddle Function]\label{defi_strictsaddle}
A twice differentiable function $F(\vv)$ with constraint $c(\vv) = 0$ is called an $(\alphai,\betai,\gammai,\deltai)$-strict-saddle function, if an arbitrary point $\vv$ with $c(\vv) = 0$ satisfies at least one of the following:

\begin{enumerate}[label=(\roman*)]
\item
$\|g(\vv)\| \ge \betai$;
\item
There is a local minimizer $\vv^*$ such that $\|\vv - \vv^*\| \le \deltai$.
Additionally, for all $\vv' \in B_{2\deltai}(\vv^*)$, we have 
$$
\vv_1^\top \cH(\vv') \vv_1 \ge \alphai
, 
\quad\forall \vv_1\in \cT(\vv') ~\textnormal{and}~\|\vv_1\|=1
.
$$

\item
There exists a unit vector $\vv_0\in \cT(\vv)$ such that $\vv_0^\top \cH(\vv) \vv_0 \le -\gammai$.
\end{enumerate}
\end{definition}

In what follows, we show that our algorithms can escape from all saddle points and thus the local initialization is no longer required.
We are ready to present the saddle-point escaping result:




\begin{theorem}[Escaping from Saddle Points]\label{theo:saddle}
Let Assumptions \ref{ass:se} and \ref{ass_global} hold.
Let $F(\vv)$ be a $(\alphai, \betai, \gammai, \deltai)$-strict-saddle function with finite sup-norm $\|F\|_\infty$.
Let
\beq\label{T1}
\hspace{-.1in}
T_1 = 
4\|F\|_\infty
\cdot
\left[
\min\left(
0.5 d L_G
,
\gammai 
\log^{-1} \left( \frac{6d\cV}{\sigma} \right)
\right) \cdot
\sigma^2 D_-^2 \eta^2
\right]^{-1}
.
\eeq
Then for any $\kappa>0$ and any step-size $\eta > 0$ satisfying
\beq\label{etamax}
\sqrt{2d\cV^2 L_G D_+\eta} \le \betai
,
\eeq
within $T_1 \cdot \lceil \log_2 (\kappa^{-1}) \rceil$ iterates, \eqref{SSGDglobal} outputs $\vv_t$ that satisfies (ii) in Definition \ref{defi_strictsaddle} with probability no less than $1-\kappa$.
\end{theorem}

The proof of Theorem \ref{theo:saddle} is collected in \S\ref{sec:escapingSSGD}.
Motivated by this saddle-point escaping result, one can run SSGD first with a \textit{burn-in} phase and once it enters the warm initialization region, one can re-run SSGD with step-sizes chosen so that the local convergence theorem applies immediately.
Using the strong Markov property and combining Theorems \ref{theo_local} and \ref{theo:saddle} we immediately obtain the following main theorem.
Recall that $T_1$ is defined as in \eqref{T1}.



\begin{theorem}[Two-Phase Global Convergence]\label{coro:saddle}
Let Assumptions \ref{ass:se} and \ref{ass_global} hold.
Let $\eta$ satisfy
\beq\label{eq:eta_scaling2}
\eta
	\le
\min\left\{
\frac{D^3 \alphai^3}{2^{24} G_\alpha^2 \cV^2 \rhoi^2} \log^{-\frac{\alpha + 2}{\alpha}}\eps^{-1}
,~
\red{\frac12}\frac{1}{D \alphai}
,~
\frac{\betai^2}{2d\cV^2 L_G D_+}
\right\}
,
\eeq
and for any $T \ge K_{\eta, \epsilon} T^*_\eta + T_1 \cdot \lceil \log_2(\kappa^{-1}) \rceil$, there exists an event $\cA_T$ with
$$
\PP(\cA_T)
	\ge
1 - \kappa - \left(
14 + 8 \left(\frac{3}{\alpha}\right)^{\frac{2}{\alpha}} \log^{- \frac{\alpha + 2}{\alpha}} \eps^{-1}
\right) T \eps
,
$$
such that on event $\cA_T$ the iterates generated by the SSGD algorithm satisfy for all $t\in \left[ K_{\eta, \epsilon} T^*_\eta + T_1 \cdot \lceil \log_2(\kappa^{-1}) \rceil, T\right]$
$$
\| \vv_t - \vv^* \|
	\le
\frac{2^{\frac{17}{2}} G_\alpha \cV}{\sqrt{D \alphai}} \log^{\frac{\alpha + 2}{2\alpha}}\eps^{-1} \cdot \eta^{1 / 2}
,
$$
where $G_\alpha \equiv \log_2^{1 / \alpha}( 1 + e^{1 / \alpha})\left(1 + \log_2^{1 / \alpha}(1 + e^{1 / \alpha}) \right)$ is a positive factor depending on $\alpha$.
\end{theorem}


Note that the class of strict-saddle functions is strictly more general than the setting covered by the local convergence result, Theorem \ref{theo_local}.
We find the final complexity by interpreting Theorem \ref{coro:saddle}.
In the asymptotic relations below we write out the dependency on $d,\eta$, and let $\cL$ be a generic quantity that only involves a polylogarithmic factor of $d$, $\eta$ and $T$, which is allowed to vary at each appearance.
From \eqref{Keta}, \eqref{Tstareta} and \eqref{T1} we have 
$$
K_{\eta, \epsilon} T^*_\eta		\asymp \cL \cdot \eta^{-1}
	,\quad
T_1 \cdot \lceil \log_2(\kappa^{-1}) \rceil		\asymp \cL \cdot d^{-1} \eta^{-2},
$$
and if $\cV$ is set as the model scaling $\sqrt{d}$, the iteration achieves a high-probability bound of $\cL\cdot \sqrt{d\eta}$ after $K_{\eta, \epsilon} T^*_\eta + T_1 \cdot \lceil \log_2(\kappa^{-1}) \rceil$ steps.
We conclude that under the scaling condition $\cL \cdot d / T \to 0$, if the total number of samples $T$ is given, we can optimize the choice of step-size $\eta = \eta(d,T)$ to conclude the following convergence rate results:


\begin{enumerate}[label=(\roman*)]
\item
\textbf{Local Convergence:}
Given a \textit{warm} initialization, and choosing $\eta(T) \asymp \cL \cdot (1/T)$, SSGD \eqref{SSGD} has the following \textit{local} convergence rate
$$
\| \vv_t - \vv^* \| \lesssim \cL\cdot \sqrt{\frac{d}{T}}
.
$$


\item
\textbf{Global Convergence:}
Given \textit{any} initialization, and choosing $\eta(T) \asymp \cL \cdot (1 / \sqrt{dT})$, SSGD with noise injection \eqref{SSGDglobal} has the following \textit{global} convergence rate
$$
\| \vv_t - \vv^* \| \lesssim \cL\cdot \sqrt[4]{\frac{d}{T}}
.
$$
\end{enumerate}
We defer the arguments for the proof to \S\ref{sec:escapingSSGD}, and turn to the application to GEV problem.


















\pb\section{Asymptotic Normality via Trajectory Averaging}\label{sec_prj}
In this section, we return to the warm initialization as in \S\ref{sec_local}.
\citet{ruppert1988efficient} and \citet{polyak1992acceleration} introduced the idea of trajectory averaging for stochastic gradient descent in order to provide fine-grained convergence rates along with an asymptotic normality result.
Our goal is to generalize the Polyak-Juditsky analysis of SGD with trajectory averaging to SSGD for a nonconvex objective that is initialized in a local convex region.
We denote $\cH_* \equiv \cH(\vv^*), \bSigma_* \equiv \bSigma(\vv^*)$ and $D \equiv D(\vv^*)$.
Define 
$$
\cM_*
=
(\Ib - \vv^* {\vv^*}^\top) \cH_* (\Ib - \vv^* {\vv^*}^\top)
.
$$
From the initialization condition \eqref{eq:warm}, we have $\uu^\top \cM_* \uu \ge \alphai \|\uu\|^2$ for all $\uu \in \cT(\vv^*)$.
We consider the eigendecomposition $\cM_* = \bP \diag(\lambda_1, \ldots, \lambda_{d - 1}, 0) \bP^\top$ for an orthogonal matrix $\bP \in \real^{d \times d}$ and eigenvalues $\lambda_1 \ge \ldots \ge \lambda_{d - 1} > 0$ with minimum positive eigenvalue $\lambda_{d - 1} \ge \alphai$.
We take the inverse of all positive eigenvalues and define the following matrix
\beq\label{eq:M*_inverse}
\cM_*^-
\equiv
\bP \diag(\lambda_1^{-1}, \ldots, \lambda_{d - 1}^{-1}, 0) \bP^\top
.
\eeq
Here, $\cM_*^-$ can be interpreted as the inverse of $\cM_*$ in the $(d-1)$-dimensional tangent space $\cT(\vv^*)$, and we can easily find $\cM_*^- \vv^* = \bm{0}$.
As shown in Theorem \ref{theo_local}, we need $K_{\eta, \epsilon} T_\eta^*$ iterations for $\vv_t$ to fall in a $\Theta(\eta^{1 / 2})$ neighborhood of the local minimizer $\vv^*$.
For $T \ge K_{\eta, \epsilon} T_\eta^*$, we define the trajectory average over time $K_{\eta, \epsilon} T_\eta^* + 1, \ldots, T$ as follows:
\beq\label{eq:ave}
\ivv_T^{(\eta)}
\equiv
\frac{1}{T - K_{\eta, \epsilon} T_\eta^*} \sum_{t = K_{\eta, \epsilon} T_\eta^* + 1}^T \vv_t
,
\eeq
where we add the superscript $(\eta)$ to emphasize the dependency on $\eta$.
Notice that $\{\ivv_T^{(\eta)}\}_{T, \eta}$ is a triangular array over a continuum $\eta$.
To obtain asymptotic normality of the trajectory average $\ivv_T^{(\eta)}$, we additionally make the following local Lipschitz-continuity assumption on the stochastic scaled-gradient $\Gamma(\vv; \bzeta)$ in the neighborhood of $\vv^*$:
\begin{assumption}[Mean-Squared Smoothness]\label{ass:ssg-lip}
There exists a positive constant $L_S$ such that for all $\vv, \vv' \in \{\vv : \|\vv\| \le 1, \|\vv - \vv^*\| \le \lipr\}$ and $t \ge 1$, we have for $\bzeta$
\beq\label{eq:ssg_lip}
\Exs\left\| \Gamma(\vv; \bzeta) - \Gamma(\vv'; \bzeta) \right\|^2
\le
L_S^2 \|\vv - \vv'\|^2
.
\eeq
\end{assumption}


The following theorem states that the trajectory average $\ivv_T^{(\eta)}$ converges in distribution to a $(d-1)$-dimensional normal distribution in the tangent space $\cT(\vv^*)$:

\begin{theorem}[Asymptotic Normality]\label{theo:asympnorm}
Given Assumptions \ref{ass:lda}, \ref{ass:se}, \ref{ass:ssg-lip} and initialization condition \eqref{eq:warm}, if we choose the step-size $\eta$ such that $\eta \rightarrow 0$ as the total sample size $T \rightarrow \infty$, where
\beq\label{eq:condition}
T \eta^2 \log^{\frac{2\alpha + 4}{\alpha}} T \rightarrow 0
	,\quad
T \eta \log^{- \frac{\alpha + 2}{\alpha}} T \rightarrow \infty
	\quad\text{a.s.}
,
\eeq
we obtain Gaussian convergence in distribution:
\beq\label{eq:an}
\sqrt{T} \left( \ivv_T^{(\eta)} - \vv^* \right)
	\stackrel{d}{\rightarrow} 
\mathcal{N}\left(\mathbf{0}, D^{-2} \cdot \cM_*^- \bSigma_* \cM_*^- \right)
.
\eeq
\end{theorem}
We relegate the proof details of Theorem \ref{theo:asympnorm} to \S\ref{sec_proof,theo:asympnorm}.%
\footnote{The limiting distribution is supported on a submanifold of the Euclidean space $\real^d$. The convergence in distribution is hence rigorously characterized by the pointwise convergence of the characteristic functions.}
The analysis has the same rationale as the classical asymptotic normality result that is obtained when minimizing a strongly convex objective function in a Euclidean space using stochastic gradient descent \citep{ruppert1988efficient,polyak1992acceleration}.
Indeed, in the case of a diminishing step-size, $\eta(t) \propto t^{-\alpha}$, $\alpha\in (1/2,1)$, SGD with trajectory averaging converges in distribution to a normal distribution.
In contrast, due to our choice of a constant step-size that is asymptotically small with $\eta \propto T^{-\alpha}$ up to a polylogarithmic factor, we base our analysis on the idea that trajectory averaging begins only after ``the burn-in phase''; that is, after $K_{\eta,\ep} T_\eta^*$ iterates.



























































































\pb\section{Case Studies of Canonical Correlation Analysis}\label{sec:strictsaddle}
The GEV problem arises in many statistical machine learning tasks.
We focus on the example of (rank-one) Canonical Correlation Analysis (CCA) as a core application;
we refer to \citet{tan2018sparse} for other (sparse, high-dimensional) applications including linear discriminant analysis and sliced inverse regression.
%
Recall that CCA aims at maximizing the correlation between two transformed vectors.
Given $\Xb$ and $\Yb$ as two column vectors, let $\bSigma_{\Xb\Yb}$ be the cross-covariance matrix between $\Xb$ and $\Yb$, and let $\bSigma_{\Xb\Xb}$ and $\bSigma_{\Yb\Yb}$ be the covariance matrices of $\Xb$ and $\Yb$, respectively. CCA is a special case of the GEV problem~\eqref{GEV} with
$$
\Ab= \begin{pmatrix}
\mathbf{0} & \bSigma_{\Xb\Yb} 
\\
\bSigma_{\Yb\Xb} & \mathbf{0}
\end{pmatrix}
,
\quad
\Bb= \begin{pmatrix}
\bSigma_{\Xb\Xb} &\mathbf{0}
\\
\mathbf{0} & \bSigma_{\Yb\Yb}
\end{pmatrix}
.
$$
To obtain $\widetilde{\Ab}, \widetilde{\Bb}'$ as mutually independent and unbiased stochastic samples of $\Ab$ and $\Bb$, we draw two independent pairs of samples $(\Xb, \Yb), (\Xb', \Yb')$ at each iteration and compute
$$\begin{aligned}
\widetilde{\Ab}= \begin{pmatrix} 
\mathbf{0}		& \Xb \Yb^\top
\\
\Yb \Xb^\top	& \mathbf{0} 
\end{pmatrix}
,
\quad
\widetilde{\Bb}'= \begin{pmatrix}
\Xb' \Xb'^\top	&\mathbf{0}
\\
\mathbf{0}		& \Yb' \Yb'^\top
\end{pmatrix},
\end{aligned}$$
where all samples of $\Xb, \Yb$ are centered such that they have expectation zero.


\begin{algorithm}[!tb]
\caption{Online Canonical Correlation Analysis via Noise-Injected Stochastic Scaled-Gradient Descent}
\begin{algorithmic}
\STATE
\textbf{input} total sample size $T$, proper stepsize $\eta$, initialize $\vv_0$
\FOR{$t = 1, \ldots, T / 2$}
\STATE
Draw mutually independent sample pairs $(\Xb, \Yb)$ and $(\Xb', \Yb')$ from the stochastic oracle
\STATE
Compute unbiased estimates

\vspace{-.13in}
\begin{small}
$$
\widetilde{\Ab}= \begin{pmatrix} 
\mathbf{0}		& \Xb \Yb^\top
\\
\Yb \Xb^\top	& \mathbf{0}  
\end{pmatrix}
\quad
\widetilde{\Bb}'= \begin{pmatrix}  
\Xb' \Xb'^\top	& \mathbf{0}
\\
\mathbf{0}		& \Yb' \Yb'^\top  
\end{pmatrix}
$$
\end{small}%
\STATE
Sample a uniformly spherical noise $\bm{n}_t$ of covariance $\sigma^2 \Ib_d$ and update $\bm{g}_t, \vv_t$ using the following rule
$$\begin{aligned}
\bm{g}_t
&\leftarrow
(\vv_{t - 1}^\top \widetilde{\Bb}' \vv_{t - 1}) \widetilde{\Ab} \vv_{t - 1} - (\vv_{t - 1}^\top \widetilde{\Ab} \vv_{t - 1}) \widetilde{\Bb}' \vv_{t - 1}
\\
\vv_t
&\leftarrow
\Pi_{\cS^{d - 1}} \left[
\vv_{t - 1}
+
\eta (\bm{g}_t+\bm{n}_t)
\right]
\end{aligned}$$
\ENDFOR
\STATE
\textbf{return} $\vv_{T/2}$
\end{algorithmic}
\label{algo:cca}
\end{algorithm}



In order to apply the convergence results for the SSGD algorithm to the CCA problem, it remains to verify Assumption \ref{ass:se}.
We assume that the samples $\Xb \in \real^{d_x}, \Yb \in \real^{d_y}$ follow sub-Gaussian distributions \citep{gao2019stochastic, li2018near} with parameters $\cV_x, \cV_y$; that is,
$
\Exs\exp\left( \|\Xb\|^2 / \cV_x^2 \right)  \le  2
$ and $
\Exs\exp\left( \|\Yb\|^2 / \cV_y^2 \right)  \le  2
.
$
With these standard assumptions for the samples $\Xb, \Yb$, the following proposition shows that the scaled-gradient noise in the CCA problem satisfies Assumption \ref{ass:se} with appropriate $\cV$ and $\alpha$.
The proof is provided in \S\ref{sec_proof,prop_cca_subweibull}.

\begin{proposition}\label{prop_cca_subweibull}
Assumption \ref{ass:se} holds for CCA with parameters $\cV = 400 (\cV_x^2 + \cV_y^2) \cV_x \cV_y$ and $\alpha = 1/2$.
\end{proposition}
%
Propositions \ref{prop_gev_lipschitz} and \ref{prop_cca_subweibull} certify that Assumptions \ref{ass:lda} and \ref{ass:se} hold in CCA settings and hence the local convergence result, Corollary \ref{coro:main}, applies, which establishes a $\sqrt{d/T}$-rate up to a polylogarithmic factor, since the vector sub-Weibull parameter $\cV$ in our Assumption \ref{ass:se} implicitly contains a factor $\sqrt{d}$.



\red{\paragraph{Matching the lower bound of \citet{gao2019stochastic}}}
Now we demonstrate that our bounds in Corollary \ref{coro:main} match the lower bound.
\citet{gao2019stochastic} derived a lower bound for Gaussian variables, $1 - \mathrm{align}(\vv, \vv^*) \gtrsim d / T$, in terms of a new measure of error:
$$\begin{aligned}
\mathrm{align}(\vv, \vv^*)
&\equiv
\frac12 \left(
\frac{\vv_x^\top \bSigma_{\Xb \Xb} \vv_x^*}{\sqrt{{\vv_x}^\top \bSigma_{\Xb \Xb} \vv_x} \sqrt{{\vv_x^*}^\top \bSigma_{\Xb \Xb} \vv_x^*}}
\right.\\&\left.\hspace{.5in}
+
\frac{\vv_y^\top \bSigma_{\Yb \Yb} \vv_y^*}{\sqrt{{\vv_y}^\top \bSigma_{\Yb \Yb} \vv_y} \sqrt{{\vv_y^*}^\top \bSigma_{\Yb \Yb} \vv_y^*}}
\right)
,
\end{aligned}$$
where $\vv = (\vv_x^\top, \vv_y^\top)^\top$ and ${\vv^*} = ({\vv_x^*}^\top, {\vv_y^*}^\top)^\top$ are partitioned in dimensions $d_x, d_y$.
It is easy to verify that $1 - \mathrm{align}(\vv, \vv^*) \asymp 1 - \vv^\top \vv^* = \|\vv - \vv^*\|^2 / 2$ when both $\vv, \vv^*$ lie on the unit sphere, in which case this lower bound translates into $\|\vv_T - \vv^*\| \gtrsim \sqrt{d / T}$ for any estimator $\vv_T$ that consumes $T$ samples, which matches the upper bound of Corollary \ref{coro:main} in terms of both $d$ and $T$.



We note that our Corollary \ref{coro:main} and the results of \citet{gao2019stochastic} have different dimension dependency, which is due to a distinct but connected set of assumptions.
We have assumed that each sample $\Xb, \Yb$ follows a vector sub-Gaussian distribution and verifies Assumption \ref{ass:se} required by Proposition \ref{prop_cca_subweibull}, whereas \citet{gao2019stochastic} assume that each coordinate of $\Xb, \Yb$ is sub-Gaussian with a constant parameter.
Hence, the vector sub-Gaussian parameter $\cV$ in our case suffers a dimension-dependent prefactor.










\red{
\paragraph{Linear Discriminant Analysis (LDA)}
LDA is a feature extraction method to reduce the dimension of the dataset while preserving classification information.
We denote the estimation of the between-class scatter matrix as $\widehat {\mathbf{\Sigma}}_b$ and the estimation of the within-class scatter matrix as $\widehat {\mathbf{\Sigma}}_w$.
The estimation is calculated using a batch of streaming data $\widehat {X}$ and their corresponding labels $\widehat {Y}$.
The LDA problem can be expressed as solving the following minimization problem:
\beq\label{LDA}
\min_{\vv}
-\frac{\vv^\top \widehat {\mathbf{\Sigma}}_b \vv}{\vv^\top\widehat {\mathbf{\Sigma}}_w\vv}
, \quad \textnormal{subject to}~
\vv \in \mathcal{S}^{d-1}
.
\eeq
The SSGD algorithm for solving an LDA problem updates via the following equation:
$$
\vv_t
=
\Pi_{\cS^{d-1}}\left[
\vv_{t-1} + \eta \left((\vv_{t-1}^\top\widehat {\mathbf{\Sigma}}_w \vv_{t-1})\widehat {\mathbf{\Sigma}}_b \vv_{t-1} - (\vv_{t-1}^\top\widehat {\mathbf{\Sigma}}_b \vv_{t-1})\widehat {\mathbf{\Sigma}}_w \vv_{t-1}\right)
\right]
,
$$
which is a special case of~\eqref{eq:gev} when $\widetilde {\Ab} = \widehat {\mathbf{\Sigma}}_b, \widetilde {\Bb}= \widehat {\mathbf{\Sigma}}_w$.
%
%
%
%
%
%
%
%
\paragraph{Sliced Inverse Regression (SIR)}
Assume  the following model
$$
Y = f(\vv_1^\top \Xb, \cdots, \vv_k^\top\Xb, \mathbf{\epsilon}),
$$
where $\Xb\in \mathbb{R}^d$ is a $d$-dimensional vector and $f\colon \mathbb{R}^{k+1}\rightarrow \mathbb{R}$ is an unknown function, $Y\in\mathbb{R}$ is a scalar and $\mathbf{\epsilon}$ is a random noise.
SIR aims to recover the regression subspace spanned by $\{\vv_1, \ldots, \vv_k\}$.
The first eigenvector of such space can be obtained by minimizing the following quantity:
\beq\label{SIR}
\min_{\vv}
-\frac{\vv^\top \widehat {\mathbf{\Sigma}}_{E(\Xb\mid Y)} \vv}{\vv^\top\widehat {\mathbf{\Sigma}}_{\Xb\Xb}\vv}
, \quad \textnormal{subject to}~
\vv \in \mathcal{S}^{d-1}
,
\eeq
where $\widehat {\mathbf{\Sigma}}_{\Xb\Xb}$ is an estimate of the covariance matrix of $\Xb$ and $\widehat {\mathbf{\Sigma}}_{E(\Xb\mid Y)}$ is an estimate of the covariance matrix of the expectation $E(\Xb\mid Y)$.
%
The SSGD algorithm for SIR reduces to 
$$
\vv_t
=
\Pi_{\cS^{d-1}}\left[
\vv_{t-1} + \eta \left((\vv_{t-1}^\top\widehat {\mathbf{\Sigma}}_{\Xb\Xb} \vv_{t-1})\widehat {\mathbf{\Sigma}}_{E(\Xb\mid Y)} \vv_{t-1} - (\vv_{t-1}^\top\widehat {\mathbf{\Sigma}}_{E(\Xb \mid Y)} \vv_{t-1})\widehat {\mathbf{\Sigma}}_{\Xb\Xb} \vv_{t-1}\right)
\right]
,
$$
which is a special case of~\eqref{eq:gev} when $\widetilde {\Ab} = \widehat {\mathbf{\Sigma}}_{E(\Xb\mid Y)}, \widetilde {\Bb}= \widehat {\mathbf{\Sigma}}_{\Xb\Xb}$.
}

\red{Lower bound \citet{gao2019stochastic}}




















\red{Should have a corollary here}



\pb\subsection{Numerical Studies using Synthetic Data}\label{sec:simulation}
\begin{figure}[!tb]
\centering
\subfigure[]{
\label{fig:subfig:a} %% label for first subfigure
\includegraphics[width=1.5in]{pic/sad_l21.png}}
\subfigure[]{
\label{fig:subfig:b} %% label for second subfigure
\includegraphics[width=1.5in]{pic/rand_l21.png}}
\subfigure[]{
\label{fig:subfig:c} %% label for third subfigure
\includegraphics[width=1.5in]{pic/sad_sin1.png}}
\subfigure[]{
\label{fig:subfig:d} %% label for fourth subfigure
\includegraphics[width=1.5in]{pic/rand_sin1.png}}
\caption{Comparison between saddle point initialization and random initialization}
\label{fig:saddle} %% label for entire figure
\end{figure}



In this subsection, we present simulation results for SSGD for the case of rank-one CCA [Algorithm \ref{algo:cca}].
% Recall that in Algorithm \ref{algo:cca} we draw at each iteration two independent pairs of samples, $(\Xb, \Yb)$ and $(\Xb', \Yb')$, and the stochastic oracles $\tilde{\Ab}, \tilde{\Bb}'$ are expressed as
% $$
% \widetilde{\Ab}= \begin{pmatrix} 
% \mathbf{0}		& \Xb \Yb^\top
% \\
% \Yb \Xb^\top	& \mathbf{0}  
% \end{pmatrix}
% \quad
% \widetilde{\Bb}'= \begin{pmatrix}  
% \Xb' \Xb'^\top	& \mathbf{0}
% \\
% \mathbf{0}		& \Yb' \Yb'^\top  
% \end{pmatrix}.
% $$
The dimensions of the synthetic data samples are picked as $d_1 = 65$ of $\Xb$ and $d_2 = 70$ of $\Yb$.
We generate the covariance matrix for $\Xb, \Yb$ as
\begin{equation}
\bSigma_{\Xb\Xb} = 3\mathbf{I}_{d_1} + \Ab_1
	,\quad
\bSigma_{\Yb\Yb} = 3\mathbf{I}_{d_2} + \Ab_2
,
\end{equation}
where $\Ab_1, \Ab_2$ are diagonal matrices with each entry along the diagonal obtained as an independent uniform draw from $[0, 1]$. 
To ensure the eigengap of $\bSigma_{\Xb\Xb}^{-\frac{1}{2}}\bSigma_{\Xb\Yb}\bSigma_{\Yb\Yb}^{-\frac{1}{2}}$ is significantly large, in particular, no less than $0.5$, we set
\begin{equation}
\bSigma_{\Xb\Yb}
= 
\Ab_3 + \bSigma_{\Xb\Xb}^{1/2}\mathbf{U}\diag(0.5, \mathbf{O})\mathbf{V}^\top \bSigma_{\Yb\Yb}^{1/2}
.
\end{equation}
Here $\Ab_3$ is a $d_1\times d_2$ matrix where each entry is generated from an independent $N(0, 1/(d_1+d_2))$ variable, with singular value decomposition
$
\bSigma_{\Xb\Xb}^{-1/2} \Ab_3 \bSigma_{\Yb\Yb}^{-1/2}
=
\mathbf{U}\mathbf{D}\mathbf{V}^\top
$, and $\mathbf{O}$ is a $(d_1 - 1) \times (d_2 - 1)$ zero matrix.
Note that each step of Algorithm \ref{algo:cca} can be computed in time $\cO(d_1+d_2)$.
Given this setup, we report our numerical findings of Algorithm \ref{algo:cca} as follows:











\paragraph{Saddle-point escaping}
We first discuss the behavior of our algorithm in the presence of saddle points. 
When $\vv_0$ is exactly chosen as a saddle point, we show that SSGD escapes from a plateau of saddle points in the landscape and converges to the local (and global) minimizer. 
For illustrative purposes, the initialization $\vv_0$ is chosen from four saddle points, each of which corresponds to a component of CCA.
We choose the total sample size $T = \text{1e6}$ and set the (constant) step-size $\eta = \log(T)/(5T)$.
In Figure~\ref{fig:saddle} we plot the error of the current solution to the optimal solution, where the error is measured both in squared Euclidean distance and in sine-squared.
Figures~\ref{fig:subfig:a} and~\ref{fig:subfig:c} show the behavior initialized from four different saddle points, and Figures~\ref{fig:subfig:b} and~\ref{fig:subfig:d} show the behavior initialized from four uniform seeds.
The horizontal axis is the number of iterates and the vertical axis is error $\|\vv_t - \vv^*\|^2$. 

\begin{figure}[!tb]
\centering
\subfigure[]{
\label{fig:linear:a} %% label for first subfigure
\includegraphics[width=1.5in]{pic/l2_lr1.png}}
\subfigure[]{
\label{fig:linear:b} %% label for second subfigure
\includegraphics[width=1.5in]{pic/sin_lr1.png}}
\caption{%
Log-log plot regarding the convergence with respect to a range of step-sizes $\eta$.
Figure \ref{fig:linear:a} illustrates the squared errors in terms of squared distance to optimality $\|\vv - \vv^*\|^2$, and Figure \ref{fig:linear:b} does so in terms of $\sin^2(\vv,\vv^*)$%
}
\label{fig:linear}
\end{figure}
% From Figure~\ref{fig:saddle} the algorithm efficiently escapes from saddle points and the error significantly drops down at a random time, exhibiting a local-global-local three-phase behavior.
% In the Initial Phase, the algorithm gradually escapes from the saddle point; 
% In the Transient Phase, the algorithm quickly  moves towards to the optimum; 
% In the Fluctuation Phase, the algorithm fluctuates around to the optimum.
% For randomly chosen $\vv$, the Initial Phase of random initialization is shorter than the Initial Phase of saddle point initialization.







\paragraph{Relationship between the step-size and squared error}
We study the role of step-size $\eta$ in our SSGD algorithm.
Set sample size $T = \text{1e6}$ and choose 20 $\eta$'s from 1e--5 to 5e--4 from $
\{\log(T) / (5T)$, $2\log(T) / (5T)$, $4\log(T) / (5T)$, $8\log(T) / (5T)$, $16\log(T) / (5T)\}
$
and plot the squared error $\|\vv - \vv^*\|^2$ on a log-log scale. It is clearly observed from Figure~\ref{fig:linear} that smaller step-sizes lead to slower convergence to a stationary point of smaller variance.

We now numerically demonstrate that at stationarity SSGD presents a squared error $\|\vv - \vv^*\|^2$ or $\sin^2(\vv,\vv^*)$ that has a linear relationship with $\eta$.
We compute the averaged squared error of the last 10\% iterates for each run and plot the result in Figure~\ref{fig:linear_2} in a log-log scale.
The horizontal axes of both Figures~\ref{fig:linear_2:c} and~\ref{fig:linear_2:d} represent the step-size $\eta$, and the vertical axes of both figures are the squared error $\|\vv-\vv^*\|^2$ and $\sin^2(\vv,\vv^*)$, respectively.
Due to ergodicity in the final phase of the algorithm, this average provides a feasible estimate of its variance around the local (and global) minimizer.
Also, the fitting slope of Figure~\ref{fig:linear_2} provided by the least-square method is 0.9921 (fairly close to 1), which corroborates our theoretical convergence results in Theorems \ref{theo_local} and \ref{coro:saddle}.
These numerical findings are consistent with our theory that the squared error $\|\vv-\vv^*\|^2$ at stationarity has a linear relationship with $\eta$.


\begin{figure}[!tb]
\centering
\subfigure[]{
\label{fig:linear_2:c}
\includegraphics[width=1.5in]{pic/l2vslr1.png}}
\subfigure[]{
\label{fig:linear_2:d}
\includegraphics[width=1.5in]{pic/sinvslr1.png}}
\caption{Relationship between step-size $\eta$ and the squared error of our algorithmic estimator to the optimal solution}
\label{fig:linear_2}
\end{figure}











% \paragraph{Comparison between SSGD and Gen-Oja}
% We compare our SSGD algorithm for CCA with Gen-Oja proposed in the concurrent work of \citet{bhatia2018gen}.
% Recall that a key difference between the two algorithms lie on the single-loop structure of SSGD compared with Gen-Oja.
% Setting the initial step-size $\eta_0\in \{$1e--2, 1e--1, 1, 1e1, 1e2$\}$, we pick our step-sizes for SSGD as $\eta_t = \eta_0/(t + 10)$ and for Gen-Oja, the step-size is set as $\beta_t = \eta_0/(t + 10)$ for slow process and fixed as $\alpha_t = \text{2e--3}$ for fast process.
% (Recall in the two-timescale Gen-Oja algorithm, the step-size $\beta_t$ in the slow process determines the convergence rates.)
% In Figure~\ref{fig:compare}, we use solid lines to represent the SSGD algorithm and use dotted lines to represent the Gen-Oja algorithm. 
% We observe that SSGD takes slightly longer time to escape the saddle points (partly due to the varying step-size) but performs more stably with respect to the choice of step-size:
% SSGD for CCA admits a wide range of initial step-size from 1e--3 to 1, while Gen-Oja only admits a step-size range from 1e--1 to 1.






% \begin{figure}[!tb]
%   \centering
%       \subfigure[]{
%     \label{fig:compare:a}
%     \includegraphics[width=1.5in]{pic/ssgdoja_sin1.png}}
%   \caption{The comparison between SSGD and Gen-Oja algorithm. }
%   \label{fig:compare}
% \end{figure}






















\pb\section{Summary}\label{sec_summary}
We have presented the Stochastic Scaled-Gradient Descent (SSGD) algorithm for minimizing a constrained nonconvex objective function. 
Compared with classical stochastic gradient descent, our method only requires access to an unbiased estimate of a scaled gradient, which makes it applicable to a broader range of problems.
The proposed algorithm requires only a single pass through the data and is memory-efficient, with storage complexity linearly dependent on the ambient dimensionality of the problem. 
For a class of nonconvex stochastic optimization problems, we establish local convergence rates of the proposed algorithm to local minimizers and we prove asymptotic normality of the trajectory average. 
An application to the generalized eigenvector problem is investigated. 
In the near future we will investigate the rate of escape from saddle points for SSGD, and study global convergence for generic Riemannian manifolds.



\red{ \cjlcomment{SQ: rewrite this section.}
\paragraph{Comparison with Concurrent GEV Algorithms}
Our proposed online GEV iteration~\eqref{eq:gev} has several advantages over existing GEV algorithms.
We discuss them in detail as follows:
%
%
\begin{enumerate}[label=(\roman*)]
\item
The generalized eigenvector problem has recently been studied by many authors.
Compared to the shift-and-invert meta method proposed by \citet{gao2019stochastic}, our SSGD-based online GEV iteration~\eqref{eq:gev} is more natural as a stochastic approximation of the generalized Rayleigh quotient.
\citet{arora2017stochastic} propose a stochastic approximation algorithm for CCA, which achieves polynomial iteration complexity.
However, their algorithm is \textit{not} memory-efficient, since each update requires $O(d^2)$ storage.
\citet{gao2019stochastic} propose a shift-and-invert meta algorithm specifically for the CCA problem, built upon the earlier work of \citet{garber2016faster}.
They then develop a streaming version of the algorithm, which involves solving a large linear system and thus is \textit{not} scalable to large-scale problems.
\end{enumerate}
%
%
\red{
\paragraph{Scalability}
The existing literature on stochastic approximation of CCA \citep{arora2017stochastic,gao2019stochastic} solves the problem by estimating $\bSigma_x$ and $\bSigma_y$ online and taking their inverses, which takes $\cO(d^2)$ time at each iteration and hence is \textit{not} efficient.
}
%
%
\paragraph{Statistical Rates}
It is known that in some online statistical learning tasks, such as principal component estimation, the global convergence rate can be improved to $\sqrt{d/N}$, which is the \textit{near-optimal convergence rate} under the subgaussian setting \citep{vu2013minimax,li2018near}
(see also \citet{jain2016streaming} for the case of bounded samples).
Whether online algorithms for GEV and other statistical models admit global, near-optimal convergence rates under mild assumptions is left for future investigation.
%
%
\paragraph{Future Directions}
As stated previously, it would be valuable for future work to achieve the optimal convergence rate from a near-global initialization \citep{fang2018sharp}, without explicitly adopting a two-phase diagram.
\red{\cjlcomment{
How about the case
$$
\lambda_1 > \lambda_2 \ge \dots \ge \lambda_d
$$
where $\lambda_i$'s are the eigenvalues of $\Bb^{-1/2} \Ab \Bb^{-1/2}$.
cf.~\citet{ge2015escaping}.
What are the Lipschitz constants?
}}
Also, designing and analyzing accelerated versions of the GEV algorithm via variance reduction and recursive gradient methods \citep{johnson2013accelerating,nguyen2017sarah,fang2018spider} are interesting directions.
A final direction is to study relevant online statistical learning tasks with non-convex objectives, such as extracting the rank-$k$ for canonical correlation analysis, training deep neural networks, and MLE/MAP parameter estimation via the Expectation-Maximization (EM) algorithm.
}













\begin{acknowledgements} % will be removed in pdf for initial submission,
We thank the Department of Electrical Engineering and Computer Sciences at UC Berkeley for COVID-19 accommodations during which time this work was completed.
We thank Tong Zhang, Huizhuo Yuan, and Yuren Zhou for inspiring discussions at various stages of this project.
This work was supported in part by the Mathematical Data Science program of the Office of Naval Research under grant number N00014-18-1-2764.
\end{acknowledgements}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%



\pb
%\bibliographystyle{plainnat}
%\bibliographystyle{apalike}
\bibliography{SAILreferences}
%\end{document}

% \newpage
\appendix
\onecolumn



\input{real-supp}








































% \pb
% %\bibliographystyle{plainnat}
% %\bibliographystyle{apalike}
% \bibliography{SAILreferences}

% %\newpage\tableofcontents


\end{document}
