\newpage
\clearpage

% \setcounter{section}{0}
\hrule height 1pt
\vskip 0.15in
\vskip -\parskip
\begin{center}
{\LARGE AutoDrop: Training Deep Learning Models with Automatic Learning Rate Drop \\ (Supplementary Material) \par} 
\end{center}
\vskip 0.25in
\vskip -\parskip
\hrule height 1pt
% \renewcommand{\thealgorithm}{}

\section{Interpretation of Angular Velocity}
\label{sec:int_av2}
The numerator of the angular velocity is the dot product of two consecutive gradients. A well-known study of the meaning of this quantity comes from the hypergradient method~\cite{gunes2018online}, where the authors show that the derivative of the loss with respect to the learning rate is closely related to the dot product of the gradients of two consecutive steps, that is,
\[\frac{\partial L(\theta_t)}{\partial \alpha} = -\left\langle\nabla L(\theta_t), \nabla L(\theta_{t-1})\right\rangle,\]
where $L$ is the loss function, $\theta_t$ are the model parameters at step $t$, $\nabla L(\theta_t)$ is the gradient of the loss, and $\alpha$ is the learning rate. From the above formula we can see that the dot product could be used as an indicator for adjusting the learning rate. However, if we adjust the learning rate too frequently, it will introduce the short-horizon bias problem as we discussed before. Greedily selecting the learning rate based only on the current step may hurt the optimizer's performance in the long term. The empirical results of Hypergradient Descent~\cite{gunes2018online} (HD) in Table 1 confirm that.

\vspace{0.05in}
\noindent What if we do not allow the learning rate to change until the optimizer saturates? From Theorem~\ref{thm_1}, we could conclude that for a constant learning rate:
\begin{itemize}
\item The expected value of the dot product of the consecutive gradients converges to some value smaller than $0$.
\item The angular velocities of the gradients converge to some value between $90$ and $120$ degrees. 
\end{itemize}
\vspace{0.05in}
This theorem could be interpreted as: 
\begin{itemize}
\item Under current learning rate, the loss will no longer decrease when the expected value of the dot product of the consecutive gradients/the angular velocity converges. 
\item The dot product of the consecutive gradients is always smaller than $0$ when it converges, which means the learning rate can only decrease, but not increase, once the dot product of the consecutive gradients (or the angular velocity) saturates. 
\end{itemize}
\vspace{0.05in}

Therefore, the AutoDrop method is designed to detect whether the dot product/angular velocity has converged and to drop the learning rate at that point. Furthermore, the angular velocity is easier to track than other metrics, such as the dot product of consecutive gradients or the training loss, because it exhibits far fewer oscillations when it converges and it is naturally bounded in $[0^{\circ} , 180^{\circ}]$. Thus the threshold of saturation for angular velocity becomes easy to determine.


\section{Proof for Theorem \ref{thm_1}}
\label{sec:int_av}
\begin{proof}[Proof for Theorem \ref{thm_1}]
  First note that if the learning rate is chosen as specified, then each of the trajectories is a contraction map. By Banach's fixed point theorem, they each have a unique fixed point. Clearly 
  $$\mathbb{E}_{SGD}^*=\lim_{t\to\infty}\mathbb{E}[x_t]=0.$$
  For the variance we can solve for the fixed points directly. Define $\mathbb{V}_{SGD}^*=\lim_{t\to\infty}\mathbb{V}[x_t]$,
  \begin{align*}
      &\mathbb{V}_{SGD}^*=(I-\alpha A)^2\mathbb{V}_{SGD}^*+\alpha^2 A^2\Sigma,\\
      \Longrightarrow&\mathbb{V}_{SGD}^*=\frac{\alpha^2 A^2\Sigma}{I-(I-\alpha A)^2}=\operatorname{diag}\left(\frac{\alpha^2a_1^2\sigma_1^2}{1-(1-\alpha a_1)^2},\cdots,\frac{\alpha^2a_n^2\sigma_n^2}{1-(1-\alpha a_n)^2}\right),
  \end{align*}
  where $\sigma_i^2$ is the $i$-th diagonal element of the covariance matrix $\Sigma$ of the Gaussian noise $c_t$.
  Because
  \begin{align*}
      \mathbb{V}_{SGD}^*=\lim_{t\to\infty}\mathbb{V}[x_t]&=\lim_{t\to\infty}\mathbb{E}\left[(x_t-\mathbb{E}[x_t])(x_t-\mathbb{E}[x_t])^T\right]\\\notag
       &=\lim_{t\to\infty}\mathbb{E}[x_tx_t^T]\\\notag
      &=diag(\lim_{t\to\infty}\mathbb{E}[x_{t,1}^2],\lim_{t\to\infty}\mathbb{E}[x_{t,2}^2],\cdots,\lim_{t\to\infty}\mathbb{E}[x_{t,n}^2]),
  \end{align*}
  we have
  \begin{align}\label{eq:x}
      \lim_{t\to\infty}\mathbb{E}[x_{t,i}^2]=\frac{\alpha^2a_i^2\sigma_i^2}{1-(1-\alpha a_i)^2}\quad i=1,\cdots,n.
  \end{align}
  Since $c_t\sim N(0,\Sigma)$, 
  \begin{align}\label{eq:c}
      \lim_{t\to\infty}\mathbb{E}[c_{t,i}^2]=\sigma_i^2\quad i=1,\cdots,n.
  \end{align}
%   Loss function at iteration t is
%   \begin{align*}
%       L(x_{t})=\frac{1}{2}x_t^TAx_t-x_t^TAc_t+\frac{1}{2}c_t^TAc_t.
%   \end{align*}
%   Therefore, by formula (\ref{eq:x}) and (\ref{eq:c}), the trajectory of the loss value converges to
%   \begin{align}
%       L^*&=\lim_{t\to\infty}\mathbb{E}[L(x_t)]\\\notag
%       &=\frac{1}{2}\lim_{t\to\infty}\mathbb{E}[x_t^TAx_t]-\lim_{t\to\infty}\mathbb{E}[x_t^TAc_t]+\frac{1}{2}\lim_{t\to\infty}\mathbb{E}[c_t^TAc_t]\\\notag
%       &=\frac{1}{2}\sum_{i=1}^na_i\lim_{t\to\infty}\mathbb{E}[x_{t,i}^2]+\frac{1}{2}\sum_{i=1}^na_i\lim_{t\to\infty}\mathbb{E}[c_{t,i}^2]\\\notag
%       &=\frac{1}{2}\sum_{i=1}^n\left[a_i\frac{\alpha^2a_i^2\sigma_i^2}{1-(1-\alpha a_i)^2}+a_i\sigma_i^2\right]\\\notag
%       &=\frac{1}{2}\sum_{i=1}^na_i\sigma_i^2\left(\frac{\alpha a_i}{2-\alpha a_i}+1\right)\\\notag
%       &=\frac{1}{2}\sum_{i=1}^na_i\sigma_i^2\frac{2}{2-\alpha a_i}\\\notag
%       &=\sum_{i=1}^n\frac{a_i\sigma_i^2}{2-\alpha a_i}
%   \end{align}
  The update formula with learning rate $\alpha$ is
  \begin{align}
      x_{t+1}=x_t-\alpha\nabla\hat{L}(x_t)= x_t-\alpha A(x_t-c_t),\quad c_t\sim N(0,\Sigma).
  \end{align}
  For the next iteration, the update formula can be written as
  \begin{align}
      x_{t+2}&=x_{t+1}-\alpha\nabla\hat{L}(x_{t+1})\\\notag
      &=x_{t+1}-\alpha A(x_{t+1}-c_{t+1}),\quad c_{t+1}\sim N(0,\Sigma)\\\notag
      &=x_{t+1}-\alpha A(x_t-\alpha A(x_t-c_t)-c_{t+1}),\quad c_t,c_{t+1}\sim N(0,\Sigma)\\\notag
      &=x_{t+1}-\alpha A(x_t-c_{t+1})+\alpha^2A^2(x_t-c_t),\quad c_t,c_{t+1}\sim N(0,\Sigma).
  \end{align}
  Define the step at iteration t as $s_t=x_{t+1}-x_t$, then the inner product of two consecutive steps can be written as
  \begin{align}
      <s_t,s_{t+1}>=&<-\alpha A(x_t-c_t), -\alpha A(x_t-c_{t+1})+\alpha^2A^2(x_t-c_t)>\\\notag
      =&\alpha^2(x_t-c_t)^TA^2(x_t-c_{t+1})-\alpha^3(x_t-c_t)^TA^3(x_t-c_t)\\\notag
      =&\alpha^2\left[x_t^TA^2x_t\!-\!x_t^TA^2c_{t+1}\!-\!c_t^TA^2x_t\!+\!c_t^TA^2c_{t+1}\!-\!\alpha x_t^TA^3x_t\!+\!2\alpha x_t^TA^3c_t\!-\!\alpha c_t^TA^3c_t\right].
  \end{align}
  Therefore, the trajectory of the expectation of the inner product converges to
  \begin{align}\label{eq:dot}
      I^*=\lim_{t\to\infty}\mathbb{E}[<s_t,s_{t+1}>]&=\alpha^2\left[\lim_{t\to\infty}\mathbb{E}[x_t^TA^2(I-\alpha A)x_t]-\alpha\lim_{t\to\infty}\mathbb{E}[c_t^TA^3c_t]\right]\\\notag
      &=\alpha^2\left[\sum_{i=1}^na_i^2(1-\alpha a_i)\lim_{t\to\infty}\mathbb{E}[x_{t,i}^2]-\sum_{i=1}^n\alpha a_i^3\lim_{t\to\infty}\mathbb{E}[c_{t,i}^2]\right]\\\notag
      &=\alpha^2\sum_{i=1}^n\left[a_i^2(1-\alpha a_i)\frac{\alpha a_i\sigma_i^2}{2-\alpha a_i}-\alpha a_i^3\sigma_i^2\right]\\\notag
      &=\alpha^2\sum_{i=1}^n\alpha a_i^3\sigma_i^2\left[\frac{1-\alpha a_i}{2-\alpha a_i}-1\right]\\\notag
      &=-\alpha^3\sum_{i=1}^n\frac{a_i^3\sigma_i^2}{2-\alpha a_i}.
  \end{align}
  The norm of step $s_t$ at iteration t is written as
  \begin{align}
      \norm{s_t}^2&=\norm{\alpha A(x_t-c_t)}^2\\\notag
      &=\alpha^2(x_t-c_t)^TA^2(x_t-c_t)\\\notag
      &=\alpha^2(x_t^TA^2x_t-2x_t^TA^2c_t+c_t^TA^2c_t).
  \end{align}
  Therefore the trajectory of the expectation of the norm of $s_t$ converges to
  \begin{align}\label{eq:norm}
      N^*=\lim_{t\to\infty}\mathbb{E}[\norm{s_t}^2]&=\alpha^2\lim_{t\to\infty}\mathbb{E}[x_t^TA^2x_t]+\alpha^2\lim_{t\to\infty}\mathbb{E}[c_t^TA^2c_t]\\\notag
      &=\alpha^2\sum_{i=1}^na_i^2\left(\lim_{t\to\infty}\mathbb{E}[x_{t,i}^2]+\lim_{t\to\infty}\mathbb{E}[c_{t ,i}^2]\right)\\\notag
      &=\alpha^2\sum_{i=1}^na_i^2\sigma_i^2\left(\frac{\alpha a_i}{2-\alpha a_i}+1\right)\\\notag
      &=2\alpha^2\sum_{i=1}^n\frac{a_i^2\sigma_i^2}{2-\alpha a_i}.
  \end{align}
  Here, in order to draw meaningful conclusions we make certain simplifications and proceed by approximating $\mathbb{E}[\cos(\angle(s_t,s_{t+1}))]\approx\mathbb{E}[<s_t,s_{t+1}>]/\mathbb{E}[\norm{s_t}\norm{s_{t+1}}]$. 
  
  Because $\cos(\angle(s_t,s_{t+1}))=\frac{<s_t,s_{t+1}>}{\norm{s_t}\norm{s_{t+1}}}$ and $\norm{s_t}$ converges when $t$ is large enough, we have
  \begin{align}\label{eq:cos_ratio}
      \lim_{t\to\infty}\mathbb{E}[cos(\angle(s_t,s_{t+1}))]\approx\lim_{t\to\infty}\frac{\mathbb{E}[<s_t,s_{t+1}>]}{\mathbb{E}[\norm{s_t}^2]}.
  \end{align}
  
  Since $I^*=\lim_{t\to\infty}\mathbb{E}[<s_t,s_{t+1}>]$ and $N^*=\lim_{t\to\infty}\mathbb{E}[\norm{s_t}^2]$ are both bounded and not equal to 0,
  \begin{align}\label{eq:cos}
      \lim_{t\to\infty}\mathbb{E}[cos(\angle(s_t,s_{t+1}))]\approx\frac{\lim_{t\to\infty}\mathbb{E}[<s_t,s_{t+1}>]}{\lim_{t\to\infty}\mathbb{E}[\norm{s_t}^2]}.
  \end{align}
  By combining formula (\ref{eq:cos}), (\ref{eq:dot}) and (\ref{eq:norm}), we obtain that the expectation of cosine value converges to
  \begin{align}
      C^*\!\!=\!\!\lim_{t\to\infty}\mathbb{E}[cos(\angle(s_t,s_{t+1}))]\!\approx\!\frac{I^*}{N^*}&\!=\!-\frac{\alpha}{2}\frac{\sum_{i=1}^n\frac{a_i^3\sigma_i^2}{2-\alpha a_i}}{\sum_{i=1}^n\frac{a_i^2\sigma_i^2}{2-\alpha a_i}}\!\geq\!-\frac{\alpha}{2}\max_i a_i\frac{\sum_{i=1}^n\frac{a_i^2\sigma_i^2}{2-\alpha a_i}}{\sum_{i=1}^n\frac{a_i^2\sigma_i^2}{2-\alpha a_i}}\!=\!-\frac{\alpha\max_i a_i}{2}
  \end{align}
  Since $I-\alpha A\succ 0$ implies $\alpha a_i<1$ for every $i$, we have $C^*\in [-\frac{1}{2}, 0]$, and the angle is therefore between 90 and 120 degrees.
\end{proof}

\section{Hyperparameter settings for AutoDrop}\label{supp:hyper}
\subsection{Two fixed conditions in AutoDrop}
We further comment on the two fixed conditions $len(\mathcal{B})>10$ and $\mathcal{C}_i-\mathcal{C}_{i-1}<0.1$ in the algorithm. The condition $len(\mathcal{B})>10$ means that we will not smooth the angular velocity at the very beginning of the training or right after dropping the learning rate---this is just a common-sense initial condition, since we need to gather a few samples before applying smoothing makes sense. Regarding the condition $\mathcal{C}_i-\mathcal{C}_{i-1}<0.1$, intuitively the threshold for that term should be set to match the standard deviation of the angular velocity. We found that this standard deviation is between 0.1 and 0.25 (see, for example, Table~\ref{tab:ablation_c} for the ResNet experiment with different learning rates; we observed similar properties for the remaining experiments). 
\begin{table}[H]
\centering
\begin{tabular}{|p{3cm}||p{2cm}|p{2cm}|p{2cm}|}
\hline
ResNet18/CIFAR10&1e-1&3e-2&1e-2\\
\hline
Standard Deviation&0.17&0.22&0.24\\
\hline

\end{tabular}
% \vspace{0.1in}
\caption{Standard deviation of angular velocity for different learning rates on ResNet18/CIFAR10.}
% \vspace{0.1in}
\label{tab:ablation_c}
\end{table}


% Resnet18/cifar10
% 1e-1
% 3e-2
% 1e-2
% Standard Deviation
% 0.17
% 0.22
% 0.24



\subsection{Ablation Study for hyperparameters $\rho$ and $m$}
In this section, we perform an ablation study for the hyperparameters $\rho$ and $m$ on multiple model settings: ResNet18/CIFAR10, WRN28x10/CIFAR10, ResNet34/CIFAR100 and WRN40x10/CIFAR100. We search over $\rho\in\{0.5, 0.8, 0.9, 0.95, 0.99\}$ and $m\in\{5,10,20,30,50\}$ across the different tasks. The setting $\rho=0.95$, $m=10$ performs the best on all tasks. In the wide range of hyperparameter settings that we explore, the changes in model performance are mild, i.e., of the order of $2.5\%$--$4\%$.

Therefore, the hyperparameters $\rho$ and $m$ are kept fixed across all our experiments ($\rho=0.95$, $m=10$).

\subsubsection{ResNet18-CIFAR10}
% \begin{table}[H]
% \centering
% \begin{tabular}{|p{1.5cm}||p{2.2cm}|p{1cm}|p{1cm}|p{1cm}|p{1.5cm}|p{2.2cm}|}
% \hline
% Model&Method&$\rho$&$m$&$k$&epoches&Test Error\\
% \hline
% \multirow{4}{8em}{WRN$28$x$10$ \\ CIFAR10} 
% &\multirow{4}{8em}{AutoDrop}
% &\textbf{0.95}&\textbf{10}&\textbf{64}&\textbf{200} &\textbf{4.79 ±
% 0.99}\\
% &&--&--&32&200 &5.65 ± 0.154\\
% &&--&--&128&200 &6.08 ± 0.111\\
% &&--&--&256&200 &7.41 ± 0.244\\
% &&--&--&512&200 &17.70 ± 16.611\\

% \hline

% \end{tabular}
% \vspace{0.1in}
% \caption{Ablation study on parameter $k$ for AutoDrop on task ResNet18/CIFAR10.}
% \vspace{0.1in}
% \label{tab:cifar_lpf}
% \end{table}


\begin{table}[H]
\centering
\begin{tabular}{|p{1.8cm}||p{2.2cm}|p{1cm}|p{1cm}|p{1cm}|p{1.5cm}|p{2.2cm}|}
\hline
Model&Method&$\rho$&$m$&$k$&epochs&Test Error\\
\hline
\multirow{4}{8em}{ResNet18 \\ CIFAR10} 
&\multirow{4}{8em}{AutoDrop}
&\textbf{0.95}&\textbf{10}&\textbf{64}&\textbf{200} &\textbf{4.79 ±
0.99}\\
&&--&5&--&200 &5.05 ± 0.096\\
&&--&20&--&200 &5.50 ± 0.169\\
&&--&30&--&200 &6.52 ± 0.092\\
&&--&50&--&200 &7.41 ± 0.111\\

\hline

\end{tabular}
\vspace{0.1in}
\caption{Ablation study on parameter $m$ for AutoDrop on task ResNet18/CIFAR10.}
\vspace{0.1in}
\label{tab:abl_m_resnet18_cifar10}
\end{table}


\begin{table}[H]
\centering
\begin{tabular}{|p{1.8cm}||p{2.2cm}|p{1cm}|p{1cm}|p{1cm}|p{1.5cm}|p{2.2cm}|}
\hline
Model&Method&$\rho$&$m$&$k$&epochs&Test Error\\
\hline
\multirow{4}{8em}{ResNet18 \\ CIFAR10} 
&\multirow{4}{8em}{AutoDrop}
&\textbf{0.95}&\textbf{10}&\textbf{64}&\textbf{200} &\textbf{4.79 ±
0.99}\\
&&0.5&--&--&200 &8.85 ± 0.873\\
&&0.8&--&--&200 &6.62 ± 0.259\\
&&0.9&--&--&200 &5.48 ± 0.040\\
&&0.99&--&--&200 &7.65 ± 0.178\\

\hline

\end{tabular}
\vspace{0.1in}
\caption{Ablation study on parameter $\rho$ for AutoDrop on task ResNet18/CIFAR10.}
\vspace{0.1in}
\label{tab:abl_rho_resnet18_cifar10}
\end{table}

\subsubsection{WRN28x10-CIFAR10}

% \begin{table}[H]
% \centering
% \begin{tabular}{|p{1.5cm}||p{2.2cm}|p{1cm}|p{1cm}|p{1cm}|p{1.5cm}|p{2.2cm}|}
% \hline
% Model&Method&$\rho$&$m$&$k$&epoches&Test Error\\
% \hline
% \multirow{4}{8em}{WRN28x10 \\ CIFAR10} 
% &\multirow{4}{8em}{AutoDrop}
% &\textbf{0.95}&\textbf{10}&\textbf{64}&\textbf{200} &\textbf{3.73 ± 0.07}\\
% &&--&--&32&200 &4.30 ± 0.127\\
% &&--&--&128&200 &5.77 ± 0.134\\
% &&--&--&256&200 &7.36 ± 0.352\\
% &&--&--&512&200 & ±\\

% \hline

% \end{tabular}
% \vspace{0.1in}
% \caption{Ablation study on parameter $k$ for AutoDrop on task WRN28x10/CIFAR10.}
% \vspace{0.1in}
% \label{tab:cifar_lpf}
% \end{table}


\begin{table}[H]
\centering
\begin{tabular}{|p{1.8cm}||p{2.2cm}|p{1cm}|p{1cm}|p{1cm}|p{1.5cm}|p{2.2cm}|}
\hline
Model&Method&$\rho$&$m$&$k$&epochs&Test Error\\
\hline
\multirow{4}{8em}{WRN$28$x$10$ \\ CIFAR10} 
&\multirow{4}{8em}{AutoDrop}
&\textbf{0.95}&\textbf{10}&\textbf{64}&\textbf{200} &\textbf{3.73 ± 0.07}\\
&&--&5&--&200 &6.76 ± 0.349\\
&&--&20&--&200 &4.63 ± 0.165\\
&&--&30&--&200 &4.11 ± 0.137\\
&&--&50&--&200 &7.85 ± 0.119\\

\hline

\end{tabular}
\vspace{0.1in}
\caption{Ablation study on parameter $m$ for AutoDrop on task WRN28x10/CIFAR10.}
\vspace{0.1in}
\label{tab:abl_m_wrn28x10_cifar10}
\end{table}


\begin{table}[H]
\centering
\begin{tabular}{|p{1.8cm}||p{2.2cm}|p{1cm}|p{1cm}|p{1cm}|p{1.5cm}|p{2.2cm}|}
\hline
Model&Method&$\rho$&$m$&$k$&epochs&Test Error\\
\hline
\multirow{4}{8em}{WRN$28$x$10$ \\ CIFAR10} 
&\multirow{4}{8em}{AutoDrop}
&\textbf{0.95}&\textbf{10}&\textbf{64}&\textbf{200} &\textbf{3.73 ± 0.07}\\
&&0.5&--&--&200 &6.26 ± 0.283\\
&&0.8&--&--&200 &5.07 ± 0.286\\
&&0.9&--&--&200 &3.94 ± 0.118\\
&&0.99&--&--&200 &7.36 ± 0.021\\

\hline

\end{tabular}
\vspace{0.1in}
\caption{Ablation study on parameter $\rho$ for AutoDrop on task WRN28x10/CIFAR10.}
\vspace{0.1in}
\label{tab:abl_rho_wrn28x10_cifar10}
\end{table}

\subsubsection{ResNet34-CIFAR100}

% \begin{table}[H]
% \centering
% \begin{tabular}{|p{1.5cm}||p{2.2cm}|p{1cm}|p{1cm}|p{1cm}|p{1.5cm}|p{2.2cm}|}
% \hline
% Model&Method&$\rho$&$m$&$k$&epoches&Test Error\\
% \hline
% \multirow{4}{8em}{ResNet$34$ \\ CIFAR100} 
% &\multirow{4}{8em}{AutoDrop}
% &\textbf{0.95}&\textbf{10}&\textbf{64}&\textbf{200} &\textbf{21.82 ±
% 0.14}\\
% &&--&--&32&200 &24.07 ± 0.438\\
% &&--&--&128&200 &23.11 ± 1.351\\
% &&--&--&256&200 &28.33 ± 0.200\\
% &&--&--&512&200 & ±\\

% \hline

% \end{tabular}
% \vspace{0.1in}
% \caption{Ablation study on parameter $k$ for AutoDrop on task ResNet34/CIFAR100.}
% \vspace{0.1in}
% \label{tab:cifar_lpf}
% \end{table}


\begin{table}[H]
\centering
\begin{tabular}{|p{1.8cm}||p{2.2cm}|p{1cm}|p{1cm}|p{1cm}|p{1.5cm}|p{2.2cm}|}
\hline
Model&Method&$\rho$&$m$&$k$&epochs&Test Error\\
\hline
\multirow{4}{8em}{ResNet$34$ \\ CIFAR100} 
&\multirow{4}{8em}{AutoDrop}
&\textbf{0.95}&\textbf{10}&\textbf{64}&\textbf{200} &\textbf{21.82 ±
0.14}\\
&&--&5&--&200 &22.41 ± 0.187\\
&&--&20&--&200 &22.39 ± 0.11\\
&&--&30&--&200 &26.09 ± 0.612\\
&&--&50&--&200 &28.53± 0.44\\

\hline

\end{tabular}
\vspace{0.1in}
\caption{Ablation study on parameter $m$ for AutoDrop on task ResNet34/CIFAR100.}
\vspace{0.1in}
\label{tab:abl_m_resnet34_cifar100}
\end{table}


\begin{table}[H]
\centering
\begin{tabular}{|p{1.8cm}||p{2.2cm}|p{1cm}|p{1cm}|p{1cm}|p{1.5cm}|p{2.2cm}|}
\hline
Model&Method&$\rho$&$m$&$k$&epochs&Test Error\\
\hline
\multirow{4}{8em}{ResNet$34$ \\ CIFAR100} 
&\multirow{4}{8em}{AutoDrop}
&\textbf{0.95}&\textbf{10}&\textbf{64}&\textbf{200} &\textbf{21.82 ±
0.14}\\
&&0.5&--&--&200 &30.42 ± 0.430\\
&&0.8&--&--&200 &25.71 ± 0.561\\
&&0.9&--&--&200 &23.14 ± 0.464\\
&&0.99&--&--&200 &30.09 ± 0.192\\

\hline

\end{tabular}
\vspace{0.1in}
\caption{Ablation study on parameter $\rho$ for AutoDrop on task ResNet34/CIFAR100.}
\vspace{0.1in}
\label{tab:abl_rho_resnet34_cifar100}
\end{table}

\subsubsection{WRN40x10-CIFAR100}

\begin{table}[H]
\centering
\begin{tabular}{|p{1.8cm}||p{2.2cm}|p{1cm}|p{1cm}|p{1cm}|p{1.5cm}|p{2.2cm}|}
\hline
Model&Method&$\rho$&$m$&$k$&epochs&Test Error\\
\hline
\multirow{4}{8em}{WRN40x10 \\ CIFAR100} 
&\multirow{4}{8em}{AutoDrop}
&\textbf{0.95}&\textbf{10}&\textbf{64}&\textbf{200} &\textbf{19.41 ± 0.10}\\
&&--&5&--&200 &19.84 ± 0.21\\
&&--&20&--&200 &23.59 ± 0.22\\
&&--&30&--&200 &25.65 ± 0.17\\
&&--&50&--&200 &28.72± 0.42\\

\hline

\end{tabular}
\vspace{0.1in}
\caption{Ablation study on parameter $m$ for AutoDrop on task WRN40x10/CIFAR100.}
\vspace{0.1in}
\label{tab:abl_m_wrn40x10_cifar100}
\end{table}


\begin{table}[H]
\centering
\begin{tabular}{|p{1.8cm}||p{2.2cm}|p{1cm}|p{1cm}|p{1cm}|p{1.5cm}|p{2.2cm}|}
\hline
Model&Method&$\rho$&$m$&$k$&epochs&Test Error\\
\hline
\multirow{4}{8em}{WRN40x10 \\ CIFAR100} 
&\multirow{4}{8em}{AutoDrop}
&\textbf{0.95}&\textbf{10}&\textbf{64}&\textbf{200} &\textbf{19.41 ± 0.10}\\
&&0.5&--&--&200 &25.58±0.46\\
&&0.8&--&--&200 &21.03±0.54\\
&&0.9&--&--&200 &19.96±0.12\\
&&0.99&--&--&200 &30.23±0.35\\

\hline

\end{tabular}
\vspace{0.1in}
\caption{Ablation study on parameter $\rho$ for AutoDrop on task WRN40x10/CIFAR100.}
\vspace{0.1in}
\label{tab:abl_rho_wrn40x10_cifar100}
\end{table}

\subsection{Ablation Study for $k$}
Regarding the sliding window size $k$ used for computing the batch angular velocity, it varies with respect to the size of the training data $N$. Since $k$ decides the frequency of computing the batch angular velocity and we drop the learning rate every time the angular velocity saturates, the learning rate $\alpha_t$ at iteration $t$ for AutoDrop could be simplistically expressed as $\alpha_t=\alpha_0\rho^{\mathcal{O}(N/k)}$, assuming $\rho$ and $m$ are fixed. Therefore, when the size of the data set $N$ is large, e.g., the ImageNet data set has 14 million images, the sliding window $k$ should be larger than for smaller data sets, such as the CIFAR10 and CIFAR100 tasks, which have $\sim$50K training images. We found that $k=64$ performs well for the CIFAR10 and CIFAR100 tasks, while $k=640$ performs much better for ImageNet.

\begin{table}[H]
\centering
\vspace{-0.1in}
\begin{small}
  \begin{tabular}{|p{2cm}||p{2cm}|p{2cm}|p{2cm}|p{2cm}|}
    \hline
    \multirow{2}{*}{Model} &
    \multicolumn{4}{c|}{Ablation Study for $k$}\\
    \cline{2-5}
    & $k$=32 & $k$=64 & $k$=128 & $k$=256 \\
    \hline
    \tabincell{l}{ResNet18 \\ CIFAR10} &$5.65_{\pm
.15}$&$\mathbf{4.79_{\pm
.99}}$&$6.08_{\pm
.11}$&$7.41_{\pm
.24}$\\
    \hline
    \tabincell{l}{WRN28x10 \\ CIFAR10}&$4.30_{\pm
.13}$&$\mathbf{3.73_{\pm
.07}}$&$5.77_{\pm
.13}$&$7.36_{\pm
.15}$\\
    \hline
    \tabincell{l}{ResNet34 \\ CIFAR100}&$24.07_{\pm
.44}$&$\mathbf{21.82_{\pm
.14}}$&$23.11_{\pm
1.3}$&$28.33_{\pm
.20}$ \\
    \hline
    \tabincell{l}{WRN40x10 \\ CIFAR100}&$20.39_{\pm.08}$&$\!\mathbf{19.41_{\pm.10}}$&$24.49_{\pm.16}$&$28.79_{\pm.32}$\\
    \hline
    Model&$k$=64 & $k$=256 & $k$=512 & $k$=640 \\
    \hline
    \tabincell{l}{ResNet18 \\ ImageNet}&39.22&31.04&29.70&\textbf{29.24} \\
    \hline
  \end{tabular}
  \vspace{-0.1in}
  \caption{Ablation study for $k$ among different models (test error).}
  \label{tab:ablation_k}
  \end{small}
\end{table}


\section{AutoDrop (approximate)}\label{supp:flat2}
In this section, we analyze why Algorithm~\ref{alg:LRdrop} is an appropriate approximation for Algorithm~\ref{alg:AD}.  Note that the main idea behind our algorithm (either Algorithm~\ref{alg:AD} or~\ref{alg:LRdrop}) is to decrease the learning rate when the angular velocity saturates. Therefore, the key point is how to detect the ``saturation''. In AutoDrop (Algorithm~\ref{alg:AD}), we determine the saturation of the angular velocity by looking at the difference of the angular velocity in two consecutive epochs. If this difference is smaller than a given threshold $\theta$, then we assume we have entered saturation and we drop the learning rate. However, when it comes to theoretical analysis, it is hard to mathematically measure the ``difference'' of angular velocities in two consecutive steps, and thus the analysis requires some approximations when it comes to defining saturation. Intuitively, when the derivative of the angular velocity is close to zero, we would expect the angular velocity to saturate. This motivates Algorithm~\ref{alg:LRdrop}, which is an approximation to Algorithm~\ref{alg:AD}. Moreover, for the purpose of theoretical analysis, we assume that the angular velocity curve is smooth and could be represented with Equation~\ref{eq:AVModel}. Under this assumption, the angular velocity is concave with no noise.  The behavior of the angular velocity and the learning rate for Algorithm~\ref{alg:LRdrop} is depicted in Figure~\ref{fig:flat2}.
\begin{figure}[H]
    \centering
    \subfigure[]{\includegraphics[width=0.4\textwidth]{Figures/theory/angle_landscape3_final.png}}
    \subfigure[]{\includegraphics[width=0.4\textwidth]{Figures/theory/Learning_rate_trend2_final.png}}
    \vspace{-0.1in}
    \caption{(a) The behavior of the angular velocity for Algorithm~\ref{alg:LRdrop}. (b) The behavior of the learning rate for Algorithm~\ref{alg:LRdrop}.}
    \vspace{-0.1in}
    \label{fig:flat2}
\end{figure}

\section{Proof for Theorem \ref{thm:sgdm_conv}}
The proof in this section is inspired by~\cite{yang2016unified}.
% \begin{proof}[Proof for Theorem \ref{thm:sgdm_conv}]
% We denote $\mathcal{G}(x_t;\xi_t)=\mathcal{G}(x_t)=\mathcal{G}_t$. One could verify that the update formula (\ref{eq:sgdm}) implies the following recursions:
% \begin{align}
%     x_{t+1}+p_{t+1}=&x_t+p_t-\frac{\alpha_t}{1-\beta}\mathcal{G}(x_t)\\
%     v_{t+1}=&\beta v_t+((1-\beta)s-1)\alpha_t\mathcal{G}(x_t),
% \end{align}
% where $v_t=\frac{1-\beta}{\beta}p_t$ and $p_t$ is given by
% \begin{equation}
% p_t=\left\{
% \begin{aligned}\label{eq:sgdm2}
%       &\frac{\beta}{1-\beta}(x_t-x_{t-1}+s\alpha_{t-1}\mathcal{G}(x_{t-1})), \quad k\geq1\\
%       &0, \quad k=0.
% \end{aligned}
% \right.    
% \end{equation}
% Define $\delta_t=\mathcal{G}_t-\partial f(x_t)$ and $x^*$ is the optimal point. From the above recursions we have
% \begin{align}
%     &\norm{x_{t+1}+p_{t+1}-x^*}^2\notag\\
%     =&\norm{x_t+p_t-x^*}^2\!-\!\frac{2\alpha_t}{1-\beta}(x_t+p_t-x^*)^T\mathcal{G}_t\!+\!\left(\frac{\alpha_t}{1-\beta}\right)^2\norm{\mathcal{G}_t}^2\notag\\
%     =&\norm{x_t+p_t-x^*}^2\!-\!\frac{2\alpha_t}{1-\beta}(x_t-x^*)^T\mathcal{G}_t\!-\!\frac{2\alpha_t\beta}{(1-\beta)^2}(x_{t}-x_{t-1})^T\mathcal{G}_t\notag\\
%     &-\frac{2s\alpha_t\alpha_{t-1}\beta}{(1-\beta)^2}\mathcal{G}_{t-1}^T\mathcal{G}_t\!+\!\left(\frac{\alpha_t}{1-\beta}\right)^2\norm{\mathcal{G}_t}^2\notag\\
%     =&\norm{x_t+p_t-x^*}^2-\frac{2\alpha_t}{1-\beta}(x_t-x^*)^T(\delta_t+\partial f(x_t))-\frac{2\alpha_t\beta}{(1-\beta)^2}(x_{t}-x_{t-1})^T(\delta_t+\partial f(x_t))\notag\\
%     &-\frac{2s\alpha_t\alpha_{t-1}\beta}{(1-\beta)^2}(\delta_{t-1}+\partial f(x_{t-1}))^T(\delta_t+\partial f(x_t))+\left(\frac{\alpha_t}{1-\beta}\right)^2\norm{\delta_t+\partial f(x_t)}^2.
% \end{align}
% Note that
% \begin{align*}
%     &\mathbb{E}[(x_t-x^*)^T(\delta_t+\partial f(x_t))]=\mathbb{E}[(x_t-x^*)^T\partial f(x_t)]\\
%     &\mathbb{E}[(x_{t}-x_{t-1})^T(\delta_t+\partial f(x_t))]=\mathbb{E}[(x_{t}-x_{t-1})^T\partial f(x_t)]\\
%     &\mathbb{E}[(\delta_{t-1}+\partial f(x_{t-1}))^T(\delta_t+\partial f(x_t))]=\mathbb{E}[(\delta_{t-1}+\partial f(x_{t-1}))^T\partial f(x_t)]=\mathbb{E}[\mathcal{G}_{t-1}^T\partial f(x_t)]\\
%     &\mathbb{E}[\norm{\delta_t+\partial f(x_t)}^2]=\mathbb{E}[\norm{\delta_t}^2]+\mathbb{E}[\norm{\partial f(x_t)}^2].
% \end{align*}
% Taking expectation on both sides gives the following
% \begin{align}\label{eq:norm_exp}
%     &\mathbb{E}[\norm{x_{t+1}+p_{t+1}-x^*}^2]\notag\\
%     =&\mathbb{E}[\norm{x_t+p_t-x^*}^2]-\frac{2\alpha_t}{1-\beta}\mathbb{E}[(x_t-x^*)^T\partial f(x_t)]-\frac{2\alpha_t\beta}{(1-\beta)^2}\mathbb{E}[(x_{t}-x_{t-1})^T\partial f(x_t)]\notag\\
%     &-\frac{2s\alpha_t\alpha_{t-1}\beta}{(1-\beta)^2}\mathbb{E}[\mathcal{G}_{t-1}^T\partial f(x_t)]+\left(\frac{\alpha_t}{1-\beta}\right)^2(\mathbb{E}[\norm{\delta_t}^2]+\mathbb{E}[\norm{\partial f(x_t)}^2]).
% \end{align}
% Moreover, since f is convex, $\mathbb{E}\left[\norm{ \mathcal{G}(x;\xi)-\mathbb{E}[\mathcal{G}(x;\xi)]}\right]\leq\delta^2$, and $\norm{\nabla f(x)}\leq G$ for any $x$:
% \begin{align*}
%     &f(x_t)-f(x^*)\leq(x_t-x^*)^T\partial f(x_t)\\
%     &f(x_t)-f(x_{t-1})\leq(x_t-x_{t-1})^T\partial f(x_t)\\
%     &-\mathbb{E}[\mathcal{G}_{t-1}^T\partial f(x_t)]\leq\frac{\mathbb{E}[\norm{\mathcal{G}_{t-1}}^2+\norm{\partial f(x_t)}^2]}{2}\leq\delta^2/2+G^2\leq\delta^2+G^2\\
%     &\mathbb{E}[\norm{\delta_t}^2]\leq\delta^2,\quad\mathbb{E}[\norm{\partial f(x_t)}^2]\leq G^2.
% \end{align*}
% Therefore, (\ref{eq:norm_exp}) can be re-written as
% \begin{align}\label{eq:exp_update}
%     \mathbb{E}[\norm{x_{t+1}+p_{t+1}-x^*}^2]\leq&\mathbb{E}[\norm{x_{t}+p_{t}-x^*}^2]-\frac{2\alpha_t}{1-\beta}\mathbb{E}[f(x_t)-f(x^*)]\\\notag
%     &-\frac{2\alpha_t\beta}{(1-\beta)^2}\mathbb{E}[f(x_t)-f(x_{t-1})]+\frac{2s\beta\alpha_t\alpha_{t-1}+\alpha_t^2}{(1-\beta)^2}(G^2+\delta^2).
% \end{align}
% Since $\hat{\alpha}_i$ is decreasing, it implies that $\alpha_t$ is non-increasing. Then (\ref{eq:exp_update}) can be upper-bounded as
% \begin{align}\label{eq:exp_update2}
%     \mathbb{E}[\norm{x_{t+1}+p_{t+1}-x^*}^2]\leq&\mathbb{E}[\norm{x_{t}+p_{t}-x^*}^2]-\frac{2\alpha_t}{1-\beta}\mathbb{E}[f(x_t)-f(x^*)]\\\notag
%     &-\frac{2\alpha_t\beta}{(1-\beta)^2}\mathbb{E}[f(x_t)-f(x_{t-1})]+\frac{(2s\beta+1)\alpha_t\alpha_{t-1}}{(1-\beta)^2}(G^2+\delta^2).
% \end{align}
% Taking $t=0,...,T-1$ and $x_{-1}=x_0$, and then summing all the inequalities gives
% \begin{align*}
%     \sum_{t=0}^{T-1}\mathbb{E}[\norm{x_{t+1}\!+\!p_{t+1}\!-\!x^*}^2]\leq&\sum_{t=0}^{T-1}\mathbb{E}[\norm{x_{t}+p_{t}-x^*}^2]-\sum_{t=0}^{T-1}\frac{2\alpha_t}{1-\beta}\mathbb{E}[f(x_t)-f(x^*)]\notag\\
%     &-\!\sum_{t=0}^{T-1}\frac{2\alpha_t\beta}{(1\!-\!\beta)^2}\mathbb{E}[f(x_t)\!-\!f(x_{t-1})]
%     \!+\!\frac{(2s\beta\!+\!1)(G^2\!+\!\delta^2)}{(1\!-\!\beta)^2}\sum_{t=0}^{T-1}\alpha_t\alpha_{t-1}.
% \end{align*}
% Therefore,
% \begin{align*}
%     \frac{2}{1\!-\!\beta}\sum_{t=0}^{T-1}\alpha_t\mathbb{E}[f(x_t)\!-\!f(x^*)]\leq&\norm{x_0\!-\!x^*}^2\!-\!\norm{x^{T}\!+\!p_{T}\!-\!x^*}\!+\!\frac{2\beta}{(1\!-\!\beta)^2}\sum_{t=0}^{T-1}\alpha_t\mathbb{E}[f(x_{t-1})\!-\!f(x_t)]\\
%     &+\frac{(2s\beta+1)(G^2+\delta^2)}{(1-\beta)^2}\sum_{t=0}^{T-1}\alpha_t\alpha_{t-1}.
% \end{align*}
% Since $\alpha_{T-1}\leq...\leq\alpha_1\leq\alpha_0<1$, $\min_{t=0,...,T-1}\{\mathbb{E}[f(x_t)-f(x^*)]\}\leq\mathbb{E}[f(x_t)-f(x^*)](\forall t=0,...,T-1)$, we obtain
% \begin{align*}
%     \frac{2}{1-\beta}\min_{t=0,...,T-1}\{\mathbb{E}[f(x_t)-f(x^*)]\}\sum_{t=0}^{T-1}\alpha_t\leq&\norm{x_0-x^*}^2+\frac{2\beta}{(1-\beta)^2}\sum_{t=0}^{T-1}\alpha_t\mathbb{E}[f(x_{t-1})-f(x_t)]\notag\\
%     &+\frac{(2s\beta+1)(G^2+\delta^2)\sum_{t=0}^{T-1}\alpha_t\alpha_{t-1}}{(1-\beta)^2}.
% \end{align*}
% Moreover, $\alpha_t=\hat{\alpha}_i (t_i\leq t< t_{i+1})$ implies that
% \begin{align*}
%     \frac{2}{1\!-\!\beta}\min_{t=0,...,T-1}\{\mathbb{E}[f(x_t)\!-\!f(x^*)]\}\sum_{t=0}^{T-1}\alpha_t\leq&\norm{x_0\!-\!x^*}^2\!+\!\frac{2\beta}{(1\!-\!\beta)^2}\sum_{i=0}^{n-1}\hat{\alpha}_i\mathbb{E}[f(x_{t_i})\!-\!f(x_{t_{i+1}})]\notag\\
%     &+\frac{(2s\beta+1)(G^2+\delta^2)\sum_{t=0}^{T-1}\alpha_t\alpha_{t-1}}{(1-\beta)^2}.
% \end{align*}
% Since $\mathbb{E}[f(x_{t_i})-f(x_{t_{i+1}})]$ is always upper-bounded by $f(x_0)-f(x^*)$, we have
% \begin{align*}
%     \frac{2}{1-\beta}\min_{t=0,...,T-1}\{\mathbb{E}[f(x_t)-f(x^*)]\}\sum_{t=0}^{T-1}\alpha_t\leq&\norm{x_0-x^*}^2+\frac{2\beta}{(1-\beta)^2}[f(x_0)-f(x^*)]\sum_{i=0}^{n-1}\hat{\alpha}_i\\
%     &+\frac{(2s\beta+1)(G^2+\delta^2)\sum_{t=0}^{T-1}\alpha_t\alpha_{t-1}}{(1-\beta)^2}.
% \end{align*}
% After simplifying, we have
% \begin{align}\label{ieq:min}
%     \min_{t=0,...,T-1}\{\mathbb{E}[f(x_t)-f(x^*)]\}\leq&\frac{(1-\beta)\norm{x_0-x^*}^2}{2\sum_{t=0}^{T-1}\alpha_t}+\frac{\beta[f(x_0)-f(x_T)]\sum_{i=0}^{n-1}\hat{\alpha}_i}{(1-\beta)\sum_{t=0}^{T-1}\alpha_t}\notag\\
%     &+\frac{(2s\beta+1)(G^2+\delta^2)\sum_{t=0}^{T-1}\alpha_t\alpha_{t-1}}{2(1-\beta)\sum_{t=0}^{T-1}\alpha_t}.
% \end{align}
% Because $\hat{\alpha}_i\leq (i+3)^{-1}$, $k_i\hat{\alpha}_i\geq (i+3)^{-\frac{1}{3}},$ $k_i\hat{\alpha}_i\hat{\alpha}_{i-1}\leq (i+2)^{-\frac{2}{3}},\quad \forall i=0,1,...,n-1(n\gg1)$,
% \begin{align}
%     \sum_{i=0}^{n-1}\hat{\alpha}_i&\leq\sum_{i=0}^{n-1}(i+3)^{-1}=\int_{0}^{n-1}(i+3)^{-1}=\log(n+2)-\log 2\label{eq:sumi}\\
%     \sum_{t=0}^{T-1}\alpha_t&=\sum_{i=0}^{n-1}k_i\hat{\alpha}_i\geq\sum_{i=0}^{n-1}(i+3)^{-\frac{1}{3}}=\int_{0}^{n-1}(i+3)^{-\frac{1}{3}}=\frac{3}{2}[(n+2)^{\frac{2}{3}}-2^{\frac{2}{3}}]\label{eq:sumt}\\
%     \sum_{t=0}^{T-1}\alpha_t\alpha_{t-1}&\leq\sum_{i=0}^{n-1}k_i\hat{\alpha}_i\hat{\alpha}_{i-1}\leq\sum_{i=0}^{n-1}(i+3)^{-\frac{2}{3}}=\int_{0}^{n-1}(i+3)^{-\frac{2}{3}}=3[(n+1)^{\frac{1}{3}}-1]\label{eq:sumt2}.
% \end{align}
% Finally, substitute (\ref{eq:sumi}-\ref{eq:sumt2}) into inequality (\ref{ieq:min}) to obtain the following
% \begin{align*}
%     \min_{t=0,...,T-1}\{\mathbb{E}[f(x_t)-f(x^*)]\}\leq&\frac{2\beta(f(x_0)-f(x^*))[\log(n+2)-\log 2]}{3(1-\beta)[(n+2)^{\frac{2}{3}}-2^{\frac{2}{3}}]}+\frac{(1-\beta)\norm{x_0-x^*}^2}{3[(n+2)^{\frac{2}{3}}-2^{\frac{2}{3}}]}\notag\\
%     &+\frac{(2s\beta+1)(G^2+\delta^2)[(n+1)^{\frac{1}{3}}-1]}{(1-\beta)[(n+2)^{\frac{2}{3}}-2^{\frac{2}{3}}]}.
% \end{align*}
% \end{proof}
% \begin{proof}[Proof for Theorem \ref{thm:sgdm_conv}]
% We denote $\mathcal{G}(x_t;\xi_t)=\mathcal{G}(x_t)=\mathcal{G}_t$. The update formula (\ref{eq:sgdm}) implies the following recursions:
% \begin{align}
%     x_{t+1}+p_{t+1}=&x_t+p_t-\frac{\alpha_t}{1-\beta}\mathcal{G}(x_t)\\
%     v_{t+1}=&\beta v_t+((1-\beta)s-1)\alpha_t\mathcal{G}(x_t),
% \end{align}
% where $v_t=\frac{1-\beta}{\beta}p_t$ and $p_t$ is given by
% \begin{equation}
% p_t=\left\{
% \begin{aligned}\label{eq:sgdm2}
%       &\frac{\beta}{1-\beta}(x_t-x_{t-1}+s\alpha_{t-1}\mathcal{G}(x_{t-1})), \quad k\geq1\\
%       &0, \quad k=0
% \end{aligned}
% \right.  .  
% \end{equation}
% Define $\delta_t=\mathcal{G}_t-\partial f(x_t)$ and let $x^*$ be the optimal point. From the above recursions we have
% \begin{align}
%     &\norm{x_{t+1}+p_{t+1}-x^*}^2\notag\\
%     =&\norm{x_t+p_t-x^*}^2\!-\!\frac{2\alpha_t}{1-\beta}(x_t+p_t-x^*)^T\mathcal{G}_t\!+\!\left(\frac{\alpha_t}{1-\beta}\right)^2\norm{\mathcal{G}_t}^2\notag\\
%     =&\norm{x_t+p_t-x^*}^2\!-\!\frac{2\alpha_t}{1-\beta}(x_t-x^*)^T\mathcal{G}_t\!-\!\frac{2\alpha_t\beta}{(1-\beta)^2}(x_{t}-x_{t-1})^T\mathcal{G}_t\notag\\
%     &-\frac{2s\alpha_t\alpha_{t-1}\beta}{(1-\beta)^2}\mathcal{G}_{t-1}^T\mathcal{G}_t\!+\!\left(\frac{\alpha_t}{1-\beta}\right)^2\norm{\mathcal{G}_t}^2\notag\\
%     =&\norm{x_t+p_t-x^*}^2-\frac{2\alpha_t}{1-\beta}(x_t-x^*)^T(\delta_t+\partial f(x_t))-\frac{2\alpha_t\beta}{(1-\beta)^2}(x_{t}-x_{t-1})^T(\delta_t+\partial f(x_t))\notag\\
%     &-\frac{2s\alpha_t\alpha_{t-1}\beta}{(1-\beta)^2}(\delta_{t-1}+\partial f(x_{t-1}))^T(\delta_t+\partial f(x_t))+\left(\frac{\alpha_t}{1-\beta}\right)^2\norm{\delta_t+\partial f(x_t)}^2.
% \end{align}
% Note that
% \begin{align*}
%     &\mathbb{E}[(x_t-x^*)^T(\delta_t+\partial f(x_t))]=\mathbb{E}[(x_t-x^*)^T\partial f(x_t)]\\
%     &\mathbb{E}[(x_{t}-x_{t-1})^T(\delta_t+\partial f(x_t))]=\mathbb{E}[(x_{t}-x_{t-1})^T\partial f(x_t)]\\
%     &\mathbb{E}[(\delta_{t-1}+\partial f(x_{t-1}))^T(\delta_t+\partial f(x_t))]=\mathbb{E}[(\delta_{t-1}+\partial f(x_{t-1}))^T\partial f(x_t)]=\mathbb{E}[\mathcal{G}_{t-1}^T\partial f(x_t)]\\
%     &\mathbb{E}[\norm{\delta_t+\partial f(x_t)}^2]=\mathbb{E}[\norm{\delta_t}^2]+\mathbb{E}[\norm{\partial f(x_t)}^2].
% \end{align*}
% Taking the expectation on both sides gives the following
% \begin{align}\label{eq:norm_exp}
%     &\mathbb{E}[\norm{x_{t+1}+p_{t+1}-x^*}^2]\notag\\
%     =&\mathbb{E}[\norm{x_t+p_t-x^*}^2]-\frac{2\alpha_t}{1-\beta}\mathbb{E}[(x_t-x^*)^T\partial f(x_t)]-\frac{2\alpha_t\beta}{(1-\beta)^2}\mathbb{E}[(x_{t}-x_{t-1})^T\partial f(x_t)]\notag\\
%     &-\frac{2s\alpha_t\alpha_{t-1}\beta}{(1-\beta)^2}\mathbb{E}[\mathcal{G}_{t-1}^T\partial f(x_t)]+\left(\frac{\alpha_t}{1-\beta}\right)^2(\mathbb{E}[\norm{\delta_t}^2]+\mathbb{E}[\norm{\partial f(x_t)}^2]).
% \end{align}
% Moreover, since f is convex,$\mathbb{E}\left[\norm{ \mathcal{G}(x;\xi)-\mathbb{E}[\mathcal{G}(x;\xi)]}\right]\leq\delta^2$, and $\norm{\nabla f(x)}\leq G$, then for any $x$
% \begin{align*}
%     &f(x_t)-f(x^*)\leq(x_t-x^*)^T\partial f(x_t)\\
%     &f(x_t)-f(x_{t-1})\leq(x_t-x_{t-1})^T\partial f(x_t)\\
%     &-\mathbb{E}[\mathcal{G}_{t-1}^T\partial f(x_t)]\leq\frac{\mathbb{E}[\norm{\mathcal{G}_{t-1}}^2+\norm{\partial f(x_t)}^2]}{2}\leq\delta^2/2+G^2\leq\delta^2+G^2\\
%     &\mathbb{E}[\norm{\delta_t}^2]\leq\delta^2,\quad\mathbb{E}[\norm{\partial f(x_t)}^2]\leq G^2.
% \end{align*}
% Therefore, (\ref{eq:norm_exp}) can be rewritten as
% \begin{align}\label{eq:exp_update}
%     \mathbb{E}[\norm{x_{t+1}+p_{t+1}-x^*}^2]\leq&\mathbb{E}[\norm{x_{t}+p_{t}-x^*}^2]-\frac{2\alpha_t}{1-\beta}\mathbb{E}[f(x_t)-f(x^*)]\\\notag
%     &-\frac{2\alpha_t\beta}{(1-\beta)^2}\mathbb{E}[f(x_t)-f(x_{t-1})]+\frac{2s\beta\alpha_t\alpha_{t-1}+\alpha_t^2}{(1-\beta)^2}(G^2+\delta^2).
% \end{align}
% Since $\hat{\alpha}_i$ is decreasing, it implies that $\alpha_t$ is non-increasing. Thus, (\ref{eq:exp_update}) could be upper-bounded as
% \begin{align}\label{eq:exp_update2}
%     \mathbb{E}[\norm{x_{t+1}+p_{t+1}-x^*}^2]\leq&\mathbb{E}[\norm{x_{t}+p_{t}-x^*}^2]-\frac{2\alpha_t}{1-\beta}\mathbb{E}[f(x_t)-f(x^*)]\\\notag
%     &-\frac{2\alpha_t\beta}{(1-\beta)^2}\mathbb{E}[f(x_t)-f(x_{t-1})]+\frac{(2s\beta+1)\alpha_t\alpha_{t-1}}{(1-\beta)^2}(G^2+\delta^2).
% \end{align}
% Taking $t=0,...,T-1$ and $x_{-1}=x_0$, and then summing all the inequalities gives
% \begin{align*}
%     \sum_{t=0}^{T-1}\mathbb{E}[\norm{x_{t+1}\!+\!p_{t+1}\!-\!x^*}^2]\leq&\sum_{t=0}^{T-1}\mathbb{E}[\norm{x_{t}+p_{t}-x^*}^2]-\sum_{t=0}^{T-1}\frac{2\alpha_t}{1-\beta}\mathbb{E}[f(x_t)-f(x^*)]\notag\\
%     &-\!\sum_{t=0}^{T-1}\frac{2\alpha_t\beta}{(1\!-\!\beta)^2}\mathbb{E}[f(x_t)\!-\!f(x_{t-1})]
%     \!+\!\frac{(2s\beta\!+\!1)(G^2\!+\!\delta^2)}{(1\!-\!\beta)^2}\sum_{t=0}^{T-1}\alpha_t\alpha_{t-1}.
% \end{align*}
% Therefore,
% \begin{align*}
%     \frac{2}{1\!-\!\beta}\sum_{t=0}^{T-1}\alpha_t\mathbb{E}[f(x_t)\!-\!f(x^*)]\leq&\norm{x_0\!-\!x^*}^2\!-\!\norm{x^{T}\!+\!p_{T}\!-\!x^*}\!+\!\frac{2\beta}{(1\!-\!\beta)^2}\sum_{t=0}^{T-1}\alpha_t\mathbb{E}[f(x_{t-1})\!-\!f(x_t)]\\
%     &+\frac{(2s\beta+1)(G^2+\delta^2)}{(1-\beta)^2}\sum_{t=0}^{T-1}\alpha_t\alpha_{t-1},
% \end{align*}
% since $\alpha_{T-1}\leq...\leq\alpha_1\leq\alpha_0<1$, $\min_{t=0,...,T-1}\{\mathbb{E}[f(x_t)-f(x^*)]\}\leq\mathbb{E}[f(x_t)-f(x^*)](\forall t=0,...,T-1)$. Then
% \begin{align*}
%     \frac{2}{1-\beta}\min_{t=0,...,T-1}\{\mathbb{E}[f(x_t)-f(x^*)]\}\sum_{t=0}^{T-1}\alpha_t\leq&\norm{x_0-x^*}^2+\frac{2\beta}{(1-\beta)^2}\sum_{t=0}^{T-1}\alpha_t\mathbb{E}[f(x_{t-1})-f(x_t)]\notag\\
%     &+\frac{(2s\beta+1)(G^2+\delta^2)\sum_{t=0}^{T-1}\alpha_t\alpha_{t-1}}{(1-\beta)^2}.
% \end{align*}
% Moreover, $\alpha_t=\hat{\alpha}_i (t_i\leq t< t_{i+1})$ implies that
% \begin{align*}
%     \frac{2}{1\!-\!\beta}\min_{t=0,...,T-1}\{\mathbb{E}[f(x_t)\!-\!f(x^*)]\}\sum_{t=0}^{T-1}\alpha_t\leq&\norm{x_0\!-\!x^*}^2\!+\!\frac{2\beta}{(1\!-\!\beta)^2}\sum_{i=0}^{n-1}\hat{\alpha}_i\mathbb{E}[f(x_{t_i})\!-\!f(x_{t_{i+1}})]\notag\\
%     &+\frac{(2s\beta+1)(G^2+\delta^2)\sum_{t=0}^{T-1}\alpha_t\alpha_{t-1}}{(1-\beta)^2}.
% \end{align*}
% Since $\mathbb{E}[f(x_{t_i})-f(x_{t_{i+1}})]$ is always upper-bounded by $f(x_0)-f(x^*)$, we have
% \begin{align*}
%     \frac{2}{1-\beta}\min_{t=0,...,T-1}\{\mathbb{E}[f(x_t)-f(x^*)]\}\sum_{t=0}^{T-1}\alpha_t\leq&\norm{x_0-x^*}^2+\frac{2\beta}{(1-\beta)^2}[f(x_0)-f(x^*)]\sum_{i=0}^{n-1}\hat{\alpha}_i\\
%     &+\frac{(2s\beta+1)(G^2+\delta^2)\sum_{t=0}^{T-1}\alpha_t\alpha_{t-1}}{(1-\beta)^2}.
% \end{align*}
% After simplification, we have
% \begin{align}\label{ieq:min}
%     \min_{t=0,...,T-1}\{\mathbb{E}[f(x_t)-f(x^*)]\}\leq&\frac{(1-\beta)\norm{x_0-x^*}^2}{2\sum_{t=0}^{T-1}\alpha_t}+\frac{\beta[f(x_0)-f(x^*)]\sum_{i=0}^{n-1}\hat{\alpha}_i}{(1-\beta)\sum_{t=0}^{T-1}\alpha_t}\notag\\
%     &+\frac{(2s\beta+1)(G^2+\delta^2)\sum_{t=0}^{T-1}\alpha_t\alpha_{t-1}}{2(1-\beta)\sum_{t=0}^{T-1}\alpha_t}.
% \end{align}
% Because $\hat{\alpha}_i\leq (i+2)^{-1}$, $k_i\hat{\alpha}_i\geq \kappa_1(i+2)^{-\frac{1}{3}},$ $k_i\hat{\alpha}_i\hat{\alpha}_{i-1}\leq \kappa_2(i+1)^{-\frac{2}{3}},\forall i=0,1,...,n-1(n\gg1)$,
% \begin{align}
%     \sum_{i=0}^{n-1}\hat{\alpha}_i&\leq\sum_{i=0}^{n-1}(i+2)^{-\frac{2}{3}}=\int_{0}^{n-1}(i+2)^{-\frac{2}{3}}=3[(n+1)^{\frac{1}{3}}-2^{\frac{1}{3}}]\label{eq:sumi}\\
%     \sum_{t=0}^{T-1}\alpha_t&=\sum_{i=0}^{n-1}k_i\hat{\alpha}_i\geq\sum_{i=0}^{n-1}\kappa_1(i+2)^{-\frac{1}{3}}=\kappa_1\int_{0}^{n-1}(i+2)^{-\frac{1}{2}}=\frac{3\kappa_1}{2}[(n+1)^{\frac{2}{3}}-2^{\frac{2}{3}}]\label{eq:sumt}\\
%     \sum_{t=0}^{T-1}\alpha_t\alpha_{t-1}&\leq\sum_{i=0}^{n-1}k_i\hat{\alpha}_i\hat{\alpha}_{i-1}\leq\kappa_2\sum_{i=0}^{n-1}(i+1)^{-1}=\kappa_2\int_{0}^{n-1}(i+1)^{-1}=\kappa_2\log n\label{eq:sumt2}.
% \end{align}
% Substituting (\ref{eq:sumi}-\ref{eq:sumt2}) into inequality (\ref{ieq:min}) gives 
% \begin{align*}
%     \min_{t=0,...,T-1}\{\mathbb{E}[f(x_t)-f(x^*)]\}\leq&\frac{2\beta(f(x_0)-f(x^*))[(n+1)^{\frac{1}{3}}-2^{\frac{1}{3}}]}{2\kappa_1(1-\beta)[(n+1)^{\frac{2}{3}}-2^{\frac{2}{3}}]}+\frac{(1-\beta)\norm{x_0-x^*}^2}{3\kappa_1[(n+1)^{\frac{2}{3}}-2^{\frac{2}{3}}]}\notag\\
%     &+\frac{(2s\beta+1)(G^2+\delta^2)\kappa_2\log n}{3(1-\beta)\kappa_1[(n+1)^{\frac{2}{3}}-2^{\frac{2}{3}}]}.
% \end{align*}
% \end{proof}

\begin{proof}[Proof for Theorem 2]
 We denote $\mathcal{G}(x_t;\xi_t)=\mathcal{G}(x_t)=\mathcal{G}_t$. The update formula (\ref{eq:sgdm}) implies the following recursions:
\begin{align}
    x_{t+1}+p_{t+1}=&x_t+p_t-\frac{\alpha_t}{1-\beta}\mathcal{G}(x_t)\\
    v_{t+1}=&\beta v_t+((1-\beta)s-1)\alpha_t\mathcal{G}(x_t),
\end{align}
where $v_t=\frac{1-\beta}{\beta}p_t$ and $p_t$ is given by
\begin{equation}
p_t=\left\{
\begin{aligned}\label{eq:sgdm3}
      &\frac{\beta}{1-\beta}(x_t-x_{t-1}+s\alpha_{t-1}\mathcal{G}(x_{t-1})), \quad t\geq1\\
      &0, \quad t=0
\end{aligned}
\right.  .  
\end{equation}
Define $\delta_t=\mathcal{G}_t-\partial f(x_t)$ and let $x^*$ be the optimal point. From the above recursions we have
\begin{align}
    &\norm{x_{t+1}+p_{t+1}-x^*}^2\notag\\
    =&\norm{x_t+p_t-x^*}^2\!-\!\frac{2\alpha_t}{1-\beta}(x_t+p_t-x^*)^T\mathcal{G}_t\!+\!\left(\frac{\alpha_t}{1-\beta}\right)^2\norm{\mathcal{G}_t}^2\notag\\
    =&\norm{x_t+p_t-x^*}^2\!-\!\frac{2\alpha_t}{1-\beta}(x_t-x^*)^T\mathcal{G}_t\!-\!\frac{2\alpha_t\beta}{(1-\beta)^2}(x_{t}-x_{t-1})^T\mathcal{G}_t\notag\\
    &-\frac{2s\alpha_t\alpha_{t-1}\beta}{(1-\beta)^2}\mathcal{G}_{t-1}^T\mathcal{G}_t\!+\!\left(\frac{\alpha_t}{1-\beta}\right)^2\norm{\mathcal{G}_t}^2\notag\\
    =&\norm{x_t+p_t-x^*}^2-\frac{2\alpha_t}{1-\beta}(x_t-x^*)^T(\delta_t+\partial f(x_t))-\frac{2\alpha_t\beta}{(1-\beta)^2}(x_{t}-x_{t-1})^T(\delta_t+\partial f(x_t))\notag\\
    &-\frac{2s\alpha_t\alpha_{t-1}\beta}{(1-\beta)^2}(\delta_{t-1}+\partial f(x_{t-1}))^T(\delta_t+\partial f(x_t))+\left(\frac{\alpha_t}{1-\beta}\right)^2\norm{\delta_t+\partial f(x_t)}^2.
\end{align}
Note that
\begin{align*}
    &\mathbb{E}[(x_t-x^*)^T(\delta_t+\partial f(x_t))]=\mathbb{E}[(x_t-x^*)^T\partial f(x_t)]\\
    &\mathbb{E}[(x_{t}-x_{t-1})^T(\delta_t+\partial f(x_t))]=\mathbb{E}[(x_{t}-x_{t-1})^T\partial f(x_t)]\\
    &\mathbb{E}[(\delta_{t-1}+\partial f(x_{t-1}))^T(\delta_t+\partial f(x_t))]=\mathbb{E}[(\delta_{t-1}+\partial f(x_{t-1}))^T\partial f(x_t)]=\mathbb{E}[\mathcal{G}_{t-1}^T\partial f(x_t)]\\
    &\mathbb{E}[\norm{\delta_t+\partial f(x_t)}^2]=\mathbb{E}[\norm{\delta_t}^2]+\mathbb{E}[\norm{\partial f(x_t)}^2].
\end{align*}
Taking the expectation on both sides gives the following
\begin{align}\label{eq:norm_exp2}
    &\mathbb{E}[\norm{x_{t+1}+p_{t+1}-x^*}^2]\notag\\
    =&\mathbb{E}[\norm{x_t+p_t-x^*}^2]-\frac{2\alpha_t}{1-\beta}\mathbb{E}[(x_t-x^*)^T\partial f(x_t)]-\frac{2\alpha_t\beta}{(1-\beta)^2}\mathbb{E}[(x_{t}-x_{t-1})^T\partial f(x_t)]\notag\\
    &-\frac{2s\alpha_t\alpha_{t-1}\beta}{(1-\beta)^2}\mathbb{E}[\mathcal{G}_{t-1}^T\partial f(x_t)]+\left(\frac{\alpha_t}{1-\beta}\right)^2(\mathbb{E}[\norm{\delta_t}^2]+\mathbb{E}[\norm{\partial f(x_t)}^2]).
\end{align}
Moreover, since $f$ is convex, $\mathbb{E}\left[\norm{ \mathcal{G}(x;\xi)-\mathbb{E}[\mathcal{G}(x;\xi)]}^2\right]\leq\delta^2$, and $\norm{\nabla f(x)}\leq G$, then for any $x$
\begin{align*}
    &f(x_t)-f(x^*)\leq(x_t-x^*)^T\partial f(x_t)\\
    &f(x_t)-f(x_{t-1})\leq(x_t-x_{t-1})^T\partial f(x_t)\\
    &-\mathbb{E}[\mathcal{G}_{t-1}^T\partial f(x_t)]\leq\frac{\mathbb{E}[\norm{\mathcal{G}_{t-1}}^2+\norm{\partial f(x_t)}^2]}{2}\leq\delta^2/2+G^2\leq\delta^2+G^2\\
    &\mathbb{E}[\norm{\delta_t}^2]\leq\delta^2,\quad\mathbb{E}[\norm{\partial f(x_t)}^2]\leq G^2.
\end{align*}
Therefore, (\ref{eq:norm_exp2}) can be rewritten as
\begin{align}\label{eq:exp_update2}
    \mathbb{E}[\norm{x_{t+1}+p_{t+1}-x^*}^2]\leq&\mathbb{E}[\norm{x_{t}+p_{t}-x^*}^2]-\frac{2\alpha_t}{1-\beta}\mathbb{E}[f(x_t)-f(x^*)]\\\notag
    &-\frac{2\alpha_t\beta}{(1-\beta)^2}\mathbb{E}[f(x_t)-f(x_{t-1})]+\frac{2s\beta\alpha_t\alpha_{t-1}+\alpha_t^2}{(1-\beta)^2}(G^2+\delta^2).
\end{align}
Since $\hat{\alpha}_i$ is decreasing, it implies that $\alpha_t$ is non-increasing. Thus, (\ref{eq:exp_update2}) could be upper-bounded as
\begin{align}\label{eq:exp_update2_relaxed}
    \mathbb{E}[\norm{x_{t+1}+p_{t+1}-x^*}^2]\leq&\mathbb{E}[\norm{x_{t}+p_{t}-x^*}^2]-\frac{2\alpha_t}{1-\beta}\mathbb{E}[f(x_t)-f(x^*)]\\\notag
    &-\frac{2\alpha_t\beta}{(1-\beta)^2}\mathbb{E}[f(x_t)-f(x_{t-1})]+\frac{(2s\beta+1)\alpha_t\alpha_{t-1}}{(1-\beta)^2}(G^2+\delta^2).
\end{align}
Taking $t=0,...,T-1$ and $x_{-1}=x_0$, and then summing all the inequalities gives
\begin{align*}
    \sum_{t=0}^{T-1}\mathbb{E}[\norm{x_{t+1}\!+\!p_{t+1}\!-\!x^*}^2]\leq&\sum_{t=0}^{T-1}\mathbb{E}[\norm{x_{t}+p_{t}-x^*}^2]-\sum_{t=0}^{T-1}\frac{2\alpha_t}{1-\beta}\mathbb{E}[f(x_t)-f(x^*)]\notag\\
    &-\!\sum_{t=0}^{T-1}\frac{2\alpha_t\beta}{(1\!-\!\beta)^2}\mathbb{E}[f(x_t)\!-\!f(x_{t-1})]
    \!+\!\frac{(2s\beta\!+\!1)(G^2\!+\!\delta^2)}{(1\!-\!\beta)^2}\sum_{t=0}^{T-1}\alpha_t\alpha_{t-1}.
\end{align*}
Therefore,
\begin{align*}
    \frac{2}{1\!-\!\beta}\sum_{t=0}^{T-1}\alpha_t\mathbb{E}[f(x_t)\!-\!f(x^*)]\leq&\norm{x_0\!-\!x^*}^2\!-\!\mathbb{E}[\norm{x_{T}\!+\!p_{T}\!-\!x^*}^2]\!+\!\frac{2\beta}{(1\!-\!\beta)^2}\sum_{t=0}^{T-1}\alpha_t\mathbb{E}[f(x_{t-1})\!-\!f(x_t)]\\
    &+\frac{(2s\beta+1)(G^2+\delta^2)}{(1-\beta)^2}\sum_{t=0}^{T-1}\alpha_t\alpha_{t-1},
\end{align*}
since $\alpha_{T-1}\leq...\leq\alpha_1\leq\alpha_0<1$, $\min_{t=0,...,T-1}\{\mathbb{E}[f(x_t)-f(x^*)]\}\leq\mathbb{E}[f(x_t)-f(x^*)](\forall t=0,...,T-1)$. Then
\begin{align*}
    \frac{2}{1-\beta}\min_{t=0,...,T-1}\{\mathbb{E}[f(x_t)-f(x^*)]\}\sum_{t=0}^{T-1}\alpha_t\leq&\norm{x_0-x^*}^2+\frac{2\beta}{(1-\beta)^2}\sum_{t=0}^{T-1}\alpha_t\mathbb{E}[f(x_{t-1})-f(x_t)]\notag\\
    &+\frac{(2s\beta+1)(G^2+\delta^2)\sum_{t=0}^{T-1}\alpha_t\alpha_{t-1}}{(1-\beta)^2}.
\end{align*}
Moreover, $\alpha_t=\hat{\alpha}_i (t_i\leq t< t_{i+1})$ implies that
\begin{align*}
    \frac{2}{1\!-\!\beta}\min_{t=0,...,T-1}\{\mathbb{E}[f(x_t)\!-\!f(x^*)]\}\sum_{t=0}^{T-1}\alpha_t\leq&\norm{x_0\!-\!x^*}^2\!+\!\frac{2\beta}{(1\!-\!\beta)^2}\sum_{i=0}^{n-1}\hat{\alpha}_i\mathbb{E}[f(x_{t_i})\!-\!f(x_{t_{i+1}})]\notag\\
    &+\frac{(2s\beta+1)(G^2+\delta^2)\sum_{t=0}^{T-1}\alpha_t\alpha_{t-1}}{(1-\beta)^2}.
\end{align*}
Since $\mathbb{E}[f(x_{t_i})-f(x_{t_{i+1}})]$ is always upper-bounded by $f(x_0)-f(x^*)$, we have
\begin{align*}
    \frac{2}{1-\beta}\min_{t=0,...,T-1}\{\mathbb{E}[f(x_t)-f(x^*)]\}\sum_{t=0}^{T-1}\alpha_t\leq&\norm{x_0-x^*}^2+\frac{2\beta}{(1-\beta)^2}[f(x_0)-f(x^*)]\sum_{i=0}^{n-1}\hat{\alpha}_i\\
    &+\frac{(2s\beta+1)(G^2+\delta^2)\sum_{t=0}^{T-1}\alpha_t\alpha_{t-1}}{(1-\beta)^2}.
\end{align*}
After simplification, we have
\begin{align}\label{ieq:min2}
    \min_{t=0,...,T-1}\{\mathbb{E}[f(x_t)-f(x^*)]\}\leq&\frac{(1-\beta)\norm{x_0-x^*}^2}{2\sum_{t=0}^{T-1}\alpha_t}+\frac{\beta[f(x_0)-f(x^*)]\sum_{i=0}^{n-1}\hat{\alpha}_i}{(1-\beta)\sum_{t=0}^{T-1}\alpha_t}\notag\\
    &+\frac{(2s\beta+1)(G^2+\delta^2)\sum_{t=0}^{T-1}\alpha_t\alpha_{t-1}}{2(1-\beta)\sum_{t=0}^{T-1}\alpha_t}.
\end{align}
Because $\hat{\alpha}_i\leq (i+2)^{-1}$, $k_i\hat{\alpha}_i\geq \kappa_1$, and $k_i\hat{\alpha}_i\hat{\alpha}_{i-1}\leq \kappa_2(i+1)^{-1}$ for all $i=0,1,\dots,n-1$ ($n\gg1$),
\begin{align}
    \sum_{i=0}^{n-1}\hat{\alpha}_i&\leq\sum_{i=0}^{n-1}(i+2)^{-1}\approx\int_{0}^{n-1}(i+2)^{-1}\,di=\log(n+1)-\log(2)\label{eq:sumi_2}\\
    \sum_{t=0}^{T-1}\alpha_t&=\sum_{i=0}^{n-1}k_i\hat{\alpha}_i\geq\sum_{i=0}^{n-1}\kappa_1=\kappa_1n\label{eq:sumt}\\
    \sum_{t=0}^{T-1}\alpha_t\alpha_{t-1}&\leq\sum_{i=0}^{n-1}k_i\hat{\alpha}_i\hat{\alpha}_{i-1}\leq\kappa_2\sum_{i=0}^{n-1}(i+1)^{-1}\approx\kappa_2\int_{0}^{n-1}(i+1)^{-1}\,di=\kappa_2\log n\label{eq:sumt2_2}.
\end{align}
Substituting (\ref{eq:sumi_2}-\ref{eq:sumt2_2}) into inequality (\ref{ieq:min2}) gives 
\begin{align*}
    \min_{t=0,...,T-1}\{\mathbb{E}[f(x_t)-f(x^*)]\}\leq&\frac{\beta(f(x_0)-f(x^*))[\log(n+1)-\log 2]}{\kappa_1(1-\beta)n}+\frac{(1-\beta)\norm{x_0-x^*}^2}{2\kappa_1n}\notag\\
    &+\frac{(2s\beta+1)(G^2+\delta^2)\kappa_2\log n}{2(1-\beta)\kappa_1n}.
\end{align*}
\end{proof}

\subsection{Proof for Theorem \ref{thm:conv}}
First, we introduce Lemma~\ref{lma:k_bound2} which will be used in the proof for Theorem \ref{thm:conv}. We prove this lemma later in this section.

% \begin{lma}\label{lma:k_bound}
% If sequence $\{\hat{\alpha}_i\}_{i=-1}^{n-1}\subset(0,1)$ and $\{k_i\}_{i=0}^n\subset\mathbb{N}$ satisfy:
% \begin{align*}
%     \hat{\alpha}_i=\frac{\gamma\epsilon}{i+3},\quad k_i=\frac{1}{\gamma}+\frac{1}{\gamma\epsilon\hat{\alpha}_i},
% \end{align*}
% where constants $\gamma,\epsilon$ satisfy: $\gamma\epsilon<1$ and $\gamma\epsilon^2<2^{\frac{4}{3}}-2$, then 
% \begin{align}\label{ieq:ki_con2}
%     \hat{\alpha}_i\leq (i+3)^{-1}, \quad k_i\hat{\alpha}_i\geq (i+3)^{-\frac{1}{3}},\quad k_i\hat{\alpha}_i\hat{\alpha}_{i-1}\leq (i+2)^{-\frac{2}{3}},\quad \forall i=0,1,...,n-1.
% \end{align}
% Moreover, suppose $T=\sum_{i=0}^{n-1}k_i$, then
% \begin{align}\label{eq:T}
%     T=\frac{n^2+(2\gamma\epsilon^2+5)n}{2\gamma^2\epsilon^2}.
% \end{align}
% \end{lma}

% \begin{lma}\label{lma:k_bound}
% If sequences $\{\hat{\alpha}_i\}_{i=-1}^{n-1}\subset(0,1)$ and $\{k_i\}_{i=0}^n\subset\mathbb{N}$ satisfy:
% \begin{align*}
%     \hat{\alpha}_i=(i+2)^{-\frac{2}{3}},\quad \frac{\kappa_1}{\sqrt{\hat{\alpha}_i}}\leq k_i\leq\frac{\kappa_2}{\sqrt{\hat{\alpha}_i}},
% \end{align*}
% where $\kappa_1$, $\kappa_2$ are constants, then 
% \begin{align}\label{ieq:ki_con2}
%     \hat{\alpha}_i\leq (i+2)^{-\frac{2}{3}}, \quad k_i\hat{\alpha}_i\geq \kappa_1(i+2)^{-\frac{1}{3}},\quad k_i\hat{\alpha}_i\hat{\alpha}_{i-1}\leq \kappa_2(i+1)^{-1},\quad \forall i=0,1,...,n-1.
% \end{align}
% Moreover, suppose $T=\sum_{i=0}^{n-1}k_i$. If $n\gg 1$ the following holds
% \begin{align}\label{eq:T}
%     \frac{3\kappa_1}{5}[(n+1)^{\frac{5}{3}}-2^{\frac{5}{3}}]\leq T\leq\frac{3\kappa_2}{5}[(n+1)^{\frac{5}{3}}-2^{\frac{5}{3}}].
% \end{align}
% \end{lma}

% \begin{proof}[Proof for Theorem \ref{thm:conv}]
% Define the gaps of partition $\Pi:0=t_0<t_1<...<t_n=T$ derived from Algorithm \ref{alg:LRdrop} as
% $$k_i=t_{i+1}-t_i,\quad\forall i=0,...,n-1.$$
% Therefore, based on the model of the angle velocity $v_\alpha(t)$ we get
% \begin{align*}
%     v_{\hat{\alpha}_i}(k_i)=\frac{\pi}{2}\Longrightarrow k_i=\frac{1}{\gamma}+\frac{1}{\gamma\epsilon\hat{\alpha}_i}.
% \end{align*}
% By Lemma \ref{lma:k_bound}, we have
% \begin{align}\label{ieq:ki_co3}
%     \hat{\alpha}_i\leq (i+3)^{-1}, \quad k_i\hat{\alpha}_i\geq (i+3)^{-\frac{1}{3}},\quad k_i\hat{\alpha}_i\hat{\alpha}_{i-1}\leq (i+2)^{-\frac{2}{3}},\quad \forall i=0,1,...,n-1.
% \end{align}
% Combining (\ref{ieq:ki_co3}) with Theorem \ref{thm:sgdm_conv} allows us to conclude
% that the sequence $\{x_t\}_{t=0}^{T-1}$ generated using Algorithm~\ref{alg:LRdrop} satisfies
% \begin{align}\label{ieq:conv_n}
%     \min_{t=0,...,T-1}\{\mathbb{E}[f(x_t)-f(x^*)]\}\leq&\frac{2\beta(f(x_0)-f(x^*))[\log(n+2)-\log 2]}{3(1-\beta)[(n+2)^{\frac{2}{3}}-2^{\frac{2}{3}}]}+\frac{(1-\beta)\norm{x_0-x^*}^2}{3[(n+2)^{\frac{2}{3}}-2^{\frac{2}{3}}]}\notag\\
%     &+\frac{(2s\beta+1)(G^2+\delta^2)[(n+1)^{\frac{1}{3}}-1]}{(1-\beta)[(n+2)^{\frac{2}{3}}-2^{\frac{2}{3}}]}.
% \end{align}
% By Equation (\ref{eq:T}), Lemma~\ref{lma:k_bound}, and $\gamma\epsilon<1$, we have that
% \begin{align*}
%     \frac{n^2}{2\gamma^2\epsilon^2}\leq T=\frac{n^2+(2\gamma\epsilon^2+5)n}{2\gamma^2\epsilon^2}\leq\frac{(\gamma\epsilon^2+3)n^2}{\gamma^2\epsilon^2}\leq\frac{4n^2}{\gamma^2\epsilon^2}.
% \end{align*}
% Therefore
% \begin{align}\label{ieq:n_bound}
%     \frac{\gamma\epsilon}{4}\sqrt{T}\leq n\leq \sqrt{2}\gamma\epsilon\sqrt{T}.
% \end{align}
% Combine (\ref{ieq:n_bound}) with (\ref{ieq:conv_n}) to obtain
% \begin{align*}
%     \min_{t=0,...,T-1}\{\mathbb{E}[f(x_t)\!-\!f(x^*)]\}\leq&\frac{2\beta(f(x_0)\!-\!f(x^*))[\log(\sqrt{2}\gamma\epsilon\sqrt{T})\!-\!\log 2]}{3(1-\beta)[(\frac{\gamma\epsilon\sqrt{T}}{4}+2)^{\frac{2}{3}}-2^{\frac{2}{3}}]}\!+\!\frac{(1-\beta)\norm{x_0-x^*}^2}{3[(\frac{\gamma\epsilon\sqrt{T}}{4}+2)^{\frac{2}{3}}-2^{\frac{2}{3}}]}\notag\\
%     &+\frac{(2s\beta+1)(G^2+\delta^2)[(\sqrt{2}\gamma\epsilon\sqrt{T}+1)^{\frac{1}{3}}-1]}{(1-\beta)[(\frac{\gamma\epsilon\sqrt{T}}{4}+2)^{\frac{2}{3}}-2^{\frac{2}{3}}]}\\
%     =&O\left(T^{-\frac{1}{6}}\right).
% \end{align*}
% \end{proof}
% \begin{proof}[Proof for Theorem \ref{thm:conv}]
% The derivative of the angular velocity model is:
% $$v_{\alpha}'(t)=\frac{\pi(1+\epsilon\alpha)}{2\gamma t^2}.$$
% Define the gaps of partition $\Pi:0=t_0<t_1<...<t_n=T$ derived from the Algorithm \ref{alg:LRdrop} as
% $$k_i=t_{i+1}-t_i,\quad\forall i=0,...,n-1.$$
% Since we drop the learning rate every time the derivative of the angular velocity is smaller that $\delta$, we have
% \begin{align*}
%     v_{\hat{\alpha}_i}'(k_i)=\tau\Longrightarrow k_i=\sqrt{\frac{\pi(1+\epsilon\alpha)}{2\gamma\tau\hat{\alpha}_i}}.
% \end{align*}
% Since $\epsilon\in(0,1/3\hat{\alpha}_0)$, we have
% \begin{align}\label{ki_bound}
%     \sqrt{\frac{\pi}{2\gamma\tau\hat{\alpha}_i}}\leq k_i\leq\sqrt{\frac{2\pi}{3\gamma\tau\hat{\alpha}_i}}.
% \end{align}
% Define $\kappa_1=\sqrt{\frac{\pi}{2\gamma\tau}}$ and $\kappa_2=\sqrt{\frac{2\pi}{3\gamma\tau}}$. By Lemma \ref{lma:k_bound}, we have
% \begin{align}\label{ieq:ki_co3}
%     \hat{\alpha}_i\leq (i+2)^{-\frac{2}{3}}, \quad k_i\hat{\alpha}_i\geq \kappa_1(i+2)^{-\frac{1}{3}},\quad k_i\hat{\alpha}_i\hat{\alpha}_{i-1}\leq \kappa_2(i+1)^{-1},\quad \forall i=0,1,...,n-1.
% \end{align}
% Then, by combining (\ref{ieq:ki_co3}) with Theorem \ref{thm:sgdm_conv} we could conclude
% that the sequence $\{x_t\}_{t=0}^{T-1}$ generated by the Algorithm \ref{alg:LRdrop} satisfies
% \begin{align}
%     \min_{t=0,...,T-1}\{\mathbb{E}[f(x_t)-f(x^*)]\}\leq&\frac{2\beta(f(x_0)-f(x^*))[(n+1)^{\frac{1}{3}}-2^{\frac{1}{3}}]}{2\kappa_1(1-\beta)[(n+1)^{\frac{2}{3}}-2^{\frac{2}{3}}]}+\frac{(1-\beta)\norm{x_0-x^*}^2}{3\kappa_1[(n+1)^{\frac{2}{3}}-2^{\frac{2}{3}}]}\notag\\
%     &+\frac{(2s\beta+1)(G^2+\delta^2)\kappa_2\log n}{3(1-\beta)\kappa_1[(n+1)^{\frac{2}{3}}-2^{\frac{2}{3}}]}.\label{ieq:conv_n}
% \end{align}
% By Equation (\ref{eq:T}) in Lemma \ref{lma:k_bound} we have that
% \begin{align*}
%     \frac{3\kappa_1}{5}(n-1)^{\frac{5}{3}}\leq\frac{3\kappa_1}{5}[(n+1)^{\frac{5}{3}}-2^{\frac{5}{3}}]\leq T\leq\frac{3\kappa_2}{5}[(n+1)^{\frac{5}{3}}-2^{\frac{5}{3}}]\leq\frac{3\kappa_2}{5}(n+1)^{\frac{5}{3}}.
% \end{align*}
% Therefore
% \begin{align}\label{ieq:n_bound}
%   (\frac{5T}{3\kappa_2})^{\frac{3}{5}}-1\leq n\leq (\frac{5T}{3\kappa_1})^{\frac{3}{5}}+1.
% \end{align}
% Combining (\ref{ieq:n_bound}) with (\ref{ieq:conv_n}) gives
% \begin{align*}
%     \min_{t=0,...,T-1}\{\mathbb{E}[f(x_t)-f(x^*)]\}\leq&\frac{2\beta(f(x_0)-f(x^*))[((\frac{5T}{3\kappa_1})^{\frac{3}{5}}+1+1)^{\frac{1}{3}}-2^{\frac{1}{3}}]}{2\kappa_1(1-\beta)[((\frac{5T}{3\kappa_2})^{\frac{3}{5}}-1+1)^{\frac{2}{3}}-2^{\frac{2}{3}}]}+\frac{(1-\beta)\norm{x_0-x^*}^2}{3\kappa_1[((\frac{5T}{3\kappa_2})^{\frac{3}{5}}-1+1)^{\frac{2}{3}}-2^{\frac{2}{3}}]}\notag\\
%     &+\frac{(2s\beta+1)(G^2+\delta^2)\kappa_2\log ((\frac{5T}{3\kappa_1})^{\frac{3}{5}}+1)}{3(1-\beta)\kappa_1[((\frac{5T}{3\kappa_2})^{\frac{3}{5}}-1+1)^{\frac{2}{3}}-2^{\frac{2}{3}}]}\notag\\
%     =&\frac{2\beta(f(x_0)-f(x^*))[(\frac{5T}{3\kappa_1})^{\frac{3}{5}}+2)^{\frac{1}{3}}-2^{\frac{1}{3}}]}{2\kappa_1(1-\beta)[(\frac{5T}{3\kappa_2})^{\frac{2}{5}}-2^{\frac{2}{3}}]}+\frac{(1-\beta)\norm{x_0-x^*}^2}{3\kappa_1[(\frac{5T}{3\kappa_2})^{\frac{2}{5}}-2^{\frac{2}{3}}]}\notag\\
%     &+\frac{(2s\beta+1)(G^2+\delta^2)\kappa_2\log ((\frac{5T}{3\kappa_1})^{\frac{3}{5}}+1)}{3(1-\beta)\kappa_1[(\frac{5T}{3\kappa_2})^{\frac{2}{5}}-2^{\frac{2}{3}}]}\\
%     =&O\left(T^{-\frac{1}{5}}\right).
% \end{align*}
% \end{proof}
% {\color{red}
% \begin{proof}[Proof for Theorem \ref{thm:conv}]
% The derivative of the angular velocity model is:
% $$v_{\alpha}'(t)=\frac{\pi(1+\epsilon\alpha)}{2\sqrt{\gamma\alpha} (t+1/\sqrt{\gamma\alpha})^2}.$$
% Define the gaps of partition $\Pi:0=t_0<t_1<...<t_n=T$ derived from the Algorithm \ref{alg:LRdrop} as
% $$k_i=t_{i+1}-t_i,\quad\forall i=0,...,n-1.$$
% Since we drop the learning rate every time the derivative of the angular velocity is smaller that the threshold $\tau_i=\min\{\tau_0,\sqrt{\gamma\hat{\alpha}_i}/2\}$, we have
% \begin{align*}
%     v_{\hat{\alpha}_i}'(k_i)=\tau_i\Longrightarrow k_i=(\gamma\hat{\alpha}_i)^{-\frac{1}{4}}\left[\sqrt{\frac{\pi(1+\epsilon\hat{\alpha}_i)}{2\tau_i}}-(\gamma\hat{\alpha}_i)^{-\frac{1}{4}}\right].
% \end{align*}

% \begin{itemize}
%     \item[i)] From $\tau_i=\min\{\tau_0,\sqrt{\gamma\hat{\alpha}_i}/2\}$, we have $\tau_i\leq\sqrt{\gamma\hat{\alpha}_i}/2$. Therefore,
%     \begin{align}
%         k_i\geq&(\gamma\hat{\alpha}_i)^{-\frac{1}{4}}\left[\sqrt{\pi(1+\epsilon\hat{\alpha}_i)}\times(\gamma\hat{\alpha}_i)^{-\frac{1}{4}}-(\gamma\hat{\alpha}_i)^{-\frac{1}{4}}\right]\notag\\
%         =&(\gamma\hat{\alpha}_i)^{-\frac{1}{2}}\left[\sqrt{\pi(1+\epsilon\hat{\alpha}_i)}-1\right]\notag\\
%         \geq&\frac{\sqrt{\pi}-1}{\sqrt{\gamma \hat{\alpha}_i}}
%     \end{align}
%     \item[ii)] From $\tau_i=\min\{\tau_0,\sqrt{\gamma\hat{\alpha}_i}/2\}$, we have
%     \begin{align}
%         k_i\leq&(\gamma\hat{\alpha}_i)^{-\frac{1}{4}}\times \sqrt{\frac{\pi(1+\epsilon\hat{\alpha}_i)}{2\tau_i}}\notag\\
%         =&(\gamma\hat{\alpha}_i)^{-\frac{1}{4}}\max\left\{\sqrt{\frac{\pi(1+\epsilon\hat{\alpha}_i)}{2\tau_0}},\sqrt{\pi(1+\epsilon\hat{\alpha}_i)}(\gamma\hat{\alpha}_i)^{-\frac{1}{4}}\right\}\notag\\
%         =&\max\left\{(\gamma\hat{\alpha}_i)^{-\frac{1}{4}}\sqrt{\frac{\pi(1+\epsilon\hat{\alpha}_i)}{2\tau_0}},(\gamma\hat{\alpha}_i)^{-\frac{1}{2}}\sqrt{\pi(1+\epsilon\hat{\alpha}_i)}\right\}\notag\\
%         \leq&(\gamma\hat{\alpha}_i)^{-\frac{1}{2}}\max\left\{\sqrt{\frac{\pi(1+\epsilon\hat{\alpha}_i)}{2\tau_0}},\sqrt{\pi(1+\epsilon\hat{\alpha}_i)}\right\}\notag.
%     \end{align}
%     Since $\epsilon\in(0,\frac{1}{3\hat{\alpha}_0})$ and $\tau_0<2$, we could conclude
%     \begin{align*}
%         k_i\leq(\gamma\hat{\alpha}_i)^{-\frac{1}{2}}\max\left\{\sqrt{\frac{2\pi}{3\tau_0}},\sqrt{\frac{4\pi}{3}}\right\}\leq\sqrt{\frac{2\pi}{3\gamma\tau_0\hat{\alpha}_i}}.
%     \end{align*}
% \end{itemize}
% Combine i) and ii), we have
% \begin{align}
%     \frac{\sqrt{\pi}-1}{\sqrt{\gamma}}\times\frac{1}{\sqrt{\hat{\alpha}_i}}\leq k_i\leq\sqrt{\frac{2\pi}{3\gamma\tau_0}}\times\frac{1}{\sqrt{\hat{\alpha}_i}}.
% \end{align}
% Define $\kappa_1=\frac{\sqrt{\pi}-1}{\sqrt{\gamma}}$ and $\kappa_2=\sqrt{\frac{2\pi}{3\gamma\tau_0}}$. By Lemma \ref{lma:k_bound}, we have
% \begin{align}\label{ieq:ki_co3}
%     \hat{\alpha}_i\leq (i+2)^{-\frac{2}{3}}, \quad k_i\hat{\alpha}_i\geq \kappa_1(i+2)^{-\frac{1}{3}},\quad k_i\hat{\alpha}_i\hat{\alpha}_{i-1}\leq \kappa_2(i+1)^{-1},\quad \forall i=0,1,...,n-1.
% \end{align}
% Then, by combining (\ref{ieq:ki_co3}) with Theorem \ref{thm:sgdm_conv} we could conclude
% that the sequence $\{x_t\}_{t=0}^{T-1}$ generated by the Algorithm \ref{alg:LRdrop} satisfies
% \begin{align}
%     \min_{t=0,...,T-1}\{\mathbb{E}[f(x_t)-f(x^*)]\}\leq&\frac{2\beta(f(x_0)-f(x^*))[(n+1)^{\frac{1}{3}}-2^{\frac{1}{3}}]}{2\kappa_1(1-\beta)[(n+1)^{\frac{2}{3}}-2^{\frac{2}{3}}]}+\frac{(1-\beta)\norm{x_0-x^*}^2}{3\kappa_1[(n+1)^{\frac{2}{3}}-2^{\frac{2}{3}}]}\notag\\
%     &+\frac{(2s\beta+1)(G^2+\delta^2)\kappa_2\log n}{3(1-\beta)\kappa_1[(n+1)^{\frac{2}{3}}-2^{\frac{2}{3}}]}.\label{ieq:conv_n}
% \end{align}
% By Equation (\ref{eq:T}) in Lemma \ref{lma:k_bound} we have that
% \begin{align*}
%     \frac{3\kappa_1}{5}(n-1)^{\frac{5}{3}}\leq\frac{3\kappa_1}{5}[(n+1)^{\frac{5}{3}}-2^{\frac{5}{3}}]\leq T\leq\frac{3\kappa_2}{5}[(n+1)^{\frac{5}{3}}-2^{\frac{5}{3}}]\leq\frac{3\kappa_2}{5}(n+1)^{\frac{5}{3}}.
% \end{align*}
% Therefore
% \begin{align}\label{ieq:n_bound}
%   (\frac{5T}{3\kappa_2})^{\frac{3}{5}}-1\leq n\leq (\frac{5T}{3\kappa_1})^{\frac{3}{5}}+1.
% \end{align}
% Combining (\ref{ieq:n_bound}) with (\ref{ieq:conv_n}) gives
% \begin{align*}
%     \min_{t=0,...,T-1}\{\mathbb{E}[f(x_t)-f(x^*)]\}\leq&\frac{2\beta(f(x_0)-f(x^*))[((\frac{5T}{3\kappa_1})^{\frac{3}{5}}+1+1)^{\frac{1}{3}}-2^{\frac{1}{3}}]}{2\kappa_1(1-\beta)[((\frac{5T}{3\kappa_2})^{\frac{3}{5}}-1+1)^{\frac{2}{3}}-2^{\frac{2}{3}}]}+\frac{(1-\beta)\norm{x_0-x^*}^2}{3\kappa_1[((\frac{5T}{3\kappa_2})^{\frac{3}{5}}-1+1)^{\frac{2}{3}}-2^{\frac{2}{3}}]}\notag\\
%     &+\frac{(2s\beta+1)(G^2+\delta^2)\kappa_2\log ((\frac{5T}{3\kappa_1})^{\frac{3}{5}}+1)}{3(1-\beta)\kappa_1[((\frac{5T}{3\kappa_2})^{\frac{3}{5}}-1+1)^{\frac{2}{3}}-2^{\frac{2}{3}}]}\notag\\
%     =&\frac{2\beta(f(x_0)-f(x^*))[(\frac{5T}{3\kappa_1})^{\frac{3}{5}}+2)^{\frac{1}{3}}-2^{\frac{1}{3}}]}{2\kappa_1(1-\beta)[(\frac{5T}{3\kappa_2})^{\frac{2}{5}}-2^{\frac{2}{3}}]}+\frac{(1-\beta)\norm{x_0-x^*}^2}{3\kappa_1[(\frac{5T}{3\kappa_2})^{\frac{2}{5}}-2^{\frac{2}{3}}]}\notag\\
%     &+\frac{(2s\beta+1)(G^2+\delta^2)\kappa_2\log ((\frac{5T}{3\kappa_1})^{\frac{3}{5}}+1)}{3(1-\beta)\kappa_1[(\frac{5T}{3\kappa_2})^{\frac{2}{5}}-2^{\frac{2}{3}}]}\\
%     =&O\left(T^{-\frac{1}{5}}\right).
% \end{align*}
% \end{proof}}

\begin{lma}\label{lma:k_bound2}
If sequences $\{\hat{\alpha}_i\}_{i=-1}^{n-1}\subset(0,1)$ and $\{k_i\}_{i=0}^n\subset\mathbb{N}$ satisfy:
\begin{align*}
    \hat{\alpha}_i=(i+2)^{-1},\quad \frac{\kappa_1}{\hat{\alpha}_i}\leq k_i\leq\frac{\kappa_2}{\hat{\alpha}_i},
\end{align*}
where $\kappa_1$, $\kappa_2$ are constants, then 
\begin{align}\label{ieq:ki_con2_2}
    \hat{\alpha}_i\leq (i+2)^{-1}, \quad k_i\hat{\alpha}_i\geq \kappa_1,\quad k_i\hat{\alpha}_i\hat{\alpha}_{i-1}\leq \kappa_2(i+1)^{-1},\quad \forall i=0,1,...,n-1.
\end{align}
Moreover, suppose $T=\sum_{i=0}^{n-1}k_i$. If $n\gg 1$ the following holds
\begin{align}\label{eq:T2}
    \frac{\kappa_1n(n+3)}{2}\leq T\leq\frac{\kappa_2n(n+3)}{2}.
\end{align}
\end{lma}

\begin{proof}[Proof for Theorem \ref{thm:conv}]
The derivative of the angular velocity model is:
$$v_{\alpha}'(t)=\frac{\pi(1+\epsilon\alpha)}{2\gamma\alpha(t+1/\gamma\alpha)^2}.$$
Define the gaps of partition $\Pi:0=t_0<t_1<...<t_n=T$ derived from the Algorithm \ref{alg:LRdrop} as
$$k_i=t_{i+1}-t_i,\quad\forall i=0,...,n-1.$$
Since we drop the learning rate every time the derivative of the angular velocity is smaller than the threshold $\tau_i=\min\{\tau_0,\gamma\hat{\alpha}_i/2\}$, we have
\begin{align*}
    v_{\hat{\alpha}_i}'(k_i)=\tau_i\Longrightarrow k_i=(\gamma\hat{\alpha}_i)^{-\frac{1}{2}}\left[\sqrt{\frac{\pi(1+\epsilon\hat{\alpha}_i)}{2\tau_i}}-(\gamma\hat{\alpha}_i)^{-\frac{1}{2}}\right].
\end{align*}

\begin{itemize}
    \item[i)] From $\tau_i=\min\{\tau_0,\gamma\hat{\alpha}_i/2\}$, we have $\tau_i\leq\gamma\hat{\alpha}_i/2$. Therefore,
    \begin{align}
        k_i\geq&(\gamma\hat{\alpha}_i)^{-\frac{1}{2}}\left[\sqrt{\pi(1+\epsilon\hat{\alpha}_i)}\times(\gamma\hat{\alpha}_i)^{-\frac{1}{2}}-(\gamma\hat{\alpha}_i)^{-\frac{1}{2}}\right]\notag\\
        =&\frac{1}{\gamma\hat{\alpha}_i}\left[\sqrt{\pi(1+\epsilon\hat{\alpha}_i)}-1\right]\notag\\
        \geq&\frac{\sqrt{\pi}-1}{\gamma \hat{\alpha}_i}
    \end{align}
    \item[ii)] From $\tau_i=\min\{\tau_0,\gamma\hat{\alpha}_i/2\}$, we have
    \begin{align}
        k_i\leq&(\gamma\hat{\alpha}_i)^{-\frac{1}{2}}\times \sqrt{\frac{\pi(1+\epsilon\hat{\alpha}_i)}{2\tau_i}}\notag\\
        =&(\gamma\hat{\alpha}_i)^{-\frac{1}{2}}\max\left\{\sqrt{\frac{\pi(1+\epsilon\hat{\alpha}_i)}{2\tau_0}},\sqrt{\pi(1+\epsilon\hat{\alpha}_i)}(\gamma\hat{\alpha}_i)^{-\frac{1}{2}}\right\}\notag\\
        =&\max\left\{(\gamma\hat{\alpha}_i)^{-\frac{1}{2}}\sqrt{\frac{\pi(1+\epsilon\hat{\alpha}_i)}{2\tau_0}},\frac{1}{\gamma\hat{\alpha}_i}\sqrt{\pi(1+\epsilon\hat{\alpha}_i)}\right\}\notag\\
        \leq&\frac{1}{\gamma\hat{\alpha}_i}\max\left\{\sqrt{\frac{\pi(1+\epsilon\hat{\alpha}_i)}{2\tau_0}},\sqrt{\pi(1+\epsilon\hat{\alpha}_i)}\right\}\notag.
    \end{align}
    Since $\epsilon\in(0,\frac{1}{3\hat{\alpha}_0})$ and $\tau_0\leq\frac{1}{2}$, we could conclude
    \begin{align*}
        k_i\leq\frac{1}{\gamma\hat{\alpha}_i}\max\left\{\sqrt{\frac{2\pi}{3\tau_0}},\sqrt{\frac{4\pi}{3}}\right\}\leq\frac{1}{\gamma\hat{\alpha}_i}\sqrt{\frac{2\pi}{3\tau_0}}.
    \end{align*}
\end{itemize}
Combining i) and ii), we have
\begin{align}
    \frac{\sqrt{\pi}-1}{\gamma}\times\frac{1}{\hat{\alpha}_i}\leq k_i\leq\frac{1}{\gamma}\sqrt{\frac{2\pi}{3\tau_0}}\times\frac{1}{\hat{\alpha}_i}.
\end{align}
Define $\kappa_1=\frac{\sqrt{\pi}-1}{\gamma}$ and $\kappa_2=\frac{1}{\gamma}\sqrt{\frac{2\pi}{3\tau_0}}$. By Lemma \ref{lma:k_bound2}, we have
\begin{align}\label{ieq:ki_co3_2}
    \hat{\alpha}_i\leq (i+2)^{-1}, \quad k_i\hat{\alpha}_i\geq \kappa_1,\quad k_i\hat{\alpha}_i\hat{\alpha}_{i-1}\leq \kappa_2(i+1)^{-1},\quad \forall i=0,1,...,n-1.
\end{align}
Then, by combining (\ref{ieq:ki_co3_2}) with Theorem~\ref{thm:sgdm_conv}, we conclude
that the sequence $\{x_t\}_{t=0}^{T-1}$ generated by Algorithm~\ref{alg:LRdrop} satisfies
\begin{align}\label{ieq:conv_n2}
    \min_{t=0,...,T-1}\{\mathbb{E}[f(x_t)-f(x^*)]\}\leq&\frac{\beta(f(x_0)-f(x^*))[\log(n+1)-\log 2]}{\kappa_1(1-\beta)n}+\frac{(1-\beta)\norm{x_0-x^*}^2}{2\kappa_1n}\notag\\
    &+\frac{(2s\beta+1)(G^2+\delta^2)\kappa_2\log n}{2(1-\beta)\kappa_1n}.
\end{align}
By Equation (\ref{eq:T2}) in Lemma \ref{lma:k_bound2} we have that
\begin{align*}
    \frac{\kappa_1n(n+3)}{2}\leq T\leq\frac{\kappa_2n(n+3)}{2}.
\end{align*}
Therefore
\begin{align}\label{ieq:n_bound2}
   \sqrt{\frac{2T}{\kappa_2}}-3\leq n\leq \sqrt{\frac{2T}{\kappa_1}}.
\end{align}
Combining (\ref{ieq:n_bound2}) with (\ref{ieq:conv_n2}) gives
\begin{align*}
    \min_{t=0,...,T-1}\{\mathbb{E}[f(x_t)-f(x^*)]\}\leq&\frac{\beta(f(x_0)-f(x^*))[\log\left(\sqrt{\frac{2T}{\kappa_1}}+1\right)-\log 2]}{\kappa_1(1-\beta)\left[\sqrt{\frac{2T}{\kappa_2}}-3\right]}+\frac{(1-\beta)\norm{x_0-x^*}^2}{2\kappa_1\left[\sqrt{\frac{2T}{\kappa_2}}-3\right]}\notag\\
    &+\frac{(2s\beta+1)(G^2+\delta^2)\kappa_2\log \left(\sqrt{\frac{2T}{\kappa_1}}\right)}{2(1-\beta)\kappa_1\left[\sqrt{\frac{2T}{\kappa_2}}-3\right]}\\
    =&O\left(\frac{\log T}{\sqrt{T}}\right).
\end{align*}
\end{proof}



\subsection{Proof for Lemma \ref{lma:k_bound2}}
% \begin{proof}[Proof for Lemma \ref{lma:k_bound}]
% First, we are showing bounds in (\ref{ieq:ki_con2}) one by one.
% \begin{enumerate}
%     \item[i)] Since $\gamma\epsilon<1$, $\hat{\alpha}_i=\gamma\epsilon(i+3)^{-1}\leq(i+3)^{-1}$.
%     \item[ii)] We are going to prove that $k_i\hat{\alpha}_i\geq (i+3)^{-\frac{1}{3}}$.
%     \begin{align*}
%       k_i\hat{\alpha}_i-(i+3)^{-\frac{1}{3}}=&\frac{\hat{\alpha}_i}{\gamma}+\frac{1}{\gamma\epsilon}-(i+3)^{-\frac{1}{3}}=\frac{\epsilon\hat{\alpha}_i-\gamma\epsilon(i+3)^{-\frac{1}{3}}+1}{\gamma\epsilon}\geq\frac{1-\gamma\epsilon(i+3)^{-\frac{1}{3}}}{\gamma\epsilon}\geq\frac{1-\gamma\epsilon3^{-\frac{1}{3}}}{\gamma\epsilon}\\
%       \geq& 0 \quad(\gamma\epsilon<1<\sqrt[3]{3})
%     \end{align*}
%     \item[iii)] We are going to prove that $k_i\hat{\alpha}_i\hat{\alpha}_{i-1}\leq (i+2)^{-\frac{2}{3}}$.
%     \begin{align*}
%         k_i\hat{\alpha}_i\hat{\alpha}_{i-1}-(i+2)^{-\frac{2}{3}}=&\frac{\hat{\alpha}_i\hat{\alpha}_{i-1}}{\gamma}+\frac{\hat{\alpha}_{i-1}}{\gamma\epsilon}-(i+2)^{-\frac{2}{3}}=\frac{\epsilon\hat{\alpha}_i\hat{\alpha}_{i-1}+\hat{\alpha}_{i-1}-\gamma\epsilon(i+2)^{-\frac{2}{3}}}{\gamma\epsilon}\\
%         =&\frac{\gamma^2\epsilon^3(i+2)^{-1}(i+3)^{-1}+\gamma\epsilon(i+2)^{-1}-\gamma\epsilon(i+2)^{-\frac{2}{3}}}{\gamma\epsilon}\\
%         \leq&\gamma\epsilon^2(i+2)^{-2}+(i+2)^{-1}-(i+2)^{-\frac{2}{3}}\\
%         =&(i+2)^{-\frac{2}{3}}\left[(i+2)^{-\frac{1}{3}}\left(1+\frac{\gamma\epsilon^2}{i+2}\right)-1\right]\\
%         \leq& (i+2)^{-\frac{2}{3}}\left[2^{-\frac{1}{3}}\left(1+\frac{\gamma\epsilon^2}{2}\right)-1\right]\\
%         \leq& 0\quad (\gamma\epsilon^2\leq 2^{\frac{4}{3}}-1).
%     \end{align*}
    
% \end{enumerate}
% Secondly, compute $T=\sum_{i=0}^{n-1}k_i$ according to the definition of $k_i$.
%     \begin{align*}
%         T=&\sum_{i=0}^{n-1}k_i=\sum_{i=0}^{n-1}\frac{1}{\gamma}+\frac{1}{\gamma\epsilon\hat{\alpha}_i}=\frac{n}{\gamma}+\sum_{i=0}^{n-1}\frac{i+2}{\gamma^2\epsilon^2}=\frac{n}{\gamma}+\frac{3n}{\gamma^2\epsilon^2}+\frac{n(n-1)}{2\gamma^2\epsilon^2}=\frac{n^2+(2\gamma\epsilon^2+5)n}{2\gamma^2\epsilon^2}.
%     \end{align*}

% \end{proof}
% \begin{proof}[Proof for Lemma \ref{lma:k_bound}]
%  First, we show bounds from (\ref{ieq:ki_con2}) one by one:
% \begin{enumerate}
%     \item[i)]  $\hat{\alpha}_i=(i+2)^{-\frac{2}{3}}\leq(i+2)^{-\frac{2}{3}}$.
%     \item[ii)] $k_i\hat{\alpha}_i=\kappa_1\sqrt{\hat{\alpha}_i}=\kappa_1(i+2)^{-\frac{1}{3}}\geq\kappa_1(i+2)^{-\frac{1}{3}}$.
%     \item[iii)] $k_i\hat{\alpha}_i\hat{\alpha}_{i=1}\leq\kappa_2\sqrt{\hat{\alpha}_i}\hat{\alpha}_{i-1}=\kappa_2(i+2)^{-\frac{1}{3}}(i+1)^{-\frac{2}{3}}\leq\kappa_2(i+1)^{-1}$.
    
% \end{enumerate}
% Secondly, we compute $T=\sum_{i=0}^{n-1}k_i$ according to the definition of $k_i$. Because $n\gg1$, the sum of the sequence could be treated as an integral:
%     \begin{align*}
%         T=\sum_{i=0}^{n-1}k_i\leq\kappa_2\sum_{i=0}^{n-1}\sqrt{\frac{1}{\hat{\alpha}_i}}=\kappa_2\sum_{i=0}^{n-1}(i+2)^{\frac{1}{3}}=\kappa_2\int_{0}^{n-1}(i+2)^{\frac{1}{3}}=\frac{3\kappa_2}{5}[(n+1)^{\frac{5}{3}}-2^{\frac{5}{3}}],
%     \end{align*}
% and
%     \begin{align*}
%         T=\sum_{i=0}^{n-1}k_i\geq\kappa_1\sum_{i=0}^{n-1}\sqrt{\frac{1}{\hat{\alpha}_i}}=\kappa_1\sum_{i=0}^{n-1}(i+2)^{\frac{1}{3}}=\kappa_1\int_{0}^{n-1}(i+2)^{\frac{1}{3}}=\frac{3\kappa_1}{5}[(n+1)^{\frac{5}{3}}-2^{\frac{5}{3}}].
%     \end{align*}
% \end{proof}
\begin{proof}[Proof for Lemma \ref{lma:k_bound2}]
 First, we show bounds from (\ref{ieq:ki_con2_2}) one by one:
\begin{enumerate}
    \item[i)]  $\hat{\alpha}_i=(i+2)^{-1}\leq(i+2)^{-1}$.
    \item[ii)] $k_i\hat{\alpha}_i\geq\kappa_1$.
    \item[iii)] $k_i\hat{\alpha}_i\hat{\alpha}_{i-1}\leq\kappa_2\hat{\alpha}_{i-1}=\kappa_2(i+1)^{-1}\leq\kappa_2(i+1)^{-1}$.
    
\end{enumerate}
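Secondly, we compute $T=\sum_{i=0}^{n-1}k_i$ according to the definition of $k_i$. Since $\frac{\kappa_1}{\hat{\alpha}_i}\leq k_i\leq\frac{\kappa_2}{\hat{\alpha}_i}$ and $\sum_{i=0}^{n-1}(i+2)$ is an arithmetic series, both bounds can be evaluated in closed form: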
    \begin{align*}
        T=\sum_{i=0}^{n-1}k_i\leq\kappa_2\sum_{i=0}^{n-1}\frac{1}{\hat{\alpha}_i}=\kappa_2\sum_{i=0}^{n-1}(i+2)=\frac{\kappa_2n(n+3)}{2},
    \end{align*}
and
    \begin{align*}
        T=\sum_{i=0}^{n-1}k_i\geq\kappa_1\sum_{i=0}^{n-1}\frac{1}{\hat{\alpha}_i}=\kappa_1\sum_{i=0}^{n-1}(i+2)=\frac{\kappa_1n(n+3)}{2}.
    \end{align*}
\end{proof}

% {\color{red}
% \section{Model with faster convergence rate}
% This section shows that angular model (\ref{eq:AVModel2}) guarantees a faster convergence rate of $O(\log T/\sqrt{T})$ than $O(T^{-\frac{1}{5}})$ of previous angular model (\ref{eq:AVModel}).

% \subsection{Unified convergence analysis for SGD and momentum SGD with discrete learning rate drop}

% \begin{thm}\label{thm:sgdm_conv2}
% Suppose $f(x)$ is a convex function, $\mathbb{E}\left[\norm{ \mathcal{G}(x;\xi)-\mathbb{E}[\mathcal{G}(x;\xi)]}\right]\leq\delta^2$ and $\norm{\partial f(x)}\leq G$ for any $x$ and some non-negative $G$. Given a sequence of decreasing learning rates $\{\hat{\alpha}_i\}_{i=-1}^{n-1}\subset (0,1)$ and a sequence of integers $\{k_i\}_{i=0}^{n-1}\subset \mathbb{N}$ ($n\gg 1$), there exits constants $\kappa_1,\kappa_2$ such that
% \vspace{-0.07in}
% {\color{red}
% \begin{align}\label{ieq:ki_con2}
%     \hat{\alpha}_i\leq (i+2)^{-1}, \quad k_i\hat{\alpha}_i\geq \kappa_1,\quad k_i\hat{\alpha}_i\hat{\alpha}_{i-1}\leq \kappa_2(i+1)^{-1},\quad \forall i=0,1,...,n-1.
% \end{align}}
% \vspace{-0.23in}

% Define a partition $\Pi:0=t_0<t_1<...<t_n=T (T=\sum_{i=0}^{n-1}k_i)$ based on the integer sequence $\{k_i\}_{i=0}^{n-1}$ such that the gap between $t_i$ and $t_{i+1}$ is $k_i$ ($k_i = t_{i+1}-t_i$). Run UM update defined in Equation~\ref{eq:sgdm} for $T$ iterations by setting the learning rate $\alpha_t$ based on a sequence $\{\hat{\alpha}_i\}_{i=-1}^{n-1}$ as
% \vspace{-0.1in}
% \begin{align}
%     \alpha_t=\hat{\alpha}_i,\quad\text{where }t_i\leq t< t_{i+1}.
% \end{align}
% \vspace{-0.23in}

% Then the following holds:
% \vspace{-0.1in}
% \begin{align*}
%     \min_{t=0,...,T-1}\{\mathbb{E}[f(x_t)-f(x^*)]\}\leq&\frac{\beta(f(x_0)-f(x^*))[\log(n+1)-\log 2]}{\kappa_1(1-\beta)n}+\frac{(1-\beta)\norm{x_0-x^*}^2}{2\kappa_1n}\notag\\
%     &+\frac{(2s\beta+1)(G^2+\delta^2)\kappa_2\log n}{2(1-\beta)\kappa_1n}.
% \end{align*}
% \vspace{-0.23in}
% \end{thm}

% \begin{proof}
%  We denote $\mathcal{G}(x_t;\xi_t)=\mathcal{G}(x_t)=\mathcal{G}_t$. The update formula (\ref{eq:sgdm}) implies the following recursions:
% \begin{align}
%     x_{t+1}+p_{t+1}=&x_t+p_t-\frac{\alpha_t}{1-\beta}\mathcal{G}(x_t)\\
%     v_{t+1}=&\beta v_t+((1-\beta)s-1)\alpha_t\mathcal{G}(x_t),
% \end{align}
% where $v_t=\frac{1-\beta}{\beta}p_t$ and $p_t$ is given by
% \begin{equation}
% p_t=\left\{
% \begin{aligned}\label{eq:sgdm3}
%       &\frac{\beta}{1-\beta}(x_t-x_{t-1}+s\alpha_{t-1}\mathcal{G}(x_{t-1})), \quad k\geq1\\
%       &0, \quad k=0
% \end{aligned}
% \right.  .  
% \end{equation}
% Define $\delta_t=\mathcal{G}_t-\partial f(x_t)$ and let $x^*$ be the optimal point. From the above recursions we have
% \begin{align}
%     &\norm{x_{t+1}+p_{t+1}-x^*}^2\notag\\
%     =&\norm{x_t+p_t-x^*}^2\!-\!\frac{2\alpha_t}{1-\beta}(x_t+p_t-x^*)^T\mathcal{G}_t\!+\!\left(\frac{\alpha_t}{1-\beta}\right)^2\norm{\mathcal{G}_t}^2\notag\\
%     =&\norm{x_t+p_t-x^*}^2\!-\!\frac{2\alpha_t}{1-\beta}(x_t-x^*)^T\mathcal{G}_t\!-\!\frac{2\alpha_t\beta}{(1-\beta)^2}(x_{t}-x_{t-1})^T\mathcal{G}_t\notag\\
%     &-\frac{2s\alpha_t\alpha_{t-1}\beta}{(1-\beta)^2}\mathcal{G}_{t-1}^T\mathcal{G}_t\!+\!\left(\frac{\alpha_t}{1-\beta}\right)^2\norm{\mathcal{G}_t}^2\notag\\
%     =&\norm{x_t+p_t-x^*}^2-\frac{2\alpha_t}{1-\beta}(x_t-x^*)^T(\delta_t+\partial f(x_t))-\frac{2\alpha_t\beta}{(1-\beta)^2}(x_{t}-x_{t-1})^T(\delta_t+\partial f(x_t))\notag\\
%     &-\frac{2s\alpha_t\alpha_{t-1}\beta}{(1-\beta)^2}(\delta_{t-1}+\partial f(x_{t-1}))^T(\delta_t+\partial f(x_t))+\left(\frac{\alpha_t}{1-\beta}\right)^2\norm{\delta_t+\partial f(x_t)}^2.
% \end{align}
% Note that
% \begin{align*}
%     &\mathbb{E}[(x_t-x^*)^T(\delta_t+\partial f(x_t))]=\mathbb{E}[(x_t-x^*)^T\partial f(x_t)]\\
%     &\mathbb{E}[(x_{t}-x_{t-1})^T(\delta_t+\partial f(x_t))]=\mathbb{E}[(x_{t}-x_{t-1})^T\partial f(x_t)]\\
%     &\mathbb{E}[(\delta_{t-1}+\partial f(x_{t-1}))^T(\delta_t+\partial f(x_t))]=\mathbb{E}[(\delta_{t-1}+\partial f(x_{t-1}))^T\partial f(x_t)]=\mathbb{E}[\mathcal{G}_{t-1}^T\partial f(x_t)]\\
%     &\mathbb{E}[\norm{\delta_t+\partial f(x_t)}^2]=\mathbb{E}[\norm{\delta_t}^2]+\mathbb{E}[\norm{\partial f(x_t)}^2].
% \end{align*}
% Taking the expectation on both sides gives the following
% \begin{align}\label{eq:norm_exp2}
%     &\mathbb{E}[\norm{x_{t+1}+p_{t+1}-x^*}^2]\notag\\
%     =&\mathbb{E}[\norm{x_t+p_t-x^*}^2]-\frac{2\alpha_t}{1-\beta}\mathbb{E}[(x_t-x^*)^T\partial f(x_t)]-\frac{2\alpha_t\beta}{(1-\beta)^2}\mathbb{E}[(x_{t}-x_{t-1})^T\partial f(x_t)]\notag\\
%     &-\frac{2s\alpha_t\alpha_{t-1}\beta}{(1-\beta)^2}\mathbb{E}[\mathcal{G}_{t-1}^T\partial f(x_t)]+\left(\frac{\alpha_t}{1-\beta}\right)^2(\mathbb{E}[\norm{\delta_t}^2]+\mathbb{E}[\norm{\partial f(x_t)}^2]).
% \end{align}
% Moreover, since f is convex,$\mathbb{E}\left[\norm{ \mathcal{G}(x;\xi)-\mathbb{E}[\mathcal{G}(x;\xi)]}\right]\leq\delta^2$, and $\norm{\nabla f(x)}\leq G$, then for any $x$
% \begin{align*}
%     &f(x_t)-f(x^*)\leq(x_t-x^*)^T\partial f(x_t)\\
%     &f(x_t)-f(x_{t-1})\leq(x_t-x_{t-1})^T\partial f(x_t)\\
%     &-\mathbb{E}[\mathcal{G}_{t-1}^T\partial f(x_t)]\leq\frac{\mathbb{E}[\norm{\mathcal{G}_{t-1}}^2+\norm{\partial f(x_t)}^2]}{2}\leq\delta^2/2+G^2\leq\delta^2+G^2\\
%     &\mathbb{E}[\norm{\delta_t}^2]\leq\delta^2,\quad\mathbb{E}[\norm{\partial f(x_t)}^2]\leq G^2.
% \end{align*}
% Therefore, (\ref{eq:norm_exp2}) can be rewritten as
% \begin{align}\label{eq:exp_update2}
%     \mathbb{E}[\norm{x_{t+1}+p_{t+1}-x^*}^2]\leq&\mathbb{E}[\norm{x_{t}+p_{t}-x^*}^2]-\frac{2\alpha_t}{1-\beta}\mathbb{E}[f(x_t)-f(x^*)]\\\notag
%     &-\frac{2\alpha_t\beta}{(1-\beta)^2}\mathbb{E}[f(x_t)-f(x_{t-1})]+\frac{2s\beta\alpha_t\alpha_{t-1}+\alpha_t^2}{(1-\beta)^2}(G^2+\delta^2).
% \end{align}
% Since $\hat{\alpha}_i$ is decreasing, it implies that $\alpha_t$ is non-increasing. Thus, (\ref{eq:exp_update2}) could be upper-bounded as
% \begin{align}\label{eq:exp_update2}
%     \mathbb{E}[\norm{x_{t+1}+p_{t+1}-x^*}^2]\leq&\mathbb{E}[\norm{x_{t}+p_{t}-x^*}^2]-\frac{2\alpha_t}{1-\beta}\mathbb{E}[f(x_t)-f(x^*)]\\\notag
%     &-\frac{2\alpha_t\beta}{(1-\beta)^2}\mathbb{E}[f(x_t)-f(x_{t-1})]+\frac{(2s\beta+1)\alpha_t\alpha_{t-1}}{(1-\beta)^2}(G^2+\delta^2).
% \end{align}
% Taking $t=0,...,T-1$ and $x_{-1}=x_0$, and then summing all the inequalities gives
% \begin{align*}
%     \sum_{t=0}^{T-1}\mathbb{E}[\norm{x_{t+1}\!+\!p_{t+1}\!-\!x^*}^2]\leq&\sum_{t=0}^{T-1}\mathbb{E}[\norm{x_{t}+p_{t}-x^*}^2]-\sum_{t=0}^{T-1}\frac{2\alpha_t}{1-\beta}\mathbb{E}[f(x_t)-f(x^*)]\notag\\
%     &-\!\sum_{t=0}^{T-1}\frac{2\alpha_t\beta}{(1\!-\!\beta)^2}\mathbb{E}[f(x_t)\!-\!f(x_{t-1})]
%     \!+\!\frac{(2s\beta\!+\!1)(G^2\!+\!\delta^2)}{(1\!-\!\beta)^2}\sum_{t=0}^{T-1}\alpha_t\alpha_{t-1}.
% \end{align*}
% Therefore,
% \begin{align*}
%     \frac{2}{1\!-\!\beta}\sum_{t=0}^{T-1}\alpha_t\mathbb{E}[f(x_t)\!-\!f(x^*)]\leq&\norm{x_0\!-\!x^*}^2\!-\!\norm{x^{T}\!+\!p_{T}\!-\!x^*}\!+\!\frac{2\beta}{(1\!-\!\beta)^2}\sum_{t=0}^{T-1}\alpha_t\mathbb{E}[f(x_{t-1})\!-\!f(x_t)]\\
%     &+\frac{(2s\beta+1)(G^2+\delta^2)}{(1-\beta)^2}\sum_{t=0}^{T-1}\alpha_t\alpha_{t-1},
% \end{align*}
% since $\alpha_{T-1}\leq...\leq\alpha_1\leq\alpha_0<1$, $\min_{t=0,...,T-1}\{\mathbb{E}[f(x_t)-f(x^*)]\}\leq\mathbb{E}[f(x_t)-f(x^*)](\forall t=0,...,T-1)$. Then
% \begin{align*}
%     \frac{2}{1-\beta}\min_{t=0,...,T-1}\{\mathbb{E}[f(x_t)-f(x^*)]\}\sum_{t=0}^{T-1}\alpha_t\leq&\norm{x_0-x^*}^2+\frac{2\beta}{(1-\beta)^2}\sum_{t=0}^{T-1}\alpha_t\mathbb{E}[f(x_{t-1})-f(x_t)]\notag\\
%     &+\frac{(2s\beta+1)(G^2+\delta^2)\sum_{t=0}^{T-1}\alpha_t\alpha_{t-1}}{(1-\beta)^2}.
% \end{align*}
% Moreover, $\alpha_t=\hat{\alpha}_i (t_i\leq t< t_{i+1})$ implies that
% \begin{align*}
%     \frac{2}{1\!-\!\beta}\min_{t=0,...,T-1}\{\mathbb{E}[f(x_t)\!-\!f(x^*)]\}\sum_{t=0}^{T-1}\alpha_t\leq&\norm{x_0\!-\!x^*}^2\!+\!\frac{2\beta}{(1\!-\!\beta)^2}\sum_{i=0}^{n-1}\hat{\alpha}_i\mathbb{E}[f(x_{t_i})\!-\!f(x_{t_{i+1}})]\notag\\
%     &+\frac{(2s\beta+1)(G^2+\delta^2)\sum_{t=0}^{T-1}\alpha_t\alpha_{t-1}}{(1-\beta)^2}.
% \end{align*}
% Since $\mathbb{E}[f(x_{t_i})-f(x_{t_{i+1}})]$ is always upper-bounded by $f(x_0)-f(x^*)$, we have
% \begin{align*}
%     \frac{2}{1-\beta}\min_{t=0,...,T-1}\{\mathbb{E}[f(x_t)-f(x^*)]\}\sum_{t=0}^{T-1}\alpha_t\leq&\norm{x_0-x^*}^2+\frac{2\beta}{(1-\beta)^2}[f(x_0)-f(x^*)]\sum_{i=0}^{n-1}\hat{\alpha}_i\\
%     &+\frac{(2s\beta+1)(G^2+\delta^2)\sum_{t=0}^{T-1}\alpha_t\alpha_{t-1}}{(1-\beta)^2}.
% \end{align*}
% After simplification, we have
% \begin{align}\label{ieq:min2}
%     \min_{t=0,...,T-1}\{\mathbb{E}[f(x_t)-f(x^*)]\}\leq&\frac{(1-\beta)\norm{x_0-x^*}^2}{2\sum_{t=0}^{T-1}\alpha_t}+\frac{\beta[f(x_0)-f(x^*)]\sum_{i=0}^{n-1}\hat{\alpha}_i}{(1-\beta)\sum_{t=0}^{T-1}\alpha_t}\notag\\
%     &+\frac{(2s\beta+1)(G^2+\delta^2)\sum_{t=0}^{T-1}\alpha_t\alpha_{t-1}}{2(1-\beta)\sum_{t=0}^{T-1}\alpha_t}.
% \end{align}
% Because $\hat{\alpha}_i\leq (i+2)^{-1}$, $k_i\hat{\alpha}_i\geq \kappa_1(i+2)^{-\frac{1}{3}},$ $k_i\hat{\alpha}_i\hat{\alpha}_{i-1}\leq \kappa_2(i+1)^{-\frac{2}{3}},\forall i=0,1,...,n-1(n\gg1)$,
% \begin{align}
%     \sum_{i=0}^{n-1}\hat{\alpha}_i&\leq\sum_{i=0}^{n-1}(i+2)^{-1}=\int_{0}^{n-1}(i+2)^{-1}=\log(n+1)-\log(2)\label{eq:sumi_2}\\
%     \sum_{t=0}^{T-1}\alpha_t&=\sum_{i=0}^{n-1}k_i\hat{\alpha}_i\geq\sum_{i=0}^{n-1}\kappa_1=\kappa_1n\label{eq:sumt}\\
%     \sum_{t=0}^{T-1}\alpha_t\alpha_{t-1}&\leq\sum_{i=0}^{n-1}k_i\hat{\alpha}_i\hat{\alpha}_{i-1}\leq\kappa_2\sum_{i=0}^{n-1}(i+1)^{-1}=\kappa_2\int_{0}^{n-1}(i+1)^{-1}=\kappa_2\log n\label{eq:sumt2_2}.
% \end{align}
% Substituting (\ref{eq:sumi_2}-\ref{eq:sumt2_2}) into inequality (\ref{ieq:min2}) gives 
% \begin{align*}
%     \min_{t=0,...,T-1}\{\mathbb{E}[f(x_t)-f(x^*)]\}\leq&\frac{\beta(f(x_0)-f(x^*))[\log(n+1)-\log 2]}{\kappa_1(1-\beta)n}+\frac{(1-\beta)\norm{x_0-x^*}^2}{2\kappa_1n}\notag\\
%     &+\frac{(2s\beta+1)(G^2+\delta^2)\kappa_2\log n}{2(1-\beta)\kappa_1n}.
% \end{align*}
% \end{proof}

% \subsection{Convergence Analysis of AutoDrop}\label{subsec:thm_auto_drop2}
% For a fixed learning rate $\alpha$, we introduce a simplified mathematical model of the behavior of the angular velocity as a function of iterations. The model is defined below (and depicted in Figure~\ref{fig:flat2}): 
% \vspace{-0.1in}
% {\color{red}
% \begin{equation}
%     v_{\alpha}(t)=\frac{\pi}{2}(1+\epsilon\alpha)\left(1-\frac{1}{\gamma\alpha (t+1/\gamma \alpha)}\right),
%     \label{eq:AVModel2}
% \end{equation}}
% \vspace{-0.23in}

% where $t$ is the number of iterations, $\epsilon$ and $\gamma$ are two constants that control the asymptote and curvature of the velocity. 

% \makeatletter\def\@captype{figure}\makeatother
% % \vspace{-0.3in}
% \vspace{-0.25in}
% \begin{minipage}{.38\textwidth}
% \centering
% \begin{figure}[H]
%     \centering
%     \includegraphics[width=\textwidth]{Figures/theory/angle_landscape_final.png}
%     \vspace{-0.15in}
%     \caption{Angular velocity model for a fixed learning rate $\alpha$.}
%     \label{fig:flat2}
% \end{figure}
% \vfill
% \end{minipage}
% \makeatletter\def\@captype{table}\makeatother
% \begin{minipage}{.61\textwidth}
% \centering
% \begin{algorithm}[H]
%     \centering
%     \caption{AutoDrop2 (approximate)}\label{alg:LRdrop2}
%     \begin{algorithmic}
%     \STATE \textbf{Inputs:} $x_0$: initial weight \\ 
%     \STATE \textbf{Hyperparameters:} $\{\hat{\alpha}_i\}$: set of learning rates, $v_{\alpha}(t)$: ang. vel. model, $\tau_0$: initial threshold for the derivative of ang. vel. \\ 
%     \STATE Initialize $i=0$, $t_0=0$, $t=0$\\
%     \WHILE{$i<n$}
%         \STATE Update $x_t$ via (\ref{eq:sgdm}) with learning rate $\alpha_t\!=\!\hat{\alpha}_i$.
%         \IF{{\color{red}$v'_{\hat{\alpha}_i}(t-t_i)\leq\min\{\tau_0, \gamma \hat{\alpha}_i/2\}$}}
%           \STATE $i=i+1; t_i=t$
%         \ENDIF
%         \STATE $t=t+1, T=t$
%     \ENDWHILE
%     \RETURN $\{x_t\}_{t=0}^{T-1}$ (T: $\#$ iterations)
%     \end{algorithmic}
% \end{algorithm}
% \end{minipage}

% \begin{wrapfigure}{r}{0.61\textwidth} 
% \vspace{-0.25in}
% \begin{center}
% \includegraphics[trim={0.3cm 0 0.3cm 2cm},clip,width=.3\textwidth]{Figures/theory/angle_landscape3_final.png}
% \includegraphics[trim={0.45cm 0 0.3cm 2cm},clip,width=.3\textwidth]{Figures/theory/Learning_rate_trend2_final.png}
% \end{center}
% \vspace{-0.25in}
% \caption{The behavior of the angular velocity (\textbf{left}) and the learning rate (\textbf{right}) for Algorithm~\ref{alg:LRdrop2}. The derivative threshold $\tau_i\!=\!\min\{\tau_0, \frac{\gamma\hat{\alpha}_i}{2}\}$.}
% \label{fig:AutoLR2}
% \vspace{-0.1in}
% \end{wrapfigure}
% $v_\alpha(t)$ saturates in $\frac{\pi}{2}[1+\epsilon\alpha]$ when $t$ goes to infinity. Note that the given model complies with the property P2 empirically observed and described in Section~\ref{sec:ME}: i) if the learning rate is large enough, the angular velocity saturates at a level larger than $\pi/2$ and smaller than $2\pi/3$; ii) as the learning rate decreases, the angular velocity saturates at progressively lower levels; iii) smaller learning rate leads to a slower saturation of angular velocity; iv) when the learning rate is low enough the angular velocity saturates at $\pi/2$. Lets assume an upper-bound $\alpha_{max}$ for the learning rate. Since the limit of the angular velocity should be between $\pi/2$ and $2\pi/3$, the range of factor $\epsilon$ is set to be $(0,\frac{1}{3\alpha_{max}})$.

% In this section, similar to the intuition in section \ref{sec:Theory}, we propose a new approximation (Algorithm \ref{alg:LRdrop2}) for Algorithm \ref{alg:ALRD}. The behavior of the angular velocity and the learning rate for Algorithm~\ref{alg:LRdrop2} is depicted in Figure~\ref{fig:AutoLR2}.

% \begin{thm}\label{thm:conv2}
% Suppose $f(x)$ is a convex function, $\mathbb{E}\left[\norm{ \mathcal{G}(x;\xi)-\mathbb{E}[\mathcal{G}(x;\xi)]}\right]\leq\delta^2$ and $\norm{\partial f(x)}\leq G$ for any $x$ and some non-negative $G$. Given the sequence of the learning rates $\{\hat{\alpha}_i\}_{i=-1}^{n-1}$ such that $\hat{\alpha}_i=(i+1)^{-\frac{2}{3}}$, parameters $\epsilon\in(0,\frac{1}{3\hat{\alpha}_0})$ and $\gamma$ defining the angular velocity model $v_{\alpha}(t)$ (Equation~\ref{eq:AVModel}), and the initial threshold $\tau_0$ ($\tau_0<2$) for the derivative of the angular velocity, the sequence of weights $\{x_t\}_{t=0}^{T-1}$ generated by Algorithm~\ref{alg:LRdrop} satisfies
% \vspace{-0.05in}
% \begin{align}
%     \min_{t=0,...,T-1}\{\mathbb{E}[f(x_t)-f(x^*)]\}\leq&\frac{\beta(f(x_0)-f(x^*))[\log\left(\sqrt{\frac{2T}{\kappa_1}}+1\right)-\log 2]}{\kappa_1(1-\beta)\left[\sqrt{\frac{2T}{\kappa_2}}-3\right]}+\frac{(1-\beta)\norm{x_0-x^*}^2}{2\kappa_1\left[\sqrt{\frac{2T}{\kappa_2}}-3\right]}\notag\\
%     &+\frac{(2s\beta+1)(G^2+\delta^2)\kappa_2\log \left(\sqrt{\frac{2T}{\kappa_1}}\right)}{2(1-\beta)\kappa_1\left[\sqrt{\frac{2T}{\kappa_2}}-3\right]}\\
%     =&O\left(\frac{\log T}{\sqrt{T}}\right),
% \end{align}
% \vspace{-0.2in}

% where $\kappa_1=\frac{\sqrt{\pi}-1}{\gamma}$ and $\kappa_2=\frac{1}{\gamma}\sqrt{\frac{2\pi}{3\tau_0}}$.
% \end{thm}

% First, we introduce Lemma~\ref{lma:k_bound2} which will be used in the proof for Theorem \ref{thm:conv2}. We prove this lemma later in this section.

% \begin{lma}\label{lma:k_bound2}
% If sequences $\{\hat{\alpha}_i\}_{i=-1}^{n-1}\subset(0,1)$ and $\{k_i\}_{i=0}^n\subset\mathbb{N}$ satisfy:
% \begin{align*}
%     \hat{\alpha}_i=(i+2)^{-1},\quad \frac{\kappa_1}{\hat{\alpha}_i}\leq k_i\leq\frac{\kappa_2}{\hat{\alpha}_i},
% \end{align*}
% where $\kappa_1$, $\kappa_2$ are constants, then 
% \begin{align}\label{ieq:ki_con2_2}
%     \hat{\alpha}_i\leq (i+2)^{-1}, \quad k_i\hat{\alpha}_i\geq \kappa_1,\quad k_i\hat{\alpha}_i\hat{\alpha}_{i-1}\leq \kappa_2(i+1)^{-1},\quad \forall i=0,1,...,n-1.
% \end{align}
% Moreover, suppose $T=\sum_{i=0}^{n-1}k_i$. If $n\gg 1$ the following holds
% \begin{align}\label{eq:T2}
%     \frac{\kappa_1n(n+3)}{2}\leq T\leq\frac{\kappa_2n(n+3)}{2}.
% \end{align}
% \end{lma}

% \begin{proof}[Proof for Lemma \ref{lma:k_bound2}]
%  First, we show bounds from (\ref{ieq:ki_con2_2}) one by one:
% \begin{enumerate}
%     \item[i)]  $\hat{\alpha}_i=(i+2)^{-1}\leq(i+2)^{-1}$.
%     \item[ii)] $k_i\hat{\alpha}_i\geq\kappa_1$.
%     \item[iii)] $k_i\hat{\alpha}_i\hat{\alpha}_{i=1}\leq\kappa_2\hat{\alpha}_{i-1}=\kappa_2(i+1)^{-1}\leq\kappa_2(i+1)^{-1}$.
    
% \end{enumerate}
% Secondly, we compute $T=\sum_{i=0}^{n-1}k_i$ according to the definition of $k_i$. Because $n\gg1$, the sum of the sequence could be treated as an integral:
%     \begin{align*}
%         T=\sum_{i=0}^{n-1}k_i\leq\kappa_2\sum_{i=0}^{n-1}\frac{1}{\hat{\alpha}_i}=\kappa_2\sum_{i=0}^{n-1}(i+2)=\frac{\kappa_2n(n+3)}{2},
%     \end{align*}
% and
%     \begin{align*}
%         T=\sum_{i=0}^{n-1}k_i\geq\kappa_1\sum_{i=0}^{n-1}\frac{1}{\hat{\alpha}_i}=\kappa_1\sum_{i=0}^{n-1}(i+2)=\frac{\kappa_1n(n+3)}{2}.
%     \end{align*}
% \end{proof}

% \begin{proof}[Proof for Theorem \ref{thm:conv2}]
% The derivative of the angular velocity model is:
% $$v_{\alpha}'(t)=\frac{\pi(1+\epsilon\alpha)}{2\gamma\alpha(t+1/\gamma\alpha)^2}.$$
% Define the gaps of partition $\Pi:0=t_0<t_1<...<t_n=T$ derived from the Algorithm \ref{alg:LRdrop} as
% $$k_i=t_{i+1}-t_i,\quad\forall i=0,...,n-1.$$
% Since we drop the learning rate every time the derivative of the angular velocity is smaller that the threshold $\tau_i=\min\{\tau_0,\gamma\hat{\alpha}_i/2\}$, we have
% \begin{align*}
%     v_{\hat{\alpha}_i}'(k_i)=\tau_i\Longrightarrow k_i=(\gamma\hat{\alpha}_i)^{-\frac{1}{2}}\left[\sqrt{\frac{\pi(1+\epsilon\hat{\alpha}_i)}{2\tau_i}}-(\gamma\hat{\alpha}_i)^{-\frac{1}{2}}\right].
% \end{align*}

% \begin{itemize}
%     \item[i)] From $\tau_i=\min\{\tau_0,\gamma\hat{\alpha}_i/2\}$, we have $\tau_i\leq\gamma\hat{\alpha}_i/2$. Therefore,
%     \begin{align}
%         k_i\geq&(\gamma\hat{\alpha}_i)^{-\frac{1}{2}}\left[\sqrt{\pi(1+\epsilon\hat{\alpha}_i)}\times(\gamma\hat{\alpha}_i)^{-\frac{1}{2}}-(\gamma\hat{\alpha}_i)^{-\frac{1}{2}}\right]\notag\\
%         =&\frac{1}{\gamma\hat{\alpha}_i}\left[\sqrt{\pi(1+\epsilon\hat{\alpha}_i)}-1\right]\notag\\
%         \geq&\frac{\sqrt{\pi}-1}{\gamma \hat{\alpha}_i}
%     \end{align}
%     \item[ii)] From $\tau_i=\min\{\tau_0,\gamma\hat{\alpha}_i/2\}$, we have
%     \begin{align}
%         k_i\leq&(\gamma\hat{\alpha}_i)^{-\frac{1}{2}}\times \sqrt{\frac{\pi(1+\epsilon\hat{\alpha}_i)}{2\tau_i}}\notag\\
%         =&(\gamma\hat{\alpha}_i)^{-\frac{1}{2}}\max\left\{\sqrt{\frac{\pi(1+\epsilon\hat{\alpha}_i)}{2\tau_0}},\sqrt{\pi(1+\epsilon\hat{\alpha}_i)}(\gamma\hat{\alpha}_i)^{-\frac{1}{2}}\right\}\notag\\
%         =&\max\left\{(\gamma\hat{\alpha}_i)^{-\frac{1}{2}}\sqrt{\frac{\pi(1+\epsilon\hat{\alpha}_i)}{2\tau_0}},\frac{1}{\gamma\hat{\alpha}_i}\sqrt{\pi(1+\epsilon\hat{\alpha}_i)}\right\}\notag\\
%         \leq&\frac{1}{\gamma\hat{\alpha}_i}\max\left\{\sqrt{\frac{\pi(1+\epsilon\hat{\alpha}_i)}{2\tau_0}},\sqrt{\pi(1+\epsilon\hat{\alpha}_i)}\right\}\notag.
%     \end{align}
%     Since $\epsilon\in(0,\frac{1}{3\hat{\alpha}_0})$ and $\tau_0<2$, we could conclude
%     \begin{align*}
%         k_i\leq\frac{1}{\gamma\hat{\alpha}_i}\max\left\{\sqrt{\frac{2\pi}{3\tau_0}},\sqrt{\frac{4\pi}{3}}\right\}\leq\frac{1}{\gamma\hat{\alpha}_i}\sqrt{\frac{2\pi}{3\tau_0}}.
%     \end{align*}
% \end{itemize}
% Combine i) and ii), we have
% \begin{align}
%     \frac{\sqrt{\pi}-1}{\gamma}\times\frac{1}{\hat{\alpha}_i}\leq k_i\leq\frac{1}{\gamma}\sqrt{\frac{2\pi}{3\tau_0}}\times\frac{1}{\hat{\alpha}_i}.
% \end{align}
% Define $\kappa_1=\frac{\sqrt{\pi}-1}{\gamma}$ and $\kappa_2=\frac{1}{\gamma}\sqrt{\frac{2\pi}{3\tau_0}}$. By Lemma \ref{lma:k_bound2}, we have
% \begin{align}\label{ieq:ki_co3_2}
%     \hat{\alpha}_i\leq (i+2)^{-1}, \quad k_i\hat{\alpha}_i\geq \kappa_1,\quad k_i\hat{\alpha}_i\hat{\alpha}_{i-1}\leq \kappa_2(i+1)^{-1},\quad \forall i=0,1,...,n-1.
% \end{align}
% Then, by combining (\ref{ieq:ki_co3_2}) with Theorem \ref{thm:sgdm_conv2} we could conclude
% that the sequence $\{x_t\}_{t=0}^{T-1}$ generated by the Algorithm \ref{alg:LRdrop2} satisfies
% \begin{align}\label{ieq:conv_n2}
%     \min_{t=0,...,T-1}\{\mathbb{E}[f(x_t)-f(x^*)]\}\leq&\frac{\beta(f(x_0)-f(x^*))[\log(n+1)-\log 2]}{\kappa_1(1-\beta)n}+\frac{(1-\beta)\norm{x_0-x^*}^2}{2\kappa_1n}\notag\\
%     &+\frac{(2s\beta+1)(G^2+\delta^2)\kappa_2\log n}{2(1-\beta)\kappa_1n}.
% \end{align}
% By Equation (\ref{eq:T2}) in Lemma \ref{lma:k_bound2} we have that
% \begin{align*}
%     \frac{\kappa_1n(n+3)}{2}\leq T\leq\frac{\kappa_2n(n+3)}{2}.
% \end{align*}
% Therefore
% \begin{align}\label{ieq:n_bound2}
%   \sqrt{\frac{2T}{\kappa_2}}-3\leq n\leq \sqrt{\frac{2T}{\kappa_1}}.
% \end{align}
% Combining (\ref{ieq:n_bound2}) with (\ref{ieq:conv_n2}) gives
% \begin{align*}
%     \min_{t=0,...,T-1}\{\mathbb{E}[f(x_t)-f(x^*)]\}\leq&\frac{\beta(f(x_0)-f(x^*))[\log\left(\sqrt{\frac{2T}{\kappa_1}}+1\right)-\log 2]}{\kappa_1(1-\beta)\left[\sqrt{\frac{2T}{\kappa_2}}-3\right]}+\frac{(1-\beta)\norm{x_0-x^*}^2}{2\kappa_1\left[\sqrt{\frac{2T}{\kappa_2}}-3\right]}\notag\\
%     &+\frac{(2s\beta+1)(G^2+\delta^2)\kappa_2\log \left(\sqrt{\frac{2T}{\kappa_1}}\right)}{2(1-\beta)\kappa_1\left[\sqrt{\frac{2T}{\kappa_2}}-3\right]}\\
%     =&O\left(\frac{\log T}{\sqrt{T}}\right)
% \end{align*}
% \end{proof}


% }

\section{Experimental Details}
\label{sec:ED}

% \subsection{Data sets and models}
% \textbf{The CIFAR-$\mathbf{10}$ and CIFAR-$\mathbf{100}$ data sets} \citep{cifar} consist of $50$ K training images, with $10$ and $100$ different classes respectively. For CIFAR-$10$ experiments we used a ResNet-$18$ \citep{He2016DeepRL} and a WRN-$28$x$10$ \citep{Zagoruyko2016WRN} models. For CIFAR-$100$ experiments we used a ResNet-$34$ \citep{He2016DeepRL} and a WRN-$40$x$10$ \citep{Zagoruyko2016WRN} models. We do not use the dropout \citep{srivastava2014dropout} layers for WRN models in our experiments. The implementation involving WRN architecture and CIFAR data set relies on publicly available codes\footnote{https://github.com/meliketoy/wide-resnet.pytorch}.

% % We normalize each image by mean $(0.4914,0.4822,0.4465)$ and standard deviation $(0.2471,0.2435,0.2616)$. We also augment the training data by horizontal flips with a probability of $0.5$. We pad the images to $3 \times 40 \times 40$, extract random crops of size $3 \times 32 \times 32$ and present these to the network in batches of size $128$. The test loss and test error are computed from the test images.

% \subsection{Training setup}
% % For CIFAR-10 and CIFAR-100 experiments we refer to \citep{zhang2019lookahead} and \citep{Zagoruyko2016WRN} for ResNet and WRN models respectively. 
% {\color{blue}
% \subsection{Computational Cost}
% We want to emphasize that our goal is to design the automatic learning rate scheduler that could reach the SOTA performance. We did not intend to outperform the SOTA, but rather show that it is possible to design an automatic learning rate scheduler that indeed can match manual schemes that the SOTA relies on. Our method performance-wise matches or outperforms SOTA approach and wins with all other learning rate schedulers, manual and automatic. So for example existing automatic learning rate schedulers, HD and TLR, lose with SOTA since they suffer from the short-horizon problem, which we by design do not have. Regarding computational costs, note that our method does not introduce any additional significant extra computations compared to the existing optimization methods. To support that, see the table below. In this table we report the computational time for a single iteration of HD, TLR, SOTA Baseline, and AutoDrop run on the same machine (NVIDIA GeForce GTX 1080 Ti) for different models on different datasets. We use the same batch size of 64 for all methods to have a fair comparison. As you can see the training time for per-iteration is practically the same for all methods.
% \begin{table}[H]
% \centering
% \begin{tabular}{|p{4cm}||p{1.5cm}|p{1.5cm}|p{2.5cm}|p{2cm}|}
% \hline
% Training time/per-iter&HD&TLR&SOTA Baseline&AutoDrop\\
% \hline
% WRN28x10/CIFAR10&0.21s&0.23s&0.20s&0.20s\\
% \hline
% WRN40x10/CIFAR100&0.31s&0.31s&0.29s&0.30s\\
% \hline
% ResNet50/ImageNet&0.42s&0.43s&0.38s&0.40s\\
% \hline

% \end{tabular}
% % \vspace{0.1in}
% \caption{Comparison of per-iteration computation time for HD, TLR, SOTA Baseline and AutoDrop.}
% % \vspace{0.1in}
% \label{tab:time_supp}
% \end{table}
% }
\subsection{Image Classification}\label{supp:image}
% The selection of baselines for CIFAR-10 and CIFAR-100 is based on a thorough review of the literature. The baseline learning rate setting adheres to the approaches described in \citep{zhang2019lookahead} and \citep{Zagoruyko2016WRN}, with Step-LR yielding the best results for both tasks. In all our experiments, for the baseline we use the same setting of hyperparameters (including the learning rate schedule) as recommended in the referenced literature. \textcolor{red}{- to me it is unclear what is the baseline and what is the Step-LR.}
In addition to the SOTA Baseline referred to in the main body of the paper, we also evaluated other competitors, including three manual learning rate schedulers (CLR, OneCycle, ExpLR) and two automatic learning rate schedulers (HD and TLR). 
For CLR~\cite{smith2017cyclical} we test with the \textit{OneCycle} learning rate policy and the \textit{triangular2} learning rate policy by adjusting the $stepsize$ (the number of iterations in half a cycle) for different models as recommended by the authors. For ExpLR~\citep{li2019exponential}, we grid search the decay factor $\gamma$ over $[0.8, 0.9, 0.95, 0.99, 0.999]$. For HD~\citep{gunes2018online} we grid search the hypergradient learning rate $\beta$ over $[10^{-3}, 10^{-4}, 10^{-5}]$ as suggested in the reference paper. For TLR~\citep{retsinas2022trainable} we set the gap $p$ for updating the learning rate as $0.33$ epoch and bound $c=1/4$, as recommended by the authors. For AutoDrop, we set $k=64$, $\rho=0.95$, and $m=10$ as described in Section~\ref{sec:HS}.

%For Penn TreeBank experiments we use the setup of \citep{zhuang2020adabelief}.

% In all our experiments, for the baseline we use the same setting of hyperparameters (including the learning rate schedule) as recommended in the referenced literature.

\begin{figure}[H]
    \centering
    \includegraphics[width=0.3\textwidth]{Figures/resnet18_cifar10_3/lr_all_0.05.jpg}
    \includegraphics[width=0.3\textwidth]{Figures/resnet18_cifar10_3/train_loss_all_0.05.jpg}
    \includegraphics[width=0.3\textwidth]{Figures/resnet18_cifar10_3/train_error_all_0.05.jpg}
    \includegraphics[width=0.3\textwidth]{Figures/resnet18_cifar10_3/test_loss_all_0.05.jpg}
    \includegraphics[width=0.3\textwidth]{Figures/resnet18_cifar10_3/test_error_all_0.05.jpg}
    \vspace{-0.1in}
    \caption{Experimental curves for ResNet18 model and CIFAR-$10$ data set: learning rate, test loss, test error, and zoomed subplots.}
    \label{fig:cifar10_resnet18}
\end{figure}

\begin{figure}[H]
    \centering
    \vspace{-0.1in}
    \includegraphics[width=0.3\textwidth]{Figures/wrn28x10_cifar10_3/lr_all_0.1.jpg}
    \includegraphics[width=0.3\textwidth]{Figures/wrn28x10_cifar10_3/train_loss_all_0.05.jpg}
    \includegraphics[width=0.3\textwidth]{Figures/wrn28x10_cifar10_3/train_error_all_0.05.jpg}
    \includegraphics[width=0.3\textwidth]{Figures/wrn28x10_cifar10_3/test_loss_all_0.05.jpg}
    \includegraphics[width=0.3\textwidth]{Figures/wrn28x10_cifar10_3/test_error_all_0.05.jpg}
    \vspace{-0.1in}
    \caption{Experimental curves for task WRN$28$x$10$/CIFAR-$10$: learning rate, test loss, test error, and zoomed subplots.}
    \label{fig:cifar10_wrn28x10}
    \vspace{-0.1in}
\end{figure}

\begin{figure}[H]
    \centering
    % \vspace{-0.1in}
    \includegraphics[width=0.3\textwidth]{Figures/resnet34_cifar100_3/lr_all_0.05.jpg}
    \includegraphics[width=0.3\textwidth]{Figures/resnet34_cifar100_3/train_loss_all_0.05.jpg}
    \includegraphics[width=0.3\textwidth]{Figures/resnet34_cifar100_3/train_error_all_0.05.jpg}
    \includegraphics[width=0.3\textwidth]{Figures/resnet34_cifar100_3/test_loss_all_0.05.jpg}
    \includegraphics[width=0.3\textwidth]{Figures/resnet34_cifar100_3/test_error_all_0.05.jpg}
    % \vspace{-0.2in}
    \caption{Experimental curves for ResNet$34$ model and CIFAR-$100$ data set: learning rate, test loss, test error, and zoomed subplots.}
    \label{fig:cifar100_resnet34}
    % \vspace{-0.2in}
\end{figure}

\begin{figure}[H]
    \centering
    % \vspace{-0.1in}
    \includegraphics[width=0.3\textwidth]{Figures/wrn40x10_cifar100_3/lr_all_0.1.jpg}
    \includegraphics[width=0.3\textwidth]{Figures/wrn40x10_cifar100_3/train_loss_all_0.05.jpg}
    \includegraphics[width=0.3\textwidth]{Figures/wrn40x10_cifar100_3/train_error_all_0.05.jpg}
    \includegraphics[width=0.3\textwidth]{Figures/wrn40x10_cifar100_3/test_loss_all_0.05.jpg}
    \includegraphics[width=0.3\textwidth]{Figures/wrn40x10_cifar100_3/test_error_all_0.05.jpg}
    % \vspace{-0.2in}
    \caption{Experimental curves for WRN$40$x$10$ model and CIFAR-$100$ data set: learning rate, test loss, test error, and zoomed subplots.}
    \label{fig:cifar100_wrn40x10}
    % \vspace{-0.2in}
\end{figure}

\begin{figure}[H]
    \centering
    % \vspace{-0.1in}
    \includegraphics[width=0.3\textwidth]{Figures/resnet18_imagenet/lr_0.1.jpg}
    \includegraphics[width=0.3\textwidth]{Figures/resnet18_imagenet/train_loss_0.05.jpg}
    \includegraphics[width=0.3\textwidth]{Figures/resnet18_imagenet/train_error_0.05.jpg}
    \includegraphics[width=0.3\textwidth]{Figures/resnet18_imagenet/test_loss_0.05.jpg}
    \includegraphics[width=0.3\textwidth]{Figures/resnet18_imagenet/test_accu_0.05.jpg}
    % \vspace{-0.2in}
    \caption{Experimental curves for ResNet18 model and ImageNet data set: learning rate, test loss, test error, and zoomed subplots.}
    \label{fig:imagenet_resnet18}
    % \vspace{-0.2in}
\end{figure}

\begin{figure}[H]
    \centering
    % \vspace{-0.1in}
    \includegraphics[width=0.3\textwidth]{Figures/resnet50_imagenet/lr_0.1.jpg}
    \includegraphics[width=0.3\textwidth]{Figures/resnet50_imagenet/train_loss_0.05.jpg}
    \includegraphics[width=0.3\textwidth]{Figures/resnet50_imagenet/train_error_0.05.jpg}
    \includegraphics[width=0.3\textwidth]{Figures/resnet50_imagenet/test_loss_0.05.jpg}
    \includegraphics[width=0.3\textwidth]{Figures/resnet50_imagenet/test_accu_0.05.jpg}
    % \vspace{-0.2in}
    \caption{Experimental curves for ResNet50 model and ImageNet data set: learning rate, test loss, test error, and zoomed subplots.}
    \label{fig:cifar10_resnet18_2}
    % \vspace{-0.2in}
\end{figure}

Regarding convergence, note that the theoretical convergence of our method is shown in the paper and the theoretical rate matches that of traditional optimizers, such as SGD. The convergence curves are shown above. The curves reveal that AutoDrop converges to SOTA performance, unlike other methods. Furthermore, the minimum test error for different methods at different epochs (50, 100, 150, 200) for the ResNet18/CIFAR-10 task is shown in Table~\ref{tab:convergence_epoch}. The table demonstrates that AutoDrop reaches performance comparable to the SOTA Baseline with a slightly faster convergence rate, which the other methods cannot attain.
\begin{table}[H]
\centering
\begin{tabular}{|p{1.5cm}||p{1.5cm}|p{1.5cm}|p{1.5cm}|p{1.5cm}|p{1.5cm}|p{1.5cm}|p{1.5cm}|}
\hline
Test Error&HD&TLR&CLR&OneCycle&ExpLR&SOTA Baseline&AutoDrop\\
\hline
50 epoch&8.87&6.79&8.62&9.56&6.81&11.18&8.76\\
\hline
100 epoch &6.84&5.68&5.46&8.56&5.98&5.95&5.87\\
\hline
150 epoch&6.81&5.59&5.3&8.14&5.98&4.95&4.90\\
\hline
200 epoch&6.78 &5.48&5.16 &4.96 &5.95&4.78&4.76\\
\hline
\end{tabular}
% \vspace{0.1in}
\caption{Minimum test error for different methods at different epochs (50, 100, 150, 200) for the ResNet18/CIFAR-10 task.}
% \vspace{0.1in}
\label{tab:convergence_epoch}
\end{table}

\subsection{Machine Translation}\label{supp:mt}
A transformer model based on \cite{vaswani2017attention} was trained to translate German to English on the WMT2014 data set~\citep{bojar2014findings}, using the ADAM~\citep{kingma2015} optimizer. The performance of our AutoDrop is compared with ReduceLROnPlateau~\citep{ReduceLROnPlateau}, HD and TLR. We train the model for 10K iterations. Table~\ref{tab:mt} displays the BLEU score obtained on the test data set. The proposed optimizer led to the highest score on the machine translation task. Figure~\ref{fig:tran_wmt14} below displays the training curves and shows that AutoDrop also converges faster than other methods.
\begin{figure}[H]
    \centering
    \vspace{-0.1in}
    \includegraphics[width=0.3\textwidth]{Figures/tran_wmt14/lr_all.jpg}
    \includegraphics[width=0.3\textwidth]{Figures/tran_wmt14/val_loss_all.jpg}\includegraphics[width=0.3\textwidth]{Figures/tran_wmt14/test_bleu_score_all.jpg}
    % \vspace{-0.1in}
    \caption{Experimental curves for Transformer and WMT14 data set: learning rate, validation loss, test BLEU score, and zoomed subplots.}
    \label{fig:tran_wmt14}
    % \vspace{-0.2in}
\end{figure}

% \begin{figure}[H]
%     \centering
%     \includegraphics[width=0.24\textwidth]{Figures/resnet18_cifar10_2/lr_all_0.05.jpg}
%     % \includegraphics[width=0.3\textwidth]{Figures/resnet18_cifar10_2/test_loss_all_0.05.jpg}
%     \includegraphics[width=0.24\textwidth]{Figures/resnet18_cifar10_2/test_error_all_0.05.jpg}
%     \includegraphics[width=0.24\textwidth]{Figures/wrn28x10_cifar10_2/lr_all_0.1.jpg}
%     % \includegraphics[width=0.3\textwidth]{Figures/wrn28x10_cifar10_2/test_loss_all_0.05.jpg}
%     \includegraphics[width=0.24\textwidth]{Figures/wrn28x10_cifar10_2/test_error_all_0.05.jpg}
%     \vspace{-0.1in}
%     \caption{Experimental curves for ResNet18/CIFAR-$10$ \& WRN28x10/CIFAR10: learning rate, test loss, test error, and zoomed subplots.}
%     \label{fig:cifar10_resnet18}
% \end{figure}
