% \documentclass{uai2023} % for initial submission
\documentclass[accepted]{uai2023} % after acceptance, for a revised
% version; also before submission to
% see how the non-anonymous paper
% would look like

%% There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2023} % ptmx math instead of Computer
% Modern (has noticable issues)
% \documentclass[mathfont=newtx]{uai2023} % newtx fonts (improves upon
 % ptmx; less tested, no support)
% NOTE: Only keep *one* line above as appropriate, as it will be replaced
%       automatically for papers to be published. Do not make any other
%       change above this note for an accepted version.

%% Choose your variant of English; be consistent
\usepackage[american]{babel}
% \usepackage[british]{babel}

%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} % amsmath with fixes and additions
% \usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{tikz} % nice language for creating drawings and diagrams


\usepackage{amsmath}
\usepackage{amssymb}
\usepackage{mathtools}
\usepackage{amsthm}
\usepackage{multirow}
\usepackage{multicol}
\usepackage{pifont}
\usepackage{comment}

\theoremstyle{plain}
\newtheorem{theorem}{Theorem}[section]
\newtheorem{proposition}[theorem]{Proposition}
\newtheorem{lemma}[theorem]{Lemma}
\newtheorem{corollary}[theorem]{Corollary}
\theoremstyle{definition}
\newtheorem{definition}[theorem]{Definition}
\newtheorem{assumption}[theorem]{Assumption}
\theoremstyle{remark}
\newtheorem{remark}[theorem]{Remark}

\input{math_comands.tex}

% for cross referencing the main text
% PLEASE ONLY USE xr IN THE SUPPLEMENTARY MATERIAL. 
% In the main paper, hard code any cross-reference to the supplementary material. 
\usepackage{xr} 
\externaldocument{han_567}

% \usepackage{xcite}
% \externalcitedocument{han_567}

% \let\oldbibitem\bibitem
% \renewcommand{\bibitem}[1]{\oldbibitem[\citep{#1}]{#1}}
% \makeatletter
% \renewcommand\@biblabel[1]{#1}
% \makeatother


%% Provided macros
% \smaller: Because the class footnote size is essentially LaTeX's \small,
%           redefining \footnotesize, we provide the original \footnotesize
%           using this macro.
%           (Use only sparingly, e.g., in drawings, as it is quite small.)

%% Self-defined macros
\newcommand{\swap}[3][-]{#3#1#2} % just an example

\title{On the Convergence of Continual Learning with Adaptive Methods\\(Supplementary Material)}

% The standard author block has changed for UAI 2023 to provide
% more space for long author lists and allow for complex affiliations
%
% All author information is authomatically removed by the class for the
% anonymous submission version of your paper, so you can already add your
% information below.
%
\author[1]{\href{mailto:<seungyubhan@snu.ac.kr>?Subject=On the convergence of continual learning}{Seungyub Han}{}}
\author[1]{Yeongmo Kim}
\author[1]{Taehyun Cho}
\author[1]{Jungwoo Lee}
% \author[1]{Further~Coauthor}
% \author[3]{Further~Coauthor}
% \author[3,1]{Further~Coauthor}
% Add affiliations after the authors
\affil[1]{%
    Electrical and Computer Engineering Dept.\\
    Seoul National University\\
    Seoul, Republic of Korea
}
  
  \begin{document}
  
\onecolumn %% Remove this if a two-column layout is desired for the supplement
\maketitle
\appendix


%%%%%%%%%%%%%%%%%
\section{Additional Background and Extended Discussion}\label{sec:append_back}
\subsection{Summary of notations}
\begin{table}[hbt!]
\centering
\begin{tabular}{@{}cccc@{}}
\toprule
Notations     & Definitions                             & Notations                       & Definitions                           \\ \midrule
$x$           & model parameter                         & $H_t$                           & the union of $I_t$ and $J_t$          \\
$\mathcal{P}$ & previous task                          & $n_f$                           & the number of data points in $P$      \\
$\mathcal{C}$ & current task                          & $n_g$                           & the number of data points in $C$      \\
$P$           & dataset of $\mathcal{P}$                & $\langle \cdot , \cdot \rangle$ & inner product                         \\
$C$           & dataset of $\mathcal{C}$                & $L$                             & $L$-smoothness constant               \\
$h(x)$       & mean loss of $x$ on entire datasets     & $\alpha_{H_t}$  & adaptive step size for $f$ with $H_t$                       \\
$f(x)$        & mean loss of $x$ on $P$                 & $\beta_{H_t}$                   & adaptive step size for $g$ with $H_t$ \\
$g(x)$        & mean loss of $x$ on $C$                 & $M_t$                           & memory at time $t$                    \\
$f_{i}(x)$    & loss of $x$ on a data point $i\in P$    & $e_t$                           & error of estimate $f$ at time $t$     \\
$g_{j}(x)$    & loss of $x$ on a data point $j \in C$   & $e_{M_t}$                       & error of estimate $f$ with $M_t$      \\
$f_{I_t}(x)$  & mini-batch loss of $x$ on a batch $I_t$ & $f_{M_t}$                       & mean loss of $x$ with $M_t$           \\
$g_{J_t}(x)$ & mini-batch loss of $x$ on a batch $J_t$ & $M_{[t_1:t_2]}$ & the history of memory from $t_1$ to $t_2$ \\
$I_t$         & minibatch sampled from $P$              & $B_t$                           & memory bias term at $t$           \\
$J_t$         & minibatch sampled from $C$              & $\Gamma_t$                      & forgetting term at $t$            \\
$\E_t$       & total expectation from 0 to time $t$    & $\Lambda_{H_t}$ & inner product between $\nabla f_{I_t}$ and $\nabla g_{J_t}$ \\ \bottomrule
\end{tabular}
\end{table}

\subsection{Review of terminology}
\textbf{(Restriction of $f$)} If $f : A \rightarrow B$ and if $A_0$ is a subset of $A$, then the \textbf{restriction of $f$ to $A_0$} is the function
\begin{equation*}
    f|_{A_0} : A_0 \rightarrow B
\end{equation*}
given by $f|_{A_0}(x) = f(x)$ for $x \in A_0$.

\subsection{Additional Related work}

\textbf{Regularization-based methods.} EWC has an additional penalization loss that prevents the update of parameters from losing the information of previous tasks. When we update a model with EWC, we have two gradient components from the current task and the penalization loss.

\textbf{Task-specific model components.} SupSup learns a separate subnetwork for each task and makes predictions for given data by superimposing all supermasks. It is a novel method that addresses catastrophic forgetting by taking advantage of neural networks.

\textbf{SGD methods without experience replay.} Stable SGD \citep{mirzadeh2020understanding} and MC-SGD \citep{jin2021gradient} show overall higher performance in terms of average accuracy than the proposed algorithm. For average forgetting, our method has the lowest value, which means that NCCL successfully prevents catastrophic forgetting while achieving reasonable performance on the current task. We think that our method is focused on reducing catastrophic forgetting as we defined in the reformulated continual learning problem (12), so our method shows better performance on average forgetting. On the other hand, MC-SGD finds low-loss paths with mode-connectivity by updating with the proposed regularization loss. This procedure implies that a continual learning model might find a better local minimum point for the new (current) task than NCCL.

% NOTE(review): a symbol appears to be missing after "the value of" in the last sentence of this paragraph; restore it from the authors' draft.
For non-memory-based methods, a theoretical measure to observe forgetting and convergence during training does not exist. Our theoretical results are the first attempt to analyze the convergence of previous tasks during the continual learning procedure. In future work, we can approximate the value of  with Fisher information for EWC and introduce Bayesian deep learning to analyze the convergence of the subnetwork for each task in the case of SupSup \citep{wortsman2020supermasks}.




\section{Additional Experimental Results and Implementation Details}\label{sec:append_exp}
We implement the baselines and the proposed method in TensorFlow 1. For evaluation, we use an NVIDIA 2080 Ti GPU along with a 3.60 GHz Intel i9-9900K CPU and 64 GB RAM.

\subsection{Architecture and Training detail} 
For a fair comparison, we follow the commonly used model architecture and hyperparameters of \citet{DBLP:conf/iclr/LeeHZK20, chaudhry2020continual}.
For Permuted-MNIST and Split-MNIST, we use fully-connected neural networks with two hidden layers of $[400,400]$ or $[256,256]$ and ReLU activation. ResNet-18 with the number of filters $n_f=64, 20$ \citep{he2016deep} is applied for Split CIFAR-10 and 100.
All experiments make a single pass over the data stream, which is also called 1 epoch or 0.2 epoch (in the case of split tasks). We consider both cases, with and without task identifiers, in the split-task results to compare fairly with the baselines. The batch sizes of the data stream and the memory are both 10.
All reported values are averages over 5 runs with different seeds, and we also provide the standard deviation. Other miscellaneous settings are the same as in \citet{chaudhry2020continual}.

\subsection{Hyperparameter grids}
We report the hyperparameter grids we used in our experiments below.
Except for the proposed algorithm, we adopted the hyperparameters that are reported in the original papers.
We used grid search to find the optimal parameters for each model.
\begin{itemize}
    \item finetune
    \begin{itemize}
        \item learning rate: [0.003, 0.01, 0.03 (CIFAR), 0.1 (MNIST), 0.3, 1.0]
    \end{itemize}

    \item EWC
    \begin{itemize}
        \item learning rate: [0.003, 0.01, 0.03 (CIFAR), 0.1 (MNIST), 0.3, 1.0]
        \item regularization: [0.1, 1, 10 (MNIST, CIFAR), 100, 1000]
    \end{itemize}

    \item A-GEM
    \begin{itemize}
        \item learning rate: [0.003, 0.01, 0.03 (CIFAR), 0.1 (MNIST), 0.3, 1.0]
    \end{itemize}

    \item ER-Ring
    \begin{itemize}
        \item learning rate: [0.003, 0.01, 0.03 (CIFAR), 0.1 (MNIST), 0.3, 1.0]
    \end{itemize}

    \item ORTHOG-SUBSPACE
    \begin{itemize}
        \item learning rate: [0.003, 0.01, 0.03, 0.1 (MNIST), 0.2, 0.4 (CIFAR), 1.0]
    \end{itemize}

    \item MER
    \begin{itemize}
        \item learning rate: [0.003, 0.01, 0.03 (MNIST, CIFAR), 0.1, 0.3, 1.0]
        \item within batch meta-learning rate: [0.01, 0.03, 0.1 (MNIST, CIFAR), 0.3, 1.0]
        \item current batch learning rate multiplier: [1, 2, 5 (CIFAR), 10 (MNIST)]
    \end{itemize}

    \item iid-offline and iid-online
    \begin{itemize}
        \item learning rate: [0.003, 0.01, 0.03 (CIFAR), 0.1 (MNIST), 0.3, 1.0]
    \end{itemize}

    \item ER-Reservoir
    \begin{itemize}
        \item learning rate: [0.003, 0.01, 0.03, 0.1 (MNIST, CIFAR), 0.3, 1.0]
    \end{itemize}

    \item NCCL-Ring (default)
    \begin{itemize}
        \item learning rate $\alpha$: [0.003, 0.001 (CIFAR), 0.01, 0.03, 0.1, 0.3, 1.0]
    \end{itemize}

    \item NCCL-Reservoir
    \begin{itemize}
        \item learning rate $\alpha$: [0.003 (CIFAR), 0.001, 0.01, 0.03, 0.1, 0.3, 1.0]
    \end{itemize}
\end{itemize}

\subsection{Hyperparameter Search on $\beta_{max}$ and Training Time}
\begin{table}[hbt!]
\caption{Permuted-MNIST (23 tasks 10000 examples per task), FC-[256,256] and Multi-headed split-CIFAR100, full size Resnet-18.  Accuracies with different clipping rates on NCCL + Ring.}
\centering
\begin{tabular}{@{}ccc@{}}
\toprule
\textbf{$\beta_{max}$} & \textbf{Permuted-MNIST} & \textbf{Split-CIFAR100} \\ \midrule
0.001                  & 72.52(0.59)             & 49.43(0.65)             \\
0.01                   & 72.93(1.38)             & 56.95(1.02)             \\
0.05                   & 72.18(0.77)             & 56.35(1.42)             \\
0.1                    & 72.29(1.34)             & 58.20(0.155)            \\
0.2                    & 74.38(0.89)             & 57.60(0.36)             \\
0.5                    & 72.95(0.50)             & 59.06(1.02)             \\
1                      & 72.92(1.07)             & 57.43(1.33)             \\
5                      & 72.31(1.79)             & 57.75(0.24)             \\ \bottomrule
\end{tabular}
\label{tab:clipping}
\end{table}


\begin{table}[hbt!]
\caption{Permuted-MNIST (23 tasks 10000 examples per task), FC-[256,256] and Multi-headed split-CIFAR100, full size Resnet-18. Training time.}
\centering
\begin{tabular}{@{}ccc@{}}
\toprule
\multirow{2}{*}{\textbf{Methods}} & \multicolumn{2}{c}{\textbf{Training time {[}s{]}}} \\ \cmidrule(l){2-3} 
                                  & \textbf{Permuted-MNIST}  & \textbf{Split-CIFAR100} \\ \midrule
fine-tune                         & 91                       & 92                      \\
EWC                               & 95                       & 159                     \\
A-GEM                             & 180                      & 760                     \\
ER-Ring                           & 109                      & 129                     \\
ER-Reservoir                      & 95                       & 113                     \\
ORTHOG-SUBSPACE                   & 90                       & 581                     \\
NCCL+Ring                         & 167                      & 248                     \\
NCCL+Reservoir                    & 168                      & 242                     \\ \bottomrule
\end{tabular}
\end{table}
\clearpage

\subsection{Additional Experiment Results}
\label{sec:addresult}

% \begin{figure*}[hbt!]
% \centering
% \includegraphics[width=\linewidth]{figures/vis1.pdf}
% \caption{Illustration of how the cumulative forgetting term, $\sum_{t}\E[\Gamma_t]$ controls the performance of continual learning. (a) Evolution of $\sum_{t}\E[\Gamma_t]$ over 20 seqeuntial tasks from multi-headed split-CIFAR100 with reduced Resnet-18 and memory size 1. (b) zoomed version of (1). (c) the relation between $\sum_{t}\E[\Gamma_t]$ and average accuracy. (d) the relation between $\sum_{t}\E[\Gamma_t]$ and forgetting. 
% }
% \label{fig:vis1}
% \end{figure*}

% Figure \ref{fig:vis1} shows that $\sum_{t}\E[\Gamma_t]$ increases over tasks in all continual learning algorithms, and the proposed algorithms, NCCL-ring and NCCL-reservoir successfully suppress $\sum_{t}\E[\Gamma_t]$.
% In Figure \ref{fig:vis1}-(c),(d), we can observe that the higher $\sum_{t}\E[\Gamma_t]$ has both the smaller average accuracy and the larger forgetting.
% Therefore, we conclude that minimizing $\sum_{t}\E[\Gamma_t]$ is a key factor to continual learning empirically.

% % \begin{figure*}[hbt!]
% % \centering
% % \includegraphics[width=0.6\linewidth]{figures/vis2.pdf}
% % \caption{Illustration of empirical $B_t$ with different 5 seeds at the end of tasks.
% % }
% % \label{fig:vis2}
% % \end{figure*}

% Figure \ref{fig:vis2} shows that the emprical property of bias term.
% We can observe that the empirical mean value over different selections of $M_0$ is 0 surprisingly.
% In addition, we can also note that its variance is not large.
% Therefore, the theoretical analysis on overfitting memory is verified.

% \begin{table}[hbt!]
% \caption{Multi-headed split-CIFAR100, full size Resnet-18 $n_f=64$. Accuracy and forgetting results.}
% \centering
% \begin{tabular}{|c|c|c|c|c|c|}
% \hline
%                  & memory & \multicolumn{2}{c|}{1}    & \multicolumn{2}{c|}{5}     \\ \hline
% Method           &        & accuracy    & forgetting  & accuracy    & forgetting   \\ \hline
% Fintune          & x      & 42.6(2.72)  & 0.27(0.02)  & 42.6(2.72)  & 0.27(0.02)   \\ \hline
% EWC              & x      & 43.2(2.77)  & 0.26(0.02)  & 43.2(2.77)  & 0.26(0.02)   \\ \hline
% ICRAL            & o      & 46.4(1.21)  & 0.16(0.01)  & -           & -            \\ \hline
% A-GEM            & o      & 51.3(3.49)  & 0.18(0.03)  & 60.9(2.5)   & 0.11(0.01)   \\ \hline
% MER              & o      & 49.7(2.97)  & 0.19(0.03)  & -           & -            \\ \hline
% ER-Ring          & o      & 59.6(1.19)  & 0.14(0.01)  & 67.2(1.72)  & 0.06(0.01)   \\ \hline
% ER-Reservoir     & o      & 51.5(2.15)  & 0.14(0.09)  & 62.68(0.91) & 0.06(0.01)   \\ \hline
% ORTHOG-subspace  & o      & 64.3(0.59)  & 0.07(0.01)  & 67.3(0.98)  & 0.05(0.01)   \\ \hline
% NCCL + Ring      & o      & 59.06(1.02) & 0.03(0.02)  & 66.58(0.12) & 0.004(0.003) \\ \hline
% NCCL + Reservoir & o      & 54.7(0.91)  & 0.083(0.01) & 66.37(0.19) & 0.004(0.001) \\ \hline
% \end{tabular}
% \end{table}

\begin{table*}[hbt!]
\caption{Permuted-MNIST (23 tasks 60000 examples per task), FC-[256,256].}
    \centering
\begin{tabular}{@{}cccccc@{}}
\toprule
\multirow{2}{*}{\textbf{Method}} & \textbf{memory size}       & \multicolumn{2}{c}{\textbf{1}} & \multicolumn{2}{c}{\textbf{5}} \\ \cmidrule(l){2-6} 
                                 & \textbf{memory}            & accuracy       & forgetting    & accuracy      & forgetting     \\ \midrule
multi-task                       & \ding{55} & 83             & -             & 83            & -              \\
Fine-tune                        & \ding{55} & 53.5 (1.46)     & 0.29 (0.01)    & 47.9          & 0.29 (0.01)     \\
EWC                              & \ding{55} & 63.1 (1.40)     & 0.18 (0.01)    & 63.1 (1.40)    & 0.18 (0.01)     \\
stable SGD                       & \ding{55} & 80.1 (0.51)     & 0.09 (0.01)    & 80.1 (0.51)    & 0.09 (0.01)     \\
MC-SGD                         & \ding{55} & 85.3 (0.61)     & 0.06 (0.01)    & 85.3 (0.61)    & 0.06 (0.01)     \\
MER                              & \ding{51} & 69.9 (0.40)     & 0.14 (0.01)    & 78.3 (0.19)    & 0.06 (0.01)     \\
A-GEM                            & \ding{51} & 62.1 (1.39)     & 0.21 (0.01)    & 64.1 (0.74)    & 0.19 (0.01)     \\
ER-Ring                          & \ding{51} & 70.2 (0.56)     & 0.12 (0.01)    & 75.8 (0.24)    & 0.07 (0.01)     \\
ER-Reservoir                     & \ding{51} & 68.9 (0.89)     & 0.15 (0.01)    & 76.2 (0.38)    & 0.07 (0.01)     \\
ORTHOG-subspace                  & \ding{51} & 84.32 (1.10)    & 0.12 (0.01)    & 84.32 (1.1)    & 0.11 (0.01)     \\ \midrule
NCCL + Ring                      & \ding{51} & 74.22 (0.75)    & 0.13 (0.007)   & 84.41 (0.32)   & 0.053 (0.002)   \\
NCCL+Reservoir                   & \ding{51} & 79.36 (0.73)    & \textbf{0.12 (0.007)}   & \textbf{88.22 (0.26)}   & \textbf{0.028 (0.003)}   \\ \bottomrule
\end{tabular}
    \label{tab:permuted_60000}
\end{table*}


\begin{table*}[hbt!]
\caption{Multi-headed split-CIFAR100, reduced size Resnet-18 $n_f=20$.}
\centering
\begin{tabular}{@{}cccccc@{}}
\toprule
\multirow{2}{*}{\textbf{Method}} & \textbf{memory size}       & \multicolumn{2}{c}{\textbf{1}} & \multicolumn{2}{c}{\textbf{5}} \\ \cmidrule(l){2-6} 
                                 & \textbf{memory}            & accuracy       & forgetting    & accuracy      & forgetting     \\ \midrule
EWC                              & \ding{55} & 42.7 (1.89)     & 0.28 (0.03)    & 42.7 (1.89)    & 0.28 (0.03)     \\
Fine-tune                        & \ding{55} & 40.4 (2.83)     & 0.31 (0.02)    & 40.4 (2.83)    & 0.31 (0.02)     \\
Stable SGD                       & \ding{55} & 59.9 (1.81)     & 0.08 (0.01)    & 59.9 (1.81)    & 0.08 (0.01)     \\
MC-SGD                       & \ding{55} & 63.3 (2.21)     & 0.06 (0.03)    & 63.3 (2.21)    & 0.06 (0.03)     \\
A-GEM                            & \ding{51} & 50.7 (2.32)     & 0.19 (0.04)    & 59.9 (2.64)    & 0.10 (0.02)     \\
ER-Ring                          & \ding{51} & 56.2 (1.93)     & 0.13 (0.01)    & 62.6 (1.77)    & 0.08 (0.02)     \\
ER-Reservoir                     & \ding{51} & 46.9 (0.76)     & 0.21 (0.03)    & 65.5 (1.99)    & 0.09 (0.02)     \\
ORTHOG-subspace                  & \ding{51} & 58.81 (1.88)    & 0.12 (0.02)    & 64.38 (0.95)   & 0.055 (0.007)   \\ \midrule
NCCL + Ring                      & \ding{51} & 54.63 (0.65)    & \textbf{0.059 (0.01)}   & 61.09 (1.47)   & \textbf{0.02 (0.01)}     \\
NCCL + Reservoir                 & \ding{51} & 52.18 (0.48)    & 0.118 (0.01)   & 63.68 (0.18)   & 0.028 (0.009)   \\ \bottomrule
\end{tabular}
    \label{tab:cifar100_reduced}
\end{table*}



\begin{table}[hbt!]
\caption{Multi-headed split-MiniImagenet, full size Resnet-18 $n_f=64$. Accuracy and forgetting results.}
\centering
\begin{tabular}{@{}cccc@{}}
\toprule
\multirow{2}{*}{Method} & memory size                & \multicolumn{2}{c}{1}   \\ \cmidrule(l){2-4} 
                        & memory                     & accuracy   & forgetting \\ \midrule
Fine-tune               & \ding{55} & 36.1(1.31) & 0.24(0.03) \\
EWC                     & \ding{55} & 34.8(2.34) & 0.24(0.04) \\
A-GEM                   & \ding{51} & 42.3(1.42) & 0.17(0.01) \\
MER                     & \ding{51} & 45.5(1.49) & 0.15(0.01) \\
ER-Ring                 & \ding{51} & 49.8(2.92) & 0.12(0.01) \\
ER-Reservoir            & \ding{51} & 44.4(3.22) & 0.17(0.02) \\
ORTHOG-subspace         & \ding{51} & 51.4(1.44) & 0.10(0.01) \\
NCCL + Ring & \ding{51} & 45.5(0.245) & \textbf{0.041(0.01)} \\
NCCL + Reservoir & \ding{51} & 41.0(1.02) & \textbf{0.09(0.01)} \\\bottomrule
\end{tabular}
\end{table}

% \begin{table*}[hbt!]
% \caption{Multi-headed split-CIFAR100, reduced size Resnet-18 $n_f=20$.}
% \centering
% \begin{tabular}{@{}cccccc@{}}
% \toprule
% \multirow{2}{*}{\textbf{Method}} & \textbf{memory size}       & \multicolumn{2}{c}{\textbf{1}} & \multicolumn{2}{c}{\textbf{5}} \\ \cmidrule(l){2-6} 
%                                  & \textbf{memory}            & accuracy       & forgetting    & accuracy      & forgetting     \\ \midrule
% EWC                              & \ding{55} & 42.7 (1.89)     & 0.28 (0.03)    & 42.7 (1.89)    & 0.28 (0.03)     \\
% Fintune                          & \ding{55} & 40.4 (2.83)     & 0.31 (0.02)    & 40.4 (2.83)    & 0.31 (0.02)     \\
% Stable SGD                       & \ding{55} & 59.9 (1.81)     & 0.08 (0.01)    & 59.9 (1.81)    & 0.08 (0.01)     \\
% MC-SGD                       & \ding{55} & 63.3 (2.21)     & 0.06 (0.03)    & 63.3 (2.21)    & 0.06 (0.03)     \\
% A-GEM                            & \ding{51} & 50.7 (2.32)     & 0.19 (0.04)    & 59.9 (2.64)    & 0.10 (0.02)     \\
% ER-Ring                          & \ding{51} & 56.2 (1.93)     & 0.13 (0.01)    & 62.6 (1.77)    & 0.08 (0.02)     \\
% ER-Reservoir                     & \ding{51} & 46.9 (0.76)     & 0.21 (0.03)    & 65.5 (1.99)    & 0.09 (0.02)     \\
% ORTHOG-subspace                  & \ding{51} & 58.81 (1.88)    & 0.12 (0.02)    & 64.38 (0.95)   & 0.055 (0.007)   \\ \midrule
% NCCL + Ring                      & \ding{51} & 54.63 (0.65)    & \textbf{0.059 (0.01)}   & 61.09 (1.47)   & \textbf{0.02 (0.01)}     \\
% NCCL + Reservoir                 & \ding{51} & 52.18 (0.48)    & 0.118 (0.01)   & 63.68 (0.18)   & 0.028 (0.009)   \\ \bottomrule
% \end{tabular}
%     \label{tab:cifar100_reduced}
% \end{table*}


\begin{table}[hbt!]
\caption{Multi-headed split-CIFAR100, full size Resnet-18 $n_f=64$. Accuracy and forgetting results.}
\centering
\begin{tabular}{@{}cccccc@{}}
\toprule
\multirow{2}{*}{\textbf{Method}} & \textbf{memory size}       & \multicolumn{2}{c}{\textbf{1}} & \multicolumn{2}{c}{\textbf{5}} \\ \cmidrule(l){2-6} 
                                 & \textbf{memory}            & accuracy       & forgetting    & accuracy      & forgetting     \\ \midrule
Fine-tune                        & \ding{55} & 42.6 (2.72)    & 0.27 (0.02)    & 42.6 (2.72)    & 0.27 (0.02)     \\
EWC                              & \ding{55} & 43.2 (2.77)     & 0.26 (0.02)    & 43.2 (2.77)    & 0.26 (0.02)     \\
ICRAL                            & \ding{51} & 46.4 (1.21)     & 0.16 (0.01)    & -             & -              \\
A-GEM                            & \ding{51} & 51.3 (3.49)     & 0.18 (0.03)    & 60.9 (2.5)     & 0.11 (0.01)     \\
MER                              & \ding{51} & 49.7 (2.97)     & 0.19 (0.03)    & -             & -              \\
ER-Ring                          & \ding{51} & 59.6 (1.19)     & 0.14 (0.01)    & 67.2 (1.72)    & 0.06 (0.01)     \\
ER-Reservoir                     & \ding{51} & 51.5 (2.15)     & 0.14 (0.09)    & 62.68 (0.91)   & 0.06 (0.01)     \\
ORTHOG-subspace                  & \ding{51} & 64.3 (0.59)     & 0.07 (0.01)    & 67.3 (0.98)    & 0.05 (0.01)     \\ \midrule
NCCL + Ring                      & \ding{51} & 59.06 (1.02)    & 0.03 (0.02)    & 66.58 (0.12)   & 0.004 (0.003)   \\
NCCL + Reservoir                 & \ding{51} & 54.7 (0.91)     & 0.083 (0.01)   & 66.37 (0.19)   & 0.004 (0.001)   \\ \bottomrule
\end{tabular}
\end{table}



% \begin{table}[hbt!]
% \caption{permuted-MNIST (23 tasks 10000 examples per task), FC-[256,256]. Accuracy and forgetting results.}
% \centering
% \begin{tabular}{|c|c|c|c|c|c|}
% \hline
%               & memory & \multicolumn{2}{c|}{1}    & \multicolumn{2}{c|}{5}     \\ \hline
% Method         &        & accuracy    & forgetting  & accuracy    & forgetting   \\ \hline
% multi-task     & x      & 91.3        & -           & 83          & -            \\ \hline
% Fine-tune      & x      & 50.6(2.57)  & 0.29(0.01)  & 47.9        & 0.29(0.01)   \\ \hline
% EWC            & x      & 68.4(0.76)  & 0.18(0.01)  & 63.1(1.40)  & 0.18(0.01)   \\ \hline
% MER            & o      & 78.6(0.84)  & 0.15(0.01)  & 88.34(0.26) & 0.049(0.003) \\ \hline
% A-GEM          & o      & 78.3(0.42)  & 0.21(0.01)  & 64.1(0.74)  & 0.19(0.01)   \\ \hline
% ER-Ring        & o      & 79.5(0.31)  & 0.12(0.01)  & 75.8(0.24)  & 0.07(0.01)   \\ \hline
% ER-Reservoir   & o      & 68.9(0.89)  & 0.15(0.01)  & 76.2(0.38)  & 0.07(0.01)   \\ \hline
% ORHOG-subspace & o      & 86.6(0.91)  & 0.04(0.01)  & 87.04(0.43) & 0.04(0.003)  \\ \hline
% NCCL + Ring    & o      & 74.38(0.89) & 0.05(0.009) & 83.76(0.21) & 0.014(0.001) \\ \hline
% NCCL+Reservoir & o      & 76.48(0.29) & 0.1(0.002)  & 86.02(0.06) & 0.013(0.002) \\ \hline
% \end{tabular}
% \end{table}

\begin{table}[hbt!]
\caption{permuted-MNIST (23 tasks 10000 examples per task), FC-[256,256]. Accuracy and forgetting results.}
\centering
\begin{tabular}{@{}cccccc@{}}
\toprule
\multirow{2}{*}{\textbf{Method}} & \textbf{memory size}       & \multicolumn{2}{c}{\textbf{1}} & \multicolumn{2}{c}{\textbf{5}} \\ \cmidrule(l){2-6} 
                                 & \textbf{memory}            & accuracy       & forgetting    & accuracy      & forgetting     \\ \midrule
multi-task                       & \ding{55} & 91.3           & -             & 83            & -              \\
Fine-tune                        & \ding{55} & 50.6 (2.57)     & 0.29 (0.01)    & 47.9          & 0.29 (0.01)     \\
EWC                              & \ding{55} & 68.4 (0.76)     & 0.18 (0.01)    & 63.1 (1.40)    & 0.18 (0.01)     \\
MER                              & \ding{51} & 78.6 (0.84)     & 0.15 (0.01)    & 88.34 (0.26)   & 0.049 (0.003)   \\
A-GEM                            & \ding{51} & 78.3 (0.42)     & 0.21 (0.01)    & 64.1 (0.74)    & 0.19 (0.01)     \\
ER-Ring                          & \ding{51} & 79.5 (0.31)     & 0.12 (0.01)    & 75.8 (0.24)    & 0.07 (0.01)     \\
ER-Reservoir                     & \ding{51} & 68.9 (0.89)     & 0.15 (0.01)    & 76.2 (0.38)    & 0.07 (0.01)     \\
ORTHOG-subspace                  & \ding{51} & 86.6 (0.91)     & 0.04 (0.01)    & 87.04 (0.43)   & 0.04 (0.003)    \\ \midrule
NCCL + Ring                      & \ding{51} & 74.38 (0.89)    & 0.05 (0.009)   & 83.76 (0.21)   & 0.014 (0.001)   \\
NCCL+Reservoir                   & \ding{51} & 76.48 (0.29)    & 0.1 (0.002)    & 86.02 (0.06)   & 0.013 (0.002)   \\ \bottomrule
\end{tabular}
\end{table}




% \begin{table}[hbt!]
% \caption{Single-headed split-MNIST, FC-[256,256]. Accuracy and forgetting results.}
% \centering
% \resizebox{0.95\linewidth}{!}{
% \begin{tabular}{|c|c|c|c|c|c|c|c|}
% \hline
%           & memory & \multicolumn{2}{c|}{1}   & \multicolumn{2}{c|}{5} & \multicolumn{2}{c|}{50} \\ \hline
% Method     &        & accuracy    & forgetting & accuracy  & forgetting & accuracy  & forgetting  \\ \hline
% multi-task & x      & 95.2        & -          & -         & -          & -         & -           \\ \hline
% Fine-tune  & x      & 52.52(5.24) & 0.41(0.06) & -         & -          & -         & -           \\ \hline
% EWC        & x      & 56.48(6.46) & 0.31(0.05) & -         & -          & -         & -           \\ \hline
% A-GEM          & o & 34.04(7.10) & 0.23(0.11)   & 33.57(6.32) & 0.18(0.03)  & 33.35(4.52) & 0.12(0.04)    \\ \hline
% ER-Reservoir   & o & 34.63(6.03) & 0.79(0.07)   & 63.60(3.11) & 0.42(0.05)  & 86.17(0.99) & 0.13(0.016)   \\ \hline
% NCCL + Ring    & o & 34.64(3.27) & 0.55(0.03)   & 61.02(6.21) & 0.207(0.07) & 81.35(8.24) & -0.03(0.1)    \\ \hline
% NCCL+Reservoir & o & 37.02(0.34) & 0.509(0.009) & 65.4(0.7)   & 0.16(0.006) & 88.9(0.28)  & -0.125(0.004) \\ \hline
% \end{tabular}}
% \end{table}

\begin{table}[hbt!]
\caption{Single-headed split-MNIST, FC-[256,256]. Accuracy and forgetting results.}
\centering
\resizebox{\linewidth}{!}{%
\begin{tabular}{@{}cccccccc@{}}
\toprule
\multirow{2}{*}{\textbf{Method}} & \textbf{memory size}       & \multicolumn{2}{c}{\textbf{1}} & \multicolumn{2}{c}{\textbf{5}} & \multicolumn{2}{c}{\textbf{50}} \\ \cmidrule(l){2-8} 
                                 & \textbf{memory}            & accuracy      & forgetting     & accuracy       & forgetting    & accuracy      & forgetting      \\ \midrule
multi-task                       & \ding{55} & 95.2          & -              & -              & -             & -             & -               \\
Fine-tune                        & \ding{55} & 52.52 (5.24)   & 0.41 (0.06)     & -              & -             & -             & -               \\
EWC                              & \ding{55} & 56.48 (6.46)   & 0.31 (0.05)     & -              & -             & -             & -               \\
A-GEM                            & \ding{51} & 34.04 (7.10)   & 0.23 (0.11)     & 33.57 (6.32)    & 0.18 (0.03)    & 33.35 (4.52)   & 0.12 (0.04)      \\
ER-Reservoir                     & \ding{51} & 34.63 (6.03)   & 0.79 (0.07)     & 63.60 (3.11)    & 0.42 (0.05)    & 86.17 (0.99)   & 0.13 (0.016)     \\ \midrule
NCCL + Ring                      & \ding{51} & 34.64 (3.27)   & 0.55 (0.03)     & 61.02 (6.21)    & 0.207 (0.07)   & 81.35 (8.24)   & -0.03 (0.1)     \\
NCCL+Reservoir                   & \ding{51} & 37.02 (0.34)   & 0.509 (0.009)   & 65.4 (0.7)      & 0.16 (0.006)   & 88.9 (0.28)    & -0.125 (0.004)  \\ \bottomrule
\end{tabular}}
\end{table}



% \begin{table}[hbt!]
% \caption{Single-headed split-MNIST, FC-[400,400] and mem. size=500(50 / cls.). Accuracy and forgetting results.}
% \centering
% \begin{tabular}{|c|c|}
% \hline
% mem=50           &             \\ \hline
% Method           & accuracy    \\ \hline
% multi-task       & 96.18       \\ \hline
% Fine-tune        & 50.9(5.53)  \\ \hline
% EWC              & 55.40(6.29) \\ \hline
% A-GEM            & 26.49(5.62) \\ \hline
% ER-Reservoir     & 85.1(1.02)  \\ \hline
% CN-DPM           & 93.23       \\ \hline
% Gdumb     & 91.9(0.5)   \\ \hline
% NCCL + Reservoir & 95.15(0.91) \\ \hline
% \end{tabular}
% \end{table}

\begin{table}[hbt!]
\caption{Single-headed split-MNIST, FC-[400,400] and memory size 500 (50 per class). Accuracy results.}
\centering
\begin{tabular}{@{}cc@{}}
\toprule
\textbf{Method}  & \textbf{accuracy} \\ \midrule
multi-task       & 96.18             \\
Fine-tune        & 50.9 (5.53)        \\
EWC              & 55.40 (6.29)       \\
A-GEM            & 26.49 (5.62)       \\
ER-Reservoir     & 85.1 (1.02)        \\
CN-DPM           & 93.23             \\
Gdumb            & 91.9 (0.5)         \\
NCCL + Reservoir & 95.15 (0.91)       \\ \bottomrule
\end{tabular}
\end{table}


% \begin{table}[hbt!]
% \caption{Single-headed split-CIFAR10, full size Resnet-18
%  and mem. size=500(50 / cls.). Accuracy and forgetting results.}
% \centering
% \begin{tabular}{|c|c|}
% \hline
% mem=50             &             \\ \hline
% Method             & accuracy    \\ \hline
% iid-offline        & 93.17       \\ \hline
% iid-online         & 36.65       \\ \hline
% Fine-tune          & 12.68       \\ \hline
% EWC                & 53.49(0.72) \\ \hline
% A-GEM              & 54.28(3.48) \\ \hline
% GSS                & 33.56       \\ \hline
% Reservoir Sampling & 37.09       \\ \hline
% CN-DPM             & 41.78       \\ \hline
% NCCL + Ring        & 54.63(0.76) \\ \hline
% NCCL + Reservoir   & 55.43(0.32) \\ \hline
% \end{tabular}
% \end{table}

\begin{table}[hbt!]
\caption{Single-headed split-CIFAR10, full size Resnet-18
 and mem. size=500(50 / cls.). Accuracy and forgetting results.}
\centering
\begin{tabular}{@{}cc@{}}
\toprule
\textbf{Method}    & \textbf{accuracy} \\ \midrule
iid-offline        & 93.17             \\
iid-online         & 36.65             \\
Fine-tune          & 12.68             \\
EWC                & 53.49 (0.72)       \\
A-GEM              & 54.28 (3.48)       \\
GSS                & 33.56             \\
Reservoir Sampling & 37.09             \\
CN-DPM             & 41.78             \\ \midrule
NCCL + Ring        & 54.63 (0.76)       \\
NCCL + Reservoir   & 55.43 (0.32)       \\ \bottomrule
\end{tabular}
\end{table}





\begin{table}[hbt!]
\caption{Single-headed split-CIFAR100, Resnet18 with $n_f=20$. Memory size = 10,000. We conduct the experiment with the same setting of GMED \citep{jin2021gradient}.}
\centering
\begin{tabular}{@{}cc@{}}
\toprule
\textbf{Methods}    & \textbf{accuracy} \\ \midrule
Finetune            & 3.06(0.2)       \\
iid online          & 18.13(0.8)      \\
iid offline         & 42.00(0.9)      \\
A-GEM               & 2.40(0.2)       \\
GSS-Greedy          & 19.53(1.3)      \\
BGD                 & 3.11(0.2)       \\
ER-Reservoir        & 20.11(1.2)      \\
ER-Reservoir + GMED & 20.93(1.6)      \\
MIR                 & 20.02(1.7)      \\
MIR + GMED          & 21.22(1.0)      \\
NCCL-Reservoir      & \textbf{21.95(0.3)}      \\ \bottomrule
\end{tabular}
\end{table}


\clearpage



\section{Theoretical Analysis}\label{sec:appendproof}

In this section, we provide the proofs of the results for nonconvex continual learning.
We first start with the derivation of Equation \ref{eq:changelsmooth} in Assumption \ref{assumption:lsmooth}.



%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% \begin{assumption*}
% \label{assumption:lsmooth}
% $f_i$ is $L$-smooth that there exists a constant $L>0$ such that for any $x,y \in \mathbb{R}^d$,
% \begin{equation}
% \label{eq:lsmooth}
%     \lVert \nabla f_{i}(x) - \nabla f_{i}(y) \rVert \leq L \lVert x - y \rVert
% \end{equation}
% where $\lVert \cdot \rVert$ denotes the Euclidean norm.
% Then the following inequality directly holds that
% \begin{align}
% \label{eq:changelsmooth}
%      -{L \over 2} \lVert x - y \rVert^{2} &\leq
%      f_{i}(x) - f_{i}(y)- \langle \nabla f_{i}(y), x - y \rangle  \leq {L \over 2} \lVert x - y \rVert^2.
% \end{align}
% \end{assumption*}

\subsection{Assumption and Additional Lemma}
\begin{proof}[\textbf{Derivation of Equation \ref{eq:changelsmooth}}] %\quad
Recall that
\begin{equation}
    \left|  f_{i}(x) - f_{i}(y)- \langle \nabla f_{i}(y), x - y \rangle  \right| \leq {L \over 2} \lVert x - y \rVert^2.
\end{equation}
Note that $f_i$ is differentiable and nonconvex. We define a function $g(t)=f_i(y+t(x-y))$ for $t\in [0,1]$ and an objective function $f_i$.
By the fundamental theorem of calculus,
\begin{equation}
    \int_{0}^{1} g'(t)dt = g(1)-g(0) = f_i(x)-f_i(y).
\end{equation}
By the property, we have
\begin{align*}
    &\left|  f_{i}(x) - f_{i}(y)- \langle \nabla f_{i}(y), x - y \rangle  \right| \\
    &= \left| \int_{0}^1  \langle \nabla f_{i}(y+t(x-y)), x-y \rangle dt- \langle \nabla f_{i}(y), x - y \rangle  \right| \\
    &= \left| \int_{0}^1  \langle \nabla f_{i}(y+t(x-y)) - \nabla f_i(y), x-y \rangle dt \right|. 
\end{align*}
Using the Cauchy--Schwarz inequality,
\begin{align*}
   & \left| \int_{0}^1  \langle \nabla f_{i}(y+t(x-y)) - \nabla f_i(y), x-y \rangle dt \right| \\
   &\leq \left| \int_{0}^1  \lVert \nabla f_{i}(y+t(x-y)) - \nabla f_i(y)\rVert \cdot \lVert x-y \rVert dt \right|.
\end{align*}
Since $f_i$ satisfies Equation \ref{eq:lsmooth}, then we have
\begin{align*}
     &\left|  f_{i}(x) - f_{i}(y)- \langle \nabla f_{i}(y), x - y \rangle  \right| \\
     &\leq \left| \int_{0}^1  L \lVert y+t(x-y) - y \rVert \cdot \lVert x-y \rVert dt \right| \\
     & = L \lVert x-y \rVert^2 \left| \int_0^1 t dt \right| \\
     & = {L \over 2} \lVert x-y \rVert^2.
\end{align*}
\end{proof}


% Suppose that an objective function $f_i$ is $L$-smooth over \textbf{dom}$f$.
% Then for any $x, y \in \textbf{dom} f$
% for some constant $L$ Equation \ref{eq:lsmooth}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\begin{lemma}
\label{thm:inner_two_rand_vec}
    Let $p=[p_1, \cdots, p_{D}], \ q=[q_1, \cdots, q_D]$ be two statistically independent random vectors with dimension $D$. Then the expectation of the inner product of two random vectors $\E[\langle p, q \rangle]$ is $\sum_{d=1}^{D} \E[p_d]\E[ q_d]$.
\end{lemma}
\begin{proof}
By the property of expectation,
\begin{align*}
    \E[\langle p, q \rangle] &= \E[\sum_{d=1}^D p_d q_d] \\
    &= \sum_{d=1}^D \E[ p_d q_d] \\
    &= \sum_{d=1}^D \E[ p_d] \E[q_d].
\end{align*}
\end{proof}

\subsection{Proof of Main Results}
We now show the main results of our work.
\begin{proof}[\textbf{Proof of Lemma \ref{lemma:memory}}]
To clarify the issue of $\E_{M_t} \left[ \E_{I_t} \left[e_t | M_t \right] \right]=0$, let us explain the details of constructing replay-memory as follows.
We have considered episodic memory and reservoir sampling in the paper.
We will first show the case of episodic memory by describing the sampling method for replay memory.
We can also derive the case of reservoir sampling by simply applying the result of episodic memory.

\textbf{Episodic memory (ring buffer).} 
We divide the entire dataset of continual learning into the previous task $P$ and the current task $C$ on the time step $t=0$. 
For the previous task $P$, the data stream of $P$ is i.i.d., and its sequence is random on every trial (episode).
The trial (episode) implies that a continual learning agent learns from an online data stream with two consecutive data sequences of $P$ and $C$.
Episodic memory takes the last data points of the given memory size $m$ by the First In First Out (FIFO) rule, and holds the entire data points until learning on $C$ is finished.
Then, we note that $M_t=M_0$ for all $t\geq 0$ and $M_0$ is uniformly sampled from the i.i.d. sequence of $P$.
By the law of total expectation, we derive $\E_{M_0 \subset P} \left[ \E_{I_t} \left[\nabla f_{I_t}(x^t) | M_0 \right] \right]$ for any $x^t, \ \forall t\geq 0$.

\begin{align*}
    \E_{M_0 \subset P} \left[ \E_{I_t} \left[\nabla f_{I_t}(x^t) | M_0 \right] \right] = \E_{M_0 \subset P} \left[ \nabla f_{M_0}(x^t) \right].
\end{align*}
It is known that $M_0$ was uniformly sampled from $P$ on each trial before training on the current task $C$.
Then, we take expectation with respect to every trial that implies the expected value over the memory distribution $M_0$.
We have
\begin{align*}
    \E_{M_0 \subset P} \left[ \nabla f_{M_0}(x^t) \right]=\nabla f(x^t)
\end{align*}
for any $x^t, \ \forall t$. We can consider $\nabla f_{M_t}(x^t)$ as a sample mean of $P$ on every trial for any $x^t, \ \forall t\geq 0$.
Although $x^t$ is constructed iteratively, the expected value of the sample mean for any $x^t$, $\E_{M_0 \subset P} \left[ \nabla f_{M_0}(x^t) \right]$ is also derived as $\nabla f(x^t)$. 

\textbf{Reservoir sampling.}
To clarify the notation for reservoir sampling first, we denote the expectation with respect to the history of replay memory $M_{[0:t]}=(M_0, \cdots, M_t)$ as
$\E_{M_{[0:t]}}$.
This is the revised version of $\E_{M_t}$.
Reservoir sampling is a trickier case than episodic memory, but $\E_{M_{[0:t]}} \left[ \E_{I_t} \left[e_t | M_t \right] \right]=0$ still holds.
Suppose that $M_0$ is full of the data points from $P$ as the episodic memory is sampled and the mini-batch size from $C$ is 1 for simplicity.
The reservoir sampling algorithm drops a data point in $M_{t-1}$ and replaces the dropped data point with a data point in the current mini-batch from $C$ with probability $p=m/n$, where $m$ is the memory size and $n$ is the number of visited data points so far.
The exact pseudo-code for reservoir sampling is described in [1].
The replacement procedure uniformly chooses the data point which will be dropped.
We can also consider the replacement procedure as follows.
The memory $M_t$ for $P$ is reduced in size by 1 from $M_{t-1}$, and the replaced data point $d_C$ from $C$ contributes in terms of $\nabla g_{d_C}(x^t)$ if $d_C$ is sampled from the replay memory.
Let $M_{t-1} = [ d_1, \cdots, d_{|M_{t-1}|} ]$ where $| \cdot |$ denotes the cardinality of the memory.
The sample mean of $M_{t-1}$ is given as
\begin{equation}
    \nabla f_{M_{t-1}} (x^{t-1}) = {1 \over |M_{t-1}|} \sum_{d_i} \nabla f_{d_i} (x^{t-1}).
\end{equation}

By the rule of reservoir sampling, we assume that the replacement procedure reduces the memory from $M_{t-1}$ to $M_t$ with size $|M_{t-1}| -1$ and the set of remaining upcoming data points $C_t\subset C$ from the current data stream for online continual learning is reformulated into $C_{t-1} \cup [d_C]$.
Then, $d_C$ can be resampled from $C_{t-1} \cup [d_C]$ to form part of the mini-batch of reservoir sampling with a different probability.
However, we ignore the probability issue now to focus on the effect of replay-memory on $\nabla f$.
Now, we sample $M_t$ from $M_{t-1}$, then we get the random vector $\nabla f_{M_{t}} (x^t)$ as
\begin{equation}
   \nabla f_{M_{t}} (x^t) =  {1 \over |M_{t}|} \sum_{j=1}^{|M_{t-1}|} W_{ij} \nabla f_{d_j} (x^t),
\end{equation}
where the index $i$ is uniformly sampled from $i \sim [1, \cdots, |M_{t-1}|]$, and $W_{ij}$ is the indicator function that $W_{ij}$ is 0 if $i=j$ else 1.


The above description implies the dropping rule, and $M_t$ can be considered as a uniformly sampled set with size $|M_t|$ from $M_{t-1}$.
There could also be $M_{t} = M_{t-1}$ with probability $1-p=1-m/n$.
Then the expectation of $\nabla f_{M_{t}} (x^t)$ given $M_{t-1}$ is derived as
\begin{align*}
    \E_{M_t}[ \nabla f_{M_{t}} (x^t) | M_{t-1}] &= p\left({1 \over |M_{t-1}|} \sum_{i=1}^{|M_{t-1}|} {1 \over |M_{t}|} \sum_{j=1}^{|M_{t-1}|}  W_{ij} \nabla f_{d_j} (x^t)\right) + (1-p)\left(\nabla f_{M_{t-1}} (x^t)\right) \\
    &= \nabla f_{M_{t-1}} (x^t).
\end{align*}
When we consider the mini-batch sampling, we can formally reformulate the above equation as
\begin{equation}
    \E_{M_t \sim p(M_t|M_{t-1})} \left[ \E_{I_t \subset M_t} \left[\nabla f_{I_t} (x^t) | M_t\right] | M_{t-1} \right]=\nabla f_{M_{t-1}} (x^t).
\end{equation}
Now, we apply the above equation recursively.
Then,
\begin{equation}
    \E_{M_1\sim p(M_1|M_0)}\left[ \cdots \E_{M_t \sim p(M_t|M_{t-1})} \left[ \E_{I_t \subset M_t} \left[\nabla f_{I_t} (x^t) | M_t\right] | M_{t-1} \right]\cdots|M_0 \right]=\nabla f_{M_{0}} (x^t).
\end{equation}
Similar to episodic memory, $M_0$ is uniformly sampled from $P$. Therefore, we conclude that

\begin{equation}
    \E_{M_0, \cdots, M_t}[\nabla f_{M_t} (x^t)]=\nabla f(x^t)
\end{equation}
by taking expectation over the history $M_{[0:t]}=(M_0, M_1, \cdots, M_t)$.

Note that taking expectation iteratively with respect to the history $M_{[0:t]}$ is needed to compute the expected value of gradients for $M_t$.
However, the result $\E_{M_0, \cdots, M_t}[\E_{I_t}[e_t|M_t]]=0$ still holds in terms of expectation.

Furthermore, we also discuss the effect of reservoir sampling on the convergence of $C$.
Unlike the case where we simply update $g(x)$ by stochastic gradient descent on $C$, the datapoints $d\in M \cap C$ have a slightly larger sampling probability than the other datapoints $d_{C-M} \in C - M$. The expectation of gradient norm on the averaged loss $\E \lVert \nabla g (x^t) \rVert^2$ is based on the uniform and equiprobable sampling over $C$, but the nature of reservoir sampling distorts this measure slightly.
In this paper, we focus on the convergence of the previous task $P$ while training on the current task $C$ with several existing memory-based methods.
Therefore, analyzing the convergence of reservoir sampling method will be a future work.


\end{proof}





\begin{proof}[\textbf{Proof of Lemma \ref{lemma:step}}]
We analyze the convergence of nonconvex continual learning with replay memory here.
Recall that the gradient update is the following
\begin{align*}
    x^{t+1} = x^{t} - \alpha_{H_t} \nabla f_{I_t}(x^t) - \beta_{H_t} \nabla g_{J_t}(x^t)
\end{align*}
for all $t \in \{1,2, \cdots, T\}$.
Let $e_t = \nabla f_{I_t}(x^t) - \nabla f(x^t)$.
Since we assume that $f, \ g$ are $L$-smooth,
we have the following inequality by applying Equation \ref{eq:changelsmooth}:
\begin{align}
\label{aeq:lsmoothanal}
    f& (x^{t+1}) \leq f(x^t) + \langle \nabla f(x^t), x^{t+1} - x^t \rangle + {L \over 2} \lVert x^{t+1} - x^t \rVert^2 \nonumber \\
    &= f(x^t) - \langle \nabla f(x^t), \alpha_{H_t} \nabla f_{I_t}(x^t) + \beta_{H_t} \nabla g_{J_t}(x^t) \rangle + {L \over 2} \lVert  \alpha_{H_t} \nabla f_{I_t}(x^t) + \beta_{H_t} \nabla g_{J_t}(x^t) \rVert^2 \nonumber \\
    &= f(x^t) - \alpha_{H_t} \langle \nabla f(x^t), \nabla f_{I_t}(x^t) \rangle - \beta_{H_t} \langle \nabla f(x^t),  \nabla g_{J_t}(x^t) \rangle \nonumber \\
    & \ \ + {L \over 2}  \alpha_{H_t}^2 \lVert \nabla f_{I_t}(x^t)\rVert^2 + {L \over 2} \beta_{H_t}^2 \lVert \nabla g_{J_t}(x^t) \rVert^2 + L \alpha_{H_t}\beta_{H_t} \langle \nabla f_{I_t}(x^t), \nabla g_{J_t} (x^t) \rangle \nonumber \\
    &= f(x^t) - \alpha_{H_t} \langle \nabla f(x^t), \nabla f(x^t) \rangle - \alpha_{H_t} \langle \nabla f(x^t), e_t \rangle - \beta_{H_t} \langle \nabla f_{I_t}(x^t),  \nabla g_{J_t}(x^t) \rangle + \beta_{H_t} \langle \nabla g_{J_t} (x^t), e_t \rangle \nonumber \\
    & \ \ + {L \alpha_{H_t}^2 \over 2}  \lVert \nabla f(x^t)\rVert^2 + L \alpha_{H_t}^2 \langle \nabla f(x^t), e_t \rangle + {L \alpha_{H_t}^2 \over 2} \lVert e_t \rVert^2 + {L\beta_{H_t}^2 \over 2}  \lVert \nabla g_{J_t}(x^t) \rVert^2 + L \alpha_{H_t}\beta_{H_t} \langle \nabla f_{I_t}(x^t), \nabla g_{J_t} (x^t) \rangle \nonumber \\
    &= f(x^t) - \left(\alpha_{H_t} - {L \over 2} \alpha_{H_t}^2 \right) \lVert \nabla f(x^t) \rVert^2  +  {L \over 2} \beta_{H_t}^2 \lVert \nabla g_{J_t}(x^t) \rVert^2  - \beta_{H_t} ( 1 - \alpha_{H_t} L ) \langle \nabla f_{I_t}(x^t),  \nabla g_{J_t}(x^t) \rangle \nonumber \\
    & \ \ + \left( L \alpha_{H_t}^2 - \alpha_{H_t} \right) \langle \nabla f(x^t), e_t \rangle + \beta_{H_t} \langle \nabla g_{J_t} (x^t), e_t \rangle + {L  \over 2} \alpha_{H_t}^2\lVert e_t \rVert^2.
\end{align}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 여기서부
To show the proposed theoretical convergence analysis of nonconvex continual learning,
we define the catastrophic forgetting term $\Gamma_t$ and the overfitting term $B_t$ as follows:
\begin{align*}
    &B_t = (L\alpha_{H_t}^2 - \alpha_{H_t}) \langle \nabla f(x^t), e_t \rangle + \beta_{H_t} \langle \nabla g_{J_t}(x^t),e_t \rangle, \\
    &\Gamma_t = {\beta_{H_t}^2 L \over 2} \lVert \nabla g_{J_t}(x^t) \rVert^2 - \beta_{H_t}(1-\alpha_{H_t}L) \langle \nabla f_{I_t}(x^t), \nabla g_{J_t} (x^t) \rangle.
\end{align*}
Then, we can rewrite Equation \ref{aeq:lsmoothanal} as
\begin{align}
    f& (x^{t+1}) \leq f(x^t) - \left(\alpha_{H_t} - {L \over 2} \alpha_{H_t}^2 \right) \lVert \nabla f(x^t) \rVert^2  +  \Gamma_t +  B_t + {L  \over 2} \alpha_{H_t}^2\lVert e_t \rVert^2.
\end{align}

% \begin{align*}
%   \Tilde{C}_t = {L \over 2} \beta_{H_t}^2 \lVert \nabla g_{J_t}(x^t) \rVert^2 -\beta_{H_t}(1 - \alpha_{H_t} L) \langle \nabla f(x^t),  \nabla g_{J_t}(x^t) \rangle, 
% \end{align*}
%  for $t\geq 1$. 
We first note that $B_t$ is dependent on the error term $e_t$ with the batch $I_t$.
In the continual learning step, a training agent cannot access $\nabla f(x^t)$, so we cannot obtain the exact value of $e_t$.
Furthermore, $\Gamma_t$ is dependent on the gradients $\nabla f_{I_t}(x^t), \nabla g_{J_t}(x^t)$ and the learning rates $\alpha_{H_t}, \beta_{H_t}$.

 
Taking expectations with respect to $I_t$ on both sides given $J_t$, we have
\begin{align*}
    \E_{I_t}\left[f(x^{t+1})\right] &\leq \E_{I_t}\left[  f(x^t) - \left(\alpha_{H_t} - {L \over 2} \alpha_{H_t}^2 \right) \lVert \nabla f(x^t) \rVert^2  +  \Gamma_t +  B_t + {L  \over 2} \alpha_{H_t}^2\lVert e_t \rVert^2 \Big| J_t \right] \\
    &\leq \E_{I_t}\left[  f(x^t) - \left(\alpha_{H_t} - {L \over 2} \alpha_{H_t}^2 \right) \lVert \nabla f(x^t) \rVert^2 + {L  \over 2} \alpha_{H_t}^2\lVert e_t \rVert^2 \right] + \E_{I_t} \left[ \Gamma_t +  B_t  \Big| J_t \right].
\end{align*}

Now, taking expectations over the whole stochasticity we obtain
\begin{align*}
    \E \left[f(x^{t+1})\right] &\leq \E\left[  f(x^t) - \left(\alpha_{H_t} - {L \over 2} \alpha_{H_t}^2 \right) \lVert \nabla f(x^t) \rVert^2  +  \Gamma_t +  B_t + {L  \over 2} \alpha_{H_t}^2\lVert e_t \rVert^2  \right].
\end{align*}
Rearranging the terms and assuming that ${1 \over 1- {L\alpha_{H_t}/ 2} } > 0$, we have

\begin{align*}
    \left(\alpha_{H_t} - {L\over 2} \alpha_{H_t}^2 \right)\E \lVert \nabla f(x^t) \rVert^2 \leq\E \left[ f(x^t) - f(x^{t+1}) + \Gamma_t + B_t + {L  \over 2} \alpha_{H_t}^2\lVert e_t \rVert^2  \right]
    % &\leq f(x^t) - f(x^{t+1}) + C_t + {L\over 2}\alpha_{H_t}^2 \sigma_f^2 + (L\alpha_{H_t}^2 -\alpha_{H_t}) \E[\langle \nabla f(x^t), e_t \rangle] .
\end{align*}
and

\begin{align*}
    \E \lVert \nabla f(x^t) \rVert^2 &\leq\E \left[ {1 \over \alpha_{H_t} (1- {L \over 2} \alpha_{H_t})} \left( f(x^t) - f(x^{t+1}) + \Gamma_t + B_t \right) + {\alpha_{H_t} L  \over 2 (1- {L \over 2} \alpha_{H_t})} \lVert e_t \rVert^2  \right] \\
     &\leq\E \left[ {1 \over \alpha_{H_t} (1- {L \over 2} \alpha_{H_t})} \left( f(x^t) - f(x^{t+1}) + \Gamma_t + B_t \right) + {\alpha_{H_t} L  \over 2 (1- {L \over 2} \alpha_{H_t})} \sigma_f^2  \right].
    % &\leq f(x^t) - f(x^{t+1}) + C_t + {L\over 2}\alpha_{H_t}^2 \sigma_f^2 + (L\alpha_{H_t}^2 -\alpha_{H_t}) \E[\langle \nabla f(x^t), e_t \rangle] .
\end{align*}


%%%여기서부터

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{comment}
\begin{align}
\label{eq:thm1_with_bias}
    \E_{I_t} \lVert \nabla f(x^t) \rVert^2 &\leq {\lambda_{H_t} \over \alpha_{H_t}} \E_{I_t} \left[   f(x^t) - f(x^{t+1}) + \tilde{C}_t + \Lambda_{H_t} \langle \nabla f(x^t), e_t \rangle \ +  L \alpha_{H_t}\beta_{H_t} \langle e_t, \nabla g_{J_t} (x^t) \rangle \right] +  {{L\over 2}\alpha_{H_t} \sigma_f^2 \over {1- {L \over 2}\alpha_{H_t}}} \nonumber \\
   % &\leq {1 \over \alpha_{H_t}(1- {L\over2}\alpha_{H_t})} \left( f(x^t) - f(x^{t+1}) + C_t +\gamma \E[\langle \nabla f(x^t), e_t \rangle] \right) + {{L\over 2}\alpha_{H_t} \sigma_f^2   \over {1- {L \over 2}\alpha_{H_t}}}.
\end{align}

Noting that $\E[ e_t]=0$ under Assumption \ref{thm:unbiased} and Lemma \ref{thm:inner_two_rand_vec}, we have
\begin{align}
\label{eq:thm1_with_bias}
    \E_{I_t} \lVert \nabla f(x^t) \rVert^2 \leq {\lambda_{H_t} \over \alpha_{H_t}} \E_{I_t} \left[ f(x^t) - f(x^{t+1}) + \tilde{C}_t \right] + {{L\over 2}\alpha_{H_t} \sigma_f^2 \over {1- {L \over 2}\alpha_{H_t}}}.
\end{align}
Finally, we define 
\begin{equation}
    C_t = \E_{J_t}[\tilde{C}_t]
\end{equation} 
and take expectations over $J_t$,
\begin{align}
% \label{eq:thm1_with_bias}
    \E \lVert \nabla f(x^t) \rVert^2 \leq {\lambda_{H_t} \over \alpha_{H_t}} \left(\E \left[ f(x^t) - f(x^{t+1})\right] + C_t \right)+ {{L\over 2}\alpha_{H_t} \sigma_f^2 \over {1- {L \over 2}\alpha_{H_t}}}.
\end{align}
\end{comment}




\end{proof}

% Furthermore, the batch size $b$
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\medskip

\begin{proof}[\textbf{Proof of Theorem \ref{thm:min}}]
Suppose that the learning rate $\alpha_{H_t}$ is a constant $\alpha= c / \sqrt{T}$ for some $c>0$, and that $1-{L\over 2} \alpha = {1\over A} >0$. Then, by summing Equation \ref{eq:thm1} from $t=0$ to $T-1$, we have

\begin{align}
\label{aeq:thm1_result}
    \underset{t}{\min} \ \E \lVert \nabla f(x^t) \rVert^2 &\leq {1 \over T} \sum_{t=0}^{T-1} \E \lVert \nabla f(x^t) \rVert^2 \nonumber \\
    &\leq {1 \over 1 - {L \over 2} \alpha} \left( {1 \over \alpha T} \left( f(x^0)-f(x^{T}) + \sum_{t=0}^{T-1} \left( \E\left[ B_t + \Gamma_t \right]\right) \right)  + {L \over 2} \alpha \sigma_f^2 \right)\nonumber \\
    &\leq {1 \over 1 - {L \over 2} \alpha} \left( {1 \over c \sqrt{T}} \left( \Delta_f + \sum_{t=0}^{T-1} \left(\E\left[ B_t + \Gamma_t \right] \right) \right)  + {Lc \over 2 \sqrt{T}} \sigma_f^2 \right)\nonumber \\
    &= {A \over  \sqrt{T}} \left( {1 \over c} \left( \Delta_f + \sum_{t=0}^{T-1} \E\left[ B_t + \Gamma_t \right] \right) + {Lc \over 2} \sigma_f^2 \right).
\end{align}

We note that a batch $I_t$ is sampled from a memory $M_t \subset M$ which is a random vector whose element is a datapoint $d \in P \cup C$.
Then, taking expectation over $I_t \subset M_t \subset P \cup C$ implies that $\E[B_t]=0$.
Therefore, we get the minimum of expected square of the norm of gradients
\begin{align*}
    \underset{t}{\min} \ \E \lVert \nabla f(x^t) \rVert^2 \leq {A \over  \sqrt{T}} \left( {1 \over c} \left( \Delta_f + \sum_{t=0}^{T-1} \E[\Gamma_t] \right)  + {Lc \over 2} \sigma_f^2 \right).
\end{align*}

\end{proof}


% we present the convergence rate for $g(x)$.


% \begin{lemma}
% \label{lemma:g}
% Suppose that $I_t \cap J_t = \emptyset$,
% % and the datapoints $d\in M \cap P$ use the same objective function $g_d=f_d$. 
% Taking expectation over $I_t \subset M_t$ and $J_t \subset C$, we have 
% \begin{equation}
%      \underset{t}{\min}\ \mathbb{E}  \lVert \nabla g (x^t) \rVert^2  \leq \sqrt{ {2 \Delta_g L \over T} }\sigma_g,
% \end{equation}
% where $\Delta_g$ and $\sigma_g$ is the version of loss gap and the variance for $g$ on $M \cup C$, respectively.
% In fact, it should be noted that the convergence rate of $g$ is on $M\cup C$, so that it also converges to $C$ trivially.
% \end{lemma}

% \begin{replemma}{lemma:g}
% Suppose that $I_t \cap J_t = \emptyset$,
% % and the datapoints $d\in M \cap P$ use the same objective function $g_d=f_d$. 
% Taking expectation over $I_t \subset M_t$ and $J_t \subset C$, we have 
% \begin{equation}
%      \underset{t}{\min}\ \mathbb{E}  \lVert \nabla h|_{M\cup C} (x^t) \rVert^2  \leq \sqrt{ {2 \Delta_{h|_{M\cup C}} L \over T} }\sigma_{h|_{M \cup C}},
% \end{equation}
% where $\Delta_{h|_{M \cup C}}$ and $\sigma_{h|_{M \cup C}}$ is the version of loss gap and the variance for $h$ on $M \cup C$, respectively.
% % In fact, it should be noted that the convergence rate of $g$ is on $M\cup C$, so that it also converges to $C$ trivially.
% \end{replemma} 
 

\begin{proof}[\textbf{Proof of Lemma \ref{lemma:g}}]
To simplify the proof, we assume that learning rates $\alpha_{H_t}, \beta_{H_t}$ are the same fixed value $\beta= c' / \sqrt{T}$.
The assumption is reasonable, because it is observed that the RHS of Equation \ref{eq:thm1} is not perturbed drastically by small learning rates in $0< \alpha_{H_t}, \beta_{H_t} \leq 2 / L \ll 1$.
% This assumption is actually the case of ER-Reservoir, which shows the remarkable performance.
% , which leads to the following.
Let us denote the union of $M_t$ over time $0\leq t \leq T-1$ as $M= \bigcup_{t} M_t$.
By the assumption, it is equivalent to update on $M \cup C$.
Then, the non-convex finite sum optimization is given as
\begin{equation}
        \underset{x \in \mathbb{R}^d}{\min}\ h|_{M \cup C}(x)= {1 \over n_{g}+ |M|} \sum_{i\in M \cup C} h_i (x),
\end{equation}
where
% $g_i$ is the same function as $f_i$, and \
$|M|$ is the number of elements in $M$.
This problem can be solved by a simple SGD algorithm \citep{DBLP:conf/icml/ReddiHSPS16}.
Thus, we have
\begin{equation}
\label{aeq:g_conv}
    \underset{t}{\min}\ \mathbb{E}  \lVert \nabla h|_{M \cup C} (x^t) \rVert^2  \leq {1\over T} \sum_{t=0}^{T-1} \mathbb{E}  \lVert \nabla h|_{M \cup C} (x^t) \rVert^2 \leq \sqrt{ {2 \Delta_{h|_{M \cup C}} L \over T} }\sigma_{h|_{M \cup C}}.
\end{equation}
\end{proof}

\begin{lemma}
\label{lemma:supsigma}
For any $C \subset D \subset M\cup C$, define $\omega^2_{h|_D}$ as
\begin{align*}
    {\omega}^2_{h|_{D}}=\underset{x}{\sup} \ \E_{j\in D} \lVert \nabla h_j(x^t) - \nabla h|_{M \cup C}(x^t) \rVert^2.
\end{align*}
Then, we have
\begin{align}
     \E \lVert \nabla g_{J_t}(x^t) \rVert^2 \leq \E \lVert \nabla h|_{M \cup C}(x^t) \rVert^2+  \underset{C \subset D \subset M\cup C}\sup {\omega}^2_{h|_{D}}.
\end{align}

\end{lemma}


\begin{proof}[\textbf{Proof of Lemma \ref{lemma:supsigma}}]
We arrive at the following result by Jensen's inequality:
% the supremum of the variance of the mini-batch gradient  $\nabla g_{J_t} (x^t)$ is derived as
% \begin{align}
% \label{eq:samplevariance}
%     \underset{x}{\sup} \E \lVert \nabla g_{J_t}(x^t) - \nabla h_{M \cup C}(x^t) \rVert^2 &= \underset{x}{\sup}{[n_g+M] - b_g \over ([n_g+M]-1) b_g}\cdot {1 \over [n_g+M]} \sum_{j=1}^{[n_g+M]} \lVert \nabla g_j(x^t) - \nabla h_{M \cup C}(x^t) \rVert^2 \nonumber \\
%     &= {[n_g+M] - b_g \over ([n_g+M]-1) b_g} \sigma_{h_{M \cup C}}^2,
% \end{align}

% \begin{align}
% \label{eq:samplevariance}
%     \underset{x}{\sup} \E_{J_t \sim C} \lVert \nabla g_{J_t}(x^t) - \nabla h|_{M \cup C}(x^t) \rVert^2 
%     &\leq \underset{x}{\sup}  \E_{J_t \sim C} \left[  \lVert \nabla g_{J_t}(x^t) - \nabla h_{M \cup C}(x^t) \rVert^2 \right] \\
%     &=\underset{x}{\sup}  \E_{J_t \sim C} \left[  \lVert  \E_{j\in J_t} [\nabla h_j(x^t)] - \nabla h_{M \cup C}(x^t) \rVert^2 \right] \\
%     &\leq \underset{C \subset D \subset M\cup C}\sup \E_{J_t \sim D} \left[ \underset{x}{\sup}  \lVert  \E_{j\in J_t} [\nabla h_j(x^t)] - \nabla h_{M \cup C}(x^t) \rVert^2 \right] \\
%     % & = \E \left[ \underset{x}{\sup}  \lVert \nabla \E_{j\in J_t} [h_j(x^t)] - \nabla h_{M \cup C}(x^t) \rVert^2 \right] \\
%     & \leq \E \left[    \E_{j\in J_t} [\underset{x}{\sup}\ \lVert \nabla h_j(x^t) - \nabla h_{M \cup C}(x^t) \rVert^2] \right] \\
%     % & \leq \E \left[    \E_{j\in J_t} [\lVert \underset{x}{\sup}\ \nabla h_j(x^t) - \nabla h_{M \cup C}(x^t) \rVert^2] \right] \\
%     & = \E_{j\in J_t} \left[  \E [\underset{x}{\sup}\ \lVert \nabla h_j(x^t) - \nabla h_{M \cup C}(x^t) \rVert^2] \right] \\
%     & = \sigma^2_{h_{M \cup C}},
% \end{align}



\begin{align}
\label{eq:samplevariance}
    \underset{x}{\sup} \E_{J_t \subset C} \lVert \nabla g_{J_t}(x^t) - \nabla h|_{M \cup C}(x^t) \rVert^2 
    % &\leq \underset{x}{\sup}  \E_{J_t \subset C} \left[  \lVert \nabla g_{J_t}(x^t) - \nabla h_{M \cup C}(x^t) \rVert^2 \right] \\
    &=\underset{x}{\sup}  \E_{J_t \subset C} \left[  \lVert  \E_{j\in J_t} [\nabla h_j(x^t)] - \nabla h|_{M \cup C}(x^t) \rVert^2 \right] \\
    &\leq \underset{C \subset D \subset M\cup C}\sup \underset{x}{\sup}  \E_{J_t \subset D} \left[  \lVert  \E_{j\in J_t} [\nabla h_j(x^t)] - \nabla h|_{M \cup C}(x^t) \rVert^2 \right] \\
    % & = \E \left[ \underset{x}{\sup}  \lVert \nabla \E_{j\in J_t} [h_j(x^t)] - \nabla h_{M \cup C}(x^t) \rVert^2 \right] \\
    % & \leq \underset{C \subset D \subset M\cup C}\sup \E_{J_t \subset D}\left[   \underset{x}{\sup}  \E_{j\in J_t} [\ \lVert \nabla h_j(x^t) - \nabla h|_{M \cup C}(x^t) \rVert^2] \right] \\
    & \leq \underset{C \subset D \subset M\cup C}\sup \left[   \underset{x}{\sup}  \E_{j\in D} [\ \lVert \nabla h_j(x^t) - \nabla h|_{M \cup C}(x^t) \rVert^2] \right] \\
    % & \leq \E \left[    \E_{j\in J_t} [\lVert \underset{x}{\sup}\ \nabla h_j(x^t) - \nabla h_{M \cup C}(x^t) \rVert^2] \right] \\
    % & = \E_{j\in J_t} \left[  \E [\underset{x}{\sup}\ \lVert \nabla h_j(x^t) - \nabla h_{M \cup C}(x^t) \rVert^2] \right] \
    & = \underset{C \subset D \subset M\cup C}\sup {\omega}^2_{h|_{D}}.
\end{align}




% \begin{align}
% \label{eq:samplevariance}
%     \underset{x}{\sup} \E \lVert \nabla g_{J_t}(x^t) - \nabla g(x^t) \rVert^2 &= \underset{x}{\sup}{n_g - b_g \over (n_g-1) b_g}\cdot {1 \over n_g} \sum_{j=1}^{n_g} \lVert \nabla g_j(x^t) - \nabla g(x^t) \rVert^2 \nonumber \\
%     &= {n_g - b_g \over (n_g-1) b_g} \sigma_g^2,
% \end{align}
% The detailed derivation is shown in technical lemma A.1 in \citep{lei2017non}.
% where $n_g$ and $b_g$ denotes the size of $C$ and minibatch $J_t$, respectively.
By the triangle inequality, we get

\begin{align}
     \E \lVert \nabla g_{J_t}(x^t) \rVert^2 &\leq  \E \lVert \nabla g_{J_t}(x^t) - \nabla h|_{M\cup C}(x^t) \rVert^2 + \E \lVert 
     \nabla h|_{M\cup C}(x^t) \rVert^2\\
    &\leq \E \lVert \nabla h|_{M \cup C}(x^t) \rVert^2+  \underset{C \subset D \subset M\cup C}\sup {\omega}^2_{h|_{D}}.
\end{align}

\end{proof}



%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% \begin{lemma}
% \label{thm:exp_catastrophic}
%     Let an upper bound $\beta > \beta_{H_t} >0$.
%     % The upper bound of $\Gamma_t$ 
%     For the worst case, the expectation of summing the catastrophic forgetting term over iterations $T$ is 
%     \begin{equation*}
%         \sum_{t=0}^{T-1} \Gamma_t = O(T).
%     \end{equation*}

%     % For $\delta \leq {1\over \sqrt{T}}$, we have $O(1)$.
% \end{lemma}

For continual learning, the model $x^0$ reaches an $\epsilon$-stationary point of $f(x)$ when we have finished learning $P$ and start to learn $C$.
% Now, we have $\lVert \nabla f(x) \rVert = \epsilon \ll 1$
Now, we discuss the frequency of transfer and interference during continual learning before showing Lemma \ref{thm:exp_catastrophic}.
It is well known that interference and transfer occur with similar frequencies (the frequency of constraint violation is approximately 0.5 for AGEM), as shown in Appendix D of \citet{DBLP:conf/iclr/ChaudhryRRE19}.
Even if memory-based continual learning has a small memory buffer which contains a subset of $P$, random sampling from the buffer yields similar frequencies of interference and transfer.

In this paper, we consider two cases for the upper bound of $\E [\Gamma_t]$, the moderate case and the worst case. For \textbf{the moderate case}, which covers most continual learning scenarios, we assume that the inner product term $\langle \nabla f_{I_t}(x^t), \nabla g_{J_t} (x^t) \rangle$ has the same probabilities of being positive (transfer) and negative (interference).
Then, we can approximate $\E [ \langle \nabla f_{I_t}(x^t), \nabla g_{J_t} (x^t) \rangle] \approx 0$ over all randomness.
For \textbf{the worst case}, we assume that all $\langle \nabla f_{I_t}(x^t), \nabla g_{J_t} (x^t) \rangle$ have negative values.


\begin{proof}[\textbf{Proof of Lemma \ref{thm:exp_catastrophic}}]
For the moderate case, we derive the rough upper bound of $\E [\Gamma_t]$:
\begin{align}
    \E \left[\Gamma_t \right] &= \E \left[ {\beta_{H_t}^2 L \over 2} \lVert \nabla g_{J_t}(x^t) \rVert^2 - \beta_{H_t}(1-\alpha_{H_t}L) \langle \nabla f_{I_t}(x^t), \nabla g_{J_t} (x^t) \rangle\right] \\
    &\approx \E \left[ {\beta_{H_t}^2 L \over 2} \lVert \nabla g_{J_t}(x^t) \rVert^2\right] \\
    &= O \left( \E \left[ {\beta^2 L \over 2} \lVert \nabla g_{J_t}(x^t) \rVert^2 \right] \right)
\end{align}
% where $\lVert \nabla g_{J_t}(x^t) \rVert \geq \lVert \nabla f_{I_t}(x^t) \rVert$.


% {n_g - b_g \over (n_g-1) b_g}
By plugging Lemma \ref{lemma:supsigma} into $\E[\Gamma_t]$, we obtain that
\begin{align}
    \E[\Gamma_t] &\leq O \left( \E \left[ {\beta^2 L \over 2} \lVert \nabla g_{J_t}(x^t) \rVert^2 \right] \right) \\
    &= O \left( \E \left[ {\beta^2 L \over 2} \lVert \nabla h|_{M\cup C}(x^t) \rVert^2  + {\beta^2 L \over 2} \underset{C \subset D \subset M\cup C}\sup {\omega}^2_{h|_{D}}\right]\right).
    % &= O \left( E \left[ {\beta^2 L \over 2} \lVert \nabla g(x^t) \rVert^2 \right] + {\beta^2 L(n_g-b_g) \over 2(n_g -1)b_g}\sigma_g^2 \right).
\end{align}


% The sum of catastrophic forgetting term $\sum \Gamma_t$ is corrected as $\sum E[\Gamma_t]$.
We use the summation technique from the proof of Theorem~\ref{thm:min},
then the cumulative sum of catastrophic forgetting term is derived as
\begin{align}
    \sum_{t=0}^{T-1} \E[\Gamma_t] &\leq  \sum_{t=0}^{T-1} {\beta^2 L \over 2}O \left( \E \left[  \lVert \nabla h|_{M\cup C}(x^t) \rVert^2 \right] +\underset{C \subset D \subset M\cup C}\sup {\omega}^2_{h|_{D}}  \right) \\
    &\leq  {\beta^2 L \over 2} \sum_{t=0}^{T-1} O \left( {1\over \beta} \left[ h|_{M\cup C}(x^t) - h|_{M\cup C}(x^{t+1}) \right] + {L\beta \over 2} \sigma_{h|_{M\cup C}}^2 +\underset{C \subset D \subset M\cup C}\sup {\omega}^2_{h|_{D}}   \right) \\
    & \leq{\beta^2 L \over 2}  O\left({1 \over \beta}\Delta_{h|_{M\cup C}} + {TL\beta \over 2} \sigma_{h|_{M\cup C}}^2 + {T\underset{C \subset D \subset M\cup C}\sup {\omega}^2_{h|_{D}}} \right) \\
    &= O\left( \beta \Delta_{h|_{M\cup C}} +   {TL \beta^3 \over 2}\sigma_{h|_{M\cup C}}^2 +T\beta^2\underset{C \subset D \subset M\cup C}\sup {\omega}^2_{h|_{D}} \right).
\end{align}
Now, we consider the randomness of memory choice.
Let $D^*$ be as follows:
\begin{align}
    D^* =   \underset{C \subset D \subset P\cup C}{\arg\max} \beta \Delta_{h|_{D}} +   {TL \beta^3 \over 2}\sigma_{h|_{D}}^2.
\end{align}
Then, we obtain the following inequality,
\begin{align}
    \sum_{t=0}^{T-1} \E[\Gamma_t] &\leq O\left( \beta \Delta_{h|_{D^*}} +   {TL \beta^3 \over 2}\sigma_{h|_{D^*}}^2 +T\beta^2\underset{C \subset D \subset M\cup C}\sup {\omega}^2_{h|_{D}} \right)\\
    &\leq O\left( \beta \Delta_{h|_{D^*}} +   {TL \beta^3 \over 2}\sigma_{h|_{D^*}}^2 +T\beta^2\underset{C \subset D \subset P\cup C}\sup {\omega}^2_{h|_{D}} \right).
\end{align}


% Consider telescoping over time and the randomness of memory choice. Then,
% \begin{align}
%       \E[\Gamma_t] &= O \left( \E \left[ {\beta^2 L \over 2} \lVert \nabla h|_{M\cup C}(x^t) \rVert^2  + {\beta^2 L \over 2} \underset{C \subset D \subset M\cup C}\sup {\omega}^2_{h|_{D}}\right]\right) \\
%       &\leq O \left( \underset{M\subset P\cup C}\sup\E \left[ {\beta^2 L \over 2} \lVert \nabla h|_{M\cup C}(x^t) \rVert^2\right]  + {\beta^2 L \over 2} \underset{C \subset D \subset P\cup C}\sup {\omega}^2_{h|_{D}}\right) \\
%       &\leq{\beta^2 L \over 2} O \left(  \underset{C \subset D \subset P\cup C}\sup  \E \left[  \lVert \nabla h|_{D}(x^t) \rVert^2\right]  + \underset{C \subset D \subset P\cup C}\sup {\omega}^2_{h|_{D}}\right).
% \end{align}

% \begin{align}
%     \sum_{t=0}^{T-1} E[\Gamma_t] &\leq  \sum_{t=0}^{T-1} {\beta^2 L \over 2}O \left( E \left[  \lVert \nabla g(x^t) \rVert^2 \right] + {(n_g-b_g) \over (n_g -1)b_g}\sigma_g^2 \right) \\
%     &\leq  {\beta^2 L \over 2} \sum_{t=0}^{T-1} O \left( {1\over \beta} \left[ g(x^t) - g(x^{t+1}) \right] + {L\beta \over 2} \sigma_g^2 + {(n_g-b_g) \over (n_g -1)b_g}\sigma_g^2 \right) \\
%     & \leq{\beta^2 L \over 2}  O\left({1 \over \beta}\Delta_g + {TL\beta \over 2} \sigma_g^2 + {T(n_g-b_g) \over (n_g -1)b_g}\sigma_g^2 \right) \\
%     &= O\left( \beta \Delta_g  + \sigma_g^2 \left( {L \beta^3 \over 2} + { (n_g - b_g) \beta^2 \over (n_g-1)b_g} \right)T \right).
% \end{align}

% Rearranging the above equation, we get
% \begin{align}
%     \sum_{t=0}^{T-1} \E[\Gamma_t] \leq  O\left( \sigma_g^2 \left( {L \beta^3 \over 2} + { (n_g - b_g)\beta^2 \over (n_g-1)b_g} \right)T + \beta \Delta_g \right).
% \end{align}

Rearranging the above equation, we get
\begin{align}
    \sum_{t=0}^{T-1} \E[\Gamma_t] \leq O\left( T \left( {L \beta^3 \over 2}\sigma_{h|_{D^*}}^2 +\beta^2\underset{C \subset D \subset P\cup C}\sup {\omega}^2_{h|_{D}}\right) +  \beta \Delta_{h|_{D^*}} \right).
\end{align}

% Therefore, we can write $ \sum_{t=0}^{T-1} \E[\Gamma_t]=O(T)$. We note that the rough upper bound of $\sum \E[\Gamma_t]$ increases monotonically with training step as in the previous result in the paper.
\textbf{For the moderate case}, we provide the derivations of the convergence rate for two cases of $\beta$ as follows.

When $\beta < \alpha=c/\sqrt{T}$, the upper bound always satisfies
\begin{align*}
     \sum_{t=0}^{T-1} {\E[\Gamma_t] \over \sqrt{T}} &\leq {1 \over \sqrt{T}}O\left(  {1 \over T} \left( {L \beta \over 2}\sigma_{h|_{D^*}}^2 +{1\over \sqrt{T}}\underset{C \subset D \subset P\cup C}\sup {\omega}^2_{h|_{D}}\right) + {1 \over \sqrt{T}} \Delta_{h|_{D^*}}\right) < O\left( {1 \over T^{3/2}} + {1 \over T} \right).
\end{align*}

For $\beta \geq \alpha=c/\sqrt{T}$, we cannot derive a tighter bound, so we still have
\begin{align*}
     \sum_{t=0}^{T-1} {\E[\Gamma_t] \over \sqrt{T}} &\leq {1 \over \sqrt{T}}O\left(  T \left( {L \beta^3 \over 2}\sigma_{h|_{D^*}}^2 +\beta^2\underset{C \subset D \subset P\cup C}\sup {\omega}^2_{h|_{D}}\right) +  \beta \Delta_{h|_{D^*}} \right) = O\left(\sqrt{T} + {1 \over \sqrt{T}} \right).
\end{align*}
% This result is obtained by dividing $\sum E[\Gamma_t]$ by $\sqrt{T}$ as in the proof of Thm. 1.
% \textb{On the other hand, $\E[\Gamma_t]$ can be negative when $\langle \nabla f_{I_t}(x^t), \nabla g_{J_t} (x^t) \rangle >0$.
% It implies that the cumulative sum of $\E[\Gamma_t]$ does not increase monotonically.}
% \textr{By Lemma \ref{lemma:g}}
% Therefore, for some large number $N<O(T)$, we can denote the cumulative sum of $\E[\Gamma_t]$ over the finite steps $T$ as
% \begin{equation}
% \label{aeq:o1}
%     \sum_{t=0}^{T-1} {\E[\Gamma_t] \over \sqrt{T}} \leq {N\over \sqrt{T}} = O({1\over \sqrt{T}}).
% \end{equation}

\textbf{For the worst case}, we assume that there exists a constant $c_{f,g}$ which satisfies $c_{f,g} \lVert \nabla g_{J_t}(x^t) \rVert \geq  \lVert \nabla f_{I_t}(x^t) \rVert$.
\begin{align}
    \E \left[\Gamma_t \right] &= \E \left[ {\beta_{H_t}^2 L \over 2} \lVert \nabla g_{J_t}(x^t) \rVert^2 - \beta_{H_t}(1-\alpha_{H_t}L) \langle \nabla f_{I_t}(x^t), \nabla g_{J_t} (x^t) \rangle\right] \\
    &\leq \E \left[ {\beta_{H_t}^2 L \over 2} \lVert \nabla g_{J_t}(x^t) \rVert^2 + \beta_{H_t}(1-\alpha_{H_t}L) \lVert \nabla f_{I_t}(x^t)\rVert \lVert \nabla g_{J_t} (x^t) \rVert\right] \\
    &\leq \E \left[ {\beta^2 L \over 2} \lVert \nabla g_{J_t}(x^t) \rVert^2 + \beta c_{f,g}\lVert\nabla g_{J_t} (x^t) \rVert^2 \right] \\
    &= O \left( \E \left[ \left(\beta^2 + \beta\right) \lVert \nabla g_{J_t}(x^t) \rVert^2 \right] \right).
\end{align}
% where $\lVert \nabla g_{J_t}(x^t) \rVert \geq \lVert \nabla f_{I_t}(x^t) \rVert$.


% {n_g - b_g \over (n_g-1) b_g}
By plugging Lemma \ref{lemma:supsigma} into $\E[\Gamma_t]$, we obtain that
\begin{align}
    \E[\Gamma_t] &\leq O \left( \E \left[ \left(\beta^2 + \beta\right)  \lVert \nabla g_{J_t}(x^t) \rVert^2 \right] \right) \\
    &= O \left( \left(\beta^2 + \beta\right) \E \left[  \lVert \nabla h|_{M\cup C}(x^t) \rVert^2  +  \underset{C \subset D \subset M\cup C}\sup {\omega}^2_{h|_{D}}\right]\right).
    % &= O \left( E \left[ {\beta^2 L \over 2} \lVert \nabla g(x^t) \rVert^2 \right] + {\beta^2 L(n_g-b_g) \over 2(n_g -1)b_g}\sigma_g^2 \right).
\end{align}


% The sum of catastrophic forgetting term $\sum \Gamma_t$ is corrected as $\sum E[\Gamma_t]$.
Using the summation technique from the proof of Theorem 1,
the cumulative sum of the catastrophic forgetting term is bounded as
\begin{align}
    \sum_{t=0}^{T-1} \E[\Gamma_t] &\leq  \sum_{t=0}^{T-1} \left(\beta^2 + \beta\right) O \left( \E \left[  \lVert \nabla h|_{M\cup C}(x^t) \rVert^2 \right] +\underset{C \subset D \subset M\cup C}\sup {\omega}^2_{h|_{D}}  \right) \\
    &\leq \left(\beta^2 + \beta\right) \sum_{t=0}^{T-1} O \left( {1\over \beta} \left[ h|_{M\cup C}(x^t) - h|_{M\cup C}(x^{t+1}) \right] + {L\beta \over 2} \sigma_{h|_{M\cup C}}^2 +\underset{C \subset D \subset M\cup C}\sup {\omega}^2_{h|_{D}}   \right) \\
    & \leq\left(\beta^2 + \beta\right)  O\left({1 \over \beta}\Delta_{h|_{M\cup C}} + {TL\beta \over 2} \sigma_{h|_{M\cup C}}^2 + {T\underset{C \subset D \subset M\cup C}\sup {\omega}^2_{h|_{D}}} \right) \\
    &= O\left( (\beta+1) \Delta_{h|_{M\cup C}} +   {TL \beta^2(\beta+1) \over 2}\sigma_{h|_{M\cup C}}^2 +T\beta(\beta+1)\underset{C \subset D \subset M\cup C}\sup {\omega}^2_{h|_{D}} \right).
\end{align}

For the worst case, we provide the derivations of the convergence rate for two cases of $\beta$ as follows.

When $\beta < \alpha=c/\sqrt{T}$, the upper bound always satisfies
\begin{align*}
     \sum_{t=0}^{T-1} {\E[\Gamma_t] \over \sqrt{T}} &\leq {1 \over \sqrt{T}}O\left(   {L c + \sqrt{T} \over \sqrt{T}}\sigma_{h|_{D^*}}^2 +(\sqrt{T} + c)\underset{C \subset D \subset P\cup C}\sup {\omega}^2_{h|_{D}} + {\sqrt{T} + c \over \sqrt{T}} \Delta_{h|_{D^*}}\right) < O\left( {1 \over T} + {1 \over \sqrt{T}} + 1 \right).
\end{align*}

For $\beta \geq \alpha=c/\sqrt{T}$, we cannot derive a tighter bound, so we still have
\begin{align*}
     \sum_{t=0}^{T-1} {\E[\Gamma_t] \over \sqrt{T}} &\leq {1 \over \sqrt{T}}O\left(  T \left( {L \beta^2(\beta+1) \over 2}\sigma_{h|_{D^*}}^2 +\beta(\beta+1)\underset{C \subset D \subset P\cup C}\sup {\omega}^2_{h|_{D}}\right) +  (\beta+1) \Delta_{h|_{D^*}} \right) = O\left(\sqrt{T} + {1 \over \sqrt{T}} \right).
\end{align*}

\end{proof}






%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{comment}
\begin{lemma}
\label{thm:sum_catastrophic}
    Let a constant $\delta >0$ and an upper bound $\beta > \beta_{H_t} >0$. The sum of the catastrophic forgetting term over iterations $T$ $\sum_{t=0}^{T-1} C_t$ is $O(\delta \sqrt{T})$.
    For $\delta \leq {1\over \sqrt{T}}$, we have $O(1)$.
\end{lemma}


\textbf{Proof} \quad
The upper bound of the catastrophic forgetting term is 
\begin{align*}
    C_t &= \mathbb{E} \left[{\beta_{H_t}^2 L \over 2} \lVert \nabla g_{J_t}(x^t) \rVert^2 - \beta_{H_t} \langle \nabla f(x^t), \nabla g_{J_t} (x^t) \rangle\right] \\
    &\leq \E \left[ {\beta_{H_t}^2 L \over 2} \lVert \nabla g_{J_t}(x^t) \rVert^2 + \beta_{H_t} \lVert\nabla f(x^t) \rVert \lVert \nabla g_{J_t} (x^t) \rVert \right] \\
    &=O\left( \E \left[ \lVert \nabla g_{J_t}(x^t) \rVert^2 \right] \right).
\end{align*}
Since
\begin{align*}
    \lVert \nabla g_{J_t}(x^t) \rVert^2 &\leq \lVert \nabla g(x^t) \rVert^2 + \lVert \nabla g_{J_t}(x^t) - g(x^t) \rVert^2 \\
    &\leq \lVert \nabla g(x^t) \rVert^2 + {\sigma_g^2 \over b_g}
\end{align*}
%and we assume that $\lVert \nabla g_{J_t}(x^t) - g(x^t) \rVert^2$ is much smaller than $\lVert \nabla g(x^t) \rVert^2$
where $\sigma_g$ is analogous to Equation \ref{eq:sigma} and $b_g$ is the mini-batch size of $g$. Then we have
\begin{align*}
    C_t &=  O \left( \E \lVert \nabla g(x^t) \rVert^2 \right) \\ 
    &= O\left( { \beta^2 \delta \over \sqrt{T}} \right)
\end{align*}
where $t\in [T]$ and for some $\delta > 0$. Summing over time $t$, we have
\begin{align*}
    C = \sum_{t=0}^{T-1} C_t = T \cdot O\left({\beta^2 \delta \over \sqrt{T}} \right) = O\left(\beta^2 \delta \sqrt{T} \right).
\end{align*}
Therefore, we obtain $O(1)$ when $\beta^2 \delta \sqrt{T} \leq 1$.
\end{comment}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%



Even if we consider the worst case, the normalized cumulative forgetting $\sum_{t=0}^{T-1} \E[\Gamma_t]/\sqrt{T}$ is still $O(1)$ when $\beta < \alpha$.
This implies that we have a theoretical condition for controlling the forgetting on $f(x)$ while evolving on $C$.
In the main text, we only discuss the moderate case to emphasize that $f(x)$ can converge through the effect of transfer during continual learning; however, the worst case is also handled well by our theoretical condition, which preserves the convergence of $f(x)$ over time, as follows.


\begin{proof}[\textbf{Proof of Corollary \ref{coro:smallbeta}}]

By Lemma \ref{thm:exp_catastrophic}, we have 
\begin{equation*}
    \sum_{t=0}^{T-1} {\E[\Gamma_t] \over \sqrt{T}} < O\left( {1 \over T^{3/2}} + {1 \over T} \right)
\end{equation*}
for $\beta < \alpha$ for \textbf{the moderate case}.
Then, we can apply this result to the right-hand side of the inequality in Theorem \ref{thm:min} as follows.
\begin{align*}
        \underset{t}{\min}\ \mathbb{E}  \lVert \nabla f (x^t) \rVert^2  &\leq {A \over \sqrt{T}} \left({1\over c}\left( \Delta_f +   \sum_{t=0}^{T-1}\E\left[ \Gamma_t \right] \right) +  {Lc \over 2} \sigma_{f}^2 \right) \\
        &= {A/c \over \sqrt{T}} \left( \Delta_f +  {Lc^2 \over 2} \sigma_{f}^2 \right) + {A/c \over \sqrt{T}}  \sum_{t=0}^{T-1} \E[\Gamma_t] \\
        &= O\left( {1 \over T^{3/2}} + {1 \over T} + {1 \over T^{1/2}} \right)=O\left( {1 \over \sqrt{T}} \right).
\end{align*}

In addition, we have the convergence rate of $f(x)$ for \textbf{the worst case} as follows:
\begin{align}
    \underset{t}{\min}\ \mathbb{E}  \lVert \nabla f (x^t) \rVert^2 = O(1),
\end{align}
which implies that $f(x)$ can keep the convergence while evolving on $C$.

\end{proof}



\begin{proof}[\textbf{Proof of Corollary \ref{coro:one}}]
To formulate the IFO calls, recall that $T(\epsilon)$ is defined as
\begin{equation*}
    T(\epsilon) = \min \ \{ T: \ \min  \ \E \lVert \nabla f(x^t) \rVert^2 \leq \epsilon \}.
\end{equation*}
A single IFO call is invested in calculating each step, and we now compute IFO calls to reach an $\epsilon$-accurate solution.
\begin{equation*}
    {A \over \sqrt{T}} \left({1\over c}\left( \Delta_f +   \sum_{t=0}^{T-1}\E\left[ \Gamma_t \right] \right) +  {Lc \over 2} \sigma_{f}^2 \right) \to \epsilon.
\end{equation*}
% As seen in Theorem \ref{thm:min}, NCCL has a convergence rate of
% \begin{equation}
%     O\left({\sum^{T-1}_{t=0} \Gamma_t \over \sqrt{T}} \right).
% \end{equation}
% We note that the convergence rate for the worst case is
% \begin{equation}
%     O\left(\sqrt{T} \right),
% \end{equation}
% where the given model diverges on the convergence of $f(x)$.
% Then, IFO calls are denoted as $\infty$.
When $\beta < \alpha$, we get
\begin{equation*}
    \text{IFO calls} = O\left({1\over \epsilon^2}\right).
\end{equation*}
Otherwise, when $\beta \geq \alpha$, we cannot guarantee that the upper bound on the stationarity measure decreases over time, so we cannot compute the IFO calls in this case.



% For the case of Equation \ref{aeq:o1}, we obtain the convergence rate $O(1/\sqrt{T}).$
% Thus we get $O(1/\epsilon^2)$ in this case.

% $\E \lVert \nabla f(x^t) \rVert^2=O({\sum C_t \over \sqrt{T}})$ by Theorem \ref{thm:min}.
% Then by Lemma \ref{thm:sum_catastrophic}, we have
\begin{comment}
\begin{equation*}
   \underset{t}{\min} \ \E \lVert \nabla f(x^t) \rVert^2 = O\left({\beta^2\delta \sqrt{T} \over \sqrt{T}}\right)=O(\beta^2\delta).
\end{equation*}
It implies that $\underset{t}{\min} \ \E \lVert \nabla f(x^t) \rVert^2$ is not decreasing when $1 \ll \beta^2\delta \sqrt{T}$.
Then, $x^t$ cannot reach to the stationary point.

On the other hand, $f(x)$ can be converged to the stationary point when $\beta^2\delta \leq {1 \over \sqrt{T}}$ such that
\begin{equation}
    \underset{t}{\min} \ \E \lVert \nabla f(x^t) \rVert^2 = O(\beta^2\delta)=O\left({1 \over \sqrt{T}}\right).
\end{equation}
To derive a bound for $T(\epsilon)$, we note that
\begin{align*}
    O \left( { 1 \over \sqrt{T}} \right) \leq \epsilon.
\end{align*}
Then we have
\begin{align*}
    T(\epsilon) = O\left( {1 \over \epsilon^2} \right).
\end{align*}
The IFO call is defined as $\sum_{t=1}^{T( \epsilon)} b_{f,t}$. Therefore, the IFO call is $O(1/\epsilon^2)$.
\end{comment}

\end{proof}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section{Derivation of Equations in Adaptive Methods in Continual Learning}
\label{sec:derivation_algo}

\textbf{Derivation for A-GEM} \quad
Define the surrogate $\nabla \Tilde{g}_{J_t}(x^t)$ as
\begin{align}
    \nabla \Tilde{g}_{J_t}(x^t) = \nabla g_{J_t}(x^t) - \left\langle {\nabla f_{I_t}(x^t) \over \lVert \nabla f_{I_t}(x^t) \rVert}, \nabla g_{J_t} (x^t) \right\rangle {\nabla f_{I_t}(x^t) \over \lVert \nabla f_{I_t}(x^t) \rVert},
\end{align}
where $\alpha_{H_t} = \alpha (1 - {\langle \nabla f_{I_t}(x^t), \nabla g_{J_t} (x^t) \rangle \over \lVert \nabla f_{I_t}(x^t) \rVert^2})$ and $\beta_{H_t}=\alpha$ for Equation \ref{eq:gradupdate}.

Then, we have
\begin{align}
    \E[\Gamma_t] &= \mathbb{E}\left[{\beta_{H_t}^2 L \over 2} \lVert \nabla \Tilde{g}_{J_t}(x^t) \rVert^2 - \beta_{H_t} \langle \nabla f_{I_t}(x^t), \nabla \Tilde{g}_{J_t} (x^t) \rangle \right] \nonumber \\
    &= \mathbb{E} \left[{\beta_{H_t}^2 L \over 2} \left( \lVert \nabla g_{J_t}(x^t) \rVert^2 -2{ \langle \nabla f_{I_t}(x^t), \nabla g_{J_t} (x^t) \rangle^2 \over \lVert \nabla f_{I_t}(x^t) \rVert^2} + { \langle \nabla f_{I_t}(x^t), \nabla g_{J_t} (x^t) \rangle^2 \over \lVert \nabla f_{I_t}(x^t) \rVert^2} \right) - \beta_{H_t} \langle \nabla f_{I_t}(x^t), \nabla \Tilde{g}_{J_t}(x^t)  \rangle\right] \nonumber \\
    &= \mathbb{E} \left[{\beta_{H_t}^2 L \over 2} \left( \lVert \nabla g_{J_t}(x^t) \rVert^2 -{ \langle \nabla f_{I_t}(x^t), \nabla g_{J_t} (x^t) \rangle^2 \over \lVert \nabla f_{I_t}(x^t) \rVert^2}\right) - \beta_{H_t} \left( \langle \nabla f_{I_t}(x^t),  \nabla g_{J_t}(x^t) \rangle - \langle \nabla f_{I_t}(x^t),  \nabla g_{J_t}(x^t) \rangle \right)\right] \nonumber \\
    &= \mathbb{E} \left[{\beta_{H_t}^2 L \over 2} \left( \lVert \nabla g_{J_t}(x^t) \rVert^2 -{ \langle \nabla f_{I_t}(x^t), \nabla g_{J_t} (x^t) \rangle^2 \over \lVert \nabla f_{I_t}(x^t) \rVert^2}\right) \right].
\end{align}
Now, we compare the catastrophic forgetting term between the original value with $\nabla g_{J_t} (x^t)$ and the above surrogate.
\begin{align*}
    \mathbb{E} \left[{\beta_{H_t}^2 L \over 2} \left( \lVert \nabla g_{J_t}(x^t) \rVert^2 -{ \langle \nabla f_{I_t}(x^t), \nabla g_{J_t} (x^t) \rangle^2 \over \lVert \nabla f_{I_t}(x^t) \rVert^2}\right) \right] <  \mathbb{E}\left[{\beta_{H_t}^2 L \over 2} \lVert \nabla g_{J_t}(x^t) \rVert^2 - \beta_{H_t} \langle \nabla f_{I_t}(x^t), \nabla g_{J_t} (x^t) \rangle \right].
\end{align*}
Then, we can conclude that $\E[\Gamma_t]$ with the surrogate of A-GEM  is smaller than the original $\E[\Gamma_t]$.

\textbf{Derivation of optimal $\Gamma_t^*$ and $\beta_{H_t}^*$} \quad
For a fixed learning rate $\alpha$, we have
\begin{align*}
    0={\partial \E [\Gamma_t] \over \partial \beta_{H_t}} &= \E \left[ {\partial \Gamma_t \over \partial \beta_{H_t}} \right] \\
    &=  \E \left[ \beta_{H_t} L \lVert \nabla g_{J_t} (x^t) \rVert^2 - (1- \alpha L) \langle \nabla f_{I_t}(x^t), \nabla g_{J_t} (x^t) \rangle \right].
\end{align*}
Thus, we obtain
\begin{align*}
    \beta_{H_t}^* = {(1-\alpha_{H_t} L)\langle \nabla f_{I_t}(x^t), \nabla g_{J_t} (x^t) \rangle \over L \lVert \nabla g_{J_t}(x^t) \rVert^2}={(1-\alpha_{H_t} L)\Lambda_{H_t} \over L \lVert \nabla g_{J_t}(x^t) \rVert^2}, \\
    \Gamma_t^* = - {(1-\alpha_{H_t} L)^2\langle \nabla f_{I_t}(x^t), \nabla g_{J_t} (x^t) \rangle^2 \over 2L \lVert \nabla g_{J_t}(x^t) \rVert^2}= -{(1-\alpha_{H_t} L)^2\Lambda_{H_t}^2 \over 2L \lVert \nabla g_{J_t}(x^t) \rVert^2}.
\end{align*}


% \begin{comment}
\section{Overfitting to Replay Memory}
\label{sec:overfitting}
\begin{comment}
In the main text, we discussed a theoretical convergence analysis of continual learning for a smooth nonconvex finite-sum optimization problems.
The practical continual learning tasks have the restriction on full access to the entire data points of previously learned tasks.
Unlike taking expectation over $I_t \sim M$ and $M \sim P \cup C$, we have to compute on the given memory in the practical scenario.
Then, we note that $\E[B_t | M] \neq 0$.

Now we rewirte Equation \ref{aeq:thm1_result} for the worst case as

\begin{align}
    % \sup \lVert \nabla f(x) \rVert^2 &\leq \sum B_t \\
     T \sup \lVert \nabla f(x) \rVert^2  &\leq {1 \over \alpha(1-\alpha L /2 )} \left( \Delta_f + \sum \left(B_t + C_t \right) + {L \over 2} \alpha^2 \sigma_f^2 \right) \\
      \sup \lVert \nabla f(x) \rVert^2  &\leq {A \over \sqrt{T}} \left( {1 \over c} \left(\Delta_f + \sum \left(B_t + C_t \right) \right) + {Lc \over 2} \sigma_f^2 \right).
\end{align}

We note that $\sum B_t$ is a random variable, which is unpredictible, and
choosing $\nabla f_M(t) = \nabla f(x^t)$ over entire period is impossible.
Then, the cumulative sum of $B_t$ is increasing over $T$.
Therefore, we conclude that for the overfitting to memory degrades the convergence rate of NCCL empirically.
\end{comment}



In Lemma \ref{lemma:step}, we show the expectation of the stepwise change of the upper bound.
Now, we discuss the distribution of the upper bound by analyzing the random variable $B_t$.
Recall that $B_t$ is given by
\begin{equation*}
    B_t = (L\alpha_{H_t}^2 - \alpha_{H_t}) \langle \nabla f(x^t), e_t \rangle + \beta_{H_t} \langle \nabla g_{J_t}(x^t),e_t \rangle.
\end{equation*}
The purpose of our convergence analysis is to compute the upper bound of Equation \ref{eq:thm1},
so we next compute an upper bound of $B_t$:
\begin{align*}
    B_t &\leq \lvert L\alpha_{H_t}^2 - \alpha_{H_t} \rvert \lVert \nabla f(x^t) \rVert \lVert e_t\rVert + \beta_{H_t} \lVert \nabla g_{J_t}(x^t)\rVert \lVert e_t \rVert.
\end{align*}
Note that the upper bound depends on the distribution of the norm of $e_t$.
We already know that $\E [e_t]=0$, so we consider its variance, $\text{Var}(\lVert e_t \rVert)$, in this section.
Let $m_{P}$ denote the number of data points of $P$ in a memory $M_0$.
We assume that $M_0$ is uniformly sampled from $P$.
Then the sample variance $\text{Var}(\lVert e_t \rVert)$ is computed as
\begin{align*}
    \text{Var}(\lVert e_t \rVert) = {n_f - m_{P} \over (n_f-1) m_{P}} \sigma_f^2
\end{align*}
by a derivation similar to that of Equation \ref{eq:samplevariance}.
The above result can be applied directly to the variance of $B_t$.
This implies that $m_P$ is a key factor affecting the convergence rate.
Note that a larger $m_P$, obtained for example by using a larger memory, yields a smaller variance.
In addition, the distributions of $e_t$ and $\nabla f_{I_t}(x^t)$ differ across memory schemes.
Therefore, we can observe that different memory schemes lead to different performance even if the same step sizes are applied.





% % Thm 1
% % TODO: check the coefficient of B_t
% \begin{lemma}
% %Suppose $f$ has $\sigma_f$ bounded gradient. $L \alpha_{H_t}^2 - \alpha_{H_t}^2 \leq \gamma$ for some $\gamma >0$ and 
% Suppose that Assumption \ref{assumption:lsmooth} holds and $0 < \alpha_{H_t} \leq {2 \over L}$.
% For $x^t$ updated by Algorithm \ref{alg:gni}, we have
% \begin{align}
%     &\mathbb{E}_t  \lVert \nabla f (x^t) \rVert^2 \leq  \mathbb{E}_t \left[ {1 \over \alpha_{H_t}(1-{L\over2}\alpha_{H_t})} \left(f(x^t) - f(x^{t+1}) + B_t + \Gamma_t \right) + {\alpha_{H_t} L \over 2 (1-{L\over2}\alpha_{H_t})} \sigma_{f}^2 \right].
% \end{align}
% \end{lemma}

\bibliography{han_567}



\end{document}
