% \documentclass{article}
\documentclass[accepted]{style/uai2025}

% if you need to pass options to natbib, use, e.g.:
%     \PassOptionsToPackage{numbers, compress}{natbib}
% before loading neurips_2024


% ready for submission
% \usepackage[nonatbib]{style/neurips_2024}
% \usepackage{style/arxiv}
% \usepackage{iclr2025_conference,times}
% \usepackage{natbib}
% \usepackage[margin=1in]{geometry}
\input{math_commands}

% Bibliography setup: natbib supplies the citation commands, references are
% formatted with the plainnat style, and the reference list is headed by an
% unnumbered \subsubsection titled "References".
\usepackage{natbib}
\bibliographystyle{plainnat}
\renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} % amsmath with fixes and additions

% \usepackage{iclr2024_conference,times}

% to compile a preprint version, e.g., for submission to arXiv, add the
% [preprint] option:
%     \usepackage[preprint]{neurips_2024}


% to compile a camera-ready version, add the [final] option, e.g.:
%     \usepackage[final]{neurips_2024}


% to avoid loading the natbib package, add option nonatbib:
%    \usepackage[nonatbib]{neurips_2024}


\usepackage[utf8]{inputenc} % allow utf-8 input
\usepackage[T1]{fontenc}    % use 8-bit T1 fonts
% \usepackage[hidelinks]{hyperref}       % hyperlinks
\usepackage{url}            % simple URL typesetting
\usepackage{booktabs}       % professional-quality tables
\usepackage{amsfonts}       % blackboard math symbols
\usepackage{nicefrac}       % compact symbols for 1/2, etc.
\usepackage{float}
\usepackage{multirow}
\usepackage{microtype}      % microtypography
\usepackage{xcolor}         % colors
\usepackage[linesnumbered,ruled,vlined]{algorithm2e}
\usepackage{lipsum}
\usepackage{graphicx}
\usepackage{amsmath}
\usepackage{amssymb}
\usepackage{bm}
\usepackage{caption}
\usepackage{subcaption}
\usepackage{arydshln}
\usepackage{soul}
\usepackage{listings}

% Hyperlink appearance.
% NOTE(review): the explicit \usepackage{hyperref} line in this file is
% commented out, so \hypersetup is presumably provided through the document
% class -- confirm before changing the class or its options.
\hypersetup{
    % Link text is not colored (switch to colorlinks=true to enable the
    % per-type colors listed below).
    % colorlinks=true,
    colorlinks=false,
    % Per-type link colors.
    citecolor=blue,
    linkcolor=purple,
    filecolor=magenta,
    urlcolor=black,
    % pdftitle={Overleaf Example},
    % pdfpagemode=FullScreen,
}


% Local redefinition of \SetKwInOut{<cmd>}{<label>} (the original command
% comes from algorithm2e, loaded above).
%
% What the code below does:
%   1. Typesets "<label>:" into \algocf@inoutbox to measure its width.
%   2. On first use (\InOutSizeDefined not yet defined) it stores that width
%      in \inoutsize; on later uses it only enlarges \inoutsize when the new
%      label is wider. In both cases \inoutindent is set to the width of a
%      \parbox of width \inoutsize followed by one "~".
%   3. Defines <cmd> (via \algocf@newcommand) as a one-argument command that
%      prints the label in a \parbox of width \inoutsize, with the colon
%      directly after the label and \hfill padding after the colon, then the
%      argument, using a hanging indent of \inoutindent.
%
% The commented-out line inside the definition is an alternative layout that
% puts \hfill before the colon and takes the hanging indent from the box
% width instead of \inoutindent.
\makeatletter
\renewcommand{\SetKwInOut}[2]{%
  \sbox\algocf@inoutbox{\KwSty{#2}\algocf@typo:}%
  \expandafter\ifx\csname InOutSizeDefined\endcsname\relax% if first time used
    \newcommand\InOutSizeDefined{}\setlength{\inoutsize}{\wd\algocf@inoutbox}%
    \sbox\algocf@inoutbox{\parbox[t]{\inoutsize}{\KwSty{#2}\algocf@typo:\hfill}~}\setlength{\inoutindent}{\wd\algocf@inoutbox}%
  \else% else keep the larger dimension
    \ifdim\wd\algocf@inoutbox>\inoutsize%
    \setlength{\inoutsize}{\wd\algocf@inoutbox}%
    \sbox\algocf@inoutbox{\parbox[t]{\inoutsize}{\KwSty{#2}\algocf@typo:\hfill}~}\setlength{\inoutindent}{\wd\algocf@inoutbox}%
    \fi%
  \fi% the dimension of the box is now defined.
  \algocf@newcommand{#1}[1]{%
    \ifthenelse{\boolean{algocf@inoutnumbered}}{\relax}{\everypar={\relax}}%
%     {\let\\\algocf@newinout\hangindent=\wd\algocf@inoutbox\hangafter=1\parbox[t]{\inoutsize}{\KwSty{#2}\algocf@typo\hfill:}~##1\par}%
    {\let\\\algocf@newinout\hangindent=\inoutindent\hangafter=1\parbox[t]{\inoutsize}{\KwSty{#2}\algocf@typo:\hfill}~##1\par}%
    \algocf@linesnumbered% reset the numbering of the lines
  }}%
\makeatother

% Inline reviewer-note macros: \benyou{...}, \yumou{...} and \response{...}.
%
% NOTE: the switch is inverted relative to its name. While \commenton is
% defined (as it is on the next line), the \ifx test below is false, the
% \else branch is taken, and all three macros expand to nothing, i.e. the
% notes are hidden. To show the notes in color, comment out the
% \newcommand{\commenton}{1} line.
\newcommand{\commenton}{1}
\ifx\commenton\undefined
% Visible variants: bold, colored, bracketed notes.
% (\bfseries replaces the obsolete two-letter font switch \bf.)
\newcommand{\benyou}[1]{{ \bfseries \color{red}   [benyou says `#1'] }}
\newcommand{\yumou}[1]{{ \bfseries \color{cyan}   [Yumou says `#1'] }}
\newcommand{\response}[1]{{ \bfseries \color{blue}   [#1] }}
\else
% Hidden variants: the argument is discarded.
\newcommand{\benyou}[1]{}
\newcommand{\yumou}[1]{}
\newcommand{\response}[1]{}
\fi


% Theorem-like environments. Each one is declared without a shared-counter
% argument, so every environment is numbered by its own counter.
\usepackage{amsthm}
\newtheorem{assumption}{Assumption}
\newtheorem{lemma}{Lemma}
\newtheorem{proposition}{Proposition}
\newtheorem{theorem}{Theorem}
\newtheorem{corollary}{Corollary}

% Typewriter-font names for the proposed method and its two optimizer
% variants. Write \algname{} (with braces) when a space must follow the name.
\newcommand*{\algname}{\texttt{PMA}}
\newcommand*{\algadamw}{\texttt{AdamW-PMA}}
% \newcommand{\algadamwdp}{\texttt{AGMA-DP}}
\newcommand*{\alglion}{\texttt{Lion-PMA}}

% minitoc is loaded and the part number (\thepart) and part label (\partname)
% are both blanked. The \part / \parttoc calls that would use this for the
% appendix are currently commented out near \appendix.
\usepackage{minitoc}
\renewcommand{\thepart}{}
\renewcommand{\partname}{}

% \title{Stabilizing Noisy Adam for Training Large Language Models}
% \title{Accelerating Noisy Adam with Small Batch for Training Large Language Models}
% \title{Accelerating Noisy SGD with Periodical Moving Average for Training Large Language Models}
% \title{Periodical Moving Average to Accelerate Momentum-based Optimizers in Language Model Training}
% \title{Accelerating SGD with Periodical Moving Averaged Momentum for Training Language Models}
% \title{Mini-Batch SGD with Periodical Moving Average Accelerates Large Language Model Training}
% \title{Mini-Batch SGD with Periodical Moving Average Accelerates Large Language Model Tuning}
\title{Periodical Moving Average Accelerates Gradient Accumulation for Post-Training}


% The \author macro works with any number of authors. There are two commands
% used to separate the names and addresses of multiple authors: \And and \AND.
%
% Using \And between authors leaves it to LaTeX to determine where to break the
% lines. Using \AND forces a line break at that point. So, if LaTeX puts 3 of 4
% authors names on the first line, and the last on the second line, try using
% \AND instead of \And before the third author name.


% \author{%
%   David S.~Hippocampus\thanks{Use footnote for providing further information
%     about author (webpage, alternative address)---\emph{not} for acknowledging
%     funding agencies.} \\
%   Department of Computer Science\\
%   Cranberry-Lemon University\\
%   Pittsburgh, PA 15213 \\
%   \texttt{hippo@cs.cranberry-lemon.edu} \\
%   % examples of more authors
%   % \And
%   % Coauthor \\
%   % Affiliation \\
%   % Address \\
%   % \texttt{email} \\
%   % \AND
%   % Coauthor \\
%   % Affiliation \\
%   % Address \\
%   % \texttt{email} \\
%   % \And
%   % Coauthor \\
%   % Affiliation \\
%   % Address \\
%   % \texttt{email} \\
%   % \And
%   % Coauthor \\
%   % Affiliation \\
%   % Address \\
%   % \texttt{email} \\
% }
% \author{}
% \author{Yumou Liu, An Li, Chaojie Li, Fei Yu and Benyou Wang\\
% \small
% The Chinese University of Hong Kong, Shenzhen\\
% \small
% Shenzhen, China\\
% % Pittsburgh,PA15213,USA\\
% % \texttt{your email}\\
% }
% Author list.
% Only the first author's name is wrapped in the mailto \href. Previously the
% \href enclosed the whole author list and both \thanks footnotes, so every
% author name linked to the first author's e-mail address. The \thanks
% footnotes now follow the names directly, with no space before the mark.
\author[ ]{%
    \href{mailto:yumouliu@link.cuhk.edu.cn}{Yumou Liu}%
    \thanks{\href{mailto:yumouliu@link.cuhk.edu.cn}{yumouliu@link.cuhk.edu.cn}},
    An Li, Chaojie Li, Fei Yu,
    Benyou Wang\thanks{Benyou is the corresponding author}%
}
% \author[ ]{An Li}
% \author[ ]{Chaojie Li}
% \author[ ]{Fei Yu}
% \author[ ]{Benyou Wang} %$^\dagger$
% Add affiliations after the authors
% (the stray "~" before the first line break was removed: it added an
% invisible trailing space to that line)
\affil[ ]{%
    School of Data Science\protect\\
    The Chinese University of Hong Kong, Shenzhen\protect\\
    Shenzhen, China
}


\begin{document}


\maketitle


% \begin{abstract}
% High gradient variance challenges post-training Large Language Models (LLMs) on memory-limited devices. Existing practical approaches, such as small batch size or using Gradient Accumulation (GA), face the dilemma between low convergence rates due to high variance in parameter updates and long training times due to the serial GA process. In this paper, we identify that the exponential nature of the Exponential Moving Average (EMA) rapidly forgets historical gradients at an exponential rate in momentum updates, making it difficult to utilize the historical gradients to stabilize the update steps. Leveraging that model parameters move little during post-training, we embed the idea of GA into the momentum update and propose the Periodical Moving Average (PMA) technique. PMA splits the training steps into periods and employs moving averages instead of EMA in each period. We apply PMA to AdamW and Lion, resulting in AdamW-PMA and Lion-PMA. Theoretical analysis demonstrates that AdamW-PMA achieves a comparable convergence rate with Adam. Extensive experiments showcase the superiority of PMA on the post-training stafe (including Supervised Fine-Tuning and Direct Preference Optimization). Notebly, the PMA-based methods achieve approximately at least $2\times$ speedup than gradient accumulation and higher scores on downstream tasks.
% The high gradient variance poses significant challenges for post-training Large Language Models (LLMs) on memory-constrained devices. Current practical solutions, such as reducing batch sizes or employing Gradient Accumulation (GA), present a trade-off between low convergence rates caused by high variance in parameter updates and prolonged training times due to GA's sequential nature. In this work, we demonstrate that the Exponential Moving Average (EMA) in momentum updates inherently forgets historical gradients at an exponential rate, thereby limiting their utility for stabilizing update steps. Capitalizing on the observation that model parameters undergo minimal changes during post-training, we integrate the concept of GA into momentum updates and introduce the Periodical Moving Average (PMA) technique. PMA organizes training steps into distinct periods and replaces EMA with moving averages within each period. We implement PMA in both AdamW and Lion optimizers, yielding AdamW-PMA and Lion-PMA variants. Theoretical analysis confirms that AdamW-PMA maintains a convergence rate comparable to standard Adam. Comprehensive experiments demonstrate PMA's effectiveness in post-training scenarios, including Supervised Fine-Tuning and Direct Preference Optimization. Notably, PMA-based methods achieve approximately $2\times$ speedup compared to gradient accumulation while attaining superior performance on downstream tasks.
% \end{abstract}
\begin{abstract}
High gradient variance presents a significant obstacle to efficient post-training of large language models (LLMs) on memory-constrained devices. Existing practical strategies—such as reducing batch sizes or adopting gradient accumulation (GA)—suffer from an inherent trade-off: smaller batches exacerbate convergence issues due to increased gradient noise, while GA substantially prolongs training time owing to its sequential processing. In this work, we reveal that the Exponential Moving Average (EMA) in momentum-based optimizers exponentially discounts historical gradients, thereby limiting their effectiveness in stabilizing parameter updates, especially during post-training when parameter drift is minimal. Motivated by this, we propose integrating the core idea of GA directly into momentum updates via a novel \emph{Periodical Moving Average} (PMA) mechanism, which structures training into fixed periods and replaces EMA with a uniform moving average within each period. We instantiate PMA within AdamW and Lion, resulting in the AdamW-PMA and Lion-PMA optimizers. Theoretical analysis establishes that AdamW-PMA matches the convergence guarantees of standard Adam. Extensive empirical evaluation on supervised fine-tuning and direct preference optimization tasks demonstrates that PMA-based methods achieve approximately $2\times$ faster training compared to GA, while yielding consistently better performance on downstream evaluations. 
\end{abstract}



\input{contents-uai-cr/introduction}
\input{contents-uai-cr/preliminaries}

% \input{pseudo_code}
% \input{contents/method}
\input{contents-uai-cr/method}

\input{contents-uai-cr/theory}

% \input{contents/evaluation}
% \input{contents/evaluation-v2}
\input{contents-uai-cr/evaluation}

\input{contents-uai-cr/related_work}

% \input{related_work}
% \section{Conclusion}
% We address the problem of high-variance stochastic optimization on GPU-memory-limited devices for post-training LLMs. We identified that the low convergence rate of current momentum-based optimizers is primarily due to the EMA method, which fails to leverage historical gradients effectively for stabilizing updates. We propose \algname, a new momentum update method that splits the training process into periods and applies a moving average within each period. We modify AdamW and Lion using \algname, resulting in \algadamw{} and \alglion, respectively. Empirical evaluations on SFT and DPO tasks using various models demonstrate that \algname{} achieves approximately at least $2\times$ speedup in the training process and delivers better performance on downstream tasks.

% % \paragraph{}
% \section*{Limitation}
% \algname{} modified methods could incur higher communication overhead in multi-GPU training scenarios, especially when $K$ is large. 
% % For example, Since \algadamw{} employs extra steps of parameter update during GA, more communication overhead is required when multiple GPUs are employed for the training task.
% % , potentially slowing down the training process compared to Adam. 
% Specifically, since there are $K$ more communication rounds in \algadamw{} than in Adam with GA, the communication cost of \algadamw{} is $K$ times higher than that of Adam with GA.
% In addition, we did not evaluate \algname{} in larger-scale experiments.
\section{Conclusion}
We tackled the challenge of high-variance stochastic optimization for post-training large language models (LLMs) on GPU-memory-constrained devices. Our analysis revealed that the slow convergence of existing momentum-based optimizers is largely attributable to the EMA scheme, which inadequately exploits information from historical gradients to stabilize parameter updates. To remedy this, we introduced \algname{}, a novel momentum update framework that partitions training into periods and applies a moving average within each period. We integrated \algname{} into AdamW and Lion to obtain \algadamw{} and \alglion{}, respectively. Extensive experiments on SFT and DPO tasks across various models demonstrate that \algname{} yields at least a $2\times$ speedup in training and consistently improves downstream task performance.

\section*{Limitations}
While \algname{} offers substantial training acceleration, it can incur increased communication overhead in multi-GPU settings, particularly for large $K$. Specifically, since \algadamw{} performs $K$ times as many parameter update communications as AdamW with GA, its communication cost can scale linearly with $K$. In addition, we have not yet evaluated \algname{} in large-scale distributed experiments; further investigation in such settings is warranted.


\section*{Acknowledgement}
This work was supported by the Shenzhen Science and Technology Program (JCYJ20220818103001002), Shenzhen Doctoral Startup Funding (RCBS20221008093330065), Tianyuan Fund for Mathematics of National Natural Science Foundation of China (NSFC) (12326608), Shenzhen Science and Technology Program (Shenzhen Key Laboratory Grant No. ZDSYS20230626091302006), and Shenzhen Stability Science Program 2023, Shenzhen Key Lab of Multi-Modal Cognitive Computing.

% \newpage
% \section*{Reproducibility Statement}
% The code of the experiment is attached in the supplementary material as a zip file. Please refer to the \verb|README_ICLR_submission.md| for detailed usage. The proof is provided in the Appendix.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% \newpage
% \small
% \bibliographystyle{unsrt}
% \bibliographystyle{apalike}
% \bibliographystyle{alpha}
% \bibliographystyle{iclr2024_conference}
% \bibliographystyle{iclr2025_conference}
\bibliography{ref}
\normalsize
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\newpage
% \appendix


\onecolumn

\title{Periodical Moving Average Accelerates Gradient Accumulation for Post-Training\\(Supplementary Material)}
\maketitle

\appendix

% \addcontentsline{toc}{section}{Appendix} % Add the appendix text to the document TOC
% \part{Appendix} % Start the appendix part
% \parttoc % Insert the appendix TOC

\section{Extra Pseudo-Code}\label{sec:app-pseudo-code}
% \input{pseudo_code_alglion}
\input{contents-uai/pseodo_code_alglion}
Algorithm~\ref{alg:agma-lion} presents the pseudo-code of \alglion{}. Lines 4--8 illustrate the large update steps, while lines 9--13 demonstrate the small update steps. We obtain \alglion{} by adapting \algname{} to Lion, in the same way that \algadamw{} adapts \algname{} to AdamW. The main difference between the two adaptations lies in the implementation of the learning rate strategy: in \alglion{}, we decay the learning rate by $1/K$ at the small update steps, instead of by $1/\sqrt{K}$ as in \algadamw{}.


% \input{contents-uai-cr/related_work}

\input{contents/theory-addition}
\input{contents/proof}

\input{contents/evaluation-addition}


\end{document}