\section{Additional Experiments}\label{sec:app-evaluation}

\subsection{Pre-Training}
Although \algname{} is designed for post-training, we also evaluate its performance on a pre-training task. Specifically, we train a randomly initialized nanoGPT model on the Wikipedia dataset. Figure~\ref{fig:pretrain} shows the validation loss of \algadamw{} and AdamW. EMA-based AdamW achieves a lower validation loss than \algadamw{}. This is because \algname{}, especially the small update step, is designed for post-training tasks where the distance between the original and trained parameters is small. When the parameters move a large distance, as in pre-training, the update direction can deviate too much from the direction of AdamW, leading to slow training.

\begin{figure}[t]
 \centering
 \begin{minipage}[t]{0.45\textwidth}
    \centering
    \includegraphics[width=\textwidth]{fig/rebuttal/pretrain.jpg}  
    \caption{Validation loss of \algadamw{} and AdamW when pre-training a randomly initialized nanoGPT model on the Wikipedia dataset.}
    \label{fig:pretrain}  
 \end{minipage}
 \hfill
    \nextfloat
 \begin{minipage}[t]{0.45\textwidth}
    \centering
    \includegraphics[width=\textwidth]{fig/rebuttal/scheduler.jpg}  
    \caption{Validation loss of AdamW without learning rate scheduler and AdamW with a PMA-like lr scheduler.}
    \label{fig:scheduler}  
 \end{minipage}
 \end{figure}

\subsection{Ablation on Learning Rate Scheduler}
To evaluate the effect of the learning rate scheduler introduced in Sec.~\ref{sec:learning-rate-strategy}, we conduct an experiment on Qwen2-0.5B, comparing AdamW without a scheduler and AdamW with a PMA-like scheduler. The other settings are the same as in the experiment in Fig.~\ref{fig:epoch}. We evaluate the tuned model every 120 steps, and the results are shown in Fig.~\ref{fig:scheduler}. The PMA-like scheduler slows down the training process if the other components of PMA are not applied. This result indicates the necessity of jointly designing the components of \algadamw{}.

\subsection{SFT}
The improvement in validation loss brought by PMA can be translated into a reduction of the number of steps or total compute. In Figure~\ref{fig:sft-loss-4}, we evaluate the optimizers by comparing the number of steps or total flops needed to achieve the same validation loss level, setting K to 4. As can be observed in Figure~\ref{fig:sft-loss-8}, AdamW-PMA and Lion-PMA achieve 12x and 2x speedups over AdamW and Lion, respectively.
\begin{figure}[t]
    \centering
    \begin{subfigure}{0.22\textwidth}
        \centering
        \includegraphics[width=\linewidth]{fig/sft/sft_4/sft_adamw_4_flops.PNG}
        \caption{AdamW-PMA v.s. AdamW on flops}
        \label{fig:sft-AdamW-4-flops}
    \end{subfigure}\hfill
    \begin{subfigure}{0.22\textwidth}
        \centering
        \includegraphics[width=\linewidth]{fig/sft/sft_4/sft_adamw_4_numberofsamples.PNG}
        \caption{AdamW-PMA v.s. AdamW on number of samples}
        \label{fig:sft-AdamW-4-numberofsamples}
    \end{subfigure}\hfill
    \begin{subfigure}{0.22\textwidth}
        \centering
        \includegraphics[width=\linewidth]{fig/sft/sft_4/sft_lion_4_flops.PNG}
        \caption{Lion-PMA v.s. Lion on flops}
        \label{fig:sft-Lion-4-flops}
    \end{subfigure}\hfill
    \begin{subfigure}{0.22\textwidth}
        \centering
        \includegraphics[width=\linewidth]{fig/sft/sft_4/sft_lion_4_numberofsamples.PNG}
        \caption{Lion-PMA v.s. Lion on number of samples}
        \label{fig:sft-Lion-4-numberofsamples}
    \end{subfigure}
    \caption{In terms of total flops and number of samples, AdamW-PMA and Lion-PMA achieve speedups of 1.8x and 1.4x over AdamW and Lion, respectively, when K = 4.}
    \label{fig:sft-loss-4}
    % Yumou: please refer to https://arxiv.org/pdf/2305.14342 for examples if there is any confusion in writing.
\end{figure}

\begin{figure}[t]
    \centering
    \begin{subfigure}[t]{0.22\textwidth}
        \centering
        \includegraphics[width=\linewidth]{fig/sft/sft-8/sft-AdamW-8-flops.PNG}
        \caption{AdamW-PMA v.s. AdamW on Flops}
        \label{fig:sft-AdamW-8-flops}
    \end{subfigure}\hfill
    \begin{subfigure}[t]{0.22\textwidth}
        \centering
        \includegraphics[width=\linewidth]{fig/sft/sft-8/sft-AdamW-8-numberofsamples.PNG}
        \caption{AdamW-PMA v.s. AdamW on Number of Samples}
        \label{fig:sft-AdamW-8-numberofsamples}
    \end{subfigure}\hfill
    \begin{subfigure}[t]{0.22\textwidth}
        \centering
        \includegraphics[width=\linewidth]{fig/sft/sft-8/sft-Lion-8-flops.PNG}
        \caption{Lion-PMA v.s. Lion on Flops}
        \label{fig:sft-Lion-8-flops}
    \end{subfigure}\hfill
    \begin{subfigure}[t]{0.22\textwidth}
        \centering
        \includegraphics[width=\linewidth]{fig/sft/sft-8/sft-Lion-8-numberofsamples.PNG}
        \caption{Lion-PMA v.s. Lion on Number of Samples}
        \label{fig:sft-Lion-8-numberofsamples}
    \end{subfigure}
    \caption{We evaluate the optimizers by comparing the total flops and number of samples needed to achieve the same validation loss level. \algadamw{} and \alglion{} achieved approximately 12x and 2x speedup, respectively, relative to AdamW and Lion.}
    \label{fig:sft-loss-8}
    % Yumou: please refer to https://arxiv.org/pdf/2305.14342 for examples if there is any confusion in writing.
\end{figure}


\subsection{DPO}
Figure~\ref{fig:dpo-loss-8} and Figure~\ref{fig:dpo-loss-16} illustrate the validation loss of the DPO task on Phi-2 with the HH-RLHF-harmless dataset, using four different optimizers. We compare the total flops and number of samples needed to achieve the same validation loss for vanilla AdamW versus AdamW-PMA, and for Lion versus Lion-PMA. The corresponding accuracy graph for this experiment can be found in Figure~\ref{fig:dpo-accuracy} of Section 5.3 in the main text.

\begin{figure}[t]
    \centering
    \begin{subfigure}[t]{0.22\textwidth}
        \centering
        \includegraphics[width=\linewidth]{fig/dpo/dpo_8/dpo_adamw_8_flops.PNG}
        \caption{AdamW-PMA v.s. AdamW on flops}
        \label{fig:dpo-AdamW-8-flops}
    \end{subfigure}\hfill
    \begin{subfigure}[t]{0.22\textwidth}
        \centering
        \includegraphics[width=\linewidth]{fig/dpo/dpo_8/dpo_adamw_8_numberofsamples.PNG}
        \caption{AdamW-PMA v.s. AdamW on number of samples}
        \label{fig:dpo-AdamW-8-numberofsamples}
    \end{subfigure}\hfill
    \begin{subfigure}[t]{0.22\textwidth}
        \centering
        \includegraphics[width=\linewidth]{fig/dpo/dpo_8/dpo_lion_8_flops.PNG}
        \caption{Lion-PMA v.s. Lion on flops}
        \label{fig:dpo-Lion-8-flops}
    \end{subfigure}\hfill
    \begin{subfigure}[t]{0.22\textwidth}
        \centering
        \includegraphics[width=\linewidth]{fig/dpo/dpo_8/dpo_lion_8_numberofsamples.PNG}
        \caption{Lion-PMA v.s. Lion on number of samples}
        \label{fig:dpo-Lion-8-number-of-samples}
    \end{subfigure}
    \caption{Validation loss of the DPO task on Phi-2 and HH-RLHF-harmless dataset.}
    \label{fig:dpo-loss-8}
\end{figure}

\begin{figure}[t]
    \centering
    \begin{subfigure}[t]{0.22\textwidth}
        \centering
        \includegraphics[width=\linewidth]{fig/dpo/dpo-1-16/adamw_16_flops.PNG}
        \caption{AdamW-PMA v.s. AdamW on flops}
        \label{fig:dpo-AdamW-16-flops}
    \end{subfigure}\hfill
    \begin{subfigure}[t]{0.22\textwidth}
        \centering
        \includegraphics[width=\linewidth]{fig/dpo/dpo-1-16/adamw_16_numberofsamples.PNG}
        \caption{AdamW-PMA v.s. AdamW on number of samples}
        \label{fig:dpo-AdamW-16-numberofsamples}
    \end{subfigure}\hfill
    \begin{subfigure}[t]{0.22\textwidth}
        \centering
        \includegraphics[width=\linewidth]{fig/dpo/dpo-1-16/lion_16_flops.PNG}
        \caption{Lion-PMA v.s. Lion on flops}
        \label{fig:dpo-Lion-16-flops}
    \end{subfigure}\hfill
    \begin{subfigure}[t]{0.22\textwidth}
        \centering
        \includegraphics[width=\linewidth]{fig/dpo/dpo-1-16/lion_16_numberofsamples.PNG}
        \caption{Lion-PMA v.s. Lion on number of samples}
        \label{fig:dpo-Lion-16-steps}
    \end{subfigure}
    \caption{We evaluate the optimizers by comparing the total flops and number of samples needed to achieve the same DPO validation loss level, with K set to 16. \algadamw{} and \alglion{} achieved approximately 4x and 5x speedup, respectively, relative to AdamW and Lion.}
    \label{fig:dpo-loss-16}
\end{figure}



\subsection{Hyper-Parameter Sensitivity}\label{sec:app-sensitivity}
We conduct experiments on the DPO task with the Phi-2-2.7B model and the Qwen1.5-0.5B-Chat model to explore the sensitivity of the PMA method's speedup factor to the hyper-parameter K on AdamW. In the experiments on the Phi-2 model, we set K to 8, 16, 32, and 64 to explore the optimal K value. For the Qwen1.5-0.5B model, K is set to 4, 8, 16, and 32, which are relatively smaller since the model is smaller. The results of the experiments can be seen in Figures~\ref{fig:hyper-app} and~\ref{fig:hyper-app-qwen}. These results supplement Section 5.4 in the main text.
\begin{figure}[t]
    \centering
    \begin{subfigure}[t]{0.22\textwidth}
        \centering
        \includegraphics[width=\linewidth]{fig/hyper-phi-2/dpo_hyper_8.jpg}
        \caption{}
        \label{fig:hyper-8}
    \end{subfigure}\hfill
    \begin{subfigure}[t]{0.22\textwidth}
        \centering
        \includegraphics[width=\linewidth]{fig/hyper-phi-2/dpo_AdamW_16_flops.jpg}
        \caption{}
        \label{fig:hyper_16}
    \end{subfigure}\hfill
    \begin{subfigure}[t]{0.22\textwidth}
        \centering
        \includegraphics[width=\linewidth]{fig/hyper-phi-2/dpo_hyper_32.jpg}
        \caption{}
        \label{fig:hyper_32}
    \end{subfigure}\hfill
    \begin{subfigure}[t]{0.22\textwidth}
        \centering
        \includegraphics[width=\linewidth]{fig/hyper-phi-2/dpo_hyper_64.jpg}
        \caption{}
        \label{fig:hyper_64}
    \end{subfigure}
    \caption{The sensitivity of PMA's speedup factor to the hyper-parameter K on the Phi-2 model using AdamW.}
    \label{fig:hyper-app}
\end{figure}


\begin{figure}[t]
    \centering
    \begin{subfigure}[t]{0.3\textwidth}
        \centering
        \includegraphics[width=\linewidth]{fig/hyper_qwen_1/hyper_qwen_2.PNG}
        \caption{}
        \label{fig:hyper-qwen-2}
    \end{subfigure}\hfill
    \begin{subfigure}[t]{0.3\textwidth}
        \centering
        \includegraphics[width=\linewidth]{fig/hyper_qwen_1/hyper_qwen_4.PNG}
        \caption{}
        \label{fig:hyper-qwen-4}
    \end{subfigure}\hfill
    \begin{subfigure}[t]{0.3\textwidth}
        \centering
        \includegraphics[width=\linewidth]{fig/hyper_qwen_1/hyper_qwen_8.PNG}
        \caption{}
        \label{fig:hyper_qwen-8}
    \end{subfigure}
    \caption{The sensitivity of PMA's speedup factor to the hyper-parameter K on the Qwen1.5-0.5B model using AdamW.}
    \label{fig:hyper-app-qwen}
\end{figure}