% !TEX root = main.tex




\section{The \alg Algorithm Family} \label{sec: alg}

In this section, we first present the basic version of the \alg algorithm in Section~\ref{subsec:stimulus}, which is followed by its momentum and adaptive-batching variants in Sections~\ref{subsec:stimulus-m} and \ref{subsec:stimulusp}, respectively.





\subsection{The \alg Algorithm} \label{subsec:stimulus}



Our \alg algorithm is presented in Algorithm~\ref{alg}, where we propose a new variance-reduced (VR) multi-gradient estimator. 
It can be seen from Algorithm~\ref{alg} that our proposed VR approach has a double-loop structure, where the inner loop is of length $q>0$.
% At the beginning of each inner loop, 
% %denoted by $\x_t$. 
% the estimator is initialized as the full multi-gradient $\bu_t^s=\nabla f_s\left(\x_0\right)$.
%It is insightful to compare \alg with the basic MGD method \citep{fliege2019complexity}. 
% In MGD, the algorithm computes the  full multi-gradient in the $t$-th iteration as follows:
% %
% \begin{align}\label{mgda}
% \mathbf{u}_t^s = \nabla f_s(\x_{t}), \forall s \in [S].
% \end{align}
More specifically, different from MGD where a full multi-gradient direction $\mathbf{u}_t^s = \nabla f_s(\x_{t})$, $\forall s \in [S]$ is evaluated in all iterations, our \alg algorithm only evaluates a full multi-gradient every $q$ iterations (i.e., $\mathrm{mod}(t,q)=0$).
For all other iterations $t$ with $\mathrm{mod}(t,q)\ne 0$, our \alg algorithm uses a {\em stochastic} multi-gradient estimator $\bu_t^s$ based on a mini-batch $\mathcal{A}$ with a recursive correction term as follows:
\begin{small}
\begin{align}\label{vr}
\mathbf{u}_t^s = \mathbf{u}_{t-1}^s &+ \frac{1}{|\mathcal{A}|} \sum_{j\in \mathcal{A}}( \nabla f_{sj} (\x_{t};\xi_{sj}  )\notag\\&- \nabla f_{sj} (\x_{t-1};\xi_{sj} ) ), \text{for all }s \in [S].
\end{align}
\end{small}

Eq.~\eqref{vr} shows that the estimator is constructed iteratively based on information from $\x_{t-1}$ and $\bu_{t-1}^s$, both of which are obtained from the previous update. 
We will show later in Section~\ref{sec: convergence} that, thanks to the $q$-periodic full multi-gradients and the recursive correction terms, \alg is
able to achieve a convergence rate of $\mathcal{O}(1/T)$. Moreover, due to the stochastic subsampling in mini-batch $\mathcal{A}$, \alg has a lower sample complexity than MGD. 
%The full description of \alg is shown in Algorithm 1.
In \algns, the update rule for parameters in $\x$ is written as:
$%\label{vr_update}
\x_{t+1} = \x_{t} - \eta \bd_t,
$
where $\eta$ is the learning rate.
%, a hyperparameter that decides the step size at each iteration while moving towards a minimum.
Here, the direction $\bd_t$ is defined as $\bd_t := \sum_{s \in [S]} \lambda_{t}^{s} \mathbf{u}_{t}^s$, where the $\lambda_t^s$-values are obtained by solving the following quadratic optimization problem:
\begin{small}
\begin{align}\label{mgda} 
     \min_{\lambda_t^s\geq 0} \Big \|  \sum_{s \in [S]} \lambda_{t}^s \mathbf{u}_{t}^s \Big\|^2, \,\,
    \mathrm{s.t.} \,\, \sum_{s \in [S]} \lambda_{t}^s = 1.    
\end{align}
\end{small}
%
%{\color{blue}
The quadratic subproblem in Eq.~\eqref{mgda} follows the same approach as in the MGDA algorithm \citep{mukai1980algorithms,sener2018multi,lin2019pareto,fliege2019complexity}.
%}

\begin{algorithm}[htbp]
\caption{\alg algorithm and its variants.}
\label{alg} 
\begin{algorithmic}[1] % Adding [1] for line numbering
\REQUIRE Initial point $\x_0$, parameters $T$, $q$, learning rate $\eta$, momentum coefficient $\alpha$ (momentum variants only).
\STATE Initialize: Choose $\x_0$.
\FOR {$t = 0, 1, \ldots, T$}
    \IF {$\mathrm{mod}(t, q) = 0$}
        \IF {\alg or \algmns}
            \STATE Compute: $\mathbf{u}_t^s\!\!=\!\!\frac{1}{n}\sum_{j=1}^n \nabla f_{sj} (\x_{t};\xi_{sj}  ),\! \forall  s \!\in\! [S].$
        \ENDIF
        \IF {\algp or \algmpns}
            \STATE Compute: $\mathbf{u}_t^s$ as in Eq.~\eqref{STIMULUSP1}.
        \ENDIF
    \ELSE
        \STATE  Compute $\mathbf{u}_t^s$ as in Eq.~\eqref{vr}.
    \ENDIF
     \STATE  Compute $\boldsymbol{\lambda}_t^* \in [0, 1]^S$ by solving Eq.~\eqref{mgda}.
    \STATE Compute: $\bd_t = \sum_{s \in [S]} \lambda_{t}^{s,*} \mathbf{u}_{t}^s$.
    \IF {\alg or \algpns}
        \STATE Update: $\x_{t+1} = \x_{t} - \eta \bd_t$.
    \ENDIF 
    \IF {\algm or \algmpns}
        \STATE Update: $\x_{t+1} = \x_{t} + \alpha(\x_{t}-\x_{t-1}) - \eta \bd_t $.
    \ENDIF 
\ENDFOR
\end{algorithmic}
\end{algorithm}


\subsection{The \algm Algorithm} \label{subsec:stimulus-m}

Although it can be shown that \alg achieves a theoretical $\mathcal{O}(1/T)$ convergence rate, it could be sensitive to the choice of learning rate and suffer from similar oscillation issues in practice as gradient-descent-type methods do in single-objective optimization when some objectives are ill-conditioned.

To further improve the empirical performance of \algns, we now propose a momentum-assisted enhancement for \alg called \algmns. 
The idea behind \algm is to take into account the past trajectories to smooth the update direction. 
%It can be visualized as a ball rolling downhill, building up velocity (or momentum) in directions with persistent gradients.
Specifically, in addition to the combined iterative update as in $%\label{vr_update}
\x_{t+1} = \x_{t} - \eta \bd_t
$ and Eq.~\eqref{mgda}, the update rule in \algm incorporates an $\alpha$-parameterized momentum term as follows:
\begin{small}
\begin{align}\label{vrm_update}
\x_{t+1}   =   \x_{t}   -   \eta \bd_t   +   \underbrace{\alpha(\x_{t}  -  \x_{t-1})}_{\mathrm{Momentum}},
\end{align}
\end{small}
where $\alpha   \in   (0, 1)$ is the momentum coefficient.
%, a hyperparameter typically set between 0 (no momentum) and 1 (high momentum).
         

\subsection{\algpns/\algmp Algorithms} \label{subsec:stimulusp}

Note that in both \alg and \algmns, one still needs to evaluate a full multi-gradient every $q$ iterations, which remains computationally demanding in the large data regime. 
Moreover, if the objectives are in an expectation or ``online'' form rather than the finite-sum setting, it is infeasible to compute a full multi-gradient.
%Also, the initialization step in \alg and \algm requires a full multi-gradient evaluation, which could take a long time for the algorithm to start the first move. 
To address these limitations, we propose two {\em adaptive-batching} enhanced versions for \alg and \algm called \algp and \algmpns, respectively. 
%It integrates the \alg with an adaptive batch size technique. 
Specifically, rather than using a $q$-periodic full multi-gradient $\mathbf{u}_t^s = \nabla f_s(\x_{t})=\frac{1}{n}\sum_{j=1}^n \nabla f_{sj} (\x_{t};\xi_{sj}  )$, $\forall s \in [S]$, in iteration $t$ with $\mathrm{mod}(t, q) = 0$, we utilize an adaptive-batching stochastic multi-gradient as follows:
\begin{small}
\begin{align}\label{STIMULUSP1}
\mathbf{u}_t^s = \frac{1}{|\mathcal{N}_s|} \sum_{j\in \mathcal{N}_s}\nabla f_{sj}(\mathbf{x}_{t};\xi_{sj} ), \quad \forall s \in [S],
\end{align}
\end{small}
where $\mathcal{N}_s$ is an $\epsilon$-adaptive batch sampled from the dataset uniformly at random with size:
\begin{small}
\begin{align}\label{STIMULUSP2}
|\mathcal{N}_s| = \min \left\{ c_\gamma \sigma^2\gamma_{t}^{-1}, c_\epsilon \sigma^2 \epsilon^{-1}, n\right\}.
\end{align}
\end{small}
We choose constants $c_\gamma \geq 8$ and $c_{\epsilon}\geq \eta$ in the non-convex case, and $c_{\gamma}\geq \frac{8\mu}{\eta}$ and $c_{\epsilon}\geq \frac{\mu}{2}$ in the strongly convex case (see the detailed discussion in Section~\ref{sec: convergence}). 
%\kevin{You need to explain how to choose these constants. Otherwise, it's very likely that the reviewers will ask.}. 
Here, $\sigma^2$ denotes the variance bound of the stochastic gradient norms (cf.\ Assumption~\ref{ass3}). 
% NOTE(review): $n_k$ is not defined in this section; confirm it is introduced elsewhere.
In \algpns, we choose
 $\gamma_{t+1} = \sum_{i = (n_k-1) q}^t \frac{\|\bd_i\|^2}{q}$, while in the momentum-based algorithm \algmpns, we choose $\gamma_{t+1} =\sum_{i = (n_k-1) q}^t \|\alpha^{(t-i)}\bd_{i}\|^2/q$. The term $\gamma_{t+1}$ offers further refinement to improve convergence.

% The choice of $|\mathcal{N}_s|$ is driven by its crucial role in the later stages of convergence where a more precise update direction becomes essential. We'll demonstrate that \algpns/\algmp retains the convergence rate of \algns/\algm under certain conditions and settings.







