% !TEX root = main.tex
\section{Pareto Stationarity Convergence Analysis} \label{sec: convergence}


In this section, we theoretically analyze the Pareto stationarity convergence of our \alg algorithms in non-convex and strongly convex settings, beginning with two necessary assumptions.

\begin{assump}[$L$-Lipschitz Smoothness] \label{assump: smooth}
    % Each objective's gradient $\nabla f_s(\x), s \in [S]$ is $L$-Lipschitz continuous, i.e., 
    There exists a constant $L>0$ such that $\| \nabla f_s(\x) - \nabla f_s(\y) \| \leq L \| \x - \y \|, \forall \x, \y \in \mathbb{R}^d$, $\forall s \in [S]$.
\end{assump}

\begin{assump}[Bounded Variance]	\label{ass3}
There exists a constant $\sigma>0$ such that for all $\x\in \mathbb{R}^d$, $\mathbb{E}\|  \nabla_{\x}f_s(\x;\xi)- \nabla_{\x}f_s(\x)\|^2 \leq \sigma^2$, $\forall s\in [S]$.
\end{assump}	
%
% %
% \iffalse
% \begin{assump} (Bounded Gradient) \label{assump: BD}
%     The gradient of each objective at any client is bounded, i.e., there exists a constant $G>0$ such that $\| \nabla f_{s}(\x) \|^2 \leq G^2, \forall s \in [S]$.
% \end{assump}
% \fi
With these assumptions, we are now in a position to discuss the Pareto stationarity convergence of the \alg family.



\subsection{Pareto-Stationarity Convergence of \algns} \label{subsec: STIMULUS}


\textbf{1)~\algns: The Non-convex Setting.} First, we show that the basic \alg algorithm achieves an $\mathcal{O}(1/T)$ convergence rate for non-convex MOO problems in the following theorem.
Note that this result matches that of the deterministic MGD method.

\begin{restatable}[\alg for Non-convex MOO]{theorem}{STIMULUS_NonC}
\label{thm:STIMULUS_nonC}
% Choose a constant global learning rate
Under Assumption~\ref{assump: smooth}, let $\eta \leq  \frac{1}{2L}$, if at least one objective function $f_s(\cdot)$, $s \in [S]$ is bounded from below by $f_s^{\min}$, then the sequence $\{\x_t \}$ output by \alg satisfies: $\frac{1}{T}\sum_{t=0}^{T-1}\min_{\boldsymbol{\lambda} \in C} \mathbb{E} \| \boldsymbol{\lambda}^{\top} \nabla \F(\x_t) \|^2  =\mathcal{O}(1/T).$
\end{restatable}


Following from Theorem~\ref{thm:STIMULUS_nonC}, we immediately have the following sample complexity for the \alg algorithm by choosing $ q = |\mathcal{A}|=\lceil\sqrt{n}\rceil$: 

\begin{restatable}[Sample Complexity of \algns]{corollary}{STIMULUS_NCRate}
\label{cor:STIMULUS_NC}
By choosing $\eta \leq  \frac{1}{2L}, q = |\mathcal{A}|=\lceil\sqrt{n}\rceil$, the overall sample complexity of  \alg for finding an $\epsilon$-stationary point for non-convex MOO problems is $\mathcal{O}\left(\sqrt{n} \epsilon^{-1}+n\right)$. 
\end{restatable}

%\begin{remark}
Several interesting remarks regarding Theorem~\ref{thm:STIMULUS_nonC} and Corollary~\ref{cor:STIMULUS_NC} are in order:
%{\bf 1)} We note that \alg achieves an $\mathcal{O}(1/T)$ Pareto-stationarity convergence rate for non-convex MOO, which is the {\em same} as the Pareto stationary rate of MGD for MOO and the {\em same} convergence rate of gradient descent (GD) for single objective problems.
%
{\bf 1) } Our proof of \algns's Pareto-stationarity convergence only relies on standard assumptions commonly used in first-order optimization techniques. 
This is in stark contrast to prior research, where unconventional and hard-to-verify assumptions were required (e.g., an assumption on the convergence of $\x$-sequence is used in~\cite{fliege2019complexity}).
{\bf 2) } %Another significant advantage of \alg lies in its convergence rate compared to MGD. 
While both MGD and our methods share the same $\mathcal{O}(1/T)$ convergence rate, \alg enjoys a substantially lower sample complexity than MGD. 
More specifically, the sample complexity of \alg is reduced by a factor of $\sqrt{n}$ when compared to MGD. 
This becomes particularly advantageous in the ``big data'' regime where $n$ is large. 
 
%\end{remark}


{\bf 2) \algns: The Strongly Convex Setting.} 
Now, we consider the strongly convex setting, which is more tractable but still of interest in many learning problems in practice (e.g., multi-objective ridge regression).
In the strongly convex setting, we have the following additional assumption:
\begin{assump}[$\mu$-Strongly Convex Function]\label{assump: SC}
    Each objective $f_s(\x)$, $s \in [S]$, is a $\mu$-strongly convex function, i.e., $f_s(\y) \geq f_s(\x) + \nabla f_s(\x)^{\top} (\y - \x) + \frac{\mu}{2} \| \y - \x \|^2$, $\forall \x,\y \in \mathbb{R}^d$, for some $\mu >0$.
\end{assump}
%{\color{blue}
\begin{assump}\label{assump: add}
For any objective function $f_j$, $j \in [S]$, there exists a positive real number $c_j$ such that for any $\mathbf{x} \in \mathbb{R}^d$, the following relation holds:
$
f_j(\mathbf{x})-f_j\left(\mathbf{x}^*\right) \geq \frac{c_j}{2}\left\|\mathbf{x}-\mathbf{x}^*\right\|^2 \text{ a.s.}
$
\end{assump}
Assumption~\ref{assump: add} asserts that the function value increases at least quadratically as $\x$ moves away from $\x^*$, ensuring consistent progress towards the optimum. 
It is a reasonable assumption since it is also based on the strong convexity property.
%, which is a realistic and useful characteristic in many optimization problems. 
The above assumption has also been adopted in \cite{mercier2018stochastic}.
%}

For strongly convex MOO problems, the next result says that \alg achieves a much stronger expected linear Pareto-optimality convergence performance:

\begin{restatable}[\alg for $\mu$-Strongly Convex MOO] {theorem}{STIMULUS_SC}
\label{thm:STIMULUS_SC}
Under Assumptions~\ref{assump: smooth}, \ref{assump: SC}, and~\ref{assump: add}, let $\eta \leq  \min\{\frac{1}{2}, \frac{1}{2\mu},\frac{1}{8L},\frac{\mu}{64L^2} \}$ and $ q = |\mathcal{A}|=\lceil\sqrt{n}\rceil$.
Pick $\x_t$ as the final output of \alg with probability $w_t = ( 1 - \frac{3\mu \eta }{4})^{1-t}$.
Then, we have $\mathbb{E}\|\x_t-\x^*\|^2 \leq \| \x_0 - \x^* \|^2 \mu \exp( - \frac{3\eta \mu T}{4}).$
\end{restatable}

Further, Theorem~\ref{thm:STIMULUS_SC} immediately implies the following logarithmic sample complexity (in terms of $\epsilon$) for \alg with a proper choice of learning rate and $q = |\mathcal{A}|=\lceil\sqrt{n}\rceil$:
\begin{restatable}[Sample Complexity of \algns]{corollary}{vr_mooSCRate}
\label{cor:STIMULUS_SC}
By choosing $\eta \leq  \min\{\frac{1}{2}, \frac{1}{2\mu},\frac{1}{8L},\frac{\mu}{64L^2}\}, q = |\mathcal{A}|=\lceil\sqrt{n}\rceil$, the overall sample complexity of \alg for solving strongly convex MOO is $\mathcal{O}\left(n+ \sqrt{n} \ln ({\mu/\epsilon})\right)$.
\end{restatable}


%\begin{remark}
There are also several interesting insights from Theorem~\ref{thm:STIMULUS_SC} and Corollary~\ref{cor:STIMULUS_SC} regarding \algns's performance for solving strongly convex MOO problems: %. These insights are as follows:
{\bf 1)} \alg achieves an expected linear convergence rate of $\mathcal{O}(\mu \exp(-\mu T))$. 
Interestingly, this convergence rate matches that of MGD for strongly convex MOO problems as well as gradient descent for strongly convex single-objective optimization. 
%The linearity of \algns's convergence is a noteworthy attribute that underlines its effectiveness in tackling strongly convex MOO.
%
{\bf 2)} Another interesting feature of \alg for strongly convex MOO stems from its use of randomly selected outputs $\x_t$ along with associated weights $w_t$ from the trajectory of $\x_t$, which is inspired by the similar idea for stochastic gradient descent (SGD)~\citep{ghadimi2013stochastic}. 
Note that, for implementation in practice, one does not need to store all $\x_t$-values.
Instead, the algorithm can be implemented by using a random clock for stopping~\citep{ghadimi2013stochastic}.
%\end{remark}


\subsection{Pareto Stationarity Convergence of \algmns} \label{subsec: STIMULUS_m}

Next, we turn our attention to the Pareto stationarity convergence of the \algm algorithm.
Again, we analyze \algm in non-convex and strongly convex settings:

\begin{restatable}[\algm for Non-convex MOO] {theorem}{STIMULUS_M_NonC}
\label{STIMULUSM_NonC}
% Choose a constant global learning rate
Let $\eta_t = \eta \leq \min\{ \frac{1}{2L}, \frac{1}{2}\}, q = |\mathcal{A}|=\lceil\sqrt{n}\rceil$.
Under Assumption~\ref{assump: smooth}, if at least one objective function $f_s(\cdot)$, $s \in [S]$, is bounded from below by $f_s^{\min}$, then the sequence $\{\x_t \}$ output by \algm satisfies $\frac{1}{T}\sum_{t=0}^{T-1}\min_{\boldsymbol{\lambda} \in C} \mathbb{E} \| \boldsymbol{\lambda}^{\top} \nabla \F(\x_t) \|^2  =\mathcal{O}(\frac{1}{T}).$
\end{restatable}


Similar to the basic \alg algorithm, by choosing the appropriate learning rate and inner loop length parameters, we immediately have the following sample complexity result for \algm for solving non-convex MOO problems:

\begin{restatable}[Sample Complexity of \algmns]{corollary}{STIMULUS_M_NCRate}
\label{STIMULUS_M_NCRate}
By choosing $\eta_t = \eta \leq \min\{ \frac{1}{2L}, \frac{1}{2}\}, q = |\mathcal{A}|=\lceil\sqrt{n}\rceil$, the overall sample complexity of \algm for finding an $\epsilon$-stationary point for non-convex MOO problems is $\mathcal{O}\left(\sqrt{n} \epsilon^{-1}+n\right)$.
\end{restatable}


The next two results state the Pareto optimality and sample complexity results for \algmns:
\begin{restatable}[\algm for $\mu$-Strongly Convex MOO] {theorem}{STIMULUSm_SC}
\label{thm:STIMULUSm_SC}
Let $\eta \leq  \min\{\frac{1}{2},\frac{1}{2\mu},\frac{1}{8L},\frac{\mu}{64L^2} \}, q = |\mathcal{A}|=\lceil\sqrt{n}\rceil$.
Under Assumptions~\ref{assump: smooth}, \ref{assump: SC}, and~\ref{assump: add}, pick $\x_t$ as the final output of \algm with probability $w_t = ( 1 - \frac{3\mu \eta }{4})^{1-t}$.
Then, we have
$\mathbb{E}\|\x_t-\x^*\|^2 \leq \| \x_0 - \x^* \|^2 \mu \exp( - \frac{3\eta \mu T}{4}).$
\end{restatable}


\begin{restatable}[Sample Complexity of \algmns]{corollary}{vrm_mooSCRate}
\label{cor:STIMULUSm_SC}
By choosing $\eta \leq  \min\{\frac{1}{2},\frac{1}{2\mu},\frac{1}{8L},\frac{\mu}{64L^2} \}, q = |\mathcal{A}|=\lceil\sqrt{n}\rceil$, the overall sample complexity of \algm for solving strongly convex MOO is $\mathcal{O}\left(n+ \sqrt{n} \ln ({\mu/\epsilon})\right)$.
\end{restatable}


%\begin{remark}
%We note that \algm has the same theoretical convergence rate as that of \alg in both non-convex and strongly-convex optimization problems. 
We remark that the convergence rate upper bound of \algm is the same as that in Theorem~\ref{thm:STIMULUS_SC}, which suggests a potentially loose convergence upper bound in Theorem~\ref{thm:STIMULUSm_SC} due to the technicality and intricacies in analyzing momentum-based stochastic multi-gradient algorithms for solving non-convex MOO problems. 
Yet, we note that even this potentially loose convergence rate upper bound in Theorem~\ref{thm:STIMULUSm_SC} already suffices to establish a linear convergence rate for \algm in solving strongly convex MOO problems.
Moreover, we will show later in Section~\ref{sec:exp} that this momentum-assisted method significantly accelerates the empirical convergence speed performance.
%
It is also worth noting that there are two key differences in the proofs of Theorems~\ref{STIMULUSM_NonC} and~\ref{thm:STIMULUSm_SC} compared to those of the momentum-based stochastic gradient algorithm for single-objective non-convex optimization: 1) our proof exploits the martingale structure of the $\bu_t^s$. 
This enables us to tightly bound the mean-square error term $\mathbb{E}\left\|\nabla f_s\left(\x_t\right)-\bu_t^s\right\|^2$ under the momentum scheme. 
In contrast, in the traditional analysis of stochastic algorithms with momentum, this error term corresponds to the variance of the stochastic estimator and is typically assumed to be bounded by a universal constant. 
2) Our proof requires careful manipulation of the bounding strategy to effectively handle the accumulation of the mean-square error $\mathbb{E}\left\|\nabla f_s\left(\x_t\right)-\bu_t^s\right\|^2$ over the entire optimization trajectory in non-convex MOO. 
%\end{remark}

\begin{figure*}[t!]
    \centering
    \subfigure[Training loss convergence in terms of iterations.]{
        \includegraphics[width=0.24\textwidth]{compare_alg_taskL_mnist.pdf}
        \includegraphics[width=0.24\textwidth]{compare_alg_taskR_mnist.pdf}
    }
    \hfill
    \subfigure[Training loss convergence in terms of samples.]{
        \includegraphics[width=0.24\textwidth]{sample_taskL_mnist.pdf}
        \includegraphics[width=0.24\textwidth]{sample_taskR_mnist.pdf}
    }
    \caption{Training loss convergence comparisons between different MOO algorithms.}
    \label{fig:compare_mnist}
\end{figure*}


\subsection{Pareto Stationarity Convergence Results of \algp and \algmpns} \label{subsec: STIMULUSp_m}

% \begin{assump}(Bounded Variance)	\label{ass3}
% There exists a constant	$\sigma>0$ such that for all $\x\in \mathbb{R}^d$, $\|  \nabla_{\x}f_s(\x;\xi)- \nabla_{\x}f_s(\x)\|^2 \leq \sigma^2. \forall s\in S. $
% \end{assump}	

Next, we present the Pareto stationarity convergence and the associated sample complexity results of the \algpns/\algmp algorithms for non-convex MOO as follows:

\begin{restatable}[\algpns/\algmpns]{theorem}{STIMULUSm_P_NonC}
\label{thm:STIMULUSmp_nonC}
% Choose a constant global learning rate
Let $\eta \leq \min\{ \frac{1}{4L}, \frac{1}{2}\},q = |\mathcal{A}|=\lceil\sqrt{n}\rceil$. 
By choosing $c_\gamma$ and $c_\epsilon$ such that $c_\gamma \geq 8$ and $c_{\epsilon}\geq \eta$,
under Assumptions~\ref{assump: smooth} and \ref{ass3}, if at least one function $f_s(\cdot)$, $s \in [S]$ is bounded from below by $f_s^{\min}$, then the sequence $\{\x_t \}$ output by \algpns/\algmp satisfies:
$\frac{1}{T}\sum_{t=0}^{T-1}\min_{\boldsymbol{\lambda} \in C} \mathbb{E} \| \boldsymbol{\lambda}^{\top} \nabla \F(\x_t) \|^2   =\mathcal{O}(\frac{1}{T}).$
\end{restatable}

\begin{restatable}[Sample Complexity]{corollary}{STIMULUS_P_NCRate}
\label{STIMULUS_Mp_NCRate}
By choosing $\eta \leq \min\{ \frac{1}{4L}, \frac{1}{2}\},q = |\mathcal{A}|=\lceil\sqrt{n}\rceil$, $c_\gamma \geq 8$, and $c_{\epsilon}\geq \eta$, the overall sample complexity of \algpns/\algmp for finding an $\epsilon$-stationary point for non-convex MOO problems is $\mathcal{O}\left(\sqrt{n} \epsilon^{-1}+n\right)$.
\end{restatable}

%Lastly, we state the Pareto optimality convergence and the associated sample complexity of \algpns/\algmp for strongly convex MOO problems as follows:

\begin{restatable}[\algpns/\algmpns] {theorem}{STIMULUSP_SC}
\label{thm:STIMULUSP_SC}
Let $\eta \leq \min\{\frac{1}{2},\frac{1}{2\mu},\frac{1}{8L},\frac{\mu}{64L^2} \},c_{\gamma}\geq \frac{8\mu}{\eta}, c_{\epsilon}\geq \frac{\mu}{2}, q = |\mathcal{A}|=\lceil\sqrt{n}\rceil$.
Under Assumptions~\ref{assump: smooth}--\ref{assump: add}, pick $\x_t$ as the final output of the \algpns/\algmp algorithm with weights $w_t = ( 1 - \frac{3\mu \eta }{4})^{1-t}$.
Then, it holds that $\mathbb{E}\|\x_t-\x^*\|^2 \leq \| \x_0 - \x^* \|^2 \mu \exp( - \frac{3\eta \mu T}{4}).$

\end{restatable}



\begin{restatable}[Sample Complexity]{corollary}{vr_moomSCRate}
\label{cor:STIMULUSmp_SC}
By choosing $\eta \leq \min\{\frac{1}{2},\frac{1}{2\mu},\frac{1}{8L},\frac{\mu}{64L^2} \},c_{\gamma}\geq \frac{8\mu}{\eta}, c_{\epsilon}\geq \frac{\mu}{2}, q = |\mathcal{A}|=\lceil\sqrt{n}\rceil$,
the overall sample complexity of  \algpns/ \algmp for solving strongly-convex MOO is $\mathcal{O}\left(n+ \sqrt{n} \ln ({\mu/\epsilon})\right)$.
\end{restatable}



%\begin{remark}
We note that, although the theoretical sample complexity bounds of \algpns/\algmp are the same as those of \algns/\algmns, respectively, the fact that \algp and \algmp do not need full multi-gradient evaluations implies that \algpns/\algmp use significantly fewer samples than \algns/\algm in the large dataset regime. 
Our experimental results in the next section will also empirically confirm this.
%\end{remark}

