% !TEX root = main.tex


\section{Preliminaries} \label{sec: prelim}






To facilitate subsequent technical discussions, in this section, we first provide a primer on MOO fundamentals and formally define the notions of Pareto optimality/stationarity, $\epsilon$-stationarity in MOO, and the associated sample complexity.
Then, we will give an overview of the most closely related work in the MOO literature, thus putting our work into a comparative perspective.

\textbf{Multi-objective Optimization: A primer.}
As introduced in Section~\ref{sec: intro}, MOO aims to optimize multiple objectives in Eq.~\eqref{eq: moo} simultaneously.
%, leading to challenges in the fair comparison of two solutions, denoted as $\x$ and $\y$. It is highly probable that one solution outperforms the other in certain objectives, while the reverse holds true for different objectives, specifically, $f_i(\x) < f_i(\y)$ and $f_j(\x) > f_j(\y)$ for certain $i, j \in [S]$.
%
%In other words, the existence of a universally superior "optimal solution" that dominates all other solutions across every single objective is unlikely. Thus, conflicts among the objective functions in the MOO problem presented in \eqref{eq: moo} necessitate the concept of Pareto optimality. 
However, since in general there may not exist a single solution $\x$ that minimizes all objectives at the same time in MOO, the more appropriate notion of optimality in MOO is the so-called \emph{Pareto optimality}, which is formally defined as follows:

\begin{defn}[(Weak) Pareto Optimality]
\label{def:weakPareto}
Given two solutions $\x$ and $\y$, $\x$ is said to dominate $\y$ if and only if $f_s(\x) \leq f_s(\y), \forall s \in [S]$, and there exists at least one objective $f_{s'}$, $s' \in [S]$, for which $f_{s'}(\x) < f_{s'}(\y)$.
A solution $\x_*$ is Pareto optimal if no other solution dominates it.
A solution $\x$ is defined as weakly Pareto optimal if there is no solution $\y$ for which $f_s(\x) > f_s(\y), \forall s \in [S]$.
\end{defn}

Finding a Pareto-optimal solution in MOO is at least as hard as solving a single-objective non-convex optimization problem and is thus NP-hard in general.
Consequently, practical efforts in MOO often aim to find a solution that meets the weaker notion called Pareto-stationarity (a necessary condition for Pareto optimality), which is defined as follows~\cite{fliege2000steepest,miettinen2012nonlinear}:

\begin{defn} [Pareto Stationarity] \label{defn:ParetoStationarity}
A solution $\x$ is Pareto-stationary if no common descent direction $\bd \in \mathbb{R}^d$ exists such that $\nabla f_s(\x)^{\top} \bd < 0, \forall s \in [S]$.
\end{defn}
Note also that in the special setting with strongly convex objective functions, Pareto-stationary solutions are Pareto-optimal.
Following directly from Pareto-stationarity in Definition~\ref{defn:ParetoStationarity}, gradient-based MOO algorithms strive to find a common descent (i.e., improving) direction $\bd \in \mathbb{R}^d$, such that $\nabla f_s(\x)^{\top} \bd < 0, \forall s \in [S]$.
If such a direction does not exist at $\x$, then $\x$ is Pareto-stationary. % according to Definition~\ref{defn:ParetoStationarity}. 
Toward this end, the MGD method~\citep{desideri2012multiple} identifies an optimal weight $\boldsymbol{\lambda}^*$ for the multi-gradient set $\nabla \F(\x) \triangleq \{ \nabla f_s(\x), \forall s \in [S] \}$ by solving $\boldsymbol{\lambda}^*(\x) \in \operatorname*{argmin}_{\boldsymbol{\lambda} \in C} \| \boldsymbol{\lambda}^{\top} \nabla \F(\x) \|^2$, where $C$ is the probability simplex over the $S$ objectives (formally defined below). Consequently, the common descent direction can be defined as $\bd = \boldsymbol{\lambda}^*(\x)^{\top} \nabla \F(\x)$.
Then, MGD follows the iterative update rule $\x \leftarrow \x - \eta \bd$ in the hope that a Pareto-stationary point can be reached, where $\eta$ signifies a learning rate. 
SMGD~\cite{liu2021stochastic} follows a similar approach, but with full multi-gradients being replaced by stochastic multi-gradients. 
For both MGD and SMGD, it has been shown that if $\| \boldsymbol{\lambda}^{\top} \nabla \F(\x) \| = 0$ for some $\boldsymbol{\lambda} \in C$, where $C \triangleq \{ \y \in [0, 1]^S, \sum_{s \in [S]} y_s = 1 \}$, then $\x$ is a Pareto-stationary solution~\cite{fliege2019complexity,zhou2022on}.
% \end{remark}

Here, it is insightful to contrast vector-valued MOO with the linear scalarization method with fixed weights for MOO, which is also a relatively straightforward approach commonly seen in the MOO literature. We note that vector-valued MOO offers unique benefits that do not exist in linear scalarization. 
Specifically, MGD-type methods for vector-valued MOO dynamically calculate the weights for each objective based on the gradient information in each iteration. 
The dynamic weighting in MGD-type approaches adapts much better to the landscapes of different MOO problems, which enables a much more flexible exploration of the Pareto front.
In contrast, the linear scalarization method uses fixed or pre-defined weights for each objective.
As a result, linear scalarization methods are limited to identifying the convex portions of the Pareto front~\citep{boyd2004convex,ehrgott2005multicriteria}, whereas (stochastic) multi-gradient methods, including our proposed VR-based algorithms, have the capability to uncover the Pareto front, including its non-convex portions.
%Essentially, this represents a distinct advantage for all multi-gradient algorithms over linear scalarization methods. 
%This paper contributes by demonstrating that variance reduction can significantly enhance the complexity of stochastic multi-gradient methods by improved convergence.



In this paper, we focus on MOO problems in two settings: (i) non-convex MOO and (ii) strongly convex MOO.
Clearly, the non-convex setting is applicable to many learning problems in practice (e.g., neural network models).
The strongly convex setting is also interesting due to many applications in practice (e.g., linear models with quadratic regularizations).

Next, to introduce the notion of sample complexity in MOO, we first need the following definitions for the non-convex and strongly convex settings, respectively.
%of $\epsilon$-stationarity:
%{\color{blue} 
\begin{defn}[$\epsilon$-Stationarity (Nonconvex Setting)] \label{def:stationary}
%In nonconvex case, 
In the non-convex setting, a solution $\x$ of the MOO problem is $\epsilon$-stationary if the common descent direction at $\x$ satisfies the following condition: $\min_{\boldsymbol{\lambda} \in C} \mathbb{E} \| \boldsymbol{\lambda}^{\top} \nabla \F(\x) \|^2 \leq \epsilon$,
%$\mathbb{E}\|\sum_{s\in [S]}\lambda_t^s\nabla f_s(\x_t)\|^2 \leq \epsilon$
where $C \triangleq \{ \y \in [0, 1]^S, \sum_{s \in [S]} y_s = 1 \}$.
\end{defn}
%}



%{\color{blue} 
\begin{defn}[$\epsilon$-Optimality (Strongly-Convex Setting)]
\label{def:optimality}
In the strongly-convex setting, a solution $\x$ of the MOO problem is $\epsilon$-optimal if $\mathbb{E}[\|\x-\x^*\|^2]\leq \epsilon$, where $\x^*$ is a Pareto-optimal solution of Problem~\eqref{eq: moo}.
%, where $ \mathbf{x}_* \in \underset{\mathbf{w} \in \mathcal{P}_D^{\star}}{\operatorname{argmin}}\{\|\mathbf{x}-\mathbf{w}\|\} .$
%The notation $\mathcal{P}_D^{\star}$  denotes the Pareto solution set (resp. the Pareto front). For any $\mathbf{x} \in \mathbb{R}^n$ the notation $\mathbf{x}_*$ will denote an element of the Pareto set which minimizes the distance between the point $\mathbf{x}$ and a point of the Pareto set $\mathcal{P}_D^{\star}$.
\end{defn}
%}
%\begin{remark}

% {\color{blue}
% % We note that the quantity $\min_{\boldsymbol{\lambda} \in C} \mathbb{E} \| \boldsymbol{\lambda}^{\top} \nabla \F(\x) \|^2 \leq \epsilon$   can be used as a metric for evaluating the convergence speed of MOO algorithms in the non-convex setting~\citep{fliege2019complexity,zhou2022on,fernando2022mitigating}. In multi-objective optimization, this can be viewed as a measure of how close the current solution is to being optimal across all objectives.  
% One remark on Definition~\ref{def:optimality} is in order.
% % So far for strongly convex MOO problems, %the optimality gap $\sum_{s \in [S]} \lambda_s \left[ f_s(\x) - f_s(\x^*) \right]$ is usually used as the convergence metric~\citep{liu2021stochastic}, where $\x^*$ denotes the Pareto-optimal point. 
% % %we also want to point out that, due to research on MOO is still in its infancy, 
% % there remains no consensus on the definition of Pareto-stationarity that is universally adopted in the MOO literature. 
% % %The condition in Definition \ref{def:stationary} is proposed by us, which is also part of the novelty of this paper. 
% % Also, it is worth noting that several existing papers, including \citep{fliege2019complexity,yang2023federated}, employed similar metrics, rendering our results directly comparable to theirs. 
% %Additionally, we would also like to note that 
% By using an additional assumption similar to  \citep[Assumption~5.6]{liu2021stochastic} instead of Assumption. \ref{assump: add} in our paper, we can achieve sublinear convergence rates for the metric $\mathbb{E}\|\x-\x^*\|^2]$. In contrast, our methods demonstrate superior convergence results with linear convergence rates. Besides, Assumption 5.6 ($\nabla_{\x} S\left(\x_*, \boldsymbol{\lambda}_t\right)^{\top}\left(\x_t-\x_*\right) \geq 0, S(\x, \boldsymbol{\lambda_t})=\sum_{s=1}^{S} \lambda_t^s f_s(\x)$) is not without its limitation: Its validity heavily depends on the choice of the scalarization function $S$. If $S$ does not adequately capture the tradeoffs or interactions between different objectives, the assumption might lead to misleading conclusions about the nature of the optimization landscape.
% }
%\end{remark}
\iffalse
\begin{assump}
 Let $\x_*$ be the Pareto optimal defined in Definition \ref{def:weakPareto}. Let $S(x, \lambda)=\sum_{i=1}^{
|S|} \lambda_i f_i(x)$ denote the weighted true function and $\nabla_x S(x, \lambda)=\sum_{i=1}^{
|S|} \lambda_i \nabla f_i(x)$ the corresponding gradient. For any $\x_t$, one has
$$
\nabla_{\x} S\left(\x_*, \lambda_t\right)^{\top}\left(\x_t-\x_*\right) \geq 0
$$
\end{assump}

In fact, notice that $\nabla_{\x} S\left(\x_*, \lambda_*\right)=0$ holds according to the Pareto stationarity condition in Definition \ref{defn:ParetoStationarity}, and thus this assumption would hold with $\lambda_t$ replaced by $\lambda_*$.

{\color{blue} 
\begin{defn}[$\epsilon$-optimality in strongly-convex case] 
%$\mathbb{E}[\sum_{s \in [S]} \lambda_t^s \left[ f_s(\x) - f_s(\x_*) \right] ]\leq \epsilon$
$\mathbb{E}[\|\x_t-\x^*\|^2]\leq \epsilon$ in strongly-convex MOO problems. A solution $\mathbf{\x} _ *$ is considered Pareto-optimal if there is no other feasible solution that would improve one objective without causing at least one other objective to worsen.
%{\color{red} $\epsilon$-stationarity: $\x$ is $\epsilon$-stationarity for a non-convex function if $\min_{\boldsymbol{\lambda} \in C} \mathbb{E} \| \boldsymbol{\lambda}^{\top} \nabla \F(\x) \|^2 \leq \epsilon$. 
% Check fundamental MOO paper. define them separately.(stationarity and optimality. )  since there is no well-defined metric}
\end{defn}
}
\fi





With the above definitions, we are now in a position to define the concept of sample complexity in MOO as follows:

\begin{defn} [Sample Complexity] The sample complexity in MOO is defined as the total number of incremental first-order oracle (IFO) calls required by an MOO algorithm to converge to an $\epsilon$-stationary (or $\epsilon$-optimal in the strongly convex setting) point, where one IFO call evaluates the multi-gradient $\nabla_{\mathbf{x}} f_{sj}(\mathbf{x};\xi_{sj})$ for all tasks $s \in [S]$. \end{defn}

\iffalse
\textbf{2)~Overview of MOO Algorithms:}
In the literature, 
MOO algorithms can be classified into two primary categories. 
The first category is usually referred to as gradient-free methods.
Typical gradient-free methods include evolutionary MOO algorithms and Bayesian MOO algorithms~\citep{zhang2007moea,deb2002fast,belakaria2020uncertainty,laumanns2002bayesian}. These techniques are suitable for small-scale problems but inefficient in solving high-dimensional MOO models (e.g., deep neural networks). 
In contrast, the second class is gradient-based MOO methods~\citep{fliege2000steepest,desideri2012multiple,fliege2019complexity,peitz2018gradient,liu2021stochastic}, which have shown to be more effective in solving high-dimensional MOO problems. 
As discussed in Section~\ref{sec: intro}, the most notable gradient-based MOO algorithms include multi-gradient descent (MGD)~\citep{fliege2019complexity} and stochastic multi-gradient descent (SMGD)~\citep{liu2021stochastic}, which achieve $\mathcal{O}(1/T)$ and $\mathcal{O}(1/\sqrt{T})$ convergence rates, respectively.
% Most notably, multi-gradient descent (MGD) algorithms~\cite{fliege2019complexity} utilize full gradients and achieve specific convergence rates under unconventional conditions, but with challenges in practical verification.
% Stochastic Multi-Gradient Descent (SMGD) methods add complexity due to stochastic gradient noise, with some researchers providing an $O(1/T)$ rate analysis for SMGD~\cite{liu2021stochastic}. 
Although SMGD is easier to implement in practice thanks to the use of stochastic multi-gradient, it has been shown that the noisy common descent direction in SMGD could potentially cause divergence (cf. the example in Sec.~4 in \citep{zhou2022on}).
% {\color{blue} 
There have also been recent works on using momentum-based methods for bias mitigation in MOO, and these methods are further applied to bilevel optimization problems~\citep{zhou2022on,fernando2022mitigating}.
% }
Note, however, that the $\mathcal{O}(1/\sqrt{T})$ convergence rates of \citep{zhou2022on,fernando2022mitigating} remain unsatisfactory compared to the $\mathcal{O}(1/T)$ convergence rate of our \alg algorithm family.
%, the convergence rate results of \citep{zhou2022on,fernando2022mitigating} are not directly comparable.
%These advancements, although significant, are orthogonal to the current study and thus not directly comparable.
For easier comparisons, we summarize the state-of-the-art gradient-based MOO algorithms and their convergence rate results under non-convex and strongly convex settings in Table~\ref{tab}. 
% With full multi-gradient, MGD~\citep{fliege2019complexity} achieves an $\mathcal{O}(1/T)$ convergence rate but requires a linear search of the learning rate and sequence convergence. \kevin{What is ``sequence convergence''?} 
% In contrast, stochastic gradient further complicates the case in SMGD methods. Recent proposals such as MoCo~\cite{fernando2022mitigating} and CR-MOGM~\cite{zhou2022on} offer convergence guarantees with momentum utilization, but these do not shed light on pure SMGD despite its widespread application.




\fi
% \textbf{Motivation applications}
% With the basics of multi-objective optimization, we present two examples to further illustrate the practical relevance and benefits of our proposed algorithm \alg and its variants:

% \begin{list}{\labelitemi}{\leftmargin=1em \itemindent=-0.09em \itemsep=.2em}



% \item  {\em Motivation application on MOO (Multi-label learning to rank):}

% The multi-label learning to rank (MLLTR) problem\cite{mahapatra2023multi} plays a crucial role in various fields like information retrieval, recommendation systems, and natural language processing. The challenge lies in ranking items based on relevance to multiple criteria, rather than a single objective. In a multi-objective framework, each label can be treated as a separate objective that needs to be optimized. Thus, the problem of ranking items based on multiple labels can be transformed into a problem of finding a solution that optimizes several objectives simultaneously. The multi-objective formulation aims to find a ranking function $f$ that minimizes (or maximizes) a vector of objective functions $\mathbf{F} = \{f_1, f_2, \ldots, f_S\}$, where each $f_i$ corresponds to a specific label and is defined based on the relevance of items to that label. The resulting optimization problem can be formally defined as:
% \begin{align}
% \min_{\mathbf{x}} \quad & \mathbf{F}(\mathbf{x}) = \left[ f_1(\mathbf{x}), f_2(\mathbf{x}), \ldots, f_S(\mathbf{x}) \right] \notag\\
% \text{subject to} \quad & \mathbf{x} \in \mathcal{X}, \notag
% \end{align}
% where $\mathbf{x}$ represents the parameters of the ranking function, and $\mathcal{X}$ denotes the feasible region.



% \item  {\em Motivation on utilizing the variance reduction technique in MOO(An illustrative example):}

% We would like to provide a simple problem instance in which stochastic algorithms can not converge to Pareto optimal\cite{zhou2022convergence}. Consider the following two-dimensional stochastic optimization setting over the domain $\mathcal{K}=\{(x_1, x_2) \mid x_1 \in[-1,1], x_2 \geq 0\}$, where $p$ represents the probability of occurrence
% $$
% f_1(x_1, x_2)=\left\{\begin{array}{ll}
% (x_1+1)^2+(x_2-2)^2 & p=0.5\\
% (x_1+5)^2+(x_2+2)^2 & p=0.5
% \end{array}, f_2(x_1, x_2)= \begin{cases}(x_1-1)^2+(x_2-2)^2 & p=0.5 \\
% (x_1-5)^2+(x_2+2)^2 & p=0.5\end{cases}\right.
% $$
% The expected function is $f_1(x_1, x_2)=(x_1+3)^2+x_2^2+8,  f_2(x_1, x_2)=(x_1-3)^2+x_2^2+8$. We can see that the optimization goal is to minimize the distance towards $(-3,0)$ and $(3,0)$ simultaneously. The Pareto set for this problem is a line segment: $\{(a, 0) \mid a \in[-1,1]\}$.
% However, when we calculate the expected gradient at the point $(0,0.4)$, all the gradient-based techniques indicate an upward vertical movement, which moves the algorithm in the wrong direction away from Pareto optimal setting. Even more concerning is that the expected direction, 
% $\mathbb{E}\left[\bd_t\right]=(0, \epsilon), \epsilon>0$, goes against the gradients of both objectives. This implies it works counter-productively for both goals.
% Given that the stochastic optimization algorithms such as SMGD, MoCo, and CR-MOO may fail to converge to the Pareto optimal solution, we are prompted to explore the use of variance reduction techniques in addressing multi-objective optimization challenges in this paper.





% \end{list}



%











