% !TEX root = main.tex

\section{Introduction} \label{sec: intro}


{\bf 1) Background of multi-objective learning:}
%Since its inception as a discipline, machine learning (ML) has heavily relied on optimization formulations and algorithms.
Machine learning (ML) has always heavily relied on optimization formulations and algorithms.
While traditional ML problems generally focus on minimizing a single loss function, many emergent complex-structured multi-task ML problems require balancing {\em multiple} objectives that are often conflicting (e.g., multi-agent reinforcement learning~\citep{parisi2014policy}, multi-task fashion representation learning~\citep{jiao2022fine,jiao2023learning}, multi-task recommendation system~\citep{chen2019co,zhou2023multi}, multi-model learning in video captioning~\citep{pasunuru-bansal-2017-multi}, and multi-label learning-to-rank~\citep{mlltr2023kdd,querymlltr2023kdd}).
Such ML applications necessitate solving {\em multi-objective} optimization (MOO) problems, which can be expressed as:
\begin{small}
    \begin{align} \label{eq: moo}
\min_{\x \in \mathcal{D}} \F(\x) := [f_1(\x), \cdots, f_S(\x) ],
\end{align}
\end{small}
where $\x \in \mathcal{D} \subseteq \mathbb{R}^d$ denotes the model parameters.
Here, each $f_s$ denotes the objective function of task $s \in [S]$,
%{\color{blue}
$f_s(\x)= \frac{1}{n}\sum_{j=1}^n f_{sj} (\x ; \xi_{sj})$, where $n$ denotes the total number of samples and $\xi_{sj}$ denotes the $j$-th sample for task $s$.
%}
%
% These MOO problems arise in numerous fields such as engineering design~\citep{dhiman2020mosse}, finance~\citep{el2020finance}, healthcare~\citep{fathollahi2021multi}, and safe reinforcement learning~\citep{thomas2021multi}, to name a few.
%
%The goal of MOO is to optimize all of the objective functions simultaneously without sacrificing any individual objective. 
However, unlike traditional single-objective optimization, there may not exist a common $\x$-solution in MOO that can simultaneously minimize all objective functions.
Instead, a more relevant optimality criterion in MOO is the notion of \textit{Pareto-optimal solutions}, where no objective can be further improved without sacrificing other objectives.
Moreover, in settings where the objective functions are non-convex, searching for Pareto-optimal solutions is intractable in general.
In such scenarios, the goal of MOO is usually weakened to finding a {\em Pareto-stationary solution}, where no improving direction exists for any objective without sacrificing other objectives.
% The central challenge in MOO lies in discovering a Pareto-optimal set of solutions, which balance the conflicting objectives. 
%


%


\begin{table*}[t!]
\centering
\begin{scriptsize}
\begin{threeparttable}
\caption{Convergence comparisons between MOO algorithms, where $n$ is the size of dataset; $\epsilon$ is the convergence error. Our proposed algorithms are marked in a shaded background.}
\label{tab}
\renewcommand{\arraystretch}{1.2}
\begin{tabular}{cccccc}
\toprule
\multirow{2}{*}{Algorithm} & \multirow{2}{*}{Multi-gradient} &  \multicolumn{2}{c}{Non-convex case} & \multicolumn{2}{c}{Strongly-Convex case} \\
\cmidrule(r){3-4} \cmidrule(l){5-6}
& & Rate & Sample Complexity & Rate & Sample Complexity \\
\midrule
  MGD~\citep{fliege2019complexity} & Deterministic & $\mathcal{O}\left(T^{-1}\right)$ & $\mathcal{O}\left(n \epsilon^{-1}\right)$ & $\mathcal{O}(\exp(-\mu T))$ & $\mathcal{O}\left( n\ln ({\mu/\epsilon})\right)$ \\
 \midrule 
SMGD~\citep{yang2022pareto} & Stochastic&$\mathcal{O}\left({{T^{-{1/2}}}}\right)$ & $\mathcal{O}\left(\epsilon^{-2}\right)$ & $\mathcal{O}\left(T^{-1}\right)$ & $\mathcal{O}\left( \epsilon^{-1}\right)$ \\\midrule MoCo~\citep{fernando2022mitigating} &Stochastic & $\mathcal{O}\left({{T^{-{1/2}}}}\right)$ & $\mathcal{O}\left(\epsilon^{-2}\right)$ & $\mathcal{O}\left(T^{-1}\right)$ & $\mathcal{O}\left( \epsilon^{-1}\right)$ 
\\\midrule MoCo+~\citep{10446038} &Stochastic & $\mathcal{O}\left({{T^{-{2/3}}}}\right)$ & $\mathcal{O}\left(\epsilon^{-{1.5}}\right)$ & - & -\\\midrule
 CR-MOGM~\citep{zhou2022on} &Stochastic &$\mathcal{O}\left({{T^{-{1/2}}}}\right)$ & $\mathcal{O}\left(\epsilon^{-2}\right)$ & $\mathcal{O}\left(T^{-1}\right)$ & $\mathcal{O}\left( \epsilon^{-1}\right)$ \\
\midrule
\arrayrulecolor{gray!20}
\rowcolor{gray!20}\textbf{ \algns/ \algm } &  Stochastic & {$\mathcal{O}\left(T^{-1}\right)$ }& {$\mathcal{O}\left(n+\sqrt{n}\epsilon^{-1}\right)$} & {$\mathcal{O}(\exp(-\mu T))$} & {$\mathcal{O}\left(n+ \sqrt{n} \ln ({\mu/\epsilon})\right)$} \\ \midrule
\rowcolor{gray!20}  \textbf{\algpns/ \algmp }& Stochastic  & {$\mathcal{O}\left(T^{-1}\right)$ }& {$\mathcal{O}\left(n+\sqrt{n}\epsilon^{-1}\right)$} & {$\mathcal{O}(\exp(-\mu T))$} & {$\mathcal{O}\left(n+ \sqrt{n} \ln ({\mu/\epsilon})\right)$} \\
\arrayrulecolor{black}
\bottomrule
\end{tabular}
\end{threeparttable}
\end{scriptsize}

\end{table*}

{\bf 2) Motivating application: Multi-label learning to rank (MLLTR) problem.} 
Problem~\eqref{eq: moo} can be applied to a number of interesting real-world problems. Here, we provide one concrete example to further motivate its practical relevance:

Learning to Rank (LTR) is a common technique used to rank information based on relevance, but it often struggles with ambiguity because of the noisy nature of human-generated data, such as product ratings. To tackle this, Multi-Label Learning to Rank (MLLTR) offers a more refined approach. MLLTR addresses the inherent challenges of traditional LTR methods by integrating multiple relevance criteria into the ranking model. This allows for a more comprehensive representation of diverse crucial objectives.

\begin{list}{\labelitemi}{\leftmargin=0.5em \itemindent=-0.2em \itemsep=-0.2em}
	%
	\item {\em Learning to Rank:} 
Let $A$ be the training set, consisting of pairs $(\mathbf{a},\mathbf{b})$, where $\mathbf{a}$ is a list of feature vectors $\mathbf{a}_i \in \mathbb{R}^d$, $i = 1, \ldots, n$, and $\mathbf{b}$ is the corresponding list of relevance labels $b_i$. We note that the lists $\mathbf{a}$ within the training set may not all be of the same length. The vector $\mathbf{x}$ denotes the model parameters.

The goal of the learning-to-rank problem is to find a scoring function $f$ that optimizes a chosen Information Retrieval (IR) metric, such as Normalized Discounted Cumulative Gain (NDCG), on the test set. The scoring function $f$ is trained to minimize the mean of a surrogate loss $l$ across the training data:
$
f_{single}(\mathbf{x}) = \frac{1}{|A|} \sum_{(\mathbf{a}, \mathbf{b}) \in A} l( f(\mathbf{x};{\mathbf{a}}), \mathbf{b}).
$


	\item {\em Multi-label Learning to Rank:} Learning to Rank from multiple relevance labels. In the problem of Multi-label learning to rank (MLLTR), different relevance criteria are measured, providing multiple labels for each feature vector $\mathbf{a}_i\in \mathbb{R}^d$. The goal of MLLTR is still the same as that of LTR, which is to learn a scoring function $f(\x;\mathbf{a})$ that assigns a scalar value to each feature vector $\mathbf{a}_i\in \mathbb{R}^d$.
Here, we consider a set of training examples denoted by $ \mathbf{a}_i \in \mathbb{R}^d$, where $ i = 1, \ldots, n$. Associated with each training example $ \mathbf{a}_i $ is a vector of class labels:
$
\mathbf{b}_i = \left({b}_i^1, \ldots, {b}_i^K\right),$
indicating the labels of $\mathbf{a}_i $. Here, $ K $ is the total count of possible labels. In the multi-label learning to rank problem, the objective is to construct $ K $ distinct classification functions:
$
f_k({\x}): \mathbb{R}^d \rightarrow \mathbb{R}, \text{ for } k = 1, \ldots, K,
$
each tailored to a specific label.



In MLLTR, the cost is a vector-valued function: $f({\x}) = [f_1({\x}), f_2({\x}), \ldots, f_K({\x})]$, naturally making it an MOO problem.
\end{list}

In the search ranking domain, the objective is to rank search results based on their relevance to user queries and other factors such as popularity, user feedback, and conversion rates. The loss function in search ranking not only considers relevance but also takes into account various performance metrics, such as click-through rates (CTR), dwell time, or conversion rates~\citep{lyu2020deep,yang2020empirically,xiao2020deep}. The goal is to optimize the ranking of search results to maximize user satisfaction and engagement. Common loss functions used in search ranking include pairwise ranking loss~\citep{kumar2020deep,jing2019deep,wang2021pairwise}, listwise loss~\citep{revaud2019learning,yu2019wassrank}, or evaluation metrics like normalized discounted cumulative gain (NDCG)~\citep{bruch2019analysis} or mean average precision (MAP)~\citep{revaud2019learning}. These loss functions aim to capture the overall quality of the search ranking by considering both relevance and performance metrics.

The multi-label learning to rank problem typically involves a larger number of labels, which increases the dimensionality of the output space. This higher dimensionality often necessitates a greater number of samples to accurately train models, resulting in increased sample complexity. Therefore, this motivates us to propose a new family of algorithms for low sample complexity and fast convergence rates.

 

{\bf 3) Related works and motivation:} 
 To date, existing MOO algorithms in the literature can be generally categorized as gradient-free and gradient-based methods.
Typical gradient-free methods include evolutionary MOO algorithms and Bayesian MOO algorithms~\citep{zhang2007moea,deb2002fast,belakaria2020uncertainty,laumanns2002bayesian}. These techniques are suitable for small-scale problems but inefficient in solving high-dimensional MOO models (e.g., deep neural networks). 
Notably, gradient-based methods have attracted increasing attention recently due to their stronger empirical performances.
Specifically, in a similar vein to (stochastic) gradient descent methods for single-objective optimization, (stochastic) multi-gradient descent (MGD/SMGD) algorithms have been proposed in~\citep{fliege2019complexity,fernando2022mitigating,zhou2022on,liu2021stochastic}.
The basic idea of MGD/SMGD is to iteratively update the $\x$-variable following a common descent direction for all the objectives through a time-varying convex combination of (stochastic) gradients of all objective functions.
%While MGD and SMGD share similar algorithmic procedures, they exhibit distinct characteristics and application domains. 
Although MGD-type algorithms enjoy a fast $\mathcal{O}(1/T)$ convergence rate ($T$ denotes the number of iterations) in finding a Pareto-stationary solution, their $\mathcal{O}(n)$ per-iteration computation complexity in full multi-gradient evaluations becomes prohibitive when the dataset size $n$ is large.
%Further, in finding an $\epsilon$-stationary point for non-convex MOO (typical in ML), the high overall $\mathcal{O}(n\epsilon^{-1})$ sample complexity of MGD-type methods is not acceptable when $n$ is large.
%employ full gradients of all objective functions, hence enjoying a faster convergence rate but at the expense of higher per-iteration sample complexity.
As a result, SMGD-type algorithms are often more favored in practice thanks to the lower per-iteration computation complexity in evaluating stochastic multi-gradients.
However, due to the noisy stochastic multi-gradient evaluations, SMGD-type algorithms typically exhibit a slow $\mathcal{O}(1/\sqrt{T})$ convergence rate, which also induces a high $\mathcal{O}(\epsilon^{-2})$ sample complexity.
%When employing large models, especially in deep learning, SMGD and its stochastic gradient-based variants prove to be superior choices. 
%Exacerbating the problem is the fact that, due to the complex coupling algorithmic structure between multiple objectives, SMGD-type methods are prone to divergence problems, particularly in scenarios with small batch and high variance~\citep{liu2021stochastic,zhou2022convergence}.
Although SMGD is easier to implement in practice thanks to the use of stochastic multi-gradient, it has been shown that the noisy common descent direction in SMGD could potentially cause divergence (cf. the example in Sec.~4 in \citep{zhou2022on}).
% {\color{blue} 
There have also been recent works on using momentum-based methods for bias mitigation in MOO, such as MoCo~\citep{fernando2022mitigating}, MoCo+~\citep{10446038}, and CR-MOGM~\citep{zhou2022on}.
% }
%Note, however, that the $\mathcal{O}(1/\sqrt{T})$ convergence rates of \citep{zhou2022on,fernando2022mitigating} remain unsatisfactory. 
For easier comparisons, we summarize the state-of-the-art gradient-based MOO algorithms and their convergence rate results under non-convex and strongly convex settings in Table~\ref{tab}. We note that given the limited research on finite-sum multi-objective optimization, we included broader comparisons. 



In light of these major limitations of SMGD-type algorithms, a fundamental question naturally emerges:

\begin{tcolorbox}[left=1.2pt,right=1.2pt,top=1.2pt,bottom=1.2pt]
 %\begin{center}
\textbf{(Q)}: Is it possible to develop fast-convergent stochastic MOO algorithms in the sense of matching the convergence rate of deterministic MGD-type methods, while having a low per-iteration computation complexity as in SMGD-type algorithms, as well as achieving a low overall sample complexity?
%\end{center}
\end{tcolorbox}

Compared with the existing methods summarized in Table~\ref{tab}, our algorithms differ in the following key aspects: (i) Our algorithms only require a constant step size, which is easier to tune in practice. (ii) Our \alg family of algorithms has a lower sample complexity compared to all other existing methods.






\iffalse
\textbf{2)~Overview of MOO Algorithms:}
In the literature, 
MOO algorithms can be classified into two primary categories. 
The first category is usually referred to as gradient-free methods.
Typical gradient-free methods include evolutionary MOO algorithms and Bayesian MOO algorithms~\citep{zhang2007moea,deb2002fast,belakaria2020uncertainty,laumanns2002bayesian}. These techniques are suitable for small-scale problems but inefficient in solving high-dimensional MOO models (e.g., deep neural networks). 
In contrast, the second class is gradient-based MOO methods~\citep{fliege2000steepest,desideri2012multiple,fliege2019complexity,peitz2018gradient,liu2021stochastic}, which have shown to be more effective in solving high-dimensional MOO problems. 
As discussed in Section~\ref{sec: intro}, the most notable gradient-based MOO algorithms include multi-gradient descent (MGD)~\citep{fliege2019complexity} and stochastic multi-gradient descent (SMGD)~\citep{liu2021stochastic}, which achieve $\mathcal{O}(1/T)$ and $\mathcal{O}(1/\sqrt{T})$ convergence rates, respectively.
Although SMGD is easier to implement in practice thanks to the use of stochastic multi-gradient, it has been shown that the noisy common descent direction in SMGD could potentially cause divergence (cf. the example in Sec.~4 in \citep{zhou2022on}).
% {\color{blue} 
There also have been recent works on using momentum-based methods for bias mitigation in MOO, and these methods are further applied for bilevel optimization problems.~\citep{zhou2022on,fernando2022mitigating}. 
% }
Note, however, that the $\mathcal{O}(1/\sqrt{T})$ convergence rates of \citep{zhou2022on,fernando2022mitigating} remain unsatisfactory compared to the $\mathcal{O}(1/T)$ convergence rate of our \alg algorithm family.
For easier comparisons, we summarize the state-of-the-art gradient-based MOO algorithms and their convergence rate results under non-convex and strongly convex settings in Table~\ref{tab}. 
\fi



























%In this paper, we answer this question affirmatively.
{\bf 4) Technical Challenges:}
As in traditional single-objective optimization, a natural idea to achieve both fast convergence and low sample complexity in MOO is to employ the so-called ``variance reduction'' (VR) techniques to tame the noise in stochastic multi-gradients in SMGD-type methods.
However, due to the complex coupling nature of MOO problems, developing VR-assisted algorithms for SMGD-type algorithms faces the following challenges {\em unseen} in their single-objective counterparts:

(1) Since SMGD-type methods aim to identify the Pareto front (i.e., the set of all Pareto-optimal/stationary solutions), it is critical to ensure that the use of VR techniques does not introduce new bias into the already-noisy SMGD-type search process, as such bias would drive the search process toward certain regions of the Pareto front.
(2) MOO problems often involve higher computational complexity compared to single-objective problems due to the need to evaluate multiple objectives simultaneously. Incorporating VR techniques adds another layer of complexity, as it requires additional computations to estimate and reduce variance across multiple objectives.
%
(3) Conducting theoretical analysis to prove the convergence performance of VR-based SMGD-type techniques also poses multiple challenges, including how to quantify multiple conflicting objectives, navigate trade-offs between them, handle non-convex objective functions, and manage the computational cost of evaluations.
All of these analytical challenges are quite different from those in single-objective optimization theoretical analysis,
%and ensuring the convergence of stochastic gradients utilized within variance reduction techniques. 
which necessitate specialized proofs and analyses to effectively tackle these challenges and facilitate efficient exploration of Pareto optimality/stationarity.




{\bf 5) Main Contributions:}
The major contribution of this paper is that we overcome the aforementioned technical challenges and develop a suite of new VR-assisted SMGD-based MOO algorithms called \alg (\ul{st}ochastic path-\ul{i}ntegrated \ul{mul}ti-gradient rec\ul{u}rsive e\ul{s}timator) to achieve both fast convergence and low sample complexity in MOO.
Our main technical results are summarized as follows:


\begin{list}{\labelitemi}{\leftmargin=0.5em \itemindent=-0.2em \itemsep=-0.2em}
\item 
%\textbf{Proposed the variance reduction based algorithm \alg with comprehensive theoretical analysis:} 
%We introduce \algns, a novel approach tailored for finite-sum multi-objective minimization. 
Our \alg algorithm not only enhances computational efficiency but also significantly reduces multi-gradient estimation variance, leading to more stable convergence trajectories and overcoming the divergence problem of SMGD. 
We theoretically establish a convergence rate of $\mathcal{O}(1/T)$ for \alg in non-convex settings (typical in ML), which further implies a low sample complexity of $\mathcal{O}\left(n+\sqrt{n}\epsilon^{-1}\right)$.
In the special setting where the objectives are strongly convex, we show that \alg has a linear convergence rate of $\mathcal{O}(\exp(-\mu T))$, which implies an even lower sample complexity of $\mathcal{O}\left( n+\sqrt{n} \ln ({\mu/\epsilon})\right)$. 
%These results underline the adaptability of our algorithm across diverse problem landscapes.

\item  
%\textbf{Provided the enhanced version of \alg and its theoretical analysis:} 
To further improve the performance of \algns, we develop an enhanced version called \algmns, which incorporates momentum information to expedite convergence speed. 
Also, to relax the requirement for periodic full multi-gradient evaluations in \alg and \algmns, we propose two enhanced variants called \algp and \algmp based on adaptive batching, respectively. 
We provide theoretical convergence and sample complexity analyses for all these enhanced variants. 
These enhanced variants expand the practical utility of \algns, offering efficient solutions that not only accelerate optimization processes but also alleviate computational burdens
%, making \alg and its variants even more versatile 
in a wide spectrum of multi-objective optimization applications.

\item %\textbf{ Empirical Validation:} 
We conduct extensive experiments on a variety of challenging MOO problems to verify our theoretical results and illustrate the efficacy of the \alg algorithm family.  
%Additionally, the introduction of momentum in \algm and the adaptive-batch adjustment in \algpns/\algmp further enhance convergence speed and sample complexities, highlighting the algorithm's adaptability to diverse optimization contexts. 
Our experiments demonstrate the efficiency of the \alg algorithm family over existing state-of-the-art MOO methods, underscoring the robustness, scalability, and flexibility of our \alg algorithm family in complex MOO applications.
\end{list}

\iffalse
The remaining sections of the paper are structured as outlined below:
 In Section~\ref{sec: prelim}, we present the preliminaries.
 In Section~\ref{sec: alg}, we introduce our \alg and three variants (\algmns, \algp, and \algmpns), which are followed by their convergence analyses in Section~\ref{sec: convergence}.
 In Section~\ref{sec:exp}, we present the numerical results, while our conclusions can be found in Section~\ref{sec: conclusion}. 
 Due to space limitations, we relegate all proofs to supplementary material.
 For the convenience of our readers, we offer a summary of our primary notations in Table \ref{tab:list of notations} within the Appendix.

\fi


