% !TEX root = main.tex


\section{Additional experiment results} \label{sec:addexp}




\textbf{1) Strongly-Convex Optimization:}

We conducted experiments to assess the performance of our algorithms on a strongly-convex optimization problem, where $\mathbf{F}(\x)=[f_1(\x) = \x^2, f_2(\x) = {e}^{-\x}]$. For this experiment, we selected hyperparameters $\eta=0.005$ and $\alpha=0.3$, while introducing stochasticity into the gradient by adding Gaussian noise with a range of $(-1, 1)$.
As shown in Fig.~\ref{fig_compare_sc}, it is evident that all of the algorithms successfully achieved convergence. Notably, the momentum-based algorithms, namely MOCO, \algmns, and \algmpns, exhibited faster convergence compared to MGD, SMGD, \algns, and \algpns. We would also like to note that there is no significant difference between the stochastic algorithms (SMGD, MGD) and the other algorithms. This is not necessarily because the stochastic algorithms are inferior, but perhaps because the strongly-convex function in question is too simplistic.


\begin{figure}[h]
  \centering
  \includegraphics[width=0.3\textwidth]{compare_alg_task1.pdf}
  \includegraphics[width=0.3\textwidth]{compare_alg_task2.pdf}
\caption{Convergence comparison on strongly-convex optimization problem.}

\label{fig_compare_sc}
\end{figure}




\textbf{2) Eight-Objective Experiments on River Flow Dataset:}

\begin{wrapfigure}{r}{0.27\textwidth}
  \includegraphics[width=0.2\textwidth]{8tasks.png}
\caption{Training loss convergence comparison (8-objective).}
\label{fig_compare_8tasks}
\end{wrapfigure}

We further test our algorithms on an 8-task problem with the river flow dataset~\citep{nie2017image}, which is for flow prediction at eight locations in the Mississippi river network. 
%Thus, there are {\em eight} tasks in this problem. 
%
In this experiment, we set $\eta=0.001$ and $\alpha=0.1$; the batch size for MOCO, CR-MOGM, and SMGD is $8$, the full batch size for MGD is $128$, and the inner-loop batch size $|\mathcal{N}_s|$ for \algns, \algmns, \algpns, and \algmpns is $8$.
To better visualize different tasks, we plot the normalized loss in a radar chart as shown in Fig.~\ref{fig_compare_8tasks}, where we can see that our \alg algorithms achieve a much smaller footprint, which is desirable. 
%Nevertheless, when dealing with a large number of tasks, it becomes challenging to discern the comparative advantages of each algorithm solely based on loss values. 
%This complexity arises from the nature of multi-objective optimization problems, where the objective is to identify a set of solutions that simultaneously optimize multiple conflicting objectives. These objectives often represent different aspects of the problem, and the solutions that optimize one objective may not necessarily optimize the others. 
Further, we compare the sample complexity results of all algorithms in Table~\ref{table_river}, which reveals a significant reduction in sample utilization by \algpns/\algmpns compared to MGD, while achieving a much better loss compared to SMGD and MOCO (cf.\ Fig.~\ref{fig_compare_8tasks}).


\begin{table*}[h]
\centering
\caption{Results of normalized loss with the river flow dataset and learning tasks.}
\label{table_river}
{
\begin{tabular}{lccccccccc}
\toprule
 & \multirow{2}{*}{\textbf{\!\!\!\!\!\!\!\!\!\!\!\!\!\!\!\!$\#$ of samples}} & \multicolumn{8}{c}{\textbf{Tasks}} \\
\cmidrule(lr){3-10} % Use \cmidrule for improved spacing
 && \multicolumn{1}{c}{0} & \multicolumn{1}{c}{1} & \multicolumn{1}{c}{2} & \multicolumn{1}{c}{3} & \multicolumn{1}{c}{4} & \multicolumn{1}{c}{5} & \multicolumn{1}{c}{6} & \multicolumn{1}{c}{7} \\
\midrule
 SMGD &8000 & 0.985&0.558&0.521&0.384& 1&
       0.862& 0.667& 0.550 \\
 MOCO &8000 & 0.985&0.753& 1&0.399& 0.632&1& 0.595& 0.926  \\
 MGD & 128000 &0.989& 0.396&0.532& 0.174& 0.589&
       0.945&0.417&0.669\\
\algns&  27200  &0.985& 0.546&0.675&1& 0.077&
       0.898&0.417&0.281 \\
\algpns & 20947 & 0.996&1&0.528&0.178&0.990&
       0.395&0.427& 1\\
\algmns&  27200 & 0.996&0.864&0.530&0.475&0.036&
       0.271& 1&0.264\\
\algmpns &21085 & 1&0.596& 0.627& 0.1781&0.0376&
       0.482& 0.430& 0.055  \\
\bottomrule
\end{tabular}}
\end{table*}



\textbf{3) Ablation study on momentum in STIMULUS-M:}

\begin{table}[h]
    \centering
    \caption{Loss value vs. Iteration on tasks L and R of STIMULUS-M.}
    \begin{tabular}{c|cccc|cccc}
        \toprule
        \multirow{2}{*}{Momentum Term $\alpha$} & \multicolumn{4}{c|}{Task L} & \multicolumn{4}{c}{Task R} \\
        \cmidrule(lr){2-5} \cmidrule(lr){6-9}
        & 100 & 200 & 300 & 500 & 100 & 200 & 300 & 500 \\
        \midrule
        0.1 & 0.0228 & 0.0207 & 0.0203 & 0.0153 & 0.0229 & 0.0223 & 0.0205 & 0.0215 \\
        0.3 & 0.0228 & 0.0182 & 0.0179 & 0.0120 & 0.0223 & 0.0191 & 0.0168 & 0.0143 \\
        0.5 & 0.0227 & 0.0174 & 0.0146 & 0.0078 & 0.0215 & 0.0180 & 0.0124 & 0.0091 \\
        0.8 & 0.0225 & 0.0158 & 0.0127 & 0.0065 & 0.0210 & 0.0152 & 0.0113 & 0.0078 \\
        \bottomrule
    \end{tabular}
    \label{tablem}
\end{table}

We performed additional experiments to analyze the impact of varying the momentum term in our proposed STIMULUS-M algorithm, as shown in Table~\ref{tablem}, on the classification task of the MultiMNIST dataset. The experimental settings are consistent with those in Section 5.1 of the main paper. These results indicate that a larger momentum term leads to faster convergence.
