
%\section{Appendix}
\section{Theoretical Analysis}
\iffalse 
We provide detailed analysis of our algorithm here. 
To fix the notation, let $\cc_\tim$ be the solution at the $t$-th iteration. Our update is 
\bbb \label{equ:updatesapp} 
\cc_{\tim+1} = \cc_\tim - \xi v_\tim, 
\eee 
where 
$\xi$ is the step size and 
$v_\tim$ is obtained by solving
\bbb \label{opt: relax}
v_\tim = \argmin_{v \in \RR^\dimcc } \left \{  \frac12 \norm{\dd F(\cc_\tim)-v}^2 ~~~~s.t.~~~~ \dd \ell_\i(\cc_\tim)\tt v \geq \phi_\tim  \right\}. 
\eee 
with $\phi_\tim$ taken to be 
\bbb \label{equ:phik}
\phi_\tim = \begin{cases}
-\infty & \text{if $g(\cc) \leq \ep$} \\
\alpha_\tim g(\cc_\tim) & \text{if $g(\cc) > \ep$}.
\end{cases}
\eee 
%The $\epsilon$ is a constant step size and
and $\{\alpha_\tim\}$ is a sequence of positive constant. 
\fi 
%\subsubsection{Finding the Descent Direction} 
%We start with finding the descent direction by deriving the dual form of the optimization in \eqref{opt: relax}. 
%\begin{lemma}
\paragraph{Theorem~\ref{thm: sol} {[Dual of \eqref{opt: relax}]}}
%Assume the constraint in \eqref{opt: relax} is strictly feasible, that is, there exists an $v^*\in \RR^\dimcc$, such that $\dd \ell_\i(\cc_\tim)\tt v^* > \phi_\tim$. Then 
\emph{The solution $v_t$ of \eqref{opt: relax}, 
if it exists, has the form 
\bb%b \label{equ:vk}
v_\tim = \dd F(\cc_\tim) + \sum_{\i=1}^m \lambda_{i,t} \dd \ell_\i(\cc_\tim),
\ee%e 
with $\{\lambda_{i,t}\}_{\i=1}^m$ %given by % 
the solution of the following dual problem 
%
\bb%b \label{equ:lambdatk}
\max_{\lambda\in\RRplus^m}-\frac{1}{2}\left\Vert \nabla F(\cc_\tim)+\sum_{\i=1}^{m}\lambda_\i\nabla\ell_\i(\th_\tim)\right\Vert ^{2}+\sum_{\i=1}^{m}\lambda_\i\phi_\tim,
\ee%e 
where $\RRplus^m$ is the set of nonnegative  $m$-dimensional vectors, that is, 
$\RRplus^m = \{\lambda\in\RR^m\colon \lambda_\i\geq 0,~~\forall \i\in[m]\}$. 
}
%\lambda_\tim = \argmax_{
%\end{lemma}
%\subsubsection{Proof of Theorem \ref{thm: sol}}
\begin{proof}
By introducing Lagrange multipliers, the optimization in \eqref{opt: relax} is equivalent to  the following minimax problem: 
\[
\min_{v\in \RR^\dimcc}\max_{\lambda\in \RR_+^m }\frac{1}{2}\left\Vert \nabla F(\th_\tim)-v\right\Vert  ^{2}+\sum_{\i=1}^{m}\lambda_\i\left( \phi_\tim -%\left\langle 
\nabla\ell_\i(\th_\tim)
\tt v
%\right\rangle 
\right). %~~~~s.t.~~~~ \lambda_\i\geq0,
\]
With strong duality of convex quadratic programming (assuming the primal problem is feasible), %we can  ex
%\qq{technically, we need to assume there exists a strictly feasible point},
we can exchange the order of min and max, yielding  
\begin{align*}
\max_{\lambda\in \RR_+^m }
\left\{ \Phi(\lambda)\defeq \min_{v\in \RR^\dimcc}
\frac{1}{2}\left\Vert \nabla F(\th_\tim)-v\right\Vert  ^{2}+\sum_{\i=1}^{m}\lambda_\i\left( \phi_\tim -%\left\langle 
\nabla\ell_\i(\th_\tim)
\tt v
%\right\rangle 
\right) \right\}. %~~~~s.t.~~~~ \lambda_\i\geq0,
%\iff & \max_{\lambda\ge0}\min_{d}\frac{1}{2}\left\Vert \nabla F\right\Vert ^{2}-\left\langle \nabla F+\sum_{\i=1}^{m}\lambda_\i\nabla\ell_\i(\th),d\right\rangle +\frac{1}{2}\left\Vert d\right\Vert ^{2}+\sum_{\i=1}^{m}\lambda_\i\alpha g(\th).
\end{align*}
%Notice that the %optimal solution $d$
It is easy to see that the 
minimization w.r.t. $v$ is achieved 
when $v =\nabla F(\cc_\tim) + \sum_{\i=1}^{m}\lambda_\i\nabla\ell_\i(\th_\tim)$. 
%Finding the 
Correspondingly, 
%Calculating 
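maximizing $\Phi(\lambda)$ is equivalent, up to the additive constant $\frac{1}{2}\norm{\dd F(\cc_\tim)}^2$ that does not depend on $\lambda$, to the following dual problem: 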
%$\lambda_\i$ should solve the following dual optimization problem: 
%we obtain the following dual form. 
%the optimization is equ
%And the $\lambda$ is thus the solution of 
\[
\max_{\lambda\in\RRplus^m}
%\left\{ \Phi(\lambda)\defeq
-\frac{1}{2}\left\Vert \nabla F(\cc_\tim)+\sum_{\i=1}^{m}\lambda_\i\nabla\ell_\i(\th_\tim)\right\Vert ^{2}+\sum_{\i=1}^{m}\lambda_\i\phi_\tim.  %\right\}.
\]
This concludes the proof. 
\end{proof} 

%\paragraph{Pareto Optimization on $\L$} We analyze the descent of $\L$ by {\PNG}.  

%\begin{theorem}[Pareto Improvement on $\L$]  
\paragraph{Theorem~\ref{thm:odeell} [Pareto Improvement on $\L$]}
%\label{thm:odeell}
 \emph{Under Assumption~\ref{asm:basic}, 
assume $\cc_0\not\in\P_\ep$, and $t_\ep$ is the first time when $\cc_{t_\ep} \in \P_\ep$, then for any time $t < t_\ep,$ 
\bb 
\frac{\df}{\df t} \ell_\i( \cc_t) \leq -\alpha_t g(\cc_t),% <0%,
&&&
\min_{s\in[0,t]} g(\cc_s) \leq \frac{\min_{\i\in[m]}(\ell_\i(\cc_0)-\ell_\i\true)}{\int_0^t \alpha_s \df s}.
\ee 
Therefore,  the update yields Pareto improvement on $\L$ when $\cc_t \not\in \P_\ep$ and  $\alpha_t g(\cc_t)>0$.\\
%
%Therefore, 
Further, 
if $\int_0^\infty \alpha_s \df s = +\infty$, 
then for any $\epsilon>\ep$,  there exists a finite time $t_\epsilon \in \RRplus$ at which the solution enters $\P_{\epsilon}$ and stays within $\overline \P_\epsilon$ afterwards, that is, we have $\cc_{t_\epsilon} \in \P_\epsilon$ and $\cc_t\in \overline \P_\epsilon$ 
%$\{\cc_{t}\colon ~ t\geq t_\epsilon\} \subseteq \overline \P_\epsilon $. 
for any $t \geq t_\epsilon$. 
%then for any $t$ 
%1) We achieve Pareto improvement on $\L$ when $g(\cc) > \epsilon$, and hence it reaches $\P_{\epsilon}$ within  $O(1/\epsilon)$ steps.
%2) Once it reaches $\P_{\epsilon}$, it stays within $\P_{\epsilon,u}$ for $u=xxx$ afterwards. 
}
\begin{proof} 

i) 
When $t < t_\ep$, we have $g(\cc_t) > \ep$ and hence 
%We have 
\begin{align}\label{equ:tbl}
\frac{\df}{\df t} \ell_\i(\th_{\tim})
= - \nabla\ell_\i(\th_\tim)\tt v_\tim 
\leq - \phi_\tim = -\alpha_\tim g(\cc_\tim),
%%-\ell_\i(\th_\tim) & \le- \xi \nabla\ell_\i(\th_\tim)\tt v_\tim
%+\frac{\xi^{2}L}{2}\left\Vert v_\tim\right\Vert ^{2}\\
 %& \le- \xi \phi_\tim 
 %\alpha_\timg(\th_\tim)
 %+\frac{\xi^{2}L}{2}\left\Vert v_\tim\right\Vert ^{2},
\end{align}
where we used the constraint of $\dd \ell_\i(\cc_\tim)\tt v_\tim\geq \phi_\tim$ in \eqref{opt: relax}. 
Therefore, we obtain strict descent on all the losses $\{\ell_\i\}$
when $\alpha_t g(\cc_t) >0$. 

ii) 
Integrating both sides of \eqref{equ:tbl}: 
$$
\min_{s\in[0,t]}{g(\cc_s)} \leq \frac{\int_0^t \alpha_{s} g(\cc_s) \df s}{\int_0^t\alpha_s \df s} \leq 
\frac{\ell_\i(\th_0) - \ell_\i(\th_t)}{\int_0^t\alpha_s \df s} 
\leq \frac{\ell_\i(\th_0) - \ell_\i\true}{\int_0^t\alpha_s \df s}.
$$
 This yields the result since it holds for every $i\in[m]$.
 
 If $\int_0^\infty \alpha_t \df t = +\infty$, then we have $\min_{s\in [0,t]} g(\cc_s) \to 0$ as $t\to +\infty$. Assume there exists an $\epsilon > \ep$ such that $\cc_t$ never enters $\P_\epsilon$ at any finite $t$. Then we have $g(\cc_t) \geq \epsilon$ for all $t\in \RRplus$, which contradicts $\min_{s\in [0,t]} g(\cc_s) \to 0$. 


iii) 
Assume there exists a finite time $t' \in (t_\epsilon, +\infty)$ such that $\cc_{t'}\not\in \overline \P_\epsilon$. 
Because $\epsilon > \ep$ and $g$ is continuous, 
$\P_\ep$ is in the interior of $\P_\epsilon \subseteq \overline \P_\epsilon$. 
Therefore, the trajectory leading to $\cc_{t'}\not\in \overline \P_\epsilon$  must pass through $\overline \P_{\epsilon}\setminus \P_\ep$ at some point, that is, there exists a time $t'' \in [t_\epsilon, t')$ such that $\cc_{t''} \in \overline \P_\epsilon$ and $\cc_{t} \not\in \P_\ep$ for all $t\in[ t'', t']$. 
But because the algorithm cannot increase any objective $\ell_i$ outside of $\P_\ep$, we must have $\L(\cc_{t'}) \preceq \L(\cc_{t''})$, yielding that $\cc_{t'}\in\overline{\{\cc_{t''}\}} \subseteq \overline \P_{\epsilon}$, where $\overline{\{\cc_{t''}\}}$ is the Pareto closure of ${\{\cc_{t''}\}}$; this contradicts the assumption. 
%
%because %$g(\cc_0) > \ep$ and 
%$g(\cc_t)$ is continuous w.r.t. $t$.  
% Therefore, for any $$
%Therefore, for any $\epsilon > \ep$, we will first enter $\P_{\epsilon}$ within $\bigO(1/\epsilon)$ time  (although it may leave $\P_{\epsilon}$ later).  
%
%Assume we enter $\P_{\epsilon}$ at time $t_0$, then we have $\cc_t \in \overline\P_{\epsilon}$ for $t \geq t_0$. This is because However, it will stay within $\overline{\P}_\epsilon$ afterwards, because
%However, since we monotonically decrease all $\{\ell_\i\}$ simultaneously, we stay within $\overline{\P}_\epsilon$ afterwards.  
\end{proof} 

%\begin{lemma}
\paragraph{Theorem~\ref{lem:dfjijgfgfgfgifjg}} 
\emph{ 
Under Assumption~\ref{asm:basic}, 
assume $\cc_t \not\in \P_\ep$  
is a fixed point of the algorithm, that is, $\frac{\df \theta_t}{\df t} = -v_t = 0$, 
and $F$, $\L$ are  convex in a neighborhood of $\cc_t$, then 
$\cc_t$ is a local minimum of $F$
in the Pareto closure $\overline{\{\theta_t\}}$, %of  $\{\cc_t\}$,
that is,  there exists a neighborhood of $\cc_t$ in which  there exists no point $\cc'$ such that $F(\cc') < F(\cc_t)$ and $\L(\cc') \preceq \L(\cc_t)$. 
} 
\begin{proof} 
Note that minimizing $F$ in $\overline{\{\theta_t\}}$ can be framed into a constrained optimization problem:
$$
\min_{\cc} F(\cc) ~~~~s.t.~~~~\ell_i(\cc) \leq \ell_i(\cc_t),~~\forall i\in[m]. 
$$
In addition, by assumption, $\cc=\cc_t$ satisfies $v_t = \dd F(\cc_t) + \sum_{i=1}^m \lambda_{i,t}\dd \ell_i(\cc_t) = 0$, which is the KKT stationarity condition of the constrained optimization. 
It is also easy to check that $\cc=\cc_t$ trivially satisfies the primal feasibility and complementary slackness conditions, since every constraint holds with equality at $\cc_t$. Combining this with the local convexity assumption yields the result. 
\end{proof}
%\emph{Under Assumption~\ref{asm:basic}, assume a point $\cc_\infty \in \RR^\dimcc$  satisfies $\cc_\infty \not\in \P_\ep$ and is a local minimum of $F$in the Pareto closure $\overline{\{\theta_\infty\}}$ of  $\{\cc_\infty\}$, that is,  there exists a neighborhood of $\cc_\infty$ in which  there exists no point $\cc'$ such that $F(\cc') < F(\cc_\infty)$ and $\L(\cc') \preceq \L(\cc_\infty)$. Then $\cc_\infty$ is a fixed point of  Algorithm~\ref{alg:main}, that is, $v_\infty = 0$. 
%Under Assumption~\ref{asm:basic}, assume $\cc_t \not\in \P_\ep$ is a local minimum of $F$in the Pareto closure $\overline{\{\theta_t\}}$ of  $\{\cc_t\}$,   that is,  there exists a neighborhood of $\cc_t$ in which  there exists no point $\cc'$ such that $F(\cc') < F(\cc_t)$ and $\L(\cc') \preceq \L(\cc_t)$. Then $\cc_t$ is a fixed point of  Algorithm~\ref{alg:main}, that is, $v_t = 0$. 
%}
\iffalse 
\begin{proof}
The fact that $\cc_t \not\in \P_\ep$ is a local minimum of $F$
in the Pareto closure $\overline{\{\cc_t\}}$  is equivalent to that $\cc_t$ is on the Pareto front of the $m+1$ objectives $(F, \ell_1,\ldots, \ell_m)$, 
which implies that there exists $\omega' \in \C^{m+1}$, such that 
$\omega'_0 \dd F(\cc_\infty) + \sum_{i=1}^m \omega'_i \dd \ell_i(\cc_\infty) = 0$. 

If $\omega'_0 = 0$, we would then have $\sum_{i=1}^m \omega'_i \dd \ell_i(\cc_\infty) =0$; this would mean that $g(\cc_\infty) =0$, which condicts with the assumption that $\cc_\infty\not\in \P_\ep$. 

Therefore, we have $\omega'_0 \neq 0$ and defining $\lambda_{i} = \omega'_i/\omega'_0$ yields that $\dd F(\cc_\infty) + \sum_{i=1}^m \lambda_i \dd \ell_i(\cc_\infty) = 0$, and such $\{\lambda_{i}\}$ must be the solution of 

%a solution such that $\sum_{}$
%of  $\{\cc_t\}$ is 
\end{proof}
\fi 
%\end{lemma}

%On the other hand, if $\cc_t\in \P_\ep$, we have $v_t = \dd F(\cc_t)$, and hence $v_t=0$ is a necessary condition of $\cc_t$ being an unconstrained local minimum of $F$. 
\paragraph{Theorem~\ref{thm:odef} [Optimization of $F$]}
\emph{Let  $\epsilon > \ep$ and assume $g_{\epsilon} \defeq \sup_{\cc} \{g(\cc) \colon ~\cc\in \overline \P_\epsilon\}<+\infty$ and  $\sup_{t\geq0}\alpha_t<\infty$.   
Under Assumption~\ref{asm:basic}, 
when we initialize from $\cc_0 \in \P_\epsilon$, % for $\forall \epsilon > \ep$, 
we have 
$$
%\min_{s\in[0,t]}\norm{v_s}^2 \leq 
\min_{s\in[0,t]}\norm{ \frac{\df\cc_s}{\df s} }^2 \leq 
\frac{F(\cc_0)-F\true}{t} + \frac{1}{t}\int_{0}^t 
\alpha_s \left (\alpha_s {g_\epsilon}    + 
c \sqrt{g_\epsilon}  \right ) 
%\sqrt{g(\cc_s)}
\df s. 
%F(\cc_%t)
$$
In particular, 
if we have $\alpha_t = \alpha = const$, then 
$\min_{s\in[0,t]}\norm{\df \theta_s/\df s}^2  = \bigO\left (1/t + \alpha \sqrt{g_\epsilon} \right ).$ \\
%
If  
$%A^\gamma \defeq 
\int_0^\infty\alpha_t^\gamma\df t <+\infty$ for some $\gamma \geq 1$, we have 
$\min_{s\in[0,t]}\norm{\df \theta_s/\df s}^2  = \bigO(1/t + \sqrt{g_\epsilon}/t^{1/\gamma}).$
} 
%\end{theorem}
\begin{proof} 
%If $g(\cc_t) > \ep$, 
%\textbf{i)}
i) 
The slack condition of the constrained optimization in \eqref{opt: relax} says that 
\bbb \label{equ:slack001}
\lambda_{i,t}\left(\nabla\ell_\i(\th_\tim)\tt v_\tim
 -\phi_\tim \right)=0,\ \forall \i\in[m].
\eee
This gives that 
\begin{align}
\left\Vert v_\tim \right\Vert ^{2} 
& = \left ({\nabla F(\th_\tim)+\sum_{\i=1}^m \lambda_{i,t}\nabla\ell_\i(\th_\tim)} \right ) \tt v_\tim \notag \\ 
& = \dd F(\th_\tim) \tt v_\tim + \sum_{\i=1}^m \lambda_{i,t} \phi_\tim \ant{plugging \eqref{equ:slack001}}. \label{equ:ddfdifd1}
%\left\langle d(\th,\alpha),\nabla F(\th)\right\rangle +\sum_\i\lambda_\i(\th,\alpha)\left\langle d(\th,\alpha),\nabla\ell_\i(\th)\right\rangle \\
 %& =\left\langle d(\th,\alpha),\nabla F(\th)\right\rangle +\alpha g\sum_\i\lambda_\i(\th,\alpha).
\end{align}
%Also when $g(\th)\le r\epsilon$, it is obvious that $\left\Vert d(\th,\alpha)\right\Vert ^{2}=\left\langle d(\th,\alpha),\nabla F(\th)\right\rangle $as $d(\th,\alpha)=\nabla F(\th)$. 
%This gives 
If $\cc_t\not\in \P_\ep$, we have $\phi_t = \alpha_t g(\cc_t)$ and this gives 
\bb 
\frac{\df }{\df t} F(\cc_t) 
= - \dd F(\cc_t)\tt v_t  = 
- \left\Vert v_\tim \right\Vert ^{2} + \sum_{\i=1}^m \lambda_{i,t} \phi_\tim 
= - \left\Vert \frac{\df \theta_\tim}{\df t} \right\Vert ^{2} + \sum_{\i=1}^m \lambda_{i,t} \alpha_t g(\cc_t)
%\phi_\tim. 
\ee 
If $\cc_t$ is in the interior of $\P_\ep$, then we run typical gradient descent on $F$ and hence have 
%By the $L$-smoothness of $F$ in Assumption \ref{asm: bound}, we have 
\bb 
\frac{\df }{\df t} F(\cc_t)  = 
%= - \dd F(\cc_t)\tt v_t  = 
- \left\Vert v_\tim \right\Vert ^{2} = -  \left\Vert \frac{\df \theta_\tim}{\df t} \right\Vert ^{2}. 
%+ \sum_{\i=1}^m \lambda_{i,t} \phi_\tim. 
\ee  
If $\cc_t$ is on the boundary of $\P_\ep$, then by the definition of differential inclusion, 
$\df \cc/\df t$ belongs to the convex hull of the velocities that it receives from either side of the boundary, yielding that 
$$
\frac{\df }{\df t} F(\cc_t)= 
- \left\Vert \frac{\df \theta_\tim}{\df t} \right\Vert ^{2} + \beta \sum_{\i=1}^m \lambda_{i,t} \alpha_t g(\cc_t) 
\leq - \left\Vert \frac{\df \theta_\tim}{\df t} \right\Vert ^{2} +  \sum_{\i=1}^m \lambda_{i,t} \alpha_t g(\cc_t),
$$
where $\beta \in[0,1]$. 
Combining all the cases gives
$$
\frac{\df }{\df t} F(\cc_t) \leq 
- \left\Vert \frac{\df \theta_\tim}{\df t} \right\Vert ^{2} +  \sum_{\i=1}^m \lambda_{i,t} \alpha_t g(\cc_t). % \ind(\th_t\in \P)
$$
Integrating this yields
\bb
\min_{s\in[0,t]}\norm{ \frac{\df\cc_s}{\df s} }^2
\leq \frac{1}{t} \int_0^t \norm{ \frac{\df\cc_s}{\df s} }^2 \df s 
& \leq\frac{F(\cc_0)-F\true}{t} + \frac{1}{t}\int_{0}^t 
\sum_{\i=1}^m \lambda_{i,s} \alpha_s g(\cc_s) \df s \\ 
& \leq\frac{F(\cc_0)-F\true}{t} + \frac{1}{t}\int_{0}^t  \alpha_s \left (\alpha_s {g_\epsilon}    + c \sqrt{g_\epsilon}  \right ) \df s,
\ee  
where the last step used Lemma~\ref{lem:cgbound} with $\phi_t = \alpha_t g(\cc_t)$:
$$
%\sum_{\i=1}^m \lambda_{i,t} \phi_\tim = 
 \sum_{\i=1}^m \lambda_{i,t} \alpha_\tim g(\cc_\tim) 
 %\leq
%\alpha_\tim (\alpha_\tim g(\cc_\tim) + c \sqrt{g(\cc_\tim)}) 
\leq  \alpha_\tim^2 g(\cc_\tim) + c \alpha_\tim \sqrt{g(\cc_\tim)} 
\leq  \alpha_\tim^2 g_\epsilon + c \alpha_\tim \sqrt{g_\epsilon},
%\max\left(\phi_\tim + c \sqrt{g(\cc_\tim)}, ~0\right). 
$$
and here we used $g(\cc_\tim) \leq g_\epsilon$ because the trajectory is contained in $\overline \P_\epsilon$ following Theorem~\ref{thm:odeell}.

The remaining results follow from Lemma~\ref{lem:alphako}. 
\end{proof} 

%\begin{lemma}

%\end{lemma}

\subsubsection{Technical Lemmas} 
%\begin{tcolorbox} 
\begin{lemma}\label{lem:cgbound}
Assume Assumption~\ref{asm:basic} holds. %\ref{asm: feasibility} hold. %\red{and $\phi_\tim\geq 0$}.
Define $g(\cc) = \min_{\omega\in \C^m} \norm{\sum_{\i=1}^m \omega_\i \dd \ell_\i(\cc)}^2$, where $\C^m$ is the probability simplex on $[m]$. 
Then for the $v_\tim$ and $\lambda_{i,t}$ defined in \eqref{opt: relax} and \eqref{equ: dual}, we have 
%either $\lambda =0$, or 
$$
\sum_{\i=1}^m \lambda_{i,t} g(\cc_\tim) \leq \max\left(\phi_\tim + c \sqrt{g(\cc_\tim)}, ~0\right). 
$$
\end{lemma} 
\begin{proof} 
The slack condition of the constrained optimization in \eqref{opt: relax} says that
%for each $\i\in[m]$, we have either $\lambda_\i(\cc,\alpha)=0$, or $\dd \ell_\i(\cc)\tt d(\cc,\alpha)  = \phi(\cc,\alpha).$
\[\lambda_{i,t}\left(
\dd \ell_\i(\cc_\tim) \tt v_\tim -
\phi_\tim
\right)=0,~~~~\forall \i\in[m]. 
\]
%Plugging in 
Sum the equation over $\i\in[m]$ and 
note  
that $v_\tim = \dd F(\cc_\tim) + \sum_{\i=1}^m \lambda_{i,t} \dd \ell_\i(\cc_\tim)$. 
 We get 
\bbb \label{equ:eurd} 
\norm{\sum_{\i=1}^m \lambda_{i,t} \dd\ell_\i(\cc_\tim)}^2 + \left (\sum_{\i=1}^m \lambda_{i,t}  \dd \ell_\i(\cc_\tim) 
\right)\tt \dd F(\cc_\tim)  -\sum_{\i=1}^m \lambda_{i,t}  \phi_\tim  = 0. 
\eee 
%Without loss of generality, assume$\sum_{\i=1}^m \lambda_\i(\cc,\alpha) \neq 0$ (otherwise, there is nothing to prove).
Define 
\bb 
x_\tim= \norm{\sum_{\i=1}^m \lambda_{i,t} \dd\ell_\i(\cc_\tim)}^2,&&
\bar \lambda_\tim  = \sum_{\i=1}^m \lambda_{i,t}, &&
g_\tim=g(\cc_\tim) = \min_{\omega\in \C^m} \norm{\sum_{\i=1}^m\omega_\i \dd \ell_\i(\cc_\tim)}^2. 
\ee 
%Note that
Then it is easy to see that $x_\tim \geq \bar \lambda_\tim^2 g_\tim$. 
Using Cauchy-Schwarz inequality, 
$$
\abs{\left (\sum_{\i=1}^m \lambda_{i,t}\dd \ell_\i(\cc_\tim)\right )\tt \dd F(\cc_\tim)}
\leq \norm{\dd F(\cc_\tim)}\norm{\sum_{\i=1}^m \lambda_{i,t}\dd \ell_\i(\cc_\tim)} \leq c \sqrt{x_\tim}, 
$$
where we used $\norm{\dd F(\cc_\tim)} \leq c$ by Assumption~\ref{asm:basic}. 
Combining this with \eqref{equ:eurd}, we have 
$$
\abs{x_\tim - \bar \lambda_\tim \phi_\tim} \leq c \sqrt{x_\tim}. 
$$
Applying Lemma~\ref{lem:xcg} yields the result. %we have 
%$$
%\phi_\tim - c \sqrt{g_\tim} \leq \bar \lambda_\tim g_\tim \leq \phi_\tim + c \sqrt{g_\tim}. 
%$$
\end{proof}
%\end{tcolorbox}

\begin{lemma}\label{lem:xcg}
Assume $\phi \in \RR$, and 
$x,\lambda, c, g \in \RRplus$ are non-negative real numbers and they satisfy 
\bb 
\abs{x - \lambda \phi} \leq c \sqrt{x}, ~~~~~~~~ x\geq \lambda^2 g.
\ee 
Then we have $\lambda g \leq \max(0, \phi + c \sqrt{g}).$
%either $\lambda =0,$ or 
%\phi - c \sqrt{g} \leq \lambda g \leq \phi + c \sqrt{g}. 
%$
 %\text{either $\lambda =0,$  ~~~~or }~~~~
%\phi - c \sqrt{g} \leq \lambda g \leq \phi + c \sqrt{g}. 
%$
\end{lemma} 
\begin{proof}
Squaring the first inequality, we get 
$$f(x):= (x-\lambda \phi )^2 - c^2 x \leq 0,$$
%x^2 - (2 \lambda \phi +c^2) x^2 + \lambda^2 \phi ^2 \leq 0,$$
where $f$ is a quadratic function. 
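% NOTE(review): $f(x)\leq 0$ for some $x\geq\lambda^2 g$ does not by itself imply $f(\lambda^2 g)\leq 0$, since $\lambda^2 g$ may lie below the smaller root of $f$. For $\phi\geq 0$ the conclusion still follows: $h(z)=z^2-cz-\lambda\phi$ is convex with $h(0)\leq 0$ and $h(\sqrt{x})\leq 0$, hence $h(\lambda\sqrt{g})\leq 0$. For $\phi<0$ the stated bound appears to fail (e.g., $c=3$, $\lambda=1$, $\phi=-1$, $g=0.01$, $x=1$) -- TODO confirm the intended range of $\phi$.
To ensure that $f(x) \leq 0$ has a solution that satisfies $ x \ge \lambda^2 g$, we need to have $f(\lambda^2 g) \leq 0$, that is,  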
%,~~~~~ x^2 \ge \lambda^2 g. 
%$$
%To have a feasible solution of $x$, we must have 
%To have this hold, we must have 
$$
f(\lambda^2g) = (\lambda^2g - \lambda \phi )^2 - c^2 \lambda ^2 g \leq 0.
%(\lambda^2 g)^2 - (2\lambda \phi +c^2) \lambda ^2 g + \lambda ^2 \phi ^2 \leq 0.
$$
%This gives $\lambda^2 g^2 - (2 \lambda\phi+c^2)g + \phi^2\leq 0$. Or 
%$$\lambda^2 g^2 - 2 \lambda g \phi - c^2g + \phi^2\leq 0.$$
%Then 
%$$|\lambda^2g - \lambda \phi | \leq c \lambda  g^{1/2}.
%\lambda g \leq \sqrt{\phi^2 - }
%$$
%which 
This can hold under two cases: 

Case 1: $\lambda =0;$

Case 2: $|\lambda g -\phi|\leq c \sqrt{g}$, and hence 
%This gives 
$\phi - c \sqrt{g} \leq \lambda g \leq \phi + c \sqrt{g}
$. 

In both cases, we have 
$$
\lambda g \leq \max(0, \phi + c \sqrt{g}). 
$$
%We have 
\end{proof}

\begin{lemma}\label{lem:alphako}
Let $\{\alpha_\tim\colon t\in \RRplus\} \subseteq \RRplus$ 
be a non-negative sequence with 
%assume  
$
A:=\left(\int_0^\infty \alpha_\tim^\gamma \df t \right)^{1/\gamma}  <\infty$, where $\gamma \geq 1$,  and $B = \sup_t \alpha_t <\infty$. 
%we have $\alpha_{\max} : = \sup_{k\geq 0} \alpha_\tim < \infty$, 
Then we have 
$$
\frac{1}{t} \int_{0}^t \left (\alpha_s^2 +\alpha_s\right) \df s \leq  (B+1) A t^{-1/\gamma}. 
%A t^{-1/\gamma} + A^2 t^{-1/\gamma}
%C_{\alpha} (K+1)^{\frac{\gamma-1}{\gamma}},
$$
%where $C_{\alpha} <\infty$ is a constant depending on $A$ and $\alpha_{\max}$. % : = \sup_{k\geq 0} \alpha_\tim < \infty$. 
\end{lemma}
\begin{proof}
Let $\eta = \frac{\gamma}{\gamma-1}$ (with the convention $\eta = \infty$ and $t^{1/\eta}=1$ when $\gamma = 1$), so that $1/ \eta  + 1/\gamma= 1$. By H\"older's inequality, we have 
\bb 
\int_{0}^t\alpha_s \df s 
\leq \left( \int_{0}^t \alpha_s^\gamma \df s\right )^{1/\gamma} \left (\int_{0}^t 1^\eta\df s \right )^{1/\eta} 
\leq A t^{1/\eta}= A t^{1-1/\gamma}, 
\ee 
and hence 
$$
\frac{1}{t}\int_{0}^t\left (\alpha_s^2  + \alpha_s\right )\df s  
\leq \frac{B+1}{t} \int_0^t\alpha_s \df s 
\leq  (B+1) A t^{-1/\gamma}. 
$$
\end{proof}



\iffalse 
\subsection{Descent of the Objective Functions}
We characterize how the algorithm decreases the objectives 
$F$ and $\ell_\i$. 

\red{TODO: Explain the meaning of $\norm{v_\i}$}

\begin{assumption}[Feasibility] \label{asm: feasibility}
Assume $\{\theta_\tim\colon k =0,1,\ldots\}$ follows the updates in \eqref{equ:updatesapp} with $v_\tim$ solving \eqref{opt: relax}, and $\xi > 0$ and $\phi_\tim \in \RR$ for all $k.$
\med{Assume that the constraint in 
\eqref{opt: relax} is feasible at each iteration. } 
\end{assumption}

\begin{lemma}[Descent of $F$]
\label{lem: descent on F}
%Following the updates in \eqref{opt: relax}. 
%Assume the 
 %Assume that \eqref{opt: relax} is feasible at each iteration. 
%Following the updates of $\cc_\tim$  in \eqref{equ:updatesapp},
 \textbf{i)} Under Assumption \ref{asm: bound} and \ref{asm: feasibility},  
 %for any $k$ with $g(k)\le\epsilon$ andany $K\in\mathbb{N}^{+}\cup\{+\infty\}$, 
 we have, for any choice of control sequence $\{\phi_\tim \}\subset \RR$, 
 $$
\sum_{\t=0}^K \left ( \xi - \frac{L\xi^2}{2} \right ) 
\norm{v_\tim}^2 \leq F(\cc_0) - F(\cc_{\tim+1}) +
\xi \sum_{\t=0}^K \sum_{\i=1}^m  \lambda_{i,t} \phi_\tim,~~~~\forall \t=0,1,\cdots.
$$
\textbf{ii)} In particular, with the $\phi_\tim$ chosen by \eqref{equ:phik}, we have 
\bbb
\sum_{\t=0}^K %\xi_L
\left ( \xi - \frac{L\xi^2}{2} \right ) 
\norm{v_\tim}^2
& \leq F(\cc_0) - F(\cc_{\tim+1}) +\xi c^2  \sum_{\t=0}^K (\alpha_\tim^2+\alpha_\tim)\ind(g(\cc_\tim)>\ep).
\eee 
%where $\xi_L =  \xi - \frac{L\xi^2}{2} $. 
\textbf{iii)} Further, assume $\xi \leq 1/L$,  and $\inf_{\cc\in \RR^\dimcc} F(\cc) :=F^* >-\infty $, and  \red{$\sum_{\t=0}^\infty \alpha_\tim^\gamma \leq \bar \alpha_\gamma < \infty $ for some constant $\gamma \geq 1$}. We have 
\bbb\label{equ:rategamma}
%\sum_{\t=0}^K \left ( \xi - \frac{L\xi^2}{2} \right ) 
\min_{k\leq K}\norm{v_\tim}^2 
%\leq \frac{1}{K+1}\sum_{\t=0}^K \norm{v_\tim}^2
= \bigO\left ( 
\frac{1}{\xi (K+1)} + 
\frac{1}{(K+1)^{\frac{1}{\gamma}}}
\right ),
\eee 
where the constant in $\bigO(\cdot)$  depends on $(F(\cc_0)-F\true)$, $c$, and $\{\alpha_\tim\}.$
%\[\sum_{j=0}^{K}\left(\xi-\frac{\xi^{2}L}{2}\right)\left\Vert d(\th_{k+j},\alpha_{k+j})\right\Vert ^{2}\le F(\th_\tim)-F(\th_{k+K\med{+1}})+\frac{2c^{3}\xi}{r\epsilon}\sum_{j=0}^{K}\alpha_{k+j}.\]
\end{lemma}
\red{The first term in 
\eqref{equ:rategamma} is the standard rate of typical gradient descent on $F$. 
The second term comes from the fact that we need to simultaneously decrease the objectives $\{\ell_\i\}$.
The magnitude of $\alpha_\tim$ controls the trade-off between decreasing $F$ vs. $\{\ell_\i\}$. 
}
%\subsubsection{Proof of Lemma \ref{lem: descent on F}}
\begin{proof} 
\textbf{i)}
The slack condition of the constrained optimization in \eqref{opt: relax} says that 
\bbb \label{equ:slack001}
\lambda_{i,t}\left(\nabla\ell_\i(\th_\tim)\tt v_\tim
 -\phi_\tim \right)=0,\ \forall \i\in[m].
\eee
This gives that 
\begin{align}
\left\Vert v_\tim \right\Vert ^{2} 
& = \left ({\nabla F(\th_\tim)+\sum_{\i=1}^m \lambda_{i,t}\nabla\ell_\i(\th_\tim)} \right ) \tt v_\tim \notag \\ 
& = \dd F(\th_\tim) \tt v_\tim + \sum_{\i=1}^m \lambda_{i,t} \phi_\tim \ant{plugging \eqref{equ:slack001}}. \label{equ:ddfdifd1}
%\left\langle d(\th,\alpha),\nabla F(\th)\right\rangle +\sum_\i\lambda_\i(\th,\alpha)\left\langle d(\th,\alpha),\nabla\ell_\i(\th)\right\rangle \\
 %& =\left\langle d(\th,\alpha),\nabla F(\th)\right\rangle +\alpha g\sum_\i\lambda_\i(\th,\alpha).
\end{align}
%Also when $g(\th)\le r\epsilon$, it is obvious that $\left\Vert d(\th,\alpha)\right\Vert ^{2}=\left\langle d(\th,\alpha),\nabla F(\th)\right\rangle $as $d(\th,\alpha)=\nabla F(\th)$. 

By the $L$-smoothness of $F$ in Assumption \ref{asm: bound}, we have 
\bbb  
F(\th_{\tim+1}) - F(\th_\tim) 
& = F(\cc_\tim - \xi v_\tim) - F(\cc_\tim) \notag \\
& \leq  -\xi \dd F(\cc_\tim) \tt v_\tim + \frac{L\xi^2}{2} \norm{v_\tim}^2. \label{equ:DFLsmooth1}
\eee  
Therefore, combining \eqref{equ:ddfdifd1} and \eqref{equ:DFLsmooth1} gives 
\bbb 
\xi \left\Vert v_\tim
\right\Vert ^{2} 
%& = \left ({\nabla F(\th)+\sum_{\i=1}^m \lambda_\i(\th,\alpha)\nabla\ell_\i(\th)} \right ) \tt d(\cc, \alpha) \\ 
& = \xi \dd F(\th_\tim) \tt v_\tim + \xi \sum_{\i=1}^m \lambda_{i,t}\phi_\tim \\
& \leq F(\th_\tim) - F(\th_{\tim+1})  + \frac{L\xi^2}{2} \norm{v_\tim}^2 
+ \xi \sum_{\i=1}^m \lambda_{i,t}\phi_\tim. \label{equ:bab3}
\eee  
Applying telescoping sum on $\t=0,\ldots K$ gives 
$$
\sum_{\t=0}^K \left ( \xi - \frac{L\xi^2}{2} \right ) 
\norm{v_\tim}^2 \leq F(\cc_0) - F(\cc_{\tim+1}) +
\xi \sum_{\t=0}^K \sum_{\i=1}^m  \lambda_{i,t} \phi_\tim. 
$$

\textbf{ii)} 
Let us consider the choice of $\phi_\tim$
defined in \eqref{equ:phik} now. 
%
%
%we have 
When $g(\cc_\tim) > \ep$, we have  $\phi_\tim = \alpha_\tim g(\cc_\tim)$.  
 Lemma~\ref{lem:cgbound} shows that %have %, when $g(\cc_\tim)\geq \ep$,  
$$
\sum_{\i=1}^m \lambda_{i,t} \phi_\tim = 
\alpha_\tim \sum_{\i=1}^m \lambda_{i,t} g(\cc_\tim) \leq
\alpha_\tim (\alpha_\tim g(\cc_\tim) + c \sqrt{g(\cc_\tim)}) 
= \alpha_\tim^2 g(\cc_\tim) + c \alpha_\tim \sqrt{g(\cc_\tim)}. 
%\max\left(\phi_\tim + c \sqrt{g(\cc_\tim)}, ~0\right). 
$$
Therefore, by \eqref{equ:bab3},
\bbb
%\sum_{\t=0}^K
\left ( \xi - \frac{L\xi^2}{2} \right ) 
\norm{v_\tim}^2
& \leq F(\cc_\tim) - F(\cc_{\tim+1}) +\xi  \left( \alpha_\tim^2 g(\cc_\tim) + c \alpha_\tim \sqrt{g(\cc_\tim)} \right)\notag \\ 
& \leq F(\cc_\tim) - F(\cc_{\tim+1}) +\xi c^2  (\alpha_\tim^2+\alpha_\tim),
\label{equ:bab1} 
%(\alpha g(\cc_\tim) + c \sqrt{g(\cc_\tim)}). 
%\frac{2c^3 \xi}{r\epsilon} \sum_{j=0}^K \alpha_{j}. 
%\sum_{j=0}^K \sum_{\i=1}^m \xi \lambda_\i(\th_j, \alpha_j) \phi(\theta_j, \alpha_j). 
\eee 
where we used $g(\cc_\tim)\leq c^2$ in the last step, because 
$\norm{\dd \ell_\i(\cc)}\leq c$ by Assumption~\ref{asm: bound} and hence
\bb 
g(\cc) 
 = \sup_{\lambda\in \C^m} \norm{\sum_{\i=1}^m\lambda_\i \dd \ell_\i(\cc)}^2 
 \leq \sup_{\lambda\in \C^m} 
\left(\sum_{\i=1}^m \lambda_\i \norm{ \dd \ell_\i(\cc)} \right)^2 \leq \sup_{\i\in [m]} \norm{\dd \ell_\i(\cc)}^2 \leq c^2.  
%\\
\ee 

On the other hand, when $g(\cc_\tim) < \ep$, we have $\phi_\tim = -\infty$ by \eqref{equ:phik}. The algorithm reduces to standard gradient descent on $F$, and hence 
\bbb\label{equ:bab2}
\xi \left\Vert v_\tim
\right\Vert ^{2} 
%& = \left ({\nabla F(\th)+\sum_{\i=1}^m \lambda_\i(\th,\alpha)\nabla\ell_\i(\th)} \right ) \tt d(\cc, \alpha) \\ 
%& = \xi \dd F(\th_\tim) \tt v_\tim + \xi \sum_{\i=1}^m \lambda_{i,t}\phi_\tim \\
& \leq F(\th_\tim) - F(\th_{\tim+1})  + \frac{L\xi^2}{2} \norm{v_\tim}^2. %+ \xi \sum_{\i=1}^m \lambda_{i,t}\phi_\tim. 
\eee  
Combining \eqref{equ:bab1} and \eqref{equ:bab2} yields 
\bbb
\sum_{\t=0}^K \left ( \xi - \frac{L\xi^2}{2} \right ) 
\norm{v_\tim}^2
& \leq F(\cc_0) - F(\cc_{\tim+1}) +\xi c^2  \sum_{\t=0}^K (\alpha_\tim^2+\alpha_\tim)\ind(g(\cc_\tim)>\ep),
\eee 
\textbf{iii)} 
By Lemma~\ref{lem:alphak}, we have 
$$
\sum_{\t=0}^K \alpha_\tim^2 +\alpha_\tim \leq C_\alpha  (K+1)^{\frac{\gamma-1}{\gamma}},
$$
where $C_\alpha <\infty$ is a constant depending on $\{\alpha_\tim\}$. 
Therefore, 
when $\xi \leq 1/L$ and $\inf_{\cc\in \RR^\dimcc}F(\cc)= F\true$, we have  
\bbb
%\sum_{\t=0}^K \left ( \xi - \frac{L\xi^2}{2} \right ) 
\min_{k\leq K}\norm{v_\tim}^2 \leq 
\frac{1}{K+1}\sum_{\t=0}^K \norm{v_\tim}^2
& \leq
\frac{2}{\xi (K+1)}
\left (F(\cc_0) - F(\cc_{\tim+1}) +\xi c^2  \sum_{\t=0}^K (\alpha_\tim^2+\alpha_\tim) \right) \\
& \leq
\frac{2}{\xi (K+1)}
\left (F(\cc_0) - F(\cc_{\tim+1}) +\xi c^2  C_\alpha (K+1)^{\frac{\gamma-1}{\gamma}} \right)  \\
& = \bigO\left ( 
\frac{1}{\xi (K+1)} + 
\frac{1}{(K+1)^{\frac{1}{\gamma}}}
\right ),
\eee 
where the constant in $\bigO(\cdot)$  depends on $F(\cc_0)-F\true$, $c$, and $\{\alpha_\tim\}.$
%$\bar \alpha_\gamma$ and $\alpha_{\max}: = \sup_{k\geq 0} \alpha_\tim < \infty$.
\end{proof} 

\subsubsection{Descent of $\{\ell_\i\}$}

To quantify the theoretical properties of our method, we need to define a proper notion of neighborhood of $\P$ and 
a criterion for (approximate) local optimality of $F$ inside the neighborhood of $\P$. 

For $\epsilon\geq 0$, 
let $\P_\epsilon$ be 
%We define 
the set of Pareto $\epsilon$-stationary points: 
$\P_{\epsilon} = \{\cc\in \RR^\dimcc \colon ~ g(\cc) \leq \epsilon\}$. Further, for $u\geq 0$, 
the $u$-Pareto closure of $\P_{\epsilon}$ is the set of points 
that perform no worse than at least one point in $\P_{\epsilon}$ up to a slack $u$, that is, 
%Our results involve the following notion of $(\epsilon, u)$-closure of $\P$: 
\begin{eqnarray}
{\P}_{\epsilon,u}:=\left\{ \th:\ \exists\th'\in \P_{\epsilon}\ ~~\text{s.t.}\ ~~ %\forall \i\in[m],\
\max_{\i\in[m]}\bigl(\ell_\i(\th)-\ell_\i(\th')\bigr)\leq u\right\}.
\end{eqnarray}
In particular, ${\P}_{\epsilon, 0}$ is the set of points $\cc$ such that $\L(\cc) \preceq \L(\cc')$ for some point $\cc'$ in $\P_\epsilon$. 
%that Pareto dominate (or equivalent to)  at least one point in $\P_\epsilon$. 
%that performs no worse than some parameter in $\P_\epsilon$, which is a natural set to consider.
Therefore, if $\P_\epsilon$ is a good proxy of the Pareto set, 
so is $\P_{\epsilon,u}$ for small $u$. 
As shown in \red{XXX}, a key property of our method is that it is guaranteed to enter $\P_{\epsilon,u}$ and stay within it afterwards for some small $\epsilon$ and $u$ that depend on the step size $\xi$ and the control parameters $\ep,\alpha_\tim$ in \eqref{equ:phi}. 
In addition, in \red{XXX}, we show that 
our method minimizes $F(\cc)$ inside $\P_{\epsilon,u}$ in the sense of \red{XXX}. 
%can be shown %$\bar{\P}_{\epsilon, u}$ gives some extra freedom allowing the loss of tasks to be sightly larger.
%to be the set of points st
%that is, $$

\begin{lemma}[Descent of $\ell_\i$]
Under Assumption~\ref{asm:base}, when $g(\cc_t) > \ep$, we have 
$$
\frac{\df}{\df t} \ell_\i(\th_{\tim})\leq -\alpha_\tim g(\cc_\tim). 
$$
%\red{We want to show that starting from some point with $g(\cc) > \ep$, it takes at most $O(1/\ep)$ steps to enter the $g(\cc) \leq \ep$ zone. But this would require $\alpha_\tim$ to not decay to zero?}
\end{lemma} 
\begin{proof}
%Suppose that $g(\th_\tim)\le r\epsilon$, then using Lemma \ref{lem: bound descent phase}, we have $g(\th_{\tim+1})\le\epsilon.$ Suppose that$g(\th_\tim)>r\epsilon$, then we have, $\forall \i\in[m]$, 
%By the $L$-smoothness of $\ell_\i$ in Assumption~\ref{asm: bound}, we have 
When $g(\cc_t) > \ep$, 
%We have 
\begin{align*}
\frac{\df}{\df t} \ell_\i(\th_{\tim})
= - \nabla\ell_\i(\th_\tim)\tt v_\tim 
\leq - \phi_\tim = -\alpha_\tim g(\cc_\tim),
%%-\ell_\i(\th_\tim) & \le- \xi \nabla\ell_\i(\th_\tim)\tt v_\tim
%+\frac{\xi^{2}L}{2}\left\Vert v_\tim\right\Vert ^{2}\\
 %& \le- \xi \phi_\tim 
 %\alpha_\timg(\th_\tim)
 %+\frac{\xi^{2}L}{2}\left\Vert v_\tim\right\Vert ^{2},
\end{align*}
where we used the constraint of $\dd \ell_\i(\cc_\tim)\tt v_\tim\geq \phi_\tim$ in \eqref{opt: relax}. 
In this case, since $\phi_t = \alpha_t g(\cc_t) > \alpha_t \ep > 0$,  we obtain strict descent on all the losses $\{\ell_\i\}$.

Integrating both sides: 
$$
\min_{t\in[0,T]}{g(\cc_t)} \leq \frac{1}{T}\int_0^T \alpha_{t} g(\cc_t) \df t \leq \frac{\ell_\i(\th_0) - \ell_\i(\th_T)}{T}.
$$
Therefore, for any $\epsilon > \ep$, we 
will first enter $\P_{\epsilon}$ within $\bigO(1/\epsilon)$ time  (although it may leave $\P_{\epsilon}$ later).  

Assume we enter $\P_{\epsilon}$ at time $t_0$; then we have $\cc_t \in \overline\P_{\epsilon}$ for $t \geq t_0$. 
This is because, although the iterate may leave $\P_{\epsilon}$ itself, 
we monotonically decrease all $\{\ell_\i\}$ simultaneously, and hence we stay within $\overline{\P}_\epsilon$ afterwards.  
%In addition, however,
%$g(\cc_t) \leq \epsilon$
%\begin{align*}
%\ell_\i(\th_{\tim+1})-\ell_\i(\th_\tim) & \le- \xi \nabla\ell_\i(\th_\tim)\tt v_\tim
%+\frac{\xi^{2}L}{2}\left\Vert v_\tim\right\Vert ^{2}\\
% & \le- \xi \phi_\tim 
 %\alpha_\timg(\th_\tim)
% +\frac{\xi^{2}L}{2}\left\Vert v_\tim\right\Vert ^{2},
%\end{align*}
%For any $K\in\mathbb{N}\cup\{+\infty\}$ such that for any $j\in[K]$,$g(\th_{k+j})>r\epsilon$, 

Applying a telescoping  sum over the iterations $\t=0,\ldots,K$, we have 
\begin{align*}
\ell_\i(\th_{K+1})-\ell_\i(\th_{0}) & 
\le- \sum_{\t=0}^K  \left( \xi  \phi_\tim 
 %\alpha_\timg(\th_\tim)
 -\frac{\xi^{2}L}{2}  \left\Vert v_\tim\right\Vert ^{2} \right).
\end{align*}
If $\xi \leq \frac{\phi_\tim}{L\norm{v_\tim}^2},$ \med{which is ensured by XXXX}, we have 
\begin{align*}
\ell_\i(\th_{K+1})-\ell_\i(\th_{0}) & 
\le- \frac{1}{2}\sum_{\t=0}^K   \xi  \phi_\tim.
 %\alpha_\timg(\th_\tim)
 %-\frac{\xi^{2}L}{2}  \left\Vert v_\tim\right\Vert ^{2} \right).
\end{align*}
When using the $\phi_\tim$ in \eqref{equ:phik}, %starting
%Theerefore, 
starting from a $\theta_0$ with $g(\theta_0) > \ep$, assume $K_0$ is the 
$$
\sum_{\t=0}^K \alpha_\tim g(\cc_\tim) \leq \frac{2}{\xi}(\ell_\i(\th_{0})-\ell_\i(\th_{K+1})).
$$
\red{But if $\alpha_\tim$ decays to zero, we cannot ensure that $g(\cc_\tim)$ will converge to zero?} 
\end{proof}

\begin{lemma}

\end{lemma}
\begin{proof}
%{\color{blue} My proof 
Assume $\th_0 \in \P_\epsilon$ \red{but it is sufficient to have $\th_0 \in \P_{(\epsilon, u)}$?}.  
We want to prove that if $\th_\tim \in \P_{(\epsilon,u)}$,
then  $\th_{\tim+1} \in \P_{(\epsilon,u)}$. 
%the trajectory $\theta_\tim$ is constrained inside $\P_{(\epsilon,u)}$. 
We need to prove two cases: 
\begin{itemize}
    \item Case 1:  If $\theta_\tim \in \P_{\ep}$, then $\theta_{\tim+1} \in \Pbar$. 
    \item Case 2: If $\theta_\tim \in \Pbar\setminus \P_{\ep}$, then $\theta_{\tim+1} \in \Pbar$. 
\end{itemize}

\paragraph{Case 1}
If $\cc_\tim \in \P_{\ep}$, that is, $g(\cc_\tim) \leq \ep$, 
we have by Lemma~\ref{lem: bound descent phase} that $g(\cc_{\tim+1}) \leq c$, meaning that $\cc_{\tim+1}\in \Pbar$. 


\paragraph{Case 2} 
If $\theta_\tim \in \Pbar\setminus \P_{\ep}$, that is, $g(\cc_\tim) > \ep$, we have 
\begin{align*}
\ell_\i(\th_{\tim+1})-\ell_\i(\th_\tim) &
\le- {\xi}\alpha_\tim g(\th_\tim)+\frac{\xi^{2}L}{2}\norm{v_\tim} ^{2}\\
 & %\overset{(1)}
 {\le}-{\xi}\alpha_\tim g(\th_\tim)+
 \frac{\xi^{2}L}{2}
 \frac{F(\th_\tim)-F(\th_{\tim+1})+
\xi \alpha_\tim^2 g(\cc_\tim) +  \xi c \alpha_\tim \sqrt{g(\cc_\tim)}
%c^2 (\alpha_\tim^2+\alpha_\tim) 
 }{\left(\xi-\frac{\xi^{2}L}{2}\right)}
 \\
  & {\le}-{\xi}\alpha_\tim g(\th_\tim)+
 {\xi L}
 \left ({F(\th_\tim)-F(\th_{\tim+1})+
\xi \alpha_\tim^2 g(\cc_\tim) +  \xi c \alpha_\tim \sqrt{g(\cc_\tim)}
%c^2 (\alpha_\tim^2+\alpha_\tim) 
 }\right) %{\left(\xi-\frac{\xi^{2}L}{2}\right)} 
 \ant{assume %\red{$\norm{\dd F(\cc)}\leq c$}, 
 $\xi \leq 1/L$} 
%\xi c^2 (\alpha_\tim^2+\alpha_\tim) 
% }{\left(\xi/2\right)}
 \\
   &\leq   \xi L (F(\cc_\tim)-F(\cc_{\tim+1})) 
   - \xi \alpha_\tim \ep + \xi^2 L(\alpha_\tim^2 + \alpha_\tim) c^2 
   %+  \xi (\alpha_\tim )
   %{\le}-{\xi}\alpha_\timg(\th_\tim)+
 %{\xi L}
 %({\red{\xi c} + \xi c^2 (\alpha_\tim^2+\alpha_\tim)}) 
 \\ 
 & \le-\med{\xi}\alpha_{k+j}g(\th_{k+j})+\xi Lc+\frac{2Lc^{3}\xi^{2}}{r\epsilon}\sum_{j=0}^{K}\alpha_{k+j}\\
 & \le-\med{\xi}\alpha_{k+j}r\epsilon+\xi Lc+\frac{2Lc^{3}\xi^{2}}{r\epsilon}\sum_{\t=0}^{K}\alpha_\tim\\
 & = \alpha_{k+j}(-\med{\xi}r\epsilon+\frac{2Lc^{3}\xi^{2}}{r\epsilon})+\xi Lc\\
 & \le\xi Lc.
\end{align*}
Here inequality (1) is by the result of Lemma \ref{lem: descent on F} and
the last inequality is by Assumption~\ref{asm: lr} on $\xi$. Combining
these results, we have, for any $k$, $\th_\tim\in\bar{P}_{\epsilon,\xi Lc}$.
\end{proof}


\subsubsection{Technical Lemmas} 
%\begin{tcolorbox} 
\begin{lemma}\label{lem:cgbound}
Assume Assumption~\ref{asm: bound}, \ref{asm: feasibility} hold. %\red{and $\phi_\tim\geq 0$}.
Define $g(\cc) = \min_{\omega\in \C^m} \norm{\sum_{\i=1}^m \omega_\i \dd \ell_\i(\cc)}^2$, where $\C^m$ is the probability simplex on $[m]$. 
Then for the $v_\tim$ and $\lambda_{i,t}$ defined in \eqref{opt: relax} and \eqref{equ:lambdatk}, we have 
%either $\lambda =0$, or 
$$
\sum_{\i=1}^m \lambda_{i,t} g(\cc_\tim) \leq \max\left(\phi_\tim + c \sqrt{g(\cc_\tim)}, ~0\right). 
$$
\end{lemma} 
\begin{proof} 
The complementary slackness condition of the constrained optimization in \eqref{opt: relax} says that
%for each $\i\in[m]$, we have either $\lambda_\i(\cc,\alpha)=0$, or $\dd \ell_\i(\cc)\tt d(\cc,\alpha)  = \phi(\cc,\alpha).$
\[\lambda_{i,t}\left(
\dd \ell_\i(\cc_\tim) \tt v_\tim -
\phi_\tim
\right)=0,~~~~\forall \i\in[m]. 
\]
%Plugging in 
Sum the equation over $\i\in[m]$ and 
note  
that $v_\tim = \dd F(\cc_\tim) + \sum_{\i=1}^m \lambda_{i,t} \dd \ell_\i(\cc_\tim)$. 
 We get 
\bbb \label{equ:eurd} 
\norm{\sum_{\i=1}^m \lambda_{i,t} \dd\ell_\i(\cc_\tim)}^2 + \left (\sum_{\i=1}^m \lambda_{i,t}  \dd \ell_\i(\cc_\tim) 
\right)\tt \dd F(\cc_\tim)  -\sum_{\i=1}^m \lambda_{i,t}  \phi_\tim  = 0. 
\eee 
%Without loss of generality, assume$\sum_{\i=1}^m \lambda_\i(\cc,\alpha) \neq 0$ (otherwise, there is nothing to prove).
Define 
\bb 
x_\tim= \norm{\sum_{\i=1}^m \lambda_{i,t} \dd\ell_\i(\cc_\tim)}^2,&&
\bar \lambda_\tim  = \sum_{\i=1}^m \lambda_{i,t}, &&
g_\tim=g(\cc_\tim) = \min_{\omega\in \C^m} \norm{\sum_{\i=1}^m\omega_\i \dd \ell_\i(\cc_\tim)}^2. 
\ee 
%Note that
Then it is easy to see that $x_\tim \geq \bar \lambda_\tim^2 g_\tim$. 
Using Cauchy-Schwarz inequality, 
$$
\abs{\left (\sum_{\i=1}^m \lambda_{i,t}\dd \ell_\i(\cc_\tim)\right )\tt \dd F(\cc_\tim)}
\leq \norm{\dd F(\cc_\tim)}\norm{\sum_{\i=1}^m \lambda_{i,t}\dd \ell_\i(\cc_\tim)} \leq c \sqrt{x_\tim}, 
$$
where we used $\norm{\dd F(\cc_\tim)} \leq c$ by Assumption~\ref{asm: bound}. 
Combining this with \eqref{equ:eurd}, we have 
$$
\abs{x_\tim - \bar \lambda_\tim \phi_\tim} \leq c \sqrt{x_\tim}. 
$$
Applying Lemma~\ref{lem:xcg} yields the result. %we have 
%$$
%\phi_\tim - c \sqrt{g_\tim} \leq \bar \lambda_\tim g_\tim \leq \phi_\tim + c \sqrt{g_\tim}. 
%$$
\end{proof}
%\end{tcolorbox}

\begin{lemma}\label{lem:xcg}
Assume $\phi \in \RR$, and 
$x,\lambda, c, g \in \RRplus$ are non-negative real numbers and they satisfy 
\bb 
\abs{x - \lambda \phi} \leq c \sqrt{x}, ~~~~~~~~ x\geq \lambda^2 g.
\ee 
Then we have either $\lambda =0,$ or 
%\phi - c \sqrt{g} \leq \lambda g \leq \phi + c \sqrt{g}. 
$
 %\text{either $\lambda =0,$  ~~~~or }~~~~
\phi - c \sqrt{g} \leq \lambda g \leq \phi + c \sqrt{g}. 
$
\end{lemma} 
\begin{proof}
Squaring the first inequality, we get 
$$f(x):= (x-\lambda \phi )^2 - c^2 x \leq 0,$$
%x^2 - (2 \lambda \phi +c^2) x^2 + \lambda^2 \phi ^2 \leq 0,$$
where $f$ is a quadratic function. 
To ensure that $f(x) \leq 0$ has a solution that satisfies $ x \ge \lambda^2 g$, we need to have $f(\lambda^2 g) \leq 0$, that is,  
%,~~~~~ x^2 \ge \lambda^2 g. 
%$$
%To have a feasible solution of $x$, we must have 
%To have this hold, we must have 
$$
f(\lambda^2g) = (\lambda^2g - \lambda \phi )^2 - c^2 \lambda ^2 g \leq 0.
%(\lambda^2 g)^2 - (2\lambda \phi +c^2) \lambda ^2 g + \lambda ^2 \phi ^2 \leq 0.
$$
%This gives $\lambda^2 g^2 - (2 \lambda\phi+c^2)g + \phi^2\leq 0$. Or 
%$$\lambda^2 g^2 - 2 \lambda g \phi - c^2g + \phi^2\leq 0.$$
%Then 
%$$|\lambda^2g - \lambda \phi | \leq c \lambda  g^{1/2}.
%\lambda g \leq \sqrt{\phi^2 - }
%$$
%which 
This can hold under two cases: 

Case 1: $\lambda =0;$

Case 2: $|\lambda g -\phi|\leq c \sqrt{g}$, and hence 
%This gives 
$\phi - c \sqrt{g} \leq \lambda g \leq \phi + c \sqrt{g}
$. 

In both cases, we have 
$$
\lambda g \leq \max(0, \phi + c \sqrt{g}). 
$$
%We have 
\end{proof}

\begin{lemma}\label{lem:alphak}
For any non-negative sequence $\{\alpha_\tim\colon \t =0,1,\cdots\} \subset \RRplus$,
assume $\sum_{\t=0}^\infty \alpha_\tim ^\gamma =: A^\gamma <\infty$, where $\gamma \geq 1$. 
Then we have $\alpha_{\max} : = \sup_{\t\geq 0} \alpha_\tim < \infty$, and 
$$
\sum_{\t=0}^K \alpha_\tim^2 +\alpha_\tim \leq C_{\alpha} (K+1)^{\frac{\gamma-1}{\gamma}},
$$
where $C_{\alpha} <\infty$ is a constant depending on $A$ and $\alpha_{\max}$. % : = \sup_{k\geq 0} \alpha_\tim < \infty$. 
\end{lemma}
\begin{proof}
Let $\eta = \frac{\gamma}{\gamma-1}$ (with $\eta = \infty$ when $\gamma = 1$), so that $1/ \eta  + 1/\gamma= 1$. We have by H\"older's inequality, 
\bb 
\sum_{\t=0}^K \alpha_\tim \leq \left( \sum_{\t=0}^K \alpha_\tim^\gamma \right )^{1/\gamma} \left (\sum_{\t=0}^K 1^\eta \right )^{1/\eta} 
\leq A (K+1)^{1/\eta}.
\ee 
If $\gamma \leq 2$, we have 
$$
\sum_{\t=0}^\infty\alpha_\tim^2 \leq 
\alpha_{\max}^{2-\gamma}\sum_{\t=0}^\infty\alpha_\tim^\gamma 
\leq \alpha_{\max}^{2-\gamma} A^\gamma,
$$
where $\alpha_{\max} = \sup_{\t\geq 0} \alpha_\tim$, and $\alpha_{\max} < \infty$ because $\{\alpha_\tim\}$ must be a bounded sequence. 
In this case, 
$$
\sum_{\t=0}^K \alpha_\tim^2 + \alpha_\tim \leq A(K+1)^{\frac{\gamma-1}{\gamma}} + \alpha_{\max}^{2-\gamma} A^\gamma %(K+1)^{\frac{\gamma-2}{\gamma}}
%\right ). 
$$
If $\gamma > 2$, let $\eta_2 = \frac{\gamma}{\gamma-2}$, so that $2/\gamma + 1/\eta_2 = 1$; 
we have by H\"older's inequality, 
\bb 
\sum_{\t=0}^K \alpha_\tim^2 \leq \left( \sum_{\t=0}^K \alpha_\tim^\gamma \right )^{2/\gamma} \left (\sum_{\t=0}^K 1^{\eta_2} \right )^{1/\eta_2} 
\leq A^2 (K+1)^{1/\eta_2}.
\ee 
Therefore,
$$
\sum_{\t=0}^K \alpha_\tim^2 + \alpha_\tim \leq A (K+1)^{\frac{\gamma-1}{\gamma}} + A^2 (K+1)^{\frac{\gamma-2}{\gamma}}. 
$$
Both cases yield the desirable result. 
\end{proof}
\fi 