\section{PROOF OF RESULTS IN SECTION~\ref{section:threestages}}\label{appendixA}
\textbf{Notation}.
Assumption~\ref{assumption:data} guarantees that the non-zero entries in $X$ are non-overlapping across rows. Therefore, we can partition the index set $I = \{1, \dots, D\}$ into $N$ disjoint subsets $I^{(1)}, \dots, I^{(N)}$ such that \begin{align}
    I = \bigcup_{n=1}^N I^{(n)},~I^{(n)} := \left\{ i \in [D] : x^{(n)}_i \neq 0 \right\}.
\end{align}
We define $\bm{w}^{+(n)}, \bm{w}^{-(n)}, \bm\beta^{(n)} \in \mathbb{R}^{D_n}$ as the subvectors of $\bm w^+$, $\bm w^-$ and $\bm\beta$ corresponding to the indices in $I^{(n)}$, respectively. Similarly, we define $\bm g^{+(n)}, \bm g^{-(n)}$ as subvectors of gradients $\nabla_{\bm w^+}L(\bm w), \nabla_{\bm w^-}L(\bm w)$ corresponding to the indices in $I^{(n)}$.
We let $\bm w^{(n)} := \begin{bmatrix}
    \bm w^{+(n)}, & \bm w^{-(n)}
\end{bmatrix} \in \mathbb{R}^{2D_n}$ and $\bm g^{(n)} := \begin{bmatrix}
    \bm g^{+(n)}, & \bm g^{-(n)}
\end{bmatrix} \in \mathbb{R}^{2D_n}$. The weight dynamics \eqref{eq:weightdynmics} can be decomposed into $N$ autonomous ODE systems:
\begin{equation}
    \frac{d\bm w^{(n)}(t)}{dt} = F^{(n)}\left(\bm w^{(n)}(t)\right) := -\frac{\bm g^{(n)}(t)}{\left|\bm g^{(n)}(t)\right| + \varepsilon \bm 1},
\label{eq:odesystem1block}\end{equation}
where $\bm w^{(n)}(0) = \alpha \bm{1}$ for each $n$. The residual for each $n$ is defined by $r^{(n)}(t) := y^{(n)} - \sum_{i=1}^{D_n} x^{(n)}_i \beta^{(n)}_i(t)$.
In this section, we prove the results for an arbitrary $n$. We omit the superscripts $(n)$ when possible to simplify the notation. 

\subsection{Proof of Proposition~\ref{proposition:signandmonotone}}
\begin{proof}
For all $i=1, \dots, D_n$, it is easy to see that $g^+_i(t) = -w_i^+(t) \cdot x_i \cdot r(t)$, $g_i^-(t) = w_i^-(t)\cdot x_i \cdot r(t)$. The dynamics follow
$w^+_i(t)' = -\frac{g_i^+(t)}{|g_i^+(t)| + \varepsilon},~w^-_i(t)' = -\frac{g_i^-(t)}{|g_i^-(t)| + \varepsilon}$. 

First, we show that for all $i$, $w^+_i(t), w^-_i(t) \geq 0$ always hold. Suppose for contradiction that $w^+_i(t') < 0$ for some $t'$. Since $w^+_i(0) = w^-_i(0) = \alpha > 0$, by continuity of $w^+_i(t)$, there exists $t_0 \in (0, t')$ such that $w^+_i(t_0) = 0$ and $w^+_i(t_0)' < 0$. However, $w^+_i(t_0) = 0$ implies $g^+_i(t_0) = 0$ and $w^+_i(t_0)' = 0$. Therefore, $w^+_i(t)$ never changes sign and is always non-negative. Similarly, we can show that $w^-_i(t)$ is always non-negative.

Next, we show that for each $i$, if $w^+_i(0)' > 0$, then $w^+_i(t)' \geq 0$ and $w^-_i(t)' \leq 0$ for all $t$. Relation $w^+_i(0)' = \alpha x_i y > 0$ implies that $x_i y > 0$. Therefore, $x_i r(0) = x_i y > 0$. Let us suppose for contradiction that there exists $t' > 0$ such that $x_i r(t') < 0$. By continuity of $x_i r(t)$, there exists $t_0 \in (0, t')$ such that $x_i r(t_0) = 0$. Since $x_i \neq 0$ by assumption, we must have $r(t_0) = 0$. In turn, $x_j r(t_0) = 0$ and $g^+_j(t_0) = g^-_j(t_0) = 0$ for all $j = 1, \dots, D_n$. As a result, $F^{(n)}\left(\bm w^{(n)}(t_0)\right) = \bm {0}$ and $\bm w^{(n)}(t_0)$ is an equilibrium of the autonomous ODE system \eqref{eq:odesystem1block}. It follows that for all $t \geq t_0$, $\bm w^{+(n)}(t) = \bm w^{+(n)}(t_0)$ and $\bm w^{-(n)}(t) = \bm w^{-(n)}(t_0)$. Therefore, we get $x_i r(t) = x_i r(t_0) = 0$ for all $t \geq t_0$. However, this contradicts that $x_i r(t') < 0$ and $t' > t_0$. Thus, we must have $x_i r(t) \geq 0$ for all $t \geq 0$. Since $w^+_i(t), w^-_i(t) \geq 0$, it follows that $g^+_i(t) \leq 0$ and $g^-_i(t) \geq 0$ for all $t$. We conclude $w^+_i(t)'
\geq 0$ and $w^-_i(t)' \leq 0$ for all $t$.

If $w^+_i(0)' = \alpha x_i y \leq 0$, since $x_i$ and $y$ are non-zero by assumption, we must have $x_i y < 0$. Using similar arguments, it follows that $w^+_i(t)' \leq 0$ and $w^-_i(t)' \geq 0$ for all $t$.
\end{proof}

\begin{lemma}\label{lemma:residualmonotone}
Residual $r(t)$ never changes sign and its absolute value is always non-increasing.
\end{lemma}

\begin{proof}
    We have $r(0) = y \neq 0$ by assumption. When $r(0) > 0$, suppose for contradiction that there exists $t' > 0$ such that $r(t') < 0$. By continuity, we must have $r(t_0) = 0$ for some $t_0 \in (0, t')$. It follows that $g^+_i(t_0) = g^-_i(t_0) = 0$ for all $i = 1, \dots, D_n$. In turn, $w^+_i(t_0)' = w^-_i(t_0)' = 0$ for all $i$ and $\bm w^{(n)}(t_0)$ is an equilibrium of the autonomous ODE system \eqref{eq:odesystem1block}. Therefore, $w^+_i(t) = w^+_i(t_0)$ and $w^-_i(t) = w^-_i(t_0)$ for all $t \geq t_0$. We conclude $r(t) = r(t_0) = 0$ for all $t \geq t_0$. This contradicts that $r(t') < 0$ for $t' > t_0$. As a result, $r(t) \geq 0$ for all $t$. Similarly, when $r(0) < 0$, it follows that $r(t) \leq 0$ for all $t$.

    Next, we compute the derivative of $r(t)$ with respect to $t$ as \begin{align*}
        r'(t) &= -2\sum_{i=1}^{D_n} x_i \left(w^+_i(t)\cdot w^+_i(t)' - w^-_i(t) \cdot w^-_i(t)'\right) \\
        &= -2\sum_{i=1}^{D_n} x_i \left(w^+_i(t)\cdot \frac{w^+_i(t) x_i r(t)}{|w^+_i(t) x_i r(t)| + \varepsilon} - w^-_i(t)\cdot \frac{-w^-_i(t) x_i r(t)}{|w^-_i(t) x_i r(t)| + \varepsilon}\right) \\
        &= -2\sum_{i=1}^{D_n} x_i^2\left(\frac{(w^+_i(t))^2}{|w^+_i(t) x_i r(t)| + \varepsilon} + \frac{(w^-_i(t))^2}{|w^-_i(t) x_i r(t)| + \varepsilon}\right)r(t).
    \end{align*}
    Notice that $x_i^2\left(\frac{(w^+_i(t))^2}{|w^+_i(t) x_i r(t)| + \varepsilon} + \frac{(w^-_i(t))^2}{|w^-_i(t) x_i r(t)| + \varepsilon}\right) \geq 0$. When $r(0) > 0$, we have shown that $r(t) \geq 0$ for all $t$. It follows that $r'(t) \leq 0$ for all $t$. Similarly, when $r(0) < 0$, we have $r(t) \leq 0$ and $r'(t) \geq 0$ for all $t$. Hence, the magnitude of the residual $r(t)$ is always non-increasing.
\end{proof}

Following the notation in Section~\ref{section:threestages}, we let $u_i$ denote the dominating weight, and let $v_i$ represent the non-dominating weight. We repeat the definition here for clarity.
\begin{align*}
    u_i(t) &:= \begin{cases}
        w_i^+(t) & \text{ if } w_i^+(0)' > 0, \\
        w_i^-(t) & \text{else}
    \end{cases}\\
    v_i(t) &:= \begin{cases}
        w_i^-(t) & \text{ if } w_i^+(0)' > 0, \\
        w_i^+(t) & \text{else}
    \end{cases}
\end{align*}
If $x_i y > 0$, then $\beta_i = u_i^2 - v_i^2$; if $x_i y < 0$, then $\beta_i = -u_i^2 + v_i^2$. Therefore, for all $i$, \begin{align}
    \beta_i(t) &= \operatorname{sgn}(x_i y)\left(u_i^2(t) - v_i^2(t)\right).\label{eq:signofbeta}
    \end{align}
We let $f_i(t) := [\nabla_{\bm u} L(\bm w(t))]_i$, $h_i(t) := [\nabla_{\bm v} L(\bm w(t))]_i$ denote the $i$-th component of the gradient with respect to $\bm u$ and $\bm v$, respectively, which varies across all coordinates. By calculating the gradient, we derive the expressions
\begin{align*}
    f_i(t) &= -u_i(t)|x_i r(t)|,~h_i(t) = v_i(t)|x_i r(t)|.
    \end{align*}
In turn, we have \begin{align*}
    u'_i(t) &= -\frac{f_i(t)}{|f_i(t)| + \varepsilon} = \frac{u_i(t) |x_i r(t)|}{u_i(t) |x_i r(t)| + \varepsilon},\\
    v'_i(t) &= -\frac{h_i(t)}{|h_i(t)| + \varepsilon} = - \frac{v_i(t) |x_i r(t)|}{v_i(t) |x_i r(t)| + \varepsilon}.
\end{align*}
The residual can be written as \begin{align}
    r(t)  &= y - \sum_{k=1}^{D_n} x_k\beta_k(t) \label{eq:expressionofrtinbeta} \\
    &= y - \sum_{k=1}^{D_n} \operatorname{sgn}(y)\operatorname{sgn}(x_k)\cdot x_k\left(u_k^2(t) - v_k^2(t) \right) \\
    &= \operatorname{sgn}(y)\left(|y| - \sum_{k=1}^{D_n} |x_k| \left(u_k^2(t) - v_k^2(t) \right)\right).
\end{align}
By Lemma~\ref{lemma:residualmonotone}, $r(t)$ never changes sign. Since $r(0) = \operatorname{sgn}(y)|y|$, then for all $t$, \begin{equation}\label{eq:absresidual}
    |r(t)| = |y| - \sum_{k=1}^{D_n} |x_k|\left(u_k^2(t) - v_k^2(t) \right).
\end{equation}
\subsection{Proof of Proposition~\ref{proposition:initialstage}}
\begin{proof}
    First, we show the existence of $t_i > 0$ such that $h_i(t_i) = \varepsilon$ for each $i$. By Assumption~\ref{assumption:epsilonalpha}, $h_i(0) = \alpha|x_i y| \geq 2\varepsilon$. Let us suppose for contradiction that $h_i(t) > \varepsilon$ for all $t$. Then $v'_i(t) < -\frac{\varepsilon}{\varepsilon + \varepsilon} = -\frac{1}{2}$ for all $t$, and for $t > 2\alpha$, $v_i(t) < \alpha - \frac{1}{2} t < 0$. However, by Proposition~\ref{proposition:signandmonotone}, $v_i(t)$ is always non-negative. It yields $h_i(t_i') \leq \varepsilon$ for some $t_i'$. Since $h_i(0) \geq 2\varepsilon$, by continuity of $h_i(t)$, there exists $t_i$ such that $h_i(t_i) = \varepsilon$; we take $t_i$ to be the smallest such time. It follows that $t_i \leq 2\alpha$. Indeed, because $h_i(t) \geq \varepsilon$ for $t \leq t_i$, we have $v_i'(t) \leq -\frac{1}{2}$ on $[0, t_i]$. If $t_i > 2\alpha$, then $v_i(t_i) \leq \alpha - \frac{1}{2} t_i < 0$, which is a contradiction. We conclude $t_i \in (0, 2\alpha]$.

    Next, we show that $|r(t_i)|$ is lower bounded. Using \eqref{eq:absresidual}, we get \begin{align*}
        |r(t)| &= |y| - \sum_{k=1}^{D_n} |x_k| \left(u_k^2(t) - v_k^2(t)\right)\\
        &\geq |y| - \sum_{k=1}^{D_n} |x_k| u_k^2(t).
    \end{align*}
    For all $k$, $u_k'(t) \leq 1$ always holds. It follows that $u_k(t) \leq \alpha + t$ for all $t$. Since $|r(t)|$ is non-increasing by Lemma~\ref{lemma:residualmonotone}, and using $t_i \leq 2\alpha$, it follows that
    \begin{equation*}
        |r(t_i)| \geq |r(2\alpha)| \geq |y| - \sum_{k=1}^{D_n} |x_k| \left(\alpha + 2\alpha\right)^2 = |y| - 9\alpha^2 \sum_{k=1}^{D_n} |x_k|.
    \end{equation*}
    By Assumption~\ref{assumption:epsilonalpha}, $9\alpha^2 \leq \frac{|y|}{2\sum_{k=1}^{D_n} |x_k|}$, and thus \begin{equation}
        |r(t_i)| \geq |r(2\alpha)| \geq |y| - \frac{|y|}{2} = \frac{|y|}{2}.\label{eq:lowerboundresidualat2alpha}
    \end{equation}
    Since $h_i(t_i) = v_i(t_i)|x_i r(t_i)| = \varepsilon$, we get $v_i(t_i) \leq \frac{2\varepsilon}{|x_i y|}$. Function $v_i(t)$ is non-increasing by Proposition~\ref{proposition:signandmonotone}, so for all $t \geq t_i$ we have $v_i(t) \leq \frac{2\varepsilon}{|x_i y|}$. The argument holds for all $i = 1, \dots, D_n$ and for all $n$. We complete the proof by letting $T_0 := \max \{ t_i \}$.
\end{proof}

\subsection{Proof of Proposition~\ref{proposition:dualdynamics}}
\begin{proof}
    Let us define the potential function by $\Phi_t(\bm \beta(t)) := \frac{2}{3}\sum_{i=1}^{D}\left(|\beta_i(t)| + v^2_{i, t}\right)^{\frac{3}{2}}$ for all $t$, where $v_{i, t} := v_i(t)$ is a parameter for the time-varying potential. We get the dual variable using the mirror map \begin{equation*}
        \nabla \Phi_t(\bm\beta(t)) = \operatorname{sgn}(\bm \beta (t))\odot \left(|\bm\beta(t)| + \bm v^2_t\right)^{\frac{1}{2}},
    \end{equation*} where operations are taken element-wise. The Hessian $\nabla^2 \Phi_t(\bm\beta(t))$ is a diagonal matrix with diagonal elements $\frac{1}{2\left(|\beta_i(t)| + v^2_{i,t}\right)^{\frac{1}{2}}}$ (for $\beta_i(t) \neq 0$).
    Using the chain rule, we compute the dual dynamics \begin{align*}
        \frac{d\nabla \Phi_t(\bm \beta(t))}{dt} = &\langle \nabla^2 \Phi_t(\bm\beta(t)), \frac{d\bm \beta(t)}{dt} \rangle + \langle \nabla_{\bm v} \nabla \Phi_t(\bm\beta(t)), \frac{d\bm v(t)}{dt}\rangle \\
        = &\operatorname{sgn}(\bm\beta(t)) \odot \left(|\bm\beta(t)| + \bm v_t^2\right)^{-\frac{1}{2}} \odot \left( \bm u(t) \odot \frac{d\bm u(t)}{dt} - \bm v(t) \odot \frac{d\bm v(t)}{dt}\right) \\
        &+ \operatorname{sgn}(\bm \beta(t)) \odot \left( |\bm \beta (t)| + \bm v^2_t \right)^{-\frac{1}{2}} \odot \bm v(t) \odot \frac{d\bm v(t)}{dt} \\
        = &\operatorname{sgn}(\bm\beta(t)) \odot \frac{d\bm u(t)}{dt} \\
        = &-\operatorname{sgn}(\bm\beta(t)) \odot \frac{\nabla_{\bm u}L(\bm w(t))}{\left|\nabla_{\bm u}L(\bm w(t))\right|+\varepsilon \bm{1}}.
    \end{align*}
\end{proof}

\begin{lemma}\label{lemma:xixjcomparison}
    For all $i, j \in \{1, \dots, D_n \}$, $|x_i| \geq |x_j|$ implies $u_i(t) \geq u_j(t)$ for all $t \geq 0$.
\end{lemma}

\begin{proof}
    If $|x_i| = |x_j|$, since $u_i(0) = u_j(0) = \alpha$, then $u'_i(t) = u'_j(t)$ and $u_i(t) = u_j(t)$. Suppose $|x_i| > |x_j|$. Let $\bar{u}(t) := u_i(t) - u_j(t)$. Then we have $\bar{u}(0) = \alpha - \alpha = 0$.
    
    First, we show that there exists a small neighborhood $\mathcal{B}$ such that $\bar{u}(t) > 0$ for $t \in \mathcal{B}$. Because $u_i(t), u_j(t)$ are differentiable everywhere, $\bar{u}(t)$ is differentiable for all $t \geq 0$. Inequality $|x_i| > |x_j|$ implies $u_i'(0) = \frac{\alpha |x_i y|}{\alpha |x_i y| + \varepsilon} > \frac{\alpha |x_j y|}{\alpha|x_j y| + \varepsilon} = u_j'(0)$. As a result, $\bar{u}'(0) > 0$. Using differentiability of $\bar{u}(t)$ at $t=0$, we get
    \begin{equation}
        \lim_{\tau\to 0^+} \frac{\bar{u}(\tau) - \bar{u}(0)}{\tau - 0} = \lim_{\tau\to 0^+} \frac{\bar{u}(\tau)}{\tau} = \bar{u}'(0).\label{eq:limat0}
    \end{equation}
   Let $\epsilon_{\tau} := \frac{\bar{u}'(0)}{3} > 0$. By definition of limit in \eqref{eq:limat0}, there exists $\delta_{\tau} > 0$ such that for all $\tau \in (0, \delta_{\tau})$, $\left |\frac{\bar{u}(\tau)}{\tau} - \bar{u}'(0)\right | < \epsilon_{\tau}$. Therefore, $\frac{\bar{u}(\tau)}{\tau} - \bar{u}'(0) > -\epsilon_\tau = -\frac{\bar{u}'(0)}{3}$. It follows that $\bar{u}(\tau) > \frac{2\tau}{3} \bar{u}'(0) > 0$ for all $\tau \in (0, \delta_\tau)$.
    
    Next, we show that $\bar{u}(t) \geq 0$ for all $t > 0$. Suppose for contradiction that there exists $t > 0$ such that $\bar{u}(t) < 0$. Let $t_0 := \inf\{ t : t > 0,~\bar{u}(t) < 0\}$. We have $\bar{u}(t_0) \leq 0$ and $\bar{u}(t) \geq 0$ for $t \in (0, t_0)$ by definition. We must have $t_0 \geq \delta_{\tau} > \frac{\delta_{\tau}}{2} > 0$ as we have shown that $\bar{u}(t) > 0$ for $t \in (0, \delta_{\tau})$. Since $\bar{u}(t)$ is differentiable, by the Mean Value Theorem, there exists $t_1 \in (\frac{\delta_{\tau}}{2}, t_0)$ such that $\bar{u}'(t_1) = \frac{\bar{u}(t_0) - \bar{u}(\frac{\delta_{\tau}}{2})}{t_0 - \frac{\delta_{\tau}}{2}}$. Since $\bar{u}(t_0) \leq 0$ and $\bar{u}(\frac{\delta_{\tau}}{2}) > 0$, we have $\bar{u}'(t_1) < 0$. Therefore,
    \begin{align*}
        \bar{u}'(t_1) &= u_i'(t_1) - u_j'(t_1) \\
        &= \frac{u_i(t_1)|x_i r(t_1)|}{ u_i(t_1)|x_i r(t_1)| + \varepsilon} -\frac{u_j(t_1)|x_j r(t_1)|}{ u_j(t_1)|x_j r(t_1)| + \varepsilon} \\
        &< 0.
    \end{align*}
    The inequality implies that $u_i(t_1)|x_i r(t_1)|  < u_j(t_1) |x_j r(t_1)|$. By assumption, we have $|x_i| > |x_j|$, and thus we must have $u_i(t_1) < u_j(t_1)$, i.e., $\bar{u}(t_1) < 0$. However, $t_1 < t_0$ and this contradicts that $\bar{u}(t) \geq 0$ for all $t \in (0, t_0)$. Thus, $u_i(t) \geq u_j(t)$ always holds.
\end{proof}

\subsection{Proof of Proposition~\ref{proposition:mainstage}}
\begin{proof}
First, we show that for all $i$, there exists $T_i$ such that $\left|\nabla_{\bm u}L(\bm w(T_i))\right|_i = f_i(T_i) = \varepsilon$. 
Suppose for contradiction that $f_i(t) > \varepsilon$ for all $t$. We have $u'_i(t) > \frac{1}{2}$ and $u_i(t) \geq \alpha + \frac{t}{2}$. Without loss of generality, we assume $r(0) = y > 0$. For $t > 2\sqrt{\frac{|y|}{|x_i|}}$, the residual is negative due to \begin{align*}
    r(t) &= y - |x_i|\left(u_i^2(t) - v_i^2(t)\right) - \sum_{k \neq i}^{D_n} |x_k|\left(u_k^2(t) - v_k^2(t)\right) \\
    &\leq y - |x_i|\left(u_i^2(t) - v_i^2(t)\right) \\
    &\leq y - |x_i|\left(\left(\alpha + \sqrt{\frac{|y|}{|x_i|}}\right)^2 - \alpha^2\right) \\
    &< y - |x_i| \frac{|y|}{|x_i|} \\
    &= 0.
\end{align*}
However, this contradicts that $r(t)$ never flips sign by Lemma~\ref{lemma:residualmonotone}. Hence, there exists $T_i'$ such that $f_i(T_i') \leq \varepsilon$. By continuity, there exists $t \in (0, T_i']$ such that $f_i(t) = \varepsilon$. Let $T_i := \min\left\{t : 0 \leq t \leq T'_i,~ f_i(t) =\varepsilon\right\}$. Therefore, \begin{equation}
    f_i(T_i) = \varepsilon,\text{ and }f_i(t) > \varepsilon~\text{for }t < T_i.\label{eq:fT_icomparison}
\end{equation}

Next, we show that $T_i > 2\alpha \geq T_0$. In \eqref{eq:lowerboundresidualat2alpha} we have proved that $|r(2\alpha)| \geq \frac{|y|}{2}$. Since $u_i(t) \geq \alpha$ and $|r(t)| \geq |r(2\alpha)|$ for $t \leq 2\alpha$, then $f_i(t) = u_i(t) |x_i r(t)| \geq \alpha \frac{|x_i y|}{2}$. By Assumption~\ref{assumption:epsilonalpha}, we have $\alpha > \frac{2\varepsilon}{|x_i y|}$. As a result, $f_i(t) > \varepsilon$ for all $t \leq 2\alpha$. Therefore, we must have $T_i > 2\alpha \geq T_0$.

We need to show that the derivative of $f_i(t)$ is always non-positive for $t \geq T_i$. Using the expression for $|r(t)|$ in \eqref{eq:absresidual}, we get
\begin{align*}
    f_i'(t) &= |x_i|\left(u_i'(t) |r(t)| + u_i(t)|r(t)|'\right) \\
    &= |x_i|\left(u'_i(t)|r(t)| + u_i(t)\left(-2\sum_{k=1}^{D_n} |x_k|u_k(t)u'_k(t) + 2\sum_{k=1}^{D_n} |x_k|v_k(t)v'_k(t)\right)\right). 
\end{align*}
Since $v'_k(t) \leq 0$ for all $k$, we get
\begin{equation}\label{eq:fderivativeform1}
    f_i'(t) \leq |x_i|\left(u'_i(t)|r(t)| - 2u_i(t)\sum_{k=1}^{D_n} |x_k|u_k(t)u'_k(t)\right).
\end{equation}

Next, we want to find a lower bound for $2u_i(t)\sum_{k=1}^{D_n} |x_k|u_k(t)u'_k(t)$ for $t \geq T_i$. We denote the index set by $\mathcal{I} := \left \{1, \dots, D_n\right \}$ that we partition as $\mathcal{I} = \mathcal{I}_i^+ \cup \mathcal{I}_i^-$, where $\mathcal{I}_i^+ := \left\{ k : |x_k| \geq |x_i| \right\}$ and $\mathcal{I}_i^- := \left\{ k : |x_k| < |x_i| \right\}$. 
For $k \in \mathcal{I}_i^-$, since $|x_k| < |x_i|$, we have $u_k(t) \leq u_i(t)$ by Lemma~\ref{lemma:xixjcomparison}. As a result, $u_k(t)|x_kr(t)| \leq u_i(t)|x_ir(t)|$. In turn, for all $k \in \mathcal{I}_i^-$ and for all $t$, \begin{align}
    u_i(t) u_k'(t) &= u_i(t) \frac{u_k(t)|x_k r(t)|}{u_k(t)|x_k r(t)| + \varepsilon} \\
    &\geq u_i(t) \frac{u_k(t)|x_k r(t)|}{u_i(t)|x_i r(t)| + \varepsilon} \\
    &\geq \frac{u_k(t)|x_k|}{|x_i|}\cdot \frac{u_i(t)|x_i r(t)|}{u_i(t)|x_i r(t)| + \varepsilon}\\
    &= \frac{|x_k|}{|x_i|}u_k(t)u'_i(t) \label{eq:inequalityforI-}.
\end{align}
For $k \in \mathcal{I}_i^+$, similarly, we have $u_k(t) \geq u_i(t)$ for all $t$. We also have $f_k(t) = u_k(t)|x_k r(t)| \geq u_i(t)|x_i r(t)| = f_i(t)$. In turn, $u'_k(t) \geq u_i'(t)$. Therefore, for all $k \in \mathcal{I}_i^+$ and for all $t$, we have
\begin{equation}
    u_i(t)u_k'(t) \geq u_i(t)u_i'(t)\label{eq:inequalityforI+1}.
\end{equation}
Using \eqref{eq:inequalityforI-} and \eqref{eq:inequalityforI+1}, we get
\begin{align*}
    2u_i(t)\sum_{k=1}^{D_n} |x_k|u_k(t)u_k'(t) 
    &= 2\sum_{k\in\mathcal{I}_i^+} |x_k|u_k(t) u_i(t)u_k'(t) + 2\sum_{k\in\mathcal{I}_i^-} |x_k|u_k(t)u_i(t)u_k'(t) \\
    &\geq 2\sum_{k\in\mathcal{I}_i^+} |x_k|u_k(t) u_i(t)u_i'(t) + 2\sum_{k\in\mathcal{I}_i^-} \frac{|x_k|}{|x_i|}\cdot|x_k|u_k^2(t)u_i'(t) \\
    &= 2u'_i(t) \left( \sum_{k\in\mathcal{I}_i^+} |x_k|u_i(t)u_k(t) + \sum_{k\in\mathcal{I}_i^-} \frac{|x_k|}{|x_i|}|x_k|u_k^2(t)\right).
\end{align*}
Because $u_i(t)$ is non-decreasing for all $i$, it follows that for $t \geq T_i$,
\begin{align*}
    2u'_i(t) \left( \sum_{k\in\mathcal{I}_i^+} |x_k|u_i(t)u_k(t) + \sum_{k\in\mathcal{I}_i^-} \frac{|x_k|}{|x_i|}|x_k|u_k^2(t)\right) &\geq 2u'_i(t) \left( \sum_{k\in\mathcal{I}_i^+} |x_k|u_i(T_i)u_k(T_i) + \sum_{k\in\mathcal{I}_i^-} \frac{|x_k|}{|x_i|}|x_k|u_k^2(T_i)\right).
\end{align*}
Moreover, for $t \in [0, T_i]$, $u'_i(t) \geq \frac{1}{2}$ and $u'_k(t) \leq 1$. As a result, $u'_i(t) \geq \frac{1}{2}u'_k(t)$. In turn, we have \begin{equation}
    u_i(T_i) \geq \alpha + \frac{1}{2}\left(u_k(T_i) - \alpha \right) > \frac{1}{2}u_k(T_i).\label{eq:uigreaterthanuk}
\end{equation}
We also know that $\frac{|x_k|}{|x_i|} \geq \frac{\min_j \{ |x_j| \}}{|x_i|}$ for all $k \in \mathcal{I}_i^-$, and $1 \geq \frac{\min_j \{ |x_j| \}}{|x_i|}$. Using \eqref{eq:uigreaterthanuk}, we get \begin{align}
     2u_i(t)\sum_{k=1}^{D_n} |x_k|u_k(t)u_k'(t) &\geq u'_i(t) \left( \sum_{k\in\mathcal{I}_i^+} 2|x_k|\frac{1}{2}u_k(T_i)u_k(T_i) + \sum_{k\in\mathcal{I}_i^-} \frac{2|x_k|}{|x_i|}|x_k|u_k^2(T_i)\right) \\
     &= u'_i(t) \left( \sum_{k\in\mathcal{I}_i^+} |x_k|u_k^2(T_i) + \sum_{k\in\mathcal{I}_i^-} \frac{2|x_k|}{|x_i|}|x_k|u_k^2(T_i)\right) \\
     &\geq u'_i(t) \left( \sum_{k\in\mathcal{I}_i^+} \frac{\min_j \{ |x_j| \}}{|x_i|}|x_k|u_k^2(T_i) + \sum_{k\in\mathcal{I}_i^-} \frac{\min_j \{ |x_j| \}}{|x_i|}|x_k|u_k^2(T_i)\right) \\
     &= u'_i(t) \frac{\min_j\{|x_j|\}}{|x_i|}\sum_{k=1}^{D_n}|x_k|u_k^2(T_i).\label{eq:lowerboundforsum}
\end{align}

Next, we consider \eqref{eq:fderivativeform1} by using \eqref{eq:lowerboundforsum}. Since $|r(t)|$ is non-increasing, for all $t \geq T_i$, we have
\begin{align}
    f'_i(t) &\leq |x_i|\left(u'_i(t)|r(t)| - u'_i(t) \frac{\min_j\{|x_j|\}}{|x_i|}\sum_{k=1}^{D_n}|x_k|u_k^2(T_i)\right) \\
    &\leq |x_i|\left(u'_i(t)|r(T_i)| - u'_i(t) \frac{\min_j\{|x_j|\}}{|x_i|}\sum_{k=1}^{D_n}|x_k|u_k^2(T_i)\right) \\
    &\leq |x_i|u'_i(t)\left(|r(T_i)| - \frac{\min_j\{|x_j|\}}{|x_i|}\sum_{k=1}^{D_n}|x_k|u_k^2(T_i)\right).\label{eq:proposition36finalinequality2}
\end{align}

At $t = T_i$, we know that $u_i(T_i) \geq \alpha$ and $f_i(T_i) = u_i(T_i)|x_i r(T_i)| = \varepsilon$. By Assumption~\ref{assumption:epsilonalpha}, we have $\alpha > \frac{2\varepsilon}{|x_j y|}$ for all $j$. We must have 
\begin{equation}
    |r(T_i)| = \frac{f_i(T_i)}{u_i(T_i) |x_i|} \leq \frac{\varepsilon}{\alpha |x_i|} < \frac{\varepsilon}{|x_i|}\cdot \frac{\min_j\{|x_j|\}|y|}{2\varepsilon} = \frac{1}{2}\frac{\min_j\{|x_j|\}}{|x_i|}|y|,\label{eq:proposition36finalinequality3}
\end{equation}
which implies
\begin{equation*}
    |y| - \sum_{k=1}^{D_n} |x_k|\left( u_k^2(T_i) - v_k^2(T_i) \right) < \frac{1}{2}\frac{\min_j\{|x_j|\}}{|x_i|}|y|.
\end{equation*}
Thus, \begin{equation}
    \sum_{k=1}^{D_n} |x_k|u_k^2(T_i) \geq \sum_{k=1}^{D_n} |x_k|\left( u_k^2(T_i) - v_k^2(T_i) \right) > \left(1 - \frac{1}{2}\frac{\min_j\{|x_j|\}}{|x_i|}\right)|y| \geq \frac{1}{2}|y|.\label{eq:proposition36finalinequality}
\end{equation}
By using \eqref{eq:proposition36finalinequality3} and \eqref{eq:proposition36finalinequality} in \eqref{eq:proposition36finalinequality2}, we get
\begin{align*}
    f_i'(t) &\leq |x_i|u'_i(t)\left( \frac{1}{2}\frac{\min_j\{|x_j|\}}{|x_i|}|y| - \frac{\min_j\{|x_j|\}}{|x_i|}\sum_{k=1}^{D_n}|x_k|u_k^2(T_i) \right) \\
    &\leq |x_i|u'_i(t)\left( \frac{1}{2}\frac{\min_j\{|x_j|\}}{|x_i|}|y| - \frac{\min_j\{|x_j|\}}{|x_i|}\frac{1}{2}|y| \right) \\
    &\leq 0.
\end{align*}

Hence, for all $t\geq T_i$, $f_i(t)$ is non-increasing. We conclude that for each $i$, there exists $T_i > T_0$ such that $f_i(t) > \varepsilon$ for $t < T_i$, and $f_i(t) \leq \varepsilon$ for $t \geq T_i$.
\end{proof}


\begin{lemma}[Convergence]\label{lemma:convergence}
    As $t\to\infty$, for every $n$ we have \begin{align*}
        &\lim_{t \to \infty} r^{(n)}(t) = 0, \\
        &\lim_{t\to\infty}\nabla_{\bm w}L(\bm w(t)) = \bm 0, \\
        &\bm u^{\infty} := \lim_{t\to\infty}(\bm u(t)) \text{~with}~u_i^\infty < \infty~\forall i, \\
        &\bm v^{\infty} := \lim_{t\to\infty}(\bm v(t)) \text{~with}~v_i^\infty < \infty~\forall i.
    \end{align*}
\end{lemma}

\begin{proof}
    Without loss of generality, we assume $r(0) = y > 0$. By Lemma~\ref{lemma:residualmonotone}, $r(t)$ is bounded below by $0$ and monotonically non-increasing in $t$. Therefore, $r(t)$ converges as $t \rightarrow \infty$ by calculus. Let $R_0 := \lim_{t \to \infty} r(t) \geq 0$. We want to show that $R_0 = 0$. Suppose for contradiction that $R_0 > 0$. We have $r(t) \geq R_0 > 0$ for all $t \geq 0$.
    
    We first show that $u_k'(t)$ is bounded below by a positive number for all $k$. Since $u_k(t) \geq \alpha$ and $r(t) \geq R_0$ for all $t$, we have $f_k(t) = u_k(t)|x_k|r(t) \geq \alpha |x_k|R_0 > 0$. Therefore, for all $t \geq 0$,
    \begin{equation*}
        u_k'(t) = \frac{f_k(t)}{f_k(t) + \varepsilon} \geq \frac{\alpha |x_k|R_0}{\alpha |x_k|R_0 + \varepsilon} > 0.
    \end{equation*}
    As a result, $u_k(t) \geq \alpha + t \cdot \frac{\alpha |x_k|R_0}{\alpha |x_k|R_0 + \varepsilon}$. 
    Recall that \begin{align*}
        r (t) &= y - \sum_{k=1}^{D_n}|x_k|\left(u_k^2(t) - v_k^2(t) \right) \\
        &\leq y - \sum_{k=1}^{D_n}|x_k|\left(u_k^2(t)- \alpha^2\right).
    \end{align*}
    As $t \to \infty$, $u_k^2(t) \rightarrow \infty$, and the summation $\sum_{k=1}^{D_n}|x_k|u_k^2(t)$ is unbounded. We conclude that $r(t) < 0$ for sufficiently large $t$. This contradicts that $r(t) \geq 0$ for all $t$ by Lemma~\ref{lemma:residualmonotone}. Thus, we must have $R_0 = \lim_{t \to \infty} r(t) = 0$.

    The argument holds for all $n$, so $\lim_{t\to\infty}r^{(n)}(t) = 0$ for all $n = 1, \dots, N$. As a result, we have $\lim_{t\to\infty}[\nabla_{\bm u}L(\bm w(t))]_i = 0$ and $\lim_{t\to\infty}[\nabla_{\bm v}L(\bm w(t))]_i = 0$ for all $i$. It follows that $\lim_{t\to\infty}\nabla_{\bm w} L (\bm w(t)) = \bm 0$.

    Next, we show that the weights converge as $t\rightarrow \infty$. Without loss of generality, we suppose $r(0) = y > 0$. Because $r(t)$ never changes sign by Lemma~\ref{lemma:residualmonotone}, we have $0 \leq r(t) \leq y - \sum_{k=1}^{D_n}|x_k|\left(u_k^2(t)- \alpha^2\right)$. As a result, $u_k(t)$ is upper bounded. Since $u_k(t)$ is non-decreasing, we have $u_k^{\infty} := \lim_{t\to\infty}u_k(t) < \infty$ by calculus. Using a similar argument for $v_k(t)$ which is non-increasing, $v_k^{\infty} := \lim_{t\to\infty}v_k(t) < \infty$. The proof holds for all $k$ and all $n$. Therefore, $\bm u^{\infty} := \lim_{t\to\infty}\bm u(t)$ exists with $u_i^\infty < \infty$ for all $i$, and $\bm v^{\infty} := \lim_{t\to\infty}\bm v(t)$ exists with $v_i^\infty < \infty$ for all $i$.
\end{proof}

\section{PROOF OF RESULTS IN SECTION~\ref{section:characterization}}\label{appendixB}
Using Assumption~\ref{assumption:data2}, we parameterize the dynamics using $\theta_1$ and $\lambda_1$ with $|\cos\theta_1| \geq |\sin\theta_1| > 0$ and $\lambda_1 > 0$. We let $y := y^{(1)}$, $\theta := \theta_1$, $\lambda := \lambda_1$ and $\Tilde{y} := \frac{y^{(1)}}{\sqrt{\lambda_1}}$ to simplify the notation in the proofs. We have
\begin{align*}
    |r(t)| &= |\Tilde{y}| - |\cos\theta| \left(u_1^2(t)-v_1^2(t)\right) - |\sin\theta| \left(u_2^2(t) - v_2^2(t)\right),\\
    f_1(t) &= \lambda u_1(t) |\cos\theta r(t)|, \\
    f_2(t) &= \lambda u_2(t) |\sin\theta r(t)|, \\
    u'_1(t) &= \frac{f_1(t)}{f_1(t) + \varepsilon},~u'_2(t) = \frac{f_2(t)}{f_2(t) + \varepsilon}.
\end{align*}

\begin{lemma}\label{lemma:ratio}
    We have $u'_1(t) \geq u_2'(t)$ for $ t \in [0, T)$, and $u'_1(t) \geq \frac{2|\cot\theta|}{1+|\cot\theta|} u_2'(t)$ for $ t \in [T, \infty)$. Quantity $T$ is the stage transition time as in Proposition~\ref{proposition:mainstage}.
\end{lemma}

\begin{proof}
    First, we show that for all $t \geq 0$, 
    \begin{equation}\label{eq:derivativeratiolb}
        u'_1(t) \geq \frac{|\cot\theta|\left(f_2(t) + \varepsilon\right)}{|\cot\theta| f_2(t)+\varepsilon} u_2'(t).
    \end{equation}
    Since $|\cos\theta| \geq |\sin\theta| > 0$ by Assumption~\ref{assumption:data2}, we have $u_1(t) \geq u_2(t)$ by Lemma~\ref{lemma:xixjcomparison}. As a result,
    \begin{align*}
        f_1(t) &= \lambda u_1(t) |\cos \theta r(t)| \\
        &= |\cot\theta| \lambda u_1(t) |\sin \theta r(t)|\\
        &\geq |\cot\theta| \lambda u_2(t) |\sin \theta r(t)| \\
        &= |\cot\theta| f_2(t).
    \end{align*}
    Therefore,
    \begin{equation}
        u_1'(t) = \frac{f_1(t)}{f_1(t)+\varepsilon}
        = 1 - \frac{\varepsilon}{f_1(t) + \varepsilon}
        \geq 1 - \frac{\varepsilon}{|\cot\theta| f_2(t) + \varepsilon}
        = \frac{|\cot\theta| f_2(t)}{|\cot\theta| f_2(t) + \varepsilon}.\label{eq:u1'inequality1}
    \end{equation}
    When $u'_2(t) = 0$, \eqref{eq:derivativeratiolb} holds since $u'_1(t)$ is always non-negative. When $u'_2(t) \neq 0$, using \eqref{eq:u1'inequality1}, we have that \eqref{eq:derivativeratiolb} holds:
    \begin{align*}
        \frac{u'_1(t)}{u'_2(t)} &= \frac{f_1(t)}{f_1(t)+\varepsilon} \cdot \frac{f_2(t) + \varepsilon}{f_2(t)} \\
        &\geq \frac{|\cot\theta| f_2(t)}{|\cot\theta| f_2(t) + \varepsilon} \cdot \frac{f_2(t) + \varepsilon}{f_2(t)} \\
        &= \frac{|\cot\theta|(f_2(t) + \varepsilon)}{|\cot\theta| f_2(t) + \varepsilon}.
    \end{align*}
    By Proposition~\ref{proposition:mainstage}, there exist stage transition times $T_1, T_2$ for $f_1(t)$ and $f_2(t)$, respectively. We know that $f_1(t) \leq \varepsilon$ for $t \geq T_1$. Since $|\cos\theta| \geq |\sin\theta|$, $f_1(t) \geq f_2(t) > \varepsilon$ for $t \in [0, T_2)$. As a result, we must have $T_1 \geq T_2$. By definition, $T := \min\{T_1, T_2\} = T_2$. For all $t$, $|\cos\theta| \geq |\sin\theta|$ implies $u_1(t) \geq u_2(t)$ and $f_1(t) \geq f_2(t)$. Therefore, we conclude $u'_1(t) \geq u'_2(t)$ for $t \in [0, T)$.

    For $t \in [T, \infty)$, we establish $f_2(t) \leq \varepsilon$. Notice that $\frac{|\cot\theta|(f_2 + \varepsilon)}{|\cot\theta| f_2 + \varepsilon} = 1 + \frac{(|\cot\theta|-1)\varepsilon}{|\cot\theta|f_2 + \varepsilon}$. Since $|\cot\theta| \geq 1$, the ratio $\frac{|\cot\theta|(f_2 + \varepsilon)}{|\cot\theta| f_2 + \varepsilon}$ is non-increasing in $f_2 \geq 0$. Using $f_2(t) \leq \varepsilon$, we get \begin{equation*}
        \frac{|\cot\theta|(f_2(t) + \varepsilon)}{|\cot\theta| f_2(t) + \varepsilon} \geq \frac{|\cot\theta|(\varepsilon + \varepsilon)}{|\cot\theta| \varepsilon + \varepsilon} = \frac{2|\cot\theta|}{1 + |\cot\theta|}.
    \end{equation*}
    Using \eqref{eq:derivativeratiolb}, we conclude that for $t \in [T, \infty)$, \begin{equation*}
        u'_1(t) \geq \frac{|\cot\theta|\left(f_2(t) + \varepsilon\right)}{|\cot \theta| f_2(t) + \varepsilon} u'_2(t) \geq \frac{2|\cot\theta|}{1 + |\cot\theta|} u'_2(t).
    \end{equation*}
\end{proof}

Let us consider the cubic equation $x\left(A - Bx^2\right) = \epsilon$, where $A > 0, B > 0, x > 0$. We assume that $\epsilon \geq 0$ is small. The largest solution $x^*$ is approximately
    \begin{equation*}
        x^* = \sqrt{\frac{A}{B}} - \frac{1}{2A}\epsilon -\frac{3}{8} B^{\frac{1}{2}}A^{-\frac{5}{2}}\epsilon^2 + \mathcal{O}\left(\epsilon^3\right).
    \end{equation*}
This can be established by using elementary perturbation theory.


\begin{lemma}\label{lemma:M+} We have
    \begin{equation*}
        \Delta := |\cos\theta| \left(u_2^\infty - u_2(0)\right) - |\sin\theta| \left(u_1^\infty - u_1(0)\right) \leq M_+, 
    \end{equation*}where $M_+ := \left( |\cos\theta| - |\sin\theta| \right) \left(\lambda^{-\frac{1}{4}}|y|^\frac{1}{2} - \frac{\sqrt{2}\varepsilon}{4\lambda^\frac{1}{2}|y|}\right)$.
\end{lemma}

\begin{proof}
We complete the proof in three steps.

\textbf{Step 1}. We show an upper bound for $u_2(T)$.

Let us define $p(U) := \lambda|\sin\theta|U\left(|\Tilde{y}| + \left(|\cos\theta| + |\sin\theta|\right)\alpha^2 -\left(|\cos\theta| + |\sin\theta|\right)U^2\right)$, which is a cubic function of $U \in \mathbb{R}$. Let $\hat{U}$ be the largest solution to $p(U) = \varepsilon$. Let us define $f_+(t) := \left(p \circ u_2\right)(t)$. We want to show that $f_+(t) \geq f_2(t)$ for $t \in [0, T]$.
Indeed, since $u_1(t) \geq u_2(t)$, $v_1(t), v_2(t) \leq \alpha$ always hold, we have \begin{align*}
    f_+(t) &=  \lambda |\sin\theta| u_2(t) \left( |\Tilde{y}| + \left(|\cos\theta| + |\sin\theta|\right)\alpha^2 - \left(|\cos\theta| + |\sin\theta|\right)u_2^2(t) \right) \\
    &\geq \lambda |\sin \theta| u_2(t) \left(|\Tilde{y}| + |\cos\theta| v_1^2(t) + |\sin\theta| v_2^2(t) - |\cos\theta| u_1^2(t) - |\sin\theta| u_2^2(t)\right) \\
    &= f_2(t).
\end{align*}
We know that $f_2(T) = \varepsilon$, so $f_+(T) = p(u_2(T)) \geq \varepsilon$. Meanwhile, $p(\hat{U}) = \varepsilon$. We want to show that $u_2(T) \leq \hat{U}$. Suppose for contradiction that $u_2(T) > \hat{U}$. By studying the behavior of the cubic function $p(U)$, we observe that $p(U) < 0$ for sufficiently large $U$. Since $p(u_2(T)) \geq \varepsilon$, by continuity, there exists $U' \geq u_2(T)$ such that $p(U') = \varepsilon$. However, $U' \geq u_2(T) > \hat{U}$, which contradicts that $\hat{U}$ is the largest solution to $p(U) = \varepsilon$. Thus, $u_2(T) \leq \hat{U}$. By using the expansion of the cubic root $\hat{U}$ in $\varepsilon$, it is easy to show that $\hat{U} < \Tilde{u}_2 := \sqrt{\frac{|\Tilde{y}|}{|\cos\theta|+|\sin\theta|} + \alpha^2} - \frac{\varepsilon}{2\lambda|\sin\theta \Tilde{y}|}$ under Assumption~\ref{assumption:epsilonalpha}. As a result, \begin{equation}
    u_2(T) \leq \hat{U} < \Tilde{u}_2 := \sqrt{\frac{|\Tilde{y}|}{|\cos\theta|+|\sin\theta|} + \alpha^2} - \frac{\varepsilon}{2\lambda|\sin\theta \Tilde{y}|}.\label{eq:lowerboundu2T}
\end{equation}

\textbf{Step 2}. We show that $u_1(T) - u_1(0) \geq u_2(T) - u_2(0)$ and $u_1^{\infty} - u_1(T) \geq \frac{2|\cot\theta|}{1 + |\cot\theta|}\left( u_2^\infty - u_2(T) \right)$.

For all $t$, we know that $u'_1(t) \geq u_2'(t)$. By integrating both sides with respect to $t$ from $0$ to $T$, we get \begin{equation}
    u_1(T) - u_1(0) \geq u_2(T) - u_2(0).\label{eq:stage1ratio}
\end{equation}
For $t \geq T$, by Lemma~\ref{lemma:ratio} we have $u'_1(t) \geq \frac{2|\cot\theta|}{1+|\cot\theta|}u'_2(t)$. Again by integrating both sides, we get \begin{equation}
    u_1^{\infty} - u_1(T) \geq \frac{2|\cot\theta|}{1+|\cot\theta|}\left(u_2^{\infty} - u_2(T)\right).\label{eq:stage2ratio}
\end{equation}

\textbf{Step 3}. We derive an upper bound for $\Delta$.

We can write $\Delta = \Delta_1 + \Delta_2$, where
\begin{align*}
    \Delta_1 &:= |\cos\theta| (u_2(T) - u_2(0)) - |\sin\theta| (u_1(T) - u_1(0)), \\
    \Delta_2 &:= |\cos\theta| (u_2^{\infty} - u_2(T)) - |\sin \theta| (u_1^{\infty} - u_1(T)).
\end{align*}
Using \eqref{eq:stage1ratio} and \eqref{eq:stage2ratio} from Step 2, we get
\begin{align}
    \Delta_1 &\leq (|\cos\theta| - |\sin\theta|)(u_2(T)-u_2(0))\label{eq:u2T-u20}, \\
    \Delta_2 &\leq \left(|\cos\theta| - |\sin\theta| \frac{2|\cot\theta|}{1+|\cot\theta|}\right)\left( u_2^{\infty} - u_2(T) \right).\label{eq:u2inf-u2T}
\end{align}
Adding \eqref{eq:u2inf-u2T} and \eqref{eq:u2T-u20}, we get \begin{align*}
    \Delta \leq &\left(|\cos\theta| - |\sin\theta| \frac{2|\cot\theta|}{1+|\cot\theta|}\right)\left( u_2^{\infty} - u_2(T) \right) + \left(|\cos\theta| - |\sin\theta|\right)\left(u_2(T) - u_2(0) \right) \\
    = &\left(|\cos\theta| - |\sin\theta| \frac{2|\cot\theta|}{1+|\cot\theta|}\right)\left( u_2^{\infty} - \Tilde{u}_2 \right) \\
    &+ \left(|\cos\theta| - |\sin\theta| \frac{2|\cot\theta|}{1+|\cot\theta|}\right)\left( \Tilde{u}_2 - u_2(T) \right) + (|\cos\theta| - |\sin\theta|)(u_2(T) - \Tilde{u}_2) \\
    &+ (|\cos\theta| - |\sin\theta|)(\Tilde{u}_2  - u_2(0)).
\end{align*}
We have shown that $\Tilde{u}_2 \geq u_2(T)$ in \eqref{eq:lowerboundu2T}, and $|\cos\theta| \geq |\sin \theta|$ implies $|\cos\theta| - |\sin\theta| \geq |\cos\theta| - |\sin\theta| \frac{2|\cot\theta|}{1+|\cot\theta|} \geq 0$. As a result,
\begin{align*}
    &\left(|\cos\theta| - |\sin\theta| \frac{2|\cot\theta|}{1+|\cot\theta|}\right)\left( \Tilde{u}_2 - u_2(T) \right) \leq \left(|\cos\theta| - |\sin\theta| \right)\left( \Tilde{u}_2 - u_2(T) \right)\\
    &\left(|\cos\theta| - |\sin\theta| \frac{2|\cot\theta|}{1+|\cot\theta|}\right)\left( \Tilde{u}_2 - u_2(T) \right) + (|\cos\theta| - |\sin\theta|)(u_2(T) - \Tilde{u}_2) \leq 0.
\end{align*}
Therefore, \begin{align}
    \Delta \leq \left(|\cos\theta| - |\sin\theta| \frac{2|\cot\theta|}{1+|\cot\theta|}\right)\left( u_2^{\infty} - \Tilde{u}_2 \right)
    + (|\cos\theta| - |\sin\theta|)(\Tilde{u}_2  - u_2(0)).\label{eq:Deltaupperbound1}
\end{align}
Moreover, by Lemma~\ref{lemma:convergence}, we know that the residual converges to zero. It follows that $\lim_{t \to \infty} r(t) = 0$, and
\begin{equation*}
    |\Tilde{y}| = |\cos\theta|\left((u_1^{\infty})^2 - (v_1^{\infty})^2\right) + |\sin\theta|\left( (u_2^{\infty})^2 - (v_2^{\infty})^2 \right).
\end{equation*}
Because $u_1(t) \geq u_2(t)$ and $v_1(t), v_2(t) \leq \alpha$ always hold, it follows that $u_2^{\infty} \leq \sqrt{\frac{|\Tilde{y}|}{|\cos\theta| + |\sin\theta|} + \alpha^2}$. Using $\Tilde{u}_2$ from \eqref{eq:lowerboundu2T}, we get $u_2^\infty - \Tilde{u}_2 \leq \frac{\varepsilon}{2\lambda|\sin\theta \Tilde{y}|}$. Continuing with \eqref{eq:Deltaupperbound1} and using $u_2(0) = \alpha$, we get
\begin{align*}
    \Delta \leq &\left(|\cos\theta| - |\sin\theta| \frac{2|\cot\theta|}{1+|\cot\theta|}\right) \frac{\varepsilon}{2\lambda|\sin\theta \Tilde{y}|} \\
    + &\left( |\cos\theta| - |\sin\theta| \right)\left(\sqrt{\frac{|\Tilde{y}|}{|\cos\theta|+|\sin\theta|} + \alpha^2} - \frac{\varepsilon}{2\lambda|\sin\theta \Tilde{y}|} - u_2(0)\right) \\
    = &\left( |\cos\theta| - |\sin\theta| \right)\left(\sqrt{\frac{|\Tilde{y}|}{|\cos\theta|+|\sin\theta|} + \alpha^2} -u_2(0)\right) \\
    &+ \left( |\cos\theta| - |\sin\theta| \frac{2|\cot\theta|}{1+|\cot\theta|} - |\cos\theta| + |\sin\theta|\right)\frac{\varepsilon}{2\lambda|\sin\theta \Tilde{y}|} \\
    = &\left( |\cos\theta| - |\sin\theta| \right)\left(\sqrt{\frac{|\Tilde{y}|}{|\cos\theta|+|\sin\theta|} + \alpha^2} -\alpha\right) - \left(\frac{|\cos\theta|-|\sin\theta|}{|\cos\theta| + |\sin\theta|}\right)\frac{\varepsilon}{2\lambda|\Tilde{y}|} \\
    \leq &\left( |\cos\theta| - |\sin\theta| \right)\sqrt{\frac{|\Tilde{y}|}{|\cos\theta|+|\sin\theta|}} - \left(\frac{|\cos\theta|-|\sin\theta|}{|\cos\theta| + |\sin\theta|}\right)\frac{\varepsilon}{2\lambda|\Tilde{y}|} \\
    \leq &\left( |\cos\theta| - |\sin\theta| \right)\sqrt{|\Tilde{y}|} - \left(|\cos\theta|-|\sin\theta|\right)\frac{\sqrt{2}\varepsilon}{4\lambda|\Tilde{y}|} \\
    = &\left( |\cos\theta| - |\sin\theta| \right) \left(|y|^\frac{1}{2}\lambda^{-\frac{1}{4}} - \frac{\sqrt{2}\varepsilon}{4\lambda^\frac{1}{2}|y|}\right) \\
    = &M_+.
\end{align*}
We conclude $\Delta \leq M_+$.
\end{proof}

\begin{lemma}\label{lemma:M-}
We have \begin{equation*}
    \Delta := |\cos\theta| \left(u_2^\infty - u_2(0)\right) - |\sin\theta| \left(u_1^\infty - u_1(0)\right) \geq M_-, 
\end{equation*}where $M_- := \left(|\cos\theta|-|\sin\theta|\right)\left(\left(2\lambda\right)^{-\frac{1}{4}}|y|^{\frac{1}{2}}-\alpha\right) - 2\sqrt{\frac{2\varepsilon}{\lambda^{\frac{3}{4}}|\sin\theta||y|^{\frac{1}{2}}}} - \frac{3\sqrt{2}\varepsilon}{\lambda^{\frac{1}{2}}|\sin\theta y|}\ln \left(\frac{\lambda^{\frac{1}{4}}|\sin\theta||y|^{\frac{3}{2}}}{\sqrt{2}\varepsilon}\right)$.
\end{lemma}

\begin{proof}
We begin by exhibiting a lower bounding function for $f_2(t)$ for $t \in [0, T]$. Let $\bar{v} := |\cos\theta|\left(v_1^\infty\right)^2 + |\sin\theta|\left(v_2^\infty\right)^2$. Since $v_1(t), v_2(t)$ are non-increasing and non-negative, we have \begin{equation}
 0 \leq \bar{v} \leq |\cos\theta|v_1^2(t) + |\sin\theta|v_2^2(t) \leq \left(|\cos\theta| + |\sin\theta|\right)\alpha^2. \label{eq:barvrange}
\end{equation} Let us define \begin{equation*}
    f_-(t) := \frac{1}{2}\lambda |\sin\theta| \left(\alpha + t\right)\left(|\Tilde{y}| + \bar{v} - (|\cos\theta|+|\sin\theta|)\left(\alpha + t\right)^2\right).
\end{equation*}
For $t \leq T$, since $f_2(t) \geq \varepsilon$ and $u'_2(t) \geq \frac{1}{2}$, we have $u_2(t) \geq \alpha+\frac{1}{2}t > \frac{1}{2}\left(\alpha + t\right)$. Moreover, $u_1(t) \leq \alpha + t$ and $u_2(t) \leq \alpha + t$ always hold. Therefore, for $t \in [0, T]$, we establish that \begin{align*}
    f_-(t)  &= \frac{1}{2}\lambda |\sin\theta| \left(\alpha + t\right)\left(|\Tilde{y}| + \bar{v} - (|\cos\theta|+|\sin\theta|)\left(\alpha + t\right)^2\right) \\
    &< \lambda|\sin\theta|u_2(t)\left(|\Tilde{y}| + \bar{v} - |\cos\theta|u_1^2(t) - |\sin\theta|u_2^2(t)\right) \\
    &\leq \lambda|\sin\theta|u_2(t)\left(|\Tilde{y}| - |\cos\theta|\left(u_1^2(t) - v_1^2(t)\right) - |\sin\theta|\left(u_2^2(t) - v_2^2(t)\right)\right) \\
    &= f_2(t).
\end{align*}
As a result, we get \begin{equation}
    f_-(T) < f_2(T) = \varepsilon\label{eq:f_-Tlessthanepsilon}.
\end{equation}
Assumption~\ref{assumption:epsilonalpha} guarantees $\sqrt{\frac{|\Tilde{y}|+\bar{v}}{3\left(|\cos\theta| + |\sin\theta|\right)}}-\alpha > 0$. The derivative of the cubic function $f_-(t)$ shows that $f_-(t)$ is increasing on $\left[0,~\sqrt{\frac{|\Tilde{y}|+\bar{v}}{3\left(|\cos\theta| + |\sin\theta|\right)}}-\alpha\right)$ and decreasing for $t > \sqrt{\frac{|\Tilde{y}|+\bar{v}}{3\left(|\cos\theta| + |\sin\theta|\right)}}-\alpha$. Because $f_-(0) > \varepsilon$ by Assumption~\ref{assumption:epsilonalpha} and $f_-(T) < \varepsilon$ by \eqref{eq:f_-Tlessthanepsilon}, it follows that there exists a unique $T' \in \left(\sqrt{\frac{|\Tilde{y}|+\bar{v}}{3\left(|\cos\theta| + |\sin\theta|\right)}}-\alpha,~T\right)$ such that $f_-(T') = \varepsilon$, and $f_-(t) < \varepsilon$ for $t > T'$.

Next, we show a lower bound for $\alpha + T'$. Since we already have $\alpha + T' > \sqrt{\frac{|\Tilde{y}|+\bar{v}}{3\left(|\cos\theta| + |\sin\theta|\right)}}$, then
\begin{align*}
    \varepsilon = f_-(T') &= \frac{1}{2}\lambda |\sin\theta|\left(\alpha+T'\right)\left(|\Tilde{y}| + \bar{v} -(|\cos\theta|+|\sin\theta|)\left(\alpha + T'\right)^2\right) \\
    &\geq \frac{1}{2}\lambda |\sin\theta|\sqrt{\frac{|\Tilde{y}|+\bar{v}}{3\left(|\cos\theta| + |\sin\theta|\right)}}\left(|\Tilde{y}|+\bar{v}-(|\cos\theta|+|\sin\theta|)\left(\alpha + T'\right)^2\right).
\end{align*}
Therefore, we have\begin{align}
    \frac{2\varepsilon}{\lambda|\sin\theta|}\sqrt{\frac{3(|\cos\theta|+|\sin\theta|)}{|\Tilde{y}|+\bar{v}}} &\geq |\Tilde{y}|+\bar{v}-(|\cos\theta|+|\sin\theta|)\left(\alpha + T'\right)^2 \\
    (|\cos\theta|+|\sin\theta|)\left(\alpha + T'\right)^2 &\geq |\Tilde{y}|+\bar{v} - \frac{2\varepsilon}{\lambda|\sin\theta|}\sqrt{\frac{3(|\cos\theta|+|\sin\theta|)}{|\Tilde{y}|+\bar{v}}} \\
    (\alpha + T')^2 &\geq \frac{|\Tilde{y}|+\bar{v}}{|\cos\theta|+|\sin\theta|} - \frac{2\varepsilon}{\lambda|\sin\theta|}\sqrt{\frac{3}{(|\Tilde{y}|+\bar{v})(|\cos\theta|+|\sin\theta|)}} \\
    (\alpha + T')^2 &\geq \frac{|\Tilde{y}|+\bar{v}}{|\cos\theta|+|\sin\theta|} - \frac{2\sqrt{3}\varepsilon}{\lambda|\sin\theta|(|\Tilde{y}|+\bar{v})^{\frac{1}{2}}} \\
    (\alpha + T')^2 &\geq \frac{|\Tilde{y}|+\bar{v}}{|\cos\theta|+|\sin\theta|} - \frac{4\varepsilon}{\lambda|\sin\theta|(|\Tilde{y}|+\bar{v})^{\frac{1}{2}}} \\
    \alpha + T' &\geq \left(\frac{|\Tilde{y}|+\bar{v}}{|\cos\theta|+|\sin\theta|} - \frac{4\varepsilon}{\lambda|\sin\theta|(|\Tilde{y}|+\bar{v})^{\frac{1}{2}}}\right)^{\frac{1}{2}}  \\
    \alpha + T' &\geq \sqrt{\frac{|\Tilde{y}|+\bar{v}}{|\cos\theta|+|\sin\theta|}} - 2\sqrt{\frac{\varepsilon}{\lambda|\sin\theta|(|\Tilde{y}|+\bar{v})^{\frac{1}{2}}}}.\label{eq:lowerboundoft1-final}
\end{align}

Next, we want to find a lower bound for $u_2(T)$. Because $u_2(t)$ is non-decreasing, $T > T'$ implies $u_2(T) \geq u_2(T')$. For all $t \in [0, T']$, $f_-(t) \leq f_2(t)$ holds, and therefore \begin{align*}
    u'_2(t) &= \frac{f_2(t)}{f_2(t) + \varepsilon} \\
    &\geq \frac{f_-(t)}{f_-(t) + \varepsilon} \\
    &\geq 1 - \frac{2\varepsilon}{\lambda|\sin\theta|\left(\alpha + t\right)\left(|\Tilde{y}| +\bar{v}- \left( |\cos\theta| + |\sin\theta|\right)\left(\alpha + t\right)^2\right)}.
 \end{align*}
 This lower bounding function is explicit in $t$, which makes it possible to obtain a lower bound for $u_2(T')$ by integrating it with respect to $t$ from $0$ to $T'$, which yields \begin{align}
     u_2(T') - u_2(0) &\geq \int_{0}^{T'} \left(1 - \frac{2\varepsilon}{\lambda|\sin\theta|\left(\alpha + t\right)\left(|\Tilde{y}| +\bar{v}- \left( |\cos\theta| + |\sin\theta|\right)\left(\alpha + t\right)^2\right)}\right) \, dt \\
     u_2(T') &\geq \alpha + T' - \frac{2\varepsilon}{\lambda|\sin\theta|}\int_{0}^{T'} \frac{1}{\left(\alpha + t\right)\left(|\Tilde{y}| +\bar{v}- \left( |\cos\theta| + |\sin\theta|\right)\left(\alpha + t\right)^2\right)} \, dt. \label{eq:u2T'lowerbound}
 \end{align}
 Let $\tau := \alpha + t$. We compute the integral\begin{align*}
    J:= \int_\alpha^{\alpha+T'} \frac{1}{\tau(|\Tilde{y}| +\bar{v}- (|\cos\theta|+|\sin\theta|)\tau^2)}\, d\tau &= \left.\frac{1}{2(|\Tilde{y}|+\bar{v})}\ln{\frac{\tau^2}{|\Tilde{y}| + \bar{v} - (|\cos\theta|+|\sin\theta|)\tau^2}}\right|_{\alpha}^{\alpha+T'} \\
    &= \frac{1}{2(|\Tilde{y}|+\bar{v})}\ln{\frac{(\alpha+T')^2(|\Tilde{y}|+\bar{v} - (|\cos\theta|+|\sin\theta|)\alpha^2)}{\alpha^2(|\Tilde{y}|+\bar{v} - (|\cos\theta|+|\sin\theta|)(\alpha+T')^2)}}.
 \end{align*}
Since $f_-(T') = \varepsilon$, we get \begin{equation*}
    |\Tilde{y}| + \bar{v} - (|\cos\theta|+|\sin\theta|)\left(\alpha + T'\right)^2 = \frac{2\varepsilon}{\lambda|\sin\theta|(\alpha+ T')}.
\end{equation*}
Moreover, $\left(\alpha + T'\right)^2 \leq |\Tilde{y}| + \bar{v}$. Using $\bar{v} \leq \left( |\cos\theta| + |\sin\theta|\right)\alpha^2$ from \eqref{eq:barvrange} and Assumption~\ref{assumption:epsilonalpha}, we get 
\begin{align*}
    J &\leq \frac{1}{2(|\Tilde{y}|+\bar{v})}\ln \frac{\left(\alpha + T' \right)^3}{2\varepsilon/(\lambda|\sin\theta|)} \frac{|\Tilde{y}|}{\alpha^2} \\
    &\leq \frac{1}{2(|\Tilde{y}|+\bar{v})}\ln \frac{(|\Tilde{y}|+\bar{v})^\frac{3}{2}}{2\varepsilon/(\lambda|\sin\theta|)} \frac{|\Tilde{y}|}{\left(2\varepsilon/(\lambda|\sin\theta\Tilde{y}|) \right)^2} \\
    &= \frac{1}{2(|\Tilde{y}|+\bar{v})}\ln \left((|\Tilde{y}|+\bar{v})^\frac{3}{2}\left(\frac{\lambda|\sin\theta\Tilde{y}|}{2\varepsilon}\right)^3\right) \\
    &= \frac{3}{2(|\Tilde{y}|+\bar{v})}\ln \left((|\Tilde{y}|+\bar{v})^\frac{1}{2}\left(\frac{\lambda|\sin\theta\Tilde{y}|}{2\varepsilon}\right)\right).
\end{align*}
Using Assumption~\ref{assumption:epsilonalpha}, we obtain that \begin{align*}
    |\Tilde{y}| + \bar{v} \leq |\Tilde{y}| + \left(|\cos\theta|+|\sin\theta|\right)\alpha^2 \leq |\Tilde{y}| + \frac{1}{18}|\Tilde{y}| = \frac{19}{18}|\Tilde{y}|.
\end{align*}
In turn we have \begin{align*}
    J &\leq \frac{3}{2(|\Tilde{y}|+\bar{v})}\ln \left((|\Tilde{y}|+\bar{v})^\frac{1}{2}\left(\frac{\lambda|\sin\theta\Tilde{y}|}{2\varepsilon}\right)\right) \\
    &\leq \frac{3}{2(|\Tilde{y}|+\bar{v})} \ln \left(\left(\frac{19}{18}|\Tilde{y}|\right)^\frac{1}{2}\left(\frac{\lambda|\sin\theta\Tilde{y}|}{2\varepsilon}\right)\right) \\
    &\leq \frac{3}{2(|\Tilde{y}|+\bar{v})} \ln \left(\sqrt{2}|\Tilde{y}|^\frac{1}{2}\left(\frac{\lambda|\sin\theta\Tilde{y}|}{2\varepsilon}\right)\right) \\
    &= \frac{3}{2(|\Tilde{y}|+\bar{v})} \ln \left(\frac{\lambda|\sin\theta||\Tilde{y}|^\frac{3}{2}}{\sqrt{2}\varepsilon}\right).
\end{align*}
Assumption~\ref{assumption:epsilonalpha} implies that $\varepsilon \leq \frac{\lambda|\sin\theta||\Tilde{y}|^{\frac{3}{2}}}{9\sqrt{2(|\cos\theta|+|\sin\theta|)}}$. Therefore, $\ln \left(\frac{\lambda|\sin\theta||\Tilde{y}|^\frac{3}{2}}{\sqrt{2}\varepsilon}\right)$ is guaranteed to be positive, and we get\begin{equation}
    J \leq \frac{3}{2|\Tilde{y}|} \ln \left(\frac{\lambda|\sin\theta||\Tilde{y}|^\frac{3}{2}}{\sqrt{2}\varepsilon}\right). \label{eq:integral}
\end{equation}

Combining \eqref{eq:lowerboundoft1-final}, \eqref{eq:u2T'lowerbound} and \eqref{eq:integral}, we get \begin{align*}
u_2(T') &\geq \sqrt{\frac{|\Tilde{y}|+\bar{v}}{|\cos\theta|+|\sin\theta|}} - 2\sqrt{\frac{\varepsilon}{\lambda|\sin\theta|(|\Tilde{y}|+\bar{v})^{\frac{1}{2}}}} - \frac{3\varepsilon}{\lambda|\sin\theta\Tilde{y}|}\ln \left(\frac{\lambda|\sin\theta||\Tilde{y}|^\frac{3}{2}}{\sqrt{2}\varepsilon}\right) \\
&\geq \sqrt{\frac{|\Tilde{y}|+\bar{v}}{|\cos\theta|+|\sin\theta|}} - 2\sqrt{\frac{\varepsilon}{\lambda|\sin\theta||\Tilde{y}|^{\frac{1}{2}}}} - \frac{3\varepsilon}{\lambda|\sin\theta\Tilde{y}|}\ln \left(\frac{\lambda|\sin\theta||\Tilde{y}|^\frac{3}{2}}{\sqrt{2}\varepsilon}\right).
\end{align*}
Let $P := \sqrt{\frac{|\Tilde{y}|+\bar{v}}{|\cos\theta|+|\sin\theta|}}$ and $Q := 2\sqrt{\frac{\varepsilon}{\lambda|\sin\theta||\Tilde{y}|^{\frac{1}{2}}}} + \frac{3\varepsilon}{\lambda|\sin\theta \Tilde{y}|}\ln \left(\frac{\lambda|\sin\theta ||\Tilde{y}|^{\frac{3}{2}}}{\sqrt{2}\varepsilon}\right)$, and note that $P, Q > 0$. With this notation, we have $u_2(T') \geq P - Q$. Using Lemma~\ref{lemma:convergence}, we obtain that \begin{align*}
    |\cos\theta|\left(P + Q\right)^2 + |\sin\theta|\left( P - Q \right)^2  &= (|\cos\theta|+|\sin\theta|)P^2 + 2PQ(|\cos\theta|-|\sin\theta|) + (|\cos\theta|+|\sin\theta|)Q^2 \\
    &\geq (|\cos\theta|+|\sin\theta|)P^2 \\
    &= |\Tilde{y}| + \bar{v} \\
    &= |\Tilde{y}| + |\cos\theta|\left(v_1^{\infty}\right)^2 + |\sin\theta|\left(v_2^\infty\right)^2 \\
    &= |\cos\theta|\left(u_1^{\infty}\right)^2 + |\sin\theta|\left(u_2^{\infty}\right)^2.
\end{align*}
Since $u_2(t)$ is non-decreasing, we get $u_2^\infty \geq u_2(T') \geq P - Q$. As a result, we must have $ u_1^\infty \leq P + Q $. We derive 
\begin{align}
    |\cos\theta|u_2^\infty - |\sin\theta| u_1^\infty
    &\geq \left(|\cos\theta|-|\sin\theta|\right)P - \left(|\cos\theta| + |\sin\theta|\right) Q \\
    &= \left(|\cos\theta|-|\sin\theta|\right)\sqrt{\frac{|\Tilde{y}|+\bar{v}}{|\cos\theta| + |\sin\theta|}} - \left(|\cos\theta| + |\sin\theta|\right) Q \\
    &\geq \left(|\cos\theta|-|\sin\theta|\right)\sqrt{\frac{|\Tilde{y}|}{|\cos\theta| + |\sin\theta|}} - \sqrt{2}Q \\
    &\geq \left(|\cos\theta|-|\sin\theta|\right)\sqrt{\frac{|\Tilde{y}|}{\sqrt{2}}} - \sqrt{2}Q
    .\label{eq:finaleq1}
\end{align}

Additionally, we have $u_1(0) = u_2(0) = \alpha$, which implies \begin{equation}
    -|\cos\theta|u_2(0) + |\sin\theta|u_1(0) = -\alpha (|\cos\theta|-|\sin\theta|). \label{eq:finaleq2}
\end{equation}
Adding \eqref{eq:finaleq1} and \eqref{eq:finaleq2}, and substituting in $Q$, we get 
\begin{align*}
    \Delta &= |\cos\theta|(u_2^\infty - u_2(0)) - |\sin\theta|(u_1^\infty - u_1(0)) \\
    &= |\cos\theta|u_2^\infty - |\sin\theta| u_1^\infty -\alpha(|\cos\theta| - |\sin\theta|) \\
    &\geq \left(|\cos\theta|-|\sin\theta|\right)\left(\sqrt{\frac{|\Tilde{y}|}{\sqrt{2}}}-\alpha\right) - 2\sqrt{\frac{2\varepsilon}{\lambda|\sin\theta||\Tilde{y}|^{\frac{1}{2}}}} - \frac{3\sqrt{2}\varepsilon}{\lambda|\sin\theta \Tilde{y}|}\ln \left(\frac{\lambda|\sin\theta|| \Tilde{y}|^{\frac{3}{2}}}{\sqrt{2}\varepsilon}\right). \end{align*}
Finally, using $\Tilde{y} = \lambda^{-\frac{1}{2}}y$, we get
\begin{align*}
    \Delta &\geq \left(|\cos\theta|-|\sin\theta|\right)\left(\left(2\lambda\right)^{-\frac{1}{4}}|y|^{\frac{1}{2}}-\alpha\right) - 2\sqrt{\frac{2\varepsilon}{\lambda^{\frac{3}{4}}|\sin\theta||y|^{\frac{1}{2}}}} - \frac{3\sqrt{2}\varepsilon}{\lambda^{\frac{1}{2}}|\sin\theta y|}\ln \left(\frac{\lambda^{\frac{1}{4}}|\sin\theta||y|^{\frac{3}{2}}}{\sqrt{2}\varepsilon}\right)\\
    &= M_-.
\end{align*}
Therefore, $\Delta \geq M_-$.
\end{proof}

\begin{lemma}\label{lemma:M-_decreasing}
Let us consider $M_-(\varepsilon)$ as a function of $\varepsilon$ with $M_-(0) := \lim_{\varepsilon\to0^+} M_-(\varepsilon)$. We have $M_-(0) > 0$ and $M_-(\varepsilon)$ is strictly decreasing for $0 \leq \varepsilon \leq \frac{1}{9}\frac{\lambda|\sin\theta||\Tilde{y}|^{\frac{3}{2}}}{\sqrt{2(|\cos\theta|+|\sin\theta|)}}$.
\end{lemma}

\begin{proof}
    We write $M_-(\varepsilon) = N_0 + N_1(\varepsilon) + N_2(\varepsilon)$, where \begin{align*}
        N_0 &= \left(|\cos\theta|-|\sin\theta|\right)\left(2^{-\frac{1}{4}}\sqrt{|\Tilde{y}|}-\alpha\right), \\
        N_1(\varepsilon) &= - 2\sqrt{\frac{2\varepsilon}{\lambda|\sin\theta||\Tilde{y}|^{\frac{1}{2}}}},\\
        N_2(\varepsilon) &= - \frac{3\sqrt{2}\varepsilon}{\lambda|\sin\theta \Tilde{y}|}\ln \left({|\Tilde{y}|^\frac{1}{2}}\frac{\lambda|\sin\theta \Tilde{y}|}{\sqrt{2}\varepsilon}\right).
    \end{align*} Notice that $N_0$ does not depend on $\varepsilon$, and $N_1(\varepsilon)$ is decreasing in $\varepsilon$ for all $\varepsilon \geq 0$. If $\varepsilon' := \frac{\sqrt{2}\varepsilon}{\lambda|\sin\theta\Tilde{y}|}$, then \begin{align*}
        N_2(\varepsilon') &= -3\varepsilon' \ln \left(\frac{|\Tilde{y}|^{\frac{1}{2}}}{\varepsilon'}\right), \\
        \frac{dN_2(\varepsilon')}{d\varepsilon'} &= -3\left(\ln \left(\frac{|\Tilde{y}|^{\frac{1}{2}}}{\varepsilon'}\right) - 1\right).
    \end{align*}
    Since $0 \leq \varepsilon \leq \frac{1}{9}\frac{\lambda|\sin\theta||\Tilde{y}|^{\frac{3}{2}}}{\sqrt{2(|\cos\theta|+|\sin\theta|)}}$, we have $0 \leq \varepsilon' \leq \frac{1}{9}\frac{|\Tilde{y}|^{\frac{1}{2}}}{\sqrt{|\cos\theta|+|\sin\theta|}}$. As a result, \begin{align*}
        \frac{|\Tilde{y}|^{\frac{1}{2}}}{\varepsilon'} \geq 9\sqrt{|\cos\theta|+|\sin\theta|} \geq 9 > e.
    \end{align*}
    Therefore, $\ln \left(\frac{|\Tilde{y}|^{\frac{1}{2}}}{\varepsilon'}\right) > 1$ and $\frac{dN_2(\varepsilon')}{d\varepsilon'} < 0$. It follows that $N_2(\varepsilon)$ is decreasing in $\varepsilon$ on the given interval. Combining $N_0, N_1$ and $N_2$, we conclude that $M_-(\varepsilon)$ is decreasing in $\varepsilon$ on the given interval. We obtain $\lim_{\varepsilon'\to 0^+} \varepsilon'\ln(\frac{|\Tilde{y}|^\frac{1}{2}}{\varepsilon'}) = 0$ using L'H\^opital's rule, so $\lim_{\varepsilon\to 0^+}N_2(\varepsilon)= 0$. Since also $N_1(0) = 0$, we have $M_-(0) = N_0$. Applying Assumption~\ref{assumption:epsilonalpha} yields
    \begin{equation*}
        M_-(0) = \left(|\cos\theta|-|\sin\theta|\right)\left(2^{-\frac{1}{4}}\sqrt{|\Tilde{y}|}-\alpha\right) > 0.
    \end{equation*}
\end{proof}


\subsection{Proof of Theorem~\ref{maintheorem}}
\begin{proof}
By Lemma~\ref{lemma:convergence}, we know that the weights converge and the residual $r(t)$ converges to zero. It follows that $\bm \beta^\infty := \lim_{t \to \infty} \bm \beta (t)$ exists and is finite. Using \eqref{eq:expressionofrtinbeta}, we get \begin{align*}
    0 = \lim_{t \to \infty}r(t) = y - \left(\sqrt{\lambda}\cos\theta \beta_1^\infty + \sqrt{\lambda}\sin\theta\beta_2^\infty \right) = \bm y - X\bm \beta^\infty.
\end{align*}
Therefore, the convergent solution $\bm\beta^\infty$ is an interpolating solution.

Next, we derive the stationary condition for the optimization problem \eqref{eq:Bregmanminimization} as
\begin{align}
    \nabla_{\bm \beta}\left(X\bm\beta^\infty\right) &= \begin{bmatrix}
        \sqrt{\lambda}\cos\theta \\
        \sqrt{\lambda}\sin\theta
    \end{bmatrix}, \\
    \nabla_{\bm \beta} E\left(\bm\beta^\infty\right) &= \nabla \Phi_{\infty}\left(\bm\beta^\infty \right) - \nabla \Phi_{0}\left(\bm\beta(0)\right). \label{eq:Bregmangradient}
\end{align}
The gradient in the left-hand side of \eqref{eq:Bregmangradient} is equal to the difference between the convergent point and the starting point of the dual variable. We use the result of the dual dynamics in Proposition~\ref{proposition:dualdynamics} to calculate the gradient. Recall that the dual dynamics follow\begin{align*}
    \frac{d\nabla\Phi_t(\bm\beta(t))}{dt} = -\operatorname{sgn}(\bm\beta(t)) \odot \frac{\nabla_{\bm u}L(\bm w(t))}{|\nabla_{\bm u}L(\bm w(t))| + \varepsilon}.
\end{align*}
From \eqref{eq:signofbeta} we note that $\operatorname{sgn}(\beta_i(t)) = \operatorname{sgn}(x_i y)$. Therefore, $\operatorname{sgn}(\bm \beta(t))$ remains the same for all $t$. Integrating both sides with respect to $t$ from $0$ to infinity, it follows that \begin{align*} 
   \nabla \Phi_{\infty}(\bm\beta^\infty) - \nabla \Phi_{0}(\bm\beta(0)) = \begin{bmatrix}
       \operatorname{sgn}(\cos\theta \Tilde{y}) \\
       \operatorname{sgn}(\sin\theta \Tilde{y})
   \end{bmatrix} \odot \left(\bm u^{\infty} - \bm u(0)\right).
\end{align*}
Next, we want to compute the extent of the deviation from the exact KKT point. To this end, we have \begin{align*}
    \delta := \min_{\nu \in \mathbb{R}}\left\Vert\nabla_{\bm \beta} E\left(\bm\beta^\infty\right) - \nu \cdot \nabla_{\bm \beta}\left(X\bm\beta^\infty\right) \right\Vert &= \min_{\nu \in \mathbb{R}}\left\Vert\begin{bmatrix}
       \operatorname{sgn}(\cos\theta \Tilde{y}) \\
       \operatorname{sgn}(\sin\theta \Tilde{y})
   \end{bmatrix}\odot \left(\bm u^\infty - \bm u(0) \right) - \nu \cdot \begin{bmatrix}
        \sqrt{\lambda}\cos\theta \\
        \sqrt{\lambda}\sin\theta
    \end{bmatrix} \right\Vert.
\end{align*}
Let $V:= \begin{bmatrix}
       \operatorname{sgn}(\cos\theta\Tilde{y})\left(u_1^\infty - u_1(0) \right) \\
       \operatorname{sgn}(\sin\theta\Tilde{y})\left(u_2^\infty - u_2(0) \right)
   \end{bmatrix}$. Using orthogonal projection, we derive that \begin{align*}
     \min_{\nu \in \mathbb{R}}\left\Vert V - \nu \cdot \begin{bmatrix}
        \sqrt{\lambda}\cos\theta \\
        \sqrt{\lambda}\sin\theta
    \end{bmatrix} \right\Vert &= \left| \left\langle V, ~\begin{bmatrix}
    -\sin\theta \\
    \cos\theta
\end{bmatrix} \right\rangle\right| \\
&= \Big| -\operatorname{sgn}(\cos\theta\Tilde{y})\sin\theta \left(u_1^\infty - u_1(0)\right) + \operatorname{sgn}(\sin\theta\Tilde{y})\cos\theta \left(u_2^\infty - u_2(0)\right)\Big| \\
&= \Big|\operatorname{sgn}(\sin\theta \cos\theta \Tilde{y})\cdot \left(-|\sin\theta|\left(u_1^\infty - u_1(0)\right) + |\cos\theta|\left(u_2^\infty - u_2(0)\right) \right) \Big| \\
&= \Big||\cos\theta|\left(u_2^\infty - u_2(0)\right) - |\sin\theta|\left(u_1^\infty - u_1(0)\right) \Big|.
\end{align*}
Therefore, $\delta = |\Delta|$, where $\Delta := |\cos\theta| \left(u_2^\infty - u_2(0) \right) - |\sin\theta| \left(u_1^\infty - u_1(0)\right)$. Using Lemma~\ref{lemma:M+} and Lemma~\ref{lemma:M-}, we get\begin{align*}
    M_- \leq \Delta \leq M_+,
\end{align*}
where\begin{align*}
    M_- &:=  \left(|\cos\theta|-|\sin\theta|\right)\left(\left(2\lambda\right)^{-\frac{1}{4}}|y|^{\frac{1}{2}}-\alpha\right) - 2\sqrt{\frac{2\varepsilon}{\lambda^{\frac{3}{4}}|\sin\theta||y|^{\frac{1}{2}}}} - \frac{3\sqrt{2}\varepsilon}{\lambda^{\frac{1}{2}}|\sin\theta y|}\ln \left(\frac{\lambda^{\frac{1}{4}}|\sin\theta||y|^{\frac{3}{2}}}{\sqrt{2}\varepsilon}\right), \\
    M_+ &:= \left( |\cos\theta| - |\sin\theta| \right) \left(\lambda^{-\frac{1}{4}}|y|^\frac{1}{2} - \frac{\sqrt{2}\varepsilon}{4\lambda^\frac{1}{2}|y|}\right).
\end{align*}
We can further simplify the expressions to 
\begin{align*}
            M_- &= \left(|\cos\theta_1|-|\sin\theta_1|\right)\left((2\lambda_1)^{-\frac{1}{4}}|y^{(1)}|^{\frac{1}{2}}-\alpha\right) + \mathcal{O}(\sqrt{\varepsilon}),\\
            M_+ &= \left(|\cos\theta_1|-|\sin\theta_1|\right)\lambda_1^{-\frac{1}{4}} |y^{(1)}|^{\frac{1}{2}} + \mathcal{O}(\varepsilon).
\end{align*}

We conclude that $\delta = |\Delta| \leq \max\{|M_-|, |M_+|\}$.
\end{proof}

\subsection{Proof of Corollary~\ref{corollary:epsilon}}
\begin{proof}
Let us consider $\delta(\varepsilon), \Delta(\varepsilon), M_-(\varepsilon), M_+(\varepsilon)$ as functions of $\varepsilon$ on the domain $\mathcal{I}_{\varepsilon} = \left[0,~\bar{\varepsilon}\right]$ implied by Assumption~\ref{assumption:epsilonalpha}. We define $M_-(0) := \lim_{\varepsilon\to0^+} M_-(\varepsilon)$ so that $M_-(\varepsilon)$ is continuous on the domain. Theorem~\ref{maintheorem} shows that $M_+(\varepsilon)$ is linearly decreasing in $\varepsilon$. By Lemma~\ref{lemma:M-_decreasing}, $M_-(\varepsilon)$ is strictly decreasing in $\varepsilon$ on the domain $\mathcal{I}_\varepsilon$. Lemma~\ref{lemma:M-_decreasing} also shows that $M_-(0) > 0$. If $M_-(\bar{\varepsilon}) \geq 0$, then $\Delta(\varepsilon) \geq M_-(\varepsilon) \geq M_-(\bar{\varepsilon}) \geq 0$ for all $\varepsilon \in \mathcal{I}_{\varepsilon}$. It implies that only $M_+(\varepsilon)$ applies to the bound, i.e., $\delta(\varepsilon) \leq M_+(\varepsilon)$. Let $\varepsilon^* = \bar{\varepsilon}$. It follows that for $\varepsilon \in [0,\,\varepsilon^*]$, we have \begin{equation}
    \delta(\varepsilon) \leq \bar{M} - \left( |\cos\theta| - |\sin\theta| \right)\frac{\sqrt{2}\varepsilon}{4\lambda^\frac{1}{2}|y|}.\label{eq:deltaepsilon}
\end{equation}
If $M_-(\bar{\varepsilon}) < 0$, since $M_-(0) > 0$, the monotonicity of $M_-(\varepsilon)$ ensures a unique $\hat{\varepsilon} \in (0, \bar{\varepsilon})$ such that $M_-(\hat{\varepsilon}) = 0$ and $\Delta(\varepsilon) \geq M_-(\varepsilon) \geq 0$ for $\varepsilon \in [0,\,\hat{\varepsilon}]$. Let $\varepsilon^* = \hat{\varepsilon}$. Notice that $\hat{\varepsilon}$ is positive, so $[0,\,\varepsilon^*]$ is non-degenerate. By a similar argument, we establish \eqref{eq:deltaepsilon}. We complete the proof by setting $\mathcal{I}' := [0,\,\varepsilon^*] \subseteq \mathcal{I}_\varepsilon$.
\end{proof}


\subsection{Proof of Corollary~\ref{corollary:NDcorollary}}
\begin{proof}
    By Lemma~\ref{lemma:convergence}, $\lim_{t\to\infty}r^{(n)}(t) = 0$ for all $n \in \{1, \dots, N\}$ and the weights converge. We let $\bar{\bm\beta}^{(n)} := \lim_{t\to\infty}\bm\beta^{(n)}(t)$, $\bar{\bm u}^{(n)} := \lim_{t\to\infty}\bm u^{(n)}(t)$ and $\bar{\bm v}^{(n)} := \lim_{t\to\infty}\bm v^{(n)}(t)$ for each $n$. We also let $\bm\beta^{\infty} := \lim_{t\to\infty}\bm\beta(t) = \begin{bmatrix}
        \bar{\bm\beta}^{(1)} & \dots & \bar{\bm\beta}^{(N)}
    \end{bmatrix}^\top$.
    Using \eqref{eq:expressionofrtinbeta}, we derive that for all $n$
    \begin{equation*}
        0 = \lim_{t\to\infty} r^{(n)}(t) = y^{(n)} - x^{(n)}_1\bar\beta_1^{(n)} - x^{(n)}_2\bar\beta_2^{(n)}.
    \end{equation*}
    Therefore, $X\bm\beta^\infty = \bm y$, i.e., $\bm\beta^\infty$ is an interpolating solution.
    
    Each block of $X^\top X$ is parameterized by $\theta_n$ and $\lambda_n$ as
    \begin{align*}
        B^{(n)} = \begin{bmatrix}
        \cos\theta_n & -\sin\theta_n \\
        \sin\theta_n & \cos\theta_n 
    \end{bmatrix}\begin{bmatrix}
        \lambda_n & 0 \\
        0 & 0
    \end{bmatrix}\begin{bmatrix}
        \cos\theta_n & \sin\theta_n \\
        -\sin\theta_n & \cos\theta_n 
    \end{bmatrix},
    \end{align*}
    where $|\cos\theta_n| \geq |\sin\theta_n| > 0$. Matrix $B^{(n)}$ is positive semi-definite and has rank 1, so $\lambda_n > 0$. We let $\Tilde{y}^{(n)} := \frac{y^{(n)}}{\sqrt{\lambda_n}}$. The constraint $X\bm\beta^\infty = \bm y$ consists of $N$ equality conditions \begin{align*}
        \left\langle\bm x^{(1)}, \,\bm\beta^\infty\right\rangle &= y^{(1)}, \\
        &\dots \\
        \left\langle\bm x^{(N)}, \,\bm\beta^\infty\right\rangle &= y^{(N)}.
    \end{align*}
By integrating both sides of \eqref{eq:dualdynamics}, we get\begin{align*}
    \nabla_{\bm\beta} E\left(\bm\beta^\infty\right) &= \nabla\Phi_{\infty}\left(\bm\beta^\infty\right) -\nabla\Phi_{0}\left(\bm\beta(0)\right) \\
    &= \begin{bmatrix}
        \operatorname{sgn}(\cos\theta_1 \Tilde{y}^{(1)})\left( \bar{u}_1^{(1)} - u_1^{(1)}(0) \right) \\
        \operatorname{sgn}(\sin\theta_1 \Tilde{y}^{(1)})\left( \bar{u}_2^{(1)} - u_2^{(1)}(0) \right) \\
        \dots \\
        \operatorname{sgn}(\cos\theta_N \Tilde{y}^{(N)})\left( \bar{u}_1^{(N)} - u_1^{(N)}(0) \right) \\
        \operatorname{sgn}(\sin\theta_N \Tilde{y}^{(N)})\left( \bar{u}_2^{(N)} - u_2^{(N)}(0) \right) \\
    \end{bmatrix}.
\end{align*}
We let $\bm \mu := \begin{bmatrix}
    \mu_1 & \dots & \mu_N 
\end{bmatrix}$, and then we have 
\begin{align*}
    \bar{\delta} &:= \min_{\bm\mu \in \mathbb{R}^N}\left\Vert \nabla_{\bm\beta} E\left(\bm\beta^\infty\right) - \sum_{n=1}^N\mu_n \bm x^{(n)} \right\Vert \\
    &= \min_{\bm\mu \in \mathbb{R}^N}\left\Vert  \nabla_{\bm\beta} E\left(\bm\beta^\infty\right) - \begin{bmatrix}
        \mu_1 \sqrt{\lambda_1}\cos\theta_1 \\
        \mu_1 \sqrt{\lambda_1}\sin\theta_1 \\
        \dots \\
        \mu_N \sqrt{\lambda_N}\cos\theta_N \\
        \mu_N \sqrt{\lambda_N}\sin\theta_N
    \end{bmatrix} \right\Vert \\
    &= \min_{\bm\mu \in \mathbb{R}^N}\left(\sum_{n=1}^{N} \left\Vert \begin{bmatrix}
       \operatorname{sgn}(\cos\theta_n \Tilde{y}^{(n)})\left( \bar{u}_1^{(n)} - u_1^{(n)}(0) \right) \\
        \operatorname{sgn}(\sin\theta_n \Tilde{y}^{(n)})\left( \bar{u}_2^{(n)} - u_2^{(n)}(0) \right) 
    \end{bmatrix} - \mu_n \cdot \begin{bmatrix}
         \sqrt{\lambda_n}\cos\theta_n \\
        \sqrt{\lambda_n}\sin\theta_n \\
    \end{bmatrix}\right\Vert^2\right)^{\frac{1}{2}} \\
    &\leq \sum_{n=1}^{N} \min_{\mu_n \in \mathbb{R}}\left\Vert \begin{bmatrix}
       \operatorname{sgn}(\cos\theta_n \Tilde{y}^{(n)})\left( \bar{u}_1^{(n)} - u_1^{(n)}(0) \right) \\
        \operatorname{sgn}(\sin\theta_n \Tilde{y}^{(n)})\left( \bar{u}_2^{(n)} - u_2^{(n)}(0) \right) 
    \end{bmatrix} - \mu_n \cdot \begin{bmatrix}
         \sqrt{\lambda_n}\cos\theta_n \\
        \sqrt{\lambda_n}\sin\theta_n \\
    \end{bmatrix}\right\Vert.
\end{align*}
By Theorem~\ref{maintheorem}, it follows that for each $n$, \begin{align*}
    \delta_n &:= \min_{\mu_n \in \mathbb{R}}\left\Vert \begin{bmatrix}
       \operatorname{sgn}(\cos\theta_n \Tilde{y}^{(n)})\left( \bar{u}_1^{(n)} - u_1^{(n)}(0) \right) \\
        \operatorname{sgn}(\sin\theta_n \Tilde{y}^{(n)})\left( \bar{u}_2^{(n)} - u_2^{(n)}(0) \right) 
    \end{bmatrix} - \mu_n \cdot \begin{bmatrix}
         \sqrt{\lambda_n}\cos\theta_n \\
        \sqrt{\lambda_n}\sin\theta_n \\
    \end{bmatrix}\right\Vert \\
    &\leq \max\left\{\Big|M^{(n)}_+\Big|,~\Big|M_-^{(n)}\Big|\right\}.
\end{align*}
Therefore, $\bar{\delta} \leq \sum_{n=1}^N \delta_n \leq \sum_{n=1}^N \max\left\{\Big|M^{(n)}_+\Big|,~\Big|M_-^{(n)}\Big|\right\}$.
\end{proof}

\subsection{Proof of Corollary~\ref{corollary:NDvarepsilon}}
\begin{proof}
    For each $n \in \{1, \dots, N \}$, we apply Corollary~\ref{corollary:epsilon} and show that there exists a non-degenerate interval $\mathcal{I}'_n = [0,~\varepsilon^*_n]$ such that for all $\varepsilon \in \mathcal{I}'_n$, we have \begin{align}
    \delta_n(\varepsilon) \leq \left( |\cos\theta_n| - |\sin\theta_n| \right) \left(\lambda_n^{-\frac{1}{4}}|y^{(n)}|^\frac{1}{2}\right) - \left( |\cos\theta_n| - |\sin\theta_n| \right)\frac{\sqrt{2}\varepsilon}{4\lambda_n^\frac{1}{2}|y^{(n)}|}.\label{eq:deltanupperbound}
\end{align}
We let $\Tilde{\varepsilon} := \min_n\{ \varepsilon^*_n\}$ and let $\mathcal{J} := \bigcap_{n=1}^N \mathcal{I}'_n = [0, ~\Tilde{\varepsilon}]$. Since each $\mathcal{I}'_n$ is non-degenerate, we have $\varepsilon_n^* > 0$ for all $n$ and hence $\Tilde{\varepsilon} > 0$. Therefore, the interval $\mathcal{J}$ is non-degenerate. In turn, for all $\varepsilon \in \mathcal{J}$, the relation \eqref{eq:deltanupperbound} holds simultaneously for every $n$. Summing over $n$ and applying Corollary~\ref{corollary:NDcorollary}, we have
\begin{equation*}
    \bar{\delta}(\varepsilon) \leq \sum_{n=1}^{N} \delta_n(\varepsilon) \leq \sum_{n=1}^N \left( |\cos\theta_n| - |\sin\theta_n| \right) \left(\lambda_n^{-\frac{1}{4}}|y^{(n)}|^\frac{1}{2}\right)  - \left(\sum_{n=1}^N\left( |\cos\theta_n| - |\sin\theta_n| \right)\frac{\sqrt{2}}{4\lambda_n^\frac{1}{2}|y^{(n)}|}\right)\varepsilon.
\end{equation*}
\end{proof}

\section{DERIVATION OF DUAL DYNAMICS FOR GRADIENT DESCENT}\label{appendixC}
When applying GD to minimize loss (7) with respect to weights, in the continuous-time limit we have
    \begin{align}
        \frac{d\bm w^+(t)}{dt} &= -\bm w^+(t) \odot X^\top (X \bm \beta(t) - \bm y) \label{eq:weightsuGD}\\
        \frac{d\bm w^-(t)}{dt} &= \bm w^-(t) \odot X^\top (X \bm \beta(t) - \bm y). \label{eq:weightsvGD}
    \end{align}
With initialization $\bm w^+(0) = \bm w^-(0) = \alpha \bm 1$, we write implicit solutions to \eqref{eq:weightsuGD} and \eqref{eq:weightsvGD} as \begin{align*}
    \bm w^+(t) &= \alpha \exp\left(-\int_0^t X^\top(X\bm\beta(s) - \bm y)~ds\right) \\
    \bm w^-(t) &= \alpha \exp\left(\int_0^t X^\top(X\bm\beta(s) - \bm y)~ds  \right).
\end{align*}
Therefore, we have
\begin{align*}
    \bm\beta(t) &= \bm w^+(t) \odot \bm w^+(t)  - \bm w^-(t) \odot \bm w^-(t)
    \\
    &= \alpha^2 \left[\exp\left(-2\int_0^t X^\top(X\bm\beta(s) - \bm y)~ds\right) - \exp\left(2\int_0^t X^\top(X\bm\beta(s) - \bm y)~ds \right) \right] \\
    &= 2\alpha^2 \operatorname{sinh}\left(-2\int_0^t X^\top (X\bm\beta(s) - \bm y)~ds \right).
\end{align*}
It follows that
\begin{align*}
\frac{1}{2\alpha^2}\bm\beta(t) &=  \operatorname{sinh}\left(-2\int_0^t X^\top (X\bm\beta(s) - \bm y)~ds \right) \\
\operatorname{arcsinh}\left(\frac{\bm\beta(t)}{2\alpha^2}\right) &= -\int_0^t 2X^\top (X\bm\beta(s) - \bm y)~ds \\
\frac{d\operatorname{arcsinh}\left(\frac{\bm\beta(t)}{2\alpha^2}\right)}{dt} &= -2X^\top (X\bm\beta(t) - \bm y).
\end{align*}
We note that $\nabla_{\bm\beta} L(\bm\beta(t)) = \frac{1}{2}X^\top (X \bm\beta(t) - \bm y)$. In turn, we have \begin{align}
\frac{d\operatorname{arcsinh}\left(\frac{\bm\beta(t)}{2\alpha^2}\right)}{dt} = -2X^\top (X\bm\beta(t) - \bm y) = -4\nabla_{\bm\beta} L(\bm\beta(t)).\label{eq:77}
\end{align}
Given the potential function $\Psi_{\alpha}(\bm\beta(t)) = \frac{1}{4}\sum_{i=1}^D \left(\beta_i\operatorname{arcsinh}\left(\frac{\beta_i}{2\alpha^2}\right) - \sqrt{\beta_i^2 + 4\alpha^4} \right)
$, we have the mirror map\begin{align}
    \nabla \Psi_\alpha(\bm\beta(t)) = \frac{1}{4}\operatorname{arcsinh}\left(\frac{\bm\beta(t)}{2\alpha^2}\right).\label{eq:78}
\end{align}
Combining \eqref{eq:77} and \eqref{eq:78}, we get the dual dynamics for GD\begin{align*}
    \frac{d\nabla \Psi_\alpha(\bm\beta(t))}{dt} = -\nabla_{\bm\beta}L(\bm\beta(t)).
\end{align*}
