
\begin{lemma}\label{lemma:stationary_cond}
  The stationary condition of Problem~\eqref{eq:backfitting} with respect to \(\mathbf{f}_g\) is given by
  \begin{equation}
    f_{g,h}^{(k)} + \sum_{h' \in [d_g]: h' \neq h} P_hf_{g,h'}^{(k)} - P_hR_g^{(k)} + \lambda\sqrt{d_g}u^{(k)}v_{h}^{(k)} = 0,
  \end{equation}
  for all \(h \in [d_g]\) where \(u^{(k)}\) are scalars and \(\mathbf{v}_g^{(k)} = (v_{h}^{(k)})_{h\in[d_g]}\) is a vector of measurable functions of \(X_h^{(g)}\), with
  \begin{equation}
    (u^{(1)}, \dots, u^{(d_j)})^T \in \partial \norm{\cdot}_\infty\rvert_{(\norm{\mathbf{f}_g^{(1)}}, \dots, \norm{\mathbf{f}_g^{(d_j)}})^T},
  \end{equation}
  and
  \begin{equation}
    \mathbf{v}_{g}^{(k)} \in \partial \norm{\mathbf{f}_g^{(k)}},
  \end{equation}
  for \(k = 1, \dots, d_j\). The subdifferential of the sup-norm evaluated at \((\norm{\mathbf{f}_g^{(1)}}, \dots, \norm{\mathbf{f}_g^{(d_j)}})^T\) lies in the \(d_j\)-dimensional Euclidean space.
\end{lemma}

\begin{proof}
  Since both the loss function and the regularization term are convex, the solution to the objective function in \eqref{eq:backfitting} can be characterized by the Karush-Kuhn-Tucker conditions. We investigate the subdifferential for the loss and the regularization term separately. For readability, we omit the argument of the component function when it is clear from the context. Starting with the loss function we define
  \begin{equation*}
    L(\mathbf{f}_g^{(k)}) \coloneq \frac{1}{2} \mathbb{E}\left[ \sum_{k=1}^{d_j} \bigg(R_g^{(k)} - \sum_{h\in [d_g]} f_{g,h}^{(k)}\bigg)^2 \right].
  \end{equation*}
  Consider a perturbation of \(L(\mathbf{f}_g^{(k)})\) along the direction
  \begin{equation*}
    \bm{\psi}_g^{(k)} = \left(\psi_{g,h}^{(k)} \in \mathcal{H}_{g,h}^{(k)}\right)_{h \in [d_g]}
  \end{equation*}
  such that
  \begin{multline*}
    \lim_{\tau \to 0} \frac{L(\mathbf{f}_g^{(k)} + \tau\bm{\psi}_g^{(k)}) - L(\mathbf{f}_g^{(k)})}{\tau} \\
    \begin{aligned}
      &= \sum_{h\in [d_g]}\mathbb{E}\Big[ (\sum_{h' \in [d_g]} f_{g,h'}^{(k)} -R_g^{(k)})\psi_{g,h}^{(k)} \Big] \\
      &= \sum_{h\in [d_g]}\mathbb{E}\Bigg[ \mathbb{E}\Big[\sum_{h' \in [d_g]} f_{g,h'}^{(k)} -R_g^{(k)} \mid X_h^{(g)}\Big]\psi_{g,h}^{(k)} \Bigg] \\
      &= \sum_{h\in [d_g]} \bigg\langle \mathbb{E}\Big[\sum_{h' \in [d_g]} f_{g,h'}^{(k)} -R_g^{(k)} \mid X_h^{(g)}\Big], \psi_{g,h}^{(k)} \bigg\rangle.
    \end{aligned}
  \end{multline*}
  The second equality follows from the law of iterated expectations and the third from expressing the resulting expectation as an inner product in the Hilbert space \(\mathcal{H}_{g,h}^{(k)}\). The gradient of \(L(\mathbf{f}_g^{(k)})\) is
  \begin{equation*}
    \nabla L(\mathbf{f}_g^{(k)}) = \left[\mathbb{E}\Big[\sum_{h' \in [d_g]} f_{g,h'}^{(k)} -R_g^{(k)} \mid X_h^{(g)}\Big]\right]_{h\in [d_g]}.
  \end{equation*}
  The subdifferential of \(\Phi^{d_j}_{\text{group}}(f)\) is given by
  \begin{equation*}
    \partial \Phi^{d_j}_{\text{group}}(f) = \lambda \sqrt{d_g} u^{(k)} v_{h}^{(k)}, \quad \forall h \in [d_g],
  \end{equation*}
  where \((u^{(1)}, \dots, u^{(d_j)})^T \in \partial \norm{\cdot}_\infty\rvert_{(\norm{\mathbf{f}_g^{(1)}}, \dots, \norm{\mathbf{f}_g^{(d_j)}})^T}\) and \(\mathbf{v}_g^{(k)} = (v_{h}^{(k)})_{h\in[d_g]} \in \partial \norm{\mathbf{f}_g^{(k)}}\).

  Isolating \(f_{g,h}^{(k)}(X_h^{(g)}) = \mathbb{E}[f_{g,h}^{(k)}(X_h^{(g)}) \mid X_h^{(g)}]\) and using the conditional expectation operator \(\mathbb{E}[\ \cdot \mid X_h^{(g)}]\) for the remaining terms yields the expression for the stationary condition.
\end{proof}

The following two lemmas characterize the subdifferential of the sup-norm (Lemma~\ref{lemma:sup_norm}; a proof is provided in \citealp[Chapter 8]{Rockafellar1998}) and that of the Euclidean norm (Lemma~\ref{lemma:euclidean}).

\begin{lemma}\label{lemma:sup_norm}
  The subdifferential of \(\norm{\cdot}_\infty\) in \(\R^{d_j}\) is
  \begin{equation}
    \partial \norm{\cdot}_\infty \rvert_x =
    \begin{cases}
      \{\eta : \norm{\eta}_1 \leq 1\}                                   & \text{if } x = 0 \\
      \text{conv}\{\text{sign}(x_k) e_k : \abs{x_k} = \norm{x}_\infty\} & \text{o.w.},
    \end{cases}
  \end{equation}
  where \(\text{conv}(A)\) denotes the convex hull of the set \(A\) and \(e_k\) is the \(k\)-th canonical unit vector in \(\R^{d_j}\).
\end{lemma}

\begin{lemma}\label{lemma:euclidean}
  The subdifferential of \(\norm{\mathbf{f}_g}\) is
  \begin{equation}
    \partial \norm{\mathbf{f}_g} =
    \begin{cases}
      \{f_j / \norm{\mathbf{f}_g}\}_{j\in g}    & \text{if } \norm{\mathbf{f}_g} \neq 0 \\
      \{\mathbf{v}_g : \norm{\mathbf{v}_g} \leq 1\} & \text{if } \norm{\mathbf{f}_g} = 0.
    \end{cases}
  \end{equation}
\end{lemma}

The proof proceeds by considering three cases for the sup-norm subdifferential evaluated at \((\norm{\mathbf{f}_g^{(1)}}, \dots, \norm{\mathbf{f}_g^{(d_j)}})^T\): (1)
\(\norm{\mathbf{f}_g^{(k)}} = 0\) for all \(k = 1, \dots, d_j\); (2) there exists a unique \(k\) such that \(\norm{\mathbf{f}_g^{(k)}} = \max_{k' = 1, \dots, d_j} \norm{\mathbf{f}_g^{(k')}}\); (3) there exist at least two \(k \neq k'\) such that \(\norm{\mathbf{f}_g^{(k)}} = \norm{\mathbf{f}_g^{(k')}} = \max_{m = 1, \dots, d_j} \norm{\mathbf{f}_g^{(m)}}\).

We begin with the proof of Proposition~\ref{prop:all_zeros}, i.e., the case where \(\sum_{k=1}^{d_j} \norm{\mathbf{Q}R_g^{(k)}} \leq \lambda \sqrt{d_g}\) and show that \(\norm{\mathbf{f}_g^{(k)}} = 0\) must be a solution.

\begin{proof}
  From Lemma~\ref{lemma:stationary_cond} we know that if \(\norm{\mathbf{f}_g^{(k)}} = 0\) then \(\norm{\mathbf{u}}_1 \leq 1\) and \(\norm{\mathbf{v}_{g}^{(k)}} \leq 1\). It follows that
  \begin{align*}
    P_hR_g^{(k)} &= \lambda\sqrt{d_g}u^{(k)}v_{h}^{(k)} \\
    \sum_{k=1}^{{d_j}} \sqrt{\sum_{h\in [d_g]} \mathbb{E}[(P_h R_g^{(k)})^2]} &\leq \lambda \sqrt{d_g}.
  \end{align*}
  On the other hand, we also know from Lemma~\ref{lemma:stationary_cond} that \(\norm{\mathbf{f}_g^{(k)}} = 0\) if and only if \(\exists u^{(1)}, \dots, u^{(d_j)}\) such that \(\sum_{k=1}^{d_j} \abs{u^{(k)}} \leq 1\) and \(\exists v_{1}^{(k)}, \ldots, v_{d_g}^{(k)}\) such that \(\sqrt{\sum_{h=1}^{d_g} \mathbb{E}[(v_h^{(k)})^2]} \leq 1\) and
  \begin{equation*}
    \lambda\sqrt{d_g}u^{(k)}v_{h}^{(k)} = P_hR_g^{(k)}.
  \end{equation*}
  Then, if \(\sum_{k=1}^{d_j} \norm{\mathbf{Q}R_g^{(k)}} \leq \lambda \sqrt{d_g}\), choosing \(u^{(k)}\) and \(v_{h}^{(k)}\) as above guarantees that \(\sum_{k=1}^{d_j} \abs{u^{(k)}} \leq 1\) and \(\norm{\mathbf{v}_g^{(k)}} \leq 1\), therefore \(\norm{\mathbf{f}_g^{(k)}} = 0\).
\end{proof}

If we continue to allow the within-group covariances to be nonzero, we obtain the following known result.

\begin{lemma}
  Consider the case where there exists a unique \(k\) at which the sup-norm is attained. Then the group SpAM of \citet{Yin2012} is recovered.
\end{lemma}

\begin{proof}
  Note that we must have \(\sum_{k=1}^{d_j} \norm{\mathbf{Q}R_g^{(k)}} > \lambda \sqrt{d_g}\); otherwise \(f_{g,h}^{(k)} = 0\) for all \(h \in [d_g]\) and \(k \in [d_j]\).

  Denote by \(k_1 \in [d_j]\) the unique \(k\) that attains the sup-norm; then \(\norm{\mathbf{f}_g^{(k_1)}} > \norm{\mathbf{f}_g^{(k)}}\) for all \(k \neq k_1\). Consequently, the subdifferential of the sup-norm becomes \(\partial \norm{\cdot}_\infty\rvert_{(\norm{\mathbf{f}_g^{(1)}}, \dots, \norm{\mathbf{f}_g^{(d_j)}})^T} = \{e_{k_1}\}\), where \(e_{k_1}\) is the \(k_1\)-th canonical unit vector in \(\R^{d_j}\). Hence, from Lemma~\ref{lemma:stationary_cond} we have that
  \begin{multline}\label{eq:case_2_kkt}
    P_hR_g^{(k)} - \Big(f_{g,h}^{(k)} + \sum_{h' \in [d_g]: h' \neq h} P_hf_{g,h'}^{(k)}\Big) \\
    = \lambda\sqrt{d_g}\frac{f_{g,h}^{(k)}}{\norm{\mathbf{f}_{g}^{(k)}}} \mathbbm{1}_{\{k = k_1\}}.
  \end{multline}
  If \(k \neq k_1\) then Equation~\eqref{eq:case_2_kkt} implies
  \begin{equation*}
    \mathbf{f}_g^{(k)} = \mathbf{\mathcal{I}}^{-1} \mathbf{Q}R_g^{(k)},
  \end{equation*}
  where
  \begin{equation*}
    \mathbf{\mathcal{I}} =
    \begin{bmatrix}
      1 & P_1 & \cdots & P_1 \\
      P_2 & 1 & \cdots & P_2 \\
      \vdots & \vdots & \ddots  &  \vdots \\
      P_{d_g} & P_{d_g} & \cdots  & 1
    \end{bmatrix}
  \end{equation*}

  On the other hand, if \(k = k_1\), we have
  \begin{align*}
    \mathbf{Q}R_g^{(k_1)} - \mathbf{\mathcal{I}}\mathbf{f}_g^{(k_1)} &= \lambda\sqrt{d_g}\frac{\mathbf{f}_{g}^{(k_1)}}{\norm{\mathbf{f}_{g}^{(k_1)}}} \\
    \Bigg[\mathbf{\mathcal{I}} + \frac{\lambda\sqrt{d_g}}{\norm{\mathbf{f}_{g}^{(k_1)}}} I_{d_g}\Bigg] \mathbf{f}_g^{(k_1)} &= \mathbf{Q}R_g^{(k_1)},
  \end{align*}
  from which we obtain the group SpAM result discussed by \citet{Yin2012}, i.e.,
  \begin{equation*}
    \mathbf{f}_g^{(k_1)} = \Bigg[\mathbf{\mathcal{I}} + \frac{\lambda\sqrt{d_g}}{\norm{\mathbf{f}_{g}^{(k_1)}}}I_{d_g}\Bigg]^{-1}\mathbf{Q}R_g^{(k_1)}.
  \end{equation*}

\end{proof}

For the remainder of the backfitting update derivation, assume that
\begin{equation*}
  P_hf_{g,h'}^{(k)} = \mathbb{E}[f_{g,h'}^{(k)} \mid X_{h}^{(g)}] =  0, \quad \forall h' \neq h.
\end{equation*}
This implies that the covariance of the within-group component functions is zero, i.e.,
\begin{align*}
  \operatorname{Cov}(f_{g,h}^{(k)}, f_{g,h'}^{(k)}) &= \mathbb{E}[f_{g,h}^{(k)} f_{g,h'}^{(k)}] \\
  &= \mathbb{E}[f_{g,h}^{(k)} \mathbb{E}[f_{g,h'}^{(k)} \mid X_{h}^{(g)}]] \\
  &= 0.
\end{align*}

Under this assumption, the stationary condition in Lemma~\ref{lemma:stationary_cond} simplifies to
\begin{equation*}
  f_{g,h}^{(k)} - P_hR_g^{(k)} + \lambda\sqrt{d_g}u^{(k)}v_{h}^{(k)} = 0.
\end{equation*}

Suppose \(k_1 \in [d_j]\) is the unique \(k\) that attains the sup-norm (case 2). Then, we have the simplified expression
\begin{equation}\label{eq:case_2_not_k_1}
  {f}_{g,h}^{(k)} = {P}_{h}^{(k)}R_g^{(k)}, \quad \forall k \neq k_1
\end{equation}
On the other hand, if \(k = k_1\), the expression simplifies to
\begin{align*}
  P_h^{(k_1)}R_g^{(k_1)} - f_{g,h}^{(k_1)} &= \lambda\sqrt{d_g}\frac{f_{g,h}^{(k_1)}}{\norm{\mathbf{f}_{g}^{(k_1)}}} \\
  f_{g,h}^{(k_1)} \left[1 + \frac{\lambda\sqrt{d_g}}{\norm{\mathbf{f}_{g}^{(k_1)}}}\right] &= P_h^{(k_1)}R_g^{(k_1)} \\
  f_{g,h}^{(k_1)} &= \left[1 + \frac{\lambda\sqrt{d_g}}{\norm{\mathbf{f}_{g}^{(k_1)}}}\right]^{-1} P_h^{(k_1)}R_g^{(k_1)}.
\end{align*}
Taking the \(L_2\)-norm on both sides yields
\begin{align*}
  \sqrt{\sum_{h \in [d_g]} \mathbb{E}[(f_{g,h}^{(k_1)})^2]} &= \norm{\mathbf{f}_{g}^{(k_1)}} \\
  &= \left[1 + \frac{\lambda\sqrt{d_g}}{\norm{\mathbf{f}_{g}^{(k_1)}}}\right]^{-1} \norm{\mathbf{Q}R_g^{(k_1)}}.
\end{align*}
Solving for \(\norm{\mathbf{f}_{g}^{(k_1)}}\) gives us the following identity
\begin{equation*}
  \norm{\mathbf{f}_{g}^{(k_1)}} = \norm{\mathbf{Q}R_g^{(k_1)}} - \lambda \sqrt{d_g}.
\end{equation*}
Plugging this into the simplified update for \(f_{g,h}^{(k_1)}\) above finally yields
\begin{align*}
  f_{g,h}^{(k_1)} &= \left[1 + \frac{\lambda\sqrt{d_g}}{\norm{\mathbf{f}_{g}^{(k_1)}}}\right]^{-1} P_h^{(k_1)}R_g^{(k_1)} \\
  &= \left[1 + \frac{\lambda\sqrt{d_g}}{\norm{\mathbf{Q}R_g^{(k_1)}} - \lambda \sqrt{d_g}}\right]^{-1} P_h^{(k_1)}R_g^{(k_1)} \\
  &= \left[1 - \frac{\lambda\sqrt{d_g}}{\norm{\mathbf{Q}R_g^{(k_1)}}}\right] P_h^{(k_1)}R_g^{(k_1)} \\
  &= \left[\norm{\mathbf{Q}R_g^{(k_1)}} - \lambda\sqrt{d_g}\right] \frac{P_h^{(k_1)}R_g^{(k_1)}}{\norm{\mathbf{Q}R_g^{(k_1)}}}
\end{align*}
for all \(h \in [d_g]\).

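The case where \(m > 1\) entries \(\norm{\mathbf{f}_{g}^{(k_1)}}, \ldots, \norm{\mathbf{f}_{g}^{(k_m)}}\) attain the sup-norm also simplifies: by Lemma~\ref{lemma:sup_norm} there exist weights \(a_1, \dots, a_m \geq 0\) with \(\sum_{i=1}^{m} a_i = 1\) such that, for all \(i \in [m]\), we have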
\begin{equation*}
  {P}_h^{(k_i)}R_g^{(k_i)} = \lambda\sqrt{d_g}a_i\frac{{f}_{g,h}^{(k_i)}}{\norm{\mathbf{f}_{g}^{(k_i)}}} + {f}_{g,h}^{(k_i)}.
\end{equation*}
Recalling that \(\norm{\mathbf{f}_{g}^{(k_1)}} = \cdots = \norm{\mathbf{f}_{g}^{(k_m)}}\), taking the \(L_2\) norm on both sides and summing over all \(i \in [m]\) yields
\begin{align*}
  \sum_{i=1}^{m}\norm{\mathbf{Q}R_g^{(k_i)}} &= \sum_{i=1}^{m} \norm{\left[\frac{\lambda\sqrt{d_g}a_i}{\norm{\mathbf{f}_{g}^{(k_m)}}} + 1\right] \mathbf{f}_{g}^{(k_i)}} \\
  &= \sum_{i=1}^{m} \left[\frac{\lambda\sqrt{d_g}a_i}{\norm{\mathbf{f}_{g}^{(k_m)}}} + 1 \right] \norm{\mathbf{f}_{g}^{(k_m)}} \\
  &= \left[\frac{\lambda\sqrt{d_g}}{\norm{\mathbf{f}_{g}^{(k_m)}}} \sum_{i=1}^{m}a_i + m \right] \norm{\mathbf{f}_{g}^{(k_m)}} \\
  &= \lambda\sqrt{d_g} + m \norm{\mathbf{f}_{g}^{(k_m)}}.
\end{align*}
Isolating \(\norm{\mathbf{f}_{g}^{(k_m)}}\) gives us the following identity
\begin{equation*}
  \norm{\mathbf{f}_{g}^{(k_1)}} = \cdots = \norm{\mathbf{f}_{g}^{(k_m)}} = \frac{1}{m}\left[\sum_{i=1}^{m}\norm{\mathbf{Q}R_g^{(k_i)}} - \lambda\sqrt{d_g}\right].
\end{equation*}
Plugging this into the simplified update for case (3) yields
\begin{multline*}
  f_{g,h}^{(k_m)} = \left[1 + \frac{\lambda\sqrt{d_g}a_m}{\norm{\mathbf{f}_{g}^{(k_m)}}}\right]^{-1} P_hR_g^{(k_m)} = \\
  \Bigg[1 + \frac{\lambda\sqrt{d_g}a_m}{\frac{1}{m}\left[\sum_{i=1}^{m}\norm{\mathbf{Q}R_g^{(k_i)}} - \lambda\sqrt{d_g}\right]}\Bigg]^{-1} P_hR_g^{(k_m)}.
\end{multline*}
Since taking the \(L_2\) norm of the update for \(k_m\) alone gives \(\lambda\sqrt{d_g}a_m = \norm{\mathbf{Q}R_g^{(k_m)}} - \norm{\mathbf{f}_{g}^{(k_m)}}\), a few algebraic manipulations yield
\begin{equation*}
  f_{g,h}^{(k_m)} = \frac{1}{m}\left[\sum_{i=1}^{m}s_g^{(k_i)} - \lambda\sqrt{d_g}\right] \frac{P_hR_g^{(k_m)}}{s_g^{(k_m)}},
\end{equation*}
where \(s_g^{(k_i)} = \norm{\mathbf{Q}R_g^{(k_i)}}\) for all \(i \in [m]\).

Now that we have the update for case (3), we still need to find the exact condition under which precisely \(m\) entries attain the sup-norm. This condition is given in Lemma~\ref{lemma:m_selection}. The arguments follow a similar logic to \citet{Fornasier2008}.

\begin{lemma}\label{lemma:m_selection}
  For some \(m > 1\) precisely \(m\) entries \(\norm{\mathbf{f}_{g}^{(k_1)}}, \ldots, \norm{\mathbf{f}_{g}^{(k_m)}}\) attain the sup-norm \(\max_{k=1, \ldots, {d_j}} \norm{\mathbf{f}_{g}^{(k)}}\) if and only if \[s_g^{(k_m)} \geq \frac{1}{m-1}\left[\sum_{i=1}^{m-1}s_g^{(k_i)} - \lambda\sqrt{d_g}\right]\] and \[s_g^{(k_{m+1})} < \frac{1}{m}\left[\sum_{i=1}^{m}s_g^{(k_i)} - \lambda\sqrt{d_g}\right].\]
\end{lemma}
\begin{proof}
  Assume exactly \(m\) entries \(\norm{\mathbf{f}_{g}^{(k_1)}}, \ldots, \norm{\mathbf{f}_{g}^{(k_m)}}\) attain the sup-norm. Then, for all \(l \in [m]\) we have
  \begin{equation*}
    f_{g,h}^{(k_l)} = \frac{1}{m}\left[\sum_{i=1}^{m}s_g^{(k_i)} - \lambda\sqrt{d_g}\right] \frac{P_hR_g^{(k_l)}}{s_g^{(k_l)}}.
  \end{equation*}
  By Lemmas~\ref{lemma:stationary_cond} and~\ref{lemma:sup_norm} we know that
  \begin{equation*}
    {P}_h^{(k_i)}R_g^{(k_i)} = \lambda\sqrt{d_g}a_i\frac{{f}_{g,h}^{(k_i)}}{\norm{\mathbf{f}_{g}^{(k_i)}}} + {f}_{g,h}^{(k_i)} \quad \forall i \in [m].
  \end{equation*}
  Isolating \(a_i\) after taking the \(L_2\) norm on both sides we obtain
  \begin{equation*}
    a_i = \frac{\norm{\mathbf{Q}R_g^{(k_i)}} - \norm{\mathbf{f}_{g}^{(k_i)}}}{\lambda\sqrt{d_g}} \quad \forall i \in [m].
  \end{equation*}
  Plugging the identity for \(\norm{\mathbf{f}_{g}^{(k_m)}}\) into \(a_m\) and since \(a_m \geq 0\) we obtain
  \begin{equation*}
    \norm{\mathbf{Q}R_g^{(k_m)}} \geq \frac{1}{m-1}\left[\sum_{i=1}^{m-1}\norm{\mathbf{Q}R_g^{(k_i)}} - \lambda\sqrt{d_g}\right].
  \end{equation*}
  Since \(\norm{\mathbf{f}_{g}^{(k_m)}} > \norm{\mathbf{f}_{g}^{(k_{m+1})}}\), Eq.~\eqref{eq:case_2_not_k_1} implies that \(\norm{\mathbf{Q}R_g^{(k_{m+1})}} = \norm{\mathbf{f}_{g}^{(k_{m+1})}} < \norm{\mathbf{f}_{g}^{(k_m)}} = \frac{1}{m}\left[\sum_{i=1}^{m}\norm{\mathbf{Q}R_g^{(k_i)}} - \lambda\sqrt{d_g}\right]\).

  On the other hand, assuming that exactly \(l \neq m\) entries \(\norm{\mathbf{f}_{g}^{(k_1)}}, \ldots, \norm{\mathbf{f}_{g}^{(k_l)}}\) attain the sup-norm and following the same steps as above leads to a contradiction.
\end{proof}

From Lemma~\ref{lemma:m_selection} we conclude that there exist exactly \(m^*\) entries that attain the sup-norm if and only if
\begin{equation*}
  m^* = \argmax_m \frac{1}{m} \left[\sum_{i=1}^{m}\norm{\mathbf{Q}R_g^{(k_i)}} - \lambda\sqrt{d_g}\right].
\end{equation*}
This concludes the proof of Theorem~\ref{thm:backfitting_update}.
