\appendix

\section{Formal Derivations}
\label{appendix_theroy}
\subsection{General}
\begin{itemize}
    \item[$n_t$]: number of samples of task $t$
    \item[$\textbf{X}_t$]: task batch $\in \mathbb{R}^{n_t\times m}$
    \item[$X_t$]: task sample $\in \mathbb{R}^{1\times m}$
    \item[$d_l$]: hidden dimension of layer $l$
    \item[$o$]: output dimension (number of classes)
    \item[$\sigma$]: ReLU activation function, $\sigma(z):=\max{(0,z)}$
    \item[$\Theta$]: hidden layer $\in \mathbb{R}^{m\times d}$
    \item[$\vartheta_t$]: output head (for task $t$) $\in \mathbb{R}^{d\times o}$, frozen and initialized $\vartheta_t\overset{\text{iid}}{\sim}\mathcal{N}(0,\frac{1}{d})$
    
    \item[$z_t$]: pre-activations $z_t=X_t\Theta \in \mathbb{R}^{1\times d}$
    \item[$h_t$]: activations $h_t=\sigma(z_t)\in \mathbb{R}^{1\times d}$
    \item[$f$]: logits $f(X_t) = h_t\vartheta_t$
    \item[$E_t$]: residuals $E_t:=f(X_t)-Y_t\in \mathbb{R}^{1\times o}$
    \item[$\delta_t$]: logit residuals $\delta_t:=\text{Softmax}(f(X_t))-Y_t\in \mathbb{R}^{1\times o}$
    \item[$L_t$]: loss task $t$
    \item[$\mathcal{L}_t^{(i)}$]: $i^{th}$ sample contribution to loss task $t$
    
\end{itemize}


\subsection{Mathematical Identities}

\paragraph{Frobenius product rank one identity }
\begin{equation}
    \label{frob_id_1}
     \langle AB^\top, CD^\top\rangle_F=\text{Tr}(BA^\top CD^\top)=(A^\top C)(B^\top D)= \langle A,C\rangle_F \langle B,D\rangle_F
\end{equation}



\paragraph{Trace}

\begin{equation}
\label{eq:tr_identity_1}
    \operatorname{Tr}(AB)=\operatorname{Tr}(BA)
\end{equation}

\paragraph{Vec}

\begin{equation}
    \Theta\in\mathbb{R}^{m\times d}\to\theta:=\operatorname{vec}(\Theta)\in \mathbb{R}^{1\times md}
\end{equation}


\paragraph{Kronecker product}


\begin{equation}
\label{eq:kronecker_identity_1}
    \underbrace{a}_{1\times n} \otimes_\text{Kr} \underbrace{b}_{1\times m} = \underbrace{\operatorname{vec}(b^\top a)}_{1\times (nm)}
\end{equation}


The ``vec trick'':
\begin{equation}
\label{eq:kronecker_identity_2}
(A \otimes_\text{Kr} B)\operatorname{vec}(C)=\operatorname{vec}(BCA^\top)
\end{equation}


\subsection{Definitions}

\subsubsection{Forgetting}
\begin{equation}
\label{eq:cf_definition}
    \text{CF} := \langle \nabla_{\Theta} \mathcal{L}_1, \nabla_{\Theta} \mathcal{L}_{2} \rangle_F
\end{equation}


\subsubsection{Mean Squared Error (MSE)}

\begin{equation}
    L_t = \frac{1}{n_t}\sum_{i=1}^{n_t} \mathcal{L}^{(i)}_t
\end{equation}

\begin{equation}
\mathcal{L}^{(i)}_t=\frac{1}{2}\Vert E_t^{(i)} \Vert^2=\frac{1}{2}(f(X^{(i)}_t)-Y^{(i)}_t)(f(X^{(i)}_t)-Y^{(i)}_t)^\top
\end{equation}

\subsubsection{Cross-Entropy}

\begin{equation}
    \mathcal{L}_t^{(i)}=-\sum_{c=1}^o Y_{t,c}^{(i)}\log\Big(\text{Softmax}\big(f_t(X_t^{(i)})\big)_c\Big)
\end{equation}


\subsubsection{Loss Gradient}

Applying the chain rule on a generic loss $\mathcal{L}$, the vectorized gradient w.r.t. the layer parameter $\Theta$ can be separated into a loss-dependent and model-dependent component.

\begin{equation}
    \nabla_\Theta \mathcal{L}=\underbrace{\nabla_f \mathcal{L}}_{\text{loss}^\dagger}\cdot \underbrace{J_f(\Theta)}_{\text{model}^\ddagger}
\end{equation}

\begin{itemize}
    \item[$\dagger$] since $\mathcal{L}$ is a scalar-valued function, this term is the gradient w.r.t.\ the model function $f$, $\nabla_f \mathcal{L}\in\mathbb{R}^{1\times o}$
    \item[$\ddagger$] is the Jacobian of the model output w.r.t the parameter $\Theta\in\mathbb{R}^{m\times d}$. The model $f$ is generally a vector-valued function if $o>1$. Therefore, it is a Jacobian rather than a gradient and $J_f(\Theta)\in \mathbb{R}^{o\times m\times d}$
\end{itemize}

In the vectorized form $\theta=\operatorname{vec}(\Theta)$

\begin{equation}
    \nabla_\theta\mathcal{L}=\underbrace{\nabla_f\mathcal{L}}_{1\times o}\cdot \underbrace{J_f(\theta)}_{o\times md}
\end{equation}

The term $\dagger$ can be calculated independently of the model considered.

\begin{equation}
    \nabla_f \mathcal{L}=\begin{cases}
        E_t:= f_t(X_t^{(i)})-Y_t^{(i)} & \text{MSE} \\
        \delta_t:= \text{Softmax}\big(f_t(X_t^{(i)})\big)-Y_t^{(i)} & \text{Cross-Entropy}
    \end{cases}
\end{equation}

In the next derivations, $E_t$ and $\delta_t$ can be interchanged depending on the loss function. 

To further simplify our analysis, we set $o=1$ and denote by $e$ the resulting scalar residual ($E_t$ or $\delta_t$, depending on the loss considered). The Jacobian then reduces to a row vector and can be replaced by the gradient $\nabla_\theta f$.

\begin{equation}
\label{eq:loss_decomposition}
    \nabla_\theta\mathcal{L}=e\nabla_\theta f
\end{equation}

In the following subsections, $\nabla_\theta f$ will be calculated for different models $f$.

\subsection{Linear Model}

\[f(X)=z\vartheta = X\Theta\vartheta\]

The gradient for Equation \ref{eq:loss_decomposition} is decomposed by the chain rule.

\begin{equation}
    \nabla_\theta f =\nabla_z f\cdot J_z(\theta)
\end{equation}

\begin{align}
    & \nabla_z f = \vartheta^\top \\
    & J_{z}(\theta)=(I_d \otimes_\text{Kr} X)
\end{align}

$\otimes_\text{Kr}$ is the Kronecker product of two matrices: $(k\times j) \otimes_\text{Kr} (m\times n)\to (km \times jn)$.


Substituting these components into the chain rule gives the gradient.

\begin{align}
    \nabla_\theta f &=\vartheta^\top \otimes_\text{Kr} X\\
    &= \operatorname{vec}(X^\top\vartheta^\top)
\end{align}

The gradient is reshaped back into its original shape by the $\operatorname{unvec}$ operation.

\begin{equation}
    \nabla_\Theta f=\operatorname{unvec}(\nabla_\theta f)=X^\top\vartheta^\top
\end{equation}

Substitute in Equation \ref{eq:loss_decomposition}, 

\begin{align}
    \nabla_{\Theta}\mathcal{L}_t&=eX^\top \vartheta^\top\\
    &=X^\top e\vartheta^\top
\end{align}

It is now possible to calculate the gradient inner product $G$ of Equation~\ref{eq:cf_definition}. For this, it is necessary to introduce the subscripts $t$ and $t'$, identifying the two tasks, into the notation.


\begin{align}
G &= \langle \nabla_{\Theta} \mathcal{L}_t, \nabla_{\Theta} \mathcal{L}_{t'} \rangle_F \\
&=\langle X_t^\top e_t\vartheta_t^\top, X_{t'}^\top e_{t'}\vartheta_{t'}^\top\ \rangle_F\\
&=\underbrace{e_t e_{t'}}_K\langle X_t^\top \vartheta_t^\top, X_{t'}^\top \vartheta_{t'}^\top\ \rangle_F\\
&=K\langle X_t,X_{t'}\rangle \langle \vartheta_t,\vartheta_{t'}\rangle && \text{using the identity \ref{frob_id_1}}
\end{align}


In expectation, $G$ is zero since $\vartheta_t\overset{\text{iid}}{\sim}\mathcal{N}(0,\frac{1}{d})$---but the variance is non-zero.

\begin{equation}
\begin{split}
    \text{Var}[G]&=\mathbb{E}[G^2]-\mathbb{E}[G]^2
    \\
    &=\mathbb{E}[G^2]
    \\
    &=K^2\langle X_t,X_{t'}\rangle^2 \mathbb{E}[\langle \vartheta_{t},\vartheta_{t'}\rangle^2]
    \\
    &=K^2\langle X_t,X_{t'}\rangle^2 \text{Var}[\langle \vartheta_t,\vartheta_{t'}\rangle]
    \\
    &=K^2\langle X_t,X_{t'}\rangle^2 \langle \text{Var}[\vartheta_t],\text{Var}[\vartheta_{t'}]\rangle
    \\
    &=K^2\langle X_t,X_{t'}\rangle^2 \operatorname{Var}[\vartheta_t] \operatorname{Var}[\vartheta_{t'}]\langle I_d, I_d\rangle
    \\
    &=K^2\langle X_1,X_2\rangle^2\frac{1}{d^2}d\\
    &=K^2\langle X_1,X_2\rangle^2\frac{1}{d}
\end{split}
\end{equation}

The upper bound for catastrophic forgetting is given by its standard deviation.
\begin{equation}
    \mathcal{B}=\operatorname{std}[\text{CF}]=K\langle X_1,X_2\rangle\frac{1}{\sqrt{d}}
\end{equation}

\begin{equation}
\boxed{
    \text{CF}\leq \mathcal{B} = \langle X_1,X_2\rangle\frac{K}{\sqrt{d}} \ \text{w.h.p}
    }
\end{equation}





\subsection{One-Layer MLP}
Since there is only one hidden layer, the subscript to distinguish the layers is omitted. The mapping $f$ can be written equivalently for the task batch $\textbf{X}_t$ or for a single sample $X_t$.

\begin{equation}
\begin{split}
    f(X_t)&=h_t\vartheta_t\\
    &=\sigma(z_t)\vartheta_t\\
    &=\sigma(X_t\Theta)\vartheta_t
\end{split}
\end{equation}


The Jacobian $\ddagger$ for Equation \ref{eq:loss_decomposition} can be decomposed by applying the chain rule and the vectorized parameter $\theta=\operatorname{vec}(\Theta)$

\begin{equation}
\label{eq:J_oneMLP_decomposition}
    \nabla_\theta f=
    \underbrace{\nabla_h f}_{\in\mathbb{R}^{1\times d}}
    \cdot
    \underbrace{J_{h}(z)}_{\in\mathbb{R}^{d\times d}}
    \cdot
    \underbrace{J_z(\theta)}_{\in \mathbb{R}^{d \times md}}
\end{equation}

Each component is calculated as follows.

\begin{equation}
\label{eq:J_oneMLP_components}
\begin{split}
&\nabla_h f = \vartheta_t^\top \\
&J_h(z) = D \\
&J_z(\theta) = I_d \otimes_\text{Kr} X_t
\end{split}
\end{equation}


$D$ is a diagonal matrix $d\times d$ whose diagonal elements are \textit{gates}, i.e. the derivative of the ReLU activations $\sigma'(z)$

\begin{equation}
    g_t^{(j)} := \sigma'(z_t^{(j)})=
    \begin{cases}
    1, & z_t^{(j)}>0\\
    0, & \text{else}
    \end{cases}
\end{equation}

As a consequence, the following property of $D$ holds
\begin{equation}
    D^2 = DD^\top = D
\end{equation}

The Kronecker product $\otimes_\text{Kr}$ produces a matrix $d\times md$ whose entries in $j,(i,k)$ are $\delta_{jk}X_t^{(i)}$

Plugging \ref{eq:J_oneMLP_components} into \ref{eq:J_oneMLP_decomposition}

\begin{align}
    \nabla_\theta f &= (\vartheta_t^\top D) \otimes_\text{Kr} X_t\\
    &=\operatorname{vec}(X_t^\top(\vartheta_t^\top D)) & \text{by } \ref{eq:kronecker_identity_1}
\end{align}

The gradient in its original shape ($m\times d$) is retrieved using the $\operatorname{unvec}$ operation.

\begin{equation}
\begin{split}
    \nabla_\Theta f = \operatorname{unvec}(\nabla_\theta f) &= X_t^\top\vartheta_t^\top D
\end{split}
\end{equation}


Substitute in \ref{eq:loss_decomposition}

\begin{align}
    \nabla_\Theta\mathcal{L}_t &= e_t X_t^\top\vartheta_t^\top D\\
    &=  X_t^\top \underbrace{e_t\vartheta_t^\top D}_{:=S_t}\\
    &= X_t^\top S_t \label{eq:loss_grad_oneMLP_compact_S}
\end{align}




Inserting \ref{eq:loss_grad_oneMLP_compact_S} into \ref{eq:cf_definition}, the CF is obtained.

\begin{align}
    G &:= \langle \nabla_{\Theta} \mathcal{L}_t, \nabla_{\Theta} \mathcal{L}_{t'} \rangle_F\\
    &= \langle X_t, X_{t'}\rangle \cdot\langle S_t, S_{t'}\rangle_F \\
    &= \langle X_t, X_{t'}\rangle \cdot\text{Tr}( S_t S_{t'}^\top) \label{cf_MLP_der}
    %&= \text{Tr}(S_1^\top X_1 X_2^\top S_2)
\end{align}


\begin{align}
    G^2 &=  \langle X_t, X_{t'}\rangle^2 \cdot\text{Tr}( S_t S_{t'}^\top)^2\\
    &= \langle X_t, X_{t'}\rangle^2 \cdot \sum _{i=1}^d \big(S_t^{(i)}\big)^2 \big(S_{t'}^{(i)}\big)^2 && \text{up to cross terms ($i\neq j$) that vanish in expectation}
\end{align}

\begin{align}
    \mathbb{E}\Big[ \big(S_t^{(i)}\big)^2  \Big] &= \mathbb{E}\Big[ e_t^2 \big(g_t^{(i)}\big)^2 \big(\vartheta_t^{(i)}\big)^2  \Big]\\
    &= e_t^2\mathbb{E}\Big[  \big(g_t^{(i)}\big)^2\Big] \mathbb{E}\Big[ \big(\vartheta_t^{(i)}\big)^2  \Big] && \text{assuming } g_t \perp\vartheta_t\\
    &= e_t^2\mathbb{E}\Big[  g_t^{(i)}\Big] \operatorname{Var}\Big[ \vartheta_t^{(i)}\Big]\\
    &=e_t^2 \frac{1}{2}\frac{1}{d}
\end{align}

\begin{align}
\label{eq:var_MLP_der}
    \operatorname{Var}[G] &= \mathbb{E}[G^2]-\mathbb{E}[G]^2\\
    &= \mathbb{E}[G^2] \\
    &= \mathbb{E}\Big[ \langle X_t, X_{t'}\rangle^2 \cdot \sum _{i=1}^d \big(S_t^{(i)}\big)^2 \big(S_{t'}^{(i)}\big)^2\Big] \\
    &=\langle X_t, X_{t'}\rangle^2 \cdot \sum _{i=1}^d \mathbb{E}\Big[\big(S_t^{(i)}\big)^2\Big] \mathbb{E}\Big[\big(S_{t'}^{(i)}\big)^2\Big] && \text{assuming } S_t\perp S_{t'}\\
    &= \langle X_t, X_{t'}\rangle^2 \cdot \sum _{i=1}^d e_t^2\frac{1}{2d}e_{t'}^2\frac{1}{2d}\\
    &= \langle X_t, X_{t'}\rangle^2 \underbrace{e_t^2 e_{t'}^2}_{K^2}\frac{1}{4d^2} \sum _{i=1}^d 1 \\
    &= \langle X_t, X_{t'}\rangle^2 K^2 \frac{1}{4d}
\end{align}

\begin{align}
    \mathcal{B}:=\operatorname{std}[G]&=\sqrt{\operatorname{Var}[G]}\\
    &=\langle X_t, X_{t'}\rangle \frac{K}{2\sqrt{d}}
\end{align}


\begin{align}
    \boxed{
    \text{CF}\leq \mathcal{B} = \langle X_t, X_{t'}\rangle \frac{K}{2\sqrt{d}} \ \text{ w.h.p}
    }
\end{align}




\subsection{Two-Layer MLP}


\begin{equation}
\begin{split}
    f(X)&=h_{2}\vartheta_t\\
    &=\sigma(z_{2})\vartheta_t\\
    &= \sigma(h_{1}\Theta_2)\vartheta_t\\
    &= \sigma(\sigma(z_{1})\Theta_2)\vartheta_t\\
    &= \sigma(\sigma(X\Theta_1)\Theta_2)\vartheta_t
\end{split}
\end{equation}

Now there are two parameter matrices. The chain rule decomposes the Jacobian of the parameter matrices $\Theta_1$ and $\Theta_2$ into their contributions. Let us start from the outer one. As in the one-layer case, we define the vectorized parameter $\theta_2=\operatorname{vec}(\Theta_2)\in \mathbb{R}^{1\times (d_1d_2)}$


\begin{equation}
    \label{eq:J_Theta2_twoMLP_decomposition}
    \nabla_{\theta_2} f=\underbrace{\nabla_{h_{2}}f}_{1\times d_2}
    \underbrace{J_{h_2}(z_{2})}_{d_2\times d_2}
    \underbrace{J_{z_2}(\Theta_2)}_{d_2\times (d_1d_2)}
\end{equation}


With the following components.

\begin{align}
\label{eq:J_twoMLP_components_theta2}
    & \nabla_{h_{2}}f=\vartheta^\top \\
    & J_{h_2}(z_{2}) = D_2 \\
    & J_{z_2}(\Theta_2) = I_{d_2} \otimes_\text{Kr} h_{1}
\end{align}

Plugging in \ref{eq:J_Theta2_twoMLP_decomposition}

\begin{equation}
    \begin{split}
        \nabla_{\theta_2} f&=\vartheta^\top D_2(I_{d_2} \otimes_\text{Kr} h_{1})\\
        &=(\vartheta^\top D_2) \otimes_\text{Kr} h_{1}\\
        &=\operatorname{vec}(h_1^\top (\vartheta^\top D_2))
    \end{split}
\end{equation}

The gradient in its original shape is obtained by applying $\operatorname{unvec}$

\begin{equation}
        \nabla_{\Theta_2}f=\operatorname{unvec}(\nabla_{\theta_2} f)=h_1^\top (\vartheta^\top D_2)
\end{equation}




Substituting into \ref{eq:loss_decomposition} 

\begin{equation}
\label{eq:grad_theta2_twoMLP}
\begin{split}
    \nabla_{\Theta_2}\mathcal{L} &= e \cdot h_1^\top (\vartheta^\top D_2)\\
    &= h_1^\top \underbrace{e(\vartheta^\top D_2)}_{:=S_2}\\
    &= h_1^\top S_2
\end{split}
\end{equation}

$S_2\in\mathbb{R}^{1\times d_2}$

The more nested gradient for the vectorized $\theta_1 = \operatorname{vec}(\Theta_1)\in \mathbb{R}^{1\times (md_1)}$ can be decomposed by going deeper with the chain rule. 

\begin{equation}
    \label{eq:J_Theta1_twoMLP_decomposition}
    \nabla_{\theta_1}f=\underbrace{\nabla_{h_{2}}f}_{1\times d_2}
    \underbrace{J_{h_2}(z_{2})}_{d_2\times d_2}
    \underbrace{J_{z_2}(h_{1})}_{d_2\times d_1}
    \underbrace{J_{h_1}(z_{1})}_{d_1\times d_1}
    \underbrace{J_{z_1}(\Theta_1)}_{d_1\times md_1}
\end{equation}

Some components have already been calculated from $J_f(\Theta_2)$; the remaining ones are the following.

\begin{align}
\label{eq:J_twoMLP_components}
    & J_{z_2}(h_{1}) = \Theta_2^\top\\
    & J_{h_1}(z_{1})=D_1\\
    & J_{z_1}(\Theta_1) = I_{d_1} \otimes_\text{Kr} X
\end{align}

Plugging them into \ref{eq:J_Theta1_twoMLP_decomposition}

\begin{align}
    \nabla_{\theta_1}f&=\vartheta^\top D_2\Theta_2^\top D_1(I_{d_1} \otimes_\text{Kr} X)\\
    &=(\vartheta^\top D_2\Theta_2^\top D_1) \otimes_\text{Kr} X\\
    &=\operatorname{vec}(X^\top (\vartheta^\top D_2\Theta_2^\top D_1))
\end{align}

Reshaping back into the original shape of $\Theta_1$ ($m\times d_1$) by $\operatorname{unvec}$

\begin{equation}
    \nabla_{\Theta_1}f = \operatorname{unvec}(\nabla_{\theta_1}f)=X^\top (\vartheta^\top D_2\Theta_2^\top D_1)
\end{equation}

Plugging it into \ref{eq:loss_decomposition}.
\begin{equation}
\label{eq:grad_theta1_twoMLP}
\begin{split}
    \nabla_{\Theta_1}\mathcal{L} &= e \cdot X^\top (\vartheta^\top D_2\Theta_2^\top D_1)\\
    &= X^\top \underbrace{e (\vartheta^\top D_2\Theta_2^\top D_1)}_{:=S_1}\\
    &= X^\top S_1
\end{split}
\end{equation}

$S_1 \in \mathbb{R}^{1\times d_1}$





\subsubsection{$L$-Layer MLP}
Let us extend Equations~\ref{eq:grad_theta2_twoMLP} and~\ref{eq:grad_theta1_twoMLP} to the general case of $L$ layers. The following expressions can be written from Equation~\ref{eq:grad_theta1_twoMLP}, recognizing that $X^\top$ represents $h$ of the previous layer.

Consider the last layer $l=L$

\begin{equation}
    \nabla_{\Theta_{L}}\mathcal{L} = \underbrace{h_{L-1}^\top}_{\text{previous layers}} e\vartheta^\top D_{L}
\end{equation}


Consider the second-to-last layer $l=L-1$.

\begin{equation}
    \nabla_{\Theta_{L-1}}\mathcal{L} = \underbrace{h_{L-2}^\top}_{\text{previous layers}} e\vartheta^\top \underbrace{D_L\Theta_L^\top}_{\text{next layers}}D_{L-1}
\end{equation}

Identifying the terms coming from the previous and subsequent layers yields a general expression for the $l$-th layer of the following form.

\begin{equation}
\label{eq:general_gradient_intermediate}
\begin{split}
\nabla_{\Theta_{l}}\mathcal{L} &= h_{l-1}^\top e\vartheta^\top  \big(D_L\Theta_L^\top D_{L-1}\Theta_{L-1}^\top\dots D_{l+1}\Theta_{l+1}^\top \big)D_l\\
&= h_{l-1}^\top e\vartheta^\top  \big(\Theta_{l+1}D_{l+1}\dots \Theta_{L-1}D_{L-1}\Theta_L D_L\big)^\top D_l\\
&= h_{l-1}^\top e\vartheta^\top  \bigg(\prod_{k=l+1}^L \Theta_kD_k \bigg)^\top D_l\\
\end{split}
\end{equation}

The recursive component of previous layers is expanded as follows.

\begin{equation}
\label{eq:general_gradient_prev_layers}
\begin{split}
    &h_1 = X\Theta_1 D_1\\
    &h_2= h_1\Theta_2D_2 = X(\Theta_1D_1)(\Theta_2D_2)\\
    &\vdots\\
    &h_{l-1}=X(\Theta_1D_1)\dots(\Theta_{l-2}D_{l-2})(\Theta_{l-1}D_{l-1})\\
\end{split}
\end{equation}

Take the transpose

\begin{equation}
\begin{split}
    h_{l-1}^\top&= \Big((\Theta_1D_1)\dots(\Theta_{l-2}D_{l-2})(\Theta_{l-1}D_{l-1})\Big)^\top X^\top\\
    &= \Big( \prod_{j=1}^{l-1}\Theta_jD_j \Big)^\top X^\top
\end{split}
\end{equation}

Substitute into \ref{eq:general_gradient_intermediate}.

\begin{equation}
    \label{eq:general_gradient_final}
    \boxed{
    \nabla_{\Theta_{l}}\mathcal{L}=
    \begin{cases}
        
    
     \bigg( \prod_{j=1}^{l-1}\Theta_jD_j \bigg)^\top X^\top e \vartheta^\top \bigg(\prod_{k=l+1}^L \Theta_k D_k \bigg)^\top D_l &\text{if }\ l<L \text{ and } L>1\\
    \bigg(\prod_{j=1}^{l-1}\Theta_jD_j \bigg)^\top X^\top e\vartheta^\top D_{L} &\text{if }\ l=L \ \text{or}\ L=1 
    \end{cases}
    }
\end{equation}


Let us call the products appearing in \ref{eq:general_gradient_final} as follows.

\begin{align}
    &P_{l} :=\bigg(\prod_{j=1}^{l-1}\Theta_jD_j\bigg)^\top\\
    &Q_{l} := \bigg(\prod_{k=l+1}^L \Theta_k D_k\bigg)^\top
\end{align}

Equation \ref{eq:general_gradient_final} can be written in more compact terms. 

\begin{align}
    \label{eq:general_gradient_final_compact}
    \nabla_{\Theta_{l}}\mathcal{L}_t &= \underbrace{P_l X^\top}_{:=H_l^\top} \underbrace{e\vartheta^\top Q_l D_l}_{:=S_l}\\
    &= H_l^\top S_l
\end{align}

$S_l\in \mathbb{R}^{1\times d_l}$


Now, it is possible to derive an expression for CF in the case of an arbitrarily deep MLP. Let us define the vectorized gradient for the $l^{\text{th}}$ layer and task $t$ as $g_{t,l}:=\operatorname{vec}(\nabla_{\Theta_l}\mathcal{L}_t)$.

\begin{equation}
    g_t = \text{concat}(g_{t,1}, g_{t,2}, \dots, g_{t,L})
\end{equation}

\begin{equation}
\begin{split}
    G&=\langle g_t,g_{t'}\rangle\\
    &=\sum_{l=1}^L\underbrace{\langle g_{t,l},g_{t',l}\rangle}_{:=G_l}=\sum_{l=1}^L G_l
\end{split}
\end{equation}


\begin{equation}
    \begin{split}
        \operatorname{Var}[G]&=\mathbb{E}[G^2]\\
        &=\sum_{l=1}^L\mathbb{E}[ G_l^2]\\
        &= \sum_{l=1}^L\operatorname{Var}[ G_l]
    \end{split}
\end{equation}

\begin{align}
    G_l &=  \langle \nabla_{\Theta_l} \mathcal{L}_t,\nabla_{\Theta_l} \mathcal{L}_{t'}\rangle_F\\
    &= \langle H_{l,t},H_{l,t'}\rangle \operatorname{Tr}(S_{l,t}S_{l,t'}^\top)
\end{align}

\begin{align}
     G_l^2 &= \langle H_{l,t},H_{l,t'}\rangle^2 \operatorname{Tr}(S_{l,t}S_{l,t'}^\top)^2\\
     &=\langle H_{l,t},H_{l,t'}\rangle^2\sum _{i=1}^{d_l} \big(S_{l,t}^{(i)}\big)^2 \big(S_{l,t'}^{(i)}\big)^2
\end{align}

\begin{equation}
    S_l = e\vartheta^\top \underbrace{Q_lD_l}_A
\end{equation}

Let's call $A_{|j}$ the $j^{th}$ column of $A$

\begin{align}
    \mathbb{E}\Big[\big(S_l^{(i)}\big)^2\Big] &=e^2\mathbb{E}\Big[ \langle \vartheta^2,A_{|i}^2\rangle\Big]\\
    &=e^2\langle \mathbb{E}\big[\vartheta^2\big],\mathbb{E}\big[A_{|i}^2\big]\rangle\\
    &=e^2 \operatorname{Var}\big[\vartheta\big]\mathbb{E}\big[A_{|i}^2\big]\\
    &=e^2 \frac{1}{d}\mathbb{E}\big[A_{|i}^2\big]
\end{align}



\begin{equation}
    A_{|i} = Q_{|i}g_i  
\end{equation}

Where $Q_{|i}$ is the $i^{th}$ column of $Q_l$ and $g_i$ is the scalar valued gate function on the diagonal of $D_l$

\begin{align}
    A_{|i}^2&=Q_{|i}^2 g_i \\
    &=\Big(\prod_{k=l+1}^L g_k \operatorname{diag}(\Theta_k^\top)  \Big)^2 g_i\\
    &=\Big(\prod_{k=l+1}^L g_k \operatorname{diag}(\Theta_k^\top)^2  \Big) g_i
\end{align}

\begin{align}
    \mathbb{E}\big[A_{|i}^2\big]&=\Big(\prod_{k=l+1}^L \mathbb{E}\big[g_k\big] \mathbb{E}\big[\operatorname{diag}(\Theta_k^\top)^2\big]  \Big) \mathbb{E}\big[g_i\big]\\
    &=\Big(\prod_{k=l+1}^L \mathbb{E}\big[g_k\big] \operatorname{Var}\big[\operatorname{diag}(\Theta_k^\top)\big]  \Big) \mathbb{E}\big[g_i\big]\\
    &=\Big(\prod_{k=l+1}^L \frac{1}{2}\frac{1}{d}\operatorname{diag}(I_d) \Big) \frac{1}{2}\\
    &=\frac{1}{2}\Big(\frac{1}{2d} \Big)^{L-l-1}
\end{align}



\begin{align}
    \mathbb{E}\Big[\big(S_l^{(i)}\big)^2\Big] &=e^2 \frac{1}{d}\mathbb{E}\big[A_{|i}^2\big]\\
    &=e^2 \frac{1}{d}\frac{1}{2}\Big(\frac{1}{2d} \Big)^{L-l-1}\\
    &= e^2 \Big(\frac{1}{2d}\Big)^{L-l}
\end{align}


\begin{align}
    \langle H_{l,t},H_{l,t'}\rangle^2 &= \langle P_{l,t}X_t,P_{l,t'}X_{t'}\rangle^2 \\
    &= (X_t^\top P_{l,t}^\top P_{l,t'}X_{t'})^2
\end{align}

Under the following isotropy assumption $\mathbb{E}[(P_{l,t}^\top P_{l,t'})]=\lambda I$

\begin{align}
    \mathbb{E}\big[\langle H_{l,t},H_{l,t'}\rangle^2\big] &= \mathbb{E}\big[(X_t^\top P_{l,t}^\top P_{l,t'}X_{t'})^2\big]\\
    &= \mathbb{E}\big[(X_t^\top P_{l,t}^\top P_{l,t'}X_{t'})(X_t^\top P_{l,t}^\top P_{l,t'}X_{t'})\big]\\
    &= (X_t^\top X_{t'}) I\lambda (X_t^\top X_{t'})I\lambda\\
    &=\lambda^2(X_t^\top X_{t'})^2\\
    &=\lambda^2\langle X_t, X_{t'}\rangle^2
\end{align}

The last piece is to calculate $\lambda$

\begin{align}
    \lambda I &= \mathbb{E}[(P_{l,t}^\top P_{l,t'})]\\
    &=\mathbb{E}\bigg[\bigg(\prod_{j=1}^{l-1} \Theta_{j,t}D_{j,t}  \bigg)\bigg( \prod_{j=1}^{l-1} \Theta_{j,t'}D_{j,t'}\bigg)^\top\bigg]\\
    &=\mathbb{E}\Big[ (\Theta_{1,t}D_{1,t}\Theta_{2,t}D_{2,t}\dots \Theta_{l-1,t}D_{l-1,t})(D_{l-1,t'}\Theta_{l-1,t'}^\top\dots D_{2,t'}\Theta_{2,t'}^\top D_{1,t'}\Theta_{1,t'}^\top)\Big]
\end{align}

The expectation is calculated from the innermost to the outermost product. The following expectations of products appear in an alternating fashion.

\begin{align}
    \mathbb{E}[(D_{k,t}D_{k,t'})] &= \mathbb{E}[D_{k,t}]\mathbb{E}[D_{k,t'}] && D_{k,t}\perp D_{k,t'}\\
    &=\frac{1}{2}\frac{1}{2}=\frac{1}{4}
\end{align}

\begin{align}
    \mathbb{E}[\Theta_{k,t}\Theta_{k,t'}^\top]&\approx \mathbb{E}[\Theta_{k,0}\Theta_{k,0}^\top]\\
    &=\mathbb{E}[\Theta_{k,0}^2]\\
    &=\operatorname{Var}[\Theta_{k,0}] = \frac{1}{d}
\end{align}

Since there are $l-1$ alternating products, we obtain the following expression for $\lambda$ at the end.

\begin{align}
    \lambda = \bigg(\frac{1}{4d}\bigg)^{l-1} 
\end{align}
    

\begin{align}
    \operatorname{Var}[G_l] &= \mathbb{E} [G_l^2] \\
    &= \mathbb{E}\Big[\langle H_{l,t},H_{l,t'}\rangle^2\sum _{i=1}^{d_l} \big(S_{l,t}^{(i)}\big)^2 \big(S_{l,t'}^{(i)}\big)^2\Big]\\
    &=\mathbb{E}\big[\langle H_{l,t},H_{l,t'}\rangle^2\big] \mathbb{E}\bigg[\sum _{i=1}^{d_l} \big(S_{l,t}^{(i)}\big)^2 \big(S_{l,t'}^{(i)}\big)^2\bigg] &&\text{assume} \perp\\
    &=\mathbb{E}\big[\langle H_{l,t},H_{l,t'}\rangle^2\big] \sum _{i=1}^{d_l} \mathbb{E}\Big[\big(S_{l,t}^{(i)}\big)^2\Big] \mathbb{E}\Big[\big(S_{l,t'}^{(i)}\big)^2\Big] &&\text{assume} \perp\\
    &= \langle X_t, X_{t'}\rangle^2 \bigg(\frac{1}{4d}\bigg)^{2(l-1)} \underbrace{e_t^2 e_{t'}^2}_{K^2} \Big(\frac{1}{2d}\Big)^{2(L-l)}d\\
    &= \langle X_t, X_{t'}\rangle^2 K^2 \bigg( \frac{1}{2^{L+l-2}} \frac{1}{d^{L-1}} \bigg)^2 d
\end{align}


\begin{equation}
    \mathcal{B}_l =\operatorname{std}[G_l] = \langle X_t, X_{t'}\rangle K \frac{1}{2^{L+l-2}} \frac{\sqrt{d}}{d^{L-1}}
\end{equation}

\begin{align}
    \mathcal{B} &= \mathcal{B}_L + \sum_{l=1}^{L-1} \mathcal{B}_l\\
    &=\langle X_t, X_{t'}\rangle K\Bigg(\frac{1}{\sqrt{2d}}+ \frac{\sqrt{d}}{d^{L-1}}\sum_{l=1}^{L-1} \frac{1}{2^{L+l-2}}\Bigg)
\end{align}