\section{Lower Bound Analysis}

In this section, we construct a theoretical lower bound for the convergence rate of the distributed learning protocol. We demonstrate the existence of a specific data distribution and network configuration where the excess loss decays at a rate of $\Omega(k/D)$, where $D$ is the depth of the network and $k$ is the dimension of the feature space. This result confirms that the sequential nature of the protocol makes network depth a fundamental bottleneck for information aggregation.

\subsection{Problem Construction}
\label{section:lb-construction}

We define a hard instance that exploits the information bottleneck inherent in sequential logit aggregation.

\paragraph{Data Distribution.}
Let $k \ge 2$ be the dimension of the feature space. Consider a sequence of independent latent variables $Z_1, Z_2, \dots, Z_k \sim \mathcal{N}(0,1)$. We define the observable features $x_1, \dots, x_k \in \mathbb{R}$ as follows:
\begin{align}
    x_1 &= Z_1 \\
    x_i &= Z_i - Z_{i-1}, \quad \text{for } 2 \le i \le k
\end{align}
By construction, the latent variable $Z_i$ can be recovered by the prefix sum of features: $\sum_{j=1}^i x_j = Z_i$. We define the binary target label $y \in \{0,1\}$ based on the final latent variable $Z_k$ via the logistic model:
\begin{equation}
    P(y=1|x) = \sigma(Z_k) = \sigma\left(\sum_{j=1}^k x_j\right)
\end{equation}
Thus, the optimal global logit predictor is $z^*(x) = Z_k$, which requires access to all $k$ features to cancel the intermediate noise terms.

\paragraph{Network and Assignment.}
Consider a path of agents $A_1, \dots, A_D$. The agents observe features one at a time in a repeating cyclic order: agent $A_i$ observes the single feature $x_{\ell}$, where $\ell = ((i-1) \bmod k) + 1$. The network structure is a simple path in which $\mathrm{Pa}(A_i) = \{A_{i-1}\}$ for $i \ge 2$.

We define \emph{pass} $p$ as the $p$-th disjoint block of $k$ consecutive agents. Specifically, the $p$-th pass consists of the agents $A_{(p-1)k + 1}, \dots, A_{pk}$.

\subsection{Information Capacity and Variance}

We analyze the capacity of the final agent in pass $p$ to reconstruct the target $Z_k$ by identifying which features can be effectively decoded from the scalar stream.

The following lemma characterizes the restricted information available to agents. This result follows the methods of Lemma 5.5 in \cite{kearns2026networked}, which establishes an analogous result. We provide the proof for completeness.

\begin{lemma}[Recursive Information Relevance]
    \label{lemma:relevance}
    For any pass $p$ with $p \le k$, define the feature subset $\mathcal{I}_p = \{x_k, x_{k-1}, \dots, x_{k-p+1}\}$. The optimal logistic predictor for $\sigma(Z_k)$ at the end of pass $p$ depends solely on the features in $\mathcal{I}_p$.
\end{lemma}

\begin{proof}
    We proceed by induction on the pass index $p$.
    
    \textbf{Base Case ($p=1$):} The first pass ends at agent $A_k$, who observes the local feature $x_k = Z_k - Z_{k-1}$. Preceding agents $A_1, \dots, A_{k-1}$ observe features $x_1, \dots, x_{k-1}$, all of which are independent of the target $Z_k$ and thus the label $y$. Because each agent $A_i$ (for $i < k$) minimizes its local BCE loss using only information independent of the label, each sequentially transmits a logit of 0 to its successor. Consequently, $A_k$ receives a logit of 0 from $A_{k-1}$ and must rely exclusively on its local observation $x_k$ to predict $y$. This establishes the effective information set $\mathcal{I}_1 = \{x_k\}$.
    
   \textbf{Inductive Step:} Assume at the end of pass $p$, the optimal predictor $z^{(p)}$ is a function only of the features in $\mathcal{I}_p = \{x_k, x_{k-1}, \dots, x_{k-p+1}\}$. In pass $p+1$, the initial sequence of agents observe features $x_1, \dots, x_{k-p-1}$. Because these features are independent of the current information set $\mathcal{I}_p$, they are also independent of the incoming logit $z^{(p)}$ and the label $y$. Consequently, these agents cannot improve the prediction; they sequentially forward the logit $z^{(p)}$ to one another without modification. This process continues until an agent observes $x_{k-p} = Z_{k-p} - Z_{k-p-1}$. This new feature is correlated with the latent variable $Z_{k-p}$ currently acting as noise in $z^{(p)}$, allowing the agent to partially cancel that noise and improve the estimate of $Z_k$. Subsequent agents in the pass observe features independent of this updated residual. Thus, the relevant information set expands by exactly one feature: $\mathcal{I}_{p+1} = \mathcal{I}_p \cup \{x_{k-p}\}$.
\end{proof}

Given this restriction, any logit $z^{(p)}$ generated at the end of pass $p$ is a linear function of the features in $\mathcal{I}_p$. We analyze this linear predictor in the following lemma.

\begin{lemma} \label{lemma:lb-predict-s-p}
    Let $z^{(p)}$ be a linear predictor based on $\mathcal{I}_p$. Then $z^{(p)}$ is given by
    \begin{align*}
        z^{(p)} = c\left(Z_k + \frac{1}{\sqrt{p}}\xi\right),
    \end{align*}
    where $c \in \mathbb{R}$ is a constant and $\xi \sim \mathcal{N}(0, V_p)$ is independent of $Z_k$. For a fixed $c$, minimizing the variance $V_p$ yields $V_p = 1$. 
\end{lemma}

\begin{proof}
Define $z^{(p)} = \sum_{j=0}^{p-1} c_jx_{k-j}$ with coefficients $c_0, \dots, c_{p-1}$. We rewrite this expression in terms of the variables $Z_k, \dots, Z_{k-p}$:
\begin{align*}
    z^{(p)} = c_0Z_k + \sum_{j=1}^{p-1} (c_j - c_{j-1}) Z_{k-j} - c_{p-1} Z_{k-p}.
\end{align*}
Let $c = c_0$ and $\alpha_j = c_j - c_{j-1}$. Substituting these terms yields:
\begin{align*}
    z^{(p)} = cZ_k + \sum_{j=1}^{p-1} \alpha_j Z_{k-j} - \left(\sum_{j=1}^{p-1} \alpha_j + c \right) Z_{k-p}.
\end{align*}
Define the residual $\eta = z^{(p)} - cZ_k$. Since $Z_i$ are i.i.d. $\mathcal{N}(0,1)$, the variance of $\eta$ is $\text{Var}(\eta) = \sum_{j=1}^{p-1} \alpha_j^2 + \left(\sum_{j=1}^{p-1} \alpha_j + c\right)^2$.
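Assuming $c \neq 0$, define $\xi = \frac{\sqrt{p}}{c} \eta$. Since $\eta$ is a linear combination of $Z_{k-1}, \dots, Z_{k-p}$, it is a centered Gaussian independent of $Z_k$. It follows that $\xi \sim \mathcal{N}(0, V_p)$, where $V_p = \frac{p}{c^2} \text{Var}(\eta)$. This establishes the form of the predictor.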

Next, we fix $c$ and minimize $V_p$, which is equivalent to minimizing $\text{Var}(\eta)$. Let $S = \sum_{j=1}^{p-1} \alpha_j$. The variance can be written as $\text{Var}(\eta) = \sum_{j=1}^{p-1} \alpha_j^2 + (S+c)^2$.
For a fixed sum $S$, the term $\sum \alpha_j^2$ is minimized when all $\alpha_j$ are equal, yielding $\text{Var}(\eta) = \frac{S^2}{p-1} + (S+c)^2$.
Differentiating with respect to $S$ and setting the result to zero, we find the minimum occurs at $S = -c(p-1)/p$. Substituting this value back, the minimum variance is $c^2/p$. Consequently, $V_p = \text{Var}(\eta) \frac{p}{c^2} = 1$.
\end{proof}

Next, we analyze the properties of the noise term $\xi$ in the linear predictor defined in Lemma~\ref{lemma:lb-predict-s-p} and examine its impact on the BCE loss.

\begin{lemma} \label{lem:minimize-noise-var}
    Let $z_v = cZ_k + \xi_v$, where $\xi_v \sim \mathcal{N}(0, v)$ is independent of $Z_k$. For a fixed $c$, if $u > v$, then:
    \begin{align*}
        L(z_v) < L(z_{u}),
    \end{align*}
    where $L(z)$ denotes the BCE loss.
\end{lemma}
\begin{proof}
We first analyze the loss conditioned on $Z_k$. Assuming $y|Z_k \sim \text{Bernoulli}(\sigma(Z_k))$, we expand the conditional loss as:
\begin{align}\label{eq:l-cond-zk}
    L(z | Z_k) &= \mathbb{E}_{y | Z_k} \left[ -yz + \log(1 + e^z) \right] \notag\\
    &= -\sigma(Z_k)z + \log(1+e^z).
\end{align}
Define $g(z) = L(z|Z_k)$. Differentiating with respect to $z$ yields $g'(z) = -\sigma(Z_k) + \sigma(z)$. The second derivative is $g''(z) = \sigma(z)(1-\sigma(z))$. Since $\sigma(z) \in (0,1)$, $g''(z) > 0$, implying $g$ is strictly convex.

The total loss for $z_v$ is $L(z_v) = \mathbb{E}_{Z_k}\left[ \mathbb{E}_{\xi_v}\left[ g(cZ_k + \xi_v) \right]\right]$. To compare $z_u$ and $z_v$, let $\delta \sim \mathcal{N}(0, u-v)$ be independent of both $Z_k$ and $\xi_v$. We can model the higher-variance noise as $\xi_u = \xi_v + \delta$, which is distributed as $\mathcal{N}(0, u)$ and independent of $Z_k$. Applying Jensen's inequality to the strictly convex function $g$ (the inequality is strict because $\delta$ is non-degenerate):
\begin{align*}
    \mathbb{E}_\delta[g(cZ_k + \xi_v + \delta)] &> g(\mathbb{E}_\delta[cZ_k + \xi_v + \delta]) \\
    &= g(cZ_k+\xi_v).
\end{align*}
Taking the expectation over $Z_k$ and $\xi_v$ on both sides yields $L(z_u) > L(z_v)$.
\end{proof}

We next demonstrate that for the optimal predictor in the form of Lemma~\ref{lemma:lb-predict-s-p}, the scaling factor $c$ is strictly within the interval $(0,1)$.

\begin{lemma} \label{lem:lb-c-range}
    Let $z_c = c(Z_k + \xi)$, where $\xi \sim \mathcal{N}(0, v)$ with $v > 0$, and $\xi$ is independent of $Z_k$. The optimal scaling factor $c$ minimizing $L(z_c)$ satisfies $c \in (0,1)$.
\end{lemma}
\begin{proof}
Define $S = Z_k + \xi$. Note that $S \sim \mathcal{N}(0, 1+v)$. We expand the loss $L(z_c)$:
\begin{align*}
    L(z_c) = \mathbb{E}\left[ -\sigma(Z_k) \cdot cS + \log(1 + e^{cS}) \right].
\end{align*}
Let $g(c) = L(z_c)$. Differentiating with respect to $c$:
\begin{align*}
    g'(c) &= \mathbb{E}[-S\sigma(Z_k) + S\sigma(cS)] \\
    &= \mathbb{E}[-Z_k\sigma(Z_k)] + \mathbb{E}[S\sigma(cS)].
\end{align*}
The second derivative is $g''(c) = \mathbb{E}[S^2\sigma'(cS)]$. Since the sigmoid derivative is strictly positive, $g''(c) > 0$, implying that $g$ is strictly convex.

Evaluating the gradient at $c=0$:
\begin{align*}
    g'(0) &= -\mathbb{E}[(Z_k + \xi)\sigma(Z_k)] + \mathbb{E}\left[S \cdot \frac{1}{2}\right] \\
    &= -\mathbb{E}[Z_k\sigma(Z_k)].
\end{align*}
We observe that $\mathbb{E}[Z_k\sigma(Z_k)] = \text{Cov}(Z_k,\sigma(Z_k)) > 0$. Therefore, $g'(0) < 0$, which implies the minimizer of $g$ must lie to the right of 0.

Next, consider the gradient at $c=1$:
\begin{align*}
    g'(1) &= -\mathbb{E}[Z_k\sigma(Z_k)] + \mathbb{E}[S\sigma(S)].
\end{align*}
Define the function $h(u) = \mathbb{E}_X[X \sigma(X)]$ where $X \sim \mathcal{N}(0,u^2)$. We observe that $g'(1) = h(\sqrt{1+v}) - h(1)$. If $h(u)$ is increasing for $u > 0$, then $g'(1) > 0$, meaning the minimizer must be less than 1.

Using the reparameterization $X = uX'$ where $X' \sim \mathcal{N}(0,1)$, we write $h(u) = u \cdot \mathbb{E}_{X'} [X'\sigma(uX')]$. The derivative is:
\begin{align*}
    h'(u) = \mathbb{E}_{X'}[X' \sigma(uX')] + \mathbb{E}_{X'}[u (X')^2\sigma'(uX')].
\end{align*}
For $u > 0$, the first term is positive as it equals $\frac{1}{u}\text{Cov}(uX', \sigma(uX'))$. The second term is non-negative since the term inside the expectation is non-negative. Thus, $h'(u) > 0$ for $u > 0$, so $h$ is strictly increasing and, because $v > 0$, $g'(1) = h(\sqrt{1+v}) - h(1) > 0$. Since $g$ is strictly convex with $g'(0) < 0 < g'(1)$, its minimizer lies in the interval $(0,1)$.
\end{proof}

\subsection{Connection to Excess Loss}

Finally, we connect the variance of the logit estimator to the BCE loss to establish a lower bound on the excess loss.

\begin{theorem}[Lower Bound on Convergence]
    Let $k$ denote the dimension of the feature space. Consider the feature distribution and network construction defined in \Cref{section:lb-construction}. Fix a pass $p \le k-1$ and let $D = kp$, so that $A_D$ is the final agent of pass $p$. Let $p^* = \sigma(Z_k)$ denote the optimal global logistic predictor and $p_D$ the predictor of agent $A_D$. The excess loss is lower bounded by:
    \begin{equation*}
        L(p_D) - L(p^*) = \Omega\left(\frac{1}{p}\right) = \Omega\left(\frac{k}{D}\right).
    \end{equation*}
\end{theorem}

\begin{proof}
We begin by relating the excess loss to the expected squared difference in the probability space. Invoking Lemma~\ref{lem:pythagorean} and Lemma~\ref{lem:kl-mse}, the loss difference satisfies:
\begin{equation*}
    L(p_D) - L(p^*) = \mathbb{E}[D(p^* || p_D)] \ge 2 \mathbb{E}\left[ (p^* - p_D)^2 \right].
\end{equation*}

The agent $A_D$ operates at the end of pass $p$. Let $z_D$ be the logit of this agent such that $p_D = \sigma(z_D)$. By Lemma~\ref{lemma:relevance}, $A_D$ relies only on the information in $\mathcal{I}_p$. Consequently, by Lemma~\ref{lemma:lb-predict-s-p}, $z_D$ takes the form:
\begin{align} \label{eq:lb-thm-z-D}
    z_D = c\left(Z_k + \frac{1}{\sqrt{p}}\xi\right),
\end{align}
for some constant $c$, where $\xi \sim \mathcal{N}(0,V_p)$ is independent of $Z_k$. Since $z_D$ minimizes the BCE loss and, by Lemma~\ref{lem:minimize-noise-var}, the loss for a fixed $c$ is strictly increasing in the noise variance, $V_p$ must be minimized. By Lemma~\ref{lemma:lb-predict-s-p}, this minimum is $V_p = 1$. Furthermore, Lemma~\ref{lem:lb-c-range} implies $c \in (0,1)$.

To establish a lower bound, we relate the squared error in probabilities $(p^* - p_D)^2$ to the squared error in logits $(Z_k - z_D)^2$ using the Mean Value Theorem.
However, the sigmoid derivative vanishes for large inputs, which could dampen the probability difference even if the logit error is large. To address this, we restrict our analysis to a bounded region where the sigmoid function has non-vanishing derivative. Define the event $\mathcal{B}_R$ where both $Z_k$ and $\xi$ are bounded by $R$:
\begin{equation*}
    \mathcal{B}_R := \{ |Z_k| < R, |\xi| < R \}.
\end{equation*}
By selecting a sufficiently large $R$, $\mathcal{B}_R$ captures a constant fraction of the probability mass. On this set, the arguments to the sigmoid function are bounded. By the Mean Value Theorem, there exists $\eta$ between $Z_k$ and $z_D$ such that
\[
    p^* - p_D = \sigma(Z_k) - \sigma(z_D) = \sigma'(\eta)(Z_k - z_D).
\]
On $\mathcal{B}_R$, since $c \in (0,1)$ and $p \ge 1$, we have $|z_D| \le |Z_k| + |\xi| < 2R$ and hence $|\eta| < 2R$. Therefore $\sigma'(\eta) \ge C_{\text{univ}} > 0$ for some constant $C_{\text{univ}}$ dependent only on $R$.

Restricting the expectation to $\mathcal{B}_R$ and defining $C_1 = 2C_{\text{univ}}^2$, we have:
\begin{equation*}
    2 \mathbb{E}\left[ (p^* - p_D)^2 \mathbf{1}_{\mathcal{B}_R} \right] \ge C_1 \mathbb{E} \left[ (Z_k - z_D)^2 \mathbf{1}_{\mathcal{B}_R} \right].
\end{equation*}

Substituting the estimator form from \Cref{eq:lb-thm-z-D}, we expand the quadratic term:
\begin{align*}
\mathbb{E}&\left[\left((1-c)Z_k - \frac{c}{\sqrt{p}}\xi\right)^2 \mathbf{1}_{\mathcal{B}_R}\right] \\
&= (1-c)^2\mathbb{E}[Z_k^2 \mathbf{1}_{\mathcal{B}_R}] - \frac{2c(1-c)}{\sqrt{p}}\mathbb{E}[Z_k \xi \mathbf{1}_{\mathcal{B}_R}] \\
&\quad\; + \frac{c^2}{p}\mathbb{E}[\xi^2 \mathbf{1}_{\mathcal{B}_R}].
\end{align*}

 Since $Z_k$ and $\xi$ are independent and centered, and the region $\mathcal{B}_R$ is symmetric about the origin for both variables, $\mathbb{E}[Z_k \xi \mathbf{1}_{\mathcal{B}_R}] = 0$. 

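Since $V_p = 1$, both $Z_k$ and $\xi$ are standard normal, and they enter $\mathcal{B}_R$ symmetrically; we may therefore define $C_2 = \mathbb{E}[Z_k^2 \mathbf{1}_{\mathcal{B}_R}] = \mathbb{E}[\xi^2 \mathbf{1}_{\mathcal{B}_R}]$, a positive constant depending only on $R$. Combining the bounds above with $\mathbb{E}[(p^* - p_D)^2] \ge \mathbb{E}[(p^* - p_D)^2 \mathbf{1}_{\mathcal{B}_R}]$ yields: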
\begin{equation*}
    L(p_D) - L(p^*) \ge C_1 C_2 \left( (1-c)^2 + \frac{c^2}{p} \right).
\end{equation*}

The quadratic function $(1-c)^2 + \frac{c^2}{p}$ is minimized at $c = \frac{p}{p+1}$. Substituting this value:
\begin{align*}
    L(p_D) - L(p^*) &\ge \frac{C_1 C_2}{p+1} = \Omega\left(\frac{1}{p}\right).
\end{align*}

Recalling that $p = D/k$, we conclude:
\begin{equation*}
    L(p_D) - L(p^*) = \Omega\left(\frac{k}{D}\right). \qedhere
\end{equation*}
\end{proof}