\section{Upper Bounds}

This section analyzes information aggregation within a network of logistic regression agents. We demonstrate that sequentially minimizing Binary Cross Entropy (BCE) allows the network to approximate the global predictor derived from all features, assuming sufficient network depth and feature coverage.

We first establish that the residuals of the BCE loss optimizer are orthogonal to the input. A similar result was previously derived by \citet{kearns2026networked} for linear regression with MSE loss.

\begin{lemma}[Orthogonality of Residuals] \label{lem:orthogonality}
Let $p^*$ be the optimal logistic predictor on a feature space $\mathcal{X}$. The residual error $(p^*(x) - y)$ is orthogonal to the feature vector $x$ in expectation:
\begin{equation*}
    \mathbb{E} \left[ x (p^*(x) - y) \right] = 0.
\end{equation*}
\end{lemma}
\begin{proof}
Given $p^{(\theta)}(x) = \sigma(\theta^T x)$, the gradient of the logistic output is $\nabla_\theta p^{(\theta)}(x) = p^{(\theta)}(x) (1 - p^{(\theta)}(x)) x$. Applying the chain rule to $L(\theta)$ yields:
\begin{align*}
    \nabla_\theta L(\theta) &= -\mathbb{E} \left[ (y(1 - p^{(\theta)}(x)) - (1-y)p^{(\theta)}(x)) x \right] \\
    &= \mathbb{E} \left[ (p^{(\theta)}(x) - y) x \right].
\end{align*}
The optimal parameters $\theta^*$ satisfy the condition $\nabla_\theta L(\theta^*) = 0$. Thus:
\begin{equation*}
    \mathbb{E} \left[ x (p^*(x) - y) \right] = 0. \qedhere
\end{equation*}
\end{proof}

This orthogonality allows us to decompose the error of any suboptimal model. We express the error of a suboptimal model in terms of the optimal predictor using the expected Kullback-Leibler divergence of the Bernoulli distribution, defined as follows.

\begin{definition}
    Let $p$ and $q$ be two predictors on feature space $\mathcal{X}$. Then $D(p \| q)$ is defined as:
    \begin{equation*}
        D(p\|q) = \mathbb{E} \left[ D_{\text{KL}}\left(\text{Bernoulli}(p(x)) \,\|\, \text{Bernoulli}(q(x))\right) \right],
    \end{equation*}
    where $D_{\text{KL}}(p' \| q')$ is the Kullback-Leibler divergence between distributions $p'$ and $q'$, given by:
    \begin{equation*}
        D_{\text{KL}}(p' \| q') = \sum_{x} p'(x) \log \frac{p'(x)}{q'(x)}.
    \end{equation*}
    Expanding this definition, $D(p \| q)$ becomes:
    \begin{equation*}
        D(p\|q) = \mathbb{E}\left[p(x) \log \frac{p(x)}{q(x)} + (1-p(x)) \log \frac{1 - p(x)}{1 - q(x)}\right].
    \end{equation*}
\end{definition}

\begin{lemma}[Decomposing Loss] \label{lem:pythagorean}
Let $p^*$ be the optimal logistic predictor on a feature set $S$, and let $q$ be any logistic predictor on $S$. The loss decomposes as:
\[
L(q) = L(p^*) + D(p^* \| q).
\]
\end{lemma}

\begin{proof}
Using the identity $\log \sigma(z) = z - \log(1 + e^z)$, we write the loss with $z = \theta^T x$ as:
\begin{align*}
L(\theta) &= - \mathbb{E} \left[ yz - \log(1 + e^z) \right] \\
&= \mathbb{E} \left[ \log(1+e^{\theta^T x}) - y(\theta^T x) \right].
\end{align*}
Let $\theta^*$ be the corresponding parameters of $p^*$. Expanding the difference $L(\theta) - L(\theta^*)$:
\begin{align*}
L(\theta) - L(\theta^*) = \mathbb{E} &\Bigl[ \log(1+e^{\theta^T x}) \\
&\quad - \log(1 + e^{(\theta^*)^T x}) \\
&\quad - y((\theta - \theta^*)^T x) \Bigr].
\end{align*}
Adding and subtracting $p^*(x)(\theta - \theta^*)^T x$ inside the expectation yields:
\begin{align}
\label{eq:q-from-ps-loss}
L(\theta) - L(\theta^*)
= &\mathbb{E}\Bigl[
\log(1+e^{\theta^T x})
- \log\bigl(1 + e^{(\theta^*)^T x}\bigr)
\notag\\
&\quad\; - p^*(x)\,(\theta - \theta^*)^T x \Bigr]
\notag\\
& + \mathbb{E}\Bigl[(p^*(x) - y)\,(\theta - \theta^*)^T x\Bigr].
\end{align}

The second term is zero due to the orthogonality condition derived in Lemma~\ref{lem:orthogonality}. For the first term, we expand the definition of $D(p^* \| q)$ with $z = \theta^T x$ and $z^* = (\theta^*)^T x$:
\begin{align*}
D(p^* \| q)
&= \mathbb{E} \Bigl[ p^*(x)\,(z^* - z)
- \log\left(1 + e^{z^*}\right) \\
&\quad\quad\quad + \log\left(1 + e^{z}\right) \Bigr]
\\
&= \mathbb{E} \Bigl[ \log\left(1 + e^{\theta^T x}\right)
- \log\left(1 + e^{(\theta^*)^T x}\right)
\\
&\quad\quad\quad
- p^*(x)\,(\theta - \theta^*)^T x \Bigr].
\end{align*}

This matches the first term in \Cref{eq:q-from-ps-loss}, completing the proof.
\end{proof}

To bound the parameter error using the KL divergence, we use the following inequality. This is a special case of Pinsker's inequality~\citep{pinsker1964information}, included here for completeness.

\begin{restatable}{lemma}{lemKLMSE} \label{lem:kl-mse}
    For the expected KL divergence $D(p \| q)$, the following inequality holds:
    \begin{align*}
        D(p \|q) \ge 2\mathbb{E}\left[(p(x) - q(x))^2\right].
    \end{align*}
\end{restatable}

We refer to Appendix~\ref{appendix:proofs} for the proof. We define the pointwise loss function as:
\begin{align}
    l(z,y) = \log(1+e^z) - yz.
\end{align}
We can thus write $L(p) = \mathbb{E}[l(z(x), y)]$.

\begin{lemma}
\label{lem:loss-convexity}
Let $g(x) = \sigma(z_g(x))$ be any logistic predictor. Let $S$ be a feature subspace and $p(x) = \sigma(z_p(x))$ be the predictor that minimizes $L(p)$ over $S$. Then:
\begin{equation*}
    L(p) \le L(g) + |\mathbb{E}[(p-y)z_g]|.
\end{equation*}
\end{lemma}
\begin{proof}
Let $\phi(z) = \log(1+e^z)$. The derivatives are $\phi'(z) = \sigma(z)$ and $\phi''(z) = \sigma(z)(1-\sigma(z))$. Since $\sigma(z) \in (0,1)$, we have $\phi''(z) \ge 0$, implying $\phi$ is convex. Convexity implies that for any $u, v \in \mathbb{R}$, $\phi(v) \ge \phi(u) + \phi'(u)(v-u)$. Rearranging implies the following:
\begin{equation} \label{eq:phi-convex}
    \phi(u) \le \phi(v) + \sigma(u)(u-v).
\end{equation}
We next relate the pointwise losses $l(u,y)$ and $l(v,y)$. Substituting $l(z,y) = \phi(z) - yz$, we aim to show the following inequality:
\begin{equation} \label{eq:l-convex}
    l(u,y) \le l(v,y) + (\sigma(u)-y)(u-v).
\end{equation}
Expanding terms confirms this holds given the convexity of $\phi$ in \Cref{eq:phi-convex}:
\begin{align*}
    \phi(u) - yu &\le \phi(v) - yv + \sigma(u)(u-v) - yu + yv \\
    \iff \phi(u) &\le \phi(v) + \sigma(u)(u-v).
\end{align*}

Now, for a point $x$, let $u = z_p(x)$ and $v = z_g(x)$. Applying \Cref{eq:l-convex}:
\begin{align*}
    l(z_p, y) \le l(z_g,y) + (\sigma(z_p)-y)(z_p-z_g).
\end{align*}
Taking the expectation over $(x, y)$:
\begin{align*}
    L(p) &\le L(g) + \mathbb{E}[(p-y)(z_p-z_g)] \\
    &= L(g) + \mathbb{E}[(p-y)z_p] - \mathbb{E}[(p-y)z_g].
\end{align*}
From Lemma~\ref{lem:orthogonality} (Orthogonality), we know that for any feature $x_l$ in the support of $p$, $\mathbb{E}[x_l(p-y)] = 0$. Since $z_p$ is a linear combination of such features, $\mathbb{E}[(p-y)z_p] = 0$. Substituting this yields:
\begin{equation*}
    L(p) \le L(g) - \mathbb{E}[(p-y)z_g] \le L(g) + |\mathbb{E}[(p-y)z_g]|. \qedhere
\end{equation*}
\end{proof}

We consider a path of agents $A_1, \dots, A_D$. Each agent $A_i$ receives the logit $z_{i-1}$ from its predecessor and trains a logistic predictor $p_i$ on its locally observed features $x_{S_i}$, the logit $z_{i-1}$, and possibly the logits of other predecessors. Since agent $A_i$ can always pass the logit $z_{i-1}$ through unchanged, we have $L(p_i) \le L(p_{i-1})$. Moreover, Lemma~\ref{lem:pythagorean} applies to the pair $(p_i, p_{i-1})$, since $p_{i-1}$ belongs to the model class over which $p_i$ is optimal; that is, $L(p_{i-1}) = L(p_i) + D(p_i \| p_{i-1})$.

We use the notation $\|f(x)\|_2 = \sqrt{\mathbb{E}\left[f(x)^2\right]}$ for any function $f$.

\begin{lemma}[Residual Bound via Path Coverage]
\label{lem:residual-bound}
Let $A_1, \dots, A_k$ be a path of agents where every feature $x_l$ is observed at least once. Let $g(x) = \sigma(z_g(x))$ where $z_g(x) = \sum_{l=1}^d \alpha_lx_l$ be any logistic predictor over the whole space. Assume the coefficients of $z_g$ satisfy $\sum_{l=1}^d |\alpha_l| \le B_g$, and the features satisfy $\mathbb{E}[x_l^2] \le B_X^2$, for some $B_g$ and $B_X$. Let $\varepsilon \ge L(p_1) - L(p_k)$. Then:
\begin{align*}
    |\mathbb{E}[(p_k-y)z_g]| \le B_g B_X \sqrt{\frac{k\varepsilon}{2}}.
\end{align*}
\end{lemma}
\begin{proof}
Let $z_g(x) = \sum_{l=1}^d \alpha_l x_l$. We bound the error term:
\begin{align*}
    |\mathbb{E}[(p_k-y)z_g]| &= \left|\mathbb{E}\left[\sum_{l=1}^d \alpha_l x_l(p_k-y)\right]\right| \\
    &\le \sum_{l=1}^d |\alpha_l|\; |\mathbb{E}[x_l(p_k-y)]|.
\end{align*}
Consider a feature $x_l$. Since every feature is observed at least once, $x_l$ appears in the index set of some agent $A_j$ in the path. By orthogonality (Lemma~\ref{lem:orthogonality}), $\mathbb{E}[x_l(p_j-y)] = 0$. We decompose the expectation using the triangle inequality:
\begin{align*}
    |\mathbb{E}[x_l(p_k-y)]| &\le |\mathbb{E}[x_l(p_k-p_j)]| + |\mathbb{E}[x_l(p_j-y)]| \\
    &= |\mathbb{E}[x_l(p_k-p_j)]|.
\end{align*}
Applying the Cauchy-Schwarz inequality:
\begin{align*}
    |\mathbb{E}[x_l(p_k-p_j)]| &\le \sqrt{\mathbb{E}[x_l^2]} \sqrt{\mathbb{E}[(p_k-p_j)^2]} \\
    &= \|x_l\|_2 \|p_k-p_j\|_2.
\end{align*}
Given $\|x_l\|_2 \le B_X$, we bound $\|p_k-p_j\|_2$ using the loss difference $\varepsilon$. Applying Lemma~\ref{lem:kl-mse}, we get for any $s \in \{1,\dots,k-1\}$:
\begin{equation*}
\mathbb{E}\left[(p_{s} - p_{s+1})^2\right] \leq \frac{1}{2} D(p_{s+1} \| p_{s}).
\end{equation*}
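Each agent's predecessor lies in its model class, so Lemma~\ref{lem:pythagorean} gives $D(p_{s+1} \| p_{s}) = L(p_s) - L(p_{s+1})$, and these terms telescope to $\sum_{s=1}^{k-1} D(p_{s+1} \| p_{s}) = L(p_1) - L(p_k) \le \varepsilon$. By the triangle inequality and Cauchy-Schwarz: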
\begin{align*}
\|p_j - p_k\|_2 
&\leq \sum_{s=j}^{k-1} \sqrt{\frac{D(p_{s+1} \| p_{s})}{2}} \\
&\leq \sqrt{\frac{k \sum_{s=1}^{k-1} D(p_{s+1} \| p_{s})}{2}} \\
&\leq \sqrt{\frac{k \varepsilon}{2}}.
\end{align*}
Combining these bounds with the constraint on $\alpha_l$:
\begin{equation*}
    |\mathbb{E}[(p_k-y)z_g]| \le \sum_{l=1}^d |\alpha_l| \cdot |\mathbb{E}[x_l(p_k-y)]| \le  B_g B_X \sqrt{\frac{k\varepsilon}{2}}. \qedhere
\end{equation*}
\end{proof}

We now define the coverage condition used in our main result.
\begin{definition}[$M$-Coverage Condition, \citet{kearns2026networked}]
A path satisfies the $M$-coverage condition if every contiguous subsequence of $M$ agents collectively observes all $d$ features $x_1, \dots, x_d$.
\end{definition}

We are finally ready to prove Theorem~\ref{thm:convergence}. Combining Lemma~\ref{lem:loss-convexity} and Lemma~\ref{lem:residual-bound}, we obtain the relationship $L(p_k) \le L(g) + B_g B_X \sqrt{k\varepsilon/2}$ for a path of length $k$. Extending this analysis over the full path satisfying the $M$-coverage condition leads to our main convergence result.

\begin{theorem}[Global Convergence Rate]\label{thm:convergence}
Consider a DAG $G$ containing a path of length $D$ of agents $A_1,\dots,A_D$ satisfying the $M$-coverage condition. Let $p^*$ be the global optimal logistic predictor over all $d$ features. Assume:
\begin{enumerate} 
    \item Bounded second moments: $\mathbb{E} [x_l^2] \le B_X^2$ for all $l \in \{1,\dots,d\}$. 
    \item Bounded coefficients: the optimal logits $z^*(x) = \sum_{l=1}^d \alpha_l x_l$ satisfy $\| \alpha \|_1 \le B_{p^*}$.
\end{enumerate}
Then the excess risk of the final agent $p_D$ is bounded by:
\begin{equation*}
L(p_D) - L(p^*) \leq B_{p^*} B_X \frac{M}{\sqrt{D}} =O\left( \frac{M}{\sqrt{D}} \right). 
\end{equation*}
\end{theorem}
\begin{proof}
We partition the path into $K = \lfloor D/M \rfloor$ disjoint blocks of length $M$, where we assume $D \ge M$ so that $K \ge D/(2M)$. Since the loss is non-negative, the total loss reduction along the path is at most the loss of the first agent, $L(p_1)$. By the Pigeonhole Principle, there exists at least one \emph{stable} block $k^*$ where the reduction is at most the total reduction divided by $K$. Suppose this block $k^*$ is on indices $s, s+1,\dots, t$. Then:
\begin{equation*}
\sum_{i=s+1}^t \left( L(p_{i-1}) - L(p_i) \right) \leq \frac{L(p_1)}{K} \le \frac{2M L(p_1)}{D} := \varepsilon. 
\end{equation*}
Applying Lemma~\ref{lem:loss-convexity} and Lemma~\ref{lem:residual-bound} to this block of $M$ agents with $g = p^*$, we get $L(p_t) \le L(p^*) + B_{p^*}B_X\sqrt{M\varepsilon/2} = L(p^*) + B_{p^*}B_X M\sqrt{L(p_1)/D}$. Next, note that the zero parameter vector $\theta_1 = 0$ achieves a loss of $\log 2$; since the first agent minimizes the loss within its domain, $L(p_1) \le \log 2 < 1$. Combined with the non-increasing losses, which give $L(p_D) \le L(p_t)$, we get:
\begin{equation*}
    L(p_D) - L(p^*) \le B_{p^*}B_X \frac{M}{\sqrt{D}}. \qedhere
\end{equation*}
\end{proof}
