


















\section{Omitted Proofs}

\subsection{Proof of Theorem~\ref{thm::sl_count_exam_inf}}
\label{append::sl_count_ex}

We present a more rigorous statement of Theorem~\ref{thm::sl_count_exam_inf} along with its proof.
\begin{thm}
\label{thm::sl_count_exam}
Consider downstream tasks as prediction tasks, with the class of downstream heads closed under linear transformations (i.e., if $g$ belongs to this class, $g\circ \bA$ also belongs to it for any matrix $\bA$). Let $\mathsf{diam}(\mathcal{Z}) = \sup_{\bz,\bz'\in\mathcal{Z}} \|\bz-\bz'\|_2$ be the diameter of the representation space. Then for any $A < (\mathsf{diam}(\mathcal{Z})/2)^2$ and a test point $\bx^*$, there exist embedding functions $h_1,\cdots,h_{2M} \in \mathcal{H}$ such that $\Varr{i\sim [2M]}{h_i(\bx^*)} \geq A$ but $\Varr{{i\sim[2M]}}{g_{i,t} \circ h_i(\bx^*)} = 0$ for any downstream task $t$, where $g_{i,t}$ is an optimal downstream head for $h_i$ under task $t$. 
\end{thm}

\begin{proof}
We prove this theorem via construction, assuming that there are only two embedding functions $h_1$, $h_2$, without loss of generality. Let $\delta>0$ be a small constant. We select two points $\bz_1, \bz_2 \in \mathcal{Z}$ such that $\|\bz_1 - \bz_2\|_2 \geq \mathsf{diam}(\mathcal{Z}) - \delta$. Then we define $h_1$ and $h_2$ to satisfy the condition: $h_i(\bx^*) = \bz_i$ and $h_1 = \bA \circ h_2$ where $\bA$ is an invertible matrix. Consequently,
\begin{align*}
    \Varr{i\sim [2]}{h_i(\bx^*)} 
    = \left\|\frac{\bz_1 - \bz_2}{2}\right\|_2^2
    \geq \left(\frac{\mathsf{diam}(\mathcal{Z}) - \delta}{2}\right)^2.
\end{align*}
For a given task $t$, let $g_{1,t}$ be an optimal downstream head for the embedding function $h_1$. Next, we prove that $g_{1,t} \circ \bA$ is an optimal downstream head for $h_2$ under the same task. Suppose, for the sake of contradiction, that there exists another downstream head $g'_{2,t}$ achieving higher performance than $g_{1,t} \circ \bA$ when combined with $h_2$. Since $h_1 = \bA \circ h_2$, this implies that $g'_{2,t} \circ \bA^{-1}$ (which belongs to the class of downstream heads by closure under linear transformations) achieves higher performance than $g_{1,t}$ when combined with $h_1$. This contradicts the optimality of $g_{1,t}$. Therefore, $g_{1,t} \circ \bA$ is an optimal downstream head for $h_2$ for task $t$, so we denote it as $g_{2,t}$. Now, we have
\begin{align*}
    g_{2,t} \circ h_2(\bx^*)  = g_{1,t} \circ \bA \circ h_2(\bx^*) = g_{1,t} \circ \bA \circ \bA^{-1} \circ h_1(\bx^*) = g_{1,t} \circ h_1(\bx^*), 
\end{align*}
which leads to $\Varr{i\sim [2]}{g_{i,t} \circ h_i(\bx^*)} = 0$. As $\delta$ can be chosen arbitrarily small, we can achieve the desired result.
By setting $h_{2i-1}(\bx^*) = \bz_1$ and $h_{2i}(\bx^*) = \bz_2$ for $i \in [M]$, the same proof extends to the case of $2M$ embedding functions.
\end{proof}

\begin{figure}[t!]
  \centering
  \includegraphics[width=0.6\textwidth]{figures/nc_proof.pdf}
  \caption{Graphical visualization of the sketch for the proof of Theorem~\ref{thm::nb_consistency}. Let $\mathcal{Z}_i$ and $\mathcal{Z}_j$ denote the representation spaces defined by the embedding functions $h_i$ and $h_j$, respectively.
  Suppose that there is a reliable neighboring point $\bx^r$ that is located close to the test point $\bx^*$ in each representation space. For any downstream task $t$, a reliable neighboring point $\bx^r$ serves as an anchor for comparing different representations $\bz_i^* = h_i(\bx^*)$ and $\bz_j^* = h_j(\bx^*)$ of the test point~$\bx^*$.
  The key idea is that $y_{i,t}^*$ and $y_{j,t}^*$  --- the downstream predictions on the test point using the two different embedding functions --- should be similar because the predictions $y_{i,t}^*$ and $y_{i,t}^r$ as well as $y_{j,t}^r$ and $y_{j,t}^*$ are similar due to the Lipschitz continuity of the downstream predictors. Additionally, since $\bx^r$ is a reliable point, the predictions $y_{i,t}^r$ and $y_{j,t}^r$ are similar. Thus, it follows that $y_{i,t}^*$ and $y_{j,t}^*$ are similar as well.
  } \label{fig:sketch_proof}
\end{figure}

\subsection{Proof of Theorem~\ref{thm::nb_consistency}} \label{pf::nb_consistency}

\begin{proof}
For the sake of notational simplicity, we denote $f_i = g_{i, t} \circ h_i$. When $g_{i, t}$ is Lipschitz continuous (see Appendix~\ref{app:lips} for more details) and Equation~\eqref{eq:nb} --- $\lVert h_i(\bx^r) - h_i(\bx^*) \rVert_2 \le \epsilon_{nb}$ --- holds, we have:
\begin{equation}
\lVert f_i(\bx^r) - f_i(\bx^*) \rVert_2  \le L_{i, t} \cdot \lVert h_i(\bx^r) - h_i(\bx^*) \rVert_2 \le L_{i, t} \cdot \epsilon_{nb} \le L_{t} \cdot \epsilon_{nb} ~, ~ \forall i \in [M] .
\end{equation}
By the triangle inequality, we have the following upper bound for the output difference:
\begin{align}
\lVert f_i(\bx^*) - f_j(\bx^*) \rVert_2^2 &\le \bigl( \lVert f_i(\bx^*) - f_i(\bx^r) \rVert_2 + \lVert f_i(\bx^r) - f_j(\bx^r)\rVert_2 + \lVert f_j(\bx^r) - f_j(\bx^*) \rVert_2 \bigr)^2 \nonumber \\
&\le \bigl(L_{t} \epsilon_{nb} + \lVert f_i(\bx^r) - f_j(\bx^r)\rVert_2 + L_{t} \epsilon_{nb} \bigr)^2 \nonumber \\
&= 4\bigl(L_{t} \epsilon_{nb} \bigr)^2 + 4 \bigl(L_{t} \epsilon_{nb} \bigr) \underbrace{\lVert f_i(\bx^r) - f_j(\bx^r)\rVert_2}_{L_2 ~ \text{difference}} + \underbrace{\lVert f_i(\bx^r) - f_j(\bx^r)\rVert_2^2}_{L_2 \text{-squared difference}} . 
\end{align}
Note that the ensemble variance is proportional to the average pairwise $L_2$-squared difference across the ensemble of $f_i$:
\begin{align}
\Varr{i\sim[M]}{g_{i,t} \circ h_i(\bx^*)} &= \frac{1}{M^2} \sum_{i < j} \lVert f_i(\bx^*) - f_j(\bx^*) \rVert_2 ^2 \\
\Varr{i\sim[M]}{g_{i,t} \circ h_i(\bx^r)} &= \frac{1}{M^2} \sum_{i < j} \lVert f_i(\bx^r) - f_j(\bx^r) \rVert_2 ^2 \equiv \sigma_{r, t}^2.
\end{align}

Furthermore, by the Cauchy--Schwarz inequality, the average pairwise $L_2$ difference is bounded by:
\begin{align}
\frac{1}{M^2} \sum_{i < j} \lVert f_i(\bx^r) - f_j(\bx^r) \rVert_2 & \le \frac{1}{M^2} \Bigl( \sum_{i < j} \lVert f_i(\bx^r) - f_j(\bx^r) \rVert_2^2 \Bigr)^{1/2} \cdot \Bigl( \sum_{i < j} 1 \Bigr)^{1/2} \nonumber \\
& \le \frac{1}{M^2} \Bigl( M \sigma_{r, t} \Bigr) \cdot \Bigl( \frac{M(M-1)}{2} \Bigr)^{1/2} \le \frac{\sqrt{2}}{2} \sigma_{r, t} .
\end{align}

Thus: 
\begin{align}
\Varr{i\sim[M]}{g_{i,t} \circ h_i(\bx^*)} &= \frac{1}{M^2} \sum_{i < j} \lVert f_i(\bx^*) - f_j(\bx^*) \rVert_2 ^2 \nonumber \\
&\le \Bigl(\frac{1}{M^2}\Bigr) \Bigl(\frac{M(M-1)}{2}\Bigr) \Bigl( 4\Bigl(L_t \epsilon_{nb}\Bigr)^2 \Bigr) + 2\sqrt{2} \bigl(L_{t} \epsilon_{nb} \bigr) \sigma_{r, t} + \sigma_{r, t}^2 \nonumber \\
&\le \Bigl(\sqrt{2}L_t \epsilon_{nb}\Bigr)^2 + 2\sqrt{2} L_{t} \epsilon_{nb} \sigma_{r, t} + \sigma_{r, t}^2 \nonumber \\
&= \Bigl(\sqrt{2}L_t \epsilon_{nb} + \sigma_{r, t} \Bigr)^2.
\end{align}
\end{proof}

The reader is referred to Figure~\ref{fig:sketch_proof} for a visualization accompanying the proof sketch.



\subsection{Lipschitz Continuity of Neural Networks} \label{app:lips}

\begin{lem} \label{lem::lip_cont}
For a fully-connected layer $g_l(\bz) = a(\bw_l^T \bz + \bb_l)$ of a downstream predictor, the difference between its outputs at $\bz$ and $\bz^*$ is bounded above, by Lipschitz continuity, as follows:
\begin{equation}
  \lVert g_l (\bz) - g_l (\bz^*) \rVert_2 \le \lVert \bw_l \rVert_2 \cdot \lVert \bz - \bz^* \rVert_2.
\end{equation}

\end{lem}
\begin{proof}
Suppose $a(\cdot)$ is 1-Lipschitz continuous (e.g., identity, ReLU, sigmoid, and softmax). Then,
\begin{align}
    \lVert g_l (\bz) - g_l (\bz^*) \rVert_2 &= \lVert a\big( \bw_l^T \bz + \bb_l \big) - a\big( \bw_l^T \bz^* + \bb_l \big) \rVert_2 \nonumber \\
    &\le 1 \cdot \lVert \big( \bw_l^T \bz + \bb_l \big) - \big( \bw_l^T \bz^* + \bb_l \big) \rVert_2 \nonumber \\ 
    & = \lVert \bw_l^T \big( \bz- \bz^* \big) \rVert_2 \nonumber \\
    &\le \lVert \bw_l \rVert_2 \cdot \lVert \bz - \bz^* \rVert_2 ,
\end{align}
where $\lVert \cdot \rVert_2$ is the spectral norm for a matrix and $L_2$ norm for a vector.
\end{proof}


\begin{cor} 
For a feed-forward neural network $g = g_1 \circ g_2 \circ \cdots \circ g_L$ composed of multiple fully-connected layers, the difference between the predictions at $\bx$ and $\bx^*$ is bounded above by:
\begin{equation}
  \lVert g (\bx) - g (\bx^*) \rVert_2 \le L_t \cdot \lVert \bx - \bx^* \rVert_2,
\end{equation}
where $L_t = \lVert \bw_1 \rVert_2 \cdot \lVert \bw_2 \rVert_2 \cdots \lVert \bw_L \rVert_2$ is a Lipschitz constant for $g$.
\end{cor}









