\section{Concentration inequality}
\label{sec:concentration_ineq}
In high dimensions, naive Monte Carlo and rejection sampling almost never cover the full geometry because concentration of measure confines nearly all the volume to a thin shell. This makes estimating uncertainty with a finite sample impractical and undermines the reliability of traditional measures. Instead, we embrace the curse of dimensionality by adopting a probabilistic framework that turns it into a blessing: rather than fight measure concentration, we exploit it to build our method.

To introduce our method—and for pedagogical clarity—we begin with the simplest case of a uniform distribution on the sphere, develop the necessary subgaussian theory, and discuss the locally uniform case. Only then do we specialize to the projected Gaussian setting that underlies our estimator. By leveraging measure concentration, we obtain a theoretically justified and efficient estimator of the true variance rather than attempting to reconstruct any arbitrary high-dimensional geometry.

Overall, our goal is to derive the general inequality~\eqref{final_inequality}, which establishes a rigorous and effective connection between high-dimensional probability and deep learning. It shows how the phenomenon of concentration crucially depends on the structure of the embedding vector.
\subsection{Uniform assumption}

\begin{theorem}
\label{thm:theorem43_convergence}
Let $\boldsymbol{\varphi} \in S^{D-1}$ and $\Phi \in [0,\pi]$ be given, and define the spherical cap:
\begin{equation}
    C_\Phi(\boldsymbol{\varphi}) \;=\; \Bigl\{ \boldsymbol{\alpha} \in S^{D-1} : \boldsymbol{\alpha}^\top \boldsymbol{\varphi} \ge \cos \Phi \Bigr\}.
\end{equation}
The concentration parameter $ c = \mathbb{E}\bigl[\boldsymbol{\alpha}^\top\boldsymbol{\varphi}\bigr]$ quantifies how tightly the perturbed directions are distributed around $\boldsymbol{\varphi}$. 

For $M \in \mathbb{N}^*$, we suppose that $\{\boldsymbol{\alpha}^{(1)}, \boldsymbol{\alpha}^{(2)}, \dots, \boldsymbol{\alpha}^{(M)} \}$ is a sequence of i.i.d.\ random vectors with the uniform distribution on $C_\Phi(\boldsymbol{\varphi})$. We define the matrix
\begin{equation}A_M :=\left(\boldsymbol{\alpha}^{(1)}, \boldsymbol{\alpha}^{(2)}, \dots, \boldsymbol{\alpha}^{(M)}\right),
\end{equation}
and we denote by $\Var(A_M)$ its variance estimator.
Then, as $M \to \infty$,
   \begin{equation}
       \Var(A_M) \;\overset{\text{a.s.}}{\longrightarrow}\;\,\bigl(1-c^2\bigr).
   \end{equation} 

\end{theorem}
We are interested in understanding how quickly $\Var(A_M)$ converges to $(1-c^2)$ as $M \to + \infty$.

For the sake of simplicity, \textbf{we suppose} that, for all $i$, $\boldsymbol{\alpha}^{(i)}$ follows a uniform distribution over the whole unit hypersphere $S^{D-1}$, so that $c^2 = 0$. We also define the following perturbed estimator in this case:
\begin{equation}
\widetilde{\Var A_M^f} := \Tr\left( \frac{1}{M} \sum_{i=1}^M (\boldsymbol{\alpha}^{(i)} - \boldsymbol{\mu})(\boldsymbol{\alpha}^{(i)} - \boldsymbol{\mu})^T\right),
\end{equation}
where $\boldsymbol{\mu} = \frac{1}{M}\sum_{i=1}^M \boldsymbol{\alpha}^{(i)}$ is the empirical mean; this estimator uses the normalization $\frac{1}{M}$ in place of $\frac{1}{M-1}$.

Since $\frac{M}{M-1} \simeq 1$ for $M \geq 10$, this approximation does not affect practical computations and is used solely for convenience.


\begin{theorem}[Uniform on the whole sphere]
\label{thm:concentre}
    For all $\epsilon>0$, we have
\begin{equation}
    \mathbb{P}\Bigl(1-\widetilde{\Var A_M^f} > \epsilon\Bigr)
\le \exp\!\Bigl(-\, c_1\,D\,\epsilon\, M\Bigr).
\end{equation}

where $c_1 >0$ is an absolute constant.
\end{theorem}

\begin{theorem}[Locally uniform on spherical cap]
\label{thm:concentre_local}
    For all $\epsilon>0$, we have
\begin{equation}
    \mathbb{P}\Bigl((1-c^2)-\widetilde{\Var A_M} > \epsilon\Bigr)
\le \exp\!\Bigl(-\, c_1\,D\,\epsilon\, M\Bigr).
\end{equation}

where $c_1 >0$ is an absolute constant.
\end{theorem}

To prove Theorem~\ref{thm:concentre}, we first need to explore some key properties of sub-gaussian vectors.

\subsubsection{Preliminaries on sub-gaussian vectors}
\begin{definition}[Sub-gaussian random variable]

We say that a real random variable $X$ is sub-gaussian if there is a constant $C >0$ such that, for all $t \geq 0$,

\begin{equation}
    \mathbb{P}(|X| > t) \leq 2 \exp(-t^2/C^2).
\end{equation}
Its subgaussian norm is the quantity 

\begin{equation}
    \|X\|_{\psi_2} = \inf\left\{ \lambda>0 : \mathbb{E}\left[ \exp \left( \frac{X^2}{\lambda^2} \right) \right] \leq 2 \right\}.
\end{equation}
\end{definition}


\begin{definition}[Sub-gaussian random vector]
\label{subvector}
We say that the random vector $X$ is sub-gaussian if and only if
\begin{equation}
    \|X\|_{\psi_2} := \sup_{u \in S^{D-1}}\|u^TX\|_{\psi_2} < + \infty.
\end{equation}

\end{definition}

\begin{lemma}
    If $\boldsymbol{\alpha}$ is a random vector following a uniform distribution on the unit hypersphere $S^{D-1}$, then $\boldsymbol{\alpha}$ is sub-gaussian with $\|\boldsymbol{\alpha}\|_{\psi_2} = O\bigl(\frac{1}{\sqrt{D}}\bigr)$.
\end{lemma}

\begin{lemma}
    Let $\boldsymbol{\alpha}^{(1)},\ldots, \boldsymbol{\alpha}^{(M)}$ be $M$ i.i.d.\ random vectors following a uniform distribution on the unit hypersphere. Let $S := \sum_{i=1}^M \boldsymbol{\alpha}^{(i)}$. Then $\mathbb{E}[S] = 0$ and $S$ is sub-gaussian with $\|S\|_{\psi_2} = O\bigl(\frac{\sqrt{M}}{\sqrt{D}}\bigr)$.
\end{lemma}

\begin{proof}
    Let $u \in S^{D-1}$. Since the random variables $u^T\boldsymbol{\alpha}^{(i)}$ are independent, centered and sub-gaussian, we have
    \begin{equation}
     \|u^T S\|_{\psi_2} = \Bigl\| \sum_{i=1}^M u^T \boldsymbol{\alpha}^{(i)} \Bigr\|_{\psi_2} \leq K \left(\sum_{i=1}^M \|u^T\boldsymbol{\alpha}^{(i)}\|_{\psi_2}^2\right)^{1/2}\leq K\frac{\sqrt{M}}{\sqrt{D}},
        \end{equation}
where $K>0$ is an absolute constant whose value may change from one inequality to the next. Taking the supremum over $u \in S^{D-1}$ gives the claimed bound on $\|S\|_{\psi_2}$. A complete proof of the penultimate inequality is provided in~\citet{vershynin2018high}.
\end{proof}

\begin{corollary}
\label{cor:gauss}
$\forall \lambda >0$ and $u \in \mathbb{R}^D$,

\begin{equation}
    \mathbb{E}\bigl[ \exp\bigl( \lambda u^TS \bigr)\bigr] \leq \exp \left( \frac{K^2M}{2D}\lambda^2 \|u\|^2\right),
\end{equation}
where $K>0$ is an absolute constant proportional to the subgaussian norm of $S$. 
\end{corollary}


The following theorem gives us a concentration inequality on $\|S\|^2$. We consider it a ``weak'' version of the Hanson--Wright inequality: it does not require the components of the vector to be independent, only sub-gaussian, at the cost of requiring the matrix of the quadratic form to be positive semidefinite. We state the inequality as it appears in~\citet{hsu2012tail}.

\begin{theorem}[Weak Hanson-Wright Inequality]
\label{thm:hanson_weak}
Suppose that a random vector $X\in\mathbb{R}^n$ satisfies
\begin{equation}
\mathbb{E}\exp(u^T(X-\boldsymbol{\eta})) \le \exp\!\Bigl(\frac{\sigma^2\|u\|^2}{2}\Bigr), \forall u\in\mathbb{R}^n.
\end{equation}
Then, in the centered case $\boldsymbol{\eta}=0$, for any matrix $A\in\mathbb{R}^{m\times n}$ (with $\Sigma=A^TA$, which is positive semidefinite) and for all $t>0$:
\begin{equation}
\mathbb{P}\Bigl(\|AX\|^2 > \sigma^2\Bigl(\operatorname{tr}(\Sigma)+2\sqrt{\operatorname{tr}(\Sigma^2)\,t}+2\|\Sigma\|\,t\Bigr)\Bigr)
\le e^{-t}.
\end{equation}
where $\| \Sigma\|$ denotes the spectral (operator) norm of the matrix $\Sigma$.
\end{theorem}

\begin{proof}
A complete proof of this theorem is provided in~\citet{hsu2012tail}.
\end{proof}

\subsubsection{Proof of Theorem~\ref{thm:concentre}}


\begin{proof}
Recalling \eqref{eq:bigsum} and replacing $M-1$ by $M$ in the denominator, we obtain
\begin{equation}
    \widetilde{\Var A_M}= 1 - \|\boldsymbol{\mu}\|^2 \quad (\text{since } \|\boldsymbol{\alpha}^{(i)}\| = 1 \text{ for all } i).
\end{equation}
Our goal is to bound the deviation $\widetilde{\Var A_M} - 1$, which reduces to controlling $\| \boldsymbol{\mu}\|^2$.
Let $S = \sum_{i=1}^M \boldsymbol{\alpha}^{(i)}$. Then :

\begin{equation}
    \|\boldsymbol{\mu}\|^2 = \frac{\|S\|^2}{M^2}.
\end{equation}
Thus, the error in the variance estimation is
\begin{equation}
|1- \widetilde{\Var A_M}| = 1 - \widetilde{\Var A_M}
=\frac{\|S\|^2}{M^2}.
\end{equation}



We apply the Weak Hanson--Wright inequality (Theorem~\ref{thm:hanson_weak}) together with Corollary~\ref{cor:gauss}. We take $X = S$, $\boldsymbol{\eta}=0$, and $A=I_D$, so that $\Sigma = I_D$, with
\[
\Tr(I_D)=D,\quad \Tr(I_D^2)=D,\quad \|I_D\|=1, \quad \sigma^2 = K^2\,\frac{M}{D}.
\]
It follows that, with probability at least $1-e^{-t}$,
\begin{equation}
 \|S\|^2 \le K^2\,M\Bigl(1+2\sqrt{\frac{t}{D}}+2\frac{t}{D}\Bigr),
\end{equation}
and hence
\begin{equation}
    \|\boldsymbol{\mu}\|^2 \le \frac{K^2}{M^2}\,\,M\Bigl(1+2\sqrt{\frac{t}{D}}+2\frac{t}{D}\Bigr)
= \frac{K^2}{M}\Bigl(1+2\sqrt{\frac{t}{D}}+2\frac{t}{D}\Bigr).
\end{equation}

Thus, for every $t \geq 0$, in order to get
\begin{equation}
1- \widetilde{\Var(A_M)}=\|\boldsymbol{\mu}\|^2 \le \epsilon,
\end{equation}
it is sufficient that:
\begin{equation}
\Bigl(1+2\sqrt{\frac{t}{D}}+2\frac{t}{D}\Bigr) \le \frac{M\epsilon}{K^2}.
\end{equation}

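Suppose that $M$ is large enough that the right-hand side is bounded away from $1$ (say, at least $2$). Then there exists a constant $c_1 > 0$, depending only on $K$, such that the choice
\begin{equation}
  t = c_1\,D\,\epsilon\, M
\end{equation}
satisfies the condition above, so that $\|\boldsymbol{\mu}\|^2 \le \epsilon$ holds with probability at least $1-e^{-t}$.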

Thus, we deduce that:
\[
\mathbb{P}\Bigl(1-\widetilde{\Var(A_M)} > \epsilon\Bigr)
=\mathbb{P}\Bigl(\|\boldsymbol{\mu}\|^2 > \epsilon\Bigr)
\le \exp\!\Bigl(-\, c_1\,D\,\epsilon\, M\Bigr).
\]

\end{proof}


The general case stated in Theorem~\ref{thm:concentre_local} can be proved by combining advanced results on manifolds with convex boundaries, the log-Sobolev inequality, and the Bakry--Émery criterion. First, one shows that for any Lipschitz function \(F\) with Lipschitz constant $\|F\|_{\mathrm{Lip}}$ and any random vector \(\mathbf{X}\) uniformly distributed on a spherical cap, the following concentration estimate holds for all $r \ge 0$:
\begin{equation}
\mathbb{P} \left(|F(\mathbf{X}) - \mathbb{E}[F(\mathbf{X})]| > r \right) \leq\exp\left(- \frac{CD r^2}{\|F\|_{\mathrm{Lip}}^2}\right),
\end{equation}
where $C$ is an absolute constant. The proof is beyond the scope of this work and can be found in~\citet{kolesnikov2016riemannian, ledoux2001concentration}.

%\textcolor{brown}{Discussion:  
%This effect explain why, even with a relatively small ($M \simeq 50$) number of forward passes, the variance estimator $\Var(A_M)$ can concentrate very well around $1-c^2$ due to the presence of $D$ in the exponential term.
%One can show that for sufficiently large $M$, the following inequality holds:
%\begin{equation}
 % \mathbb{P}\Bigl(\|\mathbf{z}\|^2-\widetilde{\Var(Z_M)} > \epsilon\Bigr)
%\le \exp\!\Bigl(\frac{-\, c_1\,D\,\epsilon\, M}{\|\mathbf{z}\|^2}\Bigr),
%\end{equation}
%meaning that sampling over all embedding space would require to applies $M$ forward pass such that $M \geq \|\mathbf{z}\|^2$. This highlights the advantage of working on the unit hypersphere, making efficient sampling and estimation more feasible.
%While we have not yet established a formal proof for random vectors drawn from a spherical cap, there is no strong reason to believe the behavior should differ significantly. We believe that using isoperimetrical results for Gaussian concentration to generalize the following Levy Lemma could be a first start and we intend to address this in future work.}
\subsection{Projected gaussian assumption}

The projected Gaussian assumption covers the general case of measure concentration for Lipschitz functions on $\mathbb{R}^D$ and corresponds precisely to the normalization step in Alg.~\ref{alg:alg_train} and Alg.~\ref{alg:alg_test}. Its behavior is well understood: its rigorous justification relies on the log-Sobolev inequality and Herbst's argument, and full proofs of the following results can be found in~\citep{ledoux2001concentration}.

\begin{theorem}
Suppose that for all Lipschitz $F: \mathbb{R}^D \to \mathbb{R}$, the law of the random variable $F(\mathbf{X})$ satisfies the log-Sobolev inequality assumption, in the sense that for all $r \ge 0$,
\begin{equation}
\mathbb{P}(|F(\mathbf{X}) - \mathbb{E}[F(\mathbf{X})]| \ge r) \le c \exp\left(-\frac{r^2}{2C\|F\|_{\text{Lip}}^2}\right),
\end{equation}
for some absolute constants $c, C > 0$. Then for all $F: \mathbb{R}^D \to \mathbb{R}$ Lipschitz and $r \ge \sigma \|F\|_{\text{Lip}}$,
\begin{equation}
\mathbb{P}\left(\left|F\left(\frac{\mathbf{X}}{|\mathbf{X}|}\right) - \mathbb{E}\left[F\left(\frac{\mathbf{X}}{|\mathbf{X}|}\right)\right]\right| \ge r\right) \le 2c \exp\left(-\frac{\eta^2}{8C}\left(\frac{r}{\|F\|_{\text{Lip}}} - \sigma\right)^2\right),    
\end{equation}
where
\begin{equation}
 \eta := \mathbb{E}[|\mathbf{X}|] \quad \text{and} \quad \sigma := \mathbb{E}\left[\left|\frac{|\mathbf{X}|}{\eta} - 1\right|\right].
\end{equation}

\end{theorem}

The quantity $\|F\|_{\text{Lip}} := \sup_{\mathbf{x},\mathbf{y} \in \mathbb{R}^D: \mathbf{x} \neq \mathbf{y}} \frac{|F(\mathbf{x}) - F(\mathbf{y})|}{|\mathbf{x} - \mathbf{y}|}$ is the Lipschitz norm of $F$ with respect to the Euclidean norm on $\mathbb{R}^D$.

\begin{corollary} 
\label{cor_concentration} Consider the Gaussian case $X \sim \mathcal{N}(m, \Sigma)$, $m \in \mathbb{R}^D$, $\Sigma \in \mathrm{Sym}_{D \times D}^+(\mathbb{R})$. Then the sub-Gaussian concentration of Lipschitz functions holds with
\begin{equation}
c = 2 \quad \text{and} \quad C = \|\Sigma\|_{\mathrm{op.}} = \max_{|x|=1} \langle \Sigma x, x \rangle.
\end{equation}

Moreover, one can show that $\mathbb{E}\|X\| = K'\sqrt{D}$, leading to a concentration inequality that depends on the dimensionality $D$:
\begin{equation}
    \mathbb{P}\left(\left|F\left(\frac{\mathbf{X}}{\|\mathbf{X}\|}\right) - \mathbb{E}\left[F\left(\frac{\mathbf{X}}{\|\mathbf{X}\|}\right)\right]\right| \ge r\right) \le 2c \exp\left(-\frac{KDr^2}{\|F\|_{\mathrm{Lip}}^2}\right),
\end{equation}
where $K$ is a constant depending only on $K', \sigma$ and $C$.
\end{corollary}
\subsection{Measure tensorization}

Consider $\mathbf{Y}_1,\dots, \mathbf{Y}_M$ i.i.d.\ such that, for all $i \in [|1,M|]$, $\mathbf{Y}_i$ follows the same distribution as $\frac{\mathbf{X}}{\|\mathbf{X}\|}$. Because the log-Sobolev inequality is stable under tensorization of measures~\citep{chafai2024logarithmic} and because the variance operator is a Lipschitz function on the unit hypersphere, it follows that, if $\Var\left(\mathbf{Y}_1,\dots,\mathbf{Y}_M \right)$ denotes the empirical variance,
\begin{equation}
\label{final_inequality}
    \mathbb{P}\left(\left|\Var\left(\mathbf{Y}_1,\dots,\mathbf{Y}_M\right) - \Var \left( \frac{\mathbf{X}}{\|\mathbf{X}\|}\right)\right| \ge r\right) \le 2c \exp\left(-MKr^2D\right),
\end{equation}
where $K$ is a constant depending only on $K', \sigma$, $C$ and the Lipschitz constant.

In our method, the normalized outputs of the DC Layer, $\tilde{\boldsymbol{\alpha}}^{(i)} ( \boldsymbol{z}) := \frac{\boldsymbol{\alpha}_{DC}^{(i)}}{\|\boldsymbol{\alpha}_{DC}^{(i)}\|}$ for $i \in [|1,M|]$, follow a projected Gaussian distribution, and their true variance, denoted here by $\Var(\boldsymbol{\alpha_{DC}}(\boldsymbol{z}))$, depends on the embedding $z$. The constant $K$ also depends on the embedding $z$, hence we write $K=K(z)$. It follows that:

\begin{equation}
\mathbb{P}\Bigl(\bigl|\Var(\tilde{\boldsymbol{\alpha}}^{(1)} ( \boldsymbol{z}),\dots,\tilde{\boldsymbol{\alpha}}^{(M)} ( \boldsymbol{z})) -\Var(\boldsymbol{\alpha_{DC}}(\boldsymbol{z}))\bigr| \ge r\Bigr)\leq 2c\exp\bigl(-MK(z)r^2D\bigr).
\end{equation}

This inequality also holds in the uniform case, as this setup also satisfies the Log-Sobolev inequality, which permits a measure tensorization argument. 

In practice, for OoD data, one expects the empirical variance to converge more slowly---both because it has intrinsically higher variance and because OoD examples can be highly diverse---so the required number of samples $M$ must be calibrated only on an ID validation set.

Relying on the theoretical observations established by~\citet{sun2021react}, one can easily show that the constant $K(z)$ is larger for ID embeddings than for OoD embeddings, reflecting the fact that ID embeddings concentrate more tightly in the representation space and exhibit lower variance than OoD embeddings. A more precise study of this constant is left for future work.


