\subsection{Proof of Theorem \ref{Thm: eigenvalue convergence}}
To prove the theorem, we will use the following lemma, which follows from \citep{gross2011recovering,kohler2017sub}.
\begin{lemma}[Vector Bernstein Inequality \citep{gross2011recovering,kohler2017sub}]\label{Lemma: Vector Bernstein} Suppose that $z_1,\ldots ,z_n$ are independent and identically distributed random vectors with zero mean $\mathbb{E}[z_i] =\mathbf{0}$ and bounded $\ell_2$-norm $\Vert z_i\Vert_2\le c$. Then, for every $0\le\epsilon \le c$, the following holds
\begin{equation*}
    \mathbb{P}\biggl(\,\Bigl\Vert \frac{1}{n}\sum_{i=1}^n z_i  \Bigr\Vert_2 \ge \epsilon\,\biggr) \, \le\, \exp\Bigl(-\frac{n\epsilon^2}{8c^2} +\frac{1}{4}\Bigr)
\end{equation*}

\end{lemma}

We apply the above Vector Bernstein Inequality to the random vectors $\phi(x_1)\otimes \phi(x_1) ,\ldots , \phi(x_n)\otimes \phi(x_n)$ where $\otimes$ denotes the Kronecker product. To do this, we define vector $v_i = \phi(x_i)\otimes \phi(x_i) - \mathbb{E}_{x\sim P }\bigl[\phi(x)\otimes \phi(x)\bigr]$ for every $i$. Note that $v_i$ is, by definition, a zero-mean vector and also for every $x$ we have the following for the normalized kernel function $k$:
\begin{equation*}
    \bigl\Vert \phi(x) \otimes \phi(x)\bigr\Vert^2_2 = \bigl\Vert \phi(x) \bigr\Vert^2_2 \cdot \bigl\Vert\phi(x)\bigr\Vert^2_2 = k(x,x)\cdot k(x,x) =1
\end{equation*}
Then, the triangle inequality implies that $$\bigl\Vert v_i\bigr\Vert_2 \le \bigl\Vert \phi(x_i)\otimes \phi(x_i)\bigr\Vert_2 + \bigl\Vert \mathbb{E}_{x\sim P }\bigl[\phi(x)\otimes \phi(x)\bigr] \bigr\Vert_2 \le \bigl\Vert \phi(x_i)\otimes \phi(x_i)\bigr\Vert_2 +  \mathbb{E}_{x\sim P }\bigl[\bigl\Vert\phi(x)\otimes \phi(x)\bigr\Vert_2\bigr]   = 2$$ 
As a result, the Vector Bernstein Inequality leads to the following for every $0\le \epsilon\le 2$:
\begin{equation*}
    \mathbb{P}\Bigl(\Bigl\Vert \frac{1}{n}\sum_{i=1}^n \phi(x_i)\otimes \phi(x_i) - \mathbb{E}_{x\sim P }\bigl[\phi(x)\otimes \phi(x)\bigr]  \Bigr\Vert_2 \ge \epsilon\Bigr) \, \le\, \exp\Bigl(\frac{8-n\epsilon^2}{32}\Bigr)
\end{equation*}
On the other hand, note that $\phi(x)\otimes \phi(x)$ is the vectorized version of the rank-1 matrix $\phi(x) \phi(x)^\top$, which shows that the above inequality is equivalent to the following, where $\Vert\cdot\Vert_{\mathrm{HS}}$ denotes the Hilbert-Schmidt norm, which reduces to the Frobenius norm in the finite-dimensional case,
\begin{align*}
    &\mathbb{P}\Bigl(\Bigl\Vert \frac{1}{n}\sum_{i=1}^n  \bigl[\phi(x_i) \phi(x_i)^\top\bigr] - \mathbb{E}_{x\sim P }\bigl[\phi(x) \phi(x)^\top\bigr]  \Bigr\Vert_{\mathrm{HS}} \ge \epsilon\Bigr) \, \le\, \exp\Bigl(\frac{8-n\epsilon^2}{32}\Bigr) \\
    \Longrightarrow \;\; & \mathbb{P}\Bigl(\Bigl\Vert C_X - \widetilde{C}_X  \Bigr\Vert_{\mathrm{HS}} \ge \epsilon\Bigr) \, \le\, \exp\Bigl(\frac{8-n\epsilon^2}{32}\Bigr)
\end{align*}
Subsequently, we can apply the Hoffman-Wielandt inequality which shows that for the sorted eigenvalue vectors of $C_X$ (denoted by $\widehat{\boldsymbol{\lambda}}_n$ in the theorem) and $\widetilde{C}_X$ (denoted by $\widetilde{\boldsymbol{\lambda}}$ in the theorem) we will have
$\Vert \widehat{\boldsymbol{\lambda}}_n - \widetilde{\boldsymbol{\lambda}} \Vert_2\le \Vert C_X - \widetilde{C}_X  \Vert_{\mathrm{HS}}$, which together with the previous inequality leads to

\begin{align*}
    & \mathbb{P}\Bigl(\bigl\Vert \widehat{\boldsymbol{\lambda}}_n - \widetilde{\boldsymbol{\lambda}} \bigr\Vert_2 \ge \epsilon\Bigr) \, \le\, \exp\Bigl(\frac{8-n\epsilon^2}{32}\Bigr)
\end{align*}
If we define $\delta = \exp\bigl((8-n\epsilon^2)/32\bigr)$ that implies $\epsilon \le \sqrt{\frac{32\log(2/\delta)}{n}}$, we obtain the following for every $\delta \ge \exp\bigl((2-n)/8\bigr)$ (since we suppose $0\le \epsilon \le 2$)
\begin{align*}
    & \mathbb{P}\Bigl(\bigl\Vert \widehat{\boldsymbol{\lambda}}_n - \widetilde{\boldsymbol{\lambda}} \bigr\Vert_2 \ge \sqrt{\frac{32\log(2/\delta)}{n}}\Bigr) \, \le\, \delta \\
    \Longrightarrow \;\; & \mathbb{P}\Bigl(\bigl\Vert \widehat{\boldsymbol{\lambda}}_n - \widetilde{\boldsymbol{\lambda}} \bigr\Vert_2 \le \sqrt{\frac{32\log(2/\delta)}{n}}\Bigr) \, \ge\, 1- \delta
\end{align*}
which completes the proof.


\subsection{Proof of Corollary \ref{Corollary: Finite Dimension}}

\textbf{The case of $\alpha=1$}. We show that Theorem \ref{Thm: eigenvalue convergence} on the concentration of  the eigenvalues $\boldsymbol{\lambda}=[\lambda_1,\ldots , \lambda_d]$ will further imply a concentration bound for the logarithm of the Vendi-1 score. In the case of $\mathrm{Vendi}_1$ (when $\alpha\rightarrow 1^+$), the concentration bound will be formed for the logarithm of the Vendi score, i.e., the Von-Neumann entropy (denoted as $H_1$):

$$H_1(C_X):= H_1(\boldsymbol{\lambda}) = \sum_{i=1}^d \lambda_i \log \frac{1}{\lambda_i}$$

Theorem \ref{Thm: eigenvalue convergence} shows that $\Vert \widehat{\boldsymbol{\lambda}}_n - \widetilde{\boldsymbol{\lambda}} \Vert_2 \le \sqrt{\frac{32\log(2/\delta)}{n}}$ with probability $1-\delta$. To convert this concentration bound to a bound on the order-1 entropy (for Vendi-1 score) difference $H_1({\widehat{C}_n}) - H_1(C_X)$, we leverage the following two lemmas:

\begin{lemma}\label{Lemma: Log inequality}
For every $0\le \alpha ,\beta\le 1$ such that $|\beta-\alpha|\le \frac{1}{e}$, we have $$\Bigl\vert\alpha \log \frac{1}{\alpha} - \beta \log \frac{1}{\beta}\Bigr\vert \,\le\, \vert\beta -\alpha\vert\log\frac{1}{\vert\beta -\alpha\vert}$$ 
\end{lemma}
\begin{proof}
Let $c = |\alpha-\beta|$, where $c\in[0, \frac{1}{e}]$. Defining $g(z) = z\log(\frac{1}{z})$, the first-order optimality condition $g'(z)=-\log(z) - 1 = 0$ yields $\frac{1}{e}$ as the local maximum of $g(z)$. Therefore, there are three cases of placement of $\alpha$ and $\beta$ on the interval $[0,1]$: $\alpha$ and $\beta$ appear before maximum point, after maximum point or maximum point is between $\alpha$ and $\beta$. We show that regardless of the placement of $\alpha$ and $\beta$, the above inequality remains true.
\begin{itemize}
    \item \textbf{Case 1:} $\alpha, \beta \in [0, \frac{1}{e}]$. Note that $g''(z) = -\frac{1}{z}$. Since the second-order derivative is negative and the function $g$ is monotonically increasing within the interval $[0,\frac{1}{e}]$, the gap between $g(\alpha)$ and $g(\beta)$ is maximized when $\alpha^*=0$ and $\beta^*=\alpha^*+c = c$. This directly leads to the desired bound as follows:
    $$\Bigl\vert\alpha \log \frac{1}{\alpha} - \beta \log \frac{1}{\beta}\Bigr\vert \, \le\, \Bigl\vert\alpha^* \log \frac{1}{\alpha^*} - \beta^* \log \frac{1}{\beta^*}\Bigr\vert \, =\, \bigl\vert0 \log 0 - c \log \frac{1}{c}\bigr\vert \, \le\, c \log\frac{1}{c}$$ 
    Here, we use the standard limit $0\log 0 = 0$.
    \item \textbf{Case 2:} $\alpha, \beta \in [\frac{1}{e}, 1]$. In this case, we note that $g$ is concave yet decreasing over $[\frac{1}{e},1]$, and so the gap between $g(\alpha)$ and $g(\beta)$ will be maximized when $\alpha^*=1-c$ and $\beta^*=1$. This leads to:
    $$\Bigl\vert\alpha \log \frac{1}{\alpha} - \beta \log \frac{1}{\beta}\Bigr\vert \, \le\, \Bigl\vert\alpha^* \log \frac{1}{\alpha^*} - \beta^* \log \frac{1}{\beta^*}\Bigr\vert\, =\,   (1-c) \log \frac{1}{(1-c)} \le c\log\frac{1}{c}$$ 
    where the last inequality holds because $c\in[0, \frac{1}{e}]$, and if we define the function $h(c)= c\log\frac{1}{c} - (1-c)\log\frac{1}{1-c}$, then we have $h'(c) = \log\frac{1}{c(1-c)} - 2$, which is positive over $c\in[0,c_0]$ ($e^{-2}<c_0<e^{-1}$ is where $c_0(1-c_0)=e^{-2}$), and then negative over $[c_0,\frac{1}{e}]$, and hence $h(c)\ge \min\{h(0),h(1/e)\} = 0$ for every $c\in[0,1/e]$.  
    \item \textbf{Case 3:} $\alpha \in [0, \frac{1}{e})$ and $\beta \in (\frac{1}{e}, 1]$. When $\alpha$ and $\beta$ lie on the opposite ends from the maximum point, the inequality becomes:
    $$\bigl\vert\alpha \log \frac{1}{\alpha} - \beta \log \frac{1}{\beta}\bigr\vert \leq \max \Bigl\{ \bigl\vert(1/e)\log \frac{1}{1/e} - \beta\log\frac{1}{\beta}\bigr\vert , \bigl\vert \alpha\log\frac{1}{\alpha} - (1/e)\log \frac{1}{1/e}\bigr\vert\Bigr\} \leq c\log\frac{1}{c}$$
    since we pick the side with the largest difference, this difference is upper bounded by either Case 1 or Case 2 because $\mathrm{max}\{|\frac{1}{e}-\beta|, |\alpha-\frac{1}{e}|\} < c$. Therefore, this case is upper-bounded by $c\log\frac{1}{c}$.
\end{itemize}
All three cases of placement of $\alpha$ and $\beta$ are upper-bounded by $c\log \frac{1}{c}$; therefore, the claim holds.
\end{proof}

\begin{lemma}\label{Lemma: Entropy Schur-concave}
If $\Vert \mathbf{u}\Vert_2\le \epsilon$ for $d$-dimensional vector $\mathbf{u}\ge \mathbf{0}$ where $\epsilon\le\frac{1}{e}$, then we have $$\sum_{i=1}^d u_i \log\frac{1}{u_i} \le \epsilon\sqrt{d}\log\frac{\sqrt{d}}{\epsilon}$$ 
\end{lemma}
\begin{proof}
We prove the above inequality using the KKT conditions for the following maximization problem, representing a convex optimization problem,

\begin{align*}
    &\max_{\mathbf{u}\in\mathbb{R}^d}\qquad\;\;\: \sum_{i=1}^du_i\log(\frac{1}{u_i}) \\
    &\text{\rm subject to}\quad u_i \ge 0, \;\; \text{\rm for all}\: i\\
    &\qquad\qquad\quad \sum_{i=1}^du_i^2 \leq \epsilon^2 \;\;\ (\text{equivalent to } \Vert \mathbf{u}\Vert_2\le \epsilon)
\end{align*}

In a concave maximization problem subject to convex constraints, any point that satisfies the KKT conditions is guaranteed to be a global optimum. Let us pick the following solution $\mathbf{u}^* = \frac{\epsilon}{\sqrt{d}}\mathbf{1}$ and Lagrange multipliers $\lambda^*=\frac{\sqrt{d}}{2\epsilon}\bigl(\log(\frac{\sqrt{d}}{\epsilon}) -1 \bigr)$ and $\mu_i^*=0$ for all $i$. The Lagrangian of the above problem is:

$$L(\mathbf{u},\lambda,\mu_1,\dots,\mu_d) = \sum_{i=1}^du_i\log(\frac{1}{u_i}) + \lambda(\epsilon^2-\sum_{i=1}^du_i^2)+\sum_{i=1}^d\mu_iu_i$$

\begin{itemize}
    \item \textbf{Primal Feasibility.} The solution $\mathbf{u^*}$ satisfies the primal feasibility, since $\epsilon^2 - \sum_{i=1}^d(\frac{\epsilon}{\sqrt{d}})^2 = \epsilon^2 - d\frac{\epsilon^2}{d} = 0$ and $\frac{\epsilon}{\sqrt{d}} \ge 0$.
    \item \textbf{Dual Feasibility.} $\lambda^*\geq0$ is feasible because of the assumption $\epsilon\le \frac{1}{e}$ implying that $\frac{\sqrt{d}}{\epsilon}\ge e$ for every integer dimension $d\ge 1$. Note that this implies $\lambda^*= \frac{\sqrt{d}}{2\epsilon}\bigr(\log(\frac{\sqrt{d}}{\epsilon}) -1 \bigl) \ge 0$.
    \item \textbf{Complementary Slackness.} Since $\lambda^*\bigr( \epsilon^2 - \sum_{i=1}^d(\frac{\epsilon}{\sqrt{d}})^2\bigl) = \lambda^* \cdot  0 = 0$, the condition is satisfied.
    \item \textbf{Stationarity.} The condition is satisfied as follows:
    \begin{align*}
        \frac{\partial}{\partial u_i}L(\mathbf{u^*}) &= -\log(u_i^*)-1-2\lambda^*u_i^* + \mu_i^* = -\log(\frac{\epsilon}{\sqrt{d}}) - 1 - 2\cdot\frac{\sqrt{d}}{2\epsilon}\bigr( -\log(\frac{\epsilon}{\sqrt{d}}) -1 \bigl)\cdot \frac{\epsilon}{\sqrt{d}} = 0
    \end{align*}
\end{itemize}

Since all KKT conditions are satisfied and sufficient for global optimality, $\mathbf{u}^* = \frac{\epsilon}{\sqrt{d}}\mathbf{1}$ is a global optimum of the specified concave maximization problem. We note that this result is also implied by the Schur-concavity property of entropy. Following this result, the specified objective is upper-bounded as follows:
$$\sum_{i=1}^d u_i \log\frac{1}{u_i} \le \epsilon\sqrt{d}\log\frac{\sqrt{d}}{\epsilon}$$ 
Therefore, the lemma's proof is complete.
\end{proof}

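Following the above lemmas, knowing that $\Vert \widehat{\boldsymbol{\lambda}}_n - \widetilde{\boldsymbol{\lambda}} \Vert_2 \le \sqrt{\frac{32\log(2/\delta)}{n}}$ from Theorem~\ref{Thm: eigenvalue convergence} and using the assumption $n\ge 32e^2\log(2/\delta)\approx 236.5 \log(2/\delta)$ that ensures the upper-bound satisfies $\sqrt{\frac{32\log(2/\delta)}{n}}\le \frac{1}{e}$, we can apply the above two lemmas. Specifically, Lemma~\ref{Lemma: Log inequality} applies to each coordinate because $\vert \widehat{\lambda}_{n,i} - \widetilde{\lambda}_i\vert \le \Vert \widehat{\boldsymbol{\lambda}}_n - \widetilde{\boldsymbol{\lambda}} \Vert_2 \le \frac{1}{e}$, and Lemma~\ref{Lemma: Entropy Schur-concave} is then applied to the vector $\mathbf{u}$ with entries $u_i = \vert \widehat{\lambda}_{n,i} - \widetilde{\lambda}_i\vert$. This shows that with probability at least $1-\delta$: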

$$\Bigl\vert H_1({\widehat{C}_n}) - H_1(C_X)\Bigr\vert = \Bigl\vert H_1(\widehat{\lambda}_n) - H_1(\widetilde{\lambda})\Bigr\vert \le \sqrt{\frac{8 d \log(2/\delta)}{n}} \log\Bigl( \frac{nd}{32\log(2/\delta)}\Bigr)$$

Note that under a kernel function with finite dimension $d$, the above bound will be $\mathcal{O}\Bigl(\sqrt{\frac{d}{n}} \log\bigl(nd\bigr)\Bigr)$. %To show that our bound on $d$-dimensional space can be translated to $t$-truncated population Vendi, we utilize Lemma \ref{Lemma: Projection} in the proof of Theorem \ref{Thm: truncated Vendi guarantee}. The same upper-bound will hold for $t$-truncated Vendi when dimension $d$ is replaced by truncation parameter $t$ in the upper-bound.


%We can repeat the projection lemma in the Proof of Theorem~\ref{Thm: truncated Vendi guarantee} to show that the same upper-bound will remain to hold for $t$-truncated Vendi when dimension $d$ is replaced by truncation parameter $t$ in the upper-bound.


\textbf{The case of $1< \alpha <2$.} Note that the inequality $\Vert v \Vert_\alpha \le d^{\frac{2-\alpha}{2}} \Vert v\Vert_2$ holds for every $d$-dimensional vector $v\in\mathbb{R}^d$. Therefore, we can repeat the proof of Corollary~\ref{Corollary: Order greater than 2} to show the following for every $1< \alpha <2$
\begin{align*}
    \Bigl\vert \mathrm{Vendi}_\alpha(x_1,\ldots ,x_n)^{\frac{1-\alpha}{\alpha}} -  \mathrm{Vendi}_\alpha(P_x)^{\frac{1-\alpha}{\alpha}}\Bigr\vert \, &=\, \Bigl\vert \bigl\Vert \widehat{\boldsymbol{\lambda}}_n\bigr\Vert_{\alpha} - \bigl\Vert \widetilde{\boldsymbol{\lambda}}\bigr\Vert_{\alpha}\Bigr\vert \\
    &\le\, \bigl\Vert \widehat{\boldsymbol{\lambda}}_n - \widetilde{\boldsymbol{\lambda}}\bigr\Vert_{\alpha} \\
    &\le\, d^{\frac{2-\alpha}{2}}\bigl\Vert \widehat{\boldsymbol{\lambda}}_n - \widetilde{\boldsymbol{\lambda}}\bigr\Vert_{2}.
\end{align*}
Consequently, Theorem~\ref{Thm: eigenvalue convergence} implies that for every $1< \alpha<2 $ and $\delta\ge \exp((2-n)/8)$, the following holds with probability at least $1-\delta$
\begin{equation*}
     \Bigl\vert \mathrm{Vendi}_\alpha(x_1,\ldots ,x_n)^{\frac{1-\alpha}{\alpha}} -  \mathrm{Vendi}_\alpha(P_x)^{\frac{1-\alpha}{\alpha}}\Bigr\vert \, \le \,  d^{\frac{2-\alpha}{2}}\sqrt{\frac{32\log(2/\delta)}{n}} \, =\, \sqrt{\frac{32 d^{2-\alpha}\log(2/\delta)}{n}}
\end{equation*}





\subsection{Proof of Corollary \ref{Corollary: Order greater than 2}}
Considering the $\alpha$-norm definition $\Vert \mathbf{v}\Vert_\alpha= \bigl(\sum_{i=1}^d |v_i|^\alpha\bigr)^{1/\alpha}$, we can rewrite the order-$\alpha$ Vendi definition  as
\begin{equation*}
    \mathrm{Vendi}_\alpha(x_1,\ldots ,x_n) = \bigl\Vert \widehat{\boldsymbol{\lambda}}_n\bigr\Vert^{\frac{\alpha}{1-\alpha}}_{\alpha} \quad \Longleftrightarrow \quad \mathrm{Vendi}_\alpha(x_1,\ldots ,x_n)^{\frac{1-\alpha}{\alpha}} = \bigl\Vert \widehat{\boldsymbol{\lambda}}_n\bigr\Vert_{\alpha}
\end{equation*}
where $\widehat{\boldsymbol{\lambda}}_n$ is defined in Theorem~\ref{Thm: eigenvalue convergence}. Similarly, given the definition of $\widetilde{\boldsymbol{\lambda}}$ we can write 
\begin{equation*}
    \mathrm{Vendi}_\alpha(P_x)^{\frac{1-\alpha}{\alpha}} = \bigl\Vert \widetilde{\boldsymbol{\lambda}}\bigr\Vert_{\alpha}
\end{equation*}
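Therefore, for every $\alpha\ge 2$, the following holds due to the triangle inequality and the fact that $\Vert \mathbf{v}\Vert_\alpha \le \Vert \mathbf{v}\Vert_2$ for every $\alpha\ge 2$: 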
\begin{align*}
    \Bigl\vert \mathrm{Vendi}_\alpha(x_1,\ldots ,x_n)^{\frac{1-\alpha}{\alpha}}  -  \mathrm{Vendi}_\alpha(P_x)^{\frac{1-\alpha}{\alpha}}\Bigr\vert \, &=\, \Bigl\vert \bigl\Vert \widehat{\boldsymbol{\lambda}}_n\bigr\Vert_{\alpha} - \bigl\Vert \widetilde{\boldsymbol{\lambda}}\bigr\Vert_{\alpha}\Bigr\vert \\
    &\le\, \bigl\Vert \widehat{\boldsymbol{\lambda}}_n - \widetilde{\boldsymbol{\lambda}}\bigr\Vert_{\alpha} \\
    &\le\, \bigl\Vert \widehat{\boldsymbol{\lambda}}_n - \widetilde{\boldsymbol{\lambda}}\bigr\Vert_{2}.
\end{align*}
As a result, Theorem~\ref{Thm: eigenvalue convergence} shows that for every $\alpha\ge 2$ and $\delta\ge \exp((2-n)/8)$, the following holds with probability at least $1-\delta$
\begin{equation*}
     \Bigl\vert \mathrm{Vendi}_\alpha(x_1,\ldots ,x_n)^{\frac{1-\alpha}{\alpha}} -  \mathrm{Vendi}_\alpha(P_x)^{\frac{1-\alpha}{\alpha}}\Bigr\vert \, \le \, \sqrt{\frac{32\log(2/\delta)}{n}}
\end{equation*}


\subsection{Proof of Theorem~\ref{Thm: truncated Vendi guarantee}}
We begin by proving the following lemma showing that the eigenvalues used in the definition of the $t$-truncated Vendi score are the projection of the original eigenvalues onto a $t$-dimensional probability simplex.
\begin{lemma}\label{Lemma: Projection}
   Consider $\mathbf{v}\in [0,1]^d$ that satisfies $\mathbf{1}^\top \mathbf{v} = 1$, i.e., the sum of $\mathbf{v}$'s entries equals $1$. Given integer $1\le t\le d$, define vector $\mathbf{v}^{(t)} \in [0,1]^d$ whose last $d-t$ entries are $0$, i.e., $v^{(t)}_{i} =0 $ for $t+1\le i\le d$, and its first $t$ entries are defined as $v^{(t)}_{j} = v_j + \frac{1-S_t}{t}$ where $S_t = v_1+\cdots +v_t$. Then, $\mathbf{v}^{(t)}$ is the projection of $\mathbf{v}$ onto the following simplex set, i.e., it is the point of this set with the minimum $\ell_2$-norm distance to $\mathbf{v}$:
   \begin{equation*}
       \Delta_t := \Bigl\{ \mathbf{u}\in [0,1]^d:\; u_i = 0\: \text{\rm for all}\: t+1\le i\le d,\;\;\; \sum_{i=1}^t u_i = 1  \Bigr\}.
   \end{equation*}
\end{lemma}
\begin{proof}
To prove the lemma, first note that $\mathbf{v}^{(t)} \in \Delta_t$, i.e. its first $t$ entries are non-negative and add up to $1$, and also its last $d-t$ entries are zero. Then, consider the projection problem discussed in the lemma:
\begin{align*}
    &\min_{\mathbf{u}\in\mathbb{R}^t}\qquad\;\;\: \sum_{i=1}^t \bigl(u_i - v_i\bigr)^2 \\
    &\text{\rm subject to}\quad u_i\ge 0, \;\; \text{\rm for all}\: i\\
    &\qquad\qquad\quad \sum_{i=1}^t u_i = 1
\end{align*}
Then, since we know from the assumptions that $v_i\ge 0$ and $\sum_{i=1}^t v_i \le 1$, the discussed $\mathbf{u}^*\in\mathbb{R}^t$ where $u^*_i = v_i + (1-S_t)/t$ together with Lagrangian coefficients $\mu_i =0 $ (for inequality constraint $u_i\ge 0$) and $\lambda = (1 - S_t)/t$ (for equality constraint) satisfy the KKT conditions. The primal and dual feasibility conditions as well as the complementary slackness clearly hold for this selection of primal and dual variables. Also, the KKT stationarity condition is satisfied as for every $i$ we have $u^*_i -v_i -\lambda - \mu_i = 0$. Since the optimization problem is a convex optimization task with affine constraints, the KKT conditions are sufficient for optimality, which proves the lemma.  
\end{proof}

Based on the above lemma, the eigenvalues $\widehat{\boldsymbol{\lambda}}_n^{(t)}$ used to calculate the $t$-truncated Vendi score $\mathrm{Vendi}^{(t)}_\alpha(x_1,\ldots ,x_n)$ are the projections of the top-$t$ eigenvalues in $\widehat{\boldsymbol{\lambda}}_n$ for the original score $\mathrm{Vendi}_\alpha(x_1,\ldots ,x_n)$ onto the $t$-simplex subset of $\mathbb{R}^d$ according to the $\ell_2$-norm. Similarly, the eigenvalues $\widetilde{\boldsymbol{\lambda}}^{(t)}$ used to calculate the $t$-truncated population Vendi $\mathrm{Vendi}^{(t)}_\alpha(P_x)$ are the projections of the top-$t$ eigenvalues in $\widetilde{\boldsymbol{\lambda}}$ for the original population Vendi $\mathrm{Vendi}_\alpha(P_x)$ onto the $t$-simplex subset of $\mathbb{R}^d$.

Since $\ell_2$-norm is a Hilbert space norm and the $t$-simplex subset $\Delta_t$ is a convex set, we know from the convex analysis that the $\ell_2$-distance between the projected points $\widehat{\boldsymbol{\lambda}}_n^{(t)}$ and $\widetilde{\boldsymbol{\lambda}}^{(t)}$ is upper-bounded by the $\ell_2$-distance between the original points $\widehat{\boldsymbol{\lambda}}_n$ and $\widetilde{\boldsymbol{\lambda}}$. As  a result, Theorem~\ref{Thm: eigenvalue convergence} implies that 
\begin{align*}
    & \mathbb{P}\Bigl(\bigl\Vert \widehat{\boldsymbol{\lambda}}_n - \widetilde{\boldsymbol{\lambda}} \bigr\Vert_2 \le \sqrt{\frac{32\log(2/\delta)}{n}}\Bigr) \, \ge\, 1- \delta \\
    \Longrightarrow \;\; & \mathbb{P}\Bigl(\bigl\Vert \widehat{\boldsymbol{\lambda}}^{(t)}_n - \widetilde{\boldsymbol{\lambda}}^{(t)} \bigr\Vert_2 \le \sqrt{\frac{32\log(2/\delta)}{n}}\Bigr) \, \ge\, 1- \delta
\end{align*}
However, note that the eigenvalue vectors $\widehat{\boldsymbol{\lambda}}^{(t)}_n$ and $\widetilde{\boldsymbol{\lambda}}^{(t)}$ can be analyzed in a bounded $t$-dimensional space as their entries after index $t$ are zero. Therefore, we can apply the proof of Corollary~\ref{Corollary: Finite Dimension} to show that for every $1\le \alpha<2 $ and $\delta\ge \exp((2-n)/8)$, the following holds with probability at least $1-\delta$
\begin{equation*}
     \Bigl\vert \mathrm{Vendi}^{(t)}_\alpha(x_1,\ldots ,x_n)^{\frac{1-\alpha}{\alpha}} -  \mathrm{Vendi}^{(t)}_\alpha(P_x)^{\frac{1-\alpha}{\alpha}}\Bigr\vert \, \le \,   \sqrt{\frac{32 t^{2-\alpha}\log(2/\delta)}{n}}
\end{equation*}
To extend the result to a general $\alpha>1$, we reach the following inequality covering the above result as well as the result of Corollary~\ref{Corollary: Order greater than 2} in one inequality
\begin{equation*}
     \Bigl\vert \mathrm{Vendi}^{(t)}_\alpha(x_1,\ldots ,x_n)^{\frac{1-\alpha}{\alpha}} -  \mathrm{Vendi}^{(t)}_\alpha(P_x)^{\frac{1-\alpha}{\alpha}}\Bigr\vert \, \le \, \sqrt{\frac{32 \max\{1, t^{2-\alpha}\}\log(2/\delta)}{n}}
\end{equation*}


\subsection{Proof of Theorem~\ref{Thm: Nyström, FKEA}}
\textbf{Proof of Part (a)}. As defined by \cite{ospanov_fkea_2024},  the FKEA method uses the eigenvalues of $t$ random Fourier frequencies $\omega_1,\ldots , \omega_{t}$ where for each $\omega_i$ they consider two features $\cos(\omega_i^\top x)$ and $\sin(\omega_i^\top x)$. Following the definitions, it can be seen that $k(x,x') = \mathbb{E}_{\omega\sim p_\omega}\bigl[\cos(\omega^{\top}(x-x'))\bigr]$ which is approximated by FKEA as $\frac{1}{t}\sum_{i=1}^{t} \cos(\omega_i^{\top}(x-x'))$. Therefore, if we define kernel matrix $K_i$ as the kernel matrix for $k_i(x,x') = \cos(\omega_i^{\top}(x-x'))$, then we will have
\begin{equation*}
    \frac{1}{n} K^{\mathrm{FKEA}(t)} = \frac{1}{t}\sum_{i=1}^{t} \frac{1}{n}K_i
\end{equation*}
where $\mathbb{E}_{\omega_i\sim p_\omega}[\frac{1}{n}K_i] = \frac{1}{n}K$.

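On the other hand, we note that $\Vert \frac{1}{n} K \Vert_{\mathrm{HS}} \le 1$ holds as the kernel function is normalized and hence $|k(x,x')|\le 1$; likewise, $\Vert \frac{1}{n} K_i \Vert_{\mathrm{HS}} \le 1$ holds since $|\cos(\cdot)|\le 1$, so the zero-mean matrices $\frac{1}{n}K_i - \frac{1}{n}K$ have Hilbert-Schmidt norm at most $2$. Since the Frobenius norm is the $\ell_2$-norm of the vectorized version of the matrix, we can apply Vector Bernstein inequality in Lemma~\ref{Lemma: Vector Bernstein} to show that for every $0\le \epsilon \le 2$: 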
\begin{align*}
    &\mathbb{P}\Bigl(\Bigl\Vert \frac{1}{t}\sum_{i=1}^t  \bigl[\frac{1}{n}K_i\bigr] - \frac{1}{n}K  \Bigr\Vert_{F} \ge \epsilon\Bigr) \, \le\, \exp\Bigl(\frac{8-t\epsilon^2}{32}\Bigr) \\
    \Longrightarrow \;\; & \mathbb{P}\Bigl(\Bigl\Vert \frac{1}{n}K^{\mathrm{FKEA}(t)} - \frac{1}{n}K  \Bigr\Vert_{F} \ge \epsilon\Bigr) \, \le\, \exp\Bigl(\frac{8-t\epsilon^2}{32}\Bigr)
\end{align*}
Then, we apply the Hoffman-Wielandt inequality to show that for the sorted eigenvalue vectors of $\frac{1}{n}K$ (denoted by $\widehat{\boldsymbol{\lambda}}_n$) and $\frac{1}{n}K^{\mathrm{FKEA}(t)}$ (denoted by ${\boldsymbol{\lambda}}^{\mathrm{FKEA}(t)}$) we will have
$\Vert \widehat{\boldsymbol{\lambda}}_n - {\boldsymbol{\lambda}}^{\mathrm{FKEA}(t)} \Vert_2\le \Vert \frac{1}{n}K^{\mathrm{FKEA}(t)} - \frac{1}{n}K \Vert_{\mathrm{HS}}$, which together with the previous inequality leads to
\begin{align*}
 \mathbb{P}\Bigl(\Bigl\Vert \widehat{\boldsymbol{\lambda}}_n - {\boldsymbol{\lambda}}^{\mathrm{FKEA}(t)} \Bigr\Vert_2 \ge \epsilon\Bigr) \, \le\, \exp\Bigl(\frac{8-t\epsilon^2}{32}\Bigr)
\end{align*}
Furthermore, as we showed in the proof of Theorem~\ref{Thm: eigenvalue convergence} for every $0\le \gamma\le 2$
\begin{align*}
    & \mathbb{P}\Bigl(\bigl\Vert \widehat{\boldsymbol{\lambda}}_n - \widetilde{\boldsymbol{\lambda}} \bigr\Vert_2 \ge \gamma\Bigr) \, \le\, \exp\Bigl(\frac{8-n\gamma^2}{32}\Bigr)
\end{align*}
which, by applying the union bound for $\gamma = \epsilon$, together with the previous inequality shows that
\begin{align*}
 \mathbb{P}\Bigl(\Bigl\Vert \widetilde{\boldsymbol{\lambda}} - {\boldsymbol{\lambda}}^{\mathrm{FKEA}(t)} \Bigr\Vert_2 \ge 2\epsilon\Bigr) \, &\le\, \exp\Bigl(\frac{8-t\epsilon^2}{32}\Bigr)+\exp\Bigl(\frac{8-n\epsilon^2}{32}\Bigr) \\
 &\le \, 2\exp\Bigl(\frac{8-\min\{n,t\}\epsilon^2}{32}\Bigr)
\end{align*}
Therefore, Lemma~\ref{Lemma: Projection} implies that
\begin{align*}
 \mathbb{P}\Bigl(\Bigl\Vert \widetilde{\boldsymbol{\lambda}}^{(t)} - {\boldsymbol{\lambda}}^{\mathrm{FKEA}(t)} \Bigr\Vert_2 \ge \epsilon\Bigr) \, &\le \, 2\exp\Bigl(\frac{32-\min\{n,t\}\epsilon^2}{128}\Bigr)
\end{align*}
If we define $\delta= 2\exp\bigl(\frac{32-\min\{n,t\}\epsilon^2}{128}\bigr)$, implying that $\epsilon \le \sqrt{\frac{128\log(3/\delta)}{\min\{n,t\}}}$, then the above inequality shows that
\begin{align*}
 \mathbb{P}\Bigl(\Bigl\Vert \widetilde{\boldsymbol{\lambda}}^{(t)} - {\boldsymbol{\lambda}}^{\mathrm{FKEA}(t)} \Bigr\Vert_2 \le \sqrt{\frac{128\log(3/\delta)}{\min\{n,t\}}}\Bigr) \, &\ge 1-\delta
\end{align*}
Therefore, if we follow the same steps of the proof of Theorem~\ref{Thm: truncated Vendi guarantee}, we can show
\begin{equation*}
     \Bigl\vert \mathrm{FKEA}\text{-}\mathrm{Vendi}^{(t)}_\alpha(x_1,\ldots ,x_n)^{\frac{1-\alpha}{\alpha}} -  \mathrm{Vendi}^{(t)}_\alpha(P_x)^{\frac{1-\alpha}{\alpha}}\Bigr\vert \, \le \, \sqrt{\frac{128 \max\{1, t^{2-\alpha}\}\log(3/\delta)}{\min\{n,t\}}}
\end{equation*}

\textbf{Proof of Part (b)}. To show this theorem, we use Theorem~3 from \citep{xu2015nystrom}, which shows that if the $r$th largest eigenvalue of the kernel matrix $\frac{1}{n}K$ satisfies $\lambda_r \le \frac{\tau}{n}$, then given $t\ge Cr\log(n)$ ($C$ is a universal constant), the following spectral norm bound will hold with probability $1-\frac{2}{n^3}$:
\begin{equation*}
    \bigl\Vert \frac{1}{n}K - \frac{1}{n}K^{\mathrm{Nystrom(t)}} \bigr\Vert_{sp} \le \mathcal{O}\Bigl( \frac{\tau \log(n)}{\sqrt{nt}}\Bigr).
\end{equation*}
Therefore, Weyl's inequality implies the following for the vector of sorted eigenvalues of $\frac{1}{n}K$, i.e. $\widehat{\boldsymbol{\lambda}}_n$, and that of $\frac{1}{n}K^{\mathrm{Nystrom(t)}}$, i.e., ${\boldsymbol{\lambda}}^{\mathrm{Nystrom(t)}}$,
\begin{equation*}
    \bigl\Vert \widehat{\boldsymbol{\lambda}}_n - {\boldsymbol{\lambda}}^{\mathrm{Nystrom(t)}} \bigr\Vert_{\infty} \le \mathcal{O}\Bigl( \frac{\tau \log(n)}{\sqrt{nt}}\Bigr).
\end{equation*}
As a result, considering the subvectors $\widehat{\boldsymbol{\lambda}}_n[1:t]$ and ${\boldsymbol{\lambda}}^{\mathrm{Nystrom(t)}}[1:t]$ with the first $t$ entries of the vectors, we will have:
\begin{equation*}
    \bigl\Vert \widehat{\boldsymbol{\lambda}}_n[1:t] - {\boldsymbol{\lambda}}^{\mathrm{Nystrom(t)}}[1:t] \bigr\Vert_{\infty} \le \mathcal{O}\Bigl( \frac{\tau \log(n)}{\sqrt{nt}}\Bigr) \quad \Longrightarrow \quad \bigl\Vert \widehat{\boldsymbol{\lambda}}_n[1:t] - {\boldsymbol{\lambda}}^{\mathrm{Nystrom(t)}}[1:t] \bigr\Vert_{2} \le \mathcal{O}\Bigl( \tau \log(n)\sqrt{\frac{t}{n}}\Bigr)  
\end{equation*}
Noting that the non-zero entries of ${\boldsymbol{\lambda}}^{\mathrm{Nystrom(t)}}$ are all included in the first-$t$ elements, we can apply Lemma~\ref{Lemma: Projection} which shows that with probability $1-2n^{-3}$ we have
\begin{equation*}
     \Bigl\Vert \widehat{\boldsymbol{\lambda}}^{(t)}_n - {\boldsymbol{\lambda}}^{\mathrm{Nystrom(t)}}\Bigr\Vert_{2} \le \mathcal{O}\Bigl( \tau \log(n)\sqrt{\frac{t}{n}}\Bigr)  
\end{equation*}
Also, in the proof of Theorem~\ref{Thm: truncated Vendi guarantee}, we showed that
\begin{align*}
    \mathbb{P}\Bigl(\bigl\Vert \widehat{\boldsymbol{\lambda}}^{(t)}_n - \widetilde{\boldsymbol{\lambda}}^{(t)} \bigr\Vert_2 \le \sqrt{\frac{32\log(2/\delta)}{n}}\Bigr) \, \ge\, 1- \delta
\end{align*}
Combining the above inequalities using a union bound shows that with probability at least $1-\delta-2n^{-3}$ we have 
\begin{align*}
     \Bigl\Vert {\boldsymbol{\lambda}}^{\mathrm{Nystrom(t)}} - \widetilde{\boldsymbol{\lambda}}^{(t)} \Bigr\Vert_{2} \:&\le\:  \Bigl\Vert {\boldsymbol{\lambda}}^{\mathrm{Nystrom(t)}} - \widehat{\boldsymbol{\lambda}}^{(t)}_n \Bigr\Vert_{2} + \Bigl\Vert \widehat{\boldsymbol{\lambda}}^{(t)}_n - \widetilde{\boldsymbol{\lambda}}^{(t)} \Bigr\Vert_{2} \\
     \:&\le\:  \sqrt{\frac{32\log(2/\delta)}{n}}+ \mathcal{O}\Bigl( \tau \log(n)\sqrt{\frac{t}{n}}\Bigr) \\
     &=\: \mathcal{O}\Bigl(\sqrt{\frac{\log(2/\delta)+t\log(n)^2\tau^2}{n}}\Bigr)
\end{align*}
Hence, repeating the final steps in the proof of Theorem~\ref{Thm: truncated Vendi guarantee}, we can prove
\begin{equation*}
     \Bigl\vert \mathrm{Nystrom}\text{-}\mathrm{Vendi}^{(t)}_\alpha(x_1,\ldots ,x_n)^{\frac{1-\alpha}{\alpha}} -  \mathrm{Vendi}^{(t)}_\alpha(P_x)^{\frac{1-\alpha}{\alpha}}\Bigr\vert \, \le \, \mathcal{O}\Bigl(\sqrt{\frac{\max\{t^{2-\alpha},1\}\bigl(\log(2/\delta)+t\log(n)^2\tau^2\bigr)}{n}}\Bigr)
\end{equation*}
