\subsection{Proof of Theorem \ref{thm:generalization_error_withbounded_Fro_norm}}
\iffalse
Let $\mathcal{S}$ be the set of random observations over triplets with $\mathcal{X}$ as the set of objects used to generate triplets in $\mathcal{S}$, and $\Psi_1,\ldots,\Psi_n \in \mathcal{H}$ be KPCA directions for the span $\mathcal{S}_{\mathcal{X}}$. Recall that $\widehat{\Mb}$ is the empirical risk minimizer, i.e.,  optimal solution to the optimization in (\ref{finite_opt}). We have defined following linear functional $\widehat{L}_0$, in Section \ref{sec:Theoretical Guarantees}, based on $\mathcal{S}$, psd matrix $\widehat{\Mb}$ and KPCA directions for $\mathcal{S}_{\mathcal{X}}$:
\begin{eqnarray}
\widehat{L}_0:\widehat{L}_0\phi_x =    \sum_{i=1}^n\sum_{j=1}^nw_{i,j}\Psi_i\otimes \Psi_j \mathcal{P}_{\mathcal{S}_{\mathcal{X}}}\phi_x \label{L0 def}
\end{eqnarray}
where $\Psi_i\otimes \Psi_j \phi_x = \langle \Psi_j, \phi_x\rangle_\mathcal{H}\Psi_i$ and $\Wb= \text{Chol}(\widehat{\Mb})$ such that $\Wb\Wb^T=\widehat{\Mb}$, i.e., $\Wb$ is from Cholesky decomposition of $\widehat{\Mb}$.
\fi
From Proposition \ref{prop:opt2 and opt3}, we have 
\begin{eqnarray}
    \widehat{{R}}_\mathcal{S}(\widehat{L}_0) = \widehat{{R}}_\mathcal{S}(\widehat{L}). \label{eq:empirical risk of L0 and L}
\end{eqnarray}
Then, using standard Rademacher complexity bounding techniques, we can write the following:
\begin{eqnarray}
R(\widehat{L}_{0})-R(L^*)&=& R(\widehat{L}_{0})-\widehat{R}_\mathcal{S}(\widehat{L}_{0})+\widehat{R}_\mathcal{S}(\widehat{L}_{0})-\widehat{R}_\mathcal{S}(L^*)+\widehat{R}_\mathcal{S}(L^*)-R(L^*) \nonumber
    \\
    &\overset{a}{=}& R(\widehat{L}_{0})-\widehat{R}_\mathcal{S}(\widehat{L}_{0})+\widehat{R}_\mathcal{S}(\widehat{L})-\widehat{R}_\mathcal{S}(L^*)+\widehat{R}_\mathcal{S}(L^*)-R(L^*) \nonumber 
    \\ &\leq&
    2 \sup_{L}|\widehat{R}_\mathcal{S}({L})-R(L)| \nonumber
    \\ 
    &\leq& 2\mathbb{E}_{\mathcal{S}\sim \mathcal{D}}[\sup_{L}|\widehat{R}_\mathcal{S}({L})-R(L)|] + \beta\sqrt{\frac{2\ln(2/\delta)}{|\mathcal{S}|}}\label{Rademacher last}
    %\\ 
    %&\leq& 2\mathbb{E}_{\mathcal{S}\sim \mathcal{D}, \epsilon \sim \{\pm 1\} ^{|\mathcal{S}|}} \frac{2\alpha}{|\mathcal{S}|} \left[\sup_L \sum_{t\in \mathcal{S}}\epsilon_t(\|L\phi_h-L\phi_i\|_\mathcal{H}^2-\|L\phi_h-L\phi_j\|_\mathcal{H}^2))\right] + \beta\sqrt{\frac{2\ln{2/\delta}}{|\mathcal{S}|}}
    %\\
    %&=&2\mathbb{E}_{\mathcal{S}, \epsilon \sim \{\pm 1\} ^{|\mathcal{S}|}}  \frac{2l}{|\mathcal{S}|} \left[\sup_L |\sum_{t\in \mathcal{S}}|\epsilon_t( \langle L\phi_i, L\phi_i \rangle- \langle L\phi_j, L\phi_j\rangle-2(\langle L\phi_h, L\phi_i\rangle-\langle L\phi_h, L\phi_j\rangle))\right] + c\sqrt{\frac{2\ln{2/\delta}}{|\mathcal{S}|}}
\end{eqnarray}
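where $\beta:= \sup |\ell (\|L\phi_h-L\phi_i\|_\mathcal{H}^2-\|L\phi_h-L\phi_j\|_\mathcal{H}^2)-\ell (\|L\phi_h'-L\phi_i'\|_\mathcal{H}^2-\|L\phi_h'-L\phi_j'\|_\mathcal{H}^2)|$, with the supremum taken over $L$ and over all pairs of triplets, and $(a)$ is from (\ref{eq:empirical risk of L0 and L}). The first inequality uses $\widehat{R}_\mathcal{S}(\widehat{L})\leq\widehat{R}_\mathcal{S}(L^*)$, which holds because $\widehat{L}$ minimizes the empirical risk, and the last inequality holds with probability at least $1-\delta$. Note that $\beta\leq 12\alpha\lambda_F B^2$, since the difference of squared distances for a triplet is bounded in absolute value by $6\lambda_F B^2$ (see Lemma \ref{lem:difference of distances}) and the loss is $\alpha$-Lipschitz.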

Now, using the standard symmetrization and contraction lemmas, we introduce Rademacher random variables $\epsilon_t \in \{-1, 1\}$, one for each triplet $t \in \mathcal{S}$. Then, we have
\begin{eqnarray*}
    \mathbb{E}_{\mathcal{S}\sim \mathcal{D}}[\sup_{L}|\widehat{R}_\mathcal{S}({L})-R(L)|]
    \leq 
    \mathbb{E}_{\mathcal{S}\sim \mathcal{D}, \epsilon \sim \{\pm 1\} ^{|\mathcal{S}|}} \frac{2\alpha}{|\mathcal{S}|} \left[\sup_L \sum_{t\in \mathcal{S}}\epsilon_t(\|L\phi_h-L\phi_i\|_\mathcal{H}^2-\|L\phi_h-L\phi_j\|_\mathcal{H}^2)\right]
\end{eqnarray*}
The expression inside the expectation on the right hand side can be considered as a function of random triplets in $\mathcal{S}$. We focus on the expectation on the right hand side:
\begin{eqnarray}
    \mathbb{E}_{\mathcal{S}\sim \mathcal{D}, \epsilon \sim \{\pm 1\} ^{|\mathcal{S}|}} \left[\sup_L \sum_{t\in \mathcal{S}}\epsilon_t(\|L\phi_h-L\phi_i\|_\mathcal{H}^2-\|L\phi_h-L\phi_j\|_\mathcal{H}^2)\right]. \label{expectation}
\end{eqnarray}
Note that (\ref{expectation}) is finite, since the difference of triplets is bounded. Therefore, we can apply Fubini's Theorem, and write it as
\begin{eqnarray}
    \mathbb{E}_{\mathcal{S}}\left[\mathbb{E}_{\epsilon |\mathcal{S}} \left[\sup_L \sum_{t\in \mathcal{S}}\epsilon_t(\|L\phi_h-L\phi_i\|_\mathcal{H}^2-\|L\phi_h-L\phi_j\|_\mathcal{H}^2)\right] \right] \label{cond_exp}
\end{eqnarray}
%To apply Matrix Bernstein, we need Hermitian matrices. So we consider $\Kb_t=\frac{1}{2}(\Kb'_t+\Kb_t'^T)$, which does not change the solution on trace operators.
where $\mathbb{E}_{\epsilon |\mathcal{S}}$ is the conditional expectation given $\mathcal{S}$. In (\ref{expectation}), we have a set of random triplets with corresponding random features $\phi_1, \ldots, \phi_n$ inside the expectation, where randomness is based on the triplet set $\mathcal{S}$. However, the conditional expectation $\mathbb{E}_{\epsilon |\mathcal{S}}$ in (\ref{cond_exp}) is conditioned on $\mathcal{S}$. Note that the size of the Rademacher random vector $\epsilon$ is $|\mathcal{S}|$. We first focus on the conditional expectation:
\begin{eqnarray}
    \mathbb{E}_{\epsilon |\mathcal{S}} \left[\sup_L \sum_{t\in \mathcal{S}}\epsilon_t(\|L\phi_h-L\phi_i\|_\mathcal{H}^2-\|L\phi_h-L\phi_j\|_\mathcal{H}^2)\right]. \label{cond_exp2}
\end{eqnarray}
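Consider the span of features $\phi_1, \ldots, \phi_n$ and call it $\mathcal{S}_\mathcal{X}$. Let $\{\mathbf{e}_k\}_{k\geq 1}$ be an orthonormal basis of $\mathcal{H}$. Using the Riesz Representation Theorem, there exist $\tau_k \in \mathcal{H}$ such that, for any $\phi \in \mathcal{H}$, we can write $L\phi$ as follows: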
\begin{eqnarray*}   L\phi=\sum_{k=1}^\infty\langle \phi, \tau_k\rangle_\mathcal{H} \mathbf{e}_k
\end{eqnarray*}
For the conditional expectation in (\ref{cond_exp2}), we can write each $\tau_k$ as the sum of $\tau_k'$ and $\tau_k^\perp$, where $\tau_k'$ is the component that lies in $\mathcal{S}_\mathcal{X}$ and $\tau_k^\perp$ is orthogonal to $\mathcal{S}_\mathcal{X}$:
\begin{eqnarray*}
    \tau_k=\tau_k'+\tau_k^\perp.
\end{eqnarray*}
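We can represent each $\tau_k'$ as $\sum_{j=1}^nv_{k,j}\psi_j$, where $\{\psi_1, \ldots, \psi_n\}$ is an orthonormal basis for $\mathcal{S}_\mathcal{X}$, the span of $\{\phi_1, \ldots, \phi_n\}$, and $v_{k,j}\in \mathbb{R}$ for all $k,j$. Since $\langle \phi, \tau_k^\perp\rangle_\mathcal{H}=0$ for every $\phi \in \mathcal{S}_\mathcal{X}$, only $\tau_k'$ contributes, and therefore, for any $\phi_i, \phi_j \in \mathcal{S}_\mathcal{X}$,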
\begin{eqnarray}
    \langle L\phi_i, L\phi_j\rangle_\mathcal{H} &=& \sum_{k=1}^\infty\langle \phi_i, \tau_k \rangle_\mathcal{H} \langle \phi_j, \tau_k \rangle_\mathcal{H} \nonumber
    \\ &=& \sum_{a=1}^n\sum_{b=1}^n \left(\sum_{k=1}^\infty v_{k,a}v_{k,b}\right)\langle \phi_i, \psi_a \rangle_\mathcal{H} \langle \phi_j, \psi_b \rangle_\mathcal{H} \nonumber \\
    &=& \varphi_i^T\mathbf{M}^{\mathcal{S}_\mathcal{X}} \varphi_j
    \label{Riesz_representation}
\end{eqnarray}
where $\varphi_{i}=[\langle \phi_{i}, \psi_1 \rangle, \langle \phi_{i}, \psi_2 \rangle, \ldots, \langle \phi_{i}, \psi_n \rangle]^T$ and $\Mb^{\mathcal{S}_\mathcal{X}}_{i,j}=\sum_{k=1}^\infty v_{k,j}v_{k,i}$.  Note that $\Mb^{\mathcal{S}_\mathcal{X}}$ and $\{\varphi_1, \ldots, \varphi_n\}$ are functions of $\mathcal{S}$. Based on (\ref{Riesz_representation}) and the symmetry of $\Mb^{\mathcal{S}_\mathcal{X}}$, for $\phi_h,\phi_i,\phi_j \in \mathcal{S}_\mathcal{X}$, we have
\begin{eqnarray*}
    &&\|L\phi_h-L\phi_i\|_\mathcal{H}^2-\|L\phi_h-L\phi_j\|_\mathcal{H}^2
    \\&=&(\varphi_{j}-\varphi_{i})^T\Mb^{\mathcal{S}_\mathcal{X}} (2\varphi_{h}-\varphi_{i}-\varphi_{j})
    \\ &=&\frac{1}{2}\left((\varphi_{j}-\varphi_{i})^T\Mb^{\mathcal{S}_\mathcal{X}} (2\varphi_{h}-\varphi_{i}-\varphi_{j})+(2\varphi_{h}-\varphi_{i}-\varphi_{j})^T\Mb^{\mathcal{S}_\mathcal{X}}(\varphi_{j}-\varphi_{i}) \right)
    \\&=&\frac{1}{2}\text{Tr}\left(\Mb^{\mathcal{S}_\mathcal{X}}(2\varphi_{h}-\varphi_{i}-\varphi_{j})(\varphi_{j}-\varphi_{i})^T+\Mb^{\mathcal{S}_\mathcal{X}}(\varphi_{j}-\varphi_{i})(2\varphi_{h}-\varphi_{i}-\varphi_{j})^T\right)
    \\&=&\text{Tr}\left(\Mb^{\mathcal{S}_\mathcal{X}}(\varphi_{h}\varphi_{j}^T+\varphi_{j}\varphi_{h}^T-\varphi_{h}\varphi_{i}^T-\varphi_{i}\varphi_{h}^T+\varphi_{i}\varphi_{i}^T-\varphi_{j}\varphi_{j}^T)\right)
\end{eqnarray*}
For each triplet $t\in\mathcal{S}$, define $\Kb_t=\varphi_{h}\varphi_{j}^T+\varphi_{j}\varphi_{h}^T-\varphi_{h}\varphi_{i}^T-\varphi_{i}\varphi_{h}^T+\varphi_{i}\varphi_{i}^T-\varphi_{j}\varphi_{j}^T$. Then, we have
\begin{eqnarray}
    \mathbb{E}_{\mathcal{S}}\left[\mathbb{E}_{\epsilon |\mathcal{S}} \left[\sup_L \sum_{t\in \mathcal{S}}\epsilon_t(\|L\phi_h-L\phi_i\|_\mathcal{H}^2-\|L\phi_h-L\phi_j\|_\mathcal{H}^2)\right] \right] \nonumber
    \\ =
    \mathbb{E}_{\mathcal{S}}\left[\mathbb{E}_{\epsilon |\mathcal{S}} \left[\sup_L \text{Tr}\left(\Mb^{\mathcal{S}_\mathcal{X}}\sum_{t\in \mathcal{S}}\epsilon_t \Kb_t\right)\right]\right]. \label{finite_kpca}
\end{eqnarray}
For the expression inside the expectations in (\ref{finite_kpca}), we have    
\begin{eqnarray}
\sup_L \text{Tr}\left(\Mb^{\mathcal{S}_\mathcal{X}}\sum_{t\in \mathcal{S}}\epsilon_t \Kb_t\right)
    &\overset{a}{\leq}& \sup_L\sum_{i=1}^r\sigma_i(\Mb^{\mathcal{S}_\mathcal{X}})\sigma_i\left(\sum_{t\in \mathcal{S}}\epsilon_t \Kb_t\right) \nonumber
    \\
    %&\overset{b}{\leq}& \mathbb{E} \sup_L \left[\|\Gb\|_{\text{F}} \|(2\Psi_c-\Psi_a-\Psi_b)\Sigma (\Psi_b-\Psi_a)^T\|_{\text{F}}\right] \\
    &\overset{b}{\leq}&  \sup_L \left[\|\Mb^{\mathcal{S}_\mathcal{X}}\|_{\text{F}} \|\sum_{t\in \mathcal{S}}\epsilon_t \Kb_t\|_{\text{F}}\right] \nonumber
     \\
    &\overset{c}{\leq}& \lambda_F\|\sum_{t\in \mathcal{S}}\epsilon_t \Kb_t\|_{\text{F}} \nonumber
    \\ 
    &{=}& \lambda_F\sqrt{\|\sum_{t\in \mathcal{S}}\epsilon_t\Kb_t\|_{\text{F}}^2} \label{bounding_trace}.
\end{eqnarray}
    Here, $(a)$ is from von Neumann's trace inequality, $(b)$ is the result of the Cauchy--Schwarz inequality, and $(c)$ follows from $\|\Mb^{\mathcal{S}_\mathcal{X}}\|_{\text{F}}\leq\lambda_F$. Inserting (\ref{bounding_trace}) into (\ref{finite_kpca}), we can write
\begin{eqnarray*}
    \mathbb{E}_{\mathcal{S}}\left[\mathbb{E}_{\epsilon |\mathcal{S}} \left[\sup_L \sum_{t\in \mathcal{S}}\epsilon_t(\|L\phi_h-L\phi_i\|_\mathcal{H}^2-\|L\phi_h-L\phi_j\|_\mathcal{H}^2)\right] \right]
    \leq
\lambda_F\mathbb{E}_{\mathcal{S}}\left[\mathbb{E}_{\epsilon |\mathcal{S}} \left[\sqrt{\|\sum_{t\in \mathcal{S}}\epsilon_t\Kb_t\|_{\text{F}}^2}\right]\right].
\end{eqnarray*}
Then, we have
\begin{eqnarray}
\mathbb{E}_{\mathcal{S}}\left[\mathbb{E}_{\epsilon |\mathcal{S}} \left[\sqrt{\|\sum_{t\in \mathcal{S}}\epsilon_t\Kb_t\|_{\text{F}}^2}\right]\right]
       &\overset{a}{\leq}& \mathbb{E}_{\mathcal{S}}\left[\sqrt{\mathbb{E}_{\epsilon |\mathcal{S}}\left[\|\sum_{t\in \mathcal{S}}\epsilon_t\Kb_t\|_{\text{F}}^2\right]}\right]\nonumber
    \\
   &\overset{}{=}&\mathbb{E}_{\mathcal{S}}\left[ \sqrt{\mathbb{E}_{\epsilon |\mathcal{S}}\left[\langle\sum_{t\in \mathcal{S}}\epsilon_t\Kb_t, \sum_{t\in \mathcal{S}}\epsilon_t\Kb_t\rangle\right]}\right]\nonumber
    \\
   &\overset{}{=}& \mathbb{E}_{\mathcal{S}}\left[\sqrt{\mathbb{E}_{\epsilon |\mathcal{S}}\left[\sum_{t\in \mathcal{S}}\sum_{t'\in \mathcal{S}}\epsilon_t\epsilon_{t'}\langle\Kb_t, \Kb_{t'}\rangle\right]}\right]\nonumber
    \\
     &\overset{b}{=}& \mathbb{E}_{\mathcal{S}}\left[\sqrt{\mathbb{E}_{\epsilon |\mathcal{S}}\left[\sum_{t\in \mathcal{S}}\epsilon_t^2\langle\Kb_t, \Kb_t\rangle\right]}\right]\nonumber
    \\
    &\overset{}{=}& \mathbb{E}_{\mathcal{S}}\left[\sqrt{\sum_{t\in \mathcal{S}}\|\Kb_t\|_{\text{F}}^2}\right]\nonumber
    \\
    &\overset{}{\leq}& B^2\sqrt{6|\mathcal{S}|} \label{expectation last}
\end{eqnarray}
where $(a)$ is from Jensen's inequality applied to the conditional expectation over $\epsilon$, and $(b)$ is due to the fact that $\mathbb{E}(\epsilon_{t_1}\epsilon_{t_2})=0$ when $t_1\neq t_2$; the subsequent equality uses $\epsilon_t^2=1$. For the last step, recall that $\Kb_t=\varphi_{h}\varphi_{j}^T+\varphi_{j}\varphi_{h}^T-\varphi_{h}\varphi_{i}^T-\varphi_{i}\varphi_{h}^T+\varphi_{i}\varphi_{i}^T-\varphi_{j}\varphi_{j}^T$. Then, we have
\begin{eqnarray}
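    % NOTE(review): the triangle inequality over the six rank-one terms gives \|\Kb_t\|_F \leq 6B^2, i.e. \|\Kb_t\|_F^2 \leq 36B^4, not 6B^4 (e.g. \varphi_h=\varphi_j=-\varphi_i with norm B yields \|\Kb_t\|_F^2=16B^4). Re-check this constant and the bounds derived from it.
    \|\Kb_t\|_{\text{F}}^2&\overset{a}{\leq}& 6\max_{i,j}\|\varphi_{i}\varphi_{j}^T\|_{\text{F}}^2 \nonumber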
    \\ &\overset{b}{\leq}& 6B^4, \label{bounding_fro_norm of Kt}
\end{eqnarray}
where $(a)$ is by the triangle inequality and $(b)$ follows from the fact that $\|\varphi_i\|_2 = \|\phi_i\|_\mathcal{H}\leq B$. Note that $\|\varphi_i\|_2 = \|\phi_i\|_\mathcal{H}$ holds because $\varphi_i$ is the coordinate vector of $\phi_i$ in the orthonormal basis $\{\psi_1, \ldots, \psi_n\}$ of the span $\mathcal{S}_\mathcal{X}$. Finally, from (\ref{Rademacher last}) and (\ref{expectation last}), with probability at least $1-\delta$, we have
\begin{eqnarray*}
    R(\widehat{L}_0)-R(L^*)\leq 4\alpha B^2\lambda_F\sqrt{\frac{6}{|\mathcal{S}|}}+12\alpha\lambda_F B^2\sqrt{\frac{2\ln(2/\delta)}{|\mathcal{S}|}},
\end{eqnarray*}
which completes the proof of Theorem \ref{thm:generalization_error_withbounded_Fro_norm}.

\begin{lemma}\label{lem:difference of distances}
    Let $\phi(\bx)$ be a feature map from $\mathbb{R}^d$ to $\mathcal{H}$ with $\|\phi(\bx)\|_\mathcal{H}\leq B$ for all $\bx$, and $L$ be a linear operator such that $L:\mathcal{H}\rightarrow \mathcal{H}$ and $\|L^\dagger L\|_{S_2} \leq \lambda_F
    $. Then, for any $\bx_h,\bx_i,\bx_j\in \mathbb{R}^d$, we have 
    \begin{eqnarray*}
        \|L\phi_h-L\phi_i\|_\mathcal{H}^2-\|L\phi_h-L\phi_j\|_\mathcal{H}^2\leq 6B^2 \lambda_F
    \end{eqnarray*}
\end{lemma}
\textbf{Proof of Lemma \ref{lem:difference of distances}}
First, note that for any pair of features $\phi_h, \phi_j$,
\begin{eqnarray*}
    |\langle L\phi_h, L\phi_j\rangle_\mathcal{H}| &=& |\langle \phi_h, L^\dagger L\phi_j\rangle_\mathcal{H}|
    \\&\overset{a}{\leq}& \|\phi_h\|_\mathcal{H} \|L^\dagger L\phi_j\|_\mathcal{H}
    \\&\overset{b}{\leq}& \|\phi_h\|_\mathcal{H} \|L^\dagger L\|_{S_\infty}\|\phi_j\|_\mathcal{H}
    \\&{\leq}& \|\phi_h\|_\mathcal{H} \|L^\dagger L\|_{S_2}\|\phi_j\|_\mathcal{H}
    \\&\leq& B^2\lambda_F,
\end{eqnarray*}
where $(a)$ is from Cauchy-Schwarz Inequality and $(b)$ is by definition of operator norm ($\|\cdot\|_{S_\infty}$). Then, we have 
\begin{eqnarray*}
    \|L\phi_h-L\phi_i\|_\mathcal{H}^2-\|L\phi_h-L\phi_j\|_\mathcal{H}^2&=&2\langle L\phi_h, L\phi_j\rangle_\mathcal{H}-2\langle L\phi_h, L\phi_i\rangle_\mathcal{H}+\langle L\phi_i, L\phi_i\rangle_\mathcal{H}-\langle L\phi_j, L\phi_j\rangle_\mathcal{H}
    \\ &\leq& 6B^2 \lambda_F.
\end{eqnarray*}
