\subsection{Theoretical Guarantees for Kernelized Metric Learning}\label{sec:Theoretical Guarantees}
Frequently in optimization and learning theory, we wish to characterize \textit{model classes} of functions---classes of metrics on $\mathcal{H}$ in this case. This is important for defining optimal performance within a class for theoretical results, and it has tight connections to regularization in optimization, which is used to prevent overfitting the data and ensure good generalization performance. We define model classes of kernelized Mahalanobis metrics by bounding the Schatten $p$-norm of their map $L$ (e.g., all kernelized metrics with a map $L$ such that $\|L\|_{S_p}\leq \lambda$). For a compact, bounded linear operator ${T}$, its Schatten $p$-norm is defined to be $\|T\|_{S_p}:=\left(\sum_{i\geq 1}s_i\left(T\right)^p \right)^{1/p}$, where $s_i(T)$ is the $i^{th}$ singular value of $T$, which may be equivalently written as $\sqrt{\lambda_i(T^\dagger T)}$, where $\dagger$ denotes the conjugate transpose and $\lambda_i(T^\dagger T)$ is the
$i^{th}$ eigenvalue of the Hermitian operator $T^\dagger T$. We focus on two particular Schatten norms. First, we consider the Schatten 2-norm, which is the Hilbert-Schmidt norm. Specifically, we restrict solutions $L$ to (\ref{empirical risk first}) to additionally satisfy $\|L^\dagger L\|_{S_2}\leq \lambda_F$ for a given $\lambda_F>0$. Furthermore, we consider the Schatten 1-norm, also referred to as the trace or nuclear norm. In this setting, we assume that $\|L^\dagger L\|_{S_1}\leq\lambda_*$ and again restrict solutions to satisfy this constraint.

We define the optimal (possibly infinite dimensional) operator $L^*$ as the minimizer of the following optimization problem:
\begin{equation}
\begin{aligned}
\min_{L} \quad & R(L)\\
\textrm{s.t.} \quad & \|L^\dagger L\|_{S_2}\leq \lambda_F.    \\
\end{aligned} \tag{P1}\label{opt-P1}
\end{equation}
Similarly we define $\widehat{L}$ as the solution to the optimization problem (\ref{opt-P2}) given below, i.e., the empirical risk minimizer:
\begin{equation}
\begin{aligned}
\min_{L} \quad & \widehat{R}_\mathcal{S}(L)\\
\textrm{s.t.} \quad &  \|L^\dagger L\|_{S_2}\leq \lambda_F.    \\
\end{aligned}\tag{P2}\label{opt-P2}
\end{equation}
\iffalse
\begin{remark}
    Recall that difference of distances is unaffected by $L$’s component on $\mathcal{S}_\mathcal{X}^\perp$ for a specific $\mathcal{S}_\mathcal{X}$ (see (\ref{eq of diff of distances})). Therefore, we have
    \begin{eqnarray*}
        \widehat{R}_\mathcal{S}(\widehat{L}) = \widehat{R}_\mathcal{S}(\widehat{L}\mathcal{P}_{\mathcal{S}_\mathcal{X}}).
    \end{eqnarray*}
\end{remark}
\fi
Suppose that $\mathcal{S}_\mathcal{X}\subset\mathcal{H}$ represents the subspace spanned by the set $\{\phi(\xb_1), \phi(\xb_2), \ldots, \phi(\xb_{n})\}$ corresponding to the features of random observations. Furthermore, let the potentially infinite dimensional linear operator $\widehat{L}_0$ denote the solution to (\ref{opt-P3}), obtained from random observations and the associated kernel features, where the norm constraint is imposed solely on the component of $L$ whose domain lies within the span of the features, i.e., within $\mathcal{S}_\mathcal{X}$:
\begin{equation}
\begin{aligned}
\min_{L} \quad & \widehat{R}_\mathcal{S}(L)\\
\textrm{s.t.} \quad & \|\mathcal{P}_{\mathcal{S}_\mathcal{X}}^\dagger L^\dagger L\mathcal{P}_{\mathcal{S}_\mathcal{X}}\|_{S_2}\leq \lambda_F,    \\
\end{aligned} \tag{P3}\label{opt-P3}
\end{equation}
where $\mathcal{P}_{\mathcal{S}_\mathcal{X}}$ denotes the projection onto $\mathcal{S}_\mathcal{X}$.
\begin{remark}\label{remark:emp risk in the span}
    Assume $\widehat{L}_0$ denotes the solution of (\ref{opt-P3}) whose domain is restricted to the span of features, i.e., $\mathcal{S}_\mathcal{X}$. This is a reasonable assumption, because $L_0\mathcal{P}_{\mathcal{S}_\mathcal{X}}$ also optimally solves (\ref{opt-P3}) for any solution $L_0$. Therefore, optimizing (\ref{opt-P3}) can be interpreted as seeking such an $\widehat{L}_0$. 
\end{remark}
\begin{lemma} \label{lem:norm inequality}
    Recall that for a compact, bounded linear operator ${T}$, its Schatten $p$-norm is denoted as $\|T\|_{S_p}$. We have, for $p\geq 1$,
\begin{eqnarray*} \|\mathcal{P}^\dagger_{\mathcal{S}_\mathcal{X}}L^\dagger L\mathcal{P}_{\mathcal{S}_\mathcal{X}}\|_{S_p} \leq \|L^\dagger L\|_{S_p}.
\end{eqnarray*}
Note that the Schatten $2$-norm is the Hilbert-Schmidt norm.
\end{lemma}
Lemma \ref{lem:norm inequality} allows us to establish a relation between solutions of (\ref{opt-P2}) and (\ref{opt-P3}), explained in Proposition \ref{prop:opt2 and opt3}. Note that optimization settings (\ref{opt-P2}) and (\ref{opt-P3}) have the same objective function. The distinction lies in the norm constraint $\|\cdot\|_{S_2}$ imposed on $L$. In (\ref{opt-P3}), the constraint applies only to the component of $L$ whose domain is restricted to the span of features, denoted by $\mathcal{S}_\mathcal{X}$. Consequently, solving (\ref{opt-P3}) for an operator $\widehat{L}_0$, as in Remark \ref{remark:emp risk in the span}, corresponds to minimizing the empirical risk in (\ref{opt-P2}) under the additional constraint that the search is restricted to the span $\mathcal{S}_\mathcal{X}$.

\begin{prop} \label{prop:opt2 and opt3}
     We observe that $\widehat{L}_0$ is in the solution set of (\ref{opt-P2}). More precisely, any ${L}$ within the feasible set of (\ref{opt-P2}) is an optimal solution, provided that ${L}\mathcal{P}_{\mathcal{S}_\mathcal{X}}=\widehat{L}_0$. As a result, (\ref{opt-P2}) and (\ref{opt-P3}) have the same optimal value, i.e., 
    \begin{eqnarray*}
        \widehat{{R}}_\mathcal{S}(\widehat{L})= \widehat{{R}}_\mathcal{S}(\widehat{L}_0).
    \end{eqnarray*}
    Therefore, optimizing the empirical risk in (\ref{opt-P2}) with a search restricted to $\mathcal{S}_\mathcal{X}$ suffices to attain the optimal value of (\ref{opt-P2}).
\end{prop}
\iffalse
Representer theorem (see Proposition \ref{prop:representer theorem}) guarantees that 
\begin{equation}
\begin{aligned}
\min_{\Mb}  \widehat{\overline{R}}_\mathcal{S}(\Mb)\leq \min_{L} \widehat{{R}}_\mathcal{S}(L) 
\end{aligned}\label{result1_rep_thm}
\end{equation}
$\widehat{\overline{R}}_\mathcal{S}(\widehat{\Mb})\leq\widehat{{R}}_\mathcal{S}(\widehat{L})$. (Use this: since $a\leq b$, the same inequality holds for their expectations; the rest then follows.)
Then, we define following risk function for a given set of points $\{\bm{x}_1, \bm{x}_2, \ldots, \bm{x}_n\}$ to exploit representer theorem.
\begin{eqnarray}
{R}_\mathcal{S}(L)&:=&\mathbb{E}\left[l(y_t(\|L\mathcal{P}_{\mathcal{S}_\mathcal{X}}\phi_h-L\mathcal{P}_{\mathcal{S}_\mathcal{X}}\phi_i\|^2-\|L\mathcal{P}_{\mathcal{S}_\mathcal{X}}\phi_h-L\mathcal{P}_{\mathcal{S}_\mathcal{X}}\phi_j\|^2))\right]. 
\end{eqnarray}
Here, the randomness in the expectation is due to random objects coming from a distribution $\mathcal{D}$ and the label conditioned on $t$. Note that ${R}_\mathcal{S}(L) =  {\overline{R}_\mathcal{S}}(\Mb)$.
\begin{equation}
\begin{aligned}
\min_{L} {R}_\mathcal{S}(L) = \min_{\Mb}  {\overline{R}_\mathcal{S}}(\Mb)
\end{aligned}\label{result1_rep_thm}
\end{equation}
\fi
Recall that we wish to
learn a kernelized metric parametrized by a bounded
linear map $L: \mathcal{H}\rightarrow \mathcal{H}$ that predicts triplets effectively based on random observations. We establish a bound on the generalization error of $\widehat{L}_0$, which is a solution to the empirical risk minimization. Note that $\widehat{L}_0$ solves both (\ref{opt-P3}) and (\ref{opt-P2}). We compare it to the optimal infinite dimensional operator $L^*$, which minimizes the true risk.

The following theorem demonstrates that, with a sufficiently large set of triplets $\mathcal{S}$, the performance of $\widehat{L}_0$ is nearly as good as that of $L^*$.
\begin{theorem}\label{thm:generalization_error_withbounded_Fro_norm}
    Fix $\delta, \lambda_F>0$ and let $\ell$ be $\alpha$-Lipschitz. Assume $\|\phi(\bx)\|_\mathcal{H}\leq B$ for any $\bx$. Then, with probability at least $1-\delta$,
\begin{eqnarray*}
    {R}(\widehat{L}_0)-R(L^*)\leq 4\alpha B^2\lambda_F\sqrt{\frac{6}{|\mathcal{S}|}}+12\alpha B^2\lambda_F\sqrt{\frac{2\ln(2/\delta)}{|\mathcal{S}|}}.
\end{eqnarray*}
\end{theorem}
\iffalse
\begin{remark}
    The constraint that $\|\phi(\bx)\|_\mathcal{H}\leq B$ is a mild one and can easily be relaxed by scaling the right hand side of the inequality.
\end{remark}

\begin{remark}
    The condition on $\gamma$ is needed for the proof. It can either be enforced as a convex constraint on the optimization or left unconstrained in which case the theorem holds with $\gamma = \lambda_F$.
\end{remark}
\fi
For any loss $\ell(\cdot)$ which upper bounds the $0/1$-loss, such as the logistic or hinge losses, the risk $R(\widehat{L}_0)$ appearing on the left-hand side is an upper bound on the expected error when predicting triplets. Hence, the above result also provides a generalization error guarantee for prediction accuracy.

To further interpret the result of Theorem \ref{thm:generalization_error_withbounded_Fro_norm}, consider the case of a linear kernel where the points used to generate triplets live in the unit ball in 
$\mathbb{R}^d$. In this case,
one can directly learn $L^TL=\Mb \in \mathbb{R}^{d\times d}$. %without needing the kernel trick. 
Setting $\lambda_F=O(d)$,
which is sufficient to ensure that the average entry of $\Mb$ is dimensionless, Theorem \ref{thm:generalization_error_withbounded_Fro_norm} shows that sampling $O(d^2\log(1/\delta))$ triplets is sufficient to ensure good generalization. As the number of degrees of freedom for a $d\times d$ matrix is $d^2$, this matches
intuition that the sample complexity should scale with degrees of freedom. In general, $\|L^\dagger L\|_{S_2}$
behaves like a notion of the effective dimensionality $d_{\textrm{eff}}$ of $L$ \citep{zhang2005learning}. Indeed, if fewer eigenvalues of $L^\dagger L$ are large, then $\lambda_F$ is smaller and the space is nearly low dimensional. Hence, we may interpret Theorem \ref{thm:generalization_error_withbounded_Fro_norm} as suggesting a sample complexity of $O(d_{\textrm{eff}}^2\log(1/\delta))$.

%In the worst case, such as when $L$ is the identity map, $d_\textrm{eff}$ is determined by the intrinsic dimension of the data. %This implies that one can learn a kernelized metric that predicts triplets well, as long as $d_\textrm{eff}$ is not large.
\iffalse
\begin{remark}
    kk
\end{remark}
\fi
Next, we bound the excess risk under the constraint $\|L^\dagger L\|_{S_1}\leq \lambda_*$. Specifically, consider the optimization problems (\ref{opt-P1}), (\ref{opt-P2}) and (\ref{opt-P3}), now with Schatten 1-norm constraints of the form $\|\cdot\|_{S_1}\leq \lambda_*$. Let $L^*_n $, \( \widehat{L}_n \) and \( \widehat{L}_{n_0} \) denote the solutions to the modified versions of problems (\ref{opt-P1}), (\ref{opt-P2}) and (\ref{opt-P3}), respectively, where the Schatten 1-norm constraints replace the Schatten 2-norm constraints. 

The following theorem establishes a bound on the generalization error of $\widehat{L}_{n_0}$ by comparing it to the true risk minimizer $L^*_n$.

\begin{theorem}\label{thm:generalization_error_withbounded_Nuclear_norm}
 Fix $\delta, \lambda_*>0$ and let $\ell$ be $\alpha$-Lipschitz. Assume $\|\phi(\bx)\|_\mathcal{H}\leq B$ for any $\bx$. Then, with probability at least $1-\delta$,
\begin{eqnarray*}
    {R}(\widehat{L}_{n_0})-R(L^*_n)\leq 4\alpha\lambda_*\left(B^2\sqrt{12\frac{\log (3|\mathcal{S}|)}{|\mathcal{S}|} } +\frac{2\log (3|\mathcal{S}|)}{|\mathcal{S}|}\right)
    \\ +12\alpha B^2\lambda_*\sqrt{\frac{2\ln(2/\delta)}{|\mathcal{S}|}},
\end{eqnarray*}
\end{theorem}
where $\mathcal{S}$ is the set of triplets chosen and $|\mathcal{S}|$ represents the size of this set. Note that restricting the Schatten 1-norm encourages the solution $L$ (and, correspondingly, its operationalized version $\Mb$; see Section \ref{sec: learning M}) to have low rank. This corresponds
to learning a low-dimensional metric over the data. This is reasonable in settings where, although the ambient dimension of the data is large, one expects that the triplet comparisons are well explained by a projection of the data points onto a low-dimensional space $\mathcal{S}^o$. As an example, consider $\phi$ corresponding to a polynomial kernel of degree 2: $\phi(\bx) = [\xb_1^2, \xb_1 \cdot \xb_2,\ldots,\bx_2^2, \bx_2 \cdot \bx_3, \ldots, \bx_d^2]^T$ for $\xb=[\bx_1,\ldots , \bx_d]^T$. Suppose the data is generated according to a true map $L^*$ which is a projection onto $\mathcal{S}^o$, the span of a sparse subset of $k\ll d^2$ monomials. Then, taking $\lambda_*=\|(L^*)^\dagger L^*\|_{S_1}=k$, Theorem \ref{thm:generalization_error_withbounded_Nuclear_norm} guarantees that sampling $O(k^2\log(k/\delta))$ triplets is sufficient. By contrast, if $L^*$ were the identity map on degree-2 polynomials, the same result would suggest a sample complexity of $O(d^4\log(d/\delta))$, which is much larger. Hence, this result is especially powerful for low or approximately low dimensional metrics.
\iffalse
\begin{remark}
    kk
\end{remark}
\fi
