%\documentclass{uai2025} % for initial submission
\documentclass[accepted]{uai2025} % after acceptance, for a revised version; 
% also before submission to see how the non-anonymous paper would look like 
                        
%% There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2025} % ptmx math instead of Computer
                                         % Modern (has noticeable issues)
% \documentclass[mathfont=newtx]{uai2025} % newtx fonts (improves upon
                                          % ptmx; less tested, no support)
% NOTE: Only keep *one* line above as appropriate, as it will be replaced
%       automatically for papers to be published. Do not make any other
%       change above this note for an accepted version.

%% Choose your variant of English; be consistent
\usepackage[american]{babel}
% \usepackage[british]{babel}

%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} % amsmath with fixes and additions
% \usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{tikz} % nice language for creating drawings and diagrams
\usepackage{multirow}
\usepackage{amssymb}

%% Provided macros
% \smaller: Because the class footnote size is essentially LaTeX's \small,
%           redefining \footnotesize, we provide the original \footnotesize
%           using this macro.
%           (Use only sparingly, e.g., in drawings, as it is quite small.)

%% Self-defined macros
\newcommand{\swap}[3][-]{#3#1#2} % just an example

\title{Probabilistic Embeddings for Frozen Vision-Language Models: Uncertainty
Quantification with Gaussian Process Latent Variable Models}

% The standard author block has changed for UAI 2025 to provide
% more space for long author lists and allow for complex affiliations
%
% All author information is automatically removed by the class for the
% anonymous submission version of your paper, so you can already add your
% information below.
%
% Add authors
\author[1]{\href{mailto:<aishwarya.venkataramanan@uni-jena.de>?Subject=Your UAI 2025 paper}{Aishwarya Venkataramanan}{}}
\author[1]{\href{mailto:<paul.bodesheim@uni-jena.de>?Subject=Your UAI 2025 paper}{Paul Bodesheim}{}}
\author[1]{\href{mailto:<joachim.denzler@uni-jena.de>?Subject=Your UAI 2025 paper}{Joachim Denzler}{}}
% Add affiliations after the authors
\affil[1]{%
    Computer Vision Group, Friedrich Schiller University Jena, Germany
}
  
  \begin{document}
\maketitle

\begin{abstract}
Vision-Language Models (VLMs) learn joint representations  by mapping images and text into a shared latent space. However, recent research highlights that deterministic embeddings from standard VLMs often struggle to capture the uncertainties arising from the ambiguities in visual and textual descriptions and the multiple possible correspondences between images and texts. Existing approaches tackle this by learning probabilistic embeddings during VLM training, which demands large datasets and does not leverage the powerful representations already learned by large-scale VLMs like CLIP.
In this paper, we propose GroVE, a post-hoc approach to obtaining probabilistic embeddings from frozen VLMs. 
GroVE builds on the Gaussian Process Latent Variable Model (GPLVM) to learn a shared low-dimensional latent space where image and text inputs are mapped to a unified representation, optimized through single-modal embedding reconstruction and cross-modal alignment objectives. Once trained, the Gaussian Process model generates uncertainty-aware probabilistic embeddings.
Evaluation shows that GroVE achieves state-of-the-art uncertainty calibration across multiple downstream tasks, including cross-modal retrieval, visual question answering, and active learning.

\end{abstract}

\section{Introduction}\label{sec:intro}
Deep learning has seen remarkable success over the last decade, yet its practical applicability, especially in safety-critical areas, is limited by unreliable, overconfident predictions~\citep{abdar2021review}.

This has motivated the development of methods to quantify uncertainty in model predictions, including stochastic~\citep{blundell2015weight, gal2016dropout}, deterministic~\citep{van2020uncertainty, mukhoti2023deep, venkataramanan2023gaussian}, evidential~\citep{sensoy2018evidential}, and post-hoc approaches~\citep{corbiere2021confidence}, with the aim to produce calibrated confidence values that better reflect the model's actual performance~\citep{guo2017calibration}. While these methods have shown strong performance in tasks involving data from a single modality, they often struggle in multi-modal settings, such as vision language models (VLMs), where inputs come from different domains, such as images and text~\citep{jung2022uncertainty}.
The challenge arises because these single-modal approaches fail to capture the uncertainties that emerge from interactions between the different modalities.

VLMs typically encode images and their corresponding text descriptions into vector representations within a joint embedding space. While combining modalities enriches semantics and boosts performance on various tasks~\citep{zhang2024vision, venkataramanan2023integrating}, it also introduces additional uncertainties. Beyond the inherent uncertainty of each modality, there is an uncertainty due to the ambiguous relationships between images and text. This is illustrated in Figure~\ref{fig:det_vs_probabilsitic}, where each image can correspond to multiple text descriptions, and each text description can be associated with multiple images.
Deterministic embeddings from VLMs often fail to capture these uncertainties, motivating the development of probabilistic embeddings~\citep{ji2023map, chun2021probabilistic, chun2023improved}. Probabilistic embeddings represent a distribution, thereby capturing a range of possible representations for ambiguous or uncertain data. Typically, the embeddings are modeled as Gaussian distributions, and deep neural networks are trained to maximize their likelihood, learning the distribution parameters.
However, these methods require training the VLMs from scratch, which requires large-scale datasets, and does not effectively leverage the strong multi-modal representations already provided by the pre-trained large-scale VLMs~\citep{radford2021learning, li2022blip, singh2022flava}.


In this work, we introduce GroVE, a method to generate probabilistic embeddings for VLMs in a post-hoc manner
that builds on the Gaussian Process Latent Variable Model (GPLVM)~\citep{lawrence2003gaussian}. GroVE stands for \underline{G}aussian Process for P\underline{ro}babilistic \underline{V}LM \underline{E}mbeddings.
A GPLVM models the relationship between a low-dimensional latent space and a high-dimensional observational space using Gaussian Processes (GPs). Traditionally, the latent space is used for dimensionality reduction~\citep{lawrence2003gaussian,lalchand2022generalised}, and less commonly for more task-specific applications, such as classification~\citep{eleftheriadis2014discriminative} and cross-modal retrieval~\citep{song2017multimodal}.
In our approach, we adopt an extension of the GPLVM framework to a multi-modal context, and show that it provides a principled approach for obtaining probabilistic image and text embeddings from the deterministic embeddings of the large-scale frozen VLMs. To achieve this, we learn a joint low-dimensional latent space, where each pair of image and text embeddings derived from a VLM is represented as a single unified point.
The mapping between the latent space and the observed VLM embeddings is established through two GPs: one for image embeddings and one for text embeddings. Our training objective consists of an embedding reconstruction loss to learn this mapping, and a cross-modal alignment that regularizes the latent space to preserve the semantic structure of the data.
Once the latent space is learned, the trained GP models are used to obtain probabilistic embeddings for images and texts. 

\begin{figure}
    \centering
    \includegraphics[width=0.9\linewidth]{uai2025-template/images/det_vs_prob.pdf}
    \caption{Illustration of uncertainty arising from multiple correspondences between image and text descriptions. Deterministic embeddings represent the instances as fixed points. In contrast, probabilistic embeddings capture uncertainty by modeling text and images as distributions, allowing for multiple reasonable matches.}
    \label{fig:det_vs_probabilsitic}
\end{figure}
We evaluate GroVE for uncertainty calibration in cross-modal retrieval using CLIP~\citep{radford2021learning} and BLIP~\citep{li2022blip} on the following standard benchmarks: common objects datasets MS-COCO~\citep{lin2014microsoft} and Flickr30k~\citep{young2014image}, as well as fine-grained datasets CUB-200-2011~\citep{wah2011caltech} and Oxford Flowers 102~\citep{nilsback2008automated}. We further demonstrate the applicability of our approach in an active learning setting. We also evaluate its ability to provide calibrated uncertainty estimates in visual question answering (VQA) using the VQA 2.0 dataset~\citep{goyal2017making}.
Our results show that GroVE effectively learns probabilistic embeddings that provide calibrated uncertainty estimates. 

The contributions are summarized as follows:  
i) We propose GroVE, which extends GPLVM and provides a principled approach to obtain probabilistic VLM embeddings for both images and text. 
ii) We show that GroVE produces calibrated uncertainty estimates for cross-modal retrieval and VQA, and demonstrate its practical utility in active learning. 
iii) We design GroVE to work in a post-hoc manner on frozen VLMs, avoiding the need for retraining large-scale models from scratch. 
Code is available: \url{https://github.com/cvjena/GroVE-Probabilistic_VLM_embeddings.git}



\section{Related Work} \label{sec:related_work}

\textbf{Uncertainty Quantification in VLMs.} 
Input data ambiguities in VLMs are often addressed by replacing traditional deterministic embeddings with probabilistic embeddings~\citep{li2022differentiable}. PCME~\citep{chun2021probabilistic} models image and text embeddings as Gaussians with learned means and variances, optimizing the joint embedding space with a soft cross-modal contrastive loss. PCME++~\citep{chun2023improved} introduces Closed-Form Sampled Distance (CSD) to compute Gaussian embeddings of images and text for faster uncertainty estimation compared to PCME. MAP~\citep{ji2023map} introduces a Probability Distribution Encoder to model multi-modal representations as probabilistic distributions. However, all these methods require training from scratch, and do not effectively leverage the strong multi-modal representations already learned by the pre-trained large-scale VLMs.
ProbVLM~\citep{upadhyay2023probvlm} is a post-hoc approach that trains neural networks to estimate the parameters of a Generalized Gaussian distribution for image and text embeddings.
Although straightforward, the prediction of distribution parameters lacks proper probabilistic modeling of the statistical processes underlying the sampling of data.
Furthermore, neural networks are prone to uncalibrated predictions when presented with out-of-distribution (OOD) data or limited training samples~\citep{guo2017calibration}. In contrast, our approach leverages GPs, a Bayesian method that inherently incorporates probabilistic reasoning with reliable and theoretically sound uncertainty quantification as well as distance-awareness through the covariance function, which has proven effective in calibrated uncertainty estimation~\citep{liu2020simple, jung2022uncertainty}.

\textbf{Post-hoc approaches for uncertainty quantification.}
Some of the widely used post-hoc calibration techniques for data from a single modality are temperature scaling~\citep{guo2017calibration} and Platt scaling~\citep{platt1999probabilistic}, which adjust the model's predicted probabilities after training to better align predicted confidence scores with actual performance. Test-Time Data Augmentation (TTDA)~\citep{ayhan2018test, wang2019aleatoric} quantifies uncertainty by applying various transformations to input data during inference, generating multiple predictions, and measuring the variability among them to assess the uncertainty.
A line of work~\citep{corbiere2021confidence, yu2021slurp, hornauer2023out,  shi2019probabilistic} focuses on training auxiliary models to quantify uncertainty in the primary model, allowing for uncertainty estimation without impacting the performance of the primary model. Unlike these single-modal approaches, our method captures uncertainty from the relationship between visual and textual modalities, which is crucial for obtaining accurate uncertainty estimates in VLMs~\citep{jung2022uncertainty}.

\section{Method}
\label{sec:method}

GroVE builds on the GPLVM framework to learn a shared latent space for image and text inputs using GPs. It optimizes this space through single-modal reconstruction and cross-modal alignment loss, generating probabilistic embeddings from deterministic VLM embeddings to capture uncertainty. Figure~\ref{fig:method} illustrates the overall pipeline of GroVE.


\begin{figure}[t]
    \centering
    \includegraphics[angle=-90, width=\linewidth]{images/model.pdf}
    \caption{\textbf{Method overview of GroVE.} Given deterministic image and text embeddings from a frozen VLM, GroVE learns a joint low dimensional latent space, where each image-text pair is represented by a single point. Two GP models learn to reconstruct the image and text embeddings from the latent space points through single-modal reconstruction and cross-modal alignment objectives. The GP models act as probabilistic mappings that model the uncertainty in both the image and text modalities.}
    \label{fig:method}
\end{figure}

\subsection{Problem Description} \label{sec:formulation}

Let $\mathcal{D} = \{(I_n, T_n)\}_{n=1}^{N}\subset{\mathcal{I}\times\mathcal{T}}$ %\{(I_i, T_i)\}_{i=1}^{N}$ 
represent a dataset of $N$ paired samples, where $I_n\in\mathcal{I}$ is an image sampled from the image space $\mathcal{I}$, and $T_n\in\mathcal{T}$ is the corresponding text description sampled from the text space $\mathcal{T}$. 
The VLM maps an image $I$ and a text $T$ into a shared embedding space $\mathcal{Z}\subseteq\mathbb{R}^D$. To achieve this, the VLM consists of an image encoder $f_\mathcal{I}^{\theta_\mathcal{I}}:\mathcal{I} \rightarrow \mathcal{Z}$ with  parameters $\theta_\mathcal{I}$, and a text encoder $f_\mathcal{T}^{\theta_\mathcal{T}}:\mathcal{T} \rightarrow \mathcal{Z}$ with parameters $\theta_\mathcal{T}$. We assume that the VLM has already been trained on a large-scale dataset, and the parameters of 
$f_\mathcal{I}^{\theta_\mathcal{I}}$ and $f_\mathcal{T}^{\theta_\mathcal{T}}$ are fixed as $\theta_\mathcal{I}^{*}$ and $\theta_\mathcal{T}^{*}$, respectively. The encoders have been trained such that, for a given image-text pair $(I,T)$, the resulting embeddings $\mathbf{z}_I=f_\mathcal{I}^{\theta_\mathcal{I}^{*}}(I)$ and $\mathbf{z}_T=f_\mathcal{T}^{\theta_\mathcal{T}^{*}}(T)$ are positioned close to one another in $\mathcal{Z}$, so that semantically related visual and textual information is aligned.
While deterministic VLMs provide fixed embeddings, they lack the ability to represent the uncertainty associated with these embeddings. To address this, we propose GroVE, a method that leverages GPLVM to obtain probabilistic embeddings in a post-hoc manner to model the uncertainties.





\subsection{GroVE Model}

We obtain the image and text embeddings from the frozen VLM on $\mathcal{D}$:
\begin{equation}
    \left\{\Bigl(\mathbf{z}_{I_n},\mathbf{z}_{T_n}\Bigr) = \Bigl(f_\mathcal{I}^{\theta_\mathcal{I}^{*}}(I_n),f_\mathcal{T}^{\theta_\mathcal{T}^{*}}(T_n)\Bigr)\right\}_{n=1}^N   ,
\end{equation}
where $\mathbf{z}_{I_n}, \mathbf{z}_{T_n}\in\mathbb{R}^D$ are $D$-dimensional image and text embeddings, respectively.

To derive probabilistic embeddings using GPLVM, we assume that $\mathbf{z}_{I_n}$ and $\mathbf{z}_{T_n}$ are generated from a shared low-dimensional latent space $\mathcal{X}\subseteq\mathbb{R}^Q$ with $Q \ll D$, where each image-text pair $(\mathbf{z}_{I_n},\mathbf{z}_{T_n})$ is associated with a common latent point $\mathbf{x}_n\in\mathbb{R}^Q$. 
We define two GPLVM models $\mathcal{GP_I}$ and $\mathcal{GP_T}$, one for each modality (images from $\mathcal{I}$ and text from $\mathcal{T}$), to learn the mappings $G_\mathcal{I}: \mathcal{X}\to\mathcal{Z_I}$ and $G_\mathcal{T}: \mathcal{X}\to\mathcal{Z_T}$ from $\mathbf{x}_n$ to the high-dimensional embeddings $\mathbf{z}_{I_n}$ and $\mathbf{z}_{T_n}$, respectively. During GP model training, the latent points $\mathbf{x}_n$ are optimized to maximize the likelihood of the observed embeddings $\mathbf{z}_{I_n}$ and $\mathbf{z}_{T_n}$.

\textbf{GP model definitions.}
For describing the GPLVM models, we define the matrix $\mathbf{X}\in\mathbb{R}^{N{\times}Q}$ as the collection of the $N$ latent inputs $\mathbf{x}_n$.
Image embeddings $\mathbf{z}_{I_n}$ and text embeddings $\mathbf{z}_{T_n}$ are assumed to be generated by latent functions $G_\mathcal{I}$ and $G_\mathcal{T}$:
\begin{equation}
    \mathbf{z}_{I_n} = G_\mathcal{I}(\mathbf{x}_n) + \boldsymbol{\epsilon}_\mathcal{I} ; \quad
    \mathbf{z}_{T_n} = G_\mathcal{T}(\mathbf{x}_n) + \boldsymbol{\epsilon}_\mathcal{T}   ,
\end{equation}
with noise terms $\boldsymbol{\epsilon}_\mathcal{I}$ and $\boldsymbol{\epsilon}_\mathcal{T}$ and a GP prior such that for each dimension $d$ of the embeddings $\mathbf{z}_{I_n}$ and $\mathbf{z}_{T_n}$, the latent function values $\mathbf{g}_\mathcal{I}^d, \mathbf{g}_\mathcal{T}^d\in\mathbb{R}^N$ of the $N$ samples follow a multivariate Gaussian distribution:
\begin{equation}
\begin{aligned}
    \mathbf{g}_\mathcal{I}^d &\sim \mathcal{N}(m_\mathcal{I}(\mathbf{X}), k_\mathcal{I}(\mathbf{X},\mathbf{X})),\\
    \mathbf{g}_\mathcal{T}^d &\sim \mathcal{N}(m_\mathcal{T}(\mathbf{X}), k_\mathcal{T}(\mathbf{X},\mathbf{X})).
\end{aligned}
\end{equation}

These distributions are parameterized by a mean function $m(\cdot)$ and a covariance function $k(\cdot,\cdot)$, which defines the covariance matrix between pairs of points in $\mathbf{X}$.
For both GPLVM models, we use a constant mean function $m(\mathbf{X})=\mathbf{m}$ and a radial basis function (RBF) kernel $k(\mathbf{x}_i, \mathbf{x}_j)=\exp{\left( -\frac{\|\mathbf{x}_i-\mathbf{x}_j\|^2}{2\ell^2} \right)}$ with length-scale hyperparameter $\ell$. 
However, optimal values for $\mathbf{m}$ and $\ell$ are learnt separately for each modality $\mathcal{I}$ and $\mathcal{T}$.
The likelihood functions are  defined as:
\begin{equation}
\begin{aligned}       p\bigl(\mathbf{z}_\mathcal{I}^d|\mathbf{g}_\mathcal{I}^d\bigr) &= 
\prod_{n=1}^{N}p(\mathbf{z}_{I_n}^{d}|\mathbf{g}_{I_n}^d) = 
\mathcal{N}\bigl(\mathbf{g}_\mathcal{I}^d, \sigma_\mathcal{I}^2\mathbf{I}\bigr),\\
p\bigl(\mathbf{z}_\mathcal{T}^d|\mathbf{g}_\mathcal{T}^d\bigr) &= 
\prod_{n=1}^{N}p(\mathbf{z}_{T_n}^{d}|\mathbf{g}_{T_n}^d) = \mathcal{N}\bigl(\mathbf{g}_\mathcal{T}^d, \sigma_\mathcal{T}^2\mathbf{I}\bigr),
\end{aligned}
\end{equation}
where $\sigma_\mathcal{I}^2,\sigma_\mathcal{T}^2$ are the parameters of the Gaussian noise model, which are learned along with the model parameters during the training.


\textbf{Embedding Reconstruction Objective.}
Given the prior and the likelihood, our goal is to estimate the posterior distribution.
While exact inference is possible, it is computationally expensive, with cost $\mathcal{O}(N^3)$. In this work, we adopt a sparse GP with inducing points and variational inference~\citep{titsias2009variational}.
We introduce $M$ inducing points in $\mathcal{X}$ for each modality $\mathcal{I}$ and $\mathcal{T}$, %$\{\mathbf{v}_{I_m}, \mathbf{v}_{T_m}\}_{m=1}^{M}$ 
where $M \ll N$. 
Each inducing point corresponds to an inducing variable, represented as the latent function values $\mathbf{u}_\mathcal{I}^d \in \mathbb{R}^{M}$ and $\mathbf{u}_\mathcal{T}^d \in \mathbb{R}^{M}$, which capture the latent function values at these locations. The key idea is to approximate the true posterior distribution over the latent function values at the observed data points by conditioning on the inducing variables. % $\mathbf{u}_\mathcal{I}^d$ and $\mathbf{u}_\mathcal{T}^d$. 
This reduces the computational complexity of the model to $\mathcal{O}(NM^2)$.

To achieve this, we introduce a variational distribution over the inducing variables as:
\begin{equation} \label{eq:var_posterior}
    q(\mathbf{u}_\mathcal{I}^d) = \mathcal{N}(\mathbf{u}_\mathcal{I}^d | \boldsymbol{\mu}_\mathcal{I}^d, \mathbf{S}_\mathcal{I}^d); \;
    q(\mathbf{u}_\mathcal{T}^d) = \mathcal{N}(\mathbf{u}_\mathcal{T}^d | \boldsymbol{\mu}_\mathcal{T}^d, \mathbf{S}_\mathcal{T}^d) ,
\end{equation}
where $\boldsymbol{\mu}_\mathcal{I}^d$, $\boldsymbol{\mu}_\mathcal{T}^d$, $\mathbf{S}_\mathcal{I}^d$ and $\mathbf{S}_\mathcal{T}^d$ are variational parameters that are optimized during training. These variational parameters and the inducing points, along with the model parameters $\mathbf{m}_\mathcal{I}$, $\mathbf{m}_\mathcal{T}$, $\ell_\mathcal{I}$, $\ell_\mathcal{T}$, $\sigma_\mathcal{I}^2$ and $\sigma_\mathcal{T}^2$,
are learned by maximizing the lower bound on the marginal likelihood of the data, i.e., the evidence lower bound (ELBO), given by
\begin{equation}
\begin{aligned}
    \mathcal{L}_{ELBO}^d =  \mathbb{E}_{q(\mathbf{g}_\mathcal{I}^d)}[\log p(\mathbf{z}_\mathcal{I}^d|\mathbf{g}_\mathcal{I}^d)] - D_{KL}(q(\mathbf{u}_\mathcal{I}^d)||p(\mathbf{u}_\mathcal{I}^d)) \\ + \mathbb{E}_{q(\mathbf{g}_\mathcal{T}^d)}[\log p(\mathbf{z}_\mathcal{T}^d|\mathbf{g}_\mathcal{T}^d)] - D_{KL}(q(\mathbf{u}_\mathcal{T}^d)||p(\mathbf{u}_\mathcal{T}^d)),
\end{aligned}
\end{equation}
where $D_{KL}$ is the Kullback-Leibler (KL) divergence, and is measured between the variational distributions and their corresponding priors obtained by the GP prior evaluated at the inducing points. 
The embedding reconstruction objective is given by:
\begin{equation}
    \mathcal{L}_{emb} = -\sum_{d=1}^{D} \mathcal{L}_{ELBO}^d 
\end{equation}

\textbf{Cross-modal Alignment Objective.} 
In addition to this reconstruction objective, we introduce a regularization term, so that the predicted distributions of the corresponding image and text embeddings from the GPs match. Aligning these distributions encourages the latent space to learn a shared underlying structure between the modalities, so that semantically related data points are represented by similar latent variables.
To enforce this, we define a KL divergence loss function between the distributions of the image and text embeddings from the GP models, 
which take the forms $\mathcal{N}(\hat{\boldsymbol{\mu}}_\mathcal{I}, \hat{\boldsymbol{\Sigma}}_\mathcal{I})$ and $\mathcal{N}(\hat{\boldsymbol{\mu}}_\mathcal{T}, \hat{\boldsymbol{\Sigma}}_\mathcal{T})$, respectively (see Sec.~\ref{sec:inference} for inference using GP).

The resulting objective $\mathcal{L}_{KL}$ is the mean of the KL divergence in both directions (image-to-text and text-to-image):%~\cite{duchi2007derivations}:
\begin{multline} \label{eq:kl}
    \mathcal{L}_{KL} = \frac{1}{2}[ D_{KL}(\mathcal{N}(\hat{\boldsymbol{\mu}}_\mathcal{I}, \hat{\boldsymbol{\Sigma}}_\mathcal{I}) \| \mathcal{N}(\hat{\boldsymbol{\mu}}_\mathcal{T}, \hat{\boldsymbol{\Sigma}}_\mathcal{T})) + \\ 
    D_{KL}(\mathcal{N}(\hat{\boldsymbol{\mu}}_\mathcal{T}, \hat{\boldsymbol{\Sigma}}_\mathcal{T}) \| \mathcal{N}(\hat{\boldsymbol{\mu}}_\mathcal{I}, \hat{\boldsymbol{\Sigma}}_\mathcal{I})) ]. 
\end{multline}



\textbf{Final Objective.} The overall objective function is the weighted sum of the embedding reconstruction loss and the cross-modal alignment loss:
\begin{equation}
    \mathcal{L}_{total} = \lambda_1\mathcal{L}_{emb} + \lambda_2\mathcal{L}_{KL} 
\end{equation}
where $\lambda_1$ and $\lambda_2$ are trade-off parameters.

\subsection{Probabilistic Embeddings}\label{sec:inference}
Once the latent space representation $\mathbf{X}$ is learned, we use $\mathcal{GP_I}$ and $\mathcal{GP_T}$ to predict the probabilistic image and text embeddings. Given a new embedding $\mathbf{z}_*$ (image or text) obtained from the VLM, we first infer its latent representation $\mathbf{x}_*$ by randomly initializing $\mathbf{x}_*$ and iteratively optimizing it with the ELBO. This approximates the posterior distribution $p(\mathbf{x}_*|\mathbf{z}_*, \mathbf{z}_\mathcal{M})$, where $\mathcal{M}$ denotes the modality (either $\mathcal{I}$ or $\mathcal{T}$).

From $\mathbf{x}_*$, the probabilistic embedding can be inferred using the respective GP.

\textbf{Inference using GP.} The predictive distribution, which defines the predicted probabilistic embedding is given by:
\begin{equation}
    p(\mathbf{g}_*^d) = \int p(\mathbf{g}_*^d|\mathbf{u}^d_\mathcal{M})q(\mathbf{u}^d_\mathcal{M})d\mathbf{u}^d_\mathcal{M}
\end{equation}
Evaluating the integral results in a Gaussian distribution~\citep{hensman2015scalable}:
\begin{equation}
    p(\mathbf{g}_*^d) = \mathcal{N}(\mathbf{g}_*^d | \boldsymbol{\hat{\mu}}_*^d, \boldsymbol{\hat{\Sigma}}_*^d),
\end{equation}
where the mean $\boldsymbol{\hat{\mu}}_*^d$ and covariance $\boldsymbol{\hat{\Sigma}}_*^d$ of the embedding are:
\begin{equation} \label{eq:gp_mean}
    \boldsymbol{\hat{\mu}}_*^d = \mathbf{m}_\mathcal{M} + \mathbf{A}(\boldsymbol{\mu}^d_\mathcal{M} - \mathbf{m}_{\mathbf{v}_\mathcal{M}})
\end{equation}
\begin{equation} \label{eq:gp_var}
    \boldsymbol{\hat{\Sigma}}_*^d = k(\mathbf{x}_*,\mathbf{x}_*) + \mathbf{A}(\mathbf{S}^d_\mathcal{M}-k(\mathbf{v}_\mathcal{M},\mathbf{v}_\mathcal{M}))\mathbf{A}^T,
\end{equation}
where $\mathbf{v}_\mathcal{M}$ refers to the inducing points of the respective modality, 
$\mathbf{A}=k(\mathbf{x}_*,\mathbf{v}_\mathcal{M})k(\mathbf{v}_\mathcal{M},\mathbf{v}_\mathcal{M})^{-1}$, with dimensions $k(\mathbf{v}_\mathcal{M},\mathbf{v}_\mathcal{M}) \in \mathbb{R}^{M \times M}$ and $k(\mathbf{x}_*,\mathbf{v}_\mathcal{M}) \in \mathbb{R}^M$ and $\mathbf{m}_{\mathbf{v}_\mathcal{M}}$ is the prior mean evaluated at $\mathbf{v}_\mathcal{M}$.

\begin{table*}[h]
    \begin{adjustbox}{width=2\columnwidth, center}
    \centering
    %\begin{tabular}{p{0.2cm}|c||c c c|c c c|c c c| c c c }\hline
\begin{tabular}{@{}p{-1cm}lcccccccccccc@{}}
    \toprule
&\multirow{2}{*}{Method} & \multicolumn{3}{c}{Flickr} & \multicolumn{3}{c}{COCO} & \multicolumn{3}{c}{CUB} & \multicolumn{3}{c}{Flowers} \\ \cmidrule(lr){3-5} \cmidrule(lr){6-8} \cmidrule(lr){9-11} \cmidrule(lr){12-14}
 && $S \downarrow$ & $R^2 \uparrow$ & $-SR^2 \uparrow$ & $S \downarrow$ & $R^2 \uparrow$ & $-SR^2 \uparrow$ & $S \downarrow$ & $R^2 \uparrow$ & $-SR^2 \uparrow$ & $S \downarrow$ & $R^2 \uparrow$ & $-SR^2 \uparrow$ \\ \midrule
%\multicolumn{13}{c}{\textbf{Image to Text}} \\ \midrule
         %& \multirow{3}{*}{Method} & \multicolumn{12}{c||}{CLIP} & \multicolumn{12}{c}{BLIP} \\
         %& &\multicolumn{3}{c|}{Flickr}&\multicolumn{3}{c|}{COCO}&\multicolumn{3}{c|}{CUB}&\multicolumn{3}{c}{Flowers}
         %\\
        %&&$S$ $\downarrow$ & $R^2$ $\uparrow$ & $-SR^2$ $\uparrow$ 
        %& $S$ $\downarrow$ & $R^2$ $\uparrow$ & $-SR^2$ $\uparrow$ 
        %& $S$ $\downarrow$ & $R^2$ $\uparrow$ & $-SR^2$ $\uparrow$
        %& $S$ $\downarrow$ & $R^2$ $\uparrow$ & $-SR^2$ $\uparrow$  \\\hline
        \multirow{7}{*}{\rotatebox{90}{\textbf{Image to Text}}} 
        & Deterministic & \underline{-0.80$\pm$0.00} & \underline{0.66$\pm$0.00} & \underline{0.52$\pm$0.00} & 
        \underline{-0.80$\pm$0.00} & 0.64$\pm$0.00 & \underline{0.51$\pm$0.00} & 
        -0.10$\pm$0.00 & 0.05$\pm$0.00 & 0.00$\pm$0.00 &
        -0.10$\pm$0.00 & 0.00$\pm$0.00 & 0.00$\pm$0.00  \\
        &TTDA & 0.12$\pm$0.03 & 0.32$\pm$0.07 & -0.03$\pm$0.01 & 
        -0.36$\pm$0.05  & 0.38$\pm$0.08 & 0.17$\pm$0.05& 
        -0.60$\pm$0.00 & 0.36$\pm$0.07 & 0.21 $\pm$0.04 &
        \underline{-0.78$\pm$0.04} & 0.37$\pm$0.07 & 0.28$\pm$0.06  \\
        &PFE & -0.34$\pm$0.06 & 0.45$\pm$0.04 & 0.13$\pm$0.03 &
        0.63$\pm$0.05 & \underline{0.72$\pm$0.07} & -0.46$\pm$0.05 & 
        -0.13$\pm$0.04 & 0.28$\pm$0.03 & 0.02$\pm$0.01 &
        -0.11$\pm$0.05 & 0.29$\pm$0.04 & 0.04$\pm$0.01  \\    
        &PCME & 0.61$\pm$0.06 & 0.18$\pm$0.02 & -0.11$\pm$0.02 & 
        -0.63$\pm$0.00 & 0.50$\pm$0.03 & 0.31$\pm$0.02 & 
        -0.19$\pm$0.05 & 0.13$\pm$0.03 & 0.03$\pm$0.01&
        0.12$\pm$0.07 & 0.04$\pm$0.03 & 0.00$\pm$0.01  \\          
        &PCME++ & -0.08$\pm$0.04 & 0.33$\pm$0.04 & 0.04$\pm$0.02 & 
        -0.30$\pm$0.07 & 0.37$\pm$0.04 & 0.10$\pm$0.03 & 
        \textbf{-0.62$\pm$0.05} & \underline{0.67$\pm$0.05} & \underline{0.38$\pm$0.05} &
        -0.61$\pm$0.11 & \underline{0.55$\pm$0.04} & 0.32$\pm$0.08  \\  
        &ProbVLM & -0.79$\pm$0.05 & 0.52$\pm$0.04 & 0.38$\pm$0.04 & 
        -0.72$\pm$0.04 & 0.21$\pm$0.02 & 0.14$\pm$0.02 & 
        -0.33$\pm$0.05 & 0.46$\pm$0.04 & 0.15$\pm$0.02 &
        \underline{-0.78$\pm$0.03} & 0.47$\pm$0.03 & \underline{0.36$\pm$0.03}  \\  
        &GroVE & \textbf{-0.87$\pm$0.06} & \textbf{0.85$\pm$0.04} & \textbf{0.77$\pm$0.05} &
        \textbf{-0.90$\pm$0.03} & \textbf{0.88$\pm$0.04} & \textbf{0.79$\pm$0.02} & \underline{-0.61$\pm$0.07} & \textbf{0.75$\pm$0.04} & \textbf{0.46$\pm$0.06} &
        \textbf{-0.88$\pm$0.04} & \textbf{0.81$\pm$0.01} & \textbf{0.70$\pm$0.03}   \\  
        \midrule
    %\multicolumn{13}{c}{\textbf{Text to Image}} \\ \midrule
     %\hline
        \multirow{7}{*}{\rotatebox{90}{\textbf{Text to Image}}} 
        & Deterministic & \underline{-0.90$\pm$0.00} & \textbf{0.80$\pm$0.00} & \textbf{0.73$\pm$0.00} & 
        \underline{-0.80$\pm$0.00} & 0.76$\pm$0.00 & \underline{0.61$\pm$0.00} & 
        0.60$\pm$0.00 & 0.12$\pm$0.00 & -0.06$\pm$0.00 &
        -0.30$\pm$0.00 & 0.17$\pm$0.00 & 0.05$\pm$0.00  \\  
        &TTDA & 0.08$\pm$0.06 & 0.02$\pm$0.06 & 0.01$\pm$0.01 &
        -0.61$\pm$0.06 & 0.20$\pm$0.05 & 0.12$\pm$0.04 &
        -0.53$\pm$0.05 & \textbf{0.64$\pm$0.03} & 0.32$\pm$0.02 &
        0.04$\pm$0.01 & 0.04$\pm$0.00 & 0.00$\pm$0.02  \\  
        &PFE & -0.68$\pm$0.07 & 0.56$\pm$0.05 & 0.38$\pm$0.06 & 
        0.33$\pm$0.04 & 0.52$\pm$0.02 &-0.16$\pm$0.02 & 
        -0.32$\pm$0.07 &0.34$\pm$0.02 & 0.09$\pm$0.02 &
        0.21$\pm$0.04 & 0.43$\pm$0.02 & -0.10$\pm$0.02  \\  
        &PCME & 0.18$\pm$0.08 & 0.42$\pm$0.02 & -0.07$\pm$0.04 & 
        0.86$\pm$0.04 & \textbf{0.84$\pm$0.03} & -0.74$\pm$0.05 & 
        0.57$\pm$0.04 & 0.05$\pm$0.00 & -0.03$\pm$0.00 &
        0.72$\pm$0.05 & 0.45$\pm$0.03 & -0.29$\pm$0.03  \\  
        &PCME++ & -0.13$\pm$0.04 & 0.06$\pm$0.03 & 0.01$\pm$0.00 & 
        0.02$\pm$0.07 & 0.38$\pm$0.02 & 0.01$\pm$0.03 & 
        -0.28$\pm$0.05 & 0.02$\pm$0.01 & 0.01$\pm$0.00 &
        0.12$\pm$0.07 & \underline{0.47$\pm$0.02} & -0.06$\pm$0.04  \\  
        &ProbVLM & -0.54$\pm$0.03 & 0.68$\pm$0.07 & 0.34$\pm$0.03 & 
        0.09$\pm$0.02 & 0.11$\pm$0.04 & -0.01$\pm$0.00 & 
        \textbf{-0.92$\pm$0.03} & 0.52$\pm$0.05 & \underline{0.48$\pm$0.04} &
        \underline{-0.60$\pm$0.03} & 0.16$\pm$0.06 & \underline{0.10$\pm$0.03}  \\  
        &GroVE & \textbf{-0.92$\pm$0.04} & \underline{0.74$\pm$0.04} & \underline{0.66$\pm$0.04} & 
        \textbf{-0.81$\pm$0.02} & \underline{0.81$\pm$0.01} & \textbf{0.65$\pm$0.02} & 
        \underline{-0.78$\pm$0.07} & \underline{0.60$\pm$0.02} & \textbf{0.49$\pm$0.05} &
        \textbf{-0.62$\pm$0.08} & \textbf{0.66$\pm$0.04} & \textbf{0.43$\pm$0.06} \\  
    
     \bottomrule
        
    \end{tabular}
    \end{adjustbox}
    %\medskip
    \caption{ \textbf{Uncertainty calibration for cross-modal retrieval using CLIP.} GroVE demonstrates superior performance in uncertainty calibration in the majority of cases compared to baseline models. The best scores are highlighted in bold and the second-best scores are underlined.
    }
    \label{tab:clip_unc_results}
\end{table*}

\textbf{Uncertainty Quantification.} When an embedding $\mathbf{z}_*$ belongs to an ambiguous input, the uncertainty associated with the posterior distribution $p(\mathbf{x}_*|\mathbf{z}_*, \mathbf{z}_\mathcal{M})$ increases. 
This uncertainty is propagated to the predictive distribution, which can be written as:
$p(\mathbf{g}_*|\mathbf{z}_*, \mathbf{z}_\mathcal{M}) = \int p(\mathbf{g}_*|\mathbf{x}_*)p(\mathbf{x}_*|\mathbf{z}_*, \mathbf{z}_\mathcal{M})\,d\mathbf{x}_*$. 
Here, $p(\mathbf{g}_*|\mathbf{x}_*)$ is a multivariate Gaussian distribution that describes the function values at the fixed point $\mathbf{x}_*$. Thus, a large uncertainty in $\mathbf{x}_*$ increases the variance of the predictive distribution $p(\mathbf{g}_*|\mathbf{z}_*, \mathbf{z}_\mathcal{M})$. The uncertainty is captured by $\boldsymbol{\hat{\Sigma}}_*$, which accounts for variance contributions from both the latent space uncertainty and inherent noise in the model’s predictions. The final uncertainty is obtained by averaging the uncertainty values across all dimensions in $\boldsymbol{\hat{\Sigma}}_*$.
%which accounts for both the uncertainty in $\mathbf{x}_*$ and the GP's intrinsic uncertainty in the function values at $\mathbf{x}_*$.




\begin{table*}[h]
    \begin{adjustbox}{width=2\columnwidth, center}
    \centering
\begin{tabular}{@{}p{-1cm}lcccccccccccc@{}}
    \toprule
&\multirow{2}{*}{Method} & \multicolumn{3}{c}{Flickr} & \multicolumn{3}{c}{COCO} & \multicolumn{3}{c}{CUB} & \multicolumn{3}{c}{Flowers} \\ \cmidrule(lr){3-5} \cmidrule(lr){6-8} \cmidrule(lr){9-11} \cmidrule(lr){12-14} 
 & & $S \downarrow$ & $R^2 \uparrow$ & $-SR^2 \uparrow$ & $S \downarrow$ & $R^2 \uparrow$ & $-SR^2 \uparrow$ & $S \downarrow$ & $R^2 \uparrow$ & $-SR^2 \uparrow$ & $S \downarrow$ & $R^2 \uparrow$ & $-SR^2 \uparrow$ \\ \midrule
 \multirow{7}{*}{\rotatebox{90}{\textbf{Image to Text}}}
%\multicolumn{13}{c}{\textbf{Image to Text}} \\ \midrule
        & Deterministic & 
        \underline{-0.70$\pm$0.00} & \textbf{0.78$\pm$0.00} & \textbf{0.55$\pm$0.00} &
        \underline{-0.80$\pm$0.00} & \textbf{0.84$\pm$0.00} & \underline{0.67$\pm$0.00} &
        0.50$\pm$0.00 & 0.13$\pm$0.00 & -0.07$\pm$0.00 &
        -0.20$\pm$0.00 & 0.05$\pm$0.00 & 0.01$\pm$0.00 \\
        &TTDA & 
        -0.68$\pm$0.04 & 0.27$\pm$0.03 & 0.19$\pm$0.02 &
        -0.72$\pm$0.05 & 0.48$\pm$0.04 & 0.32$\pm$0.04 &
        -0.70$\pm$0.05 & 0.50$\pm$0.02 & 0.33$\pm$0.03 &
        -0.63$\pm$0.04 & 0.24$\pm$0.03 & 0.13$\pm$0.02 \\
        &PFE & 
        0.12$\pm$0.06 & 0.37$\pm$0.02 & -0.04$\pm$0.02 & 
        0.04$\pm$0.00 & 0.32$\pm$0.05 & 0.00$\pm$0.00 & 
        0.56$\pm$0.06 & 0.53$\pm$0.04 & 0.32$\pm$0.03 &
        0.13$\pm$0.04 & 0.02$\pm$0.03 & -0.01$\pm$0.03 \\        
        &PCME & 
        -0.31$\pm$0.06 & 0.17$\pm$0.04 & 0.05$\pm$0.03 & 
        -0.62$\pm$0.03 & 0.24$\pm$0.02 & 0.14$\pm$0.02 & 
        -0.64$\pm$0.03 & \textbf{0.63$\pm$0.03} & 0.38$\pm$0.03 &
        0.08$\pm$0.03 & 0.25$\pm$0.04 & -0.03$\pm$0.02 \\          
        &PCME++ & 
        -0.68$\pm$0.03 & 0.26$\pm$0.03 & 0.18$\pm$0.03 & 
        -0.69$\pm$0.04 & 0.50$\pm$0.04 & 0.34$\pm$0.03 & 
        \underline{-0.71$\pm$0.04} & 0.57$\pm$0.03 & 0.40$\pm$0.03 &
        \underline{-0.69$\pm$0.06} & 0.53$\pm$0.02 & 0.37$\pm$0.02 \\  
        &ProbVLM & 
        0.03$\pm$0.07 & 0.48$\pm$0.02 & 0.02$\pm$0.02 & 
        -0.61$\pm$0.03 & 0.50$\pm$0.04 & 0.30$\pm$0.03 & 
        -0.68$\pm$0.06 & \underline{0.60$\pm$0.03} & \underline{0.42$\pm$0.04} &
        -0.67$\pm$0.00 & \underline{0.65$\pm$0.02} & \underline{0.46$\pm$0.01} \\  
        &GroVE & 
        \textbf{-0.72$\pm$0.03} & \underline{0.74$\pm$0.02} & \underline{0.51$\pm$0.03} & 
        \textbf{-0.93$\pm$0.05} & \underline{0.76$\pm$0.03} & \textbf{0.68$\pm$0.03} & \textbf{-0.89$\pm$0.04} & \underline{0.60$\pm$0.04} & \textbf{0.54$\pm$0.02} &
        \textbf{-0.72$\pm$0.07} & \textbf{0.72$\pm$0.06} & \textbf{0.50$\pm$0.05}  \\  
    
     \midrule
     %\multicolumn{13}{c}{\textbf{Text to Image}} \\ \midrule
        \multirow{7}{*}{\rotatebox{90}{\textbf{Text to Image}}} 
        & Deterministic & 
        \underline{-0.90$\pm$0.00} & \underline{0.88$\pm$0.00} & \underline{0.79$\pm$0.00} &
        \textbf{-0.90$\pm$0.00} & \textbf{0.88$\pm$0.00} & \textbf{0.80$\pm$0.00} &
        0.40$\pm$0.00 & 0.06$\pm$0.00 & 0.02$\pm$0.00 &
        -0.10$\pm$0.00 & 0.00$\pm$0.00 & 0.00$\pm$0.00 \\  
        &TTDA & 
        -0.37$\pm$0.03 & 0.35$\pm$0.04 & 0.14$\pm$0.03 &
        0.41$\pm$0.06 &  0.00$\pm$0.01 & 0.00$\pm$0.03 & 
        -0.68$\pm$0.05 & 0.48$\pm$0.06 & 0.34$\pm$0.05 &
        0.09$\pm$0.03 & \underline{0.43$\pm$0.02} & -0.04$\pm$0.04 \\  
        &PFE & 
        -0.58$\pm$0.04 & 0.50$\pm$0.03 & 0.30$\pm$0.04 & 
        0.11$\pm$0.05 & 0.15$\pm$0.04 & -0.02$\pm$0.02 & 
        \underline{-0.78$\pm$0.03} & \underline{0.58$\pm$0.02} & \underline{0.47$\pm$0.02} &
        -0.23$\pm$0.06 & 0.01$\pm$0.03 & 0.00$\pm$0.03 \\  
        &PCME & 
        -0.12$\pm$0.04 & 0.50$\pm$0.02 & 0.05$\pm$0.01 & 
        0.62$\pm$0.03 & 0.42$\pm$0.06 & -0.25$\pm$0.03 & 
        -0.68$\pm$0.04 & \underline{0.58$\pm$0.03} & 0.41$\pm$0.02 &
        \underline{-0.31$\pm$0.03} & 0.26$\pm$0.03 & \underline{0.08$\pm$0.04} \\  
        &PCME++& 
        -0.72$\pm$0.06 & 0.30$\pm$0.04 & 0.21$\pm$0.04 & 
        -0.48$\pm$0.03 & 0.31$\pm$0.02 & 0.15$\pm$0.03 &
        -0.12$\pm$0.08 & 0.00$\pm$0.04 & 0.00$\pm$0.02 &
        -0.20$\pm$0.07 & 0.06$\pm$0.06 & 0.01$\pm$0.03 \\  
        &ProbVLM & 
        -0.56$\pm$0.04 & 0.50$\pm$0.03 & 0.31$\pm$0.03 & 
        -0.12$\pm$0.05 & \underline{0.48$\pm$0.04} & 0.05$\pm$0.04 & 
        -0.43$\pm$0.03 & 0.50$\pm$0.02 & 0.18$\pm$0.03 &
        0.38$\pm$0.03 & 0.02$\pm$0.04 & -0.01$\pm$0.04 \\  
        &GroVE & 
        \textbf{-0.92$\pm$0.04} & \textbf{0.90$\pm$0.03} & \textbf{0.81$\pm$0.04} & \underline{-0.62$\pm$0.03} & 0.36$\pm$0.06 & \underline{0.22$\pm$0.04} & \textbf{-0.89$\pm$0.02} & \textbf{0.75$\pm$0.03} & \textbf{0.74$\pm$0.03} &
        \textbf{-0.73$\pm$0.03} & \textbf{0.62$\pm$0.04} & \textbf{0.44$\pm$0.02} \\  
    
     \bottomrule
        
    \end{tabular}
    \end{adjustbox}
    %\medskip
    \caption{\textbf{Uncertainty calibration for cross-modal retrieval using BLIP.} GroVE demonstrates superior performance in uncertainty calibration in the majority of cases compared to the baseline models. The best scores are highlighted in bold and the second-best scores are underlined.
    }
    \label{tab:blip_unc_results}
\end{table*}





\section{Experiments} \label{sec:expts}
%We present the experimental setup and results for GroVE, evaluating on three main areas: uncertainty calibration, zero-shot calibration, and active learning. Additionally, we perform an ablation study to analyze the impact of GroVE's components.
\subsection{Experimental Setup}
%In this section, we outline the baselines, datasets, quantitative metrics, and implementation details for the evaluation of GroVE.
\textbf{Baselines and Datasets.} We evaluate GroVE against six baseline methods: Deterministic, TTDA~\citep{ayhan2018test}, PFE~\citep{shi2019probabilistic}, PCME~\citep{chun2021probabilistic}, PCME++~\citep{chun2023improved}, and ProbVLM~\citep{upadhyay2023probvlm}, using two VLMs—CLIP~\citep{radford2021learning} and BLIP~\citep{li2022blip}—with a focus on uncertainty calibration for downstream tasks. In the deterministic approach, uncertainty is quantified by the cosine distance between the image and text embeddings derived from the VLM. %In TTDA, test-time data augmentation is applied to both images and texts to quantify their uncertainty. %For images, augmentation techniques include random resized cropping and horizontal flipping, while for texts, random word masking is used. Uncertainty is quantified by measuring the variance in the predicted embeddings across multiple forward passes with different augmentations.
While PFE, PCME and PCME++ are methods to learn probabilistic embeddings for pre-training VLMs, we follow \cite{upadhyay2023probvlm}, and adapt them to work in a post-hoc manner. %ProbVLM~\citep{upadhyay2023probvlm} is a post-hoc approach that trains two adapter modules to convert the deterministic embeddings from the VLMs to probabilistic ones. 
The similarity ranking between probabilistic image and text embeddings is determined by the Wasserstein distance, with embeddings ranked based on the increasing distance, while for the deterministic embeddings, the cosine similarity is used.
The implementation details for these methods are provided in Appendix~\ref{sec:baselines}. 
The methods are evaluated on MS-COCO~\citep{lin2014microsoft}, Flickr30k~\citep{young2014image}, CUB-200-2011~\citep{wah2011caltech} and Oxford Flowers 102~\citep{nilsback2008automated} for cross-modal retrieval, and VQA2.0~\citep{goyal2017making} for visual question answering. The captions for the CUB and Flowers datasets were obtained from \cite{reed2016learning}.

\textbf{Evaluation Metrics.} Cross-modal retrieval is evaluated using the Recall@1 metric. For evaluating uncertainty calibration, we adopt the metrics used in \cite{upadhyay2023probvlm}: the Spearman rank correlation ($S$) between the uncertainty levels and Recall@1, the $R^2$ value of the regression between the uncertainty levels and the Recall@1 performance, and their product $-SR^2$. For an ideal model, the Recall@1 score should decrease with increasing uncertainty, resulting in an $S$ value of $-1$. A higher $R^2$ score indicates that with increasing uncertainty levels, the model's performance declines linearly. A higher $-SR^2$ score implies better uncertainty calibration, reflecting both a strong negative correlation and a monotonic decrease in performance with increasing uncertainty. 
VQA is evaluated using the soft voting accuracy of 10 human-annotated answers~\citep{goyal2017making}.
Calibration is evaluated by the Expected Calibration Error (ECE) score between the model's confidence and the soft voting accuracy. Model confidence is computed by first predicting an uncertainty score $u(a)$ for each candidate answer $a$, and then applying a softmax function over these uncertainty scores. The model confidence is given by $\text{conf}(a)= 1 - \text{softmax}(u(a))$.


\textbf{Implementation details.}
The experiments on CLIP were conducted using the ViT-B/32 model as the image encoder, with $D=512$. For BLIP, we adopt the ViT-B architecture as the image encoder. We trained the GPs with a latent space dimension of $Q=5$ for MS-COCO, Flickr30k and VQA2.0, and $Q=10$ for CUB and Flowers, alongside trade-off parameters $\lambda_1=0.01$ and $\lambda_2=400$ and 250 inducing points, determined through grid search.
The models were implemented with GPyTorch~\citep{gardner2018gpytorch}, and 
trained for 200 epochs using the Adam optimizer with a learning rate of $10^{-5}$ and a batch size of 64. The detailed implementation, including data processing and hyper-parameter tuning, is provided in Appendices~\ref{sec:app_dataset} and~\ref{sec:app_hyper}, respectively.
  
\subsection{Uncertainty Calibration in Cross-modal Retrieval} \label{sec:unc_calibration}
\textbf{Quantitative Results.} The uncertainty calibration results for CLIP and BLIP are provided in Table~\ref{tab:clip_unc_results} and Table~\ref{tab:blip_unc_results}, respectively. We observe that GroVE demonstrates superior performance across all four datasets in both image-to-text and text-to-image retrieval tasks, outperforming other methods in most cases.  
A high $-SR^2$ value for GroVE indicates that the model maintains strong performance when uncertainty is low, and the decline in performance is well-aligned with increasing uncertainty scores, indicating effective uncertainty calibration. 
Interestingly, the Deterministic baseline also performs competitively on the Flickr30k and MS-COCO datasets. This is because the VLMs were trained on datasets with common real-world objects, well-represented in these datasets, allowing the deterministic approach to benefit from familiar image-text pair contexts.
However, on fine-grained datasets like CUB and Flowers, which are less represented in the training data, it exhibits a noticeable drop in performance. In these cases, the probabilistic methods outperform the deterministic approach, with GroVE consistently leading across both common object and fine-grained datasets. 

\textbf{Qualitative Results. }Given a query image from MS-COCO, we obtain its probabilistic embedding using GroVE. Using the distribution of this embedding, we compute the likelihood of each image in the Flickr30k dataset. Figure~\ref{fig:likelihood} shows a t-SNE plot of the mean embeddings on Flickr30k, colored by likelihood scores. The query image depicts children playing on a field. We observe that the images with the highest likelihood scores share similar semantic content, such as scenes of people playing in fields. In contrast, images with lower likelihood values (close to 0.0) show little to no semantic or visual similarity to the query. 
Additional results containing the retrieval performance, zero-shot performance, calibration plots and qualitative analysis are provided in Appendices~\ref{sec:app_cross-modal} and~\ref{sec:app_zs}. 

\begin{figure}[tb]
    \centering
    \includegraphics[width=0.7\linewidth]{images/likelihood.pdf}
    \caption{Given a probabilistic query image embedding from COCO, the plot shows a t-SNE visualization of Flickr30k embeddings, colored by their likelihood of belonging to the query distribution. Sample images are shown in colored boxes, where images with high likelihoods share similar semantic and visual content to the query.}
    \label{fig:likelihood}
\end{figure}
\subsection{Active Learning}

\begin{figure}[h]
\hspace{-15pt}
    \centering
    \begin{minipage}{0.2\columnwidth} 
        \centering
        \rotatebox{90}{\includegraphics[height=0.7\textwidth]{uai2025-template/images/al_legend.pdf}} % Your legend image
    \end{minipage}%
    \begin{minipage}{0.65\columnwidth} % Adjust width as needed
        \centering
        \includegraphics[width=\columnwidth]{uai2025-template/images/active_learning_i2t_plot.pdf} \\
        \includegraphics[width=\columnwidth]{uai2025-template/images/active_learning_t2i_plot.pdf}
    \end{minipage}
        \caption{\textbf{Active Learning.} The results highlight GroVE's ability to effectively leverage uncertainty estimates to guide sample selection, outperforming the baselines on both image-to-text (top) and text-to-image (bottom) retrieval.}
    \label{fig:active_learning}
\end{figure}

The objective of this experiment is to fine-tune the CLIP model on the CUB dataset with limited labeled data. We estimate the uncertainty of image and text embeddings to identify the most uncertain samples from the unlabeled CUB dataset, which are labeled for fine-tuning. For methods using auxiliary models, we derive uncertainty estimates from models trained on COCO. We sample the top 500 uncertain samples at each step for fine-tuning with contrastive loss. A random sampling baseline is also included.
Figure~\ref{fig:active_learning} provides the Recall@1 scores achieved in relation to the number of samples used for fine-tuning the CLIP model. GroVE achieves consistently better performance compared to others, demonstrating that its uncertainty estimates effectively identify the informative samples for active learning.


\subsection{Uncertainty in Few-shot Setting}
In this experiment, we explore a practical scenario where labeled training data is scarce. To simulate this, we create a few-shot dataset by randomly selecting three images and their corresponding text descriptions from 150 classes of the CUB dataset as done by \cite{verma2021towards}. The probabilistic adapters were trained on this dataset using embeddings obtained from CLIP, and the uncertainty calibration was evaluated for cross-modal retrieval. Table~\ref{tab:few_shot} shows the $-SR^2$ scores obtained for the baselines and GroVE with different numbers of inducing points as well as exact GP models, where training and inference are performed without approximations~\citep{williams2006gaussian}. The results show that while the calibration performance improves as the number of inducing points increases, GroVE consistently outperforms the baselines in terms of calibration quality. The best performance was achieved with exact GP models and no approximation. However, GroVE is
computationally expensive compared to the neural network based approaches, with longer inference time as the number of inducing points increases.
%A comparison of the retrieval performance is provided in Appendix~\ref{sec:app_few_shot}.

\begin{table}[tb]
    \begin{adjustbox}{width=0.75\columnwidth, center}
    \centering
    \begin{tabular}{cccc}\\
    \toprule
    {Method}&{Image to Text} & {Text to Image} & {Latency (ms)} \\ \midrule 
     TTDA &  0.03$\pm$0.06 & 0.01$\pm$0.04 & 288.51\\
     PFE  & -0.29$\pm$0.02 & -0.22$\pm$0.03 & \underline{31.59}\\
     PCME &  0.04$\pm$0.03 & 0.14$\pm$0.02 & 31.60\\
     PCME++ &  0.27$\pm$0.03 & 0.01$\pm$0.04 & \textbf{31.55}\\
     ProbVLM  & -0.12$\pm$0.04 & -0.43$\pm$0.04 & 32.80\\
     \midrule
     GroVE (M=50) & 0.24$\pm$0.03 & 0.22$\pm$0.03 & 47.62\\
     GroVE (M=150) & 0.36$\pm$0.04 & 0.31$\pm$0.02 & 142.85\\
     GroVE (M=250) &  0.35$\pm$0.03 &  0.31$\pm$0.03 & 392.16\\
     GroVE (exact GP)  & \textbf{0.39$\pm$0.03} & \textbf{0.36$\pm$0.02} & 1130.09 \\
     \bottomrule
    \end{tabular}
    \end{adjustbox}
     \caption{\textbf{Few-shot uncertainty calibration.}  GroVE outperforms other baselines, achieving superior uncertainty calibration in few-shot settings in terms of $-SR^2$ ($\uparrow$).}
    \label{tab:few_shot}
\end{table}

\subsection{Uncertainty Calibration for VQA}

Table~\ref{tab:vqa_expts} shows the accuracy and ECE scores obtained for VQA2.0 using BLIP as the VLM. All the baselines achieve similar accuracy scores, with deterministic achieving the best accuracy. When evaluated for confidence calibration, GroVE achieves the lowest ECE score.

\begin{table}[h]
    \begin{adjustbox}{width=0.55\columnwidth, center}
    \centering
    \begin{tabular}{ccc}\\
    \toprule
    Method & Accuracy $\uparrow$ & ECE $\downarrow$ \\ \midrule
     Deterministic & \textbf{78.20} & 0.56  \\
     TTDA  & \underline{77.67$\pm$2.23} & \underline{0.48$\pm$0.06} \\
     PFE  & 76.34$\pm$1.98 & 0.65$\pm$0.02\\
     PCME & 77.25$\pm$1.76 & 0.64$\pm$0.01\\
     PCME++ & 77.53$\pm$1.71 & 0.64$\pm$0.02 \\
     ProbVLM & 76.66$\pm$1.13 & 0.69$\pm$0.01\\
     GroVE & 77.48$\pm$2.15 & \textbf{0.24$\pm$0.04} \\
     \bottomrule
    \end{tabular}
    \end{adjustbox}
     \caption{\textbf{Results for VQA.} While all methods achieve similar accuracy (with the deterministic model performing best), GroVE reaches the best calibration performance in terms of ECE ($\downarrow$). }
    \label{tab:vqa_expts}
\end{table}

\subsection{Ablation Analysis} \label{sec:ablation}


\begin{table}[tb]
    \begin{adjustbox}{width=0.80\columnwidth, center}
    \centering
    \begin{tabular}{ccccc}\\
    \toprule
    \multirow{2}{*}{Kernel}& \multicolumn{2}{c}{Image to Text} & \multicolumn{2}{c}{Text to Image} \\ \cmidrule(lr){2-3} \cmidrule(lr){4-5}
      &  COCO & CUB & COCO & CUB\\ \midrule
     RBF & \textbf{0.79$\pm$0.02} & 0.46$\pm$0.06 & \textbf{0.65$\pm$0.02} & \textbf{0.49$\pm$0.05} \\
     Mat\'ern ($\nu=1.5$) & 0.27$\pm$0.03 & \textbf{0.47$\pm$0.05} & 0.41$\pm$0.04 & 0.22$\pm$0.04\\
     Mat\'ern ($\nu=2.5$) & 0.52$\pm$0.05 & 0.38$\pm$0.04 & 0.43$\pm$0.04 & 0.12$\pm$0.05\\
     Cosine Similarity & 0.46$\pm$0.04 & 0.39$\pm$0.03 & 0.35$\pm$0.03 & 0.30$\pm$0.02\\
     \bottomrule
    \end{tabular}
    \end{adjustbox}
     \caption{\textbf{Ablation on choice of GP kernel.}  GroVE achieves the best performance on MS-COCO and CUB-200-2011 with the RBF kernel.}
    \label{tab:ablation_kernels}
\end{table}



\begin{figure}
    \centering
    \includegraphics[width=0.48\linewidth]{images/latent_dim_plot.pdf} 
    \includegraphics[width=0.48\linewidth]{images/data_subset_plot.pdf}
    \caption{Ablation using MS-COCO: (i) \textbf{latent space dimension} (left). Low latent space dimensions result in loss of information, while higher dimensions result in performance degradation due to over-fitting. (ii) \textbf{dataset size for training} (right). GroVE achieves good performance with just 60\% of the total training dataset.}
    \label{fig:lat_dim_data_sub}
\end{figure}




\textbf{GP Kernel.} 
We evaluate the effect of the RBF, Mat\'ern ($\nu = 1.5$ and $2.5$, where $\nu$ is the smoothness parameter) and cosine similarity kernels on GroVE's performance on the MS-COCO and CUB data. From Table~\ref{tab:ablation_kernels}, the RBF kernel achieves the best performance in three of the four settings, with improvements of up to 53\%, and is within 0.01 of the best kernel in the remaining one (image-to-text retrieval on CUB). %This suggests that the RBF kernel's ability to capture smooth variations in the data aligns well with the uncertainty quantification tasks for both general and fine-grained datasets. 
The kernels are defined in Appendix~\ref{sec:def_kernel}.

\textbf{Latent Space Dimension. } We investigate the influence of the latent space dimension $Q$ on GroVE's performance using the MS-COCO dataset. Figure~\ref{fig:lat_dim_data_sub} (left) presents the $-SR^2$ scores for various values of $Q$. Low values of $Q$ lead to information loss, which compromises the model's ability to capture complex patterns in the data. Conversely, high values of $Q$ result in overfitting and make the model more challenging to optimize, resulting in a performance decline. The optimal performance was observed when $Q=5$.


\textbf{Dataset Size. } We study the impact of the dataset size on GroVE's performance by training it on various fractions of the COCO training dataset. As shown in Figure~\ref{fig:lat_dim_data_sub} (right), the model achieves good performance when trained on 60\% of the full dataset. While the uncertainty calibration performance for text-to-image retrieval plateaus beyond this point, the performance for image-to-text retrieval continues to improve almost linearly as more data is utilized.

\begin{figure}
    \centering
    \includegraphics[height=4.5cm, width=0.52\linewidth]{images/weighting_plot.pdf} 
    \includegraphics[height=4.5cm, width=0.47\linewidth]{images/img_masking_plot.pdf}
    \caption{Ablation using MS-COCO: (i) \textbf{trade-off parameter} (left). KL-divergence improves uncertainty calibration with optimal performance at $\lambda_2=400$, with $\lambda_1=0.01$. (ii) \textbf{noisy data} (right). With increasing amount of noise in the input data, GroVE predicts higher uncertainty.}
    \label{fig:reg_masking}
\end{figure}

\textbf{Cross-modal Alignment. } We compare GroVE's KL-divergence-based alignment loss with the MSE loss-based regularization used in \cite{song2017multimodal}. The authors use GPLVM for cross-modal retrieval, regularizing the latent space with the loss function $\|k_I - S\|^2 + \|k_T - S\|^2$, where $k_I$ and $k_T$ are the GP covariance matrices, and $S$ is the latent space similarity matrix. For comparison, we replace $\mathcal{L}_{KL}$ in our method with the MSE loss and experiment with different values of the trade-off parameter $\lambda_2$, maintaining $\lambda_1 = 0.01$. As shown in Figure~\ref{fig:reg_masking} (left), the KL-divergence alignment loss improves uncertainty calibration performance by up to 23\%, with the best performance at $\lambda_2 = 400$.
Additionally, we evaluate the cross-modal alignment KL-divergence loss against other widely-used probabilistic distance metrics: Jensen-Shannon (JS) divergence and Wasserstein-2 distance. The results in Table~\ref{tab:ablation_kl} indicate that while all metrics perform similarly, KL-divergence offers a slight edge. %The distance metrics are defined in Appendix~\ref{sec:def_dist}.




\begin{table}[]
    \begin{adjustbox}{width=0.80\columnwidth, center}
    \centering
    \begin{tabular}{ccccc}\\
    \toprule
    \multirow{2}{*}{Metric}& \multicolumn{2}{c}{Image to Text} & \multicolumn{2}{c}{Text to Image} \\ \cmidrule(lr){2-3} \cmidrule(lr){4-5}
      &  COCO & CUB & COCO & CUB\\ \midrule
     KL-Divergence & \textbf{0.79$\pm$0.02} & 0.46$\pm$0.06 & \textbf{0.65$\pm$0.02} & \textbf{0.49$\pm$0.05} \\
     JS-Divergence & 0.70$\pm$0.04 & \textbf{0.48$\pm$0.02} & 0.59$\pm$0.03 & 0.44$\pm$0.05\\
     Wasserstein-2 & 0.59$\pm$0.04 & 0.39$\pm$0.04 & 0.60$\pm$0.02 & 0.43$\pm$0.04\\
     \bottomrule
    \end{tabular}
    \end{adjustbox}
     \caption{\textbf{Ablation on probabilistic distance metric.} GroVE performs better for cross-modal alignment using KL-Divergence compared to other metrics.}
    \label{tab:ablation_kl}
\end{table}


\textbf{Noisy Data.} To evaluate the performance of GroVE against noisy inputs, we systematically introduce increasing levels of masking to both the input images and texts. This analysis employs several CLIP image encoder backbones, including ViT-B/32, ViT-B/16, and ResNet50, along with CLIP's text encoder. The results, presented in Figure~\ref{fig:reg_masking} (right), indicate that as the noise level increases, the uncertainty predicted by GroVE rises steadily as desired.

\section{Conclusion}
This paper introduces GroVE, a post-hoc approach for generating probabilistic embeddings from frozen, pre-trained VLMs to model input data ambiguities. GroVE leverages the GPLVM framework, utilizing GP models to learn a shared, low-dimensional latent space that aligns visual and textual representations. By mapping into this latent space, the GP models generate probabilistic embeddings that provide a measure of uncertainty in the predictions. GroVE demonstrates state-of-the-art performance in uncertainty calibration for cross-modal retrieval, active learning and VQA. One limitation of GroVE is that it is computationally expensive compared to the neural network-based methods. In latency-sensitive scenarios, neural network-based stochastic models like Neural Processes~\citep{garnelo2018neural} offer a viable alternative to GPs. Future work will focus on assessing their uncertainty calibration performance for VLMs.




% References
\bibliography{uai2025-template}

\newpage

\onecolumn

\title{Probabilistic Embeddings for Frozen Vision-Language Models: Uncertainty
Quantification with Gaussian Process Latent Variable Models\\(Supplementary Material)}
\maketitle


\appendix


\section{Additional Implementation Details} \label{sec:app_implementation}

This section provides details on the data processing steps for training both the baseline models and GroVE, implementation details for each baseline, and the hyper-parameter tuning procedure applied for GroVE.

\subsection{Datasets} \label{sec:app_dataset}
For the experiments, we use the MS-COCO, Flickr30k, CUB-200-2011, and Oxford Flowers 102 datasets. 

\textbf{MS-COCO~\cite{lin2014microsoft}} is a widely used cross-modal retrieval dataset that includes 123,287 images, each annotated with 5 captions describing common objects. The training set comprises 113,287 images, while both the validation and test sets contain 5,000 images each. Different papers apply varying evaluation protocols on the 5,000 test images in the COCO dataset. Some cross-modal retrieval papers report results on the full 5,000 test set, while others use 1,000 unique test images, averaging results over 5 random splits. In our study, we follow the former approach, presenting results based on the entire 5,000 test set.

\textbf{Flickr30k~\cite{young2014image}} is a widely used cross-modal retrieval dataset comprising 31,783 images, each image annotated with 5 captions describing common objects. The dataset is split into 29,783 training images, with 1,000 images each in the validation and test sets.

\textbf{CUB-200-2011~\cite{wah2011caltech}} is a fine-grained bird species dataset comprising 11,788 images across 200 categories, with each image paired with 10 captions sourced from \cite{reed2016learning}. Following the split protocol in \cite{chun2021probabilistic}, the dataset includes 7,067 training images, 1,754 validation images, and 2,967 test images.

\textbf{Oxford Flowers 102~\cite{nilsback2008automated}} is a fine-grained flowers dataset comprising 8,189 images across 102 categories, with each image paired with 10 captions sourced from \cite{reed2016learning}. Following the split protocol in \cite{upadhyay2023probvlm}, the dataset includes 7,034 training images, 750 validation and 805 test images.
% NOTE(review): 7,034 + 750 + 805 = 8,589, which does not match the stated total of 8,189 images -- verify the split sizes.


All images are resized to $224 \times 224$, suitable for input to VLMs. All methods are trained and evaluated using the same dataset splits for the comparison.

\subsection{Baseline methods} \label{sec:baselines}
In this section, we provide the implementation and training details for the baseline methods.

\textbf{TTDA~\cite{ayhan2018test}.} During inference, data augmentations are applied to the input, generating multiple variations to estimate prediction uncertainty. For each augmented version, a prediction is generated, and the variance across these predictions reflects the model's uncertainty. Image augmentations include random resized cropping and horizontal flipping (applied with a probability of 0.3), while text data undergoes random word masking with a 0.3 probability. The model is run for 10 passes on these augmentations, to obtain the image and text uncertainty.

\textbf{PFE~\cite{shi2019probabilistic}, PCME~\cite{chun2021probabilistic} and PCME++~\cite{chun2023improved}.} During training, we adapt these methods to process image and text embeddings derived from a frozen VLM. Following \cite{upadhyay2023probvlm}, the model architecture consists of two Multi-Layer Perceptrons (MLPs)—one for images and one for text. Each MLP has an input layer that reduces the embedding dimension to 256, a hidden layer of 256 units, and an output layer that maps from 256 back to the original embedding dimension.  We apply the respective loss functions to learn covariances for a Gaussian distribution, with mean values matching the VLM’s deterministic embeddings. Training is conducted for 200 epochs using the Adam optimizer with a learning rate of $10^{-8}$ and batch size of 64. The learning rate was fixed using a grid search over values $\{10^{-4}, 10^{-5}, 10^{-6}, 10^{-7}, 10^{-8}\}$.

\textbf{ProbVLM~\cite{upadhyay2023probvlm}.} We follow the training procedure outlined in the original paper. The model architecture consists of two MLPs—one for image embeddings and one for text embeddings—similar to previous methods. Training is conducted with the Adam optimizer for 100 epochs, using a learning rate of $10^{-4}$ and a batch size of 64.
 


\subsection{Hyper-parameter Tuning} \label{sec:app_hyper}

GroVE introduces the following hyper-parameters which were obtained using grid-search: latent space dimension $Q$, and the trade-off parameters $\lambda_1$ and $\lambda_2$. For $Q$, we evaluated values $Q \in \{2, 5, 10, 20, 50, 128, 256\}$. For the trade-off parameters, we used $\lambda_1 \in \{1, 0.1, 0.01, 0.001\}$, and $\lambda_2 \in \{0, 200, 400, 600, 800, 1000\}$.
Based on the grid-search results, the optimal setting for $Q$ was $Q=5$ for MS-COCO and Flickr30k, and $Q=10$ for CUB-200-2011 and Oxford Flowers 102. The trade-off parameters that achieved the best performance were $\lambda_1 = 0.01$ and $\lambda_2 = 400$. The number of inducing points was selected from \{100, 150, 200, 250, 300, 350\}, with model performance plateauing beyond 250 points.
Finally, the learning rate of $10^{-5}$ was selected based on a grid-search over values $\{10^{-1}, 10^{-2}, 10^{-3}, 10^{-4}, 10^{-5}, 10^{-6}\}$.

\section{Definitions} \label{sec:definitions}
This section presents the definitions of the GP kernels and the probabilistic distance metrics used in the ablation study.

\subsection{GP Kernels} \label{sec:def_kernel}
In Table~\ref{tab:ablation_kernels}, we presented the results of the ablation study on the choice of GP kernel. In this section, we provide the definitions and formulas for the kernels used.

\textbf{Radial Basis Function (RBF).} The RBF kernel, also known as the Gaussian kernel, is a popular choice in GPs. It assumes that closer data points in input space have higher similarity. The RBF kernel between two points $\mathbf{x_i}$ and $\mathbf{x_j}$ is defined as:
\begin{equation}
    k(\mathbf{x_i}, \mathbf{x_j}) = \exp \left(- \frac{\| \mathbf{x_i} - \mathbf{x_j}\|^2}{2 l^2} \right),
\end{equation}
where $l$ is the length scale parameter, controlling how quickly the similarity decreases with distance in input space. 

\textbf{Mat\'ern.} The Matérn kernel generalizes the RBF kernel, defined as
\begin{equation}
    k(\mathbf{x_i}, \mathbf{x_j}) =   \frac{1}{\Gamma(\nu)2^{\nu-1}}\Bigg(
\frac{\sqrt{2\nu}}{l} \|\mathbf{x_i} - \mathbf{x_j}\|
\Bigg)^\nu K_\nu\Bigg(
\frac{\sqrt{2\nu}}{l} \|\mathbf{x_i} - \mathbf{x_j}\|\Bigg)
\end{equation}
where $\nu$ controls the smoothness of the resulting function, $l$ is the length scale parameter, $\Gamma$ is the Gamma function, and $K_\nu$ is the modified Bessel function of the second kind. 

\textbf{Cosine Similarity.} This is a linear kernel with normalized inputs, and is defined as:
\begin{equation}
    k(\mathbf{x_i}, \mathbf{x_j}) = \frac{\mathbf{x_i}^T\mathbf{x_j}}{\|\mathbf{x_i}\|\|\mathbf{x_j}\|}
\end{equation}

\subsection{Probabilistic Distances} \label{sec:def_dist}
Sec.~\ref{sec:ablation} presented an ablation study on the choice of the probabilistic distance metric for cross-modal alignment (see Table~\ref{tab:ablation_kl}). The definitions of the probabilistic distance metrics for two multivariate Gaussians $p = \mathcal{N}(\hat{\boldsymbol{\mu}}_\mathcal{I}, \hat{\boldsymbol{\Sigma}}_\mathcal{I})$ and $q = \mathcal{N}(\hat{\boldsymbol{\mu}}_\mathcal{T}, \hat{\boldsymbol{\Sigma}}_\mathcal{T})$ are as follows:

\textbf{Kullback--Leibler (KL) Divergence.} The KL divergence quantifies the difference between two probability distributions, and is defined as~\cite{duchi2007derivations}:
\begin{equation} 
    \begin{split}
    D_{KL} (p\|q) = {} &
    %D_{KL}(\mathcal{N}(\hat{\boldsymbol{\mu}}_\mathcal{T}, \hat{\boldsymbol{\Sigma}}_\mathcal{T}) \| \mathcal{N}(\hat{\boldsymbol{\mu}}_\mathcal{I}, \hat{\boldsymbol{\Sigma}}_\mathcal{I})) \\
     \frac{1}{2} \Bigg[ \text{tr}(\hat{\boldsymbol{\Sigma}}_\mathcal{T}^{-1} \hat{\boldsymbol{\Sigma}}_\mathcal{I}) + (\hat{\boldsymbol{\mu}}_\mathcal{T} - \hat{\boldsymbol{\mu}}_\mathcal{I})^{T} \hat{\boldsymbol{\Sigma}}_\mathcal{T}^{-1} (\hat{\boldsymbol{\mu}}_\mathcal{T} - \hat{\boldsymbol{\mu}}_\mathcal{I}) \\
    & - D + \log \left( \frac{\det(\hat{\boldsymbol{\Sigma}}_\mathcal{T})}{\det(\hat{\boldsymbol{\Sigma}}_\mathcal{I})} \right) \Bigg],
    \end{split}
    %\frac{1}{2} \Bigg[ \text{tr}(\hat{\boldsymbol{\Sigma}}_\mathcal{I}^{-1} \hat{\boldsymbol{\Sigma}}_\mathcal{T}) + (\hat{\boldsymbol{\mu}}_\mathcal{I} - \hat{\boldsymbol{\mu}}_\mathcal{T})^{T} \hat{\boldsymbol{\Sigma}}_\mathcal{I}^{-1} (\hat{\boldsymbol{\mu}}_\mathcal{I} - \hat{\boldsymbol{\mu}}_\mathcal{T}) \\
    %- D + \log \left( \frac{\det(\hat{\boldsymbol{\Sigma}}_\mathcal{I})}{\det(\hat{\boldsymbol{\Sigma}}_\mathcal{T})} \right) \Bigg]
\end{equation}
where $\text{tr}(.)$ is the trace and $\det(.)$ is the determinant of a matrix. 
Note that the KL divergence is asymmetric; thus, we calculate the total cross-alignment loss as the mean of the KL divergences in both directions (see Eq.~\ref{eq:kl}):
$\frac{1}{2}[D_{KL}(p \| q) + D_{KL}(q \| p)]$.


\textbf{Jensen-Shannon (JS) Divergence.} The JS Divergence is obtained by averaging the KL divergences between each distribution and the average distribution. The JS divergence is defined as:
\begin{equation}
    D_{JS}(p\|q) = \frac{1}{2} \left(D_{KL}(p \| m) +  D_{KL}(q \| m)\right)
\end{equation}
where $m = \frac{1}{2}(p+q)$ is the mean distribution of $p$ and $q$. %, 

\textbf{Wasserstein-2 distance.} The Wasserstein-2 distance quantifies the cost of transforming one distribution into another. This is defined as:
\begin{equation}
    \begin{split}
    W_2^2(p,q) = {} & \|\hat{\boldsymbol{\mu}}_\mathcal{I} - \hat{\boldsymbol{\mu}}_\mathcal{T}\|^2 \\
    & + \text{tr}\left(\hat{\boldsymbol{\Sigma}}_\mathcal{I} + \hat{\boldsymbol{\Sigma}}_\mathcal{T} - 2\left(\hat{\boldsymbol{\Sigma}}_\mathcal{I}^{1/2} \, \hat{\boldsymbol{\Sigma}}_\mathcal{T} \, \hat{\boldsymbol{\Sigma}}_\mathcal{I}^{1/2}\right)^{1/2}\right).
    \end{split}
\end{equation}



\section{Additional Results} \label{sec:add_results}
This section presents additional quantitative and qualitative results.

\subsection{Cross-modal Retrieval} \label{sec:app_cross-modal}

\subsubsection{Calibration plots}
Figure~\ref{fig:clip_calibration} and Figure~\ref{fig:blip_calibration} show the calibration plots for the CLIP and BLIP models, respectively. 
Calibration plots are obtained by binning the uncertainty values into bins, referred to as uncertainty levels, and computing Recall@1 for each bin. In the plots, GroVE maintains a more consistent alignment between decreasing uncertainty and increasing Recall@1.

\begin{figure*}[t]
    \centering
    \begin{tabular}{@{}c@{}c@{}c@{}c@{}c@{}}
        %& \multicolumn{1}{c}{\small Flickr} & \multicolumn{1}{c}{\small MS-COCO} & \multicolumn{1}{c}{\small CUB} & \multicolumn{1}{c}{\small Flowers}\\
        & Flickr30k & MS-COCO & CUB & Flowers \\
        \raisebox{1.2cm}{\rotatebox{90}{\centering Image to Text}} &
        \includegraphics[width=0.23\linewidth, height=4cm]{images/flickr_im2txt_clip.pdf} & 
        \includegraphics[width=0.23\linewidth, height=4cm]{images/coco_im2txt_clip.pdf} & 
        \includegraphics[width=0.23\linewidth, height=4cm]{images/cub_im2txt_clip.pdf} &
        \includegraphics[width=0.23\linewidth, height=4cm]{images/flowers_im2txt_clip.pdf}    \\    
        \raisebox{1.2cm}{\rotatebox{90}{\centering Text to Image}} &
        \includegraphics[width=0.23\linewidth, height=4cm]{images/flickr_txt2im_clip.pdf} & 
        \includegraphics[width=0.23\linewidth, height=4cm]{images/coco_txt2im_clip.pdf} & 
        \includegraphics[width=0.23\linewidth, height=4cm]{images/cub_txt2im_clip.pdf} &
        \includegraphics[width=0.23\linewidth, height=4cm]{images/flowers_txt2im_clip.pdf} 
    \end{tabular}
    \includegraphics[width=0.8\linewidth]{images/legend.pdf} 
    \caption{\textbf{Evaluation of uncertainty calibration} for embeddings obtained from CLIP on Image-to-Text (top) and Text-to-Image (bottom) retrieval tasks. For perfect calibration, the Recall@1 should show a monotonic decrease as uncertainty levels increase.
    GroVE exhibits a more consistent relationship between increasing uncertainty and performance degradation compared to the baseline methods.
    }
    \label{fig:clip_calibration}
\end{figure*}

\begin{figure*}[h]
    \centering
    \begin{tabular}{@{}c@{}c@{}c@{}c@{}c@{}}
        %& \multicolumn{1}{c}{\small Flickr} & \multicolumn{1}{c}{\small MS-COCO} & \multicolumn{1}{c}{\small CUB} & \multicolumn{1}{c}{\small Flowers}\\
        & Flickr & MS-COCO & CUB & Flowers \\
        \raisebox{1.2cm}{\rotatebox{90}{\small Image to Text}} &
        \includegraphics[width=0.23\linewidth, height=4cm]{images/flickr_im2txt_blip.pdf} & 
        \includegraphics[width=0.23\linewidth, height=4cm]{images/coco_im2txt_blip.pdf} & 
        \includegraphics[width=0.23\linewidth, height=4cm]{images/cub_im2txt_blip.pdf} &
        \includegraphics[width=0.23\linewidth, height=4cm]{images/flowers_im2txt_blip.pdf}      
        \\ 
        \raisebox{1.2cm}{\rotatebox{90}{\small Text to Image}} &
        \includegraphics[width=0.23\linewidth, height=4cm]{images/flickr_txt2im_blip.pdf} & 
        \includegraphics[width=0.23\linewidth, height=4cm]{images/coco_txt2im_blip.pdf} & 
        \includegraphics[width=0.23\linewidth, height=4cm]{images/cub_txt2im_blip.pdf} &
        \includegraphics[width=0.23\linewidth, height=4cm]{images/flowers_txt2im_blip.pdf}    
    \end{tabular}
    \includegraphics[width=0.8\linewidth]{images/legend.pdf} % Adjust the width of the legend
    \caption{\textbf{Evaluation of uncertainty calibration} for embeddings obtained from BLIP on Image-to-Text (top) and Text-to-Image (bottom) retrieval tasks. For perfect calibration, the Recall@1 should show a monotonic decrease as uncertainty levels increase. GroVE exhibits a more consistent relationship between increasing uncertainty and performance degradation compared to the baseline methods.}
    \label{fig:blip_calibration}
\end{figure*}



\subsubsection{Retrieval performance}

Table~\ref{tab:retrieval} presents the Recall@1 scores for various baselines using CLIP. The score for the Deterministic baseline was computed by retrieving the nearest image/text embedding based on cosine similarity to the query text/image from the deterministic embeddings generated by the CLIP model. For the other baselines, retrieval was performed by selecting the image/text embedding with the minimum Wasserstein distance to the query, using the probabilistic image/text embeddings. Results show that GroVE achieves good performance on the fine-grained CUB and Flowers datasets, whereas the Deterministic baseline achieves the best scores on the MS-COCO and Flickr30k datasets. %While the probabilistic approaches show a drop in performance, their primary objective is to serve as an auxiliary method for estimating uncertainty, rather than maximizing retrieval accuracy.


\begin{table}[h]
    %\begin{adjustbox}{width=0.8\columnwidth, center}
    \centering
    \begin{tabular}{p{0.2cm}lcccc}
    \toprule
    &{Method} & Flickr & COCO & CUB & Flowers \\
    \midrule
        \multirow{7}{*}{\rotatebox{90}{\textbf{Image to Text}}} 
        &Deterministic & \textbf{0.801} & \textbf{0.715} & \textbf{0.532} & \underline{0.357}\\ 
        &TTDA & 0.423 & 0.326 & 0.133 & 0.289\\
        &PFE & 0.238 & 0.213 & 0.101 & 0.102\\
        &PCME & 0.392 & 0.246 & 0.129 & 0.134\\
        &PCME++ & 0.423 &0.397 & 0.124 & 0.111\\
        &ProbVLM & 0.491 & 0.303 & 0.136 & 0.245\\
        &GroVE & \underline{0.569} & \underline{0.512} & \underline{0.307} & \textbf{0.402}\\
    
     \midrule
     \multirow{7}{*}{\rotatebox{90}{\textbf{Text to Image}}} 
        &Deterministic & \textbf{0.543} & \textbf{0.515} & \underline{0.141} & \underline{0.109}\\ 
        &TTDA & 0.202 & 0.139 & 0.046 & 0.057\\
        &PFE & \underline{0.298} & 0.219 & 0.023 & 0.024\\
        &PCME & 0.092 & 0.102 & 0.099 & 0.029\\
        &PCME++ & 0.133 & 0.125 & 0.087 & 0.058\\
        &ProbVLM & 0.104 & 0.156 & 0.005 & 0.102\\
        &GroVE & 0.241 & \underline{0.288} & \textbf{0.343} & \textbf{0.379}\\
    
     \bottomrule
        
    \end{tabular}
    %\end{adjustbox}
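    \caption{\textbf{Retrieval performance using CLIP.} The table shows the Recall@1 scores obtained using the different baselines. GroVE achieves the best scores on the fine-grained datasets in all cases except image-to-text retrieval on CUB. The best scores are highlighted in bold and the second-best scores are underlined.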
    }
    \label{tab:retrieval}
\end{table}


\subsubsection{Qualitative Analysis}
A t-SNE visualization of the probabilistic embeddings from GroVE on a subset of the CUB dataset is provided in Figure~\ref{fig:latent}, where the uncertainty corresponds to the area of the embedding. 
The plot shows that images and texts with similar semantic content are clustered together, and the probabilistic embeddings are able to capture the uncertainty arising from the data ambiguities.

\begin{figure}[h]
    \centering
    \includegraphics[angle=-90,width=0.8\linewidth]{images/latent.pdf}
    %\caption{Visualisation of latent space learnt by }
    \caption{\textbf{t-SNE visualization of the probabilistic representations} generated by GroVE on a subset of the CUB-200-2011 dataset. Starting from deterministic embeddings provided by frozen VLMs, GroVE produces corresponding probabilistic representations that capture input ambiguities. }

    \label{fig:latent}
\end{figure}



\clearpage

\subsection{Zero-shot Uncertainty Calibration} \label{sec:app_zs}

%This section presents the results for the zero-shot uncertainty calibration experiment described in Sec.~\ref{sec:coco_zs}. 
We evaluate the generalization of uncertainty calibration across methods that use auxiliary models for probabilistic embeddings on out-of-distribution datasets. Two CLIP experiments are conducted by training on MS-COCO and CUB, then evaluating on their respective unseen datasets. 
Tables~\ref{tab:coco_zs} and~\ref{tab:cub_zs} present the $-SR^2$ scores for models trained on MS-COCO and CUB, respectively. The models trained on MS-COCO show
strong performance on Flickr30k due to its similarity to MS-COCO, thereby exhibiting better generalization. There is a drop in performance on the more fine-grained Flowers and CUB datasets, particularly for text-to-image retrieval. GroVE, however, demonstrates better generalization than the baseline methods in both experiments. 
%The results for CUB, and the calibration plots are provided in Appendix~\ref{sec:app_zs}.
%For models trained on CUB, generalization performance improves overall. Consistent with earlier findings, GroVE achieves superior calibration across unseen datasets. 
%The calibration plots for these experiments are provided in Appendix.~\ref{sec:add_results}.


\begin{table}[tb]
    %\begin{adjustbox}{width=0.8\columnwidth, center}
    \centering
    \begin{tabular}{p{0.2cm}lccc}
    \toprule
    &{Method} & Flickr & Flowers & CUB \\
    \midrule
        \multirow{5}{*}{\rotatebox{90}{\textbf{Image to Text}}} 
        &PFE & 0.01$\pm$0.03 & 0.38$\pm$0.04 & 0.02$\pm$0.02\\
        &PCME & 0.04$\pm$0.02 & 0.13$\pm$0.04 &0.09$\pm$0.06\\
        &PCME++ & 0.01$\pm$0.02 & \underline{0.48$\pm$0.03} & 0.03$\pm$0.02\\
        &ProbVLM & \underline{0.55$\pm$0.03}  & 0.19$\pm$0.04 & \underline{0.15$\pm$0.04}\\
        &GroVE & \textbf{0.74$\pm$0.03} & \textbf{0.69$\pm$0.02} & \textbf{0.41$\pm$0.03}\\
    
     \midrule
        \multirow{5}{*}{\rotatebox{90}{\textbf{Text to Image}}} 
        &PFE & \underline{0.41$\pm$0.03} & 0.02$\pm$0.03   & 0.04$\pm$0.01\\
        &PCME &  0.24$\pm$0.03 & -0.01$\pm$0.02 & \underline{0.02$\pm$0.03}\\
        &PCME++ & -0.43$\pm$0.03 &  \underline{0.05$\pm$0.03} & 0.03$\pm$0.02\\
        &ProbVLM & 0.14$\pm$0.05 & 0.01$\pm$0.03 & 0.00$\pm$0.01\\
        &GroVE & \textbf{0.42$\pm$0.02} & \textbf{0.09$\pm$0.03} & \textbf{0.04$\pm$0.02}\\
    
     \bottomrule
        
    \end{tabular}
    %\end{adjustbox}
    \caption{\textbf{Zero-shot uncertainty calibration - MS-COCO.} GroVE outperforms other baselines in most cases, achieving superior uncertainty calibration in zero-shot settings. The best scores are highlighted in bold and the second-best scores are underlined.
    }
    \label{tab:coco_zs}
\end{table}


\begin{table}[tb]
    %\begin{adjustbox}{width=0.8\columnwidth, center}
    \centering
    \begin{tabular}{p{0.2cm}lccc}
    \toprule
    &{Method} & Flickr & COCO & Flowers \\
    \midrule
        \multirow{5}{*}{\rotatebox{90}{\textbf{Image to Text}}} 
        &PFE & 0.00$\pm$0.04 & 0.02$\pm$0.03 & -0.13$\pm$0.03\\
        &PCME & \underline{0.46$\pm$0.02} & 0.01$\pm$0.05 &0.02$\pm$0.04\\
        &PCME++ & 0.40$\pm$0.03 & 0.10$\pm$0.02 & \underline{0.44$\pm$0.03}\\
        &ProbVLM & 0.15$\pm$0.02  & \underline{0.38$\pm$0.03} & 0.18$\pm$0.03\\
        &GroVE & \textbf{0.59$\pm$0.03} & \textbf{0.45$\pm$0.03} & \textbf{0.50$\pm$0.04}\\
    
     \midrule
        \multirow{5}{*}{\rotatebox{90}{\textbf{Text to Image}}} 
        &PFE & -0.01$\pm$0.02 & 0.31$\pm$0.04   & -0.12$\pm$0.03\\
        &PCME &  \underline{0.14$\pm$0.02} & -0.19$\pm$0.03 & 0.15$\pm$0.04\\
        &PCME++ & 0.13$\pm$0.02 &  \textbf{0.52$\pm$0.03} & \underline{0.36$\pm$0.03}\\
        &ProbVLM & 0.01$\pm$0.03 & 0.01$\pm$0.02 & 0.02$\pm$0.03\\
        &GroVE & \textbf{0.76$\pm$0.03} & \underline{0.42$\pm$0.02} & \textbf{0.37$\pm$0.03}\\
    
     \bottomrule
        
    \end{tabular}
    %\end{adjustbox}
    \caption{\textbf{Zero-shot uncertainty calibration - CUB-200-2011.} GroVE outperforms other baselines in most cases, achieving superior uncertainty calibration in zero-shot settings. The best scores are highlighted in bold and the second-best scores are underlined.
    }
    \label{tab:cub_zs}
\end{table}



The calibration results for the experiments are presented in Figure~\ref{fig:coco_zs_calibration} and Figure~\ref{fig:cub_zs_calibration}, respectively, where 
GroVE maintains a more consistent alignment between decreasing uncertainty and increasing Recall@1.


\begin{figure*}[t]
    \centering
    \begin{tabular}{@{}c@{}c@{}c@{}c@{}}
        %& \multicolumn{1}{c}{\small Flickr} & \multicolumn{1}{c}{\small MS-COCO} & \multicolumn{1}{c}{\small CUB} & \multicolumn{1}{c}{\small Flowers}\\
        & Flickr & Flowers & CUB  \\
        \raisebox{1.2cm}{\rotatebox{90}{\small Image to Text}} &
        \includegraphics[width=0.25\linewidth, height=4cm]{images/coco_flickr_zs_im2txt.pdf} & 
        \includegraphics[width=0.25\linewidth, height=4cm]{images/coco_flower_zs_im2txt.pdf} & 
        \includegraphics[width=0.25\linewidth, height=4cm]{images/coco_cub_zs_im2txt.pdf}     
        \\
        
        \raisebox{1.2cm}{\rotatebox{90}{\small Text to Image}} &
        \includegraphics[width=0.25\linewidth, height=4cm]{images/coco_flickr_zs_txt2im.pdf} & 
        \includegraphics[width=0.25\linewidth, height=4cm]{images/coco_flower_zs_txt2im.pdf} & 
        \includegraphics[width=0.25\linewidth, height=4cm]{images/coco_cub_zs_txt2im.pdf}     
    \end{tabular}
    \includegraphics[width=0.7\linewidth]{images/legend_zs.pdf} % Adjust the width of the legend
    \caption{\textbf{Evaluation of zero-shot uncertainty calibration using MS-COCO} for embeddings obtained from CLIP on Image-to-Text (top) and Text-to-Image (bottom) retrieval tasks. For perfect calibration, the Recall@1 should show a monotonic decrease as uncertainty levels increase. GroVE exhibits a more consistent relationship between increasing uncertainty and performance degradation compared to the baseline methods.}
    \label{fig:coco_zs_calibration}
\end{figure*}


\begin{figure*}[h]
    \centering
    \begin{tabular}{@{}c@{}c@{}c@{}c@{}}
        %& \multicolumn{1}{c}{\small Flickr} & \multicolumn{1}{c}{\small MS-COCO} & \multicolumn{1}{c}{\small CUB} & \multicolumn{1}{c}{\small Flowers}\\
        & Flickr & MS-COCO & Flowers  \\
        \raisebox{1.2cm}{\rotatebox{90}{\small Image to Text}} &
        \includegraphics[width=0.25\linewidth, height=4cm]{images/cub_flickr_zs_im2txt.pdf} & 
        \includegraphics[width=0.25\linewidth, height=4cm]{images/cub_coco_zs_im2txt.pdf} & 
        \includegraphics[width=0.25\linewidth, height=4cm]{images/cub_flower_zs_im2txt.pdf}     
        \\
        
        \raisebox{1.2cm}{\rotatebox{90}{\small Text to Image}} &
        \includegraphics[width=0.25\linewidth, height=4cm]{images/cub_flickr_zs_txt2im.pdf} & 
        \includegraphics[width=0.25\linewidth, height=4cm]{images/cub_coco_zs_txt2im.pdf} & 
        \includegraphics[width=0.25\linewidth, height=4cm]{images/cub_flower_zs_txt2im.pdf}      
    \end{tabular}
    \includegraphics[width=0.7\linewidth]{images/legend_zs.pdf} % Adjust the width of the legend
    \caption{\textbf{Evaluation of zero-shot uncertainty calibration using CUB-200-2011} for embeddings obtained from CLIP on Image-to-Text (top) and Text-to-Image (bottom) retrieval tasks. For perfect calibration, the Recall@1 should show a monotonic decrease as uncertainty levels increase. GroVE exhibits a more consistent relationship between increasing uncertainty and performance degradation compared to the baseline methods.}
    \label{fig:cub_zs_calibration}
\end{figure*}

\clearpage

\subsection{Few-shot Uncertainty Calibration} \label{sec:app_few_shot}

Table~\ref{tab:few-shot-retrieval} shows the Recall@1 scores for the cross-modal retrieval task for the auxiliary models trained using limited data from the synthetic CUB dataset. The performance of the neural-network-based methods drops, which is expected given the insufficient number of data points for the models to generalize. Note that the Deterministic and TTDA baselines are agnostic to the few-shot setting, since they operate directly on the VLM embeddings for prediction. Among the methods using auxiliary models, GroVE achieves a higher score, leveraging the ability of GPs to generalize well even with limited data: their kernel functions capture structure in the data and make them distance-aware. Moreover, as the number of inducing points increases, GroVE's performance improves, with the best results achieved when using an exact GP. 



\begin{table}[tbh]
    \centering
    \begin{tabular}{ccc}
    \toprule
     Method  & Image to Text & Text to Image \\ \midrule
     Deterministic  & \textbf{0.532} & \underline{0.141} \\
     TTDA (10 passes) &  \underline{0.133$\pm$0.003} & 0.046$\pm$0.011 \\
     PFE & 0.062$\pm$0.001 & 0.026$\pm$0.010\\
     PCME & 0.074$\pm$0.002 & 0.031$\pm$0.005\\
     PCME++ & 0.063$\pm$0.003 & 0.031$\pm$0.003 \\
     ProbVLM & 0.081$\pm$0.001 & 0.034$\pm$0.005 \\
     \midrule
     GroVE (M=50) & 0.062$\pm$0.002 & 0.035$\pm$0.009 \\
     GroVE (M=150) & 0.084$\pm$0.004 & 0.049$\pm$0.004 \\
     GroVE (M=250) & 0.086$\pm$0.003 & 0.056$\pm$0.004 \\
     GroVE (exact GP) & 0.103$\pm$0.002 & \textbf{0.182$\pm$0.002} \\ \bottomrule
    \end{tabular}
    \caption{Retrieval performance (Recall@1 scores) for the few-shot experiment using CUB-200-2011. The best results are highlighted in bold and the second best are underlined.}
    \label{tab:few-shot-retrieval}
\end{table}

Table~\ref{tab:classification} shows the results for uncertainty calibration in few-shot classification using the CUB-200-2011 dataset, with the same settings as described in Section 4.4, the key difference being that we perform multi-class classification instead of cross-modal retrieval. We evaluate using accuracy values and $-SR^2$ scores for uncertainty calibration. For computing the $-SR^2$ scores, we adopt a procedure similar to that used for cross-modal retrieval. Specifically, we divide the uncertainty values into bins, and instead of using Recall@1, we use accuracy scores to compute the $-SR^2$ values. The results show that GroVE achieves better accuracy and uncertainty calibration than the other baselines.

\begin{table}[h]
    \centering
    \begin{tabular}{ccc} \toprule
       Method  &  Accuracy & $-SR^2$ \\ \midrule
       TTDA  & 15.08$\pm$0.04 & 0.23$\pm$0.08 \\
       PFE & 13.26$\pm$0.03 & 0.03$\pm$0.04 \\
       PCME & 14.89$\pm$0.05 & 0.16$\pm$0.07 \\
       PCME++ & 13.65$\pm$0.03 & 0.22$\pm$0.04 \\
       ProbVLM & 14.62$\pm$0.05 & 0.38$\pm$0.03 \\
       GroVE & \textbf{15.32$\pm$0.06} & \textbf{0.62$\pm$0.02} \\ \bottomrule
    \end{tabular}
    \caption{Results for few-shot classification using CUB-200-2011.}
    \label{tab:classification}
\end{table}


\end{document}
