\documentclass{midl} % Include author names

% The following packages will be automatically loaded:
% jmlr, amsmath, amssymb, natbib, graphicx, url, algorithm2e
% ifoddpage, relsize and probably more
% make sure they are installed with your latex distribution

% --- Packages -------------------------------------------------------------
% Each package is loaded exactly once. xcolor was previously loaded a second
% time as \usepackage[table,xcdraw]{xcolor}, which raises an
% "Option clash for package xcolor" error because it had already been loaded
% without options. The `table` option only pulls in colortbl, which is loaded
% explicitly below, and `xcdraw` is a table-generator artifact.
\usepackage{mwe} % to get dummy images
\usepackage{multirow} % \multirow for the dataset labels in the results table
\usepackage{booktabs} % \toprule, \midrule, \cmidrule
\usepackage{colortbl} % \rowcolor for highlighted table rows
\usepackage{xcolor} % colour expressions such as gray!10
\usepackage{graphicx} % \includegraphics, \rotatebox, \resizebox
\usepackage{caption}
% \usepackage{subcaption}
% \documentclass{article}
% \jmlrvolume{-- Under Review}
% \jmlryear{2026}
% \jmlrworkshop{Full Paper -- MIDL 2026 submission}
% \editors{Under Review for MIDL 2026}

\jmlryear{2026}
\jmlrworkshop{Full Paper -- MIDL 2026}
\jmlrvolume{-- 179}
\editors{Accepted for publication at MIDL 2026}

% \title[Short Title]{Uncertainty-Aware Image Retrieval Using Evidential Transformers}
% \title[Evidential Retrieval]{Towards Calibrated and Uncertainty-Aware Medical Image Retrieval via Evidential Transformer Models}
\title[Evidential Retrieval]{Evidential Retriever: Uncertainty-Aware Medical Image Retrieval}

\midlauthor{\Name{Sai Susmitha Arvapalli} \Email{\lowercase{susmitha@cse.iitk.ac.in}}\\
\addr  IIT Kanpur, India
\AND
\Name{Vinay P. Namboodiri} \Email{\lowercase{vpn22@bath.ac.uk}}\\
\addr University of Bath, UK
}

\begin{document}

\maketitle
% \vspace{-0.5cm}
\begin{abstract}
Medical image retrieval systems could play a vital role in clinical decision support by enabling physicians to find visually and semantically similar cases from large medical databases. However, deep learning-based retrieval models often overlook uncertainty in their predictions. To address this, we propose the Evidential Retriever, a novel architecture that combines evidential deep learning principles with transformer-based image representations to achieve more accurate and calibrated retrieval. Built upon a Swin Transformer backbone, our model features a dual-headed design: a retrieval head that performs metric learning for robust image embeddings, and an evidential head that models predictive uncertainty. We use a unified dual-loss, combining a regularized contrastive loss with an evidential loss. Experiments on five diverse medical imaging datasets---CheXpert, NIH-14, ISIC17, COVID-QU-Ex, and KVASIR---demonstrate that our method outperforms state-of-the-art retrieval models in retrieval accuracy and uncertainty estimation. Furthermore, we demonstrate that our evidential framework is architecture-agnostic and can be used to improve the calibration of large-scale Foundation Models.
\end{abstract}

\begin{keywords}
Medical Image Retrieval, Evidential deep learning, Uncertainty Estimation.
\end{keywords}

\section{Introduction}
Content-Based Medical Image Retrieval (CBMIR) systems aim to retrieve clinically relevant and visually similar images from large-scale archives in response to a query image. This technology has the potential to aid in clinical decision support, case-based reasoning, medical education, and differential diagnosis \cite{dubey2021decade,choe2022content,manna2024deep}. With the success of deep learning, modern CBMIR has shifted from handcrafted features to deep embeddings learned by Convolutional Neural Networks (CNNs)~\cite{shetty2023medical, hu2022x} and, more recently, Vision Transformers \cite{pmlr-v250-susmitha24a,trinh2021endoscopy,thakrar2023semantic}. These models learn powerful, low-dimensional representations that capture complex semantic content, leading to significant improvements in retrieval accuracy.

Despite this progress, a critical gap remains: reliability. Most deep retrieval models are deterministic. They minimize a prediction or metric loss, but the model is ignorant of its own confidence \cite{sensoy2018evidential}. When presented with a query, it will retrieve the ``closest'' matches from its embedding space, even if the match is ambiguous, poorly acquired, or from a completely unrelated domain (out-of-distribution). In a critical medical environment, this is a limitation. A model that retrieves an incorrect case can mislead a clinician.
Hence, there is an emergent need for retrieval systems that are not only accurate but also uncertainty-aware \cite{cai2025risk}. This can be solved to some extent using Bayesian Neural Networks (BNNs). However, BNNs often introduce significant computational overhead. Moreover, their uncertainty may not be well calibrated.

A promising and more efficient alternative is Evidential Deep Learning (EDL) \cite{sensoy2018evidential,ulmer2023prior}. EDL approaches the uncertainty problem from a `Theory of Evidence' perspective. Instead of producing a simple softmax probability (a point estimate), an evidential network is trained to output the parameters of a Dirichlet distribution. This distribution models uncertainty over the class probabilities, directly quantifying the model's confidence based on the ``evidence'' it has collected from the data.
% This framework has shown unprecedented success in detecting OOD samples and resisting adversarial attacks \cite{sensoy2018evidential,gao2025comprehensive}.
% Recently, this paradigm was extended to general image retrieval, demonstrating that evidential classification can serve as a strong baseline for deep metric learning .
In this paper, we introduce the Evidential Retriever, an architecture that integrates uncertainty quantification into a transformer model for CBMIR. Unlike prior work that used evidential learning for classification-derived embeddings \cite{evidentialtransformers2024}, our model is an end-to-end unified framework that simultaneously learns discriminative embeddings and their associated evidential uncertainty. Our model feeds a shared [CLS] token representation into two parallel heads:
(i) an {\textit{Embedding head}}, trained with a deep metric learning loss to produce a discriminative embedding for retrieval, and
(ii) an {\textit{Evidential head}}, trained with an evidential loss to predict the Dirichlet parameters.
Our design allows the model to simultaneously optimize for feature discrimination (for accurate retrieval) and evidence-based calibration (for reliable uncertainty).
To summarize, our key contributions are:
\begin{itemize}
    \item We propose the Evidential Retriever, a novel dual-head architecture that concurrently learns discriminative embeddings for retrieval and evidential parameters for uncertainty quantification in a single, end-to-end model.
    \item We introduce a composite loss function that effectively balances a deep metric learning objective for the embedding head with an evidential loss for the evidential head.
    \item We obtain state-of-the-art retrieval performance on diverse public medical datasets: ISIC (skin lesions), COVID-QU-Ex (chest X-ray), and Kvasir (gastrointestinal endoscopy). We demonstrate qualitatively and quantitatively that our model's uncertainty estimates are calibrated and provide the ability for error-based filtering.
\end{itemize}

% \vspace{-0.5cm}
% \section{Related Work}

% Deep metric learning forms the foundation of image retrieval systems, where contrastive \cite{pmlr-v250-susmitha24a,el2021training} and triplet losses \cite{hu2022x} promote compact and well-separated embeddings. Vision Transformers (ViTs) \cite{dosovitskiy2020image} have recently outperformed CNNs by modeling global dependencies and learning richer representations. In medical image retrieval, studies on COVID, Kvasir, and ISIC datasets \cite{tschandl2019diagnostic,shetty2023medical,agrawal2022content} have primarily relied on CNN-based architectures like ResNet, VGG, and DenseNet. Subsequent works introduced improved similarity measures such as relative difference-based similarity (RDBSM) \cite{ahmed2023content} and opponent class adaptive margin (OCAM) loss \cite{ozturk2023content}, while ViT-based methods \cite{el2021training,trinh2021endoscopy,thakrar2023semantic,gupta2023medical,manzari2023medvit, arvapalli2025exploring} capture global contextual relationships more effectively. However, these approaches cannot quantify predictive reliability, an essential aspect in safety-critical domains like healthcare. Uncertainty in deep learning is  categorized into aleatoric (data-dependent) and epistemic (model) uncertainty. Gal and Ghahramani \cite{gal2016dropout} demonstrated that dropout applied at test time approximates Bayesian inference. \cite{lakshminarayanan2017simple} proposed Deep Ensembles, that trains multiple networks independently and combines their predictions. Other studies \cite{caldeira2020deeply} have compared Bayesian methods, variational inference, and ensemble-based approaches for deep uncertainty quantification. Methods like Probabilistic Face Embeddings (PFE) \cite{shi2019probabilistic} introduced amortized inference to quantify uncertainty by using auxiliary network heads. 
Subsequently, the Bayesian Triplet Loss (BTL) \cite{warburg2021bayesian} adopted this model structure to predict uncertainty but enforced constraints using a novel, analytically derived Bayesian triplet loss objective. BaNN's, Monte Carlo dropout, and deep ensembles are widely used but computationally expensive and require multiple forward passes or models. We use these methods as baselines. Note, our method estimates uncertainty in a single forward pass. Evidential Deep Learning (EDL) is a principled framework for quantifying predictive uncertainty by interpreting network outputs as evidence distributions-Dirichlet for classification and Normal–Inverse-Gamma (NIG) for regression. Early works introduced evidential classifiers that decompose uncertainty without sampling \cite{sensoy2018evidential}, later extended to regression for single-pass uncertainty estimation \cite{amini2020deep}. Prior Networks \cite{malinin2018predictive} advanced distributional uncertainty modeling and OOD detection, influencing many calibration-oriented variants. Subsequent research explored pixel-level uncertainty in segmentation \cite{li2023region}, and large-scale comparisons \cite{schreck2023evidential} showed that evidential models can approach ensemble-level uncertainty with lower computational cost. A recent survey by \cite{gaosurvery2024} offers a unified overview of EDL methods and challenges. Despite their adoption in classification and segmentation, evidential methods remain largely unexplored in image retrieval. The closest work \cite{evidentialtransformers2024} relies on evidential classification foundations, i.e. they use classification-trained CLS token as an embedding. One of their other proposed approach is a Naive strategy to incorporate metric learning would require two networks - one for retrieval and one for uncertainty. Our method uses a single unified network. To the best of our knowledge, this is the first work to use evidential learning for medical image retrieval.

\section{Related Work}

Content-Based Medical Image Retrieval (CBMIR) relies on learning robust feature representations that map semantically similar images to nearby points in a latent space. Deep metric learning forms the foundation of these image retrieval systems, where contrastive \cite{pmlr-v250-susmitha24a,el2021training} and triplet losses \cite{hu2022x} promote compact and well-separated embeddings. Vision Transformers (ViTs) \cite{dosovitskiy2020image} have recently outperformed CNNs by modeling global dependencies and learning richer representations. In medical image retrieval, studies on COVID, Kvasir, and ISIC datasets \cite{tschandl2019diagnostic,shetty2023medical,agrawal2022content} have primarily relied on CNN-based architectures like ResNet, VGG, and DenseNet. Subsequent works introduced improved similarity measures such as relative difference-based similarity (RDBSM) \cite{ahmed2023content} and opponent class adaptive margin (OCAM) loss \cite{ozturk2023content}, while ViT-based methods \cite{el2021training,trinh2021endoscopy,thakrar2023semantic,gupta2023medical,manzari2023medvit, arvapalli2025exploring} capture global contextual relationships more effectively. Most recently, the field has begun to leverage Foundation Models (FMs) pre-trained on massive corpora. Denner et al.~\cite{denner2025leveraging} established a large-scale radiological retrieval benchmark combining heterogeneous datasets including CheXpert and NIH-14. Their study highlights the potential of general biomedical FMs like BiomedCLIP~\cite{zhang2023biomedclip} and DINOv2~\cite{oquab2023dinov2}, as well as domain-specialized models like RAD-DINO~\cite{perez2025exploring} for chest X-rays. However, despite these approaches offering powerful representations, they cannot quantify predictive reliability, an essential aspect in safety-critical domains like healthcare.

Uncertainty in deep learning is categorized into aleatoric (data-dependent) and epistemic (model) uncertainty. Gal and Ghahramani \cite{gal2016dropout} demonstrated that dropout applied at test time approximates Bayesian inference. \cite{lakshminarayanan2017simple} proposed Deep Ensembles, which train multiple networks independently and combine their predictions. Other studies \cite{caldeira2020deeply} have compared Bayesian methods, variational inference, and ensemble-based approaches for deep uncertainty quantification. Methods like Probabilistic Face Embeddings (PFE) \cite{shi2019probabilistic} introduced amortized inference to quantify uncertainty by using auxiliary network heads. Subsequently, the Bayesian Triplet Loss (BTL) \cite{warburg2021bayesian} adopted this model structure to predict uncertainty but enforced constraints using a novel, analytically derived Bayesian triplet loss objective. BNNs, Monte Carlo dropout, and deep ensembles are widely used but computationally expensive and require multiple forward passes or models. We use these methods as baselines. Note that our method estimates uncertainty in a single forward pass.

Evidential Deep Learning (EDL) is a principled framework for quantifying predictive uncertainty by interpreting network outputs as evidence distributions---Dirichlet for classification and Normal--Inverse-Gamma (NIG) for regression. Early works introduced evidential classifiers that decompose uncertainty without sampling \cite{sensoy2018evidential}, later extended to regression for single-pass uncertainty estimation \cite{amini2020deep}. Prior Networks \cite{malinin2018predictive} advanced distributional uncertainty modeling and OOD detection, influencing many calibration-oriented variants. Subsequent research explored pixel-level uncertainty in segmentation \cite{li2023region}, and large-scale comparisons \cite{schreck2023evidential} showed that evidential models can approach ensemble-level uncertainty with lower computational cost. A recent survey by \cite{gaosurvery2024} offers a unified overview of EDL methods and challenges.

Despite their adoption in classification and segmentation, evidential methods remain largely unexplored in image retrieval. The closest work \cite{evidentialtransformers2024} relies on evidential classification foundations, where the classification-trained CLS token is used as a retrieval embedding. While they also proposed a naive strategy to incorporate metric learning, it necessitates two separate networks---one for retrieval and another for uncertainty estimation. In contrast, our method employs a single unified network. To the best of our knowledge, this is the first work to use evidential learning for medical image retrieval.

% \vspace{-0.3cm}
\section{Method}
Our approach, termed the \textbf{Evidential Retriever}, unifies evidential learning with deep metric learning to produce uncertainty-aware image embeddings. 
\vspace{-0.3cm}
\subsection{Architecture}
The proposed model is built on a transformer-based visual backbone, specifically the \textbf{Swin Transformer} (swin\_small), chosen for its strong long-range and hierarchical feature capabilities (see ablation study~\ref{sec:backbone}). To further enhance discriminative capability, we incorporate a cross-batch memory~\cite{wang2020understanding} that utilizes past embeddings as hard negatives, decoupling negative mining from batch size without extra computational cost.
Given an input image $\mathbf{x}$, the Swin Transformer encodes it into a rich latent feature representation $\mathbf{h} \in \mathbb{R}^{d}$ using shifted-window attention blocks and patch-merging layers that preserve local and global contextual cues.

On top of this backbone, two lightweight task-specific heads are added:
\begin{itemize}
    \item \textbf{Embedding Head:} A shallow two-layer feed-forward network consisting of a fully connected layer followed by a ReLU activation projects the pooled Swin features into a low-dimensional embedding space ($\mathbb{R}^{256}$). The embeddings are then $L_2$-normalized to ensure consistent magnitude across samples, which stabilizes contrastive learning.
    \item \textbf{Evidential Head:} A linear layer maps the same Swin features to class evidence logits, which are passed through a non-negative activation function (\textit{Softplus}) to produce the evidence vector $\mathbf{e} = [e_1, \ldots, e_K]$. Interpreting $e_k$ as the accumulated support for class $k$, we compute Dirichlet parameters $\alpha_k = e_k + 1$ following Subjective Logic. These parameters define a distribution $\text{Dir}(\mathbf{p}|\boldsymbol{\alpha})$ over the probability simplex, treating class probabilities $\mathbf{p}$ as random variables rather than point estimates, thereby enabling simultaneous quantification of belief and epistemic uncertainty.
    
    
    % These parameters define a Dirichlet distribution that quantifies both belief and epistemic uncertainty.
\end{itemize}

\begin{figure}[htbp]

\floatconts
  {fig:CBMIR}
  {\caption{
        Architecture of the proposed evidential retrieval model. The embedding head produces $L_2$-normalized descriptors for contrastive learning with cross-batch memory, while the evidential head generates non-negative evidence that is converted into the parameters $\boldsymbol{\alpha}$ of a Dirichlet distribution ($\text{Dir}(\mathbf{p}|\boldsymbol{\alpha})$) over the class probability simplex, effectively quantifying the model's categorical belief ($\text{Cat}(\mathbf{y}|\mathbf{p})$) and its epistemic uncertainty ($u$). The final output shown is the top-3 retrieved images along with their respective uncertainty ($u$).
    }}
  {\includegraphics[width=0.7\linewidth]{Figures/Evi_architecture_main.png}}
\end{figure}
The expected class probabilities and total uncertainty are computed as:
\[
    \hat{p}_k = \frac{\alpha_k}{S}, \qquad u = \frac{K}{S}, \qquad S = \sum_{i=1}^{K} \alpha_i,
\]
where $S$ denotes the total evidence strength. Here, $u$ represents the epistemic uncertainty, which is inversely proportional to the total evidence collected. When the model has gathered significant evidence for any class (high $S$), $u$ approaches 0; conversely, for OOD or ambiguous samples where evidence is scarce, $u$ approaches 1.
% The expected class probabilities and total uncertainty are computed as:
% \[
%     \hat{p}_k = \frac{\alpha_k}{S}, \qquad u = \frac{K}{S}, \qquad S = \sum_{i=1}^{K} \alpha_i,
% \]

% where $S$ denotes the total evidence strength. Higher $S$ indicates confident and reliable representations, while lower $S$ captures ambiguity or lack of evidence.
\vspace{-0.3cm}
\subsection{Loss Formulation}

The network is optimized with a joint objective that balances retrieval performance and evidential calibration:
\[
L_{\text{total}} = L_{\text{contr}} + \lambda_{\text{reg}} L_{\text{KoLeo}} + L_{\text{evid\_fit}}.
\]
 \textbf{Contrastive Loss} ($L_{\text{contr}}$) ensures embeddings of semantically similar images are close in the learned space, while dissimilar ones are pushed apart, improving discriminative retrieval. It is defined as:
\begin{equation}
L_{\text{contr}} = \frac{1}{N} \sum_{i} \left[
\sum_{j:y_i=y_j} (1 - z_i^\top z_j) +
\sum_{j:y_i \neq y_j} \max(0, z_i^\top z_j - \beta)
\right]
\end{equation}
where $z_i^\top z_j$ denotes the cosine similarity between $L_2$-normalized embeddings, $\beta$ is a margin controlling hard negatives, and $N$ is the number of samples in a batch. 
\textbf{KoLeo Regularization} ($L_{\text{KoLeo}}$) promotes geometric uniformity in the embedding space, preventing feature collapse. The term is weighted by a coefficient $\lambda_{\text{reg}}$ that controls its contribution to the total loss, and is defined as:
\begin{equation}
L_{\text{KoLeo}} = -\frac{1}{N} \sum_{i=1}^{N} \log(\rho_i)
\end{equation}
where $\rho_i = \min_{j \neq i} \| z_i - z_j \|$ is the distance between $z_i$ and its nearest neighbor. This term encourages embeddings to spread uniformly over the hypersphere, improving generalisation and retrieval robustness.

\textbf{Evidential Fit Loss} ($L_{\text{evid\_fit}}$) aligns the Dirichlet mean $\hat{p}_k$ with the one-hot label $y_k$, penalizing variance to encourage confident predictions when evidence is strong:
\begin{equation}
L_{\text{evid\_fit}} = \sum_{k=1}^{K} \left[ (y_k - \hat{p}_k)^2 + \frac{\alpha_k(S - \alpha_k)}{S^2(S + 1)} \right].
\end{equation}

\vspace{-0.4cm}
\subsection{Uncertainty-Aware Retrieval (Inference)}

During inference, the retrieval embedding $\mathbf{z}$, obtained from the normalized Embedding Head, is used for similarity search via cosine similarity against the gallery. Separately, the Dirichlet-based uncertainty $u$ is calculated from the Dirichlet parameters ($\boldsymbol{\alpha}$) predicted by the Evidential Head, providing an interpretable, per-image measure of reliability. Images with higher uncertainty (low evidence) are flagged as ambiguous or out-of-distribution, while low-uncertainty samples indicate reliable matches. This unified formulation enables a single deterministic model to perform both high-accuracy feature retrieval and robust epistemic uncertainty estimation.
% \section{Results and Discussion}



% \vspace{-0.15cm}
\section{Results and Discussion}

% {\textit{Datasets: }}
% We evaluate our proposed Evidential Retriever on three diverse medical imaging datasets covering different modalities. The ISIC Skin Lesion Dataset \cite{codella2018skin} includes 2,750 dermoscopic images of benign nevi, seborrheic keratosis, and melanoma, representing a fine-grained classification and retrieval task. The COVID-QU-Ex Dataset \cite{tahir2022covid} comprises 33,920 chest X-ray (CXR) images curated by researchers at Qatar University, categorized into 11,956 COVID-19 cases, 11,263 non-COVID infections (viral or bacterial pneumonia), and 10,701 normal cases. The dataset additionally provides ground-truth lung segmentation masks, enabling precise lung isolation for advanced retrieval tasks and making it the largest publicly available CXR dataset with lung masks. The Kvasir-V2 Dataset \cite{pogorelov2017Kvasir} contains 8,000 endoscopic images categorized into eight classes, including anatomical landmarks and pathological findings. \textcolor{red}{To evaluate scalability, we additionally include two large-scale benchmarks: \textbf{CheXpert}~\cite{irvin2019chexpert}, comprising 224,316 chest radiographs with uncertainty labels, and \textbf{NIH-CXR14}~\cite{wang2017chestx}, containing 112,120 X-ray images. These datasets represent realistic, long-tailed clinical distributions.}

% {\textit{Datasets: }}
% We evaluate our proposed Evidential Retriever on three diverse medical imaging datasets covering different modalities. The ISIC Skin Lesion Dataset \cite{codella2018skin} includes 2,750 dermoscopic images of benign nevi, seborrheic keratosis, and melanoma, representing a fine-grained classification and retrieval task. The COVID-QU-Ex Dataset \cite{tahir2022covid} comprises 33,920 chest X-ray (CXR) images curated by researchers at Qatar University, categorized into COVID-19 cases, non-COVID infections, and normal cases. The dataset additionally provides ground-truth lung segmentation masks, enabling precise lung isolation. The Kvasir-V2 Dataset \cite{pogorelov2017Kvasir} contains 8,000 endoscopic images categorized into eight classes, including anatomical landmarks and pathological findings. \textcolor{red}{To evaluate scalability, we additionally include two large-scale benchmarks: \textbf{CheXpert}~\cite{irvin2019chexpert} (224,316 radiographs) and \textbf{NIH-CXR14}~\cite{wang2017chestx} (112,120 images), representing realistic, long-tailed clinical distributions. Finally, to assess robustness against within-modality distribution shifts, we utilize the \textbf{PAD-UFES-20} dataset~\cite{pacheco2020pad}. Unlike the standardized dermoscopy images in ISIC, PAD-UFES-20 contains 2,298 skin lesion images captured using various smartphone devices, introducing significant variations in lighting, resolution, and artifacts common in real-world tele-dermatology.}
{\textit{Datasets: }} We evaluate our proposed Evidential Retriever on five diverse medical imaging datasets covering different modalities. The ISIC Skin Lesion Dataset \cite{codella2018skin} includes 2,750 dermoscopic images of benign nevi, seborrheic keratosis, and melanoma, representing a fine-grained classification and retrieval task. The COVID-QU-Ex Dataset \cite{tahir2022covid} comprises 33,920 chest X-ray (CXR) images curated by researchers at Qatar University, categorized into COVID-19 cases, non-COVID infections, and normal cases. The dataset additionally provides ground-truth lung segmentation masks, enabling precise lung isolation. The Kvasir-V2 Dataset \cite{pogorelov2017Kvasir} contains 8,000 endoscopic images categorized into eight classes, including anatomical landmarks and pathological findings. To evaluate scalability, we include the large-scale \textbf{CheXpert}~\cite{irvin2019chexpert} (224k images) and \textbf{NIH-14}~\cite{wang2017chestx} (112k images) benchmarks. Finally, for within-modality OOD testing, we use \textbf{PAD-UFES-20}~\cite{pacheco2020pad}, containing 2,298 smartphone-acquired skin lesions across 6 classes. This dataset introduces real-world artifacts (e.g., lighting, noise) distinct from the standardized ISIC dermoscopy data.

% We conduct experiments on three medical image retrieval benchmarks: ISIC, COVID-QU, and Kvasir-V2 to evaluate both retrieval performance and uncertainty reliability. To rigorously assess Out-of-Distribution (OOD) detection, we define distinct OOD pairs for each in-distribution (ID) dataset: for ISIC (ID), we use COVID-QU-Ex as the OOD set; for COVID-QU-Ex (ID), we use Kvasir as the OOD set; and for Kvasir (ID), we use ISIC as the OOD set. 

We conduct experiments on five medical image retrieval benchmarks: CheXpert and NIH-14 (large-scale evaluation), and ISIC, COVID-QU-Ex, and Kvasir-V2 (specialized evaluation). To rigorously assess Out-of-Distribution (OOD) detection on the specialized datasets, we define distinct OOD pairs for each in-distribution (ID) dataset. For cross-modality shifts, we use: ISIC (ID) $\rightarrow$ COVID-QU-Ex (OOD); COVID-QU-Ex (ID) $\rightarrow$ Kvasir (OOD); and Kvasir (ID) $\rightarrow$ ISIC (OOD). Additionally, to evaluate robustness to realistic within-modality shifts, we use PAD-UFES-20 as the OOD set for models trained on ISIC.

{\textit{Baselines:}} Our comparisons span a diverse set of \emph{deterministic}, \emph{probabilistic}, \emph{Bayesian}, and \emph{evidential classification} retrieval models. Deterministic baselines include MIR-ViT~\cite{pmlr-v250-susmitha24a}, X-MIR~\cite{hu2022x}, and Context-MIR~\cite{arvapalli2025exploring}; note that Context-MIR results are omitted for Kvasir as the method relies on segmentation maps, which are unavailable for this dataset. Probabilistic approaches such as Probabilistic Face Embeddings (PFE)~\cite{shi2019probabilistic} and Bayesian Triplet Loss (BTL)~\cite{warburg2021bayesian} perform amortized inference to estimate the mean and variance of latent embeddings; for PFE, we incorporate an additional uncertainty head composed of \texttt{linear--BN--ReLU--linear--BN} layers while freezing the backbone parameters. We also evaluate approximate Bayesian methods including MC Dropout~\cite{gal2016dropout} and Deep Ensembles~\cite{lakshminarayanan2017simple}, which have been widely used for uncertainty-aware retrieval. Finally, following recent benchmarks in radiological retrieval~\cite{denner2025leveraging}, we compare against state-of-the-art Foundation Models (FMs): \textbf{BiomedCLIP}~\cite{zhang2023biomedclip}, \textbf{DINOv2}~\cite{oquab2023dinov2}, and \textbf{RAD-DINO}~\cite{perez2025exploring}. We evaluate these FMs in two modes: `Off-the-shelf' (frozen features) and `Evidential' (frozen backbone + our trained dual-heads).

{\textit{Evaluation metrics:}}
Across all methods, we assess image retrieval performance using Recall@K[1, 5, 10], mean Average Precision (mAP), and mean Precision@K (mP@K[1, 5, 10]). To evaluate uncertainty calibration on in-distribution (ID) data, we report the Expected Calibration Error (ECE). For out-of-distribution (OOD) detection, we measure the Area Under the Receiver Operating Characteristic curve (AUROC) and the Area Under the Precision-Recall Curve (AUPRC), which quantify the separability and ranking quality of uncertainty scores. Together, these metrics provide a comprehensive evaluation of retrieval effectiveness and uncertainty reliability. Additionally, for our proposed Evidential Retriever, we adopt $\lambda_{\text{reg}} = 0.7$, selected based on the hyperparameter tuning analysis detailed in Appendix~\ref{sec:hyper_analysis}.

\begin{table*}[t]
\centering
% \caption{\textbf{Quantitative Results.} Comparison of retrieval performance (Recall, mAP, mP), Out-of-Distribution (OOD) detection, and In-Distribution (ID) reliability across three medical imaging datasets. Best results are highlighted in \textbf{bold}.}
\caption{\textbf{Quantitative Results.} Comparison of retrieval performance (Recall, mAP, mP), Out-of-Distribution (OOD) detection, and In-Distribution (ID) reliability across three medical imaging datasets. We compare against deterministic baselines (MIR-ViT~\cite{pmlr-v250-susmitha24a}, X-MIR~\cite{hu2022x}, Context-MIR~\cite{arvapalli2025exploring}), probabilistic methods (MC Dropout~\cite{gal2016dropout}, BTL~\cite{warburg2021bayesian}, PFE~\cite{shi2019probabilistic}), Deep Ensembles~\cite{lakshminarayanan2017simple}, and the Evidential Classification baseline~\cite{evidentialtransformers2024}.}

\label{tab:main_results}
\resizebox{\textwidth}{!}{%
\begin{tabular}{llcccccccc}
\toprule
 &  & \multicolumn{3}{c}{\textbf{IMAGE RETRIEVAL}} & \multicolumn{2}{c}{\textbf{OOD}} & \multicolumn{1}{c}{\textbf{ID}} \\
\cmidrule(lr){3-5} \cmidrule(lr){6-7} \cmidrule(lr){8-8}
\textbf{} & \textbf{Model} & \textbf{Recall@[1,5,10] $\uparrow$} & \textbf{mAP $\uparrow$} & \textbf{mP@[1,5,10] $\uparrow$} & \textbf{AUROC $\uparrow$} & \textbf{AUPRC $\uparrow$}  & \textbf{ECE $\downarrow$} \\ 
\midrule

% --- ISIC SECTION ---
\multirow{8}{*}{\rotatebox[origin=c]{90}{\textbf{ISIC 2017}}} 
 & MIR-ViT & [75.67, 87.33, 90.00] & 70.90 & [75.67, 74.50, 74.30] & - & - &  - \\
 & X-MIR & [\textbf{80.67}, 92.00, 96.00] & 69.29 & [\textbf{80.67}, \textbf{82.35}, \textbf{82.89}] & - & - & - \\
 & Context-MIR & [74.00, 91.33, 96.00] & 71.33 & [74.00, 73.33, 73.87] & - & - & - \\
 \cmidrule{2-8}
 & MC Dropout & [72.66, \textbf{93.83}, 97.50] & 63.52 & [72.66, 68.10, 67.70] & 0.3915 & 0.8405 &  0.1411 \\
 & BTL & [80.66, 92.00, 94.00] & 68.62 & [80.66, 74.00, 73.20] & 0.4611 & 0.8306 &  0.3468 \\
 & PFE & [73.00, 93.00, 96.83] & 61.47 & [73.00, 68.83, 67.93] & 0.6825 & 0.8815 &  0.1562 \\
 & Deep Ensembles & [71.67, 93.33, \textbf{98.33}] & 70.54 & [71.67, 72.20, 72.45] & 0.3497 & 0.7458  & \textbf{0.0660} \\
 & Evidential Classif. & [74.83, 91.83, 95.00] & 70.45 & [74.83, 74.07, 73.68] & 0.6859 & 0.9785  & 0.2213 \\
 \rowcolor{gray!10} 
 & \textbf{Evidential (Ours)} & [79.67, 89.17, 91.00] & \textbf{73.65} & [79.67, 77.33, 76.98] & \textbf{0.9075} & \textbf{0.9876}  & 0.1492 \\ 
 \midrule

% --- COVID SECTION ---
\multirow{8}{*}{\rotatebox[origin=c]{90}{\textbf{COVID-QU-Ex}}} 
 & MIR-ViT & [93.80, 97.72, 98.42] & 91.43 & [93.80, 93.41, 93.36] & - & - &  - \\
 & X-MIR & [92.71, 97.73, 98.39] & 91.86 & [92.71, 92.56, 92.38] & - & - &  - \\
 & Context-MIR & [93.95, 97.08, 97.75] & 92.49 & [93.95, 93.56, 93.52] & - & -  & - \\
 \cmidrule{2-8}
 & MC Dropout & [92.56, 97.61, 98.52] & 86.19 & [92.56, 91.77, 91.52] & 0.4710 & 0.2341 &  0.0908 \\
 & BTL & [93.37, \textbf{98.39}, \textbf{99.07}] & 81.73 & [93.37, 92.84, 92.54] & 0.6480 & 0.2995 &  0.3973 \\
 & PFE & [93.84, 97.81, 98.43] & 88.18 & [93.84, 92.84, 92.49] & 0.7149 & 0.3450 &  0.0868 \\
 & Deep Ensembles & [94.74, 98.38, 98.75] & 93.74 & [94.74, 94.60, 94.35] & 0.3233 & 0.1898  &0.0698 \\
 & Evidential Classif. & [93.17, 97.86, 98.71] & 89.76 & [93.17, 92.47, 92.22] & 0.8936 & 0.7877  &0.0902 \\
 \rowcolor{gray!10}
 & \textbf{Evidential (Ours)} & [\textbf{95.67}, 97.02, 97.48] & \textbf{94.98} & [\textbf{95.67}, \textbf{95.47}, \textbf{95.44}] & \textbf{0.9717} & \textbf{0.8959} &  \textbf{0.0581} \\ 
 \midrule

% --- KVASIR SECTION ---
\multirow{8}{*}{\rotatebox[origin=c]{90}{\textbf{KVASIR}}} 
 & MIR-ViT & [\textbf{93.79}, 97.42, 97.96] & 90.97 & [\textbf{93.79}, \textbf{93.56}, \textbf{93.52}] & - & - &  - \\
 & X-MIR & [90.92, 97.67, 98.75] & 88.68 & [90.92, 90.76, 90.72] & - & - &  - \\
 & Context-MIR & - & - & - & - & - &  - \\
 \cmidrule{2-8}
 & MC Dropout & [91.54, 97.25, 98.16] & 89.23 & [91.54, 91.23, 91.20] & 0.7145 & 0.2805 & 0.0957 \\
 & BTL & [92.87, \textbf{98.29}, \textbf{98.83}] & 88.20 & [92.87, 92.30, 92.29] & 0.7612 & 0.1055 &  0.3115 \\
 & PFE & [91.87, 97.50, 98.37] & 89.02 & [91.87, 91.00, 90.72] & 0.3840 & 0.1544 &  0.1069 \\
 & Deep Ensembles & [91.95, 97.08, 97.95] & 90.57 & [91.95, 92.32, 92.28] & 0.4253 & 0.1783  &0.0654 \\
 & Evidential Classif. & [92.29, 97.71, 98.50] & 89.96 & [92.25, 91.91, 91.70] & 0.9148 & 0.6325 & 0.0776 \\
 \rowcolor{gray!10}
 & \textbf{Evidential (Ours)} & [93.08, 97.54, 98.33] & \textbf{91.99} & [93.08, 93.17, 93.43] & \textbf{0.9543} & \textbf{0.9517} &  \textbf{0.0593} \\ 
 \bottomrule
\end{tabular}%
}
\end{table*}



% From the quantitative results in Table~\ref{tab:main_results}, the proposed evidential transformer consistently provides the best overall balance between retrieval quality and uncertainty reliability across all three datasets. On ISIC, it achieves the highest mAP (73.65) and the strongest mP@K, while maintaining Recall@1 competitive with the strongest deterministic and probabilistic baselines; importantly, it delivers substantially superior OOD detection with AUROC~0.91 and AUPRC~0.99, while keeping ECE lower than most Bayesian baselines. On COVID-QU-Ex, our model attains the best retrieval scores (mAP 94.98 and the highest mP@K values), surpassing even strong deterministic retrieval models and the evidential classification rival, while simultaneously yielding the best OOD performance (AUROC~0.97, AUPRC~0.90) and a low ECE, indicating both accurate and well-calibrated predictions. A similar trend is observed on Kvasir-V2, where our method again attains the highest mAP (91.99) and strong Recall@K, with AUROC~0.95 and AUPRC~0.95 that markedly exceed all competing uncertainty methods, including BTL, PFE, Deep Ensembles, and pure evidential classification. Notably, while evidential classification is often the strongest competitor in OOD metrics, it  underperforms our joint evidential transformer in retrieval scores and exhibits higher ECE, indicating that collapsing evidential learning into a classification-only head is less effective.
% From the quantitative results in Table~\ref{tab:main_results}, the proposed evidential transformer consistently provides the best overall balance between retrieval quality and uncertainty reliability across all datasets. On ISIC, it achieves the highest mAP (73.65) and the strongest mP@K, while delivering substantially superior OOD detection with (AUROC~0.91, and AUPRC~0.99) and keeping ECE lower than most Bayesian baselines. \textcolor{red}{A similar trend is observed on COVID-QU-Ex and Kvasir-V2, where our method attains the highest retrieval scores (mAP 94.98 and 91.99, respectively) and superior OOD metrics (AUROC $\ge$ 0.95), markedly exceeding competing uncertainty methods including BTL, PFE, and Deep Ensembles.} Notably, while evidential classification is often the strongest competitor in OOD metrics, it underperforms our joint evidential transformer in retrieval scores and exhibits higher ECE, indicating that collapsing evidential learning into a classification-only head is less effective.

From the quantitative results in Table~\ref{tab:main_results}, the proposed evidential transformer consistently provides the best overall balance between retrieval quality and uncertainty reliability across all datasets. On ISIC, it achieves the highest mAP (73.65) and competitive mP@K, while delivering substantially superior OOD detection (AUROC~0.91 and AUPRC~0.99) and keeping ECE lower than most Bayesian baselines. A similar trend is observed on COVID-QU-Ex and Kvasir-V2, where our method attains the highest retrieval scores (mAP 94.98 and 91.99, respectively) and superior OOD metrics (AUROC $\ge$ 0.95), markedly exceeding competing uncertainty methods including BTL, PFE, and Deep Ensembles. Notably, while evidential classification is often the strongest competitor in OOD metrics, it underperforms our joint evidential transformer in retrieval scores and exhibits higher ECE, indicating that collapsing evidential learning into a classification-only head is less effective.
\begin{figure}[t]
    \centering
    \includegraphics[width=\textwidth]{Figures/six_models_density_final_v7_isic.png}
    \caption{\textbf{Qualitative Safety Analysis (ID vs.\ OOD).} Density histograms of uncertainty scores for In-Distribution (\textcolor{green}{green}) ISIC samples and OOD (\textcolor{red}{red}) COVID-QU-Ex samples across six retrieval methods. Deep Ensembles, MC Dropout, and BTL (b, c, e) exhibit variance collapse, where the ID and OOD distributions either overlap heavily or degenerate into narrow spikes, resulting in unreliable OOD detection. PFE and Evidential Classification (d, f) show partial separation. \textbf{Our method} (a) achieves clear semantic separation: ID samples form a compact low-uncertainty mode, while OOD samples shift distinctly toward higher uncertainty, aligning with the superior AUROC.}
    \label{fig:density_comparison}
\end{figure}

% The robustness of the quantified uncertainty is further analyzed through Out-of-Distribution (OOD) detection capabilities, visualized in the density plots in Fig.~\ref{fig:density_comparison}. As observed in the ISIC density analysis, our Evidential Retriever achieves a distinct and clean separation between In-Distribution (ID) and OOD uncertainty distributions, with OOD samples consistently pushed toward higher uncertainty scores. This separation is statistically corroborated by the highest AUROC and AUPRC scores across all datasets. In contrast, amortized inference methods such as PFE and BTL show significant overlap between ID and OOD distributions. This highlights a fundamental limitation of amortization: while effective at modeling aleatoric data noise within the training distribution, it struggles to extrapolate epistemic uncertainty for distinct, unseen distributions. While the Evidential Classification baseline also exhibits a similar but consistently weaker separation, our model demonstrates a more balanced distribution that supports both OOD detection and granular reliability assessment for retrieval candidates, a trend that is consistent across the COVID-QU-Ex and Kvasir datasets.

The robustness of the quantified uncertainty is further evaluated through OOD detection, illustrated by the density plots in Fig.~\ref{fig:density_comparison}.  On ISIC, our Evidential Retriever shows a clear separation between ID and OOD uncertainty distributions, with OOD samples consistently assigned higher uncertainty, corroborated by the highest AUROC and AUPRC across datasets. In contrast, amortized inference methods such as PFE and BTL show significant overlap, reflecting their limited ability to extrapolate epistemic uncertainty beyond the training distribution, despite being effective at modeling aleatoric noise. While the Evidential Classification baseline shows a similar but consistently weaker separation, our model yields a more balanced uncertainty distribution that supports both reliable OOD detection and fine-grained retrieval confidence, a trend consistently observed on COVID-QU-Ex and Kvasir.

% \textcolor{red}{\textit{Robustness to Within-Modality Domain Shift:} To evaluate reliability under realistic clinical heterogeneity, we conducted a ``within-modality'' OOD experiment where the model was trained on \textbf{ISIC 2017} (standardized dermoscopy) and tested on \textbf{PAD-UFES-20}~\cite{pacheco2020pad} (smartphone-acquired skin lesions). Unlike the gross semantic shifts in the previous experiment, this setup mimics the subtle but common domain shift from specialized equipment to consumer devices. In this challenging setting, our Evidential Retriever achieves an AUROC of \textbf{0.7261}, outperforming the Evidential Classification baseline (0.6909). Figure~\ref{fig:ood_plots} illustrates the uncertainty density: our model (Left) exhibits improved discriminability between the ID (ISIC) and OOD (PAD-UFES) distributions compared to the classification baseline (Right), which shows greater overlap. This suggests that our unified metric-evidential loss is more sensitive to subtle distributional shifts that are often missed by standard classification objectives.}
\textit{Robustness to Within-Modality Domain Shift:} To evaluate reliability under realistic clinical heterogeneity, we conducted a ``within-modality'' OOD experiment where the model was trained on \textbf{ISIC 2017} (standardized dermoscopy) and tested on \textbf{PAD-UFES-20}~\cite{pacheco2020pad} (smartphone-acquired skin lesions). While our previous experiments addressed cross-modality shifts, this setup mimics the more subtle but critical shift from specialized to consumer devices. Our Evidential Retriever achieves an AUROC of \textbf{0.7261}, outperforming the Evidential Classification baseline (0.6909). Figure~\ref{fig:ood_plots} illustrates that our model (Left) provides better separation between the ID and OOD distributions compared to the classification baseline (Right), demonstrating that our metric-evidential loss is able to identify these subtle distributional shifts.

% \textcolor{red}{\textit{Robustness to Within-Modality Domain Shift:} To evaluate reliability under realistic clinical heterogeneity, we tested our ISIC-trained model on \textbf{PAD-UFES-20}~\cite{pacheco2020pad} (smartphone-acquired lesions). While our previous experiments addressed cross-organ shifts, this setup mimics the more subtle but critical shift from specialized to consumer devices. Our Evidential Retriever achieves an AUROC of \textbf{0.7261}, outperforming the Evidential Classification baseline (0.6909). Figure~\ref{fig:ood_plots} illustrates that our model (Left) provides better separation between the ID and OOD distributions compared to the classification baseline (Right), confirming that our metric-evidential loss remains sensitive to these fine-grained distributional shifts.}
\begin{figure}[h]
    \centering
    \begin{minipage}{0.48\linewidth}
        \centering
        \includegraphics[width=\linewidth]{Figures/Evidential_ret_padupes.png} % Replace with your plot
        \caption*{(a) Evidential Retriever (Ours)}
    \end{minipage}
    \hfill
    \begin{minipage}{0.48\linewidth}
        \centering
        \includegraphics[width=\linewidth]{Figures/Evi_padupes.png} % Replace with your plot
        \caption*{(b) Evidential Classification}
    \end{minipage}
    \caption{\textbf{Uncertainty Density under Domain Shift (ISIC $\to$ PAD-UFES).} Our model (a) separates the standardized ID data (ISIC) from the smartphone-acquired OOD data (PAD-UFES) more effectively than the baseline (b).}
    \label{fig:ood_plots}
\end{figure}

% \textcolor{red}{
% \subsection{Robustness to Within-Modality Domain Shift}
% \label{sec:ood_robustness}
% To evaluate reliability under realistic clinical heterogeneity, we conducted a ``within-modality'' OOD experiment. The model was trained on \textbf{ISIC 2017} (standardized dermoscopy) and tested on \textbf{PAD-UFES-20}~\cite{pacheco2020pad}, a dataset of smartphone-acquired skin lesions. This setup mimics the common domain shift from specialized equipment to consumer devices.}

% \textcolor{red}{
% Our Evidential Retriever achieves an AUROC of \textbf{0.7261}, outperforming the Evidential Classification baseline(0.6909). This confirms that our unified metric-evidential loss captures subtle distributional shifts often missed by standard objectives.}

% The practical utility of this uncertainty is demonstrated by the sparsification curves in Fig.~\ref{fig:sparsification}, which plot the improvement in retrieval accuracy (mAP@1) as the most uncertain samples are progressively filtered out. Across the three datasets(ISIC, COVID-QU-Ex, and Kvasir-V2), we display the top-3 performing curves - corresponding to Evidential Retriever(ours), BTL, and Evidential Classification. The curves for the Evidential Retriever rise monotonically, confirming that the model reliably associates higher uncertainty with harder or erroneous observations. Crucially, our model distinguishes itself from the baselines in two key aspects: the starting point and the rate of improvement. The Evidential Retriever consistently begins at the highest base accuracy (0\% filter rate) and maintains a steep, robust ascent in the critical low-rejection regime. In comparison, BTL shows a positive correlation but starts from a lower performance baseline and its practical utility is compromised by the very poor OOD detection performance observed in Table~\ref{tab:main_results}. More importantly, our model significantly outperforms the Evidential Classification baseline in this metric. The sparsification curves for the classification approach often start lower and exhibit less effective filtering (as seen in the flatter trajectory on COVID-QU-Ex), suggesting that while the Evidential classification captures global uncertainty, it is less correlated with specific retrieval errors than the uncertainty derived from our unified metric learning framework. These results show that our approach assigns the highest uncertainty to genuinely hard cases, making its uncertainty estimates actionable. 
For a visual examination of high-uncertainty cases, we provide a detailed qualitative analysis in Section~\ref{sec:qualitative_covid} (COVID-QU-Ex), Appendix~\ref{sec:qualitative_isic} (ISIC), and Appendix~\ref{sec:qualitative_kvasir} (Kvasir), and a retrieval analysis in Sec.~\ref{sec:retrieval_analysis}.

% The practical utility of uncertainty is illustrated by the sparsification curves in Fig.~\ref{fig:sparsification}, which show improvements in retrieval accuracy (mAP@1) as the most uncertain samples are progressively filtered out across ISIC, COVID-QU-Ex, and Kvasir-V2. 
% \textcolor{red}{Among the top-performing methods (Evidential Retriever, BTL, and Evidential Classification), our model exhibits a strictly monotonic rise, indicating that higher uncertainty reliably corresponds to harder or erroneous queries.}
% \textcolor{red}{Crucially, the Evidential Retriever starts from the highest base accuracy (0\% filter rate) and improves most rapidly in the low-rejection regime, whereas BTL shows a
% positive correlation but begins from a weaker baseline and suffers from poor OOD detection (Table~\ref{tab:main_results}), and Evidential Classification shows flatter, less effective filtering, particularly on COVID-QU-Ex, suggesting that Evidential classification captures coarser uncertainty that is less aligned with retrieval errors than our metric-based uncertainty.}
% \textcolor{red}{These results indicate that uncertainty from our unified metric-evidential framework is more tightly coupled to retrieval errors, making it practically actionable; qualitative examples are provided in Sec.~\ref{sec:qualitative_covid}, Appendix~\ref{sec:qualitative_isic}, \ref{sec:qualitative_kvasir}, and Sec.~\ref{sec:retrieval_analysis}.}


The practical utility of uncertainty is illustrated by the sparsification curves in Fig.~\ref{fig:sparsification}, which show improvements in retrieval accuracy (mAP@1) as the most uncertain samples are progressively filtered out across ISIC, COVID-QU-Ex, and Kvasir-V2. Among the top-performing methods (Evidential Retriever, BTL, and Evidential Classification), our model exhibits a strictly monotonic rise, indicating that higher uncertainty reliably corresponds to harder or erroneous queries. Crucially, the Evidential Retriever starts from the highest base accuracy (0\% filter rate) and improves most rapidly in the low-rejection regime, whereas BTL shows a positive correlation but begins from a weaker baseline and suffers from poor OOD detection (Table~\ref{tab:main_results}), and Evidential Classification shows flatter, less effective filtering, particularly on COVID-QU-Ex, suggesting that Evidential Classification captures coarser uncertainty that is less aligned with retrieval errors than our metric-based uncertainty. These results indicate that uncertainty from our unified metric-evidential framework is more tightly coupled to retrieval errors, making it practically actionable; qualitative analyses of high-uncertainty examples are provided in Sec.~\ref{sec:qualitative_covid}, Appendix~\ref{sec:qualitative_isic}, Appendix~\ref{sec:qualitative_kvasir}, and Sec.~\ref{sec:retrieval_analysis}.
%In summary, the proposed Evidential (Ours) retrieval model provides the best trade-off between retrieval performance, OOD detection, and uncertainty reliability. It achieves high accuracy while improving calibration and robustness to OOD inputs, demonstrating that our unified metric-evidential framework yields highly informative and trustworthy confidence estimates for medical image retrieval.
\begin{figure}[t]
    \centering
    % Image 1
    \begin{minipage}{0.32\linewidth}
        \centering
        \includegraphics[width=\linewidth]{Figures/Sparsification_isic_3_new.png}
    \end{minipage}\hfill
    % Image 2
    \begin{minipage}{0.32\linewidth}
        \centering
        \includegraphics[width=\linewidth]{Figures/Sparsification_covid_3_new.png}
    \end{minipage}\hfill
    % Image 3 (Fixed: Changed 0.9 to 1.0/linewidth)
    \begin{minipage}{0.32\linewidth}
        \centering
        \includegraphics[width=\linewidth]{Figures/Sparsification_kvasir_3_new.png}
    \end{minipage}
    
    \caption{
        \textbf{Uncertainty Utility (Sparsification Curves).} These plots show the $\text{mAP}@1$ gain versus the Filter Out Rate ($\%$) for three models across ISIC, COVID-QU-Ex, and KVASIR datasets. The monotonically increasing curves confirm that uncertainty correlates with observation difficulty. Our Evidential Retriever (Blue) achieves the highest $\text{mAP}@1$ gain, even at low Filter Out Rates, validating the superior calibration of its uncertainty scores for challenging samples.
    }
    \label{fig:sparsification}
\end{figure}



% \vspace{-0.2cm}

\subsection{Scalability and Comparison with Foundation Models}
\label{sec:scalability_fm}

To evaluate scalability, we benchmarked the Evidential Retriever against Foundation Models (FMs) on two large-scale clinical datasets: \textbf{CheXpert}~\cite{irvin2019chexpert} and \textbf{NIH-14}~\cite{wang2017chestx}. We compare \textit{Off-the-shelf} (frozen) FMs against \textit{Evidential Versions}, where we train our dual-head architecture on the frozen backbones.

% \textcolor{red}{Results at Scale: Table~\ref{tab:large_scale} shows that while frozen FMs lack calibration, equipping them with our evidential heads significantly improves performance. For instance, adding our heads to RAD-DINO on CheXpert boosts mAP from 38.25\% to 46.47\% with a low ECE of 0.1028, proving our framework effectively ``upgrades'' deterministic FMs. Notably, our efficient \textbf{Swin-Small} model achieves \textbf{47.37\% mAP} on CheXpert, outperforming even massive FMs while maintaining superior calibration.}

\textit{Results at Scale:} Table~\ref{tab:large_scale} shows that while frozen FMs lack calibration, equipping them with our evidential heads significantly improves performance. On CheXpert, adding our heads to RAD-DINO boosts mAP from 38.25\% to 46.47\% (ECE 0.1028), while our efficient \textbf{Swin-Small} model achieves \textbf{47.37\% mAP}. Similarly, on NIH-14, the upgraded RAD-DINO yields the highest performance (24.66\% mAP), with our Swin-Small remaining highly competitive (24.37\% mAP). This shows that our method works well both by itself and as a simple addition to improve existing pre-trained models.
% , while our efficient \textbf{Swin-Small} model achieves \textbf{47.37% mAP}. \textcolor{red}{Similarly on NIH, the upgraded RAD-DINO yields the highest performance (24.66% mAP), with our Swin-Small remaining highly competitive (24.37% mAP).

We extended this analysis to ISIC, COVID-QU-Ex, and Kvasir. We found that off-the-shelf FMs struggle with domain-specific distributions (e.g., frozen RAD-DINO on Kvasir gets 27.21\% mAP). However, our Swin-Small model consistently outperforms both frozen and fine-tuned FMs on these tasks. Detailed results are in \textbf{Appendix~\ref{sec:appendix_fm_specialized}} and Table \ref{tab:specialized_fm}.

\begin{table*}[t]
\centering
\caption{\textbf{Large-Scale \& Foundation Model Evaluation.} Comparison on CheXpert and NIH-14. We compare \textit{Off-the-shelf Frozen Foundation Models} against our \textit{Evidential Versions} and \textit{Evidential Retriever}. Our method significantly improves calibration (ECE) compared to frozen baselines.}
\label{tab:large_scale}
% Reduced resize width slightly and added \footnotesize for better font appearance
\resizebox{0.9\textwidth}{!}{%
\footnotesize

\begin{tabular}{llccc}
\toprule
\textbf{Dataset} & \textbf{Model} & \textbf{mAP $\uparrow$} & \textbf{Recall@1 $\uparrow$} & \textbf{ECE $\downarrow$} \\
\midrule
% --- CheXpert ---
\multirow{10}{*}{\textbf{CheXpert}} 
 & \multicolumn{4}{c}{\textit{\textbf{Off-the-shelf FMs (Frozen)}}} \\ % Centered {c}
 \cmidrule(lr){2-5} % Trimmed left and right (lr) for balanced look
 & BiomedCLIP & 41.59 & 52.56 & - \\
 & DINOv2 & 36.73 & 47.86 & - \\
 & RAD-DINO & 38.25 & 47.01 & - \\
 \cmidrule(lr){2-5}
 & \multicolumn{4}{c}{\textit{\textbf{Evidential Versions (Trained Heads)}}} \\ % Centered {c}
 \cmidrule(lr){2-5}
 & Evidential Classification & 46.66 & 55.56 & 0.2863 \\
 % & BiomedCLIP + 2 Heads & 46.40 & 45.73 & 0.3902 \\
 % & DINOv2 + 2 Heads & 46.06 & 48.72 & 0.3791 \\
 & RAD-DINO + 2 Heads & 46.47 & 56.41 & \textbf{0.1028} \\
 & \textbf{Evidential Retriever (Ours)} & \textbf{47.37} & \textbf{57.26} & 0.1889 \\
\midrule
% --- NIH-14 ---
\multirow{10}{*}{\textbf{NIH-14}} 
 & \multicolumn{4}{c}{\textit{\textbf{Off-the-shelf FMs (Frozen)}}} \\ % Centered {c}
 \cmidrule(lr){2-5}
 & BiomedCLIP & 23.62 & 33.10 & - \\
 & DINOv2 & 21.70 & 30.83 & - \\
 & RAD-DINO & 21.81 & 39.49 & - \\
 \cmidrule(lr){2-5}
 & \multicolumn{4}{c}{\textit{\textbf{Evidential Versions (Trained Heads)}}} \\ % Centered {c}
 \cmidrule(lr){2-5}
 & Evidential Classification & 23.17 & 30.32 & 0.5488 \\
 % & BiomedCLIP + 2 Heads & 23.87 & 29.40 & 0.5911 \\
 % & DINOv2 + 2 Heads & 21.80 & 24.20 & 0.6489 \\
 & RAD-DINO + 2 Heads & \textbf{24.66} & \textbf{39.56} & \textbf{0.2657} \\
 & \textbf{Evidential Retriever (Ours)} & 24.37 & 38.06 & 0.3565 \\
\bottomrule
\end{tabular}
}
\end{table*}




% {\textit{Effect of Backbone Choice:}}
% \label{sec:backbone}
% To investigate the impact of architectural design on retrieval quality and uncertainty modeling, we evaluate four transformer backbones: ViT-Small, ViT-Base, Swin-Tiny, and Swin-Small - across all three datasets. Table \ref{tab:backbone_ablation} summarizes the retrieval performance in terms of R@K, mAP, mP@K, and calibration metrics. The hierarchical transformer backbones consistently outperform the ViT models, indicating that multi-scale feature aggregation and localized windowed attention are more effective for modeling fine-grained medical structures such as dermoscopic lesion borders and polyp textures. Between the two Swin variants, Swin-Small shows a consistent advantage over Swin-Tiny across datasets, reflecting the benefits of deeper stages and richer multi-scale representations. Considering the overall balance between retrieval accuracy, uncertainty calibration, and computational efficiency, Swin-Small emerges as the strongest backbone and is therefore selected as the backbone for our Evidential Retriever model.
% \vspace{-2cm}
\subsection{Qualitative Analysis of Uncertainty (COVID-QU-Ex)} \label{sec:qualitative_covid}

To demonstrate the model's reliability on the COVID-QU-Ex dataset, we examined images with the lowest and highest uncertainty scores. As shown in the low-uncertainty gallery (Fig.~\ref{fig:covid_qualitative}, a), the model is most confident ($u \approx 0.05$) on clear, standard X-rays (e.g., ID:4811) that look exactly like the typical training data. In contrast, the high-uncertainty gallery (Fig.~\ref{fig:covid_qualitative}, b) shows that the model correctly flags ``odd'' or difficult images as unreliable. This includes technical errors---such as scans of \textbf{children} (ID:1609), \textbf{rotated images} (ID:6549), or wires blocking the view (ID:406)---as well as unusual medical cases like severe spinal curvature (ID:5320) or completely \textbf{obscured lungs} (ID:393). The t-SNE plot (Fig.~\ref{fig:tsne_all} (b)) confirms this behavior, showing that these uncertain images are pushed to the edges of the data clusters, far from the standard examples. Similarly, detailed analyses are given in Appendix~\ref{sec:qualitative_isic} (ISIC), Appendix~\ref{sec:qualitative_kvasir} (Kvasir), and Sec.~\ref{sec:retrieval_analysis} (retrieval analysis); Figure~\ref{fig:tsne_all} provides a t-SNE geometric interpretation, which confirms that high-uncertainty samples distinctively cluster at manifold edges and decision boundaries.

\begin{figure}[t]
    \centering
    % Left Column: Low Uncertainty
    \begin{minipage}[t]{0.48\linewidth}
        \centering
        \includegraphics[width=\linewidth]{Figures/Covid_qualitative_low.png}
        \caption*{\textbf{(a) Low Uncertainty (Sanity Check):} The model is most confident ($u \approx 0.05$) on high-quality, standard X-rays (e.g., ID:4811) that are clear, upright, and free of artifacts.}
    \end{minipage}
    \hfill % Spacing between columns
    % Right Column: High Uncertainty
    \begin{minipage}[t]{0.48\linewidth}
        \centering
        \includegraphics[width=\linewidth]{Figures/Covid_quantitative_high.png}
        \caption*{\textbf{(b) High Uncertainty (Safety Mechanism):} The model acts as a safety net by correctly flagging unreliable images. This includes \textbf{Technical Errors} (e.g., ID:1609: Pediatric, ID:6549: Rotated Image) and \textbf{Difficult Medical Cases} (e.g., ID:5320: Severe Spine Curvature, ID:393: Obscured Lungs).}
    \end{minipage}
    
    \caption{\textbf{Qualitative Analysis on COVID-QU-Ex Dataset.} The uncertainty score reliably distinguishes between standard, clear images and complex or erroneous inputs that require human review.}
    \label{fig:covid_qualitative}
\end{figure}
% \vspace{-2cm}

\subsection{Ablation Studies}

\label{sec:loss_ablation}
\begin{table}[t]
\centering
\normalsize
\renewcommand{\arraystretch}{1.2}

\definecolor{lightgray}{RGB}{240,240,240}
\caption{Ablation study of the three loss components - contrastive ($L_{\text{contr}}$), KoLeo ($L_{\text{KoLeo}}$), and evidential fit loss ($L_{\text{evid\_fit}}$) on ISIC, COVID-QU-Ex, and Kvasir. Each component improves performance, and the full combination achieves the best mAP across all datasets, confirming their complementary contributions to retrieval quality.}
\label{tab:ablation_losses}
\begin{tabular}{ccc|ccc}
\toprule
% \rowcolor{lightgray}
\multicolumn{3}{c|}{\textbf{Loss Components}} &
\multicolumn{3}{c}{\textbf{Datasets}} \\
% \rowcolor{lightgray}
\textbf{$L_{\text{contr}}$} & \textbf{$L_{\text{KoLeo}}$} & \textbf{$L_{\text{evid\_fit}}$} &
\textbf{ISIC} & \textbf{COVID-QU-Ex} & \textbf{Kvasir} \\
\midrule
\multicolumn{3}{c|}{} & \textit{mAP} & \textit{mAP} & \textit{mAP} \\
\midrule

-- & -- & \checkmark &
70.45 & 89.76 & 89.96 \\

\checkmark & -- & -- &
68.20 & 91.43 & 89.41 \\

\checkmark & -- & \checkmark &
69.67 & 93.22 & 90.34 \\

\checkmark & \checkmark & -- &
71.33 & 92.49 & 90.97 \\

% \rowcolor{lightgray}
\checkmark & \checkmark & \checkmark &
\textbf{73.65} & \textbf{94.98} & \textbf{91.99} \\

\bottomrule
\end{tabular}

\end{table}

{\textit{Ablation Study: Effect of Loss Functions:}}
To assess how different loss formulations affect retrieval quality and uncertainty modeling, we compare five configurations across the three medical imaging datasets, as shown in Table~\ref{tab:ablation_losses}. The evidential-only model ($L_{\text{evid\_fit}}$) outperforms the standard contrastive baseline ($L_{\text{contr}}$) on both the ISIC and Kvasir datasets, achieving an mAP of 70.45\% and 89.96\%, respectively. This suggests that the evidential objective, while formulated for classification, imposes strong class-discriminative constraints that implicitly structure the shared feature space effectively for retrieval. However, on the COVID-QU-Ex dataset, the contrastive baseline proves superior, indicating that pairwise metric learning is still essential for certain data distributions. We observe that combining components leads to further improvements; for instance, adding evidential supervision to the contrastive loss ($L_{\text{contr}} + L_{\text{evid\_fit}}$) boosts performance on COVID-QU-Ex to 93.22\%, while incorporating KoLeo regularization ($L_{\text{contr}} + L_{\text{KoLeo}}$) is particularly effective on ISIC, raising the mAP to 71.33\% by mitigating feature collapse. Across all datasets, the best performance is consistently achieved by our unified loss $L_{\text{total}}$, which combines regularized contrastive learning with evidential modeling. For a qualitative t-SNE visualization of the embeddings learned by the evidential classification baseline and our Evidential Retriever, please refer to Appendix~\ref{sec:tsne_analysis}, which confirms that our Evidential Retriever produces significantly more structured embeddings with improved intra-class compactness and inter-class separability compared to the baseline.
Additionally, we provide a comprehensive ablation study comparing the effect of different backbones (CNNs, Swin Transformer, and ViT) in Appendix~\ref{sec:backbone}, where Swin-Small was selected as it consistently provided the best balance of accuracy and calibration.

% \vspace{-0.3cm}
% \section{Conclusion}
% In this work, we introduce the Evidential Retriever, a framework that unifies discriminative representation learning with evidential uncertainty modeling for medical image retrieval. By extending the Dirichlet-based evidential formulation, our approach enables the model to generate feature representations that encode both semantic similarity and uncertainty.
% Through evaluation on three diverse medical imaging benchmarks: ISIC, COVID-QU-Ex, and Kvasir, we demonstrate that the Evidential Retriever achieves consistently superior retrieval performance while offering meaningful uncertainty estimates that strongly correlate with embedding quality. Our experiments confirm that the model distinguishes between in-distribution and out-of-distribution samples and provides uncertainty signals for error filtration. 
\section{Conclusion}
In this work, we introduce the Evidential Retriever, a framework that unifies discriminative representation learning with evidential uncertainty modeling for medical image retrieval. By extending the Dirichlet-based evidential formulation, our approach enables the model to generate feature representations that encode both semantic similarity and uncertainty.
Through evaluation on five diverse medical imaging benchmarks, including large-scale cohorts (CheXpert, NIH-14) and specialized datasets (ISIC, COVID-QU-Ex, Kvasir), we demonstrate that the Evidential Retriever achieves consistently superior retrieval performance while offering meaningful uncertainty estimates that strongly correlate with embedding quality. Our experiments confirm that the model distinguishes between in-distribution and out-of-distribution samples and provides uncertainty signals for error filtration. Furthermore, we demonstrate that our framework scales effectively to heterogeneous distributions, consistently outperforming frozen Foundation Models (e.g., RAD-DINO, DINOv2, BiomedCLIP) and serving as an architecture-agnostic upgrade for ensuring calibrated retrieval.
\clearpage  % Acknowledgements, references, and appendix do not count toward the page limit (if any)
% Acknowledgments---Will not appear in anonymized version
\midlacknowledgments{This research work was partially supported by Research-I Foundation of the Department of CSE at IIT Kanpur.}


\bibliography{midl26_179}


\appendix
\newpage
\subsection{Qualitative Analysis of Feature Embeddings}
\label{sec:tsne_analysis}

To further validate the discriminative capability of our proposed Evidential Retriever, we visualize the learned latent feature spaces using t-Distributed Stochastic Neighbour Embedding (t-SNE). Figure~\ref{fig:tsne_comparison} presents a side-by-side comparison of the test set embeddings generated by our method versus the Evidential Classification baseline across the ISIC, COVID-QU-Ex, and Kvasir datasets. As observed in the visualizations, the Evidential Retriever produces significantly more structured feature representations characterized by improved intra-class compactness and inter-class separability. This is particularly evident in the COVID-QU-Ex dataset, where the baseline classification model yields dispersed and elongated clusters, whereas our method condenses these into distinct, spherical distributions. Similarly, for the ISIC dataset, our approach enforces a clearer margin between different classes compared to the baseline, which exhibits blurred boundaries. We attribute this structural improvement to our unified dual-head framework, which seamlessly integrates uncertainty quantification into the metric learning objective. By jointly optimizing the evidential head and the embedding head, the model effectively leverages uncertainty estimates to regularize the latent space, penalizing ambiguous overlap and encouraging the formation of high-density, class-specific manifolds. This ensures that the learned metric space is not only discriminative but also calibrated, directly supporting the quantitative performance gains reported in Table~\ref{tab:main_results}.
\begin{figure*}[t]
    \centering
    \includegraphics[width=\textwidth]{Figures/tsne_final_cropped.png} % Replace with your actual file path
    \caption{t-SNE visualization of test set embeddings. The top row displays the feature space learned by our proposed \textbf{Evidential Retriever} framework, while the bottom row shows the \textbf{Evidential Classification} baseline. Our method demonstrates significantly improved intra-class compactness and inter-class separability, particularly visible in the ISIC and COVID-QU-Ex datasets.}
    \label{fig:tsne_comparison}
\end{figure*}
\subsection{Qualitative Analysis of Uncertainty (ISIC)} \label{sec:qualitative_isic}

We validated the clinical reliability of the Evidential Retriever by analyzing its uncertainty estimation on the ISIC skin lesion dataset. As shown in the low-uncertainty gallery (Fig.~\ref{fig:isic_qualitative}, a), the model assigns minimal uncertainty ($u < 0.04$) to canonical dermoscopic samples---such as ID:209 and ID:184---which represent the ideal diagnostic scenario characterized by high-contrast lesions, distinct borders, and a complete absence of obstruction; this confirms the model's robustness on the clean, high-density regions of the training distribution. Conversely, the high-uncertainty gallery (Fig.~\ref{fig:isic_qualitative}, b) reveals a robust safety mechanism that detects image quality degradation and external artifacts. Specifically, the model flags \textbf{severe occlusion}, where dense hair blocks the lesion (e.g., ID:233), \textbf{foreign object interference}, such as purple surgical sutures (ID:36) or ink annotations (ID:113), and \textbf{acquisition artifacts}, where immersion fluid bubbles create discordant textures (ID:249). The t-SNE visualization (Fig.~\ref{fig:tsne_all}(a)) corroborates these findings, showing that the most uncertain samples (marked with Red Stars) cluster in sparse regions or at the manifold periphery, confirming that the model correctly isolates these anomalous inputs from the reliable distribution.
\begin{figure}[t]
    \centering
    % Left Column: Low Uncertainty
    \begin{minipage}[t]{0.48\linewidth}
        \centering
        \includegraphics[width=\linewidth]{Figures/ISIC_qualitative_low.jpg}
        \caption*{\textbf{(a) Low Uncertainty (Sanity Check):} The model is highly confident ($u < 0.04$) on canonical dermoscopic images (e.g., ID:209, ID:184) characterized by distinct lesion borders and no artifacts.}
    \end{minipage}
    \hfill
    % Right Column: High Uncertainty
    \begin{minipage}[t]{0.48\linewidth}
        \centering
        \includegraphics[width=\linewidth]{Figures/ISIC_qualitative_high.png}
        \caption*{\textbf{(b) High Uncertainty (Safety Mechanism):} The model correctly flags reliability risks including \textbf{Occlusion} (e.g., ID:233: Dense Hair), \textbf{Foreign Objects} (e.g., ID:36: Sutures), and \textbf{Acquisition Artifacts} (e.g., ID:249: Gel/Bubbles).}
    \end{minipage}
    
    \caption{\textbf{Qualitative Analysis on ISIC Dataset.} The uncertainty score acts as a quality filter, distinguishing between clear diagnostic samples and inputs degraded by occlusion or synthetic artifacts.}
    \label{fig:isic_qualitative}
\end{figure}

% \begin{figure}[h]
%     \centering
%     \includegraphics[width=0.85\linewidth]{Figures/tsne_top15_high_uncertainty.png}
%     \caption{\textbf{t-SNE Visualization of Uncertainty (ISIC).} The top 15 most uncertain samples (Red Stars) cluster in sparse regions or at the manifold edges, confirming that the model treats occluded and artifact-heavy images as distributional outliers.}
%     \label{fig:isic_tsne}
% \end{figure}
% \subsection{Qualitative Analysis of Uncertainty (COVID-QU-Ex)} \label{sec:qualitative_covid}

% To demonstrate the model's reliability on the COVID-QU-Ex dataset, we examined images with the lowest and highest uncertainty scores. As shown in the low-uncertainty gallery (Fig.~\ref{fig:covid_qualitative}, a), the model is most confident ($u \approx 0.05$) on clear, standard X-rays (e.g. ID:4811) that look exactly like the typical training data. In contrast, the high-uncertainty gallery (Fig.~\ref{fig:covid_qualitative}, b) shows that the model correctly flags "odd" or difficult images as unreliable. This includes technical errors-such as scans of \textbf{children} (ID:1609), \textbf{rotated images} (ID:6549), or wires blocking the view (ID:406)-as well as unusual medical cases like severe spinal curvature (ID:5320) or completely \textbf{obscured lungs} (ID:393). The t-SNE plot (Fig.~\ref{fig:tsne_all} (b)) confirms this behavior, showing that these uncertain images are pushed to the edges of the data clusters, far from the standard examples.

% \begin{figure}[t]
%     \centering
%     % Left Column: Low Uncertainty
%     \begin{minipage}[t]{0.48\linewidth}
%         \centering
%         \includegraphics[width=\linewidth]{Figures/Covid_qualitative_low.png}
%         \caption*{\textbf{(a) Low Uncertainty (Sanity Check):} The model is most confident ($u \approx 0.05$) on high-quality, standard X-rays (e.g., ID:4811) that are clear, upright, and free of artifacts.}
%     \end{minipage}
%     \hfill % Spacing between columns
%     % Right Column: High Uncertainty
%     \begin{minipage}[t]{0.48\linewidth}
%         \centering
%         \includegraphics[width=\linewidth]{Figures/Covid_quantitative_high.png}
%         \caption*{\textbf{(b) High Uncertainty (Safety Mechanism):} The model acts as a safety net by correctly flagging unreliable images. This includes \textbf{Technical Errors} (e.g., ID:1609: Pediatric, ID:6549: Rotated Image) and \textbf{Difficult Medical Cases} (e.g., ID:5320: Severe Spine Curvature, ID:393: Obscured Lungs).}
%     \end{minipage}
    
%     \caption{\textbf{Qualitative Analysis on COVID-QU-Ex Dataset.} The uncertainty score reliably distinguishes between standard, clear images and complex or erroneous inputs that require human review.}
%     \label{fig:covid_qualitative}
% \end{figure}

% \begin{figure}[h]
%     \centering
%     \includegraphics[width=0.85\linewidth]{Figures/covid_tsne_top15_high_uncertainty.png}
%     \caption{\textbf{t-SNE Visualization of Uncertainty (COVID-QU-Ex).} The chart shows that the most uncertain samples (Red Stars) are pushed far away from the normal data clusters, confirming that the model treats them as outliers.}
%     \label{fig:covid_tsne}
% \end{figure}

\subsection{Qualitative Analysis of Uncertainty (Kvasir)} \label{sec:qualitative_kvasir}

To validate the clinical safety of the Evidential Retriever, we qualitatively analyze retrieval behaviors under varying levels of epistemic uncertainty. As shown in the low-uncertainty gallery (Fig.~\ref{fig:kvasir_qualitative}, a), the model demonstrates maximum confidence ($u \approx 0.08$) on canonical samples belonging to \textbf{Class 4 (Pylorus)}, where distinct anatomical landmarks facilitate robust retrieval. Conversely, the high-uncertainty gallery (Fig.~\ref{fig:kvasir_qualitative}, b) highlights the model's capacity to flag reliability risks. We observe two predominant sources of uncertainty: (1) \textbf{acquisition artifacts}, where samples containing non-biological features like green UI overlays (e.g., ID:1210) are treated as out-of-distribution, and (2) \textbf{semantic ambiguity}, where borderline cases (e.g., ID:118) lie on the decision boundary between \textbf{Normal Z-line} and \textbf{Esophagitis}. The corresponding t-SNE visualization (Fig.~\ref{fig:tsne_all} (c)) corroborates these observations, showing that high-uncertainty samples cluster at class peripheries or in sparse manifold regions. This confirms that the evidential head effectively grounds uncertainty in the geometry of the embedding space.

\begin{figure}[t]
    \centering
    \begin{minipage}[t]{0.48\linewidth}
        \centering
        \includegraphics[width=\linewidth]{Figures/kvasir_Low_Uncertainty_Gallery_15.png}
        \caption*{\textbf{(a) Low Uncertainty (Sanity Check):} The model is highly confident ($u \approx 0.08$) on distinct anatomical landmarks like the Pylorus (Class 4), retrieving consistent samples.}
    \end{minipage}
    \hfill
    \begin{minipage}[t]{0.48\linewidth}
        \centering
        \includegraphics[width=\linewidth]{Figures/kvasir_High_Uncertainty_Gallery_15.png}
        \caption*{\textbf{(b) High Uncertainty (Safety Mechanism):} The model flags two types of reliability risks: (1) \textbf{OOD Artifacts} (e.g., ID:1210 with green UI overlay) and (2) \textbf{Medical Ambiguity} (e.g., ID:118), where the distinction between Normal Z-line and Esophagitis is visually subtle.}
    \end{minipage}
    \caption{\textbf{Qualitative Analysis on Kvasir Dataset.} The uncertainty score serves as an effective reliability indicator, distinguishing between clear anatomical features and ambiguous or artifact-laden inputs.}
    \label{fig:kvasir_qualitative}
\end{figure}


% \begin{figure}[t]
%     \centering
%     \begin{minipage}[t]{0.85\linewidth}
%         \centering
%         \includegraphics[width=\linewidth]{Figures/kvasir_Low_Uncertainty_Gallery_15.png}
%         \caption*{\textbf{(a) Low Uncertainty (Sanity Check):} The model is highly confident ($u < 0.08$) on distinct anatomical landmarks like the Pylorus (Class 4), retrieving consistent samples.}
%     \end{minipage}
    
%     \vspace{0.5em}
    
%     \begin{minipage}[t]{0.85\linewidth}
%         \centering
%         \includegraphics[width=\linewidth]{Figures/kvasir_High_Uncertainty_Gallery_15.png}
%         \caption*{\textbf{(b) High Uncertainty (Safety Mechanism):} The model flags two types of reliability risks: (1) \textbf{OOD Artifacts} (e.g., ID:1210 with green UI overlay) and (2) \textbf{Medical Ambiguity} (e.g., ID:118), where the distinction between Normal Z-line and Esophagitis is visually subtle.}
%     \end{minipage}
    
%     \caption{\textbf{Qualitative Analysis on Kvasir Dataset.} The uncertainty score serves as an effective reliability indicator, distinguishing between clear anatomical features and ambiguous or artifact-laden inputs.}
%     \label{fig:kvasir_qualitative}
% \end{figure}


% \begin{figure}[h]
%     \centering
%     \includegraphics[width=0.85\linewidth]{Figures/kvasir_tsne_top15_high_uncertainty.png}
%     \caption{\textbf{t-SNE Visualization of Uncertainty (Kvasir).} The top 15 most uncertain samples (Red Stars) cluster at the decision boundaries between ambiguous classes (e.g., Z-line vs. Esophagitis) or in low-density regions (Artifacts), confirming the geometric validity of the uncertainty estimates.}
%     \label{fig:kvasir_tsne}
% \end{figure}


\begin{figure*}[t]
    \centering
    % --- Column 1: ISIC ---
    \begin{minipage}[b]{0.32\textwidth}
        \centering
        \includegraphics[width=\linewidth]{Figures/tsne_top15_high_uncertainty.png}
        \centerline{\textbf{(a) ISIC (Dermatology)}}
    \end{minipage}
    \hfill % Adds flexible space
    % --- Column 2: COVID (Now b) ---
    \begin{minipage}[b]{0.32\textwidth}
        \centering
        \includegraphics[width=\linewidth]{Figures/covid_tsne_top15_high_uncertainty.png}
        \centerline{\textbf{(b) COVID-QU-Ex (X-Ray)}}
    \end{minipage}
    \hfill % Adds flexible space
    % --- Column 3: Kvasir (Now c) ---
    \begin{minipage}[b]{0.32\textwidth}
        \centering
        \includegraphics[width=\linewidth]{Figures/kvasir_tsne_top15_high_uncertainty.png}
        \centerline{\textbf{(c) Kvasir (Endoscopy)}}
    \end{minipage}
    
    \vspace{0.2cm}
    \caption{\textbf{Geometric Interpretation of Uncertainty across Datasets.} t-SNE visualizations showing the embedding space for (a) ISIC, (b) COVID-QU-Ex, and (c) Kvasir. In all cases, the top 15 most uncertain samples (Red Stars) are not randomly distributed; they consistently cluster in sparse regions, at manifold edges (representing artifacts/OOD), or along ambiguous decision boundaries. This confirms that the evidential uncertainty is geometrically well-grounded.}
    \label{fig:tsne_all}
\end{figure*}


\begin{figure}[t]
    \centering


    \includegraphics[width=1.0\linewidth]{Figures/Isic_both_retrievals.png}
    \caption*{(a) \textbf{ISIC (Texture Bias):} Querying with a clear lesion (Left) yields accurate neighbors. Querying with a hair-occluded outlier (Right) triggers texture bias---retrieving irrelevant ``hairy'' images---but is safely flagged by high uncertainty ($u=0.294$).}

    \vspace{1.7em}

  
    \includegraphics[width=1\linewidth]{Figures/Covid_retrieval_analysis.png}
    \caption*{(b) \textbf{COVID-QU-Ex (Geometric Shift):} The canonical upright scan (Left) is retrieved robustly. The rotated outlier (Right) causes mixed-class retrieval due to domain shift but is correctly identified ($u=0.254$) as unreliable.}

    \vspace{1.7em}

    \includegraphics[width=1\linewidth]{Figures/kvasir_both_retrievals.png}
    \caption*{(c) \textbf{Kvasir (Artifacts):} Distinct anatomy (Left) succeeds. The green-artifact outlier (Right) retrieves spurious noise sharing the same artifact, which is effectively flagged ($u=0.220$) to prevent misdiagnosis.}

    % \caption{\textbf{Qualitative Analysis of Retrieval Safety.}  We visualize retrieval results for the highest and lowest uncertainty samples identified in Sec.~\ref{sec:qualitative_isic},\ref{sec:qualitative_covid},\ref{sec:qualitative_kvasir}. While "Ideal Success" queries (Left) result in accurate retrieval, "Safe Failure" queries (Right) demonstrate that the model correctly assigns high uncertainty to inputs degraded by occlusion, rotation, or artifacts, effectively preventing silent failures.}

    \caption{\textbf{Qualitative Analysis of Retrieval Safety.} We visualize the top-3 retrieved neighbors for the highest and lowest uncertainty samples identified in Sec.~\ref{sec:qualitative_isic}, \ref{sec:qualitative_covid}, and \ref{sec:qualitative_kvasir}. Retrieval correctness is color-coded (\textbf{Green}: Correct Class, \textbf{Red}: Incorrect Class). The figure is organized by dataset: \textbf{(a) ISIC} (Texture Bias), \textbf{(b) COVID-QU-Ex} (Geometric Shift), and \textbf{(c) Kvasir} (Artifacts). While ``Ideal Success'' queries (Left) yield consistently accurate (green) retrievals, ``Safe Failure'' queries (Right) demonstrate that the model retrieves incorrect (red) classes when inputs are degraded by occlusion, rotation, or artifacts; crucially, the high uncertainty assignment effectively prevents these from becoming silent failures.}
    \label{fig:retrieval_case_studies}
\end{figure}


\subsection{Qualitative Analysis of Retrieval Safety}
\label{sec:retrieval_analysis}
To validate the practical utility of the Evidential Retriever, we conducted a retrieval case study using the specific samples identified as the least and most uncertain in our previous global analysis in Sec.~\ref{sec:qualitative_isic}, \ref{sec:qualitative_covid}, and \ref{sec:qualitative_kvasir}. By querying with these exact outliers, we directly observe the downstream consequences of data irregularities in Fig.~\ref{fig:retrieval_case_studies}. In the low-uncertainty scenarios (Left Column), the model consistently retrieves semantically relevant neighbors, confirming its robustness on canonical, high-quality data. Conversely, querying with the flagged high-uncertainty samples (Right Column) reveals distinct failure modes: hair-occluded skin lesions trigger texture bias (retrieving other hairy images), rotated chest X-rays cause geometric confusion, and artifact-laden endoscopy images lead to the retrieval of synthetic noise. Crucially, however, the high evidential uncertainty scores correctly identify these predictions as unreliable. This confirms the safety utility of our method: it effectively warns clinicians when retrieval results are driven by occlusion, distribution shifts, or artifacts, thereby preventing ``silent failures'' in the decision-making process.


\subsection{Detailed Comparison with Foundation Models on Specialized Datasets}
\label{sec:appendix_fm_specialized}

To rigorously assess the utility of Foundation Models (FMs) in specialized medical domains, we evaluated \textbf{BiomedCLIP}, \textbf{DINOv2}, and \textbf{RAD-DINO} on the three primary datasets: ISIC (Dermoscopy), COVID-QU-Ex (X-Ray), and Kvasir (Endoscopy). Table~\ref{tab:specialized_fm} details the performance of ``Off-the-shelf'' frozen models versus ``Evidential Versions'' where we train our dual-head architecture on top of the frozen backbones.

\textbf{Analysis:}
\begin{itemize}
    \item \textbf{Domain Specificity Matters:} Off-the-shelf FMs struggle significantly when the target domain mismatches their pre-training data. For instance, \textbf{RAD-DINO}, which is specialized for chest X-rays, performs poorly on the Kvasir endoscopy dataset (mAP 27.21\%) and the ISIC dermoscopy dataset (mAP 50.54\%).
    \item \textbf{The ``Evidential Upgrade'':} Attaching our evidential heads consistently improves performance across all baselines. For example, on the COVID-QU-Ex dataset, adding our heads to RAD-DINO boosts mAP from 50.13\% to \textbf{91.53\%}, demonstrating that our loss formulation effectively adapts general representations to specific tasks while enabling uncertainty estimation.
    \item \textbf{Superiority of Swin-Small:} Despite the scale of these Foundation Models, our proposed \textbf{Evidential Retriever (Swin-Small)} consistently achieves the highest retrieval accuracy (e.g., \textbf{94.98\%} mAP on COVID-QU-Ex), while maintaining strong calibration across these specialized datasets. This confirms that for specialized medical retrieval tasks, a well-optimized, domain-specific architecture trained with our unified loss remains the most effective approach.
\end{itemize}

% \textcolor{red}{
% \begin{table*}[t]
% \centering
% \caption{\textbf{Comparison with Foundation Models on Specialized Datasets.} Off-the-shelf models struggle with domain-specific tasks (e.g., Dermoscopy, Endoscopy). Training our dual-head architecture on top of these backbones (``+ 2 Heads'') yields massive performance gains. However, our \textbf{Evidential Retriever (Swin-Small)} consistently outperforms even the fine-tuned Foundation Models on these datasets.}
% \label{tab:specialized_fm}
% \resizebox{0.95\textwidth}{!}{%
% \begin{tabular}{llccc}
% \toprule
% \textbf{Dataset} & \textbf{Model} & \textbf{mAP $\uparrow$} & \textbf{Recall@1 $\uparrow$} & \textbf{ECE $\downarrow$} \\
% \midrule
% \multicolumn{5}{l}{\textit{\textbf{ISIC 2017 (Dermoscopy)}}} \\
% \midrule
% \multirow{3}{*}{\shortstack[l]{Off-the-shelf FMs\\(Frozen)}} 
%  & BiomedCLIP & 54.29 & 67.17 & - \\
%  & DINOv2 & 54.41 & 68.33 & - \\
%  & RAD-DINO & 50.54 & 59.33 & - \\
% \midrule
% \multirow{4}{*}{\shortstack[l]{Evidential Versions\\(Trained Heads)}} 
%  & Evidential Classification & 70.45 & 74.83 & 0.2213 \\
%  & BiomedCLIP + 2 Heads & 57.17 & 67.33 & 0.1323 \\
%  & DINOv2 + 2 Heads & 61.55 & 72.00 & 0.0877 \\
%  & RAD-DINO + 2 Heads & 56.51 & 61.17 & 0.1777 \\
% \midrule
% \rowcolor{gray!10}
% \textbf{Ours} & \textbf{Evidential Retriever (Swin-S)} & \textbf{73.65} & \textbf{79.67} & \textbf{0.1492} \\
% \midrule
% \multicolumn{5}{l}{\textit{\textbf{COVID-QU-Ex (X-Ray)}}} \\
% \midrule
% \multirow{3}{*}{\shortstack[l]{Off-the-shelf FMs\\(Frozen)}} 
%  & BiomedCLIP & 51.68 & 83.28 & - \\
%  & DINOv2 & 45.41 & 81.03 & - \\
%  & RAD-DINO & 50.13 & 91.21 & - \\
% \midrule
% \multirow{4}{*}{\shortstack[l]{Evidential Versions\\(Trained Heads)}} 
%  & Evidential Classification & 89.76 & 93.17 & 0.0902 \\
%  & BiomedCLIP + 2 Heads & 74.26 & 87.65 & 0.0622 \\
%  & DINOv2 + 2 Heads & 78.96 & 88.13 & 0.1394 \\
%  & \textbf{RAD-DINO + 2 Heads} & \textcolor{red}{\textbf{91.53}} & \textcolor{red}{\textbf{94.58}} & \textcolor{red}{\textbf{0.1817}} \\
% \midrule
% \rowcolor{gray!10}
% \textbf{Ours} & \textbf{Evidential Retriever (Swin-S)} & \textbf{94.98} & \textbf{95.67} & \textbf{0.0581} \\
% \midrule
% \multicolumn{5}{l}{\textit{\textbf{Kvasir (Endoscopy)}}} \\
% \midrule
% \multirow{3}{*}{\shortstack[l]{Off-the-shelf FMs\\(Frozen)}} 
%  & BiomedCLIP & 54.50 & 78.71 & - \\
%  & DINOv2 & 46.20 & 75.00 & - \\
%  & RAD-DINO & 27.21 & 53.54 & - \\
% \midrule
% \multirow{4}{*}{\shortstack[l]{Evidential Versions\\(Trained Heads)}} 
%  & Evidential Classification & 89.96 & 92.29 & 0.0776 \\
%  & BiomedCLIP + 2 Heads & 79.04 & 84.62 & 0.1074 \\
%  & DINOv2 + 2 Heads & 84.09 & 88.42 & 0.1744 \\
%  & RAD-DINO + 2 Heads & 65.15 & 73.00 & 0.0940 \\
% \midrule
% \rowcolor{gray!10}
% \textbf{Ours} & \textbf{Evidential Retriever (Swin-S)} & \textbf{91.99} & \textbf{93.08} & \textbf{0.0593} \\
% \bottomrule
% \end{tabular}
% }
% \end{table*}}


\begin{table*}[t]
\centering
\caption{\textbf{Comparison with Foundation Models.} Off-the-shelf models struggle with domain-specific tasks (e.g., Dermoscopy, Endoscopy). Training our dual-head architecture on top of these frozen backbones (``+ 2 Heads'') yields massive performance gains. However, our \textbf{Evidential Retriever (Swin-Small)} consistently outperforms even the Foundation Models augmented with trained heads on these datasets.}
\label{tab:specialized_fm}
\resizebox{0.95\textwidth}{!}{%
\begingroup
% \color{red}
\begin{tabular}{llccc}
\toprule
\textbf{Dataset} & \textbf{Model} & \textbf{mAP $\uparrow$} & \textbf{Recall@1 $\uparrow$} & \textbf{ECE $\downarrow$} \\
\midrule
\multicolumn{5}{c}{\textit{\textbf{ISIC 2017 (Dermoscopy)}}} \\
\midrule
\multirow{3}{*}{\shortstack[l]{Off-the-shelf FMs\\(Frozen)}} 
 & BiomedCLIP & 54.29 & 67.17 & - \\
 & DINOv2 & 54.41 & 68.33 & - \\
 & RAD-DINO & 50.54 & 59.33 & - \\
\midrule
\multirow{4}{*}{\shortstack[l]{Evidential Versions\\(Trained Heads)}} 
 & Evidential Classification & 70.45 & 74.83 & 0.2213 \\
 & BiomedCLIP + 2 Heads & 57.17 & 67.33 & 0.1323 \\
 & DINOv2 + 2 Heads & 61.55 & 72.00 & \textbf{0.0877} \\
 & RAD-DINO + 2 Heads & 56.51 & 61.17 & 0.1777 \\
\midrule
\rowcolor{gray!10}
\textbf{Ours} & \textbf{Evidential Retriever (Swin-S)} & \textbf{73.65} & \textbf{79.67} & 0.1492 \\
\midrule
\multicolumn{5}{c}{\textit{\textbf{COVID-QU-Ex (X-Ray)}}} \\
\midrule
\multirow{3}{*}{\shortstack[l]{Off-the-shelf FMs\\(Frozen)}} 
 & BiomedCLIP & 51.68 & 83.28 & - \\
 & DINOv2 & 45.41 & 81.03 & - \\
 & RAD-DINO & 50.13 & 91.21 & - \\
\midrule
\multirow{4}{*}{\shortstack[l]{Evidential Versions\\(Trained Heads)}} 
 & Evidential Classification & 89.76 & 93.17 & 0.0902 \\
 & BiomedCLIP + 2 Heads & 74.26 & 87.65 & 0.0622 \\
 & DINOv2 + 2 Heads & 78.96 & 88.13 & 0.1394 \\
 & RAD-DINO + 2 Heads & 91.53 & 94.58 & 0.1817 \\
\midrule
\rowcolor{gray!10}
\textbf{Ours} & \textbf{Evidential Retriever (Swin-S)} & \textbf{94.98} & \textbf{95.67} & \textbf{0.0581} \\
\midrule
\multicolumn{5}{c}{\textit{\textbf{Kvasir (Endoscopy)}}} \\
\midrule
\multirow{3}{*}{\shortstack[l]{Off-the-shelf FMs\\(Frozen)}} 
 & BiomedCLIP & 54.50 & 78.71 & - \\
 & DINOv2 & 46.20 & 75.00 & - \\
 & RAD-DINO & 27.21 & 53.54 & - \\
\midrule
\multirow{4}{*}{\shortstack[l]{Evidential Versions\\(Trained Heads)}} 
 & Evidential Classification & 89.96 & 92.29 & 0.0776 \\
 & BiomedCLIP + 2 Heads & 79.04 & 84.62 & 0.1074 \\
 & DINOv2 + 2 Heads & 84.09 & 88.42 & 0.1744 \\
 & RAD-DINO + 2 Heads & 65.15 & 73.00 & 0.0940 \\
\midrule
\rowcolor{gray!10}
\textbf{Ours} & \textbf{Evidential Retriever (Swin-S)} & \textbf{91.99} & \textbf{93.08} & \textbf{0.0593} \\
\bottomrule
\end{tabular}
\endgroup
}
\end{table*}

{\subsection{Ablation Study: Effect of Backbone Choice}
\label{sec:backbone}
To investigate the impact of architectural design on retrieval quality and uncertainty modeling, we evaluate broadly across two distinct feature extraction families: \textbf{Convolutional Neural Networks} (ResNet-50, DenseNet-121) and \textbf{Vision Transformers} (ViT, Swin). Table~\ref{tab:backbone_ablation} summarizes the performance across the three primary datasets.

While CNN backbones provide competitive baselines, particularly on the texture-heavy COVID-QU-Ex dataset (e.g., DenseNet-121 achieves 93.27\% mAP), they are consistently outperformed by the hierarchical Transformer models. \textbf{Swin-Small} demonstrates superior retrieval accuracy across all tasks, surpassing ResNet-50 by significant margins on ISIC (+3.0\% mAP) and Kvasir (+3.25\% mAP). More importantly, we observe a substantial difference in calibration; CNNs exhibit higher Expected Calibration Error (ECE) compared to Swin Transformers (e.g., on Kvasir, ResNet-50 reaches 0.2059 ECE vs. 0.0593 for Swin-Small), indicating that they are more prone to overconfidence. This confirms that the hierarchical attention mechanism of Swin Transformers is not only effective for capturing fine-grained medical semantics but also crucial for generating reliable evidential uncertainty estimates.


% \subsection{}
\begin{table*}[ht]
\centering
\caption{Backbone comparison across ISIC, Kvasir, and COVID-QU-Ex. We evaluate Transformers (Swin, ViT) and CNNs (ResNet, DenseNet). \textbf{Swin-Small} consistently offers the best balance of retrieval accuracy and calibration. }
\label{tab:backbone_ablation}
\resizebox{\textwidth}{!}{
\begin{tabular}{llcccc}
\toprule
\textbf{Dataset} & \textbf{Backbone} & \textbf{Recall@[1,5,10]} & \textbf{mAP} & \textbf{mP@[1,5,10]} & \textbf{ECE} \\
\midrule
\multirow{6}{*}{\textbf{ISIC}}
& DenseNet121 & [74.00, 89.00, 92.83] & 65.76 & [74.00, 71.23, 70.80] & 0.1544 \\
& ResNet50 & [77.17, 88.83, 91.83] & 70.65 & [77.17, 75.10, 74.38] & 0.1585 \\
& ViT-Base & [73.33, 88.33, 91.17] & 70.38 & [73.33, 73.57, 73.50] & 0.1775 \\
& ViT-Small & [75.17, \textbf{90.17}, \textbf{94.00}] & 71.40 & [75.17, 75.33, 75.02] & 0.1522 \\
& Swin-Tiny & [76.17, 89.33, 92.67] & 72.05 & [76.17, 76.17, 75.68] & \textbf{0.1310} \\
\rowcolor{gray!10}
& \textbf{Swin-Small} & [\textbf{79.67}, 89.17, 91.00] & \textbf{73.65} & [\textbf{79.67}, \textbf{77.33}, \textbf{76.98}] & 0.1492 \\
\midrule
\multirow{6}{*}{\textbf{Kvasir}}
& DenseNet121 & [91.33, 96.46, 98.04] & 88.54 & [91.33, 90.92, 90.86] & 0.2214 \\
& ResNet50 & [89.75, 97.12, \textbf{98.42}] & 88.74 & [89.75, 90.13, 90.25] & 0.2059 \\
& ViT-Base & [93.29, 96.25, 97.04] & 88.80 & [93.29, 92.18, 92.07] & 0.0743 \\
& ViT-Small & [\textbf{93.33}, 96.50, 97.71] & 89.51 & [\textbf{93.33}, \textbf{93.28}, 93.09] & 0.1096 \\
& Swin-Tiny & [93.08, 97.25, 97.92] & 90.44 & [93.08, 93.07, 93.11] & 0.0643 \\
\rowcolor{gray!10}
& \textbf{Swin-Small} & [93.08, \textbf{97.54}, 98.33] & \textbf{91.99} & [93.08, 93.17, \textbf{93.43}] & \textbf{0.0593} \\
\midrule
\multirow{6}{*}{\textbf{COVID-QU}}
& DenseNet121 & [95.11, 97.55, 98.07] & 93.27 & [95.11, 94.60, 94.55] & 0.1126 \\
& ResNet50 & [94.40, \textbf{97.80}, \textbf{98.32}] & 92.44 & [94.40, 94.04, 93.95] & 0.0826 \\
& ViT-Base & [94.83, 97.69, 98.19] & 91.96 & [94.83, 94.44, 94.18] & 0.0599 \\
& ViT-Small & [94.80, 97.58, 98.03] & 92.72 & [94.80, 94.51, 94.35] & 0.0704 \\
& Swin-Tiny & [95.60, 97.70, 98.14] & 93.93 & [95.60, 95.12, 94.99] & 0.0721 \\
\rowcolor{gray!10}
& \textbf{Swin-Small} & [\textbf{95.67}, 97.02, 97.48] & \textbf{94.98} & [\textbf{95.67}, \textbf{95.47}, \textbf{95.44}] & \textbf{0.0581} \\
\bottomrule
\end{tabular}
}
\end{table*}

\subsection{Hyperparameter Tuning: Effect of $\lambda_{\text{reg}}$ in Metric Learning}
\label{sec:hyper_analysis}
To determine the optimal balance between standard metric learning and evidential regularization, we conduct a systematic hyperparameter sweep over the evidential weighting factor $\lambda_{\text{reg}} \in \{0.0, 0.3, 0.7\}$ using the validation sets of ISIC, COVID-QU-Ex, and Kvasir-V2, as shown in Figure~\ref{fig:map_datasets_lambda}. For all experiments, we use the Swin-Small backbone and evaluate retrieval quality using Recall@K, mAP, and mP@K. As the figure shows, for models trained from scratch, $\lambda_{\text{reg}} = 0.0$ (i.e., no regularization) consistently results in the weakest retrieval performance, indicating that regularization plays a crucial role in stabilizing the embedding space. The ISIC dataset achieves its highest validation performance at $\lambda_{\text{reg}} = 0.3$, followed by $\lambda_{\text{reg}} = 0.7$. Both COVID-QU-Ex and Kvasir-V2 obtain their strongest Recall@K, mAP, and mP@K metrics at $\lambda_{\text{reg}} = 0.7$, demonstrating that stronger evidential regularization improves robustness and calibration for datasets with more homogeneous structural patterns. Since $\lambda_{\text{reg}} = 0.7$ performs best on two out of the three primary datasets and remains competitive on ISIC, we adopt $\lambda_{\text{reg}} = 0.7$ as the unified hyperparameter for all subsequent experiments, offering the best trade-off between retrieval accuracy and uncertainty-aware representation learning. For the Foundation Model experiments (e.g., RAD-DINO), we observed across all datasets that $\lambda_{\text{reg}} = 0.0$ yielded the best performance, likely because the pre-trained features are already highly robust and require less regularization.


\begin{figure}[htbp]
  \centering
  \floatconts
    {fig:map_datasets_lambda}
    {\caption{Comparison of mAP Scores Across Datasets for Different Regularization Strengths $\lambda_{\text{reg}}$.}}
    {\includegraphics[width=0.85\linewidth]{Figures/mAP_dataset_vs_lambda.png}}
\end{figure}

\subsection{Implementation Details}
\label{sec:implementation}

We adopt the training recipe outlined in \cite{pmlr-v250-susmitha24a}. All models are optimized using the AdamW optimizer with a learning rate of $3 \times 10^{-5}$ and a weight decay of $5 \times 10^{-4}$ for 10,000 iterations. For the contrastive objective, the margin is set to $\beta = 0.5$. To analyze the effect of regularization, we experiment with weighting factors $\lambda_{\text{reg}} \in \{0.0, 0.3, 0.7\}$. Standard data augmentation techniques are applied during training, including resizing images to $256 \times 256$, followed by a random crop to $224 \times 224$ and random horizontal flipping. The size of the dynamic offline memory queue is set to match the cardinality of each respective dataset. For the evidential classification baselines, we maintain consistent optimizer and iteration settings to ensure a fair comparison. Finally, all retrieval metrics are reported for $K \in \{1, 5, 10\}$.


\subsection{ECE Metric Calculation}
\label{sec:ece_calculation}
To evaluate uncertainty reliability, we report the Expected Calibration Error (ECE), calculated as the weighted average of the absolute difference between empirical accuracy and predicted confidence across $M=10$ bins:
\[
\mathrm{ECE} = \sum_{m=1}^{M} \frac{|B_m|}{N} \, \big| \text{acc}(B_m) - \text{conf}(B_m) \big|,
\]
where $B_m$ denotes the set of samples whose confidence falls into the $m$-th bin and $N$ is the total number of samples.

We adapt the definition of the confidence $\hat{p}_i$ of sample $i$ to the uncertainty mechanism of each method. For analytic methods (Evidential, BTL), confidence is derived directly from the explicit uncertainty score $u_i$ as $\hat{p}_i = 1 - u_i$. For stochastic methods (Deep Ensembles, MC Dropout, PFE), we measure confidence via prediction consistency, defined as the fraction of latent samples or ensemble members agreeing with the majority prediction. This formulation ensures a consistent calibration assessment across both deterministic and probabilistic baselines.

% \section{Proof of Theorem 1}

% This is a boring technical proof of
% \begin{equation}\label{eq:example}
% \cos^2\theta + \sin^2\theta \equiv 1.
% \end{equation}

% \section{Proof of Theorem 2}

% This is a complete version of a proof sketched in the main text.

\end{document}
