\documentclass{midl} 

\usepackage{makecell}
\usepackage{booktabs}
\usepackage{multirow}

\usepackage{svg}
\usepackage{array, xcolor, colortbl}

\jmlryear{2025}
\jmlrworkshop{Full Paper -- MIDL 2025}
\jmlrvolume{-- 080}
\editors{Accepted for publication at MIDL 2025}

\title[Visual Prompt Engineering for VLMs in Radiology]{Visual Prompt Engineering for  \\ Vision Language Models in Radiology}


 

\midlauthor{
\Name{Stefan Denner\nametag{$^{1,2}$}} \Email{stefan.denner@dkfz-heidelberg.de} \AND
\Name{Markus Bujotzek\nametag{$^{1,3}$}} \Email{markus.bujotzek@dkfz-heidelberg.de} \AND
\Name{Dimitrios Bounias\nametag{$^{1,3}$}} \Email{dimitrios.bounias@dkfz-heidelberg.de}\AND
\Name{David Zimmerer\nametag{$^{1}$}} \Email{d.zimmerer@dkfz-heidelberg.de}\AND
\Name{Raphael Stock\nametag{$^{1,2}$}} \Email{raphael.stock@dkfz-heidelberg.de}\AND
\Name{Klaus Maier-Hein\nametag{$^{1,2, 3}$}} \Email{k.maier-hein@dkfz-heidelberg.de}\\
\addr $^{1}$ Division of Medical Image Computing, German Cancer Research Center, Heidelberg, Germany \\
\addr $^{2}$ Faculty of Mathematics and Computer Science, Heidelberg University, Heidelberg, Germany \\
\addr $^{3}$ Medical Faculty Heidelberg, University of Heidelberg, Heidelberg, Germany \\
}

\begin{document}
\maketitle
\begin{abstract}
Medical image classification plays a crucial role in clinical decision-making, yet most models are constrained to a fixed set of predefined classes, limiting their adaptability to new conditions. Contrastive Language-Image Pretraining (CLIP) offers a promising solution by enabling zero-shot classification through multimodal large-scale pretraining. However, while CLIP effectively captures global image content, radiology requires a more localized focus on specific pathology regions to enhance both interpretability and diagnostic accuracy.
To address this, we explore the potential of incorporating visual cues into zero-shot classification, embedding visual markers, such as arrows, bounding boxes, and circles, directly into radiological images to guide model attention. Evaluating across four public chest X-ray datasets, we demonstrate that visual markers improve AUROC by up to 0.185, highlighting their effectiveness in enhancing classification performance. Furthermore, attention map analysis confirms that visual cues help models focus on clinically relevant areas, leading to more interpretable predictions.
To support further research, we use public datasets and provide our codebase and preprocessing pipeline \href{https://github.com/MIC-DKFZ/VPE-in-Radiology}{here}, serving as a reference point for future work on localized classification in medical imaging.
\end{abstract}

\begin{keywords}
Vision-Language Models, Localized Classification, Zero-Shot Classification
\end{keywords}

\section{Introduction}
\label{sec:intro}

\begin{figure*}[ht!]
    \centering
    \includesvg[width=\textwidth]{figures/Fig1.svg}

    \caption{
    Training paradigm of CLIP (left) and how we use it for zero-shot classification (right). 
    CLIP pretrained on biomedical image-text pairs from scientific articles learns semantic representations that align images and text.
    For zero-shot classification, we provide the target image (with a visual marker) and text descriptions of the potential classes.
    Example image (left) from \cite{example_image_1_red_circle_xray}.
    }
    \label{fig:schema}
\end{figure*}

Medical image classification remains a long-standing and critical problem in the field of healthcare. 
Despite the advances in automatic classification approaches, these methods are typically limited to the few specific pathologies they were trained on \cite{holste2024towards}.
This limitation is particularly pronounced due to the vast range of potential pathologies and the insufficient availability of comprehensive training data \cite{langlotz2023future}. 
In contrast, model architectures like Contrastive Language-Image Pre-training (CLIP) \cite{CLIP} have shown great performance by not training for a specific classification task, but instead leveraging large corpora of text-image pairs for pre-training.
CLIP demonstrates that its pre-training task of predicting which caption goes with which image is an efficient and scalable way to learn state-of-the-art image representations.
After pre-training, CLIP enables zero-shot classification by leveraging natural language to reference visual concepts \cite{CLIP}. While CLIP captures global image content, certain applications require a more fine-grained focus on specific regions of interest. This limitation is particularly crucial in radiology, where pathological structures are often small, subtle, and challenging to detect. Moreover, CLIP’s global perspective becomes insufficient when multiple regions of interest exist within a single image \cite{alphaCLIP}, a common scenario in radiology where multiple pathologies frequently coexist in the same scan \cite{padchest-gr, nih14, vindrcxr}.
Additionally, radiologists often identify abnormalities but require assistance in classifying them \cite{microsoftMotivationRadiology}. This highlights the need for models that can prioritize and interpret localized pathological regions rather than relying solely on global image representations.
An intuitive approach to make CLIP focus on a specific region would be to crop the image.
However, this discards the global context of the pathology and might therefore harm classification performance.
Recent works in the natural image domain investigated drawing markers directly on the image, leading to state-of-the-art results in zero-shot tasks \cite{redCircle,Finegraind_visual_prompting}.
They hypothesize that the model has seen the chosen visual markers during training and understands the meaning behind them. 
However, they also indicate that this behavior is more likely to be learned from large datasets and high-capacity models, given the scarcity of such visual markers in the training data \cite{redCircle}.

In radiology, due to limited data availability, a common strategy for training Vision-Language Models (VLMs) involves utilizing public research articles \cite{pubmedclip,BiomedCLIP1,BiomedCLIP2,pmc-clip,lozano2025biomedica,roco,medicat}.
Given the prevalence of visual markers in scientific images (see Appendix Fig.~\ref{fig:example_images}), we hypothesize that VLMs trained on these datasets, despite being smaller than their natural image counterparts, can still recognize and interpret such markers. This capability may enable them to leverage visual cues to guide attention and influence decision-making. Therefore, this work investigates whether visual prompt engineering, i.e., embedding markers within radiological images, enhances zero-shot classification performance.
We evaluate our hypothesis on multiple chest X-ray datasets.
Beyond quantitative analysis, we also provide evidence that the model truly recognizes the visual markers by visualizing attention maps. 
To our knowledge, this is the first study to investigate visual prompt engineering in the radiological domain.

\section{Methods}
\label{sec:methods}

\subsection{Zero-shot Classification with CLIP}
\label{sec:zero_shot_clip}
CLIP \cite{CLIP} classifies images in a zero-shot manner by embedding images and text into a shared space. CLIP consists of two separate encoders: one for images and one for text. Given an image $\mathbf{I} \in \mathbb{R}^{3 \times H \times W}$, CLIP’s image encoder produces an embedding $\phi(\mathbf{I})$, while the text encoder maps an input text $t \in \Sigma^*$ to an embedding $\psi(t)$. Both embeddings lie in a shared latent space. A compatibility score
$    s(\mathbf{I}, t) = \text{cosine}\bigl(\phi(\mathbf{I}), \psi(t)\bigr)$
is computed as the cosine similarity between the image and text embeddings \cite{CLIP}.

To perform classification over $N$ candidate classes, we first define a set of text prompts $\{ \mathbf{T}_i \}$ for $i \in \{1, 2, \ldots, N\}$. Each $\mathbf{T}_i$ describes a class. We then compute the similarity scores $\{s_i\}$ by evaluating 
$
    s_i = s\bigl(\mathbf{I}, \mathbf{T}_i\bigr),
$
for each class $i$. These similarity scores are interpreted as logits for a softmax function:
\begin{equation}
    P\bigl(y=i \;\big|\; \mathbf{I}, \{\mathbf{T}_i\}\bigr)
    =
    \frac{\exp\bigl(s_i\bigr)}{\sum_{j=1}^N \exp\bigl(s_j\bigr)}.
\end{equation}
The final predicted class $\hat{y}$ is taken to be the one with the highest softmax probability:
\begin{equation}
    \hat{y} = \arg\max_{i}\;
    P\bigl(y=i \;\big|\; \mathbf{I}, \{\mathbf{T}_i\}\bigr).
\end{equation}

\subsection{Visual Prompting} \label{sec:visual_prompting}
While encoding an image into a global embedding is effective for broad categorization tasks, this global view can overshadow small or subtle findings. This is particularly problematic in radiology, where pathologies are often localized and subtle. Moreover, multiple pathologies may appear simultaneously in a single scan, each requiring targeted attention. Therefore, it is essential to develop approaches that direct VLMs’ attention to specific regions of interest, rather than relying solely on global image features.
A common approach to incorporate region-specific information into image classification pipelines is cropping, where the image is truncated to the region of interest. This effectively reduces distractions but risks losing global context, which is often critical in radiological assessment. 
Some works, including \cite{MedSam, SAM, alphaCLIP}, integrate region-specific prompting techniques directly into model architectures. These approaches, however, require dedicated training on task-specific data with precise spatial annotations, which is costly and often infeasible in medical imaging.
In contrast, recent works in the natural image domain investigated drawing markers directly on the image, leading to state-of-the-art results in zero-shot tasks \cite{redCircle,Finegraind_visual_prompting}.
This method is particularly appealing because it requires no additional training or fine-tuning, allowing for post-hoc application even in pretrained models. Moreover, it eliminates the need for extensive datasets with spatial annotations, which are scarce in radiology.
While \citet{redCircle} hypothesize that this emergent capability is limited to models trained on very large datasets, we propose that models exposed to scientific literature, which frequently includes visual markers, may also develop this ability.
Therefore, we follow the approach from \citet{redCircle} and draw the visual prompts directly in the image. 
We study a range of visual prompts in shape and color, inspired by common highlighting techniques in the medical literature (Appendix Fig.~\ref{fig:example_images}). Specifically, we experiment with arrows, which point at the target object, and with bounding boxes and circles, which surround the target object.
We assume access to the bounding box coordinates for the region of interest.
For the bounding box marker, the predefined bounding boxes are directly drawn on the image. The circle marker is represented by an ellipse encompassing the given bounding box coordinates. The arrow marker extends from the image center to the bounding box center, with a length of at least 25\% of the smaller image dimension to ensure visibility. If the bounding box center coincides with the image center, a slight offset is applied to avoid a zero-length arrow.

The modified images are then processed by the image encoder and classified using the previously described approach (Sec. \ref{sec:zero_shot_clip}).

\paragraph{Text Prompts}
The text prompts used in our experiments follow a standardized template:
``A chest X-ray with signs of \{class\}.'' For binary malignancy classification, we adapt this format to
``A chest X-ray with a \{malignancy\} \{class\}.'' where \{malignancy\} is either ``malignant'' or ``benign''.

To investigate the effect of explicitly referencing visual markers, we conduct an ablation study by modifying the prompts to include marker descriptions. Specifically, we append
``indicated by a \{color\} \{annotation\}.'' where \{annotation\} represents the type of marker (arrow, bounding box, or circle) and \{color\} corresponds to the applied visual marker color.

\subsection{Evaluation}
\paragraph{Quantitative Evaluation}
We quantitatively evaluate the effect of visual prompts using AUROC for multi-label and binary classification.
In the multi-label setting, we macro-average the class-wise AUROC \cite{hanley1982meaning,maier2024metrics}.
Because multiple pathologies can be present in a single image in the multi-label setting, evaluation without any cropping or prompting is not straightforward: usually, only the text prompt with the highest probability is selected.
Therefore, if there is more than one pathology, we choose the top $M$ predicted classes, with $M$ being the number of ground truth pathologies in the image.
In cases where we apply visual prompting, we only use the highest class probability, since we provide $M$ images with different visual prompts.
This approach slightly favors the non-prompting case: in the prompting case, each prediction is independent, so the same class can be predicted multiple times for one image, which does not occur in the ground truth of our selected datasets.

\paragraph{Explainability}
To assess whether visual prompts improve not only classification performance but also the model’s ability to focus on relevant regions, we employ LeGrad \cite{LeGrad} as an explainability method. 
LeGrad computes gradients with respect to the attention maps of the ViT layers, using these gradients as an explainability signal.
It has demonstrated superior spatial fidelity and robustness to perturbations compared to other state-of-the-art explainability methods \cite{LeGrad}.  
We qualitatively compare the attention maps of images with and without visual prompts to evaluate whether the model focuses on the intended regions.

 

\section{Experiments}
\label{sec:experiments}

\subsection{Dataset}
\label{sec:datasets}
To evaluate our proposed approach, we utilize four public chest X-ray datasets with location annotations for the pathologies. 
A more detailed description of the datasets can be found in Appendix \ref{sec:appendix_datasets}.

\paragraph{PadChest-GR}
 includes 4,555 chest X-ray (CXR) studies with grounded radiology reports and bounding box annotations \cite{padchest-gr}. 
 We filter for samples where each pathology has only a single bounding box to ensure a fair comparison with our cropping baseline.

\paragraph{VinDr-CXR}
includes 18,000 chest X-ray (CXR) scans with radiologist-annotated bounding boxes for 22 findings \cite{vindrcxr}. We use the official train and test split and apply the same filtering criteria to retain only samples with a single bounding box per pathology.

 

\paragraph{ChestX-ray8 (NIH14)}
  consists of 108,948 frontal-view chest X-ray images labeled with eight common thoracic diseases extracted via natural language processing from radiology reports \cite{nih14}. A subset of 983 images includes manually annotated bounding boxes for 1,600 pathology instances, which we use for our study. 

\paragraph{JSRT}
 includes 154 chest X-rays, each with a lung nodule (100 malignant and 54 benign nodules), along with the X and Y coordinates and the size of each nodule \cite{JSRT}. 

\subsection{Models}
We evaluate our proposed approach on two biomedical vision-language models, BiomedCLIP and BMCA-CLIP, both trained on scientific biomedical image-text pairs from PubMed Central (PMC) using the CLIP framework.
For both models, we use the official HuggingFace \cite{huggingface} models and implementation, including the preprocessing. 

\paragraph{BiomedCLIP} is pretrained on 15 million PMC-derived image-text pairs and adapts CLIP for biomedical tasks, using PubMedBERT as the text encoder and an ImageNet-pretrained ViT-B/16 as the image encoder. It has demonstrated state-of-the-art performance in image classification, retrieval, and visual question answering (VQA), even outperforming some radiology-specific models on chest X-ray benchmarks \cite{BiomedCLIP1}.

\paragraph{BMCA-CLIP} is trained on 24 million image-text pairs from BIOMEDICA and extends this approach with continual pretraining and streaming-based optimization, using a ViT-L/14 image encoder and a PubMedBERT text encoder. It achieves state-of-the-art zero-shot classification across 40 biomedical tasks while requiring 10$\times$ less compute than previous models \cite{lozano2025biomedica}.

 

 

\begin{table}
\caption{Zero-shot classification performance (AUROC) of BiomedCLIP and BMCA-CLIP on four chest X-ray datasets. Across most datasets, visual prompt markers improve the classification performance. In most cases, mentioning the marker in the text prompt further improves the performance. Colors are normalized by model and column.}
\centering
\label{tab:results}
\begin{tabular}{c|cc|ccc|cc|c|c}
\toprule
\multirow{2}{*}{} & \multirow{2}{*}{\begin{tabular}[c]{@{}c@{}}Visual\\ Prompt\end{tabular}} & \multirow{2}{*}{\begin{tabular}[c]{@{}c@{}}Marker in\\ text prompt\end{tabular}} & \multicolumn{3}{c|}{PadChest-GR} & \multicolumn{2}{c|}{VinDr-CXR} & NIH14 & JSRT \\
 & &  & Train & Val & Test & Train & Test &  &  \\
 \midrule
\multirow{8}{*}{\rotatebox{90}{BiomedCLIP}} & \multicolumn{2}{c|}{No visual prompt} &  {\cellcolor[HTML]{A50026}} \color[HTML]{F1F1F1} 0.607 & {\cellcolor[HTML]{A50026}} \color[HTML]{F1F1F1} 0.633 & {\cellcolor[HTML]{A50026}} \color[HTML]{F1F1F1} 0.621 & {\cellcolor[HTML]{A50026}} \color[HTML]{F1F1F1} 0.612 & {\cellcolor[HTML]{A50026}} \color[HTML]{F1F1F1} 0.629 & {\cellcolor[HTML]{A50026}} \color[HTML]{F1F1F1} 0.705 & {\cellcolor[HTML]{A7D96B}} \color[HTML]{000000} 0.550 \\
& \multicolumn{2}{c|}{Crop} & {\cellcolor[HTML]{2AA054}} \color[HTML]{F1F1F1} 0.751 & {\cellcolor[HTML]{FFFAB6}} \color[HTML]{000000} 0.715 & {\cellcolor[HTML]{93D168}} \color[HTML]{000000} 0.744 & {\cellcolor[HTML]{FEDC88}} \color[HTML]{000000} 0.659 & {\cellcolor[HTML]{ECF7A6}} \color[HTML]{000000} 0.693 & {\cellcolor[HTML]{7FC866}} \color[HTML]{000000} 0.758 & {\cellcolor[HTML]{A50026}} \color[HTML]{F1F1F1} 0.508 \\
\cmidrule{2-10}
& Arrow &  &  {\cellcolor[HTML]{A7D96B}} \color[HTML]{000000} 0.722 & {\cellcolor[HTML]{FFF6B0}} \color[HTML]{000000} 0.713 & {\cellcolor[HTML]{D1EC86}} \color[HTML]{000000} 0.726 & {\cellcolor[HTML]{2AA054}} \color[HTML]{F1F1F1} 0.717 & {\cellcolor[HTML]{15904C}} \color[HTML]{F1F1F1} 0.736 & {\cellcolor[HTML]{FEE695}} \color[HTML]{000000} 0.734 & {\cellcolor[HTML]{B9E176}} \color[HTML]{000000} 0.548 \\
& Arrow & \checkmark &  {\cellcolor[HTML]{9BD469}} \color[HTML]{000000} 0.724 & {\cellcolor[HTML]{F2FAAE}} \color[HTML]{000000} 0.723 & {\cellcolor[HTML]{C1E57B}} \color[HTML]{000000} 0.731 & {\cellcolor[HTML]{006837}} \color[HTML]{F1F1F1} 0.732 & {\cellcolor[HTML]{006837}} \color[HTML]{F1F1F1} 0.745 & {\cellcolor[HTML]{FEE797}} \color[HTML]{000000} 0.735 & {\cellcolor[HTML]{E9F6A1}} \color[HTML]{000000} 0.541 \\
& BBox &  &  {\cellcolor[HTML]{6BBF64}} \color[HTML]{000000} 0.737 & {\cellcolor[HTML]{57B65F}} \color[HTML]{F1F1F1} 0.772 & {\cellcolor[HTML]{33A456}} \color[HTML]{F1F1F1} 0.768 & {\cellcolor[HTML]{CBE982}} \color[HTML]{000000} 0.687 & {\cellcolor[HTML]{69BE63}} \color[HTML]{F1F1F1} 0.722 & {\cellcolor[HTML]{6EC064}} \color[HTML]{000000} 0.760 & {\cellcolor[HTML]{F16640}} \color[HTML]{F1F1F1} 0.519 \\
& BBox & \checkmark &  {\cellcolor[HTML]{108647}} \color[HTML]{F1F1F1} 0.761 & {\cellcolor[HTML]{006837}} \color[HTML]{F1F1F1} 0.803 & {\cellcolor[HTML]{097940}} \color[HTML]{F1F1F1} 0.784 & {\cellcolor[HTML]{9BD469}} \color[HTML]{000000} 0.698 & {\cellcolor[HTML]{33A456}} \color[HTML]{F1F1F1} 0.730 & {\cellcolor[HTML]{006837}} \color[HTML]{F1F1F1} 0.775 & {\cellcolor[HTML]{DCF08F}} \color[HTML]{000000} 0.543 \\
& Circle &  &  {\cellcolor[HTML]{219C52}} \color[HTML]{F1F1F1} 0.753 & {\cellcolor[HTML]{1E9A51}} \color[HTML]{F1F1F1} 0.784 & {\cellcolor[HTML]{17934E}} \color[HTML]{F1F1F1} 0.775 & {\cellcolor[HTML]{DAF08D}} \color[HTML]{000000} 0.683 & {\cellcolor[HTML]{45AD5B}} \color[HTML]{F1F1F1} 0.727 & {\cellcolor[HTML]{5AB760}} \color[HTML]{F1F1F1} 0.762 & {\cellcolor[HTML]{70C164}} \color[HTML]{000000} 0.555 \\
& Circle & \checkmark &  {\cellcolor[HTML]{006837}} \color[HTML]{F1F1F1} 0.771 & {\cellcolor[HTML]{06733D}} \color[HTML]{F1F1F1} 0.799 & {\cellcolor[HTML]{006837}} \color[HTML]{F1F1F1} 0.791 & {\cellcolor[HTML]{F5FBB2}} \color[HTML]{000000} 0.675 & {\cellcolor[HTML]{51B35E}} \color[HTML]{F1F1F1} 0.725 & {\cellcolor[HTML]{06733D}} \color[HTML]{F1F1F1} 0.773 & {\cellcolor[HTML]{006837}} \color[HTML]{F1F1F1} 0.568 \\
\bottomrule
\toprule

\multirow{8}{*}{\rotatebox{90}{BMCA-CLIP}}  & \multicolumn{2}{c|}{No visual prompt} &  {\cellcolor[HTML]{A50026}} \color[HTML]{F1F1F1} 0.582 & {\cellcolor[HTML]{A50026}} \color[HTML]{F1F1F1} 0.613 & {\cellcolor[HTML]{A50026}} \color[HTML]{F1F1F1} 0.604 & {\cellcolor[HTML]{A50026}} \color[HTML]{F1F1F1} 0.526 & {\cellcolor[HTML]{A50026}} \color[HTML]{F1F1F1} 0.589 & {\cellcolor[HTML]{A50026}} \color[HTML]{F1F1F1} 0.624 & {\cellcolor[HTML]{FA9B58}} \color[HTML]{000000} 0.484 \\
& \multicolumn{2}{c|}{Crop} & {\cellcolor[HTML]{B3DF72}} \color[HTML]{000000} 0.706 & {\cellcolor[HTML]{FEEDA1}} \color[HTML]{000000} 0.692 & {\cellcolor[HTML]{B5DF74}} \color[HTML]{000000} 0.728 & {\cellcolor[HTML]{F7FCB4}} \color[HTML]{000000} 0.577 & {\cellcolor[HTML]{F67C4A}} \color[HTML]{F1F1F1} 0.606 & {\cellcolor[HTML]{006837}} \color[HTML]{F1F1F1} 0.701 & {\cellcolor[HTML]{006837}} \color[HTML]{F1F1F1} 0.548 \\
\cmidrule{2-10}
& Arrow &  &  {\cellcolor[HTML]{E2F397}} \color[HTML]{000000} 0.688 & {\cellcolor[HTML]{FFF7B2}} \color[HTML]{000000} 0.698 & {\cellcolor[HTML]{E6F59D}} \color[HTML]{000000} 0.709 & {\cellcolor[HTML]{128A49}} \color[HTML]{F1F1F1} 0.617 & {\cellcolor[HTML]{73C264}} \color[HTML]{000000} 0.648 & {\cellcolor[HTML]{F57245}} \color[HTML]{F1F1F1} 0.640 & {\cellcolor[HTML]{BFE47A}} \color[HTML]{000000} 0.517 \\
& Arrow & \checkmark &  {\cellcolor[HTML]{DCF08F}} \color[HTML]{000000} 0.691 & {\cellcolor[HTML]{F7FCB4}} \color[HTML]{000000} 0.706 & {\cellcolor[HTML]{E0F295}} \color[HTML]{000000} 0.711 & {\cellcolor[HTML]{0F8446}} \color[HTML]{F1F1F1} 0.618 & {\cellcolor[HTML]{57B65F}} \color[HTML]{F1F1F1} 0.651 & {\cellcolor[HTML]{EC5C3B}} \color[HTML]{F1F1F1} 0.638 & {\cellcolor[HTML]{8ECF67}} \color[HTML]{000000} 0.525 \\
& BBox &  &  {\cellcolor[HTML]{0C7F43}} \color[HTML]{F1F1F1} 0.757 & {\cellcolor[HTML]{006837}} \color[HTML]{F1F1F1} 0.791 & {\cellcolor[HTML]{006837}} \color[HTML]{F1F1F1} 0.789 & {\cellcolor[HTML]{84CA66}} \color[HTML]{000000} 0.599 & {\cellcolor[HTML]{128A49}} \color[HTML]{F1F1F1} 0.660 & {\cellcolor[HTML]{E2F397}} \color[HTML]{000000} 0.668 & {\cellcolor[HTML]{FFF5AE}} \color[HTML]{000000} 0.502 \\
& BBox & \checkmark &  {\cellcolor[HTML]{138C4A}} \color[HTML]{F1F1F1} 0.752 & {\cellcolor[HTML]{04703B}} \color[HTML]{F1F1F1} 0.788 & {\cellcolor[HTML]{108647}} \color[HTML]{F1F1F1} 0.777 & {\cellcolor[HTML]{91D068}} \color[HTML]{000000} 0.598 & {\cellcolor[HTML]{42AC5A}} \color[HTML]{F1F1F1} 0.653 & {\cellcolor[HTML]{F2FAAE}} \color[HTML]{000000} 0.665 & {\cellcolor[HTML]{A50026}} \color[HTML]{F1F1F1} 0.461 \\
& Circle &  &  {\cellcolor[HTML]{036E3A}} \color[HTML]{F1F1F1} 0.763 & {\cellcolor[HTML]{0E8245}} \color[HTML]{F1F1F1} 0.781 & {\cellcolor[HTML]{07753E}} \color[HTML]{F1F1F1} 0.783 & {\cellcolor[HTML]{18954F}} \color[HTML]{F1F1F1} 0.614 & {\cellcolor[HTML]{108647}} \color[HTML]{F1F1F1} 0.660 & {\cellcolor[HTML]{A2D76A}} \color[HTML]{000000} 0.678 & {\cellcolor[HTML]{FBFDBA}} \color[HTML]{000000} 0.505 \\
& Circle & \checkmark &  {\cellcolor[HTML]{006837}} \color[HTML]{F1F1F1} 0.766 & {\cellcolor[HTML]{05713C}} \color[HTML]{F1F1F1} 0.788 & {\cellcolor[HTML]{0B7D42}} \color[HTML]{F1F1F1} 0.780 & {\cellcolor[HTML]{006837}} \color[HTML]{F1F1F1} 0.624 & {\cellcolor[HTML]{006837}} \color[HTML]{F1F1F1} 0.665 & {\cellcolor[HTML]{9BD469}} \color[HTML]{000000} 0.679 & {\cellcolor[HTML]{FFFCBA}} \color[HTML]{000000} 0.503 \\
\bottomrule
\end{tabular}
\end{table}

\subsection{Visual Prompting Details}

For all datasets, we utilize the provided location information of pathologies to generate visual prompts. These prompts are based on bounding box coordinates, which highlight the regions of interest within the images.
For the JSRT dataset, nodule locations are given as center coordinates (X, Y) along with the nodule size in mm. To define a bounding box, we first convert the nodule size from millimeters to pixels based on the dataset resolution. A square bounding box is then centered at (X, Y), with its side length equal to the converted nodule size in pixels.
For all other datasets, bounding box coordinates are directly provided.
As a baseline, we evaluate cropping, where the image is cropped around the bounding box centroid while maintaining a minimum crop size. The cropping dimensions are dynamically adjusted to be at least the bounding box size or 20\% of the image size, ensuring a balance between focus on the pathology and preserving contextual information. To prevent out-of-bounds errors, the final crop is constrained within the image boundaries.

Through preliminary experiments, we find that a red line with a width of 1 achieves the highest average performance across models and visual markers (Appendix Table \ref{tab:linewidth_ablation} and Table \ref{tab:color_ablation}). Therefore, all experiments are conducted using this configuration.

\section{Results}
\label{sec:results}

\subsection{Quantitative Results}
Our results (Table \ref{tab:results}) show that focusing the model on the region of interest, either via cropping or visual markers, consistently improves zero-shot classification performance compared to no visual prompt on the three multi-label datasets (PadChest-GR, VinDr-CXR, and NIH14), whereas results on the binary JSRT dataset are mixed. This confirms that guiding attention to pathology regions enhances the model’s discriminative capabilities.
Among visual marker techniques, bounding boxes and circles emerge as the most robust choices across datasets, with circle prompts performing particularly well. Cropping remains competitive but is generally outperformed by visual markers, except for BMCA-CLIP on NIH14 and JSRT, where cropping achieves the highest performance.
For BiomedCLIP, visual markers consistently outperform cropping on all datasets, with bounding boxes and circles leading on PadChest-GR, and arrows performing best on VinDr-CXR.

While the JSRT dataset is only binary, the task of differentiating between malignant and benign nodules is notoriously challenging even for experienced radiologists \cite{macmahon2017guidelines}. This inherent difficulty likely underlies the uniformly low performance scores observed across all approaches. 
Notably, however, the circle marker in BiomedCLIP achieves the highest performance.

As shown in Appendix Fig. \ref{fig:abl_pathology_size}, small pathologies particularly benefit from visual prompting. This is likely because, without explicit markers, smaller lesions are more prone to being overlooked by the models. Visual prompts therefore appear especially beneficial when pathology visibility is low, further underscoring their relevance in radiological applications.

Additionally, incorporating visual marker descriptions into text prompts further enhances performance in most cases, indicating a synergistic effect between textual and visual cues. This suggests that explicitly referencing visual prompts in text helps align the model’s attention with the pathology region.

Since visual markers directly modify the input images, they inherently carry the risk of occluding diagnostically relevant features. Additionally, their effectiveness could depend on precise marker placement and size. To assess the robustness of visual prompting under realistic variations, we conduct ablation studies evaluating sensitivity to spatial shifts and marker scaling.

In Appendix Fig.~\ref{fig:ablation}, panel (a) quantifies the effect of shifting the marker up to 25\% away from the ground truth location in randomly chosen directions. This simulates realistic localization uncertainty. Despite a gradual performance decline with increasing displacement, visual prompting consistently outperforms the no-visual-prompt baseline across shifts. This highlights the approach’s robustness to moderate localization errors.

To further evaluate robustness, we assess sensitivity to changes in marker size (Appendix Fig.~\ref{fig:ablation}, panel (b)). Specifically, we shrink and enlarge the markers by up to 25\% relative to the original ground truth bounding box. While performance varies with marker size, all visual prompting conditions substantially outperform the no-prompting baseline.

In most cases, shrinking the marker reduces performance, likely due to insufficient visibility of the diagnostically relevant regions. Interestingly, for many configurations, enlarging the marker beyond the ground truth region actually improves performance, suggesting that slightly expanding the highlighted area can enhance the model’s ability to detect the pathology.

\begin{figure}[t!]
    \centering
    \includesvg[width=\textwidth]{figures/xai_tsne_rebuttal.svg}
      \caption{(a) Input images and LeGrad attention maps for BMCA-CLIP with different visual prompts. Each row corresponds to a distinct visual prompt. The first and fourth columns display the input images, while the remaining columns show LeGrad attention maps. The second and fifth columns depict attention maps when no visual marker description was included in the text prompt, whereas the third and sixth columns show attention maps when the visual marker was explicitly mentioned.
      (b) t-SNE projection of single-class samples from the PadChest-GR dataset, with pathologies color-coded. The top plot represents BMCA-CLIP's image embeddings without visual prompts, while the bottom plot shows embeddings with a red circle prompt. The addition of visual prompts enhances clustering, suggesting improved model focus on pathology-relevant features.
      }
    \label{fig:xai_tsne}
\end{figure}

\subsection{Qualitative Results}
\paragraph{Explainability}
To better understand the impact of visual prompts, we employ LeGrad \cite{LeGrad}, an explainability method that visualizes model attention. When visual markers are mentioned in the text prompt, the model demonstrates increased focus on the relevant pathology regions, as shown in the attention maps (Fig. \ref{fig:xai_tsne}(a)). This suggests that visual prompts not only improve classification performance but also enhance model interpretability, ensuring that the model attends to clinically relevant areas.

\paragraph{t-SNE}
Visual prompt markers alter the input image while refining the model’s focus, which should ideally result in more distinct and pathology-aligned feature embeddings. To test this hypothesis, we analyze embedding clusters using t-SNE \cite{tsne}. Specifically, we apply t-SNE to a single-class subset of PadChest-GR to observe whether visual prompting improves the clustering of pathology representations.

As shown in Fig.~\ref{fig:xai_tsne}(b), pathology clusters appear more distinct and well-separated when using a circle visual prompt, compared to no visual prompt. This indicates that visual prompting enhances feature representation, making embeddings more discriminative and aligned with pathology characteristics.

\section{Conclusion}
\label{sec:dicussion_and_conclusion}

This study demonstrates that incorporating visual cues can significantly enhance the zero-shot classification performance of Vision-Language Models (VLMs) for radiological images. By leveraging visual markers such as arrows, bounding boxes, and circles, alongside corresponding text prompts, we observed consistent performance improvements across multiple public datasets. Beyond improving classification accuracy, our results show that visual cues help guide model attention to clinically relevant areas, as evidenced by attention maps and feature clustering analyses.

Importantly, our work goes beyond visual prompt engineering by exploring how spatial information can improve zero-shot localized classification. To support further research, we rely exclusively on public datasets and release our code and preprocessing pipeline, allowing for standardized benchmarking in localized classification for medical imaging. We hope this serves as a useful reference for future work and contributes to improving the integration of visual cues in zero-shot medical image classification.

\clearpage  

\midlacknowledgments{We thank Piotr Kalinowski, Julius Holzschuh and Paul F. Jäger. This study was partially funded by NUM 2.0 (FKZ: 01KX2121).}

\bibliography{midl25_080}

\newpage
\appendix




\begin{figure}[t!]
    \centering
    \includegraphics[width=\textwidth]{figures/example_images_2.pdf}
    \caption{Example figures from PubMed Central~\cite{example_image_2_3_ct,example_image_4_ct} containing visual markers to guide the reader to specific regions of interest.
    Those markers are also referred to in the figure descriptions.
    }
    \label{fig:example_images}
\end{figure}


\section{Datasets}
\label{sec:appendix_datasets}
 
 \paragraph{PadChest-GR}
 includes 4,555 chest X-ray (CXR) studies with grounded radiology reports and bounding box annotations \cite{padchest-gr}. 
 We use the official train, validation, and test split but filter for samples where each pathology has only a single bounding box to ensure fair comparison with our cropping baseline.
For the training set, this results in 1,547 images with a total of 19 classes: hiatal hernia, vascular hilar enlargement, atelectasis, cardiomegaly, nodule, aortic atheromatosis, aortic elongation, scoliosis, vertebral degenerative changes, alveolar pattern, hypoexpansion, pleural effusion, hemidiaphragm elevation, fracture, pleural thickening, hyperinflated lung, goiter, bronchiectasis, interstitial pattern.
For the validation set, this results in 221 samples and 20 classes, with the same classes as the training set, plus osteopenia.
For the test set, this results in 446 samples and 19 classes, which are the same classes as in the training set (i.e., osteopenia is not present).

\paragraph{VinDr-CXR}
includes 18,000 chest X-ray (CXR) scans with radiologist-annotated bounding boxes for 22 findings \cite{vindrcxr}. We use the official train and test split and limit our selection to samples where each pathology has only a single bounding box, ensuring fair comparison with our cropping baseline. Furthermore, we exclude samples labeled as `Other lesion' due to their lack of specificity.
For the training set, this results in 2,602 images with a total of 21 classes: Infiltration, Lung Opacity, Consolidation, Nodule/Mass, Aortic enlargement, Cardiomegaly, Pleural effusion, Pulmonary fibrosis, Pleural thickening, Enlarged PA, ILD, Lung cavity, Atelectasis, Calcification, Mediastinal shift, Clavicle fracture, Pneumothorax, Rib fracture, Emphysema, Lung cyst, Edema.
For the test set, this results in 609 samples with the same classes as the training set, except for Emphysema, Lung cyst, and Edema, which are missing.


\begin{table}[]
\caption{Ablation study on line width of visual prompt marker. We fix the marker color to red and evaluate AUROC on the PadChest-GR test set, averaging results across conditions where the marker was and was not mentioned in the text prompt.}
\centering
\begin{tabular}{l|ccccccc}
 & 1 & 2 & 3 & 4 & 5 & 7 & 10 \\ 
\midrule
BiomedCLIP & & & & & & & \\
\hspace{15mm}  Arrow & \textbf{0.728} & 0.718 & 0.712 & 0.709 & 0.699 & 0.689 & 0.685 \\
\hspace{15mm}  BBox &\textbf{0.776} & 0.770 & 0.761 & 0.756 & 0.751 & 0.736 & 0.720 \\
\hspace{15mm} Circle & \textbf{0.783} & 0.775 & 0.770 & 0.764 & 0.758 & 0.746 & 0.741 \\
\midrule
BMCA-CLIP & & & & & & & \\
\hspace{15mm}  Arrow & \textbf{0.710} & 0.706 & 0.705 & 0.702 & 0.697 & 0.691 & 0.679 \\
\hspace{15mm}  BBox & \textbf{0.783} & 0.778 & 0.772 & 0.764 & 0.755 & 0.741 & 0.717 \\
\hspace{15mm}  Circle & \textbf{0.782} & 0.765 & 0.761 & 0.755 & 0.752 & 0.742 & 0.714 \\
\midrule
\midrule
Average & \textbf{0.760} & 0.752 & 0.747 & 0.742 & 0.735 & 0.724 & 0.709 \\
\end{tabular}
\label{tab:linewidth_ablation}
\end{table}

\begin{table}[]
\caption{Ablation study on color of the visual prompt marker. We fix the line width to 1 and evaluate AUROC on the PadChest-GR test set, averaging results across conditions where the marker was and was not mentioned in the text prompt.}
\centering
\begin{tabular}{l|ccccccc}
 & Black & Blue & Green & Orange & Red & White & Yellow \\ 
\midrule
BiomedCLIP & & & & & & & \\
\hspace{15mm}  Arrow & \textbf{0.730} & 0.718 & 0.712 & 0.709 & 0.728 & 0.723 & 0.708 \\
\hspace{15mm}  BBox & 0.760 & 0.753 & 0.775 & 0.765 & \textbf{0.776} & 0.745 & 0.759 \\
\hspace{15mm}  Circle & 0.772 & 0.780 & \textbf{0.794} & 0.779 & 0.783 & 0.752 & 0.773 \\
\midrule
BMCA-CLIP & & & & & & & \\
\hspace{15mm}  Arrow & 0.691 & 0.704 & 0.694 & \textbf{0.713} & 0.710 & 0.696 & \textbf{0.713} \\
\hspace{15mm}  BBox & 0.757 & 0.776 & 0.768 & 0.769 & \textbf{0.783} & 0.743 & 0.768 \\
\hspace{15mm}  Circle & 0.765 & \textbf{0.783} & 0.779 & 0.782 & 0.782 & 0.765 & 0.780 \\
\midrule
\midrule
Average & 0.746 & 0.752 & 0.754 & 0.753 & \textbf{0.760} & 0.737 & 0.750
\end{tabular}
\label{tab:color_ablation}
\end{table}

\begin{figure}[ht!]
    \centering
    \includesvg[width=0.85\textwidth]{figures/Ablation.svg}
    \caption{
    Ablation study on robustness. (a) Evaluation of robustness to spatial shifts of the visual prompt across models and datasets.
    (b) Performance assessment across different scales of the visual prompt marker.
    }
    \label{fig:ablation}
\end{figure}

\begin{figure}[t!]
    \centering
    \includesvg[width=\textwidth]{figures/pred_vs_size.svg}
    \caption{
        Average predicted probability (y-axis) as a function of pathology size (x-axis) for the PadChest-GR training dataset. For each image, we extract the softmax probability assigned to the ground truth class and compute a moving average across pathology sizes. Results are shown for both models: BiomedCLIP (top) and BMCA-CLIP (bottom). Small pathologies, in particular, benefit from visual prompting.
    }
    \label{fig:abl_pathology_size}
\end{figure}

\end{document}