% This is a modified version of Springer's LNCS template suitable for anonymized MICCAI 2025 main conference submissions. 
% Original file: samplepaper.tex, a sample chapter demonstrating the LLNCS macro package for Springer Computer Science proceedings; Version 2.21 of 2022/01/12

\documentclass[runningheads]{llncs}
%
\usepackage[T1]{fontenc}
% T1 fonts will be used to generate the final print and online PDFs,
% so please use T1 fonts in your manuscript whenever possible.
% Other font encodings may result in incorrect characters.
%
\usepackage{graphicx,verbatim}
\usepackage{xcolor} % Get rid of for final version
\usepackage{lipsum} % Get rid of for final version
\usepackage{hyperref}
\usepackage{amsmath}
\setlength{\belowcaptionskip}{-7pt}
%\usepackage[style=numeric, maxnames=6, backend=biber]{biblatex}
%\addbibresource{mybibliography.bib}
% Used for displaying a sample figure. If possible, figure files should
% be included in EPS format.
%
% If you use the hyperref package, please uncomment the following two lines
% to display URLs in blue roman font according to Springer's eBook style:
\usepackage{color}
\renewcommand\UrlFont{\color{blue}\rmfamily}
%\urlstyle{rm}
%
\begin{document}
%
\title{GroundingDINO for Open-Set Lesion Detection in Medical Imaging}
%
% \begin{comment}  %% Removed for anonymized MICCAI 2025 submission
\author{Samuel J. Roughley\inst{1} \and
Johanna P. Müller\inst{2} \and
Shangqi Gao\inst{3,4} \and
Zeyu Gao\inst{3,4} \and
Marta Ligero\inst{5} \and
Rudolfs Blums\inst{6} \and
Mireia Crispin-Ortuzar\inst{3,4} \and
Julia Schnabel\inst{6,7,8,9} \and
Bernhard Kainz\inst{2,10} \and
Cosmin I. Bercea\inst{6,7} \and
Ines Prata Machado\inst{3,4}
}
%
\authorrunning{S. J. Roughley et al.}
% First names are abbreviated in the running head.
% If there are more than two authors, 'et al.' is used.
%
% \institute{Princeton University, Princeton NJ 08544, USA \and
% Springer Heidelberg, Tiergartenstr. 17, 69121 Heidelberg, Germany
% \email{lncs@springer.com}\\
% \url{http://www.springer.com/gp/computer-science/lncs} \and
% ABC Institute, Rupert-Karls-University Heidelberg, Heidelberg, Germany\\
% \email{\{abc,lncs\}@uni-heidelberg.de}}
\institute{Department of Physics, University of Cambridge, UK \and
Friedrich-Alexander University Erlangen-Nürnberg, Erlangen,
Germany \and
Department of Oncology, University of Cambridge, UK \and
Early Cancer Institute, University of Cambridge, UK \and
Else Kroener Fresenius Center for Digital Health, Technical University
Dresden, Dresden, Germany \and
Technical University of Munich, Munich, Germany \and
Helmholtz AI and Helmholtz Center Munich, Germany \and
Munich Center for Machine Learning, Germany \and
King's College London, UK \and
Imperial College London, UK
}

% \end{comment}

% \author{Anonymized Authors}  %% Added for anonymized MICCAI 2025 submission
% \authorrunning{Anonymized Author et al.}
% \institute{Anonymized Affiliations \\
%     \email{email@anonymized.com}}

\maketitle % typeset the header of the contribution
%
%\vspace{-20pt}
\begin{abstract}
Open-world anomaly detection is a task in which machine learning is well-positioned to advance cancer diagnosis, potentially leading to significantly improved survival rates. For a model to be used in clinical settings, it must demonstrate high performance, robustness, and generalisability. A common approach to achieving high generalisability is to incorporate information from broader representations within the model. In this work, we investigate the application of GroundingDINO to medical anomaly detection and localisation, evaluating both its overall performance and the influence of text prompts. We find that GroundingDINO outperforms the YOLOv11n model even with minimal use of contextual information. When exploring methods to introduce more contextual information, we observe that specifying the organ within the prompt improves closed-set performance on rarer lesion classes. However, adding visual descriptions of lesions during training leads to a significant performance drop on those subsets, indicating that the model memorises prompt-image pairs rather than learning meaningful semantic relationships. Our work highlights a critical limitation of GroundingDINO in medical imaging and proposes targeted modifications to the model architecture or training strategies as promising directions for utilising richer semantic prompts to improve anomaly detection.

\keywords{Anomaly Detection \and GroundingDINO \and Prompt Engineering \and Medical Imaging \and Lesion Detection \and Cancer Research}

\end{abstract}

\section{Introduction}
\label{sec: Introduction}
Early detection is critical to improving survival outcomes for cancer, which accounts for nearly 1 in 6 deaths globally~\cite{noauthor_cancer_nodate-1,mcphail_stage_2015}. To aid in diagnosis, medical imaging technologies such as Computed Tomography (CT) and Magnetic Resonance Imaging (MRI) provide detailed 3D anatomical views. However, automated identification of open-world anomalies in these scans has not kept pace with advancements in imaging technologies, as interpreting the resulting images remains highly challenging~\cite{robinson_radiologys_1997}. For example, studies have found that approximately one-third of diagnoses are missed across various diagnostic pathways~\cite{kim_fool_2014,berlin_accuracy_2007}. Therefore, research into computer-aided cancer detection is invaluable not only for improving cancer survival rates but also for alleviating the growing burden on healthcare systems.
As such, significant effort has been dedicated to developing machine learning models for medical anomaly detection (AD). The appearance of cancer varies widely across types, subtypes, and individual patients, making robust open-set performance challenging~\cite{khader_importance_2022,wu_identifying_2017}. However, for a model to be clinically viable, it must be capable of detecting both common and rare, or previously unseen, pathologies. A common strategy for improving generalisability is to incorporate contextual information into the model. For example, the GroundingDINO model achieves state-of-the-art open-set performance in the natural imaging domain by introducing language prompts into a closed-set detector~\cite{liu_grounding_2024}. Despite such successes, however, these methods remain relatively underexplored in the medical domain. \textbf{Contributions.} We present the first investigation of GroundingDINO for medical anomaly detection, focusing on lesion detection in CT scans of the chest–abdomen–pelvis region, and compare its performance with the state-of-the-art YOLOv11n model. Through a series of experiments using varied text prompts, we examine the impact of prompt design on both closed-set and open-set performance, exploring how semantic information can enhance medical AD. Our ultimate goal is to lay the groundwork for future integration of text and image modalities to achieve state-of-the-art performance with real clinical applicability.

\section{Methodology}
\label{sec: Method}

\noindent\textbf{Background.}
GroundingDINO is a transformer-based vision–language model originally trained for object detection on natural images. Its primary goal is to generalise to unseen object classes by integrating semantic information via language into the closed-set detector DINO~\cite{zhang_dino_2022}, thereby enabling open-set capabilities. The model’s architecture includes three cross-modality fusion points, which the authors argue provide stronger language guidance during detection compared to models with fewer fusion locations~\cite{liu_grounding_2024}.
Open-set detection is particularly relevant in medical imaging tasks such as cancer screening, where rare and previously unseen lesions may be encountered. Recent work has highlighted the importance of integrating semantic priors to improve detection generalisation in these settings~\cite{aleem2024test}.
Recent advances in Large Language Models (LLMs), such as Gemini~\cite{team_gemini_2025}, BiomedGPT~\cite{luo2024biomedgpt}, and ChatGPT-4~\cite{achiam2023gpt}, have demonstrated strong capabilities in generating clinically rich, context-aware descriptions. These models provide a powerful means of constructing descriptive prompts to guide open-set detection models in medical applications~\cite{liu2025segment}.
Despite its comparatively modest size and training data, GroundingDINO achieves state-of-the-art performance on open-set detection benchmarks, outperforming larger models such as GLIP~\cite{li_grounded_2022} in the COCO zero-shot setting~\cite{lin_microsoft_2014}. Its utility in medical contexts has already been demonstrated in the BiomedParse study~\cite{zhao_biomedparse_2025}, where it was used to propose bounding boxes without additional training.

\noindent\textbf{Model Architecture.}
The pipeline used in our experiments is illustrated in Figure~\ref{fig: Pipeline}. Since GroundingDINO is limited to 2D detection, a single slice must first be selected from the scan. The slice is then normalised to improve consistency across samples. Before being passed to GroundingDINO, a text prompt must also be generated. As the method of prompt generation varies across our experiments, a general representation is shown in Figure~\ref{fig: Pipeline}. When relevant, the images are used post-normalisation to generate the prompts. The prompt and normalised image are then passed to the GroundingDINO architecture, where the text and image backbones extract features from the inputs. The feature enhancer then updates the features, making use of text-to-image and image-to-text cross-attention. The updated text features then guide the selection of queries to be used in the decoder, where text and image cross-attention are used to generate the model outputs. For additional details, we refer the reader to the original work by Liu et al.~\cite{liu_grounding_2024}.

\begin{figure}[!tb]
\centering
\includegraphics[width=0.98\textwidth]{Figures/Pipeline.pdf}
\caption{Detection pipeline used during experiments, highlighting the inclusion of the GroundingDINO architecture. A single slice is normalised and a text prompt generated, before being passed to GroundingDINO to perform the detection. The locations of cross-modality fusion are highlighted: cross-attention blocks within the feature enhancer and decoder, and language guidance of query selection. Because multiple methods for prompt generation were explored, the prompt-generation step is depicted generically.} \label{fig: Pipeline}
\end{figure}


\section{Evaluation}
\label{sec: Experiments}

\noindent\textbf{Datasets.}
For training and evaluation, we used the Universal Lesion Segmentation Challenge 2023 (ULS23) dataset~\cite{grauw_uls23_2024}, comprising chest–abdomen–pelvis CT scans with segmentation mask annotations. The dataset contains $6{,}382$ lesions from $2{,}627$ patients across various organs (Figure~\ref{fig:Data Info}). Each scan is cropped to a volume of interest (VOI) of $256 \times 256 \times 128$ voxels, centred on a single annotated lesion. Although lesion centring introduces bias, this controlled setup establishes baseline performance. Extending to whole-volume detection is required for clinical use and can be addressed in future translation work.

\begin{figure}[!tb]
\centering
\includegraphics[width=0.98\textwidth]{Figures/dataset_info_v4.pdf} 
\caption{Breakdown of the ULS23 Dataset. a) Number of slices from scans containing (no) lesions in each organ. b) Distribution of mask sizes by organ, with samples outside $1.5$ times the IQR from the nearest quartile shown as outliers. c) Number of patients with scans of lesions in each organ.}
\label{fig:Data Info}
\end{figure}
\noindent\textbf{Pre-processing.}
Annotations of multiple lesions from the same scan were combined into a single annotation without merging adjacent lesions, enabling detection use. Segmentation masks were converted to bounding boxes for GroundingDINO. Scans were normalised first.
Due to wide variation in Hounsfield units (HU) across lesions, fixed windowing was unsuitable. Following the ULS23 baseline, Z-score normalisation was applied per slice. Visibility—measured as the absolute difference between median lesion and surrounding intensities divided by local standard deviation—improved for all lesion types except those in bone, indicating potential bias.
The dataset was split into $80~\%$ training ($274{,}617$ slices), $10~\%$ validation ($33{,}995$ slices), and $10~\%$ testing ($36{,}230$ slices), with organ-specific and patient-level separation to prevent data leakage.

\noindent\textbf{Experiments.}
Three experiment types were conducted with models trained on prompts of varying detail. The first used a simple prompt, \textit{``lesion''}, for all scans, providing minimal language guidance and serving as a baseline. Equivalent YOLOv11n models~\cite{yolo11_ultralytics}, which lack language input, were trained for comparison, mainly relevant to this first experiment.
The second experiment specified the organ in the prompt (e.g., \textit{``[organ] lesion''}). The third fine-tuned these models using visual descriptions generated by Gemini (‘gemini-2.5-pro-preview-03-25’ model)~\cite{team_gemini_2025}, focusing on lymph node lesions due to their moderate sample size and lower initial performance.
All three experiments were run both with all lesion types and with mediastinal lesions ($4{,}879$ training slices) excluded, as they had the fewest samples, minimising training set reduction. Testing on excluded mediastinal lesions evaluates open-set performance. Since data consists of cropped CT scans, each shows only a small anatomical region.

\noindent\textbf{Training Strategy.}
To train GroundingDINO, the Open-GroundingDINO training code was used with default model hyperparameters and data augmentations~\cite{Open_Grounding_Dino}. The released \href{https://github.com/IDEA-Research/GroundingDINO/releases/tag/v0.1.0-alpha}{GroundingDINO} model with the Swin-T image backbone was used as the initialisation, and bert-base-uncased~\cite{devlin_bert_2019} from Hugging Face~\cite{wolf_huggingfaces_2020} served as the text backbone. For YOLO training, the default implementation from the \href{https://docs.ultralytics.com/quickstart/}{Ultralytics} Python package was used. All models were trained for 25 epochs on NVIDIA A40 and L40S GPUs. To evaluate model performance, we used the Average Precision (AP) and RoDeO~\cite{meissen_robust_2023} metrics. For RoDeO, a bounding box threshold of $0.2$ was selected based on sweeps over the validation set.

\section{Results}
\label{sec: Results}

\subsection{Minimal Language Guidance}
\label{subsec: Minimal Language Guidance}

\begin{figure}[!tb]
\centering
\includegraphics[width=0.92\textwidth]{Figures/GD_vs_YOLO_radar_charts_v3.2.pdf}
\caption{Radar charts comparing the performance of GroundingDINO and YOLO stratified by organ, with the prompt of \textit{``lesion''} given to GroundingDINO. Average Precision and RoDeO/total metrics are shown for GroundingDINO (green) and YOLO (blue) models that saw all lesion types (top) and all except mediastinal lesions (bottom) from the ULS23 dataset during training.} \label{fig: Minimal Language Guidance Radars}
\end{figure}

The results for the GroundingDINO models using the prompt \textit{``lesion''} for all scans, along with the corresponding YOLO models, are shown in Figure~\ref{fig: Minimal Language Guidance Radars}. GroundingDINO performs as well as or better than YOLO across all organs. GroundingDINO's superior performance using only simple prompts indicates that semantic alignment, not present in YOLO, offers tangible benefits independent of prompt complexity, highlighting the model's potential suitability for medical anomaly detection and supporting its use in research such as ours. 
Inference examples from the GroundingDINO model trained on all lesion types are shown in Figure~\ref{fig: Inference Examples}, illustrating both successful detections and failure cases. Figure~\ref{fig: Inference Examples}c highlights ambiguities in lesion definition, bounding an internal substructure within the ground truth. Figure~\ref{fig: Inference Examples}d contains false positives, typically observed near anatomical features resembling lesion morphology (e.g., vessels or bones).  The issue of false positives is noted in the original GroundingDINO paper~\cite{liu_grounding_2024}. The persistence of these issues with minimal prompts points to the need for more precise annotations and improved semantic grounding.
As expected, after removing mediastinal lesions from training, performance on mediastinal lesions drops significantly. However, while YOLO's performance falls to near zero (e.g., RoDeO/total $= 0.013$), GroundingDINO maintains better performance. This better preservation of accuracy, even before introducing additional language guidance, suggests stronger inherent generalisability, making results especially relevant in discussions of clinical deployment. Nevertheless, the sizeable performance drop underscores that open-set detection remains a significant challenge. Consequently, with multimodal models like GroundingDINO, it is natural to consider whether language guidance can mitigate this decline.

\begin{figure}[!tb]
\centering
\includegraphics[width=0.95\textwidth]{Figures/inference_tests_figure_v2.pdf}
\caption{Inference examples from the GroundingDINO model trained on lesions across all organs in the ULS23 dataset, using \textit{``lesion''} as the text prompt. Ground truth annotations (top, red boxes) and model predictions (bottom, blue boxes) are shown for six organ sites.} \label{fig: Inference Examples}
\end{figure}

\subsection{Enhanced Language Guidance}
\label{subsec: Enhanced Language Guidance}

The results for the different GroundingDINO models using the three different prompt types are shown in Figure~\ref{fig: Enhanced Language Guidance Radars}.


\begin{figure}[!tb]
\centering
\includegraphics[width=0.92\textwidth]{Figures/GD_different_prompts_radars_v3.pdf}

\caption{Radar charts comparing the performance of GroundingDINO models stratified by organ, with the models differing by the choice of text prompt used. Average Precision and RoDeO/total metrics are shown for GroundingDINO models that saw all lesion types (top) and all except mediastinal lesions (bottom) from the ULS23 dataset during training. Prompts of \textit{``lesion''} (green), \textit{``[organ] lesion''} (blue) and the addition of visual descriptions (red) were all tested.} \label{fig: Enhanced Language Guidance Radars}
\end{figure}
\noindent\textbf{Organ-specific prompts.}
Organ-specific prompts show no definitive overall effect on performance. A slight improvement is seen when all organs are included in training, but its small magnitude and disappearance when mediastinal lesions are excluded make its significance unclear.
Notably, performance improves for colon ($54~\%$ RoDeO/total), mediastinal ($87~\%$), and abdominal ($30~\%$) lesions when all lesion types are included. These gains likely result from limited training data for these lesion types (Figure~\ref{fig:Data Info}a), making them more susceptible to being overshadowed. Organ-specific prompts reduce interclass competition, helping the model learn relevant visual features.
A similar improvement is observed for colon lesions when mediastinal lesions are excluded. However, no gains are seen for mediastinal or abdominal lesions. For mediastinal lesions, this is expected, as the model had no exposure to them. For abdominal lesions, the absence of improvement suggests their performance was suppressed specifically by the presence of mediastinal lesions, despite the latter being the smallest class.

\noindent\textbf{Descriptive prompts.} 
After fine-tuning models using visual descriptions for lymph node lesions during training, performance on lymph node lesions dropped to zero. In the test set, none of the model's predictions for lymph node lesions exceeded a confidence score of $0.05$, explaining why RoDeO/total $= 0$. To better understand this behaviour, the model outputs were analysed in more detail. During inference, GroundingDINO generates 900 (box, caption) pairs. For each pair, an activation score is computed for every token in the text input, and tokens with scores above a threshold form the caption. Examples of the mean activation scores across the 900 predictions for a training and testing sample are shown in Figure~\ref{fig: Activation Scores}. 
\begin{figure}[!tb]
\centering
\includegraphics[width=0.98\textwidth]{Figures/combined_activation_map_v2.pdf}
\caption{Token-level activation maps from GroundingDINO for a lymph node lesion sample from the training and testing sets, showing uniformly distributed activation across tokens.} \label{fig: Activation Scores}
\end{figure}
Activation is highly uniform across tokens. Excluding start and end markers, the maximum activation difference per prediction is just $0.0001$ in training and $0.00004$ in testing. GroundingDINO is meant to align text and image features so semantics guide detection, but the uniformity suggests overfitting: the model aligns the entire prompt with image features rather than understanding it. As a result, it fails to link the test prompt “lymph node lesion” to relevant training visuals, leading to inaccurate predictions, especially when training and test prompts differ, which earlier experiments did not reveal.
The drop in lymph node performance to zero, despite the models' success prior to fine-tuning with descriptive prompts, suggests the initial learning rate was too high. A lower rate might have preserved some understanding but would not fix the uniform activation, which stems from how GroundingDINO learns. Addressing this may require changes to the loss function, text encoder, or prompt engineering.


\section{Conclusions}
\label{sec: Conclusions}

GroundingDINO was found to outperform the YOLOv11n model when prompted with the term \textit{``lesion''}, highlighting its suitability for research into medical anomaly detection. Incorporating organ-specific information into text prompts significantly improves closed-set performance on rare lesion classes, emphasising the importance of semantic conditioning. Although overall and open-set performance remain unchanged, these findings suggest clear opportunities for improvement. Using detailed visual lesion descriptions during training revealed overfitting issues that hinder semantic generalisation, underscoring the need to refine training methods to better leverage language-based cues.

\begin{credits}
\subsubsection{\ackname} C.I.B. is funded via the EVUK programme (“Next-generation AI for Integrated Diagnostics”) of the Free State of Bavaria and partially supported by the Helmholtz Association [Munich School for Data Science]. This work is also supported by the Berdelle-Stiftung [TimeFlow]. The authors acknowledge scientific support and HPC resources from NHR@FAU [b143dc, b180dc], funded by federal and Bavarian authorities, with partial hardware funding from the DFG [440719683]. Additional support was received from the ERC [101083647], the DFG [KA 5801/2-1, INST 90/1351-1], and the state of Bavaria.
Further funding was provided by Cancer Research UK [A22905], the CRUK Cambridge Centre [CTRQQR-2021-100012, A25177], The Mark Foundation for Cancer Research [RG95043], GE HealthCare, the CRUK National Cancer Imaging Translational Accelerator [A27066], and the NIHR Cambridge Biomedical Research Centre [NIHR203312, BRC-1215-20014].

\subsubsection{\discintname} The authors have no competing interests to declare that are relevant to the content of this article.
\end{credits}
%
% ---- Bibliography ----
%
% BibTeX users should specify bibliography style 'splncs04'.
% References will then be sorted and formatted in the correct style.
%
\bibliographystyle{splncs04}
\bibliography{Paper-0010}

%
% \begin{thebibliography}{8}
% \bibitem{ref_article1}
% Author, F.: Article title. Journal \textbf{2}(5), 99--110 (2016)

% \bibitem{ref_lncs1}
% Author, F., Author, S.: Title of a proceedings paper. In: Editor,
% F., Editor, S. (eds.) CONFERENCE 2016, LNCS, vol. 9999, pp. 1--13.
% Springer, Heidelberg (2016). \doi{10.10007/1234567890}

% \bibitem{ref_book1}
% Author, F., Author, S., Author, T.: Book title. 2nd edn. Publisher,
% Location (1999)

% \bibitem{ref_proc1}
% Author, A.-B.: Contribution title. In: 9th International Proceedings
% on Proceedings, pp. 1--2. Publisher, Location (2010)

% \bibitem{ref_url1}
% LNCS Homepage, \url{http://www.springer.com/lncs}, last accessed 2023/10/25
% \end{thebibliography}
\end{document}
