\documentclass{midl} % Include author names

% The following packages will be automatically loaded:
% jmlr, amsmath, amssymb, natbib, graphicx, url, algorithm2e
% ifoddpage, relsize and probably more
% make sure they are installed with your latex distribution

\usepackage{mwe} % to get dummy images
\usepackage{graphicx}
\usepackage{array}
\usepackage{amsmath}
\usepackage{xcolor}
\usepackage{color,soul}
\usepackage{makecell}
\usepackage{multirow}
\usepackage[title]{appendix}
\usepackage[misc]{ifsym}
\jmlrvolume{-- Under Review}
\jmlryear{2025}
\jmlrworkshop{Full Paper -- MIDL 2025}
\jmlrvolume{-- 006}
\editors{Accepted for publication at MIDL 2025}

\title[PCA-YOLO]{PCA-YOLO: A Small Liver Tumor Detection Model with Patch-Contrastive Attention}


% More complicate cases, e.g. with dual affiliations and joint authorship
\midlauthor{\Name{Xueyang Li\nametag{$^{1}$}} \Email{xli34@nd.edu} \\
\addr $^{1}$ University of Notre Dame, Notre Dame, USA \\
\Name{Han Xiao\nametag{$^{2}$}} \Email{xiaoh69@mail.sysu.edu.cn} \\
\addr $^{2}$ The First Affiliated Hospital of Sun Yat-sen University, Guangzhou, China\\
\Name{Zongpeng Weng\nametag{$^{2}$}} \Email{wengzp5@mail2.sysu.edu.cn} \\
\Name{Xinrong Hu\nametag{$^{1}$}} \Email{xhu7@nd.edu} \\
\Name{Danny Chen\nametag{$^{1}$}} \Email{dchen@nd.edu} \\
\Name{Yiyu Shi\nametag{$^{1}$}} \Email{yshi4@nd.edu}
}


\begin{document}

\maketitle

\begin{abstract}
Liver tumors, as one of the most common malignant tumor types, represent a significant clinical challenge, with the detection of small tumors being particularly problematic. 
Despite the rapid advances in deep learning (DL) offering significant support in reducing the workload of radiologists, current detection models still struggle with the detection of small tumors. This is particularly troubling as these are the cases where even experienced radiologists are more prone to errors, underscoring the critical need for improved accuracy of detection methods in this area. Addressing this critical gap, this article introduces patch-contrastive attention YOLO (PCA-YOLO), an innovative adaptation of the YOLO framework, incorporating a patch-based attention module to specifically target the detection of small liver tumors. Furthermore, we collected a specialized CT dataset focusing exclusively on small liver tumors, complemented with meticulously annotated bounding boxes, to facilitate this study. Our experimental findings demonstrate that our approach achieves a leading mean Average Precision (mAP) score of 77.2\% at a 50\% Intersection Over Union (IoU) threshold, surpassing all current leading detection methods tested against our specialized dataset.
\end{abstract}

\begin{keywords}
Liver tumor detection, Attention mechanism, CT scan dataset, YOLO model.
\end{keywords}

\section{Introduction}
Liver cancer ranks as the second most deadly form of tumor, with Hepatocellular Carcinoma (HCC) being its most prevalent type, constituting approximately 85\% of all liver cancer cases~\cite{doi:10.1056/NEJMra1713263}. Computed tomography (CT) scanning is a fundamental tool for diagnosing HCC, but the manual evaluation of these scans is notably time-consuming. This is especially true for small liver tumors, whose interpretation requires radiologists to have substantial experience and specialized knowledge.

On the other hand, advances in artificial intelligence have provided deep learning (DL) based approaches for liver tumor analysis using CT scans. Based on the widely recognized BCLC guideline and RECIST standard, the number of tumors is a critical parameter for staging HCC, and the maximum tumor diameter is a pivotal factor influencing patient prognosis~\cite{eisenhauer2009new,reig2022bclc}. Given their capacity to efficiently derive such critical information and the significantly reduced human effort needed for labeling, detection models stand out as more suitable alternatives for DL-aided HCC diagnosis, in comparison to segmentation models.

% However, tumor detection models, less explored, offer critical insights for early HCC diagnosis. These models efficiently quantify tumor count and size—key factors in HCC staging and prognosis as per BCLC guidelines and RECIST standards~\cite{reig2022bclc}~\cite{eisenhauer2009new}. Detection models, providing rapid and relevant diagnostic information, may thus hold greater clinical utility over detailed segmentation outputs.

While liver tumor segmentation has garnered significant attention, the task of liver tumor detection remains relatively under-explored. Current approaches predominantly adopt modified U-Net~\cite{ronneberger2015u} structures (e.g., RA-Net~\cite{kalsoom2022computer} and CLIP~\cite{liu2023clip}) for this purpose. In addition to this, state-of-the-art detection models originally designed for natural images, like DETR~\cite{carion2020end} and YOLO~\cite{redmon2016you}, show promise for medical imaging applications following certain adaptations, as evidenced by RCS-YOLO in brain tumor detection~\cite{kang2023rcs} and SPN-TS in breast tumor detection~\cite{he2023cancer}. However, a notable barrier to adapting these natural image detection models for medical settings is their performance with small objects. In natural scenes, objects typically occupy larger portions of the images and have more distinct features compared to tumors in medical scans. This issue becomes particularly acute when identifying small-sized tumors, requiring the model to detect subtle, less pronounced features amidst the complex background of CT slices~\cite{abdusalomov2023brain,he2023cancer}. Further, accurate detection of such small tumors is crucial, surpassing the importance of identifying larger, more noticeable tumors. This is because even experienced radiologists are more susceptible to mistakes when evaluating small tumors, and they may not be able to easily rectify the inaccuracies incurred by DL models, as they might with more conspicuous tumors.


% Additionally, the detection of small tumors, which is important yet challenging for many deep learning models~\cite{abdusalomov2023brain}~\cite{he2023cancer}, also remains a critical area for improvement, given the higher likelihood of diagnostic errors in such cases by radiologists, underscoring the urgent need for both advanced detection datasets and sophisticated detection models tailored to liver tumor diagnostics.

% Despite their proven efficacy, the application of these advanced detection models to liver tumor detection is yet to be fully investigated, possibly due to the absence of a specialized detection dataset. The liver tumor segmentation benchmark (LiTS) dataset~\cite{bilic2023liver}, primarily designed for segmentation, poses challenges in converting segmentation masks to bounding boxes, especially in complex scenarios involving tumor clusters or adjacent tumors, where domain expertise is crucial. This is pivotal as tumor count directly influences HCC staging~\cite{reig2022bclc}. 

Inspired by the success of the attention mechanism~\cite{vaswani2017attention} and the contrastive learning strategy in the Siamese network~\cite{koch2015siamese}, this paper synthesizes these methodologies to introduce a novel YOLO framework called \textit{patch-contrastive attention YOLO} (PCA-YOLO), aiming to address the challenge of detecting small tumors. 
% More specifically, the approach involves dividing large CT scans into smaller patches. These patches are then analyzed in comparison to the original images using a Siamese network to determine the presence of tumors in each patch. The resulting scores from this comparison serve as an attention mechanism of each patch, which is subsequently integrated into the YOLOv8x model~\cite{yolov8_ultralytics} for enhanced detection capabilities. 
The rationale for this design is clear:
%straightforward: 
While small tumors may be less conspicuous due to limited information, they should still be more distinguishable and tumor-like in appearance compared to other regions within the same slice. Additionally, we introduce to the public via this study our Small Liver Tumor Detection (SLTD) CT dataset, a pioneering resource specifically designed for detecting small liver tumors, to further promote research along this direction. Our PCA-YOLO approach attains an mAP@50\% score of 77.2\% on this dataset, surpassing the performance of all existing state-of-the-art detection models.
% in this paper, we present our Small Liver Tumor Detection (SLTD) dataset, a pioneering resource specifically designed for the detection of small liver tumors. The SLTD dataset comprises 208 3D CT scans, totaling 41,587 2D slices. From this collection, our team of radiologists meticulously selected and annotated 416 2D scans. These scans were chosen based on their proximity to the initial and final slices where the tumor is visible within each 3D volume. The focus was exclusively on tumors with a diameter of 6.5 centimeters or less, ensuring the dataset's specificity to small tumor detection. We also introduce the Patch-Contrastive Attention YOLO (PCA-YOLO), built on top of 



% On the other hand, the rapid advancements within the artificial intelligence domain have notably facilitated the application of deep learning-based approaches for automatic liver tumor segmentation in CT scans. Despite this progress, the potential of tumor detection has received comparatively less attention. However, in the context of early HCC detection and diagnosis, detection models may provide more pertinent insights than segmentation models. Notably, detection models are adept at quantifying tumor count, a critical parameter for staging HCC in accordance with the widely recognized BCLC diagnostic framework~\cite{reig2022bclc}. Furthermore, in alignment with the RECIST guidelines, the maximum tumor diameter is a pivotal factor influencing patient prognosis~\cite{eisenhauer2009new}, and is deemed adequate for clinical diagnosis. Detection models are capable of rapidly furnishing such metrics, whereas the precise tumor boundaries delineated by segmentation models are seldom utilized in clinical practice.

\section{The SLTD Dataset}
The liver tumor segmentation benchmark (LiTS) dataset~\cite{bilic2023liver}, primarily designed for segmentation, is commonly used when performing liver tumor detection~\cite{liu2023clip}. However, it poses challenges in converting segmentation masks to bounding boxes, especially in complex scenarios involving tumor clusters or adjacent tumors, where domain expertise is crucial. This is pivotal as tumor count directly influences HCC staging~\cite{reig2022bclc}. To address this gap and provide a more dedicated and realistic dataset for the small liver tumor detection task, we introduce our Small Liver Tumor Detection (SLTD) dataset. The SLTD dataset comprises 208 3D CT volumes, totaling 41,587 2D slices. The CT scans were acquired during the portal venous phase, as it is more commonly used and clinically effective for tumor diagnosis compared to the arterial and unenhanced plain scan phases. The images have a resolution of $512\times512$ and were acquired from the same CT machine, with a window width of 150, window level of 50, tube voltage of 120~kV, slice thickness of 1 mm, and slice gap of 0.8 mm. The images underwent manual quality control to exclude any scans with noticeable artifacts or blurriness and to verify the completeness of all slices. From this collection, our team of radiologists with at least five years of clinical expertise meticulously selected and annotated 452 2D slices. These slices were chosen based on their proximity to the initial and final slices where a tumor is visible within each 3D volume. To ensure annotation reliability, only slices with bounding box sizes of at least 0.4 cm were included. In summary, our dataset contains bounding boxes with sizes ranging from 0.4 cm to 5.0 cm, while the tumor maximum diameters range from 1.0 cm to 6.5 cm. 
% The selected dataset is specifically designed for small tumor detection. 
Each slice contains between 1 and 4 tumors, averaging 1.49 tumors per slice, providing a diverse and comprehensive representation of small tumor cases. Representative images from the selected slices are shown in Fig.~\ref{dataset}. The dataset is publicly available at \url{https://github.com/XLIAaron/Small_LiverTumor}.

\begin{figure}[t]
\centering
\includegraphics[width=0.9\textwidth]{Dataset.pdf}
\caption{Representative CT slices from the SLTD dataset: (a) A tumor with a bounding box size of 0.8 cm; (b) a tumor with a bounding box size of 2.2 cm; (c) the largest tumor with a bounding box size of 5.0 cm.} \label{dataset}
\end{figure}
% \setlength{\tabcolsep}{0.5em}
% \begin{table}
% \centering
% \caption{Statistics of our dataset}\label{tab0}
% \begin{tabular}{cccc}
% \hline
%  & Minimum & Maximum & Median   \\
% \Xhline{2\arrayrulewidth}
% Tumor diameter (cm) & 1.0 & 6.5 & 3.1  \\
% \hline
% \# of tumors on slice & 1 & 4 & 1  \\
% \hline
% \end{tabular}
% \end{table}


\section{Methodology}
The overall architecture of PCA-YOLO is depicted in Fig.~\ref{model}(a), featuring a patch-contrastive attention (PCA) module at its core. Within this module, a Siamese network~\cite{koch2015siamese} is utilized to assess whether a given patch is similar to the original image, i.e., whether the patch contains a tumor. Following the PCA module, the architecture is integrated with a detection head, employing the YOLOv8x~\cite{yolov8_ultralytics} model for this purpose. The details are given below. 

\begin{figure}[t]
\centering
\includegraphics[width=0.8\textwidth]{model24.pdf}
\caption{(a) The general structure of PCA-YOLO. $X$ is an original input image, and $X'$ denotes the attention-augmented input. (b) The detailed structure of the attention module. $x_i$ denotes the patches extracted using non-overlapping cropping of the original image $X$. The Siamese network is trained using each up-sampled patch and $X$ as a paired input, and the ground truth similarity is 1 if a patch contains any tumor and 0 otherwise. $Y$ is the collection of outputs from the Siamese network for all the patches of $X$, which is used to generate the attention map $Y'$. $Y'$ is concatenated with $X$ to form an attention-augmented input $X'$ for YOLOv8x. } \label{model}
\end{figure}

\subsection{Patch-Contrastive Attention Module:} 
Fig.~\ref{model}(b) illustrates the design of the PCA module. The process begins with an input image $X$ of dimensions $512\times512$, which is cropped into $n$ smaller patches using a sliding window technique with no overlap, each patch $x_i$ being $m\times m$ in size, in which $m$ is a hyper-parameter and its impact will be studied in our ablation study. Subsequently, these patches are upscaled back to the original dimensions of $512\times512$ with Bicubic Interpolation. Each upscaled patch and the initial image $X$ are then used as an input pair to train a Siamese network using the default cross-entropy loss function augmented with regularization~\cite{koch2015siamese}, with the ground truth similarity set to one when the patch contains any tumor (and thus similar to the original image) and zero otherwise. 
The trained Siamese network's output, $y_i$, then reflects the likelihood that patch $x_i$ contains any tumor. After this, the attention map of the patch $x_i$, ${y_i}'$, is calculated, as:
\begin{equation}
\label{eqn:1}
{y_i}' = g(y_i)\cdot c\cdot J,
\end{equation}
\begin{equation}
\label{eqn:2}
g(y_i) = \frac{{y_i}-\min(Y)}{\max(Y)-\min(Y)},
\end{equation}
where $g(y_i)$ is a normalization function, with $y_i$ reflecting the predicted similarity score relative to the original image, $c$ is a constant for attention intensity, $J$ is an $m\times m$ matrix filled exclusively with ones that reconstruct the attention maps to patch size, and $Y=\{y_1, y_2, \ldots, y_n\}$ is the set of predicted similarity scores for all the small patches cropped from the input image $X$. The normalization function $g(y_i)$ is key, as the Siamese network may miss tiny tumors, causing many of the $y_i$ scores in $Y$ to approach 0. Thus, directly using these unnormalized scores for attention would be quite ineffective. Nevertheless, small tumors typically stand out more than the adjacent areas, resulting in slightly higher $y_i$ scores. Normalizing these scores with $g(y_i)$ to the range $[0, 1]$ enhances attention to the areas with small tumors.
% The normalization function $g(y_i)$ plays a crucial role in our approach, particularly because the Siamese network might overlook very small tumors, even when the patches are upscaled to the original size. Consequently, the scores $y_i$ in $Y$ might all veer towards $0$, indicating a negative outcome. Applying unnormalized scores for attention enhancement would be ineffective due to the insubstantial nature of such attention. Nonetheless, as previously stated, even small tumors should exhibit more tumor-like characteristics than surrounding regions, thereby achieving scores that are, even if marginally, higher than those of other patches. By normalizing the scores to a $[0, 1]$ range through $g(y_i)$, regions harboring small tumors attain elevated scores relative to their original $y_i$ values. 
The subsequent multiplication by the attention intensity $c$ is for accommodating the input normalization used in YOLO. Afterwards, these maps ${y_i}'$ are stitched together based on the locations of the corresponding patches in the original image $X$, to form a size $512\times 512$ attention map, $Y'$, matching the original image's dimensions. Finally, $Y'$ is concatenated alongside the 
%grayscale 
input image $X$ with the matrix concatenation function $\sigma$ as an additional channel, forming the attention-augmented input, $X'$, for YOLOv8x (see Fig.~\ref{model}(b)). 


An alternative design to Eqs.~\eqref{eqn:1} and \eqref{eqn:2} is, instead of using the normalized scores $g(y_i)$ and the attention intensity constant $c$, to directly combine the Class Activation Map (CAM)~\cite{zhou2016learning} from the small patches into a large $512\times 512$ CAM matrix, and use it as the attention map $Y'$. However, our experiments revealed that this strategy does not significantly enhance the performance of the baseline detection models. The details and implications of this finding will be further explored and elucidated in the ablation study section.

% An alternative design to the methodology described in Equations(\ref{eqn:1}) and (\ref{eqn:2}) involves skipping the use of normalized predicted scores $y_i$ and the attention intensity constant $c$. Instead, this approach combines the Class Activation Map  (CAM)~\cite{zhou2016learning}  derived from smaller patches into a large $512\times 512$ CAM. This aggregated CAM is then concatenated with the original input $X$ to generate $X'$. However, our experimentation revealed that this strategy does not significantly enhance the performance of the baseline detection model. The details and implications of this finding will be further explored and elucidated in the ablation study section.

\subsection{Detection Head:} YOLO (You Only Look Once)~\cite{redmon2016you}, standing out as a leading model in the realm of object detection, has undergone significant advancements through various versions.
% ~\cite{redmon2017yolo9000}~\cite{redmon2018yolov3}~\cite{bochkovskiy2020yolov4}~\cite{yolov5}~\cite{li2023yolov6}~\cite{wang2022yolov7}~\cite{yolov8_ultralytics}. 
Despite their original design for natural images, YOLO models from YOLOv5 onwards have integrated auto-anchor algorithms, improving their detection of smaller objects beyond the capabilities of many traditional detection models~\cite{yolov5}. The efficacy of YOLO in detecting brain and bone tumors has been demonstrated in studies such as RCS-YOLO~\cite{kang2023rcs} and YOLO-DL~\cite{li2023primary}. Given the similarities in the challenges posed by liver tumor detection, YOLO is selected as the preferred model for our detection head. Further, we opt for YOLOv8x~\cite{yolov8_ultralytics}, the most recent version known for its state-of-the-art capabilities, and train it utilizing our attention-augmented input $X'$.

% As presented in RCS-YOLO~\cite{kang2023rcs} and YOLO DL~\cite{li2023primary}, the xxx-based YOLO models have outstanding performances in brain tumor detection and bone tumor detection. 


% \subsection{Detection Head}
% \subsection{Attention Mechanism}
% \subsection{PCA-YOLO}

\section{Experiments}


\subsection{Experimental Setup:} In the attention module, we choose a patch-cropping size $m$ of 64, tailored to the typical tumor sizes observed in our dataset. We adjust the attention intensity constant $c$ to 255, complementing the subsequent image normalization by YOLOv8x. For the Siamese network~\cite{koch2015siamese}, we employ the Adam optimizer~\cite{kingma2014adam} combined with a Binary Cross Entropy loss function~\cite{ba2014deep}, setting the batch size to 16 and the learning rate to $1\times10^{-4}$, with an input image size of $512\times512$. To address the imbalance in the patch dataset fed to the Siamese network, Standard Scale Jittering~\cite{ghiasi2021simple} is utilized to augment 100\% of the positive patches and 20\% of the negative patches. The training is conducted for up to 200 epochs, with an early stopping mechanism triggered after 50 epochs of no performance improvement. For the YOLOv8x detection head, we enhance the default data augmentation strategies by setting the mixup augmentation rate to $0.1$ and the mosaic augmentation rate to $0.3$, aiming to prevent overfitting and improve model robustness. The learning rate for YOLOv8x is $5\times10^{-5}$, with the training duration extended to 2,000 epochs and an early stopping criterion of 100 epochs. All the remaining settings are maintained with their default values.



\subsection{Baseline Selection:} For baseline comparisons, we select leading models across various categories: nnU-Net~\cite{isensee2021nnu} as a superior version of U-Net~\cite{ronneberger2015u}, SPN-TS~\cite{he2023cancer} which utilizes an FPN~\cite{lin2017feature} architecture for detecting small breast tumors, and Transformer-based models Swin-Unet~\cite{cao2022swin} and RT-DETR~\cite{lv2023detrs}. Additionally, RCS-YOLO~\cite{kang2023rcs}, designed for brain tumor detection, and the latest YOLO version, YOLOv8x~\cite{yolov8_ultralytics}, are also included to showcase the cutting-edge in YOLO advancements. Moreover, we conduct further assessments of our PCA module's effectiveness by substituting it with other state-of-the-art attention mechanisms. These include the self-attention mechanism from ViT-YOLO~\cite{zhang2021vit}, the multi-attention mechanism from Multi-attention Tri-branch Network (MTNet)~\cite{zhong2023semi}, and channel-wise attention~\cite{li2020object}. Each of them is integrated with the same detection head, YOLOv8x, under identical hyper-parameter configurations. Nevertheless, certain attention methods published recently are incompatible with our dataset, such as the slice-wise attention tailored for 3D datasets~\cite{lu2023hacl} and the cross-attention designed for multi-modal datasets~\cite{lin2023few}, and thus are not included in the comparisons.

All the experiments are run on 3 NVIDIA A100 GPUs with 40GB memory each, and 5-fold cross-validation is performed to ensure generalizability. As this is a detection task, we use the mAP@50\% score, Precision, and Recall as the evaluation metrics. 

\section{Results and Discussions}
\begin{table}[t]
\centering
\caption{mAP@50\%, Precision, and Recall scores of different models on the SLTD dataset. Four types of models, including U-Net, Feature Pyramid Network (FPN), Transformers (XFMR), and YOLO, are included for comparison. The last column presents the paired t-test p-value for the mAP@50\% results of each baseline model compared to our PCA-YOLO model. }
\resizebox{\textwidth}{!}{%
\label{tab:results}
\begin{tabular}{|c|c|c|c|c|c|}
\hline
\textbf{Type}       & \textbf{Model} & \textbf{$mAP_{50}$ (\%) }& \textbf{Precision (\%)}& \textbf{Recall (\%)}& p-value  \\ \hline
U-Net & nnU-Net &36.1 $\pm$ 4.7 &56.2 $\pm$ 6.5 &  33.5 $\pm$  5.8 & $<$0.001      \\ \hline
FPN & SPN-TS & 70.8 $\pm$ 4.6  & 80.9 $\pm$ 6.3 &  63.8 $\pm$ 7.7 &  0.018       \\ \hline
\multirow{2}{*}{XFMR} & Swin-Unet &55.8 $\pm$ 6.5 & 67.5 $\pm$ 10.1&   52.0 $\pm$  10.4 & 0.002     \\ \cline{2-6}
                                   & RT-DETR & 66.5 $\pm$ 3.6 & 77.0 $\pm$ 5.1 &  61.8 $\pm$  6.2 &  0.001       \\ \hline
\multirow{6}{*}{YOLO} & RCS-YOLO  & 72.5 $\pm$ 5.0& 74.1 $\pm$ 8.4&  64.7 $\pm$  6.7 & 0.012     \\ \cline{2-6}
                            & YOLOv8x & 73.7 $\pm$ 3.4& 80.8 $\pm$ 5.5& 66.3 $\pm$ 4.9 &  0.035        \\ \cline{2-6}
                            & Channel-Attention + YOLOv8x & 73.9 $\pm$ 3.3 & 80.5 $\pm$ 5.7 & 68.9 $\pm$ 4.8 & 0.031\\ \cline{2-6}
                            & Self-Attention + YOLOv8x & 74.8 $\pm$ 3.4 & 79.6 $\pm$ 5.5 & 69.1 $\pm$ 4.7 & 0.052\\ \cline{2-6}
                            & MTNet + YOLOv8x & 74.9 $\pm$ 2.9 &81.9 $\pm$ 5.3  & 69.2 $\pm$ 4.7 & 0.064\\ \cline{2-6}
                            & PCA-YOLO (ours) & \textbf{77.2} $\pm$ 2.1 & \textbf{82.5} $\pm$ 3.0 &   \textbf{71.0} $\pm$ 2.4 &  /          \\ \hline
\end{tabular}
}
\end{table}

% \begin{table}[t]
% \centering
% \caption{mAP@50\%, Precision, and Recall scores of different models on the SLTD dataset. Four types of models, including U-Net, Feature Pyramid Network (FPN), Transformers (XFMR), and YOLO, are included for comparison.}
% \label{tab:results}
% \begin{tabular}{|c|c|c|c|c|}
% \hline
% \textbf{Type}       & \textbf{Model} & \textbf{$mAP_{50}$ (\%) }& \textbf{Precision (\%)}& \textbf{Recall (\%)}  \\ \hline
% U-Net & nnU-Net~\cite{isensee2021nnu} &36.1 $\pm$ 4.7 &56.2 $\pm$ 6.5 &  33.5 $\pm$  5.8       \\ \hline
% FPN & SPN-TS~\cite{he2023cancer} & 70.8 $\pm$ 4.6  & 80.9 $\pm$ 6.3 &  63.8 $\pm$ 7.7         \\ \hline
% \multirow{2}{*}{XFMR} & Swin-Unet~\cite{cao2022swin} &55.8 $\pm$ 6.5 & 67.5 $\pm$ 10.1&   52.0 $\pm$  10.4      \\ \cline{2-5}
%                                    & RT-DETR~\cite{lv2023detrs} & 66.5 $\pm$ 3.6 & 77.0 $\pm$ 5.1 &  61.8 $\pm$  6.2         \\ \hline
% \multirow{6}{*}{YOLO} & RCS-YOLO~\cite{kang2023rcs}  & 72.5 $\pm$ 5.0& 74.1 $\pm$ 8.4&  64.7 $\pm$  6.7       \\ \cline{2-5}
%                             & YOLOv8x~\cite{yolov8_ultralytics} & 73.7 $\pm$ 3.4& 80.8 $\pm$ 5.5& 66.3 $\pm$ 4.9           \\ \cline{2-5}
%                             & Channel-Attention~\cite{li2020object} + YOLOv8x & 73.9 $\pm$ 3.3 & 80.5 $\pm$ 5.7 & 68.9 $\pm$ 4.8 \\ \cline{2-5}
%                             & Self-Attention~\cite{zhang2021vit} + YOLOv8x & 74.8 $\pm$ 3.4 & 79.6 $\pm$ 5.5 & 69.1 $\pm$ 4.7 \\ \cline{2-5}
%                             & MTNet~\cite{zhong2023semi} + YOLOv8x & 74.9 $\pm$ 2.9 &81.9 $\pm$ 5.3  & 69.2 $\pm$ 4.7 \\ \cline{2-5}
%                             & PCA-YOLO (ours) & \textbf{77.2} $\pm$ 2.1 & \textbf{82.5} $\pm$ 3.0 &   \textbf{71.0} $\pm$ 2.4            \\ \hline
% \end{tabular}
% \end{table}


\begin{table}[t]
\centering
\caption{Ablation study on the proposed PCA module.}
\label{tab:abl}
\begin{tabular}{|c|c|c|c|}
\hline
 \textbf{Model Structure} & \textbf{$mAP_{50}$ (\%)}& \textbf{Precision (\%)}& \textbf{Recall (\%)}  \\ \hline
 YOLOv8x & 73.7 $\pm$ 3.4& 80.8 $\pm$ 5.5& 66.3 $\pm$ 4.9           \\ \hline
YOLOv8x w/ CAM attention& 74.5 $\pm$ 3.6 & 80.6 $\pm$ 4.1 & 67.7 $\pm$ 4.5        \\ \hline
PCA-YOLO w/o $g(y_i)$ &75.6 $\pm$ 3.9& 81.6 $\pm$ 4.8&  70.5 $\pm$  3.7      \\ \hline
PCA-YOLO w/ $128\times 128$ patches& 75.9 $\pm$ 2.3 & 81.6 $\pm$ 3.0 &  70.8 $\pm$ 2.8         \\ \hline
PCA-YOLO w/ $256\times 256$ patches& 76.7 $\pm$ 2.5  & 81.4  $\pm$ 3.6 &  70.4 $\pm$  2.1        \\ \hline
PCA-YOLO w/ $64\times 64$ patches& \textbf{77.2} $\pm$ 2.1 & \textbf{82.5} $\pm$ 3.0 &   \textbf{71.0} $\pm$ 2.4          \\ \hline


\end{tabular}
\end{table}

\noindent Table~\ref{tab:results} presents the mAP@50\%, Precision, and Recall scores for all the evaluated models. As one can see from Table~\ref{tab:results}, our PCA-YOLO outperforms all the other models in all of the mAP@50\%, Precision, and Recall metrics, and it also demonstrates the lowest standard deviations across these metrics, which validates its better stability. PCA-YOLO surpasses the top-performing baseline, MTNet~\cite{zhong2023semi} combined with YOLOv8x, by an average mAP@50\% score of 2.3\%, indicating that our proposed PCA module is more effective than other attention mechanisms. A comparative analysis between the original YOLOv8x and its attention-augmented variants reveals that on our dataset, different attention mechanisms improve YOLOv8x's detection capabilities to varying degrees: Channel Attention by 0.2\%, Self Attention by 1.1\%, MTNet by 1.2\%, and our PCA module by 3.5\%. 
 
It is also interesting to note that YOLO-based models generally outshine those in the other categories on our dataset, including SPN-TS~\cite{he2023cancer}, tailored for small breast tumor detection, and RT-DETR~\cite{lv2023detrs}, which outdoes YOLO in natural image detection. This underscores YOLOv8's adeptness in small liver tumor detection. 
 
\begin{figure}[t]
\centering
\includegraphics[width=0.9\textwidth]{pred.png}
\caption{Visualization examples of our detection results with (a) a tumor of bounding box size $d = 0.9$ cm and (b) a tumor of maximum bounding box size $d = 2.4$ cm.}
\label{pred}
\end{figure}

 
Furthermore, Table~\ref{tab:results} reveals that all the evaluated models exhibit lower Recall scores relative to Precision, a trend attributable to the dataset's exclusive composition of small-sized tumors, which are inherently more challenging to detect than their normal-sized counterparts. This discrepancy underscores the significant difficulties in identifying small liver tumors. Nevertheless, our model stands out as the only one achieving a Recall score above 70.0\%, outperforming the second-best model by 1.8\%. This distinction highlights our model's effectiveness in addressing the task of small liver tumor detection compared to the other models. Fig.~\ref{pred} visualizes PCA-YOLO's successful detection examples. The Siamese network's accuracy and more visual comparison results with the other methods are provided in Appendix~\ref{appendix:a}, Supplementary Material. 

We also conducted a paired t-test comparing PCA-YOLO against all baseline models, with p-values reported in Table~\ref{tab:results}, using $\alpha = 0.05$. As shown, our model demonstrates statistically significant improvements over the majority of baselines; the exceptions are Self-Attention + YOLOv8x ($p = 0.052$) and MTNet + YOLOv8x ($p = 0.064$), which narrowly miss the significance threshold. These findings indicate that PCA-YOLO consistently outperforms existing methods, with strong statistical evidence supporting its effectiveness. 


\subsection{Ablation Study of the PCA Module:} As previously discussed, we explored an alternative approach by directly applying the Class Activation Map (CAM)~\cite{zhou2016learning} from the Siamese network as attention maps, rather than using normalized patch attention. However, as indicated by Table~\ref{tab:abl}, this method results in only a 0.8\% increase in the mAP@50\% score, significantly less than the 3.5\% enhancement achieved by PCA-YOLO. This suggests the superiority of our proposed PCA module over CAM-based attention, possibly due to the patch-based attention's compatibility with YOLO's anchor box mechanism. Further, an ablation study on the PCA module's normalization function, $g(y_i)$, highlights its critical role. As shown in Table~\ref{tab:abl}, omitting this normalization step leads to reductions in the mAP@50\%, Precision, and Recall scores by 1.6\%, 0.9\%, and 0.5\%, respectively, compared to the original PCA-YOLO model, underscoring the significance of the normalization process in our PCA module. 
% XXXXXX
Lastly, an ablation study is conducted to evaluate the impact of varying patch sizes. As demonstrated in Table~\ref{tab:abl}, a patch size of $64 \times 64$ attains superior performance compared to the configurations with the other two patch sizes.

\subsection{Error Analysis}
Examples of detection errors are presented in Fig.~\ref{error}. In Case (a), a small tumor was incorrectly merged with an adjacent larger tumor, despite medical annotations indicating them as separate entities. This may be due to the small tumor containing too few distinguishable features, making differentiation inherently challenging for the model. Although our model successfully detects most small tumors, rare cases like this suggest that further refinement in feature extraction could enhance robustness. Conversely, in Case (b), a single tumor was mistakenly detected as two distinct tumors, despite the model correctly capturing its location. This error may be attributed to inconsistencies in texture or intensity variations within the tumor, which cause the tumor to be split into multiple detections. While our model demonstrates strong overall detection performance, incorporating improved spatial feature aggregation in future iterations could further mitigate such splitting errors.

% We hope our SLTD dataset serves as a benchmark, accelerating progress in small tumor detection and facilitating the development of solutions to these challenges.
\begin{figure}[t]
    \centering
    \includegraphics[width=0.9\textwidth]{error.png}
    \caption{Examples of detection errors.}
    \label{error}
\end{figure}

\section{Conclusions}
In this paper, we proposed PCA-YOLO, a novel detection framework developed upon YOLOv8x, dedicated to addressing the challenges of detecting small-sized liver tumors --- a task that, besides being more challenging than identifying liver tumors of normal sizes, holds significant clinical importance. Our new PCA-YOLO model surpasses existing state-of-the-art detection methods in the realm of small liver tumor detection with an mAP@50\% score of 77.2\%. Furthermore, the PCA module we developed demonstrated superior performance over alternative attention mechanisms when integrated with the same detection architecture. To support this specialized detection task and for future follow-up research from the medical imaging community, we have compiled the SLTD dataset, which consists of 208 3D CT volumes, encompassing 41,587 2D slices, with 452 slices annotated with bounding boxes by our team of skilled radiologists. 



\bibliography{midl25_006}
\appendix
\section{Supplementary Material}
\label{appendix:a}
\begin{table}[h]
\centering
\caption{Accuracy of the Siamese network using patch $x_i$ and original image $X$ as inputs with a patch size of $64 \times 64$.}\label{Ablation}
\begin{tabular}{>{\centering\arraybackslash}p{4cm} >{\centering\arraybackslash}p{2cm}}
\hline
{\bfseries Model} & {\bfseries Accuracy (\%)} \\
\hline
Siamese~\cite{koch2015siamese} & $93.4 \pm 2.5$ \\
\hline
\end{tabular}
\end{table}

% \newpage
\begin{figure}
\includegraphics[width=\textwidth]{Compare.png}
\caption{Visual comparisons of PCA-YOLO (ours) with RT-DETR~\cite{lv2023detrs}, SPN-TS~\cite{he2023cancer}, and RCS-YOLO~\cite{kang2023rcs} for: (a) a small-sized tumor with bounding box size $d = 0.8$ cm, (b) a median-sized tumor in our dataset with bounding box size $d = 2.2$ cm, and (c) the largest tumor in our dataset with bounding box size $d = 5.0$ cm.} \label{head}
\end{figure}

\begin{figure}
\includegraphics[width=\textwidth]{attention.png}
\caption{Example of an input image augmented by the crop attention map.} \label{attention}
\end{figure}

% \begin{figure}
% \includegraphics[width=0.5\textwidth]{lits.png}
% \caption{Example of image from the LiTS~\cite{bilic2023liver} dataset} \label{lits}
% \end{figure}

\begin{figure}
    \centering
    \includegraphics[width=0.4\textwidth]{lits2.png}
    \caption{Example of an image from the LiTS dataset. Automatic bounding box extraction from segmentation masks may lead to ambiguities. For example, tumors A and B could potentially be encompassed within a single bounding box even though they are different tumors. Moreover, determining whether adjacent tumors in cluster C should be treated as a single large tumor or as separate tumors requires expert medical judgment. This distinction is crucial, as tumor count is a key parameter in HCC staging according to the RECIST standard and BCLC guidelines.}
    \label{lits}
\end{figure}


\end{document}
