\documentclass{midl} % Include author names

% The following packages will be automatically loaded:
% jmlr, amsmath, amssymb, natbib, graphicx, url, algorithm2e
% ifoddpage, relsize and probably more
% make sure they are installed with your latex distribution

\usepackage{mwe} % to get dummy images
\usepackage{multirow} % For multi-row table cells
\jmlrvolume{-- 42}
\jmlryear{2026}
\jmlrworkshop{Full Paper -- MIDL 2026}
\editors{Accepted for publication at MIDL 2026}


\title[Detection vs Segmentation for Malaria Diagnosis]{Detection versus Instance Segmentation for Multi-Species Malaria Diagnosis: A Head-to-Head Comparison and Multi-Dataset Validation of YOLOv12 Architectures with Small Object Optimization}

% Three authors with the same address:
\midlauthor{\Name{Ahmed Tahiru Issah} \Email{aissah@andrew.cmu.edu}\\
\Name{Idaya Seidu} \Email{iseidu@andrew.cmu.edu}\\
 \Name{Carine Mukamakuza\midljointauthortext{Corresponding author}} \Email{cmukamak@andrew.cmu.edu}\\
 \addr Carnegie Mellon University Africa}

 

\begin{document}

\maketitle

\begin{abstract}
Automated malaria parasite detection using deep learning holds promise for addressing diagnostic gaps in resource-limited settings, yet most studies rely on single-dataset evaluations that fail to capture real-world variability. In this work, we rigorously validate YOLOv12-based architectures for malaria detection across diverse geographic and institutional contexts. We introduce a dual-head architecture combining instance segmentation with a high-resolution P2 detection head to target tiny ring-stage parasites. Our evaluation on a diverse Rwandan thick-smear dataset (2,739 images) and two external datasets from Ghana (Lacuna) and Nigeria (FASTMAL) reveals critical insights into model robustness. While the proposed YOLOv12-Seg-N-P2 model achieves state-of-the-art internal performance (mAP@50 $0.888$) and significantly improves detection of challenging \textit{P. vivax} ($+10.9\%$) and \textit{P. falciparum} ring forms, external validation exposes severe domain shift, with performance dropping by $>80\%$ on unseen datasets. We further demonstrate that while P2 heads enhance morphological precision on source data, they reduce zero-shot generalization, likely by overfitting to dataset-specific acquisition characteristics. We additionally evaluate white blood cell (WBC)-anchored stain normalization and pixel-scale rescaling as inference-time domain adaptation strategies. While WBC detection improves substantially (up to $+45\%$ on Lacuna), \textit{P.~falciparum} detection remains critically low across both external datasets despite partial recovery on FASTMAL, confirming that preprocessing-based adaptation alone is insufficient for reliable cross-site parasite detection.

\end{abstract}

\begin{keywords}
Malaria diagnosis, deep learning, YOLOv12, instance segmentation, object detection, computer vision, medical imaging
\end{keywords}

\section{Introduction}
Malaria remains a major global health threat, causing an estimated 263 million cases and 597,000 deaths in 2023, with the burden concentrated in sub-Saharan Africa \cite{who_wmr2024}. Effective control relies on rapid, accurate diagnosis to guide treatment and limit drug resistance. Multiple \textit{Plasmodium} species infect humans, including \textit{P. falciparum}, \textit{P. vivax}, \textit{P. malariae}, and \textit{P. ovale} \cite{cdc_dpdx_malaria}, making species identification essential for appropriate treatment \cite{cdc_malaria_quickref_2025}. Microscopy of Giemsa-stained smears remains the gold standard but is slow, subjective, and dependent on skilled microscopists \cite{cdc_malaria_diagnosis}. In low-resource settings, thick blood smears are a primary diagnostic tool \cite{cdc_malaria_diagnosis,manescu_expert-level_2020}.

Thick smears concentrate parasites within a small field, enhancing sensitivity for low parasitemia \cite{koirala_deep_2022} and are therefore standard in endemic regions \cite{manescu_expert-level_2020}. Despite this, deep learning research has focused on thin smears \cite{akpo_binary_2024}, which are less complex than thick smears that include debris, platelets, and white blood cells \cite{nakasi_dataset_2025, koirala_deep_2022}. This mismatch between research and clinical practice underscores the need for modality-relevant validation studies.

Deep learning has expanded automated microscopy capabilities. Early CNN classifiers like VGG19 and InceptionV3 labeled cropped cells as parasitized or uninfected \cite{okoronkwo_comparative_2025}, but these approaches do not reflect real diagnostic workflows. Object detection models, particularly the YOLO family, now identify parasites in whole fields and estimate density \cite{lipsa_5g_2025}, evolving from YOLOv3 and YOLOv4 \cite{lipsa_5g_2025} to YOLOv5 and YOLOv8 \cite{koirala_deep_2022, zedda_deep_2025}. Task-specific validation shows YOLOv5 can outperform later versions in speed and memory \cite{lipsa_5g_2025}, while variants like YOLO-mp \cite{koirala_deep_2022} and YOLO-PAM \cite{zedda_yolo-pam_2023} demonstrate detection-based potential.

Debate exists on whether bounding box detection or instance segmentation is better for parasite identification. Segmentation provides pixel-level precision and is widely used in biomedical imaging \cite{akpo_binary_2024, abraham_malaria_2019}, but malaria parasites are small, so the additional annotation and computational load may offer limited benefits. No direct comparison of high-performing detection and segmentation models on thick smears exists; this study provides one.

Detecting early ring-stage parasites remains difficult due to their small size. Standard feature pyramid networks lose high-resolution detail at deeper layers, reducing detection of small objects. Introducing a P2 detection head on early feature maps preserves fine spatial detail and may improve detection \cite{koirala_deep_2022}. Previous work has highlighted this challenge \cite{zedda_deep_2025} and explored small-object modules \cite{zhang_multi-branch_2024}. This study evaluates whether adding P2 heads to YOLOv12 detection and segmentation models yields measurable improvements.

Finally, reliance on single-dataset validation limits robustness. Domain differences from slide preparation, staining, microscope hardware, and geography can reduce model performance by 5--30\% when applied elsewhere \cite{zedda_deep_2025, okoronkwo_comparative_2025, nakasi_dataset_2025}. Robust external validation is essential, and this study tests zero-shot generalization across datasets from Ghana \cite{lacuna_dataset, nakasi_dataset_2025} and Nigeria \cite{manescu_expert-level_2020}.

These clinical, methodological, and technical gaps motivate the central aim of this paper. The goal is not to introduce a new architecture but to provide a rigorous validation of existing models. The study examines the relative strengths of detection and segmentation for thick smears, evaluates the benefit of P2 heads for small parasite detection, measures generalization across diverse datasets, assesses computational efficiency for resource-limited settings, and analyzes performance on challenging cases such as rare species and low-parasitemia slides. Through this evidence, the study seeks to clarify best practices and support progress toward clinically deployable automated malaria diagnostics.


\section{Related Work}

\subsection{Manual and Automated Malaria Microscopy}
For more than a century, the examination of stained blood smears under a microscope has remained the gold standard for malaria diagnosis because it is affordable, sensitive, and capable of providing parasite density, species, and life-cycle stage information \cite{lipsa_5g_2025, wangai_sensitivity_2011, poostchi_image_2018}. Despite its strengths, manual microscopy depends heavily on the skill and experience of the microscopist, which leads to substantial variability in diagnostic consistency \cite{koirala_deep_2022}. Thick blood smears, although widely used for detection in endemic regions, introduce additional challenges. The lysis of red blood cells can distort or remove parasites during staining and washing, which may lower visible parasite counts \cite{bejon_thick_2006}. The resulting images contain overlapping layers of parasites, white blood cells, platelets, and debris, which makes accurate identification difficult \cite{poostchi_image_2018, zedda_deep_2025, koirala_deep_2022, fatima_automatic_2020}. Digitized microscopy and automated analysis offer a way to reduce subjectivity and workload, and deep learning methods have become central to these efforts.

\subsection{Evolution of Deep Learning Architectures for Object Detection}
Deep learning for medical imaging has advanced rapidly, especially in object detection. Early work was dominated by two-stage detectors such as the R-CNN family, which proposed candidate regions before classification. These models achieved high accuracy but were often too slow for real-time use \cite{lipsa_5g_2025, zedda_deep_2025}. One-stage detectors, including the YOLO family, removed the region proposal step and predicted bounding boxes and classes directly, which made them faster and more suitable for real-time applications \cite{lipsa_5g_2025, koirala_deep_2022}. Successive YOLO versions improved this balance between speed and accuracy, although earlier versions sometimes outperform later ones in efficiency \cite{lipsa_5g_2025}. Many modern models also incorporate attention modules such as CBAM, which highlight important features like parasite nuclei and suppress irrelevant background patterns \cite{zedda_deep_2025, zhang_multi-branch_2024}. These innovations have been essential in adapting general vision models to medical microscopy.

\subsection{Deep Learning Applications in Malaria Detection}
Early deep learning studies treated malaria diagnosis as a simple classification task, using CNNs to label isolated cell patches as parasitized or uninfected \cite{okoronkwo_comparative_2025}. These works showed feasibility but relied on external preprocessing to crop cells, which limited clinical relevance. The use of object detection models marked a major shift. YOLO-based systems can process a full microscopic field, locate parasites, and classify species or stages, which aligns more closely with clinical needs such as estimating parasitemia \cite{koirala_deep_2022, zedda_deep_2025}. Many studies have refined these detectors and achieved strong results, yet an important methodological gap remains. Object detection offers efficient localization through bounding boxes, while instance segmentation provides more precise pixel-level masks. A direct and rigorous comparison of these two approaches for thick smear parasites is still missing. This study addresses that gap.

\subsection{Segmentation in Medical Imaging}
Instance segmentation provides pixel-level masks for each object rather than simple bounding boxes \cite{akpo_binary_2024}. This level of detail is important in areas such as oncology or anatomical analysis, where accurate boundaries influence treatment decisions and measurement \cite{athalye_domain-guided_2023, akpo_binary_2024}. U-Net and its variants remain foundational architectures for biomedical segmentation \cite{ronneberger2015unet}. However, segmentation requires labor-intensive pixel-level annotations and is more computationally demanding than object detection. For malaria diagnosis in thick smears, it is unclear whether precise pixel boundaries provide enough added value to justify this cost, especially given the small size and simple morphology of parasites.

\subsection{Small Object Detection and P2 Heads}
Many detection models struggle with very small objects \cite{zedda_yolo-pam_2023, zedda_deep_2025}. Early ring forms of malaria parasites occupy only a small pixel area, and down-sampling through the feature hierarchy often removes the fine details needed for accurate detection \cite{zedda_yolo-pam_2023, zedda_deep_2025}. Feature Pyramid Networks address multi-scale features, but deeper layers sacrifice spatial resolution. The P2 head is a solution found in modern architectures that adds a detection head at a higher resolution level of the feature pyramid \cite{mura_yolo-tryppa_2025}. This preserves spatial detail before it is lost and improves sensitivity to tiny targets \cite{mura_yolo-tryppa_2025, zedda_deep_2025}. Evidence from computer vision supports this strategy, and some malaria studies have used variants of early-layer feature routing \cite{koirala_deep_2022}. A formal validation of P2 heads in state-of-the-art malaria detectors is still missing, and this study provides that evaluation.

\subsection{The Importance of External Validation and Domain Shift}
Domain shift continues to be a major barrier to clinical deployment. Models often perform well on internal datasets but decline sharply when applied to data from different sites \cite{zedda_deep_2025, sukumarran_optimised_2024}. Reported drops in performance can reach thirty percent across studies \cite{zedda_deep_2025, okoronkwo_comparative_2025, sukumarran_optimised_2024}. Malaria microscopy is highly heterogeneous, with variation in staining, microscope models, camera sensors, image pipelines, slide preparation, and geographic differences in parasite appearance \cite{nakasi_dataset_2025, zedda_deep_2025, okoronkwo_comparative_2025}. Many studies rely on single-dataset testing, which hides these issues and inflates expectations of real-world performance \cite{okoronkwo_comparative_2025, sukumarran_optimised_2024, zedda_deep_2025}. Multi-center external validation is essential for assessing generalizability.

\subsection{Computational Efficiency for Point of Care Deployment}
Real-world deployment requires attention to computational constraints, especially in low-resource settings where malaria is most common \cite{lipsa_5g_2025}. Many clinics operate without high-end hardware, and diagnostic systems must run on modest devices, including mobile phones or embedded processors \cite{alawfi_hybrid_2025, koirala_deep_2022, zedda_deep_2025}. This creates a balance between accuracy, inference speed, memory use, and computational cost. Larger models may be accurate but impractical, while lightweight models may be fast but insufficiently reliable. Research in malaria detection now regularly considers these trade-offs, and any thorough evaluation must include computational performance \cite{koirala_deep_2022}.

\subsection{Diagnosing Rare Species and Challenging Cases}
A clinically useful system must detect rare species and difficult cases as reliably as common ones. Less prevalent species such as \textit{P. malariae} and \textit{P. ovale} are often misidentified even by skilled microscopists \cite{alawfi_hybrid_2025, zedda_deep_2025}. Early ring forms further increase difficulty. These challenges arise mainly from data imbalance, since most datasets overrepresent common species like \textit{P. falciparum} \cite{ramarolahy2021classification}. Models trained on imbalanced data tend to favor majority classes, which weakens performance on minority cases \cite{alawfi_hybrid_2025}. Data augmentation can help by generating synthetic samples through rotation, scaling, or color variation \cite{islam_systematic_2024}. However, its effectiveness for rare \textit{Plasmodium} species remains uncertain, which supports this study's focus on evaluating performance in rare and challenging scenarios.

\subsection{Synthesis of Gaps in the Literature}
Current research shows several gaps that limit progress toward clinically deployable malaria detection systems. Detection and segmentation methods have not been directly compared on thick smear images. Key features such as P2 heads remain insufficiently validated for small ring-stage parasites. Many studies omit external multi-center testing, which leads to an incomplete understanding of domain shift. Furthermore, high computational demands remain a barrier to deployment in low-resource settings. Performance on rare species and difficult cases is weakened by data imbalance. This study addresses these gaps to build a clearer evidence base for reliable and clinically useful diagnostic models.

\section{Methods}

\subsection{Study Design and Validation Framework}

We conducted a rigorous two-phase validation study to evaluate YOLOv12-based models for automated malaria parasite identification in thick blood smears. Our investigation addresses two key architectural questions: (1) whether instance segmentation provides advantages over standard object detection, and (2) whether incorporating high-resolution P2 detection heads improves identification of tiny parasites.

The validation framework consists of an internal phase using a Rwandan dataset spanning four \textit{Plasmodium} species, followed by an external phase testing generalization across two geographically and institutionally distinct datasets. Critically, we employ a zero-shot evaluation protocol: models are trained exclusively on Rwandan data and evaluated on external datasets without any adaptation or fine-tuning, simulating real-world deployment where labeled data from the target institution may be unavailable.

\subsection{Training and Validation Data}

\subsubsection{Rwandan Dataset Collection and Annotation}

Our primary dataset originates from a collaboration with the Rwanda Biomedical Center (RBC), the national reference laboratory for malaria diagnostics. Blood samples were collected from patients presenting with fever at healthcare facilities, following established protocols documented in prior work by our lab \citep{akpo_binary_2024, issah_bridging_2026}. Expert microscopists at RBC prepared Giemsa-stained thick blood smears, after which we captured images at 100$\times$ magnification using an Olympus microscope.

Thick smears were specifically selected over thin smears due to their superior sensitivity for parasite detection, which is approximately 11-times higher \citep{poostchi_image_2018}, thus making them the preferred modality for clinical screening in endemic regions. The dataset includes 2,739 validated images with comprehensive annotations for four \textit{Plasmodium} species: \textit{P. falciparum} (Pf), \textit{P. malariae} (Pm), \textit{P. ovale} (Po), and \textit{P. vivax} (Pv).

Image annotation was performed using the VGG Image Annotator (VIA) tool, with each parasite instance receiving both a bounding box (rectangular boundary) and a polygonal segmentation mask (precise contour). This dual annotation strategy enables training of both detection-only and segmentation-capable models. All annotations underwent iterative validation by RBC expert microscopists, with corrections incorporated until consensus was achieved. Of approximately 6,000 images collected to date, only the 2,739 fully validated images were utilized for model development, while the remaining images continue undergoing expert review for future work.

The dataset exhibits substantial class imbalance reflective of natural epidemiological patterns. \textit{P. falciparum} is represented by 838 images containing 7,568 parasite instances, while \textit{P. ovale} appears in 893 images with 2,353 instances. \textit{P. malariae} comprises 834 images containing 1,802 instances, and \textit{P. vivax} is severely underrepresented with only 174 images containing 669 instances (6.4\% of data). This severe underrepresentation of \textit{P. vivax} necessitated the species-targeted augmentation approach described in Section 3.4.2.

Data partitioning followed a 70-15-15 percentage split, yielding 1,915 training images, 410 validation images, and 411 test images.

\subsubsection{External Validation Datasets}

We assembled two publicly available thick smear datasets to evaluate model generalization across diverse acquisition conditions, geographic populations, and annotation protocols.

The Lacuna Malaria Dataset, compiled through the Lacuna Fund initiative, contains 3,925 thick smear images from Ghana \citep{lacuna_dataset, nakasi_dataset_2025}, representing a distinct West African epidemiological context. Images were provided with pre-formatted YOLO bounding box annotations for \textit{P. falciparum} and white blood cells (WBCs) only.

The FASTMAL Clinical Microscopy Dataset, developed by University College London (UCL) researchers, provides high-resolution (2560$\times$2160) thick blood film images from \textit{P. falciparum} infected patients with detailed rectangular bounding box annotations \citep{manescu_giemsa_2020}. Images were captured using extended depth-of-field microscopy with z-stack projection. To ensure annotation quality and format compatibility, we: (i) converted TIFF images to JPEG format, (ii) transformed proprietary JSON annotations to COCO format, and (iii) filtered out all ambiguous annotations (crowd instances, background, and ignore regions), retaining only cleanly annotated parasite and white blood cell instances.

\tableref{tab:external-datasets} summarizes the key characteristics of these external validation datasets, including species coverage, dataset size, geographic origin, and annotation methodology.

\begin{table}[htbp]
\floatconts
{tab:external-datasets}
{\caption{External Validation Datasets Summary}}
{%
\begin{tabular}{lllll}
\bfseries Dataset & \bfseries Species Covered & \bfseries Image Count & \bfseries Source Region & \bfseries Annotation Type\\
\hline
Lacuna & \textit{P. falciparum} & 3,925 & Ghana & Bounding Boxes\\

FASTMAL & \textit{P. falciparum} & 243 & Nigeria & Bounding Boxes\\
\end{tabular}
}
\end{table}

A significant limitation is the absence of publicly available thick smear datasets containing \textit{P. malariae} or \textit{P. ovale} with object-level annotations. We found publicly available datasets from the National Library of Medicine (NLM) containing \textit{P. falciparum} and \textit{P. vivax} \citep{NLM}; however, persistent EXIF orientation metadata conflicts prevented reliable spatial alignment between images and annotations despite multiple preprocessing attempts. Consequently, external validation is restricted to \textit{P. falciparum} only, with \textit{P. vivax}, \textit{P. malariae}, and \textit{P. ovale} performance assessed only on the internal Rwandan test set.

\subsection{Model Architectures and P2 Head Integration}

\subsubsection{YOLOv12 Framework Selection}

We selected YOLOv12n as our base architecture due to its recently introduced Attention-Centric design \citep{tian_yolov12_2025}, which replaces traditional convolutional layers with Area Attention (A2) mechanisms. This architectural innovation reduces computational complexity while maintaining or improving detection accuracy, particularly relevant for deployment in resource-constrained clinical settings.

The standard YOLOv12 architecture employs a Feature Pyramid Network (FPN) with detection heads at three scales: P3 (stride 8), P4 (stride 16), and P5 (stride 32). These strides correspond to 1/8th, 1/16th, and 1/32nd of the input resolution respectively.

\subsubsection{Addressing the Small Object Challenge with P2 Heads}

\textit{Plasmodium} parasites, particularly \textit{P. falciparum} ring-stage trophozoites, are exceptionally small objects in microscopy images. At 2048$\times$2048 input resolution, early-stage rings frequently occupy fewer than 20$\times$20 pixels. The standard P3-P5 detection pyramid operates at increasingly coarse spatial resolutions, potentially losing fine morphological details necessary to distinguish parasites from staining artifacts or cellular debris.

To preserve high-resolution spatial information, we augmented both detection and segmentation architectures with a P2 detection head operating at stride 4 (512$\times$512 effective feature map resolution). This modification creates four model variants: YOLOv12-Obj-N (standard object detection with P3-P5 heads only), YOLOv12-Obj-N-P2 (detection with added P2 head), YOLOv12-Seg-N (instance segmentation with P3-P5 heads), and YOLOv12-Seg-N-P2 (segmentation with added P2 head).


\subsection{Data Preprocessing and Training Protocol}

\subsubsection{Image Standardization}

All preprocessing operations were executed through Roboflow, which automatically adjusts bounding box and polygon annotations to match geometric transformations. Images were resized to 2048$\times$2048 pixels using a letterbox method (``Fit with Black Edges'' in Roboflow terminology). This approach scales the longer image dimension to 2048 pixels, proportionally scales the shorter dimension, then pads remaining space with zero-valued (black) pixels. Critically, this preserves aspect ratios and prevents morphological distortion of parasite structures that would occur with stretch-based or crop-based resizing methods.

Microscope cameras embed rotation metadata (EXIF tags) that can create inconsistent coordinate systems across images. We applied auto-orientation preprocessing to standardize pixel layouts, preventing systematic annotation misalignment \citep{auto_orient_roboflow_2020}.


\subsubsection{Addressing Class Imbalance Through Differential Augmentation}

The extreme scarcity of \textit{P. vivax} examples (121 training images versus 583--625 for other species) necessitated a targeted augmentation strategy. Rather than applying uniform augmentation factors, we implemented species-specific augmentation \citep{issah_bridging_2026}, with expansion factors proportional to underrepresentation: \textit{P. falciparum}, \textit{P. malariae}, and \textit{P. ovale} received 3-fold expansion, while \textit{P. vivax} received 10-fold expansion.

The applied augmentations are given in \tableref{tab:augmentation-transforms}, with values informed by augmentation protocols validated in prior malaria detection studies \citep{zedda_yolo-pam_2023}. Roboflow generates augmented variants by randomly sampling from specified transformation ranges for each original training image, while validation and test sets remain unaugmented to ensure evaluation realism.

\begin{table}[htbp]
\floatconts
{tab:augmentation-transforms}
{\caption{Augmentation Transformation Specifications}}
{%
\begin{tabular}{lll}
\bfseries Augmentation & \bfseries Values & \bfseries Purpose\\
\hline
Rotation & Discrete: 90$^\circ$, 180$^\circ$, 270$^\circ$ & Accounts for varying slide placement\\
Hue shift & Continuous: $\pm$20$^\circ$ & Simulate stain and light variation\\
Saturation & Continuous: $\pm$30\% & Represents stain color variability\\
Brightness & Continuous: $\pm$20\% & Simulates variable illumination intensity\\
\end{tabular}
}
\end{table}

These values were adopted from YOLO-PAM \citep{zedda_yolo-pam_2023}, which demonstrated their effectiveness for Giemsa-stained microscopy. Importantly, we excluded shearing, perspective transforms, and other geometric distortions explicitly discouraged in that study, as they generate biologically implausible parasite morphologies.
The final balanced and augmented dataset is given in \tableref{tab:training-composition}.



\begin{table}[htbp]
\floatconts
{tab:training-composition}
{\caption{Final Training Dataset Composition After Species-Specific Augmentation}}
{%
\begin{tabular}{lccccc}
\bfseries Species & \bfseries Original & \bfseries Expansion Factor & \bfseries Final Train & \bfseries Final Val & \bfseries Final Test\\
\hline
\textit{P. falciparum} & 586 & $\times$3 & 1,761 & 125 & 126\\
\textit{P. malariae} & 583 & $\times$3 & 1,752 & 125 & 125\\
\textit{P. ovale} & 625 & $\times$3 & 1,875 & 134 & 134\\
\textit{P. vivax} & 121 & $\times$10 & 1,210 & 26 & 26\\
\hline
Totals & 1,915 & --- & 6,598 & 410 & 411\\
\end{tabular}
}
\end{table}

\subsubsection{Model Training Configuration}

All models were trained for 70 epochs on an NVIDIA H100 GPU (80GB VRAM) using consistent hyperparameters to ensure fair comparison. We employed Stochastic Gradient Descent (SGD) with momentum of 0.937 and weight decay of $5\times10^{-4}$. The batch size was set to 6, constrained by the 2048$\times$2048 resolution, and mixed-precision training (FP16) was enabled to accelerate computation.

Initial experiments revealed training instability (exploding gradients manifesting as NaN losses) for P2-augmented models when using the standard learning rate of 0.01. This instability stems from the additional high-resolution detection head introducing steeper gradient magnitudes. To stabilize training without compromising convergence, we reduced the initial learning rate to 0.005 for P2 models only, while maintaining 0.01 for baseline models. This adjustment resolved gradient explosions and ensured all models converged within the 70-epoch window.

Training optimized a composite objective combining classification, localization, and (for segmentation models) mask prediction losses:

\begin{equation}
\mathcal{L}_{\text{total}} = \lambda_{\text{cls}} \mathcal{L}_{\text{cls}} + \lambda_{\text{box}} \mathcal{L}_{\text{box}} + \lambda_{\text{dfl}} \mathcal{L}_{\text{dfl}} + \lambda_{\text{mask}} \mathcal{L}_{\text{mask}}
\end{equation}

The classification loss $\mathcal{L}_{\text{cls}}$ ($\lambda=0.5$) employs multi-class cross-entropy for species classification. The bounding box loss $\mathcal{L}_{\text{box}}$ ($\lambda=7.5$) uses Complete Intersection-over-Union (CIoU) for accurate localization regression. The distribution focal loss $\mathcal{L}_{\text{dfl}}$ ($\lambda=1.5$) refines localization precision. For segmentation models, the mask loss $\mathcal{L}_{\text{mask}}$ applies binary cross-entropy for pixel-wise segmentation. Training progress was monitored via Weights \& Biases for reproducibility.

\subsection{Performance Metrics and Clinical Interpretation}

We evaluated models using four complementary metrics, each addressing distinct clinical requirements.

\subsubsection{Recall (Sensitivity)}

Recall, defined as the ratio of true positives to the sum of true positives and false negatives, serves as our primary safety metric. In infectious disease screening, failing to identify an infected patient (false negative) can result in untreated infection and potential mortality. Therefore, models with high recall at the expense of modest precision are clinically preferable. For multi-species detection, species-specific recall indicates whether the model reliably identifies challenging species, and is computed as:

\begin{equation}
\text{Recall} = \frac{\text{True Positives}}{\text{True Positives} + \text{False Negatives}}
\end{equation}

\subsubsection{Mean Average Precision at IoU 0.5 (mAP@50)}

This metric measures successful detection under a lenient spatial overlap criterion (50\% Intersection-over-Union). For malaria screening, identifying the approximate parasite location is clinically sufficient, whereas exact pixel boundaries are less critical than flagging the infected cell for microscopist verification. Higher mAP@50 indicates robust parasite localization across diverse morphologies and imaging conditions, and is computed as:

\begin{equation}
\text{mAP@50} = \frac{1}{N_{\text{classes}}} \sum_{i=1}^{N_{\text{classes}}} \text{AP}_i(0.5)
\end{equation}

\subsubsection{Precision (Positive Predictive Value)}

Precision quantifies the reliability of positive predictions. While important for minimizing false alarms, precision is secondary to recall. Missed parasites are more critical than a few misclassified artifacts. Precision is computed as:

\begin{equation}
\text{Precision} = \frac{\text{True Positives}}{\text{True Positives} + \text{False Positives}}
\end{equation}

\subsubsection{Mean Average Precision at IoU 0.5-0.95 (mAP@50-95)}

This metric averages precision across stringent IoU thresholds (50\%-95\% overlap). For tiny objects like malaria parasites, achieving 95\% IoU is mathematically challenging, given that single-pixel shifts dramatically reduce overlap. While standard in computer vision benchmarking, mAP@50-95 is less clinically meaningful than mAP@50 for object detection applications like this, where we are interested in counting. It is computed as:

\begin{equation}
\text{mAP@50-95} = \frac{1}{10} \sum_{t \in \{0.50, 0.55, \ldots, 0.95\}} \frac{1}{N_{\text{classes}}} \sum_{i=1}^{N_{\text{classes}}} \text{AP}_i(t)
\end{equation}



\subsection{Reproducibility Statement}

Implementation utilized PyTorch with Ultralytics YOLOv12. All hyperparameters and training configurations are fully specified above.



\section{Results and Discussion}

\subsection{Internal Validation (Rwandan Test Set)}

\subsubsection{Baseline Model Comparison (Detection vs Segmentation)}

Table \ref{tab:baseline-comparison} presents overall performance of baseline detection (YOLOv12-Obj-N) and segmentation (YOLOv12-Seg-N) models.

\begin{table}[htbp]
\floatconts
{tab:baseline-comparison}
{\caption{Baseline Model Performance Comparison (Box Metrics, All Classes)}}
{%
\begin{tabular}{llcccc}
\bfseries Model & \bfseries Architecture & \bfseries Precision & \bfseries Recall & \bfseries mAP@50 & \bfseries mAP@50-95\\
\hline
YOLOv12-Obj-N & Detection & \textbf{0.837} & 0.820 & \textbf{0.874} & 0.636\\
YOLOv12-Seg-N & Segmentation & 0.800 & \textbf{0.829} & 0.859 & \textbf{0.640}\\
\end{tabular}
}
\end{table}

Aggregate metrics show minimal differences, but per-species analysis (Table \ref{tab:baseline-per-species}) reveals important patterns.

\begin{table}[htbp]
\floatconts
{tab:baseline-per-species}
{\caption{Baseline Model Performance by Species (Box mAP@50)}}
{%
\begin{tabular}{lcc}
\bfseries Species & \bfseries Obj-N & \bfseries Seg-N\\
\hline
\textit{P. falciparum} & 0.752 & \textbf{0.771}\\
\textit{P. malariae} & 0.837 & \textbf{0.903}\\
\textit{P. ovale} & \textbf{0.921} & 0.897\\
\textit{P. vivax} & \textbf{0.940} & 0.821\\
\end{tabular}
}
\end{table}

Segmentation substantially benefits rare species: \textit{P. malariae} (0.837 to 0.903, +7.9\% relative) and \textit{P. falciparum} (0.752 to 0.771, +2.5\% relative). Pixel-level supervision helps distinguish rare parasites from complex backgrounds. Conversely, segmentation reduces \textit{P. vivax} performance (0.940 to 0.821), likely because its distinctive amoeboid morphology is well-captured by bounding boxes. \textbf{RQ1:} Segmentation provides selective benefits for rare species but not universally. It is most useful when morphological precision is required.

\subsubsection{P2 Head Impact on Segmentation Models}

Table \ref{tab:p2-segmentation-impact} compares segmentation models with and without P2 heads.

\begin{table}[htbp]
\floatconts
{tab:p2-segmentation-impact}
{\caption{P2 Head Impact on Segmentation Models (All Classes)}}
{%
\begin{tabular}{lccc}
\bfseries Metric & \bfseries Seg-N & \bfseries Seg-N-P2 & \bfseries $\Delta$\\
\hline
Box mAP@50 & 0.859 & \textbf{0.888} & \textbf{+2.9\%}\\
Box mAP@50-95 & 0.640 & \textbf{0.676} & \textbf{+3.6\%}\\
Mask mAP@50 & 0.859 & \textbf{0.888} & \textbf{+2.9\%}\\
Mask mAP@50-95 & 0.605 & \textbf{0.656} & \textbf{+5.1\%}\\
\end{tabular}
}
\end{table}

The P2 head consistently improves all metrics, with mask mAP@50-95 improvements (+5.1\%) exceeding box mAP@50-95 improvements (+3.6\%), suggesting benefits primarily for boundary precision.

\subsubsection{Per-Class Performance with P2 Head on Segmentation Models}

Table \ref{tab:p2-per-class} reveals species-specific heterogeneity.

\begin{table}[htbp]
\floatconts
{tab:p2-per-class}
{\caption{Per-Class P2 Impact on Segmentation Models}}
{%
\begin{tabular}{llccc}
\bfseries Class & \bfseries Metric & \bfseries Seg-N & \bfseries Seg-N-P2 & \bfseries $\Delta$\\
\hline
\multirow{2}{*}{\textit{P. falciparum}} & Box mAP@50 & 0.771 & \textbf{0.779} & \textbf{+0.8\%}\\
 & Mask mAP@50-95 & 0.444 & \textbf{0.477} & \textbf{+3.3\%}\\
\hline
\multirow{2}{*}{\textit{P. malariae}} & Box mAP@50 & 0.903 & \textbf{0.907} & \textbf{+0.4\%}\\
 & Mask mAP@50-95 & 0.629 & \textbf{0.660} & \textbf{+3.1\%}\\
\hline
\multirow{2}{*}{\textit{P. vivax}} & Box mAP@50 & 0.821 & \textbf{0.930} & \textbf{+10.9\%}\\
 & Mask mAP@50-95 & 0.501 & \textbf{0.669} & \textbf{+16.8\%}\\
\end{tabular}
}
\end{table}

\textit{P. vivax} shows dramatic improvement (+10.9\% box, +16.8\% mask), an indication that high-resolution P2 features enable better generalization particularly for Pv's complex amoeboid morphology. Other species show modest box improvements (0.4--0.8\%) but substantial mask improvements (3.1--3.3\%), indicating P2 primarily refines boundary precision. For ring-stage \textit{P. falciparum}, the 3.3\% mask improvement represents meaningful morphological refinement.

\subsubsection{P2 Head Impact on Detection Models}

Table \ref{tab:p2-detection-impact} shows more complex effects for detection-only models.

\begin{table}[htbp]
\floatconts
{tab:p2-detection-impact}
{\caption{P2 Head Impact on Detection Models (All Classes)}}
{%
\begin{tabular}{lccc}
\bfseries Metric & \bfseries Obj-N & \bfseries Obj-N-P2 & \bfseries $\Delta$\\
\hline
Precision & \textbf{0.837} & 0.787 & $-$5.0\%\\
Recall & 0.820 & \textbf{0.843} & \textbf{+2.3\%}\\
mAP@50 & \textbf{0.874} & 0.862 & $-$1.2\%\\
mAP@50-95 & \textbf{0.636} & 0.615 & $-$2.1\%\\
\end{tabular}
}
\end{table}

P2 increases recall by 2.3\% but reduces precision by 5.0\%, decreasing overall mAP. This recall-precision trade-off indicates that for detection-only models, higher sensitivity is outweighed by false positives.

\subsubsection{Computational Cost}

\tableref{tab:computational-cost} quantifies P2 overhead.

\begin{table}[htbp]
\floatconts
{tab:computational-cost}
{\caption{Computational Trade-offs of P2 Integration}}
{%
\begin{tabular}{lccc}
\bfseries Metric & \bfseries Seg-N & \bfseries Seg-N-P2 & \bfseries Change\\
\hline
Inference Time (ms) & 25.4 & 30.1 & +18.5\%\\
Parameters (M) & 2.76 & 2.83 & +2.5\%\\
{GFLOPs} & {7.9} & {8.4} & {+6.3\%}\\
\end{tabular}
}
\end{table}

P2 adds modest parameter overhead (2.5\%) but 18.5\% latency increase. This suggests burden stems from higher-resolution feature maps rather than parameters. {The proposed YOLOv12-Seg-N-P2 model maintains a compact footprint with only 2.83 million parameters and 8.4 GFLOPs, significantly more efficient than standard medical detection architectures like Faster R-CNN which has over 40M parameters, or U-Net ensembles.}

{While inference latency on GPU ($\approx$30ms) indicates high feasibility for real-time deployment in well-equipped microscopy labs, deployment to low-resource settings requires additional considerations. Modern mobile Neural Processing Units (NPUs) typically support $>$10 Tera Operations Per Second (TOPS), suggesting this model is theoretically capable of real-time inference on edge devices like smartphones through the standard ONNX or TFLite optimization pipelines. However, empirical validation on target hardware such as mobile phones was not conducted in this study; it is the next phase of our work and remains necessary for deployment in resource-constrained point-of-care settings.}

\textbf{RQ4:} Computational overhead is modest and acceptable for {deployment in well-equipped microscopy labs. Theoretical analysis suggests feasibility for mobile/edge deployment, but empirical validation on low-resource devices is required before point-of-care deployment.}

\subsection{External Validation (Lacuna \& FASTMAL)}

External validation used detection models only (external datasets lack masks). Zero-shot transfer (no fine-tuning) assesses domain shift robustness.

\subsubsection{Performance on Lacuna and FASTMAL}

Table \ref{tab:external-validation} presents results for \textit{P. falciparum} and WBC detections.

\begin{table}[htbp]
\floatconts
{tab:external-validation}
{\caption{External Validation Results (Zero-Shot Transfer)}}
{%
\begin{tabular}{llcccc}
\bfseries Dataset & \bfseries Model & \bfseries Class & \bfseries Precision & \bfseries Recall & \bfseries mAP@50\\
\hline
Lacuna & Obj-N & \textit{P. falciparum} & \textbf{0.140} & \textbf{0.174} & \textbf{0.0998}\\
 & & WBC & 0.341 & 0.441 & 0.366\\
\cline{2-6}
 & Obj-N-P2 & \textit{P. falciparum} & 0.117 & 0.0919 & 0.0690\\
 & & WBC & 0.425 & 0.379 & 0.369\\
\hline
FASTMAL & Obj-N & \textit{P. falciparum} & \textbf{0.169} & \textbf{0.323} & \textbf{0.128}\\
 & & WBC & 0.697 & 0.858 & 0.778\\
\cline{2-6}
 & Obj-N-P2 & \textit{P. falciparum} & 0.138 & 0.252 & 0.0975\\
 & & WBC & \textbf{0.771} & 0.856 & \textbf{0.806}\\
\end{tabular}
}
\end{table}

\textit{P. falciparum} performance collapses on external datasets: mAP@50 drops to 0.0998 (Lacuna) \citep{lacuna_dataset} and 0.128 (FASTMAL). P2 models underperform baselines by 24--31\% in relative Pf mAP@50 (0.0975 vs.\ 0.128 on FASTMAL; 0.0690 vs.\ 0.0998 on Lacuna). In contrast, WBC detection generalizes substantially better: FASTMAL WBC mAP reaches 0.778--0.806 (comparable to internal 0.921). Lacuna WBC mAP of 0.366--0.369 is lower but respectable. This asymmetry reflects object size and morphological complexity: larger, simpler WBCs transfer better than tiny, staining-dependent parasites.

\subsubsection{Domain Shift Quantification}

Table \ref{tab:domain-shift} quantifies the severity.

\begin{table}[htbp]
\floatconts
{tab:domain-shift}
{\caption{Domain Shift Magnitude Across External Datasets}}
{%
\begin{tabular}{lccc}
\bfseries Dataset & \bfseries Pf mAP@50 (Internal) & \bfseries Pf mAP@50 (External) & \bfseries Relative Drop\\
\hline
Lacuna & 0.75--0.78 & $\sim$0.10 & $\sim$87\%\\
FASTMAL & 0.75--0.78 & $\sim$0.13 & $\sim$83\%\\
\end{tabular}
}
\end{table}

Both datasets exhibit catastrophic Pf degradation (83--87\% drop), reflecting fundamental distribution shifts from differences in imaging equipment, staining, resolution, and presentation. The baseline (Obj-N) consistently outperforms P2 variants, suggesting P2's high-resolution features overfit to Rwandan characteristics (staining intensity, camera noise, lighting) and fail to generalize. 



% --- REPLACEMENT: DETAILED FORENSIC ANALYSIS ---
{To investigate the mechanism of this failure, we conducted a detailed visual analysis of the dataset distributions (\figureref{fig:domain_shift}). The internal Rwandan training data (\figureref{fig:domain_shift}A) consists of standard rectangular fields captured via a high-quality camera-mounted Olympus microscope with a characteristic organic pinkish/purple color. In contrast, the external datasets exhibit severe hardware-driven shifts. Lacuna images (\figureref{fig:domain_shift}B) were captured via mobile phones (Samsung S8+, Redmi) held to the eyepiece of an Olympus CX-23 microscope \citep{lacuna_dataset}, introducing circular vignetting, lens distortion, and uneven illumination absent in our training data. Conversely, FASTMAL images (\figureref{fig:domain_shift}C) were acquired using a high-end Olympus BX63 with a PCO Edge 5.5c camera and processed using Wavelet Extended Depth of Field (EDoF) \citep{manescu_giemsa_2020}. This creates a ``grainy'' texture and dark blue/gray hue. These differences in pixel scaling, geometry, and texture explain the drop in detection performance and poor generalization without domain adaptation, especially for the P2 head which relies on fine-grained morphological features.}
% -----------------------------------------------

\begin{figure}[htbp]
\floatconts
{fig:domain_shift}
{\caption{{Visual analysis of domain shifts driving performance degradation. (A) Internal Rwanda dataset (Standard microscopy, rectangular field). (B) Lacuna dataset (Mobile phone capture) \citep{lacuna_dataset}, exhibiting circular vignetting and geometric distortion. (C) FASTMAL dataset (Extended Depth of Field), exhibiting synthetic ``grainy'' texture and dark bluish hue.}}}
{%
\centering
\begin{minipage}{0.32\textwidth}
  \centering
  \includegraphics[width=\linewidth]{dataset_rwanda}
  \centerline{\footnotesize (A) Internal (Rwanda)}
\end{minipage}\hfill
\begin{minipage}{0.32\textwidth}
  \centering
  \includegraphics[width=\linewidth]{dataset_lacuna} 
  \centerline{\footnotesize (B) Lacuna (Mobile)}
\end{minipage}\hfill
\begin{minipage}{0.32\textwidth}
  \centering
  \includegraphics[width=\linewidth]{dataset_fastmal}
  \centerline{\footnotesize (C) FASTMAL (EDoF)}
\end{minipage}
}
\end{figure}

% ---------------- INSERTION ENDS HERE -----------




\textbf{RQ2:} P2 is effective for segmentation on source data, particularly for rare species (+10.9\% Pv) and boundary refinement (+3.3\% Pf mask). However, P2 fails on external data (24--31\% lower Pf mAP@50 than the baseline), indicating overfitting to Rwandan acquisition characteristics.

\textbf{RQ3:} Zero-shot transfer shows severe degradation for parasites (83--87\%) but lower degradation for WBCs ($\sim$25--60\%), underscoring tiny objects' vulnerability to domain shift. Current models require fine-tuning or domain adaptation before multi-site deployment.



% ================= DOMAIN ADAPTATION ADDITIONS HERE ============

\subsubsection{Domain Adaptation via Stain Normalization and Rescaling}
\label{sec:domain-adaptation}

The severe domain shift identified above motivates two preprocessing interventions, both clinically practical because WBCs are always present (with 20--40\% lymphocytes):

(1)~\textbf{WBC-anchored stain normalization}, which transfers Rwanda's colorimetric statistics onto the external test images using white blood cell (WBC) bounding-box crops to compute stain statistics; three methods were evaluated: Macenko \citep{macenko_method_2009}, Reinhard \citep{reinhard_color_2001}, and Vahadane \citep{vahadane_structure-preserving_2016};

(2)~\textbf{Lymphocyte-based rescaling}, which uses the mean WBC lymphocyte diameter in the Rwanda training set ($105.67$~px) as a biological scale reference to rescale external images to match internal pixel resolution. The rescaling factors for FASTMAL and Lacuna were determined as $0.830$  and $0.709$ respectively. Because Macenko and Vahadane apply OD-space statistics transfer with fixed Giemsa stain vectors, they produce identical outputs for thick Giemsa smears and are reported together.

Table~\ref{tab:domain-adaptation} presents mAP@50 test results for all preprocessing conditions applied to the original object detection zero-shot baseline model (Obj-N).

\begin{table}[htbp]
\floatconts
{tab:domain-adaptation}
{\caption{Effect of stain normalization and lymphocyte-based rescaling on zero-shot transfer (Obj-N). Macenko and Vahadane produce identical outputs for Giemsa thick smears and are shown as a single entry. Baseline values reproduced from Table~\ref{tab:external-validation}.}}
{%
\begin{tabular}{llcc}
\bfseries Dataset & \bfseries Preprocessing & \bfseries Pf mAP@50 & \bfseries WBC mAP@50\\
\hline
FASTMAL & Baseline (none) & 0.128 & 0.778\\
 & Rescaling only & 0.127 & 0.778\\
 & Macenko / Vahadane & \textbf{0.200} & \textbf{0.808}\\
 & Reinhard & 0.139 & 0.804\\
 & Rescaling + Macenko / Vahadane & \textbf{0.201} & 0.805\\
 & Rescaling + Reinhard & 0.140 & \textbf{0.808}\\
\hline
Lacuna & Baseline (none) & 0.100 & 0.366\\
 & Rescaling only & 0.097 & 0.367\\
 & Macenko / Vahadane & 0.095 & \textbf{0.532}\\
 & Reinhard & 0.096 & 0.411\\
 & Rescaling + Macenko / Vahadane & 0.096 & 0.530\\
 & Rescaling + Reinhard & 0.096 & 0.415\\
\end{tabular}
}
\end{table}

Stain normalization substantially improves WBC detection on Lacuna and modestly on FASTMAL. For \textit{P. falciparum}, Macenko/Vahadane normalization yields a $\sim$56\% relative gain on FASTMAL (0.128 to 0.200), yet Pf mAP@50 remains critically low ($\leq 0.201$). On Lacuna, no preprocessing condition improves Pf detection beyond baseline. Lymphocyte-based rescaling alone has no measurable effect on either class, confirming that YOLO's multi-scale training confers sufficient scale invariance. These findings indicate that although stain shift is a secondary contributor to Pf failure, the primary obstacle is the structural domain gap that these preprocessing steps alone cannot overcome.


% ================ DOMAIN ADAPTATION ENDS HERE



\subsection{Limitations and Future Directions}

% --- REPLACEMENT TEXT START ---
{While external validation was restricted to \textit{P. falciparum} due to data availability, this focus is clinically and technically justified. \textit{P. falciparum} accounts for the vast majority of malaria mortality \citep{who_wmr2024} and represents the most challenging computer vision task due to the minute size of early ring stages ($<$2~$\mu$m) and their resemblance to staining artifacts \citep{delahunt_metrics_2024}. Distinguishing these tiny targets from background noise is significantly harder than identifying the larger, distinct morphologies of \textit{P. vivax} or \textit{P. malariae}. Thus, performance on \textit{P. falciparum} serves as the primary stress-test for model robustness.}

{Our visual analysis (\figureref{fig:domain_shift}) suggested that performance degradation is likely driven by geometric (pixel scaling) and colorimetric (stain) shifts. To bridge this gap, we implemented lymphocyte-based rescaling and three stain normalization methods to align external datasets with the internal training distribution. As shown in Section~\ref{sec:domain-adaptation}, stain normalization substantially recovers WBC detection but only partially improves Pf detection on FASTMAL and yields no Pf improvement on the Lacuna mobile-phone dataset, leaving the catastrophic Pf degradation unresolved.}
% --- REPLACEMENT TEXT END ---


\subsection{Clinical and Validation Study Implications}

On the source domain, YOLOv12-Seg-N-P2 achieves strong performance (mAP@50 0.888), especially for challenging cases like Pv and Pf ring forms. However, external validation reveals these models are not deployment-ready without further validation. The severe domain shift reflects a fundamental medical AI challenge: models learn dataset-specific characteristics rather than generalizable patterns. This underscores the critical importance of external validation. For clinical adoption, models must be validated across institutions, deployed with on-site fine-tuning, or used as decision-support tools with human expertise. Our findings provide evidence-based guidance: segmentation models are preferable when species precision is required, P2 heads improve source performance but reduce generalizability, and current models require domain adaptation before multi-site deployment.


\section{Conclusion}

This study provides a systematic validation of modern deep learning models for automated malaria microscopy and addresses several long-standing gaps in the field. By evaluating detection and segmentation approaches on thick smear images, the work clarifies the practical differences between bounding boxes and pixel-level masks for parasite localization. The analysis of P2 heads within the YOLOv12 architecture shows how early feature maps can improve detection of small ring-stage parasites. The cross-dataset evaluation demonstrates the extent of domain shift and highlights the need for multi-center validation before clinical deployment. The study also shows that computational efficiency remains a key constraint for point-of-care use and that performance on rare species and low-parasitemia slides continues to be a major challenge. Together, these findings form an evidence base that can guide future research and support the development of diagnostic systems that perform reliably across diverse clinical settings.

\clearpage  % Acknowledgements, references, and appendix do not count toward the page limit (if any)

% Acknowledgments---Will not appear in anonymized version
\midlacknowledgments{We thank the Rwanda Biomedical Center for their collaboration and data provision. We acknowledge support from Carnegie Mellon University Africa. We are especially grateful to Professor Charles B. Delahunt for his steady guidance, technical reviews, and constructive feedback throughout this work.}

\bibliography{midl26_42}

\end{document}