\documentclass{midl} % Include author names

% The following packages will be automatically loaded:
% jmlr, amsmath, amssymb, natbib, graphicx, url, algorithm2e
% ifoddpage, relsize and probably more
% make sure they are installed with your latex distribution

\usepackage{mwe} % to get dummy images
\usepackage{booktabs}
\usepackage{algorithmic}

\usepackage{multirow}

\graphicspath{ {figs/} }
\jmlrvolume{-- 127}
\jmlryear{2026}
\jmlrworkshop{Full Paper -- MIDL 2026}
\editors{Accepted for publication at MIDL 2026}

%\title[MILCA]{MILCA: Multiple Instance Learning and Class Activations for Sample-Level Weakly Supervised Malaria Parasite Detection in Blood Film Microscopy}
\title[MILCA]{MILCA: Malaria Parasite Detection from Sample-Level Weak Labels}
 % Use \Name{Author Name} to specify the name.
 % If the surname contains spaces, enclose the surname
 % in braces, e.g. \Name{John {Smith Jones}} similarly
 % if the name has a "von" part, e.g \Name{Jane {de Winter}}.
 % If the first letter in the forenames is a diacritic
 % enclose the diacritic in braces, e.g. \Name{{\'E}louise Smith}

 % Two authors with the same address
 % \midlauthor{\Name{Author Name1} \Email{abc@sample.edu}\and
 %  \Name{Author Name2} \Email{xyz@sample.edu}\\
 %  \addr Address}

 % Three or more authors with the same address:
 % \midlauthor{\Name{Author Name1} \Email{an1@sample.edu}\\
 %  \Name{Author Name2} \Email{an2@sample.edu}\\
 %  \Name{Author Name3} \Email{an3@sample.edu}\\
 %  \addr Address}


% Authors with different addresses:
% \midlauthor{\Name{Author Name1} \Email{abc@sample.edu}\\
% \addr Address 1
% \AND
% \Name{Author Name2} \Email{xyz@sample.edu}\\
% \addr Address 2
% }

%\footnotetext[1]{Contributed equally}

% More complicate cases, e.g. with dual affiliations and joint authorship
\midlauthor{
%\Name{Author Name1\midljointauthortext{Contributed equally}\nametag{$^{1}$}} \orcid{1111-2222-3333-4444} \Email{abc@sample.edu}\\
%\addr $^{1}$ Address 1 \\
%\addr $^{2}$ Address 2 \AND
%\Name{Author Name2\midlotherjointauthor\nametag{$^{1}$}} \Email{xyz@sample.edu}\\
\Name{Petru Manescu\nametag{$^{}$}} \orcid{0000-0003-1829-3676} \Email{p.manescu@ucl.ac.uk}\\
\Name{Delmiro Fernandez-Reyes \nametag{$^{}$}} \Email{delmiro.fernandez-reyes@ucl.ac.uk}\\
\addr Department of Computer Science, University College London \AND
%\Name{Author Name5\midlotherjointauthor\nametag{$^{4}$}} \Email{fgh@bar.com}\\
%\addr $^{4}$ Address 4
}

\begin{document}

\maketitle

\begin{abstract}
Malaria diagnosis requires the inspection of multiple image fields per sample. Training vision models for malaria parasite detection typically requires large numbers of expert-provided bounding boxes, which are costly to obtain and often impractical in real-world deployments.  We introduce MILCA, a weakly supervised object detection framework that learns parasite localization from sample-level diagnostic labels, which are routinely recorded in clinical practice. MILCA combines Multiple Instance Learning (MIL) for sample classification with an iterative Class Activation (CA) Mapping  procedure that yields coarse parasite pseudo-labels, which are further enriched with hard negatives from parasite-free samples. These pseudo-labels enable training a detector without any manual bounding-box supervision. Experiments on multiple microscopy datasets show that MILCA achieves reliable detection and counting performance under fully weak supervision, and that fine-tuning with only a small fraction of expert annotations provides substantial additional gains, outperforming supervised and pseudo-labeling baselines under the same or lower annotation budgets. By converting coarse, sample-level clinical labels into effective object-level supervision, MILCA provides a label-efficient route toward automated malaria parasite detection and a general approach for weakly supervised blood film analysis.
\end{abstract}

\begin{keywords}
Malaria, Weak Supervision, Object Detection, Multiple Instance Learning, Class Activations
\end{keywords}

\section{Introduction}

Malaria remains a major global health challenge, disproportionately affecting children in low- and middle-income countries (LMICs) of the Global South \cite{world2022world}. 
%Fast and reliable diagnostic tools are essential to reduce morbidity, prevent progression to severe disease, and decrease mortality \cite{world2022world}. 
The current diagnostic gold standard is the visual inspection of Giemsa-stained Thick Blood Films (TBFs). Microscopists must identify extremely small, faint parasites among
numerous artefacts such as platelets or staining debris, a demanding and time-consuming process that is difficult to scale in resource-limited regions \cite{world2010basic}. Parasite density estimation further guides prognosis and treatment efficacy \cite{siahaan2018laboratory}. Automated detection systems based on deep learning offer promise for high-throughput TBF analysis \cite{mehanian2017computer,
torres2018automated, yang2019deep, manescu2020expert, manescu2020weakly, chibuta2020real, kassim2021diagnosing}, yet nearly all rely on tens of thousands of manually annotated bounding boxes, and on segmentation-based region proposal techniques, an impractical requirement for widespread deployment.

By contrast, \emph{sample-level} diagnostic labels (malaria-positive vs.\ malaria-negative)
are routinely collected as part of standard clinical workflows and are inexpensive to
acquire. TBF samples consist of collections or \emph{bags} of image fields, naturally suited to a Multiple Instance Learning (MIL) formulation. MIL has been widely used in computational
pathology to learn from slide-level labels \cite{campanella2019clinical, gadermayr2024multiple}
and has seen limited use in hematology \cite{sidhom2021deep, manescu2023detection, 
gao2023childhood}, but has not been explored for parasitic infection detection in TBFs,
where object instances are tiny, numerous, and visually ambiguous.


MIL alone, however, produces only sample-level predictions and does not yield object-level localization, which is essential for parasite density estimation and clinical interpretability. Weakly supervised object detection (WSOD) methods based on Class Activation Maps (CAMs) have been used to derive localization cues from image-level labels \cite{zhang2021weakly, belharbipixelcam, kniesel2025weakly}, but they have not been investigated for high-magnification bright-field TBF microscopy. Moreover, existing WSOD pipelines assume that each labeled image contains at least one target instance, whereas a malaria-positive sample may contain hundreds of fields and only a small fraction with parasites. For example, \cite{kniesel2025weakly} assumes image-level labels for single electron-microscopy images, a modality not used in clinical  workflows. In contrast, MILCA operates on clinically acquired brightfield thick blood films, where each sample consists of up to 100 fields and requires a MIL backbone rather than a single-image classifier to account for the sample-level diagnostic labels routinely produced in practice.

In addition to weakly supervised methods, semi-supervised learning (SemiSL) has been widely investigated to reduce annotation costs in medical imaging. SemiSL frameworks leverage both limited labeled data and large pools of unlabeled images, typically through consistency regularization, pseudo-labeling, or teacher--student schemes \cite{chen2022semi, bai2017semi, madani2018semi, xu2022bayesian, semicellmidl}. However, these methods require an initial supervised detector, a challenge in malaria microscopy, where only a handful of bounding boxes may be available. MILCA circumvents this limitation by constructing the initial detector entirely from weak sample-level supervision before incorporating limited strong labels. Related efforts have explored self-supervised representation learning and domain adaptation for microscopy analysis, aiming to improve robustness across imaging conditions or reduce labeled data requirements \cite{dilawar2025miadapt}.

Our main contributions are:
\begin{enumerate}
   

    \item  \textbf{MIL for malaria diagnosis.} We formulate thick blood film analysis as a Multiple Instance Learning problem operating directly on bags of high-magnification fields, enabling malaria positivity prediction from sample-level labels without candidate extraction, handcrafted preprocessing, or ROI selection.

    \item \textbf{ICAM: a domain-adapted iterative CAM refinement.} Building on WSOD principles but adapted to  small, dense malaria parasites in TBF, ICAM combines confidence-coupled erasure, decayed CAM accumulation, and hard-negative mining to obtain usable pseudo-annotations from MIL predictions.
    

    \item \textbf{A hybrid weak+strong supervision regime with strong generalization and high label efficiency.} We show that detectors initialized purely from MILCA pseudo-annotations and fine-tuned with only a small fraction of manual labels outperform supervised and simple SemiSL baselines under the same---or even lower---annotation budgets. A multi-center evaluation further demonstrates robust performance and improved label efficiency.


\end{enumerate}




\section{Methods}

\begin{figure}[t]
\centering
\includegraphics[width=0.9\textwidth]{milca_overview.png}
\caption{Brief overview of MILCA. (a) Multiple Instance Learning for TBF sample classification. (b) CAM-based generated annotations are used to train an object detector.}
\label{fig:overview}
\end{figure}

MILCA converts sample-level diagnostic labels into object-level supervision through three stages:
(i) multiple instance learning (MIL) for sample classification,
(ii) iterative CAM refinement (ICAM) for pseudo-annotation generation, and
(iii) pseudo-label–driven detector training with optional fine-tuning (Fig.~\ref{fig:overview}).



\subsection{Datasets}

We trained and evaluated MILCA on three thick blood film (TBF) microscopy datasets collected from different geographic and clinical settings (Table~\ref{tab:datasets}). The Ibadan dataset \cite{manescu2020expert} contains 144 malaria-negative and 155 malaria-positive samples, each with approximately 100 high-resolution image fields. The Chittagong-1 dataset \cite{kassim2021diagnosing} comprises 50 negative and 150 positive samples, with 20 image fields per sample. Finally, the Chittagong-2 dataset \cite{yang2019deep} includes 150 positive samples with on average 12 fields per sample and over 80k expert-annotated parasites.  

The Ibadan dataset was acquired using a laboratory-grade digital scientific camera (PCO Edge) mounted on a conventional light microscope, while the Chittagong datasets were acquired using smartphone-based microscopy attachments, resulting in differences in image appearance and color. The Chittagong-2 dataset was acquired for a different study and contains only malaria-positive samples with higher parasite density annotations than Chittagong-1 \cite{yang2019deep} and we used it exclusively for evaluating object-level parasite detection and counting. This dataset was fully held out from all training stages, including MIL training, ICAM pseudo-annotation generation, and detector initialization. All three datasets were imaged under $100\times$ oil-immersion objectives. 




\begin{table}[h!]
\centering
\caption{Summary of datasets used for training and evaluation. *Used only for evaluation of the parasite detection. **Image field size containing TBF is of approximately 2400x2400; the rest is black background}
\label{tab:datasets}
\resizebox{\textwidth}{!}{
\begin{tabular}{lccc}
\toprule
\textbf{Dataset} & Ibadan & Chittagong-1  & Chittagong-2*  
\\
 &  \cite{manescu2020expert} & \cite{kassim2021diagnosing}  &  \cite{yang2019deep} 
\\

\midrule
\textbf{Negative Samples}  & 144 & 50 & 0\\
\textbf{Positive Samples}  & 155 & 150 & 150  \\
\textbf{Image fields per sample}  & 100 & 20 & 12  \\
\textbf{Image field size (pixels)}  & 2560x2160 & 4032x3024** & 4032x3024** \\
\textbf{No. annotated parasites} & 2,986 &25,765&84,961\\
\textbf{Camera} & PCO Edge & Smartphone &Smartphone\\
\textbf{Country} & Nigeria & Bangladesh & Bangladesh\\
\bottomrule
\end{tabular}
}
\end{table}




\subsection{Sample-level predictions with Multi-Instance Learning}

Each TBF sample is treated as a \textit{bag} $B=\{x_1,x_2,\dots,x_n\}$ of image patches (instances) randomly cropped from digitized image fields acquired under a $100\times$, 1.4\,NA objective. According to MIL assumptions: 

% Bag-level label definition

\[
y_B =
\begin{cases} 
1 & \text{if at least one instance $x_i$ in bag $B$ contains malaria parasites} \\
0 & \text{if all instances in bag $B$ are negative.}
\end{cases}
\]

Each tile $x_i$ is passed through a ConvNet encoder \cite{simonyan2014very} $f_\theta(\cdot)$, followed by global average pooling (GAP), producing a feature embedding
\begin{equation}
h_i = \text{GAP}(f_\theta(x_i)), \quad h_i \in \mathbb{R}^d.
\end{equation}

The bag-level representation is obtained via max feature pooling:
\begin{equation}
H(B) = \max_{i=1,\dots,n} h_i.
\end{equation}

A linear classifier $g_\phi(\cdot)$ then predicts the bag label probability:
\begin{equation}
p(y_B=1 \mid B) = \sigma\big(g_\phi(H(B))\big),
\end{equation}
where $\sigma(\cdot)$ is the sigmoid function. For each sample, we construct a bag of 50 tiles. When a sample contains $F$ image fields ($F=20$ for Chittagong-1; $F=100$ for Ibadan), we first extract one random tile from each available field. If $F<50$, the remaining $(50-F)$ tiles are obtained by sampling fields uniformly with replacement, allowing the same field to be selected multiple times and extracting additional random crops from the selected fields.

We adopted max-pooling to aggregate field embeddings into a bag representation since, in many cases, only a small fraction of image fields in a positive sample contain parasites. An ablation comparing max pooling and attention-based MIL aggregation \cite{ilse2018attention} showed comparable overall performance, with max pooling yielding slightly higher recall and attention pooling favouring precision; details are provided in Appendix C. 
 
The MIL encoder is initialized from ImageNet-pretrained VGG-19 weights and trained end-to-end without freezing. Binary cross-entropy loss was used to optimize the model weights. At test time, all available image tiles from each sample were passed through the trained model, and their aggregated bag representation $H(B)$ was used to predict the sample label. The model was fully trained for 100 epochs with an SGD optimizer and a learning rate of 0.0003. The resulting MIL model provided two key signals: (i) sample-level predictions and (ii) per-field confidence scores used to trigger ICAM refinement.


\subsection{Pseudo-annotation generation with Iterative Class Activations}

\begin{figure*}[t]
\centering
\includegraphics[width=0.9\textwidth]{icam.png}
\caption{Iterative Class Activation Maps for Malaria Parasite Annotations Generation (ICAM). }
\label{fig:icam}
\end{figure*} 

While MIL enables sample-level classification, it does not localize individual parasites. To generate pseudo-annotations, for each high-confidence malaria-positive image field ($p(y_B) > 0.9$), we derive Class Activation Maps (CAMs) from the MIL classifier. Given convolutional feature maps $F \in \mathbb{R}^{C \times H \times W}$ and classifier weights $w_{c,k}$ for class $c$, the CAM is computed as
\begin{equation}
\text{CAM}_c(i,j) = \sum_{k=1}^C w_{c,k} \, F_k(i,j).
\end{equation}

CAMs highlight discriminative image regions that contribute to the positive class, but often emphasize only the most prominent parasite or artifact. To address this, we introduce Iterative Class Activation Mapping (ICAM), which progressively reveals multiple faint parasite instances through a sequence of erasure and refinement steps.
At each iteration $s$, the highest CAM region is erased from the image $I^s$, and a new CAM is computed if the classifier score of the modified image remains above a confidence threshold (empirically, $\tau = 0.7$). 
At each iteration, the ICAM map is updated by adding the new CAM with an exponential decay factor ($\delta = 2$), ensuring that earlier discoveries retain more influence:

\begin{equation}
\text{ICAM}(I) = \text{CAM}(I) + \sum_{s=1}^{S} \delta^{-s} \cdot \text{CAM}(I^s).
\end{equation}

To convert ICAM maps into bounding boxes, we normalized each map, applied Otsu thresholding to isolate salient regions, eroded small artifacts with a $5\times5$ kernel, and detected subsequent peaks separated by at least 10 pixels. Around each peak, a bounding box of fixed size ($64\times64$ pixels) was placed. This box dimension reflects the typical parasite size in TBF microscopy~\cite{manescu2020expert}. This procedure produced $\sim$200k pseudo-annotations on image fields from the Ibadan~\cite{manescu2020expert} and Chittagong-1~\cite{kassim2021diagnosing} datasets.

\begin{algorithm2e}[t]
\caption{Iterative Class Activation Mapping (ICAM)}
\label{alg:icam}
\begin{algorithmic}[1]
\REQUIRE Image $I$, trained MIL classifier $f_\theta$, 
confidence threshold $\tau$, decay factor $\delta$, 
maximum iterations $S$
\ENSURE ICAM heatmap

\STATE Initialize $\text{ICAM} \gets 0$, $s \gets 0$
\WHILE{$f_\theta(I) > \tau$ \AND $s < S$}
    \STATE Compute $\text{CAM}_s \gets \text{CAM}(I)$
    \STATE Update $\text{ICAM} \gets \text{ICAM} + \delta^{-s} \cdot \text{CAM}_s$
    \STATE $I \gets \text{erase\_highest\_activation}(I, \text{CAM}_s)$
    \STATE $s \gets s + 1$
\ENDWHILE
\RETURN $\text{ICAM}$
\end{algorithmic}
\end{algorithm2e}



\textbf{Hard negative mining.} Negative samples often contain parasite-like artifacts such as platelets and stain precipitates. We performed hard-negative mining by applying the MIL classifier to fields from negative bags and computing CAMs with the positive-class weights. Although these fields do not contain any parasites, the resulting CAMs identify texture patterns the model mistakenly finds discriminative (Sup. Fig. \ref{supfig:ex_pseudo}). Peaks extracted from these CAMs are converted into bounding boxes using the same procedure as ICAM. This procedure yielded $\sim$60k hard negative annotations. Although illustrated here for malaria microscopy, the hard negative mining strategy addresses visually confusing but non-pathological patterns, and the underlying idea may extend to other medical imaging tasks, with task-specific adaptations.

\subsection{Parasite detector training and pseudo-label refinement}

The set of ICAM-generated parasite and artefact pseudo-annotations was next used to train an initial one-stage RetinaNet object detector with a ResNet-50 backbone \cite{lin2017focal}, denoted $\mathcal{D}_{\text{MILCA-raw}}$. 
We further incorporated a bootstrapped refinement stage. $\mathcal{D}_{\text{MILCA-raw}}$ is applied to all training fields, and its predictions are retained if their confidence exceeds 0.7. A second detector, $\mathcal{D}_{\text{MILCA}}$, was next trained on the refined pseudo-labels. %This can be summarized as follows: \\
%(i) train $\mathcal{D}_{\text{MILCA-raw}}$ on raw pseudo-labels; \\
%(ii) re-annotate the unlabeled dataset with $\mathcal{D}_{\text{MILCA-raw}}$, filtering by confidence ($\tau=0.7$); \\
%(iii) retrain a refined detector $\mathcal{D}_{\text{MILCA}}$ on the improved annotations.

The MILCA detectors were trained with a focal loss \cite{lin2017focal} for 30 epochs using an Adam Optimizer with learning rate of $5\times10^{-4}$. 

 
\subsection{Fine-Tuning with Limited Manual Annotations}

To evaluate annotation efficiency, we further fine-tuned $\mathcal{D}_{\text{MILCA}}$ using varying fractions of the available manually annotated data (5--75\%). For each fraction, we randomly sampled the corresponding subset of annotations three times and report the mean performance across the three runs to reduce sensitivity to the particular subset chosen. Fine-tuning updates only the classification and regression heads and is performed for 10 epochs.

This hybrid regime differs from conventional semi-supervised pipelines, which require manual labels to bootstrap the initial detector. In contrast, MILCA first constructs a detector entirely from weak supervision and incorporates expert annotations only at the final refinement stage. 

We compared MILCA fine-tuning with a semi-supervised learning (SemiSL) baseline based on naive pseudo-labeling previously used in microscopy \cite{semicellmidl}. In this setup, a vanilla object detector was first trained on the same fractions of expert annotations as MILCA. This detector was then applied to the remaining unlabeled training images, and only predictions with a high confidence score (above $0.7$) were retained as pseudo-annotations. A new detector was subsequently trained from scratch using both the original manual annotations and the generated pseudo-labels.



\section{Results}
\label{sec:results}
\begin{figure}[t]
\centering
\includegraphics[width=0.8\linewidth]{sample_level.png}
\caption{Sample-level classification results. (a) ROC curve with mean AUC over 3 folds ($n=99$). LOW: $<$1000 parasites/$\mu$l. Comparison with previous studies is only indicative, as the training or test datasets were not made available. (b) Additional sample-level metrics.}
\label{fig:sample-classification}
\end{figure}

\subsection{Sample-level predictions}

The MILCA classifier accurately distinguished malaria-positive from negative samples across all three datasets.  
In 3-fold cross-validation, it achieved an average AUC of $0.969 \pm 0.013$ (Fig.~\ref{fig:sample-classification}a), with overall accuracy exceeding 90\%.  Performance was lower on samples with medium to low-parasitemia ($<$1000 parasites/$\mu$l), reflecting the inherent difficulty of detecting sparse parasite instances (Fig.~\ref{fig:sample-classification}b). These sample-level predictions are produced without any manual parasite annotations and provide the image field-selection signal used by ICAM.


\subsection{Weakly supervised parasite detection}

Parasite detection performance is evaluated using Average Precision (AP), defined as the area under the precision–recall curve computed over parasite detections at a fixed Intersection-over-Union (IoU) threshold. IoU is defined as the ratio between the area of overlap and the area of union of a predicted bounding box and its corresponding ground-truth box \cite{everingham2010pascal}.
Across the Ibadan dataset (23 test fields, $\sim$700 parasites), the Chittagong-1 dataset (10,598 parasites in the test set), and the secondary detection-only Chittagong-2 dataset (84,961 parasites), a detector trained solely on ICAM pseudo-labels (no manual boxes) achieved mean AP values between 0.1 and 0.3 at IoU\,=\,0.5, demonstrating that sample-level supervision alone provides sufficient signal to bootstrap a malaria parasite detector. As expected given the small parasite size, AP was higher at lower IoU thresholds (Fig.~\ref{fig:qualitative-milca}c). Qualitative examples illustrate that MILCA detects many parasites without supervision, and fine-tuning improves recall and bounding-box accuracy (Fig.~\ref{fig:qualitative-milca}a--b).

\subsection{Label efficiency and hybrid fine-tuning}
We next evaluated parasite detection using an object detector trained on MILCA pseudo-annotations and fine-tuned with different fractions of manually labeled data (5-75\%). We compared against a fully supervised detector trained on all ground-truth annotations (vanilla).  

 
 Fine-tuning with small fractions of manual annotations yielded substantial gains (Fig.~\ref{fig:qualitative-vanilla} and Fig.~\ref{fig:qualitative-milca}). Across Ibadan, Chittagong-1, and the secondary \textit{detection-only} Chittagong-2 dataset, MILCA outperformed both the fully supervised detector and a naive pseudo-labeling SemiSL baseline at low budgets (for more details, see Sup.~Fig.~\ref{supfig:ssl-comparison}). With only 5\% of the labels, MILCA improved AP by 13--34 points over supervised training (Table~\ref{tab:milca_budget_comparison}), reflecting the stronger initialization produced by ICAM pseudo-labels. At 50\% labels, MILCA remained competitive or superior across all datasets (Table~\ref{tab:milca_budget_comparison}). Evaluated on Chittagong-2---unseen during training---MILCA generalized better than the supervised or naive SemiSL detectors. This suggests that training on large numbers of weak, diverse pseudo-labels provides robustness to staining variability and acquisition differences, which are common in routine malaria microscopy. ICAM-derived pseudo-labels are coarse, as they are generated solely from sample-level supervision, and are not expected to yield precise bounding boxes in isolation. Nevertheless, they provide a dense and task-aligned initialization that enables the detector to reliably identify parasite regions. Consequently, performance under fully weak supervision is limited more by localization accuracy than by parasite discovery, and even a small amount of expert fine-tuning substantially improves bounding-box regression, leading to large gains at stricter IoU thresholds.




\begin{figure}[t]
\centering
\includegraphics[width=0.9\linewidth]{vanilla_milca_sec.png}
\caption{Example detections using the Vanilla  and the MILCA and fine-tuned parasite detectors on (a) Ibadan, (b) Chittagong-1 and (c) Chittagong-2  test images. (d) Parasite detection performance. Mean AP at IoU=0.5 for different fractions of labeled data on Chittagong-1, Ibadan, and Chittagong-2 datasets. Experiments repeated 3 times with random fractions.}
\label{fig:qualitative-vanilla}
\end{figure}

\begin{figure}[t]
\centering
\includegraphics[width=0.9\linewidth]{figs/milca_ap_iou_detection_sec.png}
\caption{Example detections using the MILCA object detector (red: no fine-tuning; blue: with 5\% fine-tuning) on (a) Ibadan and (b) Chittagong-1 test images. (c) Parasite detection performance: AP as a function of IoU threshold.}
\label{fig:qualitative-milca}
\end{figure}



\begin{table}[t]
\centering
\caption{Detection performance (Parasite AP50\%, mean with std in parentheses) for Vanilla Supervised, SemiSL \cite{semicellmidl}, and MILCA across three datasets under 5\% and 50\% annotation budgets. Best performance for each dataset and budget is highlighted in bold.}
\label{tab:milca_budget_comparison}
\begin{tabular}{llccc}
\toprule
Annotation & Method              & Ibadan        & Chittagong-1        & Chittagong-2        \\
Budget &               & (Internal)        & (Internal)        & (Secondary det-only)        \\
\midrule
\multirow{3}{*}{5\%}
       & Vanilla Supervised  & 0.15 (0.05)      & 0.53 (0.03)      & 0.38 (0.03)      \\
       & SemiSL                 & 0.31 (0.05)      & 0.56 (0.03)      & 0.33 (0.02)      \\
       & MILCA           & \textbf{0.49 (0.04)} & \textbf{0.66 (0.01)} & \textbf{0.57 (0.04)} \\
\midrule
\multirow{3}{*}{50\%}
       & Vanilla Supervised  & 0.60 (0.02)      & 0.76 (0.00)      & 0.56 (0.01)      \\
       & SemiSL                 & 0.64 (0.01)      & 0.75 (0.01)      & 0.51 (0.01)      \\
       & MILCA           & \textbf{0.69 (0.00)} & \textbf{0.78 (0.01)} & \textbf{0.60 (0.03)} \\
\bottomrule
\end{tabular}
\end{table}








\subsection{Parasite count estimation}
MILCA produced accurate parasite counts on internal test sets, achieving $R^2=0.80$ on Chittagong-1 (Fig.~\ref{fig:parasite-count-mahidol}).  
Performance decreased on the secondary \textit{detection-only} Chittagong-2 dataset ($R^2=0.14$), consistent with domain shift effects, yet MILCA remained more accurate than the vanilla parasite detector under all annotation budgets.  
All models tended to underestimate counts at very high parasitemia levels, where overlapping parasites introduce ambiguity.  
Overall, MILCA demonstrates that meaningful object-level supervision can be derived entirely from sample-level diagnostic labels, enabling strong detection and counting performance with minimal expert annotation.

\begin{figure*}[t]
\centering
\includegraphics[width=\linewidth]{pc_both_sec.png}
\caption{Parasite count evaluation on (a) Chittagong-1 (internal) and (b) Chittagong-2 (secondary detection-only). Scatter plots of predicted vs.\ true counts for MILCA with and without fine-tuning. MILCA outperforms vanilla detectors under the same annotation budget.}
\label{fig:parasite-count-mahidol}
\end{figure*}



\section{Discussion}

We introduced MILCA, a weakly supervised framework for malaria parasite detection in thick blood films that relies solely on sample-level diagnostic labels. By combining MIL image field selection, ICAM refinement, and hard-negative mining, MILCA generates dense pseudo-annotations that enable training a practical detector without any bounding-box labels. Across multiple datasets, MILCA achieves competitive detection and counting performance under fully weak supervision, and substantially outperforms supervised and SemiSL baselines when annotation budgets are low. A key finding is that weak supervision \emph{before} strong supervision is highly effective:
detectors initialized from ICAM pseudo-labels require far fewer manual annotations to reach high accuracy compared to conventional semi-supervised pipelines.   This offers a scalable path for developing diagnostic models in settings where expert time is limited and annotation costs are prohibitive.

From a deployment perspective, MILCA substantially lowers the barrier to developing parasite detection and counting systems by eliminating the need for large-scale parasite-level annotations. The framework relies only on sample-level diagnostic labels (positive/negative), which are already generated as part of routine clinical workflows, and can therefore be applied retrospectively to existing datasets without additional annotation burden.

\paragraph{Future work.}
A limitation of this study is that it does not report strict site-held-out (Ibadan vs Chittagong) evaluation for parasite detection; assessing cross-site generalization under weak supervision remains an important direction for future work. Future work will also explore more modern tile encoders and MIL aggregation mechanisms, which may further improve sample-level discrimination and the quality of the pseudo-annotations. We will equally investigate integrating MILCA with more advanced semi-supervised detection frameworks, using MILCA-derived pseudo-labels to bootstrap or guide teacher–student training under limited annotation budgets. Enhancements may also include incorporating uncertainty-aware pseudo-label filtering, joint MIL--detector training and contrastive pretraining to mitigate domain shift. MILCA could also be extended to other pathology tasks where annotations are scarce but sample-level labels are readily available.





\clearpage  % Acknowledgements, references, and appendix do not count toward the page limit (if any)
% Acknowledgments---Will not appear in anonymized version
%\midlacknowledgments{We thank a bunch of people.}


\bibliography{midl26_127}

\clearpage
\appendix
\renewcommand{\figurename}{Sup. Fig.}
\renewcommand{\tablename}{Sup. Table}
\setcounter{figure}{0}
\setcounter{table}{0}


\section{Probability of Missing Parasites Under Stochastic Tile Sampling}

We analyze the probability that a malaria-positive sample yields no parasite-containing tiles under the stochastic bag construction used for Multiple Instance Learning (MIL) training. This analysis is performed on the Chittagong datasets, for which dense parasite bounding-box annotations are available. An equivalent analysis for Ibadan is not possible due to the absence of field-level annotations.

\subsection*{Bag Construction}

Each sample is represented as a bag of 50 image tiles. For a sample containing $F$ image fields ($F=20$ for Chittagong), one random $224\times224$ crop is first extracted from each field. The remaining $m = 50 - F$ tiles are obtained by sampling fields uniformly with replacement and extracting additional random crops from the selected fields.

\subsection*{Analytical Miss-Rate}

Let $r_j$ denote the probability that a single random crop from field $j$ contains at least one parasite, and let $a_j = 1 - r_j$. Under the above sampling scheme, the probability that none of the 50 tiles in a bag contain a parasite is given by:
\begin{equation}
\label{eq:appendix_p_miss}
P_{\mathrm{miss}} =
\left(\prod_{j=1}^{F} a_j \right)
\left( \frac{1}{F} \sum_{j=1}^{F} a_j \right)^{m}.
\end{equation}

When only the number of parasites $N_j$ in field $j$ is available, $r_j$ can be approximated assuming a uniform spatial distribution of parasites within the field:
\begin{equation}
r_j \approx 1 - \left(1 - \frac{A_{\mathrm{crop}}}{A_{\mathrm{field}}}\right)^{N_j}
\;\approx\;
1 - \exp\!\left(-N_j \frac{A_{\mathrm{crop}}}{A_{\mathrm{field}}}\right),
\end{equation}
where $A_{\mathrm{crop}}$ and $A_{\mathrm{field}}$ denote the areas of a crop and an image field, respectively.

\subsection*{Monte Carlo Estimation}

To obtain an empirical estimate of $r_j$, we perform a Monte Carlo simulation using the available bounding-box annotations. For each field, we randomly sample crop locations uniformly over the valid image area and record the fraction of crops that intersect at least one parasite bounding box. This yields an empirical estimate $\hat{r}_j$ for each field, which is substituted into Eq.~\eqref{eq:appendix_p_miss} to compute $P_{\mathrm{miss}}$ per sample.

\subsection*{Analysis}


Using the available Chittagong annotations, we quantified the probability that a malaria-positive sample yields no parasite-containing tiles under our sampling strategy. With a bag size of 50, the median miss-rate is approximately 1--2\% (Sup.\ Table~\ref{tab:miss_rate_bag_size}), with close agreement between empirical and analytical estimates, supporting the robustness of our tile sampling choice.

\begin{table}[t]
\centering
\caption{Probability of missing all parasite-containing tiles ($P_{\mathrm{miss}}$) for different bag sizes on the Chittagong dataset. Results are summarized across positive samples using both Monte Carlo (MC) estimation based on bounding-box annotations and an analytical approximation based on parasite counts per field. P95 denotes the 95th percentile across samples, i.e., a worst-case estimate for the majority (95\%) of samples.}
\label{tab:miss_rate_bag_size}
\begin{tabular}{c|ccc|ccc}
\hline
\multirow{2}{*}{Bag size} 
& \multicolumn{3}{c|}{Monte Carlo $P_{\mathrm{miss}}$} 
& \multicolumn{3}{c}{Analytical $P_{\mathrm{miss}}$} \\
\cline{2-7}
& Mean & Median & P95
& Mean & Median & P95 \\
\hline
20 & 0.2381 & 0.1822 & 0.6185
   & 0.2380 & 0.1729 & 0.6390 \\
30 & 0.1490 & 0.0783 & 0.4872
   & 0.1545 & 0.0724 & 0.5113 \\
40 & 0.0991 & 0.0336 & 0.3838
   & 0.1065 & 0.0303 & 0.4093 \\
50 & 0.0676 & 0.0145 & 0.2936
   & 0.0752 & 0.0127 & 0.3143 \\
\hline
\end{tabular}
\end{table}


While an equivalent estimate is not possible for Ibadan due to missing field-level boxes or parasite counts, the Chittagong analysis provides a quantitative reference for the effect of bag size on miss probability.


\clearpage
%\appendix




\section{Supplementary Figures}





\begin{figure}[h!]
\centering
\includegraphics[width=\linewidth]{ex_pseudo.png}
\caption{ (a) Malaria parasite partial annotations generated with CAM. (b) Malaria annotations augmented with ICAM.
(c) Hard negative annotations generated by CAM.}
\label{supfig:ex_pseudo}
\end{figure}



\begin{figure}[h]
\centering
\includegraphics[width=\linewidth]{figs/milca_raw_vs_refined_sec.png}
\caption{Parasite detection performance. Mean AP at IoU=0.5 for different fractions of labeled data on Chittagong-1,
Ibadan, and Chittagong-2 datasets. Experiments repeated 3 times with random fractions. Milca-refined refers to the
bootstrapped pseudo-label retrained model.}
\label{supfig:bootstrap-comparison}
\end{figure}

\begin{figure}
\centering
\includegraphics[width=\linewidth]{ap_milca_ssl.png}
\caption{Comparison with a naive pseudo-labeling SemiSL baseline. MILCA fine-tuning achieves higher AP across all fractions of labeled data.}
\label{supfig:ssl-comparison}
\end{figure}



\begin{figure}
\centering
\includegraphics[width=0.8\linewidth]{detection_milca_ssl.png}
\caption{Example detections using the MILCA fine-tuned and the semi-supervised object detectors on (a) Ibadan and (b) Chittagong-1 test images.}
\label{supfig:qualitative-ssl}
\end{figure}
%This is a boring technical proof of
%\begin{equation}\label{eq:example}
%\cos^2\theta + \sin^2\theta \equiv 1.
%\end{equation}

%\section{Proof of Theorem 2}

%This is a complete version of a proof sketched in the main text.

\clearpage
\section{MIL Aggregation Ablation}

We compare max pooling with an attention-based MIL aggregation mechanism~\citep{ilse2018attention} for sample-level malaria classification. Both models use the same encoder, training protocol, and data splits. Results are reported as mean $\pm$ standard deviation across runs.

\begin{table}[h]
\centering
\caption{Comparison of MIL aggregation strategies.}
\label{tab:mil_aggregation_ablation}
\begin{tabular}{lcccc}
\hline
Aggregation & Accuracy & Precision & Recall & AUROC \\
\hline
Max pooling 
& $0.91 \pm 0.02$
& $0.90 \pm 0.02$
& $\mathbf{0.94 \pm 0.02}$
& $0.97 \pm 0.01$ \\
Attention pooling
& $0.89 \pm 0.01$
& $\mathbf{0.94 \pm 0.04}$
& $0.84 \pm 0.02$
& $0.95 \pm 0.01$ \\
\hline
\end{tabular}
\end{table}

Overall performance is comparable across aggregation strategies. Max pooling yields higher recall, consistent with the presence of sparse positive instances, while attention pooling favors precision. Given the focus of this work on label-efficient weak-to-strong supervision rather than MIL architecture optimization, we used max pooling in all subsequent experiments.


\end{document}


