\documentclass{midl} % Include author names

% The following packages will be automatically loaded:
% jmlr, amsmath, amssymb, natbib, graphicx, url, algorithm2e
% ifoddpage, relsize and probably more
% make sure they are installed with your latex distribution

\usepackage{mwe} % to get dummy images
\usepackage{xcolor}
\usepackage{graphicx}
\usepackage{subcaption}
%\usepackage{geometry}
\usepackage{float}
\usepackage{multicol} % pour eq multicol
\usepackage{multirow}

\usepackage{enumitem}

\setlength{\belowcaptionskip}{-14pt}
\setlength{\abovecaptionskip}{4pt}

\expandafter\def\expandafter\normalsize\expandafter{%
    \normalsize%
    \setlength\abovedisplayskip{4pt}%
    \setlength\belowdisplayskip{4pt}%
    \setlength\abovedisplayshortskip{-4pt}%
    \setlength\belowdisplayshortskip{4pt}%
}

\usepackage{titlesec}
\titlespacing*{\section}{0pt}{0.7\baselineskip}{0.1\baselineskip}
\titlespacing*{\subsection}{0pt}{0.2\baselineskip}{0.1\baselineskip}

\jmlryear{2024}
\jmlrworkshop{Full Paper -- MIDL 2024}
\jmlrvolume{-- 263}
\editors{Accepted for publication at MIDL 2024}

\title[Weakly supervised prostate cancer detection and generalization on unseen domains]{Weakly supervised deep learning model with size constraint for prostate cancer detection in multiparametric MRI and generalization to unseen domains}

% Use \Name{Author Name} to specify the name.
 % If the surname contains spaces, enclose the surname
 % in braces, e.g. \Name{John {Smith Jones}} similarly
 % if the name has a "von" part, e.g \Name{Jane {de Winter}}.
 % If the first letter in the forenames is a diacritic
 % enclose the diacritic in braces, e.g. \Name{{\'E}louise Smith}

 % Two authors with the same address
 % \midlauthor{\Name{Author Name1} \Email{abc@sample.edu}\and
 %  \Name{Author Name2} \Email{xyz@sample.edu}\\
 %  \addr Address}

 % Three or more authors with the same address:
 % \midlauthor{\Name{Author Name1} \Email{an1@sample.edu}\\
 %  \Name{Author Name2} \Email{an2@sample.edu}\\
 %  \Name{Author Name3} \Email{an3@sample.edu}\\
 %  \addr Address}


% Authors with different addresses:
% \midlauthor{\Name{Author Name1} \Email{abc@sample.edu}\\
% \addr Address 1
% \AND
% \Name{Author Name2} \Email{xyz@sample.edu}\\
% \addr Address 2
% }

%\footnotetext[1]{Contributed equally}

% More complicate cases, e.g. with dual affiliations and joint authorship
\midlauthor{\Name{Robin Trombetta\nametag{$^{1}$}} \Email{robin.trombetta@creatis.insa-lyon.fr}\\
\Name{Olivier Rouvière\nametag{$^{2}$}} \Email{olivier.rouviere@chu-lyon.fr}\\
\Name{Carole Lartizien\nametag{$^{1}$}} \Email{carole.lartizien@creatis.insa-lyon.fr}\\
\addr $^{1}$ Univ. Lyon, CNRS UMR 5220, Inserm U1294, INSA Lyon, UCBL, CREATIS, France\\
\addr $^{2}$ Hospices Civils de Lyon, Radiology Department, Edouard Herriot Hospital, Lyon, France
}

\begin{document}

\maketitle

%% VERSION SOUMISE LE 31 JANVIER
% \begin{abstract} 
% Fully supervised deep models have shown promising performance for many medical segmentation tasks. 
% Still, the deployment of these tools in clinics is limited by the very time-consuming collection of manually expert-annotated data.
% Moreover, most of the state-of-the-art models have been trained and validated on homogeneous (eg from  a unique clinical center or scanner type) or moderately heterogeneous (one or more clinical centers with scanners of different vendors) datasets. It is known that deep learning methods are often greatly degraded by domain or label shifts and are yet to be built in such a way as to be robust to unseen data or label distributions. In the clinical setting, this problematic is particularly relevant as the deployment institution may have different scanners or acquisition protocols than those from which the data has been collected to train the model. 
% In this work, we propose to address these two challenges on the detection of clinically significant prostate cancer (csPCa) from bi-parametric MRI. First, we propose a weakly supervised method based on scribble annotations that builds on the size constraint loss proposed by \cite{kervadec} to produce fine semantic cancer lesions segmentations. 
% Secondly, we evaluate performance of our proposed weakly supervised model and fully supervised state-of-the art baselines both on in-distribution test cases and unseen cases from other institutions.
% All models are built on 2D and 3D U-Nets architectures that have shown to be strong supervised baselines on the PI-CAI challenge leaderboard.
% These models are trained and tested in a 5-fold cross-validation fashion on the 1295 manually annotated MRI images of the PI-CAI public training dataset. Generalization performances are assessed on the Prostate158 challenge dataset and on a private dataset containing 219 patients, and compared to fully supervised models trained on each of these databases. 
% The main results of this study are as follows :  

% First, we show that our model achieves results on-par performance with fully supervised baseline models, both on in-distribution validation data and unseen test images. Our implementation of 2D and 3D fully supervised U-Net are also shown to achieve performance in par with those reported on the Pi-CAI challenge leaderboard.
% Second, we observe a performance decrease for both fully supervised and weakly supervised models in the generalization configuration (models trained on the PI-CAI dataset and evaluated on the Prostate158 and private dataset). This confirms the crucial need for efficient domain adaptation methods if deep learning models are aimed to be deployed in a clinical environment. 
% Finally, we show that ensemble predictions from multiple trainings increase generalization performance.
% \end{abstract}

%%% SHORTER VERSION FOR THE FULL PAPER
\begin{abstract} 
Fully supervised deep models have shown promising performance for many medical segmentation tasks. Still, the deployment of these tools in clinics is limited by the very time-consuming collection of manually expert-annotated data. Moreover, most of the state-of-the-art models have been trained and validated on moderately homogeneous datasets. It is known that deep learning methods are often greatly degraded by domain or label shifts and are yet to be built in such a way as to be robust to unseen data or label distributions. In the clinical setting, this problem is particularly relevant as the deployment institutions may have different scanners or acquisition protocols than those from which the data has been collected to train the model. 
In this work, we propose to address these two challenges on the detection of clinically significant prostate cancer (csPCa) from bi-parametric MRI. 
%We train a weakly supervised method based on scribble annotations that builds on the size constraint loss proposed by \cite{kervadec} to produce fine semantic cancer lesions segmentations. 
We evaluate the method proposed by \cite{kervadec}, which introduces a size constraint loss to produce fine semantic cancer lesion segmentations from weak circle scribble annotations.
The model is evaluated on two public databases (PI-CAI and Prostate158) and one private database. First, we show that the model achieves on-par performance with strong fully supervised baseline models, both on in-distribution validation data and unseen test images. Second, we observe a performance decrease for both fully supervised and weakly supervised models when tested on unseen data domains. This confirms the crucial need for efficient domain adaptation methods if deep learning models are aimed to be deployed in a clinical environment. Finally, we show that ensemble predictions from multiple trainings increase generalization performance.
\end{abstract}



%for 2D and 3D U-Nets that reach performances comparable to supervised baselines reported on the PI-CAI challenge leaderboard, we observe a performance decrease for both fully supervised and weakly supervised models in the generalization configuration (models trained on the PI-CAI dataset and evaluated on the Prostate158 and private dataset). This confirms the crucial need for efficient domain adaptation methods if deep learning models are aimed to be deployed in a clinical environment. 
%Second, we show that our proposed weakly supervised models has better generalization capability that fully supervised models. 
%Second, we show that ensemble predictions from multiple trainings increase generalization perfromance and finally, we show that our model achieves results on-par performance with fully supervised baseline models, both on in-distribution validation data and unseen test images.//

%% Very first version
% \begin{abstract}
% As for many medical images applications, the development of deel learning models to detect prostate cancer lesions from MRI has recently shown promising results. Still, it is limited by the very time-consuming collection of manually expert-annotated data.
% Morevover, it is known that deep learning methods are often greatly degraded by domain or label shifts and are yet to be built in such a way as to be robust to unseen data or label distributions. In the clinical setting, this problematic is particularly relevent as the deployment institution may have different scanners or acquisition protocols than those from which the data has been collected to train the model. 
% In this work, propose to address these two challenges on the task of detection clinically significant prostate cancer from bi-parametric MRI. First, we propose a weakly supervised method, based on the size constraint loss proposed by \cite{kervadec}, capable to segment prostate cancer lesions from weak scribble annotations, using only 14\% of lesion annotations. Secondly, we evaluate the performances of our model and the fully supervised baslines both on in-distribution cases and unseen cases from other institutions.
% The models are trained and tested in a 5-fold cross-validation fashion on the 1295 manually annotated MRI images of the PI-CAI public training dataset. The generalization performances are assessed on the challenge dataset Prostate158 and on a private dataset containing 219 patients, and compared to fully supervised models trained on each of these databases. We show that the domain is and should be focused on if deep learning models aimed to be deployed in a clinical environment. We quantify to what extent ensemble predictions from multiple trainings help for generalization and also show that our model achieve results that are on-par with fully supervised baseline models, both on in-distribution validation data and unseen test images.
% \end{abstract}

\begin{keywords}
Prostate cancer detection, Weakly supervised learning, Domain generalization, Multiparametric MRI, Deep learning
\end{keywords}

\section{Introduction}
Over the last years, deep learning models have become state-of-the-art methods in almost all medical imaging applications, including segmentation and detection. Among data-oriented methods, fully supervised models remain the most common and best performing ones. However, gathering numerous expert-annotated data to train such models is a very time- and resource-consuming process, restraining the current use of such models in the medical field. For this reason, other promising paradigms have also been explored such as semi-, weakly- or unsupervised learning \cite{bosma_semisupervised, baur_unsupervised}. They aim to mitigate the need for annotated data to train deep learning models.

Another known drawback of deep learning methods is the limited generalization capacity to unknown data distribution. It has been shown that when tested on out-of-distribution data, deep learning models can significantly underperform compared to in-distribution evaluation \cite{BOONE2023}. Yet, the robustness of models to unseen domains is an absolutely necessary condition for their use in clinical settings given the inherent heterogeneity among scanners and acquisition protocols between and within clinical institutions.
%Yet, the robustness of models to unseen domains is an absolute necessary condition with the view to make use of deep learning in a clinical setup. The inherent heterogeneity between scanners and acquisition protocols between institutions therefore appears to be a great limit for deep learning models. 

In this work, we propose to tackle these two problems in the challenging task of detecting and localizing clinically significant (ISUP grade group $\ge 2$) prostate cancer (csPCa) lesions in multi-parametric MRI. This is a task of primary clinical interest, as shown by the recent success in the urology community of the PI-CAI (The Prostate Imaging: Cancer AI)\footnote{The PI-CAI grand challenge : \url{https://pi-cai.grand-challenge.org/PI-CAI/}} challenge. Many recent works have tried to improve the automatisation of cancerous prostate lesions detection \cite{Bhattacharya_review22}. 
Most proposed deep learning strategies focus on supervised models, with architectures such as nnUNet ranging among the top performing on the PI-CAI challenge dataset. A few recent works have proposed self- or weakly supervised approaches \cite{tardi2021, bateson2021}, leveraging bounding boxes \cite{nndetection}, scribbles or patient-level annotations \cite{el_jurdi2021,Yang_ISBI21}, partially lowering the gap with supervised approaches. 
%Some of them have proposed weakly supervised approaches that leverage only bouding boxes, scribbles or patient-level annotations \textcolor{red}{REF}, partially lowering the gap with supervised approaches.

Our contributions in this work are fourfold:
\vspace{-4mm}
\begin{itemize}
\itemsep-1mm 
    %\item We propose a weakly supervised model, trained only on circle scribbles, that achieve performances close to strong fully supervised baselines.
    \item We evaluate the method proposed by \cite{kervadec} for the challenging task of segmenting csPCa lesions in multiparametric MRI and achieve performances close to strong fully supervised baselines using only circle scribbles and image-level priors.
    \item We evaluate how the scribble annotation process impacts the performance of the weakly supervised model and show that the model is robust to various weak annotation strategies.
    %\item We carry out a study on how the process of obtaining weak annotations is modeled to account the robustness of the weakly supervised model.} 
    \item We evaluate the models both on in-distribution validation data and unseen test images to evaluate the drop in performance in the generalization configuration, and show that our weakly supervised model is less prone to such effect.
    \item We quantify to what extent ensemble predictions from multiple trainings improve generalization of deep learning models.
\end{itemize}

\section{Material and Method}
\subsection{A weak segmentation model based on object size constraint loss function}

In \cite{kervadec}, the authors proposed a loss function for partially annotated data that aims to impose a size constraint on the predicted segmentations of a model. The partial cross-entropy $\mathcal{H}$, computed only on the annotated pixels $\Omega_a$, is combined with a constraint loss $\mathcal{C}$ that adds a quadratic penalty to the model on the total sum of its predictions for class $c$ if it is outside a defined range $[a,b]$. More specifically, let $V_c = \sum_{p \in \Omega} S_{p,c}$ be the sum of the probabilities $S_{p,c}$ for class $c$ of every pixel $p$ in the image domain $\Omega$. The constraint loss is given by :

\begin{equation}
\mathcal{C}(V_c) = 
\begin{cases}
(V_c - a)^2 & \text{if } V_c < a \\
(V_c - b)^2 & \text{if } V_c > b \\
0 & \text{otherwise}
\end{cases}
\end{equation}

The total cost function is then defined as 
\begin{equation}
  \mathcal{H}(S)+ \lambda \mathcal{C}(V_S)
  \label{eq: weak_loss}
\end{equation}
where $\lambda$ is a positive constant weighting the two terms, $V_S=\sum_{p \in \Omega}S_p$ with $S_p$ the softmax probability at pixel $p$ in the image domain $\Omega$.


The size constraint loss term was initially used in a binary segmentation problem to improve prostate gland segmentation in multiparametric partially labeled MRI in \cite{kervadec}.
\cite{Duran_2022} extended the binary formulation to a multi-class output with $C$ classes and evaluated it on a prostate cancer detection task. %accounting for class imbalance, through a global loss term $\mathcal{L}$ that is obtained by a weighted sum of the loss term applied independently on each class. % $c$ as follows : 
% \begin{equation}
%     \mathcal{L} = \sum_{c=1}^C w_c(\mathcal{H}_{c} + \lambda \mathcal{C}(V_{c}))
% \end{equation}
%where $w_c$ designate the weight attributed to class $c$ to push the model towards better segmenting certain classes and $\lambda$ is the weight given to the size constraint loss.
They used this constraint at the image level with \textit{image tag priors}, following the definition in Kervadec et al. \cite{kervadec}, that is enforcing the presence of the target class by setting $a=1$ and $b=\vert\Omega\vert$ (the image domain) or the absence of the target with parameters $a=b=0$. This implementation achieved promising performance for segmentation and grading of PCa lesions in a weakly supervised setting on the Prostatex-2 challenge and a private dataset.

We extend the work of \cite{Duran_2022} on prostate cancer detection by leveraging the constraint term referred to as \textit{common bounds} introduced by \cite{kervadec}, whose principle is to introduce more precise lower and upper bounds $a$ and $b$ depending on the size of the lesions in the ground truth. These bounds are a way to introduce prior knowledge on the objects to detect to compensate for the partially labeled data. We implement this method by imposing a common bounds constraint both on the prostate class and the CS lesion class.
%We extend this work by setting more precise lower and upper bounds $a$ and $b$ depending of the size of the lesions in the ground truth. This constraint term is refereed to as \textit{common bounds constraint}. These bounds are a way to introduce prior knowledge on the objects to detect to compensate for the partially labeled data. It is meaningful to add such a loss in a case where the items that are aimed to be segmented can be characterized %in advanced 
%and have a rather smooth size distribution among the samples (see Appendix \ref{app:lesions_charac}). 

\subsection{Data description}

The experiments are conducted on three datasets, described hereunder :
\vspace{-2mm}
\begin{itemize}
\itemsep-1mm 
    \item PI-CAI challenge public training dataset. It contains 1500 multi-parametric MRI (T2w, DWI and ADC) exams from 3 Dutch centers acquired on 7 different scanners, 5 from Siemens Healthineers and 2 from Philips Medical Systems. It includes 328 cases from the Prostate-X challenge \cite{prostateX}. Of all the exams available, we only use the 1295 that are manually annotated by expert clinicians, and do not leverage the 205 exams with AI-derived lesion segmentations. 
    \item The Prostate158 \cite{prostate158} train and validation datasets. It consists of 139 annotated biparametric MRI (T2w, DWI) acquired at a German university hospital on 3T Magnetom Vida and Skyra scanners from Siemens Healthineers.
    \item A private dataset, containing 219 multi-parametric MRI (T2w, DWI and ADC) exams acquired in clinical practice in two French hospitals on three different scanners : 26 exams were carried out on a 3T Ingenia scanner (Philips Medical Systems), 67 on a 1.5T Symphony scanner (Siemens Healthineers) and 126 on a 3T Discovery scanner (GE Healthcare). It was declared to the appropriate national administrative authorities (CPP L 09-04 and CNIL 08-06) and patients gave written informed consent for researchers to use their MR imaging data. All patients underwent a radical prostatectomy and prostate focal lesions manually outlined by expert radiologists on the different imaging sequences were validated against the prostatectomy gold standard ground truth.
\end{itemize}

Both T2-weighted (T2w) and apparent diffusion coefficient (ADC) MR maps were used as input channels. The latter modality was registered to the former, all images were resampled to a $1 \times 1 \times 3$ mm\textsuperscript{3} voxel size and cropped to $96 \times 96 \times 20$ volumes. Image intensities were linearly normalized into the range [0, 1] for each patient and each modality.
More details about these datasets can be found in Appendices \ref{app:lesions_charac} and \ref{app:modalities_charac}, including lesion volume distributions and histograms of intensities for T2-weighted imaging and ADC maps.


\subsection{Weak annotations}
\label{subsec:weak_annotations}
The aim of weak annotations is to mimic what could be an easier and faster way for clinicians to provide annotations on real images. For this purpose, we replace full segmentations by circles of maximum radius of 3 mm inside each individual lesion. The centers of the circles are drawn randomly and independently on each axial slice. If the lesion is too small to fit a circle of this size, the radius is reduced until a circle can fit inside the lesion. The prostate gland is also annotated in such a way, with only one circle per slice.
In total, weak annotations only represent 14\% of the full masks of CS lesions, considerably reducing the amount and complexity of annotations and thus the time needed for experts to make these annotations. Illustrative circle annotations are depicted in Figure \ref{fig:visual_results}. Appendix \ref{app:ablation_labels} evaluates how the best weakly supervised model performs when other annotation strategies are adopted.

\subsection{Experiments}
% Il faut rajouter une petite section dans laquelle tu décris toutes les expé que tu as réalisées. \\
% In this study, we perform different experiments:
% We first evaluate performance of our proposed scribble based weak model in the \textit{in-distribution} setup, meaning with train and test data drawn from the same distribution, here the PI-CAI database. We consider two models, one trained with partial cross-entropy (CE) and the image-tag (IT) or and one with partial CE image-tag (IT) and common bounds (CB) constraint loss terms.
% We compare it to fully supervised models trained with CE and generalized DICE loss on the full available lesions annotations.
% We use 2D and 3D MONAI's DynUNet \cite{monai} as backbone architectures for the proposed weak and fully supervised models.
% We then evaluate the generalization performance of both fully and our proposed weakly supervised model by training these models on one dataset, PI-CAI and testing it on data from two unseen domains, namely Prostate158 and our private database.
We compare several weakly supervised methods and fully supervised baseline models.
%following this training and evaluation setup. 
For our proposed scribble-based weak model, we consider two main configurations : one with partial cross-entropy (CE) and the image tag (IT) and one with partial CE, image tag and common bounds (CB) constraint loss terms. We compare them to a simpler weak model with partial cross-entropy and negative cross-entropy (denoted Partial CE) as well as to fully supervised baselines trained with cross-entropy and generalized DICE loss on the full available annotations. We use 2D and 3D MONAI's DynUNet \cite{monai} as backbone architectures for the proposed weak and fully supervised models. As for comparison to other weakly supervised models, we train nnDetection \cite{nndetection} with ground truth segmentation masks being 3D rectangular cuboids framing full lesion annotations (\textit{nnDetection full}) or weak scribble annotations (\textit{nnDetection weak}). Note that the comparison between these models and the ones with size constraints is not straightforward as they do not use the same kind of weak annotations.

All models are trained in 5-fold cross-validation on the PI-CAI dataset. They are first evaluated in the \textit{in-distribution} setup, meaning we report the mean performance on the 5 validation folds of the PI-CAI dataset. Then, we appraise the models in the generalization setup by testing them on data from two unseen domains, namely Prostate158 and our private database. Moreover, for each method, we combine the best models of each training fold into a single ensemble model, for which the lesion probability maps are computed as the average of the probability maps of the 5 aggregated models. These ensemble models are only tested on the two unseen data domains.



\subsection{Evaluation metrics}

% All models are trained in a 5-fold cross-validation fashion on PI-CAI dataset. For each metric, the score on in-domain data is computed as the mean value on the 5 validation folds.\\
% %on the validation set of all the trainings. 
% %To evaluate generalization performance, we evaluate the 5 models (one model per cross-validation fold) trained on PI-CAI on Prostate158 and on our private dataset and report the mean performance over these 5 models. 
% To evaluate generalization performance, we infer lesion probability maps for all images of Prostate158 and on our private datasets from the 5 models (one model per cross-validation fold) trained on PI-CAI and report the mean performance over these 5 models. 
% %Moreover, these 5 models %obtained of each training fold 
% are combined to get one ensemble model for each method, whose lesion probability map is achieved by averaging the lesion probability maps outputed by each of the 5 models. %The predictions probability maps are first averaged and then  used to get the predicted lesions.
% Moreover, we average the lesion probability maps outputted by each of the 5 models to get one ensemble model for each method.


The models are evaluated both at lesion and patient levels. Following PI-CAI guidelines, a detection map is made of non-overlapping and non connected clusters, representing predicted csPCa lesions. Each lesion is assigned a unique probability score, chosen as the average of the probabilities of the cluster's voxels. At a lesion level, we report metrics derived from the free-response receiver operating characteristics (FROC) curve which shows sensitivity as a function of the number of false positive detections per patient. 
%which is more suited for a detection task than the standard ROC curve. 
In continuity of previous works done on this csPCa detection task \cite{bosma_semisupervised, saha_2021}, we consider a predicted lesion as a true positive if it intersects a ground truth lesion with an intersection-over-union (IoU) ratio of at least 0.1.
%the criterion to count a predicted lesion as a true positive is to have an intersection-over-union (IuO) of at least 0.1 with a ground truth lesion. 
Since there is no consensus metric to summarize a FROC curve, we choose to report the sensitivity at 1 false positive per patient. 
Another complementary indicator of the performances of detection models is the average precision (AP), defined as the area under the precision-recall curve. Finally, for the patient-level diagnosis performance, the area under the ROC curve (AUROC) is reported. The patient's overall likelihood of harboring csPCa is defined as the maximum score of the predicted lesion clusters. %with the classification score of a patient being the maximum value of the lesions detected by the model.


\subsection{Implementation details and hyperparameters}
%We use  2D and 3D MONAI's DynUNet \textcolor{red}{REF} as backbone architectures of the proposed weak and fully supervised models. 
%The weak model is tested on 2D and 3D MONAI's DynUNet \textcolor{red}{REF}. 
All hyperparameters were determined with grid search on the first fold of the PI-CAI training/validation splits (see Appendix \ref{app:archi_GS} for more details). 
%Weakly supervised models are trained with partial cross-entropy and the image-tag (IT) or image-tag and common bounds (CB) constraint loss. 
%The lower and upper bounds associated with the CS lesions class were empirically set to 5 and 500 voxels for 2D models and to 30 and 4 000 voxels for 3D models based on the analysis of the lesion size distribution for each dataset (See Appendix \ref{app:lesions_charac}). 
The lower and upper bounds associated with the CS lesions class were set to 5 and 500 voxels for 2D models and to 30 and 4 000 voxels for 3D models.
For the prostate class, they were set to 100 and 2 500 in the 2D case and 10 000 and 40 000 in the 3D case. The fully supervised baselines were trained with cross-entropy and generalized DICE loss.
All models were trained for 200 epochs with Adam optimizer, a learning rate of $10^{-3}$ and a weight decay of $10^{-4}$. To compensate for the low amount of lesions in the PI-CAI dataset (see Table \ref{tab:datasets}), sampling was weighted such that 2D transverse slices for 2D models -- or 3D volumes for 3D models -- with and without lesions have the same probability of being drawn in a batch. For post-processing, predicted lesions smaller than 15 voxels are discarded.

\section{Results}
\subsection{Classification and detection performances}

Figure \ref{fig:performances} shows performance of all considered models for the three metrics of interest, namely sensitivity at 1 FP, AP and AUROC. Figure \ref{fig:visual_results} provides examples of visual results of lesion detection maps for some 3D models. Extended visual results, including those of 2D and ensemble models, are showcased in Appendix \ref{app:visual_results}. 

First of all, it is important to note that the best performing model, namely the 3D supervised DynUNet, achieves a mean AUROC of 0.82 and mean AP of 0.42 thus producing a mean aggregated score of 0.62. This performance compares well against the best achievable reported metric on the PI-CAI challenge leader-board. We thus consider it a reliable baseline for our comparison. 
%First of all, it is important to note that the best model, that is the 3D supervised DynUNet achieves  with a score for the PI-CAI challenge, defined as the mean between Average precision and AUROC, of 0.62, thus being a reliable baseline for our comparison.
Surprisingly, the 3D supervised DynUNet is still outperformed by 2D models in terms of sensitivity at 1 FP, including by models trained with weak labels.


\begin{figure}[t!]
\centering
  \includegraphics[width=0.85\linewidth]{images/all_metrics_train_picai_with_nnDetection.pdf}
  \caption{Classification and detection performances of all models. %On PI-CAI, . On the two generalization datasets (Prostate158 and our private dataset), we report both the average results from the 5 different models (one per validation fold) of each method and the ensemble prediction from these models. 
  Reference designates fully supervised 3D DynUNet trained and tested on the Prostate158 or private dataset in 5-fold cross-validation setup. See Appendix \ref{app:numerical_results} for detailed numerical values.}
  \label{fig:performances}
\end{figure}

The two weak models with size constraints (CE+IT, CE+IT+CB) clearly outperform the model trained only with partial and negative cross entropies (Partial CE), showing the interest of the additional size constraint cost functions. Between the image tag (IT) and the common bounds (CB) losses, the latter achieves a higher score in 22 of the 30 configurations. We use the term configuration to refer to a pairwise comparison between models with the same spatial dimension input (2D or 3D) and type of model (ensemble or not) evaluated on a given dataset and for a given metric. For instance, AUROC comparison between 2D CE+IT and 2D CE+IT+CB on Prostate158 accounts for one configuration.  %This is observable for 2D and 3D models, as well as with the ensembling of models. 
Remarkably, the models that have been trained with weak labels can outperform fully supervised models. They also perform favorably compared to nnDetection in most cases.
The weakly supervised CE+IT+CB model achieves better scores than its supervised counterpart in almost all 2D configurations, but only 2 times out of 30 in 3D. %When looking at IT and CB models, this is for almost all 2D configurations but only 2 times out of 30 in 3D configurations, and only for CB.

For almost all models, metrics and datasets, model ensembling improves the generalization performances, with a mean improvement of 20\% among all models and metrics. We can note that for sensitivity at 1FP and AUROC, some models, especially with ensemble predictions, equal or surpass the reference metrics, which is the mean performance over 5 folds of the 3D fully supervised DynUNet trained and tested on the Prostate158 or private datasets, respectively. However, these reference models remain better in terms of AP.\\
Finally, the comparison study provided in Appendix \ref{app:ablation_labels} shows that the weakly supervised models are robust to several scribble annotation strategies and that the one we chose does not bias the model towards an overestimation of its performance.


\subsection{Generalization to unseen data domains}

\begin{figure}[t!]
\centering
    \includegraphics[width=0.85\linewidth]{images/all_metrics_relative_train_picai_with_nnDetection.pdf}
    \caption{Relative change in performances on out-of-distribution test datasets. The reported values are the ratio between a model's performance on a test dataset (Prostate158 or our private dataset) and its cross-validation performance on PI-CAI. %The values that are reported are the ratio between score obtained on a given test dataset and the score obtained in cross-validation.
    }
    \label{fig:performances_relative}
\end{figure}

Figure \ref{fig:performances_relative} shows the relative performance of the models on the two test datasets, that is Prostate158 and our private database, compared to the performance of the same model evaluated on the in-distribution validation dataset. We did not report the results for the cases where the models are trained with partial and negative cross entropies as their absolute performances are much lower than the others. 

For most of the metrics and models, there is, as one could have expected, a notable drop in performance when the models are evaluated on a test set that has been acquired in a different setup from that of the training dataset. The average performance decrease is 28\% among all models and metrics. The relative change can reach values as low as $-61$\% for supervised models.
Quite surprisingly, the Average-Precision score is equal or better -- sometimes by a large amount -- on the test datasets than on the validation datasets for many models.

Compared to fully supervised models, the weak models trained with the CB constraint loss have a more favorable relative change in 19 configurations out of 24 (6 configurations correspond to the PI-CAI dataset and are thus not considered here).
This advantage is also found when compared to the models trained with the IT loss. Ensemble predictions, with a rule as simple as averaging the output probability maps of models obtained from several trainings, almost always help reduce the performance gap in generalization.


\begin{figure}[t!]
\centering
  \includegraphics[width=0.9\linewidth]{images/visual_results_3d.pdf}
  \caption{Example prediction maps of several 3D models. More visual results can be found in Appendix \ref{app:visual_results}. Blue color is for prostate and red for clinically significant lesions.}
  \label{fig:visual_results}
\end{figure}

\section{Discussion and conclusion}

Our proposed weakly supervised method achieves competitive results compared to fully supervised baselines, while requiring only 14\% of annotation voxels of clinically significant lesions. It consistently outperforms 2D supervised DynUNet trained with cross-entropy and generalized DICE loss. In the 3D configuration, the model trained with full segmentation annotations remains better overall. %, but weakly supervised approaches can still be leveraged as they achieve competitive results. 
The addition of the more precise common bounds (CB) size constraint gives better results compared to the image tag (IT) model that was proposed in \cite{Duran_2022}. 
%Among all the methods that are compared, the one with CB loss is the more robust to unseen data domains and is the one that suffers least from a drop in performances when tested on data that does not belong to the training distribution.

%Among all compared methods, the one with CB loss is the most robust to unseen data domains because it suffers the least from a drop in performances when tested on data that does not belong to the training distribution.
Among all compared methods, the weak model with CB loss is the most robust to unseen data domains. As seen in Figure~\ref{fig:performances_relative}, it indeed suffers the least from a performance drop when tested on data that do not belong to the training distribution.

%Our study have also permit to show and is an issue of first interest if deep learning models are aimed to be used in a clinical environment, where heterogeneity is prominent between institutions. 
Our study confirms, for the task of csPCa lesion detection and segmentation, that heterogeneity between training and test databases noticeably impacts the performance of deep learning models and is thus an issue of prime importance if such models are intended to be used in a clinical environment.
We show that one simple way to mitigate this issue is to make ensemble predictions from multiple trainings, as this allows decreasing the performance drop in almost all the configurations we have tested.
%One simple way to mitigate this issue is to make ensemble predictions from multiple trainings, as it has increased the relative score change in almost all the configurations we have tested.
Finally, it is important to note that the best models trained on PI-CAI reach performances on unseen domains that can be on par with fully supervised models trained on these datasets. This is an encouraging result that supports the current trend of building models on a given large training dataset, in a weakly or fully supervised setup, and deploying them in other institutions that have little or no annotated data. Further work includes refining the hyperparameters of the scribble-based weak models as well as designing task-specific and few-shot domain adaptation methods to better handle dataset heterogeneities.

\newpage

% Acknowledgments---Will not appear in anonymized version
\midlacknowledgments{
This work was supported by the RHU PERFUSE (ANR-17-RHUS-0006) of Université Claude Bernard Lyon 1 (UCBL), within the program “Investissements d’Avenir” operated by the French National Research Agency (ANR). It was also partly funded by France Life Imaging (grant ANR-11-INBS-0006).  \\
This work was granted access to the HPC resources of IDRIS under the allocation 2023-AD011013971 made by GENCI.}


\bibliography{midl2024_263}

\clearpage

\appendix

\section{Lesion characteristics for each database}
\label{app:lesions_charac}

\begin{table}[h!]
\centering
\caption{Summary of positive cases and total number of lesions for each dataset.}
\begin{tabular}{c  c  c }
 \hline
 Database &  \# of positive cases / total patients & \# of CS lesions \\
 \hline
 PI-CAI & 220 / 1295 (17\%) & 301 \\ 
 Private dataset & 183 / 219 (84\%) & 408 \\
 Prostate158 & 82 / 139 (59\%) & 236 \\
 \hline
\end{tabular}
\label{tab:datasets}
\end{table}

\begin{figure}[h!]
\centering
  \includegraphics[width=0.8\linewidth]{images/hist_cc_sizes_3d_all.pdf}
  \caption{Histograms (blue) and cumulative histograms (orange) of lesion sizes in 3D for the three datasets. The unit of lesion sizes is the number of voxels for a volume with a spatial spacing of $1 \times 1 \times 3$ mm\textsuperscript{3}. The two vertical red lines show the values of the bounds $a$ and $b$ used for the CB loss (set by grid search), which are equal to 10 and 4000 respectively.}
\end{figure}

\begin{figure}[t!]
\centering
  \includegraphics[width=0.8\linewidth]{images/hist_cc_sizes_2d_all.pdf}
  \caption{Histograms (blue) and cumulative histograms (orange) of slicewise lesion sizes (\textit{i.e.} in 2D) for the three datasets. The unit of lesion sizes is the number of voxels for a volume with a spatial spacing of $1 \times 1$ mm\textsuperscript{2}. The two vertical red lines show the values of the bounds $a$ and $b$ used for the CB loss (set by grid search), which are equal to 10 and 600 respectively.}
\end{figure}

\vspace{2cm}

\section{Characteristics of MRI modalities for each database}
\label{app:modalities_charac}

\begin{figure}[H]
\centering
  \includegraphics[width=\linewidth]{images/histo_all_t2.pdf}
  \caption{T2w voxel intensity distributions for each database.}
\end{figure}

\begin{figure}[H]
\centering
  \includegraphics[width=\linewidth]{images/histo_all_adc.pdf}
  \caption{ADC voxel intensity distributions for each database.}
\end{figure}

\vspace{1.5cm}

\section{More details about the models}
\label{app:archi_GS}

\textbf{Architecture details}

The 2D DynUNet is composed of four stages with a respective number of filters of 32, 64, 128 and 256. The kernel sizes are set to 3 and the stride is of 1 for the shallowest and 2 for the others. With an input size of $[2,96,96]$, the deepest layers have a shape of $[256,12,12]$. We use Instance Normalisation layers and a dropout of ratio 0.1. The network has a total of 3.8 million (learnable) parameters. 

The 3D DynUNet is composed of four stages with a respective number of filters of 32, 64, 128 and 256. The kernel sizes are set to 3 and the stride is of 1 for the shallowest and the deepest blocks and 2 for the others. With an input size of $[2,20,96,96]$, the deepest layers have a shape of $[256,5,24,24]$. We use Instance Normalisation layers and a dropout of ratio 0.1. The network has a total of 10.7 million (learnable) parameters. \\

\noindent
\textbf{Details about the grid search}

We did the grid search on the hyperparameters of the models as follows : for the weak constrained models, we first found the best combination of parameters for the learning rate, the weight decay and constraint weight $\lambda$ on the model with image tag. Once these parameters were set for the IT model, we reused them for the model with common bounds constraint and did the grid search for the parameters $a$ and $b$. For the supervised model and Partial CE model, we only did the grid search on the learning rate and the weight decay. The values that we tried for the hyperparameters are detailed hereunder :
\begin{itemize}
\itemsep-1.5mm 
    \item Learning rate : between $1\mathrm{e}{-4}$ and $1\mathrm{e}{-2}$ with a linear step of 0.5 in the logarithmic scale.
    \item Weight decay : between $1\mathrm{e}{-5}$ and $1\mathrm{e}{-2}$ with a linear step of 1 in the logarithmic scale.
    \item $\lambda$ : between $1\mathrm{e}{-5}$ and $1\mathrm{e}{-2}$ with a linear step of 1 in the logarithmic scale in the two-dimensional case and between $1\mathrm{e}{-5}$ and $1\mathrm{e}{-9}$ with a linear step of 1 in the logarithmic scale in the three-dimensional case. Its optimal value was found to be $10^{-5}$ and $10^{-8}$ for 2D and 3D models respectively.
    \item $a$ and $b$ : $\{5, 10\}$ for $a$ and $\{100,200,300,400,500,600\}$ for $b$ in the two-dimensional case and $\{10,30,50,70,100\}$ for $a$ and $\{1500,2000,2500,3000,3500,4000,5000,6000\}$ for $b$ in the three-dimensional case. The grid search in the 2D case is smaller because an optimization of the hyperparameters had already been done in \cite{Duran_2022}.
    \item the class weights $w_c$ were set to 0.14 for the prostate and 0.22 for the lesion based on empirical values reported in \cite{Duran_2022}.
\end{itemize}

\clearpage

\section{Full numerical results associated with Figure~\ref{fig:performances}}
\label{app:numerical_results}

\begin{table}[H]
\caption{Full results of the models on PI-CAI dataset. For each metric, the best model is in \textbf{bold} and the second best is \underline{underlined}. \textsuperscript{\textdagger} is used when a model does not reach 1 FP per patient; in such case, we report the maximum sensitivity.}
\begin{tabular}{@{}lccc@{}}
Model         & Sensi at 1 FP & Average Precision & AUROC \\ \hline
2D Supervised &  $0.676 \pm 0.026$  & $0.150 \pm 0.031$   &  $0.748 \pm 0.031$  \\
2D Partial CE &  $0.446 \pm 0.091 \;$  &  $0.068 \pm 0.027$  &   $0.581 \pm 0.041$ \\
2D CE + IT &    $\textbf{0.746} \pm 0.028$ &  $0.256 \pm 0.042$  &   $0.776 \pm 0.019$\\
2D CE + IT + CB &    $\underline{0.733} \pm 0.072$  &  $0.258 \pm 0.061$  &  $\underline{0.790} \pm 0.019$ \\
3D Supervised &    $0.705 \pm 0.058$\textsuperscript{\textdagger} &  $\textbf{0.412} \pm 0.047$  &  $\textbf{0.825} \pm 0.013$  \\
3D Partial CE &   $0.060 \pm 0.058\textsuperscript{\textdagger}$ &  $0.011 \pm 0.009$  &  $0.551 \pm 0.085$  \\
3D CE + IT &  $0.332 \pm 0.081\textsuperscript{\textdagger}$  &   $\underline{0.349} \pm 0.071$ &  $0.635 \pm 0.028$ \\
3D CE + IT + CB &   $0.545 \pm 0.252\textsuperscript{\textdagger}$  & $0.278 \pm 0.121$   &  $0.733 \pm 0.091$ \\
nnDetection \textit{full} &  $0.332 \pm 0.081$  &   $\underline{0.349} \pm 0.071$ &  $0.635 \pm 0.028$ \\
nnDetection \textit{weak} &   $0.545 \pm 0.252$  & $0.278 \pm 0.121$   &  $0.733 \pm 0.091$ \\ \hline
\end{tabular}
\end{table}


\begin{table}[H]
\begin{tabular}{@{}lccc@{}}
Model         & Maximum sensitivity & Avg. FP per patient \\ \hline
2D Supervised &  $\textbf{0.803} \pm 0.075$  & $3.21 \pm 0.52$  \\
2D Partial CE &  $0.339 \pm 0.071$  &  $10.69 \pm 2.53$  \\
2D CE + IT   &  $0.516 \pm 0.083$  & $3.67 \pm 1.06$   \\
2D CE + IT + CB &    $\underline{0.756} \pm 0.096$  &  $1.55 \pm 0.52$   \\
3D Supervised &    $0.705 \pm 0.058$ &  $\underline{0.69} \pm 0.15$    \\
3D Partial CE &   $0.126 \pm 0.049$ &  $5.02 \pm 2.54$  \\
3D CE + IT &  $0.332 \pm 0.081$  &   $\textbf{0.12} \pm 0.04$  \\
3D CE + IT + CB &   $0.715 \pm 0.075$  & $1.42 \pm 0.85$   \\ \hline
\end{tabular}
\end{table}

\bigskip

\begin{table}[H]
\caption{Full results of the models on our private dataset. The results after ensembling are shown in parentheses. For each metric, the best model is in \textbf{bold} and the second best is \underline{underlined} (excluding the reference model). \textsuperscript{\textdagger} is used when a model does not reach 1 FP per patient; in such case, we report the maximum sensitivity.}
\begin{tabular}{@{}lccc@{}}
Model         & Sensi at 1 FP & Average Precision & AUROC \\ \hline
2D Supervised &  $0.303 \pm 0.054 \; (0.392)$  & $0.150 \pm 0.034 \; (0.300)$   &  $0.571 \pm 0.023 \; (\underline{0.653})$  \\
2D Partial CE &  $0.143 \pm 0.037 \; (0.267)$  &  $0.056 \pm 0.019 \; (0.120)$  &   $0.508 \pm 0.037 \; (0.551)$ \\
2D CE + IT &    $\textbf{0.405} \pm 0.036 \; (\textbf{0.504})$ &  $0.270 \pm 0.034 \; (0.369)$  &   $0.574 \pm 0.042 \; (0.606)$\\
2D CE + IT + CB &    $\underline{0.376} \pm 0.056 \; (\underline{0.401})$  &  $\underline{0.280} \pm 0.047 \; (\underline{0.399})$  &  $\textbf{0.641} \pm 0.031 \; (\textbf{0.661})$ \\
3D Supervised &    $0.305 \pm 0.045\textsuperscript{\textdagger} \; (0.272)$ &  $\textbf{0.337} \pm 0.030 \; (\textbf{0.432})$  &  $\underline{0.627} \pm 0.037 \; (0.626)$  \\
3D Partial CE &   $0.007 \pm 0.003\textsuperscript{\textdagger} \; (0.017)$ &  $0.005 \pm 0.002 \; (0.009)$  &  $0.557 \pm 0.059 \; (0.552)$  \\
3D CE + IT &  $0.077 \pm 0.032\textsuperscript{\textdagger} \; (0.056)$  &   $0.221 \pm 0.072 \; (0.262)$ &  $0.498 \pm 0.038 \; (0.495)$ \\
3D CE + IT + CB &   $0.249 \pm 0.117\textsuperscript{\textdagger} \; (0.332)$  & $0.236 \pm 0.089 \; (0.383)$   &  $0.558 \pm 0.029 \; (0.604)$ \\
nnDetection \textit{full} &  $0.332 \pm 0.081$  &   $\underline{0.349} \pm 0.071$ &  $0.635 \pm 0.028$ \\
nnDetection \textit{weak} &   $0.545 \pm 0.252$  & $0.278 \pm 0.121$   &  $0.733 \pm 0.091$ \\
Reference &   $0.651$  & $0.542$   &  $0.639$ \\ \hline
\end{tabular}
\end{table}

\begin{table}[H]
\begin{tabular}{@{}lccc@{}}
Model         & Maximum sensitivity & Avg. FP per patient \\ \hline
2D Supervised &  $\underline{0.407} \pm 0.060 \; (0.397)$  & $3.84 \pm 0.78 \; (1.47)$  \\
2D Partial CE &  $0.335 \pm 0.051 \; (0.371)$  &  $11.00 \pm 2.76 \; (8.23)$  \\
2D CE + IT &    $\textbf{0.512} \pm 0.046 \; (\textbf{0.522})$ &  $3.71 \pm 0.89 \; (1.84)$  \\
2D CE + IT + CB &    $0.391 \pm 0.073 \; (\underline{0.401})$  &  $1.55 \pm 0.32 \; (0.61)$   \\
3D Supervised &    $0.306 \pm 0.045 \; (0.272)$ &  $\underline{0.73} \pm 0.25 \; (\underline{0.22})$    \\
3D Partial CE &   $0.039 \pm 0.013 \; (0.034)$ &  $4.99 \pm 2.65 \; (3.86)$  \\
3D CE + IT &  $0.077 \pm 0.032 \; (0.056)$  &   $\textbf{0.13} \pm 0.055 \; (\textbf{0.06})$  \\
3D CE + IT + CB &   $0.331 \pm 0.042 \; (0.332)$  & $1.78 \pm 0.92 \; (0.52)$   \\
Reference &   $0.655 \pm 0.063$  & $1.40 \pm 0.20$   \\ \hline
\end{tabular}
\end{table}

\begin{table}[H]
\caption{Full results of the models on Prostate158. The results after ensembling are shown in parentheses. For each metric, the best model is in \textbf{bold} and the second best is \underline{underlined} (excluding the reference model). \textsuperscript{\textdagger} is used when a model does not reach 1 FP per patient; in such case, we report the maximum sensitivity.}
\begin{tabular}{@{}lccc@{}}
Model         & Sensi at 1 FP & Average Precision & AUROC \\ \hline
2D Supervised &  $0.433 \pm 0.056 \; (0.552)$  & $0.221 \pm 0.061 \; (0.412)$   &  $0.666 \pm 0.025 \; (0.740)$  \\
2D Partial CE &  $0.181 \pm 0.026 \; (0.271)$  &  $0.048 \pm 0.015 \; (0.121)$  &   $0.510 \pm 0.029 \; (0.605)$ \\
2D CE + IT &    $\textbf{0.569} \pm 0.037 \; (\textbf{0.656})$ &  $0.355 \pm 0.027 \; (0.459)$  &   $\underline{0.728} \pm 0.027 \; (0.757)$\\
2D CE + IT + CB &    $\underline{0.542} \pm 0.059 \; (\underline{0.635})$  &  $\textbf{0.388} \pm 0.041 \; (\underline{0.421})$  &  $0.726 \pm 0.028 \; (\textbf{0.781})$ \\
3D Supervised &    $0.438 \pm 0.071\textsuperscript{\textdagger} \; (0.438)$ &  $\underline{0.366} \pm 0.031 \; (\textbf{0.484})$  &  $\textbf{0.733} \pm 0.019 \; (\underline{0.780})$  \\
3D Partial CE &   $0.027 \pm 0.018\textsuperscript{\textdagger} \; (0.031)$ &  $0.010 \pm 0.004 \; (0.011)$  &  $0.459 \pm 0.030 \; (0.402)$  \\
3D CE + IT &  $0.160 \pm 0.044\textsuperscript{\textdagger} \; (0.135)$  &   $0.322 \pm 0.044 \; (0.373)$ &  $0.583 \pm 0.030 \; (0.623)$ \\
3D CE + IT + CB &   $0.375 \pm 0.176\textsuperscript{\textdagger} \; (0.490)$  & $0.286 \pm 0.115 \; (0.456)$   &  $0.697 \pm 0.046 \; (0.733)$ \\
nnDetection \textit{full} &  $0.332 \pm 0.081$  &   $\underline{0.349} \pm 0.071$ &  $0.635 \pm 0.028$ \\
nnDetection \textit{weak} &   $0.545 \pm 0.252$  & $0.278 \pm 0.121$   &  $0.733 \pm 0.091$ \\
Reference &   $0.643$  & $0.619$   &  $0.708$ \\ \hline
\end{tabular}
\end{table}

\begin{table}[H]
\begin{tabular}{@{}lccc@{}}
Model         & Maximum sensitivity & Avg. FP per patient \\ \hline
2D Supervised &  $0.594 \pm 0.055 \; (0.552)$  & $4.47 \pm 0.90 \; (1.73)$  \\
2D Partial CE &  $0.290 \pm 0.051 \; (0.323)$  &  $8.72 \pm 1.86 \; (6.03)$  \\
2D CE + IT &    $\textbf{0.710} \pm 0.029 \; (\textbf{0.698})$ &  $5.40 \pm 0.92 \; (3.10)$  \\
2D CE + IT + CB &    $\underline{0.602} \pm 0.074 \; (\underline{0.635})$  &  $2.47 \pm 0.79 \; (1.05)$   \\
3D Supervised &    $0.452 \pm 0.082 \; (0.438)$ &  $\underline{0.94} \pm 0.24 \; (\underline{0.25})$    \\
3D Partial CE &   $0.087 \pm 0.036 \; (0.094)$ &  $5.02 \pm 2.60 \; (3.78)$  \\
3D CE + IT &  $0.160 \pm 0.044 \; (0.135)$  &   $\textbf{0.15} \pm 0.07 \; (\textbf{0.07})$  \\
3D CE + IT + CB &   $0.496 \pm 0.043 \; (0.490)$  & $2.07 \pm 1.00 \; (0.68)$   \\
Reference &   $0.643$  & $1.05$   \\ \hline
\end{tabular}
\end{table}

\section{Supplementary visual results}
\label{app:visual_results}

\begin{figure}[H]
\centering
    \includegraphics[width=0.8\linewidth]{images/visual_results_all.pdf}
    \caption{Example prediction maps of several 2D models, ensemble of 2D models and ensemble of 3D models. Blue color is for prostate and red for clinically significant lesions.}
\label{fig:more_visual_results}
\end{figure}

\clearpage

\section{Study on the method for generating weak annotations}
\label{app:ablation_labels}

\setlist{nosep}

Modeling the process of obtaining the weak scribble annotations is crucial for correctly evaluating the relevance of the weakly supervised models, as an unrealistic modeling could lead to under- or over-estimated performance of the model. In order to assess the robustness of the weakly supervised models to such annotation process, we provide here a comparison between several annotation methods that we consider to be realistic ways of producing scribble annotations. Note that these annotation methods apply only to the lesion class; for the prostate class, the method described in section \ref{subsec:weak_annotations} and referred to as \textit{random valid} is systematically used. %Random valid is systematically used. 
The methods are as follows :
\begin{itemize}
\setlength\itemsep{0.1em}
    \item \textit{Random valid} denotes the method described in section \ref{subsec:weak_annotations}.
    \item \textit{Center distance map}. For each lesion, we compute the Euclidean distance between each non-zero pixel in the lesion mask and the nearest zero pixel. The center of the lesion is the location of the maximum value of this map, and the circular annotation that is drawn is the largest circle of radius less than or equal to 3\,mm that can fit in the CS lesion mask. The amount of annotated pixels with this method is similar to that of \textit{random valid} (14\%).
    \item \textit{Random distance map}. From the distance map obtained as described above, we randomly draw a center for the circular scribble with a probability density proportional to the distance map, to bias the selection of the scribble center towards the center of the lesion. In this case we draw a circle which always has a radius of 3\,mm, resulting in an amount of annotated pixels greater than that of the two previous methods (16.5\%).
    \item \textit{Erosions}. For each lesion, we iteratively trim its mask to reduce its size until we obtain a surface below a certain value. When the threshold surface is equivalent to that of a circle of radius 3\,mm, this results in an amount of annotated pixels below that of the other methods (10\%). Hence, for better comparison, we also perform this procedure with a higher threshold to get the same amount of CS lesion annotations (14\%).
\end{itemize}


\vspace{0.4cm}

\begin{table}[H]
\caption{Results of 2D CE + IT + CB on PI-CAI dataset for several weak annotation methods. For each metric, the best model is in \textbf{bold} and the second best is \underline{underlined}.}
\begin{tabular}{@{}lccc@{}}
Model         & Sensi at 1 FP & Average Precision & AUROC \\ \hline
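% NOTE(review): the Random valid row previously duplicated the Prostate158 values; it now matches the
% 2D CE + IT + CB cross-validation results on PI-CAI reported in the main PI-CAI table, and the
% bold/underline ranking was updated accordingly. The AP/AUROC values of the Random distance map row
% are identical to the Center distance map values on the private dataset -- please double-check them.
Random valid &   $0.733 \pm 0.072$  &  $\underline{0.258} \pm 0.061$  &  $\textbf{0.790} \pm 0.019$ \\
Center distance map &  $\underline{0.734} \pm 0.080$  &  $0.250 \pm 0.037$  &  $\underline{0.788} \pm 0.040$ \\
Random distance map &  $0.689 \pm 0.066$  &  $\textbf{0.296} \pm 0.031$  &  $0.643 \pm 0.032$ \\
Erosions (10\%) &    $0.681 \pm 0.045$  &  $0.185 \pm 0.016$  &  $0.759 \pm 0.019$ \\
Erosions (14\%) &    $\textbf{0.746} \pm 0.062 $  &  $0.253 \pm 0.059$  &  $0.773 \pm 0.022$ \\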
\end{tabular}
\end{table}


\begin{table}[H]
\caption{Results of 2D CE + IT + CB on our private dataset for several weak annotation methods. The results after ensembling are shown in parentheses. For each metric, the best model is in \textbf{bold} and the second best is \underline{underlined}.}
\begin{tabular}{@{}lccc@{}}
Model         & Sensi at 1 FP & Average Precision & AUROC \\ \hline
Random valid &   $0.376 \pm 0.056 \; (0.401)$  &  $0.280 \pm 0.047 \; (0.399)$  &  $\underline{0.641} \pm 0.031 \; (0.661)$ \\
Center distance map &  $\textbf{0.422} \pm 0.025 \; (\textbf{0.453})$  &  $\textbf{0.296} \pm 0.031 \; (\textbf{0.415})$  &  $\textbf{0.643} \pm 0.032 \; (\textbf{0.685})$ \\
Random distance map &  $0.347 \pm 0.040 \; (0.371)$  &  $0.247 \pm 0.066 \; (0.379)$  &  $0.557 \pm 0.038 \; (0.627)$ \\
Erosions (10\%) &    $0.366 \pm 0.030 \; (\underline{0.431})$  &  $0.251 \pm 0.023 \; (0.370)$  &  $0.622 \pm 0.026 \; (\underline{0.682})$ \\
Erosions (14\%)&    $\underline{0.411} \pm 0.047 \; (0.426)$  &  $\underline{0.288} \pm 0.042 \; (\underline{0.402})$  &  $0.611 \pm 0.064 \; (0.663)$ \\
\end{tabular}
\end{table}

\vspace{0.5cm}

\begin{table}[H]
\caption{Results of 2D CE + IT + CB on Prostate158 for several weak annotation methods. The results after ensembling are shown in parentheses. For each metric, the best model is in \textbf{bold} and the second best is \underline{underlined}.}
\begin{tabular}{@{}lccc@{}}
Model         & Sensi at 1 FP & Average Precision & AUROC \\ \hline
Random valid &  $0.542 \pm 0.059 \; (\textbf{0.635})$  &  $\textbf{0.388} \pm 0.041 \; (0.421)$  &  $0.726 \pm 0.028 \; (0.781)$ \\
Center distance map &  $\textbf{0.564} \pm 0.032 \; (0.615)$  &  $\underline{0.355} \pm 0.028 \; (0.490)$  &  $\textbf{0.760} \pm 0.035 \; (\underline{0.788})$ \\
Random distance map &  $0.485 \pm 0.044 \; (0.615)$  &  $0.268 \pm 0.052 \; (\underline{0.502})$  &  $0.684 \pm 0.037 \; (0.735)$ \\
Erosions (10\%)&    $\underline{0.550} \pm 0.026 \; (\underline{0.625})$  &  $0.346 \pm 0.023 \; (\textbf{0.504})$  &  $\underline{0.735} \pm 0.054 \; (0.773)$ \\
Erosions (14\%)&    $0.519 \pm 0.030 \; (0.614)$  &  $0.317 \pm 0.047 \; (0.483)$  &  $0.731 \pm 0.025 \; (\textbf{0.808})$ \\
\end{tabular}
\end{table}

\vspace{0.5cm}

\section{Model results for prostate segmentation}
\label{app:dice_performances}

Although not within the scope of this work, we provide hereunder a visualization similar to Figure~\ref{fig:performances} for the prostate segmentation Dice score. As suggested by the example prediction maps shown in Figures \ref{fig:visual_results} and \ref{fig:more_visual_results}, the regularization induced by the common bounds loss greatly improves the segmentation in the weakly supervised setup compared to models with partial cross-entropy (CE) or cross-entropy and image tag (CE+IT). Note that the bounds $a$ and $b$ were set roughly for the prostate class: in the 2D case, we used the ones that were set in \cite{Duran_2022}, and in the 3D case, we empirically set them based on the analysis of the distribution of prostate sizes.

\begin{figure}[h!]
\centering
    \includegraphics[width=\linewidth]{images/dice_train_picai.pdf}
    \caption{DICE prostate segmentation score for all models. Weak labels only represent about 2\% of the total annotated voxels for the prostate class. Reference designates fully supervised 3D DynUNet trained and tested on the Prostate158 or private dataset in 5-fold cross-validation setup.}
    \label{fig:performances_dice}
\end{figure}

\end{document}
