\section{Results}

In this section, two datasets from four separate institutions, the PI-CAI hidden tuning cohort (N=100) and our out-of-distribution in-house cohort (N=200), are used to evaluate the performance of our trained model against all baseline models. As each model was trained using 5-fold cross-validation, a simple mean ensemble is applied to the softmax outputs of each model's folds before extracting lesion candidates using the post-processing steps described in \sectionref{sec:metrics}. The lesion candidates are then reverted to their original size before the metrics are evaluated. Results for zonal segmentation can be seen in \appendixref{appendix:zonal-evaluation}.

% Due to applied cropping and normalization we opted to exclude the Prostate 158 cohort from our PCa detection assesment, however, it is included in the zonal segmentation assesment which can be seen in \appendixref{appendix:zonal-evaluation}.

% \subsection{In-House Dataset}
% % The evaluation of the in-house dataset (N=200) was performed on a local cluster using a single V100 (32GB VRAM) GPU adhering to privacy requirements.  

% The evaluation of the PI-CAI baselines was performed by utilizing the same docker containers used for submission to the PI-CAI grand challenge in order to get a fair comparison between the datasets.

% \begin{filecontents*}{dlr.csv}
% Model,Score,AUC, AP,Type
% U-Mamba MTL (ours),\textbf{0.805 $\pm$ 0.063},\textbf{0.925 $\pm$ 0.043},\textbf{0.685 $\pm$ 0.097},Mamba
% U-Mamba (ours),0.799 $\pm$ 0.060 ,0.923 $\pm$ 0.044,0.674 $\pm$ 0.089,Mamba
% Swin UNETR,0.773 $\pm$ 0.063,0.902 $\pm$ 0.054,0.643 $\pm$ 0.094,Transformer
% nnDetection,0.765 $\pm$ 0.064,0.920 $\pm$ 0.045 ,0.610 $\pm$ 0.099 ,CNN
% U-Net,0.759 $\pm$ 0.061 ,0.913 $\pm$ 0.043 ,0.605 $\pm$ 0.095,CNN
% nnUNet,0.731 $\pm$ 0.070,0.910 $\pm$ 0.045 ,0.555 $\pm$ 0.113 ,CNN

% \end{filecontents*}

% % \begin{filecontents*}{dlr.csv}
% % Model,Score,AUC, AP,Type
% % U-Mamba MTL (ours),\textbf{0.805},\textbf{0.925},\textbf{0.685},Mamba
% % U-Mamba (ours),0.799,0.923,0.674,Mamba
% % Swin UNETR,0.773,0.902,0.643,Transformer
% % nnDetection,0.765,0.920,0.610,CNN
% % U-Net,0.759,0.913,0.605,CNN
% % nnUNet,0.731,0.910,0.555,CNN

% % \end{filecontents*}


% \begin{table}[htbp]
%     \centering
%     \csvautobooktabular[separator=comma]{dlr.csv}
%     \label{tab:quantitative_results_dlr}
%     \caption{Results from our local dataset N=200, where  $\pm$  refers to the largest difference from mean to the 95\% confidence interval bounds, derived from 10.000 bootstrap samples \cite{jurdi_confidence_2023}.}
% \end{table}

% \begin{figure}[htbp]
% \centering
% \hspace*{0.7cm}
% \includegraphics[width=0.8\textwidth]{figures/qualitative_dlr.png}
% \caption{Qualitative comparison of the PCa detection maps among all models compared to ground truth (GT). Please note that all models predicts segmentation masks, except nnDet which is predicting bounding boxes.} \label{fig2}
% \end{figure}

% The quantitative results from the evaluation (\tableref{tab:quantitative_results_dlr}) shows that our U-Mamba MTL model achieves the highest scores for AUC, AP and the aggregated score. Although the aggregated score is closely followed by the un-altered U-Mamba, U-Mamba MTL displays its superiority in AP.



% \subsection{PI-CAI Development Set}

% The official PI-CAI challenge ended in 2022, but the open development phase still offers users to submit a docker container with their model in order to validate on the 100 cases from the PI-CAI Open Development set. The performance metrics were directly gathered from the challenge ranking website \cite{pi-cai-dev-leaderboard}, and thus no confidence intervals was calculated.

% % A submission was sent for validation for U-Mamba MTL, U-Mamba and Swin UNETR using the same inference pipeline as outlined for the in-house dataset.




% \begin{filecontents*}{picai-dev.csv}
% Model,Score,AUC,AP
% \textbf{U-Mamba MTL (ours)},\textbf{0.735},0.843,0.622
% nnDetection,0.734,\textbf{0.885},0.582
% U-Net,0.731,0.829,0.633
% U-Mamba (ours),0.727,0.820,\textbf{0.635}
% nnU-Net ,0.714,0.818,0.610
% Swin UNETR,0.665,0.792,0.537
% \end{filecontents*}

% \begin{table}[htbp]
%     \centering
%     \csvautobooktabular[separator=comma]{picai-dev.csv}
%     \label{tab:val_results}
%     \caption{Results on Hidden Development (PI-CAI N=100)}
% \end{table}

\subsection{Qualitative Results}

\figureref{fig:qualitative} shows a selection of qualitative results on the in-house dataset (N=200). The selected samples highlight cases where some or all models fail to produce correct predictions.

\begin{figure}[h]
\centering
\hspace*{0.7cm}
\includegraphics[width=0.8\textwidth]{figures/qualitative_dlr_fail.png}
\caption{Qualitative comparison on the in-house dataset (N=200) of the PCa detection maps among all models compared to ground truth (GT), overlaid on the ADC channel. The green arrow highlights an area where hypointensity is lacking. The red arrows highlight areas with hypointensity.} \label{fig:qualitative}
\end{figure}


\subsection{Quantitative Results}

\begin{filecontents*}{picai-dev.csv}
Model,Score,AUC,AP, Ranking, Parameters
\textbf{U-Mamba MTL Single (ours)},\textbf{0.781},0.867,\textbf{0.696}, \textbf{23rd}, 73.6M
U-Mamba MTL Dual (ours),0.735,0.843,0.622, 134th, 114M
nnDetection,0.734,\textbf{0.885},0.582, 139th, 24.7M
U-Net,0.731,0.829,0.633, 144th, 31.8M
U-Mamba,0.727,0.820,0.635, 157th, 73.6M
nnU-Net ,0.714,0.818,0.610, 188th, 44.8M
Swin UNETR,0.665,0.792,0.537, 241st, 72.8M
\end{filecontents*}

\begin{table}[htb]
    \centering
    \csvautobooktabular[separator=comma]{picai-dev.csv}
    \caption{Results on PI-CAI hidden development set (N=100)}
    \label{tab:quantitative_picai}
\end{table}

The evaluation of the models on the PI-CAI hidden development set (N=100) was obtained by submitting a Docker container for each trained model to the challenge website~\cite{pi-cai-dev-leaderboard}.
\tableref{tab:quantitative_picai} shows the quantitative results on the PI-CAI hidden development set, where our U-Mamba MTL Single achieved the highest aggregated score.

\begin{filecontents*}{dlr.csv}
Model,Score,AUC, AP
\textbf{U-Mamba MTL Single (ours)}, \textbf{0.818 $\pm$ 0.062}, \textbf{0.932 $\pm$ 0.041}, \textbf{0.705 $\pm$ 0.101}
U-Mamba MTL Dual (ours),0.805 $\pm$ 0.063,0.925 $\pm$ 0.043,0.685 $\pm$ 0.097
U-Mamba, 0.799 $\pm$ 0.060 ,0.923 $\pm$ 0.044,0.674 $\pm$ 0.089
Swin UNETR,0.773 $\pm$ 0.063,0.902 $\pm$ 0.054,0.643 $\pm$ 0.094
nnDetection,0.765 $\pm$ 0.064,0.920 $\pm$ 0.045 ,0.610 $\pm$ 0.099 
U-Net,0.759 $\pm$ 0.061 ,0.913 $\pm$ 0.043 ,0.605 $\pm$ 0.095
nnU-Net,0.731 $\pm$ 0.070,0.910 $\pm$ 0.045 ,0.555 $\pm$ 0.113
\end{filecontents*}

\begin{table}[htb]
    \centering
    \csvautobooktabular[separator=comma]{dlr.csv}
    \caption{Results from our in-house dataset (N=200), where $\pm$ refers to the largest difference from the mean to the 95\% confidence interval bounds, derived from 10,000 bootstrap samples~\cite{jurdi_confidence_2023}.}
    \label{tab:quantitative_results_dlr}
\end{table}


% \tableref{tab:quantitative_results_dlr} shows the quantitative results on the out-of-distribution in-house dataset (N=200), where U-Mamba MTL achieves the highest score in all performance metrics.

\tableref{tab:quantitative_results_dlr} shows the quantitative results on our in-house, out-of-distribution dataset (N=200). U-Mamba MTL Single achieves the highest score across all performance metrics.




% \begin{filecontents*}{picai-dev.csv}
% Model,Score,AUC,AP, Type
% \textbf{U-Mamba MTL (ours)},\textbf{0.735},0.843,0.622, Mamba
% nnDetection,0.734,\textbf{0.885},0.582, CNN
% U-Net,0.731,0.829,0.633, CNN
% U-Mamba,0.727,0.820,\textbf{0.635}, Mamba
% nnU-Net ,0.714,0.818,0.610, CNN
% Swin UNETR,0.665,0.792,0.537, Transformer
% \end{filecontents*}

% \begin{table}[htbp]
%     \centering
%     \csvautobooktabular[separator=comma]{picai-dev.csv}
%     \caption{Results on PI-CAI hidden development set (N=100)}
%     \label{tab:quantitative_picai}
% \end{table}

% The evaluation of the models on the PI-CAI hidden development set (N=100) was acquired by submitting a docker container for each trained model to the challenge website \cite{pi-cai-dev-leaderboard}.
% \tableref{tab:quantitative_picai} shows the quantitative results on the PI-CAI hidden development set where our U-Mamba MTL achieved the highest aggregated score.






% The quantitative results from the evaluation (\tableref{tab:quantitative_results_dlr}) shows that our U-Mamba MTL model achieves the highest scores for AUC, AP and the aggregated score. Although the aggregated score is closely followed by the un-altered U-Mamba, U-Mamba MTL displays its superiority in AP.








