\section{Results}\label{sec:results}


\begin{figure}[t]
    \centering
    
    \resizebox{0.8\textwidth}{!}{%
    \begin{tabular}{ccccc}
        \textbf{Original} & \textbf{Ground Truth} & \textbf{Ours} & \textbf{Ours CN}  & \textbf{HoVer-Net} \\
        \includegraphics[width=0.18\textwidth, frame]{EXAMPLES/im0500.jpg} &
        \includegraphics[width=0.18\textwidth, frame]{EXAMPLES/sample_500_true_contours_pannuke-combined-32_epoch_80.pth.jpg} &
        \includegraphics[width=0.18\textwidth, frame]{EXAMPLES/sample_500_contours_pannuke-combined-32_epoch_80.pth.jpg} &
        \includegraphics[width=0.18\textwidth, frame]{EXAMPLES/sample_500_contours_pannuke-combined-convnext-32_epoch_90.pth.jpg} &
        \includegraphics[width=0.18\textwidth, frame]{EXAMPLES/overlay/im0500.jpg} \\
        
        \includegraphics[width=0.18\textwidth, frame]{EXAMPLES/im0900.jpg} &
        \includegraphics[width=0.18\textwidth, frame]{EXAMPLES/sample_900_true_contours_pannuke-combined-32_epoch_80.pth.jpg} &
        \includegraphics[width=0.18\textwidth, frame]{EXAMPLES/sample_900_contours_pannuke-combined-32_epoch_80.pth.jpg} &
        \includegraphics[width=0.18\textwidth, frame]{EXAMPLES/sample_900_contours_pannuke-combined-convnext-32_epoch_90.pth.jpg} &
        \includegraphics[width=0.18\textwidth, frame]{EXAMPLES/overlay/im0900.jpg} \\

        \includegraphics[width=0.18\textwidth, frame]{EXAMPLES/test_5.jpg} &
        \includegraphics[width=0.18\textwidth, frame]{EXAMPLES/sample_4_true_contours_consep-combined-convnext_epoch_100.pth.jpg} &
        \includegraphics[width=0.18\textwidth, frame]{EXAMPLES/sample_4_contours_consep-dice-combined-w-background10_epoch_100.pth .jpg} &
        \includegraphics[width=0.18\textwidth, frame]{EXAMPLES/sample_4_contours_consep-combined-convnext_epoch_100.pth.jpg} &
        \includegraphics[width=0.18\textwidth, frame]{EXAMPLES/overlay/test_5.jpg} \\

        \includegraphics[width=0.18\textwidth, frame]{EXAMPLES/test_10.jpg} &
        \includegraphics[width=0.18\textwidth, frame]{EXAMPLES/sample_5_true_contours_consep-combined-convnext_epoch_100.pth.jpg} &
        \includegraphics[width=0.18\textwidth, frame]{EXAMPLES/sample_5_contours_consep-dice-combined-w-background10_epoch_100.pth .jpg} &
        \includegraphics[width=0.18\textwidth, frame]{EXAMPLES/sample_5_contours_consep-combined-convnext_epoch_100.pth.jpg} &
        \includegraphics[width=0.18\textwidth, frame]{EXAMPLES/overlay/test_10.jpg} \\

        \includegraphics[width=0.18\textwidth, frame]{EXAMPLES/(H-DAB), VH21B050140A001007 (x=82510.0, y=141583.0, w=1024.0, h=1024.0).jpg} &
        \includegraphics[width=0.18\textwidth, frame]{EXAMPLES/sample_8_true_contours_ki67-combined-50140_epoch_100.pth.jpg} &
        \includegraphics[width=0.18\textwidth, frame]{EXAMPLES/sample_8_contours_ki67-combined-50140_epoch_100.pth.jpg} &
        \includegraphics[width=0.18\textwidth, frame]{EXAMPLES/sample_8_contours_ki67-combined-convnext-50140_epoch_100.pth.jpg} &
        \includegraphics[width=0.18\textwidth, frame]{EXAMPLES/overlay/(H-DAB), VH21B050140A001007 (x=82510.0, y=141583.0, w=1024.0, h=1024.0).jpg} \\

        \includegraphics[width=0.18\textwidth, frame]{EXAMPLES/(H-DAB), VH21B050140A001007 (x=83434.0, y=136039.0, w=1024.0, h=1024.0).jpg} &
        \includegraphics[width=0.18\textwidth, frame]{EXAMPLES/sample_13_true_contours_ki67-combined-50140_epoch_100.pth.jpg} &
        \includegraphics[width=0.18\textwidth, frame]{EXAMPLES/sample_13_contours_ki67-combined-50140_epoch_100.pth.jpg} &
        \includegraphics[width=0.18\textwidth, frame]{EXAMPLES/sample_13_contours_ki67-combined-convnext-50140_epoch_100.pth.jpg} &
        \includegraphics[width=0.18\textwidth, frame]{EXAMPLES/overlay/(H-DAB), VH21B050140A001007 (x=83434.0, y=136039.0, w=1024.0, h=1024.0).jpg} \\
        
    \end{tabular}
    }
    \caption{Qualitative results across PanNuke (rows 1--2), CoNSeP (rows 3--4), and Ki-67 (rows 5--6). ``Ours'' = ResNeXt-based model, ``Ours CN'' = ConvNeXt-based model. Overall, we observe no major differences in classification performance across the datasets, with notable improvements in PanNuke (particularly row~2). However, a slight degradation in segmentation quality can be seen in cases like image~4 of row~1, reflecting typical watershed artifacts.}
    \label{fig:qualitative_comparison}
\end{figure}

%\subsection{Evaluation Metrics}\label{sec:eval}

\paragraph{Evaluation Metrics}
We evaluated our model using metrics for detection, classification, and segmentation, following the definitions provided in the HoVer-Net~\cite{graham2019hover} and PanNuke~\cite{gamper2020pannuke} papers. For classification and detection, we used F1 scores. The detection F1 score (\(F_{1,d}\)) measures the accuracy of nucleus centroid localization, while the classification F1 score (\(F_{1,c}\)) evaluates the accuracy of cell type predictions. For segmentation, we primarily report the Dice coefficient. While Panoptic Quality (PQ) has been widely used in digital pathology, recent studies~\cite{pq} have demonstrated that PQ is unsuitable for cell nucleus instance segmentation and classification tasks. Despite these limitations, we include PQ for comparative purposes, as it remains a commonly reported metric.

\paragraph{Experiments}
To evaluate the performance and efficiency of the DualU-Net, we conduct a series of experiments, including cell segmentation and classification benchmarking, inference time and computational efficiency analysis, and robustness assessment under staining variations. Detailed implementation settings for all experiments are provided in Appendix~\ref{ap:implementation}.

\paragraph{Cell Segmentation and Classification Results}

The performance of our models was evaluated on the PanNuke, CoNSeP, and Ki-67 datasets (see Appendix~\ref{ap:datasets}) and compared to state-of-the-art approaches (see Table~\ref{tab:merged_results}). On the PanNuke dataset, the \(F_{1,d}\) of our ResNeXt-based model (\(0.80\)) and ConvNeXt-based model (\(0.80\)) is comparable to HoVer-Net (\(0.80\)), and closely follows NuLite-M (\(0.83\)) and CellViT (\(0.82\)). Regarding classification metrics, our models achieved equivalent \(F_{1,c}\) for most categories while demonstrating superior performance in the less-represented Dead class (\(0.36\)) compared to HoVer-Net (\(0.31\)).

On the CoNSeP dataset, the \(F_{1,d}\) of our models (\(0.72\)) is comparable to HoVer-Net (\(0.75\)). Classification performance for specific cell types, such as Epithelial (\(0.62\)) and Inflammatory (\(0.63\)--\(0.64\)), aligns with state-of-the-art results, while our ResNeXt-based model achieves superior results for the less-represented Miscellaneous class (\(0.44\)) compared to HoVer-Net (\(0.43\)).

For the Ki-67 dataset, the \(F_{1,d}\) of our ResNeXt-based (\(0.80\)) and ConvNeXt-based (\(0.80\)) models is comparable to HoVer-Net (\(0.82\)). The \(F_{1,c}\) scores for the Negative, Positive, and Stroma classes show close agreement across all models, with our ConvNeXt-based model demonstrating a slight edge over our ResNeXt-based model in the Negative class (\(0.57\) vs.\ \(0.54\)) and the Stroma class (\(0.47\) vs.\ \(0.43\)).

Regarding segmentation metrics, our models also achieved reasonable results: on the PanNuke dataset, binary Panoptic Quality (bPQ) scores of \(0.55\) and \(0.56\); on CoNSeP, Dice scores of \(0.74\) and \(0.77\) and a bPQ of \(0.34\) for both models; and on Ki-67, a Dice score of \(0.83\) for both models, which is comparable to state-of-the-art models such as HoVer-Net (\(0.86\)).


A qualitative comparison of the results for our approaches and HoVer-Net across the three datasets is presented in Fig.~\ref{fig:qualitative_comparison}, highlighting that segmentation results are qualitatively equivalent. Note that segmentation primarily serves a visualization role, while classification and detection remain the key factors for clinical decision-making.



\begin{table}[t]
    \centering
    \caption{Performance across PanNuke, CoNSeP, and Ki-67 datasets. For PanNuke and Ki-67, the reported metrics represent the average across multiple dataset folds (3 and 4 respectively). The Dice metric for PanNuke and the mPQ for CoNSeP are not reported for state-of-the-art models, as they are not provided in the referenced papers. In Ki-67, HoVer-Net models were trained from scratch, and mPQ is not included due to the absence of an official implementation in its repository.}
    \resizebox{\textwidth}{!}{%
    \begin{tabular}{l|l|cccccc|ccc}
        \toprule
        & & \multicolumn{6}{c|}{\textbf{Classification and Detection $\uparrow$}} & \multicolumn{3}{c}{\textbf{Segmentation $\uparrow$}} \\
        \midrule
        \textbf{Dataset} & \textbf{Model} & \textbf{\(F_{1,d}\)} & \textbf{\(F_{1,c_1}\)} & \textbf{\(F_{1,c_2}\)} & \textbf{\(F_{1,c_3}\)} & \textbf{\(F_{1,c_4}\)} & \textbf{\(F_{1,c_5}\)} & \textbf{Dice} & \textbf{mPQ} & \textbf{bPQ} \\
        \midrule
        \midrule
        \rowcolor{white} PanNuke
        &  &  & Neo. & Non-Neo. & Inflam. & Connect. & Dead &  &  &  \\
        \midrule
        \rowcolor{gray!10} & HoVer-Net~\cite{gamper2020pannuke}  & 0.80 & 0.62 & 0.56 & 0.54 & 0.49 & 0.31 & -    & 0.46 & 0.66 \\
        & CellViT$_{256}$~\cite{cellvit}              & 0.82 & 0.69 & 0.70 & 0.58 & 0.52 & 0.37 & -    & 0.48 & 0.67 \\
        \rowcolor{gray!10}& NuLite-M~\cite{tommasino2024nulite}               & 0.83 & 0.70 & 0.73 & 0.58 & 0.52 & 0.37 & -    & 0.50 & 0.68 \\
        \midrule
        & Ours                                & 0.80 & 0.64 & 0.63 & 0.56 & 0.50 & 0.36 & 0.76 & 0.41 & 0.55 \\
        \rowcolor{gray!10}& Ours ConvNeXt                       & 0.80 & 0.66 & 0.61 & 0.58 & 0.53 & 0.36 & 0.80 & 0.41 & 0.56 \\
        \midrule
        \midrule
        \rowcolor{white}CoNSeP
        &  & & Epithelial & Inflammatory & Spindle & Misc. &  & &  &  \\
        \midrule
        \rowcolor{gray!10}& HoVer-Net~\cite{graham2019hover}    & 0.75 & 0.64 & 0.63 & 0.57 & 0.43 &     & 0.85 & - & 0.52 \\
        % & CellViT~\cite{cellvit}              & 0.75 & 0.64 & 0.63 & 0.57 & 0.43 &     & 0.85 & - & 0.52 \\
        \midrule
        \rowcolor{gray!10} & Ours                                & 0.72 & 0.62 & 0.63 & 0.56 & 0.44 &     & 0.77 & - & 0.34 \\
        & Ours ConvNeXt                       & 0.72 & 0.62 & 0.64 & 0.57 & 0.34 &     & 0.74 & - & 0.34 \\
        \midrule
        \midrule
        \rowcolor{white}Ki-67
        &  & & Negative & Positive & Stroma &  &  &  &  &  \\
        \midrule
        \rowcolor{gray!10}& HoVer-Net   & 0.82 & 0.56 & 0.65 & 0.50 &     &     & 0.86 & - & 0.69 \\
        \midrule
        & Ours                                & 0.80 & 0.54 & 0.66 & 0.43 &     &     & 0.83 & - & 0.62 \\
        \rowcolor{gray!10} & Ours ConvNeXt                       & 0.80 & 0.57 & 0.66 & 0.47 &     &     & 0.83 & - & 0.63 \\
        \bottomrule
    \end{tabular}
    }
    

    \label{tab:merged_results}
\end{table}


\paragraph{Inference Time and Computational Efficiency}

It has been well-established that HoVer-Net is not optimal for fast and efficient processing~\cite{baumann2024hovernext, tommasino2024nulite}. Given these limitations, we compare our models to HoVer-Net in terms of inference time and also evaluate computational efficiency against state-of-the-art models: CellViT (performance) and NuLite (efficiency). Our models significantly reduce inference time compared to HoVer-Net. On the CoNSeP test set, our ResNeXt-based and ConvNeXt-based models complete inference in 66.3s and 65.8s, respectively, achieving a \texttimes2.5 reduction over HoVer-Net (168.35s). On PanNuke, they process images in 108.1s and 137.6s, yielding up to a \texttimes5.1 speed-up over HoVer-Net (551.45s). For a fair comparison, both codes were implemented in Python, we used the official HoVer-Net repository, and we did not extensively optimize the DualU-Net inference code. Our significantly lower runtime arises from i) having only two decoder branches instead of three, ii) avoiding HV vector predictions, and iii) generating instance boundaries via watershed from centroid maps.

Despite having more parameters, our models improve computational efficiency. Our ResNeXt-based model surpasses NuLite-S, as shown in Table~\ref{tab:comp_eff}, achieving lower GFLOPs (30\% lower for \(1024 \times 1024\) images) and a significantly reduced latency. Our ConvNeXt-based model, despite its higher parameter count, remains competitive, requiring fewer GFLOPs than NuLite-M and achieving latency close to NuLite-S. These improvements highlight the efficiency of our approach in reducing computational overhead without sacrificing performance.

\begin{table}[t]
    \centering
     \caption{Performance comparison of different models for input sizes \(256 \times 256\) and \(1024 \times 1024\). Results are extracted from~\cite{tommasino2024nulite}.}
    \rowcolors{2}{gray!10}{white}
    \resizebox{0.75\textwidth}{!}{%
    \begin{tabular}{lcccccc}
        \toprule
        \textbf{Model} & \textbf{Nº Parameters (M)} & \multicolumn{2}{c}{\textbf{GFLOPs} $\downarrow$}   & \multicolumn{2}{c}{\textbf{Latency (ms)} $\downarrow$} \\
         &  & 256 & 1024 & 256 & 1024 \\
        \midrule
        CellViT$_{256}$ & 46.75 & 132.89 & 2125.94 & $35.71 \pm 0.37$ & $1169.7 \pm 148.92$ \\
        NuLite-S & 34.10 & 23.15 & 370.25 & $29.99 \pm 1.79$ & $310.44 \pm 24.64$ \\
        NuLite-M & 47.93 & 32.54 & 520.45 & $33.37 \pm 1.34$ & $446.3 \pm 35.25$ \\
        \hline
        Ours & 41.01 & 16.26 & 260.23 & $12.05 \pm 0.41$ & $141.88 \pm 0.69$ \\
        Ours ConvNeXt & 97.81 & 26.78 & 428.49 & $20.82 \pm 0.17$ & $264.19 \pm 1.48$ \\
        \bottomrule
    \end{tabular}
    }
   \label{tab:comp_eff}
\end{table}


\paragraph{Robustness to Color Variations}

Histopathological WSIs often exhibit color variations due to inconsistencies in staining protocols and scanning conditions, which can affect model performance. To evaluate the robustness of our models to these variations, we generated five augmented versions of the CoNSeP test dataset. This augmentation involved random 90-degree rotations, flips, and perturbations in the Hematoxylin-Eosin-DAB (HED) color space. This introduces realistic staining variations, enabling a more comprehensive assessment of model stability (see Appendix~\ref{ap:examples_var} for examples and generation details). The evaluation was conducted for our main approach, the ResNeXt-based model. It demonstrates 49.1\% lower variance in \(F_{1,d}\) (\(0.70 \pm 0.0086\)) compared to HoVer-Net (\(0.73 \pm 0.0169\)), suggesting greater consistency under varying staining conditions. Similarly, the mean \(F_{1,c}\) of our model (\(0.49 \pm 0.0367\)) exhibits 15.6\% reduced variance compared to HoVer-Net (\(0.50 \pm 0.0435\)). In segmentation, our model also shows 45.3\% lower variance in Dice score (\(0.75 \pm 0.0093\)) compared to HoVer-Net (\(0.82 \pm 0.0170\)), further highlighting its robustness.



