\documentclass{midl} % Include author names

% The following packages will be automatically loaded:
% jmlr, amsmath, amssymb, natbib, graphicx, url, algorithm2e
% ifoddpage, relsize and probably more
% make sure they are installed with your latex distribution
% \usepackage[dvipsnames]{xcolor}
\usepackage{multirow}
\usepackage{bbm}
\usepackage{dsfont}
\usepackage{bm}
\usepackage{amsmath}
\usepackage{graphicx}
\usepackage{comment}
%\usepackage{caption}
% \usepackage{subfig}
\usepackage{makecell}
% \usepackage{xcolor}

% \usepackage{tocloft}  % TOC customization


% \usepackage[toc,page,header]{appendix}
\usepackage[nohints]{minitoc}


\renewcommand \thepart{}
\renewcommand \partname{}

\DeclareMathOperator{\tr}{tr}

\newcommand{\red}[1]{{\color{red}#1}}
\newcommand{\todo}[1]{{\color{red}#1}}
\newcommand{\TODO}[1]{\textbf{\color{red}[TODO: #1]}}

\newcommand\tp{\texttt{TP}\xspace}
\newcommand\tn{\texttt{TN}\xspace}
\newcommand\fp{\texttt{FP}\xspace}
\newcommand\fn{\texttt{FN}\xspace}

\newcommand\pxtp{\texttt{TP*}\xspace}
\newcommand\pxtn{\texttt{TN*}\xspace}
\newcommand\pxfp{\texttt{FP*}\xspace}
\newcommand\pxfn{\texttt{FN*}\xspace}

\newcommand\glas{\texttt{GlaS}\xspace}
\newcommand\camsixteen{\texttt{CAMELYON16}\xspace}

\newcommand\beloc{\texttt{B-LOC}\xspace}
\newcommand\becl{\texttt{B-CL}\xspace}

\newcommand\pxap{\texttt{PxAP}\xspace}
\newcommand\pxprec{\texttt{PxPrec}\xspace}
\newcommand\pxrec{\texttt{PxRec}\xspace}

\newcommand\cmtx{\texttt{Confusion Matrix}\xspace}
\newcommand\cl{\texttt{CL}\xspace}

\newcommand\pixelcam{\texttt{PixelCAM}\xspace}
\newcommand{\reals}{\mathbb{R}}
\newcommand{\abs}[1]{\ensuremath{\left| #1 \right|}}
\newcommand{\update}[1]{\textcolor{red}{#1}}

\usepackage{mwe} % to get dummy images
\jmlrvolume{MIDL - 154}
\jmlryear{2025}
\jmlrworkshop{Full Paper -- MIDL 2025}
\editors{Accepted for publication at MIDL 2025}

% \title[\pixelcam: Pixel Class Activation Mapping for Histopathology]{\pixelcam: Pixel Class Activation Mapping for Histopathology}
\title[\pixelcam: Pixel Class Activation Mapping for Histology Image Classification and ROI Localization]{\pixelcam: Pixel Class Activation Mapping for Histology Image Classification and ROI Localization}


% More complicate cases, e.g. with dual affiliations and joint authorship
\midlauthor{\Name{Alexis Guichemerre\nametag{$^{1}$}} \Email{alexis.guichemerre.1@ens.etsmtl.ca}\\
\Name{Soufiane Belharbi\nametag{$^{1}$}} \Email{soufiane.belharbi@etsmtl.ca}\\
\Name{Mohammadhadi Shateri\nametag{$^{1}$}} \Email{mohammadhadi.shateri@etsmtl.ca}\\
\Name{Luke McCaffrey\nametag{$^{2}$}} \Email{luke.mccaffrey@mcgill.ca}\\
\Name{Eric Granger\nametag{$^{1}$}} \Email{Eric.Granger@etsmtl.ca}\\
\addr $^{1}$ LIVIA, Dept. of Systems Engineering, ETS Montreal, Canada \\
\addr $^{2}$ Goodman Cancer Research Centre, Dept. of Oncology, McGill University, Canada
}

\begin{document}

\maketitle

\doparttoc % Tell to minitoc to generate a toc for the parts
\faketableofcontents % Run a fake tableofcontents command for the partocs

% \part{} % Start the document part
% \parttoc % Insert the document TOC

\begin{abstract}
Weakly supervised object localization (WSOL) methods allow training models to classify images and localize ROIs. WSOL only requires low-cost image-class annotations yet provides a visually interpretable classifier, which is important in histology image analysis. Standard WSOL methods rely on class activation mapping (CAM) methods to produce spatial localization maps according to a single- or two-step strategy. While both strategies have made significant progress, they still face several limitations with histology images. Single-step methods can easily result in under- or over-activation due to the limited visual ROI saliency in histology images and scarce  localization cues. They also face the well-known issue of asynchronous convergence between classification and localization tasks. The two-step approach is sub-optimal because it is constrained to a frozen classifier, limiting the capacity for localization. Moreover, these methods also struggle when applied to out-of-distribution (OOD) datasets.
%%%
In this paper, a multi-task approach for WSOL is introduced for simultaneous training of both tasks to address the asynchronous convergence problem.  In particular, localization is performed in the pixel-feature space of an image encoder that is shared with classification. This allows learning discriminant features and accurate delineation of foreground/background regions to support ROI localization and image classification. 
We propose \pixelcam, a cost-effective foreground/background pixel-wise classifier in the pixel-feature space that allows for spatial object localization. Using partial-cross entropy, \pixelcam is trained using pixel pseudo-labels collected from a pretrained WSOL model. Both image and pixel-wise classifiers are trained simultaneously using standard gradient descent. In addition, our pixel classifier can easily be integrated into CNN- and transformer-based architectures without any modifications.  
%%%
Our extensive experiments\footnote{\href{https://github.com/AlexisGuichemerreCode/PixelCAM}{https://github.com/AlexisGuichemerreCode/PixelCAM}} on \glas and \camsixteen cancer datasets show that \pixelcam can improve classification and localization performance when integrated with different WSOL methods.  Most importantly, it provides robustness on both tasks for OOD data linked to different cancer types, with large domain shifts between training and testing image data. 
\end{abstract}



\begin{keywords}
Deep Learning, Image Classification, Visual Interpretability, Weakly Supervised Object Localization, Histology Images, Out-Of-Distribution Data.
\end{keywords}


% \doparttoc % Tell to minitoc to generate a toc for the parts
% \faketableofcontents % Run a fake tableofcontents command for the partocs

% \part{} % Start the document part
% \parttoc % Insert the document TOC


%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%
\section{Introduction} 
\label{sec:intro}

Histology image analysis remains the gold standard for diagnosing cancers of the brain~\cite{khalsa2020automated}, breast~\cite{Veta2014}, and colon~\cite{xu2020colorectal}. However, training deep learning (DL) models for accurate localization of cancerous regions of interest (ROIs) requires pixel-wise annotation by pathologists, which is a complex and time-consuming task, especially over Whole Slide Images (WSIs). Weakly supervised object localization (WSOL) has emerged as a low-cost training approach~\cite{zhou2017brief}. Using only image class supervision, a DL model can be trained to classify an image, but also to provide spatial localization of ROIs associated with that class~\cite{negev,choe2020evaluating,rpwsol,rony23}. This significantly reduces the large cost of annotation and the need for dense pixel supervision. In addition, it builds interpretable classifiers~\cite{belharbi2022minmaxuncer,neto24}, which is critical in medical domains such as histology image analysis.

Recent progress in WSOL is dominated by CAM methods~\cite{rony23}. Several methods extract spatial activation maps in a single step~\cite{deepmil,oquab2015object,SAT}, and require a module on top of a feature extractor to construct these maps. Then, a spatial pooling module extracts per-class scores used for supervised training using image-class labels. However, without any explicit guidance at the pixel level, these CAM methods can lead to poor maps due to the challenging nature of histology images where objects are often not salient~\cite{rony23}. This can lead to under- or over-activation which creates a high false negative/positive rate at the pixel level. Moreover, training a single model may face the issue of asynchronous convergence of classification and localization tasks~\cite{choe2020evaluating,rpwsol,rony23}.

To bypass this convergence issue and the lack of pixel guidance, a recent direction in WSOL aims at providing explicit localization cues as pseudo-labels with dual models~\cite{negev,fcam,Murtaza2023dips,wei2021shallowspol,zhang2020rethinking,murtaza2025ted,murtaza2022dipssypo,zhao2023generative,murtaza2022dips}. In particular, a per-task model is considered where the localizer is trained using pseudo-labels~\cite{zhao2023generative}. This leads to more parameters and training cycles. In addition, both tasks are disconnected, leading to unrelated decisions in both models.
A different approach~\cite{negev} uses a single model where a decoder is combined with a frozen classifier. 
While this yields good results, this architecture is limited as it is tied to frozen features at many layers using skip connections. This prevents the localizer from adapting further and hinders its performance.

In addition, recent work has revealed a major limitation of WSOL methods when dealing with domain shift in histology analysis~\cite{sfdawsol}. The performance of WSOL methods is shown to decline on both tasks with out-of-distribution (OOD) data, limiting their real-world application. Further inspection suggests that features at the pixel level can be the root cause. Since it lacks direct localization supervision, a feature encoder may provide features that are poorly separated with respect to classes, making them vulnerable to domain shift and class confusion.

To alleviate the aforementioned issues, we propose a novel multi-task WSOL method for histology image analysis. The method, called \pixelcam, is based on a cost-effective foreground/background (FG/BG) pixel-wise classifier working in the pixel-feature space, allowing for spatial object localization.
It aims to explicitly learn discriminant pixel-feature representations and accurate delineation of FG and BG regions. This improves the ROI localization and image classification accuracy. Training is achieved through localization pseudo-labels extracted from a pretrained WSOL model. 
In addition, such pixel-wise classification provides the model with robustness to OOD data. Our single-step multi-task framework allows classification and localization tasks to be performed simultaneously.
%
Multi-task training is therefore leveraged to learn rich features for both tasks, compared to the constrained learning of features in a two-step approach. Our approach cooperates to converge to a satisfying solution for both tasks. The multi-task optimization is performed using standard gradient descent by using image-class labels and pixel-wise pseudo-labels extracted from a pretrained WSOL model.
%



Our main contributions are summarized as follows. 
%
\textbf{(1)} A novel multi-task WSOL method called \pixelcam is proposed for histology image analysis. Multi-task optimization of both classification and localization tasks is achieved in a single step by integrating a pixel classifier at the pixel-feature level while leveraging localization pseudo-labels. \pixelcam alleviates the asynchronous convergence issue in WSOL, improves the performance of both tasks, and importantly, provides robustness to OOD data. Our pixel classifier is versatile in that it can easily be integrated into any CNN- or transformer-based classifier architecture without modification.
%
\textbf{(2)} We conduct extensive experiments on two public datasets for colon (\glas), and breast cancer (\camsixteen). Our method outperforms WSOL baseline methods on both tasks. Additionally, when dealing with large domain shifts across cancer types, our \pixelcam method can maintain a high level of accuracy, making it an ideal choice for OOD scenarios. We provide several ablations to further analyze our method.

 

%%%%%%%%%%%%%%%%%%%%%%%%%
\section{Proposed Method} 
\label{sec:proposed}

\begin{figure*}[!ht]
    \centering
    \includegraphics[width=0.99\linewidth]{proposal}
    \caption{
    \textbf{Top row: WSOL learning strategies}. \textit{(a) Single-step WSOL methods} perform classification and localization in one step. While standard WSOL uses a sequential approach using only image-class and lacks leveraging localization cues, \pixelcam relies on a multi-task training using both image-class and localization cues. \textit{(b) Two-step WSOL methods} use a model per task or an encoder-decoder which are trained sequentially. 
    \textbf{Bottom row: \pixelcam training (c-1) and inference (c-2)}. The training relies on pseudo-labels collected from a WSOL CAM classifier pretrained on the same dataset ${\mathbb{D}}$. The classification head aims to classify the extracted features ${\mathsf{F}}$. The pixel-wise classifier predicts the randomly selected locations as foreground or background. At test time, all locations are classified using the localization head ${f}$ producing a localization map, while classifying the whole image using the classification head ${g}$.
    }
    \label{fig:proposal}
     \vspace{-1em} 
\end{figure*}

% \noindent\textbf{Notation}. 
Let us denote by ${\mathbb{D} = \{(\bm{X}, y)_i\}_{i=1}^N}$ a training set with $N$ samples, where ${\bm{X}: \Omega \subset \reals^2}$ is a 2D input image of dimension ${h^{\prime}\times w^{\prime}}$ and ${y \in \{1, \cdots, K\}}$ is its global label, with $K$ being the number of classes. Our model is composed of three parts (Fig.~\ref{fig:proposal}): (a) feature extractor, (b) image classifier, and (c) pixel-wise classifier.
%
(a) The feature extractor backbone $h$ with parameters ${\bm{\theta}_1}$ produces a tensor of spatial features ${h(\bm{X}) = \mathsf{F} \in \reals^{h^{\prime}\times w^{\prime} \times d}}$, with depth $d$. For simplicity, we consider that $\mathsf{F}$ has the same dimensions as the input image ${\bm{X}}$ through interpolation.
%
(b) The global image classifier head $g$ classifies the image content, with parameters ${\bm{\theta}_2}$, where  ${g(\mathsf{F})}$ is the per-class probability.
%
(c) The pixel-wise\footnote{For simplicity, we refer to a location in a spatial tensor as a pixel. However, in DL models, such location typically covers more than one pixel in the image space due to the receptive field. Interpolation can be used to upscale the features to the full image size.} classifier $f$ which classifies the embedding ${\mathsf{F}_{i, j, :} = \mathsf{F}_p \in \reals^d}$ of a pixel at location ${(i, j, :)}$ or simply $p$ in ${\mathsf{F}}$ with parameters ${\bm{\theta}_3}$ into either foreground (1) or background (0) classes. This is typically a linear classifier.
%
We refer to ${f(\mathsf{F}_p)_0}$ and ${f(\mathsf{F}_p)_1}$ as the probabilities for the pixel at location $p$ to be background and foreground, respectively. For simplicity, we use ${f(\mathsf{F}_p)}$ for the pixel-class probability.
Classifying all locations creates the two localization maps ${\mathsf{S} = f(\mathsf{F}) \in [0, 1]^{h^{\prime}\times w^{\prime} \times 2}}$, where ${\mathsf{S}_{:, :, 0}, \mathsf{S}_{:, :, 1}}$ or simply ${\mathsf{S}^0, \mathsf{S}^1}$ refer to the background and foreground maps, respectively. For simplicity, we refer to ${y^{\prime} \in \{0, 1\}}$ as the pseudo-label of a pixel location ${p}$.
%
Let us denote the standard cross-entropy by $\mbox{H}(\cdot, \cdot)$. The full set of parameters of our model is denoted by ${\bm{\theta}}$, which is composed of ${ \{\bm{\theta}_1, \bm{\theta}_2, \bm{\theta}_3\}}$. When ${y, y^{\prime}}$ are used with ${\mbox{H}}$, it is the one-hot encoding version that is being considered.

To train the model for full image classification, we use standard cross-entropy over $g$:

\begin{equation}
\label{eq:image_cl}
\begin{aligned}
\min_{\bm{\theta}_1, \bm{\theta}_2} \quad &  \mbox{H}(y, g(\mathsf{F})) \;.
\end{aligned}
\end{equation}



\noindent\textbf{Pixel-wise classifier}. Training the model using only Eq.~\ref{eq:image_cl} builds a spatial feature ${\mathsf{F}}$ that is guided only by the image classification loss. It therefore lacks awareness of localization as it cannot directly access localization supervision cues. This may lead to poor pixel-wise features and confuse classes at the feature level. This is particularly true when dealing with less salient and strongly similar objects, as found in histology images. Subsequently, localization on top of these features is less reliable. Most importantly, introducing domain shift over the input image will lead to further feature confusion and failure by subsequent modules to localize and classify. To mitigate this issue, our pixel-classifier $f$ is used to create well-separated features between foreground/background in the pixel-feature space.

Training $f$ requires localization cues. Since the WSOL setup does not provide them, we resort to using a pretrained WSOL CAM classifier trained on the same train dataset ${\mathbb{D}}$, based on CNN or transformer, to acquire pseudo-labels. This approach is typically employed in WSOL to train a new DL model or a decoder for localization~\cite{rony23}. However, this is cumbersome and requires another training phase with many additional parameters.  
To prevent this issue, our method trains a linear model to classify pixel embeddings while training the image classifier and sharing a full backbone. The number of parameters and training cycles is thereby reduced considerably.

To produce pixel-wise pseudo-labels, standard WSOL methods~\citep{rony23} are adopted. We use the CAM ${\bm{C}}$ of the true image class $y$ from a WSOL CAM classifier pretrained on the same train dataset ${\mathbb{D}}$. Strong activations typically indicate foreground regions, while low activations point to background regions~\cite{durand2017wildcat}. Instead of directly fitting these regions, recent works showed that it is better to stochastically sample regions to avoid overfitting~\cite{Murtaza2023dips,fcam,murtaza2022dips}. To sample $n$ random foreground pixel locations, a multinomial distribution is used with ${\bm{C}}$ as its pixel-sampling probability. Typically, this allows for sampling more frequently from high-activation locations. For background sampling, ${1 - \bm{C}}$ is instead used as a sampling probability. Sampled pixels are collected in the sets ${\mathbb{C}^+, \mathbb{C}^-}$ for foreground and background pixels, respectively. Note that sampled locations change for each image at every training step. This allows exploring different regions, and it reduces overfitting to specific regions. The sampled pseudo-labels can be directly used to train our pixel classifier, using partial cross-entropy since only part of the image space ${\Omega}$ is considered at once,

\begin{equation}
\label{eq:pixel_cl}
\begin{aligned}
\min_{\bm{\theta}_1, \bm{\theta}_3} \quad &  \sum_{p \in \{\mathbb{C}^+ \; \cup\; \mathbb{C}^- \}} \mbox{H}(y^{\prime}, f(\mathsf{F}_p)) \;.
\end{aligned}
\end{equation}

\noindent The pixel pseudo-label ${y^{\prime}}$ is collected based on the CAM of the true image class ${y}$. Therefore, ${y^{\prime}}$ can be perceived as an instance of the object in that image. Our pixel-classifier learns to directly recognize the class $y$, allowing it to perform object localization.

\noindent\textbf{Total training loss}. Unlike previous works that use a two-step approach and train different models for classification and localization, \pixelcam trains a single model with two heads simultaneously. A multi-task optimization setup is considered to train for both tasks, with most parameters being shared. To this end, we use the following composite loss,

\begin{equation}
\label{eq:totla_loss}
\begin{aligned}
\min_{\bm{\theta}} \quad & \mbox{H}(y, g(\mathsf{F})) + \lambda  \sum_{p \in \{\mathbb{C}^+ \; \cup\; \mathbb{C}^- \}} \mbox{H}(y^{\prime}, f(\mathsf{F}_p)) \;,
\end{aligned}
\end{equation}

\noindent where ${\lambda}$ is a balancing coefficient. Eq.~\ref{eq:totla_loss} is minimized using standard gradient descent to perform both tasks. This mitigates the convergence issue and allows a single training cycle. It also ensures cooperation between localization and classification to reach a better solution for both tasks at the same time.
\pixelcam does not involve a considerable computation overhead. Given our lightweight pixel classifier, the number of additional parameters is negligible compared to standard DL models. Moreover, this pixel classifier is generic and can be integrated into CNN- or transformer-based models without architectural changes. 
%
Our pixel-classifier creates a boundary in the pixel feature space to well separate ROIs from noise and increase the feature's discriminant power. This helps localization, but also the subsequent classification module as they use the same spatial features for decision. Consequently, it makes features robust to OOD data, allowing better performance of both tasks in such scenarios.
%
In Section~\ref{sec:results}, we show that including this linear classifier into a standard deep WSOL model allows us to improve its localization and classification accuracy. Moreover, it provides robustness to domain shift, a common issue in WSOL models for histology image analysis~\cite{sfdawsol}.



%\begin{comment}
\begin{figure}[!t]
  \includegraphics[width=\linewidth, trim=0 145 0 0, clip]{PixelCAM_GLAS.png}
  \caption{
  Standard WSOL setup. First column: input images from \glas. Second column: Ground truth. Next columns: We display the visual CAM results for WSOL baseline without and with \pixelcam, respectively.
  }
  \label{fig:visual-example-glas}
\end{figure}
%\end{comment}





%%%%%%%%%%%%%%%%%%%%%
\section{Results and Discussion} 
\label{sec:results}

\subsection[Experimental Methodology]{Experimental Methodology\footnote{More detailed descriptions of the datasets and implementation are in Appendices~\ref{sec:datasets-details} and~\ref{sec:training-details}, respectively.}}
% B and C,
 


%%%%%%%%%%
\noindent\textit{(a) Datasets.} Our experiments are performed on two standard public datasets for WSOL in histology images. In particular, we use \glas dataset~\cite{sirinukunwattana2015stochastic} for colon cancer, and the patch version of \camsixteen~\cite{camelyon2016paper,rony23} for breast cancer. In both cases, we follow the same WSOL experimental protocol used in~\cite{negev,sfdawsol,rony23}.

%%%%%%%%%%%%
\noindent\textit{(b) Implementation details.}
%
To assess the impact of our method, we first train a baseline WSOL method alone, and then contrast its performance when combined with our method. The pseudo-labels are extracted from the baseline model. 
Different recent WSOL baseline methods are used from single- and two-step families, including DeepMIL~\cite{deepmil}, GradCAM++~\cite{gradcampp}, LayerCAM~\cite{layercam}, SAT~\cite{SAT}, and NEGEV~\cite{negev}. For CNN-based models, we used ResNet-50 as the backbone, and for SAT, the transformer-based model DeiT-Tiny is used.
%
In terms of performance measures, standard WSOL metrics are used, including image classification accuracy, and pixel-localization accuracy \pxap~\cite{choe2020evaluating, fcam, negev, sfdawsol}. In addition, pixel-wise true/false positive/negative rates are used as well. For the localization task, we compare all WSOL methods to the fully supervised case using the U-Net model.




\setlength{\tabcolsep}{3pt}
\renewcommand{\arraystretch}{1.0}
\begin{table}[!t]
\vspace{-4pt}
\centering
\resizebox{.9\textwidth}{!}{
\begin{tabular}{ | l  l | c c c | c  c c c|}
\hline
& &  & \multicolumn{2}{c}{\glas}  & & \multicolumn{2}{c}{\camsixteen} &\\
& \textbf{WSOL models} &  & \pxap  \(\uparrow\)& \cl \(\uparrow\) &  & \pxap \(\uparrow\) & \cl \(\uparrow\)& \\
\hline \hline
&  DeepMIL~\cite{deepmil} ICML $\dagger$  &  & 79.9 &\textbf{100.0}& &71.3 & 85.0& \\
&  DeepMIL w/ NEGEV~\cite{negev} MIDL $\star$ &  &\textbf{85.9} & \textbf{100.0}& &\underline{75.6}&\textbf{89.9}& \\
&  DeepMIL w/ \pixelcam  $\dagger$ &  & \underline{85.5} &\textbf{100.0}& &\textbf{75.7}& \underline{88.2}& \\
\hline
&  GradCAM{\textit{++}}~\cite{gradcampp} WACV $\dagger$ &  & 76.8 &\textbf{100.0}& &49.1 & 63.4& \\
&  GradCAM{\textit{++}} w/ NEGEV~\cite{negev} MIDL $\star$ &  &\underline{77.1}& \textbf{100.0}& &\textbf{68.6}& \textbf{89.4}& \\
& GradCAM{\textit{++}} w/ \pixelcam  $\dagger$ & & \textbf{86.6} &\textbf{100.0}& &\underline{64.1} & \underline{85.1}& \\
\hline
&  LayerCAM~\cite{layercam} IEEE TIP $\dagger$ &  & \underline{75.1}&\textbf{100.0}& &33.2& 84.8& \\
&  LayerCAM w/ NEGEV~\cite{negev} MIDL $\star$ &  &73.8& \textbf{100.0}& &\textbf{66.8}&\textbf{89.1}& \\
&  LayerCAM w/ \pixelcam $\dagger$ & & \textbf{83.6}&\textbf{100.0}& &\underline{66.2}&\textbf{89.1}& \\
\hline
&  SAT~\cite{SAT} ICCV $\dagger$ &  &\underline{65.9} &\underline{98.8}& &\underline{32.8} & \underline{83.2}& \\
&  SAT w/ \pixelcam $\dagger$ &  & \textbf{79.1}&\textbf{100.0}& &\textbf{51.2}&\textbf{87.2}& \\
\hline
& U-Net~\citep{unet} MICCAI &  &95.8&n/a & &81.6 & n/a& \\
\hline
\end{tabular}
}
\caption{Localization (\pxap) and classification (\cl) accuracy on \glas and \camsixteen test sets. $\dagger$ refers to models with a one-step approach, while $\star$ refers to models from the two-step family.}
\label{tab:accuracy-bloc-source}
\end{table}


%%%%%%%%%%%%%%%%%%%%%%%%%%%%%T-STAT%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\setlength{\tabcolsep}{3pt}
\renewcommand{\arraystretch}{1.0}
\begin{table}[!t]
\centering
\resizebox{0.9\textwidth}{!}{
\small
\begin{tabular}{ | l  l | c c | c c |}
\hline
& & \multicolumn{2}{c|}{\glas} & \multicolumn{2}{c|}{\camsixteen} \\
& \textbf{WSOL models} & \textbf{T-stat} $\uparrow$ & \textbf{p-value} $\downarrow$ & \textbf{T-stat} $\uparrow$ & \textbf{p-value} $\downarrow$ \\
\hline \hline
& DeepMIL~\cite{deepmil} ICML $\dagger$ & 10.8 & $1.5 \times 10^{-11}$ & 4.8 & $4.1 \times 10^{-5}$ \\
& GradCAM{\textit{++}}~\cite{gradcampp} WACV $\dagger$ & 29.1 & $1.8 \times 10^{-22}$ & 17.9 & $7.4\times 10^{-17}$ \\
& LayerCAM~\cite{layercam} IEEE TIP $\dagger$ & 12.7 & $4.0 \times 10^{-13}$ & 26.5 & $2.2 \times 10^{-21}$ \\
& SAT~\cite{SAT} ICCV $\dagger$ & 58.3 & $8.9 \times 10^{-31}$ &99.7 & $2.8 \times 10^{-37}$ \\
\hline
\end{tabular}
}
\caption{T-test statistics between baselines (DeepMIL, GradCAM{\textit{++}}, LayerCAM, SAT) and our method (\pixelcam) for localization performance.}
\label{tab:tstat-results}
\vspace{-2.5em}
\end{table}




%%%%%
\subsection{WSOL Results}

We first evaluate \pixelcam using the standard WSOL protocol over the \glas and \camsixteen datasets~\cite{rony23}.
Results are reported in Tab.~\ref{tab:accuracy-bloc-source}.
Combining our method with a WSOL baseline often provides an improvement in localization accuracy. 
Over \glas dataset, our method improves the \pxap performance by (+5.6\%,+9.8\%,+8.5\%) compared to the WSOL baseline methods (DeepMIL, GradCAM++, LayerCAM) respectively, while on \camsixteen dataset, our method gains +4.4\%,+15.0\%,+33.0\%, respectively. Fig.~\ref{fig:visual-example-glas} shows visual prediction examples. The localization performance gains are supported by statistical significance, as confirmed by t-tests, in Tab.~\ref{tab:tstat-results}, yielding very low p-values (below 0.05).
Additionally, our method improves classification performance. This indicates that well-separated classes at pixel-feature level in our model help improve localization but also facilitate global image classification. We further provide measures of how well FG and BG pixel-features are separated in Fig.~\ref{fig:histogram_glas_separability} on the \glas test set. These results show that the \pixelcam histogram of the class separability index is shifted to the right compared to the baseline. This indicates better class separability.

\begin{figure}[!htb]
\centering
\includegraphics[width=0.9\linewidth]{visualization-GLAS-all_histograms_source.png}
\caption{Histogram of foreground/background separability index for WSOL baseline methods alone vs when combined with \pixelcam on \glas test set. The higher the value, the better the separability is.
The definition of class separability is in Appendix~\ref{sec:class_separability}.
}
  \label{fig:histogram_glas_separability}
\end{figure}



\subsection{Out-of-Distribution Results}


{
\setlength{\tabcolsep}{3pt}
\renewcommand{\arraystretch}{1.0}
\begin{table}[!t]
\centering
\resizebox{0.9\textwidth}{!}{
\small
\begin{tabular}{ | l  l | c c c | c  c c c|}
\hline
& &  & \multicolumn{2}{c}{\camsixteen $\rightarrow$ \glas}  & & \multicolumn{2}{c}{\glas  $\rightarrow$ \camsixteen} &\\
& \textbf{WSOL models} &  & \pxap \(\uparrow\) & \cl \(\uparrow\) &  & \pxap \(\uparrow\)& \cl \(\uparrow\)& \\
\hline \hline
&  DeepMIL~\cite{deepmil} ICML $\dagger$ &  &\underline{64.5} &\underline{81.2}& &\underline{29.0} &\textbf{55.2}& \\
&  DeepMIL w/ NEGEV~\cite{negev} MIDL $\star$ &  &62.7 &\underline{81.2}& &27.8 &\underline{52.9}& \\
&  DeepMIL w/ \pixelcam $\dagger$ &  &\textbf{69.1} &\textbf{83.8}& &\textbf{30.2} &52.5& \\
\hline
&  GradCAM{\textit{++}}~\cite{gradcampp} WACV $\dagger$ &  & 52.9 &53.7& &\textbf{39.1} &52.4& \\
&  GradCAM{\textit{++}} w/ NEGEV~\cite{negev} MIDL $\star$ &  &\textbf{64.5} &\textbf{75.0}& &24.2 &\underline{56.7}& \\
&   GradCAM{\textit{++}} w/ \pixelcam $\dagger$ &  & \underline{56.2} &\underline{71.2}& &\underline{36.9} & \textbf{63.3}& \\
\hline
&  LayerCAM~\cite{layercam} IEEE TIP $\dagger$ &  & 59.5 &77.5& &30.4 &51.4& \\
&   LayerCAM w/ NEGEV~\cite{negev} MIDL $\star$ &  & \underline{63.9} &\textbf{83.7}& &\underline{27.9}& \underline{52.3}& \\
&   LayerCAM w/ \pixelcam $\dagger$ &  & \textbf{67.2} &\underline{78.8}& &\textbf{34.8}& \textbf{55.8}& \\
\hline
&  SAT~\cite{SAT} ICCV $\dagger$ &  & 50.7 &67.5& &\textbf{24.3} &50.2& \\
&   SAT w/ \pixelcam $\dagger$ &  & \textbf{55.1} &\textbf{78.8}& &22.5&\textbf{50.4}& \\
\hline
\end{tabular}
}
\caption{
OOD results: Localization (\pxap) and classification (\cl) accuracy on target test dataset: \glas and \camsixteen. The symbol ``$\dagger$'' refers to models with a one-step approach, while ``$\star$'' refers to models from the two-step family.}
\label{tab:accuracy-bloc-target}
\vspace{-2.5em}
\end{table}

}


Additional experiments were conducted to assess the ability of \pixelcam to train models that are robust to OOD data. To this end, consider the domain shift between \glas (colon cancer) and \camsixteen (breast cancer).
This simulates a domain adaptation problem in the source-only case, where a model is pre-trained on source domain data, and then evaluated on target data. Results are reported in Tab.~\ref{tab:accuracy-bloc-target}. As shown in~\cite{sfdawsol}, the performance of WSOL methods typically declines when facing OOD histology data. Overall, our \pixelcam method yields better performance over both tasks and across most baselines. This is mainly due to better separated pixel-features learned by our model, which make it more robust to OOD data. This is demonstrated in Appendix~\ref{sec:class_separability} by inspecting the class separability of pixel-features of the target data. As presented in Tab.~\ref{tab:accuracy-bloc-target}, using \pixelcam generally outperforms the WSOL baselines alone in the case \mbox{\camsixteen $\rightarrow$ \glas}. This improves \pxap scores by 4.6\%, 3.3\%, 7.7\%, and 4.4\% compared to DeepMIL, GradCAM++, LayerCAM, and SAT, respectively. Moreover, \pixelcam also improves image classification performance. The other case, \mbox{\glas $\rightarrow$ \camsixteen}, is much more challenging since the source training dataset is very small compared to the very large and difficult target set. While the overall performance is low compared to when \glas is the target, our method still yields better localization performance in general. Moreover, classification performance is improved overall across both scenarios.
More results and interpretations are included in Appendices~\ref{sec:ablations} and~\ref{sec:appendix_visualization}.


We further evaluate the robustness of \pixelcam by altering the stainings in the test sets of \glas using stain styles from the other dataset. The stainings were progressively modified by selecting stain variations ordered by their distance to the original stain. Stain 1 introduces a minor alteration, while stain 10 represents a substantial shift. This setup simulates an increasing staining shift while keeping the organ type unchanged. Additional results for \camsixteen are presented in Appendix~\ref{sec:staining}.

\begin{figure}[t]
  \centering
  \includegraphics[width=0.8\linewidth]{visualization-diff_stainings-classification_comparison_glas.png}
  \caption{Classification (\cl) accuracy on \glas test sets with LayerCAM and \pixelcam with different stainings.
  }
  \label{fig:stain-cl-glas}
\end{figure}

\begin{figure}[t]
  \centering
  \includegraphics[width=0.8\linewidth]{visualization-diff_stainings-localization_comparison_glas.png}
  \caption{Localization (\pxap) accuracy on \glas test sets with LayerCAM and \pixelcam with different stainings.
  }
  \label{fig:stain-loc-glas}
\end{figure}
\section{Conclusion} 
\label{sec:conclusion}
In this paper, \pixelcam is introduced as a multi-task single-step WSOL method. It is based on a cost-effective FG/BG pixel-wise classifier in the pixel-feature space, allowing for spatial object localization. It leverages pixel pseudo-label cues for localization. It aims at explicitly learning discriminant pixel-features and accurately separating FG/BG regions, promoting better ROI localization and image classification. Both tasks are optimized in parallel without incurring notable computational cost or additional parameters.
%
Our pixel classifier is versatile. It can easily be integrated into any CNN- or transformer-based classifier architecture without modification.
%
Results on two histology datasets showed that \pixelcam can not only improve WSOL baseline methods over both tasks, but also make them more robust to OOD data. However, there is still room to improve our method by better optimizing both tasks while reducing their mutual negative impact. In addition, dealing with OOD data in WSOL scenarios is still an open issue, and our method still requires more improvements despite its progress.

\newpage
\midlacknowledgments{This research was supported in part
by the Canadian Institutes of Health Research, the Natural
Sciences and Engineering Research Council of Canada, and
the Digital Research Alliance of Canada.
}


%%%%%%%%%%%%%%%%%

%

%
\bibliography{midl25_154}
\input{supp-mat.tex}

\end{document}
