\documentclass{midl} % Include author names

% The following packages will be automatically loaded:
% jmlr, amsmath, amssymb, natbib, graphicx, url, algorithm2e
% ifoddpage, relsize and probably more
% make sure they are installed with your latex distribution
\DeclareUnicodeCharacter{04D3}{\"a}
\usepackage{mwe} % to get dummy images
\jmlrvolume{-- Under Review}
\jmlryear{2026}
\jmlrworkshop{Full Paper -- MIDL 2026 submission}
\editors{Under Review for MIDL 2026}

\title[Better Weakly Supervised Quantization]{Beyond Classification: Elaborating Network Predictions for Better Weakly Supervised Quantization}

 % Use \Name{Author Name} to specify the name.
 % If the surname contains spaces, enclose the surname
 % in braces, e.g. \Name{John {Smith Jones}} similarly
 % if the name has a "von" part, e.g \Name{Jane {de Winter}}.
 % If the first letter in the forenames is a diacritic
 % enclose the diacritic in braces, e.g. \Name{{\'E}louise Smith}

 % Two authors with the same address
 % \midlauthor{\Name{Author Name1} \Email{abc@sample.edu}\and
 %  \Name{Author Name2} \Email{xyz@sample.edu}\\
 %  \addr Address}

 % Three or more authors with the same address:
 % \midlauthor{\Name{Author Name1} \Email{an1@sample.edu}\\
 %  \Name{Author Name2} \Email{an2@sample.edu}\\
 %  \Name{Author Name3} \Email{an3@sample.edu}\\
 %  \addr Address}


% Authors with different addresses:
% \midlauthor{\Name{Author Name1} \Email{abc@sample.edu}\\
% \addr Address 1
% \AND
% \Name{Author Name2} \Email{xyz@sample.edu}\\
% \addr Address 2
% }

%\footnotetext[1]{Contributed equally}

% More complicate cases, e.g. with dual affiliations and joint authorship
\midlauthor{\Name{Chih-Chieh Chen\nametag{$^{1}$}} 
\Email{jackfrank@gmail.com}\\
\Name{Chang-Fu Kuo\nametag{$^{1,2,3}$}} \Email{zandis@gmail.com}\\
\addr $^{1}$ Center for Artificial Intelligence in Medicine, Chang Gung Memorial Hospital, Taoyuan, Taiwan \\
\addr $^{2}$ Medical Education Department, Chang Gung Memorial Hospital, Taoyuan, Taiwan \\
\addr $^{3}$ Division of Rheumatology, Allergy and Immunology, Chang Gung Memorial Hospital, Taoyuan, Taiwan
}

\begin{document}

\maketitle

\begin{abstract}

For clinical applications, more detailed information such as specific locations and the region of interest (ROI) volumes is preferred.  However, most of the time only classification annotations are available. Class Activation Mapping (CAM) and its variants are the most commonly used techniques for weakly supervised localization tasks.   In this study, we assessed both traditional and modern network architectures regarding classification accuracy and CAM visualization. 
Although all networks achieved high AUROC scores and their heatmaps closely corresponded to pathology locations, we observed that the heatmaps were influenced by the particular network architectures and pretrained weights used. Additionally, current models produce heatmaps from small latent spaces (e.g. $16 \times 16$), which limits the precision of these heatmaps for further detailed analysis.

Based on the observations mentioned above, we designed a UNet-style architecture that utilizes pretrained classification networks as the encoder and produces heatmaps within a latent space of size $128 \times 128$. We observed that the generated heatmaps are more detailed and suitable for weakly supervised segmentation. We validated the effectiveness of our approach using the intracerebral hemorrhage (ICH) dataset.



\end{abstract}

\begin{keywords}
ICH classification, heatmap generations, weakly supervised localization.
\end{keywords}

\section{Introduction}










The localization of pathologies is just as crucial as their classification for clinical use. By utilizing heatmaps or bounding boxes, clinicians can revisit patient data to verify the presence of pathologies in specific areas, thereby lowering the false negative rates in diagnoses. However, creating bounding box annotations is time-consuming. In this work we are interested in ICH subtype segmentation. Hematoma volume and hematoma expansion are essential predictors of mortality and outcome \cite{li2024code}. Previous studies  \cite{scherer2016development} have shown that the ABC/2 method, which estimates lesion volume by multiplying the largest total clot diameter by the applicable perpendicular clot diameter and the number of slices, frequently yields inaccurate overestimations of ICH volume. This indicates a clear need for the development of a more precise volume estimation method. 


Consequently, it is valuable to explore how weakly supervised methods can minimize the need for extensive labeling. In medical image analysis, Grad-CAM based methods \cite{zhou2016learning, selvaraju2017grad} are probably the most widely used techniques at present. For Grad-CAM, the heatmap of the chosen layer is obtained as a weighted sum of its channel values, where the weight of each channel is given by the sum of the partial derivatives of the final output with respect to that channel over all spatial positions. Due to the spatial biases inherent in convolutional neural networks, locations corresponding to class-related features tend to have higher output values and growth rates, resulting in elevated heatmap scores at these positions.

In this study, we assessed the Grad-CAM approach on quantifying subtypes of intracranial hemorrhage (ICH). The five subtypes considered are intraparenchymal hemorrhage (IPH), intraventricular hemorrhage (IVH), subarachnoid hemorrhage (SAH), subdural hematoma (SDH), and epidural hematoma (EDH). Typically, these subtypes can be differentiated by their distinct locations and shapes \cite{heit2016imaging}. We surprisingly found the outputs of fine-grained pretrained models are highly accurate and delicate. Therefore, our idea is to modify the heatmaps generated by Grad-CAM. However, since they are derived from low-resolution latent spaces, their representational
capacity is limited.  

Modern architectures achieve impressive discriminative performance with fewer parameters and FLOPs by incorporating modern components like the squeeze-and-excitation block \cite{hu2018squeeze} and depthwise convolutions, along with improved training and scaling methods. Building on this insight, our work aims to leverage features from hidden layers to produce more detailed heatmaps. To do this, we construct a U-shaped network that uses the pretrained model as the encoder. Since segmentation labels are unavailable,   inspired by \cite{pinheiro2015image}, we replace the final average pooling and linear layers with a Log-Sum-Exp (LSE) pooling layer, which sums the exponentials of patch outputs to generate predictions.  The decoder’s architecture is simple, and our intuition is to use the pretrained encoder, possibly with regularization terms, to guide the nearly linear, untrained decoder in producing more fine-grained heatmaps. We also incorporate intermediate
supervision and regularization on each layer’s output to ensure the generated heatmaps
align with human intuition.


We demonstrated the effectiveness of our methods using ICH datasets. Our model was trained on the RSNA intracranial hemorrhage dataset \cite{rsnaich} and its performance was assessed on the BHSD dataset \cite{wu2023bhsd}. Our empirical results indicate that the classification accuracy of our proposed architecture is comparable to or surpasses that of standard models, while the heatmaps it produces with the Grad-CAM algorithm are significantly more accurate.

In sum, our contributions are as follows:

\begin{itemize}
\item We empirically highlight both the strengths and limitations of the Grad-CAM algorithm when applied to clinical tasks.
\item To produce more detailed heatmaps, we introduced a method that also leverages the shallow layers of a pretrained network by constructing a UNet-like architecture and generating heatmaps from the final layer of this U-shaped network.
\item We evaluated our proposed architecture on the classification of intracranial hemorrhage subtypes, showing that it achieves comparable classification performance while producing more precise heatmaps.
\end{itemize}


\section{Related Works}

\subsection{ ICH Segmentation}
In recent years, deep learning-based approaches have been proposed.  Authors in \cite{expertich} developed a joint classification and segmentation framework based on 4,396 CT scans and demonstrated a case-level sensitivity of $100$\% and specificity of $90$\% on 200 randomly selected CT scans. In their related work \cite{patchfcn}, DICE score of $76.6$\% was reported.  In a similar study \cite{monteiro2020multiclass}, edema was included as an additional ground truth class.  A simplified annotation was used, merging SDH, EDH, and SAH into a single category named extra-axial hemorrhage (EAH). The testing dataset, which excludes lesions of 1 mL or smaller, yielded a case-level mean Dice score of 59.3\% for ICH. 

Although these methods yield promising results, they require extensive fine-grained annotations. Therefore, it is desirable to create techniques that demand less annotation effort.

\subsection{Grad-CAM \label{sec:cam}}



Given a classifier ending with a global average pooling layer followed by a fully connected layer, the final output corresponding to class $c$ is given by

\begin{equation}
y^c = \sum_l w_l^c \cdot \frac{1}{Z} \sum_i \sum_j A^l_{i,j},
\end{equation}

where $A^l_{i,j}$ represents the output of the $l$-th feature at spatial location $(i,j)$ in the last convolutional layer, $Z$ is the number of spatial locations, and $w^c_l$ is the weight of the fully connected layer associated with class $c$ for unit $l$. In \cite{zhou2016learning}, the heatmap $M_c$ is defined as
\begin{equation} \label{eq:cam_orig}
M_c(i, j) = \sum_l w_l^c A^l_{i,j}.
\end{equation}

 In \cite{selvaraju2017grad}, the idea is further generalized to arbitrary neural networks by letting 
\begin{equation}  \label{eq:cam_coef}
w_l^c = \frac{1}{Z} \sum_i \sum_j \frac{\partial y^c}{\partial A^l_{i,j}},
\end{equation}

and the equivalence of these two approaches when the target layer is right before the final global classification layer is also demonstrated  in  \cite{selvaraju2017grad}.

Lastly, to create visualizable heatmaps, $M_c$ in Equation \ref{eq:cam_orig} will be passed through a  ReLU layer and further normalized by its maximum value.

To generate high-resolution images, the authors of \cite{selvaraju2017grad} further suggested combining Grad-CAM with guided backpropagation \cite{springenberg2014striving}. For guided backpropagation, negative gradients are not backpropagated through ReLU layers when calculating the heatmaps with respect to image pixels. In \cite{selvaraju2017grad} it is suggested to use the Hadamard product of the heatmaps generated by Grad-CAM and by guided backpropagation to obtain a refined heatmap.

Integrated gradient \cite{sundararajan2017axiomatic} is another common choice for generating high-resolution heatmaps. Given an image $x$, a reference image $x'$ (usually a black image), a function $F$ and a pixel $x_i$, in \cite{sundararajan2017axiomatic} the heatmap score at $x_i$ is generated by integrating $\frac{\partial F (x' + \alpha (x-x'))}{ \partial x_i}$ over $\alpha$ from 0 to 1.



Numerous studies have been conducted to enhance performance across various specialized domains \cite{smilkov2017smoothgrad, djoumessi2025soft}. In this work, we choose guided backpropagation and integrated gradient as our baseline methods.




\subsection{ Weakly Supervised Segmentation}



Weakly supervised learning is an encouraging approach in situations where annotated data is scarce, and numerous research methods have been explored \cite{chen2025weakly}.  In the current era, with the advent of foundation models \cite{radford2021learning, kirillov2023segment, simeoni2025dinov3}, it is possible to achieve more detailed weakly supervised segmentation outcomes. Nevertheless, even in the absence of foundation models, several significant studies have already shown highly promising results on datasets such as PASCAL VOC2012 \cite{everingham2010pascal}. For instance, by starting with Grad-CAM based predictions, in  \cite{wang2018weakly, shen2018bootstrapping}, performances are improved using techniques like the neural version of seed region growing \cite{adams1994seeded} or by grabcut \cite{rother2004grabcut}. Although these methods might potentially yield good results in weakly supervised ICH segmentation tasks, we also wish to highlight that these modification techniques might assume a distinct boundary between class objects and the background. In contrast, this work aims to introduce a framework that does not rely on this assumption.

The authors of \cite{rasoulian2023weakly} investigate a weakly supervised ICH segmentation approach using the Swin transformer.
A novel layer attention map with respect to the head of the self-attention layer is proposed. However, to the best of the authors' knowledge, that work addresses ICH as a whole, namely, weakly supervised binary segmentation. Instead, in this work we attempt to address weakly supervised multi-label subtype segmentation. The motivations and purposes of \cite{rasoulian2023weakly} and our work are also different. While pre-trained weights are not used in \cite{rasoulian2023weakly}, we aim to show that the outputs of contemporary network architectures, such as ResNet-RS and RDNet used in our study, with pre-trained weights, encode rich information and are suitable for visualization purposes. Our primary contribution is to extract this information into higher resolution outputs, enabling us to obtain quantified data, such as subtype volume.



\subsection{LSE Pooling Layer \label{sec:lse}}

Let 

\begin{equation} \label{eq:lse_def}
LSE_r(x_1, \dots , x_n) = \log \Bigl(\sum_i \exp (r \cdot x_i)\Bigr)
\end{equation}



be the scaled LogSumExp function. Recall that its partial derivatives are given by

\begin{equation} \label{eq:lse}
\frac{ \partial LSE_r(x_1, \dots , x_n)}{\partial x_j} = \frac{r \exp (r \cdot x_j)}{ \sum_i \exp (r \cdot x_i)}.
\end{equation}

Thus, when the global average pooling layer is replaced by the LSE pooling layer, according to Equation  \ref{eq:cam_coef}, the contributions to the weight $w_l^c$ with respect to the spatial locations are proportional to their scaled softmax values. This implies that the final heatmap scores at each spatial location are proportional to the product of their patch-level outputs and the scaled softmax values. Compared to heatmaps generated by networks using global average pooling, these heatmaps are significantly more sensitive to patch-level outputs.


\section{Proposed Architecture and Algorithm}
\begin{figure}[htbp]
 % Caption and label go in the first argument and the figure contents
 % go in the second argument
\floatconts
  {fig:arch}
  {\caption{Proposed Architecture.   HLoss refers to the heatmap consistency loss in Sec.  \ref{sec:loss}}}
  {\includegraphics[width=\linewidth]{figs/SEGDENSE.drawio.png}}
\end{figure}

\subsection{Observations on Heatmaps Generations}

Our primary focus is on two groups of architectures: ResNet \cite{he2016deep}, DenseNet \cite{huang2017densely},  along with their variants such as ResNet-RS \cite{bello2021revisiting} and RDNet \cite{kim2024densenets}. We trained these models to classify ICH subtypes, and all achieved AUROC scores exceeding $0.95$. For ICH subtype localization, we observed that all models could produce heatmaps accurately pinpointing the locations of each subtype, especially when multiple subtype features are completely separate   (see Fig. \ref{fig:multi-label_illustration} for more details.)  

\begin{figure}
\centering
\begin{minipage}[t]{0.16\textwidth}
        
        \includegraphics[width=\textwidth]{figs/resnet152_pretrained/20_ID_3d544ff76.png}
        \includegraphics[width=\textwidth]{figs/label_mask/IPH_label.png}
        \includegraphics[width=\textwidth]{figs/label_mask/IVH_label.png}
        \includegraphics[width=\textwidth]{figs/label_mask/SAH_label.png}
        \includegraphics[width=\textwidth]{figs/label_mask/SDH_label.png}
          \begin{center}
  {GT}
  \end{center}
    \end{minipage}
\begin{minipage}[t]{0.16\textwidth}
        
        \includegraphics[width=\textwidth]{figs/resnet152_pretrained/20_ID_3d544ff76.png}
        \includegraphics[width=\textwidth]{figs/resnet152_pretrained/20_ID_3d544ff76_1.png}
        \includegraphics[width=\textwidth]{figs/resnet152_pretrained/20_ID_3d544ff76_2.png}
        \includegraphics[width=\textwidth]{figs/resnet152_pretrained/20_ID_3d544ff76_3.png}
        \includegraphics[width=\textwidth]{figs/resnet152_pretrained/20_ID_3d544ff76_4.png}
          \begin{center}
  {ResNet152}
  \end{center}
    \end{minipage}
\begin{minipage}[t]{0.16\textwidth}
        \includegraphics[width=\textwidth]{figs/densenet121_7_pretrained/20_ID_3d544ff76.png}
        \includegraphics[width=\textwidth]{figs/densenet121_7_pretrained/20_ID_3d544ff76_1.png}
        \includegraphics[width=\textwidth]{figs/densenet121_7_pretrained/20_ID_3d544ff76_2.png}
        \includegraphics[width=\textwidth]{figs/densenet121_7_pretrained/20_ID_3d544ff76_3.png}
        \includegraphics[width=\textwidth]{figs/densenet121_7_pretrained/20_ID_3d544ff76_4.png}
          \begin{center}
  {DenseNet121}
  \end{center}
    \end{minipage}
\begin{minipage}[t]{0.16\textwidth}
\includegraphics[width=\textwidth]{figs/densenet121_7_pretrained/20_ID_3d544ff76.png}
        \includegraphics[width=\textwidth]{figs/dense121_no_pretrained/20_ID_3d544ff76_1.png}
        \includegraphics[width=\textwidth]{figs/dense121_no_pretrained/20_ID_3d544ff76_2.png}
        \includegraphics[width=\textwidth]{figs/dense121_no_pretrained/20_ID_3d544ff76_3.png}
        \includegraphics[width=\textwidth]{figs/dense121_no_pretrained/20_ID_3d544ff76_4.png}
          \begin{center}
  {DenseNet121*}
  \end{center}
    \end{minipage}
    \begin{minipage}[t]{0.16\textwidth}
     \includegraphics[width=\textwidth]{figs/resnetrs200/20_ID_3d544ff76.png}
        \includegraphics[width=\textwidth]{figs/resnetrs200/20_ID_3d544ff76_1.png}
        \includegraphics[width=\textwidth]{figs/resnetrs200/20_ID_3d544ff76_2.png}
        \includegraphics[width=\textwidth]{figs/resnetrs200/20_ID_3d544ff76_3.png}
        \includegraphics[width=\textwidth]{figs/resnetrs200/20_ID_3d544ff76_4.png}
          \begin{center}
  {ResNet-RS200}
  \end{center}
    \end{minipage}
\begin{minipage}[t]{0.16\textwidth}
\includegraphics[width=\textwidth]{figs/rdnet_pretrained_epoch_7/20_ID_3d544ff76.png}
        \includegraphics[width=\textwidth]{figs/rdnet_pretrained_epoch_7/20_ID_3d544ff76_1.png}
        \includegraphics[width=\textwidth]{figs/rdnet_pretrained_epoch_7/20_ID_3d544ff76_2.png}
        \includegraphics[width=\textwidth]{figs/rdnet_pretrained_epoch_7/20_ID_3d544ff76_3.png}
        \includegraphics[width=\textwidth]{figs/rdnet_pretrained_epoch_7/20_ID_3d544ff76_4.png}
          \begin{center}
  {RDNet}
  \end{center}
    \end{minipage}
\caption{ICH subtype heatmap generations selected from the RSNA intracranial hemorrhage dataset \cite{rsnaich}. From top to bottom: IPH, IVH, SAH, SDH. DenseNet121* refers to DenseNet121 trained without pretrained weights.\label{fig:multi-label_illustration}}
\end{figure}

\begin{figure}
\centering
\begin{minipage}[t]{0.19\textwidth}
        \includegraphics[width=\textwidth]{figs/resnet152_pretrained/13_ID_3e33b16bb.png}
        \includegraphics[width=\textwidth]{figs/no_heatmap_loss/gt_label/13_ID_3e33b16bb_label.png}
        \includegraphics[width=\textwidth]{figs/resnet152_pretrained/13_ID_3e33b16bb_0.png}
          \begin{center}
  {ResNet152}
  \end{center}
    \end{minipage}
\begin{minipage}[t]{0.19\textwidth}
        \includegraphics[width=\textwidth]{figs/densenet121_7_pretrained/20_ID_661bee514.png}
        \includegraphics[width=\textwidth]{figs/no_heatmap_loss/gt_label/20_ID_661bee514_label.png}
        \includegraphics[width=\textwidth]{figs/densenet121_7_pretrained/20_ID_661bee514_0.png}
          \begin{center}
  {DenseNet121}
  \end{center}
    \end{minipage}
\begin{minipage}[t]{0.19\textwidth}
        \includegraphics[width=\textwidth]{figs/dense121_no_pretrained/17_ID_7f57645d6.png}
        \includegraphics[width=\textwidth]{figs/no_heatmap_loss/gt_label/17_ID_7f57645d6_label.png}
        \includegraphics[width=\textwidth]{figs/dense121_no_pretrained/17_ID_7f57645d6_0.png}
          \begin{center}
  {DenseNet121*}
  \end{center}
    \end{minipage}
    \begin{minipage}[t]{0.19\textwidth}
        \includegraphics[width=\textwidth]{figs/resnetrs200/16_ID_d7484d836.png}
        \includegraphics[width=\textwidth]{figs/no_heatmap_loss/gt_label/16_ID_d7484d836_label.png}
        \includegraphics[width=\textwidth]{figs/resnetrs200/16_ID_d7484d836_0.png}
          \begin{center}
  {ResNet-RS200}
  \end{center}
    \end{minipage}
\begin{minipage}[t]{0.19\textwidth}
        \includegraphics[width=\textwidth]{figs/rdnet_pretrained_epoch_7/20_ID_71bde6db6.png}
        \includegraphics[width=\textwidth]{figs/no_heatmap_loss/gt_label/20_ID_71bde6db6_label.png}
        \includegraphics[width=\textwidth]{figs/rdnet_pretrained_epoch_7/20_ID_71bde6db6_0.png}
          \begin{center}
  {RDNet-T}
  \end{center}
    \end{minipage}
\caption{Illustrations of heatmap generations. (*= no-pretrained model.)  \label{fig:conv_Grad-CAM_illustration}}
\end{figure}

Regarding the estimation of stroke volume subtypes, as shown in Fig. \ref{fig:conv_Grad-CAM_illustration}, we found there is room for improvement. With ResNet152, the produced heatmaps lack strong location sensitivity, occasionally missing large strokes. In the case of DenseNet121, likely due to numerous skip-connections, the heatmaps are focused but the regions of high intensity are too broad for detailed features such as SAH. When training DenseNet from scratch without the ImageNet pretrained weights (referred to as DenseNet* in Fig. \ref{fig:conv_Grad-CAM_illustration}), the heatmaps become less focused, indicating that pretraining on a large and diverse image set aids heatmap quality. For newer models like ResNet-RS and RDNet, the heatmaps are more detailed. Additional examples can be found in Fig. \ref{fig:detailed_illustration}.




\subsection{Architecture}

The main proposed architecture is illustrated in Fig. \ref{fig:arch}.  It is a UNet-like architecture with an encoder that is first pretrained on the target dataset. For the encoder backbone, we use RDNet-T \cite{kim2024densenets}, an updated version of DenseNet designed to enhance accuracy and scaling strategy compared to the original. Modern components such as the patchification stem and the base block from ConvNeXt \cite{liu2022convnet} are adopted, and the transition layer is redesigned and applied after every three blocks instead of only after each stage.  Working on images with size $512 \times 512$, we added additional parameters at the end of each stage, corresponding to spatial dimensions of $128 \times 128$, $64 \times 64$, $32 \times 32$, and $16 \times 16$, with channel sizes of $256$, $440$, $744$, and $1040$, respectively.

For the decoder, each stage primarily includes three components: a convolution block, an upsampling block, and the final classification layer. The convolution block applies 1D convolution to convert the concatenated features into the desired channel dimension, followed sequentially by a Layer Normalization layer, a ReLU activation, and a squeeze-and-excitation block. The upsampling block first uses 1D convolution to adjust the channel dimension to match that of the previous stage, then applies 2D convolution to increase the number of channels by four times, and performs upsampling using the depth-to-space operation. A Layer Normalization layer is added at the end of the upsampling block. Lastly, the final classification layer produces multi-label scores, which are then fed into the LSE pooling layer. Heatmaps are generated just before the top LSE pooling layer. As discussed in Section \ref{sec:lse}, when patch-level predictions are trained with high accuracy, the resulting heatmaps tend to be sharper compared to those produced by networks that use global average pooling layers.

\subsection{Loss Function \label{sec:loss}}

Let $f_0, \dots, f_3$ represent the inputs of the LSE pooling layer, and  $l_0, \dots, l_3$ the outputs following the LSE pooling layer, from above to below. Denote $gt$ the ground truth label. Then we have the classification loss 

\begin{equation}  \label{eq:closs}
 \text{CLoss} = \alpha_0 \text{BCE}(l_0, gt) + \alpha_1 \text{BCE}(l_1, gt) + \alpha_2 \text{BCE}(l_2, gt) + \alpha_3 \text{BCE}(l_3, gt),
\end{equation}

where BCE represents the binary cross
entropy loss function and $\alpha_0, \dots, \alpha_3$ are weight coefficients. Additionally, to ensure consistency in heatmap generation at each level, a heatmap consistency loss is applied,

\begin{equation} \label{eq:hloss}
 \text{HLoss} = \beta_0 \text{MSE}(\text{Pool}(f_0), f_1) + \beta_1 \text{MSE}(\text{Pool}(f_1), f_2) + \beta_2 \text{MSE}(\text{Pool}(f_2), f_3),
\end{equation}

where MSE represents the mean square error loss, Pool refers to average pooling with a size of 2, and $\beta_0, \dots, \beta_2$ are weight coefficients. The overall loss is the sum of the classification loss ($\text{CLoss}$) and the heatmap consistency loss ($\text{HLoss}$).

\subsection{Weakly Supervised Quantization Algorithm}

Based on the observation that the generated heatmaps for different disease subtypes are nearly mutually exclusive, our objective is to produce class-specific heatmaps for each category, subsequently determining the final predicted mask by employing the argmax function. Nevertheless, we have empirically observed that Grad-CAM tends to yield an excessive number of false positive results. In order to mitigate this concern, we introduce two distinct thresholds: class thresholds and mask thresholds. The class thresholds are selected based on the values that yield maximal Youden indices. We generate the class-specific heatmap solely when the corresponding output surpasses the designated class threshold for that specific category. In addition, since we intend to retain only the regions highlighted in red in Fig. \ref{fig:multi-label_illustration}, we set mask thresholds: heatmap values that fall below these mask thresholds are set to zero.  We use grid search to find the mask thresholds, and the details are provided in Appendix \ref{sec:cam_for_visualization}.  





\section{Experiments}

In this work, we validate our proposed approach for intracranial hemorrhage subtype classification. In 2019, the RSNA hosted a competition centered on classifying acute intracranial hemorrhage subtypes \cite{rsnaich}, providing over 20,000 CT scans along with slice-level classification labels. The class annotations include five primary ICH subtypes: intraparenchymal hemorrhage (IPH), intraventricular hemorrhage (IVH), epidural hematoma (EDH), subdural hematoma (SDH), and subarachnoid hemorrhage (SAH). We randomly selected 2,734 CT scans (91,844 slices) for training, and 89 CT scans (3,000 slices) for validation of subtype classification. The class thresholds used in Algorithm \ref{alg:heatmap_to_segmentation} were chosen as the thresholds that maximized the difference between true positive rate and false positive rate on the validation set. For testing, we utilized the BHSD dataset \cite{wu2023bhsd}, which contains 192 CT scans with subtype segmentation labels.

For image input, we followed the approach used by \cite{expertich}: all CT slices were adjusted with a window width of 130 and a window center of 25. We opted to stack three consecutive slices as input and use the segmentation mask of the middle slice as the output, instead of employing a three-dimensional model. To the best of our knowledge, using three consecutive slices is generally sufficient for accurately detecting and classifying subtype hemorrhages in almost all cases, except for a small number of instances that require distinguishing between EDH and SDH.  We utilize RDNet as the framework to create the heatmap through guided backpropagation. For the integrated gradient, we apply the class threshold (in Algorithm \ref{alg:heatmap_to_segmentation})  from RDNet and employ grid search to identify the appropriate mask thresholds. To evaluate the Dice coefficients at the instance level, we choose instances for each subtype with a volume exceeding 3000 (to minimize false negatives) and compute the binary Dice coefficients. There are 90 cases for IPH, 40 cases for IVH, 45 cases for SAH, 45 cases for SDH, and 14 cases for EDH. All experiments were carried out using a single Nvidia V100 GPU. For models that utilized pretrained checkpoints, we applied SGD with a step learning rate schedule over 20 epochs. For models without pretrained checkpoints, we employed the Adam optimizer with a learning rate of $1 \times 10^{-4}$ for 30 epochs. Detailed hyperparameter settings can be found in Appendix \ref{app:hyperparameter}.





\section{Results}

The AUROCs on the validation dataset are shown in Table \ref{tab:auroc}. Although our method achieves the highest performance, we observed that all models yield similar results. However, as shown in Table \ref{tab:dice_use_class_thres} and Table \ref{tab:instance_dice_use_class_thres}, our method significantly outperforms all other models across all disease subtypes. The heatmaps generated by our proposed method are illustrated in Fig. \ref{fig:multilabel-lse}  and Fig. \ref{fig:case_illustration}.  When compared to the ground truth labels and the heatmaps produced by other network architectures shown in Fig. \ref{fig:conv_Grad-CAM_illustration}, our method clearly performs better in accurately capturing the shapes and locations of hemorrhages.
 

The fifth row of Fig. \ref{fig:case_illustration} displays the heatmaps produced by guided backpropagation. While many of these heatmaps seem reasonable, their performance is surprisingly inferior to that of RDNet alone.  This is possibly due to excessive noise in the heatmaps. On the other hand, the heatmaps produced by integrated gradients are shown in the final row of Fig. \ref{fig:case_illustration}. We noticed that the heatmaps are focused on the skulls and other areas not related to the class, and the performance levels do not match those of the other baseline models. We want to highlight that the heatmap consistency loss described in Equation \ref{eq:hloss} is essential. Without this regularization during training, as shown in Fig. \ref{fig:case_illustration}, the resulting heatmaps tend to have a grid-like pattern and are less focused on the stroke areas. As a result, the Dice coefficients reported in Table \ref{tab:dice_use_class_thres} deteriorate considerably.  




\begin{table}[htbp]
 % The first argument is the label.
 % The caption goes in the second argument, and the table contents
 % go in the third argument.
\floatconts
  {tab:auroc}%
  {\caption{AUROCs on the validation dataset.  (*= no-pretrained model.)  }}%
  {\begin{tabular}{lllllll}
  \hline 
  \bfseries Model & \bfseries IPH & \bfseries IVH & \bfseries SAH & \bfseries SDH & \bfseries EDH & \bfseries AVG \\
  \hline 
  \hline 
  ResNet152 & 0.970 & 0.997 & 0.949 &0.923 &0.941 & 0.956\\
  \hline 
  DenseNet121& \textbf{0.974} &0.997 &0.957 &0.918 &0.954 & 0.960 \\
  \hline 
  DenseNet121*& 0.973&0.997&\textbf{0.960}& 0.915 &0.963 & 0.961 \\
  \hline 
  ResNetRS & 0.971 & 0.996 &0.956 &0.926 &0.956 & 0.961\\
  \hline 
  RDNet & 0.973 &0.996 &0.948 &0.916 &\textbf{0.975} & 0.962\\
  \hline 
  Ours & 0.973 &  \textbf{0.998} & 0.959 & \textbf{0.934} &  0.969 & \textbf{0.967}\\
  \hline 
  \end{tabular}}
\end{table}

\begin{table}[htbp]
 % The first argument is the label.
 % The caption goes in the second argument, and the table contents
 % go in the third argument.
\floatconts
  {tab:dice_use_class_thres}%
  {\caption{DICE coefficients on the BHSD dataset, using the class thresholds of the models to generate heatmaps.   (*= no-pretrained model.)  }}%
  {\begin{tabular}{llllllll}
  \hline 
  \bfseries Model & \bfseries Normal & \bfseries IPH & \bfseries IVH & \bfseries SAH & \bfseries SDH & \bfseries EDH & \bfseries AVG \\
  \hline
  \hline
  ResNet152 & 0.992& 0.242 & 0.093 & 0.056 &0.075 &0.060 & 0.253\\
  \hline
  DenseNet121 &0.990& 0.256 &0.113 &0.060 &0.070 &0.078 & 0.261 \\
  \hline
  DenseNet121* & 0.989& 0.260&0.132&0.047& 0.076 &0.054 & 0.260 \\
  \hline
  ResNetRS &0.996& 0.445 & 0.182 &0.138 &0.144 &0.158 & 0.344\\
  \hline
  RDNet & 0.996 & 0.380 & 0.211 &0.108 &0.133 &0.141 & 0.328\\
  \hline
  Guided Gradient &0.991 & 0.110 & 0.232 & 0.126 & 0.106 &0.111 & 0.279\\
  \hline
  Integrated Gradient  & 0.914 & 0.004 & 0.012 & 0.007 & 0.002 &  0.011 & 0.158\\
  \hline
  Ours (no HLoss) & 0.990 &  0.142 & \textbf{0.388} & 0.137 & 0.082 & 0.112& 0.309 \\
  \hline 
  Ours &\textbf{0.997} & \textbf{0.551} &  0.387 & \textbf{0.225} & \textbf{0.245} &  \textbf{0.356} & \textbf{0.460}\\
  \hline
  \end{tabular}}
\end{table}






\begin{table}[htbp]
 % The first argument is the label.
 % The caption goes in the second argument, and the table contents
 % go in the third argument.
\floatconts
  {tab:instance_dice_use_class_thres}%
  {\caption{Instance DICE coefficients on the BHSD dataset, using the class thresholds of the models to generate heatmaps.  {(*= no-pretrained model.)}  }}%
  {\begin{tabular}{lllllll}
  \hline 
  \bfseries Model  & \bfseries IPH & \bfseries IVH & \bfseries SAH & \bfseries SDH & \bfseries EDH  \\
  \hline
  \hline 
  ResNet152 & $0.248 \pm 0.14$ & $0.120 \pm 0.06$ & $0.077 \pm 0.05$ & $0.115 \pm 0.09$ & $0.255 \pm 0.16$ \\
  \hline 
  DenseNet121 & $0.286 \pm 0.17$ & $0.143 \pm 0.08$ & $0.085 \pm 0.05$ & $0.110 \pm 0.09$ & $0.169 \pm 0.13$  \\
  \hline 
  DenseNet121* & $0.258 \pm 0.16$& $0.126 \pm 0.07$ & $0.053 \pm 0.03$& $0.086 \pm 0.08$ & $0.121 \pm 0.10$  \\
  \hline 
  ResNetRS & $0.414\pm 0.18$  & $0.213 \pm 0.07$ & $0.119 \pm 0.07$ & $0.185 \pm 0.12$ & $0.272 \pm 0.15$\\
  \hline 
  RDNet  & $0.354 \pm 0.16$ & $0.227 \pm 0.08$ & $0.130 \pm 0.07$ & $0.178 \pm 0.13$ & $0.255 \pm 0.16$ \\
  \hline 
  Ours  & $\mathbf{0.514} \pm 0.17 $  &  $\mathbf{0.398} \pm 0.10$ & $\mathbf{0.204 }\pm 0.11$ & $\mathbf{0.228} \pm 0.12$ &  $\mathbf{0.366} \pm 0.19$\\
  \hline 
  \end{tabular}}
\end{table}






\begin{figure}
\centering
\begin{minipage}[t]{0.19\textwidth}
       \includegraphics[width=\textwidth]{figs/resnet152_pretrained/20_ID_3d544ff76.png}
       \includegraphics[width=\textwidth]{figs/resnet152_pretrained/20_ID_3d544ff76.png}
          \begin{center}
  {Source Image}
  \end{center}
    \end{minipage}
\begin{minipage}[t]{0.19\textwidth}
\includegraphics[width=\textwidth]{figs/label_mask/IPH_label.png}
        \includegraphics[width=\textwidth]{figs/ours/20_ID_3d544ff76_1.png}
          \begin{center}
  {IPH}
  \end{center}
    \end{minipage}
\begin{minipage}[t]{0.19\textwidth}
\includegraphics[width=\textwidth]{figs/label_mask/IVH_label.png}
        \includegraphics[width=\textwidth]{figs/ours/20_ID_3d544ff76_2.png}
          \begin{center}
  {IVH}
  \end{center}
    \end{minipage}
    \begin{minipage}[t]{0.19\textwidth}
    \includegraphics[width=\textwidth]{figs/label_mask/SAH_label.png}
        \includegraphics[width=\textwidth]{figs/ours/20_ID_3d544ff76_3.png}
          \begin{center}
  {SAH}
  \end{center}
    \end{minipage}
\begin{minipage}[t]{0.19\textwidth}
\includegraphics[width=\textwidth]{figs/label_mask/SDH_label.png}
        \includegraphics[width=\textwidth]{figs/ours/20_ID_3d544ff76_4.png}
          \begin{center}
  {SDH}
  \end{center}
    \end{minipage}
\caption{Illustration of the subtype heatmaps generated by our proposed method. From top to bottom: ground truth labels and heatmaps generated by our proposed method. \label{fig:multilabel-lse}}
\end{figure}

\begin{figure}[htbp]
\centering
\begin{minipage}[t]{0.19\textwidth}
        \includegraphics[width=\textwidth]{figs/ours/13_ID_3e33b16bb.png}
        \includegraphics[width=\textwidth]{figs/no_heatmap_loss/gt_label/13_ID_3e33b16bb_label.png}
        \includegraphics[width=\textwidth]{figs/ours/13_ID_3e33b16bb_0.png}
         \includegraphics[width=\textwidth]{figs/no_heatmap_loss/13_ID_3e33b16bb_0.png}
         \includegraphics[width=\textwidth]{figs/experiments_rdnet_rebuttal_segmentation_guided_again/13_ID_3e33b16bb_0.png}
         \includegraphics[width=\textwidth]{figs/experiments_rdnet_rebuttal_segmentation_integrated/13_ID_3e33b16bb_0.png}
         
    \end{minipage}
\begin{minipage}[t]{0.19\textwidth}
        \includegraphics[width=\textwidth]{figs/ours/20_ID_661bee514.png}
        \includegraphics[width=\textwidth]{figs/no_heatmap_loss/gt_label/20_ID_661bee514_label.png}
        \includegraphics[width=\textwidth]{figs/ours/20_ID_661bee514_0.png}
        \includegraphics[width=\textwidth]{figs/no_heatmap_loss/20_ID_661bee514_0.png}
        \includegraphics[width=\textwidth]{figs/experiments_rdnet_rebuttal_segmentation_guided_again/20_ID_661bee514_0.png}
         \includegraphics[width=\textwidth]{figs/experiments_rdnet_rebuttal_segmentation_integrated/20_ID_661bee514_0.png}
  
    \end{minipage}
\begin{minipage}[t]{0.19\textwidth}
        \includegraphics[width=\textwidth]{figs/ours/17_ID_7f57645d6.png}
        \includegraphics[width=\textwidth]{figs/no_heatmap_loss/gt_label/17_ID_7f57645d6_label.png}
        \includegraphics[width=\textwidth]{figs/ours/17_ID_7f57645d6_0.png}
         \includegraphics[width=\textwidth]{figs/no_heatmap_loss/17_ID_7f57645d6_0.png}
         \includegraphics[width=\textwidth]{figs/experiments_rdnet_rebuttal_segmentation_guided_again/17_ID_7f57645d6_0.png}
         \includegraphics[width=\textwidth]{figs/experiments_rdnet_rebuttal_segmentation_integrated/17_ID_7f57645d6_0.png}
    \end{minipage}
    \begin{minipage}[t]{0.19\textwidth}
        \includegraphics[width=\textwidth]{figs/ours/16_ID_d7484d836.png}
        \includegraphics[width=\textwidth]{figs/no_heatmap_loss/gt_label/16_ID_d7484d836_label.png}
        \includegraphics[width=\textwidth]{figs/ours/16_ID_d7484d836_0.png}
          \includegraphics[width=\textwidth]{figs/no_heatmap_loss/16_ID_d7484d836_0.png}
          \includegraphics[width=\textwidth]{figs/experiments_rdnet_rebuttal_segmentation_guided_again/16_ID_d7484d836_0.png}
         \includegraphics[width=\textwidth]{figs/experiments_rdnet_rebuttal_segmentation_integrated/16_ID_d7484d836_0.png}
    \end{minipage}
\begin{minipage}[t]{0.19\textwidth}
        \includegraphics[width=\textwidth]{figs/ours/20_ID_71bde6db6.png}
        \includegraphics[width=\textwidth]{figs/no_heatmap_loss/gt_label/20_ID_71bde6db6_label.png}
        \includegraphics[width=\textwidth]{figs/ours/20_ID_71bde6db6_0.png}
\includegraphics[width=\textwidth]{figs/no_heatmap_loss/20_ID_71bde6db6_0.png}
\includegraphics[width=\textwidth]{figs/experiments_rdnet_rebuttal_segmentation_guided_again/20_ID_71bde6db6_0.png}
         \includegraphics[width=\textwidth]{figs/experiments_rdnet_rebuttal_segmentation_integrated/20_ID_71bde6db6_0.png}
    \end{minipage}
\caption{Illustration of the ICH heatmaps generated by our proposed method and by the baselines. From top to bottom: original images, ground truths, heatmaps generated by our proposed method, heatmaps generated by our proposed method without the heatmap consistency loss (HLoss), heatmaps generated by guided backpropagation, and heatmaps generated by integrated gradients.\label{fig:case_illustration}}
\end{figure}

 For ICH strokes, all types except IPH are irregular, and SAH lesions are typically small and difficult to capture with low-resolution heatmaps. On the other hand, because we generate heatmaps in a latent space of size  $128 \times 128$, our approach can better approximate irregular or small, fine-grained shapes compared to other methods. 


















\section{Conclusion}

  Creating detailed annotations is a labor-intensive process. Nevertheless, quantification is essential in certain clinical contexts. In this research, we explored weakly supervised segmentation of intracerebral hemorrhage (ICH) subtypes. Surprisingly, we found that while all models showed similar classification accuracy, the quality of the heatmaps they generated varied significantly. Although every model correctly identified the subtype location, advanced models with pretrained checkpoints produced more detailed heatmaps that more closely matched the subtype stroke segmentation masks.

However, all heatmaps were generated from relatively low-resolution feature maps, for example,  $16 \times 16$. To harness the full potential of these advanced models and pretrained checkpoints, we developed a decoder for these pretrained models and retrained them on higher-resolution feature maps using the Log-Sum-Exp (LSE) pooling layer. We carefully enforced consistency across latent spaces at each level and ultimately generated heatmaps from the latent space with the highest resolution. Our findings indicate that even without segmentation labels, well-trained classifiers may possess some capacity for quantification. We believe our approach can be useful in situations where training a segmentation model is costly. We plan to systematically explore this direction in future work.







\clearpage  % Acknowledgements, references, and appendix do not count toward the page limit (if any)
% Acknowledgments---Will not appear in anonymized version


\midlacknowledgments{Both authors acknowledge funding from the Center for Artificial Intelligence in Medicine at Chang Gung Memorial Hospital, via grant
agreements no. CLRPG3H0016 and no. CORPG3L0463.}

\bibliography{bibs}



\appendix




\begin{figure}
\centering
\begin{minipage}[t]{0.16\textwidth}
        \includegraphics[width=\textwidth]{figs/resnet152_pretrained/13_ID_3e33b16bb.png}
        \includegraphics[width=\textwidth]{figs/resnet152_pretrained/20_ID_661bee514.png}
        \includegraphics[width=\textwidth]{figs/resnet152_pretrained/17_ID_7f57645d6.png}
        \includegraphics[width=\textwidth]{figs/resnet152_pretrained/16_ID_d7484d836.png}
        \includegraphics[width=\textwidth]{figs/resnet152_pretrained/20_ID_71bde6db6.png}
          \begin{center}
  {Origin}
  \end{center}
    \end{minipage}
\begin{minipage}[t]{0.16\textwidth}
        \includegraphics[width=\textwidth]{figs/resnet152_pretrained/13_ID_3e33b16bb_0.png}
        \includegraphics[width=\textwidth]{figs/resnet152_pretrained/20_ID_661bee514_0.png}
        \includegraphics[width=\textwidth]{figs/resnet152_pretrained/17_ID_7f57645d6_0}
        \includegraphics[width=\textwidth]{figs/resnet152_pretrained/16_ID_d7484d836_0.png}
        \includegraphics[width=\textwidth]{figs/resnet152_pretrained/20_ID_71bde6db6_0.png}
          \begin{center}
  {ResNet152}
  \end{center}
    \end{minipage}
\begin{minipage}[t]{0.16\textwidth}
     \includegraphics[width=\textwidth]{figs/densenet121_7_pretrained/13_ID_3e33b16bb_0.png}
        \includegraphics[width=\textwidth]{figs/densenet121_7_pretrained/20_ID_661bee514_0.png}
        \includegraphics[width=\textwidth]{figs/densenet121_7_pretrained/17_ID_7f57645d6_0}
        \includegraphics[width=\textwidth]{figs/densenet121_7_pretrained/16_ID_d7484d836_0.png}
        \includegraphics[width=\textwidth]{figs/densenet121_7_pretrained/20_ID_71bde6db6_0.png}
          \begin{center}
  {DenseNet121}
  \end{center}
    \end{minipage}
\begin{minipage}[t]{0.16\textwidth}
\includegraphics[width=\textwidth]{figs/dense121_no_pretrained/13_ID_3e33b16bb_0.png}
        \includegraphics[width=\textwidth]{figs/dense121_no_pretrained/20_ID_661bee514_0.png}
        \includegraphics[width=\textwidth]{figs/dense121_no_pretrained/17_ID_7f57645d6_0}
        \includegraphics[width=\textwidth]{figs/dense121_no_pretrained/16_ID_d7484d836_0.png}
        \includegraphics[width=\textwidth]{figs/dense121_no_pretrained/20_ID_71bde6db6_0.png}
          \begin{center}
  {DenseNet121*}
  \end{center}
    \end{minipage}
    \begin{minipage}[t]{0.16\textwidth}
    \includegraphics[width=\textwidth]{figs/resnetrs200/13_ID_3e33b16bb_0.png}
        \includegraphics[width=\textwidth]{figs/resnetrs200/20_ID_661bee514_0.png}
        \includegraphics[width=\textwidth]{figs/resnetrs200/17_ID_7f57645d6_0}
        \includegraphics[width=\textwidth]{figs/resnetrs200/16_ID_d7484d836_0.png}
        \includegraphics[width=\textwidth]{figs/resnetrs200/20_ID_71bde6db6_0.png}
          \begin{center}
  {ResNet-RS200}
  \end{center}
    \end{minipage}
\begin{minipage}[t]{0.16\textwidth}
\includegraphics[width=\textwidth]{figs/rdnet_pretrained_epoch_7/13_ID_3e33b16bb_0.png}
        \includegraphics[width=\textwidth]{figs/rdnet_pretrained_epoch_7/20_ID_661bee514_0.png}
        \includegraphics[width=\textwidth]{figs/rdnet_pretrained_epoch_7/17_ID_7f57645d6_0}
        \includegraphics[width=\textwidth]{figs/rdnet_pretrained_epoch_7/16_ID_d7484d836_0.png}
        \includegraphics[width=\textwidth]{figs/rdnet_pretrained_epoch_7/20_ID_71bde6db6_0.png}
          \begin{center}
  {RDNet}
  \end{center}
    \end{minipage}
\caption{More examples for different network architectures. \label{fig:detailed_illustration}}
\end{figure}


\section{ Pseudo Code for the Weakly Supervised Quantization}
The pseudo-code of our main algorithm is summarized in Algorithm \ref{alg:heatmap_to_segmentation}.

\begin{algorithm2e}
\caption{Pseudocode for Subtype Segmentation Mask Generation}
\label{alg:heatmap_to_segmentation}
 % older versions of algorithm2e have \dontprintsemicolon instead
 % of the following:
 %\DontPrintSemicolon
 % older versions of algorithm2e have \linesnumbered instead of the
 % following:
 %\LinesNumbered
 
\KwIn{Class thresholds $c_{\text{IPH}},\dots, c_{\text{EDH}}.$ Mask thresholds $m_{\text{IPH}},\dots, m_{\text{EDH}}.$ Neural network $N$, image $I$, image size $(h,w)$.}
\KwOut{Segmentation masks $M$}
$M \leftarrow $ np.zeros((6, h, w)) \\
\For{$i\leftarrow $ \ ICH subtypes}{
   \uIf {$N(I)[i] > c_i$} {
  $M[i] \leftarrow \ \text{np.where}(\text{Grad-CAM}(I) > m_i, \text{Grad-CAM}(I), 0)$\;
  }
}
$M \leftarrow $ np.argmax(M, 0) \\ 
\Return $M$
\end{algorithm2e}

\section{Performance for Pure Visualization  \label{sec:cam_for_visualization}}
We aim to demonstrate that when the ground truth labels are known (i.e., after eliminating all false positive and false negative cases), although our model achieves the highest performance, advanced models such as ResNetRS and RDNet are also suitable for ICH quantification. First, we describe how we selected our mask thresholds. We rescaled all heatmaps to the range $[0,255]$, and then applied a grid search over the thresholds $m = 25, 50, 75, 100, 125, 150, 175, 200$ to evaluate subtype performance. As shown in Fig.~\ref{fig:multi-label_illustration}, we empirically observed that these subtype heatmaps are mutually exclusive. Therefore, we tested the subtype thresholds jointly. The final mask thresholds are summarized in Table~\ref{tab:vis_hyper}.

\begin{table}[htbp]
 % The first argument is the label.
 % The caption goes in the second argument, and the table contents
 % go in the third argument.
\floatconts
  {tab:vis_hyper}%
  {\caption{Selected mask thresholds. }}%
  {\begin{tabular}{llllllll}
  \bfseries Model &  \bfseries $\mathbf{m_{IPH}}$ & \bfseries $\mathbf{m_{IVH}}$ & \bfseries $\mathbf{m_{SAH}}$ & \bfseries $\mathbf{m_{SDH}}$ & \bfseries $\mathbf{m_{EDH}}$ \\
  ResNet152 & 150 & 150 & 175 &150 &175 \\
  DenseNet121 & 175 &200 &200 &175 &200  \\
  DenseNet121* & 200&200&200& 200 &200  \\
  ResNetRS & 150 & 175 &150 &150 &175 \\
  RDNet  & 175 &200 &175 &175 &200 \\
  Ours  & 75 &  125 & 125 & 75 &  25 \\
  \end{tabular}}
\end{table}

The final results are presented in Table~\ref{tab:dice_best}. We observed that ResNetRS, RDNet, and our model all achieved a Dice coefficient above 0.5 for IPH. We believe these results are sufficiently accurate for situations where clinicians need to include quantified information in radiology reports, a task that was previously performed using less precise estimation methods.

\begin{table}[htbp]
 % The first argument is the label.
 % The caption goes in the second argument, and the table contents
 % go in the third argument.
\floatconts
  {tab:dice_best}%
  {\caption{DICE coefficients on the BHSD dataset, using the ground truth labels to generate heatmaps. }}%
  {\begin{tabular}{llllllll}
  \bfseries Model & \bfseries Normal & \bfseries IPH & \bfseries IVH & \bfseries SAH & \bfseries SDH & \bfseries EDH & \bfseries AVG\\
  ResNet152 & 0.996& 0.359 & 0.116 & 0.105 &0.158 &0.281 & 0.336\\
  DenseNet121 &0.995& 0.425 &0.162 &0.122 &0.166 &0.303 & 0.362 \\
  DenseNet121* & 0.993& 0.286&0.151&0.084& 0.131 &0.205 & 0.308 \\
  ResNetRS &0.998& 0.552 & 0.245 &0.193 &0.243 &0.425 & 0.443\\
  RDNet & 0.997 & 0.502 &0.264 &0.173 &0.234 &0.373 & 0.424\\
  Ours &0.998 & 0.607 &  0.424 & 0.254 & 0.281 &  0.445 & 0.502\\
  \end{tabular}}
\end{table}




\begin{table}
 % The first argument is the label.
 % The caption goes in the second argument, and the table contents
 % go in the third argument.
\floatconts
  {tab:dice_thres05}%
  {\caption{DICE coefficients on the BHSD dataset, using the ground truth labels to generate heatmaps  and with all mask thresholds equal to 125. }}%
  {\begin{tabular}{llllllll}
  \bfseries Model & \bfseries Normal & \bfseries IPH & \bfseries IVH & \bfseries SAH & \bfseries SDH & \bfseries EDH & \bfseries AVG\\
  ResNet152 & 0.993& 0.354 & 0.112 & 0.094 &0.161 &0.242 & 0.326\\
  DenseNet121 &0.990& 0.352 &0.106 &0.085 &0.140 &0.186 & 0.310 \\
  DenseNet121* & 0.978& 0.169&0.057&0.051& 0.085 &0.123 & 0.244 \\
  ResNetRS &0.996& 0.539 & 0.211 &0.183 &0.249 &0.361 & 0.423\\
  RDNet & 0.994 & 0.468 &0.182 &0.143 &0.213 &0.295 & 0.383\\
  Ours &0.998 & 0.576 &  0.424 & 0.254 & 0.236 &  0.255 & 0.457\\
  \end{tabular}}
\end{table}





\begin{table}
 % The first argument is the label.
 % The caption goes in the second argument, and the table contents
 % go in the third argument.
\floatconts
  {tab:dice_thres03}%
  {\caption{DICE coefficients on the BHSD dataset, using the ground truth labels to generate heatmaps and with all mask thresholds equal to 75. }}%
  {\begin{tabular}{llllllll}
  \bfseries Model & \bfseries Normal & \bfseries IPH & \bfseries IVH & \bfseries SAH & \bfseries SDH & \bfseries EDH & \bfseries AVG\\
  ResNet152 & 0.988& 0.293 & 0.089 & 0.071 &0.138 &0.171 & 0.291\\
  DenseNet121 &0.983& 0.263 &0.078 &0.062 &0.108 &0.128 & 0.270 \\
  DenseNet121* & 0.960& 0.100&0.035&0.035& 0.062 &0.079 & 0.211 \\
  ResNetRS &0.993& 0.450 & 0.158 &0.140 &0.211 &0.265 & 0.369\\
  RDNet & 0.990 & 0.377 &0.126 &0.105 &0.166 &0.212 & 0.329\\
  Ours &0.997 & 0.607 &  0.366 & 0.207 & 0.281 &  0.381 & 0.473\\
  \end{tabular}}
\end{table}

\begin{table}
 % The first argument is the label.
 % The caption goes in the second argument, and the table contents
 % go in the third argument.
\floatconts
  {tab:dice_thres01}%
  {\caption{DICE coefficients on the BHSD dataset, using the ground truth labels to generate heatmaps  and with all mask thresholds equal to 25. }}%
  {\begin{tabular}{llllllll}
  \bfseries Model & \bfseries Normal & \bfseries IPH & \bfseries IVH & \bfseries SAH & \bfseries SDH & \bfseries EDH & \bfseries AVG\\
  ResNet152 & 0.973& 0.197 & 0.057 & 0.042 &0.083 &0.099 & 0.242\\
  DenseNet121 &0.967& 0.167 &0.047 &0.037 &0.071 &0.070 & 0.226 \\
  DenseNet121* & 0.924& 0.053&0.021&0.022& 0.041 &0.043 & 0.184 \\
  ResNetRS &0.986& 0.322 & 0.097 &0.088 &0.143 &0.161 & 0.299\\
  RDNet & 0.980 & 0.262 &0.076 &0.064 &0.101 &0.125 & 0.268\\
  Ours &0.996 & 0.577 &  0.293 & 0.168 & 0.253 &  0.445 & 0.455\\
  \end{tabular}}
\end{table}

\section{Hyperparameter Setting}
\label{app:hyperparameter}

For the hyperparameters in Equations \ref{eq:closs} and \ref{eq:hloss}, in this work, we set $\alpha_0 = 2, \alpha_1 = 0.5, \alpha_2 = 0.25, \alpha_3 =0.125, \beta_0 = 0.5, \beta_1 = 0.25$, and $\beta_2 = 0.125.$ Additionally, the scaling factor $r$ in the LSE pooling layer (Equation \ref{eq:lse}) is set to 3.

 To demonstrate that our method is robust to the choice of hyperparameters, we double or halve $\beta_0, \beta_1,$ and $\beta_2$ and retrain the models for 10 epochs. For simplicity, we use the same mask thresholds as in our default setting in Table~\ref{tab:vis_hyper}, which potentially leads to suboptimal outcomes. The results are presented in Table~\ref{tab:dice_ablation}.

\begin{table}
 % The first argument is the label.
 % The caption goes in the second argument, and the table contents
 % go in the third argument.
\floatconts
  {tab:dice_ablation}%
  {\caption{DICE coefficients with different hyperparameters. Double means that we double the parameters $\beta_0, \beta_1,$ and $\beta_2$; similarly, Half means that we halve $\beta_0, \beta_1,$ and $\beta_2$. }}%
  {\begin{tabular}{llllllll}
  \bfseries Model & \bfseries Normal & \bfseries IPH & \bfseries IVH & \bfseries SAH & \bfseries SDH & \bfseries EDH & \bfseries AVG\\
  Half & 0.996 & 0.530 &0.294 &0.206 &0.243 &0.291 & 0.427\\
  Double &0.996 & 0.539 &  0.286 & 0.199 & 0.222 &  0.265 & 0.418\\
  \end{tabular}}
\end{table}

\section{Results for different hyperparameters}
\label{app:diff_hyperparameter}

We discovered that mask thresholds play a key role in the final heatmap quantization. Some of the results are presented in Table \ref{tab:dice_thres05}, Table \ref{tab:dice_thres03}, and Table \ref{tab:dice_thres01}.
\section{Heuristics about the Heatmap Consistency Loss}
\label{app:HCL}
 In this section we would like to provide an intuitive explanation for the heatmap consistency loss in Equation \ref{eq:hloss}. Following the notation in Section \ref{sec:loss}, locally the loss will be

 \begin{equation}
f^{k,1}_{i,j} + f^{k,2}_{i,j}+ f^{k,3}_{i,j}+ f^{k,4}_{i,j} = f^{k+1}_{i,j},
\end{equation}

where $f^{k+1}_{i,j}$ is the value of $f^{k+1}$ at spatial location $(i,j)$, and $f^{k,1}_{i,j}, \dots,  f^{k,4}_{i,j}$ are values of $f^{k}$ lying above $(i,j)$. Now we assume  $f^{k,l}_{i,j} \geq f_{min}$, for a designed minimum $f_{min}$.  For $r > 0$, we rewrite the
LSE function in Equation \ref{eq:lse_def} as
\begin{equation} 
\log \Bigl( \sum_{i,j} \bigl( \exp (r \cdot f^{k,1}_{i,j}) + \exp (r \cdot f^{k,2}_{i,j})+ \exp (r \cdot f^{k,3}_{i,j}) + \exp (r \cdot f^{k,4}_{i,j}) \bigr) \Bigr) = \log \Bigl( \sum_{i,j} L^k_{i,j} \Bigr).
\end{equation}

On the other hand,  from the inequality of arithmetic and geometric means,

\begin{equation}
\frac{L^k_{i,j}}{4} \geq \Bigl( \exp \bigl( r \cdot ( f^{k,1}_{i,j} + f^{k,2}_{i,j} + f^{k,3}_{i,j}  + f^{k,4}_{i,j} ) \bigr) \Bigr)^{\frac{1}{4}} = \exp \Bigl( \frac{r \cdot f^{k+1}_{i, j}}{4} \Bigr),
\end{equation}

and the unique minimum is achieved when $f^{k,1}_{i,j} = f^{k,2}_{i,j} = f^{k,3}_{i,j}  = f^{k,4}_{i,j}$. On the other hand, to find the maximum value, without loss of generality, we assume 
$ f^{k,1}_{i,j} > f^{k,2}_{i,j} \geq f^{k,3}_{i,j}  \geq f^{k,4}_{i,j}$ and rewrite $f^{k,4}_{i,j} =  f^{k+1}_{i, j} - ( f^{k,1}_{i,j} + f^{k,2}_{i,j} + f^{k,3}_{i,j}  )$; then

\begin{equation}
\frac{\partial L^k_{i,j} }{\partial f^{k,1}_{i,j}} = r \cdot ( \exp (r \cdot f^{k,1}_{i,j}) - \exp (r \cdot f^{k,4}_{i,j}  )) > 0, 
\end{equation}

which means that we can increase $L^k_{i,j}$ by increasing $f^{k,1}_{i,j}$ until $f^{k,4}_{i,j}$ achieves the designed lower bound $f_{min}$. Recursively following the argument, we find the maximum value is achieved when

\begin{equation}
f^{k,1}_{i,j} = f^{k+1}_{i, j} - 3 \cdot f_{min}, \ \ f^{k,2}_{i,j} = f^{k,3}_{i,j} = f^{k,4}_{i,j} = f_{min}.
\end{equation}

In sum, if the spatial location $(i, j)$ belongs to the pathological region, then only one of the values $f^{k,l}_{i,j}$ achieves the extreme value while the remaining ones tend to achieve the minimum value. On the other hand, if the spatial location $(i, j)$ belongs to the normal region, all four values tend toward $\frac{f^{k+1}_{i,j}}{4}$; hence the heatmap will be more fine-grained in the outputs of the upper levels.
\end{document}
