% This is samplepaper.tex, a sample chapter demonstrating the
% LLNCS macro package for Springer Computer Science proceedings;
% Version 2.21 of 2022/01/12
%
\documentclass[runningheads]{llncs}
%
\usepackage{amssymb}
\usepackage{amsmath}
\usepackage{bm}
\usepackage[T1]{fontenc}
% T1 fonts will be used to generate the final print and online PDFs,
% so please use T1 fonts in your manuscript whenever possible.
% Other font encodings may result in incorrect characters.
%
\usepackage{graphicx}
\usepackage{adjustbox}
\usepackage{lipsum} % For dummy text
% Used for displaying a sample figure. If possible, figure files should
% be included in EPS format.
%
% If you use the hyperref package, please uncomment the following two lines
% to display URLs in blue roman font according to Springer's eBook style:
\usepackage{color}
\usepackage{multirow}
\renewcommand{\thetable}{\arabic{table}}
\usepackage[colorlinks, linkcolor=blue, urlcolor=blue, anchorcolor=blue, citecolor=blue]{hyperref}
%
\begin{document}
%
\title{Mammo-Net: Integrating Gaze Supervision and Interactive Information in Multi-view Mammogram Classification}
%
\titlerunning{Mammo-Net for Multi-view Mammogram Classification}
% If the paper title is too long for the running head, you can set
% an abbreviated paper title here
%
\author{Changkai Ji\inst{1,2} \and 
Changde	Du\inst{2} \and
Qing Zhang\inst{3} \and
Sheng Wang\inst{1,4,5} \and
Chong Ma\inst{6} \and
Jiaming	Xie\inst{7} \and
Yan	Zhou\inst{3} \and
Huiguang He\inst{1,2*} \and
Dinggang Shen\inst{1,5,8*}} %1{Ji, Changkai} 2{Du, Changde} 3{Zhang, Qing} 4{Wang, Sheng} 5{Ma, Chong} 6{Xie, Jiaming} 7{Zhou, Yan} 8{He, Huiguang} 9{Shen, Dinggang}  index{Last Name, First Name}
%
\authorrunning{C. Ji et al.}
% First names are abbreviated in the running head.
% If there are more than two authors, 'et al.' is used.
%
\institute{School of Biomedical Engineering, ShanghaiTech University, Shanghai, China \\ \email{\{jichk, dgshen\}@shanghaitech.edu.cn} \and
State Key Laboratory of Multimodal Artificial Intelligence Systems, Institute of Automation, Chinese Academy of Sciences, Beijing, China \\ \email{huiguang.he@ia.ac.cn}\and
Department of Radiology, Renji Hospital, Shanghai Jiao Tong University School of Medicine, Shanghai, China \and
Institute for Medical Imaging Technology, School of Biomedical Engineering, Shanghai Jiao Tong University, Shanghai, China \and
Shanghai United Imaging Intelligence Co., Ltd., Shanghai, China  \and
School of Automation, Northwestern Polytechnical University, Xi'an, China \and
Department of Computer Science, The University of Hong Kong, Hong Kong \and
Shanghai Clinical Research and Trial Center, Shanghai, China}
%
\maketitle              % typeset the header of the contribution
%
\begin{abstract}
Breast cancer diagnosis is a challenging task. Recently, the application of deep learning techniques to breast cancer diagnosis has become a popular trend. However, the effectiveness of deep neural networks is often limited by the lack of interpretability and the need for a significant amount of manual annotation. To address these issues, we present a novel approach by leveraging both gaze data and multi-view data for mammogram classification. The gaze data of the radiologist serves as a low-cost and simple form of coarse annotation, which can provide rough localizations of lesions. We also develop a pyramid loss better suited to the gaze-supervised process. Moreover, considering that many studies overlook interactive information relevant to diagnosis, we accordingly utilize transformer-based attention in our network to mutualize multi-view pathological information, and further employ a bidirectional fusion learning (BFL) to more effectively fuse multi-view information. Experimental results demonstrate that our proposed model significantly improves both mammogram classification performance and interpretability through incorporation of gaze data and cross-view interactive information.

\keywords{Mammogram classification \and Gaze \and Multi-view interaction \and Bidirectional fusion learning}
\end{abstract}
%
%
%
\section{Introduction}
Breast cancer is the most prevalent form of cancer among women and can have serious physical and mental health consequences if left unchecked \cite{giaquinto2022cancer}. Early detection through mammography is critical for early treatment and prevention \cite{selvi2014breast}. Mammograms provide images of breast tissue, which are taken from two views: the cranio-caudal (CC) view, and the medio-lateral oblique (MLO) view \cite{frazer2021evaluation}. By identifying breast cancer early, patients can receive targeted treatment before the disease progresses.

Deep neural networks have been widely adopted for breast cancer diagnosis to alleviate the workload of radiologists. However, these models often require a large number of manual annotations and lack interpretability, which can prevent their broader applications in breast cancer diagnosis. Radiologists typically focus on areas with breast lesions during mammogram reading \cite{kundel2008using,voisin2013investigating}, which provides valuable guidance. We propose using real-time eye tracking information from radiologists to optimize our model. By using gaze data to guide model training, we can improve model interpretability and performance \cite{wu2019eye}. 

Radiologists' eye movements can be automatically and unobtrusively recorded during the process of reading mammograms, providing a valuable source of data without the need for manual labeling. Previous studies have incorporated radiologists' eye-gaze as a form of weak supervision, which directs the network's attention to the regions with possible lesions \cite{wang2022follow,ma2023eye}. Leveraging gaze from radiologists to aid in model training \textit{not only} increases efficiency and minimizes the risk of errors linked to manual annotation, \textit{but also} can be seamlessly implemented  without affecting radiologists' normal  clinical interpretation of mammograms.

Mammography primarily detects two types of breast lesions: masses and microcalcifications \cite{moreira2012inbreast}. The determination of the benign or malignant nature of masses is largely dependent on the smoothness of their edges \cite{li2021domain}. The gaze data can guide the model's attention towards the malignant masses. Microcalcifications are small calcium deposits which exhibit irregular boundaries on mammograms \cite{jorgensen2015breast}. This feature makes them challenging to identify, often leading to missed or false detection by models. Radiologists need to magnify mammograms to differentiate between benign scattered calcifications and clustered calcifications, the latter of which are more likely to be malignant and necessitate further diagnosis. Leveraging gaze data can guide the model to locate malignant calcifications.


In this work, we propose a novel diagnostic model, namely Mammo-Net, which integrates radiologists' gaze data and interactive information between CC-view and MLO-view to enhance diagnostic performance. To the best of our knowledge, this is the first work to integrate gaze data into multi-view mammography classification. We utilize class activation map (CAM) \cite{ouyang2020learning} to calculate the attention maps for the model. Additionally, we apply pyramid loss to maintain consistency between radiologists' gaze heat maps and the model's attention maps at multiple scales of the pyramid \cite{adelson1984pyramid}. Our model is designed for single-breast cases. Mammo-Net extracts multi-view features and utilizes transformer-based attention to mutualize information \cite{vaswani2017attention}. Furthermore, there are differences between multi-view mammograms of the same patient, arising from variations in breast shape and density. Capturing these multi-view shared features can be a challenge for models. To address this issue, we develop a novel method called bidirectional fusion learning (BFL) to extract shared features from multi-view mammograms.

Our contributions can be summarized as follows:


\begin{itemize}
    \item[\textbullet] We emphasize the significance of low-cost gaze to provide weakly-supervised positioning and visual interpretability for the model. Additionally, we develop a pyramid loss that adapts to the supervised process.
    \item[\textbullet] We propose a novel breast cancer diagnosis model, namely Mammo-Net. This model employs transformer-based attention to mutualize information and uses BFL to integrate task-related information to make accurate predictions.
    \item[\textbullet] We demonstrate the effectiveness of our approach through experiments using mammography datasets, which show the superiority of Mammo-Net.
\end{itemize}

\section{Proposed Method}
\subsection{Overall Architecture}
The pipeline of Mammo-Net is illustrated in Fig. \ref{fig:image1}. Mammo-Net feeds two-view mammograms of the same breast into two ResNet-style \cite{he2016deep} CNN branch networks. We use several ResNet blocks pre-trained on ImageNet \cite{deng2009imagenet} to process mammograms. Then, we use global average pooling (GAP) and fully connected layers to compute the feature vectors produced by the model. Before the final residual block, we employ cross-view attention to mutualize multi-view information. Our proposed method employs BFL to effectively fuse multi-view information to improve diagnostic accuracy. Additionally, by integrating gaze data from radiologists, our proposed model is able to generate more precise attention maps. The fusion network combines multi-view feature representations using a stack of linear-activation layers and a fully connected layer, resulting in a classification output.

\begin{figure}[htb]
\includegraphics[width=\textwidth]{1.pdf}
\caption{Mammo-Net consists of two components: a multi-view classification network (upper half) and an attention consistency module (lower half). The classification network interacts multi-view information, while the attention consistency module provides positional supervision.} \label{fig:image1}
\end{figure}

\subsection{Gaze Supervision}
In this module, we utilize CAM to calculate the attention map for the network by examining gradient-based activations in back-propagation. After that, we employ pyramid loss to make the network attention consistent with the supervision provided by radiologists' gaze heat maps, guiding the network to focus on the same lesion areas as the radiologists. This module guides the network to accurately extract pathological features.

\noindent\textbf{Class Activation Map.} At the final convolutional layer of our model, the activation of the $i$th feature map $f_i(x, y)$ at coordinates $(x,y)$ is associated with a weight $w^k_i$ for class $k$. This allows us to generate the attention map $H^k$ for class $k$ as:
\begin{equation}
H^k = \sum\limits_{i}w_i^kf_i(x, y).
\end{equation}

\noindent\textbf{Pyramid Loss.} To enhance the learning of important attention areas, we propose a pyramid loss constraint that requires consistency between the network and gaze attention maps. The pyramid loss is based on using a pyramid representation of the attention map:
\begin{equation}
\mathcal{L}_{Pyramid} = \sum_{l=1}^{L}\left\lVert \left(Z(G_l(H))\right)^+ - \left(Z(G_l(R))\right)^+ \right\rVert_2,
\end{equation}
where $H$ is the network attention map generated by the CAM and $R$ is the radiologist's gaze heat map. $G_l(\cdot)$ represents the feature map at the $l$th level of the Gaussian pyramid, obtained by downsampling $G_{l-1}(\cdot)$ using a Gaussian kernel, where $G_1(R)=R$. $Z$ means to perform Layernorm and ReLU activation on each feature map. This focuses the consistency loss on the more important pathological regions. The positive part of the normalized $Z(R)$, denoted as $Z(R)^+$, indicates the network focuses on the lesions where the radiologist spent most time reading. The minimization of the pyramid loss involves calculating the mean square error (MSE) between the attention maps generated by the radiologist and the model at each level of the Gaussian pyramid. This allows the model to mimic the attention of radiologists and enhance diagnostic performance. 

Moreover, the pyramid representation enables the model to learn from the important pathological regions on which radiologists are focusing, without the need for precise pixel-level information. Layernorm is also employed to address the issue of imprecise gaze data. This reduces noise in the consistency process by applying the consistency loss only in the regions where the radiologist spent the most time.

\subsection{Interactive Information}
\noindent\textbf{Transformer-based Mutualization Model.} We use transformer-based attention to mutualize information from the two views at the level of the spatial feature map. For each attention head, we compute embeddings for the source and target pixels. Our model does not utilize positional encoding, because positional encoding encodes the relative position of each pixel and is not suitable for capturing information between different views of mammograms \cite{vaswani2017attention}. The target view feature maps are transformed into $Q$, the source view feature maps are transformed into $K$, and the original source feature maps are transformed into $V$. We can then obtain a weighted sum of the features from the source view for each target pixel using \cite{vaswani2017attention}: 
\begin{equation}
\operatorname{Attention}(Q, K, V) = \operatorname{softmax}\left(\frac{QK^T}{\sqrt{d_k}}\right)V.
\end{equation}

Subsequently, the output is transformed into attention-based feature maps $X$ and mutualized with the feature maps $Y$ from the other view. The mutualized feature maps are normalized and used for subsequent calculations:
\begin{equation}
Z = \operatorname{Norm}\left(Y+\operatorname{Linear}(X)\right).
\end{equation}

\noindent\textbf{Bidirectional Fusion Learning.} To enable the fusion network to retain more of the shared features between the two views and filter out noise, we propose to use BFL to learn a fusion representation that maximizes the cross-view mutual information. The optimization target is to generate a fusion representation $I$ from multi-view representations $p_v$, where $v \in \{cc, mlo\}$. We employ the Noise-Contrastive Estimation framework \cite{gutmann2010noise} to maximize the mutual information, which is a contrastive learning framework: 
\begin{equation}
\mathcal{L}(I, \bm{P}_v) = -\mathbb{E}_{\bm{P}}\left[\log\frac{s(I, p_v^i)}{\sum_{p^j_{v} \in \bm{P}_{v}}s(I, p^j_{v})}\right],
\end{equation}
where $s(I, p_v)$ evaluates the correlation between multi-view fused representations and single-view representations \cite{oord2018representation}:
\begin{equation}
\begin{aligned}
    s(I, p_v) = \exp\left(\overline{p_v\vphantom{N}}\left(\overline{N\left(I\right)}\right)^T\right),\\
    \overline{p_v\vphantom{N}} = \frac{p_v}{\lVert p_v\rVert_2}, \qquad \overline{N(I)} = \frac{N(I)}{\lVert N(I)\rVert_2},
\end{aligned}
\end{equation}
where $N(I)$ is a reconstruction of $p_v$ generated by a fully connected network $N$ from $I$ and the Euclidean norm $\lVert\cdot\rVert_2$ is applied to obtain unit-length vectors. In contrastive learning, we consider mammograms of the same patient as positive samples and mammograms of different patients in the same batch $\bm{\tilde{P}}_v^i = \bm{P}_v \setminus \{p_v^i\}$ as negative samples \cite{oord2018representation}. Maximizing the similarity between mammograms of the same patient enables the model to learn shared features. Maximizing the dissimilarity between mammograms of different patients enhances the model's robustness. 

In short, we require the fusion representation $I$ to reversely reconstruct multi-view representations $p_v$ so that more view-invariant information can be passed to $I$. By aligning the prediction $N(I)$ to $p_v$, we enable the model to decide how much information it should receive from each view.

The overall loss function for this module is the sum of the losses defined for each view, where $\mathcal{L}_I^{v}$ denotes $\mathcal{L}(I, \bm{P}_v)$ for $v \in \{cc, mlo\}$:
\begin{equation}
\mathcal{L}_{BFL} = \mathcal{L}_I^{cc} + \mathcal{L}_I^{mlo}.
\end{equation}

\subsection{Loss Function}
 We use binary cross entropy loss (BCE) between the network prediction and the ground-truth as the classification loss. In conclusion, we have proposed a total of three loss functions to guide the model training: $\mathcal{L}_{BCE}$, $\mathcal{L}_{BFL}$, and $\mathcal{L}_{Pyramid}$. The overall loss function is defined as the sum of these three loss functions, with coefficients $\lambda$ and $\mu$ used to adjust their relative weights:
\begin{equation}
\mathcal{L}_{overall} = \mathcal{L}_{BCE} + \lambda \mathcal{L}_{Pyramid} + \mu \mathcal{L}_{BFL}.
\end{equation}

\section{Experiments and Results}
\subsection{Datasets}
\noindent\textbf{Mammogram dataset.} 
Our experiments were conducted on CBIS-DDSM \cite{lee2017curated} and INbreast \cite{moreira2012inbreast}. The CBIS-DDSM dataset contains 1249 exams that have been divided based on the presence or absence of masses, which we used to perform mass classification. The INbreast dataset contains 115 exams with both masses and micro-calcifications, on which we performed benign and malignant classification. We split the INbreast dataset into training and testing sets in a 7:3 ratio. It is worth noting that the official INbreast dataset does not provide image-level labels, so we obtained these labels following Shen et al. \cite{shen2019deep}.

\noindent\textbf{Eye gaze dataset.} Eye movement data was collected with a Tobii Pro Nano eye tracker while a radiologist reviewed all cases in INbreast. The scenario is shown in the Appendix and can be accessed at \url{https://github.com/JamesQFreeman/MicEye}. The participating radiologist has 11 years of experience in mammography screening.

\subsection{Implementation Details}
We trained our model using the Adam optimizer \cite{kingma2014adam} with a learning rate of $10^{-4}$ (partly implemented by MindSpore). To overcome the problem of limited data, we employed various data augmentation techniques, including translation, rotation, and flipping. To address the problem of imbalanced classes, we utilized a weighted loss function that assigns higher weights to malignant cases in order to balance the number of benign and malignant cases. The coefficients $\lambda$ and $\mu$ of $\mathcal{L}_{overall}$ were set to $0.5$ and $0.2$, respectively, based on 5-fold cross validation on the training set. The network was trained for 300 epochs. We used Accuracy (ACC) and the Area Under the ROC Curve (AUC) \cite{wu2019deep} as our evaluation metrics, and we selected the final model based on the best validation AUC. Considering the relatively small size of our dataset, we used ResNet-18 as the backbone of our network. 

\subsection{Results and Analysis}
\begin{table}[h]
  \caption{Ablation study of key components of Mammo-Net, and comparison of different models in terms of AUC and ACC. ``BFL'' denotes ``Bidirectional Fusion Learning'', and ``RA'' denotes ``Radiologist Attention''.}
  \renewcommand{\arraystretch}{1.2}
  \setlength{\tabcolsep}{12pt}
  \centering
    \begin{adjustbox}{max width=\textwidth}
      \begin{tabular}{llll}
        \hline 
        \textbf{Dataset} & \textbf{Model} & \textbf{AUC} & \textbf{ACC} \\
        \hline 
        \multirow{7}{*}{\textbf{CBIS-DDSM}} 
     & Lopez et al. \cite{lopez2022multi} & 0.739 & 0.754 \\
     & Tulder et al. \cite{cheng2021depth} & 0.802 & 0.811 \\
     & Xian et al. \cite{xian2021towards} & 0.812 & 0.735 \\
     \cline{2-4}
     & MLO-view & 0.701 & 0.763 \\
     & CC-view & 0.721 & 0.754 \\
     & Cross-view & 0.809 & 0.838 \\
     & Cross-view+BFL & \textbf{0.821} & \textbf{0.864} \\
        \hline
        \multirow{10}{*}{\textbf{INbreast}} 
     & Wang et al. \cite{wang2022follow} & 0.806 & 0.756 \\
     & Jiang et al. \cite{jiang2023eye} & 0.819 & 0.793 \\
     & Lopez et al. \cite{lopez2022multi} & 0.793 & 0.830 \\
     & Xian et al. \cite{xian2021towards} & 0.859 & 0.791 \\
     \cline{2-4}
     & MLO-view & 0.663 & 0.716 \\
     & CC-view & 0.650 & 0.704 \\
     & Cross-view & 0.762 & 0.755\\
     & Cross-view+BFL & 0.786 & 0.812 \\
     & Cross-view+RA & 0.864 & 0.830 \\
        & Cross-view+BFL+RA (Mammo-Net)  & \textbf{0.889} & \textbf{0.849}\\
        \hline
      \end{tabular}
    \end{adjustbox}
  \label{tab:table_label_two}
\end{table}



\noindent\textbf{Performance Comparison.} As shown in Table~\ref{tab:table_label_two}, we compare our model to other methods and find that our model performs better. Lopez et al. \cite{lopez2022multi} proposed the use of hypercomplex networks to mimic radiologists. By leveraging the properties of hypercomplex algebra, the model is able to continually process two mammograms together. Xian et al. \cite{xian2021towards} proposed a 2-channel approach that utilizes a Gaussian model to capture the spatial correlation between lesions across two views, and an LT-GAN to achieve a robust mammography classification. 

\begin{figure}[htbp]
\includegraphics[width=\textwidth]{2.pdf}
\caption{Comparative visualization of mammography diagnosis with and without gaze supervision. After integrating gaze supervision, the model's capability in localizing lesions becomes more precise.} \label{fig:image2}
\end{figure}

We also compare our model with other methods that use eye movement supervision as shown in Table~\ref{tab:table_label_two}. The GA-Net \cite{wang2022follow} proposed a ResNet-based model with class activation mapping guided by eye gaze data. We developed a multi-view model using this approach for a fair comparison, and found that our method performed better. We believe that one possible reason for the inferior performance of GA-Net compared to Mammo-Net might be the use of a simple MSE loss by GA-Net, which neglects the coarse nature of the gaze data. Jiang et al. \cite{jiang4247845eye} proposed a Double-model that fuses gaze maps with original images before training. However, this model did not consider the gap between research and clinical workflow. This model requires gaze input during both the training and inference stages, which limits its practical use in hospitals without eye-trackers. In contrast, our method does not rely on gaze input during the inference stage.

\noindent\textbf{Visualization.} Fig.~\ref{fig:image2} illustrates the visualization of our proposed model on three representative exams from the INbreast dataset that include masses, calcifications, and a combination of both. For each exam, we present gaze heat maps generated from eye movement data. The preprocessing process is shown in Fig.~\ref{fig:apdx2} (see Appendix). To make an intuitive comparison, we exhibit attention maps generated by the model under both unsupervised and gaze-supervised cases. Each exam is composed of two views, i.e., the CC-view and the MLO-view. More exams can be found in Fig.~\ref{fig:apdx5} (see Appendix).

The results of the visualization demonstrate that the model's capability in localizing lesions becomes more precise when radiologist attention is incorporated in the training stage. The pyramid loss improves the model's robustness even when the radiologist’s gaze data is not entirely focused on the breast. This intuitively demonstrates the effectiveness of training the model with eye-tracking supervision.

\noindent\textbf{Ablation Study.} We perform an ablation analysis to assess each component (radiologist attention, cross-view attention and BFL) in Mammo-Net. Table \ref{tab:table_label_two} suggests that each part of the proposed framework contributes to the increased performance. This shows the benefits of adapting the model to mimic the radiologist’s decision-making process.


\section{Conclusion and Discussion}
In this paper, we have developed a breast cancer diagnosis model to mimic the radiologist’s decision-making process. To achieve this, we integrate gaze data as a form of weak supervision for both lesion positioning and interpretability of the model. We also utilize transformer-based attention to mutualize multi-view information and further develop BFL to fully fuse multi-view information. Our experimental results on mammography datasets demonstrate the superiority of our proposed model. In future work, we intend to explore the use of scanning path analysis as a means of obtaining insights into the pathology-relevant regions of lesions.


\subsubsection{Acknowledgements.}This work was supported in part by The Key R\&D Program of Guangdong Province, China (grant number 2021B0101420006), National Natural Science Foundation of China (grant numbers 62131015, 82272072), Science and Technology Commission of Shanghai Municipality (STCSM) (grant number 21010502600), and the CAAI-Huawei MindSpore Open Fund.

%
% ---- Bibliography ----
%
% BibTeX users should specify bibliography style 'splncs04'.
% References will then be sorted and formatted in the correct style.
%
% \bibliographystyle{splncs04}
% \bibliography{mybibliography}
%
\bibliographystyle{splncs04}
\bibliography{ref}
\newpage
\section*{Appendix}

\setcounter{figure}{2}
\begin{figure}[htbp]
\includegraphics[width=\textwidth]{appendix1.pdf}
\caption{Examples of breast lesions: including benign and malignant masses, as well as benign and malignant microcalcifications.} \label{fig:apdx1}
\end{figure}

\begin{figure}[htbp]
\includegraphics[width=\textwidth]{appendix3.pdf}
\caption{Data Collection Scenarios.} \label{fig:apdx3}
\end{figure}

The scene of our data acquisition system is shown in Fig. \ref{fig:apdx3}, which shows that our system is designed without interfering with the diagnostic process of radiologists. In addition to eye-tracking data, we also collected physiological information such as pupil size, sitting posture, and heart rate, which may be useful in the future. The use of multiple modalities for the diagnosis of a single disease may be beneficial. For example, the fatigue level of radiologists can be detected through eye movement or pupil size. When radiologists are fatigued, their gaze patterns may change and the reliability of the collected eye movement data may be reduced. In such cases, it may be advisable to decrease the weight of eye movement information during training.

\begin{figure}[htbp]
\centering
\includegraphics[width=\textwidth]{appendix2.pdf}
\caption{The gaze data comprises spatial coordinates $(x, y)$ and timestamp $t$, which are used to compute the attention level. The gaze points are classified into two types: saccade points and fixation points. Saccade points correspond to rapid eye movements, while fixation points correspond to the locations where the eyes focus.} \label{fig:apdx2}
\end{figure}

Fig.~\ref{fig:apdx2} shows the preprocessing process for the eye-tracking data. Fig.~\ref{fig:apdx5} illustrates the visualization of our proposed model on some representative exams.

\begin{figure}[htbp]
\includegraphics[width=\textwidth]{appendix5.pdf}
\caption{Comparative visualization of mammography diagnosis with and without gaze supervision. After integrating gaze supervision, the model's capability in localizing lesions becomes more precise.} \label{fig:apdx5}
\end{figure}

\end{document}
