\documentclass{midl} % Include author names

% The following packages will be automatically loaded:
% jmlr, amsmath, amssymb, natbib, graphicx, url, algorithm2e
% ifoddpage, relsize and probably more
% make sure they are installed with your latex distribution
\usepackage{booktabs} 
% \usepackage{subcaption}
\usepackage{caption}
\usepackage{graphicx}
% \usepackage{hyperref}
\usepackage{xcolor}
\usepackage{mwe} % to get dummy images
\jmlrvolume{-- 160}
\jmlryear{2025}
\jmlrworkshop{Full Paper -- MIDL 2025}
\editors{Accepted for publication at MIDL 2025}

\title[DuoFormer]{DuoFormer: Leveraging Hierarchical Representations by Local and Global Attention Vision Transformer}

 % Use \Name{Author Name} to specify the name.
 % If the surname contains spaces, enclose the surname
 % in braces, e.g. \Name{John {Smith Jones}} similarly
 % if the name has a "von" part, e.g \Name{Jane {de Winter}}.
 % If the first letter in the forenames is a diacritic
 % enclose the diacritic in braces, e.g. \Name{{\'E}louise Smith}

 % Two authors with the same address
 % \midlauthor{\Name{Author Name1} \Email{abc@sample.edu}\and
 %  \Name{Author Name2} \Email{xyz@sample.edu}\\
 %  \addr Address}

 % Three or more authors with the same address:
 % \midlauthor{\Name{Author Name1} \Email{an1@sample.edu}\\
 %  \Name{Author Name2} \Email{an2@sample.edu}\\
 %  \Name{Author Name3} \Email{an3@sample.edu}\\
 %  \addr Address}


% Authors with different addresses:
% \midlauthor{\Name{Author Name1} \Email{abc@sample.edu}\\
% \addr Address 1
% \AND
% \Name{Author Name2} \Email{xyz@sample.edu}\\
% \addr Address 2
% }

%\footnotetext[1]{Contributed equally}

% More complicate cases, e.g. with dual affiliations and joint authorship
\midlauthor{\Name{Xiaoya Tang\nametag{$^{1,2}$}} \Email{xiaoya.tang@utah.edu}\\ 
\Name{Bodong Zhang\nametag{$^{1,3}$}} \Email{bodong.zhang@utah.edu}\\
\Name{Man Minh Ho\nametag{$^{1}$}} \Email{manminhho.cs@gmail.com}\\
\Name{Beatrice S. Knudsen\nametag{$^{4}$}} \Email{beatrice.knudsen@path.utah.edu}\\
\Name{Tolga Tasdizen\nametag{$^{1,3}$}} \Email{tolga@sci.utah.edu}\\
% \addr $^{4}$ Address 4
\addr $^{1}$ Scientific Computing and Imaging Institute, University of Utah, SLC, UT, USA \\
\addr $^{2}$ Kahlert School of Computing, University of Utah, SLC, UT, USA \\
\addr $^{3}$ Electrical and Computer Engineering, University of Utah, SLC, UT, USA \\
\addr $^{4}$ Department of Pathology, University of Utah, Salt Lake City, UT, USA
} 
% This is where the content of your paper goes.  Some random
% notes\footnote{Random footnote are discouraged}:
% \begin{itemize}
% % \item Eprints such as arXiv papers can of course be cited \cite{Hinton:arXiv:2015:Distilling}. We recomend using a \verb|@misc| bibtex entry for these as shown in the sample bibliography.
% % \item You should follow the guidelines provided by the conference.
% % \item Read through the JMLR template documentation for specific \LaTeX
% %   usage questions.
% \item Note that the JMLR template provides many handy functionalities
% such as \verb|\figureref| to refer to a figure,
% e.g. \figureref{fig:example},  \verb|\tableref| to refer to a table,
% e.g. \tableref{tab:example} and \verb|\equationref| to refer to an equation,
% e.g. \equationref{eq:example}.
% \end{itemize}

\begin{document}

\maketitle

\begin{abstract}
Despite the widespread adoption of transformers in medical applications, the exploration of multi-scale learning through transformers remains limited, while hierarchical representations are considered advantageous for computer-aided medical diagnosis. We propose a novel hierarchical transformer model that adeptly integrates the feature extraction capabilities of Convolutional Neural Networks (CNNs) with the advanced representational potential of Vision Transformers (ViTs). Addressing the lack of inductive biases and dependence on extensive training datasets in ViTs, our model employs a CNN backbone to generate hierarchical visual representations. These representations are adapted for transformer input through an innovative patch tokenization process, preserving the inherited multi-scale inductive biases. We also introduce a scale-wise attention mechanism that directly captures intra-scale and inter-scale associations. This mechanism complements patch-wise attention by enhancing spatial understanding while preserving global perception; we refer to the scale-wise and patch-wise attention as local and global attention, respectively. Our model significantly outperforms baseline models in terms of classification accuracy, demonstrating its efficiency in bridging the gap between CNNs and ViTs. The components are designed as plug-and-play for different CNN architectures and can be adapted for multiple applications. The code is available at \href{https://github.com/xiaoyatang/DuoFormer.git}{https://github.com/xiaoyatang/DuoFormer.git}.
\end{abstract}

\begin{keywords}
Vision Transformer, Inductive Bias, Multi-scale features
\end{keywords}

\section{Introduction}
The Vision Transformer (ViT)~\cite{dosovitskiy2020image} adapted transformers from language to vision, demonstrating superior performance over CNNs when pre-trained on large datasets. ViT employs a patch tokenization process that converts images into a sequence of uniform token embeddings. These tokens undergo Multi-Head Self-Attention (MSA), transforming them into queries, keys, and values that capture extensive non-local relationships. Despite their potential, ViTs can underperform similarly-sized ResNets~\cite{he2016deep} when inadequately trained due to their lack of inductive biases such as translation equivariance and locality~\cite{lee2021vision}, which are naturally encoded by CNNs. Recent efforts have focused on mitigating ViTs' limitations by integrating convolutions or adding self-supervised tasks~\cite{liu2021efficient}. Prevalent approaches combine CNN feature extractors with transformer encoders~\cite{araujo2019computing,wu2021cvt,yuan2021incorporating,li2021localvit,d2021convit,zhang2023crossformer,hou2024conv2former}, such as the `hybrid' ViT~\cite{dosovitskiy2020image}. Other methods transfer biases from CNNs to ViTs through knowledge distillation~\cite{touvron2021training}, add a convolutional kernel to the attention matrix to bring translation equivariance~\cite{dai2021coatnet}, or use pooling to build multi-stage transformers~\cite{li2022mvitv2}. Nonetheless, ViTs' uniform representations throughout layers and their non-local receptive fields compared to CNNs limit their ability to capture detailed semantics~\cite{raghu2021vision}, which is important for medical images.

The application of ViTs in medical imaging, particularly in CT and X-ray data, is gaining momentum, showcasing their potential in handling extensive datasets~\cite{shamshad2023transformers}. A notable application in histopathology is presented by \cite{shao2021transmil}, which utilizes transformers to understand correlations between patches in whole slide images (WSIs), demonstrating the adaptability of transformers for complex pathological data. Histopathology image analysis involves examining WSIs to detect and interpret complex tissue structures and cellular details. This analysis faces challenges due to similar appearances between background and tumor areas, as well as the varied scales of visual entities within WSIs. These include differences in sizes of cell nuclei and vascular structures, both of which can significantly impact a model's ability to differentiate between low- and high-risk kidney cancers as an example. Moreover, global features of cancer and its microenvironment, observable only at lower scales, are crucial for various downstream tasks. The neglect of these multiple scales can significantly impair the performance of deep learning models in medical image recognition tasks. CNNs tackle this issue by utilizing a hierarchical structure created by lower and higher stages.
%, which allows them to detect visual patterns from simple low-level edges to complex semantic features. 
Such hierarchical structures are thought to be advantageous for cancer diagnosis and prognosis tasks. However, CNNs fall short in extracting the global contextual information crucial for medical image classification compared to Transformers. By harnessing a hierarchical structure similar to that of CNNs, ViTs can be prevented from overlooking the critical multi-scale features, while also gaining the necessary inductive biases. Most existing works on directly integrating multi-scale information into ViTs vary primarily in the placement of convolutional operations: during patch tokenization~\cite{yuan2021incorporating,xu2021co,guo2022cmt}, within~\cite{guo2022cmt,lin2023scale,fan2024rmt} or between self-attention layers, including query/key/value projections~\cite{wu2021cvt,yuan2021incorporating}, forward layers~\cite{li2021localvit}, or positional encoding~\cite{xu2021co}, etc. Recent advancements, such as those by \cite{liu2024exploiting}, which leverage a feature pyramid and a k-NN graph to enhance local feature representation in histopathological images, reflect a growing trend in adopting hierarchical architectures tailored for medical datasets~\cite{azad2024advances}. Inspired by the Swin Transformer~\cite{liu2021swin}, whose shifting window strategy brings locality to transformers, \citet{chowdary2024med} used different window sizes in attention mechanisms and shifted window blocks to improve the accuracy of thoracic disease classification. \citet{manzari2023medvit} employed both convolutions and pooling before and inside the attention layers for medical data classification. \citet{luo2022hybrid} fused a UNet and a transformer, employing two cross-attention modules to enhance medical image segmentation. \citet{wang2022uctransnet} proposed a channel attention to bridge the semantic gap between different stages of a UNet on medical image segmentation.
\citet{pina2024cell} applied a multi-scale deformable transformer~\cite{zhu2020deformable} to cell detection and classification. Additionally, \citet{guo2023higt} considered a WSI pyramid as a hierarchical graph and employed a hierarchical graph-transformer to communicate between different resolutions of the WSI pyramids, thus improving the analysis of these images. Despite the benefits of hierarchical configurations, a definitive model for medical image analysis has not yet been established. Challenges persist in effectively producing and utilizing features across various scales, with the influence of different scales requiring further exploration.

To address these challenges, we propose a novel hierarchical Vision Transformer model.
% , outlined as follows:
First, our proposed multi-scale tokenization involves a single-layer projection, patch indexing, and concatenation, assembling features from different stages of the CNN into multi-scale tokens, facilitating a richer representation of an image. Second, we introduce a novel local attention mechanism, combined with global patch attention, enabling the model to learn connections between scales. This approach effectively bridges the gap between CNN and Transformer architectures and various scales of features. Finally, our proposed scale token, part of the scale attention, is initialized with a fused embedding derived from hierarchical representations. It enriches the transformer's multi-granularity representation and aggregates scale information, serving as the input for the global patch attention.

% \section{Related Work}
% Various approaches have explored integrating the hierarchical architecture of CNNs into Vision Transformers (ViTs) on general visual tasks, including image classification \cite{fan2024rmt}, object detection\cite{wang2021pyramid,xia2024vit}, and segmentation\cite{hou2024conv2former}. Notable methods emulate the pyramid structure of CNN with stage-wise pooling and convolutional embeddings~\cite{heo2021rethinking} or integrate pooling within the attention mechanism\cite{fan2021multiscale}. Multiple scales have been exploited beyond mere convolution integration. Swin Transformer~\cite{liu2021swin} utilizes a shifting window strategy, while \citet{dong2022cswin} split multi-heads to perform self-attention in horizontal and vertical stripes. \citet{chen2021crossvit} developed a dual-branch architecture that processes varying patch sizes, and \citet{zhang2024mg} implemented a multi-granularity strategy. Beyond application of plain transformer-based models on medical tasks, multiple scales has gained an increasing attention in this field. \citet{wang2022uctransnet} proposed a channel attention to bridge the semantic gap between different stage of a UNet on medical image segmentation.


% More specifically, the inductive biases may derive from the local properties of the visual domain and the multi-scale information provided by CNNs\cite{liu2021efficient,lee2021vision,heo2021rethinking,wu2021cvt,liu2021swin}, while ViTs do not intrinsically embed these features due to their different architectural design. The self-attention mechanism in ViT, while effective at capturing global dependencies, does not inherently provide locality. It has been observed that the internal visual representations in ViTs are more uniform across all layers compared to those in ResNet\cite{raghu2021vision}. With sufficient training data, the lower attention layers in ViT can effectively attend to both local and global features. But without  ample data, ViT struggle to focus locally\cite{raghu2021vision}. These inherent characteristics restrict ViT’s ability to acquire inductive biases solely from training data, leading to a greater need for extensive datasets to learn critical locality and multi-scale information from scratch.

% Recent efforts have focused on mitigating the limitations of ViTs by integrating convolutional inductive biases or incorporating self-supervised tasks. Prevalent approaches often combine CNN feature extractors with transformer encoders \cite{araujo2019computing,dosovitskiy2020image,wu2021cvt,liu2021swin,yuan2021incorporating,fan2021multiscale,li2021localvit,d2021convit}, where a notable example is the 'hybrid' ViT~\cite{dosovitskiy2020image} replacing the embedding layer of a transformer encoder by a ResNet.  Wu et al. \cite{wu2021cvt} integrates convolutional projections into the token embedding and transformer blocks. Another typical method is knowledge distillation~\cite{touvron2021training} which transferred the inductive biases from CNNs teacher to a transformer student. They also developed a modified self-attention mechanism called gated positional self-attention to enhance local perceptions at lower layers\cite{d2021convit}. Lee et al. ~\cite{lee2021vision} suggest enriching patches by spatially shifting images alongside the input image to expand the receptive field and embed more spatial information into visual tokens, which can partially alleviate the issue of limiting local inductive bias.

% Histopathology image analysis, critical in medical diagnostics, involves examining whole slide images (WSIs) to detect and interpret complex tissue structures and cellular details. This analysis faces challenges due to the varied scales of visual entities within WSIs,  such as the differing sizes of cell nuclei and vascular structures, both of which can contribute to a model's task of distinguishing low- and high-risk kidney cancers. Moreover, vital global features of cancer and its microenvironment, observable only at lower scales, are crucial for various downstream tasks. Neglecting these multiple scales can significantly impair the performance of deep learning models in medical image recognition tasks. CNNs tackle this issue by utilizing a hierarchical structure created by lower and higher stages, which allows them to detect visual patterns from simple low-level edges to complex semantic features. Conversely, ViTs employ fixed-scale patches, thereby overlooking crucial multi-scale information within the same image~\cite{raghu2021vision}, which can hinder their performance across diverse tasks. By harnessing a hierarchical structure similar to that of CNNs, ViTs can avoid overlooking critical multi-scale features while also imparting necessary inductive biases. Existing works on directly integrating multi-scale information into ViTs vary primarily in the placement of convolutional operations: during patch tokenization\cite{yuan2021incorporating,xu2021co,guo2022cmt}, within\cite{hou2024conv2former,guo2022cmt,lin2023scale} or between self-attention layers, including query/key/value projections\cite{wu2021cvt,yuan2021incorporating}, forward layers~\cite{li2021localvit}, or positional encoding~\cite{xu2021co}, etc. Despite the benefits of hierarchical configurations~\cite{heo2021rethinking}, a definitive model for visual tasks has yet to emerge. The challenge remains how to effectively produce and utilize visual features across various scales. 
% In response, we propose a novel and insightful hierarchical Vision Transformer model, outlined as follows:
% \begin{itemize}
%     \item[1.] A CNN backbone generates hierarchical image representations from various
% stages. We introduce a novel patch tokenization method to adapt these hierarchical structures for transformer input. The proposed multi-scale tokenization involves a single-layer projection, patch indexing, and concatenation, assembling features from different stages of a CNN into a sequence of multi-scale tokens that provide valuable inductive biases inherited from CNNs.
    
%     \item[2.] A novel local attention module, incorporated with global patch attention, enables the model to recognize associations between every scale of features and also across different scales, thereby expanding ViT’s receptive field and bringing the needed inductive bias.

%     \item[3.]Our proposed scale token, part of the local attention, is initialized with a fused embedding derived from hierarchical representations. It enriches the transformer's multi-granularity representation and aggregates scale information, serving as the input for the global patch attention.
% \end{itemize}

% \section{Related Work}
% \label{sec:related work}
% \subsection{Integrating convolutional inductive biases}
% Various approaches have been explored to amalgamate the strengths of CNNs while preserving the capabilities of ViTs. Most efforts focus on leveraging convolutional inductive biases to enhance performance across visual tasks such as video recognition\cite{fan2021multiscale}, image classification \cite{chen2021crossvit,dong2022cswin,d2021convit,fan2024rmt,heo2021rethinking,hou2024conv2former,li2021localvit,lin2023scale,liu2021swin,touvron2021training,wang2021pyramid,wang2023crossformer++,wu2021cvt,xu2021co,yu2022metaformer, yuan2021incorporating,yuan2021tokens,zhang2024mg}, object detection\cite{dong2022cswin,fan2024rmt,heo2021rethinking,hou2024conv2former,lin2023scale,wang2021pyramid,xia2024vit,xu2021co,yu2022metaformer,yuan2021incorporating,zhang2024mg}, and segmentation\cite{dong2022cswin,fan2024rmt,hou2024conv2former,lin2023scale,wang2022uctransnet,wang2021pyramid,xu2021co,yu2022metaformer,zhang2024mg}. Convolutions are mostly embedded in the tokenization process to incorporate convolutional biases, such as the extraction of low-level features via a CNN stem\cite{yuan2021incorporating} instead of from direct image patches. Li et al.\cite{li2021localvit} bring local inductive bias to ViT by introducing depth-wise convolutions into the feed-forward networks. Convolutions are also applied to the self-attention mechanism by replacing the linear projection at the outset of each self-attention phase with a locally-enhanced feed-forward layer, which can enhance the spatial correlation among tokens\cite{yuan2021incorporating}. Besides direct incorporations,  hierarchical structures of CNNs have been also exploited to bring needed inductive biases. Predominant CNNs, such as ResNet\cite{he2016deep}, typically commence with features of large spatial sizes and smaller channel dimensions. Gradually increasing channel size while reducing spatial dimensions, such a configuration was proven advantageous\cite{heo2021rethinking}. 
% Xu et al.\cite{xu2021co} build such a hierarchy with convolutions and proposed an intricate co-scale mechanism to take advantage of multi-scale information as well as utilized a convolutional position encoding in their attentions. Also quite a few works attempted to build the multiscale pyramid architecture within the transformers, for instance, by segmenting a transformer into several stages with pooling operations preceding each stage\cite{heo2021rethinking} or within the attention mechanism\cite{fan2021multiscale,li2022mvitv2}.

% Beyond mere convolution integration, multiple scales have been exploited differently. The Swin Transformer~\cite{liu2021swin} proposed a shifting window strategy, restricting attention solely within non-overlapping windows while allowing attention computations in new windows to cross boundaries established in preceding layers, thus facilitating interconnections among them.  while Dong et al.~\cite{dong2022cswin} divided multi-heads into two groups to perform self-attention in horizontal and vertical stripes concurrently. Chen et al.~\cite{chen2021crossvit} developed a dual-branch architecture that processes image patches of varying sizes to bolster feature representations and employs a cross-attention mechanism among CLS tokens to amalgamate information across all tokens. Zhang et al.~\cite{zhang2024mg} devised a multi-granularity strategy by segmenting images into varying granularity patches based on their significance and using a slimming module to merge tokens, thus reducing computational overhead. PVT~\cite{wang2021pyramid} engineered a four-stage pyramid structure by progressively diminishing the feature map size without utilizing convolutions and incorporated a spatial-reduction attention mechanism that reduces the spatial scale of Key and Value prior to the attention operation. Yu et al.\cite{yu2022metaformer} claim that with such a four-stage architecture, one can even get rid of attention mechanism by only pooling operations or simple token mixers. Yuan et al.\cite{yuan2021tokens} proposed an overlapping patch split to gradually decrease token length while aggregating local information from adjacent pixels and patches, discovering that a deep-narrow structure enhances ViT efficiency. A recent study~\cite{fan2024rmt} employed a spatial decay matrix to enhance self-attention with spatial priors. 

% UNETR~\cite{hatamizadeh2022unetr} utilizes a transformer encoder within a U-shaped network for 3D segmentation. Wang et al.\cite{wang2022uctransnet} replaced skip connections with a channel attention mechanism to bridge the semantic gap between the encoder and decoder. They further observed improved semantic dependency capture by combining scale-wise and channel-wise attentions\cite{wang2024narrowing}. MCTrans\cite{ji2021multi} integrates multi-scale convolutional features as tokens and employs self-attention to handle category dependencies through a learnable proxy embedding within a cross-attention module. Auxiliary tasks provide a method to integrate inductive biases into transformer models without enlarging the dataset. Gani et al.\cite{gani2022train} and Li et al.\cite{li2022locality} demonstrate how self-supervised tasks and dual-task learning can enhance local feature extraction and overall model performance by focusing on both local and global aspects.


\section{Methodology}
\subsection{Multi-scale Patch Tokenization}
The pipeline of our model is depicted in \figureref{fig:1}. We replaced the embedding layer commonly used in ViTs with a pretrained CNN backbone, which produces hierarchical features with decreasing spatial resolutions and increasing channel dimensions. We introduced a novel patch tokenization process to adapt these hierarchical features for the transformer. This process extracts features from different stages and performs embeddings based on them using single-layer projections. Given the input of an image, $\mathbf{x} \in \mathbb{R}^{H \times W \times 3}$ with $H=W$, we derive hierarchical outputs from multiple stages, denoted as $\mathbf{x}_i \in \mathbb{R}^{P_i \times P_i \times C_i}$ for $i \in \{0,1,2,3\}$. Here, $i$ denotes the  $i^{\text{th}}$ stage in the CNN backbone, where $P_i = \frac{H}{4 \cdot 2^i}$ specifies the spatial resolution, and $C_i$ indicates the channel dimension. We then apply a linear projection to transform all the features into embeddings with dimension $D$. We refer to the subsequent embeddings as multi-scale embeddings, denoted by $\mathbf{x}'_i$, where $\mathbf{x}'_i \in \mathbb{R}^{P_i \times P_i \times D}$, as formulated in \equationref{eq:1}.
\begin{equation}
    \mathbf{x}'_i = \text{Projection}(\mathbf{x}_i)
    \label{eq:1}
\end{equation}
Next, we split the multi-scale embeddings $\mathbf{x}'_i$ into $N$ non-overlapping patches and flatten their spatial dimensions. Thus, for each scale, we obtain a sequence of tokens, each with a spatial size of ${P'_i}^2$, where ${P'_i} = \frac{H}{4 \cdot 2^i \cdot \sqrt{N}}$ for $i \in \{0,1,2,3\}$. We index and concatenate the corresponding tokens across multiple scales for each patch to form the multi-scale tokens $\mathbf{X}^t_{\sum}$. This process is explained by \equationref{eq:2,eq:3}.
\begin{equation}
    \mathbf{x}''_i \in \mathbb{R}^{N \times {P'_i}^2 \times D},
    \label{eq:2}
\end{equation}
where ${P'_i}^2 = \frac{HW}{16 \cdot 4^i \cdot N}$ and $i \in \{0, 1, 2, 3\}$. In \equationref{eq:3}, $S$ denotes the total embedding length of each embedded patch in the multi-scale tokens, i.e., $S = \sum_{i=0}^{3} {P'_i}^2$.
\begin{equation}
    \mathbf{X}^t_{\sum} = \textbf{concat}(\mathbf{x}''_i) \in \mathbb{R}^{N \times S \times D}
    \label{eq:3}
\end{equation}

\begin{figure}[t]
  \centering
  % \fbox{\rule{0pt}{2in} \rule{0.9\linewidth}{0pt}}
  \includegraphics[width=\linewidth]{(20250307)MIDL25FIGURE1&2.pdf}
   % \includegraphics[width=1\linewidth, height=0.36\linewidth]{./FCN8__1_.pdf}
    \caption{Left: Converting an image into hierarchical representations using a CNN backbone. Right: Process 1 illustrates multi-scale patch tokenization, including projection, patch splitting, and concatenation, with blue cubes representing embeddings from multiple scales of a single patch. Various colors and lengths indicate different embedding lengths at each scale in the multi-scale tokens. $S$ denotes the total embedding length for each patch. Process 2 shows learning the scale token from hierarchical representations. $L$ indicates the depth of attention modules.}
   \label{fig:1}
\end{figure}

\subsection{Scale Token}
In local attention for scale, a scale token---akin to the class token---aggregates scale information and is then passed into the global attention. We obtained the scale token $\mathbf{x}_{s}$ by applying a downsampling strategy to the hierarchical representations from the CNN, explained in \equationref{eq:4}. This strategy normalizes the spatial dimensions of embeddings from different scales to $N$, maintaining consistent channel dimensions. $N$ denotes the number of patches. These embeddings are then concatenated along the channel dimension and projected into a dimension $D$ using a simple projection, illustrated by Process 2 in \figureref{fig:1} and outlined in \equationref{eq:5}. The resultant scale token distills important multi-scale information, serves as an effective guide for the local attention, and efficiently aggregates scale information.
\begin{equation}
\begin{aligned}
\Tilde{\mathbf{x}}_0 &= \text{MaxPool}(\text{Conv}(\mathbf{x}_0)), \quad \Tilde{\mathbf{x}}_1 = \text{MaxPool}(\text{Conv}(\mathbf{x}_1)), \\
\Tilde{\mathbf{x}}_2 &= \text{MaxPool}(\mathbf{x}_2), \quad \Tilde{\mathbf{x}}_3 = \mathbf{x}_3, \quad \text{where } \Tilde{\mathbf{x}}_i \in \mathbb{R}^{N \times C_i},\\
\Tilde{\mathbf{X}}_{\sum} &= \textbf{concat}(\Tilde{\mathbf{x}}_0, \Tilde{\mathbf{x}}_1, \Tilde{\mathbf{x}}_2, \Tilde{\mathbf{x}}_3) \in \mathbb{R}^{N \times C}, \quad C = \sum_{i} C_i.
\end{aligned}
\label{eq:4}
\end{equation}
\begin{equation}
    \begin{aligned}
\textbf{x}_{s}&=\text{ReLU(BN(Conv(}\Tilde{\mathbf{X}}_{\sum}\text{)))} \in \mathbb{R}^{N \times D}
    \end{aligned}
    \label{eq:5}
\end{equation}
\subsection{Duo Attention Module}
Our encoder employs local and global attentions to respectively focus on detailed image features and broader contexts, as illustrated in \figureref{fig:2} (Left). The local attention (LMSA) follows the principles of Multi-Head Self-Attention (MSA) but includes an adaptation to incorporate an additional scale dimension. This adaptation integrates multi-scale analysis directly into the attention mechanism and modifies tensor operations to accommodate multi-scale tokens. Learnable 2D positional embeddings are added before the first layer of local attention, encoding scale-wise information for every patch. The implementations are depicted in \figureref{fig:2} (Right), taking the first layer as an example. The input to the first local attention layer is denoted by $\mathbf{X}^{s}_{0}$ in \equationref{eq:6}, and $\mathbf{W}_{qkv}$ is the transformation matrix. \equationref{eq:6} details the calculations performed within a local attention (LMSA) layer in a single head. $\sqrt{d_k}$ is the scaling factor, $n_h$ is the number of attention heads, and $\mathbf{A}$ stands for the attention weights for scales. We use multi-head attention in implementation.
% \#repeat distillation~\cite{touvron2021training} transfer biases from CNNs to ViT. Nonetheless, ViTs' smaller receptive fields compared to CNNs limit distillation~\cite{touvron2021training} transfer biases from CNNs to ViT. Nonetheless, ViTs' smaller receptive fields compared to CNNs limit their ability to capture 
\begin{equation}
    \begin{aligned}
        \mathbf{X}^{s}_{0} &= \textbf{concat}(\mathbf{x}_{s},\mathbf{X}^t_{\sum})+\mathbf{E}_{pos}, \quad \mathbf{E}_{pos}\in \mathbb{R}^{(S+1) \times D},\\
        \mathbf{X}^{s'}_{0} &=\mathbf{X}^{s}_{0} + \text{LMSA}(\text{LN}(\mathbf{X}^{s}_{0})), \quad \mathbf{Y}^{s}_{0} = \mathbf{X}^{s'}_{0} + \text{FFN}(\text{LN}(\mathbf{X}^{s'}_{0})),\\
        \left[\mathbf{q}\,\mathbf{k}\,\mathbf{v}\right]&=\mathbf{X}^{s}_{0}\mathbf{W}_{qkv}, \quad \mathbf{W}_{qkv}\in \mathbb{R}^{D\times 3D}, \quad \mathbf{q}/\mathbf{k}/\mathbf{v}\in \mathbb{R}^{N \times (S+1)\times D },\\
        \mathbf{A}&=\text{Softmax}\left( \frac{\mathbf{q}\mathbf{k}^T}{\sqrt{d_k}}\right), \quad d_k=\frac{D}{n_h}, \quad \mathbf{A} \in \mathbb{R}^{(S+1)\times (S+1)},\\
        \text{LMSA}(\mathbf{X}^{s}_{0})&=\mathbf{A}\mathbf{v}.
    \end{aligned}
    \label{eq:6}
\end{equation}
% \begin{equation}
%     \begin{aligned}
% \left[\mathbf{q}\mathbf{k}\mathbf{v}\right]&=\mathbf{X}^{s}_{0}\mathbf{W}_{qkv},\mathbf{W}_{qkv}\in \mathbb{R}^{D\times 3D},\mathbf{q}/\mathbf{k}/\mathbf{v}\in \mathbb{R}^{N \times (S+1)\times D },\\
%         \mathbf{A}&=\text{Softmax}\left( \frac{\mathbf{q}\mathbf{k}^T}{\sqrt{d_k}}\mathbf{V}\right),d_k=\frac{D}{n_h},\mathbf{A} \in \mathbb{R}^{(S+1)\times (S+1)},\\
%     \end{aligned}
%     \label{eq:7}
% \end{equation}
\begin{figure}[t]
  \centering
  % \fbox{\rule{0pt}{2in} \rule{0.9\linewidth}{0pt}}
  \includegraphics[width=\linewidth]{MIDL25-FIGURE3&4_v2.pdf}
    \caption{Left: Local (\textcolor{blue}{blue arrows}) attention models intra- and inter-scale dependencies, while global (\textcolor{orange}{orange arrows}) attention models relationships among image patches. From top to bottom, the embedding length (spatial size) for a single patch increases from 1 to 64, with rich scale information embedded in the multi-scale tokens. Right: Implementation of the duo attention module, including $L$ layers of local and global attentions, respectively.}
   \label{fig:2}
\end{figure}
After the local scale-wise attention, each patch is expected to encapsulate the necessary details across all scales. The scale token, aggregating key details from this module, is augmented with standard learnable 1D positional embeddings and passed into the global patch attention. Global attention mirrors standard MSA but empirically removes layer normalization (LN), feed-forward networks (FFN), and residual connections, as demonstrated in \figureref{fig:2} (Right). A classifier, consisting of a single linear layer, is attached to the $L^{\text{th}}$ global attention layer, taking the CLS\_token as the final image representation.

\section{Experiments}
\subsection{Experimental Setup}
Our evaluation utilized two datasets, Utah ccRCC and TCGA ccRCC~\cite{zhang2023class}. The Utah ccRCC dataset comprises 49 WSIs from 49 patients, split into training (32 WSIs), validation (10 WSIs), and testing (7 WSIs). Tiles were extracted from marked polygons at $400\times400$ pixel resolution at $10\times$ magnification with a 200-pixel stride and center-cropped to $224\times224$ pixels for model compatibility. The training set included 28,497 Normal/Benign, 2,044 Low Risk, 2,522 High Risk, and 4,115 Necrosis tiles, with validation and test sets proportionately distributed. The TCGA ccRCC dataset features 150 labeled WSIs divided into 30 for training, 60 for validation, and 60 for testing, using similar cropping methods but adjusted strides to gather more training patches. It contains 84,578 Normal/Benign, 180,471 Cancer, and 7,932 Necrosis tiles in the training set, with similar distributions in the validation and test sets. 

All models, including baselines, were trained using the Adam optimizer with \(\beta_1 = 0.9\) and \(\beta_2 = 0.999\), without applying weight decay. For the DuoFormer model, batch sizes were set to 32 for the Utah dataset and 6 for the TCGA dataset. We employed a OneCycle learning rate scheduler that starts from a minimal learning rate, progressively increasing to a set rate of \(1 \times 10^{-4}\). A cross-entropy loss was utilized for training all models. Each model underwent training for 50 epochs on Utah and 200 epochs on TCGA, utilizing early stopping with patience of 20 and 50 epochs, respectively. We saved the best-performing model from the validation data for inference. Model performances were evaluated using balanced accuracy across all classes for both datasets. All computations were performed on an NVIDIA RTX A6000 with 48 GB of memory. Training our model for 50 epochs on a single GPU takes around 17.4 hours. For data augmentation, we applied color jittering, random rotation, center crop, random crop, and random flips horizontally and vertically for training data. We used the mean and standard deviation from ImageNet to normalize the data. For inference, we used only center cropping and the same normalization.

\subsection{Result and Discussion}
We utilized ResNet18 and ResNet50 backbones~\cite{he2016deep} to examine the efficacy of our model under two paradigms: fine-tuning with ImageNet supervised pre-training and transfer learning with pathology (The Cancer Genome Atlas, TCGA~\cite{weinstein2013cancer}, and TULIP) self-supervised pre-training~\cite{kang2023benchmarking}. Results, shown in \tableref{tab:1}, demonstrate that our model outperforms the ResNet baselines by over 2\% across all settings and exceeds various Hybrid-ViTs in both scenarios. The results underscore our model's capacity to harness multi-scale features and integrate crucial inductive biases without necessitating additional tasks or additional pre-training of the transformer encoder.
\begin{table}[ht]
  \centering
  {\small
  \begin{tabular}{@{}llcc@{}}
    \toprule
    \bfseries{Dataset} & \bfseries{ Fine-tuning} & \bfseries{Params} & \bfseries{Acc.(\%)} \\
    \midrule
    & ViT-Base & 86.57M & 73.50 $\pm$ 0.94 \\
    & ResNet50 & 23.50M & 72.74 $\pm$ 6.22 \\
    TCGA& ResNet50-ViT Base & 112.5M & 75.89 $\pm$ 2.60 \\
    & ResNet50-ViT Large & 197.6M & 73.34 $\pm$ 3.72 \\
    & ResNet50-Swin Base & 87.00M & 72.31 $\pm$ 1.68 \\
    & ResNet50-DuoFormer (Ours) & 186.0M & \bfseries{76.57 $\pm$ 2.23} \\
    \midrule
    & ViT-Base & 86.57M & 84.69 $\pm$ 1.33\\
    & ResNet18 & 11.20M & 88.87 $\pm$ 1.99 \\
    UTAH& ResNet18-ViT Base & 99.03M & 82.35 $\pm$ 3.40 \\
    & ResNet18-ViT Large & 184.1M & 86.39 $\pm$ 0.96 \\
    & ResNet18-Swin Base & 86.91M & 84.24 $\pm$ 1.08\\
    & ResNet18-DuoFormer (Ours) & 91.22M & \bfseries{91.22 $\pm$ 1.74} \\
    \bottomrule
  \end{tabular}
  }
  \caption{Comparison of supervised pretrained models on ImageNet for TCGA and UTAH datasets. Accuracies are reported as mean values from five independent experiments.}
  \label{tab:1}
\end{table}
In the fine-tuning scenario, particularly with TCGA using a ResNet50 backbone, deeper encoders sometimes hindered performance, highlighting the need for careful design when integrating CNN architectures, especially considering domain shifts. Our DuoFormer improved performance by 3.83\%, demonstrating its effectiveness in leveraging multi-scale representations, compared to the shifting window strategy of Swin Transformer~\cite{liu2021swin}. This also indicates our model's ability to learn representations for the task at hand and better guide the feature extractor to adapt to domain shifts when trained together. During the transfer learning phase, shown in \tableref{tab:2}, the backbone, self-supervised pre-trained on TCGA and TULIP, two large-scale medical datasets, was frozen to serve as a feature extractor. The backbone provided robust visual representations, leading to the most promising performance improvements. Our model significantly outperformed the baseline by 6.96\% and clearly surpassed the Hybrid-ViTs and Swin Transformer, showing the superiority of our model in leveraging multi-scale features. These findings suggest that the model can effectively capture essential local features while preserving global attention capabilities, thereby addressing the typical inductive bias limitations found in transformers.
\begin{table}[htbp]
  \centering
  {\small
  \begin{tabular}{@{}lcc@{}}
    \toprule
    \bfseries{Transfer Learning} & \bfseries{Params} & \bfseries{Accuracy (\%)} \\
    \midrule
    SwaV & 0.008M & 77.98 $\pm$ 0.54\\
    SwaV-ViT Base & 89.03M & 74.00 $\pm$ 1.59 \\
    SwaV-ViT Large & 174.1M & 83.35 $\pm$ 1.90 \\
    SwaV-Swin Base & 86.74M & 68.90 $\pm$ 1.24\\
    SwaV-DuoFormer (Ours) & 124.7M & \bfseries{84.94 $\pm$ 2.63} \\
    \bottomrule
  \end{tabular}
  }
  \caption{Pathology self-supervised pretrained model performances on TCGA. 
  %All feature extractors in the table are frozen.
  }
  
  \label{tab:2}
\end{table}


% \subsection{Ablation Studies}
\subsubsection{Ablation on Multi-Scale Representations}
%Despite a few studies on effectively utilizing multi-scale features—typically encompassing three or four scales based on downsampling strategies—a gap persists in understanding the benefits of these multi-scale contexts. 
A natural question to ask is whether it is always better to incorporate additional scales, especially in medical datasets characterized by diverse scales. Intuitively, we might expect performance improvements as more scales are integrated. 
%In \tableref{tab:3}, we maintained consistent scale token learning across all scales, with the size of queries, keys, and values in local attention varying based on combined scales. 
Interestingly, our results reveal significant performance enhancements when utilizing a single scale, which outperforms other baselines and underscores the efficacy of our proposed components. %However, introducing two scales leads to performance degradation on both datasets, likely due to the increased complexity and potential for misguiding the model with dual-level contexts. 
As illustrated in \tableref{tab:3}, model performance generally improves with the incorporation of three and four scales in TCGA, a medium-sized dataset. Conversely, adding more than two scales slightly diminishes generalization capabilities on the UTAH dataset, given its smaller size. Our findings indicate that the optimal combination varies between datasets, influenced by the dataset size and potentially by the scale of the class-related lesions. Specifically, including scale 1 tends to yield substantial gains, which we attribute to an optimal balance between rich semantic information and manageable embedding lengths. 
\begin{table}[htb]
\centering
% \setlength{\arrayrulewidth}{0.2pt}  % Setting the general line thickness
\begin{tabular}{cccccc}
\toprule
Scale 0 & Scale 1 & Scale 2 & Scale 3 & UTAH (Acc. \%) & TCGA (Acc. \%)  \\ 
% &&&& UTAH & TCGA\\
\midrule
\checkmark &  &            &            & 90.32 $\pm$ 1.61 & 81.02 $\pm$ 2.04 \\
 & \checkmark &            &            & 90.27 $\pm$ 2.78 & 81.47 $\pm$ 2.59\\
 &  &   \checkmark         &            & 90.10 $\pm$ 0.85 & 81.13 $\pm$ 1.37\\
 &  &            &      \checkmark      & 89.56 $\pm$ 0.53 & 81.59 $\pm$ 3.54\\
 % \arrayrulewidth=0.2pt % Sets the line thickness to 0.2pt for subsequent lines
 \midrule
\checkmark & \checkmark &            &            & 87.91 $\pm$ 1.96  & 81.64 $\pm$ 1.83\\
\checkmark &            & \checkmark &            & 86.14 $\pm$ 2.17 & 80.56 $\pm$ 0.59\\
\checkmark &            &            & \checkmark & 85.18  $\pm$ 0.63 & 79.74 $\pm$ 4.35\\
           & \checkmark & \checkmark &            & 87.35  $\pm$ 1.35 & 81.36 $\pm$ 2.25\\
           & \checkmark &            & \checkmark & $\mathbf{91.22\pm 1.74}$  & 80.07 $\pm$ 1.04\\
           &            & \checkmark & \checkmark & 90.87 $\pm$ 1.22 & 80.24 $\pm$ 2.70\\
% \arrayrulewidth=0.2pt % Sets the line thickness to 0.2pt for subsequent lines
\midrule
\checkmark & \checkmark &            & \checkmark & 89.11 $\pm$ 1.58 & 82.87 $\pm$ 1.64\\
\checkmark & \checkmark & \checkmark &            & 87.88 $\pm$ 1.54 & 81.96 $\pm$ 1.03\\
\checkmark &            & \checkmark & \checkmark & 88.54 $\pm$ 2.93 & 81.90 $\pm$ 2.28\\
           & \checkmark & \checkmark & \checkmark & 89.78 $\pm$ 0.84 & 84.00 $\pm$ 2.26\\
\midrule
\checkmark & \checkmark & \checkmark & \checkmark & 88.59 $\pm$ 1.97 & $\mathbf{84.94\pm 2.63}$ \\
\bottomrule
\end{tabular}
\caption{Ablation study on inclusion of scales: Features from different stages are numbered 0 to 3 as in \figureref{fig:1} and \figureref{fig:2}. 
%56\t$S_0$ is the shallowest stage with spatial sizes of ($56\times 56$), and $S_3$ is the deepest of ($7\times 7$). 
Mean accuracies from five independent runs are reported.}
\label{tab:3}
\end{table}

\subsubsection{Ablation on Scale Attention}
We performed ablations on the local and global attention mechanisms in DuoFormer using our optimal models in both transfer learning and fine-tuning settings (\tableref{tab:4}). Using only the local attention outperforms setups relying solely on global attention, which resemble a hybrid ViT model~\cite{dosovitskiy2020image}. Moreover, the results demonstrate that optimal performance on both datasets is attained only when both attention modules are combined, emphasizing the necessity of integrating both local and global information.
% For ablation studies, we utilized our best models in both settings, employing ResNet18 pretrained on ImageNet for UTAH and ResNet50 pretrained on histopathology images for TCGA. We evaluated the individual contributions of scale and patch attention mechanisms using configurations of 6 layers for UTAH and 8 layers for TCGA. The different depths were chosen to adapt to the larger size of the TCGA dataset and the smaller size of the UTAH dataset.  

\subsubsection{Ablation on Scale Token}
\label{sec:ablation onscale token}
The channel dimension embeds rich scale information as it captures different semantic patterns in segmentations~\cite{wang2022uctransnet}. 
%Our investigation into the impact of a scale token on model performance revealed notable improvements when incorporated. To substantiate this finding, 
We experimented with configurations with and without a scale token in our model, as presented in \tableref{tab:4}. Results indicated that our proposed scale token more effectively guides the model in capturing critical local information. For configurations lacking a scale token, we observed enhancements over the baseline model by either using the first token in scale attention or averaging all tokens. Remarkably, employing the first token proved more beneficial than averaging. This token corresponds to the output from the final stage of the CNN backbone, typically utilized as the input for the classification head. We hypothesize that this performance boost stems from the final stage’s ability to convey concise, crucial information for the task, whereas averaging might introduce unwanted
noise. 
% The comparison demonstrates the effectiveness of our scale token as a guide and aggregator of relevant information.
\begin{table}[htb]
\centering
{\small
\begin{tabular}{@{}lcc@{}}
\toprule
Method & UTAH & TCGA \\
\midrule
Local Attn & 90.31 $\pm$ 1.15& 79.90 $\pm$ 0.10 \\
Global Attn & 82.35 $\pm$ 3.40 & 74.00 $\pm$ 1.59\\
% Scale Attn \& Patch Attn 
Ours & \bfseries{91.22 $\pm$ 1.74} & \bfseries{84.94 $\pm$ 2.63} \\
\bottomrule
\end{tabular}
}
\hspace{1cm} % Adjust horizontal space as needed
{\small
\begin{tabular}{@{}lcccc@{}}
\toprule
 & \multicolumn{2}{c}{w/o Scale Token} & \multicolumn{2}{c}{w/ Scale Token} \\
\midrule
Dataset & First Token & Average & Learnable & Ours \\
\midrule
UTAH & $90.61\pm 0.69$ & $89.62\pm 1.40$ & $88.80\pm 0.78$ & $\mathbf{91.22\pm 1.74}$ \\
TCGA & $83.22\pm 1.58$ & $82.62\pm 0.39$ & $83.13\pm 0.46$ & $\mathbf{84.94\pm 2.63}$ \\
\bottomrule
\end{tabular}
}
\caption{(Top) Ablations on scale and patch attention. Configurations with only scale attention use a single fully-connected layer to adapt the scale token for the classification head. (Bottom) Ablation study on the impact of different scale token configurations.}
\label{tab:4}
\end{table}


\section{Conclusion}
We introduced a novel hierarchical transformer model that integrates duo attention mechanisms to enhance visual data interpretation across various scales. Our model effectively captures spatial and contextual information, proving beneficial for medical image classification. Ablation studies confirmed that combining both attention mechanisms optimizes performance, showcasing the model's robustness and versatility across different backbones and tasks. This adaptability paves the way for broader applications in medical imaging and other vision-related challenges.



\clearpage  % Acknowledgements, references, and appendix do not count toward the page limit (if any)
% Acknowledgments---Will not appear in anonymized version
% \midlacknowledgments{We thank a bunch of people.}


\bibliography{midl25_160}

% \clearpage 
\appendix
% \section{Scale-wise Multi-Head Self-Attention}
% \label{sec:Local Attention}
% equations...
% \section{Model Training Details}
% \label{sec:training details}
% All models, including baselines, were trained using the Adam optimizer with \(\beta_1 = 0.9\) and \(\beta_2 = 0.999\), without applying weight decay. For the DuoFormer model, batch sizes were set to 32 for the Utah dataset and 6 for the TCGA dataset. We employed a OneCycle learning rate scheduler that starts from a minimal learning rate, progressively increasing to a set rate of \(1 \times 10^{-4}\). A cross-entropy loss was utilized for training all models. Each model underwent training for 50 epochs on Utah and 200 epochs on TCGA, utilizing early stopping with patience of 20 and 50 epochs, respectively. We saved the best-performing model from the validation data for inference. Model performances were evaluated using balanced accuracy across all classes for both datasets. All computations were performed on an NVIDIA RTX A6000 with 48 GB of memory. 

% We applied color jittering, random rotation, center crop, random crop, and random flips horizontally and vertically for training data. We used the mean and standard deviation from ImageNet to normalize the data. For inference, we used only center cropping and the same normalization.

\section{Ablation on Numbers of Heads and Layers}
\label{sec:Ablation on hyperparameters}
We assessed our model's sensitivity to two hyperparameters: the number of heads and the number of layers in dual attention modules. Results are given in \tableref{tab:5}. Initially, we fixed the number of heads at 12 and varied the number of layers from 4 to 12 to identify optimal configurations for each dataset. Subsequently, we tested heads from 4 to 12, excluding 10 due to incompatibility with the feature dimension $D=768$, using the optimal number of layers. We observed that performance generally increases and then decreases with attention depth. Specifically, performance peaks at 6 layers for the Utah dataset and at 8 layers for the TCGA dataset, likely due to the varying sizes of the datasets. Additionally, we noted a similar pattern of initial increase followed by a decrease in performance for the number of heads across both datasets, peaking at 8 heads.

\begin{table}[htbp]
\centering
\begin{minipage}[t]{0.48\linewidth} % Change alignment to top
\centering
\begin{tabular}{ccc}
\toprule
Number of & TCGA & UTAH \\
Layers & Acc. (\%) & Acc. (\%) \\
\midrule
4  & 80.83 & 89.37 \\
6  & 79.70 & 90.41  \\
8  & 82.67 & 88.64 \\
10 & 81.09 & 88.87 \\
12 & 79.66 & 89.86 \\
\bottomrule
\end{tabular}
% \subcaption{(a)} % Uncomment if subcaption package is included and working
\label{tab:5(a)}
\end{minipage}\hfill
\begin{minipage}[t]{0.48\linewidth} % Change alignment to top
\centering
\begin{tabular}{ccc}
\toprule
Number of & TCGA & UTAH \\
Heads & Acc. (\%) & Acc. (\%) \\
\midrule
4  & 78.74 & 90.00 \\
6  & 82.84 & 90.02 \\
8  & 84.94 & 91.22  \\
12 & 82.67 & 90.41 \\
\bottomrule
\end{tabular}
% \subcaption{(b)} % Uncomment if subcaption package is included and working
\label{tab:5(b)}
\end{minipage}
\caption{Ablation studies comparing the impact of different configurations on dual attention modules for both datasets: (left) variations in the number of layers and (right) variations in the number of heads. All configurations use the same number of layers and heads in both scale and patch attention. For the ablation on heads, the number of layers is set to the optimal value found in the ablation on layers.}
\label{tab:5}
\end{table}
% \subsection{Explanability}
% \subsubsection{Attention Map for Multiple Scales}
% \begin{figure}[t]
%   \centering
%   % \fbox{\rule{0pt}{2in} \rule{0.9\linewidth}{0pt}}
%    \includegraphics[width=0.4\linewidth]{./20230207_IMG0407_TCGA-B0-4849-01Z-00-DX1.979ac1bc-d04f-470d-bed2-85d3bd3cd912_polygon_4_High_NC_125.png}
%    \caption{Input sample image from TCGA dataset to visualize the attentions for different scales during inference stage. Image size: $224$ x $224$. Labeled class: Cancer. }
%    \label{fig:6}
% \end{figure}

% \begin{figure*}[ht] % 'figure*' environment spans both columns
%     \centering % Centers the figure within the figure* environment
%     \includegraphics[width=1\linewidth]{./Attentions_4_scales_8heads_separateSoftmax_224x.png} 
%     \caption{From top to bottom: $S_3$ (7x7), $S_2$ (14x14), $S_1$ (28x28), and $S_0$ (56x56). Each attention map is upscaled to the input size of $224 \times 224$ pixels using nearest-neighbor interpolation. At the highest level (top row), the model minimally focuses on non-essential areas, such as the central blank space. Progressing to lower levels (second and third rows), attention shifts to areas with concentrated nuclei or abrupt structural changes. At the lowest level (bottom row), the model meticulously highlights some cell nuclei, which should be important for distinguishing types of kidney cancer. } 
%     \label{fig:7} % Label for referencing the figure in text
% \end{figure*}
% ResNets are proven to be effective feature extractors with a hierarchical structure. Our proposed model benefits from the hierarchical representations produced by ResNet backbones. Unlike previous hierarchical transformer models, our model introduces a novel scale-wise attention that preserves all levels of features output by the feature extractor, allowing explicit examination of how the model focuses on each level of features in an image. To investigate this, we drew the attention maps for all scales using raw attention scores after our duo attentions, as shown in \cref{fig:7}. The model was set to evaluation mode, and a sample from the 'Cancer' class in the TCGA dataset was chosen, see \cref{fig:6}. In previous attention-based models, the patch size is typically fixed at 14 x 14, which offers a limited view to the model. However, in our model, we can explicitly see what the model pays attention to, from low-level details to high-level semantics in the image. At the top level, we can see it pays very little attention to the hole at the center. At the second level, the model focuses on areas containing nuclei, which is important for distinguishing kidney cancers. At the next level, the model begins to attend to more details, such as the small sharp corner inside the blank area. At the lowest level, the model broadly focuses on regions containing cell nuclei. Our model should benefit from these low-level details because vital global features of cancer and its microenvironment can only be observed at lower scales, which are crucial for our task. Across all four scales, we can observe that our model benefits from every level of features while maintaining global perception. This multi-scale representation allows our model to better detect visual patterns, from low-level cell details to complex tissue structures. Lastly, it makes the model more interpretable, compared to other previous works.


\end{document}
