\documentclass{midl} % Include author names
%\documentclass[anon]{midl} % Anonymized submission
\setlength{\textfloatsep}{9pt} % adjust the value as needed
\setlength{\intextsep}{9pt} % adjust the value as needed
% The following packages will be automatically loaded:
% jmlr, amsmath, amssymb, natbib, graphicx, url, algorithm2e
% ifoddpage, relsize and probably more
% make sure they are installed with your latex distribution
\usepackage{floatrow}
\newfloatcommand{capbtabbox}{table}[][\FBwidth]
\usepackage{blindtext}
\usepackage{array} % for adjusting row spacing
\usepackage{enumitem}
\usepackage{mwe} % to get dummy images
\usepackage{array}
\usepackage{multirow}
\usepackage{float}
\usepackage{caption}
\newcolumntype{M}[1]{>{\arraybackslash}m{#1}}
\usepackage{booktabs}

\jmlryear{2024}
\jmlrworkshop{Full Paper -- MIDL 2024}
\jmlrvolume{-- nnn}
\editors{Accepted for publication at MIDL 2024}

\title[VariViT: A Vision Transformer for Variable Image Sizes]{VariViT: A Vision Transformer for Variable Image Sizes}

 % Use \Name{Author Name} to specify the name.
 % If the surname contains spaces, enclose the surname
 % in braces, e.g. \Name{John {Smith Jones}} similarly
 % if the name has a "von" part, e.g \Name{Jane {de Winter}}.
 % If the first letter in the forenames is a diacritic
 % enclose the diacritic in braces, e.g. \Name{{\'E}louise Smith}

 % Two authors with the same address
 % \midlauthor{\Name{Author Name1} \Email{abc@sample.edu}\and
 %  \Name{Author Name2} \Email{xyz@sample.edu}\\
 %  \addr Address}

 % Three or more authors with the same address:
 % \midlauthor{\Name{Author Name1} \Email{an1@sample.edu}\\
 %  \Name{Author Name2} \Email{an2@sample.edu}\\
 %  \Name{Author Name3} \Email{an3@sample.edu}\\
 %  \addr Address}


% Authors with different addresses:
% \midlauthor{\Name{Author Name1} \Email{abc@sample.edu}\\
% \addr Address 1
% \AND
% \Name{Author Name2} \Email{xyz@sample.edu}\\
% \addr Address 2
% }

%\footnotetext[1]{Contributed equally}

\midlauthor{\Name{Aswathi Varma\nametag{$^{1,2}$}} \Email{aswathi@tum.de}\\
\Name{Suprosanna Shit\nametag{$^{2,3}$}} \Email{suprosanna.shit@tum.de}\\
\Name{Chinmay Prabhakar\nametag{$^{3}$}} \Email{chinmay.prabhakar@uzh.ch}\\
\Name{Daniel Scholz\nametag{$^{1,2}$}} \Email{daniel.scholz@mri.tum.de}\\
\Name{Hongwei Bran Li\nametag{$^{4}$}} \Email{holi2@mgh.harvard.edu}\\
\Name{Bjoern Menze\nametag{$^{3}$}} \Email{bjoern.menze@uzh.ch}\\
\Name{Daniel Rueckert\midljointauthortext{Contributed equally as senior authors}\nametag{$^{2}$}} \Email{daniel.rueckert@tum.de}\\
\Name{Benedikt Wiestler\midlotherjointauthor\nametag{$^{1}$}} \Email{b.wiestler@tum.de}\\
\\
\addr $^{1}$ Department of Neuroradiology, Technical University of Munich\\
\addr $^{2}$ Institute for Artificial Intelligence and Informatics in Medicine, Technical University of Munich\\
\addr $^{3}$ Department of Quantitative Biomedicine, University of Zurich\\
\addr $^{4}$ Athinoula A. Martinos Center for Biomedical Imaging, Harvard Medical School, Boston
}

\begin{document}
\maketitle
\vspace{-0.5cm}
\begin{abstract}

% TODO

%Suggestion Bene -> Please one sentence about the embedding (see my comment in the "current" abstract below)
%Vision Transformers (ViTs) have emerged as the state-of-the-art architecture in image analysis, leveraging self-attention mechanisms to excel in various tasks. They are also increasingly used in medical image analysis.  \emph{ViTs} process an image by dividing it into fixed-size patches. This patching operation constrains the images to have a predefined size. This fixed-size input requirement poses significant challenges, particularly in medical imaging, where irregularly shaped structures like tumors abound. Addressing this limitation, we propose Unified\emph{ViT}  (VariViT), a model adept at handling variable image sizes while maintaining consistent patch sizes.  \emph{VariViT} enhances feature representation, which is crucial for tasks with complex spatial relationships. Furthermore, we introduce a novel batching strategy within  \emph{VariViT} to mitigate computational complexity, yielding accelerated training and inference times.

%In our evaluations on two independent 3D brain MRI datasets,  \emph{VariViT} surpasses both\emph{ViT}  and ResNet models in glioma genotype prediction and brain tumor classification, achieving F1-scores of 75.5% and 76.3%, respectively. Our proposed batching strategy reduces computation time by up to 30% compared to conventional architectures. These findings underscore the efficacy of  \emph{VariViT} in addressing challenges inherent in medical image analysis and its potential to advance the state-of-the-art in the field.
Vision Transformers (\emph{ViTs}) have emerged as the state-of-the-art architecture in representation learning, leveraging self-attention mechanisms to excel in various tasks. \emph{ViTs} split images into fixed-size patches, constraining them to a predefined size and necessitating pre-processing steps like resizing, padding, or cropping. This poses challenges in medical imaging, particularly with irregularly shaped structures like tumors. A fixed bounding box crop size produces input images with highly variable foreground-to-background ratios. Resizing medical images can degrade information and introduce artefacts, impacting diagnosis. Hence, tailoring variable-sized crops to regions of interest can enhance feature representation capabilities. Moreover, large images are computationally expensive, and smaller sizes risk information loss, presenting a computation-accuracy tradeoff. We propose \emph{VariViT}, an improved \emph{ViT}  model crafted to handle variable image sizes while maintaining a consistent patch size. \emph{VariViT} employs a novel positional embedding resizing scheme for a variable  number of patches. We also implement a new batching strategy within \emph{VariViT} to reduce computational complexity, resulting in faster training and inference times. In our evaluations on two 3D brain MRI datasets, \emph{VariViT} surpasses vanilla \emph{ViTs} and \emph{ResNet} in glioma genotype prediction and brain tumor classification. It achieves F1-scores of 75.5\% and 76.3\%, respectively, learning more discriminative features. 
Our proposed batching strategy reduces computation time by up to 30\% compared to conventional architectures. These findings underscore the efficacy of \emph{VariViT} in image representation learning.
\end{abstract}

\begin{keywords}
Vision Transformers, Architecture, Representation, Tumor Classification
\end{keywords}

\section{Introduction}

% TODO

Deep neural architectures, notably Convolutional Neural Networks (CNNs) and Vision Transformers (\emph{ViTs}), have emerged as effective architectures for image representation learning, consistently achieving state-of-the-art performance on real-world data across different domains. \emph{ResNet}, a CNN-based variant \cite{he2016deep}, achieves its efficacy in representation learning by the use of residual connections.  \emph{ViT} \cite{dosovitskiy2020image} captures long-range dependencies by directly attending to global image information through self-attention mechanisms.  \emph{ViTs} are increasingly attracting attention in medical image analysis \cite{he2021global, gao2021covid,chen2021transunet,shamshad2023transformers,gao2021utnet,jang2022m3t}.
\begin{figure}[t]
    \centering
    \includegraphics[trim = {0cm 0.8cm 0cm 0cm}, width=0.75\linewidth]{images/attn_maps (1).pdf}
    \caption{\small Selecting the optimal input crop size is essential for maximizing the representation quality (i.e., attention level) of \emph{ViTs}. \textbf{(a):} 2D slice of the large tumor bounding box crop. \textbf{(b):} Attention map of the vanilla \emph{ViT} model on the image, showing attention to background data rather than the tumor. \textbf{(c):} The image resized to a large fixed size shifts focus to distortions arising from the operation. \textbf{(d):} (Ours) Smaller image crop without resizing. In our method, attention is mainly given to desired tumor regions.}
    \label{fig:attn}
\end{figure}
Input images are typically resized to a fixed size before being fed into a \emph {ViT} model. These models perform well with evenly dispersed signals, allowing operations like interpolation and cropping for fixed-size inputs. 
However, medical images feature small, irregular regions of interest, where such methods can be detrimental. Fixed-size inputs result in varying foreground-to-background ratios, especially with smaller pathologies, where more background (e.g., healthy brain) is included compared to larger tumors. Moreover, medical images are particularly sensitive to distortions. Resizing them may introduce artificial features or modify existing ones, mimicking or obscuring real abnormalities and interfering with diagnosis. Figure~\ref{fig:attn} illustrates the attention map of a traditional \emph{ViT} for tumor classification. Even though the crop contains the tumor, background areas are heavily attended to, wasting computational resources. Smaller crops resized with interpolation introduce distortions, causing unwanted focus shifts. These limitations might impair efficient model training. 

We propose \emph{VariViT}  to handle variable-size images, addressing the limitation of fixed-size inputs. Our method recognizes the heterogeneous nature of real-world medical images where foreground-to-background ratios vary significantly. \emph{VariViT} retains the favorable properties of \emph{ViTs} while integrating the capability to handle diverse image sizes. Our contributions are as follows:
\begin{enumerate}[itemsep=0pt,parsep=0pt,topsep=0pt,partopsep=0pt]

    \item We introduce a novel \textbf{flexible positional embedding strategy} tailored to different image sizes.

    \item We propose an \textbf{alternate batching strategy} to improve computational efficiency, leveraging the inclusion of smaller-sized images.
   
    \item We demonstrate the \textbf{applicability of \emph{VariViT}} in (i) glioma genotype prediction and (ii) brain tumor classification, two challenging tasks due to the highly variable tumor sizes. Our extensive experiments on two brain MRI datasets highlight the superior performance of \emph{VariViT} over both vanilla \emph{ViT}  and ResNet architectures.
\end{enumerate}
\section{Related Work}

The traditional \emph {ViT} model uses a fixed input size. It either initializes fixed positional embeddings or learns them during training. These embeddings are linearly interpolated for fine-tuning and evaluation at higher resolutions. The \emph{FlexiViT} model \cite{beyer2023flexivit} also adapts to variable image-to-patch size ratios and sequence lengths by resizing the learnable 2D positional embedding grid with bilinear interpolation. While effective in 2D, interpolating the 3D embedding grid for variable-sized images may result in information loss and higher computational requirements due to more complex calculations. The \emph{SuperViT} model \cite{lin2023super} patchifies an image at multiple scales and improves computational cost by randomly dropping tokens. However, a random selection of tokens risks information loss and degrades representation quality, particularly in medical images where the region of interest may be small.

The \emph{Pix2Struct} model \cite{lee2023pix2struct}, similar to ours, handles variable image sizes. The vision encoder resizes the input images to extract fixed-size patches fitting a predefined sequence length. Padding is applied to the sequence as needed, allowing it to reach the desired fixed length. The model  learns a large grid of 2D absolute positional embeddings, enabling the identification of patch positions based on $x$ and $y$ coordinates. However, this approach can be computationally expensive, especially for 3D images. In contrast, our model efficiently manages size differences without resorting to such computationally expensive operations. Moreover, scaling the images can introduce undesired artefacts.
 
 The \emph{NaViT} model \cite{dehghani2023patch} also  addresses challenges on computational complexity and variable image sizes. \emph{NaViT} packs patches from different-sized images into the same sequence. It maintains a fixed sequence length by randomly dropping tokens and padding. The model employs masked attention and pooling to prevent interactions between patches from different images. However, dropping tokens may lead to information loss, and implementing masked attention and pooling introduces more complex architectural changes.
% Self-attention masks and masked pooling prevent mutual interaction between tokens of different examples.

\begin{figure}[ht]
    \centering
\includegraphics[trim = {0cm 0cm 0cm 0cm}, width=1\linewidth]{images/model_new (1).pdf}
    \caption{\small The \emph{VariViT} model addresses the problem of handling images with different sizes by introducing a novel positional embedding resizing mechanism and employing different batching strategies. The model utilizes a fixed patch size, ensuring consistent patch embedding sizes across images while simultaneously adapting to different sequence lengths using a \textit{center and select} resizing strategy. }
    \label{fig:model}
\end{figure}

\section{Method}

 \textbf{Overview.} We focus on learning {3D} image representation in this work. In the conventional \emph{ViT} framework, a 3D input image is divided into non-overlapping patches. These patches are flattened and linearly projected to obtain a sequence of patch embeddings. The sequence is fed into the transformer encoder after adding a CLS token. The enriched CLS token is used for the final classification. However, the transformer model lacks intrinsic awareness of the spatial arrangement of patches within the sequence. Therefore, positional embeddings are added to the sequence to explicitly incorporate this information. Both \textit{relative} and \textit{absolute} positional encoding can be employed \cite{vaswani2017attention, wu2021rethinking}. They can either be \textit{fixed} or \textit{learned} during training. In its simplest form, absolute fixed \textit{sinusoidal} embeddings are used for this purpose.
 % In the conventional \emph{ViT} framework, a 3D input image \(X \in \mathbb{R}^{L \times H \times W \times C}\) is divided into non-overlapping patches \(x_p \in \mathbb{R}^{N \times P^3 \times C}\), where \(L\), \(H\), and \(W\) represent spatial dimensions of the image, \(P\) is the \textit{fixed} patch size, \(N\) is the number of patches, and \(C\) is the number of channels. 
 
 Fixed positional embeddings are predefined vectors representing each patch's absolute coordinates within the input sequence, typically generated using sinusoidal functions. The 1D positional encoding for even and odd indices is formulated as \cite{vaswani2017attention}:
\begin{equation}
\label{eq:sin_cos}
\begin{aligned}
\text{PE}(\textit{pos}, 2i) &= \sin\left(\textit{pos} / 10000^{2i/d}\right) \\
\text{PE}(\textit{pos}, 2i+1) &= \cos\left(\textit{pos} / 10000^{2i/d}\right)
\end{aligned}
\end{equation}

\noindent $\textit{pos}$ represents the position to be encoded.  The parameter \(1 / 10000^{2i / d}\) governs the wavelength of the sinusoids. Here, $d$ denotes the embedding dimension and $i$ indexes the sine--cosine pairs of embedding dimensions.

Positional embeddings can be adjusted for various image sizes through interpolation, as suggested in the \emph{ViT} paper \cite{dosovitskiy2020image}. However, interpolation increases computational complexity and may introduce approximations, making it suboptimal for resizing 3D embedding grids in variable-sized tumor crops. \emph{Pix2Struct} suggests learning a large grid of embeddings for a predefined sequence length \cite{lee2023pix2struct}. This, however, can extend training time. Leveraging the consistent center alignment in tumor crops can provide a reliable reference point for resizing positional embeddings without the need for interpolation, thus preserving the original information.

Building upon this concept, the \emph{VariViT} model (Figure \ref{fig:model}) introduces the \textit{center and select} method for resizing positional embeddings.  Our model adapts the vanilla \emph{ViT} architecture for various input image sizes, particularly tailored to heterogeneous tumor shapes, while maintaining a fixed patch size. This is achieved by extracting 3D bounding box crops categorized into three sizes by tumor volume. Additionally, we explore batching methods to manage diverse image sizes within batches. The architectural and training modifications to the base \emph{ViT} are explained in the following sections.

% \begin{figure}[h]
%     \centering
%     \begin{tabular}{M{8cm} M{8cm}}
%       \includegraphics[width=\linewidth]{images/model.pdf} & \includegraphics[width=0.85\linewidth]{images/embedding.pdf}
%     \end{tabular}   
%     \caption{Predham}
%     \label{fig:model}
% \end{figure}
\vspace{0.5cm}
\noindent \textbf{Center and Select}. Similar to the vanilla \emph{ViT}, the patch embedding size remains constant in \emph{VariViT} regardless of the image size. However, with a fixed patch size, variations in image size can lead to a difference in the number of patches or the sequence length. Consequently, positional embeddings must be dynamically resized to accommodate these variations. 
\begin{figure}[h]
    \centering
    \includegraphics[trim = {0cm 0.5cm 0cm 0cm}, width=0.9\linewidth]{images/embedding (10).pdf}
    \caption{\small \textbf{(a):} Tumor bounding boxes of two sizes are displayed, with the tumor center of mass aligned. \textbf{(b):} Our proposed \textit{center and select} method for resizing position embeddings initializes fixed sinusoidal embeddings for the largest image size. Embeddings for other sizes are selected from the center of this grid. \textbf{(c):}  We present two batching strategies: \textit{Custom Batch Sampler (CBS)} with the same image sizes and \textit{Gradient Accumulation (GA)} with varying image sizes within a batch.}
    \label{fig:posemb}
\end{figure}
In tumor classification tasks, 3D crops are often employed to isolate the tumor region by identifying tumor boundaries using segmentation masks. The center coordinates of the 3D crop are aligned with the center of mass of the tumor. Despite variations in size, tumors positioned at the centers of the 3D crops will have coinciding centers of mass, establishing a consistent reference point (Figure \ref{fig:posemb} - (a)). Leveraging this, we introduce the \textit{center and select} method, resizing positional embeddings by centering them in 3D space.

The implementation involves initializing a fixed positional embedding for the largest image size. We utilize sinusoidal encoding as described by Equation \ref{eq:sin_cos} extended to 3D coordinates. This results in positional embeddings of dimension [\(N, d\)], excluding the CLS token. Here, \(N\) is the number of patches and \(d\) is the embedding dimension. The embedding can be viewed as a 3D grid for the \(l\), \(h\), and \(w\) dimensions with a size of \(G_l \times G_h \times G_w = N\) (Figure \ref{fig:posemb} - (b)). The center of the grid \((C_l, C_h, C_w)\), determined as \( \left( \left\lfloor \frac{G_l}{2} \right\rfloor, \left\lfloor \frac{G_h}{2} \right\rfloor, \left\lfloor \frac{G_w}{2} \right\rfloor \right) \), serves as the reference point for selecting a subset of positions based on the current input image size. We dynamically compute the new positional embedding size [\(N', d\)] when the image dimensions differ from the largest size, resulting in a new grid size \(G'_l \times G'_h \times G'_w\). To adjust the positional embedding for a different image size, we select a subset around the center from the initialized positional embedding. The position range \([ \textit{start}, \textit{end})\) for each dimension is determined by \( \textit{start} = C_k - \left\lfloor \frac{G'_k}{2} \right\rfloor \) and \( \textit{end} = \textit{start} + G'_k \), where \( k = l, h, w \). Thus, the original positional information is extracted at a lower computational cost.


     % \item \textbf{Initialize sin-cos positional embedding} - Initialize a fixed positional embedding for the largest image size. In our work, we employ 3D sinusoidal encoding to incorporate positional information of the patches, as seen in Equation \ref{eq:sin_cos}. This sequence length is given by is given by: \(N_{\text{init}} = \frac{{L_{\text{init}} \times H_{\text{init}} \times W_{\text{init}}}}{{P^3}}\). Here, \(L_{\text{init}}\), \(H_{\text{init}}\), and \(W_{\text{init}}\) denote the dimensions of the largest image size, and \(P\) denotes the fixed patch size.

     % \item \textbf{Locate the center of initialized positional embedding} - The center, denoted by \((C_l, C_h, C_w)\), is determined as \(\left( \frac{L_{\text{init}}}{2}, \frac{H_{\text{init}}}{2}, \frac{W_{\text{init}}}{2} \right)\), serving as the reference point for selecting an appropriate subset of the embedding based on the current input image size.

     % \item \textbf{Compute the new positional embedding size} - During training, dynamically compute the positional embedding size when the image dimensions differ from the largest size. Excluding the CLS token, the new size is given by: \(N' = \frac{{L' \times H' \times W'}}{{P^3}}\), where \(L'\), \(H'\), and \(W'\) represent the dimensions of the different-sized image.
     
     % \item \textbf{Select from the initialization} - Determine the range of positions to be selected from around the center: \([C_l - \frac{N'}{2}, C_l + \frac{N'}{2}]\), \([C_h - \frac{N'}{2}, C_h + \frac{N'}{2}]\), \([C_w - \frac{N'}{2}, C_w + \frac{N'}{2}]\). Finally, to adapt the positional embedding to the current input image size, extract the relevant subset from the initialized positional embedding.
\vspace{0.5cm}
\noindent \textbf{Batching Methods.}
Training models with different input dimensions poses challenges due to the variability of bounding box sizes in the dataset. Existing Python frameworks lack seamless solutions to address this issue. To address this challenge, we opt for two specific strategies, namely a \textit{custom batch sampler} and \textit{gradient accumulation}.
\begin{enumerate}[itemsep=0pt,parsep=0pt,topsep=0pt,partopsep=0pt]
    \item \textbf{Custom Batch Sampler (CBS)} - This strategy involves grouping images of the same size into a batch, as shown in Figure \ref{fig:posemb} - (c), while allowing the image size to vary randomly from batch to batch. This maintains consistency within each batch. Including batches with smaller image sizes contributes to significantly faster training.
    \item \textbf{Gradient Accumulation (GA)} - In this method, the weight update is performed after accumulating gradients over several mini-batches, resulting in batches with varying image sizes (Figure \ref{fig:posemb} - (c)). The effective batch size is given by: \(\textit{Batch Size} = \textit{Mini-Batch Size} \times \textit{Update Interval}\). Here, we adjust the mini-batch size to 1 and set the update interval to the desired batch size.
\end{enumerate}
We make our code for \emph{VariViT} and the batching schemes publicly available at \url{https://github.com/Aswathi-Varma/varivit}.
\vspace{-0.4cm}
\section{Experimental Setup}
\textbf{Datasets.} To highlight the effectiveness of \emph{VariViT}, we perform experiments on two distinct 3D brain MRI datasets (Appendix \ref{appendix:datasets}): the \textit{glioma} dataset comprising 1856 MRI scans sourced from various studies \cite{van2021erasmus,sayah2022enhancing,calabrese2022university,bakas2022university,bakas2017advancing}; and the \textit{brain tumor} dataset containing 1699 MRIs from publicly available datasets \cite{baid2021rsna,suter2022lumiere, moawad2023brain}. Both datasets contain FLAIR, T2w, T1w, and T1w+contrast MR images, all registered to the SRI24 atlas and resampled to a uniform voxel size of $1\times1\times1$~mm$^3$, forming a four-channel multi-modal input for our experiments. These datasets are chosen to evaluate the model's performance on two binary classification tasks: (i) identifying the \textit{isocitrate dehydrogenase (IDH)} mutation status, a key biomarker that separates two adult-type diffuse glioma groups \cite{louis20212021}; and (ii) distinguishing between primary brain tumors (gliomas) and secondary brain tumors (metastases). We additionally use the glioma dataset for a multi-class classification task, targeting three glioma subtypes: \textit{glioblastoma}, \textit{astrocytoma}, and \textit{oligodendroglioma}.

We extract the largest tumor in each patient for our baseline models by cropping a 96$\times$96$\times$96 mm$^3$ bounding box guided by segmentation masks provided in the datasets. To address different tumor sizes in \emph{VariViT}, we categorize the datasets into three size bins with equal sample distribution (Appendix \ref{appendix:bounding_box}). These bins correspond to crop sizes of 64$\times$64$\times$64 mm$^3$, 80$\times$80$\times$80 mm$^3$, and 96$\times$96$\times$96 mm$^3$ for the largest tumors. Additionally, we rescale all image intensities to the range [0, 1].

\vspace{0.5cm}
\noindent \textbf{Training and Evaluation.} In our training configuration, we opt for the 3D \emph{ViT} model \cite{prabhakar2023vit}. We employ the \emph{ViT}-S/16 configuration with a patch size of 16. This setup consists of 12 encoder blocks, each having an embedding dimension of 384 and 6 attention heads. A linear layer is used as the classification head. For both the \emph{VariViT}-S/16 models (GA and CBS), we use the same vanilla \emph {ViT}  base, differing only in the positional embedding. The model comprises approximately 28 million trainable parameters. We utilize \emph{ResNet-18}, a CNN with 33 million parameters, to compare our results with a convolutional model of roughly the same number of parameters. To benchmark against recent state-of-the-art variable image size models, we incorporate the \emph{Pix2Struct} vision encoder into a 3D framework. This model serves as one of our baselines, with 34.8 million trainable parameters. We fix the sequence length of \emph{Pix2Struct} at 216, corresponding to our dataset's maximum image size. For all the models, we utilize absolute positional encoding. We explore relative positional embedding with our batching methods in Appendix \ref{appendix:r_pe} but do not observe any significant advantage over the absolute approach.

All models undergo training for 100 epochs, utilizing the AdamW optimizer \cite{loshchilov2017decoupled} with a weight decay of 0.05. The base learning rate is set to 1e-3, and cosine decay \cite{loshchilov2016sgdr} is applied for learning rate decay. A batch size of 8 is used, and the experiments are conducted on Nvidia RTX A6000. A warm-up schedule \cite{goyal2017accurate} of 40 epochs is applied. Data augmentations such as \textit{random affine}, \textit{random noise}, \textit{random gamma}, \textit{random blur}, and \textit{random flips} are incorporated during training. Cross-entropy loss with class weights is employed to mitigate class imbalance in the dataset. For the evaluation metrics, we utilize the \textit{Area Under the Curve} (AUC), \textit{F1-score}, and \textit{Matthews Correlation Coefficient} (MCC) score.
\begin{figure}[ht]
    \centering
    \includegraphics[trim = {0cm 0.9cm 0cm 0cm}, width=1\linewidth]{images/tsne_vit_pix(1).pdf}
    \caption{\small t-SNE visualization of embedding layer output for \textit{IDH} status classification. \textbf{(a)}: Vanilla \emph{ViT}, \textbf{(b)}: \emph{Pix2Struct} and \textbf{(c)}: \emph{VariViT-GA}, all with an embedding dimension of 384. Notice the clearer separation of clusters in our model's plot.}
    \label{fig:t-SNE_vit}
\end{figure}
\section{Results}
\textbf{IDH Mutation Status.}  We train our proposed model on the glioma dataset for \textit{IDH} status classification task. Then, we compare its performance against three baseline models: fixed-size 3D \emph{ResNet-18}, 3D vanilla \emph{ViT}, and variable-size 3D \emph{Pix2Struct}. We employ a k-fold cross-validation strategy with k=5 for all the models and report the mean metrics and standard deviation obtained from the test sets. The \emph{VariViT-GA} model consistently outperforms the baseline models in terms of AUC, F1-score, and MCC (Table \ref{tab:glioma}). Our model also exhibits notably faster training times compared to its counterparts. \emph{VariViT-CBS} not only outperforms \emph{ResNet-18} and vanilla \emph {ViT} on various performance metrics, including AUC, but also further reduces the training time. The t-SNE plot \cite{van2008visualizing} in Figure \ref{fig:t-SNE_vit} illustrates the improved cluster separation achieved by our model.
\begin{table}[ht]
    \centering
    \footnotesize
     \renewcommand{\arraystretch}{1}
\begin{tabular}{llcccc}
  \toprule
  & \textbf{Method} & \textbf{AUC} & \textbf{F1-Score} & \textbf{MCC} & \textbf{Training Time}\\
  \midrule
 \textit{Fixed} & 3D ResNet-18 & $0.928 \pm 0.042$ & $0.716 \pm 0.058$ & $0.654 \pm 0.071$ & \multirow{2}{*}{\includegraphics[width=0.1\textwidth]{images/idh_time1.png}}\\
  & 3D Vanilla-ViT & $0.927 \pm 0.027$ & $0.744 \pm 0.059$ & $0.679 \pm 0.076$ &\\
  \cmidrule(lr{1em}){1-5}
 \textit{Variable} & 3D Pix2Struct &$0.940 \pm 0.012$ &$0.742 \pm 0.040$ & $0.686 \pm 0.056$ & \multirow{3}{*}{\includegraphics[width=0.1\textwidth]{images/idh_time2.png}}\\
  & VariViT-CBS & $0.937 \pm 0.009$ & $0.718 \pm 0.027$ & $0.653 \pm 0.028$\\
  & VariViT-GA & $\textbf{0.942} \pm 0.011$ & $\textbf{0.755} \pm 0.059$ & $\mathbf{0.709} \pm 0.069$\\
  \bottomrule
\end{tabular}
\caption{\small Comparison of \emph{VariViT} with baseline models for the \textit{IDH} mutation status prediction task. Average training times are visualized on the right (hours).}
\label{tab:glioma}
\end{table}

\noindent \textbf{Brain Tumor Type.} To further highlight the effectiveness of our proposed model, we apply it to the brain tumor dataset for the classification of primary versus metastatic tumors. Compared with the baselines, the \emph{VariViT-GA} model distinctly outperforms \emph{Pix2Struct} and \emph{ResNet-18}. Our model shows superior performance in MCC and F1-scores compared to vanilla \emph{ViT} (Table \ref{tab:brats}). The \emph{VariViT-CBS} surpasses \emph{Pix2Struct} in performance and achieves similar results to \emph{ResNet-18}, with faster training times. This underscores its efficiency and suitability for practical applications.
\begin{table}[ht]
    \centering
    \footnotesize
     \renewcommand{\arraystretch}{0.9}
    \begin{tabular}{llcccc}
    \toprule
    &\textbf{Method} & \textbf{AUC} & \textbf{F1-Score} & \textbf{MCC} & \textbf{Training Time}\\
    \midrule
    \textit{Fixed} & 3D ResNet-18 & $0.948 \pm 0.013$ & $0.745 \pm 0.035$ & $0.694 \pm 0.051$ & \multirow{2}{*}{\includegraphics[width=0.1\textwidth]{images/type_time1.png}}\\
    & 3D Vanilla-ViT & $\textbf{0.957} \pm 0.011$ & $0.752 \pm 0.067$ & $0.696 \pm 0.081$\\
    \cmidrule(lr{1em}){1-5}
    \textit{Variable} & 3D Pix2Struct & $0.945 \pm 0.020$ & $0.720 \pm 0.058$ & $0.663 \pm 0.073$ & \multirow{3}{*}{\includegraphics[width=0.09\textwidth]{images/type_time2.png}}\\
    & VariViT-CBS & $0.947 \pm 0.013$ & $0.746 \pm 0.035$ & $0.686 \pm 0.045$\\
    & VariViT-GA & $0.954 \pm 0.007$ & $\textbf{0.763} \pm 0.036$ & $\mathbf{0.706} \pm 0.046$\\
    \bottomrule
  \end{tabular}
  \caption{\small Comparison of  \emph{VariViT} with baseline models for the primary \textit{vs.} secondary brain tumor classification. Average training times are visualized on the right (hours).}
  \label{tab:brats}
\end{table}

\vspace{0.5cm}
\noindent \textbf{Ablation Study - Positional Embedding.} Here, we compare the effectiveness of different positional embedding methods within the  \emph{VariViT-GA} architecture. We analyze three position-embedding strategies alongside the center and select method: (i) \textit{Independent, Fixed} (Indep\_Fixed) - initializes separate fixed sinusoidal positional embeddings for each image size category. (ii) \textit{Interpolated, Fixed} (Interp\_Fixed) - initializes fixed sinusoidal embedding for the largest image size and employs trilinear interpolation for smaller image sizes. (iii) \textit{Interpolated, Learned} (Interp\_Learned) - uses the positional embedding learned from the largest image size to create embeddings for smaller images through trilinear interpolation. All methods perform effectively (Table \ref{tab:ablation}), but the \textit{center and select} approach produces better results for the \textit{IDH} status classification task.

\vspace{0.5cm}
\noindent \textbf{Ablation Study - Multi-Class Classification.} We extend the glioma dataset to classify three glioma subtypes, thereby evaluating the model's performance in this more complex task. Our model achieves MCC scores comparable to those of both \emph{ResNet-18} and \emph{Pix2Struct} (Table~\ref{tab:multi-class}), while demonstrating faster training times. This study underscores the effectiveness of our model across diverse classification tasks, especially in scenarios involving multiple classes.
\begin{figure}[ht]
\begin{minipage}{\linewidth}
\begin{floatrow}
\raggedright
\capbtabbox[1.2 \linewidth]{%
\footnotesize
  \begin{tabular}[t]{l@{\hspace{0.3em}}ccc}
\toprule
\textbf{Method} & \textbf{AUC} & \textbf{F1-Score} & \textbf{MCC}\\
\midrule
Indep\_Fixed & $0.938 \pm 0.011$ & $0.742 \pm 0.076$ & $0.701 \pm 0.074$\\
Interp\_Fixed & $0.929 \pm 0.007$ & $0.720 \pm 0.065$ & $0.677 \pm 0.048$\\
Interp\_Learned & $0.940 \pm 0.008$ & $0.750 \pm 0.025$ & $0.690 \pm 0.034$\\
Center \& Select & $\textbf{0.942} \pm 0.011$ & $\textbf{0.755} \pm 0.059$ & $\mathbf{0.709} \pm 0.069$ \\
\bottomrule
\end{tabular}
}{%
  \caption{\small Comparison of positional embedding strategies using the \emph{VariViT-GA} model.}
  \label{tab:ablation}
}
\capbtabbox[0.8\linewidth]{%

\raggedleft
\footnotesize
\begin{tabular}[t]{cc}
\toprule
\textbf{Method} & \textbf{MCC}\\
\midrule
3D ResNet-18 & $\mathbf{0.548} \pm 0.04$ \\
3D Vanilla ViT & $0.519 \pm 0.07$ \\
3D Pix2Struct & $0.543 \pm 0.02$ \\
VariViT-GA & $0.544 \pm 0.06$ \\
\bottomrule
\end{tabular}  
}{%
  \caption{\small Comparison of models for multi-class classification.}
  \label{tab:multi-class}
}
\end{floatrow}
\end{minipage}
\end{figure}
\vspace{-0.7cm}
\section{Discussion and Conclusion}
%Although \emph{ViTs} are effective models for learning image features, computation restrictions and high variability of pathologies in medical images have restricted their widespread adoption in the medical community. 
%We propose a principled approach to overcome these restrictions. Our approach scales to 3D images of different sizes and learns semantic features for improved classification. Further, we firmly believe that keeping the region of interest in focus is a more efficient way of feature learning. As our method works seamlessly with different crop sizes, it will invariably focus on the tumor, thus handling shape, size, and texture heterogeneity. Finally, we hope our work paves the way for the widespread adoption of \emph{ViT} in the medical community, thus encouraging the adoption of sophisticated feature learners that have become de-facto standards in the image-processing community.
\emph{ViTs} excel at image feature learning, but limitations exist in medical image analysis due to computational burden and anatomical variability.
We address this with \emph{VariViT}, a method that efficiently scales to various 3D image sizes while effectively learning features for improved classification. Our approach centers around maintaining focus on the region of interest, a strategy that demonstrably improves feature learning. By adapting to variable image sizes, \emph{VariViT} concentrates on critical areas despite inherent anatomical and/or pathological variability. This targeted approach, however, necessitates an initial bounding box or segmentation.
While we tested our framework on brain MRIs, its versatility allows for adaptation to other modalities and regions, and we encourage readers to adapt our method to their individual needs.
A significant avenue for future research is the exploration of our batching strategy and positional embedding technique in the context of extremely large datasets or high-resolution images. In such scenarios, we anticipate that the efficiency gains of our method will be particularly advantageous when compared to traditional \emph{ViT} implementations.
\midlacknowledgments{This study was supported by the DFG within the SPP Radiomics, grant \#428223038.}
%\bibliographystyle{plain} 
\bibliography{midl24_161}
\vspace{-0.2cm}
\appendix
\section{Datasets}
\label{appendix:datasets}

The \textit{glioma} dataset comprises MRIs from various studies, including the \textit{Erasmus Glioma Database} (EGD) \cite{van2021erasmus}, the \textit{REMBRANDT} MRI dataset collection \cite{sayah2022enhancing}, the \textit{University of California San Francisco Preoperative Diffuse Glioma MRI Dataset} (UCSF-PDGM) \cite{calabrese2022university}, the \textit{University of Pennsylvania Glioblastoma Imaging, Genomics, and Radiomics} (UPenn-GBM) dataset \cite{bakas2022university}, \textit{The Cancer Genome Atlas} (TCGA) \cite{bakas2017advancing}, and a private MRI dataset. In the \textit{brain tumor} dataset, we collect data from the \textit{BraTS 21} \cite{baid2021rsna} and \textit{LUMIERE} \cite{suter2022lumiere} datasets for primary tumors. For metastases, we utilize the \textit{BraTS-Mets 2023} \cite{moawad2023brain} dataset.

\section{Bounding Box Distribution}
\label{appendix:bounding_box}
\begin{figure}[ht]
    \centering
    \includegraphics[width=0.7\linewidth]{images/bbox_dist (1).pdf}
    \caption{\small 3D Bounding Box Distribution of Glioma (a) and Brain Tumor (b) datasets. The x-axis represents the size of the 3D bounding box along the three dimensions, while the y-axis denotes the frequency of samples. The distribution of samples is divided into three equal bins based on the size of the bounding boxes.}
    \label{fig:bbox}
\end{figure}

To simulate various image sizes, we categorize both datasets into three bins, each containing approximately equal numbers of samples, based on the dimension of the largest tumor. We establish two threshold values to ensure an equal distribution of samples across these bins. Subsequently, a fixed bounding box crop size is assigned for each bin: \(64 \times 64 \times 64\)\,mm\textsuperscript{3} for cases where this dimension is less than 67, \(80 \times 80 \times 80\)\,mm\textsuperscript{3} for values between 67 and 87 (inclusive), and \(96 \times 96 \times 96\)\,mm\textsuperscript{3} for the largest size. This is depicted in Figure~\ref{fig:bbox}, which shows the tumor size distribution for the glioma (left) and brain tumor (right) datasets. In the brain tumor dataset, metastatic samples with tumor bounding box sizes smaller than \(40 \times 40 \times 40\)\,mm\textsuperscript{3} are excluded to ensure adequately sized tumors, thereby enhancing the complexity of the classification task.

% \section{Training Curves}
% \begin{figure}[ht]
%     \centering
%     \includegraphics[width=0.9\linewidth]{images/loss.pdf}
%     \caption{The training (orange) and validation (blue) loss curves of the models on the \textit{IDH} classification task. Notably, the  \emph{VariViT} - GA and  \emph{VariViT} - CBS models exhibit smoother curves. This behavior signifies higher stability as evidenced by lower standard deviations in the results. }
%     \label{fig:loss}
% \end{figure}
\section{Absolute vs.\ Relative Position Embedding}
\label{appendix:r_pe}
\textit{Absolute} positional encoding techniques allocate distinct encoding vectors to every position within the input sequence, thereby allowing the model to capture the absolute positions up to the maximum sequence length. These methods employ either fixed or learnable encodings. In contrast, \textit{relative} position methods encode the relative distance between input patches and learn the pairwise relationship between them \cite{shaw2018self, wu2021rethinking}. Typically, this is computed through a look-up table with learnable parameters that interact with queries and keys within self-attention modules during the training process. 
\begin{table}[ht]
    \centering
    \footnotesize
    \begin{tabular}{lcccc}
        \toprule
         \textbf{Batching} & \textbf{Coordinates} & \textbf{AUC} & \textbf{F1-Score} & \textbf{MCC} \\
        \midrule
        \textit{CBS} & Absolute & $\mathbf{0.933} \pm 0.013$ & $\mathbf{0.712} \pm 0.032$ & $\mathbf{0.646} \pm 0.036$ \\
         & Relative & $0.933 \pm 0.009$ & $0.697 \pm 0.013$ & $0.622 \pm 0.019$ \\
        \cmidrule(lr{1em}){1-5}
        \textit{GA} & Absolute & $\mathbf{0.945} \pm 0.007$ & $\mathbf{0.744} \pm 0.036$ & $\mathbf{0.684} \pm 0.034$ \\
         & Relative & $0.931 \pm 0.016$ & $0.718 \pm 0.033$ & $0.666 \pm 0.031$ \\
        \bottomrule
    \end{tabular}
    \caption{\small Comparison of absolute and relative position embeddings for both the batching methods on the glioma dataset. Note that the embeddings are learned, and resizing is done by interpolation.}
    \label{tab:r_pe}
\end{table}
We experiment with both relative and absolute positional embeddings using our two proposed batching strategies for variable image sizes on the glioma dataset. For both methods, we initialize the positional embedding with the dimensions of the largest image size. We employ interpolation to adjust its size when dealing with varying image dimensions. \textit{CBS} and \textit{GA} batching with absolute positional embedding demonstrate superiority over their relative counterparts. Hence, relative positional embedding does not offer a significant advantage over absolute for our batching methods.
\section{Cosine Similarity}
\label{appendix:cos_plot}
\begin{figure}[ht]
    \centering
    \includegraphics[width=\linewidth]{images/cos_similarity.pdf}
    \caption{\small 2D view of the cosine similarity visualization of \emph{VariViT}'s  positional embedding for each of the three image sizes.}
    \label{fig:sim}
\end{figure}
Cosine similarity quantifies the similarity between two vectors in a \(d\)-dimensional space, determined by the cosine of the angle between them. Values range from \(-1\) to \(1\), where \(1\) signifies perfect similarity. In Figure~\ref{fig:sim}, each cube depicts the similarity between one position's embedding and the remaining \(N-1\) positions, where \(N = G \times G \times G\) denotes the total number of elements in the position embedding grid.
\section{t-SNE Plots}
\begin{figure}[ht]
    \centering
    \includegraphics[width=0.599\linewidth]{images/tsne-plots.pdf}
    \caption{\small t-SNE Plots depicting \emph{VariViT} CBS on the left and GA on the right, showcasing evaluations based on the highest scores across k-folds for Glioma (top) and Brain tumor (bottom) datasets.}
    \label{fig:t-SNE}
\end{figure}
\end{document}