\documentclass{midl} % Include author names

\usepackage{multirow}
\usepackage{booktabs}
\usepackage{lipsum}
\usepackage{xcolor}
\usepackage{pifont}
\usepackage{fix-cm}

% The following packages will be automatically loaded:
% jmlr, amsmath, amssymb, natbib, graphicx, url, algorithm2e
% ifoddpage, relsize and probably more
% make sure they are installed with your latex distribution

\jmlrvolume{-- Accepted}
\jmlryear{2026}
\jmlrworkshop{Full Paper -- MIDL 2026 submission}
\editors{Accepted for publication at MIDL 2026}

\title[Orientation Normalization of Multi-Stain Skin Tissue Cross-Sections]{Orientation Normalization of Multi-Stain Skin Tissue Cross-Sections}

 % Use \Name{Author Name} to specify the name.
 % If the surname contains spaces, enclose the surname
 % in braces, e.g. \Name{John {Smith Jones}} similarly
 % if the name has a "von" part, e.g \Name{Jane {de Winter}}.
 % If the first letter in the forenames is a diacritic
 % enclose the diacritic in braces, e.g. \Name{{\'E}louise Smith}

 % Two authors with the same address
 % \midlauthor{\Name{Author Name1} \Email{abc@sample.edu}\and
 %  \Name{Author Name2} \Email{xyz@sample.edu}\\
 %  \addr Address}

 % Three or more authors with the same address:
 % \midlauthor{\Name{Author Name1} \Email{an1@sample.edu}\\
 %  \Name{Author Name2} \Email{an2@sample.edu}\\
 %  \Name{Author Name3} \Email{an3@sample.edu}\\
 %  \addr Address}


% Authors with different addresses:
% \midlauthor{\Name{Author Name1} \Email{abc@sample.edu}\\
% \addr Address 1
% \AND
% \Name{Author Name2} \Email{xyz@sample.edu}\\
% \addr Address 2
% }

%\footnotetext[1]{Contributed equally}

% More complicated cases, e.g. with dual affiliations and joint authorship

\midlauthor{\Name{Ema Topolnjak\nametag{$^{1,2}$}}\vspace{0.1cm}\\
\Name{Evi Paulides\nametag{$^{1}$}}\vspace{0.1cm}\\
\Name{Willeke A. M. Blokx\nametag{$^{3}$}}\vspace{0.1cm}\\
\Name{Mitko Veta\nametag{$^{1}$}}\vspace{0.1cm}\\
\Name{Ruben T. Lucassen\nametag{$^{1,3}$}\vspace{0.1cm}} \Email{r.t.lucassen@umcutrecht.nl}\\
\addr $^{1}$ Dept.\,of Biomedical Engineering, Eindhoven University of Technology, the Netherlands \\
\addr $^{2}$ Dept.\,of Electrical and Computer Engineering, Vanderbilt University, Nashville, TN, USA \\
\addr $^{3}$ Dept.\,of Pathology, University Medical Center Utrecht, the Netherlands \\
}


\begin{document}

\maketitle

\begin{abstract}
Efficient examination of skin tissue specimens is key for pathologists to keep up with an increasing workload. Normalizing the orientation of tissue cross-sections before manual assessment could contribute to a more streamlined digital workflow. In this study, we compare multiple deep learning-based approaches for predicting the rotation angle required to correct the misorientation of skin tissue cross-sections. The models were developed and evaluated using a dataset of 10,649 H\&E-stained and 9,731 IHC-stained cross-section images from specimens with melanocytic lesions. Our results show that framing rotation angle prediction as a classification task with the circular target space divided into separate classes performed best, reaching mean absolute errors of 2.77° and 3.56° on the test sets of H\&E and IHC-stained cross-sections, respectively, approaching the level of human annotators. Automated orientation normalization, when implemented in whole slide image viewers, could make tissue examination more efficient and convenient for pathologists, while also serving as a valuable preprocessing step for the development of position-aware or multi-stain deep learning models.
\end{abstract}

\begin{keywords}
Classification, Regression, Rotation Angle, Computational Pathology, Melanoma
\end{keywords}

\section{Introduction}
Whole slide imaging has enabled the transition from conventional, microscope-based tissue examination to fully digital pathology diagnostics, offering benefits such as digital archiving, remote working, as well as computational image analysis to assist pathologists~\cite{stathonikos2013going,stathonikos2020digital}. Before a tissue specimen can be examined, however, it undergoes a histological preparation process, which includes tissue fixation, paraffin embedding, microtome sectioning, and staining. During the preparation process, tissue sections are often placed on the glass slides with an arbitrary orientation. Unlike tissue sections of most organ types, cross-sections of skin tissue specimens typically have an ideal orientation (i.e., the epidermis positioned at the top to reflect the natural outside-to-inside structure) and are, therefore, frequently misoriented, impeding optimal histological examination.

To address variation in orientation, earlier work has focused on methods such as data augmentation and rotation-equivariant convolutional neural networks~\cite{veeling2018rotation,lafarge2021roto}. While these methods improve model robustness and sample efficiency, they do not resolve the practical need of pathologists for consistently oriented tissue cross-sections. Alternatively, cross-sections can be rotated to their natural orientation prior to examination or downstream analysis. While this task has not yet been explored in dermatopathology, Shao~\textit{et al.}~\cite{SHAO2024108318} used deep learning-based rotation angle regression for radical prostatectomy sections as a preprocessing step before registration with MRI scans, and a few earlier works have investigated rotation prediction in the domain of natural images using classification~\cite{hara2017designing, gidaris2018unsupervised}, regression~\cite{hara2017designing}, and hybrid approaches~\cite{fischer2015image}.

In this study, we benchmark multiple deep learning approaches for predicting the rotation angle required to correct the misorientation of skin tissue cross-sections. The models were developed and evaluated based on a dataset of 10,649 H\&E-stained and 9,731 IHC-stained cross-section images from specimens with melanocytic lesions. Providing pathologists with cross-sections that are oriented consistently can streamline tissue assessment, potentially reducing the examination time and facilitating more convenient comparison across stains. Automated orientation normalization could also form an important preprocessing step in the development of position-aware or multi-stain deep learning models, as well as whole slide image (WSI) registration methods. The code and trained model parameters are made publicly available.\footnote{\url{https://github.com/RTLucassen/orientation_normalization}}

\section{Materials}
\subsection{Study design}
This study was performed using data from the digital archive of the pathology department of the University Medical Center (UMC) Utrecht, the Netherlands. A total of 3,675 cutaneous melanocytic lesion cases with H\&E-stained and IHC-stained WSIs available, accessioned between January 1, 2013, and August 31, 2023, were randomly selected with stratification. The study does not fall within the scope of the Dutch Medical Research Involving Human Subjects Act (WMO) and therefore does not require approval from an accredited medical ethics committee in the Netherlands. Nevertheless, an independent quality assessment (25U-0162) was conducted at the UMC Utrecht to ensure compliance with relevant laws and regulations, including those related to the informed consent procedure, data management, privacy, and legal considerations. Cases from patients who opted out of the use of their data for research purposes were excluded. All data were de-identified.

\subsection{Dataset}
The dataset curation process is schematically shown in Fig.~\ref{fig:flow-chart} in Appendix~\ref{sec:dataset_details}. At the start, 3,675 unique skin biopsy and excision specimens with a melanocytic lesion were identified, consisting of 175 cases for each of the 21 most frequently performed IHC stains for melanocytic lesions at the pathology department of the UMC Utrecht. To compare the performance between stains, all available pairs of corresponding H\&E-stained and IHC-stained WSIs were selected in an automated manner based on the WSI metadata.

Image acquisition was performed using a ScanScope XT scanner (Aperio, Vista, CA, USA) at 20$\times$ magnification with a resolution of 0.50 \textmu m per pixel (226 WSIs, acquired before 2016), a NanoZoomer 2.0-XR scanner (Hamamatsu Photonics, Hamamatsu, Shizuoka, Japan) at 40$\times$ magnification with a resolution of 0.23 \textmu m per pixel (4,756 WSIs, acquired starting from 2016 until May 2022), and a NanoZoomer S360 scanner (Hamamatsu Photonics, Hamamatsu, Shizuoka, Japan) at 40$\times$ magnification with a resolution of 0.23 \textmu m per pixel (2,320 WSIs, acquired after May 2022). For the purpose of this study, all WSIs were analyzed at 1.25$\times$ magnification, as a trade-off between image detail and computational cost.

Tissue sections in the WSIs were segmented and separated using SlideSegmenter~\cite{lucassen2024tissue}. For segmentation of the IHC-stained WSIs, the SlideSegmenter model was finetuned on an annotated set of 77 IHC-stained WSIs before use. Manual corrections to the segmentation and/or separation of the tissue sections were performed if the quality of the automated method was unsatisfactory. Moreover, we developed a custom annotation software tool for rotating the cross-sections into their natural orientation and recording the corrective rotation angle. The angle distributions for the H\&E-stained and IHC-stained cross-sections on the original WSIs are shown in Fig.~\ref{fig:angle_histogram} in Appendix~\ref{sec:dataset_details}. Image annotation was performed by three annotators (E.T., E.P., R.L.). In the process, sections of poor quality (e.g., no epidermis present) or sections without a natural orientation (e.g., tangentially cut tissue completely surrounded by epidermis) were excluded.

In the end, the dataset included 10,649 H\&E-stained and 9,731 IHC-stained cross-section images for development and evaluation of the rotation angle prediction models. The dataset was split on a patient level into a training set (70\%), a validation set (10\%), and a test set (20\%) for independent evaluation. Approximately one-third of the images in each set were annotated by each annotator.

\begin{figure}[t]
    \centering
    \includegraphics[width=\linewidth]{preprocessing.pdf}
    \caption{Pipeline for image preprocessing. Tissue sections were first segmented and separated using SlideSegmenter~\cite{lucassen2024tissue} to crop them from the WSI at 1.25$\times$ magnification. After the background was replaced to remove artifacts and parts of adjacently positioned sections, the cross-sections were rotated to their natural orientation and the padding was corrected. Position matrices were finally generated.}
    \label{fig:preprocessing}
\end{figure}

\subsection{Preprocessing}
The preprocessing steps are illustrated in Fig.~\ref{fig:preprocessing}. All separate cross-sections were first cropped from the original WSIs, after which the background was replaced with a uniform color equal to a representative WSI background. This was done to remove artifacts and parts of other, closely positioned cross-sections, which could affect the model predictions. The images and corresponding segmentation maps were rotated to normalize the orientation of the cross-section. Excessive padding at the edges after the rotation was removed, but also added when necessary to make the dimensions divisible by the tile size used in the deep learning model. Position matrices were generated to indicate the horizontal and vertical position of each tile with respect to the centroid of the cross-section (determined using the binary segmentation map) as the point of origin.

\begin{table}[t]
\centering
\resizebox{0.93\columnwidth}{!}{
\begin{tabular}{@{}ccc@{}}
\toprule\toprule
Approach                                                        & Predicted variable(s)                            & Loss                            \\ \midrule
Angle regression                                                      & Angle of rotation in radians                     & Cosine similarity (CS)          \\
                                            &                              &             \\
\multirow{3}{*}{Coordinate regression}                                     & \multirow{3}{*}{Coordinate on unit circle}         & Cosine similarity (CS)          \\
                                                                &                                                  & Mean squared error (MSE)        \\
                                                                &                                                  & Mean absolute error (MAE)       \\
                                           &                              &             \\
\multirow{2}{*}{Angle classification} & \multirow{2}{*}{Probabilities for angle classes} & Categorical cross-entropy (CCE) \\
                                            &                                                  & Binary cross-entropy (BCE)      \\ 
                                            &                              &           \\ 
Orientation classification                                           & Probability of being correctly oriented          & Binary cross-entropy (BCE)      \\
\bottomrule\bottomrule
\end{tabular}
}
\caption{Modeling objectives for orientation normalization included in the benchmark.}
\label{tab:comparisons}
\end{table}

\section{Methods}
\subsection{Model architecture}
A Vision Transformer (ViT)~\cite{dosovitskiy2020vit} (depth~=~14, heads~=~4, MLP-ratio~=~5, embedding dimensions~=~256) with 13.2 million trainable parameters was used as model architecture for all experiments unless otherwise specified. Input images were tessellated into non-overlapping tiles of 16\,$\times$\,16 pixels and converted to feature embeddings. Because the images varied considerably in size, learnable positional embeddings, which require fixed image dimensions, were unsuitable for this application. Instead, non-learnable positional encodings were used~\cite{vaswani2017attention}. The positional encodings were generated based on the position matrices. Let $(p_x,p_y)$ denote the position of a tile along the horizontal and vertical axes, relative to the tile at the centroid of the cross-section as point of origin, which was assigned coordinate $(0,0)$. Both axes were encoded independently using:
%
\begin{equation}
    \text{PE}(p,i) = 
    \begin{cases}
    \sin\left(\frac{p}{10000^{\frac{2i}{d/2}}}\right)& \text{if}~i~\text{is even}\\
    \cos\left(\frac{p}{10000^{\frac{2i}{d/2}}}\right)& \text{if}~i~\text{is odd}
    \end{cases}~~,
\end{equation}
%
\noindent where $p$ is the position of the tile along a single axis with respect to the origin, $i$ is the encoding dimension, and $d$ is the number of embedding dimensions used by the model. The final 2-dimensional positional encoding was obtained by concatenating the encodings for the horizontal and vertical axis. Unlike their original use in language modeling, where typically only positive coordinates are encoded, sinusoidal positional encodings can also represent negative coordinates, making them suitable for our application as well.

The parameters of the tile embedding layer, Transformer blocks, and final normalization layer were initialized based on ImageNet~\cite{deng2009imagenet} pretrained ViT parameters from \texttt{timm}~\cite{rw2019timm}. The final classification layer was configured with randomly initialized parameters to accommodate the difference in the prediction task.


\subsection{Modeling objectives}
We benchmarked multiple regression and classification approaches to predict the rotation angle for the orientation normalization task, which are listed in Table~\ref{tab:comparisons}. All approaches were defined before evaluation. Starting with the regression approaches, we investigated predicting the rotation angle $\theta$ in radians, and similar to the work of Hara~\textit{et~al.}~\cite{hara2017designing}, as the corresponding coordinate on the unit circle $\mathbf{v} = (\cos\theta, \sin\theta) = (x,y)$. While both formulations are suitable for optimization using the cosine similarity loss, only the latter is appropriate in combination with the mean squared error and mean absolute error loss, because optimization problems near the modulus of 360° in the circular target space are circumvented. The cosine similarity (CS) loss is defined as
% \begin{equation}
% \begin{split}
%     \mathcal{L}_{\text{CS}} & = \frac{1}{N}\sum^{N}_{i=1} \big(1-\cos(\theta_i-\hat\theta_i)\big) \\
%     & = \frac{1}{N}\sum^{N}_{i=1} \left(1 - \frac{\mathbf{v}_i\cdot\mathbf{\hat{v}}_i}{|\mathbf{v}_i|\,|\mathbf{\hat{v}}_i|}\right) \\
%     & = \frac{1}{N}\sum^{N}_{i=1} \left(1 - \frac{x_i\,\hat{x}_i + y_i\,\hat{y}_i}{\sqrt{\hat{x}_{i}^2 + \hat{y}_{i}^2}}\right),
% \end{split}
% \end{equation}
\begin{equation}
    \mathcal{L}_{\text{CS}} = \frac{1}{N}\sum^{N}_{i=1} \big(1-\cos(\theta_i-\hat\theta_i)\big) = \frac{1}{N}\sum^{N}_{i=1} \left(1 - \frac{\mathbf{v}_i\cdot\mathbf{\hat{v}}_i}{|\mathbf{v}_i|\,|\mathbf{\hat{v}}_i|}\right) = \frac{1}{N}\sum^{N}_{i=1} \left(1 - \frac{x_i\,\hat{x}_i + y_i\,\hat{y}_i}{\sqrt{\hat{x}_{i}^2 + \hat{y}_{i}^2}}\right),
\end{equation}
the mean squared error (MSE) loss is defined as
\begin{equation}
    \mathcal{L}_{\text{MSE}} = \frac{1}{N}\sum^{N}_{i=1} \left((x_i-\hat{x}_i)^2 + (y_i-\hat{y}_i)^2\right),
\end{equation}
and the mean absolute error (MAE) loss is defined as
\begin{equation}
    \mathcal{L}_{\text{MAE}} = \frac{1}{N}\sum^{N}_{i=1} \left(|x_i-\hat{x}_i| + |y_i-\hat{y}_i|\right),
\end{equation}
where $(x_{i},y_{i})$ and $(\hat{x}_{i},\hat{y}_{i})$ are the ground truth and predicted coordinates, respectively, for image $i$ in a batch of size $N$. The predicted coordinate $\mathbf{\hat{v}} = (\hat{x},\hat{y})$ can be converted back to the predicted angle of rotation $\hat\theta = \text{atan2}(\hat{y},\hat{x})$.

Continuing with the classification approaches, we investigated predicting the rotation angle class from a set of classes that divide the circular target space. The models were optimized using the categorical cross-entropy loss in combination with the softmax as final activation function and the binary cross-entropy with a sigmoid as final activation function. The categorical cross-entropy (CCE) loss was defined as
\begin{equation}
    \mathcal{L}_{\text{CCE}} = - \frac{1}{N} \sum_{i=1}^{N} \sum_{j=1}^{C}\big(p_{ij}\log(\hat{p}_{ij})\big)
\end{equation}
and the binary cross-entropy (BCE) loss was defined as
\begin{equation}
    \mathcal{L}_{\text{BCE}} = - \frac{1}{NC} \sum_{i=1}^{N} \sum_{j=1}^{C} \big( p_{ij}\log(\hat{p}_{ij}) + (1 - p_{ij})\log(1 - \hat{p}_{ij}) \big),
\label{eq:BCE}
\end{equation}
where $p_{ij}$ and $\hat{p}_{ij}$ are the ground truth and predicted probability, respectively, for image $i$ in a batch of size $N$ and class $j$ out of the total number of classes $C$. For all multi-class classification models, the circular target space was divided into $C$~=~360 angle classes. The models were optimized with label smoothing, where class labels were sampled from a wrapped normal distribution centered at the rotation angle $\theta$ with a standard deviation $\sigma$~=~2.5°. The distribution was normalized for the CCE loss and unnormalized for the BCE loss. To obtain the predicted angle of rotation, the predicted probability-weighted circular mean of the center angles of each class was calculated.

Finally, we investigated a binary classification approach, predicting whether the tissue cross-section in the input image has the correct orientation. The model was optimized using the BCE loss with $C$~=~1, combined with the same label smoothing as before. Model predictions across the full circular target space were obtained by rotating the input image by 1°, predicting if the orientation is correct, and repeating this 360 times. To obtain the predicted angle of rotation, the predicted probability-weighted circular mean of the angles across the repetitions was calculated.


\subsection{Training}
All models were optimized using the same training procedure. During training, randomly sampled rotations were applied to the cross-section images with normalized orientations, after which the padding was corrected again and a new position matrix was generated (see the last two steps in Fig.~\ref{fig:preprocessing}). The models were trained to predict the angle that is needed to rotate the cross-section back to its natural orientation. Based on preliminary experiments, we found the deep learning models to perform better when small rotations near the natural orientation were oversampled. Hence, the sample probability of rotation angles in the ranges of [0°,\,15°] and [345°,\,360°] was increased from 30/360\,=\,0.083 to 0.25, with a probability of 0.75 to sample from the remaining range of [15°,\,345°]. Moreover, on-the-fly data augmentation was applied in the form of left-right flipping and color changes, including adjustments to the brightness ($\,\pm\,$0.2), contrast ($\,\pm\,$0.2), saturation ($\,\pm\,$0.2), and hue ($\,\pm\,$0.05). 

The models were trained until the validation loss had converged (i.e., 500,000 iterations for regression approaches and 750,000 iterations for classification approaches) using the AdamW~\cite{loshchilov2019decoupled} optimization algorithm ($\beta_1$~=~0.9, $\beta_2$~=~0.999). Gradients were accumulated over every 20 iterations. The loss on the validation set was evaluated after every 5,000 iterations, and the model parameters that resulted in the smallest validation loss were saved. The learning rate was 2\,$\cdot$\,10$^{-5}$ at the start and halved after every five consecutive evaluations without a decrease in the loss on the validation set. Weight decay was equal to 1\,$\cdot$\,10$^{-2}$. If the total number of tile embeddings for an image exceeded 15,000, which was selected as maximum to limit the computational resources required, a subset of the embeddings equal in size to the maximum was randomly selected. Hyperparameters were tuned based on the performance on the validation set.

\subsection{Experimental setup}
For each modeling objective in the benchmark, five model instances were trained using different random seeds. The same five seeds were consistently used across the modeling objectives. The performance of the models was evaluated based on the cross-section orientations on the original WSIs using the mean absolute error (MAE) and median absolute error (MedAE) between the ground truth and predicted rotation angle, the percentage of cases within 2.5°, 5.0°, and 10.0° of the ground truth, and Bland-Altman plots~\cite{bland1986statistical}, all reported at a cross-section level. The mean and standard deviation (SD) of the performance scores for the five model instances were reported as final scores per modeling objective. All approaches were trained and evaluated on the set of H\&E-stained cross-sections, the set of IHC-stained cross-sections, and the combined set. To put the model performance into perspective, all tissue cross-sections in the test set were annotated again by all three annotators (E.T., E.P., R.L.) to assess the inter-annotator variability. One of the annotators repeated the test set annotation a second time after a washout period of two weeks to also assess the intra-annotator variability. Moreover, an ablation study was performed to show the effect of the positional encodings and the background replacement. Additional experiments and analyses are included in Appendices~\ref{sec:time-memory}, \ref{sec:sampling-performance}, and \ref{sec:uncertainty-estimation}.

\begin{figure}[t]
    \centering
    \includegraphics[width=0.92\linewidth]{Bland-Altman_plot.pdf}
    \caption{Bland-Altman plots of ensemble predictions from the best-performing models for the (A) H\&E-stained cross-sections and (B) IHC-stained cross-sections.}
    \label{fig:Blant-Altman}
\end{figure}

\begin{table}[]
\centering
\resizebox{0.8\columnwidth}{!}{
\begin{tabular}{@{}ccccccc@{}}
\toprule\toprule
\multirow{2}{*}{Approach}                                                         & \multirow{2}{*}{Loss} & \multirow{2}{*}{MAE~(°)} & \multirow{2}{*}{MedAE~(°)} & \multicolumn{3}{c}{\% of images within $k$}                  \\ \cmidrule(l){5-7} 
                                                                                  &                       &                      &                        & $k$\,=\,2.5°             & $k$\,=\,5.0°             & $k$\,=\,10.0°            \\ \midrule
\begin{tabular}[c]{@{}c@{}}Angle\\ regression\end{tabular}                        & CS                    & 13.04{\scriptsize $\,\pm\,$1.68}           & 7.80{\scriptsize $\,\pm\,$0.89}             & 18.6                & 34.9                & 60.2                \\
                                               \vspace{-2.5mm}              &  &  &  &  &  & \\
\multirow{3}{*}{\begin{tabular}[c]{@{}c@{}}Coordinate \\ regression\end{tabular}} & CS                    & 5.71{\scriptsize $\,\pm\,$0.49}            & 3.58{\scriptsize $\,\pm\,$0.29}              & 36.5                & 63.9                & 87.7                \\
                                                                                  & MSE                   & 4.23{\scriptsize $\,\pm\,$0.29}            & 2.75{\scriptsize $\,\pm\,$0.19}              & 46.1                & 74.5                & 92.7                \\
                                                                                  & MAE                   & 3.65{\scriptsize $\,\pm\,$0.32}            & 2.27{\scriptsize $\,\pm\,$0.18}              & 54.1                & 80.8                & 94.3                \\
                                               \vspace{-1.5mm}               &  &  &  &  &  &  \\
\multirow{2}{*}{\begin{tabular}[c]{@{}c@{}}Angle\\ classification\end{tabular}}   & CCE                   & 2.96{\scriptsize $\,\pm\,$0.08}            & 1.75{\scriptsize $\,\pm\,$0.06}              & 63.7                & 86.3                & 96.4                \\
                                                                                  & BCE                   & 3.12{\scriptsize $\,\pm\,$0.12}            & 1.82{\scriptsize $\,\pm\,$0.05}              & 62.0                & 85.0                & 95.9                \\
                                                        \vspace{-2.5mm}     &  &  &  &  &  &  \\
\begin{tabular}[c]{@{}c@{}}Orientation\\ classification\end{tabular}              & BCE                   & 5.93{\scriptsize $\,\pm\,$0.67}            & 2.36{\scriptsize $\,\pm\,$0.32}              & 52.3                & 74.2                & 87.7                \\ \bottomrule\bottomrule
\end{tabular}
}
\caption{Results for the rotation angle prediction approaches, trained and evaluated on H\&E-stained tissue cross-section images. The results represent the mean (and standard deviation) of five model instances per approach. Note that lower scores for the mean absolute error (MAE) and median absolute error (MedAE) are better.}
\label{tab:HE_benchmark}
\end{table}


\vspace{-0.25cm}
\section{Results}
\subsection{Benchmark}
The results for the regression and classification approaches are reported in Table~\ref{tab:HE_benchmark} and Table~\ref{tab:IHC_benchmark} for the H\&E-stained and IHC-stained cross-sections, respectively. For both staining types, angle classification using the CCE or BCE loss for training reached the best performance, followed by coordinate regression using the MSE or MAE loss. The remaining approaches, including direct angle regression in radians, coordinate regression using the CS loss, and binary orientation classification, all performed substantially worse. Similar trends were seen for a smaller ViT configuration (see Table~\ref{tab:smaller_vit_results} in Appendix~\ref{sec:smaller_vit_results}). Across all approaches, the performance was better for cross-sections stained with H\&E than for cross-sections stained with IHC. Moreover, most of the models trained on the combined set of H\&E-stained and IHC-stained cross-sections reached a better performance than those trained on a single staining type when evaluated on the IHC-stained cross-sections, but performed comparably to slightly worse when evaluated on the H\&E-stained cross-sections (see Tables~\ref{tab:HE_benchmark_combined_training}~and~\ref{tab:IHC_benchmark_combined_training} in Appendix~\ref{sec:results_H&E_IHC}).

\begin{table}[b]
\centering
\resizebox{0.7\columnwidth}{!}{
\begin{tabular}{@{}cccccc@{}}
\toprule\toprule
\multirow{2}{*}{Comparison}                             & \multirow{2}{*}{MAE~(°)} & \multirow{2}{*}{MedAE~(°)} & \multicolumn{3}{c}{\% of images within $k$} \\ \cmidrule(l){4-6} 
                           &                      &                        & $k$\,=\,2.5°       & $k$\,=\,5.0°       & $k$\,=\,10.0°       \\ \midrule
1~\,vs.~1             & 1.79                 & 1.03                   & 80.9           & 94.1           & 98.5            \\
1~\,vs.~2             & 2.18                 & 1.20                   & 74.7           & 91.1           & 97.3            \\
1~\,vs.~3             & 2.71                 & 1.55                   & 69.4           & 89.3           & 96.4            \\
2~\,vs.~3             & 2.64                 & 1.48                   & 69.2           & 88.0           & 97.4            \\ \bottomrule\bottomrule
\end{tabular}
}
\caption{Intra-annotator variability after a 2-week washout period (row 1) and inter-annotator variability (rows 2--4) for the H\&E-stained cross-sections in the test set.}
\label{tab:HE_annotator}
\end{table}

For the H\&E-stained cross-sections, the best performance was achieved by the angle classification models trained only on the H\&E-stained cross-sections using the CCE loss, which, when averaged over five model instances, reached a MAE of 2.96$\,\pm\,$0.08°. Further improvements were observed when using the average of the predictions to form a model ensemble instead, in which case a MAE of 2.77° was reached. The Bland-Altman plot of the model ensemble predictions is shown in Fig.~\ref{fig:Blant-Altman}A, demonstrating no bias and few outlier cross-sections. Visual inspection revealed the presence of separate tissue fragments and the lack of a complete epidermis among the cross-sections with the largest prediction errors, as can be seen in Fig.~\ref{fig:failure_cases_HE&IHC}A. The inter- and intra-annotator variability for the H\&E-stained cross-sections is reported in Table~\ref{tab:HE_annotator}. Between the annotators, the MAE ranged from 2.18° to 2.71°. The model ensemble reached a predictive performance close to the level of consistency of the annotators. The intra-annotator comparison showed the best level of consistency.




\begin{table}[]
\centering
\resizebox{0.8\columnwidth}{!}{
\begin{tabular}{@{}ccccccc@{}}
\toprule\toprule
\multirow{2}{*}{Approach}                                                         & \multirow{2}{*}{Loss} & \multirow{2}{*}{MAE~(°)} & \multirow{2}{*}{MedAE~(°)} & \multicolumn{3}{c}{\% of images within $k$}                  \\ \cmidrule(l){5-7} 
                                                                                  &                       &                      &                        & $k$\,=\,2.5°             & $k$\,=\,5.0°             & $k$\,=\,10.0°            \\ \midrule
\begin{tabular}[c]{@{}c@{}}Angle\\ regression\end{tabular}                        & CS                    & 19.60{\scriptsize $\,\pm\,$1.26}           & 10.73{\scriptsize $\,\pm\,$0.95}             & 13.0                & 25.7                & 47.4                \\
                                               \vspace{-2.5mm}              &  &  &  &  &  & \\
\multirow{3}{*}{\begin{tabular}[c]{@{}c@{}}Coordinate \\ regression\end{tabular}} & CS                    & 9.61{\scriptsize $\,\pm\,$0.50}            & 5.36{\scriptsize $\,\pm\,$0.18}              & 26.3                & 47.1                & 73.3                \\
                                                                                  & MSE                   & 6.39{\scriptsize $\,\pm\,$0.43}            & 3.74{\scriptsize $\,\pm\,$0.16}              & 35.1                & 61.7                & 85.9                \\
                                                                                  & MAE                   & 5.87{\scriptsize $\,\pm\,$0.49}            & 2.92{\scriptsize $\,\pm\,$0.27}              & 44.0                & 70.2                & 88.8                \\
                                               \vspace{-1.5mm}               &  &  &  &  &  &  \\
\multirow{2}{*}{\begin{tabular}[c]{@{}c@{}}Angle\\ classification\end{tabular}}   & CCE                   & 4.46{\scriptsize $\,\pm\,$0.21}            & 2.21{\scriptsize $\,\pm\,$0.09}              & 55.1                & 79.0                & 92.7                \\
                                                                                  & BCE                   & 4.47{\scriptsize $\,\pm\,$0.17}            & 2.19{\scriptsize $\,\pm\,$0.03}              & 54.7                & 78.8                & 92.3                \\
                                                        \vspace{-2.5mm}     &  &  &  &  &  &  \\
\begin{tabular}[c]{@{}c@{}}Orientation\\ classification\end{tabular}              & BCE                   & 8.50{\scriptsize $\,\pm\,$1.12}            & 3.57{\scriptsize $\,\pm\,$0.44}              & 38.8                & 62.3                & 80.8                \\ \bottomrule\bottomrule
\end{tabular}
}
\caption{Results for the rotation angle prediction approaches, trained and evaluated on IHC-stained tissue cross-section images. The results represent the mean (and standard deviation) of five model instances per approach. Note that lower scores for the mean absolute error (MAE) and median absolute error (MedAE) are better.}
\label{tab:IHC_benchmark}
\end{table}


% 1: Ema, 2: Ruben, 3: Evi
\begin{table}[b]
\centering
\resizebox{0.7\columnwidth}{!}{
\begin{tabular}{@{}cccccc@{}}
\toprule\toprule
\multirow{2}{*}{Comparison} & \multirow{2}{*}{MAE~(°)} & \multirow{2}{*}{MedAE~(°)} & \multicolumn{3}{c}{\% of images within $k$} \\ \cmidrule(l){4-6} 
                            &                      &                        & $k$\,=\,2.5°       & $k$\,=\,5.0°       & $k$\,=\,10.0°       \\ \midrule
1~\,vs.~1             & 1.80                 & 0.92                   & 86.2           & 95.3           & 98.4            \\
1~\,vs.~2             & 2.42                 & 1.15                   & 77.0           & 90.7           & 96.4            \\
1~\,vs.~3             & 3.02                 & 1.33                   & 72.9           & 89.2           & 96.3            \\
2~\,vs.~3             & 3.03                 & 1.52                   & 67.1           & 86.7           & 96.0            \\ \bottomrule\bottomrule
\end{tabular}
}
\caption{Intra-annotator variability after a 2-week washout period (row 1) and inter-annotator variability (rows 2--4) for the IHC-stained cross-sections in the test set.}
\label{tab:IHC_annotator}
\end{table}


For the IHC-stained cross-sections, the best performance was achieved by the angle classification models trained on the combined set of H\&E-stained and IHC-stained cross-sections using the BCE loss, which, when averaged over five model instances, reached a MAE of 3.89$\,\pm\,$0.20°. Similarly, further improvements were observed when using an ensemble of the five models, reaching a MAE of 3.56°. The Bland-Altman plot of the model ensemble predictions can be seen in Fig.~\ref{fig:Blant-Altman}B, which shows no bias, but more outliers than for the H\&E-stained cross-sections. Among the IHC-stained cross-sections with the largest prediction errors, the presence of separate tissue fragments, the lack of a complete epidermis, or sections almost completely surrounded by epidermis were primarily observed, as can be seen in Fig.~\ref{fig:failure_cases_HE&IHC}B. The inter- and intra-annotator variability for the IHC-stained cross-sections is reported in Table~\ref{tab:IHC_annotator}. The MAE ranged from 2.42° to 3.03° between the annotators. A larger difference was seen between the model ensemble performance and the level of annotator consistency for the IHC-stained cross-sections than for the H\&E-stained cross-sections.

\begin{figure}[t]
    \centering
    \includegraphics[width=\linewidth]{Failure_cases.pdf}
    \caption{Three rows of (A) H\&E-stained cross-sections and (B) IHC-stained cross-sections from the test set with the largest prediction errors by the ensemble of five model instances using the best-performing approach. From left to right: the original orientation on the WSI, the orientation after applying the predicted rotation angle, and the natural orientation as provided by one of the annotators.}
    \label{fig:failure_cases_HE&IHC}
\end{figure}

\subsection{Ablation study}
\noindent The results of the ablation study are reported in Table~\ref{tab:ablation-study}. Removing either the positional encodings inside the model or the background replacement during preprocessing decreased the predictive performance, with a stronger decrease observed when the two were removed concurrently. Without positional encodings, the model cannot leverage any spatial relation between tiles and must rely on the patterns within the image tiles for the prediction of the rotation angle. Visual inspection of the cross-section images that showed the largest absolute errors without replaced backgrounds revealed the presence of other, closely positioned cross-sections, which would have been removed by the background replacement (see Fig.~\ref{fig:background-replacement} in Appendix~\ref{sec:background-replacement}).
\\
\section{Discussion and conclusion}
In this work, we compared several deep learning-based classification and regression approaches for predicting the rotation angle required to normalize the orientation of H\&E-stained and IHC-stained skin tissue cross-sections. Among the evaluated approaches, angle classification in combination with the BCE or CCE loss for training reached the best performance.

For all evaluated approaches, the performance was better for cross-sections stained with H\&E than for cross-sections stained with IHC, which might be due to the higher contrast of the tissue with respect to the background or because of better tissue quality. This differs from the level of consistency between the human annotators, which was similar for both staining types. In addition, training on the combined set of H\&E-stained and IHC-stained cross-sections, compared to training exclusively on cross-sections from a single staining type, resulted for most approaches in slightly worse performance when evaluated on the test set of H\&E-stained cross-sections, while mostly reaching a better performance on the test set of IHC-stained cross-sections.

An interesting direction for future work would be to integrate orientation normalization of skin tissue cross-sections into a WSI viewer, followed by a user study to assess the effect on the diagnostic workflow (e.g., reduction in examination time or improvements in the convenience for pathologists). Automated matching of corresponding cross-sections across stains after the orientation has been normalized can potentially also be helpful. Moreover, the predictive performance on skin tissue cross-sections with non-melanocytic pathologies remains to be evaluated.

In conclusion, the best performance was achieved by approaching rotation angle prediction as a classification task with the circular target space divided into separate classes, which reached a performance close to the consistency level of human annotators. Automated orientation normalization could both streamline skin tissue assessment by pathologists, potentially reducing the examination time and improving convenience, and form an important preprocessing step for developing position-aware or multi-stain deep learning models.



\begin{table}[t]
\centering
\resizebox{\columnwidth}{!}{
\begin{tabular}{@{}cccccccc@{}}
\toprule\toprule
\multirow{2}{*}{Positional Encodings} & \multicolumn{2}{c}{Background replacement}                            & \multirow{2}{*}{MAE~(°)} & \multirow{2}{*}{MedAE~(°)} & \multicolumn{3}{c}{\% of images within $k$} \\ \cmidrule(lr){2-3}\cmidrule(lr){6-8} 
                           & Training & Evaluation &                      &                        & $k$\,=\,2.5°       & $k$\,=\,5.0°       & $k$\,=\,10.0°       \\ \midrule
\ding{56} & \ding{56} & \ding{56}  & 20.79{\scriptsize $\,\pm\,$6.27}  & 17.56{\scriptsize $\,\pm\,$8.22}  & 9.7    & 19.2    & 36.0 \\
\ding{56} & \ding{51} & \ding{56}  & 5.93{\scriptsize $\,\pm\,$1.19}  & 4.10{\scriptsize $\,\pm\,$1.02}  & 34.5    & 58.5    & 84.7 \\
\ding{56} & \ding{51} & \ding{51}  & 5.78{\scriptsize $\,\pm\,$1.30}  & 4.14{\scriptsize $\,\pm\,$1.05}  & 34.0    & 58.7    & 84.8 \\
\ding{51} & \ding{56} & \ding{56}  & 5.39{\scriptsize $\,\pm\,$0.37}  & 3.31{\scriptsize $\,\pm\,$0.32}  & 40.3    & 66.7    & 88.7 \\
\ding{51} & \ding{51} & \ding{56}  & 3.45{\scriptsize $\,\pm\,$0.10}  & 1.90{\scriptsize $\,\pm\,$0.09}  & 60.5    & 83.2    & 94.7 \\
\ding{51} & \ding{51} & \ding{51}  & 2.96{\scriptsize $\,\pm\,$0.08}  & 1.75{\scriptsize $\,\pm\,$0.06}  & 63.7    & 86.3    & 96.4      \\ \bottomrule\bottomrule
\end{tabular}
}
\caption{Results of the ablation study showing how the predictive performance is affected by removing the positional encodings and background replacement during preprocessing on the H\&E-stained cross-section images. The results represent the mean (and standard deviation) of five model instances per approach. Note that lower scores for the mean absolute error (MAE) and median absolute error (MedAE) are better.}
\label{tab:ablation-study}
\end{table}


\clearpage  % Acknowledgements, references, and appendix do not count toward the page limit (if any)
% Acknowledgments---Will not appear in anonymized version
\midlacknowledgments{This research was financially supported by the Hanarth Fonds.}


\bibliography{midl26_030}

\newpage
\appendix

\renewcommand{\thefigure}{A\arabic{figure}}
\renewcommand{\theHfigure}{A\arabic{figure}}
\setcounter{figure}{0}

\section{Dataset details}
\label{sec:dataset_details}
\begin{figure}[h!]
    \centering
    \includegraphics[width=0.85\linewidth]{flow_chart.pdf}
    \caption{Flow chart of the dataset curation process.}
    \label{fig:flow-chart}
\end{figure}

\begin{figure}[h!]
    \centering
    \includegraphics[width=0.6\linewidth]{angle_histogram.pdf}
    \caption{Histogram showing the distribution of angles for the H\&E-stained and IHC-stained cross-sections on the original WSIs. The distributions are similar for the different staining types, both showing peaks at 0°/360°, 90°, 180°, and 270°, indicating that the lab technicians did to some extent take the orientation into account when placing the tissue sections on the microscopy slides.}
    \label{fig:angle_histogram}
\end{figure}


\newpage
\section{Results for smaller ViT configuration trained on H\&E-stained cross-sections}
\label{sec:smaller_vit_results}

\renewcommand{\thetable}{B\arabic{table}}
\renewcommand{\theHtable}{B\arabic{table}}
\setcounter{table}{0}

\begin{table}[h]
\centering
\resizebox{0.85\columnwidth}{!}{
\begin{tabular}{@{}ccccccc@{}}
\toprule\toprule
\multirow{2}{*}{Approach}                                                         & \multirow{2}{*}{Loss} & \multirow{2}{*}{MAE~(°)} & \multirow{2}{*}{MedAE~(°)} & \multicolumn{3}{c}{\% of images within $k$}                  \\ \cmidrule(l){5-7} 
                                                                                  &                       &                      &                        & $k$\,=\,2.5°             & $k$\,=\,5.0°             & $k$\,=\,10.0°            \\ \midrule
\begin{tabular}[c]{@{}c@{}}Angle\\ regression\end{tabular}                        & CS                    & 19.10{\scriptsize $\,\pm\,$2.04}           & 11.81{\scriptsize $\,\pm\,$1.50}             & 12.3                & 23.8                & 44.0                \\
                                               \vspace{-2.5mm}              &  &  &  &  &  & \\
\multirow{3}{*}{\begin{tabular}[c]{@{}c@{}}Coordinate \\ regression\end{tabular}} & CS                    & 11.08{\scriptsize $\,\pm\,$0.76}            & 6.53{\scriptsize $\,\pm\,$0.41}              & 20.8                & 40.5                & 66.8                \\
                                                                                  & MSE                   & 8.01{\scriptsize $\,\pm\,$0.23}            & 4.66{\scriptsize $\,\pm\,$0.16}              & 29.9                & 52.6                & 79.0                \\
                                                                                  & MAE                   & 7.37{\scriptsize $\,\pm\,$0.52}            & 3.87{\scriptsize $\,\pm\,$0.15}              & 35.0                & 60.0                & 83.4                \\
                                               \vspace{-1.5mm}               &  &  &  &  &  &  \\
\multirow{2}{*}{\begin{tabular}[c]{@{}c@{}}Angle\\ classification\end{tabular}}   & CCE                   & 5.60{\scriptsize $\,\pm\,$0.27}            & 2.83{\scriptsize $\,\pm\,$0.08}              & 45.3                & 72.2                & 90.1                \\
                                                                                  & BCE                   & 6.05{\scriptsize $\,\pm\,$0.23}            & 3.00{\scriptsize $\,\pm\,$0.12}              & 43.4                & 69.5                & 88.6                \\
                                                        \vspace{-2.5mm}     &  &  &  &  &  &  \\
\begin{tabular}[c]{@{}c@{}}Orientation\\ classification\end{tabular}              & BCE                   & 30.20{\scriptsize $\,\pm\,$1.32}            & 11.03{\scriptsize $\,\pm\,$1.08}              & 16.0                   & 29.6                 & 47.7                    \\ \bottomrule\bottomrule
\end{tabular}
}                                                
\caption{Results for the rotation angle prediction approaches, trained and evaluated on H\&E-stained tissue cross-section images using a smaller ViT configuration (depth~=~8, heads~=~4, MLP-ratio~=~4, embedding dimensions~=~128) with 1.7 million trainable parameters. Note that lower scores for the mean absolute error (MAE) and median absolute error (MedAE) are better.}
\label{tab:smaller_vit_results}
\end{table}


\newpage
\section{Results for models trained on combined set of H\&E and IHC-stained cross-sections}
\label{sec:results_H&E_IHC}

\renewcommand{\thetable}{C\arabic{table}}
\renewcommand{\theHtable}{C\arabic{table}}
\setcounter{table}{0}

\begin{table}[h]
\centering
\resizebox{0.73\columnwidth}{!}{
\begin{tabular}{@{}ccccccc@{}}
\toprule\toprule
\multirow{2}{*}{Approach}                                                         & \multirow{2}{*}{Loss} & \multirow{2}{*}{MAE~(°)} & \multirow{2}{*}{MedAE~(°)} & \multicolumn{3}{c}{\% of images within $k$}                  \\ \cmidrule(l){5-7} 
                                                                                  &                       &                      &                        & $k$\,=\,2.5°             & $k$\,=\,5.0°             & $k$\,=\,10.0°            \\ \midrule
\begin{tabular}[c]{@{}c@{}}Angle\\ regression\end{tabular}                        & CS                    & 15.46{\scriptsize $\,\pm\,$2.22}           & 9.26{\scriptsize $\,\pm\,$1.20}             & 15.8                & 29.4                & 53.1                \\
                                               \vspace{-2.5mm}              &  &  &  &  &  & \\
\multirow{3}{*}{\begin{tabular}[c]{@{}c@{}}Coordinate \\ regression\end{tabular}} & CS                    & 7.40{\scriptsize $\,\pm\,$0.49}            & 4.10{\scriptsize $\,\pm\,$0.19}              & 32.2                & 58.0                & 82.7                \\
                                                                                  & MSE                   & 4.61{\scriptsize $\,\pm\,$0.30}            & 2.86{\scriptsize $\,\pm\,$0.16}              & 44.8                & 72.7                & 91.5                \\
                                                                                  & MAE                   & 4.08{\scriptsize $\,\pm\,$0.59}            & 2.33{\scriptsize $\,\pm\,$0.22}              & 53.2                & 79.4                & 93.4                \\
                                               \vspace{-1.5mm}               &  &  &  &  &  &  \\
\multirow{2}{*}{\begin{tabular}[c]{@{}c@{}}Angle\\ classification\end{tabular}}   & CCE                   & 3.49{\scriptsize $\,\pm\,$0.17}            & 1.89{\scriptsize $\,\pm\,$0.09}              & 60.6                & 83.4                & 94.9                \\
                                                                                  & BCE                   & 3.22{\scriptsize $\,\pm\,$0.21}            & 1.82{\scriptsize $\,\pm\,$0.08}              & 62.5                & 85.1                & 95.7                \\
                                                        \vspace{-2.5mm}     &  &  &  &  &  &  \\
\begin{tabular}[c]{@{}c@{}}Orientation\\ classification\end{tabular}              & BCE                   & 6.92{\scriptsize $\,\pm\,$0.36}            & 2.49{\scriptsize $\,\pm\,$0.09}              & 50.4                    & 72.3                    & 86.5                    \\ \bottomrule\bottomrule
\end{tabular}
}                                                
\caption{Results for the rotation angle prediction approaches, trained on the combined set of H\&E-stained and IHC-stained cross-section images, evaluated only on the H\&E-stained tissue cross-section images. The results represent the mean (and standard deviation) of five model instances per approach. Note that lower scores for the mean absolute error (MAE) and median absolute error (MedAE) are better.}
\label{tab:HE_benchmark_combined_training}
\end{table}

\begin{table}[h]
\centering
\resizebox{0.73\columnwidth}{!}{
\begin{tabular}{@{}ccccccc@{}}
\toprule\toprule
\multirow{2}{*}{Approach}                                                         & \multirow{2}{*}{Loss} & \multirow{2}{*}{MAE~(°)} & \multirow{2}{*}{MedAE~(°)} & \multicolumn{3}{c}{\% of images within $k$}                  \\ \cmidrule(l){5-7} 
                                                                                  &                       &                      &                        & $k$\,=\,2.5°             & $k$\,=\,5.0°             & $k$\,=\,10.0°            \\ \midrule
\begin{tabular}[c]{@{}c@{}}Angle\\ regression\end{tabular}                        & CS                    & 20.66{\scriptsize $\,\pm\,$2.89}           & 11.96{\scriptsize $\,\pm\,$1.96}             & 12.3                & 24.2                & 43.9                \\
                                               \vspace{-2.5mm}              &  &  &  &  &  & \\
\multirow{3}{*}{\begin{tabular}[c]{@{}c@{}}Coordinate \\ regression\end{tabular}} & CS                    & 9.34{\scriptsize $\,\pm\,$0.93}            & 5.24{\scriptsize $\,\pm\,$0.30}              & 26.2                & 47.9                & 74.9                \\
                                                                                  & MSE                   & 5.85{\scriptsize $\,\pm\,$0.37}            & 3.57{\scriptsize $\,\pm\,$0.13}              & 37.2                & 63.1                & 87.0                \\
                                                                                  & MAE                   & 5.20{\scriptsize $\,\pm\,$0.67}            & 2.77{\scriptsize $\,\pm\,$0.18}              & 46.0                & 72.6                & 90.2                \\
                                               \vspace{-1.5mm}               &  &  &  &  &  &  \\
\multirow{2}{*}{\begin{tabular}[c]{@{}c@{}}Angle\\ classification\end{tabular}}   & CCE                   & 4.31{\scriptsize $\,\pm\,$0.27}            & 2.18{\scriptsize $\,\pm\,$0.08}              & 55.4                & 79.6                & 92.5                \\
                                                                                  & BCE                   & 3.89{\scriptsize $\,\pm\,$0.20}            & 2.08{\scriptsize $\,\pm\,$0.10}              & 57.3                & 81.1                & 93.8                \\
                                                        \vspace{-2.5mm}     &  &  &  &  &  &  \\
\begin{tabular}[c]{@{}c@{}}Orientation\\ classification\end{tabular}              & BCE                   & 7.91{\scriptsize $\,\pm\,$0.61}                    & 3.11{\scriptsize $\,\pm\,$0.10}                      & 42.8                    & 65.0                    & 82.1                    \\ \bottomrule\bottomrule
\end{tabular}
}
\caption{Results for the rotation angle prediction approaches, trained on the combined set of H\&E-stained and IHC-stained cross-section images, evaluated only on the IHC-stained tissue cross-section images. The results represent the mean (and standard deviation) of five model instances per approach. Note that lower scores for the mean absolute error (MAE) and median absolute error (MedAE) are better.}
\label{tab:IHC_benchmark_combined_training}
\end{table}


\section{Visual examples showing the benefit of background replacement}
\label{sec:background-replacement}

\renewcommand{\thefigure}{D\arabic{figure}}
\renewcommand{\theHfigure}{D\arabic{figure}}
\setcounter{figure}{0}

\begin{figure}[h!]
    \centering
    \includegraphics[width=\linewidth]{background_replacement.pdf}
    \caption{Five H\&E-stained cross-section images from the test set with the original background in the top row and the replaced background in the bottom row. The predicted rotation angle is shown below each image, with the absolute error relative to the ground truth in parentheses. The predictions were obtained using angle classification with a single ViT that was trained on cross-section images with replaced backgrounds.}
    \label{fig:background-replacement}
\end{figure}


\newpage
\section{Inference time and GPU memory consumption}
\label{sec:time-memory}

\renewcommand{\thetable}{E\arabic{table}}
\renewcommand{\theHtable}{E\arabic{table}}
\setcounter{table}{0}

To integrate orientation normalization of skin tissue cross-sections into WSI viewers, the inference time and GPU memory consumption of the rotation angle prediction model are important to consider. While the optimization of these metrics was not the focus of this study, we do report statistics of the inference time and GPU memory consumption measurements for the best-performing ViT on the test set in Tables~\ref{tab:time_mem_measurement1} and~\ref{tab:time_mem_measurement2}. The measurements are reported at several percentiles because the cross-section images vary in size. Limiting the number of feature embeddings to a maximum of 15,000 reduced the peak inference time and GPU memory consumption substantially. The effect of sampling a subset of the feature embeddings on the predictive performance is investigated in Appendix~\ref{sec:sampling-performance}.
\\
\\
\begin{table}[h!]
\centering
\begin{tabular}{@{}cccccc@{}}
\toprule\toprule
Percentile  & ~~50~~  & ~~75~~  & ~~95~~   & ~~99~~   & ~~100~~  \\ 
& Median & & & & Max\\ \midrule
Embeddings & 2,068 & 4,285 & 9,696 & 16,190 & 50,629 \\ \midrule
Time (ms)   & 9.3 & 17.4 & 62.3 & 156.6 & 2,022.6 \\
~~Memory (GB)~~ & 0.3 & 0.7 & 3.1  & 8.2  & 78.7  \\ \bottomrule\bottomrule
\end{tabular}
\caption{Inference time and GPU memory consumption at selected percentiles across the H\&E-stained test set using a single ViT model for angle classification, evaluated on an NVIDIA A100 80\,GB GPU with no limit on the number of feature embeddings per image.}
\label{tab:time_mem_measurement1}
\end{table}

\begin{table}[h!]
\centering
\begin{tabular}{@{}cccccc@{}}
\toprule\toprule
Percentile  & ~~50~~  & ~~75~~  & ~~95~~   & ~~99~~   & ~~100~~  \\ 
& Median & & & & Max\\ \midrule
Embeddings & 2,068 & 4,285 & 9,696 & 15,000 & 15,000 \\ \midrule
Time (ms)   & 7.2 & 8.8 & 14.2 & 27.5 & 53.5 \\
~~Memory (GB)~~ & 0.3 & 0.7 & 3.1  & 7.1  & 7.2  \\ \bottomrule\bottomrule
\end{tabular}
\caption{Inference time and GPU memory consumption at selected percentiles across the H\&E-stained test set using a single ViT model for angle classification, evaluated on one-quarter of an NVIDIA A100 80\,GB GPU with the maximum number of feature embeddings per image limited to 15,000.}
\label{tab:time_mem_measurement2}
\end{table}

\newpage
\section{Effect of feature embedding sampling on the predictive performance}
\label{sec:sampling-performance}
Sampling a subset of the feature embeddings extracted from an image can be effective to reduce the inference time and GPU memory consumption. In Figure~\ref{fig:sampling-plot}, the predictive performance as a function of the percentage of embeddings sampled to make the prediction is shown. The performance decreased slightly from using 100\% to 10\% of the feature embeddings, and strongly when less than 10\% of the feature embeddings were sampled per image.


\renewcommand{\thefigure}{F\arabic{figure}}
\renewcommand{\theHfigure}{F\arabic{figure}}
\setcounter{figure}{0}

\begin{figure}[h!]
    \centering
    \includegraphics[width=0.57\linewidth]{sampling_plot.pdf}
    \caption{Median absolute error (MedAE) on the H\&E-stained test set using angle classification with a single ViT based on randomly sampled subsets of the feature embeddings for each image. All data points represent the average across 25 repetitions. Note that logarithmic scaling is used for the x-axis.}
    \label{fig:sampling-plot}
\end{figure}

\newpage
\section{Uncertainty estimation based on the predicted probability distribution}
\label{sec:uncertainty-estimation}

Estimating the uncertainty of model predictions can potentially be an effective approach to identify incorrect predictions or detect unsuitable input images. In Figure~\ref{fig:entropy-plot}, the entropy of the predicted probability distribution is plotted against the absolute error between the predicted and ground truth rotation angle. A moderate correlation is observed between the absolute error and the entropy for the H\&E-stained test set images (Pearson's correlation coefficient $r = 0.58$ and Spearman's correlation coefficient $\rho = 0.41$), and the predictions for approximately half of the images excluded during preprocessing because of poor tissue quality or a lack of a natural orientation have a comparatively high entropy. Hence, while not perfect, the entropy of the predicted probability distribution could be helpful for identifying incorrect predictions and detecting unsuitable input images. An alternative approach, which could be explored in future work, would be to add an auxiliary classifier to predict whether a natural orientation can be defined for the cross-section as quality assurance.


\renewcommand{\thefigure}{G\arabic{figure}}
\renewcommand{\theHfigure}{G\arabic{figure}}
\setcounter{figure}{0}

\begin{figure}[h!]
    \centering
    \includegraphics[width=0.60\linewidth]{entropy_plot.pdf}
    \caption{Absolute error between the predicted and ground truth rotation angle visualized against the entropy of the predicted probability distribution for all H\&E-stained cross-section images in the test set using the angle classification method with an ensemble of five ViT models. At the top, the entropy of the predicted probability distribution using the same model ensemble is shown for all H\&E-stained cross-section images that were excluded during the dataset annotation stage because of poor tissue quality or a lack of a natural orientation, which have no ground truth rotation angle or absolute error for this reason. The black dot and vertical bar represent the mean for the test set and excluded images, respectively.}
    \label{fig:entropy-plot}
\end{figure}

\end{document}