\documentclass{midl} % Include author names

% The following packages will be automatically loaded:
% jmlr, amsmath, amssymb, natbib, graphicx, url, algorithm2e
% ifoddpage, relsize and probably more
% make sure they are installed with your latex distribution

\usepackage{float}
\usepackage{enumitem}

\usepackage[normalem]{ulem}
\useunder{\uline}{\ul}{}

\usepackage{mwe} % to get dummy images
\jmlryear{2024}
\jmlrworkshop{Full Paper -- MIDL 2024}
\jmlrvolume{-- nnn} % Comment: not sure if this "nnn" here should also be the openreview ID?
\editors{Accepted for publication at MIDL 2024}

\title[RADR]{RADR: A Robust Domain-Adversarial-based Framework for Automated Diabetic Retinopathy Severity Classification}

 % Use \Name{Author Name} to specify the name.
 % If the surname contains spaces, enclose the surname
 % in braces, e.g. \Name{John {Smith Jones}} similarly
 % if the name has a "von" part, e.g \Name{Jane {de Winter}}.
 % If the first letter in the forenames is a diacritic
 % enclose the diacritic in braces, e.g. \Name{{\'E}louise Smith}

 % Two authors with the same address
 % \midlauthor{\Name{Author Name1} \Email{abc@sample.edu}\and
 %  \Name{Author Name2} \Email{xyz@sample.edu}\\
 %  \addr Address}

 % Three or more authors with the same address:
 % \midlauthor{\Name{Author Name1} \Email{an1@sample.edu}\\
 %  \Name{Author Name2} \Email{an2@sample.edu}\\
 %  \Name{Author Name3} \Email{an3@sample.edu}\\
 %  \addr Address}


% Authors with different addresses:
% \midlauthor{\Name{Author Name1} \Email{abc@sample.edu}\\
% \addr Address 1
% \AND
% \Name{Author Name2} \Email{xyz@sample.edu}\\
% \addr Address 2
% }

%\footnotetext[1]{Contributed equally}

% More complicated cases, e.g. with dual affiliations and joint authorship
% \midlauthor{\Name{Author Name1\midljointauthortext{Contributed equally}\nametag{$^{1,2}$}} \Email{abc@sample.edu}\\
% \addr $^{1}$ Address 1 \\
% \addr $^{2}$ Address 2 \AND
% \Name{Author Name2\midlotherjointauthor\nametag{$^{1}$}} \Email{xyz@sample.edu}\\
% \Name{Author Name3\nametag{$^{2}$}} \Email{alphabeta@example.edu}\\
% \Name{Author Name4\midljointauthortext{Contributed equally}\nametag{$^{3}$}} \Email{uvw@foo.ac.uk}\\
% \addr $^{3}$ Address 3 \AND
% \Name{Author Name5\midlotherjointauthor\nametag{$^{4}$}} \Email{fgh@bar.com}\\
% \addr $^{4}$ Address 4
% }

\midlauthor{\Name{Sara Mínguez Monedero\midljointauthortext{Contributed equally}\nametag{$^{1}$}} \Email{sara.minguez.monedero@studium.uni-hamburg.de}\\
\Name{Fabian Westhaeusser\midlotherjointauthor\nametag{$^{2}$}} \Email{fabian.westhaeusser@zmnh.uni-hamburg.de}\\
\Name{Ehsan Yaghoubi\nametag{$^{1}$}} \Email{ehsan.yaghoubi@uni-hamburg.de}\\
\Name{Simone Frintrop\nametag{$^{1}$}} \Email{simone.frintrop@uni-hamburg.de}\\
\Name{Marina Zimmermann\nametag{$^{2}$}} \Email{marina.zimmermann@zmnh.uni-hamburg.de} \\
\addr $^{1}$ Department of Informatics, University of Hamburg, Hamburg, Germany\\
\addr $^{2}$ Institute of Medical Systems Biology, Center for Biomedical AI, Center for Molecular Neurobiology, University Medical Center Hamburg-Eppendorf, Hamburg, Germany
}


\begin{document}

\maketitle

\begin{abstract}
Diabetic retinopathy (DR), a potentially vision-threatening condition, necessitates accurate diagnosis and staging, which deep-learning models can facilitate. However, in clinical practice these models often struggle with robustness due to distribution shifts caused by variations in data acquisition protocols and hardware. We propose \textbf{RADR}, a novel \textbf{R}obust domain-\textbf{A}dversarial-based deep-learning framework for \textbf{DR} severity classification, aimed at generalization across diverse datasets and fundus cameras. Our work builds upon existing research: we combine several ideas to perform extensive dataset curation, preprocessing, and enrichment with camera information. We then use a domain adversarial training regime, which encourages our model to extract features that are both task-relevant and invariant to domain shifts. We explore our framework in its various levels of complexity in combination with multiple data augmentation policies in an ablative fashion. Experimental results demonstrate the effectiveness of our proposed method, achieving competitive performance to multiple state-of-the-art models on three unseen external datasets.
\end{abstract}

\begin{keywords}
Robustness, domain generalization, adversarial training, diabetic retinopathy.
\end{keywords}

\section{Introduction}
Diabetic retinopathy (DR) is a medical condition that occurs due to microvascular retinal complications that are caused by diabetes mellitus. If the disease progresses, the result is irreversible vision loss, which is why early diagnosis of the disease is of utmost importance. DR is characterized by the presence of lesions in the eye: microaneurysms, hemorrhages, and soft and hard exudates. These are made visible on color fundus eye images and form the basis for evaluation of the severity of the disease by ophthalmologists~\cite{wang2018diabetic, lechner2017pathology, Sun2022}. According to the International Clinical Diabetic Retinopathy (ICDR) scale, five levels of severity can be defined: no DR, mild, moderate, severe, and proliferative~\cite{DRscale}. Since the lesions are very small, manual diagnosis of DR is resource-intensive and time-consuming. This makes the development of algorithms to support the medical experts indispensable~\cite{Chen2022,Chetoui2020}. Deep-learning (DL) models have achieved great success in various tasks in the field of medical image analysis, including DR grading~\cite{li2021applications, ragabDeePSCDeepLearning2023a, litjens2017survey}. However, real-world clinical data includes many sources of variation, such as imaging standards, camera brands, and patient demographics, which lead to different distributions or covariate shifts, e.g.\ in the color space~\cite{domainmedical}. As a consequence, DL models trained on one (or even several) specific domain(s) usually do not generalize well to other unseen domains, resulting in a lack of robustness for the application in real-world scenarios~\cite{domainshift, wang2022generalizing}. Domain adaptation techniques, which aim at transferring knowledge from the source domain to the target domain or making models domain-agnostic, have emerged as a promising solution to this problem studied for various imaging modalities~\cite{MIDOG, surveyDA}. 
Nonetheless, only limited research has been conducted in the field of domain adaptation for color fundus eye images (see Section \ref{sec: related work}). Therefore, the primary objective of this work is to develop a robust model for DR severity classification that generalizes to unseen domains. Our contributions include:
\begin{description}[itemsep=-0.5pt,leftmargin=0.8cm]
\item[DR severity classification framework:] We introduce \textbf{RADR}, a novel \textbf{R}obust domain-\textbf{A}dversarial-based DL framework for \textbf{DR} severity classification, aimed to generalize across diverse datasets and fundus cameras. This builds upon existing research in the field, by, to the best of our knowledge, combining for the first time extensive dataset curation based on quality control labels provided by \citet{Fu2019}, camera domain information provided by \citet{cycleGan}, data preprocessing as well as domain adversarial training and data augmentation.
\item[Ablation study:] We qualitatively and quantitatively assess the impact of various commonly used methods for domain generalization on our curated training dataset, namely domain adversarial learning, multi-camera (MC) training as well as AugMix and custom color augmentations, providing novel insights into their effectiveness in this field.
\item[Comparative analysis:] We finally conduct a comparative analysis with multiple state-of-the-art models, demonstrating the effectiveness of our proposed method in achieving performance comparable to or surpassing existing approaches on three unseen external datasets. 
\end{description}

\section{Related Work}
\label{sec: related work}
To the best of our knowledge, four distinct approaches for domain adaptation in the field of DR grading have been published to date. First, \citet{cycleGan} introduced the Residual-CycleGAN model for image-to-image translation. They defined camera labels for the images in the EyePACS dataset and used those cameras as domains. Adapted test images improved the performance of a classifier trained on a single camera, without retraining it on the target camera. However, this method requires retraining the CycleGAN for each additional domain. Additionally, they compared against a domain adversarial training strategy on camera labels, similar to what is used in our work, though without evaluating on external data or comparing to the state of the art (SOTA). The main idea presented by \citet{drgen} is to use domain gradient variance information as a regularization technique. In addition, during training of their proposed DRGen model, they search for flat minima in the validation loss. For this purpose, their approach combines the Fishr method and stochastic weight averaging densely (SWAD). \citet{galappaththige2024generalizing} propose SPSD-ViT, which combines self-distillation with a prediction softening mechanism in vision transformers. Furthermore, their study involved retraining existing methodologies on a range of datasets, thereby facilitating a comprehensive comparative analysis of various SOTA model results. Finally, the multi-model learning approach presented by \citet{zhang2022multi} treats each sample as a composite derived from multiple source domains. They combine models trained on different source domains, determining the model weight based on the Euclidean distance between the source and target models' features. Pseudo-labels for target images are obtained through feature-level clustering. They achieve over 90\% accuracy on unseen data, though they transformed the five stages of DR into a simpler binary classification problem. 
Our work aims to expand this limited field of research, and wherever possible, benchmark against the existing methods.

\section{RADR}
In this work, we present RADR, a domain adversarial-based framework for automated diabetic retinopathy severity classification aiming for robust performance on unseen datasets. RADR is derived from the publicly available EyePACS dataset that we preprocess, curate for image quality and enrich with camera information. Using the camera labels as domains, we train RADR in an adversarial fashion to extract domain-agnostic features for its severity prediction. We finally compare our framework on three unseen commonly used DR datasets to the SOTA and evaluate our training regime in an ablative fashion. Figure~\ref{figure: proposed_method} depicts the full pipeline. In the following, our approach is described in more detail.
\subsection{Data}
\label{subsec: dataset}
We train our model on the EyePACS dataset~\cite{EyePACS}, a publicly available collection of 88,702 color fundus eye images. These images are classified into five classes, corresponding to the level of DR severity. We apply multiple steps of preprocessing and data curation to this dataset. Firstly, roughly 25\% of the images in the EyePACS collection are considered to be ungradable due to poor quality, artifacts, excessively bright or dark images, or out-of-focus images~\cite{Chetoui2020}. Therefore, these poor-quality images were eliminated according to the three-level quality labels provided by \citet{Fu2019}, removing the \textit{`reject'} category. Additionally, overly dark images were removed by thresholding on the average pixel value of images converted to grayscale. All images are cropped and resized into squares of size $512\times512$ pixels, centered around the retina, to remove noise and redundant image areas. Furthermore, the images of the EyePACS dataset were acquired using different camera brands. To utilize this inherent known heterogeneity in our model, we use the camera labels provided by \citet{cycleGan} to create a separate subdataset for each camera A, B, C, D and E. For aggregated RGB histograms per subdataset, refer to the Appendix. It should be noted that \citet{cycleGan} reported differences between the provided labels and those employed in their original approach. After applying curation and preprocessing, in total 62,467 images remain of the EyePACS dataset. We split every camera subdataset by 70/15/15\% into train, validation and test sets, stratified by the severity label to ensure equal distribution. In addition to the five camera datasets (A--E), three of the most popular publicly available DR grading datasets are used as external datasets to evaluate the performance on unseen data and distributions. Those datasets are Messidor 1, Messidor 2 and APTOS~\cite{messidor, aptos}. 
This will also allow us to compare the results obtained in this paper with those obtained by the SOTA models presented by \citet{drgen} and \citet{galappaththige2024generalizing}. 
\begin{figure}[ht]
  \centering
\includegraphics[width=1\linewidth]{imgs/RADR.png}
\vspace{-2em}
  \caption[Proposed method model architecture.] {Methodology of the proposed RADR framework for DR severity classification. The EyePACS dataset is enriched by camera labels, curated for image quality and preprocessed. Subdatasets A, B and C are used for end-to-end model training. Images are augmented by geometric transformations, as well as optional ColorJitter or AugMix transformations, before being fed through a ResNet50-based encoder. Resulting latent features are forwarded to both the DR staging head and the domain classifier. The GRL inverts the sign of the gradient of the domain classification loss $L_{DC}$ in the encoder, promoting domain-invariant feature representations.}
  \label{figure: proposed_method}
\vspace{-2em}
\end{figure}

\subsection{Model Architecture}
In our pursuit to build a robust DR severity classification model, we first derive a single-camera (\textbf{SC}) baseline model comprised of a ResNet-50-based feature extractor~\cite{resnet50} and a fully connected DR classifier with 5 output nodes. The SC model is trained only on camera A (source) and tested on each of the remaining subdatasets individually. In addition, geometric transformations are applied when training the SC model to increase data variability. These correspond to rotation, vertical and horizontal flipping and cropping. In the next step, we perform multi-camera (\textbf{MC}) training on the subdatasets from cameras A, B and C. This is expected to increase the data variability that the model sees during training and therefore to increase performance on the remaining unseen datasets from cameras D and E. In the last step, inspired by \citet{ganin2016domain}, we add a domain adversarial (\textbf{DA}) head to our network, composed of a Gradient Reversal Layer (GRL) and the domain classifier. This represents our final, most advanced model RADR. The domain classifier comprises two identical blocks, each with a linear layer, ReLU activation and dropout with a probability of 0.5, followed by a final fully connected layer with three output nodes for domain classification. During backpropagation, the GRL inverts the sign of the gradient flowing from the domain head into the feature encoder (FE). Through parallel training of both classification heads, weights of the FE are adapted to produce domain-agnostic features that are still predictive for the main task of severity classification. Both the DR classifier and the domain classifier are optimized using the AdamW optimizer and cross-entropy loss. 
By utilizing both the severity labels and the domain information of all three training subdatasets in parallel during training, our framework diverges slightly from the original domain-adversarial neural network (DANN) implementation of \citet{ganin2016domain}, which only used the labels of a single domain for training the main task. Additionally, a hyperparameter $\beta$ was introduced to modulate the domain classification loss's impact. Therefore, the total loss $L$ of the model can be calculated as $L = L_{DRC} + \beta \cdot L_{DC}$ where $L_{DRC}$ corresponds to the loss computed by the DR classifier and $L_{DC}$ to the loss computed by the domain classifier. $\beta$ was empirically set to 0.3. Higher values tended towards degrading main task performance, while at lower values features could still be differentiated by the domain discriminator. We further wanted to analyze the influence of random data augmentations on the robustness of our models. For this, in addition to the default geometric transforms, we test the application of two different augmentation policies to the three presented levels of complexity of our pipeline, SC, MC, and DA training. The first policy employs color transformations, which randomly adjust the brightness, contrast, and saturation parameters of the images using ColorJitter of PyTorch's torchvision transforms~\cite{paszkePyTorchImperativeStyle2019}. As our second policy, we evaluate AugMix from~\citet{hendrycks2019augmix}, which is specifically designed to increase model robustness. In detail, AugMix creates multiple copies of an image, applies a unique data augmentation chain to each and then linearly combines them using random weights. For exact settings used in this work, refer to the Appendix. Finally, to evaluate the results obtained in classifying DR images, both quantitative and qualitative metrics are used. 
In this work, accuracy (ACC) and quadratic weighted kappa (QWK) are used as quantitative measurements, aligning with standard metrics in the literature to facilitate comparative analysis of the outcomes. We further utilize uniform manifold approximation and projection (UMAP)~\cite{mcinnes2018umap} plots to visualize the latent representations before the final classification layer of all internal and external test data for qualitative analysis of the model. 
\section{Experiments}
\subsection{Quantitative Results: Internal Datasets}
\begin{table}[ht]
\caption[Ablation]{Performance in terms of quadratic weighted kappa (QWK) (mean ± standard deviation) of our models on the test sets of the camera domains in the EyePACS dataset, trained with five different random seeds. SC: Single-camera training on camera A, MC: Multi-camera training on cameras A, B and C, DA: Domain adversarial training on cameras A, B and C. Best performing model in bold, second best underlined.}
\renewcommand\arraystretch{1.25}
\hspace{10mm}
\resizebox{12.5cm}{!} {
\begin{tabular}{ccccccc}
\hline
\multicolumn{1}{l}{\textbf{QWK [\%]}} & Camera A                      & Camera B          & Camera C                               & Camera D          & Camera E                               & Avg           \\ \hline
\multicolumn{1}{c|}{SC}          & \multicolumn{1}{c|}{76.3±0.9} & 80.4±0.1          & 64.6±2.3                               & 49.2±4.3          & \multicolumn{1}{c|}{72.4±2.1}          & 68.6          \\
\multicolumn{1}{c|}{SC ColorAug} & \multicolumn{1}{c|}{73.0±1.3} & 78.0±1.9          & 64.4±3.2                               & 40.3±7.7          & \multicolumn{1}{c|}{64.3±5.6}          & 64.0            \\
\multicolumn{1}{c|}{SC AugMix}   & \multicolumn{1}{c|}{74.1±0.8} & 75.8±1.4          & 57.0±4.4                               & 34.6±2.4          & \multicolumn{1}{c|}{58.4±2.7}          & 60.0          \\ \hline
\multicolumn{1}{c|}{MC}          & {\ul 77.4±1.8}                & 82.6±1.1          & \multicolumn{1}{c|}{{\ul 77.8±2.6}}    & 58.9±6.2          & \multicolumn{1}{c|}{{\ul 74.1±4.2}}    & 74.2          \\
\multicolumn{1}{c|}{MC ColorAug} & 71.6±2.5                      & 79.2±2.6          & \multicolumn{1}{c|}{71.0±6.3}          & 52.2±9.9          & \multicolumn{1}{c|}{67.8±1.0}          & 68.4          \\
\multicolumn{1}{c|}{MC AugMix}   & 76.3±0.9                      & \textbf{85.3±1.6} & \multicolumn{1}{c|}{77.6±1.1}          & \textbf{68.6±6.0} & \multicolumn{1}{c|}{72.2±1.5}          & {\ul 76.0}      \\ \hline
\multicolumn{1}{c|}{DA (RADR)}   & \textbf{78.1±1.8}             & {\ul 84.4±0.6}    & \multicolumn{1}{c|}{\textbf{78.7±1.0}} & {\ul 67.0±4.2}    & \multicolumn{1}{c|}{72.6±1.6}          & \textbf{76.2} \\
\multicolumn{1}{c|}{DA ColorAug} & 73.1±3.3                      & 80.9±2.3          & \multicolumn{1}{c|}{74.7±3.9}          & 48.8±1.6          & \multicolumn{1}{c|}{\textbf{74.5±6.1}} & 70.4          \\
\multicolumn{1}{c|}{DA AugMix}   & 75.5±0.5                      & 84.1±0.5          & \multicolumn{1}{c|}{76.1±1.5}          & 61.9±1.7          & \multicolumn{1}{c|}{71.4±3.1}          & 73.8          \\ \hline
\end{tabular}}
\label{table: adversarial}
\end{table}
\noindent
Table~\ref{table: adversarial} depicts the performance in terms of QWK of all training regimes and data augmentation policies on the EyePACS camera subdatasets. For the corresponding ACC results, please refer to the Appendix. All variations of the model were trained end-to-end using the AdamW optimizer with hyperparameters tuned individually for best QWK on the validation splits of the EyePACS camera subdatasets. When comparing the three training regimes with only their default geometric augmentation and no added color augmentation, a consistent increase in performance on both seen and unseen datasets can be observed when using the MC approach over the SC approach. Average performance is further enhanced from 74.2\% to 76.2\% by applying domain adversarial (DA) training. On camera E, a slight drop is visible, though on camera D a major improvement of 8.1 percentage points could be achieved, emphasizing the robustness-conferring influence of the adversarial training on unseen domains. Remarkably, DA training also increased QWK on the cameras A, B \& C seen during training, hinting at the possibility that the multi-task approach promoted extraction of more predictive features in general. When applying color augmentations based on ColorJitter, results deteriorate for all training regimes and cameras, except for camera E under the DA regime. Here, in contrast, camera D performance dropped by 18.2 percentage points. Overall, this indicates that the ColorJitter augmentations evaluated by us contribute negatively to overall performance and robustness. Finally, the application of the AugMix policy decreased average performance under the SC and DA training regimes, though increasing average performance under the MC training regime by 1.8 percentage points, achieving comparable performance to our proposed main model RADR. 
A potential explanation for the negative influence of the augmentations under the DA training regime is that it opposes the domain discrimination task by blurring the differences between camera domains, limiting the potential benefit of the domain adversarial training. However, further research is required to verify this.

\subsection{Quantitative Results: External Datasets and SOTA}
We evaluate our top performing model RADR, which employs domain adversarial training without added color transformations, as well as our second best performing approach using multi-camera (MC) training with AugMix augmentations on three unseen public DR datasets, Messidor 1, Messidor 2 and APTOS (Table \ref{table: external}). Here, we also analyze classification accuracy besides QWK to enable a comparison to existing SOTA methods. Of our models, RADR achieves the highest average QWK and accuracy with 76.7\% and 65.7\%, respectively, though MC training with AugMix performs slightly better on the APTOS dataset. Notably, the highly similar scores of RADR on the QWK metric indicate consistent and robust performance across all unseen datasets. For benchmarking our method against SOTA, we compare against models from~\citet{drgen} and~\citet{galappaththige2024generalizing}. We differentiate between approaches only using EyePACS as their single source (SS) training dataset and those utilizing a multi-source (MS) leave-one-out training regime by training on the remaining three datasets when predicting on one out of EyePACS, Messidor 1, Messidor 2 and APTOS.
\begin{table}[H]
\caption[SOTA]{Performance (mean ± standard deviation) of our top-performing models, MC AugMix and RADR, on the external datasets, trained with five different random seeds. SS: Single-Source training on EyePACS. MS: Multi-Source training in leave-one-out fashion on EyePACS, Messidor 1 \& 2, as well as APTOS, with prediction on the remaining dataset. Best performing model in bold, second best underlined.}
\centering	
\renewcommand\arraystretch{1.25}
\hspace{-2mm}
\resizebox{14.5cm}{!} {
\begin{tabular}{ccccl}
\hline
\multicolumn{1}{l}{\textbf{QWK [\%]}}     & Messidor 1           & Messidor 2           & APTOS                & Avg                               \\ 
\hline
\textbf{RADR} (Ours)                      & \textbf{77.7±1.9}    & \textbf{75.2±2.7}    & 77.2±2.1             &  76.7                                 \\ 
MC AugMix (Ours)              & 76.1±1.3             & 71.4±7.0             & \textbf{81.9±1.8}    &   76.5                                \\
\hline

\multicolumn{1}{l}{\textbf{ACC [\%]}}     & Messidor 1           & Messidor 2           & APTOS                & Avg                               \\ \hline
SS: \textbf{RADR} (Ours)                      & {\ul 65.3±1.3}       & {\ul 71.6±2.2}       & 60.2±2.9             & \multicolumn{1}{c}{{\ul 65.7}}    \\
SS: MC AugMix (Ours)              & 62.8±2.0             & 69.8±4.4             & 62.6±1.4             & \multicolumn{1}{c}{65.1}          \\
SS: SPSD-ViT \citep{galappaththige2024generalizing}               & 50.5±0.8             & 62.2±0.4             & \textbf{75.1±0.5}    & \multicolumn{1}{c}{62.5}          \\ 
SS: DRGen (trained by \citet{galappaththige2024generalizing})         & 54.6±1.5             & 65.4±0.1             & 61.3±1.9             & \multicolumn{1}{c}{60.4}          \\
MS: SPSD-ViT \citep{galappaththige2024generalizing}               & 64.8±0.5             & \textbf{72.4±0.6}    & 51.7±1.2             & \multicolumn{1}{c}{62.9}          \\
MS: DANN (trained by \citet{galappaththige2024generalizing})         & 57.0±1.1             & 58.6±1.7            & 54.4±0.8             & \multicolumn{1}{c}{56.7}          \\
MS: DRGen (trained by \citet{galappaththige2024generalizing})    & 59.1±1.8              & 65.2±0.6          & 51.2±2.1     & \multicolumn{1}{c}{58.5}          \\
MS: DRGen \citep{drgen}                  & \textbf{66.7}        & 70.5                 & {\ul 70.3}           & \multicolumn{1}{c}{\textbf{69.1}} \\
\hline
\end{tabular}}
\label{table: external}
\end{table}
\noindent
Here, the original DRGen method from~\citet{drgen} achieved the best average performance with 69.1\%. Our proposed RADR model scored second best with 65.7\%. It should be noted, though, that this comparison favors the DRGen model. By employing a leave-one-out training and evaluation regime on the four datasets EyePACS, Messidor 1\&2 and APTOS, they not only utilized significantly more training data than we did, but the reported accuracies per unseen dataset also stem from different versions of their model, while our results are all from the same version. When aiming for generalization and robustness, reporting results from the same model across all unseen datasets should be preferred. \citet{galappaththige2024generalizing} reproduced the DRGen method under the MS training regime, but only achieved an average accuracy of 58.5\%. Interestingly, they also evaluated a domain-adversarial network (DANN), similar to our approach, under the MS training regime. This achieved an average accuracy of 56.7\%, lagging behind our method by 9 percentage points, even though they utilized more training data and multiple instances of their model. This hints at the superiority of utilizing the camera labels from~\citet{cycleGan} as domain indicator for adversarial training, as in our proposed method, over defining every dataset as an individual domain. Finally, fair comparisons can only be drawn when comparing methods under the same SS training regime, only training on the EyePACS dataset and predicting on all others. Here, RADR outperformed the SPSD-ViT from~\citet{galappaththige2024generalizing} by 3.2 percentage points, as well as an SS re-implementation of the DRGen model by 5.3 percentage points. We conclude that our proposed framework is able to strongly compete with SOTA models, even when using less training data, or even surpass them under equal conditions.
\begin{figure}[ht]
    \centering
\includegraphics[width=0.8\linewidth]{imgs/RADR_umaps.png}
\vspace{-1em}
\caption[UMAP representations]{UMAPs for the camera subdatasets of EyePACS and external datasets for our three training regimes: Single Camera (a), Multi Camera (b) and RADR (c).}
\label{figure: umaps}
\vspace{-1em}
\end{figure}
\subsection{Qualitative Results: UMAP Representations}
Figure~\ref{figure: umaps} depicts UMAP visualizations of the latent representations in the output of the feature extractor of all samples across all datasets used in this work. For the SC (a) and MC (b) training regimes, a high separability of the domain clusters of the camera subdatasets can be observed, especially for cameras A, C \& D. This shows that for these approaches a lot of domain-specific information is still contained after the FE, which is undesirable when aiming for robustness. When analyzing the external datasets, we observe that data from the same origin, Messidor 1 \& 2, forms a mixed cluster, while APTOS is separate. The UMAP of the DA training regime of RADR (c) reveals a more entangled latent space, with the different domains blending into each other. This is especially the case for the EyePACS camera subdomains and the Messidor data, though APTOS still exhibits a high separability from the remaining data. Overall, the visualization of the latent spaces emphasizes the successful push towards domain-invariant feature representations and robustness of our method.

\section{Conclusion}
This paper presented RADR, a deep-learning framework for DR severity classification, which combines several ideas to perform extensive dataset curation, preprocessing, and enrichment with fundus camera information with a domain adversarial training regime. We explored our framework in its various levels of complexity in combination with multiple data augmentation policies in an ablative fashion, showing best performance when only using geometric transforms during training. Our model achieved performance competitive with or higher than multiple SOTA models on three unseen external datasets, even when using less training data. We link this mostly to the reduction of noise in the dataset by the extensive preprocessing and filtering we conduct, which has, to the best of our knowledge, never been done before by similar approaches aiming for robustness in DR classification. We hypothesize that further improvements could be achieved by retraining our model on all five camera domains instead of only three, as well as by employing more advanced data augmentations, specifically aimed to function in unison with a domain adversarial training regime.



\clearpage  % Acknowledgements, references, and appendix do not count toward the page limit (if any)
% Acknowledgments---Will not appear in anonymized version
\midlacknowledgments{We thank Patrick Fuhlert, MSc, and Nico Kaiser, MSc, for their helpful comments.}


\bibliography{midl24_079}

\newpage

\appendix

\section{Histograms of EyePACS Camera Datasets}
\label{sec:histos}

\setcounter{figure}{0}
\renewcommand{\thefigure}{\Alph{figure}}
\begin{figure}[H]
    \centering
    \includegraphics[width=1\linewidth]{imgs/histograms.png}
    \caption{RGB histograms for aggregated pixels of all images of the EyePACS dataset, separated by camera label. To filter out the spikes of the black pixels at the edges, all pixel values $<5$ were removed.}
    \label{fig:histos}
\end{figure}

\newpage
\section{Model Parameters}
\label{sec:hparams}

Training of the SD, MD and DA models was performed for a maximum of 100 epochs with a batch size of 16. We used the AdamW optimizer with a weight decay of 0.0005 and an initial learning rate of 1e-4. A ReduceLROnPlateau scheduler was used with a reduction factor of 0.2 and a patience of 5 epochs. Early stopping was applied with a patience of 10 epochs. The dropout probability was set to 0.5. When adding the domain classifier, different learning rates were explored; however, it was found that using the same starting learning rate of 1e-4 led to the best results. Furthermore, we also explored different values for $\beta$ between 0.1 and 1, with a final value of 0.3. Figure~\ref{figure: domain classifier} depicts the architecture of the fully connected domain classifier. For the AugMix augmentation policy, both the severity of base augmentation operators and the number of augmentation chains were set to 3. The stochastic depth of augmentation chains was set to $-1$ and alpha to 0.1. The ColorAug policy was implemented using ColorJitter as provided by the Torchvision Python package. The adaptation ranges for brightness, contrast, and saturation were set to $[-0.3, 0.3]$.
\noindent
In total, the ResNet50-based feature extractor contains 23.5M trainable parameters, the DR severity classifier 10.2k parameters and the domain classifier 3.1M parameters.  

\begin{figure}[H]
    \centering
\includegraphics[width=0.8\linewidth]{imgs/Domain_Classifier.jpg}
\caption[Domain Classifier]{Domain Classifier architecture.}
\label{figure: domain classifier}
\end{figure}

\newpage
\section{Extended Quantitative Results: Internal Datasets}
\label{sec:acc}

\setcounter{table}{0}
\renewcommand{\thetable}{\Alph{table}}
\begin{table}[H]
\caption[Ablation]{Performance in terms of accuracy (ACC) (mean ± standard deviation) of our models on the test sets of the camera domains in the EyePACS dataset, trained with five different random seeds. SC: Single-camera training on camera A, MC: Multi-camera training on cameras A, B and C, DA: Domain adversarial training on cameras A, B and C. All values are percentages. Best performing model in bold, second best underlined.}
\renewcommand\arraystretch{1.25}
\hspace{10mm}
\resizebox{12.5cm}{!} {
\begin{tabular}{c|ccccc|c}
\hline
\multicolumn{1}{l}{\textbf{ACC [\%]}} & \multicolumn{1}{l}{Camera A} & \multicolumn{1}{l}{Camera B} & \multicolumn{1}{l}{Camera C}          & \multicolumn{1}{l}{Camera D} & \multicolumn{1}{l}{Camera E} & \multicolumn{1}{l}{Avg} \\ \hline
SC                                & \multicolumn{1}{c|}{75.2±1.1} & 80.8±1.1                     & 77.7±2.1                               & 67.2±4.5                     & 81.7±0.8                      & 76.5                    \\
SC ColorAug                       & \multicolumn{1}{c|}{78.7±1.1} & 81.1±1.4                     & 78.4±1.9                               & 63.3±7.3                     & 80.0±3.3                      & 76.3                    \\
SC AugMix                         & \multicolumn{1}{c|}{78.9±0.6} & 78.3±1.2                     & 72.3±2.0                               & 53.9±3.4                     & 74.9±1.7                      & 71.7                    \\ \hline
MC                                & 79.2±4.1                      & 81.5±2.0                     & \multicolumn{1}{c|}{83.3±3.2}          & 67.8±2.7                     & 82.5±2.8                      & 78.8                    \\
MC ColorAug                       & 73.0±7.3                      & 77.2±3.5                     & \multicolumn{1}{c|}{75.2±1.5}          & 59.1±1.6                     & 76.4±5.7                      & 72.2                    \\
MC AugMix                         & \textbf{82.1±1.7}             & \textbf{85.3±1.1}            & \multicolumn{1}{c|}{{\ul 85.1±1.1}}    & \textbf{79.7±4.6}            & {\ul 83.1±1.1}                & \textbf{83.0}           \\ \hline
DA (RADR)                         & {\ul 81.5±2.1}                & 84.1±1.7                     & \multicolumn{1}{c|}{\textbf{85.4±0.8}} & {\ul 79.1±5.0}               & \textbf{83.6±1.4}             & {\ul 82.7}              \\
DA ColorAug                       & 74.5±8.0                      & 80.2±4.4                     & \multicolumn{1}{c|}{80.7±5.5}          & 60.5±1.3                     & 78.1±6.1                      & 74.8                    \\
DA AugMix                         & 81.2±0.4                      & {\ul 84.7±0.3}               & \multicolumn{1}{c|}{84.1±0.8}          & 75.4±1.6                     & 82.7±1.3                      & 81.6                    \\ \hline
\end{tabular}}
\label{accuracy}
\end{table}


\end{document}
