\documentclass{midl} % Include author names
%\documentclass[anon]{midl} % Anonymized submission

% The following packages will be automatically loaded:
% jmlr, amsmath, amssymb, natbib, graphicx, url, algorithm2e
% ifoddpage, relsize and probably more
% make sure they are installed with your latex distribution

\usepackage{mwe} % to get dummy images
\usepackage{amsmath,amsfonts,amssymb}
\usepackage{graphicx}
\usepackage{hhline}
\usepackage{algpseudocode}
\usepackage{multirow}
\usepackage{boldline}
\usepackage{rotating}
\usepackage{hhline}
\usepackage{pifont}

%\usepackage[colorlinks=true, allcolors=blue]{hyperref}

%\usepackage{booktabs}
%\usepackage{multirow}
\usepackage{caption}
%\usepackage{subcaption}
%\usepackage{subfigure}


\jmlryear{2024}
\jmlrworkshop{Full Paper -- MIDL 2024}
\jmlrvolume{-- 175}
\editors{Accepted for publication at MIDL 2024}


\title[Domain generalization in OD/OC segmentation with Noisy Students]{Semi-supervised learning with Noisy~Students improves domain generalization in optic disc and cup segmentation in uncropped fundus images}



\midlauthor{\Name{Eugenia Moris} \Email{emoris@pladema.exa.unicen.edu.ar}\\
            \Name{Ignacio Larrabide} \Email{larrabide@exa.unicen.edu.ar}\\ 
            \Name{Jos\'e Ignacio Orlando} \Email{jiorlando@pladema.exa.unicen.edu.ar}\\
  \addr UNCPBA, CONICET, Yatiris Group, Instituto Pladema, Tandil, Buenos Aires, Argentina.}




\newcommand{\emm}[1]{{\textcolor{red}{#1}}}
\newcommand{\suu}{$\mathcal{S}\cup\mathcal{U}$ }
\newcommand{\U}{$\mathcal{U}$ }
\newcommand{\modelSUU}{$f_{\theta_{\mathcal{U} \cup \mathcal{S}}}$ }
\newcommand{\modelU}{$f_{\theta_{\mathcal{U}}}$ }


\begin{document}

\maketitle

\begin{abstract}
Automated optic disc (OD) and cup (OC) segmentation in fundus images has been widely explored for computer-aided diagnosis of glaucoma. However, existing models usually suffer from drops in performance when applied to images that differ significantly from those used for training.
Several domain generalization strategies have been introduced to mitigate this issue, although they are trained and evaluated using images manually cropped around the optic nerve head. This operation eliminates most sources of domain variation, therefore overestimating their actual ability to cope with new, unseen patterns. 
In this paper, we analyze the most recent and accurate methods for domain generalization in OD/OC segmentation by applying them to uncropped fundus pictures, observing notable degradations in their performance when trained and evaluated under this setting.
To overcome their drawbacks, we also introduce a simple semi-supervised learning approach for domain generalization based on the Noisy Student framework.
Using a Teacher model trained on a combination of domains, we pseudo-labeled a dataset of 18,000 originally unlabeled images that are then used for training a Student model.
This semi-supervised setting allowed the Student network to capture additional sources of variability while retaining the original cues and patterns used by the Teacher through the weak annotations.
Our results on eight different public datasets show improvements in every unseen domain over all alternative methods, and are available in \url{https://github.com/eugeniaMoris/Noisy_student_ODOC_MIDL_2024}.
\end{abstract}

\begin{keywords}
Domain Generalization, Semi-supervised learning, Segmentation
\end{keywords}

\section{Introduction}
\label{sec:introduction}


Segmenting the optic disc (OD) and cup (OC) in fundus images is a common practice for detecting and characterizing glaucoma, one of the leading causes of irreversible blindness worldwide~\cite{veena2020glaucoma}. A significant effort has been made to automate this task, resulting in models with excellent performance in known databases \cite{alawad2022machine_review,moris2023assessing,wang_BEAL}. Nevertheless, their accuracy is frequently affected when applied on images from domains unseen during training, hampering their clinical application \cite{nan2022data}. This is inherent to the natural diversity in the appearance of fundus images, e.g. due to the overall quality of the scan, variations in the acquisition protocol or device, the intrinsic retinal pigmentation associated to patient ethnicity, or the presence of lesions that were not considered in the training sets \cite{yoon2023domain_generalization_review}. 

Typically, practitioners aim to overcome this limitation with increased data augmentation \cite{lyu_AADG}, although modelling every possible scan (and disease) appearance with image transformations is unfeasible. Alternatively, several studies introduced novel domain generalization techniques \cite{yoon2023domain_generalization_review} that aim to improve OD/OC segmentation in unseen domains without requiring domain-specific information or adaptation, e.g. through domain alignment \cite{chen_SFDA-DPL,liu2021feddg,wang2020dofe,chen2022tvconv,hu2022domain,zhou2022ram_dsir}, meta-learning \cite{hu2023map}, and augmentation techniques \cite{lyu_AADG,yang2021hcdg,kang2022invariant,gu2023cddsa}.
While these have reported remarkable improvements when applied on unseen domains, we noticed that, in all cases, they are trained and validated using manual crops around the optic nerve head (ONH). This decision drastically eliminates most sources of image variability, implicitly overestimating their ability to cope with alterations outside this area. Furthermore, they also require users to perform the crop themselves when deployed in real clinical settings, hampering their automation and applicability for processing large databases.

In this paper we perform an in-depth evaluation of the most accurate existing models for domain generalization in OD/OC segmentation, by training and applying them on uncropped fundus images.
In line with our hypothesis, we observe notable degradations in their results when compared with their reported numbers. 
Furthermore, we show that a semi-supervised learning strategy based on the Noisy Student \cite{xie2020noisyStudent} is already able to overcome this limitation. 
Semi-supervised learning is an active area of research for medical image segmentation \cite{jiao2023semisupervised_review}, while weak supervision has shown promising results for OD segmentation e.g. through classification labels or bounding boxes in a multitask setting \cite{yin2023dual} and for source-free domain adaptation \cite{huai2023context}. 
In our case, we leverage a Teacher model trained on diverse domains to pseudo-label a dataset comprising 18,000 initially unlabeled images. This massive set is then used for training a Student model, which effectively assimilates new sources of variability while preserving the intrinsic cues and patterns imparted by the Teacher via weak annotations. Our evaluation across six unseen domains reveals consistent performance enhancements, surpassing the alternative methodologies. Furthermore, we also observe a statistical preservation of performance on unseen scans belonging to the three domains used for training.




\section{Methods}

\subsection{Domain generalization with Noisy Students in uncropped images}

Let $\mathcal{S} = \{(x_i, y_i)\}_{i=1}^{n}$ represent a dataset with $n$ pairs of uncropped fundus images $x_i \in \mathcal{X}$ and their corresponding multiclass OD/OC annotations $y_i \in \mathcal{Y}$. In a standard supervised learning setting, this set is used to train a multiclass segmentation model $f_{\theta_{\mathcal{S}}} : \mathcal{X} \rightarrow \mathcal{Y}$, with $f$ denoting the neural network architecture and $\theta_{\mathcal{S}}$ the learned parameters. When applying the model $f_{\theta_{\mathcal{S}}}$ on a new unseen target domain $\mathcal{D}_T$, it is desirable for $f_{\theta_{\mathcal{S}}}$ to retain its original performance. To this end, networks are usually trained on datasets as big and diverse as possible, e.g. by crafting $\mathcal{S}$ using subsets of images $\mathcal{X}$ sampled from multiple source domains $\mathcal{D}^{(j)}_S$. Notice that this requires scaling manual annotation for every new input sample, which is expensive and time consuming for segmentation tasks. As an alternative, one can stimulate the model to learn other alternative appearances through heavy data augmentation, although modelling every possible target scenario through image transformations is unfeasible. At the same time, it is likely that the model learns to cope with artificially produced artifacts, as there are no guarantees about the actual resemblance of these altered images to those expected from any unseen target domain $\mathcal{D}_T$. 

Our hypothesis is that leveraging samples from a large enough unlabelled dataset $\mathcal{U}$ during training might mitigate and reduce the covariate shift between training and test domains, improving domain generalization with no extra manual annotation effort. Furthermore, feeding with uncropped fundus images should aid the network to capture sources of variation complementary to those around the ONH. To accomplish this, we propose to follow a Noisy Student approach \cite{xie2020noisyStudent}, originally introduced for image classification. In this case, our goal is to learn a Student network for OD/OC segmentation in color fundus pictures using pseudo-labelled uncropped samples from $\mathcal{U}$. Figure~\ref{fig:schematic} (a) depicts our methodology. First, we train a supervised Teacher model $f_{\theta_{\mathcal{S}}}$ using the labelled dataset $\mathcal{S}$, with uncropped images from a (combination of) source domain(s) $\mathcal{D}_S$. Afterwards, this network is applied on an unlabeled dataset $\mathcal{U} = \{ x_i \}_{i=1}^{m} $, with $n \ll m$, to generate weak, pseudo-labels $\hat{y}_i$ of the OD and the OC. 
The resulting pairs $(x_i, \hat{y}_i)$ can then be used to train a Student model, either individually (resulting in $f_{\theta_{\mathcal{U}}}$, Figure~\ref{fig:schematic} (a)), or in combination with $\mathcal{S}$ (resulting in $f_{\theta_{\mathcal{U} \cup \mathcal{S}}}$, Figure~\ref{fig:schematic} (b); see Section~\ref{subsec:methods-joint}). Both the Teacher and the Student are based on the exact same architecture $f$, and the Student is trained with stronger data augmentation than the Teacher, as suggested by \citet{xie2020noisyStudent}. This hardens the task solved by the Student model, forcing it to learn new patterns to obtain similar segmentations to those obtained by the Teacher, but in other different, more difficult images. This combination of new pseudo-labeled samples and data augmentation, and the fact that the Student is trained from scratch on this set, has been reported to improve results on out-of-distribution samples within seen test sets  \cite{xie2020noisyStudent}. Alternatively, we propose to apply this approach to improve results in unseen target domains $\mathcal{D}^{(j)}_T$.


\begin{figure}[t!]
    \centering
    \includegraphics[width=.97\linewidth]{Images/Fig1.png}
    % Figura esquemática representando el enfoque que proponemos
    \caption{Schematic representation of our method. In a Noisy Student iteration, a Teacher $f_{\theta_{\mathcal{S}}}$ trained on a labeled set $\mathcal{S}$ is used to pseudo-annotate an unlabelled dataset $\mathcal{U}$, which is leveraged for training a Student, either (a) individually, or (b) jointly with $\mathcal{S}$.
    (c) The Student $f_{\theta_{\mathcal{U}}}$ could then be used as Teacher in a sequential manner by fine-tuning it on $\mathcal{S}$.}% a Teacher model, trained in a labelled set $\mathcal{S}$, is used to pseudo-label an unknown, unlabelled dataset $\mathcal{U}$. This dataset is then utilized to train the Student model. Following this, the Student model undergoes a fine-tuning process using the labeled dataset $\mathcal{S}$. The resultant fine-tuned Student model assumes the role of the new Teacher, thereby establishing a cyclic process that can be iterated as required.}
    \label{fig:schematic}
\end{figure}


\subsection{Joint training vs. iterative fine-tuning and training}
\label{subsec:methods-joint}

A Student $f_{\theta_\mathcal{U}}$ trained with pseudo-labels benefits from an implicit transfer of knowledge from the Teacher through weak targets. In practical terms, this should manifest in the Student having similar performance in the source domains $\mathcal{D}_S$. Nevertheless, recall that the Student is trained from scratch on $\mathcal{U}$, meaning that there was no direct access to images from $\mathcal{S}$. Therefore, the model might behave in domains $\mathcal{D}_S$ as if they were target domains, experiencing a certain drop in performance. To alleviate this issue, one option is to include $\mathcal{S}$ in the training set of the Student, resulting in a model $f_{\theta_{\mathcal{U} \cup \mathcal{S}}}$ that had access to both sets simultaneously (Figure~\ref{fig:schematic} (b)). Alternatively, we can think of an iterative process as the one in Figure~\ref{fig:schematic} (c), in which the Student $f_{\theta_{\mathcal{U}}}$ is fine-tuned in $\mathcal{S}$ (resulting in $f_{\theta_{\mathcal{S}}}^{(2)}$) to become a Teacher, and then applied on $\mathcal{U}$ to produce new pseudo-labels to create a new $f_{\theta_{\mathcal{U}}}^{(2)}$ as in \cite{hao2022selfiter1,guan2023iterative}. This process can be repeated $k$ times until reaching convergence, e.g. by evaluating performance increments on a held-out set.


\section{Experimental setup}


\subsection{Implementation details}
%
%We evaluated improvements in domain generalization using the standard U-Net architecture applied in \cite{moris2023assessing} as backbone for both Teachers and Students. The network has 31.384k parameters. Images were resized to a resolution of $256 \times 256$ pixels before feeding them to the network during training. In test time, the output score maps were re-scaled to the original image resolution using bilinear interpolation, to avoid biasing the evaluation metrics. Data augmentation was introduced during training. ...... Teacher augmentation / Student augmentation. Training details.

Code was implemented using PyTorch Lightning (v. 1.5.10), and all experiments were conducted using NVIDIA RTX 3060 GPUs with 12GB. Both Teacher and Student used the U-Net described  in~\cite{moris2023assessing} (with 31 million parameters) as backbone. Training was performed minimizing a multiclass cross-entropy loss with Adam optimization. Learning rates were experimentally adjusted to each run, based on validation performance. Batch sizes of 20 images were used in all cases. For $f_{\theta_{\mathcal{S}\cup\mathcal{U}}}$, each batch included 18 samples from $\mathcal{U}$ and 2 from $\mathcal{S}$, to ensure the model had access to both domains on each iteration. To differentiate errors on each domain, we used a convex sum of losses with a $\lambda$ coefficient. 

Data augmentation was used following a custom adaptation of RandAugment~\cite{cubuk2020randaugment}, using vertical and horizontal flipping, Gaussian blur, rotation, rescaling, and color jittering as transformations, parameterized with a probability $p$ of applying a transformation and a strength factor $s$ to increase or reduce their limits (e.g. angles in rotations, size of resizing and blurring filters, etc.). To avoid overfitting the Teacher, we also applied data augmentation for training it, using $p=0.1$ and $s=0.1$. The Student, on the other hand, was trained using $p=0.5$ and $s=0.5$. In all cases, images were resized to $256 \times 256$ pixels before feeding the network. At test time, output segmentations were rescaled to the original image resolution using nearest neighbor interpolation for metric computation.% \emm{We also analyze what happen when the $\mathcal{S}$ dataset us changes with other datasets.}

%HYPERPARAMETROS
%For all the training models we used an Adam optimizer. The learning rate (lr) was determined individually based on the performance of a validation set. The Teacher model uses a lr of 0.01. The Student ($\mathcal{U}$) employs a lr of 0.0001, and the Student ($\mathcal{S}\cup\mathcal{U}$) uses a lr of 0.001.
%
%A batch size of 20 is used in all the models. In the Student 
%($\mathcal{S}\cup\mathcal{U}$), the batch consist of 18 samples from the $\mathcal{U}$ domain and 2 samples of the $\mathcal{S}$ domain. To improve the model learning in both domains.
%
%We utilize cross-entropy loss as a loss function. In Student($\mathcal{S}\cup\mathcal{U}$) the loss strongly penalizes segmentation errors on the $\mathcal{S}$ samples are committed. This Loss is a combination of the loss function of the $\mathcal{U}$ domain and the $\mathcal{S}$ domain loss with a $\lambda$ of 0.7.
%
%We utilized various data augmentation techniques to enhance the performance of our model. These methods include vertical and horizontal flipping, Gaussian blur, rotation, rescaling, and color jittering, which modify the hue, saturation, brightness, and contrast of the images. The augmentation is applied to the Teacher model with a probability of 10\%, whereas for the Student models, it occurs with a probability of 50\%. Additionally, for the Student models, the modifications made by each augmentation technique are more significant.
%

%


%In our previous article \cite{moris2023assessing}, we presented the SS-UNET, a U-NET architecture that serves as the backbone of our methodology. However, we made a minor adjustment to the architecture by using inputs of 256x256 instead of 512x512 due to the large amount of new data utilized in this study. We employed the same architecture for both the Teacher and the Student models, with the only difference being the augmentation applied to them. We trained the Teacher model with a smooth augmentation for training or fine-tuning, while we applied stronger augmentation to train the Student model, following the concept of a noisy Teacher-Student \cite{xie2020noisyStudent}.

%It is essential to note that the focus of our study is on the Teacher-Student methodology. Although we used a U-Net architecture as the backbone, it is crucial to recognize that this selection was made as a simple memory-saving architecture and is not obligatory. The architecture can be replaced with any alternative structure as required by the research.

%The Baseline model, also referred to as the Teacher model, is trained with $\mathcal{S}$, denoted as $(f_{\theta_s}(X_s) \xrightarrow{} Y_s')$, as commonly presented in the literature. A prevalent issue in the literature is that baseline models, trained solely with a limited number of labeled samples, exhibit constraints on unseen domains.
%
%While increasing the number of samples may enhance generalization, acquiring a large dataset with corresponding labels is challenging, especially in domains like healthcare, where experts are needed for accurate segmentation. This necessitates diverting experts from their primary tasks to engage in the time-consuming process of manual segmentation. In response to this challenge, we introduce $\mathcal{U}$.
%
%The Baseline model generates weak labels for $\mathcal{U}$, denoted as $f_{\theta_s}(X_u \xrightarrow{} Y_u')$. Subsequently, the Student model is trained using the pseudo-labeled $\mathcal{U}$ ($\min \mathcal{L}(f_{\theta_u}(X_u),Y_u')$), leveraging the advantage of a large sample size but without reliable knowledge. To impart the necessary reliability, the Student model is fine-tuned with $\mathcal{S}$ ($\min \mathcal{L}(f_{\theta_u}(X_s),Y_s)$). This refined model then assumes the role of the new Teacher model, enabling the iteration of this methodology for $k$ cycles.

%Our objective is to enhance domain generalization, where $Dice(F_{\theta_u}(\mathcal{Z})) >> Dice(F_{\theta_s}(\mathcal{Z}))$, with $\mathcal{Z}$ representing an unseen labeled dataset, and $Dice$ defined in Equation \ref{eq: Dice}. Additionally, we aim for $HD(F_{\theta_u}(\mathcal{Z})) << HD(F_{\theta_s}(\mathcal{Z}))$, with $HD$ as the Hausdorff Distance, as indicated in Equation \ref{eq: HD}.



\subsection{Materials and evaluation metrics}

%We built the supervised training set $\mathcal{S}$ using 90\% of the images and OD/OC masks from the training sets of DRISHTI \cite{sivaswamy2014drishti} (51 images) and REFUGE \cite{orlando2020refuge} (1200 images) and from RIGA~\cite{almazroa2018retinal} which provided 749 samples without any predefined division between training and testing. Because of that, we use all the samples for training, resulting in a total size of 2050 scans. In case there are no validation samples pre-defined. 10\% of the training samples are separated to use as validation.
A summary of the datasets used for training, validation and test is provided in the appendix. We built the supervised training set $\mathcal{S}$ with images from DRISHTI~\cite{sivaswamy2014drishti}, REFUGE~\cite{orlando2020refuge} and RIGA~\cite{almazroa2018retinal}. In particular, we took the entire REFUGE training set (400 images), and 90\% of the DRISHTI training set (45 images) and RIGA (675 images). The remaining 10\% of RIGA (74) and DRISHTI (5) were combined with the offsite set of REFUGE (400 images) to build a validation set.
%
For $\mathcal{U}$, we randomly sampled 18,000 scans from the AIROGS~\cite{de2023airogs} training set. To calibrate model hyperparameters and monitor performance during training, 10\% of these images were separated as a validation set. Test partitions from DRISHTI (50 images) and REFUGE (400 images) were used for testing in known domains. As unknown domains covering variations in acquisition machine, ethnicity, lesions, and field of view (FOV), we used all scans from RIM-ONE V3 \cite{fumero2011rim} (159 images, Spanish sample, cropped around the ONH), ORIGA \cite{zhang2010origa} (650 images from Malay adults), and the three subsets from CHAKSU \cite{kumar2023chakṣu} (namely BOSCH--41 images--, FORUS--31 images--, and REMIDIO--264 images--, all from an Indian cohort and taken with 3 different devices). We also used images from PALM \cite{fang2023palm} (400 scans from an Asian population) to evaluate performance for OD segmentation in pathological myopia.%as it does not include 

We used the Dice Similarity Coefficient (DSC) and Hausdorff distance (HD) as evaluation metrics, to account both for the overlap with the ground truth labels and for boundary consistency, respectively \cite{maier2022metrics}. Statistical significance of the differences in metrics was assessed using one-tailed Wilcoxon signed-rank tests with $\alpha=0.05$.


%To further evaluate the shape consistency of the OD/OC outputs, we computed the vertical cup to disc ratio (vCDR) from both manual and predicted annotations, and compared their predictive power for discriminating glaucomatous cases from healthy individuals. To this end, we used the diagnostic labels provided in DRISHTI, REFUGE, RIM-ONE v3, ORIGA, and CHAKSU REMIDIO, and performed the evaluation using the area under the ROC curve (AUC).


%To build $\mathcal{S}$, we used images from DRISHTI \cite{sivaswamy2014drishti} which contains 51 images for training and 50 for testing; REFUGE \cite{orlando2020refuge} which contains a total of 1200 images for training, validation, and testing, with 400 images in each category; and RIGA \cite{almazroa2018retinal} which provided 749 samples without any predefined division between training and testing. Because of that, we divided the samples into 90\% for training and 10\% for testing.
%Rationale.
%For the $\mathcal{U}$ we use the publicly available AIROGS \cite{de2023airogs} given the high variability and extended number of samples, but without OD/OC masks. We use the initial 18000 samples, which represent an 878\% increase compare to $\mathcal{S}$
%For the extensive unlabeled dataset, we used the publicly available AIROGS dataset \cite{de2023airogs}, encompassing 100,000 images from 60,000 patients across 500 diverse screening centers in the United States. In the context of this study, only the initial 18,000 samples were utilized, representing an 878\% increase in size compared to the multi-dataset

%To evaluate our model on unseen datasets, we collected several publicly available datasets with segmentation from various machines and regions. These datasets include RIM-ONE V3 \cite{fumero2011rim} which has 159 images, cropped on the OD area, from the San Carlos Clinical Hospital, Spain; ORIGA \cite{zhang2010origa} which consist of 650 images of Singapore Malay adults; CHAKSU \cite{kumar2023chakṣu}, which contains three sub-sets from different machines (CHAKSU BOSCH -41 images-, CHAKSU FORUS -31 images-, and CHAKSU REMIDIO -264 images-) representing the Indian region; and PALM \cite{fang2023palm}, a dataset of 400 images that has been use in the literature for diabetic retinopathy model training. This provides us with a diverse range of fundus images, each with their own inherent challenges.

\begin{table}[]
%\caption{We compared the Dice scores of our proposed models with those of recent articles that used domain adaptation and domain generalization methods for optic disc (OD) and optic cup (OC) segmentation. The best results for each dataset are highlighted in bold. The * symbol represents a significant improvement compared to the baseline.}
\centering
\caption{OC (top) and OD (bottom) segmentation results in uncropped images from unseen domains. The two best models are highlighted in bold and underlined italics, respectively. 
Statistically significant improvements of Students $f_{\theta_\mathcal{U}}$ and $f_{\theta_{\mathcal{U} \cup \mathcal{S}}}$ are indicated with * and +, respectively.}
    \label{tab:OC}
\resizebox{0.87\textwidth}{!}{
\begin{tabular}{c|c|ccccc}
\hline%\cline{2-10}
 %\multirow{ 2}{*}{\textbf{Method}} & \multicolumn{5}{c|}{\textbf{Optic disc (Dice \%)}} & \multicolumn{5}{c}{\textbf{Optic cup (Dice \%)}}                                                                                                                                   \\ \cline{2-11}
OC &\textbf{Method}  
& \multicolumn{1}{c|}{\textbf{RIMONE}}  
& \multicolumn{1}{c|}{\textbf{BOSCH}}                 
& \multicolumn{1}{c|}{\textbf{FORUS}}                  
& \multicolumn{1}{c|}{\textbf{REMIDIO}}                
& \textbf{ORIGA}  \\ \hline%               

  \multirow{ 5}{*}{\rotatebox{90}{\textbf{$\uparrow$ DSC (\%) }}} 
  &\citet{wang2020dofe}       
  &  \multicolumn{1}{c|}{64.32 $\pm$ 30.71} 
  & \multicolumn{1}{c|}{62.07 $\pm$ 25.06*$^+$}           
  & \multicolumn{1}{c|}{82.94 $\pm$ 17.41*}            
  & \multicolumn{1}{c|}{61.07 $\pm$ 25.49*$^+$}             
  & 71.90 $\pm$ 22.52*$^+$            \\
\cline{2-7} %\hline

&\citet{chen2022tvconv}                  
& \multicolumn{1}{c|}{65.89 $\pm$ 29.48} 
& \multicolumn{1}{c|}{66.22 $\pm$ 20.74*$^+$} 
& \multicolumn{1}{c|}{84.95 $\pm$ 9.99*}            
& \multicolumn{1}{c|}{63.20 $\pm$ 23.83*$^+$}             
& 72.74 $\pm$ 21.55*$^+$            \\ 
\cline{2-7}
&\citet{zhou2022ram_dsir}                  
& \multicolumn{1}{c|}{64.89 $\pm$ 20.99*$^+$} 
& \multicolumn{1}{c|}{82.05 $\pm$ 4.08*$^+$}
& \multicolumn{1}{c|}{\textit{\underline{85.90 $\pm$ 3.72*}}}            
& \multicolumn{1}{c|}{80.58 $\pm$ 6.14*$^+$}             
& 76.48 $\pm$ 12.02*$^+$          \\ 
\cline{2-7}

 & Teacher $f_{\theta_\mathcal{S}}$             
 & \multicolumn{1}{c|}{54.38 $\pm$ 22.87*$^+$} 
 & \multicolumn{1}{c|}{\textbf{87.71 $\pm$ 5.08}}  
 & \multicolumn{1}{c|}{83.64 $\pm$ 8.54*}            
 & \multicolumn{1}{c|}{78.54 $\pm$ 18.43*$^+$}             
 & \textit{\underline{82.18 $\pm$ 14.52$^+$}}   \\ \cline{2-7}\cline{2-7}\cline{2-7}\cline{2-7}
 
 & \textbf{Ours} ($f_{\theta_\mathcal{U}}$)      
 & \multicolumn{1}{c|}{\textit{\underline{68.39 $\pm$ 19.32}}$^+$} 
 & \multicolumn{1}{c|}{85.49 $\pm$ 5.44}           
 & \multicolumn{1}{c|}{\textbf{88.85 $\pm$ 5.17}*}  
 & \multicolumn{1}{c|}{\textbf{86.25 $\pm$ 6.92}}   
 & 81.08 $\pm$ 11.07 $^+$            \\ \cline{2-7} 
 
 & \textbf{Ours} ($f_{\theta_{\mathcal{U} \cup \mathcal{S}}}$)       
 & \multicolumn{1}{c|}{\textbf{70.37 $\pm$ 18.23}}  
 & \multicolumn{1}{c|}{\underline{\textit{85.52 $\pm$ 5.94}}}           
 & \multicolumn{1}{c|}{84.34 $\pm$ 7.31*}  
 & \multicolumn{1}{c|}{\textit{\underline{85.83 $\pm$ 8.87}}}   
 &   \textbf{83.08 $\pm$ 14.52 }        \\ \hline\hline
%%%%%%%%%%%%%%%%%%%%%HD


  \multirow{ 6}{*}{\rotatebox{90}{ \textbf{$\downarrow$ HD}}}  & \textbf{Method}
  &\multicolumn{1}{c|}{\textbf{RIMONE}} 
  & \multicolumn{1}{c|}{\textbf{BOSCH}}                 
  & \multicolumn{1}{c|}{\textbf{FORUS}}                  
  & \multicolumn{1}{c|}{\textbf{REMIDIO}}                 
  & \textbf{ORIGA}                  \\ \cline{2-7}

  &\citet{wang2020dofe}       
  & \multicolumn{1}{c|}{\underline{\textit{44.78 $\pm$ 28.72}}}  
  & \multicolumn{1}{c|}{32.52 $\pm$ 24.27*$^+$}           
  & \multicolumn{1}{c|}{26.12 $\pm$ 16.07}            
  & \multicolumn{1}{c|}{64.92 $\pm$ 27.50*$^+$}             
  & \multicolumn{1}{c}{\underline{\textit{46.34 $\pm$ 23.93*$^+$}}}   \\ \cline{2-7}
  
  & \citet{chen2022tvconv} 
  & \multicolumn{1}{c|}{\textbf{42.67 $\pm$ 30.58}}           
  & \multicolumn{1}{c|}{23.99 $\pm$ 7.92*$^+$}            
  & \multicolumn{1}{c|}{24.46 $\pm$ 10.54}             
  & \multicolumn{1}{c|}{58.82 $\pm$ 22.28*$^+$}            
  & \multicolumn{1}{c}{47.41 $\pm$ 24.33*$^+$}\\ \cline{2-7}

  &\citet{zhou2022ram_dsir}                  
  & \multicolumn{1}{c|}{64.11 $\pm$ 34.26*$^+$} 
  & \multicolumn{1}{c|}{15.67 $\pm$ 5.65} 
  &  \multicolumn{1}{c|}{\underline{\textit{22.78 $\pm$ 8.03}}}            
  & \multicolumn{1}{c|}{38.95 $\pm$ 14.55*$^+$}             
  & 64.75 $\pm$ 154.56*$^+$ \\ \cline{2-7}\cline{2-7}

  &Teacher $f_{\theta_\mathcal{S}}$ 
  & \multicolumn{1}{c|}{77.97 $\pm$ 40.12*$^+$} 
  & \multicolumn{1}{c|}{\underline{\textit{14.83 $\pm$ 4.67}}}           
  & \multicolumn{1}{c|}{31.21 $\pm$ 12.21*}            
  & \multicolumn{1}{c|}{90.10 $\pm$ 140.43*$^+$}             
  & \multicolumn{1}{c}{49.28 $\pm$ 110.03$^+$} \\ \cline{2-7}\cline{2-7}\cline{2-7}\cline{2-7}\cline{2-7}

  &\textbf{Ours} ($f_{\theta_\mathcal{U}}$) 
  & \multicolumn{1}{c|}{47.14 $\pm$ 24.95}  
  & \multicolumn{1}{c|}{\textbf{14.78 $\pm$ 4.48}}          
  & \multicolumn{1}{c|}{\textbf{22.14 $\pm$ 7.47}}  
  & \multicolumn{1}{c|}{\textbf{34.00 $\pm$ 14.42}}   
  & \textbf{36.08 $\pm$ 18.24}  \\ \cline{2-7}

  & \textbf{Ours} ($f_{\theta_{\mathcal{U} \cup \mathcal{S}}}$) 
  & \multicolumn{1}{c|}{45.49 $\pm$ 24.84} 
  & \multicolumn{1}{c|}{17.51 $\pm$ 5.44*}          
  & \multicolumn{1}{c|}{29.22 $\pm$ 11.15*}  
  & \multicolumn{1}{c|}{\underline{\textit{34.51 $\pm$ 16.78}}}   
  & 50.66 $\pm$ 126.01 \\  \hline \multicolumn{7}{c}{}\\ 
\end{tabular}}
%\end{table}
%Tabla comparativa con el estado del arte
%Agregamos una comparacion con el estado del arte utilizando dice en vez de HD a su vez se agrega el std. We compare the generalization capability.


%\begin{table}[]
%\caption{OD segmentation results in uncropped images from unseen domains. The two best models are highlighted in bolds and underlined italics, respectively. 
%Statistically significant improvements of Students $f_{\theta_\mathcal{U}}$ and $f_{\theta_{\mathcal{U} \cup \mathcal{S}}}$ are indicated with * and +, respectively.}
%    \label{tab:OD}
\resizebox{\textwidth}{!}{
\begin{tabular}{c|c|cccccc}
\hline%\cline{2-10}
 %\multirow{ 2}{*}{\textbf{Method}} & \multicolumn{5}{c|}{\textbf{Optic disc (Dice \%)}} & \multicolumn{5}{c}{\textbf{Optic cup (Dice \%)}}                                                                                                                                   \\ \cline{2-11}
  OD &\textbf{Method}  
  & \multicolumn{1}{c|}{\textbf{RIMONE}}  
  & \multicolumn{1}{c|}{\textbf{BOSCH}}                 
  & \multicolumn{1}{c|}{\textbf{FORUS}}                  
  & \multicolumn{1}{c|}{\textbf{REMIDIO}}                 
  & \multicolumn{1}{c|}{\textbf{ORIGA}} 
  & \textbf{PALM}  \\ \hline%               

 \multirow{ 5}{*}{\rotatebox{90}{\textbf{$\uparrow$ DSC (\%) }}} 
 &\citet{wang2020dofe}       
 & \multicolumn{1}{c|}{$78.64 \pm 29.03$}  
 & \multicolumn{1}{c|}{$94.15 \pm 2.30$*$^{+}$}           
 & \multicolumn{1}{c|}{$92.76 \pm 3.49$*$^+$}           
 & \multicolumn{1}{c|}{$89.12 \pm 16.49$*$^+$}             
 & \multicolumn{1}{c|}{$90.19 \pm 13.98$}  
 & \underline{\textit{74.71 $\pm$ 36.56}}    \\
\cline{2-8} %\hline

&\citet{chen2022tvconv}                 
& \multicolumn{1}{c|}{$82.43 \pm 21.54$}           
& \multicolumn{1}{c|}{$95.78 \pm 1.57$*}            
& \multicolumn{1}{c|}{$94.46 \pm 2.16$*$^+$}             
& \multicolumn{1}{c|}{$91.58 \pm 12.23$*$^+$}            
& \multicolumn{1}{c|}{$89.64 \pm 14.52$} 
&    \textbf{78.77 $\pm$ 31.77}   \\ 
\cline{2-8}
&\citet{zhou2022ram_dsir}                  
& \multicolumn{1}{c|}{$86.65 \pm 10.14$} 
& \multicolumn{1}{c|}{$92.22 \pm 7.60$*$^+$}
& \multicolumn{1}{c|}{$93.20 \pm 2.02$*$^+$}            
& \multicolumn{1}{c|}{$90.20 \pm 9.64$*$^+$}             
& \multicolumn{1}{c|}{$90.18 \pm 9.97$} 
& $73.89 \pm 31.09$         \\ 
\cline{2-8}

 
 & Teacher $f_{\theta_\mathcal{S}}$  
 &\multicolumn{1}{c|}{$81.59 \pm 20.03^+$} 
 & \multicolumn{1}{c|}{$95.93 \pm 1.54$*}           
 & \multicolumn{1}{c|}{$95.94 \pm 2.53$*$^+$}            
 & \multicolumn{1}{c|}{$90.65 \pm 10.39$*$^+$}             
 & \multicolumn{1}{c|}{$89.98 \pm 11.55$} 
 & $68.62 \pm 34.69$   \\ \cline{2-8}\cline{2-8}\cline{2-8}\cline{2-8}\cline{2-8}\cline{2-8}
 
 & \textbf{Ours} ($f_{\theta_\mathcal{U}}$)      
 & \multicolumn{1}{c|}{\textit{\underline{86.88 $\pm$ 7.72$^+$}}}  
 & \multicolumn{1}{c|}{\textbf{96.37 $\pm$ 1.15}}          
 & \multicolumn{1}{c|}{\textbf{97.36 $\pm$ 0.67}}  
 & \multicolumn{1}{c|}{\textbf{96.74 $\pm$ 1.61}}   
 & \multicolumn{1}{c|}{\underline{\textit{91.17 $\pm$ 4.56}}} 
 & $68.03 \pm 35.37$            \\ \cline{2-8} 
 
 & \textbf{Ours} (\modelSUU)      
 & \multicolumn{1}{c|}{\textbf{87.11 $\pm$ 7.04}} 
 & \multicolumn{1}{c|}{\underline{\textit{96.19 $\pm$ 1.05}}*}          
 & \multicolumn{1}{c|}{\underline{\textit{96.81 $\pm$ 1.40}}*}  
 & \multicolumn{1}{c|}{\underline{\textit{95.13 $\pm$ 3.12}}*}   
 & \multicolumn{1}{c|}{\textbf{91.27 $\pm$ 4.16}} 
 & $60.96 \pm 40.47$*         \\  \hline\hline


%%%%%%%%%%%%%%%%%%%%%%%%%%%5HD
  \multirow{ 6}{*}{\rotatebox{90}{\textbf{$\downarrow$ HD}}}  & \textbf{Method}
  & \multicolumn{1}{c|}{\textbf{RIMONE}} 
  & \multicolumn{1}{c|}{\textbf{BOSCH}}                 
  & \multicolumn{1}{c|}{\textbf{FORUS}}                  
  & \multicolumn{1}{c|}{\textbf{REMIDIO}}                 
  & \multicolumn{1}{c|}{\textbf{ORIGA}}  
  & \textbf{PALM}                  \\ \cline{2-8}

  &\citet{wang2020dofe}       
  & \multicolumn{1}{c|}{44.69 $\pm$ 37.83}  
  & \multicolumn{1}{c|}{12.59 $\pm$ 4.19*}           
  & \multicolumn{1}{c|}{23.00 $\pm$ 10.97*$^+$}            
  & \multicolumn{1}{c|}{40.58 $\pm$ 52.77*$^+$}             
  & \multicolumn{1}{c|}{38.82 $\pm$ 68.25} 
  & \textbf{59.72 $\pm$ 129.64}   \\ \cline{2-8}
  
  & \citet{chen2022tvconv} 
  & \multicolumn{1}{c|}{45.12 $\pm$ 34.56*}           
  & \multicolumn{1}{c|}{\textbf{10.37 $\pm$ 2.0}}            
  & \multicolumn{1}{c|}{20.48 $\pm$ 7.37*$^+$}             
  & \multicolumn{1}{c|}{36.77 $\pm$ 31.49*$^+$}            
  & \multicolumn{1}{c|}{42.41 $\pm$ 60.69} 
  & \underline{\textit{64.28 $\pm$ 121.41}} \\ \cline{2-8}

  &\citet{zhou2022ram_dsir}                  
  & \multicolumn{1}{c|}{50.77 $\pm$ 43.56*$^+$} 
  & \multicolumn{1}{c|}{22.94 $\pm$ 84.53}
  & \multicolumn{1}{c|}{14.44 $\pm$ 4.10*$^+$}            
  & \multicolumn{1}{c|}{50.62 $\pm$ 198.88*}             
  & \multicolumn{1}{c|}{54.84 $\pm$ 88.22*$^+$} 
  & 133.58 $\pm$ 209.77   \\ \cline{2-8}

  &Teacher $f_{\theta_\mathcal{S}}$ 
  & \multicolumn{1}{c|}{57.43 $\pm$ 78.14*$^+$} 
  & \multicolumn{1}{c|}{11.43 $\pm$ 3.43}           
  & \multicolumn{1}{c|}{16.57 $\pm$ 7.64*$^+$ }            
  & \multicolumn{1}{c|}{71.17 $\pm$ 112.90*$^+$}             
  & \multicolumn{1}{c|}{46.99 $\pm$ 80.98$^+$} 
  & 160.44 $\pm$ 240.26 \\
  \cline{2-8}\cline{2-8}\cline{2-8}\cline{2-8}\cline{2-8}\cline{2-8}

  &\textbf{Ours} ($f_{\theta_\mathcal{U}}$) 
  & \multicolumn{1}{c|}{\textbf{35.31 $\pm$ 18.22}}  
  & \multicolumn{1}{c|}{\underline{\textit{11.38 $\pm$ 2.95}}}          
  & \multicolumn{1}{c|}{\textbf{11.96 $\pm$ 3.74}}  
  & \multicolumn{1}{c|}{\textbf{21.40 $\pm$ 10.55}}  
  & \multicolumn{1}{c|}{\underline{\textit{37.63 $\pm$ 20.64}}$^+$} 
  & 165.22 $\pm$  252.30  \\  \cline{2-8}
  
  & \textbf{Ours} (\modelSUU) 
  & \multicolumn{1}{c|}{\underline{\textit{39.26 $\pm$ 21.58}}} 
  & \multicolumn{1}{c|}{12.12 $\pm$ 2.61}          
  & \multicolumn{1}{c|}{\underline{\textit{12.78 $\pm$ 3.01}}*}  
  & \multicolumn{1}{c|}{\underline{\textit{29.49 $\pm$ 20.59}}*}  
  & \multicolumn{1}{c|}{\textbf{35.80 $\pm$ 15.15}} 
  & 182.88 $\pm$ 255.38  \\  \hline
\end{tabular}}
\end{table}






\section{Results}
\label{sec:results}

Quantitative results for OC/OD segmentation in unseen domains are reported in Table~\ref{tab:OC}. We include results obtained with other domain generalization techniques that publicly released usable implementations for comparison. To ensure a fair evaluation, we re-trained their models with uncropped images, using our supervised set $\mathcal{S}$. Notice that none of them follows a semi-supervised learning approach. To our knowledge, there are no studies on domain generalization for OD/OC segmentation following this approach.
%
Our Student \modelU reported statistically significant improvements for OC segmentation with respect to the Teacher in terms of HD, for all the unseen domains, except BOSCH and ORIGA. This also holds for DSC values obtained in RIMONE, FORUS and REMIDIO. A similar behavior is observed for OD segmentation, where this Student reported improvements over the Teacher in all sets except PALM evaluated in terms of DSC, and slightly higher HD values in BOSCH, with no statistically significant differences.
%
On the other hand, the Student \modelSUU was able to statistically improve the Teacher's DSC values for OC segmentation for all datasets except BOSCH and FORUS. When evaluated in terms of average HD, this Student also achieved significantly better OC segmentations in RIMONE, REMIDIO, and ORIGA, but less accurate results in BOSCH and FORUS. For OD segmentation, the Student \modelSUU showed a different behavior, reporting better DSC and HD values than the Teacher in all datasets except PALM, where average DSC and HD are lower but statistically comparable.

When comparing the Students with one another, we observe that \modelSUU performs statistically significantly better than \modelU in RIMONE and ORIGA when evaluated using DSC for OC segmentation, but comparable in BOSCH and statistically worse in FORUS and REMIDIO. When using the HD, on the other hand, \modelU performs statistically better in BOSCH, FORUS, and ORIGA, almost equivalently in REMIDIO, and slightly worse in RIMONE. Conversely, for OD segmentation we see \modelU reporting statistically better DSC values in FORUS, REMIDIO and PALM, and almost equivalent results in all other sets. In terms of HD, \modelU reported values slightly better than \modelSUU in all sets except for ORIGA, although the differences are only statistically significant in FORUS and REMIDIO.

Compared with other existing approaches, both Students reported better DSC values for OC/OD segmentation in all the datasets, except for OD results in PALM. In terms of HD, we observe improvements over the literature for OC segmentation in all datasets except for RIMONE, and for OD segmentation in RIMONE, FORUS, REMIDIO and ORIGA. In PALM, both Students reported statistically higher HD values than the three evaluated counterparts, while in BOSCH the differences are not significant. 

Figure~\ref{fig:quality} provides qualitative examples of OC/OD segmentation results obtained on images from different unseen domains. Examples were chosen to illustrate the behavior under changes in the acquisition device, ethnicity, lesions, and diversities of FOVs. As expected, the Teacher showed poor results in the unseen domains due to the intrinsic problem of domain generalization. Conversely, results obtained with the method by \citet{zhou2022ram_dsir} demonstrate shape consistency throughout datasets, except under changes in ethnicity or when extensive peripapillary atrophy is present. Notice also a complete misfit of the OD with respect to the OC detection in the last image. Alternatively, our Students present more accurate segmentation for most of the cases, except for segmenting OC/OD in images with peripapillary atrophy when incorporating $\mathcal{S}$.

Finally, we performed additional experiments focused on comparing differences in seen and unseen domains (Figure~\ref{fig:lineplot}), evaluating the effect of sequentially repeating our framework for another iteration ($k=2$, Tables~\ref{tab:OC_iter} and~\ref{tab:OD_iter}), and evaluating glaucoma detection results when using vertical cup-to-disc ratio estimates obtained from segmentations retrieved with the Teacher and the Student models (see Appendix C).

\begin{figure}[t]
    \centering
    \includegraphics[width=.8\linewidth]{Images/Fig2.png}
    %\includegraphics[width=\linewidth]{Images/cualitatito_v2.png}
    \caption{Qualitative results for OD (blue) and OC (green) segmentation in unseen domains, obtained by the Teacher, \citet{zhou2022ram_dsir} method, and our proposed Students \modelU and \modelSUU. Reference manual annotations and DSC values are included for comparison. Masks are zoomed for visualization purposes, although the input is in all cases an uncropped image.} 
    \label{fig:quality}
\end{figure}

\section{Discussion and Conclusions}

%\begin{figure}[t!]
%    \centering
%      \includegraphics[width=\textwidth]{Images/roc_v3.png}
    %\vspace{0.2cm}
%    \caption{The ROC curve shows how well each model classifies glaucoma using the segmentation of the OD and OC to calculate the vCDR. The results obtained from the ground truth (GT) mask are also included for comparison. Our Student model (O) classification results are highlighted in violet, while the Baseline models' (B) results are in red, and the GT segmentations are represented in gray. The Area Under the Curve (AUC) is calculated and displayed in the bottom-right corner of each curve, providing a quantitative measure of classification performance.
%    }
%    \label{fig:vcdr}
%\end{figure}


% Crítica a la literatura. Vemos que trabajar con imágenes recortadas nos da modelos que funcionan muy bien sobre imágenes recortadas, pero que si esos mismos modelos se entrenan sobre imágenes sin recortar, sufren una caída notoria de performance comparadas con el otro caso. Esto creemos que se debe a que de alguna forma continuan estando especializados en los dominios que vieron y que no son capaces de modelar todas las variaciones que ocurren en el full fundus. Por el contrairo, al incorporar un Noisy Student se observa una mejora notoria en los datasets unseen, incluso sin perjudicar los resultados en los seen, obteniendo valores similares. Estos resultados se mantienen en casi todos los casos, variando la etnia de los pacientes, el FoV, ....... Nótese que además esta estrategia es fácilmente combinable con cualquier otra, pudiendo eventualmente mejorarlas a ambas.

Domain generalization remains a challenge in OD/OC segmentation due to the significant variations observed between fundus pictures. While several approaches have been introduced to improve results under multiple imaging settings, we observed in our literature review that all of them are trained and evaluated on cropped images around the ONH \citep{wang2020dofe,chen2022tvconv,zhou2022ram_dsir,lyu_AADG,yang2021hcdg}. This drops out most sources of variability expected to occur in these images, reducing the space of patterns to model in training and overestimating their actual performance on full size images when testing. This can be observed in Table~\ref{tab:OC}, where state-of-the-art models re-trained on uncropped scans obtained lower values than those reported in the original papers \cite{wang2020dofe, chen2022tvconv, zhou2022ram_dsir}. Additionally, training with cropped images introduces the need for a manual crop at test time, or for training a separate model to automate it, e.g. in a coarse-to-fine manner \cite{moris2023assessing}. However, doing so was proven suboptimal for OD/OC segmentation if the coarse part lacks domain generalization capabilities \cite{moris2023assessing}. Thus, we are facing a causal loop paradox, where to achieve domain generalization we rely on a coarse detection model that needs to be able to generalize well to unseen images to ensure proper results.

%In the literature, we observe that all the authors decided to use cropped images to train the models. We decided to use a full image because cropping the input takes away a lot of variability information. Also, in a previous study \cite{moris2023assessing}, we notice that a model trained with un-cropped images obtains better results than a model trained with two steps. Sometimes, the problem comes from a lousy generalization of the automatic model that must crop the image on the OD area. Suppose it is decided not to use an automatic model for image cropping. In that case, the fundus images must be manually cropped in the disk area for subsequent segmentation, making it impractical for real-world application. With this in mind, we decided to test our model against methods found in the literature but trained with un-cropped images. They showed promising results, although they still needed to reach the one we proposed. We also notice that when tested over full fundus images, this model shows a problem with the OD segmentation, as shown in Figure \ref{fig: quality}. 

In this paper we show that even a simple solution based on Noisy Students can outperform existing approaches. By training Students using a massive set of images pseudo-labelled by a pre-trained Teacher model, we obtained networks that reported better results in unseen domains than those obtained with other more technically complex methods trained and evaluated in full size images (Table~\ref{tab:OC}). 
These counterparts only reported better DSC and HD values for OD segmentation in one dataset, PALM, which features images of patients with pathological myopia. When analyzing results qualitatively, we observe that this drop is caused mostly when peripapillary atrophies are present (Figure~\ref{fig:quality}), which significantly alter the appearance of the boundaries of the OD. Another interesting remark observed in Figure~\ref{fig:lineplot} is that following this approach does not degrade results in domains seen by the Teacher (or used also by the Student, if using \modelSUU), statistically retaining most of the original performance. We also empirically showed that this behavior holds when using our segmentations for glaucoma detection using vCDR estimates (Figure~\ref{fig:roc-label}), reaching results in line with or even superior to those obtained using manual segmentations. %\emm{We conducted a test on our model using a different dataset, namely CHAKSU as the $\mathcal{S}$. The results, as shown in Figure\ref{fig:chaksu-model}, demonstrate that even when CHAKSU is used, the Students dataset still outperforms the teacher's dataset.}

Our results using Students with and without access to $\mathcal{S}$ were inconclusive, as both seem to be accurate in specific cases, sometimes reporting statistically comparable results. Considering this scenario, one could potentially combine their results, e.g. in an ensemble setting, to take advantage of their individual results. 

This Noisy Student framework is general enough to be applied with any backbone neural network architecture and/or domain generalization technique. In our experiments, we used a standard U-Net due to computational limitations, but other architectures with much more capacity could be leveraged to capture additional patterns \citep{yi2023c2ftfnet_tranformer}. Furthermore, our process could be leveraged in the context of any of the alternative approaches evaluated, potentially boosting their performance in unseen domains with uncropped images. Finally, notice that this technique could be extrapolated to any other fundus image segmentation task that requires domain generalization, e.g. diabetic retinopathy lesion segmentation. Nevertheless, as with any semi-supervised learning strategy based on weak labels, it must be considered that a poor Teacher model can degrade performance due to confirmation bias \cite{kwon2022semi}. Discarding this approach in advance would require somehow approximating the performance in $\mathcal{U}$, which is challenging due to the intrinsic lack of labels on it. This particular limitation is an active research topic now, with many approaches being introduced e.g. to predict areas of error in the pseudo-labels or to rank labels based on the uncertainty of the model \cite{kwon2022semi,albert2023your,khan2024improving}. Future work will focus on analyzing the potential contribution of these approaches in domain generalization results.
 

%\emm{It has been observed in the literature that semi-supervised learning with weak labels may have certain limitations. Specifically, when there is a significant amount of noise in the label, it can have a negative impact on the model's training. In such cases, it becomes necessary to fine-tune (Teacher k=2) the model by training a student model with both labeled (S) and unlabeled (U) datasets. The use of labeled datasets can help prevent the model's segmentation from degrading. In addition, the study conducted by \citet{albert2023your} mentions other strategies for addressing noise pseudo-labels. We can analyze these strategies in a future study to improve our domain generalization results. }


%We observe similar results between the different student techniques. Each Student improved in different unseen domains. However, it is noticed that the Student \modelU got a better result on the HD metric, showing that the segmentation obtained by this model presents a structure more similar to the ground truth. These improvements can be seen in the segmentation of the lesion present on the PALM dataset (Figure \ref{fig: quality}) where the student \modelU improves significantly to the segmentation of the student \modelSUU. However, when we train a second iteration of the  Student \modelU, we observe that these models obtain similar or worse results than the Student \modelU, as we can observe in the Tables \ref{tab:OC_iter} and \ref{tab:OD_iter} in the appendix.




%
%We notice that the simple strategy of using the Noisy-Student methodology notoriously improves the results on unseen domains while maintaining the same performance on seen domains. These improvements are seen even with the change in the matching used for acquisition, the difference in the ethics, and even the change in the fov. These simple methods can be applied with any backbone architecture, and we can even merge this strategy with the ones present in the literature to improve both techniques in the future.


% ¿Qué vimos que pasa entre distintos de Students? Relacionarlo con las iteraciones. ¿Qué conviene más? ¿Entrenar con S tmb o hacer iteraciones? Resaltar lo de las lesiones de la papila en PALM.



% Conclusiones y trabajo futuro. Evaluación de este mismo enfoque para Domain Generalization en la detección de lesiones, que es más challenging porque las variaciones ocurren siempre en diversas regiones de las imágenes.
%This article explores the Noisy-Student methodology for enhancing OD and OC segmentation generalization. Given the significant number of samples that contribute to improving model generalization, we observe a substantial impact when employing a large dataset for model training. Even when the mask is generated using a relatively simple prior model, the influence of a high-volume dataset with pseudo-labels is evident, as highlighted in Tables \ref{tab:OC} and \ref{tab:OD} were both structures improved on their segmentation and in almost all the cases the differences with the Teacher were significantly better. 
%
%We tested our models on six diverse and unseen domains and compared them to existing models in the literature. In all cases, the model shows significant improvement in the generalization over the OD and OC segmentation except when Atrophy lesions are present in the image. 
%
%In future work, we want to apply this same technique for the generalization of lesion detection, which is a more challenging problem due to variations in the lesion position.


%Moreover, the study evaluated the ability of glaucoma classification after the segmentation. This analysis offers valuable insights into the proposed methodology's effectiveness in challenging real-world scenarios, providing a better understanding of its performance in comparison to existing models in the literature.


%
%The Student \modelU shows a lower HD among all the models. Showing an accurate consistency in the shape of the segmentation concerning the other models. 

%Remarkably, fine-tuning with expert-labeled data does not consistently outperform the segmentation achieved by the Student model trained on a large dataset with pseudo-labels (More information on the appendix). This underscores the importance of leveraging a diverse and extensive dataset for training, emphasizing the potential of semi-supervised learning strategies to capitalize on the wealth of data available without accompanying masks. 
%
%The findings underscore the efficacy of utilizing a high-variability dataset to enhance model generalization for previously unseen datasets. This phenomenon was observed in the study conducted by \cite{kolesnikov2020big}, where three models were trained using varying sample sizes. The study found that the model trained with the largest dataset achieved the best results.
%
%It is crucial to highlight that the Teacher model generally performs better for datasets like DRISHTI, which is a known dataset. This implies that a model trained with the same dataset it is supposed to evaluate tends to exhibit better performance. However, such scenarios are rare in real-world applications (more information in the appendix).

%Our model performed better than the studies mentioned in the literature, and in most cases, the Students' models proved to be better than other models.
%

%Our models have shown poor results when tested on the PALM dataset. This could be because we need more cases of lesions on the AIROGS dataset to gain knowledge on Atrophy lessons. It appears that this is a general problem among all the models, as the DSC (dice similarity coefficient) does not exceed 78\%.




%However, in the case of OC segmentation, the Teacher model showed some improvement over the others. This could be because cup segmentation is inherently difficult, which is consistent with observations in the literature where the cup usually has lower Dice scores compared to the disc [CITAS]. The challenges in cup segmentation may affect the quality of pseudo-labels generated from the AIROGS dataset, which, in turn, may affect the model's training.

%The dependency of the second iteration of the Student depending Teacher 2 is an interesting observation. It's notable that, in general, the fine-tuned Teacher didn't consistently improve the Student model. However, in cases where the Baseline was improved by the fine-tuned Teacher, Student 2 demonstrated enhanced results compared to the initial Student. Conversely, when Teacher 2 did not improve upon the Baseline, Student 2 exhibited worsened results compared to the initial Student. This observation aligns with the role of the Teacher in generating weak labels to train the Student models. The interaction between the Teacher, Student, and fine-tuned Teacher underscores the nuanced dynamics involved in the training process and highlights the importance of understanding these relationships for effective model improvement.

%Using the noisy student methodology has proven to have a positive impact on model generalization in various scenarios. It significantly improves the performance of the model as compared to a common supervised trained model. With semi-supervised learning, we can leverage a large unlabeled dataset like AIROGS to train a more robust model that can generalize better. However, we have observed that the model still faces problems in detecting lesions in fundus images. This could be due to the low number of case lesion images available on the AIROGS dataset or a bad segmentation in the supervised model.



% Acknowledgments---Will not appear in anonymized version
\midlacknowledgments{This work was partially funded by Agencia I+D+i through PICT 2019-00070 and PICT startup 2021-00023, by CONICET through a PIP GI 2021-2023 - 11220200102472CO, and by an NVIDIA Hardware Grant.}


\bibliography{midl23_175}

\newpage
\appendix

\renewcommand\thefigure{\thesection.\arabic{figure}}
\renewcommand\thetable{\thesection.\arabic{table}}
\setcounter{figure}{0}  
\setcounter{table}{0}  


\begin{table}[t!]
\centering
\caption{Summary with the number of samples used for training, validation and testing, indicating which datasets corresponded to seen and unseen domains.}
\resizebox{.8\textwidth}{!}{
\begin{tabular}{c|c|c|c|c|c}
\hline
Dataset  & Train & Validation & Test & Seen Domain & Unseen Domain\\ \hline
DRISHTI \cite{sivaswamy2014drishti}    & 45    & 5          & 50 & $\checkmark$ & \ding{53} \\
REFUGE \cite{orlando2020refuge}        & 400   & 400        & 400 & $\checkmark$ & \ding{53} \\
RIGA \cite{almazroa2018retinal}        & 675   & 74         & -  &  $\checkmark$ & \ding{53} \\
RIMONEv3 \cite{fumero2011rim}           & -     & -          & 151  & \ding{53} & $\checkmark$\\
ORIGA \cite{zhang2010origa}            & -     & -          & 647  & \ding{53} & $\checkmark$\\
BOSCH \cite{kumar2023chakṣu}           & -     & -          & 41   & \ding{53} & $\checkmark$\\
FORUS \cite{kumar2023chakṣu}           & -     & -          & 31   & \ding{53} & $\checkmark$\\
REMIDIO \cite{kumar2023chakṣu}         & -     & -          & 264  & \ding{53} & $\checkmark$\\
AIROGS \cite{de2023airogs}             & 16200 & 1800       & -   & $\checkmark$ & \ding{53}\\ \hline
\end{tabular}}

\label{tab:dataset}
\end{table}


% \section{\emm{State of the art comparison}}
% \emm{Some of the studies of domain generalization on OD and OC segmentation are present in the literature. We compare our study with the following due
% to her public code in which we could re-train using uncropped images.}\\
% \emm{The study of \citet{wang2020dofe}, one of the first works on domain generalization on OD and OC segmentation. They present a Domain-oriented Feature embedding framework to improve the generalization ability of CNNs on unseen targets. The framework dynamically enriches the image feature with additional domain prior knowledge learned from the multi-source domain to make the semantic feature more discriminative. They introduce a domain knowledge pool to learn and memorize the prior information and the original image features are augmented with domain-oriented aggregated features, which are induced from the knowledge pool based on the similarity between the input image and multi-source domain images. }\\
% \emm{In the study of \citet{chen2022tvconv} propose a translation variant convolution (TVConv) for layout-aware visual processing, composed with affinity maps and a weight-generating block. While affinity maps depict pixel-paired relationships gracefully, the weight-generating block can be explicitly overparameterized for better training while maintaining efficient inference.} 
% \emm{\citet{zhou2022ram_dsir} designed a multi-tasking approach by combining the segmentation model with a self-supervision domain-specific image restoration (DSIR) module for model regularization. They also design a random amplitude mixup (RAM) module, which incorporates low-level frequency information of different domain images to synthesize new images. They also introduce a semantic consistency loss to make their model resistant to domain shift.}


\section{Evaluation in seen domains vs. unseen domains}

We performed an additional experiment comparing the performance in the seen domains obtained with the Teacher model and the Student approaches (see Table~\ref{tab:dataset} for better readability). Figure~\ref{fig:lineplot} depicts DSC and HD values for OD and OC segmentation in DRISHTI and REFUGE test sets, including also the unseen domains as a reference. Although differences are observed between the compared models, it is worth noting that the Students are statistically indistinguishable from the Teacher regardless of the metric and the specific task.

\begin{figure}[h]
    \centering
    
    \includegraphics[width=\linewidth]{Images/FigA1.png}
    %\includegraphics[width=\linewidth]{Images/Lineplot_OC_DICE.png}
    \caption{DSC and HD values obtained for OC (top) and OD (bottom) in seen and unseen domains. The average value achieved by the Teacher is indicated as a dotted line. * indicates statistically significant differences.}
    \label{fig:lineplot}
\end{figure}

% \begin{figure}[h]
%     \centering
%     \caption{DSC of the segmentation obtained by each model in each dataset concerning the ground truth. In pink, we mark the datasets that are included in the multi-dataset (used to train in the Teacher's model). In greem we mark the datasets that were not seen by any model. With a dotted line we mark the mean of the Teacher model, the one we want to improve. }
%     \includegraphics[width=\linewidth]{Images/barplor_DICE_2024_v2.png}
%     %\includegraphics[width=\linewidth]{Images/Lineplot_OC_DICE.png}
    
%     \label{fig: lineplot}
% \end{figure}

\section{Evaluation of iteratively repeating the Noisy Student framework}


We also performed an experiment evaluating the effect of sequentially repeating our framework for a second iteration ($k=2$). Results are reported in Tables \ref{tab:OC_iter} and \ref{tab:OD_iter}. 
%
For OC segmentation, we only observed improvements in the Teacher on FORUS and REMIDIO. Retraining the Students on these labels did not improve results in any of the evaluated datasets, except in terms of HD in FORUS using \modelU.
%
For OD segmentation, on the other hand, the fine-tuned Teacher improves results with respect to the first version in all cases except for RIMONE. The re-trained Student reported statistically comparable results with respect to its one-iteration counterpart, with only slight improvements or decreases in their metrics. %For the OD BOSCH segmentation, the models ($k=2$) got the biggest DSC. On the FORUS dataset the Student \modelU ($K=2$) got the biggest DSC and for the REMIDIO dataset got the second best results.
%
This analysis allows us to conclude that training for a second iteration is not worthwhile given that Students trained for just one iteration are more accurate.

\section{Evaluation of segmentation results for glaucoma assessment}

We extended the evaluation with an experiment comparing glaucoma detection performance using manual segmentations and results obtained with the Teacher and our Noisy Student models. To this end, we computed ROC curves in both seen and unseen domains, using the vertical cup-to-disc ratio (vCDR) \cite{orlando2020refuge} as a glaucoma score (Figure~\ref{fig:roc-label}). Notice that we did not include BOSCH and FORUS images in the evaluation as they only have one glaucomatous sample each. In line with our observations for overall segmentation accuracy, the Student models perform much better than the Teacher one in all unseen domains, with areas under the curve (AUC) that are comparable or even better than those obtained using ground truth segmentation. In seen domains like DRISHTI and REFUGE, on the other hand, the Teacher model performs better than the Student without strong supervision (\modelU). Nevertheless, mixing both manual and pseudo-labelled scans (model \modelSUU) reached classification results comparable to those obtained using manual segmentations.

\begin{figure}[t]
    \centering
    \includegraphics[width=\linewidth]{Images/FigC2.png}
    \caption{ROC curves for glaucoma classification using vertical cup-to-disc ratios (vCDRs) as glaucomatous scores, as derived from both ground truth (GT) segmentations and masks predicted with our Teacher (T) and Student (\modelU and \modelSUU) models, in both seen and unseen domains.}
    \label{fig:roc-label}
\end{figure}

\section{Swapping the training set to CHAKSU}

We also evaluated the performance of our Student models \modelU and \modelSUU when using the CHAKSU training sets as $\mathcal{S}$ for training a Teacher, and evaluating them on all other sets as unseen domains. Results for OC and OD segmentation evaluated using DSC are depicted in Figure~\ref{fig:chaksu-model}. The Student model \modelU improves results of the Teacher in almost all unseen datasets, except for OC segmentation in DRISHTI, although differences are not statistically significant. In seen domains, the model performs comparably to the Teacher, meaning that it is still able to retain its original performance. The Student \modelSUU, on the other hand, significantly improves results for OD segmentation in all datasets, both seen and unseen, and OC results in seen domains. In the unseen domains, however, OC segmentations in DRISHTI, REFUGE and ORIGA are worse than those obtained with the Teacher, experiencing a notable drop in REFUGE and ORIGA.

\begin{figure}
    \centering
    \includegraphics[width=.8\linewidth]{Images/FigD3.png}
    \caption{Average Dice (DSC) values obtained on unseen and seen domains when using the CHAKSU training set for training our models. * indicates statistically significant differences.}
    \label{fig:chaksu-model}
\end{figure}

\begin{table}[h]
%\caption{We compared the Dice scores of our proposed models with those of recent articles that used domain adaptation and domain generalization methods for optic disc (OD) and optic cup (OC) segmentation. The best results for each dataset are highlighted in bold. The * symbol represents a significant improvement compared to the baseline.}
\caption{OC segmentation results in uncropped images from unseen domains with $k=2$ iterations. The two best models are highlighted in bold and underlined italics, respectively. Statistically significant improvements of Students $f_{\theta_\mathcal{U}}$ and $f_{\theta_{\mathcal{U} \cup \mathcal{S}}}$ are indicated with * and +, respectively.}
    \label{tab:OC_iter}
\resizebox{\textwidth}{!}{
\begin{tabular}{c|c|ccccc}
\hline%\cline{2-10}
 %\multirow{ 2}{*}{\textbf{Method}} & \multicolumn{5}{c|}{\textbf{Optic disc (Dice \%)}} & \multicolumn{5}{c}{\textbf{Optic cup (Dice \%)}}                                                                                                                                   \\ \cline{2-11}
OC &\textbf{Method}  & \multicolumn{1}{c|}{\textbf{RIMONE}}  & \multicolumn{1}{c|}{\textbf{BOSCH}}                 & \multicolumn{1}{c|}{\textbf{FORUS}}                  & \multicolumn{1}{c|}{\textbf{REMIDIO}}                 & \textbf{ORIGA}  \\ \hline%               

  \multirow{ 5}{*}{\rotatebox{90}{\textbf{$\uparrow$ DSC (\%) }}} 
  
  & Teacher $f_{\theta_\mathcal{S}}$            
  & \multicolumn{1}{c|}{$54.38 \pm 22.87$*$^+$}            
  & \multicolumn{1}{c|}{\textbf{87.71 $\pm$ 5.08}}  
  & \multicolumn{1}{c|}{$83.64 \pm 8.54$*}            
  & \multicolumn{1}{c|}{$78.54 \pm 18.43$*$^+$}             
  & \underline{\textit{82.18 $\pm$ 14.52$^+$}}   \\ \cline{2-7}
 
 & \textbf{Ours} ($f_{\theta_\mathcal{U}}$)     
 & \multicolumn{1}{c|}{\underline{\textit{68.39 $\pm$ 19.32}}}   
 & \multicolumn{1}{c|}{$85.49 \pm 5.44$}           
 & \multicolumn{1}{c|}{\textbf{88.85 $\pm$ 5.17}}  
 & \multicolumn{1}{c|}{\textbf{86.25 $\pm$ 6.92}}   
 & $81.08 \pm 11.07$            \\ \cline{2-7} 
 
 & \textbf{Ours} ($f_{\theta_{\mathcal{U} \cup \mathcal{S}}}$) 
 & \multicolumn{1}{c|}{\textbf{70.37 $\pm$ 18.23}}  
 & \multicolumn{1}{c|}{\underline{\textit{85.52 $\pm$ 5.94}}}           
 & \multicolumn{1}{c|}{$84.34 \pm 7.31$}  
 & \multicolumn{1}{c|}{\underline{\textit{85.83 $\pm$ 8.87}}}   
 &   \textbf{83.03 $\pm$ 14.52 }        \\ \cline{2-7}\cline{2-7}
 
 & Teacher ($k=2$)            
 & \multicolumn{1}{c|}{$53.22 \pm 26.27$*$^+$} 
 & \multicolumn{1}{c|}{$79.05 \pm 9.28$*$^+$}           
 & \multicolumn{1}{c|}{\underline{\textit{88.51 $\pm$ 5.04}}}           
 & \multicolumn{1}{c|}{$83.94 \pm 9.94$*$^+$}            
 & $80.84 \pm 11.92^+$            \\ \cline{2-7}
 
 & \textbf{Ours} ($\mathcal{U}$, $k=2$)             
 & \multicolumn{1}{c|}{$65.83 \pm 24.93$}  
 & \multicolumn{1}{c|}{$74.23 \pm 7.81$*$^+$}           
 & \multicolumn{1}{c|}{$86.42 \pm 8.43$}           
 & \multicolumn{1}{c|}{$83.12 \pm 9.15$*$^+$}             
 & $74.57 \pm 12.94$*$^+$            \\ \hline\hline      
%%%%%%%%%%%%%%%%%%%%%HD


  \multirow{ 6}{*}{\rotatebox{90}{ \textbf{$\downarrow$ HD}}}  & \textbf{Method}
  &\multicolumn{1}{c|}{\textbf{RIMONE}} 
  & \multicolumn{1}{c|}{\textbf{BOSCH}}                 
  & \multicolumn{1}{c|}{\textbf{FORUS}}                  
  & \multicolumn{1}{c|}{\textbf{REMIDIO}}                 
  & \textbf{ORIGA}                  \\ \cline{2-7}

  &Teacher $f_{\theta_\mathcal{S}}$ 
  & \multicolumn{1}{c|}{$77.97 \pm 40.12$*$^+$} 
  & \multicolumn{1}{c|}{\underline{\textit{14.83 $\pm$ 4.67}}}           
  & \multicolumn{1}{c|}{$31.21 \pm 12.21$*}            
  & \multicolumn{1}{c|}{$90.10 \pm 140.43$*$^+$}             
  & \multicolumn{1}{c}{$49.28 \pm 110.03^+$} \\ \cline{2-7}\cline{2-7}\cline{2-7}\cline{2-7}\cline{2-7}

  & \textbf{Ours} (\modelU) 
  & \multicolumn{1}{c|}{\underline{\textit{47.14 $\pm$ 24.95}}}  
  & \multicolumn{1}{c|}{\textbf{14.78 $\pm$ 4.48}}          
  & \multicolumn{1}{c|}{\underline{\textit{22.14 $\pm$ 7.47}}}  
  & \multicolumn{1}{c|}{\textbf{34.00 $\pm$ 14.42}}   
  & \textbf{36.08 $\pm$ 18.24}  \\ \cline{2-7}

  & \textbf{Ours} (\modelSUU) 
  & \multicolumn{1}{c|}{\textbf{45.49 $\pm$ 24.84}} 
  & \multicolumn{1}{c|}{$17.51 \pm 5.44$}          
  & \multicolumn{1}{c|}{$29.22 \pm 11.15$}  
  & \multicolumn{1}{c|}{\underline{\textit{34.51 $\pm$ 16.78}}}   
  & $50.66 \pm 126.01$       \\ \cline{2-7}\cline{2-7}
 
 & Teacher ($k=2$)            
 & \multicolumn{1}{c|}{$98.41 \pm 181.39$*$^+$} 
 & \multicolumn{1}{c|}{$18.07 \pm 4.78$*}           
 & \multicolumn{1}{c|}{22.38 $\pm$ 6.45}          
 & \multicolumn{1}{c|}{$35.94 \pm 15.53$*$^+$}            
 & \underline{\textit{44.21 $\pm$ 68.01}*$^+$}            \\ \cline{2-7}
 
 & \textbf{Ours} ($\mathcal{U}$, $k=2$)             
 & \multicolumn{1}{c|}{$71.12 \pm 152.02$}  
 & \multicolumn{1}{c|}{$20.27 \pm 4.13$*$^+$}           
 & \multicolumn{1}{c|}{\textbf{21.47 $\pm$ 7.51}}           
 & \multicolumn{1}{c|}{$38.65 \pm 19.86$*$^+$}             
 & $47.11 \pm 42.47$*$^+$           \\  \hline
\end{tabular}}
\end{table}

\begin{table}[h]
%\caption{We compared the Dice scores of our proposed models with those of recent articles that used domain adaptation and domain generalization methods for optic disc (OD) and optic cup (OC) segmentation. The best results for each dataset are highlighted in bold. The * symbol represents a significant improvement compared to the baseline.}
\caption{OD segmentation results in uncropped images from unseen domains with $k=2$ iterations. The two best models are highlighted in bold and underlined italics, respectively. Statistically significant improvements of Students $f_{\theta_\mathcal{U}}$ and $f_{\theta_{\mathcal{U} \cup \mathcal{S}}}$ are indicated with * and +, respectively.}
    \label{tab:OD_iter}
\resizebox{\textwidth}{!}{
\begin{tabular}{c|c|cccccc}
\hline%\cline{2-10}
 %\multirow{ 2}{*}{\textbf{Method}} & \multicolumn{5}{c|}{\textbf{Optic disc (Dice \%)}} & \multicolumn{5}{c}{\textbf{Optic cup (Dice \%)}}                                                                                                                                   \\ \cline{2-11}
OD &\textbf{Method}  
& \multicolumn{1}{c|}{\textbf{RIMONE}}  
& \multicolumn{1}{c|}{\textbf{BOSCH}}                 
& \multicolumn{1}{c|}{\textbf{FORUS}}                  
& \multicolumn{1}{c|}{\textbf{REMIDIO}}                 
& \multicolumn{1}{c|}{\textbf{ORIGA}} 
& \textbf{PALM} \\ \hline%               

  \multirow{ 5}{*}{\rotatebox{90}{\textbf{$\uparrow$ DSC (\%) }}} 
  &  Teacher $f_{\theta_\mathcal{S}}$  
  &  \multicolumn{1}{c|}{$81.59 \pm 20.03^+$} 
  & \multicolumn{1}{c|}{$95.93 \pm 1.54$*}           
  & \multicolumn{1}{c|}{$95.94 \pm 2.53$*$^+$}            
  & \multicolumn{1}{c|}{$90.65 \pm 10.39$*$^+$}             
  & \multicolumn{1}{c|}{$89.98 \pm 11.55$} 
  & \underline{\textit{68.62 $\pm$ 34.69 }}  \\ \cline{2-8}\cline{2-8}\cline{2-8}\cline{2-8}\cline{2-8}\cline{2-8}
 
 & \textbf{Ours} ($f_{\theta_\mathcal{U}}$)      
 & \multicolumn{1}{c|}{\textit{\underline{86.88 $\pm$ 7.72}}}  
 & \multicolumn{1}{c|}{96.37 $\pm$ 1.15}          
 & \multicolumn{1}{c|}{\underline{\textit{97.36 $\pm$ 0.67}}}  
 & \multicolumn{1}{c|}{\textbf{96.74 $\pm$ 1.61}}   
 & \multicolumn{1}{c|}{\underline{\textit{91.17 $\pm$ 4.56}}} 
 & $68.03 \pm 35.37$            \\ \cline{2-8} 
 
 & \textbf{Ours} (\modelSUU)      
 & \multicolumn{1}{c|}{\textbf{87.11 $\pm$ 7.04}} 
 & \multicolumn{1}{c|}{96.19 $\pm$ 1.05}          
 & \multicolumn{1}{c|}{96.81 $\pm$ 1.40}  
 & \multicolumn{1}{c|}{95.13 $\pm$ 3.12}   
 & \multicolumn{1}{c|}{\textbf{91.27 $\pm$ 4.16}} 
 & $60.96 \pm 40.47$      \\ \cline{2-8}

  &Teacher ($k=2$) 
  & \multicolumn{1}{c|}{$76.31 \pm 22.60$*$^+$} 
  & \multicolumn{1}{c|}{\underline{\textit{96.66 $\pm$ 0.97}}} 
  & \multicolumn{1}{c|}{$96.79 \pm 1.65$*}           
  & \multicolumn{1}{c|}{$95.75 \pm 3.79$*}            
  & \multicolumn{1}{c|}{$90.50 \pm 6.96$} 
  & \textbf{68.97 $\pm$ 39.98}$^+$ \\ \cline{2-8}

  & \textbf{Ours} ($\mathcal{U}$, $k=2$) 
  & \multicolumn{1}{c|}{$81.83 \pm 17.84$} 
  & \multicolumn{1}{c|}{\textbf{96.86 $\pm$ 0.95}} 
  & \multicolumn{1}{c|}{\textbf{97.39 $\pm$ 1.19}}  
  & \multicolumn{1}{c|}{\underline{\textit{96.49 $\pm$ 3.24}}}            
  & \multicolumn{1}{c|}{$91.14 \pm 4.69$}     
  & $60.96 \pm 40.47$$^+$       \\ \hline\hline      
%%%%%%%%%%%%%%%%%%%%%HD


  \multirow{ 6}{*}{\rotatebox{90}{ \textbf{$\downarrow$ HD}}}  & \textbf{Method}
  &\multicolumn{1}{c|}{\textbf{RIMONE}} 
  & \multicolumn{1}{c|}{\textbf{BOSCH}}                 
  & \multicolumn{1}{c|}{\textbf{FORUS}}                  
  & \multicolumn{1}{c|}{\textbf{REMIDIO}}                
  & \multicolumn{1}{c|}{\textbf{ORIGA}}    
  & \textbf{PALM}              \\ \cline{2-8}

  &Teacher $f_{\theta_\mathcal{S}}$ 
  & \multicolumn{1}{c|}{$57.43 \pm 78.14$*$^+$} 
  & \multicolumn{1}{c|}{$11.43 \pm 3.43$}           
  & \multicolumn{1}{c|}{$16.57 \pm 7.64$*$^+$}            
  & \multicolumn{1}{c|}{$71.17 \pm 112.90$*$^+$}             
  & \multicolumn{1}{c|}{$46.99 \pm 80.98^+$} 
  & \underline{\textit{160.44 $\pm$ 240.26}} \\ \cline{2-8}\cline{2-8}\cline{2-8}\cline{2-8}\cline{2-8}\cline{2-8}

  &\textbf{Ours} ($f_{\theta_\mathcal{U}}$) 
  & \multicolumn{1}{c|}{\textbf{35.31 $\pm$ 18.22}}  
  & \multicolumn{1}{c|}{11.38 $\pm$ 2.95}          
  & \multicolumn{1}{c|}{\textbf{11.96 $\pm$ 3.74}}  
  & \multicolumn{1}{c|}{\textbf{21.40 $\pm$ 10.55}}   
  & \multicolumn{1}{c|}{37.63 $\pm$ 20.64} 
  & $165.22 \pm  252.30$  \\  \cline{2-8}
  
  & \textbf{Ours} (\modelSUU) 
  & \multicolumn{1}{c|}{\underline{\textit{39.26 $\pm$ 21.58}}} 
  & \multicolumn{1}{c|}{$12.12 \pm 2.61$}          
  & \multicolumn{1}{c|}{12.78 $\pm$ 3.01}  
  & \multicolumn{1}{c|}{29.49 $\pm$ 20.59}   
  & \multicolumn{1}{c|}{\textbf{35.80 $\pm$ 15.15}} 
  & $182.88 \pm 255.38$          \\  \cline{2-8}

&Teacher ($k=2$) 
& \multicolumn{1}{c|}{$84.26 \pm 143.23$*$^+$} 
& \multicolumn{1}{c|}{\underline{\textit{10.26 $\pm$ 2.34}}} 
& \multicolumn{1}{c|}{$13.19 \pm 4.96$*}           
& \multicolumn{1}{c|}{$24.89 \pm 35.32$}            
& \multicolumn{1}{c|}{$40.87 \pm 26.96$} 
& \textbf{84.06 $\pm$ 169.86}$^+$ \\ \cline{2-8}

  & \textbf{Ours} ($\mathcal{U}$, $k=2$) 
  & \multicolumn{1}{c|}{$55.08 \pm 87.89$*$^+$} 
  & \multicolumn{1}{c|}{\textbf{9.76 $\pm$ 2.20}} 
  & \multicolumn{1}{c|}{\underline{\textit{12.04 $\pm$ 7.58}}}  
  & \multicolumn{1}{c|}{\underline{\textit{23.35 $\pm$ 21.04}}}            
  & \multicolumn{1}{c|}{\underline{\textit{ 36.72 $\pm$ 19.31}}}     
  & $182.88 \pm 255.38$       \\ \hline
  
\end{tabular}}
\end{table}

\end{document}
