\documentclass{midl} % Include author names

% The following packages will be automatically loaded:
% jmlr, amsmath, amssymb, natbib, graphicx, url, algorithm2e
% ifoddpage, relsize and probably more
% make sure they are installed with your latex distribution
\usepackage{lastpage}
\usepackage{subcaption}
\usepackage{mwe} % to get dummy images
\jmlryear{2024}\jmlrworkshop{Full Paper -- MIDL 2024}\jmlrvolume{-- nnn}\editors{Accepted for publication at MIDL 2024}

\title[Distance-aware non-IID Medical Image Segmentation]{Distance-Aware Non-IID Federated Learning for Generalization and Personalization in Medical Imaging Segmentation}

 % Use \Name{Author Name} to specify the name.
 % If the surname contains spaces, enclose the surname
 % in braces, e.g. \Name{John {Smith Jones}} similarly
 % if the name has a "von" part, e.g \Name{Jane {de Winter}}.
 % If the first letter in the forenames is a diacritic
 % enclose the diacritic in braces, e.g. \Name{{\'E}louise Smith}

 % Two authors with the same address
 % \midlauthor{\Name{Author Name1} \Email{abc@sample.edu}\and
 %  \Name{Author Name2} \Email{xyz@sample.edu}\\
 %  \addr Address}

 % Three or more authors with the same address:
 \midlauthor{\Name{Julia Alekseenko}\nametag{$^{1,2}$} \Email{julia.alekseenko@ext.ihu-strasbourg.eu}\\
  \Name{Alexandros Karargyris}\nametag{$^{1}$} \Email{akarargyris@gmail.com}\\
  \Name{Nicolas Padoy}\nametag{$^{1,2}$} \Email{npadoy@unistra.fr}\\
  \addr $^{1}$ IHU Strasbourg, Institute of Image-Guided Surgery, Strasbourg, France \\
  \addr $^{2}$ University of Strasbourg, CNRS, INSERM, ICube, UMR7357, Strasbourg, France}
  


% Authors with different addresses:
% \midlauthor{\Name{Author Name1} \Email{abc@sample.edu}\\
% \addr Address 1
% \AND
% \Name{Author Name2} \Email{xyz@sample.edu}\\
% \addr Address 2
% }

%\footnotetext[1]{Contributed equally}

% More complicate cases, e.g. with dual affiliations and joint authorship
% \midlauthor{\Name{Iuliia Alekseenko\midljointauthortext{Contributed equally}\nametag{$^{1,2}$}} \Email{abc@sample.edu}\\
% \addr $^{1}$ IHU Strasbourg, Institute of Image-Guided Surgery, Strasbourg, France. \\
% \addr $^{2}$ ICube, University of Strasbourg, CNRS, Strasbourg, France. \AND
% \Name{Alexandros Karargyris\midlotherjointauthor\nametag{$^{1}$}} \Email{xyz@sample.edu}\\
% \Name{Nicolas Padoy\nametag{$^{2}$}} \Email{alphabeta@example.edu}\\
% }

\begin{document}

\maketitle

\begin{abstract} %ok
Federated learning (FL) in healthcare suffers from non-identically distributed (non-IID) data, impacting model convergence and performance. While existing solutions for the non-IID problem often do not quantify the degree of non-IID nature between clients in the federation, assessing it can improve training experiences and outcomes, particularly in real-world scenarios with unfamiliar datasets. The paper presents a practical non-IID assessment methodology for a medical segmentation problem, highlighting its significance in medical FL. We propose a simple yet effective solution that utilizes distance measurements in the embedding space of medical images and statistical measurements calculated over their metadata. Our method, designed for medical imaging and integrated into federated averaging, improves model generalization by downgrading the contribution from the most distant client, treating it as an outlier. Additionally, it enhances model personalization by introducing distance-based clustering of clients. To the best of our knowledge, this method is the first to use distance-based techniques for providing a practical solution to the non-IID problem within the medical imaging FL domain. Furthermore, we validate our approach on three public FL imaging radiology datasets (FeTS \cite{pati2021federated}, Prostate \cite{liu2020ms}, \cite{liu2020saml}, and Fed-KITS2019 \cite{terrail2022flamby}) to demonstrate its effectiveness across various radiology imaging scenarios. %Our code and data splits are available at (https://github.com/).
\end{abstract}

\begin{keywords}
Federated Learning, Non-IID Data, Personalization, Generalization, Medical Segmentation, Medical Imaging.
\end{keywords}

\section{Introduction}
Federated learning (FL) in healthcare aims to achieve data collaboration while preserving privacy. It enables multiple institutions or healthcare entities (clients) to jointly train or evaluate artificial intelligence (AI) models without sharing raw, sensitive patient data. This collaborative and privacy-preserving approach improves predictive models, personalized treatments, and disease detection, leveraging diverse datasets from various institutions and democratizing the power of distributed clients. FL promotes inclusive model training, incorporating diverse populations for robustness and generalizability. Studies like \cite{sheller2020federated,dou2021federated} highlight FL's efficacy in medical applications, showcasing the power of algorithms like federated averaging (FedAvg) \cite{mcmahan2017communication}.

While traditional FL algorithms, like FedAvg, assume uniform data distribution across clients, real-world applications often face non-independent and identically distributed (non-IID) data, where data across clients lacks uniformity. Factors like disease manifestation, patient populations, or image acquisition protocols contribute to this heterogeneity, impacting model convergence and performance \cite{mcmahan2017communication}. Among recent approaches to improving the generalization of the global model, FedProx by Li et al. \cite{li2020federated} regulates local updates, while solutions like FedBN \cite{li2021fedbn} and FedCross \cite{xu2022federated} address non-IID scenarios by optimizing feature spaces or sequentially training the global model across clients. Another essential strategy in FL is personalization, which involves training a specific model for each client while leveraging insights from others. Recent advancements in personalized FL include training one model per participating institution through adaptations of meta-learning \cite{fallah2020personalized, acar2021debiasing}, multi-task learning \cite{marfoq2021federated}, utilizing partial model sharing \cite{pillutla2022federated}, local fine-tuning \cite{li2021ditto, yu2020salvaging}, and clustering solutions \cite{ghosh2022efficient, manthe2023whole}.

However, the majority of proposed works aim to accept client data distributions as non-IID without measuring the heterogeneity of the federation and integrating this information into the pipeline. Assessing non-IID characteristics can provide crucial insights into training challenges and generalizability. For instance, \cite{zhao2018federated} observed decreased accuracy in federated models with higher Earth Mover's Distance (EMD) in non-IID image datasets. Yet, their study only evaluated classification problems with basic datasets like MNIST and CIFAR-10, limiting its applicability to healthcare. In the medical domain, \cite{luo2023influence} proposed analyzing data distributions related to site, tumor type, tumor size, dataset size, and tumor intensity. They demonstrated a significant negative correlation between the Dice score ratio and data distribution distances, particularly with EMD, in medical image segmentation. However, practical solutions for improving the federation based on these findings have not been proposed.

\underline{Our main contribution lies in} integrating EMD distance-related insights into federated averaging to offer optimal training for both generalization and personalization strategies, surpassing the performance of traditional FL methods. Assuming that this approach may not capture all non-IID characteristics, we explore non-IID measurements within the embedding space of data (i.e., medical images). We utilize a publicly-available pre-trained model to extract rich and meaningful embeddings, and then calculate the Euclidean distance (EUC) based on them. To the best of our knowledge, this is the first work in the medical imaging FL to propose this concept.

% In our evaluation, we compared the performance of this deep learning approach with statistical distance metric, specifically EMD.

\section{Method} 

\begin{figure*}[!htbp]
 \floatconts
   {fig:example}
   {\caption{Proposed method for medical FL optimization.}}
   {\includegraphics[width=0.95\linewidth]{overview.png}}
\end{figure*}
 
Our methodology, presented in Figure \ref{fig:example}, aims to achieve two important yet opposing goals in FL: a) model generalization, focusing on improving the accuracy and generalization of the model, and b) model personalization, tailoring the model for the highest accuracy at the client level. Our proposed methodology can be subdivided into two (2) steps. The first step involves measuring the degree of data heterogeneity (non-IID) among clients. To achieve this, our methodology runs two approaches in parallel. The first approach relies on statistical distances (i.e., EMD) based on metadata from medical images (Subsection \ref{Statistical Distance}), while the second one calculates EUC in the embedding space of these images (Subsection \ref{Pre-trained Embedding}). For this, a publicly pre-trained model is deployed from the server to each client for extracting embeddings. Subsequently, we extend the federated averaging algorithm by incorporating the down-weighting of the most distant client to enhance generalization. We also augment the personalization strategy by clustering closely-related clients (Subsection \ref{Personalization and Generalization Strategies}). While we hypothesize that the federation is trusted, we recognize the potential for privacy enhancement. However, the investigation into the compatibility of our methodology with privacy-preserving techniques \cite{jin2023fedml}, \cite{wei2020federated} is beyond the scope of this study.

\subsection{Non-IID Assessment with EMD Statistical Distance}\label{Statistical Distance}
To assess the non-IID nature of the federation, characterized by disparities between the clients' data, we calculate the EMD distance on metadata available in training medical images from each client. This metadata includes maximum intensity values and label volume values related to specific use cases. For instance, in the Federation of Tumor Segmentation (FeTS), where three labels (WT: whole tumor, TC: tumor core, ET: enhancing tumor) are segmented, we extract volume values corresponding to these labels. The choice of this metadata is based on the assumption that it can be consistently extracted for all experiments, making it inherently available for all medical radiological images.

Subsequently, the metadata vectors \( \mathit{V}_{Mi} \) for \(i = 1, 2, \ldots, N\) obtained from each client (where \( N \) is the total number of federated clients), representing the extracted metadata, are transmitted to the server. Here, the EMD is computed for each client in relation to other clients within the federation. The choice of this distance is intentional as it has demonstrated its robustness to capture dissimilarity between probability distributions and thus provide valuable insights into the distributional disparities among clients \cite{luo2023influence}. Other metrics may be considered in alternative domains.

\subsection{Non-IID Assessment with Euclidean Embedding Distance}\label{Pre-trained Embedding} 
In contrast to computing non-IID solely based on metadata, which may not capture the intricate characteristics within the data, we investigate computation in the embedding space of data for its capacity to capture deeper and richer data representations. Given the domain (i.e., medical imaging), a large publicly-available pre-trained MedicalNet model \cite{chen2019med3d} is utilized for embedding extraction in our proposed methodology. Training on the diverse 3DSeg-8 dataset, covering a wide range of modalities, organs, and pathologies, has resulted in the development of a set of heterogeneous 3D neural network models. We adopt the pre-trained 3D-ResNet18 network to extract embedding features from the local data on each client. Specifically, image features are extracted from the network's bottleneck layer, known for providing concise and essential representations of input data. We compute the angles between Principal Component Analysis (PCA) components (using two principal components) to validate their alignment (more details in Appendix E). These components compress vectors before transmission to the server, thereby enhancing privacy, as reconstruction accuracy drops when the number of components decreases \cite{reddy2021feature}.

Subsequently, all client embedding vectors, denoted as \( \mathit{V}_{Ei} \) for \(i = 1, 2, \ldots, N\), each with a dimension of \textit{client\_samples}$\times$(512, 2) and flattened to \textit{client\_samples}$\times$(1024), are transmitted to the server. These vectors are then utilized for the computation of EUC according to Equation (\ref{eq:euc}): %Exploring Federated PCA \cite{grammenos2020federated} could be promising.

\begin{equation}
\label{eq:euc}
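\textit{EUC}(\mathit{V}_{Ei}, \mathit{V}_{Ej}) = \sqrt{\sum_{k=1}^{K} (V_{Eik} - V_{Ejk})^2} ,
\end{equation}

\noindent where \( \mathit{V}_{Ei} \) and \( \mathit{V}_{Ej} \) represent feature vectors from any two clients, \( V_{Eik} \) denotes the \(k\)-th component of \( \mathit{V}_{Ei} \), and \( K \) is the number of components in each feature vector.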

% This straightforward distance metric was chosen for its well-defined nature, efficient computation, and consistent alignment with PCA.

\subsection{Generalization and Personalization Strategies}\label{Personalization and Generalization Strategies}
For generalization, we propose downgrading the most distant client (outlier) in the federation. To achieve this, we utilize the distances between clients computed during the non-IID assessment step and incorporate this information into weights (\(\omega\)) assigned to each client. This approach is implemented using the FedAvg algorithm for the proof of concept, while noting that any FL averaging method could be employed. The proposed update of the global model in FedAvg$_{w}$ is defined in Equation (\ref{eq:update_rule}):

\begin{equation}
\label{eq:update_rule}
\mathit{w}_{t+1} = \mathit{w}_t - \alpha \cdot \sum_{i=1}^{N} \omega_{i,t} \cdot \nabla f_i(\mathit{w}_t) ,
\end{equation}

\noindent where $\mathit{w}_{t+1}$ is the updated global model, $\omega_{i,t}$ represents the weight for client $i$ at iteration $t$, $f_i(\mathit{w}_t)$ is the local objective function for client $i$, and \(N\) is the total number of clients.

To identify a client for down-weighting, let \textit{D}\(_{\text{EMD}} \) and \textit{D}\(_{\text{EUC}} \) represent the matrices of EMD and EUC distances between clients, respectively. By summing along each column, we identify the client with the highest total (distant client) according to Equation (\ref{eq:max_index}), where \( D \) represents either \( D_{\text{EUC}} \) or \( D_{\text{EMD}} \):

% \begin{equation}
% \label{eq:max_index}
% \textit{i}_{\textit{max}} = {\arg\max_i \left( \sum_{j=1}^{N} D_{ij} \right)}.
% \end{equation}

\begin{equation}
\label{eq:max_index}
\textit{i}_{\textit{max}} = {\arg\max_i \left( \sum_{j=1}^{N} D_{ij} \right)}.
\end{equation}

Following that, we degrade its contribution by applying the arbitrary weight values ($\omega$) of 0.1, 0.3 and 0.5 to demonstrate the trend in improving the performance. This procedure enables us to assess the impact of reducing the influence of a single distant client ($i_{max}$) on the entire federation, thereby providing a clear illustration of the correlation between computed distances and the performance of the global model. The base weights ($\omega$) for the non-downgraded clients are 1. 

Then, in our personalization strategy, we use the same distance matrices \( D_{\text{EMD}} \) and \( D_{\text{EUC}} \) to build clusters of closely-related clients. The clustering algorithm minimizes total distances within each cluster (i.e., $C_1$ and $C_2$). The process is outlined in Algorithm (\ref{alg:cluster_assignment}).

\begin{algorithm2e}
\caption{Cluster Assignment}
\label{alg:cluster_assignment}
\SetAlgoLined
\KwIn{Distance matrix \(D\) and number of clients \(N\)}
\KwOut{Cluster assignments}
\tcp{Step 1: Identify the most distant client \(i_{\text{max}}\)}
\(i_{\text{max}} \leftarrow \arg\max_i \sum_j D_{ij}\)\;
\(C_2 \leftarrow \{i_{\text{max}}\}\) \tcp*{Initialize cluster \(C_2\) with \(i_{\text{max}}\)}
\(C_1 \leftarrow \{i \mid i \neq i_{\text{max}}\}\) \tcp*{Initialize remaining clients in cluster \(C_1\)}
\tcp{Step 2: Assign one or two closest clients \(i_{\text{next}}\) to \(i_{\text{max}}\)}
\While{\(C_1\) is not empty}{
  \(i_{\text{next}} \leftarrow \arg\min_{i \in C_1} D_{i_{\text{max}}i}\)\;
  Assign \(i_{\text{next}}\) to \(C_2\)\;
  \(C_1 \leftarrow C_1 \setminus \{i_{\text{next}}\}\) \tcp*{Remove \(i_{\text{next}}\) from \(C_1\)}
  \If{\(C_1\) has only 2 clients left}{
    Break \tcp*{Break the loop if only two clients left in \(C_1\)}
  }
}
\end{algorithm2e}




This clustering approach promotes effective collaboration and information exchange among clients with closer data distributions within each cluster. We limited the evaluation to only two clusters to show the benefit of the clustering approach and its impact on improving performance per client, while maintaining a reasonable number of experiments.

\section{Experiments}

\subsection{Datasets}
We used three publicly available FL datasets for our study. FeTS 2021 \cite{pati2021federated} consists of glioblastoma multi-modality MRIs from multiple sites, with WT, TC, and ET segmentations. For our experiment, we selected four (4) clients, ensuring a balanced distribution of samples (Hospital$_6$: 34 samples, Hospital$_ {13}$: 35 samples, Hospital$_{20}$: 33 samples, and Hospital$_{21}$: 35 samples). The multi-site prostate MRI segmentation dataset \cite{liu2020ms}, \cite{liu2020saml} features T2-weighted MRIs with prostate segmentation masks. We used four (4) balanced clients for our experiments: Client$_1$: 39 samples, Client$_2$: 32 samples, Client$_3$: 40 samples, Client$_4$: 39 samples. The Fed-KITS2019 dataset \cite{terrail2022flamby} focuses on kidney and tumor segmentation in CT scans. We created a 5-client federated version, excluding one site (Client$_6$: 30 samples) for a balanced distribution of samples: Client$_1$: 12 samples, Client$_2$: 14 samples, Client$_3$: 12 samples, Client$_4$: 12 samples, Client$_5$: 16 samples. Our focus on balanced and small federations promotes equal and rapid contributions from each client, facilitating equitable evaluation of the FL model across datasets. The data in each group were divided into training (80\%) and validation (20\%) sets, as originally proposed.

Additionally, we redistributed labels among clients to create non-IID federations, diversifying client distributions. We followed the methodology proposed in \cite{luo2023influence}. For non-IID federations, we aimed to maintain consistency in assigning training and validation samples across each set. However, if they were not available on the same client, adjustments were made, potentially resulting in differences in the selected samples, while preserving set sizes. Please refer to Appendix \ref{A} for more details regarding the data, and Appendix \ref{D} for information on building non-IID federations.

\subsection{Training and Validation}
We used a 3D U-Net network \cite{ronneberger2015u} along with the SGD optimizer with a learning rate of 0.01 and momentum of 0.9, employing DiceLoss for training, following a standard protocol for medical segmentation. Training was conducted using FedAvg \cite{mcmahan2017communication}, along with its weighted variant (Equation \ref{eq:update_rule}), across 25 global epochs as a balance between model convergence and optimized experiment time. We assessed performance using the Dice Metric, considering inter-client standard deviation for variation. We compared our methods against two representative FL algorithms: FedProx \cite{li2020federated} and DITTO \cite{li2021ditto}. Please refer to Appendix \ref{B} for further details.

\section{Results}
\subsection{Generalization Optimization}
To further reduce computation of $D_{EMD}$ for FeTS clients with many modalities and labels, we considered only the metadata where EMD values are maximum (EMD$_{max}$) among others as having the greatest negative impact. This applies to both maximum intensity (EMD${_{maxI}}$) and label volume values (EMD${_{maxL}}$). Please refer to Appendix \ref{C} for more details. Table \ref{tab:my-table} presents the correlation between EMD and the performance of the FedAvg global model, indicating that as EMD values increase, performance decreases. We refine the federation with the \textit{lowest performance}$^{A,B,C}$ to enhance generalization and personalization strategies. 

\begin{table}[h]
\centering
\caption{Comparison of EMD${_{maxI}}$, EMD${_{maxL}}$, and Dice score across different federations.}
\label{tab:my-table}
\resizebox{\textwidth}{!}{%
\begin{tabular}{lcccc|cc|cc}
\hline
              & \multicolumn{4}{c} {FeTS}      & \multicolumn{2}{c}{Prostate} & \multicolumn{2}{c}{Fed-KITS2019} \\ \cline{1-9}  {Federations: } 
              & Original & WT   & TC   & ET    & Original      & Prostate      & Original       & Kidney + Tumor \\ \hline
EMD$_{max}$$_{I}$ & 1.60     & 1.45 & 1.19 & 0.59  & 6.88          & 0.38          & 1.17        & 0.51           \\
EMD$_{max}$$_{L}$     & 1.56     & 6.31 & 6.83 & 10.96 & 0.48          & 9.83          & 0.87         & 11.62          \\
Dice          & 0.867$_{\pm0.10}$ & 0.856$_{\pm0.04}$ & 0.843$_{\pm0.05}$ & \textit{0.828$_{\pm0.12}$}$^{A}$ & 0.325$_{\pm0.10}$ & \textit{0.271$_{\pm0.06}$}$^{B}$ & 0.459$_{\pm0.03}$ & \textit{0.423$_{\pm0.02}$}$^{C}$ \\ \hline
\end{tabular}%
}
\end{table}

Analyzing the distance matrices provides valuable insights into the relationships between different clients (Table \ref{tab:my-tableD}). In FeTS$^{A}$, both EMD and EUC highlight client$_1$ as the most distant within the federation while for Prostate$^{B}$, client$_4$ emerges as the most distant. In Fed-KITS2019$^{C}$, while both metrics suggest significant differences between clients, the most distant client differs. This discrepancy is attributed to the nature of the data each distance operates on. EMD analyzes metadata distribution, while EUC operates in the embedding space, potentially capturing different data features. As a result, certain clients may appear more distant in one assessment compared to the other.

% Please add the following required packages to your document preamble:
% \usepackage{graphicx}
\begin{table}[htbp]
\centering
\caption{Distances matrices ($D_{EMD}$ and $D_{EUC}$) for FeTS$^{A}$, Prostate$^{B}$, and Fed-KITS2019$^{C}$.}
\label{tab:my-tableD}
\resizebox{13cm}{!}{%
\begin{tabular}{lccccccccccl}
\cline{2-11}
 & \multicolumn{10}{c}{FeTS$^{A}$ Clients} &  \\ \cline{2-11}
 & (EMD) & 1 & 2 & 3 & \multicolumn{1}{c|}{4} & (EUC) & 1 & 2 & 3 & 4 &  \\ \cline{2-11}
 & 1 & - & 3.60 & 7.87 & \multicolumn{1}{c|}{13.91} & 1 & - & 23 & 49 & 63 &  \\
 & 2 & 3.60 & - & 1.75 & \multicolumn{1}{c|}{4.55} & 2 & 23 & - & 30 & 47 &  \\
 & 3 & 7.87 & 1.75 & - & \multicolumn{1}{c|}{2.98} & 3 & 49 & 30 & - & 22 &  \\
 & 4 & 13.91 & 4.55 & 2.98 & \multicolumn{1}{c|}{-} & 4 & 63 & 47 & 22 & - &  \\ \cline{2-11}
 & Sum & 25.38 & 9.90 & 12.60 & \multicolumn{1}{c|}{21.45} & Sum & \textbf{135} & 100 & 101 & 132 &  \\ \cline{2-11}
 & \multicolumn{10}{c}{Prostate$^{B}$ Clients} &  \\ \cline{2-11}
 & (EMD) & 1 & 2 & 3 & \multicolumn{1}{c|}{4} & (EUC) & 1 & 2 & 3 & 4 &  \\ \cline{2-11}
 & 1 & - & 1.24 & 3.01 & \multicolumn{1}{c|}{8.54} & 1 & - & 51 & 83 & 122 &  \\
 & 2 & 1.24 & - & 2.70 & \multicolumn{1}{c|}{11.05} & 2 & 51 & - & 60 & 98 &  \\
 & 3 & 3.01 & 2.70 & - & \multicolumn{1}{c|}{4.31} & 3 & 83 & 60 & - & 59 &  \\
 & 4 & 8.54 & 11.05 & 4.31 & \multicolumn{1}{c|}{-} & 4 & 122 & 98 & 59 & - &  \\ \cline{2-11}
 & Sum & 12.79 & 14.99 & 10.02 & \multicolumn{1}{c|}{23.90} & Sum & 256 & 209 & 202 & \textbf{279} &  \\ \hline
\multicolumn{12}{c}{Fed-KITS2019$^{C}$ Clients} \\ \hline
\multicolumn{1}{c}{(EMD)} & 1 & 2 & 3 & 4 & \multicolumn{1}{c|}{5} & (EUC) & 1 & 2 & 3 & 4 & \multicolumn{1}{c}{5} \\ \hline
\multicolumn{1}{c}{1} & - & 0.92 & 1.45 & 2.46 & \multicolumn{1}{c|}{13.86} & 1 & - & 936 & 1268 & 2743 & \multicolumn{1}{c}{1207} \\
\multicolumn{1}{c}{2} & 0.92 & - & 1.03 & 2.37 & \multicolumn{1}{c|}{12.55} & 2 & 936 & - & 844 & 2211 & \multicolumn{1}{c}{600} \\
\multicolumn{1}{c}{3} & 1.45 & 1.03 & - & 1.15 & \multicolumn{1}{c|}{11.03} & 3 & 1268 & 844 & - & 1602 & \multicolumn{1}{c}{303} \\
\multicolumn{1}{c}{4} & 2.46 & 2.37 & 1.15 & - & \multicolumn{1}{c|}{4.89} & 4 & 2743 & 2211 & 1602 & - & \multicolumn{1}{c}{1548} \\
\multicolumn{1}{c}{5} & 13.86 & 12.55 & 11.03 & 4.89 & \multicolumn{1}{c|}{-} & 5 & 1207 & 600 & 303 & 1548 & \multicolumn{1}{c}{-} \\ \hline
\multicolumn{1}{c}{Sum} & 18.70 & 16.87 & 14.66 & 10.87 & \multicolumn{1}{c|}{42.34} & Sum & 6154 & 4591 & 4017 & \textbf{8104} & \multicolumn{1}{c}{3658} \\ \hline
\end{tabular}%
}
\end{table}

Table \ref{tab:my-tableglobalmodel} compares results across various learning approaches, including our down-weighting strategy (FedAvg${_w}$) for distant clients. In FeTS$^{A}$, FedAvg${_w}$ leads to a relative increase in performance compared to the default FedAvg approach and FedProx. For Prostate$^{B}$, it significantly enhances performance (+5.0 Dice points vs.\ FedAvg and +16.6 Dice points vs.\ FedProx at $\omega = 0.1$). Similarly, in Fed-KITS2019$^{C}$, it improves Dice performance regardless of distance metric (EMD or EUC).

\begin{table}[h]
\centering
\caption{Global model Dice scores (mean $\pm$ standard deviation between clients in the federation). EMD indicates Earth Mover's distance, EUC stands for Euclidean distance.}
\label{tab:my-tableglobalmodel}
\small
\resizebox{14.5cm}{!}{%
\begin{tabular}{lcccc}
\hline
\multicolumn{1}{c|}{Algorithm/Dataset:}                   & \begin{tabular}[c]{@{}c@{}}FeTS$^{A}$:\\ Client$_1$ \end{tabular} & \begin{tabular}[c]{@{}c@{}}Prostate$^{B}$:\\ Client$_4$\end{tabular} & \begin{tabular}[c]{@{}c@{}}Fed-KITS2019$^{C}$ EMD:\\ Client$_5$\end{tabular} & \begin{tabular}[c]{@{}c@{}}Fed-KITS2019$^{C}$ EUC:\\ Client$_4$\end{tabular} \\ \hline
\multicolumn{1}{l|}{FedAvg, $\omega$$_{default}$ = 1.0} & 0.828$_{\pm0.12}$                                        & 0.271$_{\pm0.06}$                                            & 0.423$_{\pm0.02}$                                                    & 0.423$_{\pm0.02}$                                                    \\
\multicolumn{1}{l|}{FedProx, $\mu$ = 0.1}            & 0.831$_{\pm0.13}$                                        & 0.155$_{\pm0.07}$                                            & 0.395$_{\pm0.02}$                                                    & 0.395$_{\pm0.02}$                                                    \\ \hline
\multicolumn{1}{l|}{FedAvg$_{w}$, $\omega$ = 0.1}             & 0.812$_{\pm0.14}$                                        & \textbf{0.321$_{\pm0.06}$}                                   & \textbf{0.438$_{\pm0.02}$}                                           & 0.449$_{\pm0.02}$                                                    \\
\multicolumn{1}{l|}{FedAvg$_{w}$, $\omega$ = 0.3}             & 0.833$_{\pm0.14}$                                        & 0.255$_{\pm0.06}$                                            & 0.428$_{\pm0.02}$                                                    & \textbf{0.460$_{\pm0.04}$}                                           \\
\multicolumn{1}{l|}{FedAvg$_{w}$, $\omega$ = 0.5}             & \textbf{0.840$_{\pm0.12}$}                               & 0.290$_{\pm0.07}$                                            & 0.409$_{\pm0.03}$                                                    & 0.456$_{\pm0.03}$                                                    \\ \hline
                                                 &                                                          &                                                              &                                                                      &                                                                     
\end{tabular}%
}
\end{table}

%add idea why 0.5 for fets, because u can see fedavg and fedprox works equaly high, so it means that clients is not so far from others...

\subsection{Personalization Optimization}
According to Table \ref{tab:my-tableD}, for FeTS$^{A}$ and Prostate$^{B}$ clients, two distinct clusters based on minimum distances could be formed, facilitating potential collaboration and improving model performance within FL. Fed-KITS2019$^{C}$ clients also exhibit clustering, with differing perspectives from EMD and EUC. While EMD provides only one clear cluster assignment, with EUC, we explore another assignment option based on the proximity of client$_3$ to client$_5$ as well.

\begin{table}[h]
\centering
\caption{Dice scores for FeTS$^{A}$, Prostate$^{B}$, Fed-KITS2019$^{C}$ personalization optimization.}
\label{tab:my-tablepersonalization}
\resizebox{14.5cm}{!}{%
\small
\begin{tabular}{l|cccccc}
\hline
Algorithm/Clients: & 1 & 2 & 3 & 4 & 5 & Average \\ 
\hline
\multicolumn{7}{c}{FeTS$^{A}$} \\ 
\hline
FedAvg$_{default}$ & 0.662$_{\pm0.28}$ & 0.824$_{\pm0.10}$ & 0.897$_{\pm0.11}$ & 0.929$_{\pm0.04}$ & - & 0.828 \\ 
DITTO & 0.698$_{\pm0.25}$ & 0.832$_{\pm0.08}$ & 0.893$_{\pm0.10}$ & 0.938$_{\pm0.03}$ & - & 0.840 \\ 
FedAvg$_{\left\{1, 2\right\}\left\{3, 4\right\}}$ & \textbf{0.702$_{\pm0.25}$} & \textbf{0.864$_{\pm0.06}$} & \textbf{0.914$_{\pm0.07}$} & \textbf{0.947$_{\pm0.03}$} & - & \textbf{0.857} \\ 
\hline
\multicolumn{7}{c}{Prostate$^{B}$} \\ 
\hline
FedAvg$_{default}$ & 0.194$_{\pm0.11}$ & 0.245$_{\pm0.13}$ & 0.316$_{\pm0.14}$ & 0.330$_{\pm0.09}$ & - & 0.271 \\ 
DITTO & 0.232$_{\pm0.12}$ & 0.276$_{\pm0.14}$ & \textbf{0.343$_{\pm0.13}$} & 0.355$_{\pm0.10}$ & - & 0.302 \\ 
FedAvg$_{\left\{1, 2\right\}\left\{3, 4\right\}}$ & \textbf{0.337$_{\pm0.17}$} & \textbf{0.347$_{\pm0.15}$} & 0.337$_{\pm0.11}$ & \textbf{0.422$_{\pm0.12}$} & - & \textbf{0.361} \\ 
\hline
\multicolumn{7}{c}{Fed-KITS2019$^{C}$} \\ 
\hline
FedAvg$_{default}$ & 0.398$_{\pm0.36}$ & 0.434$_{\pm0.37}$ & \textbf{0.439$_{\pm0.35}$} & 0.414$_{\pm0.39}$ & 0.428$_{\pm0.25}$ & 0.423 \\ 
DITTO & 0.367$_{\pm0.33}$ & 0.429$_{\pm0.35}$ & 0.400$_{\pm0.32}$ & 0.400$_{\pm0.39}$ & 0.430$_{\pm0.23}$ & 0.405 \\ 
FedAvg$_{EMD: {\left\{1, 2, 3\right\}\left\{4, 5\right\}}}$ & 0.441$_{\pm0.37}$ & 0.437$_{\pm0.38}$ & 0.433$_{\pm0.39}$ & 0.468$_{\pm0.38}$ & \textbf{0.594$_{\pm0.22}$} & \textbf{0.475} \\ 
FedAvg$_{EUC: {\left\{1, 2\right\}\left\{3, 4, 5\right\}}}$ & \textbf{0.442$_{\pm0.39}$} & \textbf{0.446$_{\pm0.39}$} & 0.433$_{\pm0.40}$ & \textbf{0.497$_{\pm0.41}$} & 0.528$_{\pm0.30}$ & 0.469 \\ 
\hline
\end{tabular}
}
\end{table}

Table \ref{tab:my-tablepersonalization} compares Dice scores for personalized models across different clients, revealing insights into method performance. In FeTS$^{A}$, FedAvg$_{\left\{1, 2\right\}\left\{3, 4\right\}}$ consistently outperforms FedAvg$_{default}$ and DITTO, indicating improved segmentation with personalized learning based on $C_{1} = \left\{1, 2\right\}$ and $C_{2} = \left\{3, 4\right\}$. Similarly, in Prostate$^{B}$, FedAvg$_{\left\{1, 2\right\}\left\{3, 4\right\}}$ shows significant improvements over FedAvg$_{default}$ and DITTO. In Fed-KITS2019$^{C}$, clustering through either EMD or EUC outperforms FedAvg$_{default}$ and DITTO on average and for every client except client$_3$, where FedAvg$_{default}$ remains marginally better.

\section{Conclusion and Discussion}

Our study underscores the significance of assessing client data heterogeneity (non-IID) in medical imaging FL to optimize both generalization and personalization goals. We propose a down-weighting strategy to enhance global model performance across datasets by reducing the impact of a distant client. Additionally, we advocate for distance-based clustering of clients as a personalization solution to enhance medical imaging segmentation accuracy across diverse datasets.

While promising, our study is limited to medical imaging, particularly volumetric radiographic datasets, and prioritizes balanced scenarios and small federations for faster computation and proof-of-concept purposes. Future research should explore unbalanced scenarios, larger federations, and alternative architectures for embedding extraction to broaden the applicability of our proposed strategies.

% Acknowledgments---Will not appear in anonymized version
\midlacknowledgments{This work was partially supported by the Region Grand Est (project CLINNOVA) and by French State Funds managed by the Agence Nationale de la Recherche (ANR) under Grant ANR-22-FAI1-0001 (project DAIOR) and Grant ANR-10-IAHU-02 (IHU Strasbourg).}

\bibliography{midl24_282}

\appendix

\section{Detailed Data Information}\label{A}

Table~\ref{tab:annex1} shows the number of samples in the training and validation splits of the three segmentation datasets used in this study.

\begin{table}[h]
\centering
\caption{Number of samples in the training and validation splits of three datasets.}
\label{tab:annex1}
\normalsize
\begin{tabular}{lccccccccccccc}
\hline
 & \multicolumn{4}{c}{FeTS} & \multicolumn{4}{c}{Prostate} & \multicolumn{5}{c}{Fed-KITS2019} \\ \hline
\multicolumn{1}{l|}{Client:} & 1 & 2 & 3 & \multicolumn{1}{c|}{4} & 1 & 2 & 3 & \multicolumn{1}{c|}{4} & 1 & 2 & 3 & 4 & 5 \\ \hline
\multicolumn{1}{l|}{Training} & 34 & 35 & 33 & \multicolumn{1}{c|}{35} & 39 & 32 & 40 & \multicolumn{1}{c|}{39} & 9 & 11 & 9 & 9 & 12 \\ \hline
\multicolumn{1}{l|}{Validation} & 7 & 7 & 7 & \multicolumn{1}{c|}{7} & 7 & 6 & 8 & \multicolumn{1}{c|}{7} & 3 & 3 & 3 & 3 & 4 \\ \hline
\end{tabular}%
\end{table}

We standardized our pre-processing and augmentation pipelines across all datasets to uphold consistency and reduce their potential impact on results. While maintaining uniform practices like ensuring channel-first representation and intensity normalization, we adjusted specific parameters, such as the spacing and the cropping size used for patch extraction (number of patches = 4) during training, to match the unique characteristics of each dataset.

For example, in the FeTS dataset, we utilized a spacing of (1.0, 1.0, 1.0), and a cropping size of (224, 224, 144). In the Prostate dataset, the spacing was set to (0.3, 0.3, 1.0), and the cropping size to (224, 224, 32). Meanwhile, for the Fed-KITS2019 dataset, we employed a spacing of (2.90, 1.45, 1.45), and a cropping size of (256, 256, 64).

As for augmentation, random flipping is applied along each spatial axis with a probability of 50\%. Intensity scaling and shifting are applied with factors and offsets of 0.1, respectively, with a probability of 100\%. 



\section{Training and Validation}\label{B}

For the FeTS dataset, we utilized a batch size of 1, and sliding window inference with a window size of (240, 240, 160) was applied. Similarly, for the Prostate dataset, the batch size remained at 1, and sliding window inference was conducted with a window size of (224, 224, 32). For the Fed-KITS2019 dataset, we maintained a batch size of 1, and sliding window inference was performed with a window size of (256, 256, 80).


All training and validation processes were conducted using the MONAI\textsuperscript{\ref{monai}} and NVIDIA FLARE\textsuperscript{\ref{nvidia_flare}} frameworks.

% Your document content goes here

References:
\begin{enumerate}
    \item MONAI: Medical Open Network for AI. \url{https://monai.io} \label{monai}
    \item NVIDIA FLARE: Federated Learning Application Runtime Environment. \url{https://developer.nvidia.com/flare} \label{nvidia_flare}
\end{enumerate}

\section{Computation of FeTS Distance Matrix}\label{C}
As an example, we calculate the final $D_{EMD}$ matrix for the non-IID ET federation of FeTS$^{A}$ clients. Initially, we compute the Earth Mover's Distance (EMD) for intensities of all modalities across all clients: T2-weighted, T1-weighted, T1-weighted with contrast enhancement (T1C+), and FLAIR. This provides us with the following values: EMD$_{T2}$ = 0.393, EMD$_{T1}$ = 0.247, EMD$_{T1C+}$ = 0.290, and EMD$_{FLAIR}$ = 0.595.


Similarly, we perform the same process for the three available labels for segmentation: WT (Whole Tumor), TC (Tumor Core), and ET (Enhancing Tumor) for each client with respect to each other client. We obtain the following EMD values:  EMD$_{WT}$ = 0.935, EMD$_{TC}$ = 0.561, and EMD$_{ET}$ = 10.96.

We select the maximum EMD in intensity modalities as EMD$_{maxI}$ = 0.595, corresponding to the FLAIR modality (EMD$_{FLAIR}$), and the maximum EMD in label distributions as EMD$_{maxL}$ = 10.96, corresponding to the ET (Enhancing Tumor) label (EMD$_{ET}$). Consequently, we construct two $D_{EMD}$ matrices representing client-to-client distances based on these values, as presented in Table~\ref{tab:annex2}.


% Please add the following required packages to your document preamble:
% \usepackage{graphicx}
\begin{table}[h]
\centering
\caption{$D_{EMD}$ matrix for the FLAIR modality and the ET (Enhancing Tumor) label.}
\label{tab:annex2}
\small
\begin{tabular}{lcllll|cccccl}
\cline{2-11}
 &
  \multicolumn{5}{c}{FLAIR} &
  \multicolumn{5}{c}{Enhancing Tumor (ET)} &
  \multicolumn{1}{c}{\textbf{}} \\ \cline{2-11}
 &
  \multicolumn{1}{l}{(EMD$_{maxI}$)} &
  \multicolumn{1}{c}{1} &
  \multicolumn{1}{c}{2} &
  \multicolumn{1}{c}{3} &
  \multicolumn{1}{c}{4} &
  \multicolumn{1}{|l}{(EMD$_{maxL}$)} &
  1 &
  2 &
  3 &
  4 &
   \\ \cline{2-11}
 & 1 & \multicolumn{1}{c}{-} & 0.33                  & 0.32                  & 0.27                  & 1 & -     & 6.87 & 15.42 & 27.55 &  \\
 & 2 & 0.33                  & \multicolumn{1}{c}{-} & 0.34                  & 1.45                  & 2 & 6.87  & -    & 3.07  & 7.40  &  \\
 & 3 & 0.32                  & 0.34                  & \multicolumn{1}{c}{-} & 0.86                  & 3 & 15.42 & 3.16 & -     & 5.18  &  \\
 & 4 & 0.27                  & 1.45                  & 0.86                  & \multicolumn{1}{c|}{-} & 4 & 27.55 & 7.40 & 5.18  & -     &  \\ \cline{2-11}
\end{tabular}%
\end{table}

\begin{table}[h]
\centering
\caption{Final $D_{EMD}$ matrix for the FeTS$^{A}$ clients.}
\label{tab-annex3}
\small
\begin{tabular}{llcccc}
\cline{2-6}
 & \multicolumn{5}{c}{$D_{EMD}$ for FeTS$^{A}$}     \\ \cline{2-6} 
 &   & 1 & 2 & 3 & 4 \\
 & 1 & -          & 3.60       & 7.87       & 13.91      \\
 & 2 & 3.60       & -          & 1.70       & 4.43       \\
 & 3 & 7.87       & 1.75       & -          & 3.02       \\
 & 4 & 13.91      & 4.43       & 3.02       & -          \\ \cline{2-6} 
\end{tabular}%
\end{table}

For the final $D_{EMD}$ matrix, we compute the element-wise average of these two matrices, with the result shown in Table~\ref{tab-annex3}.

\section{Building Non-IID Federations}\label{D}

As an example of building non-IID federations, we outline the main steps. In accordance with \cite{luo2023influence}, we utilized label sizes as a criterion for forming such federations. We redistributed label sizes by organizing them from smallest to largest. Specifically, for the FeTS dataset, this process was conducted for each specific use case (WT, TC, ET); for the Prostate dataset, it was done for the prostate label; and for Fed-KITS2019, we examined both kidney and tumor regions collectively. 

Figures~\ref{1f}, \ref{2f}, and~\ref{3f} show the label distributions for each non-IID federation.
 
\begin{figure*}[h]
 \floatconts
   {1f}
   {\caption{Distribution of tumor labels (WT, TC, ET) for FeTS$^{A}$: ET non-IID federation.}}
   {\includegraphics[width=0.89\linewidth]{ET_noniid.png}}
\end{figure*}

\begin{figure*}[h]
 \floatconts
   {2f}
   {\caption{Distribution of the prostate label for Prostate$^{B}$: Prostate non-IID federation.}}
   {\includegraphics[width=0.38\linewidth]{prostate.png}}
\end{figure*}

\begin{figure*}[h]
 \floatconts
   {3f}
   {\caption{Distribution of Segment\_1 (kidney), Segment\_2 (tumor), and average (Avg) labels for Fed-KITS2019$^{C}$: Kidney+Tumor non-IID federation.}}
   {\includegraphics[width=0.89\linewidth]{kidney_tumor.png}}
\end{figure*}
%\let\clearpage\relax
\FloatBarrier
\section{Angles of PCA Components}\label{E}

Figures~\ref{4a}, \ref{4b}, and~\ref{4c} show the angles between PCA components (1 and 2) for each non-IID federation.

\begin{figure*}[h]
 \floatconts
   {4a}
   {\caption{Angles between PCA components (1 and 2) for the FeTS$^{A}$ clients.}}
   {\includegraphics[width=0.79\linewidth]{MRI.png}}
\end{figure*}

\begin{figure*}[h]
 \floatconts
   {4b}
   {\caption{Angles between PCA components (1 and 2) for the Prostate$^{B}$ clients.}}
   {\includegraphics[width=0.75\linewidth]{PROSTATE.png}}
\end{figure*}

\begin{figure*}[h]
 \floatconts
   {4c}
   {\caption{Angles between PCA components (1 and 2) for the Fed-KITS2019$^{C}$ clients.}}
   {\includegraphics[width=0.75\linewidth]{CT.png}}
\end{figure*}

The formula to compute the angle between two PCA components $\mathbf{v}$ and $\mathbf{w}$ is given by:
\[
\text{Angle}(\mathbf{v}, \mathbf{w}) = \arccos\left(\frac{\mathbf{v} \cdot \mathbf{w}}{\|\mathbf{v}\| \, \|\mathbf{w}\|}\right),
\]
where $\cdot$ represents the dot product, $\|\mathbf{v}\|$ and $\|\mathbf{w}\|$ represent the magnitudes of vectors $\mathbf{v}$ and $\mathbf{w}$ respectively, and $\arccos$ is the inverse cosine function. We iterate over pairs of clients and compute the angle between the corresponding components for a random sample on each client.

\end{document}