\documentclass{midl} % Include author names

% The following packages will be automatically loaded:
% jmlr, amsmath, amssymb, natbib, graphicx, url, algorithm2e
% ifoddpage, relsize and probably more
% make sure they are installed with your latex distribution

\usepackage{mwe} % to get dummy images
\usepackage{booktabs} 
\usepackage{makecell}
\usepackage{svg}

\newrobustcmd{\B}{\bfseries}

\jmlrvolume{-- Under Review}
\jmlryear{2024}
\jmlrworkshop{Full Paper -- MIDL 2024 submission}
\editors{Under Review for MIDL 2024}

\title[ADFLL]{Towards a Collective Medical Imaging AI: Enabling Continual Learning from Peers}

 % Use \Name{Author Name} to specify the name.
 % If the surname contains spaces, enclose the surname
 % in braces, e.g. \Name{John {Smith Jones}} similarly
 % if the name has a "von" part, e.g \Name{Jane {de Winter}}.
 % If the first letter in the forenames is a diacritic
 % enclose the diacritic in braces, e.g. \Name{{\'E}louise Smith}

 % Two authors with the same address
 % \midlauthor{\Name{Author Name1} \Email{abc@sample.edu}\and
 %  \Name{Author Name2} \Email{xyz@sample.edu}\\
 %  \addr Address}

 % Three or more authors with the same address:
 % \midlauthor{\Name{Author Name1} \Email{an1@sample.edu}\\
 %  \Name{Author Name2} \Email{an2@sample.edu}\\
 %  \Name{Author Name3} \Email{an3@sample.edu}\\
 %  \addr Address}


% Authors with different addresses:
% \midlauthor{\Name{Author Name1} \Email{abc@sample.edu}\\
% \addr Address 1
% \AND
% \Name{Author Name2} \Email{xyz@sample.edu}\\
% \addr Address 2
% }

%\footnotetext[1]{Contributed equally}

% More complicate cases, e.g. with dual affiliations and joint authorship
 \midlauthor{\Name{Guangyao Zheng\nametag{$^{1}$}} \Email{tz30@rice.edu} \\
\Name{Vladimir Braverman\nametag{$^{1}$}} \Email{vb21@rice.edu} \\
\Name{Michael A. Jacobs\nametag{$^{2,3}$}}\Email{Michael.A.Jacobs@uth.tmc.edu} \\
\Name{Vishwa S. Parekh\nametag{$^{4}$}} \Email{vparekh@som.umaryland.edu} \\
\addr $^{1}$ Department of Computer Science, Rice University, Houston, TX, USA \\
\addr $^{2}$ Department of Diagnostic and Interventional Imaging, McGovern Medical School, UTHealth Houston, Houston, TX, USA \\
\addr $^{3}$The Russell H. Morgan Department of Radiology and Radiological Science, The Johns Hopkins University School of Medicine, Baltimore, MD 21205\\
\addr $^{4}$University of Maryland Medical Intelligent Imaging (UM2ii) Center\\ 
Department of Diagnostic Radiology and Nuclear Medicine \\
    University of Maryland School of Medicine \\
    Baltimore, MD 21201 \\
}
\begin{document}

\maketitle

\begin{abstract}

Federated learning is an exciting area within machine learning that allows cross-silo training of large-scale machine learning models on disparate or similar tasks in a privacy-preserving manner. However, conventional federated learning frameworks require a synchronous training schedule and are not capable of retaining knowledge from nodes that leave the system or performing lifelong learning where newer tasks are continuously introduced in the system either through new datasets at preexisting nodes or through new nodes joining the system. To that end, we propose an asynchronous decentralized federated lifelong learning (ADFLL) method that allows agents in the system to perform lifelong learning from their own and others' previous experiences, thus overcoming the potential drawbacks of conventional federated learning. We evaluate the ADFLL framework in two experimental setups: a simulation setup where we simulate the addition and deletion of nodes in an FL setup and a deployment setup where different nodes working on various tasks are initialized on separate machines with drastic compute differences. Experiments were evaluated on the brain tumor segmentation (BraTS) dataset for localizing the left ventricle in twenty-four imaging environments. In the simulation experiment with several agents leaving the federation every round leading up to a single agent remaining in the system, the final ADFLL agent had an excellent performance for landmark localization with an average Euclidean distance error of $8.55 \pm 7.12$ across twenty-four environments compared to $8.34\pm7.26$ ($p>0.05$) for an all-knowing agent that had centralized access to all the environments and $8.15\pm5.42$ ($p>0.05$) for a conventional lifelong learning agent that sequentially trained for twenty-four rounds, one environment at a time. Similarly, there was no significant difference between the performance of all four ADFLL agents and the all-knowing agent in the deployment setup.
In addition, one of the agents outperformed the conventional lifelong learning agent ($p=0.01$), while the other three had no significant difference from the conventional lifelong learning agent. In conclusion, we developed an ADFLL framework with excellent performance and speed-up compared to conventional LL agents.

\end{abstract}

\begin{keywords}
Federated learning, Lifelong learning, Deep reinforcement learning, Landmark localization
\end{keywords}

\section{Introduction}
Medical imaging techniques such as MRI (Magnetic Resonance Imaging), PET (Positron Emission Tomography), CT (Computerized Tomography), X-ray, and Ultrasound play a critical role in the diagnosis, prognosis, and preventative care of patients. Machine learning methods for medical imaging tasks, such as classification, segmentation, noise reduction, and landmark localization, have been used in many different applications in clinical decision support systems \cite{CHENG2022102313,PAN2022103824,doi:10.1080/02564602.2021.1937349,KHAIRANDISH2022290,9139480}. However, these methods are used on single tasks without the ability to be generalized to other tasks. They often require a complete dataset on one device to train, which may cause privacy concerns about patient data and computational constraints for the device specifications \cite{10.1007/978-3-030-91387-8_1}.

To address these challenges, Federated Learning (FL) has emerged as a promising approach enabling multiple agents to train a model collaboratively without sharing their data \cite{97c9f5c0c4714251a5c377616bf32211}, which protects data privacy and reduces computational costs at the local agent level by distributing the computation to multiple agents to train the model on their local data and sharing only the model updates with a central server. Federated learning implementations have shown promising results in various medical applications \cite{10.1007/978-3-030-60548-3_18,Jiang_Wang_Dou_2022,9268161}. However, federated learning frameworks often rely on synchronized learning schedules, meaning all participating agents start training simultaneously. They also require agents to have the same architecture as the central server to aggregate the model weights. Data and agent heterogeneity influence the training speed, which significantly reduces the efficiency and challenges the robustness of these approaches \cite{58f2965c5c4847d8b5e02e9e4408799d}. Additionally, Federated learning approaches cannot perform Lifelong Learning (LL), an essential aspect of machine learning applied to medical imaging.  Works have shown that lifelong learning capability can generalize and improve accuracy on multiple tasks \cite{10.1007/978-3-030-00928-1_54}. With new imaging sequences and new imaging technologies emerging, possessing backward compatibility while applying previously learned knowledge to new tasks is crucial for the accuracy and efficiency of the clinical decision-making support system. 


To address the abovementioned limitations, we propose an asynchronous decentralized federated lifelong learning (ADFLL) approach to landmark localization in medical imaging. This framework leverages Federated learning's ability to protect data privacy and reduce computational constraints while permitting data and agent heterogeneity in the system. This framework does not require a central node and can jointly lifelong learn multiple tasks without catastrophic forgetting. We provide a flexible, efficient, and robust framework that can be deployed in real-world applications. This paper presents experimental results demonstrating the efficacy of our framework on the 2017 brain tumor segmentation (BraTS) dataset consisting of twenty-four different imaging environments.

\section{Method}
\subsection{Deep Reinforcement Learning}
We created a deep reinforcement learning (DRL) framework that utilizes the deep Q-network (DQN) algorithm, which is depicted in Figure~\ref{fig2}. DRL will be the basis of our framework, allowing lifelong and federated learning. The 3D DQN model we used in this paper was adapted from existing works \cite{mnih2013playing,alansary2018automatic,vlontzos2019multiple,parekh2020multitask}. A 3D imaging volume with x, y, and z dimensions represents the environment. The agent is represented by a 3-dimensional bounding box within the 3D image with six possible actions: moving in the positive or negative direction along the x, y, or z axis. The state is defined by the agent's current location (or a chain of locations), each represented by a 3D bounding box of $45\times 45 \times 11$ pixels. The reward is calculated by the change in distance to the target landmark location before and after the agent takes an action. The agent's exploration within the environment generated state-action-reward-resulting state $[s,a,r,s']$ tuples, which were recorded and sampled in the experience replay buffer (ERB) over multiple episodes. The information in the ERBs is non-sensitive, as the action and reward are numbers regarding the DRL model, and the state and resulting states are tiny fractions of the total 3D image, roughly $0.3\%$.


\subsection{Lifelong Learning}
We implemented lifelong learning using selective experience replay \cite{rolnick2019experience}, a model-agnostic lifelong learning approach that enables sharing experiences across different models. To achieve lifelong learning, we utilized ERBs produced by DRL agents during the previous training that encapsulate previously learned knowledge. To learn a generalized representation of current and past tasks, the model selects a batch of experiences from the ERB of its current task and the ERBs of previous tasks during training.

\subsection{Asynchronous Decentralized Federated Lifelong Learning}
We developed the Asynchronous Decentralized Federated Lifelong Learning (ADFLL) by constructing a network of lifelong DRL agents. Each agent shares its database of personal experiences with the other agents to facilitate learning from one another's experiences. More specifically, once an agent finishes training with a dataset, the resulting experience from the training is shared with the network. Furthermore, we modified the training setup for each agent to sample experiences from the current dataset ERB, the agent's personal experiences, and the incoming experiences from other agents, as shown in Fig.~\ref{fig2}. As a result, every agent in the network can learn from each other's experiences, thereby integrating federated lifelong learning capability.

% \begin{figure}[htb]
% \centering{}
% \includegraphics[width=0.5\textwidth]{Resources/HubExample.png}
% \caption{Snapshot of the shared database maintained by the hub nodes} 
% \label{fig7}
% \end{figure}

In a naive setup, every agent would communicate their experiences with every other agent in the network. However, such an all-to-all communication setup is highly inefficient and not scalable as it would require a large amount of communication bandwidth. To address this issue, we implemented a homogeneous distributed database system. As shown in Fig.~\ref{fig2}, our network consists of a predefined set of hub nodes that act as communication hubs for spatially adjacent nodes in the network. Subsequently, every agent in the network exclusively communicates with their nearest hub node at the end of each personal training round. The experience sharing between an agent and a hub node is bidirectional. Finally, every hub node maintains a shared experience database on the network. The hub nodes periodically communicate with each other to synchronize their databases. The agents in the system are not required to have standardized training speed or start training simultaneously. The hub will regulate and preserve the experiences in the system, and agents in the system can train on different tasks. An example of this system is demonstrated in Fig.~\ref{fig2}.

\begin{figure*}[htb]
\centering
\includegraphics[width=1\textwidth]{Resources/Picture_ADFLL_pic1.png}
\caption{Illustration of decentralized federated learning setup. Blue circles represent individual agents, and orange circles represent hub agents.} \label{fig2}
\end{figure*}

The advantage of our system setup is that it is robust against node or hub failures. When a node fails, the only loss is its training information; when a hub fails, the loss is the unique ERBs other hubs do not have. Moreover, the communication complexity is linear with respect to the number of nodes; each node only needs to communicate with its respective hub, and hubs sync periodically. In contrast, other federated learning systems, centralized or not, are either prone to system-wide failure caused by a single node failure or sacrifice communication efficiency to prevent system-wide failures.

\subsection{Baseline Agents}
\textbf{All-knowing agent:} We trained a central aggregation agent, Agent X, that is trained using central aggregation of all the data across all the ADFLL agents in one place. This gives us the baseline ``upper bound'' performance compared to the ADFLL agents.

\textbf{Conventional lifelong DRL agent:}
We trained a conventional lifelong learning agent, Agent M, who has access to the dataset sequentially and is therefore trained for multiple rounds with one new dataset available every round. This provides us with a comparison between lifelong learning with and without experience sharing between lifelong learning DRL agents. 

\textbf{Conventional DRL agent:}
We trained a conventional DRL agent, Agent Y, who neither has access to all the environments nor can perform lifelong learning. This forms our ``lower bound'' baseline for performance comparison with ADFLL agents.

\subsection{Clinical Data}
To evaluate our ADFLL framework, we utilized the brain tumor segmentation (BraTS) dataset \cite{menze2014multimodal}. This dataset consisted of 285 patients and included pre-contrast T1-weighted, post-contrast T1-weighted, T2-weighted, and Fluid Attenuated Inversion Recovery (FLAIR) sequences in the axial orientation. We randomly sampled a subset of 100 patients to use as our experiment dataset. 60 patients have high-grade glioma (HGG), and 40 patients have low-grade glioma (LGG). We split the 100 patients into two parts 80:20. 80 were used for training and 20 for evaluation, with the training set consisting of 48 HGG and 32 LGG tumors and the test set consisting of 12 HGG and 8 LGG tumors. We reconstructed the dataset to include all three imaging orientations (coronal, sagittal, and axial). As a result, we obtained twenty-four imaging environments with combinations of two pathologies, 4 imaging sequences, and 3 image orientations. All the experiments were performed for the task of left ventricle localization. A sample of 8 task-environment pairs is shown in Fig.~\ref{fig3}.

\begin{figure}[!htb]\centering{}
\includegraphics[width=0.6\textwidth]{Resources/DFL_figure3.png}
\caption{Illustration of the 8 task-environment pairs. The red boxes indicate the true landmark location of the top left ventricle. The yellow box is a predicted location from ADFLL agents during their training progressions} \label{fig3}
\end{figure}

\textbf{Evaluation Metric:}
The performance metric was set as the terminal Euclidean distance in pixels between the agent's prediction and the target landmark. We performed paired t-tests to compare the performance of the ADFLL agents with the traditional lifelong learning framework, all-knowing deep reinforcement learning agent, and partial-knowing deep reinforcement learning agent. The p-value for statistical significance was set to $p \le 0.05$. 

\section{Experiments}

\subsection{Simulation Experiments}

We conducted two simulation experiments to evaluate our framework's scalability, flexibility, and robustness. For both experiments, we evaluated the average performance of all the agents to localize the top left ventricle across all 24 imaging environments. Additionally, since it is prohibitively expensive to experiment on 24 different machines, these systems were simulated on the NVIDIA DGX-1 with a synchronous training protocol.

\textbf{Addition of agents experiment:}
We initialized this experiment with a system of four ADFLL agents. We subsequently increased the number of agents in the system from 4 to 16 over 4 rounds (4,8,12,16). At the beginning of every round, each ADFLL agent in the system receives a new training dataset with a different imaging environment. Each agent performs ADFLL using ERBs from its previous tasks in addition to ERBs being communicated across the network from other agents. We further simulated a communication dropout of $75\%$ to account for network communication issues in the real world, leading to information loss while transmitting ERBs across agents. This experiment aimed to demonstrate how newer agents joining the system at different points in time can take advantage of the information within the system and learn the collective knowledge available in the system within just one round. 


\textbf{Deletion of agents experiment:}
In the deletion experiment, we gradually decreased the number of agents in the system from 24 to 1 agent over the progression of 5 rounds (24,12,6,3,1). Similar to the addition experiment, each agent remaining in the system receives a new training dataset every round, and the ERBs are communicated across the network. The communication for this experiment was also simulated with a $75\%$ dropout. This experiment aimed to demonstrate how the proposed ADFLL framework preserves the collective knowledge in a lifelong learning manner across all the tasks, even as the agents contributing the knowledge leave the system.

\subsection{Deployment Experiment}

We initialized this experiment with four agents and a sub-sample of eight environments (shown in Fig.~\ref{fig3}). We implemented two agents on an NVIDIA DGX-1, each with an NVIDIA V100, and two on Google Cloud, each with an NVIDIA T4. The two agents on Google Cloud, A1 and A2, have their individual hubs, H1 and H2. The two agents, A3 and A4, on the DGX-1 are connected to the third hub, H3, with a total of three hubs for four agents. Since the GPUs on DGX-1 are much more powerful than the GPUs on Google Cloud, A3 and A4 will run significantly faster than A1 and A2. We also implemented asynchronous learning, meaning when the agent finishes training on a task, it will broadcast its ERBs to the hub and begin training on a new dataset (if available) using its previous ERB and any new ERBs available at the hub. As a result, the number of ERBs available from other ADFLL agents when starting a new round of training will significantly vary between the slowest and the fastest agents in the network. This process is continued until all four agents complete three rounds of training. For comparison, Agent M was trained for eight rounds, and Agent X was trained to use the central aggregation of all eight datasets.



% \begin{figure}
% \centering{}
% \includegraphics[width=0.6\textwidth]{Resources/DFL_figure4.png}
% \caption{Illustration of the 4-agent decentralized federated lifelong learning framework of our experiment.} \label{fig4}
% \end{figure}
\begin{table}[!htp]\centering
\caption{Comparison of distance error between ADFLL agents (Agents 1--4) after round 3, all-knowing best case DRL agent (Agent X) after round 1, partially-knowing worst case DRL agent (Agent Y) after round 2.5, and traditional lifelong baseline DRL agent (Agent M) after round 8.}\label{fig5}
\scriptsize
\begin{tabular}{lrrr|rrrr}\toprule
\B Patient Characteristics &\B AgentX  & \B AgentY  & \B AgentM  &\B Agent1 &\B Agent2 &\B Agent3 &\B Agent4 \\
&Best case&Worst case & Traditional LL &\multicolumn{4}{c}{ADFLL Agents}\\
\cmidrule{1-8}
Coronal\_LGG\_t1 &10.05 &8.94 &10.68 &8.06 &8.06 &10.49 &\B 6.08 \\
Coronal\_LGG\_t2 &9.22 &8.31 &8.94 &9.49 &\B 7.35 &8.25 &33.79 \\
Sagittal\_LGG\_flair &10.77 &60.42 &14.59 &11.75 &\B 8.12 &8.31 &10.49 \\
Axial\_LGG\_tice &7.07 &89.91 &16.67 &31.19 &6.16 &6.4 &\B 3.74 \\
Axial\_HGG\_flair &4.47 &90.05 &22.16 &66.56 &\B 4.24 &12.04 &22.09 \\
Sagittal\_HGG\_t1 &31.19 &65.49 &13.15 &10.05 &\B 6.71 &11.75 &7.55 \\
Sagittal\_HGG\_t2 &\B 10.25 &68.61 &11.58 &24.7 &12.33 &22.67 &12.37 \\
Coronal\_HGG\_tice &11.22 &44.11 &23.58 &40.47 &\B 9.54 &39.71 &13.19 \\
\midrule
Mean &11.78 &54.48 &15.17 &25.28 &\B 7.81 &14.95 &13.66 \\
Std. dev &8.16 &32.05 &5.32 &20.46 &\B2.4 &11.17 &9.87 \\
\midrule
Ttest (vs. Agent X)& &0.01 &0.4 &0.18 &0.22 &0.54 &0.73 \\
Ttest (vs. Agent M) &0.4 &0.01 & &0.12 &0.01 &0.95 &0.72 \\
Ttest (vs. Agent Y) &0.01 & &0.01 &0.01 &0 &0.01 &0.02 \\
\bottomrule
\end{tabular}
\end{table}



\subsection{Results}

In the two simulation studies, our framework showed scalability of up to 24 agents, robustness against network dropout, and flexibility in system topology. As shown in Fig.~\ref{fig:fig25}, we see that the average Euclidean distance error across all agents and all 24 tasks decreases as more agents are added to the system, with an average Euclidean distance error of $16.89\pm16.34$ at the end of 4 rounds. As shown in Fig.~\ref{fig:fig25}, we also see that the average Euclidean distance error across all agents decreases while half of the agents are deleted every round, resulting in an average Euclidean distance error of $8.55 \pm 7.12$ for the final remaining agent at the end of 5 rounds. In comparison, the average Euclidean distance error was $8.34\pm7.26$ ($p>0.05$) for Agent X and $8.15\pm5.42$ ($p>0.05$) for Agent M. This shows that the knowledge agents learned and captured in ERBs is not lost when agents are removed from the system. When agents are added, the new agents can catch up with existing agents in one round. Moreover, the $75\%$ dropout rate applied to every round of both experiments shows the robustness of our framework against network failures, a major bottleneck for federated learning frameworks.

As shown in Table~\ref{fig5}, all four ADFLL agents had excellent performance with no significant difference in performance from the all-knowing Agent X ($p>0.05$). In addition, three agents (A1, A3, and A4) had no significant difference from Agent M, and A2 was significantly better than Agent M ($p=0.01$), just after three rounds of training, compared to eight rounds of training for Agent M. As shown in Table~\ref{fig5}, after three rounds of training, A2 was able to achieve a mean distance error of 7.81 on all eight tasks, compared to 11.78 ($p=0.22$) for Agent X, which is significantly lower than Agent Y (54.48; $p<0.001$) and Agent M (15.17; $p=0.01$) after eight rounds of training.



% \begin{figure*}[htb]
% \includegraphics[width=\textwidth]{Resources/result_comparison.png}
% \caption{Comparison of distance error across 8 different tasks between Agent X (Central Aggregation) and Agent 2 (ADFLL).} \label{fig6}
% \end{figure*}

\begin{figure}[htb!]
\includesvg[width=.5\linewidth]{Resources/addition.svg}\hfill
\includesvg[width=.5\linewidth]{Resources/deletion.svg}
\caption{Left: Comparison of distance error of all agents in the system across 4 rounds of training as agents join the system. Right: Comparison of distance error of all agents in the system across 5 rounds of training as agents leave the system.}
\label{fig:fig25}
\end{figure}


% \begin{figure}[htb!]
% \center
% \includegraphics[width=0.7\textwidth]{Resources/addition.png}
% \caption{Comparison of distance error of all agents in the system across 4 rounds of training as agents join the system.} \label{fig10}
% \end{figure}

% \begin{figure}[htb!]
% \center
% \includegraphics[width=0.7\textwidth]{Resources/deletion.png}
% \caption{Comparison of distance error of all agents in the system across 4 rounds of training as agents leave the system.} \label{fig11}
% \end{figure}



\section{Discussion}
In this work, we proposed the asynchronous decentralized federated lifelong learning (ADFLL) framework to achieve collective intelligence via continual learning from peers with excellent results both in simulated and deployment settings. The ADFLL framework's performance was similar to centralized baselines: the all-knowing baseline with all the datasets centrally aggregated in one location and the conventional LL baseline, where an agent sequentially trains across all the datasets.

Our simulation experiments demonstrated the capability of ADFLL to retain knowledge within the federation at a high communication dropout of $75\%$ even with several agents continually leaving the system. Similarly, the addition experiment demonstrated the capability of the ADFLL framework to allow newer agents joining the system to utilize all the knowledge within the system and catch up to existing agents in one round. The deployment setup allowed us to evaluate the real-world setting where different agents can access different compute environments and train at different speeds. As a result, faster agents did not have all the ERBs available when they started their last round of training, resulting in their performances being worse than the slowest agent, A2. A possible reason for Agent 1 performing worse than Agent X and Agent M is that it did not have all ERBs available or that its training was stuck at a local minimum. This can potentially be solved by sharing the model parameters of the latest agent.

Asynchronous federated learning has also been explored in other areas \cite{Chen2019AsynchronousOF}. They offer the ability to deal with nodes with different computational power but lack the decentralization that allows the system to be more flexible. Similarly, in \cite{Liu2022AsynchronousDF,2204.13591}, the cost of removing a central node is a quadratic complexity communication scheme in that every node communicates with every node.

Our work has certain limitations. This preliminary study focused on a single landmark localization task with deep reinforcement learning. Our future work will expand this framework to multi-agent systems and other medical imaging tasks such as segmentation and classification. In conclusion, we demonstrated a privacy-aware, asynchronous, decentralized federated learning system with robust and efficient system topology and excellent performance on landmark localization tasks for the BraTS imaging dataset. 


% Acknowledgments---Will not appear in anonymized version
\midlacknowledgments{This work was supported by the DARPA grant: DARPA-PA-20-02-11-HR00112190130  and 5P30CA006973 (Imaging Response Assessment Team-IRAT), U01CA140204.}


\bibliography{midl-samplebibliography}

\appendix
\section{Additional Experiments}

\subsection{Scalability Experiments}
\subsubsection{ Systems of 2,4,8,16,24 Agents}
To test the influence of agent count in a system on its performance, we tested systems with 2, 4, 8, 16, or 24 Agents training for two rounds on their performance to localize the top left ventricle in all 24 imaging environments. Since it is prohibitively expensive to experiment on 24 different machines, these systems were simulated on the NVIDIA DGX-1 with a synchronous training protocol. %%ADD round numbers

\textit{Result: } Figure \ref{fig2-24} demonstrates the comparison of Agent Rewards (AR) when different ADFLL systems consisting of 2, 4, 8, 16, and 24 agents were trained for two rounds in addition to the best outcome agent. The ADFLL agent systems with $\geq 4$ agents learned more than $80\%$ of the environments within just two rounds. Finally, the ADFLL systems with 16 and 24 agents learned greater than $90\%$ of the environments.

%%TODO: CHANGE SHELL TO ADFLL
\begin{figure}[htb]
\centering
\includegraphics[width=0.5\textwidth]{Resources/Appendix_Fig1.png}
\caption{Comparison of Euclidean distance error between different experimental setups involving all the base cases and ADFLL setups with 2, 4, 8, 16, and 24 agents}\label{fig2-24} 
\end{figure}


\subsection{72 Agents with significant network dropout}
To further test the scalability limit of our framework, we designed an experiment that involves 72 agents. We will use the 24 agents from the previous experiments and add 48 agents to the system, and the 48 agents will run for two rounds. They will be randomly assigned to an imaging environment each round. In the first round, they will learn from their local data and also from ERBs of the previous 24 agents. In the second round, they will learn from their local data, ERBs of the previous 24 agents, and ERBs from the first round of the 48 agents. Additionally, the robustness of our framework against communication failure is tested, and $50\%, 75\%, 90\%$ dropout rate is applied to the number of ERBs each agent receives, meaning only $50\%, 25\%, 10\%$ ERBs reached the agents.

\textit{Result: } Figure~\ref{fig48agents} shows the difference in average distance error when using different dropout rates during ERB sharing. The $90\%$ dropout rate performs the worst out of the three dropout rates, with higher average distance error and higher deviation in performance. Performance at $75\%$ and $50\%$ dropout is very similar, with the $50\%$ dropout rate having a slightly lower average distance error. Compared to no sharing in round 1, sharing just a tiny fraction ($10\%$) of the ERBs in the system can still provide a huge boost in performance. Figure~\ref{fig48agents} also indicates the highest dropout rate the new 48 agents can accommodate without losing significant performance.

\begin{figure*}[htb]
\includegraphics[width=\textwidth]{Resources/Average Distance Error of 48 Agents.png}
\caption{Illustration of the average distance error for the 48 new agents using different dropout rates}\label{fig48agents} 
\end{figure*}


% \subsection{Saturation Experiment}
% To test our framework's effectiveness against challenging tasks or weak agents, and to understand the value of sharing ERBs, the agents in the saturation experiment are not allowed to learn fully on their tasks. Usually, it takes 4 epochs per round to learn a task; in this experiment, they only trained for 2 epochs. We tested three ADFLL systems of different sizes (6, 12, 24) for 4 rounds, 2 epochs per round, and with a $75\%$ dropout of ERBs. As a baseline, Agent M will also train for 4 rounds and 2 epochs per round.

% \textit{Result: } Figure \ref{figSaturation} illustrates the average distance error and AR for different agents (1 agent - traditional LL agent, 6-agent system, 12-agent system, 24-agent system) after training for 4 rounds. The average distance error for the conventional LL system (Agent M) was 26.18 compared to 13.79 for the ADFLL system. As the agents in the system increase, the average distance error decreases. With 2 epochs of training each round, not allowing agents to be fully trained on the given dataset, they still provide useful information in their ERBs, showing the effectiveness of sharing ERBs, and the benefit of having more agents in the ADFLL system.

% \begin{figure}[htb]
% \center
% \includegraphics[width=0.7\textwidth]{Resources/saturation.png}
% \caption{(A) Illustration of the average distance error across all agents in the system for different systems: LL (in red) vs ADFLL (in blue) after 4 rounds. Each agent only trains for 2 epochs each round. (B) Illustration of AR across all agents in the system for different systems: LL (in red) vs ADFLL (in blue) after 4 rounds.}\label{figSaturation} 
% \end{figure}

\end{document}
