\documentclass[accepted]{uai2025} % for initial submission
%\documentclass[accepted]{uai2025} % after acceptance, for a revised version; 
% also before submission to see how the non-anonymous paper would look like 
                        
%% There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2025} % ptmx math instead of Computer
                                         % Modern (has noticeable issues)
% \documentclass[mathfont=newtx]{uai2025} % newtx fonts (improves upon
                                          % ptmx; less tested, no support)
% NOTE: Only keep *one* line above as appropriate, as it will be replaced
%       automatically for papers to be published. Do not make any other
%       change above this note for an accepted version.

%% Choose your variant of English; be consistent
\usepackage[american]{babel}
% \usepackage[british]{babel}

%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} % amsmath with fixes and additions
% \usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{tikz} % nice language for creating drawings and diagrams

\usepackage{times}  % DO NOT CHANGE THIS
\usepackage{helvet}  % DO NOT CHANGE THIS
\usepackage{courier}  % DO NOT CHANGE THIS
\usepackage{url}  % DO NOT CHANGE THIS
\usepackage{graphicx} % DO NOT CHANGE THIS
\urlstyle{rm} % DO NOT CHANGE THIS
\def\UrlFont{\rm}  % DO NOT CHANGE THIS
\usepackage{natbib}  % DO NOT CHANGE THIS AND DO NOT ADD ANY OPTIONS TO IT
\usepackage{caption} % DO NOT CHANGE THIS AND DO NOT ADD ANY OPTIONS TO IT
\frenchspacing  % DO NOT CHANGE THIS
\setlength{\pdfpagewidth}{8.5in} % DO NOT CHANGE THIS
\setlength{\pdfpageheight}{11in} % DO NOT CHANGE THIS
%
% These are recommended to typeset algorithms but not required. See the subsubsection on algorithms. Remove them if you don't have algorithms in your paper.
\usepackage{algorithm}
\usepackage{algorithmic}

\usepackage{soul}
\usepackage{hyperref} 
\usepackage[utf8]{inputenc}
% \usepackage[small]{caption}
\usepackage{amsmath}
\usepackage{amsthm,bm}
\usepackage{amssymb,multirow,paralist,mathrsfs,amsfonts,dsfont}
\usepackage{booktabs}
\usepackage{algorithm}
\usepackage{algorithmic}
\usepackage[switch]{lineno}
\usepackage{bbm} 
\usepackage{enumitem}
\usepackage{subfigure}
\usepackage{xcolor}
\usepackage{microtype}
% \usepackage{mathbbol}


\def\yanred{\textcolor{red}}
\def\yanblue{\textcolor{blue}}
\def\maRed{\textcolor{red}}
\def\maGreen{\textcolor{green}}
\def\maBlue{\textcolor{blue}}

\newtheorem{example}{Example}
\newtheorem{theorem}{Theorem}
\newtheorem{lemma}{Lemma}
\newtheorem{remark}{Remark}
\newtheorem{assumption}{Assumption}
\newtheorem{corollary}{Corollary}
\newtheorem{proposition}{Proposition}


\def\st{\text{s.t.}}
\def\v{{\bf v}}
\def\bmQ{{\bm Q}}
\def\bmu{\boldsymbol{\mu}}
\def\calX{\mathcal X}
\def\calY{\mathcal Y}
\def\calS{\mathcal S}
\def\calP{\mathcal P}
\def\calD{\mathcal D}
\def\calI{\mathcal I}
\def\calE{\mathcal E}
\def\calG{\mathcal G}
\def\E{\mathbb E}
\def\P{\mathbb P}
\def\R{\mathbb R}
\def\indicator{\mathbb{1}}
\def\textmin{\text{min}}
\def\mean{\text{mean}}
\def\median{\text{median}}
\def\rme{\text{rme}}
\def\ttoption{{\tt option }}
\def\ttI{{\tt I}}

\def\shiblue{\textcolor{blue}}
% \def\shired{\textcolor{red}}
\def\shired{\textcolor{black}}


%% Provided macros
% \smaller: Because the class footnote size is essentially LaTeX's \small,
%           redefining \footnotesize, we provide the original \footnotesize
%           using this macro.
%           (Use only sparingly, e.g., in drawings, as it is quite small.)

%% Self-defined macros
\newcommand{\swap}[3][-]{#3#1#2} % just an example

\title{Federated R\'enyi Fair Inference in Federated Heterogeneous System}

% The standard author block has changed for UAI 2025 to provide
% more space for long author lists and allow for complex affiliations
%
% All author information is authomatically removed by the class for the
% anonymous submission version of your paper, so you can already add your
% information below.
%
% Add authors
\author[1]{Zhiyong Ma*}
\author[2]{Yuanjie Shi*}
\author[2]{\href{mailto:<yan.yan1@wsu.edu>}{Yan Yan}{}}
\author[1]{\href{mailto:<ellachen@scut.edu.cn>}{Jian Chen}{}}

% Add affiliations after the authors
\affil[1]{%
    School of SSE\\
    South China University of Technology\\
    Guangzhou, Guangdong, China
}
\affil[2]{%
    School of EECS\\
    Washington State University\\
    Pullman, WA, USA
}
  
  \begin{document}
\maketitle

\begin{abstract}
Federated learning (FL) is a prominent distributed learning approach that addresses two major challenges: statistical heterogeneity (i.e., non-identically distributed data) and system heterogeneity (i.e., variability in communication and computation on each client). 
As FL is commonly applied in sectors such as commerce and finance, group disparities can emerge and cause harm.
However, current fairness algorithms assume homogeneous data, which do not align with the FL context.
The main challenge is estimating global fairness measures (e.g., R\'enyi or Pearson correlation) in an asynchronous, heterogeneous system.
To address this, we propose the FedR\'enyi algorithm, which regularizes fairness by R\'enyi correlation.
For statistical heterogeneity, FedR\'enyi aggregates local fairness statistics to estimate the global R\'enyi correlation with an estimation error bound of $O(1/\sqrt{n})$, where $n$ is the total number of data samples.
This theoretical result improves significantly over the previous result $O(1/\sqrt{K})$ with $K$ clients.
We further prove that FedR\'enyi converges at the same rate as in the homogeneous setting.
For system heterogeneity, FedR\'enyi approximates missing client updates through weighted averaging over a nearest neighbor region, ensuring a non-expansive approximation error under non-convex conditions.
Extensive experiments demonstrate that FedR\'enyi achieves a promising fairness-accuracy trade-off, with at least 2\% improvement over baselines.
\end{abstract}

\section{Introduction}

% 1. Background of FL

Federated learning (FL) is an effective paradigm for decentralized learning in large-scale datasets \cite{mcmahan2017communication,kairouz2021advances}, allowing models to be trained on multiple clients without sharing raw data, thus preserving privacy \cite{zhang2023federated}. 
Many FL works \cite{karimireddy2020scaffold,FedProx,xu2023asynchronous,zhu2021dga} have been proposed to address challenges in FL, such as {\it statistical heterogeneity}, where locally distributed data are non-identically distributed (non-IID), and {\it system heterogeneity}, which involves variability in communication and computational capabilities across clients, e.g., nonparticipating clients \cite{li2020federated,FedProx}.
These methods make FL attractive and suitable for many real-world sectors, such as commerce \cite{jain2023federated} and finance \cite{long2020federated,mammen2021federated}, where
large institutions (e.g., banks) seek to mitigate predictor bias caused by group disparities \cite{barocas2023fairness}. 
% However, existing group fairness algorithms in the centralized machine learning setting are unsuitable for FL \cite{ezzeldin2022fairfed}.

% 2. Background of group fairness

Group fairness \cite{barocas2023fairness} is a commonly used approach to mitigate prediction bias against certain demographic groups \cite{kleinberg2018algorithmic}, 
divided by sensitive attributes such as race or gender \cite{dwork2012fairness,mehrabi2021survey}. 
Many methods have been proposed to promote group fairness \cite{Baharlouei2020Rényi,woodworth2017learning},
but they are mostly designed for a centralized and homogeneous setting.


Some works have been proposed to improve group fairness in FL \cite{zeng2021improving,abay2020mitigating,du2020fairnessaware,zhang2020fairfl,chu2021fedfair}, 
all of which require empirical estimation of fairness measures (e.g., group disparities).
However, the previously established estimation error of the fairness measure is $O(1/\sqrt{K})$ with $K$ clients \cite{chu2021fedfair} under non-IID conditions,
which is worse than the standard estimation error of $O(1/\sqrt{n})$ with $n$ data samples in the centralized setting \cite{mohri2018foundations}.


In addition, it is unclear how to adapt these methods to the system heterogeneity (e.g., with dropping clients or ``stragglers'' \cite{FedProx}).
Specifically, the empirical fairness measure based on partially participating clients can deviate significantly from the true fairness measure based on fully participating clients.
The above two challenges raise a question: {\it How can we develop a federated fairness-enhanced algorithm with theoretical guarantees for fairness and convergence in both statistically and systemically heterogeneous settings?} 


To this end, we propose the Federated R\'enyi Fair Inference (FedR\'enyi) algorithm to promote group fairness in FL.
We use the general and tractable R\'enyi correlation \cite{renyi2007foundations,Baharlouei2020Rényi} as regularization to induce fairness globally across all clients.
Specifically, to estimate the global R\'enyi correlation, we first compute the necessary local group-wise statistics (see Eq. (\ref{eq:local_aggregation}) later) on each client, 
and then aggregate these local statistics following two federated weighting schemes (see Eq. (\ref{eq:gamma}) later) from clients into a global measure \cite{mansour2020three}.
For any nonparticipating client in each communication round, FedR\'enyi approximates its local statistics/model by weighted averaging over its neighbor clients based on their similarity measures.


We theoretically show that FedR\'enyi guarantees the estimation error bound in $O(1/\sqrt{n})$ order, which improves significantly over the previously established one in $O(1/\sqrt{K})$ \cite{chu2021fedfair} ($K \ll n$ usually in FL \cite{kairouz2021advances}).
Furthermore, we derive a convergence rate of FedR\'enyi in $O(1/\epsilon^4)$ iteration complexity, matching the same order as the standard FL result \cite{karimireddy2020scaffold}.
Moreover, we show that the proposed approximation is non-expansive for certain non-convex loss functions \cite{liu2021first} with a pre-trained model \cite{tan2022federated,weller2022pretrained,tian2022fedbert}, i.e., the distance between the approximated and true local statistics/models does not increase within a communication round.
% for both settings that reduce the expected squared norm of empirical and population gradient to $O(\epsilon)$ and $O(\epsilon + 1/n + \kappa)$ with a condition number $\kappa$ in $O(1/\epsilon^4)$ iteration complexity, respectively (Proposition \ref{proposition:convergence_based_on_scaffold}).
Finally, we empirically evaluate our method on benchmark datasets, showing that FedR\'enyi provides a promising trade-off performance between global accuracy and group fairness with at least $2\%$ improvement of the harmonic mean of accuracy and fairness over baselines in most cases. 

\noindent{\bf Contributions:}
Our key contributions are summarized as follows:

\begingroup
\leftskip=1.0em

\textbullet~We propose FedR\'enyi to promote group fairness in FL by using R\'enyi correlation as a regularization term.
We develop an aggregation method to estimate the global R\'enyi statistics from local clients, and an approximation scheme to approximate local statistics/models based on similarity measures between clients. 

\textbullet~We theoretically prove that FedR\'enyi provides a tight estimation error bound of $O(1/\sqrt{n})$. Based on this improved result, we further derive an $O(1/\epsilon^4)$ iteration complexity for FedR\'enyi, matching the standard FL result \cite{karimireddy2020scaffold}.
In addition, the similarity-based approximation scheme is non-expansive (the distance between the approximated and true statistics/models is non-increasing) under mild conditions.
 

\textbullet~Extensive experimental results verify the improved trade-off ability of FedR\'enyi (at least $2\%$ improvement in the harmonic mean of accuracy and fairness over baselines in most cases).

\endgroup

% \end{itemize}



\section{Related Work}
    \label{section:related_work}

{\bf Fairness in FL.}
\cite{gajane2017formalizing} systematically divides machine learning fairness into five types: group fairness, individual fairness, unconscious fairness, counterfactual fairness, and preference-based fairness.
Many studies have examined the impact of group fairness in FL using metrics such as demographic parity and/or equality of opportunity \cite{shi2021survey}. 
To mitigate group bias in heterogeneous settings, MWR \cite{selialia2024mitigating} employs a heuristic approach that enhances fairness by using importance weighting and regularization to optimize the accuracy of the worst-performing group.  
As data statistics are allowed to be shared in FL (e.g., \cite{shao2023survey,zhu2021data,jeong2018communication,seo202216}),
FairFed \cite{2023FairFed} increases the aggregated weights of clients with small deviations between local and global fairness metrics or accuracy.
Based on FairBatch \cite{2021FairBatch}, FedFB \cite{zeng2021improving} relies on statistical information about the performance of the client to adjust the minibatch sizes of each client in the local update process to optimize the group-specific losses, thus imposing group fairness.
FedFair \cite{chu2021fedfair} estimates model fairness and incorporates this estimation as a loss function constraint. 
Its estimation bound is $O(1/\sqrt{K})$, which is especially large compared to centralized results (e.g., \cite{mohri2018foundations}) as $K \ll n$.
However, both FedFB and FairFed lack theoretical analysis of estimation errors to justify their validity.
        % Both FairFed and FedFB rely on statistical information about the performance of each client, which may violate the privacy requirements.

{\bf Heterogeneity in FL.}
In FL, heterogeneity is categorized into statistical and system heterogeneity. 
Statistical heterogeneity refers to variability in data distributions across clients, which impacts performance and convergence of FL algorithms. 
Methods like control variates to reduce variance \cite{karimireddy2020scaffold} or proximal terms to stabilize training \cite{FedProx} address these issues empirically but lack theoretical guarantees on the estimation error in heterogeneous FL. 
System heterogeneity refers to disparities in communication and computational capacities among clients, leading to inefficient training and potential client dropout. 
Studies analyze estimation errors due to these disparities. 
\cite{sefidgaran2024lessons} investigates the partial estimation error caused by inter-client estimation discrepancies.
Within this framework, the same work examines the impact of communication rounds on the estimation error in federated learning, finding that increased communication does not always improve performance. 
\cite{hu2023generalization} introduces a two-level distribution framework to analyze the full estimation error caused by inter- and intra-client estimation errors in FL. 
It establishes learning bounds for participating and nonparticipating clients (stragglers), respectively. 
However, a comprehensive evaluation of the total estimation error remains limited.
 
    % \paragraph{\yanred{Statistical} Heterogeneity in FL.} 
    %     In FL, statistical heterogeneity refers to the variability in data distributions across different clients. 
    %     This heterogeneity can significantly impact the performance and convergence of federated learning algorithms. 
    %     To address these challenges, several methods have been proposed. 
    %     \cite{karimireddy2020scaffold} introduces control variates to reduce the variance caused by client updates and accelerates convergence by correcting the client drift. 
    %     \cite{FedProx} tackles the problem by adding a proximal term to the local objective functions, which limits the impact of local updates and stabilizes the training process. 
    %     % Another approach, \cite{wang2020federated}, aligns the local models before averaging them, ensuring that similar model parameters are combined.
    %     % \cite{wang2020tackling} normalizes the local updates by the number of local steps taken by each client. This normalization helps in dealing with the imbalance caused by clients having different amounts of local data and computation capabilities.
    %     However, all the above methods mitigate the effects of statistical heterogeneity empirically, lacking the theoretical guarantee for the estimation error in Heterogeneous FL.
        
    % \paragraph{System Heterogeneity in FL.} 
    %     System heterogeneity refers to the disparities in communication and computation capacities among clients. 
    %     These disparities can lead to inefficient training processes, as clients with limited resources may slow down the overall system or even drop out, affecting the robustness of the model training. 
    %     To evaluate the negative effects of system heterogeneity, some works analyze the generalization performance of FL models in heterogeneous environments.
    %     \cite{sefidgaran2024lessons} studies the partial generalization error caused by internal estimation error of each client. Under this framework, this paper investigates the impact of communication rounds on generalization error in federated learning, finding that increased communication does not always improve performance. 
    %     \cite{hu2023generalization} introduces a two-level distribution framework to analyze the full generalization error caused by external and internal estimation error of clients in FL. It establishes learning bounds for participating and unparticipating clients (stragglers), respectively. 
    %     However, there is few work investigating the total generalization error of all clients to evaluate the global impact of system heterogeneity effectively. 
        % equip asynchronous schemes or adjust the client selection strategies \cite{xu2023asynchronous}.
        % For example, \cite{xu2023asynchronous} proposed an effective strategy that adjusts the aggregation weights of stragglers based on stainless, which is the deviation time of non-straggler and straggler clients.
        % To handle the same problem, 
        % \cite{reisizadeh2020stragglerresilient} proposed a straggler-resilient FL method with similar strategy that starts the training procedure with faster nodes and gradually involves the slower nodes. 
        % There is a different strategy from \cite{wang2021resource}. It divided the edge nodes into different clusters based on communication efficiency and chose a leader node, then adopted different communication schemes within the nodes to speed up the training process.
        % \cite{chai2021fedat} combined the two above strategies that first divide clients into clusters and then use the stainless-weighted each cluster aggregation result to form the global model. 
        % Inspired by the above algorithms, we enhance the FedR\'enyi with asynchronous schemes that leverage the similarity between federated clients, dealing with the practical challenge of latency.
        % Recently, \cite{pmlr-v202-li23an} proposed a straggler-resilient algorithm to design secret sharing schemes for the local data and models, and the aggregation result is reconstructed losslessly, via decrypting computation shares from the non-straggling clients.
        % \cite{zhu2021dga} proposed Delayed Gradient Averaging, which delays the averaging step to improve efficiency and allow local computation in parallel to communication. 

\section{Background and Motivation}
\label{section:Problem Setup}
% \subsection{Background in Federated Learning}
{ \bf Notations.}
Let $(x, y, s) \in \calX \times \calY \times \calS$ be a data sample, where $\calX \subseteq \R^d$, 
$\calY = \{1, ..., C\}$ and 
$\calS = \{1, ..., P\}$ represent the feature, classification label, and protected attribute spaces, respectively. 
% with totally $C$ classes and $P$ attributes.
Let $\calP$ be the underlying distribution defined on $\calX \times \calY \times \calS$.
Let $X, Y, S$ be the random variables drawn from their respective distributions and $x,y,s$ be their realizations. 
$K$ denotes the number of clients, and the set of all client indices is $[K]$. 
Let $\calI_k$ be the index set for data on client $k$,
$n_k$ denote the number of data samples on $k$-th client,
the total number of data is $n$ and $n = \sum_{k=1}^K n_k$.
Denote $n_\textmin := \min_{k} n_k$ as the minimum number of data samples across all clients. 
We define the distribution on client $k$ by $\calP_k$ and refer to data heterogeneity as $\calP_{k_1} \neq \calP_{k_2}$ for $k_1 \neq k_2$.
Let $X_k$ and $S_k$ be feature and attribute variables on client $k$, respectively.
% \yanred{$Dir$ and $p$ are the heterogeneous parameters. ?????}
% Define $M$ as the number of synchronous iterations, and $\alpha$ as the straggler proportion for the system.
Define $f_\theta(x, s)$ as the prediction function parameterized by $\theta$ on $(x,s)$ and $\ell(\widehat y, y)$ as a loss function measured on the predicted label $\widehat y$ and true label $y$.
Define $\P_c = \P[f_{\theta} (X, S)=c ]$ as the probability of the model predicting class $c$ and $\rho = \min_{c \in \calY} \P_c $ as the smallest model prediction probability over all classes.
Let $\indicator[\cdot]$ be the indicator function, and let $A \backsim B$ denote that $A$ and $B$ are of the same order.
% Denote $\widehat \P_k[] := \E  \indicator[A]$ as the empirical probability of event $D$.
Define $\tau$ as the total number of communication rounds and $e$ as the index of a communication round. 
Denote $M$ as the number of local update iterations in one communication round $e$ and $m$ as the index of local iteration. 
Define the total number of iterations as $T$, where $T = \tau M$.
% Define $T$ as the total number of iteration and $I$ as the communication interval.
% Then, we define stage $s$ as two consecutive communication rounds. 
% We denote the total iteration of stage $s$ as $M$.
Notations are summarized in Table \ref{tab:notation}.

{\bf Federated Learning.}
In FL, given $K$ distributions $\calP_1, ..., \calP_K$ with sizes $\{ n_k \}_{k=1}^K$ from client $k \in [K]$, one aims to learn a model that minimizes the overall loss $ L(\theta)$:
\begin{align}
\label{eq:FL_population_objective}
&
\min_{\theta} L(\theta) 
= 
\sum_{k=1}^K \gamma_k \cdot \E_{(x,s,y) \sim \calP_k}
[ \ell(f_\theta(x, s), y) ]  ,
\end{align}
where $\gamma_k$ is the weight of client $k$.
Two target weighting schemes are commonly considered, i.e., uniform over sample and uniform over client:
\begin{equation}
\label{eq:gamma}
\gamma_k
=
\left\{
\begin{aligned}
    & 
    n_k / n, ~~~  \text{ uniform over sample,}
    \\
    & 
    1 / K,  ~~~~ \text{ uniform over client.}
    \end{aligned}
\right.
\end{equation}
The choices of $\gamma_k$ represent different schemes, which are both used in fairness-aware FL studies \cite{mohri2019agnostic,li2019fair,lyu2020collaborative,chu2021fedfair,fan2021improving}. 
% Therefore, we need to consider two different schemes and study their impact on the FL algorithm performance.
Since $L(\theta)$ is defined on population and is not accessible, an empirical FL objective is defined as:

\begin{align}\label{eq:FL_empirical_objective}
\min_\theta \widehat L(\theta) 
:= 
\sum_{k=1}^K \gamma_k 
\underbrace{\sum_{i=1}^{n_k} \frac{1}{n_k} \ell(f_\theta(x_{ki}, s_{ki}), y_{ki})}_{\widehat L_k(\theta) }
,
\end{align}
% where $
% \widehat L_k(\theta) := \sum_{i=1}^{n_k} \frac{1}{n_k} \ell(f_\theta(x_{ki}, s_{ki}), y_{ki}).
% $
where $(x_{ki}, s_{ki}, y_{ki})$ is the $i$-th data sample on client $k$.

In fairness-aware FL, minimizing $\widehat L(\theta)$ alone can lead to group disparities across sensitive attributes $S$.
To mitigate this, many fairness-aware FL studies \cite{li2021ditto,chu2021fedfair} incorporate a fairness regularization term $\widehat R(\theta)$ with parameter $\lambda$ into the objective function:
\begin{align*}
\min_\theta \widehat L(\theta) + \lambda \widehat R(\theta).
\end{align*}
In this paper, we consider using R\'enyi correlation as the fairness regularization term.

{\bf R\'enyi Fair Inference.}
R\'enyi correlation measures the correlation between two random variables, ranging from $0$ (independent) to $1$ (strictly dependent).
Unlike Pearson correlation, which captures only linear relationships \cite{zafar2017fairness}, R\'enyi correlation captures higher-order dependencies \cite{Baharlouei2020Rényi}.
It is also computationally tractable, compared to very expensive mutual information \cite{song2019learning}.
R\'enyi correlation between two random variables $A \in \{A_1, \dots, A_a\}$ and $B \in \{B_1, \dots, B_b\}$ is defined as:
\begin{align*}
&
\rho_R(A, B)
=
\sup_{f, g} \E[ f(A) g(B) ]
\\
& \resizebox{.91\linewidth}{!}{$
\st ~ \E[f(A)] = \E[g(B)] = 0,
\E[f^2(A)] = \E[g^2(B)] = 1.
$}
\end{align*}
Following \cite{witsenhausen1975sequences,Baharlouei2020Rényi}, 
R\'enyi correlation $\rho_R(A, B)$ is the second largest singular value of the matrix $Q$, where each element $q_{ij} = \frac{ \P[ A=A_i, B=B_j ] }{ \sqrt{ \P[ A=A_i ] \P[ B=B_j ] } }$ for $1 \leq i \leq a$ and $1 \leq j \leq b$.
The main idea of R\'enyi fair inference \cite{Baharlouei2020Rényi} is to minimize the correlation between predictions and sensitive attributes.
Following \cite{Baharlouei2020Rényi}, we can also re-formulate the squared term $\rho_R^2(A, B)$ as follows:
\begin{align}\label{eq:squared_renyi_correlation}
\rho_R^2(A, B)
=
\max_{\v \perp \v_1, \|\v\|^2 \leq 1} \v^\top Q^\top Q \v ,
\end{align}
where $\v_1 = (\sqrt{ \P[B=B_1] }, ..., \sqrt{ \P[B=B_b] }) \in \R^b$ is the right singular vector associated with the largest singular value of $Q$.

In {\em centralized} machine learning, where samples are i.i.d. and accessible, the estimation error for statistics (e.g., the R\'enyi correlation) is bounded by $O(1/\sqrt{n})$ with high probability \cite{mohri2018foundations}.
However, it remains unclear how accurately these statistics can be estimated in FL under {\em decentralized} and {\em heterogeneous} settings.
Specifically, the following challenges arise:
First, the server is not allowed to access attribute data to calculate the R\'enyi correlation in FL. 
This restriction complicates the global estimation of fairness measures. 
Second, clients often have vastly different data distributions due to statistical heterogeneity. 
This variation makes it difficult to precisely estimate the R\'enyi correlation across all devices.
Third, due to varying computational capabilities and communication resources, the server may not receive updates from some clients (i.e., stragglers). 
This variability complicates the aggregation of local statistics and model updates.
To address these challenges, we propose the FedR\'enyi algorithm.


\section{Federated R\'enyi-Regularized Learning}
\label{sec: FedRenyi}
% In this section, we present our FedR\'enyi algorithm with theoretical analysis in detail.
% First, we formulate the federated R\'enyi-regularized objective function.
% Then, we present the synchronous variant of FedR\'enyi algorithm and discuss theoretical analysis of synchronous FedR\'enyi, including the improved estimation error and convergence analysis.
% Finally, we propose the asynchronous variant of FedR\'enyi and analyze its approximation error and estimation error.

In this section, we detail the design and theoretical analysis of the FedR\'enyi algorithm. 
We begin by formulating the federated R\'enyi-regularized objective function. 
Next, we introduce the synchronous variant of FedR\'enyi and present its theoretical analysis, highlighting improved estimation error bounds and convergence guarantees. 
Finally, we propose the asynchronous variant and analyze its approximation and estimation errors.

% In the first subsection, we present the synchronous FedR\'enyi by:
% (i) formulating the federated R\'enyi-regularized objective, 
% (ii) presenting the FedR\'enyi algorithm for solving it, 
% (iii) discussing the improved estimation error bound in both uniform-over-sample and uniform-over-client schemes.
% After that, we show how the improved estimation error benefits the convergence of FedR\'enyi. 
% Finally, we exhibit the asynchronous scheme in the second subsection.


\subsection{Federated R\'enyi-regularized objective}
We use the squared R\'enyi correlation as a regularization term combined with the federated loss $\widehat L(\theta)$ aforementioned in (\ref{eq:FL_empirical_objective}).
Since all elements of the matrix $Q$ in (\ref{eq:squared_renyi_correlation}) are defined in population and unavailable to compute, instead, we aggregate the local statistics to estimate $Q$ based on a fixed model $\theta$.
We denote this empirically aggregated estimation as $\widehat Q_\theta \in \R^{C \times P}$ (recall we have $C$ classes and $P$ attributes).
For $1 \leq c \leq C$ and $1 \leq p \leq P$, each entry of
$\widehat Q_\theta$ is defined as $\widehat q_{cp} := \frac{ \hat j(c,p) \cdot \hat r(p) }{ \sqrt{ \hat u(c) \cdot \hat r(p) } },$
where:
% \begin{align}
% \label{eq:local_aggregation}
% \hat j(c,p) 
% = &
% \sum_{k=1}^K \gamma_k 
% \underbrace{ 
% \sum_{i \in \calI_k} \frac{1}{ n_k } \indicator[ f_\theta(x_{ki}, s_{ki}) = c | s_{ki} = p ] 
% }_{ = \bar j_k(c, p) },
% \nonumber\\
% \hat r(p) 
% = &
% \sum_{k=1}^K \gamma_k 
% \underbrace{ 
% \sum_{i \in \calI_k} \frac{1}{ n_k } \indicator[ s_{ki} = p ] 
% }_{ = \bar r_k(p) },
% \nonumber\\
% \hat u(c) 
% = &
% \sum_{k=1}^K \gamma_k 
% \underbrace{ 
% \sum_{i \in \calI_k} \frac{1}{ n_k } \indicator[ f_\theta(x_{ki}, s_{ki}) = c ] 
% }_{ = \bar u_k(c) } .
% \end{align}
\begin{align}
\label{eq:local_aggregation}
\hat j(c,p) 
= &
\sum_{k=1}^K \gamma_k 
\underbrace{ 
\widehat \P[ f_{\theta_k}(X_k, S_k) = c | S_k = p ] 
}_{ = \bar j_k(c, p) },
\nonumber\\
\hat r(p) 
= &
\sum_{k=1}^K \gamma_k 
\underbrace{ 
\widehat \P[ S_k = p ] 
}_{ = \bar r_k(p) },
\nonumber\\
\hat u(c) 
= &
\sum_{k=1}^K \gamma_k 
\underbrace{ 
\widehat \P[ f_{\theta_k}(X_k, S_k) = c ] 
}_{ = \bar u_k(c) },
\end{align}
where $\widehat \P [X_k \in D] := \frac{1}{n_k} \sum_{i \in \calI_k} \indicator[x_{ki} \in D]$ represents the empirical probability that $X_k$ lies in a measurable set $D$.

Therefore, following (\ref{eq:squared_renyi_correlation}), we define $\widehat H(\theta, \v) := \v^\top \widehat Q_\theta^\top \widehat Q_\theta \v$, and we formulate the federated R\'enyi-regularized objective:
% \begin{align}
% \label{eq:fedrenyi_empirical_objective}
% \min_\theta \max_{\v \perp \hat \v_1, \|\v\|^2 \leq 1} 
% \widehat L(\theta) 
% + \lambda \widehat H(\theta, \v) ,
% \end{align}
\begin{align}
\label{eq:fedrenyi_empirical_objective}
\min_\theta \bigg \{ 
\widehat L(\theta) 
+ \max_{\v \perp \hat \v_1, \|\v\|^2 \leq 1} \lambda \widehat H(\theta, \v) \bigg \},
\end{align}
where $\hat \v_1 = \big(\sqrt{ \hat r(1) } , ..., \sqrt{ \hat r(P) } \big) \in \R^P$. 
The population counterpart of $\widehat H(\theta, \v)$ is denoted by $H(\theta, \v)$, analogous to the relation between $\widehat L(\theta)$ in (\ref{eq:FL_empirical_objective}) and $L(\theta)$ in (\ref{eq:FL_population_objective}).


\begin{algorithm}[!t]
\caption{FedR\'enyi Algorithm }
\label{alg:FR}
\begin{algorithmic}[1]
    \STATE Initialize $\theta^0_0$, $\v^0$ and hyperparameters $\lambda$, $M$, $J$ and $\eta$ on server and clients
    \label{alg:FR:line:init}
    \STATE Each client $k \in [K]$ computes $\bar r_k(p)$ following Eq. (\ref{eq:local_aggregation}) and uploads $\bar r_k(p)$ and $n_k$
    \label{alg:FR:line:comput_rk}
    \STATE Server aggregates $\hat r(p) = \sum_{k=1}^K \gamma_k \bar r_k(p)$ and $\hat \v_1 = [\sqrt{\hat r(1) } , ..., \sqrt{ \hat r(P) } ] $ 
    \label{alg:FR:line:aggregate_r}
    \FOR{$e=0, \ldots, \tau-1$}
    \label{alg:FR:line:loop_begin}
        \FOR{$m \in \{0, \cdots, M-1\}$}
            \STATE Each client $k$ computes $Q_{\theta^e_{k,m}}$
            \label{alg:FR:line:local_update_begin}
        % \STATE $\theta_{k, t+1} = \theta_{k,t} - \eta \partial_{\theta} ( \E[ \ell(f(\theta_k, X), Y) ] + \lambda H(\theta_k, \v) )$\label{alg:FR:line:local_update_end}
            \STATE $\theta^e_{k, m+1} = \theta^e_{k,m} - \eta \partial_{\theta} ( \widehat L_k(\theta^e_{k,m}) + \lambda \hat H(\theta^e_{k,m}, \v) )$\label{alg:FR:line:local_update_end}
        \ENDFOR
        \FOR{$c \in \{1, \ldots, C\}$}
        \label{alg:FR:line:local_begin}
            \FOR{$p \in \{1, \ldots, P\}$}
                \STATE Compute $\{\bar j^{e}_k(c, p),\bar u^{e}_k(c)\}$ following Eq.  (\ref{eq:local_aggregation})
            \ENDFOR
        \ENDFOR
        \label{alg:FR:line:local_end}
            
        \STATE Upload $\{\bar j^{e}_k(c, p),\bar u^{e}_k(c)\}$ and $\theta^{e}_{k,M}$
        \label{alg:FR:line:upload}

        \STATE \textbf{Option I Synchronous FedR\'enyi}:
        \label{alg:FR:line:option_1}

            % \STATE \quad Aggregate $\theta_{t+1}$, $\hat j(c,p)$, $\hat u(c)$ following Eq. (\ref{eq:aggregate_sysnchronous}).
            % \label{alg:FR:line:aggregate}

        \STATE \,\,  $\theta^{e+1}_0 = \sum_{k=1}^K \gamma_k \theta^{e}_{k,M} $
        \label{alg:FR:line:aggregate_theta}
            
        \STATE \,\,  $ \hat j^{e+1}(c,p) = \sum_{k=1}^K \gamma_k \bar j^{e}_k(c, p) $
        \label{alg:FR:line:aggregate_j}
            
        \STATE \,\, $\hat u^{e+1}(c) = \sum_{k=1}^K \gamma_k \bar u^{e}_k(c)$
        \label{alg:FR:line:aggregate_u}
            
        \STATE \textbf{Option II Asynchronous FedR\'enyi}:
        \label{alg:FR:line:option_2}
            
        \STATE \,\, Find the straggler set $I^{e+1}$, where $ | I^{e+1} | = \widetilde K^{e+1}$
        \label{alg:AFR:line:stragglers}
            
        \STATE \,\, Find neighbor set $Rob_{\zeta}(\widetilde k)$ for stragglers $\widetilde k \! \in\! I^{e\! +\! 1}$
        \label{alg:AFR:line:find_neighbor}
            
        \STATE \,\, Approximate the $\widetilde \theta^e_{\widetilde k, M}, \widetilde j^e_{\widetilde k}(c, p), \widetilde u^e_{\widetilde k}(c) $ for all \\
        \,\, stragglers $\widetilde k\in I^{e+1}$ by Algorithm \ref{alg:LA}
        \label{alg:AFR:line:approximation}
             
            % \STATE \, Aggregate $\theta_{t+1}$, $\hat j(c,p)$, $\hat u(c)$ following Eq. (\ref{eq:aggregate_asysnchronous}).
            % \label{alg:AFR:line:aggregate_theta}

        \STATE \,\, $\theta^{e+1} = \sum_{k=1}^{K-\widetilde K^{e+1}} \gamma_k \theta^e_{k, M} + \sum_{\widetilde k=1}^{\widetilde K^{e+1}} \gamma_{\widetilde k} \widetilde \theta^e_{\widetilde k, M}$
        \label{alg:AFR:line:aggregate_theta}

        \STATE \,\, \begin{small}  $\hat j^{e+1}(c, p) \! = \! \sum_{k=1}^{K\! -\! \widetilde K^{e +1}} \gamma_k \bar j^e_{k}(c,p) \! + \! \sum_{\widetilde k=1}^{\widetilde K^{e + 1}} \gamma_{\widetilde k} \widetilde j^e_{\widetilde k}(c, p)$ \end{small} 
        \label{alg:AFR:line:aggregate_j}

        \STATE  \,\, \begin{small}  $\hat u^{e+1}(c) \!= \sum_{k=1}^{K -\widetilde K^{e+1}} \! \gamma_k \bar u^e_{k}(c) +  \sum_{\widetilde k=1}^{\widetilde K^{e+1}} \gamma_{\widetilde k} \widetilde u^e_{\widetilde k}(c)$\end{small}
        \label{alg:AFR:line:aggregate_u}
            
        \STATE Compute $\hat Q^{e\! + \! 1}_{\theta}$ where each entry \begin{small} $\hat q^{e \! + \! 1}_{c,p} = \frac{ \hat j^{e\! + \! 1}(c, p) \cdot \hat r(p)}{ \sqrt{ \hat u^{e\! + \! 1}(c) \cdot \hat r(p)} }$ \end{small}
        \label{alg:FR:line:aggregate_end}
            
        \STATE  $\v^{e+1} \leftarrow \arg\max_{\v\bot \hat \v_1} [\widehat L( \theta^{e+1}) + \lambda \widehat H(\theta^{e+1}, \v)]$
        \label{alg:FR:line:compute_v}
            
        \STATE Broadcast $\theta^{e+1}$ and $\v^{e+1}$ to all clients $k \in [K]$
        \label{alg:FR:line:broadcast}
    \ENDFOR
    \label{alg:FR:line:loop_end}
\end{algorithmic}
\end{algorithm}

\subsection{Synchronous FedR\'enyi}
\label{subsection:Synchronous_FedR\'enyi}
% \yanred{
% Noticed that, due to the privacy constraint, we consider counting $\bar j_k(c,p)$, $\bar u_k(c)$ and $\bar r_k(p)$ on each client in Eq. (\ref{eq:tuple}) and aggregating $\hat j(c,p)$, $\hat u(c)$ and $\hat r(p)$ on the server side, which are corresponding to $\bar j(c,p)$, $\bar u(c)$ and $\bar r(p)$ in 
% Eq. (\ref{eq:original}).
% }
\subsubsection{Algorithm Design}

To solve problem (\ref{eq:fedrenyi_empirical_objective}) without violating privacy constraints in FL, 
we propose the FedR\'enyi algorithm (summarized in Algorithm \ref{alg:FR}). 
Specifically,
we first initialize $\theta^0_0$, $\v^0$.
Then we compute $\bar r_k(p)$ and aggregate $\hat r(p)$ (see Line \ref{alg:FR:line:init} to Line \ref{alg:FR:line:aggregate_r}). 
During each communication round $e$, clients update the local model $\theta^e_{k,m+1}$ for $m \in \{0, \cdots, M-1\}$ (see Line \ref{alg:FR:line:local_update_begin} to Line \ref{alg:FR:line:local_update_end}).
% Before proceeding iterations, we first compute $\bar r_k(p)$ and aggregate $\hat r(p)$ to decrease the time complexity (see Line \ref{alg:FR:line:comput_rk} to Line \ref{alg:FR:line:aggregate_r}). 
% Then, during the iteration, we start with updating local model parameter $\theta_{k,t+1}$(see Line \ref{alg:FR:line:local_update_begin} to Line \ref{alg:FR:line:local_update_end}).
After completing local updates, 
each client calculates
$\bar j^e_k(c,p)$ and $\bar u^e_k(c)$ (see Line \ref{alg:FR:line:local_begin} to Line \ref{alg:FR:line:local_end}), and then uploads these statistics and the local model $\theta^e_{k,M}$ to the server (see Line \ref{alg:FR:line:upload}).
For synchronous FedR\'enyi (Option I), the server aggregates the global model $\theta^{e+1}$, global statistics $\hat j^{e+1}(c,p)$, and $\hat u^{e+1}(c)$ (see Line \ref{alg:FR:line:aggregate_theta} to \ref{alg:FR:line:aggregate_u}). 
Next, we compute the matrix $\widehat Q^{e+1}_{\theta}$ and then apply the SVD method to calculate $\v^{e+1}$ (see Line \ref{alg:FR:line:aggregate_end} and \ref{alg:FR:line:compute_v}).
Finally, the server broadcasts the global model $\theta^{e+1}$ and the fairness component $\v^{e+1}$ to each client (see Line \ref{alg:FR:line:broadcast}). 
% \begin{align}
% \label{eq:aggregate_sysnchronous}
% &
% \theta_{t+1} = \sum_{k=1}^K \gamma_k \theta_{k, t+1}, 
% \nonumber \\
% &
% \hat j(c,p) = \sum_{k=1}^K \gamma_k \bar j_k(c, p), 
% \nonumber \\
% &
% \hat u(c) = \sum_{k=1}^K \gamma_k \bar u_k(c). 
% \end{align}



% \begin{algorithm}[!t]
% \caption{FedR\'enyi Algorithm }
% \label{alg:FR}
% \begin{algorithmic}[1]
%     \STATE Initialize $\theta_0$, $\v_0$ and hyperparameter $\lambda$, $T$, $I$ and $\eta$ on server and clients.
%     \label{alg:FR:line:init}
%     \STATE Each client $k \in K$ compute $\bar r_k(p)$ following Eq. (\ref{eq:local_aggregation}) and upload $\bar r_k(p)$ and $n_k$. 
%     \label{alg:FR:line:comput_rk}
%     \STATE Server aggregate $\hat r(p) = \sum_{k=1}^K \gamma_k \bar r_k(p)$ and $\hat \v_1 = [\sqrt{\hat r(1) } , ..., \sqrt{ \hat r(P) } ] $ 
%     \label{alg:FR:line:aggregate_r}
%     \FOR{$t=0, \ldots, T-1$}
%     \label{alg:FR:line:loop_begin}
%         \STATE Each client $k$ compute $Q_{\theta_k}$
%         \label{alg:FR:line:local_update_begin}
%         % \STATE $\theta_{k, t+1} = \theta_{k,t} - \eta \partial_{\theta} ( \E[ \ell(f(\theta_k, X), Y) ] + \lambda H(\theta_k, \v) )$\label{alg:FR:line:local_update_end}
%         \STATE $\theta_{k, t+1} = \theta_{k,t} - \eta \partial_{\theta} ( \widehat L_k(\theta_k) + \lambda H(\theta_k, \v) )$\label{alg:FR:line:local_update_end}
%         \IF[Communicate]{$(t+1) \bmod I=0$} 
%         \label{alg:FR:line:communication_begin}
%             \FOR{$c \in \{1, \ldots, C\}$}
%             \label{alg:FR:line:local_begin}
%                 \FOR{$p \in \{1, \ldots, P\}$}
%                     \STATE Compute $\{\bar j_k(c, p),\bar u_k(c)\}$ following Eq.  (\ref{eq:local_aggregation}).
%                 \ENDFOR
%             \ENDFOR
%             \label{alg:FR:line:local_end}
            
%             \STATE Upload $\{\bar j_k(c, p),\bar u_k(c)\}$ and $\theta_{k, t+1}$.
%             \label{alg:FR:line:upload}
            
%             \STATE $\theta_{t+1} \leftarrow \sum_{k=1}^K \gamma_k \theta_{k, t+1}$
%             \label{alg:FR:line:aggregate_begin}
%             \STATE $\hat j(c,p) = \sum_{k=1}^K \gamma_k \bar j_k(c, p) $
%             \STATE $\hat u(c) = \sum_{k=1}^K \gamma_k \bar u_k(c)$
%             \STATE Compute $\hat Q_\theta$ where each entry $\hat q_{c,p} = \frac{ \hat j(c, p) \cdot \hat r(p)}{ \sqrt{ \hat u(c) \cdot \hat r(p)} }$
%             \label{alg:FR:line:aggregate_end}
%             \STATE $\v \leftarrow \arg\max_{\v\bot \hat \v_1} [\widehat L( \theta_{t+1}) + \lambda \widehat H(\theta_{t+1}, \v)]$
%             \label{alg:FR:line:compute_v}
%             \STATE Broadcast $\theta_{t+1}$ and $\v$ to all clients $k \in K$.
%             \label{alg:FR:line:broadcast}
%         \ENDIF
%         \label{alg:FR:line:communication_end}
%     \ENDFOR
%     \label{alg:FR:line:loop_end}
% \end{algorithmic}
% \end{algorithm}

\subsubsection{Theoretical Analysis}
\label{subsubsection:theoretical_analysis_synchronous}

In this part, we first study the estimation error for synchronous FedR\'enyi from empirically aggregated $\widehat H(\theta, \v)$ to population $H(\theta, \v)$ under two weighting schemes in Theorem \ref{theorem:estimation_error_synchronous}.
Then, we discuss the convergence guarantee of synchronous FedR\'enyi in Proposition \ref{proposition:convergence_based_on_scaffold}.
Particularly, we show how the estimation error bound derived in Theorem \ref{theorem:estimation_error_synchronous} benefits the convergence guarantee.
% for achieving $\epsilon$-stationary solution.
        
% \paragraph{Estimation error analysis.}
We first prove that, under mild conditions, the estimation error is bounded by $O(1/\sqrt{n})$, which significantly improves upon the prior result $O(1/\sqrt{K})$ reported in \cite{chu2021fedfair},
leading to improved accuracy and stability in fairness estimation.

% The achieved smaller estimation error implies a stronger fairness guarantee from FedR\'enyi.
% \footnote{
% We show that under mild conditions, the estimation error under both schemes achieves $O(1/\sqrt{n})$, which significantly improves the prior result $O(1/\sqrt{K})$ \cite{chu2021fedfair}, implying a stronger fairness guarantee from the R\'enyi correlation.
% }

% The generalization error of the Renyi-regularized objective in (\ref{eq:fedrenyi_empirical_objective}), i.e., $\widehat L(\theta) + \lambda H(\theta, \v)$, is derived as $O(1/\sqrt{n})$ in Corollary \ref{corollary:generalization_error_fedrenyi_empirical_objective}. 
% \begin{theorem}
% \label{theorem:estimation_error_synchronous}
% (Estimation error of R\'enyi regularization under two schemes) 
% Suppose $j_\textmin \backsim u_\textmin \backsim r_\textmin = O(1)$ and $n_\textmin \backsim \frac{n}{K \log(K)}$.
% When $\gamma_k = \frac{n_k}{n}$ or $\frac{1}{K}$, with probability at least $1-\delta$ for $\delta >0$, the following inequality holds:
% \begin{align}
% \begin{aligned}
% &
% \widehat H(\theta^{e+1}, \v) - H(\theta^{e+1}, \v)
% \leq 
% O \big(1/\sqrt{n} \big).
% \end{aligned}
% \end{align}
% \end{theorem}
\begin{theorem}
\label{theorem:estimation_error_synchronous}
(Estimation error of R\'enyi regularization for synchronous FedR\'enyi) 
Suppose $j_\textmin \backsim u_\textmin \backsim r_\textmin = O(1)$ and $n_\textmin \backsim \frac{n}{K \log(K)}$.
When $\gamma_k = \frac{n_k}{n}$ or $\frac{1}{K}$, for any global model $\theta$ and $\delta \in (0,1)$, the following inequality holds:
\begin{align*}
\P \Big [
\widehat H(\theta, \v) - H(\theta, \v)
\leq 
O \big(1/\sqrt{n} \big) \big | \theta \Big ]
\geq 1 - \delta.
\end{align*}
\end{theorem}
\begin{remark}
The above theorem shows that for any fixed global model $\theta$, the estimation error between the population $H(\theta, \v)$ and the empirically aggregated $\widehat H(\theta, \v)$ is bounded by $O(1/\sqrt{n})$ with high probability under both weighting schemes. 
% The following corollary shows the condition to achieve $O(1/\sqrt{n} )$ in the uniform-over-client scheme.
Compared with previous results on the order of $O(1/\sqrt{K})$, FedR\'enyi significantly reduces estimation error, achieving a bound that is comparable to the standard estimation error in centralized settings \cite{mohri2018foundations}.
\end{remark}

% \begin{theorem}
% \label{theorem:estimation_error_uniform_over_sample}
% (Estimation error of R\'enyi regularization under uniform-over-sample scheme) 
% When $\gamma_k = \frac{n_k}{n}$, with probability at least $1-\delta$ for $\delta >0 $, the following inequality holds:
% \begin{align*}
% \widehat H(\theta, \v) \! - \! H(\theta, \v) 
% \! \leq \! 
% C^2 P^2 \! \Bigg(\frac{ \log(2 / \delta)}{ 2 n } \! + \! 2\sqrt{\frac{ C \log(2 / \delta)}{ 2 n \rho}} \Bigg) .
% \end{align*}
% \end{theorem}

% \begin{remark}
% This theorem shows that with high probability, the estimation error from global population to local empirical regularization function is upper bound by $O \Big (\frac{1}{\sqrt{n}} \Big )$ in uniform-over-sample scheme. 
% A larger number of data could improve the approximation accuracy.
% \end{remark}


% \begin{theorem}
% \label{theorem:estimation_error_uniform_over_client}
% (Estimation error of R\'enyi regularization under uniform-over-sample scheme) 
% Suppose $j_\textmin \backsim u_\textmin \backsim r_\textmin = O(1)$.
% When $\gamma_k = \frac{1}{K}$, with probability at least $1-\delta$ for $\delta >0$, the following inequality holds:
% \begin{align}
% \begin{aligned}
% &
% \widehat H(\theta, \v) - H(\theta, \v)
% \leq 
% C^2 P^2 \Bigg(\frac{ \log(2K / \delta) \cdot \log(4/\delta) }{ K n_\textmin } \\ 
% & \quad \quad \quad \quad \quad \quad \quad \quad
% + 2 \sqrt{\frac{C \log(2K / \delta) \cdot \log(4/\delta) }{ K n_\textmin \rho }} \Bigg) .
% \end{aligned}
% \end{align}
% \end{theorem}

% \begin{remark}
% The above theorem show that with high probability, the estimation error for global $H(\theta, \v)$ with empirically aggregated $\widehat H(\theta, \v)$ is bounded by $O(1/\sqrt{n})$ in the uniform-over-sample scheme and $O \Big (\sqrt{\frac{\log(K)}{K n_\textmin}} \Big )$ in the uniform-over-client scheme, respectively. 
% The following corollary shows the condition to achieve $O(1/\sqrt{n} )$ in the uniform-over-client scheme.
% \end{remark}

% \begin{corollary}
% \label{corollary_1}
% Suppose that assumptions in Theorem \ref{theorem:estimation_error_uniform_over_client} hold.
% If $n_\textmin \backsim \frac{n}{K \log(K)}$, then
% $
% % \begin{align}
% \frac{ \log(2K / \delta) \cdot \log(4/\delta) }{ K n_\textmin} 
% =
% O (1/n ) .
% % \end{align}
% $
% \end{corollary}
% \noindent
% The above corollary states that if all client-wise distributions are not very skewed ($n_\textmin \backsim \frac{ n }{ K \log(K) }$), then for the uniform-over-sample weighting scheme, the local aggregation gives tight estimation with error bounded by $O(1/\sqrt{n})$, as the same result with the uniform-over-client scheme in Theorem \ref{theorem:estimation_error_uniform_over_client}.



% After deriving the estimation error of $\widehat H(\theta, \v)$, we can proceed to the following corollary for the estimation error of the R\'enyi-regularized objective in (\ref{eq:fedrenyi_empirical_objective}).


% \begin{corollary}
% \label{corollary:generalization_error_fedrenyi_empirical_objective}
% The estimation error of the R\'enyi-regularized objective in problem (\ref{eq:fedrenyi_empirical_objective}) is:\\
% (i) when $\gamma_k= \frac{n_k}{n}$:
% \begin{align*}
% \bigg | \Big[L(\theta) + \lambda H(\theta, \v) \Big] - 
% \Big[ \widehat L(\theta) + \lambda \widehat H(\theta, \v) \Big] \bigg |
% \leq 
% O ( 1 / \sqrt{n} );
% \end{align*}
% (ii) supposing that assumptions in Theorem \ref{theorem:estimation_error_uniform_over_client} hold:
% \begin{align}
% &
% \bigg | \Big[L(\theta) + \lambda H(\theta, \v) \Big] - 
% \Big[ \widehat L(\theta) + \lambda \widehat H(\theta, \v) \Big] \bigg |
% \nonumber \\
% & \leq 
% O \Big(\frac{1}{\sqrt{n}} + \frac{1}{\sqrt{K n_\textmin}} \Big)
% = O(1/\sqrt{n})
% .
% \end{align}
% \end{corollary}



% \begin{remark}
% Corollary \ref{corollary:generalization_error_fedrenyi_empirical_objective} shows that the upper bound orders of total estimation error under uniform-over-sample and uniform-over-client setting are $O \Big (\frac{1}{\sqrt{n}} \Big )$ and $O \Big (\frac{1}{\sqrt{n}} + \frac{1}{\sqrt{K n_\textmin}} \Big )$ respectively. Under the condition of Corollary \ref{corollary 1}, two orders are the same.
% \end{remark}

% \begin{proposition}
% \label{proposition:convergence}
% (The convergence analysis, Theorem 31 in \cite{jin2020local}) Suppose that $\big ( L(\theta) + \lambda H(\theta, \v)  \big )$ is $\ell_1$-smooth and $\beta < \frac{1}{2 \ell_1}$, then the output $\theta_T$ of Algorithm \ref{alg:FR} with step size $\eta = \frac{1}{\sqrt{T+2}}$ will satisfy: \\
% \begin{align*}
% \E[ \| \nabla \phi_{\beta}(\theta_{T}) \|^2] \leq 
% 2 \frac{ \big ( \phi_{\beta}(\theta_0) - \min_{\theta'} \phi(\theta') \big) + 2 \ell_1^2}{\sqrt{T+2} } + 4 \ell_1 \varepsilon,
% \end{align*}
% where $\varepsilon = O \Big (\frac{1}{\sqrt{n}} \Big )$ when $\gamma_k= \frac{n_k}{n}$ or $O \Big(\frac{1}{\sqrt{n}} + \frac{1}{\sqrt{K n_\textmin}} \Big)$ when assumptions in Theorem \ref{theorem:estimation_error_uniform_over_client} hold, respectively. 
% \end{proposition}
% \begin{remark}
% Proposition \ref{proposition:convergence} demonstrates the convergence of Algorithm \ref{alg:FR}. 
% This proposition shows that except the additive error which is combined with the previous estimation errors of two configurations, the remaining term decreases at a rate of $O(1/T^4)$. Comparing with previous work where the estimation error is bounded by $O \big (\frac{1}{\sqrt{K}} \big )$ (\cite{chu2021fedfair}), the decreased upper bound could theoretically guarantee the Algorithm \ref{alg:FR} to find the $\varepsilon$-nearly stationary point where the gradient is more close to $0$.
% \end{remark}

Then, we analyze the convergence guarantee of synchronous FedR\'enyi to achieve an $\epsilon$-stationary solution.
% We first define $\phi( \theta) = \max_{\v \perp \hat \v_1, \|\v\|^2 \leq 1} \big ( L(\theta) + \lambda H(\theta, \v)  \big )$ as the primal function of the model $\theta$. 
% According to \cite{rafique2022weakly,jin2020local}, if $\big ( L(\theta) + \lambda G( \theta, \v)  \big )$ is $\ell_1$-smooth in $\v$, then $\phi( \theta)$ is $\ell_1$-weakly convex. 
% Therefore, we could find the nearly $\varepsilon$-stationary point of $\phi(\theta)$ with its Moreau envelope: $\phi_{\beta}(\theta) := \min_{\theta'} \phi(\theta') + \frac{1}{2 \beta} \| \theta - \theta'\|^2$.   
We first denote 
$F(\theta) = L(\theta) + \max_{ \v \perp \v_1, \|\v\|^2 \leq 1 } \lambda H(\theta, \v),$ and
$\widehat F(\theta) = \widehat L(\theta) + \max_{ \v \perp \v_1, \|\v\|^2 \leq 1 }  \lambda \widehat H(\theta, \v)$.
Then, we consider the uniform-over-client weighting scheme following \cite{karimireddy2020scaffold}, i.e., $\gamma_k = 1/K$.
We highlight that the convergence analysis for the empirical measure $\E[ \| \nabla \widehat F(\theta) \|^2 ]$ computed on finite samples follows the same framework as previous work \cite{karimireddy2020scaffold,FedProx}. 
Our main target, the convergence analysis for the population measure $\E[ \| \nabla F(\theta) \|^2 ]$, is more difficult to achieve because it must account for the estimation error between $H(\theta, \v)$ and $\widehat H(\theta, \v)$.

\begin{proposition}
\label{proposition:convergence_based_on_scaffold}
(Convergence of synchronous FedR\'enyi)
Suppose $\eta \leq O(1/M)$ and $L_k(\theta)$ satisfies $(G_L, B_L)$-bounded gradient dissimilarity, where $\frac{1}{K} \sum^K_{k=1}  \| \frac{ \partial L_k(\theta) }{ \partial \theta } \|^2 \leq G_L^2 + B_L^2 \| \frac{ \partial L(\theta) }{ \partial \theta } \|^2$.
If $\| \frac{ \partial L(\theta) }{ \partial \theta } \|^2, \| \frac{ \partial Q_\theta }{ \partial \theta } \|^2, \| \frac{ \partial \widehat Q_\theta }{ \partial \theta } \|^2, \| \frac{ \partial \v_\theta }{ \partial \theta } \|^2$ are bounded by $\bar G $ for all $\theta$, 
then $H(\theta, \v)$ satisfies $(G_H, B_H)$-bounded gradient dissimilarity, where $G_H$ is $\frac{ C \bar G ( \rho + C) }{\rho^2}$ and $B_H$ is $1$.
$F(\theta)$ also satisfies $(G_F, B_F)$-bounded gradient dissimilarity, where $B_F = 2 B_L^2 $ and $G_F = 2G_L^2 + (4 \lambda - 2 B_L^2\lambda^2) \frac{ C \bar G ( \rho + C) }{\rho^2}
+ 4 B_L^2 \lambda \cdot \sqrt{\frac{ C \bar G^2 ( \rho + C) }{\rho^2}}$. 
Thus, the FedR\'enyi algorithm achieves $\E[ \| \nabla \widehat F(\theta_T) \|^2 ] \leq \epsilon $ and $\E[ \| \nabla F(\theta_T) \|^2 ] \leq \epsilon + O\Big( \frac{1}{n} + \max_{\theta} \big\| \frac{ \partial \widehat Q_\theta }{ \partial \theta } - \frac{ \partial Q_\theta }{ \partial \theta } \big\|^2 \Big)$ when $T \geq O(1/\epsilon^2)$.
\end{proposition}

\begin{remark}
The above proposition shows that the empirical version of the synchronous FedR\'enyi algorithm could converge to an $\epsilon$-stationary solution and the population version could converge to an approximate $\epsilon$-stationary solution with gap $O\Big( \frac{1}{n} + \max_{\theta} \big\| \frac{ \partial \widehat Q_\theta }{ \partial \theta } - \frac{ \partial Q_\theta }{ \partial \theta } \big\|^2 \Big)$, while the former is at the same order as previous work \cite{karimireddy2020scaffold,FedProx}.
From Theorem \ref{theorem:estimation_error_synchronous}, we know that the estimation error is bounded by $O(1/\sqrt{n})$, which is explicitly present in the above result.
Thus, the improved estimation can benefit not only the fairness guarantee but also the convergence.
\end{remark}

\subsection{ Asynchronous FedR\'enyi}
\label{subsection:Asynchronous_FedR\'enyi}
% To deal with client heterogeneity, we propose asynchronous FedR\'enyi algorithm. 
% Before describing the asynchronous FedR\'enyi algorithm, it is essential to first analyze the issue caused by system heterogeneity. 
% In a federated setting, disparities in computational and communication capacities among clients significantly impact the communication and aggregation processes. 
% Specifically, devices that exceed the communication time threshold or have insufficient local computation iterations, referred to as stragglers, fail to provide useful local updating information. 
% To address the key challenge of estimating the global R\'enyi correlation in the presence of stragglers, we propose an asynchronous variant of the FedR\'enyi algorithm.
\subsubsection{Algorithm Design}

In asynchronous FL, stragglers may fail to provide timely updates, which can compromise accurate estimation of global R\'enyi correlation. 
To overcome this challenge, following \cite{zhang2021parameterized,wang2021accelerated}, we assume that clients with similar empirical prediction distributions also have comparable data distributions, making the nearest-neighbor approximation a reasonable strategy for estimating missing updates. 
Consequently, even if a client fails to upload its model and local statistics within the communication threshold, its contribution to the global fairness measure can still be estimated reliably using its robust neighbors.

% \paragraph{Straggler Approximation.}
Specifically, 
we first identify a robust neighbor set for each straggler. 
Define $Rob_{\zeta}(\widetilde k)$ as the neighbor set of straggler $\widetilde k$, where $Rob_{\zeta}(\widetilde k) := \{k': \| \omega \bar u_{k'}(c) - \bar u_{\widetilde k}(c)\| \leq \zeta,  k' \in [K], \forall c \text{ and } \forall \omega \in (0,1) \}$.
Next, we compute the similarity between clients.
Let $dist(\cdot, \cdot)$ denote the Euclidean distance, which represents the dissimilarity. 
The similarity between the $k$-th and $k'$-th client is then quantified by weights $W_{k,k'}$,
where larger weights indicate greater similarity.
Based on the local statistics uploaded in the first communication round, for a straggler $\widetilde k$, we define: 
\begin{align}
    \label{eq:similarity_weight}
    &
    W^{\widetilde k,k'}_{\theta} = \exp \Big (\frac{ -dist (\theta^0_{\widetilde k,M}, \theta^0_{k',M})}{\rho} \Big), k' \! \in \! [K],
     \\
    &
    W^{\widetilde k,k'}_{j} \! = \! \exp \Big (\frac{ -dist (\bar j^0_{\widetilde k \! , \! M}(c \! ,\! p), \bar j^0_{k'\! ,\! M} (c \!, \! p) )}{\rho} \Big), k' \! \in \! Rob_{\zeta}(\widetilde k),
    \nonumber \\
    &
    W^{\widetilde k,k'}_{u} = \exp \Big (\frac{ -dist (\bar u^0_{\widetilde k,M}(c), \bar u^0_{k',M} (c) )}{\rho} \Big), k' \in Rob_{\zeta}(\widetilde k),
    \nonumber
\end{align} 
where $\rho$ is the temperature parameter.

Then the server approximates the model parameter $\widetilde \theta^e_{\widetilde k, M}$ and statistics $\widetilde j^e_{\widetilde k, M}(c, p)$ and $\widetilde u^e_{\widetilde k, M}(c)$ for each straggler $\widetilde k$. 
We summarize the approximation method in Algorithm \ref{alg:LA}. 

\begin{algorithm}[t]
\caption{Localized Approximation}
\label{alg:LA}
\begin{algorithmic}[1]

\STATE \textbf{Input}: $\{\bar j^e_{k, M}(c, p), \bar u^e_{k, M}(c), \theta^e_{k, M}\}$ for all non-straggler clients $k \in [K] \backslash I^{e+1}$, and temperature parameter $\rho$.

% 2023/07/23 Edited by Zhiyong Ma
\STATE Compute $W_{\widetilde k, k'}$ for all stragglers following Eq. (\ref{eq:similarity_weight}).

% \STATE For each straggler $i$, $W^{\theta}_{i, k} = exp(\frac{-dist(\theta_{a, t}, \theta_{b, t})}{\rho})$.

\STATE For each straggler $\widetilde k$, $\widetilde \theta^e_{\widetilde k, M} = \frac{\sum_{ k'=1}^{K -\widetilde K^{e+1}} W^{\widetilde k,k'}_{\theta} \theta^e_{k', M}}{\sum_{k'=1}^{K -\widetilde K^{e+1}} W_{\theta}^{\widetilde k, k'}}$.

% \STATE $W^{j}_{i, k} = (\frac{2\lambda+1}{2}-\frac{\sqrt{4\lambda+1}}{2}) \cdot dist(\bar j_{a, t}(c, p), \bar j_{b, t}(c, p)).$

\STATE $\widetilde j^e_{\widetilde k, M}(c, p) = \frac{\sum_{k' \in Rob_{\zeta}(\widetilde k)} W^{\widetilde k,k'}_{j}\bar j^e_{k', M}(c, p)}{\sum_{k' \in Rob_{\zeta}(\widetilde k)} W^{\widetilde k,k'}_{j} }$.

% \STATE $W^{u}_{i} = (-\frac{1}{2}+\frac{\sqrt{4\lambda+1}}{2}) \cdot dist(\bar u_{a, t}(c), \bar u_{b, t}(c)).$

\STATE $\widetilde u^e_{\widetilde k, M}(c) = \frac{\sum_{k' \in Rob_{\zeta}(\widetilde k)} W^{\widetilde k,k'}_{u} \bar u^e_{k', M}(c)}{\sum_{k' \in Rob_{\zeta}(\widetilde k)} W^{\widetilde k,k'}_{u}}$.

\end{algorithmic}
\end{algorithm}

After approximating local models and statistics for stragglers, we integrate these approximations into an asynchronous FedR\'enyi algorithm to compute the global R\'enyi correlation (see Option II in Algorithm \ref{alg:FR}).
In each round $e+1$, the server identifies the set of stragglers $I^{e+1}$ (with $ | I^{e+1} | = \widetilde K^{e+1}$) as those clients that either fail to return their local model and statistics within the communication threshold or have a local update timestamp below $M$ (see Line \ref{alg:AFR:line:stragglers}). 
For each straggler $\widetilde k$, the server selects its robust neighbor set $Rob_{\zeta}(\widetilde k)$ (see Line \ref{alg:AFR:line:find_neighbor}).
Next, the server approximates the missing statistics $\widetilde j^e_{\widetilde k, M}(c, p)$ and $\widetilde u^e_{\widetilde k, M}(c)$, as well as the model $\widetilde \theta^e_{\widetilde k, M}$ for each straggler $\widetilde k$ by Algorithm \ref{alg:LA} (see Line \ref{alg:AFR:line:stragglers} and \ref{alg:AFR:line:approximation}).
Finally, the global R\'enyi regularization statistics $\hat j^{e+1}(c,p)$, $\hat u^{e+1}(c)$, and the global model $\theta^{e+1}$ are aggregated (see Line \ref{alg:AFR:line:aggregate_theta} to \ref{alg:AFR:line:aggregate_u}). 
% Finally, the server broadcasts $\theta_{t+1}$ and R\'enyi correlation factor $\v$ to all connected clients (see Line \ref{alg:FR:line:broadcast}). 
% Specially, we regard all clients whose result is received by the server at $t+1$ round as connected clients.

% \begin{algorithm}[t]
%     \caption{Asynchronous FedR\'enyi}
%     \label{alg:AFR}
%     \begin{algorithmic}[1]
%     \STATE Initialize $\theta_0$, $\v_0$ and hyperparameter $\lambda$, $T$, $I$ and $\eta$ on server and clients.
%     \label{alg:AFR:line:init}
    
%     \STATE Each client $k \in K$ compute $\bar r_k(p)$ following Eq. (\ref{eq:local_aggregation}) and upload $\bar r_k(p)$ and $n_k$. 
%     \label{alg:AFR:line:comput_rk}

%     \STATE Server aggregates $\hat r(p) = \sum_{k=1}^K \gamma_k \bar r_k(p)$ and $\hat \v_1 = [\sqrt{\hat r(1) } , ..., \sqrt{ \hat r(P) } ] $ 
%     \label{alg:AFR:line:aggregate_r}
    
%     \FOR{$t=0, \ldots, T-1$}
%     \label{alg:AFR:line:loop_begin}
%         \STATE Each client $k$ compute $Q_{\theta_k}$
%         \label{alg:AFR:line:local_update_begin}
%         \STATE $\theta_{k, t+1} = \theta_{k,t} - \eta \partial_{\theta} ( \E[ \ell(f(\theta_k, X), Y) ] + \lambda H(\theta_k, \v) )$
%         \label{alg:AFR:line:local_update_end}
%         \IF[Communicate]{$(t+1) \bmod I=0$} 
%         \label{alg:AFR:line:communication_begin}
%             \STATE Same operations as step \ref{alg:FR:line:local_begin} - \ref{alg:FR:line:upload} in Algorithm \ref{alg:FR}.
%             \STATE Find the stragglers $I^{e+1}$
%             \label{alg:AFR:line:stragglers}

%             \STATE Server finds the robust neighbors $Rob_{\zeta}(k)$ for each stragglers $k\in I^{e+1}$.
%             \label{alg:AFR:line:find_neighbor}
            
%             \STATE Approximate the $\widetilde \theta_{i, t+1}, \widetilde j_{i, t+1}(c, p), \widetilde u_{i, t+1}(c) $ for all stragglers $k\in I^{e+1}$ by Algorithm \ref{alg:LA}.
%             \label{alg:AFR:line:approximation}
%             \STATE Aggregates $\theta_{t+1}$, $\hat j(c,p)$, $\hat u(c)$ following Eq. (\ref{eq:aggregate_asysnchronous}).
%             \label{alg:AFR:line:aggregate}
%             \STATE Compute $\hat Q_\theta$ where each entry $q_{c,p} = \frac{ \hat j(c, p) \cdot \hat r(p)}{ \sqrt{ \hat u(c) \cdot \hat r(p)} }$
%             \label{alg:AFR:line:aggregate_end}
%             \STATE $\v \leftarrow \arg\max_{\v\bot \hat \v_1} [\widehat L( \theta_{t+1}) + \lambda \widehat H(\theta_{t+1}, \v)]$
%             \label{alg:AFR:line:compute_v}
%             \STATE Broadcast $\theta_{t+1}$ and $\v$ to all clients $k \in K$.
%             \label{alg:AFR:line:broadcast}
            
%         \ENDIF
%         \label{alg:AFR:line:communication_end}
%     \ENDFOR
%     \label{alg:AFR:line:loop_end}
%     \end{algorithmic}
% \end{algorithm}

\subsubsection{Theoretical Analysis}
\label{subsubsection:theoretical_analysis_asynchronous}

In this part, we first analyze the approximation error of each straggler from actual local statistics $\bar j^e_{\widetilde k, M}(c, p)$, $\bar u^e_{\widetilde k, M}(c)$ and model $\theta^e_{\widetilde k, M}$ to approximated statistics $\widetilde j^e_{\widetilde k, M}(c, p)$, $\widetilde u^e_{\widetilde k, M}(c)$ and model $\widetilde \theta^e_{\widetilde k, M}$ in Proposition \ref{proposition:approximation_error}.
Then, in Theorem \ref{theorem:estimation_error_asynchronous}, we analyze the estimation error of the asynchronous FedR\'enyi algorithm, explicitly accounting for the approximation error.

Before analyzing the approximation error, we first make the following assumptions:
% Then, we show the generalization error of Asynchronous FedR\'enyi within the approximation error of stragglers.
% Before proving the approximation error, we first define $\P [f_{\theta_{k,t+1}}(X_k,S_k) = c ]$ as the probability of model prediction on client $k$. 
% Then, we define stage $s$ as two consecutive communication rounds. 
% We denote the total iteration of stage $s$ as $M$.
% \begin{assumption}
% \label{assumption:cocoercive}
% ($\eta$-expansive, Definition 2.3 in \cite{hardt2016train})
% For all clients, the updating rule is $\eta$-expansive if:
% \begin{align*}
% \max_{k, k' \in [K]} \frac{\| \theta_{k, t+1} - \theta_{k', t+1}\|}{\| \theta_{k, t} - \theta_{k', t} \|} \leq \eta.
% \end{align*}
% \end{assumption}

\begin{assumption}
\label{assumption:cocoercive}
($\beta$-co-coercive condition of $\nabla F(\theta)$)
For all clients and any models $\theta_1, \theta_2$, the gradient of $F(\theta)$ satisfies the $\beta$-co-coercive condition with $\beta \geq \frac{\eta}{2}$, where $\eta$ is the learning rate, if: 
\begin{align*}
\langle \nabla F(\theta_1) - \nabla F(\theta_2), \theta_1-\theta_2 \rangle
\geq 
\beta \| \nabla F(\theta_1) - \nabla F(\theta_2) \|^2.
\end{align*}
% For all clients, the updating rule is $\mu$-strong monotonicity if:
% \begin{align*}
% \max_{k, k' \in [K]} \frac{\| \theta_{k, t+1} - \theta_{k', t+1}\|}{\| \theta_{k, t} - \theta_{k', t} \|} \leq 1.
% \end{align*}
\end{assumption}

\begin{assumption}
\label{assumption:lipschitz}
($L$-Lipschitz)
$\widehat \P [f_{\theta}(X_k,S_k) = c ]$ is $L$-Lipschitz in the model $\theta$, i.e., $ | \P [f_{\theta}(X_k,S_k) = c ] - \P [f_{\theta'}(X_k,S_k) = c ] | \leq L \| \theta - \theta' \|  $.
\end{assumption}

With the two above assumptions, we can bound the approximation error 
% from local empirical statistics $\bar j_{k, t+1}(c, p)$, $\bar u_{k, t+1}(c)$ and model $\theta_{k, t+1}$ to local approximation $\widetilde j_{k, t+1}(c, p)$, $\widetilde u_{k, t+1}(c)$, and $\widetilde \theta_{k, t+1}$ of each straggler $k$ 
of each straggler by the following proposition:
% \begin{proposition}
% \label{proposition:approximation_error}
% Define $\max_{k, k' \in [K]} \| \theta_{k, 0} - \theta_{k', 0}\|$ as $\varepsilon_0$. Suppose that Assumption \ref{assumption:cocoercive} and \ref{assumption:lipschitz} hold. Then, the approximation error of stragglers is upper bounded:
% \begin{align}
% &
% \| \widetilde \theta_{k, t+1} - \theta_{k, t+1}\|
% \leq 
% \eta^{t+1} \varepsilon_0,
% \nonumber \\
% &
% | \widetilde j_{k, t+1}(c, p) -  \bar j_{k, t+1}(c, p) |
% \leq
% L\eta^{t+1} \varepsilon_0 + \zeta,
% \nonumber \\
% &
% | \widetilde u_{k, t+1}(c) -  \bar u_{k, t+1}(c) |
% \leq
% L\eta^{t+1} \varepsilon_0 + \zeta.
% \end{align}
% \end{proposition}
\begin{proposition}
\label{proposition:approximation_error}
(Approximation error of each straggler in asynchronous FedR\'enyi)
Define $\max_{k, k' \in [K]} \| \theta^e_{k, 0} - \theta^e_{k', 0}\| = \varepsilon^e_{0}$. Suppose that Assumptions \ref{assumption:cocoercive} and \ref{assumption:lipschitz} hold. Then, for each communication round $e$, the approximation errors of the model and local statistics on stragglers $\widetilde k$ are upper bounded as follows:
\begin{align}
&
\| \widetilde \theta^{e}_{\widetilde k, M} - \theta^{e}_{\widetilde k, M}\|
\leq 
\varepsilon^e_{0},
\nonumber \\
&
| \widetilde j^{e}_{\widetilde k, M}(c, p) -  \bar j^{e}_{\widetilde k, M}(c, p) |
\leq
L \varepsilon^e_{0} + \zeta,
\nonumber \\
&
| \widetilde u^{e}_{\widetilde k, M}(c) -  \bar u^{e}_{\widetilde k, M}(c) |
\leq
L \varepsilon^e_{0} + \zeta.
\end{align}
\end{proposition}

\begin{remark}
In the above result, the localized approximation for the stragglers shows non-expansion behavior (1st line), i.e., the error after running $s$ stages is not larger than the error at the 1st iteration.
% is determined by learning rate $\eta$ and 
% maximum distance between initial clients $\varepsilon_0$. 
In practice, we could set a smaller learning rate $\eta$ (due to Assumption \ref{assumption:cocoercive}) and use a pre-trained model to decrease the approximation error $\varepsilon^e_{0}$. 
Besides, using a pre-trained model, which is common in FL \cite{tan2022federated,weller2022pretrained,tian2022fedbert}, could make Assumption \ref{assumption:cocoercive} easier to satisfy. 
\end{remark}

Then, we study how approximation error influences the estimation error of the asynchronous FedR\'enyi algorithm. 
To explicitly investigate this impact, we define $\widetilde Q^{e+1}_{\theta} \in \R^{C \times P}$ as the global empirical matrix at communication round $e+1$, 
and $\widetilde \v^{e+1}$ as the singular vector corresponding to its second largest singular value. 
Therefore, the empirical objective function in the asynchronous setting can be rewritten as $\widehat H(\theta^{e+1}, \widetilde \v^{e+1}) = (\widetilde \v^{e+1})^\top (\widetilde Q^{e+1}_{\theta})^\top \widetilde Q^{e+1}_{\theta} \widetilde \v^{e+1}$. Our goal is to study the estimation error between $\widehat H(\theta^{e+1}, \widetilde \v^{e+1})$ and $H(\theta^{e+1}, \v^{e+1})$.
% \begin{theorem}
% \label{theorem:estimation_error_asynchronous}
% (Estimation error of R\'enyi regularization for Asynchronous FedR\'enyi) 
% Suppose $j_\textmin \backsim u_\textmin \backsim r_\textmin = O(1)$ and $n_\textmin \backsim \frac{n}{K \log(K)}$.
% When $\gamma_k = \frac{n_k}{n}$ or $\frac{1}{K}$, with probability at least $1-\delta$ for $\delta >0$, we have the following error bound between two consecutive communication rounds:
% \begin{align*}
% \widehat H(\theta^{e+1}, \widetilde \v^{e+1}) \! - \! H(\theta^{e+1}, \v^{e+1}) \! \leq \!
% O \big( 1/\sqrt{n} \! + \! ( L \varepsilon^e_{0} \! + \! \zeta )^2 \big)
% .
% \end{align*}

% \end{theorem}
\begin{theorem}
\label{theorem:estimation_error_asynchronous}
(Estimation error of R\'enyi regularization for asynchronous FedR\'enyi) 
Suppose $j_\textmin \backsim u_\textmin \backsim r_\textmin = O(1)$ and $n_\textmin \backsim \frac{n}{K \log(K)}$.
When $\gamma_k = \frac{n_k}{n}$ or $\frac{1}{K}$, for any communication round $e$, any global model $\theta^{e+1}$ and $\delta \in (0,1)$, the following inequality holds:
% \begin{align*}
% \P[
% \widehat H(\theta^{e\!+\!1}\!, \!\widetilde \v^{e\!+\!1}) \! - \! H(\theta^{e\!+\!1}\!,\! \v^{e\!+\!1}) \! \leq \!
% O \big( 1\! / \! \sqrt{n} \! + \! ( L \varepsilon^e_{0} \! + \! \zeta )^2 \big)]
% \geq 1 \! - \! \delta.
% \end{align*}
\begin{align*}
\P \Big [
&
\widehat H(\theta^{e + 1}, \widetilde \v^{e + 1}) -  H(\theta^{e + 1}, \v^{e + 1}) 
\\
&
\leq
O \big( 1/ \sqrt{n} + ( L \varepsilon^e_{0} + \zeta )^2 \big) \Big | \theta^{e + 1} \Big ]
\geq 1 - \delta.
\end{align*}
\end{theorem}


% \begin{theorem}
% \label{theorem:estimation_error_asynchronous}
% (Estimation error of R\'enyi regularization for Asynchronous FedR\'enyi) 
% Suppose $j_\textmin \backsim u_\textmin \backsim r_\textmin = O(1)$ and $n_\textmin \backsim \frac{n}{K \log(K)}$.
% When $\gamma_k = \frac{n_k}{n}$ or $\frac{1}{K}$, with probability at least $1-\delta$ for $\delta >0$, the following inequality holds:
% \begin{align}
% \widehat H(\theta, \widetilde \v) - H(\theta, \v) \leq
% O \Bigg( \sqrt{\frac{1}{ n }} + ( L\eta^{t+1} \varepsilon_0 + \zeta )^2 \Bigg)
% \end{align}

% % \begin{equation}
% % \begin{aligned}
% % \widehat H(\theta, \widetilde \v) - H(\theta, \v) & \leq 
% % \left\{
% % \begin{aligned}
% %     &
% %    O \Bigg( \sqrt{\frac{1}{ n }} + ( L\eta^{t+1} \varepsilon_0 + \zeta )^2 \Bigg), \\
% %    & \quad \text{where } \gamma_k = \frac{n_k}{n} , \\
% %     & 
% %     O \Bigg( \sqrt{\frac{\log (K)}{ K n_{min} }} + ( L\eta^{t+1} \varepsilon_0 + \zeta )^2 \Bigg), \\
% %     & \quad \text{where } \gamma_k = \frac{1}{K}.
% % \end{aligned}
% % \right.
% % \end{aligned}
% % \end{equation}
% \end{theorem}

\begin{remark}
The above theorem shows that, with high probability, the estimation error for asynchronous FedR\'enyi between population $H(\theta^{e+1}, \v^{e+1})$ and empirically aggregated $\widehat H(\theta^{e+1}, \widetilde \v^{e+1})$ in two consecutive communication rounds is bounded by $O( 1/\sqrt{n} + ( L \varepsilon^e_{0} + \zeta )^2 )$ and follows a stage-wise recurrence.
For each communication round $e$, the dynamic estimation error is bounded by a fixed term $O( 1/\sqrt{n})$ and the largest distance between client models at the beginning of each stage, $\varepsilon^e_{0}$. 
During communication, $\varepsilon^{e+1}_{0} \leq \varepsilon^{e}_{M} \leq \varepsilon^{e}_{0}$ (see Section \ref{appendix:proof_of_theorem estimation_error_asynchronous} and Equation (\ref{eq:epsilon_comparison}) in Appendix).
Thus, the estimation error decreases as communication progresses. 
Unlike previous works \cite{sefidgaran2024lessons,hu2023generalization}, our global estimation error studies the inter-client and intra-client estimation error of all clients, including stragglers and participating clients.
\end{remark}

\begin{table*}[t]
    \setlength{\tabcolsep}{1mm}
    \caption{
    Experimental results of all methods with the heterogeneous setting ($Dir=0.5$) on four datasets. 
    For ACC, FR, and HM, higher values indicate better performance.
    Accuracy, fairness, and harmonic mean are denoted by ACC, FR, and HM, respectively.
    The best results are in \textbf{bold}.
    The mean and standard deviation of $20$ results with better HM for each method under different hyperparameter settings are presented.
    % To observe the effect of aggregation, we also adopt the local training setting, where each client updates their model by only local training.
    Comparing FedR\'enyi with other baselines, there exist at least $2\%$ improvements of ACC, FR and HM over three datasets (ADULT, DRUG, DUTCH).
    }
    \label{Table: result summary}
    \centering
    \resizebox{\textwidth}{!}{
      \begin{tabular}{|cccccccccc|}
\hline
\multicolumn{1}{|c|}{\textbf{}} &
  \multicolumn{1}{c|}{\textbf{FedAvg}} &
  \multicolumn{1}{c|}{\textbf{FedProx}} &
  \multicolumn{1}{c|}{\textbf{Scaffold}} &
  \multicolumn{1}{c|}{\textbf{FedFair}} &
  \multicolumn{1}{c|}{\textbf{\begin{tabular}[c]{@{}c@{}}FL-\\      FairBatch\end{tabular}}} &
  \multicolumn{1}{c|}{\textbf{FedFB}} &
  \multicolumn{1}{c|}{\textbf{FairFed}} &
  \multicolumn{1}{c|}{\textbf{\begin{tabular}[c]{@{}c@{}}FedR\'enyi\\      $(1/K)$\end{tabular}}} &
  \textbf{\begin{tabular}[c]{@{}c@{}}FedR\'enyi\\      $(n_k/n)$\end{tabular}} \\ \hline
\multicolumn{10}{|c|}{\textit{\textbf{ADULT}}} \\ \hline
\multicolumn{1}{|c|}{\textbf{ACC}} &
  \multicolumn{1}{c|}{0.62±0.12} &
  \multicolumn{1}{c|}{0.61±0.12} &
  \multicolumn{1}{c|}{0.56±0.20} &
  \multicolumn{1}{c|}{0.51±0.07} &
  \multicolumn{1}{c|}{0.64±0.00} &
  \multicolumn{1}{c|}{0.65±0.00} &
  \multicolumn{1}{c|}{0.62±0.17} &
  \multicolumn{1}{c|}{\textbf{0.67±0.03}} &
  0.65±0.04 \\ \hline
\multicolumn{1}{|c|}{\textbf{FR}} &
  \multicolumn{1}{c|}{0.87±0.10} &
  \multicolumn{1}{c|}{0.88±0.11} &
  \multicolumn{1}{c|}{0.88±0.13} &
  \multicolumn{1}{c|}{0.84±0.17} &
  \multicolumn{1}{c|}{0.91±0.02} &
  \multicolumn{1}{c|}{0.92±0.03} &
  \multicolumn{1}{c|}{0.77±0.16} &
  \multicolumn{1}{c|}{\textbf{0.94±0.04}} &
  \textbf{0.94±0.04} \\ \hline
\multicolumn{1}{|c|}{\textbf{HM}} &
  \multicolumn{1}{c|}{0.72±0.11} &
  \multicolumn{1}{c|}{0.72±0.11} &
  \multicolumn{1}{c|}{0.68±0.16} &
  \multicolumn{1}{c|}{0.63±0.10} &
  \multicolumn{1}{c|}{0.75±0.00} &
  \multicolumn{1}{c|}{0.76±0.00} &
  \multicolumn{1}{c|}{0.69±0.16} &
  \multicolumn{1}{c|}{\textbf{0.78±0.03}} &
  0.77±0.04 \\ \hline
\multicolumn{10}{|c|}{\textit{\textbf{COMPAS}}} \\ \hline
\multicolumn{1}{|c|}{\textbf{ACC}} &
  \multicolumn{1}{c|}{0.66±0.01} &
  \multicolumn{1}{c|}{0.66±0.01} &
  \multicolumn{1}{c|}{0.47±0.12} &
  \multicolumn{1}{c|}{0.62±0.03} &
  \multicolumn{1}{c|}{0.67±0.01} &
  \multicolumn{1}{c|}{0.67±0.01} &
  \multicolumn{1}{c|}{0.62±0.03} &
  \multicolumn{1}{c|}{\textbf{0.68±0.01}} &
  \textbf{0.68±0.01} \\ \hline
\multicolumn{1}{|c|}{\textbf{FR}} &
  \multicolumn{1}{c|}{0.79±0.03} &
  \multicolumn{1}{c|}{0.79±0.03} &
  \multicolumn{1}{c|}{\textbf{0.82±0.10}} &
  \multicolumn{1}{c|}{0.79±0.10} &
  \multicolumn{1}{c|}{0.78±0.02} &
  \multicolumn{1}{c|}{0.75±0.03} &
  \multicolumn{1}{c|}{0.79±0.10} &
  \multicolumn{1}{c|}{0.81±0.02} &
  0.82±0.01 \\ \hline
\multicolumn{1}{|c|}{\textbf{HM}} &
  \multicolumn{1}{c|}{0.72±0.01} &
  \multicolumn{1}{c|}{0.72±0.01} &
  \multicolumn{1}{c|}{0.60±0.11} &
  \multicolumn{1}{c|}{0.69±0.05} &
  \multicolumn{1}{c|}{0.72±0.01} &
  \multicolumn{1}{c|}{0.71±0.01} &
  \multicolumn{1}{c|}{0.69±0.05} &
  \multicolumn{1}{c|}{0.72±0.03} &
  \textbf{0.73±0.02} \\ \hline
\multicolumn{10}{|c|}{\textit{\textbf{DRUG}}} \\ \hline
\multicolumn{1}{|c|}{\textbf{ACC}} &
  \multicolumn{1}{c|}{0.67±0.02} &
  \multicolumn{1}{c|}{0.67±0.01} &
  \multicolumn{1}{c|}{0.66±0.01} &
  \multicolumn{1}{c|}{0.67±0.02} &
  \multicolumn{1}{c|}{0.66±0.00} &
  \multicolumn{1}{c|}{0.66±0.00} &
  \multicolumn{1}{c|}{0.50±0.08} &
  \multicolumn{1}{c|}{0.68±0.01} &
  \textbf{0.69±0.01} \\ \hline
\multicolumn{1}{|c|}{\textbf{FR}} &
  \multicolumn{1}{c|}{0.86±0.02} &
  \multicolumn{1}{c|}{0.86±0.02} &
  \multicolumn{1}{c|}{0.82±0.06} &
  \multicolumn{1}{c|}{0.86±0.02} &
  \multicolumn{1}{c|}{0.84±0.00} &
  \multicolumn{1}{c|}{0.85±0.00} &
  \multicolumn{1}{c|}{0.77±0.10} &
  \multicolumn{1}{c|}{\textbf{0.96±0.03}} &
  0.96±0.02 \\ \hline
\multicolumn{1}{|c|}{\textbf{HM}} &
  \multicolumn{1}{c|}{0.75±0.02} &
  \multicolumn{1}{c|}{0.75±0.01} &
  \multicolumn{1}{c|}{0.73±0.02} &
  \multicolumn{1}{c|}{0.75±0.02} &
  \multicolumn{1}{c|}{0.74±0.00} &
  \multicolumn{1}{c|}{0.74±0.00} &
  \multicolumn{1}{c|}{0.61±0.09} &
  \multicolumn{1}{c|}{\textbf{0.80±0.01}} &
  \textbf{0.80±0.01} \\ \hline
\multicolumn{10}{|c|}{\textit{\textbf{DUTCH}}} \\ \hline
\multicolumn{1}{|c|}{\textbf{ACC}} &
  \multicolumn{1}{c|}{0.81±0.01} &
  \multicolumn{1}{c|}{0.80±0.01} &
  \multicolumn{1}{c|}{0.60±0.12} &
  \multicolumn{1}{c|}{0.61±0.16} &
  \multicolumn{1}{c|}{0.81±0.01} &
  \multicolumn{1}{c|}{0.69±0.05} &
  \multicolumn{1}{c|}{0.62±0.13} &
  \multicolumn{1}{c|}{\textbf{0.83±0.01}} &
  \textbf{0.83±0.01} \\ \hline
\multicolumn{1}{|c|}{\textbf{FR}} &
  \multicolumn{1}{c|}{0.64±0.08} &
  \multicolumn{1}{c|}{0.63±0.09} &
  \multicolumn{1}{c|}{0.84±0.18} &
  \multicolumn{1}{c|}{0.65±0.35} &
  \multicolumn{1}{c|}{0.66±0.06} &
  \multicolumn{1}{c|}{0.92±0.04} &
  \multicolumn{1}{c|}{0.78±0.25} &
  \multicolumn{1}{c|}{0.94±0.04} &
  \textbf{0.96±0.04} \\ \hline
\multicolumn{1}{|c|}{\textbf{HM}} &
  \multicolumn{1}{c|}{0.72±0.02} &
  \multicolumn{1}{c|}{0.70±0.02} &
  \multicolumn{1}{c|}{0.70±0.14} &
  \multicolumn{1}{c|}{0.63±0.22} &
  \multicolumn{1}{c|}{0.73±0.02} &
  \multicolumn{1}{c|}{0.79±0.04} &
  \multicolumn{1}{c|}{0.69±0.17} &
  \multicolumn{1}{c|}{0.88±0.02} &
  \textbf{0.89±0.02} \\ \hline
\end{tabular}
}    
\end{table*}

\section{Numerical Experiments}
\label{section:numerical_experiments}

% The FedR\'enyi code is available at \url{https://github.com/AllenMa97/Federated-Renyi}.

\subsection{Experimental Setup}

{\bf Hyperparameters and Dataset.}
In this paper, we use several combinations of hyperparameters ($\lambda$, $\rho$, $T \& M$, $\alpha$, and $Dir$) to train FL models.
We use ADULT, COMPAS, DRUG, and DUTCH datasets, which are widely studied benchmarks for fairness evaluation in FL. 
They vary in size and demographic attributes, allowing comprehensive fairness analysis.
More details are provided in Appendix \ref{experiments details}.

{\bf Measurement.}
\label{measurement}
We use the accuracy (ACC), fairness score (FR), and harmonic mean (HM) of ACC and FR to measure the performance.
To evaluate global accuracy (ACC) in FL, we compute the local accuracy of each client and aggregate them across clients using weights of either $n_k/n$ (uniform over data) or $1/K$ (uniform over clients).
            % The ACC is computed by the proportion of the correctly predicted samples and the test data.
            % Many works in FL first compute the local accuracy of clients and then aggregate them to measure the global accuracy performance.
            % Concretely, we denote the accuracy of the $k$-th client as $acc_k$.
            % Due to the different choices of $\gamma_k$, we take the $\frac{n_k}{n}$ and $\frac{1}{K}$ as the aggregation weight, then compute the ACC performance of each algorithm.
            % For the uniform over distribution setting, which means the FL system is practicing in uniform data, $ACC = \sum_{k=1}^K \frac{n_k}{n} \cdot acc_k$.
            % For the uniform over distribution setting, which means the FL system is practicing in heterogeneous data, $ACC = \sum_{k=1}^K \frac{1}{K} \cdot acc_k$.
To measure how unfair a model is, 
\cite{NEURIPS2018_83cdcec0} propose DEO by extending the equal opportunity (EOD) \cite{10.5555/3157382.3157469} as follows:
$\left| \P(f_{\theta}(X,S)=1 \mid S=1,Y=1) - \P(f_{\theta}(X,S)=1 \mid S=0,Y=1) \right|$, and FR is defined as $FR=1-DEO({f_{\theta}})$.            
For ACC, FR, and HM, higher values indicate better performance.
            % Statistical parity \cite{2011Fairness} rewards the model for classifying each group as positive at the same rate. 
            % A predictor $\hat{Y}_{\theta}$ is fair from the statistical parity perspective if $\P(\hat{Y}_{\theta}=1|S=1)= \P(\hat{Y}_{\theta}=1|S=0)$.
            % Thus, the SPD is defined as $SPD=\P(\hat{Y}_{\theta}=1|S=0)- \P(\hat{Y}_{\theta}=1|S=1)$.
            % For ACC, FR, and HM, values higher indicate better performance, while SPD is the opposite. 

{\bf Data Distribution Setting.}
            % FL Clients may have different data distributions.
            % To simulate the IID and non-IID data distribution setting, we construct the label distribution skew (quantity-based label imbalance) and quantity skew, following \cite{li2022federated}.
To simulate the IID and non-IID data distribution setting, we build quantity skew and control heterogeneity levels through $Dir$ following \cite{li2022federated,2023FairFed,lee2023fedlp}.
            % We control the heterogeneity levels through $Dir$.
            % The $Dir$ represents the parameters in standard Dirichlet distribution.
            % Thus, smaller $Dir$ indicates a more heterogeneous scenario across clients.
Smaller $Dir$ indicates a more imbalanced scenario about data quantity across clients, 
            % $p$ and $1-p$ control the proportion of clients that only contain data samples with positive and negative sensitive attributes, respectively.
            % For clarity, we use $+\infty$ to represent the uniform case in both label and quantity skews.
and $Dir=+\infty$ represents the uniform case.

{\bf Baselines.}
We include general FL methods (FedAvg \cite{mcmahan2017communication}, FedProx \cite{FedProx}, Scaffold \cite{karimireddy2020scaffold}) and fairness-aware FL baselines (FedFair \cite{chu2021fedfair}, FL-FairBatch \cite{2021FairBatch}, FedFB \cite{zeng2021improving}, FairFed \cite{2023FairFed}) to compare fairness trade-offs in FL settings.
% Several methods are adopted as baselines: FedAvg \cite{mcmahan2017communication}, FedProx \cite{FedProx}, Scaffold \cite{karimireddy2020scaffold}, FedFair \cite{chu2021fedfair}, FL-FairBatch \cite{2021FairBatch}, FedFB \cite{zeng2021improving}, and FairFed \cite{2023FairFed}.
Following \cite{Baharlouei2020Rényi,chu2021fedfair,zeng2021improving,2023FairFed}, we use the logistic regression model as the backbone.
More details are shown in Appendix \ref{experiments details}.
The FedR\'enyi code is available at \url{https://github.com/AllenMa97/Federated-Renyi}.

\subsection{Experiment Result}
% Considering the limitation of the paper's length, we summarize the performance of methods via Table \ref{Table: result summary}, and more experimental results are presented in the Appendix.
{\bf FedR\'enyi consistently outperforms baseline methods.}
The main experimental results are summarized in Table \ref{Table: result summary}.
Since hyperparameters affect the performance of algorithms, we select the top $20$ results (with better HM) for each method and report their mean and standard deviation to ensure reliable comparisons.
In ADULT, DRUG, and DUTCH, FedR\'enyi outperforms other algorithms.
% Although FedR\'enyi does not beyond all baselines in COMPAS, its ACC and HM scores rank second with small gaps from the highest ($\leq$ 0.02).  
Although FedR\'enyi does not outperform all baselines in COMPAS, its HM ranks second with a small gap from the highest ($\leq 0.1$).  
% Meanwhile, in most cases of these two datasets, FedR\'enyi have lower variances in three metrics (lower than 0.05), exhibiting stability.
These results demonstrate the effectiveness of FedR\'enyi.  
More detailed results for all datasets are provided in Tables \ref{Table: ADULT full result summary}, \ref{Table: COMPAS full result summary}, \ref{Table: DRUG full result summary}, and \ref{Table: DUTCH full result summary}.
% To explore the robustness and trade-off ability of the proposed method, we build several ablation experiments in different heterogeneous levels, datasets, the proportion of straggler, training and communication round settings.
% The supplementary experiment results are presented in the Appendix.

{\bf FedR\'enyi effectively balances accuracy and fairness by adjusting $\lambda$.}
To further examine the trade-off, we adjust the regularization coefficient $\lambda$ within \{0.1, 0.5, 1, 5, 1000\} and visually present some experimental results in Figure \ref{fig: the effect of lambda}.
As shown in Figure \ref{fig: the effect of lambda}, the FR of FedR\'enyi increases as $\lambda$ increases in the heterogeneous setting, while the ACC increases as $\lambda$ decreases.
More experimental results on four datasets are presented in Figure \ref{fig: the effect of parameter lambda in Appendix}.
% except the subfigure of DUTCH in the uniform-over-sample setting.
% We claim that this result is consistent with Table \ref{Table: result summary}, where the FR of different algorithms is relatively stable (with low variance) in DUTCH.

\begin{figure}[htbp]
    \centering
    \begin{minipage}[t]{0.23\textwidth}
    % \includegraphics[width=\textwidth,height=0.9\textwidth]{new_figure/ijcai2024/lamda_diagram/Dirichlet05/no_attribute_skew/DUTCH/LR/Lamda.pdf}
    \includegraphics[width=\textwidth,height=0.9\textwidth]{new_figure/UAI2025/non-iid_black_Lamda.pdf}
    
    \end{minipage}
    \begin{minipage}[t]{0.23\textwidth}
    % \includegraphics[width=\textwidth,height=0.9\textwidth]{new_figure/ijcai2024/lamda_diagram/Uniform/no_attribute_skew/DUTCH/LR/Lamda.pdf}
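    % NOTE(review): this is the same file as the left panel, while the caption describes a heterogeneous and an isomorphic setting -- confirm the intended figure path for this panel.
    \includegraphics[width=\textwidth,height=0.9\textwidth]{new_figure/UAI2025/non-iid_black_Lamda.pdf}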
    \end{minipage}
    \caption{The accuracy--fairness trade-off of FedR\'enyi on DUTCH, adjusted via $\lambda$, under the heterogeneous and isomorphic settings. 
    % The uniform-over-samples and uniform-over-clients are presented on the top and bottom rows of each subfigure, respectively. 
    We observe that fairness increases and accuracy decreases with a larger $\lambda$ value.
    }
    \label{fig: the effect of lambda}  
\end{figure}


    
{\bf FedR\'enyi achieves the best trade-off between accuracy and fairness.}    
A comparison of ACC and FR across different algorithms is shown in Figure \ref{fig:fairness and accuracy (LR) }.
Only the top 5 results (with better HM value) of each algorithm are plotted; some methods show fewer than 5 points because of overlap.
Intuitively, red and yellow scatter points (FedR\'enyi) are closer to the optimal corner than others in most cases. 
Besides, these scatter points approximately form several curves, exhibiting the trade-off ability between ACC and FR.
More results on four datasets are presented in Figure \ref{fig:Appendix fairness and accuracy (LR)}.
    % In particular, most baselines behave closely in these experiments, except the FairFed (blue).
    % Some blue triangles tend to be towards the upper left, which means FairFed may over-emphasize fairness, thus penalizing the accuracy.
\begin{figure}[hbpt]
        \centering
        \begin{minipage}[]{0.23\textwidth}
        \includegraphics[width=1\textwidth]{new_figure/ijcai2024/scatter_diagram/Dirichlet05/no_attribute_skew/ADULT/LR/scatter.pdf}
        \end{minipage}
        \begin{minipage}[]{0.23\textwidth}
        \includegraphics[width=1\textwidth]{new_figure/ijcai2024/scatter_diagram/Uniform/no_attribute_skew/ADULT/LR/scatter.pdf}
        \end{minipage}
        \caption{
        The ACC and FR trade-off on ADULT of all methods with two distribution settings.
        FedR\'enyi performs closer to optimal and approximately forms trade-off curves from the most accurate and least fair to the least accurate and most fair.
        }
        \label{fig:fairness and accuracy (LR) }
    \end{figure}


{\bf FedR\'enyi converges.}   
To study the convergence behavior, the training loss at different communication rounds on four datasets is illustrated in Figure \ref{fig:loss}.
The training loss of synchronous FedR\'enyi decreases as communication proceeds and becomes stable at around $50$ rounds, verifying the convergence property. 
More results on other datasets are shown in Figures \ref{fig:isomorphism loss} and \ref{fig:isomorphism ACC and FR}.
\begin{figure}[t]
  \centering
  \begin{minipage}[t]{0.48\textwidth}
        \includegraphics[width=\textwidth]{new_figure/aaai2025/loss_diagram/Dirichlet05/loss.pdf}      
  \end{minipage}
  \caption{The training loss of FedR\'enyi under heterogeneous data settings, which verifies that FedR\'enyi converges to a stable range after a certain number of rounds.}
  \label{fig:loss}
\end{figure}

{\bf Asynchronous FedR\'enyi maintains stable HM performance and effectively controls estimation errors.}  
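    % 1. Change to asynchronous FedRenyi.
    % 2. For simplicity, we fix a drop-out ratio that is common in other asynchronous FL algorithms.
    % 3. Want to compare against FedProx.
    % 4. Want to observe the asynchronous trade-off.
    % 5. Observe that HM declines while the error stays low.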
    To verify the performance of asynchronous FedR\'enyi, we conduct experiments and simulate different communication thresholds by controlling the proportion of stragglers $\alpha$.
    % Generally, as the proportion of stragglers ($\alpha$) increases, the amount of algorithm available data will decrease significantly, resulting in degraded HM.
    As shown in Table \ref{table:acceleration_bias}, the asynchronous FedR\'enyi not only maintains stable HM performance but also achieves effective approximation error control.
    More experimental results on other datasets are presented in Appendix \ref{subsection:appendix:robustness_experiment}.
    % 
    % When the asynchronous scheme is utilized in the training process of FedR\'enyi, there exists a tolerable decline in HM (smaller than $0.05$) and the training time is decreased with average \textasciitilde$1.23$ speedup rate over all asynchronous speedup rates. 
    % These results demonstrate our method could accelerate the training process against stragglers with a small performance decline.
    
    % As the $M$ decreases, there is a tolerable fluctuation of performance (HM decreases less than 20\%).
    % However, the time consumption of our algorithm is significantly accelerated, verifying the acceleration ability of our scheme.

    
    \begin{table}[tb]
        \centering
        \caption{
        The HM and the average approximation errors across all stragglers of FedR\'enyi with different $\alpha$. 
        These approximation errors are measured by the L2 distance between the approximation values and the actual values on stragglers.
        }
        \label{table:acceleration_bias}
        \setlength{\tabcolsep}{1mm}
        \resizebox{0.45\textwidth}{!}{
            \begin{tabular}{|cccc|}
            \hline
              \multicolumn{1}{|c|}
            {\textbf{\begin{tabular}[c]{@{}c@{}}Dir=0.5\\ $\lambda$=1 \end{tabular}}}&
            
              \multicolumn{1}{c|}
              {\textbf{Drop $\alpha$:   0\%}} &
              \multicolumn{1}{c|}{\textbf{Drop $\alpha$:   30\%}} &
              \textbf{Drop $\alpha$:   50\%} \\ \hline
            \multicolumn{1}{|c|}
            {\textbf{\begin{tabular}[c]{@{}c@{}}(T, I)\\ =(100,4)\end{tabular}}}&
              \multicolumn{1}{c|}{\textbf{\begin{tabular}[c]{@{}c@{}}HM/j   Err./\\      u Err./$\theta$ Err.\end{tabular}}} &
              \multicolumn{1}{c|}{\textbf{\begin{tabular}[c]{@{}c@{}}HM/j   Err./\\      u Err./$\theta$ Err.\end{tabular}}} &
              \textbf{\begin{tabular}[c]{@{}c@{}}HM/j   Err./\\      u Err./$\theta$ Err.\end{tabular}} \\ \hline
            \multicolumn{4}{|c|}{\textbf{COMPAS}} \\ \hline
            \multicolumn{1}{|c|}{\textbf{\begin{tabular}[c]{@{}c@{}}
            % pre-trained \\
            Asyn. \\($n_k/n$)\end{tabular}}} &
              \multicolumn{1}{c|}{\begin{tabular}[c]{@{}c@{}}0.77/0/\\      0/0\end{tabular}} &
              \multicolumn{1}{c|}{\begin{tabular}[c]{@{}c@{}}0.73/0.03/\\      0.01/0.92\end{tabular}} &
              \begin{tabular}[c]{@{}c@{}}0.71/0.01/\\      0.02/1.41\end{tabular} \\ \hline
            \multicolumn{1}{|c|}{\textbf{\begin{tabular}[c]{@{}c@{}}
            % pre-trained \\      
            Asyn. \\ ($1/K$)\end{tabular}}} &
              \multicolumn{1}{c|}{\begin{tabular}[c]{@{}c@{}}0.75/0/\\      0/0\end{tabular}} &
              \multicolumn{1}{c|}{\begin{tabular}[c]{@{}c@{}}0.76/0.04/\\      0.01/0.27\end{tabular}} &
              \begin{tabular}[c]{@{}c@{}}0.78/0.01/\\      0.02/0.34\end{tabular} \\ \hline
            \multicolumn{1}{|c|}{\textbf{\begin{tabular}[c]{@{}c@{}}Syn. \\($n_k/n$)\end{tabular}}} &
              \multicolumn{1}{c|}{\begin{tabular}[c]{@{}c@{}}0.76/0/\\      0/0\end{tabular}} &
              \multicolumn{1}{c|}{\begin{tabular}[c]{@{}c@{}}0.76/0/\\      0/0\end{tabular}} &
              \begin{tabular}[c]{@{}c@{}}0.76/0/\\      0/0\end{tabular} \\ \hline
            \multicolumn{4}{|c|}{\textbf{DRUG}} \\ \hline
            \multicolumn{1}{|c|}{\textbf{\begin{tabular}[c]{@{}c@{}}
            % pre-trained \\
            Asyn. \\($n_k/n$)\end{tabular}}} &
              \multicolumn{1}{c|}{\begin{tabular}[c]{@{}c@{}}0.74/0/\\      0/0\end{tabular}} &
              \multicolumn{1}{c|}{\begin{tabular}[c]{@{}c@{}}0.74/0.01/\\      0.01/0.25\end{tabular}} &
              \begin{tabular}[c]{@{}c@{}}0.75/0.08/\\      0.03/0.40\end{tabular} \\ \hline
            \multicolumn{1}{|c|}{\textbf{\begin{tabular}[c]{@{}c@{}}
            % pre-trained \\
            Asyn. \\($1/K$)\end{tabular}}} &
              \multicolumn{1}{c|}{\begin{tabular}[c]{@{}c@{}}0.73/0/\\      0/0\end{tabular}} &
              \multicolumn{1}{c|}{\begin{tabular}[c]{@{}c@{}}0.72/0.01/\\      0.02/0.29\end{tabular}} &
              \begin{tabular}[c]{@{}c@{}}0.73/0.09/\\      0.02/0.37\end{tabular} \\ \hline
            \multicolumn{1}{|c|}{\textbf{\begin{tabular}[c]{@{}c@{}}Syn. \\($n_k/n$)\end{tabular}}} &
              \multicolumn{1}{c|}{\begin{tabular}[c]{@{}c@{}}0.74/0/\\      0/0\end{tabular}} &
              \multicolumn{1}{c|}{\begin{tabular}[c]{@{}c@{}}0.74/0/\\      0/0\end{tabular}} &
              \begin{tabular}[c]{@{}c@{}}0.74/0/\\      0/0\end{tabular} \\ \hline
            \end{tabular}            
        }
    \end{table}

{\bf Assumption \ref{assumption:cocoercive} is empirically valid.}  
To empirically demonstrate the validity of Assumption \ref{assumption:cocoercive},  
we conduct experiments on the DUTCH dataset under heterogeneous ($Dir = 0.5$) and isomorphic ($Dir= +\infty$) data distribution settings, with uniform over data ($\gamma_k = n_k/n$) and uniform over client ($\gamma_k = 1/K$) settings and $\lambda=1$. 
Specifically, in Figure \ref{fig: assumption_1}, the X-axis represents training iterations, while the Y-axis shows the values corresponding to each side of the co-coercivity inequality:
(i) $\langle \nabla F(\theta_1) - \nabla F(\theta_2), \theta_1-\theta_2 \rangle$, the left-hand side of the inequality in Assumption \ref{assumption:cocoercive};
(ii) $\beta \| \nabla F(\theta_1) - \nabla F(\theta_2) \|^2$ with $\beta = \eta/2$, the right-hand side of the inequality in Assumption \ref{assumption:cocoercive}. 
At each iteration, we randomly select $5$ clients and record their gradients and parameter vectors. Then we compute $\langle \nabla F(\theta_1) - \nabla F(\theta_2), \theta_1-\theta_2 \rangle$ and $\beta \| \nabla F(\theta_1) - \nabla F(\theta_2) \|^2$ values for these clients, following the inequality structure in Assumption \ref{assumption:cocoercive}. 
Next, we plot the average across all selected clients over each iteration.
As shown in Figure \ref{fig: assumption_1}, in most iterations, the line of $\langle \nabla F(\theta_1) - \nabla F(\theta_2), \theta_1-\theta_2 \rangle$ consistently lies above the line of $\beta \| \nabla F(\theta_1) - \nabla F(\theta_2) \|^2$, indicating that the inequality holds, which verifies the validity of the co-coercivity assumption (i.e., $\langle \nabla F(\theta_1) - \nabla F(\theta_2), \theta_1-\theta_2 \rangle \geq \beta \| \nabla F(\theta_1) - \nabla F(\theta_2) \|^2$) in practice.

\begin{figure}[t]
\centering
  \includegraphics[width=0.46\textwidth]{new_figure/UAI2025/Assumption1.pdf}
  \caption{ 
  Verification of the co-coercivity assumption (Assumption \ref{assumption:cocoercive}) on DUTCH under $Dir = 0.5$ and $Dir= +\infty$, with uniform-over-data ($\gamma_k = n_k/n$) and uniform-over-client ($\gamma_k = 1/K$) weighting schemes and $\lambda=1$.
The X-axis represents training iterations, while the Y-axis shows the values corresponding to each side of the co-coercivity inequality:
(i) $\langle \nabla F(\theta_1) - \nabla F(\theta_2), \theta_1-\theta_2 \rangle$, the left side of the inequality in Assumption \ref{assumption:cocoercive}; 
(ii) $\beta \| \nabla F(\theta_1) - \nabla F(\theta_2) \|^2$ with $\beta = \eta/2$, the right side of the inequality in Assumption \ref{assumption:cocoercive}. 
% At each iteration, we randomly select 5 clients and record their gradients and parameter vectors. We then compute $\langle \nabla F(\theta_1) - \nabla F(\theta_2), \theta_1-\theta_2 \rangle$ and $\beta \| \nabla F(\theta_1) - \nabla F(\theta_2) \|^2$ values for these clients. Then, we plot the average across all clients to plot the curve over each iteration.
% We could observe that the line of $\langle \nabla F(\theta_1) - \nabla F(\theta_2), \theta_1-\theta_2 \rangle$ consistently lies above the line of $\beta \| \nabla F(\theta_1) - \nabla F(\theta_2) \|^2$, indicating that the inequality holds, which verifying the validity of the co-coercivity assumption (i.e., $\langle \nabla F(\theta_1) - \nabla F(\theta_2), \theta_1-\theta_2 \rangle \geq \beta \| \nabla F(\theta_1) - \nabla F(\theta_2) \|^2$ ) in practice.
}
\label{fig: assumption_1}  
\end{figure}

{\bf Assumption \ref{assumption:lipschitz} is empirically valid.}
We plot Figure \ref{fig: assumption_2} to empirically demonstrate the validity of Assumption \ref{assumption:lipschitz}, which assumes that the change in predicted class probabilities is Lipschitz continuous with respect to model parameters $\theta$.  
We adopt the same setup: DUTCH dataset, $\lambda=1$, under both heterogeneous ($Dir = 0.5$) and homogeneous ($Dir= +\infty$) data distribution settings, and two weighting schemes, uniform over data ($\gamma_k = n_k/n$) and uniform over client ($\gamma_k = 1/K$). Each subplot in Figure \ref{fig: assumption_2} visualizes the prediction probability of Class $0$ (blue) and Class $1$ (red) over training iterations. The X-axis denotes training iterations, and the Y-axis represents the predicted probability for each class. 
To compute these probabilities, at each iteration, we compute the local predicted class probabilities over all clients and then compute the average over clients to obtain the global prediction probability.
As shown in Figure \ref{fig: assumption_2}, the predicted probabilities for both classes evolve smoothly and do not exhibit sharp fluctuations throughout the iterations. 
This consistent behavior across multiple configurations empirically supports the Lipschitz continuity assumption with respect to $\theta$.

\begin{figure}[t]
    \centering
  \includegraphics[width=0.46\textwidth]{new_figure/UAI2025/Assumption2_lines.pdf}
  \caption{ 
 Verification of the Lipschitz assumption (Assumption \ref{assumption:lipschitz}) on the DUTCH dataset under $Dir = 0.5$ and $Dir= +\infty$, with uniform-over-data ($\gamma_k = n_k/n$) and uniform-over-client ($\gamma_k = 1/K$) weighting schemes and $\lambda=1$.
 % Each subplot in Figure 2 visualizes the prediction probability of Class 0 (blue) and Class 1 (red) over training iterations. 
 The X-axis denotes training iterations, and the Y-axis represents the predicted probability for each class.
 % To compute these probabilities, at each iteration, we compute the local predicted class probabilities over all clients and then compute the expectation over clients to obtain a global prediction distribution.
 % As shown in Figure, the predicted probabilities for both classes evolve smoothly and do not exhibit sharp fluctuations across iterations. This consistent behavior across multiple configurations empirically supports the Lipschitz continuity assumption with respect to $\theta$.
}
\label{fig: assumption_2}  
\end{figure}

\section{Conclusion}
We propose FedR\'enyi, a federated fairness-aware algorithm that enhances group fairness in decentralized heterogeneous systems under two weighting schemes. 
FedR\'enyi addresses data heterogeneity by aggregating local empirical statistics to estimate global R\'enyi correlation, with an estimation error of $O(1/\sqrt{n})$, matching centralized learning bounds and improving upon prior estimation error bounds. 
The FedR\'enyi algorithm reduces the expected squared gradient norm to $O(\epsilon + 1/n)$ with an iteration complexity of $O(1/\epsilon^4)$.
For system heterogeneity, asynchronous FedR\'enyi uses weighted averaging over a nearest neighbor region to approximate stragglers, with a non-increasing approximation error over a communication round. 
% Moreover, we show that the proposed approximation is non-expansive for certain non-convex loss functions \cite{liu2021first} and pre-trained model \cite{tan2022federated,weller2022pretrained,tian2022fedbert}, i.e., the non-increasing distance between the approximated and true local statistics/models within a communication round.
Our experiments on multiple benchmark datasets demonstrate that FedR\'enyi achieves a better accuracy--fairness trade-off than prior FL fairness baselines, with at least $2\%$ improvement.
% Our experiments on multiple benchmark datasets demonstrate at least a $2\%$ improvement in the accuracy-fairness trade-off over prior baselines.


% References
\clearpage

\newpage

\bibliography{ref}

\newpage

\onecolumn

\title{Federated R\'enyi Fair Inference in Federated Heterogeneous System\\(Supplementary Material)}
\label{Appendix}
\maketitle

\appendix

\section{Mathematical Notations}

 
\begin{table}[!h]
\centering
\caption{\textbf{Key notations used in this paper.}}
\label{tab:notation}
\begin{tabular}{|l|l|}
    \hline
     {\bf Notation} & {\bf Meaning}\\
     \hline
     $X \in \mathcal{X} $ & Input feature\\
     \hline
     $Y \in \mathcal{Y} $ & The ground-truth label\\
     \hline
     $S \in \mathcal{S} $ & Sensitive attributes\\
     \hline
    $\mathcal{P}$ & The underlying distribution defined on  $\mathcal{X} \times \mathcal{Y} \times \mathcal{S}$\\
     \hline
     $(X,Y, S)$ & The data sample drawn from a distribution $\mathcal{P}$\\
     \hline
     $(x,y, s)$ & The realization of data sample\\
     \hline
     $C$ & Total number of classes\\
     \hline
     $P$ & Total number of attributes\\
     \hline
     $k $ & Index of client\\
    \hline
     $X_k $ & Feature variables on client $k$\\
    \hline
     $S_k $ & Attribute variables on client $k$\\
     \hline
     $\calP_k $ & Distribution on client $k$\\
     \hline
     $f_{\theta}$ & The prediction function parameterised by $\theta$\\
    \hline
    $n_k $ & The number of data examples for client $k$\\
    \hline 
    $n_\textmin := \min_{k} n_k$ & The minimal number of data samples across all clients \\
    \hline
    $\indicator[\cdot]$  & An indicator function\\
    % \hline
    % $\widehat \P[A]$  & The empirical probability of event $A$\\
    \hline
    $\P_c = \P[f_{\theta} (X, S)=c ]$ & The probability of the model predicting class $c$ \\
    \hline
    $\rho = \min_{c \in C} \P_c $ & The smallest model prediction probability over all classes.\\
    \hline
    $\tau$ & Total number of communication rounds\\
    \hline
    $e$ & Index of communication rounds\\
    \hline
    $M$ & Total number of local updates\\
    \hline
    $m$ & Index of local updates\\
    \hline
    $T$ & Total number of iterations\\
   \hline
\end{tabular}
\end{table}

\section{Theoretical Analysis}
In this section, we prove the theoretical results in this paper.

\subsection{Proof for Section \ref{subsubsection:theoretical_analysis_synchronous}}
In this section, we prove Theorem \ref{theorem:estimation_error_synchronous} and Proposition \ref{proposition:convergence_based_on_scaffold} in Section \ref{subsubsection:theoretical_analysis_synchronous}.

\subsubsection{Proof of Theorem \ref{theorem:estimation_error_synchronous}}
\label{Appendix: Proof of theorem estimation_error_synchronous}

\begin{theorem}
\label{theorem:estimation_error_synchronous_appendix}
(Theorem \ref{theorem:estimation_error_synchronous} restated, Estimation error of R\'enyi regularization for synchronous FedR\'enyi) 
Suppose $j_\textmin \backsim u_\textmin \backsim r_\textmin = O(1)$ and $n_\textmin \backsim \frac{n}{K \log(K)}$.
When $\gamma_k = \frac{n_k}{n}$ or $\frac{1}{K}$, for any global model $\theta$ and $\delta \in (0,1)$, the following inequality holds:
\begin{align*}
\P \Big [
\widehat H(\theta, \v) - H(\theta, \v)
\leq 
O \big(1/\sqrt{n} \big) \big | \theta \Big ]
\geq 1 - \delta.
\end{align*}
\end{theorem}

\begin{proof}
(of Theorem \ref{theorem:estimation_error_synchronous})

Before proving Theorem \ref{theorem:estimation_error_synchronous}, we first present some technical lemmas.  
Lemmas \ref{lemma:mean_of_sum_uniform_data} and \ref{lemma:mean_of_mean_uniform_clients} provide the estimation error of general random variables for $\gamma_k= n_k/n$ and $\gamma_k= 1/K$, respectively.  
Lemma \ref{lemma:fraction_error_bound} shows that 
the estimation errors from Lemmas \ref{lemma:mean_of_sum_uniform_data} and \ref{lemma:mean_of_mean_uniform_clients} can be transferred to each entry of matrix $\hat Q_{\theta}$.
Lemma \ref{lemma:Q_norm} bounds the norm of matrix $Q_{\theta}$.

\begin{lemma}
\label{lemma:mean_of_sum_uniform_data}
(mean-of-sum for $\gamma_k = \frac{n_k}{n}$)
For any distribution $\calP_k$ on different clients, denoting $V_k = \frac{1}{n_k} \sum_{i \in \calI_k} V_{k, i}$, then the condition $\gamma_k = \frac{n_k}{n}$ gives
\begin{align}
\P\Bigg\{ \Bigg| \E [V] - \frac{1}{n} \sum_{k=1}^K \sum_{i \in \calI_k } V_{k, i} \Bigg| \leq \sqrt{ \frac{ \log(2 / \delta)}{ 2 n } } \Bigg\}
\geq  1 - \delta .
\end{align}
\end{lemma}

\begin{lemma}
\label{lemma:mean_of_mean_uniform_clients}
(mean-of-mean for $\gamma_k = \frac{1}{K}$)
For any distribution $\calP_k$ on different clients, 
% denoting $V_k = \frac{1}{n_k} \sum_{i \in \calI_k} V_{k, i}$, 
define $n_\textmin := \min_{k=1,...,K} n_k$ as the minimal number of data samples across different clients, 
and $\mu_\textmin := \min_{k} \E[V_k]$,
then the condition $\gamma_k = \frac{1}{K}$ gives
\begin{align}
\label{ineq:proof}
&
\P\Bigg\{ \Bigg| \frac{1}{K} \sum_{k=1}^K V_k - \E\Bigg[ \frac{1}{K} \sum_{k=1}^K V_k \Bigg] \Bigg| 
\leq 
\sqrt{ \frac{ \log(2K / \delta) \cdot \log(4/\delta) }{  K n_\textmin \mu_\textmin } }  \Bigg\}
\geq 
1 - \delta. 
\end{align}
\end{lemma}

\begin{lemma}
\label{lemma:fraction_error_bound}
Suppose $|j(c,p) - \hat j(c,p)| \leq \epsilon$, $|u(c) - \hat u(c)| \leq \epsilon$ and $|r(p) - \hat r(p)| \leq \epsilon$.
Under $\epsilon = O \Big ( \frac{1}{\sqrt{n}} \Big )$ and $\hat j(c,p) \backsim \hat u(c) \backsim \hat r(p) = \Omega \big ( \frac{1}{\sqrt{n}} \big )$, the following inequality holds:
\begin{align}
\frac{j(c,p) r(p)}{ \sqrt{ u(c) r(p) } }
\leq 
\frac{\hat j(c,p) \hat r(p)}{ \sqrt{ \hat u(c) \hat r(p)} } 
+ O(\epsilon).
\end{align}
\end{lemma}

\begin{lemma}
\label{lemma:Q_norm}
Recall that $\rho = \min_{c \in C} \P_c $ is the smallest model prediction probability over all classes. Then, the norm of matrix $Q_{\theta}$
is bounded: $\|Q_{\theta} \| \leq \sqrt{\frac{C}{\rho}}$.
\end{lemma}

Now we begin to prove Theorem \ref{theorem:estimation_error_synchronous}.
Given fixed global model $\theta$, according to Lemma \ref{lemma:fraction_error_bound}, the estimation error between each entry of matrix $Q_{\theta}$ (i.e., $q_{cp}$) and $\widehat Q_{\theta}$ (i.e., $\widehat q_{cp}$) is bounded by $O(\epsilon)$, where $\epsilon$ is the estimation error bound in Lemma \ref{lemma:mean_of_sum_uniform_data} or \ref{lemma:mean_of_mean_uniform_clients}.
Thus, we define $ \varepsilon \in \R^{C \times P}$ as an estimation error matrix and $\varepsilon := Q_{\theta} - \hat Q_{\theta}$, where each entry $\varepsilon_{c, p} = \epsilon$ is the same.

First, we prove the estimation error of R\'enyi when $\gamma_k = \frac{n_k}{n}$:

\begin{align*}
&
\widehat H(\theta, \v)
= \v^{\top} \hat Q_{\theta}^{\top} \hat Q_{\theta} \v
\\
& \quad \quad \quad
= \v^{\top} (\hat Q_{\theta}^{\top} \hat Q_{\theta}) \v
\\ 
& \quad \quad \quad
\leq 
\v^{\top} \bigg [( Q_{\theta} + \varepsilon)^{\top} (Q_{\theta} + \varepsilon) \bigg ] \v
\\
& \quad \quad \quad
=
\v^{\top} \bigg [ Q_{\theta}^{\top}Q_{\theta} + \varepsilon^{\top} Q_{\theta} + Q_{\theta}^{\top} \varepsilon + \varepsilon^{\top} \varepsilon \bigg ] \v
\\
& \quad \quad \quad
\leq
\v^{\top} \bigg [ Q_{\theta}^{\top} Q_{\theta} + 2 \| \varepsilon \| \cdot \| Q_{\theta} \| + \| \varepsilon \|^2 \bigg ] \v
\\
& \quad \quad \quad
\leq
\v^{\top} \bigg [ Q_{\theta}^{\top} Q_{\theta} + 2(C \cdot P)^2 \cdot  \epsilon  \cdot \sqrt{\frac{C}{\rho}} + (C \cdot P)^2 \cdot  \epsilon^2 \bigg ] \v
\\
& \quad \quad \quad
=
\v^{\top} Q_{\theta}^{\top} Q_{\theta} \v+ \v^{\top} \bigg [ 2(C\cdot P)^2 \cdot \epsilon \cdot \sqrt{\frac{C}{\rho}} + (C \cdot P)^2 \cdot \epsilon^2 \bigg ] \v
\\
& \quad \quad \quad
\leq
\v^{\top} Q_{\theta}^{\top} Q_{\theta} \v + \| \v \| \cdot \bigg [ 2(C \cdot P)^2 \cdot \epsilon \cdot \sqrt{\frac{C}{\rho}} + (C \cdot P)^2 \cdot \epsilon^2 \bigg ] \cdot \|\v \|
\\
& \quad \quad \quad
\leq
\v^{\top} Q_{\theta}^{\top} Q_{\theta} \v + \bigg [ 2(C \cdot P)^2 \cdot \epsilon \cdot \sqrt{\frac{C}{\rho}} + (C \cdot P)^2 \cdot \epsilon^2 \bigg ],
\end{align*}
where the second, third, and fourth inequalities are due to the Cauchy--Schwarz inequality. 

Letting $\epsilon$ equal the bound in Lemma \ref{lemma:mean_of_sum_uniform_data}, the following inequality holds: 
\begin{align*}
\P \bigg [\widehat H(\theta, \v) - H(\theta, \v)
\leq 
C^2 P^2 (\frac{ \log(2 / \delta)}{ 2 n } + 2\sqrt{\frac{ C \log(2 / \delta)}{ 2 n \rho }} ) \bigg | \theta \bigg ]
\geq 1- \delta.
\end{align*}

Therefore, we have that:

\begin{align}
\label{eq:theorem_1_estimation_sample}
\P \bigg [\widehat H(\theta, \v) - H(\theta, \v)
\leq 
\tilde O \Big (\frac{1}{\sqrt{n}} \Big ) \bigg | \theta \bigg]
\geq 1- \delta, 
~\text{when } 
\gamma_k  = n_k/n.
\end{align}

Now we begin to prove estimation error when $\gamma_k = \frac{1}{K}$.

Let the estimation error of $\hat j_k(c,p)$, $\hat u_k(c)$ and $\hat r_k(p)$ be $\epsilon_j$, $\epsilon_u$ and $\epsilon_r$ respectively. 

Under the condition of $j_\textmin \backsim u_\textmin \backsim r_\textmin = O(1)$, according to Lemma \ref{lemma:mean_of_mean_uniform_clients}, the following holds:
\begin{align*}
&
\epsilon_j= \sqrt{\frac{ \log(2K / \delta) \cdot \log(4/\delta) }{ K n_\textmin j_\textmin }}
=
\sqrt{\frac{ \log(2K / \delta) \cdot \log(4/\delta) }{ K n_\textmin }},
\\ &
\epsilon_u= \sqrt{\frac{ \log(2K / \delta) \cdot \log(4/\delta) }{ K n_\textmin u_\textmin }}
=
\sqrt{\frac{ \log(2K / \delta) \cdot \log(4/\delta) }{ K n_\textmin }},
\\ &
\epsilon_r= \sqrt{\frac{ \log(2K / \delta) \cdot \log(4/\delta) }{ K n_\textmin r_\textmin }} 
=
\sqrt{\frac{ \log(2K / \delta) \cdot \log(4/\delta) }{ K n_\textmin }}.
\end{align*}

Thus, the estimation errors of $\hat j_k(c,p)$, $\hat u_k(c)$ and $\hat r_k(p)$ are the same:
$\sqrt{\frac{ \log(2K / \delta) \cdot \log(4/\delta) }{ K n_\textmin }}$.


Then, the following inequality holds: 
\begin{align*}
\P \Bigg [
\widehat H(\theta, \v) - H(\theta, \v) 
\leq 
C^2 P^2 (\frac{ \log(2K / \delta) \cdot \log(4/\delta) }{ K n_\textmin } + 2 \sqrt{\frac{ C \log(2K / \delta) \cdot \log(4/\delta) }{ K n_\textmin \rho }} ) \Bigg | \theta \Bigg ]
\geq 1- \delta.
\end{align*}

Due to $n_\textmin \backsim \frac{n}{K \log(K)}$, we have that:
\begin{align}
\label{eq:theorem_1_estimation_client}
\P \bigg [
\widehat H(\theta, \v) - H(\theta, \v) 
\leq 
\tilde O \Big (\frac{1}{\sqrt{n}} \Big ) \bigg | \theta \bigg ]
\geq 1- \delta,
~\text{when } 
\gamma_k  = 1/K.
\end{align}

Combining Equations \eqref{eq:theorem_1_estimation_sample} and \eqref{eq:theorem_1_estimation_client}, we have that:

\begin{align}
\label{eq:theorem_1_estimation_total}
\P \bigg [
\widehat H(\theta, \v) - H(\theta, \v) 
\leq 
\tilde O \Big  (\frac{1}{\sqrt{n}} \Big ) \bigg | \theta \bigg]
\geq 1- \delta.
\end{align}

Thus, 
the proof of Theorem \ref{theorem:estimation_error_synchronous} is finished.

\end{proof}

\subsubsection{ Proof of Lemma \ref{lemma:mean_of_sum_uniform_data}}

\begin{proof}
(of Lemma \ref{lemma:mean_of_sum_uniform_data})
Define $X_{k,i} = \indicator[ \calE_{k,i} ]$ for an event $\calE_{k,i}$.
The event $\calE_{k,i}$ can be instantiated with the components contained in $Q_\theta$. 
For example, for a data point $(x, y, s)$, it can be the event $s = 0$, the event $f_{\theta_k,t}(x) = 1$, or the joint event $s = 1 \text{ and } f_{\theta_k,t}(x) = 1$.

Define $V_{k,i} := X_{k,i}$.
Then we apply Hoeffding's inequality across $n$ data samples as follows
\begin{align*}
& \P\Bigg\{ \Bigg| \sum_{k=1}^K \sum_{i \in \calI_k } V_{k,i} - \E\Bigg[ \sum_{k=1}^K \sum_{i \in \calI_k } V_{k,i} \Bigg] \Bigg| \leq t \Bigg\}
\\
\geq &
 1 - 2 \exp\Big( - \frac{ 2 t^2 }{ \sum_{k=1}^K \sum_{i \in \calI_k } (1)^2 } \Big) 
\\
\geq &
1 - \delta ,
\end{align*} 
or equivalently
\begin{align}
\P\Bigg\{ \Bigg| \E [V] - \frac{1}{n} \sum_{k=1}^K \sum_{i \in \calI_k } V_{k, i} \Bigg| \leq \sqrt{ \frac{ \log(2 / \delta)}{ 2 n } } \Bigg\}
\geq 
1 - \delta .
\end{align}
\end{proof}

\subsubsection{ Proof of Lemma \ref{lemma:mean_of_mean_uniform_clients} }

\begin{proof}
(of Lemma \ref{lemma:mean_of_mean_uniform_clients})
Define $X_{k,i} = \indicator[ \calE_{k,i} ]$ for an event $\calE_{k,i}$.
The event $\calE_{k,i}$ can be instantiated with the components contained in $Q_\theta$. 
For example, for a data point $(x, y, s)$, it can be the event $s = 0$, the event $f_{\theta_k,t}(x) = 1$, or the joint event $s = 1 \text{ and } f_{\theta_k,t}(x) = 1$.

For fixed $k \in \{1, ..., K\}$, applying Chernoff bound, we have
\begin{align}
\P\Bigg\{ \Bigg| \sum_{i \in \calI_k } X_{k, i} - \E\Bigg[ \sum_{i \in \calI_k } X_{k, i} \Bigg] \Bigg| \geq \epsilon \E\Bigg[ \sum_{i \in \calI_k } X_{k, i} \Bigg] \Bigg\}
\leq 
\exp\Bigg( - \frac{ \epsilon^2 \E[ \sum_{i \in \calI_k } X_{k, i} ] }{ 3 } \Bigg) ,
\end{align}
or equivalently,
\begin{align}
\P\Bigg\{ \Bigg| \sum_{i \in \calI_k } X_{k, i} - \E\Bigg[ \sum_{i \in \calI_k } X_{k, i} \Bigg] \Bigg| \leq \epsilon \E\Bigg[ \sum_{i \in \calI_k } X_{k, i} \Bigg] \Bigg\}
\geq 
1 - \exp\Bigg( - \frac{ \epsilon^2 \E[ \sum_{i \in \calI_k } X_{k, i} ] }{ 3 } \Bigg) .
\end{align}

Define $V_k := \frac{1}{n_k} \sum_{i \in \calI_k } X_{k, i}$.
Let $\calG_k$ be a {\it good} event such that $( 1 - \epsilon ) \E[V_k] \leq V_k \leq ( 1 + \epsilon ) \E[V_k]$.
From the above result, we know that $\P\{ \calG_k \} \geq 1 - \exp\Bigg( - \frac{ \epsilon^2 \E[ \sum_{i \in \calI_k } X_{k, i} ] }{ 3 } \Bigg)$.
Furthermore, denote $\calG = \bigcap_{k=1}^K \calG_k$.
Therefore, 
\begin{align}
\P\{ \calG \} 
\geq 
\prod_{k=1}^K \Bigg( 1 - \exp\Big( - \frac{ \epsilon^2 \E[ \sum_{i \in \calI_k } X_{k, i} ] }{ 3 } \Big) \Bigg) 
\geq 
1 - \sum_{k=1}^K \exp\Big( - \frac{ \epsilon^2 \E[ \sum_{i \in \calI_k } X_{k, i} ] }{ 3 } \Big) 
\geq 
1 - K \cdot \exp\Big( - \frac{ \epsilon^2 n_\textmin \mu_\textmin }{ 3 } \Big) .
\end{align}


Then we apply Hoeffding's inequality across $K$ clients as follows
\begin{align*}
&
\P\Bigg\{ \Bigg| \sum_{k=1}^K V_k - \E\Bigg[ \sum_{k=1}^K V_k \Bigg] \Bigg| \leq t \Bigg\}
\\
= &
\P\{ \calG \} \cdot \P\Bigg\{ \Bigg| \sum_{k=1}^K V_k - \E\Bigg[ \sum_{k=1}^K V_k \Bigg] \Bigg| \leq t \Bigg| \calG \Bigg\}
+ ( 1 - \P\{ \calG \} ) \cdot \P\Bigg\{ \Bigg| \sum_{k=1}^K V_k - \E\Bigg[ \sum_{k=1}^K V_k \Bigg] \Bigg| \leq t \Bigg| \overline{\calG} \Bigg\}
\\
\geq &
\Bigg( 1 - K \cdot \exp\Big( - \frac{ \epsilon^2 n_\textmin \mu_\textmin }{ 3 } \Big) \Bigg) \cdot 
\Bigg( 1 - 2 \exp\Big( - \frac{ 2 t^2 }{ \sum_{k=1}^K ( 2 \epsilon \E[V_k] )^2 } \Big) \Bigg)
\\
\geq &
1 
- K \cdot \exp\Big( - \frac{ \epsilon^2 n_\textmin \mu_\textmin }{ 3 } \Big)
- 2 \exp\Big( - \frac{ 2 t^2 }{ \sum_{k=1}^K ( 2 \epsilon \E[V_k] )^2 } \Big) .
\end{align*}


If $\epsilon \geq \sqrt{ \frac{ 3 \log(2K / \delta)}{ n_\textmin \mu_\textmin } }$, then $K \cdot \exp\Big( - \frac{ \epsilon^2 n_\textmin \mu_\textmin }{ 3 } \Big) \leq \delta / 2$.
On the other hand, if $t \geq \sqrt{ \frac{ K \log(2K / \delta) \cdot \log(4/\delta) }{ n_\textmin \mu_\textmin } }$, then
 $2 \exp\Big( - \frac{ 2 t^2 }{ \sum_{k=1}^K ( 2 \epsilon \E[V_k] )^2 } \Big) \leq \delta / 2$.


Therefore, we have
\begin{align*}
&
\P\Bigg\{ \Bigg| \sum_{k=1}^K V_k - \E\Bigg[ \sum_{k=1}^K V_k \Bigg] \Bigg| \leq t \Bigg\}
\\
\geq &
\P\Bigg\{ \Bigg| \sum_{k=1}^K V_k - \E\Bigg[ \sum_{k=1}^K V_k \Bigg] \Bigg| \leq \sqrt{ \frac{ K \log(2K / \delta) \cdot \log(4/\delta) }{ n_\textmin \mu_\textmin } } \Bigg\}
\\
\geq &
1 - \delta ,
\end{align*}
or equivalently
\begin{align}
\P\Bigg\{ \Bigg| \frac{1}{K} \sum_{k=1}^K V_k - \E\Bigg[ \frac{1}{K} \sum_{k=1}^K V_k \Bigg] \Bigg| \leq  \sqrt{ \frac{ \log(2K / \delta) \cdot \log(4/\delta) }{ K n_\textmin \mu_\textmin } } \Bigg\}
\geq 
1 - \delta .
\end{align}
\end{proof}

\subsubsection{ Proof of Lemma \ref{lemma:fraction_error_bound}}

\begin{proof}
(of Lemma \ref{lemma:fraction_error_bound})

Lemmas \ref{lemma:mean_of_sum_uniform_data} and \ref{lemma:mean_of_mean_uniform_clients} show the estimation error of single statistics $\hat u(c)$ and $\hat j(c,p)$. Now we study how this estimation error transfers to each entry of the matrix $\hat Q_{\theta}$, where $\widehat q_{cp}:= \frac{ \hat j(c,p) \cdot \hat r(p) }{ \sqrt{ \hat u(c) \cdot \hat r(p) } }$: 

\begin{align*}
( \frac{ j r }{ \sqrt{ u r } } )^2
\leq & 
\frac{ (\hat j + \epsilon )^2 ( \hat r + \epsilon )^2 }{ ( \hat u - \epsilon ) ( \hat r - \epsilon ) }
=
\frac{ ( \hat j^2 + \epsilon^2 + 2 \hat j \epsilon ) ( \hat r^2 + \epsilon^2 + 2 \hat r \epsilon ) }{ \hat u \hat r - \epsilon ( \hat r + \hat u ) + \epsilon^2 }
\\
= &
\frac{ \hat j^2 \hat r^2 + \hat j^2 \epsilon^2 + 2 \hat j^2 \hat r \epsilon + \epsilon^2 \hat r^2 + \epsilon^4 + 2 \hat r \epsilon^3 + 2 \hat j \hat r^2 \epsilon + 2 \hat j \epsilon^3 + 4 \hat j \hat r \epsilon^2 }{ \hat u \hat r - \epsilon ( \hat r + \hat u ) + \epsilon^2 }
\\
\leq &
\frac{ \hat j^2 \hat r^2 + \hat j^2 \epsilon^2 + 2 \hat j^2 \hat r \epsilon + \epsilon^2 \hat r^2 + \epsilon^4 + 2 \hat r \epsilon^3 + 2 \hat j \hat r^2 \epsilon + 2 \hat j \epsilon^3 + 4 \hat j \hat r \epsilon^2 + \epsilon ( \hat r + \hat u ) - \epsilon^2 }{ \hat u \hat r }
\\
= &
\frac{ \hat j^2 \hat r^2 }{ \hat u \hat r }
+ O(\epsilon) ,
\end{align*}

By Lemma \ref{lemma 2}, the last inequality holds because $\epsilon$ is upper bounded by the order of $\tilde O (\frac{1}{\sqrt{n}})$, while $\hat j(c,p)$, $\hat u(c)$ and $\hat r(p)$ are all empirical probabilities between $0$ and $1$.
    
\end{proof}
    
\begin{lemma}
\label{lemma 2}
For any positive numbers $v, p, q$ with $0 < v\leq p \leq q-v$, if $v\leq q^2 / ( 2p )$, then the following inequality holds:
\begin{align}
    \label{lemma 2 eq}
    \frac{p}{q - v} - \frac{p+v}{q} \leq 0.
\end{align}
\end{lemma}


\begin{proof}
(of Lemma \ref{lemma 2})

From the conditions $v\leq q^2 / ( 2p )$ and $0 < v\leq p \leq q-v$, we have:
\begin{align}
\label{eq:lemma_4_1}
v \leq q^2 / ( 2p )  \leq q^2 / (p+v ).
\end{align}
    
Together with $p + v \leq q$ (and hence $v \leq q$), Inequality \eqref{eq:lemma_4_1} yields:
\begin{align}
\label{eq:lemma_4_2}
( p + v ) v \leq vq \leq q^2.
\end{align}

Then, the following inequality holds:
\begin{align*}
&
\frac{p}{q - v} - \frac{p+v}{q}
\\ 
= &
\frac{ pq - ( p + v )( q - v ) }{ ( q - v ) q }
\\ 
= &
\frac{ pq - pq + pv - qv + v^2 }{ ( q - v ) q }
\\ 
= &
\frac{ v ( p - q + v ) }{ ( q - v ) q }
\\ 
= &
\frac{ v ( p + v ) - vq }{ q^2 - vq }
\\
\leq & 0,
\end{align*}
where the last inequality is due to Inequality \eqref{eq:lemma_4_2}. 

Thus, Lemma \ref{lemma 2} is proved. 
\end{proof}

% \begin{proof}
% (of Lemma \ref{lemma:mean_of_sum_uniform_data})
% Define $X_{k,i} = \indicator[ \calE_{k,i} ]$ for an event $\calE_{k,i}$.
% This event $\calE$ can be instantiated with the components contained in $Q_\theta$. 
% For example, it can be defined by the case where for a data $(X, Y, S)$, $S = 1 $, $f_\theta(X) = 1$, or $S = 1 \text{ and } f_\theta(X) = 1$.

% Define $V_{k,i} := \frac{1}{n} \sum_{k=1}^K \sum_{i \in \calI_k } X_{k, i}$.

% Then we apply Hoeffding inequality across $n$ data samples as follows
% \begin{align*}
% &
% \P\Bigg\{ \Bigg| \sum_{k=1}^K \sum_{i \in \calI_k } V_{k,i} - \E\Bigg[ \sum_{k=1}^K \sum_{i \in \calI_k } V_{k,i} \Bigg] \Bigg| \leq t \Bigg\}
% \\
% \geq &
%  1 - 2 \exp\Big( - \frac{ 2 t^2 }{ \sum_{k=1}^K \sum_{i \in \calI_k } (1)^2 } \Big) 
% \\
% \geq &
% 1 - \delta ,
% \end{align*} 
% or equivalently
% \begin{align*}
% \P\Bigg\{ \Bigg| \E [V] - \frac{1}{n} \sum_{k=1}^K \sum_{i \in \calI_k } V_{k, i} \Bigg] \Bigg| \leq \sqrt{ \frac{ \log(2 / \delta)}{ 2 n } } \Bigg\}
% \geq 
% 1 - \delta .
% \end{align*}
% \end{proof}

\subsubsection{Proof of Lemma \ref{lemma:Q_norm}}

\begin{proof}
(of Lemma \ref{lemma:Q_norm})

\begin{align*} 
\| Q_{\theta} \|
= & 
\sqrt{\sum^C_{c = 1} \sum^P_{p =1} \bigg( \frac{ j(c,p) \cdot r(p) }{ \sqrt{ u(c) \cdot r(p) } } \bigg)^2 }
\\
= & 
\sqrt{ \sum^C_{c = 1} \sum^P_{p =1} \frac{ j(c,p)^2 \cdot r(p) }{ u(c) } } 
\\
\leq & 
\sqrt{ \sum^C_{c = 1} \sum^P_{p =1} \frac{ r(p) }{ u(c) } }
\\
= & 
\sqrt{ \sum^C_{c = 1} \frac{ 1 }{ u(c) } }
\\
\leq & 
\sqrt{ \frac{C}{\rho} },
\end{align*}
where the first inequality is due to $j(c,p) \leq 1$, the subsequent equality uses $\sum_{p=1}^P r(p) = 1$, and the second inequality is due to the definition of $\rho$.

\end{proof}
    
    % \begin{lemma}
    % \label{lemma:mean_of_sum_uniform_data}
    % (Mean-of-sum for $\gamma_k = \frac{n_k}{n}$)
    % For any distribution $\calP_k$ on different clients, denoting $V_k = \frac{1}{n_k} \sum_{i \in \calI_k} V_{k, i}$, then the condition $\gamma_k = \frac{n_k}{n}$ gives
    % \begin{align*}
    % \P\Bigg\{ \Bigg| \E [V] - \frac{1}{n} \sum_{k=1}^K \sum_{i \in \calI_k } V_{k, i} \Bigg] \Bigg| \leq \sqrt{ \frac{ \log(2 / \delta)}{ 2 n } } \Bigg\}
    % \geq 
    % 1 - \delta .
    % \end{align*}
    % \end{lemma}
    
    % \begin{proof}
    % (of Lemma \ref{lemma:mean_of_sum_uniform_data})
    % Define $X_{k,i} = \indicator[ \calE_{k,i} ]$ for an event $\calE_{k,i}$.
    % This event $\calE$ can be instantiated with the components contained in $Q_\theta$. 
    % For example, it can be defined by the case where for a data $(X, Y, S)$, $S = 1 $, $f_\theta(X) = 1$, or $S = 1 \text{ and } f_\theta(X) = 1$.
    
    % Define $V_{k,i} := \frac{1}{n} \sum_{k=1}^K \sum_{i \in \calI_k } X_{k, i}$.
    
    % Then we apply Hoeffding inequality across $n$ data samples as follows
    % \begin{align*}
    % &
    % \P\Bigg\{ \Bigg| \sum_{k=1}^K \sum_{i \in \calI_k } V_{k,i} - \E\Bigg[ \sum_{k=1}^K \sum_{i \in \calI_k } V_{k,i} \Bigg] \Bigg| \leq t \Bigg\}
    % \\
    % \geq &
    %  1 - 2 \exp\Big( - \frac{ 2 t^2 }{ \sum_{k=1}^K \sum_{i \in \calI_k } (1)^2 } \Big) 
    % \\
    % \geq &
    % 1 - \delta ,
    % \end{align*} 
    % or equivalently
    % \begin{align*}
    % \P\Bigg\{ \Bigg| \E [V] - \frac{1}{n} \sum_{k=1}^K \sum_{i \in \calI_k } V_{k, i} \Bigg] \Bigg| \leq \sqrt{ \frac{ \log(2 / \delta)}{ 2 n } } \Bigg\}
    % \geq 
    % 1 - \delta .
    % \end{align*}
    % \end{proof}
    
    % \begin{lemma}
    % \label{lemma:mean_of_mean_uniform_clients}
    % (Mean-of-mean for $\gamma_k = \frac{1}{K}$)
    % For any distribution $\calP_k$ on different clients, 
    % % denoting $V_k = \frac{1}{n_k} \sum_{i \in \calI_k} V_{k, i}$, 
    % define $n_\textmin := \min_{k=1,...y,K} n_k$ as the minimal number of data samples across different clients, 
    % and $\mu_\textmin := \min_{k} V_k$,
    % then the condition $\gamma_k = \frac{1}{K}$ gives
    % \begin{align*}
    % &
    % \P\Bigg\{ \Bigg| \frac{1}{K} \sum_{k=1}^K V_k - \E\Bigg[ \frac{1}{K} \sum_{k=1}^K V_k \Bigg] \Bigg| \\
    % & \quad
    % \leq 
    % \sqrt{ \frac{ \log(2K / \delta) \cdot \log(4/\delta) }{ \yanred{ K n_\textmin \mu_\textmin } } } \Bigg\}
    % \geq 
    % 1 - \delta 
    % \end{align*}
    % \end{lemma}
    
    % \begin{remark}
    % Lemma \ref{lemma:mean_of_sum_uniform_data} and Lemma \ref{lemma:mean_of_mean_uniform_clients} show that with high probability at least $1-\delta$, the approximation error bounds of any variables are $\sqrt{ \frac{ \log(2 / \delta)}{ 2 n } }$ and $\sqrt{ \frac{ \log(2K / \delta) \cdot \log(4/\delta) }{ \yanred{ K n_\textmin \mu_\textmin } } } $ respectively. 
    % In FedR\'enyi algorithm, these variables refer to $\hat j(c, p)$, $\hat r(p)$ and $\hat u(c)$.
    % Denote the approximation error in lemma \ref{lemma:mean_of_sum_uniform_data} and lemma \ref{lemma:mean_of_mean_uniform_clients} as $\epsilon_1$ and $\epsilon_2$ respectively, then
    % $\epsilon_2 / \epsilon_1 \geq \Big (\sqrt{4 \log(2K / \delta)} \Big )$. When $K$ increase, the error proportion lower bound will also increase, which means that more clients will make the approximation error of uniform client larger. Our algorithm \ref{alg:FR} in uniform client setting will be more suitable for cross-silo scenario.
    % \end{remark}
    
    % \begin{lemma}
    % \label{lemma:fraction_error_bound}
    % Suppose $|j(c,p) - \hat j(c,p)| \leq \epsilon$, $|u(c) - \hat u(c)| \leq \epsilon$ and $|r(p) - \hat r(p)| \leq \epsilon$.
    % Under $\epsilon = O \Big ( \frac{1}{\sqrt{n}} \Big )$ and $\hat j(c,p) \backsim \hat u(c) \backsim \hat r(p) = \Omega \big ( \frac{1}{\sqrt{n}} \big )$, the following inequality holds
    % \begin{align*}
    % \frac{j(c,p) r(p)}{ \sqrt{ u(c) r(p) } }
    % \leq 
    % \frac{\hat j(c,p) \hat r(p)}{ \sqrt{ \hat u(c) \hat r(p)} } 
    % + O(\epsilon).
    % \end{align*}
    % \end{lemma}
    
    % \begin{remark}
    % The above result shows that the estimation error $\epsilon$ of each component in $q_{c,p}$ could be transferred to $q_{c,p}$, where $q_{c,p}$ is each entry of matrix $\hat Q_{\theta}$. In other words, $\epsilon$ could also bound the estimation error of $q_{c,p}$. The assumption is easily to hold because $\hat j(c,p)$, $\hat u(c)$ and $\hat r(p)$ are all the probability between 0 and 1. The specific number of $\epsilon$ is given in lemma \ref{lemma:mean_of_sum_uniform_data} and lemma \ref{lemma:mean_of_mean_uniform_clients}, which shows that $\epsilon$ is upper bounded the order of $\tilde O \Big (\frac{1}{\sqrt{n}} \Big )$.
    % \end{remark}
    
    
    % \begin{assumption}
    % \label{assumption}
    % For any distribution $\calP_k$ on different clients, $j_\textmin \backsim u_\textmin \backsim r_\textmin = O(1)$
    % \end{assumption}
    
    % \begin{theorem}
    % \label{theorem: regularization error of uniform data}
    % (The regularization estimation error of uniform distribution setting) 
    % When $\gamma_k = \frac{n_k}{n}$, with probability at least $1-\delta$ where $\delta >0 $, the following inequality holds:
    % \begin{align}
    % H(\theta, \v) \! - \! \hat G(\hat \theta, \v) 
    % \! \leq \! 
    % c^2 p^2 \! \Bigg(\frac{ \log(2 / \delta)}{ 2 n } \! + \! 2\sqrt{\frac{ \log(2 / \delta)}{ 2 n }} \Bigg)
    % \end{align}
    % \end{theorem}
    
    % \begin{remark}
    % This theorem shows that with high probability, the estimation error from global population to local empirical regularization function is upper bound by $\tilde O \Big (\frac{1}{\sqrt{n}} \Big )$ in uniform distribution setting. Larger number of data could improve the approximation accuracy.
    % \end{remark}
    
    % % \begin{proof}
    % % (of Theorem \ref{theorem: regularization error of uniform data})
    % % Define $ \varepsilon \in \R^{C \times P}$ is an estimation matrix where each entry $\varepsilon_{c, p} = \epsilon$ is the same.
    % % \begin{align}
    % % \begin{aligned}
    % % \label{ineq:proof}
    % % &
    % % H(\theta, \v)
    % % = \v^{\top} Q_{\theta}^{\top} Q_{\theta} \v
    % % \\
    % % & \quad \quad \quad
    % % = \v^{\top} (Q_{\theta}^{\top} Q_{\theta}) \v
    % % \\ 
    % % & \quad \quad \quad
    % % \leq 
    % % \v^{\top} \bigg [(\hat Q_{\hat \theta} + \varepsilon)^{\top} (\hat Q_{\hat\theta} + \varepsilon) \bigg ] \v
    % % \\
    % % & \quad \quad \quad
    % % =
    % % \v^{\top} \bigg [ \hat Q_{\theta}^{\top} \hat Q_{\theta} + \varepsilon^{\top} \hat Q_{\theta} + \hat Q_{\theta}^{\top} \varepsilon + \varepsilon^{\top} \varepsilon \bigg ] \v
    % % \\
    % % & \quad \quad \quad
    % % \leq
    % % \v^{\top} \bigg [ \hat Q_{\theta}^{\top} \hat Q_{\theta} + 2 \| \varepsilon \| \cdot \| \hat Q_{\theta} \| + \| \varepsilon \|^2 \bigg ] \v
    % % \\
    % % & \quad \quad \quad
    % % \leq
    % % \v^{\top} \bigg [ \hat Q_{\theta}^{\top} \hat Q_{\theta} + 2(cp)^2 \cdot  \epsilon  \cdot 1 + (cp)^2 \cdot  \epsilon^2 \bigg ] \v
    % % \\
    % % & \quad \quad \quad
    % % =
    % % \v^{\top} \hat Q_{\theta}^{\top} \hat Q_{\theta} \v+ \v^{\top} \bigg [ 2(cp)^2 \cdot \epsilon + (cp)^2 \cdot \epsilon^2 \bigg ] \v
    % % \\
    % % & \quad \quad \quad
    % % \leq
    % % \v^{\top} \hat Q_{\theta}^{\top} \hat Q_{\theta} \v + \| \v^{\top} \| \cdot \bigg [ (2(cp)^2 \cdot \epsilon + (cp)^2 \cdot \epsilon^2 \bigg ] \cdot \|\v \|
    % % \\
    % % & \quad \quad \quad
    % % \leq
    % % \v^{\top} \hat Q_{\theta}^{\top} \hat Q_{\theta} \v + \bigg [ 2(cp)^2 \cdot  \epsilon + (cp)^2 \cdot \epsilon^2 \bigg ]
    % % \end{aligned}
    % % \end{align}
    % % Where the second, third and fourth inequality is due to Cauchy–Schwarz inequality. 
    
    
    % % Combined with lemma \ref{lemma:mean_of_sum_uniform_data}, the following inequality holds: 
    % % \begin{align*}
    % % H(\theta, \v) - \hat G^{ud}(\hat \theta, \v) 
    % % \leq 
    % % c^2 p^2 (\frac{ \log(2 / \delta)}{ 2 n } + 2\sqrt{\frac{ \log(2 / \delta)}{ 2 n }})
    % % \end{align*}
    
    % % \end{proof}
    
    % \begin{theorem}
    % \label{theorem: regularization error of uniform client}
    % (The regularization estimation error of uniform client setting) 
    % Supposing Assumption \ref{assumption} holds, when $\gamma_k = \frac{1}{K}$, with probability at least $1-\delta$ where $\delta >0 $, the following inequality holds:
    % \begin{align}
    % \begin{aligned}
    % &
    % H(\theta, \v) - \hat G(\hat \theta, \v) 
    % \leq 
    % c^2 p^2 \Bigg(\frac{ \log(2K / \delta) \cdot \log(4/\delta) }{ K n_\textmin } \\ 
    % & \quad \quad \quad \quad \quad \quad \quad \quad
    % + 2 \sqrt{\frac{ \log(2K / \delta) \cdot \log(4/\delta) }{ K n_\textmin }} \Bigg)
    % \end{aligned}
    % \end{align}
    % \end{theorem}
    
    % \begin{remark}
    % This theorem shows that with high probability, the estimation error from global population to local empirical regularization function is upper bound by $\tilde O \Big (\frac{1}{\sqrt{K n_\textmin}} \Big )$ in uniform client setting. 
    % % Because $K n_\textmin = O(n)$, the upper bound could also be rewritten as $\tilde O (\frac{1}{\sqrt{n}})$.
    % Larger number of clients could improve the approximation accuracy.
    % \end{remark}
    
    % % \begin{proof}
    % % (of Theorem \ref{theorem: regularization error of uniform client})
    % % Define the estimation error of $\hat j(c,p)$, $\hat u(c)$ and $\hat r(p)$ as $\epsilon_j$, $\epsilon_u$ and $\epsilon_r$ respectively. 
    
    % % Under the condition of assumption \ref{assumption}, according lemma \ref{lemma:mean_of_mean_uniform_clients}, the following holds:
    % % \begin{align*}
    % % &
    % % \epsilon_j= \sqrt{\frac{ \log(2K / \delta) \cdot \log(4/\delta) }{ K n_\textmin j_\textmin }}
    % % \approx
    % % \sqrt{\frac{ \log(2K / \delta) \cdot \log(4/\delta) }{ K n_\textmin }}
    % % \\ &
    % % \epsilon_u= \sqrt{\frac{ \log(2K / \delta) \cdot \log(4/\delta) }{ K n_\textmin u_\textmin }}
    % % \approx
    % % \sqrt{\frac{ \log(2K / \delta) \cdot \log(4/\delta) }{ K n_\textmin }}
    % % \\ &
    % % \epsilon_r= \sqrt{\frac{ \log(2K / \delta) \cdot \log(4/\delta) }{ K n_\textmin r_\textmin }} 
    % % \approx
    % % \sqrt{\frac{ \log(2K / \delta) \cdot \log(4/\delta) }{ K n_\textmin }}
    % % \end{align*}
    
    % % Combined with lemma \ref{lemma:fraction_error_bound}, the $\epsilon$ in inequality (\ref{ineq:proof}) is:
    % % \begin{align*}
    % % \sqrt{\frac{ \log(2K / \delta) \cdot \log(4/\delta) }{ K n_\textmin }}
    % % \end{align*}
    
    % % Then, the following inequality holds: 
    % % \begin{align*}
    % % &
    % % H(\theta, \v) - \hat G^{ud}(\hat \theta, \v) 
    % % \leq 
    % % c^2 p^2 (\frac{ \log(2K / \delta) \cdot \log(4/\delta) }{ K n_\textmin } + 2 \sqrt{\frac{ \log(2K / \delta) \cdot \log(4/\delta) }{ K n_\textmin }} )
    % % \end{align*}
    % % \end{proof}
    
    % \begin{corollary}
    % \label{corollary 1}
    % Supposing that assumptions in Theorem \ref{theorem: regularization error of uniform client} hold, if $n_\textmin \backsim \shired{ \frac{n}{K}}$, then
    % \begin{align*}
    % \frac{ \log(2K / \delta) \cdot \log(4/\delta) }{ \yanred{ K n_\textmin} } 
    % \backsim
    % \tilde O \bigg (\frac{1}{\sqrt{n}} \bigg )
    % \end{align*}
    % \end{corollary}
    
    % \begin{remark}
    % The above corollary shows that if $n_\textmin$ is at the same order of the client average data samples $\frac{n}{K}$ and under assumption \ref{assumption}, 
    % % where $j_\textmin$, $u_\textmin$ and $r_\textmin$ are at the same order and close to constant, 
    % then the approximation error of Theorem \ref{theorem: regularization error of uniform client} is at $\tilde O \Big (\frac{1}{\sqrt{n}} \Big )$ order, which is the same to the order in Theorem \ref{theorem: regularization error of uniform data}. 
    % \end{remark}
    
    % \begin{corollary}
    % \label{corollary 2}
    % The total estimation error of problem (\ref{eq:fedrenyi_empirical}) is:\\
    % (i) when $\gamma_k= \frac{n_k}{n}$:
    % \begin{align*}
    % \bigg[L(\theta) + \lambda H(\theta, \v) \bigg] - 
    % \bigg[ \hat L(\theta) + \lambda \hat H(\theta, \v) \bigg]
    % \leq 
    % \tilde O \bigg (\frac{1}{\sqrt{n}} \bigg );
    % \end{align*}
    % (ii) supposing that assumptions in Theorem \ref{theorem: regularization error of uniform client} hold:
    % \begin{align*}
    % &
    % \bigg[L(\theta) + \lambda H(\theta, \v) \bigg] - 
    % \bigg[ \hat L(\theta) + \lambda \hat H(\theta, \v) \bigg] \\
    % & \leq 
    % \tilde O \bigg(\frac{1}{\sqrt{n}} + \frac{1}{\sqrt{K n_\textmin}} \bigg).
    % \end{align*}
    % \end{corollary}
    
    % \begin{remark}
    % This corollary shows that the upper bound orders of total estimation error under uniform distribution and uniform client setting is $\tilde O \Big (\frac{1}{\sqrt{n}} \Big )$ and $\tilde O \Big (\frac{1}{\sqrt{n}} + \frac{1}{\sqrt{K n_\textmin}} \Big )$ respectively. Under the condition of Corollary \ref{corollary 1}, the two orders are the same.
    % \end{remark}
    
% \newpage{}
% \section{Technical Proofs}

    % In this section, we prove the theoretical results in this paper.


\subsubsection{Proof of Proposition \ref{proposition:convergence_based_on_scaffold}}
\label{Appendix: Proof of proposition convergence_based_on_scaffold}
\begin{proposition}
\label{proposition:convergence_based_on_scaffold_appendix}
(Proposition \ref{proposition:convergence_based_on_scaffold} restated, convergence of FedR\'enyi)
Suppose $\eta \leq O(1/M)$ and $L_k(\theta)$ satisfies $(G_L, B_L)$-bounded gradient dissimilarity, where $\frac{1}{K} \sum^K_{k=1}  \| \frac{ \partial L_k(\theta) }{ \partial \theta } \|^2 \leq G_L^2 + B_L^2 \| \frac{ \partial L(\theta) }{ \partial \theta } \|^2$.
If 
$\| \frac{ \partial L(\theta) }{ \partial \theta } \|^2, \| \frac{ \partial Q_\theta }{ \partial \theta } \|^2, \| \frac{ \partial \widehat Q_\theta }{ \partial \theta } \|^2, \| \frac{ \partial \v_\theta }{ \partial \theta } \|^2$ are bounded by $\bar G $ for all $\theta$, 
then $H(\theta, \v)$ satisfies $(G_H, B_H)$-bounded gradient dissimilarity, where $G_H$ is $\frac{ C \bar G ( \rho + C) }{\rho^2}$ and $B_H$ is $1$.
$F(\theta)$ also satisfies $(G_F, B_F)$-bounded gradient dissimilarity, where $B_F = 2 B_L^2 $ and $G_F = 2G_L^2 + (4 \lambda - 2 B_L^2\lambda^2) \frac{ C \bar G ( \rho + C) }{\rho^2}
+ 4 B_L^2 \lambda \cdot \sqrt{\frac{ C \bar G^2 ( \rho + C) }{\rho^2} }$. 
Thus, the FedR\'enyi algorithm achieves $\E[ \| \nabla \widehat F(\theta_T) \|^2 ] \leq \epsilon $ and $\E[ \| \nabla F(\theta_T) \|^2 ] \leq \epsilon + O\Big( \frac{1}{n} + \max_{\theta} \big\| \frac{ \partial \widehat Q_\theta }{ \partial \theta } - \frac{ \partial Q_\theta }{ \partial \theta } \big\|^2 \Big)$ when $T \geq O(1/\epsilon^2)$.
\end{proposition}

\begin{proof}
(of Proposition \ref{proposition:convergence_based_on_scaffold} )

Before we prove that $F(\theta)$ satisfies $(G_F, B_F)$-bounded gradient dissimilarity, we first prove that $H(\theta, \v)$ satisfies $(G_H, B_H)$-bounded gradient dissimilarity:

\begin{align*}
& 
\frac{1}{K} \sum^K_{k=1} \bigg \| \frac{ \partial H_k(\theta_k, \v)}{ \partial \theta_k} \bigg \|^2 
\\
= & 
\frac{1}{K} \sum^K_{k=1} \bigg \| Q_{\theta_k} \cdot \v_{\theta} \cdot \Big [  \frac{ \partial Q_{\theta_k}}{ \partial \theta_k} \cdot \v_{\theta} + \frac{ \partial \v_{\theta}}{ \partial \theta} \cdot Q_{\theta_k} \Big ] \bigg \|^2
\\
= & 
\frac{1}{K} \sum^K_{k=1} \| Q_{\theta_k} \|^2 \cdot \| \v_{\theta} \|^2 \cdot \Bigg [ \bigg \| \frac{ \partial Q_{\theta_k}}{ \partial \theta_k} \bigg\|^2 \cdot \| \v_{\theta} \|^2 + \bigg \| \frac{ \partial \v_{\theta}}{ \partial \theta} \bigg\|^2 \cdot \| Q_{\theta_k}\|^2 \Bigg ]
\\
\leq & 
\frac{1}{K} \sum^K_{k=1} \Big (\frac{C}{\rho} \Big) \cdot \bigg [ \bar G + \bar G \cdot \Big (\frac{C}{\rho} \Big) \bigg ]
\\
= & 
\frac{ C \bar G ( \rho + C) }{\rho^2}
\\
\leq & 
\frac{ C \bar G ( \rho + C) }{\rho^2} + \bigg \| \frac{ \partial H(\theta, \v)}{ \partial \theta} \bigg \|^2 ,
\end{align*}
where the first inequality is due to the assumption that $\| \frac{ \partial Q_\theta }{ \partial \theta } \|^2$ and $\| \frac{ \partial \v_\theta }{ \partial \theta } \|^2$ are bounded by $\bar G$ and that $\| \v_{\theta} \|^2 \leq 1$, and the last inequality is due to the non-negativity of $\| \frac{ \partial H(\theta, \v)}{ \partial \theta} \|^2 $.

Therefore, $H(\theta, \v)$ satisfies $(G_H, B_H)$-bounded gradient dissimilarity, where $G_H$ is $\frac{ C \bar G ( \rho + C) }{\rho^2}$ and $B_H$ is $1$.

Next, we prove that $F(\theta)$ satisfies $(G_F, B_F)$-bounded gradient dissimilarity:
\begin{align*}
& 
\frac{1}{K} \sum^K_{k=1} \bigg \| \frac{ \partial F_k(\theta_k)}{ \partial \theta_k} \bigg \|^2 
\\
= &
\frac{1}{K} \sum^K_{k=1} \bigg \| \frac{ \partial \big [ L_k(\theta_k) + \lambda H_k(\theta_k, \v) \big ] }{ \partial \theta_k} \bigg \|^2 
\\
\leq &
\frac{2}{K} \sum^K_{k=1} \bigg \| \frac{ \partial L_k(\theta_k)}{ \partial \theta_k} \bigg \|^2 + 
\frac{2 \lambda }{K} \sum^K_{k=1} \bigg \| \frac{ \partial H_k(\theta_k, \v)}{ \partial \theta_k} \bigg \|^2
\\
\leq &
2G_L^2 + 2 B_L^2 \cdot \bigg \| \frac{ \partial L(\theta)}{ \partial \theta} \bigg \|^2
+ 2 \lambda \cdot \frac{ C \bar G ( \rho + C) }{\rho^2} + 2 \lambda \bigg \| \frac{ \partial H(\theta, \v)}{ \partial \theta} \bigg \|^2
\\
= &
2 B_L^2 \cdot \bigg \| \frac{ \partial L(\theta)}{ \partial \theta} \bigg \|^2 + 2 B_L^2 \lambda^2 \bigg \| \frac{ \partial H(\theta, \v)}{ \partial \theta} \bigg \|^2 + 4 B_L^2 \lambda \bigg \langle \frac{ \partial L(\theta)}{ \partial \theta}, \frac{ \partial H(\theta, \v)}{ \partial \theta} \bigg \rangle 
\\
& +
2G_L^2 + 2 \lambda \frac{ C \bar G ( \rho + C) }{\rho^2} + 2 (\lambda - B_L^2\lambda^2) \bigg \| \frac{ \partial H(\theta, \v)}{ \partial \theta} \bigg \|^2  - 4 B_L^2 \lambda \bigg \langle \frac{ \partial L(\theta)}{ \partial \theta}, \frac{ \partial H(\theta, \v)}{ \partial \theta} \bigg \rangle 
\\
= &
2 B_L^2 \bigg \| \frac{ \partial F(\theta)}{ \partial \theta} \bigg \|^2  +
2G_L^2 + 2 \lambda \frac{ C \bar G ( \rho + C) }{\rho^2} + 2 (\lambda - B_L^2\lambda^2) \bigg \| \frac{ \partial H(\theta, \v)}{ \partial \theta} \bigg \|^2  - 4 B_L^2\lambda \bigg \langle \frac{ \partial L(\theta)}{ \partial \theta}, \frac{ \partial H(\theta, \v)}{ \partial \theta} \bigg \rangle
\\
\leq &
2 B_L^2 \bigg \| \frac{ \partial F(\theta)}{ \partial \theta} \bigg \|^2  +
2G_L^2 + 2 \lambda \frac{ C \bar G ( \rho + C) }{\rho^2} + 2 (\lambda - B_L^2\lambda^2) \bigg \| \frac{ \partial H(\theta, \v)}{ \partial \theta} \bigg \|^2  + 4 B_L^2 \lambda \bigg \| \frac{ \partial L(\theta)}{ \partial \theta} \bigg \| \cdot \bigg \| \frac{ \partial H(\theta, \v)}{ \partial \theta} \bigg \| 
\\
= &
2 B_L^2 \bigg \| \frac{ \partial F(\theta)}{ \partial \theta} \bigg \|^2  +
2G_L^2 + 2 \lambda \frac{ C \bar G ( \rho + C) }{\rho^2} + 2 (\lambda - B_L^2\lambda^2) \bigg \| Q_{\theta} \cdot \v_{\theta} \cdot \Big [\frac{ \partial Q_{\theta}}{ \partial \theta} \cdot \v_{\theta} + \frac{ \partial \v_{\theta}}{ \partial \theta} \cdot Q_{\theta} \Big ]  \bigg \|^2  
\\
&
+ 4 B_L^2 \lambda \bigg \| \frac{ \partial L(\theta)}{ \partial \theta} \bigg \| \cdot \bigg \| Q_{\theta} \cdot \v_{\theta} \cdot \Big [\frac{ \partial Q_{\theta}}{ \partial \theta} \cdot \v_{\theta} + \frac{ \partial \v_{\theta}}{ \partial \theta} \cdot Q_{\theta} \Big ] \bigg \| 
% \\
% \leq &
% 2 B_L^2 \bigg \| \frac{ \partial F(\theta)}{ \partial \theta} \bigg \|^2  +
% 2G_L^2 + 2 \lambda \frac{ C \bar G ( \rho + C) }{\rho^2} + 2 (\lambda - B_L^2\lambda^2) \frac{ C \bar G^2 ( \rho + C) }{\rho^2}
% + 4 B_L^2 \lambda \sqrt{\bar G } \cdot \sqrt{\frac{ C\rho \bar G^2 + C^2\bar G}{\rho^2}}
\\
\leq &
2 B_L^2 \bigg \| \frac{ \partial F(\theta)}{ \partial \theta} \bigg \|^2  +
2G_L^2 + (4 \lambda - 2 B_L^2\lambda^2) \frac{ C \bar G ( \rho + C) }{\rho^2}
+ 4 B_L^2 \lambda \cdot \sqrt{ \frac{ C \bar G^2 ( \rho + C) }{\rho^2} }, 
\end{align*}
where the first inequality is due to $(a+b)^2 \leq 2a^2 + 2 b^2$, the second inequality is due to the $(G_L, B_L)$-bounded gradient dissimilarity of $L(\theta)$ and the $(G_H, B_H)$-bounded gradient dissimilarity of $H(\theta, \v)$, the third inequality is due to $- \| a\| \cdot \| b\| \leq \langle a, b \rangle $, and the last inequality is due to the assumption that $\| \frac{ \partial L(\theta) }{ \partial \theta } \|^2, \| \frac{ \partial Q_\theta }{ \partial \theta } \|^2, \| \frac{ \partial \v_\theta }{ \partial \theta } \|^2$ are bounded by $\bar G $.

Therefore, $F(\theta)$ satisfies $(G_F, B_F)$-bounded gradient dissimilarity, where $B_F = 2 B_L^2 $ and $G_F = 2G_L^2 + (4 \lambda - 2 B_L^2\lambda^2) \frac{ C \bar G ( \rho + C) }{\rho^2}
+ 4 B_L^2 \lambda \cdot \sqrt{\frac{ C \bar G^2 ( \rho + C) }{\rho^2}}$.

Then, we begin to study the convergence of FedR\'enyi.
We first decompose $\| \nabla F(\theta) \|^2$ as follows
\begin{align*}
\| \nabla F(\theta) \|^2
= &
\| \nabla F(\theta) - \nabla \widehat F(\theta) + \nabla \widehat F(\theta) \|^2
\leq 
2 \| \nabla \widehat F(\theta) \|^2
+ 2 \| \nabla \widehat F(\theta) - \nabla F(\theta) \|^2
\\
= &
2 \| \nabla \widehat F(\theta) \|^2
+ 2 \lambda^2 \Bigg\| \frac{ \partial \widehat H(\theta, \widehat \v^*) }{ \partial \theta } - \frac{ \partial H(\theta, \v^*) }{ \partial \theta } \Bigg\|^2 ,
\end{align*}
where the first inequality is due to $(a+b)^2 \leq 2a^2 + 2 b^2$.

Define 
$\v^* = \arg\max_{ \v \perp \v_1, \|\v\|^2 \leq 1 } \big[ L(\theta) + \lambda H(\theta, \v) \big]$ and
$\widehat \v^* = \arg\max_{ \v \perp \v_1, \|\v\|^2 \leq 1 } \big[ L(\theta) + \lambda \widehat H(\theta, \v) \big]$.
Then we upper bound the last term as follows:
\begin{align*}
&
\Bigg\| \frac{ \partial \widehat H(\theta, \widehat \v^*) }{ \partial \theta } - \frac{ \partial H(\theta, \v^*) }{ \partial \theta } \Bigg\|^2
=
4 \Bigg\| \widehat Q_\theta ( \widehat \v^* (\widehat\v^*)^\top ) \cdot \frac{ \partial \widehat Q_\theta}{ \partial \theta }
- Q_\theta ( \v^* (\v^*)^\top ) \cdot \frac{ \partial Q_\theta}{ \partial \theta } \Bigg\|^2
\\
= &
\Bigg\| 
\widehat Q_\theta \cdot ( \widehat \v^* (\widehat \v^*)^\top - \v^* (\v^*)^\top ) \cdot \frac{ \partial \widehat Q_\theta }{ \partial \theta }
+ \widehat Q_\theta \cdot ( \v^* (\v^*)^\top ) \cdot ( \frac{ \partial \widehat Q_\theta }{ \partial \theta } - \frac{ \partial Q_\theta }{ \partial \theta } )
+ ( \widehat Q_\theta - Q_\theta ) \cdot ( \v^* (\v^*)^\top ) \cdot \frac{ \partial Q_\theta }{ \partial \theta }
\Bigg\|^2
\\
\leq &
2 \| \widehat Q_\theta \|^2 \cdot \| \widehat \v^* (\widehat \v^*)^\top - \v^* (\v^*)^\top \|^2 \cdot \Bigg\| \frac{ \partial \widehat Q_\theta }{ \partial \theta } \Bigg\|^2
+ 4 \| \widehat Q_\theta \|^2 \cdot \| \v^* (\v^*)^\top \|^2 \cdot \Bigg\| \frac{ \partial \widehat Q_\theta }{ \partial \theta } - \frac{ \partial Q_\theta }{ \partial \theta } \Bigg\|^2
\\
& 
+ 4 \| \widehat Q_\theta - Q_\theta \|^2 \cdot \| \v^* (\v^*)^\top \|^2 \cdot \Bigg\| \frac{ \partial Q_\theta }{ \partial \theta } \Bigg\|^2
\\
\leq &
2 \frac{C}{\rho} \cdot \| \widehat \v^* (\widehat \v^*)^\top - \v^* (\v^*)^\top \|^2 \cdot \bar G
+ 4 \frac{C}{\rho} \cdot \Bigg\| \frac{ \partial \widehat Q_\theta }{ \partial \theta } - \frac{ \partial Q_\theta }{ \partial \theta } \Bigg\|^2
+ 4 \| \widehat Q_\theta - Q_\theta \|^2 \cdot \bar G
\\
\leq &
O\Bigg( \frac{1}{ n} + \Bigg\| \frac{ \partial \widehat Q_\theta }{ \partial \theta } - \frac{ \partial Q_\theta }{ \partial \theta } \Bigg\|^2 \Bigg) ,
\end{align*}
where the first inequality is due to $(a+b)^2 \leq 2a^2 + 2b^2$,
and the second and third inequalities are due to $\| \frac{ \partial Q_\theta }{ \partial \theta } \|^2 \leq \bar G, \| \frac{ \partial \widehat Q_\theta }{ \partial \theta } \|^2 \leq \bar G, \| \v \| \leq 1, \| \widehat \v^* - \v^* \| \leq O(\sqrt{\frac{1}{n}}), \| \widehat Q_\theta - Q_\theta \| \leq O(\sqrt{\frac{1}{n}})$, and Lemma~\ref{lemma:Q_norm}.

As a result, we have
\begin{align*}
\| \nabla F(\theta) \|^2
\leq 
2 \| \nabla \widehat F(\theta) \|^2
+ O\Bigg( \frac{1}{ n } + \Bigg\| \frac{ \partial \widehat Q_\theta }{ \partial \theta } - \frac{ \partial Q_\theta }{ \partial \theta } \Bigg\|^2 \Bigg) .
\end{align*}
Finally, the convergence result follows immediately from Theorem~I in \cite{karimireddy2020scaffold}.
\end{proof}

\subsection{Proof for Section \ref{subsubsection:theoretical_analysis_asynchronous}}

In this subsection, we prove Proposition \ref{proposition:approximation_error} and Theorem \ref{theorem:estimation_error_asynchronous} in Section \ref{subsubsection:theoretical_analysis_asynchronous}.

\subsubsection{Proof of Proposition \ref{proposition:approximation_error}}
\label{Appendix: Proof of proposition approximation_error}

\begin{proposition}
\label{proposition:approximation_error_appendix}
(Proposition \ref{proposition:approximation_error} restated, approximation error of each straggler in asynchronous FedR\'enyi)
Define $\max_{k, k' \in [K]} \| \theta^e_{k, 0} - \theta^e_{k', 0}\| = \varepsilon^e_{0}$. Suppose that Assumptions~\ref{assumption:cocoercive} and~\ref{assumption:lipschitz} hold. Then, for each communication round $e$, the approximation errors of the model and of the local statistics on each straggler $\widetilde k$ are upper bounded as follows:
\begin{align}
&
\| \widetilde \theta^{e}_{\widetilde k, M} - \theta^{e}_{\widetilde k, M}\|
\leq 
\varepsilon^e_{0},
\nonumber \\
&
| \widetilde j^{e}_{\widetilde k, M}(c, p) -  \bar j^{e}_{\widetilde k, M}(c, p) |
\leq
L \varepsilon^e_{0} + \zeta,
\nonumber \\
&
| \widetilde u^{e}_{\widetilde k, M}(c) -  \bar u^{e}_{\widetilde k, M}(c) |
\leq
L \varepsilon^e_{0} + \zeta.
\end{align}
\end{proposition}

\begin{proof}
(of Proposition \ref{proposition:approximation_error}) 

% \shired{
% Recall that $(X_k,S_k)$ is the data distribution of client $k$, where $X$ is the feature space and $S$ is attributes space.  $\theta_{k, t+1}$ is the model parameter of client $k$ at $t+1$ iteration and $f_{\theta_{k,t+1}}(X_k,S_k) = c$ is the model prediction of input $(X_k,S_k)$ where model is parameterized with $\theta_{k,t+1}$. 
% }

% \shired{
% Recall that 
% \begin{align*}
% \bar j_k(c, p) 
% = & 
% \sum_{i \in \calI_k} \frac{1}{ n_k } \indicator[ f_\theta(x_{ki}, s_{ki}) = c | s_{ki} = p ] ,
% \\
% \bar u_k(c) 
% = &
% \sum_{i \in \calI_k} \frac{1}{ n_k } \indicator[ f_\theta(x_{ki}, s_{ki}) = c ] .
% \end{align*}
% }

Before proving Proposition \ref{proposition:approximation_error}, we show the following technical lemma:
\begin{lemma}
\label{lemma:nonexpansive_sgd_smooth}
(Non-expansiveness of SGD under $\beta$-co-coercive condition)
\begin{align*}
\| SGD(x) - SGD(y)\|
\leq 
\| x - y \|
.
\end{align*}
\end{lemma}

Now we begin to prove Proposition \ref{proposition:approximation_error}. 

We first bound the model distance between $\theta^{e}_{k, M}$ and $\theta^{e}_{k', M}$ for any two different clients $k,k' \in [K], k \neq k'$ by the following inequality:
\begin{align}
\label{eq:epsilon_comparison}
\| \theta^{e}_{k, M} - \theta^{e}_{k', M}\| 
= 
\| \theta^e_{k, M-1} - \eta \nabla F (\theta^e_{k, M-1}) - \theta^e_{k', M-1} + \eta \nabla F (\theta^e_{k', M-1})\| 
\leq
\| \theta^{e}_{k, M-1} - \theta^{e}_{k', M-1} \|
\leq 
\varepsilon^{e}_{M-1}
\leq 
\varepsilon^{e}_0, 
\end{align}
where the first inequality is due to Assumption~\ref{assumption:cocoercive} and Lemma~\ref{lemma:nonexpansive_sgd_smooth}, the second inequality is due to the definition of $\varepsilon^e_t$, and the last inequality is due to Lemma~\ref{lemma:nonexpansive_sgd_smooth}.

% Next, we study the upper and lower bound of similarity  $W_{k,k'}$:
% \begin{align*}
% &
% 1 - \frac{\eta^{I} \varepsilon_0}{\rho}
% \\
% \leq & 
% -\frac{1}{\rho} dist (\theta_{k, I}, \theta_{k', I}) +1 
% \\
% \leq & 
% \exp \Big (\frac{ -dist (\theta_{k, I}, \theta_{k', I})}{\rho} \Big)
% \\
% \leq &
% \frac{1}{1+ \frac{1}{\rho} dist (\theta_{k, I}, \theta_{k', I})}
% \\
% \leq &
% 1.
% \end{align*}

Then, we begin to bound the approximation error between $\theta^e_{\widetilde k, M}$ and $ \widetilde \theta^e_{\widetilde k, M} $:
\begin{align*}
&
\| \widetilde \theta^e_{\widetilde k, M} - \theta^e_{\widetilde k, M}\|
\\
= &
\| \frac{\sum_{k'=1}^{K -\widetilde K^{e+1}} W^{k,k'}_{\theta} \theta^e_{k', M}}{\sum_{k'=1}^{K-\widetilde K^{e+1}} W^{k,k'}_{\theta} }  - \theta^e_{k, M} \| 
\\
\leq & 
\frac{\sum_{k'=1}^{K-\widetilde K^{e+1}} W^{k,k'}_{\theta} \| \theta^e_{k', M} - \theta^e_{k, M} \|}{\sum_{k'=1}^{K-\widetilde K^{e+1}} W^{k,k'}_{\theta} }  
\\
\leq & 
\frac{\sum_{k'=1}^{K-\widetilde K^{e+1}} W^{k,k'}_{\theta} }{\sum_{k'=1}^{K-\widetilde K^{e+1}} W^{k,k'}_{\theta} } \varepsilon^{e}_0 
\\
= & 
\varepsilon^{e}_0,  
\end{align*}
where the first inequality is due to the triangle inequality, and the second inequality is due to inequality~\eqref{eq:epsilon_comparison}. 

Next, we bound the approximation error of the local statistics $\widetilde j^e_{\widetilde k, M}(c, p)$ and $ \widetilde u^e_{\widetilde k, M}(c) $. Since $ \bar u^e_{\widetilde k, M}(c) $ and $\bar j^e_{\widetilde k, M}(c, p)$ are, respectively, the empirical probability and the empirical conditional probability corresponding to $\widehat \P [f_{\theta}(X_k,S_k) = c ]$, we can study the approximation error through $\widehat \P [f_{\theta^{e+1}_{k,0}}(X_k,S_k) = c ]$:
\begin{align*}
&
\bigg | \frac{\sum_{k'=1}^{k' \in Rob_{\zeta}(\widetilde k)} W^{k,k'} \widehat \P \big [f_{\theta^{e+1}_{k',0}}(X_k',S_k') =c \big ]}{\sum_{k'=1}^{k' \in Rob_{\zeta}(\widetilde k)} W^{k,k'} } - \widehat \P \big [f_{\theta^{e+1}_{k,0}}(X_k,S_k) =c \big ] \bigg |
\\
\leq &
\frac{\sum_{k'=1}^{k' \in Rob_{\zeta}(\widetilde k)} W^{k,k'} \Big | \widehat \P \big[f_{\theta^{e+1}_{k',0}}(X_k',S_k') =c \big] - \widehat \P \big [f_{\theta^{e+1}_{k,0}}(X_k,S_k) =c \big] \Big |}{\sum_{k'=1}^{k' \in Rob_{\zeta}(\widetilde k)} W^{k,k'} }  
\\
= &
\frac{\sum_{k'=1}^{k' \in Rob_{\zeta}(\widetilde k)} W^{k,k'} \Big | \widehat \P \big [f_{\theta^{e+1}_{k',0}}(X_k',S_k') =c \big] -  \widehat \P \big [f_{\theta^{e+1}_{k,0}}(X_k',S_k') =c \big] +  \widehat \P \big [f_{\theta^{e+1}_{k,0}}(X_k',S_k') =c \big] - \widehat \P \big [f_{\theta^{e+1}_{k,0}}(X_k,S_k) =c \big] \Big |}{\sum_{k'=1}^{k' \in Rob_{\zeta}(\widetilde k)} W^{k,k'} }  
\\
\leq &
\frac{\sum_{k'=1}^{k' \in Rob_{\zeta}(\widetilde k)} W^{k,k'} \Big | \widehat \P \big [f_{\theta^{e+1}_{k',0}}(X_k',S_k') =c \big] -  \widehat \P \big [f_{\theta^{e+1}_{k,0}}(X_k',S_k') =c \big] \Big |}{\sum_{k'=1}^{k' \in Rob_{\zeta}(\widetilde k)} W^{k,k'} }  
\\
&
+ 
\frac{\sum_{k'=1}^{k' \in Rob_{\zeta}(\widetilde k)} W^{k,k'} \Big | \widehat \P \big [f_{\theta^{e+1}_{k,0}}(X_k',S_k') =c \big] - \widehat \P \big [f_{\theta^{e+1}_{k,0}}(X_k,S_k) =c \big] \Big |}{\sum_{k'=1}^{k' \in Rob_{\zeta}(\widetilde k)} W^{k,k'} }  
\\
\leq &
L \cdot \frac{\sum_{k'=1}^{k' \in Rob_{\zeta}(\widetilde k)} W^{k,k'} \Big \| \theta^{e+1}_{k',0} - \theta^{e+1}_{k,0} \Big \|}{\sum_{k'=1}^{k' \in Rob_{\zeta}(\widetilde k)} W^{k,k'} }  
+ 
\frac{\sum_{k'=1}^{k' \in Rob_{\zeta}(\widetilde k)} W^{k,k'} \zeta }{\sum_{k'=1}^{k' \in Rob_{\zeta}(\widetilde k)} W^{k,k'} }  
\\
\leq &
L \cdot \varepsilon^{e}_{0} 
+ 
\zeta,
\end{align*}
where the first and second inequalities are due to the triangle inequality, and the third inequality is due to Assumption~\ref{assumption:lipschitz} and the definition of the robust neighbor set $Rob_{\zeta}(\widetilde k) = \{k': \| \omega \bar u_{k'}(c) - \bar u_{\widetilde k}(c)\| \leq \zeta, k' \in [K], \forall c \text{ and } \forall \omega \in (0,1) \}$.

Combining the above inequality with the definitions of $\bar j^e_{\widetilde k, M}(c, p)$ and $\bar u^e_{\widetilde k, M}(c) $, we can bound the approximation error between $\widetilde j^e_{\widetilde k, M}(c, p)$ and $ \bar j^e_{\widetilde k, M}(c, p) $, and between $\widetilde u^e_{\widetilde k, M}(c)$ and $ \bar u^e_{\widetilde k, M}(c) $, respectively: 
\begin{align*}
&
| \widetilde j^e_{\widetilde k, M}(c, p) - \bar j^e_{\widetilde k,M}(c, p) |
\leq
L \varepsilon^{e}_0 + \zeta,
\\
&
| \widetilde u^e_{\widetilde k, M}(c) -  \bar u^e_{\widetilde k, M}(c) |
\leq
L \varepsilon^{e}_0 + \zeta.
\end{align*}

\end{proof}

% \begin{proposition}
% \label{theorem:convergence_appendix}
% (Proposition \ref{theorem:convergence} restated, the convergence analysis, Theorem 31 in \cite{jin2020local}) Suppose that $\big ( L(\theta) + \lambda H(\theta, \v)  \big )$ is $\ell_1$-smooth and $\beta < \frac{1}{2 \ell_1}$, then the output $\hat \theta_T$ of Algorithm \ref{alg:FR} with step size $\eta = \frac{1}{\sqrt{T+2}}$ will satisfy: \\
% \begin{align*}
% \E[ \| \nabla \phi_{\beta}(\hat \theta_{T}) \|^2] \leq 
% 2 \frac{ \big ( \phi_{\beta}(\hat \theta_0) - \min_{\theta'} \phi(\theta') \big) + 2 \ell_1^2}{\sqrt{T+2} } + 4 \ell_1 \varepsilon,
% \end{align*}
% where $\varepsilon = \tilde O \Big (\frac{1}{\sqrt{n}} \Big )$ when $\gamma_k= \frac{n_k}{n}$ or $\tilde O \Big(\frac{1}{\sqrt{n}} + \frac{1}{\sqrt{K n_\textmin}} \Big)$ when assumptions in Theorem \ref{theorem: regularization error of uniform client} hold, respectively. 
% \end{proposition}

%     \begin{proof}
%     To analyze the convergence performance, we first define $\varepsilon_{gen}$ as the upper bound of generalization error, $\v^* \in \arg\max_{\v \bot \hat \v_1}  L(\theta) + \lambda H(\theta, \v)$ and 
%     $\v_{t+1} \in \arg\max_{\v \bot \hat \v_1}  \hat L(\theta) + \lambda \hat H(\theta, \v)$. \\
%     Then the following inequality hold:
%     \begin{align*}
%     &
%     L( \theta ) + \lambda H(\theta, \v_{t+1}) 
%     \\
%     \geq & 
%     \hat L( \theta ) + \lambda \hat H(\theta, \v_{t+1}) -\varepsilon_{gen}
%     \\
%     \geq & 
%     \hat L( \theta ) + \lambda \hat H(\theta, \v^*) - \varepsilon_{gen}
%     \\
%     \geq & 
%     L( \theta ) + \lambda H(\theta, \v^*) - 2\varepsilon_{gen}
%     \\
%     = & 
%     \max_{\v\bot \hat \v_1} \big [ L( \theta) + \lambda H(\theta, \v) \big ] - 2\varepsilon_{gen},
%     \end{align*}
%     where the first and third inequality are due to the Corollary \ref{corollary 2}, the second inequality is due to the definition of $\v_{t+1}$.\\
%     The above inequality could be simplified as follows:
%     \begin{align*}
%     &
%     L( \theta ) + \lambda H(\theta, \v_{t+1})
%     \geq 
%     \max_{\v\bot \hat \v_1} \big [ L( \theta) + \lambda H(\theta, \v) \big ] - 2 \varepsilon_{gen},
%     \end{align*}
%     which is aligned with the Algorithm 2 in \cite{jin2020local}.\\
%     Thus according to the Theorem 31 in \cite{jin2020local}, we have the following inequality hold:
%     \begin{align*}
%     \E[ \| \nabla \phi_{\beta}(\hat \theta_{T}) \|^2] \leq 
%     2 \frac{ \big ( \phi_{\beta}(\hat \theta_0) - \min_{\theta'} \phi(\theta') \big) + 2 \ell_1^2}{\sqrt{T+2} } + 4 \ell_1 \varepsilon,
%     \end{align*}
%     where $\varepsilon = \tilde O \Big (\frac{1}{\sqrt{n}} \Big )$ when $\gamma_k= \frac{n_k}{n}$ or $\tilde O \Big(\frac{1}{\sqrt{n}} + \frac{1}{\sqrt{K n_\textmin}} \Big)$ when assumptions in Theorem \ref{theorem: regularization error of uniform client} hold, respectively. 
%     \end{proof}

\subsubsection{Proof of Lemma \ref{lemma:nonexpansive_sgd_smooth}}


\begin{proof}
(of Lemma \ref{lemma:nonexpansive_sgd_smooth})

\begin{align*}
\| SGD(x) - SGD(y) \|^2
= &
\| ( x - \eta g(x) ) - ( y - \eta g(y)) \|^2
\\
= &
\| x-y \|^2
+ \eta^2 \| g(x) - g(y) \|^2
- 2 \eta (x-y)^\top (g(x) - g(y))
\\
\leq &
\| x-y \|^2
+ \eta ^2 \| g(x) - g(y) \|^2
- 2 \eta \beta \| g(x) - g(y)  \|^2
\\
= &
\| x-y \|^2
+ \eta (\eta  - 2 \beta ) \| g(x) - g(y) \|^2
\\
\leq &
\| x-y \|^2
,
\end{align*}
where the first inequality is due to the $\beta$-co-coercive condition on $g(x)$, and the last inequality is due to $\eta \leq 2\beta$.
% \yanred{Can also consider $\beta$-co-coercive condition, closely related to L-smoothness}
% \begin{align*}
% \langle F(x) - F(y), x-y \rangle
% \geq 
% \beta \| F(x) - F(y) \|^2
% \end{align*}
\end{proof}

\subsubsection{Proof of Theorem \ref{theorem:estimation_error_asynchronous}}
\label{appendix:proof_of_theorem estimation_error_asynchronous}

\begin{theorem}
\label{theorem:estimation_error_asynchronous_appendix}
(Theorem \ref{theorem:estimation_error_asynchronous} restated, estimation error of R\'enyi regularization for asynchronous FedR\'enyi) 
Suppose $j_\textmin \backsim u_\textmin \backsim r_\textmin = O(1)$ and $n_\textmin \backsim \frac{n}{K \log(K)}$.
When $\gamma_k = \frac{n_k}{n}$ or $\frac{1}{K}$, for any communication round $e$, any global model $\theta^{e+1}$ and any $\delta \in (0,1)$, the following inequality holds:
\begin{align*}
\P \Big [
\widehat H(\theta^{e + 1}, \widetilde \v^{e + 1}) -  H(\theta^{e + 1}, \v^{e + 1}) 
\leq
O \big( 1/ \sqrt{n} + ( L \varepsilon^e_{0} + \zeta )^2 \big) \Big | \theta^{e + 1} \Big ]
\geq 1 - \delta.
\end{align*}
\end{theorem}

\begin{proof}
(of Theorem \ref{theorem:estimation_error_asynchronous})

Recall that $ H(\theta^{e+1}, \v^{e+1}) = {(\v^{e+1})}^{\top} {Q^{e+1}_{\theta}}^{\top} Q^{e+1}_{\theta} \v^{e+1}$, where $\v^{e+1}$ is the singular vector of $Q^{e+1}_{\theta}$ associated with its second largest singular value. 

Before proving Theorem \ref{theorem:estimation_error_asynchronous}, we propose the following lemma to bound $\| \v^{e+1} - \widetilde \v^{e+1} \|$.

\begin{lemma}
\label{lemma:v_bound}
Define $\xi = \min\{|\lambda_2 - \lambda_3|,|\widetilde \lambda_2 - \widetilde \lambda_3|\}$, where $\lambda_1 \geq \cdots \geq \lambda_p$ and $\widetilde \lambda_1 \geq \cdots \geq \widetilde \lambda_p$ are the singular values of the matrices $Q$ and $\widetilde Q$, respectively. Assume that $\xi$ is of constant order. Suppose that $\| Q - \widetilde Q\| \leq \epsilon_Q$ and $\hat j(c,p) \backsim \hat u(c) \backsim \hat r(p) = \Omega \big ( \frac{1}{\sqrt{n}} \big )$. Then the following inequality holds:
\begin{align*}
\| \v - \widetilde \v \| \leq \sqrt{2}\frac{\epsilon_Q}{\xi}.
\end{align*}
\end{lemma}

First, we study the approximation error between $Q_{\theta}$ and $\widetilde Q_{\theta}$.

Given a fixed global model $\theta^{e+1}$ for an arbitrary communication round $e+1$,
for every participating client $k \in [K]\backslash I^{e+1}$, $\theta^{e+1}_{k, 0} = \theta^{e+1} = \sum_{k=1}^{K-\widetilde K^{e+1}} \gamma_k \theta^e_{k, M} + \sum_{\widetilde k=1}^{\widetilde K^{e+1}} \gamma_{\widetilde k} \widetilde \theta^e_{\widetilde k, M}$.
For any straggler $\widetilde k \in I^{e+1}$, $\theta^{e+1}_{\widetilde k, 0} = \theta^{e}_{\widetilde k, M}$.
Thus, $\epsilon^{e+1}_0 \leq \epsilon^{e}_M$, where $ \epsilon^{e}_M$ is bounded by Proposition \ref{proposition:approximation_error}.

Next, according to Lemma \ref{lemma:fraction_error_bound}, the approximation error between each entry of the matrices $Q^{e+1}_{\theta}$ and $\widetilde Q^{e+1}_{\theta}$ is bounded by $O(\epsilon^{e+1}_0)$.
We define $ \tilde \varepsilon \in \R^{C \times P}$ as the approximation error matrix $\tilde \varepsilon = Q^{e+1}_{\theta} - \widetilde Q^{e+1}_{\theta}$, where each entry $\tilde \varepsilon_{c, p} = L\varepsilon^{e}_0 + \zeta$ is the same.

Now we start to prove Theorem \ref{theorem:estimation_error_asynchronous}.

\begin{align*}
&
\widehat H(\theta^{e+1}, \widetilde \v^{e+1}) - H(\theta^{e+1}, \v^{e+1}) 
\\
= &
\widehat H(\theta^{e+1}, \widetilde \v^{e+1}) - \widehat H(\theta^{e+1}, \v^{e+1}) + \widehat H(\theta^{e+1}, \v^{e+1}) - H(\theta^{e+1}, \v^{e+1}) 
\\
= &
{(\widetilde \v^{e+1})}^{\top} {(\widetilde Q^{e+1}_{\theta})}^{\top} \widetilde Q^{e+1}_{\theta} \widetilde \v^{e+1} - {(\v^{e+1})}^{\top} {(\hat Q^{e+1}_{\theta})}^{\top} \hat Q^{e+1}_{\theta} \v^{e+1} + \widehat H(\theta^{e+1}, \v^{e+1}) - H(\theta^{e+1}, \v^{e+1})
\\
\leq &
{(\widetilde \v^{e+1})}^{\top} \big [ (\hat Q^{e+1}_{\theta}+ \tilde \varepsilon)^{\top} (\hat Q^{e+1}_{\theta}+ \tilde \varepsilon) \big ] \widetilde \v^{e+1} - {(\v^{e+1})}^{\top} {(\hat Q^{e+1}_{\theta})}^{\top} \hat Q^{e+1}_{\theta} \v^{e+1} + \widehat H(\theta^{e+1}, \v^{e+1}) - H(\theta^{e+1}, \v^{e+1})
\\
\leq &
{(\widetilde \v^{e+1})}^{\top} \big [ {(\hat Q^{e+1}_{\theta})}^{\top}\hat Q^{e+1}_{\theta} + 2 \| \tilde \varepsilon \| \cdot \| \hat Q^{e+1}_{\theta} \| + \| \tilde \varepsilon \|^2 \big ] \widetilde \v^{e+1} - {(\v^{e+1})}^{\top} {(\hat Q^{e+1}_{\theta})}^{\top} \hat Q^{e+1}_{\theta} \v^{e+1} + \widehat H(\theta^{e+1}, \v^{e+1}) - H(\theta^{e+1}, \v^{e+1})
\\
\leq &
{(\widetilde \v^{e+1})}^{\top} \big [ {(\hat Q^{e+1}_{\theta})}^{\top}\hat Q^{e+1}_{\theta} + 2 C^2P^2 \cdot ( L \varepsilon^e_0 + \zeta ) + ( L \varepsilon^e_0 + \zeta )^2 \big ] \widetilde \v^{e+1} - {(\v^{e+1})}^{\top} {(\hat Q^{e+1}_{\theta})}^{\top} \hat Q^{e+1}_{\theta} \v^{e+1} + \widehat H(\theta^{e+1}, \v^{e+1}) - H(\theta^{e+1}, \v^{e+1})
\\
= &
{(\widetilde \v^{e+1})}^{\top} {(\hat Q^{e+1}_{\theta})}^{\top}\hat Q^{e+1}_{\theta} \widetilde \v^{e+1} + {(\widetilde \v^{e+1})}^{\top} \big [ 2 C^2P^2 \cdot ( L \varepsilon^e_0 + \zeta ) + ( L \varepsilon^e_0 + \zeta )^2 \big ] \widetilde \v^{e+1} - {(\v^{e+1})}^{\top} {(\hat Q^{e+1}_{\theta})}^{\top} \hat Q^{e+1}_{\theta} \v^{e+1} \\
& + \widehat H(\theta^{e+1}, \v^{e+1}) - H(\theta^{e+1}, \v^{e+1})
\\
\leq &
{(\widetilde \v^{e+1})}^{\top} {(\hat Q^{e+1}_{\theta})}^{\top}\hat Q^{e+1}_{\theta}\widetilde \v^{e+1} - {(\v^{e+1})}^{\top} {(\hat Q^{e+1}_{\theta})}^{\top} \hat Q^{e+1}_{\theta} \v^{e+1} + \big [ 2 C^2P^2 \cdot ( L \varepsilon^e_0 + \zeta ) + ( L \varepsilon^e_0 + \zeta )^2 \big ] + \widehat H(\theta^{e+1}, \v^{e+1}) - H(\theta^{e+1}, \v^{e+1})
\\
= &
{(\widetilde \v^{e+1})}^{\top} {(\hat Q^{e+1}_{\theta})}^{\top}\hat Q^{e+1}_{\theta}\widetilde \v^{e+1}  - {(\v^{e+1})}^{\top} {(\hat Q^{e+1}_{\theta})}^{\top}\hat Q^{e+1}_{\theta}\widetilde \v^{e+1} + {(\v^{e+1})}^{\top} {(\hat Q^{e+1}_{\theta})}^{\top}\hat Q^{e+1}_{\theta}\widetilde \v^{e+1} - {(\v^{e+1})}^{\top} {(\hat Q^{e+1}_{\theta})}^{\top} \hat Q^{e+1}_{\theta} \v^{e+1} \\
&
+ \big [ 2 C^2P^2 \cdot ( L \varepsilon^e_0 + \zeta ) + ( L \varepsilon^e_0 + \zeta )^2 \big ]  
+ \widehat H(\theta^{e+1}, \v^{e+1}) - H(\theta^{e+1}, \v^{e+1})
\\
= &
({(\widetilde \v^{e+1})}^{\top} - {(\v^{e+1})}^{\top}) {(\hat Q^{e+1}_{\theta})}^{\top}\hat Q^{e+1}_{\theta}\widetilde \v^{e+1} + {(\v^{e+1})}^{\top} {(\hat Q^{e+1}_{\theta})}^{\top}\hat Q^{e+1}_{\theta} (\widetilde \v^{e+1} - \v^{e+1} ) \\
&
+ \big [ 2 C^2P^2 \cdot ( L \varepsilon^e_0 + \zeta ) + ( L \varepsilon^e_0 + \zeta )^2 \big ]  
+ \widehat H(\theta^{e+1}, \v^{e+1}) - H(\theta^{e+1}, \v^{e+1})
\\
\leq &
\| {(\widetilde \v^{e+1})}^{\top} - {(\v^{e+1})}^{\top} \| \cdot \| {(\hat Q^{e+1}_{\theta})}^{\top}\hat Q^{e+1}_{\theta} \| \cdot \| \widetilde \v^{e+1} \| + \|  {(\v^{e+1})}^{\top} \| \cdot \| {(\hat Q^{e+1}_{\theta})}^{\top}\hat Q^{e+1}_{\theta} \| \cdot \| \widetilde \v^{e+1} - \v^{e+1} \| \\
&
+ \big [ 2 C^2P^2 \cdot ( L \varepsilon^e_0 + \zeta ) + ( L \varepsilon^e_0 + \zeta )^2 \big ]  
+ \widehat H(\theta^{e+1}, \v^{e+1}) - H(\theta^{e+1}, \v^{e+1})
\\
= &
\| {(\widetilde \v^{e+1})}^{\top} - {(\v^{e+1})}^{\top} \| \cdot \| {(\hat Q^{e+1}_{\theta})}^{\top}\hat Q^{e+1}_{\theta} \| \cdot \Big ( \| \widetilde \v^{e+1} \| + \|  {(\v^{e+1})}^{\top} \| \Big)
+ \big [ 2 C^2P^2 \cdot ( L \varepsilon^e_0 + \zeta ) + ( L \varepsilon^e_0 + \zeta )^2 \big ]  
\\
& 
+ \widehat H(\theta^{e+1}, \v^{e+1}) - H(\theta^{e+1}, \v^{e+1})
\\
\leq & 
2 \| {(\widetilde \v^{e+1})}^{\top} - {(\v^{e+1})}^{\top} \| \cdot \| {(\hat Q^{e+1}_{\theta})}^{\top}\hat Q^{e+1}_{\theta} \| 
+ \big [ 2 C^2P^2 \cdot ( L \varepsilon^e_0 + \zeta ) + ( L \varepsilon^e_0 + \zeta )^2 \big ]  
+ \widehat H(\theta^{e+1}, \v^{e+1}) - H(\theta^{e+1}, \v^{e+1})
\\
\leq &
2 \sqrt{2} \frac{L \varepsilon^e_0 + \zeta}{\xi} \cdot \| \hat Q^{e+1}_{\theta} \|^2 + \big [ 2 C^2P^2 \cdot ( L \varepsilon^e_0 + \zeta ) + ( L \varepsilon^e_0 + \zeta )^2 \big ]  
+ \widehat H(\theta^{e+1}, \v^{e+1}) - H(\theta^{e+1}, \v^{e+1})
\\
\leq &
2 \sqrt{2} \frac{L \varepsilon^e_0 + \zeta}{\xi}  \cdot (\frac{C}{\rho}) + \big [ 2 C^2P^2 \cdot ( L \varepsilon^e_0 + \zeta ) + ( L \varepsilon^e_0 + \zeta )^2 \big ]  
+ \widehat H(\theta^{e+1}, \v^{e+1}) - H(\theta^{e+1}, \v^{e+1})
\\
= &
( L \varepsilon^e_0 + \zeta )^2 + ( 2 C^2P^2 + 2 \sqrt{2} \frac{C}{\rho \xi} ) \cdot ( L \varepsilon^e_0 + \zeta )
+ \widehat H(\theta^{e+1}, \v^{e+1}) - H(\theta^{e+1}, \v^{e+1}),
\end{align*}
where the second and third inequalities are due to the Cauchy--Schwarz inequality, the sixth inequality is due to $\| \v \| \leq 1$ as $\v$ is a singular vector of the matrix,
the seventh inequality is due to Lemma \ref{lemma:v_bound}, and the last inequality is due to Lemma \ref{lemma:Q_norm}.

Then, combining the above inequality with Theorem \ref{theorem:estimation_error_synchronous}, we have that:
\begin{align}
\P \Bigg [ 
\widehat H(\theta^{e+1}, \widetilde \v^{e+1}) - H(\theta^{e+1}, \v^{e+1}) \leq
O \bigg( \sqrt{\frac{1}{ n }} + ( L \varepsilon^e_0 + \zeta )^2 \bigg) \Bigg | \theta^{e+1} \Bigg ]
\geq 1- \delta.
\end{align}


% \begin{align}
% \widehat H(\theta, \tilde \v) - H(\theta, \v) 
% =
% O \Big (\sqrt{\frac{ 1}{ n }} + ( L\eta^{t+1} \varepsilon_0 + \zeta )^2 \Big ).
% \end{align}
% \left\{
% \begin{aligned}
%     & 
%    O \Big (\sqrt{\frac{ 1}{ n }} + ( L\eta^{t+1} \varepsilon_0 + \zeta )^2 \Big ), \text{where } \gamma_k =  n_k / n,
%     \\
%     & 
%     O \Big ( \sqrt{\frac{ \log (K)}{ K n_{min} }} + ( L\eta^{t+1} \varepsilon_0 + \zeta )^2 \Big ), \text{where } \gamma_k = 1 / K.
%     \end{aligned}
% \right.

\end{proof}

\subsubsection{Proof of Lemma \ref{lemma:v_bound}}
\begin{proof}
(of Lemma \ref{lemma:v_bound})

Define $\alpha = \Theta (\v, \widetilde \v)$ as the angle between the vectors $\v$ and $\widetilde \v$. Then, according to the Davis--Kahan theorem \cite{yu2015useful}, the following inequality holds:
\begin{align*}
\sin(\alpha) \leq \frac{\|Q - \widetilde Q \|}{\xi}. 
\end{align*}

Then we begin to bound $\| \v - \widetilde \v \|$:
\begin{align*}
&
\| \v - \widetilde \v \| 
\\
= &
\sqrt{\| \v \|^2 + \| \widetilde \v \|^2 - 2 \v^{\top} \widetilde \v}
\\
= &
\sqrt{2 \big (1 - \cos(\alpha) \big) }
\\
= &
\sqrt{2 \big (1 - \sqrt{1 - \sin^2(\alpha)} \big) }
\\
\leq &
\sqrt{2 \big (1 - \sqrt{1 - \frac{\|Q - \widetilde Q \|^2}{\xi^2} } \big) }
\\
\leq &
\sqrt{2 \big (1 - \sqrt{1 - \frac{\epsilon_Q^2}{\xi^2} } \big) }
\\
\leq &
\sqrt{2 - 2 \big (1 - \frac{\epsilon_Q^2}{\xi^2} \big ) }
\\
= &
\sqrt{2}\frac{\epsilon_Q}{\xi},
\end{align*}
where the first inequality is due to the Davis--Kahan theorem, and the third inequality is due to $\sqrt{1 - x} \geq 1 - x$ when $0 \leq x \leq 1$ and $ 0 \leq \frac{\epsilon_Q^2}{\xi^2} \leq 1$.

\end{proof}


\newpage{}

\section{Supplementary Numerical Experiments}
    \label{Additional Numerical Experiments}
   
    \subsection{Experimental Details} 
        \label{experiments details}
        \paragraph{Dataset.} 
            \label{appendix: para: Dataset}
            To obtain impartial experimental results, we conduct experiments on four widely used benchmark datasets, ADULT, COMPAS, DRUG, and DUTCH, following the setups of \cite{chu2021fedfair}, \cite{du2020fairnessaware} and \cite{donini2020empirical}.
            Specifically, the ADULT dataset \cite{10.5555/3001460.3001502} contains $45{,}222$ instances, where the training and test parts are two separate files consisting of $32$K and $14$K samples, respectively, and the training data is partitioned into $50$ clients. 
            The binary class label of each instance indicates whether a person's annual income exceeds \$50K. 
            Following the settings of \cite{10.5555/3157382.3157469}, we take gender as the sensitive attribute. 
            The COMPAS \cite{compas_analysis} and the DRUG \cite{2017DRUG} dataset contain $5,278$ and $1,885$ data instances, respectively. 
            Following the design of \cite{chu2021fedfair}, we uniformly sample $4,800$ and $1,600$ instances as the training data from COMPAS and DRUG, respectively, and then use the remaining part as the test dataset.
            The training data in the experiments is divided into $20$ clients for the COMPAS dataset and $10$ clients for the DRUG dataset.
            % \shired{Analogously, the binary class label of an instance in two dataset represents the person's characteristics.}
            In COMPAS, the binary class label indicates whether the person is a recidivist or not, while in DRUG, it manifests whether the person abuses a volatile substance or not.
            Following \cite{chu2021fedfair}, we use (`African-American', `Caucasian') as the sensitive attribute in COMPAS and (`white', `non-white') in DRUG. 
            The DUTCH dataset collects personal information of the inhabitants in the Netherlands and the task is to classify the individual into high-income or low-income, with gender as the protected attribute.
            It contains $60,419$ data instances.
            We also sample 80\% data to construct the training set and use the remaining part as the test dataset.
    
        \paragraph{Hyperparameters.} 
            \label{appendix: para: Hyperparameters}
            In this paper, several combinations of hyperparameters are adopted, including the regularization parameter $\lambda \in \{0.1, 0.5, 1, 5, 1000\}$, the temperature parameter $\rho = 0.1$, the training rounds $T$ and local update iterations $M$ with $(T, M) \in \{(100,10), (100, 4), (100,2), (500,50), (1000,4) \}$, and the proportion of stragglers $\alpha \in \{0, 0.3, 0.5\}$.
            % The total number of synchronous steps $M$ is calculated by the ratio of synchronous multiplying the algorithm epoch $T$.
            Most experimental settings of baselines follow the configurations proposed by the original authors.
            The structure of the logistic regression model follows \cite{Baharlouei2020Rényi}.
            We fix the batch size as $64$.
            We tune the optimization step size $\eta$ in \{5e-3, 2e-3, 1e-3, 5e-2, 2e-2, 1e-2, 0.5, 0.2, 0.1, 5, 2, 1\}, and pick the optimal setting for each dataset by observing the average of top-20 HM values of
            the model trained by FedAvg.
            Then we set $\eta$ for every experiment in our work on ADULT, COMPAS, DRUG, and DUTCH to $5$, $0.1$, $0.02$, and $0.1$, respectively.
            The numbers of clients in the FL system on ADULT, COMPAS, DRUG, and DUTCH are set to $50$, $20$, $10$, and $30$, respectively, following \cite{du2020fairnessaware, chu2021fedfair, 2023FairFed}.
            The fraction of active clients in the FL system is set to $0.4$.
            The regularization hyperparameter $\mu$ in FedProx \cite{FedProx} and Scaffold \cite{karimireddy2020scaffold} is tuned in \{0.01, 0.1, 0.5, 1, 2\}, and we set $\mu=1$ according to the optimal result.
            The hyperparameters used in FedFair \cite{chu2021fedfair}, LCO \cite{chu2021fedfair}, FL-FairBatch \cite{2021FairBatch}, FedFB \cite{zeng2021improving}, and FairFed \cite{2023FairFed} follow the settings proposed by the authors.
            Additionally, we follow the common stopping criteria in FL that each algorithm stops training when the number of training rounds $T$ is reached.
            
            
        
        \paragraph{Baselines.} 
        In this paper, we adopt the following state-of-the-art algorithms as our baselines, which are designed for the problems of heterogeneity and group fairness:
        
        \begingroup
        \leftskip=1.0em

        % \begin{itemize}
            \textbullet~Local: To observe the effect of server aggregation, we also adopt the local training setting, where each client updates their model by only local training.
            
            \textbullet~FedAvg \cite{mcmahan2017communication}: the original FL algorithm which does not consider fairness for different demographic groups.
            
            
            \textbullet~FedProx \cite{FedProx}: the representative FL algorithm for tackling the statistical and system heterogeneity problem by solving an optimization objective with a regularization constraint. 
            We compare the performance of FedProx to verify the effectiveness of FedR\'enyi in tackling heterogeneous problem.  
            
            \textbullet~Scaffold \cite{karimireddy2020scaffold}: the FL algorithm for statistical heterogeneous problem by constructing regularization term with aggregated variate.
            We set Scaffold in our comparative group to observe the impact of statistical heterogeneity.
            
            
            \textbullet~FedFair \cite{chu2021fedfair}: the cross-silo federated framework for group fairness by leveraging estimated statistics from participants.
            We use FedFair as a baseline to compare accuracy, fairness, and their trade-off.
            

            \textbullet~LCO \cite{chu2021fedfair}: the local variant of FedFair for the local optimization problem.
            We take LCO as our baseline for the same reason as FedFair.
            

            \textbullet~FedAvg+FairBatch (FL-FairBatch) \cite{2021FairBatch}: the 
            enhancement of FedAvg that each client adopts the FairBatch to debias its local training data.
            To verify the effectiveness of FedR\'enyi in tackling group fairness problem, we take this algorithm as our baseline.
            
            
            \textbullet~FedFB \cite{zeng2021improving}: an in-processing debiasing approach in FL based on FairBatch, where the server computes new weights for each client based on their statistics.
            Improving group fairness by leveraging the aggregated local statistics from each client, we compare this method with FedR\'enyi.
            

            \textbullet~FairFed \cite{2023FairFed}:
            the federated framework which adjusts the aggregation weights of clients according to the deviations between local and global fairness metrics or accuracy.
            We take this method as our baseline to compare the effect of different aggregation methods to solve the group fairness problem and optimize the accuracy of model.
            
        % \end{itemize}
        \endgroup
        
        
        \paragraph{Communication Simulation.}
            To simulate the network latency in practice, we use the beta distribution to simulate the communication ability of each client.
            Specifically, we use the beta distribution generation package in Scipy.
            The hyperparameters of beta distribution ($a$ and $b$) are set as 0.3 and 1, respectively.
            The results of the probability density function (PDF) are used to compute the latency.
            For PDF values that tend to be positively infinite, we trim them to 16 based on network programming in practice, where the network wait time has a specific upper limit.
            Each client has a communication delay of at least 1 second.
            % The values of the hyperparameters mentioned above, such as the maximum and minimum delay settings, can be modified based on realistic network conditions.      
   
        % \paragraph{Accuracy and Fairness.}
        %     The accuracy and fairness performances of the different methods on another three datasets are illustrated in Figure. \ref{fig: ADULT fairness and accuracy}-\ref{fig: DRUG fairness and accuracy}. 
        %     To observe the trade-off ability, we choose as diverse and fair experimental results as possible, that is, the results that show the trade-off ability of each algorithm for the accuracy and fairness performance as much as possible.
        %     Not surprisingly, the models trained by FedAvg cannot maintain outstanding performance in fairness due to the ignorance of fairness during training, according to the results in the diagrams of different datasets.
        %     In most cases, FedR\'enyi performed better than other baselines, obtaining higher accuracy with the same fairness level or better fairness with the same accuracy. 
        %     \begin{figure*}[htb]
        %         \centering
        %         \subfigure[FR and ACC in ADULT with Heterogeneous setting]{
        %             \begin{minipage}[b]{0.48\textwidth}
        %             \includegraphics[width=1\textwidth]{new_figure/ijcai2024/scatter_diagram/Dirichlet05/no_attribute_skew/ADULT/LR/scatter.eps}
        %             \end{minipage}
        %         }
        %         \subfigure[FR and ACC in ADULT with Uniform setting]{
        %             \begin{minipage}[b]{0.48\textwidth}
        %             \includegraphics[width=1\textwidth]{new_figure/ijcai2024/scatter_diagram/Uniform/no_attribute_skew/ADULT/LR/scatter.eps}
        %             \end{minipage}
        %         }
        %         \vspace{-1.2mm}
        %         \caption{The FR and ACC performances of the models trained by all the methods in the ADULT dataset. Due to poor performance, some points may not be pointed in the main range, and some methods have less than five markers due to overlapping points.}
        %         \label{fig: ADULT fairness and accuracy}
        %     \end{figure*}
            
        %     \begin{figure*}[!ptb]
        %         \centering
        %         \subfigure[FR and ACC in COMPAS with Heterogeneous setting]{
        %             \begin{minipage}[b]{0.48\textwidth}
        %             \includegraphics[width=1\textwidth]{new_figure/ijcai2024/scatter_diagram/Dirichlet05/no_attribute_skew/COMPAS/LR/scatter.eps}
        %             \end{minipage}
        %         }
        %         \subfigure[FR and ACC in COMPAS with Uniform setting]{
        %             \begin{minipage}[b]{0.48\textwidth}
        %             \includegraphics[width=1\textwidth]{new_figure/ijcai2024/scatter_diagram/Uniform/no_attribute_skew/COMPAS/LR/scatter.eps}
        %             \end{minipage}
        %         }
        %         \vspace{-1.2mm}
        %         \caption{The FR and ACC performances of the models trained by all the methods in the COMPAS dataset. Due to poor performance, some points may not be pointed in the main range, and some methods have less than five markers due to overlapping points.}
        %         \label{fig: COMPAS fairness and accuracy}
        %     \end{figure*}
            
        %     \begin{figure*}[!ptb]
        %         \centering
        %         \subfigure[FR and ACC in DRUG with Heterogeneous setting]{
        %             \begin{minipage}[b]{0.48\textwidth}
        %             \includegraphics[width=1\textwidth]{new_figure/ijcai2024/scatter_diagram/Dirichlet05/no_attribute_skew/DRUG/LR/scatter.eps}
        %             \end{minipage}
        %         }
        %         \subfigure[FR and ACC in DRUG with Uniform setting]{
        %             \begin{minipage}[b]{0.48\textwidth}
        %             \includegraphics[width=1\textwidth]{new_figure/ijcai2024/scatter_diagram/Uniform/no_attribute_skew/DRUG/LR/scatter.eps}
        %             \end{minipage}
        %         }
        %         \vspace{-1.2mm}
        %         \caption{The FR and ACC performances of the models trained by all the methods in the DRUG dataset. Due to poor performance, some points may not be pointed in the main range, and some methods have less than five markers due to overlapping points.}
        %         \label{fig: DRUG fairness and accuracy}
        %     \end{figure*}

\subsection{Additional Experiments for Results in Main paper}
                \paragraph{Effect of Regularization parameter $\lambda$.}
            Next, we analyze how the regularization parameter $\lambda$ affects the performance of the models trained by FedR\'enyi in another three datasets. 
            As shown in Figure~\ref{fig: the effect of parameter lambda in Appendix}, for each dataset with two data distribution settings, we train and fine-tune the base models with $\lambda$ in $\{0.1, 0.5, 1, 5, 1000\}$.
            
            Among these empirical results, the fairness performance of FedR\'enyi shows a growing trend in most cases, while the accuracy, on the contrary, tends to decrease or remain unchanged, especially on DUTCH, as shown in Figure 2 of the main text.
            Recalling Equation \ref{eq:fedrenyi_empirical_objective}, the regularization term becomes more significant in the FL training objective when $\lambda$ becomes larger, which might compromise accuracy.
            Therefore, these results further demonstrate that FedR\'enyi can construct a trade-off between accuracy and fairness.
            However, the FedR\'enyi performance does not always exhibit the expected trade-off.
            As shown in Figure \ref{fig: the effect of parameter lambda in Appendix}, the accuracy of the models does not always drop as $\lambda$ increases. 
            We speculate that the sensitive features in these datasets are easy to identify.
            Therefore, increasing $\lambda$ might over-optimize the empirical target, making the ACC unstable.
            Overall, these defects do not obscure the fact that the FedR\'enyi algorithm can balance accuracy and fairness by adjusting the value of $\lambda$ and obtain the optimal results according to different preferences.
            % When the amount of data is suitable to the model (simple model trained by few data, complex model trained by abundant data), FedR\'enyi can have a desired grasp of trade-off by fine-tuning the regularization parameter.
            \label{appendix: effect of lambda}
            \begin{figure}[h]
                \centering
                \begin{minipage}[]{0.23\textwidth}
                \includegraphics[width=1\textwidth]{new_figure/ijcai2024/lamda_diagram/Dirichlet05/no_attribute_skew/ADULT/LR/Lamda.pdf}
                \end{minipage}
                \begin{minipage}[]{0.23\textwidth}
                \includegraphics[width=1\textwidth]{new_figure/ijcai2024/lamda_diagram/Dirichlet05/no_attribute_skew/COMPAS/LR/Lamda.eps}
                \end{minipage}
                \begin{minipage}[]{0.23\textwidth}
                \includegraphics[width=1\textwidth]{new_figure/ijcai2024/lamda_diagram/Dirichlet05/no_attribute_skew/DRUG/LR/Lamda.pdf}
                \end{minipage}
                \begin{minipage}[]{0.23\textwidth}
                \includegraphics[width=1\textwidth]{new_figure/ijcai2024/lamda_diagram/Dirichlet05/no_attribute_skew/DUTCH/LR/Lamda.eps}
                \end{minipage}
                
                
                \begin{minipage}[]{0.23\textwidth}
                \includegraphics[width=1\textwidth]{new_figure/ijcai2024/lamda_diagram/Uniform/no_attribute_skew/ADULT/LR/Lamda.pdf}
                \end{minipage}
                \begin{minipage}[]{0.23\textwidth}
                \includegraphics[width=1\textwidth]{new_figure/ijcai2024/lamda_diagram/Uniform/no_attribute_skew/COMPAS/LR/Lamda.pdf}
                \end{minipage}
                \begin{minipage}[]{0.23\textwidth}
                \includegraphics[width=1\textwidth]{new_figure/ijcai2024/lamda_diagram/Uniform/no_attribute_skew/DRUG/LR/Lamda.pdf}
                \end{minipage}
                \begin{minipage}[]{0.23\textwidth}
                \includegraphics[width=1\textwidth]{new_figure/ijcai2024/lamda_diagram/Uniform/no_attribute_skew/DUTCH/LR/Lamda.pdf}
                \end{minipage}
                

                \caption{The effect of parameter $\lambda$ on four datasets.}
                \label{fig: the effect of parameter lambda in Appendix}
            \end{figure}
        
        \paragraph{The Trade-off between Accuracy and Fairness.}
            A comparison of testing ACC and FR about each algorithm is shown in Figure \ref{fig:Appendix fairness and accuracy (LR)}.
            Only the top-5 results (with better HM value) of each algorithm are plotted, and some methods show fewer than 5 points because of overlapping markers.
            Intuitively, red and yellow scatters (FedR\'enyi results) get closer to the optimal corner than others in most cases. 
            Besides, these scatters approximately form several curves, exhibiting the trade-off ability between ACC and FR.
            In particular, most baselines behave closely in these experiments, except the FairFed (blue).
            Some blue triangles tend to be towards the upper left, which means FairFed may over-emphasize fairness, thus penalizing the accuracy.
            
            \begin{figure*}[h]
                \centering
                \begin{minipage}[]{0.23\textwidth}
                \includegraphics[width=1\textwidth]{new_figure/ijcai2024/scatter_diagram/Dirichlet05/no_attribute_skew/ADULT/LR/scatter.pdf}
                \end{minipage}
                \begin{minipage}[]{0.23\textwidth}
                \includegraphics[width=1\textwidth]{new_figure/ijcai2024/scatter_diagram/Dirichlet05/no_attribute_skew/COMPAS/LR/scatter.pdf}
                \end{minipage}
                \begin{minipage}[]{0.23\textwidth}
                \includegraphics[width=1\textwidth]{new_figure/ijcai2024/scatter_diagram/Dirichlet05/no_attribute_skew/DRUG/LR/scatter.pdf}
                \end{minipage}
                \begin{minipage}[]{0.23\textwidth}
                \includegraphics[width=1\textwidth]{new_figure/ijcai2024/scatter_diagram/Dirichlet05/no_attribute_skew/DUTCH/LR/scatter.pdf}
                \end{minipage}
                
                \begin{minipage}[]{0.23\textwidth}
                \includegraphics[width=1\textwidth]{new_figure/ijcai2024/scatter_diagram/Uniform/no_attribute_skew/ADULT/LR/scatter.pdf}
                \end{minipage}
                \begin{minipage}[]{0.23\textwidth}
                \includegraphics[width=1\textwidth]{new_figure/ijcai2024/scatter_diagram/Uniform/no_attribute_skew/COMPAS/LR/scatter.pdf}
                \end{minipage}   
                \begin{minipage}[]{0.23\textwidth}
                \includegraphics[width=1\textwidth]{new_figure/ijcai2024/scatter_diagram/Uniform/no_attribute_skew/DRUG/LR/scatter.pdf}
                \end{minipage}
                \begin{minipage}[]{0.23\textwidth}
                \includegraphics[width=1\textwidth]{new_figure/ijcai2024/scatter_diagram/Uniform/no_attribute_skew/DUTCH/LR/scatter.pdf}
                \end{minipage}
            
                % \vspace{-2mm}
                \caption{The ACC and FR trade-off in four datasets with two data distribution settings ($Dir$=0.5, $Dir$=+$\infty$).
                Some methods show fewer than 5 points because of overlapping markers.
                FedR\'enyi performs closer to the optimal (the top right) and approximately forms trade-off curves from the bottom right (most accurate and least fair) to the top left (least accurate and most fair).
                }
                \label{fig:Appendix fairness and accuracy (LR)}
                % \vspace{-2.0mm}
            \end{figure*}
        
        % \newpage{}
        \paragraph{Convergence.}
            To further verify the convergence properties of FedR\'enyi in the heterogeneous and the uniform (isomorphism) data distribution setting, we visually record the training loss at different communication rounds in
            Figure \ref{fig:isomorphism loss} and \ref{fig:isomorphism ACC and FR}.
            Also, the testing ACC, testing FR of FedR\'enyi at different communication rounds on four datasets are presented.
            Intuitively, the training losses of FedR\'enyi decrease with increasing communication and become stable at around $25$ rounds in ADULT and DUTCH, and at around $50$ rounds in COMPAS and DRUG.
            % Specifically, in ARRHYTHMIA, the loss records of FedR\'enyi, Scaffold, FedFB, and FL-FairBatch are relatively stable.
            % While in COMPAS and DRUG, most of the algorithms except Scaffold and Fairfed can converge in both the uniform setting and the heterogeneous setting.
            \begin{figure}[!htbp]
              % \centering
              \begin{minipage}[t]{\textwidth}
                    \includegraphics[width=\textwidth]{new_figure/aaai2025/loss_diagram/Dirichlet05/loss.pdf}
                    \includegraphics[width=\textwidth]{new_figure/aaai2025/loss_diagram/Uniform/loss.pdf}
                    
              \end{minipage}
              \caption{The training loss of FedR\'enyi under the heterogeneous and the uniform (isomorphism) data settings on four datasets, which verifies that FedR\'enyi converges to a stable range after a certain number of rounds.}
              \label{fig:isomorphism loss}
            \end{figure}

            \begin{figure}[!htbp]
              % \centering
              \begin{minipage}[t]{\textwidth}
                \includegraphics[width=\textwidth]{new_figure/aaai2025/ACC_over_communication_diagram/Dirichlet05/ACC.pdf}
                \includegraphics[width=\textwidth]{new_figure/aaai2025/FR_over_communication_diagram/Dirichlet05/FR.pdf}
                \includegraphics[width=\textwidth]{new_figure/aaai2025/ACC_over_communication_diagram/Uniform/ACC.pdf}
                \includegraphics[width=\textwidth]{new_figure/aaai2025/FR_over_communication_diagram/Uniform/FR.pdf}
                    
              \end{minipage}
              \caption{The training ACC and FR of FedR\'enyi under the heterogeneous and the uniform (isomorphism) data settings on four datasets, which verifies that FedR\'enyi converges to a stable range after a certain number of rounds.}
              \label{fig:isomorphism ACC and FR}
            \end{figure}
\clearpage
\newpage
    
    \subsection{Robustness Experiment}
    \label{subsection:appendix:robustness_experiment}
        Compared to the general machine learning scenario, one of the implementation challenges of FL is client communication.
        In this paper, we adjust the total number of steps $T$ and the local update interval $M$ to investigate the effect of communication frequency.
        We also adjust the client dropping rate to simulate the situation where some clients are lost in a communication round.
        To verify the robustness of FedR\'enyi to changes in communication conditions, two groups of additional experiments are set up.
        
        \paragraph{Robustness of total iteration $T$ and local update iteration $M$.}
            \label{robustness: T & I}
            To evaluate the sensitivity of FedR\'enyi to communication frequency, we compare the HM of FedR\'enyi with that of FedAvg and FairFed.
            It is well known that communication costs are especially expensive in FL.
            Comparing the results in different columns of Table~\ref{table:the effect of $T$ and $I$ on COMPAS and DRUG}, the performance of FedR\'enyi with different $\gamma_k$ settings improves in most cases as the number of training rounds ($T$) or the number of local updates ($M$) is increased.
            For example, when the number of communication rounds increases with the same number of training epochs, e.g., (T:100, M:10) to (T:100, M:4) or (T:100, M:4) to (T:100, M:2), the accuracy of our proposed method is improved.

            Comparing the results obtained with the same communication frequency, e.g., (T:100, M:10) and (T:500, M:50), the overall performance of FedR\'enyi remains highly stable, which demonstrates the robustness of our algorithm to communication costs.
            In conclusion, the results of the supplementary experiment provide evidence of the robustness of FedR\'enyi to different numbers of training epochs and communication rounds.

            
            \begin{table*}[htb]
                \centering
              \caption{The effect of the total steps $T$ and local updates $M$ on the HM on COMPAS and DUTCH with different heterogeneous settings.}
              \label{table:the effect of $T$ and $I$ on COMPAS and DRUG}
                \begin{tabular}{|cccccc|}
                \hline
                \multicolumn{2}{|c|}{{ }} &
                  \multicolumn{2}{c|}{\textbf{Step T}} &
                  \multicolumn{2}{c|}{\textbf{Local Updates M}} \\ \cline{3-6} 
                \multicolumn{2}{|c|}{{ }} &
                  \multicolumn{1}{c|}{\textbf{T=100, M=10}} &
                  \multicolumn{1}{c|}{\textbf{T=100, M=4}} &
                  \multicolumn{1}{c|}{\textbf{T=100, M=2}} &
                  \textbf{T=500, M=50} \\ \cline{3-6} 
                \multicolumn{2}{|c|}{\multirow{-3}{*}{{ \textbf{Method}}}} &
                  \multicolumn{1}{c|}{{\textbf{Dir=0.5/+$\infty$}}} &
                  \multicolumn{1}{c|}{{\textbf{Dir=0.5/+$\infty$}}} &
                  \multicolumn{1}{c|}{{\textbf{Dir=0.5/+$\infty$}}} &
                  {\textbf{Dir=0.5/+$\infty$}} \\ \hline
                \multicolumn{6}{|c|}{\textit{\textbf{COMPAS}}} \\ \hline
                \multicolumn{2}{|c|}{\textbf{FedAvg}} &
                  \multicolumn{1}{c|}{0.72/0.71} &
                  \multicolumn{1}{c|}{0.723/0.713} &
                  \multicolumn{1}{c|}{0.72/0.71} &
                  0.713/0.72 \\ \hline
                \multicolumn{2}{|c|}{\textbf{FairFed}} &
                  \multicolumn{1}{c|}{0.68/0.71} &
                  \multicolumn{1}{c|}{0.73/0.65} &
                  \multicolumn{1}{c|}{0.64/0.68} &
                  0.69/0.63 \\ \hline
                \multicolumn{1}{|c|}{} &
                  \multicolumn{1}{c|}{\textbf{$\lambda$=1000}} &
                  \multicolumn{1}{c|}{0.71/0.715} &
                  \multicolumn{1}{c|}{0.703/0.717} &
                  \multicolumn{1}{c|}{0.72/0.717} &
                  0.72/0.715 \\ \cline{2-6} 
                \multicolumn{1}{|c|}{\multirow{-2}{*}{\textbf{\begin{tabular}[c]{@{}c@{}}FedR\'enyi \\      ($1/K$)\end{tabular}}}} &
                  \multicolumn{1}{c|}{\textbf{$\lambda$=0.1}} &
                  \multicolumn{1}{c|}{0.71/0.715} &
                  \multicolumn{1}{c|}{0.717/0.718} &
                  \multicolumn{1}{c|}{0.72/0.722} &
                  0.715/0.713 \\ \hline
                \multicolumn{1}{|c|}{} &
                  \multicolumn{1}{c|}{\textbf{$\lambda$=1000}} &
                  \multicolumn{1}{c|}{0.712/0.713} &
                  \multicolumn{1}{c|}{0.718/0.717} &
                  \multicolumn{1}{c|}{0.72/0.71} &
                  0.725/0.722 \\ \cline{2-6} 
                \multicolumn{1}{|c|}{\multirow{-2}{*}{\textbf{\begin{tabular}[c]{@{}c@{}}FedR\'enyi \\      ($n_k/n$)\end{tabular}}}} &
                  \multicolumn{1}{c|}{\textbf{$\lambda$=0.1}} &
                  \multicolumn{1}{c|}{0.718/0.715} &
                  \multicolumn{1}{c|}{0.722/0.72} &
                  \multicolumn{1}{c|}{0.723/0.72} &
                  0.718/0.715 \\ \hline
                \multicolumn{6}{|c|}{\textit{\textbf{DUTCH}}} \\ \hline
                \multicolumn{2}{|c|}{\textbf{FedAvg}} &
                  \multicolumn{1}{c|}{0.69/0.7} &
                  \multicolumn{1}{c|}{0.687/0.71} &
                  \multicolumn{1}{c|}{0.683/0.697} &
                  0.8/0.797 \\ \hline
                \multicolumn{2}{|c|}{\textbf{FairFed}} &
                  \multicolumn{1}{c|}{0.69/0.73} &
                  \multicolumn{1}{c|}{0.51/0.59} &
                  \multicolumn{1}{c|}{0.75/0.69} &
                  0.64/0.63 \\ \hline
                \multicolumn{1}{|c|}{} &
                  \multicolumn{1}{c|}{\textbf{$\lambda$=1000}} &
                  \multicolumn{1}{c|}{0.805/0.798} &
                  \multicolumn{1}{c|}{0.768/0.773} &
                  \multicolumn{1}{c|}{0.747/0.737} &
                  0.85/0.808 \\ \cline{2-6} 
                \multicolumn{1}{|c|}{\multirow{-2}{*}{\textbf{\begin{tabular}[c]{@{}c@{}}FedR\'enyi \\      ($1/K$)\end{tabular}}}} &
                  \multicolumn{1}{c|}{\textbf{$\lambda$=0.1}} &
                  \multicolumn{1}{c|}{0.805/0.795} &
                  \multicolumn{1}{c|}{0.784/0.767} &
                  \multicolumn{1}{c|}{0.769/0.742} &
                  0.809/0.805 \\ \hline
                \multicolumn{1}{|c|}{} &
                  \multicolumn{1}{c|}{\textbf{$\lambda$=1000}} &
                  \multicolumn{1}{c|}{0.805/0.795} &
                  \multicolumn{1}{c|}{0.77/0.772} &
                  \multicolumn{1}{c|}{0.733/0.73} &
                  0.807/0.83 \\ \cline{2-6} 
                \multicolumn{1}{|c|}{\multirow{-2}{*}{\textbf{\begin{tabular}[c]{@{}c@{}}FedR\'enyi \\      ($n_k/n$)\end{tabular}}}} &
                  \multicolumn{1}{c|}{\textbf{$\lambda$=0.1}} &
                  \multicolumn{1}{c|}{0.812/0.807} &
                  \multicolumn{1}{c|}{0.787/0.768} &
                  \multicolumn{1}{c|}{0.775/0.74} &
                  0.853/0.84 \\ \hline
                \end{tabular}
            \end{table*}

        \paragraph{Asynchronous effect.}
            To verify the performance of the asynchronous FedR\'enyi (Option II in Algorithm~\ref{alg:FR}), we build experiments that simulate different communication thresholds to control the proportion of stragglers $\alpha$.
            Generally, as the proportion of stragglers ($\alpha$) increases, the amount of data available to the algorithm decreases significantly, resulting in degraded HM.
            As shown in Tables~\ref{table:Appendix acceleration_bias} and~\ref{table:asy in heterogeneous}, the asynchronous FedR\'enyi not only achieves stable HM but also does fairly well in bias control.
            When the asynchronous scheme is utilized in the training process of FedR\'enyi, there is a tolerable decline in HM (smaller than $0.06$).
            These results demonstrate that our method can accelerate the training process in the presence of stragglers with only a small performance decline.
            \begin{table}[!htbp]
                \centering
                \caption{
                The HM and the average approximation errors over stragglers with different $\alpha$. 
                These approximation errors are measured by the L2 distance between the approximation values and the corresponding target from stragglers.
                }
                \label{table:Appendix acceleration_bias}
                \begin{tabular}{|cccc|}
                \hline
                \multicolumn{1}{|c|}{\textbf{Dir=0.5~$\lambda$=1}} &
                  \multicolumn{1}{c|}{\textbf{Drop $\alpha$:   0\%}} &
                  \multicolumn{1}{c|}{\textbf{Drop $\alpha$:   30\%}} &
                  \textbf{Drop $\alpha$:   50\%} \\ \hline
                \multicolumn{1}{|c|}{\textbf{(T, M) = (100, 4)}} &
                  \multicolumn{1}{c|}{\textbf{\begin{tabular}[c]{@{}c@{}}HM/j   Error/\\      u Error/$\theta$ Error\end{tabular}}} &
                  \multicolumn{1}{c|}{\textbf{\begin{tabular}[c]{@{}c@{}}HM/j   Error/\\      u Error/$\theta$ Error\end{tabular}}} &
                  \textbf{\begin{tabular}[c]{@{}c@{}}HM/j   Error/\\      u Error/$\theta$ Error\end{tabular}} \\ \hline

                \multicolumn{4}{|c|}{\textbf{ADULT}} \\ \hline
                \multicolumn{1}{|c|}{\textbf{\begin{tabular}[c]{@{}c@{}}
                % pre-trained \\
                Asynchronous \\      FedR\'enyi~($n_k/n$)\end{tabular}}} &
                  \multicolumn{1}{c|}{\begin{tabular}[c]{@{}c@{}}0.88/0/\\      0/0\end{tabular}} &
                  \multicolumn{1}{c|}{\begin{tabular}[c]{@{}c@{}}0.88/0.03/\\      0.04/0.01\end{tabular}} &
                  \begin{tabular}[c]{@{}c@{}}0.87/0.03/\\      0.04/0.02\end{tabular} \\ \hline
                \multicolumn{1}{|c|}{\textbf{\begin{tabular}[c]{@{}c@{}}
                % pre-trained \\      
                Asynchronous \\      FedR\'enyi~($1/K$)\end{tabular}}} &
                  \multicolumn{1}{c|}{\begin{tabular}[c]{@{}c@{}}0.88/0/\\      0/0\end{tabular}} &
                  \multicolumn{1}{c|}{\begin{tabular}[c]{@{}c@{}}0.88/0.03/\\      0.04/0.01\end{tabular}} &
                  \begin{tabular}[c]{@{}c@{}}0.87/0.03/\\      0.04/0.02\end{tabular} \\ \hline
                \multicolumn{1}{|c|}{\textbf{\begin{tabular}[c]{@{}c@{}}Synchronous \\      FedR\'enyi~($n_k/n$)\end{tabular}}} &
                  \multicolumn{1}{c|}{\begin{tabular}[c]{@{}c@{}}0.88/0/\\      0/0\end{tabular}} &
                  \multicolumn{1}{c|}{\begin{tabular}[c]{@{}c@{}}0.88/0/\\      0/0\end{tabular}} &
                  \begin{tabular}[c]{@{}c@{}}0.88/0/\\      0/0\end{tabular} \\ \hline  
                \multicolumn{4}{|c|}{\textbf{COMPAS}} \\ \hline
                % \multicolumn{1}{|c|}{\textbf{\begin{tabular}[c]{@{}c@{}}W/O pre-trained \\      Asynchronous \\      FedR\'enyi ~($n_k/n$)\end{tabular}}} &
                %   \multicolumn{1}{c|}{\begin{tabular}[c]{@{}c@{}}0.76/0/\\      0/0\end{tabular}} &
                %   \multicolumn{1}{c|}{\begin{tabular}[c]{@{}c@{}}0.73/0.03/\\      0.01/0.90\end{tabular}} &
                %   \begin{tabular}[c]{@{}c@{}}0.71/0.01/\\      0.02/1.45\end{tabular} \\ \hline
                % \multicolumn{1}{|c|}{\textbf{\begin{tabular}[c]{@{}c@{}}W/O pre-trained \\      Asynchronous \\      FedR\'enyi ~($1/K$)\end{tabular}}} &
                %   \multicolumn{1}{c|}{\begin{tabular}[c]{@{}c@{}}0.75/0/\\      0/0\end{tabular}} &
                %   \multicolumn{1}{c|}{\begin{tabular}[c]{@{}c@{}}0.72/0.04/\\      0.01/0.21\end{tabular}} &
                %   \begin{tabular}[c]{@{}c@{}}0.74/0.01/\\      0.02/0.36\end{tabular} \\ \hline
                \multicolumn{1}{|c|}{\textbf{\begin{tabular}[c]{@{}c@{}}
                % pre-trained \\
                Asynchronous \\      FedR\'enyi~($n_k/n$)\end{tabular}}} &
                  \multicolumn{1}{c|}{\begin{tabular}[c]{@{}c@{}}0.77/0/\\      0/0\end{tabular}} &
                  \multicolumn{1}{c|}{\begin{tabular}[c]{@{}c@{}}0.73/0.03/\\      0.01/0.92\end{tabular}} &
                  \begin{tabular}[c]{@{}c@{}}0.71/0.01/\\      0.02/1.41\end{tabular} \\ \hline
                \multicolumn{1}{|c|}{\textbf{\begin{tabular}[c]{@{}c@{}}
                % pre-trained \\      
                Asynchronous \\      FedR\'enyi~($1/K$)\end{tabular}}} &
                  \multicolumn{1}{c|}{\begin{tabular}[c]{@{}c@{}}0.75/0/\\      0/0\end{tabular}} &
                  \multicolumn{1}{c|}{\begin{tabular}[c]{@{}c@{}}0.76/0.04/\\      0.01/0.27\end{tabular}} &
                  \begin{tabular}[c]{@{}c@{}}0.78/0.01/\\      0.02/0.34\end{tabular} \\ \hline
                \multicolumn{1}{|c|}{\textbf{\begin{tabular}[c]{@{}c@{}}Synchronous \\      FedR\'enyi~($n_k/n$)\end{tabular}}} &
                  \multicolumn{1}{c|}{\begin{tabular}[c]{@{}c@{}}0.76/0/\\      0/0\end{tabular}} &
                  \multicolumn{1}{c|}{\begin{tabular}[c]{@{}c@{}}0.76/0/\\      0/0\end{tabular}} &
                  \begin{tabular}[c]{@{}c@{}}0.76/0/\\      0/0\end{tabular} \\ \hline
                \multicolumn{4}{|c|}{\textbf{DRUG}} \\ \hline
                % \multicolumn{1}{|c|}{\textbf{\begin{tabular}[c]{@{}c@{}}W/O pre-trained \\      Asynchronous \\      FedR\'enyi ~($n_k/n$)\end{tabular}}} &
                %   \multicolumn{1}{c|}{\begin{tabular}[c]{@{}c@{}}0.74/0/\\      0/0\end{tabular}} &
                %   \multicolumn{1}{c|}{\begin{tabular}[c]{@{}c@{}}0.72/0.01/\\      0.02/0.21\end{tabular}} &
                %   \begin{tabular}[c]{@{}c@{}}0.73/0.01/\\      0.02/0.37\end{tabular} \\ \hline
                % \multicolumn{1}{|c|}{\textbf{\begin{tabular}[c]{@{}c@{}}W/O pre-trained \\      Asynchronous \\      FedR\'enyi ~($1/K$)\end{tabular}}} &
                %   \multicolumn{1}{c|}{\begin{tabular}[c]{@{}c@{}}0.73/0/\\      0/0\end{tabular}} &
                %   \multicolumn{1}{c|}{\begin{tabular}[c]{@{}c@{}}0.71/0.01/\\      0.01/0.24\end{tabular}} &
                %   \begin{tabular}[c]{@{}c@{}}0.71/0.01/\\      0.02/0.35\end{tabular} \\ \hline
                \multicolumn{1}{|c|}{\textbf{\begin{tabular}[c]{@{}c@{}}
                % pre-trained \\
                Asynchronous \\      FedR\'enyi~($n_k/n$)\end{tabular}}} &
                  \multicolumn{1}{c|}{\begin{tabular}[c]{@{}c@{}}0.74/0/\\      0/0\end{tabular}} &
                  \multicolumn{1}{c|}{\begin{tabular}[c]{@{}c@{}}0.74/0.01/\\      0.01/0.25\end{tabular}} &
                  \begin{tabular}[c]{@{}c@{}}0.75/0.08/\\      0.03/0.40\end{tabular} \\ \hline
                \multicolumn{1}{|c|}{\textbf{\begin{tabular}[c]{@{}c@{}}
                % pre-trained \\
                Asynchronous \\      FedR\'enyi~($1/K$)\end{tabular}}} &
                  \multicolumn{1}{c|}{\begin{tabular}[c]{@{}c@{}}0.73/0/\\      0/0\end{tabular}} &
                  \multicolumn{1}{c|}{\begin{tabular}[c]{@{}c@{}}0.72/0.01/\\      0.02/0.29\end{tabular}} &
                  \begin{tabular}[c]{@{}c@{}}0.73/0.09/\\      0.02/0.37\end{tabular} \\ \hline
                \multicolumn{1}{|c|}{\textbf{\begin{tabular}[c]{@{}c@{}}Synchronous \\      FedR\'enyi~($n_k/n$)\end{tabular}}} &
                  \multicolumn{1}{c|}{\begin{tabular}[c]{@{}c@{}}0.74/0/\\      0/0\end{tabular}} &
                  \multicolumn{1}{c|}{\begin{tabular}[c]{@{}c@{}}0.74/0/\\      0/0\end{tabular}} &
                  \begin{tabular}[c]{@{}c@{}}0.74/0/\\      0/0\end{tabular} \\ \hline
                \multicolumn{4}{|c|}{\textbf{DUTCH}} \\ \hline
                % \multicolumn{1}{|c|}{\textbf{\begin{tabular}[c]{@{}c@{}}W/O pre-trained \\      Asynchronous \\      FedR\'enyi ~($n_k/n$)\end{tabular}}} &
                %   \multicolumn{1}{c|}{\begin{tabular}[c]{@{}c@{}}0.78/0/\\      0/0\end{tabular}} &
                %   \multicolumn{1}{c|}{\begin{tabular}[c]{@{}c@{}}0.76/0.03/\\      0.02/2.41\end{tabular}} &
                %   \begin{tabular}[c]{@{}c@{}}0.77/0.01/\\      0.02/3.93\end{tabular} \\ \hline
                % \multicolumn{1}{|c|}{\textbf{\begin{tabular}[c]{@{}c@{}}W/O pre-trained \\      Asynchronous \\      FedR\'enyi ~($1/K$)\end{tabular}}} &
                %   \multicolumn{1}{c|}{\begin{tabular}[c]{@{}c@{}}0.74/0/\\      0/0\end{tabular}} &
                %   \multicolumn{1}{c|}{\begin{tabular}[c]{@{}c@{}}0.79/0.03/\\      0.01/2.44\end{tabular}} &
                %   \begin{tabular}[c]{@{}c@{}}0.65/0.01/\\      0.02/3.79\end{tabular} \\ \hline
                \multicolumn{1}{|c|}{\textbf{\begin{tabular}[c]{@{}c@{}}      Asynchronous \\      FedR\'enyi~($n_k/n$)\end{tabular}}} &
                  \multicolumn{1}{c|}{\begin{tabular}[c]{@{}c@{}}0.78/0/\\      0/0\end{tabular}} &
                  \multicolumn{1}{c|}{\begin{tabular}[c]{@{}c@{}}0.77/0.03/\\      0.02/2.44\end{tabular}} &
                  \begin{tabular}[c]{@{}c@{}}0.79/0.03/\\      0.01/4.04\end{tabular} \\ \hline
                \multicolumn{1}{|c|}{\textbf{\begin{tabular}[c]{@{}c@{}}     Asynchronous \\      FedR\'enyi~($1/K$)\end{tabular}}} &
                  \multicolumn{1}{c|}{\begin{tabular}[c]{@{}c@{}}0.74/0/\\      0/0\end{tabular}} &
                  \multicolumn{1}{c|}{\begin{tabular}[c]{@{}c@{}}0.74/0.03/\\      0.02/2.47\end{tabular}} &
                  \begin{tabular}[c]{@{}c@{}}0.78/0.03/\\      0.01/4.01\end{tabular} \\ \hline
                \multicolumn{1}{|c|}{\textbf{\begin{tabular}[c]{@{}c@{}}Synchronous \\      FedR\'enyi~($n_k/n$)\end{tabular}}} &
                  \multicolumn{1}{c|}{\begin{tabular}[c]{@{}c@{}}0.78/0/\\      0/0\end{tabular}} &
                  \multicolumn{1}{c|}{\begin{tabular}[c]{@{}c@{}}0.78/0/\\      0/0\end{tabular}} &
                  \begin{tabular}[c]{@{}c@{}}0.78/0/\\      0/0\end{tabular} \\ \hline
                  
                \end{tabular}
            \end{table}

            
            \begin{table}[htbp]
              \centering
              \caption{
              The comparison between FedProx and FedR\'enyi with different straggler proportions in the heterogeneous setting. 
              }
              \label{table:asy in heterogeneous}
                \begin{tabular}{|cccc|}
                \hline
                \multicolumn{2}{|c|}{\textbf{Dir=0.5}} &
                  \multicolumn{2}{c|}{\textbf{(T, M) = (100, 4),~$\lambda$=1}} \\ \hline
                \multicolumn{1}{|c|}{\multirow{2}{*}{\textbf{Method}}} &
                  \multicolumn{1}{c|}{\textbf{Drop:   0\%}} &
                  \multicolumn{1}{c|}{\textbf{Drop:   30\%}} &
                  \textbf{Drop:   50\%} \\ \cline{2-4} 
                \multicolumn{1}{|c|}{} &
                  \multicolumn{1}{c|}{\textbf{ACC/FR/HM}} &
                  \multicolumn{1}{c|}{\textbf{ACC/FR/HM}} &
                  \textbf{ACC/FR/HM} \\ \hline
                \multicolumn{4}{|c|}{\textbf{ADULT}} \\ \hline
                \multicolumn{1}{|c|}{\textbf{FedProx}} &
                  \multicolumn{1}{c|}{0.84/0.91/0.87} &
                  \multicolumn{1}{c|}{0.84/0.93/0.88} &
                  0.84/0.93/0.88 \\ \hline
                % \multicolumn{1}{|c|}{\textbf{\begin{tabular}[c]{@{}c@{}}W/O pre-trained \\      Asynchronous \\      FedR\'enyi~($n_k/n$)\end{tabular}}} &
                %   \multicolumn{1}{c|}{0.85/0.92/0.88} &
                %   \multicolumn{1}{c|}{0.85/0.92/0.88} &
                %   0.85/0.92/0.88 \\ \hline
                % \multicolumn{1}{|c|}{\textbf{\begin{tabular}[c]{@{}c@{}}W/O pre-trained \\      Asynchronous \\      FedR\'enyi~($1/K$)\end{tabular}}} &
                %   \multicolumn{1}{c|}{0.85/0.91/0.87} &
                %   \multicolumn{1}{c|}{0.85/0.91/0.87} &
                %   0.85/0.91/0.87 \\ \hline
                \multicolumn{1}{|c|}{\textbf{\begin{tabular}[c]{@{}c@{}}     Asynchronous \\      FedR\'enyi~($n_k/n$)\end{tabular}}} &
                  \multicolumn{1}{c|}{\textbf{0.85/0.92/0.88}} &
                  \multicolumn{1}{c|}{\textbf{0.85/0.92/0.88}} &
                  \textbf{0.85/0.92/0.88} \\ \hline
                \multicolumn{1}{|c|}{\textbf{\begin{tabular}[c]{@{}c@{}}    Asynchronous \\      FedR\'enyi~($1/K$)\end{tabular}}} &
                  \multicolumn{1}{c|}{0.85/0.91/0.87} &
                  \multicolumn{1}{c|}{0.85/0.91/0.87} &
                  0.85/0.91/0.87 \\ \hline
                
                \multicolumn{4}{|c|}{\textbf{COMPAS}} \\ \hline
                \multicolumn{1}{|c|}{\textbf{FedProx}} &
                  \multicolumn{1}{c|}{0.68/0.83/0.75} &
                  \multicolumn{1}{c|}{0.69/0.85/0.76} &
                  0.69/0.85/0.76 \\ \hline
                % \multicolumn{1}{|c|}{\textbf{\begin{tabular}[c]{@{}c@{}}W/O pre-trained \\      Asynchronous \\      FedR\'enyi~($n_k/n$)\end{tabular}}} &
                %   \multicolumn{1}{c|}{0.68/0.89/0.77} &
                %   \multicolumn{1}{c|}{0.68/0.87/0.76} &
                %   0.67/0.92/0.78 \\ \hline
                % \multicolumn{1}{|c|}{\textbf{\begin{tabular}[c]{@{}c@{}}W/O pre-trained \\      Asynchronous \\      FedR\'enyi~($1/K$)\end{tabular}}} &
                %   \multicolumn{1}{c|}{0.70/0.80/0.75} &
                %   \multicolumn{1}{c|}{0.68/0.76/0.72} &
                %   0.69/0.81/0.74 \\ \hline
                \multicolumn{1}{|c|}{\textbf{\begin{tabular}[c]{@{}c@{}}      Asynchronous \\      FedR\'enyi~($n_k/n$)\end{tabular}}} &
                  \multicolumn{1}{c|}{0.68/0.89/0.77} &
                  \multicolumn{1}{c|}{0.66/0.81/0.73} &
                  0.67/0.76/0.71 \\ \hline
                \multicolumn{1}{|c|}{\textbf{\begin{tabular}[c]{@{}c@{}}      Asynchronous \\      FedR\'enyi~($1/K$)\end{tabular}}} &
                  \multicolumn{1}{c|}{\textbf{0.70/0.80/0.75}} &
                  \multicolumn{1}{c|}{\textbf{0.69/0.86/0.76}} &
                  \textbf{0.68/0.91/0.78} \\ \hline
                \multicolumn{4}{|c|}{\textbf{DRUG}} \\ \hline
                \multicolumn{1}{|c|}{\textbf{FedProx}} &
                  \multicolumn{1}{c|}{0.63/0.84/0.72} &
                  \multicolumn{1}{c|}{0.63/0.84/0.72} &
                  0.62/0.88/0.73 \\ \hline
                % \multicolumn{1}{|c|}{\textbf{\begin{tabular}[c]{@{}c@{}}W/O pre-trained \\      Asynchronous \\      FedR\'enyi~($n_k/n$)\end{tabular}}} &
                %   \multicolumn{1}{c|}{0.66/0.84/0.74} &
                %   \multicolumn{1}{c|}{0.63/0.84/0.72} &
                %   0.62/0.83/0.71 \\ \hline
                % \multicolumn{1}{|c|}{\textbf{\begin{tabular}[c]{@{}c@{}}W/O pre-trained \\      Asynchronous \\      FedR\'enyi~($1/K$)\end{tabular}}} &
                %   \multicolumn{1}{c|}{0.61/0.93/0.73} &
                %   \multicolumn{1}{c|}{0.62/0.83/0.71} &
                %   0.62/0.82/0.71 \\ \hline
                \multicolumn{1}{|c|}{\textbf{\begin{tabular}[c]{@{}c@{}}     Asynchronous \\      FedR\'enyi~($n_k/n$)\end{tabular}}} &
                  \multicolumn{1}{c|}{\textbf{0.66/0.84/0.74}} &
                  \multicolumn{1}{c|}{\textbf{0.65/0.84/0.74}} &
                  \textbf{0.67/0.85/0.75} \\ \hline
                \multicolumn{1}{|c|}{\textbf{\begin{tabular}[c]{@{}c@{}}      Asynchronous \\      FedR\'enyi~($1/K$)\end{tabular}}} &
                  \multicolumn{1}{c|}{0.61/0.93/0.73} &
                  \multicolumn{1}{c|}{0.61/0.89/0.72} &
                  0.64/0.85/0.73 \\ \hline
                \multicolumn{4}{|c|}{\textbf{DUTCH}} \\ \hline
                \multicolumn{1}{|c|}{\textbf{FedProx}} &
                  \multicolumn{1}{c|}{0.81/0.59/0.68} &
                  \multicolumn{1}{c|}{0.81/0.63/0.71} &
                  0.81/0.66/0.73 \\ \hline
                % \multicolumn{1}{|c|}{\textbf{\begin{tabular}[c]{@{}c@{}}W/O pre-trained \\      Asynchronous \\      FedR\'enyi~($n_k/n$)\end{tabular}}} &
                %   \multicolumn{1}{c|}{0.82/0.75/0.78} &
                %   \multicolumn{1}{c|}{0.82/0.72/0.77} &
                %   \textbf{0.82/0.77/0.79} \\ \hline
                % \multicolumn{1}{|c|}{\textbf{\begin{tabular}[c]{@{}c@{}}W/O pre-trained \\      Asynchronous \\      FedR\'enyi~($1/K$)\end{tabular}}} &
                %   \multicolumn{1}{c|}{0.82/0.67/0.74} &
                %   \multicolumn{1}{c|}{0.82/0.75/0.79} &
                %   0.82/0.54/0.65 \\ \hline
                \multicolumn{1}{|c|}{\textbf{\begin{tabular}[c]{@{}c@{}}    Asynchronous \\      FedR\'enyi~($n_k/n$)\end{tabular}}} &
                  \multicolumn{1}{c|}{\textbf{0.82/0.75/0.78}} &
                  \multicolumn{1}{c|}{\textbf{0.82/0.77/0.79}} &
                  0.82/0.75/0.78 \\ \hline
                \multicolumn{1}{|c|}{\textbf{\begin{tabular}[c]{@{}c@{}}     Asynchronous \\      FedR\'enyi~($1/K$)\end{tabular}}} &
                  \multicolumn{1}{c|}{0.82/0.67/0.74} &
                  \multicolumn{1}{c|}{0.82/0.67/0.74} &
                  0.82/0.74/0.78 \\ \hline
                \end{tabular}
            \end{table}
            
            
            In addition, to explore the ability of FedR\'enyi to trade off FR and ACC under asynchronous settings, we set $\alpha$ to 0.3 and 0.5, respectively, and present the results as scatter plots in Figure~\ref{fig:Appendix Asynchronous fairness and accuracy}.
            Only the top-5 results (ranked by HM value) of each algorithm are plotted; some methods show fewer than 5 points because of overlap.
            As shown in Figure~\ref{fig:Appendix Asynchronous fairness and accuracy}, the scatter points of FedR\'enyi approximately form several curves, exhibiting the trade-off ability between ACC and FR in most cases.
            \begin{figure*}[htbp]
                \centering
                \begin{minipage}[]{0.23\textwidth}
                \includegraphics[width=1\textwidth]{new_figure/ijcai2024/scatter_diagram/Dirichlet05/no_attribute_skew/ADULT/LR/alpha_03_asynchronous_scatter.pdf}
                \end{minipage}
                \begin{minipage}[]{0.23\textwidth}
                \includegraphics[width=1\textwidth]{new_figure/ijcai2024/scatter_diagram/Dirichlet05/no_attribute_skew/COMPAS/LR/alpha_03_asynchronous_scatter.pdf}
                \end{minipage}
                \begin{minipage}[]{0.23\textwidth}
                \includegraphics[width=1\textwidth]{new_figure/ijcai2024/scatter_diagram/Dirichlet05/no_attribute_skew/DRUG/LR/alpha_03_asynchronous_scatter.pdf}
                \end{minipage}
                \begin{minipage}[]{0.23\textwidth}
                \includegraphics[width=1\textwidth]{new_figure/ijcai2024/scatter_diagram/Dirichlet05/no_attribute_skew/DUTCH/LR/alpha_03_asynchronous_scatter.pdf}
                \end{minipage}


                \begin{minipage}[]{0.23\textwidth}
                \includegraphics[width=1\textwidth]{new_figure/ijcai2024/scatter_diagram/Dirichlet05/no_attribute_skew/ADULT/LR/alpha_05_asynchronous_scatter.pdf}
                \end{minipage}
                \begin{minipage}[]{0.23\textwidth}
                \includegraphics[width=1\textwidth]{new_figure/ijcai2024/scatter_diagram/Dirichlet05/no_attribute_skew/COMPAS/LR/alpha_05_asynchronous_scatter.pdf}
                \end{minipage}
                \begin{minipage}[]{0.23\textwidth}
                \includegraphics[width=1\textwidth]{new_figure/ijcai2024/scatter_diagram/Dirichlet05/no_attribute_skew/DRUG/LR/alpha_05_asynchronous_scatter.pdf}
                \end{minipage}
                \begin{minipage}[]{0.23\textwidth}
                \includegraphics[width=1\textwidth]{new_figure/ijcai2024/scatter_diagram/Dirichlet05/no_attribute_skew/DUTCH/LR/alpha_05_asynchronous_scatter.pdf}
                \end{minipage}


                \begin{minipage}[]{0.23\textwidth}
                \includegraphics[width=1\textwidth]{new_figure/ijcai2024/scatter_diagram/Uniform/no_attribute_skew/ADULT/LR/alpha_03_asynchronous_scatter.pdf}
                \end{minipage}
                \begin{minipage}[]{0.23\textwidth}
                \includegraphics[width=1\textwidth]{new_figure/ijcai2024/scatter_diagram/Uniform/no_attribute_skew/COMPAS/LR/alpha_03_asynchronous_scatter.pdf}
                \end{minipage}
                \begin{minipage}[]{0.23\textwidth}
                \includegraphics[width=1\textwidth]{new_figure/ijcai2024/scatter_diagram/Uniform/no_attribute_skew/DRUG/LR/alpha_03_asynchronous_scatter.pdf}
                \end{minipage}
                \begin{minipage}[]{0.23\textwidth}
                \includegraphics[width=1\textwidth]{new_figure/ijcai2024/scatter_diagram/Uniform/no_attribute_skew/DUTCH/LR/alpha_03_asynchronous_scatter.pdf}
                \end{minipage}


                \begin{minipage}[]{0.23\textwidth}
                \includegraphics[width=1\textwidth]{new_figure/ijcai2024/scatter_diagram/Uniform/no_attribute_skew/ADULT/LR/alpha_05_asynchronous_scatter.pdf}
                \end{minipage}
                \begin{minipage}[]{0.23\textwidth}
                \includegraphics[width=1\textwidth]{new_figure/ijcai2024/scatter_diagram/Uniform/no_attribute_skew/COMPAS/LR/alpha_05_asynchronous_scatter.pdf}
                \end{minipage}
                \begin{minipage}[]{0.23\textwidth}
                \includegraphics[width=1\textwidth]{new_figure/ijcai2024/scatter_diagram/Uniform/no_attribute_skew/DRUG/LR/alpha_05_asynchronous_scatter.pdf}
                \end{minipage}
                \begin{minipage}[]{0.23\textwidth}
                \includegraphics[width=1\textwidth]{new_figure/ijcai2024/scatter_diagram/Uniform/no_attribute_skew/DUTCH/LR/alpha_05_asynchronous_scatter.pdf}
                \end{minipage}
                % \vspace{-2mm}
                \caption{
                    The ACC and FR trade-off on four datasets with two data distribution settings ($\mathit{Dir}=0.5$, $\mathit{Dir}=+\infty$) and two straggler proportion settings ($\alpha=0.3$, $\alpha=0.5$).
                    Some methods show fewer than 5 points because of overlap.
                }
                \label{fig:Appendix Asynchronous fairness and accuracy}
                % \vspace{-2.0mm}
            \end{figure*}
            
        \newpage{}
        
        \paragraph{Robustness of client dropping.}
            \label{robustness: client dropping}
            
            In practice, a fraction of offline clients may drop out randomly during the communication stage (upload and download) in the training process. 
            This proportion of dropped clients is known as the drop rate.
            In this paper, the proportion of stragglers $\alpha$ is chosen from $\{0, 0.3, 0.5\}$, where $0$ means that no client is dropped.
            % Meanwhile, to reduce the impact from other experiment settings, chasing a fair comparison, we fix the epoch $T$ and communication frequency $I$ as 100 and 10 respectively, providing sufficient training steps and communication opportunities allows each algorithm to achieve the best performance.
            The effect of the proportion of stragglers is evaluated on four datasets, as shown in Table~\ref{table:robustness:client_dropping}.
            % Except the complicated distribution of COMPAS, 
            With different dropping rates, the ACC/FR/HM of FedR\'enyi under both configurations stay stable with tolerable fluctuation ($\pm 0.05$) in most cases,
            which demonstrates the robustness of FedR\'enyi to different dropping rates and its suitability for practical FL implementation.

            \begin{table*}[!htbp]
              \centering
              \caption{The effect of different proportions of federated stragglers on four datasets with different heterogeneous settings.}
              \label{table:robustness:client_dropping}
              \resizebox{1\textwidth}{!}{
                \begin{tabular}{|cllllll|}
                  \hline
                  \multicolumn{1}{|c|}{\textbf{T:100   M:10}} &
                    \multicolumn{2}{c|}{\textbf{ACC}} &
                    \multicolumn{2}{c|}{\textbf{FR}} &
                    \multicolumn{2}{c|}{\textbf{HM}} \\ \hline
                  \multicolumn{1}{|c|}{\textbf{Drop   $\alpha$}} &
                    \multicolumn{1}{c|}{\textbf{0\%/30\%/50\%}} &
                    \multicolumn{1}{c|}{\textbf{0\%/30\%/50\%}} &
                    \multicolumn{1}{c|}{\textbf{0\%/30\%/50\%}} &
                    \multicolumn{1}{c|}{\textbf{0\%/30\%/50\%}} &
                    \multicolumn{1}{c|}{\textbf{0\%/30\%/50\%}} &
                    \multicolumn{1}{c|}{\textbf{0\%/30\%/50\%}} \\ \hline
                  \multicolumn{1}{|c|}{{\textbf{Method}}} &
                    \multicolumn{1}{c|}{\textbf{{Dir = 0.5}}} &
                    \multicolumn{1}{c|}{\textbf{{Dir = +$\infty$}}} &
                    \multicolumn{1}{c|}{\textbf{{Dir = 0.5}}} &
                    \multicolumn{1}{c|}{\textbf{{Dir = +$\infty$}}} &
                    \multicolumn{1}{c|}{\textbf{{Dir = 0.5}}} &
                    \multicolumn{1}{c|}{\textbf{{Dir = +$\infty$}}} \\ \hline
                  \multicolumn{7}{|c|}{\textit{\textbf{ADULT}}} \\ \hline
                  \multicolumn{1}{|c|}{\textbf{FedAvg}} &
                    \multicolumn{1}{l|}{0.58/0.41/\textbf{0.62}} &
                    \multicolumn{1}{l|}{0.6/\textbf{0.55}/0.46} &
                    \multicolumn{1}{l|}{0.92/0.72/\textbf{0.97}} &
                    \multicolumn{1}{l|}{0.93/\textbf{0.87}/\textbf{0.88}} &
                    \multicolumn{1}{l|}{0.71/0.53/\textbf{0.76}} &
                    0.73/\textbf{0.68}/0.61 \\ \hline
                  \multicolumn{1}{|c|}{\textbf{FedR\'enyi   ($1/K$)}} &
                    \multicolumn{1}{l|}{\textbf{0.62}/\textbf{0.51}/0.5} &
                    \multicolumn{1}{l|}{\textbf{0.61}/0.52/0.51} &
                    \multicolumn{1}{l|}{\textbf{0.94}/\textbf{0.86}/0.91} &
                    \multicolumn{1}{l|}{\textbf{0.94}/0.84/0.85} &
                    \multicolumn{1}{l|}{\textbf{0.75}/\textbf{0.63}/0.64} &
                    \textbf{0.74}/0.62/0.63 \\ \hline
                  \multicolumn{1}{|c|}{\textbf{FedR\'enyi   ($n_k/n$)}} &
                    \multicolumn{1}{l|}{0.61/0.47/0.47} &
                    \multicolumn{1}{l|}{0.6/0.54/\textbf{0.54}} &
                    \multicolumn{1}{l|}{0.93/0.8/0.9} &
                    \multicolumn{1}{l|}{0.93/0.84/0.85} &
                    \multicolumn{1}{l|}{0.74/0.59/0.61} &
                    0.73/0.65/\textbf{0.66} \\ \hline
                  % \multicolumn{7}{|c|}{\textit{\textbf{ARRHYTHMIA}}} \\ \hline
                  % \multicolumn{1}{|c|}{\textbf{FedAvg}} &
                  %   \multicolumn{1}{l|}{0.52/0.44/0.61} &
                  %   \multicolumn{1}{l|}{0.56/\textbf{0.63}/0.49} &
                  %   \multicolumn{1}{l|}{0.64/0.82/\textbf{0.97}} &
                  %   \multicolumn{1}{l|}{0.82/\textbf{0.93}/0.74} &
                  %   \multicolumn{1}{l|}{0.57/0.57/0.75} &
                  %   0.67/\textbf{0.75}/0.59 \\ \hline
                  % \multicolumn{1}{|c|}{\textbf{FedR\'enyi   ($1/K$)}} &
                  %   \multicolumn{1}{l|}{\textbf{0.68}/\textbf{0.66}/0.66} &
                  %   \multicolumn{1}{l|}{0.66/0.61/0.56} &
                  %   \multicolumn{1}{l|}{\textbf{0.92}/\textbf{0.91}/0.9} &
                  %   \multicolumn{1}{l|}{\textbf{0.94}/0.92/\textbf{0.9}} &
                  %   \multicolumn{1}{l|}{\textbf{0.78}/\textbf{0.77}/\textbf{0.76}} &
                  %   \textbf{0.78}/0.73/0.68 \\ \hline
                  % \multicolumn{1}{|c|}{\textbf{FedR\'enyi   ($n_k/n$)}} &
                  %   \multicolumn{1}{l|}{\textbf{0.68}/\textbf{0.66}/\textbf{0.67}} &
                  %   \multicolumn{1}{l|}{\textbf{0.67}/0.62/\textbf{0.63}} &
                  %   \multicolumn{1}{l|}{0.91/0.86/0.83} &
                  %   \multicolumn{1}{l|}{0.91/0.84/0.86} &
                  %   \multicolumn{1}{l|}{\textbf{0.78}/0.75/0.74} &
                  %   0.77/0.7/\textbf{0.73} \\ \hline
                  \multicolumn{7}{|c|}{\textit{\textbf{COMPAS}}} \\ \hline
                  \multicolumn{1}{|c|}{\textbf{FedAvg}} &
                    \multicolumn{1}{l|}{0.66/\textbf{0.67}/0.66} &
                    \multicolumn{1}{l|}{\textbf{0.67}/0.66/\textbf{0.66}} &
                    \multicolumn{1}{l|}{0.8/0.75/\textbf{0.82}} &
                    \multicolumn{1}{l|}{0.76/\textbf{0.79}/0.76} &
                    \multicolumn{1}{l|}{0.72/0.71/\textbf{0.73}} &
                    0.71/\textbf{0.72}/0.7 \\ \hline
                  \multicolumn{1}{|c|}{\textbf{FedR\'enyi   ($1/K$)}} &
                    \multicolumn{1}{l|}{\textbf{0.67}/0.66/\textbf{0.67}} &
                    \multicolumn{1}{l|}{\textbf{0.67}/\textbf{0.67}/\textbf{0.66}} &
                    \multicolumn{1}{l|}{0.82/0.78/0.76} &
                    \multicolumn{1}{l|}{0.83/0.77/\textbf{0.78}} &
                    \multicolumn{1}{l|}{\textbf{0.74}/0.71/0.71} &
                    0.74/\textbf{0.72}/\textbf{0.71} \\ \hline
                  \multicolumn{1}{|c|}{\textbf{FedR\'enyi   ($n_k/n$)}} &
                    \multicolumn{1}{l|}{\textbf{0.67}/0.66/\textbf{0.67}} &
                    \multicolumn{1}{l|}{\textbf{0.67}/\textbf{0.67}/\textbf{0.66}} &
                    \multicolumn{1}{l|}{\textbf{0.83}/\textbf{0.79}/0.76} &
                    \multicolumn{1}{l|}{\textbf{0.84}/0.77/0.77} &
                    \multicolumn{1}{l|}{\textbf{0.74}/\textbf{0.72}/0.71} &
                    \textbf{0.75}/0.71/\textbf{0.71} \\ \hline
                  \multicolumn{7}{|c|}{\textit{\textbf{DRUG}}} \\ \hline
                  \multicolumn{1}{|c|}{\textbf{FedAvg}} &
                    \multicolumn{1}{l|}{0.64/\textbf{0.68}/\textbf{0.69}} &
                    \multicolumn{1}{l|}{\textbf{0.66}/0.62/0.63} &
                    \multicolumn{1}{l|}{0.77/0.89/\textbf{0.96}} &
                    \multicolumn{1}{l|}{0.71/0.65/0.63} &
                    \multicolumn{1}{l|}{0.7/\textbf{0.77}/\textbf{0.8}} &
                    \textbf{0.68}/0.63/0.63 \\ \hline
                  \multicolumn{1}{|c|}{\textbf{FedR\'enyi   ($1/K$)}} &
                    \multicolumn{1}{l|}{\textbf{0.68}/0.67/0.66} &
                    \multicolumn{1}{l|}{\textbf{0.68}/\textbf{0.66}/0.66} &
                    \multicolumn{1}{l|}{\textbf{0.94}/0.9/0.9} &
                    \multicolumn{1}{l|}{0.92/0.88/0.9} &
                    \multicolumn{1}{l|}{\textbf{0.79}/0.76/0.77} &
                    0.78/\textbf{0.76}/0.76 \\ \hline
                  \multicolumn{1}{|c|}{\textbf{FedR\'enyi   ($n_k/n$)}} &
                    \multicolumn{1}{l|}{\textbf{0.68}/0.66/0.67} &
                    \multicolumn{1}{l|}{\textbf{0.68}/\textbf{0.66}/\textbf{0.67}} &
                    \multicolumn{1}{l|}{\textbf{0.94}/0.89/0.9} &
                    \multicolumn{1}{l|}{\textbf{0.93}/\textbf{0.89}/\textbf{0.92}} &
                    \multicolumn{1}{l|}{\textbf{0.79}/0.76/0.77} &
                    \textbf{0.79}/\textbf{0.76}/\textbf{0.77} \\ \hline
                  \multicolumn{7}{|c|}{\textit{\textbf{DUTCH}}} \\ \hline
                  \multicolumn{1}{|c|}{\textbf{FedAvg}} &
                    \multicolumn{1}{l|}{0.81/0.8/0.81} &
                    \multicolumn{1}{l|}{0.8/0.8/0.8} &
                    \multicolumn{1}{l|}{0.62/0.58/0.62} &
                    \multicolumn{1}{l|}{0.62/0.62/0.63} &
                    \multicolumn{1}{l|}{0.7/0.67/0.7} &
                    0.7/0.7/0.7 \\ \hline
                  \multicolumn{1}{|c|}{\textbf{FedR\'enyi   ($1/K$)}} &
                    \multicolumn{1}{l|}{\textbf{0.83}/\textbf{0.83}/\textbf{0.83}} &
                    \multicolumn{1}{l|}{\textbf{0.83}/\textbf{0.83}/\textbf{0.83}} &
                    \multicolumn{1}{l|}{0.84/0.77/\textbf{0.79}} &
                    \multicolumn{1}{l|}{\textbf{0.84}/\textbf{0.77}/\textbf{0.75}} &
                    \multicolumn{1}{l|}{\textbf{0.84}/0.8/\textbf{0.81}} &
                    \textbf{0.84}/\textbf{0.8}/\textbf{0.79} \\ \hline
                  \multicolumn{1}{|c|}{\textbf{FedR\'enyi   ($n_k/n$)}} &
                    \multicolumn{1}{l|}{\textbf{0.83}/\textbf{0.83}/\textbf{0.83}} &
                    \multicolumn{1}{l|}{\textbf{0.83}/\textbf{0.83}/\textbf{0.83}} &
                    \multicolumn{1}{l|}{\textbf{0.86}/\textbf{0.82}/0.76} &
                    \multicolumn{1}{l|}{\textbf{0.84}/0.76/\textbf{0.75}} &
                    \multicolumn{1}{l|}{\textbf{0.84}/\textbf{0.82}/0.79} &
                    \textbf{0.84}/0.79/0.78 \\ \hline
                \end{tabular}
              }
            \end{table*}
\clearpage
\newpage
    \subsection{Scalability Experiment}
    \label{subsection:appendix:scalability_summary}
        To evaluate the scalability of our method to large-scale datasets, we conduct image classification on the CelebA dataset~\cite{CelebA}, following the setting in FairGrad~\cite{FairGrad}.
        CelebA contains $202599$ samples, with $162770$ for training, $19867$ for validation, and $19962$ for testing.
        Each image sample in CelebA has $40$ binary attribute labels, and we focus on $2$ of these $40$ attributes for binary classification.
        We take the 21st attribute (gender) as the sensitive attribute and the third attribute (attractive) as the classification label for each image.
        The training data of CelebA is partitioned across 20 clients.
        We compare the performance of our method and FedAvg with $20\%$ of the clients active, as shown in Table~\ref{table:Appendix Scalability}.

        \begin{table}[!htbp]
                \centering
                \caption{
                Performance of the methods under the heterogeneous setting on CelebA.
                Accuracy, fairness, and harmonic mean are denoted by ACC, FR, and HM, respectively.
                }
                \label{table:Appendix Scalability}
                \begin{tabular}{|cccc|}
                \hline
                \multicolumn{4}{|c|}{\textbf{CelebA}} \\ \hline
                \multicolumn{1}{|c|}{\textbf{Dir=0.5,~$\lambda$=1}} &
                  \multicolumn{1}{c|}{\textbf{Drop $\alpha$:   0\%}} &
                  \multicolumn{1}{c|}{\textbf{Batch Size=256}} &
                  \textbf{9-layers CNN} \\ \hline
                \multicolumn{1}{|c|}{\textbf{(T, M) = (100, 10)}} &
                  \multicolumn{1}{c|}{\textbf{ACC}} &
                  \multicolumn{1}{c|}{\textbf{FR}} &
                  \textbf{HM} \\ \hline

                \multicolumn{1}{|c|}{\textbf{FedAvg}} &
                  \multicolumn{1}{c|}{0.702±0.005} &
                  \multicolumn{1}{c|}{1.0±0.0} &
                  0.825±0.003 \\ \hline  
                  		

                \multicolumn{1}{|c|}{\textbf{\begin{tabular}[c]{@{}c@{}}
                FedR\'enyi~($n_k/n$)\end{tabular}}} &
                  \multicolumn{1}{c|}{0.715±0.001} &
                  \multicolumn{1}{c|}{1.0±0.0} &
                  0.834±0.001 \\ \hline
                \multicolumn{1}{|c|}{\textbf{\begin{tabular}[c]{@{}c@{}}
                FedR\'enyi~($1/K$)\end{tabular}}} &
                  \multicolumn{1}{c|}{0.716±0.001} &
                  \multicolumn{1}{c|}{1.0±0.0} &
                  0.834±0.001 \\ \hline
                
                
                \end{tabular}
            \end{table}


        
    \subsection{Result Summary}
    \label{subsection:appendix:result_summary}
        To evaluate the performance of each algorithm on four different datasets, we construct several sets of experiments with different levels of heterogeneity.
        To avoid unfair comparisons caused by random factors and hyperparameter settings, we report the mean and standard deviation of the top-20 runs (ranked by HM value) for each algorithm.
        
        
        As shown in the experimental results (see Tables~\ref{Table: ADULT full result summary}--\ref{Table: DUTCH full result summary}), FedR\'enyi outperforms the baseline methods and achieves satisfactory ACC, FR, and HM in most cases under different heterogeneity scenarios.
        These results support the effectiveness of our proposed method.

        \begin{table}[htb]
            \centering
            \caption{Performance of the methods under heterogeneous settings on ADULT.
            Accuracy, fairness, and harmonic mean are denoted by ACC, FR, and HM, respectively. A smaller Dir indicates a more heterogeneous distribution across clients.
            Dir=+$\infty$ represents the uniform data distribution setting.
            }
            \label{Table: ADULT full result summary}
            \begin{tabular}{|c|c|c|c|c|c|}
            \hline
            \textbf{ADULT} &
              \textbf{Method} &
              \textbf{Dir=0.5} &
              \textbf{Dir=1} &
              \textbf{Dir=8} &
              \textbf{Dir=+$\infty$} \\ \hline
             &
              \textbf{Local} &
              0.56±0.12 &
              0.67±0.06 &
              \textbf{0.67±0.06} &
              0.63±0.12 \\ \cline{2-6} 
             &
              \textbf{FedAvg} &
              0.62±0.12 &
              0.59±0.12 &
              0.59±0.12 &
              0.60±0.10 \\ \cline{2-6}
             &
              \textbf{FedProx} &
              0.61±0.12 &
              0.63±0.07 &
              0.63±0.07 &
              0.58±0.12 \\ \cline{2-6} 
             &
              \textbf{Scaffold} &
              0.56±0.20 &
              0.62±0.06 &
              0.56±0.20 &
              0.56±0.14 \\ \cline{2-6} 
             &
              \textbf{FedFair} &
              0.51±0.07 &
              0.58±0.02 &
              0.52±0.08 &
              0.50±0.13 \\ \cline{2-6}
             &
              \textbf{LCO} &
              0.52±0.01 &
              0.59±0.04 &
              0.52±0.07 &
              0.50±0.13 \\ \cline{2-6}
             &
              \textbf{FL-FairBatch} &
              0.64±0.00 &
              0.64±0.00 &
              0.64±0.00 &
              0.64±0.00 \\ \cline{2-6} 
             &
              \textbf{FedFB} &
              0.65±0.00 &
              0.65±0.01 &
              0.64±0.00 &
              0.64±0.00 \\ \cline{2-6} 
             &
              \textbf{FairFed} &
              0.62±0.17 &
              0.64±0.21 &
              0.64±0.21 &
              0.63±0.17 \\ \cline{2-6} 
             
             &
              \textbf{FedR\'enyi ($1/K$)} &
              \textbf{0.67±0.03} &
              \textbf{0.69±0.04} &
              0.67±0.03 &
              \textbf{0.68±0.03} \\ \cline{2-6} 
             
            \multirow{-11}{*}{\textbf{ACC}} & \textbf{FedR\'enyi ($n_k/n$)} & 0.65±0.04 & 0.65±0.03 & 0.65±0.04 & \textbf{0.68±0.03} \\ \hline
             &
              \textbf{Local} &
              0.87±0.07 &
              0.93±0.02 &
              0.93±0.02 &
              \textbf{0.97±0.01} \\ \cline{2-6} 
             &
              \textbf{FedAvg} &
              0.87±0.10 &
              0.89±0.12 &
              0.89±0.12 &
              0.91±0.07 \\ \cline{2-6} 
             &
              \textbf{FedProx} &
              0.88±0.11 &
              0.84±0.06 &
              0.84±0.06 &
              0.76±0.14 \\ \cline{2-6} 
             &
              \textbf{Scaffold} &
              0.88±0.13 &
              0.85±0.05 &
              0.84±0.06 &
              0.79±0.11 \\ \cline{2-6} 
             &
              \textbf{FedFair} &
              0.84±0.17 &
              0.86±0.08 &
              0.85±0.11 &
              0.85±0.16 \\ \cline{2-6} 
             &
              \textbf{LCO} &
              0.86±0.07 &
              0.89±0.11 &
              0.87±0.04 &
              0.84±0.11 \\ \cline{2-6} 
             &
              \textbf{FL-FairBatch} &
              0.91±0.02 &
              0.93±0.02 &
              0.93±0.02 &
              0.92±0.02 \\ \cline{2-6} 
             &
              \textbf{FedFB} &
              0.92±0.03 &
              0.93±0.02 &
              0.93±0.02 &
              0.92±0.02 \\ \cline{2-6} 
             &
              \textbf{FairFed} &
              0.77±0.16 &
              \textbf{0.95±0.07} &
              \textbf{0.95±0.07} &
              0.92±0.08 \\ \cline{2-6} 
             
             &
              \textbf{FedR\'enyi ($1/K$)} &
              \textbf{0.94±0.04} &
              0.94±0.05 &
              0.95±0.05 &
              0.92±0.04 \\ \cline{2-6} 
             
            \multirow{-11}{*}{\textbf{FR}} &
              \textbf{FedR\'enyi ($n_k/n$)} &
              \textbf{0.94±0.04} &
              0.95±0.03 &
              0.92±0.05 &
              0.93±0.05 \\ \hline
             &
              \textbf{Local} &
              0.68±0.09 &
              0.78±0.03 &
              0.78±0.03 &
              0.76±0.02 \\ \cline{2-6} 
             &
              \textbf{FedAvg} &
              0.72±0.11 &
              0.71±0.12 &
              0.71±0.12 &
              0.72±0.08 \\ \cline{2-6} 
             &
              \textbf{FedProx} &
              0.72±0.11 &
              0.72±0.06 &
              0.72±0.06 &
              0.66±0.13 \\ \cline{2-6} 
             &
              \textbf{Scaffold} &
              0.68±0.16 &
              0.72±0.05 &
              0.67±0.09 &
              0.66±0.12 \\ \cline{2-6} 
             &
              \textbf{FedFair} &
              0.63±0.10 &
              0.69±0.03 &
              0.65±0.09 &
              0.63±0.14 \\ \cline{2-6} 
             &
              \textbf{LCO} &
              0.65±0.02 &
              0.71±0.06 &
              0.65±0.05 &
              0.63±0.12 \\ \cline{2-6} 
             &
              \textbf{FL-FairBatch} &
              0.75±0.00 &
              0.76±0.00 &
              0.76±0.00 &
              0.75±0.00 \\ \cline{2-6} 
             &
              \textbf{FedFB} &
              0.76±0.00 &
              0.77±0.01 &
              0.76±0.00 &
              0.75±0.00 \\ \cline{2-6} 
             &
              \textbf{FairFed} &
              0.69±0.16 &
              0.76±0.11 &
              0.76±0.11 &
              0.75±0.11 \\ \cline{2-6} 
             
             &
              \textbf{FedR\'enyi ($1/K$)} &
              \textbf{0.78±0.03} &
              \textbf{0.80±0.04} &
              \textbf{0.79±0.04} &
              0.78±0.03 \\ \cline{2-6} 
             
            \multirow{-11}{*}{\textbf{HM}} &
              \textbf{FedR\'enyi ($n_k/n$)} &
              0.77±0.04 &
              0.77±0.03 &
              0.76±0.04 &
              \textbf{0.79±0.04} \\ \hline
        \end{tabular}
        \end{table}

        \begin{table}[!htbp]
            \centering
            \caption{Performance of the methods under heterogeneous settings on COMPAS.
            Accuracy, fairness, and harmonic mean are denoted by ACC, FR, and HM, respectively. A smaller Dir indicates a more heterogeneous distribution across clients.
            Dir=+$\infty$ represents the uniform data distribution setting.
            }
            \label{Table: COMPAS full result summary}
                \begin{tabular}{|c|c|c|c|c|c|}
                    \hline
                    \textbf{COMPAS}          & \textbf{Method}             & \textbf{Dir=0.5}                           & \textbf{Dir=1}     & \textbf{Dir=8}     & \textbf{Dir=+$\infty$} \\ \hline
                     & \textbf{Local}              & 0.62±0.01          & 0.64±0.01          & 0.65±0.01          & 0.65±0.01              \\ \cline{2-6} 
                     & \textbf{FedAvg}             & 0.66±0.01          & \textbf{0.67±0.01} & 0.66±0.01          & 0.66±0.01              \\ \cline{2-6} 
                     &
                      \textbf{FedProx} &
                      0.66±0.01 &
                      \textbf{0.67±0.01} &
                      \textbf{0.67±0.01} &
                      \textbf{0.67±0.00} \\ \cline{2-6} 
                     & \textbf{Scaffold}           & 0.47±0.12          & 0.48±0.13          & 0.45±0.14          & 0.50±0.13              \\ \cline{2-6} 
                     & \textbf{FedFair}            & 0.62±0.03          & 0.57±0.04          & 0.62±0.03          & 0.59±0.06              \\ \cline{2-6} 
                     & \textbf{LCO}                & 0.59±0.03          & 0.58±0.06          & 0.56±0.07          & 0.56±0.06              \\ \cline{2-6} 
                     & \textbf{FL-FairBatch}       & 0.67±0.01
                     & 0.67±0.00          & 0.67±0.00          & 0.66±0.00              \\ \cline{2-6} 
                     & \textbf{FedFB}       & 0.67±0.01 & \textbf{0.67±0.01} & 0.67±0.00          & 0.66±0.01              \\ \cline{2-6}
                     & \textbf{FairFed}            & 0.62±0.03          & 0.57±0.04          & 0.62±0.03          & 0.59±0.06              \\ \cline{2-6} 
                     
                     & \textbf{FedR\'enyi ($1/K$)} & \textbf{0.68±0.01}                                  & 0.66±0.02          & 0.66±0.01          & 0.66±0.01              \\ \cline{2-6} 
                     
                    \multirow{-11}{*}{\textbf{ACC}} &
                      \textbf{FedR\'enyi ($n_k/n$)} &
                      \textbf{0.68±0.01} &
                      0.65±0.01 &
                      0.66±0.01 &
                      0.66±0.01 \\ \hline
                     & \textbf{Local}              & \textbf{0.81±0.01} & 0.80±0.04          & 0.79±0.03          & 0.80±0.00              \\ \cline{2-6} 
                     & \textbf{FedAvg}             & 0.79±0.03          & 0.77±0.03          & 0.78±0.02          & 0.77±0.02              \\ \cline{2-6} 
                     & \textbf{FedProx}            & 0.79±0.03          & 0.79±0.03          & 0.78±0.03          & 0.77±0.02              \\ \cline{2-6} 
                     & \textbf{Scaffold}           & 0.82±0.10          & 0.74±0.10          & 0.71±0.05          & 0.81±0.06              \\ \cline{2-6} 
                     & \textbf{FedFair}            & 0.79±0.10          & 0.91±0.05          & 0.70±0.11          & 0.79±0.07              \\ \cline{2-6} 
                     & \textbf{LCO}                & \textbf{0.85±0.09}          & 0.83±0.06          & \textbf{0.87±0.04} & \textbf{0.90±0.05}     \\ \cline{2-6} 
                     & \textbf{FL-FairBatch}       & 0.78±0.02          & 0.79±0.01          & 0.79±0.01          & 0.78±0.01              \\ \cline{2-6} 
                     & \textbf{FedFB}              & 0.75±0.03          & 0.74±0.01          & 0.76±0.01          & 0.74±0.01              \\ \cline{2-6} 
                     & \textbf{FairFed}            & 0.79±0.10          & \textbf{0.91±0.05} & 0.70±0.11          & 0.79±0.07              \\ \cline{2-6} 
                     
                     & \textbf{FedR\'enyi ($1/K$)} & 0.81±0.02                                  & 0.81±0.03          & 0.77±0.05          & 0.82±0.02              \\ \cline{2-6} 
                     
                    \multirow{-11}{*}{\textbf{FR}} &
                      \textbf{FedR\'enyi ($n_k/n$)} &
                      0.82±0.01 &
                      0.81±0.01 &
                      0.80±0.06 &
                      0.81±0.02 \\ \hline
                     & \textbf{Local}              & 0.70±0.01           & 0.71±0.02          & 0.71±0.01          & 0.72±0.00                 \\ \cline{2-6} 
                     & \textbf{FedAvg}             & 0.72±0.01 & 0.72±0.01          & 0.72±0.01          & 0.71±0.01              \\ \cline{2-6} 
                     & \textbf{FedProx}            & 0.72±0.01 & 0.73±0.01          & 0.72±0.01 & 0.72±0.00                 \\ \cline{2-6} 
                     & \textbf{Scaffold}           & 0.60±0.11           & 0.58±0.11          & 0.55±0.07          & 0.62±0.08              \\ \cline{2-6} 
                     & \textbf{FedFair}            & 0.69±0.05          & 0.70±0.04           & 0.66±0.05          & 0.68±0.06              \\ \cline{2-6} 
                     & \textbf{LCO}                & 0.70±0.04           & 0.68±0.06          & 0.68±0.05          & 0.69±0.05              \\ \cline{2-6} 
                     & \textbf{FL-FairBatch}       & 0.72±0.01 & 0.73±0.00          & 0.73±0.00          & 0.72±0.00              \\ \cline{2-6} 
                     & \textbf{FedFB}              & 0.71±0.01          & 0.70±0.01           & 0.71±0.00          & 0.70±0.01              \\ \cline{2-6}
                     & \textbf{FairFed}            & 0.69±0.05          & 0.70±0.04           & 0.66±0.05          & 0.68±0.06              \\ \cline{2-6} 
                     
                     & \textbf{FedR\'enyi ($1/K$)} & 0.72±0.03                                 & \textbf{0.73±0.02} & 0.71±0.02          & \textbf{0.73±0.01}     \\ \cline{2-6} 
                     
                    \multirow{-11}{*}{\textbf{HM}} &
                      \textbf{FedR\'enyi ($n_k/n$)} &
                      \textbf{0.73±0.02} &
                      0.71±0.01 &
                      \textbf{0.72±0.02} &
                      \textbf{0.73±0.01} \\ \hline
                  \end{tabular}
        \end{table}

        \begin{table}[!htbp]
            \centering
            \caption{
            Performance of the methods under heterogeneous settings on DRUG.
            Accuracy, fairness, and harmonic mean are denoted by ACC, FR, and HM, respectively. A smaller Dir indicates a more heterogeneous distribution across clients.
            Dir=+$\infty$ represents the uniform data distribution setting.
            }
          \label{Table: DRUG full result summary}
            \begin{tabular}{|c|c|c|c|c|c|}
                \hline
                \textbf{DRUG}            & \textbf{Method}             & \textbf{Dir=0.5}                  & \textbf{Dir=1} & \textbf{Dir=8}     & \textbf{Dir=+$\infty$} \\ \hline
                 & \textbf{Local}              & 0.65±0.01 & 0.66±0.01      & 0.66±0.02          & 0.67±0.01              \\ \cline{2-6} 
                 & \textbf{FedAvg}             & 0.67±0.02 & 0.67±0.02      & 0.67±0.02          & 0.64±0.02              \\ \cline{2-6} 
                 & \textbf{FedProx}            & 0.67±0.01 & 0.65±0.02      & 0.67±0.01          & 0.66±0.01              \\ \cline{2-6} 
                 & \textbf{Scaffold}           & 0.66±0.01 & 0.55±0.13      & 0.62±0.04          & 0.54±0.13              \\ \cline{2-6} 
                 & \textbf{FedFair}            & 0.67±0.02 & 0.67±0.02      & 0.67±0.02          & 0.64±0.02              \\ \cline{2-6} 
                 & \textbf{LCO}                & 0.49±0.05 & 0.60±0.06       & 0.49±0.11          & 0.65±0.01              \\ \cline{2-6} 
                 & \textbf{FL-FairBatch}       & 0.66±0.00 & 0.66±0.00      & 0.66±0.00          & 0.66±0.00              \\ \cline{2-6} 
                 & \textbf{FedFB}              & 0.66±0.00 & 0.66±0.00      & 0.66±0.00          & 0.66±0.00              \\ \cline{2-6} 
                 & \textbf{FairFed}            & 0.50±0.08 & 0.63±0.04      & 0.56±0.12          & 0.62±0.04              \\ \cline{2-6} 
                 
                 & \textbf{FedR\'enyi ($1/K$)} & 0.68±0.01                         & 0.68±0.01      & \textbf{0.69±0.01} & 0.68±0.01              \\ \cline{2-6} 
                 
                \multirow{-11}{*}{\textbf{ACC}} &
                  \textbf{FedR\'enyi ($n_k/n$)} &
                  \textbf{0.69±0.01} &
                  \textbf{0.69±0.01} &
                  \textbf{0.69±0.01} &
                  \textbf{0.69±0.01} \\ \hline
                 & \textbf{Local}              & 0.88±0.03 & 0.89±0.03      & 0.87±0.05          & 0.89±0.01              \\ \cline{2-6} 
                 & \textbf{FedAvg}             & 0.86±0.02 & 0.85±0.01      & 0.87±0.01          & 0.85±0.03              \\ \cline{2-6} 
                 & \textbf{FedProx}            & 0.86±0.02 & 0.85±0.01      & 0.87±0.01          & 0.85±0.03              \\ \cline{2-6} 
                 & \textbf{Scaffold}           & 0.82±0.06 & 0.84±0.01      & 0.87±0.06          & 0.83±0.05              \\ \cline{2-6} 
                 & \textbf{FedFair}            & 0.86±0.02 & 0.85±0.01      & 0.87±0.01          & 0.85±0.03              \\ \cline{2-6} 
                 & \textbf{LCO}                & 0.93±0.03 & 0.85±0.07      & 0.83±0.11          & 0.86±0.04              \\ \cline{2-6} 
                 & \textbf{FL-FairBatch}       & 0.84±0.00 & 0.84±0.00      & 0.84±0.01          & 0.84±0.00              \\ \cline{2-6} 
                 & \textbf{FedFB}              & 0.85±0.00 & 0.84±0.00      & 0.84±0.00          & 0.84±0.00              \\ \cline{2-6} 
                 & \textbf{FairFed}            & 0.77±0.10 & 0.84±0.11      & 0.81±0.06           & 0.90±0.06              \\ \cline{2-6} 
                 
                 & \textbf{FedR\'enyi ($1/K$)} & \textbf{0.96±0.03}                & 0.95±0.03      & \textbf{0.96±0.02} & \textbf{0.96±0.02}     \\ \cline{2-6} 
                 
                \multirow{-11}{*}{\textbf{FR}} &
                  \textbf{FedR\'enyi ($n_k/n$)} &
                  0.96±0.02 &
                  \textbf{0.96±0.02} &
                  \textbf{0.96±0.02} &
                  \textbf{0.96±0.02} \\ \hline
                 & \textbf{Local}              & 0.75±0.01 & 0.76±0.01      & 0.75±0.03          & 0.76±0.01              \\ \cline{2-6} 
                 & \textbf{FedAvg}             & 0.75±0.02 & 0.75±0.01      & 0.76±0.01          & 0.73±0.02              \\ \cline{2-6} 
                 & \textbf{FedProx}            & 0.75±0.01 & 0.74±0.01      & 0.76±0.01          & 0.74±0.01              \\ \cline{2-6} 
                 & \textbf{Scaffold}           & 0.73±0.02 & 0.66±0.02      & 0.72±0.05          & 0.65±0.07              \\ \cline{2-6} 
                 & \textbf{FedFair}            & 0.75±0.02 & 0.75±0.01      & 0.76±0.01          & 0.73±0.02              \\ \cline{2-6} 
                 & \textbf{LCO}                & 0.64±0.04 & 0.70±0.06      & 0.62±0.11          & 0.74±0.02              \\ \cline{2-6} 
                 & \textbf{FL-FairBatch}       & 0.74±0.00 & 0.74±0.00      & 0.74±0.00          & 0.74±0.00              \\ \cline{2-6} 
                 & \textbf{FedFB}              & 0.74±0.00 & 0.74±0.00      & 0.74±0.00          & 0.74±0.00              \\ \cline{2-6} 
                 & \textbf{FairFed}            & 0.61±0.09 & 0.72±0.06      & 0.66±0.08          & 0.73±0.05              \\ \cline{2-6} 
                 
                 & \textbf{FedR\'enyi ($1/K$)} & \textbf{0.80±0.01}                 & 0.79±0.01      & \textbf{0.80±0.01} & \textbf{0.80±0.01}     \\ \cline{2-6} 
                 
                \multirow{-11}{*}{\textbf{HM}} &
                  \textbf{FedR\'enyi ($n_k/n$)} &
                  \textbf{0.80±0.01} &
                  \textbf{0.80±0.01} &
                  \textbf{0.80±0.01} &
                  \textbf{0.80±0.01} \\ \hline
              \end{tabular}
        \end{table}

        \begin{table}[!htbp]
            \centering
          \caption{
            Performance of the methods under heterogeneous settings on DUTCH.
            Accuracy, fairness, and harmonic mean are denoted by ACC, FR, and HM, respectively. A smaller Dir indicates a more heterogeneous distribution across clients.
            Dir=+$\infty$ represents the uniform data distribution setting.
            }
          \label{Table: DUTCH full result summary}
            \begin{tabular}{|c|c|c|c|c|c|}
                \hline
                \textbf{DUTCH} &
                  \textbf{Method} &
                  \textbf{Dir=0.5} &
                  \textbf{Dir=1} &
                  \textbf{Dir=8} &
                  \textbf{Dir=+$\infty$} \\ \hline
                 &
                  \textbf{Local} &
                  0.79±0.01 &
                  0.80±0.01 &
                  0.81±0.01 &
                  0.80±0.01 \\ \cline{2-6} 
                 &
                  \textbf{FedAvg} &
                  0.81±0.01 &
                  0.81±0.01 &
                  0.81±0.01 &
                  0.81±0.01 \\ \cline{2-6} 
                 &
                  \textbf{FedProx} &
                  0.80±0.01 &
                  0.81±0.01 &
                  0.81±0.01 &
                  0.81±0.01 \\ \cline{2-6} 
                 &
                  \textbf{Scaffold} &
                  0.60±0.12 &
                  0.57±0.13 &
                  0.55±0.13 &
                  0.57±0.13 \\ \cline{2-6} 
                 &
                  \textbf{FedFair} &
                  0.61±0.16 &
                  0.62±0.08 &
                  0.61±0.15 &
                  0.61±0.13 \\ \cline{2-6} 
                 &
                  \textbf{LCO} &
                  0.62±0.03 &
                  0.67±0.03 &
                  0.61±0.05 &
                  0.61±0.13 \\ \cline{2-6} 
                 &
                  \textbf{FL-FairBatch} &
                  0.81±0.01 &
                  0.81±0.01 &
                  0.81±0.01 &
                  0.81±0.01 \\ \cline{2-6} 
                 &
                  \textbf{FedFB} &
                  0.69±0.05 &
                  0.74±0.05 &
                  0.74±0.04 &
                  0.60±0.10 \\ \cline{2-6} 
                 &
                  \textbf{FairFed} &
                  0.62±0.13 &
                  0.70±0.07 &
                  0.75±0.06 &
                  0.61±0.12 \\ \cline{2-6} 
                 
                 &
                  \textbf{FedR\'enyi ($1/K$)} &
                  \textbf{0.83±0.01} &
                  0.83±0.00 &
                  0.83±0.00 &
                  0.83±0.00 \\ \cline{2-6} 
                 
                \multirow{-11}{*}{\textbf{ACC}} &
                  \textbf{FedR\'enyi ($n_k/n$)} &
                  0.83±0.01 &
                  \textbf{0.83±0.01} &
                  \textbf{0.83±0.01} &
                  \textbf{0.83±0.01} \\ \hline
                 &
                  \textbf{Local} &
                  0.67±0.04 &
                  0.67±0.05 &
                  0.65±0.06 &
                  0.65±0.07 \\ \cline{2-6} 
                 &
                  \textbf{FedAvg} &
                  0.64±0.08 &
                  0.65±0.08 &
                  0.66±0.07 &
                  0.66±0.06 \\ \cline{2-6} 
                 &
                  \textbf{FedProx} &
                  0.63±0.09 &
                  0.67±0.06 &
                  0.65±0.07 &
                  0.63±0.08 \\ \cline{2-6} 
                 &
                  \textbf{Scaffold} &
                  0.84±0.18 &
                  0.87±0.13 &
                  0.87±0.16 &
                  0.84±0.16 \\ \cline{2-6} 
                 &
                  \textbf{FedFair} &
                  0.65±0.35 &
                  0.71±0.12 &
                  0.62±0.19 &
                  0.72±0.19 \\ \cline{2-6} 
                 &
                  \textbf{LCO} &
                  0.65±0.35 &
                  0.73±0.01 &
                  0.64±0.01 &
                  0.72±0.11 \\ \cline{2-6} 
                 &
                  \textbf{FL-FairBatch} &
                  0.66±0.06 &
                  0.66±0.06 &
                  0.66±0.07 &
                  0.64±0.07 \\ \cline{2-6} 
                 &
                  \textbf{FedFB} &
                  0.92±0.04 &
                  0.72±0.21 &
                  0.69±0.20 &
                  0.87±0.23 \\ \cline{2-6} 
                 &
                  \textbf{FairFed} &
                  0.78±0.25 &
                  0.80±0.18 &
                  0.75±0.11 &
                  0.80±0.20 \\ \cline{2-6} 
                 
                 &
                  \textbf{FedR\'enyi ($1/K$)} &
                  0.94±0.04 &
                  0.93±0.03 &
                  0.94±0.03 &
                  0.94±0.04 \\ \cline{2-6} 
                 
                \multirow{-11}{*}{\textbf{FR}} &
                  \textbf{FedR\'enyi ($n_k/n$)} &
                  \textbf{0.96±0.04} &
                  \textbf{0.95±0.04} &
                  \textbf{0.94±0.04} &
                  \textbf{0.96±0.04} \\ \hline
                 &
                  \textbf{Local} &
                  0.73±0.02 &
                  0.73±0.02 &
                  0.72±0.02 &
                  0.72±0.02 \\ \cline{2-6} 
                 &
                  \textbf{FedAvg} &
                  0.72±0.02 &
                  0.72±0.02 &
                  0.73±0.02 &
                  0.73±0.02 \\ \cline{2-6} 
                 &
                  \textbf{FedProx} &
                  0.70±0.02 &
                  0.73±0.02 &
                  0.72±0.02 &
                  0.71±0.02 \\ \cline{2-6} 
                 &
                  \textbf{Scaffold} &
                  0.70±0.14 &
                  0.69±0.13 &
                  0.67±0.14 &
                  0.68±0.14 \\ \cline{2-6} 
                 &
                  \textbf{FedFair} &
                  0.63±0.22 &
                  0.66±0.10 &
                  0.61±0.17 &
                  0.66±0.15 \\ \cline{2-6} 
                 &
                  \textbf{LCO} &
                  0.63±0.06 &
                  0.70±0.01 &
                  0.62±0.02 &
                  0.66±0.12 \\ \cline{2-6} 
                 &
                  \textbf{FL-FairBatch} &
                  0.73±0.02 &
                  0.73±0.02 &
                  0.73±0.02 &
                  0.72±0.02 \\ \cline{2-6} 
                 &
                  \textbf{FedFB} &
                  0.79±0.04 &
                  0.73±0.08 &
                  0.71±0.07 &
                  0.71±0.14 \\ \cline{2-6} 
                 &
                  \textbf{FairFed} &
                  0.69±0.17 &
                  0.75±0.10 &
                  0.75±0.08 &
                  0.69±0.15 \\ \cline{2-6} 
                 
                 &
                  \textbf{FedR\'enyi ($1/K$)} &
                  0.88±0.02 &
                  0.88±0.00 &
                  0.88±0.00 &
                  0.88±0.00 \\ \cline{2-6} 
                 
                \multirow{-11}{*}{\textbf{HM}} &
                  \textbf{FedR\'enyi ($n_k/n$)} &
                  \textbf{0.89±0.02} &
                  \textbf{0.89±0.02} &
                  \textbf{0.88±0.02} &
                  \textbf{0.89±0.02} \\ \hline
              \end{tabular}
        \end{table}

\end{document}
