%\documentclass{uai2022} % for initial submission
\documentclass[accepted]{uai2022} % after acceptance, for a revised
                                    % version; also before submission to
                                    % see how the non-anonymous paper
                                    % would look like
%% There is a class option to choose the math font
%\documentclass[mathfont=ptmx]{uai2022} % ptmx math instead of Computer
                                         % Modern (has noticable issues)
% \documentclass[mathfont=newtx]{uai2022} % newtx fonts (improves upon
                                          % ptmx; less tested, no support)
% NOTE: Only keep *one* line above as appropriate, as it will be replaced
%       automatically for papers to be published. Do not make any other
%       change above this note for an accepted version.

%% Choose your variant of English; be consistent
\usepackage[american]{babel}
% \usepackage[british]{babel}

%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} % amsmath with fixes and additions
% \usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{tikz} % nice language for creating drawings and diagrams

\usepackage{float}
\usepackage{multirow}
%\DeclareRobustCommand{\disambiguate}[3]{#2~#3}
\usepackage{fullpage}
\usepackage{amsfonts}
%\usepackage{CJK}
\usepackage{caption}
\usepackage{graphicx}
\usepackage{mathrsfs}
\usepackage{amsmath}
\usepackage{amssymb}
\usepackage{amsthm}
\usepackage{subfigure}
\usepackage{adjustbox}

\numberwithin{equation}{section}
%\usepackage{float,color}
\usepackage{tabularx}
%\usepackage{ulem}
%\usepackage{enumerate}
\usepackage{enumitem}


%\usepackage{lineno}
%\usepackage{xr}
%\usepackage{diagbox}
\usepackage{booktabs}
\usepackage{hyperref}
% Hyperlink appearance: links are shown as coloured text (colorlinks=true),
% with a separate colour per link type set by the keys below.
\hypersetup{
	colorlinks=true,
	linkcolor=red,
	filecolor=magenta,      
	urlcolor=cyan,
	citecolor = blue
}
\usepackage{dsfont}
\usepackage[boxed,ruled,commentsnumbered]{algorithm2e}
% Theorem-like environments.
% Each environment has its own counter (none is shared between environments).
% All of them are numbered within the current section (trailing [section]),
% except `aggregation`, which has no parent counter and is therefore numbered
% consecutively through the whole document.
\newtheorem{theorem}{Theorem}[section]
\newtheorem{lemma}{Lemma}[section]
\newtheorem{assumption}{Assumption}[section]
\newtheorem{proposition}{Proposition}[section]
\newtheorem{remark}{Remark}[section]
\newtheorem{aggregation}{Aggregation}
\newtheorem{corollary}{Corollary}[section]
\newtheorem{definition}{Definition}[section]
\newtheorem{example}{Example}[section]


% Math shorthand.
% Naming convention: prefix `h` = \widehat, `t` = \widetilde, `b` = plain bold;
% every vector/matrix symbol is typeset with \boldsymbol.
% NOTE: these are plain \def's, so an existing macro with the same name is
% overwritten without any warning.
% Bold b, c, d (hat / tilde variants):
\def \hb{\widehat{\boldsymbol{b}}}
\def \tb{\widetilde{\boldsymbol{b}}}
\def \bc{\boldsymbol{c}}
\def \hd{\widehat{\boldsymbol{d}}}
\def \td{\widetilde{\boldsymbol{d}}}
% Bold theta, Sigma, mu (plain / hat / tilde variants):
\def \btheta{\boldsymbol{\theta}}
\def \htheta{\widehat{\boldsymbol{\theta}}}
\def \ttheta{\widetilde{\boldsymbol{\theta}}}
\def \bSigma{\boldsymbol{\Sigma}}
\def \hSigma{\widehat{\boldsymbol{\Sigma}}}
\def \bmu{\boldsymbol{\mu}}
\def \hmu{\widehat{\boldsymbol{\mu}}}
\def \tmu{\widetilde{\boldsymbol{\mu}}}
% Upright T; the body uses it as a transpose superscript, e.g. ^{\T}.
\def \T{\mathrm{T}}
% Caution: \R already includes the exponent, i.e. it expands to \mathbb{R}^p,
% not to a bare \mathbb{R}.
\def \R{\mathbb{R}^p}
% Blackboard-bold E and P, calligraphic H.
\def \E{\mathbb{E}}
\def \Prob{\mathbb{P}}
\def \mH{\mathcal{H}}
% Caution: \bx produces a bold *capital* X.
\def \bx{\boldsymbol{X}}
% Norm helpers: the plain version uses fixed-size \| delimiters, the `LR`
% version uses \left\| ... \right\| so the delimiters scale with the argument.
% inf norm
\newcommand{\infnorm}[1]{\|#1\|_{\infty}}
\newcommand{\LRinfnorm}[1]{\left\|#1\right\|_{\infty}}
%ell 2 norm
\newcommand{\twonorm}[1]{\|#1\|_{2}}
\newcommand{\LRtwonorm}[1]{\left\|#1\right\|_{2}}
%ell 1 norm
\newcommand{\onenorm}[1]{\|#1\|_{1}}
\newcommand{\LRonenorm}[1]{\left\|#1\right\|_{1}}


% Draft-only inline annotations: each macro prints its argument in bold,
% coloured, square-bracketed text prefixed with a tag (XJ, BYJ or Review).
% \textcolor needs a colour package; none is loaded explicitly in this
% preamble -- presumably it arrives via tikz/hyperref; TODO confirm.
\newcommand{\xj}[1]{\textcolor{red}{\textbf{[XJ: #1]}}}
\newcommand{\byj}[1]{\textcolor{blue}{\textbf{[BYJ: #1]}}}
\newcommand{\rev}[1]{\textcolor{magenta}{\textbf{[Review: #1]}}}

%\usepackage[hang,multiple]{footmisc}

% Boolean switch \iffootpunct, initialised to true. It is not tested anywhere
% in the portion of the file reviewed here.
\newif\iffootpunct
\footpuncttrue

% \blfootnote{<text>}: footnote without a visible mark.
% Inside a group, \thefootnote is redefined to be empty and a regular
% \footnote is issued; the footnote counter is then decremented by one to
% cancel the step made by \footnote, so numbered footnotes stay consecutive.
\newcommand\blfootnote[1]{%
	\begingroup
	\renewcommand\thefootnote{}\footnote{#1}%
	% NOTE(review): \footpunctfalse is executed after \footnote and is local to
	% this group, so it is undone by \endgroup and cannot influence the footnote
	% typeset above -- confirm whether it was meant to precede \footnote.
	\footpunctfalse
	\addtocounter{footnote}{-1}%
	\endgroup
}
\usepackage{lastpage}

%% Provided macros
% \smaller: Because the class footnote size is essentially LaTeX's \small,
%           redefining \footnotesize, we provide the original \footnotesize
%           using this macro.
%           (Use only sparingly, e.g., in drawings, as it is quite small.)

%% Self-defined macros
\newcommand{\swap}[3][-]{#3#1#2} % just an example

\title{Byzantine-Tolerant Distributed Multiclass Sparse Linear Discriminant Analysis}

% The standard author block has changed for UAI 2022 to provide
% more space for long author lists and allow for complex affiliations
%
% All author information is authomatically removed by the class for the
% anonymous submission version of your paper, so you can already add your
% information below.
%
% Add authors
%\author[1]{\href{mailto:<jj@example.edu>?Subject=Your UAI 2022 paper}{Jane~J.~von~O'L\'opez}{}}
\author[1]{Yajie Bao}
\author[1,2]{Weidong Liu}
\author[1]{Xiaojun Mao\thanks{Corresponding author.}}
\author[3]{Weijia Xiong}
% Add affiliations after the authors
\affil[1]{%
    School of Mathematical Sciences\\
    Shanghai Jiao Tong University\\
    Shanghai, China
}
\affil[2]{
MoE Key Lab of Artificial Intelligence\\
Shanghai Jiao Tong University\\
Shanghai, China
}
\affil[3]{%
    School of Public Health,
    The University of Hong Kong\\
    Hong Kong, China
}

\begin{document}

\maketitle

\begin{abstract}
  Communication cost and security issues are both important in large-scale distributed machine learning. In this paper, we investigate a multiclass sparse classification problem under two distributed systems. We propose two distributed multiclass sparse discriminant analysis algorithms based on mean-aggregation and median-aggregation under the normal distributed system or Byzantine failure system. Both of them are computation and communication efficient. Several theoretical results, including estimation error bounds, and support recovery, are established. With moderate initial estimators, our iterative estimators achieve a (near-)optimal rate and exact support recovery after a constant number of rounds. Experiments on both synthetic and real datasets are provided to demonstrate the effectiveness of our proposed methods.
\end{abstract}

\section{Introduction}\label{intro}
	
	Multiclass classification is one of the most important topics in machine learning and plays a crucial role in many fields, such as facial recognition, text mining, and gene analysis \citep{heisele2001face,zhang2015character,ramaswamy2001multiclass}.
	
	Linear discriminant analysis (LDA) is a useful tool for classification problems, which aims to find linear discriminant directions to separate the samples from different classes. We consider the random variable $\boldsymbol{X}$ and its label $Y$, where $\boldsymbol{X}\in \mathbb{R}^p$ is a multivariate normal random variable with mean $\boldsymbol{\mu}_k$ and covariance matrix $\boldsymbol{\Sigma}$ when $Y=k$ for $k=1,2,...,K$.  Let $\pi_k = \mathbb{P}(Y=k)$ be the prior probability that variable $\boldsymbol{X}$ is observed from Class $k$. The oracle Bayes rule under the LDA model can be written as
	\begin{equation*}
	\widehat{Y}=\arg \max _{k}\left\{\boldsymbol{\theta}^{*\top}_{k}\left(\boldsymbol{X}-\frac{\boldsymbol{\mu}_{k} +\bmu_{1}}{2}\right)+\log \frac{\pi_k}{\pi_1}\right\},
	\end{equation*}
	where $\boldsymbol{\theta}^{*}_{k} = \boldsymbol{\Sigma}^{-1}(\boldsymbol{\mu}_k - \boldsymbol{\mu}_1)$ for $k=1,2,...,K$ denote Fisher's discriminant directions. In practice, we need to estimate $\boldsymbol{\mu}_1$, $\boldsymbol{\mu}_k$, and $\boldsymbol{\Sigma}$ to obtain an estimate of $\boldsymbol{\theta}_k^{*}$. Given independent samples $\{(\boldsymbol{X}_i, Y_i), i = 1,2,...,N\}$ from $K$ classes and denoting $\sum_{i = 1}^{N} \mathds{I}(Y_i = k) = N_k$, the classical estimators of Fisher's discriminant directions are given by $\widehat{\boldsymbol{\theta}}_{k} = \widehat{\boldsymbol{\Sigma}}^{-1}(\widehat{\boldsymbol{\mu}}_k - \widehat{\boldsymbol{\mu}}_1)$ for $k = 2,...,K$ where $\widehat{\boldsymbol{\mu}}_k = \sum_{\{i: Y_i = k\}} \boldsymbol{X}_i/N_k$
	and 
	$$
	\widehat{\boldsymbol{\Sigma}} = \frac{1}{N}\sum_{k=1}^{K}\sum_{\{i: Y_i = k\}}(\boldsymbol{X}_i - \widehat{\boldsymbol{\mu}}_k)(\boldsymbol{X}_i - \widehat{\boldsymbol{\mu}}_k)^{\top}.
	$$
	Then the classical discriminant rule is
	\begin{equation*}
	\widehat{Y}=\arg \max _{k}\left\{\htheta^{\top}_{k}\left(\boldsymbol{X}-\frac{\hmu_{k} + \hmu_{1}}{2}\right)+\log\frac{\widehat{\pi}_k}{\widehat{\pi}_1}\right\},
	\end{equation*}
	where $\widehat{\pi}_k = N_k/N$. It has been shown to be both theoretically and practically efficient in the classical fixed-dimensionality regime. Nevertheless, the classical linear discriminant rule performs poorly (no better than random guessing) when the dimensionality $p$ grows with the sample size $N$ \citep{bickel2004some}. The main reason is that the sample covariance matrix will be ill-conditioned in such a case. Another related problem is over-fitting, which leads to a great loss in the performance of the model.
	
	%Owing to the increasing dimensionality, many sparse LDA methods have been proposed by introducing sparsity assumption to the Fisher's discriminant directions \citep{shao2011sparse, witten2011penalized, cai2011direct, fan2012road, mai2012direct,mai2019multiclass}. For binary classification case, \citet{shao2011sparse} assumed both $\bmu_k$ and $\bSigma$ are sparse then estimated them using a thresholding procedure; \citet{cai2011direct} and \citet{mai2012direct} directly estimated the Bayesian discriminant direction $\bSigma^{-1}(\bmu_2 - \bmu_1)$ using Dantzig selector \citep{candes2007dantzig} and lasso penalty respectively. Unfortunately, these methods can not be generalized to multiclass classification cases easily. For the multiclass sparse LDA problem, penalized Fisher’s discriminant \citep{witten2011penalized}, sparse optimal scoring \citep{clemmensen2011sparse} and multiclass sparse discriminant analysis (MSDA) \citep{mai2019multiclass} are three most popular proposals. Specifically, the MSDA method estimates all the sparse Bayesian discriminant directions simultaneously by solving a quadratic group lasso problem.
	
	
	%\rev{The paper is unable to present the key challenges in the distributed computation of the sparse LDA with muli-class setting. The introduction simply list the issues of the other distributed problems rather then LDA. Since each machine learning method has its own challenges, it is expected to list and discuss them in the introduction, which the paper fails to present.}
	
	%\byj{Focus on distributed LDA in Introduction}
	
	Due to the rapid growth in the size of datasets and resource sharing, there has been tremendous interest in developing distributed machine learning methods in recent years. However, not many works focus on sparse LDA in a distributed environment. The main challenge of distributed estimation is the communication cost. The existing sparse LDA algorithms require constructing an overall sample covariance matrix, which is unrealistic in the distributed system when $p$ is large since the bandwidth of the local machine is limited. To reduce the communication cost, \citet{tian2017communication} proposed a distributed sparse LDA algorithm that only requires one round of communication. To the best of our knowledge, it is also the only existing distributed sparse LDA algorithm. Despite this, the computational issue is salient for each local machine. The algorithm involves solving large-scale linear programming and estimating the inverse of the covariance matrix, which is computationally expensive in the high-dimensional setting. Furthermore, the convergence rate in \citet{tian2017communication} will be sub-optimal when the number of local machines is large.
	
	Another concerning issue in distributed machine learning is security. Most distributed machine learning algorithms require a master machine to aggregate the information from local machines, which are susceptible to errors due to unpredictable and potential attacks. The security issue is more prominent in large-scale distributed systems, such as Federated Learning \citep{konevcny2016federated}. Byzantine failure is used to model the local machine's inherent abnormal behavior, which means that some local machines may send wrong messages or behave completely arbitrarily \citep{lamport2019byzantine}. The algorithm in \citet{tian2017communication} takes simple averaging aggregation of the information from local machines, which is highly non-robust in the Byzantine failure system. We note that there are several works related to the robust LDA algorithm \citep{zhang2010worst,wen2018robust,nie2019robust}, whereas these methods only work for heavy-tailed data and are not resistant to Byzantine failure. Thus it is of great interest to develop Byzantine-robust multi-classification algorithm with a theoretical guarantee.
	
	Although there are extensive studies for distributed estimation and optimization, few are related to sparse LDA in the high-dimensional regime. \citet{tian2017communication} proposed a communication efficient distributed sparse LDA method by constructing a debiased version of linear programming discriminant (LPD) estimator \citep{cai2011direct} for the binary classification task. Recently, \citet{bian2020} proposed a distributed sparse LDA method that does not require the communications of data information among different local machines. Unfortunately, no theoretical guarantee was provided in their work. More importantly, both of them are sensitive to the abnormal behaviors of local machines in the distributed system.
	
	\subsection{Our Contributions}
	
	To address the challenge of increasing dimensionality and Byzantine failure, we propose a new communication efficient distributed sparse LDA algorithm for distributed multi-classification problem in two different systems, respectively:
	%\byj{moved two cases here}
    \begin{itemize}
        \item System I: The distributed system without Byzantine failures;
	    \item System II: There are $\alpha$ fraction Byzantine local machines, and the remaining $1-\alpha$ fraction local machines are normal.
    \end{itemize}
	Under System I, we propose the distributed sparse LDA method based on mean-aggregation (\texttt{Mean-DSLDA}). As for System II, the median-aggregation is applied against the potential Byzantine failure. Thus we propose the Byzantine-tolerant \texttt{Median-DSLDA}. With these two methods, we highlight the main contributions of this paper:
	\begin{enumerate}
		\item Our proposed algorithm shares the same $O(p)$ communication cost with the state-of-the-art distributed learning algorithms \citep{lee2017distributed, wang2017efficient}.
		\item Compared with \citet{tian2017communication}, our proposed algorithm has lower computational complexity on each local machine.
		
		\item The theoretical results guarantee that our proposed algorithm attains (near-)optimal statistical convergence rate and exact support recovery after a constant number of communication rounds.
		%attains $O_{\mathbb{P}}(\sqrt{s\log p/N})$ and $O_{\mathbb{P}}(\sqrt{s\log p/N} + \alpha\sqrt{s/n}+\sqrt{s}/n)$ under System I and II respectively, where $N$ is total sample size, $n$ is local sample size, $p$ is the dimensionality, $s$ is sparsity and $\alpha$ is the fraction of Byzantine machines.
		\item The experiments on synthetic and real data show that our proposed algorithm converges quickly, and \texttt{Median-DSLDA} is highly robust to Byzantine failures.
	\end{enumerate}
	
	
	%To address the challenge of increasing dimensionality and Byzantine failure, we aim to develop a new robust sparse LDA algorithm for the multi-classification problem in distributed system. The proposed algorithm shall achieve optimal statistical convergence rate after a constant number of communication rounds. As we have emphasized, there should be no constraint on the number of local machines.
	
	%A commonly used approach to reduce communication cost is divide-and-conquer \citep{zhang2013communication,zhang2015divide, shi2018massive,ghosh2018divide}. At first, each local machine obtains a local estimator and sends it back to the master machine. Then the master machine averages all local estimators to produce an aggregate estimator. However, the simple average aggregation can only reduce the variance of estimator instead of bias. Moreover, several works \citep{Lee2017,JMLR:v18:17-343,battey2018distributed,tian2017communication} were studied to improve the performance by using the debiased estimators proposed in \citet{van2014asymptotically, javanmard2014confidence} as local estimator and then taking the average. 
	
	%For example, \citet{Lee2017} developed communication-efficient sparse regression through averaging debiased lasso estimators. \citet{JMLR:v18:17-343} proposed the divide-and-conquer debiased $\ell_1$ norm support vector machine. \citet{battey2018distributed} investigated hypothesis testing and parameter estimation using debiased local estimator. \citet{tian2017communication} proposed the distributed binary sparse LDA by constructing debiased LPD\xj{What is LPD?} estimator. \xj{Same as before.}
	
	%However, there are several issues when taking the average of debiased estimators in high-dimensional setting \citep{chen2019distributed, jordan2019communication}. First, the estimation of the inverse of Hessian matrix leads to a large amount of computation complexity, especially when the dimensionality is large. Second, the constraint on the number of machines is usually quite rigorous, which is not practical in many real applications. Therefore, some multi-round distributed machine learning algorithms are proposed  \citep{shamir2014communication,jordan2019communication,chen2019quantile,chen2019distributed,wang2019distributed,fan2019communication}. The corresponding theoretical results showed that these methods can achieve certain optimal statistical convergence rate after a few rounds of communication.
	
	 %\xj{???Not complete sentence.}\byj{modified}
	
	%The rest of this paper is organized as follows. Section \ref{section 3} gives the review of related work.  Section \ref{section 3} introduces proposed Byzantine-tolerant distributed multiclass sparse discriminant analysis method. Section \ref{section 4} presents the theoretical results for $\ell_1$ and $\ell_2$ error bound, support recovery and upper bound of misclassification risk for proposed Byzantine-tolerant distributed estimator. In Section \ref{section 5}, we provide numerical experiment results to illustrate the performance of the proposed method. The discussion about the method and future work will be stated in Section \ref{section 6}.
	
	%Some additional robust aggregation rules such as marginal trimmed mean, dimensional median, Krum \citep{xie2018generalized,li2019rsa} are also studied in the existing literatures. 
	
	\subsection{Related Work}\label{r_work}
	
	Sparse LDA methods in the high-dimensional regime have been broadly investigated in recent years \citep{witten2011penalized,cai2011direct,fan2012road,mai2012direct,mai2019multiclass}. For binary classification case, \citet{cai2011direct} and \citet{mai2012direct} assumed the Bayesian discriminant direction $\bSigma^{-1}(\bmu_2 - \bmu_1)$ is sparse then directly estimated it by using Dantzig selector \citep{candes2007dantzig} and lasso penalty respectively. Unfortunately, these methods can not be generalized to multiclass classification cases easily. For the multiclass sparse LDA problem, penalized Fisher’s discriminant \citep{witten2011penalized}, sparse optimal scoring \citep{clemmensen2011sparse} and multiclass sparse discriminant analysis (MSDA) proposed in \citet{mai2019multiclass} are three popular proposals. Specifically, the MSDA method simultaneously estimates all the sparse Bayesian discriminant directions by solving a quadratic group lasso problem.
	
	Owing to the growth of sample size and dimensionality of datasets, extensive works on high-dimensional sparse distributed machine learning algorithms have been proposed. A popular method for distributed sparse estimation is the divide-and-conquer debiased (DC-debiased) framework proposed by \citet{lee2017distributed}. Thanks to easy implementation and low communication cost, the DC-debiased scheme has been broadly deployed in several sparse estimation problems \citep{lv2017debiased,tian2017communication,battey2018distributed,JMLR:v18:17-343}. However, the debiasing operation requires estimating the inverse of the Hessian matrix, which leads to expensive computation costs for each local machine. Moreover, there exists a constraint on the number of local machines for DC-debiased estimators to achieve the optimal statistical convergence rate~\citep{zhang2013communication,Lee2017,battey2018distributed}. \citet{wang2017efficient} and \citet{jordan2019communication} developed another framework, namely Communication-efficient Surrogate Likelihood (CSL), which refines the estimator by multi-round communication. Each local machine in the CSL framework only needs to compute and transmit gradients in each round, and then the master machine solves a penalized sub-problem. In particular, this kind of method has no restriction on the number of machines. Related literature based on the CSL scheme includes \citet{wang2019distributed,fan2019communication,chen2019distributed,chen2020distributed}.
	
     To tackle potential Byzantine failures in distributed learning, some related works are proposed by letting the master machine conduct a robust aggregation on the gradient information received from local machines \citep{yin2018byzantine,xie2018generalized,alistarh2018byzantine,li2019rsa,yin19a}. The most common robust aggregation rule is median-of-means (MOM). \citet{yin2018byzantine} established several optimal statistical rates under mild conditions for the proposed robust algorithms based on median and trimmed mean operations. Other robust aggregation rules such as marginal trimmed mean, dimensional median, Krum are also investigated in the existing literature \citep{xie2018generalized,li2019rsa}. \citet{tu2021variance} proposed a variance-reduced version of the median-of-means aggregation procedure motivated by the composite quantile. For distributed penalized regression problems, \citet{tu2021byzantine} developed a Byzantine-robust least-square Lasso method.
	

	\subsection{Notation}
	The following notations will be used throughout the paper. For a vector $\boldsymbol{x} \in \mathbb{R}^p$, $\|\boldsymbol{x}\|_1 = \sum_{j=1}^p |x_j|$, $\|\boldsymbol{x}\|_2 = (\sum_{j=1}^p x_j^2)^{1/2}$ and $\|\boldsymbol{x}\|_{\infty} = \max_{j}|x_j|$. For a matrix $\boldsymbol{A} = (A_{i,j}) \in \mathbb{R}^{p\times p}$, the spectral norm is defined by $\|\boldsymbol{A}\|_2 = \sup_{\|\boldsymbol{x}\|_2 = 1} \|\boldsymbol{A}\boldsymbol{x}\|_2$, the $\ell_{\infty}$ norm is defined by $\|\boldsymbol{A}\|_{\infty} = \max_{i}\sum_{j=1}^{p}|A_{i,j}|$ and $|\boldsymbol{A}|_{\infty} = \max_{i,j}|A_{i,j}|$. For a symmetric matrix $\boldsymbol{A}$, the smallest and largest eigenvalues of $\boldsymbol{A}$ are denoted by $\lambda_{\min}(\boldsymbol{A})$ and $\lambda_{\max}(\boldsymbol{A})$ respectively. For a matrix $\boldsymbol{A} \in \mathbb{R}^{m\times n}$, $\boldsymbol{A}_{ST}$ denotes the submatrix $(a_{s_{i} t_{j}})$ for $S=\left\{s_{1}, \ldots, s_{r}\right\} \subseteq\{1, \ldots, m\}$ and $T=\left\{t_{1}, \ldots, t_{q}\right\} \subseteq \{1, \ldots, n\}$. For two sequences of positive numbers $c_n$ and $d_n$, we write $c_n \lesssim d_n$ if there exists some positive constant $c$ such that $c_n\leq c d_n$ holds for sufficiently large $n$; and $c_n \asymp d_n$ if $c_n \lesssim d_n$ and $d_n \lesssim c_n$. For a sequence of random variables $X_n$, $X_n = O_{\mathbb{P}}(d_n)$ means that for any $\varepsilon>0$ there exists some positive constant $C$ such that $\mathbb{P}(|X_n|>Cd_n)<\varepsilon$.
	
	\section{Models and Algorithms}\label{models}
	%\xj{Do not copy other's style}
	%In this section, we formally introduce the high-dimensional multiclass sparse LDA problem in the distributed system and some concepts. 
	
	%In this section, we introduce the high-dimensional multiclass sparse LDA model and propose two distributed algorithms for two different distributed systems.\byj{add intro of this section}
	
	Recall that the Bayesian discriminant directions to be estimated are $\boldsymbol{\theta}_k^{*} = \boldsymbol{\Sigma}^{-1}(\boldsymbol{\mu}_k - \boldsymbol{\mu}_1)$ for $k=2,...,K$. According to \citet{mai2019multiclass}, the contribution to the discriminant from the $j$-th variable of $\boldsymbol{X}$ vanishes if and only if $\theta_{2,j}^{*}=\cdots = \theta_{K, j}^{*} = 0$, which means $\theta_{k, j}^{*}, k=2,...,K$ are grouped according to $j$. Define the support set $S = \{j: \theta^{*}_{k,j}\neq 0 \text{ for some } k=2,...,K\}$ and let $s=|S|$ denote the sparsity. Given independent samples $\{(\boldsymbol{X}_i, Y_i), i = 1,2,...,N\}$ from $K$ classes, \citet{mai2019multiclass} proposed the multiclass sparse discriminant analysis (MSDA) method and estimated $\boldsymbol{\theta}^{*}_{k}$ for $k=2,...,K$ %\byj{use $\boldsymbol{\theta}^{*}_{k}$ instead of $\boldsymbol{\Theta}^{*}$} 
	simultaneously by solving the following group lasso problem
	\begin{equation*}
	\min_{\boldsymbol{\theta}_{2}, \ldots, \boldsymbol{\theta}_{K}} \sum_{k=2}^{K}\left\{\frac{1}{2} \boldsymbol{\theta}^{\top}_{k} \widehat{\boldsymbol{\Sigma}} \boldsymbol{\theta}_{k}-\left(\widehat{\boldsymbol{\mu}}_{k}-\widehat{\boldsymbol{\mu}}_{1}\right)^{\top} \boldsymbol{\theta}_{k}\right\}+\lambda \sum_{j=1}^{p}\left\|\btheta_{(j)}\right\|_2,
	\end{equation*}
	where $\btheta_{(j)} = (\theta_{2,j},\ldots,\theta_{K,j})^{\T}$ and $\lambda>0$ is the tuning parameter. MSDA achieves variable selection consistency in the centralized sample case. For new observation $\boldsymbol{X}_{\text{new}}$, we classify $\boldsymbol{X}_{\text{new}}$ to Class $\widehat{Y}$ if
	\begin{equation*}
	\widehat{Y} = \arg \max _{k}\left\{\left(\boldsymbol{X}_{\text{new}}-\frac{\widehat{\boldsymbol{\mu}}_{k} + \hmu_1}{2}\right)^{\top}\widehat{\boldsymbol{\theta}}_{k}+\log\frac{\widehat{\pi}_k}{\widehat{\pi}_1}\right\},
	\end{equation*}
	where $\widehat{\boldsymbol{\theta}}_{1} = \boldsymbol{0}$. %\byj{The definition of $\hmu_k$, $\hSigma$ and $\widehat{\pi}_k$ has been given in Introduction}.
	
	For the ease of presentation, we suppose that all the samples $\{(\boldsymbol{X}_i, Y_i), i = 1,2,...,N\}$ are stored in the master machine and $M$ local machines evenly. Denote the sample index in the $m$-th machine by $\mathcal{H}_m$ for $m=0,1,...,M$ where $\mathcal{H}_0$ is the master machine, then the samples in the $m$-th machine are $\{(\boldsymbol{X}_i,Y_i):i \in \mathcal{H}_m\}$. %\xj{Mention that $\mathcal{H}_0$ is a master machine.}\byj{modified} 
	When $Y_i = k$, the corresponding observation $\boldsymbol{X}_i$ is sampled from multivariate normal distribution $\mathcal{N}(\boldsymbol{\mu}_k, \boldsymbol{\Sigma})$ for $k=1,2,...,K$. Without loss of generality, we assume the samples from $K$ classes are evenly distributed in both the master machine and $M$ local machines. Thus the sample size of Class $k$ in each machine is $n_k$. Let $n=\sum_{k=1}^{K}n_k$ be the sample size of the master machine %\byj{$n$ denotes the sample size of the master machine}
	then $N = n(M+1)$ and $N_k = n_k(M+1)$. In System II, owing to the existence of Byzantine failure machines, we assume that the master machine can never be corrupted so that we can trust the information collected in it. For the remaining local machines, some of them may collect contaminated data or send arbitrary wrong values to the master machine. Among the $M$ local machines, the fraction of Byzantine machines is denoted by $\alpha$ and the remaining $1-\alpha$ local machines are normal. In each machine $\mathcal{H}_m$ for $m=0,1,...,M$, we compute the corresponding estimators $\widehat{\pi}_{m,k} = \sum_{i\in\mathcal{H}_m}\mathds{I}(Y_{i}=k)/n$, $\widehat{\boldsymbol{\mu}}_{m,k}=\sum_{\{i\in \mathcal{H}_m:Y_{i}=k\}} \boldsymbol{X}_{i}/n_k$ and
	\begin{equation*}
	   \widehat{\boldsymbol{\Sigma}}_{m}=\frac{1}{n} \sum_{k=1}^{K} \sum_{\{i\in \mathcal{H}_m:Y_{i}=k\}}\left(\boldsymbol{X}_{i}-\widehat{\boldsymbol{\mu}}_{m,k}\right)\left(\boldsymbol{X}_{i}-\widehat{\boldsymbol{\mu}}_{m,k}\right)^{\top}.
	\end{equation*}
	%\xj{use $\mathcal{H}_0$, otherwise it is not easy to describe the master machine.}\xj{Modify correspondingly.}\byj{modified}
	
	Given initial estimators $\widehat{\boldsymbol{\theta}}^{(0)}_{k}$ for $k=2,...,K$ and motivated by CSL framework \citep{jordan2019communication}, we update the estimator in the $t$-th iteration by solving the following quadratic group lasso problem%\byj{mentioned the $t$-th iteration}
	\begin{equation}\label{iter-problem}
	\begin{aligned}
		\min _{\boldsymbol{\theta}_{2}, \ldots, \boldsymbol{\theta}_{K}} \sum_{k=2}^{K} \left\{\frac{1}{2}\boldsymbol{\theta}^{\top}_{k} \widehat{\boldsymbol{\Sigma}}_{0} \boldsymbol{\theta}_{k}-(\widehat{\boldsymbol{\Sigma}}_{0}\widehat{\boldsymbol{\theta}}^{(t-1)}_{k}-\boldsymbol{b}_{k}^{(t-1)})^{\top} \boldsymbol{\theta}_{k}\right\}&\\
		 +\lambda\sum_{j=1}^{p}\left\|\btheta_{(j)}\right\|_2&,
	\end{aligned}
	\end{equation}
	where $\boldsymbol{b}_{k}^{(t-1)}$ is a consistent estimator of $\bSigma\htheta^{(t-1)}_{k}-(\bmu_k - \bmu_1)$ given $\htheta^{(t-1)}_{k}$. The optimization problem (\ref{iter-problem}) can be efficiently solved by several well studied methods, such as group coordinate descent algorithm \citep{yuan2006model} and iterative soft-thresholding algorithm \citep{beck2009fast}. Thus the keystone is to construct $\boldsymbol{b}_{k}^{(t-1)}$ in the master machine by using the local information under different distributed systems (System I and II) efficiently and safely. %\xj{What is $g$? Same for the following.}
	
	%\byj{two operations to estimate $\bSigma\htheta^{(t-1)}_{k}-(\bmu_k - \bmu_1)$.}
	
	The following are two kinds of aggregations considered in our algorithms to construct $\boldsymbol{b}_{k}^{(t-1)}$ and estimate $\bmu_k$, $\pi_k$.
	
	\begin{aggregation}[Mean-aggregation]\label{def2}
	The master machine estimates $\bmu_d$ and $\pi_d$ by using $\hmu_d = \sum_{m=0}^M \hmu_{m,d}/(M+1)$ and $\widehat{\pi}_d = \sum_{m=0}^M \widehat{\pi}_{m,d}/(M+1)$ respectively for $d = 1,2,...,K$. Denote 
	\begin{equation*}
	    \hd_{m,k}^{(t-1)}=\frac{1}{n}\sum_{d=1}^{K}\sum_{\{i:i \in \mH_{m}, Y_i = d\}}(\bx_i-\hmu_{d})(\bx_i-\hmu_{d})^{\T}\htheta_{k}^{(t-1)},
	\end{equation*}
	and construct $\boldsymbol{b}_{k}^{(t-1)}$ by using
	\begin{equation*}
	    \hb_{k}^{(t-1)} = \hd_{k}^{(t-1)} - (\hmu_k-\hmu_1),
	\end{equation*}
	where $\hd_{k}^{(t-1)} = \sum_{m=0}^{M}\hd_{m,k}^{(t-1)}/(M+1)$.
	\end{aggregation}
	%\xj{Change definition to be Aggregation.}\byj{modified}
	
	For vectors $\boldsymbol{x}_m \in \mathbb{R}^{p},\ m=0,1,...,M$, the coordinate-wise median operator is denoted by $\operatorname{cmed}$. That is, $\boldsymbol{z} := \operatorname{cmed}\{\boldsymbol{x}_m:\ m=0,1,...,M\}$ is a $p$-vector and $z_j$ is the median of $\{x_{m,j}:m=0,1,...,M\}$. We then define the following aggregator and estimators.
	\begin{aggregation}[Median-aggregation]\label{def3}
	The master machine estimates $\bmu_d$ and $\pi_d$ by using $\tmu_d = \operatorname{cmed}\{\widehat{\boldsymbol{\mu}}_{m,d}:\ m=0,1,...,M\}$ and $\widetilde{\pi}_d = \operatorname{median}\{\widehat{\pi}_{m,d}:m=0,1,...,M\}$ respectively for $d = 1,2,...,K$. Denote
	$\td_{m,k}^{(t-1)} = \widehat{\boldsymbol{\Sigma}}_{m}\widehat{\boldsymbol{\theta}}_{k}^{(t-1)}$ and construct $\boldsymbol{b}_{k}^{(t-1)}$ by using
	\begin{equation*}
	    \tb_{k}^{(t-1)} = \td_{k}^{(t-1)} - (\tmu_k - \tmu_1),
	\end{equation*}
	where $\td_{k}^{(t-1)} = \operatorname{cmed}\{\td_{m,k}^{(t-1)}:\ m=0,1,...,M\}$.
	\end{aggregation}
	
	%\byj{this paragraph was modified.}
	With the help of Aggregation \ref{def2} and \ref{def3}, we propose \texttt{Mean-DSLDA} and \texttt{Median-DSLDA} under System I and System II, respectively. 
	%We start by the plug-in naive initial MSDA estimator obtained using only the data in the master machine. 
	We start by obtaining $\hmu_k$, $\widehat{\pi}_k$, $\tmu_k$, and $\widetilde{\pi}_k$ in the master machine through one round of communication. Given the initial estimators $\widehat{\btheta}_{k}^{(0)}$ satisfying some mild conditions (see Section \ref{theory}), the vectors $\hd_{m,k}^{(0)}$ or $\td_{m,k}^{(0)}$ are computed in parallel on the $m$-th local machine. We only need to communicate these $p$-dimensional vectors to the master machine, so the communication cost is $O(p)$. With these constructed vectors, the master machine computes $\boldsymbol{b}_{k}^{(0)}$ by Aggregation \ref{def2} and \ref{def3} under System I and II, respectively, and obtains the updated estimators $\widehat{\btheta}_{k}^{(1)}$ by solving (\ref{iter-problem}). These steps can be repeated iteratively to refine the estimators at each communication round. 
	
	\begin{algorithm}[ht!]
		\caption{\small Distributed Multiclass Sparse Linear Discriminant Analysis (DSLDA)\label{algorithm}}
		%\SetKwInOut{Input}{\textbf{Input}}\SetKwInOut{Output}{\textbf{Output}}
		{\bf Input:} Local data sets $\{\bx_i, Y_i:i\in \mH_{m}\}$ for $m=0,1,...,M$, the number of iterations $T$, the initial estimators $\{\htheta_k^{(0)}:k=2,...,K\}$, the tuning parameters $\lambda_{t}$ for $t=1,...,T$.\\
		{\bf Output:} Final estimators $\{\htheta_k^{(T)}:k=2,...,K\}$.
		\BlankLine
		\For{$m=0,1,...,M$}{
		    \textbf{The $m$-th machine}: Compute $\hmu_{m,k}$, $\widehat{\pi}_{m,k}$ then send them to the master machine.
		}
		\textbf{The master machine:} Compute $\hmu_k$, $\widehat{\pi}_k$, $\tmu_k$ and $\widetilde{\pi}_k$ for $k=1,2,...,K$ then broadcast $\widehat{\boldsymbol{\theta}}_{k}^{(0)}$ and $\hmu_k$ to all local machines.\\
		\For {$t=1,2,...,T$}{
			\For{$m=0,1,...,M$}{
				\textbf{The $m$-th machine}: Compute
			$$
			\left\{\begin{array}{ll} 
			\hd_{m,k}^{(t-1)}, & \text {System I} \\ 
			\td_{m,k}^{(t-1)}, & \text {System II}
			\end{array}\right.,
			$$
			according to Aggregation \ref{def2} and \ref{def3} then send it to the master machine.
			}
			\textbf{The master machine}: Construct $\boldsymbol{b}_{k}^{(t-1)}$ by
			$$
			\boldsymbol{b}_{k}^{(t-1)} \leftarrow\left\{\begin{array}{ll} 
			\hb_{k}^{(t-1)}, & \text {System I} \\ 
			\tb_{k}^{(t-1)}, & \text {System II}
			\end{array}\right. ,
			$$
			according to Aggregation \ref{def2} and \ref{def3} and obtain $\widehat{\boldsymbol{\theta}}_{k}^{(t)}$ by solving (\ref{iter-problem}). Then broadcast $\widehat{\boldsymbol{\theta}}_{k}^{(t)}$ for $k=2,...,K$ to all local machines.
		}
	\end{algorithm}
	
	After $T$ communication rounds, we can obtain the final estimators $\widehat{\boldsymbol{\theta}}_{k}^{(T)}$ for $k=2,...,K$. Then for new observation $\boldsymbol{X}_{\text{new}}$, we classify $\boldsymbol{X}_{\text{new}}$ to Class
	\begin{equation*}
	   \small
	\begin{aligned}
	    \left\{\begin{array}{ll}\arg \max_{k}\left(\boldsymbol{X}_{\text{new}}-\frac{\hmu_{k} + \hmu_1}{2}\right)^{\top} \widehat{\boldsymbol{\theta}}_{k}^{(T)}+\log \widehat{\pi}_{k}, & \text {System I} \\ \arg \max _{k}\left(\boldsymbol{X}_{\text{new}}-\frac{\tmu_{k} + \tmu_{1}}{2}\right)^{\top} \widehat{\boldsymbol{\theta}}_{k}^{(T)}+\log \widetilde{\pi}_{k}, & \text {System II}\end{array}\right. .
	\end{aligned}
	\end{equation*}
	%\byj{modified the discussion}
	
	The details of \texttt{Mean-DSLDA} and \texttt{Median-DSLDA} are described in Algorithm \ref{algorithm}. Our proposed algorithm and the DC-debiased algorithm in \citet{tian2017communication} both require computing and storing the local covariance matrix $\hSigma_{m}$ in each local machine. In addition to this operation, the DC-debiased algorithm also needs to estimate the sparse discriminant direction and the inverse of the covariance matrix in each local machine, leading to at least $O(np^2)$ extra computational complexity. In each communication round of Algorithm \ref{algorithm}, each local machine only needs to compute $\hd_{m,k}^{(t-1)}$ or $\td_{m,k}^{(t-1)}$. The local computational complexity of our method is $O(T p^2)$, which is significantly reduced compared with the DC-debiased algorithm since $T$ is a constant based on our theory (see the discussion of Corollary \ref{cor_T}).
	
	
	\section{Theoretical Results}\label{theory}
	%\byj{\texttt{Mean-DSLDA} requires the sample size in the first machine is large enough and don't demand that every local machine must have $n$ samples. However, \texttt{Median-DSLDA} requires that every local machine must have $O(n)$ samples since MOM operation. It means that we still need some constraint on the number of local machines for \texttt{Median-DSLDA} to achieve optimal order.}
	
	In this section, we present the theoretical results of our proposed \texttt{Mean-DSLDA} and \texttt{Median-DSLDA}, including the estimation error bounds and support recovery. With a slight abuse of notation, we denote 
	\begin{align*}
	&\theta^{*}_{\min} = \min\left\{|\theta_{k,j}^{*}|:|\theta_{k,j}^{*}|\neq 0, j\in S, 2\leq k \leq K\right\},\\
	&\Delta_{\min} = \min_{1\leq k,d \leq K, k\neq d} \sqrt{\left(\boldsymbol{\mu}_k - \boldsymbol{\mu}_d\right)^{\top}\boldsymbol{\Sigma}^{-1}\left(\boldsymbol{\mu}_k - \boldsymbol{\mu}_d\right)},
	\end{align*}
	%\xj{Where is $k$ in first equation?}
	and
	\begin{equation*}
	\Delta_{\max} = \max_{1\leq k,d \leq K,k\neq d} \sqrt{\left(\boldsymbol{\mu}_k - \boldsymbol{\mu}_d\right)^{\top}\boldsymbol{\Sigma}^{-1}\left(\boldsymbol{\mu}_k - \boldsymbol{\mu}_d\right)}.
	\end{equation*}
	Let $\boldsymbol{Z}_{j}^{*}\in \mathbb{R}^{K-1}$ be the subgradient of $\|\btheta\|_2$ evaluated at $\btheta_{(j)}^{*}=(\theta_{2,j}^{*},\ldots,\theta_{K,j}^{*})^{\T}$ and $\boldsymbol{Z}_{S,k}^{*} = (Z_{j,k}^{*}:j\in S)$.
	Before presenting the formal results of our proposed method, we introduce the following technical assumptions for the clarity of the theoretical guarantee.
	\begin{enumerate}[label=($\mathbf{C}$\arabic*)]
		\item There exists a positive constant $c\ge 1$ such that $c^{-1}\leq \lambda_{\min}(\boldsymbol{\Sigma})\leq \lambda_{\max }(\boldsymbol{\Sigma})\leq c$. There exist some constants $c_1>0$ and $c_2<\infty$ such that $\Delta_{\min} >c_1$, $\Delta_{\max} < c_2$.
		%\byj{added condition $\infnorm{\bSigma_{S^{c}S}\bSigma_{SS}^{-1}}<\infty$}
		\item The sample size of each class satisfies $N_1 \asymp N_2 \asymp \cdots \asymp N_K$. The dimensionality $p$ satisfies $\log p = O(n^{\nu})$ with $\nu < \frac{1}{3}$. The sparsity $s$ satisfies that $s=O(n^{\beta})$ with $\beta<\frac{1}{3}$. The sample size of the master machine satisfies that $n\gtrsim N^{\psi}$ with $0<\psi<1$.
		
		\item The initial estimators $\widehat{\boldsymbol{\theta}}^{(0)}_{k}$ for $k=2,...,K$ have the common support set $\widehat{S}^{(0)}$ and satisfy that $\max_{2\leq k\leq K}\|\widehat{\boldsymbol{\theta}}_{k}^{(0)}-\boldsymbol{\theta}_{k}^{*}\|_2 =O_{\mathbb{P}}(a_n)$ with $a_n = o(1)$. 
		Moreover, assume that $\mathbb{P}(\widehat{S}^{(0)} \subseteq S) \to 1$.
		%\xj{We do not need a specific rate.} \xj{Change P to be probability notation.}\byj{modified}
		\item Suppose that $\boldsymbol{\Sigma}$ satisfies that $\infnorm{\bSigma_{S^{c}S}\bSigma_{SS}^{-1}}<\infty$ and
		for some $\kappa \in (0,1)$,
		$$
		\max_{j \in S^c} \left\{\sum_{k=2}^K\left(\boldsymbol{\Sigma}_{j,S}\boldsymbol{\Sigma}_{SS}^{-1}\boldsymbol{Z}_{S,k}^{*}\right)^2\right\}^{1/2} = 1 - \kappa.
		$$
		\item The fraction of Byzantine local machines $\alpha<\frac{1}{2}$.
	\end{enumerate}
	
	%\byj{added the discussion about the conditions}
	
	Condition ($\mathbf{C}1$) is common in the sparse LDA literature \citep{shao2011sparse,cai2011direct,mai2012direct}. Condition ($\mathbf{C}4$) is considered when establishing the support recovery consistency results, and also appears in \citet{mai2019multiclass}. From condition ($\mathbf{C}2$), the dimension $p$ is allowed to be greater than the local sample size $n$. Condition ($\mathbf{C}3$) can be easily satisfied if we choose some sparse estimators obtained by local samples as the initial estimators. Condition ($\mathbf{C}5$) guarantees the statistical consistency of the median-aggregation against Byzantine failures; similar assumptions can be found in \citet{yin2018byzantine,tu2021byzantine,tu2021variance}. 
	
	\subsection{Estimation Error Bound}\label{ell2}
	\begin{theorem}\label{thm_ell}
		Suppose that conditions ($\mathbf{C}1$)-($\mathbf{C}3$) and ($\mathbf{C}5$) hold. By choosing the tuning parameter $\lambda_{t} = $
		\begin{equation*}
		     \begin{aligned}
		         &\left\{\begin{array}{ll}
		         C\left(\sqrt{\frac{\log p}{N}} + a_n\left(\frac{s\log p}{n}\right)^{t/2}\right),& \text{System I}\\
		         C\left(\sqrt{\frac{\log p}{N}} + a_n\left(\frac{s\log p}{n}\right)^{t/2} +\frac{\alpha}{\sqrt{n}}+\frac{1}{n}\right), & \text{System II}
		    \end{array}
		    \right.
		     \end{aligned}
		\end{equation*}
		for some sufficiently large positive constant $C$, we are guaranteed that
	\begin{equation}\label{ell_2_bound}
	    \max_{2\leq k \leq K}\|\widehat{\boldsymbol{\theta}}_{k}^{(t)}-\boldsymbol{\theta}_{k}^{*}\|_2 = O_{\mathbb{P}}\left(\sqrt{s}\lambda_{t}\right),
	\end{equation}
	for $k=2,...,K$ under both System I and II.
	\end{theorem}
	
	Theorem \ref{thm_ell} provides $\ell_2$ estimation error bounds after the $t$-th iteration in Algorithm \ref{algorithm}. The first term in (\ref{ell_2_bound}) is the minimax rate of $\ell_2$ error bound for (group)lasso estimators in the centralized sample case (see \citet{raskutti2009minimax,buhlmann2011statistics, wainwright2019high}). The second term implies that the $\ell_2$ estimation error converges geometrically to the optimal order with contraction rate $\sqrt{s\log p/n}$. Note that \texttt{Median-DSLDA} has two additional terms in the convergence rate. The term $\alpha\sqrt{s/n}$ is owing to the existence of Byzantine failure machines while $\sqrt{s}/n$ results from the median-aggregation. Therefore, Theorem \ref{thm_ell} also indicates that \texttt{Mean-DSLDA} is more efficient than \texttt{Median-DSLDA} under System I. Considering that mean-aggregation is not resistant to Byzantine failures, \texttt{Median-DSLDA} is preferred under System II.
    
    %\byj{added discussion about the initial estimators}
    
%    Note that the initial estimator can be obtained using the samples in the master machine based on the MSDA method in \citet{mai2019multiclass}. As a consequence, the initial convergence rate $a_n$ can achieve $\sqrt{s\log p/n}$ and the second terms in (\ref{ell_2_bound}) become $s^{(2t+1)/2}(\log p/n)^{(t+1)/2}$. Then Theorem \ref{thm_ell} also provides a rough guide of selecting iteration rounds to achieve an optimal statistical convergence rate. If we set the iteration rounds such that
%	\begin{equation}\label{iter_num}
%		T \geq \frac{\log (N/n)}{\log \left(n /\left(s^{2} \log p\right)\right)},
%	\end{equation}
%	the second terms in (\ref{ell_2_bound}) will be dominated by the lead term $\sqrt{s\log p/N}$. Consequently,
%    the $\ell_2$ estimation error bounds of \texttt{Mean-DSLDA} and \texttt{Median-DSLDA} become $O_{\mathbb{P}}(\sqrt{s\log p/N})$ and $O_{\mathbb{P}}(\sqrt{s\log p/N}+\alpha\sqrt{s/n}+\sqrt{s}/n)$ respectively after $T$ iteration rounds. It is easy to see that (\ref{iter_num}) can be bounded by a constant under condition ($\mathbf{C}3$). Formally, we have the following corollary.
    \begin{corollary}\label{cor_T}
    Under the same conditions and settings in Theorem \ref{thm_ell}, if the initial estimator satisfies that 
    \begin{equation*}
        \max_{2\leq k \leq K}\twonorm{\widehat{\boldsymbol{\theta}}_{k}^{(0)}-\boldsymbol{\theta}_{k}^{*}} = O_{\mathbb{P}}\left(\sqrt{\frac{s\log p}{n}}\right),
    \end{equation*}
    and the number of iteration round $T$ satisfies 
    	\begin{equation}\label{iter_num}
		T \geq \frac{\log (N/n)}{\log \left(n /\left(s^{2} \log p\right)\right)},
	\end{equation}
    we are guaranteed that
    \begin{equation*}
      	\begin{aligned}
      		&\max_{2\leq k \leq K}\twonorm{\widehat{\boldsymbol{\theta}}_{k}^{(T)}-\boldsymbol{\theta}_{k}^{*}} \\
      		 = &\left\{\begin{array}{ll}
		         O_{\mathbb{P}}\left(\sqrt{\frac{s\log p}{N}}\right),& \text{System I}\\
		         O_{\mathbb{P}}\left(\sqrt{\frac{s\log p}{N}} +\alpha\sqrt{\frac{s}{n}}+\frac{\sqrt{s}}{n}\right), & \text{System II}
		    \end{array}
		    \right. .
      	\end{aligned}
    \end{equation*}
    \end{corollary}
    
    In System I, there is no constraint on the number of local machines $M$ since we only require the sample size of the master machine to satisfy $n\gtrsim N^{\psi}$. For System II, we require all the local machines to have $O(n)$ samples to ensure the consistency of the median-aggregation. In fact, if the number of local machines satisfies $M\lesssim \sqrt{N}$ in System II, the $\ell_2$ error bound of \texttt{Median-DSLDA} becomes $O_{\mathbb{P}}(\sqrt{s\log p/N}+\alpha\sqrt{s/n})$, which cannot be improved due to the existence of Byzantine machines. In accordance with the assumptions on $n$, $p$ and $s$ in condition ($\mathbf{C}2$), the right hand side of (\ref{iter_num}) can be bounded by
    \begin{align*}
        \frac{\log (N/n)}{\log \left(n /\left(s^{2} \log p\right)\right)} \leq \frac{1 / \psi-1}{1-\nu-2 \beta}.
    \end{align*}
    This implies that our proposed method can achieve the optimal convergence rate after a constant number of communication rounds.
    
    %\texttt{Mean-DSLDA} and \texttt{Median-DSLDA} attain the same optimal convergence rate $O_{\mathbb{P}}(\sqrt{s\log p/N})$ under System I ($\alpha = 0$).
    
    
    It is worth comparing our results with \citet{tian2017communication} for the binary classification case ($K=2$) under System I. To achieve an optimal convergence rate, they required that the number of local machines $M$ satisfies $M \lesssim \sqrt{N / \log p} / \max (s, s^{\prime})$, where $s^{\prime}$ is the maximum number of nonzero elements in each column of $\boldsymbol{\Sigma}^{-1}$. However, under System I, the \texttt{Mean-DSLDA} algorithm has no constraint on the number of local machines and does not require the sparsity of $\boldsymbol{\Sigma}^{-1}$.
    


	\subsection{Support Recovery}\label{support}
	Due to the group lasso property, $\widehat{\boldsymbol{\theta}}_k^{(t)}$ for $k=2,...,K$ have the same support set. We denote the common support set of estimator $\widehat{\boldsymbol{\theta}}_k^{(t)}$ by $\widehat{S}^{(t)}$ for $t =1,2,...,T$.
	\begin{theorem}\label{thm_supp}
		Suppose that conditions ($\mathbf{C}1$)--($\mathbf{C}5$) hold. With the same choices of the tuning parameter $\lambda_t$ as in Theorem \ref{thm_ell}, we have $\widehat{S}^{(t)}\subseteq S$ with probability tending to 1. Moreover, suppose that there exists a sufficiently large constant $C>0$ such that
		\begin{equation}\label{beta_min}
		\theta^{*}_{\min}\geq C\left\|\boldsymbol{\Sigma}_{SS}^{-1}\right\|_{\infty}\lambda_{t},
		\end{equation}
		then we have $\widehat{S}^{(t)} = S$ with probability tending to 1 for both \texttt{Mean-DSLDA} and \texttt{Median-DSLDA}.
	\end{theorem}
	Theorem \ref{thm_supp} guarantees the exact support recovery consistency of Algorithm \ref{algorithm} under the beta-min condition (\ref{beta_min}). Note that the beta-min condition becomes weaker as iteration round $t$ increases. And if $T$ satisfies (\ref{iter_num}), the beta-min condition of \texttt{Mean-DSLDA} in Algorithm \ref{algorithm} under System I will reduce to $\theta^{*}_{\min}\geq C\|\boldsymbol{\Sigma}_{SS}^{-1}\|_{\infty}\sqrt{\log p/N}$, which coincides with the order in \citet{wainwright2009sharp,mai2019multiclass}.
	
% 	\subsection{Misclassification Risk Bound}\label{mis}
% 	Given training samples and classification directions $\widehat{\boldsymbol{\theta}}_{k}^{(T)}$ for $k = 1,2,...,K$ where $\widehat{\boldsymbol{\theta}}_{0}^{(T)} = \boldsymbol{0}$, we define the conditional misclassification risk of Class $k$ by
% 	\begin{equation*}
% 	\small
% 	R_k = \mathbb{P}\left(\widehat{Y}\neq k\bigg| Y_{\text{new}} = k\right),
% 	\end{equation*}
% 	where $\widehat{Y}$ is defined as (\ref{d_rule}). The following theorem provides the upper bounds of the conditional misclassification risks $R_k, k=1,...,K$ for Mean-MSDA and Median-MSDA.
% 	\begin{theorem}\label{thm_mis}
% 		With the same choices of $\lambda_t$ in Theorem \ref{thm_ell} and assume the total iteration step $T$ satisfies (\ref{t4.3}), denote
% 		\begin{equation*}
% 		    \small
% 		    R_k^{*} = \sum_{d\neq k}\Phi\left(\frac{\frac{1}{2}\left(\boldsymbol{\mu}_{k}-\boldsymbol{\mu}_{d}\right)^{\mathrm{T}}\boldsymbol{\Sigma}^{-1}\left(\boldsymbol{\mu}_{d}-\boldsymbol{\mu}_{k}-\boldsymbol{\mu}_{1}\right)- \log \frac{\pi_{k}}{\pi_{d}}}{\sqrt{\left(\boldsymbol{\mu}_{k}-\boldsymbol{\mu}_{d}\right)^{\mathrm{T}}\boldsymbol{\Sigma}^{-1}\left(\boldsymbol{\mu}_{k}-\boldsymbol{\mu}_{d}\right)}}\right).
% 		\end{equation*}
% 		Then under Conditions ($\mathbf{C}1$), ($\mathbf{C}3$), ($\mathbf{C}4$) and ($\mathbf{C}5$), the conditional misclassification risk satisfies that
% 		\begin{equation*}
% 		\small
% 		R_k \leq \left\{
% 		\begin{array}{ll}
% 		 R_k^{*} + K\gamma\left(\sqrt{\frac{\log p}{N}}+\frac{\alpha}{\sqrt{n}}+\frac{1}{n}\right), & \text{System I}\\
% 		 R_k^{*} + K\gamma\left(\sqrt{\frac{\log p}{N}}\right), & \text{System II}
% 		\end{array}
% 		\right.
% 		\end{equation*}
% 		with probability tending to 1.
% 	\end{theorem}

	\section{Simulation Results}\label{experiment}
	In this section, we will investigate the numerical performance of the proposed Byzantine-tolerant distributed sparse LDA method on synthetic data. Three metrics are used to evaluate the performance of algorithms: the average $\ell_2$ estimation error $\sum_{k=2}^{K}\|\widehat{\boldsymbol{\theta}}_k-\boldsymbol{\theta}_k^{*}\|_2/K$, the misclassification rate and the $F_1$ score. The $F_1$ score is defined as
	$$
	F_{1}= 2 \cdot\frac{\text { precision } \cdot \text { recall }}{\text { precision }+\text { recall }},
	$$
	where $\text{precision} = |\widehat{S} \cap S|/|\widehat{S}|$ and $\text{recall} =  |\widehat{S} \cap S|/|S|$ and $\widehat{S}$ is the support set of $\widehat{\boldsymbol{\theta}}_k$. In the following experiments, we generate a training set of size $N$, a validation set of size $1,000$ and a test set of size $1,000$ independently, then randomly partition the training set into $M+1$ machines (including the master machine) evenly. The validation set is used to choose the tuning parameter and the test set is used to compute the misclassification rate. All the results are averaged over 200 independent trials.
	
	%For multi-class task, we compare our proposed algorithm's performance with MSDA method \citep{mai2019multiclass} using the centralized data, which is abbreviated as \texttt{C-MSDA}. For binary-class task, we compare our method with the debiased procedure in \citet{tian2017communication}, which is abbreviated as DC-LDA.
	



% \begin{table*}[tb]
% \centering
% \caption{The average evaluation metrics with dimensionality varying from 400 to 1,000. The total sample size is 10,000 and the number of machines (including the master machine) is 50. The fraction of Byzantine machines in System II is 0.1.}\label{table:vary_p}
% \begin{adjustbox}{width=1\textwidth}
% \begin{tabular}{@{}cccccccccccccc@{}}
% \toprule
% &              & \multicolumn{4}{c}{Misclassification Rate} & \multicolumn{4}{c}{$\ell_2$ Error} & \multicolumn{4}{c}{$F_1$ Score}   \\ \midrule
% & $p$            & 400       & 600      & 800      & 1000     & 400    & 600    & 800    & 1000  & 400   & 600   & 800   & 1000  \\
% \cmidrule(lr){3-6} \cmidrule(lr){7-10} \cmidrule(lr){11-14}
% Central Setting & \texttt{C-MSDA}       & 0.128     & 0.132    & 0.127    & 0.128    & 0.507  & 0.494  & 0.515  & 0.520  & 0.998 & 0.998 & 1     & 0.997 \\
% \multirow{2}{*}{System I}  & \texttt{Mean-DSLDA}   & 0.129     & 0.133    & 0.129    & 0.130    & 0.704  & 0.719  & 0.697  & 0.762 & 1     & 0.999 & 0.998 & 0.993 \\
%                           & \texttt{Median-DSLDA} & 0.130      & 0.134    & 0.129    & 0.131    & 0.720  & 0.725  & 0.703  & 0.754 & 1     & 0.998 & 0.998 & 0.980  \\
% \multirow{2}{*}{System II} & \texttt{Mean-DSLDA}   & 0.303     & 0.289    & 0.263    & 0.271    & 1.746  & 1.764  & 1.656  & 1.688 & 0.775 & 0.828 & 0.822 & 0.796 \\
%                           & \texttt{Median-DSLDA} & 0.130     & 0.133    & 0.130    & 0.130    & 0.737  & 0.747  & 0.718  & 0.747 & 1     & 1     & 0.998 & 0.990  \\ \bottomrule
% \end{tabular}
% \end{adjustbox}
% \end{table*}

	\subsection{Multi-class Task}
	The generation of synthetic data is as follows. Denote the label of each class by $k$ for $k = 1,2,...,K$. We set $K = 5$, $p = 600$, $\beta_{jk} = 1.6$ for $j=2k-1, 2k; k=1,...,K$  and $\beta_{jk} = 0$ otherwise. The covariance matrix is $\boldsymbol{\Sigma} = (\sigma_{ij})_{p\times p}$ where $\sigma_{ij} = 0.5^{|i-j|}$. Let $\boldsymbol{\mu}_k = \boldsymbol{\Sigma}\boldsymbol{\beta}_k$ and $\boldsymbol{\theta}_k^{*} = \boldsymbol{\beta}_k-\boldsymbol{\beta}_1$, then the support set $S = \{1,2,...,10\}$. In each machine, we generate Class $k$ samples independently from $\mathcal{N}(\boldsymbol{\mu}_k,\boldsymbol{\Sigma})$ for $k = 1,2,...,K$ with equal sample size. For Byzantine local machines, the label $Y$ is replaced by $6-Y$. The master machine is normal in our setting. For comparison, we also run MSDA algorithm \citep{mai2019multiclass} using the centralized data, which is abbreviated as \texttt{C-MSDA}. The results are recorded by taking averages over 100 independent trials.
	
\begin{figure*}[tb]
		\centering
		\subfigure[$\ell_2$ Error]{\includegraphics[width=0.32\linewidth]{plots/ell_iter_CR.eps}}
		\subfigure[Misclassification Rate]{\includegraphics[width=0.32\linewidth]{plots/error_iter_CR.eps}}
		\subfigure[$F_1$ Score]{\includegraphics[width=0.32\linewidth]{plots/Fscore_iter_CR.eps}}
	\caption{The evaluation metrics of \texttt{Mean-DSLDA}, \texttt{Median-DSLDA} and \texttt{C-MSDA} versus the number of iterations under two systems. The total sample size $N$ is 50,000 and the number of machines is 100. The dimension is $p = 100$. The fraction of Byzantine machines in System II is $\alpha = 0.2$.}
		\label{fig1}
\end{figure*}
%\subsubsection{The Numerical Convergence of Algorithm \ref{algorithm}}

\begin{table*}[tb]
\caption{The evaluation metrics of \texttt{Mean-DSLDA} and \texttt{Median-DSLDA} under different numbers of machines. The local sample size is fixed as $n = 200$ and the dimension is $p = 500$. The fraction of Byzantine machines in System II is $\alpha = 0.2$.}\label{table:vary_m}
\resizebox{\textwidth}{!}{\begin{tabular}{@{}ccccccccccccc@{}}
\toprule
& \multicolumn{4}{c}{L2 Error} 
& \multicolumn{4}{c}{Misclassification rate}        
& \multicolumn{4}{c}{F1 Score}  \\ \midrule
\multicolumn{1}{l}{} & 
\multicolumn{2}{c}{System I} & \multicolumn{2}{c}{System II} & \multicolumn{2}{c}{System I} & \multicolumn{2}{c}{System II} & \multicolumn{2}{c}{System I} & \multicolumn{2}{c}{System II} \\\cmidrule(lr){2-5}\cmidrule(lr){6-9}\cmidrule(lr){10-13}
$m$                    & Mean         & Median        & Mean                & Median  & Mean         & Median        & Mean          & Median        & Mean         & Median        & Mean          & Median        \\\cmidrule(lr){2-3}\cmidrule(lr){4-5}\cmidrule(lr){6-7}\cmidrule(lr){8-9}\cmidrule(lr){10-11}\cmidrule(lr){12-13}
100                  & 0.645        & 0.677         & \textgreater 10   & 0.683   & 0.135        & 0.135         & 0.615         & 0.135         & 1            & 1             & 0.278         & 1             \\
200                  & 0.698        & 0.709         & \textgreater 10   & 0.748   & 0.130        & 0.131         & 0.577         & 0.130         & 1            & 1             & 0.330         & 1             \\
300                  & 0.666        & 0.696         & \textgreater 10   & 0.700   & 0.129        & 0.130         & 0.619         & 0.130         & 1            & 1             & 0.275         & 1             \\
400                  & 0.663        & 0.666         & \textgreater 10   & 0.677   & 0.130        & 0.135         & 0.679         & 0.135         & 1            & 1             & 0.133         & 1             \\
500                  & 0.681        & 0.694         & \textgreater 10   & 0.716   & 0.128        & 0.128         & 0.706         & 0.127         & 1            & 1             & 0.155         & 1             \\ \bottomrule
\end{tabular}}
\end{table*}

\begin{table*}[tb]
\centering
\caption{The evaluation metrics and their standard deviations (in parentheses) of \texttt{Median-DSLDA} under System II. The total sample size $N$ is 20,000 and the number of machines (including the master machine) is 100. The dimension is $p = 500$.}\label{table_alpha}
\begin{tabular}{@{}cccccc@{}}
\toprule
& \multicolumn{1}{c}{$\alpha = 0$} & \multicolumn{1}{c}{$\alpha = 0.05$} & \multicolumn{1}{c}{$\alpha = 0.1$} & \multicolumn{1}{c}{$\alpha = 0.15$} & \multicolumn{1}{c}{$\alpha = 0.2$} \\ \midrule
$\ell_2$ Error             & 0.687 (0.237)         & 0.691 (0.243)            & 0.696 (0.181)           & 0.702 (0.189)            & 0.717 (0.198)           \\
Misclassification Rate & 0.134 (0.034)         & 0.134 (0.031)            & 0.134 (0.027)           & 0.133 (0.026)            & 0.134 (0.027)           \\
$F_1$ Score             & 0.995 (0.053)         & 0.996 (0.054)            & 0.996 (0.047)           & 0.996 (0.052)            & 0.996 (0.053)           \\ \bottomrule
\end{tabular}
\end{table*}
	
In the first experiment, we investigate the communication rounds of our proposed method needed to achieve numerical convergence. We set the total sample size $N$ as 50,000, and the number of machines (including the master machine) is 100. In System II, the fraction of Byzantine local machines is $\alpha = 0.2$. The trajectories of three evaluation metrics are presented in Figure \ref{fig1}, where the horizontal lines represent the results of \texttt{C-MSDA}. As we can see, all metrics of \texttt{Mean-DSLDA} diverge under System II, and \texttt{Median-DSLDA} shows robustness against Byzantine failure. In addition, the evaluation metrics of \texttt{Mean-DSLDA} (under System I) and \texttt{Median-DSLDA} converge numerically within 5 communication rounds, which corroborates the statement in Corollary \ref{cor_T}. The difference of $\ell_2$ error between our proposed two methods and \texttt{C-MSDA} is tiny, which indicates the performance loss caused by distributed estimation is negligible in two systems.

In the second experiment, we investigate the effect of the number of local machines on our proposed algorithm. The local sample size is fixed as $n = 200$, and the number of machines (including the master machine) varies from 100 to 500. We summarize the averaged results in Table \ref{table:vary_m}. It implies that both \texttt{Mean-DSLDA} (under System I) and \texttt{Median-DSLDA} are not sensitive to the number of machines since our proposed method can attain an optimal convergence rate without the constraint on the number of local machines.


In the third experiment, we run \texttt{Median-DSLDA} under System II with the fraction of Byzantine local machines varying from 0 to 0.2. The total sample size $N$ is fixed as 20,000, and the number of machines $M+1$ is 100. We report averages and standard deviations of three evaluation metrics after the 5-th iteration in Table \ref{table_alpha}. 
There is no significant performance deterioration for \texttt{Median-DSLDA} with the increasing fraction of Byzantine local machines.

%the $\ell_2$ estimation error increases slightly, meanwhile the misclassification rate and $F_1$ score are relatively stable. Compared with the results under System I,  with the existence of Byzantine local machines.




% \begin{table*}[h]
% \renewcommand\arraystretch{0.6}
% \centering
% \caption{The evaluation metrics and their standard deviations (in parentheses) of \texttt{C-MSDA}, \texttt{Mean-DSLDA} and \texttt{Median-DSLDA}. The number of machines (include the master machine) is fixed as $50$. \texttt{Mean-DSLDA} is performed under System I and \texttt{Median-DSLDA} is performed under System II with $\alpha = 0.1$.}\label{table_local_size}
% \begin{tabular}{@{}lccccc@{}}
% \toprule
% $N$               & 10,000          & 20,000          & 30,000           & 40,000           & 50,000          \\ \midrule
%                 & \multicolumn{5}{c}{$\ell_2$ error}                                   \\ \midrule
% \texttt{C-MSDA} & 0.649 (0.054) & 0.638 (0.047) & 0.638 (0.042) & 0.640 (0.044)  & 0.636 (0.041) \\
% \texttt{Mean-DSLDA}   & 0.681 (0.085) & 0.657 (0.059) & 0.651 (0.055) & 0.653 (0.051) & 0.641 (0.041) \\
% \texttt{Median-DSLDA} & 0.715 (0.188) & 0.667 (0.058) & 0.661 (0.059) & 0.655 (0.045) & 0.654 (0.055) \\\midrule
% {}       & \multicolumn{5}{c}{Misclassification Rate}                          \\\midrule
% \texttt{C-MSDA} & 0.132 (0.011) & 0.131 (0.011) & 0.132 (0.010)  & 0.130 (0.010)   & 0.132 (0.010)  \\
% \texttt{Mean-DSLDA}   & 0.133 (0.011) & 0.132 (0.011) & 0.133 (0.010)  & 0.130 (0.011)  & 0.132 (0.010)  \\
% \texttt{Median-DSLDA} & 0.135 (0.025) & 0.132 (0.010)  & 0.132 (0.010)  & 0.131 (0.011) & 0.132 (0.010)  \\\midrule
% {}       & \multicolumn{5}{c}{$F_1$ Score}                                      \\\midrule
% \texttt{C-MSDA} & 1 (0)         & 1 (0)         & 1 (0)         & 1 (0)         & 1 (0)         \\
% \texttt{Mean-DSLDA}   & 0.998 (0.013) & 1 (0)         & 1 (0)         & 1 (0)         & 1 (0)         \\
% \texttt{Median-DSLDA} & 0.991 (0.059) & 1 (0)         & 1 (0)         & 1 (0)         & 1 (0)         \\ \bottomrule
% \end{tabular}
% \end{table*}


%\subsubsection{The Effect of Local Sample Size}
%	In the second experiment, we verify the effect of local sample size to our proposed method. We set the number of machines $m+1$ as $50$ and very the local sample size from $200$ to $1,000$. The fraction of Byzantine local machines in System II $\alpha$ is $0.1$. Table \ref{table_local_size} summarizes averages and standard deviations of the results after the 5-th iteration over 200 independent replications. From the results, we observe that the $\ell_2$ estimation errors of \texttt{Mean-DSLDA} and \texttt{Median-DSLDA} decrease as the local sample size $n$ increasing, which coincides with our theoretical result in Theorem \ref{thm_ell}. As we expected, the $\ell_2$ estimation error of \texttt{Median-DSLDA} is slightly greater than that of \texttt{Mean-DSLDA} since the existence of Byzantine local machines. In spite of this, \texttt{Median-DSLDA} has nearly equivalent performance compared with \texttt{C-MSDA} and \texttt{Mean-DSLDA} from the results of misclassification rate and $F_1$ score.

\subsection{Binary-class Task}
For binary-class task, we compare our method with the debiased procedure in \citet{tian2017communication}, which is abbreviated as \texttt{DC-LPD}. For the fairness of comparison, we follow the same data generation regime in \texttt{DC-LPD}. From the results in Table \ref{table:binary}, it can be seen that our method has a better performance over \texttt{DC-LPD}, and the computational superiority is salient.

\begin{table*}[tb]
\centering
\caption{The average evaluation metrics and local computational time. The total sample size is 10,000 and the number of machines is 20. The dimension $p$ varies from 300 to 500.}\label{table:binary}
\begin{adjustbox}{width=0.95\textwidth}
\begin{tabular}{@{}ccccccccccccc@{}}
\toprule
& \multicolumn{3}{c}{Misclassification Rate} & \multicolumn{3}{c}{$\ell_2$ Error} & \multicolumn{3}{c}{$F_1$ Score} & \multicolumn{3}{c}{Running Time (s)} \\ \midrule
$p$                & 300          & 400          & 500          & 300       & 400       & 500      & 300     & 400     & 500     & 300            & 400            & 500            \\
\cmidrule(lr){2-4} \cmidrule(lr){5-7} \cmidrule(lr){8-10} \cmidrule(lr){11-13}
\texttt{Mean-DSLDA}       & 0.161        & 0.168        & 0.166        & 0.467     & 0.440      & 0.446    & 0.984   & 0.979   & 0.98    & 1.69          & 2.15          & 3.08          \\
\texttt{DC-LPD} & 0.166        & 0.171        & 0.170         & 1.190      & 1.161     & 1.239    & 0.714   & 0.733   & 0.722   & 37.79         & 92.03         & 203.84        \\ \bottomrule
\end{tabular}
\end{adjustbox}
\end{table*}

\section{Real Data}
	In this section, we use the MNIST dataset\footnote{\url{http://yann.lecun.com/exdb/mnist/}} and the ISOLET dataset\footnote{\url{https://archive.ics.uci.edu/ml/datasets/isolet}} to verify the performance of our proposed algorithm on real data. A brief description of the two datasets is given in Table~\ref{table3}. We randomly divide the training sets of the MNIST dataset and the ISOLET dataset among 20 and 10 machines, respectively (including the master machine), with an equal sample size on each machine. For Byzantine local machines, we use an adversarial setting similar to that of the synthetic data experiments. Then we run Algorithm~\ref{algorithm} with the number of iterations set to $T = 20$. The tuning parameter $\lambda_t$ in each iteration is selected by five-fold cross-validation.
\begin{table}[h]
\centering
\caption{Data description of MNIST and ISOLET.}\label{table3}
\begin{adjustbox}{width=0.48\textwidth}
\begin{tabular}{@{}cccccc@{}}
\toprule
Dataset & $K$ & Training size & Test size & Dimension & Label  \\ \midrule
MNIST  & 10  & 60,000        & 10,000    & 784       & 0-9    \\
ISOLET  & 26  & 6,238         & 1,559     & 617       & 1-26   \\ \bottomrule
\end{tabular}
\end{adjustbox}
\end{table}

	The experimental results are reported in Figure~\ref{fig r1}. As we can see, the test errors of our proposed methods decrease dramatically after the first communication round and then become stable in subsequent iterations. Under System I, the test error of \texttt{Mean-DSLDA} is lower than that of \texttt{Median-DSLDA}. Under System II, the classification performance of \texttt{Mean-DSLDA} is severely affected by the Byzantine machines. In contrast, the utility of \texttt{Median-DSLDA} does not degrade significantly under System II.
	
\section{Conclusions}\label{section 6}
	
In this paper, we proposed a communication-efficient distributed sparse linear discriminant analysis (\texttt{Mean-DSLDA}) algorithm under a normal distributed system and its Byzantine-tolerant version (\texttt{Median-DSLDA}) for the multi-classification problem. Compared with the existing distributed sparse LDA algorithm, our proposed algorithm substantially reduces the computational complexity of each local machine. To achieve the optimal statistical convergence rate, \texttt{Mean-DSLDA} does not require any restrictions on the number of local machines $M$, so it can be applied in a large-scale distributed system. Experiments on synthetic and real data corroborate the theoretical results and the superiority of \texttt{Median-DSLDA} against Byzantine failures.

\begin{figure}[ht!]
		\centering
		\subfigure[MNIST]{\includegraphics[width=0.7\linewidth]{plots/mnist_CR.eps}}
		\subfigure[ISOLET]{\includegraphics[width=0.7\linewidth]{plots/isolet_CR.eps}}
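		% NOTE(review): the Real Data section states that MNIST is split across 20 machines, whereas this caption says 100 -- confirm which value is correct and make the two agree.
		\caption{The test classification error versus the number of iterations on real data. The fraction of Byzantine machines under System II is $\alpha = 0.1$. The numbers of machines for MNIST and ISOLET are $100$ and $10$, respectively.}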
		\label{fig r1}
\end{figure}

\begin{acknowledgements} 
    Weidong Liu's research is supported by the National Program on Key Basic Research Project (2018AAA0100704), NSFC Grant Nos.\ 11825104 and 11690013, the Youth Talent Support Program, and the Shanghai Municipal Science and Technology Major Project (2021SHZDZX0102). Xiaojun Mao's research is supported by NSFC Grant Nos.\ 12001109 and 92046021 and the Science and Technology Commission of Shanghai Municipality grant 20dz1200600.
\end{acknowledgements}

	\bibliography{docbib}

\end{document}
