\documentclass[11pt]{article}
\usepackage[english]{babel}
\usepackage[utf8x]{inputenc}
\usepackage[colorinlistoftodos]{todonotes}
\usepackage{lipsum,pgffor}
\setcounter{tocdepth}{2}% Include up to \subsection in ToC
\usepackage{subfigure}
%\usepackage{subcaption}
\usepackage{adjustbox}
\usepackage{caption}
\usepackage{amsmath}
\usepackage{amsthm}
\usepackage{booktabs}

\usepackage{hyperref}       % hyperlinks
\hypersetup{
	colorlinks=true,
	linkcolor=red,
	filecolor=magenta,      
	urlcolor=cyan,
	citecolor = blue
}
\usepackage{url}            % simple URL typesettings
\usepackage{booktabs}       % professional-quality tables
\usepackage{amsfonts}       % blackboard math symbols
\usepackage{nicefrac}       % compact symbols for 1/2, etc.
\usepackage{microtype}      % microtypography
\usepackage{lipsum}		% Can be removed after putting your text content
\usepackage{amsmath,graphicx,amssymb}
\usepackage{dsfont}
\usepackage[linesnumbered,boxed,ruled,commentsnumbered]{algorithm2e}
\usepackage{float}
%\renewcommand{\theequation}{\arabic{section}.\arabic{equation}}
% Number equations as <section>.<equation>; the equation counter restarts at
% every \section.
\numberwithin{equation}{section}
% Theorem-like environments for theorems, lemmas, assumptions, etc.
% Each environment below owns its own counter, reset at every \section, so
% results are numbered <section>.<n> independently per kind (a theorem and a
% lemma in the same section can both carry the number <section>.1).
\newtheorem{theorem}{Theorem}[section]
\newtheorem{lemma}{Lemma}[section]
\newtheorem{assumption}{Assumption}[section]
\newtheorem{proposition}{Proposition}[section]
\newtheorem{remark}{Remark}[section]
\newtheorem{corollary}{Corollary}[section]
\newtheorem{definition}{Definition}[section]

% set page geometry
% Float placement limits: floats at the top of a page may occupy up to 95% of
% it, and a page carrying floats needs at least 5% text.
\renewcommand{\topfraction}{0.95}   % let figure take up nearly whole page
\renewcommand{\textfraction}{0.05}  % let figure take up nearly whole page

% Specify the dimensions of each page
% (lengths are assigned directly; "true" units are not affected by \magnification)

\oddsidemargin .25in    %   Note \oddsidemargin = \evensidemargin
\evensidemargin .25in
% Margin-note column is kept very narrow (0.07in).
\marginparwidth 0.07 true in
%\marginparwidth 0.75 true in
%\topmargin 0 true pt           % Nominal distance from top of page to top of
%\topmargin 0.125in
% Pull the header up by 0.5in, then widen the header-to-text gap by 0.25in.
\topmargin -0.5in
\addtolength{\headsep}{0.25in}
\textheight 8.5 true in       % Height of text (including footnotes & figures)
\textwidth 6.0 true in        % Width of text line.
% Penalty 10000 forbids page breaks that leave a single widow or club (orphan) line.
\widowpenalty=10000
\clubpenalty=10000

% float placement
\usepackage{natbib}
\bibliographystyle{abbrvnat}
\usepackage{authblk}

% math symbol
% Naming convention: prefix "h" = \widehat, "t" = \widetilde, "b" = plain bold;
% all vector/matrix symbols are set with \boldsymbol.
\newcommand{\hb}{\widehat{\boldsymbol{b}}}
\newcommand{\tb}{\widetilde{\boldsymbol{b}}}
\newcommand{\bc}{\boldsymbol{c}}
\newcommand{\hd}{\widehat{\boldsymbol{d}}}
\newcommand{\td}{\widetilde{\boldsymbol{d}}}
\newcommand{\btheta}{\boldsymbol{\theta}}
\newcommand{\htheta}{\widehat{\boldsymbol{\theta}}}
\newcommand{\ttheta}{\widetilde{\boldsymbol{\theta}}}
\newcommand{\bSigma}{\boldsymbol{\Sigma}}
\newcommand{\hSigma}{\widehat{\boldsymbol{\Sigma}}}
\newcommand{\bmu}{\boldsymbol{\mu}}
\newcommand{\hmu}{\widehat{\boldsymbol{\mu}}}
\newcommand{\tmu}{\widetilde{\boldsymbol{\mu}}}
% Upright "T", used as a superscript to mark transposition (x^{\T}).
\newcommand{\T}{\mathrm{T}}
% NOTE: \R already includes the exponent, i.e. it expands to R^p, not R.
\newcommand{\R}{\mathbb{R}^p}
\newcommand{\E}{\mathbb{E}}
\newcommand{\Prob}{\mathbb{P}}
\newcommand{\mH}{\mathcal{H}}
% NOTE: \bx is a bold CAPITAL X (the random vector), not a lowercase x.
\newcommand{\bx}{\boldsymbol{X}}
% Norm helpers: the plain variants use fixed-size \| delimiters; the "LR"
% variants use \left\| ... \right\| so the delimiters grow with the argument.
% inf norm
\newcommand{\infnorm}[1]{\|#1\|_{\infty}}
\newcommand{\LRinfnorm}[1]{\left\|#1\right\|_{\infty}}
%ell 2 norm
\newcommand{\twonorm}[1]{\|#1\|_{2}}
\newcommand{\LRtwonorm}[1]{\left\|#1\right\|_{2}}
%ell 1 norm
\newcommand{\onenorm}[1]{\|#1\|_{1}}
\newcommand{\LRonenorm}[1]{\left\|#1\right\|_{1}}

% Inline reviewer notes: \xj{text} prints a bold red "[XJ: text]" and
% \byj{text} prints a bold blue "[BYJ: text]".
% NOTE(review): \textcolor needs a colour package; none is loaded explicitly in
% this preamble, so it is presumably pulled in by todonotes -- confirm.
\newcommand{\xj}[1]{\textcolor{red}{\textbf{[XJ: #1]}}}
\newcommand{\byj}[1]{\textcolor{blue}{\textbf{[BYJ: #1]}}}

\title{Supplementary Material for ``Byzantine-tolerant Distributed Multiclass Sparse Discriminant Analysis''}
\author{}
\date{}

\begin{document}
	\maketitle
	\bigskip
	\begin{abstract}
	This document provides supplementary material to the article ``Byzantine-tolerant Distributed Multiclass Sparse Discriminant Analysis'' written by the same authors.
\end{abstract}
	
	\section{Proofs of Main Results}
	\subsection{Proof of Theorem 3.1}
	\begin{theorem}\label{theorem1}
		Let $\max_{2\leq k\leq K}\|\widehat{\boldsymbol{\theta}}_{k}^{(0)}-\boldsymbol{\theta}_{k}^{*}\|_1 = O_{\mathbb{P}}(a_n)$ and choose some sufficiently large positive constant $\eta_1$ such that
		\begin{equation*}
		\lambda_{1} = \left\{\begin{array}{ll}
		\eta_1\left(\sqrt{\frac{\log p}{N}} + a_n\sqrt{\frac{\log p}{n}}\right),& \text{System I}\\
		\eta_1\left(\sqrt{\frac{\log p}{N}} + a_n\sqrt{\frac{\log p}{n}}+\frac{\alpha}{\sqrt{n}}+\frac{1}{n}\right), & \text{System II}
		\end{array}
		\right.
		\end{equation*}
		under conditions $(\mathbf{C}1)$, $(\mathbf{C}3)$ and $(\mathbf{C}5)$, we have
		\begin{equation}\label{t2.1}
		\max_{2\leq k \leq K}\left\|\widehat{\boldsymbol{\theta}}_{k}^{(1)}-\boldsymbol{\theta}_{k}^{*}\right\|_2=O_{\mathbb{P}}(\sqrt{s}\lambda_1),
		\end{equation}
		and
		\begin{equation}\label{t2.2}
		\max_{2\leq k \leq K}\left\|\widehat{\boldsymbol{\theta}}_{k}^{(1)}-\boldsymbol{\theta}_{k}^{*}\right\|_1=O_{\mathbb{P}}(s\lambda_1).
		\end{equation}
	\end{theorem}
	Note that the initial error for the $t$-th iteration is $\max_{2\leq k\leq K}\|\widehat{\boldsymbol{\theta}}_{k}^{(t-1)}-\boldsymbol{\theta}_{k}^{*}\|_1$, obtained by plugging in the initial estimator $\widehat{\boldsymbol{\theta}}_{k}^{(t-1)}$. The $\ell_1$ and $\ell_2$ error bounds of $\widehat{\boldsymbol{\theta}}_{k}^{(t)}$ then follow easily by induction from \eqref{t2.1} and \eqref{t2.2} in Theorem~\ref{theorem1}. Next, to prove Theorem~\ref{theorem1}, we first present several useful lemmas.
	\begin{lemma}\label{lemma1}
		For $\boldsymbol{x}_k\in \R, k=1,...,K-1$ such that 
		$$
		\sum_{k=1}^{K-1}\|\boldsymbol{x}_k\|_1 \leq 4\sqrt{s(K-1)}\left(\sum_{k=1}^{K-1}\|\boldsymbol{x}_k\|_2^2\right)^{1/2},
		$$ 
		we have
		$$
		\sum_{k=1}^{K-1}\boldsymbol{x}_{k}^{\T}\widehat{\boldsymbol{\Sigma}}_{(0)}\boldsymbol{x}_{k} \geq L\sum_{k=1}^{K-1}\|\boldsymbol{x}_k\|_2^2,
		$$
		holds with probability tending to 1.
	\end{lemma}
	\begin{lemma}\label{inf_lemma1}
		Let $\|\htheta_k^{(0)}-\btheta_k^{*}\|_1 = O_{\mathbb{P}}(a_n)$; then for $k=2,\ldots,K$ we have
		\begin{equation*}
		\left\|\widehat{\boldsymbol{\Sigma}}_{(0)}(\htheta_k^{(0)}-\btheta_k^{*})-\hb_{k,0}\right\|_{\infty} = O_{\mathbb{P}}\left(\sqrt{\frac{\log p}{N}} + a_n\sqrt{\frac{\log p}{n}}\right),
		\end{equation*}
		under System I.
	\end{lemma}
	\begin{lemma}\label{inf_lemma2}
		Let $\|\htheta_k^{(0)}-\btheta_k^{*}\|_1 = O_{\mathbb{P}}(a_n)$; then for $k=2,\ldots,K$ we have
		\begin{equation*}
		\left\|\widehat{\boldsymbol{\Sigma}}_{(0)}(\htheta_k^{(0)}-\btheta_k^{*})-\tb_{k,0}\right\|_{\infty} = O_{\mathbb{P}}\left(\sqrt{\frac{\log p}{N}} + a_n\sqrt{\frac{\log p}{n}}+\frac{\alpha}{\sqrt{n}}+\frac{1}{n}\right),
		\end{equation*}
		under System II.
	\end{lemma}
	%\xj{$k$ missed on the first term?}
	The proofs of the above lemmas are relegated to Section~\ref{appendix B}.
	\begin{proof}[Proof of Theorem 3.1]
		For notational simplicity, we use $\htheta_{k}$ to denote $\htheta_{k}^{(1)}$. By the optimality of $(\widehat{\boldsymbol{\theta}}_2,\cdots,\widehat{\boldsymbol{\theta}}_K)$, we have 
		\begin{align*}
		&\frac{1}{2}\widehat{\boldsymbol{\theta}}_{k}^{\T}\widehat{\boldsymbol{\Sigma}}_{(0)}\htheta_{k}-\left(\hSigma_{(0)}\htheta_{k}^{(0)}-\boldsymbol{b}_{k,0}\right)^{\T}\htheta_{k}+\lambda_{1}\sum_{j=1}^{p}\left(\sum_{l=2}^{K}\htheta_{l,j}^2\right)^{1/2}\\
		\leq &\frac{1}{2}\btheta_{k}^{*\T}\widehat{\boldsymbol{\Sigma}}_{(0)}\btheta_{k}^{*}-\left(\hSigma_{(0)}\htheta_{k}^{(0)}-\boldsymbol{b}_{k,0}\right)^{\T}\btheta_{k}^{*}+\lambda_{1}\sum_{j=1}^{p}\left(\sum_{l\neq k}\htheta_{l,j}^2+(\btheta_{k,j}^{*})^2\right)^{1/2}.
		\end{align*}
		By rearranging the terms above, we have
		\begin{equation}\label{A-1}
		\begin{aligned}
		&\frac{1}{2}\left(\htheta_{k}-\boldsymbol{\theta}_{k}^{*}\right)^{\T}\widehat{\boldsymbol{\Sigma}}_{(0)}\left(\widehat{\boldsymbol{\theta}}_k-\boldsymbol{\theta}^{*}_k\right)\\
		\leq &\left(\widehat{\boldsymbol{\Sigma}}_{(0)}(\widehat{\boldsymbol{\theta}}_{k}^{(0)}-\boldsymbol{\theta}^{*}_{k})-\boldsymbol{b}_{k,0}\right)^{\T}\left(\widehat{\boldsymbol{\theta}}_k-\boldsymbol{\theta}^{*}_k\right)+\lambda_1\left\|\boldsymbol{\theta}_k^{*}-\widehat{\boldsymbol{\theta}}_k\right\|_1.
		\end{aligned}
		\end{equation}
		According to Lemmas \ref{inf_lemma1} and \ref{inf_lemma2},
		\begin{equation*}
		\left\|\widehat{\boldsymbol{\Sigma}}_{(0)}(\htheta_k^{(0)}-\btheta_k^{*})-\boldsymbol{b}_{k,0}\right\|_{\infty} = \left\{
		\begin{array}{ll}
		O_{\mathbb{P}}\left(\sqrt{\frac{\log p}{N}} + a_n\sqrt{\frac{\log p}{n}}\right),& \text{System I}\\
		O_{\mathbb{P}}\left(\sqrt{\frac{\log p}{N}} + a_n\sqrt{\frac{\log p}{n}}+\frac{\alpha}{\sqrt{n}}+\frac{1}{n}\right), & \text{System II.}
		\end{array}
		\right.
		\end{equation*}
		%\xj{$k$ missed on the first term?}
		Thus $\|\widehat{\boldsymbol{\Sigma}}_{(0)}(\htheta_k^{(0)}-\boldsymbol{\theta}_{k}^{*})-\boldsymbol{b}_{k,0}\|_{\infty}\leq \lambda_1/2$ holds with high probability for some sufficiently large positive constant $\eta_1$ under both System I and II. Then (\ref{A-1}) indicates that
		\begin{equation}\label{A1-1}
		\frac{1}{2}\sum_{k=2}^{K}\left(\htheta_{k}-\boldsymbol{\theta}_{k}^{*}\right)^{\T}\hSigma_{(0)}\left(\htheta_{k}-\boldsymbol{\theta}_{k}^{*}\right)\leq \frac{3\lambda_1}{2}\sum_{k=2}^{K}\left\|\boldsymbol{\theta}_{k}^{*}-\htheta_{k}\right\|_1.
		\end{equation}
		By the optimality of $(\widehat{\boldsymbol{\theta}}_2,\cdots,\widehat{\boldsymbol{\theta}}_K)$, we can also obtain that
		\begin{equation}\label{global_opt}
		 \begin{aligned}
		&\frac{1}{2}\sum_{k=2}^K\left(\htheta_{k}-\boldsymbol{\theta}_{k}^{*}\right)^{\T}\widehat{\boldsymbol{\Sigma}}_{(0)}\left(\widehat{\boldsymbol{\theta}}_k-\boldsymbol{\theta}_{k}^{*}\right)+\lambda_1\sum_{j=1}^p\|\htheta_{(j)}\|_2\\
		\leq &\sum_{k=2}^K\left(\widehat{\boldsymbol{\Sigma}}_{(0)}(\widehat{\boldsymbol{\theta}}^{(0)}_k-\boldsymbol{\theta}_{k}^{*})-\boldsymbol{b}_{k,0}\right)^{\T}\left(\widehat{\boldsymbol{\theta}}_k-\btheta_{k}^{*}\right)+\lambda_1\sum_{j=1}^p\|\btheta_{(j)}^{*}\|_2.
		\end{aligned}
		\end{equation}
		Let $\bc_k = \widehat{\boldsymbol{\Sigma}}_{(0)}(\widehat{\boldsymbol{\theta}}_{k}^{(0)}-\boldsymbol{\theta}_{k}^{*})-\boldsymbol{b}_{k,0}$ and $\bc_{(j)} = (\bc_{2,j},...,\bc_{K,j})^{\T}$, then it follows from (\ref{global_opt}) and $\infnorm{\bc_k}\leq \lambda_1/2$ that
		\begin{align*}
		\lambda_1\sum_{j=1}^p\|\htheta_{(j)}\|_2 &\leq \sum_{j=1}^p\bc_{(j)}^{\T}(\htheta_{(j)}-\btheta_{(j)}^{*})+\lambda_1\sum_{j=1}^p\|\btheta_{(j)}^{*}\|_2\\
		&\leq \sum_{j=1}^p\|\bc_{(j)}\|_2\|\htheta_{(j)}-\btheta_{(j)}^{*}\|_2+\lambda_1\sum_{j=1}^p\|\btheta_{(j)}^{*}\|_2\\
		&\leq \max_{j}\|\bc_{(j)}\|_2\sum_{j=1}^p\|\htheta_{(j)}-\btheta_{(j)}^{*}\|_2+\lambda_1\sum_{j=1}^p\|\btheta_{(j)}^{*}\|_2\\
		&\leq \sqrt{K-1}\max_{2\leq k\leq K}\|\bc_k\|_{\infty}\sum_{j=1}^p\|\htheta_{(j)}-\btheta_{(j)}^{*}\|_2+\lambda_1\sum_{j=1}^p\|\btheta_{(j)}^{*}\|_2\\
		&\leq \frac{\lambda_{1}}{2}\sum_{j=1}^p\|\htheta_{(j)}-\btheta_{(j)}^{*}\|_2+\lambda_1\sum_{j=1}^p\|\btheta_{(j)}^{*}\|_2,
		\end{align*}
		which implies that $\sum_{j\in S^c}\|\htheta_{(j)}-\btheta_{(j)}^{*}\|_2\leq 3\sum_{j\in S}\|\htheta_{(j)}-\btheta_{(j)}^{*}\|_2$. In conjunction with the fact that
		\begin{align*}
		\left(\sum_{j \in S}\|\htheta_{(j)}-\btheta_{(j)}^{*}\|_2\right)^2\leq s\sum_{j\in S}\|\htheta_{(j)}-\btheta_{(j)}^{*}\|_2^2,
		\end{align*}
		we have
		\begin{align*}
		\sum_{j\in S^c}\|\htheta_{(j)}-\btheta_{(j)}^{*}\|_1 &\leq \sqrt{K-1}\sum_{j\in S^c}\|\htheta_{(j)}-\btheta_{(j)}^{*}\|_2 \leq 3\sqrt{K-1}\sum_{j\in S}\|\htheta_{(j)}-\btheta_{(j)}^{*}\|_2\\
		&\leq 3\sqrt{s(K-1)}\left(\sum_{j\in S}\|\htheta_{(j)}-\btheta_{(j)}^{*}\|_2^2\right)^{1/2}\\
		&\leq 3\sqrt{s(K-1)}\left(\sum_{k=2}^K\|\htheta_{k,S}^{(1)}-\btheta_{k,S}^{*}\|_2^2\right)^{1/2}.
		\end{align*}
		Similarly, we have $\sum_{j\in S}\|\htheta_{(j)}-\btheta_{(j)}^{*}\|_1\leq \sqrt{s(K-1)}(\sum_{k=2}^K\|\htheta_{k,S}^{(1)}-\btheta_{k,S}^{*}\|_2^2)^{1/2}$. It implies that
		\begin{equation}\label{A1-2}
		\sum_{k=2}^{K}\left\|\boldsymbol{\theta}_{k}^{*}-\htheta_{k}\right\|_1 = \sum_{j=1}^p\|\htheta_{(j)}-\btheta_{(j)}^{*}\|_1\leq 4\sqrt{s(K-1)}\left(\sum_{k=2}^K\|\htheta_{k}^{(1)}-\btheta_{k}^{*}\|_2^2\right)^{1/2}.
		\end{equation}
		By applying Lemma \ref{lemma1}, we have
		\begin{equation}\label{A2}
		\frac{1}{2}\sum_{k=2}^{K}\left(\htheta_{k}-\boldsymbol{\theta}_{k}^{*}\right)^{\T}\hSigma_{(0)}\left(\htheta_{k}-\boldsymbol{\theta}_{k}^{*}\right)\geq L\sum_{k=2}^{K}\left\|\boldsymbol{\theta}_{k}^{*}-\htheta_{k}\right\|_2^2.
		\end{equation}
		Combining the inequalities (\ref{A1-1}), (\ref{A1-2}) and (\ref{A2}), we have
		$$
		\left(\sum_{k=2}^{K}\left\|\boldsymbol{\theta}_{k}^{*}-\htheta_{k}\right\|_2^2\right)^{1/2}\leq \frac{6\sqrt{s(K-1)}}{L}\lambda_1,
		$$
		and
		$$		
		\sum_{k=2}^K\left\|\boldsymbol{\theta}_{k}^{*}-\htheta_{k}\right\|_1\leq \frac{24s(K-1)}{L}\lambda_1.
		$$
		It also indicates that 
		$$
		\max_{2\leq k\leq K}\|\boldsymbol{\theta}_{k}^{*}-\htheta_{k}\|_2 = O_{\Prob}(\sqrt{s}\lambda_{1}),
		$$
		and
		$$
		\max_{2\leq k\leq K}\|\boldsymbol{\theta}_{k}^{*}-\htheta_{k}\|_1 = O_{\Prob}(s\lambda_{1}).
		$$
	\end{proof}
	\subsection{Proof of Theorem 3.2}
	It suffices to prove Theorem \ref{theorem3} in the following.
	\begin{theorem}\label{theorem3}
		 Under conditions $(\mathbf{C}1)$-$(\mathbf{C}5)$, with the same choice of $\lambda_1$ as in Theorem \ref{theorem1}, we have $\widehat{S}^{(1)}\subseteq S$ holds with probability tending to 1 and $\widehat{\boldsymbol{\theta}}_{k}^{(1)}$ satisfies that
		\begin{equation}
		\left\|\widehat{\boldsymbol{\theta}}_{k}^{(1)}-\boldsymbol{\theta}_{k}^{*}\right\|_{\infty} = O_{\mathbb{P}}\left(\left\|\boldsymbol{\Sigma}_{SS}^{-1}\right\|_{\infty}\lambda_1\right).
		\end{equation}
		Moreover, suppose that there exists a sufficiently large constant $C>0$ such that
		\begin{equation}
		\theta^{*}_{\min}\geq C\left\|\boldsymbol{\Sigma}_{SS}^{-1}\right\|_{\infty}\lambda_1,
		\end{equation}
		we have $\widehat{S}^{(1)} = S$ with probability tending to 1.
	\end{theorem}
	
	\begin{lemma}\label{lemma3}
		By partitioning $\boldsymbol{\Sigma}$ as
		$$
		\boldsymbol{\Sigma} = \left(\begin{matrix}
		\boldsymbol{\Sigma}_{SS} & \boldsymbol{\Sigma}_{SS^c}\\
		\boldsymbol{\Sigma}_{S^cS} & \boldsymbol{\Sigma}_{S^cS^c}
		\end{matrix}\right),
		$$
		and $\boldsymbol{\mu}_k$ according to sets $S$ and $S^c$ for $k = 2,...,K$ respectively, we have
		\begin{equation}
		\boldsymbol{\theta}_{k,S}^{*} = \boldsymbol{\Sigma}_{SS}^{-1}\left(\boldsymbol{\mu}_k - \boldsymbol{\mu}_1\right)_S,
		\end{equation}
		and
		\begin{equation}
		\left(\boldsymbol{\mu}_k - \boldsymbol{\mu}_1\right)_{S^c} = \boldsymbol{\Sigma}_{S^cS}\boldsymbol{\Sigma}_{SS}^{-1}\left(\boldsymbol{\mu}_k - \boldsymbol{\mu}_1\right)_S.
		\end{equation}
	\end{lemma}
	\begin{proof}[Proof of Theorem \ref{theorem3}]
		Here we only prove the results in System II and the proof for System I is similar. First we define the oracle sub-problem as
		\begin{equation}\label{A4}
		\widehat{\boldsymbol{\theta}}^{o}_{S} = \arg \min_{\boldsymbol{\theta}_{k,S^c}=\boldsymbol{0}}\sum_{k=2}^{K}\left\{\frac{1}{2} \boldsymbol{\theta}^{\T}_{k} \widehat{\boldsymbol{\Sigma}}_{(0)} \boldsymbol{\theta}_{k}-\tb_{k,0}^{\T} \boldsymbol{\theta}_{k}\right\}+\lambda_1\sum_{j\in S}\left\|\btheta_{(j)}\right\|_2.
		\end{equation}
		Once we show $\widehat{\boldsymbol{\theta}}_{k}^{(1)} = (\widehat{\boldsymbol{\theta}}^{o}_{k,S}, \boldsymbol{0})$ is the solution to (7), it is clear that $\widehat{S}^{(1)}\subseteq S$. According to the KKT condition, for any $j\in S$,
		\begin{equation}\label{A5}
		\begin{aligned}
		\left(\begin{matrix}
		\left(\widehat{\boldsymbol{\Sigma}}_{(0),SS}\widehat{\boldsymbol{\theta}}_{2,S}^{o} - (\widetilde{\boldsymbol{b}}_{2,0})_S\right)_j\\
		\vdots\\
		\left(\widehat{\boldsymbol{\Sigma}}_{(0),SS}\widehat{\boldsymbol{\theta}}_{K,S}^{o} - (\widetilde{\boldsymbol{b}}_{K,0})_S\right)_j
		\end{matrix}\right) + \lambda_{1}\boldsymbol{Z}_{j} = 0,
		\end{aligned}
		\end{equation}
		and for any $j\notin S$,
		\begin{equation}\label{A6}
		\begin{aligned}
		\left(\begin{matrix}
		\left(\widehat{\boldsymbol{\Sigma}}_{(0),S^cS}\widehat{\boldsymbol{\theta}}_{2,S}^{o} - (\widetilde{\boldsymbol{b}}_{2,0})_{S^c}\right)_j\\
		\vdots\\
		\left(\widehat{\boldsymbol{\Sigma}}_{(0),S^cS}\widehat{\boldsymbol{\theta}}_{K,S}^{o} - (\widetilde{\boldsymbol{b}}_{K,0})_{S^c}\right)_j
		\end{matrix}\right) + \lambda_{1}\boldsymbol{Z}_{j} = 0,
		\end{aligned}
		\end{equation}
		where $\boldsymbol{Z}_j\in \mathbb{R}^{K-1}$ is a subgradient of $\twonorm{\btheta_{(j)}}$ evaluated at $\htheta_{(j)}$.
		It suffices to show that
		\begin{equation}\label{A7}
		\lambda_1^{-1}\max_{j \in S^c}\left(\sum_{k=2}^K\left\{\left(\widehat{\boldsymbol{\Sigma}}_{(0),S^cS}\widehat{\boldsymbol{\theta}}^{o}_{k,S}\right)_j-(\widetilde{\boldsymbol{b}}_{k,0})_j\right\}^2\right)^{1/2}<1,
		\end{equation}
		holds with probability tending to 1.
		From equation (\ref{A5}), we have $\widehat{\boldsymbol{\theta}}_{k,S}^{o} = \widehat{\boldsymbol{\Sigma}}_{(0),SS}^{-1}((\widetilde{\boldsymbol{b}}_{k,0})_S-\lambda_{1}\widetilde{\boldsymbol{Z}}_{k,S})$ where $\widetilde{\boldsymbol{Z}}_{k,S} = (Z_{j,k}: j\in S) \in \mathbb{R}^{s}$ and $\sum_{k=2}^K(Z_{j,k})^2 = 1$. Note that
		\begin{align*}
		&\widehat{\boldsymbol{\Sigma}}_{(0),S^cS}\widehat{\boldsymbol{\theta}}^{o}_{k,S}-(\widetilde{\boldsymbol{b}}_{k,0})_{S^c} \\
		=& \widehat{\boldsymbol{\Sigma}}_{(0),S^cS}\widehat{\boldsymbol{\Sigma}}_{(0),SS}^{-1}\left((\widetilde{\boldsymbol{b}}_{k,0})_S-\lambda_{1}\widetilde{\boldsymbol{Z}}_{k,S}\right)-(\widetilde{\boldsymbol{b}}_{k,0})_{S^c}\\
		=&\widehat{\boldsymbol{\Sigma}}_{(0),S^cS}\widehat{\boldsymbol{\Sigma}}_{(0),SS}^{-1}\left\{\left(\widehat{\boldsymbol{\Sigma}}_{(0),SS}-\boldsymbol{\Sigma}_{SS}\right)\left(\widehat{\boldsymbol{\theta}}_{k,S}^{(0)}-\boldsymbol{\theta}_{k,S}^{*} \right)+ \boldsymbol{\Sigma}_{SS}\widehat{\boldsymbol{\theta}}_{k,S}^{(0)}-(\widetilde{\boldsymbol{d}}_{k,0})_S+\left(\widetilde{\boldsymbol{\mu}}_k-\widetilde{\boldsymbol{\mu}}_1\right)_S- \boldsymbol{\Sigma}_{SS}\boldsymbol{\theta}_{k,S}^{*}\right\}\\
		&-\left(\widehat{\boldsymbol{\Sigma}}_{(0),S^cS}-\boldsymbol{\Sigma}_{S^cS}\right)\left(\widehat{\boldsymbol{\theta}}_{k,S}^{(0)}-\boldsymbol{\theta}_{k,S}^{*} \right)-\boldsymbol{\Sigma}_{S^cS}\widehat{\boldsymbol{\theta}}_{k,S}^{(0)}+(\widetilde{\boldsymbol{d}}_{k,0})_{S^c}+\boldsymbol{\Sigma}_{S^cS}\boldsymbol{\theta}_{k,S}^{*}-\left(\widetilde{\boldsymbol{\mu}}_k-\widetilde{\boldsymbol{\mu}}_1\right)_{S^c}\\
		&-\lambda_{1}\widehat{\boldsymbol{\Sigma}}_{(0),S^cS}\widehat{\boldsymbol{\Sigma}}_{(0),SS}^{-1}\widetilde{\boldsymbol{Z}}_{k,S}.
		\end{align*}
		We denote 
		$$
		I_1 = \widehat{\boldsymbol{\Sigma}}_{(0),S^cS}\widehat{\boldsymbol{\Sigma}}_{(0),SS}^{-1}\left\{\left(\widehat{\boldsymbol{\Sigma}}_{(0),SS}-\boldsymbol{\Sigma}_{SS}\right)\left(\widehat{\boldsymbol{\theta}}_{k,S}^{(0)}-\boldsymbol{\theta}_{k,S}^{*} \right)+ \boldsymbol{\Sigma}_{SS}\widehat{\boldsymbol{\theta}}_{k,S}^{(0)}-(\widetilde{\boldsymbol{d}}_{k,0})_S+\left(\widetilde{\boldsymbol{\mu}}_k-\widetilde{\boldsymbol{\mu}}_1\right)_S- \boldsymbol{\Sigma}_{SS}\boldsymbol{\theta}_{k,S}^{*}\right\},
		$$
		and
		$$
		I_2 = \left(\widehat{\boldsymbol{\Sigma}}_{(0),S^cS}-\boldsymbol{\Sigma}_{S^cS}\right)\left(\widehat{\boldsymbol{\theta}}_{k,S}^{(0)}-\boldsymbol{\theta}_{k,S}^{*} \right)+\boldsymbol{\Sigma}_{S^cS}\widehat{\boldsymbol{\theta}}_{k,S}^{(0)}-(\widetilde{\boldsymbol{d}}_{k,0})_{S^c}-\boldsymbol{\Sigma}_{S^cS}\boldsymbol{\theta}_{k,S}^{*}+\left(\widetilde{\boldsymbol{\mu}}_k-\widetilde{\boldsymbol{\mu}}_1\right)_{S^c}.
		$$
		Observe that
		\begin{align*}
		\widehat{\boldsymbol{\Sigma}}_{(0),S^cS}\widehat{\boldsymbol{\Sigma}}_{(0),SS}^{-1} - \boldsymbol{\Sigma}_{S^cS}\boldsymbol{\Sigma}_{SS}^{-1}&=\left(\widehat{\boldsymbol{\Sigma}}_{(0),S^cS}-\boldsymbol{\Sigma}_{S^cS}\right)\left(\widehat{\boldsymbol{\Sigma}}_{(0),SS}^{-1}-\boldsymbol{\Sigma}_{SS}^{-1}\right)+\boldsymbol{\Sigma}_{S^cS}\left(\widehat{\boldsymbol{\Sigma}}_{(0),SS}^{-1}-\boldsymbol{\Sigma}_{SS}^{-1}\right)\\
		&+\left(\widehat{\boldsymbol{\Sigma}}_{(0),S^cS}-\boldsymbol{\Sigma}_{S^cS}\right)\boldsymbol{\Sigma}_{SS}^{-1},
		\end{align*}
		which implies
		\begin{align*}
		\left\|	\widehat{\boldsymbol{\Sigma}}_{(0),S^cS}\widehat{\boldsymbol{\Sigma}}_{(0),SS}^{-1} - \boldsymbol{\Sigma}_{S^cS}\boldsymbol{\Sigma}_{SS}^{-1}\right\|_{\infty}\leq& \left\|\left(\widehat{\boldsymbol{\Sigma}}_{(0),S^cS}-\boldsymbol{\Sigma}_{S^cS}\right)\left(\widehat{\boldsymbol{\Sigma}}_{(0),SS}^{-1}-\boldsymbol{\Sigma}_{SS}^{-1}\right)\right\|_{\infty}\\
		&+\left\|\boldsymbol{\Sigma}_{S^cS}\left(\widehat{\boldsymbol{\Sigma}}_{(0),SS}^{-1}-\boldsymbol{\Sigma}_{SS}^{-1}\right)\right\|_{\infty}+\left\|\left(\widehat{\boldsymbol{\Sigma}}_{(0),S^cS}-\boldsymbol{\Sigma}_{S^cS}\right)\boldsymbol{\Sigma}_{SS}^{-1}\right\|_{\infty}\\
	    \leq& s^{3/2}\left|\widehat{\boldsymbol{\Sigma}}_{(0),S^cS}-\boldsymbol{\Sigma}_{S^cS}\right|_{\infty}\left\|\widehat{\boldsymbol{\Sigma}}_{(0),SS}^{-1}-\boldsymbol{\Sigma}_{SS}^{-1}\right\|_{2}\\
		&+s^{3/2}\left|\boldsymbol{\Sigma}_{S^cS}\right|_{\infty}\left\|\widehat{\boldsymbol{\Sigma}}_{(0),SS}^{-1}-\boldsymbol{\Sigma}_{SS}^{-1}\right\|_{2}+s^{3/2}\left|\boldsymbol{\Sigma}_{SS}^{-1}\right|_{\infty}\left\|\widehat{\boldsymbol{\Sigma}}_{(0),S^cS}-\boldsymbol{\Sigma}_{S^cS}\right\|_{2}.
		\end{align*}
		Using the inequalities (58a) and (58b) in \cite{wainwright2009sharp}, we have
		$$
		\left\|\widehat{\boldsymbol{\Sigma}}_{(0),SS}^{-1}-\boldsymbol{\Sigma}_{SS}^{-1}\right\|_{2} = O_{\mathbb{P}}\left(\sqrt{\frac{s}{n}}\right),
		$$
		and
		$$
		\left\|\widehat{\boldsymbol{\Sigma}}_{(0),SS}-\boldsymbol{\Sigma}_{SS}\right\|_{2} = O_{\mathbb{P}}\left(\sqrt{\frac{s}{n}}\right).
		$$
		Combining with the fact $|\widehat{\boldsymbol{\Sigma}}_{(0),S^cS}-\boldsymbol{\Sigma}_{S^cS}|_{\infty} = O_{\mathbb{P}}(\sqrt{\log p/n})$, it yields
		$$
		\left\|	\widehat{\boldsymbol{\Sigma}}_{(0),S^cS}\widehat{\boldsymbol{\Sigma}}_{(0),SS}^{-1} - \boldsymbol{\Sigma}_{S^cS}\boldsymbol{\Sigma}_{SS}^{-1}\right\|_{\infty} = O_{\mathbb{P}}\left(s^{3/2}\sqrt{\frac{\log p+s}{n}}\right).
		$$
		Owing to the fact that $\boldsymbol{\theta}_{k,S}^{*} = \boldsymbol{\Sigma}_{SS}^{-1}(\boldsymbol{\mu}_{k}-\boldsymbol{\mu}_{1})_S$ in Lemma \ref{lemma3}, we have
		\begin{align*}
		\left(\widetilde{\boldsymbol{\mu}}_k-\widetilde{\boldsymbol{\mu}}_1\right)_S-\boldsymbol{\Sigma}_{SS}\boldsymbol{\theta}_{k,S}^{*}=\left(\widetilde{\boldsymbol{\mu}}_k-\widetilde{\boldsymbol{\mu}}_1\right)_S-\left(\boldsymbol{\mu}_{k}-\boldsymbol{\mu}_{1}\right)_S.
		\end{align*}
		It yields that
		$$
		\left\|\left(\widetilde{\boldsymbol{\mu}}_k-\widetilde{\boldsymbol{\mu}}_1\right)_S-\boldsymbol{\Sigma}_{SS}\boldsymbol{\theta}_{k,S}^{*}\right\|_{\infty} = O_{\mathbb{P}}\left(\sqrt{\frac{\log p}{N}}\right).
		$$
		Moreover, note that
		$$
		\left\|\left(\widehat{\boldsymbol{\Sigma}}_{(0),SS}-\boldsymbol{\Sigma}_{SS}\right)\left(\widehat{\boldsymbol{\theta}}_{k,S}^{(0)}-\boldsymbol{\theta}_{k,S}^{*} \right)\right\|_{\infty}\leq \left|\widehat{\boldsymbol{\Sigma}}_{(0),SS}-\boldsymbol{\Sigma}_{SS}\right|_{\infty}\left\|\widehat{\boldsymbol{\theta}}_{k,S}^{(0)}-\boldsymbol{\theta}_{k,S}^{*} \right\|_{1} =O_{\mathbb{P}}\left(\sqrt{\frac{\log p}{n}}a_n\right).
		$$
		Then together with the assumption $\| \boldsymbol{\Sigma}_{S^cS}\boldsymbol{\Sigma}_{SS}^{-1}\|_{\infty}\leq \xi$ and Lemma \ref{inf_lemma2} we have
		\begin{align*}
		\|I_1\|_{\infty} = O_{\mathbb{P}}\left(\sqrt{\frac{\log p}{N}}+\sqrt{\frac{\log p}{n}}a_n+\frac{\alpha}{\sqrt{n}}+\frac{1}{n}\right).
		\end{align*}
		Similarly, we can show that
		$$
		\|I_2\|_{\infty} = O_{\mathbb{P}}\left(\sqrt{\frac{\log p}{N}}+\sqrt{\frac{\log p}{n}}a_n+\frac{\alpha}{\sqrt{n}}+\frac{1}{n}\right).
		$$
		Owing to the choice of $\lambda_{1}$ in Theorem \ref{theorem1} and following the analysis above, there exists some positive constant $C_1$ such that for $k = 2,...,K$,
		\begin{equation}\label{A8}
		\begin{aligned}
		&\left\|\widehat{\boldsymbol{\theta}}_{k,S}^{o}-\boldsymbol{\theta}_{k,S}^{*}\right\|_{\infty}\\
		\leq &\lambda_{1}\left\|\widehat{\boldsymbol{\Sigma}}_{(0),SS}^{-1}\widetilde{\boldsymbol{Z}}_{k,S}\right\|_{\infty} + \left\|\widehat{\boldsymbol{\Sigma}}_{(0),SS}^{-1}\left(\widehat{\boldsymbol{\Sigma}}_{(0),SS}-\boldsymbol{\Sigma}_{SS}\right)\left(\widehat{\boldsymbol{\theta}}_{k,S}^{(0)}-\boldsymbol{\theta}_{k,S}^{*} \right)\right\|_{\infty}\\
		+&\left\|\widehat{\boldsymbol{\Sigma}}_{(0),SS}^{-1}\left(\boldsymbol{\Sigma}_{SS}\widehat{\boldsymbol{\theta}}_{k,S}^{(0)}-(\widetilde{\boldsymbol{d}}_{k,0})_S \right)\right\|_{\infty}+ \left\|\widehat{\boldsymbol{\Sigma}}_{(0),SS}^{-1}\left(\left(\widetilde{\boldsymbol{\mu}}_k-\widetilde{\boldsymbol{\mu}}_1\right)_S- \boldsymbol{\Sigma}_{SS}\boldsymbol{\theta}_{k,S}^{*}\right)\right\|_{\infty}\\
		\leq& C_1\left\|\boldsymbol{\Sigma}_{SS}^{-1}\right\|_{\infty}\lambda_{1},
		\end{aligned}
		\end{equation}
		holds with probability tending to 1.
		Moreover, for $j \in S^c$, we have
		\begin{align*}
		\left|\left(\widehat{\boldsymbol{\Sigma}}_{(0),S^cS}\widehat{\boldsymbol{\Sigma}}_{(0),SS}^{-1}\widetilde{\boldsymbol{Z}}_{k,S}\right)_j\right| &\leq \left\|	\widehat{\boldsymbol{\Sigma}}_{(0),S^cS}\widehat{\boldsymbol{\Sigma}}_{(0),SS}^{-1} - \boldsymbol{\Sigma}_{S^cS}\boldsymbol{\Sigma}_{SS}^{-1}\right\|_{\infty}\left\|\widetilde{\boldsymbol{Z}}_{k,S}\right\|_{\infty}\\
		&+\LRinfnorm{\boldsymbol{\Sigma}_{S^cS}\boldsymbol{\Sigma}_{SS}^{-1}}\LRinfnorm{\widetilde{\boldsymbol{Z}}_{k,S}-\boldsymbol{Z}_{k,S}^{*}}+\left|\left(\boldsymbol{\Sigma}_{S^cS}\boldsymbol{\Sigma}_{SS}^{-1}\boldsymbol{Z}_{k,S}^{*}\right)_j\right|,
		\end{align*}
		and
		\begin{align*}
		\|\widetilde{\boldsymbol{Z}}_{k,S}-\boldsymbol{Z}_{k,S}^{*}\|_{\infty} &= \max_{j \in S}\left|\frac{\widehat\theta_{kj}^{o}}{\|\htheta_{(j)}^{o}\|_2}-\frac{\theta_{kj}^{*}}{\|\btheta_{(j)}^{*}\|_2}\right|\\
		& \leq \max_{j \in S}\frac{\left|\widehat\theta_{kj}^{o}-\theta_{kj}^{*}\right|}{\|\btheta_{(j)}^{*}\|_2} + \max_{j \in S}|\widehat\theta_{kj}^{o}|\frac{\left|\|\htheta_{(j)}^{o}\|_2-\|\btheta_{(j)}^{*}\|_2\right|}{\|\htheta_{(j)}^{o}\|_2\|\btheta_{(j)}^{*}\|_2}\\
		& \leq \max_{j \in S}\frac{\left|\widehat\theta_{kj}^{o}-\theta_{kj}^{*}\right|}{\|\btheta_{(j)}^{*}\|_2} + \max_{j \in S}\frac{\|\htheta_{(j)}^{o}-\btheta_{(j)}^{*}\|_2}{\|\btheta_{(j)}^{*}\|_2}\\
		&\lesssim 2\max_{2\leq k\leq K}\|\widehat{\boldsymbol{\theta}}_{k,S}^{o}-\boldsymbol{\theta}_{k,S}^{*}\|_{\infty}/\theta_{\min}^{*}.
		\end{align*}
		Combining Condition $(\mathbf{C}2)$ with inequality (\ref{A8}), with probability tending to 1 we have
		\begin{equation}\label{A9}
		\begin{aligned}
		&\lambda_{1}^{-2}\max_{j \in S^c}\sum_{k=2}^K\left\{\left(\widehat{\boldsymbol{\Sigma}}_{(0),S^cS}\widehat{\boldsymbol{\theta}}^{o}_{k,S}\right)_j-(\widetilde{\boldsymbol{b}}_{k,0})_j\right\}^2\leq \max_{j \in S^c}\sum_{k=2}^K\left|\left(\widehat{\boldsymbol{\Sigma}}_{(0),S^cS}\widehat{\boldsymbol{\Sigma}}_{(0),SS}^{-1}\widetilde{\boldsymbol{Z}}_{k,S}\right)_j\right|^2+o(1)\\
		\leq& \max_{j \in S^c}\sum_{k=2}^K\left|\left(\boldsymbol{\Sigma}_{S^cS}\boldsymbol{\Sigma}_{SS}^{-1}\boldsymbol{Z}_{k,S}^{*}\right)_j\right|^2+C_1^2\left\|\boldsymbol{\Sigma}_{SS}^{-1}\right\|_{\infty}^2\left\|\boldsymbol{\Sigma}_{S^cS}\boldsymbol{\Sigma}_{SS}^{-1}\right\|_{\infty}^2(K-1)\lambda_{1}^2/\theta_{\min}^{*2}+o(1)\\
		\leq& 1-\kappa +C_1^2\left\|\boldsymbol{\Sigma}_{SS}^{-1}\right\|_{\infty}^2\left\|\boldsymbol{\Sigma}_{S^cS}\boldsymbol{\Sigma}_{SS}^{-1}\right\|_{\infty}^2(K-1)\lambda_{1}^2/\theta_{\min}^{*2}+o(1)\\
		\leq& 1-\kappa/2,
		\end{aligned}
		\end{equation}
		then we have shown the inequality (\ref{A7}) holds. Recall that with inequality (\ref{A8}), we have
		$$
		\left\|\widehat{\boldsymbol{\theta}}_{k}^{o}-\boldsymbol{\theta}_{k}^{*}\right\|_{\infty} \leq C_1\left\|\boldsymbol{\Sigma}_{SS}^{-1}\right\|_{\infty}\lambda_1,
		$$
		holds with probability tending to 1. And note that $\widehat{\boldsymbol{\theta}}_{k}^{o}$ is a solution to (7) with probability tending to 1, that is $\mathbb{P}(\widehat{\boldsymbol{\theta}}_{k}^{(1)} = \widehat{\boldsymbol{\theta}}_{k}^{o})\to 1$. It yields
		$$
		\left\|\widehat{\boldsymbol{\theta}}_{k}^{(1)}-\boldsymbol{\theta}_{k}^{*}\right\|_{\infty} \leq C_1\left\|\boldsymbol{\Sigma}_{SS}^{-1}\right\|_{\infty}\lambda_1,
		$$
		holds with probability tending to 1. If $\theta_{\min}^{*} \geq C\|\boldsymbol{\Sigma}_{SS}^{-1}\|_{\infty}\lambda_{1}$	for some sufficiently large positive constant $C$, then $\widehat{S}^{(1)} = S$ holds with probability tending to 1. In fact, the inequality (\ref{A9}) still holds if we choose sufficiently large $C$. Therefore, we have finished the proof of Theorem \ref{theorem3}.
	\end{proof}
	
	\section{Proof of Auxiliary Lemmas}\label{appendix B}
	\subsection{Proof of Lemma \ref{lemma1}}
	\begin{proof}[Proof of Lemma \ref{lemma1}]
		With probability tending to 1, there exists some positive constant $L$ such that
		\begin{align*}
		\sum_{k=1}^{K-1}\boldsymbol{x}_{k}^{\T}\hSigma_{(0)}\boldsymbol{x}_{k}&\geq \sum_{k=1}^{K-1}\boldsymbol{x}_{k}^{\T}\bSigma \boldsymbol{x}_{k}-\left|\hSigma_{(0)}-\bSigma\right|_{\infty}\sum_{k=1}^{K-1}\|\boldsymbol{x}_{k}\|_{1}^2\\
		&\geq \sum_{k=1}^{K-1}\boldsymbol{x}_{k}^{\T}\bSigma \boldsymbol{x}_{k}-\left|\hSigma_{(0)}-\bSigma\right|_{\infty}\left(\sum_{k=1}^{K-1}\|\boldsymbol{x}_{k}\|_{1}\right)^2\\
		&\geq \lambda_{\min}(\bSigma)\sum_{k=1}^{K-1}\|\boldsymbol{x}_{k}\|_2^2 - 16s(K-1)\left|\hSigma_{(0)}-\bSigma\right|_{\infty}\sum_{k=1}^{K-1}\|\boldsymbol{x}_{k}\|_2^2\\
		&\geq L\sum_{k=1}^{K-1}\|\boldsymbol{x}_{k}\|_2^2,
		\end{align*}
		and the last inequality follows from the fact that $|\widehat{\boldsymbol{\Sigma}}_{(0)} - \boldsymbol{\Sigma}|_{\infty}=O_{\mathbb{P}}(\sqrt{\log p/ n})$ and $s\sqrt{\log p/ n} = o(1)$.
	\end{proof}
	\subsection{Proof of Lemma \ref{inf_lemma1}}
	First note that,
	\begin{align*}
	\widehat{\boldsymbol{\Sigma}}_{(0)}(\widehat{\boldsymbol{\theta}}_{k}^{(0)}-\boldsymbol{\theta}_{k}^{*})-\hb_{k,0} = \left(\widehat{\boldsymbol{\Sigma}}_{(0)}-\widehat{\boldsymbol{\Sigma}}\right)\left(\widehat{\boldsymbol{\theta}}_{k}^{(0)}-\boldsymbol{\theta}^{*}_k\right) - \widehat{\boldsymbol{\Sigma}}\boldsymbol{\theta}^{*}_k+ \left(\widehat{\boldsymbol{\mu}}_k-\widehat{\boldsymbol{\mu}}_1\right).
	\end{align*}
	Due to the definition of $\hSigma$, we have
	\begin{align*}
	    \widehat{\boldsymbol{\Sigma}}\boldsymbol{\theta}_k^{*}-\boldsymbol{\Sigma}\boldsymbol{\theta}_k^{*}&= \frac{1}{N}\sum_{d=1}^{K}\sum_{\{i: Y_i = d\}}(\bx_i-\hmu_d)(\bx_i-\hmu_d)^{\T}\boldsymbol{\theta}_k^{*}-\boldsymbol{\Sigma}\boldsymbol{\theta}_k^{*}\\
	    &=\frac{1}{N}\sum_{d=1}^{K}\sum_{\{i: Y_i = d\}}(\bx_i-\bmu_d)(\bx_i-\bmu_d)^{\T}\boldsymbol{\theta}_k^{*}-\frac{1}{N}\sum_{d=1}^{K}N_d(\hmu_d-\bmu_d)(\hmu_d-\bmu_d)^{\T}\boldsymbol{\theta}_k^{*}-\boldsymbol{\Sigma}\boldsymbol{\theta}_k^{*}.
	\end{align*}
	We note that $\bx_i^{\top}\btheta_k^{*} \sim \mathcal{N}(\bmu_d^{\top}\btheta_k^{*}, (\bmu_k - \bmu_1)^{\top}\bSigma^{-1}(\bmu_k - \bmu_1))$ for every $i$ such that $Y_i = d$, $d=1,\ldots,K$, and $k\neq 1$, which yields that $|(\hmu_d-\bmu_d)^{\T}\boldsymbol{\theta}_k^{*}| = O_{\mathbb{P}}(\Delta_{\max}/\sqrt{N})$.
	Let $\boldsymbol{D}_{di} = (\bx_i-\bmu_d)(\bx_i-\bmu_d)^{\T}\boldsymbol{\theta}_k^{*}-\boldsymbol{\Sigma}\boldsymbol{\theta}_k^{*}$, then each coordinate $\boldsymbol{D}_{di,j}$ is a sub-exponential random variable with parameter $\sigma_{j,j}\Delta_{k}$. According to Bernstein's inequality for sub-exponential random variables \citep{vershynin2018high}, we have 
	\begin{equation*}
	    \LRinfnorm{\frac{1}{N}\sum_{d=1}^{K}\sum_{\{i: Y_i = d\}}\boldsymbol{D}_{di}} = O_{\mathbb{P}}\left(\Delta_{\max}\sqrt{\frac{\log p}{N}}\right).
	\end{equation*}
	It follows that
	\begin{align*}
	    \LRinfnorm{\widehat{\boldsymbol{\Sigma}}\boldsymbol{\theta}_k^{*}-\boldsymbol{\Sigma}\boldsymbol{\theta}_k^{*}}&\leq \LRinfnorm{\frac{1}{N}\sum_{d=1}^{K}\sum_{\{i: Y_i = d\}}\boldsymbol{D}_{di}}+\max_{d}\LRinfnorm{(\hmu_d-\bmu_d)(\hmu_d-\bmu_d)^{\T}\boldsymbol{\theta}_k^{*}}\\
	   & \leq \LRinfnorm{\frac{1}{N}\sum_{d=1}^{K}\sum_{\{i: Y_i = d\}}\boldsymbol{D}_{di}}+\max_{d}\LRinfnorm{(\hmu_d-\bmu_d)}|(\hmu_d-\bmu_d)^{\T}\boldsymbol{\theta}_k^{*}|\\
	   & \lesssim \Delta_{\max}\sqrt{\frac{\log p}{N}}+\sqrt{\frac{\log p}{N}}\frac{\Delta_{\max}}{\sqrt{N}}
	\end{align*}
	with high probability. It yields that
	\begin{align*}
	\left\|\widehat{\boldsymbol{\Sigma}}_{(0)}(\widehat{\boldsymbol{\theta}}_{k}^{(0)}-\boldsymbol{\theta}_{k}^{*})-\hb_{k,0}\right\|_{\infty}\leq& \left|\widehat{\boldsymbol{\Sigma}}_{(0)}-\widehat{\boldsymbol{\Sigma}}\right|_{\infty}\left\|\boldsymbol{\theta}_{k}^{*}-\widehat{\boldsymbol{\theta}}_{k}^{(0)}\right\|_1+\left\|\widehat{\boldsymbol{\Sigma}}\boldsymbol{\theta}^{*}_k-\left(\widehat{\boldsymbol{\mu}}_k-\widehat{\boldsymbol{\mu}}_1\right)\right\|_{\infty}\\
	\leq& \left|\widehat{\boldsymbol{\Sigma}}_{(0)}-\boldsymbol{\Sigma}\right|_{\infty}\left\|\boldsymbol{\theta}_{k}^{*}-\widehat{\boldsymbol{\theta}}_{k}^{(0)}\right\|_1 + \left|\widehat{\boldsymbol{\Sigma}}-\boldsymbol{\Sigma}\right|_{\infty}\left\|\boldsymbol{\theta}_{k}^{*}-\widehat{\boldsymbol{\theta}}_{k}^{(0)}\right\|_1\\
	&+ \left\|\widehat{\boldsymbol{\Sigma}}\boldsymbol{\theta}_k^{*}-\boldsymbol{\Sigma}\boldsymbol{\theta}_k^{*}\right\|_{\infty}+\left\|\left(\widehat{\boldsymbol{\mu}}_k-\widehat{\boldsymbol{\mu}}_1\right) - \left(\boldsymbol{\mu}_k-\boldsymbol{\mu}_1\right)\right\|_{\infty}\\
	\lesssim & \sqrt{\frac{\log p }{n}}a_n+\sqrt{\frac{\log p }{N}}a_n + \Delta_{\max}\sqrt{\frac{\log p }{N}}+\sqrt{\frac{\log p }{N}}\\
	 = & O_{\mathbb{P}}\left(\sqrt{\frac{\log p }{n}}a_n+\sqrt{\frac{\log p }{N}}\right),
	\end{align*}
	where the third inequality follows from the basic bound $\|\hmu_k - \bmu_k\|_{\infty}=O_{\mathbb{P}}(\sqrt{\log p/N})$.
	\subsection{Proof of Lemma \ref{inf_lemma2}}
	\begin{lemma}[Berry--Esseen inequality \citep{petrov1975sums}]
		Let $X_1,\dots,X_n$ be i.i.d.\ random variables and suppose
		$$
		\mathbb{E} X_{1}=0, \quad \mathbb{E} X_{1}^{2}=\sigma^{2}>0, \quad \mathbb{E}\left|X_{1}\right|^{3}<\infty, \quad \varrho=\frac{\mathbb{E}\left|X_{1}\right|^{3}}{\sigma^{3}}.
		$$
		Then, for some absolute positive constant $A$,
		$$
		\sup _{x}\left|\mathbb{P}\left(\frac{1}{\sigma \sqrt{n}} \sum_{j=1}^{n} X_{j}<x\right)-\Phi(x)\right| \leq A \frac{\varrho}{\sqrt{n}}.
		$$
	\end{lemma}
	\begin{proof}[Proof of Lemma \ref{inf_lemma2}]
		Denote the Byzantine local machines by $\mathcal{B}$ and $|\mathcal{B}|=\alpha M$. Let $Y_l = \sqrt{n}\left(\widetilde{\boldsymbol{\mu}}_1-\boldsymbol{\mu}_1\right)_l/\sqrt{\sigma_{ll}}$ then
		$$
		Y_l = \operatorname{med}\left\{Y_{l,0}, Y_{l,1} ,\cdots,Y_{l,M} \right\},
		$$
		where $Y_{l,m} = \sqrt{n}(\widehat{\boldsymbol{\mu}}_{1}^{(m)}-\boldsymbol{\mu}_1)_l/\sqrt{\sigma_{ll}}\sim N(0,1)$ if $m \notin \mathcal{B}$.
		Using the union bound and the fact that for any $t\in \mathbb{R}$
		$$
		\left|\frac{1}{M+1}\sum_{m=0}^{M}\mathds{I}(Y_{l,m}\geq t)-\frac{1}{(1-\alpha)M+1}\sum_{m\notin \mathcal{B}}\mathds{I}(Y_{l,m}\geq t)\right|\leq \alpha,
		$$
		we have
		\begin{align*}
		&\mathbb{P}\left(\max_{l}\left|\frac{\sqrt{n}}{\sqrt{\sigma_{ll}}}\left(\widetilde{\boldsymbol{\mu}}_1-\boldsymbol{\mu}_1\right)_l\right|\geq u_n\right) \\
		\leq &p\max_{l}\mathbb{P}\left(\left|\frac{\sqrt{n}}{\sqrt{\sigma_{ll}}}\left(\widetilde{\boldsymbol{\mu}}_1-\boldsymbol{\mu}_1\right)_l\right|\geq u_n\right)\\
		=&p\max_{l}\mathbb{P}\left(\frac{1}{M+1}\sum_{m=0}^{M}\mathds{I}(Y_{l,m}\geq u_n)\geq \frac{1}{2}\right)+p\max_{l}\mathbb{P}\left(\frac{1}{M+1}\sum_{m=0}^{M}\mathds{I}(Y_{l,m}\leq -u_n)\leq \frac{1}{2}\right)\\
		=&p\max_{l}\mathbb{P}\left(\frac{1}{(1-\alpha)M+1}\sum_{m\notin \mathcal{B}}\mathds{I}(Y_{l,m}\geq u_n)-\mathbb{P}(Y_{l,m}\geq u_n)\geq \frac{1}{2}-\alpha-(1-\Phi(u_n))\right)\\
		+&p\max_{l}\mathbb{P}\left(\frac{1}{(1-\alpha)M+1}\sum_{m\notin \mathcal{B}}\mathds{I}(Y_{l,m}\leq -u_n)-\mathbb{P}(Y_{l,m}\leq -u_n)\leq \frac{1}{2}+\alpha -\Phi(-u_n)\right).
		\end{align*}
		By Taylor expansion we have
		$$
		\Phi(u_n) = \Phi(0)+\phi(0)u_n + o(u_n).
		$$
		Thus
		\begin{align*}
		&\mathbb{P}\left(\max_{l}\left|\frac{\sqrt{n}}{\sqrt{\sigma_{ll}}}\left(\widetilde{\boldsymbol{\mu}}_1-\boldsymbol{\mu}_1\right)_l\right|\geq u_n\right) \\
		\leq &p\max_{l}\mathbb{P}\left(\frac{1}{(1-\alpha)M+1}\sum_{m\notin \mathcal{B}}\mathds{I}(Y_{l,m}\geq u_n)-\mathbb{P}(Y_{l,m}\geq u_n)\geq \phi(0)u_n + o(u_n)-\alpha\right)\\
		+&p\max_{l}\mathbb{P}\left(\frac{1}{(1-\alpha)M+1}\sum_{m\notin \mathcal{B}}\mathds{I}(Y_{l,m}\leq -u_n)-\mathbb{P}(Y_{l,m}\leq -u_n)\leq \phi(0)u_n + o(u_n)+\alpha\right).
		\end{align*}
		Let $u_n = \rho^{'}(\sqrt{\log p/((1-\alpha)M+1)}+\alpha)$ for some sufficiently large positive constant $\rho^{'}$; then, by Bernstein's inequality, we have
		$$
		\mathbb{P}\left(\max_{l}\left|\frac{\sqrt{n}}{\sqrt{\sigma_{ll}}}\left(\widetilde{\boldsymbol{\mu}}_1-\boldsymbol{\mu}_1\right)_l\right|\geq u_n\right) \leq 2p^{-1}.
		$$
		This implies that
		$$
		\left\|\widetilde{\boldsymbol{\mu}}_1-\boldsymbol{\mu}_1\right\|_{\infty}\lesssim \sqrt{\frac{\log p}{N}}+\frac{\alpha}{\sqrt{n}},
		$$
		holds with probability at least $1-2p^{-1}$. Similarly, we can prove
		$$
		\left\|\widetilde{\boldsymbol{\mu}}_k-\boldsymbol{\mu}_k\right\|_{\infty}\lesssim \sqrt{\frac{\log p}{N}}+\frac{\alpha}{\sqrt{n}},
		$$
		holds with probability at least $1-2p^{-1}$. For the second inequality, note that
		\begin{align*}
		\left(\widetilde{\boldsymbol{d}}_{k,0}\right)_l = \operatorname{med}\left\{\frac{1}{n}\sum_{d=1}^{K}\sum_{\{i\in \mathcal{H}_m: Y_i = d\}}(X_{il}-\widehat{\mu}_{dl}^{(m)})(\boldsymbol{X}_i-\widehat{\boldsymbol{\mu}}_d^{(m)})^{\T}\widehat{\boldsymbol{\theta}}_k^{(0)}:\ m=1,2,\cdots, M\right\},
		\end{align*}
		where $X_{il}$ is the $l$-th entry of $\boldsymbol{X}_i$ and $\widehat{\mu}_{dl}^{(m)}$ is the $l$-th entry of $\widehat{\boldsymbol{\mu}}_d^{(m)}$. By straightforward calculation we can write
		\begin{align*}
		& \frac{1}{n}\sum_{d=1}^{K}\sum_{\{i\in \mathcal{H}_m: Y_i = d\}}(X_{il}-\widehat{\mu}_{dl}^{(m)})(\boldsymbol{X}_i-\widehat{\boldsymbol{\mu}}_d^{(m)})^{\T}\widehat{\boldsymbol{\theta}}_k^{(0)}\\
		=&\frac{1}{n}\sum_{d=1}^{K}\sum_{\{i\in \mathcal{H}_m: Y_i = d\}}(X_{il} - \mu_{dl})(\boldsymbol{X}_i-\boldsymbol{\mu}_d)^{\T}\widehat{\boldsymbol{\theta}}_k^{(0)}+\frac{1}{n}\sum_{d=1}^{K}n_d(\widehat{\mu}_{dl}^{(m)}-\mu_{dl})(\widehat{\boldsymbol{\mu}}_d^{(m)}-\boldsymbol{\mu}_d)^{\T}\widehat{\boldsymbol{\theta}}_k^{(0)},
		\end{align*}
		and for $m\notin \mathcal{B}$
		$$
		(\boldsymbol{X}_i-\boldsymbol{\mu}_d)^{\T}\widehat{\boldsymbol{\theta}}_k^{(0)}\sim \mathcal{N}\left(0, (\widehat{\boldsymbol{\theta}}_k^{(0)})^{\T}\boldsymbol{\Sigma}(\widehat{\boldsymbol{\theta}}_k^{(0)})\right), \quad i \in \mathcal{H}_m 
		$$
		where $ (\widehat{\boldsymbol{\theta}}_k^{(0)})^{\T}\boldsymbol{\Sigma}(\widehat{\boldsymbol{\theta}}_k^{(0)})\lesssim \boldsymbol{\theta}_k^{*\T}\boldsymbol{\Sigma}\boldsymbol{\theta}_k^{*} \leq \Delta_{\max}^2$. Conditioning on $\widehat{\boldsymbol{\theta}}_k^{(0)}$, we have
		\begin{align*}
		&\mathbb{E}\left[(X_{il} - \mu_{dl})(\boldsymbol{X}_i-\boldsymbol{\mu}_d)^{\T}\widehat{\boldsymbol{\theta}}_k^{(0)}\bigg|\widehat{\boldsymbol{\theta}}_k^{(0)}\right] = \left(\boldsymbol{\Sigma}\widehat{\boldsymbol{\theta}}_{k}^{(0)}\right)_l,
		\end{align*}
		and
		\begin{align*}
		\widetilde{\sigma}_l^2& : = \operatorname{Var}\left[(X_{il} - \mu_{dl})(\boldsymbol{X}_i-\boldsymbol{\mu}_d)^{\T}\widehat{\boldsymbol{\theta}}_k^{(0)}\bigg|\widehat{\boldsymbol{\theta}}_k^{(0)}\right]\\
		&\leq \left(\mathbb{E}(X_{il} - \mu_{dl})^4\right)^{1/2}\left(\mathbb{E}\left((\boldsymbol{X}_i-\boldsymbol{\mu}_d)^{\T}\widehat{\boldsymbol{\theta}}_k^{(0)}\right)^4\right)^{1/2}\lesssim 3\sigma_{ll}^2\Delta_k^2,
		\end{align*}
		for $i\in \mathcal{H}_{m}$ and $m\notin \mathcal{B}$; moreover, $\widetilde{\sigma}_l^2<\infty$ by assumption.
		Let 
		$$
		W_{l,m}=\frac{1}{\sqrt{n}}\sum_{d=1}^{K}\sum_{\{i\in \mathcal{H}_m: Y_i = d\}}(X_{il} - \mu_{dl})(\boldsymbol{X}_i-\boldsymbol{\mu}_d)^{\T}\widehat{\boldsymbol{\theta}}_k^{(0)} - \sqrt{n}\left(\boldsymbol{\Sigma}\widehat{\boldsymbol{\theta}}_{k}^{(0)}\right)_l,
		$$
		and
		$$
		V_{l,m} = \frac{1}{\sqrt{n}}\sum_{d=1}^{K}n_d(\widehat{\mu}_{dl}^{(m)}-\mu_{dl})(\widehat{\boldsymbol{\mu}}_d^{(m)}-\boldsymbol{\mu}_d)^{\T}\widehat{\boldsymbol{\theta}}_k^{(0)},
		$$
		then for $m\notin \mathcal{B}$
		$$
		W_{l,m} \stackrel{d}{\to} N\left(0, \widetilde{\sigma}_l^2\right)\text{ and }V_{l,m}=O_{\mathbb{P}}\left(\frac{\widetilde{\sigma}_l}{\sqrt{n}}\right).
		$$
		Denote
		$$
		Z_{l,m}=\mathds{I}(W_{l,m}+V_{l,m}\geq u_n)-\mathbb{P}(W_{l,m}+V_{l,m}\geq u_n),
		$$
		and
		$$
		Z_{l,m'}=\mathds{I}(W_{l,m}+V_{l,m}\leq -u_n)-\mathbb{P}(W_{l,m}+V_{l,m}\leq -u_n).
		$$
		Owing to the definition of sample median, we have
		\begin{align*}
		&\mathbb{P}\left(\max_{l}\sqrt{n}\left|\left(\widetilde{\boldsymbol{d}}_{k,0} - \boldsymbol{\Sigma}\widehat{\boldsymbol{\theta}}_{k}^{(0)}\right)_l\right|\geq u_n\right)\leq p\max_{l}\mathbb{P}\left(\sqrt{n}\left|\left(\widetilde{\boldsymbol{d}}_{k,0} - \boldsymbol{\Sigma}\widehat{\boldsymbol{\theta}}_{k}^{(0)}\right)_l\right|\geq u_n\right)\\
		=&p\max_{l}\mathbb{P}\left(\frac{1}{(1-\alpha)M+1}\sum_{m\notin \mathcal{B}}Z_{l,m}\geq \frac{1}{2}-\alpha -\mathbb{P}(W_{l,m}+V_{l,m}\geq u_n)\right)\\
		+&p\max_{l}\mathbb{P}\left(\frac{1}{(1-\alpha)M+1}\sum_{m\notin \mathcal{B}}Z_{l,m'}\leq \frac{1}{2}+\alpha -\mathbb{P}(W_{l,m}+V_{l,m}\leq -u_n)\right).
		\end{align*}
		Using the fact that $\mathbb{P}(W_{l,m}+V_{l,m}\leq u_n)=\mathbb{P}(W_{l,m}/\widetilde{\sigma}_l+V_{l,m}/\widetilde{\sigma}_l\leq u_n/\widetilde{\sigma}_l)$, we have
		\begin{align*}
		&\left|\mathbb{P}\left(\frac{W_{l,m}}{\widetilde{\sigma}_l}+\frac{V_{l,m}}{\widetilde{\sigma}_l}\leq \frac{u_n}{\widetilde{\sigma}_l}\right)-\Phi\left(\frac{u_n}{\widetilde{\sigma}_l}\right)\right|\\
		\leq&\left|\mathbb{P}\left(\frac{W_{l,m}}{\widetilde{\sigma}_l}\leq \frac{u_n}{\widetilde{\sigma}_l}-\frac{V_{l,m}}{\widetilde{\sigma}_l}\right)-\Phi\left(\frac{u_n}{\widetilde{\sigma}_l}-\frac{V_{l,m}}{\widetilde{\sigma}_l}\right)\right|+\left|\Phi\left(\frac{u_n}{\widetilde{\sigma}_l}-\frac{V_{l,m}}{\widetilde{\sigma}_l}\right)-\Phi\left(\frac{u_n}{\widetilde{\sigma}_l}\right)\right|\\
		\leq &\sup_{x\in \mathbb{R}}\left|\mathbb{P}\left(\frac{W_{l,m}}{\widetilde{\sigma}_l}\leq x\right)-\Phi(x)\right|+\left|\Phi\left(\frac{u_n}{\widetilde{\sigma}_l}-\frac{V_{l,m}}{\widetilde{\sigma}_l}\right)-\Phi\left(\frac{u_n}{\widetilde{\sigma}_l}\right)\right|\\
		= & \sup_{x\in \mathbb{R}}\left|\mathbb{P}\left(\frac{W_{l,m}}{\widetilde{\sigma}_l}\leq x\right)-\Phi(x)\right|+\phi\left(\frac{u_n}{\widetilde{\sigma}_l}\right)\frac{V_{l,m}}{\widetilde{\sigma}_l}+o(V_{l,m})\\
		\lesssim &\frac{1}{\sqrt{n}} +  \frac{1}{\sqrt{n}},
		\end{align*}
		where the last inequality follows from the Berry--Esseen inequality and the boundedness of the normal density function $\phi(x)$. This yields that
		\begin{align*}
		&\mathbb{P}\left(\max_{l}\sqrt{n}\left|\left(\widetilde{\boldsymbol{d}}_{k,0} - \boldsymbol{\Sigma}\widehat{\boldsymbol{\theta}}_{k}^{(0)}\right)_l\right|\geq u_n\right) \\
		\leq &p\max_{l}\mathbb{P}\left(\frac{1}{(1-\alpha)M+1}\sum_{m\notin \mathcal{B}}Z_{l,m}\geq \phi(0)u_n/\widetilde{\sigma}_l-\alpha+O(\frac{1}{\sqrt{n}})+o(u_n)\right)\\
		+ & p\max_{l}\mathbb{P}\left(\frac{1}{(1-\alpha)M+1}\sum_{m\notin \mathcal{B}}Z_{l,m'}\leq \phi(0)u_n/\widetilde{\sigma}_l+\alpha+O(\frac{1}{\sqrt{n}})+o(u_n)\right).
		\end{align*}
		
		Let $u_n = \rho^{''}(\Delta_{\max}\sqrt{\log p/(M+1)}+1/\sqrt{n}+\alpha)$ for some sufficiently large positive constant $\rho^{''}$, then by Bernstein's inequality we can prove that
		$$
		\left\|\widetilde{\boldsymbol{d}}_{k,0} - \boldsymbol{\Sigma}\widehat{\boldsymbol{\theta}}_{k}^{(0)} \right\|_{\infty} = O_{\mathbb{P}}\left(\Delta_{\max}\sqrt{\frac{\log p }{N}}+\frac{\alpha}{\sqrt{n}}+\frac{1}{n}\right).
		$$
		By the definition of $\widetilde{\boldsymbol{b}}_{k,0}$, we have
		\begin{align*}
		&\left\|\widehat{\boldsymbol{\Sigma}}_{(0)}(\htheta_k^{(0)}-\btheta_k^{*})-\tb_{k,0}\right\|_{\infty}\\
		= &\left\|\left(\widehat{\boldsymbol{\Sigma}}_{(0)}-\bSigma\right)\left(\widehat{\boldsymbol{\theta}}_{k}^{(0)}-\boldsymbol{\theta}_{k}^{*}\right) - \widetilde{\boldsymbol{d}}_{k,0} + \bSigma\widehat{\boldsymbol{\theta}}_{k}^{(0)}  - \bSigma\boldsymbol{\theta}_{k}^{*} + \left(\widetilde{\boldsymbol{\mu}}_k-\widetilde{\boldsymbol{\mu}}_1\right)\right\|_{\infty}\\
		\leq &\left\|\left(\widehat{\boldsymbol{\Sigma}}_{(0)}-\bSigma\right)\left(\boldsymbol{\theta}_{k}^{*} - \widehat{\boldsymbol{\theta}}_{k}^{(0)}\right)\right\|_{\infty} + \left\|\widetilde{\boldsymbol{d}}_{k,0} - \bSigma\widehat{\boldsymbol{\theta}}_{k}^{(0)} \right\|_{\infty} + \left\|\boldsymbol{\mu}_{k}-\boldsymbol{\mu}_1 - \left(\widetilde{\boldsymbol{\mu}}_k-\widetilde{\boldsymbol{\mu}}_1\right)\right\|_{\infty},
		\end{align*}
		then the results follow.
	\end{proof}
	\subsection{Proof of Lemma \ref{lemma3}}
	\begin{proof}[Proof of Lemma \ref{lemma3}]
		By the definition of the support set $S$ and $\boldsymbol{\Sigma}\boldsymbol{\theta}_{k}^{*} = \boldsymbol{\mu}_k - \boldsymbol{\mu}_1$ we have
		\begin{equation*}
		\left(\begin{matrix}
		\left(\boldsymbol{\mu}_k - \boldsymbol{\mu}_1\right)_S\\
		\left(\boldsymbol{\mu}_k - \boldsymbol{\mu}_1\right)_{S^c}
		\end{matrix}\right) =
		\left(\begin{matrix}
		\boldsymbol{\Sigma}_{SS} & \boldsymbol{\Sigma}_{SS^c}\\
		\boldsymbol{\Sigma}_{S^cS} & \boldsymbol{\Sigma}_{S^cS^c}
		\end{matrix}\right)
		\left(\begin{matrix}
		\boldsymbol{\theta}_{k,S}^{*}\\
		\boldsymbol{0}
		\end{matrix}\right),
		\end{equation*}
		then the results follow immediately.
	\end{proof}

	\bibliography{docbib}
\end{document}