%\documentclass{uai2022} % for initial submission
 \documentclass[accepted]{uai2022} % after acceptance, for a revised
                                    % version; also before submission to
                                    % see how the non-anonymous paper
                                    % would look like
%% There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2022} % ptmx math instead of Computer
                                         % Modern (has noticable issues)
% \documentclass[mathfont=newtx]{uai2022} % newtx fonts (improves upon
                                          % ptmx; less tested, no support)
% NOTE: Only keep *one* line above as appropriate, as it will be replaced
%       automatically for papers to be published. Do not make any other
%       change above this note for an accepted version.

%% Choose your variant of English; be consistent
\usepackage[american]{babel}
% \usepackage[british]{babel}

%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} % amsmath with fixes and additions
% \usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{tikz} % nice language for creating drawings and diagrams

%% Provided macros
% \smaller: Because the class footnote size is essentially LaTeX's \small,
%           redefining \footnotesize, we provide the original \footnotesize
%           using this macro.
%           (Use only sparingly, e.g., in drawings, as it is quite small.)

%% Self-defined macros
% \newcommand{\swap}[3][-]{#3#1#2} % just an example


\usepackage[utf8]{inputenc} % allow utf-8 input
\usepackage[T1]{fontenc}    % use 8-bit T1 fonts
\usepackage{hyperref}       % hyperlinks
\usepackage{url}            % simple URL typesetting
\usepackage{booktabs}       % professional-quality tables
\usepackage{amsfonts}       % blackboard math symbols
\usepackage{nicefrac}       % compact symbols for 1/2, etc.
\usepackage{microtype}      % microtypography

\usepackage{xcolor}         % colors
\usepackage{microtype}
\usepackage{graphicx}
% \usepackage{subfigure}
\usepackage{booktabs} % for professional tables
\usepackage {subcaption}

% hyperref makes hyperlinks in the resulting PDF.
% If your build breaks (sometimes temporarily if a hyperlink spans a page),
% please comment out the \usepackage{hyperref} lines in this preamble.
% (The icml2019 style mentioned by the original template note is not used here.)
\usepackage{hyperref}

% Attempt to make hyperref and algorithmic work together better:
% \newcommand{\theHalgorithm}{\arabic{algorithm}}

\usepackage{graphicx} % more modern
%\usepackage{epsfig} % less modern



% For algorithms
\usepackage{algorithm}
\usepackage{algorithmic}

\usepackage{amsmath}
\usepackage{amssymb}

\usepackage{amsthm}
% For wrapping text around figures
\usepackage{wrapfig}
% For multi-column layouts
\usepackage{multicol}

\usepackage{hyperref}
\usepackage{epstopdf}

\makeatletter
\newcommand{\rmnum}[1]{\romannumeral #1}
\newcommand{\Rmnum}[1]{\expandafter\@slowromancap\romannumeral #1@}
\makeatother
\newtheorem{theorem}{\textbf{Theorem}}
\newtheorem{lemma}{\textbf{Lemma}}
\newtheorem{corollary}{\textbf{Corollary}}
\newtheorem{fact}{\textbf{Fact}}
\newtheorem{definition}{\textbf{Definition}}
\newtheorem{observation}{\textbf{Observation}}
\newtheorem{condition}{\textbf{Condition}}
\newtheorem{property}{\textbf{Property}}
\newtheorem{remark}{\textbf{Remark}}
\newtheorem{proposition}{Proposition}
\newtheorem{assumption}{Assumption}
\newtheorem{claim}{Claim}




% If your paper is accepted and the title of your paper is very long,
% the style will print as headings an error message. Use the following
% command to supply a shorter title of your paper so that it can be
% used as headings.
%
%\runningtitle{I use this title instead because the last one was very long}

% If your paper is accepted and the number of authors is large, the
% style will print as headings an error message. Use the following
% command to supply a shorter version of the authors names so that
% they can be used as headings (for example, use only the surnames)
%
%\runningauthor{Surname 1, Surname 2, Surname 3, ...., Surname n}

% \twocolumn[

% \aistatstitle{Sublinear Time Algorithms for Greedy Selection in High Dimensions}

% \aistatsauthor{ Author 1 \And Author 2 \And  Author 3 }

% \aistatsaddress{ Institution 1 \And  Institution 2 \And Institution 3 } ]

\begin{document}

\title{Sublinear Time Algorithms for Greedy Selection in High Dimensions}

\author[1]{Qi Chen\thanks{The first two authors contributed equally to this work.}}
\author[2]{Kai Liu$^*$}
\author[2]{Ruilong Yao}
\author[2]{\href{mailto:huding@ustc.edu.cn}{Hu Ding\thanks{Corresponding author.}}}
% Add affiliations after the authors
\affil[1]{%
    School of Data Science\\
    University of Science and Technology of China\\
    Anhui, China
}
\affil[2]{%
    School of Computer Science and Technology\\
    University of Science and Technology of China\\
    Anhui, China
}
  

\maketitle

\begin{abstract}
Greedy selection is a widely used idea for solving many machine learning problems. However, greedy selection algorithms often have high complexities and thus may be prohibitive for large-scale data. In this paper, we consider two fundamental optimization problems in machine learning: $k$-center clustering and convex hull approximation, both of which can be solved via greedy selection. We propose sublinear time algorithms for them by combining the strategies of randomization and greedy selection. Our results are similar in spirit to the linear time stochastic greedy selection algorithms for submodular maximization [Mirzasoleiman et al., AAAI 2015; Hassidim and Singer, ICML 2017], but with several important differences. Our runtimes are independent of the number of input data items $n$. In particular, our runtime for $k$-center clustering significantly improves upon that of the uniform sampling approach, especially when the dimensionality is high. Our sublinear algorithms can also reduce the computational complexities for various applications, such as data selection and compression, active learning, and topic modeling.
\end{abstract}

% \section{Introduction}
\section{INTRODUCTION}
% \MakeUppercase

\label{sec-intro}

Greedy algorithm is one of the most fundamental tools for algorithm design~\citep{cormenintroduction}. 
In particular, many optimization problems in machine learning can be solved through {\em greedy selection} method. The method iteratively selects a subset of data items from input based on some greedy strategy. One representative example is the Gonzalez's algorithm for {\em $k$-center clustering}~\citep{GONZALEZ1985293}. Given a set of data items ({\em e.g.,} a point set in $\mathbb{R}^d$), the algorithm is to iteratively select $k$ items from the input; if one draws $k$ equal-sized balls centered at these $k$ items, the whole input data set can be covered by these balls and the radius is no larger than two times the optimal one (the formal definition for $k$-center clustering is shown in Section~\ref{sec-pre}). 

The algorithm is simple but has many important applications in the real world. One natural application is constructing a {\em coreset} for compressing large-scale data, especially when we want to maximize diversity or coverage~\citep{DBLP:conf/pods/IndykMMM14}. Another closely related application is {\em batch active learning}~\citep{sener2018active,DBLP:conf/iclr/ColemanYMMBLLZ20}. Most machine learning models heavily depend on high-quality labeled training datasets. However, because it is expensive to acquire a large amount of labeled data, we may only be able to select a small number of data items (via $k$-center clustering) to label in each round (as an active learning process).  

Another high dimensional optimization problem that depends on greedy selection is {\em convex hull approximation}~\citep{blum2019sparse,DBLP:journals/anor/AwasthiKZ20}, where the goal is to find a convex hull so that each data item can be approximately represented by the vertices. The problem has a number of applications in machine learning,  such as topic modeling, sparse approximation, and non-negative matrix factorization~\citep{DBLP:books/cu/20/0001M20}. Though the convex hull algorithms have been well studied in low dimensions~\citep{cgtextbook}, the problem in high dimensions is much more challenging. Similar to $k$-center clustering, a common idea for convex hull approximation is using greedy selection to find the vertices iteratively. 



Although these greedy selection methods enjoy promising performances in practice, they often suffer from high complexities when data sizes are extremely large. For instance, the vanilla Gonzalez's algorithm needs to run $k$ iterations and each iteration needs to scan the whole dataset in one pass (a detailed introduction on the previous work is shown in Section~\ref{sec-related}).  Similarly, the greedy selection method for convex hull approximation also needs to repeatedly scan the whole dataset and thus yields large runtime. So a natural question is: 

{\em \hspace{0.2in} Can we modify these greedy selection algorithms to  achieve lower time complexities, {\em e.g.,} sublinear time complexities that are independent of  input data size, and meanwhile preserve their quality guarantees?  }

%As  the rapid increase of data volumes in big data, designing sublinear time algorithms has become a popular topic in the past decades; more detailed discussion on sublinear time algorithms can be found in the survey papers~\cite{rubinfeld2006sublinear,czumaj2006sublinear}.
 
 \subsection{Related Work}
\label{sec-related}
 We introduce several important existing results related to this paper in this section. 

\textbf{$k$-center clustering.} As mentioned before, the greedy selection based $k$-center clustering algorithm~\citep{GONZALEZ1985293} can yield a $2$-approximation result; moreover,  it was proved that any approximation ratio lower than 2 implies $\mathrm{P}=\mathrm{NP}$~\citep{HS85}. To speed up the Gonzalez's algorithm, several improvements have been proposed before~\citep{DBLP:conf/stoc/FederG88,har2006fast}; however, they usually require some additional assumptions ({\em e.g.,} the dimensionality or intrinsic dimensionality should be small). To deal with large-scale data, a number of streaming algorithms which only need to read the data in one pass were introduced~\citep{DBLP:journals/siamcomp/CharikarCFM04,mccutchen2008streaming,DBLP:conf/icdt/Guha09,DBLP:journals/corr/abs-1802-09205}. The well known ``coreset'' technique is also applied to reduce the data size for $k$-center clustering~\citep{badoiu2002approximate,aghamolaei2019composable}, but their coreset construction algorithms already take at least linear time.  Furthermore, several uniform sampling based ideas were presented for achieving sublinear complexity for $k$-center clustering (with outliers)~\citep{charikar2003better,huang2018epsilon}. 


\textbf{Convex hull approximation.} Several elegant convex hull algorithms for low-dimensional space have been introduced in the community of computational geometry before~\citep{cgtextbook}. The high-dimensional convex hull approximation problem is closely related to non-negative matrix factorization and topic modeling~\citep{DBLP:books/cu/20/0001M20}. Roughly speaking, the vertices of the obtained convex hull can help us to generate the low rank non-negative matrices and discover the hidden topics. In general, this problem is intractable but it is possible to achieve an efficient solution under the {\em separability} assumption~\citep{DBLP:conf/nips/DonohoS03,DBLP:conf/focs/AroraGM12}. Recently, several practical algorithms with provable guarantees were also proposed, such as~\cite{blum2019sparse,DBLP:journals/anor/AwasthiKZ20,DBLP:conf/icml/AroraGHMMSWZ13}. 


\textbf{Other applications of greedy selection in machine learning.} Besides the aforementioned two problems, greedy selection also has several other applications in machine learning. To name a few: {\em submodular maximization}~\citep{nemhauser1978analysis}, {\em column subset selection}~\citep{DBLP:journals/kais/FarahatEGK15}, {\em reinforcement learning}~\citep{DBLP:conf/icml/Painter-WakefieldP12}, {\em sparse approximation}~\citep{DBLP:journals/tit/Tropp04}, and  SVM~\citep{gartner2009coresets}.




 
\subsection{Our Contributions}

\label{sec-our}
 In this paper, we aim to develop sublinear time algorithms for the $k$-center clustering and convex hull approximation problems. 
We assume the input data size and the dimensionality are both large. We combine the strategies of  greedy selection and randomization, and 
 show that the randomized greedy selection methods can achieve almost the same approximation guarantees, and meanwhile, the time complexities can be reduced to be sublinear. 

\textbf{Comparison with the algorithms of~\cite{DBLP:conf/aaai/MirzasoleimanBK15,DBLP:conf/icml/HassidimS17}.} Actually, the high complexity issue of greedy selection has been discussed in~\cite{DBLP:conf/aaai/MirzasoleimanBK15,DBLP:conf/icml/HassidimS17} for the submodular maximization problem. They showed that if the greedy selection step is replaced by random sampling, a quality guarantee still holds but the complexity ({\em i.e.,} the number of function evaluations) can be reduced to be linear. Our proposed algorithms are inspired by the similar stochastic intuition but with several important differences.  
%Our framework has two major differences with the methods of~\cite{DBLP:conf/aaai/MirzasoleimanBK15,DBLP:conf/icml/HassidimS17}. 
\textbf{(\rmnum{1})} First, both $k$-center clustering and convex hull approximation are geometric optimization problems in high dimensions whose objective functions are different from that of submodular maximization. 
%So our analyses also need some novel insights from geometry. 
\textbf{(\rmnum{2})} Second, our framework yields the sublinear time complexities that are independent of the number of input data items; this property is particularly important when we cannot access the input data  and can only take a small sample via an oracle each time ({\em e.g.}, due to privacy preserving or the challenge of data acquisition). \textbf{(\rmnum{3})} Finally, we also consider the scenario that the number of iterations for greedy selection is unknown. For example, the number of clusters ``$k$'' of the $k$-center clustering may not be given; instead, we may just run the Gonzalez's algorithm iteratively until the obtained radius is no larger than a pre-specified threshold $r_0>0$. 
We need to emphasize that designing the sublinear time algorithm becomes  much more challenging with such a change, since it will be difficult to set the sample size in each iteration and determine when the algorithm should terminate. To remedy these issues, we propose a novel stratified sampling method and design a sampling based stopping condition for the greedy selection. 


%the data size is extremely large or even infinity ({\em e.g.,} the data may be a continuous probability distribution in the space).  

\textbf{Comparison with the streaming and uniform sampling algorithms.} As mentioned in Section~\ref{sec-related}, the one-pass streaming algorithms~\citep{DBLP:journals/siamcomp/CharikarCFM04,mccutchen2008streaming,DBLP:conf/icdt/Guha09,DBLP:journals/corr/abs-1802-09205} can avoid repeatedly reading the input data; however, they still suffer from high time complexities ({\em e.g.,} the ``doubling algorithm''~\citep{DBLP:journals/siamcomp/CharikarCFM04} takes a total of $O\big(k(\log k)nd\big)$ time, which is even higher than the complexity of the vanilla Gonzalez's algorithm, where $n$ is the number of input points). On the other hand, our proposed sublinear time algorithms have complexities independent of $n$. 

It is also worth comparing our results with the uniform sampling algorithms for $k$-center clustering~\citep{charikar2003better,huang2018epsilon}. For example,  a simple uniform sample $S$ of size $\tilde{O}(\frac{kd}{\epsilon^2})$\footnote{The asymptotic notation $\tilde{O}(f)=O\big(f\cdot \mathtt{polylog}(\frac{kd}{\eta\epsilon})\big)$, where $\eta\in(0,1)$ is the parameter controlling the success probability of sampling.} can approximately represent the whole input data $P$ based on the theory of VC dimension~\citep{huang2018epsilon}, where $\epsilon\in(0,1)$ indicates the small fraction of uncovered points; that is, if one runs the $2$-approximate Gonzalez's algorithm on the sample $S$, the obtained $k$ balls  still form a $2$-approximate solution in terms of the whole input $P$, except for $\epsilon n $ uncovered points of $P$. The running time of the Gonzalez's algorithm on $S$ should be $\tilde{O}(k|S|d)=\tilde{O}(\frac{k^2d^2}{\epsilon^2})$. In Section~\ref{sec-ouralg}, we show that our algorithm takes $\tilde{O}(\frac{k^3d}{\epsilon})$ time (also with $\epsilon n $ uncovered points). Usually, $k$ is much smaller than the dimensionality $d$, and thus our improvement is significant. In particular, if $k$ is assumed to be constant, we improve their complexity by a factor of up to $\frac{d}{\epsilon}$. 

The reader may wonder whether a dimension reduction technique ({\em e.g.}, the JL-transform~\citep{dasgupta2003elementary}) can be applied. 
 Actually, the complexities of both our method and the uniform sampling approach can be reduced by the JL-transform, and our improvement is still significant (just replace the dimension $d$ by the new dimension $d'$ in both complexities).
Also, even if we apply the JL-transform, the reduced dimensionality could still be high (it is $O(\log |S|/\mu^2)$, where $|S|$ is the total sample size and $\mu$ is the pairwise distance distortion error). For example, if we let $\mu=0.01$, the new dimension is still high.  
 
 Besides the runtime, another benefit compared with uniform sampling is that we need a smaller sample size. Our algorithm takes $\tilde{O}(k^2/\epsilon)$ samples in total after $k$ iterations, which is much lower than  $\tilde{O}(kd/\epsilon^2)$
 if $k$ is not large. In particular, \textbf{our sample size is independent of the dimension $d$}. The dimension can be very high or even infinite if a kernel is used. The smaller sample size is also important in some specific settings such as relational databases~\citep{DBLP:journals/corr/abs-1911-06577}. It is very expensive to materialize the whole data matrix for a relational database, and a smaller sample size can significantly reduce the total computational complexity~\citep{DBLP:conf/sigmod/ZhaoC0HY18}. 

 
 %  \section{Preliminaries}
\section{PRELIMINARIES}
 \label{sec-pre}
  In this section, we introduce several important definitions that will be used throughout this paper. 
 Let $c\in\mathbb{R}^d$ and $r\geq 0$; we use $\mathbb{B}(c, r)$ to denote the ball centered at $c$ with radius $r$.  Also, given a set $S$ of points in $\mathbb{R}^d$, we use $\mathtt{conv}(S)$ to denote the convex hull of $S$. We use the function $\mathtt{dist}(p, U)$ to measure the shortest distance from a point $p$ to a set $U$, {\em i.e.,} $\mathtt{dist}(p, U):=\min_{q\in U}||p-q||$. 
 
 \begin{definition}[\textbf{$k$-center clustering}]
 \label{def-kc}
 Given a set $P$ of $n$ points in $\mathbb{R}^d$ and $k\in\mathbb{Z}^+$, the goal of $k$-center clustering is to find $k$ balls $\mathbb{B}(c_1, r), \cdots, \mathbb{B}(c_k, r)$ with the smallest radius $r$ to cover the set $P$, that is, $P$ is partitioned into $k$ clusters with each cluster being covered by an individual ball, and the radius $r$ is minimized. 
 \end{definition}
 \begin{remark}
 The $k$-center clustering problem can be also defined for any abstract metric, where the only difference is that the Euclidean distance is replaced by the distance defined in the metric. In fact, our proposed sublinear algorithms for $k$-center clustering in this paper can be  applied to any abstract metric with the same quality guarantees.  
 \end{remark}
 
 Let \textbf{$r_{\mathtt{opt}}$} be the radius of the optimal solution for the $k$-center clustering on $P$. For any solution having a radius $r\leq \lambda r_{\mathtt{opt}}$ with some $\lambda\geq 1$, we call it a ``\textbf{$\lambda$-approximation}''. 
 
 
 
 \begin{definition}[\textbf{convex hull approximation}]
 \label{def-ch}
 Given a set $P$ of $n$ points in $\mathbb{R}^d$ and an integer $k\geq 1$, the goal of convex hull approximation is to find a subset $P_c\subset P$ with $|P_c|=k$, such that the error, {\em i.e.,} 
 %\begin{eqnarray}
 $\max_{p\in P}\mathtt{dist}\big(p, \mathtt{conv}(P_c)\big)$ 
 % \label{for-ch}
 %\end{eqnarray}
 is minimized (so if all the  points of $P$ are covered by $\mathtt{conv}(P_c)$, the error is $0$). 
 \end{definition}
 \begin{remark}
 In general, we can remove the requirement ``$P_c\subset P$'', {\em i.e.,} $P_c$ can contain any points in the space. But we often want $P_c$ to be meaningful or interpretable in practice, and thus it is natural to require it to be a subset of the original input data.  
 \end{remark}
 Similar to $k$-center clustering, we can also define approximate solutions for convex hull approximation. But since the convex hull approximation is much more challenging, we often obtain bi-criteria approximations. Suppose $\alpha, \beta\geq 1$. Letting $\delta_{\mathtt{opt}}$ be the optimal error, a bi-criteria  $(\alpha, \beta)$-approximation means that the obtained convex hull has the error $\delta\leq \alpha \delta_{\mathtt{opt}}$ and the number of vertices $k'\leq \beta k$. 
 
 \textbf{The rest of this paper is organized as follows.}  In Section~\ref{sec-kcenter}, we propose our sublinear time algorithm for $k$-center clustering. In particular, we also consider the practical case that the number of clusters $k$ is not given (Section~\ref{sec-kcu}). Further, in Section~\ref{sec-ch} we consider developing sublinear time algorithm for convex hull approximation by extending the idea from Section~\ref{sec-kcu}. Finally, we present our experiments in Section~\ref{sec-exp}. 
 
 
 
 % \section{$k$-Center Clustering}
\section{$k$-CENTER CLUSTERING}
\label{sec-kcenter}
 In this section, we focus on the $k$-center clustering problem. For the sake of completeness, we briefly introduce the aforementioned $2$-approximate Gonzalez's algorithm~\citep{GONZALEZ1985293} first. 

\textbf{Gonzalez's algorithm.} It selects an arbitrary point, say $c_1$, from the input $P$ and lets $C=\{c_1\}$. In each of the following $k-1$ iterations, it selects a new point that has the largest distance to $C$ among the points of $P$ and adds it to $C$. Suppose $C=\{c_1, \cdots, c_k\}$, and then $P$ is covered by the $k$ balls $\mathbb{B}(c_1, r), \cdots, \mathbb{B}(c_k, r)$ with  $r\leq\min\{||c_i-c_j||\mid 1\leq i\neq j\leq k\}$. It is not difficult to prove that the obtained radius $r\leq 2 r_{\mathtt{opt}}$. It is also easy to know that the running time of the Gonzalez's algorithm is $O(knd)$. 
As mentioned before, a major drawback of the algorithm is the high complexity, especially when  $n$ and $d$ are  large.  


 
  \subsection{Our Sublinear Algorithm}
 \label{sec-ouralg}
  Our proposed algorithm can be viewed as a randomized version of the Gonzalez's algorithm. The key change is that we randomly select the next point for $C$ in each round, instead of always picking the furthest one. Below we prove that this strategy can achieve the same $2$-approximation except for a small loss in the number of covered points. 
 
 \begin{algorithm}[h]
    \caption{\textsc{Sublinear $k$-Center Clustering}}
    \label{alg-ourkc}
 \begin{algorithmic}
   \STATE {\bfseries Input:} A set $P$ of $n$ points in $\mathbb{R}^d$, $k\in\mathbb{Z}^+$, and two parameters $\eta, \epsilon\in (0,1)$. 
 %   \REPEAT
    \STATE
    \begin{enumerate}
    \item Initially, let $C=\{c_1\}$, where $c_1$ is an arbitrary point picked from $P$; $i=1$. 
    \item Repeat the following steps $k-1$ times:
    \begin{enumerate}
   \item Sample a set $Q$ of $\frac{k}{\epsilon}\log \frac{k}{\eta}$ points from $P$ uniformly at random. 
   \item Select the furthest point, say $q_0$, from $Q$ to $C$, {\em i.e.,} $q_0=\arg\max_{q\in Q}\mathtt{dist}(q, C)$. 
   \item Let $c_{i+1}=q_0$, $C=C\cup \{c_{i+1}\}$, and $i=i+1$. 
    \end{enumerate}
    \item Return $C$. 
      \end{enumerate}
  %   \STATE {\bfseries Output}  the leaves of $\mathcal{H}$.
      %     \UNTIL{The objective value becomes stable.}
 \end{algorithmic}
  \end{algorithm}
 
 
 \begin{theorem}
 \label{the-ourkc}
 Let $C=\{c_1, \cdots, c_k\}$ be the output from Algorithm~\ref{alg-ourkc}. With probability at least $1-\eta$, there exists a subset $\tilde{P}\subset P$ with size $|\tilde{P}|\geq (1-\epsilon)n$, such that $\tilde{P}$ is covered by $\cup^k_{i=1}\mathbb{B}(c_i, 2r_{\mathtt{opt}})$. 
 \end{theorem}
 To prove Theorem~\ref{the-ourkc}, we need the following claim first. 
 \begin{claim}
 \label{cla-sample}
 Let $U$ be a set of elements and $V\subseteq U$ with $\frac{|V|}{|U|}=\tau>0$. Given $\eta \in(0,1)$, we uniformly select a set $S$ of elements from $U$ at random. Then if $|S|\geq \frac{1}{\tau}\log\frac{1}{\eta}$, with probability at least $1-\eta$, $S$ contains at least one element from $V$.
 \end{claim}
 Actually, the above claim is a folklore result that has been presented in several papers before (such as~\cite{DX14}). Since each sampled element falls in $V$ with probability $\tau$, we know that the sample $S$ contains at least one element from $V$ with probability $1-(1-\tau)^{|S|}$. If we want  to guarantee $1-(1-\tau)^{|S|}\geq 1-\eta$, $|S|$ should be at least $\frac{\log 1/\eta}{\log 1/(1-\tau)}\leq\frac{1}{\tau}\log\frac{1}{\eta}$.
 
  \begin{proof}\textbf{(of Theorem~\ref{the-ourkc})}
 To help our analysis, we define $C_1:=\emptyset$, and $C_i :=\{c_1, c_2, \cdots, c_{i-1}\}$ for each $i=2,\cdots, k$ of Algorithm~\ref{alg-ourkc}. Further, we define 
 \begin{eqnarray}
 P_i:=\big\{p\in P\mid \mathtt{dist}(p, C_i)>\mathtt{dist}(c_i, C_i)\big\} \label{for-the-ourkc-1}
 \end{eqnarray} 
 for $2\leq i\leq k$. We also define $F_i$ to be the farthest $\frac{\epsilon}{k}|P|$ points from $P$ to $C_i$. Claim~\ref{cla-sample} implies that the sample $Q$ should contain at least one point from $F_i$ with probability at least $1-\frac{\eta}{k}$. If this is true, the selected $c_i$ should come from $F_i$ and thus $|P_i|\leq |F_i|$. Therefore, we have $|P_i| \leq \frac{\epsilon}{k}|P|$ with probability at least $1-\frac{\eta}{k}$. Through taking the union bound over all the $P_i$s, we have: with probability at least $(1-\frac{\eta}{k})^{k-1}> 1-\eta$, 
 \begin{eqnarray}
 \forall i=2, 3, \cdots, k, \hspace{0.2in} |P_i|\leq\frac{\epsilon}{k}|P|. \label{for-the-ourkc-2}
 \end{eqnarray}
 Let $\tilde{P}:=P\setminus\cup^k_{i=2}P_i$. It is easy to know the size 
 \begin{eqnarray}
 |\tilde{P}|\geq (1-\frac{\epsilon}{k}\times (k-1))|P|>(1-\epsilon)n.
 \end{eqnarray}
 Next, we only need to prove that $\tilde{P}$  is covered by $\cup^k_{i=1}\mathbb{B}(c_i, 2r_{\mathtt{opt}})$. We present the following lemma first.
 
 \begin{lemma}
 \label{lem-tildep}
% The set $C=\{c_1, c_2, \cdots, c_k\}$ is returned by Algorithm~\ref{alg-ourkc}. 
 For any point $p\in \tilde{P}$, $\mathtt{dist}(p, C)\leq \min_{1\leq i\neq i'\leq k}||c_i-c_{i'}||$. 
 \end{lemma}
  
  
 Let $O_1, O_2, \cdots, O_k$ be the $k$ clusters obtained from  the optimal solution, {\em i.e.,} $P=\cup^k_{i=1}O_i$ and each cluster $O_i$ can be covered by a ball with radius $r_{\mathtt{opt}}$. We consider two cases. Case \textbf{(\rmnum{1})}: $\{c_1, \cdots, c_k\}$ fall into the $k$ clusters $O_1, O_2, \cdots, O_k$ separately. Without loss of generality, we assume $c_i\in O_i$ for $i=1, 2, \cdots, k$. By using the triangle inequality, we know the input set $P$ is covered by $\cup^k_{i=1}\mathbb{B}(c_i, 2 r_{\mathtt{opt}})$. Consequently, $\tilde{P}$  is also covered by $\cup^k_{i=1}\mathbb{B}(c_i, 2r_{\mathtt{opt}})$. 
 
 Case \textbf{(\rmnum{2})}: there exist two points, say $c_{i_a}$ and $c_{i_b}$, of $C$ that belong to one optimal cluster, say $O_l$. Thus $||c_{i_a}-c_{i_b}||\leq 2r_{\mathtt{opt}}$. From Lemma~\ref{lem-tildep}, we know 
 \begin{eqnarray}
 \forall p\in\tilde{P}, \hspace{0.05in}\mathtt{dist}(p, C) &\leq& \min_{1\leq i\neq i'\leq k}||c_i-c_{i'}|| \nonumber\\
 & \leq &||c_{i_a}-c_{i_b}||\leq 2r_{\mathtt{opt}}.
 \end{eqnarray}
 Hence $\tilde{P}$  is covered by $\cup^k_{i=1}\mathbb{B}(c_i, 2 r_{\mathtt{opt}})$.
  \end{proof}
 
 \begin{proof}\textbf{(of Lemma~\ref{lem-tildep})}
  Suppose Lemma~\ref{lem-tildep} is not true. Then there exist some  $p_0\in\tilde{P}$ and two points $c_{i_1}$ and $c_{i_2}\in C$, such that 
 \begin{eqnarray}
 \mathtt{dist}(p_0, C)>||c_{i_1}-c_{i_2}||. \label{for-lem-tildep-1}
 \end{eqnarray}
 Without loss of generality, we assume $i_1<i_2$. Since $||c_{i_1}-c_{i_2}||\geq \mathtt{dist}(c_{i_2}, C_{i_2})$, the inequality (\ref{for-lem-tildep-1}) implies
 \begin{eqnarray}
 \mathtt{dist}(p_0, C)>\mathtt{dist}(c_{i_2}, C_{i_2}). 
 \end{eqnarray}
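 Moreover, since $C_{i_2}\subseteq C$, we have $\mathtt{dist}(p_0, C_{i_2})\geq\mathtt{dist}(p_0, C)>\mathtt{dist}(c_{i_2}, C_{i_2})$. So from (\ref{for-the-ourkc-1}) we know $p_0\in P_{i_2}$, which contradicts the assumption $p_0\in \tilde{P}=P\setminus\cup^k_{i=2}P_i$. 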
 \end{proof}
 
 
 \textbf{Time complexity.} It is easy to see that the time complexity of Algorithm~\ref{alg-ourkc}  is independent of $n$. It takes $k$ rounds, and each round needs to compute the distances from the sampled $\frac{k}{\epsilon}\log \frac{k}{\eta}$ points to $C$. So the total complexity is $O(k\times \frac{k}{\epsilon}\log \frac{k}{\eta}\times kd)=O(\frac{k^3}{\epsilon}d\log\frac{k}{\eta})$. 
 %Below, we analyze the theoretical quality guarantee. 
 
 
 
 
 %
 %
 %\subsection{Extension \Rmnum{1}: $k$-Center Clustering for Continuous Probability Distribution}
 %\label{sec-kcc}
 %
 %We consider the extension of Theorem~\ref{the-ourkc} for (continuous or discrete) probability distribution. 
 In some scenarios, we may not be able to access the whole data, {\em e.g.,}  due to privacy preserving or the challenge of data acquisition. 
 %For example, in a data marketplace, a buyer wants to buy a dataset ``$P$'' from an owner\cite{DBLP:journals/jacm/KoutrisUBHS15}.  Obviously, the owner cannot leak the whole P to the buyer before completing the deal. To estimate the value of $P$, the owner may allow the buyer to sample a small number of data items from P to study its distribution. 
 %We present several such scenarios in Section~\ref{sec-app-cor} of our supplement.
 Instead, we may be only allowed to take a small sample each time. 
   %have an oracle to sample data based on the density function. 
 Specifically, we assume the data is a (continuous or discrete) probability distribution with the probability  density function $f$ in $\Omega\subset\mathbb{R}^d$, where $\int_{p\in \Omega}f(p)\mathtt{d} p=1$; the function $f$ can be hidden and we only assume that there is an oracle to sample data based on $f$. %We still use $O_1, O_2, \cdots, O_k$ to denote the optimal $k$ clusters as Section~\ref{sec-ouralg}. 
 Obviously, it is prohibitive to directly run the Gonzalez's algorithm in such scenario. On the other hand, our proposed Algorithm~\ref{alg-ourkc} can be naturally applied to solve this problem because it only takes a random sample in each round. 
 The following result is a straightforward extension of Theorem~\ref{the-ourkc}. 
 
 
 \begin{corollary}
 \label{the-con}
 We run Algorithm~\ref{alg-ourkc} on a (continuous or discrete) probability distribution over $\Omega$; each sampled point is taken by an oracle based on the probability  density function $f$. With probability at least $1-\eta$, there exists a subset $\tilde{\Omega}\subset \Omega$ with the integral $\int_{p\in\tilde{\Omega}}f(p)\mathtt{d} p\geq  1-\epsilon$, such that $\tilde{\Omega}$ is covered by $\cup^k_{i=1}\mathbb{B}(c_i, 2r_{\mathtt{opt}})$. 
  \end{corollary}
 
 %Further, we consider adding two realistic assumptions to achieve a solution covering the whole domain $\Omega$ (instead of a subset $\tilde{\Omega}$).  
 %
 %
 %\begin{assumption}
 %\label{ass-1}
 %There exists a constant $\alpha>0$ such that $\frac{\min_{p\in \Omega}f(p)}{\max_{p\in \Omega} f(p)}\geq \alpha$. 
 %\end{assumption}
 %
 %Before proposing the second assumption, we need to introduce an important definition for measuring intrinsic dimension for high dimensional data. 
 %
 %%For any $p\in \mathbb{R}^d$ and $r\geq 0$, we use $Ball(p, r)=\{q\in  \mathbb{R}^d\mid ||q-p||\leq r\}$ to indicate the ball of radius $r$ around $p$. 
 %
 %\begin{definition}[Doubling Dimension]
 %\label{def-dd}
 %The doubling dimension of a set $P\subset\mathbb{R}^d$ is the smallest number $\rho$, such that for any $p\in P$ and $r\geq 0$, $P\cap \mathbb{B}(p, 2r)$ is always covered by the union of at most $2^\rho$ balls with radius $r$.
 %\end{definition}
 %\begin{remark}
 %Usually, the doubling dimension is defined for an abstract metric (such as~\cite{har2006fast,talwar2004bypassing,DBLP:journals/talg/ChanGMZ16}). Here, since we focus on the applications for high-dimensional data with low intrinsic dimension, we directly describe the doubling dimension for point set in high-dimensional Euclidean space. 
 %\end{remark}
 %
 %\begin{assumption}
 %\label{ass-2}
 %The input data $P$ has a constant doubling dimension $\rho>0$. 
 %\end{assumption}
 %
 %
 %%As a warm-up, we prove the following result first. 
 %
 %
 %
 %
 %\begin{algorithm}[h]
 %   \caption{\textsc{Sublinear $k$-Center Clustering \Rmnum{2}}}
 %   \label{alg-ourkc2}
 %\begin{algorithmic}
 %  \STATE {\bfseries Input:} A continuous probability distribution $P$ over $\Omega\subset\mathbb{R}^d$ under Assumption~\ref{ass-1} and \ref{ass-2}, $k\in\mathbb{Z}^+$, and two parameters $\eta, \sigma\in (0,1)$. 
 %%   \REPEAT
 %   \STATE
 %   \begin{enumerate}
 %   \item Initially, let $C=\{c_1\}$, where $c_1$ is an arbitrary point picked from $P$; $i=1$. 
 %   \item Repeat the following steps $\frac{k}{(\sigma/4)^{\rho}}-1$ times:
 %   \begin{enumerate}
 %  \item Randomly pick a set $Q$ of $\frac{1}{\alpha}(\frac{k}{ (\sigma/4)^\rho})^2\cdot (\log \frac{k}{\eta}+\rho\log \frac{1}{\sigma})$ points from $P$. 
 %  \item Select the furthest point, say $q_0$, from $Q$ to $C$, {\em i.e.,} $q_0=\arg_{q\in Q}\max\mathtt{dist}(q, C)$. 
 %  \item Let $c_{i+1}=q_0$, $C=C\cup \{c_{i+1}\}$, and $i=i+1$. 
 %   \end{enumerate}
 %   \item Run the Gonzalez's algorithm on $C$ and return the obtained $k$ cluster centers $\hat{C}=\{\hat{c}_1, \cdots, \hat{c}_k\}$. 
 %     \end{enumerate}
 % %   \STATE {\bfseries Output}  the leaves of $\mathcal{H}$.
 %     %     \UNTIL{The objective value becomes stable.}
 %\end{algorithmic}
 % \end{algorithm}
 %
 %\begin{theorem}
 %\label{the-con-1}
 %We run Algorithm~\ref{alg-ourkc2}  and suppose Assumption~\ref{ass-1} and \ref{ass-2} are true. With probability at least $1-\eta$, $P$ is covered by $\cup^k_{j=1}\mathbb{B}(\hat{c}_j, (2+\sigma) r_{\mathtt{opt}})$. 
 %\end{theorem} 
 %\begin{proof}
 %Recall $r_{\mathtt{opt}}$ is the radius of the optimal $k$-center clustering on $P$, that is, $P$ can be covered by $k$ balls with radius $r_{\mathtt{opt}}$. If we repeatedly apply Definition~\ref{def-dd} $\log\frac{4}{\sigma}$ times, we know that $P$ can be covered by $2^{\rho\log \frac{4}{\sigma}}k=\frac{k}{(\sigma/4)^\rho}$ balls with radius $\frac{\sigma}{4} r_{\mathtt{opt}}$. Thus, we can view the input $P$ as an instance of $\frac{k}{(\sigma/4)^\rho}$-center clustering with the optimal radius no larger than $\frac{\sigma}{4} r_{\mathtt{opt}}$. We denote the corresponding optimal clusters as $D_1, D_2, \cdots, D_{\frac{k}{(\sigma/4)^\rho}}$, respectively. From Theorem~\ref{the-con}, we know that with probability at least $1-\eta$, there exists a subset $\tilde{\Omega}\subset \Omega$ with the integral 
 %\begin{eqnarray}
 %\int_{p\in\tilde{\Omega}}f(p)\mathtt{d} p> 1-\frac{\alpha}{k/(\sigma/4)^\rho}, \label{for-the-con-1-1}
 %\end{eqnarray}
 %such that $\tilde{\Omega}$ is covered by $\cup^{\frac{k}{(\sigma/4)^\rho}}_{j=1}\mathbb{B}(c_j, 2\times \frac{\sigma}{4} r_{\mathtt{opt}})$ (we just need to replace $k$ by $\frac{k}{(\sigma/4)^\rho}$ and $\epsilon$ by $\frac{\alpha}{k/(\sigma/4)^\rho}$). 
 %Meanwhile, from Assumption~\ref{ass-1} we know that for each optimal cluster $D_j$, $1\leq j\leq \frac{k}{(\sigma/4)^\rho}$, the size
 %\begin{eqnarray}
 %\int_{p\in D_j}f(p)\mathtt{d} p\geq \frac{\alpha}{k/(\sigma/4)^\rho}. 
 %\end{eqnarray}
 %Together with (\ref{for-the-con-1-1}), we know that for any $j$, $\tilde{\Omega}\cap D_j\neq \emptyset$. Since each $D_j$ has the radius no larger than $\frac{\sigma}{4} r_{\mathtt{opt}}$, we know 
 %\begin{eqnarray}
 %D_j\subset \cup^{\frac{k}{(\sigma/4)^\rho}}_{j=1}\mathbb{B}(c_j, 2\times \frac{\sigma}{4} r_{\mathtt{opt}}+2\times \frac{\sigma}{4} r_{\mathtt{opt}})=\cup^{\frac{k}{(\sigma/4)^\rho}}_{j=1}\mathbb{B}(c_j, \sigma r_{\mathtt{opt}}). 
 %\end{eqnarray}
 %Consequently, 
 %the whole input data
 %\begin{eqnarray}
 %P\subset\cup^{\frac{k}{(\sigma/4)^\rho}}_{j=1}\mathbb{B}(c_j, \sigma r_{\mathtt{opt}}). \label{for-the-con-1-2}
 %\end{eqnarray} 
 %In step 3, Algorithm~\ref{alg-ourkc2} runs the $2$-approximate Gonzalez's algorithm on $C$. Since $C\subset P$, the optimal radius for $C$ should be no larger than $r_{\mathtt{opt}}$. Namely, 
 %\begin{eqnarray}
 %C\subset\cup^k_{j=1}\mathbb{B}(\hat{c}_j, 2r_{\mathtt{opt}}). \label{for-the-con-1-3}
 %\end{eqnarray}
 %Combining (\ref{for-the-con-1-2}) and (\ref{for-the-con-1-3}), we have
 %\begin{eqnarray}
 %P\subset\cup^k_{j=1}\mathbb{B}(\hat{c}_j, (2+\sigma)r_{\mathtt{opt}}).
 %\end{eqnarray}
 %So we complete the proof for Theorem~\ref{the-con-1}. 
 %\end{proof}
 %
 %\textbf{The time complexity of Algorithm~\ref{alg-ourkc2}.} It is not difficult to calculate the runtime which is $O\Big(\frac{d}{\alpha}\big(\frac{k}{(\sigma/4)^\rho}\big)^4(\log \frac{k}{\eta}+\rho\log \frac{1}{\sigma})\Big)$. 
 %
 
  \subsection{When $k$ Is Not Given}
 \label{sec-kcu}
  
 In many real scenarios, the number of clusters $k$ is often not given. For instance, we may only have a threshold $r_0>0$ for the radius; so we just try to perform the $k$-center clustering algorithm for different values of $k$ until the obtained radius is no larger than $r_0$. The reader may realize that this problem is related to the well-known {\em geometric set cover} problem~\citep{DBLP:journals/dcg/BronnimannG95,DBLP:journals/dcg/AgarwalP20}; however, existing geometric set cover algorithms often have large (super-linear) running times and can only handle the low-dimensional case. Actually, the geometric set cover problem is NP-hard and has only constant-factor approximations in the 2D plane (the problem is even harder in high dimensions).
 
 
 In this paper, we simplify the problem and  consider a practical approach: using the Gonzalez's algorithm to achieve our goal. Suppose the given set $P$ can be covered by $\tilde{k}\in \mathbb{Z}^+$ balls with radius $r_0/2$ ({\em i.e.,} $\tilde{k}$ is the value that the optimal radius of $\tilde{k}$-center clustering on $P$ is no larger than $r_0/2$). Then, if we just run the Gonzalez's algorithm  iteratively, the resulting radius will reach $r_0$ within at most $\tilde{k}$ rounds (because it is a $2$-approximation algorithm). \textbf{Now we discuss how to implement this procedure in sublinear time. }
 We cannot directly adapt this procedure to our sublinear Algorithm~\ref{alg-ourkc}, due to the following two issues. 
 \textbf{(1)} The sample size $\frac{k}{\epsilon}\log \frac{k}{\eta}$ in step 2(a) depends on a given $k$ (also note that Algorithm~\ref{alg-ourkc} is a randomized algorithm and its success probability depends on the sample size); \textbf{(2)} we do not know when to terminate if $k$ is not given. 
  
 To resolve these two issues,  we introduce a \textbf{stratified sampling method}. Let $k_0\geq 1$ be any fixed constant. Imagine we run  step 2(a)-2(c) of Algorithm~\ref{alg-ourkc} iteratively. We partition the process into different phases and modify the sample size in step 2(a)  for each phase accordingly:
  \begin{itemize}
 \item \textbf{Phase $t=0$:} for $i=1, 2, \cdots, k_0$, we set  $|Q|=\frac{2k_0}{\epsilon}\log\frac{k_0}{\eta}$. 
 \item \textbf{Phase $t\geq 1$:} for $i=\sum^{t-1}_{s=0}2^s k_0+1, \sum^{t-1}_{s=0}2^s k_0+2, \cdots, \sum^{t}_{s=0}2^s k_0$, we set $|Q|=2^{2t}\frac{2k_0}{\epsilon}\log\frac{2^{2t}k_0}{\eta}$. 
 \end{itemize}
  
 
  So phase $t$ contains $2^t k_0$ iterations. The sample size also increases from phase $t$ to phase $t+1$. 
  
  
  
 For completeness, we also need to set the stopping condition. Suppose $r_0>0$ is the given threshold. At the end of each $i$-th iteration, we take a sample $S$ from $P$ uniformly at random, and compute the ratio 
 \begin{eqnarray}
 \tau=\frac{\Big|S\setminus \big(\cup^i_{l=1}\mathbb{B}(c_l, r_0)\big)\Big|}{|S|}. \label{for-tau}
 \end{eqnarray}
 The following lemma introduces an \textbf{oracle} that can help us to decide when to terminate.
 
 \begin{lemma}
 \label{lem-terminate}
 Suppose $\eta_0\in (0,1)$. We set the sample size $|S|\geq \frac{12}{\eta_0\epsilon}\log\frac{2}{\eta_0}$. With probability at least $1-\eta_0$, the following oracle returns the correct answer:  if $\tau\leq \frac{3}{2}\epsilon$, return ``$\Big|P\setminus \big(\cup^i_{l=1}\mathbb{B}(c_l, r_0)\big)\Big|\leq 3\epsilon n$''; else, return ``$\Big|P\setminus \big(\cup^i_{l=1}\mathbb{B}(c_l, r_0)\big)\Big|>\epsilon n$''. 
 \end{lemma} 
 \begin{proof}
 For convenience, we use $\tilde{\epsilon}$ to denote the ratio $\frac{\big|P\setminus \big(\cup^i_{l=1}\mathbb{B}(c_l, r_0)\big)\big|}{n}$. We consider two cases: (\rmnum{1}) $\tilde{\epsilon}\leq \eta_0\epsilon$ and (\rmnum{2}) $\tilde{\epsilon}> \eta_0\epsilon$. For case (\rmnum{1}), $\Big|P\setminus \big(\cup^i_{l=1}\mathbb{B}(c_l, r_0)\big)\Big|=\tilde{\epsilon} n\leq \eta_0\epsilon n<3\epsilon n$. Since $E[\tau]=\tilde{\epsilon}\leq \eta_0\epsilon$, Markov's inequality implies that $\tau\leq \frac{1}{\eta_0}\times\eta_0\epsilon=\epsilon<\frac{3}{2}\epsilon$ with probability at least $1-\eta_0$.  Thus, it returns ``$\Big|P\setminus \big(\cup^i_{l=1}\mathbb{B}(c_l, r_0)\big)\Big|\leq 3\epsilon n$'' which is a correct answer, with probability at least $1-\eta_0$. So we  focus on the second case below. 
 
 We use the Chernoff bound~\citep{alon2004probabilistic}. Define $|S|$ random variables $\{y_1, \cdots, y_{|S|}\}$: for each $1\leq j\leq |S|$, $y_j=1$ if the $j$-th sampled element falls in $P\setminus \big(\cup^i_{l=1}\mathbb{B}(c_l, r_0)\big)$, otherwise, $y_j=0$. So $E[y_j]=\tilde{\epsilon}$ for each $y_j$. As a consequence, we have
 \begin{align}
 \textbf{Pr}\big(\big|\sum^{|S|}_{j=1}y_j-\tilde{\epsilon} |S|\big|\leq \frac{1}{2}\tilde{\epsilon}|S|\big)\geq 1-2e^{-\frac{\tilde{\epsilon}}{12}|S|}. \label{for-oct11-1}
 \end{align}
 Since we assume $\tilde{\epsilon}> \eta_0\epsilon$, if $|S|\geq\frac{12}{\eta_0\epsilon}\log\frac{2}{\eta_0}$, the above (\ref{for-oct11-1}) implies that with probability at least $1-\eta_0$, $\big|\sum^{|S|}_{j=1}y_j-\tilde{\epsilon} |S|\big|\leq \frac{1}{2}\tilde{\epsilon}|S|$, {\em i.e.,}
 \begin{eqnarray}
 \tau=\frac{\sum^{|S|}_{j=1}y_j}{|S|}\in [\frac{1}{2}\tilde{\epsilon}, \frac{3}{2}\tilde{\epsilon}]. \label{for-terminate-1}
 \end{eqnarray}
 Therefore,  if $\tau\leq \frac{3}{2}\epsilon$, we know $\frac{1}{2}\tilde{\epsilon}\leq \frac{3}{2}\epsilon$ from (\ref{for-terminate-1}),  and it implies $\tilde{\epsilon}\leq 3\epsilon$. Otherwise, we know $\frac{3}{2}\tilde{\epsilon}>\frac{3}{2}\epsilon$ and it implies $\tilde{\epsilon}>\epsilon$. 
 \end{proof}
 
 Now, we are ready to present our algorithm for the case without knowing $\tilde{k}$.
 % Suppose $\tilde{k}$ is the number of clusters returned by the Gonzalez's algorithm with a given threshold $r_0>0$ for the radius. Namely,  the Gonzalez's algorithm needs to run $\tilde{k}$ iterations until the resulting radius reduces to be no larger than $r_0$. 
  Let $i_{\mathtt{ter}}$ be the size of $C$ when Algorithm~\ref{alg-ourkc3} terminates. To evaluate the performance of the algorithm, we need to compare $i_{\mathtt{ter}}$ with $\tilde{k}$ and investigate the number of points that are covered by $\cup^{i_{\mathtt{ter}}}_{j=1}\mathbb{B}(c_j,r_0)$.
 
 
 
 
 \begin{algorithm}[h]
    \caption{\textsc{Sublinear $k$-Center Clustering \Rmnum{2}}}
    \label{alg-ourkc3}
 \begin{algorithmic}
   \STATE {\bfseries Input:} A set $P$ of $n$ points in $\mathbb{R}^d$, a threshold $r_0>0$, an arbitrary constant integer  $k_0\in\mathbb{Z}^+$, and two parameters $\eta, \epsilon\in (0,1)$. 
 %   \REPEAT
    \STATE
    \begin{enumerate}
    \item Initially, let $C=\{c_1\}$, where $c_1$ is an arbitrary point picked from $P$; $t=0$ and $i=1$. 
    \item Repeat the following steps as the stratified sampling procedure:
    \begin{enumerate}
       \item Take a sample $S$ from $P$ uniformly at random, where $|S|= \frac{12}{\eta_0\epsilon}\log\frac{2}{\eta_0}$ and $\eta_0=\frac{\eta}{2^{2t}k_0}$. 
    \item  Repeat the following steps $2^t k_0$ times ({\em i.e.,} phase $t$):
    \begin{enumerate}
    \item Randomly pick a set $Q$ from $P$, where  $|Q|= 2^{2t}\frac{2k_0}{\epsilon}\log\frac{2^{2t}k_0}{\eta}$. 
    \item Let $q_0$ be the furthest point from $Q$ to $C$, {\em i.e.,} $q_0=\arg_{q\in Q}\max\mathtt{dist}(q, C)$. 
%   \item Randomly pick a set $Q$ of $2^{2t}\frac{2k_0}{\epsilon}\log\frac{2^{2t}k_0}{2\eta}$ points from $P$. 
%   \item Select the furthest point, say $q_0$, from $Q$ to $C$, {\em i.e.,} $q_0=\arg_{q\in Q}\max\mathtt{dist}(q, C)$. 
   \item Let $c_{i+1}=q_0$, $C=C\cup \{c_{i+1}\}$, and $i=i+1$. 
   \item Apply Lemma~\ref{lem-terminate} as the oracle (using the sample $S$ from step 2(a)) to determine whether to terminate: if it returns ``$\Big|P\setminus \big(\cup^i_{l=1}\mathbb{B}(c_l, r_0)\big)\Big|\leq 3\epsilon n$'', stop the algorithm, set $i_{\mathtt{ter}}=i$, and return $C$. 
      \end{enumerate}
    \item $t=t+1$. 
    \end{enumerate}
 %   \item Return $C$. 
      \end{enumerate}
  %   \STATE {\bfseries Output}  the leaves of $\mathcal{H}$.
      %     \UNTIL{The objective value becomes stable.}
 \end{algorithmic}
  \end{algorithm}
 
 
 \begin{theorem}
 \label{the-kcu}
 Let $C=\{c_1, \cdots, c_{i_\mathtt{ter}}\}$ be the output from Algorithm~\ref{alg-ourkc3}.  
 With probability at least $1-4\eta$,  $i_{\mathtt{ter}}\leq \tilde{k}$, and  there exists a subset $\tilde{P}\subset P$ with size $|\tilde{P}|\geq (1-3\epsilon)n$, such that $\tilde{P}$ is covered by $\cup^{i_{\mathtt{ter}}}_{j=1}\mathbb{B}(c_j,r_0)$. 
 \end{theorem}
 \begin{proof}
 To prove Theorem~\ref{the-kcu}, we first imagine the ``fancied'' scenario that $\tilde{k}$ is given: we just run Algorithm~\ref{alg-ourkc} with $k=\tilde{k}$ and $|Q|=\frac{\tilde{k}}{\epsilon}\log \frac{\tilde{k}}{\eta}$. 
 Recall the proof of Theorem~\ref{the-ourkc}, where we define a sequence of subsets $P_2, P_3, \cdots, P_{\tilde{k}}$ and define $\tilde{P}=P\setminus\cup^{\tilde{k}}_{i=2}P_i$. 
 To guarantee $|\tilde{P}|\geq (1-\epsilon)n$, we prove that each $P_i$ contains at most $\frac{\epsilon}{\tilde{k}}n$ points. For Algorithm~\ref{alg-ourkc3}, we also define a sequence of subsets  $P_2, P_3, \cdots, P_{\tilde{k}}$ (by using (\ref{for-the-ourkc-1})), but we need to modify their sizes. At each $t$-th phase, since we have the sample size  $2^{2t}\frac{2k_0}{\epsilon}\log\frac{2^{2t}k_0}{\eta}$, by using the similar idea from the proof of Theorem~\ref{the-ourkc}     we know that the size 
 \begin{eqnarray}
 |P_i|\leq \frac{\epsilon}{2^{2t}\times 2k_0}n, \text{with probability  $\geq1-\frac{\eta}{2^{2t}k_0}$.} \label{for-the-kcu-1}
 \end{eqnarray}
 Suppose we run Algorithm~\ref{alg-ourkc3} until $i=\tilde{k}$. Let $t_0$ be the index of the last phase that the algorithm enters ({\em i.e.,} it runs phases $0, 1, \cdots, t_0$). Consequently, we have 
 \begin{eqnarray}
\frac{\big|\cup^{\tilde{k}}_{i=2}P_i\big|}{n} \leq& \frac{\epsilon}{2k_0}\times k_0+\frac{\epsilon}{2^2\times 2k_0}\times 2k_0 \nonumber \\
&+\cdots+\frac{\epsilon}{2^{2t_0}\times 2k_0}\times 2^{t_0}k_0 \nonumber\\
=&\frac{\epsilon}{2}(1+\frac{1}{2}+\cdots+\frac{1}{2^{t_0}})\leq\epsilon.\label{for-the-kcu-2}
 \end{eqnarray}
 So we can still guarantee $|\tilde{P}|=|P\setminus\cup^{\tilde{k}}_{i=2}P_i|\geq (1-\epsilon)n$. Furthermore, the total success probability is at least 
%  \begin{small}
 \begin{eqnarray}
 &(1-\frac{\eta}{k_0})^{k_0}\times(1-\frac{\eta}{2^2k_0})^{2k_0} \nonumber\\
 &\times\cdots\times(1-\frac{\eta}{2^{2t_0}k_0})^{2^{t_0}k_0}\nonumber\\
&\geq(1-\eta)\times(1-\frac{\eta}{2})\times\cdots\times(1-\frac{\eta}{2^{t_0}}) \nonumber\\
 &\geq 1-(1+\frac{1}{2}+\cdots+\frac{1}{2^{t_0}})\eta>1-2\eta. \label{for-the-kcu-3}
 \end{eqnarray}
% \end{small}

 
 
 The remaining issue is that we do not know the value of $\tilde{k}$ in reality (in other words, we do not know when to terminate the algorithm). Therefore, we apply Lemma~\ref{lem-terminate} as an oracle in step~2(b)(\rmnum{4}), where the success probability for each time is $1-\eta_0=1-\frac{\eta}{2^{2t}k_0}$. When it returns ``$\Big|P\setminus \big(\cup^i_{l=1}\mathbb{B}(c_l, r_0)\big)\Big|>\epsilon n$'', we know that the algorithm needs to continue. We stop the algorithm when it returns ``$\Big|P\setminus \big(\cup^i_{l=1}\mathbb{B}(c_l, r_0)\big)\Big|\leq 3\epsilon n$''. Since we relax the error of covering number to be $3\epsilon>\epsilon$, we know that $i_{\mathtt{ter}}$ should be no larger than $\tilde{k}$. By using the similar idea of (\ref{for-the-kcu-3}), we can obtain the overall success probability of the oracle that is at least 
 \begin{eqnarray}
 1-2\eta. \label{for-the-kcu-4}
 \end{eqnarray}
 
 Combining (\ref{for-the-kcu-3}) and (\ref{for-the-kcu-4}), the overall success probability of Algorithm~\ref{alg-ourkc3} is at least $1-4\eta$. 
 \end{proof}
 
  
 \textbf{The time complexity of Algorithm~\ref{alg-ourkc3}.} We analyze the runtime for each phase. We set $k_0\geq 2$ to be a constant integer.  At the $t$-th phase, step (b)(\rmnum{1})-(\rmnum{3}) take $O(\frac{2^{3t}}{\epsilon}\log\frac{2^{2t}}{\eta}d)$ time; step (b)(\rmnum{4}) takes $O(\frac{2^{2t}}{\eta\epsilon}\log\frac{2^{2t}}{\eta} d)$ time. Also, the phase repeats step (b)(\rmnum{1})-(\rmnum{4}) $O(2^{t})$ times. Thus, the  $t$-th phase takes  $O((2^t+\frac{1}{\eta})\frac{2^{3t}}{\epsilon}\log\frac{2^{2t}}{\eta} d)$ time. 
  Let $t_0$ be the index of the last phase. Then we can calculate the bounds for $\tilde{k}$: 
 \begin{eqnarray}
 \sum^{t_0-1}_{s=0}2^s k_0<\tilde{k}\leq \sum^{t_0}_{s=0}2^s k_0,
 \end{eqnarray}
 which implies $t_0\leq \log \frac{\tilde{k}}{k_0}+1\leq \log\tilde{k}$. So the total time complexity of Algorithm~\ref{alg-ourkc3} is $O((\tilde{k}+\frac{1}{\eta})\frac{\tilde{k}^{3}}{\epsilon}\log\frac{\tilde{k}}{\eta} d)$.
 Compared with the case that $\tilde{k}$ is given, the runtime is increased by only a factor $(\tilde{k}+\frac{1}{\eta})$ (the runtime of Algorithm~\ref{alg-ourkc} is $O(\frac{\tilde{k}^{3}}{\epsilon}\log\frac{\tilde{k}}{\eta} d)$). 
 
 We also have the following result for Algorithm~\ref{alg-ourkc3}, which is similar to Corollary~\ref{the-con}.
 
 \begin{corollary}
 \label{the-con2}
 We run Algorithm~\ref{alg-ourkc3} on a (continuous or discrete) probability distribution over $\Omega$; each sampled point is taken by an oracle based on the probability density function $f$. 
 With probability at least $1-4\eta$,  $i_{\mathtt{ter}}\leq \tilde{k}$, and 
 there exists a subset $\tilde{\Omega}\subset \Omega$ with the integral $\int_{p\in\tilde{\Omega}}f(p)\mathtt{d} p\geq  1-3\epsilon$, such that $\tilde{\Omega}$ is covered by $\cup^{i_{\mathtt{ter}}}_{i=1}\mathbb{B}(c_i,r_0)$. 
 %
 %
 %
 % there exists a subset $\tilde{P}\subset P$ with size $|\tilde{P}|\geq (1-3\epsilon)n$, such that $\tilde{P}$ is covered by $\cup^{i_{\mathtt{ter}}}_{j=1}\mathbb{B}(c_j,r_0)$. 
 %
 %
 %
 %
 %With probability at least $1-\eta$, there exists a subset $\tilde{\Omega}\subset \Omega$ with the integral $\int_{p\in\tilde{\Omega}}f(p)\mathtt{d} p\geq  1-\epsilon$, such that $\tilde{\Omega}$ is covered by $\cup^k_{j=1}\mathbb{B}(c_j, 2r_{\mathtt{opt}})$. 
  \end{corollary}
 
 
 
 %  \section{Convex Hull Approximation in High Dimensions}
 \section{CONVEX HULL APPROXIMATION IN HIGH DIMENSIONS}
 \label{sec-ch}
  
  
  
  
  
  
%  Blum {\em et al.}
 \cite{blum2019sparse} introduced a simple greedy convex hull approximation algorithm that is  similar in spirit to the Gonzalez's algorithm for $k$-center clustering. Given an instance $P\subset\mathbb{R}^d$, it also maintains a set $C$ that contains an arbitrarily selected $p\in P$ at the beginning. In each round, the algorithm always selects the farthest point to $\mathtt{conv}(C)$ and adds it to $C$, until some specified stopping condition is satisfied. For ease of presentation, we assume that $P$ is contained in a unit ball of $\mathbb{R}^d$. The algorithm yields a bi-criteria approximate result: given an error parameter $\delta\in(0,1)$, suppose $k_{\mathtt{opt}}=\min\big\{k\mid Q\subset P, |Q|=k, \max_{p\in P}\mathtt{dist}(p, \mathtt{conv}(Q))\leq \delta\big\}$; the algorithm can yield a subset $C\subset P$ such that 
  \begin{eqnarray}
  |C|&=&O(k_{\mathtt{opt}}/\delta^{2/3}) \nonumber \\
  \mathtt{dist}(p, \mathtt{conv}(C))&\leq& 8\delta^{1/3}+\delta, \forall p\in P. \label{for-blum}
   \end{eqnarray}
  
  We consider applying our previous sampling idea to implement this convex hull approximation algorithm in sublinear time. Here, we have the same issue as Section~\ref{sec-kcu}, that is, we do not know the exact value of $k_{\mathtt{opt}}$ so that we cannot determine the sample size in each iteration and when to terminate. Thus we apply the same stratified sampling method. We also use Lemma~\ref{lem-terminate} as the oracle to determine whether the stopping condition is satisfied. 
  
  A minor technical issue for implementation is that it is costly to compute the distance from a given point to  a convex hull (it requires solving a quadratic program to obtain the exact result); instead we can apply the Gilbert's algorithm~\citep{gilbert1966iterative,gartner2009coresets} or some other variants like the Triangle algorithm~\citep{DBLP:journals/anor/AwasthiKZ20} to compute an approximate solution efficiently. Thus, we need another small parameter $\xi\in (0,1)$ to indicate the approximation error induced by this step. Compared with the ratio ``$\tau$'' for $k$-center clustering, we add an extra factor $(1+\xi)$ to the distance threshold in the definition of $\tau$ below.
 
Let $r_0=8\delta^{1/3}+\delta$. We use $C_i$ to denote the set of selected vertices $\{c_1, \cdots, c_i\}$ at the first $i$  rounds. For convenience, we use $\mathtt{conv}(U,r)$ to denote the set
\[\big\{p\mid p\in \mathbb{R}^d, \mathtt{dist}\big(p, \mathtt{conv}(U)\big)\leq r\big\}\]
for any given set $U$ and $r\geq 0$. Then, we compute the ratio 
\begin{eqnarray}
\tau=\frac{\Big|S\setminus  \mathtt{conv}(C_i, (1+\xi)r_0) \Big|}{|S|}. \label{for-tau-2}
\end{eqnarray}
Similar  to the case of $k$-center clustering, the following lemma introduces an \textbf{oracle} that can help us to decide when to terminate (the proof is almost identical to that of Lemma~\ref{lem-terminate}).


\begin{algorithm}[h]
  \caption{\textsc{Sublinear Convex Hull Approximation}}
  \label{alg-ourkc3-2}
\begin{algorithmic}
 \STATE {\bfseries Input:} A set $P$ of $n$ points in $\mathbb{R}^d$, a threshold $r_0>0$, an arbitrary constant integer  $k_0\in\mathbb{Z}^+$, and three parameters $\eta, \epsilon, \xi\in (0,1)$. 
%   \REPEAT
  \STATE
  \begin{enumerate}
  \item Initially, let $C=\{c_1\}$, where $c_1$ is an arbitrary point picked from $P$; $t=0$ and $i=1$. 
  \item Repeat the following steps as the stratified sampling procedure:
  \begin{enumerate}
     \item Take a sample $S$ from $P$ uniformly at random, where $|S|= \frac{12}{\eta_0\epsilon}\log\frac{2}{\eta_0}$ and $\eta_0=\frac{\eta}{2^{2t}k_0}$. 
  \item  Repeat the following steps $2^t k_0$ times ({\em i.e.,} phase $t$):
  \begin{enumerate}
%  \item Randomly pick a set $Q$ of $2^{2t}\frac{2k_0}{\epsilon}\log\frac{2^{2t}k_0}{2\eta}$ points from $P$.
 \item Randomly pick a set $Q$ from $P$, where  $|Q|= 2^{2t}\frac{2k_0}{\epsilon}\log\frac{2^{2t}k_0}{\eta}$.
 \item Select the $(1+\xi)$-approximate furthest point, say $q_0$, from $Q$ to $\mathtt{conv}(C)$ via the algorithm of~\cite{gartner2009coresets}. 
 \item Let $c_{i+1}=q_0$, $C=C\cup \{c_{i+1}\}$, and $i=i+1$. 
 \item Apply Lemma~\ref{lem-terminate-2} as the oracle (using the sample $S$ from step 2(a)) to determine whether to terminate: if it returns ``$\Big|P\setminus  \mathtt{conv}(C_i, (1+\xi)r_0) \Big|\leq 3\epsilon n$'', stop the algorithm, set $i_{\mathtt{ter}}=i$, and return $C$. 
    \end{enumerate}
  \item $t=t+1$. 
  \end{enumerate}
%   \item Return $C$. 
    \end{enumerate}
%   \STATE {\bfseries Output}  the leaves of $\mathcal{H}$.
    %     \UNTIL{The objective value becomes stable.}
\end{algorithmic}
\end{algorithm}






\begin{lemma}
\label{lem-terminate-2}
Suppose $\eta_0\in (0,1)$. We set the sample size $|S|\geq \frac{12}{\eta_0\epsilon}\log\frac{2}{\eta_0}$. With probability at least $1-\eta_0$, the following oracle returns the correct answer:  if $\tau\leq \frac{3}{2}\epsilon$, return ``$\Big|P\setminus  \mathtt{conv}(C_i, (1+\xi)r_0) \Big|\leq 3\epsilon n$''; else, return ``$\Big|P\setminus  \mathtt{conv}(C_i, (1+\xi)r_0) \Big|>\epsilon n$''. 
\end{lemma} 

%We leave the details of our algorithm to Section~\ref{sec-app-convex} due to the space limit, and present the theorem below. 




 






\begin{theorem}
\label{the-kcu-2}
Let $C=\{c_1, \cdots, c_{i_\mathtt{ter}}\}$ be the output from Algorithm~\ref{alg-ourkc3-2}.  Let $\tilde{k}$ be the number of vertices returned by the greedy selection algorithm~\citep{blum2019sparse} (see (\ref{for-blum})).  
With probability at least $1-4\eta$,  $i_{\mathtt{ter}}\leq \tilde{k}$, and  there exists a subset $\tilde{P}\subset P$ with size $|\tilde{P}|\geq (1-3\epsilon)n$, such that $\tilde{P}$ is covered by $\mathtt{conv}(C, (1+\xi)r_0)$. 
\end{theorem}

\textbf{Time complexity.} The computation for the time complexity is similar to that for $k$-center clustering in Section~\ref{sec-kcu}, where the only difference is that we have to compute the $(1+\xi)$-approximate polytope distance from each sampled point to $\mathtt{conv}(C)$ in each iteration. From the analysis of~\cite{gartner2009coresets}, we know it takes $O(\frac{1}{\xi\delta^2}|C|d)$ time. The total complexity of our convex hull algorithm is $O((\tilde{k}+\frac{1}{\eta})\frac{\tilde{k}^{3}}{\epsilon\xi\delta^2}\log\frac{\tilde{k}}{\eta} d)$. 

% #####################################################


\newcounter{sd1}
 \begin{figure*} [htbp]
   \begin{center}
     %\vspace{-0.1in}
     \includegraphics[height=0.168\textwidth]{figure/alg1/radius_k_cifar10}  
     \hspace{0.12in}
     \includegraphics[height=0.169\textwidth]{figure/alg1/runtime_k_cifar10} 
     \hspace{0.12in}
     \includegraphics[height=0.168\textwidth]{figure/alg1/runtime_vs_radius_cifar10}  
      %		\includegraphics[width=0.33\textwidth]{resultfigs/median_syn_km_cost} 
     \centerline{ \hspace{-0.48in}\hfill \stepcounter{sd1}  (\alph{sd1})\hfill \stepcounter{sd1} (\alph{sd1}) \hfill \stepcounter{sd1} (\alph{sd1})\hspace{2.1in}}
     \caption{The experimental performances on \textbf{CIFAR-10} for the case that $k$ is given. All the results (radius and runtime) are respectively normalized over the results obtained by \textsc{Gonzalez}. In (c), we show the radius obtained versus runtime for different values of $k$.}     
    
    %  \caption{The Experimental Performances for the Case That $k$ Is Given. In (c), we show the radius obtained versus runtime for different values of $k$.}     
   
    \label{fig-exp-1}
   \end{center}
  \end{figure*}
 %
 %
 %\begin{figure*}
 %%	\begin{center}
 %%			% \vspace{-0.1in}
 %%		\includegraphics[width=0.33\textwidth]{resultfigs/real_kc_r}  
 %%		\includegraphics[width=0.33\textwidth]{resultfigs/real_km_cost}   
 %%		\includegraphics[width=0.33\textwidth]{resultfigs/median_real_km_cost} 
 %%		 %\vspace{-0.1in}
 %%		\caption{The normalized objective values on the  real  datasets.}     
 %%		\label{fig-exp-objective2}
 %%	\end{center}
 %%	 %\vspace{-0.3in}
 %%\end{figure*}
 \newcounter{sd2}
 \begin{figure*}[htbp] 
  % \vspace{.3in}
   \begin{center}
        \includegraphics[height=0.18\textwidth]{figure/alg2/radius_cifar10} 
         \hspace{0.1in}
       \includegraphics[height=0.183\textwidth]{figure/alg2/runtime_cifar10}  
                     \hspace{0.1in}						
        \includegraphics[height=0.198\textwidth]{figure/alg2/num_k_cifar10}  						
  %		\includegraphics[width=0.33\textwidth]{resultfigs/median_syn_km_time}
 
         \centerline{ \hspace{-0.8in}\hfill \stepcounter{sd2}  (\alph{sd2})\hfill \stepcounter{sd2} (\alph{sd2}) \hfill \stepcounter{sd2} (\alph{sd2})\hspace{2.2in}}
          \caption{The experimental performances on \textbf{CIFAR-10} for the case that a radius threshold $r_0$ is given. All the results (radius, runtime, and the number of returned centers) are respectively normalized over the results obtained by \textsc{Gonzalez}.}     
        %  \caption{The Experimental Performances for the Case That a Radius Threshold $r_0$ Is Given. All the results (radius, runtime, and the number of returned centers) are respectively normalized over the results obtained by \textsc{Gonzalez}.}     
     
         \label{fig-exp-2}
   \end{center}
  \end{figure*}
  
  
 \newcounter{sd3}
\begin{figure*}[htbp]
	\begin{center}
			 %\vspace{-0.1in}
		\includegraphics[height=0.168\textwidth]{figure/alg1/radius_k_mnist} \hspace{0.12in}
		\includegraphics[height=0.168\textwidth]{figure/alg1/runtime_k_mnist} 
		\hspace{0.12in}
		\includegraphics[height=0.168\textwidth]{figure/alg1/runtime_vs_radius_mnist}  
 		%		\includegraphics[width=0.33\textwidth]{resultfigs/median_syn_km_cost} 
		% \vspace{-0.02in}
		\centerline{ \hspace{-0.5in}\hfill \stepcounter{sd3}  (\alph{sd3})\hfill \stepcounter{sd3} (\alph{sd3}) \hfill \stepcounter{sd3} (\alph{sd3})\hspace{2.15in}}
 		\caption{The experimental performances on \textbf{MNIST} for the case that $k$ is given. All the results (radius and runtime) are respectively normalized over the results obtained by \textsc{Gonzalez}. In (c), we show the radius obtained versus runtime for different values of $k$.}     
% 		\caption{The Experimental Performances for the Case That $k$ Is Given. In (c), we show the radius obtained versus runtime for different values of $k$.}     
		
		\label{fig-exp-3}
	\end{center}
%	 \vspace{-0.2in}
\end{figure*}
 

 \newcounter{sd4}
\begin{figure*} [htbp]
	\begin{center}
 		\includegraphics[height=0.185\textwidth]{figure/alg2/radius_mnist} 
		\hspace{0.1in}
		\includegraphics[height=0.185\textwidth]{figure/alg2/runtime_mnist} \hspace{0.1in}						
		\includegraphics[height=0.2\textwidth]{figure/alg2/num_k_mnist}  						
 %		\includegraphics[width=0.33\textwidth]{resultfigs/median_syn_km_time}

 		\centerline{ \hspace{-0.7in}\hfill \stepcounter{sd4}  (\alph{sd4})\hfill \stepcounter{sd4} (\alph{sd4}) \hfill \stepcounter{sd4} (\alph{sd4})\hspace{2.3in}}
 		 \caption{The experimental performances on \textbf{MNIST} for the case that a radius threshold $r_0$ is given. All the results (radius, runtime, and the number of returned centers) are respectively normalized over the results obtained by \textsc{Gonzalez}.}     
 		 %\caption{The Experimental Performances for the Case That a Radius Threshold $r_0$ Is Given. All the results (radius, runtime, and the number of returned centers) are respectively normalized over the results obtained by \textsc{Gonzalez}.}     
		  
		  \label{fig-exp-4}
	\end{center}
%	\vspace{-0.2in}
 \end{figure*}
 
 
  
 %  \section{Experimental Results}
 \section{EXPERIMENTAL RESULTS}
 \label{sec-exp}
  All the experiments were conducted on an Ubuntu workstation with 2.40GHz Intel(R) Xeon(R) CPU E5-2680 and 256GB main memory. The algorithms were implemented in MATLAB R2019b. For each instance, we repeat the experiment $10$ times and report the average results with their standard deviations. 
 
 We consider several baseline methods, including the $2$-approximate \textsc{Gonzalez}~\citep{GONZALEZ1985293} and the recently proposed streaming $k$-center clustering algorithm \textsc{CPP}~\citep{DBLP:journals/corr/abs-1802-09205}; we also compare our algorithms with the uniform sampling method~\citep{huang2018epsilon}, denoted as \textsc{Uniform-$r$}, where $r$ denotes the sampling rate (\emph{e.g.,} \textsc{Uniform-$0.1$} means that we take $10\%$ of the points from the input uniformly at random).
 
 
 
  We run our proposed Algorithm~\ref{alg-ourkc} and Algorithm~\ref{alg-ourkc3} on the real image dataset \textbf{CIFAR-10}~\citep{krizhevsky2009learning}, which consists of $60{,}000$ color images, each represented by a $3072$-dimensional vector.
 In Figure~\ref{fig-exp-1}, we can see that our Algorithm~\ref{alg-ourkc} runs significantly faster than the other methods. It is also worth emphasizing that, in our evaluation, we compute the radius for covering all the input points, rather than excluding the farthest $\epsilon n$ points as in the theoretical analysis of Theorem~\ref{the-ourkc}. We can see that our algorithm and \textsc{Gonzalez} achieve very close radii, even though we did not exclude the farthest $\epsilon n$ points.
 
 In Figure~\ref{fig-exp-2}, we illustrate the results of Algorithm~\ref{alg-ourkc3}. Similar to Figure~\ref{fig-exp-1}, we can see that our algorithm runs much faster than \textsc{Gonzalez}. An interesting observation is that our algorithm returns far fewer centers than \textsc{Gonzalez} for a fixed radius threshold $r_0$ (Figure~\ref{fig-exp-2}(c)). We believe one possible reason is that our random sampling approach is more likely to select a point close to the optimal ball center, so the obtained radius decreases faster, whereas the greedy selection of \textsc{Gonzalez} always selects the most ``extreme'' point, which could be far from the optimal ball center.
 
 
 
 
 
 
We also run the algorithms on another real dataset, \textbf{MNIST}~\citep{lecun98}; it contains $n=60{,}000$ images of handwritten digits from $0$ to $9$, where each image is represented by a $784$-dimensional vector. To illustrate the scalability of our algorithms on large-scale data, we enlarge \textbf{MNIST} by a factor of $6$; namely, for each image vector, we generate $5$ copies and add small Gaussian noise to them. The results are shown in Figures~\ref{fig-exp-3} and~\ref{fig-exp-4}.




 
 
 
 
 
Due to the space limit, we defer the experimental results for convex hull approximation to the full version of this paper.
%For $k$-center clustering, we also test the algorithms on the real dataset  \textbf{MNIST}~\citep{lecun98}; it contains $n=60,000$ handwritten digit images from $0$ to $9$, where each image is represented by a $784$-dimensional vector.  To illustrate the scalability of our algorithms for large-scale data, we enlarge  \textbf{MNIST} by $6$ times; namely, for each image vector, we generate $5$ copies and  add small Gaussian noises to them. 
%For convex hull approximation, we use the same real datasets from the experimental section of~\cite{DBLP:journals/anor/AwasthiKZ20}: the \textbf{NIPS} data set with 1500 documents, and a pruned vocabulary of 12k words; the \textbf{NYTimes} Corpus with sub sampled 30000 documents, and a pruned vocabulary of 5k words\footnote{\url{https://archive.ics.uci.edu/ml/datasets/bag+of+words}}.  
 
 
 
  
 %  \section{Conclusion}
 \section{CONCLUSION}
 \label{sec-con}
  In this paper, we propose sublinear algorithms for greedy selection methods. Following this work, several interesting problems deserve further study. For example, in our experiments we observe that our random-sampling-based approach achieves radii very close to those of the vanilla greedy selection approach \textsc{Gonzalez} (even without excluding the farthest $\epsilon n$ points). We therefore hope to obtain a rigorous theoretical analysis of this phenomenon, \emph{e.g.,} by adding some reasonable assumptions on the data distribution from the perspective of \emph{beyond worst-case analysis}~\citep{DBLP:journals/cacm/Roughgarden19}.
  
  
  \section{ACKNOWLEDGEMENTS}
  The authors would like to thank the anonymous reviewers for their helpful discussions and suggestions on improving this paper. This work was supported in part by the National Key R\&D Program of China, No.~2021YFA1000900.
 


 
 
%  Also, as  the energy consumption from IT  infrastructures has become a serious issue to environment ({\em e.g.,} climate change), we believe that it is necessary to consider other aspects (like the energy consumption) for designing algorithms. The future work along this line 
% could have a long-term impact to the society. 
% \newpage
 
 %Geometric optimization is one of most fundamental problems in computational geometry \cite{Agarwal2019GeometricOR}.
 %Consider the common problems of involving sets of points in $\mathbb{R}^d$ in Computational Geometry.
 %Let P be a set of n points in $\mathbb{R}^d$. Instead of solving the original proble
 %consider the problem of finding a small representative subset (e.g. coreset) $Q \subseteq  P$, 
 %such that the solution just on the subset is guaranteed to be a good
 %approximation of the solution on the original set, while preserving both its size and approximation quality. 
 %Such subset approximate algorithms ensure faster and approximate solutions.
 %The clustering problem (e.g. $k$-center clustering problem) is one of most popular geometric optimization problems, 
 %where goal is to partition the points into several clusters based on their similarities or dissimilarities;
 %the problem has been widely applied to practial applications, such as data mining~\cite{10.5555/1095618} and active learning~\cite{sener2018active}.
 %Another important topic is the convex hull problem,
 %where the goal is find a subset, such that every $p \in P$ can be represented as a convex combination of points in the subset.
 %In most applications where the convex hull is applied, i.e. surface simplification \cite{Heckbert95surveyof}, 
 %approximate convex hulls are more competitive than the true convex hull because they are using much less space to approximate the boundary of a set of points.
 %Finding approximate convex hulls is an important component of algorithms for $\epsilon$-kernels,
 %which is applied in data analysis \cite{agarwal2017efficient} and computer graphics \cite{barequet1999efficiently}.
 %Moreover, it turns out that $\epsilon$-approximate convex hulls can be used for sparse non-negative matrix factorization (NMF) \cite{blum2019sparse},
 %which has been widely applied in many applications, such as text mining and image analysis \cite{5360278}.
 %
 %However, in big data and machine learning era, the real-world data size could be large-scale. Many algorithms applied
 %to large-scale datasets may be difficult because of requiring prohibitive running time or memory.
 %In fact, the real-word datasets are often too large to fit in memory, which makes the problems to be much more
 %challenging.
 %
 %\subsection{Related Work and Our Contribution}
 %
 %\paragraph{Sublinear time algorithms.}
 %As extremely large-scale datasets grow more prevalent in machine learning era, 
 %it is natural to wonder what one can do in sublinear time algorithms design. 
 %In fact, there has been a lot of research on this direction \cite{Czumaj2010, article}. 
 %For example, a number of sublinear time algorithms design on clustering have been studied
 %in \cite{https://doi.org/10.1002/rsa.20157, 10.1145/301250.301366, 10.1023/B:MACH.0000033115.78247.f0, 10.5555/365411.365499, ding:LIPIcs:2020:12904}.
 %In addition, the sublinear time algorithms design on graph model or probability distributions has been studied extensively \cite{goldreich1998property}.
 %
 %In this paper, we consider to develop sublinear time algorithms for several geometric optimization problems by random sampling approach.
 %\paragraph{$k$-center clustering problem.}
 %In the $k$-center clustering problem, the goal is to find a set of $k$ points in $P$, says $Q$, such that the maximum distance of a point in 
 %$P \subseteq \mathbb{R}^d$ to its closet point in $Q$ is minimized. 
 %It is known that the problem is NP-Hard. Even in $\mathbb{R}^2$, 
 %it is NP-Hard to obtain a $(1+\sqrt{7})/2$-approximate algorithm\cite{10.1145/62212.62255}.
 %
 %\paragraph{convex hull problem.}
 %The problem of convex hull is to find a subset of $P$, $Q \subseteq  P$, such that every
 %$p \in P$ can be represented as a convex combination of points in $Q$.
 %A greedy algorithm to construct approximate convex hull was proposed by Avrim Blum et al \cite{blum2019sparse}.
 %They proved that after $O(k_{opt}/ \epsilon^{\frac{2}{3}})$ iterations, 
 %their algorithm ouputs a set which is an $O(\epsilon^{\frac{1}{3}})$-approximation to the original set of points
 %where $k_{opt}$ denote the minimum size of an $\epsilon$-approximate convex hull.
 %They also proposed a  novel algorithm to improve the running time further.
 %However, their algorithms are linear time algorithms.
 %For solving the problem of large-scale datasets (i.e. the set is too large to fit in memory), 
 %Avrim Blum et al proposed streaming algorithms for $\epsilon$-approximate convex hull with space complexity comparable 
 %to the optimal approximation \cite{blum2018approximate}.
 %
 %Note, existing algorithms often have high time complexities or high memory consumption, 
 %since they are not independent of the size of datasets.
 %In big data era, we are wondering that whether it is possible to remove the dependency 
 %on the the size of datasets $n$ in each iteration of the algorithm. 
 %We need to implement the each selection step
 %by a random sampling approach, but it is challenging to guarantee the resulting quality.
 %We redesigned the basic algorithms based on greedy selection and the sampling manner.
 %For the $k$-center clustering problem, we propose a sublinear time algorithm with a approximate factor of 2;
 %for the convex hull problem, we propose a sublinear time algorithm for construct $\epsilon$-approximate convex hull.
 %It can be proved that the effectiveness of our proposed algorithms.
 %Meanwhile, in each round of step of the proposed algorithm, the sampling manner avoids reading all data into memory at one time, 
 %which can release computing resources.
 %
 %Moreover, we observe that our proposed framework of algorithms can be used to solve a broader range of geometric
 %optimization problems.
 %
 %\section{Definition and Preliminaries}
 %\label{others}
 %
 %\subsection{Clustering Problem}
 %
 %\begin{Definition}[$k$-center Clustering]
 %  \label{Def1}
 %  Given a set P of n points in $\mathbb{R}^d$ with one positive integers k, the problem of k-center clustering 
 %  is to find k cluster centers $\{c_1, \cdots , c_k\} \subset \mathbb{R}^d$, such that $\max_{p \in P}
 %  \min_{1\leq j \leq k} \|p - c_{j}\| $ is minimized.
 %\end{Definition}
 %
 %\begin{Definition}[$k_\epsilon$-center Clustering]
 %\label{Def2}
 %Let P be an instance of k-center clustering, and $\epsilon \geq 0$. 
 %$k_\epsilon$-center clustering is to find a subset $P^{'}$ of $P$, where $|P^{'}| \geq (1-\epsilon)|P|$,
 %such that the corresponding clustering cost of Definition \ref{Def1} on $P^{'}$ is minimized.
 %\end{Definition}
 %Let the optimal partition forming $P^{'}$ denote $C^*$ and $r_{opt}$ denote resulting clustering cost of the optimal solution.
 %In this paper, we use $\{C_1, \cdots , C_k\}$ to be the k clusters forming the subset of $P$ with size $(1-\epsilon)|P|$,
 %and the resulting clustering cost be $r$.
 %
 %\paragraph{Other notations.}
 %For convenience, let $dist(p, Q)$ to denote the shortest distance
 %between a point p and a point set $Q$, i.e., $min_{q\in Q} \|p - q\Vert $. Further, given two point sets $Q_1$
 %and $Q_2$, let $dist(Q_1, Q_2) = min_{q, p \in Q} \|p - q\Vert $.
 %
 %\subsection{Convex Hull Problems}
 %
 %\begin{Definition}[convex hull]
 %\label{Def3}
 %Given a set $P$ of $n$ points in $\mathbb{R}^d$, $Q \subseteq  P$ is a convex hull of $P$ if every
 %$p \in P$ can be represented as a convex combination of points in $Q$.
 %\end{Definition}
 %
 %\begin{Definition}[$\epsilon$-approximate convex hull]
 %\label{Def4}
 %Given a set $P$ of $n$ points in $\mathbb{R}^d$ and  $\epsilon \in (0, 1)$,
 %and let $\Delta $ denote the diameter of P, i.e., $\Delta  = max_{q, p \in P} \|p - q\Vert$. 
 %The problem of convex hull approximation is to find a subset $Q \in P$, 
 %such that $max_{p \in P} dist(P, \mathcal{C}_Q) \leq  \epsilon\Delta$, where $\mathcal{C}_Q$ denote the convex hull of $Q$.
 %Then $Q$ is an $\epsilon$-approximate convex hull of $P$.
 %\end{Definition}
 %Let $k_{opt}$ denote the smallest size of $Q$.
 %For a set $P$ , its one sided Hausdorff distance from $Q$ is $dist_H(P, Q) = max_{p \in P}dist(p, Q)$.
 %
 %\section{Algorithm}
 %\subsection{Algorithms For Clustering Problem}
 %
 %The basic algorithm~\cite{GONZALEZ1985293} starts with an arbitrary point from $P$, 
 %and iteratively selects the following $k-1$ points, 
 %where each $j$-th step $(2 \leq j\leq k)$ chooses the point which has the largest minimum distance to the
 %already selected $j - 1$ points; finally, each input point is assigned to its nearest neighbor of
 %these $k$ points. It can be proved that this greedy strategy results in a 2-approximation of
 %k-center clustering.
 %
 %\renewcommand{\algorithmicrequire}{ \textbf{Input:}} %Use Input in the format of Algorithm
 %\renewcommand{\algorithmicensure}{ \textbf{Output:}} %UseOutput in the format of Algorithm
 %
 %\begin{algorithm}[htb] 
 %  \caption{2-Approximation Algorithm.} 
 %  \label{alg:1}
 %  \begin{algorithmic}[1] 
 %  \REQUIRE ~~\\ 
 %  An instance $ P \subset \mathbb{R}^d$ of k-center clustering, and $|P| = n $; the
 %$\epsilon > 0 $, $ \eta \in (0, 1)$, and $k \in \mathbb{Z}^{+}$ .\\
 %  \ENSURE ~~\\
 %  \STATE Let $n^{'} = \frac{k}{\epsilon} log \frac{k}{\eta}$, and $C = \phi $ . 
 %  \STATE Initially, $j = 1$; randomly select one point $c_1$ from $P$ and let $C = \{c_1\}$.
 %  \WHILE {$ j < k$}
 %  \label{code:3}
 %  \STATE Randomly sample $n^{'}$ points from $P$, say $Q_j$;
 %  \STATE Let $c_j$ denote the furthest point from $Q_j$ to $C$ and add $c_j$ to $C$;
 %  \STATE $j = j + 1$;
 %  \ENDWHILE 
 %  \RETURN $C$. 
 %  \end{algorithmic}
 %  \end{algorithm}
 %
 %\begin{theorem} \label{theorem1}
 %  With $probability\geq 1-\eta$, the clustering cost of Algorithm~\ref{alg:1} by Definition~\ref{Def1} $r \leq 2r_{opt}$.
 %\end{theorem}
 %  \paragraph{Running time. }
 %  In each round of Step~\ref{code:3}, there are $\frac{k}{\epsilon} log \frac{k}{\eta}$ points selected 
 %  to update the distances from the points of $P$ to $C$. 
 %  Overall, the running time of Algorithm~\ref{alg:1} is $O(k\frac{k}{\epsilon} log \frac{k}{\eta}kd)$.
 %
 %Before proving Theorem~\ref{theorem1}, we present the following two lemmas first.
 %
 %\paragraph{Proposition 1.}
 %Let $U$ be a set of elements, $\eta \in(0, 1)$and $V\subset U$ with $\frac{|V|}{|U|}= \tau > 0$. 
 %We randomly samples S from $U$, if $|S|\geq \frac{1}{\epsilon} log \frac{1}{\eta}$, with $probability\geq 1-\eta$, 
 %$|S\cap V| \geq 1 $.
 %
 %To help the analysis, let 
 %$C_{j-1} = \{c_1, \cdots , c_{j-1}\}, P_j = \{p\in P | dist(p,C_{j-1}) > dist(c_j, C_{j-1})\}$, 
 %for $j = 1, 2, \cdots , k$.
 %
 %
 %\begin{lemma} \label{lemma1}
 %In each round of Step 3 of Algorithm~\ref{alg:1}, with $probability\geq 1-\frac{\eta}{k}$, $|P_j| < \frac{\epsilon}{k}|P|$.
 %\end{lemma}
 %% \paragraph{Lemma 1.} 
 %
 %
 %% \begin{equation} \label{eq:LL}
 %% \end{equation}
 %
 %\paragraph{Proof. }
 %Suppose that Lemma is not true, i.e., with $probability\geq 1-\frac{\eta}{k}$, $|P_j| \geq  \frac{\epsilon}{k}|P|$. 
 %By the Proposition 1, if randomly selecting $\frac{k}{\epsilon} log \frac{k}{\eta}$ (i.e., $|Q_j|$) points from $P$, with probability $1 - \frac{\eta}{k}$, 
 %it contains at least one point from $P_j$, i.e., $|Q_j \cap P_j|\geq 1$. 
 %Then, in each round of Step 3 of Algorithm~\ref{alg:1}, there contains at least one point, say $q \in Q_j \cap P_j$,
 %we have $dist(q, C_{j-1}) > dist(c_j, C_{j-1})$, which in contradiction with the the definition of $c_j$, 
 %since $c_j$ denote the furthest point from $Q_j$ to $C_{j-1}$.
 %
 %Further we consider $\widetilde{P} = P \setminus \bigcup\limits_{j=1}^{k}P_j$. 
 %By the Lemma~\ref{lemma1}, with $probability \geq (1-\frac{\eta}{k})^{k} > 1-\eta$, we have $|\widetilde{P}| >(1 - \frac{\epsilon}{k}k)|P| = (1- \epsilon)|P|$.
 %
 %\begin{lemma} \label{lemma2}
 %$\forall p \in \widetilde{P}$, $dist(p, C) \leq \underset{1\leq i < j\leq k}{min} \|c_i - c_j\Vert$.
 %\paragraph{Proof. }
 %Suppose $\exists p_0 \in \widetilde{P}$, $c_i, c_j \in C(i < j)$, s.t.,
 %$dist(p_0, C) > \underset{1\leq i < j\leq k}{min}\|c_i - c_j\Vert$.
 %\end{lemma}
 %
 %Then we have
 %\begin{equation}
 %    \label{eqa:eq1}
 %  \begin{split}
 %  dist(p_0, C_{j-1})& \geq dist(p_0, C) \\
 %  &> \underset{1\leq i < j \leq k}{min}\|c_i - c_j\Vert \\
 %  & \geq dist(c_j, C_{j-1})
 %  \nonumber
 %  \end{split}
 %\end{equation}
 %The last inequality implies that when adding $c_j$, $p_0 \in P_j$, which in contradiction with 
 %$p_0 \in \widetilde{P} = P \setminus \bigcup\limits_{j=1}^{k}P_j$.
 %
 %\paragraph{Proof. (of Theorem~\ref{alg:1})}
 %Returning to the proof of Theorem~\ref{alg:1}, we have two cases:
 %
 %\begin{enumerate}
 %  \item \textbf{Case 1}: The k cluster centers $\{c_1, \cdots , c_k\}$ fall into the $k$ different optimal cluster, 
 %which directly implies that $dist(p, C) \leq 2r_{opt}$ for any $p \in P^{'}$ based on the triangle inequality.
 %  \item \textbf{Case 2}: : Otherwise, By the pigeonhole principle, at least two centroids fall into
 %one cluster of the partition $C^{*}$.  Let the two centroids be $c_i$, $c_j$.
 %Then $\|c_i - c_j\Vert \leq 2r_{opt}$. By the lemma~\ref{lemma2},  $\forall p \in \widetilde{P}$, $dist(p, C) \leq 2r_{opt}$.
 %\end{enumerate}
 %
 %This exhausts all cases and completes the proof.
 %
 %
 %\subsection{Algorithms For Convex Hull Problems}
 %
 %The basic greedy algorithm is similar to the Gonzalez algorithm for k-center clustering:
 %Iteratively select the point $p \in P$ that is furthest from the linear subspace spanned by the point set $Q \subseteq P$
 %and then add it into $Q$ if this distance is greater than some threshold.
 %Let $\mathcal{S}_Q$ denote the linear subspace spanned by the point set $Q$. 
 %The algorithm runs $O(k_{opt}/ \epsilon^{\frac{2}{3}}) $ steps \cite{blum2019sparse}.
 %
 %% \paragraph{Theorem 1.} 
 %
 %Suppose we know $k_{opt}$, let N =$ k_{opt}$. Then we have Algorithm~\ref{alg:2} for constructing the $\epsilon$--approximate convex hull.
 %\begin{algorithm}[htb] 
 %  \caption{sublinear time $\epsilon$--approximate convex hull.} 
 %  \label{alg:2} 
 %  \begin{algorithmic}[1]
 %  \REQUIRE ~~\\ 
 %  An instance $ P \subset \mathbb{R}^d$ of convex hull; the
 %$\delta  > 0 $, $ \eta \in (0, 1)$, and $k_0 \in \mathbb{Z}^{+}$ .\\
 %  \ENSURE ~~\\ 
 %  \STATE Let $C = \phi $ . 
 %  \STATE Initially, $t = 2$; randomly select one point $c_1$ from $P$ and $|P| = n$; let $C = \{c_1\}$.
 %  \FOR{$j=1$ to $k_0$}
 %  \label{alg:2:code:3}
 %  \STATE random sample $2\frac{k_0}{\delta } log \frac{k_0}{\eta}$ points from $P$, say $Q_j$;
 %  \STATE Let $c_j$ denote the furthest point from $Q_j$ to $\mathcal{S}_C$ and add $c_j$ to $C$;
 %  \ENDFOR  
 %  \WHILE {$j < N$}
 %  \FOR{$j=(2^0+\cdots+2^{t-2})k_0+1$ to $(2^0+\cdots+2^{t-1})k_0$}
 %  \label{alg:2:code:8}
 %  \STATE random sample $2^{2(t-1)}2\frac{k_0}{\delta} log \frac{2^{2(t-1)k_0}}{\eta}$ points from $P$, say $Q_j$;
 %  \STATE Let $c_j$ denote the furthest point from $Q_j$ to $\mathcal{S}_C$ and add $c_j$ to $C$;
 %  \ENDFOR  
 %  \STATE $t = t + 1$
 %  \ENDWHILE 
 %  \RETURN $C$.
 %  \end{algorithmic}
 %  \end{algorithm}
 %
 %\paragraph{Running time. }
 %Let $T(N, d)$ denote the time to compute the distance between point and $\mathcal{S}_C$.
 %Then, the running time of Algorithm~\ref{alg:2} is $O(\frac{N^3}{\delta} log \frac{1}{\eta}T(N,d))$.
 %
 %For the sake of analysis, let $C_{j-1} = \{c_1, \cdots$, $c_{j-1}\}$, 
 %and $P_j = \{p\in P | dist(p,\mathcal{S}_{C_{j-1}}) > dist_H ( Q_j, \mathcal{S}_{C_{j-1}} )\}$, 
 %for $j = 1, 2, \cdots , N$.
 %
 %\begin{lemma} \label{lemma3}
 %In each round of Step~\ref{alg:2:code:3} or step~\ref{alg:2:code:8} of Algorithm~\ref{alg:2}, we have
 %\begin{equation}
 %  \begin{split}
 %  probability &\geq 1-\frac{\eta}{k_0}, |P_j| < \frac{\delta}{2k_0}|P|, j = 1,\cdots , k_0 \\
 %  probability &\geq 1-\frac{\eta}{2^{2}k_0},|P_j| < \frac{\delta}{2^{2}2k_0}|P|,j = k_0+1, \cdots , k_0 + 2k_0  \\
 %  \cdots \\
 %  probability &\geq 1-\frac{\eta}{2^{2(t-1)}k_0},|P_j| < \frac{\delta}{2^{2(t-1)}2k_0}|P|,j = (2^0+\cdots+2^{t-2})k_0+1, \cdots ,(2^0+\cdots+2^{t-1})k_0 
 %  \nonumber
 %  \end{split}
 %\end{equation}
 %\end{lemma}
 %The proof is similar to the proof of lemma~\ref{lemma1}.
 %
 %Further we consider $\widetilde{P} = P \setminus \bigcup\limits_{j=1}^{N}P_j$.
 %By the lemma~\ref{lemma3}, we have $|\widetilde{P}| \geq  (1- \delta)|P|$,
 %since $\frac{\delta}{2k_0}k_0 + \frac{\delta}{2^{2}2k_0}2k_0 + \cdots + \frac{\delta}{2^{2(t-1)}2k_0}2^{t-1}k_0 \leq \delta$.
 %Then the success probability of $|\widetilde{P}| \geq  (1- \delta)|P|$
 %
 %\begin{equation}
 %  \begin{split}
 %  probability &\geq (1-\frac{\eta}{k_0})^{k_0}(1-\frac{\eta}{2^2k_0})^{2k_0} \cdots(1-\frac{\eta}{2^{2(t-1)}k_0})^{2^{t-1}k_0} \\
 %  & > (1-\eta)(1-\frac{\eta}{2}) \cdots(1-\frac{\eta}{2^{t-1}}) \\
 %  & \geqslant 1-\sum_{j=0}^{t-1}\frac{\eta}{2^{j}} \\
 %  & > 1- 2\eta
 %  \nonumber
 %  \end{split}
 %\end{equation}
 %
 %\begin{lemma} \label{lemma4}
 %With $probability\geq 1-2\eta$, 
 %$\forall p \in \widetilde{P}$, $dist(p, \mathcal{S}_C) \leq \underset{1\leq j\leq N}{min} dist_H(Q_j, \mathcal{S}_{C_{j-1}})$.
 %\paragraph{Proof. }
 %Suppose with $probability\geq 1-2\eta$, $\exists p_0 \in \widetilde{P}$, $c_i, c_j \in C$, s.t.,
 %$dist(p_0, C) > \underset{1\leq j\leq N}{min}dist_H(Q_j, \mathcal{S}_{C_{j-1}})$.
 %\end{lemma}
 %Then we have
 %\begin{equation}
 %  \begin{split}
 %  dist(p_0, \mathcal{S}_{C_{j-1}})& \geq dist(p_0, \mathcal{C_{C}}) \\
 %  &> \underset{1\leq j \leq N}{min}dist_H(Q_j, \mathcal{S}_{C_{j-1}})
 %  \nonumber
 %  \end{split}
 %\end{equation}
 %The last inequality implies that when adding $c_j$, $p_0 \in P_j$, which in contradiction with 
 %$p_0 \in \widetilde{P} = P \setminus \bigcup\limits_{j=1}^{N}P_j$.
 %
 %
 %\begin{theorem} \label{theorem2} 
 %  With $probability\geq 1-2\eta$, Algorithm~\ref{alg:2} outputs a $\epsilon$--approximate convex hull of $\widetilde{P}$.
 %  \end{theorem}
 %
 %\paragraph{Proof.}
 %
 %First, we consider this case, that is,
 %we let Algorithm~\ref{alg:2} stop as soon as $dist_H(Q_j, \mathcal{S}_{C_{j-1}}) \leq \epsilon \Delta $, and outputs $C_{j-1}$.
 %By the lemma~\ref{lemma4}, in each iteration of j of Algorithm~\ref{alg:2}, once the distance between $Q_j$ and $\mathcal{S}_{C_{j-1}}$, 
 %i.e., $dist_H(Q_j, \mathcal{S}_{C_{j-1}}) \leq \epsilon \Delta $, we have $dist_H(\widetilde{P}, \mathcal{S}_{C_{j-1}}) \leq \epsilon \Delta $.
 %Then Algorithm~\ref{alg:2} outputs $C_{j-1}$ and the $\mathcal{C}_{C_{j-1}}$ is a $\epsilon$-approximate convex hull of $\widetilde{P}$.
 %However, it is linear time to get the $\Delta$ by a linear scan of the points.
 %Instead, one can use the check $|\mathcal{S}_{C}| \geq  (1- \delta)|P|$ as a stopping condition,
 %where $|\mathcal{S}_{C}|$ denote the number of points covered by $\mathcal{S}_{C}$. 
 %For ease of presentation, let $\beta = \frac{|\mathcal{S}_{C}|}{|P|}$.
 %We can use random sampling to estimate $\beta$.
 %Let $\{x_{i} | 1 \leq i \leq n^{'}\}$ be $n^{'}$ independent random variables with $x_{i} = 1$ 
 %if the i-th sampled point $p_{i}$ of $Q$ belongs to $\mathcal{S}_{C}$ 
 %(i.e., the distance between $p_{i}$ and $\mathcal{S}_{C}$ is smaller than some threshold), and $x_{i} = 0$ otherwise.
 %Let $X=\sum_{i=1}^{n^{'}} x_{i}$ and $\sigma \in (0, 1)$ be a small parameter.
 %Then, we have $E[x_{i}] = \beta$ for each i and the estimator is $Y=\frac{X}{n^{'}}$.
 %By using the Chernoff bound,
 %we have $\textbf{Pr}(|Y-\beta|> \sigma\beta) =\textbf{Pr}(|X-n^{'}\beta|> \sigma n^{'}\beta) \leq e^{-O(\sigma^{2}n^{'}\beta)} $.
 %Thus, in each iteration of j of Algorithm~\ref{alg:2},
 %one can use the check $ X \geq |Q_j|\beta \geq |Q_j|(1- \delta)$ as a stopping condition.
 %Finally, we have $|\mathcal{S}_{C}| \geq  (1- \delta)|P|$ and 
 %the success probability Algorithm~\ref{alg:2} is greater than $1-e^{-O(\sigma^{2}|Q_j|(1- \delta))}$ with tolerating slight error.
 %\section{Applications}
 %
 %
 %\subsection{}
 

 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%


% \subsubsection*{Acknowledgements}
% All acknowledgments go at the end of the paper, including thanks to reviewers who gave useful comments, to colleagues who contributed to the ideas, and to funding agencies and corporate sponsors that provided financial support. 
% To preserve the anonymity, please include acknowledgments \emph{only} in the camera-ready papers.


% \subsubsection*{References}

% References follow the acknowledgements.  Use an unnumbered third level
% heading for the references section.  Please use the same font
% size for references as for the body of the paper---remember that
% references do not count against your page length total.

% \begin{thebibliography}{}
% \setlength{\itemindent}{-\leftmargin}
% \makeatletter\renewcommand{\@biblabel}[1]{}\makeatother
% \bibitem{} J.~Alspector, B.~Gupta, and R.~B.~Allen (1989).
%     \newblock Performance of a stochastic learning microchip.
%     \newblock In D. S. Touretzky (ed.),
%     \textit{Advances in Neural Information Processing Systems 1}, 748--760.
%     San Mateo, Calif.: Morgan Kaufmann.

% \bibitem{} F.~Rosenblatt (1962).
%     \newblock \textit{Principles of Neurodynamics.}
%     \newblock Washington, D.C.: Spartan Books.

% \bibitem{} G.~Tesauro (1989).
%     \newblock Neurogammon wins computer Olympiad.
%     \newblock \textit{Neural Computation} \textbf{1}(3):321--323.



%   \end{thebibliography}

% \bibliographystyle{abbrv}
\bibliography{chen_318}  


% +++++++++++++++++++++++++++++++++++++++++++++++++++++++

% \appendix
% % NOTE: necessary when ptmx or no mathfont class option is given
% \providecommand{\upGamma}{\Gamma}
% \providecommand{\uppi}{\pi}


% % \newcommand{\theHalgorithm}{\arabic{algorithm}}

% \makeatletter



% % If your paper is accepted and the title of your paper is very long,
% % the style will print as headings an error message. Use the following
% % command to supply a shorter title of your paper so that it can be
% % used as headings.
% %
% %\runningtitle{I use this title instead because the last one was very long}

% % If your paper is accepted and the number of authors is large, the
% % style will print as headings an error message. Use the following
% % command to supply a shorter version of the authors names so that
% % they can be used as headings (for example, use only the surnames)
% %
% %\runningauthor{Surname 1, Surname 2, Surname 3, ...., Surname n}




% % Supplementary material: To improve readability, you must use a single-column format for the supplementary material.


% % \vspace{-61mm}
% \clearpage
% % \section{The Algorithm for Convex Hull}




% \section{MORE EXPERIMENTAL RESULTS}
% \label{sec-app-moreexp}
% % \section{More Experimental Results}




% \textbf{Convex hull approximation.} We conduct the experiments for convex hull approximation. We consider the baselines \textsc{BHR}~\citep{blum2019sparse} and the recently proposed \textsc{AKZ}~\citep{DBLP:journals/anor/AwasthiKZ20}. Our algorithms are named \textsc{Sub-BHR1} and \textsc{Sub-BHR2}, for the cases that $k$ is given or not respectively. We use the same real datasets from the experimental section of~\cite{DBLP:journals/anor/AwasthiKZ20}\footnote{\url{https://archive.ics.uci.edu/ml/datasets/bag+of+words}}: the \textbf{NIPS} data set with 1500 documents, and a pruned vocabulary of 12k words; the \textbf{NYTimes} Corpus with sub sampled 30000 documents, and a pruned vocabulary of 5k words. Similar with  \textbf{MNIST}, we enlarge each dataset by $2$ times. The results are shown in Figure~\ref{fig-exp-5}-\ref{fig-exp-8}. In Figure~\ref{fig-exp-5} and \ref{fig-exp-6}, we can see that our algorithm \textsc{Sub-BHR1} can achieve similar error level with \textsc{BHR} but with much lower runtime; \textsc{AKZ} is faster than the other two algorithms, but has much higher error level. When $k$ is not given, in Figure~\ref{fig-exp-7} and \ref{fig-exp-8} we can see that \textsc{AKZ} needs to return much more vertices than  \textsc{Sub-BHR2}, and their error levels are very close. 

% \newcounter{sd5}
% \begin{figure*} [htb]
% 	\begin{center}
% 			 %\vspace{-0.1in}
% 		\includegraphics[height=0.168\textwidth]{figure/sub_figure/new1/k/nips_error@k}  
% 		\hspace{0.12in}
% 		\includegraphics[height=0.168\textwidth]{figure/sub_figure/new1/k/nips_time@k} 
% 				\hspace{0.12in}
% 						\includegraphics[height=0.18\textwidth]{figure/sub_figure/new1/k/nips_time_error@k}  
%  		%		\includegraphics[width=0.33\textwidth]{resultfigs/median_syn_km_cost} 
% 		\vspace{-0.02in}
% 				\centerline{ \hspace{-0.8in}\hfill \stepcounter{sd5}  (\alph{sd5})\hfill \stepcounter{sd5} (\alph{sd5}) \hfill \stepcounter{sd5} (\alph{sd5})\hspace{1.6in}}
% 		\vspace{-0.05in}
% 		\caption{The experimental performances for the case that $k$ is given on the dataset \textbf{NIPS}. The error indicates the maximum polytope distance from the given point set to the obtained convex hull. In (c), we show the error obtained versus runtime for different values of $k$.}     
% % 		\caption{The Experimental Performances for the Case That $k$ Is Given on the Dataset \textbf{NIPS}. The error indicates the maximum polytope distance from the given point set to the obtained convex hull. In (c), we show the error obtained versus runtime for different values of $k$.}     
	
% 		\label{fig-exp-5}
% 	\end{center}
% %	 \vspace{-0.2in}
% \end{figure*}






% %
% %
% %\begin{figure*}
% %%	\begin{center}
% %%			% \vspace{-0.1in}
% %%		\includegraphics[width=0.33\textwidth]{resultfigs/real_kc_r}  
% %%		\includegraphics[width=0.33\textwidth]{resultfigs/real_km_cost}   
% %%		\includegraphics[width=0.33\textwidth]{resultfigs/median_real_km_cost} 
% %%		 %\vspace{-0.1in}
% %%		\caption{The normalized objective values on the  real  datasets.}     
% %%		\label{fig-exp-objective2}
% %%	\end{center}
% %%	 %\vspace{-0.3in}
% %%\end{figure*}
% \newcounter{sd6}
% \begin{figure*}
% 	\begin{center}
%  		\includegraphics[height=0.17\textwidth]{figure/sub_figure/new1/k/ny_error@k} 
%  						\includegraphics[height=0.185\textwidth]{figure/sub_figure/new1/k/ny_time@k}  
%  								\includegraphics[height=0.185\textwidth]{figure/sub_figure/new1/k/ny_time_error@k}  						
%  %		\includegraphics[width=0.33\textwidth]{resultfigs/median_syn_km_time}

%  \vspace{-0.05in}
% 				\centerline{ \hspace{-0.8in}\hfill \stepcounter{sd6}  (\alph{sd6})\hfill \stepcounter{sd6} (\alph{sd6}) \hfill \stepcounter{sd6} (\alph{sd6})\hspace{1.7in}}
%  		 \caption{The experimental performances for the case that $k$ is given on the dataset \textbf{NYTimes}. The error indicates the maximum polytope distance from the given point set to the obtained convex hull. In (c), we show the error obtained versus runtime for different values of $k$.}     
%  		 %\caption{The Experimental Performances for the Case That $k$ Is Given on the Dataset \textbf{NYTimes}. The error indicates the maximum polytope distance from the given point set to the obtained convex hull. In (c), we show the error obtained versus runtime for different values of $k$.}     
		
% 		  \label{fig-exp-6}
% 	\end{center}
% %	\vspace{-0.2in}
%  \end{figure*}
 
 

% \newcounter{sd7}
% \begin{figure*} 
% 	\begin{center}
% 			 %\vspace{-0.1in}
% 		\includegraphics[height=0.168\textwidth]{figure/sub_figure/new1/nok/nips_norm_error@nok}  
% 		\hspace{0.12in}
% 		\includegraphics[height=0.168\textwidth]{figure/sub_figure/new1/nok/nips_norm_time@nok} 
% 				\hspace{0.12in}
% 						\includegraphics[height=0.18\textwidth]{figure/sub_figure/new1/nok/nips_norm_vertices@nok}  
%  		%		\includegraphics[width=0.33\textwidth]{resultfigs/median_syn_km_cost} 
% 		\vspace{-0.02in}
% 				\centerline{ \hspace{-0.8in}\hfill \stepcounter{sd7}  (\alph{sd7})\hfill \stepcounter{sd7} (\alph{sd7}) \hfill \stepcounter{sd7} (\alph{sd7})\hspace{1.6in}}
% 		\vspace{-0.05in}
% 		\caption{The experimental performances for the case that an error threshold $\delta$ is given on the dataset \textbf{NIPS}. All the results (error, runtime, and the number of returned vertices) are respectively normalized over the results obtained by \textsc{BHR}.}     
% % 		\caption{The Experimental Performances for the Case That a Error Threshold $\delta$ Is Given on the Dataset \textbf{NIPS}. All the results (error, runtime, and the number of returned vertices) are respectively normalized over the results obtained by \textsc{BHR}.}     
	
% 		\label{fig-exp-7}
% 	\end{center}
% %	 \vspace{-0.2in}
% \end{figure*}
 
%  \newcounter{sd8}
% \begin{figure*}
% 	\begin{center}
%  		\includegraphics[height=0.17\textwidth]{figure/sub_figure/new1/nok/ny_norm_error@nok} 
% 				\hspace{0.1in}
% 						\includegraphics[height=0.178\textwidth]{figure/sub_figure/new1/nok/ny_norm_time@nok}  
% 										\hspace{0.1in}						
% 								\includegraphics[height=0.178\textwidth]{figure/sub_figure/new1/nok/ny_norm_vertices@nok}  						
%  %		\includegraphics[width=0.33\textwidth]{resultfigs/median_syn_km_time}

%  \vspace{-0.05in}
% 				\centerline{ \hspace{-0.8in}\hfill \stepcounter{sd8} (\alph{sd8})\hfill \stepcounter{sd8} (\alph{sd8}) \hfill \stepcounter{sd8} (\alph{sd8})\hspace{1.7in}}
%  		 \caption{The experimental performances for the case that an error threshold $\delta$ is given on the dataset \textbf{NYTimes}. All the results (error, runtime, and the number of returned vertices) are respectively normalized over the results obtained by \textsc{BHR}.}     
%  		 %\caption{The Experimental Performances for the Case That a Error Threshold $\delta$ Is Given on the Dataset \textbf{NYTimes}. All the results (error, runtime, and the number of returned vertices) are respectively normalized over the results obtained by \textsc{BHR}.}     
		
% 		  \label{fig-exp-8}
% 	\end{center}
% %	\vspace{-0.2in}
%  \end{figure*}
 
 
 
 
 
 
 

 
 
% \section{MORE EXPLANATION ON COROLLARY 1 AND 2}
% \label{sec-app-cor}
% % \section{More Explanation on Corollary 1 and 2}

%  To see why we consider the case that accessing to the whole data is prohibited, we can consider the following scenarios.

% (1) In a data marketplace, a buyer wants to buy a dataset ``$P$'' from an owner~\cite{DBLP:journals/jacm/KoutrisUBHS15}.  Obviously, the owner cannot leak the whole $P$ to the buyer before completing the deal. To estimate the value of $P$, the owner may allow the buyer to sample a small number of data items from $P$ to study its distribution.





% %In a data marketplace, a buyer wants to buy a dataset “P” from an owner. To estimate the value of P, the owner may allow the buyer to sample a small number of data items from P. Obviously, the owner cannot leak the whole P to the buyer before completing the deal, and the buyer can only use the small sample to estimate the distribution of P ({\em e.g.}, \cite{DBLP:journals/jacm/KoutrisUBHS15}).

% Some other scenarios, {\em e.g.}, single-cell sequencing in biological experiments, also have a limit on the number of queries (because each ``query'' is usually expensive).

% (2) Suppose we have trained a generative model, and we want to check whether the model can generate a distribution satisfying our expectation. It may be difficult to directly study the distribution in the target space. We can learn the distribution in the target space by taking a small sample, and this small sample in the target space can be generated through sampling from the latent space (because the data items in the target space are generated by the latent variables). This can be viewed as an oracle for queries.



% \section{MORE EXPLANATION ON THE CASE $K$ IS NOT GIVEN}
% \label{sec-app-k}
% % \section{More Explanation on The Case $k$ Is Not Given}
 
%  To solve the case that $k$ is not given for $k$-center clustering, a natural idea is using Algorithm~\ref{alg-ourkc} as the black-box and performing binary search for the value of $k$. But we note that Algorithm~\ref{alg-ourkc} is a randomized algorithm (the success probability depends on the sample size which is a function of $k$). We do not know how many times we should call the black box Algorithm~\ref{alg-ourkc} in advance (the number of ``$\tilde{k}$'' is unknown), so we still need the techniques proposed in Section 3.2, like the ``stratified sampling'' and ``Lemma~\ref{lem-terminate}'', to guarantee the overall success probability and runtime. Also, using the ``doubling idea'' and ``binary search'' will cause some redundancy in computing. For example, if we perform binary search for ``$\tilde{k}$'' within an interval, we do not need to always re-run Algorithm~\ref{alg-ourkc} since some of the cluster centers have been obtained in the previous runs. In our paper, we show that just running Algorithm~\ref{alg-ourkc} together with the techniques based on ``stratified sampling'' and ``Lemma~\ref{lem-terminate}'' one time (i.e., Algorithm~\ref{alg-ourkc3}) should be already sufficient. 
 
 
 
 
 
 

%
%
%\begin{figure*}
%%	\begin{center}
%%			% \vspace{-0.1in}
%%		\includegraphics[width=0.33\textwidth]{resultfigs/real_kc_r}  
%%		\includegraphics[width=0.33\textwidth]{resultfigs/real_km_cost}   
%%		\includegraphics[width=0.33\textwidth]{resultfigs/median_real_km_cost} 
%%		 %\vspace{-0.1in}
%%		\caption{The normalized objective values on the  real  datasets.}     
%%		\label{fig-exp-objective2}
%%	\end{center}
%%	 %\vspace{-0.3in}
%%\end{figure*}



%\begin{abstract}
%Greedy selection is a widely used idea for solving many machine learning problems. But greedy selection algorithms often have high complexities and thus may be prohibitive for large-scale data. In this paper, we consider two fundamental optimization problems in machine learning: $k$-center clustering and convex hull approximation, where they both can be solved via greedy selection. We propose sublinear time algorithms for them through combining the strategies of randomization and greedy selection. Our results are similar in spirit to the linear time  stochastic greedy selection algorithms for submodular maximization [Mirzasoleiman et al., AAAI 2015, Hassidim and Singer, ICML 2017], but with several important differences. Our runtimes are independent of the number of input data items $n$. In particular, our runtime for $k$-center clustering significantly improves upon that of the uniform sampling approach [Huang et al, FOCS 2018], especially when the dimensionality is high. Moreover, our algorithms are particularly suitable for  the scenario that we cannot directly access the whole input data (due to the reasons like privacy preserving, data storage and transmission) and can only take a small sample via an oracle each time.  Our sublinear algorithms  yield the improvement on the efficiency for various applications, such as data selection and compression, active learning, topic modeling, {\em etc}. 
%% Finally, we conduct the experiments to evaluate our proposed algorithms in practice.  
%\end{abstract}
%
%\section{Introduction}
%\label{sec-intro}
%Greedy algorithm is one of the most fundamental tools for algorithm design~\cite{cormenintroduction}. 
%In particular, many optimization problems in machine learning can be solved through {\em greedy selection} method. The method iteratively selects a subset of data items from input based on some greedy strategy. One representative example is the Gonzalez's algorithm for {\em $k$-center clustering}~\cite{GONZALEZ1985293}. Given a set of data items ({\em e.g.,} a point set in $\mathbb{R}^d$), the algorithm is to iteratively select $k$ items from the input; if one draws $k$ equal-sized balls centered at these $k$ items, the whole input data set can be covered by these balls and the radius is no larger than two times the optimal one (the formal definition for $k$-center clustering is shown in Section~\ref{sec-pre}). 
%
%The algorithm is simple but has many applications in real world. One natural application is constructing {\em coreset} for compressing a large-scale data, especially when we want to maximize diversity or coverage~\cite{DBLP:conf/pods/IndykMMM14}. Another closely related application is {\em batch active learning}~\cite{sener2018active,DBLP:conf/iclr/ColemanYMMBLLZ20}. Most machine learning models heavily depend on high-quality labeled training datasets. However, because it is expensive to acquire a large number of labeled data, we may only be able to select a small number of data items (via $k$-center clustering) to label in each round (as an active learning process).  
%
%Another high dimensional optimization problem that depends on greedy selection is {\em convex hull approximation}~\cite{blum2019sparse,DBLP:journals/anor/AwasthiKZ20}, where the goal is to find a convex hull so that each data item can be approximately represented by the vertices. The problem has a number of applications in machine learning,  such as topic modeling, sparse approximation, and non-negative matrix factorization~\cite{DBLP:books/cu/20/0001M20}. Though the convex hull algorithms have been well studied in low dimensions~\cite{cgtextbook}, the problem in high dimensions is much more challenging. Similar with the $k$-center clustering, a common idea for convex hull approximation is using greedy selection to find the vertices iteratively. 
%
%
%
%Although these greedy selection methods enjoy promising performances in practice, they often suffer from high complexities when data sizes are extremely large. For instance, the vanilla Gonzalez's algorithm needs to run $k$ iterations and each iteration needs to scan the whole dataset in one pass (a detailed introduction on the previous work is shown in Section~\ref{sec-related}).  Similarly, the greedy selection method for convex hull approximation also needs to repeatedly scan the whole dataset and thus yields large runtime. So a natural question is: 
%
%{\em \hspace{0.2in} Can we modify these greedy selection algorithms to  achieve lower time complexities, {\em e.g.,} sublinear time complexities that are independent of  input data size, and meanwhile preserve their quality guarantees?  }
%
%%As  the rapid increase of data volumes in big data, designing sublinear time algorithms has become a popular topic in the past decades; more detailed discussion on sublinear time algorithms can be found in the survey papers~\cite{rubinfeld2006sublinear,czumaj2006sublinear}.
% 
%\vspace{-0.1in}
%\subsection{Related Work}
%\label{sec-related}
%\vspace{-0.05in}
%We introduce several important existing results that are related to this paper. 
%
%\textbf{$k$-center clustering.} As mentioned before, the greedy selection based $k$-center clustering algorithm~\cite{GONZALEZ1985293} can yield a $2$-approximation result; moreover,  it was proved that any approximation ratio lower than 2 implies $P=NP$~\cite{HS85}. To speed up the Gonzalez's algorithm, several improvements have been proposed before~\cite{DBLP:conf/stoc/FederG88,har2006fast}; however, they usually require some additional assumptions ({\em e.g.,} the dimensionality or intrinsic dimensionality should be small). To deal with large-scale data, a number of streaming algorithms which only need to read the data in one-pass were introduced in~\cite{DBLP:journals/siamcomp/CharikarCFM04,mccutchen2008streaming,DBLP:conf/icdt/Guha09,DBLP:journals/corr/abs-1802-09205}. Furthermore, several uniform sampling based ideas were presented for achieving sublinear complexity for $k$-center clustering (with outliers)~\cite{charikar2003better,huang2018epsilon}. 
%
%
%\textbf{Convex hull approximation.} Several elegant convex hull algorithms for low-dimensional space have been introduced in the community of computational geometry before~\cite{cgtextbook}. In this paper we only consider the convex hull approximation problem in high dimensions. The problem is closely related to non-negative matrix factorization and topic modeling~\cite{DBLP:books/cu/20/0001M20}. Roughly speaking, the vertices of the obtained convex hull can help us to generate the low rank non-negative matrices and discover the hidden topics. In general, this problem is intractable but it is possible to achieve an efficient solution under the {\em separability} assumption~\cite{DBLP:conf/nips/DonohoS03,DBLP:conf/focs/AroraGM12}. Recently, several practical algorithms with provable guarantees were also proposed, such as~\cite{blum2019sparse,DBLP:journals/anor/AwasthiKZ20,DBLP:conf/icml/AroraGHMMSWZ13}. 
%
%
%\textbf{Other applications of greedy selection in machine learning.} Besides the aforementioned two problems, greedy selection also has several other applications in machine learning. To name a few: {\em submodular maximization}~\cite{nemhauser1978analysis}, {\em column subset selection}~\cite{DBLP:journals/kais/FarahatEGK15}, {\em reinforcement learning}~\cite{DBLP:conf/icml/Painter-WakefieldP12}, and {\em sparse approximation}~\cite{DBLP:journals/tit/Tropp04}. 
%
%
%
%
%\vspace{-0.1in}
%\subsection{Our Contributions}
%\label{sec-our}
%\vspace{-0.05in}
%In this paper, we aim to develop sublinear time algorithms for the $k$-center clustering and convex hull approximation problems. 
%We assume the input data size and the dimensionality are both large. We combine the strategies of  greedy selection and randomization, and 
% show that the randomized greedy selection methods can achieve almost the same approximation guarantees, and meanwhile, the time complexities can be reduced to be sublinear. 
%
%\textbf{Comparison with the algorithms of~\cite{DBLP:conf/aaai/MirzasoleimanBK15,DBLP:conf/icml/HassidimS17}.} Actually, the high complexity issue of greedy selection has been discussed in~\cite{DBLP:conf/aaai/MirzasoleimanBK15,DBLP:conf/icml/HassidimS17} for the submodular maximization problem. They showed that if the greedy selection step is replaced by random sampling, a quality guarantee still holds but the complexity ({\em i.e.,} the number of function evaluations) can be reduced to be linear. Our proposed algorithms are inspired by the similar stochastic intuition but with several important differences.  
%%Our framework has two major differences with the methods of~\cite{DBLP:conf/aaai/MirzasoleimanBK15,DBLP:conf/icml/HassidimS17}. 
%\textbf{(\rmnum{1})} First, both $k$-center clustering and convex hull approximation are geometric optimizations in high dimensions which have different objective functions other than submodular maximization. 
%%So our analyses also need some novel insights from geometry. 
%\textbf{(\rmnum{2})} Second, our framework yields the sublinear time complexities that are independent of the number of input data items; this property is particularly important when we cannot access the input data (due to the reasons like privacy preserving, data storage and transmission) and can only take a small sample via an oracle each time. \textbf{(\rmnum{3})} Finally, we also consider the scenario that the number of iterations for greedy selection is unknown. For example, the number of clusters ``$k$'' of the $k$-center clustering may not be given; instead, we may just run the Gonzalez's algorithm iteratively until the obtained radius is no larger than a pre-specified threshold $r_0>0$. 
%We need to emphasize that designing a sublinear time algorithm becomes much more challenging with such a change, since it will be difficult to set the sample size in each iteration and determine when the algorithm should terminate. To remedy these issues, we propose a novel stratified sampling method and design a sampling based stopping condition for the greedy selection. 
%
%
%
%
%%the data size is extremely large or even infinity ({\em e.g.,} the data may be a continuous probability distribution in the space).  
%
%\textbf{Comparison with the streaming and uniform sampling algorithms.} As mentioned in Section~\ref{sec-related}, the one-pass streaming algorithms~\cite{DBLP:journals/siamcomp/CharikarCFM04,mccutchen2008streaming,DBLP:conf/icdt/Guha09,DBLP:journals/corr/abs-1802-09205} can avoid repeatedly reading the input data, however, they still suffer from high time complexities ({\em e.g.,} the ``doubling algorithm''~\cite{DBLP:journals/siamcomp/CharikarCFM04} takes a total $O\big(k(\log k)nd\big)$ time that is even higher than the complexity of the vanilla Gonzalez's algorithm, where $n$ is the number of input points). On the other hand, our proposed sublinear time algorithms have  the complexities independent of $n$. It is also worth to compare our results with the uniform sampling algorithms for $k$-center clustering~\cite{charikar2003better,huang2018epsilon}. For example, the recent result from Huang {\em et al.}~\cite{huang2018epsilon} showed that a simple uniform sample $S$ of size $\tilde{O}(\frac{kd}{\epsilon^2})$~\footnote{The asymptotic notation $\tilde{O}(f)=O\big(f\cdot \mathtt{polylog}(\frac{kd}{\eta\epsilon})\big)$, where $\eta\in(0,1)$ is the parameter controlling the success probability of sampling.} can approximately represent the whole input data $P$, where $\epsilon\in(0,1)$ indicates the small fraction of uncovered points; that is, if one runs the $2$-approximate Gonzalez's algorithm on the sample $S$, the obtained $k$ balls  still form a $2$-approximate solution in terms of the whole input $P$ but except for $\epsilon n $ uncovered points of $P$. The running time of the Gonzalez's algorithm on $S$ should be $\tilde{O}(k|S|d)=\tilde{O}(\frac{k^2d^2}{\epsilon^2})$. In Section~\ref{sec-ouralg}, we show that our algorithm takes $\tilde{O}(\frac{k^3d}{\epsilon})$ time. Usually, $k$ is much smaller than the dimensionality $d$, and thus our improvement over the result of~\cite{huang2018epsilon} is significant. 
%In particular, if $k$ is assumed to be constant, we improve their complexity by a factor up to $\frac{d}{\epsilon}$. 
%
% 
% \vspace{-0.1in}
% \section{Preliminaries}
%\label{sec-pre}
%\vspace{-0.05in}
%In this section, we introduce several important definitions that will be used throughout this paper. 
%Let $c\in\mathbb{R}^d$ and $r\geq 0$, we use $\mathbb{B}(c, r)$ to denote the ball centered at $c$ with radius $r$.  Also, given a set $S$ of points in $\mathbb{R}^d$, we use $\mathtt{conv}(S)$ to denote the convex hull of $S$. We use the function $\mathtt{dist}(p, U)$ to measure the shortest distance from a point $p$ to a set $U$, {\em i.e.,} $\mathtt{dist}(p, U):=\min_{q\in U}||p-q||$. 
%
%\begin{definition}[\textbf{$k$-center clustering}]
%\label{def-kc}
%Given a set $P$ of $n$ points in $\mathbb{R}^d$ and an integer $k\geq 1$, the goal of $k$-center clustering is to find $k$ balls $\mathbb{B}(c_1, r), \cdots, \mathbb{B}(c_k, r)$ with the smallest radius $r$ to cover the set $P$, that is, $P$ is partitioned into $k$ clusters with each cluster being covered by an individual ball, and the radius $r$ is minimized. 
%\end{definition}
%\begin{remark}
%The $k$-center clustering problem can be also defined for any abstract metric, where the only difference is that the Euclidean distance is replaced by the distance defined in the metric. Moreover, the proposed sublinear algorithms for $k$-center clustering in this paper can be also applied to any abstract metric with the same quality guarantees.  
%\end{remark}
%
%Let $r_{\mathtt{opt}}$ be the radius of the optimal solution for the $k$-center clustering on $P$. For any solution having a radius $r\leq \lambda r_{\mathtt{opt}}$ with some $\lambda\geq 1$, we call it a ``$\lambda$-approximation''. 
%
%
%
%\begin{definition}[\textbf{convex hull approximation}]
%\label{def-ch}
%Given a set $P$ of $n$ points in $\mathbb{R}^d$ and an integer $k\geq 1$, the goal of convex hull approximation is to find a subset $P_c\subset P$ with $|P_c|=k$, such that the error, {\em i.e.,} 
%%\begin{eqnarray}
%$\max_{p\in P}\mathtt{dist}\big(p, \mathtt{conv}(P_c)\big)$ 
%% \label{for-ch}
%%\end{eqnarray}
%is minimized (so if all the  points of $P$ are covered by $\mathtt{conv}(P_c)$, the error is $0$). 
%\end{definition}
%\begin{remark}
%In general, we can remove the requirement ``$P_c\subset P$'', {\em i.e.,} $P_c$ can contain any points in the space. But we often want $P_c$ to be meaningful or interpretable in practice, thus it is natural to require it to be a subset of the original input data.  
%\end{remark}
%Similar with $k$-center clustering, we can also define the approximation solution for convex hull approximation. But since the convex hull approximation is much more challenging, we often obtain bi-criteria approximations. Suppose $\alpha, \beta\geq 1$. If letting $\delta_{\mathtt{opt}}$ be the optimal error,  a bi-criteria  $(\alpha, \beta)$-approximation means that the obtained convex hull has the error $\delta\leq \alpha \delta_{\mathtt{opt}}$ and the number of vertices $k'\leq \beta k$. 
%
%\textbf{The rest of this paper is organized as follows.}  In Section~\ref{sec-kcenter}, we propose our sublinear time algorithm for $k$-center clustering. In particular, we also consider the practical case that the number of clusters $k$ is not given (Section~\ref{sec-kcu}). Further, in Section~\ref{sec-ch} we consider developing sublinear time algorithm for convex hull approximation by extending the idea from Section~\ref{sec-kcu}. Finally, we present our experimental results in Section~\ref{sec-exp}. 
%
%
%
%
%\vspace{-0.1in}
%\section{$k$-Center Clustering}
%\label{sec-kcenter}
%\vspace{-0.07in}
%In this section, we focus on the $k$-center clustering problem. For the sake of completeness, we briefly introduce the aforementioned $2$-approximate Gonzalez's algorithm~\cite{GONZALEZ1985293} first. 
%
%\textbf{Gonzalez's algorithm.} Initially, it selects an arbitrary point, say $c_1$, from the input $P$ and lets $C=\{c_1\}$; then it iteratively selects a new point that has the largest distance to $C$ among the points of $P$ and adds it to $C$, until $|C|=k$ (the distance between a point $q$ and $C$ is measured by $\mathtt{dist}(q, C)$); suppose $C=\{c_1, \cdots, c_k\}$, and then $P$ is covered by the $k$ balls $\mathbb{B}(c_1, r), \cdots, \mathbb{B}(c_k, r)$ with  $r\leq\min\{||c_i-c_j||\mid 1\leq i\neq j\leq k\}$. By using the triangle inequality, we can prove that the obtained radius $r\leq 2 r_{\mathtt{opt}}$. It is also easy to know that the running time of the Gonzalez's algorithm is $O(knd)$. 
%As mentioned before, a major drawback of the algorithm is the high complexity, especially when  $n$ and $d$ are  large.  
%
%
%
%\vspace{-0.1in}
%\subsection{Our Sublinear Algorithm}
%\label{sec-ouralg}
%\vspace{-0.07in}
%Our proposed algorithm can be viewed as a randomized version of the Gonzalez's algorithm. The key change is that we randomly select the next point for $C$ in each round, instead of always picking the furthest one. Below we prove that this strategy can achieve the same $2$-approximation except for a small error on the number of covering points. 
%
%\begin{algorithm}[h]
%   \caption{\textsc{Sublinear $k$-Center Clustering}}
%   \label{alg-ourkc}
%\begin{algorithmic}
%  \STATE {\bfseries Input:} A set $P$ of $n$ points in $\mathbb{R}^d$, $k\in\mathbb{Z}^+$, and two parameters $\eta, \epsilon\in (0,1)$. 
%%   \REPEAT
%   \STATE
%   \begin{enumerate}
%   \item Initially, let $C=\{c_1\}$, where $c_1$ is an arbitrary point picked from $P$; $i=1$. 
%   \item Repeat the following steps $k-1$ times:
%   \begin{enumerate}
%  \item Sample a set $Q$ of $\frac{k}{\epsilon}\log \frac{k}{\eta}$ points from $P$ uniformly at random. 
%  \item Select the furthest point, say $q_0$, from $Q$ to $C$, {\em i.e.,} $q_0=\arg_{q\in Q}\max\mathtt{dist}(q, C)$. 
%  \item Let $c_{i+1}=q_0$, $C=C\cup \{c_{i+1}\}$, and $i=i+1$. 
%   \end{enumerate}
%   \item Return $C$. 
%     \end{enumerate}
% %   \STATE {\bfseries Output}  the leaves of $\mathcal{H}$.
%     %     \UNTIL{The objective value becomes stable.}
%\end{algorithmic}
% \end{algorithm}
%
%
%\begin{theorem}
%\label{the-ourkc}
%Let $C=\{c_1, \cdots, c_k\}$ be the output from Algorithm~\ref{alg-ourkc}. With probability at least $1-\eta$, there exists a subset $\tilde{P}\subset P$ with size $|\tilde{P}|\geq (1-\epsilon)n$, such that $\tilde{P}$ is covered by $\cup^k_{i=1}\mathbb{B}(c_i, 2r_{\mathtt{opt}})$. 
%\end{theorem}
%To prove Theorem~\ref{the-ourkc}, we need the following claim first. 
%\begin{claim}
%\label{cla-sample}
%Let $U$ be a set of elements and $V\subseteq U$ with $\frac{|V|}{|U|}=\tau>0$. Given $\eta\in(0,1)$, we uniformly select a set $S$ of elements from $U$ at random. Then if $|S|\geq \frac{1}{\tau}\log\frac{1}{\eta}$, with probability at least $1-\eta$, $S$ contains at least one element from $V$.
%\end{claim}
%Actually, the above claim is a folklore result that has been presented in several papers before (such as~\cite{DX14}). Since each sampled element falls in $V$ with probability $\tau$, we know that the sample $S$ contains at least one element from $V$ with probability $1-(1-\tau)^{|S|}$. Therefore, if we want  to guarantee $1-(1-\tau)^{|S|}\geq 1-\eta$, $|S|$ should be at least $\frac{\log 1/\eta}{\log 1/(1-\tau)}\leq\frac{1}{\tau}\log\frac{1}{\eta}$.
%
%\vspace{-0.1in}
%\begin{proof}\textbf{(of Theorem~\ref{the-ourkc})}
%To help our analysis, we define $C_1:=\emptyset$, and $C_i :=\{c_1, c_2, \cdots, c_{i-1}\}$ for each $i=2,\cdots, k$ of Algorithm~\ref{alg-ourkc}. Further, we define 
%\begin{eqnarray}
%P_i:=\big\{p\in P\mid \mathtt{dist}(p, C_i)>\mathtt{dist}(c_i, C_i)\big\} \label{for-the-ourkc-1}
%\end{eqnarray} 
%for $1\leq i\leq k$. If we fix the index $i$, Claim~\ref{cla-sample} implies that the size $|P_i|$ is smaller than $\frac{\epsilon}{k}|P|$ with probability at least $1-\frac{\eta}{k}$. Otherwise, if $|P_i|\geq \frac{\epsilon}{k}|P|$, the sampled $\frac{k}{\epsilon}\log \frac{k}{\eta}$ points at the $i$-th round should contain at least one point from $P_i$ (with probability at least $1-\frac{\eta}{k}$), which is in contradiction with the definition of $P_i$ as (\ref{for-the-ourkc-1}) (that is, the selected $c_i$ should not be the furthest point of $Q$ to $C$, if $Q\cap P_i\neq \emptyset$). Therefore, through taking the union bound over all the $P_i$s, we have: with probability at least $(1-\frac{\eta}{k})^k\geq 1-\eta$, 
%\begin{eqnarray}
%\forall i=1, 2, \cdots, k, \hspace{0.2in} |P_i|<\frac{\epsilon}{k}|P|. \label{for-the-ourkc-2}
%\end{eqnarray}
%Let $\tilde{P}:=P\setminus\cup^k_{i=1}P_i$. It is easy to know the size $|\tilde{P}|\geq (1-\frac{\epsilon}{k}\times k)|P|=(1-\epsilon)n$. Next, we only need to prove that $\tilde{P}$  is covered by $\cup^k_{i=1}\mathbb{B}(c_i, 2r_{\mathtt{opt}})$. We present the following lemma first.
%
%\begin{lemma}
%\label{lem-tildep}
%The set $C=\{c_1, c_2, \cdots, c_k\}$ is returned by Algorithm~\ref{alg-ourkc}. For any point $p\in \tilde{P}$, $\mathtt{dist}(p, C)\leq \min_{1\leq i\neq i'\leq k}||c_i-c_{i'}||$. 
%\end{lemma}
%\begin{proof}
%Suppose Lemma~\ref{lem-tildep} is not true. Then there exist some  $p_0\in\tilde{P}$ and two points $c_{i_1}$ and $c_{i_2}\in C$, such that 
%\begin{eqnarray}
%\mathtt{dist}(p_0, C)>||c_{i_1}-c_{i_2}||. \label{for-lem-tildep-1}
%\end{eqnarray}
%Without loss of generality, we assume $i_1<i_2$. Since $||c_{i_1}-c_{i_2}||\geq \mathtt{dist}(c_{i_2}, C_{i_2})$, the inequality (\ref{for-lem-tildep-1}) implies
%\begin{eqnarray}
%\mathtt{dist}(p_0, C)>\mathtt{dist}(c_{i_2}, C_{i_2}). 
%\end{eqnarray}
%So $p_0\in P_{i_2}$ according to (\ref{for-the-ourkc-1}), which is in contradiction with the assumption $p_0\in \tilde{P}=P\setminus\cup^k_{i=1}P_i$. 
%\end{proof}
%
%
%
%
%
%
%
%Let $O_1, O_2, \cdots, O_k$ be the $k$ clusters obtained from  the optimal solution, {\em i.e.,} $P=\cup^k_{i=1}O_i$ and each cluster $O_i$ can be covered by a ball with radius $r_{\mathtt{opt}}$. We consider two cases. Case (\rmnum{1}): $\{c_1, \cdots, c_k\}$ fall into the $k$ clusters $O_1, O_2, \cdots, O_k$ separately. Without loss of generality, we assume $c_i\in O_i$ for $i=1, 2, \cdots, k$. By using the triangle inequality, we know the input set $P$ is covered by $\cup^k_{i=1}\mathbb{B}(c_i, 2 r_{\mathtt{opt}})$. Consequently, $\tilde{P}$  is also covered by $\cup^k_{i=1}\mathbb{B}(c_i, 2r_{\mathtt{opt}})$. 
%
%Case (\rmnum{2}): there exist two points, say $c_{i_a}$ and $c_{i_b}$, of $C$ that belong to one optimal cluster, say $O_l$. Thus $||c_{i_a}-c_{i_b}||\leq 2r_{\mathtt{opt}}$. From Lemma~\ref{lem-tildep}, we know 
%\begin{eqnarray}
%\forall p\in\tilde{P},\hspace{0.2in} \mathtt{dist}(p, C)\leq \min_{1\leq i\neq i'\leq k}||c_i-c_{i'}||\leq ||c_{i_a}-c_{i_b}||\leq 2r_{\mathtt{opt}}.
%\end{eqnarray}
%Hence $\tilde{P}$  is covered by $\cup^k_{i=1}\mathbb{B}(c_i, 2 r_{\mathtt{opt}})$.
% \end{proof}
%
%
%
%
%\textbf{Time complexity.} It is easy to see that the time complexity of Algorithm~\ref{alg-ourkc}  is independent of $n$. It takes $k$ rounds, and each round needs to compute the distances from the sampled $\frac{k}{\epsilon}\log \frac{k}{\eta}$ points to $C$. So the total complexity is $O(k\times \frac{k}{\epsilon}\log \frac{k}{\eta}\times kd)=O(\frac{k^3}{\epsilon}d\log\frac{k}{\eta})$. 
%%Below, we analyze the theoretical quality guarantee. 
%
%
%
%
%%
%%
%%\subsection{Extension \Rmnum{1}: $k$-Center Clustering for Continuous Probability Distribution}
%%\label{sec-kcc}
%%
%%We consider the extension of Theorem~\ref{the-ourkc} for (continuous or discrete) probability distribution. 
%In some scenarios, we may not be able to access the whole data ({\em e.g.,} due to the reasons like privacy preserving, data storage and transmission); instead, we may be only allowed to take a small sample each time. 
%%have an oracle to sample data based on the density function. 
%Specifically, we assume the data is a (continuous or discrete) probability distribution with the probability  density function $f$ in $\Omega\subset\mathbb{R}^d$, where $\int_{p\in \Omega}f(p)\mathtt{d} p=1$; the function $f$ can be hidden and we only assume that there is an oracle to sample data based on $f$. %We still use $O_1, O_2, \cdots, O_k$ to denote the optimal $k$ clusters as Section~\ref{sec-ouralg}. 
%Obviously, it is prohibitive to directly run the Gonzalez's algorithm in such scenario. On the other hand, our proposed Algorithm~\ref{alg-ourkc} can be naturally applied to solve this problem because it only takes a random sample in each round. 
%The following result is a straightforward extension of Theorem~\ref{the-ourkc}. 
%
%
%\begin{corollary}
%\label{the-con}
%We run Algorithm~\ref{alg-ourkc} on a (continuous or discrete) probability distribution over $\Omega$; each sampled point is taken by an oracle based on the probability  density function $f$. With probability at least $1-\eta$, there exists a subset $\tilde{\Omega}\subset \Omega$ with the integral $\int_{p\in\tilde{\Omega}}f(p)\mathtt{d} p\geq  1-\epsilon$, such that $\tilde{\Omega}$ is covered by $\cup^k_{i=1}\mathbb{B}(c_i, 2r_{\mathtt{opt}})$. 
% \end{corollary}
%
%%Further, we consider adding two realistic assumptions to achieve a solution covering the whole domain $\Omega$ (instead of a subset $\tilde{\Omega}$).  
%%
%%
%%\begin{assumption}
%%\label{ass-1}
%%There exists a constant $\alpha>0$ such that $\frac{\min_{p\in \Omega}f(p)}{\max_{p\in \Omega} f(p)}\geq \alpha$. 
%%\end{assumption}
%%
%%Before proposing the second assumption, we need to introduce an important definition for measuring intrinsic dimension for high dimensional data. 
%%
%%%For any $p\in \mathbb{R}^d$ and $r\geq 0$, we use $Ball(p, r)=\{q\in  \mathbb{R}^d\mid ||q-p||\leq r\}$ to indicate the ball of radius $r$ around $p$. 
%%
%%\begin{definition}[Doubling Dimension]
%%\label{def-dd}
%%The doubling dimension of a set $P\subset\mathbb{R}^d$ is the smallest number $\rho$, such that for any $p\in P$ and $r\geq 0$, $P\cap \mathbb{B}(p, 2r)$ is always covered by the union of at most $2^\rho$ balls with radius $r$.
%%\end{definition}
%%\begin{remark}
%%Usually, the doubling dimension is defined for an abstract metric (such as~\cite{har2006fast,talwar2004bypassing,DBLP:journals/talg/ChanGMZ16}). Here, since we focus on the applications for high-dimensional data with low intrinsic dimension, we directly describe the doubling dimension for point set in high-dimensional Euclidean space. 
%%\end{remark}
%%
%%\begin{assumption}
%%\label{ass-2}
%%The input data $P$ has a constant doubling dimension $\rho>0$. 
%%\end{assumption}
%%
%%
%%%As a warm-up, we prove the following result first. 
%%
%%
%%
%%
%%\begin{algorithm}[h]
%%   \caption{\textsc{Sublinear $k$-Center Clustering \Rmnum{2}}}
%%   \label{alg-ourkc2}
%%\begin{algorithmic}
%%  \STATE {\bfseries Input:} A continuous probability distribution $P$ over $\Omega\subset\mathbb{R}^d$ under Assumption~\ref{ass-1} and \ref{ass-2}, $k\in\mathbb{Z}^+$, and two parameters $\eta, \sigma\in (0,1)$. 
%%%   \REPEAT
%%   \STATE
%%   \begin{enumerate}
%%   \item Initially, let $C=\{c_1\}$, where $c_1$ is an arbitrary point picked from $P$; $i=1$. 
%%   \item Repeat the following steps $\frac{k}{(\sigma/4)^{\rho}}-1$ times:
%%   \begin{enumerate}
%%  \item Randomly pick a set $Q$ of $\frac{1}{\alpha}(\frac{k}{ (\sigma/4)^\rho})^2\cdot (\log \frac{k}{\eta}+\rho\log \frac{1}{\sigma})$ points from $P$. 
%%  \item Select the furthest point, say $q_0$, from $Q$ to $C$, {\em i.e.,} $q_0=\arg_{q\in Q}\max\mathtt{dist}(q, C)$. 
%%  \item Let $c_{i+1}=q_0$, $C=C\cup \{c_{i+1}\}$, and $i=i+1$. 
%%   \end{enumerate}
%%   \item Run the Gonzalez's algorithm on $C$ and return the obtained $k$ cluster centers $\hat{C}=\{\hat{c}_1, \cdots, \hat{c}_k\}$. 
%%     \end{enumerate}
%% %   \STATE {\bfseries Output}  the leaves of $\mathcal{H}$.
%%     %     \UNTIL{The objective value becomes stable.}
%%\end{algorithmic}
%% \end{algorithm}
%%
%%\begin{theorem}
%%\label{the-con-1}
%%We run Algorithm~\ref{alg-ourkc2}  and suppose Assumption~\ref{ass-1} and \ref{ass-2} are true. With probability at least $1-\eta$, $P$ is covered by $\cup^k_{j=1}\mathbb{B}(\hat{c}_j, (2+\sigma) r_{\mathtt{opt}})$. 
%%\end{theorem} 
%%\begin{proof}
%%Recall $r_{\mathtt{opt}}$ is the radius of the optimal $k$-center clustering on $P$, that is, $P$ can be covered by $k$ balls with radius $r_{\mathtt{opt}}$. If we repeatedly apply Definition~\ref{def-dd} $\log\frac{4}{\sigma}$ times, we know that $P$ can be covered by $2^{\rho\log \frac{4}{\sigma}}k=\frac{k}{(\sigma/4)^\rho}$ balls with radius $\frac{\sigma}{4} r_{\mathtt{opt}}$. Thus, we can view the input $P$ as an instance of $\frac{k}{(\sigma/4)^\rho}$-center clustering with the optimal radius no larger than $\frac{\sigma}{4} r_{\mathtt{opt}}$. We denote the corresponding optimal clusters as $D_1, D_2, \cdots, D_{\frac{k}{(\sigma/4)^\rho}}$, respectively. From Theorem~\ref{the-con}, we know that with probability at least $1-\eta$, there exists a subset $\tilde{\Omega}\subset \Omega$ with the integral 
%%\begin{eqnarray}
%%\int_{p\in\tilde{\Omega}}f(p)\mathtt{d} p> 1-\frac{\alpha}{k/(\sigma/4)^\rho}, \label{for-the-con-1-1}
%%\end{eqnarray}
%%such that $\tilde{\Omega}$ is covered by $\cup^{\frac{k}{(\sigma/4)^\rho}}_{j=1}\mathbb{B}(c_j, 2\times \frac{\sigma}{4} r_{\mathtt{opt}})$ (we just need to replace $k$ by $\frac{k}{(\sigma/4)^\rho}$ and $\epsilon$ by $\frac{\alpha}{k/(\sigma/4)^\rho}$). 
%%Meanwhile, from Assumption~\ref{ass-1} we know that for each optimal cluster $D_j$, $1\leq j\leq \frac{k}{(\sigma/4)^\rho}$, the size
%%\begin{eqnarray}
%%\int_{p\in D_j}f(p)\mathtt{d} p\geq \frac{\alpha}{k/(\sigma/4)^\rho}. 
%%\end{eqnarray}
%%Together with (\ref{for-the-con-1-1}), we know that for any $j$, $\tilde{\Omega}\cap D_j\neq \emptyset$. Since each $D_j$ has the radius no larger than $\frac{\sigma}{4} r_{\mathtt{opt}}$, we know 
%%\begin{eqnarray}
%%D_j\subset \cup^{\frac{k}{(\sigma/4)^\rho}}_{j=1}\mathbb{B}(c_j, 2\times \frac{\sigma}{4} r_{\mathtt{opt}}+2\times \frac{\sigma}{4} r_{\mathtt{opt}})=\cup^{\frac{k}{(\sigma/4)^\rho}}_{j=1}\mathbb{B}(c_j, \sigma r_{\mathtt{opt}}). 
%%\end{eqnarray}
%%Consequently, 
%%the whole input data
%%\begin{eqnarray}
%%P\subset\cup^{\frac{k}{(\sigma/4)^\rho}}_{j=1}\mathbb{B}(c_j, \sigma r_{\mathtt{opt}}). \label{for-the-con-1-2}
%%\end{eqnarray} 
%%In step 3, Algorithm~\ref{alg-ourkc2} runs the $2$-approximate Gonzalez's algorithm on $C$. Since $C\subset P$, the optimal radius for $C$ should be no larger than $r_{\mathtt{opt}}$. Namely, 
%%\begin{eqnarray}
%%C\subset\cup^k_{j=1}\mathbb{B}(\hat{c}_j, 2r_{\mathtt{opt}}). \label{for-the-con-1-3}
%%\end{eqnarray}
%%Combining (\ref{for-the-con-1-2}) and (\ref{for-the-con-1-3}), we have
%%\begin{eqnarray}
%%P\subset\cup^k_{j=1}\mathbb{B}(\hat{c}_j, (2+\sigma)r_{\mathtt{opt}}).
%%\end{eqnarray}
%%So we complete the proof for Theorem~\ref{the-con-1}. 
%%\end{proof}
%%
%%\textbf{The time complexity of Algorithm~\ref{alg-ourkc2}.} It is not difficult to calculate the runtime which is $O\Big(\frac{d}{\alpha}\big(\frac{k}{(\sigma/4)^\rho}\big)^4(\log \frac{k}{\eta}+\rho\log \frac{1}{\sigma})\Big)$. 
%%
%
%
%\subsection{When $k$ Is Not Given}
%\label{sec-kcu}
%In many real scenarios, the number of clusters $k$ is often not given. For instance, we may only have a threshold $r_0>0$ for the radius; so we just try to perform the $k$-center clustering algorithm for different values of $k$ until the obtained radius is no larger than $r_0$. The reader may realize that this problem is related to the well known {\em geometric set cover} problem~\cite{DBLP:journals/dcg/BronnimannG95,DBLP:journals/dcg/AgarwalP20}; however, existing geometric set cover algorithms often have large (super linear) running time and can only handle low dimensional case. 
%
%In this paper, we simplify the problem and  consider a practical approach: using the Gonzalez's algorithm to achieve our goal. Suppose the given set $P$ can be covered by $\tilde{k}\in \mathbb{Z}^+$ balls with radius $r_0/2$ ({\em i.e.,} $\tilde{k}$ is the value that the optimal radius of $\tilde{k}$-center clustering on $P$ is no larger than $r_0/2$). Then, if we just run the Gonzalez's algorithm  iteratively, the resulting radius will reach $r_0$ within at most $\tilde{k}$ rounds (because it is a $2$-approximation algorithm). Now we discuss how to implement this procedure in sublinear time. 
%We cannot directly adapt this procedure to our sublinear Algorithm~\ref{alg-ourkc}, due to the following two issues. 
%\textbf{(1)} The sample size $\frac{k}{\epsilon}\log \frac{k}{\eta}$ in step 2(a) depends on a given $k$; \textbf{(2)} we do not know when to terminate if $k$ is not given. 
%In this section, we focus on resolving these two issues. 
%
%First, we introduce a \textbf{stratified sampling method}. Let $k_0\geq 1$ be any fixed constant. Imagine we run  step 2(a)-2(c) of Algorithm~\ref{alg-ourkc} iteratively. We partition the process into different phases and modify the sample size in step 2(a)  for each phase accordingly:
%\begin{itemize}
%\item \textbf{Phase $t=0$:} for $i=1, 2, \cdots, k_0$, we set $|Q|=\frac{2k_0}{\epsilon}\log\frac{k_0}{2\eta}$. 
%\item \textbf{Phase $t\geq 1$:} for $i=\sum^{t-1}_{s=0}2^s k_0+1, \sum^{t-1}_{s=0}2^s k_0+2, \cdots, \sum^{t}_{s=0}2^s k_0$, we set $|Q|=2^{2t}\frac{2k_0}{\epsilon}\log\frac{2^{2t}k_0}{2\eta}$. 
%\end{itemize}
% So phase $t$ contains $2^t k_0$ iterations. The sample size also increases from phase $t$ to phase $t+1$. 
% 
% 
% 
%For completeness, we also need to set the stopping condition. Suppose $r_0>0$ is the given threshold. At the end of each $i$-th iteration, we take a sample $S$ from $P$ uniformly at random, and compute the ratio 
%\begin{eqnarray}
%\tau=\frac{\Big|S\setminus \big(\cup^i_{l=1}\mathbb{B}(c_l, r_0)\big)\Big|}{|S|}. \label{for-tau}
%\end{eqnarray}
%The following lemma introduces an \textbf{oracle} that can help us to decide when to terminate.
%
%\begin{lemma}
%\label{lem-terminate}
%Suppose $\eta_0\in (0,1)$. We set the sample size $|S|\geq \frac{12}{\eta_0\epsilon}\log\frac{2}{\eta_0}$. With probability at least $1-\eta_0$, the following oracle returns the correct answer:  if $\tau\leq \frac{3}{2}\epsilon$, return ``$\Big|P\setminus \big(\cup^i_{l=1}\mathbb{B}(c_l, r_0)\big)\Big|\leq 3\epsilon n$''; else, return ``$\Big|P\setminus \big(\cup^i_{l=1}\mathbb{B}(c_l, r_0)\big)\Big|>\epsilon n$''. 
%\end{lemma} 
%\begin{proof}
%For convenience, we use $\tilde{\epsilon}$ to denote the ratio $\frac{\big|P\setminus \big(\cup^i_{l=1}\mathbb{B}(c_l, r_0)\big)\big|}{n}$. We consider two cases: (\rmnum{1}) $\tilde{\epsilon}\leq \eta_0\epsilon$ and (\rmnum{2}) $\tilde{\epsilon}> \eta_0\epsilon$. For case (\rmnum{1}), $\Big|P\setminus \big(\cup^i_{l=1}\mathbb{B}(c_l, r_0)\big)\Big|=\tilde{\epsilon} n\leq \eta_0\epsilon n<3\epsilon n$. By Markov's inequality, we know that $\tau\leq \frac{1}{\eta_0}\times\eta_0\epsilon=\epsilon<\frac{3}{2}\epsilon$ with probability at least $1-\eta_0$.  Thus, it returns ``$\Big|P\setminus \big(\cup^i_{l=1}\mathbb{B}(c_l, r_0)\big)\Big|\leq 3\epsilon n$'', which is a correct answer, with probability at least $1-\eta_0$. So we  focus on the second case below. 
%
%We use the Chernoff bound~\cite{alon2004probabilistic}. Define $|S|$ random variables $\{y_1, \cdots, y_{|S|}\}$: for each $1\leq j\leq |S|$, $y_j=1$ if the $j$-th sampled element falls in $P\setminus \big(\cup^i_{l=1}\mathbb{B}(c_l, r_0)\big)$, otherwise, $y_j=0$. So $E[y_j]=\tilde{\epsilon}$ for each $y_j$. As a consequence, we have
%\begin{align}
%\textbf{Pr}\big(\big|\sum^{|S|}_{j=1}y_j-\tilde{\epsilon} |S|\big|\leq \frac{1}{2}\tilde{\epsilon}|S|\big)\geq 1-2e^{-\frac{\tilde{\epsilon}}{12}|S|}. 
%\end{align}
%Since $\tilde{\epsilon}> \eta_0\epsilon$, if $|S|\geq\frac{12}{\eta_0\epsilon}\log\frac{2}{\eta_0}$, with probability at least $1-\eta_0$, $\big|\sum^{|S|}_{j=1}y_j-\tilde{\epsilon} |S|\big|\leq \frac{1}{2}\tilde{\epsilon}|S|$, {\em i.e.,}
%\begin{eqnarray}
%\tau=\frac{\sum^{|S|}_{j=1}y_j}{|S|}\in [\frac{1}{2}\tilde{\epsilon}, \frac{3}{2}\tilde{\epsilon}]. \label{for-terminate-1}
%\end{eqnarray}
%Therefore,  if $\tau\leq \frac{3}{2}\epsilon$, we know $\frac{1}{2}\tilde{\epsilon}\leq \frac{3}{2}\epsilon$ from (\ref{for-terminate-1}),  and it implies $\tilde{\epsilon}\leq 3\epsilon$. Otherwise, we know $\frac{3}{2}\tilde{\epsilon}>\frac{3}{2}\epsilon$ and it implies $\tilde{\epsilon}>\epsilon$. 
%\end{proof}
%
%Now, we are ready to present our algorithm for the case without knowing $\tilde{k}$.
%% Suppose $\tilde{k}$ is the number of clusters returned by the Gonzalez's algorithm with a given threshold $r_0>0$ for the radius. Namely,  the Gonzalez's algorithm needs to run $\tilde{k}$ iterations until the resulting radius reduces to be no larger than $r_0$. 
% Let $i_{\mathtt{ter}}$ be the size of $C$ when Algorithm~\ref{alg-ourkc3} terminates. To evaluate the performance of the algorithm, we need to compare $i_{\mathtt{ter}}$ with $\tilde{k}$ and investigate the number of points that are covered by $\cup^{i_{\mathtt{ter}}}_{j=1}\mathbb{B}(c_j,r_0)$.
%
%
%
%
%\begin{algorithm}[h]
%   \caption{\textsc{Sublinear $k$-Center Clustering \Rmnum{2}}}
%   \label{alg-ourkc3}
%\begin{algorithmic}
%  \STATE {\bfseries Input:} A set $P$ of $n$ points in $\mathbb{R}^d$, a threshold $r_0>0$, an arbitrary constant integer  $k_0\in\mathbb{Z}^+$, and two parameters $\eta, \epsilon\in (0,1)$. 
%%   \REPEAT
%   \STATE
%   \begin{enumerate}
%   \item Initially, let $C=\{c_1\}$, where $c_1$ is an arbitrary point picked from $P$; $t=i=0$. 
%   \item Repeat the following steps as the stratified sampling procedure:
%   \begin{enumerate}
%      \item Take a sample $S$ from $P$ uniformly at random, where $|S|= \frac{12}{\eta_0\epsilon}\log\frac{2}{\eta_0}$ and $\eta_0=\frac{\eta}{2^{2t}k_0}$. 
%   \item  Repeat the following steps $2^t k_0$ times ({\em i.e.,} phase $t$):
%   \begin{enumerate}
%  \item Randomly pick a set $Q$ of $2^{2t}\frac{2k_0}{\epsilon}\log\frac{2^{2t}k_0}{2\eta}$ points from $P$. 
%  \item Select the furthest point, say $q_0$, from $Q$ to $C$, {\em i.e.,} $q_0=\arg\max_{q\in Q}\mathtt{dist}(q, C)$. 
%  \item Let $c_{i+1}=q_0$, $C=C\cup \{c_{i+1}\}$, and $i=i+1$. 
%  \item Apply Lemma~\ref{lem-terminate} as the oracle (using the sample $S$ from step 2(a)) to determine whether to terminate: if it returns ``$\Big|P\setminus \big(\cup^i_{l=1}\mathbb{B}(c_l, r_0)\big)\Big|\leq 3\epsilon n$'', stop the algorithm, set $i_{\mathtt{ter}}=i$, and return $C$. 
%     \end{enumerate}
%   \item $t=t+1$. 
%   \end{enumerate}
%%   \item Return $C$. 
%     \end{enumerate}
% %   \STATE {\bfseries Output}  the leaves of $\mathcal{H}$.
%     %     \UNTIL{The objective value becomes stable.}
%\end{algorithmic}
% \end{algorithm}
%
%
%\begin{theorem}
%\label{the-kcu}
%Let $C=\{c_1, \cdots, c_{i_{\mathtt{ter}}}\}$ be the output from Algorithm~\ref{alg-ourkc3}.  
%With probability at least $1-4\eta$,  $i_{\mathtt{ter}}\leq \tilde{k}$, and  there exists a subset $\tilde{P}\subset P$ with size $|\tilde{P}|\geq (1-3\epsilon)n$, such that $\tilde{P}$ is covered by $\cup^{i_{\mathtt{ter}}}_{j=1}\mathbb{B}(c_j,r_0)$. 
%\end{theorem}
%\begin{proof}
%To prove Theorem~\ref{the-kcu}, we first imagine the ``fancied'' scenario that $\tilde{k}$ is given: we just run Algorithm~\ref{alg-ourkc} with $k=\tilde{k}$ and $|Q|=\frac{\tilde{k}}{\epsilon}\log \frac{\tilde{k}}{\eta}$. 
%Recall the proof of Theorem~\ref{the-ourkc}, where we define a sequence of subsets $P_1, P_2, \cdots, P_{\tilde{k}}$ and define $\tilde{P}=P\setminus\cup^{\tilde{k}}_{i=1}P_i$. 
%To guarantee $|\tilde{P}|\geq (1-\epsilon)n$, we prove that each $P_i$ contains at most $\frac{\epsilon}{\tilde{k}}n$ points. For Algorithm~\ref{alg-ourkc3}, we also define a sequence of subsets  $P_1, P_2, \cdots, P_{\tilde{k}}$ (by using (\ref{for-the-ourkc-1})), but we need to modify their sizes. At each $t$-th phase, since we have the sample size $2^{2t}\frac{2k_0}{\epsilon}\log\frac{2^{2t}k_0}{2\eta}$, by using the similar idea from the proof of Theorem~\ref{the-ourkc}     we know that the size 
%\begin{eqnarray}
%|P_i|\leq \frac{\epsilon}{2^{2t}\times 2k_0}n, \text{ with probability at least $1-\frac{\eta}{2^{2t}k_0}$.} \label{for-the-kcu-1}
%\end{eqnarray}
%Suppose we run Algorithm~\ref{alg-ourkc3} until $i=\tilde{k}$. Let $t_0$ be the total number of phases that the algorithm takes. Consequently, we have 
%\begin{eqnarray}
%\frac{\big|\cup^{\tilde{k}}_{i=1}P_i\big|}{n}&\leq& \frac{\epsilon}{2k_0}\times k_0+\frac{\epsilon}{2^2\times 2k_0}\times 2k_0+\cdots+\frac{\epsilon}{2^{2t_0}\times 2k_0}\times 2^{t_0}k_0\nonumber\\
%&=&\frac{\epsilon}{2}(1+\frac{1}{2}+\cdots+\frac{1}{2^{t_0}})\leq\epsilon.\label{for-the-kcu-2}
%\end{eqnarray}
%So we can still guarantee $|\tilde{P}|=|P\setminus\cup^{\tilde{k}}_{i=1}P_i|\geq (1-\epsilon)n$. Furthermore, the total success probability is at least 
%\begin{eqnarray}
%&&(1-\frac{\eta}{k_0})^{k_0}\times(1-\frac{\eta}{2^2k_0})^{2k_0}\times\cdots\times(1-\frac{\eta}{2^{2(t_0-1)}k_0})^{2^{t_0-1}k_0}\nonumber\\
%&>&(1-\eta)\times(1-\frac{\eta}{2})\times\cdots\times(1-\frac{\eta}{2^{t_0-1}})\nonumber\\
%&>&1-(1+\frac{1}{2}+\cdots+\frac{1}{2^{t_0-1}})\eta>1-2\eta. \label{for-the-kcu-3}
%\end{eqnarray}
%
%
%The remaining issue is that we do not know the value of $\tilde{k}$ in reality (in other words, we do not know when to terminate the algorithm). Therefore, we apply Lemma~\ref{lem-terminate} as an oracle in step~2(b)(\rmnum{4}), where the success probability for each time is $1-\eta_0=1-\frac{\eta}{2^{2t}k_0}$. When it returns ``$\Big|P\setminus \big(\cup^i_{l=1}\mathbb{B}(c_l, r_0)\big)\Big|>\epsilon n$'', we know that the algorithm needs to continue. We stop the algorithm when it returns ``$\Big|P\setminus \big(\cup^i_{l=1}\mathbb{B}(c_l, r_0)\big)\Big|\leq 3\epsilon n$''. Since we relax the error of covering number to be $3\epsilon>\epsilon$, we know that $i_{\mathtt{ter}}$ should be no larger than $\tilde{k}$. By using the similar idea of (\ref{for-the-kcu-3}), we can obtain the overall success probability of the oracle that is at least 
%\begin{eqnarray}
%1-2\eta. \label{for-the-kcu-4}
%\end{eqnarray}
%
%Combining (\ref{for-the-kcu-3}) and (\ref{for-the-kcu-4}), the overall success probability of Algorithm~\ref{alg-ourkc3} is at least $1-4\eta$. 
%\end{proof}
%
%\vspace{-0.05in}
%
%\textbf{The time complexity of Algorithm~\ref{alg-ourkc3}.} We analyze the runtime for each phase. We set $k_0\geq 2$ to be a constant integer.  At the $t$-th phase, step (b)(\rmnum{1})-(\rmnum{3}) take $O(\frac{2^{3t}}{\epsilon}\log\frac{2^{2t}}{\eta}d)$ time; step (b)(\rmnum{4}) takes $O(\frac{2^{2t}}{\eta\epsilon}\log\frac{2^{2t}}{\eta} d)$ time. Also, the phase repeats step (b)(\rmnum{1})-(\rmnum{4}) $O(2^{t})$ times. Thus, the  $t$-th phase takes  $O((2^t+\frac{1}{\eta})\frac{2^{3t}}{\epsilon}\log\frac{2^{2t}}{\eta} d)$ time. 
% Let $t_0$ be the total number of phases. Then we can calculate the bounds for $\tilde{k}$: 
%\begin{eqnarray}
%\sum^{t_0-1}_{s=0}2^s k_0<\tilde{k}\leq \sum^{t_0}_{s=0}2^s k_0,
%\end{eqnarray}
%which implies $t_0\leq \log \frac{\tilde{k}}{k_0}+1\leq \log\tilde{k}$. So the total time complexity of Algorithm~\ref{alg-ourkc3} is $O((\tilde{k}+\frac{1}{\eta})\frac{\tilde{k}^{3}}{\epsilon}\log\frac{\tilde{k}}{\eta} d)$.
%Compared with the case that $\tilde{k}$ is given, the runtime is increased by only a factor $(\tilde{k}+\frac{1}{\eta})$ (the runtime of Algorithm~\ref{alg-ourkc} is $O(\frac{\tilde{k}^{3}}{\epsilon}\log\frac{\tilde{k}}{\eta} d)$). 
%
%We also have the following result for Algorithm~\ref{alg-ourkc3}, which is similar to Corollary~\ref{the-con}.
%
%\begin{corollary}
%\label{the-con2}
%We run Algorithm~\ref{alg-ourkc3} on a (continuous or discrete) probability distribution over $\Omega$; each sampled point is taken by an oracle based on the probability density function $f$. 
%With probability at least $1-4\eta$,  $i_{\mathtt{ter}}\leq \tilde{k}$, and 
%there exists a subset $\tilde{\Omega}\subset \Omega$ with the integral $\int_{p\in\tilde{\Omega}}f(p)\mathtt{d} p\geq  1-3\epsilon$, such that $\tilde{\Omega}$ is covered by $\cup^{i_{\mathtt{ter}}}_{i=1}\mathbb{B}(c_i,r_0)$. 
%%
%%
%%
%% there exists a subset $\tilde{P}\subset P$ with size $|\tilde{P}|\geq (1-3\epsilon)n$, such that $\tilde{P}$ is covered by $\cup^{i_{\mathtt{ter}}}_{j=1}\mathbb{B}(c_j,r_0)$. 
%%
%%
%%
%%
%%With probability at least $1-\eta$, there exists a subset $\tilde{\Omega}\subset \Omega$ with the integral $\int_{p\in\tilde{\Omega}}f(p)\mathtt{d} p\geq  1-\epsilon$, such that $\tilde{\Omega}$ is covered by $\cup^k_{j=1}\mathbb{B}(c_j, 2r_{\mathtt{opt}})$. 
% \end{corollary}
%
%
%
%\vspace{-0.1in}
%\section{Convex Hull Approximation in High Dimensions}
%\label{sec-ch}
%\vspace{-0.1in}
%
%Blum {\em et al.}~\cite{blum2019sparse} introduced a simple greedy convex hull approximation algorithm that is  similar in spirit to the Gonzalez's algorithm for $k$-center clustering. Given an instance $P\subset\mathbb{R}^d$, it also maintains a set $C$ that contains an arbitrarily selected $p\in P$ at the beginning. In each round, the algorithm always selects the farthest point from $\mathtt{conv}(C)$ and adds it to $C$, until some specified stopping condition is satisfied. For ease of presentation, we assume that $P$ is contained in a unit ball of $\mathbb{R}^d$. The algorithm yields a bi-criteria approximate result: given an error parameter $\delta\in(0,1)$, suppose $k_{\mathtt{opt}}=\min\big\{k\mid Q\subset P, |Q|=k, \max_{p\in P}\mathtt{dist}(p, \mathtt{conv}(Q))\leq \delta\big\}$; the algorithm can yield a subset $C\subset P$ such that 
% \begin{eqnarray}
% |C|=O(k_{\mathtt{opt}}/\delta^{2/3})  \hspace{0.1in}\text{and}\hspace{0.1in} \mathtt{dist}(p, \mathtt{conv}(C))\leq 8\delta^{1/3}+\delta, \forall p\in P. 
%  \end{eqnarray}
% 
% We consider applying our previous sampling idea to implement this convex hull approximation algorithm in sublinear time. Here, we have the same issue as Section~\ref{sec-kcu}, that is, we do not know the exact value of $k_{\mathtt{opt}}$ so that we cannot determine the sample size in each iteration and when to terminate. Thus we apply the same stratified sampling method. We also use Lemma~\ref{lem-terminate} as the oracle to determine whether the stopping condition is satisfied. A minor technical issue for implementation is that it is costly to compute the distance from a given point to  a convex hull (it needs to solve a quadratic programming for achieving the exact result); instead we can apply the Gilbert's algorithm~\cite{gilbert1966iterative} or some other variants like the Triangle algorithm~\cite{DBLP:journals/anor/AwasthiKZ20} to compute an approximate solution efficiently. Due to the space limit, we leave the full details for convex hull approximation to our supplement. 
% 
% 
% \vspace{-0.1in}
%\section{Experimental Results}
%\label{sec-exp}
%\vspace{-0.05in}
%All the experiments were conducted on an Ubuntu workstation with 2.40GHz Intel(R) Xeon(R) CPU E5-2680 and 256GB main memory. The algorithms were implemented in MATLAB R2019b. For each instance, we repeat the experiment $10$ times and report the average results with their standard deviations. 
%\newcounter{sd1}
%\begin{figure*} [h]
%	\begin{center}
%			 %\vspace{-0.1in}
%		\includegraphics[height=0.168\textwidth]{figure/alg1/radius_k_cifar10}  
%		\hspace{0.12in}
%		\includegraphics[height=0.168\textwidth]{figure/alg1/runtime_k_cifar10} 
%				\hspace{0.12in}
%						\includegraphics[height=0.18\textwidth]{figure/alg1/runtime_vs_radius_cifar10}  
% 		%		\includegraphics[width=0.33\textwidth]{resultfigs/median_syn_km_cost} 
%		\vspace{-0.02in}
%				\centerline{ \hspace{-0.8in}\hfill \stepcounter{sd1}  (\alph{sd1})\hfill \stepcounter{sd1} (\alph{sd1}) \hfill \stepcounter{sd1} (\alph{sd1})\hspace{1.6in}}
%		\vspace{-0.05in}
%		\caption{The experimental performances for the case that $k$ is given. In (c), we show the radius obtained versus runtime for different values of $k$.}     
%	\label{fig-exp-1}
%	\end{center}
%	 \vspace{-0.2in}
%\end{figure*}
%%
%%
%%\begin{figure*}
%%%	\begin{center}
%%%			% \vspace{-0.1in}
%%%		\includegraphics[width=0.33\textwidth]{resultfigs/real_kc_r}  
%%%		\includegraphics[width=0.33\textwidth]{resultfigs/real_km_cost}   
%%%		\includegraphics[width=0.33\textwidth]{resultfigs/median_real_km_cost} 
%%%		 %\vspace{-0.1in}
%%%		\caption{The normalized objective values on the  real  datasets.}     
%%%		\label{fig-exp-objective2}
%%%	\end{center}
%%%	 %\vspace{-0.3in}
%%%\end{figure*}
%\newcounter{sd2}
%\begin{figure*}[h] 
%	\begin{center}
% 		\includegraphics[height=0.17\textwidth]{figure/alg2/radius_cifar10_6x} 
%				\hspace{0.1in}
%						\includegraphics[height=0.185\textwidth]{figure/alg2/runtime_cifar10_6x}  
%										\hspace{0.1in}						
%								\includegraphics[height=0.198\textwidth]{figure/alg2/num_k_cifar10_6x}  						
% %		\includegraphics[width=0.33\textwidth]{resultfigs/median_syn_km_time}
%
% \vspace{-0.05in}
%				\centerline{ \hspace{-0.8in}\hfill \stepcounter{sd2}  (\alph{sd2})\hfill \stepcounter{sd2} (\alph{sd2}) \hfill \stepcounter{sd2} (\alph{sd2})\hspace{1.7in}}
% 		 \caption{The experimental performances for the case that a radius threshold $r_0$ is given. All the results (radius, runtime, and the number of returned centers) are respectively normalized over the results obtained by \textsc{Gonzalez}.}     
%		\label{fig-exp-2}
%	\end{center}
%	\vspace{-0.2in}
% \end{figure*}
%
%
% We run our proposed Algorithm~\ref{alg-ourkc} and Algorithm~\ref{alg-ourkc3} on  the real image dataset  \textbf{CIFAR-10}~\cite{krizhevsky2009learning} which consists of $60{,}000$ color images  with  each image being represented by a $3072$-dimensional vector. To illustrate the scalability of our algorithms for large-scale data, we enlarge  \textbf{CIFAR-10} by $6$ times; namely, for each image vector, we generate $5$ copies and  add small Gaussian noises to them. We consider several baseline methods including the $2$-approximate \textsc{Gonzalez}~\cite{GONZALEZ1985293} and the recently proposed streaming $k$-center clustering algorithm \textsc{CPP}~\cite{DBLP:journals/corr/abs-1802-09205}; we also compare our algorithms with the uniform sampling method \textsc{Uniform-$r$}, where $r$ denotes the sampling ratio ({\em e.g.,} \textsc{Uniform-$0.1$} means we take $10\%$ of the points from the input uniformly at random). 
% 
%In Figure~\ref{fig-exp-1}, we can see that our Algorithm~\ref{alg-ourkc} runs significantly faster than the other methods.  It is also worth emphasizing that in our evaluation, we compute the  radius for covering all the input points, rather than excluding the farthest $\epsilon n$ points as in the theoretical analysis of Theorem~\ref{the-ourkc}. We can see that our algorithm and \textsc{Gonzalez} can achieve very close radii,  even though we did not exclude the farthest $\epsilon n$ points. 
%
%In Figure~\ref{fig-exp-2}, we illustrate the results of Algorithm~\ref{alg-ourkc3}. Similar to Figure~\ref{fig-exp-1}, we can see that our algorithm runs much faster than \textsc{Gonzalez}. An interesting observation is that our algorithm returns far fewer centers than \textsc{Gonzalez} for a fixed radius threshold $r_0$ (Figure~\ref{fig-exp-2} (c)).  We believe one possible reason is  that our random sampling approach is more likely to select a point closer to the optimal ball center, and thus the obtained radius can decrease faster, while the greedy selection of \textsc{Gonzalez} always selects the most ``extreme'' point, which could be far from the optimal ball center. 
%
%
%Due to the space limit, we place more details of our experiments (including the experiments for convex hull approximation) to  our supplement. 
% 
%\vspace{-0.1in}
%\section{Conclusion}
%\label{sec-con}
%\vspace{-0.1in}
%In this paper, we propose sublinear algorithms for greedy selection methods for dealing with large-scale data. Following this work, there are also several interesting problems that deserve further study. For example, in our experiments we observe that our random sampling based approach can  achieve radii very close to those of the vanilla greedy selection approach \textsc{Gonzalez} (even without excluding  the farthest $\epsilon n$ points). So we expect to have a rigorous analysis of this phenomenon in theory, {\em e.g.,} adding some reasonable assumption to the data distribution from the perspective of {\em beyond worst-case analysis}~\cite{DBLP:journals/cacm/Roughgarden19}. Also, as  the energy consumption of IT  infrastructures has become a serious issue for the environment ({\em e.g.,} climate change), we believe that it is necessary to consider other aspects (like the energy consumption) when designing algorithms. Future work along this line 
%could have a long-term impact on society. 
%\newpage
%
%%Geometric optimization is one of most fundamental problems in computational geometry \cite{Agarwal2019GeometricOR}.
%%Consider the common problems of involving sets of points in $\mathbb{R}^d$ in Computational Geometry.
%%Let $P$ be a set of $n$ points in $\mathbb{R}^d$. Instead of solving the original problem directly, we
%%consider the problem of finding a small representative subset (e.g. coreset) $Q \subseteq  P$, 
%%such that the solution just on the subset is guaranteed to be a good
%%approximation of the solution on the original set, while preserving both its size and approximation quality. 
%%Such subset approximate algorithms ensure faster and approximate solutions.
%%The clustering problem (e.g. $k$-center clustering problem) is one of most popular geometric optimization problems, 
%%where goal is to partition the points into several clusters based on their similarities or dissimilarities;
%%the problem has been widely applied to practical applications, such as data mining~\cite{10.5555/1095618} and active learning~\cite{sener2018active}.
%%Another important topic is the convex hull problem,
%%where the goal is find a subset, such that every $p \in P$ can be represented as a convex combination of points in the subset.
%%In most applications where the convex hull is applied, i.e. surface simplification \cite{Heckbert95surveyof}, 
%%approximate convex hulls are more competitive than the true convex hull because they are using much less space to approximate the boundary of a set of points.
%%Finding approximate convex hulls is an important component of algorithms for $\epsilon$-kernels,
%%which is applied in data analysis \cite{agarwal2017efficient} and computer graphics \cite{barequet1999efficiently}.
%%Moreover, it turns out that $\epsilon$-approximate convex hulls can be used for sparse non-negative matrix factorization (NMF) \cite{blum2019sparse},
%%which has been widely applied in many applications, such as text mining and image analysis \cite{5360278}.
%%
%%However, in big data and machine learning era, the real-world data size could be large-scale. Many algorithms applied
%%to large-scale datasets may be difficult because of requiring prohibitive running time or memory.
%%In fact, real-world datasets are often too large to fit in memory, which makes the problems much more
%%challenging.
%%
%%\subsection{Related Work and Our Contribution}
%%
%%\paragraph{Sublinear time algorithms.}
%%As extremely large-scale datasets grow more prevalent in machine learning era, 
%%it is natural to wonder what one can do in sublinear time algorithms design. 
%%In fact, there has been a lot of research on this direction \cite{Czumaj2010, article}. 
%%For example, a number of sublinear time algorithms design on clustering have been studied
%%in \cite{https://doi.org/10.1002/rsa.20157, 10.1145/301250.301366, 10.1023/B:MACH.0000033115.78247.f0, 10.5555/365411.365499, ding:LIPIcs:2020:12904}.
%%In addition, the sublinear time algorithms design on graph model or probability distributions has been studied extensively \cite{goldreich1998property}.
%%
%%In this paper, we consider to develop sublinear time algorithms for several geometric optimization problems by random sampling approach.
%%\paragraph{$k$-center clustering problem.}
%%In the $k$-center clustering problem, the goal is to find a set of $k$ points in $P$, say $Q$, such that the maximum distance of a point in 
%%$P \subseteq \mathbb{R}^d$ to its closest point in $Q$ is minimized. 
%%It is known that the problem is NP-Hard. Even in $\mathbb{R}^2$, 
%%it is NP-Hard to obtain a $(1+\sqrt{7})/2$-approximate algorithm\cite{10.1145/62212.62255}.
%%
%%\paragraph{convex hull problem.}
%%The problem of convex hull is to find a subset of $P$, $Q \subseteq  P$, such that every
%%$p \in P$ can be represented as a convex combination of points in $Q$.
%%A greedy algorithm to construct approximate convex hull was proposed by Avrim Blum et al \cite{blum2019sparse}.
%%They proved that after $O(k_{opt}/ \epsilon^{\frac{2}{3}})$ iterations, 
%%their algorithm outputs a set which is an $O(\epsilon^{\frac{1}{3}})$-approximation to the original set of points
%%where $k_{opt}$ denote the minimum size of an $\epsilon$-approximate convex hull.
%%They also proposed a  novel algorithm to improve the running time further.
%%However, their algorithms are linear time algorithms.
%%For solving the problem of large-scale datasets (i.e. the set is too large to fit in memory), 
%%Avrim Blum et al proposed streaming algorithms for $\epsilon$-approximate convex hull with space complexity comparable 
%%to the optimal approximation \cite{blum2018approximate}.
%%
%%Note, existing algorithms often have high time complexities or high memory consumption, 
%%since they are not independent of the size of datasets.
%%In the big data era, we ask whether it is possible to remove the dependency 
%%on the size of datasets $n$ in each iteration of the algorithm. 
%%We need to implement the each selection step
%%by a random sampling approach, but it is challenging to guarantee the resulting quality.
%%We redesigned the basic algorithms based on greedy selection and the sampling manner.
%%For the $k$-center clustering problem, we propose a sublinear time algorithm with an approximation factor of 2;
%%for the convex hull problem, we propose a sublinear time algorithm for constructing an $\epsilon$-approximate convex hull.
%%The effectiveness of our proposed algorithms can be proved.
%%Meanwhile, in each round of step of the proposed algorithm, the sampling manner avoids reading all data into memory at one time, 
%%which can release computing resources.
%%
%%Moreover, we observe that our proposed framework of algorithms can be used to solve a broader range of geometric
%%optimization problems.
%%
%%\section{Definition and Preliminaries}
%%\label{others}
%%
%%\subsection{Clustering Problem}
%%
%%\begin{Definition}[$k$-center Clustering]
%%  \label{Def1}
%%  Given a set $P$ of $n$ points in $\mathbb{R}^d$ and a positive integer $k$, the problem of $k$-center clustering 
%%  is to find $k$ cluster centers $\{c_1, \cdots , c_k\} \subset \mathbb{R}^d$, such that $\max_{p \in P}
%%  \min_{1\leq j \leq k} \|p - c_{j}\|$ is minimized.
%%\end{Definition}
%%
%%\begin{Definition}[$k_\epsilon$-center Clustering]
%%\label{Def2}
%%Let P be an instance of k-center clustering, and $\epsilon \geq 0$. 
%%$k_\epsilon$-center clustering is to find a subset $P^{'}$ of $P$, where $|P^{'}| \geq (1-\epsilon)|P|$,
%%such that the corresponding clustering cost of Definition \ref{Def1} on $P^{'}$ is minimized.
%%\end{Definition}
%%Let the optimal partition forming $P^{'}$ denote $C^*$ and $r_{opt}$ denote resulting clustering cost of the optimal solution.
%%In this paper, we use $\{C_1, \cdots , C_k\}$ to be the k clusters forming the subset of $P$ with size $(1-\epsilon)|P|$,
%%and the resulting clustering cost be $r$.
%%
%%\paragraph{Other notations.}
%%For convenience, let $dist(p, Q)$ denote the shortest distance
%%between a point $p$ and a point set $Q$, i.e., $\min_{q\in Q} \|p - q\|$. Further, given two point sets $Q_1$
%%and $Q_2$, let $dist(Q_1, Q_2) = \min_{p \in Q_1, q \in Q_2} \|p - q\|$.
%%
%%\subsection{Convex Hull Problems}
%%
%%\begin{Definition}[convex hull]
%%\label{Def3}
%%Given a set $P$ of $n$ points in $\mathbb{R}^d$, $Q \subseteq  P$ is a convex hull of $P$ if every
%%$p \in P$ can be represented as a convex combination of points in $Q$.
%%\end{Definition}
%%
%%\begin{Definition}[$\epsilon$-approximate convex hull]
%%\label{Def4}
%%Given a set $P$ of $n$ points in $\mathbb{R}^d$ and  $\epsilon \in (0, 1)$,
%%and let $\Delta $ denote the diameter of $P$, i.e., $\Delta  = \max_{q, p \in P} \|p - q\|$. 
%%The problem of convex hull approximation is to find a subset $Q \subseteq P$, 
%%such that $\max_{p \in P} dist(p, \mathcal{C}_Q) \leq  \epsilon\Delta$, where $\mathcal{C}_Q$ denotes the convex hull of $Q$.
%%Then $Q$ is an $\epsilon$-approximate convex hull of $P$.
%%\end{Definition}
%%Let $k_{opt}$ denote the smallest size of $Q$.
%%For a set $P$ , its one sided Hausdorff distance from $Q$ is $dist_H(P, Q) = max_{p \in P}dist(p, Q)$.
%%
%%\section{Algorithm}
%%\subsection{Algorithms For Clustering Problem}
%%
%%The basic algorithm~\cite{GONZALEZ1985293} starts with an arbitrary point from $P$, 
%%and iteratively selects the following $k-1$ points, 
%%where each $j$-th step $(2 \leq j\leq k)$ chooses the point which has the largest minimum distance to the
%%already selected $j - 1$ points; finally, each input point is assigned to its nearest neighbor of
%%these $k$ points. It can be proved that this greedy strategy results in a 2-approximation of
%%k-center clustering.
%%
%%\renewcommand{\algorithmicrequire}{ \textbf{Input:}} %Use Input in the format of Algorithm
%%\renewcommand{\algorithmicensure}{ \textbf{Output:}} %UseOutput in the format of Algorithm
%%
%%\begin{algorithm}[htb] 
%%  \caption{2-Approximation Algorithm.} 
%%  \label{alg:1}
%%  \begin{algorithmic}[1] 
%%  \REQUIRE ~~\\ 
%%  An instance $ P \subset \mathbb{R}^d$ of k-center clustering, and $|P| = n $; the
%%$\epsilon > 0 $, $ \eta \in (0, 1)$, and $k \in \mathbb{Z}^{+}$ .\\
%%  \ENSURE ~~\\
%%  \STATE Let $n^{'} = \frac{k}{\epsilon} log \frac{k}{\eta}$, and $C = \phi $ . 
%%  \STATE Initially, $j = 1$; randomly select one point $c_1$ from $P$ and let $C = \{c_1\}$.
%%  \WHILE {$ j < k$}
%%  \label{code:3}
%%  \STATE Randomly sample $n^{'}$ points from $P$, say $Q_j$;
%%  \STATE Let $c_j$ denote the furthest point from $Q_j$ to $C$ and add $c_j$ to $C$;
%%  \STATE $j = j + 1$;
%%  \ENDWHILE 
%%  \RETURN $C$. 
%%  \end{algorithmic}
%%  \end{algorithm}
%%
%%\begin{theorem} \label{theorem1}
%%  With $probability\geq 1-\eta$, the clustering cost of Algorithm~\ref{alg:1} by Definition~\ref{Def1} $r \leq 2r_{opt}$.
%%\end{theorem}
%%  \paragraph{Running time. }
%%  In each round of Step~\ref{code:3}, there are $\frac{k}{\epsilon} log \frac{k}{\eta}$ points selected 
%%  to update the distances from the points of $P$ to $C$. 
%%  Overall, the running time of Algorithm~\ref{alg:1} is $O(k\frac{k}{\epsilon} log \frac{k}{\eta}kd)$.
%%
%%Before proving Theorem~\ref{theorem1}, we present the following two lemmas first.
%%
%%\paragraph{Proposition 1.}
%%Let $U$ be a set of elements, $\eta \in(0, 1)$, and $V\subset U$ with $\frac{|V|}{|U|}= \tau > 0$. 
%%If we randomly sample a set $S$ from $U$ with $|S|\geq \frac{1}{\tau} \log \frac{1}{\eta}$, then with probability at least $1-\eta$, 
%%$|S\cap V| \geq 1 $.
%%
%%To help the analysis, let 
%%$C_{j-1} = \{c_1, \cdots , c_{j-1}\}, P_j = \{p\in P | dist(p,C_{j-1}) > dist(c_j, C_{j-1})\}$, 
%%for $j = 1, 2, \cdots , k$.
%%
%%
%%\begin{lemma} \label{lemma1}
%%In each round of Step 3 of Algorithm~\ref{alg:1}, with $probability\geq 1-\frac{\eta}{k}$, $|P_j| < \frac{\epsilon}{k}|P|$.
%%\end{lemma}
%%% \paragraph{Lemma 1.} 
%%
%%
%%% \begin{equation} \label{eq:LL}
%%% \end{equation}
%%
%%\paragraph{Proof. }
%%Suppose that Lemma is not true, i.e., with $probability\geq 1-\frac{\eta}{k}$, $|P_j| \geq  \frac{\epsilon}{k}|P|$. 
%%By the Proposition 1, if randomly selecting $\frac{k}{\epsilon} log \frac{k}{\eta}$ (i.e., $|Q_j|$) points from $P$, with probability $1 - \frac{\eta}{k}$, 
%%it contains at least one point from $P_j$, i.e., $|Q_j \cap P_j|\geq 1$. 
%%Then, in each round of Step 3 of Algorithm~\ref{alg:1}, there is at least one point, say $q \in Q_j \cap P_j$,
%%for which $dist(q, C_{j-1}) > dist(c_j, C_{j-1})$, which contradicts the definition of $c_j$, 
%%since $c_j$ denotes the furthest point from $Q_j$ to $C_{j-1}$.
%%
%%Further we consider $\widetilde{P} = P \setminus \bigcup\limits_{j=1}^{k}P_j$. 
%%By the Lemma~\ref{lemma1}, with $probability \geq (1-\frac{\eta}{k})^{k} > 1-\eta$, we have $|\widetilde{P}| >(1 - \frac{\epsilon}{k}k)|P| = (1- \epsilon)|P|$.
%%
%%\begin{lemma} \label{lemma2}
%%$\forall p \in \widetilde{P}$, $dist(p, C) \leq \underset{1\leq i < j\leq k}{min} \|c_i - c_j\Vert$.
%%\paragraph{Proof. }
%%Suppose $\exists p_0 \in \widetilde{P}$, $c_i, c_j \in C(i < j)$, s.t.,
%%$dist(p_0, C) > \underset{1\leq i < j\leq k}{min}\|c_i - c_j\Vert$.
%%\end{lemma}
%%
%%Then we have
%%\begin{equation}
%%    \label{eqa:eq1}
%%  \begin{split}
%%  dist(p_0, C_{j-1})& \geq dist(p_0, C) \\
%%  &> \underset{1\leq i < j \leq k}{min}\|c_i - c_j\Vert \\
%%  & \geq dist(c_j, C_{j-1})
%%  \nonumber
%%  \end{split}
%%\end{equation}
%%The last inequality implies that when adding $c_j$, $p_0 \in P_j$, which in contradiction with 
%%$p_0 \in \widetilde{P} = P \setminus \bigcup\limits_{j=1}^{k}P_j$.
%%
%%\paragraph{Proof. (of Theorem~\ref{theorem1})}
%%Returning to the proof of Theorem~\ref{theorem1}, we have two cases:
%%
%%\begin{enumerate}
%%  \item \textbf{Case 1}: The k cluster centers $\{c_1, \cdots , c_k\}$ fall into the $k$ different optimal cluster, 
%%which directly implies that $dist(p, C) \leq 2r_{opt}$ for any $p \in P^{'}$ based on the triangle inequality.
%%  \item \textbf{Case 2}: Otherwise, by the pigeonhole principle, at least two centroids fall into
%%one cluster of the partition $C^{*}$.  Let the two centroids be $c_i$, $c_j$.
%%Then $\|c_i - c_j\Vert \leq 2r_{opt}$. By Lemma~\ref{lemma2},  $\forall p \in \widetilde{P}$, $dist(p, C) \leq 2r_{opt}$.
%%\end{enumerate}
%%
%%This exhausts all cases and completes the proof.
%%
%%
%%\subsection{Algorithms For Convex Hull Problems}
%%
%%The basic greedy algorithm is similar to the Gonzalez algorithm for k-center clustering:
%%Iteratively select the point $p \in P$ that is furthest from the linear subspace spanned by the point set $Q \subseteq P$
%%and then add it into $Q$ if this distance is greater than some threshold.
%%Let $\mathcal{S}_Q$ denote the linear subspace spanned by the point set $Q$. 
%%The algorithm runs $O(k_{opt}/ \epsilon^{\frac{2}{3}}) $ steps \cite{blum2019sparse}.
%%
%%% \paragraph{Theorem 1.} 
%%
%%Suppose we know $k_{opt}$, let N =$ k_{opt}$. Then we have Algorithm~\ref{alg:2} for constructing the $\epsilon$--approximate convex hull.
%%\begin{algorithm}[htb] 
%%  \caption{sublinear time $\epsilon$--approximate convex hull.} 
%%  \label{alg:2} 
%%  \begin{algorithmic}[1]
%%  \REQUIRE ~~\\ 
%%  An instance $ P \subset \mathbb{R}^d$ of convex hull; the
%%$\delta  > 0 $, $ \eta \in (0, 1)$, and $k_0 \in \mathbb{Z}^{+}$ .\\
%%  \ENSURE ~~\\ 
%%  \STATE Let $C = \phi $ . 
%%  \STATE Initially, $t = 2$; randomly select one point $c_1$ from $P$ and $|P| = n$; let $C = \{c_1\}$.
%%  \FOR{$j=1$ to $k_0$}
%%  \label{alg:2:code:3}
%%  \STATE random sample $2\frac{k_0}{\delta } log \frac{k_0}{\eta}$ points from $P$, say $Q_j$;
%%  \STATE Let $c_j$ denote the furthest point from $Q_j$ to $\mathcal{S}_C$ and add $c_j$ to $C$;
%%  \ENDFOR  
%%  \WHILE {$j < N$}
%%  \FOR{$j=(2^0+\cdots+2^{t-2})k_0+1$ to $(2^0+\cdots+2^{t-1})k_0$}
%%  \label{alg:2:code:8}
%%  \STATE random sample $2^{2(t-1)}2\frac{k_0}{\delta} \log \frac{2^{2(t-1)}k_0}{\eta}$ points from $P$, say $Q_j$;
%%  \STATE Let $c_j$ denote the furthest point from $Q_j$ to $\mathcal{S}_C$ and add $c_j$ to $C$;
%%  \ENDFOR  
%%  \STATE $t = t + 1$
%%  \ENDWHILE 
%%  \RETURN $C$.
%%  \end{algorithmic}
%%  \end{algorithm}
%%
%%\paragraph{Running time. }
%%Let $T(N, d)$ denote the time to compute the distance between point and $\mathcal{S}_C$.
%%Then, the running time of Algorithm~\ref{alg:2} is $O(\frac{N^3}{\delta} log \frac{1}{\eta}T(N,d))$.
%%
%%For the sake of analysis, let $C_{j-1} = \{c_1, \cdots$, $c_{j-1}\}$, 
%%and $P_j = \{p\in P | dist(p,\mathcal{S}_{C_{j-1}}) > dist_H ( Q_j, \mathcal{S}_{C_{j-1}} )\}$, 
%%for $j = 1, 2, \cdots , N$.
%%
%%\begin{lemma} \label{lemma3}
%%In each round of Step~\ref{alg:2:code:3} or step~\ref{alg:2:code:8} of Algorithm~\ref{alg:2}, we have
%%\begin{equation}
%%  \begin{split}
%%  probability &\geq 1-\frac{\eta}{k_0}, |P_j| < \frac{\delta}{2k_0}|P|, j = 1,\cdots , k_0 \\
%%  probability &\geq 1-\frac{\eta}{2^{2}k_0},|P_j| < \frac{\delta}{2^{2}2k_0}|P|,j = k_0+1, \cdots , k_0 + 2k_0  \\
%%  \cdots \\
%%  probability &\geq 1-\frac{\eta}{2^{2(t-1)}k_0},|P_j| < \frac{\delta}{2^{2(t-1)}2k_0}|P|,j = (2^0+\cdots+2^{t-2})k_0+1, \cdots ,(2^0+\cdots+2^{t-1})k_0 
%%  \nonumber
%%  \end{split}
%%\end{equation}
%%\end{lemma}
%%The proof is similar to the proof of lemma~\ref{lemma1}.
%%
%%Further we consider $\widetilde{P} = P \setminus \bigcup\limits_{j=1}^{N}P_j$.
%%By the lemma~\ref{lemma3}, we have $|\widetilde{P}| \geq  (1- \delta)|P|$,
%%since $\frac{\delta}{2k_0}k_0 + \frac{\delta}{2^{2}2k_0}2k_0 + \cdots + \frac{\delta}{2^{2(t-1)}2k_0}2^{t-1}k_0 \leq \delta$.
%%Then the success probability of $|\widetilde{P}| \geq  (1- \delta)|P|$
%%
%%\begin{equation}
%%  \begin{split}
%%  probability &\geq (1-\frac{\eta}{k_0})^{k_0}(1-\frac{\eta}{2^2k_0})^{2k_0} \cdots(1-\frac{\eta}{2^{2(t-1)}k_0})^{2^{t-1}k_0} \\
%%  & > (1-\eta)(1-\frac{\eta}{2}) \cdots(1-\frac{\eta}{2^{t-1}}) \\
%%  & \geqslant 1-\sum_{j=0}^{t-1}\frac{\eta}{2^{j}} \\
%%  & > 1- 2\eta
%%  \nonumber
%%  \end{split}
%%\end{equation}
%%
%%\begin{lemma} \label{lemma4}
%%With $probability\geq 1-2\eta$, 
%%$\forall p \in \widetilde{P}$, $dist(p, \mathcal{S}_C) \leq \underset{1\leq j\leq N}{min} dist_H(Q_j, \mathcal{S}_{C_{j-1}})$.
%%\paragraph{Proof. }
%%Suppose with $probability\geq 1-2\eta$, $\exists p_0 \in \widetilde{P}$, $c_i, c_j \in C$, s.t.,
%%$dist(p_0, \mathcal{S}_C) > \underset{1\leq j\leq N}{min}dist_H(Q_j, \mathcal{S}_{C_{j-1}})$.
%%\end{lemma}
%%Then we have
%%\begin{equation}
%%  \begin{split}
%%  dist(p_0, \mathcal{S}_{C_{j-1}})& \geq dist(p_0, \mathcal{S}_{C}) \\
%%  &> \underset{1\leq j \leq N}{min}dist_H(Q_j, \mathcal{S}_{C_{j-1}})
%%  \nonumber
%%  \end{split}
%%\end{equation}
%%The last inequality implies that when adding $c_j$, $p_0 \in P_j$, which in contradiction with 
%%$p_0 \in \widetilde{P} = P \setminus \bigcup\limits_{j=1}^{N}P_j$.
%%
%%
%%\begin{theorem} \label{theorem2} 
%%  With $probability\geq 1-2\eta$, Algorithm~\ref{alg:2} outputs a $\epsilon$--approximate convex hull of $\widetilde{P}$.
%%  \end{theorem}
%%
%%\paragraph{Proof.}
%%
%%First, we consider this case, that is,
%%we let Algorithm~\ref{alg:2} stop as soon as $dist_H(Q_j, \mathcal{S}_{C_{j-1}}) \leq \epsilon \Delta $, and outputs $C_{j-1}$.
%%By the lemma~\ref{lemma4}, in each iteration of j of Algorithm~\ref{alg:2}, once the distance between $Q_j$ and $\mathcal{S}_{C_{j-1}}$, 
%%i.e., $dist_H(Q_j, \mathcal{S}_{C_{j-1}}) \leq \epsilon \Delta $, we have $dist_H(\widetilde{P}, \mathcal{S}_{C_{j-1}}) \leq \epsilon \Delta $.
%%Then Algorithm~\ref{alg:2} outputs $C_{j-1}$ and the $\mathcal{C}_{C_{j-1}}$ is a $\epsilon$-approximate convex hull of $\widetilde{P}$.
%%However, it is linear time to get the $\Delta$ by a linear scan of the points.
%%Instead, one can use the check $|\mathcal{S}_{C}| \geq  (1- \delta)|P|$ as a stopping condition,
%%where $|\mathcal{S}_{C}|$ denote the number of points covered by $\mathcal{S}_{C}$. 
%%For ease of presentation, let $\beta = \frac{|\mathcal{S}_{C}|}{|P|}$.
%%We can use random sampling to estimate $\beta$.
%%Let $\{x_{i} | 1 \leq i \leq n^{'}\}$ be $n^{'}$ independent random variables with $x_{i} = 1$ 
%%if the $i$-th sampled point $p_{i}$ of $Q$ belongs to $\mathcal{S}_{C}$ 
%%(i.e., the distance between $p_{i}$ and $\mathcal{S}_{C}$ is smaller than some threshold), and $x_{i} = 0$ otherwise.
%%Let $X=\sum_{i=1}^{n^{'}} x_{i}$ and $\sigma \in (0, 1)$ be a small parameter.
%%Then, we have $E[x_{i}] = \beta$ for each i and the estimator is $Y=\frac{X}{n^{'}}$.
%%By using the Chernoff bound,
%%we have $\textbf{Pr}(|Y-\beta|> \sigma\beta) =\textbf{Pr}(|X-n^{'}\beta|> \sigma n^{'}\beta) \leq e^{-O(\sigma^{2}n^{'}\beta)} $.
%%Thus, in each iteration of j of Algorithm~\ref{alg:2},
%%one can use the check $ X \geq |Q_j|\beta \geq |Q_j|(1- \delta)$ as a stopping condition.
%%Finally, we have $|\mathcal{S}_{C}| \geq  (1- \delta)|P|$ and 
%%the success probability of Algorithm~\ref{alg:2} is greater than $1-e^{-O(\sigma^{2}|Q_j|(1- \delta))}$, tolerating a slight error.
%%\section{Applications}
%%
%%
%%\subsection{}
%
%

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%\section*{Checklist}
%
%%%%% BEGIN INSTRUCTIONS %%%
%%The checklist follows the references.  Please
%%read the checklist guidelines carefully for information on how to answer these
%%questions.  For each question, change the default \answerTODO{} to \answerYes{},
%%\answerNo{}, or \answerNA{}.  You are strongly encouraged to include a {\bf
%%justification to your answer}, either by referencing the appropriate section of
%%your paper or providing a brief inline description.  For example:
%%\begin{itemize}
%%  \item Did you include the license to the code and datasets? \answerYes{See Section~\ref{gen_inst}.}
%%  \item Did you include the license to the code and datasets? \answerNo{The code and the data are proprietary.}
%%  \item Did you include the license to the code and datasets? \answerNA{}
%%\end{itemize}
%%Please do not modify the questions and only use the provided macros for your
%%answers.  Note that the Checklist section does not count towards the page
%%limit.  In your paper, please delete this instructions block and only keep the
%%Checklist section heading above along with the questions/answers below.
%%%%% END INSTRUCTIONS %%%
%
%\begin{enumerate}
%
%\item For all authors...
%\begin{enumerate}
%  \item Do the main claims made in the abstract and introduction accurately reflect the paper's contributions and scope?
%    \answerYes{}
%  \item Did you describe the limitations of your work?
%    \answerYes{See  Section~\ref{sec-con}.}
%  \item Did you discuss any potential negative societal impacts of your work?
%    \answerYes{See  Section~\ref{sec-con}.}
%  \item Have you read the ethics review guidelines and ensured that your paper conforms to them?
%    \answerYes{}
%\end{enumerate}
%
%\item If you are including theoretical results...
%\begin{enumerate}
%  \item Did you state the full set of assumptions of all theoretical results?
%    \answerYes{}
%	\item Did you include complete proofs of all theoretical results?
%    \answerYes{}
%\end{enumerate}
%
%\item If you ran experiments...
%\begin{enumerate}
%  \item Did you include the code, data, and instructions needed to reproduce the main experimental results (either in the supplemental material or as a URL)?
%    \answerYes{In our supplemental material.}
%  \item Did you specify all the training details (e.g., data splits, hyperparameters, how they were chosen)?
%    \answerYes{}
%	\item Did you report error bars (e.g., with respect to the random seed after running experiments multiple times)?
%    \answerYes{We report the average experimental results including variances in Section~\ref{sec-exp}. }
%	\item Did you include the total amount of compute and the type of resources used (e.g., type of GPUs, internal cluster, or cloud provider)?
%    \answerYes{See the first paragraph in Section~\ref{sec-exp}.}
%\end{enumerate}
%
%\item If you are using existing assets (e.g., code, data, models) or curating/releasing new assets...
%\begin{enumerate}
%  \item If your work uses existing assets, did you cite the creators?
%    \answerYes{We use real datasets and cite the creators in Section~\ref{sec-exp}.}
%  \item Did you mention the license of the assets?
%    \answerNA{}
%  \item Did you include any new assets either in the supplemental material or as a URL?
%    \answerNo{}
%  \item Did you discuss whether and how consent was obtained from people whose data you're using/curating?
%    \answerNA{}
%  \item Did you discuss whether the data you are using/curating contains personally identifiable information or offensive content?
%    \answerNA{}
%\end{enumerate}
%
%\item If you used crowdsourcing or conducted research with human subjects...
%\begin{enumerate}
%  \item Did you include the full text of instructions given to participants and screenshots, if applicable?
%    \answerNA{}
%  \item Did you describe any potential participant risks, with links to Institutional Review Board (IRB) approvals, if applicable?
%    \answerNA{}
%  \item Did you include the estimated hourly wage paid to participants and the total amount spent on participant compensation?
%    \answerNA{}
%\end{enumerate}
%
%\end{enumerate}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%


%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

% \section{FORMATTING INSTRUCTIONS}

% To prepare a supplementary pdf file, we ask the authors to use \texttt{aistats2022.sty} as a style file and to follow the same formatting instructions as in the main paper.
% The only difference is that the supplementary material must be in a \emph{single-column} format.
% You can use \texttt{supplement.tex} in our starter pack as a starting point, or append the supplementary content to the main paper and split the final PDF into two separate files.

% Note that reviewers are under no obligation to examine your supplementary material.

% \section{MISSING PROOFS}

% The supplementary materials may contain detailed proofs of the results that are missing in the main paper.

% \subsection{Proof of Lemma 3}

% \textit{In this section, we present the detailed proof of Lemma 3 and then [ ... ]}

% \section{ADDITIONAL EXPERIMENTS}

% If you have additional experimental results, you may include them in the supplementary materials.

% \subsection{The Effect of Regularization Parameter}

% \textit{Our algorithm depends on the regularization parameter $\lambda$. Figure 1 below illustrates the effect of this parameter on the performance of our algorithm. As we can see, [ ... ]}

% \vfill
% \bibliographystyle{abbrv}




\end{document}
