%\documentclass{uai2022} % for initial submission
 \documentclass[accepted]{uai2022} % after acceptance, for a revised
                                    % version; also before submission to
                                    % see how the non-anonymous paper
                                    % would look like
%% There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2022} % ptmx math instead of Computer
                                         % Modern (has noticable issues)
% \documentclass[mathfont=newtx]{uai2022} % newtx fonts (improves upon
                                          % ptmx; less tested, no support)
% NOTE: Only keep *one* line above as appropriate, as it will be replaced
%       automatically for papers to be published. Do not make any other
%       change above this note for an accepted version.

%% Choose your variant of English; be consistent
\usepackage[american]{babel}


\usepackage{natbib} % has a nice set of citation styles and commands
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}

\usepackage{import}
%%%%% start contents of packages.tex
% ICML Recommended, but optional, packages for figures and better typesetting:
\usepackage{microtype}
\usepackage{graphicx}
% \usepackage{tikz}
%\usepackage{subfigure} % Switched to floatrow + subfig
%\usepackage{booktabs} % for professional tables
% END ICML Reccomended

\usepackage{xspace} % need it for the etal/etc/eg macros. CVPR includes, ICML 
% doesn't

% \usepackage[dvipsnames]{xcolor}
\usepackage{amsmath,amssymb,amsfonts}
\usepackage{etoolbox}

\usepackage[thmmarks, amsmath, thref]{ntheorem}
\usepackage{bbm}
\usepackage{bm}

%\usepackage{ulem} % for strikethrough \sout
\usepackage[normalem]{ulem} % for \sout w/o the annoying underline in \emph

\usepackage{times}
\usepackage{epsfig}
\usepackage{graphicx}
% \usepackage{multirow}
% \usepackage{multicol}
\usepackage{xcolor}
% Subfigures
%\usepackage{subfig}
%\usepackage{floatrow}
\usepackage{subcaption}
%\floatsetup[figure]{subcapbesideposition=top}
%\floatsetup[table]{capposition=top}


% \usepackage[linesnumbered,algoruled,boxed,lined]{algorithm2e}
%  \usepackage[linesnumbered,algoruled,noend,noline]{algorithm2e}
% \usepackage[algoruled,noend,noline]{algorithm2e} % a bit shorter with no line numbers


% \usepackage{slashbox}
\usepackage{diagbox}


\usepackage{lipsum}  

\usepackage{threeparttable}
\usepackage[singlelinecheck=true,justification=centering]{caption}
\captionsetup[table]{skip=10pt}
\usepackage{makecell}
\usepackage{multirow}
\usepackage{color,soul}
\usepackage{xcolor}  

% \definecolor{battleshipgrey}{rgb}{0.52, 0.52, 0.51}
\definecolor{dimgray}{rgb}{0.35, 0.35, 0.35}

\usepackage[linesnumbered,algoruled,noend,noline]{algorithm2e}

%%%%% end contents of packages.tex

%%%%% start contents of macros.tex
 %--------------------------------------------
% Add a period to the end of an abbreviation unless there's one
% already, then \xspace.
\makeatletter
% \onedot closes an abbreviation: it looks ahead at the next token (without
% consuming it) and then runs \@onedot, which typesets the final period only
% if the author has not already typed one, followed by \xspace.
\DeclareRobustCommand\onedot{\futurelet\@let@token\@onedot}
\def\@onedot{%
  \ifx\@let@token.%
  \else
    .\null
  \fi
  \xspace
}

% Italicized abbreviations; the macro supplies the trailing period.
\def\eg{\emph{e.g}\onedot}
\def\Eg{\emph{E.g}\onedot}
\def\ie{\emph{i.e}\onedot}
\def\Ie{\emph{I.e}\onedot}
\def\cf{\emph{cf}\onedot}
\def\Cf{\emph{Cf}\onedot}
\def\etc{\emph{etc}\onedot}
\def\vs{\emph{vs}\onedot}
\def\etal{\emph{et al}\onedot}
\def\iid{\emph{i.i.d}\onedot}

% Upright abbreviations.
\def\wrt{w.r.t\onedot}
\def\dof{d.o.f\onedot}
\makeatother
%--------------------------------------------------

% Abbreviated reference prefixes. Each one ends with a tie (~), so the number
% that follows is never separated from the prefix by a line break.
% (Spelled-out alternatives such as ``Equation '' can be substituted here.)
\newcommand*{\FIG}{Fig.~}
\newcommand*{\FIGS}{Figs.~}
\newcommand*{\EQN}{Eq.~}
\newcommand*{\EQNS}{Eqs.~}
% \newcommand{\EQNS}{Equations }

%\newcommand{\SEC}{Sec.~}.  Use \autoref or ~\autoref instead.



% integers
% Blackboard-bold number sets. Everything in this group is wrapped in
% \ensuremath, so the macros work both in running text and inside math.
\newcommand*{\ZZ}{\ensuremath{\mathbb{Z}}} % integers
\newcommand*{\RR}{\ensuremath{\mathbb{R}}} % reals

% Real spaces of a fixed dimension.
\newcommand*{\Rtwo}{\ensuremath{\RR^2}}
\newcommand*{\Rthree}{\ensuremath{\RR^3}}
\newcommand*{\Rfive}{\ensuremath{\RR^5}}
\newcommand*{\Rsix}{\ensuremath{\RR^6}}

% Real spaces of a symbolic dimension.
\newcommand*{\Rn}{\ensuremath{\RR^n}}
\newcommand*{\Rd}{\ensuremath{\RR^d}}
\newcommand*{\RD}{\ensuremath{\RR^D}}
\newcommand*{\Rk}{\ensuremath{\RR^k}}

% Sign-restricted subsets: positive integers, nonnegative reals, positive reals.
\newcommand*{\Zplus}{\ensuremath{\ZZ^+}}
\newcommand*{\Rnonneg}{\ensuremath{\RR_{\ge0}}}
\newcommand*{\Rpos}{\ensuremath{\RR_{>0}}}


% set notation
% \newcommand{\set}[1]{\ensuremath{{\left\{#1\right\}}}}
% Delimited collections with fixed-size delimiters; \tupleLarge is the
% variant whose parentheses grow with their content (\left ... \right).
\newcommand{\set}[1]{%
  \ensuremath{{\{#1\}}}%
}
\newcommand{\tuple}[1]{%
  \ensuremath{{(#1)}}%
}
\newcommand{\tupleLarge}[1]{%
  \ensuremath{{\left(#1\right)}}%
}

% arg min / arg max; the mandatory argument is typeset as the operator's subscript.
\newcommand{\argmin}[1]{%
  \ensuremath{\operatorname*{arg\,min}_{#1}}%
}
\newcommand{\argmax}[1]{%
  \ensuremath{\operatorname*{arg\,max}_{#1}}%
}

% Inner product, norm and sign, all with auto-sized delimiters (math mode only).
\newcommand{\InnerProduct}[2]{%
  \left\langle #1,#2 \right\rangle
}
\newcommand{\norm}[1]{%
  {{\left\|#1\right\|}}%
}
\newcommand{\sign}[1]{%
  {\mathrm{sign}\left(#1\right)}%
}

% Named norms: \ellTwoNorm{x} is \norm{x} with an \ell_2 subscript.
\newcommand*{\ellTwo}{\ell_2}
\newcommand*{\ellOne}{\ell_1}
\newcommand{\ellTwoNorm}[1]{%
  \norm{#1}_{\ellTwo}%
}




% \MATRIX[<column spec>]{<entries>}: a bracketed array. The optional argument
% is the array column specification and defaults to twenty centered columns.
\newcommand{\MATRIX}[2][cccccccccccccccccccc]{%
  \left[
    \begin{array}{#1}
      #2
    \end{array}
  \right]%
}

% Parenthesized ``(mod n)'' with explicit spaces before and inside it.
\newcommand{\Mod}[1]{%
  \ (\mathrm{mod}\ #1)%
}

% Bold x under a wide tilde (homogeneous coordinates).
\newcommand*{\bxh}{\widetilde{\bx}}

% Extra named colors used by the wrappers below.
\definecolor{orcgreen}{RGB}{100,150,100}
\definecolor{brown}{RGB}{165,42,42}

% Color wrappers: typeset the argument in the named color, inside a group
% so that the color change does not leak into the surrounding text.
\newcommand{\RED}[1]{{{\color{red}{#1}}}}
\newcommand{\BLUE}[1]{{{\color{blue}{#1}}}}
\newcommand{\GREEN}[1]{{{\color{green}{#1}}}}
\newcommand{\MAGENTA}[1]{{{\color{magenta}{#1}}}}
\newcommand{\ORCGREEN}[1]{{{\color{orcgreen}{#1}}}}
\newcommand{\BROWN}[1]{{{\color{brown}{#1}}}}

% Per-author inline remarks (bold, colored, tagged with the author's name).
% Add one macro per co-author following the same pattern.
\newcommand{\FREIFELD}[1]{\textbf{\MAGENTA{[FREIFELD says: #1]}}}
\newcommand{\OR}[1]{\textbf{\ORCGREEN{[OR says: #1]}}}

% Generic editing markers.
\newcommand{\TBD}{\RED{[TBD]}}
\newcommand{\TODO}[1]{\RED{[TODO: #1]}}
\newcommand{\REPLACE}[2]{\RED{REPLACE. } \BLUE{#1} \RED{WITH. } \BLUE{#2}}



% In python: print_iterable(['\\bmdefine\\b{0}'.format(x)+'{'+x+'}' for x in 
% string.ascii_letters])

% Bold shorthands for the Latin letters, created with the bm package's
% \bmdefine: \ba ... \bz for lowercase and \bA ... \bZ for uppercase.
% The letter f is the exception: it is available as \boldf, because \bf
% is already a standard command.
\bmdefine\ba{a}
\bmdefine\bb{b}
\bmdefine\bc{c}
\bmdefine\bd{d}
\bmdefine\be{e}
% \bmdefine\bf{f}  # Clashes with a standard command
\bmdefine\boldf{f}
\bmdefine\bg{g}
\bmdefine\bh{h}
\bmdefine\bi{i}
\bmdefine\bj{j}
\bmdefine\bk{k}
\bmdefine\bl{l}
% NOTE(review): the next line redefines \bm itself -- the bm package's own
% command -- as a bold ``m'', so \bm{...} no longer works as a bold-math
% command after this point. This looks like the same kind of clash as \bf
% above; renaming it (e.g., to \boldm) is presumably the fix, but first
% confirm that no text relies on \bm meaning a bold m.
\bmdefine\bm{m}
\bmdefine\bn{n}
\bmdefine\bo{o}
\bmdefine\bp{p}
\bmdefine\bq{q}
\bmdefine\br{r}
\bmdefine\bs{s}
\bmdefine\bt{t}
\bmdefine\bu{u}
\bmdefine\bv{v}
\bmdefine\bw{w}
\bmdefine\bx{x}
\bmdefine\by{y}
\bmdefine\bz{z}
% Uppercase letters.
\bmdefine\bA{A}
\bmdefine\bB{B}
\bmdefine\bC{C}
\bmdefine\bD{D}
\bmdefine\bE{E}
\bmdefine\bF{F}
\bmdefine\bG{G}
\bmdefine\bH{H}
\bmdefine\bI{I}
\bmdefine\bJ{J}
\bmdefine\bK{K}
\bmdefine\bL{L}
\bmdefine\bM{M}
\bmdefine\bN{N}
\bmdefine\bO{O}
\bmdefine\bP{P}
\bmdefine\bQ{Q}
\bmdefine\bR{R}
\bmdefine\bS{S}
\bmdefine\bT{T}
\bmdefine\bU{U}
\bmdefine\bV{V}
\bmdefine\bW{W}
\bmdefine\bX{X}
\bmdefine\bY{Y}
\bmdefine\bZ{Z}


% Bold Greek letters, lowercase. Naming is ``b'' + the letter's name, except
% for eta: ``b'' + ``eta'' would be \beta, which is already the letter beta,
% so bold eta is \boldeta.
\bmdefine\balpha{\alpha}
\bmdefine\bbeta{\beta}
\bmdefine\bgamma{\gamma}
\bmdefine\bdelta{\delta}
\bmdefine\bepsilon{\epsilon}
\bmdefine\bvarepsilon{\varepsilon}
\bmdefine\bzeta{\zeta}
\bmdefine\boldeta{\eta}
\bmdefine\btheta{\theta}
\bmdefine\blambda{\lambda}
\bmdefine\bmu{\mu}
\bmdefine\bxi{\xi}
\bmdefine\bpi{\pi}
\bmdefine\brho{\rho}
\bmdefine\bsigma{\sigma}
\bmdefine\bphi{\phi}
\bmdefine\bomega{\omega}

% Bold Greek letters, uppercase.
\bmdefine\bDelta{\Delta}
\bmdefine\bTheta{\Theta}
\bmdefine\bLambda{\Lambda}
\bmdefine\bPi{\Pi}
\bmdefine\bSigma{\Sigma}
\bmdefine\bPsi{\Psi}
\bmdefine\bOmega{\Omega}

% Bold numerals and symbols.
\bmdefine\bzero{0}
\bmdefine\bone{1}
\bmdefine\binfty{\infty}

% Indicator symbol: a double-struck 1 (\mathbbm comes from the bbm package).
\newcommand*{\indicator}{\mathbbm{1}}


% In python 
% print_iterable(['\\newcommand{\\'+'{0}cal'.format(x)+'}{\\mathcal{'+x+'}}' 
% for x in string.ascii_uppercase])
% Calligraphic capitals: \Acal ... \Zcal expand to \mathcal{A} ... \mathcal{Z}.
\newcommand*{\Acal}{\mathcal{A}}
\newcommand*{\Bcal}{\mathcal{B}}
\newcommand*{\Ccal}{\mathcal{C}}
\newcommand*{\Dcal}{\mathcal{D}}
\newcommand*{\Ecal}{\mathcal{E}}
\newcommand*{\Fcal}{\mathcal{F}}
\newcommand*{\Gcal}{\mathcal{G}}
\newcommand*{\Hcal}{\mathcal{H}}
\newcommand*{\Ical}{\mathcal{I}}
\newcommand*{\Jcal}{\mathcal{J}}
\newcommand*{\Kcal}{\mathcal{K}}
\newcommand*{\Lcal}{\mathcal{L}}
\newcommand*{\Mcal}{\mathcal{M}}
\newcommand*{\Ncal}{\mathcal{N}}
\newcommand*{\Ocal}{\mathcal{O}}
\newcommand*{\Pcal}{\mathcal{P}}
\newcommand*{\Qcal}{\mathcal{Q}}
\newcommand*{\Rcal}{\mathcal{R}}
\newcommand*{\Scal}{\mathcal{S}}
\newcommand*{\Tcal}{\mathcal{T}}
\newcommand*{\Ucal}{\mathcal{U}}
\newcommand*{\Vcal}{\mathcal{V}}
\newcommand*{\Wcal}{\mathcal{W}}
\newcommand*{\Xcal}{\mathcal{X}}
\newcommand*{\Ycal}{\mathcal{Y}}
\newcommand*{\Zcal}{\mathcal{Z}}

% Upright labels ``DP'' and ``Dir'' for use in math.
\newcommand*{\DP}{\mathrm{DP}}
\newcommand*{\DIR}{\mathrm{Dir}}

%%%%% end contents of macros.tex


\usepackage{booktabs}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%5
% Oren says:
% Load hyperref as the last package. You will live longer (and will have less warnings)
% \usepackage[colorlinks=true,    % declare we want color links
% 	    bookmarksopen=true,  % expanded bookmarks
% 	    pagebackref=false,        % or backref
% 	    linkcolor=blue,
% 	    urlcolor  = blue
% 	    ]{hyperref}
% \usepackage[backref, colorlinks,citecolor=blue]{hyperref}
% \usepackage[backref]{hyperref} %\hypersetup{urlcolor=blue, colorlinks=true} 
% Link appearance: colored link text instead of boxes, blue for every link type set here.
\hypersetup{%
  colorlinks=true,% colored text instead of boxes around links
  linkcolor=blue,%  internal links
  citecolor=blue,%  citations
  urlcolor=blue%    external hyperlinks
}

% \hypersetup{citecolor=blue}

% \usepackage{capt-of}
% \hypersetup{citecolor=blue}   
\usepackage{ellipsis} % The comment about regarding hyperref is more of a guideline than a rule... 
                      %``Note that ellipsis must be loaded after hyperref. 
                      %(The ellipsis documentation doesn't mention this, but the hyperref README does.)''

 \hypersetup{final}  % Keeps hyperref's links active even when draft mode is on in \documentclass;
                     % draft mode otherwise turns the links off, and passing draft=false/final=true
                     % to hyperref as package options does not help.

% Short \autoref names: a section sign for every sectioning level.
\renewcommand*{\sectionautorefname}{\S}
\renewcommand*{\subsectionautorefname}{\S}
\renewcommand*{\subsubsectionautorefname}{\S}

% \autoref name for algorithm floats.
\renewcommand*{\algorithmautorefname}{Algorithm}
% \renewcommand{\algorithmautorefname}{Alg.~}
%\newcommand{\Exampleautorefname}{Example}



\title{Revisiting DP-Means: Fast Scalable Algorithms \\ via Parallelism and Delayed Cluster Creation}


% The standard author block has changed for UAI 2022 to provide
% more space for long author lists and allow for complex affiliations
%
% All author information is authomatically removed by the class for the
% anonymous submission version of your paper, so you can already add your
% information below.
%
% Add authors
% Author entries; the optional argument [1] refers to the \affil[1] block below.
% The first argument of \href is the mailto: link target, so it holds the bare
% e-mail address: the template's placeholder angle brackets (and the stray
% space before the closing bracket) would otherwise become part of the address.
\author[1]{\href{mailto:dinari@post.bgu.ac.il?Subject=Your UAI 2022 paper}{Or Dinari}{}}
\author[1]{\href{mailto:orenfr@cs.bgu.ac.il?Subject=Your UAI 2022 paper}{Oren Freifeld}{}}
% Add affiliations after the authors
\affil[1]{%
    The Department of Computer Science,
    Ben-Gurion University of the Negev\\
    Be'er Sheva, Israel
}



\begin{document}


\maketitle

\begin{abstract}
DP-Means, a nonparametric generalization of K-Means, extends the latter to the case where the number of clusters is unknown. Unlike K-Means, however, DP-Means is hard to parallelize, a limitation hindering its usage in large-scale tasks. This work bridges this practicality gap by rendering the DP-Means approach a viable, fast, and highly scalable solution. First, we study the strengths and weaknesses of previous attempts to parallelize the DP-Means algorithm. Next, we propose a new parallel algorithm, called PDC-DP-Means (Parallel Delayed Cluster DP-Means), based in part on delayed creation of clusters. Compared with DP-Means, PDC-DP-Means provides not only a major speedup but also performance gains. Finally, we propose two extensions of PDC-DP-Means. The first combines it with an existing method, leading to further speedups. The second extends PDC-DP-Means to a Mini-Batch setting (with optional support for an online mode), allowing for another major speedup. We verify the utility of the proposed methods on multiple datasets. We also show that the proposed methods outperform other nonparametric methods (\emph{e.g.}, DBSCAN). Our highly efficient code can be used to reproduce our experiments and is available at~\url{https://github.com/BGU-CS-VIL/pdc-dp-means}. 
\end{abstract}
%%%%% start contents of introduction.tex
\section{INTRODUCTION}
%%%%% start contents of fig_objective.tex
\begin{figure}[t]%[h!]
\centering
  \includegraphics[height=6cm,trim=0 0 0 0, clip]{objective.pdf}
  \captionsetup{justification=justified, singlelinecheck=false}
\caption{Convergence of the different algorithms 
(note the logarithmic scale of the abscissa). MiniBatch PDC-DP-Means converges before 
the completion of the first iteration of PDC-DP-Means,
 which in turn converges before the completion of the first iteration of either DP-Means or P-DP-Means. 
 Data: 1 million 2D points generated from 50 Gaussians. 
 Results are average (solid lines) $\pm$ std.~dev.~(shaded areas) of 5 runs. 
 See~\autoref{Sec:Results} for details.}
\label{fig:objective}
\end{figure}
%%%%% end contents of fig_objective.tex


In the age of ``Big Data'',  algorithms that scale poorly, 
even if they offer a wealth of desirable properties,
are often discarded in favor of faster and more scalable alternatives.
%
This phenomenon is exemplified, in clustering tasks,  by the wide popularity 
of the simple K-Means  algorithm. 
One main reason for that popularity is that, by virtue
of the ease with which K-Means lends itself to parallelization and optimized computations, 
its speed is unrivaled by most other methods.
Thus, in large-scale clustering tasks, 
K-Means is often the weapon of choice. 

In particular, in practice K-Means is usually preferred over
the more powerful and elegant DP-Means algorithm~\citep{Kulis:ICML:2012:revisiting} (and its variants). 
This is despite the fact that with DP-Means, the user obtains K-Means-like clustering 
with the added benefit of being free from having to specify (or guess) the value of $K$, the number of clusters. 


Although there are numerous cases where, at least in theory, it would have made sense to use DP-Means
instead of K-Means, DP-Means has a major drawback that hinders its applicability:
at least in its original formulation, DP-Means cannot be parallelized as efficiently as K-Means. 
Bridging this practical gap is the topic of this paper
(see~\autoref{fig:objective}). 

We start by studying two existing methods that attempt to parallelize DP-Means: 
\textbf{DACE}~\citep{Jiang:2017:dace} and Parallel DP-Means (\textbf{P-DP-Means})~\citep{Pan:NeRIPS:2013:optimistic}.
This study then leads us to understand not only what makes DP-Means hard to parallelize
but also how to overcome this difficulty.
%
Based on those insights, we propose  new algorithms, all targeting the minimization of the DP-Means cost function.
The first proposed algorithm, on top of which the others are built, is Delayed Cluster DP-Means (\textbf{DC-DP-Means}). 
This \emph{serial} algorithm is a new variant of the 
original (and also serial) DP-Means algorithm. DC-DP-Means has two important advantages 
over the vanilla DP-Means: 1) it is less prone to over-clustering
and usually achieves better clustering results;
2) it removes the limitation which halted K-Means-like  parallelization in DP-Means.
%
That second advantage lets us propose our first parallel algorithm, Parallel DC-DP-Means (\textbf{PDC-DP-Means}), which
lends itself,  by design, to an extremely-efficient implementation 
which \emph{rivals the speed of popular and optimized K-Means implementations} while having the same performance as DC-DP-Means.  

Next, we extend PDC-DP-Means to \emph{two additional parallel algorithms}. 
The first,  \textbf{DACE-PDC-DP-Means}, uses PDC-DP-Means in conjunction with DACE,
achieving greater speed than either DACE or PDC-DP-Means.
The second, \textbf{MiniBatch PDC-DP-Means},  is for a Mini-Batch setting, can be used in either an online or an offline mode,
and offers an additional major speedup over our already-fast PDC-DP-Means. 
To put the speedup in perspective, our MiniBatch PDC-DP-Means 
clusters the entirety of ImageNet's~\citep{Deng:CVPR:2009:imagenet} train-set
(following dimensionality reduction, via feature extraction, to 128) in as little as $13$ seconds. 
\emph{This is orders of magnitude faster than the competitors} (who were given the same 128-dimensional features as input). 
The full comparison and details for that experiment appear in~\autoref{Sec:Results}.
Importantly: 1) such a large-scale task is outside the scope of the original DP-Means; 2) while previous parallel DP-Means methods can handle such large data, 
we do it in a fraction of the time it takes them. More generally, 
we are unaware of any nonparametric method (including ones unrelated to DP-Means) that is even close to that speed.

To summarize, our main contributions are:
 1) we identify what stymied previous DP-Means parallelization methods;
 2) we propose the Delayed Cluster DP-Means algorithm and its parallel version, PDC-DP-Means, which offer a performance gain 
 and a major speedup over DP-Means;
 3) we propose two extensions of PDC-DP-Means (DACE-PDC-DP-Means and MiniBatch PDC-DP-Means) that offer additional major speedups.
 
 

%%%%% end contents of introduction.tex



%%%%% start contents of related.tex
\section{RELATED WORK}
\textbf{K-Means. }
The K-Means cost function is the sum of squared $\ellTwo$ norms of the residuals between 
the observations and the mean of the cluster each of them is assigned to; \ie, 
\begin{align}
% \hspace{-2mm}
% f((\bmu_k)_{k=1}^K,(z_i)_{i=1}^N)=
\sum\limits_{k=1}^K \sum\limits_{i:z_i=k}  \ellTwoNorm{\bx_i-\bmu_{k}}^2 
\end{align}
where 
$N$ is the number of points (or observations), 
$K$ is the number of clusters, 
$\bx_i\in \Rd$ is data point $i$, 
$\bmu_k\in \Rd$ is the mean (or center) of cluster $k$, 
and the label $z_i=k$ if and only if $\bx_i$ belongs to cluster $k$. Thus, cluster $k$, denoted by $\Ccal_k$, consists 
of the points whose assignment is $k$: $\Ccal_k=(\bx_i)_{i:z_i=k}$. 
The function is to be minimized \wrt both $(\bmu_k)_{k=1}^K$
and $(z_i)_{i=1}^N$. Importantly, the user must specify $K$. 
Since initially proposed by~\cite{Forgy:1965:cluster}, 
many variations of K-Means have been developed. 
Today, the popular approaches for performing K-Means inference are either Lloyd's algorithm~\citep{Lloyd:PCM:1982:kmeans} (usually referred to as the K-Means \emph{algorithm}) or Elkan's~\citep{Elkan:2003LICMLusing}.
Both are easy to parallelize, thereby offering a very fast clustering method. We remark that in this work we borrow from~\cite{Lloyd:PCM:1982:kmeans}
several ideas related to parallelization and mini-batches. 

\textbf{DP-Means. } Proposed by~\cite{Kulis:ICML:2012:revisiting}, the DP-Means is a nonparametric extension of K-Means, 
rooted in Bayesian nonparametrics, and closely-related to 
the Dirichlet Process Mixture model~\citep{Antoniak:AoS:1974:DPMM,West:Book:1993:hierarchical}.
In the DP-Means algorithm, when an observation's squared distance from the mean of its closest cluster exceeds a user-defined parameter $\lambda$ ($\lambda>0$), a new cluster is formed, and the observation 
is assigned to it. The associated cost function is similar to K-Means, 
except that there is an added penalty term and the minimization is also \wrt
$K$: 
\begin{align}
% \hspace{-2mm}
% f(K,(\bmu_k)_{k=1}^K,(z_i)_{i=1}^N)=
\left(
\sum\limits_{k=1}^K \sum\limits_{i:z_i=k}  \ellTwoNorm{\bx_i-\bmu_{k}}^2
\right) +\lambda K \, .
    \label{eqn:dpmeans}
\end{align}
Note that the penalty term, $\lambda K$, penalizes the creation of new clusters. As we will explain later, the DP-Means algorithm is serial and thus is inherently slow. 

Several works have extended the original DP-Means algorithm.  \cite{Bachem:ICML:2015coresets} use coresets to achieve fast approximate inference; \ie, the entire dataset is efficiently summarized by a small weighted subset of representative points.
That approach allows one to use slow algorithms, such as the original DP-Means, on large datasets.
\cite{Odashima:ECMLKDD:2016:split} proposed several DP-Means-related algorithms. First, they  developed an Online DP-Means and a Batch DP-Means algorithms, both based on the MiniBatch K-Means~\citep{Sculley:ICWWW:2010:batch-kmeans}.
In addition, they have developed Split-DP-Means and Merge-DP-Means, and then combined them into a Split-Merge DP-Means.
The split/merge moves are used in order to try to escape poor local minima
(for other Bayesian nonparametric clustering methods using splits and merges, 
see, \eg,~\cite{Jain:Jounral:2004:split,Chang:NIPS:2013:ParallelSamplerDP,Chang:NIPS:2014:ParallelSamplerHDP,Dinari:UAI:2020:vHDP,Ronen:CVPR:2022:DeepDPM}).
%
\cite{Kobayashi:Neurocomputing:2021generalized} add another term to the DP-Means cost function,
 making it more robust to outliers. 
\cite{Paul:Stat:2020bayesian} proposed the EWDP-Means, which incorporates optimal feature weighting using Gibbs sampling.
%
While all the DP-Means methods above are serial,
two works which are of particular interest to us are the Parallel DP-Means,
proposed by~\cite{Pan:NeRIPS:2013:optimistic}, and DACE, proposed by~\cite{Jiang:2017:dace}.
Both these methods use parallelization and we will return to 
them later in this paper. 
%%%%% end contents of related.tex

%%%%% start contents of background.tex
\section{BACKGROUND}\label{sec:background}
Below we review several well-known facts (of interest in our context) 
about the K-Means and DP-Means algorithms. 

\textbf{Optimizing Lloyd's Algorithm. } Lloyd's K-Means algorithm 
(\autoref{alg:kmeans}) is simple enough that it can be both optimized (in terms of efficiency 
 of the computations) and parallelized such that its speed is
 virtually unrivaled. 
 In terms of running time, the most expensive part is calculating the distance between each data point and each of the $K$ cluster means.
 To optimize this part, it is better to use matrix multiplication than calculating such distances one by one;
 \eg, with matrix multiplication the user
 can leverage Basic Linear Algebra Subprograms (BLAS)~\citep{Lawson:1979:BLAS} as much as possible.
 For that aim, note first that the (squared) distance calculation has three parts:
 \begin{align}
     \ellTwoNorm{\bx_i-\bmu_k}^2=\ellTwoNorm{\bx_i}^2-2\bx_i^T\bmu_k+\ellTwoNorm{\bmu_k}^2\,.
     \label{Eqn:SquaredDist}
 \end{align}
In the RHS of~\EQN\eqref{Eqn:SquaredDist}, 
 the first term, $\ellTwoNorm{\bx_i}^2$, is constant \wrt $k$
 while the last term, $\ellTwoNorm{\bmu_k}^2$, can be computed
 just once per iteration (instead of $N$ times per iteration).
 It follows that the main effort lies with computing the middle term, $2\bx_i^T\bmu_k$. 
 To make that computation efficient, 
 two things are done. 
 First, $\bX$ is split into $P$ parts,  $(\bX_p)_{p=1}^P$, 
which can be processed in parallel. Let $N_p$ denote the number of points in part $p$, and let  
$\bX_p=\MATRIX{\bx_{1,p}&\ldots&\bx_{N_p,p}}^T\in\RR^{N_p\times d}$ denote
that part, written as a matrix. 
Second, instead of computing $N_p\times K$  individual 
vector-vector multiplication computations of $\bx_{j,p}^T\bmu_k$
(one for each $(j,k)$ pair), a single matrix-matrix multiplication is performed:
$\bX_p \bM^T$ where $\bM=\MATRIX{\bmu_{1} , \ldots , \bmu_{K} }^T$. 
 The above steps are the root reasons for the speedup 
 that Lloyd's algorithm achieves over a naive implementation. However, it can be optimized even further: once each part is processed, it can produce its relative contributions to the cluster means,
 which in turn can be aggregated in the main process before the next iteration.
 %%%% start contents of alg_kmeans.tex
{   % Algorithm 1: Lloyd's K-Means. The enclosing group is presumably there to
    % keep the \SetKwComment redefinition local to this float -- TODO confirm.
    \SetKwComment{Comment}{}{}
    \begin{algorithm}[t]
    \KwIn{$K$}
    \KwData{$\bX=(\bx_i)_{i=1}^N\subset \Rd$}
    % Initialization: the K means are K randomly-chosen data points.
    $(\bmu_k)_{k=1}^K\leftarrow \text{$K$ randomly-chosen points from } \bX$\\
    \DontPrintSemicolon
    \While{Not Converged}
    {
      % Assignment step: each point gets the label of its nearest mean.
      \For{$i\in \set{1,\ldots,N}$}
      {
        $z_i \leftarrow \argmin{k\in\set{1,\ldots,K}} 
        \ellTwoNorm{\bx_i-\bmu_k}^2$
      }
      % Update step: each mean becomes the average of the points assigned to it.
      % (The loop header iterates over all k; it previously read ``k=1 \in ...''.)
      \For{$k\in \set{1,\ldots,K}$}
      { 
        $n_k\leftarrow|\set{i:z_i=k}|$ \\ 
        $\bmu_k\leftarrow 
             \frac{\sum_{i:z_i=k}\bx_i}{n_k}$
      }
    }
        % \Comment{// see text for convergence criteria}
   \caption{Lloyd's K-Means Algorithm~\citep{Lloyd:PCM:1982:kmeans}}
   \label{alg:kmeans}
    \end{algorithm}
}
%%%% end contents of alg_kmeans.tex
%%%% start contents of alg_dpmeans.tex
{
    % Algorithm 2: DP-Means. The enclosing group is presumably there to keep
    % the \SetKwComment redefinition local to this float -- TODO confirm.
    % NOTE: the running text refers to this listing's printed line numbers
    % (``lines 5-10'', ``lines 7-10''), so do not add or remove pseudocode
    % lines without updating those references.
    \SetKwComment{Comment}{}{}
    \begin{algorithm}[ht]
    \KwIn{$\lambda$}
    \KwData{$\bX=(\bx_i)_{i=1}^N\subset \Rd$}
%           \KwOut{$K,(\bmu_k)_{k=1}^K$}
    % Initialization: a single cluster whose mean is the mean of all the data.
    $K\leftarrow 1$\\
    $\bmu_1\leftarrow  \frac{\sum_{i=1}^N\bx_i}{N}$\\
    $(z_i)_{i=1}^N\leftarrow 1$ \hfill \Comment{// init.~all labels to 1}
    \DontPrintSemicolon
    \While{Not Converged}
    {
      % Assignment step: nearest mean, unless even the nearest mean is farther
      % than lambda (in squared distance), in which case the point starts a new cluster.
      \For{$i\in \set{1,\ldots,N}$}
      {
       $z_i \leftarrow \argmin{k\in\set{1,\ldots,K}} 
        \ellTwoNorm{\bx_i-\bmu_k}^2$\\
        \If{$\ellTwoNorm{\bx_i-\bmu_{z_i}}^2>\lambda$}
        { % \Comment{// create a new cluster} 
          $K\leftarrow K+1$\\
          $\bmu_K\leftarrow \bx_i$ \\
          $z_i\leftarrow K$ 
        }  
      }
      % Update step: each mean becomes the average of the points assigned to it.
      % (The loop header iterates over all k; it previously read ``k=1 \in ...''.)
      \For{$k\in \set{1,\ldots,K}$}
      { 
        $n_k\leftarrow|\set{i:z_i=k}|$ \\ 
        $\bmu_k\leftarrow 
             \frac{\sum_{i:z_i=k}\bx_i}{n_k}$
      }
    }
  %  \Comment{// see text for convergence criteria}
   \caption{DP-Means~\citep{Kulis:ICML:2012:revisiting}}
   \label{alg:dpmeans}
    \end{algorithm}
}
%%%% end contents of alg_dpmeans.tex
%
 \textbf{The DP-Means algorithm. } There are obvious similarities between the K-Means algorithm (\autoref{alg:kmeans})
 and the DP-Means algorithm (\autoref{alg:dpmeans}). 
 However, while in the former $K$ is assumed to be known and is predefined, 
 in the latter $K$ evolves during the algorithm's run and depends on:
 1) the data; 2) $\lambda$; 3) the ordering in which one visits the observations.
 When assessing the convergence of~\autoref{alg:dpmeans}, 
 one can see that apart from the cluster creation (\ie, lines 7--10),
 it has the same guarantees as the classical K-Means (meaning, 
 every step in the algorithm cannot increase the cost, which is bounded below by zero). 
 When taking into account the addition of clusters, and examining the DP-Means cost function,
 it can be noted that adding a cluster is done only when all 
 the squared distances between some observation $\bx_i$ and each of the $K$ cluster means exceed
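  $\lambda$. Thus, the increase in the penalty term (\ie, $\lambda$, as $\lambda K$ becomes $\lambda (K+1)$) 
 is smaller than the squared distance between $\bx_i$ and the mean of any of the existing $K$ 
 clusters, whereas the squared distance between $\bx_i$ and the mean of its new cluster (\ie, $\bx_i$ itself) is zero. Hence, the creation of the cluster necessarily decreases the cost. 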
%%%%% end contents of background.tex



\section{METHOD}
In~\autoref{Sec:Opt} below, we explain why, unlike K-Means,
the original DP-Means algorithm~\citep{Kulis:ICML:2012:revisiting} does not lend itself to parallelization. In that section, we also describe two previous works that tried, with only partial success, to attack that problem. Next, in~\autoref{Sec:ProposedAlgs}, we discuss our solution and the resulting proposed algorithms. 

%%%% start contents of parallel.tex
\subsection{WHY SCALING DP-MEANS IS DIFFICULT}\label{Sec:Opt}
A natural question arises: 
can the original DP-Means algorithm~\citep{Kulis:ICML:2012:revisiting} be optimized and parallelized 
as easily as it was in the K-Means case? Unfortunately, the answer is negative, as we explain below. 

When attempting to create a parallel version of DP-Means,
the main obstacle is the cluster creation.
A naive solution would be to try to simply mimic the steps that are performed when parallelizing the K-Means algorithm: 
\ie, split the data into $P$ parts, and perform the iteration's main loop
(lines 5--10 in~\autoref{alg:dpmeans}) in parallel.
However, when the processing of each part is done, 
we will (usually) have multiple new clusters
and, with a high probability, many of them will overlap with each other.
This \textbf{over-clustering problem} has two major negative implications.
The first and most obvious one is that it harms the results of the clustering.
The second ramification of the over-clustering problem is a significant increase of the running time:
as the latter grows with $K$, 
redundant clusters translate directly into a longer running time.
Moreover, \textbf{a second problem} that stems from the same root cause
is that \textbf{the efficient computation presented in~\autoref{sec:background} 
for optimizing the distance calculations cannot be done as efficiently as in the 
K-Means case}: since $K$ (usually) grows during the main iteration, we 
 cannot precompute, before each iteration over the data, all of the $\ellTwoNorm{\bmu_k}^2$ and $\bx_i^T \bmu_k$ values.
Rather, we would need to compute these values, on the fly, separately for each new cluster.
Consequently, this will deprive us of the benefit of utilizing the maximal efficiency of BLAS. 

The remainder of this section reviews 
two smart existing parallel methods that address the first problem. However, neither of them addresses the second.

\textbf{P-DP-Means}, proposed  by~\cite{Pan:NeRIPS:2013:optimistic},
splits the data into several parts and then processes them in parallel. The core of that method is that when the calculation of each part is done, the new clusters are not immediately added to the existing clusters; rather, another subroutine, coined `DPValidate', is called. The additional subroutine consolidates the results from the different parts, and adds the new clusters one by one, as long as the new cluster is distanced by at least $\sqrt{\lambda}$ from all the current existing clusters. However, if a new cluster is not far enough from the existing clusters, then the subroutine will change all the relevant labels to the closest existing cluster.
That solution has many merits, and it can indeed reduce the running time drastically without harming the results. However, the `DPValidate' subroutine itself is serial and slow. 
 This is especially evident in the first iteration of the algorithm, where most of the clusters are added. For example, 
 consider~\autoref{fig:objective} which shows that 
 the first iteration of P-DP-Means is \emph{very} slow,
 though after that iteration it converges quite fast. 
 This is also due to the fact that once most of the clusters have been added in the first iteration, one can optimize the distance calculations for all existing clusters for the following iterations, considerably improving the speed of each such iteration.
 
 \textbf{DACE} was proposed by~\cite{Jiang:2017:dace}
 for clustering extremely-large sequence data in a specific application domain. However, DACE is also fairly easy to adapt to other types of data. \cite{Jiang:2017:dace} have approached the problem differently from how it was done in  P-DP-Means~\citep{Pan:NeRIPS:2013:optimistic}.
 Instead of separating the data into parts and consolidating the results after each iteration, they consolidate only once, running a standard DP-Means algorithm  separately on each one of the parts until convergence. Here, the core of the method lies with how the separation into parts is done. Unlike in P-DP-Means, where the partitioning is done using some set heuristic  (\eg, it could be random, or according to the data ordering),  DACE uses a locality-sensitive hashing  approach~\citep{Datar:geom:2004:locality} for partitioning the data  such that the different parts should have a minimal overlap with each other.  This approach has several benefits.  First and foremost,  it allows the parallelization of DP-Means across the data parts. Second,  recall that the runtime grows linearly with $K$.  When partitioning the data such that the clusters have a minimal overlap  with each other, each part tends to have a low number of clusters. This is  in contrast to P-DP-Means, where computations in each part must use all of the clusters.  This difference allows DACE a better speedup.  DACE, however, has two main drawbacks.  The first is that each run of DP-Means  has the same optimization problems we have described earlier. The second is that the final result is drastically affected by the initial partitioning and in many cases this leads to the degradation of the results. 
%%%% end contents of parallel.tex

%%%%% start contents of fig_clustering.tex
\begin{figure*}[t]
    \centering
    \newcommand{\MyHeight}{4.95cm}
    \subcaptionbox{Data and the Mean.\label{Fig:clust:data}}[0.245\linewidth]{\includegraphics[height=\MyHeight,trim=0.0cm 0.0cm 0.0cm 0.0cm]{over_clustering1.pdf}}
    \subcaptionbox{DP-Means: Bad Ordering\label{Fig:clust:bad-ordering}}[0.245\linewidth]{\includegraphics[height=\MyHeight,trim=0.0cm 0.0cm 0.0cm 0.0cm]{over_clustering2.pdf}}
    \subcaptionbox{DP-Means: Good Ordering\label{Fig:clust:good-ordering}}[0.245\linewidth]{\includegraphics[height=\MyHeight,trim=0.0cm 0.0cm 0.0cm 0.0cm]{over_clustering3.pdf}}
    \subcaptionbox{DC-DP-Means\label{Fig:clust:PDC-DP-Means}}[0.245\linewidth]{\includegraphics[height=\MyHeight,trim=0.0cm 0.0cm 0.0cm 0.0cm]{over_clustering4.pdf}}
    \captionsetup{justification=centering, singlelinecheck=false}
    \caption{DP-Means is more susceptible to over-clustering than PDC-DP-Means. 
    A large number of points are sampled from the bottom cluster (grey)
     while far fewer ones are sampled from the top cluster (green).
     Circles of radius $\sqrt{\lambda}$ 
     are drawn around the mean and the instantiated clusters. 
     Red dots mark the first observations visited in DP-Means.
    }
     \label{Fig:clustering}
\end{figure*}
%%%%% end contents of fig_clustering.tex

%%%% start contents of method.tex
\subsection{The Proposed Algorithms}\label{Sec:ProposedAlgs}
%%%% start contents of alg_dcdpmeans
{
    \SetKwComment{Comment}{}{}
    \begin{algorithm}[t]
    \KwIn{$\lambda$}
    \KwData{$\bX=(\bx_i)_{i=1}^N\subset \Rd$}
%           \KwOut{$K,(\bmu_k)_{k=1}^K$}
    $K\leftarrow 1$\\
    $\bmu_1\leftarrow  \frac{\sum_{i=1}^N\bx_i}{N}$\\
    $(z_i)_{i=1}^N\leftarrow 1$ \hfill \Comment{// init.~all labels to 1}
    \DontPrintSemicolon
    \While{Not Converged}
    {
      $j_{\mathrm{max}}\leftarrow -1$ \\
     $d_{\mathrm{max}}\leftarrow -1$ \\
      \For{$i\in \set{1,\ldots,N}$}
      {
       $z_i \leftarrow \argmin{k\in\set{1,\ldots,K}} 
        \ellTwoNorm{\bx_i-\bmu_k}^2$\\
        \If{$\ellTwoNorm{\bx_i-\bmu_{z_i}}^2>d_{\mathrm{max}}$}
        {
          $j_{\mathrm{max}}\leftarrow i$ \\
          $d_{\mathrm{max}}\leftarrow \ellTwoNorm{\bx_i-\bmu_{z_i}}^2$
        }

      }
      \If{$d_{\mathrm{max}}>\lambda$}
      { % \Comment{// create a new cluster} 
        $K\leftarrow K+1$\\
        $\bmu_K\leftarrow \bx_{j_{\mathrm{max}}}$ \\
        $z_{j_{\mathrm{max}}}\leftarrow K$ 
      }  
      \For{$k\in \set{1,\ldots,K}$}
      { 
        $n_k\leftarrow|\set{i:z_i=k}|$ \\ 
        $\bmu_k\leftarrow 
             \frac{\sum_{i:z_i=k}\bx_i}{n_k}$
      }
    }
  %  \Comment{// see text for convergence criteria}
   \caption{DC-DP-Means}\label{alg:dcdpmeans}
    \end{algorithm}
    }
%%%% end contents of alg_dcdpmeans

In~\autoref{Sec:Opt} we have identified that the main problem with scaling DP-Means is related to cluster creation.
This insight leads us to our first proposal:
\emph{deferring the cluster creation to the end of the assignment step.} 
% 
Concretely, when the squared distance between an observation
$\bx_i$ and the center of its nearest cluster exceeds $\lambda$,
instead of opening a new cluster, we save the index and distance of that observation in $j_{\mathrm{max}}$ and $d_{\mathrm{max}}$,  respectively.
We update $j_{\mathrm{max}}$ and $d_{\mathrm{max}}$ 
whenever we find another observation whose associated distance is larger. 
Only when the assignment step is complete,
and provided that there was at least one observation whose
squared distance (from its nearest cluster) exceeds $\lambda$,
do we open a new cluster. 
In which case, that cluster is initialized 
with the single point whose associated distance
was the maximal one across the entire dataset. 
Next, we continue to update the  means of all the existing clusters.
We refer to that algorithm, summarized in~\autoref{alg:dcdpmeans}, as \textbf{DC-DP-Means} (where DC stands for Delayed Cluster).  

\emph{While the delayed cluster creation might seem 
like a mild change, it has a remarkably profound threefold effect}:
1) it facilitates the usage of \emph{all} of the K-Means-related optimizations 
from~\autoref{sec:background};
2) DC-DP-Means's results are (trivially)
invariant to the ordering of the observations 
and this is in sharp contrast 
to the original DP-Means which might severely over-cluster the data due to an unfortunate ordering of the observations (see~\autoref{Fig:clustering});
3) while DC-DP-Means takes more iterations to converge (since at each iteration at most one cluster
can be formed), 
most of the iterations are \emph{much} faster (due to, \eg, the fewer clusters
and the availability of the aforementioned optimizations), 
resulting in significantly shorter running times.

 \subsubsection{Convergence Guarantees for DC-DP-Means} 
 The proposed change (from~\autoref{alg:dpmeans} to our proposed~\autoref{alg:dcdpmeans}) does not break the convergence guarantees of the original DP-Means.
 In the reassignment step, DC-DP-Means is identical to K-Means
 as no new clusters are created
  and the distance between a point and the assigned cluster cannot 
  exceed its pre-reassignment distance.
  During the cluster creation step, it is guaranteed that 
  $d_{\mathrm{max}}>\lambda$
  and thus the added penalty is smaller than $d_{\mathrm{max}}$, 
  which is the contribution of the associated observation to the cost function.
  Finally, during the update of the means,  
  the mean of the observations assigned to a cluster 
  minimizes the sum of squared distances between them and the cluster center.
  As for empirical evidence of convergence, see~\autoref{fig:objective}.

%%%% start contents of alg_pdcdpmeans.tex
{
    \SetKwComment{Comment}{}{}
    \begin{algorithm}[!t]
    \KwIn{$\lambda,P$}
%     \KwData{$\bX=(\bx_i)_{i=1}^N\subset \Rd$}
    \KwData{$\bX=\MATRIX{\bx_1 & \ldots & \bx_N}^T\in \RR^{N\times d}$}
%       \KwOut{$K,(\bmu_k)_{k=1}^K$}
      $K\leftarrow 1$\\
    $\bmu_1\leftarrow  \frac{\sum_{i=1}^N\bx_i}{N}$\\
    $(z_i)_{i=1}^N\leftarrow 1$ \hfill \Comment{// init.~all labels to 1}
%     $(\bX_p)_{p=1}^P \leftarrow $ Split $\bX$ into  $P$  parts\\
    $(\bX_p)_{p=1}^P \leftarrow $ Split $\bX$ into  $P$  parts\\
    $(N_p)_{p=1}^P \leftarrow  (\# \text{of points in }\bX_p)_{p=1}^P$    \hfill \Comment{// $\bX_p\in \RR^{N_p \times d}$}
    \ForPar{$p\in \set{1,\ldots,P}$}
    {$\bs_p \leftarrow (\ellTwoNorm{\bX_p[j,:]}^2)_{j=1}^{N_p}$
    \Comment{// $\bs_p\in \RR^{N_p}$}
    }
    \DontPrintSemicolon
    \While{Not Converged}
    {
      $\bM\leftarrow\MATRIX{\bmu_{1} & \ldots & \bmu_{K} }^T$
          \hfill \Comment{// $\bM\in \RR^{K \times d}$}
      % $\bmu\leftarrow \text{Stack of } \{\bmu_1,\ldots,\bmu_K\}$\\
      $ \bar{\bs}
      %=\MATRIX{\bar{s}_1 & \ldots & \bar{s}_K} 
      \leftarrow
      \left[\begin{smallmatrix}
       \ellTwoNorm{\bmu_1}^2& \ldots & \ellTwoNorm{\bmu_K}^2
      \end{smallmatrix}\right]$
      \hfill\Comment{// $\bar{\bs} \in \RR^K$}
      $(j_{\mathrm{max}}^p)_{p=1}^P\leftarrow -1$ 
       \hfill \Comment{// init.~argmax vals}
      $(d_{\mathrm{max}}^p)_{p=1}^P\leftarrow -1$ 
       \hfill \Comment{// init.~max vals}
      \ForPar{$p\in\{1,\ldots,P\}$}
      {
      $\bM_p=\MATRIX{\bmu_{1,p} & \ldots & \bmu_{K,p} }^T\leftarrow \bzero_{K\times d}$\\
%       $(\bmu_{k,p})_{k=1}^K \leftarrow \bzero_{d\times 1} $ \hfill \Comment{// init.~centers}
      $(n_{k,p})_{k=1}^K \leftarrow 0 $ 
      \hfill \Comment{// init.~counts} 
%         $\bY_p \leftarrow \bX_p \bM^T $ 
%         \hfill \Comment{// $\bY_p\in\RR^{N_p\times K}$}
%         $\bD_p \leftarrow -2\bY_p+\bar{\bs}$ 
%       \hfill \Comment{// $\bD_p\in\RR^{N_p\times K}$}
        $\bD_p \leftarrow -2 \bX_p \bM^T +
        \underbrace{\MATRIX{\bar{\bs}& \ldots &\bar{\bs} }}_{N_p \text{ copies of }\bar{\bs}}$ 
      \hfill \Comment{// $\bD_p\in\RR^{N_p\times K}$}
        $\bz_p \leftarrow \text{row-wise argmin } (\bD_p)$\\
        \For{$j\in \set{1,\ldots,N_p}$}
        {
      $  k\leftarrow \bz_p[j]$\\
        $\bM_p[k,:] \leftarrow \bM_p[k,:] + \bX_p[j,:]$
        \\
        %   $\bmu_{\bx_i^z} \leftarrow \bmu_{\bx_i^z}+\bx_i$\\
           $ n_{k,p} %_{\bx_i^z}
           \leftarrow  n_{k,p}+1$\\
           %\text{counts}_{\bx_i^z}^p+1$\\
          % \If{$\lambda < \bD_p[i,k]+s_{i,p} > d_{\mathrm{max}}^p $}
          \If{$\bD_p[j,k]+\bs_p[j] >\max(d_{\mathrm{max}}^p,\lambda ) $}
           {
              $d_{\mathrm{max}}^p\leftarrow \bD_p[j,k]+\bs_p[j]$\\
              $j_{\mathrm{max}}^p\leftarrow j$\\
           }
        }
      }
      $ p_{\mathrm{max}} \leftarrow \argmax{p} (d_{\mathrm{max}}^p)_{p=1}^P $\\
      $\left[\begin{smallmatrix}n_1 & \ldots & n_K\end{smallmatrix}\right] \leftarrow 
        \left[\begin{smallmatrix} \sum_{p=1}^P n_{1,p} & \ldots & \sum_{p=1}^P n_{K,p}\end{smallmatrix}\right]
    $
     % \MATRIX{ \sum_{p=1}^P n_{1,p}  & \ldots & \sum_{p=1}^P n_{K,p}  }
      \\
      $\bM\leftarrow \sum_{p=1}^P \bM_p$\\
    %   $\bmu\text{\textbf{-sum}} \leftarrow sum(\{\bmu^p:p\in\{1,\ldots,P\}\})$\\
      \If{ $d_{\mathrm{max}}^{p_{\mathrm{max}}} \neq -1$}
      {
        %\Comment{// Create a cluster}
        $j\leftarrow j_{\mathrm{max}}^{p_\mathrm{max}}$ \\ %\bI_{\mathrm{max}}[p_{\mathrm{max}}]$\\
        $k \leftarrow \bz_{p_\mathrm{max}}[j]$\\
        $\bM[k,:]\leftarrow \bM[k,:]-\bX_{p_\mathrm{max}}[j,:]$\\
        $n_k\leftarrow n_k-1$\\
        $K\leftarrow K+1$\\
        $\bz_{p_\mathrm{max}}[j]\leftarrow K$\\
        $\bM[K,:]\leftarrow \bX_{p_\mathrm{max}}[j,:]$\\
        $n_K\leftarrow 1$\\
      }
      \ForPar{$k\in\set{1,\ldots,K}$}
      {
        $\bmu_k\leftarrow \bM[k,:]/n_k$
      }
    }
            % \Comment{// see text for convergence criteria}
   \caption{PDC-DP-Means}\label{alg:pdcdpmeans}
    \end{algorithm}
    }
%%%% end contents of alg_pdcdpmeans.tex

  \subsubsection{PDC-DP-Means}
  Importantly, and by design, DC-DP-Means naturally lends itself to parallelization.  It follows that combining the K-Means-related optimization and parallelization steps (described in~\autoref{sec:background}) 
 together with our proposed delayed cluster creation
 lets us propose our first \emph{parallel} algorithm,
 called PDC-DP-Means  and summarized in~\autoref{alg:pdcdpmeans}.
%
 While at first sight~\autoref{alg:pdcdpmeans}
 seems longer and more complex than either~\autoref{alg:dpmeans}
 or~\autoref{alg:dcdpmeans}, 
 this is mostly because of the added optimizations we have implemented
 (and that were enabled by our delayed cluster creation).
 In other words, while the \emph{parallelizable}~\autoref{alg:dcdpmeans} captures the key conceptual details that allow for massive parallelization and optimizations, 
 the \emph{parallel (and optimized)}~\autoref{alg:pdcdpmeans} 
 also contains the needed technical/engineering details. 
  We now explain the details.  
 
  In \textit{lines 4-5}, 
 we split the data into $P$ parts (where $P$ is the number of available computer processes),
 where $\bX_p$ denotes the observations in part $p$.
 Throughout the algorithm, 
 we use $j$ as the running index within a part
   while $i$ is used as the running index within the entire dataset. 
  In \textit{lines 6-7} we pre-calculate
  $\bs_p$, which stands for the squared norms of the points, 
  for each of the $P$  parts. This $\bs_p$ will be used later
  for comparing distances.
  In each of the iterations of the \textit{while} loop (\textit{line 8}) 
   we initially calculate the squared norms $\bar{\bs}$ of the cluster centers
   (to be used by all the $P$ processes), 
   and initialize auxiliary variables 
   $(j_{\mathrm{max}}^p)_{p=1}^P,(d_{\mathrm{max}}^p)_{p=1}^P$ 
   to hold the index and distance of the observation with the maximal distance 
   (from the existing clusters) within each part.

   We then process the data in parallel, 
   where for each part $p$ of the data we initialize several auxiliary variables:
  $\bM_p$, for storing the sum of the observations (in part $p$) 
  assigned to each cluster, and
  $(n_{k,p})_{k=1}^K$, for storing the number of observations (in part $p$)
  assigned to each cluster.
  In $\bD_p$ we store the computed distances between the observations 
  in $\bX_p$ and each of the $K$ centers. 
  Taking the argmin for each row of $\bD_p$, 
  we then procure the labels for the data in $\bX_p$.
  
  Iterating over the observations, in \textit{lines 18-24} we update the 
  contribution of each observation to the cluster centers,
  and update both $\bM_p$ and $(n_{k,p})_{k=1}^K$ accordingly. 
  While doing so we also check for the maximal distance between an observation 
  and its assigned center in this data part,
  updating $j_{\mathrm{max}}^p$ and $d_{\mathrm{max}}^p$ accordingly.
  Finally, in \textit{lines 28-36} we create a new cluster if necessary,
 using the farthest observation across all the $P$ data parts, 
 but only provided that the squared distance exceeds $\lambda$. 
We then aggregate the results from the parts and use them to update the cluster centers.

 To declare convergence, we may choose one of three possible criteria:
 1) no label switching between iterations;
 2) measuring the difference in the cost function values between two iterations,
  and declaring convergence if it is smaller than some pre-defined tolerance value;
 3) measuring the distances between the centers in two consecutive iterations,
  and declaring convergence if they are smaller than some pre-defined tolerance value.
  Criterion 1 is the strongest, and usually takes longer to achieve than the others.
  While criteria 1 and 3 cannot be met if a new cluster has been introduced 
  in the latest iteration,
  criterion 2 can be met even if there is a cluster which existed only in the last iteration.
  

 \subsubsection{DACE-PDC-DP-Means} Recall that a key property in DACE is 
 that the vanilla DP-Means is used as a subroutine: 
the data is partitioned according to the desired parallelism, 
and the DP-Means subroutine is used independently on each of the parts.
We now propose replacing that subroutine with our proposed PDC-DP-Means.
This has several benefits over the original DACE.
 First, we utilize all the pros of using PDC-DP-Means,
 which include  both parallelism and, which is more important in this case, 
 optimizations of the different calculations.
 Second, this overcomes the main drawback of PDC-DP-Means, 
 where the delayed cluster creation can slow down convergence on datasets presenting 
 a very high number of clusters.
Splitting the calculation into several parts (especially if
there is little or no overlap of clusters between the parts -- as is the case with DACE), 
solves this problem. Moreover, it also gives the user 
control over the pace at which $K$ grows. That is, 
the user can choose between maximal data-partition parallelism for an increased cluster-creation speed
and a coarser data partition that allows 
the PDC-DP-Means subroutine to employ additional parallelism 
within each part.

\subsubsection{MiniBatch PDC-DP-Means}
The delayed creation also lets us extend PDC-DP-Means 
to a Mini-Batch setting. 
The transition is  similar to the transition from K-Means to Mini-Batch K-Means~\citep{Sculley:ICWWW:2010:batch-kmeans}.
Instead of evaluating the entire dataset at once, 
we randomly sample a subset (called a mini-batch) $\bX_b$ of size $b$ from the dataset 
and run a PDC-DP-Means iteration on it, \emph{without updating the centers}.
We parallelize the processing of $\bX_b$ across the available cores. 
In each batch $\bX_b$ we cache both the index and the distance 
of the most distant observation $\bx_j$,
and if that observation is at a distance of at least $\sqrt{\lambda}$ from its nearest cluster,
we instantiate a new cluster, centered at $\bx_j$.
Unlike in PDC-DP-Means, however, here we do not recalculate the cluster centers in 
each iteration; rather, we take a step towards the 
observations assigned to the cluster, using the following (gradient-based) formula,
\begin{align}
\bmu_k \leftarrow \left(1-\tfrac{1}{ n_k }\right)\bmu_k + \tfrac{1}{ n_k }\bx_j ,
\end{align}
where $\bmu_k$ is the current cluster center,
$\bx_j$ is the new observation assigned to cluster $k$,
and $ n_k $ is the total number of observations assigned to cluster $k$,
including $\bx_j$.
We present the full algorithm in~\autoref{alg:mb-pdcdpmeans}.
To determine convergence we need to modify some of the aforementioned criteria.
While the criterion regarding the distance between the centers can remain the same
(though the \emph{tolerance} value needs to be adjusted),
the other two are no longer applicable as they require the entire dataset to be processed.
Instead (and as is done in MiniBatch K-Means), we can evaluate these two criteria on a pre-defined validation set.

\textbf{Online Setting. } As in the original MiniBatch K-Means, our algorithm 
also supports an online setting,
where the main iteration (\emph{lines 6-33}) is executed not on some sample from the dataset,
but on the currently available data. 
When new data arrives, we process it in the main iteration.
Thus, there is no need to store the previously-seen data at any point in time.

%%%% end contents of method.tex

%%%% start contents of tab_clustering_results.tex
\begin{table*}[t]
    \setlength\tabcolsep{6pt}
    \caption{Comparing running time and NMI of different algorithms on various datasets. Our proposed methods uniformly have better results than the other DP-Means variants, and in most cases, better than the K-Means variants as well.
    Note that the parametric methods (marked by $^\dagger$), which had to be given the true $K$ so they had an unfair advantage, are included here only for completeness. 
    The important comparison, however, is between the nonparametric ones.
    }
    \resizebox{1.0\textwidth}{!}{
    \begin{tabular}{@{}lllllllllll@{}}
    \multicolumn{1}{l|}{Dataset}                                                                                    & \multicolumn{2}{c}{2D Gaussian} & \multicolumn{2}{c}{10D Gaussian} & \multicolumn{2}{c}{MNIST}      & \multicolumn{2}{c}{ImageNet100} & \multicolumn{2}{c}{ImageNet1K} \\ \midrule
    \multicolumn{1}{l|}{Method}                                                                & \multicolumn{1}{c}{NMI} & \multicolumn{1}{c}{Time [sec]} & \multicolumn{1}{c}{NMI} & \multicolumn{1}{c}{Time [sec]} & \multicolumn{1}{c}{NMI}     & \multicolumn{1}{c}{Time [sec]} & \multicolumn{1}{c}{NMI} & \multicolumn{1}{c}{Time [sec]} & \multicolumn{1}{c}{NMI} & \multicolumn{1}{c}{Time [sec]} \\ \hline
    \multicolumn{1}{l|}{K-Means$^\dagger$}                                                               &  $.872\pm.002$ &  $1.47\pm0.01$           &  $.634\pm.003$ &  $1.35\pm0.0$   &$.492\pm.005$&$0.12\pm0.00$     &$.770\pm.001$&$1.53\pm0.06$      &  $.736\pm.000$ &  $198\pm17$   \\
    %\multicolumn{1}{l|}{\begin{tabular}[c]{@{}l@{}} \\  
    \multicolumn{1}{l|}{MiniBatch K-Means$^\dagger$}
    %\end{tabular
    %}}
    &  $.875\pm.004$ & $0.27\pm0.11$            &  $.632\pm.013$ & $0.04\pm0.02$   &$.451\pm.025$&$0.15\pm0.04$     &$.762\pm.002$&$0.29\pm0.00$     &  $.727\pm.000$ & $4.97\pm0.28$    \\ \midrule
    \multicolumn{1}{l|}{DP-Means}                                                              &  $.883\pm.002$ &  $865\pm9$               &  $.666\pm.001$ &  $459\pm63$     &$.534\pm.001$&$204\pm 57$       &$.765\pm.001$&$205\pm87$         &  N/A &  N/A     \\
    \multicolumn{1}{l|}{DACE}                                                                  &  $.890\pm.003$ &  $35.4\pm6.5$            & $.648\pm.003$  &  $9.92\pm1.19$  &$.506\pm.003$&$4.86\pm0.64$     &$.730\pm.003$&$34.5\pm3.9$      & $.720\pm.002$  &  $8501\pm613$  \\
    \multicolumn{1}{l|}{P-DP-Means}                                                            &  $.884\pm.002$ &  $117\pm1$               & $.686\pm.007$  &  $37.1\pm7.33$  &$.532\pm.000$&$17.5\pm0.8$     &$.765\pm.001$&$8.53\pm0.65$     & $.729\pm.000$  &  $424\pm24$   \\
    \multicolumn{1}{l|}{PDC-DP-Means (Ours)}                                                     &  $\mathbf{.891\pm.006}$ &  $3.55\pm0.25$  & $\mathbf{.713\pm.000}$  &  $10.8\pm1$  &$\mathbf{.540\pm.002}$&$0.96\pm0.03$     &$\mathbf{.767\pm.000}$&$2.47\pm0.32$     & $\mathbf{.734\pm.000}$  &  $1232\pm66$   \\
    \multicolumn{1}{l|}{\begin{tabular}[c]{@{}l@{}}DACE - \\ PDC-DP-Means (Ours)\end{tabular}} &  $.888\pm.006$   &$9.96\pm2.65$             &  $.663\pm.012$ &$2.38\pm0.12$    &$.498\pm.001$&$0.51\pm0.01$     &$.749\pm.003$&$3.73\pm0.20$     &  $.731\pm.005$ &$123\pm17$  \\
    \multicolumn{1}{l|}{\begin{tabular}[c]{@{}l@{}}MiniBatch\\ PDC-DP-Means (Ours)\end{tabular}} &  $.882\pm.010$ &$\mathbf{1.07\pm0.16}$    &  $.645\pm.017$ &$\mathbf{0.33\pm0.01}$    &$.501\pm.004$&$\mathbf{0.43\pm0.15}$     &$.758\pm.006$&$\mathbf{0.39\pm0.23}$     &  $.728\pm.000$ &$\mathbf{12.9\pm1.07}$ \\   \bottomrule
    %\multicolumn{11}{l}{\small $^\dagger$ Parametric methods. 
    %%These methods (which had to be given the true $K$ so they had %an unfair advantage) are included here for completeness. 
    %%The important comparison, however, is between the nonparametric %ones.
    %} \\
\end{tabular}
    \label{tab:main-results}    
    }
    \end{table*}
%%%% end contents of tab_clustering_results.tex


%%% start contents of results.tex
\section{EXPERIMENTS AND RESULTS}\label{Sec:Results}
%%%% start contents of alg_mb_pdcdpmeans.tex
{
    \SetKwComment{Comment}{}{}
    \begin{algorithm}[!ht]
    \KwIn{$\lambda,P,b$}
    % Hyperparams. }
    \KwData{$\bX=\MATRIX{\bx_1 & \ldots & \bx_N}^T\in \RR^{N\times d}$}
%      \KwOut{$K,(\bmu_k)_{k=1}^K$}
    $K\leftarrow 1$\\
    $\bmu_1\leftarrow \text{Random point from }\bX$\\
    $n_1\leftarrow 1$\\
    % $X_1,\ldots,X_p\leftarrow \text{Partition } \bX\text{ to } P \text{ parts}$\\
    \DontPrintSemicolon
    \While{Not Converged}
    {
      $\bX^b\leftarrow b \text{ random points from }\bX$\\
      $(\bX_p)_{p=1}^P \leftarrow $ Split $\bX^b$ into  $P$  parts\\
      $(N_p)_{p=1}^P \leftarrow  (\# \text{of pts in }\bX_p)_{p=1}^P$  \Comment{//\hspace*{-0.05cm}$\bX_p\in \RR^{N_p \times d}$}
      \ForPar{$p\in \set{1,\ldots,P}$}
      {$\bs_p \leftarrow (\ellTwoNorm{\bX_p[j,:]}^2)_{j=1}^{N_p}$
      \Comment{// $\bs_p\in \RR^{N_p}$}
      }
      % $\bmu\leftarrow \text{Stack of } \{\mu_1,\ldots,\mu_K\}$\\
      $\bM\leftarrow\MATRIX{\bmu_{1} & \ldots & \bmu_{K} }^T$
      \hfill \Comment{// $\bM\in \RR^{K \times d}$}
      $ \bar{\bs}
      %=\MATRIX{\bar{s}_1 & \ldots & \bar{s}_K} 
      \leftarrow
      \left[\begin{smallmatrix}
       \ellTwoNorm{\bmu_1}^2& \ldots & \ellTwoNorm{\bmu_K}^2
      \end{smallmatrix}\right]$
      \hfill\Comment{// $\bar{\bs} \in \RR^K$}
      $(j_{\mathrm{max}}^p)_{p=1}^P\leftarrow -1$ 
      \hfill \Comment{// init.~argmax vals}
     $(d_{\mathrm{max}}^p)_{p=1}^P\leftarrow -1$ 
      \hfill \Comment{// init.~max vals}
      \ForPar{$p\in\{1,\ldots,P\}$}
      {
        $\bD_p \leftarrow -2 \bX_p \bM^T +\bar{\bs}$ 
      \hfill \Comment{// $\bD_p\in\RR^{N_p\times K}$}
        $\bz_p \leftarrow \text{row-wise argmin } (\bD_p)$\\
        \For{$j\in \set{1,\ldots,N_p}$}
        {
          \If{$\bD_p[j,\bz_p[j]]+\bs_p[j] >\max(d_{\mathrm{max}}^p,\lambda ) $}
           {
              $d_{\mathrm{max}}^p\leftarrow \bD_p[j,\bz_p[j]]+\bs_p[j]$\\
              $j_{\mathrm{max}}^p\leftarrow j$\\
           }
        }
      }      
      $ p_{\mathrm{max}} \leftarrow \argmax{p} (d_{\mathrm{max}}^p)_{p=1}^P $\\
      \If{ $d_{\mathrm{max}}^{p_{\mathrm{max}}} \neq -1$}
      {
%         \Comment{// Create a cluster}
        $j\leftarrow j_{\mathrm{max}}^{p_\mathrm{max}}$ \\ %\bI_{\mathrm{max}}[p_{\mathrm{max}}]$\\
        $K\leftarrow K+1$\\
        $\bz_{p_\mathrm{max}}[j]\leftarrow K$\\
        $n_K\leftarrow 0$\\
        $\bmu_K\leftarrow \bX_{p_\mathrm{max}}[j,:]$\\
      }
      $\bz\leftarrow (\bz_1,\ldots,\bz_P)$\\

      \ForPar{$k\in \set{1,\ldots,K}$}
      {
        $X^b_k\leftarrow \{\bx_j\in\bX^b:\bz[j] = k\}$\\
        \For{$\bx_j\in X^b_k$}
        {
          $n_k\leftarrow n_k+1$\\
          $\bmu_k\leftarrow (1-\frac{1}{n_k})\bmu_k+\frac{\bx_j}{n_k}$\\
        }
      }      
    }
   \caption{MiniBatch PDC-DP-Means}\label{alg:mb-pdcdpmeans}
    \end{algorithm}
    }
%%%% end contents of alg_mb_pdcdpmeans.tex



To validate the utility of our methods, we have compared them with various methods in different settings.
All of our experiments were run on an Ubuntu 20.04 machine with 64GB RAM and Intel® Core™ i9-11900K Processor.


\textbf{Methods and Implementations. } The implementation of our proposed methods is in Python and Cython,
and we have integrated it within the code base of 
Scikit-learn~\citep{Pedregosa:JMLR:2011:scikit-learn}. 
When writing the implementation we had both efficiency and accessibility in mind.
In particular, we exploit Scikit-learn's efficient codebase, 
while making our code an easy ``drop-in replacement''. That is,
a user that previously used Scikit-learn's K-Means or MiniBatch K-Means,
can now simply change to our code using the same interface
(except that instead of passing $K$ as a parameter the user will pass $\lambda$).
For the vanilla DP-Means we have used the publicly-available \emph{R} package \textit{`maotai'}.
As the public implementation of DACE~\citep{Jiang:2017:dace} is aimed towards RNA sequence data,
we have created our version of it which is more general and can handle any data type. 
Our pure DACE version uses the aforementioned `maotai' package for the DP-Means subroutine, 
while in our DACE-PDC-DP-Means version we have simply changed the subroutine to our proposed PDC-DP-Means.
 For P-DP-Means~\citep{Pan:NeRIPS:2013:optimistic} there is no publicly-available
 implementation, so we have created our own efficient implementation of it,
  written in Python and utilizing Scikit-learn's efficient Cython subroutines.
  Finally, for K-Means and MiniBatch K-Means, 
  we have used the available optimized Scikit-learn implementations.

\textbf{Datasets. } We have used several datasets: 
a synthetic 2D Dataset, with $N=10^6$ points, sampled from a 50-component Gaussian Mixture Model (GMM); 
a synthetic 10D Dataset, with $N=10^5$ points, sampled from a 20-component GMM;
MNIST~\citep{Lecun:1998:mnist} handwritten digits dataset with dimensionality reduced to $16$ using Principal Component Analysis (PCA);
ImageNet100~\citep{Deng:CVPR:2009:imagenet}
(a subset of the entire ImageNet dataset), which consists of 125K images that belong to 100 classes 
from the entire ImageNet, where we also used 
SWAV~\citep{Caron:NIPS:2020:unsupervised} to extract features from the images, followed by PCA 
to reduce the dimensionality of the features to 64;
ImageNet1K~\citep{Deng:CVPR:2009:imagenet}, which is the full ILSVRC2012 training set, 
containing 1.2M images from 1000 classes and where we again used
SWAV followed by PCA to reduce the dimensionality, this time to 128.
We emphasize that the dimensionality reduction was done mostly for the benefit of the other methods
(our methods, which scale better, can handle higher dimensions). 

%%%% start contents of tab_np_timing.tex
\begin{table*}[t]
    \caption{Comparison with nonparametric clustering algorithms}
    \setlength\tabcolsep{6pt}
    \resizebox{\columnwidth*2}{!}{
    \begin{tabular}{lllllll}
    Dataset                                                                                 & \multicolumn{2}{c}{2D Gaussian}          & \multicolumn{2}{c}{MNIST}            & \multicolumn{2}{c}{ImageNet100}       \\ \hline
    \multicolumn{1}{l|}{Method}                                                             & NMI            & Time [sec]              &  NMI        &   Time  [sec]         &  NMI        &   Time [sec]          \\ \hline
    \multicolumn{1}{l|}{DBSCAN}                                                             & $.69\pm.00$   & $0.58\pm0.03$           &$.35\pm.00$ & $0.96\pm0.05$          &$.580\pm.00$ & $94\pm14$               \\
    \multicolumn{1}{l|}{MeanShift}                                                          & $.74\pm.00$   & $125\pm2.0$             &$.43\pm.00$ & $781\pm11$             &   N/A       & N/A                     \\
    \multicolumn{1}{l|}{\begin{tabular}[c]{@{}l@{}}Agglomerative Clustering\end{tabular}} & $.82\pm.00$   & $37.7\pm0.3$            &$.50\pm.00$ & $74.4\pm1.9$           &   N/A       & N/A                     \\
    \multicolumn{1}{l|}{OPTICS}                                                             & $.75\pm.00$   & $27.2\pm0.21$           &$.02\pm.00$ & $49.3\pm0.46$          &   N/A       & N/A                     \\
    \multicolumn{1}{l|}{PDC-DP-Means (Ours)}                                                       & $\mathbf{.83\pm.01}$   & $0.07\pm0.00$    &$\mathbf{.51\pm.00}$ & $0.84\pm0.26$ &$\mathbf{.76\pm.00}$ & $1.9\pm0.29$   \\
    \multicolumn{1}{l|}{\begin{tabular}[c]{@{}l@{}}MiniBatch PDC-DP-Means (Ours) \end{tabular}}   & $.82\pm.03$   & $\mathbf{0.04\pm0.02}$  &$.45\pm.01$ & $\mathbf{0.19\pm0.04}$ &$.74\pm.00$ & $\mathbf{0.52\pm0.00}$  \\ \hline
    \end{tabular}
    }
    \label{tab:nonparmetric-results}
\end{table*}
%%%% end contents of tab_np_timing.tex

\textbf{Evaluation. }We have split the data into Train-Validation-Test sets,
in proportions of $0.9,0.02,0.08$, respectively.
In order to evaluate the results of the clustering, we have used a model-independent metric,
the
Normalized Mutual Information (NMI) score.
While all DP-Means variants share the same cost function,
given the same $\lambda$, the expected clustering results differ by a lot:
DACE and MiniBatch PDC-DP-Means will usually output a higher number of clusters 
than DP-Means and P-DP-Means,
while the latter two usually output a higher number of clusters than PDC-DP-Means.
As such, we have optimized the $\lambda$ value independently for each of the models,
using the validation set, setting NMI as the target function for the optimization and using~\citep{Knysh:arxiv:2016:blackbox} as the optimizer.
The full results for this setting appear in~\autoref{tab:main-results}.
From the table, it is observable that PDC-DP-Means
outperforms all DP-Means variants in terms of NMI  
and that in most cases it outperforms even the parametric methods, 
which had to be given the true $K$.
In terms of running time, our MiniBatch PDC-DP-Means is always the fastest DP-Means-related method,
usually by a very large margin, despite having only a slight reduction 
in the quality of the results.
Also, in almost all cases it outperforms the only-slightly faster MiniBatch K-Means, 
in terms of NMI score.
An interesting observation is that in the ImageNet1K case (where the true $K$ is high: 1000), 
our proposed PDC-DP-Means is slower than P-DP-Means.
This is the only case where the delayed cluster creation harms the running time
due to the large number of clusters. P-DP-Means instantiates most of the clusters 
in the first few iterations, and this enables it to optimize 
the distance calculations for most of the clusters very early.
However, both our MiniBatch PDC-DP-Means and our DACE-PDC-DP-Means do not suffer from the large 
$K$: while both use delayed cluster creation, the MiniBatch PDC-DP-Means does so 
after every MiniBatch, 
while DACE-PDC-DP-Means splits the data into parts 
and thus can create multiple clusters at the same time.
As is evident from the results, both methods converge much faster 
than PDC-DP-Means, with a similar quality of results.

\textbf{Comparing with Nonparametric Algorithms.}
So far we have focused on comparisons with either DP-Means, K-Means, or their variants.
However, there are other nonparametric clustering algorithms 
(with existing efficient implementations) that are unrelated to DP-Means.
In particular, we have compared with the following popular algorithms:
DBSCAN~\citep{Ester:kdd:1996:dbscan}, MeanShift~\citep{Comaniciu:TPAMI:2002:mean},
Agglomerative Clustering~\citep{Maimon:book:2005data} and OPTICS~\citep{Ankerst:1999:optics}.
All the above implementations are available in 
Scikit-learn~\citep{Pedregosa:JMLR:2011:scikit-learn}.
To compare with those algorithms, we have used smaller versions of the 2D GMM and MNIST datasets, 
the former with only $50K$ observations sampled from 20 2D Gaussians, 
while the latter is the MNIST train set with dimensionality reduced to 8 via PCA. 
In addition to those 2 datasets,
we have used the previously-discussed ImageNet100 for comparing with DBSCAN, the only other nonparametric method which could scale to a dataset of such a size.
We note that while our method (and some of the other methods) 
can gracefully handle very large datasets, 
MeanShift's~\citep{Comaniciu:TPAMI:2002:mean} runtime and 
Agglomerative Clustering's~\citep{Maimon:book:2005data} memory consumption
make them impractical for large datasets.
\autoref{tab:nonparmetric-results} summarizes the results,
showing that our approach not only outperforms the others in terms of clustering results but also does it in a fraction of the time.
Note that for each method, we have optimized its parameters using black-box optimization~\citep{Knysh:arxiv:2016:blackbox} on the data test set. This is in contrast to the previous experiment where we have used the validation set. Here, however, some algorithms (\eg, DBSCAN) cannot predict the labels of new samples. Thus, we have evaluated the performance on the clustering results of the train set
 (hence the discrepancy between the ImageNet100 results here of PDC-DP-Means and MiniBatch PDC-DP-Means and their counterparts in~\autoref{tab:main-results}).

%%% end contents of results.tex

%%%% start contents of conclusion.tex
\section{CONCLUSION}\label{Sec:Conclusion}
In this paper we have focused on the practical aspects of parallelizing DP-Means. We have examined previous attempts at that goal
and proposed several algorithms which are parallel, highly efficient, and usually achieve better clustering results than their counterparts.
%
Our main contribution, the PDC-DP-Means, has one key limitation: as the number of clusters can only increase by one in each iteration, in data with a large $K$ (\eg, ImageNet1K), this can lead to a large number of iterations.
However, our other proposed algorithms, DACE-PDC-DP-Means and MiniBatch PDC-DP-Means, offer a remedy in those cases as the number of clusters can increase very fast, as is evident from our results. To summarize, our recommendation is to use PDC-DP-Means for datasets where one may expect to find only a moderate $K$. This will usually yield the best results. If $K$ is expected to be large, using either DACE-PDC-DP-Means or MiniBatch PDC-DP-Means is preferred (despite the mild drop in the quality of the results) due to the major speedups.
%%%% end contents of conclusion.tex


\begin{acknowledgements} 
  This work was supported by the Lynn
and William Frankel Center at BGU CS, by the Israeli Council for
Higher Education via the BGU Data Science Research Center, and
by Israel Science Foundation Personal Grant \#360/21. O.D. was
also funded by the Jabotinsky Scholarship from Israel's Ministry
of Technology and Science, and by BGU's Hi-Tech Scholarship.
\end{acknowledgements}


%\clearpage
% \small
% \bibliographystyle{ieee}
%\bibliographystyle{ieee_fullname}
% \bibliographystyle{abbrvnat}
\bibliography{refs}
%\bibliography{my_refs}


\end{document}
