% \documentclass{uai2022} % for initial submission
\documentclass[accepted]{uai2022} % after acceptance, for a revised
                                    % version; also before submission to
                                    % see how the non-anonymous paper
                                    % would look like
%% There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2022} % ptmx math instead of Computer
                                         % Modern (has noticable issues)
% \documentclass[mathfont=newtx]{uai2022} % newtx fonts (improves upon
                                          % ptmx; less tested, no support)
% NOTE: Only keep *one* line above as appropriate, as it will be replaced
%       automatically for papers to be published. Do not make any other
%       change above this note for an accepted version.

%% Choose your variant of English; be consistent
\usepackage[american]{babel}
% \usepackage[british]{babel}

%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} % amsmath with fixes and additions
% \usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{tikz} % nice language for creating drawings and diagrams

%% Provided macros
% \smaller: Because the class footnote size is essentially LaTeX's \small,
%           redefining \footnotesize, we provide the original \footnotesize
%           using this macro.
%           (Use only sparingly, e.g., in drawings, as it is quite small.)

% Self-defined macros
% \newcommand{\swap}[3][-]{#3#1#2} % just an example
\usepackage{caption}
\usepackage{subcaption}
\usepackage{hyperref}
\usepackage{url}


\usepackage{algorithm}
\usepackage{algorithmic}
\usepackage{amsthm}
\usepackage{amsmath}
\usepackage{amsfonts}

% new imports jack
\usepackage{booktabs}
\usepackage{multirow}
\usepackage{graphicx}
\usepackage{wrapfig}

\def\ie{\textit{i.e.}}
\def\eg{\textit{e.g.}}

\newcommand{\fix}{\marginpar{FIX}}
\newcommand{\new}{\marginpar{NEW}}


\newcommand{\mx}{\mathbf{x}}
\newcommand{\mX}{\mathbf{X}}
\newcommand{\my}{\mathbf{y}}
\newcommand{\mY}{\mathbf{Y}}
\newcommand{\mF}{\mathbf{F}}
\newcommand{\mf}{\mathbf{f}}
\newcommand{\mz}{\mathbf{z}}
\newcommand{\mZ}{\mathbf{Z}}
\newcommand{\mU}{\mathbf{u}}
\newcommand{\mI}{\mathbf{I}}
\newcommand{\mm}{\mathbf{m}}
\newcommand{\mS}{\mathbf{S}}
\newcommand{\md}{\mathbf{d}}

\newcommand{\mD}{\mathcal{D}}
\newcommand{\mN}{\mathcal{N}}
\newcommand{\mL}{\mathcal{L}}

\newcommand{\bbR}{\mathbb{R}}
\newcommand{\intd}{\boldsymbol{\mathrm{d}}}

\newcommand{\mmu}{\boldsymbol{\mu}}
\newcommand{\mnu}{\boldsymbol{\nu}}
\newcommand{\mSig}{\boldsymbol{\Sigma}}
\newcommand{\mLambda}{\boldsymbol{\Lambda}}
\newcommand{\mTheta}{\boldsymbol{\Theta}}
\newcommand{\mtheta}{\boldsymbol{\theta}}
\newcommand{\mphi}{\boldsymbol{\phi}}
\newcommand{\mPhi}{\boldsymbol{\Phi}}
\newcommand{\mpi}{\boldsymbol{\pi}}
\newcommand{\mc}{\mathbf{c}}
\newcommand{\mH}{\mathbf{H}}
\newcommand{\mW}{\mathbf{W}}

\newtheorem{definition}{Definition}
\newtheorem{theorem}{Theorem}
\newtheorem{proposition}{Proposition}
\newtheorem{lemma}{Lemma}
\newtheorem{corollary}{Corollary}


\title{Deep Dirichlet Process Mixture Models}

% The standard author block has changed for UAI 2022 to provide
% more space for long author lists and allow for complex affiliations
%
% All author information is authomatically removed by the class for the
% anonymous submission version of your paper, so you can already add your
% information below.
%
% Add authors
% \href{mailto:<>
\author[1]{\href{mailto:<lnq18@mails.tsinghua.edu.cn>?Subject=DDPM}{Naiqi Li\thanks{Equal contributions.}}{}}
\author[1]{Wenjie Li$^{*}$}
\author[1,2]{Yong Jiang}
\author[1,2]{Shu-Tao Xia}
% Add affiliations after the authors
\affil[1]{%
    Tsinghua Shenzhen International Graduate School, Tsinghua University, China
}
\affil[2]{%
    Peng Cheng Laboratory, Shenzhen, China
}
  
  \begin{document}
\maketitle

\begin{abstract}
In this paper we propose the deep Dirichlet process mixture (DDPM) model, which is an unsupervised method that simultaneously performs clustering and feature learning. 
The traditional Dirichlet process mixture model can infer the number of mixture components, but its flexibility is restricted since the clustering is performed in the raw feature space. 
Our method alleviates this limitation by using the flow-based deep neural network to learn more expressive features. 
DDPM unifies Dirichlet processes and the flow-based model with Monte Carlo expectation-maximization, and uses Gibbs sampling to sample from the posterior. 
This combination allows our method to exploit the mutually beneficial relation between clustering and feature learning. 
The effectiveness of DDPM is demonstrated by thorough experiments in various synthetic and real-world datasets.
\end{abstract}



\begin{figure*}
    \centering
    \includegraphics[width=0.19\linewidth]{imgs/fig_gau/cl_gt.pdf}\hfill
    \includegraphics[width=0.19\linewidth]{imgs/fig_gau/bw_2.pdf}\hfill
    \includegraphics[width=0.19\linewidth]{imgs/fig_gau/bw_4.pdf}\hfill
    \includegraphics[width=0.19\linewidth]{imgs/fig_gau/bw_7.pdf}\hfill
    \includegraphics[width=0.19\linewidth]{imgs/fig_gau/cl_final.pdf}\\
    \includegraphics[width=0.19\linewidth]{imgs/fig_moon/cl_gt.pdf}\hfill
    \includegraphics[width=0.19\linewidth]{imgs/fig_moon/bw_10.pdf}\hfill
    \includegraphics[width=0.19\linewidth]{imgs/fig_moon/bw_20.pdf}\hfill
    \includegraphics[width=0.19\linewidth]{imgs/fig_moon/bw_35.pdf}\hfill
    \includegraphics[width=0.19\linewidth]{imgs/fig_moon/cl_final.pdf}
    \caption{Demonstration of the clustering and representation learning process on two synthetic datasets. The leftmost figures are the ground truth clustering results. The middle figures show the latent representation learned by DDPM during the training. The rightmost figures show the final clustering results, with the circles denoting 2 standard deviations of the Gaussian distributions. We can see that DDPM is able to learn better representation during clustering. Particularly in the second example, the raw data representation is challenging for many centroid-based clustering methods, and the benefit of the new representation learned by DDPM is quite evident. Also note that the number of clusters is unknown in advance.}
    \label{fig:synthetic}
\end{figure*}

\section{Introduction}

Clustering is one of the most long-standing and fundamental tasks in computer science. Besides the well-known $k$-means algorithm \citep{macqueen1967some} and Gaussian mixture models (GMMs) \citep{bishop:2006:PRML}, a plethora of methods have been proposed \citep{ester1996density, szekely2005hierarchical,frey2007clustering,zhao2008g}. In those early investigations, clustering is performed in the raw feature space, and the models lack the capacity of learning or improving the expressiveness of the representation.

With the recent success of deep neural networks (DNNs), a new line of research termed deep clustering has emerged \citep{xie2016unsupervised, yang2016joint, jiang2017variational, caron2018deep}. These works are based on the intuitive principle that good representation encourages better clustering, and similarly, good clustering can lead to better representation. Their methods take advantage of the successful deep neural network structures, such as convolutional neural networks (CNNs) \citep{krizhevsky2012imagenet} and variational autoencoders (VAEs) \citep{KingmaW13}. Their superior performance demonstrates the benefits of joint clustering and representation learning.

However, all these methods share the same shortcoming---they consider the number of clusters (\ie, $k$ in $k$-means) as a hyperparameter that needs to be specified by the user. This brings severe restrictions to their applications: 1) most of these algorithms are sensitive to the choice of $k$; 2) users often lack prior knowledge of the number of clusters; 3) this value itself may be time-varying (\eg, $k$ may increase as more data is accumulated); 4) in some scenarios, particularly for large datasets, there is no golden ground truth for this value \citep{li2018learning}.


The Dirichlet process mixture (DPM) model, which belongs to the Bayesian nonparametric family, is a popular method that can solve this conundrum \citep{antoniak1974mixtures}. Its solid mathematical background originates from the Dirichlet process \citep{ferguson1973bayesian}, which is an infinite generalization of the Dirichlet distribution. DPM can model the data with a possibly infinite number of mixtures, and the exact number of mixtures is rigorously inferred by the Bayesian principle. %Furthermore, DPM also possesses the ability to adapt the number of mixtures as more data arrives. 
It is thus desirable to combine the strengths of DPM and deep neural networks, which could lead to a clustering method that simultaneously adjusts the number of mixture components and learns better representation.



In this paper, we propose the deep Dirichlet process mixture (DDPM) model, which brings the above idea into realization.
Two working examples of DDPM are presented in Fig.~\ref{fig:synthetic}, where the potential of DDPM is clearly demonstrated, particularly its ability to exploit the mutually beneficial relationship between clustering and feature learning. Our method bridges the standard DPM with the recently proposed flow-based generative models \citep{DinhKB14, kingma2018glow}, which are a special kind of invertible deep neural network that learns better representation through density transformation. The overall clustering process is guided by the Bayesian principle, while the optimization of model parameters is derived from the Monte Carlo expectation-maximization (EM) algorithm \citep{bishop:2006:PRML}. During the iterations, Gibbs sampling is used to obtain samples from the posterior as in the standard DPM literature. DDPM also works as a generative model, so that unseen new samples can be obtained by adding noise to the learned features. The source code for reproducing our main experiments is publicly available at \url{https://github.com/naiqili/DDPM}.

% In the nex section we ...

\section{Related work} 

Clustering and representation learning are both among the most well-investigated topics in computer science. Besides the well-known $k$-means \citep{macqueen1967some}, GMMs \citep{bishop:2006:PRML}, and their variants \citep{zhao2008g, li2021tk}, the recent success of deep neural networks has inspired a new paradigm called deep clustering. The works of \citet{yang2016joint} and \citet{caron2018deep} try to take advantage of convolutional neural networks in computer vision tasks. Their major difference lies in the loss function and the training scheme.
Other DNN structures are also exploited. \cite{xie2016unsupervised} proposed Deep Embedded Clustering (DEC), which jointly optimizes deep embeddings and performs clustering based on features extracted from deep autoencoders. In \citep{jiang2017variational}, Variational Deep Embedding (VaDE) was introduced, which combines the variational autoencoder with the GMM model.
However, all these works share the same insufficiency, \ie, the number of clusters is a hyperparameter that needs to be specified by the user.
As mentioned above, this presents challenges for their applications in real-world scenarios, where prior knowledge about the number of clusters is generally unavailable.
%As aforementioned in many scenarios the users lack precise knowledge about the number of clusters, and this number may also increase as more data is accumulated.

The Dirichlet process mixture model \citep{antoniak1974mixtures}, which belongs to the Bayesian nonparametric family, is one of the most popular methods that are capable of inferring the number of clusters automatically. This distinguishing ability has been further utilized and strengthened in subsequent work. \cite{teh2006hierarchical} proposed Hierarchical Dirichlet processes, which can share mixture components among different clusters. The maximum margin DPM (MMDPM) introduces a discriminative model for clustering, which bridges DPM and the SVM classifier \citep{chen2016maximum}. All these methods operate in the raw feature space, without the ability to learn more expressive representation.
Recently \cite{echraibi2020variational} proposed the Dirichlet process deep latent Gaussian mixture model (DP-DLGMM), which combines the Dirichlet process prior with the deep latent Gaussian mixture model so that the number of mixture components can be adjusted. However, their work focuses on representation learning rather than clustering.

Naturally one may wonder whether it is possible to simultaneously infer the number of mixture components and learn better (possibly nonlinear) features. 
\cite{ehsan2017infinite} proposed to use an infinite mixture of VAEs to model the data. Since the number of effective VAEs may increase as more data arrives, their model can adapt to the complexity of the data. \citet{NalisnickS17} presented stick-breaking variational autoencoders (SB-VAEs), which model the latent variables in VAEs to be infinite-dimensional. Dirichlet processes and particularly the stick-breaking construction serve as the cornerstones in their method. Our work differs from theirs in both goal and methodology: their works focus on improving the performance of semi-supervised classification tasks, while our research considers the unsupervised task of clustering where the number of mixtures and better representation need to be jointly learned. Both of their models are based on VAEs while our method utilizes the recently proposed flow-based invertible deep neural network, which demonstrates superior performance in various density estimation and computer vision tasks \citep{DinhKB14, kingma2018glow}.

\section{Methodology}

\subsection{Overview}

Consider the input as a set $\mX=\{\mx_i \in \bbR^D\}_{i=1}^N$. Our first step is to use a standard dimension reduction technique to extract representative features, so that the following clustering can be performed in the lower dimensional \textbf{feature space}. In our work we use the stacked autoencoder \citep{vincent2010stacked, xie2016unsupervised} to extract the features as $\mY=\{\my_i=h_e(\mx_i)\in \bbR^d\}_{i=1}^N$ ($d \ll D$), where $h_e$ and $h_d$ denote the encoder and decoder functions respectively, such that $h_d(h_e(\mx)) \approx \mx$. Next the features are transformed by a nonlinear learnable function $f(\my ; \mtheta)$ into $\mZ = \{\mz_i=f(\my_i ; \mtheta)\in \bbR^d\}_{i=1}^N$. Clustering is performed in this \textbf{transformed space}. We assume that each $\mz_i$ follows an isotropic Gaussian distribution. Supposing that $\mz_i$ belongs to the $k$-th cluster, the likelihood is given by:
\begin{align}
    p(\mz_i| \mmu_k, \lambda_k) & =\mN(\mz_i| \mmu_k,  \lambda_k^{-1} \mI) \notag\\
    &=\left(\frac{\lambda_k}{2\pi}\right)^{\frac{d}{2}} \exp\left(-\frac{\lambda_k}{2}\|\mz_i - \mmu_k\|^2\right), \label{eq:llh}
\end{align}
where $\mmu_k$ and $\lambda_k$ denote the mean and the precision of the $k$-th cluster. Note that the isotropic Gaussian assumption does not restrict the model's capacity since the transformation $f(\my_i ; \mtheta)$ is nonlinear, which is implemented by a deep neural network in practice. The cluster assignment variables are denoted as $\mc=\{c_i\in \{1, ..., K\}\}_{i=1}^N$, where $c_i=k$ indicates that $\mz_i$ belongs to the $k$-th cluster. Here the total number of clusters $K$ is unknown to us and could be arbitrarily large. 

From the Bayesian perspective, the task of clustering is equivalent to the inference of
\begin{align}
    p(\mc, \{\mmu_k\}_{k=1}^K, \{\lambda_k\}_{k=1}^K | \mY; \mtheta, \mPhi). \label{eq:obj}
\end{align}
Here $\mPhi$ is the set of hyperparameters that specify the prior, which will be introduced in the next subsection. We emphasize the challenges of the task: 1) the number of clusters $K$ is unknown; 2) $\mtheta$ parameterizes a deep neural network which needs to be learned; 3) the cluster information (\ie, $\mmu_k$ and $\lambda_k$) needs to be computed at the same time. In what follows we will see how the proposed method can address all these challenges under a unified framework.

\subsection{Model Specification}

\textbf{Likelihood in the feature space} \quad Eq. (\ref{eq:llh}) describes the likelihood function in the transformed space. Now we derive the likelihood in the feature space (\ie, before the transformation) as follows:
\begin{align}
    &p(\mY | \mc, \{\mmu_k\}_{k=1}^K, \{\lambda_k\}_{k=1}^K; \mtheta, \mPhi) \notag\\
    =&\prod_{i=1}^N p(\my_i | \mmu_{c_i}, \lambda_{c_i}; \mtheta, \mPhi)\notag\\
    =&\prod_{i=1}^N p(\mz_i | \mmu_{c_i}, \lambda_{c_i})\left|\det \frac{\partial f(\my_i ; \mtheta)}{\partial \my_i}\right|. \label{eq:llh-jacob}
\end{align}
The last term in Eq. (\ref{eq:llh-jacob}) is due to the change of variables $\mz_i=f(\my_i ; \mtheta)$. Recall that $p(\mz_i | \mmu_{c_i}, \lambda_{c_i})$ is given in Eq. (\ref{eq:llh}).

For a general nonlinear function $f(\my_i ; \mtheta)$ implemented by a neural network, the Jacobian term in Eq. (\ref{eq:llh-jacob}) is analytically intractable. To address this problem we utilize the NICE model \citep{DinhKB14}, which is a flow-based deep neural network with the appealing property that the determinant of the Jacobian can be trivially computed. The basic idea of NICE is that, in each layer it splits the input $\my$ into two parts as $\my=\{\my_1,\my_2\}$, and defines the output as $\mz=\{\mz_1,\mz_2\}$ where $\mz_1=\my_1$ and $\mz_2=\my_2+\sigma(\my_1)$, with $\sigma(\cdot)$ being a learnable coupling function (\eg, a neural network). It is easy to verify that for such a function the determinant of the Jacobian equals one. After stacking multiple such layers, we have a highly nonlinear function whose Jacobian term equals one and can therefore be dropped. Another useful property of NICE is that the transformation is invertible. Particularly if $\mz = f(\my ; \mtheta)$, $\my = f^{-1}(\mz ; \mtheta)$ can also be easily computed. Interested readers may refer to \citep{DinhKB14} for further details.

\noindent \textbf{Dirichlet process mixture model in the transformed space} \quad After the transformation $\mz_i=f(\my_i ; \mtheta)$, we can now perform clustering in this transformed space. In our problem the number of clusters $K$ is unknown. The Dirichlet process mixture (DPM) model \citep{antoniak1974mixtures,neal2000markov, li2019tutorial} is one of the most popular tools in this situation. Here we present a concise review of DPM models, and in the next subsection we will see how to connect them with NICE.

Given a measurable space $(\Theta, \mathcal{A})$ where $\mathcal{A}$ is a $\sigma$-algebra defined on $\Theta$, the Dirichlet process (DP) \citep{ferguson1973bayesian} is characterized by a probability measure $G_0$ on the measurable space, and a positive scaling parameter $\alpha$. A DP is a random probability measure over $(\Theta, \mathcal{A})$, denoted as $G\sim DP(G_0, \alpha)$, such that for any finite measurable partition $(A_1,\dots,A_r)$ of $\Theta$ we have
\begin{align}
    (G(A_1),\dots,G(A_r)) \sim Dir(\alpha G_0(A_1),\dots,\alpha G_0(A_r)),\notag
\end{align}
where $Dir$ is the finite-dimensional Dirichlet distribution. In other words, a DP is a ``distribution over distributions''.

The DPM model is based on DP. Under our formulation, it is defined as
\begin{align}
    & G | G_0, \alpha \sim DP(G_0, \alpha) \label{eq:dpm1}\\
    & \mmu_k, \lambda_k | G \sim G(\mmu_k, \lambda_k) \label{eq:dpm2}\\
    & \mz_i| \mmu_k, \lambda_k \sim p(\mz_i| \mmu_k, \lambda_k) \label{eq:dpm3}
\end{align}
The likelihood in (\ref{eq:dpm3}) is given in Eq. (\ref{eq:llh}), \ie, in our work the DPM model is applied in the transformed space $\mZ = \{\mz_i=f(\my_i ; \mtheta)\}_{i=1}^N$. We define the base distribution $G_0$ as the conjugate prior of the likelihood, which is the normal-gamma distribution \citep{bishop:2006:PRML}\footnote{In our work we consider $\mmu_k$ as a vector, while in the standard normal-gamma distribution it is a scalar.}:
\begin{align}
    \mmu_k,& \lambda_k |\mmu_0, \kappa_0, \alpha_0, \beta_0 \sim NG(\mmu_k, \lambda_k |\mmu_0, \kappa_0, \alpha_0, \beta_0) \notag \\
    &=\mN(\mmu_k| \mmu_0, (\kappa_0 \lambda_k)^{-1}\mI) Gamma(\lambda_k|\alpha_0, \beta_0).
\end{align}

A key property of DPM models is that the marginalized conditional distribution of the cluster assignment variable has a closed form:
\begin{align}
p\left(c_{i}=k \mid \mc_{-i}, \alpha\right)=
\begin{cases}
\dfrac{n_{-i, k}}{N-1+\alpha}, & n_{-i,k}>0
\vspace{1em}\\
\dfrac{\alpha}{N-1+\alpha}, & n_{-i,k}=0
\end{cases}
\end{align}
where $\mc_{-i}=\mc\setminus\{c_i\}$, $N$ is the number of all data points, and $n_{-i, k}$ is the size of the $k$-th cluster excluding the $i$-th datum.
Intuitively the first formula is the probability of assigning the $i$-th datum into the $k$-th existing cluster, while the second formula is the probability of assigning it to a new cluster.
The proof of this result can be found in the related literature \citep{gorur2010dirichlet,chen2016maximum,li2019tutorial}.

We collect all the hyperparameters in the DPM model as $\mPhi=\{\alpha, \mmu_0, \kappa_0, \alpha_0, \beta_0\}$, which was used in Eq. (\ref{eq:obj}). To keep the representation succinct we will suppress $\mPhi$ in the following discussions, unless it is explicitly needed.

\subsection{Unified Parameter Estimation}
To combine NICE and DPM models, the key question is how to learn the deep neural network, or in other words how to optimize the parameters $\mtheta$ in $f(\my_i;\mtheta)$. In this subsection we will address this challenge with the Monte Carlo  expectation-maximization (MC-EM) algorithm. The whole process is summarized in Algorithm \ref{alg:main}.

\subsubsection{The Overall MC-EM Framework}
We consider $\mc, \{\mmu_k\}_{k=1}^K$ and $\{\lambda_k\}_{k=1}^K$ in Eq. (\ref{eq:obj}) as hidden variables, $\mY$ as the observed variables, and $\mtheta$ as the set of parameters that need to be optimized. To keep the notation succinct, we denote $\mH=\{\{\mmu_k\}_{k=1}^K, \{\lambda_k\}_{k=1}^K\}$. Following the maximum likelihood principle, the optimal $\mtheta^*$ is:
\begin{align}
    \mtheta^*={\arg \max}_{\mtheta} p(\mY | \mtheta).
\end{align}
This can be solved by the expectation-maximization (EM) algorithm \citep{dempster1977maximum}, which iterates between the E-step and M-step until convergence:
\begin{align}
    \mbox{E: } 
    &  Q(\mtheta, \mtheta^{(old)}) = E_{\mH, \mc| \mY,  \mtheta^{(old)}}[\log p(\mH, \mc, \mY | \mtheta)] \label{eq:estep}\\
    \mbox{M: }
    &\mtheta^{(new)} = {\arg\max}_{\mtheta } Q(\mtheta, \mtheta^{(old)}) \label{eq:mstep}\\
    &\mtheta^{(old)} \leftarrow \mtheta^{(new)}
\end{align}
In the case that the E-step (\ref{eq:estep}) has no closed-form solution, it can be numerically estimated as
\begin{align}
    Q(\mtheta, \mtheta^{(old)}) \approx \frac{1}{G} \sum_g \log p(\mH^{(g)}, \mc^{(g)}, \mY | \mtheta), \label{eq:mc-em}
\end{align}
where $\mH^{(g)}, \mc^{(g)} \sim  p(\mH, \mc| \mY,  \mtheta^{(old)})$ are i.i.d.\ samples and $G$ is the sample size (not to be confused with the random measure $G$ in the DPM definition). This method is called the Monte Carlo EM algorithm \citep{bishop:2006:PRML}. As $\mtheta$ denotes the parameters of a deep neural network, we can use stochastic gradient ascent to find the maximizer in Eq. (\ref{eq:mstep}):
\begin{align}
    \mtheta_{t+1} & \leftarrow\mtheta_t + \lambda_s\frac{\partial }{\partial \mtheta}Q(\mtheta, \mtheta^{(old)})\\
     & \approx\mtheta_t + \frac{\lambda_s}{G} \sum_g \frac{\partial }{\partial \mtheta}\log p(\mH^{(g)}, \mc^{(g)}, \mY | \mtheta) \label{eq:sgd},
\end{align}
where $\lambda_s$ is the learning rate. Finally to complete the picture, we need to:
\begin{itemize}
    \item Present the analytical form of the complete data likelihood $p(\mH^{(g)}, \mc^{(g)}, \mY | \mtheta)$, and particularly the derivative of the log-likelihood in Eq. (\ref{eq:sgd});
    \item Develop a method to obtain the samples $\mH^{(g)}, \mc^{(g)} \sim  p(\mH, \mc| \mY,  \mtheta^{(old)})$.
\end{itemize}
These two questions will be addressed respectively in the following two subsections.

\subsubsection{The Complete Data Likelihood}

With our discussions in the model specification section, the complete data likelihood can be derived as follows:
\begin{align*}
&p(\mH^{(g)}, \mc^{(g)}, \mY | \mtheta) \notag\\
=&  p(\mY |\mH^{(g)}, \mc^{(g)}; \mtheta) p(\mH^{(g)}, \mc^{(g)})\\
=& p(\mZ | \mH^{(g)}, \mc^{(g)}) p(\mH^{(g)}, \mc^{(g)}) \prod_i |\det \frac{\partial f(\my_i ; \mtheta)}{\partial \my_i}|\\
=& p(\mZ | \mH^{(g)}, \mc^{(g)}) p(\mH^{(g)})p(\mc^{(g)})\\
=& \prod_i p(\mz_i | \mmu_{c_i}^{(g)}, \lambda_{c_i}^{(g)}) \prod_k p(\mmu_{k}^{(g)}, \lambda_{k}^{(g)})p(\mc^{(g)}), 
\end{align*}
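where $p(\mz_i | \mmu_{c_i}^{(g)}, \lambda_{c_i}^{(g)})$ is given in Eq. (\ref{eq:llh}). The Jacobian factor in the third line is dropped in the fourth line because its determinant equals one for the NICE transformation.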
As we are interested in optimizing the neural network's parameters $\mtheta$, and the likelihood only involves $\mtheta$ through $\mz_i = f(\my_i;\mtheta)$, the derivative of the log-likelihood is:
\begin{align}
    &\frac{\partial }{\partial \mtheta}\log p(\mH^{(g)}, \mc^{(g)}, \mY | \mtheta) \notag\\
    = &\sum_i \frac{\partial }{\partial \mtheta}\log p(\mz_i | \mmu_{c_i}^{(g)}, \lambda_{c_i}^{(g)}) \notag\\
    = &\sum_i -\lambda_{c_i}^{(g)} (\mz_i -\mmu_{c_i}^{(g)})^{\top} \frac{\partial f(\my_i;\mtheta) }{\partial \mtheta}.\label{eq:grad}
\end{align}
Since $f(\my_i;\mtheta)$ is implemented by a DNN, its derivative can be automatically computed by many modern machine learning frameworks like PyTorch \citep{NEURIPS2019_9015}.


\begin{algorithm}[t]
    \caption{DDPM$(\mX, h_e(\cdot), \mPhi, \lambda_s)$} \label{alg:main}
     \begin{algorithmic}[1]
     \REQUIRE Input dataset $\mX$; encoder $h_e(\cdot)$; hyperparameters $\mPhi$; learning rate $\lambda_s$
     \ENSURE Cluster parameters $\{\mmu_k, \lambda_k\}_{k=1}^K$; cluster assignment vector $\mc$.
     
     \STATE Initialize neural network's parameters $\mtheta$
     \STATE $\mY \leftarrow \{h_e(\mx_i)|\mx_i \in \mX\}_{i=1}^N.$
     
     \FOR{$epoch$ in \{1, ..., EPOCHS\}}{
     \STATE $\mZ^{(old)} \leftarrow \{f(\my_i;\mtheta)\}_{i=1}^N$
     \STATE $\backslash\backslash$ E-step; iterations for the Gibbs sampling
     
        \FOR{$t$  in \{1, ..., GIBBS\_STEPS\}}{
        \STATE For each $k$ sample $\mmu_k, \lambda_k \sim p(\mmu_k, \lambda_k | \mH \setminus\{\mmu_k, \lambda_{k}\}, \mc,  \mZ^{(old)} )$ by Eq. (\ref{eq:cond-para})
        
        \STATE For each $i$ sample $c_i \sim p(c_i | \mc_{-i}, \mZ^{(old)}, \mH)$ by Eq. (\ref{eq:cond-c-old}) and  Eq. (\ref{eq:cond-c-new})
        }
        \ENDFOR
        
    \STATE For each $k$ sample $\mmu_k^{(g)}, \lambda_k^{(g)}$, for each $i$ sample $c_i^{(g)}$
        
     \STATE $\backslash\backslash$ M-step; optimization of the NICE model
     
        \FOR{$t$  in \{1, ..., OPT\_STEPS\}}{
        \STATE Sample a batch $\mY^{(b)}\leftarrow\{\my_i\in\mY\}_{i=1}^B$
        
        \STATE $\nabla _\theta \leftarrow \frac{\partial }{\partial \mtheta}\log p(\mH^{(g)}, \mc^{(g)}, \mY^{(b)} | \mtheta)$ (Eq. (\ref{eq:grad}))
        
        \STATE $\mtheta \leftarrow \mtheta + \lambda_s \nabla _\theta$
        }
        \ENDFOR
     }
     \ENDFOR
     
     \STATE Return sampled cluster parameters $\{\mmu_k^{(g)}, \lambda_k^{(g)}\}_{k=1}^K$, and the  cluster assignment vector $\mc^{(g)}$
    \end{algorithmic}
\end{algorithm}

\subsubsection{Gibbs Sampling}

Next we consider how to obtain the samples $\mH^{(g)}, \mc^{(g)} \sim  p(\mH, \mc| \mY,  \mtheta^{(old)}) = p(\mH, \mc| \mZ^{(old)})$, where we define that $\mZ^{(old)}=\{\mz^{(old)}_i=f(\my_i;  \mtheta^{(old)})\}_{i=1}^N$. This can be achieved by Gibbs sampling, which states that we can sample from the joint distribution by iteratively sampling from the conditional distribution of each variable while keeping others fixed \citep{bishop:2006:PRML}. So in what follows we will derive the conditional distribution of each variable.

\textbf{Conditional distribution of $\mmu_k$ and $\lambda_k$:}
\begin{align}
&p(\mmu_k, \lambda_k | \mH \setminus\{\mmu_k, \lambda_{k}\}, \mc,  \mZ^{(old)} ) \notag\\
=&  p(\mmu_k, \lambda_k | \mc,  \mZ^{(old)}  )  \notag\\
=& p(\mmu_k, \lambda_k | [\mZ^{(old)} ]_k) \notag \\
= & NG(\mmu_k, \lambda_k | \mmu_n, \kappa_n, \alpha_n, \beta_n), \label{eq:cond-para}
\end{align}
where $[\mZ^{(old)} ]_k=\{\mz_i \in \mZ^{(old)} | c_i=k\}$ denotes all the latent variables which are assigned to the $k$-th cluster. So the result is a normal-gamma distribution, with parameters given by:
\begin{align*}
& n_k = \#[\mZ^{(old)} ]_k,\quad
\bar{\mz}_k = \frac{1}{n_k} \sum _{\mz_i \in [\mZ^{(old)} ]_k}\mz_i, \\
& \mmu_{n}= \frac{\kappa_0 \mmu_{0}+n_k \bar{\mz}_k}{\kappa_0 +n_k}, \quad
\kappa _{n}= \kappa_0 +n_k, \\
& \alpha_n = \alpha_0 + \frac{n_k d}{2}, \\
& \beta_n = \beta_0 + \frac{1}{2} \sum _{\mz_i \in [\mZ^{(old)} ]_k} \|\mz_i - \bar{\mz}_k\|_2^2 +\frac{\kappa_0 n_k \|\bar{\mz}_k-\mmu_0\|_2^2}{2(\kappa_0+n_k)}.
\end{align*}

\textbf{Conditional distribution of $c_i$:}

$\bullet$ If $n_{-i,k}>0$ (assign to an existing cluster):
\begin{align}
& \log p(c_i=k | \mc_{-i}, \mZ^{(old)}, \mH)\notag\\
= & \log p\left(c_{i}=k \mid \mc_{-i}, \alpha\right) + \log p(\mz_i | \mmu_k, \lambda_k) + \mathrm{const}\notag\\
= & \log \dfrac{n_{-i, k}}{N-1+\alpha} + \log \mN(\mz_i | \mmu_k, \lambda_k^{-1}\mI) + \mathrm{const} \label{eq:cond-c-old}
\end{align}

$\bullet$ If $n_{-i,k}=0$ (assign to a new cluster):
\begin{align}
& \log p(c_i=k | \mc_{-i}, \mZ^{(old)}, \mH)\notag\\%=  \log p(c_i=k | \mc_{-i}, \mZ^{(old)})
= & \log p(\mz_i | \mmu_0, \kappa_0, \alpha_0, \beta_0) p\left(c_{i}=k \mid \mc_{-i}, \alpha\right)  + \mathrm{const}\notag\\
= & \log \int p(\mz_i | \mmu, \lambda) NG(\mmu, \lambda | \mmu_0, \kappa_0, \alpha_0, \beta_0) d \mmu d \lambda + \notag\\
&\log p\left(c_{i}=k \mid \mc_{-i}, \alpha\right)  + \mathrm{const}\notag\\
= &\log \Gamma(\alpha_n')  - \log \Gamma(\alpha_0) + \alpha_0 \log \beta_0 - \alpha_n' \log \beta_n' + \notag\\
& \frac{d}{2} (\log \kappa_0 - \log \kappa_n') - \frac{d}{2} \log 2 \pi +\notag\\
& \log \dfrac{\alpha}{N-1+\alpha} + \mathrm{const, } \label{eq:cond-c-new}
\end{align}
where
\begin{align*}
&\kappa_n' = \kappa_0 + 1, \\
&\alpha_n' = \alpha_0 + d/2, \notag\\
&\beta_n' = \beta_0 + \frac{\kappa_0 ||\mz_i-\mmu_0||_2^2}{2(\kappa_0+1)}.\notag    
\end{align*}

Since the normal-gamma distribution is the conjugate prior of the likelihood, the integration in the third line of Eq. (\ref{eq:cond-c-new}) is analytically tractable \citep{murphy2007conjugate}.

% We summarize our method in Algorithm \ref{alg:main}. In practice GIBBS\_STEP is a varying parameter that decreases with the epochs, as the Gibbs sampling needs more time to ensure convergence at the beginning.

\section{Experiments}

\subsection{Synthetic datasets}

We begin by demonstrating DDPM's potential on two synthetic datasets, and the results are shown in Fig. \ref{fig:synthetic}.

\textbf{Non-isotropic Gaussian dataset:} \quad In the first example we generate three clusters. The data points in each cluster are sampled from a non-isotropic Gaussian distribution, with a different covariance matrix for each cluster. As the training continues, the data points gradually ``concentrate'' and approximate the standard Gaussian distribution, which is a more convenient representation for clustering.

\textbf{Intertwined moon dataset:} \quad In this example the initial raw data consists of two intertwined clusters of moon shapes. Note that such representation is challenging for many centroid-based clustering methods, including $k$-means and DPM. Interestingly, during the training of DDPM, the clusters in the latent representation space are automatically disentangled and are finally successfully identified. This example shows that DDPM can learn better representation during the training, and also illustrates the mutually beneficial relationship between clustering and feature learning.

\subsection{Real-world datasets}

\subsubsection{Datasets and settings}

\textbf{Datasets}: We evaluate our method on four widely used real-world datasets: MNIST \citep{lecun1998gradient}, HHAR \citep{stisen2015smart}, STL-10 \citep{coates2011analysis}, and REU-10K \citep{lewis2004rcv1}. MNIST is a handwritten digit database containing 10 classes of 784-dimensional training samples, with 7000 samples for each class. HHAR is a sensor signal classification dataset containing 10 classes of 561-dimensional training samples and 10200 samples in total. STL-10 is an image recognition dataset, which contains unlabeled data for unsupervised learning. It contains 10 classes of 2048-dimensional training samples and 1300 samples for each class. REU-10K is a text classification dataset consisting of the TF-IDF features of the words. It has 4 classes of 2000-dimensional training instances, and 10000 samples in total.

\begin{table}[t]
\centering
\caption{Hyperparameters of the prior distribution. } \label{table:parameter}
\begin{tabular}{l|lllll}
\toprule
Dataset & $\alpha$ & $\mu_0$ & $\kappa_0$    & $\alpha_0$ & $\beta_0$   \\
\midrule
MNIST   & 1.0E-03 & 0.0 & 0.005 & 2000  & 1000 \\
HHAR    & 1.0E-10 & 0.0 & 0.005 & 6000  & 1000 \\
STL-10  & 1.0E-10 & 0.0 & 0.005 & 10000 & 1000 \\
REU-10K & 1.0E-10 & 0.0 & 0.005 & 6000  & 1000\\
\bottomrule
\end{tabular}
\end{table}

\begin{table}[t]
\centering
\caption{Performance comparison on real-world datasets. } \label{table:compare_result}
\begin{tabular}{c|l|lll}
\toprule
{Dataset}     &  Methods    & ARI             & F score         & V score         \\
\midrule
\multirow{3}{*}{MNIST}   & G-means  & 0.1126          & 0.1255          & 0.5314          \\
                         & DPM  & 0.3974          & 0.4511          & 0.5571          \\
                         & DDPM & \textbf{0.4400} & \textbf{0.4917} & \textbf{0.6016} \\
\midrule
\multirow{3}{*}{HHAR}     & G-means  & 0.0904          & 0.1146          & 0.4358          \\
                         & DPM  & 0.4342          & 0.5385          & 0.5761          \\
                         & DDPM & \textbf{0.4473} & \textbf{0.5449} & \textbf{0.5865} \\
\midrule
\multirow{3}{*}{STL-10}  & G-means  & 0.2140          & 0.2512          & 0.4830          \\
                         & DPM  & 0.2156          & 0.3073          & 0.4679          \\
                         & DDPM & \textbf{0.2269} & \textbf{0.3193} & \textbf{0.4917} \\
\midrule
\multirow{3}{*}{REU-10K} & G-means  & 0.0581          & 0.0933          & 0.3147          \\
                         & DPM  & 0.1406          & 0.2365          & 0.3662          \\
                         & DDPM & \textbf{0.1827} & \textbf{0.2756} & \textbf{0.3918}\\
                         \bottomrule
\end{tabular}
\end{table}

\textbf{Model structure and settings}: We train the autoencoder following the prior work of \citet{jiang2017variational}. The network structure is $d$-500-500-2000-10 for the encoder and 10-2000-500-500-$d$ for the decoder, where $d$ denotes the dimension of the preprocessed input samples. The encoded features are normalized to have zero mean and unit standard deviation. The NICE model has 6 layers, each containing 512 units. The training starts with running the standard DPM model for 3 epochs, and each epoch sweeps through the whole dataset 3 times. After that the main DDPM algorithm runs for 5 epochs. In each epoch the loop of Gibbs sampling sweeps the dataset 3 times, and the NICE model is trained for $0.2N$ iterations ($N$ is the size of the dataset). The batch size is 128 and the learning rate is set to 1.0E-6.
The hyperparameters of the prior are listed in Table \ref{table:parameter}.

% \subsubsection{Evaluation Metrics and Baselines}
\subsubsection{Numerical Results}


\begin{figure*}[t]
\centering

\begin{subfigure}[b]{\linewidth}
         \centering
         \includegraphics[width=\linewidth]{imgs/flow_advance_1x4_dpm.pdf} 
         \caption{The comparison of DDPM and DPM. The benefit of better feature learning is significant.}
\end{subfigure}
\hfill
\begin{subfigure}[b]{\linewidth}
         \centering
         \includegraphics[width=\linewidth]{imgs/flow_advance_1x4_kmeans.pdf}
         \caption{The comparison of applying $k$-means to DDPM's learned feature and the raw feature.}
\end{subfigure}

\caption{The performance of clustering using the raw autoencoder features (ae repr) and DDPM's learned features (flow repr). (a) DDPM significantly outperforms DPM by learning better representation. (b) By using the learned features in the standard $k$-means, all metrics in almost all the datasets are improved, and the improvement in the MNIST dataset is particularly significant. This demonstrates DDPM's ability to learn better and transferable representation.} \label{fig:flow_advance}
\end{figure*}


\begin{figure*}[t]
\centering
\includegraphics[width=\linewidth]{imgs/tsne_mnist_flow2.pdf}
\centering
\caption{The representation learning process of DDPM on MNIST. The clearest example is the cluster of number 1, which becomes denser and more concentrated as the training process progresses. Its shape gradually changes from a crescent to a circle, making it easier to distinguish from other clusters.} \label{fig:xxn}
\end{figure*}

In our study, we assume the number of clusters $K$ is unknown, so we focus on three $K$-agnostic evaluation metrics for performance comparison: \textit{adjusted Rand index (ARI)} \citep{steinley2004properties}, \textit{clustering F1 score (F score)}, and \textit{V-measure score (V score)} \citep{rosenberg2007v}.
For all these metrics larger means better, and their definitions are presented as follows.

\begin{itemize}
    \item{
\textbf{ARI} is an adjusted version of the Rand Index (RI). Suppose that $C$ is the ground truth class assignment and $K$ is the predictive clustering. We define $a$ as the number of pairs of elements that are in the same set in $C$ and in the same set in $K$, and define $b$ as the number of pairs of elements that are in different sets in $C$ and in different sets in $K$. RI is then given by $\mathrm{RI}=\frac{a+b}{C_{N}^{2}}$, where $C_{N}^{2}$ is the total number of possible data pairs. Finally, ARI is defined as
$\mathrm{ARI}=\frac{\mathrm{RI}-E[\mathrm{RI}]}{\max (\mathrm{RI})-E[\mathrm{RI}]}.$
}

\item{
\textbf{F score} for clustering evaluation is just a traditional F1 score calculated based on a pair confusion matrix. Similar to ARI, the pair confusion matrix \citep{hubert1985comparing} computes a 2 by 2 similarity matrix between two clusterings by considering all pairs of samples and counting pairs that are assigned into the same or into different clusters under the ground truth and the predicted cluster assignments.
}

\item{
\textbf{V score} is defined based on the homogeneity term $h=1-\frac{H(C \mid K)}{H(C)}$ and the completeness term $c=1-\frac{H(K \mid C)}{H(K)}$. Here $H(C)$ is the entropy of the classes and $H(C \mid K)$ is the conditional entropy of the classes, defined as
\begin{align*}
H(C)&=-\sum_{c=1}^{|C|} \frac{n_{c}}{N} \cdot \log \left(\frac{n_{c}}{N}\right), \\
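H(C \mid K)&=-\sum_{c=1}^{|C|} \sum_{k=1}^{|K|} \frac{n_{c, k}}{N} \cdot \log \left(\frac{n_{c, k}}{n_{k}}\right),
\end{align*}
where $n_c$ and $n_k$ denote the numbers of samples belonging to class $c$ and cluster $k$, respectively, and $n_{c,k}$ is the number of samples from class $c$ assigned to cluster $k$; $H(K)$ and $H(K \mid C)$ are defined symmetrically. Homogeneity encourages each cluster to contain only members of a single class, and completeness prefers all members of a given class to be assigned to the same cluster. The V score is finally defined as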
$
v=2 \cdot \frac{h \cdot c}{h+c}.
$
}
\end{itemize}

We compare with two other clustering baselines that can infer the number of clusters $K$, \ie, the standard Dirichlet Process Mixture (DPM) Model and G-means \citep{zhao2008g}. The numerical results are presented in Table \ref{table:compare_result}, where the best results are highlighted with bold font. We can see that DDPM consistently outperforms other baselines across various datasets and metrics.


\begin{figure*}[t]
\centering
\includegraphics[width=\linewidth]{imgs/mnist_gen_flow.pdf}
\caption{The generated handwritten digits in the MNIST dataset.} \label{fig:mnist_ddpm_gen}
\end{figure*}

\subsubsection{Representation quality}

To show that the representation learned by DDPM is better than that of DPM and even transferable to other algorithms, we examine and compare the features before and after processing by the model in all datasets. We additionally apply the $k$-means clustering algorithm in the feature space (\ie, $\{\my_i=h_e(\mx_i)\in \bbR^d\}_{i=1}^N$) and the transformed space (\ie, $\{\mz_i=f(\my_i ; \mtheta)\in \bbR^d\}_{i=1}^N$), with the number of clusters $K$ specified by DDPM. The results are presented in Fig. \ref{fig:flow_advance}. It is obvious that the DPM performs better in all cases when the features learned from the DDPM are used. Moreover, with the same prior of $K$, $k$-means can also benefit from the learned representation (this is particularly evident on the MNIST dataset), indicating the transferability of the enhanced features. This means that the features learned by DDPM are not only suitable for DPM but also for other algorithms. We also visualize the learning process of DDPM in Fig. \ref{fig:xxn}. We randomly select 5,000 samples from the MNIST dataset and visualize their $t$-SNE embedding over different epochs. We can see the clusters become denser and more concentrated as the training progresses, which is potentially beneficial for clustering and label discrimination.

\subsubsection{DDPM as a generative model}

Benefiting from the reversibility of the flow model, DDPM can  be utilized as a generative model. %So we further visualize the generated images of DDPM learned embeddings in Figure 2. 
We select the largest cluster for each ground truth label, and generate 25 random samples by adding scaled noise to the cluster centers. Specifically for each selected cluster $k$, we obtain $\hat{\mmu}=\mmu_k+n \epsilon$, where $\epsilon\sim \mN(0,1)$ is a Gaussian noise and $n$ is the noise scale. Since the flow model is invertible, we can obtain the sample as $\hat{\mx}=h_d(f^{-1}(\hat{\mmu}; \mtheta))$. 
The visualization results are presented in Fig. \ref{fig:mnist_ddpm_gen}. It can be observed that DDPM is capable of generating clear handwritten digits with sharp edges.
%The results show that our method produces meaningful cluster center embeddings capable to generate rich chirography details and correct discriminative digital property.

\subsubsection{Impact of the autoencoder}

In this subsection we study the impact of the autoencoder's quality on the performance of our clustering method.
We trained the autoencoder for different numbers of iterations and repeated our experiments. As we previously discussed, DPM performs clustering directly on the features extracted by the autoencoder, while DDPM further applies the flow model. The results are presented below (the fully trained autoencoder is optimized for 1000000 iterations).
We can see that the performance of the clustering methods generally improves as the autoencoder is better trained. Nonetheless, for all the \#iter. settings, DDPM consistently outperforms DPM, and the improvement is robust. So our method is effective even if the autoencoder is not sufficiently trained or has low quality.


\begin{table*}[t]
\centering
\caption{Impact of the autoencoder trained with different numbers of iterations.}
\begin{tabular}{c|ccc|ccc|ccc}
\toprule
\multirow{2}{*}{\#iter.} & \multicolumn{3}{c|}{ARI} & \multicolumn{3}{c|}{F SCORE}            & \multicolumn{3}{c}{V SCORE}            \\
                         \cmidrule(lr){2-10}
                         & DPM     & DDPM   & $\uparrow$     & DPM    & DDPM   & $\uparrow$ & DPM    & DDPM   & $\uparrow$ \\
\midrule
250000                   & 0.2352  & 0.2618 & 11\% & 0.3309 & 0.3549 & 7\%                  & 0.3754 & 0.3928 & 5\%                  \\
500000                   & 0.3001  & 0.3316 & 10\% & 0.3844 & 0.4071 & 6\%                  & 0.4306 & 0.4481 & 4\%                  \\
750000                   & 0.2584  & 0.2815 & 9\%  & 0.3567 & 0.3790  & 6\%                  & 0.4117 & 0.4488 & 9\%                  \\
1000000                  & 0.3974  & 0.4400   & 11\% & 0.4511 & 0.4917 & 9\%                  & 0.5571 & 0.6016 & 8\%                 \\
\bottomrule
\end{tabular}
\end{table*}

\section{Conclusion}

In this paper we proposed the deep Dirichlet process mixture (DDPM) model, which jointly achieves clustering and feature learning in an unsupervised fashion. Our method combines the strengths of the traditional DPM models and deep neural networks. Based on the Dirichlet process, DDPM inherits the ability to adapt the number of mixture components, which is an important and useful feature in many real-world scenarios where prior knowledge of the clusters is unavailable. The invertible flow-based deep neural network component further enables DDPM to learn complex and nonlinear features. Experimental results suggested that DDPM can learn more expressive representation, and achieve better clustering performance compared to other baselines.
As for future work, we would like to improve the training efficiency by employing other inference techniques, such as the variational inference framework \citep{blei2006variational}. It is also interesting to extend our method to more sophisticated DPM models like the hierarchical Dirichlet processes \citep{teh2006hierarchical}.


\begin{acknowledgements} % will be removed in pdf for initial submission,
                         % so you can already fill it to test with the
                         % ‘accepted’ class option
    This work is supported in part by the National Natural Science Foundation of China under Grant 62171248, and the PCNL KEY project (PCL2021A07). It is also supported in part by Shenzhen Science and Technology Innovation Commission (Research Center for Computer Network (Shenzhen) Ministry of Education).
\end{acknowledgements}

\bibliography{li_223}

% \begin{contributions} % will be removed in pdf for initial submission,
%                       % so you can already fill it to test with the
%                       % ‘accepted’ class option
%     Briefly list author contributions.
%     This is a nice way of making clear who did what and to give proper credit.

%     H.~Q.~Bovik conceived the idea and wrote the paper.
%     Coauthor One created the code.
%     Coauthor Two created the figures.
% \end{contributions}



\end{document}
