% \documentclass{uai2023} % for initial submission
\documentclass[accepted]{uai2023} % after acceptance, for a revised
% version; also before submission to
                                    % see how the non-anonymous paper
                                    % would look like
%% There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2023} % ptmx math instead of Computer
                                         % Modern (has noticable issues)
% \documentclass[mathfont=newtx]{uai2023} % newtx fonts (improves upon
                                          % ptmx; less tested, no support)
% NOTE: Only keep *one* line above as appropriate, as it will be replaced
%       automatically for papers to be published. Do not make any other
%       change above this note for an accepted version.

%% Choose your variant of English; be consistent
\usepackage[american]{babel}
% \usepackage[british]{babel}

%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}

\usepackage{mathtools} % amsmath with fixes and additions
% \usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{tikz} % nice language for creating drawings and diagrams

% \usepackage{algorithm}
% \usepackage{algpseudocode}
% \usepackage{algorithmic}


% \usepackage{mathtools} % amsmath with fixes and additions
\usepackage{amsfonts}
% \usepackage{booktabs} % commands to create good-looking tables
% \usepackage{tikz} % nice language for creating drawings and diagrams

\usepackage{float}

\usepackage{subfigure}

%\newcommand{\theHalgorithm}{\arabic{algorithm}}

\newcommand{\csize}{
\fontsize{8}{8}\selectfont
}

\newcommand{\csizenine}{
\fontsize{9}{9}\selectfont
}
\newenvironment{proofof}[1]{{\bf Proof of #1.  }}{\hfill$\Box$}

\newcommand{\csizenineplus}{
\fontsize{9.5}{9.5}\selectfont
}

\newcommand{\csizeten}{
\fontsize{10}{10}\selectfont
}

\newcommand{\tabsize}{
\fontsize{7}{7}\selectfont
}

\newcommand{\cA}{{\mathcal{A}}}
\newcommand{\cB}{{\mathcal{B}}}
\newcommand{\cC}{{\mathcal{C}}}
\newcommand{\cD}{{\mathcal{D}}}
\newcommand{\cG}{{\mathcal{G}}}
\newcommand{\cI}{{\mathcal{I}}}
\newcommand{\cN}{{\mathcal{N}}}
\newcommand{\cM}{{\mathcal{M}}}
\newcommand{\cO}{{\mathcal{O}}}
\newcommand{\cP}{{\mathcal{P}}}
\newcommand{\cQ}{{\mathcal{Q}}}
\newcommand{\bP}{{\mathbf{P}}}
\newcommand{\cR}{{\mathcal{R}}}
\newcommand{\cS}{{\mathcal{S}}}
\newcommand{\cH}{{\mathcal{H}}}
\newcommand{\cK}{{\mathcal{K}}}
\newcommand{\cT}{{\mathcal{T}}}
\newcommand{\cU}{{\mathcal{U}}}
\newcommand{\cV}{{\mathcal{V}}}
\newcommand{\cY}{{\mathcal{Y}}}
\newcommand{\cZ}{{\mathcal{Z}}}
\newcommand{\newsetminus}{{\!-\!}}
\newcommand{\cVmA}{{\cV\newsetminus\cA}}
\newcommand{\cX}{{\mathcal{X}}}
\newcommand{\cs}{s}
\newcommand{\cVms}{{\cV-\cs}}

\newcommand{\ba}{{\mathbf{a}}}
\newcommand{\bb}{{\mathbf{b}}}
\newcommand{\bu}{{\mathbf{u}}}
\newcommand{\bx}{{\mathbf{x}}}
\newcommand{\resid}{\cR}

\newcommand{\NP}{{\mathbf{NP}}}

% \DeclareMathOperator{\MIF}{MI} 

\newcommand{\bs}[1]{\boldsymbol{#1}}
\newcommand{\mb}[1]{\mathbf{#1}}

\newcommand{\mhk}{\cM^h_k}

\newcommand{\thmref}[1]{Theorem~\ref{#1}}
\newcommand{\tabref}[1]{Table~\ref{#1}}
\newcommand{\figref}[1]{Fig.~\ref{#1}}
\newcommand{\eqnref}[1]{Eq.~\ref{#1}}
\newcommand{\secref}[1]{Sec.~\ref{#1}}
\newcommand{\appref}[1]{Appendix~\ref{#1}}
\newcommand{\prcref}[1]{Procedure~\ref{#1}}
\newcommand{\assmref}[1]{Assumption~\ref{#1}}
\newcommand{\crlref}[1]{Corollary~\ref{#1}}
\newcommand{\algoref}[1]{Alg.~\ref{#1}}
\newcommand{\prpref}[1]{Proposition~\ref{#1}}
\newcommand{\cnjref}[1]{Conjecture~\ref{#1}}
\newcommand{\axmref}[1]{Axiom~\ref{#1}}
\newcommand{\lmaref}[1]{Lemma~\ref{#1}}

\newtheorem{lemma}{Lemma}
\newtheorem{theorem}{Theorem}
\newtheorem{corollary}[lemma]{Corollary}
\newtheorem{procedure}[lemma]{Procedure}
\newtheorem{assumption}[lemma]{Assumption}
\newtheorem{claim}[lemma]{Claim}
\newtheorem{conclusion}[lemma]{Conclusion}
\newtheorem{proposition}[lemma]{Proposition}
\newtheorem{conjecture}[lemma]{Conjecture}
\newtheorem{axiom}[lemma]{Axiom}
\newtheorem{algo}[lemma]{Algorithm}
\newtheorem{definition}{Definition}
\newtheorem{remark}{Remark}

% \definecolor{shadecolor}{gray}{0.95}
% \newcommand{\algshade}[1]{
%     \hspace*{-\fboxsep}
%     %\vspace*{-\fboxsep}
%     \colorbox{shadecolor}{
%         \parbox{\linewidth}{#1}
%     }
% }

% \newcommand{\fg}[1]{{\color{red}(FG: #1)}}
% \newcommand{\fw}[1]{{\color{brown}(FW: #1)}}
% \newcommand{\sd}[1]{{\color{purple}SD: #1}}

%% Provided macros
% \smaller: Because the class footnote size is essentially LaTeX's \small,
%           redefining \footnotesize, we provide the original \footnotesize
%           using this macro.
%           (Use only sparingly, e.g., in drawings, as it is quite small.)

%% Self-defined macros
% \newcommand{\swap}[3][-]{#3#1#2} % just an example

\title{Information Theoretic Clustering via Divergence Maximization among Clusters}

% The standard author block has changed for UAI 2023 to provide
% more space for long author lists and allow for complex affiliations
%
% All author information is automatically removed by the class for the
% anonymous submission version of your paper, so you can already add your
% information below.
%
% Add authors
\author[1,*]{Sahil Garg}
\author[1]{Mina Dalirrooyfard}
\author[1]{Anderson Schneider}
\author[1]{Yeshaya Adler}
\author[1]{Yuriy Nevmyvaka}
\author[1]{Yu Chen}
\author[1]{Fengpei Li}
\author[2]{Guillermo Cecchi}

% Add affiliations after the authors
\affil[1]{%
    Dept. of Machine Learning Research\\
    Morgan Stanley\\
    New York, New York, USA
}
\affil[2]{%
    IBM T. J. Watson Research Center\\
    Yorktown Heights, New York, USA\\
}

\affil[*]{
Corresponding Author: sahil.garg@morganstanley.com, sahil.garg.cs@gmail.com
}
  
\begin{document}
\maketitle

\begin{abstract}
Information-theoretic clustering is one of the most promising and principled approaches to finding clusters with minimal a~priori assumptions. The key criterion therein is to maximize the mutual information between the data points and their cluster labels. Such an approach, however, does not explicitly promote any type of inter-cluster behavior. We instead propose to maximize the Kullback–Leibler divergence between the underlying data distributions associated with clusters (referred to as \emph{cluster distributions}).
We show it to entail the mutual information criterion along with maximizing cross entropy between the cluster distributions. For practical efficiency, we propose to empirically estimate the objective of KL-D between clusters in its dual form leveraging deep neural nets as a dual function approximator. Remarkably, our theoretical analysis establishes that estimating the divergence measure in its dual form simplifies the problem of clustering to one of optimally finding \emph{$k-1$ cut points for $k$ clusters in the 1-D dual functional space}. Overall, our approach enables linear-time clustering algorithms with theoretical guarantees of near-optimality, owing to the submodularity of the objective. We show the empirical superiority of our approach w.r.t. current state-of-the-art methods on the challenging task of clustering noisy timeseries as observed in domains such as neuroscience, healthcare, financial markets, spatio-temporal environmental dynamics, etc.
\end{abstract}
% 
\section{Introduction}
\label{sec:introduction}
% 
Clustering~\citep{jain2010data} is one of the most fundamental problems in machine learning. It is particularly challenging in domains such as neuroscience, healthcare, and finance, where data often consists of noisy timeseries of significant length. In neuroscience, the functional structure of neuronal ensemble activity is key to understanding how brain regions interact with each other~\citep{siegle2021survey,de2020large}. In finance, systematically grouping assets that change in value together plays a critical role in portfolio optimization~\citep{ClusterPortOptim}.
% 
\begin{figure}[t!]
\centering
\includegraphics[width=0.8\columnwidth]{example_clustering.eps}
\caption{A high level toy example of our approach. On the l.h.s., we show 6 points in the original 3-dimensional space. These data points are represented in the 1-D dual space such that KL-divergence can be computed between any two subsets of data points~(clusters) as a deterministic function of the representations. With the objective of maximizing the KL-divergence between clusters, we find clusters simply by greedy search of cut points in the dual space.}
\label{fig:dv_repr}
\end{figure}
% 
Clustering algorithms are typically employed for exploratory analysis. Therefore, they should ideally be sufficiently flexible and make minimal assumptions on the data \citep{ver2014demystifying}. They should not require that ``prototypical'' clusters be specified \citep{bohm2006robust}, nor explicitly define notions of similarity between data points \citep{slonim2005information}. With these considerations, one of the most promising and principled approaches is \emph{information theoretic clustering} introduced by \cite{gokcay2002information}. Owing to its theoretical appeal, it has been studied extensively~\citep{sugar2003finding,still2004many,banerjee2005clustering,ver2014demystifying,cicalese2019new}, primarily in the form where the objective is to maximize mutual information~(\textbf{MI}) between data points and their cluster labels.
% 
Despite the popularity of this approach, it is noteworthy that such an objective characterizes only intra-cluster properties, i.e. minimizing entropy~(variance) within each cluster, while inter-cluster properties are only implied from the former.

As our \emph{first contribution}, we argue that a fundamental criterion for clustering is that the distributions implied from any two clusters~(of the same support) should have minimal overlap with each other, as quantified by the Kullback–Leibler Divergence~(\emph{KL-D})~\citep{cover1999elements}.
% 
Such a criterion is general enough to satisfy the desiderata of a ``good'' clustering solution, while simultaneously stating, explicitly, required inter-cluster behavior.
% 
Additional constraints, such as properties of data distributions within a cluster~(e.g. low entropy of distribution) or continuity of manifold, are problem-specific,  potentially impractical, and can be implied from the primary criterion itself.
    
While information theoretic clustering is theoretically appealing, it is nontrivial to estimate the required functions~(including mutual information), and often intractable to optimize them w.r.t. cluster labels. Various information theoretic clustering~(ITC) models have been explored in practice, including those based on $k$-nearest neighbors~\citep{faivishevsky2010nonparametric}, minimal spanning trees~\citep{muller2012information}, kernel functions, eigen-decomposition, max-$k$-cut, etc.~\citep{davis2006differential,he2015information,bohm2006robust,singhinformation,wang2011information,sugiyama2014information}. Yet, assuming a specific model of the data counteracts the theoretical appeal of the framework of being \emph{model agnostic}. Further, these prior works rely on traditional (non-parametric) models which may be unsuitable for noisy, high-dimensional data which often arise in practice.
    
In light of the above, we \emph{propose to estimate the divergence measure in its dual form} by \cite{donsker1983asymptotic}, employing deep neural nets as the dual function approximators~\citep{belghazi2018mutual}. Estimating divergence in its dual form circumvents the need to learn or characterize the cluster distributions and it suffices to have samples from those cluster distributions - i.e. data points belonging to their respective clusters. Moreover, neural networks lend expressiveness as universal approximators, and capacity to operate in high dimensional (noisy) settings.
    
Through \emph{theoretical analysis} of the dual form of the proposed objective, we establish that clusters are \emph{optimally contiguous in 1-D dual functional space}. This theoretical result is highly valuable, not only for its interpretability, but also because it simplifies the combinatorial problem of searching for optimal cluster labels to finding cut points in the 1-D dual space. Consequently, $k$ clusters can be near-optimally identified by a \emph{greedy search of $k-1$ cut points in the dual space}.

\paragraph{Contributions}
We make the following contributions to the information theoretic clustering literature:
% 
(i) While MI is known as the most principled objective for clustering from an information theoretic perspective, we show that the objective of KL-D is superior, entailing the former, and we advocate for it as the new fundamental criterion for optimization of cluster labels, as well as evaluation.
% 
(ii) Our theoretical analysis establishes that clusters are optimally contiguous in the dual function space of the objective thus simplifying the combinatorial optimization of cluster labels to one of finding cut points in the 1-D dual space. 
% 
% This theoretical result is significant as it paves the way for invention of algorithms for finding cut points in the dual space.
% 
(iii) Owing to the submodularity of the proposed objective, we propose nearly-optimal greedy algorithms for finding $k-1$ cut points to obtain $k$ clusters.
% 
(iv) We evaluate our approach for clustering noisy timeseries observed in domains like healthcare, finance, environmental dynamics, etc., along with a synthetic dataset, and demonstrate its competitiveness with other information theoretic, traditional, and advanced deep learning methods for clustering.
% 
(v) Codebase at \href{https://github.com/morganstanley/MSML/tree/main/papers/Clustering_via_Dual_Divergence_Maximization}{github.com/morganstanley/MSML/tree/main/papers}.
            
\paragraph{Other Related Works}
Deep learning has been extensively applied to the problem of clustering \citep{min2018survey}. A common theme is to apply traditional clustering algorithms (e.g. K-Means, spectral, Gaussian mixture, subspace, nearest neighbors matching, etc.) in the latent representation space of an autoencoder~\citep{ji2017deep,law2017deep,madiraju2018deep,ma2019estimating,yang2019deep,bo2020structural,dang2021nearest}. Another paradigm for deep clustering is to minimize the KL-D between an auxiliary target distribution and the posterior distribution of the data represented by their cluster labels~\citep{xie2016unsupervised}. Aside from information theoretic clustering, information theory has been explored for representation learning including for the task of deep clustering such as based on variational autoencoders~\citep{chen2016infogan,hu2017learning,dit:yang2019deep,dit:yang2020clustering,yang2022learning, dit:ntelemis2021information, dit:ahmadi2022deep}. However, these works differ from the niche field of so-called ``information theoretic clustering'', as their primary criteria to cluster are not information theoretic. A KL-Divergence objective was previously considered with the highly restrictive assumption of clusters being Gaussian distributed~\citep{das2015kl}; while \cite{dhillon2003divisive} used the KL-D between two words for the problem of clustering words, but not as a measure of divergence between clusters.
        
\section{Our Approach}
\label{sec:main_sec}
Next, we discuss our approach of information theoretic clustering by maximizing KL-D between cluster distributions.
         
The standard objective in the information theoretic clustering literature is to maximize mutual information~(\emph{MI}) between data points~($\mathbf{X}$) and cluster labels~($Y$).
% 
\begin{align}
I(\mb{X} : Y) = \mathbb{E}_{(\mathbf{x}, y) \sim (\bs{\cX}, \cY)} 
\left[
\log \frac{P(\mb{x}, y)}{P(\mb{x})P(y)}
\right]
\nonumber
\end{align}
% 
\noindent Here, $\mb{\cX}$ is the high dimensional data distribution, and $\cY$ is the distribution of cluster labels; $\mb{X}$ and $Y$ are the respective random variables. $I(\mb{X}:Y)$ is the MI function, a fundamental measure of dependence. Typically, the MI function is expressed in terms of the conditional entropy function:
% 
\begin{align}
\label{eqn:MI}
 \arg\!\max_{Y} I(\mb{X}:Y)
& =  \arg\!\max_{Y} H(\mb{X}) - H(\mb{X} |\ Y)
\end{align}
% 
Since entropy of $\mb{X}$, $H(\mb{X})$, is a constant, the problem of maximizing MI is tantamount to minimizing conditional entropy of data points $\mb{X}$ given the clusters $Y$. This objective accounts for intra-cluster characteristics explicitly and \emph{inter-cluster characteristics are only implied} from the former. To further elucidate, we consider a two-cluster problem with cluster labels $Y=0$ and $Y=1$. Let $\mb{X}_{y0}$ and $\mb{X}_{y1}$ denote conditional random variables given the cluster labels, and corresponding cluster distributions, $\mb{\cX}_{y0}$ and $\mb{\cX}_{y1}$, both with support $\mb{\cX}$. We note that the conditional entropy explicitly decomposes into intra-cluster entropy terms.
% 
\begin{align}
\arg\!\min_{Y} H(\mb{X} |\ Y)
\nonumber
= 
\arg\!\min_{Y}
P_{y0} H(\mb{X}_{y0}) + P_{y1} H(\mb{X}_{y1})
\nonumber
\end{align}
% 
Here, $P_{y0}$ is shorthand for $P(Y=0)$. Similarly, $P_{y1}$ denotes $P(Y=1)$. Thus, maximizing the mutual information is equivalent to minimizing entropy of both the cluster distributions.
% 
Considering the limitations of the MI criterion in capturing inter-cluster characteristics, we instead propose an objective of maximizing KL-D between the cluster distributions, $\mb{\cX}_{y0}$ and $\mb{\cX}_{y1}$, as below.
% 
\begin{align}
% 
&
\arg\!\max_{Y}
D(\mb{\cX}_{y0} \| \mb{\cX}_{y1}) 
+ 
D(\mb{\cX}_{y1} \| \mb{\cX}_{y0})
% 
\\
&
=
\arg\!\min_{Y}
\underbrace{
H(\mb{X}_{y0})
\!+\! 
H(\mb{X}_{y1})
}_{\text{intra-cluster}}
\!-\!\underbrace{ 
H_{\mb{X}_{y0}}(\mb{X}_{y1}) 
\!-\! 
H_{\mb{X}_{y1}}(\mb{X}_{y0})
}_{\text{inter-cluster}}
\nonumber
\end{align}
% 
Clearly, the proposed objective minimizes entropy of cluster distributions, while maximizing cross entropy between the distributions, i.e. minimizing their overlap. For the problem of $k$ clusters, a variety of simple extensions are applicable, which we discuss later in this section. It is interesting to note that cross entropy is a pure and fundamental (directed) measure of non-overlap between two distributions whereas KL-D also accounts for entropy of one of the distributions itself. This insight is important in establishing that the proposed KL-D objective \emph{entails} the MI function.
% 
\begin{theorem}
\label{thm:1-KLD-MI}
% 
Let $\mb{X}_{y0}$ and $\mb{X}_{y1}$ be the conditional random variables associated with the conditional distributions of data points, $\mb{\cX}_{y0}$ and $\mb{\cX}_{y1}$, given cluster labels $Y\!=\!0$ and $Y\!=\!1$ respectively.
% 
Optimizing the two clusters such that KL-Divergence between the two distributions is maximized,
% 
\begin{align}
&
{\arg\!\max}_{Y} \   
P_{y0} D(\mb{\cX}_{y0} \| \mb{\cX}_{y1}) 
+ 
P_{y1} D(\mb{\cX}_{y1} \| \mb{\cX}_{y0}),
\nonumber
\\
& 
\text{is equivalent to},
\nonumber
\\
&
{\arg\!\max}_{Y} I(\mb{X} : Y) \ 
+ 
P_{y1} H_{\mb{X}_{y1}}(\mb{X}_{y0}) 
+
P_{y0} H_{\mb{X}_{y0}}(\mb{X}_{y1}),
\nonumber 
\end{align}
%
where, $I(\mb{X}:Y)$ is mutual information function, and $H_{\mb{X}_{y1}}(\mb{X}_{y0})$ and $H_{\mb{X}_{y0}}(\mb{X}_{y1})$ are cross entropy functions.
\end{theorem}
% 
Note, the above theoretic result depends upon $P_{y0}$ and $P_{y1}$ for establishing the equivalence. In practice, assuming a prior of clusters of equal sizes, we propose to simply maximize the objective, $D(\mb{\cX}_{y0} \| \mb{\cX}_{y1}) + D(\mb{\cX}_{y1} \| \mb{\cX}_{y0})$.
            
Although the objective of maximizing KL-D between cluster distributions is fundamental and intuitive, doing so w.r.t. cluster labels and input samples is not straightforward. To understand this challenge, we present KL-D between the two cluster distributions in the standard expression below.
% 
\begin{align}
D(\mb{\cX}_{y0} \| \mb{\cX}_{y1})
& 
=
\mathbb{E}_{\mb{X} \sim \mb{\cX}_{y0}}
\log 
\frac{P(\mb{X} | Y=0)}{P( \mb{X} | Y=1)}
\end{align}
% 
To estimate the KL-D objective from the above expression, one needs to obtain the conditional densities, $P(\mb{X}|Y=0)$ and $P(\mb{X}|Y=1)$, for the respective (unknown) cluster distributions $\mb{\cX}_{y0}$ and $\mb{\cX}_{y1}$, even if the expectation is computed empirically from the data in cluster $Y=0$ as the empirical realization of $\mb{\cX}_{y0}$. 
% 
We do not have these densities available, and it may be impossible to learn the densities from samples coming from the clusters, due to limited data, small clusters, high dimensionality, or noise that is prevalent in neural or financial timeseries data, etc. There are also practical challenges in estimating the KL-Divergence objective above, such as the function being unbounded in its value, variance of the empirical estimate, compute cost, vulnerability of nonparametric kNN-based KL-D estimators to noise, etc. One practical solution, which we propose next, is to empirically estimate the KL-D objective in its dual form by \cite{donsker1983asymptotic}, leveraging deep learning as the dual function approximator.
    
We argue that estimating the KL-D function in its dual form, as shown below, is particularly suitable for its use as the clustering objective.
% 
\begin{align}
D(\mb{\cX}_{y0} \| \mb{\cX}_{y1}) 
=
\max_{f(.) \in L^{\infty}(\mb{\cX})}
\mathbb{E}_{\mb{\cX}_{y0}}
f(\mb{x})
-
\log 
\mathbb{E}_{\mb{\cX}_{y1}}
e^{f(\mathbf{x})}
% \label{eqn:dv_repr}
\nonumber
\end{align}
% 
Here, $f: \mb{\cX} \to \mathbb{R}$, referred to as the \emph{dual function}, is any function from the space of locally $\infty$-integrable functions such that expectations in the expression are finite.
% 
To estimate the dual form of KL-D, we only need samples from cluster distributions and not actual density functions. A cluster distribution's existence is only implied by data points in clusters of the same support. Thus, a cluster is the optimal empirical realization of the cluster distribution, and both expectations in the dual form are empirically computable from the respective clusters only. 
% 
\begin{align}
\hat{D}(\mb{X}_{y0} \| \mb{X}_{y1})
=
\max_{\hat{f}(.)\in \cH}
\sum_{\mathbf{x}_{y0} \in \mb{X}_{y0}}
\frac{\hat{f}(\mb{x}_{y0})}{n_{y0}}
-
\log
\sum_{\mathbf{x}_{y1} \in \mb{X}_{y1}}
\frac{e^{\hat{f}(\mb{x}_{y1})}}
{n_{y1}}
\nonumber
% \label{eqn:dv_repr_emp}
\end{align}
% 
Here, $\hat{D}(\mb{X}_{y0} \| \mb{X}_{y1})$ is an empirical estimate of $D(\mb{\cX}_{y0} \| \mb{\cX}_{y1})$ from clusters, $\mb{X}_{y0} = \{\mb{x}_i:y_i =0 \}_{i=1}^n$ and $\mb{X}_{y1} = \{\mb{x}_i:y_i =1 \}_{i=1}^n$; $n_{y0}$ and $n_{y1}$ are the respective cluster sizes. As mentioned above, since the cluster distributions, $\mb{\cX}_{y0}$ and $\mb{\cX}_{y1}$, are themselves defined from their respective clusters, $\mb{X}_{y0}$ and $\mb{X}_{y1}$, it is only $\hat{D}(\mb{X}_{y0} \| \mb{X}_{y1})$ that is of interest as the clustering objective whereas $D(\mb{\cX}_{y0} \| \mb{\cX}_{y1})$ is notional. Correspondingly, $\hat{f}(.)$ is the dual function for estimating $\hat{D}(\mb{X}_{y0} \| \mb{X}_{y1})$, where the maximization is over a fixed class of functions $\cH$.
                
As proposed in \cite{belghazi2018mutual}, $f(.)$ can be a deep neural net function; for instance, neural timeseries models like LSTMs, RNNs, Transformers, TCNs, NBeats, etc.~\citep{bai2018empirical,oreshkin2019n,kitaev2019reformer,benidis2020neural,zeng2021topological,fan2021depts,gu2021efficiently,challu2022n}, are all relevant for clustering of time series in this framework. To learn a stable neural dual function and avoid high variance in estimating the objective~\citep{song2019understanding}, practical tricks, such as early stopping, large batch size, low learning rate, etc., are well known.
% 
Furthermore, our goal is not to estimate the divergence measure exactly, but to find assignments that maximize divergence across clusters. It is the relative estimate of divergence across different sets of cluster assignments that matters. Besides, to avoid potential numerical instability from the $\log$ \emph{sum} $\exp$ function~(smooth max), we propose a practical trick of using the $\max$ function as a known approximation of the former~\citep{boyd2004convex}.
% 
\begin{align}
&
\hat{D}(\mb{X}_{y0} \| \mathbf{X}_{y1}) 
\nonumber
\\
&
\approx
\max_{\hat{f}(.)}
\sum_{\mb{x}_{y0} \in \mb{X}_{y0}}
\frac{\hat{f}(\mathbf{x}_{y0})}{n_{y0}}
-
\max_{\mb{x}_{y1} \in \mb{X}_{y1}}
\hat{f}(\mathbf{x}_{y1})
+ \log(n_{y1})
\nonumber
% \label{eq:obj-approx}
\end{align}
% 
Here, note that the $\max$ function is not sensitive to outliers in cluster $\mb{X}_{y1}$ since that term is being minimized. 
% 
This not only stabilizes the optimization, but gives a nice interpretation for the expression in the context of clustering. From a deep learning perspective, the $\max$ function is essentially a \emph{$\max$ pooling} operation, over the dual function outputs of the data points in $\mb{X}_{y1}$.
    
\begin{figure}
\centering
\includegraphics[width=0.8\columnwidth]{alg_2_clustering2.eps}
\caption{2-clustering algorithm. First we optimize $f_{{y0}\to {y1}}$ and $f_{{y1}\to {y0}}$, next we optimize clusters $X_{y0}$ and $X_{y1}$. Note that in the second step we only consider clusters $X_{y0}$ and $X_{y1}$ such that their representations are contiguous in one of $f_{{y0}\to {y1}}$ or $f_{{y1}\to {y0}}$. One example is shown for each case.}
\label{fig:2clustering}
\end{figure}
    
The overall expression for optimizing cluster labels is:
\begin{align}
&
\arg\!\max_{\mathbf{y}}
\log(n_{y0} n_{y1})
\label{eqn:full_opt_log_sum_exp}
\\
&
+ \max_{f_{y0 \to y1}}
\sum_{\mb{x}_{y0} \in \mb{X}_{y0}}
\frac{f_{y0 \to y1}(\mb{x}_{y0})}{n_{y0}}
\!-
\log 
\sum_{\mathbf{x}_{y1} \in \mb{X}_{y1}}
e^{f_{y0 \to y1}(\mathbf{x}_{y1})}
\nonumber
\\
& 
+
\max_{f_{y1 \to y0}}
\sum_{\mb{x}_{y1} \in \mb{X}_{y1}}
\frac{f_{y1 \to y0}(\mb{x}_{y1})}{n_{y1}}
-
\log 
\sum_{\mb{x}_{y0} \in \mb{X}_{y0}}
e^{f_{y1 \to y0}(\mathbf{x}_{y0})} 
\nonumber
\end{align}
% 
Here, $f_{y0 \to y1}(.)$ and $f_{y1 \to y0}(.)$ are the dual functions corresponding to estimating KL-D in both directions, $\hat{D}(\mathbf{X}_{y0} \| \mathbf{X}_{y1})$ and $\hat{D}(\mathbf{X}_{y1} \| \mathbf{X}_{y0})$. Note, $\log(n_{y0} n_{y1})$ naturally encourages balanced clusters. Next, we establish that the optimization in \eqnref{eqn:full_opt_log_sum_exp} has a solution which uniquely recovers the two clusters.
% 
\begin{theorem}
\label{thm:convergence}
The optimal solution for the objective in \eqnref{eqn:full_opt_log_sum_exp} exists when $f$ (for both $f_{y0 \rightarrow y1}$ and $f_{y1 \rightarrow y0}$) is continuous and takes values in $[a,b]$ for some $a\leq b$. Moreover, if $f$ is also $L$-Lipschitz, then for two clusters where the distance between clusters (defined as the minimum distance between points in separate clusters) is more than the distance within a cluster (defined as the maximum distance between points in the same cluster), there exists some Lipschitz constant $L$ for which the optimal solution in \eqnref{eqn:full_opt_log_sum_exp} uniquely recovers the clusters.
\end{theorem}
    
\paragraph{Clusters are optimally contiguous in the dual space} Our \emph{key observation} about the optimization problem~(\eqnref{eqn:full_opt_log_sum_exp}) which enables highly efficient and near-optimal algorithms for clustering, is that clusters are \emph{optimally contiguous} in the space of dual functions~($f_{y0 \to y1}$ and $f_{y1 \to y0}$) i.e. \emph{dual space}, as we theoretically prove in the following. This simplifies the combinatorial-optimization of finding clusters to that of finding a cut point in the dual space. \thmref{thm:2-cluster-optimality} proves the contiguity of two clusters in the dual space.
% 
\begin{theorem}
\label{thm:2-cluster-optimality}
Consider a dual function $\hat{f}(.)$ and the associated representation of data points in the dual space, $\{ \hat{f}(\mathbf{x}_i) \}_{i=1}^n$. Suppose the KL-D estimate between clusters $\mathbf{X}_{y0} = \{ \mathbf{x}_i: y_i=0 \}_{i=1}^n$ and $\mathbf{X}_{y1} = \{ \mathbf{x}_i: y_i=1 \}_{i=1}^n$ is
% 
$$
\hat{D}_{\hat{f}}(\mb{X}_{y0} \| \mb{X}_{y1})
=
\!\!\!
\sum_{\mathbf{x}_{y0} \in \mb{X}_{y0}}
\!\!\!
\frac{\hat{f}(\mb{x}_{y0})}{n_{y0}}
-
\log\!\!\!
\sum_{\mathbf{x}_{y1} \in \mb{X}_{y1}}
\!\!\!
e^{\hat{f}(\mb{x}_{y1})}
+
\log(n_{y1}).
% \nonumber
$$
% 
Then the clusters that maximize $\hat{D}_{\hat{f}}(\mb{X}_{y0}\| \mb{X}_{y1})$, i.e.
% 
$$
\arg\!\max_{\mathbf{y}} \hat{D}_{\hat{f}}(\mb{X}_{y0}\|\mb{X}_{y1}),
$$
% 
are contiguous in the dual space: there is a cut point $c$ such that $\mathbf{x}_i \in \mb{X}_{y0}$ if $\hat{f}(\mathbf{x}_i)\ge c$ and $\mathbf{x}_i \in \mb{X}_{y1}$ if $\hat{f}(\mathbf{x}_i) < c$.
\end{theorem}
% 
\paragraph{Cut Point Algorithm for Clustering}
This theoretical result on the contiguity of clusters in the dual space naturally leads to a cut-point based clustering algorithm as illustrated in \figref{fig:2clustering}. Here, we consider KL-D in both directions. We start with random clusters $\mb{X}_{y0}$ and $\mb{X}_{y1}$ and optimize $f_{y0 \to y1}(.)$ and $f_{y1 \to y0}(.)$ with respect to these clusters. Then, in order to optimize clusters $\mb{X}_{y0}$ and $\mb{X}_{y1}$, we only consider cluster pairs $(\mb{X}_{y0},\mb{X}_{y1})$ such that both $\mb{X}_{y0}$ and $\mb{X}_{y1}$ are ``contiguous'' in one of the representations defined by dual functions, $f_{y0 \to y1}(.)$ and $f_{y1 \to y0}(.)$. 
    
\begin{figure}
\centering
\includegraphics[width=0.8\columnwidth]{kcluster1-2.eps}
\caption{\emph{Greedy bisection of the highest entropy cluster}. First we find two clusters using our 2-clustering algorithm; then we take the cluster with the highest entropy (here, cluster 2), update $f_{y0 \to y1}$ and $f_{y1 \to y0}$, and split cluster 2 using our 2-clustering algorithm. Note that we are not showing cluster 1 in the updated $f_{y0 \to y1}$ representation.}
\label{fig:kcluster1}
\end{figure}
% 
More formally, we first consider the one dimensional space defined by $f_{y0 \to y1}(.)$. We sort the indices of the corresponding data points with respect to the values of the function $f_{y0 \to y1}(.)$. For each pair of clusters defined by a cut point $i$, we evaluate the divergence objective.
% 
For a given cut point, the objective is computed from mean and (smooth) max statistics; thus, its values for all the cut points can be computed iteratively in $f_{y0 \to y1}$, with linear time compute complexity.
% 
We do the same for the one dimensional space defined by $f_{y1 \to y0}$ and output the cluster pair that maximizes the divergence from either of the two dimensions. 
    % 
We continue optimizing for a fixed number of iterations~(100) or until convergence of the cluster labels. 
    
When employing a DNN as a dual function, the step of optimizing the dual functions given cluster labels as shown in \figref{fig:2clustering} is a single iteration of updating weights of the corresponding two DNNs via backpropagation, rather than retraining from scratch for a change in cluster labels. Both updating the dual function for a change in cluster labels, and optimizing cluster labels by finding cut points in the re-optimized dual functional spaces, are highly efficient. Moreover, for the first few \emph{warmup} iterations~(10), we only update weights of the neural estimators and not the cluster labels. 
% 
In our experiments, it takes only a few seconds to run the entire procedure to obtain clusters in a dataset of a few thousand timeseries.
         
For the \emph{k-clusters} problem, we want to maximize divergence between each pair of clusters, or maximize divergence of each cluster w.r.t. the rest. While in theory, there should be a different estimator for each pair of clusters, it suffices in practice to produce two estimates $\hat{D}(\mb{X}_{yi} \| \mb{X}_{yj})$ and $\hat{D}(\mb{X}_{yj} \| \mb{X}_{yi})$ respectively from $\hat{f}_{y0 \to y1}$ and $\hat{f}_{y1 \to y0}$, as in the two-cluster problem. For training any of the two estimators, in each batch update, two out of $k$ clusters are randomly sampled, for which the estimator learns to maximize the estimate of KL-D. To optimize cluster labels, we propose a greedy search for $k-1$ cut points in the dual space, in which various variants of the KL-D objective are applicable. Next, we establish contiguity of $k$ clusters in the dual space for one such objective.
% 
\begin{theorem}
\label{thm:k-cluster-contiguous}
Consider a dual function $\hat{f}(.)$ and the associated representation of input points in the dual space, $\{ \hat{f}(\mathbf{x}_i) \}_{i=1}^n$. Let $\mb{X}_{y1}^*=\{ \mathbf{x}_i : y_i^*=1 \}_{i=1}^n, \ldots, \mb{X}_{yk}^* = \{ \mathbf{x}_i: y_i^*=k \}_{i=1}^n$ be the clusters that maximize the objective,
% 
$$
\arg\!\max_{\mb{y}}
\sum_{i=1}^{k-1} \hat{D}_{\hat{f}}(\mathbf{X}_{y>i} \| \mathbf{X}_{yi});
$$
%
$$
% 
\hat{D}_{\hat{f}}(\mb{X}_{y>i} \| \mb{X}_{yi}) = 
\mathbb{E}_{\mathbf{x} \in \mb{X}_{y>i}}
\hat{f}(\mathbf{x})
-
\log
\mathbb{E}_{\mathbf{x}_{yi} \in \mb{X}_{yi}}
e^{\hat{f}(\mathbf{x}_{yi})},
%
$$
% 
where $\mb{X}_{y>i}$ is the set of all data points which lie on the r.h.s. of $\mb{X}_{yi}$ in the dual space.
% 
Then $\mathbf{X}_{y1}^*, \ldots, \mathbf{X}_{yk}^*$ form contiguous clusters in the dual space.
\end{theorem}
% 
We now propose two intuitive, greedy algorithms for finding $k-1$ cut points in dual space.
     
\begin{figure}
\centering
\includegraphics[width=0.8\columnwidth]{kclustering2.eps}
\caption{\emph{Greedy cuts.} For a randomly initialized dual function, $f(.)$, data points are represented in the dual space, $\{ f(\mathbf{x}_i)\}_{i=1}^n$, and sorted accordingly. The $k-1$ cut points are searched greedily to obtain $k$ clusters, with each data point being a candidate as a cut point. Cut points and the dual function are updated iteratively.}
\label{fig:kcluster2}
\end{figure}
    
\paragraph{Greedy bisection of the highest-entropy cluster}
First, as illustrated in \figref{fig:kcluster1}, we use a recursive bisection approach. In each greedy iteration we pick a cluster with the highest entropy~(cluster size being a good proxy for it if cluster sizes are non-uniform), and bisect it using the two-clusters algorithm~(\figref{fig:2clustering}). As described previously, the two dual functions, $\hat{f}_{y0 \to y1}$ and $\hat{f}_{y1 \to y0}$, are updated considering all the clusters learned so far, maximizing the estimate of KL-D for each pair of clusters~(old or new) in a batch update. We evaluate this algorithm extensively in our experiments.
            
\paragraph{Greedy cuts} Another approach, as illustrated in \figref{fig:kcluster2}, is to find $k-1$ greedy cuts in the dual space. Updating the dual function and the cuts is done iteratively. While the algorithm is generally applicable for many possible objectives based on KL-D between clusters, we prove theoretical guarantees for one such objective owing to its submodularity.
% 
\begin{theorem}
\label{thm:k-cluster-optimality}
Consider a dual function $\hat{f}(.)$ and the associated representation of input data points in the dual space, $\{ \hat{f}(\mathbf{x}_i) \}_{i=1}^n$. Let $OPT$ be the optimal value of the objective,
$$
\arg\!\max_{\mb{y}}
\sum_{i=1}^{k-1} \hat{D}_{\hat{f}}(\mathbf{X}_{y>i} \| \mathbf{X}_{yi});
$$
$$
\hat{D}_{\hat{f}}(\mb{X}_{y>i} \| \mb{X}_{yi}) 
= 
\mathbb{E}_{\mathbf{x} \in \mb{X}_{y>i}}
\hat{f}(\mathbf{x})
-
\log
\mathbb{E}_{\mathbf{x}_{yi} \in \mb{X}_{yi}}
e^{\hat{f}(\mathbf{x}_{yi})},
% 
$$
% 
where $\mathbf{X}_{y>i}$ is the set of all data points which lie on the r.h.s. of $\mathbf{X}_{yi}$ in the dual space. Optimizing $k-1$ cuts greedily in the dual space finds clusters $\mathbf{X}_{y1}^*, \ldots, \mathbf{X}_{yk}^*$ such that,
% 
\begin{equation}
\sum_{i=1}^{k-1} \hat{D}_{\hat{f}}(\mathbf{X}_{y>i}^* \| \mathbf{X}_{yi}^*) \ge \frac{e-1}{e} OPT.
\nonumber
\end{equation}
% 
\end{theorem}
% 
In practice, there is room to explore various algorithms for finding the cut points, while the greedy algorithms proposed above enjoy theoretical guarantees.
    
\section{Empirical Evaluation}
\label{sec:experiments}
% 
One of the best-motivated applications of (especially information theoretic) clustering algorithms is clustering (noisy) timeseries in domains such as neuroscience, healthcare, finance, environmental dynamics, etc. For instance, in neuroscience, it is of substantial interest to find a subset of neurons in which neural activity exhibits high dependence (MI) w.r.t. each other. 
    
\paragraph{Datasets}
We evaluate our approach on the following timeseries datasets: (i) electrophysiological Neuropixels, (ii) US stock returns, (iii) EEG, (iv) ECG, (v) Rain, (vi) Wind, (vii) Pollution, and four representative UCR datasets, (viii) UCR-Mallat, (ix) UCR-Trace, (x) UCR-Small Kitchen Appliances, (xi) UCR-ECG-Torso, and (xii) a synthetic timeseries dataset. See the supplement for more details.

\paragraph{Competitive Methods}
We compare our information theoretic clustering approach of divergence maximization~(referred to as ``\textbf{ITC-DM*}'') w.r.t. the traditional baseline models, ``KMeans'', ``Spectral'' clustering, and ``kShape'' clustering~\citep{paparrizos2015k}. 
We use two important baseline estimators of MI based ITC: (i) a kNN based nonparametric estimator, referred to as ``ITC-kNN''~\citep{faivishevsky2010nonparametric}, and (ii) a minimum spanning trees estimator, referred to as ``ITC-MST''~\citep{muller2012information}.
% 
We also evaluate various deep learning baselines: DEC~\citep{xie2016unsupervised}, NNM~\citep{dang2021nearest}, and the temporal clustering models DTC~\citep{sai2018deep} and DTCR~\citep{ma2019learning}. 
    
\begin{figure*}[ht!]
% 
\centering
% 
\includegraphics[width=1.9\columnwidth,scale=0.2]{legend.png}
% 
\subfigure[Synthetic Timeseries]{
\includegraphics[width=0.45\columnwidth,scale=0.3]{synthetic_intra_inter_mi.pdf}
\label{fig:synthetic_mi}
}
% 
\subfigure[Neuropixels]{
\includegraphics[width=0.45\columnwidth,scale=0.3]{neuro_intra_inter_mi.pdf}
\label{fig:neuropixels_mi}
}
% 
\subfigure[US Stock Returns]{
\includegraphics[width=0.45\columnwidth,scale=0.3]{russel_intra_inter_mi.pdf}
\label{fig:stocks_mi}
}
% 
\subfigure[Rain]{
\includegraphics[width=0.45\columnwidth,scale=0.3]{rain_intra_inter_mi.pdf}
\label{fig:rain_mi}
}
% 
\subfigure[Wind]{
\includegraphics[width=0.45\columnwidth,scale=0.3]{wind_intra_inter_mi.pdf}
\label{fig:wind_mi}
}
% 
\subfigure[Pollution]{
\includegraphics[width=0.45\columnwidth,scale=0.3]{pollution_intra_inter_mi.pdf}
\label{fig:pollution_mi}
}
% 
\subfigure[EEG]{
\includegraphics[width=0.45\columnwidth,scale=0.3]{eeg_intra_inter_mi.pdf}
\label{fig:eeg_mi}
}
%
\subfigure[ECG]{
\includegraphics[width=0.45\columnwidth,scale=0.3]{ecg_intra_inter_mi.pdf}
\label{fig:ecg_mi}
}
% 
\subfigure[UCR-Mallat]{
\includegraphics[width=0.45\columnwidth,scale=0.3]{ucr_mallat_intra_inter_mi.pdf}
\label{fig:ucr_mallat_mi}
}
% 
\subfigure[UCR-Trace]{
\includegraphics[width=0.45\columnwidth,scale=0.3]{ucr_trace_intra_inter_mi.pdf}
\label{fig:ucr_trace_mi}
}
% 
\subfigure[UCR-Small Kitchen Appl.]{
\includegraphics[width=0.45\columnwidth,scale=0.3]{ucr_ska_intra_inter_mi.pdf}
\label{fig:ucr_ska_mi}
}
% 
\subfigure[UCR-ECG-Torso]{
\includegraphics[width=0.45\columnwidth,scale=0.3]{ucr_ecg_torso_intra_inter_mi.pdf}
\label{fig:ucr_ecg_torso_mi}
}
% 
\caption{Evaluating clusters in terms of pairwise mutual information between timeseries within clusters~(intra-cluster $\uparrow$) and across clusters~(inter-cluster $\downarrow$). The proposed method is ITC-DM shown in solid red circles.}
\label{fig:expr_mi}
\end{figure*}
    
\paragraph{Hyperparameters Selection}
Our task is to obtain the best possible clusters within an input dataset in an unsupervised setting. The deep learning optimization of estimating and maximizing KL-D w.r.t. cluster labels is unique to every single input of a dataset. Hyperparameters can be chosen independently for a given input dataset by maximizing the proposed objective itself. We consider it valuable if some hyperparameter choices perform well across all the datasets, to avoid the overhead of tuning as discussed next. Across all 12 datasets, we use the entire input dataset rather than batch sampling. This is aligned with previous works on dual divergence estimation~\citep{belghazi2018mutual,song2019understanding}, which suggest using a large batch size to avoid high variance. We chose LSTMs with one hidden layer of 32 units with a learning rate of 1e-1, and weight parameters initialized with std of 0.1. We perform 100 iterations in the greedy algorithm, with 10 warmup iterations that update the dual function but do not optimize the cut points~(cluster labels). We use the greedy bisection algorithm for the primary analysis~(\figref{fig:kcluster1}). These choices were made via preliminary clustering analysis on a stock price dataset independently of the datasets in this paper. In \secref{sec:ablation}, we present an extensive ablation study on the Neuropixels dataset, varying each hyperparameter from the above defaults. For the baseline clustering methods, we follow the respective strategies for selecting the hyperparameters as described in their papers or codebases. 
            
\subsection{Evaluation Results}
Next, we present our extensive empirical results on many real world datasets along with a synthetic timeseries dataset.
    
\subsubsection{How to evaluate clusters of timeseries?}
% 
As the science of clustering objectives and algorithms advances in consideration of challenges presented by high dimensional noisy datasets in modern times, we must further the science of evaluation metrics as well. Next, we discuss two evaluation metrics which we deem to be the most appropriate in terms of being fundamental, robust to noise, and reliably estimable.

\paragraph{Pairwise mutual information between timeseries}
We propose to evaluate clusters in terms of pairwise MI between timeseries within and across clusters. This is independent of the clustering objective and simple to compute. We treat timeseries observations as I.I.D. samples from a univariate random variable ignoring the temporal correlations; we employ a kNN based estimator~($k=3$). Cluster level statistics of intra- and inter-cluster MI are obtained from the pairwise MI function by taking averages and normalizing them using the respective cluster sizes. Note, this MI function is \emph{not to be confused} with the clustering criterion of mutual information between high dimensional data points~(timeseries) and cluster labels.
    
\paragraph{KL-divergence metric}
In addition, the KL-Divergence objective can itself serve as an evaluation metric. For every pair of clusters, we evaluate empirical KL-D between their respective cluster distributions. We are only interested in the relative values of KL-D between and across clusters obtained from all the methods. For estimating the KL-D criterion as a metric, we ensure that it is estimated independently of its estimation as the objective. We employ Transformers with 10 attention heads and feedforward dimension of 32 to estimate the KL-D metric; we use learning rate of 3e-4, dropout rate of 0.2, and 200 iterations of weight updates or until convergence.
      
\paragraph{On the role of domain knowledge in evaluating clusters} 
We argue that, since clustering is purely an unsupervised problem for exploratory analysis, it is not apt to consider class labels or any domain knowledge gleaned from supervised tasks as a proxy for cluster labels. Class labels can only serve as yet another human annotation of cluster labels, and not as the ground truth.

\subsubsection{Comparative Analysis}
First, we focus on the metric of inter- and intra-cluster MI, and present the comparative analysis of our approach and others in \figref{fig:expr_mi}. We use the same number of clusters as the classes for the datasets where domain specific class labels are available, like the brain regions in the Neuropixels dataset. This is designed to present a fair comparison concerning the class labels~(yet not to be considered as ground truth). For the remaining datasets~(Rain, Wind, Pollution, US Stock Returns), we tune the number of clusters across all the methods and then present results on the same number of clusters. We note that some comparison methods fail to obtain clusters despite many trials with random seeds; results for such cases are missing in the plots.
    
\begin{figure*}[ht!]
\centering
% 
\subfigure[N: Brain Regions]{
\includegraphics[width=0.3\columnwidth]{neuro_results/brain_regions_kl_div.pdf}
}
% 
\includegraphics[width=0.06\columnwidth]{neuro_results/colormap.pdf}
% 
\subfigure[N: ITC-kNN]{
\includegraphics[width=0.3\columnwidth]{neuro_results/itc_knn_kl_div.pdf}
}
% 
\subfigure[N: ITC-MST]{
\includegraphics[width=0.3\columnwidth]{neuro_results/itc_mst_kl_div.pdf}
}
% 
\subfigure[N: NNM]{
\includegraphics[width=0.3\columnwidth]{neuro_results/nnm_kl_div.pdf}
}
% 
\subfigure[N: DTCR]{
\includegraphics[width=0.3\columnwidth]{neuro_results/dtcr_kl_div.pdf}
}
% 
\subfigure[N: ITC-DM*]{
\includegraphics[width=0.3\columnwidth]{neuro_results/itc_dv_transformer_kl_div.pdf}
}
% 
\subfigure[SR: ITC-kNN]{
\includegraphics[width=0.3\columnwidth]{finance_results/itc_knn_kl_div.pdf}
}
% 
\subfigure[SR: ITC-MST]{
\includegraphics[width=0.3\columnwidth]{finance_results/itc_mst_kl_div.pdf}
}
% 
\subfigure[SR: NNM]{
\includegraphics[width=0.3\columnwidth]{finance_results/nnm_kl_div.pdf}
}
% 
\subfigure[SR: DTCR]{
\includegraphics[width=0.3\columnwidth]{finance_results/dtcr_kl_div.pdf}
}
% 
\subfigure[SR: DEC]{
\includegraphics[width=0.3\columnwidth]{finance_results/dec_kl_div.pdf}
}
% 
\subfigure[SR: ITC-DM*]{
\includegraphics[width=0.3\columnwidth]{finance_results/itc_dv_transformer_kl_div.pdf}
}
\caption{For the Neuropixels and Stock Returns datasets, corresponding to the prefixes ``N:'' and ``SR:'', KL-D values~(normalized by the maximum value) for intra-cluster~(diagonal entries) and inter-cluster~(off-diagonal entries) are shown across all the competitive methods while excluding those methods (due to space constraints) which produce clusters of very poor quality. The clusters are sorted in ascending order of the cluster size.}
\label{fig:expr_neuro}
\end{figure*}
    
In terms of achieving high intra-cluster MI but low inter-cluster MI, our method ITC-DM* performs competitively across all the datasets. In contrast, ITC-kNN, which uses the mutual information objective~(\ref{eqn:MI}), achieves high intra-cluster MI for some of the datasets~(US stock returns, EEG, ECG, UCR-Mallat) but at the expense of higher inter-cluster MI. For some of the other datasets~(Rain, Wind, Pollution, UCR-Trace, UCR-Small Kitchen Appliances), ITC-kNN finds clusters which are poor on both metrics, intra- and inter-cluster MI, in comparison to ITC-DM. ITC-MST, which is also based on the objective of mutual information, performs poorly across many datasets~(Synthetic, Neuropixels, US Stock Returns, Pollution, EEG), partly due to its reliance upon minimal spanning trees for estimating MI. 
    
Traditional methods like kMeans and Spectral clustering are ineffective due to sensitivity to noise and degrade when clusters are highly imbalanced; see the results for datasets: Synthetic, Neuropixels, US Stock Returns, Rain, Pollution, EEG, ECG, UCR-Small Kitchen Appliances. Even deep learning based approaches such as NNM, i.e. clustering via nearest neighbor matching of input data points in their deep neural representations, are vulnerable to noisy timeseries data, including Neuropixels, US Stock Returns, Rain, Wind, Pollution, EEG, ECG, UCR-Mallat, UCR-Trace. Similarly, the deep clustering method DEC exhibits unreliable performance, finding poor choices of clusters for some of the datasets, Neuropixels, Rain, Wind, EEG, and UCR-Trace. While DTCR is consistently superior to DTC~(except for the synthetic data), it also finds clusters of poor quality for Neuropixels, US Stock Returns, Wind, and UCR-Mallat. 
    
In \figref{fig:expr_mi}, it is also interesting to see that, for some datasets, ITC-DM* outperforms w.r.t. class labels as well; see the results for Neuropixels, UCR-Mallat and UCR-Trace. 
% 
In \figref{fig:neuropixels_mi}, when comparing ITC-DM* w.r.t. brain regions, ITC-DM* finds clusters of neurons with lower inter-cluster MI.
% 
This empirical result conforms to knowledge of human brains where strong dependence between neurons across brain regions implies information flow in the visual system. As a matter of fact, all the clustering methods, except for kMeans, spectral, and DTC, find clusters with inter-cluster MI lower than the brain regions. Among those, ITC-DM* finds the ones with the highest intra-cluster MI. 

\paragraph{Comparisons with KL-D metric}
Next, in \figref{fig:expr_neuro}, we compare the most competitive methods above in terms of the KL-D metric. As desired, we observe higher KL-D~($\uparrow$) between clusters from our method ITC-DM* vs the other methods. For the Neuropixels dataset, we observe high divergence between brain regions as well.
    
\begin{table}[tp!]
\centering
\csize
\renewcommand{\arraystretch}{0.85}
\renewcommand{\tabcolsep}{2.0pt}
\begin{tabular}{lllllllllllllllll}
\toprule
kMeans&Spectral&ITC-kNN&ITC-MST&DEC
% &DTC
&DTCR&NNM&\textbf{ITC-DM*}\\
\toprule
418&5902&2&15&289
% &305
&545&231&168\\
\toprule
\end{tabular}
\vspace{-2mm}
\caption{Average compute time (in seconds) for all the clustering methods on the Neuropixels dataset.}
\label{tab:compute_time}
\end{table}

\paragraph{Compute time}
In \tabref{tab:compute_time}, we observe that the compute time of ITC-DM* is competitively lower w.r.t. the neural baselines. On the other hand, ITC-kNN and ITC-MST, which rely upon nearest neighbor distances instead of neural representations, exhibit comparatively negligible compute cost. As for Spectral clustering, compute time can vary as per the Eigen spectrum.
        
\subsubsection{Ablation Study}
\label{sec:ablation}
% 
We present a detailed analysis for our approach using the Neuropixels dataset. 
    
\begin{figure}[tp!]
\centering
%
\subfigure[Vary No. of Clusters]{
\includegraphics[width=0.46\columnwidth]{neuro_results/neuro_intra_cluster_mi_wrt_num_clusters.pdf}
\label{fig:intra_num_clusters}
}
% 
\subfigure[Vary No. of Clusters]{
\includegraphics[width=0.46\columnwidth]{neuro_results/neuro_inter_cluster_mi_wrt_num_clusters.pdf}
\label{fig:inter_num_clusters}}
% 
\subfigure[Multiple Trials]{
\includegraphics[width=0.46\columnwidth]{neuro_results/neuro_intra_cluster_mi_multiple_trials.pdf}
\label{fig:intra_trials}}
% 
\subfigure[Multiple Trials]{
\includegraphics[width=0.46\columnwidth]{neuro_results/neuro_inter_cluster_mi_multiple_trials.pdf}
\label{fig:inter_trials}}
\caption{Detailed analysis of our ITC-DM model (also referred to as ``DV-LSTM'') for the Neuropixels dataset.}
\label{fig:mi_var_num_clusters}
\end{figure}
            
In \figref{fig:intra_num_clusters} and \ref{fig:inter_num_clusters}, we analyze intra- and inter-cluster MI as we increase the number of clusters from 2 to 50. We observe that the inter-cluster MI initially declines and attains a minimum at 6 clusters, and then continues to increase. Interestingly, the optimal number of clusters~(6) as indicated by the lowest inter-cluster MI is also the number of brain regions. 
% 
In \figref{fig:intra_trials} and \ref{fig:inter_trials}, we analyze clusters obtained from 100 different trials of neural activity, and correlate intra- and inter-cluster MI of the clusters w.r.t. the brain regions. Note that MI metrics vary even for fixed brain regions, since neural activity varies across all the trials.
% 
We observe that the intra- and inter-cluster MI of clusters learned by our algorithm remain close to the corresponding brain regions, and exhibit high correlations with the latter.
    
In \figref{fig:hp_ablation_neuro}, we present an ablation study varying all the hyperparameters in our model one by one. To provide relative measures, we show intra- and inter-cluster MI for all the competitive methods and ITC-DM with its default hyperparameters, as in \figref{fig:neuropixels_mi}. Starting from the default configuration, we vary each hyperparameter to observe the corresponding change in the two metrics. We vary the number of layers in the default LSTM model~(\emph{1}, 2, 3, 4, 5, 8, 10) and observe only marginal changes in the metrics. However, the number of hidden units is a sensitive hyperparameter~(8, 16, \emph{32}, 64, 128, 256, 512, 1024); large or small numbers of units degrade the performance. Std for the initialization of weight parameters is an important parameter~(0.01, 0.03, 0.05, \emph{0.1}, 0.2, 0.3, 0.5) with high variability in the results; however, the default value of 0.1 works consistently across all the experiments. Perhaps surprisingly, the learning rate~(LR) is only mildly sensitive~(1e-5 to \emph{1e-1}) with no clear pattern for whether lower or higher LR is better.
        
From varying the number of iterations in the greedy bisection algorithm~(10 to 300), we find that a minimum of 30 iterations is necessary to ensure good performance. We vary warmup iterations between two extremes from 0 to 50~(default is 10) and notice that extremely low or high values are detrimental. We also find that FNNs perform well, and their performance varies by learning rate. We find Transformers suited for estimating KL-D as a metric for their stability in learning, but they underperform FNNs and LSTMs for clustering. Lastly, we evaluate the greedy cut point algorithm~(\figref{fig:kcluster2}) and observe its performance to be comparable but not superior to the greedy bisection algorithm. Overall, it is noteworthy that even when changing any hyperparameter to extreme values, ITC-DM* remains highly competitive.
    
\begin{figure}[tp!]
\centering
\includegraphics[width=0.8\columnwidth]{neuro_intra_inter_mi_ablation.pdf}
\caption{Ablation study for our approach using the Neuropixels dataset. ITC-DM refers to our model with default configuration used for the primary analysis. We analyze the performance of our model w.r.t. change in all the hyperparameters, such as the number of layers, learning rate, etc.}
\label{fig:hp_ablation_neuro}
\end{figure}
    
\section{Conclusions}
\label{sec:conclusions}
% 
To the best of our knowledge, this paper presents the first deep learning based information theoretic approach for clustering, together with a novel KL-Divergence criterion for optimization with no assumptions underlying the true data distribution. This new criterion subsumes the objective of mutual information. We propose to estimate KL-D in its dual form which gives us a highly efficient framework for optimization along with theoretical guarantees. Our experimental results on 12 timeseries datasets~(11 real world and one synthetic) demonstrate that our approach is highly competitive w.r.t. other information theoretic clustering methods as well as advanced deep learning methods in ensuring two desirable properties: high KL-divergence among cluster distributions, and low inter-cluster pairwise mutual information.
    
% \subsubsection*{Author Contributions}

\subsubsection*{Acknowledgements}
The authors would like to thank Syamantak Datta Gupta, Sanghamitra Dutta, Majid Behbahani, Satish Kumar Thittamaranahalli, Irina Rish, Frederik Wenkel, Fernando Gama, Umang Gupta, Volodymyr Volchenko, Pablo Vicente Juan, and Antonio Musumeci for providing their valuable feedback.
    
\bibliography{references}

\end{document}
