%\documentclass{uai2025} % for initial submission
\documentclass[accepted]{uai2025} % after acceptance, for a revised version; 
% also before submission to see how the non-anonymous paper would look like 
                        
%% There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2025} % ptmx math instead of Computer
                                         % Modern (has noticeable issues)
% \documentclass[mathfont=newtx]{uai2025} % newtx fonts (improves upon
                                          % ptmx; less tested, no support)
% NOTE: Only keep *one* line above as appropriate, as it will be replaced
%       automatically for papers to be published. Do not make any other
%       change above this note for an accepted version.

%% Choose your variant of English; be consistent
\usepackage[american]{babel}
% \usepackage[british]{babel}

%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} % amsmath with fixes and additions
% \usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{tikz} % nice language for creating drawings and diagrams

%% Provided macros
% \smaller: Because the class footnote size is essentially LaTeX's \small,
%           redefining \footnotesize, we provide the original \footnotesize
%           using this macro.
%           (Use only sparingly, e.g., in drawings, as it is quite small.)

\usepackage{graphicx}
\usepackage{makecell}
\usepackage{subfigure}
\usepackage{epsfig}
\usepackage{url}
\usepackage{amsfonts}
\usepackage{tipa}
\usepackage{times}
\usepackage{mathrsfs}
\usepackage{amssymb}
\usepackage{amsthm}
\usepackage{amsmath}
%\usepackage[ruled,vlined,linesnumbered]{algorithm2e}
%\usepackage{algorithmicx}
%\usepackage{algpseudocode}
\usepackage{caption}

% Theorem-like environments. No \theoremstyle is selected here
% (\theoremstyle{plain} is left disabled), so every environment uses the style
% that is active at this point. Each environment keeps its own counter.
% -- results
\newtheorem{theorem}{Theorem}
\newtheorem{lemma}{Lemma}
\newtheorem{proposition}{Proposition}
\newtheorem{corollary}{Corollary}
\newtheorem{claim}{Claim}
\newtheorem{property}{Property}
% -- supporting statements
\newtheorem{definition}{Definition}
\newtheorem{assumption}{Assumption}
\newtheorem{remark}{Remark}

\usepackage{algorithm}
\usepackage{algpseudocode}
\algtext*{EndWhile}% Remove "end while" text
\algtext*{EndIf}% Remove "end if" text
\algtext*{EndFor}% Remove "end for" text

% Upright operator names used in formulas.
% \operatorname (amsmath, loaded via mathtools) always typesets its argument in
% the upright operator font with operator spacing. The previous \textrm
% definitions inherited the shape of the surrounding text and therefore turned
% italic inside italic contexts such as plain-style theorem bodies.
% \ensuremath keeps the macros usable outside math mode, as before.
\newcommand*{\vol}{\ensuremath{\operatorname{vol}}}
\newcommand*{\cost}{\ensuremath{\operatorname{cost}}}
\newcommand*{\poly}{\ensuremath{\operatorname{poly}}}
\newcommand*{\parent}{\ensuremath{\operatorname{parent}}}




%% Self-defined macros
% Template example: \swap[sep]{a}{b} typesets "b sep a"; sep defaults to "-".
\newcommand{\swap}[3][-]{%
    #3#1#2%
}

\title{An Information-theoretic Perspective of Hierarchical Clustering on Graphs}

% The standard author block has changed for UAI 2025 to provide
% more space for long author lists and allow for complex affiliations
%
% All author information is automatically removed by the class for the
% anonymous submission version of your paper, so you can already add your
% information below.
%
% Add authors
\author[1]{\href{mailto:<yichengp@buaa.edu.cn>?Subject=An Information-theoretic Perspective of Hierarchical Clustering on Graphs}{Yicheng~Pan}{}}
\author[1]{Bingchen~Fan}
\author[1]{Pengyu~Long}
\author[1]{Feng~Zheng}
% Add affiliations after the authors
\affil[1]{%
    State Key Laboratory of Complex \& Critical Software Environment\\
    Beihang University\\
    Beijing, China
}

  
\begin{document}
\maketitle

\begin{abstract}
	The seminal work of \citet{dasgupta2016cost} introduced a combinatorial cost function for hierarchical graph clustering that has inspired numerous follow-up studies adopting similar combinatorial approaches.
	In this paper, we investigate this problem from the \emph{information-theoretic} perspective. We formulate a new cost function that is fully explainable and establish the relationship between combinatorial and information-theoretic perspectives. We present two algorithms for expander-like and well-clustered cardinality weighted graphs, respectively, and show that both of them achieve $O(1)$-approximation for our new cost function. Addressing practical needs, we consider the non-binary hierarchical clustering problem, and propose a hyperparameter-free framework HCSE that recursively stratifies cluster trees through sparsity-aware partitioning, automatically determining the optimal hierarchy depth via an interpretable mechanism. Extensive experimental results demonstrate the superiority of our cost function and algorithms in binary clustering performance, hierarchy level identification, and reconstruction accuracy compared to existing approaches.
\end{abstract}



\section{Introduction} \label{sec:introduction}

Hierarchical clustering on graphs plays an important role in the structural analysis of a given data set. Understanding hierarchical structures on the levels of multi-granularity is fundamental in various disciplines including artificial intelligence, physics, biology, sociology, etc.~\citep{brown1992class,eisen1998cluster,gorban2008principal,culotta2007author}. Hierarchical clustering requires a cluster tree that represents a recursive partitioning of a graph into smaller clusters as the tree nodes get deeper. A leaf represents a graph node while a non-leaf node represents a cluster containing its descendant leaves. The root is the largest one containing all leaves.
In this paper, we investigate the hierarchical clustering problem from the perspective of information theory. Our study is based on Li and Pan's structural information theory \citep{li2016structural} whose core concept named structural entropy measures the complexity of hierarchical networks. We propose from the information-theoretical perspective a new and fully explainable cost function that has advantages over the combinatorial ones, and also establish the relationship to them. In the remaining parts of this section, we introduce the combinatorial perspective first and then our contributions.

\subsection{The combinatorial perspective of hierarchical clustering} \label{subsec:comb}

Clustering is typically formulated as an optimization problem for some cost function. For hierarchical clustering, \citet{dasgupta2016cost} introduced his celebrated cost function as a combinatorial explanation for cluster trees. In this definition, similarity or dissimilarity between data points is represented by weighted edges. Taking the similarity-based metrics as an example, a cluster is a set of nodes with relatively denser intra-links compared with its inter-links, and in a good cluster tree, heavier edges tend to connect leaves whose lowest common ancestor (LCA) is assigned as deep as possible. This intuition leads to Dasgupta's cost function that is a bilinear combination of edge weights and the sizes of corresponding LCAs (refer to Section~\ref{sec:cost_functions} for the formal definition).

Motivated by Dasgupta's cost function, \citet{cohen2019hierarchical} proposed admissible cost functions. In their definition, the size of each LCA in Dasgupta's objective is generalized to be a function of the sizes of its left and right children. For all similarity-based graphs generated from a minimal ultrametric (see \citealp[Section 2.2]{cohen2019hierarchical}), a cluster tree achieves the minimum cost if and only if it is a generating tree that is a ``natural'' ground truth tree in an axiomatic sense therein. A necessary condition of admissibility of a cost function is that it achieves the same value for every cluster tree for a uniformly weighted clique that has no significant clustering structure in common sense. However, any slight deviation of edge weights would generally separate the two end-points of a light edge on a high level of its optimal (similarity-based) cluster tree. Thus, it seems that admissible cost functions, which take Dasgupta's cost function as a specific form, ought to be an unchallenged criterion in evaluating cluster trees since they are formulated by an axiomatic approach.

However, an admissible cost function seems imperfect in practice. The arbitrariness of optima of cluster trees for cliques indicates that the division of each internal node on an optimal cluster tree totally neglects the \emph{balance} of its two children. Edge weight therein is the unique factor that decides the structure of optimal trees. But a balanced tree is commonly considered as an ideal candidate in hierarchical clustering compared to an unbalanced one. Balance of clustering has many applications in practice, for example, load balancing for cloud computing and management, network design, and avoidance of outlier cluster formation, etc. Balance has also been widely considered in partitioning and clustering problems, which has motivated some classic optimization problems such as balanced cut and balanced clustering (e.g., balanced $k$-means). Even when clustering cliques, a balanced partition should be preferable for each internal node. At least, an optimal cluster tree whose height is logarithmic in the graph size $n$ is intuitively more reasonable than a caterpillar-shaped cluster tree whose height is $n-1$. Moreover, a simple proof would imply that the optimal cluster tree for any connected graph is binary. This property is not always useful in practice since a real system usually has its inherent number of hierarchies and a natural partition for each internal cluster. For instance, the natural levels of administrative division in a country are usually intrinsic, and it is not suitable to differentiate hierarchies for parallel cities in the same state. This structure cannot be obtained by simply minimizing admissible cost functions, and new non-binary clustering algorithms are worth studying.



\subsection{Related work} \label{sec:related_work}

Along with the line of study on Dasgupta's cost function, several alternative objectives have been presented. All of them are bilinear functions of edge weights and some function of the corresponding LCAs. For Dasgupta's cost function and the worst case study, Dasgupta showed that by applying Arora's seminal recursive bipartition algorithm for the sparsest cut problem \citep{DBLP:journals/jacm/AroraRV09}, we have $O(\log^{1.5}n)$-approximation. This guarantee was improved by \citet{DBLP:journals/jmlr/RoyP17} and \citet{DBLP:conf/soda/CharikarC17,cohen2019hierarchical} to $O(\log n)$ and $O(\sqrt{\log n})$, respectively. It is NP-hard to optimize the cluster tree \citep{dasgupta2016cost} and even an $O(1)$-approximation is impossible under the Small Set Expansion hypothesis \citep{DBLP:journals/jmlr/RoyP17,DBLP:conf/soda/CharikarC17}. Beyond the worst case, \citet{cohen2019hierarchical} showed that an SVD-based algorithm achieves an $O(1+o(1))$-approximation for the stochastic block model with high probability. \citet{DBLP:conf/nips/ManghiucS21} presented an $O(1)$-approximation algorithm for more generalized well-clustered graphs. The outline of their method is to utilize a flat clustering algorithm \citep{DBLP:conf/soda/GharanT14} to obtain the underlying clusters first, and then some relatively easy heuristics for clustering in and out of these clusters are enough for the guarantee. Our proof follows this route also.

For other lines of this study, Moseley and Wang \citep{DBLP:conf/nips/MoseleyW17} studied the dual of Dasgupta's cost function and showed that the average-linkage algorithm achieves a $(1/3)$-approximation. This factor has been improved by a series of works to $0.336$ \citep{DBLP:conf/soda/CharikarCN19}, $0.4246$ \citep{DBLP:conf/aistats/ChatziafratisYL20} and $0.585$ \citep{DBLP:conf/colt/AlonAV20}, respectively. Cohen-Addad et al. \citep{cohen2019hierarchical} considered maximization of Dasgupta's cost function for the dissimilarity-based metrics. They proved that the average-link and random partitioning algorithms achieve a $(2/3)$-approximation, which has been improved to $0.667$ \citep{DBLP:conf/soda/CharikarCN19}, $0.716$ \citep{DBLP:journals/corr/abs-2111-06863} and $0.74$ \citep{DBLP:conf/aaai/NaumovYA21}, respectively.

For non-binary cluster tree construction, the most popular and representative algorithm is LOUVAIN \citep{blondel2008fast}. More recently, a hierarchical label propagation based algorithm HLP has been presented \citep{rossi2020fast}. Both of these two algorithms construct a non-binary cluster tree with the same efficient framework, that is, the hierarchies are formed from bottom to top one by one. In each round, they invoke different flat clustering heuristics, Modularity and Label Propagation, respectively.

For other metrics of cluster trees, Charpentier and Bonald \citep{cha2019tree} proposed an information-theoretic metric named tree sampling divergence (TSD) for hierarchical clustering. Two distributions of non-leaf nodes on cluster tree $T$ of graph $G$ are defined, one by sampling a random edge of $G$ with probability proportional to edge weights and taking the LCA of its two endpoints, and the other, considered as a null model, by sampling two random leaves and taking their LCA. TSD is defined as the Kullback-Leibler divergence of these two distributions that measures the distance of them. Intuitively, if $G$ is well clustered by $T$, then the mass of the former distribution will aggregate on LCAs that are far down from the root, which makes it far from the null model, and thus TSD is large. Otherwise, TSD is small. Note that this definition is quite different from our cost function since they measure the uncertainties of different distributions.



\subsection{Our contributions}

Our study is able to address the above issues to some extent. We summarize our contributions as follows.

(1) We formulate \textbf{a new and explainable cost function for cluster trees from the information-theoretic perspective}, which bridges combinatorial and information-theoretic perspectives of hierarchical clustering. For this cost function, the balance of cluster trees will be involved naturally as a factor just like we design optimal codes (cf. Huffman codes), for which the balance of probability over objects is fundamental in constructing an efficient coding tree. Our theoretical result on complete graphs (Proposition \ref{pro:SE_for_cliques}) and experimental results demonstrate the advantages of our cost function.

(2) For our new cost function, we present \textbf{two polynomial-time approximation algorithms} respectively for two cases of the conductance $\Phi(G)$ of a cardinality weighted graph $G$. Our first result shows that \emph{any} cluster tree of $G$ has an approximation factor $O(\Phi(G)^{-1})$ (Theorem~\ref{thm:expander}). So any cluster tree achieves $O(1)$-approximation when $\Phi(G)$ is a constant. The second result is an $O(1)$-approximation algorithm for $G$ that can be well clustered into a constant number of expander-like clusters (Theorem~\ref{thm:well-clustered}). The main idea of this algorithm is inspired by Manghiuc and Sun's work \citep{DBLP:conf/nips/ManghiucS21}, and our approximation factors for our new objective also match their results in these two cases. An expander-like graph has high conductance, and a well-clustered graph is its opposite. So, our results have covered two representative scenarios in the field of clustering study.

(3) For practical scenarios, we develop \textbf{a new interpretable framework for natural hierarchical clustering} that outputs a non-binary cluster tree. The idea of our framework is essentially different from the traditional recursive division or agglomeration ones. In our framework, the \emph{sparsest level} of the cluster tree is stratified recursively. This coincides with the intuition that when we differentiate the hierarchies of a complex system, the clearest level should be stratified first, rather than in a rigid divisive or agglomerative fashion. Therefore, this framework has much better interpretability than the traditional ones. 

(4) We develop \textbf{a new non-binary clustering algorithm} (HCSE) under the new clustering framework. To find the sparsest level in each iteration, we formulate two basic operations called \emph{stretch} and \emph{compress}, respectively. Guided by our new cost function,  HCSE constructs a binary cluster tree in the stretch step, and then the sparsest level is stratified after compress. HCSE terminates when a specific criterion that intuitively coincides with the natural hierarchies is met, and \emph{no} hyperparameter is needed.
Our experimental results demonstrate that HCSE has a great advantage in both finding the intrinsic number of hierarchies and hierarchy reconstructions.


\section{A cost function from information-theoretic perspective}
\label{sec:cost_functions}

In this section, we first introduce the structural information theory of \citet{li2016structural} as well as the combinatorial cost functions of \citet{dasgupta2016cost} and \citet{cohen2019hierarchical}. Then we propose a new cost function that is developed from structural information theory and establish the relationship between the information-theoretic and combinatorial perspectives.

\noindent\textbf{Notations.} Let $G=(V,E,w)$ be an undirected weighted graph with a set of vertices $V$, a set of edges $E$ and a weight function $w:E\rightarrow\mathbb{R}^+$, where $\mathbb{R}^+$ denotes the set of all positive real numbers. An unweighted multigraph can be viewed as a cardinality weighted one whose edge weight is the number of parallel edges. For each vertex $u\in V$, denote by $d_u=\sum_{(u,v)\in E} w(u,v)$ the weighted degree of $u$. For a subset of vertices $S\subseteq V$, define the volume of $S$ to be the sum of the degrees of the vertices in $S$. We denote it by $\vol(S)=\sum_{u\in S} d_u$. The conductance of $S$, denoted by $\Phi_G(S)$, is defined as $w(E(S,\overline{S}))/\vol(S)$ where $w(E(S,\overline{S}))$ is the total weight of edges crossing $S$ and $\overline{S}$, and the conductance of graph $G$, denoted by $\Phi(G)$, is the minimum conductance over all subsets $S$ whose volume is at most $\vol(V)/2$. We denote by $G[S]$ the subgraph induced by $S$. A cluster tree $T$ for graph $G$ is a rooted tree with $|V|$ leaves, each of which is labeled by a distinct vertex $v\in V$. Each non-leaf node on $T$ is labeled by a subset $S$ of $V$ that consists of all the leaves treating $S$ as an ancestor. For each node $\alpha$ on $T$, denote by $\alpha^-$ the parent of $\alpha$, and by $|\alpha|$ its size. For each pair of leaves $u$ and $v$, denote by $u\vee v$ the LCA of them on $T$.

\noindent\textbf{Structural entropy of graphs.} We first give the formal definition of structural entropy, and then introduce the idea behind it. Given a weighted graph $G=(V,E,w)$ and a cluster tree $T$ for $G$, the \emph{structural entropy of $G$ on $T$} is defined as
\begin{equation} \label{eqn:structural_entropy}
	\mathcal{H}^T (G)=-\sum_{\alpha\in T} \frac{g_\alpha}{\vol(V)} \log \frac{\vol(\alpha)}{\vol(\alpha^-)},\footnote{For notational convenience, for the root $\lambda$ of $T$, set $\lambda^-=\lambda$. So the term for $\lambda$ in the summation is $0$. In this paper, the default base of logarithm is always $2$.}
\end{equation}
where $g_\alpha$ denotes the sum of weights of edges in $G$ with exactly one end-point in the set of vertices corresponding to $\alpha$. The \emph{structural entropy of $G$} is defined as the minimum one among all cluster trees, denoted by
$\mathcal{H}(G)=\min_T\{\mathcal{H}^T (G)\}.$

The main idea of structural information is based on the random walk that has been well-known as an effective tool in revealing clustering structures. In this theory, the random walk is encoded with a certain rule by using a high-dimensional encoding system for a graph $G$. In each step of the random walk, a neighbor is randomly chosen with probability proportional to edge weights, and so it has a stationary distribution on vertices that is proportional to vertex degree. For connected graphs, this stationary distribution is unique, but not for disconnected ones. Here, we consider this canonical one for all graphs. So, to position a random walk under its stationary distribution, the amount of information needed is typically the Shannon's entropy, denoted by $\mathcal{H}^{(1)}(G)=-\sum_{v\in V} \frac{d_v}{\vol(V)} \log \frac{d_v}{\vol(V)}.$ By Shannon's noiseless coding theorem, $\mathcal{H}^{(1)}(G)$ is the tight lower bound of average code length generated from the \emph{memoryless} source for one step of the random walk. However, dependence of locations may shorten the code length.

The core concept of structural information is structural entropy. To demonstrate the underlying idea, consider flat (rather than hierarchical) clustering as a simple case example. Structural entropy measures the uncertainty (information) of locations under the stationary distribution of the random walk. This uncertainty consists of two parts, one from that of clusters, and the other from that of vertices in its cluster. These two parts can also be viewed as two parts of an encoding for each vertex. By the additivity of Shannon's entropy, the sum of these two parts of uncertainty should be equal to the original one of locations for memoryless sampling under the stationary distribution. The key idea is as follows. For one step of the random walk from a vertex $u$ to $v$, to measure the uncertainty of $v$, if $u$ and $v$ are in the same cluster, the uncertainty of clusters may not be involved. For a good clustering structure, this kind of in-cluster movements will happen with high probability, and thus the uncertainty of clusters will decrease sharply due to the dependence of steps of random walks. The resulting measurement of uncertainty under the stationary distribution is exactly the structural entropy of flat clustering.

The above idea can be generalized to hierarchical clustering. For each level of a cluster tree, the uncertainty of locations during the random walk is measured by the entropy of the stationary distribution on the clusters of this level. Consider an encoding for every cluster, including the leaves. Each non-root node $\alpha$ is labeled by its order among the children of its parent $\alpha^-$. So, the amount of self-information of $\alpha$ within this local parent-children substructure is $-\log(\vol(\alpha)/\vol(\alpha^-))$, which is also roughly the length of Shannon code for $\alpha$ and its siblings. The codeword of $\alpha$ consists of the sequential labels of nodes along the unique path from the root (excluded) to itself (included). For one step of the random walk from $u$ to $v$ in $G$, to indicate $v$, we omit from $v$'s codeword the longest common prefix of $u$ and $v$ that is exactly the codeword of $u\vee v$. This means that the random walk takes this step in the cluster $u\vee v$ (and also in $u\vee v$'s ancestors) and the uncertainty at this level may not be involved. Therefore, intuitively, a quality similarity-based cluster tree would trap the random walk with high frequency in the deep clusters that are far from the root, and the long codeword of $u\vee v$ would be omitted. This shortens the average code length of the random walk. Note that we ignore the uniqueness of decoding since a practical design of codewords is not our purpose. We utilize this scheme to evaluate hierarchical structures.

Then we formulate the above scheme and measure the average code length as follows. Given a weighted graph $G=(V,E,w)$ and a cluster tree $T$ for $G$, note that under the stationary distribution, the random walk takes one step out of a cluster $\alpha$ on $T$ with probability $g_\alpha/\vol(V)$. Therefore, the aforementioned uncertainty measured by the average code length is exactly $\mathcal{H}^T (G)$ that has been defined as the structural entropy of $G$ on $T$ (Eq. (\ref{eqn:structural_entropy})). To minimize this uncertainty, the structural entropy $\mathcal{H}(G)$ of $G$ has been defined as the minimum one among all cluster trees.
%To formulate the natural cluster tree with a certain height, we define the \emph{$k$-dimensional structural entropy of $G$} for any positive integer $k$ to be the minimum one among all cluster trees of height at most $k$, denoted by
%$$\mathcal{H}^{(k)}(G)=\min_{T:\text{height}(T)\leq k}\{\mathcal{H}^T (G)\}.$$
Note that the structural entropy of $G$ on the trivial $1$-level cluster tree is consistent with the previously defined $\mathcal{H}^{(1)}(G)$. It doesn't have any non-trivial cluster.



\noindent\textbf{Combinatorial explanation of structural entropy.}
Dasgupta's cost function \citep{dasgupta2016cost} of a cluster tree $T$ for graph $G=(V,E,w)$ is defined to be $c^T(G)=\sum_{(u,v)\in E} w(u,v) |u\vee v|$. The admissible cost function introduced by \citet{cohen2019hierarchical} generalizes the term $|u\vee v|$ in the definition of $c^T(G)$ to be a general function $g(|L|,|R|)$ for a binary cluster tree, where $L$ and $R$ are the two children of $u\vee v$, respectively. Thus, Dasgupta had defined $g(x,y)=x+y$. For both definitions, the optimal hierarchical clustering of $G$ is in correspondence with a cluster tree of minimum cost in the combinatorial sense that heavy edges are cut as far down the tree as possible.

The following proposition establishes the relationship between structural entropy and this kind of combinatorial form of cost functions.

\begin{proposition} \label{pro:equvi_cost_func}
	For a weighted graph $G=(V,E,w)$, minimizing $\mathcal{H}^T(G)$ (over $T$) is equivalent to minimizing the cost function
	\begin{equation} \label{eqn:SE_cost_form}
		\cost^T(G)=\sum_{(u,v)\in E} w(u,v) \log\vol(u\vee v).
	\end{equation}
\end{proposition}

We defer the proof of Proposition \ref{pro:equvi_cost_func} to Appendix \ref{supsec:proof_equvi_cost_func}. We denote by \emph{cost(SE)} the cost function in Proposition \ref{pro:equvi_cost_func} from now on. Proposition \ref{pro:equvi_cost_func} indicates that when we view $g$ as a function of the LCA rather than that of its size and define $g(u,v)=\log \vol(u\vee v)$, the ``admissible'' function becomes equivalent to structural entropy in evaluating cluster trees, although it is not admissible any more.

So what is the difference between these two cost functions? As stated by \citet{cohen2019hierarchical}, an important axiomatic hypothesis for admissible functions, and thus also for Dasgupta's cost function, is that the cost for every binary cluster tree of an unweighted clique is identical. So any binary tree for clustering on cliques is reasonable, which coincides with the common sense that structureless datasets can be organized hierarchically freely. However, for structural entropy, the following proposition indicates that balanced organization is of importance even for structureless datasets.

\begin{proposition} \label{pro:SE_for_cliques}
	For any positive integer $n$, let $K_n$ be the clique of $n$ vertices with identical weight on every edge. Then a cluster tree $T$ of $K_n$ achieves minimum structural entropy if and only if $T$ is a balanced binary tree, that is, the two children clusters of each non-leaf node of $T$ have difference in size at most $1$.
\end{proposition}

The proof of Proposition \ref{pro:SE_for_cliques} is a bit technical, and we defer it to Appendix \ref{supsec:proof_SE_for_cliques}. The intuition behind Proposition \ref{pro:SE_for_cliques} is that balanced codes are the most efficient encoding scheme for unrelated data, cf. Huffman codes (recall the main idea behind the structural entropy). So the codewords of the random walk that jumps freely among clusters on each level of a cluster tree have the minimum average length if all the clusters on this level are in balance.

Why is Proposition \ref{pro:SE_for_cliques} able to explain that cost(SE) has taken balance into account in hierarchical clustering? We remark that two factors impact the quality of hierarchical clustering and should be considered in a cost function: one is edge weight, and the other is balance. Edge weight means structure, which should be the main factor in clustering. If one needs to show that balance is also a factor, it is better to exclude the influence of structure. The complete graph is the ideal candidate, because it is unstructured. We utilize the complete graph since this is a counterpart to Dasgupta's equal-cost property of unstructured graphs with his cost function.

Note that for regular graphs (e.g., cliques), replacing $\vol(u\vee v)$ by $|u\vee v|$ is equivalent for optimization. \citet{dasgupta2016cost} claimed that for the clique $K_4$ of four vertices, a balanced tree is preferable when replacing $|u\vee v|$ by $g(|u\vee v|)$ for any strictly increasing concave function $g$ with $g(0)=0$. However, it is worth noting that this does not hold for all those concave functions. For example, it is easy to check that for $g(x)=1-e^{-x}$, the cluster tree of $K_6$ that achieves minimum cost partitions $K_6$ into $K_2$ and $K_4$ on the first level, rather than $K_3$ and $K_3$. In contrast, Proposition \ref{pro:SE_for_cliques} shows that for all cliques, balanced trees are preferable when concave $g$ is a logarithmic function.

It is worth noting that the admissible function introduced by \citet{cohen2019hierarchical} is defined from the viewpoint of generating trees. A generating tree $T$ of a similarity-based graph $G$ is generated from a minimal ultrametric and achieves the minimum cost. In this definition, the monotonicity of edge weights between sibling clusters from bottom to top on $T$, which is given by \citet{cohen2019hierarchical} as a property of a ``natural'' ground-truth hierarchical clustering, is the unique factor when evaluating $T$. However, as discussed earlier, Proposition \ref{pro:SE_for_cliques} implies that for cost(SE), besides edge weights, the balance factor of cluster trees is implicitly involved as another factor. Moreover, for cliques, the minimum cost should be achieved on every subtree, which makes an optimal cluster tree balanced everywhere. This unique optimal clustering for cliques is also robust in the sense that a slight perturbation to the minimal ultrametric, which can be considered as slight variations to the weights of a batch of edges, will not change the optimal cluster tree wildly due to the holdback force of balance.



\section{Approximation algorithms for cost(SE)} \label{sec:approximation_alg}

In this section, we present approximation algorithms for expander-like and well-clustered graphs, respectively. These approximation factors work for cardinality edge weights whose value is at least one (e.g. the multiplicity of edges).

\noindent \textbf{Why cardinality weights?} In general, the term $\log\vol(u\vee v)$ in Eq.~\eqref{eqn:SE_cost_form} and cost(SE) can become negative as the volume of $u\vee v$ varies, causing the approximation analysis to fail. The cardinality weight function $w$ is at least one, which makes cost(SE) non-negative. Although the dependence of cost(SE) on the scale of edge weights violates the scale-invariance principle, we emphasize that $\mathcal{H}^T(G)$ is scale-invariant and Proposition \ref{pro:equvi_cost_func} holds for any scale variation. In this paper, we present approximation algorithms for cost(SE) in well-defined settings, and our approximation guarantees hold for all graphs with edge weights at least one.

%First, we have the following theorem.

\begin{theorem} \label{thm:expander}
	For any cardinality weighted graph $G = (V,E,w)$ with conductance $\Phi(G)$, it holds that for any cluster tree $T$, $\cost^T(G)=O(\Phi(G)^{-1})\cdot OPT$, where $OPT$ is the minimum cost(SE) of $G$.
\end{theorem}

We defer the proof of Theorem \ref{thm:expander} to Appendix \ref{supsec:proof_expander}. When $\Phi(G)$ is a constant, Theorem \ref{thm:expander} implies that any cluster tree achieves $O(1)$-approximation for expander-like graphs. We remark that expander-like graphs do not have good clustering structures, and so the cost of any cluster tree is within a constant factor of the universal upper bound (as shown in the proof). This result is in fact a lower bound of cost(SE) for graphs without clustering structure, which demonstrates the rationality of our objective.
Considering balance as an important factor, we present a Huffman-merging heuristic (Algorithm \ref{alg:huffman_merge}). It will serve as a subroutine within expander-like clusters in the algorithm for well-clustered graphs.

% \begin{algorithm}
	% 	\caption{HuffmanMerge}
	% 	\label{alg:huffman_merge}
	% 	\KwIn{a graph $G=(V,E,w)$}
	% 	\KwOut{A cluster tree $T$ of $G$}
	% 	%\BlankLine
	% 	Create $n$ singleton trees\;
	% 	\While{there are at least two trees}{
		% 		Select the two trees $T_1$ and $T_2$ with the least volumes\;
		% 		Construct a new tree $T_0$ with $T_1$ and $T_2$ as two subtrees of the root;}
	% 	\textbf{return} the resulting binary tree $T_0$.	
	% \end{algorithm}

\begin{algorithm}[tb]
	\caption{HuffmanMerge}
	\label{alg:huffman_merge}
	\begin{algorithmic}
		\State {\bfseries Input:} a graph $G=(V,E,w)$.
		\State {\bfseries Output:} a cluster tree $T$ of $G$.
		\State Create $n$ singleton trees.
		\While{there are at least two trees}
		\State Select the two trees $T_1$ and $T_2$ with the least volumes.
		\State Construct a new tree $T_0$ with $T_1$ and $T_2$ as two subtrees of the root.
		\EndWhile
		\State {\bfseries return} the resulting binary tree $T_0$.
	\end{algorithmic}
\end{algorithm}

Next, we consider well-clustered graphs that are composed of a collection of densely connected components with high inner conductance and weak interconnections. Our setting for well-clustered graphs is the same as that in \citep{DBLP:conf/nips/ManghiucS21}. We start from the following $(\Phi_{in}, \Phi_{out})$-decomposition presented by Gharan and Trevisan \citep{DBLP:conf/soda/GharanT14}. Let $\lambda_k$ be the $k$-th smallest eigenvalue of the normalized Laplacian matrix of $G$ and $\Phi_G(S)$ be the conductance of a vertex set $S$ in $G$.

\begin{lemma} \label{lem:eigenvalue_cluster}
	\citep[Theorem 1.5]{DBLP:conf/soda/GharanT14} Let $G = (V,E,w)$ be a graph such that $\lambda_k > 0$, for some $k
	\ge 2$. Then, there is a local search algorithm that finds an $l$-partition $\{P_i\}_{i = 1}^l$ of $V$, for some $l<k$, such that for every $1 \le i \le l$, $\Phi_G(P_i) = \mathcal{O}(k^6\sqrt{\lambda_{k - 1}})$ and $\Phi(G[P_i]) = \Omega(\lambda_k^2 \slash k^4).$
\end{lemma}

Lemma \ref{lem:eigenvalue_cluster} implies that, when $G$ exhibits a clear clustering structure, there is a partition $\{P_i\}_{i = 1}^l$ of $V$ such that for each $P_i$ both the outer and inner conductance can be bounded. This is a crucial insight that we can use $\{P_i\}_{i = 1}^l$ directly to construct a cluster tree.

Our algorithm consists of two phases: Partition and Merge. In the Partition phase, it invokes the algorithm in Lemma \ref{lem:eigenvalue_cluster} to partition $V$ into subsets $\{P_i\}_{i = 1}^l$. In the Merge phase, it combines the trees in a ``caterpillar style'' according to an increasing order of their volumes. This algorithm is described as Algorithm \ref{alg:balanced_binary_merge}. Note that Algorithm \ref{alg:balanced_binary_merge} degenerates to Algorithm \ref{alg:huffman_merge} when $k=2$. For the approximation guarantee, we have the following theorem.

% \begin{algorithm}
	% 	\caption{Balanced Binary Merge (BBM)}
	% 	\label{alg:balanced_binary_merge}
	% 	\KwIn{A graph $G = (V,E,w)$, an integer $k\geq 2$ such that $\lambda_k>0$}
	% 	\KwOut{A cluster tree $T$ of $G$}
	% 	Apply the partitioning algorithm in Lemma \ref{lem:eigenvalue_cluster} on input $(G,k)$ to obtain $\{P_i\}_{i=1}^l$ for some $l<k$\;
	% 	Sort $P_1,...,P_l$ be such that $\vol_G(P_i) \le \vol_G(P_{i+1})$, for all $1 \le i < l$\;
	% 	% Scale $w$ to make $\vol(P_i)^o \ge 2$\;
	% 	Let $T_i = $ HuffmanMerge$(G[P_i])$\;
	% 	Initialize $T = T_1$\;
	% 	\For{$i = 2,...,l$}{Let $T$ be the tree with $T$ and $T_i$ as its two children\;}
	% 	\textbf{return} $T$.
	% \end{algorithm}

\begin{algorithm}[tb]
	\caption{Balanced Binary Merge (BBM)}
	\label{alg:balanced_binary_merge}
	\begin{algorithmic}
		\State {\bfseries Input:} a graph $G = (V,E,w)$, an integer $k\geq 2$ such that $\lambda_k>0$.
		\State {\bfseries Output:} a cluster tree $T$ of $G$.
		\State Apply the partitioning algorithm in Lemma \ref{lem:eigenvalue_cluster} on input $(G,k)$ to obtain $\{P_i\}_{i=1}^l$ for some $l<k$.
		\State Sort $P_1,\ldots,P_l$ such that $\vol_G(P_i) \le \vol_G(P_{i+1})$, for all $1 \le i < l$.
		\State Let $T_i = $ HuffmanMerge$(G[P_i])$ for each $1 \le i \le l$.
		\State Initialize $T = T_1$.
		\For{$i = 2,\ldots,l$}
		\State Let $T$ be the tree with $T$ and $T_i$ as its two children.
		\EndFor
		\State {\bfseries return} $T$.
	\end{algorithmic}
\end{algorithm}

\begin{theorem} \label{thm:well-clustered}
	Let $G = (V, E, w)$ be a cardinality weighted graph such that $\lambda_k>0$ for some $k\geq 2$. Then Algorithm \ref{alg:balanced_binary_merge} constructs in polynomial time a cluster tree $T$ of $G$ that achieves $O\left(\frac{1}{(1-\alpha)\beta}\log\frac{k}{1-\alpha}\right)$-approximation for $\cost^T(G)$, where $\alpha=O(k^6\sqrt{\lambda_{k-1}})$, $\beta=\Omega(\lambda_k^2/k^4)$. Consequently, when $\lambda_k=\Omega(1/\poly(k))$ and $\lambda_{k-1}=O(1/k^{12})$ such that $\alpha<1-\rho$ for some constant $\rho\in(0,1)$, Algorithm \ref{alg:balanced_binary_merge} achieves $O(\poly(k))$-approximation. In addition, when $k$ is a constant, Algorithm \ref{alg:balanced_binary_merge} achieves $O(1)$-approximation.
\end{theorem}

The proof of Theorem \ref{thm:well-clustered} is given in Appendix \ref{supsec:proof_well-clustered}. It follows the intuition that the cost contributed by the inner edges of clusters is upper bounded due to the cluster volumes, and that contributed by the outer edges is upper bounded due to their small number. Our proof is more natural than that in \citep{DBLP:conf/nips/ManghiucS21} since it circumvents the complicated decomposition into critical nodes.



\section{Non-binary hierarchical clustering algorithm HCSE} \label{sec:practical_alg}

In this section, we develop a non-binary hierarchical clustering algorithm. At present, most algorithms for hierarchical clustering can be categorized into two frameworks: top-down division and bottom-up agglomeration \citep{cohen2019hierarchical}. The top-down division approach usually yields a binary tree by recursively dividing a cluster into two parts by a cut-based subroutine. But a binary clustering tree is far from practical application. For practical use, bottom-up agglomeration that is also known as hierarchical agglomerative clustering (HAC) is commonly preferable. It constructs a cluster tree from leaves to the root recursively, during each round of which the newly generated clusters shrink into single vertices. We remark that there is no proper cost function for non-binary hierarchical clustering yet, and all existing algorithms for this purpose cannot be evaluated by any cost. We provide a new one by using cost(SE) in its subroutines. However, it is unclear and worth study whether cost(SE) and cost(Das) fit to evaluate non-binary cluster trees.

Our algorithm jumps out of these two frameworks. We establish a new one that stratifies the \emph{sparsest} level of a cluster tree recursively rather than in a sequential order. In general, guided by cost(SE), we construct a $(k+1)$-level cluster tree from the previous $k$-level one, during which we find the level whose stratification makes the average cost in a local reduced subgraph decrease most, and then differentiate it into two levels. The process of stratification consists of two basic operations: \emph{stretch} and \emph{compression}. In stretch step, given an internal node of a cluster tree, a local binary subtree is constructed, while in compression step, the paths that are overlong from the root to leaves on the binary tree are compressed by shrinking tree edges that make the cost reduce most. The intuition behind the ``stretch-and-compress'' scheme is as follows. First, we run a fast and simple, but probably rough clustering algorithm to obtain a binary cluster subtree. So, after stretch, we have unfolded all the potential hierarchies such that the sparsest level can probably be seen. Second, we compress every overlong path that is supposed to get through each level of this subtree, during which, the tree edge on the sparsest level whose compression makes too many graph edges amplify the sizes of their LCAs greatly will be retained. This edge is expected in the backbone of the final non-binary cluster tree.
We remark that this framework can be collocated with any cost function and any binary cluster tree algorithm. For computational efficiency, we will adopt in our experiments an HAC construction of binary cluster trees with no hyper-parameter in stretch steps.

\textbf{Stretch and compress.} Given a cluster tree $T$ for graph $G=(V,E)$, let $u$ be an internal node on $T$ and $v_1,v_2,\ldots,v_\ell$ be its children. We call this $1$-level local parent-children structure rooted at $u$ to be a \emph{$u$-triangle} of $T$, denoted by $T_u$. These two operations are defined on $u$-triangles. Note that each child $v_i$ of $u$ represents a cluster in $G$. We reduce $G$ by shrinking each $v_i$ to be a single vertex $v_i'$ while maintaining each inter-link and ignoring each internal edge of $v_i$. This reduction captures the connections of clusters at this level in the parent cluster $u$. The stretch operation constructs a binary tree for $u$-triangle with an HAC process. Initially, view each $v_i'$ as a cluster and then recursively merge two clusters into a new one such that cost(SE) drops most. This yields a binary subtree $T_u'$ rooted at $u$ which has $v_1,v_2,\ldots,v_\ell$ as leaves. Then the compression operation is proposed to reduce the height of $T_u'$ to be $2$. Let $\hat{E}(T')$ be the set of edges on $T'$, each of which appears on a path of length more than $2$ from the root of $T'$ to some leaf. Denote by $\Delta(e)$ the amount of structural entropy enhanced by shrinking edge $e$. We pick from $\hat{E}(T_u')$ the edge $e$ with least $\Delta(e)$. Note that compressing a tree edge makes the grandchildren of some internal node to be children, which must amplify the cost. The compression operation picks the least amplification. The processes of stretch and compress are illustrated in Figure \ref{fig:stretch_compress} and stated as Algorithms \ref{alg:stretch} and \ref{alg:compress}, respectively.

% \begin{algorithm}
	% 	\caption{Stretch($T_u$)}
	% 	\label{alg:stretch}
	% 	\KwIn{a $u$-triangle $T_u$}
	% 	\KwOut{a binary tree rooted at $u$}
	% 	Let $\{v_1,v_2,\ldots,v_\ell\}$ be the set of leaves of $T_u$\;
	% 	Compute $\eta(a,b)$ which is the cost reduced by merging siblings $a,b$ into a single cluster\;
	% 	\For{$t\in [\ell-1]$}{
		% 		$(\alpha,\beta) \gets \arg\max_{(a,b) \text{ are siblings}} \{\eta(a,b)\}$\;
		% 		Add a new node $\gamma$\;
		% 		$\gamma.parent \gets \alpha.parent$\;
		% 		$\alpha.parent = \gamma$\;
		% 		$\beta.parent = \gamma$\;
		% 	}
	% 	return $T_u$
	% \end{algorithm}

\begin{algorithm}[!h]
	\caption{Stretch($T_u$)}
	\label{alg:stretch}
	\begin{algorithmic}
		\State {\bfseries Input:} a $u$-triangle $T_u$.
		\State {\bfseries Output:} a binary tree rooted at $u$.
		\State Let $\{v_1,v_2,\ldots,v_\ell\}$ be the set of leaves of $T_u$.
		\State Compute $\eta(a,b)$ which is the cost reduced by merging siblings $a,b$ into a single cluster.
		\For{$t\in [\ell-1]$}
		\State $(\alpha,\beta) \gets \arg\max_{(a,b) \text{ are siblings}} \{\eta(a,b)\}$.
		\State Add a new node $\gamma$.
		\State $\gamma.parent \gets \alpha.parent$.
		\State $\alpha.parent \gets \gamma$.
		\State $\beta.parent \gets \gamma$.
		\EndFor
		\State {\bfseries return} $T_u$.
	\end{algorithmic}
\end{algorithm}
% \begin{algorithm}
	% 	\caption{Compress($T$)}
	% 	\label{alg:compress}
	% 	\KwIn{a binary tree $T$}
	% 	%\KwOut{a two-layer tree}
	% 	\While{$T$'s height is more than $2$}{
		% 		$e \gets \arg\min_{e'\in\hat{E}(T)} \{\Delta(e')\}$\;
		% 		Denote $e=(u,v)$ where $u$ is the parent of $v$\;
		% 		\For{$w\in v.children$}{
			% 			$w.parent \gets u$\;
			% 		}
		% 		Delete $v$ from $T$\;
		% 	}	
	% 	%return $T$
	% \end{algorithm}
\begin{algorithm}[!h]
	\caption{Compress($T$)}
	\label{alg:compress}
	\begin{algorithmic}
		\State {\bfseries Input:} a binary tree $T$.
		\While{$T$'s height is more than $2$}
		\State $e \gets \arg\min_{e'\in\hat{E}(T)} \{\Delta(e')\}$.
		\State Denote $e=(u,v)$ where $u$ is the parent of $v$.
		\For{$w\in v.children$}
		\State $w.parent \gets u$.
		\EndFor
		\State Delete $v$ from $T$.
		\EndWhile
	\end{algorithmic}
\end{algorithm}



\textbf{Sparsest level.} Let $U_j$ be the set of $j$-level nodes on cluster tree $T$, that is, $U_j$ is the set of nodes each of which has distance $j$ from $T$'s root. Suppose that the height of $T$ is $k$, then $U_0,U_1,\ldots,U_{k-1}$ is a partition for all internal nodes of $T$. For each internal node $u$, define $\mathcal{H}(u)=-\sum_{v:v^-=u} \frac{g_v}{\vol(V)} \log \frac{\vol(v)}{\vol(u)}$. Note that $\mathcal{H}(u)$ is the partial sum contributed by $u$ in $\mathcal{H}^T(G)$. After a ``stretch-and-compress'' round on $u$-triangle, denote by $\Delta\mathcal{H}(u)$ the structural entropy by which the new cluster tree reduces. Since the reconstruction of $u$-triangle stratifies cluster $u$, $\Delta\mathcal{H}(u)$ is always non-negative. Define the sparsity of $u$ to be $\text{Spar}(u)=\Delta\mathcal{H}(u)/\mathcal{H}(u)$, which is the relative variation of structural entropy in cluster $u$. From the information-theoretic perspective, this means that the uncertainty of random walk can be measured locally in any internal cluster, which reflects the quality of clustering in this local area. At last, we define the \emph{sparsest level} of $T$ to be the $j$-th level such that the average sparsity of triangles rooted at nodes in $U_j$ is maximum, that is, $\arg\max_j \{\overline{\text{Spar}}_j(T)\}$, where $\overline{\text{Spar}}_j(T)=\sum_{u\in U_j}\text{Spar}(u)/|U_j|$. Then stratification works for the sparsest level of $T$.
This process is illustrated in Figure \ref{fig:stratify} in Appendix \ref{supsec:figures_pseudocodes}.



For a given positive integer $k$, to construct a cluster tree of height $k$, we start from the trivial $1$-level cluster tree that involves all vertices of $G$ as leaves. Then we recursively stratify at the sparsest level until a $k$-level cluster tree is obtained. The pseudocode is described as Algorithm \ref{alg:k-HCSE} in Appendix \ref{supsec:figures_pseudocodes}.

To determine the height of the cluster tree automatically, we derive the natural clustering from the variation of sparsity on each level. Intuitively, a natural hierarchical cluster tree $T$ should have not only sparse boundary on clusters, but also low sparsity for triangles of $T$, which means that further stratification within the reduced subgraphs corresponding to such triangles makes little sense. For this reason, we consider the inflection points of the sequence $\{\delta_t(\mathcal{H})\}_{t=1,2,\ldots}$, where $\delta_t(\mathcal{H})$ is the structural entropy by which the $t$-th round of stratification reduces. Formally, denote $\Delta_t \delta=\delta_{t-1}(\mathcal{H})-\delta_t(\mathcal{H})$ for each $t\geq 2$. We say that $\Delta_t \delta$ is an inflection point if both $\Delta_t \delta \geq \Delta_{t-1} \delta$ and $\Delta_t \delta \geq \Delta_{t+1} \delta$ hold. Our algorithm finds the least $t$ such that $\Delta_t \delta$ is an inflection point and fixes the height of the cluster tree to be $t$ (note that after $t-1$ rounds of stratification, the number of levels is $t$). The pseudocode is described as Algorithm \ref{alg:HCSE} in Appendix \ref{supsec:figures_pseudocodes}.



\textbf{Time complexity.} The running time of HCSE on graph $G=(V,E)$ for which $|V|=n$ and $|E|=m$ depends mainly on the iterations of stratification for the sparsest level. For each round of $t$-HCSE in Algorithm \ref{alg:HCSE}, since the change of structural entropy can be calculated incrementally and locally when merging siblings, the time complexity for the Stretch process is $O((m+n)\log n)$. Note that the LCA of each edge of $G$ can be recorded during Stretch. Since at most $n$ shrinking operations on tree edges will happen, and $\Delta(e)$ can be calculated locally, the time complexity for the Compress process is $O((m+n)\log n)$. Combining these two, the time complexity of HCSE (and also $k$-HCSE) is $O(k(m+n)\log n)$.
Practically, $k$ is usually very small, for example $O(\log n)$. In this case, the time complexity is merely $O((m+n)\log^2 n)$.



\section{Experiments} \label{sec:experiments}

In this section, we evaluate experimentally our binary clustering algorithm BBM and the non-binary hierarchical clustering algorithm HCSE mainly on synthetic networks generated from the stochastic block model (SBM) \citep{ana2003robust} and the hierarchical stochastic block model (HSBM) \citep{lyzinski2017Community}, respectively. Due to the lack of ground truth for hierarchical clustering on real datasets, we conduct our experiments on the Amazon network that is the only one we suppose to have overlapping, possibly hierarchical, ground-truth clusters (see Appendix \ref{supsubsec:Jaccard_Amazon}).
All algorithms were implemented in Python 3.8 and the experiments were performed using an Intel(R) Core(TM) i5-12400 CPU @ 2.50GHz processor, with 16 GB RAM.
For the source codes, please refer to \url{https://github.com/Hardict/HCSE}.

%\subsection{Binary clustering: cost and balancedness on SBM graphs}

\noindent\textbf{Binary clustering: cost and balancedness on SBM graphs.}
We evaluate our binary clustering algorithm BBM in two aspects: cost and balancedness. We denote Dasgupta's cost function by cost(Das). The baselines include four linkage-based methods \citep{cohen2019hierarchical}: average-linkage (AL), single-linkage (SL), complete-linkage (CL) and Linkage++ (L++, previous state-of-the-art for cost(Das)). PruneMerge proposed by \citet{DBLP:conf/nips/ManghiucS21} always has complicated operations on critical nodes and sometimes is hard to terminate in short time. So we do not include it as a baseline.

\begin{table*}[h]
	\centering
	\renewcommand\arraystretch{1.2}
	\tabcolsep=2.9pt
	\scalebox{1}{
		\begin{tabular}{c|ccccc|ccccc}
			\toprule[1.5pt]
			%\cmidrule(r){1-4}
			& \multicolumn{5}{c|}{(Biased)}	& \multicolumn{5}{c}{(Random)} \\
			Method & $\text{B}_{\text{size}}$ & $\text{B}_{\text{vol}}$ & $\text{B}_{\text{dep}}$ & cost(Das) & cost(SE) & $\text{B}_{\text{size}}$ & $\text{B}_{\text{vol}}$ & $\text{B}_{\text{dep}}$ & cost(Das) & cost(SE) \\
			\hline
			AL & 0.619 & 0.666 & 6.79 & 1.07E7 & 7.16E5 & 0.411 & 0.438 & 2.83 & 4.68E5 & 6.20E4 \\
			%\hline
			SL & 0.232 & 0.241 & 1.41 & 1.62E7 & 7.43E5 & 0.263 & 0.271 & 1.51 & 7.45E5 & 6.72E4 \\
			%\hline
			CL & 0.977 & 0.978 & 64.7 & 1.43E7 & 7.50E5 & 0.933 & 0.935 & 22.2 & 7.08E5 & 6.75E4 \\
			%\hline
			L++ & 0.897 & 0.921 & 30.2 & \textbf{9.45E6} & 7.12E5 & 0.721 & 0.735 & 7.31 & \textbf{3.82E5} & 6.09E4 \\
			%\hline
			BBM & \textbf{0.0148} & \textbf{0.0681} & \textbf{0.181} & 9.48E6 & \textbf{7.03E5} & \textbf{0.0981} & \textbf{0.0980} & \textbf{0.645} & 3.94E5 & \textbf{6.06E4} \\
			\bottomrule[1.5pt]
		\end{tabular}
	}
	\caption{Cost and balancedness on SBM graphs. ``Biased'' and ``Random'' denote the two strategies for generating cluster sizes for SBM. Five clusters are generated by Biased strategy, and their sizes are set to be $[32,32,64,128,256]$ with $\pm 5$ random perturbation on each figure. The sizes of five clusters generated by Random strategy are uniformly and randomly drawn from integers in the interval $[2^4,2^6)$. The probability of presence for intra-cluster edges is $0.9$ and that for inter-cluster edges is $0.1$. All values are averaged over $100$ trials.}
	\label{tab:cost_balance}
\end{table*}

The datasets we use are random graphs generated from SBM. We remark that a good clustering algorithm should always treat clustering structure as the most significant impact, and when cluster outlines become vague, the balance factor makes an effect. Therefore, we produce a series of SBM graphs, each of which has a significant variation in cluster sizes. Two strategies for generating clusters of different sizes are used, one is a totally biased way and the other is random. More details are introduced in Appendix \ref{supsec:synthetic_data}.


For balance evaluation on a binary tree, we define three indices, for each of which, the smaller, the better.
The first is \emph{size balance index}, denoted by $\text{B}_{\text{size}}$. For each node $\alpha$ on $T$, if $\alpha$ is a non-leaf node, let $\alpha_\ell$ and $\alpha_r$ be its two children, and $\alpha$'s size balance index is defined as a normalized size difference between the children, that is
$\text{B}_{\text{size}}(\alpha)=\lvert |\alpha_\ell|-|\alpha_r| \rvert / |\alpha|$.
If $\alpha$ is a leaf, $\text{B}_{\text{size}}(\alpha)=0$. The size balance index of $T$ is a weighted sum of $\text{B}_{\text{size}}(\alpha)$, that is $\text{B}_{\text{size}}(T)=\sum_{\alpha\in T}\rho_\alpha\cdot\text{B}_{\text{size}}(\alpha)$, where $\rho_\alpha=|\alpha|/\sum_{\alpha'\in T} |\alpha'|$. So, after simplification, we have
$$\text{B}_{\text{size}}(T)=\sum_{\alpha\in T} \frac{\lvert |\alpha_\ell|-|\alpha_r| \rvert}{\sum_{\alpha'\in T}|\alpha'|}.$$
The second index is the counterpart of $\text{B}_{\text{size}}$ that uses volume, named \emph{volume balance index} and denoted by $\text{B}_{\text{vol}}$. Formally,
$$\text{B}_{\text{vol}}(T)=\sum_{\alpha\in T} \frac{\left| \vol(\alpha_\ell)-\vol(\alpha_r)\right|}{\sum_{\alpha'\in T}\vol(\alpha')}.$$
The last index is \emph{depth balance index}, denoted by $\text{B}_{\text{dep}}$. $\text{B}_{\text{dep}}(T)$ is simply the standard deviation of the depths of all leaves on $T$.

We show the cost and balancedness results on SBM graphs in Table \ref{tab:cost_balance}. On both datasets with two different generating strategies, our algorithm BBM achieves the best cost(SE), and also competitive cost(Das) with Linkage++. Meanwhile, BBM achieves the best balance factors for all three indices, which means that small cost(SE) corresponds to good balance of cluster trees in the case that the underlying ground truth clusters can be well constructed. More comprehensive experimental results are provided in Appendix \ref{supsubsec:BBM_results}.



%\subsection{Non-binary clustering: NMI results on HSBM graphs}

\noindent\textbf{Non-binary clustering: NMIs on HSBM graphs.}
To evaluate the effectiveness of HCSE, we adopt two popular and representative non-binary clustering methods as baselines, LOUVAIN \citep{blondel2008fast} and HLP \citep{rossi2020fast}, that follow the traditional bottom-up agglomeration framework.
LOUVAIN admits a sequential input of vertices. To avert the worst-case trap, vertices come randomly, and the resulting cluster tree depends on their order. HLP invokes the common Label Propagation algorithm recursively, and so it cannot be guaranteed to avoid under-fitting in each round. This can be seen in our experiments on synthetic datasets, for which these two algorithms sometimes miss ground-truth levels.
To evaluate the effectiveness of our cost function, we replace the stretch step of HCSE by linkage-based methods AL, SL and CL, each of which also depends on no hyper-parameter. Our experiments evaluate our framework incorporated with these methods in the Stretch step, for which we denote by HC$_\text{ave}$, HC$_\text{sin}$, HC$_\text{com}$ these three algorithms, respectively.

We utilize the $k$-level HSBM (introduced in Appendix \ref{supsubsec:HSBM}) for the task to reconstruct the ground-truth HSBM cluster trees. Table \ref{tab:HSBM_NMI} in the appendix demonstrates the results on two groups of random graphs generated from HSBM whose heights are $k=4,5$, respectively. We compare the Normalized Mutual Information (NMI) at each level of the ground-truth cluster tree. Due to the randomness in LOUVAIN and convergence of HLP, we choose the most effective strategy and pick the best results in five runs for both of these two baselines. In contrast, our algorithm HCSE yields deterministic results. We need to emphasize that since there is no proper cost function for non-binary clustering yet, we do not evaluate HCSE with any cost.

The main results are summarized as follows. HCSE is always able to find the correct height $k$, while HLP and LOUVAIN are not. Although LOUVAIN is comparable with HCSE in NMI values, it cannot find the correct level number in any group. HLP and the three linkage-based methods cannot achieve the best NMI for any probability vector, even if we calculate the NMI between each ground-truth level and the one that achieves the maximum NMI in the resulting tree when the height of cluster tree is incorrect. The reason why HCSE does not perform as well as LOUVAIN on deep levels is as follows. The accuracy of HCSE for deep levels depends on that for the intermediate levels that are probably stratified earlier. The early errors will accumulate later to the bottom. Comparatively, LOUVAIN starts with the deepest level in a bottom-up fashion. So it has better NMIs on deep levels than HCSE, but due to error accumulation also, it has worse NMIs on high levels, and even misses the top level. For the three linkage-based methods, the structure of the resulting non-binary tree is quite different from the ground truth. This indicates that the binary trees after Stretch are not accurate enough in the sense that the real sparse levels have not been properly unfolded by the internal nodes of the binary tree, and so they cannot be stratified properly. This also demonstrates the advantage of our cost function in binary clustering. More ablation experiments are provided in Appendix \ref{supsubsec:ablation}.



\section{Conclusions and Discussions} \label{sec:conclusions_discussions}

In this paper, we investigate the hierarchical clustering problem on graphs from an information-theoretic perspective and propose a new cost function that relates to the combinatorial objective raised by Dasgupta \citep{dasgupta2016cost}. We present two $O(1)$-approximation algorithms for it on two canonical kinds of graphs, i.e., expander-like and well-clustered cardinality weighted ones, respectively. For non-binary hierarchical clustering, we propose a new interpretable framework that stratifies the sparsest level of the cluster tree recursively, which can be collocated with any binary clustering algorithm. We also present an interpretable strategy to find the intrinsic number of levels without any hyper-parameter. The experimental results have verified the effectiveness of our cost function and algorithms.

There are several directions that are worth further study. The first problem is about the complexity of minimizing cost(SE). It is unknown whether computing $\min$ cost(SE) is NP-hard. For approximation guarantee, note that there is a trivial factor $\log \vol(V)$ for the cardinality weighted graphs simply due to the definition of cost(SE). However, there is no non-trivial factor in the worst case yet.
The second problem is about the relationship between the concavity of $g$ of the cost function and the balance of the optimal cluster tree. It can be checked that for cliques, being concave is not a sufficient condition for total balance. Is it a necessary condition? Moreover, is there any explicit necessary and sufficient condition for total balance of the optimal cluster tree for cliques?
The third one is about more precise characterizations for ``natural'' hierarchical clustering whose depth is limited. Since any reasonable choice of $g$ makes the cost function achieve optimum on some binary tree, a blind pursuit of minimization of cost functions seems not to be a rational approach. More criteria in this scenario are worth studying.









\subsubsection*{Acknowledgements}

The corresponding author is Yicheng Pan. This work is partly supported by National Key R\&D Program of China (2021YFB3500700), and partly supported by State Key Laboratory of Complex \& Critical Software Environment (CCSE-2024ZX-20).



% References
\bibliography{bibtex}

\newpage

\onecolumn



\title{An Information-theoretic Perspective of Hierarchical Clustering on Graphs\\(Supplementary Material)}
\maketitle

\appendix



\section{Proof of Proposition \ref{pro:equvi_cost_func}} \label{supsec:proof_equvi_cost_func}

\begin{proof}
	For each internal node $\alpha$ on $T$, denote by $\partial(\alpha)$ the set of edges in $G$ with exactly one end-point in the set of vertices corresponding to $\alpha$. So $g_\alpha=\sum_{e\in\partial(\alpha)} w(e)$. Note that
	\begin{eqnarray*}
		\mathcal{H}^T(G) &=& -\sum_{\alpha\in T} \frac{g_\alpha}{\vol(V)}\log\frac{\vol(\alpha)}{\vol(\alpha^-)}\\
		&=& -\sum_{\alpha\in T} \sum_{(u,v)\in \partial(\alpha)} \frac{w(u,v)}{\vol(V)}\log\frac{\vol(\alpha)}{\vol(\alpha^-)}\\
		&=& -\sum_{(u,v)\in E} \left(\frac{w(u,v)}{\vol(V)} \sum_{\alpha:(u,v)\in \partial(\alpha)} \log\frac{\vol(\alpha)}{\vol(\alpha^-)}\right).
	\end{eqnarray*}	
	For a single edge $(u,v)\in E$, all the terms $\log(\vol(\alpha)/\vol(\alpha^-))$ for leaf $u$ satisfying $(u,v)\in \partial(\alpha)$ sum (over $\alpha$) up to $\log (d_u/\vol(u\vee v))$ along the unique path from $u$ to $u\vee v$. It is symmetric for $v$. Therefore, considering ordered pair $(u,v)\in E$,
	\begin{eqnarray*}
		\mathcal{H}^T(G) &=& -\sum_{\text{ordered }(u,v)\in E} \frac{w(u,v)}{\vol(V)}\log\frac{d_u}{\vol(u\vee v)}\\
		&=& \frac{1}{\vol(V)} \left( -\sum_{u\in V} d_u \log d_u + \sum_{\text{ordered }(u,v)\in E} w(u,v) \log \vol(u\vee v) \right)\\
		&=& \frac{1}{\vol(V)} \left( -\sum_{u\in V} d_u \log d_u + 2\cdot\sum_{(u,v)\in E} w(u,v) \log \vol(u\vee v) \right).
	\end{eqnarray*}
	
	The second equality follows from the fact that for a fixed $u$, $d_u=\sum_{v:(u,v)\in E} w(u,v)$, and the last equality from the symmetry of $(u,v)$. Since the first summation is independent of $T$, Proposition \ref{pro:equvi_cost_func} follows.
\end{proof}


\section{Proof of Proposition \ref{pro:SE_for_cliques}} \label{supsec:proof_SE_for_cliques}

\begin{proof}
	
	Note that a balanced binary tree (BBT for abbreviation) means the tree is balanced on every internal node. Formally, for an internal node of cluster size $k$, its two sub-trees are of cluster sizes $\lfloor k/2 \rfloor$ and $\lceil k/2 \rceil$, respectively.
	
	For cliques, since the weights of all edges are identical, we safely assume them to be $1$. By Proposition \ref{pro:equvi_cost_func}, minimizing the structural entropy is equivalent to minimizing the cost function (over $T$)
	\begin{eqnarray*}
		\cost^T(G) &=& \sum\limits_{(u,v) \in E} \log \vol(u \vee v) \\
		&=& \sum\limits_{(u,v) \in E}\log\left((n-1)|u \vee v|\right) \\
		&=& \sum\limits_{(u,v) \in E}\log(n-1)+\sum\limits_{(u,v) \in E}\log|u \vee v|
	\end{eqnarray*}
	Since the first term in the last equation is independent of $T$, the optimization turns to minimizing the last term, which we denote by $\Gamma(T)$. Grouping all edges in $E$ by LCA of two end-points, the cost $\Gamma(T)$ can be written as the sum of the cost $\gamma$ at every internal node $N$ of $T$. Formally, for every internal node $N$, let $A,B \subseteq V$ be the leaves of the sub-trees rooted at the left and right child of $N$, respectively. We have
	\begin{eqnarray*}
		\Gamma(T)&=&\sum\limits_{N}\gamma(N) \\
		\gamma(N)&=& \left(\sum\limits_{x \in A,y \in B}1 \right) \cdot \log\left(|A|+|B|\right) \\
		&=&|A|\cdot|B|\cdot\log(|A|+|B|)
	\end{eqnarray*}
	Now we only have to show the following lemma.
	
	\begin{lemma} \label{lem:Gamma_equiv}
		For any positive integer $n$, a cluster tree $T$ of $K_n$ achieves minimum cost $\Gamma(T)$ if and only if $T$ is a BBT.
	\end{lemma}
	
	
	\begin{proof}
		
		Lemma \ref{lem:Gamma_equiv} is proved by induction on $|V|$. The key technique of tree swapping we use here is inspired by \citet{cohen2019hierarchical}. The basis step holds since for $|V|=2$ or $3$, the cluster tree is balanced and unique. It certainly achieves the minimum cost exclusively.
		
		Now, consider a clique $G=(V,E)$ with $n=|V| \ge 4$. Let $T_1$ be an arbitrary unbalanced cluster tree and $\lambda$ be its root. We need to prove that the cost $\Gamma(T_1)$ does not achieve the minimum. Without loss of generality, we can safely assume the root node is unbalanced, since otherwise, we set $T_1$ to be the sub-tree that is rooted at an unbalanced node. Let $T_2$ be a tree with root $\lambda$ whose left and right sub-trees are BBTs of the same sizes as the left and right sub-trees of $T_1$, respectively. Let $V_{ll}$, $V_{lr}$, $V_{rl}$ and $V_{rr}$ be the sets of nodes on the four sub-trees at the second level of $T_2$ ($V_{ll}$ means the left child of left child, $V_{lr}$ the right child of left child, and so on), and $n_{ll}$, $n_{lr}$, $n_{rl}$ and $n_{rr}$ denote their sizes, respectively. Our proof remains valid when some of them are empty. We always assume $n_{ll} \le n_{lr}$ and $n_{rl} \ge n_{rr}$. Next, we construct $T_3$ by swapping (transplanting) $V_{lr}$ and $V_{rl}$ with each other. Finally, let $T_4$ be the tree with root $\lambda$ obtained by balancing the left and right sub-trees of $T_3$, so that both of them become BBTs. So $T_4$ is a BBT. Then we only have to prove that $\Gamma(T_1) > \Gamma(T_4)$. Note that the strict ``$>$'' is necessary since we need to rule out all unbalanced cluster trees.
		
		Then we show that the transformation process that consists of the above three steps makes the cost decrease step by step. Formally,
		\begin{itemize}
			\item[(a)] $T_1$ to $T_2$. The sub-trees of $T_1$ become BBTs in $T_2$. Since the number of edges whose end-points treat the root as LCA is the same, by induction we have $\Gamma(T_1) \ge \Gamma(T_2)$.
			\item[(b)] $T_2$ to $T_3$. We will show that $\Gamma(T_2) > \Gamma(T_3)$ in Lemma \ref{lem:T2_T3_transplant}.
			\item[(c)] $T_3$ to $T_4$. The sub-trees of $T_3$ become BBTs in $T_4$. For the same reason as (a), we have $\Gamma(T_3) \ge \Gamma(T_4)$. 
		\end{itemize}
		%This transformation process is illustrated in Figure \ref{fig:transformation}.
		Putting them together, we get $\Gamma(T_1) > \Gamma(T_4)$ and Lemma \ref{lem:Gamma_equiv} follows.
		
		%	\begin{figure}[h]
			%		\centering
			%		\includegraphics[width=1\textwidth]{3_steps}
			%		\caption{Illustration of transformation from $T_1$ to $T_4$.}
			%		\label{fig:transformation}
			%	\end{figure}
	\end{proof}
	
	
	
	\begin{lemma} \label{lem:T2_T3_transplant}
		After swapping $V_{lr}$ and $V_{rl}$, we obtain $T_3$ from $T_2$, for which $\Gamma(T_2) > \Gamma(T_3)$. 
	\end{lemma}
	
	\begin{proof}
		We only need to consider the changes in cost of three nodes: root and its left and right children, since the cost contributed by each of the remaining nodes does not change after swapping. Ignoring the unchanged costs, define
		\begin{eqnarray*}
			\cost(T_2) &=& n_l n_r \log n+n_{ll} n_{lr} \log{n_l}+n_{rl} n_{rr} \log{n_r} \\
			&=& n_ln_r \log n+\left\lfloor \frac{n_l}{2} \right\rfloor \left\lceil \frac{n_l}{2} \right\rceil  \log{n_l}+\left\lceil \frac{n_r}{2} \right\rceil \left\lfloor \frac{n_r}{2} \right\rfloor \log{n_r},
		\end{eqnarray*}
		where $n_l = n_{ll} + n_{lr}$, $n_r = n_{rl} + n_{rr}$. Both of them are at least $1$. Similarly, define
		\begin{eqnarray*}
			\cost(T_3) &=& (n_{ll}+n_{rl})(n_{lr}+n_{rr}) \log n+n_{ll}n_{rl} \log{(n_{ll}+n_{rl})}+n_{lr}n_{rr} \log{(n_{lr}+n_{rr})} \\
			&=& \left\lfloor \frac{n}{2} \right\rfloor \left\lceil \frac{n}{2} \right\rceil \log n+\left\lfloor \frac{n_l}{2} \right\rfloor \left\lceil \frac{n_r}{2} \right\rceil \log\left(\left\lfloor \frac{n_l}{2} \right\rfloor+\left\lceil \frac{n_r}{2} \right\rceil\right)+\left\lceil \frac{n_l}{2} \right\rceil \left\lfloor \frac{n_r}{2} \right\rfloor \log \left(\left\lceil \frac{n_l}{2} \right\rceil+\left\lfloor \frac{n_r}{2} \right\rfloor \right) \\
		\end{eqnarray*}
		Denote
		\begin{eqnarray} \label{eqn:Delta}
			\Delta &=& \Gamma(T_2) - \Gamma(T_3) \nonumber \\
			&=& \cost(T_2) - \cost(T_3) \nonumber \\
			&=&\left\lfloor \frac{n_l}{2} \right\rfloor \left\lceil \frac{n_l}{2} \right\rceil \log\left(\frac{n_l}{n}\right)+\left\lceil \frac{n_r}{2} \right\rceil \left\lfloor \frac{n_r}{2} \right\rfloor \log\left(\frac{n_r}{n}\right) \nonumber \\
			& &-\left\lfloor \frac{n_l}{2} \right\rfloor \left\lceil \frac{n_r}{2} \right\rceil \log\left(\frac{\left\lfloor \frac{n_l}{2} \right\rfloor+\left\lceil \frac{n_r}{2} \right\rceil}{n}\right)-\left\lceil \frac{n_l}{2} \right\rceil \left\lfloor \frac{n_r}{2} \right\rfloor \log\left(\frac{\left\lceil \frac{n_l}{2} \right\rceil+\left\lfloor \frac{n_r}{2} \right\rfloor}{n}\right)
		\end{eqnarray}
		So we only have to show that $\Delta > 0$. We consider the following three cases according to the parity of $n_l$ and $n_r$.
		\begin{itemize}
			\setlength{\itemindent}{2.5em}
			\item[\textbf{Case $1$}:] $n_l$ and $n_r$ are even.
			\item[\textbf{Case $2$}:] $n_l$ and $n_r$ are odd.
			\item[\textbf{Case $3$}:] $n_l$ is odd while $n_r$ is even.
		\end{itemize}
		The case that $n_l$ is even while $n_r$ is odd is symmetric to \textbf{Case $3$}.
		
		For \textbf{Case $1$}, if both $n_l$ and $n_r$ are even, then the floor and ceiling operators in Eq.~\eqref{eqn:Delta} can be removed and $\Delta$ can be simplified as
		\begin{eqnarray*}
			\Delta=\frac{n_l^2}{4} \log\left(\frac{n_l}{n}\right)+\frac{n_r^2}{4} \log\left(\frac{n_r}{n}\right)+\frac{n_l n_r}{2}.
		\end{eqnarray*}
		Let $p=n_l/n,q=n_r/n$, and so $p+q=1$. Recall that $T_1$ is unbalanced on the root $\lambda$, so is $T_2$. Thus $p\neq q$. Multiplying by $\frac{4}{n^2}$ on both sides, we only have to prove that
		\[
		p^2 \log p+q^2 \log q+2pq > 0.
		\]	
		That is,
		$$\frac{p}{q} \log p+\frac{q}{p} \log q+2 > 0.$$			
		Let $g(x)=\frac{x}{1-x} \log x$. Then we only need to show that $g(p)+g(q)+2>0$ when $p\neq q$. We have
		\begin{eqnarray*}
			g'(x) &=& \frac{(1-x)+\ln x}{\ln 2 \cdot (1-x)^2}, \\
			g''(x) &=& -\frac{x^2-2x\ln x-1}{\ln 2 \cdot x(1-x)^3}.
		\end{eqnarray*}
		It is easy to check that $g''(x)>0$ when $0 < x < 1$. So $g(x)$ is strictly convex in the interval $(0,1)$. Since $p\neq q$ and $p+q=1$,
		$$g(p)+g(q) > 2g\left(\frac{p+q}{2}\right) = -2.$$
		Thus $\Delta>0$ holds.
		
		For \textbf{Case $2$}, if both $n_l$ and $n_r$ are odd, then $\Delta$ can be split into two parts $\Delta=\Delta_1+\Delta_2$, in which
		\begin{eqnarray*}
			\Delta_1 &=& \frac{n_l^2}{4} \log\left(\frac{n_l}{n}\right)+\frac{n_r^2}{4} \log\left(\frac{n_r}{n}\right)+\frac{n_l n_r}{2} \\
			\Delta_2 &=& -\frac{1}{4} \log\left(\frac{n_l}{n}\right)-\frac{1}{4} \log\left(\frac{n_r}{n}\right)-\frac{1}{2}
		\end{eqnarray*}
		Since we have shown that $\Delta_1>0$, if we can prove $\Delta_2 \ge 0$, then the lemma will hold for \textbf{Case $2$}. Due to the concavity of the logarithmic function (and $n_l+n_r=n$), this clearly holds since
		\begin{eqnarray*}
			2 \log\left(\frac{n}{2}\right) \ge \log n_l + \log n_r.
		\end{eqnarray*}
		
		
		For \textbf{Case $3$}, if $n_l$ is odd while $n_r$ is even,
		\begin{eqnarray*}
			\Delta=\frac{n_l^2-1}{4} \log\left(\frac{n_l}{n}\right)+\frac{n_r^2}{4} \log\left(\frac{n_r}{n}\right)-\left[\frac{(n_l-1)n_r}{4} \log\left(\frac{n-1}{2n}\right)+\frac{(n_l+1)n_r}{4} \log\left(\frac{n+1}{2n}\right)\right].
		\end{eqnarray*}
		Multiplying the above equation by $4\ln 2$, without changing its sign, yields
		\begin{eqnarray*}
			(4\ln 2) \Delta = (n_l^2-1)\ln\left(\frac{n_l}{n}\right)+n_r^2\ln\left(\frac{n_r}{n}\right)-\left[(n_l-1)n_r\ln\left(\frac{n-1}{2n}\right)+(n_l+1)n_r\ln\left(\frac{n+1}{2n}\right)\right]
		\end{eqnarray*}	
		Splitting the right hand side into two parts,
		\begin{eqnarray*}
			A &=& n_l^2\ln\left(\frac{n_l}{n}\right)+n_r^2\ln\left(\frac{n_r}{n}\right)+2n_ln_r\ln2 \\
			B &=& -\ln\left(\frac{n_l}{n}\right)-(n_l+1)n_r\ln\left(1+\frac{1}{n}\right)-(n_l-1)n_r\ln\left(1-\frac{1}{n}\right)
		\end{eqnarray*}
		Since $n$ is odd and the root $\lambda$ of $T_2$ is unbalanced, we only need to consider the case that $n_l=(n-i)/2$, $n_r=(n+i)/2$ (note that $n_l$ and $n_r$ are symmetric, so if $(n-i)/2$ is even, exchange $n_l$ and $n_r$), where both $n$ and $i$ are odd satisfying $n > i \ge 3$. Next we show that in this case, $A \ge \ln(1/5)+4^2\ln(4/5)+2 \cdot 4\ln2$ and $B > \ln2-3/4-(2/3) \cdot (1/5^2)$. By calculation, $(4\ln 2)\Delta=A+B > 0$, and hence $\Delta > 0$ for \textbf{Case $3$}.
		
		\begin{claim} \label{clm:A}
			$A \ge \ln(1/5)+4^2\ln(4/5)+2 \cdot 4\ln2$ for odd integers $n > i \ge 3$.
		\end{claim}
		
		\begin{proof}
			Substituting $n_l=(n-i)/2$, $n_r=(n+i)/2$ into the $A$ yields
			\begin{eqnarray*}
				A = C(n,i) \triangleq \left(\frac{n-i}{2}\right)^2\ln\left(\frac{n-i}{2n}\right)+\left(\frac{n+i}{2}\right)^2\ln\left(\frac{n+i}{2n}\right)+2\cdot\frac{n-i}{2}\cdot\frac{n+i}{2}\ln2.
			\end{eqnarray*}
			Treating $n$ as a continuous variable, we have
			\begin{eqnarray*}
				\frac{\partial C(n,i)}{\partial n}=\frac{1}{2} \left[{(n+i)\ln\left(1+\frac{i}{n}\right)+(n-i)\ln\left(1-\frac{i}{n}\right)-\frac{i^2}{n}}\right]
			\end{eqnarray*}
			Multiplying the above equation by $2/n$ and setting $x = i/n$ yields
			\begin{eqnarray*}
				f(x) &\triangleq& (1+x)\ln(1+x)+(1-x)\ln(1-x)-x^2, \\
				f'(x) &=& \ln(1+x)-\ln(1-x)-2x, \\
				f''(x) &=& \frac{2x^2}{1-x^2}.
			\end{eqnarray*}
			It is easy to check that $f(0) = 0$ and $f'(0) = 0$. When $0 < x < 1$, $f''(x) > 0$. Thus $f'(x) > 0$ and $f(x) > 0$. This means that $\partial C(n,i)/\partial n > 0$ for all $n>i$. So $C(n,i) \ge C(i+2,i)$ for $n \ge i+2$ (when $i$ is fixed, the minimum admissible value of $n$ is $i + 2$, which makes $n_l=(n-i)/2$ and $n_r=(n+i)/2$ integral). The curves of $C(n,i)$ for varying $i$ are plotted in Figure~\ref{fig:C(n,i)}.
			
			\begin{figure}[htb]
				\centering
				\includegraphics[width=4in]{figures/C_n_i}
				\caption{Functions $C(n,i)$}
				\label{fig:C(n,i)}
			\end{figure}
			
			When $n = i+2$, we get $n_l=(n-i)/2=1$ and $n_r=(n+i)/2=n-1$. Substituting them into $A$ yields
			\begin{eqnarray*}
				D(n) &\triangleq& \ln\left(\frac{1}{n}\right)+(n-1)^2\ln\left(1-\frac{1}{n}\right)+2(n-1)\ln2, \\
				\frac{d D}{dn} &=& 1-\frac{2}{n}+2\ln2+2(n-1)\ln\left(1-\frac{1}{n}\right).
			\end{eqnarray*}
			When $n>2$, it is easy to check that $d D/d n>0$. So the minimum value of $D(n)$, which is also the minimum value of $C(i+2,i)$, is achieved at $n=i+2=5$. So $A=C(n,i) \ge C(i+2,i) \ge C(5,3)=\ln(1/5)+4^2\ln(4/5)+2 \cdot 4\ln2$.
		\end{proof}
		
		\begin{claim} \label{clm:B}
			$B > \ln2-3/4-(2/3) \cdot (1/5^2)$.
		\end{claim}
		
		\begin{proof}
			Due to the facts that
			\begin{eqnarray*}
				\ln\left(1+\frac{1}{n}\right) &<& \frac{1}{n}-\frac{1}{2n^2}+\frac{1}{3n^3}, \\
				\ln\left(1-\frac{1}{n}\right) &<& -\frac{1}{n}-\frac{1}{2n^2}-\frac{1}{3n^3},
			\end{eqnarray*}
			we have
			\begin{eqnarray*}
				B &=& -\ln\left(\frac{n_l}{n}\right)-(n_l+1)n_r\ln\left(1+\frac{1}{n}\right)-(n_l-1)n_r\ln\left(1-\frac{1}{n}\right) \\
				&>& -\ln\left(\frac{n_l}{n}\right)+\frac{n_ln_r}{n^2}-\frac{2n_r}{n}-\frac{2n_r}{3n^3} \\
				&>& -\ln\left(\frac{n_l}{n}\right)+\frac{n_ln_r}{n^2}-\frac{2n_r}{n}-\frac{2}{3n^2}.
			\end{eqnarray*}
			Let $\alpha=n_l/n$, then
			\begin{eqnarray*}
				B &>& -\ln\alpha+\alpha(1-\alpha)-2(1-\alpha)-\frac{2}{3n^2} \\
				&\geq& \ln2-\frac{3}{4}-\frac{2}{3n^2}.
			\end{eqnarray*}
			When $n \ge 5$, $B > \ln2-3/4-(2/3) \cdot (1/5^2)$.
		\end{proof}
		Combining Claims \ref{clm:A} and \ref{clm:B}, Lemma \ref{lem:T2_T3_transplant} follows.
	\end{proof}
	This completes the proof of Proposition \ref{pro:SE_for_cliques}.
	
\end{proof}



\section{Proof of Theorem \ref{thm:expander}} \label{supsec:proof_expander}

\begin{proof}
	Note that $\cost^T(G)$ for any cluster tree $T$ has a trivial upper bound. That is,
	$$\cost^T(G) = \sum_{(u,v)\in E} w(u,v)\cdot\log(\vol(u\vee v)) \le \sum_{(u,v)\in E} w(u,v) \cdot \log(\vol(V)) \le \frac{\vol(V) \cdot \log(\vol(V))}{2}.$$ Let $T^*$ be the optimal cluster tree that achieves the minimum cost. We present here a lower bound for $\cost^{T^*}(G)$. Referring to the dense branch technique \citep{dasgupta2016cost,DBLP:conf/nips/ManghiucS21}, we start with the root node $A_0$ and walk along $T^*$ recursively as follows: at every internal node $A_i$, walk down to the node $A_{i+1}$ of higher volume between its two children. This process stops when we reach node $A_k$ such that $\vol_G(A_k) \le \frac{2\vol(V)}{3}$. Denote $A = A_k$ as well as $B = V \setminus A_k$. By construction, it holds that $\vol_G(A) > \frac{\vol(V)}{3}$ and $\vol_G(B) \ge \frac{\vol(V)}{3}$. Moreover, $\vol(A_i) > \frac{2\vol(V)}{3}$ for every $0 \le i < k$. The basic idea behind the dense branch is that the cut $(A, B)$ has a significant contribution to $\cost^{T^*}(G)$. Denote by $E(A,B)$ the set of edges crossing the cut $(A,B)$. We have
	\begin{align*}
		\cost^{T^*}(G) &= \sum_{(u, v)\in E}w(u, v) \cdot \log(\vol(u\vee v)) \\
		&\ge \sum_{\substack{(u, v)\in E(A,B)}}w(u, v) \cdot \log(\vol(u\vee v)) \\
		&\ge w(A, B) \cdot \log\left(\frac{2\vol(V)}{3}\right) \\
		&\ge \Phi(G) \cdot \frac{\vol(V)}{3} \cdot \log\left(\frac{2\vol(V)}{3}\right).
	\end{align*}
	Let $T$ be an arbitrary cluster tree, and $T^*$ be an optimal tree. We have
	$$\frac{\cost^T(G)}{\cost^{T^*}(G)} \le \frac{3}{2 \Phi(G)} \cdot \frac{\log(\vol(V))}{\log\left(\frac{2\vol(V)}{3}\right)}=O(\Phi(G)^{-1}).$$
\end{proof}



\section{Proof of Theorem \ref{thm:well-clustered}} \label{supsec:proof_well-clustered}

\begin{proof}
	To prove Theorem \ref{thm:well-clustered}, we only have to prove the following lemma. Then the theorem follows from a simplification of the approximation factor.
	\begin{lemma} \label{lem:well-clustered}
		Let $\alpha=\max_i\{\Phi_G(P_i)\}$ and $\beta=\min_i\{\Phi(G[P_i])\}$. Algorithm \ref{alg:balanced_binary_merge} achieves $$\left(\left(\left(\log\left(\frac{1}{1 - \alpha}\right) + 1\right) +  \frac{2\alpha}{1 - \alpha}\left(1 + \log\frac{k}{1 - \alpha}\right)\right) \cdot \frac{3}{2\beta\log\left(\frac{4}{3}\right)}\right)$$ approximation.
	\end{lemma}
	
	\begin{proof}
		We group the edges of $G$ into two categories: let $E_1$ be the set of edges in the induced subgraphs $G[P_i]$ for all $1 \le i \le k$, i.e.,
		$$E_1 \triangleq \bigcup_{i = 1}^{k}E(G[P_i]),$$
		and $E_2$ be the remaining crossing edges. Then we have
		$$\cost^T(G) = \sum_{e \in E_1}\cost^T(e) + \sum_{e \in E_2}\cost^T(e).$$
		We denote by $\vol(G[P_i])$ the volume of the induced graph $G[P_i]$, by $\vol_G(P_i)$ the volume of $P_i$ in $G$, and by $\parent^T(P_i)$ the parent of $P_i$ on $T$.
		By the construction of $T$, for every $P_i$,
		$$\vol_G(\parent^T(P_i)) = \sum_{j = 1}^i\vol_G(P_j) \le i \cdot \vol_G(P_i) \le k \cdot \vol_G(P_i).$$
		Note that
		$$w(P_i, V \backslash P_i) = \vol_G(P_i) - \vol(G[P_i]) \le \alpha \cdot \vol_G(P_i),$$
		$$(1 - \alpha) \cdot \vol_G(P_i) \le \vol(G[P_i]),$$
		and thus
		$$\vol_G(\parent^T(P_i)) \le k \cdot \vol_G(P_i) \le \frac{k}{1 - \alpha}\vol(G[P_i]).$$
		Combining the above, we have that
		\begin{eqnarray*}
			\sum_{e \in E_1}\cost^T(e) &\le& \sum_{e \in E_1}w_e \cdot \log(\vol_G(P_i)) \\ 
			&\le& \sum_{e \in E_1}w_e \cdot \log\left(\frac{1}{1 - \alpha} \vol(G[P_i])\right) \\
			&=& \sum_{e \in E_1}\left(w_e \cdot \log\frac{1}{1 - \alpha} + w_e \cdot \log(\vol(G[P_i]))\right) \\
			%&\le& \sum_{j = 1}^k\left(\log\frac{1}{1 - \alpha} + 1\right) \cdot \frac{\vol(G[P_i]) \cdot \log(\vol(G[P_i]))}{2} \\
			&\le& \left(\log\frac{1}{1 - \alpha} + 1\right) \cdot \sum_{i = 1}^k \frac{\vol(G[P_i]) \cdot \log(\vol(G[P_i]))}{2},
		\end{eqnarray*}
		and
		\begin{eqnarray*}
			\sum_{e \in E_2}\cost^T(e) &\le& \sum_{i = 1}^k{w(P_i, V \backslash P_i)} \cdot \log(\vol_G(\parent^T(P_i))) \\
			&\le& \sum_{i = 1}^k\frac{\alpha}{1 - \alpha}\vol(G[P_i])\log\left(\frac{k}{1 - \alpha}\vol(G[P_i])\right) \\
			&\le& \sum_{i = 1}^k\frac{\alpha}{1 - \alpha}\left(1 + \log\frac{k}{1 - \alpha}\right)\vol(G[P_i])\log(\vol(G[P_i])) \\
			&=& \frac{2\alpha}{1 - \alpha}\left(1 + \log\frac{k}{1 - \alpha}\right) \cdot \sum_{i = 1}^k\frac{\vol(G[P_i]) \cdot \log(\vol(G[P_i]))}{2}.
		\end{eqnarray*}
		Let $T^*$ be the optimal cluster tree of $G$, and $OPT_G$ be the optimal value. We have
		$$OPT_G = \cost^{T^*}(G) \ge \sum_{i = 1}^k\sum_{e \in E(G[P_i])}\cost^{T^*}(e) \ge \sum_{i = 1}^kOPT_{G[P_i]}.$$
		Denote by
		$$h(\alpha,k)=\left(\left(\log\left(\frac{1}{1 - \alpha}\right) + 1\right) +  \frac{2\alpha}{1 - \alpha}\left(1 + \log\frac{k}{1 - \alpha}\right)\right).$$
		We have
		\begin{eqnarray*}
			\cost^T(G) &=& \sum_{e \in E_1}\cost^T(e) + \sum_{e \in E_2}\cost^T(e) \\
			&\le& h(\alpha,k)\cdot\sum_{i = 1}^k\frac{\vol(G[P_i]) \cdot \log(\vol(G[P_i]))}{2} \\
			&\le& h(\alpha,k)\cdot\sum_{i = 1}^k\frac{\vol(G[P_i]) \cdot \log(\vol(G[P_i]))}{2 \Phi(G[P_i]) \cdot \frac{1}{3}\vol(G[P_i]) \cdot \log(\frac{2}{3}\vol(G[P_i]))}OPT_{G[P_i]} \\
			&\le& h(\alpha,k)\cdot \max_{i} \frac{3\log(\vol(G[P_i]))}{2\Phi(G[P_i]) \cdot \log(\frac{2}{3}\vol(G[P_i]))}\sum_{j = 1}^kOPT_{G[P_j]} \\
			&\le& h(\alpha,k)\cdot \max_{i} \frac{3\log(\vol(G[P_i]))}{2\Phi(G[P_i]) \cdot \log(\frac{2}{3}\vol(G[P_i]))}OPT_G \\
			&\le& h(\alpha,k)\cdot \frac{3}{2\beta\log(\frac{4}{3})}OPT_G.
		\end{eqnarray*}
		Lemma \ref{lem:well-clustered} follows.
	\end{proof}
	Since $h(\alpha,k)=O\left(\frac{1}{1-\alpha}\log\frac{k}{1-\alpha}\right)$, Theorem \ref{thm:well-clustered} follows.
\end{proof}





\section{Synthetic datasets and linkage-based methods} \label{supsec:synthetic_data}

In this section, we introduce the datasets generated from SBM \citep{ana2003robust} and HSBM \citep{lyzinski2017Community}, and the settings of linkage-based baseline algorithms.

\subsection{SBM datasets and linkage-based methods} \label{supsubsec:SBM}

SBM is a classic random graph model for two-level clustering structures. In this model, clusters are preset and two probabilities $p_1$ and $p_0$ for intra-cluster and inter-cluster connections of vertices, respectively, are required. Usually, $p_0<p_1$. For example, in Table \ref{tab:cost_balance} of the full paper, we preset five clusters with $p_0=0.1$ and $p_1=0.9$.

We compare our algorithm BBM on SBM graphs against four linkage-based methods \citep{cohen2019hierarchical}: average-linkage (AL), single-linkage (SL), complete-linkage (CL) and Linkage++ (L++). AL, SL and CL are agglomerative methods based on different greedy strategies. For two clusters $C_1$ and $C_2$ in a weighted graph (denote by $w$ the weight function on edges), their similarity $\text{sim}(C_1,C_2)$ is defined as:
\begin{itemize}
	\item[(1)] $\frac{1}{|C_1| |C_2|} \sum_{u\in C_1,v\in C_2} w(u,v)$ for AL;
	\item[(2)] $\max_{u\in C_1,v\in C_2} \{w(u,v)\}$ for SL;
	\item[(3)] $\min_{u\in C_1,v\in C_2} \{w(u,v)\}$ for CL.
\end{itemize}
These three algorithms have the same agglomerative framework. For a weighted graph of $n$ nodes, starting from $n$ singleton trees with labels $\{v_i\}$, $i=1,2,\ldots,n$, in each iteration, two trees $T_1$, $T_2$ with root labels $C_1$, $C_2$ that have maximum similarity are merged into a new tree, whose root label is $C_1\cup C_2$, until a single tree is left.

Note that in the unweighted (or uniformly weighted) graphs used in our experiments, SL and CL are likely to have a large number of ties on similarity to break in each iteration. An arbitrary tie-breaking strategy makes the resulting tree vary wildly. To break these ties in a mild fashion, we introduce a slight random perturbation on edge weights, that is, each edge weight is set to be $1+\varepsilon$ where $\varepsilon$ is chosen uniformly at random from the interval $[0,10^{-6}]$. Intuitively, a slight perturbation should not affect the construction of a cluster tree, and the output trees of SL and CL will not vary much. The balance indices and costs of resulting trees in our experiments are calculated on the original unweighted graphs.

Linkage++ \citep{cohen2019hierarchical} is proposed for unweighted graphs. It takes a positive integer $d$ as a parameter and projects vertices into a $d$-dimensional subspace of $\mathbb{R}^n$. The Euclidean-distance-based SL algorithm is invoked to construct $d$ clusters, and then based on a newly defined similarity (the same as that for AL) for these $d$ clusters, SL is invoked again to merge them into a single tree. In our experiments, the ground truth $d$ of each SBM graph is given as input for Linkage++.

\subsection{HSBM datasets}  \label{supsubsec:HSBM}

HSBM is a generalization of SBM to the hierarchical case. In this model, $k$ is set to be an intrinsic number of hierarchies of a graph. Let $\vec{p}=(p_0,p_1,\ldots,p_{k-1})$ be a probability vector, for which $p_i$ is the probability of generating edges for vertex pairs whose LCA on the ground-truth cluster tree has depth $i$. The $0$-depth node is the root, and typically, $p_{i-1}\leq p_i$ for each $i$.

Our experiments for HCSE utilize $k$-level HSBM with $k=4$ and $5$. We set the graph sizes for $k=4,5$ to be $2500$ and $6000$, respectively. For simplicity, in each ground-truth cluster tree, the number of internal nodes on each level is fixed. On each level, we group all the tree nodes uniformly and randomly into a fixed number of groups, and then allocate each group accordingly to the tree nodes on the next upper level. For $k=4$, we set the tree node numbers from leaves to the root to be $[2500, 250, 25, 5, 1]$, and for $k=5$, to be $[6000, 600, 125, 25, 5, 1]$.



\section{Additional experimental results} \label{supsec:add_exp_results}

In this section, we introduce the balance indices and additional results on binary and non-binary clustering algorithms.

\subsection{More cost and balancedness results for binary clustering on SBM graphs} \label{supsubsec:BBM_results}

We conduct more experiments on cost and balancedness on SBM graphs. Following the settings of Table \ref{tab:cost_balance} of the full paper, we choose the two strategies ``Biased'' and ``Random'' for generating cluster sizes for SBM. For the Biased strategy, five clusters are generated, and their sizes are set to be $[32,32,64,128,256]$ with $\pm 5$ random perturbation on each size. The probability of presence for intra-cluster edges is also $p_1=0.9$, but compared to $p_0=0.1$ for inter-cluster edges in the full paper, we choose $p_0=0.2$ and $0.01$, respectively. The results are shown in Table \ref{suptab:cost_balance_Biased}.

\begin{table}[!htbp]
	\centering
	\caption{Cost and balancedness results on SBM graphs for the Biased strategy. All values are averaged over $100$ rounds of trials.}
	\label{suptab:cost_balance_Biased}
	\renewcommand\arraystretch{1.2}
	\tabcolsep=2.9pt
	\scalebox{1}{
		\begin{tabular}{c|ccccc|ccccc}
			\toprule[1.5pt]
			%\cmidrule(r){1-4}
			& \multicolumn{5}{|c|}{$(p_1,p_0)=(0.9, 0.2)$}	& \multicolumn{5}{|c}{$(p_1,p_0)=(0.9, 0.01)$} \\
			Method & $\text{B}_{\text{size}}$ & $\text{B}_{\text{vol}}$ & $\text{B}_{\text{dep}}$ & cost(Das) & cost(SE) & $\text{B}_{\text{size}}$ & $\text{B}_{\text{vol}}$ & $\text{B}_{\text{dep}}$ & cost(Das) & cost(SE) \\
			\hline
			AL & 0.663 & 0.702 & 8.70 & 1.51E7 & 8.75E5 & 0.351 & 0.407 & 2.33 & 6.21E6 & 5.73E5 \\
			%\hline
			SL & 0.217 & 0.225 & 1.29 & 1.92E7 & 8.92E5 & 0.244 & 0.251 & 1.55 & 1.37E7 & 6.15E5 \\
			%\hline
			CL & 0.975 & 0.976 & 60.2 & 1.81E7 & 9.02E5 & 0.972 & 0.976 & 60.1 & 9.59E6 & 6.13E5 \\
			%\hline
			L++ & 0.894 & 0.913 & 29.7 & \textbf{1.32E7} & 8.65E5 & 0.603 & 0.655 & 5.05 & \textbf{6.12E6} & 5.74E5 \\
			%\hline
			BBM & \textbf{0.0146} & \textbf{0.0556} & \textbf{0.178} & 1.33E7 & \textbf{8.55E5} & \textbf{0.0148} & \textbf{0.0148} & \textbf{0.181} & 6.13E6 & \textbf{5.72E5} \\
			\bottomrule[1.5pt]
		\end{tabular}
	}
\end{table}

Our algorithm BBM always achieves the best cost(SE) and balance indices compared with the baselines, while remaining competitive with (quite close to) the cost(Das) of Linkage++.
Combined with the results in Table \ref{tab:cost_balance} of the full paper, it can be observed that the costs and balance indices are better for SBM graphs with more obvious clustering structure.

For the Random strategy, we choose different cluster numbers $10$ and $15$ while adjusting the inter-link probabilities accordingly to $p_0=0.01$ and $0.005$, respectively. The size of each cluster is generated uniformly at random from the integers in the interval $[2^4,2^6)$ as we do in the full paper. The probability of presence for intra-cluster edges is $p_1=0.9$. The results are shown in Table \ref{suptab:cost_balance_Random}.

\begin{table}[!htbp]
	\centering
	\caption{Cost and balancedness results on SBM graphs for the Random strategy. All values are averaged over $100$ rounds of trials.}
	\label{suptab:cost_balance_Random}
	\renewcommand\arraystretch{1.2}
	\tabcolsep=2.9pt
	\scalebox{1}{
		\begin{tabular}{c|ccccc|ccccc}
			\toprule[1.5pt]
			%\cmidrule(r){1-4}
			& \multicolumn{5}{|c|}{block num = $10$, $(p_1,p_0)=(0.9, 0.01)$}	& \multicolumn{5}{|c}{block num = $15$, $(p_1,p_0)=(0.9, 0.005)$} \\
			Method & $\text{B}_{\text{size}}$ & $\text{B}_{\text{vol}}$ & $\text{B}_{\text{dep}}$ & cost(Das) & cost(SE) & $\text{B}_{\text{size}}$ & $\text{B}_{\text{vol}}$ & $\text{B}_{\text{dep}}$ & cost(Das) & cost(SE) \\
			\hline
			AL & 0.321 & 0.351 & 2.09 & 4.71E5 & 8.27E4 & 0.333 & 0.362 & 2.28 & 8.87E5 & 1.37E5 \\
			%\hline
			SL & 0.337 & 0.346 & 2.36 & 2.04E6 & 1.02E5 & 0.382 & 0.390 & 2.76 & 5.28E6 & 1.76E5 \\
			%\hline
			CL & 0.957 & 0.959 & 37.6 & 1.63E6 & 9.90E4 & 0.969 & 0.970 & 50.6 & 3.83E6 & 1.67E5 \\
			%\hline
			L++ & 0.725 & 0.735 & 7.10 & \textbf{4.16E5} & 8.26E4 & 0.769 & 0.779 & 9.12 & \textbf{7.74E5} & 1.37E5 \\
			%\hline
			BBM & \textbf{0.108} & \textbf{0.0866} & \textbf{0.723} & 4.24E5 & \textbf{8.19E4} & \textbf{0.102} & \textbf{0.0749} & \textbf{0.724} & 7.99E5 & \textbf{1.36E5} \\
			\bottomrule[1.5pt]
		\end{tabular}
	}
\end{table}

It can be observed that our algorithm BBM always achieves the best cost(SE) and balance indices compared with the baselines, while remaining competitive with the cost(Das) of Linkage++.



\subsection{NMI results for non-binary clustering on HSBM graphs} \label{supsubsec:non_binary_results}

The NMI results for our non-binary clustering algorithm HCSE on HSBM graphs are summarized in Table \ref{tab:HSBM_NMI}.

\begin{table*}[!htbp]
	\centering
	\caption{The NMI results for $k=4,5$ respectively. Each group picks three probability vectors. The probability $p_{k-1}=0.9$ for the innermost hierarchy is default and omitted from the table. $p_{k-2}, \ldots, p_0$ of each vector are listed sequentially. Those choices of $\vec{p}$ guarantee that the clusters on each level are clear and the generated graphs are connected with high probability. For each $k$, $\vec{p}$ makes the clustering structure become increasingly clear one by one. ``$-$'' means that the algorithm does not find the corresponding level. HCSE and all baselines have no hyper-parameters.}
	\label{tab:HSBM_NMI}
	\renewcommand\arraystretch{1.1}
	\tabcolsep=4pt
	\scalebox{1}{
		\begin{tabular}{ccccccc|ccccccc}
			%\caption{aaa}
			\toprule[1.5pt]
			%\cmidrule(r){1-4}
			\multicolumn{7}{c|}{($k=4$)} & \multicolumn{7}{c}{($k=5$)} \\
			$\vec{p}$  		& HC$_\text{ave}$ 	& HC$_\text{sin}$ 	& HC$_\text{com}$  	& HLP  		& LOU  					& HCSE & $\vec{p}$  		& HC$_\text{ave}$ 	& HC$_\text{sin}$ 	& HC$_\text{com}$  	& HLP  		& LOU  					& HCSE  \\
			\midrule
			4.5E(-2) & 0.904 & 0.821 & 0.815 & 0.899 & \textbf{0.997} & 0.902 & 1.5E(-2) & 0.927 & 0.857 & 0.863 & 0.993 & \textbf{1.00} & 0.957 \\
			1.5E(-3) & 0.958 & 0.611 & 0.642 & 0.792 & \textbf{1.00} & 0.825 & 1.5E(-3) & 0.843 & 0.731 & 0.743 & 0.703 & \textbf{0.961}& 0.857 \\
			6.0E(-6) & $-$ & 0.557 & 0.894 & 0.598 & $-$ & \textbf{1.00} & 7.5E(-5) & 0.875 & 0.593 & 0.601 & $-$ & 0.877 & \textbf{0.885} \\
			& & & & & & & 1.0E(-6) & $-$ & 0.438 & 0.523 & $-$ & $-$ & \textbf{0.984} \\
			\midrule
			5.5E(-2) & 0.876 & 0.819 & 0.816 & 0.864 & \textbf{0.994} & 0.886 & 4.5E(-2) & 0.940 & 0.846 & 0.849 & 0.962 & \textbf{0.999} & 0.946 \\
			1.5E(-3) & 0.914 & 0.595 & 0.606 & 0.799 & \textbf{1.00} & 0.860 & 4.5E(-3) & 0.915 & 0.723 & 0.748 & 0.822 & \textbf{0.951} & 0.892 \\
			4.0E(-6) & $-$ & 0.657 & 0.840 & 0.653 & $-$ & \textbf{1.00} & 7.5E(-5) & 0.925 & 0.639 & 0.648 & 0.771 & 0.903 & \textbf{0.948} \\
			& & & & & & & 6.0E(-7) & $-$ & 0.508 & 0.542 & $-$ & $-$ & \textbf{0.959} \\
			\midrule
			4.5E(-2) & 0.908 & 0.823 & 0.824 & 0.879 & \textbf{0.978} & 0.901 & 1.5E(-2) & 0.917 & 0.857 & 0.866 & 0.994 & \textbf{0.999} & 0.935 \\
			7.5E(-4) & 0.947 & 0.630 & 0.652 & 0.824 & \textbf{1.00} & 0.977 & 1.5E(-3) & 0.813 & 0.735 & 0.757 & 0.879 & \textbf{0.958} & 0.860 \\
			2.5E(-6) & 0.656 & 0.574 & 0.718 & 0.677 & $-$ & \textbf{1.00} & 5.0E(-5) & 0.872 & 0.631 & 0.671 & 0.841 & 0.877 & \textbf{0.915} \\
			& & & & & & & 3.0E(-7) & 0.762 & 0.686 & 0.761 & 0.786 & $-$ & \textbf{0.953} \\
			\bottomrule[1.5pt]
		\end{tabular}
	}
\end{table*}

Our experiments demonstrate that HCSE outperforms those algorithms that incorporate other efficient and non-parametric linkage-based binary clustering methods into our framework. This demonstrates the effectiveness of our new cost function. HCSE also outperforms the representative heuristic non-parametric algorithms LOUVAIN \citep{blondel2008fast} and HLP \citep{rossi2020fast}. These two algorithms proceed simply by recursively invoking flat clustering algorithms based on modularity and label propagation, respectively, and the hierarchy number is solely determined by the number of rounds when the algorithm terminates. They follow the main framework of present non-binary clustering algorithms whose interpretability is poor. Our experimental results show that HCSE has a great advantage in both finding the intrinsic number of hierarchies and reconstructions.



\subsection{Ablation for cost(SE)} \label{supsubsec:ablation}

For ablation, we replace cost(SE) with cost(Das) in the stretch and compress processes, respectively. We define Das-Das to be the non-binary clustering algorithm that replaces the structural entropy cost used in both stretch and compress processes with cost(Das), Das-SE the one that replaces it with cost(Das) in stretch only, and SE-Das the one that replaces it with cost(Das) in compress only. In these settings, HCSE is in fact SE-SE. The NMI results for $k=4,5$ with the same settings of probability vectors as in Table \ref{tab:HSBM_NMI} are shown in Table \ref{tab:ablation_NMI}.

\begin{table*}[!htbp]
	\centering
	\caption{The ablation NMI results for $k=4,5$ respectively.}
	\label{tab:ablation_NMI}
	\renewcommand\arraystretch{1.1}
	\tabcolsep=4pt
	\scalebox{1}{
		\begin{tabular}{ccccc|ccccc}
			%\caption{aaa}
			\toprule[1.5pt]
			%\cmidrule(r){1-4}
			\multicolumn{5}{c|}{($k=4$)} & \multicolumn{5}{c}{($k=5$)} \\
			$\vec{p}$  		& Das-Das	& Das-SE 	& SE-Das	& HCSE & $\vec{p}$  		& Das-Das	& Das-SE 	& SE-Das	& HCSE  \\
			\midrule
			4.5E(-2) & 0.795 & 0.813 & 0.825 & \textbf{0.902} & 1.5E(-2) & 0.843 & 0.845 & 0.928 & \textbf{0.957} \\
			1.5E(-3) & 0.579 & 0.593 & 0.718 & \textbf{0.825} & 1.5E(-3) & 0.710 & 0.713 & 0.845 & \textbf{0.857} \\
			6.0E(-6) & 0.287 & 0.382 & \textbf{1.00} & \textbf{1.00} & 7.5E(-5) & 0.544 & 0.551 & 0.876 & \textbf{0.885} \\
			& & & & & 1.0E(-6) & 0.317 & 0.353 & 0.965 & \textbf{0.984} \\
			\midrule
			5.5E(-2) & 0.814 & 0.823 & 0.808 & \textbf{0.886} & 4.5E(-2) & 0.832 & 0.847 & 0.885 & \textbf{0.946} \\
			1.5E(-3) & 0.587 & 0.584 & 0.758 & \textbf{0.860} & 4.5E(-3) & 0.706 & 0.714 & 0.833 & \textbf{0.892} \\
			4.0E(-6) & 0.395 & 0.369 & \textbf{1.00} & \textbf{1.00} & 7.5E(-5) & 0.563 & 0.553 & 0.932 & \textbf{0.948} \\
			& & & & & 6.0E(-7) & 0.443 & 0.368 & 0.943 & \textbf{0.959} \\
			\midrule
			4.5E(-2) & 0.814 & 0.821 & 0.830 & \textbf{0.901} & 1.5E(-2) & 0.839 & 0.846 & 0.934 & \textbf{0.935} \\
			7.5E(-4) & 0.594 & 0.591 & 0.938 & \textbf{0.977} & 1.5E(-3) & 0.710 & 0.713 & \textbf{0.872} & 0.860 \\
			2.5E(-6) & 0.403 & 0.396 & \textbf{1.00} & \textbf{1.00} & 5.0E(-5) & 0.555 & 0.557 & \textbf{0.925} & 0.915 \\
			& & & & & 3.0E(-7) & 0.447 & 0.442 & 0.949 & \textbf{0.953} \\
			\bottomrule[1.5pt]
		\end{tabular}
	}
\end{table*}

It can be observed that, for almost all settings of probability vectors, HCSE outperforms all other strategies whose stretch or compress process uses cost(Das). When structural entropy is kept in stretch, the NMIs of HCSE and SE-Das are much better than those of Das-Das and Das-SE, which use cost(Das) in stretch. This implies that structural entropy consistently has an advantage over cost(Das) in binary hierarchical clustering, because the structural-entropy-based stretch process reconstructs on the binary cluster tree more precise intermediate ground-truth clusters, which can then be preserved after compress.


\subsection{Inflection points} \label{supsubsec:inflection_points}

We show in Figure~\ref{fig:inflection_points} the inflection points of HCSE for the three probability vectors with $k=4$ in Table~\ref{tab:HSBM_NMI} of the full paper. For each graph, $\delta_t(\mathcal{H})$ has its sharpest drop after the ground-truth sparsest levels have been stratified at $t=4$, which terminates the stratification of HCSE. This verifies the soundness of our stratification strategy for finding the intrinsic number of hierarchy levels.


\begin{figure}
	\centering
	\includegraphics[width=4in]{figures/inflection_points_4.png}
	\caption{$\delta_t(\mathcal{H})$ variations for HCSE and $k=4$. The inflection points for all probability vectors appear correctly at $t=4$.}
	\label{fig:inflection_points}
\end{figure}



\subsection{Jaccard index results on Amazon network} \label{supsubsec:Jaccard_Amazon}

We conduct our non-binary clustering experiments on the Amazon network\footnote{\url{http://snap.stanford.edu/data/}}, for which the set of ground-truth clusters is given. These clusters overlap heavily and vary widely in size, and we suppose that they are parts of a hierarchical cluster tree.
We evaluate the resulting cluster trees by the Jaccard index. For two sets $A,B$, their \emph{Jaccard index} is defined as $J(A,B)=|A\cap B|/|A\cup B|$. We pick the largest cluster, which is a subgraph with $58283$ vertices and $133178$ edges. For each ground-truth cluster $c$ that appears in this subgraph, we find in the resulting cluster tree an internal node that has maximum Jaccard index with $c$. Then we calculate the average Jaccard index $\overline{J}$ over all such $c$. The scores are summarized in Table~\ref{tab:Amazon}.

\begin{table}[htbp]
	\caption{Comparison of the average Jaccard index ($\overline{J}$). For the two baselines HLP and LOUVAIN, we report the best result over five runs.}
	\label{tab:Amazon}
	\centering
	\begin{tabular}{cccc}
		\toprule
		%\cmidrule(r){1-3}
		index     & HCSE     & HLP      & LOUVAIN  \\
		\midrule
		$\overline{J}$ & \textbf{0.20}  & 0.16      & 0.17\\
		\bottomrule
	\end{tabular}
\end{table}






\section{Supplementary figures and pseudocodes} \label{supsec:figures_pseudocodes}

We list all the supplementary figures and pseudocodes in this section.

\begin{figure}[!h]	
	\centering
	\includegraphics[scale=0.5]{figures/stretch_compress.jpg}
	\caption{Illustrations of stretch and compress for a $u$-triangle. A binary cluster tree is constructed first by stretch, and then edge $e$ is compressed, which yields a non-binary tree.}
	\label{fig:stretch_compress}
\end{figure}

\begin{figure}[!h]
	\begin{center}	
		\subfigure[]{
			\begin{minipage}[t]{0.5\linewidth}
				\centering
				\includegraphics[width=6cm]{figures/up.jpg}
				\label{fig:up}
				%\caption{fig1}
			\end{minipage}%
		}%
		\subfigure[]{
			\begin{minipage}[t]{0.5\linewidth}
				\centering
				\includegraphics[width=6cm]{figures/below.jpg}
				\label{fig:below}
				%\caption{fig2}
			\end{minipage}%
		}%	
		\caption{Illustrations of stratification for a $2$-level cluster tree. The preference between (a) and (b) depends on the average sparsity of triangles at each level.}
		\label{fig:stratify}
	\end{center}
\end{figure}



% \begin{algorithm}
	% 	\caption{$k$-Hierarchical clustering based on structural entropy ($k$-HCSE)}
	% 	\label{alg:k-HCSE}
	% 	\KwIn{a graph $G = (V,E)$, $k\in\mathbb{Z}^+$}
	% 	\KwOut{a $k$-level cluster tree $T$}
	% 	Initialize $T$ to be the $1$-level cluster tree\;
	% 	$h=\text{height(T)}$\;
	% 	\While{$h<k$}{
		% 		$j' \gets \arg\max_{j} \{\overline{\text{Spar}}_{j}(T)\}$; \quad // Find the sparsest level of $T$ (breaking ties arbitraily)\;
		% 		\If{$\overline{\text{Spar}}_{j'}(T)=0$}{
			% 			break;	\quad // No cost will be saved by any further clustering\;
			% 		}
		% 		\For{$u\in U_{j'}$}{
			% 			$T_u \gets$ Stretch($u$-triangle $T_u$)\;
			% 			Compress($T_u$)\;
			% 		}
		% 		$h \gets h+1$\;
		% 		\For{$j\in [j'+1,h]$}{
			% 			Update $U_j$\;
			% 		}
		% 	}
	% 	return $T$
	% \end{algorithm}

\begin{algorithm}[!h]
	\caption{$k$-Hierarchical clustering based on structural entropy ($k$-HCSE)}
	\label{alg:k-HCSE}
	\begin{algorithmic}
		\State {\bfseries Input:} a graph $G = (V,E)$, $k\in\mathbb{Z}^+$.
		\State {\bfseries Output:} a $k$-level cluster tree $T$.
		\State Initialize $T$ to be the $1$-level cluster tree.
		\State $h=\text{height}(T)$.
		\While{$h<k$}
		\State // Find the sparsest level of $T$ (breaking ties arbitrarily).
		\State $j' \gets \arg\max_{j} \{\overline{\text{Spar}}_{j}(T)\}$.
		\If{$\overline{\text{Spar}}_{j'}(T)=0$}
		\State // No cost will be saved by any further clustering.
		\State break.
		\EndIf		
		\For{$u\in U_{j'}$}
		\State $T_u \gets$ Stretch($u$-triangle $T_u$).
		\State Compress($T_u$).
		\EndFor
		\State $h \gets h+1$.
		\For{$j\in [j'+1,h]$}
		\State Update $U_j$.
		\EndFor
		\EndWhile
		\State {\bfseries return} $T$.
	\end{algorithmic}
\end{algorithm}

% \begin{algorithm}
	% 	\caption{Hierarchical clustering based on structural entropy (HCSE)}
	% 	\label{alg:HCSE}
	% 	\KwIn{a graph $G = (V,E)$}
	% 	\KwOut{a cluster tree $T$}
	% 	$t \gets 2$\;
	% 	\While{$\Delta_t \delta < \Delta_{t-1} \delta$ or $\Delta_t \delta < \Delta_{t+1} \delta$}{
		% 		\If{$\max_{j} \{\overline{\text{Spar}}_{j}(T)\}$=0}{
			% 			break\;
			% 		}
		% 		$t \gets t+1$\;
		% 	}
	% 	return $t$-HCSE$(T)$
	% \end{algorithm}

\begin{algorithm}[!h]
	\caption{Hierarchical clustering based on structural entropy (HCSE)}
	\label{alg:HCSE}
	\begin{algorithmic}
		\State {\bfseries Input:} a graph $G = (V,E)$.
		\State {\bfseries Output:} a cluster tree $T$.
		\State $t \gets 2$.
		\While{$\Delta_t \delta < \Delta_{t-1} \delta$ or $\Delta_t \delta < \Delta_{t+1} \delta$}
		\If{$\max_{j} \{\overline{\text{Spar}}_{j}(T)\}=0$}
		\State break.
		\EndIf
		\State $t \gets t+1$.
		\EndWhile
		\State {\bfseries return} $t$-HCSE$(G)$.
	\end{algorithmic}
\end{algorithm}


\end{document}
