% \documentclass{uai2023} % for initial submission
\documentclass[accepted]{uai2023} % after acceptance, for a revised
% version; also before submission to
                                    % see how the non-anonymous paper
                                    % would look like
%% There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2023} % ptmx math instead of Computer
                                         % Modern (has noticable issues)
% \documentclass[mathfont=newtx]{uai2023} % newtx fonts (improves upon
                                          % ptmx; less tested, no support)
% NOTE: Only keep *one* line above as appropriate, as it will be replaced
%       automatically for papers to be published. Do not make any other
%       change above this note for an accepted version.

%% Choose your variant of English; be consistent
\usepackage[american]{babel}
% \usepackage[british]{babel}

%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}

\usepackage{mathtools} % amsmath with fixes and additions
% \usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{tikz} % nice language for creating drawings and diagrams

% \usepackage{algorithm}
% \usepackage{algpseudocode}
% \usepackage{algorithmic}

% \usepackage{mathtools} % amsmath with fixes and additions
\usepackage{amsfonts}
% \usepackage{booktabs} % commands to create good-looking tables
% \usepackage{tikz} % nice language for creating drawings and diagrams

\usepackage{float}

\usepackage{subfigure}

%\newcommand{\theHalgorithm}{\arabic{algorithm}}

% Fixed-size font switches. Each one sets the font size and the baseline
% skip to the same value (in pt) and activates it with \selectfont.
% The `%' after each opening brace keeps the line end from becoming a
% space token inside the definition (which would be typeset as a spurious
% space whenever a switch is used in running text).
\newcommand{\csize}{%
  \fontsize{8}{8}\selectfont
}

\newcommand{\csizenine}{%
  \fontsize{9}{9}\selectfont
}

% proofof{<what>}: run-in proof block. Typesets a bold "Proof of <what>."
% lead-in, then the body, and closes with a right-flushed \Box.
% \textbf replaces the obsolete {\bf ...} font switch; \ignorespaces drops
% the space produced by the line end after \begin{proofof}{...}.
% NOTE(review): \Box is not a kernel symbol; presumably it is supplied by
% amsfonts (loaded in this preamble) -- confirm if the package list changes.
\newenvironment{proofof}[1]{\textbf{Proof of #1.}\enspace\ignorespaces}{\hfill$\Box$}

\newcommand{\csizenineplus}{%
  \fontsize{9.5}{9.5}\selectfont
}

\newcommand{\csizeten}{%
  \fontsize{10}{10}\selectfont
}

\newcommand{\tabsize}{%
  \fontsize{7}{7}\selectfont
}

% Calligraphic-letter shorthands: \cX expands to {\mathcal{X}} (math mode only).
% \bP is the exception in this group: it is bold (\mathbf), not calligraphic.
\newcommand{\cA}{{\mathcal{A}}}
\newcommand{\cB}{{\mathcal{B}}}
\newcommand{\cC}{{\mathcal{C}}}
\newcommand{\cD}{{\mathcal{D}}}
\newcommand{\cG}{{\mathcal{G}}}
\newcommand{\cI}{{\mathcal{I}}}
\newcommand{\cN}{{\mathcal{N}}}
\newcommand{\cM}{{\mathcal{M}}}
\newcommand{\cO}{{\mathcal{O}}}
\newcommand{\cP}{{\mathcal{P}}}
\newcommand{\cQ}{{\mathcal{Q}}}
\newcommand{\bP}{{\mathbf{P}}}
\newcommand{\cR}{{\mathcal{R}}}
\newcommand{\cS}{{\mathcal{S}}}
\newcommand{\cH}{{\mathcal{H}}}
\newcommand{\cK}{{\mathcal{K}}}
\newcommand{\cT}{{\mathcal{T}}}
\newcommand{\cU}{{\mathcal{U}}}
\newcommand{\cV}{{\mathcal{V}}}
\newcommand{\cY}{{\mathcal{Y}}}
\newcommand{\cZ}{{\mathcal{Z}}}
% \newsetminus: a minus sign with a negative thin space (\!) on each side,
% i.e. a tightly set "-"; \cVmA composes it as \cV - \cA.
\newcommand{\newsetminus}{{\!-\!}}
\newcommand{\cVmA}{{\cV\newsetminus\cA}}
\newcommand{\cX}{{\mathcal{X}}}
% \cs is the plain letter s; \cVms is \cV - s.
% NOTE(review): \cVms uses an ordinary "-" rather than \newsetminus, so its
% spacing differs from \cVmA -- confirm whether that is intended.
\newcommand{\cs}{s}
\newcommand{\cVms}{{\cV-\cs}}

% Bold upright lowercase letters (\mathbf).
\newcommand{\ba}{{\mathbf{a}}}
\newcommand{\bb}{{\mathbf{b}}}
\newcommand{\bu}{{\mathbf{u}}}
\newcommand{\bx}{{\mathbf{x}}}
% \resid is an alias for \cR (calligraphic R).
\newcommand{\resid}{\cR}

% Bold upright "NP".
\newcommand{\NP}{{\mathbf{NP}}}

% \DeclareMathOperator{\MIF}{MI} 

% One-argument bold wrappers: \bs uses \boldsymbol, \mb uses \mathbf.
\newcommand{\bs}[1]{\boldsymbol{#1}}
\newcommand{\mb}[1]{\mathbf{#1}}

% \mhk: calligraphic M with superscript h and subscript k.
\newcommand{\mhk}{\cM^h_k}

% Cross-reference helpers. Each takes a label name and prints a fixed prefix,
% a non-breaking space (~), and \ref{<label>}.
% Note that \eqnref uses plain \ref, so the equation number is printed
% without parentheses (unlike \eqref).
\newcommand{\thmref}[1]{Theorem~\ref{#1}}
\newcommand{\tabref}[1]{Table~\ref{#1}}
\newcommand{\figref}[1]{Fig.~\ref{#1}}
\newcommand{\eqnref}[1]{Eq.~\ref{#1}}
\newcommand{\secref}[1]{Sec.~\ref{#1}}
\newcommand{\appref}[1]{Appendix~\ref{#1}}
\newcommand{\prcref}[1]{Procedure~\ref{#1}}
\newcommand{\assmref}[1]{Assumption~\ref{#1}}
\newcommand{\crlref}[1]{Corollary~\ref{#1}}
\newcommand{\algoref}[1]{Alg.~\ref{#1}}
\newcommand{\prpref}[1]{Proposition~\ref{#1}}
\newcommand{\cnjref}[1]{Conjecture~\ref{#1}}
\newcommand{\axmref}[1]{Axiom~\ref{#1}}
\newcommand{\lmaref}[1]{Lemma~\ref{#1}}

% Theorem-like environments.
% lemma and theorem each get their own counter.
\newtheorem{lemma}{Lemma}
\newtheorem{theorem}{Theorem}
% The following environments are numbered on lemma's counter (optional
% argument [lemma]), so e.g. a corollary after Lemma 1 is Corollary 2.
\newtheorem{corollary}[lemma]{Corollary}
\newtheorem{procedure}[lemma]{Procedure}
\newtheorem{assumption}[lemma]{Assumption}
\newtheorem{claim}[lemma]{Claim}
\newtheorem{conclusion}[lemma]{Conclusion}
\newtheorem{proposition}[lemma]{Proposition}
\newtheorem{conjecture}[lemma]{Conjecture}
\newtheorem{axiom}[lemma]{Axiom}
\newtheorem{algo}[lemma]{Algorithm}
% definition and remark are numbered independently of everything above.
\newtheorem{definition}{Definition}
\newtheorem{remark}{Remark}

%% Provided macros
% \smaller: Because the class footnote size is essentially LaTeX's \small,
%           redefining \footnotesize, we provide the original \footnotesize
%           using this macro.
%           (Use only sparingly, e.g., in drawings, as it is quite small.)

%% Self-defined macros
% \newcommand{\swap}[3][-]{#3#1#2} % just an example

\title{Information Theoretic Clustering via Divergence Maximization among Clusters (Supplementary Material)}

% The standard author block has changed for UAI 2023 to provide
% more space for long author lists and allow for complex affiliations
%
% All author information is authomatically removed by the class for the
% anonymous submission version of your paper, so you can already add your
% information below.
%
% Add authors
\author[1,*]{Sahil Garg}
\author[1]{Mina Dalirrooyfard}
\author[1]{Anderson Schneider}
\author[1]{Yeshaya Adler}
\author[1]{Yuriy Nevmyvaka}
\author[1]{Yu Chen}
\author[1]{Fengpei Li}
\author[2]{Guillermo Cecchi}

% Add affiliations after the authors
\affil[1]{%
    Dept. of Machine Learning Research\\
    Morgan Stanley\\
    New York, New York, USA
}
\affil[2]{%
    IBM T. J. Watson Research Center\\
    Yorktown Heights, New York, USA
}

\affil[*]{
Corresponding Author: sahil.garg@morganstanley.com, sahil.garg.cs@gmail.com
}
  
\begin{document}

\onecolumn
\maketitle

\section{Proofs}
% 
\label{sec:proofs}

\subsection{}

\begin{proofof}{Theorem 1}
% 
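First we express $\cD(p||q)$ and $\cD(q||p)$ in terms of entropy, writing ${\cH}_p(q) = - \langle \log q(.) \rangle_p$ for the cross-entropy of $q$ relative to $p$. 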
% 
\begin{align}
% 
\cD(p||q) 
&= - \cH(p) - \langle \log q(.) \rangle_p \\
&= - \cH(p) + {\cH}_p(q)
% 
\end{align}
And similarly we have
\begin{align}
\cD(q||p) 
&= - \cH(q) - \langle \log p(.) \rangle_q \\
&= - \cH(q) + {\cH}_q(p)
\end{align}
    
So with $\lambda = \frac{p(y=q)}{p(y = p)} = \frac{n_q}{n_p}$ we have
    
\begin{align}
% 
& {\arg\!\max}_{p,q: \cH(\bs{\cX}) = c} \  \ \cD(p||q) + \lambda \cD(q||p) \\
% 
&= {\arg\!\max}_{p,q: \cH(\bs{\cX}) = c} \  \ - \cH(p) - \lambda \cH(q)  + \lambda \cH_q(p) + \cH_p(q) \\
% 
&= {\arg\!\max}_{p,q: \cH(\bs{\cX}) = c} \  \ - \cH(\bs{\cX}| y = p) - \lambda \cH(\bs{\cX}| y = q)  + \lambda \cH_q(p) + \cH_p(q) \\
% 
&= {\arg\!\max}_{p,q: \cH(\bs{\cX}) = c} \  \ - p(y = p) \cH(\bs{\cX}| y = p) - p(y = q) \cH(\bs{\cX}| y = q)  + p(y = q) \cH_q(p) + p(y = p) \cH_p(q) \\
% % 
&= {\arg\!\max}_{p,q: \cH(\bs{\cX}) = c} \  \ \cH(\bs{\cX}) - p(y = p) \cH(\bs{\cX}| y = p) - p(y = q) \cH(\bs{\cX}| y = q)  + p(y = q) \cH_q(p) + p(y = p) \cH_p(q) \\
% 
% &= {\arg\!\max}_{p,q: \cH(p,q) = c} \  \ \cI(\bs{\cX}:\cY) - p(y = p) \cH_q(p) - p(y = q) \cH_p(q) \\
% 
&= {\arg\!\max}_{p,q: \cH(\bs{\cX}) = c} \  \ \cI(\bs{\cX}:\cY) + \frac{n_q}{n} \cH_q(p) + \frac{n_p}{n} \cH_p(q)
% 
\end{align}
% 
\end{proofof}

\subsection{}

\begin{proofof}{Theorem 2}
The problem can be seen as, given a subset $S\subseteq S_{\mathcal X}$, the input space (i.e., for $x$ in a high dimensional space), we want to solve the optimization problem
\begin{equation*}
    \text{Opt1}(S)\vcentcolon= \max_{\substack{S_1 \cup S_2=S\\S_1\cap S_2=\emptyset}}\bigg[\max_{\substack{f\in C^1(S_\mathcal X)\\ a\leq f\leq b}} \bigg[\frac{\sum_{x\in S_1}f(x)}{|S_1|}-\log \frac{\sum_{x\in S_2}e^{f(x)}}{|S_2|}\bigg]+\max_{\substack{g\in C^1(S_\mathcal X)\\ a\leq g\leq b}} \bigg[\frac{\sum_{x\in S_2}g(x)}{|S_2|}-\log \frac{\sum_{x\in S_1}e^{g(x)}}{|S_1|}\bigg]\bigg].
\end{equation*}

For any given $f$, by Jensen's inequality, we have
\begin{equation}\label{Jesen}
     \frac{\sum_{x\in S_2}e^{f(x)}}{|S_2|} \geq \exp\big(\frac{\sum_{x\in S_2}f(x)}{|S_2|}\big),
\end{equation}
so we have 
\begin{align}\label{optimalab}
   \max_{\substack{S_1 \cup S_2=S\\S_1\cap S_2=\emptyset}}\max_{\substack{f\in C^1(S_\mathcal X)\\ a\leq f\leq b}} \bigg[\frac{\sum_{x\in S_1}f(x)}{|S_1|}-\log \frac{\sum_{x\in S_2}e^{f(x)}}{|S_2|}\bigg]\leq &\max_{\substack{S_1 \cup S_2=S\\S_1\cap S_2=\emptyset}}\max_{\substack{f\in C^1(\mathcal X)\\ a\leq f \leq b}} \bigg[\frac{\sum_{x\in S_1}f(x)}{|S_1|}-\frac{\sum_{x\in S_2}f(x)}{|S_2|}\bigg]\nonumber\\
   \leq &b-a.
    \end{align}
   
   Thus, an upper bound for $\text{Opt1}(S)$ is simply $2(b-a)$. However, by letting 
   \begin{equation}\label{optimalfg}
   f= \begin{cases}
b, \text{ for $x\in S_1$}\\
a, \text{ for $x\in S_2$}
\end{cases}
\text{ and \quad }
 g= \begin{cases}
a, \text{ for $x\in S_1$}\\
b, \text{ for $x\in S_2$}
\end{cases},
\end{equation}
    we would achieve the optimal value $2(b-a)$, since \eqref{Jesen} is tight for such $f,g$. Thus, the optimal value of $\text{Opt1}(S)$ is exactly $2(b-a).$
    
Now, if there exist two clusters $S_1$, $S_2$ with  
\begin{align*}
    d_{\text{between}} \vcentcolon=& \min_{x_1\in S_1, x_2\in S_2}\|x_1-x_2\| \nonumber\\
    d_i\vcentcolon=& \max_{x\in S_i, x'\in S_i}\|x-x'\|\nonumber\\
    d_{\text{within}} \vcentcolon=& \max(d_1,d_2) 
\end{align*}
 that satisfy $d_{\text{between}}>d_{\text{within}}$, then for $L$ that satisfies
\begin{equation*}
    \frac{b-a}{d_{\text{between}}} < L < \frac{b-a}{d_{\text{within}}},
\end{equation*}
we claim \eqref{optimalfg} is the unique optimal solution to 

\begin{align*}
    \text{Opt4}(S)
   \vcentcolon= &\max_{\substack{S_1 \cup S_2=S\\S_1\cap S_2=\emptyset}}\bigg[\max_{\substack{f\in C^1(\mathcal X)\\ a\leq f\leq b\\f \text{ is $L$-Lipschitz}}} \bigg[\frac{\sum_{x\in S_1}f(x)}{|S_1|}-\log \frac{\sum_{x\in S_2}e^{f(x)}}{|S_2|}\bigg]+\max_{\substack{g\in C^1(\mathcal X)\\ a\leq g\leq b\\g \text{ is $L$-Lipschitz}}} \bigg[\frac{\sum_{x\in S_2}g(x)}{|S_2|}-\log \frac{\sum_{x\in S_1}e^{g(x)}}{|S_1|}\bigg]\bigg].
\end{align*}
To see this, first note that solution \eqref{optimalfg} is feasible because $L>\frac{b-a}{d_{\text{between}}}$: any two points on which $f$ (or $g$) takes different values lie in different clusters, so they are at distance at least $d_{\text{between}}$, and $b-a< L\, d_{\text{between}}$.
     This solution is optimal because it achieves value $2(b-a)$ which is the upper bound (best possible) of the objective value. 
     This solution is unique because, in order to achieve the objective value $2(b-a)$, the inequalities in \eqref{Jesen} and \eqref{optimalab} must be tight, which means that $f$ must take the value $b$ on one set of points and $a$ on the other. One can check that if some $x\in S_1$ has $f(x)\neq b$, then either $f(x)\neq a$, which is sub-optimal, or $f(x)=a$, which is infeasible because $L<\frac{b-a}{d_{\text{within}}}$ (two points of the same cluster are at distance at most $d_{\text{within}}$, so their $f$ values differ by at most $L\, d_{\text{within}}<b-a$). Thus we must have $f(x)=b$ for all $x\in S_1$. Similarly, $f(x)=a$ for all $x\in S_2$. The same argument applies to $g$ with the roles of $a$ and $b$ exchanged. Thus the optimal solution is unique.
\end{proofof}

\textbf{Reminder of Theorem 3}\textit{
Consider a DV function $f$ and the associated DV representation of the data. 
Then the clusters that maximize $\hat{D}_f(P||Q)$ form contiguous clusters on the DV representation of the data points: there is a cut point $c$ such that $i\in P$ if $f(\mathbf{x}_i)\ge c$ and $i\in Q$ if $f(\mathbf{x}_i)<c$.
% 
}

\begin{proofof}{Theorem 3}
% 
Recall that:
\begin{equation}\label{eq:kl-d-fixed-f-ex}
\hat{D}_f(P||Q)
=
\frac{1}{n_P}\sum_{z \in P}f(z)-\log{\frac{1}{n_Q}\sum_{z' \in Q}\exp f(z')}.
\end{equation}
% 
Consider the clusters $P$ and $Q$ that maximize Equation~\eqref{eq:kl-d-fixed-f-ex} (possibly different from the initial clusters that are used to define the DV function $f$). Suppose that there are data points $z\in P$ and $z'\in Q$ such that $f(z) < f(z')$. We show that swapping the cluster memberships of $z$ and $z'$ increases Equation~\eqref{eq:kl-d-fixed-f-ex}, which is a contradiction (see Figure~\ref{fig:proof}). Note that the sizes of the clusters do not change. Since $f(z)<f(z')$, after swapping $z$ and $z'$ the first term in Equation~\eqref{eq:kl-d-fixed-f-ex} increases, and the log-sum-exp in the second term (which is negated) decreases.
So there are no such $z$ and $z'$, which means that the clusters must be contiguous. Note that if we consider the approximation of \eqref{eq:kl-d-fixed-f-ex}, namely $\hat{D}_f(P||Q)\approx\frac{1}{n_P}\sum_{z\in P}f(z)-\max_{z'\in Q} f(z') +\log n_Q$, the same proof works. 
%
%Let the sorted data points be $f(\mathbf{x}_1)\le\ldots\le f(\mathbf{x}_n)$, where $n$ is the total number of data points. Consider the optimal clustering of the points obtained by maximizing $z(P,Q)=\frac{1}{n_p} \sum_{i \in P} f(\mathbf{x}_i) - \log \frac{1}{n_q} \sum_{j \in Q} \exp {f(\mathbf{x}_j)}$. Suppose that there are $i\in P$ and $j\in Q$ such that $f(\mathbf{x}_i)<f(\mathbf{x}_j)$. We claim that swapping $i$ and $j$, i.e. adding $i$ to $Q$ and $j$ to $P$, would increase the objective. To see this, note that by swapping $i$ and $j$ the first term increases since $f(\mathbf{x}_i)$ is replaced by $f(\mathbf{x}_j)$. Similarly, the absolute value of the second term is decreased. So since $n_p$ and $n_q$ doesn't change, the objective increases, and this contradict the optimality of the clustering. So there are no $i\in P$ and $j\in Q$ such that $f(\mathbf{x}_i)<f(\mathbf{x}_j)$, which means there is a cut point $c$ where for all $i\in P$ we have $f(\mathbf{x}_i)>c$ and for all $j\in Q$ we have $f(\mathbf{x}_j)<c$.
%\frac{1}{n_p} \sum_{i \in P} f(\mathbf{x}_i) - \log \frac{1}{n_q} \sum_{j \in Q} \exp {f(\mathbf{x}_j)}

\begin{figure}
    \centering
    \includegraphics{proof.eps}
    \caption{Swapping the cluster labels of two points $z$ and $z'$ to increase KL-D.}
    \label{fig:proof}
\end{figure}

\end{proofof}

% \mina{The initial P and Q from which we derive DV function f are contiguous as you proved below, but what we are saying is that if you fix $f$ and look for other clusters, the clusters that maximize the kl-d in this representation must be contiguous. I'm not sure how to write this in probability language as it is a bit loose (since this representation does not accurately give the kl-d for the new clusters)}

%  \textcolor{green}{Yu says:
%  This would be a probabilistic statement. If the algorithm labels small $f$ values as $P$ distribution. Assuming $x_p$ and $x_q$ are drew independently,
%  \begin{equation*}
%  \text{error}_1 := \mathbb{P}\left[f(x_p) > f(x_q) | x_p \sim P, x_q \sim Q \right]
%  \end{equation*}
%  Or using a fixed cut point, while assuming $x_p$ and $x_q$ are drew according to the mixture model,
%  \begin{equation*}
%  \text{error}_2 := weight_p \mathbb{P}\left[f(x_p) > c | x_p \sim P \right]
%  + weight_q \mathbb{P}\left[f(x_q) < c |  x_q \sim Q \right]
%  \end{equation*}
%  Due to the overlap of distributions, this error is not zero. In the extreme case that P and Q are exactly the same, the error gets the maximum.
%  See if the following lemma is useful.
%  }

%  According to the following lemma, assume the theoretical optimal function is
%  $f(x) = \log \frac{p(x)}{q(x)}$, so given the distribution of $p$ and $q$, the clustering boundary is when $f(x) = 0$. The data point is clustered in $P$ if $f(x) > 0$ and in $Q$ if $f(x) < 0$. Then the error can be derived as,
%  \begin{equation*}
%  \text{error}_1
%  = \int_{\Omega \times \Omega} I(p(x) > q(y)) P(\mathrm{d}x) Q(\mathrm{d}y)
%  \end{equation*}
%  Set the cut point as $c=0$,
%  \begin{align*}
%  \text{error}_2
%  =& weight_p \mathbb{P}(f(X) > 0) + weight_q \mathbb{Q}(f(X) < 0) \\
%  =& weight_p \mathbb{P}(\log p(X) > \log q (X) ) 
%  + weight_q \mathbb{Q}(\log p(X) < \log q (X) ) 
%  \end{align*}




%  \textbf{Lemma} 
%  Assume densities $p, q > 0$,
%  \begin{equation*}
%  \mathcal{D}(p \| q) \geq 
%  % \sup_{f: \Omega \to \mathbb{R}} \Big\{ 
%  \mathbb{E}_P f(X) - \log \mathbb{E}_Q[e^{f(X)}]
%  % \Big\}
%  \end{equation*}
%  which achieves the equality when $f(x) = \log \frac{p(x)}{q(x)} + C$, $C$ is a constant.

%  \begin{proofof}{Lemma}
%  \begin{align*}
%  & \mathcal{D}(p \| q) - \mathbb{E}_P f(X) + \log \mathbb{E}_Q[\exp\{f(X)\}] \\
%  =& \mathbb{E}_P \log\frac{ p(X) }
%      {\Big( \mathbb{E}_Q[\exp\{f(X)\}]\Big)^{-1}  q(X) \exp\{f(X)\} }
%  = \mathcal{D}(p \| h) \geq 0
%  \end{align*}
%  where the new density is defined as,
%  \begin{equation*}
%      h(x) := \Big( \mathbb{E}_Q[\exp\{f(x)\}]\Big)^{-1}  q(x) \exp\{f(x)\}
%  \end{equation*}
%  It is easy to verify that when $f(x) = \log \frac{p(x)}{q(x)} + C$, the equality holds.
%  \begin{equation*}
%  \mathcal{D}(p \| q) - \mathbb{E}_P [f(X)] + \log \mathbb{E}_Q[\exp\{f(X)\}]
%  = \mathcal{D}(p \| q) - \mathbb{E}_P[\log\frac{p}{q}]  - C
%  + \log \mathbb{E}_Q[\frac{p}{q} C] = 0
%  \end{equation*}
%  \end{proofof}

%  \textbf{Remark} 
%  The optimal function $f$ is achievable. 
%  The clustering method is analog to likelihood ratio test given the distribution of different clusters, where a data is clustered by comparing the likelihoods accordingly.







%%%%%%%%%%%%%%%%%%%%%%%%%%%


\subsection{}

We first define a total ordering on clusters when the DV representation $f$ is fixed. For any $P$ and $Q$ and fixed DV function $f$, we say $P>Q$ if the maximum $f$ value of the data points in $Q$ is less than the maximum $f$ value of the data points in $P$. More formally, $P>Q$ if
\begin{equation}\label{eq:order-prop}
\max_{x\in P} f(x)
> \max_{x\in Q} f(x)
\end{equation}

\textbf{Reminder of Theorem 4}\textit{
% 
Consider a DV function $f$ and the associated DV representation of the data. %such that for two clusters $P$ and $Q$ of the data set, $P>Q$, we have 
%\begin{equation*}
%\hat{D}(P||Q)=\frac{1}{n_p} \sum_{i\in P} f(\mathbf{x}_i)  - \log \frac{1}{n_q}\sum_{j\in Q} \exp f(\mathbf{x}_j).
%\end{equation*}  %such that for any two clusters $P$ and $Q$ of the data set, we have $$\hat{D}(P||Q)=z_f(P,Q)=\frac{1}{n_p} \sum_{i\in P} f(\mathbf{x}_i)  - \log \frac{1}{n_q}\sum_{j\in Q} \exp f(\mathbf{x}_j).$$ 
Let $P_1^*<\ldots<P_k^*$ be the clusters that maximize the objective 
\begin{equation}\max_{P_1<\ldots<P_k} \sum_{i=1}^{k-1} \hat{D}_f(P_{>i}||P_i)
\end{equation}
where $P_{>i}=P_k\cup \ldots\cup P_{i+1}$.
Then $P_1^*,\ldots,P_k^*$ form contiguous clusters on the DV representation of the data points: there exist cut points $c_1<\ldots<c_{k-1}$ such that for all $i<k$ and for all $j\in P_i^*$ we have 
$f(\mathbf{x}_j)< c_i$, and for all $i>1$ and for all $j\in P_i^*$ we have $f(\mathbf{x}_j)\ge c_{i-1}$. 
% 
%Extend the above case for k-clusters scenario.
%Informal: Let $z(S,S')= \overline{S}-\log{x \overline{e^{S'}}}$, where $S$ and $S'$ are subsets of the points in the VD dimension. Order clusters on their maximum element: $P_1,\ldots,P_k$. Then we define the objective as follows: $\sum_{i=1}^{k-1} z(P_k\cup \ldots\cup P_{i+1},P_i)$. Then the clusters must be contiguous, and ordered from left to right.  
%Informal: For any objective of the following form, the clusters must be contiguous sections of the line. The objective must assume an ordering on the clusters, say $P_1,\ldots,P_k$, and must be the sum of KL-D estimates $D_{KL}(Q||Q')$ such that $Q$ and $Q'$ are unions of clusters, and for each $i,j$ where $P_i\subseteq Q$ and $P_j\subseteq Q'$, we have $i>j$.
% 
% 
}


\begin{proofof}{Theorem 4}
Suppose that there are indices $t,j$, $t<j$ such that for data points $x,y$ with $x\in P_t$ and $y\in P_j$ we have $f(x)>f(y)$. We show that swapping $x$ and $y$ in their clusters increases the objective,
% \ref{eq:obj-k-clusters}, which we re-state 
which we restate as follows.%, \ref{eq:obj-kclusters2}, \ref{eq:obj-kclusters-sum}. 
% 
\begin{equation}\label{eq:obj-k-clusters}
\max_{P_1<\ldots<P_k} \sum_{i=1}^{k-1} \hat{D}_f(P_{>i}||P_i)
\end{equation}
% 
We determine which terms in the equation above
% \ref{eq:obj-k-clusters} 
change when $x$ and $y$ are swapped between $P_t$ and $P_j$. We consider the following approximation for estimating $\hat{D}_f(P||Q)$.
\begin{equation}\label{eq:kl-d-fixed-f}
\hat{D}_f(P||Q)\approx\frac{1}{n_P}\sum_{x\in P}f(x)-\max_{x\in Q}f(x) +\log{n_Q}.
\end{equation}
Note that the size of the clusters stays the same by swapping $x$ and $y$.

For all $i< t$ and all $i> j$, $\hat{D}_f(P_{>i}||P_i)$ does not change. For $i=t$, $\hat{D}_f(P_{>t}||P_t)$ increases since the first term in Equation \ref{eq:kl-d-fixed-f} increases and the second term (which is negated) either stays the same or decreases. %Moreover $\hat{D}(P_t||P_{<t})$ increases since the first term in Equation \ref{eq:kl-d-fixed-f} increases.
For $t< i< j$, $\hat{D}_f(P_{>i}||P_i)$ increases since the first term increases and the second term doesn't change. For $i=j$, $\hat{D}_f(P_{>j}||P_j)$ stays the same since none of the terms change, because $y$ cannot be the maximum value in $P_j$ by the ordering properties.
%proof sketch: 
%Suppose we have shown that $P_1,\ldots,P_{i-1}$ are contiguous clusters and ordered from left to right. Consider $P_i$. If there is $x\in P_i$ and $y\in P_j$ for $j>i$ such that $x>y$, by swapping $x$ and $y$ the objective increases. This is because the only terms in the objective that change are $z(P_k\cup \ldots P_{t+1},P_t)$ for $j> t\ge i$. Recall that $z(S,S')= \overline{S}-\log{ \overline{e^{S'}}}$. If $t=i$, the first term gets bigger, the second term might get smaller or stay the same. If $i<t<j$, the first term increases and the second term stays the same. 
%\mina{This objective might suggest that we find the next cutpoint in the right most cluster. If we ignore the $log_{n_q}$ factor in equation 8 this is optimal (so we'll have a $\log{n}$ additive error). Regardless can find optimal with DP in quadratic time.}
\end{proofof}

\subsection{}


% 
\textbf{Reminder of Theorem 5}\textit{
% 
Let $OPT$ be the optimal value of the objective
$\max_{P_1<\ldots<P_k} \sum_{i=1}^{k-1} \hat{D}_f(P_{>i}||P_i)$. Alg. 3
% \algoref{alg:greedy} 
finds clusters $P_1^*<\ldots<P_k^*$ such that $\sum_{i=1}^{k-1} \hat{D}_f(P_{>i}^*||P_i^*)\ge \frac{e-1}{e} OPT$, where the ordering of clusters is defined with respect to the DV representation obtained from a DV function $f$.
}

\begin{proofof}{Theorem 5}
% 
Let $f$ be the DV function that defines the fixed DV representation in the theorem. We first rewrite the objective $\max_{P_1<\ldots<P_k} \sum_{i=1}^{k-1} \hat{D}_f(P_{>i}||P_i)$ in terms of cut points that separate contiguous clusters in the DV representation. 

Let $S$ be a set of real numbers where for any two values $s_1,s_2\in S$, $s_1<s_2$, there is at least one data point $x$ such that $s_1<f(x)<s_2$, and for any two data points $x_1,x_2$, $f(x_1)<f(x_2)$, there is a value $s\in S$ such that $f(x_1)<s<f(x_2)$. Note that $S$ can be the set of the value of all the data points in the DV representation minus a very small number. These conditions mean that we cannot have empty clusters, and that any contiguous cluster can be represented uniquely by the interval between two values in $S$ without worrying about including or excluding the endpoints of the interval. Recall that in Alg. 3,
% \ref{alg:greedy} 
we use the DV representation values of the data points as cut points, and we could enforce the non-empty cluster condition by using the set $S$. 

Let $c_{min}$ and $c_{max}$ be the minimum and maximum value in $S$. 
Let $C=\{c_1,\ldots,c_{t-1}\}\subseteq S$ be a set of cut points. $C$ defines $t$ clusters, where cluster $i$ is all the data points with DV representation between $c_{i-1}$ and $c_{i}$, where we define $c_0=c_{min}$ and $c_t= c_{max}$. %Note that the cut points can be defined in a way that no data point falls on them in the DV representation. 

We rewrite the objective in terms of the cut set $C$. For any two cut points $c,c'\in S$, $c<c'$, let $n_{c,c'}$ be the number of data points whose DV representation is in $(c,c')$, and let $\overline{f((c,c'))}$ be the mean of the values of the data points in $(c,c')$, i.e. $\overline{f((c,c'))}=\frac{1}{n_{c,c'}}\sum_{x:f(x)\in (c,c')}f(x)$. Define $\max{f(c,c')}=\max_{x:f(x)\in (c,c')}f(x)$. Let $C=\{c_1<\ldots<c_{t-1}\}$. We define the function $z(\cdot)$ as follows. \begin{equation}\label{eq:obj-sets}
z(C)=\sum_{i=1}^{t-1}\overline{f(c_i,c_t)}-\sum_{i=0}^{t-1}\max{f(c_i,c_{i+1})} + \sum_{i=0}^{t-1}\log{n_{c_i,c_{i+1}}}\end{equation}

Then the objective is to maximize $z(C)$ over cut sets $C$ of size $k-1$. Note that this objective is equivalent to Equation \ref{eq:obj-k-clusters} because by Theorem 4 we know that the optimal clusters are contiguous.

To prove that the greedy algorithm gives a $\frac{e-1}{e}$ approximation of the optimal solution, we use a result of \cite{nemhauser1978analysis} stating that if a set function is submodular, then the generic greedy algorithm is a $\frac{e-1}{e}$ approximation of the optimal. 

% \sd{Might be nice to include this as a formal lemma citing the result in the original paper}



%, let the clusters defined by these cut points be $P_1<\ldots<P_{t}$. Define $z(C) = \sum_{i=1}^{t-1} \hat{D}(P_{>i}||P_i)$. Using the DV function $f$, we can write the KL-D estimates $\hat{D}(P||Q)=\frac{1}{n_P}\sum_{x\in P} f(x)-\log \frac{1}{n_Q}\sum_{x\in Q}e^{x} \approx \frac{1}{n_P}\sum_{x\in P} f(x)-\max_{x\in Q} f(x) + \log{n_Q}$. 
%For any two cut points $c<c'$, let $n_{(c,c']}$ be the number of data points whose DV representation is in $(c,c']$, and let $\overline{f((c,c'])}$ be the mean of the values of the data set in $(c,c']$. Define $\max_{(c,c']}$ to be the data point with the largest value in $(c'c']$. 
%Suppose $C=\{c_1<\ldots<c_{t-1}\}$, and let $c_0$ and $c_t$ be the min and max value of all the data opints. We have $z(c)=\sum_{i=1}^{t-1}\overline{f(c_i,c_t)}-\sum_{i=0}^{t-1}\max{(c_i,c_{i+1}]} + \sum_{i=0}^{t-1}\log{n_{c_i,c_{i+1}}}$.\mina{refine this paragraph}

%By Theorem \ref{thm:k-cluster-contiguous} we know that the optimal clusters are contiguous, so there the problems is to maximize $z(C)$ where $C$ is any set of $k-1$ cut points. First we prove that this set function is sub-modular. Then by \mina{cite the submodularity paper} the greedy algorithm is a $\frac{e-1}{e}$ approximation of the optimal solution. 

To show the submodularity of function $z$, we need to prove that for any two sets $A$ and $B$ where $A\subseteq B$ and for any cut point $c\in S$, $c\notin B$, we have $z(A\cup \{c\})-z(A)\ge z(B\cup \{c\})-z(B)$. 

 First we compute $z(A\cup \{c\})-z(A)$. Sort the points in $A\cup \{c\}$, and let $c_1<c<c_2$ be the points before and after $c$ in this ordering. Note that $c_1$ might be $c_{min}$ and $c_2$ might be $c_{max}$. From equation \ref{eq:obj-sets}, we see that 
\begin{align*}z(A\cup \{c\})-z(A) &= \overline{f(c,c_{max})}
+\max f(c_1,c_2)-\log n_{c_1,c_2} \\
& -\max f(c,c_2)+\log n_{c,c_2}\\
&-\max{f(c_1,c)} +\log n_{c_1,c} .
\end{align*}
Let $c\notin B$ and let $c_1'< c< c_2'$ be the cut points before and after $c$ in $B$'s ordering.
Similar to above, we have 
\begin{align*}z(B\cup \{c\})-z(B) &= \overline{f(c,c_{max})}
+\max f(c_1',c_2')-\log n_{c_1',c_2'} \\
& -\max f(c,c_2')+\log n_{c,c_2'}\\
&-\max{f(c_1',c)} +\log n_{c_1',c} .
\end{align*}
Note that since $A\subseteq B$, we have $c_1\le c_1'\le c\le c_2'\le c_2$. Now from the definition of $S$, we have that $n_{c_1',c}>0$ and $n_{c,c_2'}>0$. %\mina{have to assume this, otherwise we'll have problems. constraint: no empty clusters?}. 
So $\max f(c_1,c)= \max f(c_1',c)$, $\max f(c_1,c_2)= \max f(c,c_2)$ and $\max f(c_1',c_2')= \max f(c,c_2')$. So we have
\begin{align*}
    z(A\cup\{c\})-z(A)- (z(B\cup \{c\})-z(B)) &= -\log n_{c_1,c_2} +\log n_{c_1,c} +\log{n_{c,c_2}}\\
    &+\log n_{c_1',c_2'} -\log n_{c_1',c} -\log{n_{c,c_2'}} \\
    &=\log \frac{n_{c_1',c_2'}}{n_{c_1',c}n_{c,c_2'}}- \log \frac{n_{c_1,c_2}}{n_{c_1,c}n_{c,c_2}} \\
    & = \log(\frac{1}{n_{c_1',c}}+\frac{1}{n_{c,c_2'}})-\log(\frac{1}{n_{c_1,c}}+\frac{1}{n_{c,c_2}})\ge 0
\end{align*}
Note that we used the fact that $n_{c_1,c}+n_{c,c_2} = n_{c_1,c_2}$, $n_{c_1',c}+n_{c,c_2'} = n_{c_1',c_2'}$. Moreover, since $c_1\le c_1'$, we have $n_{c_1,c}\ge n_{c_1',c}$. Similarly, since $c_2'\le c_2$, we have $n_{c,c_2}\ge n_{c,c_2'}$, and hence we have the last inequality.
\end{proofof}

% \section{Pseudo code}

% \todo{cut the algo in short and explain concisely. we must have some pseudo code from nips or aaai submission. express it in terms of a single backpropagation~(weights update) and the cut points update iteratively. teh first few iterations is warmup. }

% \begin{algorithm}[!h]
% \begin{algorithmic}[1]
% \State {\bfseries Input:}  Random clusters $\mb{X}_{y0}$ and $\mb{X}_{y1}$. max iteration T, max cutoffs points $I$ for sorting
% \For {$t \leftarrow 0, \cdots, T$}
%     \State Solve $f_{y0 \to y1}$ and $f_{y1 \to y0}$ from \eqnref{eqn:full_opt_log_sum_exp}; $\mb{X}_{y0}$ and $\mb{X}_{y1}$ as inputs
%     \For {$f_{y0 \to y1}$ and $f_{y1 \to y0}$}
%         \State Sort indices of data points w.r.t values of fctn $f$ 
%         \For  {cutoff $i \leftarrow 0, \cdots, I$}
%         \State Evaluate the divergence objective
%         \EndFor
%     \EndFor    
%     \State Update $\mb{X}_{y0}$ and $\mb{X}_{y1}$ by taking the max divergence  from the step above
% \EndFor
% \State{\bfseries Output:} $\mb{X}_{y0}$ and $\mb{X}_{y1}$
% \end{algorithmic}
% \caption{\texttt{Cut-point clustering based on KL-D objective}}
% \label{Algo_1}
% \end{algorithm}

\section{Experimental Details}

\subsection{Datasets of Noisy Timeseries}

Details for some of the datasets that are of high impact but hard to find are given below.

\paragraph{Neural Activity}
% 
%Clustering neurons in a brain is one of the most impactful and challenging application for clustering algorithms. Considering the recent advancements in neuroscience, we have selected a current and valuable public dataset for evaluation.
% 
We used the public electrophysiological Neuropixels dataset \citep{siegle2021survey, visualcoding2020}.
% 
% The neural activities were recorded while the mice were head-fixed and were passively shown with drifting gratings visual stimuli. 
% 
Multiple high-density extracellular electrophysiology probes were used to simultaneously record spiking neural activity from a wide variety of areas in the mouse brain.
% 
% , ranging from the subcortical region, such as the thalamus, to multiple the visual cortices, such as the primary visual cortex (V1), lateral medial visual area (LM), rostrolateral visual area (RL), anterolateral visual are (AL), anteromedial visual area (AM), etc.
% 
% Details can be found in \citep{siegle2021survey, visualcoding2020}.
% 
We used the data of the animal with session-id \texttt{798911424} and included the first 100 out of 195 trials.
% 
% The visual condition ids are: \texttt{275}, \texttt{268}, \texttt{270}, \texttt{284}, \texttt{274}, \texttt{249}, \texttt{261}, \texttt{278}, \texttt{280}, \texttt{256}, \texttt{260}, \texttt{257}, \texttt{281}. Each condition had 15 repeated trials, including 195 trials in total. 
The first 2000 ms of each trial after stimulus onset was extracted.
% 
We time-binned the timestamps with 0.1 ms resolution, giving 443 timeseries, each of length 20,000 timesteps.
        
\paragraph{Financial time series of returns of US stocks.}
% 
We started with the 1000 stocks from the constituents of the Russell 3000 index that have the highest liquidity. This dataset is publicly available, though it is too large to be released as a single file.
% 
After performing necessary preprocessing and checks on data quality issues, we use 982 of those stocks. The returns are evaluated every 15 minutes, for the period from May 2021 to May 2022, i.e.\ 2600 timesteps.

\paragraph{Wind Dataset}
This dataset is daily average wind speed data (in knots; 1 knot $\approx$ 0.5144 m/s) collected from 1961 to 1978 at 12 meteorological stations in the Republic of Ireland~(Gneiting 2002).\footnote{\url{http://lib.stat.cmu.edu/datasets/wind.desc}}

\paragraph{Rain Dataset}
% 
Daily data collected for rainy days in 1949--94 across 167 regions in Washington and Oregon states.

Other datasets are available at Kaggle. We also generated a synthetic dataset, with binary timeseries. Script for generating the data will be provided in the code base.
% 
All the data files will be provided as part of the codebase.

\bibliography{references}

\end{document}
