
%%i%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section{Clustering and model selection} 
\label{sec:clustering}
%%i%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

As proposed in~\cite{wang2009ksubspaces}, the residual sum of squares $\sum_{x \in
  D_\ell} d_\ell(x,C_\ell(\theta_\ell))^2$ is an important quantity to
consider. However, one still needs to counter the overfitting effects
by taking into account the complexity of the model. As we proposed in
the outline, we will use the Bayesian Information Criterion (BIC) to
select the most appropriate model for the data. The computations are
presented hereafter.


\subsection{Recursive smart seeding via challenge}

\begin{algorithm}
\begin{algorithmic}[1]
\Procedure{\algRecChaSeed}{$data, nbc, depth, max\_depth$}

\Statex
\State{// smart seeding a-la kmeans++}
\State{seeds = Smart\_seeding\_plus\_plus(data, nbc)}
\State{// For each sample: index of its nearest center/cluster}
\State{$NN1$ = Assign samples to clusters using distance to seeds}
\If{$depth = max\_depth$}
\State{return $(seeds, NN1)$}
\Else
\State{// Clusters to split}
\State{$S\gets$ Ids of clusters which can be split}
\State{// Clusters to merge}
\State{$M\gets$ Ids of clusters which can be merged}
\State{//Unique ids of invalid clusters}
\State{$I = S \cup M$}
\State{$(seeds_I, NN1_I) \gets \algRecChaSeed(data[I], |I|, depth+1, max\_depth)$}
\State{In place substitution of seeds from $seeds_I$ into $seeds$}
\State{In place substitution of clusters ids of $NN1_I$ into $NN1$}
\State{return $(seeds, NN1)$}
\EndIf

\EndProcedure
\end{algorithmic}
\caption{{\bf Recursive smart seeding with cluster challenges.}}
\label{alg:rec-cha-seed}
\end{algorithm}


\subsection{BIC calculation}



Denote by $P = d \times (\dim V + 1)$ the number of parameters of the model.
%%
Assuming that the residual errors are normally distributed around our cluster model, 
the calculation below yields the following BIC:
\begin{equation}
\label{eq:bic-final}
BIC = \log(n) P - 2 \mathcal{LL}(\theta_\ell) = \log(n) P + n \log\left(\frac{RSS}{n}\right) + 
%nK
n(1+\log \frac{\pi}{2}).
\end{equation}
\bigskip

%% xfc: distance used to be...which was not coherent
%% The calculation goes as follows. We assume the following  statistical model for errors,
%% \begin{equation}
%%     \| X - C_\ell\| \sim \mathcal{N}_\text{folded}(0,\sigma^2)
%% \end{equation}

The calculation goes as follows. We assume that the distribution of
distances (Eq.~\ref{eqn:distance-sesc}) follows a folded normal
distribution with probability density function:
\begin{equation}
    f_\text{folded}(x) = \left\lbrace \begin{aligned}
        & \frac{2}{\sqrt{2\pi}\sigma} \exp\left(-\frac{x^2}{2\sigma^2}\right) \quad \text{for } x \ge 0 \\
        & 0 \quad \text{for } x < 0
    \end{aligned}\right.
\end{equation}
%%
Then, the log-likelihood writes:
\begin{equation}
\label{eqn:log-likelihood}
\mathcal{LL}(\theta_\ell | x_1 \dots x_n) = \sum_{i=1}^n \log f_\text{folded}\left(d_\ell(x_i,C_\ell(\theta_\ell))\right)
\end{equation}
For the sake of simplicity, we will denote by $\mathcal{LL}(\theta_\ell)$ the log-likelihood associated with the points of $D_\ell$. Then, we denote:
\begin{equation}
    RSS = \sum_{i=1}^n d_\ell(x_i,C_\ell(\theta_\ell))^2  \quad \text{and} \quad \hat\sigma^2 = \frac{RSS}{n}
\end{equation}
Substituting $\hat\sigma^2$ into the log-likelihood gives the expression used in Eq.~\ref{eq:bic-final}:
\begin{align*}
    \mathcal{LL}(\theta_\ell) &=  n\log 2 - \frac{n}{2}\log(2\pi) - \frac{n}{2}\log\hat\sigma^2 - \frac{1}{2\hat\sigma^2}\underbrace{\sum_{i=1}^n d_\ell(x_i,C_\ell(\theta_\ell))^2}_{n \hat\sigma^2} \\
    &= n\log 2 - \frac{n}{2}\log(2\pi) - \frac{n}{2} - \frac{n}{2} \log \hat\sigma^2 \\
    &= -\frac{n}{2} \left( \underbrace{1 +\log\frac{\pi}{2}}_{\text{constant } K} + \log\left(\frac{RSS}{n}\right) \right)
\end{align*}
%%
We finally arrive at
%%
\begin{equation}
\label{eq:fic-final}
BIC = P \log n_l  - 2 \mathcal{LL}(\theta_\ell) = P \log n_l + n_l \log\left(\frac{RSS}{n_l}\right) +  n_l \left(1+\log \frac{\pi}{2}\right).
\end{equation}
%%

\subsection{Algorithm}

The model with the lowest BIC is preferred. In order to perform
model selection, one can compute the BIC (from the estimated
log-likelihood) of several candidate models and simply pick the best
one. However, choosing which
models to test is a difficult problem. The simplest scheme is probably
to start from a 1-dimensional cluster, compute its BIC, and then
continue trying new models with increasing dimension until the BIC
stops decreasing. This could be improved for high-dimensional systems
by using a dichotomic (bisection) search over the dimensions in order
to reduce the number of tried models.

An additional scheme that is interesting is to only perform model
selection at some predefined steps: we only perform it after a few
steps, and whenever the algorithm believes it has converged. 
This allows decreasing the computation time of the algorithm while still performing model selection.


%% Moreover, we will see in section
%% \ref{subsec:convergence} makes the termination of the algorithm less
%% simple, and that this scheme helps keeping good properties.

A simple version of model selection is presented in Algorithm~\ref{alg:model-selection}.

\begin{algorithm}
    \begin{algorithmic}[1]
        \Require{$D_\ell$ the dataset matrix for the given cluster.}
        \Require{$maxdim$ the dimension of the ambient space}
        \Statex

        \Procedure{SelectModel}{$D_\ell$}
            \State{$dim \gets 1$} \Comment{Initial guess}
            \State{$c_{dim}, dir_{dim} \gets$} \Call{OptimizeCluster}{$D_\ell,dim$}
            \State{$BIC_{dim} \gets $} \Call{BIC}{$D_\ell, c_{dim}, dir_{dim}$}
            \While{$dim < maxdim$}
                \State{$newdim \gets dim +1$}
                \State{$newc_{dim}, newdir_{dim} \gets $} \Call{OptimizeCluster}{$D_\ell,newdim$}
                \State{$BIC_{newdim} \gets $} \Call{BIC}{$D_\ell, newc_{dim}, newdir_{dim}$}
                \If{$BIC_{newdim} > BIC_{dim}$} \Comment{If the BIC increases, we have passed the optimal model}
                    \State{\bf break}
                \Else
                    \State{$dim \gets newdim$}
                    \State{$c_{dim}, dir_{dim} \gets newc_{dim}, newdir_{dim}$}
                    \State{$BIC_{dim} \gets BIC_{newdim}$}
                \EndIf
            \EndWhile
            \State \Return{$dim, c_{dim}, dir_{dim}$}
        \EndProcedure

    \end{algorithmic}
    \caption{{\bf Model selection scheme:} Simple model selection scheme based on the BIC, where every dimension is tried until the BIC increases. }
%%For speed-up solutions, read section \ref{sec:model-selection}.}
    \label{alg:model-selection}
\end{algorithm}


\subsection{Results}

\ifLONG
\else
\begin{figure*}[htbp]
\begin{center}
\begin{tabular}{cccc}
\rotatebox{90}{$\eta=0.3$}& \includegraphics[width=\figwradius\linewidth]{fig-cmp-UoS/dtm-vs-sesc-radius-var0dot9-mu0dot3-eta0dot3.png} & \rotatebox{90}{$\eta=0.5$} & \includegraphics[width=\figwradius\linewidth]{fig-cmp-UoS/dtm-vs-sesc-radius-var0dot9-mu0dot3-eta0dot5.png}\\
%$\eta=0.3$ & $\eta=0.5$\\
\rotatebox{90}{$\eta=0.7$}& \includegraphics[width=\figwradius\linewidth]{fig-cmp-UoS/dtm-vs-sesc-radius-var0dot9-mu0dot3-eta0dot7.png} &\rotatebox{90}{$\eta=0.9$}&  \includegraphics[width=\figwradius\linewidth]{fig-cmp-UoS/dtm-vs-sesc-radius-var0dot9-mu0dot3-eta0dot9.png}\\
%$\eta=0.7$ & $\eta=0.9$
\end{tabular}
\end{center}
\caption{\small {\bf Dataset \protHMM, SESC radius vs.\ median Distance To Measure: influence of $\eta$.}
 The cluster center is the solution of the optimization problem of Eq.~\ref{eqn:cluster-optim-sph}.
Labels correspond to the 16 cluster ids.  }
\label{fig:cmp-DTM} 
\end{figure*} 
\fi
