%\documentclass{uai2025} % for initial submission
\documentclass[accepted]{uai2025} % after acceptance, for a revised version; 
% also before submission to see how the non-anonymous paper would look like 
                        
%% There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2025} % ptmx math instead of Computer
                                         % Modern (has noticeable issues)
% \documentclass[mathfont=newtx]{uai2025} % newtx fonts (improves upon
                                          % ptmx; less tested, no support)
% NOTE: Only keep *one* line above as appropriate, as it will be replaced
%       automatically for papers to be published. Do not make any other
%       change above this note for an accepted version.

%% Choose your variant of English; be consistent
\usepackage[american]{babel}
% \usepackage[british]{babel}

%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} % amsmath with fixes and additions
% \usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{tikz} % nice language for creating drawings and diagrams
\usetikzlibrary{arrows.meta, positioning, fit, backgrounds}
\usetikzlibrary{bayesnet}
% Default placement of plate labels: \scriptsize text positioned below-left of
% the plate's south-east corner with zero offsets. `plate caption` is presumably
% the style that \plate (bayesnet library) applies to its label -- TODO confirm.
% The figures below append to this style again to adjust individual plates.
\tikzset{plate caption/.append style={font=\scriptsize, below left=0pt and 0pt of #1.south east}}
\usepackage{lipsum} % For filler text
% Node and arrow styles shared by the plate diagrams in this paper.
\tikzset{
    % Common base for random-variable nodes: small circle, centred \scriptsize text.
    bayesnode/.style={
        draw,
        circle,
        text width=0.4cm,
        align=center,
        minimum size=0.55cm,
        font=\scriptsize
    },
    % Unobserved variable: base node on a white background.
    latent/.style={
        bayesnode,
        fill=white
    },
    % Observed variable: shaded base node with a wider text box.
    observed/.style={
        bayesnode,
        fill=gray!30,
        text width=1cm
    },
    % Fixed (hyper)parameter: shaded rectangle.
    param/.style={
        draw,
        rectangle,
        text width=0.55cm,
        fill=gray!20,
        minimum size=0.75cm,
        aspect=1,
        align=center,
        font=\scriptsize
    },
    % Arrow styles for manually drawn paths (solid and dashed).
    edge/.style={-Stealth},
    dashededge/.style={-Stealth,dashed}
}


%% Provided macros
% \smaller: Because the class footnote size is essentially LaTeX's \small,
%           redefining \footnotesize, we provide the original \footnotesize
%           using this macro.
%           (Use only sparingly, e.g., in drawings, as it is quite small.)

%% Self-defined macros
% \swap[<sep>]{<a>}{<b>} expands to <b><sep><a>: the two mandatory arguments in
% reverse order around the optional separator (default: -).
\newcommand{\swap}[3][-]{#3#1#2} % just an example

% My packages
%\usepackage{hyperref}       % hyperlinks
\usepackage{url}
\usepackage{amsmath}
\usepackage{amssymb}
\usepackage{amsthm}
\usepackage{subcaption}
\usepackage{graphicx}
\usepackage{appendix}
\usepackage{array} % required for text wrapping in tables
% My input comment
\input{commands.txt}

%%% APPENDIX %%%
% Renumbering equations and figures within the appendix. 
% \appendix is redefined to: end the current paragraph; reset the section,
% subsection, table and figure counters; and switch the printed form of
% sections to letters (A, B, ...) with subsections, figures, tables and
% equations prefixed by the section letter (A.1, A.2, ...).
% NOTE(review): the equation counter is NOT reset here (both the \setcounter
% and the \@addtoreset lines are commented out), so unless it is reset
% elsewhere, appendix equation numbers continue from the main text under the
% "A.n" format -- confirm this is intended.
% This macro is patched further down with \apptocmd; keep its body free of
% parameter characters.
\renewcommand\appendix{\par 
    \setcounter{section}{0}%   
    \setcounter{subsection}{0}%     
    %\setcounter{equation}{0} 
    \setcounter{table}{0} 
    \setcounter{figure}{0}
    \gdef\thefigure{\Alph{section}.\arabic{figure}}%
    \gdef\thetable{\Alph{section}.\arabic{table}}%
    \gdef\thesection{\Alph{section}}%
    \gdef\thesubsection{\Alph{section}.\arabic{subsection}}% 
    %\@addtoreset{equation}{section}%
    \gdef\theequation{\Alph{section}.\arabic{equation}}%
}
% Customise the names printed by \autoref.
% The existing \autoref is saved as \orgautoref. The wrapper below redefines
% the *autorefname macros on every call and then ends with \orgautoref, which
% therefore consumes whatever follows \autoref in the document.
% (Assumes \autoref is already defined at this point, e.g. by the class.)
\let\orgautoref\autoref
\renewcommand{\autoref}{%
    % all sectioning levels are referred to as "Section"
    \def\sectionautorefname{Section}%
    \def\subsectionautorefname{Section}%
    \def\subsubsectionautorefname{Section}%
    % appendix items are referred to as "Supplement"
    \def\appendixautorefname{Supplement}%
    % floats (subfigures share the "Figure" name)
    \def\figureautorefname{Figure}%
    \def\subfigureautorefname{Figure}%
    \def\tableautorefname{Table}%
    % equations and list items
    \def\equationautorefname{Equation}%
    \def\Itemautorefname{Item}%
    \orgautoref
}
%%% begin appendix autoref patch [\autoref subsections in appendix](https://tex.stackexchange.com/questions/149807/autoref-subsections-in-appendix) %%%
% Purpose: while the `inappendix` toggle is true, rewrite hyperref's current
% counter name (\Hy@param) to \Hy@appendixstring for every sectioning counter.
% This is intended to make \autoref use \appendixautorefname for sections and
% subsections inside the appendix.
\usepackage{etoolbox}
\makeatletter
% Swap hyperref's chapter-only substitution in \hyper@makecurrent for a check
% over all sectioning counters; raise an error if the expected code is not found.
\patchcmd{\hyper@makecurrent}{%
    \ifx\Hy@param\Hy@chapterstring
        \let\Hy@param\Hy@chapapp
    \fi
}{%
    \iftoggle{inappendix}{%true-branch
        % list the names of all sectioning counters here
        \@checkappendixparam{chapter}%
        \@checkappendixparam{section}%
        \@checkappendixparam{subsection}%
        \@checkappendixparam{subsubsection}%
        \@checkappendixparam{paragraph}%
        \@checkappendixparam{subparagraph}%
    }{}%
}{}{\errmessage{failed to patch}}

% \@checkappendixparam{<counter>}: if \Hy@param currently equals <counter>,
% replace it by \Hy@appendixstring; otherwise leave it unchanged.
\newcommand*{\@checkappendixparam}[1]{%
    \def\@checkappendixparamtmp{#1}%
    \ifx\Hy@param\@checkappendixparamtmp
        \let\Hy@param\Hy@appendixstring
    \fi
}
% Close the @-letter region opened above. This line was a second \makeatletter,
% which left @ with letter catcode for the remainder of the document.
\makeatother

% The toggle starts false (main text) ...
\newtoggle{inappendix}
\togglefalse{inappendix}

% ... and is switched on at the end of \appendix and of \subappendices.
\apptocmd{\appendix}{\toggletrue{inappendix}}{}{\errmessage{failed to patch}}
\apptocmd{\subappendices}{\toggletrue{inappendix}}{}{\errmessage{failed to patch}}
%%% end appendix autoref patch %%%


\title{Nonparametric Bayesian Multi-Facet Clustering for Longitudinal Data}

% The standard author block has changed for UAI 2025 to provide
% more space for long author lists and allow for complex affiliations
%
% All author information is automatically removed by the class for the
% anonymous submission version of your paper, so you can already add your
% Information below.
%
% Add authors
\author[1]{\href{mailto:<luwei.wang@ed.ac.uk>?Subject=Your UAI 2025 paper}{Luwei Wang}{}}
\author[1]{Kieran Richards}
\author[1]{Sohan Seth}
% Add affiliations after the authors
\affil[1]{%
    School of Informatics\\
    University of Edinburgh\\
    UK
}

\allowdisplaybreaks
  
\begin{document}
\maketitle

\begin{abstract} % 207 words
Complex real-world time series data are inherently multi-faceted, e.g., temporal data can be described by seasonality and trend. Popular clustering methods typically aggregate information from all facets, treating them collectively rather than individually. This aggregation may diminish the interpretability of clusters by obscuring the specific contributions of individual facets to the clustering outcome. This limitation can be addressed by multi-facet clustering that builds a separate clustering model for each facet simultaneously. In this paper, we explore Bayesian multi-facet clustering modelling for temporal data using nonparametric priors to select an appropriate number of clusters automatically and using variational inference to efficiently explore the parameter space. We apply this framework to nonlinear growth models and vector autoregressive models and observe their performance through simulation studies. We apply these models to real-world time series data from the English Longitudinal Study of Ageing (ELSA), highlighting its utility in identifying meaningful and interpretable clusters. These findings underscore the potential of the framework for advancing the analysis of multi-faceted longitudinal data in diverse fields. Code is available at \href{https://github.com/Demi-wlw/Nonparametric-Bayesian-Multi-Facet-Clustering-for-Longitudinal-Data.git}{GitHub}. 
\end{abstract}


\section{Introduction}\label{sec:intro}
% 1. Background/motivation 2. Gaps/Challenges to resolve 3. Related works 4. Objectives 5. Overview of the following sections

Clustering, a key task in unsupervised machine learning, partitions unlabelled datasets into subgroups based on similarity measures \citep{MurphyML2012}. Classical algorithms such as $k$-means \citep{MacQueenKMeans1967}, hierarchical clustering \citep{HastieHC2009}, Gaussian mixture models \citep{ChrisGMM2002} and DBSCAN \citep{EsterDBSCAN1996} are widely applied to uncover hidden data structures across various fields \citep{XuClusteringAlgo2005}. In the context of longitudinal data, clustering is crucial for exploring shared dynamics over time, with applications in speech processing, medical diagnosis and social sciences \citep{Speech1985Wilpon, warrenliaoClusteringTimeSeries2005, bulteelClusteringVectorAutoregressive2016, marshallIncomeTrajectoriesPrecarity2024}. 

Existing clustering methods typically identify a single partition of the data by accumulating contributions from all facets (we refer to this as \emph{single-facet clustering}). However, the rise of high-dimensional data and complex data structures in many clustering applications may reveal multiple interesting clustering structures when focusing on different characteristics or facets of the data. For instance, in images of objects, two interesting facets might be the shape and the color of the objects. Similarly, in temporal data, two interesting facets might be the seasonality and the trend of the data. \citet{falckMultiFacetClusteringVariational2021} argued that focusing on a single facet, rather than considering multiple facets, is an arbitrary and incomplete approach to clustering high-dimensional datasets. In practice, heterogeneous samples are often more effectively clustered based on a subset of characteristics, with other characteristics being uninformative or redundant \citep{kundu2024BayesProductMixture}. 


\begin{figure*}[t]
  \centering
  \begin{subfigure}[b]{0.49\linewidth}
    \includegraphics[width=1\linewidth]{pic/intro/Multi-facet}
    %\caption{single-facet vs. multi-facet clustering}
  \end{subfigure}
  \hfill
  \begin{subfigure}[b]{0.49\linewidth}
  \centering
    \begin{tikzpicture}[node distance = 6mm]
    % Define nodes
    \node[param] (lambda) at (0, 0) {\(\bflambda^{(f)}\)};
    \node[param] (S) [below=of lambda] {\(\bfsdot\)};
    \node[latent] (G) [right=of lambda] {\(G_0^{(f)}\)};
    \node[latent] (alpha) [right=of S] {\(\alphadot\)};
    \node[latent] (theta) [right= of G] {\(\bfthetadot_{k_f}\)};
    \node[latent] (V) [right=of alpha] {\(\vdot_{k_f}\)};
    \node[latent] (Z) [above right=0.05cm and 0.6cm of V.east] {\(\zdot_n\)};
    \node[observed, text width = 0.7cm] (Y) [right=of Z] {\(\bfy_n\)};
    % Edges
    \edge {lambda} {G};
    \edge {G} {theta};
    \edge {S} {alpha};
    \edge {alpha} {V};
    \edge {V.east} {Z.west};
    \edge {Z} {Y};
    \edge {theta.east} {Y.west};
    % Plates
    \tikzset{plate caption/.append style={font=\scriptsize, below left=8pt and -3pt of #1.south east}}
    \plate [inner sep=.15cm] {plate1}{(S)(lambda)(G)(alpha)(V)(theta)(Z)} {\(F\)};
    \tikzset{plate caption/.append style={font=\scriptsize, below left=0pt and 0pt of #1.south east}}
    \plate [inner sep=.1cm] {plate2}{(theta)(V)} {\(\infty\)};
    \plate [inner sep=.1cm] {plate3}{(Z)(Y)} {\(N\)};
\end{tikzpicture}
  \end{subfigure}
  \caption{\textbf{left)} single-facet versus multi-facet clustering. \textbf{right)} nonparametric Bayesian multi-facet mixture model}
  \label{fig:multi-facet illustration}
\end{figure*}

Standard clustering that combines information from all facets of a complex data structure, such as a time series, suffers from limited interpretability. 
For instance, \citet{marshallIncomeTrajectoriesPrecarity2024} employed a mixture of nonlinear growth models to cluster individual income trajectories into several groups. Here the average income value and the variation of income over time both contribute to the clustering, and it is not clear which facet is driving the inferred clustering outcome more. Instead, it might be more effective to cluster each facet separately to find clusters with respect to both average income and variation of income simultaneously, and an individual can be assigned to both a specific average income cluster and a variation over time cluster.
Similarly, the mixture of vector autoregressive models proposed by \citet{bulteelClusteringVectorAutoregressive2016} can be potentially more interpretable if the multivariate time series are clustered separately based on their average values and their temporal dynamics. 

Multi-facet clustering offers significant potential to address these challenges by constructing separate partitions for each facet. This approach not only ensures that items within specific clusters exhibit homogeneity but also facilitates the exploration of facets and their corresponding clusters, each representing distinct characteristics of the data \citep{falckMultiFacetClusteringVariational2021}. Furthermore, this method requires a number of clusters that scales linearly with the number of facets, rather than exponentially. As a result, it reduces the total number of clusters required to represent high-dimensional data while explicitly capturing the unique characteristics driving the clustering process. For instance, consider the example illustrated in \autoref{fig:multi-facet illustration} (left). Unlike single-facet clustering, which identifies five clusters by considering both characteristics in aggregate, multi-facet clustering explicitly partitions the data across multiple facets. Specifically, it identifies $K_1=3$ clusters for Facet $1$ and $K_2=2$ clusters for Facet $2$, effectively summarizing six potential clusters into a more interpretable structure. It is important to emphasize that multi-facet clustering is conceptually distinct from \textit{multi-view/aspect clustering} approaches \citep{ChaoMultiview2021, nayakMultiaspectLearningMethods2023}. Multi-view clustering aims to derive a single clustering solution that integrates information from multiple inputs (views) of the same sample cohort. In contrast, multi-facet clustering seeks to uncover multiple clustering solutions, each described by distinct characteristics/facets of a single input cohort.


\paragraph{Related Work}
The concept of multi-facet clustering aligns with the notion of learning \textit{multiple clusterings} as highlighted by \citet{gordonClassification1999}. Various methods have since been developed to address this issue by adapting conventional clustering approaches. For example, \citet{FriedmanSubsetClust2004} proposed a distance-based clustering algorithm that automatically detects subgroups of objects clustering on different, possibly overlapping subsets of attribute variables. \citet{galimbertiModelbasedMethodsIdentify2007} introduced a two-step procedure with the first step identifying independent subsets of variables and the second step applying a model-based approach to identify cluster structures based on these subsets. A Bayesian method by \citet{niuNonparametricBayesianModel2012} introduced a probabilistic nonparametric Bayesian model to learn overlapping feature facets and clusters within each facet in a joint framework. \citet{ZongMultifacetOmics2024} proposed a similar model-based multi-facet clustering approach using a mixture of Gaussian mixture models, particularly suited for high-dimensional nonclusterable genes. These methods leverage feature selection techniques to identify relevant subsets of features as facets for clustering. Additionally, \citet{falckMultiFacetClusteringVariational2021} presented a deep learning approach extending the variational autoencoder (VAE), a feature-based method, to develop a multi-facet clustering algorithm. This model identifies facets by learning latent variables for each facet and simultaneously learns multiple clusterings in an end-to-end framework. Notably, all these models are tailored for \emph{static feature data} and are not suitable for \emph{temporal data}. %Moreover, most methods utilizing feature selection or latent variables to identify facets necessitate association analysis post-clustering to interpret the discovered facets and clustering outcomes.
A recent study by \citet{kundu2024BayesProductMixture} introduced a product Dirichlet process mixture model that employs Dirichlet process (DP) mixture priors on model parameters. Their approach primarily focused on applications to vector autoregressive models, relying on Markov chain Monte Carlo (MCMC) methods. % which are computationally expensive.
\paragraph{Contributions} While nonparametric Bayesian approaches for multi-facet clustering using parameters of the model as facets have been explored in prior research \citep{kundu2024BayesProductMixture}, they often rely on computationally intensive techniques such as MCMC sampling, which might limit their scalability and practical applicability for large datasets. \textbf{(A)} This study extends existing methods by incorporating a Variational Bayesian (VB) framework, enabling efficient and scalable inference while separately modelling key characteristics and identifying their corresponding clusters. \textbf{(B)} We implement the method for the nonlinear growth model (for the first time) to handle complex temporal data. \textbf{(C)} We apply the framework to novel real-world data from the English Longitudinal Study of Ageing (ELSA), showcasing its effectiveness in capturing meaningful and interpretable clusters of income trajectories. These contributions enhance the practical applicability of Bayesian multi-facet clustering in large-scale longitudinal data analysis.

%The remainder of this article is organized as follows. In \autoref{sec:MMM}, we develop the nonparametric Bayesian multi-facet mixture model within a VB framework. \autoref{sec:example models} applies the methodology to two time-series models: the nonlinear growth model (\textsc{NLG}) and the vector autoregressive model (\textsc{VAR}). Simulation study results are presented in \autoref{sec:simulation}. Practical applications using the ELSA dataset discussed in \autoref{sec:application}, demonstrate the model's ability to uncover meaningful cluster patterns in real-world scenarios. Finally, \autoref{sec:discussion} offers further discussions, and supplementary materials provide additional relevant details.

\newpage
\section{Multi-facet Mixture Model}\label{sec:MMM}
A standard mixture model with $K$ components is described as $\sum_{k=1}^K \pi_k p(\bfy \g \bftheta_k)$ where $\pi_k$ is the probability of the $k$-th mixture component and $\bftheta_k$ is the parameter of the respective component. The multi-facet mixture model (\MMM) is described as \[\sum_{k_1=1}^{K_1} \cdots\sum_{k_F=1}^{K_F} \pi^{(1)}_{k_1} \cdots \pi^{(F)}_{k_F} p(\bfy \g \bftheta^{(1)}_{k_1}, \ldots, \bftheta^{(F)}_{k_F})\] where $f=1,\dots,F$ indexes the $F$ facets of the mixture component and we assume independence among these facets a~priori. Here each facet has its respective mixture components described by the probabilities $\bfpi^{(f)}$ and parameters $\bfTheta^{(f)}$. In \MMM, for each sample $\bfy_n$, the cluster assignments for different facets are generated independently, and the sample is generated using the respective parameters simultaneously, i.e.,
\begin{align*}
\zdot_n &\distas \categorical(\bfpi^{(f)})\qquad \forall \;f=1,\ldots,F \\
\bfy_n &\distas p\left(\bfy \,\left| \bftheta^{(1)}_{z^{(1)}_n}, \ldots, \bftheta^{(F)}_{z^{(F)}_n}\right.\right).
\end{align*}
 The facets $\bfthetadot$ typically form an exclusive partition of the entire parameter space, i.e., $\bftheta^{(1)} \cup \ldots \cup \bftheta^{(F)} = \bftheta$ with no parameter shared between facets, allowing the model to disentangle and cluster different aspects of the data; the choice of partition, i.e., of the facets, is guided by the user to provide flexibility and interpretability.
 
%\paragraph{Prior} 
We use the Dirichlet process \citep{Thomas1973DP} as a nonparametric prior for the parameters $\bfthetadot$ of each facet, i.e., 
\begin{align*}
    \Gdot &\distas \DirichletProcess(\Gdot_0, \alphadot)\\
    \bfthetadot_n &\distas \Gdot \qquad \forall\; n = 1,\ldots,N
\end{align*}
Here, $\Gdot$ is a random probability measure made up of discrete values (atoms) for $\bfthetadot$. The Dirichlet process prior has two hyperparameters: the base distribution $G_0^{(f)}$, which is the mean distribution of the Dirichlet process and is commonly chosen to be a conjugate prior; and the concentration parameter $\alphadot$, which controls how many distinct clusters are likely to form \citep{Charles1974DPM}. 
We use the stick-breaking construction \citep{Sethuraman1994stickbreak}, i.e.,
\begin{align*}
\vdot_{k_f} &\distas \betarand(1,\alphadot) \\
\pidot_{k_f} &\quad=\quad\vdot_{k_f}\prod_{i=1}^{k_f-1}(1-\vdot_i)\\
\bfthetadot_{k_f} &\distas G^{(f)}_0(\bfthetadot_{k_f}\g{\bflambda^{(f)}_{k_f}}).
\end{align*}
Here $\bfpi^{(f)}$ follows the Griffiths--Engen--McCloskey (GEM) distribution, and $\bflambda^{(f)}_{k_f}$ denotes, in general, the parameters of the base distribution.
%
%described in \autoref{app:derivation of VI} for the Dirichlet process %the distribution $\Gdot$ is represented as:
% \begin{align*}
%     \Gdot(\bfthetadot) =\sum_{k_f=1}^\infty\pidot_{k_f}(\bfvdot)\bfone(\bfthetadot=\bfthetadot_{k_f})
% \end{align*}
% where $\pidot$ is distributed as the GEM distribution.
Given the influence of the concentration parameter on the growth of components within the data, we place conjugate $\gammarand$ priors on $\alphadot$, as suggested by \citet{David2006VIforDPM}.
\[\alphadot \distas \gammarand({\sdot_1},{\sdot_2})\]
Thus, the parameters for nonparametric \MMM\ are
$\bfTheta=\left\{\bftheta^{(f)}_{k_f}, \vdot_{k_f},\alphadot,\ldots\right\}$ alongside auxiliary variables $\bfZ=\{\zdot_n,\ldots\}$. The plate diagram is shown in \autoref{fig:multi-facet illustration} (right).
%$\bfTheta=\left[\left\{\{\bftheta^{(f)}_{k_f}\}_{k_f=1}^{\infty}\right\}_{f=1}^F, \left\{\bfpidot(\bfvdot)\right\}_{f=1}^F, \left\{\{\zdot_n\}_{n=1}^N \right\}_{f=1}^F \right]$. 

\paragraph{Mean Field Variational Inference}
We apply variational inference using the mean-field method \citep{David2006VIforDPM} to approximate the posterior distribution of the variables of interest. This approach leverages a coordinate ascent algorithm to optimize the evidence lower bound (ELBO). In comparison to the Gibbs sampler, variational inference demonstrates faster convergence, with its runtime largely unaffected by dimensionality \citep{David2006VIforDPM}.
To ensure computational efficiency within the mean-field framework, we adopt fully factorized variational distributions, which assume no dependencies between unobserved variables. In addition, we consider a truncated stick-breaking representation \citep{David2006VIforDPM} to approximate the distribution of the infinite-dimensional random measure $\Gdot$. This approach involves setting a fixed truncation level $\truncbig$ and defining $q(\vdot_\truncbig = 1) = 1$ for any facet parameter, ensuring that the mixture probabilities $\pidot_k(\bfvdot)$ are zero for $k > \truncbig$. We use variational distributions in the same family as the respective priors (see \autoref{eqn:MMM varDist}). The update rules for $\alphadot$ and $\bfvdot$ do not depend on the choice of distribution $p$, while those for $\zdot_n$ and $\bfthetadot$ do (see \autoref{eqn:MMM updateRule}). 

\paragraph{Multi-facet Nonlinear Growth Model}\label{sec:BNPMF-NLG}
The nonlinear growth model captures complex growth dynamics \citep{Suk2018NLG}, and spline functions \citep{Ahlberg1967Spline} have been a well-established method for modelling such nonlinearity. We assume that for each individual $n$, the trajectory $\bfy^\obs_n$ is observed at locations $\bft^\obs_n \in [0,T]^{T^\obs_n}$ where $T^\obs_n$ is the number of observed locations for an individual. Given the individual cluster assignments $\za_n$, $\zbeta_n$ and $\ztau_n$ for each facet and corresponding cluster parameters, the likelihood of the $n$-th time series at time $\bft^\obs_n$ is described as
\begin{equation}\label{eqn:BMFNLG_NA_obs_dist}
\begin{split}
    \bfy^\obs_n\g \za_n=\ka,\zbeta_n=\kbeta,\ztau_n=\ktau\distas\\
    \normal_{T^\obs_n}(a_{\ka}+\bfbeta_{\kbeta}\calB(\bft^\obs_n), \tau_{\ktau}\bfI)
\end{split}
\end{equation}
where $\calB(\bft^\obs_n)\in\reals^{L\times {T^\obs_n}}$ is the basis matrix generated by evaluating spline basis functions at locations $\bft^\obs_n$. We treat the intercept $a$, the coefficient row vector $\bfbeta=(\beta_1,\dots,\beta_L)$ and noise precision $\tau$ as three facets. 
We employ B-spline basis functions of order $p=2$ with $M$ internal knots, %This spline function can be built from a linear combination of a collection of B-splines $\{B_{i,p}(t)\}_{i=1}^L$ of degree $p-1$ with coefficient $\beta_i$. 
and exclude one of the basis functions to explicitly include an intercept term $a$ (see \autoref{proof:B-spline}), thus, $L=M+p-1$. We use the following priors over the parameters, and choose variational distributions in the same family,
\begin{align*}
    a &\sim \normal(\mua,\taua)\\
    \bfbeta &\sim \normal_L(\mubeta,\taubeta\bfI)\\
    \tau &\sim \gammarand(\lambdatau_1,\lambdatau_2).
\end{align*}

\autoref{fig:BNPMF-NLG & BNPMF-VAR flowchart} (left) shows the plate diagram of the multi-facet mixture of nonlinear growth models (\NLG). \autoref{app:NLG} provides additional model specifications and \autoref{app:derivation of VI} derives the update rules.
\tikzset{plate caption/.append style={font=\tiny, below left=0pt and 0pt of #1.south east}}
\begin{figure*}[!htb]
  \centering
  \begin{subfigure}[b]{0.39\linewidth}
  \centering
    \begin{tikzpicture}[node distance = 6mm]
    % Define nodes
    \node[param] (S_beta) at (0, 0) {\(\bfsbeta\)};
    % Top row nodes
    \node[latent] (alpha_beta) [below=0.3cm of S_beta] {\(\alphabeta\)};
    \node[latent] (alpha_a) [left=of alpha_beta] {\(\alphaa\)};
    \node[latent] (alpha_tau) [right=of alpha_beta] {\(\alphatau\)};
    \node[param] (S_a) [left=of S_beta] {\( \bfsa \)};
    \node[param] (S_tau) [right=of S_beta] {\( \bfstau \)};
    % Middle row nodes
    \node[latent] (v_beta) [below=0.3cm of alpha_beta] {\(\vbeta_{\kbeta}\)};
    \node[latent] (v_a) [below=0.3cm of alpha_a] {\(\va_{\ka} \)};
    \node[latent] (v_tau) [below=0.3cm of alpha_tau] {\( \vtau_{\ktau} \)};
    %\node[latent] (pi_beta) [right=of v_beta] {\( \bfpibeta \)};
    %\node[latent] (pi_a) [right=of v_a] {\( \bfpia \)};
    %\node[latent] (pi_tau) [right=of v_tau] {\( \bfpitau \)};
     % Bottom row nodes
    \node[latent] (z_beta) [below=0.4cm of v_beta] {\( \zbeta_n \)};
    \node[latent] (z_a) [below=0.4cm of v_a] {\( \za_n \)};
    \node[latent] (z_tau) [below=0.4cm of v_tau] {\( \ztau_n \)};
    % Observation
    \node[observed] (y_obs) [below=0.3cm of z_beta] {\( \bfy_n^{\obs} \)};
    \node[observed, text width=0.4cm] (B) [below=of z_a] {\( \bft^\obs_n \)};
    \node[latent] (beta_k) [below=0.45cm of y_obs] {\( \bfbeta_{\kbeta} \)};
    \node[latent] (a_k) [left=of beta_k] {\( a_{\ka} \)};
    \node[latent] (tau_k) [right=of beta_k] {\( \tau_{\ktau} \)};
    % Parameters
    \node[param] (lambda_tau) [right=0.3cm of tau_k] {\( \lambdatau_1 \)};
    \node[param] (lambda_tau2) [below=0.25cm of lambda_tau] {\( \lambdatau_2 \)};
    \node[param] (lambda_beta) [below right=0.4cm and -0.2cm of beta_k] {\( \tau^{(\bfbeta)} \)};
    \node[param] (lambda_beta2) [below left=0.4cm and -0.2cm of beta_k] {\( \bfmu^{(\bfbeta)} \)};
    \node[param] (lambda_a) [left=0.3cm of a_k] {\( \mu^{(a)} \)};
    \node[param] (lambda_a2) [below=0.25cm of lambda_a] {\( \tau^{(a)} \)};
    
    % Edges
    \edge {S_beta.south} {alpha_beta.north};
    \edge {S_a.south} {alpha_a.north};
    \edge {S_tau.south} {alpha_tau.north};
    \edge {alpha_beta}{v_beta};
    %\draw[dashededge] (v_beta) -- (pi_beta);
    \edge {v_beta}{z_beta};
    \edge {alpha_a}{v_a};
    %\draw[dashededge] (v_a) -- (pi_a);
    \edge {v_a}{z_a};
    \edge {alpha_tau}{v_tau};
    %\draw[dashededge] (v_tau) -- (pi_tau);
    \edge {v_tau}{z_tau};
    \edge {B}{y_obs};
    \edge {z_a.south,z_beta.south,z_tau.south}{y_obs.north};
    \edge {a_k.north,beta_k.north,tau_k.north}{y_obs.south};
    \edge{lambda_beta.north}{beta_k.south};
    \edge{lambda_beta2.north}{beta_k.south};
    \edge{lambda_a}{a_k};
    \edge{lambda_a2.east}{a_k.west};
    \edge{lambda_tau}{tau_k};
    \edge{lambda_tau2.west}{tau_k.east};
    % Fit background boxes for clarity
    \plate [inner sep=.1cm] {plate1}{(v_a)(v_beta)(v_tau)} {\(\infty\)};
    \plate [inner sep=.1cm] {plate2}{(y_obs)(B)(z_a)(z_tau)}{\(N\)};%xshift=.02cm,yshift=.2cm
    \plate [inner sep=.1cm] {plate3}{(a_k)(beta_k)(tau_k)} {\(\infty\)};
\end{tikzpicture}
%\caption{Nonparametric Bayesian multi-facet NLG model}
%\label{fig:BNPMF-NLG flowchart}
\end{subfigure}
\hfill
\begin{subfigure}[b]{0.6\linewidth}
\centering
\begin{tikzpicture}[node distance = 6.5mm]
    % Define nodes
    \node[param] (S_beta) at (0, 0) {\( \bfsBeta \)};
    % Top row nodes
    \node[latent] (alpha_beta) [below=0.3cm of S_beta] {\( \alphaBeta \)};
    \node[latent] (alpha_a) [left=of alpha_beta] {\( \alpha^{(\bfa)} \)};
    \node[latent] (alpha_tau) [right=of alpha_beta] {\( \alpha^{(\bftau)} \)};
    \node[param] (S_a) [left=of S_beta] {\( \bfs^{(\bfa)} \)};
    \node[param] (S_tau) [right=of S_beta] {\( \bfs^{(\bftau)} \)};
    % Middle row nodes
    \node[latent] (v_beta) [below=0.3cm of alpha_beta] {\( \vBeta_{\kBeta} \)};
    \node[latent] (v_a) [below=0.3cm of alpha_a] {\( v_{\ka}^{(\bfa)} \)};
    \node[latent] (v_tau) [below=0.3cm of alpha_tau] {\( v_{\ktau}^{(\bftau)} \)};
    %\node[latent] (pi_beta) [right=of v_beta] {\( \bfpiBeta \)};
    %\node[latent] (pi_a) [right=of v_a] {\( \bfpi^{(\bfa)} \)};
    %\node[latent] (pi_tau) [right=of v_tau] {\( \bfpi^{(\bftau)} \)};
    % Bottom row nodes
    \node[latent] (z_beta) [below=0.4cm of v_beta] {\( \zBeta_n \)};
    \node[latent] (z_a) [below=0.4cm of v_a] {\( z^{(\bfa)}_n \)};
    \node[latent] (z_tau) [below=0.4cm of v_tau] {\( z^{(\bftau)}_n \)};    
    % Observation
    \node[observed] (y_obs) [below=0.3cm of z_beta] {\( \bfy_{nt} \)};
    \node[observed] (y_t-1) [left=0.2cm of y_obs] {\(\bfy_{n(t-1)}\)};
    \node[observed] (y_t+1) [right=0.2cm of y_obs] {\(\bfy_{n(t+1)}\)};
    \node[text width=0.3cm, align=center,minimum size=0.2cm, node font = \small] (empty1)[left=0.4cm of y_t-1] {\dots};
    \node[observed] (y_1) [left=0.4cm of empty1] {\(\bfy_{n0}\)};
    \node[text width=0.3cm, align=center,minimum size=0.2cm, node font = \small] (empty2)[right=0.4cm of y_t+1] {\dots};
    \node[observed] (y_T) [right=0.4cm of empty2] {\(\bfy_{n(T_n-1)}\)};
    \node[latent] (beta_k) [below=0.45cm of y_obs] {\( \Beta_{\kBeta} \)};
    \node[latent] (a_k) [left=of beta_k] {\( \bfa_{\ka} \)};
    \node[latent] (tau_k) [right=of beta_k] {\( \bftau_{\ktau} \)};
    % Parameters
    \node[param] (lambda_tau) [right=0.3cm of tau_k] {\( \lambdatau_1 \)};
    \node[param] (lambda_tau2) [below=0.25cm of lambda_tau] {\( \lambdatau_2 \)};
    \node[param] (lambda_beta) [below right=0.4cm and -0.2cm of beta_k] {\( \tau^{(\Beta)} \)};
    \node[param, font=\scriptsize] (lambda_beta2) [below left=0.4cm and -0.2cm of beta_k] {\( \muBeta \)};
    \node[param] (lambda_a) [left=0.3cm of a_k] {\( \bfmu^{(\bfa)} \)};
    \node[param] (lambda_a2) [below=0.25cm of lambda_a] {\( \tau^{(\bfa)} \)};
    % Edges
    %\edge {S.south} {alpha_beta.north,alpha_a.north,alpha_tau.north};
     \edge {S_beta.south} {alpha_beta.north};
    \edge {S_a.south} {alpha_a.north};
    \edge {S_tau.south} {alpha_tau.north};
    \edge {alpha_beta}{v_beta};
    %\draw[dashededge] (v_beta) -- (pi_beta);
    \edge {v_beta}{z_beta};
    \edge {alpha_a}{v_a};
    %\draw[dashededge] (v_a) -- (pi_a);
    \edge {v_a}{z_a};
    \edge {alpha_tau}{v_tau};
    %\draw[dashededge] (v_tau) -- (pi_tau);
    \edge {v_tau}{z_tau};
    \edge {y_1}{empty1};
    \edge {empty1}{y_t-1};
    \edge {y_t-1}{y_obs};
    \edge {y_obs}{y_t+1};
    \edge {y_t+1}{empty2};
    \edge {empty2}{y_T};
    \edge {z_a.south,z_beta.south,z_tau.south}{y_obs.north};
    \edge {a_k.north,beta_k.north,tau_k.north}{y_obs.south};
    \edge {a_k.north,tau_k.north}{y_1.south};
    \edge{lambda_beta.north}{beta_k.south};
    \edge{lambda_beta2.north}{beta_k.south};
    \edge{lambda_a}{a_k};
    \edge{lambda_a2.east}{a_k.west};
    \edge{lambda_tau}{tau_k};
    \edge{lambda_tau2.west}{tau_k.east};
    % Fit background boxes for clarity
    \plate [inner sep=.1cm] {plate1}{(v_a)(v_beta)(v_tau)} {\(\infty\)};
    \plate [inner sep=.1cm] {plate2}{(y_obs)(y_1)(y_T)(z_a)(z_beta)(z_tau)}{\(N\)};%xshift=.02cm,yshift=.2cm
    \plate [inner sep=.1cm] {plate3}{(a_k)(beta_k)(tau_k)} {\(\infty\)};
\end{tikzpicture}
%\caption{Nonparametric Bayesian multi-facet VAR model}
\end{subfigure}
\caption{Plate diagram of MMM with \textbf{left)} nonlinear growth model and \textbf{right)} vector autoregressive model.}
\label{fig:BNPMF-NLG & BNPMF-VAR flowchart}
\end{figure*}


\paragraph{Multi-facet Vector Autoregressive Model}\label{sec:BNPMF-VAR}
The Vector Autoregressive (VAR) model \citep{Ltkepohl2007VAR} captures the linear dynamical relationships among multiple time series. In a VAR model involving $D$ variables over $T$ time points, each variable is modelled as a linear transformation of its $P$ preceding values. Here we consider $P=1$ since we are interested in short time series. Given the individual cluster assignments $\za_n$, $\zBeta_n$ and $\ztau_n$ for each facet and the corresponding cluster parameters, the distribution of the $n$-th time series at time $t$ is:
\begin{align}\label{eqn:BMFVAR_obs_dist}
%\begin{split}
\nonumber
    \bfy_{nt}&\g  \bfy_{n(t-1)},\za_n=\ka,\zBeta_n=\kBeta,\ztau_n=\ktau 
    \distas\\    &\normal_D(\bfa_{\ka} + \Beta_{\kBeta}(\bfy_{n(t-1)}-\bfa_{\ka}), \diag(\bftau_{\ktau})).
%\end{split}
\end{align}
We assume the outcome vector at the first time point $\bfy_{n0}\sim\normal_D(\bfa_{\ka},\diag(\bftau_{\ktau}))$. We view the intercept vector $\bfa\in\reals^D$, the coefficient matrix $\Beta\in\reals^{D\times D}$ and the noise precision vector $\bftau\in\reals_+^D$ as three facets.
We use the Yule--Walker representation \citep{ghosh_high-dimensional_2019, Ltkepohl2007VAR} to assign
the intercept $\bfa$ to the trajectory after the evolution equation, allowing the trajectories to be centered around $\bfa$. This can be extended to accommodate varying time lengths $T_n$ across individuals.

We use the following priors over the parameters, and choose the variational distributions from the same families:
\begin{align*}
    \bfa &\sim \normal_D(\bfmua,\taua\bfI)\\
    \Beta &\sim \matnormal_{D,D}(\muBeta,\diag(\tauBeta),\bfI)\\
    \tau_d &\sim \gammarand(\lambdatau_1,\lambdatau_2) \text{ for } d=1,\dots,D
\end{align*}
The right figure in \autoref{fig:BNPMF-NLG & BNPMF-VAR flowchart} shows the plate diagram of the nonparametric Bayesian multi-facet vector autoregressive model (\VAR). \autoref{app:VAR} provides additional model specification, and  \autoref{app:derivation of VAR VI} derives the update rules.


\paragraph{Implementation Details}
%% TO ensure Reproducibility %%
% 1. Datasets 2. Evaluation metrics 3. Hyperparameters settings/model selections, initialization (for applications)
% 4. Results display: a) relative L2 errors & ARI (for simulations) b) est. param visual plots

% Settings: 1. Truncation level (cluster pruning--threshold) 2. Initializations (parallel runs) 3. priors-alpha (eg. Gamma(300,5000))
Bayesian mixture models often face challenges due to the high multimodality of posterior distributions \citep{RamsésH2015Identifiability, stephensMultimodality2000, CarreiraModesGMM2003}. Therefore, we perform multiple optimization runs with diverse initializations in parallel and select the run with the highest ELBO as defined in \autoref{eqn:MMM ELBO}. %This approach mitigates the multimodality problem by improving the likelihood of identifying the optimal posterior mode.
In addition, we incorporate cluster ordering and cluster pruning techniques during the learning process to enhance the algorithm's performance, following the methods demonstrated by \citet{Kurihara2007ClustOrder} and \citet{Kart2018ClustPrune}. Cluster ordering involves rearranging clusters in descending order based on their estimated probabilities at each iteration. Cluster pruning discards clusters whose estimated probabilities fall below a specified threshold, dynamically reducing the number of active clusters during the learning process.

We observe that large clusters are often subdivided into smaller, similar clusters (see \autoref{app:sim}). %—a phenomenon also reported by \citet{zhao2013VBDirchlet}—
We adjust the hyperparameters of $\alphadot$'s prior to have a mean smaller than $1$, with a small variance to encourage the automatic merging of such clusters. We found that combining cluster pruning with a smaller prior mean for $\alphadot$ helps mitigate the cluster splitting when sufficient iterations for convergence are allowed (see \autoref{tab:clust pruning and prior}). Intuitively, while a smaller prior mean for $\alphadot$ can reduce the probability of forming redundant clusters, it typically requires many iterations to reach convergence. Cluster pruning accelerates this process by dynamically shrinking the truncation level $\truncbig$ during each iteration to approximate the optimal number of clusters. %Furthermore, \autoref{tab:clust pruning threshold} illustrates that the pruning threshold can be tuned using ELBO. %In addition, we also investigate the model performance across different truncation levels $\truncbig$ and explore model selection strategies for determining the optimal truncation level based on the ELBO in \autoref{app:sim}, as described by \citep{David2006VIforDPM}.


\section{Simulation Studies}\label{sec:simulation}
%We compute the relative $L_2$ error to measure the accuracy of estimated facet parameters by comparing them with the ground truth, and use the Adjusted Rand Index (ARI) \citep{Hubert1985ComparingP} to evaluate clustering performance against known ground truth clustering assignments. 

\paragraph{\NLG}
We use two datasets of different sizes: \textsc{(NLG-S)} a small dataset with $N=2{,}400$ and \textsc{(NLG-L)} a large dataset with $N=15{,}000$. Furthermore, we use two versions of each dataset, namely, complete \textsc{(C)} (i.e., no missing values) and incomplete \textsc{(I)}. For both cases, the number of time points is set to $T=10$, while for the incomplete dataset, $50\%$ of the values are randomly removed. The ground truth number of facet clusters in the simulated large datasets is $K_a=5,\;K_\bfbeta=5$ and $K_\sigma=5$. \autoref{tab:NLG sim} reports the resulting average relative $L_2$ errors and adjusted Rand indices (ARIs) for the simulated datasets. Visual representations of the estimations are provided in \autoref{app:sim}.

We observed that the estimations for the intercept and coefficient facets are accurate, as indicated by the low relative $L_2$ errors across all datasets, with values consistently near $0$ for the intercept and ranging between $0.006$ and $0.011$ for the coefficient facet. For the noise parameter, while the estimation is precise (relative $L_2$ error between $0.006$ and $0.017$) in the complete or small datasets \textsc{(NLG-S-C, NLG-S-I, NLG-L-C)}, it shows greater error (rel$L_2=0.046$ and ARI $<0.5$) in the large incomplete dataset \textsc{(NLG-L-I)}. In terms of ARI for the intercept and coefficient facets, the model achieves near-perfect clustering results (ARI $>0.9$) under the complete or small datasets \textsc{(NLG-S-C, NLG-S-I, NLG-L-C)} but demonstrates less accurate results ($0.7<\text{ARI}<0.9$) for the large incomplete dataset \textsc{(NLG-L-I)}. 
Both results are expected since the substantial missingness might hinder the model's ability to effectively infer noise from the time series data points and degrade clustering performance.
We found that most mis-clustered trajectories originate from clusters with large noise. This is reasonable as individual trajectories within high-noise clusters often deviate significantly from the mean, making them more susceptible to being mis-clustered into other groups.

A comparison of computational efficiency across inference methods is presented in \autoref{tab:runtime comparison} in the Appendix. Our Variational Bayes method achieves runtimes between ADVI and MLE, offering a substantial speed-up over MCMC while maintaining competitive performance.

\paragraph{\VAR}
We test the model on both small and large datasets with varying time lengths, containing up to $T=10$ time points and $D=3$ variables. The ground truth number of facet clusters is $K_{\bfa}=3,\;K_\Beta=3$ and $K_\bfsigma=3$. The results in \autoref{tab:VAR sim} and visual representations in \autoref{app:sim} demonstrate the model's performance. 

For the smaller dataset ($N=1{,}000$), the relative $L_2$ errors for the intercept, coefficient, and noise facets are $0.002$, $0.017$, and $0.028$, respectively, with corresponding ARIs of $0.915$, $1.0$, and $0.997$, indicating near-perfect clustering accuracy. In the larger dataset ($N=6{,}000$), estimation accuracy for the intercept and noise facets improves further (errors decrease to $0.001$ and $0.012$), while the error for the coefficient matrix increases to $0.09$. Nevertheless, ARIs remain high ($0.843$ for $\bfa$, $0.982$ for $\Beta$, and $0.989$ for $\bfsigma$), demonstrating accurate clustering and estimation performance across varying-length multivariate time series.


\section{Applications}\label{sec:application}
% Need to describe the single-facet clustering results obtained by other literature.

In this section, we apply the \NLG\ and \VAR\ models to two distinct time series datasets derived from the English Longitudinal Study of Ageing (ELSA) \citep{BanksELSA2023}. ELSA is a nationally representative dataset of individuals aged $50$ and older, residing in private households and originally derived from the Health Survey for England in 2002. Comprehensive methodological details on ELSA can be found in \citet{PacchiottiELSAreport2021}. %This dataset has also been used by \citet{marshallIncomeTrajectoriesPrecarity2024} to analyze income trajectories in later life and their associations with factors such as social class, gender, and precarity in housing, pensions, relationships, care, and retirement. In the current application, we analyze a subset of the ELSA dataset to identify multi-facet clusters and compare these results with existing results from single-facet clustering methods.

%\newpage
\paragraph{ELSA Income Data}
The ELSA income data used in this study is consistent with \citet{marshallIncomeTrajectoriesPrecarity2024}. A final sample of $13{,}002$ respondents is selected by including only individuals who participated in at least two waves of ELSA and reported incomes ranging between $\pounds0$ and $\pounds1000$ per week to reduce the influence of outliers. The income variable used in this analysis is the equivalised total income at the ``benefit unit'' level, which includes either a single individual or a couple with any dependent children \citep{marshallIncomeTrajectoriesPrecarity2024}. This equivalisation process adjusts the reported income values so that they represent the income of a single-person household, making it possible to compare households fairly regardless of their size. Additionally, all income data is adjusted for inflation from 2002 to 2019, using 2018/19 as the base year. The analysis considers ages between $50$ and $90$, observing up to nine time points across nine waves for each individual spanning at most 18 years of their life. Due to this longitudinal framework, the missing data rate is significantly high at $86.8\%$.

\begin{figure*}[!htb]
  \centering
  \includegraphics[width=0.99\linewidth]{uai2025-template/pic/res/ELSA_income_results.png}
\caption{\textbf{a)} Seven intercept clusters named ``Poor'', ``Poor High'', ``Comfortable Low'', ``Comfortable'', ``Comfortable High'', ``Luxury Low'' and ``Luxury'' that broadly fall into the categories of ``Poor'', ``Comfortable'' and ``Luxury'', at the retirement age of $67$. \textbf{b)} Five shape clusters, ``Stable'' income, ``Pre-retirement Drop'' in income, ``Pre-retirement Spike Low'' in income, ``Pre-retirement Spike High'' in income and ``Retirement Spike'' in income. \textbf{c)} Four noise clusters. Relative risk ratios (RRRs) for \textbf{d)} intercept \textbf{e)} shape and \textbf{f)} noise clusters.}
  \label{fig:ELSA income estimations}
\end{figure*}


The single-facet clustering analysis reported by \citet{marshallIncomeTrajectoriesPrecarity2024} identified ten distinct income trajectory clusters. These clusters were later consolidated into four broader categories of later-life income trajectories based on stable and similar income levels following the statutory retirement age of $65$. The resulting categories were labelled as ``Luxury'' (retirement income at or above $\pounds500$ per week), ``Comfortable'' ($\pounds300$ to $\pounds500$ per week in retirement), ``Always Poor'' (generally below $\pounds300$ per week in retirement) and ``Boom to Bust'' (income rising to $\pounds600$ per week by age 70, then declining to around $\pounds200$ after age 80). A critical differentiating factor among the income trajectory clusters within these broader groups was the degree of income volatility experienced between the ages of $50$ and $65$. Three distinct volatility patterns were identified: a pre-retirement income drop, a spike in pre-retirement income, and stable income trajectories.

We applied our \NLG\ model with an intercept shift aligned to the retirement age of $67$ (proof in \autoref{proof:B-spline shift}) to the same dataset to learn multi-facet clustering results. The shift was applied to gain better interpretability in the context of retirement age, as done by \citet{marshallIncomeTrajectoriesPrecarity2024}, and also because the missing rate was lower around this age (see \autoref{app:application}). The dataset spans $T=41$ time steps corresponding to ages $50$ through $90$. We used linear B-splines ($p=2$) with $3$ equidistant internal knots positioned at ages $60$, $70$ and $80$, following the approach in \citet{marshallIncomeTrajectoriesPrecarity2024}. Given the income range of $\pounds0$ to $\pounds1000$, the prior mean of the intercept is appropriately set at $500$ with a standard deviation of $300$, to reflect the central tendency and variability within this range. Moreover, we set the truncation level $\truncbig=20$ with a pruning probability threshold of $0.05$ (see \autoref{app:application}), and specify the prior for $\alphadot$ as $\gammarand(300,5000)$. To ensure a robust exploration of the optimization landscape, we conducted $50$ parallel runs of our Variational Bayesian framework.

\begin{figure*}[!htb]
  \centering
  \includegraphics[width=0.9\linewidth]{uai2025-template/pic/res/ELSA_income_Clust.vs.Single.png}
\caption{Contingency table of cluster assignments between multi-facet (column) and single-facet (row) clustering presented as percentage over whole population.}
  \label{fig:ELSA income vs single clust}
\end{figure*}


The estimated clusters in each facet are visually presented in \autoref{fig:ELSA income estimations}. The model identified seven intercept clusters at $193.9$, $263.1$, $313.69$, $356.01$, $415.45$, $502.32$ and $650.52$. We refer to these clusters as ``Poor'', ``Poor High'', ``Comfortable Low'', ``Comfortable'', ``Comfortable High'', ``Luxury Low'' and ``Luxury'', aligning with the findings of \citet{marshallIncomeTrajectoriesPrecarity2024}. Moreover, five distinct income trajectory shapes were identified: ``Stable income'' (Cluster 1), ``Pre-retirement Drop in income'' (Cluster 2), ``Pre-retirement Spike Low in income'' (Cluster 3), ``Pre-retirement Spike High in income'' (Cluster 4), and ``Retirement Spike in income'' (Cluster 5). These findings are largely consistent with those of \citet{marshallIncomeTrajectoriesPrecarity2024}. The four noise clusters reveal significant variability in income trajectories. The noise variable captures the volatility of income around an average trend, which was not captured previously, thus adding a further dimension to the analysis of income trajectories.


\begin{figure*}[!htb]
  \centering
  \includegraphics[width=0.99\linewidth]{uai2025-template/pic/res/ELSA_multivarTS_results.png}
   \caption{\textbf{a)} Eight intercept clusters. \textbf{b)} Three coefficient matrix clusters. \textbf{c)} Three noise clusters. Relative risk ratios (RRRs) for \textbf{d)} intercept \textbf{e)} coefficients and \textbf{f)} noise clusters.}
  \label{fig:ELSA multivar estimations}
\end{figure*}

%\newpage
Following  \citet{marshallIncomeTrajectoriesPrecarity2024}, we explore the drivers of these distinct patterns using multinomial regression models with social class, gender, and precarity in housing, pensions, relationships, care, and retirement as predictors. 
We observe that lower education decreases the likelihood of belonging to the ``Luxury'' cluster compared to the ``Poor'' cluster (RRR $<0.5$, $p<0.001$), while having an occupational pension increases the likelihood of belonging to the ``Luxury'' cluster compared to the ``Poor'' cluster (RRR $=3.597$, $p<0.001$). Compared to the ``Stable'' cluster, lower education decreases the likelihood of belonging to the ``Pre-retirement Spike'' clusters (RRR $<0.7$, $p<0.02$). Compared to the ``Stable'' cluster, being widowed increases the likelihood of belonging to the ``Pre-retirement Drop'' cluster (RRR $=1.505$, $p<0.001$), while having an occupational pension decreases the likelihood of belonging to the ``Pre-retirement Drop'' cluster (RRR $=0.872$, $p<0.01$). Compared to the ``Lowest Noise'' cluster, lower education decreases the likelihood of belonging to the other, higher-noise clusters, e.g., ``Highest Noise'' (RRR $<0.5$, $p<0.001$), while the opposite effect is observed for having an occupational pension.

%\textbf{Results: comparison with single-facet.}
We compare the cluster assignments from our multi-facet model with those from the single-facet model explored by \citet{marshallIncomeTrajectoriesPrecarity2024}, which identified ten clusters, and summarize this in \autoref{fig:ELSA income vs single clust}. We observe general consistency between the two clustering approaches when viewed through a multi-faceted lens. In the intercept facet, a significant proportion of individuals assigned to ``Poor'' and ``Poor High'' clusters in the single-facet model have also been assigned to ``Poor'' and ``Poor High'' clusters in the multi-facet model, although some ``Poor High'' individuals in the former have been assigned to ``Comfortable Low'' in the latter. A similar situation is also observed in the ``Luxury'' clusters in the single-facet model that have been aligned with ``Luxury'' cluster in the multi-facet model except ``Luxury-Pre-retirement Drop'' cluster that has been aligned with ``Comfortable High'' and ``Luxury Low'' clusters.  Similarly, a significant proportion of individuals assigned to ``Luxury-Pre-retirement Drop'' and ``Comfortable-Pre-retirement Drop'' clusters in single-facet model have also been assigned to ``Pre-retirement Drop'' cluster in the multi-facet clustering model. A similar situation is observed for ``Pre-retirement Spike'' clusters in the single-facet model that have been aligned to ``Pre-retirement Spike Low'' and ``Pre-retirement Spike High'' clusters in the multi-facet model. In the context of the noise facet, a significant proportion of individuals from most clusters in the single-facet clustering except ``Poor'', ``Poor High'', ``Comfortable-Stable Income'', and ``Luxury-Medium Stable Income'' have been assigned to the ``Highest Noise'' cluster, while the rest of the clusters align with ``Moderate Noise'' and ``Second Lowest Noise'' clusters with ``Poor'' and ``Poor High'' also aligning with the ``Lowest Noise'' cluster in the multi-facet model.

\newpage
\paragraph{ELSA Multivariate Data}
The ELSA multivariate dataset analysed includes $6640$ individuals with varying time lengths. For each individual, a minimum of $4$ time points is observed across nine survey waves. We examine the trajectories of three variables for each individual: ``frailty'', ``wellbeing'' and ``social isolation''. Frailty is quantified using a frailty index \citep{Marshall316}%\citep{Rockwood2012Frailty}
, ranging from $0$ to $56$. Wellbeing is measured using CASP-19 \citep{Howel2012CASP19}, with a scale ranging from $0$ to $57$, and social isolation is assessed using an existing scale of $0$ to $6$ \citep{davies_chandola_todd_pendleton_2021}. To ensure consistency, all scales are standardized to range between $0$ and $1$. Additionally, missing entries in each time series are imputed using mean imputation for each individual.

We applied our first-order \VAR\ model to this dataset, which consists of $T_n \in [4,9]$ time steps per individual and $D=3$ variables. Since the values for each variable were standardized to fall between $0$ and $1$, the prior mean of the intercepts was set to $0.5$, with a prior standard deviation of $0.1$. The truncation level was set to $\truncbig=20$ with a pruning threshold of $0.05$, and the prior for $\alphadot$ was specified as $\gammarand(300,5000)$. To ensure robust optimization, we performed $50$ parallel runs using our VB framework.

% connectivity
The estimated clusters in each facet are visually displayed in \autoref{fig:ELSA multivar estimations}. 
We found eight intercept clusters representing varying levels of frailty, wellbeing and social isolation. Coefficient clusters 2 and 3 largely align with the intuitive notion that wellbeing has a negative impact on frailty. However, clusters 1 and 2 demonstrate a more counterintuitive relationship where the opposite is observed, i.e., wellbeing positively impacting frailty and vice versa. Cluster 3 shows a strong negative influence of frailty on wellbeing. The noise facet reveals significant variability in social isolation trajectories in clusters 1 and 3 compared to cluster 2, while clusters 2 and 3 are the clusters least and most driven by noise, respectively.

Similar to our analysis in the previous section, we explore the drivers of these patterns using covariates. Due to the smaller size of the data and the large number of clusters, we mostly observe broad confidence intervals from the multinomial regression. However, some interesting patterns appear nonetheless. Compared to intercept cluster 1, being divorced or widowed increases (RRR $>1.5$, $p<0.001$) the likelihood of belonging to cluster 6 (higher frailty and lower wellbeing than cluster 1) and cluster 8 (higher social isolation than cluster 1). An opposite effect is observed for intercept clusters 3, 4 (lower social isolation than cluster 1) and cluster 7 (lower wellbeing and social isolation than cluster 1) (RRR $<0.3$, $p<0.001$). Compared to coefficient cluster 3 (intuitive direction of wellbeing negatively affecting frailty), receiving care decreases the likelihood of belonging to clusters 1 and 2 (counterintuitive direction of wellbeing positively affecting frailty, RRR $<0.5$, $p<0.001$). Compared to noise cluster 1, receiving care increases the likelihood of belonging to cluster 3 (higher noise variance in wellbeing and frailty compared to cluster 1, RRR $=5.294$, $p<0.001$). Compared to noise cluster 1, being widowed decreases the likelihood of belonging to cluster 2 (lower noise variance in social isolation compared to cluster 1, RRR $=0.823$, $p<0.02$).


%\newpage
\section{Discussion}\label{sec:discussion}
% 1. Summary of findings 2. Relate to literature: highlight similarities, differences or advancements 3. Implications of the Results 4. Contributions 5. Limitations 6. Future improvements and directions
%% pros and cons of assuming Gaussian.
% \textbf{a) illustrate the generality of our approach eg. HMM or models of Dynamic systems; b) clarify choice of facets; c) priori independence of facets: possible solution?}

Traditional time series clustering methods, like those used by \citet{marshallIncomeTrajectoriesPrecarity2024} and \citet{bulteelClusteringVectorAutoregressive2016}, typically produce a single clustering solution using all facets simultaneously and require extensive post-analysis to interpret the clusters. In contrast, the nonparametric Bayesian multi-facet clustering model disentangles multiple facets within a dataset, each described by distinct characteristics. This enhances interpretability by providing clearer insight into why clusters form and what defines them. Additionally, tying facets directly to model parameters offers an intuitive way to explain clustering outcomes.

In this paper, we present an extension to existing multi-facet mixture models. First, we incorporate a variational Bayesian framework, which offers enhanced computational efficiency and is particularly well-suited for large-scale datasets, in contrast to traditional MCMC sampling methods. Second, we incorporate Dirichlet process priors to simultaneously learn the number of facet clusters, removing the need to predefine this value. Third, we apply multi-facet clustering in the context of nonlinear growth models. Fourth, we capture an additional dimension, i.e., the noise characteristics as a facet for both nonlinear growth model and multivariate regression model. Fifth, we demonstrate the versatility of the proposed method through two detailed time series model applications tested on real datasets.

This multi-facet clustering framework can be further generalised to a broader class of time series models by adopting alternative likelihood functions. For example, a Hidden Markov Model (HMM) can be used for categorical response data, where the facets may correspond to the columns of the transition matrix, capturing distinct state dynamics. %Similarly, in models of dynamic systems with intractable likelihoods, the facets could represent specific parameters governing the interactions between system components. 
This flexibility allows the multi-facet approach to be adapted to diverse temporal modelling contexts, enabling interpretable clustering based on model-specific structural elements.

%Simulation results highlight the effectiveness of the \NLG\ model in capturing multi-facet clusters in large longitudinal datasets with randomly missing data, while the \VAR\ model performs well on smaller, multivariate time series. 

While the method and analysis have notable strengths, some limitations remain. From a conceptual perspective, the a~priori independence of facets is a strong assumption that treats any combination of parameters from the facets as feasible. However, in practice, this might not hold, thus creating a model mismatch, as illustrated in \autoref{fig:multi-facet illustration} in the introduction. This limitation can be addressed by allowing facets to be dependent, but this can increase the number of cluster probabilities being estimated, potentially impacting computational efficiency and model identifiability. Another aspect of the multi-facet clustering is the choice of facets. For instance, in the VAR model, the coefficient matrix facet can be further decomposed into row-wise facets to capture variable-specific interaction patterns (as discussed by \citet{kundu2024BayesProductMixture}). This alternative facet specification may lead to different clustering outcomes. Therefore, the definition and granularity of facets are inherently subjective and should be guided by the research question and the interpretability of the underlying parameter components. %We consider different parameters as facets but this can be done in various manners as well for a given model. %For example, in the nonlinear growth model the intercept captures the income at age 50 while it might be interesting to reparameterize as income at the retirement age. 
From an implementation perspective, a tuning of the pruning threshold may be necessary to address the splitting of similar clusters. This process can be improved and validated more extensively on simulated data. %From an analysis perspective, we do not consider missing values in the VAR model, and for simplicity only consider time series of the same length. This limitation can be addressed by extending the implementation with appropriate assumptions thus allowing the VAR model to be applied to a much larger dataset on ELSA, and potentially drawing more meaningful and significant interpretation. Furthermore, expanding the application of multi-facet mixture models to other traditional longitudinal models and diverse datasets would further validate their generalizability and enhance their practical utility. We are currently exploring all these avenues.

Multi-facet clustering offers a unique and exciting direction to clustering complex temporal data in an interpretable manner. This study takes a step in this direction by implementing this method on several temporal models, and applying this approach to complex real-world applications. 
The framework offers valuable insights into complex real-world phenomena and provides a flexible, interpretable, and computationally efficient approach for analysing multi-faceted real datasets.

\newpage

%%%%%%%%%%%%%%% end of main context %%%%%%%%%%%%%%%%%%
\begin{acknowledgements} % will be removed in pdf for initial submission, (without ‘accepted’ option in \documentclass) so you can already fill it to test with the ‘accepted’ class option
Research reported in this publication was supported by the National Institute On Aging of the National Institutes of Health under Award Number R01AG017644. The content is solely the responsibility of the authors and does not necessarily represent the official views of the National Institutes of Health.
ELSA is funded by the NIHR Policy Research Programme (HEI) 198\_1074\_03. The views expressed are those of the author(s) and not necessarily those of the NIHR or the Department of Health and Social Care. %The authors acknowledge that the English Longitudinal Study of Ageing (ELSA) is funded by the National Institute on Aging (grant R01AG017644) and supported by UK Government Departments, with coordination provided by the National Institute for Health and Care Research (NIHR). 
Wang is supported by the United Kingdom Research and Innovation (grant EP/S02431X/1), UKRI Centre for Doctoral Training in Biomedical AI at the University of Edinburgh, School of Informatics. Richards is supported by the National Institute for Health and Care Research (NIHR) under its Artificial Intelligence for Multiple and Long-Term Conditions Programme (project references NIHR203982), and Seth is partly supported by the National Institute for Health and Care Research (NIHR) under its Artificial Intelligence for Multiple and Long-Term Conditions Programme (reference number NIHR202639 and NIHR203982). The views expressed are those of the author and not necessarily those of the NIHR or the Department of Health and Social Care. Seth is partly supported by the Legal \& General Group (research grant to establish the independent Advanced Care Research Centre at the University of Edinburgh). The funder had no role in the conduct of the study, interpretation or the decision to submit for publication. The views expressed are those of the authors and not necessarily those of Legal \& General.
\end{acknowledgements}

% References
\bibliography{reference}

\newpage
\onecolumn
\title{Supplementary Material}
\maketitle
\appendix


\section{Additional Results}
\subsection{Simulation}\label{app:sim}
In this section, we show additional results for simulations. Specifically, we first examine learning outcomes under various truncation levels for both the \NLG\ and \VAR\ models without cluster pruning, as shown in \autoref{fig:L2 ARI vs K}. The findings suggest that learning outcomes are generally stable when the truncation level exceeds the true number of clusters. However, a notable issue arises where some large clusters may split into multiple smaller, similar clusters. This phenomenon is observed in the visual representations of estimations for both models. \autoref{fig:NLGNA k10 estimations} and \autoref{fig:NLGNA k50 estimations} provide visual representations of \NLG\ estimations on incomplete datasets across different truncation levels, while \autoref{fig:VAR K10 estimations} and \autoref{fig:VAR K50 estimations} depict estimations for \VAR\ under varying truncation levels. The ARIs fluctuate due to cluster splitting issues, which can slightly degrade the ARI values. We then investigate the relationship between ELBO and truncation levels in \autoref{fig:ELBO vs K sim}. The figure shows that the highest ELBO is achieved at the correct truncation level, which aligns with the notion of cluster pruning during the learning process. %a potential strategy for selecting an appropriate truncation level, as suggested by \citep{David2006VIforDPM}.

We demonstrate the effects of cluster pruning and tuning the hyperparameters of $\alphadot$'s prior in simulation for the \VAR\ model, as shown in \autoref{tab:clust pruning and prior}. To ensure a fair comparison, we use the same random seed for all runs to eliminate the influence of initialization, along with a fixed maximum of $500$ iterations and a truncation level of $20$. Our results show that combining cluster pruning with hyperparameter tuning of the $\alphadot$ prior not only facilitates convergence to the optimal number of clusters but also accelerates convergence, requiring fewer iterations. Moreover, we explore the impact of tuning the pruning threshold based on ELBO, as shown in \autoref{tab:clust pruning threshold}, using the same prior settings, maximum number of iterations, and truncation level.

\begin{table*}[!htb]
    \centering
    \caption{The Average Relative $L_2$ Errors and ARIs of Facets for \NLG\ Simulation.}
    \label{tab:NLG sim}
    \begin{tabular}{cccc|ccc}\toprule
        Dataset & rel$L_2\;a$ & rel$L_2\;\bfbeta$& rel$L_2\;\sigma$ & ARI $a$ & ARI $\bfbeta$ & ARI $\sigma$\\\midrule
        $N=2,400; \text{NA}\%=0$ & 0.000 & 0.006 & 0.007 & 0.999 & 1.0 & 0.999\\
        $N=2,400; \text{NA}\%=20$ & 0.000 & 0.010 & 0.006 & 1.0 & 0.996 & 0.995\\
        $N=15,000; \text{NA}\%=0$ & 0.000 & 0.009 & 0.017 & 0.984 & 0.930 & 0.734\\
        $N=15,000; \text{NA}\%=50$ & 0.000 & 0.011 & 0.046 & 0.886 & 0.73 & 0.426\\\bottomrule
    \end{tabular}
\end{table*}

\begin{table*}[!htb]
    \centering
    \caption{The Average Relative $L_2$ Errors and ARIs of Facets for \VAR\ Simulation.}
    \label{tab:VAR sim}
    \begin{tabular}{cccc|ccc}\toprule
        Dataset & rel$L_2\;\bfa$ & rel$L_2\;\Beta$& rel$L_2\;\bfsigma$ & ARI $\bfa$ & ARI $\Beta$ & ARI $\bfsigma$\\\midrule
        $N=1,000; \text{NA}\%=0$ & 0.002 & 0.017 & 0.028 & 0.915 & 1.0 & 0.997\\
         $N=6,000; \text{NA}\%=0$ & 0.001 & 0.09 & 0.012 & 0.843 & 0.982 & 0.989\\\bottomrule
    \end{tabular}
\end{table*}

\begin{figure}[!htb]
  \centering
  \begin{subfigure}[c]{0.33\linewidth}
    \includegraphics[width=\linewidth]{pic/app/NLGsim15000_Ks.png}
    \caption{\NLG\ complete simulated data}
  \end{subfigure}
  \hfill
  \begin{subfigure}[c]{0.33\linewidth}
    \includegraphics[width=\linewidth]{pic/app/NLGsim15000NA_Ks.png}
    \caption{\NLG\ incomplete simulated data}
  \end{subfigure}
  \hfill
  \begin{subfigure}[c]{0.33\linewidth}
    \includegraphics[width=\linewidth]{pic/app/VARsim1000_Ks.png}
    \caption{\VAR\ simulated data}
  \end{subfigure}
  \caption{The relative $L_2$ errors and ARIs vs. different truncation levels for simulations.}
\label{fig:L2 ARI vs K}
\end{figure}

\begin{figure}[!htb]
  \centering
  \begin{subfigure}[c]{0.49\linewidth}
    \includegraphics[width=\linewidth]{pic/app/NLGsim15000s_ELBOvsK.png}
    \caption{\NLG\ model (ground truth at $5$)}
  \end{subfigure}
  \hfill
  \begin{subfigure}[c]{0.49\linewidth}
    \includegraphics[width=\linewidth]{pic/app/VARsim1000_ELBOvsK.png}
    \caption{\VAR\ model (ground truth at $3$)}
  \end{subfigure}
  \caption{The ELBO vs. different truncation levels for simulations. Both models achieve their highest ELBO when the truncation level matches the ground truth number of clusters.}
\label{fig:ELBO vs K sim}
\end{figure}

\begin{table}[!htb]
    \centering
    \caption{Demonstration of Effects of Cluster Pruning and Hyperparameter Tuning of $\alphadot$ Prior by \VAR\ Simulation.}
    \label{tab:clust pruning and prior}
    \begin{tabular}{cccccc}\toprule
     Cluster pruning  & $\alphadot$ prior tuning & Iterations used & \#Cluster $\bfa$ & \#Cluster $\Beta$ & \#Cluster $\bfsigma$ \\\midrule
     No & No & 500 & 4 & 12 & 3 \\
     No & Yes & 500 & 3 & 8 & 3 \\
     Yes & No & 500 & 3 & 4 & 3 \\
     Yes & Yes & 470 & 3 & 3 & 3 \\\midrule
     \multicolumn{3}{c}{True \#Clusters}& 3 & 3 & 3\\\bottomrule
    \end{tabular}
\end{table}

\begin{table}[!htb]
    \centering
    \caption{Demonstration of Tuning Cluster Pruning Threshold by \NLG\ Simulation.}
    \label{tab:clust pruning threshold}
    \begin{tabular}{ccccc}\toprule
     Pruning threshold & ELBO & \#Cluster $a$ & \#Cluster $\bfbeta$ & \#Cluster $\sigma$ \\\midrule
     No pruning & -189642.87 & 6 & 10 & 6 \\
     0.01 & -186681.43 & 5 & 7 & 7 \\
     0.02 & -185241.61 & 5 & 6 & 6 \\
     0.04 & -184177.22 & 5 & 5 & 5 \\
     0.06 & -183491.80 & 5 & 5 & 5 \\
     0.07 & \textbf{-183135.41} & 5 & 5 & 5 \\
     0.08 & -190081.36 & 5 & 5 & 5 \\\midrule
     \multicolumn{2}{c}{True \#Clusters}& 5 & 5 & 5\\\bottomrule
    \end{tabular}
\end{table}


\begin{table}[ht]
    \centering
    \caption{Average runtime of the BMF-NLG model using different inference methods implemented in \textsc{RStan}.}
    \begin{tabular}{cccc}\toprule
         Size $N$&  HMC&  ADVI& MLE\\\midrule
         250&  $\sim$1h&  $\sim$3min& $\sim$30s\\
         300&  $\sim$4h&  $\sim$10min& $\sim$80s\\
         1200&  $\sim$15h&  $\sim$1h& $\sim$7min\\
         2400&  $\sim$60h&  $\sim$2h& $\sim$20min\\
         15000&  $>$10 days &  $>$2 days & $\sim$20h\\\bottomrule
    \end{tabular}
    \label{tab:runtime comparison}
\end{table}


\begin{figure}[!htb]
  \centering
  \begin{subfigure}[c]{0.496\linewidth}
    \includegraphics[width=0.7\linewidth]{pic/app/NLGsim15000NA_K10.Init0pointEstA.png}
    \caption{$a$}
  \end{subfigure}
  \hfill
  \begin{subfigure}[c]{0.496\linewidth}
    \includegraphics[width=0.7\linewidth]{pic/app/NLGsim15000NA_K10.Init0pointEstbeta.png}
    \caption{$\bfbeta$}
  \end{subfigure}
  \\
  \begin{subfigure}[c]{0.496\linewidth}
    \includegraphics[width=0.7\linewidth]{pic/app/NLGsim15000NA_K10.Init0pointEstSIGMA.png}
    \caption{$\sigma$}
  \end{subfigure}
  \hfill
  \begin{subfigure}[c]{0.496\linewidth}
    \includegraphics[width=0.7\linewidth]{pic/app/NLGsim15000NA_K10.Init0Pis.png}
    \caption{$\bfpi$ for facets}
  \end{subfigure}
  \caption{Parameter estimations of the \NLG\ model on the large incomplete dataset at truncation level 10. The true number of clusters for all three facets is $5$.}
  \label{fig:NLGNA k10 estimations}
\end{figure}

\begin{figure}[!htb]
  \centering
  \begin{subfigure}[c]{0.496\linewidth}
    \includegraphics[width=0.7\linewidth]{pic/app/NLGsim15000NA_K50.Init0pointEstA.png}
    \caption{$a$}
  \end{subfigure}
  \hfill
  \begin{subfigure}[c]{0.496\linewidth}
    \includegraphics[width=0.7\linewidth]{pic/app/NLGsim15000NA_K50.Init0pointEstbeta.png}
    \caption{$\bfbeta$}
  \end{subfigure}
  \\
  \begin{subfigure}[c]{0.496\linewidth}
    \includegraphics[width=0.7\linewidth]{pic/app/NLGsim15000NA_K50.Init0pointEstSIGMA.png}
    \caption{$\sigma$}
  \end{subfigure}
  \hfill
  \begin{subfigure}[c]{0.496\linewidth}
    \includegraphics[width=0.7\linewidth]{pic/app/NLGsim15000NA_K50.Init0Pis.png}
    \caption{$\bfpi$ for facets}
  \end{subfigure}
  \caption{Parameter estimations of the \NLG\ model on the large incomplete dataset at truncation level 50.}
  \label{fig:NLGNA k50 estimations}
\end{figure}

\begin{figure}[!htb]
  \centering
  \begin{subfigure}[c]{0.48\linewidth}
    \includegraphics[width=0.7\linewidth]{pic/app/VARsim1000_K10.Init0pointEstA.png}
    \caption{$\bfa$}
  \end{subfigure}
  \hfill
  \begin{subfigure}[c]{0.51\linewidth}
    \includegraphics[width=0.7\linewidth]{pic/app/VARsim1000_K10.Init0pointEstBeta.png}
    \caption{$\Beta$}
  \end{subfigure}
  \\
  \begin{subfigure}[c]{0.48\linewidth}
    \includegraphics[width=0.7\linewidth]{pic/app/VARsim1000_K10.Init0pointEstSIGMA.png}
    \caption{$\bfsigma$}
  \end{subfigure}
  \hfill
  \begin{subfigure}[c]{0.51\linewidth}
    \includegraphics[width=0.7\linewidth]{pic/app/VARsim1000_K10.Init0Pis.png}
    \caption{$\bfpi$ for facets}
  \end{subfigure}
  \caption{Parameter estimations of the \VAR\ model on the simulated dataset when truncation level is $10$. The true number of clusters for all three facets is $3$.}
  \label{fig:VAR K10 estimations}
\end{figure}

\begin{figure}[!htb]
  \centering
  \begin{subfigure}[c]{0.48\linewidth}
    \includegraphics[width=0.7\linewidth]{pic/app/VARsim1000_K50.Init0pointEstA.png}
    \caption{$\bfa$}
  \end{subfigure}
  \hfill
  \begin{subfigure}[c]{0.51\linewidth}
    \includegraphics[width=0.7\linewidth]{pic/app/VARsim1000_K50.Init0pointEstBeta.png}
    \caption{$\Beta$}
  \end{subfigure}
  \\
  \begin{subfigure}[c]{0.48\linewidth}
    \includegraphics[width=0.7\linewidth]{pic/app/VARsim1000_K50.Init0pointEstSIGMA.png}
    \caption{$\bfsigma$}
  \end{subfigure}
  \hfill
  \begin{subfigure}[c]{0.51\linewidth}
    \includegraphics[width=0.7\linewidth]{pic/app/VARsim1000_K50.Init0Pis.png}
    \caption{$\bfpi$ for facets}
  \end{subfigure}
  \caption{Parameter estimations of the \VAR\ model on the simulated dataset when truncation level is $50$. For the coefficient facet in (b), Cluster 1 and Cluster 7 are the same, Cluster 2, Cluster 4, Cluster 8 and Cluster 10 are the same while Cluster 3, Cluster 5, Cluster 6 and Cluster 9 are the same.}
  \label{fig:VAR K50 estimations}
\end{figure}


\clearpage
\subsection{Applications}\label{app:application}
In this section, we present additional results for the application. \autoref{fig:ELBO income intercept missingness} shows the missing rate in the ELSA income dataset at different ages. We see that the missing rate at the retirement age of 67 is relatively low. As a result, we changed the intercept facet from age 50 to retirement age 67. \autoref{fig:prunThres vs K real} displays the impact of different cluster pruning thresholds on real datasets. Due to the high missing rate and high noise in the real datasets, we choose the pruning threshold based on the number of clusters and the smallest cluster probability we expect ($0.05$ for both datasets).

\begin{figure}[!ht]
    \centering
    \includegraphics[width=0.6\linewidth]{uai2025-template/pic/app/ELSA_income_missing_rate.png}
    \caption{The missing rate of ELSA data for the NLG model at each age time point.}
    \label{fig:ELBO income intercept missingness}
\end{figure}


\begin{figure}[!htb]
  \centering
  \begin{subfigure}[c]{0.49\linewidth}
    \includegraphics[width=\linewidth]{pic/app/ELSA_income_K10_vs_PruneThreshold_bubbleplot.png}
    \caption{ELSA income data}
  \end{subfigure}
  \hfill
  \begin{subfigure}[c]{0.49\linewidth}
    \includegraphics[width=\linewidth]{pic/app/ELSA_multivarTS_large_K10_vs_PruneThreshold_bubbleplot.png}
    \caption{ELSA multivariate data}
  \end{subfigure}
  \caption{Different pruning thresholds vs. number of clusters for real datasets. We compute the proportion of the number of clusters learned from multiple runs.}
\label{fig:prunThres vs K real}
\end{figure}


\section{Additional Example Model Details}
\subsection{\NLG}\label{app:NLG}
The mathematical expression of the nonlinear growth model is
$$
f_{n}^{(p)}(t)=a_n+\sum_{i=1}^L \beta_{ni} B_{i,p}(t)
$$
where the spline function can be built from a linear combination of a collection of B-splines $\{B_{i,p}(t)\}_{i=1}^L$ of degree $p-1$ with coefficients $\beta_{ni}$. Note that if we set $M=0$ and $p=2$, meaning that there are no internal knots and linear B-splines are used, the model reduces to the linear growth model.

Assume additive Gaussian noise for each trajectory at each time point, $\epsilon_{nt}\sim\normal(0,\tau_n)$, where $\tau=\frac{1}{\sigma^2}$ is the precision. Given the individual cluster assignments $\za_n$, $\zbeta_n$ and $\ztau_n$ for each facet and corresponding cluster parameters, we have the observable data distributed as
\begin{equation}\label{eqn:BMFNLG_obs_dist}
    \bfy_n\g \za_n=\ka,\zbeta_n=\kbeta,\ztau_n=\ktau\distas\normal_T(a_{\ka}+\bfbeta_{\kbeta}\calB(\bft), \tau_{\ktau}\bfI)
\end{equation}
where $\calB(\bft)\in\reals^{L\times T}$ is the basis spline matrix of order $p$ and $\tau_{\ktau}\bfI$ is the precision matrix with scalar precision parameter. The cluster numbers $\ka,\kbeta,\ktau$ can go to infinity.

We specify the base distributions for facet parameters as conjugate priors. That is,
\begin{equation}\label{eqn:NLG priors}
\begin{aligned}
    G^{(a)}_{0}&\sim \normal(\mua,\taua)\\
    G^{(\bfbeta)}_{0}&\sim \normal_L(\mubeta,\taubeta\bfI)\\
    G^{(\tau)}_{0}&\sim \gammarand(\lambdatau_1,\lambdatau_2)
\end{aligned}
\end{equation}
where $\taua$ and $\taubeta\bfI$ are precisions. Therefore, the corresponding variational distributions should be in the same distribution family as priors.

For incomplete data, we have $p(\bfy_n\g\bfz_n,\bfa,\bfbeta,\bftau)=p(\bfy_n^\obs\g\bfz_n,\bfa,\bfbeta,\bftau)p(\bfy_n^\mis\g\bfz_n,\bfa,\bfbeta,\bftau)$. Denote by $\bft^\obs_n$ the observed time points for each $n$, where $\deter{\bft^\obs_n}=T^\obs_n$; then $\bfy_n^\obs$ depends on $\bft^\obs_n$. So, the likelihood \eqref{eqn:BMFNLG_obs_dist} becomes distribution \eqref{eqn:BMFNLG_NA_obs_dist}. It is obvious that with the marginal distribution \eqref{eqn:BMFNLG_NA_obs_dist} of observed data, we can simply replace all $\bfy_n$, $\calB(\bft)$ and $T$ with $\bfy_n^\obs$, $\calB(\bft_n^\obs)$ and $T^\obs_n$ in the update rules.

\subsection{\VAR}\label{app:VAR}
We define the first-order vector autoregressive model for an individual time series $n$ as:
$$
\bfy_{nt} = \bfa_n + \Beta_n(\bfy_{n(t-1)}-\bfa_n) + \bfepsilon_{nt}
$$
where $\bfy_{nt}$ and $\bfy_{n(t-1)}\in\reals^{D}$ are $D$-dimensional vectors of time series values at time points $t$ and $t-1$, and $\bfa_n$ denotes the intercept term. $\Beta_n$ is a $D\times D$ matrix containing the regression coefficients where $\Beta_{n,ij}$ refers to the coefficient of $y_{n(t-1),j}$ in the linear function for $y_{nt,i}$. We assume $\bfepsilon_{nt}\sim\normal_D(0,\diag(\bftau_n))$ is the time-invariant noise term with diagonal precision matrix parametrised by $\bftau_n\in\reals_+^D$.

The distribution of individual time series is already stated in the main text and the joint likelihood of the entire time series is 
\begin{align*}
\begin{split}
    p(\bfy_n=[\bfy_{n0},\dots,\bfy_{n(T-1)}]\g \bfz_n, \bfa,\Beta,\bftau)=
    p(\bfy_{n0}\g\bfz_n, \bfa,\bftau)\prod_{t=1}^{T-1}p(\bfy_{nt}\g\bfy_{n(t-1)},\bfz_n, \bfa,\Beta,\bftau)
\end{split}
\end{align*}
where the individual time series matrix $\bfy_n\in\reals^{D\times T}$. This forms a Matrix normal distribution \citep{Dawid1981MatrixNormal}:
\begin{multline*}
\bfy_n\g \za_n=\ka,\zBeta_n=\kBeta,\ztau_n=\ktau \distas \matnormal_{D,T}\left(\bfa_{\ka}\transpose{\bfone}+\left[ 
\mathbf{0},\Beta_{\kBeta}([\bfy_{n0},\dots,\bfy_{n(T-2)}] -\bfa_{\ka}\transpose{\bfone})\right],\diag(\bftau_{\ktau}),\bfI_T\right).
\end{multline*}
The extension to varying length time series is straightforward by considering different time series lengths $T_n$ for each individual.

We specify the base distributions for facet parameters as conjugate priors. That is,
\begin{equation}\label{eqn:VAR priors}
\begin{aligned}
    G^{(\bfa)}_{0}&\distas \normal_D(\bfmua,\taua\bfI)\\
    G^{(\Beta)}_{0}&\distas \matnormal_{D,D}(\muBeta,\diag(\tauBeta),\bfI)\\
    G^{(\tau)}_{0}&\distas \gammarand(\lambdatau_1,\lambdatau_2) \text{ for } \tau_d,\; d=1,\dots,D
\end{aligned}
\end{equation}


\section{Additional Information on Variational Inference}
Variational inference focuses on minimizing the Kullback-Leibler (KL) divergence \citep{Kullback1951KLD} between a variational distribution, denoted as $q(\bfTheta)$, and the true posterior distribution $p(\bfTheta\g\bfY)$. Specifically, let $q_\bfnu(\bfTheta)$ be a family of distributions parameterized by variational parameters $\bfnu$. The objective is to minimize the KL divergence between $q_\bfnu(\bfTheta)$ and $p(\bfTheta\g\bfY)$, given by:
$$
D_\KL(q_\bfnu(\bfTheta)\gsep p(\bfTheta\g\bfY))=\E_q[\log q_\bfnu(\bfTheta)]-\E_q[\log p(\bfTheta,\bfY)]+\log p(\bfY).
$$
Since the last term, $\log p(\bfY)$, is constant with respect to the variational parameters, minimizing the KL divergence is equivalent to maximizing a lower bound on the log model evidence $\log p(\bfY)$, referred to as the evidence lower bound (ELBO). The generic ELBO for \MMM\ is:
\begin{equation}\label{eqn:MMM ELBO}
    \begin{split}
        \text{ELBO}&=\E_q[\log p(\bfTheta,\bfY)]-\E_q[\log q(\bfTheta)]\\
        &=\sum_{f=1}^F \sum_{k_f=1}^\infty \E_q[\log p(\bfthetadot_{k_f})]+\sum_{f=1}^F \E_q[\log p(\bfvdot\g\alphadot)]+\sum_{n=1}^N \sum_{f=1}^F \E_q[\log p(\zdot_n\g\bfvdot)]\\
        &\quad+\sum_{n=1}^N\E_q[\log p(\bfy_n\g\bfz_n,\{\bfthetadot_{k_f}\}_{f=1}^F)]-\E_q[\log q(\bfTheta)].
    \end{split}
\end{equation}
The cluster assignments for a sample across all facets are collectively encoded as $\bfz_n=(z^{(1)}_n,\dots,z^{(F)}_n)$. Under the stick-breaking construction, facet parameters are sampled as $\bfthetadot_{k_f}\sim \Gdot_0$, with stick lengths $\vdot_{k_f}\sim \betarand(1,\alphadot)$ determining probabilities $\pidot_{k_f}(\bfvdot)=\vdot_{k_f}\prod_{i=1}^{k_f-1}(1-\vdot_i)$. We also place a $\gammarand$ prior on $\alphadot$: $\alphadot\sim\gammarand(\sdot_1,\sdot_2)$. Thus, the generic variational distributions for $\MMM$ are:
 \begin{equation}\label{eqn:MMM varDist}
 \begin{aligned}
 \bfthetadot_k &\distas p(\bfthetadot_k\g{\bflambda^{(f)}_k}^\ast)\\
 \vdot_k &\distas \betarand({\alphadot_{k1}}^\ast,{\alphadot_{k2}}^\ast)\\
 \zdot_n &\distas \categorical({\bfpidot_n}^\ast)\\
 \alphadot& \distas \gammarand({\sdot_1}^\ast,{\sdot_2}^\ast).
 \end{aligned}
 \end{equation}

It can be shown that the generic update rules for variational parameters can be accomplished by computing the following equations (\autoref{app:derivation of VI}):
 \begin{equation}\label{eqn:MMM updateRule}
 \begin{aligned}
 {\bflambda^{(f)}_k}^\ast &= \E_q[g(\bfTheta_{-\bfthetadot_k},\bfY)]\\
     {\alphadot_{k1}}^\ast &= 1+\sum_{n=1}^N{\pidot_{nk}}^\ast;\;{\alphadot_{k2}}^\ast = \frac{{\sdot_1}^\ast}{{\sdot_2}^\ast} +\sum_{n=1}^N\sum_{i=k+1}^\truncbig{\pidot_{ni}}^\ast\\
     {\pidot_{nk}}^\ast &\propto \exp\left(\E_q[\log \vdot_k] + \sum_{i=1}^{k-1}\E_q[\log(1-\vdot_i)] + \Sdot_{nk} \right)\\
     {\sdot_1}^\ast &=\sdot_1+\truncbig-1;\;{\sdot_2}^\ast=\sdot_2-\sum_{k=1}^{\truncbig-1}\E_q[\log(1-\vdot_k)]
 \end{aligned}
 \end{equation}
 where $g(\bfTheta_{-\bfthetadot_k},\bfY)$ are the parameters of the distribution for $\bfthetadot_k$ when conditioning on the remaining latent variables and the observations i.e. $p(\bfthetadot_k\g \bfTheta_{-\bfthetadot_k},\bfY)$. The update for $\vdot_k$ is independent of model specification while $\Sdot_{nk}$ depends on the likelihood and different facets. Iterating these update rules optimizes ELBO in \autoref{eqn:MMM ELBO} with respect to the variational parameters defined in \autoref{eqn:MMM varDist}. The algorithm converges when the change in ELBO falls below a predefined threshold. 


\subsection{Derivation of Update Rules for Variational Parameters in \NLG}\label{app:derivation of VI}
We take the \NLG\ model as an example to give the full derivation steps. The likelihood function of $\bfy_n\g\bfz_n,\bfa,\bfbeta,\bftau$ is:
$$
    \bfy_n\g \za_n=\ka,\zbeta_n=\kbeta,\ztau_n=\ktau\distas\normal_T(a_{\ka}+\bfbeta_{\kbeta}\calB(\bft), \tau_{\ktau}\bfI)\in\reals^{1\times T}
$$
where $\calB(\bft)\in\reals^{L\times T}$ is the basis spline matrix of order $p$ and $\tau_{\ktau}\bfI$ is the precision matrix with scalar precision parameter.

Conjugate priors and base distributions on latent variables $\bfTheta$ where we consider assigning priors to the concentration parameter in the beta distribution:
\begin{align*}
 a_{\ka}&\distas\normal(\mua,\taua)\text{ for }\ka=1,\dots,\infty\text{ and scalar precision parameter }\taua\\
    \bfbeta_{\kbeta}&\distas\normal_L(\mubeta,\taubeta\bfI)\text{ for }\kbeta=1,\dots,\infty\text{ and scalar precision parameter }\taubeta\\
    \tau_{\ktau}&\distas\gammarand(\lambdatau_1,\lambdatau_2) \text{ for }\ktau=1,\dots,\infty\\
    \za_n&\distas\categorical(\bfpia(\bfva))\\
    \zbeta_n&\distas\categorical(\bfpibeta(\bfvbeta))\\
    \ztau_n&\distas\categorical(\bfpitau(\bfvtau))\\
    \va_{\ka}&\distas\betarand(1,\alphaa) \text{ for }\ka=1,\dots,\infty\\
    \vbeta_{\kbeta}&\distas\betarand(1,\alphabeta) \text{ for }\kbeta=1,\dots,\infty\\
    \vtau_{\ktau}&\distas\betarand(1,\alphatau) \text{ for }\ktau=1,\dots,\infty\\
    \alphaa&\distas\gammarand(\sa_1,\sa_2)\\
    \alphabeta&\distas\gammarand(\sbeta_1,\sbeta_2)\\
    \alphatau&\distas\gammarand(\stau_1,\stau_2)
\end{align*}

Assume the variational distribution (i.e. approximate posterior distribution) with truncation level $\truncbig$ by considering truncated stick-breaking representations:
\begin{align*}
q(a_k) &\distas \normal({\mua_k}^\ast,{\taua_k}^\ast)\\
q(\bfbeta_k) &\distas \normal_L({\mubeta_k}^\ast,{\Lambdabeta_k}^\ast)\\
q(\tau_k) &\distas \gammarand({\lambdatau_{k1}}^\ast,{\lambdatau_{k2}}^\ast)\\
q(\zdot_n) &\distas \categorical({\bfpidot_n}^\ast)\text{ for facets } a, \bfbeta,\tau\\
q(\vdot_k) &\distas \betarand({\bfalphadot_k}^\ast)\text{ for facets } a, \bfbeta,\tau\\
q(\alphadot) &\distas \gammarand({\bfsdot}^\ast)\text{ for facets } a, \bfbeta,\tau
\end{align*}
Therefore, the equation for the joint factorized variational distribution is as follows:
\begin{align*}
q(\bfTheta)&=\prod_{k=1}^\truncbig\{q(a_k\g{\mua_k}^\ast,{\taua_k}^\ast)q(\bfbeta_k\g{\mubeta_k}^\ast,{\Lambdabeta_k}^\ast)q(\tau_k\g{\bflambdatau_k}^\ast)\}\\
&\times\prod_{k=1}^{\truncbig-1}\{q(\va_k\g{\bfalphaa_k}^\ast)q(\vbeta_k\g{\bfalphabeta_k}^\ast)q(\vtau_k\g{\bfalphatau_k}^\ast)\}\\
&\times\prod_{n=1}^N\{q(\za_n\g{\bfpia_n}^\ast)q(\zbeta_n\g{\bfpibeta_n}^\ast)q(\ztau_n\g{\bfpitau_n}^\ast)\}\\
&\times q(\alphaa\g{\bfsa}^\ast)q(\alphabeta\g{\bfsbeta}^\ast)q(\alphatau\g{\bfstau}^\ast)
\end{align*}

We then derive the true conditional posterior distributions for each parameter and the corresponding update rules for variational parameters. First, the joint probability of $\bfTheta$ and $\bfY$ is as follows:
\begin{align*}
    p(\bfTheta,\bfY)&=\prod_{n=1}^N\left\{p(\bfy_n\g\bfz_n,\bfa,\bfbeta,\bftau)p(\za_n\g\bfpia(\bfva))p(\zbeta_n\g\bfpibeta(\bfvbeta))p(\ztau_n\g\bfpitau(\bfvtau))\right\}\\
    &\times \prod_{k=1}^\infty \{p(\va_k\g 1,\alphaa)p(\vbeta_k\g 1,\alphabeta)p(\vtau_k\g 1,\alphatau)\}\\
    &\times\prod_{k=1}^\infty \{p(a_k\g\mua,\taua)p(\bfbeta_k\g\mubeta,\taubeta\bfI)p(\tau_k\g\lambdatau_1,\lambdatau_2)\}\\
    &\times p(\alphaa\g\sa_1,\sa_2)p(\alphabeta\g\sbeta_1,\sbeta_2)p(\alphatau\g\stau_1,\stau_2)
\end{align*}

The ELBO is expressed as:
\begin{equation}\label{eqn:BNPMF-NLG ELBO}
    \begin{aligned}
        \text{ELBO}_{NLG}&=\E_q[\log p(\bfa)]+\E_q[\log p(\bfbeta)]+\E_q[\log p(\bftau)]\\
        &+\E_q[\log p(\bfva\g\alphaa)]+\E_q[\log p(\bfvbeta\g\alphabeta)]+\E_q[\log p(\bfvtau\g\alphatau)]\\
        &+\sum_{n=1}^N(\E_q[\log p(\za_n\g\bfva)]+\E_q[\log p(\zbeta_n\g\bfvbeta)]+\E_q[\log p(\ztau_n\g\bfvtau)])\\
        &+\E_q[\log p(\alphaa\g\bfsa)]+\E_q[\log p(\alphabeta\g\bfsbeta)]+\E_q[\log p(\alphatau\g\bfstau)]\\
        &+\sum_{n=1}^N\E_q[\log p(\bfy_n\g\bfz_n, \bfa, \bfbeta,\bftau)]-\E_q[\log q(\bfTheta)].
    \end{aligned}
\end{equation}

The terms in the third row using indicator random variables $z_n$ in \autoref{eqn:BNPMF-NLG ELBO} can be rewritten as:
\begin{align*}
\E_q[\log p(z_n\g\bfv)]&=\E_q\left[\log\left(\prod_{k=1}^\infty(1-v_k)^{\bfone[z_n>k]}{v_k}^{\bfone[z_n=k]}\right)\right]\\
&=\sum_{k=1}^\infty \left\{q(z_n>k)\E_q[\log(1-v_k)]+q(z_n=k)\E_q[\log v_k]\right\}\\
&= \sum_{k=1}^\truncbig \left\{q(z_n>k)\E_q[\log(1-v_k)]+q(z_n=k)\E_q[\log v_k]\right\}.
\end{align*}
Recall that $\E_q[\log v_{\truncbig}] = 0$ and $q(z_n>\truncbig) = 0$ under the truncated stick-breaking representation, and we know:
\begin{align*}
    q(z_n=k)&={\pi}^\ast_{nk}\\
    % log-odds of each category relative to the k-th category (the reference category)
    q(z_n>k)&=\sum_{i=k+1}^\truncbig{\pi}^\ast_{ni}\\
    \E_q[\log v_k]&=\bfPsi({\alpha}^\ast_{k1})-\bfPsi({\alpha}^\ast_{k1}+{\alpha}^\ast_{k2})\\ 
    % \gamma_1=a-1,\gamma_2=b-1
    \E_q[\log(1-v_k)]&=\bfPsi({\alpha}^\ast_{k2})-\bfPsi({\alpha}^\ast_{k1}+{\alpha}^\ast_{k2})
\end{align*}
where the digamma function, denoted by $\bfPsi$, arises from the derivative of the log normalization factor in the beta distribution. Note that this generic derivation does not rely on a particular model.

\subsubsection{Parameters that Do Not Depend on Particular Model}\label{app:update not dependent}
\textbf{For} $\alphadot$ \textbf{of the Beta distribution}:
\begin{align*}
    p(\alphadot\g\bfTheta_{-\alphadot},\bfY)&\propto p(\alphadot\g\sdot_1,\sdot_2)\prod_{k=1}^\infty p(\vdot_k\g 1,\alphadot)\\
    &\propto {\alphadot}^{\sdot_1-1}\exp\{-\sdot_2\alphadot\} \prod_{k=1}^\infty \alphadot(1-\vdot_k)^{\alphadot-1}\\
    &\propto {\alphadot}^{\sdot_1-1}{\alphadot}^{\max(k)} \exp\{-\sdot_2\alphadot\} \prod_{k=1}^\infty \exp\{(\alphadot-1)\log(1-\vdot_k)\}\\
    &\propto {\alphadot}^{\sdot_1+\max(k)-1}\exp\{-\sdot_2\alphadot\}\exp\left\{(\alphadot-1)\sum_{k=1}^\infty \log(1-\vdot_k)\right\}\\
    &\propto {\alphadot}^{\sdot_1+\max(k)-1}\exp\left\{-\left(\sdot_2- \sum_{k=1}^\infty \log(1-\vdot_k)\right)\alphadot\right\}
\end{align*}
Thus, ${\sdot_1}^\ast=\sdot_1+\truncbig-1$ and ${\sdot_2}^\ast=\sdot_2- \sum_{k=1}^{\truncbig-1}\E_q[\log(1-\vdot_k)]$.

\textbf{The true conditional distribution for} $\vdot_k$ \textbf{is}:
\begin{align*}
    p(\vdot_k\g\bfTheta_{-\vdot_k},\bfY)&\propto p(\vdot_k\g 1,\alphadot)\prod_{n=1}^N p(\zdot_n\g\bfpidot(\bfvdot))\\
    &\propto \exp\left\{(\alphadot-1)\log(1-\vdot_k) + \sum_{n=1}^N \log\left(\prod_{k=1}^\infty(1-\vdot_k)^{\bfone[\zdot_n>k]}{\vdot_k}^{\bfone[\zdot_n=k]}\right)\right\}\\
    &\propto \exp\left\{(\alphadot-1)\log(1-\vdot_k) + \sum_{n=1}^N \{\bfone[\zdot_n>k]\log(1-\vdot_k) + \bfone[\zdot_n=k]\log \vdot_k\}\right\}\\
    &\propto \exp\left\{\sum_{n=1}^N\bfone[\zdot_n=k]\log \vdot_k + (\alphadot+\sum_{n=1}^N\bfone[\zdot_n>k]-1)\log(1-\vdot_k)\right\}
\end{align*}
Thus, ${\alphadot_{k1}}^\ast=1+\sum_{n=1}^N {\pidot_{nk}}^\ast$ and ${\alphadot_{k2}}^\ast=\frac{{\sdot_1}^\ast}{{\sdot_2}^\ast}+\sum_{n=1}^N \sum_{i=k+1}^\truncbig{\pidot_{ni}}^\ast$.

\subsubsection{Facets Parameters Specific for \NLG}
\textbf{For intercept} $a_k$:
\begin{align*}
    p(a_k\g\bfTheta_{-a_k},\bfY)&\propto p(a_k\g\mua,\taua)\prod_{n=1}^N p(\bfy_n\g\bfz_n,\bfa,\bfbeta,\bftau)^{\bfone[\za_n=k]}\\
    &\propto \exp\left(\frac{\taua(a_k-\mua)^2}{-2}\right)\\
    &\times \prod_{n=1}^N \left\{\exp\left(\frac{\tau_{\ztau_n}(a_{\za_n}-\bfm_{\zbeta_n})\transpose{(a_{\za_n}-\bfm_{\zbeta_n})}}{-2} \right)\right\}^{\bfone[\za_n=k]}\\
    &\quad\text{where }\bfm_{\zbeta_n}=\bfy_n-\bfbeta_{\zbeta_n}\calB\in\reals^{1\times T}\\
    &\propto \exp\left(\frac{\taua(a_k-\mua)^2+\sum_{n=1}^N \{\bfone[\za_n=k]\tau_{\ztau_n}(Ta_{\za_n}^2-2a_{\za_n}\sum\bfm_{\zbeta_n}+\norm{\bfm_{\zbeta_n}}^2)\}}{-2}\right)\\
    &\propto \exp\left(\frac{(\taua+T\sum_{n=1}^N \{\bfone[\za_n=k]\tau_{\ztau_n}\})a_k^2 -2(\taua\mua+\sum_{n=1}^N \{\bfone[\za_n=k]\tau_{\ztau_n}\sum\bfm_{\zbeta_n}\})a_k}{-2}\right)
\end{align*}
\begin{align*}
    q(a_k) &\propto \exp\{\E_{q(\bfTheta_{-a_k})}[\log p(a_k\g\bfTheta_{-a_k},\bfY)]\} \\
    &\propto \exp\left\{\frac{(\taua+T \E_q\left[\sum_{n=1}^N \{\bfone[\za_n=k]\tau_{\ztau_n}\}\right])a_k^2 - 2(\taua\mua+ \E_q\left[\sum_{n=1}^N \{\bfone[\za_n=k]\tau_{\ztau_n}\sum\bfm_{\zbeta_n}\}\right])a_k}{-2} \right\}
\end{align*}
where we have $\E_q[\tau_{\ztau_n}]=\sum_{j=1}^\truncbig q(\ztau_n=j)\E_q[\tau_j] = \sum_{j=1}^\truncbig {\pitau_{nj}}^\ast\frac{{\lambdatau_{j1}}^\ast}{{\lambdatau_{j2}}^\ast}$ and $\E_q[\bfbeta_{\zbeta_n}] = \sum_{j=1}^\truncbig {\pibeta_{nj}}^\ast{\mubeta_j}^\ast$.
Thus, 
$$
{\taua_k}^\ast=\taua+T\sum_{n=1}^N \left\{{\pia_{nk}}^\ast\sum_{j=1}^\truncbig \left({\pitau_{nj}}^\ast\frac{{\lambdatau_{j1}}^\ast}{{\lambdatau_{j2}}^\ast}\right) \right\}
$$ and 
$$
{\mua_k}^\ast=\frac{\taua\mua+\sum_{n=1}^N \left\{{\pia_{nk}}^\ast \left(\sum_{j=1}^\truncbig {\pitau_{nj}}^\ast\frac{{\lambdatau_{j1}}^\ast}{{\lambdatau_{j2}}^\ast}\right) \left(\sum_T\bfy_n-\sum_T \left[\left(\sum_{j=1}^\truncbig {\pibeta_{nj}}^\ast{\mubeta_j}^\ast\right)\calB\right] \right) \right\}}{{\taua_k}^\ast}
$$ 


\textbf{For coefficient row vector} $\bfbeta_k$:
\begin{align*}
    p(\bfbeta_k\g\bfTheta_{-\bfbeta_k},\bfY)&\propto 
    p(\bfbeta_k\g\mubeta,\taubeta\bfI)\prod_{n=1}^N p(\bfy_n\g\bfz_n,\bfa,\bfbeta,\bftau)^{\bfone[\zbeta_n=k]}\\
    &\propto \exp\left(\frac{\taubeta(\bfbeta_k-\mubeta)\transpose{(\bfbeta_k-\mubeta)}}{-2}\right)\\
    &\times \prod_{n=1}^N \left\{\exp\left(\frac{\tau_{\ztau_n}(\bfbeta_k\calB+a_{\za_n}-\bfy_n)\transpose{(\bfbeta_k\calB+a_{\za_n}-\bfy_n)}}{-2} \right)\right\}^{\bfone[\zbeta_n=k]}\\
    &\propto \exp\left(\frac{\taubeta(\bfbeta_k\transpose{\bfbeta_k}-2\bfbeta_k\transpose{{\mubeta}})+\sum_{n=1}^N \{\bfone[\zbeta_n=k]\tau_{\ztau_n}(\bfbeta_k\calB-\bfm_{\za_n})\transpose{(\bfbeta_k\calB-\bfm_{\za_n})}\}}{-2}\right)\\
    &\quad\text{where }\bfm_{\za_n}=\bfy_n-a_{\za_n}\in\reals^{1\times T}\\
    &\propto \exp\left(\frac{1}{-2}\left[\bfbeta_k(\taubeta\bfI+\sum_{n=1}^N \{\bfone[\zbeta_n=k]\tau_{\ztau_n}\calB\transpose{\calB}\})\transpose{\bfbeta_k}\right.\right. \\
    &\quad\quad- \left.\left. 2\bfbeta_k(\taubeta\transpose{{\mubeta}}+ \sum_{n=1}^N \{\bfone[\zbeta_n=k]\tau_{\ztau_n}\calB\transpose{\bfm_{\za_n}}\}) \right]\right)
\end{align*}
Thus,
$$
{\Lambdabeta}^\ast_k=\taubeta\bfI+\sum_{n=1}^N \left\{{\pibeta_{nk}}^\ast \left(\sum_{j=1}^\truncbig {\pitau_{nj}}^\ast\frac{{\lambdatau_{j1}}^\ast}{{\lambdatau_{j2}}^\ast}\right) \calB\transpose{\calB}\right\}\in\reals^{L\times L}
$$
and
$$
{\mubeta_k}^\ast=\left\{\taubeta\mubeta+\sum_{n=1}^N \left\{{\pibeta_{nk}}^\ast \left(\sum_{j=1}^\truncbig {\pitau_{nj}}^\ast\frac{{\lambdatau_{j1}}^\ast}{{\lambdatau_{j2}}^\ast}\right) \left(\bfy_n- \left(\sum_{j=1}^\truncbig {\pia_{nj}}^\ast{\mua_j }^\ast\right)\right)\transpose{\calB}\right\}\right\}\inv{({\Lambdabeta_k}^\ast)}\in\reals^{1\times L}
$$
where $\E_q[a_{\za_n}] = \sum_{j=1}^\truncbig {\pia_{nj}}^\ast{\mua_j}^\ast$.


\textbf{For precision scalar} $\tau_k$:
\begin{align*}
    p(\tau_k\g\bfTheta_{-\tau_k},\bfY)&\propto p(\tau_k\g\lambdatau_1,\lambdatau_2)\prod_{n=1}^N p(\bfy_n\g\bfz_n,\bfa,\bfbeta,\bftau)^{\bfone[\ztau_n=k]}\\
    &\propto \tau_k^{\lambdatau_1-1}\exp\{-\lambdatau_2\tau_k\}\\
    &\times \tau_k^{\frac{T}{2}\sum_{n=1}^N\bfone[\ztau_n=k]}\prod_{n=1}^N \left\{\exp\left(\frac{\tau_{\ztau_n}(\bfy_n-\bfmu_{\za_n,\zbeta_n})\transpose{(\bfy_n-\bfmu_{\za_n,\zbeta_n})}}{-2} \right)\right\}^{\bfone[\ztau_n=k]}\\
    &\quad\text{where }\bfmu_{\za_n,\zbeta_n}=a_{\za_n}+\bfbeta_{\zbeta_n}\calB\in\reals^{1\times T}\\
    &\propto \tau_k^{\lambdatau_1+\frac{T}{2}\sum_{n=1}^N\bfone[\ztau_n=k]-1}\\
    &\times \exp\left( -\tau_k\left\{ \lambdatau_2+\frac{\sum_{n=1}^N\{\bfone[\ztau_n=k](\bfy_n-\bfmu_{\za_n,\zbeta_n})\transpose{(\bfy_n-\bfmu_{\za_n,\zbeta_n})}\}}{2}\right\} \right)
\end{align*}
Thus,
$$
{\lambdatau_{k1}}^\ast=\lambdatau_1+\frac{T}{2}\sum_{n=1}^N{\pitau_{nk}}^\ast
$$ and
\begin{align*}
{\lambdatau_{k2}}^\ast&=\lambdatau_2+\frac{1}{2}\sum_{n=1}^N \left\{{\pitau_{nk}}^\ast \E_q\left[\norm{\bfy_n-\bfbeta_{\zbeta_n} \calB- a_{\za_n}}^2\right]  \right\}
\end{align*}
%where
%\begin{align*}
%    \E_q\left[\norm{\bfy_n -\bfbeta_{\zbeta_n}\calB - a_{\za_n}}^2\right] &= \norm{\bfy_n}^2 - 2\E_q[\bfbeta_{\zbeta_n}]\calB\transpose{\bfy_n} + \E_q[\bfbeta_{\zbeta_n}]\calB\transpose{\calB}\transpose{\E_q[\bfbeta_{\zbeta_n}]} + \tr(\calB\transpose{\calB}\textit{Cov}(\bfbeta_{\zbeta_n})) \\
%    &-2\E_q[a_{\za_n}]\sum_T\bfy_n +2\E_q[a_{\za_n}]\sum_T\left[\E_q[\bfbeta_{\zbeta_n}]\calB\right] + \E_q[a_{\za_n}]^2 + \textit{Var}(a_{\za_n})\\
%    &= \norm{\bfy_n -\E_q[\bfbeta_{\zbeta_n}]\calB - \E_q[a_{\za_n}]}^2 + \tr(\calB\transpose{\calB}\textit{Cov}(\bfbeta_{\zbeta_n})) + \textit{Var}(a_{\za_n})
%\end{align*}
%and $\textit{Cov}(\bfbeta_{\zbeta_n})=\sum_{j=1}^\truncbig{\pibeta_{nj}}^\ast\inv{({\Lambdabeta_j}^\ast)}+ \sum_{j=1}^\truncbig{\pibeta_{nj}}^\ast ({\mubeta_j}^\ast-\E_q[\bfbeta_{\zbeta_n}])\transpose{({\mubeta_j}^\ast-\E_q[\bfbeta_{\zbeta_n}])}$ 

\subsubsection{Parameters for Cluster Assignments}
\textbf{For} $\zdot_n$ \textbf{of any facet}:
\begin{align*}
    p(\zdot_n=k\g \bfTheta_{-\zdot_n},\bfY)&\propto p(\zdot_n=k\g\bfpidot(\bfvdot))p(\bfy_n\g \bfz_n,\{\bfthetadot_{k_f}\}_{f=1}^F)
\end{align*}
Hence,
\begin{align*}
    {\pidot_{nk}}^\ast\propto \exp\left(\E_q[\log \vdot_k] + \sum_{i=1}^{k-1}\E_q[\log(1-\vdot_i)] + \Sdot_{nk} \right)
\end{align*}
where $\Sdot_{nk}$ is a facet-specific term determined by the likelihood (given below for each facet), and
\begin{align*}
\E_q[\log \vdot_k]&=\bfPsi({\alphadot_{k1}}^\ast)-\bfPsi({\alphadot_{k1}}^\ast+{\alphadot_{k2}}^\ast)\\ 
    \E_q[\log(1-\vdot_k)]&=\bfPsi({\alphadot_{k2}}^\ast)-\bfPsi({\alphadot_{k1}}^\ast+{\alphadot_{k2}}^\ast)
\end{align*} with the digamma function denoted by $\bfPsi$.

\textbf{For} $\za_n$ \textbf{of intercept}:
\begin{align*}
    p(\za_n=k\g \bfTheta_{-\za_n},\bfY)&\propto p(\za_n=k\g\bfpia(\bfva))p(\bfy_n\g\bfz_n,\bfa,\bfbeta,\bftau)\\
    &\propto
    \va_k\prod_{i=1}^{k-1}(1-\va_i)\exp\left(\frac{\tau_{\ztau_n}(\bfy_n-a_k-\bfbeta_{\zbeta_n}\calB)\transpose{(\bfy_n-a_k-\bfbeta_{\zbeta_n}\calB)}}{-2} \right)\\
    &\propto \exp\left(\log \va_k+\sum_{i=1}^{k-1}\log(1-\va_i) + \frac{-1}{2}\tau_{\ztau_n} (\bfy_n-a_k-\bfbeta_{\zbeta_n}\calB)\transpose{(\bfy_n-a_k-\bfbeta_{\zbeta_n}\calB)} \right)
\end{align*}
Thus,
$$
{\pia_{nk}}^\ast\propto \exp\left(\E_q[\log \va_k] + \sum_{i=1}^{k-1}\E_q[\log(1-\va_i)] + \Sa_{nk} \right)
$$
where 
$$
\Sa_{nk}=\frac{-1}{2}\left(\sum_{j=1}^\truncbig {\pitau_{nj}}^\ast\frac{{\lambdatau_{j1}}^\ast}{{\lambdatau_{j2}}^\ast}\right) \norm{\bfy_n- {\mua_k}^\ast- \left(\sum_{j=1}^\truncbig{\pibeta_{nj}}^\ast{\mubeta_j}^\ast\right) \calB}^2
$$

\textbf{We can obtain a similar result for} ${\pibeta_{nk}}^\ast\propto \exp\left(\E_q[\log \vbeta_k] + \sum_{i=1}^{k-1}\E_q[\log(1-\vbeta_i)] + \Sbeta_{nk} \right)$ where
$$
\Sbeta_{nk}=\frac{-1}{2}\left(\sum_{j=1}^\truncbig {\pitau_{nj}}^\ast\frac{{\lambdatau_{j1}}^\ast}{{\lambdatau_{j2}}^\ast}\right) \norm{\bfy_n- \left(\sum_{j=1}^\truncbig {\pia_{nj}}^\ast{\mua_j}^\ast\right) - {\mubeta_k}^\ast\calB}^2
$$


\textbf{For} $\ztau_n$ \textbf{of noise}:
\begin{align*}
    p(\ztau_n=k\g \bfTheta_{-\ztau_n},\bfY)&\propto p(\ztau_n=k\g\bfpitau(\bfvtau))p(\bfy_n\g\bfz_n,\bfa,\bfbeta,\bftau)\\
    &\propto
    \vtau_k\prod_{i=1}^{k-1}(1-\vtau_i)\tau_k^{\frac{T}{2}}\exp\left(\frac{\tau_k(\bfy_n-a_{\za_n}-\bfbeta_{\zbeta_n}\calB)\transpose{(\bfy_n-a_{\za_n}-\bfbeta_{\zbeta_n}\calB)}}{-2} \right)\\
    &\propto \exp\left(\log \vtau_k+\sum_{i=1}^{k-1}\log(1-\vtau_i) + \frac{T}{2}\log\tau_k \right.\\
    &\quad\quad\left. + \frac{-1}{2}\tau_k (\bfy_n-a_{\za_n}-\bfbeta_{\zbeta_n}\calB)\transpose{(\bfy_n-a_{\za_n}-\bfbeta_{\zbeta_n}\calB)} \right)
\end{align*}
Thus,
$$
{\pitau_{nk}}^\ast\propto \exp\left(\E_q[\log \vtau_k] + \sum_{i=1}^{k-1}\E_q[\log(1-\vtau_i)] + \Stau_{nk} \right)
$$
where 
\begin{align*}
\Stau_{nk} &= \frac{T}{2}\log\frac{{\lambdatau_{k1}}^\ast}{{\lambdatau_{k2}}^\ast} + \frac{-1}{2}\frac{{\lambdatau_{k1}}^\ast}{{\lambdatau_{k2}}^\ast} \norm{\bfy_n- \left(\sum_{j=1}^\truncbig {\pia_{nj}}^\ast{\mua_j}^\ast\right) - \left(\sum_{j=1}^\truncbig{\pibeta_{nj}}^\ast{\mubeta_j}^\ast\right) \calB}^2
\end{align*}

\subsubsection{ELBO Computation}
For ELBO in \autoref{eqn:BNPMF-NLG ELBO}:
\begin{align*}
&\E_q[\log p(\bfy_n\g\bfz_n,\bfa,\bfbeta,\bftau)]=-\frac{T}{2}\log2\pi+\frac{T}{2}\E_q[\log\tau_{\ztau_n}]-\frac{1}{2}\E_q[\tau_{\ztau_n}]\left\{\bfy_n\transpose{\bfy_n} - 2\sum_T(\E_q[a_{\za_n}]\bfy_n) -2\E_q[\bfbeta_{\zbeta_n}]\calB\transpose{\bfy_n} \right.\\
    &+ \left. T\E_q[a_{\za_n}^2] + 2\sum_T(\E_q[a_{\za_n}]\E_q[\bfbeta_{\zbeta_n}]\calB) + \E_q[\bfbeta_{\zbeta_n}]\calB\transpose{\calB}\E_q[\transpose{\bfbeta_{\zbeta_n}}] \right\}\\
    &\E_q[\log p(a_k\g\mua,\taua)]=-\log\sqrt{2\pi}+\frac{1}{2}\log\taua-\frac{1}{2}\taua(\frac{1}{{\taua_k}^\ast}+{\mua_k}^{\ast 2}-2\mua{\mua_k}^\ast+{\mua}^2)\\
    &\E_q[\log q(a_k\g{\mua_k}^\ast,{\taua_k}^\ast)]=-\log\sqrt{2\pi}+\frac{1}{2}\log{\taua_k}^\ast-\frac{1}{2}{\taua_k}^\ast(\frac{1}{{\taua_k}^\ast}+{\mua_k}^{\ast 2}-2{\mua_k}^\ast{\mua_k}^\ast+{\mua_k}^{\ast 2})\\
    &\hspace{+3.8cm}= -\log\sqrt{2\pi}+\frac{1}{2}\log{\taua_k}^\ast-\frac{1}{2}\\
    &\E_q[\log p(\bfbeta_{k}\g\mubeta,\taubeta)]= -L\log\sqrt{2\pi}+\frac{L}{2}\log\taubeta-\frac{1}{2}\taubeta(\tr(\inv{({\Lambdabeta_k}^\ast)})+\norm{{\mubeta_k}^\ast}^2-2\transpose{{\mubeta}}{\mubeta_k}^\ast+\norm{{\mubeta}}^2)\\
    &\E_q[\log q(\bfbeta_{k}\g{\mubeta_k}^\ast,{\Lambdabeta_k}^\ast)]= -L\log\sqrt{2\pi}+\frac{1}{2}\log\deter{{\Lambdabeta_k}^\ast}-\frac{L}{2}\\
    &\E_q[\log p(\tau_k\g\bflambdatau)]=\lambdatau_1\log\lambdatau_2-\log\Gamma(\lambdatau_1)+(\lambdatau_1-1)\left\{\bfPsi({\lambdatau_{k1}}^\ast)-\log{\lambdatau_{k2}}^\ast \right\} -\lambdatau_2\frac{{\lambdatau_{k1}}^\ast}{{\lambdatau_{k2}}^\ast}\\
    &\E_q[\log q(\tau_k\g{\bflambdatau_k}^\ast)]={\lambdatau_{k1}}^\ast\log{\lambdatau_{k2}}^\ast-\log\Gamma({\lambdatau_{k1}}^\ast)+({\lambdatau_{k1}}^\ast-1)\left\{\bfPsi({\lambdatau_{k1}}^\ast)-\log{\lambdatau_{k2}}^\ast \right\} -{\lambdatau_{k1}}^\ast\\
    &\E_q[\log p(\vdot_k\g\alphadot)]=\left(\frac{{\sdot_1}^\ast}{{\sdot_2}^\ast}-1\right)\{\bfPsi({\alphadot_{k2}}^\ast)-\bfPsi({\alphadot_{k1}}^\ast+{\alphadot_{k2}}^\ast)\} +\{\bfPsi({\sdot_1}^\ast)-\log{\sdot_2}^\ast\} \\
    &\E_q[\log q(\vdot_k\g{\bfalphadot_k}^\ast)]= ({\alphadot_{k1}}^\ast-1)\{\bfPsi({\alphadot_{k1}}^\ast)-\bfPsi({\alphadot_{k1}}^\ast+{\alphadot_{k2}}^\ast)\}+ ({\alphadot_{k2}}^\ast-1)\{\bfPsi({\alphadot_{k2}}^\ast)-\bfPsi({\alphadot_{k1}}^\ast+{\alphadot_{k2}}^\ast)\} \\ 
    &\hspace{+3.5cm}-\log\bfB({\alphadot_{k1}}^\ast,{\alphadot_{k2}}^\ast) \\
    &\E_q[\log p(\zdot_n\g\bfvdot)] =\sum_{k=1}^\truncbig\left\{\left(\sum_{i=k+1}^\truncbig{\pidot_{ni}}^\ast\right)\{\bfPsi({\alphadot_{k2}}^\ast)-\bfPsi({\alphadot_{k1}}^\ast+{\alphadot_{k2}}^\ast)\} +{\pidot_{nk}}^\ast\{\bfPsi({\alphadot_{k1}}^\ast)-\bfPsi({\alphadot_{k1}}^\ast+{\alphadot_{k2}}^\ast)\}\right\}\\
    &\E_q[\log q(\zdot_n\g{\bfpidot_n}^\ast)]=\sum_{i=1}^\truncbig{\pidot_{ni}}^\ast\log{\pidot_{ni}}^\ast\\
    &\E_q[\log p(\alphadot\g\bfsa)]=\sdot_1\log\sdot_2-\log\Gamma(\sdot_1)+(\sdot_1-1)\{\bfPsi({\sdot_1}^\ast)-\log{\sdot_2}^\ast\}-\sdot_2\frac{{\sdot_1}^\ast}{{\sdot_2}^\ast}\\
    &\E_q[\log q(\alphadot\g{\bfsa}^\ast)]={\sdot_1}^\ast\log{\sdot_2}^\ast-\log\Gamma({\sdot_1}^\ast)+({\sdot_1}^\ast-1)\{\bfPsi({\sdot_1}^\ast)-\log{\sdot_2}^\ast\}-{\sdot_1}^\ast
\end{align*}
\newline


\subsection{Derivation of Update Rules for Variational Parameters in \VAR}\label{app:derivation of VAR VI}

\subsubsection{Facets Parameters Specific for \VAR}
\textbf{For intercept vector} $\bfa_k$:
\begin{equation*}
\begin{split}
    p(\bfa_k\g\bfTheta_{-\bfa_k},\bfY)&\propto p(\bfa_k\g\bfmua,\taua\bfI)\prod_{n=1}^N p(\bfy_n\g\bfz_n, \bfa,\Beta,\bftau)^{\bfone[\za_n=k]}\\
  &\propto \exp\left(\frac{\taua\transpose{(\bfa_k-\bfmua)}(\bfa_k-\bfmua)}{-2}\right) \prod_{n=1}^N\left\{\exp\left(\frac{\transpose{(\bfy_{n0}-\bfa_k)}\diag(\bftau_{\ztau_n})(\bfy_{n0}-\bfa_k)}{-2}\right)\right\}^{\bfone[\za_n=k]}\\
  &\hspace*{-2.3cm}\times \prod_{n=1}^N \prod_{t=1}^{T-1} \left\{\exp\left(\frac{\transpose{(\bfy_{nt}-\bfa_k-\Beta_{\zBeta_n}\bfy_{n(t-1)} + \Beta_{\zBeta_n}\bfa_k)}\diag(\bftau_{\ztau_n})(\bfy_{nt}-\bfa_k-\Beta_{\zBeta_n}\bfy_{n(t-1)} + \Beta_{\zBeta_n}\bfa_k)}{-2}\right)\right\}^{\bfone[\za_n=k]}\\
  &\propto \exp\left(\frac{\taua\transpose{(\bfa_k-\bfmua)}(\bfa_k-\bfmua) + \sum_{n=1}^N \{\bfone[\za_n=k]\transpose{(\bfy_{n0}-\bfa_k)}\diag(\bftau_{\ztau_n})(\bfy_{n0}-\bfa_k) \}}{-2}\right)\\
  &\hspace*{-3cm}\times \exp\left(\frac{\sum_{n=1}^N \bfone[\za_n=k]\sum_{t=1}^{T-1}\{\transpose{(\bfy_{nt}-\bfa_k-\Beta_{\zBeta_n}\bfy_{n(t-1)} + \Beta_{\zBeta_n}\bfa_k)}\diag(\bftau_{\ztau_n})(\bfy_{nt}-\bfa_k-\Beta_{\zBeta_n}\bfy_{n(t-1)} + \Beta_{\zBeta_n}\bfa_k)\}}{-2}\right)\\
  &\propto \exp\left(\frac{1}{-2}\left[ \transpose{\bfa_k}\left\{\taua\bfI+T\sum_{n=1}^N\bfone[\za_n=k]\diag(\bftau_{\ztau_n}) + (T-1)\sum_{n=1}^N\bfone[\za_n=k]\transpose{\Beta}_{\zBeta_n}\diag(\bftau_{\ztau_n})\Beta_{\zBeta_n}\right.\right.\right.\\
  & \left. - (T-1)\sum_{n=1}^N\bfone[\za_n=k]\transpose{\Beta}_{\zBeta_n}\diag(\bftau_{\ztau_n}) -(T-1)\sum_{n=1}^N\bfone[\za_n=k]\diag(\bftau_{\ztau_n})\Beta_{\zBeta_n} \right\}\bfa_k \\
  &-2\left\{\taua\transpose{{\bfmua}}+ \sum_{n=1}^N\bfone[\za_n=k]\transpose{\bfy_{n0}}\diag(\bftau_{\ztau_n}) + \sum_{n=1}^N\bfone[\za_n=k]\{\sum_{t=1}^{T-1}\transpose{\bfy_{nt}}\}\diag(\bftau_{\ztau_n})\right.\\
  & \left.\left.\left. -\sum_{n=1}^N\bfone[\za_n=k]\{\sum_{t=1}^{T-1}\transpose{\bfy_{n(t-1)}}\}\transpose{\Beta}_{\zBeta_n}\diag(\bftau_{\ztau_n}) +\sum_{n=1}^N\bfone[\za_n=k]\{\sum_{t=1}^{T-1}\transpose{\bfy_{n(t-1)}}\}\transpose{\Beta}_{\zBeta_n}\diag(\bftau_{\ztau_n})\Beta_{\zBeta_n} \right.\right.\right.\\
  &\left.\left.\left.-\sum_{n=1}^N\bfone[\za_n=k]\{\sum_{t=1}^{T-1}\transpose{\bfy_{nt}}\}\diag(\bftau_{\ztau_n})\Beta_{\zBeta_n} \right\}\bfa_k\right]\right) 
\end{split}
\end{equation*}

Thus, 
\begin{multline*}
{\Taua_k}^\ast=\taua\bfI+T\sum_{n=1}^N{\pia_{nk}}^\ast\diag(\E_q[\bftau_{\ztau_n}]) + (T-1)\sum_{n=1}^N{\pia_{nk}}^\ast\E_q[\transpose{\Beta}_{\zBeta_n}]\diag(\E_q[\bftau_{\ztau_n}])\E_q[\Beta_{\zBeta_n}] \\
-(T-1)\sum_{n=1}^N{\pia_{nk}}^\ast\E_q[\transpose{\Beta}_{\zBeta_n}]\diag(\E_q[\bftau_{\ztau_n}]) -(T-1)\sum_{n=1}^N{\pia_{nk}}^\ast\diag(\E_q[\bftau_{\ztau_n}])\E_q[\Beta_{\zBeta_n}]
\end{multline*}
and
\begin{multline*}
    {\bfmua_k}^\ast = \inv{({\Taua_k}^\ast)} \left\{\taua{\bfmua}+ \sum_{n=1}^N{\pia_{nk}}^\ast\diag(\E_q[\bftau_{\ztau_n}])\bfy_{n0} + \sum_{n=1}^N{\pia_{nk}}^\ast\diag(\E_q[\bftau_{\ztau_n}])\{\sum_{t=1}^{T-1}\bfy_{nt}\} \right.\\
    -\sum_{n=1}^N{\pia_{nk}}^\ast \diag(\E_q[\bftau_{\ztau_n}])\E_q[\Beta_{\zBeta_n}]\{\sum_{t=1}^{T-1}\bfy_{n(t-1)}\} + \sum_{n=1}^N{\pia_{nk}}^\ast \E_q[\transpose{\Beta}_{\zBeta_n}]\diag(\E_q[\bftau_{\ztau_n}])\E_q[\Beta_{\zBeta_n}]\{\sum_{t=1}^{T-1}\bfy_{n(t-1)}\}\\
    \left.-\sum_{n=1}^N{\pia_{nk}}^\ast\E_q[\transpose{\Beta}_{\zBeta_n}]\diag(\E_q[\bftau_{\ztau_n}])\{\sum_{t=1}^{T-1}\bfy_{nt}\} \right\}\in\reals^{D\times 1}
\end{multline*}


\textbf{For coefficient matrix} $\Beta_k$:
\begin{equation*}
    \begin{split}
        p(\bfvec(\Beta_k)\g\bfTheta_{-\Beta_k},\bfY)&\propto 
    p(\bfvec(\Beta_k)\g\bfvec(\muBeta),\tauBeta\bfI)\prod_{n=1}^N p(\bfy_n\g\bfz_n,\bfa,\Beta,\bftau)^{\bfone[\zBeta_n=k]}\\
    &\propto \exp\left(\frac{{\tauBeta}\transpose{\bfvec(\Beta_k-\muBeta)}\bfvec(\Beta_k-\muBeta)}{-2}\right) \\
  &\times \prod_{n=1}^N \left\{\exp\left(\frac{\tr(\diag(\bftau_{\ztau_n})(\bfY_{n,-0}-\bfM_{\za_n})\transpose{(\bfY_{n,-0}-\bfM_{\za_n})})}{-2}\right)\right\}^{\bfone[\zBeta_n=k]}\\
  &\hspace*{-0.8cm}\text{where we let }\bfY_{n,-0}=[\bfy_{n1},\dots,\bfy_{n(T-1)}]\in\reals^{D\times (T-1)} \text{ and }\bfY_{n,-(T-1)}=[\bfy_{n0},\dots,\bfy_{n(T-2)}]\in\reals^{D\times (T-1)}\\
  & \text{so }\bfM_{\za_n}= \bfa_{\za_n}\transpose{\bfone}+\Beta_k(\bfY_{n,-(T-1)}-\bfa_{\za_n}\transpose{\bfone}) \in\reals^{D\times (T-1)}\\
  %&\hspace*{-2.5cm}\propto \exp\left(\frac{1}{-2}\left\{\tr\left(\tauBeta\bfI\Beta_k\tauBeta\bfI\transpose{\Beta}_k + \sum_{n=1}^N\bfone[\zBeta_n=k]\diag(\bftau_{\ztau_n})\Beta_k(\bfY_{n,-(T-1)}-\bfa_{\za_n}\transpose{\bfone})\transpose{(\bfY_{n,-(T-1)}-\bfa_{\za_n}\transpose{\bfone})}\transpose{\Beta_k}\right) \right.\right.\\
 % &\hspace*{-1cm}\left.\left.-2\tr\left(\tauBeta\bfI\muBeta\tauBeta\bfI\transpose{\Beta}_k + \sum_{n=1}^N\bfone[\zBeta_n=k]\diag(\bftau_{\ztau_n})(\bfY_{n,-0}-\bfa_{\za_n}\transpose{\bfone})\transpose{(\bfY_{n,-(T-1)}-\bfa_{\za_n}\transpose{\bfone})}\transpose{\Beta_k}\right)\right\}\right)
\end{split}
\end{equation*}
Thus the precision matrix and mean matrix are
\begin{equation*}
    \begin{split}
        %{\rtauBeta_k}^\ast &= \tauBeta\bfI + \sum_{n=1}^N{\piBeta_{nk}}^\ast\diag(\E_q[\bftau_{\ztau_n}])(\bfY_{n,-(T-1)}-\E_q[\bfa_{\za_n}]\transpose{\bfone})\transpose{(\bfY_{n,-(T-1)}-\E_q[\bfa_{\za_n}]\transpose{\bfone})}\\
        %{\ctauBeta_k}^\ast &= \tauBeta\bfI\\
        {\TauBeta_k}^\ast &= \tauBeta\bfI_{D^2} + \sum_{n=1}^N{\piBeta_{nk}}^\ast\left\{(\bfY_{n,-(T-1)}-\E_q[\bfa_{\za_n}]\transpose{\bfone})\transpose{(\bfY_{n,-(T-1)}-\E_q[\bfa_{\za_n}]\transpose{\bfone})} \otimes \diag(\E_q[\bftau_{\ztau_n}])\right\}\\
        {\muBeta_k}^\ast &= \mathrm{mat}\left[\inv{{{\TauBeta_k}^\ast}}\bfvec\left({\tauBeta}\muBeta + \sum_{n=1}^N{\piBeta_{nk}}^\ast\diag(\E_q[\bftau_{\ztau_n}])(\bfY_{n,-0}-\E_q[\bfa_{\za_n}]\transpose{\bfone})\transpose{(\bfY_{n,-(T-1)}-\E_q[\bfa_{\za_n}]\transpose{\bfone})}\right)\right]
    \end{split}
\end{equation*}


\textbf{For precision vector} $\bftau_k$:
\begin{equation*}
    \begin{split}
        p(\bftau_k\g\bfTheta_{-\bftau_k},\bfY)&\propto \prod_{d=1}^D p(\tau_{kd}\g\lambdatau_1,\lambdatau_2)\prod_{n=1}^N p(\bfy_n\g\bfz_n,\bfa,\Beta,\bftau)^{\bfone[\ztau_n=k]}\\
        \propto &\prod_{d=1}^D \left\{\tau_{kd}^{\lambdatau_1-1}\exp(-\lambdatau_2\tau_{kd})\right\} \prod_{n=1}^N \left\{\left(\prod_{d=1}^D \tau_{kd}^{\frac{1}{2}}\right)\exp\left(\frac{\transpose{(\bfy_{n0}-\bfa_{\za_n})}\diag(\bftau_k)(\bfy_{n0}-\bfa_{\za_n})}{-2}\right)\right\}^{\bfone[\ztau_n=k]}\\
  &\hspace*{-4.7cm}\times \prod_{n=1}^N \prod_{t=1}^{T-1} \left\{\left(\prod_{d=1}^D \tau_{kd}^{\frac{1}{2}}\right)\exp\left(\frac{\transpose{(\bfy_{nt}-\bfa_{\za_n}-\Beta_{\zBeta_n}\bfy_{n(t-1)} + \Beta_{\zBeta_n}\bfa_{\za_n})}\diag(\bftau_k)(\bfy_{nt}-\bfa_{\za_n}-\Beta_{\zBeta_n}\bfy_{n(t-1)} + \Beta_{\zBeta_n}\bfa_{\za_n})}{-2}\right)\right\}^{\bfone[\ztau_n=k]}\\
  &\propto \prod_{d=1}^D \left\{\tau_{kd}^{(\lambdatau_1+\frac{T}{2}\sum_{n=1}^N\bfone[\ztau_n=k]-1)}\right\}\exp\left(-\tau_{kd}\left\{\lambdatau_2+ \frac{1}{2}\sum_{n=1}^N\bfone[\ztau_n=k](y_{n0,d}-a_{\za_n,d})^2 \right.\right.\\
  &\left.\left.+\frac{1}{2}\sum_{n=1}^N\bfone[\ztau_n=k]\sum_{t=1}^{T-1}(y_{nt,d}-a_{\za_n,d}-\transpose{\Beta}_{\zBeta_n,d\cdot}\bfy_{n(t-1)}+\transpose{\Beta}_{\zBeta_n,d\cdot}\bfa_{\za_n})^2 \right\}\right)
    \end{split}
\end{equation*}

Thus, the components of $\bftau_k$ follow independent Gamma distributions, with the parameters for each $\tau_{kd}$ given by
$$
{\lambdatau_{kd,1}}^\ast=\lambdatau_1+\frac{T}{2}\sum_{n=1}^N{\pitau_{nk}}^\ast    
$$
and
\begin{multline*}
    \hspace*{-1cm}{\lambdatau_{kd,2}}^\ast=\lambdatau_2+\frac{1}{2}\sum_{n=1}^N {\pitau_{nk}}^\ast \left\{(y_{n0,d}-\E_q[a_{\za_n,d}])^2 +\sum_{t=1}^{T-1}(y_{nt,d}-\E_q[a_{\za_n,d}]-\E_q[\transpose{\Beta}_{\zBeta_n,d\cdot}]\bfy_{n(t-1)} +\E_q[\transpose{\Beta}_{\zBeta_n,d\cdot}]\E_q[\bfa_{\za_n}])^2 \right\}
\end{multline*}

We have the following results for the expectations in terms of variational parameters:
\begin{align*}
    \E_q[\bfa_{\za_n}] &=\sum_{j=1}^\truncbig {\pia_{nj}}^\ast{\bfmua_j}^\ast\\
    \E_q[\Beta_{\zBeta_n}] &= \sum_{j=1}^\truncbig {\piBeta_{nj}}^\ast{\muBeta_j}^\ast\\
    \E_q[\bftau_{\ztau_n}] &= \sum_{j=1}^\truncbig {\pitau_{nj}}^\ast \transpose{[\frac{{\lambdatau_{j1,1}}^\ast}{{\lambdatau_{j1,2}}^\ast},\cdots, \frac{{\lambdatau_{jD,1}}^\ast}{{\lambdatau_{jD,2}}^\ast}
]}
\end{align*}

\subsubsection{Parameters for Cluster Assignments}
\textbf{For} $\za_n$ \textbf{of intercept}:
\begin{align*}
    p(\za_n=k\g \bfTheta_{-\za_n},\bfY)&\propto p(\za_n=k\g\bfpia(\bfva))p(\bfy_n\g\bfz_n,\bfa,\Beta,\bftau)\\
    &\propto
    \va_k\prod_{i=1}^{k-1}(1-\va_i)\exp\left(\frac{\transpose{(\bfy_{n0}-\bfa_k)}\diag(\bftau_{\ztau_n})(\bfy_{n0}-\bfa_k)}{-2}\right)\\
    &\times \exp\left(\frac{\tr(\diag(\bftau_{\ztau_n})(\bfY_{n,-0}-\bfM_{\zBeta_n})\transpose{(\bfY_{n,-0}-\bfM_{\zBeta_n})})}{-2}\right)\\
    &\text{where }\bfY_{n,-0}=[\bfy_{n1},\dots,\bfy_{n(T-1)}]\in\reals^{D\times (T-1)} \text{ and }\bfY_{n,-(T-1)}=[\bfy_{n0},\dots,\bfy_{n(T-2)}]\in\reals^{D\times (T-1)}\\
  & \text{and }\bfM_{\zBeta_n}= \bfa_k\transpose{\bfone}+\Beta_{\zBeta_n}(\bfY_{n,-(T-1)}-\bfa_k\transpose{\bfone}) \in\reals^{D\times (T-1)}
\end{align*}
Thus,
$$
{\pia_{nk}}^\ast\propto \exp\left(\E_q[\log \va_k] + \sum_{i=1}^{k-1}\E_q[\log(1-\va_i)] + \Sa_{nk} \right)
$$
where 
\begin{multline*}
\Sa_{nk}=\frac{-1}{2}\left\{\transpose{\left(\bfy_{n0}-\E_q[\bfa_k]\right)}\diag(\E_q[\bftau_{\ztau_n}])(\bfy_{n0}-\E_q[\bfa_k]) + \tr\left(\diag(\E_q[\bftau_{\ztau_n}])\right.\right.\\
\times \left.\left.(\bfY_{n,-0}-\E_q[\bfa_k]\transpose{\bfone}-\E_q[\Beta_{\zBeta_n}](\bfY_{n,-(T-1)}-\E_q[\bfa_k]\transpose{\bfone}))\transpose{(\bfY_{n,-0}- \E_q[\bfa_k]\transpose{\bfone}-\E_q[\Beta_{\zBeta_n}](\bfY_{n,-(T-1)}-\E_q[\bfa_k]\transpose{\bfone}))} \right)\right\}
\end{multline*}

\textbf{Similarly,} ${\piBeta_{nk}}^\ast\propto \exp\left(\E_q[\log \vBeta_k] + \sum_{i=1}^{k-1}\E_q[\log(1-\vBeta_i)] + \SBeta_{nk} \right)$
where 
\begin{multline*}
\SBeta_{nk}=\frac{-1}{2}\left\{\transpose{\left(\bfy_{n0}-\E_q[\bfa_{\za_n}]\right)}\diag(\E_q[\bftau_{\ztau_n}])(\bfy_{n0}-\E_q[\bfa_{\za_n}]) + \tr\left(\diag(\E_q[\bftau_{\ztau_n}])\right.\right.\\
\hspace*{-0.5cm}\left.\left.(\bfY_{n,-0}-\E_q[\bfa_{\za_n}]\transpose{\bfone}-\E_q[\Beta_k](\bfY_{n,-(T-1)}-\E_q[\bfa_{\za_n}]\transpose{\bfone}))\transpose{(\bfY_{n,-0}- \E_q[\bfa_{\za_n}]\transpose{\bfone}-\E_q[\Beta_k](\bfY_{n,-(T-1)}-\E_q[\bfa_{\za_n}]\transpose{\bfone}))} \right)\right\}
\end{multline*}

\textbf{For} $\ztau_n$ \textbf{of noise}:
\begin{align*}
    p(\ztau_n=k\g \bfTheta_{-\ztau_n},\bfY)&\propto p(\ztau_n=k\g\bfpitau(\bfvtau))p(\bfy_n\g\bfz_n,\bfa,\Beta,\bftau)\\
    &\propto
    \vtau_k\prod_{i=1}^{k-1}(1-\vtau_i) \left(\prod_{d=1}^D \tau_{kd}^{\frac{1}{2}}\right)\exp\left(\frac{\transpose{(\bfy_{n0}-\bfa_{\za_n})}\diag(\bftau_k)(\bfy_{n0}-\bfa_{\za_n})}{-2}\right)\\
    &\times \left(\prod_{d=1}^D \tau_{kd}^{\frac{T-1}{2}}\right)\exp\left(\frac{\tr(\diag(\bftau_k)(\bfY_{n,-0}-\bfM_{{\za_n},\zBeta_n})\transpose{(\bfY_{n,-0}-\bfM_{{\za_n},\zBeta_n})})}{-2}\right)
\end{align*}
Thus,
\[
{\pitau_{nk}}^\ast\propto \exp\left(\E_q[\log \vtau_k] + \sum_{i=1}^{k-1}\E_q[\log(1-\vtau_i)] + \Stau_{nk} \right)
\]
where 
\begin{align*}
\Stau_{nk} &= \frac{T}{2}\sum_{d=1}^D\log\E_q[\tau_{kd}]
+ \frac{-1}{2}\left\{\transpose{\left(\bfy_{n0}-\E_q[\bfa_{\za_n}]\right)}\diag(\E_q[\bftau_k])(\bfy_{n0}-\E_q[\bfa_{\za_n}]) + \tr\left(\diag(\E_q[\bftau_k])\times\right.\right.\\
&\hspace*{-1.7cm}\left.\left.(\bfY_{n,-0}-\E_q[\bfa_{\za_n}]\transpose{\bfone}-\E_q[\Beta_{\zBeta_n}](\bfY_{n,-(T-1)}-\E_q[\bfa_{\za_n}]\transpose{\bfone}))\transpose{(\bfY_{n,-0}- \E_q[\bfa_{\za_n}]\transpose{\bfone}-\E_q[\Beta_{\zBeta_n}](\bfY_{n,-(T-1)}-\E_q[\bfa_{\za_n}]\transpose{\bfone}))} \right)\right\}
\end{align*}

\subsubsection{ELBO Computation}
\begin{align*}
&\E_q[\log p(\bfy_n\g\bfz_n,\bfa,\Beta,\bftau)]= \left(\frac{1}{2}\sum_{d=1}^D \E_q[\log\tau_{\ztau_n,d}]\right)-\frac{1}{2}\transpose{(\bfy_{n0}-\E_q[\bfa_{\za_n}])}\diag(\E_q[\bftau_{\ztau_n}])(\bfy_{n0}-\E_q[\bfa_{\za_n}])\\
& + \sum_{t=1}^{T-1} \left\{\left(\frac{1}{2}\sum_{d=1}^D \E_q[\log\tau_{\ztau_n,d}]\right) -\frac{1}{2}\transpose{(\bfy_{nt}-\E_q[\bfa_{\za_n}]-\E_q[\Beta_{\zBeta_n}]\bfy_{n(t-1)} + \E_q[\Beta_{\zBeta_n}]\E_q[\bfa_{\za_n}])}\diag(\E_q[\bftau_{\ztau_n}]) \right.\\
&\times \left.(\bfy_{nt}-\E_q[\bfa_{\za_n}]-\E_q[\Beta_{\zBeta_n}]\bfy_{n(t-1)} + \E_q[\Beta_{\zBeta_n}]\E_q[\bfa_{\za_n}])\right\}\\
    &\E_q[\log p(\bfa_k\g\bfmua,\taua\bfI)]=-\frac{D}{2}\log(2\pi)+\frac{D}{2}\log\taua-\frac{\taua}{2}(\norm{{\bfmua_k}^\ast-\bfmua}^2+\tr(\inv{({{\Taua_k}^\ast})}))\\
    &\E_q[\log q(\bfa_k\g{\bfmua_k}^\ast,{\Taua_k}^\ast)]=-\frac{D}{2}\log(2\pi)+\frac{1}{2}\log\deter{{\Taua_k}^\ast}-\frac{D}{2}\\
    &\E_q[\log p(\bfvec(\Beta_{k})\g\bfvec(\muBeta),\tauBeta\bfI)]= -\frac{D^2}{2}\log(2\pi)+\frac{D^2}{2}\log\tauBeta-\frac{{\tauBeta}}{2}
    (\norm{\bfvec({\muBeta_k}^\ast-\muBeta)}^2 + \tr(\inv{({\TauBeta_k}^\ast)}))\\
    &\E_q[\log q(\bfvec(\Beta_{k})\g\bfvec({\muBeta_k}^\ast),{\TauBeta_k}^\ast)]= -\frac{D^2}{2}\log(2\pi)+\frac{1}{2}\log\deter{{\TauBeta_k}^\ast}-\frac{D^2}{2}\\
    &\E_q[\log p(\bftau_k\g\bflambdatau)]=\sum_{d=1}^D\left(\lambdatau_1\log\lambdatau_2 - \log\Gamma(\lambdatau_1)+(\lambdatau_1-1)\left\{\bfPsi({\lambdatau_{kd,1}}^\ast)-\log{\lambdatau_{kd,2}}^\ast \right\} -\lambdatau_2\frac{{\lambdatau_{kd,1}}^\ast}{{\lambdatau_{kd,2}}^\ast}\right)\\
    &\E_q[\log q(\bftau_k\g{\bflambdatau_k}^\ast)] = \sum_{d=1}^D\left( {\lambdatau_{kd,1}}^\ast\log{\lambdatau_{kd,2}}^\ast-\log\Gamma({\lambdatau_{kd,1}}^\ast)+({\lambdatau_{kd,1}}^\ast-1)\left\{\bfPsi({\lambdatau_{kd,1}}^\ast)-\log{\lambdatau_{kd,2}}^\ast \right\} -{\lambdatau_{kd,1}}^\ast \right)\\
    &\E_q[\log p(\vdot_k\g\alphadot)]=\left(\frac{{\sdot_1}^\ast}{{\sdot_2}^\ast}-1\right)\{\bfPsi({\alphadot_{k2}}^\ast)-\bfPsi({\alphadot_{k1}}^\ast+{\alphadot_{k2}}^\ast)\} +\{\bfPsi({\sdot_1}^\ast)-\log{\sdot_2}^\ast\} \\
    &\E_q[\log q(\vdot_k\g{\bfalphadot_k}^\ast)]= ({\alphadot_{k1}}^\ast-1)\{\bfPsi({\alphadot_{k1}}^\ast)-\bfPsi({\alphadot_{k1}}^\ast+{\alphadot_{k2}}^\ast)\}+ ({\alphadot_{k2}}^\ast-1)\{\bfPsi({\alphadot_{k2}}^\ast)-\bfPsi({\alphadot_{k1}}^\ast+{\alphadot_{k2}}^\ast)\} \\ 
    &\hspace{+3.5cm}-\log\bfB({\alphadot_{k1}}^\ast,{\alphadot_{k2}}^\ast) \\
    &\E_q[\log p(\zdot_n\g\bfvdot)] =\sum_{k=1}^\truncbig\left\{\left(\sum_{i=k+1}^\truncbig{\pidot_{ni}}^\ast\right)\{\bfPsi({\alphadot_{k2}}^\ast)-\bfPsi({\alphadot_{k1}}^\ast+{\alphadot_{k2}}^\ast)\} +{\pidot_{nk}}^\ast\{\bfPsi({\alphadot_{k1}}^\ast)-\bfPsi({\alphadot_{k1}}^\ast+{\alphadot_{k2}}^\ast)\}\right\}\\
    &\E_q[\log q(\zdot_n\g{\bfpidot_n}^\ast)]=\sum_{i=1}^\truncbig{\pidot_{ni}}^\ast\log{\pidot_{ni}}^\ast\\
    &\E_q[\log p(\alphadot\g\bfsa)]=\sdot_1\log\sdot_2-\log\Gamma(\sdot_1)+(\sdot_1-1)\{\bfPsi({\sdot_1}^\ast)-\log{\sdot_2}^\ast\}-\sdot_2\frac{{\sdot_1}^\ast}{{\sdot_2}^\ast}\\
    &\E_q[\log q(\alphadot\g{\bfsa}^\ast)]={\sdot_1}^\ast\log{\sdot_2}^\ast-\log\Gamma({\sdot_1}^\ast)+({\sdot_1}^\ast-1)\{\bfPsi({\sdot_1}^\ast)-\log{\sdot_2}^\ast\}-{\sdot_1}^\ast
\end{align*}


\section{Proof for B-splines}\label{proof:B-spline}
Given a set of $N$ B-splines $\{B_{i,p}(t)\}_{i=0}^{N-1}$ of degree $p-1$ with coefficients $\{\beta_i\}_{i=0}^{N-1}$, denote $B_{0,p}(t)$ as a B-spline controlling the intercept such that $B_{0,p}(t=0)\neq 0$ while $B_{i,p}(t=0)=0$ for all $i\neq 0$. Then a linear combination of the full collection, including $B_{0,p}(t)$, is equivalent to a linear combination of the B-splines without $B_{0,p}(t)$ plus an additional explicit intercept.

\begin{proof}
We know $\sum_{i=0}^{N-1}B_{i,p}(t)=1$ by definition. With $B_{0,p}(t)$, the linear combination is
\begin{align*}
    \sum_{i=0}^{N-1}\beta_i B_{i,p}(t) & = \beta_0 B_{0,p}(t)+ \sum_{i=1}^{N-1}\beta_i B_{i,p}(t) - \sum_{i=0}^{N-1}\beta_0 B_{i,p}(t) + \beta_0\\
    &= \beta_0 B_{0,p}(t)+ \sum_{i=1}^{N-1}\beta_i B_{i,p}(t) - \beta_0 B_{0,p}(t) - \sum_{i=1}^{N-1}\beta_0 B_{i,p}(t) + \beta_0\\
    &= \sum_{i=1}^{N-1}(\beta_i-\beta_0) B_{i,p}(t) + \beta_0\\
    &= \sum_{i=1}^{N-1}\beta_i^{(new)} B_{i,p}(t) + \beta_0 \quad\text{( let } \beta_i^{(new)}=\beta_i-\beta_0)
\end{align*}
From the last equation, we see that the first term is the linear combination of B-splines without $B_{0,p}(t)$ and the second term can be seen as an additional intercept term.
\end{proof}

\section{Proof for B-splines Intercept Shift}\label{proof:B-spline shift}
Given a set of $N$ B-splines $\{B_{i,p}(t)\}_{i=1}^{N}$ of degree $p-1$ with coefficients $\{\beta_i\}_{i=1}^{N}$, excluding the intercept $B_{0,p}(t)$, denote $t_{\text{tar}}$ as a targeted time point whose function value we want a new intercept $\beta_0^{(new)}$ to represent. Then the B-spline function with this new intercept is equivalent to shifting each B-spline $B_{i,p}(t)$ downward by $B_{i,p}(t_{\text{tar}})$.
%\end{theorem}

\begin{proof}
By \autoref{proof:B-spline}, the B-spline function can be expressed as $f(t)= \beta_0 + \sum_{i=1}^{N}\beta_i B_{i,p}(t)$. Let $\beta_0^{(new)}=f(t_{\text{tar}})=\beta_0 + \sum_{i=1}^{N}\beta_i B_{i,p}(t_{\text{tar}})$ be its value at time $t_{\text{tar}}$; then
\begin{align*}
    f(t) & = \beta_0 + \sum_{i=1}^{N}\beta_i B_{i,p}(t) + \beta_0^{(new)} - \beta_0^{(new)}\\
    &= \beta_0^{(new)} + \left(\beta_0 + \sum_{i=1}^{N}\beta_i B_{i,p}(t) - \beta_0 - \sum_{i=1}^{N}\beta_i B_{i,p}(t_{\text{tar}})\right)\\
    &= \beta_0^{(new)} + \sum_{i=1}^{N}\beta_i \left(B_{i,p}(t) - B_{i,p}(t_{\text{tar}})\right)
\end{align*}
\end{proof}

\end{document}
