% \section{Appendix}
\counterwithin{figure}{section}


\section{HDP-Flow variational posterior distribution}\label{app:var_dist}
The mean-field factorized variational posterior of \hdpflow\ over the global and local latent variables $W = \{Z, \theta, \beta, \pi, \kappa\}$ is shown below, where the posterior on each variable is modelled independently, and with a family of distribution similar to the prior. The infinite number of states are truncated to $k \in [1, \cdots, L]$ to simplify the variational posterior.

\begin{equation} 
\label{eq:app_variational_posterior}
\begin{split}
q(Z, \theta, \beta, \pi, \kappa|\Theta^*) &= 
{\underbrace{q(\beta)}_{\text{Dirichlet}}}
\prod_{t} {\underbrace{q(z^{(i)}_t)}_{\text{Categorical}}} 
\prod_k {\underbrace{q(\theta_k)}_{\text{Gaussian}}}\\
& \times \prod_k {\underbrace{q(\pi_k)}_{\text{Dirichlet}}}
\prod_k {\underbrace{q(\kappa_k)}_{\text{Beta}}}  
\end{split}
\end{equation} 

The distribution of the global state probabilities and the transition probability for each state is modelled with a Dirichlet distribution with $L$ categories. The variational distribution over each latent state $z_t$ is a categorical distribution, following the mean-field assumption. 





\section{Stochastic BBVI gradient estimations}\label{app:gradients} 

BBVI uses the Rao-Blackwell estimator for the gradients, which under the mean field assumption becomes:

\begin{equation}
\begin{split}
    \nabla_{\theta^*} \mathcal{L}  = \E_{q_1}\dots\E_{q_i} [& \sum_{j=1}^{i} \nabla_{\theta^*} \log q_j (z_j | \theta^*_j) (\log p(x, z) \\
    & -\sum_{j=1}^{i} \log q_j (z_j | \theta^*_j))]
\end{split}
\end{equation}

Here, $\nabla_{\theta^*} \mathcal{L}$ is the gradient of the ELBO with respect to the variational parameters $\theta^*$ (with $\theta^*_i$ the parameters of the $i$-th factor), and $\E_{q_i}$ is the expectation with respect to the set of latent variables that appear in the complete conditional for $z_i$. Let $p_i$ be the components of the log joint that include terms from the $i$-th factor, and $p_{-i}$ the components of the joint that do not include terms from the $i$-th factor. We can write the gradient with respect to the $i$-th factor's variational parameters as:

\begin{equation}
    \nabla_{\theta^*_i} \mathcal{L}  = \E_{q_i} [\nabla_{\theta^*_i} \log q_i (z_i | \theta^*_i) (\log p_i(x, z_i) -  \log q_i (z_i | \theta^*_i))]
\end{equation}


% \begin{equation}
%     % \nabla_{\beta} \mathcal{L}  = \E_{q_\beta} [\nabla_{\beta} \log q(\beta) (\log p(\beta) + \log p(\pi|\beta) -  \log q (\beta))]
% \end{equation} 

Using this derivation, the joint distribution defined in Equation~\ref{eq:joint},
and the factorized variational posterior \eqref{eq:app_variational_posterior}, the gradient of each of the variational parameters $\Theta^*_{\beta}, \Theta^*_{\theta}, \Theta^*_{\pi}, \Theta^*_{\kappa}, \Theta^*_{Z}$ that are the parameters of $q(\beta), q(\theta), q(\pi), q(\kappa), q(Z)$ respectively is calculated in Equation~\ref{eq:gradients}. Note that these estimations take into account the unique sequential and hierarchical dependencies of the \hdpflow\ distribution. 
  
\begin{equation}
 \begin{split} 
    \label{eq:gradients}
    {\hat{\nabla}}_{\Theta^*_{\beta}}\mathcal{L} = & \frac{1}{S} \sum_{s=1}^{S} \nabla \log q(\beta_s) \Big(\log p(\beta_s) + \\ 
    & \sum_{k=1}^{L} \log p(\pi_{k,s}|\beta_s) - \log q(\beta_s)\Big)\\    
    {\hat{\nabla}}_{\Theta^*_{\theta_k}}\mathcal{L} = & \frac{1}{S} \sum_{s=1}^{S} \nabla \log q(\theta_{k,s})\Big(\log p(\theta_{k,s}) + \\ 
    & \sum_{t=1}^{T}\log p(x_t|z_{t,s},\theta_{s})\delta_{(z_{t,s}=k)} - \log q(\theta_{k,s})\Big) \\
    {\hat{\nabla}}_{\Theta^*_{\pi_k}}\mathcal{L} = & \frac{1}{S} \sum_{s=1}^{S} \nabla \log q(\pi_{k,s})\Big(\log p(\pi_{k,s}) + \\ 
    & \sum_{t=1}^{T}\log p(z_{t,s}|\theta_s, \pi_s, \kappa_s)\delta_{(z_{t-1,s}=k)} - \log q(\pi_{k,s})\Big) \\
    {\hat{\nabla}}_{\Theta^*_{\kappa_k}}\mathcal{L} = & \frac{1}{S} \sum_{s=1}^{S} \nabla \log q(\kappa_{k,s})\Big(\log p(\kappa_{k,s}) +\\ 
    & \sum_{t=1}^{T}\log p(z_{t,s}|\theta_s, \pi_s, \kappa_s)\delta_{(z_{t-1,s}=k)}
    - \log q(\kappa_{k,s})\Big)\\
    {\hat{\nabla}}_{\Theta^*_{z_t}}\mathcal{L} = & \frac{1}{S} \sum_{s=1}^{S} \nabla \log q(z_{t,s})\Big(\log p(z_{t,s}|\pi_s, \kappa_s) +\\
    & \log p(x_t|z_{t,s}, \theta_s) - \log q(z_{t,s}) \Big).
\end{split}  
\end{equation}






\section{Posterior predictive estimation}\label{app:fb_algo}
In order to measure the posterior likelihood of new samples and to estimate the underlying states for these samples, we use the forward messages of the forward-backward (FB) algorithm. The $k^{th}$ forward message of the FB algorithm at time $t$, ${\bf{f}}_t(k)$, estimates the joint likelihood of the observations up to time $t$, and the state $z_t$:

\begin{equation}
    {\bf{f}}_t(k) = p(\tilde{x}_{0:t}, z_t=k|\beta, \pi, \kappa, \theta)
\end{equation}
 

Therefore, the likelihood of a series of observations $p(\tilde {X}|\beta, \pi, \kappa, \theta, {\bf{X}})$ is the marginal of the last time step. 

\begin{equation}
\begin{split}
    \mathbf {f_{t}}(k) = &  p(x_t|z_t=k) \sum_{z_{t-1}=1}^{L} \mathbf{f_{t-1}}(z_{t-1}) p(z_t=k|z_{t-1})  \\
    \mathbf {f_{t}}(k)= & p(x_1, x_2,\dots ,x_t,z_{t}=k)\\
\end{split}
\end{equation}

Typically, the forward probability vectors at each step are normalized so that the entries sum to 1. A scaling factor $c_t$ is thus introduced, and as a result, the product of the scaling factors is the total probability for observing the given events irrespective of the final states:

\begin{equation}
\begin{split}
    &\mathbf{\hat{f}}_{t}(k) =  c_t^{-1} p(x_t|z_t=k) \sum_{z_{t-1}=1}^{L} \mathbf{\hat{f}}_{t-1}(z_{t-1}) p(z_t=k|z_{t-1})  \\
    &p({\tilde{X}}_{0:T}) = p(x_1, x_2,\dots ,x_T) =  \prod_{t=1}^{T}c_{t}\\
\end{split}
\end{equation}

To estimate the negative log likelihood of an unobserved time series sample ${\tilde{x}}$, we need to estimate the posterior likelihood as follows:
\begin{equation}
\begin{split}
    \mathrm{NLL} =& -\log p({\tilde{X}}|{\bf{X}})\\
    = & -\log \int_{\beta, \pi, \kappa, \theta} p({\tilde{X}}|\beta, \pi, \kappa, \theta)\,p(\beta, \pi, \kappa, \theta|{\bf{X}})\\
       \approx & -\log \int_{\beta, \pi, \kappa, \theta} p({\tilde{X}}|\beta, \pi, \kappa, \theta)\,q(\beta, \pi, \kappa, \theta)\\
       = & -\log \E_{\beta, \pi, \kappa, \theta \sim q}\, p({\tilde{X}}|\beta, \pi, \kappa, \theta)
\end{split}
\end{equation}


\section{Datasets} \label{app:dataset}
\paragraph{Simulated dataset I}
This dataset consists of 3-dimensional time series samples with 4 underlying states. The state transitions are governed by an HMM with the following fixed transition probabilities: 
$$\pi = \begin{bmatrix} 0.8 & 0.1 & 0.05 & 0.05 \\ 0.1 & 0.8 & 0.1 & 0. \\ 0.05 & 0.1 & 0.8 & 0.05 \\ 0.05 & 0.05 & 0.0 & 0.9\end{bmatrix}$$ 
Here the $ij$'th element is the probability of moving from state $i$ to state $j$. 
The emission probability of each state is a Gaussian $\mathcal{N}(\mu_{z_t} , I)$, where $\mu_{z_t}$ is fixed for each state and defined as $\begin{bmatrix} 0 & 1 & 2 \\ 5 & 6 & 1 \\ 5 & 5 & 5 \\ 9 & 12 & 11 \end{bmatrix}$, where the $i$'th row is the mean vector for state $i$. The observations are drawn i.i.d.\ from the distribution and do not depend on time. 


\paragraph{Simulated dataset II}
This dataset consists of 4-dimensional time series samples with 6 underlying states. The sequence of states for each sample is determined by a sticky HDP-HMM, with a fixed global state distribution, transition probabilities, and self-transition parameters. This dataset is designed to have a non-stationary emission where within each state $k$, the data distribution is $x_t = a_k t + b_k + \epsilon$, with $\epsilon$ Gaussian noise. The matrices $A$ and $B$, whose $k$-th rows are $a_k$ and $b_k$ respectively, are as follows:
\[A = \begin{bmatrix} 0&0& 0& 0.2 \\ 0.3& 0& 0& 0 \\ 0& 0& 0& 0 \\ 0& 0& 0& 0 \\ 0& 0.2& -0.1& 0 \\ 0& 0.5& 0& 0\end{bmatrix}, \quad B = \begin{bmatrix} 8&8&8&8 \\ -3& -3& -5& -5 \\ 5& 0& 2& 0 \\ 2& 0& 2& 0 \\ 0& 2& -3& 0 \\ -4&  -4&  -4&  -4\end{bmatrix}\] 



\paragraph{Simulated dataset III}
To increase complexity in the experiments, we used a third simulated dataset with additional temporal dynamics. This dataset is generated from a generative model with a similar structure to HDP-Flow, where the number of states is finite (set to 6) and all latent variables are fixed. The code to generate this dataset is included as part of the supplementary material. 

\paragraph{Human Activity Recognition (HAR 70+):} This dataset contains 18 fit-to-frail older-adult subjects (70-95 years old) wearing wearable sensors during a semi-structured free-living protocol \citep{misc_har70+_780}. With 6 features and an average of 5K time steps per sample, it is a good example of long time series found in real-world applications.


\subsection{Bump and Crohn's datasets}
\label{subsubsection:bump_crohn_data}
The inclusion criteria for both Bump and Crohn's datasets require participants to have less than 60\% missing data during their participation and at least 35 recorded data points. Additionally, participants from the Bump dataset with missing information were excluded. The Crohn's dataset consists of a 62\% female population, with the demographic distribution shown in Figure \ref{fig:crohn_demo}. 
\begin{figure}[!ht]
    \centering
    \includegraphics[width=0.95\linewidth]{figures/figures_bump/crohn_demo_dist.jpg}
    \caption{Age and BMI distribution of Crohn's data}
    \label{fig:crohn_demo}
\end{figure}

Physiological wearable input features include Nighttime Mean Heart Rate and Heart Rate Variability (HRV), calculated using RMSSD, which provide insights into cardiovascular function. Sleep metrics, including the duration of deep sleep, REM sleep, and awake time, help assess overall sleep structure and efficiency. Body Temperature Shift, representing deviations from an individual’s baseline, offers insights into physiological changes. The Midpoint of Sleep Period, measured in seconds from sleep onset to its midpoint, helps identify patterns and potential disruptions in sleep timing. Sleep Onset Latency, which quantifies the time taken to transition from wakefulness to sleep, serves as a key indicator of sleep efficiency and potential disorders. Lastly, the Sleep Score provides a comprehensive evaluation of sleep quality and quantity by analyzing factors such as sleep stages, restfulness, and timing.

Subjective features that \hdpflow\ is not trained on were analyzed to check whether changes in the distribution of input physiological signals relate to the distribution of subjective measures. To identify overlapping survey questions between the Bump and Crohn's datasets, we used TF-IDF~\citep{salton1988term} vectorization and cosine similarity. First, both sets were transformed into numerical TF-IDF vectors, capturing word importance while minimizing the impact of common terms. Then, cosine similarity was computed between each pair of questions from the two datasets. A similarity threshold of 0.7 was applied to identify closely related question pairs. Based on this analysis, we selected the following set of features:
\begin{itemize}
    \item Feeling in Control: A daily feature based on the question, ``Right now, do you feel in control?'' Responses range from 0 to 100, normalized to 0--1 for analysis.
    \item Weekly Sleep Impairment: Assessed through 9 questions, e.g., ``I had problems during the day because of poor sleep'', with responses ranging from ``Not at all'' (0) to ``Very much'' (4), capturing the extent of sleep-related difficulties.
    \item Pain Interference: Evaluates how pain impacts daily life, including participation in social activities, day-to-day tasks, work, and household chores.
\end{itemize}
% \paragraph{Human Activity Recognition (HAR)}
% An interesting application of learning states in time series data is  detecting activities using wearable data. We use the UCR HAR dataset\footnote{\url{http://archive.ics.uci.edu/ml/datasets/Smartphone-Based+Recognition+of+Human+Activities+and+Postural+Transitions}} that consists of 30 individuals performing a protocol of actions composed of six basic activities: three static postures (standing, sitting, lying) and three dynamic activities (walking, walking downstairs and walking upstairs). The experiment also included postural transitions that occurred between the static postures: stand-to-sit, sit-to-stand, sit-to-lie, lie-to-sit, stand-to-lie, and lie-to-stand. All the participants were wearing a smartphone to record accelerometer and gyroscope signals at a rate of 50Hz. The resulting signals have 6 features and on average 10K samples over time, spanning over multiple activities


\section{Baselines}\label{app:baselines}


% \subsection{Training details}
\subsection{HDP-Flow}

 We use the pytorch-ts implementation of MAF\footnote{\url{https://github.com/zalandoresearch/pytorch-ts}} in HDP-Flow. The models are all trained on CPU machines, with an 8-hour limit for training. For evaluation, data is split into train, validation, and test cohorts. All results in the paper are reported for the test set, and the validation set is used to choose the parameters of the prior distribution $\Theta=\{\alpha, \gamma, \rho, \lambda\}$. We estimate the posterior predictive over the unobserved samples in the validation set for tuning the parameters of our prior. 

 Variational inference algorithms can converge to local optima and, as a result, fail to yield the best estimate of the posterior. To overcome this, we use the estimated ELBO to pick the best converged posterior out of $5$ runs for each experiment. 
 % The posterior predictive distribution for a new observation $\tilde {x}$ can be estimated by the following marginalization:

% \begin{equation}
%  p({\tilde {x}}|\mathbf {X} )=\int _{W }p({\tilde {x}}|W ,\mathbf {X} )\,p(W |\mathbf {X} )\operatorname {d} \!W \approx \int _{W }p({\tilde {x}}|W),q(W)\operatorname{d} \!W   
% \end{equation}



The final parameters used for each set of experiments are shown in Table~\ref{tab:prior_param}.

\begin{table*}
    \centering
    \begin{tabular}{lcccccc}
    \toprule
        Dataset &  $\alpha$ & $\gamma$ & $\rho_1$ & $\rho_2$ & $H$ & MADE size\\
        \midrule
        Simulated I & 4 & 4 & 1 & 3 & $\mathcal{N}(0,3I)$ & 2\\
        Simulated II & 4 & 2 & 0.2 & 0.6 & $\mathcal{N}(0,2I)$ & 1\\
        Simulated III & 6 & 2 & 0.5 & 1 & $\mathcal{N}(0,4I)$ & 2\\
        HAR  & 6 & 4 & 1 & 3 & $\mathcal{N}(0,3I)$ & 2\\
        CPAP  & 2 & 2 & 0.01 & 0.3 & $\mathcal{N}(0,4I)$ & 2\\
        HAR70  & 6 & 2 & 0.2 & 0.6 & $\mathcal{N}(0,3I)$ & 2\\
        Crohn's data & 4 & 2 & 0.8 & 0.2 & $\mathcal{N}(0,I)$ & 2\\
        \bottomrule
    \end{tabular}
    \caption{Optimal hyper parameters selected for HDP-Flow for each dataset}
    \label{tab:prior_param}
\end{table*}

As discussed in Section~\ref{sec:proposed_gen_model}, we use a MAF to model the data distribution.\footnote{We use the implementation of MAF provided at \url{https://github.com/zalandoresearch/pytorch-ts}.} This method utilizes Masked Autoencoders for Density Estimation (MADE) blocks. Our generative model uses 1 MADE block, with 1 hidden layer, and we treat the size of the hidden layer as one of our parameters.  
% We modified the implementation by \citet{pytorchtsgithub} so that the MADE blocks only learn the means of the conditional distributions, and variances are fixed to 1, in order to reduce the size of $\theta_k$ needed. In other words, the $\alpha_{i,t}$ values from \ref{sec:proposed_gen_model} are set to $0$, meaning the variances which are $e^{\alpha_{i,t}} = 1$ 

\subsection{S-HDP-HMM and DS-HDP-HMM}
 We use implementations of both the S-HDP-HMM and DS-HDP-HMM from \cite{zhou2020disentangled}, including the Gibbs sampling routine. For S-HDP-HMM, a $\mathrm{Gamma}(1,1)$ prior is placed on $\gamma$ (the concentration parameter on the higher level DP). A value $x$ is sampled from $\mathrm{Gamma}(\alpha_a, 1/\alpha_b)$ and a value $y$ is sampled from $\mathrm{Beta}(c_1, c_2)$. These define the initial value of $\kappa$ (self transition weight) and $\alpha$ (lower level DP concentration parameter) as follows: $\kappa = x \cdot y$, $\alpha=x-\kappa$.
% \subsection{Training Details for DS-HDP-HMM}
For the DS-HDP-HMM model, we place a $\mathrm{Beta}(\rho_0, \rho_1)$ prior on the $\kappa$ values, where $\rho_0 = \frac{v_0}{v_1^3}$ and $\rho_1 = \frac{(1-v_0) \rho_0}{v_0}$, where $v_0 \sim \mathrm{Unif}(0, 1)$ and $v_1 \sim \mathrm{Unif}(0, 1)$. $\gamma$ is initialized in the same way as described above for the S-HDP-HMM, and $\alpha \sim \mathrm{Gamma}(1, 10)$. In both DS-HDP-HMM and S-HDP-HMM, we use the implementation from \cite{zhou2020disentangled} for the AR-HMM emission, which models the emission distribution as follows: $y_t \sim \mathcal{N}(A_{z_t}y_{t-1}, \Sigma_{z_t})$ where $y_t$ is the observed vector at time $t$, and $A$ and $\Sigma$ matrices are learned for each state. A Matrix Normal prior is placed on $A_j$ given $\Sigma_j$ as follows: 
\begin{equation}
    \begin{split}
        p(A_j|\Sigma_j) &= \frac{1}{(2\pi)^{\frac{d^2}{2}}|V|^{d/2}|\Sigma_j|^{d/2}} \times\\
        &\exp\bigg(-\frac{1}{2}\operatorname{tr}\big[(A_j-M)^\top\Sigma_j^{-1}(A_j-M)V^{-1}\big]\bigg)
    \end{split}
\end{equation}
and an Inverse Wishart prior is placed on $\Sigma_j$ as follows:
\begin{equation}
    \begin{split}
    p(\Sigma_j) =& \frac{|S_0|^{n_0/2}}{2^{n_0 d/2} \Gamma_d(n_0/2)}|\Sigma_j|^{-(n_0+d+1)/2} \times\\
    & \exp\bigg(-\frac{1}{2}\operatorname{tr}(\Sigma_j^{-1}S_0)\bigg)
    \end{split}
\end{equation}
where $\Gamma_d(\cdot)$ is the multivariate gamma function, $d$ is the dimension of the data, $M$ is the $d \times d$ zero matrix and $n_0 = d+2$. $V = v I_{d \times d}$ and $S_0 = s \bar{\Sigma}$ (where $\bar{\Sigma}$ is the empirical covariance matrix of the train data). The final choice of hyper parameters for S-HDP-HMM for all datasets can be found in Table~\ref{tab:shdphmmprior_param}, and for DS-HDP-HMM in Table~\ref{tab:dshdphmmprior_param}.

\begin{table}[!ht]
    \centering
    \begin{tabular}{lcccccc}
    \toprule
        Dataset &  $\alpha_a$ & $\alpha_b$ & $c_1$ & $c_2$ & $v$ & s\\
        \midrule
        Simulated I & 2 & 1 & 1 & 1 & 0.1 & 0.75\\
        Simulated II & 2 & 1 & 2 & 1 & 1 & 0.75\\
        Simulated III & 2 & 1 & 2 & 1 & 0.1 & 1.0\\
        HAR  & 2 & 1 & 2 & 1 & 1 & 0.75\\
        HAR70 & N/A & N/A & N/A & N/A & N/A & N/A\\
        CPAP  & 1 & 1 & 1 & 1 & 1 & 1\\
        \bottomrule
    \end{tabular}
    \caption{Best set of hyper parameters (based on validation loss) for each dataset for S-HDP-HMM}
    \label{tab:shdphmmprior_param}
\end{table}

\begin{table}[!ht]
    \centering
    \begin{tabular}{lcc}
    \toprule
        Dataset & $v$ & s\\
        \midrule
        Simulated I & 0.1 & 0.75\\
        Simulated II & 0.1 & 0.75\\
        Simulated III & 0.1 & 1\\
        HAR & 1 & 0.75\\
        HAR70 & N/A & N/A\\
        CPAP  & 1 & 0.75\\
        \bottomrule
    \end{tabular}
    \caption{Best set of hyper parameters (based on validation loss) for each dataset for DS-HDP-HMM}
    \label{tab:dshdphmmprior_param}
\end{table}
% \subsubsection{Run time}

\subsection{HMM-Flow}
We use the implementation\footnote{\url{https://github.com/tooploox/flowhmm}} provided by \cite{lorek2022flowhmm}. For all datasets, we train for 100 epochs, using Q training (see \cite{lorek2022flowhmm} for more details) and a learning rate of 0.01. The number of hidden states for the HMM is set to the true number of hidden states for each dataset. 

\subsection{Supervised RNN}
This architecture consists of an LSTM along with a linear classifier which takes in the LSTM's hidden state and predicts the state class. The model is trained end to end. Each model is trained for 100 epochs, with a dropout rate of 0.50 in the LSTM. We vary the learning rate and the number of layers in the LSTM, and report the best choice for each dataset (chosen according to the lowest validation loss) in Table~\ref{tab:lstm_classifier_hparams}. 

\begin{table}[!ht]
    \centering
    \begin{tabular}{lcc}
    \toprule
        Dataset & LR & \# Layers\\
        \midrule
        Simulated I & 0.01 & 4\\
        Simulated II & 0.01 & 4\\
        Simulated III & 0.005 & 2\\
        HAR & 0.005 & 2\\
        HAR70 & 0.01 & 4\\
        CPAP  & 0.01 & 4\\
        \bottomrule
    \end{tabular}
    \caption{Best set of hyper parameters (based on validation loss) for each dataset for Supervised RNN}
    \label{tab:lstm_classifier_hparams}
\end{table}


\section{Computational analysis}
Gibbs sampling is a major computational bottleneck in Bayesian nonparametric models, particularly when applied to long sequences. The inference complexity of HDP-HMM variants is $\mathcal{O}(N(TK^2+TL^2)+NK)$ (see \citet{zhou2020disentangled} for more details), and addressing this challenge was a key motivation for adopting variational inference in our approach. By using a mean-field approximation within the black-box variational inference (BBVI) framework, we reduce the overall complexity to $\mathcal{O}(NS(TL+TdL))$, where the first term accounts for the variational posterior updates, and the second term reflects the cost of evaluating the MAF-based emission likelihood. This reduction translates into significant efficiency gains in practice. To ensure a fair comparison, we allowed all Gibbs-based models to train until convergence, up to a maximum of 20 hours. In practice, these models consistently reached the time limit without converging. In contrast, our model typically converges well within this time frame. Table~\ref{tab:runtime} shows the runtime of HDP-Flow on different datasets. 


\begin{table}[!ht]
    \centering
    \begin{tabular}{lcccccc}
    \toprule
    \textbf{Dataset} & Sim I & Sim II & Sim III & HAR & CPAP \\
    \midrule
    \textbf{Runtime} & 4  & 4.5  & 6  & 2.5  & 8  \\
    \bottomrule
    \end{tabular}
    \caption{HDP-Flow training time until convergence (in hours) on CPU for different datasets}
    \label{tab:runtime}
\end{table}



\section{Uncertainty Measures}
\label{subsection: Uncertainty}
Our interpretable probabilistic model enables uncertainty estimation, providing insights into model reliability for new samples and states. Distinguishing different types of uncertainty is crucial, especially in clinical applications \cite{hullermeier2021aleatoric, valdettarooffline, gawlikowski2023survey}. We compute multiple uncertainty metrics based on the inferred state probabilities $\gamma$ and posterior sample likelihood, capturing epistemic (model-related), aleatoric (data-related), and robustness-based (perturbation-based) uncertainties.

\paragraph{Aleatoric Uncertainty (Variance of Log-Likelihood): } Aleatoric uncertainty represents inherent data noise. It is estimated by the variance of posterior likelihoods across MC samples:
\[
\text{log\_like\_var} = \text{Var}(\text{posterior\_like}, \text{across MC samples})
\]
A higher variance suggests greater ambiguity in the data, limiting confidence in inferred states. Based on log\_like\_var, we identified 4 patients in the Crohn's dataset and 3 participants in the Bump dataset with high aleatoric uncertainty. These individuals were excluded from the analysis.

\paragraph{State Uncertainty (Variance of $\gamma$): } We quantify state uncertainty following \citet{blei2017variational}: 
\[
\text{gamma\_var} = \text{Var}(\gamma, \text{across MC samples})
\]
This metric reflects the variance of inferred state probabilities $\gamma$ across Monte Carlo (MC) samples. Higher variance indicates greater disagreement in state assignments, suggesting increased uncertainty in state estimation.

\paragraph{Credible Interval Width (Bayesian Uncertainty): } This Bayesian uncertainty measure \cite{gelman1995bayesian} provides interval-based state probability estimates $U-L$, where \( U, L \) are the 97.5th and 2.5th percentile bounds of the posterior distribution. A wider interval indicates higher uncertainty in state estimates.

Incorporating gamma\_var and credible interval uncertainty estimates to exclude uncertain detected states reduced the Hamming distance by an average of 2-3\%.

\paragraph{Uncertainty via Perturbation (Robustness Test): }To assess the model’s robustness to input noise, we introduce Gaussian perturbations to input features $X$ and compute the variance in inferred states:
\[
\text{perturbation\_variance} = \text{Var}(\gamma_{\text{perturbed}}, \text{dim}=0)
\]
Higher variance indicates that state assignments are sensitive to small input changes, suggesting reduced robustness.

\paragraph{Feature Dropout Robustness (Effect of Missing Features): } Following \citet{dolezal2022uncertainty}, we evaluate robustness by randomly setting features to zero (dropout) and measuring the variance in inferred states:
\[
\text{feature\_dropout\_variance} = \text{Var}(\gamma_{\text{dropout}}, \text{dim}=0)
\]

A high variance suggests that the model strongly depends on specific features, making it more sensitive to missing data.

Both dropout sensitivity (\(\approx 0.003\)) and noise sensitivity (\(\approx 0.004\)) are relatively small, indicating that \hdpflow\ is fairly robust to input perturbations. As expected, the perturbation sensitivity for the Bump data, when the model is trained on Crohn's data, is higher, with dropout sensitivity (\(\approx 0.005\)) and noise sensitivity (\(\approx 0.007\)). Without sleep features, both sensitivities increased to 0.01. 
% \begin{table*}
%     \centering
%     \begin{tabular}{lll}
%         \toprule
%         \textbf{Uncertainty Measure} & \textbf{Type} & \textbf{Interpretation} \\
%         \midrule
%         State Uncertainty (Gamma Variance) & Epistemic & Measures disagreement across MC samples. \\
%         Aleatoric Uncertainty (Log-Likelihood Variance) & Aleatoric & Measures uncertainty due to inherent data noise. \\
%         State Entropy & Epistemic & Higher entropy means more uncertainty in state assignment. \\
%         Predictive Uncertainty (KL Divergence) & Epistemic & Measures temporal instability in predictions. \\
%         Credible Interval Width (Bayesian Uncertainty) & Bayesian & Wider intervals indicate greater uncertainty. \\
%         Uncertainty via Perturbation & Robustness & Higher variance means model is sensitive to input noise. \\
%         Feature Dropout Robustness & Robustness & Measures sensitivity to missing feature data. \\
%         \bottomrule
%     \end{tabular}
%     \caption{Summary of uncertainty measures used in state estimation.}
% \end{table*}

\section{Beta Bayes Factor Analysis}
\label{subsection: bayes factor}
To assess whether two independent samples originate from the same Beta distribution, we approximate the Bayes Factor (BF) \cite{kass1995bayes} using the Bayesian Information Criterion (BIC) \cite{neath2012bayesian}. This approach efficiently estimates distributional differences by balancing model complexity and goodness of fit. Unlike the commonly used Gaussian distribution, we adopt the Beta distribution as it better captures the bounded nature of subjective symptom responses and accommodates a wide range of distribution shapes. This is particularly beneficial when sample sizes are limited and the assumptions of the Central Limit Theorem may not hold. Given two independent samples, $X_1$ and $X_2$ (normalized by the maximum value of questionnaire responses), we define the Null hypothesis $H_0$ as both samples being drawn from a single Beta distribution.

To test $H_0$ against the alternative hypothesis $H_1$ that the two samples are drawn from distinct Beta distributions, we fit Beta distributions to the combined dataset, yielding parameters $(\alpha_{\text{comb}}, \beta_{\text{comb}})$, and to each sample separately, yielding parameters $(\alpha_1, \beta_1)$ and $(\alpha_2, \beta_2)$. The fitting procedure is performed using maximum likelihood estimation (MLE), constrained to the interval $[0,1]$.

The log-likelihood of a Beta-distributed sample $X$ with shape parameters $\alpha$ and $\beta$ is given by:
\begin{equation}
    \log P(X | \alpha, \beta) = \sum_{i} \log \text{Beta}(X_i | \alpha, \beta),
\end{equation}
where $\text{Beta}(X_i |\alpha, \beta)$ denotes the probability density function (PDF) of the Beta distribution. We compute $\log L_{\text{comb}}$, the log-likelihood of the combined dataset, and $\log L_{\text{sep}}$, the sum of the log-likelihoods of the two separately fitted distributions.

To balance model complexity and fit, we compute the BIC for each model using:
\begin{equation}
    BIC = k \log(n) - 2 \log L,
\end{equation}
where $k$ is the number of parameters in the model, $n$ is the sample size, and $\log L$ is the log-likelihood of the fitted model. The BIC values are computed as:
\begin{equation}
    BIC_{\text{comb}} = 2 \log(|X_1| + |X_2|) - 2 \log L_{\text{comb}},
\end{equation}
\begin{equation}
    BIC_{\text{sep}} = 4 \log(|X_1| + |X_2|) - 2 \log L_{\text{sep}}.
\end{equation}

Since direct computation of the Bayes Factor requires marginal likelihood estimation, which is computationally expensive, we approximate it using the difference in BIC values:
\begin{equation}
    BF \approx e^{(BIC_{\text{comb}} - BIC_{\text{sep}}) / 2}.
\end{equation}

The computed Bayes Factor provides a quantitative measure of evidence for model selection:
\begin{itemize}
    \item \textbf{If} $BF > 10$: Strong evidence \textbf{against} $H_0$, suggesting that $X_1$ and $X_2$ are drawn from distinct Beta distributions.
    \item \textbf{If} $BF \leq 10$: Insufficient evidence to reject $H_0$.
\end{itemize}

This implementation enables efficient hypothesis testing while preserving the interpretability of Bayesian model selection.

\section{Supplementary Results}\label{app:supp_results}
\begin{table}[!ht]
% \scriptsize
\footnotesize
    \centering
    \begin{tabular}{lcc}
        &\multicolumn{2}{c}{HAR 70+}\\
        \toprule
         & Hamming  & NLL\\
        \midrule
        HDP-Flow & \textbf{0.28$\pm$0.06} & \textbf{5219.0$\pm$1106.6}\\
        DS-HDP  &\longdash  & \longdash \\
        S-HDP  &\longdash  & \longdash\\
        \midrule
        RNN &0.56 $\pm$ 0.14 &N/A\\
        HMM-Flow &\textbf{0.28$\pm$0.07} & 40121.6$\pm$4341.1\\
        \midrule
        RNN Sup. &0.27 $\pm$ 0.08&N/A
    \end{tabular}
    \caption{Performance on real-world datasets, measured by the Hamming distance and the posterior predictive likelihood. Standard deviations are reported across samples, and the best results with statistical significance are highlighted.}
    \label{tab:real_results}
\end{table}

\subsection{Bump and Crohn's cross cohort analysis}
\label{app_subsection: bump_crohns_analysis}
Figure \ref{fig:Input_zscore} illustrates the consistency in wearable signal distributions and their correlation with states across the Bump and Crohn's datasets. While the z-score correlation of Awake Time remains nearly identical between the two datasets, relative differences emerge in specific correlations: Body Temperature varies in states 0, 2, 3, 4, and 7, HRV differs in states 4 and 5, and Heart Rate shows variation in states 3 and 9. These features are particularly important as they exhibit distinct behavior in pregnant women, a pattern that is reflected in the Bump data in this figure. As a result, HDP-Flow serves as a powerful framework for tracking and interpreting wearable data across cohorts, identifying new states or shifts in state distribution, and detecting significant changes in physiological signals.




\section{Supplementary figures}\label{app:figures}
This section provides visualizations similar to the ones presented in the paper for all datasets. Figure~\ref{fig:appendix_ece} shows the reliability plot for Simulated dataset III. Compared to Simulated datasets I and II (Figure~\ref{fig:ECE_plot}), this dataset is more complex; however, calibration (ECE = 0.2328) remains reasonable, though some confidence bins show larger deviations from perfect calibration. 


\begin{figure}[!ht]
    \centering    \includegraphics[width=0.6\linewidth]{figures/bar_plot_simIII.jpg}
    \caption{Reliability Plot of Simulated data III}
    \label{fig:appendix_ece}
\end{figure}

In the rest of the figures, the first row presents the ground truth underlying state for a test sample (left), and distribution of states in the training data (middle). Each subsequent row presents corresponding results from a different model (\hdpflow\ and baselines). The left column shows the inferred state sequences for a test sample, indicated by the background color. The middle column shows each model's estimated global state distribution. The right column depicts samples generated by the BNP models, with states as background colors and state duration reflecting their estimated probabilities. All colors are matched for each state.

\begin{figure*}[hbt!]
    \centering
    \includegraphics[width=\textwidth]{figures/sim_easy.pdf}
    \caption{Ground Truth vs. BNP models inference on \textbf{Simulated Data I}.}
    \label{fig:sim1_states}
\end{figure*}






\begin{figure*}
    \centering    \includegraphics[width=0.9\textwidth]{figures/figures_bump/Inputs_zscore.jpg}
    \caption{Z-scored mean values of physiological features across predicted states in both the Crohn's and Bump datasets.}
    \label{fig:Input_zscore}
\end{figure*}



\begin{figure*}[hbt!]
    \centering
    \includegraphics[width=\textwidth]{figures/sim_semi.pdf}
    \caption{Ground Truth vs. BNP models inference on \textbf{Simulated Data II}. }
    \label{fig:sim2_states}
\end{figure*}


\begin{figure*}[hbt!]
    \centering
    \includegraphics[width=\textwidth]{figures/sim_hard.pdf}
    \caption{Ground Truth vs. BNP models inference on \textbf{Simulated data III}.}
    \label{fig:sim3_states}
\end{figure*}


\begin{figure*}[hbt!]
    \centering
    \includegraphics[width=\textwidth]{figures/cpap.pdf}
    \caption{Ground Truth vs. BNP models inference on \textbf{CPAP} dataset.}
    \label{fig:cpap_states}
\end{figure*}


\begin{figure*}[hbt!]
    \centering
    \includegraphics[width=\textwidth]{figures/har.pdf}
    \caption{Ground Truth vs. BNP models inference on \textbf{HAR} dataset. }
    \label{fig:har_states}
\end{figure*}


% \subsubsection{HAR70+}
\begin{figure*}[hbt!]
    \centering
    \includegraphics[width=\textwidth]{figures/har70.pdf}
    \caption{Ground Truth vs. BNP models inference on \textbf{HAR70} dataset. }
    \label{fig:har70_states}
\end{figure*}


\begin{figure}[hbt!]
\centering
\begin{minipage}{.49\textwidth}
  \includegraphics[width=\linewidth]{figures/pi_sim1.pdf} 
  \caption{Transition probabilities learned for the \textbf{Simulated dataset I} by \hdpflow. The states are ordered from high to low probability determined by $\beta$. }
  \label{fig:pi_sim1}
\end{minipage}\hfill % This will add a small space between the two minipages
\begin{minipage}{.49\textwidth} % Reduced to .49 to allow for some spacing
  % \centering (Can be uncommented if you want to center inside the minipage)
  \includegraphics[width=\linewidth]{figures/pi_sim2.pdf}
  \caption{Transition probabilities learned for the \textbf{Simulated dataset II} by \hdpflow. The states are ordered from high to low probability determined by $\beta$. }
  \label{fig:gen_sample_sim2}
\end{minipage}
\end{figure}

\begin{figure}[hbt!]
\centering
\begin{minipage}{.49\textwidth}
  \includegraphics[width=\linewidth]{figures/pi_sim3.pdf} 
  \caption{Transition probabilities learned for the \textbf{Simulated dataset III} by \hdpflow. The states are ordered from high to low probability determined by $\beta$. }
  \label{fig:pi_sim3}
\end{minipage}\hfill % This will add a small space between the two minipages
\begin{minipage}{.49\textwidth} % Reduced to .49 to allow for some spacing
  % \centering (Can be uncommented if you want to center inside the minipage)
  \includegraphics[width=\linewidth]{figures/pi_har.pdf}
  \caption{Transition probabilities learned for the \textbf{HAR} dataset by \hdpflow. The states are ordered from high to low probability determined by $\beta$. }
  \label{fig:pi_har}
\end{minipage}
\end{figure}

\clearpage