\section{\hdpflow}\label{sec:proposed_gen_model}

% \hdpflow\ learns the underlying state of a time series 
%By integrating the flexibility of nonparametric HMMs with the expressive capabilities of flow-based emission models, \hdpflow\ effectively captures complex and evolving states.  
%By combining the flexibility of nonparametric HMMs with the expressive power of flow-based emissions, \hdpflow\ can effectively model complex, evolving states. 

% prevalent, posing a significant challenge to most existing models.



\subsection{\hdpflow\ generative process}

The generative process of \hdpflow\ is illustrated in Figure~\ref{fig:graphical_model}. This graphical model shows a time series sample $X^{(i)} \in \mathbb{R}^{T \times D}$ with $i \in \{1, \dots, N\}$, where $N$ is the total number of samples. Each ${\bf x}_t$ represents a $D$-dimensional observation at time $t \in \{1, \dots, T\}$, where the total number of time steps $T$ depends on the sample.\footnote{For clarity, we omit the sample index $i$ throughout the remainder of the paper.}

\begin{figure}
    \centering
    \includegraphics[width=0.9\linewidth]{figures/HDPFlow.png}
    \caption{Description of \hdpflow\ generative process. The variables $\{{\bf x}_t, \forall t\in[1,\ldots, T]\}$ are the observations and the local latent variables $\{z_t, \forall t\in[1,\ldots, T]\}$ are the underlying state of the sample at all $t$. The global latent variables $\{ \beta, \pi_k, \kappa_k, \theta_k \}$ characterize the global properties of the states, where $k$ is the index for infinite states. The parameters $\{\gamma, \alpha, \rho, \lambda\}$ of the priors are visualized with dark boxes, and $d_t$ is the deterministic variable measuring the number of steps $z_t$ has persisted.}
    \label{fig:graphical_model}
\end{figure}


% \begin{figure*}
% \begin{minipage}{.57\textwidth}
%   \centering    \includegraphics[width=\textwidth]{figures/HDPFlow.png}
%     % \captionof{figure}{\hdpflow\ graphical model describing the generative process of sample $X$. The global latent variables $\{ \beta, \pi_k, \kappa_k, \theta_k \}$ characterize the global properties of the states. The local latent variables $Z$ determine the underlying state of the sample at all $t$. The observations ${\bf x}_t$ are shaded and the prior parameters $\{\gamma, \alpha, \rho, \lambda\}$ are visualized with dark boxes. $d_t$ is the deterministic variable measuring the number of steps $z_t$ has persisted.}
% \end{minipage}
% \begin{minipage}{.4\textwidth}
% \begin{align} \label{eq:gen_start}
% \beta &\sim GEM(\gamma) \\
% \theta_k  &\sim H_{\lambda},\\
% \kappa_k &\sim Beta(\rho_1, \rho_2)\\
%  \pi_k &\sim DP(\alpha+\kappa_k, \frac{\alpha\beta+\kappa_k\delta_k}{\alpha+\kappa_k})\\
% z_t &\sim \pi_{z_{t-1}}\\
% d_t &= \begin{cases}
%         d_{t-1}+1 & \text{if } z_t=z_{t-1}\\
%         1 & \text{else}.
%     \end{cases}\\
% {\bf x}_{t} &\sim p({\bf x}|z_t, d_t)\label{eq:gen_end}
% \end{align} 
% \end{minipage}
% \caption{Description of \hdpflow\ generative process. The variables $\{{\bf x}_t, \forall t\in[1,\ldots, T]\}$ are the observations and the local latent variables $\{z_t, \forall t\in[1,\ldots, T]\}$ are the underlying state of the sample at all $t$. The global latent variables $\{ \beta, \pi_k, \kappa_k, \theta_k \}$ characterize the global properties of the states, where $k$ is the index for infinite states. The parameters $\{\gamma, \alpha, \rho, \lambda\}$ of the priors are visualized with dark boxes, and $d_t$ is the deterministic variable measuring the number of steps $z_t$ has persisted.}
% \label{fig:graphical_model}
% \end{figure*}


% \paragraph{Sequential process:} 
This latent variable model consists of (1) \emph{local latent variables} $Z=\{z_t, \forall t\in[1,\ldots, T]\}$ that indicate the underlying state of a sample at each time step $t$, and (2) a set of \emph{global latent variables} $\{ \beta, \pi_k, \kappa_k, \theta_k  \}$ that represent the characteristics of the states, with $k=1,2,\dots$ indexing the potentially infinite number of states. The variable $\beta$ represents the overall density of each state $k$ and $\pi_k$ is the vector of transition probabilities from state $k$ to all of the (potentially infinitely many) states. The self-transition parameter $\kappa_k$ is the added weight to transition probabilities that prevents unrealistically fast transitions among states. Similar to \citet{fox2011sticky}, this parameter is sampled from a distribution $Beta(\rho_1, \rho_2)$, but instead of using the same sticky parameter for all states, \hdpflow\ samples the parameter $\kappa_k$ independently for each state. This gives \hdpflow\ the flexibility to model different self-transition behaviors across different states. The parameters $\theta_k$ determine the distribution of observations in state $k$ and are sampled from the base distribution $H_\lambda$ of the top-level DP. The variable $d_t$ is a deterministic function of the latent states which measures how many consecutive steps the sample has been in state $z_t$. The distribution of observations $p({\bf x}|z_t, d_t)$ at any time $t$ is conditioned on $d_t$ as well as the underlying state, which enables modelling non-stationarity. The generative process is summarized in Equations~\ref{eq:gen_start} to~\ref{eq:gen_end}. 

\begin{align} \label{eq:gen_start}
\beta &\sim GEM(\gamma) \\
\theta_k  &\sim H_{\lambda}\\
\kappa_k &\sim Beta(\rho_1, \rho_2)\\
 \pi_k &\sim DP\left(\alpha+\kappa_k, \frac{\alpha\beta+\kappa_k\delta_k}{\alpha+\kappa_k}\right)\\
z_t &\sim \pi_{z_{t-1}}\\
d_t &= \begin{cases}
        d_{t-1}+1 & \text{if } z_t=z_{t-1}\\
        1 & \text{else}
    \end{cases}\\
{\bf x}_{t} &\sim p({\bf x}|z_t, d_t).\label{eq:gen_end}
\end{align} 


% The goal of introducing $d_t$ is to break the Markov assumption to be able to model non-stationary states. 
% This modelling approach is reminiscent of the BNP hidden semi-Markov model of \citet{johnson2013bayesian} that changes the Markov assumption by introducing explicit duration states. 
% While they learn the distribution of states by drawing the random variable of duration from a prior distribution, 
% To break the Markov assumption, \hdpflow\ however, uses the state duration indicator to model complex emissions rather than non-Markovian state transitions. 

% \paragraph{Distribution of observations:} 
The joint likelihood of \hdpflow\ variables is shown in Equation \ref{eq:joint}. Here, the {\emph{hyperparameters}} $\Theta=\{\alpha, \gamma, \lambda, \rho\}$ are the parameters of the priors over the latent variables (dark squares in Figure \ref{fig:graphical_model}).
\begin{equation} \label{eq:joint}
\begin{split}
p(X, Z, \theta, \pi, \beta, &\kappa) =  p(\beta|\gamma) \prod_{k=1}^{\infty}  p(\pi_k|\beta, \alpha, \kappa_k) p(\theta_k|\lambda) \\ 
& \times p(\kappa_k|\rho)\prod_{t=1}^T p(z_t|z_{t-1}, \pi)p({\bf x}_t|z_t, d_t).
\end{split}
\end{equation}
What distinguishes \hdpflow\ is its approach to estimating the data distribution at any time step $t$.
%What sets \hdpflow\ apart the most is how it estimates the data distribution at any time step $t$. 
It incorporates conditional masked autoregressive flows (MAFs) \citep{papamakarios2017masked} into the nonparametric process. When extending MAF to the \hdpflow\ setting, each dimension $j \in [D]$ of a sample at time $t$ is modeled conditionally on the preceding dimensions, i.e., $p(x_{j,t}|{\bf x}_{1:j-1,t})$. MAFs transform a standard Gaussian into a Normal distribution $\mathcal{N}(\mu_j, \sigma^2_j)$ to model the conditional as $p(x_{j,t}|{\bf x}_{1:j-1,t}) = \mathcal{N}(\mu_j, \sigma^2_j)$.
The parameters $\mu_j$ and $\sigma_j$ are functions of the preceding dimensions ${\bf x}_{1:j-1,t}$ and are estimated by neural networks $f_{\mu}$ and $f_{\sigma}$. By chaining these conditionals, MAF can model arbitrarily complex data distributions $p(\textbf{x}_t)$ as a product of the Gaussian conditionals.

\hdpflow\ uses state-specific MAF functions $f_{\mu(\theta_{z_t})}$ and $f_{\sigma(\theta_{z_t})}$ where the parameters of the transformation functions are determined by the state $z_t$ at time $t$. In essence, when $z_t=k$, the function parameters are set to $\theta_k \sim H_\lambda$. 
This differs from traditional HDP-HMMs where $\theta_k$ directly models the data distribution.
To capture non-stationarity, \hdpflow\ additionally conditions the observation distribution $p(\textbf{x}_t|z_t, d_t)$ on $d_t$. This is achieved by incorporating $d_t$ as an input to the mapping transforms within the conditional MAFs as:
\begin{equation}
\begin{split} \label{eq:nf_cond}
x_{j,t} \sim \mathcal{N}(\mu_j, \sigma^2_j) , &   \quad  \mu_j = f_{\mu(\theta_{z_t})}(x_{1:j-1,t}, g(d_t))\\
 &  \quad  \sigma_j = f_{\sigma(\theta_{z_t})}(x_{1:j-1, t}, g(d_t)).      
\end{split}
\end{equation}
To model various types of non-stationarities, \hdpflow\ applies a non-linear function $g(\cdot)$ to the duration variable $d_t$. Standard activation functions like ReLU can capture trends within states. However, for datasets with periodic patterns (like ECG or EEG), a specialized activation function is needed. \hdpflow\ incorporates the activation function $g(x) = \sin^2(x)$ \citep{ziyin2020neural} for its periodic inductive bias, enabling it to model cyclical patterns within states.


\subsection{Variational Inference for \hdpflow}

A key algorithmic challenge for \hdpflow\ is performing approximate inference, i.e. estimating the posterior over the global and local variables given observations ${\bf X}_{train}$.
% This task becomes particularly demanding for large datasets and models with complex dependencies like \hdpflow.
%This task is particularly difficult for large datasets and models with complex dependencies such as ours. 
On large datasets of long time series, most existing sampling-based inference algorithms struggle due to their repeated reliance on the memory-intensive forward-backward (FB) algorithm.

To address this limitation, \hdpflow\ employs stochastic variational inference (SVI) for scalable posterior approximation. While closed-form SVI exists for HDP-HMMs \citep{zhang2016stochastic}, it still relies on FB estimation, creating a bottleneck for long sequences. Moreover, \hdpflow's exact posterior is heavily conditioned, making closed-form approximations difficult.
We instead adopt black-box variational inference (BBVI) \citep{ranganath2014black} with a mean-field assumption, extending it to \hdpflow's hierarchical and temporal setting. 
% Using a Rao-Blackwellized estimator \citep{casella1996rao}, BBVI efficiently computes noisy gradients of the ELBO with Monte Carlo samples from the variational posterior. 

\hdpflow's mean-field factorized variational posterior over all global and local latent variables $W = \{Z, \theta, \beta, \pi, \kappa\}$ is shown in Equation~\ref{eq:variational_posterior}. The variational posterior of each variable is modeled independently with a family of distributions similar to the prior. The infinite number of states is truncated to a large value $K$ for the posteriors. Note that the truncation is only for the variational approximation and not the generative model. The set $\Theta^*$ contains all variational parameters of the factorized distributions: (1) the Dirichlet concentration parameters for $q(\beta)$ and $q(\pi_k)$, (2) the probabilities of the categorical distribution of the states $q(z_t)$, (3) the mean and variance for $q(\theta_k)$, and (4) the concentration parameters of the Beta distribution of $q(\kappa_k)$. 

\begin{equation}
    \begin{split}
        q(Z, \theta, \beta, \pi, \kappa|\Theta^*) &= 
\underbrace{q(\beta)}_{\text{Dirichlet}}
\prod_{t=1}^{T} \underbrace{q(z_t)}_{\text{Categorical}} \prod_{k=1}^{K} \underbrace{q(\theta_k)}_{\text{Gaussian}} \\
&\quad \times \prod_{k=1}^{K} \underbrace{q(\pi_k)}_{\text{Dirichlet}}  \prod_{k=1}^{K} \underbrace{q(\kappa_k)}_{\text{Beta}}.
    \end{split}
    \label{eq:variational_posterior}
\end{equation}

We dynamically update the $\kappa$ parameters during training to enhance performance. The parameters are scaled by a factor of $1 + 0.1 \cdot \text{epoch}$, allowing the model to adapt over time. 
% Additionally, a small bias term is included to ensure numerical stability during training. 
This approach enables the model to start with broader state representations and refine them progressively, balancing exploration and stability in non-stationary environments.

The VI objective is to find the variational parameters $\Theta^*$ such that the posterior $q(W)$ closely approximates the true posterior $p(W|{\bf X}_{train})$ by maximizing the evidence lower bound (ELBO), $\E_{q(W)}[\log p({\bf X}_{train}, W)- \log q(W)]$. 

Algorithm~\ref{alg:bvi-hdp-flow} details the SVI approach we use for \hdpflow. We compute noisy gradients of the ELBO with the Rao-Blackwellized estimator \citep{casella1996rao}, using Monte Carlo samples $W[s] \sim q(W)$ for $s \in [1, \dots, S]$. The gradient ${\hat{\nabla}}_{\theta^*_i}\mathcal{L}$ with respect to each parameter $\theta^*_i \in \Theta^*$ of the variational posteriors is a function of all components of the log joint (Eq.~\ref{eq:joint}) that include terms from the $i^{\text{th}}$ factor. See Equation~\ref{eq:gradients} in Appendix~\ref{app:gradients} for derivations of the gradients based on the hierarchical structure of the global and local variables. To further enhance estimation, we incorporate control variates to reduce variance \citep{ranganath2014black}. Additionally, we employ an adaptive per-component learning rate during parameter updates. This detail is crucial for \hdpflow\ as the parameterizations of its probability distributions have varying scales.

\begin{algorithm}[hbt!]
\caption{Stochastic BBVI for HDP-Flow}
\label{alg:bvi-hdp-flow}
\begin{algorithmic}
\STATE {\bfseries Input:} ${\bf{X}}_{train}$, $p(x,W)$ (Eq. \ref{eq:joint}), $q(W)$ (Eq. \ref{eq:variational_posterior})\\
% \STATE \textbf{Initialize} $\Theta^*$ randomly
\REPEAT
    \FORALL{$X^{(i)} \in {\bf{X}}_{train}$}
    \FOR{$s=1$  {\bfseries to} $S$ }
    \STATE $W[s] \sim q(Z, \theta, \beta, \pi, \kappa|\Theta^*)$
    \ENDFOR
        
        \FORALL{$\theta^* \in \Theta^*$}
            \STATE Estimate control variate $a_{\theta^*}$  \citep{ranganath2014black};
            \STATE $\nabla_{\theta^*} \mathcal{L}  = \frac{1}{S}\sum_{s=1}^{S} \nabla_{\theta^*} \log q (W[s]) 
            \times (\log p(x, W[s]) - \log q (W[s]) - a_{\theta^*})$ (Eq.~\ref{eq:gradients});
            \STATE $\eta_{\theta^*} \gets$ Update adaptive learning rate 
            \STATE $\theta^* \gets \theta^* + \eta_{\theta^*}\nabla_{\theta^*}\mathcal{L}$ 
        \ENDFOR
    % \STATE $\Theta^* \gets \Theta^*$ + $\rho\nabla\Theta^*$ 
\ENDFOR
\UNTIL{$\lVert\nabla_{\Theta^*}\mathcal{L}\rVert \leq \epsilon$}
% {\normalfont{ELBO}$\frac{1}{S}\sum_i\sum_s\E_{q(W_s)}[\log p(X^i, W_s)- \log q(W_s)]$ }
% \UNTIL {\normalfont{change of }$\Theta^*$ is less than 0.01}
\end{algorithmic}
\end{algorithm}
% Approximating the true posterior of our model is very challenging due to all the different dependencies between the variables. 

% where $\Theta^*_{\beta}, \Theta^*_{\theta}, \Theta^*_{\pi}, \Theta^*_{\kappa}, \Theta^*_{Z}$ are the parameters of $q(\beta), q(\theta), q(\pi), q(\kappa), q(Z)$, respectively. Note that these estimations take into account the unique sequential and hierarchical dependencies of \hdpflow\ distribution. 
% % Visit Appendix \ref{app:gradients} for more detail on the gradient derivations.  
% \begin{equation}
%  \begin{split} 
%     \label{eq:gradients}
%     {\hat{\nabla}}_{\Theta^*_{\beta}}\mathcal{L} = & \frac{1}{S} \sum_{s=1}^{S} \nabla \log q(\beta_s) \Big(\log p(\beta_s) 
%     + \\ & \sum_{k=1}^{L} \log p(\pi_{k,s}|\beta_s) - \log q(\beta_s)\Big)\\    
%     {\hat{\nabla}}_{\Theta^*_{\theta_k}}\mathcal{L} = & \frac{1}{S} \sum_{s=1}^{S} \nabla \log q(\theta_{k,s})\Big(\log p(\theta_{k,s}) + \\ & \sum_{t=1}^{T}\log p(x_t|z_{t,s},\theta_{s})\delta_{(z_{t,s}=k)} - \log q(\theta_{k,s})\Big) \\
%     {\hat{\nabla}}_{\Theta^*_{\pi_k}}\mathcal{L} = & \frac{1}{S} \sum_{s=1}^{S} \nabla \log q(\pi_{k,s})\Big(\log p(\pi_{k,s}) + \\ & \sum_{t=1}^{T}\log p(z_{t,s}|\theta_s, \pi_s, \kappa_s)\delta_{(z_{t-1}=k)} - \log q(\pi_{k,s})\Big) \\
%     {\hat{\nabla}}_{\Theta^*_{\kappa_k}}\mathcal{L} = & \frac{1}{S} \sum_{s=1}^{S} \nabla \log q(\kappa_{k,s})\Big(\log p(\kappa_{k,s}) + \\ & \sum_{k=1}^{L} \log p(\pi_{k,s}|\beta_s)
%     - \log q(\kappa_{k,s})\Big)\\
%     {\hat{\nabla}}_{\Theta^*_{z_t}}\mathcal{L} = & \frac{1}{S} \sum_{s=1}^{S} \nabla \log q(z_{t,s})\Big(\log p(z_{t,s}|\pi_s, \kappa_s) +\\ & \log p(x_t|z_{t,s}, \theta_s) - \log q(z_{t,s}) \Big).
% \end{split}  
% \end{equation}


To compute the log joint probabilities $p(\beta|\gamma)$ and $p(\pi_k|\alpha, \beta, \kappa_k)$ under the infinite states of the non-parametric priors, we employ a degree-$L$ weak-limit approximation. This technique expresses a DP as the limit of finite-dimensional Dirichlet distributions as the dimensions tend to infinity \citep{ishwaran2002exact, teh2006hierarchical}. Using the weak-limit theorem, we impose a finite Dirichlet prior over the variables $\beta$ and $\pi_k$ as shown in Equation~\ref{eq:weak_limit}. Importantly, the approximation order $L$ can differ from, and be significantly larger than, the posterior truncation value $K$.
% Calculating the gradient terms ${\hat{\nabla}}_{\theta^*}\mathcal{L}$ requires estimating the likelihood of the Monte-Carlo samples $W[s]$ under the variational posterior (Eq. \ref{eq:variational_posterior}) and the joint distribution (Eq. \ref{eq:joint}). 
% We consider a degree $L$ weak-limit approximation for approximating likelihood under the infinite states of the non-parametric priors for estimating ${\hat{\nabla}}_{\theta^*_{\beta}}\mathcal{L}$ and ${\hat{\nabla}}_{\theta^*_{\pi_k}}\mathcal{L}$. A Dirichlet process model can be derived as the limit of a sequence of finite dimension Dirichlet distributions, where the number of dimensions is taken to infinity \citep{ishwaran2002exact, teh2006hierarchical}. Using the weak-limit theorem, we induce a finite Dirichlet prior over the variables $\beta$ and $\pi_k$ as shown in Equation \ref{eq:weak_limit}. Note that $L$ doesn't need to match the truncation value of the posterior $K$, and can be significantly larger.
% To estimate the likelihood under the prior, instead of integrating over the infinite states, we can use a finite approximation of the non-parametric priors $\log p(x, W[s])$. A Dirichlet process model can be derived as the limit of a sequence of finite dimension Dirichlet distributions, where the number of dimensions is taken to infinity \citep{ishwaran2002exact, teh2006hierarchical}.
% We can consider a degree $L$ weak-limit approximation to the DP \citep{ishwaran2002exact} to induce a finite Dirichlet prior over the variables $\beta$ and $\pi_k$ as shown in Equation \ref{eq:weak_limit}. Note that this $L$ doesn't need to match the truncation value used in the posterior approximation, and can be set to a significantly larger value.
\begin{equation}
\label{eq:weak_limit}
\begin{split}
    p(\beta|\gamma) & \approx Dir(\gamma/L, \dots , \gamma/L)\\
    p(\pi_k | \beta, \alpha, \kappa_k) &\approx Dir(\alpha \beta_1, \dots , \alpha \beta_k + \kappa_k, \dots , \alpha \beta_L).    
\end{split}
\end{equation}
Finally, the normalizing flow (NF) allows us to compute the log likelihood of observations $\log p({\bf x}_t|z_t, d_t)$ in closed form,
\begin{equation}
\label{eq:nf_logprob}
\begin{split}
    \log p({\bf x}_t|z_t, d_t) &= \log p_{\mathbf{u}}(f_{\theta_{z_t}}({\bf x}_t, d_t)) - \log \left\vert \det \left(\frac{\partial \mathbf{x}_t}{\partial \mathbf{u}_t} \right) \right\vert\\
    &= \log p_{\mathbf{u}}(f_{\theta_{z_t}}({\bf x}_t, d_t)) - \sum_{j=1}^{D} \log\sigma_{j,t}.    
\end{split}
\end{equation}
\paragraph{Posterior predictive estimation}
The posterior predictive distribution helps assess generalization. It is the distribution of new, unseen samples ${\bf{X}}_{test}$ given the data we have already seen, ${\bf{X}}_{train}$. We can estimate the likelihood of a new sample $\tilde {X} \in {\bf{X}}_{test}$ by integrating over the learned posterior of the global variables as shown in Equation~\ref{eq:post_pred}:


\begin{equation}
\label{eq:post_pred}
\begin{split}
    p(\tilde {X}|{\bf{X}}_{train}) & = \int_{\beta, \pi, \kappa, \theta} p(\tilde {X}| \beta, \pi, \kappa, \theta) p(\beta, \pi, \kappa, \theta|{\bf{X}}_{train})\\
    & \approx \int_{\beta, \pi, \kappa, \theta} p(\tilde {X}|\beta, \pi, \kappa, \theta) q(\beta, \pi, \kappa, \theta)  \\
    & = \E_{\beta, \pi, \kappa, \theta \sim q} p(\tilde {X}|\beta, \pi, \kappa, \theta).
\end{split}
\end{equation}
Knowing the global structure of the generative process of \hdpflow\ also enables us to estimate the most likely underlying states $\tilde {Z}$ for a newly observed time series sample $\tilde {X} \in {\bf{X}}_{test}$ using Equation~\ref{eq:post_state}.
\begin{equation}
\label{eq:post_state}
\begin{split}
    p(\tilde {Z}|\tilde {X}, {\bf{X}}_{train}) & = \int_{\beta, \pi, \kappa, \theta} p(\tilde {Z}|\tilde {X}, \beta, \pi, \kappa, \theta)\\ 
    & \times p(\beta, \pi, \kappa, \theta|{\bf{X}}_{train})\\
    % & \approx \int_{\beta, \pi, \kappa, \theta} p(\tilde {Z}|\tilde {X}, \beta, \pi, \kappa, \theta) q(\beta, \pi, \kappa, \theta)  \\
    & \approx \E_{\beta, \pi, \kappa, \theta \sim q} p(\tilde {Z}|\tilde {X}, \beta, \pi, \kappa, \theta).
\end{split}
\end{equation}
To evaluate Equations~\ref{eq:post_pred} and~\ref{eq:post_state}, we employ the FB algorithm. The $k^{\text{th}}$ forward message of the FB algorithm ${\bf{f}}_t(k)$ at time $t$ measures $p(\tilde{X}_{1:t}, z_t=k|\beta, \pi, \kappa, \theta)$. Hence, we can estimate the likelihood of a series of observations $p(\tilde {X}|\beta, \pi, \kappa, \theta)$ by marginalizing the forward messages over the states at the last time step, and find the most likely sequence of underlying states for $\tilde {X}$ using the Viterbi algorithm (details in Appendix~\ref{app:fb_algo}). 

\hdpflow's state- and duration-dependent observations $p({\bf x}_t|z_t, d_t)$ require us to modify the traditional FB algorithm. We must explicitly account for the probability of a state transition or persistence at each time step. This modification is expressed in Equation \ref{eq:fb_hdpflow} and implemented efficiently using matrix calculations.
\begin{equation}
\label{eq:fb_hdpflow}
\begin{split}
    {\bf{f}}_t(z_t) &= \sum_{z_{t-1} \neq z_t}{\bf{f}}_{t-1}(z_{t-1})  p(z_t|z_{t-1}) p({\bf x}_t|z_t, d=1) \\
    & \quad + {\bf{f}}_{t-1}(z_t) p(z_t|z_t) p({\bf x}_t|z_t, d=d_{t-1}+1).
\end{split}
\end{equation}
%During inference, we approximate the standard FB algorithm to accommodate \hdpflow's unique state dynamics. At each time step, we select the state with the highest likelihood based on the forward messages to update the duration variable $d$. Importantly, this modification applies only during inference and does not impact model training. Additionally, this approximation is necessary for batch inference; standard FB algorithm applies in streaming settings.
During inference, we approximate the FB algorithm to accommodate \hdpflow's state dynamics. At each step, the duration variable $d$ is updated according to the state with the highest likelihood under the forward messages. This approximation, which is necessary for batch inference, does not affect training, while the standard FB algorithm remains applicable in real-time streaming settings.

% \subsection{Uncertainty in \hdpflow}
% Need for uncertainty
% Types of uncertainty (related work goes into background and specific calculations for HDP-Flow come here)
% How to measure each for HDPFlow