% This is samplepaper.tex, a sample chapter demonstrating the
% LLNCS macro package for Springer Computer Science proceedings;
% Version 2.21 of 2022/01/12
%
\documentclass[runningheads]{llncs}
%
\usepackage[T1]{fontenc}
% T1 fonts will be used to generate the final print and online PDFs,
% so please use T1 fonts in your manuscript whenever possible.
% Other font encodings may result in incorrect characters.
%
\usepackage{graphicx,verbatim}
\usepackage{tabularx}
\usepackage{booktabs}
\usepackage{multirow}
\usepackage{amsmath}
% Used for displaying a sample figure. If possible, figure files should
% be included in EPS format.
%
% If you use the hyperref package, please uncomment the following two lines
% to display URLs in blue roman font according to Springer's eBook style:
%\usepackage{color}
%\renewcommand\UrlFont{\color{blue}\rmfamily}
%\urlstyle{rm}
%
\begin{document}
%
\title{LaST-Diff: Latent Spatiotemporal Diffusion for Temporally Stable Echocardiography Video Segmentation}
%
%\titlerunning{Abbreviated paper title}
% If the paper title is too long for the running head, you can set
% an abbreviated paper title here
%
\author{Xiaojie Chen\inst{1}\orcidID{0009-0007-9090-7982} \and
Fahim Ahmed Zaman\inst{1}\orcidID{0000-0002-0607-847X} \and
Amanda Chang\inst{1} \and
Liya Dai\inst{2}\orcidID{0000-0003-3598-5981} \and
Kan Liu\inst{3}\and
Milan Sonka\inst{1}\orcidID{0000-0002-9613-9968}\and
Xiaodong Wu\inst{1}\orcidID{0000-0003-3617-5091}
}
% index{Chen, Xiaojie}
% index{Zaman, Fahim Ahmed}
% index{Chang, Amanda}
% index{Dai, Liya}
% index{Liu, Kan}
% index{Sonka, Milan}
% index{Wu, Xiaodong}
%
\authorrunning{X. Chen et al.}
% First names are abbreviated in the running head.
% If there are more than two authors, 'et al.' is used.
%
\institute{University of Iowa, Iowa City, IA 52246, USA \and
Lishui Central Hospital, Lishui, Zhejiang 323020, China \and
Washington University in St. Louis, St. Louis, MO 63130, USA\\
\email{xiaojie-chen@uiowa.edu}}
%  
\maketitle              % typeset the header of the contribution
%
\begin{abstract}
Accurate segmentation of cardiac chambers in echocardiography videos is essential for quantitative functional assessment, yet remains challenging due to low image quality, rapid motion, and heavy annotation burden. While recent deep learning methods achieve high frame-wise accuracy, they often neglect temporal dependencies, resulting in temporally unstable predictions. Recent methods leverage temporal information or full 3D modeling to improve temporal coherency, but are often hindered by expensive computational costs or local temporal constraints. To address these limitations, we propose LaST-Diff, a latent diffusion–based framework for efficient and temporally stable echocardiography video segmentation. With a three-stage training strategy, LaST-Diff first learns strong spatial representations from images and then refines temporal coherence at the sequence level through a dedicated temporal branch with adaptive fusion, which enables effective spatiotemporal modeling without the computational burden of full 3D modeling. Experiments on both public and private datasets demonstrate that LaST-Diff achieves competitive static segmentation accuracy, while more faithfully modeling the anatomical trajectory across frames, leading to reduced inter-frame flickering and inconsistent boundary jitter. Notably, the robustness of LaST-Diff is particularly evident in its improvements on low-quality videos. Furthermore, the generative nature of LaST-Diff enables uncertainty quantification, offering critical confidence indicators for expert clinical review.

\keywords{Echocardiogram video segmentation \and Latent diffusion.}
\end{abstract}
%
%
%
\section{Introduction}
Echocardiography is an essential imaging modality for cardiovascular diagnosis, offering a non-invasive, real-time, and cost-effective alternative to imaging modalities like CT and MRI~\cite{antico_ultrasound_2019}. The accurate segmentation of cardiac structures from echocardiography videos is a foundational step for critical tasks, such as ejection fraction estimation. However, manual segmentation remains a significant clinical burden, necessitating reliable automated solutions \cite{liu_deep_2019}. Various approaches have been developed to address the above problems. UNet and its variants such as UNet++ \cite{zhou_unet_2018} and Res-UNet \cite{Diakogiannis_2020} have achieved superior results in spatial segmentation, with further performance gains realized through the integration of attention mechanisms \cite{qurri_improved_2023}. Building on these, PLANet \cite{liu_deep_2021} introduced a pyramid local attention module to enhance feature extraction by capturing supporting information within compact and sparse neighboring contexts. Despite their success in frame-wise accuracy, these single-frame approaches fail to utilize the temporal consistency inherent in cardiac motion, often leading to inconsistent predictions across the cardiac cycle. Recognizing the video-based nature of echocardiography, researchers have increasingly integrated temporal information to improve segmentation accuracy and stability. Painchaud \textit{et al.} \cite{painchaud_echocardiography_2022} proposed a post-processing method to enforce temporal consistency, while Guo \textit{et al.} \cite{guo_spatial-temporal_2025} introduced a temporal-context-aware attention mechanism to capture dynamic information. Similarly, Wu \textit{et al.} \cite{wu_semi-supervised_2022} employed a spatiotemporal semantic calibration method to align feature maps between consecutive frames.
Xue \textit{et al.} \cite{9946374} explicitly predicted optical flow between successive frames and enhanced the segmentation accuracy.
However, these methods are often limited to constraints between adjacent frames rather than modeling the global dependencies of the entire sequence, thus still lacking temporal consistency. 3D approaches \cite{qin_dtaue_2025}, on the other hand, utilizing the entire video sequence as input, have the capability to capture global information. Although offering straightforward solutions to address sequence segmentation, 3D approaches are frequently constrained by prohibitive computational costs, which may be unnecessary for echocardiography video segmentation where inter-frame variations are relatively moderate.

Generative models, specifically Denoising Diffusion Probabilistic Models   \cite{ho_denoising_2020}, have emerged as a promising paradigm for medical image analysis due to their inherent ability to model complex data distributions and provide principled uncertainty quantification  \cite{wu2023medsegdiffmedicalimagesegmentation}. However, the direct application of standard DDPMs to high-resolution medical imaging datasets is frequently constrained by substantial computational requirements and memory overhead, limiting their viability for efficient clinical deployment. While efficiency can be improved by shifting the denoising process into a latent space via pretrained autoencoders \cite{rombach_high-resolution_2022}, direct extension to videos remains challenging, as compressed latent spaces may introduce blurriness in reconstructed frames. Separating video generation into image generation followed by video refinement, Zhang \textit{et al.} \cite{zhang_step_2025} proposed a decoupled framework to solve video inverse problems efficiently. This strategy effectively bypasses the constraints of 3D diffusion by treating temporal consistency as a refinement task rather than a direct high-dimensional modeling problem.

In this work, we propose LaST-Diff (Latent Spatiotemporal Diffusion), a novel framework that reformulates echocardiogram video segmentation as a progressive denoising process in latent space. Our approach employs a three-stage strategy that independently establishes strong spatial representations before fine-tuning for temporal coherence. By integrating a dedicated temporal branch with an adaptive fusion mechanism, LaST-Diff harmonizes spatial accuracy, temporal coherence, and computational efficiency. Furthermore, the generative nature of LaST-Diff enables robust uncertainty estimation, providing critical confidence indicators for clinical review.


\section{Method}
We propose a three-stage framework for echocardiogram video segmentation. It decomposes the complexity of spatial-temporal segmentation into: 1) latent representation learning for discrete-to-continuous domain mapping, 2) image-conditioned spatial latent denoising, and 3) temporal refinement via adaptive spatial-temporal fusion. The global pipeline is illustrated in Fig.~\ref{fig:denoiser}.

\subsection{Latent Representation Learning}
Standard Diffusion Probabilistic Models are defined over a continuous domain, making them ill-suited for the discrete nature of semantic segmentation masks. Direct injection of Gaussian noise into binary or categorical masks leads to unnatural perturbations during inference. To address this, we first project the discrete mask space $\mathcal{M}$ into a continuous latent space $\mathcal{Z}$. We employ a Variational Autoencoder (VAE) where the encoder $E_{label}$ maps a discrete 2D mask $y \in \{0, 1, \dots, C\}^{H \times W}$ to a low-dimensional continuous representation $z_{0} \in {R}^{h \times w}$. The decoder $D_{label}$ is trained to recover the full-resolution labels from the latent space $\mathcal{Z}$. By performing diffusion on $z_{0}$ rather than $y$, we ensure smooth state transitions between object classes and significantly reduce the computational overhead during the iterative denoising process. Once training has converged, the VAE is frozen to serve as a fixed representation layer for all subsequent stages.

\subsection{Image-Conditioned Spatial Latent Denoising}
The second stage focuses on training an Image Diffusion Model (IDM) to learn the denoising process within the latent space $\mathcal{Z}$. After extracting the latent representation $z_{0}$ using the pretrained label encoder $E_{label}$ from individual mask frames, Gaussian noise is incrementally introduced over a predefined sequence of $T$ timesteps, eventually transforming the structured latent representation $z_{0}$ into noisy latents $z_{t}$. A 2D U-Net is trained as the denoiser to recover the clean latent representation $z_{0}$ from a noised version $z_{t}$, for timestep $t$.

To ensure that the denoising process is anatomically aligned with the target structure, we utilize an Image Feature Encoder $E_{image}$ to extract high-level semantic embeddings $i$ from the source image frames $X$. We extract the feature map $i$ that matches the spatial dimensions of the latent mask representation $(h \times w)$ and integrate it into the diffusion process via channel-wise concatenation with the noisy latent $z_{t}$. The denoiser is optimized using a denoising score-matching objective, learning to identify and remove the specific noise components $\epsilon_{t}$ added at each diffusion timestep $t$ under the explicit control of the image embedding $i$.

\begin{figure}[t]
\centering
\includegraphics[width=\textwidth]{arch_v2.pdf} 
\caption{Overview of the LaST-Diff architecture and the spatio-temporal denoiser. The left panel illustrates the training pipeline: a source image sequence $\mathbf{X} = \{X_1, X_2, \dots, X_F\}$ and its ground truth segmentation $\mathbf{y}= \{y_1, y_2, \dots, y_F\}$ are mapped into latent spaces $i$ and $z_{0}$ via their respective encoders $E_{image}$ and $E_{label}$. The latent label sequence $z_{0}$ is then perturbed into $z_{t}$ through a Gaussian noise module $G$. A denoising network $DN$, conditioned on the image embedding $i$ and timestep $t$, is trained to estimate the noise $\epsilon_{t}$. The final segmentation $\mathbf{y}'= \{y'_1, y'_2, \dots, y'_F\}$ is reconstructed from the denoised latent $z'_{0}$ using $D_{label}$. The right panel details the $DN$ structure, which consists of a 2D Spatial Denoiser and a 1D Temporal Denoiser. For an input of shape $(F, c, h,w)$, where $F$ is the number of frames and $c$ is the number of channels, the temporal branch reshapes the input to $(hw, c, F)$ to capture inter-frame dependencies, and the outputs are integrated using a learnable weighting parameter $\alpha$. } 
\label{fig:denoiser}
\end{figure}

\subsection{Temporal Fine-tuning}
To extend the framework to echocardiogram sequences $\mathbf{X} = \{X_1, X_2, \dots, X_F\}$, we transition to a Video Diffusion Model (VDM). This stage introduces temporal dependencies necessary to capture the dynamic motion of cardiac structures across the cardiac cycle. The detailed internal structure of the modified denoiser is depicted in Fig.~\ref{fig:denoiser}.

To capture the motion through the cardiac cycle, we augment the spatial layers with a temporal convolutional branch. This branch utilizes 1D convolutions that operate across the frame dimension. Compared to 3D convolutions, which significantly increase computational cost, this decoupled approach allows the model to leverage the pre-trained spatial weights while independently learning inter-frame dependencies. To prevent the temporal branch from overriding the high-fidelity spatial details learned in the previous stage, we implement an adaptive spatial-temporal fusion mechanism. The integrated latent $z_{{out}}$ is computed as:
\begin{equation}
z_{out} = (1 - \alpha) \cdot z_{spatial} + \alpha \cdot z_{temporal} \; ,
\end{equation}
where $\alpha \in [0, 1]$ is a learnable scalar parameter. During fine-tuning, $\alpha$ is optimized to find the ideal balance between spatial accuracy and temporal smoothness. This three-stage approach allows the VDM to generate masks that are not only anatomically precise in individual frames but also exhibit natural continuity across the cardiac cycle.


\section{Experiments and Results}
\subsection{Datasets}
We validated the performance of LaST-Diff on two 2D+time echocardiography datasets: the CAMUS dataset \cite{8649738} and an in-house dataset (Echronos-DB). 

\textbf{CAMUS dataset} consists of apical four-chamber (A4C) and two-chamber (A2C) view sequences from 500 patients, where 19\% of them have low quality. The contours of the left ventricular (LV) endocardium, the LV epicardium, and the left atrium (LA) are provided for each sequence. We evenly sampled $F=16$ frames from each sequence and followed the official data partition protocol.

\textbf{Echronos-DB dataset (in-house)} comprises A4C view videos from 481 patients, acquired at the University of Iowa Hospitals \& Clinics. 15\% of sequences are of low quality. Expert clinicians manually traced the LV and LA boundaries to provide ground-truth segmentations. We extracted the first full cardiac cycle (end-diastole to end-diastole) from each sequence and evenly sampled $F=16$ frames from each cycle. The dataset was partitioned into an $80{:}20$ train-test split.

\subsection{Quantitative results}
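To evaluate the effectiveness of LaST-Diff, we employed four metrics. For the latter three (temporal) metrics, we report the unsigned difference between the value computed on the predictions and the value computed on the ground truth, so that lower values indicate closer agreement with the ground-truth motion.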

1) Dice Similarity Coefficient (DSC). DSC is a widely used metric that measures the static similarity between the prediction sequence $S$ and the ground truth sequence $G$, defined as: $\mathrm{DSC} = (2|S \cap G|)/(|S| + |G|)$.
% \begin{equation}
% \mathrm{DSC} = \frac{2|P \cap G|}{|P| + |G|}
% \end{equation}

2) Temporal Dice Similarity Coefficient (t-DSC). Calculating DSC between consecutive frames, t-DSC quantifies the temporal similarity in the sequence $S$: $\mathrm{t\text{-}DSC} = (\sum_{f=1}^{F-1}{\mathrm{DSC}(S_{f+1}, S_{f})})/(F-1)$, where $F$ is the number of frames.
% \begin{equation}
% \mathrm{t\text{-}DSC} = \frac{1}{F-1}\sum_{f=1}^{f=F-1}{DSC(S_f, S_{f-1})}
% \end{equation}

3) Temporal Curve Smoothness (TCS). To assess the temporal stability of a segmentation sequence, Painchaud \textit{et al.}~\cite{painchaud_echocardiography_2022} introduced TCS as a temporal consistency indicator. TCS measures the second-order variation of the segmentation sequence: $\mathrm{TCS} = (\sum_{f=2}^{F-1}{|S_{f+1}-2S_f+S_{f-1}|})/({F-2})$.
% \begin{equation}
% \mathrm{TCS} = \frac{1}{F-2}\sum_{f=2}^{f=F-1}{|S_{f+1}-2S_f+S_{f-1}|}
% \end{equation}

4) Average Contour Displacement (ACD). ACD is a boundary-based metric to evaluate the geometric stability of a segmentation sequence \cite{freixenet_yet_2002}. It measures the displacement between contours of consecutive predictions: $\mathrm{ACD} = (\sum_{f=1}^{F-1}{d(\mathrm{Contour}(S_{f+1}),\mathrm{Contour}(S_{f}))})/({F-1})$.
% \begin{equation}
% \mathrm{ACD} = \frac{1}{F-1}\sum_{f=1}^{f=F-1}{d(Countour(S_f)+Countour(S_{f-1}))}
% \end{equation}

% For temporal metrics t-DSC, TCS, ACD, we report the unsigned difference between predictions and ground truth to reflect the ability of mimcing the ground thuth.
\begin{figure}[htbp]
\centering
\includegraphics[width=\textwidth]{comparison_02232133.png}
\caption{Qualitative comparison of segmentation results. Eight of the 16 frames are shown for visual convenience. Source images are shown in the first rows, with ground-truth (GT) masks overlaid. For other rows, differences between predictions and ground truths are highlighted in red.} \label{fig:poor}
\end{figure}
\begin{table}[htbp]
\centering
\caption{Performance metrics (whole test set).}
\label{camusAll}
\begin{tabular}{lcccccccc}
\toprule
\multirow{2}{*}{} & \multicolumn{2}{c}{DSC} & \multicolumn{2}{c}{t-DSC} & \multicolumn{2}{c}{TCS} & \multicolumn{2}{c}{ACD} \\
\cmidrule(r){2-3} \cmidrule(r){4-5} \cmidrule(r){6-7} \cmidrule(r){8-9}
& LV & LA & LV & LA & LV & LA & LV & LA \\
\midrule
\multicolumn{9}{l}{\textbf{Dataset: CAMUS}} \\
[2pt]
SOCOF & 0.932 & 0.905 & 0.007 & 0.013 & 0.010 & 0.016 & 0.335 & 0.390 \\
LDSeg & 0.914 & 0.892 & 0.054 & 0.069 & 0.082 & 0.114 & 1.922 & 1.985 \\
nnUNet & \textbf{0.934} & \textbf{0.917} & \textbf{0.004} & 0.009 & 0.009 & 0.014 & \textbf{0.224} & 0.317 \\
LaST-Diff(image only) & 0.929 & 0.903 & 0.020 & 0.033 & 0.035 & 0.062 & 0.756 & 0.967 \\
LaST-Diff & 0.934 & 0.916 & 0.005 & \textbf{0.008}$^{\ast}$ & \textbf{0.007} & \textbf{0.011}$^{\ast}$ & 0.263 & \textbf{0.264}$^{\ast}$ \\
[2pt]
\multicolumn{9}{l}{\textbf{Dataset: Echronos-DB}} \\
[2pt]
SOCOF & 0.971 & 0.938 & 0.003& \textbf{0.005}& 0.005& \textbf{0.009} & 0.166 & 0.169\\
LDSeg & 0.959&0.919	&0.010&0.027&0.016&0.036&0.438&0.608\\
nnUNet & 0.826 & 0.668 & 0.057 & 0.133 & 0.043 & 0.095 & 1.279 & 1.938\\
LaST-Diff(image only)& 0.966 & 0.938 & 0.016& 0.023& 0.020 & 0.036 & 0.585 & 0.545\\
LaST-Diff & \textbf{0.971} & \textbf{0.945} & \textbf{0.003}  & 0.006  & \textbf{0.005} & 0.011 & \textbf{0.103}$^{*}$   & \textbf{0.102}$^{*}$\\
\bottomrule
\end{tabular}
\end{table}

% \vspace{.2cm}



We compared LaST-Diff with a task-specific method (SOCOF~\cite{9946374}), and two related methods (LDSeg~\cite{zaman2025latentdiffusionmedicalimage}, nnUNet~\cite{isensee_nnu-net_2021}). To isolate the impact of our three-stage training strategy, we included an ablation study using our model trained exclusively on image-conditioned diffusion (without temporal fine-tuning). Qualitative comparisons are presented in Fig.~\ref{fig:poor}. Quantitative results are summarized in Table~\ref{camusAll}, with bold values indicating the best performance and asterisks ($*$) denoting statistical significance ($p < 0.05$) between the top two methods. Note that only LV endocardium and LA results are reported for the CAMUS dataset, as the LV epicardium shows less movement during the cardiac cycle. As shown in Table~\ref{camusAll}, LaST-Diff achieved spatial accuracy (DSC) comparable to nnUNet~\cite{isensee_nnu-net_2021} and SOCOF~\cite{9946374}, with no statistically significant difference found, indicating that LaST-Diff maintains state-of-the-art pixel-level precision. In addition, a significant advantage was observed in temporal and geometric metrics. Specifically, LaST-Diff achieves the closest agreement with the real trajectories in most evaluations, effectively mitigating the inter-frame flickering and anatomically inconsistent jitter common in 2D-based and other baselines. By integrating a dedicated temporal branch, LaST-Diff ensures that the generated mask sequences are not only frame-wise accurate but also follow the physiologically plausible motion of the cardiac structures, resulting in a substantially smoother and more reliable video segmentation for clinical analysis.

\begin{table}[htbp]
\centering
% \scriptsize
\caption{Performance metrics (low quality test set).}
\label{camusPoor}
\begin{tabular}{lcccccccc}
\toprule
\multirow{2}{*}{} & \multicolumn{2}{c}{DSC} & \multicolumn{2}{c}{t-DSC} & \multicolumn{2}{c}{TCS} & \multicolumn{2}{c}{ACD} \\
\cmidrule(r){2-3} \cmidrule(r){4-5} \cmidrule(r){6-7} \cmidrule(r){8-9}
& LV & LA & LV & LA & LV & LA & LV & LA \\
\midrule
\multicolumn{9}{l}{\textbf{Dataset: CAMUS}} \\
[2pt]
SOCOF & 0.914 & 0.900 & 0.011 & 0.016 & 0.017 & 0.024 & 0.459 & 0.446\\
LDSeg & 0.897 & 0.890 & 0.064 & 0.067 & 0.102 & 0.116 & 2.398 & 1.829\\
nnUNet & 0.921 & 0.908  & 0.007 & 0.010 & 0.015 & 0.016& 0.348 & 0.328\\
LaST-Diff(image only)&  0.913& 0.906 & 0.025 & 0.032 & 0.049 & 0.065 & 0.993 & 0.891\\
LaST-Diff & \textbf{0.921} & \textbf{0.922} & \textbf{0.006} & \textbf{0.007} & \textbf{0.006} & \textbf{0.012}& \textbf{0.336} & \textbf{0.236}\\
[2pt]
\multicolumn{9}{l}{\textbf{Dataset: Echronos-DB}} \\
[2pt]
SOCOF & \textbf{0.968} & \textbf{0.928} & 0.003 & 0.006 &0.008& \textbf{0.008} & 0.486&0.713\\
LDSeg & 0.958 & 0.914 & 0.010& 0.020& 0.018 & 0.034 & 0.422& 0.521\\
nnUNet & 0.811 & 0.591 & 0.065 & 0.206& 0.053 & 0.148 & 1.417 & 3.988\\
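% NOTE(review): the ACD values in this row (0.486, 0.713) are identical to the SOCOF row of this low-quality Echronos-DB block -- please verify against the raw results that this is not a copy-paste error.
LaST-Diff(image only)& 0.961  & 0.918 & 0.011 & 0.031 & 0.018 & 0.045& 0.486 & 0.713\\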
LaST-Diff & 0.967 & 0.927 & \textbf{0.002}& \textbf{0.006}& \textbf{0.005} & 0.014& \textbf{0.101} & \textbf{0.153}\\
\bottomrule
\end{tabular}
\end{table}

\subsection{Robustness to Suboptimal Image Quality}
A key challenge in clinical echocardiography is the presence of sequences with suboptimal image quality, often characterized by low signal-to-noise ratios, shadowing artifacts, or poorly defined endocardial boundaries. To evaluate the robustness of LaST-Diff, we performed a subgroup analysis on sequences identified as having poor image quality. Due to the limited sample size of this specific subset, a formal statistical significance test was not conducted. However, the observed trends summarized in Table~\ref{camusPoor} provide compelling evidence of LaST-Diff's stability, especially in temporal coherence and geometric smoothness. Specifically, in sequences where the walls were partially obscured by artifacts, baseline models often produced physically improbable anatomical jumps between adjacent frames. Conversely, the temporal branch of LaST-Diff effectively mitigates these issues by controlling the denoising process through a combination of image guidance and inter-frame temporal constraints. In regions of significant signal dropout, LaST-Diff leverages the learned shape manifold and motion priors to maintain a physiologically plausible trajectory. This enables the framework to yield anatomically consistent boundaries that remain aligned with the cardiac cycle, even when local image information is ambiguous. These findings suggest that LaST-Diff offers robustness that is essential for reliable clinical deployment in real-world environments.


\subsection{Uncertainty Quantification via Probabilistic Sampling}
The generative nature of LaST-Diff enables robust uncertainty quantification through repeated sampling during the reverse diffusion process. By generating multiple denoised latent sequences for a single input video, we computed the per-pixel standard deviation across sampling runs to serve as a principled measure of predictive confidence. As illustrated in Fig.~\ref{fig:uncertainty}, the obtained uncertainty maps effectively highlight regions of anatomical ambiguity. Unlike deterministic models that provide a single, potentially overconfident prediction, LaST-Diff provides an interpretable confidence indicator that aligns with clinical expectations. Such a capability is vital for clinical decision support, as it allows clinicians to identify specific frames or anatomical regions that may require manual review, thereby enhancing the overall reliability of automated echocardiogram analysis.


\begin{figure}[htbp]
\centering
\includegraphics[width=0.5\textwidth]{uncertainty.png}
\caption{ An example of uncertainty estimation of segmentation from the Echronos-DB
dataset. High standard deviation values are concentrated around the region where the boundaries are unclear.} \label{fig:uncertainty}
\end{figure}
\section{Conclusion}
We proposed LaST-Diff, a latent spatiotemporal diffusion framework for temporally stable echocardiography video segmentation. By performing progressive denoising in latent space and adopting a three-stage training strategy with adaptive spatial–temporal fusion, the method achieves strong frame-wise accuracy while significantly improving temporal coherence and geometric stability without the heavy cost of full 3D diffusion. Experiments on public and in-house datasets show that LaST-Diff produces more physiologically plausible and consistent segmentation trajectories, particularly on low-quality sequences. Its generative formulation also enables uncertainty estimation for clinical review, providing an efficient solution for temporally consistent cardiac video segmentation.


\begin{credits}
\subsubsection{\ackname} This research was supported in part by the National Institutes of Health under Grant R01HL171624.

\subsubsection{\discintname}
The authors have no competing interests to declare that are
relevant to the content of this article.
\end{credits}
%
% ---- Bibliography ----
%
% BibTeX users should specify bibliography style 'splncs04'.
% References will then be sorted and formatted in the correct style.
%
\bibliographystyle{splncs04}
\bibliography{mybibliography}
%
\end{document}
