\documentclass{midl} % Include author names

% The following packages will be automatically loaded:
% jmlr, amsmath, amssymb, natbib, graphicx, url, algorithm2e
% ifoddpage, relsize and probably more
% make sure they are installed with your latex distribution

\usepackage{mwe} % to get dummy images
\usepackage{multirow}
\jmlryear{2026}\jmlrworkshop{Full Paper -- MIDL 2026}\jmlrvolume{-- nnn}\editors{Accepted for publication at MIDL 2026}

\title[DiffTAC]{DiffTAC: Temporal-Conditioned Latent Diffusion with Integrated Attention for Intermediate Frame Generation and Temporal Super-Resolution in Cardiac MRI}

 % Use \Name{Author Name} to specify the name.
 % If the surname contains spaces, enclose the surname
 % in braces, e.g. \Name{John {Smith Jones}} similarly
 % if the name has a "von" part, e.g \Name{Jane {de Winter}}.
 % If the first letter in the forenames is a diacritic
 % enclose the diacritic in braces, e.g. \Name{{\'E}louise Smith}

 % Two authors with the same address
 % \midlauthor{\Name{Author Name1} \Email{abc@sample.edu}\and
 %  \Name{Author Name2} \Email{xyz@sample.edu}\\
 %  \addr Address}

 % Three or more authors with the same address:
 % \midlauthor{\Name{Author Name1} \Email{an1@sample.edu}\\
 %  \Name{Author Name2} \Email{an2@sample.edu}\\
 %  \Name{Author Name3} \Email{an3@sample.edu}\\
 %  \addr Address}


% Authors with different addresses:
% \midlauthor{\Name{Author Name1} \Email{abc@sample.edu}\\
% \addr Address 1
% \AND
% \Name{Author Name2} \Email{xyz@sample.edu}\\
% \addr Address 2
% }

%\footnotetext[1]{Contributed equally}

% More complicate cases, e.g. with dual affiliations and joint authorship
\midlauthor{
\Name{Shilajit Banerjee\nametag{$^{1}$}} \orcid{0009-0005-5330-7131} \Email{banerjee.shilajit@tcs.com}\\
\Name{Aniruddha Sinha\nametag{$^{1}$}} \orcid{0000-0002-4679-3806} \Email{aniruddha.s@tcs.com}\\
\addr $^{1}$ Connected Digital Health, TCS Research, Kolkata, India
}

\begin{document}

\maketitle

\begin{abstract}
Cardiac cine MRI captures dynamic cardiac motion, yet its temporal resolution remains fundamentally constrained by long acquisition times and breath-hold requirements. We introduce DiffTAC, a latent diffusion framework that synthesizes intermediate cardiac phases by treating time as an explicit conditioning variable. Using the end-diastolic (ED) and end-systolic (ES) frames as anatomical anchors, DiffTAC performs denoising in the latent space of a pretrained variational autoencoder and conditions generation on a learnable temporal embedding that specifies the desired phase location within the cardiac cycle. To effectively fuse temporal conditioning with anatomical context, we propose the Integrated Attention Block (IAB), a unified module that combines self-attention and cross-attention to modulate spatial features according to the target temporal position. This design enables the model to synthesize anatomically coherent, temporally smooth intermediate frames. Experiments on two publicly available datasets demonstrate that DiffTAC produces highly realistic intermediate phases and achieves superior temporal consistency compared to classical interpolation, optical-flow-based reconstruction, and ablated variants of our architecture. These findings show that modeling time as a conditioning signal within a diffusion framework provides an effective and acquisition-free solution for temporal super-resolution in cardiac MRI.
\end{abstract}




\begin{keywords}
Cardiac MRI, Frame Interpolation, Diffusion Models, Cross-Attention, Latent Space, Temporal Consistency
\end{keywords}

\section{Introduction}

Cardiac cine MRI provides dynamic information about ventricular contraction, relaxation, and myocardial motion, making it the reference standard for quantitative cardiac function assessment \cite{sechtem1987cine, heitner2012clinical, vollbrecht2023fetal}. However, achieving high temporal resolution requires acquiring many cardiac phases, which increases scan duration and often exceeds what is feasible within a single breath-hold. As a result, clinical protocols typically acquire only a limited number of frames (e.g., 20--30 per cardiac cycle), reducing the temporal fidelity needed for fine-grained wall-motion analysis or strain estimation.  
High-frame-rate reconstructions obtained through post-processing can therefore enable improved strain estimation, motion analysis, and visualization of subtle physiological events that are not captured at standard frame rates. They may also support downstream applications such as fluid--structure interaction modeling and digital-twin cardiac simulations, where smooth and temporally continuous myocardial motion is essential.

A natural strategy to overcome this limitation is to synthesize intermediate cardiac phases after acquisition. Classical interpolation (e.g., linear, spline) cannot capture the highly non-linear deformation of the heart \cite{preechakul2022diffusion, linearinterpolation}. Optical-flow-based methods are sensitive to noise and often fail during rapid contraction or valve motion \cite{shah2021traditional}. Deep generative models hold promise, but pixel-space generators frequently struggle to maintain anatomical correctness or produce smooth temporal transitions across frames.

Diffusion models \cite{ho2020denoising} have recently demonstrated exceptional generative fidelity and stability, especially in medical imaging \cite{kazerouni2023diffusion, webber2024diffusion}. Their iterative denoising formulation provides strong structural priors while avoiding mode collapse. Latent diffusion models (LDMs) \cite{rombach2022high} operate in the latent space of a pretrained VAE. They offer computational efficiency and improved anatomical coherence—properties particularly desirable for cardiac cine MRI \cite{pinaya2022brain, kim2024adaptive}.

In the context of cardiac motion, the end-diastolic (ED) and end-systolic (ES) frames represent two physiologically meaningful boundary conditions. These frames capture the extremes of ventricular relaxation and contraction and are readily available across standard clinical views. While ED and ES provide rich anatomical context, they do not specify when within the cardiac cycle an intermediate phase should occur. Thus, the central challenge in cardiac frame synthesis is to combine anatomical context from ED/ES with an explicit representation of time. Recent work explores diffusion and deep learning models for cardiac reconstruction and temporal enhancement. CaLID \cite{bubeck2025latent} uses latent diffusion to interpolate sparse slices for 3D cardiac reconstruction, but it does not use ED/ES anchors or treat time as an explicit conditioning signal. UVI-Net \cite{kim2024dataefficientunsupervisedinterpolationintermediate} adopts a learnable, flow-based interpolation strategy, sharing similar motion assumptions with classical optical-flow methods. TSSC-Net \cite{zhou2025diffusion} performs diffusion-driven temporal super-resolution for 4D MRI using start–end frame conditioning, but it focuses on volumetric motion and does not incorporate a learnable temporal embedding in latent space. Feature-based interpolation in DT-MRI \cite{yang2012feature} reconstructs diffusion tensor fields by interpolating eigenvalues and orientations, but its scope is limited to tensor data rather than cine intensity sequences.



Temporal super-resolution for 4D Flow MRI \cite{callmer2025deep} enhances velocity-field dynamics using residual networks, but it operates directly in pixel space and relies on 3D convolutions, which can be computationally expensive. Building on these advances, we introduce DiffTAC, a diffusion-based temporal synthesis framework that treats time as an explicit conditioning variable. DiffTAC uses a learnable sinusoidal temporal embedding to encode the target phase position between ED and ES and injects this temporal information into a conditional U-Net \cite{ronneberger2015u}. To more effectively fuse temporal conditioning with ED/ES anatomy, we propose the Integrated Attention Block (IAB), a unified module that merges self-attention and cross-attention pathways, enabling the model to learn a smooth and physiologically plausible cardiac motion manifold.

We evaluate our method on two publicly available datasets and conduct extensive analyses, including ablations and ground-truth-free (GT-free) temporal interpolation experiments. Results show that DiffTAC generates anatomically faithful frames, achieves superior temporal smoothness, and extrapolates beyond observed phases, enabling temporal super-resolution without additional scan burden.

Our key contributions are:
\begin{enumerate}
    \item We introduce DiffTAC, a diffusion-based framework that synthesizes cardiac frames between ED and ES by treating time as an explicit conditioning signal.
    \item We propose the Integrated Attention Block (IAB), a novel self--cross attention module that effectively fuses temporal and anatomical information within a latent diffusion U-Net.
    \item We demonstrate that DiffTAC enables robust temporal interpolation and supports cardiac temporal super-resolution in a fully data-driven manner.
    \item We validate our approach on two publicly available datasets, showing consistent improvements over interpolation, optical-flow baselines, and ablated variants.
\end{enumerate}
The rest of the paper is organized as follows. We present the details of the proposed method in Section~\ref{methods}. The details of the experiments and the results are given in Section~\ref{exp}. Finally, we conclude the paper in Section~\ref{conc}.



\begin{figure*}[t]
    \centering
    \includegraphics[width=0.80\linewidth]{images/Model-MIDL.pdf}
      \caption{\footnotesize
Overview of the DiffTAC framework. ED and ES frames are first encoded into the VAE latent space 
and concatenated with the noisy intermediate latent. A learnable sinusoidal embedding encodes the 
target temporal position and conditions the diffusion U-Net through cross-attention. The U-Net 
predicts the denoised latent, which is compared against the ground-truth latent during training. 
At inference, the predicted latent is decoded by the VAE to generate intermediate cardiac frames 
corresponding to any desired temporal position.}

    \label{block-diagram-1}
\end{figure*}



\begin{figure*}[t]
    \centering
    \includegraphics[width=1.00\linewidth]{images/new-midl-blocks-enlarged.pdf}
      \caption{\footnotesize
Proposed DownBlock and UpBlock structures in DiffTAC. Each block applies two Residual Blocks 
(ResBlock 1 and 2), each followed by an Integrated Attention Block (IAB1 and IAB2), which merges 
self-attention, cross-attention, and a feed-forward network (FFN) within a residual pathway. DownBlocks conclude with 
Conv2D downsampling, while UpBlocks fuse encoder skip connections and apply upsampling. Together, 
these modules integrate ED/ES anatomical context with temporal conditioning across spatial scales.}

    \label{block-diagram-2}
\end{figure*}
\section{Methodology}
\label{methods}
\subsection{Problem Formulation}

We consider a full cardiac cycle represented as a cine sequence 
$\mathbf{X}=\{x_1, x_2, \ldots, x_T\}$, where $x_1$ and $x_T$ correspond to successive 
end-diastolic (ED) frames. The end-systolic (ES) frame appears at a dataset-dependent 
position within the cycle, and the temporal evolution from ED to ES is generally not 
symmetric to that from ES back to the next ED. All frames are uniformly sampled in time, and the complete cycle 
is mapped to a normalized temporal coordinate $\tau \in [0,1]$, such that 
$\tau=0$ and $\tau=1$ both correspond to ED, while ES occurs at some intermediate 
$\tau=\tau_{\mathrm{ES}}$.

For a full cardiac cycle, DiffTAC is trained on both motion segments.
Using the provided ED and ES annotations, we construct (i) an ED$\rightarrow$ES interval, where ED is the 
start frame and ES is the end frame, and the model learns to generate the intermediate ES-bound 
phases; and (ii) an ES$\rightarrow$ED interval, where ES becomes the start frame and the subsequent frames 
toward the next ED define the target positions. Thus, each full cycle contributes two supervised 
temporal trajectories, allowing the model to learn intermediate-frame generation consistently 
across both directions of cardiac motion.













%\section{Methodology}
%\subsection{Problem Formulation}

%We formulate the cardiac frame interpolation problem as follows. Given a cardiac cine sequence $\mathbf{X} = \{x_0, x_1, \ldots, x_{T-1}\}$ where each $x_i \in \mathbb{R}^{H \times W \times 3}$ represents a cardiac phase, our objective is to reconstruct the intermediate frames $\{x_1, x_2, \ldots, x_{T-2}\}$ using only the end-diastolic frame $x_0$ and end-systolic frame $x_{T-1}$ as conditioning information. This setup reflects the clinical scenario where only keyframes might be acquired to reduce scan time and patient breath-hold duration, while intermediate phases must be synthesized to achieve the desired temporal resolution for accurate cardiac function assessment.

%More specifically, given the boundary frames $x_0$ and $x_{T-1}$, we aim to learn a generative model that produces intermediate frames $\hat{x}_\tau$ at normalized temporal positions $\tau \in [1,0]$ such that the synthesized sequence $\{x_0, \hat{x}_{\tau_1}, \hat{x}_{\tau_2}, \ldots, \hat{x}_{\tau_k}, x_{T-1}\}$ exhibits smooth temporal evolution and preserves anatomical consistency with the cardiac motion dynamics observed in complete cine sequences.
\subsection{Latent Space Encoding}

We operate in the latent space of a pretrained variational autoencoder (VAE), adapted from 
Stable Diffusion \cite{rombach2022high} and fine-tuned on cardiac MRI. The VAE maps images of size 
$H \times W \times C_{\text{img}}$ to latent tensors of size 
$\tfrac{H}{8} \times \tfrac{W}{8} \times C_\ell$, providing an $8\times$ spatial reduction while 
preserving anatomical structure. Here $H$, $W$, and $C_{\text{img}}$ denote the image height, width, and number of channels, respectively, and $C_\ell$ is the number of latent channels. The encoder $\mathcal{E}$ outputs a diagonal Gaussian distribution from which a latent code is 
sampled:
\begin{equation}
z = \mathcal{E}(x),\qquad 
\mathcal{E}(x)\sim\mathcal{N}\!\big(\mu_E(x),\,\sigma_E^2(x)\big).
\end{equation}
The decoder maps latents back to the image domain, $\hat{x}=\mathcal{D}(z)$. 
VAE weights remain frozen during training. Latents for the end-diastolic (ED) and 
end-systolic (ES) frames are $z_{\text{ed}}=\mathcal{E}(x_1)$ and 
$z_{\text{es}}=\mathcal{E}(x_{\tau_{\mathrm{ES}}})$, which serve as anatomical anchors. 
The diffusion model predicts an intermediate latent $\hat{z}_\tau$ for any temporal 
position $\tau\in[0,1]$, decoded as $\hat{x}_\tau=\mathcal{D}(\hat{z}_\tau)$.




%\subsection{Latent Space Encoding}

%Rather than operating directly on high-dimensional image data, we leverage a pretrained variational autoencoder (VAE) to encode cardiac frames into a compact latent representation. The VAE, adapted from the Stable Diffusion architecture and fine-tuned on cardiac MRI data, maps input images of size $256 \times 256 \times 3$ to latent codes of dimensionality $32 \times 32 \times 4$, achieving an 8$\times$ spatial compression while preserving essential anatomical features such as myocardial boundaries, chamber structures, and papillary muscles.

%The encoding process follows a probabilistic framework where the encoder $\mathcal{E}$ maps an input frame $x$ to parameters of a diagonal Gaussian distribution. We sample from this distribution and apply a scaling factor $\alpha = 0.18215$ to normalize the latent space statistics:
%\begin{equation}
%z = \alpha \cdot \mathcal{E}(x), \quad \text{where } \mathcal{E}(x) \sim q(z|x) = \mathcal{N}(\mu_E(x), \sigma_E^2(x)).
%\end{equation}
%Here, $z \in \mathbb{R}^{32 \times 32 \times 4}$ represents the encoded latent representation, $\mu_E(x)$ and $\sigma_E^2(x)$ are the mean and variance predicted by the encoder network, and the scaling factor $\alpha$ ensures that the latent codes have unit variance, which stabilizes training and improves reconstruction quality.

%The decoder $\mathcal{D}$ reconstructs images from latent codes through the inverse transformation:
%\begin{equation}
%\hat{x} = \mathcal{D}(z/\alpha),
%\end{equation}
%where the division by $\alpha$ reverses the normalization applied during encoding.

%Crucially, we keep the VAE parameters frozen during frame interpolation training, using it purely as a feature extractor and decoder. This design choice prevents the model from learning to trivially bypass the interpolation task by directly copying input frames, while ensuring that generated latent codes remain within the distribution that the VAE can successfully decode to high-quality cardiac images. All subsequent operations in our pipeline, including the diffusion-based interpolation model, operate entirely in this compressed latent space.

%For the end-diastolic and end-systolic frames, we compute their latent representations:
%\begin{equation}
%z_{\text{ed}} = \alpha \cdot \mathcal{E}(x_0), \quad z_{\text{es}} = \alpha \cdot \mathcal{E}(x_{T-1}),
%\end{equation}
%which serve as the conditioning inputs to our diffusion model for generating intermediate latent frames $z_\tau$ at arbitrary temporal positions $\tau \in [1,0]$.



\subsection{Model Architecture}

The proposed model is a conditional latent diffusion U-Net that predicts the noise 
$\epsilon_\theta(x_t, t, c_\tau)$ added to a latent variable during the forward diffusion 
process. The network input $x_t \in \mathbb{R}^{3 C_\ell \times (H/8) \times (W/8)}$ 
is formed by concatenating three $C_\ell$-channel latent tensors: the noisy latent 
$z_t$, the ED latent $z_{\mathrm{ed}}$, and the ES latent $z_{\mathrm{es}}$. 
The diffusion timestep is denoted by $t$, and the temporal conditioning token 
$c_\tau \in \mathbb{R}^{1 \times d_c}$ is derived from the target phase position~$\tau$. The overall block diagram is shown in Fig.~\ref{block-diagram-1}.




\paragraph{Overall structure.}
The U-Net follows a four-level encoder--decoder architecture with channel widths 
$\{C, 2C, 4C, 8C\}$, where $C$ denotes the initial number of channels. 
Each resolution stage contains two residual blocks followed by two 
\emph{Integrated Attention Blocks} (IAB1, IAB2), as illustrated in 
Fig.~\ref{block-diagram-2}. Skip connections propagate encoder features to the 
corresponding decoder stages. Downsampling is performed using a stride-2 
$3 \times 3$ convolution, and upsampling uses nearest-neighbor interpolation 
followed by a $3 \times 3$ convolution.

\paragraph{Residual blocks.}
Each residual block processes a feature map 
$h \in \mathbb{R}^{B \times C \times H \times W}$ using a standard 
Group Normalization (GN)–SiLU–Conv2d stack. 
Timestep conditioning is injected by adding a projected timestep embedding 
$W_t e_t$ to the intermediate activation, giving
\[
u = \mathrm{Conv}(\mathrm{SiLU}(\mathrm{GN}(h))) + W_t e_t,
\]
where $e_t$ is the diffusion timestep embedding. 
If the input and output channel dimensions differ, an optional $1\times1$ 
convolution is applied to the skip path so that the residual connection can be 
added consistently.


\paragraph{Temporal conditioning.}
The target temporal position $\tau$ is encoded using a sinusoidal \emph{positional encoding} (PE) 
\cite{vaswani2017attention}, adapted to continuous time and modulated through learnable affine 
parameters. A base positional encoding is first computed from the discretized temporal index 
$p(\tau)$ (see Appendix~\ref{app:time-embedding}):
\begin{equation}
\phi(\tau)
    = \mathrm{clip}(s)\odot \mathrm{PE}\!\left(p(\tau)\right) + b,
\end{equation}
where $s,b\in\mathbb{R}^d$ are learnable scale and shift vectors. 
The modulated embedding is then projected to a compact conditioning token 
$c_\tau = W_{\mathrm{proj}}\,\phi(\tau)$, which encodes the desired temporal location and conditions all cross-attention layers inside the 
IAB modules.





\paragraph{Integrated Attention Block (IAB).}
Each IAB module fuses spatial features with temporal conditioning using 
multi-head self-attention (SA) followed by multi-head cross-attention (CA). 
Given flattened spatial tokens $Z \in \mathbb{R}^{(HW) \times d}$, 
the SA operation is
\begin{equation}
\mathrm{SA}(Z)=
\mathrm{softmax}\!\left(\frac{ZW_Q (ZW_K)^\top}{\sqrt{d_k}}\right)ZW_V.
\end{equation}
Cross-attention uses queries from $Z$ and keys/values from the temporal token $c_\tau$:
\begin{equation}
\mathrm{CA}(Z,c_\tau)=
\mathrm{softmax}\!\left(\frac{ZW_Q (c_\tau W_{K_c})^\top}{\sqrt{d_k}}\right)
      c_\tau W_{V_c}.
\end{equation}
With residual connections and LayerNorm (LN) applied before each sub-layer, the IAB applies SA and CA in sequence, followed by a single position-wise feed-forward network (FFN):
\begin{equation}
\begin{aligned}
Y_1 &= Z + \mathrm{SA}(\mathrm{LN}(Z)), \\
Y_2 &= Y_1 + \mathrm{CA}(\mathrm{LN}(Y_1),\, c_\tau), \\
Z' &= Y_2 + \mathrm{FFN}(\mathrm{LN}(Y_2)).
\end{aligned}
\end{equation}



\paragraph{Decoder.}
The decoder mirrors the encoder and receives concatenated skip features at each 
resolution. Each stage applies three residual blocks and two IAB modules, allowing 
temporal conditioning to refine features even at fine spatial scales. Upsampling is 
performed by nearest-neighbor interpolation followed by a $3 \times 3$ convolution.

\paragraph{Output layer.}
The final decoder feature map is normalized and activated as 
$h_{\mathrm{out}}=\mathrm{SiLU}(\mathrm{GN}(h))$, and then mapped through a 
$3\times 3$ convolution to predict the latent-space noise 
$\hat{\epsilon}=\mathrm{Conv}_{3\times 3}(h_{\mathrm{out}})$. 
This prediction $\hat{\epsilon}$ is used by the reverse diffusion process to reconstruct a 
temporally conditioned latent frame corresponding to $\tau$.


















%\subsection{Model Architecture}

%We use a conditional U-Net that predicts noise $\epsilon_\theta(x_t,t,c)$ in the diffusion process. The input is the concatenation of the noisy latent ($4$ channels), the ED latent ($4$ channels), and the ES latent ($4$ channels), forming a $12$-channel tensor of size $32 \times 32$. Temporal conditioning is provided through a time embedding $c \in \mathbb{R}^{L \times d_c}$, injected via cross-attention.


%\subsubsection{Diffusion Timestep Embedding}

%We encode the diffusion timestep $t$ using a learned sinusoidal embedding $\tau(t)$, followed by a two-layer MLP:
%\begin{equation}
%e_t = W_2\, \mathrm{SiLU}(W_1\, \tau(t)).
%\end{equation}
%The embedding $e_t \in \mathbb{R}^{512}$ is projected and added to each residual block. It conditions the denoising process on the progress of the diffusion chain.



%\subsubsection{Learnable Temporal Position Embedding}

%Temporal position $\tau \in [0,1]$ is the key conditioning signal. It represents the desired frame location between ED ($\tau=0$) and ES ($\tau=1$). We encode it using a learnable sinusoidal embedding:
%\begin{equation}
%\phi(\tau) = \text{scale} \odot \text{PE}\!\left[\mathrm{round}(\tau(L-1))\right] + \text{shift},
%\end{equation}
%where $\text{PE}$ is a fixed sinusoidal table. The vectors \texttt{scale} and \texttt{shift} are learned and clamped to maintain stable ranges. This embedding keeps the smooth behavior of sinusoidal encodings while adapting to cardiac dynamics. We project $\phi(\tau)$ to obtain a conditioning token:
%\begin{equation}
%c = W_{\text{proj}} \phi(\tau) \in \mathbb{R}^{1 \times d_c},
%\end{equation}
%which is used in all cross-attention layers.



%\subsubsection{Network Architecture}

%The U-Net consists of an encoder, a bottleneck (mid-block), and a decoder, following a hierarchical structure with four stages at progressively different resolutions.


%\subsubsection{Cross-Attention Conditioning with ED/ES Context}

%Cross-attention forms the core conditioning mechanism. ED and ES latents supply anatomical structure. The temporal embedding $c$ specifies the phase to be generated. The U-Net integrates both through cross-attention at multiple resolutions.

%Let $Z \in \mathbb{R}^{(HW) \times d}$ denote the flattened spatial features at a given layer. Queries are derived from the spatial features:
%\begin{equation}
%Q = Z W_Q.
%\end{equation}

%Keys and values come from temporal conditioning tokens:
%\begin{equation}
%K_c = c\, W_{K_c}, \qquad V_c = c\, W_{V_c}.
%\end{equation}

%Cross-attention outputs:
%\begin{equation}
%\text{CA}(Z,c) = \mathrm{softmax}\!\left(\frac{Q K_c^\top}{\sqrt{d}}\right)\, V_c.
%\end{equation}

%This operation injects the target temporal position into the spatial features, guiding the network toward the correct phase between ED and ES. Repeating this at all scales lets the model align temporal conditioning with local and global anatomy.



%\paragraph{Encoder.} The encoder consists of four hierarchical stages with progressively increasing channel widths of 128, 256, 512, and 1024 respectively. Each stage is designed to capture features at different levels of abstraction, with deeper stages representing more semantic and high-level cardiac anatomical information. Within each of the first three stages, the architecture includes two residual blocks and two Transformer blocks that apply both self-attention and cross-attention mechanisms. The fourth stage contains two residual blocks without downsampling.

%For a given feature map $h$, each residual block performs a series of transformations. First, it computes an intermediate representation:
%\begin{equation}
%u = \text{Conv}_1(\text{SiLU}(\text{GN}(h))) + W_t e_t,
%\end{equation}
%where $\text{GN}$ denotes Group Normalization with 32 groups that normalizes the features across channel groups, $\text{SiLU}$ is the activation function, $\text{Conv}_1$ is the first $3 \times 3$ convolutional layer, and $W_t e_t$ is the projected timestep embedding that injects temporal information into the features. The final output of the residual block is computed as:
%\begin{equation}
%h' = \text{Conv}_2(\text{SiLU}(\text{GN}(u))) + h_{\text{skip}},
%\end{equation}
%where $\text{Conv}_2$ is the second $3 \times 3$ convolutional layer and %$h_{\text{skip}}$ represents the skip connection, which is either an identity mapping when input and output dimensions match, or a $1 \times 1$ convolutional projection when channel dimensions differ.

%Spatial resolution is progressively reduced in the first three encoder stages through downsampling operations:
%\begin{equation}
%h^{(i+1)} = \text{Conv}_{\text{stride}=2}(h^{(i)}),
%\end{equation}
%where $h^{(i)}$ represents the feature map at stage $i$, and the stride-2 convolution with $3 \times 3$ kernel reduces both height and width by a factor of two, effectively halving the spatial resolution while doubling the number of channels.

%\paragraph{Attention Mechanisms.} The Transformer blocks within the encoder apply both self-attention and cross-attention mechanisms to capture relationships within the spatial features and between the features and external conditioning information. Let $Z \in \mathbb{R}^{(HW) \times d}$ denote the flattened spatial features, where the spatial dimensions $H$ and $W$ are collapsed into a single sequence dimension $HW$, and $d$ represents the feature dimension at each spatial location.

%For self-attention, the mechanism computes three sets of projections from the input features:
%\begin{equation}
%Q = Z W_Q, \quad K = Z W_K, \quad V = Z W_V,
%\end{equation}
%where $W_Q, W_K, W_V \in \mathbb{R}^{d \times d}$ are learned weight matrices that project the features into query, key, and value spaces respectively. The self-attention output is then computed as:
%\begin{equation}
%\text{SA}(Z) = \text{softmax}\left(\frac{QK^\top}{\sqrt{d}}\right) V.
%\end{equation}
%In this formula, $QK^\top$ computes the dot product between queries and keys, producing an attention score matrix of size $(HW) \times (HW)$ that measures the similarity between each pair of spatial positions. The scaling factor $\frac{1}{\sqrt{d}}$ normalizes these scores to prevent gradients from becoming too small during training. The softmax function converts these scores into attention weights that sum to one across each row, effectively creating a probability distribution over spatial locations. Finally, these weights are used to compute a weighted sum of the value vectors $V$, allowing each position to aggregate information from all other positions based on their relevance.

%Cross-attention operates similarly but incorporates information from the external conditioning sequence. The queries are still derived from the spatial features using the same projection $Q = Z W_Q$. However, the keys and values now come from the conditioning sequence $C \in \mathbb{R}^{L \times d_c}$, where $L$ is the number of conditioning tokens (in our case, the temporal position embedding) and $d_c$ is the conditioning dimension:
%\begin{equation}
%K_c = C W_{K_c}, \quad V_c = C W_{V_c},
%\end{equation}
%where $W_{K_c}, W_{V_c} \in \mathbb{R}^{d_c \times d}$ project the conditioning tokens into the same key and value spaces. The cross-attention output is then:
%\begin{equation}
%\text{CA}(Z,C) = \text{softmax}\left(\frac{Q K_c^\top}{\sqrt{d}}\right) V_c.
%\end{equation}
%Here, $Q K_c^\top$ produces an attention score matrix of size $(HW) \times L$, measuring how relevant each conditioning token is to each spatial position. This cross-attention mechanism is applied at every spatial scale throughout the encoder, allowing the model to incorporate temporal position information hierarchically and guide the generation of cardiac phases at specific time points.

%\paragraph{Decoder.} The decoder mirrors the encoder architecture with four corresponding stages that progressively increase spatial resolution while decreasing channel dimensions from 1024 to 128. At each decoder stage, the upsampled features from the previous stage are concatenated with skip connections from the corresponding encoder stage:
%\begin{equation}
%\tilde{h}^{(i)} = \text{Concat}(h_{\text{up}}^{(i)}, h_{\text{skip}}^{(i)}),
%\end{equation}
%where $h_{\text{up}}^{(i)}$ represents the upsampled decoder features at stage $i$ and $h_{\text{skip}}^{(i)}$ are the features from the encoder at the same resolution level. These skip connections enable the decoder to access fine-grained spatial details such as myocardial borders and valve structures from the encoder that might otherwise be lost during downsampling.

%Spatial resolution is increased through an upsampling operation followed by convolution:
%\begin{equation}
%h_{\text{up}}^{(i-1)} = \text{Conv}(\text{Upsample}_{\text{NN}}(\tilde{h}^{(i)})),
%\end{equation}
%where $\text{Upsample}_{\text{NN}}$ denotes nearest-neighbor upsampling that doubles both height and width dimensions, and $\text{Conv}$ is a $3 \times 3$ convolutional layer that refines the upsampled features. Each decoder stage applies three residual blocks (except the last stage which has three) with the same structure as the encoder, incorporating timestep embeddings through $W_t e_t$ projections, along with Transformer blocks that perform cross-attention with the temporal conditioning sequence. The final decoder stage omits the attention mechanism and relies solely on residual convolutions to produce the output features.

%The decoder cross-attention follows the same mathematical formulation as the encoder:
%\begin{equation}
%\text{DecoderCA}(Z,C) = \text{softmax}\left(\frac{(Z W_Q)(C W_{K_c})^\top}{\sqrt{d}}\right)(C W_{V_c}).
%\end{equation}
%This ensures that semantic alignment between the generated features and the temporal conditioning information is preserved throughout the reconstruction process at progressively finer spatial scales.

%\paragraph{Output Layer.} The final feature map undergoes normalization and activation:
%\begin{equation}
%h_{\text{out}} = \text{SiLU}(\text{GN}(h)),
%\end{equation}
%where the Group Normalization stabilizes the features and the SiLU activation introduces non-linearity. A final $3 \times 3$ convolution then projects these features to the output space:
%\begin{equation}
%\hat{\epsilon} = \text{Conv}_{3 \times 3}(h_{\text{out}}) \in \mathbb{R}^{4 \times 32 \times 32}.
%\end{equation}
%This output $\hat{\epsilon}$ represents the predicted noise in the latent space that should be removed from the noisy input during the denoising process.

%\subsection{Model implementation details}

%We implement the interpolation model as a conditional U-Net (`UNet2DConditionModel`) that operates in the VAE latent space. The network takes a 12-channel latent input formed by concatenating the noisy target latent ($4$ channels) with the ED and ES latents (each $4$ channels), producing an input tensor of shape $12\times32\times32$. The network predicts noise in the latent space and follows the structure below.

%The input first passes through a $3\times3$ convolution (`conv\_in`) that maps 12 channels to 128 channels. We encode diffusion timesteps using a `Timesteps` module (`time\_proj`) and a `TimestepEmbedding` MLP. The timestep embedding MLP projects a learned sinusoidal timestep encoding to a 512-dimensional vector using a linear–SiLU–linear sequence; this 512-D vector is used by residual blocks via a linear projection.

%The encoder is built from four hierarchical \texttt{CrossAttnDownBlock2D} stages with output channel widths \{128, 256, 512, 1024\}. Each down block contains:
%- residual blocks (`ResnetBlock2D`) that apply GroupNorm, SiLU, and $3\times3$ convolutions and that accept a linear projection of the 512-D timestep embedding (`time\_emb\_proj`);
%- Transformer-style modules (`Transformer2DModel`) that provide both self-attention and cross-attention. In these Transformer blocks, queries are derived from the spatial features while keys and values for cross-attention are derived from the conditioning tokens (the projected temporal embedding). The Transformer feed-forward layers use GEGLU nonlinearity and a large hidden dimension (e.g., 4× the feature width) to increase capacity.
%Downsampling between encoder stages uses a $3\times3$ stride-2 convolution (`Downsample2D`).

%The U-Net contains a mid-block (`UNetMidBlock2DCrossAttn`) at the bottleneck with the same pattern: residual blocks and a Transformer block with cross-attention. The mid-block works at the lowest spatial resolution and highest channel width (1024 channels), enabling global context integration.

%The decoder mirrors the encoder with \texttt{CrossAttnUpBlock2D} and a final \texttt{UpBlock2D}. Each up block performs nearest-neighbor upsampling followed by a $3\times3$ convolution, concatenates encoder skip connections, and applies residual blocks and Transformer blocks with both self-attention and cross-attention. Channel widths reduce symmetrically during upsampling (1024 → 512 → 256 → 128). The decoder Transformer blocks continue to inject the temporal conditioning token at each resolution to align local details with the target phase.

%All residual blocks use Group Normalization with 32 groups. Transformer blocks apply LayerNorm inside attention blocks. Attention projections use linear layers for queries, keys, and values. Cross-attention keys and values concatenate or project the conditioning tokens (the projected $\phi(\tau)$ and any additional context) to the necessary dimensionality (e.g., 768 when concatenating multiple tokens) before the attention computation. Feed-forward modules use GEGLU followed by a linear projection back to the feature dimension.

%The feature map after the final decoder stage passes through GroupNorm and a SiLU nonlinearity (`conv\_norm\_out`, `conv\_act`) and is projected by a final $3\times3$ convolution (`conv\_out`) from 128 channels to 4 latent channels. The network output $\hat{\epsilon}\in\mathbb{R}^{4\times32\times32}$ matches the VAE latent dimensionality and is used in the reverse diffusion sampling procedure.

%Overall, the implementation details match the following design principles: multi-scale fusion of anatomy (ED/ES) and time (learnable $\phi(\tau)$) via cross-attention, global spatial modeling via self-attention, and timestep conditioning injected into residual blocks. This configuration balances capacity and efficiency for anatomically faithful, temporally conditioned latent-space interpolation.





% Paste this TikZ diagram into your LaTeX document (compile with xcolor, tikz, and tikz libraries: shapes.geometric, arrows.meta)




\begin{figure*}[t]
    \centering
    \includegraphics[width=1.00\linewidth]{images/acdc_ablation.pdf}
      \caption{\footnotesize
Qualitative results on the ACDC dataset showing intermediate cardiac frame synthesis from ED to ES.
The yellow box contains all generated intermediate frames for each method. DiffTAC closely matches
ground-truth dynamics and preserves ventricular geometry throughout the cardiac cycle. In contrast,
the ablated variants—without the learnable temporal embedding, without cross-attention, or without
both—show motion inconsistencies and anatomical distortions, highlighted in the red boxes.
}


\label{acdc-1}
\end{figure*}







\subsection{Training Procedure}

We train the model to predict the diffusion noise added to latent frames conditioned on 
the end-diastolic and end-systolic latents and a temporal embedding. Let 
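$z_{\text{ed}}, z_{\text{es}} \in \mathbb{R}^{B \times \tfrac{H}{8} \times \tfrac{W}{8} \times C}$ denote the 
latent representations of the boundary frames, where $B$ is the batch size, $H \times W$ is the image resolution, and $C$ is the number of latent channels.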

\subsubsection{Training Setup}

At each iteration, we sample a temporal position $\tau\in[0,1]$ between end-diastole and end-systole, compute its learnable sinusoidal embedding $\phi(\tau)\in\mathbb{R}^d$, and project it through a small fully connected layer \emph{outside} the diffusion U\!-Net 
to form the conditioning token:
\begin{equation}
    c_\tau = W_{\text{proj}} \phi(\tau) \in \mathbb{R}^{1 \times d_c}.
\end{equation}
This projection prepares the temporal embedding for injection into the U\!-Net via the 
cross-attention layers. Given the ground-truth latent $z_\tau$ at phase $\tau$, the 
forward diffusion step is
\begin{equation}
    z_t = \sqrt{\bar{\alpha}_t}\, z_\tau + \sqrt{1 - \bar{\alpha}_t}\, \epsilon,
    \qquad \epsilon \sim \mathcal{N}(0, I),
\end{equation}
where $t$ is the diffusion timestep and $\bar{\alpha}_t$ is the cumulative noise schedule. The U\!-Net predicts the added noise conditioned on the noisy latent and the ED/ES 
boundary frames:
\begin{equation}
    \hat{\epsilon} = \text{U-Net}\big([z_t, z_{\text{ed}}, z_{\text{es}}],\; t,\; c_\tau \big).
\end{equation}
The timestep embedding encodes $t$, while the token $c_\tau$ provides temporal 
conditioning at all attention layers. Training minimizes the mean squared error 
between predicted and true noise:
\begin{equation}
    \mathcal{L} =
    \mathbb{E}_{z_\tau,\, \epsilon,\, \tau,\, t}
    \left[ \left\| \epsilon - \hat{\epsilon} \right\|_2^2 \right].
\end{equation}
This loss guides the model to denoise latent frames according to both the anatomy and 
the target temporal phase, enabling smooth intermediate-frame synthesis during inference.










\begin{figure*}[t]
    \centering
    \includegraphics[width=1.00\linewidth]{images/Copy of Copy of sunnybrook.pdf}
      \caption{\footnotesize
Qualitative results on the Sunnybrook dataset showing cardiac motion from ED to ES.
The yellow boxes contain all generated intermediate frames for each method. DiffTAC (Ours) produces
temporally smooth and anatomically consistent transitions that align well with ground-truth dynamics.
In contrast, the ablated variants—without the learnable temporal embedding (WLT), without
cross-attention (WCA), or without both (WLTA)—show motion artifacts and structural distortions,
which are marked with red boxes.
}

\label{sunny-1}
\end{figure*}

\begin{table}[t]
\centering
\caption{Intermediate frame generation performance on the Sunnybrook and ACDC datasets. Higher PSNR/SSIM and lower LPIPS indicate better reconstruction quality.}
\label{tab:combined_temporal}
\resizebox{0.60\linewidth}{!}{
\begin{tabular}{lcccc}
\hline
\textbf{Dataset} & \textbf{Method} & \textbf{PSNR$\uparrow$} & \textbf{SSIM$\uparrow$} & \textbf{LPIPS$\downarrow$} \\
\hline
\multirow{4}{*}{Sunnybrook}
 & Ours          & \textbf{39.660} & \textbf{0.922} & \textbf{0.011} \\
 & Linear        & 33.450 & 0.897 & 0.125 \\
 & Spline        & 34.850 & 0.901 & 0.101 \\
 & Optical Flow  & 29.500 & 0.860 & 0.172 \\
\hline
\multirow{4}{*}{ACDC}
 & Ours          & \textbf{33.950} & \textbf{0.898} & \textbf{0.110} \\
 & Linear        & 18.280 & 0.611 & 0.249 \\
 & Spline        & 18.600 & 0.617 & 0.242 \\
 & Optical Flow  & 16.690 & 0.569 & 0.335 \\
\hline
\end{tabular}
}
\end{table}


\begin{table}[t]
\centering
\caption{Comparison with learning-based temporal reconstruction methods on the Sunnybrook and ACDC datasets. Higher PSNR/SSIM and lower LPIPS indicate better performance. \textbf{Temporal SR} denotes the ability to generate cine sequences at denser temporal positions (higher frame rate) within the cardiac cycle.}
\label{tab:dl_comparison}
\resizebox{1.00\linewidth}{!}{
\begin{tabular}{llcccc}
\hline
\textbf{Dataset} & \textbf{Method} & \textbf{PSNR$\uparrow$} & \textbf{SSIM$\uparrow$} & \textbf{LPIPS$\downarrow$} & \textbf{Temporal SR} \\
\hline
\multirow{3}{*}{Sunnybrook}
 & UVI-Net \cite{kim2024dataefficientunsupervisedinterpolationintermediate}  & 32.313 & 0.896 & 0.121 & No \\
 & TSSC-Net \cite{zhou2025diffusion} & 28.767 & 0.874 & 0.132 & 6$\times$ \\
 & Ours     & \textbf{39.660} & \textbf{0.922} & \textbf{0.011} & \textbf{Flexible (2$\times$, 3$\times$, 4$\times$, ...)} \\
\hline
\multirow{3}{*}{ACDC}
 & UVI-Net \cite{kim2024dataefficientunsupervisedinterpolationintermediate}  & \textbf{35.091} & \textbf{0.925} & \textbf{0.035} & No \\
 & TSSC-Net \cite{zhou2025diffusion} & 34.677 & 0.917 & 0.040 & 6$\times$ \\
 & Ours     & 33.950 & 0.898 & 0.110 & \textbf{Flexible (2$\times$, 3$\times$, 4$\times$, ...)} \\
\hline
\end{tabular}
}
\end{table}






\section{Experiments and Results}
\label{exp}
\subsection{Datasets}

We evaluate DiffTAC on two public cine MRI datasets: the Sunnybrook Cardiac Data (SCD) 
\cite{radau2009evaluation} and ACDC \cite{bernard2018deep}. Both provide short-axis cine 
sequences with annotated end-diastolic (ED) and end-systolic (ES) frames. SCD contains 45 
subjects spanning healthy and pathological cases and provides \emph{full} cardiac cycles 
(ED→ES→ED) with roughly 20 frames acquired at 1.5T and an in-plane resolution of 1.25–1.5\,mm 
at $256\times256$ resolution. In contrast, ACDC includes 150 subjects across five diagnostic 
categories (NOR, MINF, DCM, HCM, RV) and provides only the \emph{ED→ES} half-cycle with 10 
standardized frames acquired on 1.5T or 3T scanners at 1.37–1.68\,mm resolution. All frames are 
center-cropped, resized to $256\times256$, and normalized to $[-1,1]$. We use an 80/20 
subject-level train–test split for both datasets.








\subsection{Implementation Details}

All images are resized to $256\times256$ and normalized to $[-1,1]$. A pretrained Stable-Diffusion VAE
encodes each frame into a $32\times32\times4$ latent; the VAE remains frozen throughout training. 
We train the diffusion model for 500 epochs using AdamW (learning rate $1\!\times\!10^{-4}$, 
weight decay $10^{-5}$), cosine learning-rate scheduling with 500 warmup steps, batch size~1, and 
gradient-accumulation steps~4. The noise process uses 1000 diffusion timesteps with scaled-linear 
$\beta$ scheduling. The learnable sinusoidal embedding table length is set to 200 and the 
cross-attention conditioning dimension to 768. During inference, we employ DDIM sampling with 100 denoising steps. For any target temporal 
position $\tau\in[0,1]$, the model predicts the corresponding latent $\hat{z}_\tau$ conditioned on 
the ED and ES latents $(z_{\mathrm{ed}}, z_{\mathrm{es}})$ and the time token $c_\tau$. 
The final reconstruction is obtained via the frozen VAE decoder.





\subsection{Evaluation Metrics}

We evaluate interpolation quality using three metrics.  
\textbf{PSNR} measures pixel-wise reconstruction accuracy.  
\textbf{SSIM} measures structural similarity.  
\textbf{LPIPS} \cite{zhang2018unreasonable} measures perceptual similarity using deep features; lower is better.   These metrics assess fidelity, perceptual quality, and temporal stability. 
As shown in Table~\ref{tab:combined_temporal}, DiffTAC achieves strong performance across all metrics. 
Latent-space denoising preserves anatomical detail, and the temporal embedding with cross-attention 
improves phase alignment and global coherence.







\begin{table}[t]
\centering
\caption{Ablation study on the Sunnybrook and ACDC datasets. Higher PSNR/SSIM and lower LPIPS indicate better reconstruction quality.}
\label{tab:combined_ablation}
\resizebox{0.8\linewidth}{!}{
\begin{tabular}{lcccc}
\hline
\textbf{Dataset} & \textbf{Configuration} & \textbf{PSNR$\uparrow$} & \textbf{SSIM$\uparrow$} & \textbf{LPIPS$\downarrow$} \\
\hline
\multirow{4}{*}{Sunnybrook}
 & Full Model & \textbf{39.660} & \textbf{0.922} & \textbf{0.011} \\
 & w/o Learnable Time Embedding & 34.105 & 0.900 & 0.116 \\
 & w/o Cross-Attention & 28.740 & 0.866 & 0.167 \\
 & w/o Time \& Cross-Attention & 20.680 & 0.811 & 0.205 \\
\hline
\multirow{4}{*}{ACDC}
 & Full Model & \textbf{33.950} & \textbf{0.898} & \textbf{0.110} \\
 & w/o Learnable Time Embedding & 24.640 & 0.850 & 0.181 \\
 & w/o Cross-Attention & 21.400 & 0.845 & 0.195 \\
 & w/o Time \& Cross-Attention & 16.880 & 0.589 & 0.274 \\
\hline
\end{tabular}
}
\end{table}





\begin{table}[t]
\centering
\caption{Ablation study on temporally interpolated frames (2× temporal super-resolution). Lower FID indicates better realism.}
\label{tab:fid_extrap_ablation}
\resizebox{0.8\linewidth}{!}{
\begin{tabular}{lcc}
\hline
\textbf{Configuration} & \textbf{Sunnybrook FID$\downarrow$} & \textbf{ACDC FID$\downarrow$} \\
\hline
Full Model & \textbf{23.060} & \textbf{60.170} \\
w/o Learnable Time Embedding & 40.770 & 90.610 \\
w/o Cross-Attention & 67.970 & 152.870 \\
w/o Time \& Cross-Attention & 50.120 & 137.940 \\
\hline
\end{tabular}
}
\end{table}



\subsection{Quantitative Results}

Table~\ref{tab:combined_temporal} summarizes interpolation performance on the Sunnybrook and ACDC datasets. DiffTAC achieves the highest PSNR and SSIM and the lowest LPIPS on both datasets. Linear and spline interpolation offer limited improvements because they operate directly on pixel intensities and cannot model nonlinear cardiac deformation. Optical flow performs worse due to unreliable motion estimates in regions with rapid contraction or low contrast.

DiffTAC outperforms all baselines because it models cardiac motion as a smooth trajectory in latent space. The ED and ES latents provide anatomical anchors, and the temporal embedding steers the diffusion process toward the correct phase. This combination yields sharper structures, better texture reconstruction, and smoother temporal transitions, as also seen qualitatively in Figures~\ref{acdc-1} and~\ref{sunny-1}. For more visual results see Appendix~\ref{apex d}.



\subsection{Comparison with State of the Art}
Table~\ref{tab:dl_comparison} compares DiffTAC with recent learning-based temporal reconstruction methods. We include UVI-Net \cite{kim2024dataefficientunsupervisedinterpolationintermediate}, a learnable flow-based volumetric interpolation method, and TSSC-Net \cite{zhou2025diffusion}, a diffusion-based temporal super-resolution approach that can achieve fixed $6\times$ upsampling using start and end frames. On the Sunnybrook dataset, which contains longer cardiac sequences with a larger number of intermediate frames, DiffTAC substantially outperforms both methods across all metrics, achieving higher PSNR and SSIM and markedly lower LPIPS, indicating more accurate and perceptually consistent reconstructions. On the ACDC dataset, UVI-Net attains higher image-based metrics, while DiffTAC performs phase-conditioned synthesis with flexible temporal scaling. Unlike TSSC-Net, which is limited to a fixed temporal factor, DiffTAC supports arbitrary temporal super-resolution within a unified framework. For more results on temporal super-resolution with DiffTAC, please refer to Appendix~\ref{apex c}.













\subsection{Ablation Studies}

We evaluate the impact of the learnable temporal embedding and cross-attention in Table~\ref{tab:combined_ablation}. Removing the temporal embedding reduces the model's ability to identify the target phase, decreasing reconstruction accuracy and temporal smoothness. Removing cross-attention prevents effective fusion of temporal information with spatial features, leading to blurrier structures and weaker perceptual quality. Disabling both components produces the largest degradation. The qualitative effects are visible in Figures~\ref{acdc-1} and~\ref{sunny-1}, where ablated variants exhibit motion artifacts and anatomical inconsistencies. These results show that DiffTAC benefits from three elements: latent-space denoising, explicit temporal conditioning, and the Integrated Attention Block for multi-scale fusion (see Appendix~\ref{apex d} for more).


\subsection{Temporal Super-Resolution}

Beyond interpolation, DiffTAC performs temporal super-resolution by generating cardiac phases 
within the acquired ED--ES interval. We evaluate 2×, 3×, 4×, and 5× temporal 
super-resolution by sampling the temporal position $\tau \in [0,1]$ at increasingly dense 
intervals (e.g., $\tau \in \{0.10, 0.15, 0.20, \ldots\}$ for higher upsampling factors). As the 
upsampling factor increases, the model synthesizes more intermediate frames, and we observe a 
gradual reduction in visual quality, consistent with the increasing difficulty of high-ratio 
in-between frame synthesis. Since no ground-truth frames exist at these densified temporal 
positions, we assess realism using the Fréchet Inception Distance (FID) \cite{yu2021frechet}, as shown in 
Table~\ref{tab:fid_extrap_ablation}. DiffTAC achieves the lowest FID across both datasets. Removing the 
learnable temporal embedding or cross-attention leads to higher FID, indicating the importance 
of these components when no ground truth is available. These results show that DiffTAC provides 
reliable temporal super-resolution over a range of upsampling factors and can generate higher-frame-rate 
cine sequences without additional MRI acquisition. For more results see Appendix~\ref{apex c}.

\section{Discussion}
\label{discussion}
The learnable sinusoidal temporal embedding provides a continuous, smooth representation of the 
target phase $\tau$, which is essential for modeling cardiac motion. Standard positional encodings 
offer a fixed Fourier basis, but the learnable variant adapts this basis to the training distribution, 
allowing the network to capture dataset-specific temporal progression. Because cardiac motion evolves smoothly and follows a cyclic pattern, representing $\tau$ through a mixture of learnable sinusoidal 
components creates an implicit temporal manifold on which intermediate phases lie. The projection 
$c_\tau = W_{\mathrm{proj}}\phi(\tau)$ then maps this trajectory into the conditioning space, enabling 
the diffusion model to treat time as a continuous control signal rather than a discrete index.

During training, the end-diastolic (ED) and end-systolic (ES) frames are not used as supervision targets; instead, they are encoded using a frozen VAE and concatenated with the noisy latent of the target intermediate frame as conditioning inputs to the diffusion U-Net. This allows the model to learn view- and anatomy-specific feature representations, while supervision is applied only through noise prediction on the intermediate latent. Conditioning on ED and ES constrains generation to the correct anatomical feature space, which is important because cardiac MRI can be acquired from different views (e.g., 2-chamber, 4-chamber, short-axis).  By providing ED and ES latents, the model is guided to synthesize cine sequences that remain consistent with the given input anatomy and imaging plane.

The Integrated Attention Block (IAB) formalizes how temporal information modulates spatial 
representations. In diffusion models, self-attention alone captures spatial correlations, but it 
cannot enforce temporal alignment between ED and ES. Cross-attention provides a mechanism for 
injecting the temporal embedding into each spatial location. By using the same queries as 
self-attention but replacing keys and values with the temporal token, the model learns a linear 
operator that selects temporal features relevant to each spatial region. This gives a principled 
way to condition the denoising trajectory on the desired phase. The residual combination of 
self-attention, cross-attention, and feed-forward updates forms a shallow approximation to a 
dynamical system in which spatial features evolve under a temporally modulated flow field. 

Together, the learnable temporal embedding and IAB structure allow the network to encode temporal position as a continuous latent variable and to apply it consistently across all spatial scales. 
This yields a model that is sensitive to temporal ordering while remaining robust to anatomical 
variation, enabling coherent intermediate-frame generation and higher-order temporal 
super-resolution.

\section{Potential Applications}

The ability of DiffTAC to generate temporally super-resolved cardiac phases within the acquired 
ED–ES interval enables several practical applications in cardiac imaging and computational 
modeling. First, the method supports high–frame-rate cine reconstruction, producing temporally 
dense image sequences without requiring longer or repeated MRI acquisitions. Such high-resolution 
temporal data can improve clinical tasks including myocardial strain estimation \cite{amzulescu2019myocardial}, regional 
wall-motion assessment \cite{wahba2001assessment}, detection of subtle functional abnormalities, and enhanced visualization 
of rapid physiological events that may not be captured at standard frame rates.

Second, DiffTAC can facilitate fluid–structure interaction (FSI) \cite{kock2008mechanical} studies by providing smooth and 
temporally continuous myocardial boundary motion. This enables more accurate numerical 
simulations of ventricular hemodynamics, leading to improved characterization of blood flow 
patterns and pressure fields across the cardiac cycle.

Third, the framework is useful for digital-twin cardiac modeling \cite{zhao2025current}, where patient-specific dynamic 
sequences are required to calibrate or validate personalized biophysical models. By generating 
anatomically consistent intermediate phases, DiffTAC helps fill temporal gaps in sparsely sampled 
cine data.

Finally, the ability to synthesize physiologically realistic motion trajectories \cite{liu2024cardiac} can support downstream tasks such as motion correction, data augmentation, temporal harmonization across heterogeneous datasets, and the creation of dense training sequences for models requiring fine-grained temporal supervision.







\section{Conclusion}
\label{conc}
We present DiffTAC, a latent diffusion framework for cardiac cine interpolation and temporal 
super-resolution. By treating time as an explicit conditioning variable and combining ED/ES 
anatomical context with the proposed Integrated Attention Block, the model generates smooth and 
anatomically consistent intermediate frames across the cardiac cycle. Experiments on the 
Sunnybrook and ACDC datasets show strong improvements over interpolation and optical-flow 
baselines, as well as clear benefits from our design choices through ablation studies. DiffTAC 
also synthesizes frames at denser temporal positions within the acquired ED--ES interval, enabling flexible temporal super-resolution 
without additional scan burden. These results suggest that diffusion models with explicit 
temporal conditioning offer a promising direction for reconstructing high-frame-rate cardiac 
cine MRI in clinically constrained acquisition settings.















































%\midlacknowledgments{We thank a bunch of people.}









\bibliography{midl26_37}


\appendix

\section{Mathematical basis of the Learnable Sinusoidal Time Embedding}
\label{app:time-embedding}

Let $\tau\in[0,1]$ denote a normalized temporal position. We map $\tau$ to a discrete table 
index via
\begin{equation}
p(\tau)=\left\lfloor \tau (L-1)\right\rfloor,\qquad 
p(\tau)\in\{0,\dots,L-1\},
\end{equation}
where $L$ is the embedding-table length. The corresponding base sinusoidal embedding 
$\mathrm{PE}(p)\in\mathbb{R}^d$ is defined componentwise as
\begin{equation}
\mathrm{PE}_{2i}(p)=\sin(p\,\omega_i),\qquad
\mathrm{PE}_{2i+1}(p)=\cos(p\,\omega_i),
\qquad
\omega_i = 10000^{-2i/d}.
\end{equation}
This provides a smooth Fourier-like basis over the discrete index $p$. To adapt the embedding to dataset-specific temporal dynamics, we apply an affine modulation:
\begin{equation}
\phi(\tau)=\mathrm{clip}(s)\odot\mathrm{PE}\big(p(\tau)\big)+b,
\end{equation}
where $s,b\in\mathbb{R}^d$ are learnable scale and shift parameters, and 
$\odot$ denotes element-wise multiplication. The clamping of $s$ improves 
numerical stability and controls the amplitude of high-frequency components.

In our setting, a cine sequence consists of $T$ ordered frames 
$\{x_1,\ldots,x_T\}$ with annotated end-diastolic (ED) and end-systolic (ES) 
frames. For Sunnybrook, the full cardiac cycle ED→ES→ED is available, so we 
normalize temporal positions such that the first ED frame maps to $\tau=0$ and 
the final ED frame maps to $\tau=1$. The ES frame naturally lies at some 
intermediate $\tau\in(0,1)$ depending on its frame index. For ACDC, only an 
ED→ES half-cycle is provided; therefore ED is mapped to $\tau=0$ and ES to 
$\tau=1$. All intermediate frames occupy evenly spaced $\tau$ values in their 
respective ranges, while temporal super-resolution is performed by sampling 
$\tau$ more densely within $[0,1]$.



The resulting modulated embedding $\phi(\tau)$ is finally projected to
\begin{equation}
c_\tau = W_{\mathrm{proj}}\phi(\tau),
\end{equation}
which serves as the temporal conditioning token for cross-attention within the IAB modules. This construction admits several useful properties. (i) \emph{Local continuity}: adjacent values of \(\tau\) map to nearby indices \(p(\tau)\), and the sinusoidal basis varies smoothly with \(p\), so \(c_\tau\) is locally continuous in \(\tau\) (modulo discretization). (ii) \emph{Expressivity}: the sinusoidal basis provides a set of frequency components (Fourier features), and the learnable affine parameters allow the model to reweight and shift these components to emphasize dataset-specific temporal patterns. (iii) \emph{Generalization}: because the base uses sinusoidal components rather than arbitrary lookup vectors, the embedding extrapolates in a structured way beyond seen indices, giving stable behavior for dense sampling of \(\tau\) during temporal super-resolution.

Gradients backpropagate through \(s\) and \(b\) directly. Since the dataset provides a fixed set of temporal positions (the eight interior frames between ED and ES), \(\tau\) takes only these discrete values during training. Thus the mapping \(p(\tau)\) is effectively fixed for all training examples, and the learnable affine parameters \(s\) and \(b\) adapt to these known temporal indices. During inference, however, we evaluate the embedding at dense values of \(\tau\) (e.g.\ \(\tau=0.05,0.10,0.15,0.20,0.25, \ldots\)) for temporal super-resolution. The sinusoidal basis ensures that these unseen positions still produce smooth, structured embeddings, while the learned affine modulation preserves dataset-adapted temporal trends. Together, this yields a compact, stable representation for providing continuous temporal conditioning to the cross-attention layers in the IAB modules.




\begin{figure*}[t]
    \centering
    \includegraphics[width=1.00\linewidth]{images/multi_row_all_tight.png}
      \caption{\footnotesize
Temporal super resolution on cardiac sequences on the ACDC dataset. 
The top row presents 2$\times$ temporal super-resolution, and the bottom row presents 3$\times$ temporal super-resolution, 
showing denser in-between frames within the acquired ED--ES interval. 
Due to the large number of generated frames, images are shown at a reduced scale; please zoom in to view structural details clearly.}


    \label{extra-acdc}
\end{figure*}


\begin{figure*}[t]
    \centering
    \includegraphics[width=1.00\linewidth]{images/sunnyy-multi_row_all_tight.png}
      \caption{\footnotesize
Temporal super resolution on cardiac sequences on the Sunnybrook dataset. 
The top row shows 2$\times$ temporal super-resolution, and the bottom row shows 3$\times$ temporal super-resolution, 
both producing denser in-between frames within the acquired ED--ES interval. 
Because the figure contains many generated frames, images are displayed at reduced size; 
please zoom in for detailed visualization.}


    \label{extra-sunny}
\end{figure*}




\begin{figure*}[t]
    \centering
    \includegraphics[width=1.00\linewidth]{images/apendix-sunnybrook.pdf}
      \caption{\footnotesize
Supplementary qualitative results on the Sunnybrook dataset. 
Generated frames (yellow box) illustrate the temporal progression within the ED--ES interval. 
DiffTAC maintains smooth motion and structural consistency, whereas 
models without temporal embedding (WLT), without cross-attention (WCA), 
or without both (WLTA) show irregular transitions and reduced stability (red markers).
}

    \label{add-sunny-1}
\end{figure*}

\begin{table}[t]
\centering
\caption{\textbf{FID scores for temporal super resolution at increasing super-resolution factors.} 
We evaluate 2×, 3×, 4×, and 5× temporal super-resolution on the Sunnybrook and ACDC datasets. 
Lower FID indicates closer alignment to the distribution of real cardiac cine MRI images.}
\label{tab:extrapolation_fid}
\begin{tabular}{lcccc}
\hline
\textbf{Dataset} & \textbf{2×} & \textbf{3×} & \textbf{4×} & \textbf{5×} \\
\hline
Sunnybrook & 27.32 & 32.58 & 34.42 & 37.22 \\
ACDC       & 62.33 & 66.18 & 67.24 & 71.24 \\
\hline
\end{tabular}
\end{table}
\section{Experiments on Temporal Super-Resolution}
\label{apex c}
In addition to the 2× super-resolution experiments reported in the main manuscript, we further 
evaluate DiffTAC under 3×, 4×, and 5× temporal super-resolution. For each scaling factor, the temporal 
position $\tau$ is sampled more densely within the interval $[0,1]$, and the model generates the 
corresponding intermediate frames. Because ground-truth images do not exist for these densified 
positions, we assess realism using the Fréchet Inception Distance (FID). As shown in 
Table~\ref{tab:extrapolation_fid}, FID increases gradually with the interpolation factor, which 
reflects the increasing difficulty of predicting frames farther from the ED--ES boundary frames. 
Even so, DiffTAC preserves stable performance across all settings.

Figures~\ref{extra-acdc} and~\ref{extra-sunny} provide visual examples of 2× and 3× temporal super resolution 
on the ACDC and Sunnybrook datasets, respectively. The generated sequences display smooth temporal 
evolution and coherent anatomical structure. Due to the large number of frames, images are shown 
at reduced scale, and we recommend zooming in for detailed inspection. These examples demonstrate 
that DiffTAC can synthesize plausible long-range temporal trajectories, enabling high-frame-rate 
cine reconstruction without requiring additional MRI acquisition.
















\begin{figure*}[t]
    \centering
    \includegraphics[width=1.00\linewidth]{images/apendix.pdf}
      \caption{\footnotesize
Additional qualitative examples from the ACDC dataset. 
The yellow box highlights all generated intermediate frames. 
DiffTAC produces coherent temporal transitions across the cardiac cycle, 
whereas the ablated variants exhibit less stable evolution of cardiac structures 
and reduced temporal smoothness, indicated by red markers.
}
    \label{add-acdc-1}
\end{figure*}










\section{Additional Visual Results}
\label{apex d}
In this section, we provide extended qualitative examples for both datasets used in our study. 
 Figure~\ref{add-sunny-1} shows additional intermediate-frame generation results on the Sunnybrook dataset, 
illustrating how DiffTAC produces smooth and coherent transitions across the ED--ES interval. 
Figure~\ref{add-acdc-1} presents corresponding results on the ACDC dataset, where the model 
maintains anatomical consistency and temporal continuity across all generated phases. 
These examples further highlight the effect of removing temporal conditioning or cross-attention, 
as the ablated variants exhibit visible disruptions in motion progression and spatial alignment.










\section{Limitations}

While DiffTAC shows strong performance in both interpolation and temporal super-resolution, 
several practical considerations remain. The model uses the ED and ES frames as boundary anchors, 
which is a clinically realistic setting but may limit performance if these frames are severely 
corrupted. Nevertheless, this dependency is far milder than that of methods requiring full-frame inputs, 
segmentation masks, or optical-flow estimates.

Temporal super-resolution performance decreases moderately as the super-resolution factor increases, 
as reflected by rising FID values in Table~\ref{tab:extrapolation_fid}. This behavior is expected: 
the model is asked to synthesize motion increasingly distant from any acquired data. Importantly, 
DiffTAC still maintains consistent anatomical structure and temporal smoothness even at higher 
factors, unlike traditional interpolation or motion-based methods that degrade rapidly in the 
same regime. Finally, although diffusion models are computationally heavier than direct neural interpolators, 
operating in latent space significantly reduces cost and makes inference practical for cine MRI. 
Given the substantial gains in temporal coherence and realism, this trade-off is favorable for 
applications that require high-quality dynamic reconstruction.



\end{document}

