\documentclass{midl} % Include author names

% The following packages will be automatically loaded:
% jmlr, amsmath, amssymb, natbib, graphicx, url, algorithm2e
% ifoddpage, relsize and probably more
% make sure they are installed with your latex distribution

\usepackage{mwe} % to get dummy images
\usepackage{algorithm}
\usepackage{algpseudocode}
 \usepackage{multirow} 
\usepackage[utf8]{inputenc}
\usepackage{textgreek}
\usepackage{booktabs}       % professional-quality ta
\usepackage{nicefrac}       % compact symbols for 1/2, etc.
\usepackage{microtype}      % microtypography
\usepackage{xcolor}         % colors
\usepackage{tcolorbox}
\usepackage[table]{xcolor}
\jmlrvolume{-- nnn}
\jmlryear{2026}
\jmlrworkshop{Full Paper -- MIDL 2026 submission}
\editors{Accepted for publication at MIDL 2026}

\title[GenVOG-DiT]{GenVOG-DiT: A Transformer-Based Diffusion Model for Pose-Driven, Patient-Agnostic Nystagmus VOG Video Generation}

 % Use \Name{Author Name} to specify the name.
 % If the surname contains spaces, enclose the surname
 % in braces, e.g. \Name{John {Smith Jones}} similarly
 % if the name has a "von" part, e.g \Name{Jane {de Winter}}.
 % If the first letter in the forenames is a diacritic
 % enclose the diacritic in braces, e.g. \Name{{\'E}louise Smith}

 % Two authors with the same address
 % \midlauthor{\Name{Author Name1} \Email{abc@sample.edu}\and
 %  \Name{Author Name2} \Email{xyz@sample.edu}\\
 %  \addr Address}

 % Three or more authors with the same address:
 % \midlauthor{\Name{Author Name1} \Email{an1@sample.edu}\\
 %  \Name{Author Name2} \Email{an2@sample.edu}\\
 %  \Name{Author Name3} \Email{an3@sample.edu}\\
 %  \addr Address}


% Authors with different addresses:
% \midlauthor{\Name{Author Name1} \Email{abc@sample.edu}\\
% \addr Address 1
% \AND
% \Name{Author Name2} \Email{xyz@sample.edu}\\
% \addr Address 2
% }

%\footnotetext[1]{Contributed equally}

% More complicate cases, e.g. with dual affiliations and joint authorship
% \midlauthor{\Name{Author Name1\midljointauthortext{Contributed equally}\nametag{$^{1,2}$}} \orcid{1111-2222-3333-4444} \Email{abc@sample.edu}\\
% \addr $^{1}$ Address 1 \\
% \addr $^{2}$ Address 2 \AND
% \Name{Author Name2\midlotherjointauthor\nametag{$^{1}$}} \Email{xyz@sample.edu}\\
% \Name{Author Name3\nametag{$^{2}$}} \Email{alphabeta@example.edu}\\
% \Name{Author Name4\midljointauthortext{Contributed equally}\nametag{$^{3}$}} \Email{uvw@foo.ac.uk}\\
% \addr $^{3}$ Address 3 \AND
% \Name{Author Name5\midlotherjointauthor\nametag{$^{4}$}} \Email{fgh@bar.com}\\
% \addr $^{4}$ Address 4
% }

\midlauthor{
\Name{Aimon Rahman\nametag{$^{1}$}} \Email{arahma30@jhu.edu}\\
\addr $^{1}$ Johns Hopkins University, Department of Electrical and Computer Engineering, Baltimore, MD, USA
\AND
\Name{Kemar E. Green\nametag{$^{2}$}} \Email{kgreen66@jhmi.edu}\\
\addr $^{2}$ Johns Hopkins University, Department of Neurology, Baltimore, MD, USA
\AND
\Name{Vishal M. Patel\nametag{$^{1}$}} \Email{vpatel36@jhu.edu}\\
}


\begin{document}

\maketitle
\begin{abstract}
Nystagmus, an involuntary eye movement indicative of neurological and vestibular disorders, is traditionally diagnosed using costly equipment or expert visual inspection, both of which limit accessibility in nonspecialist settings. Recent advances in computer vision and deep learning present an opportunity to automate the detection of nystagmus from standard video recordings. However, progress is hindered by the scarcity of publicly available video datasets due to privacy concerns surrounding ocular biometric data. In this work, we propose the use of synthetically generated eye movement videos to mitigate data limitations. Using video diffusion models, we simulate diverse clinically plausible nystagmus patterns without relying on real patient data, enabling scalable training while preserving privacy. We show that models trained on synthetic data generalize effectively to real-world settings and have potential for integration into telehealth applications. Our approach advances the development of accessible, generalizable, and privacy-aware diagnostic tools for eye movement disorders.
\end{abstract}

\begin{keywords}
Video Generation, Diffusion Transformer, Nystagmus.
\end{keywords}

\section{Introduction}
Nystagmus is an involuntary rhythmic oscillation of the eyes characterized by a slow drift of the eye in one direction followed by a corrective movement in the opposite direction~\cite{wagle2022aeye,leigh2015-1,anastasio2022-2}. It can occur in various planes (horizontal, vertical, or torsional) and typically presents in one of two waveform patterns: jerk nystagmus, where the eyes drift slowly in one direction and then jerk back quickly, and pendular nystagmus, where the eyes swing back and forth in a steady, pendulum-like motion without a distinct fast phase. Although the direction, velocity, and pattern of the nystagmus can localize dizziness to a peripheral vestibular disorder or a central brainstem or cerebellar lesion, and even outperform early neuroimaging in identifying dangerous brainstem strokes, these subtle eye movements often go unrecognized by front-line providers without specialized training in neuro-ophthalmology or neuro-otology~\cite{wagle2022aeye}. Electronystagmography (ENG, electrode-based) and videonystagmography (VNG, infrared goggles) offer quantitative nystagmus metrics (e.g., slow-phase velocity via caloric tests) but are costly, bulky, and confined to specialist centers. Consequently, most clinicians rely on subjective bedside exams that often miss subtle nystagmus, driving demand for portable, accurate detection methods.

%Recent advances in computer vision and deep learning offer a promising avenue to address this need \cite{punuganti2019-41,phillips2019-42,newman2020-43,newman2021-44,reinhardt2020-45,wagle2022-46,zhang2021-47,lu2022-48}. Deep learning models can be trained to recognize the telltale patterns of nystagmus from ordinary video recordings, effectively mimicking what an expert sees but with quantitative consistency. Recent advances in computer vision and deep learning now enable quantitative, expert-level nystagmus detection from ordinary video recordings. Models can segment pupils, reconstruct eye trajectories, classify different nystagmus patterns, and compute metrics such as slow-phase velocity to generate clinician-friendly reports. These approaches promise cost-effective, portable, and telehealth-compatible solutions, allowing patients to record eye movements on a smartphone and receive automated analysis. Despite this progress, there remain significant challenges that motivate further research. A fundamental hurdle is the lack of large, high-quality training datasets for nystagmus in videos \cite{lohr2020eye,lohr2022eye,zola2021use}.  Even when video-oculography data are collected in research or specialist clinics, privacy and regulatory concerns often prevent sharing of patient videos, since they can contain identifiable features and constitute sensitive biometric health information. This scarcity of open data limits the development and validation of deep learning models, models trained on small, single-center datasets may not generalize well to broader patient populations or different recording conditions.


Recent advances in computer vision and deep learning offer a promising path toward automated, expert-level detection of nystagmus from standard video recordings~\cite{punuganti2019-41,phillips2019-42,newman2020-43,newman2021-44,reinhardt2020-45,wagle2022-46,zhang2021-47,lu2022-48}. Deep learning models can now replicate what clinicians observe---segmenting pupils, tracking eye trajectories, classifying nystagmus types, and computing metrics like slow-phase velocity---to produce quantitative, clinician-friendly reports. These technologies open the door to low-cost, portable, and telehealth-compatible solutions, enabling patients to record eye movements on a smartphone and receive automated assessments remotely.

Despite this progress, key challenges remain. Chief among them is the lack of large, high-quality training datasets for nystagmus detection in video~\cite{lohr2020eye,lohr2022eye,zola2021use}. Even when video-oculography data are available in research or specialty clinics, privacy and regulatory concerns often hinder data sharing, as videos may contain identifiable features and sensitive biometric health information. This scarcity of open datasets constrains the development and generalizability of deep learning models, which are often trained on limited, single-center data that may not perform reliably across diverse patient populations or real-world conditions.

To overcome the data bottleneck, researchers have started exploring synthetic data generation for nystagmus~\cite{guibas2017synthetic,garcea2023data,wang2023deep}. Emerging work in video generation models suggests that it is possible to create realistic eye movement videos that exhibit specific nystagmus waveforms, without using any real patient video in the training process~\cite{rahman2025genvog}. Such controllable video generation allows tuning parameters like the nystagmus direction, amplitude, and frequency, producing a wide range of scenarios to train robust models. By leveraging open-source simulation data or procedurally generated eye movement videos, deep learning models could be trained on diverse ``virtual'' patients, making them more generalizable. In addition, synthetic data sidesteps privacy issues, enabling data sharing and collaborative research without risking confidential patient information.

To address the limitations in data availability and support generalizable, privacy-preserving nystagmus detection, we propose a novel framework that leverages synthetic waveform modeling and video diffusion transformers. Our approach begins by mathematically modeling diverse synthetic nystagmus waveforms, incorporating variations in direction, amplitude, frequency, and noise to simulate real-world conditions. These waveforms are first validated through a deep learning-based waveform classifier trained on synthetic data and tested on waveforms from real-world nystagmus patient data.

To generate realistic eye movement videos, we condition a video diffusion model on synthetic pupil motion trajectories inspired by real-world video-oculography recordings. However, due to GPU memory constraints and the challenge of generating clinically viable long-form videos, the model alone cannot generate smooth, fine-grained nystagmus dynamics. To overcome this, we introduce a two-step generation pipeline: pupil segmentation masks are first generated from synthetic waveforms and then refined using a flow-based interpolation model to improve temporal consistency and clinical realism. Finally, we extract the waveforms from generated videos and quantitatively compare them against real patient data for validation. The contributions are summarized as:

\begin{itemize}
    \item \textit{Synthetic waveform generation:} We propose a mathematical modeling pipeline to generate diverse, realistic nystagmus waveforms with controllable parameters (e.g., noise, amplitude, frequency).
    \item \textit{Cross-domain waveform validation:} A waveform classifier trained solely on synthetic waveforms is evaluated on real patient data, demonstrating the clinical relevance of our synthetic dataset.
    \item \textit{Pupil-conditioned video diffusion:} We adapt a video diffusion model conditioned on real-world pupil trajectories to generate synthetic nystagmus videos.
    \item \textit{Flow-based waveform refinement:} To address the coarse nature of segmentation mask outputs, we integrate a flow-based model to interpolate and refine pupil motion, enhancing temporal smoothness and clinical fidelity.
    \item \textit{End-to-end waveform evaluation:} We extract and analyze waveform characteristics from generated videos and benchmark them against real patient waveforms to assess the realism and diagnostic utility of our approach.
\end{itemize}
This framework enables privacy-preserving, scalable nystagmus video synthesis, paving the way for robust, generalizable deep learning models in neuro-ophthalmology.

\begin{figure*}[t]
    \centering
    \includegraphics[width=1\textwidth]{figs/main_figs.pdf}

    \caption {\textbf{Architecture of our pupil-conditioned video diffusion transformer.}
The model adds a lightweight ControlNet to CogVideoX-2B, using CNN-encoded pupil masks to guide generation at every transformer layer. Real masks supervise training, whereas synthetic masks drive inference. A decoder reconstructs frames, and IFNet~\cite{huang2022real} upscales temporal resolution at inference only.}
    \label{fig:main_fig}
\end{figure*}

\section{Modeling Synthetic Nystagmus Waveforms}
To capture the diverse dynamics of nystagmus, we model the horizontal or vertical displacement of the pupil, \(P(t)\), as a function of time, \(t\), using a set of parametric equations. In all models below, \(A\) denotes the peak excursion (amplitude), \(\omega\) the angular frequency of oscillation, and \(\phi\) a phase offset that aligns the waveform with its initial condition. Further details are provided in Appendix~\ref{sec1}.

\section{Controllable Video Diffusion Transformer}
We build on the pretrained CogVideoX-2B text-to-video diffusion model by adding a lightweight ControlNet~\cite{chen2023control} branch for spatial conditioning (shown in Figure~\ref{fig:main_fig}). In the following, let $x_0 \in \mathbb{R}^{T\times H\times W\times 3}$ be a ground-truth video, $p$ its text prompt embedding, $c$ a control map of shape $T\times H\times W$, and $\epsilon_\theta(\cdot)$ the model's noise predictor. For the video diffusion transformer, we encode $x_0$ via a 3D VAE:
\[
z_0 = \mathcal{E}(x_0)\in \mathbb{R}^{L\times d}, 
\quad L=T'H'W',\; d=\text{latent dim}.
\]
The expert transformer then iteratively denoises:
\[
z_t = z_{t-1} + \mathrm{TransformerBlock}\bigl(z_{t-1},\,p\bigr)\,,\quad t=1,\dots,T.
\]

For controllable generation, a small CNN encoder $\Phi$ maps $c$ into a feature tensor $C\in\mathbb{R}^{L\times d}$.  For each transformer layer $l$, we add a zero‑initialized injection $W^{(l)}$:
\begin{align}
C &= \Phi(c), \\
\tilde z^{(l)} &= \mathrm{Block}^{(l)}\bigl(z^{(l-1)},\,p\bigr), \\
z^{(l)} &= \tilde z^{(l)} \;+\; W^{(l)}(C)\,.
\end{align}
Since $W^{(l)}$ starts at zero, the pretrained behavior is preserved until fine‑tuning. We train only $\{\Phi,\,W^{(l)}\}$ by minimizing the standard DDPM loss with classifier‑free guidance on both $p$ and $c$:
\begin{align}
\mathcal{L} &= \mathbb{E}_{t,\,x_0,\,\epsilon}\Bigl\|\epsilon - \epsilon_\theta\bigl(z_t,\,p,\,C,\,t\bigr)\Bigr\|^2,\\
z_t &= \sqrt{\bar\alpha_t}\,z_0 + \sqrt{1-\bar\alpha_t}\,\epsilon,\quad \epsilon\sim\mathcal{N}(0,I).
\end{align}



\section{Synthetic Waveform-Guided Inference}
We now describe how to convert a 1D pupil waveform into a sequence of spatial masks for controllable video inference. Let
\[
w = \{w_i\}_{i=1}^{N},\quad w_i \in \mathbb{R}
\]
be the sampled pupil positions over time. Our transformer can generate at most $T'=41$ frames, so we first coarse‐sample $w$ at $T'$ indices:
\[
S_i = \bigl\lfloor \tfrac{(i-1)\,(N-1)}{T'-1}\bigr\rfloor + 1,\quad
\tilde w_i = w_{S_i},\quad i=1,\dots,T'.
\]

Each $\tilde w_i$ is converted into a binary mask $c_i\in\{0,1\}^{H\times W}$ by placing a disk of radius
\[
r_i = \gamma\,\tilde w_i
\]
centered at $(u_0, v_0)$ (the pupil centroid):
\[
c_i(x,y) = 
\begin{cases}
1, & (x - u_0)^2 + (y - v_0)^2 \le r_i^2,\\
0, & \text{otherwise}.
\end{cases}
\]
Stacking these yields the control map
\[
c = \bigl[c_1,\,c_2,\,\dots,\,c_{T'}\bigr]\in\{0,1\}^{T'\times H\times W}.
\]

During inference, we initialize $z_{T}$ with Gaussian noise and run the conditioned reverse diffusion:
\[
z_{t-1} = z_{t} - \epsilon_\theta\bigl(z_{t},\,p,\,\Phi(c),\,t\bigr)\,\Delta t,\quad t=T,\dots,1,
\]
then decode:
\[
\hat x = \mathcal{D}(z_0)\in\mathbb{R}^{T'\times H\times W\times 3}.
\]
This process produces a $T'$-frame video whose pupil motion follows the original waveform $w$ via the spatial masks $c$. To recover a full‑rate video of length $N \gg T'$ from the coarse $T'$‑frame output $\{\hat x_i\}_{i=1}^{T'}$, we apply a real‑time flow estimator \cite{huang2022real} $\mathcal{F}$:\\
\noindent \textbf{Pairwise flow estimation.}  
    For each adjacent frame pair $(\hat x_i, \hat x_{i+1})$, compute bidirectional flows
    \[
      f_{i\to i+1} = \mathcal{F}(\hat x_i, \hat x_{i+1}), 
      \quad
      f_{i+1\to i} = \mathcal{F}(\hat x_{i+1}, \hat x_i).
    \]
\noindent \textbf{Temporal interpolation.}  
    For any intermediate timestep $\tau \in (0,1)$ between frames $i$ and $i+1$, warp and blend:
    \[
      \tilde x_i(\tau) 
      = (1-\tau)\,\mathcal{W}\bigl(\hat x_i,\;\tau\,f_{i\to i+1}\bigr)
      + \tau\,\mathcal{W}\bigl(\hat x_{i+1},\;-(1-\tau)\,f_{i+1\to i}\bigr),
    \]
    where $\mathcal{W}(\cdot,\cdot)$ denotes differentiable backward warping.
  By sampling $\tau = \tfrac{k}{M+1}$ for $k=1,\dots,M$ (with $M = \tfrac{N-1}{T'-1} - 1$ intermediate frames between each adjacent pair), we reconstruct the complete sequence
\[
\bigl\{\hat x_1,\,\tilde x_1(\tfrac1{M+1}),\dots,\hat x_2,\dots,\hat x_{T'}\bigr\}
\]
of length $N$. Finally, we apply a 1D low‑pass filter along the time axis to the upsampled video frames to eliminate any residual flicker while preserving fine motion details. The complete workflow is detailed in Algorithms \ref{alg1} and \ref{alg2}.


\begin{figure}[t]
    \centering
    \includegraphics[width=0.5\textwidth]{figs/quals.pdf}

    \caption {\textbf{Qualitative examples of the generated video frames.} For each case, we show the corresponding input waveform that guides the synthesis process, followed by representative RGB frames from generated eye videos and infrared (IR) frames from IR video outputs. These examples illustrate the model’s ability to produce temporally coherent and anatomically consistent eye movements across both RGB and IR modalities.}
    \label{fig:qual}
\end{figure}

\begin{algorithm}[t]
\caption{Synthetic Waveform–Guided Inference}
\label{alg1}
\KwIn{1D waveform $w=\{w_i\}_{i=1}^N$, transformer $D_\theta$, decoder $\mathcal{D}$, parameters $T'$, $\gamma$, pupil center $(u_0,v_0)$, noise schedule $\{\alpha_t\}$}
\KwOut{Generated coarse video frames $\{\hat x_i\}_{i=1}^{T'}$}

\textbf{COARSE SAMPLING:}\\
\For{$i \gets 1$ \KwTo $T'$}{
  $S_i \gets \lfloor \tfrac{(i-1)\,(N-1)}{T'-1}\rfloor + 1$\;
  $\tilde w_i \gets w_{S_i}$\;
  $r_i \gets \gamma\,\tilde w_i$\;
  Generate mask $c_i(x,y)\gets \mathbb{I}\{(x-u_0)^2+(y-v_0)^2\le r_i^2\}$\;
}

\textbf{STACK} $c \gets [c_1,\dots,c_{T'}]$\;

\textbf{DIFFUSION INFERENCE:}\\
Sample $z_{T} \sim \mathcal{N}(0,I)$\;
\For{$t \gets T$ \textbf{downto} $1$}{
  $z_{t-1} \gets z_{t} - D_\theta\bigl(z_t,\,p,\,\Phi(c),\,t\bigr)\,\Delta t$\;
}

\textbf{DECODE:}\\
\For{$i \gets 1$ \KwTo $T'$}{
  $\hat x_i \gets \mathcal{D}(z_0[i])$\;
}

\Return $\{\hat x_i\}_{i=1}^{T'}$\;
\end{algorithm}
% \begin{algorithm}[t]
% \caption{Synthetic Waveform–Guided Inference}
% \label{alg:waveform_inference}
% \begin{algorithmic}
% \Require 1D waveform $w=\{w_i\}_{i=1}^N$, transformer $D_\theta$, decoder $\mathcal{D}$, parameters $T'$, $\gamma$, pupil center $(u_0,v_0)$, noise schedule $\{\alpha_t\}$
% \Ensure Generated coarse video frames $\{\hat x_i\}_{i=1}^{T'}$

% \State \textbf{COARSE SAMPLING:}
% \For{$i \gets 1$ \textbf{to} $T'$}
%   \State $S_i \gets \lfloor \tfrac{(i-1)\,N}{T'-1}\rfloor + 1$
%   \State $\tilde w_i \gets w_{S_i}$
%   \State $r_i \gets \gamma\,\tilde w_i$
%   \State Generate mask $c_i(x,y)\gets \mathbb{I}\{(x-u_0)^2+(y-v_0)^2\le r_i^2\}$
% \EndFor

% \State \textbf{STACK} $c \gets [c_1,\dots,c_{T'}]$

% \State \textbf{DIFFUSION INFERENCE:}
% \State Sample $z_{T} \sim \mathcal{N}(0,I)$
% \For{$t \gets T$ \textbf{downto} $1$}
%   \State $z_{t-1} \gets z_{t} - D_\theta\bigl(z_t,\,p,\,\Phi(c),\,t\bigr)\,\Delta t$
% \EndFor

% \State \textbf{DECODE:}
% \For{$i \gets 1$ \textbf{to} $T'$}
%   \State $\hat x_i \gets \mathcal{D}(z_0[i])$
% \EndFor

% \State \Return $\{\hat x_i\}_{i=1}^{T'}$
% \end{algorithmic}
%  \label{alg1}
% \end{algorithm}

\begin{algorithm}[t]
\caption{Flow‐Based Video Upsampling \& Temporal Smoothing}
\label{alg2}
\KwIn{Coarse frames $\{\hat x_i\}_{i=1}^{T'}$, flow estimator $\mathcal{F}$, upsample factor $M$, warping op.\ $\mathcal{W}$, smoothing filter $\mathcal{S}$}
\KwOut{Full-rate frames $\{x_t\}_{t=1}^{N}$, $N = (T'-1)(M+1)+1$}

\textbf{INITIALIZE} empty list $\mathcal{V}$\;

\For{$i \gets 1$ \KwTo $T'-1$}{
  Append $\hat x_i$ to $\mathcal{V}$\;
  $f_{i\to i+1} \gets \mathcal{F}(\hat x_i,\hat x_{i+1})$\;
  $f_{i+1\to i} \gets \mathcal{F}(\hat x_{i+1},\hat x_i)$\;
  \For{$k \gets 1$ \KwTo $M$}{
    $\tau \gets \tfrac{k}{M+1}$\;
    $\tilde x \gets (1-\tau)\,\mathcal{W}(\hat x_i,\tau f_{i\to i+1})$
    \quad$+\;\tau\,\mathcal{W}(\hat x_{i+1},-(1-\tau)f_{i+1\to i})$\;
    Append $\tilde x$ to $\mathcal{V}$\;
  }
}

Append $\hat x_{T'}$ to $\mathcal{V}$\;

\textbf{TEMPORAL SMOOTHING:}\\
$\{x_t\} \gets \mathcal{S}(\mathcal{V})$\;

\Return $\{x_t\}_{t=1}^{N}$\;
\end{algorithm}
% \begin{algorithm}[t]
% \caption{Flow‐Based Video Upsampling \& Temporal Smoothing}
% \label{alg:flow_upsample}
% \begin{algorithmic}
% \Require Coarse frames $\{\hat x_i\}_{i=1}^{T'}$, flow estimator $\mathcal{F}$, upsample factor $M$, warping op.\ $\mathcal{W}$, smoothing filter $\mathcal{S}$
% \Ensure Full‐rate frames $\{x_t\}_{t=1}^{N}$, $N = T'(M+1)$

% \State \textbf{INITIALIZE} empty list $\mathcal{V}$

% \For{$i \gets 1$ \textbf{to} $T'-1$}
%   \State Append $\hat x_i$ to $\mathcal{V}$
%   \State $f_{i\to i+1} \gets \mathcal{F}(\hat x_i,\hat x_{i+1})$
%   \State $f_{i+1\to i} \gets \mathcal{F}(\hat x_{i+1},\hat x_i)$
%   \For{$k \gets 1$ \textbf{to} $M$}
%     \State $\tau \gets \tfrac{k}{M+1}$
%     \State $\tilde x \gets (1-\tau)\,\mathcal{W}(\hat x_i,\tau f_{i\to i+1})$
%     \State \quad$+\;\tau\,\mathcal{W}(\hat x_{i+1},-(1-\tau)f_{i+1\to i})$
%     \State Append $\tilde x$ to $\mathcal{V}$
%   \EndFor
% \EndFor

% \State Append $\hat x_{T'}$ to $\mathcal{V}$

% \State \textbf{TEMPORAL SMOOTHING:}
% \State $\{x_t\} \gets \mathcal{S}(\mathcal{V})$

% \State \Return $\{x_t\}_{t=1}^{N}$
% \end{algorithmic}
%  \label{alg2}
% \end{algorithm}





\section{Experiment}

\noindent \textbf{Datasets.} We train our conditional generative model of the ocular region using videos from the Labeled Pupils in the Wild (LPW) dataset~\cite{tonsen2016labelled}, which includes 66 high-resolution, high-frame-rate videos centered on the eye region and was originally developed for pupil detection tasks. For supervised training, we also incorporate a private nystagmus dataset, as described in~\cite{kocak2021novel}. A separate validation set is constructed using videos from 5 normal and 5 nystagmus patients. The final training and test sets consist of 1000 and 200 short video clips, respectively.

\noindent \textbf{Synthetic Waveform Validation Model.} For our waveform validation classification experiments, we employ a four-stage convolutional encoder followed by a bidirectional LSTM with an attention-pooling head. Concretely, the raw input sequence (\(B\times T\times F\)) is first permuted to (\(B\times F\times T\)) and passed through four sequential 1D Conv-ReLU-BatchNorm-MaxPool blocks with channel widths \([32,64,128,256]\), kernel size 5, and stride 1. The resulting feature map is then transposed back to (\(B\times T\times 256\)) and fed into a two-layer bidirectional LSTM (hidden size \(H\), dropout 0.5), whose full-sequence outputs are scored by a learned linear attention layer. The softmax-normalized attention weights are applied to the LSTM outputs to produce a single \(2H\)-dimensional representation per example, which is layer-normalized and passed through a \(2H\to H\) linear projection (ReLU + dropout 0.5) and a final \(H\to C\) classifier to yield the \(C\) output logits.

\noindent \textbf{Baselines and Results.} Figure~\ref{fig:qual} shows qualitative results of the generated video frames, including representative RGB frames from synthesized eye videos and infrared (IR) frames from IR outputs. These examples highlight the model's ability to generate temporally coherent and anatomically consistent eye movements across both modalities. Table~\ref{tab:quantitative} presents the quantitative results alongside corresponding baselines. Each stage of the framework has its own baseline. For the \textit{Waveform Classifier}, the baseline refers to a model trained on real patient waveform data and evaluated on the validation set. Our results represent the classifier's performance on synthetic data during inference to assess whether the model can accurately identify synthetic waveforms, thereby validating their realism. The accuracy decreases from 97.0\% to 92.3\%, and the Macro-F1 score drops from 96.1\% to 90.1\%, reflecting a modest reduction in performance when using synthetic data, but still supporting the realism of the generated waveforms. For \textit{Video Generation}, we compare our method against GenVOG~\cite{rahman2025genvog}, a training-free framework for nystagmus video generation that leverages a pretrained UNet-based architecture. To evaluate the visual quality and temporal coherence of the generated videos, we report results using the following widely adopted metrics: Fr\'echet Video Distance (FVD)~\cite{unterthiner2019fvd}, which captures distribution-level similarity in temporal dynamics, and LPIPS, which assesses perceptual similarity at the frame level. Additionally, we report Dynamic Degree, Imaging Quality, and Motion Smoothness from the VBench evaluation suite~\cite{huang2024vbench}. We observe a lower LPIPS score (0.082 vs.\ 0.120), indicating higher perceptual similarity, and a substantial improvement in FVD (395 vs.\ 678), suggesting better temporal consistency in the generated videos.
Then, for \textit{Detection AUROC}, we train a classifier using three configurations: real data only, synthetic data only, and a combination of real and synthetic data. A classifier trained on synthetic data alone achieves an AUROC of 0.69. When trained on a combination of real and synthetic data, the performance improves to 0.92, slightly surpassing the classifier trained only on real data, which scored 0.89. This suggests that synthetic data can complement real data and enhance classifier performance. Finally, we conduct an ablation study by comparing the FVD scores of videos generated with and without the flow-based interpolation module. Removing the module results in a slightly higher FVD score (399 compared to 395), indicating degradation in video quality without the flow-based refinement step. Additionally, to assess the realism of our synthetic waveforms, we compare key clinical parameters (amplitude, frequency, and slow-phase velocity) across real patient recordings, procedurally generated synthetic signals, and waveforms extracted from generated videos (Figure~\ref{fig:plots}). The distributions demonstrate strong alignment, particularly in amplitude, supporting the validity of our synthetic data for training and evaluation.

 \begin{figure*}[t]
    \centering
    \includegraphics[width=1\textwidth]{figs/plots.pdf}

    \caption {\textbf{Distribution shift analysis across waveforms.} We compare the distributions of three clinically relevant nystagmus waveform parameters, (a) amplitude, (b) frequency, and (c) slow-phase velocity (SPV), between procedurally generated synthetic waveforms (orange) and waveforms extracted from real videos (green). The goal is to assess how well the synthetic signals, both direct and video-derived, match the statistical characteristics of real-world recordings. While synthetic waveforms closely resemble the real distributions in most metrics, minor shifts in SPV and frequency distributions suggest areas for further refinement.}
    \label{fig:plots}
\end{figure*}



\begin{table*}[t]
\centering
\small
\setlength{\tabcolsep}{15pt} 
\renewcommand{\arraystretch}{1}
\begin{tabular}{@{}llccc@{}}
\toprule

\textbf{Exp.} & \textbf{Metric} & \textbf{Baseline} & \textbf{Ours} & \textbf{$\Delta$} \\
\midrule
\multirow{2}{*}{Waveform Classifier}
  & Accuracy (\%)                 & 97.0  & 92.3  & $-4.7$   \\
  & Macro-F1 (\%)                 & 96.1  & 90.1  & $-6.0$   \\
\midrule

\multirow{5}{*}{Video Generation}
  & LPIPS $\downarrow$            & 0.120 & 0.082 & $-0.038$ \\
  & FVD $\downarrow$              & 678   & 395   & $-283$   \\
  & Dynamic Degree $\uparrow$     & 91.85 & 99.17 & $+7.32$  \\
  & Motion Smoothness $\uparrow$  & 95.67 & 98.50 & $+2.83$  \\
  & Imaging Quality $\uparrow$    & 63.17 & 70.50 & $+7.33$  \\

\midrule
\multirow{3}{*}{Detection AUROC}
  & Real only                     & 0.89  & ---   & ---      \\
  & Synthetic only                & ---   & 0.69  & ---      \\
  & Synth+Real                    & ---   & 0.92  & ---      \\
\midrule

\multirow{2}{*}{Ablation (w/o flow)}
  & FVD $\downarrow$              & 399   & 395   & $-4$     \\
  & Motion Smoothness $\uparrow$  & 96.1  & 98.50 & $+2.4$   \\

\bottomrule
\end{tabular}
\caption{\textbf{Quantitative results across all experimental settings.} The column labeled ``Baseline'' corresponds to existing methods or specific ablated variants, while ``Ours'' refers to the performance of our full proposed pipeline. Metrics are chosen to reflect both perceptual and temporal qualities across tasks. Downward arrows ($\downarrow$) indicate that lower values are preferred, such as in LPIPS and FVD, where lower scores denote better perceptual similarity and temporal coherence; upward arrows ($\uparrow$) indicate that higher values are preferred. These results collectively demonstrate the effectiveness of our approach in waveform classification, video generation, detection robustness, and the contribution of each component via ablation.}
\label{tab:quantitative}
\end{table*}

%A qualitative figure
% Quantitave figure

\section{Related Work}
\noindent\textbf{Video Generation Networks.}  
In recent years, there has been rapid progress in generative video modeling, with most state-of-the-art systems built on either denoising U‑Net architectures \cite{blattmann2023align,singer2022make,ho2022imagen,hong2022cogvideo,ho2022video,mei2023vidm,molad2023dreamix,wang2023modelscope} or transformer-based diffusion backbones \cite{yang2024cogvideox,liu2024sora}. These methods typically integrate spatial diffusion modules with temporal processing layers to jointly model frame-wise appearance and motion dynamics. Despite impressive results, two major challenges persist: (1) most high-quality video diffusion models remain closed-source, and (2) training them from scratch requires massive computational resources and large-scale video datasets. 


% Moreover, the commercial ecosystem is rapidly adopting these advances, examples include Luma AI, RunwayML Gen‑2/Gen‑3, Minimax, and Kling, highlighting the practical potential of video generation for applications in simulation, content creation, and beyond \cite{lumaAI,runwayml_gen2,runwayml_gen3,minimax,kling, qin2024worldsimbench,wang2024worlddreamer,cho2024sora}.

%controllable generation literature


\noindent \textbf{Deep learning for Nystagmus Modeling.} One of the early efforts was the development of diagnostic decision support systems using CNN-based models for benign paroxysmal positional vertigo (BPPV) detection, achieving high sensitivity and specificity across horizontal, vertical, and torsional directions \cite{lim2019developing}. Later work introduced aEYE, a deep learning model trained to detect nystagmus beats in short videos with an AUROC of 0.86 and an accuracy of 82.7\%, using simple 1D CNN architectures on labeled video clips \cite{wagle2022aeye}. Specialized models for vertical and torsional nystagmus classification have also been proposed. The model in \cite{li2023vertical} achieved 91\% accuracy in vertical nystagmus recognition, while torsional nystagmus classification with a transformer-based approach reached 92.9\% test accuracy \cite{li2023torsional}. The LAD hybrid system combined LSTM and CNN modules and achieved 91\% accuracy on a large dataset of positional tests \cite{pham2022lad}. Smartphone-based solutions have emerged for low-cost tracking. ConVNG, a CNN-based system, was proposed for analyzing slow-phase velocity in smartphone videos \cite{friedrich2023smartphone}. Similarly, EyePhone used smartphone cameras and showed strong correlation with infrared VOG for detecting optokinetic responses \cite{bastani2024quantifying}. Another lightweight real-time nystagmus tracking framework based on ocular object segmentation demonstrated robust feasibility in clinical settings \cite{cho2024feasibility}.

Generative and transformer-based architectures have also been explored. The GPT-4 Vision model was repurposed to classify nystagmus patterns but performed below expectation, achieving only 37\% accuracy overall \cite{noda2025exploring}. This highlights a gap in domain adaptation and the need for more targeted architectures. Lastly, telehealth frameworks integrating deep learning for nystagmus detection have shown promise in remote diagnostics. A deep learning-based system trained on 15,000 video frames achieved 98\% accuracy, emphasizing the potential of large-scale, annotated video data for real-world deployment \cite{sanghvi2025artificial}.

Recent studies have applied machine learning to detect and analyze nystagmus using video-oculography (VOG) data, including both video and waveform signals \cite{punuganti2019-41,phillips2019-42,newman2020-43,newman2021-44,reinhardt2020-45,wagle2022-46,zhang2021-47,lu2022-48}. However, these efforts are limited by the use of small, private datasets with few nystagmus variations, largely due to patient privacy concerns \cite{lohr2020eye,lohr2022eye,zola2021use}, which hinders reproducibility and broader research. Modeling nystagmus dynamics remains a complex challenge due to the intricate interplay of ocular motor physiology and biomechanics influenced by eye, head, and body position \cite{punuganti2019-41,phillips2019-42,newman2020-43,newman2021-44,reinhardt2020-45,wagle2022-46,lu2022-48,lim2019-50,zhang2021-51,li2023-52,li2023-53}. To mitigate dataset scarcity in medical applications, researchers often rely on synthetic data generation from real-world samples \cite{guibas2017synthetic,garcea2023data,wang2023deep}. Yet, this approach can suffer from overfitting, where generative models reproduce training data too closely, defeating the goal of generalization \cite{somepalli2023diffusion}. In the case of nystagmus, the challenge is amplified by the need to generate long-form, clinically meaningful videos that accurately reflect physiological pupil-position waveforms.







\section{Conclusion}
We have presented a novel, end-to-end framework for privacy-preserving nystagmus video synthesis and analysis. By combining (1) mathematically modeled synthetic waveforms, (2) a waveform classifier for cross-domain validation, (3) pupil-conditioned video diffusion, and (4) flow-based interpolation for temporal refinement, our pipeline generates high-fidelity eye-movement videos that faithfully reproduce clinically relevant nystagmus dynamics. Extensive experiments demonstrate that our synthetic datasets enable classifiers and regression models to generalize to real patient recordings, while sidestepping privacy and data-scarcity challenges inherent to clinical video-oculography.


\bibliography{midl26_183}


\appendix


\section{Limitations and Ethics Statement}
While our framework advances scalable, privacy-preserving nystagmus video synthesis, several limitations must be acknowledged. First, although synthetic waveforms are designed to mimic real patient dynamics, our generated videos may still lack subtle anatomical or contextual cues present in clinical recordings (e.g., eyelid motion, head oscillation, lighting variability). Second, the diffusion model's coarse mask outputs and subsequent flow-based interpolation may introduce artifacts under extreme waveform parameters (very high frequency or amplitude), potentially misleading downstream classifiers. Additionally, due to the stochastic nature of diffusion models, certain random seeds can lead to unrealistic visual anomalies, such as duplicate pupils, excessive noise, or nonsensical motion artifacts, that may compromise video fidelity. Third, our current evaluation focuses primarily on horizontal jerk and pendular patterns; extension to vertical, torsional, or mixed-pattern nystagmus will require further modeling and validation. Finally, the computational cost of diffusion and optical-flow upsampling may limit real-time deployment on resource-constrained devices.

Ethically, synthetic data can mitigate privacy concerns by eliminating identifiable patient features, yet it may also foster overconfidence: models trained purely on artificial videos must be rigorously validated on diverse clinical datasets before clinical use. There is a risk of misuse if synthetic videos are mistaken for true patient recordings or used inappropriately (e.g., during forensic or insurance assessments). We therefore recommend that any diagnostic tool developed with our pipeline be integrated under expert supervision, with clear disclaimers about the synthetic nature of training data and strict adherence to medical device regulations and institutional review guidelines.


\section{Modeling Synthetic Nystagmus} 
\label{sec1}
Figure~\ref{fig:waveform} presents examples of both synthetic and real waveforms. Figure~\ref{fig:real-ex} shows a real-world example of pupil movement over time for a nystagmus patient.

\begin{figure*}[t]
    \centering
    \includegraphics[width=1\textwidth]{figs/output.png}

    \caption {Waveform morphologies of various nystagmus types.
Nystagmus can be categorized into two main types based on the pattern of eye movements: jerk nystagmus, which consists of a slow phase followed by a quick corrective phase, and pendular nystagmus, characterized by oscillations that are slow in both directions. These waveform patterns reflect abnormalities in the neural pathways responsible for maintaining gaze stability. Jerk nystagmus can be further subdivided according to the velocity profile of the slow phase, including linear velocity, decelerating velocity, and accelerating velocity waveforms.}
    \label{fig:nystype}
\end{figure*}

\begin{enumerate}
  \item \emph{Pendular Nystagmus.}  
    Pendular nystagmus exhibits smooth, sinusoidal oscillations akin to a physical pendulum. We define
    \[
      P_{\mathrm{pend}}(t) \;=\; A \,\sin\bigl(\omega\,t + \phi\bigr)\,.
    \]

  \item \emph{Accelerating Jerk Nystagmus.}  
    Here, the slow phase accelerates quadratically away from center before a rapid corrective saccade returns the gaze. One simple cycle-based approximation is
    \[
      P_{\mathrm{jerk\_acc}}(t) \;=\; A\,\bigl(t^2 + \phi\bigr)\quad (0 \le t < T),
    \]
    with a discontinuous reset at \(t = T\).

  \item \emph{Decelerating Jerk Nystagmus.}  
    In this variant, the slow-phase velocity decreases linearly over time, producing a concave trajectory:
    \[
      P_{\mathrm{jerk\_dec}}(t) \;=\; A\,\bigl(t - 0.5\,t^2\bigr)\quad (0 \le t < T),
    \]
    followed by a fast reset at the end of each period.

  \item \emph{Linear Jerk Nystagmus.}  
    When the slow phase proceeds at constant velocity before a quick reversal, the motion is piecewise linear:
    \[
      P_{\mathrm{jerk\_lin}}(t) =
      \begin{cases}
        A\,(-0.5\,t), & 0 \le t < T,\\
        0, & t = T,
      \end{cases}
    \]
    then repeats every \(T\).

  \item \emph{Square-Wave Jerk.}
    Square-wave jerks consist of brief, step-like deviations from fixation followed by corrective steps. We model this as
    \[
      P_{\mathrm{sq}}(t) =
      \begin{cases}
        +A, & 0 \le t < \tfrac{T}{2},\\
        -A, & \tfrac{T}{2} \le t < T,
      \end{cases}
    \]
    with periodic repetition.
\end{enumerate}

Figure~\ref{fig:nystype} illustrates the different nystagmus types.


\noindent\textbf{Parameter Selection.}  
Following \citet{kocak2021novel}, we constrain the slow-phase velocity by specifying an initial velocity \(v_0\in[5,25]\)\,°/s (in 0.5\,°/s steps) and a velocity decay of 0--24\% over each cycle. These bounds ensure our synthetic waveforms span clinically observed ranges.

\begin{figure*}[t]
    \centering
    \includegraphics[width=1\textwidth]{figs/quals_waveforms.pdf}

    \caption {\textbf{The figure illustrates a comparison between real and synthetic nystagmus waveforms.} We focus on jerk nystagmus, the only subtype present in our real patient dataset. For comparison, we present a real patient waveform recorded over 300 frames alongside a synthetically generated waveform rendered over a 5-second interval. While only jerk nystagmus is shown here due to limitations in real data availability, our generative framework is capable of realistically simulating a broad range of nystagmus patterns, including pendular, vertical, and mixed-type oscillations.}
    \label{fig:waveform}
\end{figure*}


\begin{figure*}[t]
    \centering
    \includegraphics[width=1\textwidth]{example.png}

    \caption{Real-world example of nystagmus, obtained by tracking the pupil.}
    \label{fig:real-ex}
\end{figure*}

\end{document}
