\documentclass{midl} % Include author names
%\usepackage[backend=biber,style=authoryear,natbib]{biblatex}
%\addbibresource{articles.bib}
\usepackage[textwidth=2cm]{todonotes}
\usepackage{adjustbox}
\usepackage{float}
\usepackage{booktabs}
\usepackage{bold-extra}
\usepackage{xcolor}



% The following packages will be automatically loaded:
% jmlr, amsmath, amssymb, natbib, graphicx, url, algorithm2e
% ifoddpage, relsize and probably more
% make sure they are installed with your latex distribution


\usepackage{mwe} % to get dummy images
\jmlryear{2025}
\jmlrworkshop{Full Paper -- MIDL 2025 submission}
\jmlrvolume{-- nnn}
\editors{Accepted for publication at MIDL 2025}

\jmlrproceedings{}{}

\title[Sequence models for continuous cell cycle stage prediction from brightfield images]{Sequence models for continuous cell cycle stage prediction from brightfield images}

\midlauthor{\Name{Louis-Alexandre Leger\nametag{$^{1}$}\midljointauthortext{Contributed equally}} \orcid{0009-0007-9183-173X} \Email{louis-alexandre.leger@epfl.ch}\\
\Name{Maxine Leonardi\nametag{$^{1}$}\midlotherjointauthor} \orcid{0000-0002-9780-9126} \Email{maxine.leonardi@epfl.ch}\\
\Name{Andrea Salati\nametag{$^{1}$}\midlotherjointauthor} \orcid{0009-0006-6373-3668} \Email{andrea.salati@epfl.ch}\\
\setsharedfootnotesymbol
\Name{Felix Naef\nametag{\thinspace$^{1}$}\midljointsharedauthortext{Shared supervision}} \orcid{0000-0001-9786-3037} \Email{felix.naef@epfl.ch}\\
\Name{Martin Weigert\nametag{$^{1,2}$}\midlothersharedauthor} \orcid{0000-0002-7780-9057} \Email{martin.weigert@tu-dresden.de}\\
\addr $^{1}$ Institute of Bioengineering, School of Life Sciences, Ecole Polytechnique Fédérale de Lausanne (EPFL), Lausanne, Switzerland\\ 
\addr $^{2}$ Center for Scalable Data Analytics and Artificial Intelligence (ScaDS.AI), Dresden/Leipzig, Germany
}

          
\input{definitions.tex}

%\bibliography{articles.bib}
\begin{document}


\maketitle

\begin{abstract}
Understanding cell cycle dynamics is crucial for studying biological processes such as growth, development and disease progression. While fluorescent protein reporters like the \Fucci system allow live monitoring of cell cycle phases, they require genetic engineering and occupy additional fluorescence channels, limiting broader applicability in complex experiments.
In this study, we conduct a comprehensive evaluation of deep learning methods for predicting continuous \Fucci signals using non-fluorescence brightfield imaging, a widely available label-free modality. 
To that end, we generated a large dataset of 1.3 M images of dividing RPE1 cells with full cell cycle trajectories to quantitatively compare the predictive performance of distinct model categories including single time-frame models, causal state space models and bidirectional transformer models.
We show that both causal and transformer-based models significantly outperform single- and fixed frame approaches, enabling the prediction of visually imperceptible transitions like 
G1/S within 1h resolution. Our findings underscore the importance of sequence models for 
accurate predictions of cell cycle dynamics and highlight their potential for label-free imaging.
\end{abstract}

\begin{keywords}
Cell cycle prediction, label-free microscopy, sequence-models
\end{keywords}

\section{Introduction}

The cell cycle is the driving force behind the growth and development of all living organisms. This well-studied sequence of cellular events is tightly regulated and aberrations in such mechanisms can lead to genomic instability, a key driver of various diseases including cancer \cite{kastan_cell-cycle_2004}. 
Live cell fluorescence microscopy has become a powerful tool for studying cell cycle progression, particularly through the genetic engineering of fluorescent reporters like the \Fucci system~\cite{sakaue-sawano_visualizing_2008,stallaert_structure_2022}. This system enables the distinction of cell cycle phases from single images by fluorescently tagging the two proteins Cdt1 and Geminin, whose expression changes distinctively with the cell cycle~(\figureref{fig:Overview_of_data}). 
Despite its utility, the classic \Fucci system and recent variants~\cite{sakaue-sawano_genetically_2017,grant_accurate_2018} are limiting in practice as they occupy two of the few available microscopy channels, reducing the ability to study other cellular processes simultaneously and requiring genetic modification that might interfere with the endogenous cell cycle regulation. 
\begin{figure}[t!]
    \floatconts
      {fig:Overview_of_data}
      {\caption{\textbf{Multi-modal imaging of \Fucci-reporter cells reveals a continuous representation of cell cycle states.} 
      \textbf{a)} Time-lapse imaging of \Fucci-reporting cells allows for precise quantification of cell cycle
    staging through the characteristic oscillations of fluorescent reporter intensities. 
    \textbf{b)} Representative time-lapse images of brightfield, H2B, and 
    \Fucci channels across one full (M-M) cell cycle.
    \textbf{c)} Quantification of integrated logarithmic fluorescence intensities from \Fucci reporters, normalized nuclear area, and normalized total H2B signal in a representative full M-M (mitosis to mitosis) track. Vertical lines mark the time points corresponding to the images shown in d. 
    \textbf{d)} Log-transformed \Fucci manifold for continuous inference of cell cycle states. 
    }}
    {\includegraphics[width=1\linewidth, trim=0 1cm 0 0, clip]{Figures/Overview_of_data_mod.pdf}}
\end{figure}
In contrast, brightfield microscopy is an easily accessible and label-free imaging modality that does not require genetic engineering, providing limited specificity and imaging contrast. Although some cell cycle transitions, such as nuclear envelope breakdown, are marked by distinct morphological changes that are easily detectable, most cell cycle transitions are visually indiscernible in individual brightfield images of cells.
In this paper, we ask whether leveraging the \emph{temporal information} in time-lapse brightfield microscopy images of cells would allow to predict \emph{continuous} cell cycle states without the need for fluorescent reporters such as \Fucci. 
To address this, we study several sequence-based deep learning models, including transformers \cite{vaswani_attention_nodate} and recently proposed state-space models~\cite{gu_efficiently_2022}. 
In particular, we will investigate both \emph{causal} sequence models that only use information from previous time points, as well as \emph{non-causal} models that may ingest  the entire sequence. 
Providing a new dataset of over 1~M images of segmented and tracked RPE-1 cells with accompanying ground truth \Fucci signals, we show that both causal and transformer-based non-causal models significantly outperform single-frame approaches, enabling the prediction 
of morphologically subtle cell state transitions 
like G1/S within 1h resolution from live-cell brightfield imaging alone. 
\vspace{-.2cm}



\subsection{Related work}

The prediction of individual cell states from single microscopy images has found considerable interest in the literature. For instance~\cite{rappez_deepcycle_2020} and \cite{narotamo_machine_2021, li_predicting_2024} showed that deep learning-based models allow to classify \emph{discrete} cell cycle states such as the G1, S, or G2 phase from static multichannel images of cells. In particular, \cite{eulenberg_reconstructing_2017,blasi_label-free_2016,jin_imbalanced_2021,he_cell_2022} explored the use of brightfield and phase-imaging data for cell cycle classification without fluorescence labeling. However, accurately annotating discrete cell cycle stages highly depends on the imaging modality with nuclear stains like DAPI or Hoechst providing clear features, while phase or brightfield imaging require substantial more manual annotation expertise. 
Beyond static images, several studies have investigated how the \emph{temporal information} of images sequences can be leveraged to capture dynamic cell behaviors~\cite{wang_live-cell_2020, zhao_insights_2024,chu_prediction_2020}. 
However, most approaches have so far been focused on well-defined cell phases such as mitosis, where large morphological changes facilitate annotation and prediction~\cite{held_cellcognition_2010,moreno-andres_livecellminer_2022,jose_automatic_2024, su_spatiotemporal_2017}. A notable exception is \cite{ulicna_learning_2023} which applies dynamic time warping (DTW) to features from an unsupervised autoencoder, enabling to temporally align cell trajectories and to continuously predict cycle states across interphase. 
\emph{Sequence models} such as recurrent neural networks and transformers~\cite{vaswani_attention_nodate} have been shown to be effective for modeling temporal data~\cite{hewamalage2021recurrent,wen2022transformers}, with \emph{state-space models} such as Mamba~\cite{gu_mamba_2024} having recently gained considerable interest due to their training and inference efficiency. 
As the cell cycle is a continuous causal process, sequence models appear to be a natural fit for our task. However, studies that address this question systematically remain missing, with the closest work being~\cite{jose_automatic_2024} which uses recurrent neural networks for cell cycle prediction, but focuses on morphologically stark intermitotic phases. 



\section{Method}

\subsection{Dataset}


We generated a large dataset of dividing human \Fucci RPE1 cells using joint brightfield and fluorescence time lapse microscopy. Movies spanning 72 hours were acquired at a 5 minute time resolution, capturing multiple cell cycles. In addition to the brightfield modality, we acquired a nuclear marker channel (Histone H2B) and the two \Fucci channels ($\Fucci_{1/2}$). Based on the H2B channel, we segmented the cell nuclei with a custom StarDist model~\cite{schmidt_cell_2018} and tracked them across frames using TrackMate~\cite{tinevez_trackmate_2017}.
Note that since the amount of H2B histones needed by cells to pack DNA doubles during S (DNA replication), the H2B channel does contain information on cell cycle progression (\figureref{fig:Overview_of_data}c). Below, we leverage this as a control to assess predictive performance of brightfield vs H2B.   
Full cell cycle tracks (from one mitosis to the next, M-M) were identified using K-Means clustering and ground-truth \Fucci signals were computed by normalizing the average \Fucci intensities measured across the segmented nuclear mask~(\figureref{fig:Overview_of_data}c,d).
The training dataset comprises 5,188 full (M-M) cell cycle tracks with an average track length of 230 frames. Each track contains paired brightfield and H2B images of size $64\times64$ centered on the nucleus and the corresponding integrated \Fucci signals. To evaluate model performance, we created two additional test datasets: \dataregular, which contains 358 additional full tracks from RPE1 cells acquired at similar conditions as the training set, and \datadrug, which comprises 73 complete tracks of cells treated with the cell cycle inhibitor \emph{Palbociclib} that heavily distorts the cell cycle and which is used in the clinic to treat breast cancer. In total, this training and testing dataset consists of approximately 1.3 M images and \Fucci signals, all of which we make publicly available alongside this paper\footnote{\url{https://zenodo.org/records/14987478}}.




\begin{figure}[t!]
    \floatconts
      {fig:architechture}
      {\caption{\textbf{Overview of approach.} \textbf{a)} We use a ResNet-18~\cite{he2016deep} to extract single frame embeddings from an input sequences which are fed into a sequence model that predicts both \Fucci channels. \textbf{b)} Sequence models explored in this paper: Single Frame MLP, Fixed-frame CNN, causal state-space models \eg Mamba~\cite{gu_mamba_2024}, bidirectional models \eg transformers~\cite{vaswani_attention_nodate}.}}
    {\includegraphics[width=1\linewidth]{Figures/arch.pdf}}
\end{figure}

\subsection{Method}

The cell cycle prediction task can be formalized as follows: Given a temporal sequence of $N$ brightfield images $X \in \mathbb{R}^{N \times 64 \times 64}$, each associated with a corresponding two-dimensional \Fucci signal $Y \in \mathbb{R}^{N \times 2}$, the goal is to train a model $f$  that predicts the normalized \Fucci intensities across the entire sequence in a supervised manner, \ie. $f: \mathbb{R}^{N \times 64 \times 64} \to \mathbb{R}^{N \times 2}~$. Note that the length $N$ of the input is not fixed and the sequences is not required to span the entire cell cycle, allowing us to analyze the impact of temporal context on prediction performance.
For this cell cycle prediction task, we evaluated three conceptually distinct model classes: single-frame models, causal models and non-causal models~(\figureref{fig:architechture}). 
Each model uses first a ResNet-18~\cite{he2016deep} as feature extractor which for every input image in the sequence independently creates a 512-dimensional embedding. The sequence of embeddings is then fed into the proper sequence model head that differs between the three classes: \emph{Single-frame models} serve as a baseline, predicting the \Fucci signal from each image embedding independently without leveraging temporal information. We use a simple 4 layer MLP with hidden dimension 512. \emph{Fixed-frames models} that use a fixed history of past frames for prediction and for which we use a causal convolutional neural network~\cite{van2016wavenet} with a fixed causal temporal receptive field.  \emph{Causal models} that incorporate past and present temporal context in more flexible way and which are able to potentially capture arbitrary long temporal dependencies. In particular, we compare LSTMs~\cite{hochreiter_long_1997}, Mamba~\cite{gu_mamba_2024}, and causal transformers with masked attention. \emph{Bidirectional models} process the entire sequence bidirectionally, using both past and future frames for inference (\ie non-causal information). We use a standard transformer (4 layers) as representative architecture.
To ensure a fair comparison, we choose all sequence heads to have the same number of parameters ($\approx 1M$). All transformer variants additionally use rotary positional embeddings~\cite{su2024roformer} to encode the relative temporal position of each frame.
We train each model for 150 epochs while randomly sampling subtracks of variable lengths, using a learning rate of $10^{-4}$ and $L_1$ loss. We use standard data augmentation such as random rotations and flips, which is applied to the whole track. 



\begin{figure}[b!]
    \floatconts
      {fig:Prediction_bf}
      {\caption{\textbf{Predictions on unperturbed RPE cells \dataregular.} \textbf{a)} Distribution of $L1$ errors across the different  models.
    \textbf{b)} Predictions of \Fucci signals on two example tracks: one with accurate and one with poor predictions. The ground truth signal is shown in black.
    \textbf{c)} Average prediction error, and \textbf{d)} variability of ground truth \Fucci signals as a function of normalized cell cycle time $\tau$.
      }} 
      {\includegraphics[width=.9\linewidth, trim=0 .5cm 0 0, clip]{Figures/Predictions_bf.pdf}\vspace{-.4cm}} 
\end{figure}

\subsubsection{Performance metrics}

We evaluated model prediction by computing the mean $L_1$ error for each \Fucci channel across a given track. Further, we use the \emph{dynamic time warping distance} \DTW between the predicted and ground truth \Fucci signal that takes into account both the signal prediction error as well as the temporal misalignment between the two signals. We use the default \DTW distance implementation from the \texttt{dtaidistance} package~\cite{meert_dtaidistance_2020} with a penalty of 0.1. 
Additionally, we introduced two biologically meaningful cell cycle checkpoints (\figureref{fig:Overview_of_data}) and measured the time difference between our predicted and observed checkpoints in minutes. The first checkpoint $t_{G1/S}$ is the onset of the Geminin (\Fucci2) signal, marking the G1/S phase transition \cite{sakaue-sawano_genetically_2017} (additional information about how this checkpoint were calculated can be found in the appendix).
While the classic \Fucci reporter does not provide an exact molecular landmark of the S/G2 transition, we use the disappearance of the Cdt1 (\Fucci1) signal as an approximate landmark for evaluating S/G2 transition predictions $t_{S/G2}$. We use these two landmarks to categorize cells into discrete G1, S, and G2 phase classes for which we compute the F1-score of predictions. S phase is more constrained than G1, as genome duplication depends on DNA polymerase rates and replication origins. G1 is the key regulator of cell cycle duration and is expected to be more variable. In \dataregular,  the biological variability in phase duration ranges from a standard deviation of 1.6h for the S phase, to 2.2h for the G1 phase.



 


\section{Results}



\begin{table}[t] 
    \centering
    \floatconts
      {tab:table1_bf} 
      {\caption{\textbf{Cell cycle prediction accuracy for brightfield on \dataregular}. Shown are mean and standard deviation of the $L_1$ error per \Fucci channel and \DTW for full tracks across \dataregular. \DTW when using H2B images as input for comparison.} 
      }
    
    \footnotesize
    \begin{tabular}{lllllll}
        \toprule
        & \multicolumn{3}{c}{Brightfield} & \multicolumn{1}{c}{Histone H2B} \\
        \cmidrule(lr){2-4} \cmidrule(lr){5-5}
        \textbf{Models} & $L_{1, FUCCI_1}$ & $L_{1, FUCCI_2}$ & \DTW & \DTW \\
    \midrule
    Single Frame & 0.193 ± 0.066 & 0.146 ± 0.045 &  3.735 ± 0.863 &   2.595 ± 1.201 \\
    Causal CNN & 0.157 ± 0.078 & 0.122 ± 0.049 &  2.468 ± 0.917 &  2.165 ± 1.210 \\
    Mamba & 0.112 ± 0.072 & 0.091 ± 0.049 & 1.444 ± 0.898 &   1.426 ± 0.949 \\
    Transformer & \textbf{0.066} \textbf{± 0.038} & \textbf{0.062} \textbf{± 0.037} & \textbf{1.285} \textbf{± 0.553} & \textbf{1.155} \textbf{± 0.612} \\
    \bottomrule
      \end{tabular}
      
    \end{table}
    
\subsection{Comparison of prediction accuracy across sequence models}


\begin{table}[b] 
    \centering
\floatconts
  {tab:Table2} 
  {\caption{\textbf{Prediction accuracy of biological checkpoints and selected cell cycle states from brightfield images on \dataregular.}}}
  {
\footnotesize
\begin{tabular}{llllll}
\toprule
 \textbf{Models}& $\Delta t_{G1/S}[\text{min}]$ & $\Delta t_{S/G2}[\text{min}]$ & $G1$ & $S$ & $G2$ \\
\midrule
Single Frame & 191.3 & 117.9 & 0.71 & 0.64 & 0.87 \\
Causal CNN & 146.8 & 113.3 & 0.78 & 0.64 & 0.86 \\
Mamba & 111.4 & 102.4 & 0.83 & 0.72 & 0.89 \\
Transformer & \textbf{60.1} & \textbf{57.2} & \textbf{0.90} & \textbf{0.85} & \textbf{0.93} \\
\bottomrule
\end{tabular}
}
\end{table}



We first compare the performance of the different sequence models on predicting the \Fucci signals from brightfield images for full (M-M) cell cycle tracks.
As seen in \figureref{fig:Prediction_bf}a, the single frame model predicts extremely noisy signals that deviate substantially from the ground truth \Fucci signal, whereas both causal and bidirectional models achieve qualitatively much better predictions on both \Fucci channels and generally aligns with the expected trends (\cf~Supp.~\figureref{fig:histograms}a,b, Supp.~\figureref{fig:umap}).
To quantitatively assess the performance of the different models, we show the mean $L_1$ error for each \Fucci channel across all tracks as well as the average \DTW distance between the predicted and ground truth \Fucci signals for the different models in \tableref{tab:table1_bf}.
As expected, the single frame model which operates without integrating temporal information performs the worst across all metrics (\DTW = 3.735), while integrating the full bidirectional (non-causal) sequence information via a transformer achieves the best prediction (\DTW = 1.285). Surprisingly, there is a notable difference between the performance of the fixed-frames model (causal CNN) and the state-space model (Mamba), with the former performing substantially worse than the latter (\DTW = 2.468 vs. 1.444). This suggests that models that allow information propagation across the entire sequence can be more effective than models that only use a fixed-size temporal context. 
We additionally computed \DTW when training with the H2B channel as input modality, which a priori should be a substantially easier task as the H2B signal is biologically correlated with the cell cycle. Indeed, this is corroborated by the performance of the single frame model that vastly improves in this case (\DTW = 2.595). Interestingly, both state-space models as well as the transformer only marginally improve, suggesting that these models are able to extract temporal cues from the brightfield images comparable to the easier H2B modality. 
The predictions at biological checkpoints remain consistent with the performance observed across other evaluation metrics (\tableref{tab:Table2}, Supp. \figureref{fig:visual_landmarks}). As observed in Supp. \figureref{fig:correlation_checkpoints}, the timing of these checkpoints is highly variable. Sequence models effectively capture this variability, with the bi-directional transformer predictions being more accurate and therefore better aligned with the ground-truth distribution. In contrast, other methods tend to underestimate such variability.
As expected, the majority of prediction errors measured by the $L_1$ metric occur near the $t_{G1/S}$ and $t_{S/G2}$ landmarks, reflecting abrupt transitions where the FUCCI signal exhibits higher variability (\figureref{fig:Prediction_bf} b, c).


%This indicates its superior ability to capture and leverage long-term dependencies in the temporal data, leading to more precise predictions of dynamic cell cycle events.


% \begin{table}[t] 
% \floatconts
%   {tab:Table1} 
%   {}
%   {
% \centering
%   \begin{tabular}{llll}
%     \toprule
%     \textbf{Models} & $L_{1, FUCCI_1}$ & $L_{1, FUCCI_2}$ & $DTW$ \\
% \midrule
% Single Frame & 0.193 ± 0.066 & 0.146 ± 0.045 & 3.735 ± 0.863 \\
% CNN & 0.157 ± 0.078 & 0.122 ± 0.049 & 2.468 ± 0.917 \\
% Mamba & 0.112 ± 0.072 & 0.091 ± 0.049 & 1.444 ± 0.898 \\
% Transformer & \textbf{0.066} \textbf{± 0.038} & \textbf{0.062} \textbf{± 0.037} & \textbf{1.285} \textbf{± 0.553} \\
% \bottomrule
%   \end{tabular}
% \caption{\textbf{Cell cycle prediction accuracy for brightfield on \dataregular}. Shown are mean and standard deviation of the \Fucci channel specific $L_1$ error and DTW distance for full tracks across the test set of \dataregular.}} 
%   \label{tab:table1_bf}
% \end{table}



    






% As expected, the H2B imaging proved to be more predictive than BF imaging for cell cycle phase determination (table 3, 4, \figureref{Supp1}). However, the causal architecture, and notably the  transformer achieved  predictions that were nearly as accurate as those from the H2B channel.

\subsection{Prediction on partial tracks}


So far we focused on full (M-M) tracks of non-perturbed cells, all of which exhibit fairly stereotypical cell cycle trajectories and for which sequence models are able to base their predictions on a well defined starting points (\ie the cell division event).
We now evaluate the performance of the different models on partial tracks, where the starting point is not known a priori, which is a more challenging task.
\begin{figure}[t]
\floatconts
  {fig:partial_track_bf}
  {\caption{\textbf{Comparative performance on partial cell cycle tracks (brightfield).}
     Shown is the average $L_1$ error of both \Fucci signals when using partial tracks as input, parametrized by their relative start and end time $\tau_1 \leq \tau_2 \in [0,1]$. 
 }}
{\includegraphics[width=1\linewidth]{Figures/triang.pdf}\vspace{-.6cm}}
\end{figure}
In \figureref{fig:partial_track_bf}, we show the average $L_1$ error with cropped partial tracks as input, indicated by their relative start and end time $\tau_1 \leq \tau_2 \in [0,1]$. These partial tracks ranged from single-frame portions (along the diagonal) to entire tracks (lower-right element). Causal models still achieved better accuracy in predicting \Fucci values compared to non-temporal MLPs or fixed history CNNs. Surprisingly, the performance advantage of transformers over causal methods observed in full tracks diminishes on partial tracks. For all models, the error is maximal for segments that end near $t_{S/G2}$. Errors in $\Fucci_1$ show the same pattern while the errors in the $\Fucci_2$ arise mostly when taking segments from the beginning of the cell cycle (Supp. \figureref{fig:partial_track_fucci}).


\subsection{Prediction on out-of-distribution perturbations}

Finally, we evaluate the model performance on \datadrug, \ie biologically strongly perturbed cells that can be considered out-of-distribution. 
Specifically, these cells were treated with the drug \emph{Palbociclib}, a CDK4-6 inhibitor, that increases the cell cycle duration almost two-fold from $\sim$20h to 40 h (\figureref{fig:drug_predictions} a) and specifically the G1 phase duration, leading to a strongly distorted cell-cycle (\figureref{fig:drug_predictions} b). As expected, almost all models demonstrated a significant drop in accuracy when predicting \Fucci signal in these unseen drug-treated cells, as indicated by all evaluation metrics (Table \ref{tab:Table3}). The notable exception is the bidirectional transformer, that provides reasonable predictions and correctly captures distortions in the G1 phase (\figureref{fig:drug_predictions} c) that all other models significantly underestimated. Interestingly, the MLP outperformed the other causal models on this distorted data, potentially as the latter overfitted on the training data. When performing the same analysis on H2B, we found slightly better predictive performance (Supp. \tableref{tab:sup_table_WT,tab:sup_table_drug}, Supp. \figureref{fig:predictions_h2b},  Supp. \figureref{fig:partial_track_h2b}), which is expected due to the stronger correlation with the cell cycle. 


\begin{figure}[t!]
 % Caption and label go in the first argument and the figure contents
 % go in the second argument

\floatconts
  {fig:drug_predictions}
  {\caption{\textbf{Results on perturbed RPE cells \datadrug.}
  \textbf{a,b)} Effect of CDK4/6 inhibition on cell cycle durations and onset of G1, S, and G2/M phases, extending G1 duration while leaving S and G2/M unchanged.
  \textbf{c)} Example \Fucci predictions on \datadrug. 
 }}
{\includegraphics[width=1\linewidth]{Figures/Drug_predictions.pdf}\vspace{-.4cm}}
\end{figure}


\begin{table}[b!] 
    \centering
    \floatconts
      {tab:Table3} 
      {\caption{\textbf{$L_1$ error and \DTW for \Fucci channels on \datadrug.}}}
      {
    \footnotesize
    \begin{tabular}{llll}
    \toprule
     \textbf{Models} & $L_{1, FUCCI_1}$ & $L_{1, FUCCI_2}$ & \DTW \\
    \midrule
    Single Frame & 0.239 ± 0.082 & 0.182 ± 0.056 & 5.329 ± 1.147 \\
    Causal CNN & 0.252 ± 0.113 & 0.161 ± 0.059 & 4.323 ± 1.302 \\
    Mamba & 0.485 ± 0.090 & 0.259 ± 0.045 & 3.563 ± 1.918 \\
    Transformer & \textbf{0.147} \textbf{± 0.056} & \textbf{0.139} \textbf{± 0.048} & \textbf{3.022} \textbf{± 0.985} \\
    \bottomrule
    \end{tabular}
    }
    \end{table}
    
\vspace{-.2cm}


\section{Discussion and Conclusions}
In this study, we generated and released a large dataset of cycling RPE1 cells under both normal and drug-treated conditions and used it to investigate the utility of sequence models to infer the continuous cell cycle state from label-free brightfield images. 
Our analysis demonstrates that temporal sequence models can significantly improve the cell cycle prediction accuracy and enable the assessment of cell cycle state from brightfield images to a level comparable when using the more informative H2B channel. This suggests that brightfield morphological cues alone carry sufficient information for cell cycle inference.
Importantly, we found that causal state-space models substantially outperform commonly used fixed-history convolutional networks, demonstrating their potential for real-time computer vision and smart microscopy applications, where such causal inference is essential~\cite{simon_causalxtract_2025,mahecic2022event}. 
We note that the observed reduced accuracy on drug-treated cells underscores that creating general predictive cell state models for strong biological perturbations remains challenging.
For broader applicability, the model needs exposure to more diverse cell types, microscopy setups, and biologically perturbed conditions. Capturing variations in morphology, imaging parameters, and drug-induced alterations will enhance robustness, allowing it to reliably detect atypical or rare cell cycle dynamics. Furthermore, it would be valuable to explore probabilistic models such as variational autoencoders~\cite{zhao2019variational}, that can provide uncertainty estimates of the predicted cell cycle state instead of point predictions as we currently do.
Our findings demonstrate that sequence models can be effective predictors of cellular dynamics in more controlled settings,  providing a powerful tool for studying proliferation, cancer dynamics, and addressing cell cycle-related confounding factors. With larger datasets, models could improve drug effect predictions, paving the way for screening and broader biomedical applications. 




\clearpage  


\bibliography{midl25_241}

\newpage
\input{appendix.tex}


\end{document}
