\documentclass{midl} % Include author names

\newcommand{\ours}{OmniNet}
\newcommand{\eg}{\textit{e.g.}}
\newcommand{\ie}{\textit{i.e.}}
\newcommand{\etal}{\textit{et al.}}

% The following packages will be automatically loaded:
% jmlr, amsmath, amssymb, natbib, graphicx, url, algorithm2e
% ifoddpage, relsize and probably more
% make sure they are installed with your latex distribution

% table
\usepackage{booktabs}
\usepackage{footnote}
\makesavenoteenv{tabular}
\usepackage{threeparttable}
\usepackage{multirow}

\usepackage{mwe} % to get dummy images
\jmlryear{2026}
\jmlrworkshop{Full Paper -- MIDL 2026}
\jmlrvolume{-- 46}
\editors{Accepted for publication at MIDL 2026}

\title[OmniNet]{OmniNet: A Multi-Modality Neural Network for Robust Remote Respiratory Rate Measurement from Facial Video}

\midlauthor{\Name{Tsai-Ni Lin\nametag{$^{1,2}$}} \Email{tiffany10022000@gmail.com}\\
\Name{An-Sheng Liu\nametag{$^{3}$}} \Email{d00921006@ntu.edu.tw}\\
\Name{Li-Chen Fu\midljointauthortext{Corresponding author}\nametag{$^{3}$}} \Email{lichen@ntu.edu.tw}\\
\addr $^{1}$ Department of Mechanical Engineering, National Taiwan University, Taiwan \\
\addr $^{2}$ Department of Bioengineering, Hong Kong Science and Technology University, Hong Kong \\
\addr $^{3}$ Department of Electrical Engineering, National Taiwan University, Taiwan
}

\begin{document}

\maketitle

\begin{abstract}
Remote respiratory rate (RR) measurement has gained traction in recent studies due to its ability to reduce healthcare professionals' workload and patient discomfort. Recent studies have targeted this problem through remote photoplethysmography (rPPG) to capture subtle facial color changes. However, this technique is sensitive to lighting and motion variations. To this end, we propose \ours{}, a multimodal neural network that integrates image data processed through 3D convolutional neural networks (3D CNNs) with point of interest (POI) motion data and passes the fused features to Bidirectional Long Short-Term Memory (BiLSTM) to model long-term temporal dependencies. \ours{} achieves state-of-the-art performance by effectively capturing comprehensive spatial and temporal information while reducing illumination variation and motion-induced artifacts. It also requires fewer computational resources and enables faster inference compared to Transformer networks. The code has been released on GitHub: \url{https://github.com/tiffany-1002/OmniNet}.
\end{abstract}

\begin{keywords}
Remote Respiratory Rate Measurement, Multimodal Learning
\end{keywords}

\section{Introduction}


The respiratory rate (RR) is a critical vital sign that offers predictive insight into a variety of health conditions, including obstructive sleep apnea, asthma, and other respiratory disorders. Traditional RR monitoring has relied on contact sensors (\eg{} adhesive electrocardiogram (ECG) patches and respiration belts) that continuously record cardiac electrical activity or thoracic expansion. However, contact devices not only increase clinical workload but also cause discomfort or skin irritation during prolonged monitoring, especially in infants~\cite{ClinicalApplications2018} and patients with dermatological conditions or burns. 

Recent research in remote photoplethysmography (rPPG), a non-contact technique that detects subtle skin color variations through an RGB camera, has proposed various algorithms to obtain physiological signals. This method has been applied to estimate RR~\cite{Chen_2019}, heart rate (HR)~\cite{MALASINGHE2022117867, informatics9030057}, heart rate variability (HRV)~\cite{AdvancementsPoh}, and other physiological indicators. Although rPPG shows great potential, its reliance on skin color makes it sensitive to lighting variation~\cite{Tarassenko_2014} and prone to motion artifacts caused by head movements~\cite{Qiu2022, BOUSEFSAF2013568}.

Pixel-based RR techniques offer an alternative approach to estimate RR by directly tracking periodic pixel motion induced by thoracoabdominal respiration, providing greater robustness under varying illumination conditions. Among the representative techniques, optical flow detects motion by computing flow vectors of moving regions over time~\cite{neooptical, OPTICAL3}, while temporal differencing calculates pixel-wise intensity changes between consecutive video frames~\cite{MATCNN, temperaldiff}. 
These techniques are hampered by background noise in complex environments, often hindering the accurate separation of respiratory motion from unrelated movements~\cite{framedrawback}. 
Careful selection of the region of interest (ROI) has also been proven to raise the quality of respiratory signal. Commonly selected ROIs include the chest, abdomen~\cite{Janssen_2016}, and facial areas such as the forehead, cheeks, or nose~\cite{roinoses}.

Deep learning (DL) methods have emerged as powerful techniques to measure physiological signals. Convolutional neural networks (CNNs) have shown strong capabilities both in image understanding~\cite{rcnn, yolov1} and in extracting accurate, meaningful rPPG signals from low-quality facial videos. While 2D CNNs focus on spatial information~\cite{MATCNN, deephys}, 3D CNNs effectively model both global spatial and temporal information across frames~\cite{physnet, Cliff}.

Transformers, first used in Natural Language Processing (NLP) for sequence modeling, better capture long-term dependencies than convolutional networks and are well suited for modeling the periodic nature of respiratory signals due to their self-attention mechanism. 
Although Transformers achieve high accuracy, they suffer from relatively high complexity~\cite{actnet}, and their temporal attention can be inaccurate, leading to phase shifts and irrelevant attention~\cite{physformer, physformer++}. In camera-based physiological measurement, where data are more limited than in other vision tasks, Transformers often perform worse than CNNs despite their strong modeling capabilities~\cite{efficientphy}. Furthermore, Transformers require large-scale annotated facial videos to perform well, but such datasets are scarce and difficult to collect~\cite{selfsuperviseforsmalldataset}. CNNs are more effective in small-data settings but struggle to model long-term temporal dependencies. 

Recurrent neural networks (RNNs) are widely used DL models well-suited for time series and sequential data. Long Short-Term Memory (LSTM)~\cite{lstm} networks, a variant of RNNs, effectively capture temporal dependencies and are often combined with CNNs for physiological signal estimation. Transformers require substantial training data, whereas LSTMs generally provide more efficient performance in environments with limited resources.
While LSTMs are suitable for sequential modeling, their application to contactless RR estimation remains limited, as recent studies~\cite{KUMAR-lstm, ARIMAANDLSTM} have relied exclusively on contact-based datasets, without data derived from images or videos.

Moreover, existing contactless methods often rely on single-modality input and fail to integrate complementary information sources. 
To address these limitations, we propose a multimodal network named \ours{}. Several studies have explored multimodal approaches for respiratory or physiological signal estimation by combining facial motion, rPPG, or additional sensing modalities~\cite{shao2025, liao2024, Kong2024, Zheng2024, Gwak2024}. These methods demonstrate the potential of multimodal fusion to improve robustness under noise and motion artifacts.

By incorporating complementary multimodal inputs, our model reduces reliance on any single information source, which improves robustness to noise and leads to more stable respiratory rate estimation. \ours{} utilizes two types of information: frame differencing with a 3D CNN and POI motion trajectories. In this paper, we use a 3D CNN to capture temporal information, which is crucial for sequential signal extraction. Furthermore, POI motion trajectories, inspired by OPOIRES~\cite{OPOIRES}, are used to track meaningful points on the human torso rather than fixed ROIs. Then we fuse these two types of features and feed them into a bidirectional LSTM (BiLSTM) to leverage its strength in modeling long-range temporal dependencies.
We validate our approach on the COHFACE~\cite{cohface} dataset against state-of-the-art methods, where it achieves superior performance. Moreover, the lightweight architecture of \ours{} ensures efficient deployment on mobile and embedded devices, emphasizing its relevance for practical real-world usage, especially in mobile healthcare and telemedicine.

\begin{figure}[t]
\floatconts
  {fig:fig1}
  {\caption{\ours{} framework.}}
  {\includegraphics[width=0.8\textwidth]{images/modelstructure.png}}
\end{figure}




\section{Methodology}

\subsection{Overview of \ours{} Architecture}
\label{subsec:overview}
As illustrated in \figureref{fig:fig1}, \ours{} is a multimodal DL model designed for robust respiratory signal estimation from facial videos under challenging conditions, \eg{} illumination changes, head movements, and varying skin tones.
Given a batch of input videos $\mathbf{X}$ and POI-based motion signals $\mathbf{P}$, \ours{} adopts a dual-branch architecture consisting of a video stream and a POI stream. The video stream processes $\mathbf{X}$ with a lightweight 3D CNN to extract motion-aware spatiotemporal features while preserving temporal resolution. These features are globally averaged across spatial dimensions and linearly projected to form the image embedding $\mathbf{X}_{\text{img}}$. In parallel, the POI stream passes $\mathbf{P}$ through a linear projection layer to obtain $\mathbf{X}_{\text{poi}}$.
The two modality-specific features are concatenated along the last dimension and fused via a ReLU-activated fully connected layer, yielding $\mathbf{X}_{\text{fused}}$.
Temporal modeling is then performed using a single-layer BiLSTM. The resulting sequence $\mathbf{X}_{\text{lstm}}$ is passed through a dropout layer and a final linear projection to estimate the respiratory signal $\hat{\mathbf{Y}}$, from which the RR estimate is subsequently obtained via peak detection.



\begin{figure}[t]
\floatconts
  {fig:fig2}
  {\caption{Visualization of POI selection: The system detects the face and defines an ROI below it, within which POIs are selected. The $y$-axis trajectories of these POIs are tracked over time, with each line representing the positional change of a single coordinate. Candidate signals are filtered after preprocessing, including detrending, smoothing, and normalization. Detected peaks, marked with red dashed lines, indicate the estimated breathing cycles.}}
  {\includegraphics[width=\textwidth]{images/flow2.png}}
\end{figure}


\subsection{POI Selection}
\label{subsec:poi-selection}
Following the previous literature~\cite{OPOIRES}, we adopt the Viola-Jones algorithm for face detection due to its lightweight design and real-time performance. As shown in \figureref{fig:fig2}, after face detection, a rectangular region below the face is defined to approximate the chest area, where respiratory motion is most prominent.

To enhance contrast, Contrast Limited Adaptive Histogram Equalization (CLAHE) and Gaussian smoothing are applied to the selected region. Harris corner detection is then performed to identify POIs, followed by sub-pixel refinement. A non-maximum suppression step with a minimum distance constraint enforces spatial diversity, retaining up to 30 points. These POIs are subsequently mapped to full-frame coordinates.

Only the $y$-axis displacement of each POI is preserved~\cite{verticalbreath} after tracking POIs through frames using optical flow, resulting in one-dimensional temporal signals. Each signal is detrended, smoothed and standardized via z-score normalization:
\begin{equation}
\tilde{y}(t) = \frac{y(t) - \mu}{\sigma}\text{,}
\end{equation}
where $\mu$ and $\sigma$ denote the mean and standard deviation of $y(t)$.
The normalized autocorrelation function is then computed and scaled by its maximum to ensure consistency across signals, where $N$ denotes the length of the standardized signal $\tilde{y}(t)$:
\begin{equation}
R(\tau) = \sum_{t=\tau-N+1}^{N} \tilde{y}(t) \cdot \tilde{y}(t + N - \tau), \quad N \le \tau \le 2N-1\text{.}
\end{equation}

Signals with fewer than 2 or more than 30 autocorrelation peaks and with a mean difference of peak intervals exceeding 70 are discarded. To further enhance robustness, a Pearson correlation matrix $C \in \mathbb{R}^{S \times S}$ is computed among the autocorrelation functions of $S$ candidate signals, and the mutual similarity score for each is defined as:
\begin{equation}
L_j = \sum_{i=1}^{S} C(i, j)\text{,} \quad 1 \leq j \leq S\text{.}
\end{equation}
The top $\lceil S/2 \rceil$ signals with the highest $L_j$ scores are retained as valid candidate signals.




\subsection{Frame Differencing and Model Training}
\label{subsec:frame-differencing}

To extract respiration-induced motion (RIM) from video sequences, we employ a frame differencing strategy followed by a 3D CNN to capture both spatial and temporal dynamics of subtle respiratory movements.

\paragraph{Frame Differencing}
Given a facial video sequence $\{\mathbf{I}_1, \mathbf{I}_2, \dots, \mathbf{I}_T\}$ of $T$ grayscale frames, we compute consecutive frame differences to highlight temporal motion: $\mathbf{X}_t = \lvert\mathbf{I}_{t+1} - \mathbf{I}_t\rvert$.
This operation emphasizes motion features while reducing the impact of illumination changes. The resulting difference sequence $\mathbf{X}$ serves as an intermediate representation of potential respiratory activity. To preserve the temporal structure, we stack the differences as a new input volume to the subsequent 3D CNN encoder.
Similarly for POI, we compute consecutive sample differences: $\mathbf{P}_t = \tilde{y}(t+1) - \tilde{y}(t)$. Afterward, we sum up the differences of different POIs and perform z-score normalization on the temporal dimension, producing a difference sequence $\mathbf{P}$, followed by the same process as the video stream.

\paragraph{3D CNN Architecture}
To process the stacked difference volume, we adopt a lightweight 3D CNN encoder to model spatial and short-term temporal patterns. Specifically, the architecture consists of three convolutional blocks, each of which contains a 3D convolution layer with kernel size $3\times3\times3$, followed by BatchNorm3D and ReLU activation. The first two blocks also have a 3D max pooling layer with kernel size $1\times2\times2$ to downsample spatial dimensions while preserving the temporal axis.
The output feature maps are then averaged across the spatial dimensions and passed through a linear projection to obtain $\mathbf{X}_{\text{img}}$ for temporal alignment with $\mathbf{X}_{\text{poi}}$ from the POI stream.
This design allows the model to capture fine-grained temporal cues from motion-only input, making it more robust to appearance variations, head pose changes, and illumination artifacts compared to directly analyzing RGB frames.

\paragraph{BiLSTM Architecture}
Following the fusion of image and POI features, we apply a single-layer BiLSTM. This allows the model to incorporate both past and future contexts for each time step. A dropout layer is applied to the LSTM output before a final fully connected layer maps each temporal feature vector to a scalar prediction $\hat{Y}(t)$.

\paragraph{Optimization Strategy}
The loss function is the mean squared error (MSE) between the predicted respiration belt signals $\hat{Y}(t)$ and the ground truth values $Y(t)$, defined as:
\begin{equation}
\mathcal{L}_{\text{MSE}} = \frac{1}{T} \sum_{t=1}^{T} \left( \hat{Y}(t) - Y(t) \right)^2 \text{,}
\end{equation}
where $T$ denotes the number of temporal samples in each sequence.
We refer the reader to \appendixref{app:optimization} for more details.

\subsection{Peak Detection and RR Estimation}
\label{subsec:rr-estimation}
To estimate the RR, we first preprocess the respiration belt signal using Savitzky-Golay smoothing, followed by a fourth-order Butterworth band-pass filter with cutoff frequencies of 0.1--0.4\,Hz to suppress noise and baseline drift. Peak detection is performed using the \texttt{find\_peaks} signal processing function built into the SciPy library, where a dynamic threshold is applied to exclude spurious local maxima. The minimum peak distance is set to 6 frames to avoid detecting peaks caused by motion artifacts.

If multiple valid peaks are found, we compute the pairwise inter-peak intervals and calculate the average breathing cycle. The RR (in breaths per minute) is then given by: $\text{RR} = 60 / \bar{\Delta}$, where $\bar{\Delta}$ denotes the average peak interval in seconds. If fewer than two peaks are detected, the signal is excluded from the RR computation.

\subsection{Further Discussion on Multimodal Fusion}
Although both modalities are derived from the same input video, they encode fundamentally different information at the representation level. The POI branch captures sparse, low-dimensional geometric motion by tracking selected landmarks, whereas the video branch models dense, high-dimensional appearance and pixel-level temporal motion patterns using 3D convolution. Consequently, the two modalities exhibit distinct inductive biases and capture complementary rather than redundant aspects of respiratory motion.

From a machine learning perspective, features from one modality could indeed dominate or interfere with the other if early fusion or shared representations were used. To address this risk, the two branches are processed by independent encoders without parameter sharing, which prevents feature leakage and enforces modality disentanglement during representation learning. Unimodal baselines are trained independently using only their respective inputs, ensuring that their performance reflects the intrinsic properties of each modality alone.

In the multimodal setting, fusion is performed using a late fusion strategy. Specifically, modality-specific features are concatenated and projected through a lightweight learnable linear layer before temporal modeling. This design ensures that fusion occurs at the feature level rather than at the raw signal level, limits cross-modal interference, and allows the network to implicitly adjust the relative contribution of each modality based on data reliability. Empirically, this approach yields consistent improvements over both single-modality variants, confirming that fusion leverages complementary cues instead of amplifying shared noise.

\section{Experiments}

\subsection{Dataset and Experimental Setup}
\label{subsec:dataset}
Due to the limited availability of RR datasets, we rely solely on the COHFACE~\cite{cohface} dataset for our experiments.

\textbf{COHFACE:} The COHFACE dataset contains 160 one-minute RGB videos of 40 subjects (12 women, 28 men), with each subject contributing two videos under studio lighting and two under natural light. Videos were recorded at $640 \times 480$ resolution and 20\,Hz, while respiration belt and blood volume pulse signals from the same subject were simultaneously recorded at 256\,Hz. Official experimental splits are provided under three protocols: all, clean, and natural. Each protocol offers standard train/dev/test partitions for evaluating model robustness across varied illumination conditions. Data from the same subject never appear in more than one partition, preventing temporal and subject leakage.

The original 20\,Hz video is temporally downsampled to 4\,Hz by selecting one frame every five frames. This method aligns the visual input with the respiration belt signal, which is also downsampled to 4\,Hz, enabling a more efficient calculation and facilitating synchronization between modalities. We set the input window size to 240 frames, which corresponds to the one-minute length of both the differential image sequences and the respiration belt readings.

The predicted respiration belt signals and the ground truth values are processed by the same procedures described in \sectionref{subsec:rr-estimation}, obtaining the estimated and real RR.
We evaluated the performance of the model using three standard metrics: mean absolute error (MAE), root mean square error (RMSE), and Pearson correlation coefficient (PCC). Standard deviation of absolute error (STD) is also used when comparing with OPOIRES~\cite{OPOIRES}. These metrics are described in \appendixref{app:metrics}.

In addition, a detailed description of the experimental environment is provided in \appendixref{app:environment}.

\subsection{Intra-Database Tests}
\label{subsec:intratest}

\begin{table}[ht]
\floatconts
  {tab:comparisonall}
  {\caption{Performance comparison between existing methods and our method on the COHFACE dataset.}}
  {%
    \resizebox{\textwidth}{!}{%
    \begin{tabular}{lccc} 
    \toprule
    Methods & MAE (bpm) $\downarrow$ & RMSE (bpm) $\downarrow$& PCC $\uparrow$   \\ \midrule
    DeepPhys~\cite{deephys} & 3.21 & 6.56 & 0.52 \\
    TS-CAN~\cite{MATCNN} & 2.97 & 5.49 & 0.63 \\
    TS-DAN~\cite{DualAN} & 2.83 & 5.72 & 0.59 \\
    PhysNet~\cite{physnet} & 2.31 & 4.77 & 0.67 \\
    EVM-MPP~\cite{EVM-MPP} & 1.63 & 2.10 & 0.45 \\
    PhysFormer~\cite{physformer} & 1.44 & 2.29 & 0.62 \\
    ACTNet~\cite{actnet}  & 1.08 & 1.57 & 0.81 \\
    CliffPhys~\cite{Cliff} & 0.83 & 1.97 & 0.86 \\ 
    \ours{} & \textbf{0.24} & \textbf{0.42} & \textbf{0.99} \\  
    \bottomrule
    \end{tabular}%
    }

    \begin{tablenotes}
        \footnotesize
        \item[1] All results are reported on the COHFACE test set. The CliffPhys model is pre-trained on the SCAMPS dataset and fine-tuned using the COHFACE training set. All other results, including TS-CAN, TS-DAN, PhysFormer, and ACTNet, are cited from ACTNet.
    \end{tablenotes}
  }

\end{table}

The results on the COHFACE dataset are shown in \tableref{tab:comparisonall}. We follow the official training and testing protocols provided by the dataset.
\ours{} achieves the lowest MAE (0.24) and RMSE (0.42) and the highest PCC (0.99) on the COHFACE test set. These results surpass all compared baselines, demonstrating the effectiveness of our framework for accurate RR estimation.



\begin{table}[ht]
\floatconts
  {tab:naturalcleanresp}
  {\caption{Performance of \ours{} on the COHFACE dataset under different combinations of training and test subsets.}}
  {%
    \resizebox{0.85\textwidth}{!}{%
    \begin{tabular}{llccc} 
    \toprule
    Train Set & Test Set & MAE (bpm) $\downarrow$ & RMSE (bpm) $\downarrow$& PCC $\uparrow$   \\ \midrule
    All & All & 0.242 & 0.421 & 0.991  \\  
    All & Clean & \textbf{0.223}& \textbf{0.404} & \textbf{0.993} \\  
    All & Natural & 0.261 & 0.437 & 0.991 \\  
    Clean & Clean & 0.234 & 0.511 & 0.989 \\
    Natural & Natural & 0.307 & 0.588 & 0.985 \\  
    \bottomrule
    \end{tabular}%
    }
  }
\end{table}

We further assess illumination robustness by training and testing \ours{} on the clean and natural subsets separately, reporting all results to three-decimal precision. As shown in \tableref{tab:naturalcleanresp}, training on the full dataset (All) and testing on the clean subset achieves the lowest MAE (0.223) and RMSE (0.404) and the highest PCC (0.993). Although the clean subset yields slightly better performance, the differences across illumination conditions remain small, indicating that \ours{} is robust to lighting variations and performs well even under natural lighting.

Moreover, training on the whole dataset consistently outperforms training solely on a single subset, likely due to the increased diversity and volume of training data that improve the model's generalizability.
For example, the MAE reduces from 0.307 to 0.261 and the RMSE from 0.588 to 0.437 with the PCC increasing from 0.985 to 0.991 when switching from natural-subset training and testing to whole-set training and natural-subset testing.
The consistent trend across MAE, RMSE, and PCC reinforces the effectiveness of using more comprehensive training data.

\begin{figure}[t]
\floatconts
  {fig:figall}
  {\caption{Comparison of predicted and ground truth respiration belt signals for Subject 10 using the model trained on the whole COHFACE dataset. Dots indicate peak positions of the ground truth and predicted signals.}}
  {%
    \subfigure[Clean lighting condition]{\includegraphics[width=0.49\textwidth]{images/subject_10_allclean.png}}
    \subfigure[Natural lighting condition]{\includegraphics[width=0.49\textwidth]{images/subject_10_allnatural.png}}
  }
\end{figure}

\begin{figure}[t]
\floatconts
  {fig:figall2}
  {\caption{Comparison of predicted and ground truth respiration belt signals for Subject 8 using models trained and tested on the same condition of the COHFACE dataset under different lighting settings. Dots indicate peak positions of the ground truth and predicted signals.}}
  {%
    \subfigure[Clean lighting condition]{\includegraphics[width=0.49\textwidth]{images/subject_8_onlyclean.png}}
    \subfigure[Natural lighting condition]{\includegraphics[width=0.49\textwidth]{images/subject_8_onlynatural.png}}
  }
\end{figure}


\figureref{fig:figall} illustrates the respiration belt signals predicted by models trained on the whole dataset for a representative sample under clean and natural lighting conditions. The predicted curves closely follow the ground truth in the clean condition, with accurate peak detection, while predictions under natural lighting exhibit slight deviations. To further analyze the effect of training data quality, \figureref{fig:figall2} compares the results when training and testing solely on clean versus natural subsets, using a representative sample with notably divergent behavior. When trained on the natural subset, the model produces significant mismatches, including unstable fluctuations and incorrect peak detection. In contrast, training on the clean subset results in more accurate and stable predictions.

Since our method was inspired by the framework proposed in OPOIRES~\cite{OPOIRES}, we conducted a direct comparison using their reported best setting (60-second input). For a fair comparison, we also adopted STD as one of the evaluation metrics, in addition to MAE and RMSE. All models are trained on the whole dataset and evaluated separately on the clean and natural test subsets.


\begin{table}[ht]
\floatconts
  {tab:cleanall}
  {\caption{Performance comparison between OPOIRES and our method on the two COHFACE test subsets, using models trained on the whole dataset.}}
  {%
    \resizebox{\textwidth}{!}{%
    \begin{tabular}{llccc} 
    \toprule
    Test Set & Methods & MAE (bpm) $\downarrow$ & RMSE (bpm) $\downarrow$ & STD (bpm) $\downarrow$  \\ \midrule
    \multirow{2}{*}{Clean} & OPOIRES (60s)~\cite{OPOIRES} & 0.48 & 1.13 & 1.03 \\
    & \ours{} & \textbf{0.22} & \textbf{0.40} & \textbf{0.34} \\  \midrule
    \multirow{2}{*}{Natural} & OPOIRES (60s)~\cite{OPOIRES} & 0.60 & 1.40 & 1.28 \\
    & \ours{} & \textbf{0.26} & \textbf{0.44} & \textbf{0.35} \\
    \bottomrule
    \end{tabular}%
    }
  }
\end{table}



As shown in \tableref{tab:cleanall}, our method significantly outperforms OPOIRES.
These results demonstrate the robustness and effectiveness of our multimodal design. Compared to relying solely on POI selection, incorporating multiple modalities leads to more accurate and stable RR estimation.

To evaluate whether the proposed method satisfies commonly adopted clinical agreement criteria ($\pm$2 breaths per minute relative to manual counting as the gold standard, as reported in \citet{Goldfine2024}), we compute the absolute RR error using the model trained on the full dataset. Detailed results are reported in \appendixref{app:error}. We observe that all predictions exhibit absolute errors of approximately 1\,bpm or less, indicating that \ours{} meets established clinical accuracy requirements.






\subsection{Ablation Studies}
\label{subsec:ablation}
\subsubsection{Impact of 3D CNN}
\label{subsubsec:cnnabl}

\begin{table}[ht]
\floatconts
  {tab:w/ocnn}
  {\caption{Performance comparison under ablation settings on the COHFACE dataset, using models trained on the whole dataset.}}
  {%
    \resizebox{\textwidth}{!}{%
    \begin{tabular}{llccc} 
    \toprule
    Test Set & Methods & MAE (bpm) $\downarrow$ & RMSE (bpm) $\downarrow$ & PCC $\uparrow$   \\ \midrule
    \multirow{2}{*}{All} & \ours{}  & \textbf{0.242} &  \textbf{0.421} &  \textbf{0.991}  \\  
    & \ours{} (\textit{w/o 3D CNN}) & 0.269 & 0.500 & 0.988\\ \midrule
    \multirow{2}{*}{Clean} & \ours{}& \textbf{0.223} & \textbf{0.404} & \textbf{0.993} \\
    & \ours{} (\textit{w/o 3D CNN}) & 0.273 & 0.552 & 0.986 \\ \midrule
    \multirow{2}{*}{Natural} & \ours{} &\textbf{0.261} & \textbf{0.437} & \textbf{0.991} \\    
    & \ours{} (\textit{w/o 3D CNN}) & 0.265 & 0.442 & 0.991\\  
    \bottomrule
    \end{tabular}%
    }
  }
\end{table} 


In this ablation study, we evaluated the impact of removing the 3D CNN module. As shown in the first two rows of \tableref{tab:w/ocnn}, removing the 3D CNN results in only a slight performance drop on the whole dataset (All).

On both the natural and clean test sets, the full model consistently outperforms the model without 3D CNN.
However, the changes are relatively small, indicating that the POI branch alone can still capture meaningful temporal features.
These findings highlight the value of our multimodal architecture. While the POI branch offers robustness in noisy conditions, the integration of image-level features through 3D CNN provides complementary information that is especially beneficial in clean and stable environments. A detailed comparison of all POI-related hyperparameters is provided in \appendixref{app:poi}.



\subsubsection{Impact of POI Selection}
\label{subsubsec:ipoiabl}

\begin{table}[ht]
\floatconts
  {tab:w/opoi}
  {\caption{Performance comparison under ablation settings on the COHFACE dataset, using models trained on the whole dataset.}}
  {%
    \resizebox{\textwidth}{!}{%
    \begin{tabular}{llccc} 
    \toprule
    Test Set& Methods & MAE (bpm) $\downarrow$ & RMSE (bpm) $\downarrow$ & PCC $\uparrow$   \\ \midrule
    \multirow{2}{*}{All} & \ours{}  & \textbf{0.242} &  \textbf{0.421} &  \textbf{0.991}  \\  
     & \ours{} (\textit{w/o POI}) & 0.744 & 1.191 & 0.939 \\ \midrule
    \multirow{2}{*}{Clean} & \ours{} & \textbf{0.223} & \textbf{0.404} & \textbf{0.993} \\ 
    &\ours{} (\textit{w/o POI}) & 0.707 & 1.126 & 0.949 \\ \midrule
    \multirow{2}{*}{Natural} & \ours{} & \textbf{0.261}& \textbf{0.437} & \textbf{0.991} \\  
    & \ours{} (\textit{w/o POI}) & 0.782 & 1.252 & 0.931 \\ 
    \bottomrule
    \end{tabular}%
    }
  }
\end{table}

For all these ablation experiments, all CNN-related hyperparameters are fixed to the best-performing configuration to ensure a fair comparison. As shown in \tableref{tab:w/opoi}, removing the POI modality leads to substantial performance degradation, with a much greater impact than removing the 3D CNN.
This may be because respiratory motion is highly localized and periodic, making POI features more effective in capturing relevant temporal patterns. 
In contrast, 3D CNN processes broader spatiotemporal regions, which are more susceptible to noise from lighting variations or non-respiratory motion.
Nonetheless, even when either the image or the POI modality is ablated, \ours{} still outperforms all baselines in \tableref{tab:comparisonall}. These results highlight the effectiveness and robustness of the model's multimodal design, as well as the complementary nature of image and POI features.

We further assess robustness under partial information loss by conducting test-time degradation experiments, where the model is trained with both modalities but evaluated with one modality deliberately corrupted while the other remains unchanged. Under severe single-modality corruption, the model maintains strong performance across all metrics, exhibiting only moderate degradation relative to the original setting. Detailed results are reported in \appendixref{app:degradtion}.

To assess whether the reported performance gains are statistically meaningful, we conducted subject-level statistical validation using paired Wilcoxon signed-rank tests, detailed in \appendixref{app:wilcoxon}.






\subsection{Analysis under Facial Occlusion Scenarios}
While the robustness evaluation scenarios provided by the COHFACE dataset mainly focus on varied illumination, \ie{} clean vs.\ natural lighting conditions, real-world deployment for remote RR measurement also involves additional challenges such as facial occlusion, head movement, speaking, and camera motion. To evaluate the effect of facial occlusion, we conduct an additional robustness analysis that simulates this factor in \appendixref{app:occlusion}. The results show that respiration estimation performance remains consistent across different occlusion strategies. Although this simulation does not exhaustively cover all real-world occlusion patterns, it provides a meaningful evaluation of robustness under partial facial occlusion. Systematic evaluation of the remaining factors (head movement, speaking, and camera motion), however, requires reliable ground-truth respiration signals recorded under those conditions, which are not available in COHFACE. We therefore avoid over-claiming robustness and identify broader stress testing as an important direction for future work.

\subsection{Complexity Analysis}
We also compare the complexity of the proposed method with that of the baselines, from which we conclude that \ours{} achieves not only the best estimation performance but also the lowest complexity among the compared methods. We refer the reader to \appendixref{app:complexity} for more details.

\subsection{Comparison Between CNN- and Transformer-Based Encoders}
Although \ours{} is a multimodal framework, its architecture is modular, allowing the image-based encoder to be replaced without altering the overall design. To account for the growing use of Transformers in video modeling, we conduct a controlled comparison by replacing the 3D CNN with a lightweight Vision Transformer (ViT) while keeping all other components unchanged.

As shown in \appendixref{app:CNNTRANSFORMER}, CNN-based encoders achieve better performance under limited-data regimes. A more detailed analysis and discussion are provided in the appendix.



\section{Conclusion and Future Work}
In this paper, we proposed \ours{}, a lightweight multimodal framework for remote respiratory rate (RR) estimation. Our method integrates pixel-based POI selection with a 3D CNN module using frame differencing and a BiLSTM decoder to model temporal dynamics. Through extensive experiments on the COHFACE dataset, \ours{} consistently outperforms existing methods under all conditions, particularly in clean data scenarios. Notably, our network has the smallest parameter size and computational complexity among all compared approaches, making it suitable for deployment on portable medical devices. 


Despite its strong performance, our method has limitations. RR is a semi-voluntary signal that can be consciously controlled by subjects, potentially introducing noise in measurements.
Furthermore, the current pipeline relies on motion cues extracted after face detection, with respiration primarily inferred from the lower face, neck, and shoulder regions. Consequently, large pose variations, significant body motion, speech activity, and scenarios involving complete facial coverage may degrade performance by altering or obscuring respiration-related motion patterns.


Although explicit measures are taken to prevent temporal and subject leakage through non-overlapping temporal windows and subject-independent data splits, the study is conducted on a relatively small, single-dataset benchmark. Therefore, residual intra-subject correlations related to recording conditions may persist and limit generalization to unseen acquisition settings.


 Nevertheless, we believe that \ours{} remains valuable in specific clinical contexts.
It is particularly suitable for populations with limited voluntary motion, such as burn patients, newborns, or patients during sleep, where contact-based measurements are impractical.
The proposed approach is also well suited for telemedicine and home-care scenarios, enabling unobtrusive monitoring of respiratory trends during sleep.


In future work, we plan to 1) further validate the generalizability of \ours{} through cross-dataset evaluation on additional RR benchmarks, 2) improve the pipeline by developing end-to-end architectures, and 3) extend the framework to comprehensive vital sign estimation in practical clinical environments.


\clearpage  % Acknowledgements, references, and appendix do not count toward the page limit (if any)
% Acknowledgments---Will not appear in anonymized version
\midlacknowledgments{This research was supported by the National Science and Technology Council \& National Taiwan University, Taiwan,
under the grant numbers 113-2634-F-002-002-, 113-2223-E-002-006- \& 113-2221-E-002-127-MY3.}


\bibliography{midl26_46}

\appendix

\section{Optimization Strategy}
\label{app:optimization}
The model is trained with a batch size of 8. We use the Adam optimizer, and the learning rate is tuned per model to maximize performance, with $5 \times 10^{-3}$ serving as the common starting point in most configurations.
To stabilize training and prevent gradient explosion at the early stage, we adopt a linear learning rate schedule with warm-up. Specifically, the learning rate increases linearly during the first $N_\text{warmup}=10$ steps, followed by a linear decay until the total number of training steps $N_\text{total}=100$. 
The learning rate at step $i$ is computed as $\eta_i \cdot \text{lr}_\text{init}$, where
\begin{equation}
\eta_i =
\begin{cases}
\frac{i}{N_\text{warmup}} & \text{if } i < N_\text{warmup} \text{,} \\
\frac{N_\text{total} - i}{N_\text{total} - N_\text{warmup}} & \text{otherwise.}
\end{cases}
\end{equation}
This scheduling strategy facilitates stable convergence by allowing sufficient exploration in the initial phase and finer adjustment in the later training stages. Early stopping is triggered after 10 epochs of no improvement. At each epoch, we save the best model based on the loss of the development set, which is used for all subsequent evaluations.


\section{Evaluation Metrics}
\label{app:metrics}
The four evaluation metrics are defined as follows, where $n$ is the number of samples, $\hat{r}_i$ and $r_i$ denote the predicted and ground-truth RR of the $i$-th sample, and $\bar{\hat{r}}$ and $\bar{r}$ denote their respective means over the $n$ samples.

\textbf{Mean absolute error (MAE)} measures the average deviation between predicted and actual RR (in bpm):
\begin{equation}
\text{MAE} = \frac{1}{n} \sum_{i=1}^{n} \left| \hat{r}_i - r_i \right| \text{.}
\end{equation}

\textbf{Root mean square error (RMSE)} quantifies overall prediction error magnitude:
\begin{equation}
\text{RMSE} = \sqrt{ \frac{1}{n} \sum_{i=1}^{n} (\hat{r}_i - r_i)^2 } \text{.}
\end{equation}

\textbf{Pearson correlation coefficient (PCC)} indicates linear correlation between predicted and true values:
\begin{equation}
\text{PCC} = \frac{ \sum_{i=1}^{n} (\hat{r}_i - \bar{\hat{r}})(r_i - \bar{r}) }{ \sqrt{ \sum_{i=1}^{n} (\hat{r}_i - \bar{\hat{r}})^2 } \cdot \sqrt{ \sum_{i=1}^{n} (r_i - \bar{r})^2 } } \text{.}
\end{equation}

\textbf{Standard deviation of absolute error (STD)} measures the variability of the absolute errors:
\begin{equation}
  \text{STD} = \sqrt{ \frac{1}{n} \sum_{i=1}^{n} \left( \left| \hat{r}_i - r_i \right| - \frac{1}{n} \sum_{j=1}^{n} \left| \hat{r}_j - r_j \right| \right)^2 } \text{.}
\end{equation}

\section{Experimental Environment}
\label{app:environment}
The experimental environment consists of a Windows 11 64-bit operating system running on a machine equipped with an Intel Core i9-14900HX processor (2.20 GHz) and 32 GB of system memory. All experiments were conducted using an NVIDIA GeForce RTX 4060 GPU with CUDA version 12.5. The proposed method is implemented in Python (version 3.12) using the PyTorch (version 2.6) deep learning framework.

\section{Clinical Agreement Analysis of Absolute Error Distributions}
\label{app:error}
To visualize the error distributions, we adopt the standard Tukey boxplot convention, where the box represents the interquartile range (IQR) between the first quartile (Q1, 25th percentile) and the third quartile (Q3, 75th percentile), the central line denotes the median, and the whiskers extend to the most extreme values lying within $1.5\times\mathrm{IQR}$ of the box edges (Q1 and Q3). Values beyond this range are treated as outliers.

\begin{figure}[ht]
\floatconts
  {fig:figerror}
  {\caption{Absolute respiratory rate error distributions across different test sets ($N=64$, $32$, $32$) using the model trained on the whole dataset.}}
  {\includegraphics[width=0.75\textwidth]{images/Figure_1.png}}
\end{figure}

As shown in \figureref{fig:figerror}, all predictions fall well within $\pm1$~bpm of the ground truth, and at least $75\%$ of the samples exhibit absolute errors below $0.233$~bpm, indicating strong estimation consistency. These results confirm that the proposed approach achieves clinically acceptable accuracy and demonstrates strong potential for practical deployment in real-world respiratory monitoring scenarios.

\section{Analyses of POI Selection Parameters}
\label{app:poi}
\begin{table}[ht]
\floatconts
  {tab:poi_analyses}
  {\caption{Analyses of POI-related hyperparameters on RR estimation performance on the whole COHFACE dataset.}}
  {%
    \resizebox{\textwidth}{!}{%
    \begin{tabular}{cccccc}
      \toprule
      \#POIs & Quality level & MD & MAE (bpm) $\downarrow$ & RMSE (bpm) $\downarrow$ & PCC $\uparrow$ \\
      \midrule
      \multicolumn{6}{l}{\textit{Default configuration}} \\
      \midrule
      100 & $10^{-4}$ & 2 & \textbf{0.242} & \textbf{0.421} & \textbf{0.991} \\
      \midrule
      \multicolumn{6}{l}{\textit{Varying number of POIs, others fixed to default}} \\
      \midrule
      50  & $10^{-4}$ & 2 & 0.272 & 0.503 & 0.988 \\
      150 & $10^{-4}$ & 2 & 0.255 & 0.518 & 0.987 \\
      \midrule
      \multicolumn{6}{l}{\textit{Varying quality level, others fixed to default}} \\
      \midrule
      100 & $10^{-2}$   & 2 & 0.355 & 0.723 & 0.976 \\
      100 & $10^{-3}$   & 2 & 0.248 & 0.476 & 0.989 \\
      100 & $10^{-5}$ & 2 & 0.264 & 0.523 & 0.987 \\
      \midrule
      \multicolumn{6}{l}{\textit{Varying minimum distance (MD), others fixed to default}} \\
      \midrule
      100 & $10^{-4}$ & 4 & 0.277 & 0.512 & 0.987 \\
      100 & $10^{-4}$ & 6 & 0.273 & 0.575 & 0.984 \\
      \bottomrule
    \end{tabular}%
    }
  }
\end{table}

We tuned the number of POIs (\#POIs), quality level, and minimum distance (MD) in the POI stream, showing the results in \tableref{tab:poi_analyses}. Based on these analyses, we fix the POI-related hyperparameters to their best-performing values and adopt this configuration as the default in all subsequent experiments.

\section{Test-Time Robustness Analysis under Single-Modality Degradation}
\label{app:degradtion}

To assess robustness under partial information loss, we conduct additional test-time degradation experiments. The model is trained using both modalities on the whole dataset and evaluated under settings where one modality is deliberately corrupted while the other remains unchanged, simulating partial modality failure at inference time. Additive Gaussian noise is adopted as the corruption type for both modalities. Image noise is injected directly in the 8-bit \texttt{uint8} intensity space, whereas POI noise is applied in the z-score normalized coordinate space. Therefore, noise magnitudes are not directly comparable across modalities. 

For POI trajectories, respiration-induced displacements in the normalized space typically lie within a small range around zero, and we therefore apply a relatively large noise level ($\sigma = 1$) to simulate severe degradation of the POI modality. For image corruption, we empirically observe that small noise levels (\eg{} $\sigma \le 5$) have negligible impact.
Consequently, we adopt a stronger noise level ($\sigma = 30$), corresponding to a peak signal-to-noise ratio (PSNR) of approximately 18.6\,dB. According to \citet{tian2023}, PSNR values below 20\,dB correspond to unacceptable image quality.

\begin{table}[ht]
\floatconts
  {tab:test_time_degradation}
  {\caption{Performance under test-time single-modality degradation.}}
  {%
    \resizebox{\textwidth}{!}{%
    \begin{tabular}{lccc}
    \toprule
    Degradation Setting & MAE (bpm) $\downarrow$ & RMSE (bpm) $\downarrow$ & PCC $\uparrow$ \\
    \midrule
    Original & \textbf{0.242} & \textbf{0.421} & \textbf{0.991} \\
    POI noise ($\sigma = 1$, z-score space) & 0.379 & 0.717 & 0.976 \\
    Image noise ($\sigma = 30$, 8-bit space) & 0.294 & 0.524 & 0.987 \\
    \bottomrule
    \end{tabular}%
    }
  }
\end{table}

Table~\ref{tab:test_time_degradation} summarizes the performance under different test-time degradation settings. Even under severe single-modality degradation, the model maintains strong performance across all evaluation metrics. These results indicate that the proposed multimodal framework leverages complementary information across modalities and achieves robust performance without strict dependence on any single modality.


\section{Statistical Validation of Ablation Studies}
\label{app:wilcoxon}

\begin{table}[ht]
\floatconts
    {tab:ablation_stats}
    {\caption{Subject-level statistical validation of ablation studies on the COHFACE dataset. $\Delta$MAE denotes the increase in MAE after removing a component (ablated vs. full). Statistical significance is evaluated using paired Wilcoxon signed-rank tests with bootstrap 95\% confidence intervals.}}
    {%
        \resizebox{\textwidth}{!}{%
        \begin{tabular}{l l c c c c c}
        \toprule
        Condition & Comparison & $N$ & $\Delta$MAE (bpm) & 95\% CI & $p$-value & Cohen's $d_z$ \\
        \midrule
        All     & Full vs. w/o 3D CNN & 16 & 0.027 & $[-0.027,\;0.087]$ & 0.221 & 0.22 \\
        Clean   & Full vs. w/o 3D CNN & 16 & 0.050 & $[-0.057,\;0.170]$ & 0.248 & 0.20 \\
        Natural & Full vs. w/o 3D CNN & 16 & 0.004 & $[-0.003,\;0.012]$ & 0.249 & 0.24 \\
        \midrule
        All     & Full vs. w/o POI    & 16 & 0.502 & $[0.237,\;0.875]$ & \textbf{0.00021} & 0.73 \\
        Clean   & Full vs. w/o POI    & 16 & 0.484 & $[0.197,\;0.898]$ & \textbf{0.00076} & 0.64 \\
        Natural & Full vs. w/o POI    & 16 & 0.521 & $[0.199,\;0.906]$ & \textbf{0.00269} & 0.70 \\
        \bottomrule
        \end{tabular}%
        }
    }
\end{table}

To assess whether the reported performance gains are statistically meaningful, we conducted subject-level statistical validation using paired Wilcoxon signed-rank tests. All comparisons were performed on a per-subject basis to avoid bias arising from correlated samples. In addition, we report effect sizes (Cohen's $d_z$) to quantify the magnitude of the observed differences. To estimate uncertainty, we computed 95\% confidence intervals of the mean MAE difference (ablated vs.\ full) via non-parametric bootstrap resampling (20,000 iterations with a fixed random seed for reproducibility). All reported statistics are therefore based on subject-level paired measurements, ensuring a fair and statistically sound comparison.

The statistical results in \tableref{tab:ablation_stats} show that removing the POI branch leads to a statistically significant increase in MAE across the All, Clean, and Natural conditions ($\Delta\text{MAE} \approx 0.48\text{--}0.52\,\text{bpm}$, $p < 0.01$, with medium-to-large effect sizes), confirming that the POI branch is a major contributor to the observed performance improvements. In contrast, removing the 3D CNN results in only small and statistically non-significant changes in MAE across all conditions ($p > 0.22$, small effect sizes, with confidence intervals that include zero), indicating a limited auxiliary contribution, although the direction of the change is consistent ($\Delta\text{MAE} > 0$ in every condition).

\section{Detailed Analysis under Facial Occlusion Scenarios}
\label{app:occlusion}

\begin{figure}[ht]
\floatconts
  {fig:figblurblock}
  {\caption{Qualitative illustration of facial occlusion modeling using blur-based and block-based occlusion on the lower facial region to simulate occlusion-related noise, such as mask wearing or hand occlusion.}}
  {%
    \subfigure[blur-based occlusion]{\includegraphics[width=0.49\textwidth]{images/blur.png}}
    \subfigure[block-based occlusion]{\includegraphics[width=0.49\textwidth]{images/block.png}}
  }
\end{figure}

Given the limited number of available public datasets, we conduct additional simulations on the COHFACE dataset to evaluate the robustness of our method under realistic noise conditions. We focus on facial occlusions commonly encountered in practice, such as mask wearing or partial hand occlusion, which may affect face detection and motion extraction. 

We simulate lower-face occlusions using two strategies: blur-based and block-based masking, as illustrated in \figureref{fig:figblurblock}.
The occlusion is applied only to the lower facial region, while the eye region is preserved, as this setting reflects common real-world occlusion patterns and maintains reliable face detection.

Our framework is modular and detector-agnostic, allowing different face detectors to be used without modifying the core model.
By default, we adopt the Viola-Jones detector for efficiency, while a MediaPipe-based detector is optionally evaluated in occlusion-heavy scenarios to ensure stable face localization.
This choice introduces only a modest increase in model complexity (0.135M parameters).

\label{app:blurandblock}
\begin{table}[ht]
\floatconts
  {tab:blur_block}
  {\caption{Performance comparison under different occlusion strategies using the \ours{} model, trained on the whole dataset.}}
  {%
    \resizebox{0.85\textwidth}{!}{%
    \begin{tabular}{llccc}
        \toprule
        Test Set & Strategy
        & MAE (bpm) $\downarrow$
        & RMSE (bpm) $\downarrow$
        & PCC $\uparrow$ \\
        \midrule
        \multirow{3}{*}{All}&
        Original & \textbf{0.242} & \textbf{0.421} & \textbf{0.991} \\
        &Blur  & 0.272 & 0.507 & 0.988 \\
        &Block & 0.264 & 0.526 & 0.987 \\
        \midrule
        \multirow{3}{*}{Clean}&
        Original & \textbf{0.223} & \textbf{0.404} & \textbf{0.993} \\
        &Blur  & 0.265 & 0.538 & 0.987 \\
        &Block & 0.244 & 0.517 & 0.988 \\
        \midrule
        \multirow{3}{*}{Natural}&
        Original & \textbf{0.261} & \textbf{0.437} & \textbf{0.991} \\
        &Blur  & 0.278 & 0.473 & 0.989 \\
        &Block & 0.284 & 0.534 & 0.986 \\
        \bottomrule
    \end{tabular}%
    }
  }
\end{table}

The results in Table~\ref{tab:blur_block} show that the overall respiration estimation performance remains highly consistent across different occlusion modeling strategies. Although this simulation does not exhaustively cover all possible real-world occlusion patterns, it provides a meaningful evaluation of the model's robustness. In \ours{}, the POI-based motion representation places greater emphasis on shoulder and upper-torso motion cues, which are less sensitive to facial visibility. Consequently, performance differences across settings remain limited.

\section{Complexity Analysis}
\label{app:complexity}

\begin{table}[ht]
\floatconts
  {tab:complexity}
  {\caption{Comparison of the complexity between existing methods and our method.}}
  {% 
    \resizebox{\textwidth}{!}{%
    \begin{tabular}{lccc}
        \toprule
        Methods & Param. (M) $\downarrow$& MACs (G) $\downarrow$& MAE (bpm) $\downarrow$ \\ \midrule
        DeepPhys~\cite{deephys} & 7.50 & 111.76 & 3.21 \\
        TS-CAN~\cite{MATCNN}& 7.50 & 111.76 & 2.97 \\
        TS-DAN~\cite{DualAN}& 7.50 & 111.91 & 2.83 \\
        PhysNet~\cite{physnet} & 0.73 & 65.19 & 2.31 \\
        PhysFormer~\cite{physformer} & 7.03 & 47.01 & 1.44 \\
        ACTNet~\cite{actnet}& 20.72 & 77.84 & 1.08 \\
        \ours{} & \textbf{0.20} & \textbf{15.01} & \textbf{0.24} \\ \bottomrule
    \end{tabular}%
    }
    \begin{tablenotes}
       \footnotesize
       \item[1] All baseline results, including those of TS-CAN, TS-DAN, PhysFormer, and ACTNet, are cited from ACTNet~\cite{actnet}.
    \end{tablenotes}
  }
\end{table}

The number of parameters (Param.) and multiply-accumulate operations (MACs) are shown in \tableref{tab:complexity}. The parameters of the proposed method consist of two parts: the models pre-trained via AdaBoost in the Viola--Jones algorithm and \ours{} itself, which account for 90K and 114K parameters, respectively. We omit the MACs of the Viola--Jones detector and the subsequent algorithms, as they are negligible compared to those of \ours{}, whose complexity is measured with \href{https://github.com/ultralytics/thop}{\texttt{thop}}. We observe that \ours{} achieves the state of the art in both complexity and performance, demonstrating the effectiveness and efficiency of the method.

\section{Detailed Comparison Between CNN- and Transformer-Based Encoders}
\label{app:CNNTRANSFORMER}

Recent work by~\citet{efficientphy} systematically compared CNN- and Transformer-based models for camera-based vital sign measurement. Their results show that Transformer-based models require substantial optimization and large-scale pretraining to outperform even relatively shallow CNNs, which is difficult to achieve in physiological video analysis due to limited data availability.

\begin{table}[ht]
\floatconts
    {tab:cnn_transformer_all_settings}
    {\caption{Comparison between 3D CNN and ViT encoders trained on the whole COHFACE dataset under different testing conditions.}}
    {%
        \resizebox{0.85\textwidth}{!}{%
        \begin{tabular}{l l c c c}
        \toprule
        Test Set & Encoder
        & MAE (bpm) $\downarrow$
        & RMSE (bpm) $\downarrow$
        & PCC $\uparrow$ \\
        \midrule
        \multirow{2}{*}{All}
        & 3D CNN         & \textbf{0.242} & \textbf{0.421} & \textbf{0.991} \\
        & ViT & 0.280 & 0.526 & 0.986 \\
        \midrule
        \multirow{2}{*}{Clean}
        & 3D CNN         & \textbf{0.223} & \textbf{0.404} & \textbf{0.993} \\
        & ViT & 0.258 & 0.459 & 0.991 \\
        \midrule
        \multirow{2}{*}{Natural}
        & 3D CNN         & \textbf{0.261} & \textbf{0.437} & \textbf{0.991} \\
        & ViT & 0.302 & 0.586 & 0.985 \\
        \bottomrule
        \end{tabular}%
        }
    }
\end{table}


\begin{table}[ht]
\floatconts
  {tab:w/otransformer}
  {\caption{Comparison between 3D CNN and ViT encoders under ablation settings on the COHFACE dataset, using models trained on the whole dataset.}}
  {%
    \resizebox{\textwidth}{!}{%
    \begin{tabular}{llccc}
    \toprule
    Test Set & Encoder
    & MAE (bpm) $\downarrow$ 
    & RMSE (bpm) $\downarrow$ 
    & PCC $\uparrow$ \\
    \midrule
    \multirow{2}{*}{All}
    & 3D CNN (\textit{w/o POI}) & \textbf{0.744} & \textbf{1.191} & \textbf{0.939} \\
    & ViT (\textit{w/o POI}) & 1.824 & 2.581 & 0.664 \\
    \midrule
    \multirow{2}{*}{Clean}
    & 3D CNN (\textit{w/o POI}) & \textbf{0.707} & \textbf{1.126} & \textbf{0.949} \\
    & ViT (\textit{w/o POI}) & 1.612 & 2.293 & 0.716 \\
    \midrule
    \multirow{2}{*}{Natural}
    & 3D CNN (\textit{w/o POI}) & \textbf{0.782} & \textbf{1.252} & \textbf{0.931} \\
    & ViT (\textit{w/o POI}) & 2.036 & 2.840 & 0.621 \\
    \bottomrule
    \end{tabular}%
    }
  }
\end{table}

Motivated by these observations, we conduct a controlled comparison by replacing the 3D CNN with a lightweight Vision Transformer (ViT; 0.91M parameters), and report the results in \tableref{tab:cnn_transformer_all_settings} and \tableref{tab:w/otransformer}. These results indicate that under relatively small data regimes, CNNs exhibit stronger capability in effectively supporting the overall model. The observed performance gains are primarily driven by the quality of the POI-based motion representation rather than the specific choice of image-based encoder. Once respiration-relevant motion trajectories are explicitly extracted, even lightweight image models achieve near-saturated performance. Although the CNN-based and Transformer-based models achieve similar accuracy when the POI stream is available (\tableref{tab:cnn_transformer_all_settings}), the CNN is more data-efficient and considerably more robust when the POI modality is removed (\tableref{tab:w/otransformer}), while using substantially fewer parameters.

\end{document}





