\section{Method}
%
\begin{figure}[t]
    \centering
    \includegraphics[width=\linewidth]{./figures/method-2.png}
    \captionsetup{font=small}
    \caption{ 
    \textbf{Overview of ADAPT.} 
    In each minibatch, ADAPT takes up to $M$ modalities, including video, audio, and biosignals, as input to produce a modality-agnostic representation for downstream tasks. It is trained in two steps. (i) \textit{Anchoring.} We align the representations of all modalities via contrastive learning to that of an \emph{anchor} modality, i.e., the strongest and richest modality; here the video. (ii) \textit{Fusion.} The encoders' features are concatenated and fed into the Masked Multimodal Transformer. When a modality is unavailable, the transformer masks its corresponding feature representations. The final representation (i.e., the [CLS] token output) is used for downstream tasks.
    }
    \label{fig:method}
\end{figure}
%
\noindent This study addresses the detection of physiological changes using multimodal data, including video, audio, and biomedical signals. 
Real-world scenarios often involve missing modalities, motivating our goal to develop a modality-agnostic representation with broad applicability and to propose ADAPT -- \textbf{A}nchore\textbf{D} multimod\textbf{A}l \textbf{P}hysiological \textbf{T}ransformer. An overview of ADAPT is presented in Figure~\ref{fig:method}.  
%
\noindent \textbf{Notations.} Let $\mathcal{D} = \{(x_m^i)_{m=1}^M, y^i\}_{i=1}^N$ denote our training dataset, with $M$ modalities and $N$ labeled observations and $x^i = (x_m^i)_{m=1}^M$ the $i$-th observation (i.e., a family of $M$ modality values) with $y^i \in \mathcal{Y} = \{0, \dots, J\}$ its corresponding label (i.e., a physiological state).
Given this input, we seek to train a neural network $\mathcal{F}$ that maps any observation, possibly with missing modalities, to a target label $y \in \mathcal{Y}$.
%%
\subsection{Anchoring modality-specific encoders}
We train modality-specific encoders with a contrastive learning objective to align their representations to that of the \emph{anchor}. In this work, the anchor is the video, as it can capture visually distinguishable physiological changes; however, any modality can be the anchor.

\paragraph{Modality-specific encoders.} Each modality is encoded using a dedicated encoder. For \textit{video}, we use the pre-trained Hiera~\cite{hiera} encoder. For \textit{audio}, each sample is encoded into a mel-spectrogram (a 2D acoustic time-frequency representation of sound), fed to BYOL-A~\cite{niizumi2021byol} to obtain a 1D feature. \textit{Biomedical signals} are processed using 1D CNNs~\cite{wang2023contrast, ismail2019deep}. We add a modality-specific linear projection head to each encoder to obtain a fixed-size $d$-dimensional embedding. 
ADAPT can be extended to other modalities by adding their respective encoders.

\paragraph{Anchoring.} We consider a pair of modalities with aligned observations $(\mathcal{A}, \mathcal{M}_m)$, where $\mathcal{A}$ represents the anchor (video) and $\mathcal{M}_m$ another modality. The anchor video $x^i_a$ and its corresponding observation $x^i_m$ are encoded using $z_a^i {=} E_a(x^i_a)$ and $z^i_m {=} E_m(x^i_m)$, respectively, where $E_a$ is a pre-trained and frozen video encoder and $E_m$ a DNN. Projection heads map the embeddings to $f_a^i, f_m^i {\in} \mathbb{R}^d$. The loss is computed on $f_a^i$ and $f_m^i$~\cite{girdhar2023imagebind}:
\begin{equation} 
    \label{eq:contrastive_loss}
    \mathcal{L}_{\mathcal{A},\mathcal{M}_m} = -\sum_{i=1}^B \log \frac{\exp(\cos(f^i_a ,f^i_{m})/\tau)}{\sum_{k=1}^{B}\exp(\cos(f^i_a,f^k_m)/\tau)} \quad ,
\end{equation}
where $\tau \in \mathbb{R}^{+}$ is a temperature parameter, $\cos(\cdot,\cdot)$ the cosine similarity, and $B$ the batch size. In practice, we use a symmetric loss: $\mathcal{L}_{\mathcal{A}, \mathcal{M}_m} + \mathcal{L}_{\mathcal{M}_m, \mathcal{A}}$. To alleviate the modality gap~\cite{liang2022mind}, we add Gaussian noise to the modality $m$ representation~\cite{gu2023can}. We use a cosine schedule for the temperature parameter~\cite{kukleva2023temperature}. Given $M$ modalities, we define the anchoring loss as $\mathcal{L}_{\text{anchoring}} = \sum_{m=1,\mathcal{M}_m \neq \mathcal{A}}^{M} (\mathcal{L}_{\mathcal{A},\mathcal{M}_m} + \mathcal{L}_{\mathcal{M}_m,\mathcal{A}})$. 

%%%%%% Masked Multimodal Transformer %%%%%%
\subsection{Masked Multimodal Transformer}
To effectively build modality-agnostic representations, we use the transformer~\cite{vaswani2017attention} with $N_L$ attention blocks. 
For each sample, we stack the modality-specific representations, $f_m^i \in \mathbb{R}^d, \forall m \in[1,M]$, into a single matrix and prepend a special token [CLS], yielding a matrix $F \in \mathbb{R}^{(M+1) \times d}$. Similarly to \citet{liu2022funnynet,nagrani2021attention}, the query, key and value are derived from $F$ via: $Q=FW^Q$, $K=FW^K$ and $V=FW^V$ where $Q,K \in \mathbb{R}^{(M+1) \times d_k}$ and $V\in \mathbb{R}^{(M+1) \times d_v}$. Our modeling of inter-modal interactions differs from the usual cross-attention~\cite{chen2021crossvit,jaegle2021perceiver}, which asymmetrically combines two separate embedding sequences of the same dimension. Using stacked features $F$ allows generalization
to any number of modalities, with linear rather than quadratic scalability in the number of modalities.
%%%%%% Handling Missing Modalities %%%%%%

\paragraph{Handling missing modalities.} Inspired by~\citet{milecki2022contrastive} for missing follow-up patient examinations, we handle missing modalities directly in the scaled dot-product attention at the core of each multi-head self-attention sub-layer. We consider one sub-layer with one head ($h=1$) for simplicity. 
We use a binary masking matrix $Z \in \{0,1\}^{(M+1) \times (M+1)}$ that specifies which modalities are missing: $z_{ij} = 1$ if tokens $i$ and $j$ are both available, else $z_{ij} = 0$. The output $O \in \mathbb{R}^{(M+1) \times d_v}$ of the attention mechanism is, for each row $O_i$ of $O$:
%
\begin{equation}
\label{eq:attention}
    O_i = \sum_{j} z_{ij}\frac{
        \exp (Q_i^TK_j / \sqrt{d_k})
    }{ \sum_{\{j', z_{ij'}  = 1\}}\exp (Q_i^TK_{j'} / \sqrt{d_k})}V_j \quad .
\end{equation}
When $h > 1$, queries, keys, and values are linearly projected $h$ times with different, learned linear projections; the $h$ scaled dot-product attention outputs are then concatenated and projected once again. 
%
%%%%%% Modality Dropout %%%%%%

\paragraph{Modality dropout.}  We train the Masked Multimodal Transformer with a multi-view contrastive objective~\cite{chen2020simple}. Drawing inspiration from~\citet{shi2022learning}, we mitigate the model's over-reliance on a single modality while enhancing its robustness in the absence of modalities through an augmentation technique called \textit{modality dropout}. We leverage the masking scheme at the attention level to randomly mask input modalities.  Given a batch $\mathcal{Z}$, we create two simultaneous views $\mathcal{Z}'$ and $\mathcal{Z}''$. For each observation within $\mathcal{Z}'$, we hide up to $M - 1$ modalities chosen uniformly at random, where $M$ is the number of modalities. Additionally, motivated by~\citet{han2020deep}, who recently showed the effect of additive noise on electrocardiograms, we add $\epsilon \sim \mathcal{N}(0, \sigma)$ on the biomedical signals to each view independently. We chose $\sigma$ based on the amplitude of the signal.
We use the InfoNCE loss to enforce the similarity between the two views~\cite{chen2020simple} on the final representation output given by the [CLS] token. Since the two representations are already mapped to the same dimension, following~\citet{jing2022understanding}, we directly optimize the representations to enforce scalability and mitigate dimension collapse: $\mathcal{L}_{\text{fusion}} =  -\sum_{i=1}^B \log \frac{\exp(\cos(\text{CLS}^i, \text{CLS}^{i'})/\tau)}{\sum_{k=1}^{B}\exp(\cos(\text{CLS}^i, \text{CLS}^{k'})/\tau)}$, where $\text{CLS}^i$ and $\text{CLS}^{i'}$ denote the [CLS] outputs of the $i$-th observation in the two views.