


%\CE{This diagram is a bit confusing: is $Z$ a random variable (cell state), but all other nodes are samples? So $Z \to z_i$ represents both the sampling process and the application of a treatment? What about the measurement process? I think these processes should all be separated for clarity. E.g., start with different cells $z_i, z_j \sim p_Z$, apply the same treatment $t$ to get treated cells $\tilde{z}_i, \tilde{z}_j$, and then finally use measurement devices $m^1, m^2$ to get observations in each modality $x^1_i = m^1(\tilde{z}_i)$ and $x^2_j = m^2(\tilde{z}_j)$. It looks like the SimCLR paper~(Figure 2) was the inspiration for this diagram, but in that setup you have the same sample $z_i$ to start with. Starting with the random variable $Z$ here seems confusing.}
%\jh{I didn't mind it because I'm more okay with writing down potential outcomes than Cian - we've argued about this many times before :) - but he's right that you need to make clear that in any given sample, you only see either $x_j^1$ or  $x_j^2$ but not both (unless you have specialized assays. )}

%\CE{Should we first have a ``Background: Unpaired contrastive learning'' section, and then a ``IntraPair InterCluster (IPIC)'' methods section with the final algorithm?} 

%\paul{this definition is just standard unpaired multimodal learning, nothing to do with bio?}
% \paul{also all these equations are hard to read, can we have some intuitive figures of the data distributions and contrastive objectives - what is paired/unpaired, where the clusters come from etc}

%\CE{this notation says we are given triplets $(x^1_i, x^2_j, t)$ \textit{at the sample level} with the same number of samples $N$ for each treatment. Better to say we're given two treatment-labeled datasets $\mathcal{D}_1 = \{(x^1_i, t_i)\}_{i=1}^{N_1}$ and $\mathcal{D}_2 = \{(x^2_j, t_j)\}_{j=1}^{N_2}$ with shared treatment labels $t \in T$?}





\section{Background}\label{sec:background}
We define our unpaired problem setup before reviewing prior paired contrastive methods and their limitations in this context.

% \subsection{Problem setup: Unpaired multimodal learning}
% Given two treatment-labeled datasets $\mathcal{D}_1 = \{(x^1_i, t_i)\}_{i=1}^{N_1}$ and $\mathcal{D}_2 = \{(x^2_j, t_j)\}_{j=1}^{N_2}$ sharing treatment labels $t \in T$ (Figure \ref{fig:dataset}), we construct a combined dataset $\mathcal{D} = \{(x^1_i, x^2_j, t)\}_{i, j=1}^{N}$. Unlike paired data, samples $x^1_i$ and $x^2_j$ are indirectly linked by $t$ rather than direct correspondence. As in Figure~\ref{fig:methodoverview}, our goal is to learn encoders $\mathcal{\phi}^1, \mathcal{\phi}^2$ that produce improved representations, measured by zero-shot treatment prediction and downstream task performance.

\subsection{Problem setup: Unpaired multimodal learning}
We are given two treatment-labeled datasets $\mathcal{D}_1 = \{(x^1_i, t_i)\}_{i=1}^{N_1}$ and $\mathcal{D}_2 = \{(x^2_j, t_j)\}_{j=1}^{N_2}$, where $x^1_i \in X^1$ (e.g., images) and $x^2_j \in X^2$ (e.g., sequences or texts) are samples from different modalities. Each sample is associated with a treatment label $t \in T$ drawn from the set of possible treatments $T$ (see Figure~\ref{fig:dataset}).
$\mathcal{D}_1$ and $\mathcal{D}_2$ share the same set of treatment labels, allowing us to construct a combined dataset $\mathcal{D} = \{(x^1_i, x^2_j, t)\}_{i, j=1}^{N}$. However, samples $x^1_i$ and $x^2_j$ are not directly paired (i.e., they do not correspond to the same cell or natural scene), but are instead indirectly linked by their shared treatment label $t$. As in Figure~\ref{fig:methodoverview}, our goal is to learn encoders $\mathcal{\phi}^1, \mathcal{\phi}^2$ that produce improved representations, measured by zero-shot treatment prediction and downstream task performance.  %As in Figure~\ref{fig:methodoverview}, the objective of unpaired multimodal learning with treatment group labels is to jointly learn two encoders, $\mathcal{\phi}^1$ and $\mathcal{\phi}^2$ for each modality, that produce improved representations. Here, improved representations are defined by their zero-shot performance in distinguishing treatment labels and their effectiveness in downstream tasks.

\subsection{Contrastive learning for \textit{paired} data}
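Standard methods such as SimCLR~\citep{chen2020simple} and CLIP~\citep{radford2021learning} align paired samples $\{(x^1_i, x^2_i)\}_{i=1}^B$ by embedding them into vectors $\mathbf{v}^1_i, \mathbf{v}^2_i$ via encoders $\mathcal{\phi}^1, \mathcal{\phi}^2$ and projection heads $f^1, f^2$. The InfoNCE~\citep{gutmann2010noise, oord2018representation} loss maximizes the similarity $\sm(\cdot, \cdot)$ between true pairs while minimizing it for all other pairs in the batch, with $\tau > 0$ a temperature parameter: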
$\mathcal{L}^{i(1)}_{NCE} = - \log \frac{\exp (\sm(\mathbf{v}^1_i, \mathbf{v}^2_i)/\tau)}{\sum_{l=1}^B \mathbbm{1}_{[l \neq i]} \exp (\sm(\mathbf{v}^1_i, \mathbf{v}^2_l)/\tau)}$ \refstepcounter{equation}(\theequation)\label{eq:nce_loss}.
The total objective averages over both modalities: $\mathcal{L}_{NCE} = \frac{1}{2B} \sum_{i=1}^B (\mathcal{L}^{i(1)}_{NCE} + \mathcal{L}^{i(2)}_{NCE})$ \refstepcounter{equation}(\theequation)\label{eq:nce}.\footnote{For simplicity, subsequent loss definitions are stated only for the $i$th example of modality 1; the batch objective is the average over all batch samples from both modalities.}
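In our unpaired setting, where batches $\{(x^1_i, x^2_j)\}_{i,j=1}^B$ share only treatment labels ($t_i = t_j$), standard InfoNCE would pull together randomly matched samples from the same treatment group while pushing apart all other pairs, potentially introducing noise because such pairs may be only weakly correlated.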

Weak supervision from treatment labels enables variants such as Supervised Contrastive Learning~\citep[SupCon;][]{khosla2020supervised}, which aligns all pairs within a treatment group:
$\mathcal{L}^{i(1)}_{SupCon} = - \sum_{j=1}^B \mathbbm{1}_{[t_i = t_j]} \log \frac{\exp (\sm(\mathbf{v}^1_i, \mathbf{v}^2_j)/\tau)}{\sum_{l=1}^B \mathbbm{1}_{[l \neq j]} \exp (\sm(\mathbf{v}^1_i, \mathbf{v}^2_l)/\tau)}$ \refstepcounter{equation}(\theequation)\label{eq:supcon}.
InfoCore~\citep{wang2024removing} combines these: $\mathcal{L}_{InfoCore} = \frac{1}{2}\mathcal{L}_{SupCon} + \frac{1}{2}\mathcal{L}_{NCE}$ \refstepcounter{equation}(\theequation)\label{eq:inforcore}.

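Alternatively, XDC~\citep{alwassel2020self} and WCL~\citep{zheng2021weakly} replace predefined labels with weak labels $c^1, c^2$ obtained by clustering the representations $\mathbf{v}^1, \mathbf{v}^2$, and use them as supervisory signals across modalities: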
$\mathcal{L}^{i(1)}_{WCL} = - \sum_{j=1}^B \mathbbm{1}_{[c^2_i = c^2_j]} \log \frac{\exp (\sm(\mathbf{v}^1_i, \mathbf{v}^2_j)/\tau)}{\sum_{l=1}^B \mathbbm{1}_{[l \neq j]} \exp (\sm(\mathbf{v}^1_i, \mathbf{v}^2_l)/\tau)}$ \refstepcounter{equation}(\theequation)\label{eq:wcl}.
However, none of these methods effectively handles the unpaired biological setting: even SupCon and InfoCore, which pull together samples from the same treatment group, lack a mechanism to distinguish individual instances within that group.

\subsection{Learning from \textit{unpaired} data}
Unpaired learning often employs cross-modal translation~\citep{nakada2023understanding, park2020contrastive, sturma2024unpaired}, cycle consistency~\citep{zhu2017unpaired,almahairi2018augmented,amodio2018magan, tsai2022learning}, Optimal Transport (OT)~\citep{demetci2022scot,ryu2024cross}, or affinity-based alignment of separately learned latent subspaces~\citep{gao2020ucmh,kriebel2022uinmf}. However, the inherent noise, sparsity, and non-linearity of biological data limit cycle consistency, and OT is both sensitive to noise and reliant on metric assumptions that often fail in this domain. We focus specifically on adapting representation learning to these biological constraints.



% \section{Background}\label{sec:background}
% We start by defining the problem setup for unpaired multimodal learning, and then we introduce prior contrastive-learning methods for paired data. We end by highlighting the limitations of these existing paired methods for our (unpaired) setting of interest.

% % dataset and problem and goal and mesaure of success
% \subsection{Problem setup: Unpaired multimodal learning}
% Given two treatment-labeled datasets $\mathcal{D}_1 = \{(x^1_i, t_i)\}_{i=1}^{N_1}$ and $\mathcal{D}_2 = \{(x^2_j, t_j)\}_{j=1}^{N_2}$, where $x^1_i \in X^1$ (e.g., images) and $x^2_j \in X^2$ (e.g., sequences or texts) are samples from different modalities, each sample is associated with a treatment label $t \in T$ drawn from the set of possible treatments $T$ (see Figure \ref{fig:dataset}). 
% $\mathcal{D}_1$ and $\mathcal{D}_2$ share the same set of treatment labels, allowing us to construct a combined dataset $\mathcal{D} = \{(x^1_i, x^2_j, t)\}_{i, j=1}^{N}$. However, samples $x^1_i$ and $x^2_j$ are not directly paired (i.e., they do not correspond to the same cell or natural scene), but are instead indirectly linked by their shared treatment label $t$. As depicted in Figure~\ref{fig:methodoverview}, the objective of unpaired multimodal learning with treatment group labels is to jointly learn two encoders, $\mathcal{\phi}^1$ and $\mathcal{\phi}^2$ for each modality, that produce improved representations. Here, improved representations are defined by their zero-shot performance in distinguishing treatment labels and their effectiveness in downstream tasks.


% % infonce for paired dataset
% \subsection{Contrastive learning for \textit{paired} data}
% Most prior contrastive methods assume paired datasets and use variations of the noise contrastive estimation (NCE) objective to differentiate instance pairs. The NCE objective treats two modalities of the same instance as positive pairs, while treating different instances as negative pairs, adjusting the latent space by bringing positive pairs closer together and pushing apart negative pairs. 
% Following the SimCLR~\citep{chen2020simple} and CLIP~\citep{radford2021learning} frameworks, given a batch of samples $\{(x^1_i, x^2_i)\}_{i=1}^B$, we embed them using encoders $\mathcal{\phi}^1, \mathcal{\phi}^2$ and non-linear projection heads $f^1, f^2$ to produce hidden vectors $\{(\mathbf{v}^1, \mathbf{v}^2)_i\}_{i=1}^B$ where $\{\mathbf{v}^\smblksquare \} = f^\smblksquare (\mathcal{\phi}^\smblksquare  (\{x^\smblksquare \}))$. For a paired batch indexed by $i$, the InfoNCE~\citep{gutmann2010noise, oord2018representation} objective for a sample in modality 1 is defined as:
% $\mathcal{L}^{i(1)}_{NCE} = - \log \frac{\exp (\sm(\mathbf{v}^1_i, \mathbf{v}^2_i)/\tau)}{\sum_{l=1}^B \mathbbm{1}_{[l \neq i]} \exp (\sm(\mathbf{v}^1_i, \mathbf{v}^2_l)/\tau)}$
% \refstepcounter{equation}(\theequation)\label{eq:nce_loss}. 
% The complete objective becomes: $\mathcal{L}_{NCE} = \frac{1}{2B} \sum_{i}^B (\mathcal{L}^{i(1)}_{NCE} + \mathcal{L}^{i(2)}_{NCE}) 
%     \footnote{For simplicity, we will refer only to the $i$th example of modality 1 in the subsequent loss definitions, with the objective for the entire batch calculated as the average over all batch samples from both modality 1 and modality 2.}$ \refstepcounter{equation}(\theequation)\label{eq:nce}.
% This objective aims to maximize the similarity $\sm(\cdot, \cdot)$ between positive pairs while minimizing it for negative ones, with $\tau >0$ serving as a temperature parameter to control the sharpness of the distribution.

% In the unpaired setting, a batch $\{(x^1_i, x^2_j)\}_{i,j=1}^B$ consists of samples from different modalities linked only by their shared treatment label ($t_i = t_j$), resulting in hidden vectors $\{(\mathbf{v}^1_i, \mathbf{v}^2_j)\}_{i, j=1}^B$. Directly applying the InfoNCE objective here would simply encourage randomly paired samples within the same treatment group to move closer  while pushing apart all other pairs, potentially introducing noise due to weak or nonexistent correlations.

% % supcon for paired dataset, infocore for combining the two
% Since the treatment label in this context can provide weak supervision, it can thus be leveraged by supervised InfoNCE objective variants such as \textit{Supervised Contrastive Learning}~(SupCon~\cite{khosla2020supervised}) and InfoCore~\cite{wang2024removing}. Rather than focusing on pairing specific indices, the SupCon objective brings all pairs within the same treatment group closer together: 
% $\mathcal{L}^{i(1)}_{SupCon} = - \sum_{j}^B \log  \frac{\mathbbm{1}_{[t_i = t_j]} \cdot \exp (\sm(\mathbf{v}^1_i, \mathbf{v}^2_j)/\tau)}{\sum_{l=1}^B \mathbbm{1}_{[l \neq j]} \exp (\sm(\mathbf{v}^1_i, \mathbf{v}^2_l)/\tau)}$ \refstepcounter{equation}(\theequation)\label{eq:supcon}.

% The InfoCore~\cite{wang2024removing} objective is a combination of the NCE and SupCon~\cite{khosla2020supervised} objective, namely $\mathcal{L}_{InfoCore} = \frac{1}{2}\mathcal{L}_{SupCon} + \frac{1}{2}\mathcal{L}_{NCE}$ \refstepcounter{equation}(\theequation)\label{eq:inforcore}.

% % wcl and xdc
% Rather than relying on predefined class labels for each sample (for \textit{supervised} contrastive learning), methods like \textit{Cross-Modal Deep Clustering}~(XDC~\cite{alwassel2020self}) and \textit{Weakly-Supervised Contrastive Learning}~(WCL~\cite{zheng2021weakly}) generate weak labels ${c^1}$ and ${c^2}$ from the representation spaces ${\mathbf{v}^1}$ and ${\mathbf{v}^2}$. These weak labels are then utilized as supervisory signals across modalities. The WCL objective is
% $\mathcal{L}^{i(1)}_{WCL} = - \sum_{j}^B \log  \frac{\mathbbm{1}_{[c^2_i = c^2_j]} \cdot \exp (\sm(\mathbf{v}^1_i, \mathbf{v}^2_j)/\tau)}{\sum_{l=1}^B \mathbbm{1}_{[l \neq j]} \exp (\sm(\mathbf{v}^1_i, \mathbf{v}^2_l)/\tau)}$ \refstepcounter{equation}(\theequation)\label{eq:wcl}.

% \paragraph{Limitations.}\ 
% Despite the variations in the above methods and their extended definitions of positive pairs to \textit{additionally} leverage class labels, none of them can be directly applied to the \textit{unpaired} setting. 
% % The paired assumption in these methods always brings $\mathbf{v}^1_i$ and $\mathbf{v}^2_j$ closer in latent space when $i=j$.
% Even in methods like SupCon or InfoCore, which aim to bring samples within the same class (or treatment group) closer together, there is no mechanism to distinguish between individual samples within the same class (treatment group).

% \subsection{Learning from \textit{unpaired} data}
% Existing approaches for learning from unpaired data primarily involve cross-modal translation techniques including unsupervised and semi-supervised contrastive learning \citep{nakada2023understanding, park2020contrastive, sturma2024unpaired}, cycle consistency losses \citep{zhu2017unpaired,almahairi2018augmented,amodio2018magan, tsai2022learning}, Optimal Transport (OT) techniques \citep{demetci2022scot,ryu2024cross}, and approaches that first learn latent subspaces then constructs affinity matrices to align modalities \citep{gao2020ucmh,kriebel2022uinmf}. Despite their potential, these approaches face significant limitations in biology due to the inherent noise, sparsity, and complex non-linear relationships in the data. Cycle consistency methods often struggle under these conditions, while Optimal Transport techniques are not only sensitive to noise but also rely on metric assumptions that may not hold in practice. In this work, we focus on biological multimodal representation learning where the direct applicability of these method is limited without significant adaptations.



\begin{figure}[t] % [t] is usually better than [ht!] for single column
\centering
% REDUCED SPACING: Adjusted node distances below for compactness
\begin{tikzpicture}[every node/.style={}]
% Nodes
\node (Z) {$Z$};
% Horizontal distances reduced from 1.5cm to 1.2cm
\node[left= 1.2cm of Z] (zzi) {$\tilde{z}_i$};
\node[right= 1.2cm of Z] (zzj) {$\tilde{z}_j$};
% Vertical distances reduced from 1cm to 0.7cm
\node[below left=0.7cm and 1.2cm of Z] (xi) {$z_i$};
\node[below right=0.7cm and 1.2cm of Z] (xj) {$z_j$};
% Vertical distances reduced from 1cm to 0.7cm
\node[below=0.7cm of xi] (gxi) {$x^1_i$};
\node[below=0.7cm of xj] (fxj) {$x^2_j$};

% Arrows (no changes needed here)
\draw[-{Latex}] (Z) -- node[above] {}(zzi);
\draw[-{Latex}] (Z) -- node[above] {} (zzj);
\draw[-{Latex}] (zzi) -- node[left] {$t$} (xi);
\draw[-{Latex}] (zzj) -- node[right] {$t$} (xj);
\draw[-{Latex}] (xi) -- node[left] {$m^1$} (gxi);
\draw[-{Latex}] (xj) -- node[right] {$m^2$} (fxj);
\end{tikzpicture}
\vspace{-3mm} % Tighten space between figure and caption
\caption{\textbf{Data generating process for unpaired biological datasets.} $Z$ represents the original sample distribution (e.g., HUVEC cells~\citep{baudin2007protocol}). Two samples $\tilde{z}_i, \tilde{z}_j$ are drawn from this distribution and have the same treatment $t$ applied, resulting in treated samples $z_i = t(\tilde z_i)$ and $z_j=t(\tilde z_j)$. Finally, measurement devices $m^1,m^2$ are used to obtain per-modality observations $x^1_i=m^1(z_i)$ and $x^2_j=m^2(z_j)$, which share the same treatment $t$ but do not originate from the same sample.
}
\label{fig:dataset}
\vspace{-5mm} % Tighten space after caption against main text
\end{figure}


% % Text next to the branches
%\node[right=2.5cm of Z] (text1) {\parbox{10cm}{Original Distribution (e.g. Natural Scene~\cite{yosef2023irfl}, HUVEC Cells\cite{baudin2007protocol})}};
% \node[right=0.5cm of xj] (branchtext2) {\parbox{10cm}{Group of samples under treatment $t$ indexed by $i$ and $j$ \\
% (e.g. Natural scene under augmentation (t) that will be captured by image ($i$) or will be described by text ($j$), or \\
% Cells under treatment $t$ that will be shown as image ($i$) or will be measured as sequence ($j$) ) }};
% % Text below
% \node[right=0.5cm of fxj] (text3) {Observational Data Modality (e.g. Image ($x^1$), Text/Sequence ($x^2$)) };
% \includegraphics[width=\linewidth]{sec/figs/fig3v1.pdf}

% \begin{figure} %[ht!]
% \centering
% \begin{tikzpicture}[node distance=2cm, every node/.style={}]
% % Nodes
% \node (Z) {$Z$};
% \node[left=  1.5cm of Z] (zzi) {$\tilde{z}_i$};
% \node[right= 1.5cm of Z] (zzj) {$\tilde{z}_j$};
% \node[below left=1cm and 1.5cm of Z] (xi) {$z_i$};
% \node[below right=1cm and 1.5cm of Z] (xj) {$z_j$};
% \node[below=1cm of xi] (gxi) {$x^1_i$};
% \node[below=1cm of xj] (fxj) {$x^2_j$};
% % Arrows
% \draw[-{Latex}] (Z) -- node[above] {}(zzi);
% \draw[-{Latex}] (Z) -- node[above] {} (zzj);
% \draw[-{Latex}] (zzi) -- node[left] {$t$} (xi);
% \draw[-{Latex}] (zzj) -- node[right] {$t$} (xj);
% \draw[-{Latex}] (xi) -- node[left] {$m^1$} (gxi);
% \draw[-{Latex}] (xj) -- node[right] {$m^2$} (fxj);
% \end{tikzpicture}
% \caption{\textbf{Data generating process for unpaired biological datasets.} $Z$ represents the original sample distributions (e.g. HUVEC cells~\cite{baudin2007protocol}). Two samples $\tilde{z}_i, \tilde{z}_j$ are drawn from this distribution and have the same treatment $t$ applied, resulting in treated samples $z_i = t(\tilde z_i)$ and $z_j=t(\tilde z_j)$. Finally, measurement devices $m^1,m^2$ are used to get per-modality observations $x^1_i=m^1(z_i)$ and $x^2_j=m^2(z_j)$ sharing the same treatment $t$.
% }
% \label{fig:dataset}
% \end{figure}







% $ \mathcal{L}^{i(1)}_{NCE} = - \log \left( \exp (\sm(\mathbf{v}^1_i, \mathbf{v}^2_i)/\tau) \big/ \sum_{l=1}^B \mathbbm{1}_{[l \neq i]} \exp (\sm(\mathbf{v}^1_i, \mathbf{v}^2_l)/\tau) \right) $ 

% \begin{equation}
%     \mathcal{L}^{i(1)}_{NCE} = - \log \frac{\exp (\sm(\mathbf{v}^1_i, \mathbf{v}^2_i)/\tau)}{\sum_{l=1}^B \mathbbm{1}_{[l \neq i]} \exp (\sm(\mathbf{v}^1_i, \mathbf{v}^2_l)/\tau)}
% \end{equation}


% \vspace{-1.5em}
% \begin{equation} \label{eq:nce}
%     \mathcal{L}_{NCE} = \frac{1}{2B} \sum_{i}^B (\mathcal{L}^{i(1)}_{NCE} + \mathcal{L}^{i(2)}_{NCE}) 
%     \footnote{For simplicity, we will refer only to the $i$th example of modality 1 in the subsequent loss definitions, with the objective for the entire batch calculated as the average over all batch samples from both modality 1 and modality 2.}
% \end{equation}






% \begin{equation} \label{eq:supcon}
%     \mathcal{L}^{i(1)}_{SupCon} = - \sum_{j}^B \log  \frac{\mathbbm{1}_{[t_i = t_j]} \cdot \exp (\sm(\mathbf{v}^1_i, \mathbf{v}^2_j)/\tau)}{\sum_{l=1}^B \mathbbm{1}_{[l \neq j]} \exp (\sm(\mathbf{v}^1_i, \mathbf{v}^2_l)/\tau)}
% \end{equation}

% \begin{equation} \label{eq:inforcore}
% \mathcal{L}_{InfoCore} = \frac{1}{2}\mathcal{L}_{SupCon} + \frac{1}{2}\mathcal{L}_{NCE}.
% \end{equation}
% EN, I think this was too long and can be explained by text alone. 
% In the unpaired setting, modality 1 is indexed by $i$ and modality 2 by $j$, yielding hidden vectors $\{(\mathbf{v}^1_i, \mathbf{v}^2_j)\}_{i, j=1}^B$. Directly applying the NCE objective to the $i$th sample in modality 1 and the $j$th sample in modality 2 within this unpaired batch setup would become:
% \begin{align} 
%     \mathcal{L}^{i(1)}_{NCE} = - \log \frac{\mathbbm{1}_{[i = j]} \exp (\sm(\mathbf{v}^1_i, \mathbf{v}^2_j)/\tau)}{\sum_{l=1}^B \mathbbm{1}_{[l \neq j]} \exp (\sm(\mathbf{v}^1_i, \mathbf{v}^2_l)/\tau)}\\
%     \mathcal{L}^{j(2)}_{NCE} = - \log \frac{\mathbbm{1}_{[j = i]} \exp (\sm(\mathbf{v}^2_j, \mathbf{v}^1_i)/\tau)}{\sum_{l=1}^B \mathbbm{1}_{[l \neq i]} \exp (\sm(\mathbf{v}^2_j, \mathbf{v}^1_l)/\tau)}
% \end{align}



% \begin{equation} \label{eq:wcl}
%     \mathcal{L}^{i(1)}_{WCL} = - \sum_{j}^B \log  \frac{\mathbbm{1}_{[c^2_i = c^2_j]} \cdot \exp (\sm(\mathbf{v}^1_i, \mathbf{v}^2_j)/\tau)}{\sum_{l=1}^B \mathbbm{1}_{[l \neq j]} \exp (\sm(\mathbf{v}^1_i, \mathbf{v}^2_l)/\tau)}
% \end{equation}

