\clearpage
% \setcounter{page}{1}
% \maketitlesupplementary
\appendix

% TODO merging supplement into main
%(identical mapping, Gaussian Blur, Gray Scale, Channel Shuffle, CLAHE, adding Gaussian Noise and Grid Distortion)


\begin{figure}
    \centering
    %\rule{\linewidth}{0.3\textheight}
    \includegraphics[width=0.7\textwidth]{sec/figs/fig1v5.png}
    %\includegraphics[width=0.5\linewidth]{}
    \caption{\textbf{Paired image-text data vs.\ unpaired biological data.} Unlike natural image-text pairs, biological data collection is destructive, resulting in unpaired modalities. Here, phenomics (cell imaging) and transcriptomics (gene sequencing) data cannot be collected from the same sample, leading to unpaired datasets.
    %\CE{Perhaps we can: (1) show pairs as ([Image], [Caption]); (2) write short \textit{captions} (``an orange cat looking confused'') rather than class labels (``cat'', since just a class label is actually a weak pairing); and (3) show weak pairing of biological data (by treatment)?}
    }
    \label{fig:unpairbio}
    \vspace{-10pt}
\end{figure}

\section{Detailed Dataset Description}
\label{apx:dataset}
\begin{figure*}[t]
    \centering
    % \rule{\textwidth}{0.3\textheight} % Placeholder box with width and height
    \includegraphics[width=\textwidth]{sec/figs/suppfig1.png}
    \caption{BIRFL Treatments and Sample illustration.
    }
    \label{sfig:brifl}
\end{figure*}

In this section, we provide a comprehensive description and visualizations of the four curated datasets used in our study.
\subsection{Bio-Augmented IRFL Dataset (BIRFL)}

% \begin{table}[ht]
% \centering
% \caption{Augmentations (Treatments) used for the BIRFL dataset.}
% \label{tab:augmentations}
% \resizebox{\columnwidth}{!}{%
% \begin{tabular}{|l|}
% \hline
% \textbf{BIRFL Treatments} \\ \hline
% Identical Mapping \\ \hline
% Gaussian Blur \\ \hline
% Gray Scale \\ \hline
% Channel Shuffle \\ \hline
% CLAHE (Contrast Limited Adaptive Histogram Equalization) \\ \hline
% Adding Gaussian Noise \\ \hline
% Grid Distortion \\ \hline
% \end{tabular}%
% }
% \end{table}



To assess whether shuffling within treatment groups impacts embedding or representation performance on downstream tasks, we first tested this hypothesis using a dataset with true paired information. Specifically, we extended the Image Recognition of Figurative Language (IRFL) dataset~\cite{yosef2023irfl}, which consists of 786 paired social media images and their corresponding captions. Each pair is labeled with a figurative type: \textbf{\textit{simile}}, \textbf{\textit{metaphor}}, or \textbf{\textit{idiom}}.

To create treatment groups $T$, we applied seven augmentation methods to the images: \textbf{\textit{Identical Mapping}}, \textbf{\textit{Gaussian Blur}}, \textbf{\textit{Gray Scale Conversion}}, \textbf{\textit{Channel Shuffle}}, \textbf{\textit{CLAHE (Contrast Limited Adaptive Histogram Equalization)}}, \textbf{\textit{Gaussian Noise Addition}}, and \textbf{\textit{Grid Distortion}}. Each augmentation was paired with a descriptive modification to the corresponding caption. For example, if an image was augmented with \textbf{\textit{Gaussian Noise Addition}}, the sentence \textit{``this image has gaussian noise added.''} was appended to the associated caption.

To further increase the variability of the text augmentation, we generated 10 additional rephrased versions of each descriptive sentence for every augmentation method. For instance, the original description (item~0) and its ten rephrased versions (items~1--10) for Gaussian noise were:
\begin{enumerate}
    \setcounter{enumi}{-1}
    \item   \textit{``this image has gaussian noise added.''}
    \item    \textit{``gaussian noise has been added to this image.''}
    \item    \textit{``this picture includes gaussian noise.''}
    \item    \textit{``the image has been modified with gaussian noise.''}
    \item    \textit{``gaussian noise has been introduced to this image.''}
    \item    \textit{``this image now features added gaussian noise.''}
    \item    \textit{``the picture has undergone the addition of gaussian noise.''}
    \item    \textit{``gaussian noise has been incorporated into this image.''}
    \item    \textit{``the image contains added gaussian noise.''}
    \item     \textit{``gaussian noise has been applied to this image.''}
    \item    \textit{``this image has been augmented with gaussian noise.''}
\end{enumerate}

For each augmented image, one rephrased sentence was randomly selected and appended to the corresponding caption. This process ensured that the paired data remained consistent across modalities while introducing diverse treatment effects in both visual and textual representations. A visualization of these augmentations is provided in Figure~\ref{sfig:brifl}.

To emulate the complexity of biological data generation, we introduced additional variability to the augmentations for each image-text pair. Within each treatment group, originally comprising 786 image-text pairs, we applied the following modifications to subsets of the data to simulate diverse treatment outcome effects:

\begin{itemize}
    \item \textbf{No Treatment Effect:} For 10\% of the samples, we applied no augmentation to mimic a control or no-treatment scenario.
    \item \textbf{Same Treatment Effect for Different Treatments:} Another 10\% of the samples were randomly assigned a treatment augmentation other than the intended one, simulating overlap in treatment effects.
    \item \textbf{Mixed/Combined Treatment Effect:} For 10\% of the samples, we applied a combination of the correct augmentation and an additional randomly selected augmentation to mimic the effect of mixed or combined treatments, reflecting the complexity observed in biological datasets.
\end{itemize}

This bio-inspired augmentation process resulted in the creation of seven treatment groups derived from the original 786 image-text pairs, culminating in our Bio-Augmented IRFL dataset, which contains a total of 5,502 unique image ($X^1$) and text caption ($X^2$) pairs.

To facilitate experiments for comparing various contrastive learning objectives, we embedded each image and text caption into vector spaces using state-of-the-art models. Images were embedded using the Vision Transformer (ViT-L-16) model~\cite{dosovitskiy2020image} (pretrained encoder $\mathcal{F}_1$), resulting in vectors of dimension 768. Text captions were embedded using the SFR-2 model~\cite{lee2024nv} ($\mathcal{F}_2$), resulting in vectors of dimension 4,096.

The paired structure of the BIRFL dataset allows us to rigorously evaluate contrastive learning methods for paired data as well as their performance with unpaired data using only the weak labels associated with treatments.


\subsection{Spatial Transcriptomics for Pancreatic Cancer Tumor Biopsy (ST)}
% \begin{table}[ht]
% \centering
% \caption{Treatments selected for the ST dataset.}
% \label{tab:staug}
% % \resizebox{\columnwidth}{!}{%
% \begin{tabular}{|l|}
% \hline
% \textbf{ST Treatments} \\ \hline
% FOLFIRINOX \\ \hline
% Naive \\ \hline
% Chemo-RT \\ \hline
% Mixed \\ \hline
% \end{tabular}%
% % }
% \end{table}

\begin{figure*}[t]
    \centering
    \includegraphics[width=\textwidth]{sec/figs/suppfig2.png}
    %  \begin{tikzpicture}
    %     \fill[black] (0,0) rectangle (8,6);
    %     \node[white] at (4,3) {\Huge \(t\)-SNE Visualization};
    % \end{tikzpicture}
    \caption{\textbf{Patient Biopsy Spatial Transcriptomics Data.} Our ST dataset consists of spatial transcriptomics data from patients diagnosed with pancreatic cancer. Each small tissue sample is scanned using microscopy and sequencing machines, resulting in histopathology images with various sites marked as red dots. Each site represents a sample in our ST dataset, where we obtain both the RNA-sequence readout and the corresponding image patch (right).
}
    \label{fig:st}
\end{figure*}


% Our second curated dataset is a biological dataset of spatial transcriptomics (ST) data derived from biopsies of patients diagnosed with Pancreatic Ductal Adenocarcinoma (PDAC), as released by~\citet{cui2022spatially}. PDAC is a lethal disease with limited treatment options and poor survival rates. The released dataset encompasses samples from 31 patients, of which only 15 spatial transcriptomics (ST) samples are publicly available. Each ST sample is scanned using 10x Visium technology and is represented by a histopathology slide alongside a gene-expression matrix. In this matrix, each row corresponds to RNA expression data indexed by the spatial location of the histopathology slide (A visualization of the data samples is shown in Figure~\ref{fig:st}). 

Our second curated dataset is a biological dataset of spatial transcriptomics (ST) data derived from biopsies of patients diagnosed with Pancreatic Ductal Adenocarcinoma (PDAC), as released by~\citet{cui2022spatially}. PDAC is a lethal disease with limited treatment options and poor survival rates. The released dataset includes samples from 31 patients, of which only 15 spatial transcriptomics (ST) samples are publicly available. 
Each ST sample is scanned using 10x Visium technology and is represented by a histopathology slide along with a gene-expression matrix. In this matrix, each row corresponds to RNA expression data indexed by the spatial location of the histopathology slide. A visualization of the data samples is shown in Figure~\ref{fig:st}.


Thus, ST data forms a naturally paired biological dataset, where each spatial location on the histopathology slide has a corresponding pathology image patch ($X^1$) and an RNA-seq readout ($X^2$) from the same tissue region. Since biopsies do not consist solely of tumor cells, standard practice involves experienced pathologists annotating histopathology slides, identifying different regions and cell types. For simplicity, we define our downstream task as a binary classification problem: predicting whether a site corresponds to a pathologist-labeled tumor region ($Y$).
Since each slide represents a biopsy from a single patient, we define treatments ($T$) as one of four options received by the patient before the biopsy was taken: \textit{\textbf{FOLFIRINOX}}, \textit{\textbf{Naive}}, \textit{\textbf{Chemo-RT}}, or \textit{\textbf{Mixed}}.

In this dataset, each spatial site with an image-expression pair is treated as a sample, resulting in 57,363 image-expression pairs from the 15 ST samples. For each image patch, we use the pathology foundation model UNI~\cite{chen2024uni} (pretrained encoder $\mathcal{F}_1$) to embed the image into a 1024-dimensional vector. Similarly, we train an scVI (Single-cell Variational Inference) model~\cite{lopez2018deep} ($\mathcal{F}_2$) on all 57,363 RNA expression profiles, embedding each into a 1024-dimensional vector.

Although the ST dataset is inherently paired, it is constrained by the limited number of treatment groups and downstream labels. Additionally, obtaining ST datasets remains challenging and expensive, often confined to laboratory environments. This limits broader adoption within the biological community, especially for single-cell level experiments.


%\subsection{Compound Treated Single-cell Dataset (COMP)} 

% We also curated two unpaired biological datasets that are on the single-cell level for us to better understand the multimodal learning for biolgical datasets. The first dataset are compound-treated single-cell datasets. 
% We have two separate biological experiments where both experiments studied the same cells $Z$ as human umbilical vein endothelial cells (HUVEC)~\cite{baudin2007protocol} and they also share the same set of treatment $T$ which in this case FDA-approved bioactive small-molecule compounds like the ones used by \citet{fay2023rxrx3}.

% The first experiment put HUVEC cells into culture plates where each plate have various wells. For each well, HUVEC cells are being treated with different compound with different concentration levels. The treated cells are then being stained with cell-painting protocols~\cite{baudin2007protocol} into 6-channel fluorescent microscopy images ($X^1$).
% The second experiment also put HUVEC cells into culture plates where each plate have various wells. For each well, HUVEC cells are again being treated with different compound. Then treated cells are then being sequenced using bulk RNA sequencing techniques~\cite{ye2018drug} ($X^2$).

% We then post-process each modality separately. Each cell-painting images are being cropped into $32 \times 32$ single-cell image crops and embedded into an 786-dimensional vector. We train a scVI model on all the sequences and each sequence is embedded into a 256-dimensional vector. We selected 78 small molecule compounds as our treatment $T$ (A sublist shown in Table~\ref{tab:chemical_structures}). We choose the concentration levels 1, 2.5, 10 associated with each sample as our downstream tasks labels $Y$. We randomly pair each sequence with around four image crops within the same treatment groups, generated 36562 image-sequence pairs each with treatment group label $t$ and shared constration level $y$ for our COMP dataset.


\subsection{Compound-Treated Single-Cell Dataset (COMP)}
\begin{table}[t]
\centering
\caption{A sublist of chemical structures in SMILES notation for the COMP dataset.}
\label{tab:chemical_structures}
\resizebox{0.7\columnwidth}{!}{%
\begin{tabular}{|l|}
\hline
\textbf{SMILES Notation} \\ \hline
CC(C)(C\#N)c1ccc(N2C(=O)OCc3cnc4ccc(-c5cnc6ccccc6c5)cc4c32)cc1 \\ \hline
CC(C)(C)c1ccc(C(=O)NCCC(=O)Nc2ccc3[nH]ncc3c2)cc1 \\ \hline
CC(C)(O)COc1ccc(S(=O)(=O)Nc2cccc3c(C\#N)c[nH]c23)cc1 \\ \hline
CC(C)C1CCc2[nH]c(=O)c(C\#N)cc2C1 \\ \hline
CC(C)Oc1ccc(-c2nc(-c3cccc4c3CCC4NCCC(=O)O)no2)cc1C\#N \\ \hline
CC(C)c1nn(C)c2nc(C(F)F)nc(NCc3cccc(Oc4ccccn4)c3)c12 \\ \hline
CC(C)n1ncc2nccc(OCc3ccc(Br)cc3)c21 \\ \hline
CC1CCC(n2c3cnccc3c3cnc(Nc4ccc5c(n4)CCN(C(=O)CO)C5)nc32)CC1 \\ \hline
CC1Sc2ccc(C(=O)Nc3ccc(-c4cn5ccccc5n4)cc3)cc2NC1=O \\ \hline
CCCn1cnc2ncc(Nc3ccc4c(c3)CCC4)nc21 \\ \hline
CCOC(=O)c1c(C)nc2c(c1-c1ccc(Cl)cc1)C(=O)CC(C)(C)C2 \\ \hline
CCc1nc(S(=O)(=O)Nc2ccc(C)c3c(C\#N)c[nH]c23)c[nH]1 \\ \hline
CCn1c(O)nc2cc(Cl)c(Cl)cc21 \\ \hline
CNC(=O)C(NC(=O)C(CC(C)C)C(O)C(=O)NO)C(C)(C)C \\ \hline
COc1cc(-c2cc3c(cn2)c(C2CC2)nn3C(C)C)ccn1 \\ \hline
COc1ccc(C(=O)Nc2ccc(-c3cn[nH]c3)c(C)c2)nc1 \\ \hline
COc1ccc(C2CNC(=O)C2)cc1OC1CCCC1 \\ \hline
COc1ccc(OC2CCN(c3cn[nH]c(=O)c3Cl)CC2)cc1 \\ \hline
COc1ccc2nc3c([N+](=O)[O-])ccc4c3c(c2c1)NN4CCCN(C)C \\ \hline
COc1nc(C)cnc1Nc1cc2c(cn1)c(C1CC1)nn2C(C)C \\ \hline
CS(=O)(=O)N1CCN(Cc2cc3nc(-c4cccc5[nH]ncc45)nc(N4CCOCC4)c3s2)CC1 \\ \hline
CS(=O)(=O)c1ccc(-c2nc(NCc3ccc4c(c3)OCO4)c3cn[nH]c3n2)cc1 \\ \hline
Cc1c(C(=O)Nc2ccccc2Cc2cccnc2)oc2ccc(Br)cc12 \\ \hline
... \\ \hline
\end{tabular}%
}
\end{table}


% Ensure \usepackage{booktabs} and \usepackage{array} are in your preamble



To better understand multimodal learning for biological datasets, we curated two unpaired biological datasets at the single-cell level. The first dataset, referred to as the compound-treated single-cell dataset (COMP), originates from two separate biological experiments that studied the same cell type, human umbilical vein endothelial cells (HUVEC)~\cite{baudin2007protocol}. Both experiments involved the same set of FDA-approved bioactive small-molecule compounds, similar to those used by \citet{fay2023rxrx3}, applied at varying concentration levels (1, 2.5, and 10~$\mu$M). In the first experiment, HUVEC cells were treated with compounds in multi-well plates and stained using the cell-painting protocol~\cite{bray2016cell}, generating 6-channel fluorescent microscopy images ($X^1$). In the second experiment, cells were treated identically, but their RNA expression profiles were measured using bulk RNA sequencing~\cite{ye2018drug} ($X^2$).

We post-processed each modality separately. The cell-painting images were cropped into $32 \times 32$ single-cell image crops, which were then embedded into 768-dimensional vectors using the \textit{Phenom-1} model~\cite{kraus2024masked} (pretrained encoder $\mathcal{F}_1$), a 300-million-parameter
pretrained foundation model designed for large-scale analysis of microscopy images. For the sequencing data, we trained an scVI model on all sequences and embedded each sequence into a 256-dimensional vector ($\mathcal{F}_2$). A total of 78 small-molecule compounds were selected as treatments ($T$), with a sublist of their SMILES representations shown in Table~\ref{tab:chemical_structures}.

To construct the COMP dataset, we randomly paired each sequence with approximately four image crops from the same treatment group, resulting in 36,562 image-sequence pairs. Each pair is annotated with a treatment group label ($t$) and a shared concentration level ($Y$); the latter serves as the downstream task label.

% Our first unpaired dataset includes experimental results from compound-treated human umbilical vein endothelial cells (HUVEC)~\cite{baudin2007protocol}. Samples consist of plated HUVEC cells treated with 78 FDA-approved bioactive small-molecule compounds ($T$), then either scanned to generate cell-painting images~\cite{bray2016cell} following methods similar to \citet{fay2023rxrx3}, or sequenced using bulk RNA sequencing techniques~\cite{ye2018drug} to obtain single-cell transcriptomics data. For each sample, we randomly paired a $32 \times 32$ single-cell image crop ($X^1$) with a single-cell RNA readout ($X^2$), ensuring both represent cells treated with the same compound. The downstream task label ($Y$) is the culture solution concentration level of the cells. We collected 36562 randomly paired image-sequence samples to create the COMP dataset.



\subsection{Gene Knockout Single-cell Dataset (GKO)}
% GKO
\begin{table}[ht]
\centering
\caption{List of Targeted Knockout Genes}
\label{tab:knockout_genes}
\begin{tabular}{|l|}
\hline
\textbf{Gene Name} \\ \hline
ATP6V1E1 \\ \hline
IL6R \\ \hline
CD33 \\ \hline
HPS1 \\ \hline
CD22 \\ \hline
ERGIC1 \\ \hline
CASP6 \\ \hline
ADORA2B \\ \hline
EIF2AK3 \\ \hline
CANX \\ \hline
ARF1 \\ \hline
ATP13A2 \\ \hline
BST1 \\ \hline
ATP1A3 \\ \hline
ABI3 \\ \hline
ALS2 \\ \hline
SYNJ1 \\ \hline
AKT1 \\ \hline
AKT2 \\ \hline
FOXRED1 \\ \hline
ACVRL1 \\ \hline
PSMA7 \\ \hline
ATP6V1F \\ \hline
EIF2B1 \\ \hline
\end{tabular}
\end{table}

\begin{figure*}[t]
    \centering
    \includegraphics[width=\textwidth]{sec/figs/suppfig3.png}
    %  \begin{tikzpicture}
    %     \fill[black] (0,0) rectangle (8,6);
    %     \node[white] at (4,3) {\Huge \(t\)-SNE Visualization};
    % \end{tikzpicture}
    \caption{\textbf{Cell-painting images.} Both the COMP and GKO datasets utilize fluorescent microscopy images (left) obtained from biological experiments. Each image includes the locations of cell nuclei, enabling us to crop the images into smaller, single-cell images (right) for downstream processing.}
    \label{fig:cells}
\end{figure*}

The second unpaired biological dataset we curated, referred to as the Gene Knockout Single-cell Dataset (GKO), follows a similar structure to the COMP dataset but focuses on CRISPR-mediated gene knockouts as treatments~\cite{bock2022high}. The sequencing modality employs single-cell RNA sequencing~\cite{dixit2016perturb} to capture gene expression profiles. A visualization of the cell images is shown in Figure~\ref{fig:cells}.

As with COMP, two separate experiments were conducted on the same HUVEC cell line. In the first experiment, HUVEC cells were treated with CRISPR-mediated knockouts targeting specific genes. After treatment, the cells were stained using the cell-painting protocol~\cite{bray2016cell}, producing 6-channel fluorescent microscopy images ($X^1$). In the second experiment, the same set of knockouts was applied, followed by single-cell RNA sequencing~\cite{dixit2016perturb} to generate RNA expression profiles ($X^2$).  

The image modality was post-processed into $32 \times 32$ single-cell image crops, which were then embedded into 1024-dimensional vectors using the Phenom-1 model~\cite{kraus2024masked} ($\mathcal{F}_1$). For the single-cell sequencing data, we trained an scVI model~\cite{lopez2018deep} ($\mathcal{F}_2$) on all sequences, embedding each sequence into a 128-dimensional vector. The treatment set $T$ included a negative control and 24 targeted genes (listed in Table~\ref{tab:knockout_genes}), for a total of $|T| = 25$ treatments, and the downstream task labels ($Y$) were derived by discretizing the total gene counts into 5 levels, reflecting biological variability in gene expression profiles across treatments.

To construct the GKO dataset, we randomly paired single-cell images with single-cell sequences within the same treatment groups, resulting in a total of 59,011 image-sequence pairs.




% Similar to the COMP dataset,this dataset involves HUVEC cells subjected to 25 distinct CRISPR-mediated gene knockouts ~\cite{bock2022high} ($T$). We selected 25 gene knockouts  applied to HUVEC cells.  Cells were processed through two modalities: either cell-painting imaging~\cite{bray2016cell} or single-cell transcriptomics using perturb-seq \cite{dixit2016perturb}. For the imaging modality, $32 \times 32$ random crops ($X^1$) were extracted from cell-painting scans, while for the sequencing modality, RNA profiles ($X^2$) were obtained from treated cells. Each sample pair shares the same gene knockout treatment, linking the two modalities indirectly. For the downstream task label ($Y$), we used total gene counts for each cell, categorized into five bins resulting in 59,011 couple of samples.

\begin{table}[t]
    \centering
    \caption{Dataset specifications. $N$: samples, $|T|$: treatments, $|Y|$: labels. Dimensions $|X^{1,2}|$ correspond to encoders $\mathcal{F}^{1,2}$.}
    \label{tab:dataset_specifications}
    \footnotesize % Shrink font slightly
    \setlength{\tabcolsep}{3.5pt} % Reduce space between columns to fit width
    \begin{tabular}{l c c c l l c c c}
        \toprule
        Dataset & $N$ & $|T|$ & $|Y|$ & $\mathcal{F}^1$ (Emb.) & $\mathcal{F}^2$ (Emb.) & $|X^1|$ & $|X^2|$ & Paired \\
        \midrule
        BIRFL & 5,502 & 7 & 3 & ViT\cite{dosovitskiy2020image} & SFR\cite{lee2024nv} & 768 & 4096 & \checkmark \\
        ST & 57,363 & 4 & 2 & UNI\cite{chen2024uni} & scVI\cite{lopez2018deep} & 1024 & 1024 & \checkmark \\
        COMP & 36,562 & 78 & 3 & Phenom1\cite{kraus2024masked} & scVI\cite{lopez2018deep} & 768 & 256 & \text{\sffamily X} \\
        GKO & 59,011 & 25 & 5 & Phenom1\cite{kraus2024masked} & scVI\cite{lopez2018deep} & 1024 & 128 & \text{\sffamily X} \\
        \bottomrule
    \end{tabular}
    \vspace{-3mm} % Optional: pulls text closer to bottom of table
\end{table}

\section{Experiment Details}
\label{apx:exp}

This section outlines the experimental setup, detailing the learning objectives for each method and the training details for our experiments.

\subsection{Contrastive Learning Objectives}
We provide the detailed learning objectives for each baseline method used in our experiments, as well as the objective introduced in our proposed IPIC method. 

In the unpaired setting, a batch $\{(x^1_i, x^2_j)\}_{i,j=1}^B$ consists of samples from different modalities linked only by their shared treatment label ($t_i = t_j$), resulting in hidden vectors $\{(\mathbf{v}^1_i, \mathbf{v}^2_j)\}_{i, j=1}^B$. Given an unpaired batch of hidden vectors, the NCE objective becomes
\begin{equation*}
    \mathcal{L}^{i(1)}_{NCE} = - \log \frac{ \mathbbm{1}_{i=j} \exp (\sm(\mathbf{v}^1_i, \mathbf{v}^2_j)/\tau)}{\sum_{l=1}^B \mathbbm{1}_{[l \neq i]} \exp (\sm(\mathbf{v}^1_i, \mathbf{v}^2_l)/\tau)}
\end{equation*}
\begin{equation*}
    \mathcal{L}^{j(2)}_{NCE} = - \log \frac{ \mathbbm{1}_{j=i} \exp (\sm(\mathbf{v}^2_j, \mathbf{v}^1_i)/\tau)}{\sum_{l=1}^B \mathbbm{1}_{[l \neq i]} \exp (\sm(\mathbf{v}^2_j, \mathbf{v}^1_l)/\tau)}
\end{equation*}
\begin{equation*} 
    \mathcal{L}_{NCE} = \frac{1}{2B} \sum_{i}^B (\mathcal{L}^{i(1)}_{NCE} + \mathcal{L}^{j(2)}_{NCE}) 
\end{equation*}

The full definition of the SupCon objective is:
\begin{equation*}
    \mathcal{L}^{i(1)}_{SupCon} = - \sum_{j}^B \log  \frac{\mathbbm{1}_{[t_i = t_j]} \cdot \exp (\sm(\mathbf{v}^1_i, \mathbf{v}^2_j)/\tau)}{\sum_{l=1}^B \mathbbm{1}_{[l \neq j]} \exp (\sm(\mathbf{v}^1_i, \mathbf{v}^2_l)/\tau)}
\end{equation*}
\begin{equation*}
    \mathcal{L}^{j(2)}_{SupCon} = - \sum_{i}^B \log  \frac{\mathbbm{1}_{[t_j= t_i]} \cdot \exp (\sm(\mathbf{v}^2_j, \mathbf{v}^1_i)/\tau)}{\sum_{l=1}^B \mathbbm{1}_{[l \neq i]} \exp (\sm(\mathbf{v}^2_j, \mathbf{v}^1_l)/\tau)}
\end{equation*}
\begin{equation*} 
    \mathcal{L}_{SupCon} = \frac{1}{2B} \sum_{i}^B (\mathcal{L}^{i(1)}_{SupCon} + \mathcal{L}^{j(2)}_{SupCon}) 
\end{equation*}
Following \citet{wang2024removing}, we define the InfoCore objective in our experiments as
\begin{equation*}
\mathcal{L}_{InfoCore} = \frac{1}{2}\mathcal{L}_{SupCon} + \frac{1}{2}\mathcal{L}_{NCE}.
\end{equation*}

For an unpaired batch, the objective of WCL~\cite{zheng2021weakly} becomes:
\begin{equation*} 
    \mathcal{L}^{i(1)}_{WCL} = - \sum_{j}^B \log  \frac{\mathbbm{1}_{[c^2_i = c^2_j]} \cdot \exp (\sm(\mathbf{v}^1_i, \mathbf{v}^2_j)/\tau)}{\sum_{l=1}^B \mathbbm{1}_{[l \neq j]} \exp (\sm(\mathbf{v}^1_i, \mathbf{v}^2_l)/\tau)}
\end{equation*}
\begin{equation*} 
    \mathcal{L}^{j(2)}_{WCL} = - \sum_{i}^B \log  \frac{\mathbbm{1}_{[c^1_j = c^1_i]} \cdot \exp (\sm(\mathbf{v}^2_j, \mathbf{v}^1_i)/\tau)}{\sum_{l=1}^B \mathbbm{1}_{[l \neq i]} \exp (\sm(\mathbf{v}^2_j, \mathbf{v}^1_l)/\tau)}
\end{equation*}
\begin{equation*} 
    \mathcal{L}_{WCL} = \frac{1}{2B} \sum_{i}^B (\mathcal{L}^{i(1)}_{WCL} + \mathcal{L}^{j(2)}_{WCL}) 
\end{equation*}
Here $c^\smblksquare_i$ and $c^\smblksquare_j$ denote the cluster labels of the $i$th and $j$th samples of modality $\smblksquare$. Thus, the modality-1 objective uses the cluster labels of modality 2 as targets, and vice versa for modality 2. We follow the implementation of \citet{zheng2021weakly}, using a connected-components algorithm to find the cluster labels.

For XDC~\cite{alwassel2020self}, for which no public implementation is available, we followed the description in the paper. Specifically, we used K-means as the clustering method. Instead of using an NCE-type objective, we directly employed the cross-modality cluster labels as classification targets and calculated the cross-entropy loss between $v^1$ and $c^2$ and between $v^2$ and $c^1$. For convenience, we set the number of clusters to the number of treatments in each dataset:
\begin{equation*}
\mathcal{L}_{XDC} = -\frac{1}{2B} \sum_{i=1}^{B} \sum_{t=1}^{|T|} c^2_{i,t} \log(v^1_{i,t}) - \frac{1}{2B} \sum_{j=1}^{B} \sum_{t=1}^{|T|} c^1_{j,t} \log(v^2_{j,t})
\end{equation*}

In all objectives above, $i$ indexes samples in modality 1, and $j$ corresponds to the $i$th sample in modality 2. The index $l$ is the summation index over the similarity terms in the denominator.

For our one-time matching objective, after applying Algorithm~\ref{alg:matching_repair}, the dataset is pseudo-paired using the matching matrices for each treatment group. The indexing of modality 2 is updated to be indexed by $k$. Thus, the batch of hidden vectors becomes $\{(\mathbf{v}^1_i, \mathbf{v}^2_k)\}_{i, k=1}^B$, where $t_i = t_k$. Our one-time matching objective is then defined as:
\begin{equation*}
    \mathcal{L}^{i(1)}_{1X-Matching} = - \log \frac{ \mathbbm{1}_{i=k} \exp (\sm(\mathbf{v}^1_i, \mathbf{v}^2_k)/\tau)}{\sum_{l=1}^B \mathbbm{1}_{[l \neq i]} \exp (\sm(\mathbf{v}^1_i, \mathbf{v}^2_l)/\tau)}
\end{equation*}
\begin{equation*}
    \mathcal{L}^{k(2)}_{1X-Matching} = - \log \frac{ \mathbbm{1}_{k=i} \exp (\sm(\mathbf{v}^2_k, \mathbf{v}^1_i)/\tau)}{\sum_{l=1}^B \mathbbm{1}_{[l \neq i]} \exp (\sm(\mathbf{v}^2_k, \mathbf{v}^1_l)/\tau)}
\end{equation*}
\begin{equation*} 
    \mathcal{L}_{1X-Matching} = \frac{1}{2B} \sum_{i}^B (\mathcal{L}^{i(1)}_{1X-Matching} + \mathcal{L}^{k(2)}_{1X-Matching}) 
\end{equation*}

As our $\mathcal{L}_{intra}$ and $\mathcal{L}_{inter}$ introduced in our IPIC method also compute objectives on pseudo-paired batches, the full objective of $\mathcal{L}_{intra}$ becomes:
\begin{equation*}
    \mathcal{L}^{i(1)}_{intra} = - \sum_{k}^B \log  \frac{\mathbbm{1}_{[t_i = t_k]} \cdot m^1_{i,k} \cdot \exp (\sm(\mathbf{v}^1_i, \mathbf{v}^2_k)/\tau)}{\sum_{l=1}^B \mathbbm{1}_{[l \neq k]} \exp (\sm(\mathbf{v}^1_i, \mathbf{v}^2_l)/\tau)}, 
\end{equation*}
\begin{equation*}
    \mathcal{L}^{k(2)}_{intra} = - \sum_{i}^B \log  \frac{\mathbbm{1}_{[t_k = t_i]} \cdot m^2_{k,i} \cdot \exp (\sm(\mathbf{v}^2_k, \mathbf{v}^1_i)/\tau)}{\sum_{l=1}^B \mathbbm{1}_{[l \neq i]} \exp (\sm(\mathbf{v}^2_k, \mathbf{v}^1_l)/\tau)}, 
\end{equation*}
\begin{equation*}
    \mathcal{L}_{intra} = \frac{1}{2B} \sum_{i,k}^B (\mathcal{L}^{i(1)}_{intra} + \mathcal{L}^{k(2)}_{intra}).
\end{equation*}
The full objective of $\mathcal{L}_{inter}$ is
\begin{equation*}
    \mathcal{L}^{i(1)}_{inter} = - \sum_{o}^B \log  \frac{\mathbbm{1}_{[i = k]} \cdot \mathbbm{1}_{[c^2_k = c^2_o]} \cdot \exp (\sm(\mathbf{u}^1_i, \mathbf{u}^2_o)/\tau)}{\sum_{l=1}^B \mathbbm{1}_{[l \neq k]} \exp (\sm(\mathbf{u}^1_i, \mathbf{u}^2_l)/\tau)}, 
\end{equation*}
\begin{equation*}
    \mathcal{L}^{k(2)}_{inter} = - \sum_{o}^B \log  \frac{\mathbbm{1}_{[k = i]} \cdot \mathbbm{1}_{[c^1_i = c^1_o]} \cdot \exp (\sm(\mathbf{u}^2_k, \mathbf{u}^1_o)/\tau)}{\sum_{l=1}^B \mathbbm{1}_{[l \neq i]} \exp (\sm(\mathbf{u}^2_k, \mathbf{u}^1_l)/\tau)}, 
\end{equation*}
\begin{equation*}
    \mathcal{L}_{inter} = \frac{1}{2B} \sum_{i,k}^B (\mathcal{L}^{i(1)}_{inter} + \mathcal{L}^{k(2)}_{inter}).
\end{equation*}
Here, $o$ ranges over the indices of all hidden vectors in the other modality that share the same cluster label as the pseudo-paired sample in that modality.


\subsection{Training Details}

In this section, we describe the training and experimental details, including architecture choices and the hyperparameters used for our experiments.

To perform matching, we first train a propensity score predictor $\psi$ for each modality across all four datasets. Since $X^1$ and $X^2$ for each dataset are represented as vectors, we train two separate MLPs, one for each modality. Each MLP has two hidden layers, with the input dimensions corresponding to $|X^1|$ and $|X^2|$, respectively, and the output dimension equal to $|T|$, as shown in Table~\ref{tab:dataset_specifications}. Each $\psi$ is trained using only the vectors of a single modality and their corresponding treatment label $t$.

\begin{table*}[ht]
\centering
\caption{Model Input and Output Dimensions for Each Dataset}
\label{tab:model_map_dimensions}
\begin{tabular}{|l|c|c|c|c|}
\hline
\textbf{Model/Map} & \textbf{BIRFL} & \textbf{ST} & \textbf{COMP} & \textbf{GKO} \\ \hline
$\psi^1$ & $\mathbb{R}^{768} \rightarrow \mathbb{R}^{7}$ & $\mathbb{R}^{1024} \rightarrow \mathbb{R}^{4}$ & $\mathbb{R}^{768} \rightarrow \mathbb{R}^{78}$ & $\mathbb{R}^{1024} \rightarrow \mathbb{R}^{25}$ \\ \hline
$\psi^2$ & $\mathbb{R}^{4096} \rightarrow \mathbb{R}^{7}$ & $\mathbb{R}^{1024} \rightarrow \mathbb{R}^{4}$ & $\mathbb{R}^{256} \rightarrow \mathbb{R}^{78}$ & $\mathbb{R}^{128} \rightarrow \mathbb{R}^{25}$ \\ \hline
$\phi^1$ & $\mathbb{R}^{768} \rightarrow \mathbb{R}^{512}$ & $\mathbb{R}^{1024} \rightarrow \mathbb{R}^{512}$ & $\mathbb{R}^{768} \rightarrow \mathbb{R}^{512}$ & $\mathbb{R}^{1024} \rightarrow \mathbb{R}^{512}$ \\ \hline
$\phi^2$ & $\mathbb{R}^{4096} \rightarrow \mathbb{R}^{512}$ & $\mathbb{R}^{1024} \rightarrow \mathbb{R}^{512}$ & $\mathbb{R}^{256} \rightarrow \mathbb{R}^{512}$ & $\mathbb{R}^{128} \rightarrow \mathbb{R}^{512}$ \\ \hline
$f^1, f^2$ & $\mathbb{R}^{512} \rightarrow \mathbb{R}^{512}$ & $\mathbb{R}^{512} \rightarrow \mathbb{R}^{512}$ & $\mathbb{R}^{512} \rightarrow \mathbb{R}^{512}$ & $\mathbb{R}^{512} \rightarrow \mathbb{R}^{512}$ \\ \hline
$g^1, g^2$ & $\mathbb{R}^{512} \rightarrow \mathbb{R}^{512}$ & $\mathbb{R}^{512} \rightarrow \mathbb{R}^{512}$ & $\mathbb{R}^{512} \rightarrow \mathbb{R}^{512}$ & $\mathbb{R}^{512} \rightarrow \mathbb{R}^{512}$ \\ \hline
\end{tabular}
\end{table*}


For all four datasets, we adopt the CLIP model architecture as implemented by the Hugging Face platform with the default Vision and Language configuration. However, we customize the model by excluding the encoder parts for both the image and text modalities, starting directly with the hidden and projection layers. For each dataset, the input modality dimensions are linearly projected into the CLIP model's hidden vector dimension. Both $\phi^1$ and $\phi^2$ produce 512-dimensional vectors, which serve as inputs to the downstream projection heads. Each projection head $f$ and $g$ is implemented as a three-layer MLP comprising hidden linear layers, ReLU activations, and batch normalization. The detailed input and output dimensions of the models with trainable parameters for each dataset are shown in Table~\ref{tab:model_map_dimensions}.

\begin{table}[ht]
\centering
\caption{Hyperparameter Choices for Each Dataset}
\label{tab:hyperparameters}
\resizebox{0.6\columnwidth}{!}{%
\begin{tabular}{|l|c|c|c|c|}
\hline
\textbf{Hyperparameter} & \textbf{BIRFL} & \textbf{ST} & \textbf{COMP} & \textbf{GKO} \\ \hline
%Seed & \multicolumn{4}{c|}{$\{2000, 42, 2034945, 2024, 1, 100, 1000, 10000\}$} \\ \hline
Learning Rate & $5 \times 10^{-7}$ & $1 \times 10^{-4}$ & $1 \times 10^{-6}$ & $1 \times 10^{-5}$ \\ \hline
Batch Size & 512 & 4096 & 4096 & 4096 \\ \hline
Epochs & 50 & 200 & 100 & 100 \\ \hline
\end{tabular}
}
\end{table}

For each dataset, we randomly split the data into training and test sets with an 80/20 ratio, with the split controlled by a random seed. For each dataset, we ran every hyperparameter setting using 8 random seeds.
%$\{2000, 42, 2034945, 2024, 1, 100, 1000, 10000\}$. 
We experimented with learning rates $\{10^{-3}, 10^{-4}, 10^{-5}, 10^{-6}, 5 \times 10^{-7}\}$, number of epochs $\{20, 50, 100, 200\}$, and batch sizes $\{128, 256, 512, 1024, 2048, 4096\}$. For the results reported, we selected the combination of learning rate, number of epochs, and batch size that achieved the highest average performance across all methods. The final results were reported as the mean and standard deviation over the 8 random seeds.
The detailed hyperparameter selections for each dataset are shown in Table~\ref{tab:hyperparameters}. These hyperparameters were chosen to ensure consistency and comparability across experiments.


\section{Extra Experiments and Results}

This section provides a comprehensive evaluation of the IPIC embeddings in the context of unpaired multimodal learning to show their effectiveness at capturing biological knowledge. We focus on metrics such as \textit{Perturbation Consistency}, \textit{iLISI}, and \textit{Structural Integrity}, adapted to unpaired data, where samples across modalities share treatment labels but lack direct pairwise correspondence.


\subsection{Full Results}
We present the full results, including both the mean and standard deviation, in Table~\ref{tab:results_full_std}. These results supplement the mean-only results reported in Table~\ref{tab:results} in the main text, providing a more comprehensive view of the performance across methods and datasets. 

For each experiment conducted, we first use the learned model to embed the training dataset. The embeddings from both modalities are then concatenated to form a multi-modality representation. Using these concatenated vectors, we train a logistic regression model, which is subsequently used to predict the labels on the concatenated test set embeddings. This process provides the accuracy for the zero-shot performance of the test set embeddings.


% \begin{table*}[ht]
%     \centering
%     \caption{Test Set Results on Our Four Curated Datasets with Treatment and Downstream Task Label Predictions (Mean ± Std)}
%     \label{tab:results_std}
%     \resizebox{\textwidth}{!}{
%     \begin{tabular}{lcccccccc}
%         \toprule
%         & \multicolumn{2}{c}{BIRFL} & \multicolumn{2}{c}{ST} & \multicolumn{2}{c}{COMP} & \multicolumn{2}{c}{GKO} \\
%         \cmidrule(lr){2-3} \cmidrule(lr){4-5} \cmidrule(lr){6-7} \cmidrule(lr){8-9}
%         Method & Treatment & Task & Treatment & Task & Treatment & Task & Treatment & Task \\
%         \midrule
%         InfoNCE-paired      & 81.4 ± 1.3 & 97.2 ± 1.1 & 98.5 ± 0.1 & 80.9 ± 0.2 & - & - & - & - \\
%         \hline
%         InfoNCE-unpaired    & 81.4 ± 1.4 & 64.7 ± 3.6 & 97.0 ± 0.2 & 68.4 ± 0.4 & 95.6 ± 3.6 & 72.9 ± 2.4 & 22.6 ± 1.6 & 36.9 ± 1.0 \\
%         SupCon\cite{khosla2020supervised}             & 83.0 ± 1.0 & 65.3 ± 3.4 & \textbf{99.4 ± 0.2} & 68.9 ± 0.5 & 96.7 ± 2.5 & 73.4 ± 2.4 & \textbf{34.1 ± 1.8} & 33.7 ± 1.5 \\
%         InfoCore\cite{wang2024removing}            & 82.1 ± 3.5 & 65.2 ± 0.9 & 98.8 ± 0.2 & 68.4 ± 0.4 & 96.3 ± 3.0 & 73.6 ± 2.6 & 27.8 ± 1.6 & 34.9 ± 1.6 \\
%         1X-Matching            & 81.5 ± 0.5 & 66.7 ± 2.5 & 95.9 ± 0.2 & 68.6 ± 0.5 & 95.7 ± 3.4 & 73.0 ± 2.2 & 23.6 ± 1.5 & 36.6 ± 1.0 \\
%         WCL\cite{zheng2021weakly}                 & 81.1 ± 0.9 & 67.4 ± 2.3 & 96.6 ± 0.2 & 68.8 ± 0.4 & 96.4 ± 2.9 & 75.6 ± 2.2 & 23.2 ± 1.6 & 36.8 ± 0.8 \\
%         XDC\cite{alwassel2020self}                 & 80.8 ± 1.0 & 67.2 ± 3.3 & 94.7 ± 0.3 & 68.4 ± 0.4 & 96.2 ± 2.8 & 77.6 ± 2.2 & 21.8 ± 1.4 & 37.3 ± 0.9 \\
%         \hline
%         Ours (intra)        & 83.4 ± 0.3 & 69.7 ± 0.3 & 97.8 ± 0.3 & 70.4 ± 0.4 & 97.4 ± 0.3 & 78.5 ± 0.9 & 26.0 ± 0.4 & 42.8 ± 0.4 \\
%         Ours (intra+inter)     & \textbf{83.5 ± 0.2} & \textbf{71.5 ± 0.5} & 97.7 ± 0.1 & \textbf{70.7 ± 0.2} &  \textbf{97.8 ± 0.2} & \textbf{80.2 ± 0.7} & 26.4 ± 0.5 & \textbf{43.3 ± 0.2} \\
%         \bottomrule
%     \end{tabular}
%     }
% \end{table*}


\begin{table*}[t]
    \centering
    \caption{Test Set Results on Our Four Curated Datasets with Treatment and Downstream Task Label Predictions (Mean $\pm$ Std). \textbf{Bold} indicates best performance.}
    \label{tab:results_full_std}
    \resizebox{\textwidth}{!}{
    \begin{tabular}{lcccccccc}
        \toprule
         & \multicolumn{2}{c}{BIRFL} & \multicolumn{2}{c}{ST} & \multicolumn{2}{c}{COMP} & \multicolumn{2}{c}{GKO} \\
        \cmidrule(lr){2-3} \cmidrule(lr){4-5} \cmidrule(lr){6-7} \cmidrule(lr){8-9}
        Method & Treatment & Task & Treatment & Task & Treatment & Task & Treatment & Task \\
        \midrule
        \multicolumn{9}{l}{\textit{Reference Baselines}} \\
        Raw Features ($x^1||x^2$)       & 77.0 $\pm$ 0.4 & 55.4 $\pm$ 1.2 & 77.7 $\pm$ 0.3 & 63.6 $\pm$ 0.5 & 74.3 $\pm$ 1.1 & 58.6 $\pm$ 1.4 & 19.9 $\pm$ 0.2 & 28.9 $\pm$ 0.6 \\
        InfoNCE-paired (Oracle)         & 81.4 $\pm$ 1.3 & 97.2 $\pm$ 1.1 & 98.5 $\pm$ 0.1 & 80.9 $\pm$ 0.2 & - & - & - & - \\
        FactorCL-SSL                    & 80.7 $\pm$ 1.5 & 64.5 $\pm$ 2.8 & 92.1 $\pm$ 0.4 & 67.3 $\pm$ 0.6 & 90.9 $\pm$ 2.1 & 72.5 $\pm$ 1.9 & 25.1 $\pm$ 1.1 & 37.9 $\pm$ 1.3 \\
        FactorCL-Sup                    & 81.0 $\pm$ 1.2 & 66.5 $\pm$ 2.5 & 95.7 $\pm$ 0.3 & 68.9 $\pm$ 0.5 & 91.6 $\pm$ 2.4 & 73.4 $\pm$ 2.1 & 30.7 $\pm$ 1.5 & 35.2 $\pm$ 1.4 \\
        InfoNCE-unpaired\cite{chen2020simple}              & 81.4 $\pm$ 1.4 & 64.7 $\pm$ 3.6 & 97.0 $\pm$ 0.2 & 68.4 $\pm$ 0.4 & 95.6 $\pm$ 3.6 & 72.9 $\pm$ 2.4 & 22.6 $\pm$ 1.6 & 36.9 $\pm$ 1.0 \\
        SupCon\cite{khosla2020supervised}                 & 83.0 $\pm$ 1.0 & 65.3 $\pm$ 3.4 & \textbf{99.4 $\pm$ 0.2} & 68.9 $\pm$ 0.5 & 96.7 $\pm$ 2.5 & 73.4 $\pm$ 2.4 & \textbf{34.1 $\pm$ 1.8} & 33.7 $\pm$ 1.5 \\
        InfoCore\cite{wang2024removing}                & 82.1 $\pm$ 3.5 & 65.2 $\pm$ 0.9 & 98.8 $\pm$ 0.2 & 68.4 $\pm$ 0.4 & 96.3 $\pm$ 3.0 & 73.6 $\pm$ 2.6 & 27.8 $\pm$ 1.6 & 34.9 $\pm$ 1.6 \\
        WCL\cite{zheng2021weakly}                     & 81.1 $\pm$ 0.9 & 67.4 $\pm$ 2.3 & 96.6 $\pm$ 0.2 & 68.8 $\pm$ 0.4 & 96.4 $\pm$ 2.9 & 75.6 $\pm$ 2.2 & 23.2 $\pm$ 1.6 & 36.8 $\pm$ 0.8 \\
        XDC\cite{alwassel2020self}                    & 80.8 $\pm$ 1.0 & 67.2 $\pm$ 3.3 & 94.7 $\pm$ 0.3 & 68.4 $\pm$ 0.4 & 96.2 $\pm$ 2.8 & 77.6 $\pm$ 2.2 & 21.8 $\pm$ 1.4 & 37.3 $\pm$ 0.9 \\
        \midrule
        \multicolumn{9}{l}{\textit{Matching Strategy Ablation}} \\
        1X-Matching (SNN)               & 74.2 $\pm$ 1.8 & 65.2 $\pm$ 2.1 & 93.9 $\pm$ 0.5 & 68.2 $\pm$ 0.6 & 91.1 $\pm$ 3.1 & 74.2 $\pm$ 2.5 & 19.7 $\pm$ 1.9 & 37.5 $\pm$ 1.2 \\
        1X-Matching (OT)                & 81.5 $\pm$ 0.5 & 66.7 $\pm$ 2.5 & 95.9 $\pm$ 0.2 & 68.6 $\pm$ 0.5 & 95.7 $\pm$ 3.4 & 73.0 $\pm$ 2.2 & 23.6 $\pm$ 1.5 & 36.6 $\pm$ 1.0 \\
        Ours (intra-SNN)                & 73.9 $\pm$ 0.8 & 69.4 $\pm$ 1.2 & 96.5 $\pm$ 0.3 & 68.0 $\pm$ 0.5 & 92.4 $\pm$ 1.5 & 76.1 $\pm$ 1.8 & 20.7 $\pm$ 1.1 & 41.3 $\pm$ 0.9 \\
        Ours (intra-OT)                 & 83.4 $\pm$ 0.3 & 69.7 $\pm$ 0.3 & 97.8 $\pm$ 0.3 & 70.4 $\pm$ 0.4 & 97.4 $\pm$ 0.3 & 78.5 $\pm$ 0.9 & 26.0 $\pm$ 0.4 & 42.8 $\pm$ 0.4 \\
        \midrule
        \multicolumn{9}{l}{\textit{Clustering Ablation (IPIC)}} \\
        Ours (intra+inter, $K=0.25|T|$) & - & - & - & - & 97.3 $\pm$ 0.6 & 80.1 $\pm$ 1.0 & 25.6 $\pm$ 0.5 & 39.4 $\pm$ 0.7 \\
        Ours (intra+inter, $K=0.5|T|$)  & 82.3 $\pm$ 0.6 & 66.4 $\pm$ 1.5 & 96.7 $\pm$ 0.3 & 70.0 $\pm$ 0.4 & 97.4 $\pm$ 0.5 & \textbf{80.3 $\pm$ 0.8} & 25.4 $\pm$ 0.6 & 43.0 $\pm$ 0.5 \\
        \textbf{Ours (intra+inter, $K=|T|$)}     & \textbf{83.5 $\pm$ 0.2} & \textbf{71.5 $\pm$ 0.5} & 97.7 $\pm$ 0.1 & \textbf{70.7 $\pm$ 0.2} & \textbf{97.8 $\pm$ 0.2} & 80.2 $\pm$ 0.7 & 26.4 $\pm$ 0.5 & \textbf{43.3 $\pm$ 0.2} \\
        Ours (intra+inter, $K=2|T|$)    & 83.5 $\pm$ 0.3 & 69.3 $\pm$ 0.9 & 97.0 $\pm$ 0.2 & 70.1 $\pm$ 0.3 & 97.3 $\pm$ 0.4 & 79.7 $\pm$ 0.8 & 23.6 $\pm$ 0.5 & 42.0 $\pm$ 0.4 \\
        Ours (intra+inter, $K=4|T|$)    & 82.8 $\pm$ 0.5 & 69.2 $\pm$ 1.1 & 96.9 $\pm$ 0.2 & 70.0 $\pm$ 0.4 & 97.2 $\pm$ 0.5 & 79.0 $\pm$ 0.9 & 23.7 $\pm$ 0.6 & 42.2 $\pm$ 0.3 \\
        \bottomrule
    \end{tabular}
    }
\end{table*}


\begin{table*}[ht]
\centering
\caption{Comparison of Performance Across Four Datasets with Single- and Multi-Modal Averages}
\label{tab:comparison}
\resizebox{\textwidth}{!}{
\begin{tabular}{lcccc|cccc|cccc|cccc}
\toprule
& \multicolumn{4}{c}{\textbf{BIRFL}} & \multicolumn{4}{c}{\textbf{ST}} & \multicolumn{4}{c}{\textbf{COMP}} & \multicolumn{4}{c}{\textbf{GKO}} \\
\cmidrule(lr){2-5} \cmidrule(lr){6-9} \cmidrule(lr){10-13} \cmidrule(lr){14-17}
\textbf{Method} & mod1 & mod2 & avg & multimod & mod1 & mod2 & avg & multimod & mod1 & mod2 & avg & multimod & mod1 & mod2 & avg & multimod \\
\midrule
InfoNCE   & 56.2 & 33.5 & 44.9 & 64.7 & 69.9 & 60.7 & 65.3 & 68.4 & 69.8 & 38.9 & 54.8 & 72.9 & 27.1 & 32.6 & 29.9 & 36.9 \\
SupCon    & 58.5 & 36.1 & 47.3 & 65.3 & 70.4 & 61.2 & 65.8 & 68.9 & 70.4 & 38.5 & 54.5 & 73.4 & 27.2 & 31.7 & 29.5 & 33.7 \\
1X-Matching & 60.5 & 37.7 & 49.1 & 66.7 & 69.6 & 60.7 & 65.2 & 68.6 & 70.4 & 37.9 & 54.2 & 73.0 & 27.2 & 33.4 & 30.3 & 36.6 \\
WCL       & 63.9 & 37.6 & 50.8 & 67.4 & 69.4 & 61.0 & 65.2 & 68.8 & 72.9 & 38.8 & 55.9 & 75.6 & 23.8 & 33.7 & 28.8 & 36.8 \\
\hline
Ours      & 64.6 & 39.5 & 52.1 & 71.5 & 72.2 & 62.1 & 67.2 & 70.7 & 74.5 & 39.4 & 57.0 & 80.2 & 27.9 & 41.2 & 34.6 & 43.3 \\
\bottomrule
\end{tabular}
}
\end{table*}

% Add a table in here (15 minutes)

\subsection{Single-Modality vs.\ Multi-Modality}
To assess whether multi-modality embeddings outperform single-modality embeddings, for each method and each dataset, we use the learned encoders $\phi^1$ and $\phi^2$ to obtain embeddings for each modality. These embeddings are then used to perform zero-shot prediction tasks on the downstream task label $Y$.
From the results shown in Table~\ref{tab:comparison}, we observe that, with the exception of the image modality outperforming the concatenated multi-modality embedding in the ST dataset, all other datasets demonstrate that multi-modality training is more beneficial than single-modality embeddings. Multi-modality learning enhances both data modalities, as the average performance of single-modality embeddings for downstream tasks is consistently lower than that of concatenated multi-modality embeddings. Specifically, the accuracy drops by $4\%$ to $26\%$ when using single-modality embeddings.
Additionally, Table~\ref{tab:comparison} highlights a trend where our proposed IPIC method consistently improves the quality of each single-modality embedding compared to baseline methods, further demonstrating its effectiveness in multi-modality learning.


\subsection{Perturbation Consistency}

The \textit{Perturbation Consistency} metric evaluates how consistently a model captures the effects of treatments across samples and batches. For unpaired data, this involves assessing similarities within and across treatment groups for embeddings from each modality. The metric is computed as follows:


\textbf{1. Average Within-Group Similarity:}



Given embeddings $\mathbf{v}_i$ for samples in treatment group $T_k$, the cosine similarity between embeddings is computed as:
\begin{equation}
    \text{cos}(\mathbf{v}_i, \mathbf{v}_j) = \frac{\mathbf{v}_i \cdot \mathbf{v}_j}{\|\mathbf{v}_i\| \|\mathbf{v}_j\|}.
\end{equation}

The average similarity for group $T_k$ is:
\begin{equation}
    \text{AvgSim}_{T_k} = \frac{1}{|T_k|^2} \sum_{i, j \in T_k} \text{cos}(\mathbf{v}_i, \mathbf{v}_j).
\end{equation}

\textbf{2. Contrast Between Groups:}
For embeddings from different treatment groups $T_k$ and $T_l$ ($k \neq l$), the inter-group similarity is:

\begin{equation}
    \text{AvgSim}_{T_k, T_l} = \frac{1}{|T_k||T_l|} \sum_{i \in T_k, j \in T_l} \text{cos}(\mathbf{v}_i, \mathbf{v}_j).
\end{equation}

\textbf{3. Perturbation Consistency Score:}
The final score measures the separation of within-group similarities from across-group similarities:
\begin{equation}
    \text{Perturbation Consistency} = \frac{\sum_k \text{AvgSim}_{T_k}}{\sum_{k, l, k \neq l} \text{AvgSim}_{T_k, T_l}}.
\end{equation}


\begin{table}[bt!]
\centering
\caption{Perturbation Consistency}
\label{tab:perturb}
\begin{tabular}{lcccc}
\hline
\textbf{Method}      & \textbf{BIRFL} & \textbf{ST}  & \textbf{COMP} & \textbf{GKO}  \\ \hline
Original Input       & 0.178         & 0.337        & 0.085         & 0.042         \\
InfoNCE              & 0.171          & 0.371        & 0.086         & 0.083         \\
SupCon               & 0.278          & 0.438        & 0.109         & 0.120         \\
WCL                  & 0.176          & 0.389        & 0.103         & 0.043         \\
1X-Matching             & 0.172          & 0.372        & 0.087         & 0.086         \\ \hline
IPIC (Ours)          & 0.180          & 0.433        & 0.114         & 0.129         \\ \hline
\end{tabular}
\end{table}

For simplicity, we concatenated embeddings from both modalities for each ``paired sample'' to compute the Perturbation Consistency score. 

Higher scores indicate better preservation of treatment effects within groups. From the results shown in Table~\ref{tab:perturb}, we observe that the Perturbation Consistency score computed from the original input vectors $X^1$ and $X^2$ is the lowest on the ST, COMP, and GKO datasets. For the BIRFL dataset, methods such as InfoNCE, 1X-Matching, and WCL slightly decrease the perturbation consistency of the embeddings relative to the original inputs. In contrast, our IPIC method outperforms these methods and the original input vectors on all four datasets, and it surpasses SupCon, which directly uses treatment as labels during training, on the COMP and GKO datasets while remaining close to it on ST.
These results demonstrate that our IPIC method successfully preserves treatment effects within each treatment group, further validating its effectiveness.



\begin{figure*}[t]
    \centering
    \includegraphics[width=\textwidth]{sec/figs/suppfig4.png}
    %  \begin{tikzpicture}
    %     \fill[black] (0,0) rectangle (8,6);
    %     \node[white] at (4,3) {\Huge \(t\)-SNE Visualization};
    % \end{tikzpicture}
    \caption{\(t\)-SNE visualization of embeddings. The plots illustrate separability by treatment class (\textit{T}) (top) and downstream tasks labels (\textit{Y}) (bottom) for pretraining embeddings (left) and IPIC embeddings (right).}
    \label{fig:tsne_separability}
\end{figure*}

\subsection{Data Integration and Batch Effect Reduction} 
\label{subsec:ilisi}

In biological experiments, data are often collected in distinct batches (groups of samples processed under varying conditions or at different times). These batch-specific variations, known as \textit{batch effects}, can obscure true biological signals and hinder accurate perturbation analysis. Effective batch integration is essential to ensure that comparisons in the latent space reflect genuine biological differences rather than technical artifacts.

The \textit{Integration Local Inverse Simpson's Index (iLISI)} measures how well a model mitigates batch effects by evaluating the diversity of batch labels within the neighborhood of each sample in the embedding space~\citep{korsunsky2019fast}. A high iLISI score signifies successful batch mixing, enabling the latent space to highlight biological variation while minimizing technical noise. The iLISI computation follows these steps:

\textbf{1. Neighborhood Probabilities:}
For each sample \( i \), the conditional probability \( p_{ic} \) of its neighbors belonging to batch \( c \) is computed as:
\[
p_{ic} = \sum_{\substack{j \in \mathcal{N}_i \\ l_j = c}} p_{ij},
\]
where \( \mathcal{N}_i \) is the set of \( k \)-nearest neighbors of sample \( i \), \( l_j \) is the batch label of neighbor \( j \), and \( p_{ij} \) is the probability defined as:
\[
p_{ij} = \frac{\exp(-\beta_i d_{ij})}{\sum_{m \in \mathcal{N}_i} \exp(-\beta_i d_{im})}.
\]
Here, \( d_{ij} \) denotes the distance between samples \( i \) and \( j \) in the embedding space, and \( \beta_i \) is a scaling parameter ensuring that the entropy of \( P_i = \{p_{ij}\} \) matches \( \log(k) \).

\textbf{2. Inverse Simpson's Index:}
For each sample \( i \), the diversity of batch labels in its neighborhood is measured using the Inverse Simpson's Index:
\[
\text{ISI}_i = \left( \sum_{c \in C} p_{ic}^2 \right)^{-1},
\]
where \( C \) is the set of all batch labels.

\textbf{3. Final iLISI Score:}
The overall iLISI score is the mean ISI across all samples:
\[
\text{iLISI} = \frac{1}{n} \sum_{i=1}^n \text{ISI}_i.
\]
A higher iLISI score indicates better batch integration, as samples are more evenly distributed across all batches within their neighborhoods.

We evaluated the iLISI scores for embeddings generated by IPIC (concatenated embeddings) and compared them to baseline models, which include the original modality encoders with their respective pretraining schemes. The comparison focuses on batch integration across two datasets: GKO (with batch ID categories) and ST (with slide ID categories).

\begin{table}[bt!]
    \centering
    \caption{iLISI Scores for Batch Integration}
    \resizebox{0.6\columnwidth}{!}{%
    \begin{tabular}{lcc}
        \hline
        \textbf{Model} & \textbf{GKO (Batch ID)} & \textbf{ST (Slide ID)} \\
        \hline
        IPIC ($\phi^1 \Vert \phi^2$) & 1.82 & 1.67 \\
        $\mathcal{F}_1$ (Original Encoder) & 1.52  & 1.12\\
        $\mathcal{F}_2$ (Original Encoder) & 1.66 & 1.1\\
        $\phi^1$ & 1.78 & 1.66 \\
        $\phi^2$ & 1.79 & 1.35 \\
        \hline
    \end{tabular}%
    }
    \label{tab:ilisi_results}
\end{table}


Table~\ref{tab:ilisi_results} presents the iLISI scores, providing a quantitative comparison of integration performance between IPIC and baseline models (individual original encoders $\mathcal{F}$ and learned encoders $\phi$). IPIC achieved consistently higher iLISI scores than the original encoders, demonstrating superior batch effect reduction and data integration. Notably, the learned encoders ($\phi$) for each modality also showed significant improvement over their respective pretrained embeddings.

These results emphasize IPIC's ability to leverage multimodal embeddings effectively, reducing batch effects and integrating samples into a unified representation space more robustly than the baseline approaches.

\subsection{Separability of Embeddings}
\label{subsec:separability_embeddings}

The ability of embeddings to separate meaningful biological signals from batch effects or other confounding factors is a critical property for robust representation learning. We evaluated the separability of embeddings generated by IPIC on the ST dataset using \(t\)-SNE visualizations, focusing on metadata categories such as treatment class (\textit{T}) and downstream task labels (\textit{Y}). This visualization allows for a qualitative assessment of how well the embeddings group similar treatments while also distinguishing orthogonal downstream task labels.
Figure~\ref{fig:tsne_separability} shows \(t\)-SNE plots comparing pre- and post-training embeddings for both modalities. Pretraining embeddings exhibited significant overlap between distinct treatment classes, often clustering according to batch effects. In contrast, post-training embeddings generated by IPIC demonstrated clear separation of treatment classes and better distinction between downstream task labels.

The \(t\)-SNE visualization highlights the enhanced ability of IPIC to separate embeddings by treatment class while reducing the influence of batch effects by separating the downstream task labels clearly. This improvement indicates that IPIC embeddings better capture biologically meaningful signals compared to baseline models. However, further quantitative evaluations, such as classification accuracy or silhouette scores, could provide additional insights into the robustness of these embeddings.

% \subsection{Zero-Shot Retrieval of Known Biological Relationships}
% \label{subsec:zero_shot_relationships}

% Evaluating model representations for their ability to capture known biological relationships is critical for assessing their biological relevance. Using gene embeddings generated by IPIC following gene knockout perturbations, we assess their alignment with established biological knowledge. Gene-to-gene distances are computed directly from the latent-space embeddings, and their biological validity is benchmarked against curated databases such as CORUM, HuMAP, Reactome, SIGNOR, and StringDB, as outlined by \citet{Celik2022} and \citet{bendidi2024benchmarking}.

% We assess the ability of IPIC embeddings to capture known biological relationships by calculating pairwise cosine similarities between aggregated perturbation embeddings on the GKO dataset:
% \[
% \text{cos}(\mathbf{v}_i, \mathbf{v}_j) = \frac{\mathbf{v}_i \cdot \mathbf{v}_j}{\|\mathbf{v}_i\| \|\mathbf{v}_j\|}.
% \]
% Self-similarities are excluded to avoid distortion. Predicted relationships are defined as pairs with cosine similarities within the top \textcolor{red}{5\%} (cooperative) or bottom \textcolor{red}{5\%}  (antagonistic) of the similarity distribution. This approach identifies gene pairs most likely to exhibit strong biological interactions based on the model's latent space.

% \paragraph{Recall Calculation}: To evaluate the biological relevance of these predicted relationships, we compute recall: 
% \[
% \text{Recall} = \frac{\#\text{(True Positive Links)}}{\#\text{(Total Known Links)}} \times 100,
% \]
% where true positive links correspond to predicted relationships that match known interactions from curated databases. Only relationships present in the perturbation dataset are considered to ensure fairness. Recall is calculated for each database individually and averaged across CORUM, HuMAP, Reactome, SIGNOR, and StringDB for a comprehensive assessment.

% \paragraph{Significance and Results}: This metric evaluates how effectively the latent-space embeddings reflect known biological structure. \textcolor{red}{The results, summarized in Table~\ref{tab:known_relationships}, compare IPIC embeddings to baseline models, highlighting IPIC's ability to align with established biological knowledge.}

% \begin{table}[bt!]
%     \centering
%     \caption{Recall (\%) of Known Biological Relationships on the GKO dataset}
%     \begin{tabular}{lccc}
%         \hline
%         \textbf{Database} & \textbf{IPIC} & \textbf{BASELINE1} & \textbf{BASELINE2} \\
%         \hline
%         CORUM & [X] & [Y] & [Z] \\
%         HuMAP & [X] & [Y] & [Z] \\
%         Reactome & [X] & [Y] & [Z] \\
%         SIGNOR & [X] & [Y] & [Z] \\
%         StringDB & [X] & [Y] & [Z] \\
%         \hline
%     \end{tabular}
% \label{tab:known_relationships}
% \end{table}




% \section{Detailed Matching Setup}
% mostly adopting from Johnny's paper


