% \newpage
% total 16 columns (writing 13-14 coulumns)
% experiment and results ==> 4 columns (9 10 11 12)

% ST \cite{cui2022spatially}, IRFL \cite{yosef2023irfl}, rxrx3 \cite{fay2023rxrx3}, phenom-1 \cite{kraus2024masked}, 
% CLIP paper \cite{radford2021learning} cell-paint \cite{bray2016cell} drug-seq \cite{ye2018drug} perturb-seq \cite{dixit2016perturb}
% gko \cite{bock2022high}




% \begin{table}[ht]
%     \centering
% \caption{Dataset specifications. $N$: \# samples, $|T|$: \# treatments, $|Y|$: \# task labels. $\mathcal{F}^1, \mathcal{F}^2$: embedding model, $|X^1|$, $|X^2|$: embedding dimensions. "Paired" indicates if the dataset includes original pairings.}    \label{tab:dataset_specifications}
%     \resizebox{\columnwidth}{!}{
%     \begin{tabular}{ccccc}
%         \toprule
%         & BIRFL & ST & COMP & GKO \\
%         \midrule
%         $N$         & 5502 & 57363 & 36562  & 59011 \\
%         $|T|$       & 7 & 4& 78& 25\\
%         $|Y|$       & 3& 2& 3& 5\\
%         $\mathcal{F}^1$       & ViT\cite{dosovitskiy2020image}& UNI\cite{chen2024uni} & Phenom1\cite{kraus2024masked} & Phenom1\cite{kraus2024masked}\\
%         $\mathcal{F}^2$       & SFR\cite{lee2024nv} & scVI\cite{lopez2018deep} & scVI\cite{lopez2018deep} & scVI\cite{lopez2018deep} \\
%         $|X^1|$  & 768& 1024& 768 & 1024 \\
%         $|X^2|$  & 4096 & 1024 & 256 & 128\\
%         Paired  & \checkmark & \checkmark & \text{\sffamily X} & \text{\sffamily X}\\
%         \bottomrule
%     \end{tabular}}
% \end{table}

% \begin{table}[t]
%     \centering
%     \caption{Dataset specifications. $N$: samples, $|T|$: treatments, $|Y|$: labels. Dimensions $|X^{1,2}|$ correspond to encoders $\mathcal{F}^{1,2}$.}
%     \label{tab:dataset_specifications}
%     \footnotesize % Shrink font slightly
%     \setlength{\tabcolsep}{3.5pt} % Reduce space between columns to fit width
%     \begin{tabular}{l c c c l l c c c}
%         \toprule
%         Dataset & $N$ & $|T|$ & $|Y|$ & $\mathcal{F}^1$ (Emb.) & $\mathcal{F}^2$ (Emb.) & $|X^1|$ & $|X^2|$ & Paired \\
%         \midrule
%         BIRFL & 5,502 & 7 & 3 & ViT\cite{dosovitskiy2020image} & SFR\cite{lee2024nv} & 768 & 4096 & \checkmark \\
%         ST & 57,363 & 4 & 2 & UNI\cite{chen2024uni} & scVI\cite{lopez2018deep} & 1024 & 1024 & \checkmark \\
%         COMP & 36,562 & 78 & 3 & Phenom1\cite{kraus2024masked} & scVI\cite{lopez2018deep} & 768 & 256 & \text{\sffamily X} \\
%         GKO & 59,011 & 25 & 5 & Phenom1\cite{kraus2024masked} & scVI\cite{lopez2018deep} & 1024 & 128 & \text{\sffamily X} \\
%         \bottomrule
%     \end{tabular}
%     \vspace{-3mm} % Optional: pulls text closer to bottom of table
% \end{table}


% \begin{table*}[t]
%     \centering
%     \caption{Test Set Results on Four Curated Datasets. We report accuracy for Treatment and Task label predictions.}
%     \label{tab:results}
%     \small % Slightly smaller font to match MIDL style
%     \setlength{\tabcolsep}{4pt} % Tighten space between columns
%     \begin{tabular}{l c c c c c c c c}
%         \toprule
%          & \multicolumn{2}{c}{BIRFL} & \multicolumn{2}{c}{ST} & \multicolumn{2}{c}{COMP} & \multicolumn{2}{c}{GKO} \\
%         \cmidrule(lr){2-3} \cmidrule(lr){4-5} \cmidrule(lr){6-7} \cmidrule(lr){8-9}
%         Method & Treat. & Task & Treat. & Task & Treat. & Task & Treat. & Task \\
%         \midrule
%         InfoNCE-paired & 81.4 & 97.2 & 98.5 & 80.9 & - & - & - & - \\
%         \midrule
%         InfoNCE-unpaired & 81.4 & 64.7 & 97.0 & 68.4 & 95.6 & 72.9 & 22.6 & 36.9 \\
%         SupCon\cite{khosla2020supervised} & 83.0 & 65.3 & \textbf{99.4} & 68.9 & 96.7 & 73.4 & \textbf{34.1} & 33.7 \\
%         InfoCore\cite{wang2024removing} & 82.1 & 65.2 & 98.8 & 68.4 & 96.3 & 73.6 & 27.8 & 34.9 \\
%         1X-Matching & 81.5 & 66.7 & 95.9 & 68.6 & 95.7 & 73.0 & 23.6 & 36.6 \\
%         WCL\cite{zheng2021weakly} & 81.1 & 67.4 & 96.6 & 68.8 & 96.4 & 75.6 & 23.2 & 36.8 \\
%         XDC\cite{alwassel2020self} & 80.8 & 67.2 & 94.7 & 68.4 & 96.2 & 77.6 & 21.8 & 37.3 \\
%         \midrule
%         Ours (intra) & 83.4 & 69.7 & 97.8 & 70.4 & 97.4 & 78.5 & 26.0 & 42.8 \\
%         Ours (intra+inter) & \textbf{83.5} & \textbf{71.5} & 97.7 & \textbf{70.7} & \textbf{97.8} & \textbf{80.2} & 26.4 & \textbf{43.3} \\
%         \bottomrule
%     \end{tabular}
%     \vspace{-3mm} % Pulls following text up
% \end{table*}
% \begin{table*}[ht]
%     \centering
%     \caption{Test Set Results on Our Four Curated Datasets with Treatment and Downstream Task Label Predictions}
%     \label{tab:results}
%     \begin{tabular}{lcccccccc}
%         \toprule
%         & \multicolumn{2}{c}{BIRFL} & \multicolumn{2}{c}{ST} & \multicolumn{2}{c}{COMP} & \multicolumn{2}{c}{GKO} \\
%         \cmidrule(lr){2-3} \cmidrule(lr){4-5} \cmidrule(lr){6-7} \cmidrule(lr){8-9}
%         Method & Treatment & Task & Treatment & Task & Treatment & Task & Treatment & Task \\
%         \midrule
%         InfoNCE-paired      & 81.4& 97.2 & 98.5 & 80.9 & - & - & - & - \\
%         \hline
%         InfoNCE-unpaired    & 81.4 & 64.7 & 97.0 & 68.4 & 95.6 & 72.9 & 22.6 & 36.9 \\
%         SupCon\cite{khosla2020supervised}             & 83.0 & 65.3 & \textbf{99.4} & 68.9 & 96.7 & 73.4 & \textbf{34.1} & 33.7 \\
%         InfoCore\cite{wang2024removing}            & 82.1 & 65.2 & 98.8 & 68.4 & 96.3  &  73.6 & 27.8 & 34.9 \\
%         1X-Matching            & 81.5 & 66.7 & 95.9 & 68.6 & 95.7  & 73.0 & 23.6 & 36.6 \\
%         WCL\cite{zheng2021weakly}                 & 81.1 & 67.4 & 96.6 & 68.8 & 96.4 & 75.6 & 23.2 & 36.8 \\
%         XDC\cite{alwassel2020self}                 & 80.8 & 67.2 & 94.7 & 68.4 & 96.2 & 77.6 & 21.8 & 37.3 \\
%         \hline
%         Ours (intra)        & 83.4 & 69.7 & 97.8 & 70.4 & 97.4 & 78.5 & 26.0 & 42.8 \\
%         Ours (intra+inter)     & \textbf{83.5} & \textbf{71.5} & 97.7 & \textbf{70.7} &  \textbf{97.8} & \textbf{80.2} & 26.4 & \textbf{43.3} \\
%         \bottomrule
%     \end{tabular}
% \end{table*} 

% \begin{itemize}
%     \item dataset: single-cell HUVEC cell-lines
%     \item Treatment (T): T small molecule compound
%     \item Task (Y): Y concentration
%     \item Modality 1 (X1): X1 cell-painting images
%     \item Modality 2 (X2): X2 trek-seq scRNA sequences
% \end{itemize}
% single-cell HUVEC cell-lines, T small molecule compound, Y concentration, X1 cell-painting images, X2 trek-seq scRNA sequences

% Different setup of SSL training
% \begin{itemize}
%     \item SimCLR + MLP
%     \item Pair Corrected Loss + MLP
%     \item Pair Corrected Loss \& FCL + MLP
% \end{itemize}
% Foundation V.S. Raw
% \begin{itemize}
%     \item Raw image \& Raw Sequnece (HVGs)
%     \item Embed(Im) \& Embed(Seq) 
%     \item Found-Embed(Im)~\cite{kraus2024masked} \& Found-Embed(Seq)
% \end{itemize}

\section{Experiments}

\subsection{Datasets}
Our benchmarks consist of two modalities ($X^1, X^2$), distinct treatments ($T$), and orthogonal downstream labels ($Y$) related to the latent distribution; detailed descriptions and generation protocols are provided in Appendix~\ref{apx:dataset}.

\noindent\textbf{Bio-Augmented IRFL (BIRFL):} Derived from IRFL~\cite{yosef2023irfl}, this dataset applies seven augmentations ($T$) to 786 image-text pairs and introduces random noise to mimic biological variability. This results in 5,502 unique pairs of images ($X^1$) and captions ($X^2$), with $Y$ representing figurative types.
\textbf{Spatial Transcriptomics (ST):} Sourced from \citet{cui2022spatially}, this dataset contains 57,363 paired pathology image patches ($X^1$) and RNA readouts ($X^2$) from 15 pancreatic cancer biopsies. $T$ denotes one of four chemotherapy regimens, and $Y$ indicates the presence of tumor tissue.
\textbf{Compound Treated (COMP):} An unpaired dataset of HUVEC cells~\cite{baudin2007protocol} treated with 78 bioactive compounds ($T$). We randomly paired 36,562 samples consisting of cell-painting crops ($X^1$)~\cite{bray2016cell,fay2023rxrx3} and bulk RNA-seq data ($X^2$)~\cite{ye2018drug} sharing the same compound. $Y$ denotes solution concentration.
\textbf{Gene Knockout (GKO):} Similar to COMP, this dataset links HUVEC cells subjected to 25 CRISPR knockouts~\cite{bock2022high} ($T$). We associated 59,011 cell-painting crops ($X^1$)~\cite{bray2016cell} with perturb-seq profiles ($X^2$)~\cite{dixit2016perturb} via shared treatments. $Y$ represents binned total gene counts.

\subsection{Experimental Setup}
To simulate the unpaired setting for the naturally paired datasets (BIRFL and ST), we shuffled samples within treatment groups in the training set to create $\mathcal{D}$. For the unpaired datasets (COMP and GKO), we used manual matching in the test set solely for evaluation purposes.
To ensure fair comparison, all samples were embedded using large pre-trained models~\cite{kraus2024masked, chen2024uni, dosovitskiy2020image, lopez2018deep} before training (dimensions detailed in Table~\ref{tab:dataset_specifications}).

We employed a consistent CLIP-like architecture~\cite{radford2021learning} across all methods, consisting of modality-specific vector encoders and projection heads tailored to each objective. Models were trained using their respective contrastive losses, and the resulting encoders were frozen to generate test-set embeddings. We evaluate performance on both treatment classification and downstream tasks using logistic regression on the concatenated embeddings. All experiments were repeated with eight random seeds for robustness. Further details are provided in Appendix~\ref{apx:exp}.



% \section{Experiments}

% In this section, we present the curated datasets used in our experiments, along with descriptions of our experimental setup and benchmarking tasks.

% \subsection{Datasets}

% Each of our datasets consists of two modalities, $X^1$ and $X^2$, with an equal number of samples, a set of distinct treatments $T$, and downstream task labels $Y$ that relate primarily to the original latent distribution $Z$ and are orthogonal to the treatment $T$. 

% \paragraph{Bio-Augmented IRFL Dataset (BIRFL): }  This biology-inspired dataset is derived from the Image Recognition of Figurative Language (IRFL) dataset~\cite{yosef2023irfl}, containing 786 paired images and captions labeled with figurative type labels ($Y$). To generate treatment groups, we applied seven different augmentations ($T$) to each image-text pair (see Appendix~\ref{apx:dataset}), resulting in a dataset of 5,502 unique pairs of images ($X^1$) and text captions ($X^2$). To better mimic biological noise, we introduced variability within each treatment group by randomly selecting some pairs to receive no-treatment, other treatments, or combinations of treatments.

% \paragraph{Spatial Transcriptomics for Pancreatic Cancer Tumor Biopsy (ST): }  This paired dataset, sourced from \citet{cui2022spatially}, includes 15 histopathology slides with corresponding site-level RNA sequence readouts from patient pancreatic cancer biopsies.  Each paired sample comprises a $256 \times 256$ pathology image patch ($X^1$) and RNA sequence data ($X^2$) from the same tissue region. Treatment labels correspond to four types of chemotherapy ($T$) administered before biopsy. The downstream task label ($Y$) is a pathologist's annotation indicating whether the tissue is cancerous, yielding a total of 57,363 image-sequence pairs.


% \paragraph{Compound Treated Single-cell Dataset (COMP): } In addition to biology-inspired and tissue biopsy datasets, we curated unpaired single-cell level datasets. Our first unpaired dataset includes experimental results from compound-treated human umbilical vein endothelial cells (HUVEC)~\cite{baudin2007protocol}. Samples consist of plated HUVEC cells treated with 78 FDA-approved bioactive small-molecule compounds ($T$), then either scanned to generate cell-painting images~\cite{bray2016cell} following methods similar to \citet{fay2023rxrx3}, or sequenced using bulk RNA sequencing techniques~\cite{ye2018drug} to obtain single-cell transcriptomics data. For each sample, we randomly paired a $32 \times 32$ single-cell image crop ($X^1$) with a single-cell RNA readout ($X^2$), ensuring both represent cells treated with the same compound. The downstream task label ($Y$) is the culture solution concentration level of the cells. We collected 36562 randomly paired image-sequence samples to create the COMP dataset.


% \paragraph{Gene Knockout Single-cell Dataset (GKO): } Similar to the COMP dataset,this dataset involves HUVEC cells subjected to 25 distinct CRISPR-mediated gene knockouts ~\cite{bock2022high} ($T$). We selected 25 gene knockouts  applied to HUVEC cells.  Cells were processed through two modalities: either cell-painting imaging~\cite{bray2016cell} or single-cell transcriptomics using perturb-seq \cite{dixit2016perturb}. For the imaging modality, $32 \times 32$ random crops ($X^1$) were extracted from cell-painting scans, while for the sequencing modality, RNA profiles ($X^2$) were obtained from treated cells. Each sample pair shares the same gene knockout treatment, linking the two modalities indirectly. For the downstream task label ($Y$), we used total gene counts for each cell, categorized into five bins resulting in 59,011 couple of samples.



% \subsection{Experimental Setup}
% For all datasets, we split the data into 80\% training and 20\% testing samples. For the two originally paired datasets (BIRFL and ST), we shuffled samples from one modality within each treatment group in the training set to create our unpaired dataset $\mathcal{D}$. For the two unpaired datasets (COMP and GKO), we manually matched pairs in the test set as accurately as possible based on shared metadata across modalities.
% To ensure fair comparison across methods, we first embed each sample in both modalities ($X^1$ and $X^2$) using large pretrained embedding models~\cite{kraus2024masked, chen2024uni, dosovitskiy2020image, lopez2018deep},   with the resulting embedding dimensions detailed in Table \ref{tab:dataset_specifications}.

% We use a consistent CLIP-like model architecture~\cite{radford2021learning} for all evaluated methods, utilizing two vector encoders (one per modality) and projection heads based on the individual requirements of each method. each method.
% In each experiment, models are trained on the training set using their respective contrastive learning objectives, and the trained encoders are then used to generate embeddings for test set samples. For each dataset, we report model performance on both treatment classification and downstream tasks using logistic regression on concatenated modality embeddings. Each experiment was repeated with eight random seeds to ensure robustness. Further experimental details can be found in the supplementary materials (see Appendix~\ref{apx:exp}).






% \begin{table*}[t]
%     \centering
%     \caption{Test Set Results on Four Curated Datasets. We report accuracy for Treatment and Task label predictions.}
%     \label{tab:results}
%     \small % Slightly smaller font to match MIDL style
%     \setlength{\tabcolsep}{4pt} % Tighten space between columns
%     \begin{tabular}{l c c c c c c c c}
%         \toprule
%          & \multicolumn{2}{c}{BIRFL} & \multicolumn{2}{c}{ST} & \multicolumn{2}{c}{COMP} & \multicolumn{2}{c}{GKO} \\
%         \cmidrule(lr){2-3} \cmidrule(lr){4-5} \cmidrule(lr){6-7} \cmidrule(lr){8-9}
%         Method & Treat. & Task & Treat. & Task & Treat. & Task & Treat. & Task \\
%         \midrule
%         InfoNCE-paired & 81.4 & 97.2 & 98.5 & 80.9 & - & - & - & - \\
%         \midrule
%         InfoNCE-unpaired & 81.4 & 64.7 & 97.0 & 68.4 & 95.6 & 72.9 & 22.6 & 36.9 \\
%         SupCon\cite{khosla2020supervised} & 83.0 & 65.3 & \textbf{99.4} & 68.9 & 96.7 & 73.4 & \textbf{34.1} & 33.7 \\
%         InfoCore\cite{wang2024removing} & 82.1 & 65.2 & 98.8 & 68.4 & 96.3 & 73.6 & 27.8 & 34.9 \\
%         1X-Matching & 81.5 & 66.7 & 95.9 & 68.6 & 95.7 & 73.0 & 23.6 & 36.6 \\
%         WCL\cite{zheng2021weakly} & 81.1 & 67.4 & 96.6 & 68.8 & 96.4 & 75.6 & 23.2 & 36.8 \\
%         XDC\cite{alwassel2020self} & 80.8 & 67.2 & 94.7 & 68.4 & 96.2 & 77.6 & 21.8 & 37.3 \\
%         \midrule
%         Ours (intra) & 83.4 & 69.7 & 97.8 & 70.4 & 97.4 & 78.5 & 26.0 & 42.8 \\
%         Ours (intra+inter) & \textbf{83.5} & \textbf{71.5} & 97.7 & \textbf{70.7} & \textbf{97.8} & \textbf{80.2} & 26.4 & \textbf{43.3} \\
%         \bottomrule
%     \end{tabular}
%     % \vspace{-3mm} % Pulls following text up
% \end{table*}

\begin{table*}[t]
    \centering
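    \caption{Test Set Results on Four Curated Datasets. We report accuracy for Treatment and Task label predictions. \textbf{Bold} indicates the best performance among methods trained without ground-truth pairings (i.e., excluding the InfoNCE-paired Oracle).}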
    \label{tab:results}
    \small 
    \setlength{\tabcolsep}{3.5pt} % Tighten space to fit all columns
    \begin{tabular}{l c c c c c c c c}
        \toprule
         & \multicolumn{2}{c}{BIRFL} & \multicolumn{2}{c}{ST} & \multicolumn{2}{c}{COMP} & \multicolumn{2}{c}{GKO} \\
        \cmidrule(lr){2-3} \cmidrule(lr){4-5} \cmidrule(lr){6-7} \cmidrule(lr){8-9}
        Method & Treat. & Task & Treat. & Task & Treat. & Task & Treat. & Task \\
        \midrule
        \textit{Reference Baselines} & & & & & & & & \\
        Raw Features ($x^1||x^2$) & 77.0 & 55.4 & 77.7 & 63.6 & 74.3 & 58.6 & 19.9 & 28.9 \\
        InfoNCE-paired (Oracle) & 81.4 & 97.2 & 98.5 & 80.9 & - & - & - & - \\
        FactorCL-SSL\cite{liang2024factorized} & 80.7 & 64.5 & 92.1 & 67.3 & 90.9 & 72.5 & 25.1 & 37.9 \\
        FactorCL-Sup\cite{liang2024factorized} & 81.0 & 66.5 & 95.7 & 68.9 & 91.6 & 73.4 & 30.7 & 35.2 \\
        InfoNCE-unpaired & 81.4 & 64.7 & 97.0 & 68.4 & 95.6 & 72.9 & 22.6 & 36.9 \\
        SupCon\cite{khosla2020supervised} & 83.0 & 65.3 & \textbf{99.4} & 68.9 & 96.7 & 73.4 & \textbf{34.1} & 33.7 \\
        InfoCore\cite{wang2024removing} & 82.1 & 65.2 & 98.8 & 68.4 & 96.3 & 73.6 & 27.8 & 34.9 \\
        WCL\cite{zheng2021weakly} & 81.1 & 67.4 & 96.6 & 68.8 & 96.4 & 75.6 & 23.2 & 36.8 \\
        XDC\cite{alwassel2020self} & 80.8 & 67.2 & 94.7 & 68.4 & 96.2 & 77.6 & 21.8 & 37.3 \\
        \midrule
        \textit{Matching Strategy Ablation} & & & & & & & & \\
        1X-Matching (SNN) & 74.2 & 65.2 & 93.9 & 68.2 & 91.1 & 74.2 & 19.7 & 37.5 \\
        1X-Matching (OT) & 81.5 & 66.7 & 95.9 & 68.6 & 95.7 & 73.0 & 23.6 & 36.6 \\
        Ours (intra-SNN) & 73.9 & 69.4 & 96.5 & 68.0 & 92.4 & 76.1 & 20.7 & 41.3 \\
        Ours (intra-OT) & 83.4 & 69.7 & 97.8 & 70.4 & 97.4 & 78.5 & 26.0 & 42.8 \\
        \midrule
        \textit{Clustering Ablation (IPIC)} & & & & & & & & \\
        Ours (intra+inter, $K=0.25|T|$) & - & - & - & - & 97.3 & 80.1 & 25.6 & 39.4 \\
        Ours (intra+inter, $K=0.5|T|$) & 82.3 & 66.4 & 96.7 & 70.0 & 97.4 & \textbf{80.3} & 25.4 & 43.0 \\
        Ours (intra+inter, $K=2|T|$) & \textbf{83.5} & 69.3 & 97.0 & 70.1 & 97.3 & 79.7 & 23.6 & 42.0 \\
        Ours (intra+inter, $K=4|T|$) & 82.8 & 69.2 & 96.9 & 70.0 & 97.2 & 79.0 & 23.7 & 42.2 \\
        \textbf{Ours (intra+inter, $K=|T|$)} & \textbf{83.5} & \textbf{71.5} & 97.7 & \textbf{70.7} & \textbf{97.8} & 80.2 & 26.4 & \textbf{43.3} \\
        \bottomrule
    \end{tabular}
    \vspace{-1.5mm} % Pulls following text up slightly
\end{table*}




\section{Results}
We present a comprehensive evaluation of our proposed IPIC method against baselines tailored for paired datasets, weakly-supervised contrastive frameworks, and matching-based alignment. The zero-shot performance of test set embeddings, evaluated on treatment and downstream task prediction, is summarized in Table~\ref{tab:results}.\footnote{See full results with standard deviation in Table~\ref{tab:results_full_std}.}

\paragraph{Paired Assumptions vs. Unpaired Reality: } 
We first examined the impact of exact pairing versus random pairing. The BIRFL and ST datasets, which possess true pairings, allow us to test performance degradation when precise pairings are removed. As shown by the InfoNCE-paired (Oracle) and InfoNCE-unpaired rows of Table~\ref{tab:results}, downstream task performance drops significantly when using randomly shuffled pairs within treatment groups. We also evaluated \textbf{FactorCL} (SSL and Sup variants~\cite{liang2024factorized}) as an additional baseline; its performance aligned closely with InfoNCE-unpaired, further confirming that methods assuming paired modalities struggle to recover representation quality when restricted to group-level alignment. This highlights that treating randomly paired samples as genuinely ``paired'' yields suboptimal embeddings.

\paragraph{Robust Performance over Baselines: } 
We compared IPIC with six baseline methods, including standard InfoNCE (Eq.~\ref{eq:nce}), treatment-guided methods like SupCon (Eq.~\ref{eq:supcon}) and InfoCore, and clustering-based methods WCL (Eq.~\ref{eq:wcl}) and XDC~\cite{alwassel2020self}. Additionally, we provide linear probing results on \textbf{raw input vectors} ($x^1 || x^2$) to establish a performance lower bound. As shown in Table~\ref{tab:results}, IPIC consistently outperforms all learned baselines on downstream task prediction and significantly surpasses the raw feature baseline (e.g., on BIRFL Task prediction), demonstrating that our method learns non-trivial, biologically meaningful structure rather than simply retaining input statistics. Notably, IPIC rivals fully supervised approaches like SupCon in treatment prediction while achieving superior downstream task performance.

\paragraph{Dynamic Alignment \& Matching Strategy: } 
A key contribution of IPIC is its dynamic pairing mechanism. We compared our iterative approach against a static baseline, ``1X-Matching'', where Algorithm~\ref{alg:matching_repair} is applied only once before training. 
IPIC achieves a \textbf{4.8\% average improvement} on downstream tasks compared to one-time matching~\cite{xi2024propensity}, confirming that iterative propensity score updates better capture underlying biological mechanisms where treatments induce similar population-level effects despite individual-cell heterogeneity. Furthermore, we ablated the matching algorithm itself, comparing our Optimal Transport (OT) approach inspired by \citet{xi2024propensity} against Shared Nearest Neighbor (SNN) matching. We found OT to be consistently superior (e.g., SNN degrades performance by $\sim$7--10\% on BIRFL), likely because OT enforces global distributional constraints that are more robust to batch effects than local neighbor-based methods.

\paragraph{Clustering Granularity ($K$): }
For the inter-treatment clustering objective, we investigated the sensitivity to the number of clusters $K$. We evaluated $K \in \{0.25|T|, 0.5|T|, |T|, 2|T|, 4|T|\}$ and found that no alternative consistently outperformed $K=|T|$. This is intuitive, as setting $K$ equal to the number of treatment groups ensures that clusters align with the experimental design, preserving biological relevance while allowing the model to bridge similar treatment effects.

\paragraph{Single-Modality vs. Multi-Modality: } 
To assess the benefits of data integration, we compared concatenated multi-modality embeddings against single-modality embeddings obtained from learned encoders $\phi^1$ and $\phi^2$ on downstream task $Y$. As shown in Table~\ref{tab:comparison}, multi-modality training proves consistently beneficial; with the exception of the ST image modality, concatenated embeddings outperform single-modality counterparts across all datasets. Specifically, relying on a single modality results in accuracy drops ranging from \textbf{4\% to 26\%}. Furthermore, a comparison with baselines in Table~\ref{tab:results} highlights that IPIC consistently improves the quality of \textit{each} single-modality embedding. This demonstrates that our unpaired contrastive framework effectively leverages cross-modal synergy to enhance representation learning, benefiting both joint and individual modality inference.

\paragraph{Impact of Treatment Group Labels: } 
While InfoNCE achieves similar treatment prediction accuracy for paired and unpaired datasets (as unpaired samples are simply shuffled within groups), incorporating treatment labels significantly improves representation quality. Methods explicitly leveraging treatment labels (SupCon, InfoCore, Matching, IPIC) consistently outperform those that do not (InfoNCE, WCL, XDC) on treatment prediction. This supports our hypothesis that shared treatment information acts as a strong training signal, which is particularly valuable in biological contexts like drug response prediction where capturing subtle treatment effects is crucial~\cite{fradkin2024molecules,iorio2016landscape}.

\paragraph{Orthogonality of Treatment and Downstream Tasks: } 
We consistently observed that baselines excelling in treatment label prediction often underperformed on downstream tasks. This suggests that our downstream task labels capture latent information intrinsic to the original sample distribution $Z$, independent of treatment labels $T$. Consequently, strong performance on downstream tasks serves as a robust indicator of representation quality, reflecting the model's ability to capture biologically meaningful variations beyond simple treatment effects. IPIC excels here by leveraging treatment labels for alignment while preserving the rich latent features required for downstream tasks.

\paragraph{Ablation Study on IPIC Objectives: } 
Finally, we compared two IPIC variants: one using only the intra-treatment objective ($\mathcal{L}_{intra}$) and another combining both intra- and inter-treatment objectives ($\mathcal{L}_{intra} + \mathcal{L}_{inter}$). Leveraging just the intra-treatment objective yields significant gains over baselines by enabling weighted positive pairings. Adding inter-treatment clustering further boosts performance by aligning samples not only with cross-modal pairs but also with similar representations across different treatment groups. This dual strategy enables the learning of richer, more robust representations.


% \section{Results}
% We present a comprehensive evaluation of our proposed IPIC method, comparing its performance against various baselines tailored for multimodal learning. Specifically, we assess methods designed for paired datasets, label-guided approaches, weakly-supervised contrastive frameworks, and techniques that utilize matching for alignment.  The zero-shot performance of test set embeddings, evaluated on treatment and downstream tasks prediction, is summarized in Table \ref{tab:results}. 

% \paragraph{Paired vs. Unpaired: } 
% We first examined the impact of exact pairing versus random pairing within treatment groups on representation capacity. The BIRFL and ST datasets, which have true pairings by design, allow us to test how the removal of  precise pairings affects performance. As shown in the first two rows of Table \ref{tab:results}, downstream task performance drops significantly when using randomly shuffled pairs within treatment groups. This highlights that prior multi-modal representation learning methods that treat randomly paired samples within treatment groups as if they were genuinely ``paired", may learn less accurate and lower-quality embeddings.


% \paragraph{Robust Performance of IPIC over Baselines: } 
% To demonstrate IPIC's effectiveness, we compared it with six baseline methods. These include the standard InfoNCE objective (Eq.\ref{eq:nce}), methods that leverage treatment labels such as SupCon (Eq.\ref{eq:supcon}) and InfoCore, as well as two methods, WCL (Eq.~\ref{eq:wcl}) and XDC~\cite{alwassel2020self}, which leverage  pseudo-labels in the representation space through clustering. For the baseline matching method, which we refer to as ``1X-Matching", we applied one-time re-pairing within each treatment group using Algorithm \ref{alg:matching_repair} before training with InfoNCE. 
% As shown in Table \ref{tab:results}, our method, IPIC, \textbf{consistently outperforms} all baselines across all four datasets. Notably, it achieves superior downstream task performance while also maintaining strong treatment label prediction, rivaling SupCon and InfoCore, which directly use treatment labels for supervision.

% \paragraph{Single-Modal vs. Multi-Modal Learning: } 
% We also examined the impact of multimodal learning versus single-modality embeddings for downstream task prediction. Across all datasets, single-modality embeddings consistently underperformed, with accuracy dropping by $4\%$ to $26\%$. This underscores the benefits of associating multiple modalities, showing that multimodal learning not only improves downstream task performance but also enhances the quality of representations in each modality.

% \paragraph{Impact of Treatment Group Labels: }  Our experiments reveal that while InfoNCE achieves similar treatment prediction accuracy for paired and unpaired datasets (since unpaired samples are simply shuffled within each treatment group), incorporating treatment labels could significantly improves representation quality. Specifically, methods that leverage treatment labels (SupCon, InfoCore, Matching) consistently outperforms methods that do not (InfoNCE, WCL, and XDC) on treatment label prediction.  This supports our hypothesis that using shared treatment information across modalities acts as a strong signal during training, leading to higher-quality embeddings and better performance on treatment-related tasks. This is particularly valuable in biological contexts, such as predicting cellular responses to drugs or optimizing virtual compound screening, where capturing subtle, biologically meaningful treatment effects is crucial~\cite{fradkin2024molecules,iorio2016landscape}.

% \paragraph{Orthogonality of Treatment and Downstream Tasks: } 
% We consistently observed that baselines excelling in treatment label prediction often underperformed on downstream tasks. This suggests that our downstream task labels capture latent information intrinsic to the original sample distribution $Z$, which is independent of treatment labels $T$. Consequently, strong performance on downstream tasks serves as a more robust indicator of representation quality, as it reflects the model's ability to capture biologically meaningful variations beyond treatment-specific effects. This further supports the advantage of our IPIC method over baseline approaches, as IPIC is specifically designed to leverage treatment labels while preserving the ability to capture richer latent features relevant to downstream tasks. 

% \paragraph{Ablation Study on IPIC Objectives: } 
% The last two rows of Table \ref{tab:results} compare two IPIC variants: one using only the intra-treatment group objective and another combining both intra- and inter-treatment objectives. Leveraging just the intra-treatment objective already results in significant gains over baselines, as it enables positive pairings within modalities weighted by a similarity (matching) score, a design absent in the baseline methods. Adding the inter-treatment clustering further boosts performance by enabling each sample to align not only with its matched cross-modal pair but also with similar representations across different treatment groups. Overall, this dual strategy enables the model to learn richer, more robust representations, ultimately leading to better downstream task performance.


 %—the higher the downstream task performance on the test set, the better the embedding quality.

% \newpage


% \subsection{Cell images, RxRx3~\cite{fay2023rxrx3}}
% 2.2 million (2200k) images of 2046x2046x6 , with each image captures around $\sim$1000 cells
% \par
% (400k) are controlled (no treatment)
% \par
% rest or ($\sim$1600k) are treated, with 18k gene knockout experiments

% \subsection{PerturbSeq (CRISPR Gene Knockout)}
% Private
% \par
% $\#$ cells of RxRx3 (18k) gene knock out of sc-RNASeq
% \par
% Public Dataset ~\cite{moshkov2023predicting}
% \par
% Some other treatment (with same cell-line as RxRx3) or other cell-lines (with same treatment as RxRx3)


% \subsection{TrekSeq (Compound)}
% TBA
% (TrekSeq on BrightField)