% \newpage





% %\CE{Should we first have a ``Background: Unpaired contrastive learning'' section, and then a ``IntraPair InterCluster (IPIC)'' methods section with the final algorithm?} 
% In this section, we define the problem of unpaired contrastive learning (UCL) in biological contexts and review the limitations of existing paired contrastive learning methods. Next, we introduce our proposed UCL method, \textit{IntraPair InterCluster} (IPIC), which leverages intra-treatment group information via matching and inter-treatment group information via clustering. Finally, we detail the IPIC algorithm for batch learning, and its implementation specifics.





% % dataset and problem and goal and mesaure of success
% We start by formally defining the problem of unpaired contrastive learning for biological datasets. Given two treatment-labeled datasets $\mathcal{D}_1 = \{(x^1_i, t_i)\}_{i=1}^{N_1}$ and $\mathcal{D}_2 = \{(x^2_j, t_j)\}_{j=1}^{N_2}$, where $x^1_i \in X^1$ (e.g., images) and $x^2_j \in X^2$ (e.g., sequences or texts) are samples from different modalities, each sample is associated with a treatment label $t \in T$ drawn from the set of possible treatments $T$ (see Figure \ref{fig:dataset}). 
% $\mathcal{D}_1$ and $\mathcal{D}_2$ share the same set of treatment labels, allowing us to construct a combined dataset $\mathcal{D} = \{(x^1_i, x^2_j, t)\}_{i, j=1}^{N}$. However, samples $x^1_i$ and $x^2_j$ are not directly paired (i.e., they do not correspond to the same cell or natural scene), but are instead indirectly linked by their shared treatment label $t$. The objective of unpaired contrastive learning with treatment group labels is to jointly learn two encoders, $\mathcal{\phi}^1$ and $\mathcal{\phi}^2$, for each modality that produce improved representations. Here, improved representations are defined by their zero-shot performance in distinguishing treatment labels and their effectiveness in downstream tasks.

% %\CE{this notation says we are given triplets $(x^1_i, x^2_j, t)$ \textit{at the sample level} with the same number of samples $N$ for each treatment. Better to say we're given two treatment-labeled datasets $\mathcal{D}_1 = \{(x^1_i, t_i)\}_{i=1}^{N_1}$ and $\mathcal{D}_2 = \{(x^2_j, t_j)\}_{j=1}^{N_2}$ with shared treatment labels $t \in T$?}


% % infonce for paired dataset
% Most prior contrastive methods assume paired datasets and use variations of the noise contrastive estimation (NCE) objective to differentiate instance pairs. The NCE objective treats two modalities of the same instance as positive pairs, while treating different instances as negative pairs, adjusting the latent space by bringing positive pairs closer together and pushing apart negative pairs. 
% Following the SimCLR and CLIP frameworks, given a batch of samples $\{(x^1_i, x^2_i)\}_{i=1}^B$, we embed them using encoders $\mathcal{\phi}^1, \mathcal{\phi}^2$ and non-linear projection heads $f^1, f^2$ to produce hidden vectors $\{(\mathbf{v}^1, \mathbf{v}^2)_i\}_{i=1}^B$ where $\{\mathbf{v}^\smblksquare \} = f^\smblksquare (\mathcal{\phi}^\smblksquare  (\{x^\smblksquare \}))$. For a paired batch indexed by $i$, the NCE objective for a sample in modality 1 is defined as:
% \begin{equation}
%     \mathcal{L}^{i(1)}_{NCE} = - \log \frac{\exp (\sm(\mathbf{v}^1_i, \mathbf{v}^2_i)/\tau)}{\sum_{l=1}^B \mathbbm{1}_{[l \neq i]} \exp (\sm(\mathbf{v}^1_i, \mathbf{v}^2_l)/\tau)}
% \end{equation}

% The complete objective becomes:
% \begin{equation} \label{eq:nce}
%     \mathcal{L}_{NCE} = \frac{1}{2B} \sum_{i,j}^B (\mathcal{L}^{i(1)}_{NCE} + \mathcal{L}^{i(2)}_{NCE}) 
%     \footnote{For simplicity, we will refer only to the $i$th example of modality 1 in the subsequent loss definitions, with the objective for the entire batch calculated as the average over all batch samples from both modality 1 and modality 2.}
% \end{equation}
% This objective aims to maximize the similarity $\sm(\cdot, \cdot)$ between positive pairs while minimizing it for negative ones, with $\tau >0$ serving as a temperature parameter to control the sharpness of the distribution.

% % EN, I think this was too long and can be explained by text alone. 
% % In the unpaired setting, modality 1 is indexed by $i$ and modality 2 by $j$, yielding hidden vectors $\{(\mathbf{v}^1_i, \mathbf{v}^2_j)\}_{i, j=1}^B$. Directly applying the NCE objective to the $i$th sample in modality 1 and the $j$th sample in modality 2 within this unpaired batch setup would become:
% % \begin{align} 
% %     \mathcal{L}^{i(1)}_{NCE} = - \log \frac{\mathbbm{1}_{[i = j]} \exp (\sm(\mathbf{v}^1_i, \mathbf{v}^2_j)/\tau)}{\sum_{l=1}^B \mathbbm{1}_{[l \neq j]} \exp (\sm(\mathbf{v}^1_i, \mathbf{v}^2_l)/\tau)}\\
% %     \mathcal{L}^{j(2)}_{NCE} = - \log \frac{\mathbbm{1}_{[j = i]} \exp (\sm(\mathbf{v}^2_j, \mathbf{v}^1_i)/\tau)}{\sum_{l=1}^B \mathbbm{1}_{[l \neq i]} \exp (\sm(\mathbf{v}^2_j, \mathbf{v}^1_l)/\tau)}
% % \end{align}

% In the unpaired setting, a batch $\{(x^1_i, x^2_j)\}_{i,j=1}^B$ consists of samples from different modalities linked only by their shared treatment label ($t_i = t_j$), resulting in hidden vectors $\{(\mathbf{v}^1_i, \mathbf{v}^2_j)\}_{i, j=1}^B$.  Directly applying the NCE objective here would simply encourage randomly paired samples within the same treatment group to move closer  while pushing apart all other pairs, potentially introducing noise due to weak or nonexistent correlations.

% % supcon for paired dataset, infocore for combining the two
% Since the treatment label in this context can provide weak supervision, it can thus be leveraged by supervised NCE objective variants such as SupCon and InfoCore~\cite{}. Rather than focusing on pairing specific indices, the SupCon objective brings all pairs within the same treatment group closer together:
% \begin{equation} \label{eq:supcon}
%     \mathcal{L}^{i(1)}_{SupCon} = - \sum_{j}^B \log  \frac{\mathbbm{1}_{[t_i = t_j]} \cdot \exp (\sm(\mathbf{v}^1_i, \mathbf{v}^2_j)/\tau)}{\sum_{l=1}^B \mathbbm{1}_{[l \neq j]} \exp (\sm(\mathbf{v}^1_i, \mathbf{v}^2_l)/\tau)}
% \end{equation}
% The InfoCore objective is a combination of the NCE and SupCon objective $\mathcal{L}_{InfoCore} = \frac{1}{2}\mathcal{L}_{SupCon} + \frac{1}{2}\mathcal{L}_{NCE}$.
% % wcl and xdc

% Rather than relying on predefined labels for each sample, methods like XDC and WCL generate weak labels ${c^1}$ and ${c^2}$ from the representation spaces ${\mathbf{v}^1}$ and ${\mathbf{v}^2}$. These weak labels are then utilized as supervisory signals across modalities. The WCL objective is
% \begin{equation} \label{eq:wcl}
%     \mathcal{L}^{i(1)}_{WCL} = - \sum_{j}^B \log  \frac{\mathbbm{1}_{[c^2_i = c^2_j]} \cdot \exp (\sm(\mathbf{v}^1_i, \mathbf{v}^2_j)/\tau)}{\sum_{l=1}^B \mathbbm{1}_{[l \neq j]} \exp (\sm(\mathbf{v}^1_i, \mathbf{v}^2_l)/\tau)}
% \end{equation}


% Despite the variations in the above methods and their extended definitions of positive pairs, none of the prior methods can be directly applied to unpaired contrastive learning. The paired assumption in these methods always brings $\mathbf{v}^1_i$ and $\mathbf{v}^2_j$ closer in latent space when $i=j$. Even in methods like SupCon or InfoCore, which aim to bring samples within the same treatment group closer together, there is no mechanism to distinguish between individual samples within the same treatment group.

\section{Methods}
% \section{IntraPair InterCluster~(IPIC)}
\label{sec:method}

We now introduce our proposed method, \textit{IntraPair InterCluster}~(IPIC), which leverages both intra-treatment group information via matching (or pairing) \emph{and} inter-treatment group information via clustering.
% Finally, we detail the IPIC algorithm for batch learning, and its implementation specifics.

\subsection{Intra-treatment Group Learning via Matching}


\begin{algorithm2e}[t] % Remove [t] if it causes an error in your specific template
\caption{Matching and Re-pairing}
\label{alg:matching_repair}
\small % Reduces font size
\DontPrintSemicolon

\KwIn{Unpaired dataset $\mathcal{D} = \{(x^1_i, x^2_j, t)\}_{i,j=1}^N$}
\KwOut{Matching matrices $\{M_t\}$, pseudo-paired $\mathcal{D}' = \{(x^1_i, x^2_k, t)\}$}

Train $\psi^1, \psi^2$ on $\{(x^1_i, t_i)\}, \{(x^2_j, t_j)\}$; compute propensity scores $\{\pi^1_i\}, \{\pi^2_j\}$\;
\For{each treatment $t$ in $T$}{
  Filter samples by $t$; compute cost $C_{ij} = \|\pi^1_i - \pi^2_j\|_2$ and set uniform $p_1, p_2$\;
  Solve EOT via Sinkhorn to get $M_t$: $\min_M \sum_{i,j} C_{ij} M_{ij} - \lambda H(M)$ s.t.\ $M\mathbbm{1}{=}p_1, M^\top\mathbbm{1}{=}p_2$\;
  Reorder modality 2 samples using $M_t$ to form pairs $\{(x^1_i, x^2_k)\}$\;
}
\end{algorithm2e}

% \begin{algorithm}[t]
% \small % Reduces font size to save vertical space
% \SetAlgoLined
% \DontPrintSemicolon % Removes the semicolons at end of lines for a cleaner look

% \KwIn{Unpaired dataset $\mathcal{D} = \{(x^1_i, x^2_j, t)\}_{i,j=1}^N$}
% \KwOut{Matching matrices $\{M_t\}$ and pseudo-paired dataset $\mathcal{D}' = \{(x^1_i, x^2_k, t)\}$}

% Train classifiers $\psi^1, \psi^2$ on $\{(x^1_i, t_i)\}$ and $\{(x^2_j, t_j)\}$\;
% Compute propensity scores $\{\pi^1_i\}$ and $\{\pi^2_j\}$ using $\psi^1, \psi^2$\;

% \For{each treatment $t$ in $T$}{
%     Select samples with treatment $t$ and compute cost $C_{ij} = \|\pi^1_i - \pi^2_j\|_2$\;
%     Initialize $M$ with uniform distributions $p_1, p_2$ (s.t. $M \mathbbm{1} = p_1, M^T \mathbbm{1} = p_2$)\;
    
%     Solve Entropic Optimal Transport (EOT) via Sinkhorn to get $M_t$:
%     $ \min_M \sum_{i,j} C_{ij} M_{ij} - \lambda H(M) \quad \text{s.t.} \quad M_{ij}\geq 0 $\;
%     \tcp{where $H(M)$ is the entropy}
    
%     Reorder modality 2 samples using $M_t$ to form pairs $\{(x^1_i, x^2_k)\}$\;
% }
% \caption{Matching and Re-pairing}
% \label{alg:matching_repair}
% \end{algorithm}

% \begin{algorithm}[hbt]
% \SetAlgoLined
% \SetKwInOut{Input}{Input}
% \SetKwInOut{Output}{Output}
% \Input{Unpaired dataset $\mathcal{D} = \{(x^1_i, x^2_j, t)\}_{i,j=1}^N$}
% Train classifier $\psi^1$ on $\{(x^1_i, t_i)\}$\; 
% Train classifier $\psi^2$ on $\{(x^2_j, t_j)\}$\;
% % Use $ \psi^1$ and $ \psi^2$ to  for each sample\;
% Get propensity scores $\{\pi^1_i\}$ and $\{\pi^2_j\}$\;% with $ \psi^1$ and $ \psi^2$\;

% \For{each treatment $t$ in $T$}{
%     Find all pairs with the same treatment value $t$\\
%     Compute the cost matrix $C_{ij} = \|\pi^1_i - \pi^2_j\|_2$\\
%     %Define $p_1$ as the uniform distributions of $\{\pi^1_i\}$\\
%     %Define $p_2$ as uniform distributions of $\{\pi^2_j\}$\\
%     Define uniform distributions $p_1$ and $p_2$ over $\{\pi^1_i\}$ and $\{\pi^2_j\}$\;
%     Initialize matrix $M_{ij}$ such that $M \mathbbm{1} = p_1$ and $M^T \mathbbm{1} = p_2$\;
%     %Initialize $M_{ij}$ where $M \mathbbm{1} = p_1$ and $M^T \mathbbm{1} = p_2$\\ % with uniform distributions $p_1$ and $p_2$ based on $\pi^1_i$ and $\pi^2_j$\;
    
%     Solve the entropic optimal transport (EOT) problem:\vspace{-3mm}
%     $$\min_M \sum_{i=1}^n \sum_{j=1}^n C_{ij} M_{ij} - \lambda H(M), M_{i,j}\geq 0$$
%     where $H(M) = -\sum_{i,j} M_{ij} \log(M_{ij})$, and $n$ is the number of samples in treatment group $t$\\
    
%     Use Sinkhorn's algorithm to obtain the optimal matching matrix $M_t$\;
    
%     Reorder modality 2 samples with $M_t$ by\\
%     getting $x^2_k$ for each $x^1_i$ with treatment label $t$\\
%     %Use $M_t$ to re-order samples in modality 2 to match with modality 1, forming $\{x^2_k\}$ for each $x^1_i$\;
% }
% \Output{Matching matrices $\{M_t\}$ and pseudo-paired dataset $\mathcal{D}' = \{(x^1_i, x^2_k, t)\}$}
% \caption{Matching and Re-pairing}
% \label{alg:matching_repair}
% \end{algorithm}

To address the challenge of unpaired contrastive learning more effectively, we first explore using matching methods by leveraging intra-treatment group information to align samples across modalities. This involves re-pairing samples across the two modalities by computing matching scores between each sample in one modality and all samples in the other modality that share the same treatment label. We refer to this step as \textit{intra-treatment group learning}.

To leverage the treatment group label and allow our encoders to distinguish samples within each group, we start by re-pairing the two modalities within the same treatment group. This draws inspiration  from causal inference methods which match unpaired modalities with shared latent features~\cite{xi2024propensity, ryu2024cross}. 
As illustrated in Figure~\ref{fig:dataset}, both modalities are assumed to capture measurements from a shared latent space $Z$ (e.g., natural scenes or identical cell lines). Thus, each modality, $X^1$ or $X^2$, can be viewed as a \textit{partial} observation of $Z$. Ideally, if we could observe the corresponding latent variables $z_i$ and $z_j$, re-pairing would be straightforward since we could directly match the samples $x^1_i$ and $x^2_j$ based on their proximity in this shared space. However, since $Z$ is unobservable and difficult to estimate without introducing unverifiable assumptions, we must rely on alternative approaches to match $x^1_i$'s and $x^2_j$'s effectively. To approximate this latent alignment, we use propensity scores as surrogates for the latent features, enabling us to compute similarity scores between samples across modalities. 
%$Z$ might also be high-dimensional which increases the inefficiencies of matching, we have to 
% follow from Xi et al, formulating the problem under causal inference, we assume t perturbs observations via shared latent state which allows us to use estimated propensity scores of p(t|Z) as the proxy of latent Z.
% We further assume that t|X^1 and t|X^2 are identical and t|X^1 =d t|X^2 =d t|Z. 
% These further assumptions makes PS to provide a common space for matching, makes PS fully identifiable via classification on individual modalities and allows us to estimate these propensity scores from observational data (i.e $X^1, X^2$) alone.
% Then given ${(x^1_i, x^2_j, t)}$, we train a classifier (propensity score predictor)  \psi^1_t and  \psi^2 with {x^1_i, t_i} and {x^2_j, t_j} for each modality. Then find the propensity scores as {\pi^1_i} and {\pi^2_j} where \pi \in \mathbb{R}^{|T|}.  
% then for each unique treatment t \in T, we perform entropic optimial transport matching to get a matching matrix M_t and a new paring for each treatment group represent by a new indexing k of modality 2. 
% The detailed process of re-pairing and obtaining mathcing scores is described in Algorithm 1.

Following \citet{xi2024propensity}, we frame the matching problem using principles from causal inference~\cite{rubin1974estimating}. Specifically, we assume that each treatment $t$ induces perturbations through a shared latent space $Z$, enabling us to use estimated propensity scores, $p(t|Z)$, as a proxy for the unobserved latent space $Z$. Additionally, we assume that the conditional distributions $t|X^1$ and $t|X^2$ are identical, such that $t|X^1 \overset{d}{=} t|X^2 \overset{d}{=} t|Z$. Under these assumptions, propensity scores can serve as a common space for aligning modalities, making them fully identifiable through classifiers trained independently on each modality. This enables us to estimate propensity scores solely from observational data.%(i.e., $X^1$ and $X^2$)
\footnote{The complete theoretical setup and assumptions of the matching problem are provided in the supplement.}

With the above setup, given $\mathcal{D} = \{(x^1_i, x^2_j, t)\}_{i,j=1}^N$, our goal is to compute a matching matrix $M_t$ for each $t \in T$ and use it to re-pair samples across the two modalities within each treatment group. We begin by training classifiers (propensity score predictors) $\psi^1$ and $\psi^2$ for each modality using $\{(x^1_i, t_i)\}$ and $\{(x^2_j, t_j)\}$. This yields propensity scores $\{\pi^1_i\}$ and $\{\pi^2_j\}$, where $\pi \in \mathbb{R}^{|T|}$. For each unique treatment group $t$, we then apply entropic optimal transport to obtain a matching matrix $M_t$, which enables a new pairing within each treatment group. The result is a pseudo-paired dataset $\mathcal{D}' = \{(x^1_i, x^2_k, t)\}_{i,k=1}^N$, where modality 2 is now indexed by $k$. The detailed process for re-pairing and obtaining matching matrices is outlined in Algorithm~\ref{alg:matching_repair}.


After obtaining the pseudo-paired dataset and matching matrix for each treatment group $t$, we define our \textit{intra-treatment group} learning objective as follows:
\begin{equation}
    \mathcal{L}^{i(1)}_{intra} = - \sum_{k=1}^B \log  \frac{\mathbbm{1}_{[t_i = t_k]} \cdot m^1_{i,k} \cdot \exp (\sm(\mathbf{v}^1_i, \mathbf{v}^2_k)/\tau)}{\sum_{l=1}^B \mathbbm{1}_{[l \neq k]} \exp (\sm(\mathbf{v}^1_i, \mathbf{v}^2_l)/\tau)}, 
\end{equation}
where $m^1_{i,k}$ denotes the adjusted matching score between sample $i$ and each of its cross-modal pairs within the sampled batch, normalized so that $\sum_{k=1}^B m^1_{i,k} = 1$. The matching score is defined as
\begin{equation}
    \label{eq:matchingscore}
    m^1_{i, k} =  
    \begin{cases}
        (M_{t_i})_{(a,b)},&\text{if $t_i = t_k$, and $(a, b)$ are indices of $i, k$ in group $t_i$} \\
        0, & \text{otherwise} 
    \end{cases}
\end{equation}
This ensures that $m^1_{i,k}$ is non-zero only when both samples $i$ and $k$ belong to the same treatment group $t$ and are aligned according to the matching matrix $M_t$. The full \textit{intra-treatment group} objective becomes: $\mathcal{L}_{intra} = \frac{1}{2B} \sum_{i,k}^B (\mathcal{L}^{i(1)}_{intra} + \mathcal{L}^{k(2)}_{intra})$ \refstepcounter{equation}(\theequation)\label{eq:lintra},
which enables the encoders to learn treatment-aware information while distinguishing sample pairs across modalities.

% \begin{equation}
%     \label{eq:lintra}
%     \mathcal{L}_{intra} = \frac{1}{2B} \sum_{i,k}^B (\mathcal{L}^{i(1)}_{intra} + \mathcal{L}^{k(2)}_{intra}),
% \end{equation} 

% After obtaining this re-paired datasets and matching matrix for each treatment group $t$, we propose our intra-treatment group learning objective: 
% \begin{equation}
%     \mathcal{L}^{i(1)}_{intra} = - \sum_{j}^B \log  \frac{\mathbbm{1}_{[t_i = t_j]} \cdot m_{i,j} \cdot \exp (\sm(\mathbf{v}^1_i, \mathbf{v}^2_k)/\tau)}{\sum_{m=1}^B \mathbbm{1}_{[l \neq j]} \exp (\sm(\mathbf{v}^1_i, \mathbf{v}^2_m)/\tau)}, 
% \end{equation}
% where $\sum_{j}^B m_{i,j} = 1$ are adjusted matching scores between each element $i$ and all of its cross-modal pairs in the sampled batch.
% $\mathcal{L}^{i(1)}_{intra}$ allows the encoders to learn treatment-aware information while distinguish sample pairs across modalities.


\subsection{Inter-treatment Group Learning via Clustering}
\begin{algorithm2e}[t]
\caption{Cluster Pseudo-labeling}
\label{alg:cluster}
\small 
\DontPrintSemicolon
\KwIn{Batch $\mathcal{B} = \{(\mathbf{u}^1_i, \mathbf{u}^2_k)\}$, number of treatments $|T|$}
\KwOut{Pseudo-labels $\{c^1_i\}$ and $\{c^2_k\}$}

Run \emph{KMeans} ($K=|T|$) on $\{\mathbf{u}^1_i\}$ and $\{\mathbf{u}^2_k\}$ separately to obtain labels $\{c^1_i\}$ and $\{c^2_k\}$\;
\end{algorithm2e}

% \begin{algorithm}[t]
% \SetAlgoLined
% \SetKwInOut{Input}{Input}
% \Input{Batch of pseudo-paired embeddings $\mathcal{B} = \{(\mathbf{u}^1_i, \mathbf{u}^2_k)\}$ and number of treatments $|T|$}

% Run \emph{KMeans} clusters for $\{\mathbf{u}^1_i\}_{i=1}^B$ with $K=|T|$
% Obtain clustered labels $\{c^1_i\}_{i=1}^B$ \\
% Run \emph{KMeans} clusters for $\{\mathbf{u}^2_k\}_{k=1}^B$ with $K=|T|$
% Obtain clustered labels $\{c^2_k\}_{k=1}^B$ \\

% \SetKwInOut{Output}{Output}
% \Output{Pseudo-labels $\{c^1_i\}$ and $\{c^2_k\}$ for the batch.}
% \caption{Cluster Pseudo-labeling}
% \label{alg:cluster}
% \end{algorithm}

% \jh{this was a little hard to parse; it's not clear what's special about modality 2? Is the important part is that the $z_j$ and $z_k$ are similar despite different treatments (i.e. cell states are the same, and hence inter-treatment samples in both modalities will look similar)?}
%Specifically, if $\mathbf{v}^1_i$ is the counterpart of $\mathbf{v}^2_j$, it should also be considered a positive pair with $\mathbf{v}^2_k$. This ensures that the model captures underlying latent similarities across treatments, allowing for more flexible and biologically meaningful cross-modal alignments.


In addition to \textit{intra-treatment group} learning, a robust approach for unpaired contrastive learning should account for cases where different treatment groups yield similar effects, or no effect at all, on certain samples. In biological datasets, it is common for some treatments to have minimal or overlapping effects, which can lead to similar representations across different treatment groups~\cite{fradkin2024molecules}. 
If two samples in modality 2, $x^2_j$ and $x^2_k$, exhibit similar traits in the original population (i.e., their latent variables $z_j$ and $z_k$ are similar) and receive treatments $t_1$ and $t_2$ with no substantial effect, then their representations $\mathbf{v}^2_j$ and $\mathbf{v}^2_k$ should remain similar. In such cases, $\mathbf{v}^2_j$ and $\mathbf{v}^2_k$ should both be considered as positive pairs with their cross-modality counterparts (i.e., $\mathbf{v}^1_i$).
Motivated by this observation, we incorporate clustering in the representation space to enable \textit{inter-treatment group} learning.

To capture these inter-treatment similarities, we introduce a clustering mechanism that assigns pseudo-labels to the embeddings of each modality. Inspired by methods like XDC~\cite{alwassel2020self} and WCL~\cite{zheng2021weakly}, we add a projection head $g$ to our network specifically designed for clustering to produce embeddings $\{\mathbf{u}^\smblksquare \} = g^\smblksquare (\mathcal{\phi}^\smblksquare  (\{x^\smblksquare \}))$. The goal is to leverage cluster assignments from one modality to guide the learning process for the other modality, thereby enhancing cross-modal alignment that generalizes across treatments. This clustering process is outlined in Algorithm~\ref{alg:cluster}.

% We follow similar fashion like XDC~\cite{alwassel2020self} and WCL~\cite{zheng2021weakly} by introducing new projection head into our network specifically for generate representations for clustering, then use the cluster labels from one modality to guide the learning of the representation of the other modality. Algorithm \ref{alg:cluster} depict how we perform representation space clustering and generate pseudo-label for each modalities.
With these generated pseudo-labels, we define our \textit{inter-treatment group} objective as:
%With generated pseudo-labels, we define our inter-treatment group objective as:
\begin{equation}
    \mathcal{L}^{i(1)}_{inter} = - \sum_{o=1}^B \log  \frac{\mathbbm{1}_{[i = k]} \cdot \mathbbm{1}_{[c^2_k = c^2_o]} \cdot \exp (\sm(\mathbf{u}^1_i, \mathbf{u}^2_o)/\tau)}{\sum_{l=1}^B \mathbbm{1}_{[l \neq k]} \exp (\sm(\mathbf{u}^1_i, \mathbf{u}^2_l)/\tau)}.
\end{equation}
The complete inter-treatment group objective is defined as:
$\mathcal{L}_{inter} = \frac{1}{2B} \sum_{i,k}^B (\mathcal{L}^{i(1)}_{inter} + \mathcal{L}^{k(2)}_{inter})$ \refstepcounter{equation}(\theequation)\label{eq:linter}.
% \begin{equation}
%     \label{eq:linter}
%     \mathcal{L}_{inter} = \frac{1}{2B} \sum_{i,k}^B (\mathcal{L}^{i(1)}_{inter} + \mathcal{L}^{k(2)}_{inter}),
% \end{equation} 
It enables the model to identify and bring together similar samples across different treatment groups, thereby capturing nuanced biological patterns and enhancing the robustness of our unpaired contrastive learning framework.

% In this case, they should be considered positive pairs with their cross-modality counterparts, allowing us to bring them closer in the latent space.
% Motivated by this observation, we ensure that inter-treatment group information is integrated by clustering samples in the representation space. 
%$\mathcal{L}_{inter}$ enables the model to identify and pull together similar samples across different treatment groups, capturing nuanced biological patterns and improving the robustness of our unpaired contrastive learning framework.
% In addition to the intra-treatment group learning, in biological datasets, some treatment $t$ might have no treatment effect or generate similar treatment effects for some samples. This phenomenon will cause samples in different treatment group to still have similar representations. A robust learning approach for unpaired contrastive learning should also allow pushing similar samples in different treatment groups together. To this point, we apply clustering in the representation space for inter-treatment group learning.
% \par
% for no treatment effects, if two samples $x_i$ and $x_j$ have similar traits in the original populations i.e. $z_i$ and $z_j$ are similar, the if the treat $t_1$ and $t_2$ both had no effect to $x_i$ and $x_j$ their representation ought to look similar and should be considered as positive pair to its cross modality counter parts and being pull closer in the latent space. 
% \par 
% With this motivation, it is important we don't omit the inter-treatment group information 
% \par 
% we do this by following WCL or XDC fashion to generate clusters in representation space for each modality and use these clusters as pseudo-labels for the other modalities to create positive pairs for inter-treatment group learning.
% \par
% algorithm for clustering
% \par
% the loss is defined as ... $\mathcal{L}_{inter}$

% \begin{algorithm}[t]
% \SetAlgoLined
% \SetKwInOut{Input}{Input}
% \Input{$\mathcal{D} =\{(x^1_i, x^2_j, t)\}_{i, j=1}^N$: unpaired biological dataset. $\lambda$: loss scaling hyperparameter. $\mathcal{\phi}^1, \mathcal{\phi}^2$: encoders. $f^1, f^2, g^1, g^2$: projection heads.}

% Estimate matching matrices $\{M_t\}$ and pseudo-paired dataset $\mathcal{D}' = \{(x^1_i, x^2_k, t)\}_{i, k=1}^N$ via Alg.~\ref{alg:matching_repair} \\
% \For{epoch $= 1$ \KwTo $E$}{
% \For{step $=1$ \KwTo $S$}{
% Sample a batch $\mathcal{B}$ from $\mathcal{D}'$, where $|\mathcal{B}| = B$\\
% \For{$x^1_i$ $x^2_k$ in $\mathcal{B}$}{
%     Get matching score $m^1_{i,k} \in \mathbb{R}^B$ Eq.(\ref{eq:matchingscore}) \\ 
%     Get matching score $m^2_{i,k} \in \mathbb{R}^B$ \\
%     Normalize $m^1_{i,k}$ and $m^2_{i,k}$
% }
% $\{\mathbf{h}^1\} = \mathcal{\phi}^1(\{x^1\})$
% $\quad\quad\;\;$
% $\{\mathbf{h}^2\} = \mathcal{\phi}^2(\{x^2\})$\\
% $\{\mathbf{v}^1\} = f^1(\{\mathbf{h}^1\})$ $\quad\quad\;\;$ $\{\mathbf{v}^2\} = f^2(\{\mathbf{h}^2\})$\\
% $\{\mathbf{u}^1\}= g^1(\{\mathbf{h}^1\})$ $\quad\quad\;\;$ $\{\mathbf{u}^2\} = g^2(\{\mathbf{h}^2\})$ \\

% Calculate $\mathcal{L}_{intra}$ with $\{m, \mathbf{v}\}$ Eq.(\ref{eq:lintra})\\
% Get cluster pseudo-labels $\{c^1, c^2\}$ via Alg.~\ref{alg:cluster}\\
% Calculate $\mathcal{L}_{inter}$ with $\{c, \mathbf{u}\}$ Eq.(\ref{eq:linter})\\
% Optimize $\mathcal{\phi}^1, \mathcal{\phi}^2$ with $\mathcal{L}_{intra} + \lambda \mathcal{L}_{inter}$
% }
% }

% \SetKwInOut{Output}{Output}
% \Output{Learned encoders $\mathcal{\phi}^1$ and $\mathcal{\phi}^2$}
% \caption{Batch IntraPair InterCluster Contrastive Learning (IPIC)}
% % \caption{Batch IntraPair InterCluster Contrastive Learning (IPIC)} 
% \label{alg:ipic}
% \end{algorithm}

\begin{algorithm2e}[t]
\caption{Batch IntraPair InterCluster Contrastive Learning (IPIC)}
\label{alg:ipic}
\small
\DontPrintSemicolon

\KwIn{Unpaired dataset $\mathcal{D}$, loss scale $\lambda$, encoders $\phi^{1,2}$, heads $f^{1,2}, g^{1,2}$}

Estimate matching matrices $\{M_t\}$ and pseudo-paired $\mathcal{D}'$ via Alg.~\ref{alg:matching_repair}\;

\For{epoch $= 1$ \KwTo $E$ \textbf{and} step $=1$ \KwTo $S$}{
    Sample batch $\mathcal{B}$ from $\mathcal{D}'$ ($|\mathcal{B}| = B$)\;
    
    Compute and normalize matching scores $m^1, m^2$ for all pairs in $\mathcal{B}$ using Eq.~(\ref{eq:matchingscore})\;
    
    \tcc{Forward pass for both modalities $r \in \{1,2\}$}
    Compute $\mathbf{h}^r = \phi^r(x^r)$, $\mathbf{v}^r = f^r(\mathbf{h}^r)$, and $\mathbf{u}^r = g^r(\mathbf{h}^r)$\;
    
    Calculate $\mathcal{L}_{intra}$ using $\{m, \mathbf{v}\}$ (Eq.~\ref{eq:lintra})\;
    Get cluster pseudo-labels $\{c^1, c^2\}$ via Alg.~\ref{alg:cluster}\;
    Calculate $\mathcal{L}_{inter}$ using $\{c, \mathbf{u}\}$ (Eq.~\ref{eq:linter})\;
    
    Optimize $\phi^{1,2}, f^{1,2}, g^{1,2}$ to minimize $\mathcal{L}_{intra} + \lambda \mathcal{L}_{inter}$\;
}
\end{algorithm2e}

\begin{figure*}[t]
    \centering
    % \rule{\textwidth}{0.3\textheight} % Placeholder box with width and height
    \includegraphics[width=0.95\textwidth]{sec/figs/fig4v3.png}
    \caption{
    \textbf{Method comparison.} This figure shows the various methods we experimented with, where different shapes represent different modalities, and different colors represent different treatment groups. Each figure demonstrates how the method objective is computed within a batch for a specific sample (highlighted circle) in one modality.
    Solid lines connecting circles and triangles indicate positive pairs, while dashed lines indicate negative pairs. Dashed triangles represent the pseudo-paired dataset generated by matching within each treatment group (Algorithm~\ref{alg:matching_repair}).
    In $\mathcal{L}_{intra}$, the solid lines have varying widths to denote different weights applied to each positive pairing based on the matching score.
    In WCL~\cite{zheng2021weakly}, XDC~\cite{alwassel2020self}, and $\mathcal{L}_{inter}$, curved lines connect triangles that share the same cluster label. With our $\mathcal{L}_{inter}$ (Eq.~\ref{eq:linter}) and $\mathcal{L}_{intra}$ (Eq.~\ref{eq:lintra}) design, each sample can learn both intra- and inter-treatment group information.
    }
    \label{fig:methodcomp}
\end{figure*}


% \subsection{Batch IntraPair InterCluster Contrastive Learning (IPIC)}
\subsection{Combined algorithm}

We now combine our \textit{intra-treatment group} and \textit{inter-treatment group} objectives to form our unified IPIC method for unpaired multimodal representation learning on biological datasets. Figure~\ref{fig:methodcomp} illustrates a conceptual comparison between IPIC and baseline methods, highlighting their differences. Our method begins by re-pairing the dataset using Algorithm~\ref{alg:matching_repair} to generate matching matrices for each treatment group. For each batch sampled from the dataset, we retrieve the corresponding matching scores for each sample in each modality via the matching matrix. This process is implemented with a custom collate function in the data loader, ensuring efficient score retrieval and normalization to prevent numerical errors.
We also employ two distinct sets of projection heads: one for \textit{intra-treatment group} representation ($f^1, f^2$) and one for \textit{inter-treatment group} representation ($g^1, g^2$). This separation prevents conflicts within the embedding space between intra- and inter-treatment group learning objectives. Finally, we optimize the network using the complete objective $\mathcal{L}_{intra} + \lambda \mathcal{L}_{inter}$, where $\lambda$ is a loss-scaling hyperparameter. The full steps of IPIC are outlined in detail in Algorithm~\ref{alg:ipic}.

% With our intra-treatment group and inter-treatment group objective, we want to combine the force to introduce our method for learning better representations for batch unpaired contrastive learning  for biological essays. We first re-pair the dataset with Algorithm~\ref{alg:matching_repair} and obtain matching matrices for each treatment group. For each batch we sample from the dataset, we find the corresponding matching score for each sample in each modality using the matching matrix, this is implemented by designing a specified collate function for the data loader in our implementation. We also normalize each matching score to avoid numeric errors. Then we utilize two sets of projection heads, one for intra-treatment group representation and one for inter-treatment group representation. This prevents conflicts in the embedding space of intra and inter treatment group. Finally we optimize the network with our previously definied objective. The details of IPIC is described in detail in Algorithm \ref{alg:ipic}.
% \par
% Given a dataset D, we perform a one-time matching like in Algorithm 1 (one time-matching) and save the matching matrix for each treatment group
% \par
% Then for each batch B, we find the matching weight for each sample in each modality a vector of B with corresponding weights to its other modality counter parts generated from matching matrix and treatment labels with Algo 3 (Find weight for batch)
% \par
% Then we start to do learning in this batch and perform clustering in the hidden space with Algo 2 (cluster hidden space)
% \par
% Finally we minimize $\mathcal{L}_{inter}$ and $\mathcal{L}_{intra}$ until converge, the entire learning algorithm is captured in Algo 4


% \newpage