\documentclass{midl} % Include author names

% The following packages will be automatically loaded:
% jmlr, amsmath, amssymb, natbib, graphicx, url, algorithm2e
% ifoddpage, relsize and probably more
% make sure they are installed with your latex distribution

\usepackage{mwe} % to get dummy images
\usepackage{amsmath,amssymb,amsfonts}
\usepackage{algorithmic}
\usepackage{graphicx}
\usepackage{float}
\usepackage{multirow}
\usepackage{url}
\usepackage{adjustbox}
\usepackage{xcolor}
\usepackage{textcomp}
\usepackage[utf8]{inputenc}
\usepackage{longtable}

\usepackage[font=normalsize,labelfont=bf]{caption}

\jmlryear{2025}\jmlrworkshop{Full Paper -- MIDL 2025}\jmlrvolume{-- nnn}\editors{Accepted for publication at MIDL 2025}
\def\SB#1{\textsubscript{#1}}

\title[I2I-Galip]{I2I-Galip: Unsupervised Medical Image Translation Using Generative Adversarial CLIP}

 \midlauthor{\Name{Yilmaz Korkmaz}\nametag{$^{1}$}\Email{ykorkma1@jhu.edu} \\
 \Name{Vishal M. Patel}\nametag{$^{1}$} \Email{vpatel36@jhu.edu}\\
 \addr $^{1}$ Johns Hopkins University}

\begin{document}

\maketitle

\begin{abstract}
Unpaired image-to-image translation is a challenging task due to the absence of paired examples, which complicates learning the complex mappings between the distinct distributions of the source and target domains. One of the most commonly used approaches for this task is cycle-consistent models which require the training of a new pair of generator-discriminator networks for each translation. In this paper, we propose a new image-to-image translation framework named Image-to-Image-Generative-Adversarial-CLIP (I2I-Galip) where we utilize pre-trained multi-modal foundation models to mitigate the need of separate generator-discriminator pairs for each source-target mapping while achieving better and more efficient multi-domain translation. By utilizing the massive knowledge gathered during pre-training a foundation model, our approach makes use of a single lightweight generator network with $\approx$13M parameters for the multi-domain image translation task. Comprehensive experiments on translation performance in public MRI and CT datasets show the superior performance of the proposed framework over the existing approaches.
\end{abstract}

\section{Introduction}

Medical image translation is a crucial task due to the availability of diverse information across various modalities. However, it is challenging because of significantly different domain distributions, necessitating the learning of very complex mappings between different imaging modalities \cite{roy2013atlas}.  Many supervised deep learning-based image translation methods have been proposed to address this problem \cite{dar2019image,jiang2023cola,armanious2020medgan}. However, these methods are limited due to the requirement of paired training data which might be challenging to acquire in real case scenarios. To overcome this constraint, various unsupervised image translation methods have been introduced for both general computer vision and medical imaging tasks \cite{dai2020multimodal,liu2017unsupervised,huang2018multimodal,ozbey2023unsupervised,han2021dual,yi2017dualgan,torbunov2023uvcgan}.  CycleGAN \cite{zhu2017unpaired} is one of the first approaches that proposed unpaired image translation which loosened the requirement for paired datasets by enforcing cycle-consistency among inverse translations. However, in the case of multiple modalities, cycle-consistent models introduce significant computational requirements as separate generator-discriminator pairs are required for each new modality. To mitigate the need of separate network pairs several multi-domain translation frameworks have been proposed \cite{choi2018stargan,choi2020stargan,huang2018multimodal,lee2018diverse}. Nonetheless, these methods generally lag in performance compared to uni-modal approaches.

More recently, a couple of text-driven diffusion based image-to-image translation frameworks have been proposed that integrate large vision-language pre-trained models as guidance \cite{tumanyan2023plug,rombach2022high,hertz2022prompt,kwon2022diffusion}, enabling robust translation across multiple domains. While these models provide zero-shot editing capabilities for various text conditions, they are limited in delivering fidelity necessary for the medical tasks. Moreover, these methods impose a significant computational burden due to the requirement for large denoiser backbones and extended inference times in their backward diffusion processes.

% here we may mention why these models are not utilized in medical domain.

In this paper, we propose a cycle-consistent generative adversarial model to address the aforementioned limitations. Our model integrates BiomedCLIP (see \sectionref{biomedclip}), a pre-trained multi-modal vision-language model specifically trained in the medical domain, within a cycle-consistent feed-forward framework. By leveraging contrastive information from this large pre-trained network, we eliminate the need to train a new generator network for each translation task and reduce the requirement for large discriminator backbones in feature extraction. Furthermore, our model enhances overall translation performance compared to existing unsupervised approaches in both single and multi-domain translation tasks. 

Our main contributions can be summarized as follows:
\begin{itemize}
    \item We introduce a novel adversarial framework for language-driven multi-domain medical image translation.
    \item Our framework outperforms existing unsupervised baselines with a relatively lightweight backbone. Extensive experiments demonstrate its superior performance across various publicly available datasets from different modalities.
\end{itemize}



\section{Background}
\subsection{Cycle-Consistent Generative Adversarial Networks (CycleGAN)}

CycleGAN \cite{zhu2017unpaired} models the unpaired image translation problem between domains $A$ and $B$ using two translators, $G\colon A \to B$ and $F\colon B \to A$. $G$ and $F$ are then forced to be inverses of each other, which makes both mappings approximately bijective. CycleGAN achieves remarkable performance by combining an adversarial loss with this cycle-consistency constraint, which encourages $F(G(X_A)) \approx X_A$ and $G(F(X_B)) \approx X_B$.

\subsection{BiomedCLIP}
\label{biomedclip}
%Recently, large vision-language models with or without contrastive Language-Image Pre-training (CLIP) models \cite{radford2021learning} have been widely utilized %in medical domain \cite{eslami2023pubmedclip,zhang2023large,wei2024visionclip,boecking2022making}. 
In this paper, we utilize BiomedCLIP \cite{zhang2023large} as our pre-trained vision-language model. BiomedCLIP is trained on PMC-15M dataset using pairs of figures and captions from biomedical research articles in PubMed Central and outperforms other medical vision-language models in various tasks \cite{zhang2023large}. BiomedCLIP utilizes a ViT-B \cite{dosovitskiy2020image} based image encoder to generate image embeddings while utilizing PubMedBERT \cite{gu2021domain} for the text embeddings.


\section{Methodology}
\subsection{I2I-Galip}
We design a lightweight generator network which is a very thin variant of the latent diffusion U-Net \cite{rombach2022high} (with only $\approx13$M parameters). Our discriminator network uses the projections of intermediate Vision Transformer (ViT) features as input, adapted from the text-to-image model StyleGAN-T \cite{sauer2023stylegan}. This discriminator design allows us to utilize the outputs of different layers in BiomedCLIP's ViT, capturing different levels of detail. We modify this design by dividing the discriminator heads into distinct sets, each tailored to a specific target translation domain. We also utilize BiomedCLIP's text encoder to generate target text embeddings from captions for each modality, which control the generated image features via cross-attention transformers while serving as a regularizer during training (see \figureref{fig:fig_1}a). The overall training objective for the generator can be expressed as follows:
\begin{equation}
\begin{aligned}
    \mathcal{L}_{total} = \, & \lambda_{cycle} \cdot \mathcal{L}_{cycle} + \lambda_{adv} \cdot \mathcal{L}_{adv_G} 
    + \lambda_{clip} \cdot \mathcal{L}_{clip} \\
    & + \lambda_{cls} \cdot \mathcal{L}_{cls} + \lambda_{identity} \cdot \mathcal{L}_{identity},
\end{aligned}
\end{equation}
where $\lambda_{cycle}, \lambda_{adv}, \lambda_{clip}, \lambda_{cls}, \lambda_{identity} $ are coefficients to control the contribution from each loss. We denote the loss associated with the discriminator as $\mathcal{L}_{adv_D}$. In what follows, we describe each of these loss terms in detail.
\begin{figure*}[!t]
\includegraphics[width=1.0\textwidth]{i2i-galip.pdf}
\caption{Training scheme and overall model architecture of I2I-Galip, illustrated for the T\SB{1}- to T\SB{2}-weighted MRI translation task. Part (a) illustrates the definition of the $\mathcal{L}_{clip}$, $\mathcal{L}_{cls}$ and $\mathcal{L}_{adv}$ losses. $X_A^{Input}$, $X_B^{Input}$ and $X_B^{Out}$ denote the input T\SB{1}-weighted, input T\SB{2}-weighted and output T\SB{2}-weighted images, respectively. Parts (b) and (c) illustrate the $\mathcal{L}_{cycle}$ and $\mathcal{L}_{identity}$ losses, respectively. BiomedCLIP's ViT and Text Encoder parameters are frozen during training. ``This MRI Image is T\SB{2}-weighted'' corresponds to a sample prompt used for T\SB{1} to T\SB{2} translation.}
\label{fig:fig_1}
\end{figure*}
\begin{enumerate}
\item \textbf{Adversarial Loss}: By leveraging intermediate features from the ViT, direct feature extraction from images becomes unnecessary, enabling the use of lightweight discriminator heads for each feature level. We utilize the least squares GAN loss \cite{mao2017least} to enhance the stability of training instead of Hinge loss used in StyleGAN-T, which can be defined as follows
\begin{equation}
    \begin{aligned}
        \mathcal{L}_{adv_G} = & \mathbb{E}[\left(Head_{A}(E_{X_A}^{out}, E_{T_A}) - 1\right)^2] \\
        & + \mathbb{E}[\left(Head_{B}(E_{X_B}^{out},E_{T_B}) - 1\right)^2],
    \end{aligned}
\end{equation}
\vspace{-10pt}
\begin{equation}
    \begin{aligned}
        \mathcal{L}_{adv_D} = & \mathbb{E}\left[\left(Head_{A}(E_{X_A}^{input}, E_{T_A}) - 1\right)^2 
        + \left(Head_{A}(E_{X_A}^{out}, E_{T_A})\right)^2\right] \\
        & + \mathbb{E}\left[\left(Head_{B}(E_{X_B}^{input}, E_{T_B}) - 1\right)^2 
        + \left(Head_{B}(E_{X_B}^{out}, E_{T_B})\right)^2\right],
    \end{aligned}
\end{equation}
    where $Head_{A}$ and $Head_{B}$ correspond to the discriminator heads allocated for the specific target domain, $E_{T_A}$ and $E_{T_B}$ are corresponding text encodings (i.e., text encodings of captions for each domain), $E_{X_A}^{input}$, $E_{X_B}^{input}$, $E_{X_A}^{out}$ and $E_{X_B}^{out}$ are feature maps from ViT for input and generated images from domain $A$ and $B$, respectively. 


    
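    \item \textbf{Cycle Loss}: We use the cycle-consistency loss \cite{zhu2017unpaired}, shown in \figureref{fig:fig_1}b, to enforce faithful translation between source and target domains for each pair. Each input image is translated to the other domain and then back to its own domain, and the result is compared with the original input:
    \begin{equation}
    \begin{aligned}
    \mathcal{L}_{\text{cycle}} = & \mathbb{E}\left[\|X_A^{input} - G(G(X_A^{input}, E_{T_B}, Y_B), E_{T_A}, Y_A)\|_1\right] \\
     & + \mathbb{E}\left[\|X_B^{input} - G(G(X_B^{input}, E_{T_A}, Y_A), E_{T_B}, Y_B)\|_1\right],
    \end{aligned}
    \end{equation}
    where $G$ is the generator and $Y_A$ and $Y_B$ denote the labels of domains $A$ and $B$, respectively.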
    \item \textbf{CLIP Loss}: We minimize the cosine distance between the text encoding corresponding to the caption of the target domain (e.g., ``This MRI image is T\SB{1}-weighted'') and the ViT encoding of the generated images, which enables the use of CLIP's joint embedding space similarly to \cite{patashnik2021styleclip}. This loss can be defined as follows:
    \begin{equation}
    \mathcal{L}_{\text{clip}} = -\frac{\langle E_{T_A},E_{X_A}^{out^{last}}  \rangle}{\|E_{T_A}\| \cdot \|E_{X_A}^{out^{last}}\|} - \frac{\langle E_{T_B},E_{X_B}^{out^{last}}  \rangle}{\|E_{T_B}\| \cdot \|E_{X_B}^{out^{last}}\|},
\end{equation}
    where $E_{X_A}^{out^{last}}$, $E_{X_B}^{out^{last}}$ are the image encodings from the last layer of ViT for the generated images from domain $A$ and $B$, respectively. Generally, $\mathcal{L}_{clip}$ is dominated by the cycle and adversarial losses, giving comparatively small benefits (see \sectionref{sec:ablation} for details).

    \item \textbf{CLS Loss}: The CLS tokens in the final layers of vision transformers are known to contain semantically rich information, as highlighted by \cite{tumanyan2022splicing}; they are typically leveraged for downstream classification tasks and have been shown to be beneficial in image translation \cite{kwon2022diffusion}. Therefore, we maximize the cosine similarity between the ViT CLS tokens of the generated images and those of the target domain's images to enforce semantic similarity among these images, which can be written as follows:
\begin{equation}
    \mathcal{L}_{\text{cls}} = -\frac{\langle cls_{X_A}^{input} , cls_{X_A}^{out}  \rangle}{\|cls_{X_A}^{input}\| \cdot \|cls_{X_A}^{out}\|} - \frac{\langle cls_{X_B}^{input} , cls_{X_B}^{out}  \rangle}{\|cls_{X_B}^{input}\| \cdot \|cls_{X_B}^{out}\|},
\end{equation}
    where $cls_{X_A}^{input}$ and $cls_{X_B}^{input}$ are the CLS tokens in the last layers of ViT for input images from domain $A$ and $B$ respectively. $cls_{X_A}^{out}$ and $cls_{X_B}^{out}$ are corresponding CLS tokens for the generated images.
    
    \item \textbf{Identity Loss}: The identity loss is found to be beneficial to maintain source image structure in translation by enforcing the pixel-level equality when target and source domains match \cite{zhu2017unpaired}. We enforce it via using same labels and text embeddings corresponding to the input image domain (see \figureref{fig:fig_1}c).
%\begin{equation}
%\begin{aligned}
%        \mathcal{L}_{\text{identity}} = &\mathbb{E}\left[\|X_A^{input} - G(X_A^{input}, %E_{T_A}, Y_A))\|_1\right] \\
%        & + \mathbb{E}\left[\|X_B^{input} - G(X_B^{input}, E_{T_B}, Y_B)\|_1\right].

%\end{aligned}
%\end{equation}
\end{enumerate}

\begin{figure*}[!t]
\includegraphics[width=1\textwidth]{ixi_pd_to_t1_i2igalip_new.png}
\caption{Multi-domain translation illustrations from PD to T\SB{1}-weighted image in IXI dataset. Accompanying this are error maps and magnified sections, positioned below and above each translation, respectively.}
\label{fig:pd_to_t1}
\end{figure*}


\begin{figure*}[!t]
\includegraphics[width=1\textwidth]{ct_to_t1_i2igalip_new.png}
\caption{Single-domain translation from T\SB{1}-weighted Pelvic MRI to CT images. Accompanying this are error maps and magnified sections, positioned below and above each translation, respectively.}
\label{fig:mri_to_ct}
\end{figure*}


\begin{table*}[]
\centering
\caption{Multi-domain image translation results in IXI dataset. T\SB{1}-, T\SB{2}- and PD-weighted images are considered. Best and second best results are indicated with red and blue respectively.}
\label{tab:my-table}
\resizebox{\textwidth}{!}{%
\begin{tabular}{|c|c|c|c|c|c|c|}
\hline
One-to-one task         & T\SB{1}-\textgreater{}T\SB{2}    & T\SB{2}-\textgreater{}T\SB{1}    & T\SB{2}-\textgreater{}PD    & PD-\textgreater{}T\SB{2}    & T\SB{1}-\textgreater{}PD    & PD-\textgreater{}T\SB{1}    \\ \hline
IXI                     & PSNR $\vert$ SSIM            & PSNR $\vert$ SSIM            & PSNR $\vert$ SSIM            & PSNR $\vert$ SSIM            & PSNR $\vert$ SSIM            & PSNR $\vert$ SSIM            \\ \hline
I2I-Galip-M   & \textcolor{blue}{27.22} $\vert$ 90.18 & 27.30 $\vert$ 90.86 &\textcolor{blue}{32.34} $\vert$ \textcolor{blue}{95.74} &\textcolor{red}{33.12} $\vert$ 95.39 & 26.76 $\vert$ 90.75 & \textcolor{blue}{27.70} $\vert$ \textcolor{blue}{91.20} \\ \hline
I2I-Galip-S   &\textcolor{red}{27.47} $\vert$ \textcolor{blue}{90.54} & \textcolor{blue}{27.33} $\vert$ \textcolor{blue}{91.06} & 32.11 $\vert$ 95.65 & \textcolor{blue}{32.87} $\vert$ \textcolor{blue}{95.62} & \textcolor{blue}{26.99} $\vert$ \textcolor{blue}{90.80} & \textcolor{red}{27.75} $\vert$ 91.07  \\ \hline
CycleGAN           & 26.10 $\vert$ 87.36          & 26.31 $\vert$ 88.51          & 27.43 $\vert$ 93.68          & 31.07 $\vert$ 93.81          & 24.56 $\vert$ 88.26          & 25.91 $\vert$ 89.47          \\ \hline
U-GAT-IT          & 24.44 $\vert$ 86.19          & 24.51 $\vert$ 86.85          & 26.81 $\vert$ 91.39          & 29.03 $\vert$ 92.11          & 22.98 $\vert$ 85.16          & 24.83 $\vert$ 87.44          \\ \hline
SynDiff           & 26.34 $\vert$ \textcolor{red}{91.87}          & \textcolor{red}{27.60} $\vert$ \textcolor{red}{92.14}          & \textcolor{red}{33.15} $\vert$ \textcolor{red}{96.87}          & 29.81 $\vert$ \textcolor{red}{96.99}          & \textcolor{red}{27.29} $\vert$ \textcolor{red}{92.49}          & 25.54 $\vert$ \textcolor{red}{92.41}          \\ \hline
UNIT &   23.59  $\vert$ 84.40                     &      24.76  $\vert$ 86.63                  &  25.22 $\vert$ 91.42                       &            29.10 $\vert$ 93.30            &    23.20 $\vert$ 86.00                    &           23.50 $\vert$ 80.05             \\ \hline
EGSDE        &   16.93 $\vert$ 53.32                     &      17.44 $\vert$ 57.54                  &      17.98 $\vert$ 75.93                  &     16.40 $\vert$ 57.55                    &           19.70 $\vert$ 71.21             &     19.71 $\vert$ 59.73                   \\ \hline
\end{tabular}%
}
\label{tab:ixi}
\end{table*}


\subsection{Datasets}
We conduct experiments on a single-coil brain MRI dataset (IXI) and a CT-MRI dataset \cite{nyholm2018mr} to demonstrate the performance of our approach. Dataset details are presented in \sectionref{additional_details}. We consider the IXI dataset in both multi-domain and single-domain translation contexts. In the multi-domain scenario, we use a single network for all translation tasks, whereas in the single-domain scenario, we utilize distinct networks for each individual task. On the other hand, the CT-MRI dataset only allows the single-domain translation context.





\subsection{Implementation Details}
We illustrate the model complexities using the number of parameters in each competing method in \tableref{tab:model_comp}. A single NVIDIA RTX A5000 GPU with the PyTorch framework is utilized in all experiments. Our model is trained with the Adam optimizer with an initial learning rate of 0.0002, which is linearly decreased to 0 after the 50th epoch. The number of discriminator head sets is determined by the number of domains in the translation problem: 3 for IXI and 2 for CT-MRI. We utilize hyperparameters 10, 1, 1, 1, 1 for $\lambda_{cycle}$, $\lambda_{adv}$, $\lambda_{cls}$, $\lambda_{clip}$, and $\lambda_{identity}$, respectively.



\begin{figure*}[!t]
\includegraphics[width=1\textwidth]{ixi_t2_pd_i2igalip_new.png}
\caption{Multi-domain translation illustrations from T\SB{2}-weighted to PD image in IXI dataset. Accompanying this are error maps and magnified sections, positioned below and above each translation, respectively.}
\label{fig:fig_t2_to_pd}
\end{figure*}





\section{Results}
We utilize the well-known unsupervised image translation baselines CycleGAN \cite{zhu2017unpaired}, U-GAT-IT \cite{kim2019u}, SynDiff \cite{ozbey2023unsupervised}, UNIT \cite{liu2017unsupervised} and EGSDE \cite{zhao2022egsde} as competing methods (see \sectionref{competing_details} for details). We use Peak-Signal-to-Noise-Ratio (PSNR, dB) and Structural Similarity Index Measure (SSIM, \%) to compare the translation performances of competing methods. Results are presented for both single- and multi-domain cases in IXI for I2I-Galip to show the effectiveness of the proposed approach in both cases. CT-MRI results are presented as the single-domain translation. I2I-Galip-S (Single), CycleGAN, U-GAT-IT, UNIT and SynDiff are separately trained for all possible domain pairs while I2I-Galip-M (Multi) is trained once per dataset. EGSDE is an unsupervised image translation method that is agnostic to translation direction. However, it relies on separately pre-trained diffusion models for each target domain. \tableref{tab:ixi} and \tableref{tab:ct_mri} show the translation performance in IXI and CT-MRI datasets, respectively. We show the corresponding translated images for each competing method for distinct translation tasks in \figureref{fig:pd_to_t1}, \figureref{fig:mri_to_ct}, and \figureref{fig:fig_t2_to_pd}. Best and second best performances are highlighted in red and blue, respectively, in each table for each metric.

Overall, I2I-Galip-M, a single network for multi-domain translation in IXI---unlike CycleGAN, which requires separately trained models for each image pair---achieves 2.17 dB higher PSNR and over 2\% better SSIM compared to CycleGAN, the foundational approach. Additionally, I2I-Galip-S outperforms the second-best method with a 0.11 dB improvement in PSNR and a 1.52\% increase in SSIM for the T\SB{1} to CT translation, and also delivers a 1.38\% gain in SSIM for the T\SB{2} to CT task. Compared to SynDiff---a state-of-the-art diffusion-based, cycle-consistent translation model---I2I-Galip achieves comparable performance while significantly reducing computational demands. For example, in the T\SB{1} to T\SB{2} task, I2I-Galip-M attains 27.22 dB PSNR (compared to SynDiff's 26.34 dB), and in the PD to T\SB{2} task, it reaches 33.12 dB PSNR versus SynDiff's 29.81 dB. Similar performance trends are observed across other tasks, demonstrating that I2I-Galip matches SynDiff's effectiveness while preserving finer structural details, avoiding the oversmoothing effect seen in SynDiff, as illustrated in the qualitative figures. Additionally, I2I-Galip requires far fewer network parameters (see \tableref{tab:model_comp}) and offers faster inference by circumventing the iterative diffusion process inherent to SynDiff (see \tableref{tab:compt_cost} for details). Unlike other competing methods such as CycleGAN, U-GAT-IT, and UNIT, which suffer from noise artifacts that degrade output quality, I2I-Galip also effectively avoids these issues while maintaining sharp, accurate translations.

\begin{table}[H]
\centering
\caption{Single-domain image translation results in CT-MRI dataset for T\SB{1}- and T\SB{2}-weighted images.}
\label{tab:ct_mri}
\resizebox{0.35\textwidth}{!}{%
\begin{tabular}{|c|cc|cc|}
\hline
             & \multicolumn{2}{c|}{T\SB{1}-\textgreater{}CT} & \multicolumn{2}{c|}{T\SB{2}-\textgreater{}CT} \\ \hline
             & \multicolumn{1}{c|}{PSNR}     & SSIM     & \multicolumn{1}{c|}{PSNR}     & SSIM     \\ \hline
I2I-Galip & \multicolumn{1}{c|}{\textcolor{red}{26.13}}    & \textcolor{red}{90.86}    & \multicolumn{1}{c|}{\textcolor{blue}{27.08}}    & \textcolor{red}{91.30}    \\ \hline
CycleGAN     & \multicolumn{1}{c|}{24.55}    & 78.63    & \multicolumn{1}{c|}{\textcolor{red}{27.39}}    & \textcolor{blue}{89.92}    \\ \hline
U-GAT-IT     & \multicolumn{1}{c|}{25.79}    & \textcolor{blue}{89.34}    & \multicolumn{1}{c|}{26.01}    & 87.48    \\ \hline
SynDiff      & \multicolumn{1}{c|}{23.81}    & 75.16    & \multicolumn{1}{c|}{21.73}    & 75.07    \\ \hline
UNIT         & \multicolumn{1}{c|}{\textcolor{blue}{26.02}}    & 79.22    & \multicolumn{1}{c|}{25.15}    & 75.30    \\ \hline
EGSDE        & \multicolumn{1}{c|}{19.03}    & 74.63    & \multicolumn{1}{c|}{14.74}    & 66.67    \\ \hline
\end{tabular}}
\end{table}




\begin{table}[!h]
\centering
\caption{Model complexities are illustrated using the total number of parameters for each competing method. The third row indicates the number of required generator and discriminator networks, given the specified number of domains. $T$ and P(.) represent the number of domains for a multi-modal translation problem and the permutation operator, respectively. Total parameters are calculated for a representative case where $T=4$.}
\resizebox{1\textwidth}{!}{%
\begin{tabular}{|c|c|c|c|c|c|c|}
\hline
Network/Model & I2I-Galip & CycleGAN       & U-GAT-IT       & SynDiff & UNIT           & EGSDE  \\ \hline
Generator (G)     & 13.2M        & 11.3M          & 278.9M         & 39.7M + 7.8M    & 5.4M + 5.4M    & 164M   \\ \hline
Discriminator (D) & 23.9M        & 2.7M           & 56.4M          & 27.7M + 2.7M   & 2.8M           & 0      \\ \hline
Times (G, D)         & 1, T      & P(T,2), P(T,2) & P(T,2), P(T,2) & P(T,2), P(T,2)   & P(T,2), P(T,2) & T, 0   \\ \hline
Total         & 108.8M        & 169.5M          & 4023.6M        & 936.6M   & 162.2M          & 657.2M \\ \hline
\end{tabular}}
\label{tab:model_comp}
\end{table}

\begin{table}[!h]
\centering
\caption{Memory usage, training time, and inference time for the most lightweight and the most computationally intensive methods for a single-domain translation with an NVIDIA RTX A5000 GPU.}
\resizebox{0.5\textwidth}{!}{ 
\begin{tabular}{|l|l|l|l|}
\hline
               & CycleGAN  & I2I-Galip & SynDiff   \\ \hline
Memory         & 3,128 MiB & 7,074 MiB & 9,638 MiB \\ \hline
Training Time  & $\approx$4.5h      & $\approx$27h       & $\approx$35h       \\ \hline
Inference Time & 0.00353s  & 0.04883s  & 0.1792s  \\ \hline
\end{tabular}}
\label{tab:compt_cost}
\end{table}

%The ability to preserve fine-grained details without introducing noise underscores the efficacy of our approach in various image translation tasks, setting it apart from other unsupervised translation methods given .

\subsection{Ablation Studies}
\label{sec:ablation}
As shown in \tableref{tab:ablation}, we assess the contribution of each loss component and compare I2I-Galip with BiomedCLIP against its variant equipped with OpenCLIP \cite{ilharco_gabriel_2021_5143773} trained on LAION \cite{schuhmann2022laionb} in both single- and multi-domain settings. The majority of the performance gains stem from adversarial and cycle losses, although CLIP, identity, and CLS losses add notable benefits in the multi-domain scenario. In contrast, the adversarial loss tends to dominate in the single-domain case, reducing the impact of the other loss terms. We discuss the underlying reasons for these findings in \sectionref{sec:limitations}.

\begin{table}[!h]
\centering
\caption{Single- and multi-domain ablation results in IXI dataset. PSNR and SSIM values are averaged across the whole test set.}
\resizebox{0.35\textwidth}{!}{%
\begin{tabular}{|c|cc|cc|}
\hline
             & \multicolumn{2}{c|}{I2I-Galip-S}                          & \multicolumn{2}{c|}{I2I-Galip-M}                           \\ \hline
             & \multicolumn{2}{c|}{IXI}                             & \multicolumn{2}{c|}{IXI}                             \\ \hline
             & \multicolumn{1}{c|}{PSNR}           & SSIM           & \multicolumn{1}{c|}{PSNR}           & SSIM           \\ \hline
Proposed     & \multicolumn{1}{c|}{29.09} & 92.48 & \multicolumn{1}{c|}{29.07} & 92.35 \\ \hline
$\lambda_{adv}=0$   & \multicolumn{1}{c|}{19.80}          &60.80          & \multicolumn{1}{c|}{18.62}          & 49.05          \\ \hline
$\lambda_{cls}=0$   & \multicolumn{1}{c|}{29.00}          & 92.26          & \multicolumn{1}{c|}{28.88}          & 91.99          \\ \hline
$\lambda_{cycle}=0$ & \multicolumn{1}{c|}{27.91}          & 90.93          & \multicolumn{1}{c|}{28.08}          & 90.88          \\ \hline
$\lambda_{clip}=0$  & \multicolumn{1}{c|}{28.90}          & 92.23          & \multicolumn{1}{c|}{28.99}              & 92.18              \\ \hline
$\lambda_{identity}=0$    & \multicolumn{1}{c|}{28.74}          & 91.12          & \multicolumn{1}{c|}{28.76}              & 92.01              \\ \hline
CLIP-Laion-2B     & \multicolumn{1}{c|}{19.06} & 66.67 & \multicolumn{1}{c|}{27.54} & 91.21 \\ \hline
\end{tabular}}
\label{tab:ablation}
\end{table}




\section{Discussion and Limitations}
\label{sec:limitations}
We observe only marginal gains by incorporating identity, CLS, and CLIP losses in our experiments—even after trying different metrics such as Cosine, L2, and Contrastive. These losses seem overshadowed by the adversarial loss, given that our discriminator (powered by BiomedCLIP's ViT and MSE loss) can detect fake images early on, effectively functioning as a strong regularizer. In single-domain settings, the broad, generalized embeddings (e.g., from OpenCLIP) can further destabilize this adversarial training, misaligning with the narrower data distribution and producing noise-like outputs. As a result, CLIP guidance—prone to providing inaccurate translation directions \cite{sauer2023stylegan,kwon2022diffusion}—loses additional effectiveness. 

Moreover, because BiomedCLIP serves as our multi-modal foundation model, our approach inherits its contrastive pre-training strategy, which emphasizes semantically meaningful features at the expense of finer image details. We also found our method to be sensitive to caption choices for the target domain, but experimenting with diverse captioning styles did not yield improvements. Consequently, we adopt BiomedCLIP’s simplest templates (e.g., “This MRI is XX-weighted,” “This is pelvic MRI,” or “This is pelvic CT”). We leave further exploration of this aspect to future work.
\section{Conclusion}
We propose an unsupervised multi-modal image translation framework employing a generative adversarial network which is empowered with a pre-trained vision-language model. Our framework improves upon the cycle-consistent translation models while enhancing the multi-domain translation performance with a reduced computational budget. 


\midlacknowledgments{This work was supported by the NSF CAREER Award under Grant 2045489.}


\bibliography{midl25_45}


\appendix
\section{Related Works}
\paragraph{Cycle-consistent image translation.}
Zhu et al. revolutionized the field of unsupervised image translation with their proposal of CycleGAN \cite{zhu2017unpaired}. Yi et al. proposed DualGAN \cite{yi2017dualgan} which is a concurrent work with CycleGAN offering the same cycle-consistency loss. Various studies followed the cycle-consistency constraint for more faithful translation in the unsupervised setting. Liu et al. proposed UNIT \cite{liu2017unsupervised} for uni-modal translation where a shared latent space is assumed between source and target modalities. Huang et al. proposed MUNIT \cite{huang2018multimodal} where UNIT's assumption of shared latent space is divided into content and style for multi-domain translation. Lee et al. \cite{lee2018diverse} introduced DRIT, which shares a similar approach to MUNIT by using disentangled content and attribute latents for multi-domain translation. Choi et al. proposed StarGANv1 \cite{choi2018stargan} and StarGANv2 \cite{choi2020stargan} where they utilized a separate style encoder network to generate distinct style codes to be used in generator for multi-domain translation. Perera et al. \cite{perera2018in2i} proposed an alternative method where they utilize multi-domain input modalities with a latent-consistency loss. Kim et al. proposed U-GAT-IT \cite{kim2019u} with an advanced generator equipped with adaptive layer instance normalization layers and attention. Torbunov et al. proposed UVCGan \cite{torbunov2023uvcgan} employing a pre-trained vision transformer as generator in a cycle-consistent framework for improved translation performance.
\paragraph{Text-guided image translation.}
Following the advancements in vision-language models \cite{radford2021learning}, several text-guided unsupervised image translation methods have been proposed with or without a cycle-consistency constraint. Park et al. proposed LANIT \cite{park2023lanit} where they use CLIP to generate pseudo labels for unlabeled images with an approach similar to StarGANv2. Gal et al. proposed StyleGAN-NADA \cite{gal2022stylegan} for CLIP-driven adaptation of the StyleGAN2 generator \cite{karras2020analyzing}. Patashnik et al. proposed StyleCLIP \cite{patashnik2021styleclip} where they invert the source image to find its latent code for CLIP-guided feature manipulation. Yu et al. \cite{yu2022towards} proposed a counterfactual image manipulation pipeline using CLIP.
\paragraph{Diffusion-based image translation.}
More recently, building on the success of diffusion models in image generation, various unsupervised image translation methods utilizing diffusion-based backbones have been proposed. Zhao et al. proposed EGSDE \cite{zhao2022egsde} where they utilize energy-guided translation between diversely trained diffusion models. Özbey et al. proposed SynDiff \cite{ozbey2023unsupervised}, where they use multiple cycle-consistent diffusive and non-diffusive generators for improved translation performance. Kwon et al. proposed DiffuseIT \cite{kwon2022diffusion} and used pre-trained vision transformers as guidance in image manipulation. Tumanyan et al. \cite{tumanyan2023plug} offered a plug-and-play framework to adapt pre-trained text-to-image diffusion models for image translation. Zhan et al. proposed MedM2G \cite{zhan2024medm2g}, a unified multi-modal diffusive framework for text-to-image synthesis, image-to-text synthesis, and image translation tasks.
Our approach shares similarities with MedM2G \cite{zhan2024medm2g} in employing a multi-modal text-guided framework for image translation. However, our model is over an order of magnitude smaller, leveraging a feed-forward generative adversarial network architecture and enforcing cycle-consistency across translations. We also share common loss terms with DiffuseIT \cite{kwon2022diffusion}, utilizing CLS tokens from pre-trained vision transformers for semantically meaningful information extraction. Nonetheless, our approach differs in its use of cycle-consistency and the feed-forward generative adversarial methodology adopted. We named our method in reference to the text-to-image generative adversarial model Galip \cite{tao2023galip}. However, apart from the CLIP-based feature extraction utilized for the discriminator, our method does not share further similarities with Galip in terms of architecture or training methodology.

\section{Datasets and Competing Methods}
\subsection{Datasets}
\label{additional_details}
\begin{enumerate}
    \item \textbf{IXI}: Translation performance is demonstrated on a single-coil brain MRI dataset from \url{http://brain-development.org/ixi-dataset/}. T\SB{1}-, T\SB{2}- and PD-weighted acquisitions are considered. In IXI, 25 subjects are used for training, 5 for validation and 10 for testing.
   % \item \textbf{Midas}: Translation performance demonstrated in brain MRI data from Midas dataset \cite{bullitt2005vessel}. T\SB{1}- and T\SB{2}-  acquisitions are considered. In Midas, 25 subjects are used for training, 5 for validation and 10 for testing. 
    \item \textbf{CT-MRI}: Translation performance is demonstrated on pelvic T\SB{1}- and T\SB{2}-weighted MRI and CT data from \cite{nyholm2018mr}. In the CT-MRI dataset, 9 subjects are used for training, 1 for validation and 4 for testing.
\end{enumerate}
\subsection{Competing Methods}
\label{competing_details}
\begin{enumerate}
    \item \textbf{CycleGAN}: Cycle-consistent generative adversarial model is considered \cite{zhu2017unpaired}. The Adam optimizer is utilized for training with an initial learning rate set at 0.0002, which is linearly decayed to 0 after the 50\textsuperscript{th} epoch. The training process spans a total of 100 epochs. Weights for adversarial, cycle, and identity losses are selected as 1, 10, and 0.5, respectively.
    \item \textbf{U-GAT-IT}: An attention guided GAN model with adaptive layer-instance normalization designed for unsupervised image translation is considered \cite{kim2019u}. Adam optimizer is utilized for training with a learning rate of 0.0001. Training lasts for 100 epochs. Weights for adversarial, cycle, identity and CAM losses are selected as 1, 10, 10, and 1000 respectively.
    \item \textbf{SynDiff}: A cycle-consistent diffusion-based image translation model is considered \cite{ozbey2023unsupervised}. Adam optimizer is used for training with a learning rate of 0.0001. Training length is 50 epochs. The weights assigned to the cycle-consistency and adversarial loss terms are $\lambda_{1}^{\phi}, \lambda_{1}^{\theta} = 0.5$ and $\lambda_{2}^{\phi}, \lambda_{2}^{\theta} = 1$, respectively. The noise variance schedule is bounded between $\beta_{\text{min}} = 0.1$ and $\beta_{\text{max}} = 20$. Other diffusion related hyper-parameters are directly obtained from \cite{ozbey2023unsupervised}.
    \item \textbf{UNIT}: A GAN model designed for unsupervised image translation is considered \cite{liu2017unsupervised}. Adam optimizer is utilized for training with a learning rate of 0.0001 for 100 epochs. Weights for adversarial, image, style, and content reconstruction losses are selected as 1, 10, 1, and 1, respectively.
    \item \textbf{EGSDE}: A diffusion-based unpaired image translation model is considered \cite{zhao2022egsde}. Separate DDPM models are trained for each translation domain to be utilized in the EGSDE model. 500,000 diffusion steps are used for training of the DDPMs and $T$ is selected as 150 to maintain source structure, and cross-validated weight parameters $\lambda_s$ and $\lambda_i$ are selected as $1 \times 10^{-7}$ and 10, respectively.
\end{enumerate}


\end{document}
