\documentclass{midl} % Include author names

% The following packages will be automatically loaded:
% jmlr, amsmath, amssymb, natbib, graphicx, url, algorithm2e
% ifoddpage, relsize and probably more
% make sure they are installed with your latex distribution
\usepackage[dvipsnames]{xcolor}
\usepackage{comment}
\usepackage{mwe} % to get dummy images
\usepackage{multirow}
\usepackage{array}
\usepackage{caption}
\usepackage{arydshln} 
\usepackage{xcolor}
\usepackage{algorithm,algpseudocode}
\usepackage{algorithmicx}
\usepackage{algpseudocode}
\usepackage{minted}
\usepackage{float}
% Define a command for blue text
\newcommand{\bluetext}[1]{\textcolor{blue}{#1}}
% Define an environment for blue text
\newenvironment{blueenv}
    {\color{blue}}
    {}
% Set up minted for Python code
\usemintedstyle{colorful}
\setminted{
  breaklines=true,
  breakanywhere=true,
  fontsize=\footnotesize,
  linenos,
  numberblanklines=true,
  numbersep=12pt,
  framesep=2mm,
  xleftmargin=8pt,
  funcnamehighlighting=true,
  tabsize=4,
  obeytabs=true,
  mathescape=false,
  samepage=false,
  showspaces=false,
  showtabs=false,
  texcl=false,
  %bgcolor=lightgray,
}

\newfloat{listing}{thp}{lop}
\floatname{listing}{Code Listing}
\definecolor{darkorange}{HTML}{FA6800}

\jmlrvolume{-- 230}
\jmlryear{2025}
\jmlrworkshop{Full Paper -- MIDL 2025 }
\editors{Accepted for publication at MIDL 2025}
\title[PRISM]{PRISM: High-Resolution \& Precise Counterfactual Medical Image Generation using Language-guided Stable Diffusion}


% More complicate cases, e.g. with dual affiliations and joint authorship
\midlauthor{\Name{Amar Kumar\nametag{$^{1,2}$}} \Email{amar.kumar@mail.mcgill.ca}\AND
\Name{Anita Kriz\nametag{$^{1,2}$}} \Email{anita.kriz@mail.mcgill.ca} \AND
\Name{Mohammad Havaei\nametag{$^{3}$}} \Email{mhavaei@google.com} \AND
\Name{Tal Arbel\nametag{$^{1,2}$}} \Email{tal.arbel@mcgill.ca}\\
\addr $^{1}$ Center for Intelligent Machines, McGill University, Montreal, Canada. \\
\addr $^{2}$ Mila - Quebec AI Institute, Montreal, Canada. \\
\addr $^{3}$ Google Research, Montreal, Canada.
}

\begin{document}

\maketitle

\begin{abstract}

Developing reliable and generalizable deep learning systems for medical imaging faces significant obstacles due to spurious correlations, data imbalances, and limited text annotations in datasets. Addressing these challenges requires architectures that are robust to the unique complexities posed by medical imaging data. Rapid advancements in vision-language foundation models within the natural image domain prompt the question of how they can be adapted for medical imaging tasks. In this work, we present PRISM, a framework that leverages foundation models to generate high-resolution, language-guided medical image counterfactuals using Stable Diffusion. Our approach demonstrates unprecedented precision in selectively modifying spurious correlations (the medical devices) and disease features, enabling the removal and addition of specific attributes while preserving other image characteristics. Through extensive evaluation, we show how PRISM advances counterfactual generation and enables the development of more robust downstream classifiers for clinically deployable solutions. To facilitate broader adoption and research, we make our code publicly available at \url{https://github.com/Amarkr1/PRISM}.
 

\end{abstract}

\begin{keywords}
Counterfactual Image Synthesis, Diffusion, Foundation Models, Generative Models, Large Language Models
\end{keywords}

\section{Introduction}

The development of deep learning models in healthcare settings has the potential to transform current medical practices in disease diagnosis, biomarker discovery, and personalized treatment. However, clinical deployment requires robust models -- a standard that remains largely unmet due to the inherent complexities of medical imaging data. Class imbalances and spurious correlations can cause models to learn misleading patterns that are not penalized when optimizing the training objective. This flawed training paradigm results in incorrect disease classification, ultimately degrading the model's generalizability to real-world clinical scenarios. To address these challenges, the field has explored counterfactual (CF) generation to expose shortcut learning and alleviate data imbalance issues by augmenting underrepresented classes. Previous work has focused on classifier-guided counterfactual image generation methods, such as using standard classifiers with robust empirical risk minimization techniques~\cite{mertes2022ganterfactual, singla2019explanation} or classifiers based on distributionally robust optimization (Group-DRO)~\cite{kumar2023debiasing,fathi2024decodex}. An alternative approach leverages Structural Causal Models (SCMs) to explicitly model and intervene on causal relationships between attributes during the generation process; these methods also (largely) rely on classifiers to produce high-quality results~\cite{ribeiro2023high}. 
%However, achieving high-quality results often requires a classifier ~\cite{ribeiro2023high}. 
These methods expose a paradox in their formulation -- their performance is dependent on the same biased data (and classifiers) they are designed to mitigate (see Fig.~\ref{fig:motivation}). Moreover, end-to-end architectures face a tradeoff between competing objectives: high-quality generation demands fine-grained details, while classification relies on abstract features. %However, due to the limited availability of medical data and computation constraints, these challenges remain an open problem. 
Compounded by the computational burden of training high-capacity architectures from scratch, synthesizing high-resolution and precise CFs remains elusive.

\begin{figure}[th]
    \centering
    \includegraphics[width=\textwidth]{images/fig1_v3.png}
    \caption{Counterfactual (CF) explanations for a subject with `Pleural Effusion'. (a) Original chest radiograph of the subject; (b) The classifier-guided CF image fails to show changes in the diseased area, yet the classifier determines that the CF image is healthy. The classifier is biased and associates the disease with the medical device; (c) PRISM modifies the area of \textcolor{cyan}{disease pathology}, leaving the devices (e.g., pacemaker) unchanged.}
    \label{fig:motivation}
\end{figure}

% The capabilities of foundation models have been demonstrated in numerous tasks such as medical image generation~\cite{wang2024self}, visual question answering~\cite{li2024llava}, text-to-image generation, and image editing~\cite{perez2025radedit}. Given the impracticality and computational burden of training these memory- and data-intensive architectures from scratch, 

% To address the impracticality of training a foundation model from scratch, 
Fine-tuning foundation models has recently emerged at the forefront of deep learning for medical image analysis~\cite{wang2023real,dutt2023parameter,azad2023foundational}, outperforming existing state-of-the-art (SOTA) methods in tasks such as zero-shot classification~\cite{yuan2021florence}, out-of-distribution generalization~\cite{goyal2023finetune},  histopathology image classification~\cite{roth2024low}, and visual question answering~\cite{li2024llava}. In computer vision, many methods have been developed for high-resolution, language-guided image editing (e.g., Null-text Inversion~\cite{mokady2023null}, Imagic~\cite{kawar2023imagic}). BiomedJourney~\cite{gu2023biomedjourney} was the first work to fine-tune foundation models for counterfactual medical image generation via language prompts and achieved SOTA results. However, it was not designed to remove large confounding artifacts (e.g., medical devices) and is constrained to low-resolution images ($256 \times 256$). RadEdit~\cite{perez2025radedit} employs language-guided image editing to address biases from acquisition, manifestation, and population shifts. It uses two masks: one to define areas where edits can occur and another to maintain fidelity. This limits its ability to generate fully unconstrained counterfactuals.
%focuses exclusively on counterfactual generation for disease states, overlooking the challenging task of identifying and removing spurious correlations within images.
This raises a natural question:  \textit{Could we leverage a vision-language foundation model pre-trained on diverse natural images and adapt it to generate precise high-resolution medical image counterfactuals?}
 %to avoid being constrained by medical dataset-specific biases

In this work, we introduce PRISM (\textbf{Pr}ecise counterfactual \textbf{I}mage generation using language-guided \textbf{S}table Diffusion \textbf{M}odel), a strategically fine-tuned vision-language foundation model, 
% Stable Diffusion, 
that leverages language guidance to generate medical image counterfactuals for novel generative tasks (see Fig.~\ref{fig:arch}). Specifically, PRISM 
%achieves a significant breakthrough as 
presents the first framework to generate high-resolution ($512 \times 512$) medical counterfactuals that can selectively remove significant spurious artifacts, such as medical devices. Crucial for explainability in medical settings, it can isolate and modify individual disease attributes (and spurious correlations) while preserving others.
%— using only tabular data.
Existing approaches have relied on detailed clinician notes to train language models~\cite{zhang2023knowledge,luo2024devide}. In order to leverage the guidance of a language embedding, our framework adapts binary labels, typical for medical datasets, into text captions. %Through strategic fine-tuning of a Stable Diffusion model. 
%we achieve our results despite the limited data and computational resources typical in medical settings. %As much of the publicly available medical data lacks text annotations, our method uses tabular data to generate simple language descriptions of the images. Given the lack of data in medical datasets, we fine-tune a pretrained stable diffusion model. 
% We conduct extensive experiments on the publicly available CheXpert dataset ~\cite{irvin2019chexpert}, demonstrating that our method can synthesize high-quality 512 × 512 resolution images and generate counterfactuals for both disease and device states, all while maintaining invariance to fixed attributes.
% We introduce the first medical image counterfactual generation framework that uses fine-tuned vision-language models to produce high-resolution (512 × 512) counterfactuals from numerical metadata. 


Through extensive experimentation on the publicly available CheXpert dataset~\cite{irvin2019chexpert},
% we demonstrate our model's ability to selectively modify user-defined attributes, such as disease states or medical devices, while preserving other image features.
we validate our approach by (i) generating difference maps between the original and the synthesized CF image to assess the clinical plausibility of the disease, and  (ii) using multi-head classifiers to confirm that the counterfactuals are correctly classified. We also show improvement over a baseline classifier-guided GAN-based model, GANterfactual~\cite{mertes2022ganterfactual}. %We benchmark our method against an existing baseline, showing both qualitative and quantitative improvements in counterfactual generation. 
As a key demonstration of PRISM's utility, we show that our counterfactuals improve the accuracy of an existing classifier.


\section{Methodology}

While state-of-the-art vision-language foundation models in computer vision utilize millions of image-text pairs to generate images, their direct application to the medical domain is hindered by two key challenges. First, patient information is stored as tabular data (e.g., numerical labels for age or sex) rather than descriptive text%
% or elaborate doctor's notes
, limiting direct integration into existing vision-language models. Second, medical imaging datasets are significantly smaller than those in computer vision, making it impractical to train a foundation model from scratch. To address these shortcomings and enable CF generation, our methodology consists of three main steps: (i) convert patient tabular data into text format, enabling the generation of rich semantic embeddings via a pre-trained CLIP (Contrastive Language-Image Pre-training) text encoder, Section~\ref{sec:text}; (ii) fine-tune a Stable Diffusion model, to better adapt to a medical imaging dataset, Section~\ref{sec:finetune}; (iii) at inference, synthesize CF images guided by a text input, Section~\ref{sec:counterfactual}. 
\begin{figure}[t]
    \centering
    \includegraphics[width=\textwidth]{pdfs/arch_v2.pdf}
    \caption{The framework for synthesizing counterfactual (CF) images takes an original input image ($I_\text{orig}$) and its corresponding text prompt ($P_\text{orig}$), along with an edited text prompt ($P_\text{edit}$) for the CF image. It employs a frozen VAE consisting of an image encoder ($E_I$) and decoder, as well as a frozen CLIP text encoder ($E_P$). The core component of the framework is a denoising U-Net, which is the only trainable module during the fine-tuning process. During inference, the encoded text prompt ($E_P(P_\text{edit})$) is used to condition the U-Net, guiding the generation of a high-resolution $512\times512$ counterfactual image that aligns with the modified text description.} %The framework uses a stable diffusion finetuned on CheXpert data. The image latent ($E_I(X)$) combined with the original text embeddings ($E_T(T)$) synthesize an inverted image, while the same image latent combined with edited text embeddings ($E_T(T')$) synthesizes an edited image where the medical devices are removed as per the text.}
    \label{fig:arch}
\end{figure}
\subsection{Tabular Data to Text Conversion} \label{sec:text}
One of the key requirements of training a Stable Diffusion (specifically v1.5)~\cite{Rombach_2022_CVPR} model is the image-text pair. CheXpert, the medical dataset we use here, only contains binary labels for different diseases and the presence of support devices. To leverage Stable Diffusion, we create a custom template for image-text pairs based on the available tabular data (\bluetext{see code listing in Appendix~\ref{appen:implementation details}}). For example, if the subject's radiograph shows pleural effusion and cardiomegaly, our text caption for the image is \texttt{chest x-ray of a patient showing pleural effusion, cardiomegaly.} Additionally, for patients with no findings, we use the template text \texttt{Normal chest X-ray with no significant findings}.

\subsection{Fine-Tuning the Stable Diffusion Model} \label{sec:finetune}
The Stable Diffusion v1.5 architecture consists of three components: (i) the Variational Autoencoder (VAE), which encodes images into the latent space and subsequently decodes the processed latent representation back into image space; (ii) the U-Net, which operates at the latent level and is trained to predict and remove noise introduced during the forward diffusion process, enabling iterative image refinement; and (iii) the CLIP Encoder, which encodes text descriptions into a vector embedding that is used to condition the U-Net, guiding the image generation process to match the given text description. It should also be noted here that the CLIP model is already pre-trained, providing general semantic knowledge about image-text relationships.

%Given our objective of generating identity-preserving counterfactuals, we adopt the Denoising Diffusion Implicit Model (DDIM)~\cite{song2020denoising}. This method provides a deterministic mapping between the noisy and denoised latent. By conditioning the U-Net with a CLIP-encoded text, we gain direct control over the image generation process. For stability during fine-tuning, we update only the U-Net component of the model while keeping the VAE and CLIP encoder fixed. 
The conditional U-Net architecture learns to predict noise based on noisy latents (noisy image embeddings), timesteps (indicating noise level) and text embeddings (embeddings from the CLIP text encoder). A Mean Squared Error (MSE) loss is computed between the predicted and the actual noise. Finally, backpropagation updates only the U-Net weights, while the VAE and the CLIP text encoder remain frozen. % Additionally, the gradients are clipped during this backpropagation to prevent exploding gradients.} 
 We provide full pseudocode and details for implementing the fine-tuning process in Appendix~\ref{appen:implementation details}.

\subsection{Generating Counterfactuals at inference}\label{sec:counterfactual}

After fine-tuning the Stable Diffusion model on a medical imaging dataset, generating counterfactuals requires no extra training and is done at inference. To generate identity-preserving counterfactuals, we adopt the image-editing component of LANCE \cite{prabhu2023lance}, which combines DDIM with null-text inversion for precise image editing that maintains fidelity to the original image. The three main steps in CF generation include image inversion, image editing and image quality evaluation. Additional details for image editing are discussed in Appendix~\ref{appen:lgcigPRISM}.
% Counterfactuals should preserve the identity of the original image while accurately modifying it to belong to the target class (the opposite of the factual class)~\cite{mothilal2020explaining,nemirovsky2020countergan}. For instance, in the case of disease counterfactuals, the model should learn to retain the identity features of the original image while altering only the disease-related attributes. 
%To produce precise counterfactuals, the latent representations of the input image ($X$) and the CF edited text ($T'$) are used to align the denoising process. This combined representation is then passed through the decoder to synthesize the edited counterfactual image ($X'$). 
To produce a precise counterfactual image ($I_\text{CF}$), the language embeddings of the CF edit text ($P_\text{edit}$) are used as contexts within the U-Net to guide the denoising process applied to the diffused latent representation of the input image ($I_\text{orig}$). The text embeddings are incorporated into the denoising U-Net during the reverse diffusion process using cross-attention modules. %in order to produce the edited counterfactual image ($X'$). 
% This combined representation is then passed through the decoder to synthesize the edited counterfactual image ($X'$).
To quantify the alignment of the counterfactual image with the provided edited text, we use an editing score, $S_\text{CLIP}$ (Eq.~\ref{eq:edit_score}), which measures the similarity between the generated image and the intended textual modification. Following a similar approach to~\cite{prabhu2023lance}, we compute the editing score and directional similarity~\cite{gal2022stylegan} to filter out edited samples where $S_\text{CLIP}<0.1$. All details required to perform language-guided image editing are discussed in Appendix~\ref{appen:lgcigPRISM}.


\begin{comment}
    Here, $(I_{\text{orig}},T_\text{orig})$ and $(I_{\text{edit}},T_\text{edite})$ are the original and edited image-text pairs, $E_I$ is the image encoder and $E_T$ is the text encoder. 
\end{comment}

\begin{equation}\label{eq:edit_score}
    S_{\text {CLIP}}=\frac{\Delta I \cdot \Delta P}{\|\Delta I\|\|\Delta P\|}, \quad \text { where } \quad \begin{aligned}
\Delta I & =E_I\left(I_\text{CF}\right)-E_I\left(I_\text{orig}\right), \text { and } \\
\Delta P & =E_P\left(P_\text{edit}\right)-E_P\left(P_\text{orig}\right) 
\end{aligned}
\end{equation}

\begin{comment}
    \begin{algorithm2e}
    \caption{Image Editing}
    \label{alg:net}
    \KwIn{Original Image $x_0$, original prompt $c_\text{inv}$, editing prompt $c_\text{edit}$, diffusion model $f_\theta$, CFG weight $w$}
    \KwOut{ Edited image, $x_0^{'}$ }
    \Indp$\left(\hat{x}_{1: T}, z_{1: T}\right) \leftarrow \text { DDPMINVERSION }\left(x_0, c_{\text {inv }}\right)$\\
    $x_T\leftarrow \hat{x}_T$\\
    \For{$t\leftarrow T$ \KwTo $1$}{
      $y \leftarrow y + w_i*x_i$\;
    }
    \end{algorithm2e}
\end{comment}
% As an intermediatory check step, we also test the model's capacity to reconstruct the original image using image inversion (reconstruction) from the DDIM latent and original text (Appendix~\ref{append:image_inversion} provides additional details on this).
\section{Experiments and Results }
\subsection{Dataset and Implementation Details} 
We use the publicly available CheXpert dataset~\cite{irvin2019chexpert} that contains over 200,000 chest X-ray images, with binary labels for 14 diseases including the presence of support devices. Table~\ref{table:samples} shows a summary of the number of subjects in each split and their distributions. To demonstrate our method's versatility to other medical datasets, we additionally ran experiments on dermoscopic images from the publicly available ISIC dataset ~\cite{tschandl2018ham10000,codella2018skin,combalia2019bcn20000}, with results and details discussed in Appendix~\ref{append:ISIC}.

PRISM uses the default DDPM scheduler for fine-tuning the model `runwayml/stable-diffusion-v1-5'. There is a known tradeoff between a lower noise scheduler and diversity in the sampled results. Choosing a lower noise scheduler (12e-3 in this case) tends to produce a more detailed image with less noise and generates deterministic results. Additionally, the convergence is faster, as only a few sampling steps are required~\cite{song2020denoising}. The implications of this tradeoff should be explored in each context of interest. During image editing (CF synthesis), we utilize a DDIM scheduler with a \texttt{scaled\_linear} beta schedule, with \texttt{beta\_start} and \texttt{beta\_end} set to \texttt{85e-5} and \texttt{12e-3}, respectively. These parameters define the range of the noise variance ($\beta$) added at each timestep, which increases from \texttt{beta\_start} to \texttt{beta\_end}. Text similarity is computed based on \texttt{cosine\_similarity}. Additionally, for all the synthesized counterfactual images discussed in this manuscript, we use the same hyperparameters (e.g., denoising steps, DDIM scheduler) for all tasks, except the language-based command for each case. Thus, our proposed method does not need extensive hyperparameter tuning. We provide additional implementation details in Appendix~\ref{appen:implementation details} and the code along with model weights for the fine-tuned Stable Diffusion are publicly available at \url{https://huggingface.co/amar-kr/PRISM}.

\begin{table}[h]
\centering
\begin{tabular}{c|cccc}
\hline
\begin{tabular}[c]{@{}c@{}}\textbf{Attribute} $\rightarrow$\\ \textbf{Splits}$\downarrow$\end{tabular} & \begin{tabular}[c]{@{}c@{}}\textbf{Pleural}\\ \textbf{Effusion}\end{tabular} & \textbf{Cardiomegaly} & \begin{tabular}[c]{@{}c@{}}\textbf{No} \\ \textbf{Finding}\end{tabular} & \begin{tabular}[c]{@{}c@{}}\textbf{Support} \\ \textbf{Devices}\end{tabular} \\ \hline

Train & 62509 & 21888& 12222 & 78211 \\
Validation & 10996 & 3739 & 2161 & 13678 \\
Test & 12972  & 4515 & 2591 & 16196 \\ \hline
\end{tabular}
\caption{Summary of the number of samples for train, validation and test splits.}
\label{table:samples}
\end{table}

\label{sec:datasetandimplementation}
\subsection{Experiments and Metrics: Evaluating the Generated CF Images}
To establish baseline comparisons, we implement GANterfactual~\cite{mertes2022ganterfactual}, a classifier-guided CF image generation method.  We fine-tune an EfficientNet~\cite{tan2019efficientnet}, pre-trained on ImageNet, for a multi-head classification task: pleural effusion, cardiomegaly, no finding and support devices. This classifier is then used to verify the class of the CF images synthesized by our PRISM framework, ensuring that the generated CFs accurately reflect the desired modifications of the correct target class. It should be noted that the baseline method requires an image size of $224\times224$. 
%For baseline comparisons to a classifier-guided CF generation method~\cite{mertes2022ganterfactual}, we needed an accurate binary classifier to detect the presence (0/1) of medical support devices (e.g. pacemaker, wires) in the original and generated images. To this end, we trained AlexNet~\cite{krizhevsky2012imagenet}. 

To quantitatively evaluate the quality 
%and validate the class 
of synthesized counterfactual images, we use the following metrics:
(i) \textbf{Subject Identity Preservation} evaluates how well the subject-identifying characteristics are maintained while only modifying the targeted attribute. Following prior work~\cite{mothilal2020explaining, nemirovsky2020countergan}, this is calculated through the $L_1$ distance between the CF and factual images. 
%A lower $L_1$ score indicates minimal change to the CF image compared to the factual; 
(ii) \textbf{Counterfactual Prediction Gain (CPG)} \cite{nemirovsky2020countergan} measures the absolute difference in a classifier's predictions between factual and CF images. A higher CPG indicates a greater shift across the classifier's decision boundary. To this end, we trained a binary AlexNet \cite{krizhevsky2012imagenet} to detect the presence (1) or absence (0) of medical support devices (e.g. pacemakers, wires, tubes) in the original images. Then at inference, this AlexNet model measures the CPG score for the CF images synthesized by PRISM and the baseline method, respectively. 
%The CFs were synthesised to remove the medical device from the factual image.

A final set of experiments is devised in order to show that the synthesized CF images focus on the defining features of each disease  (such as pleural effusion occurring at the corner of the lungs or cardiomegaly surrounding the position of the heart). 
% To this end, CF images are generated with the instruction to remove the medical devices.
The training data for the original EfficientNet classifier is then augmented with these CF images. Each subgroup (Pleural Effusion, Cardiomegaly, No Finding, and Support Devices) is augmented with 2500 CF samples.
An increase in classifier accuracy suggests that synthesized counterfactual images enhance generalizability and robustness, enabling the classifier to identify defining disease features independent of potential confounding factors in the dataset. This is particularly important in the context of pleural effusion, which is correlated with the presence of medical devices. To validate the hypothesis that CF image augmentation enhances subgroup-level performance compared to generic augmentation, we perform a controlled experiment. In this setup, each subgroup is augmented with 2500 samples generated from the fine-tuned Stable Diffusion model with the following text prompts: \texttt{Chest X-ray of a patient with severe cardiomegaly and without support devices}, \texttt{Chest X-ray of a patient with no findings}, \texttt{Chest X-ray of a patient with pleural effusion and lots of support devices}, and \texttt{Chest X-ray of a patient with pleural effusion and without support devices}.

\subsection{Results}

\textbf{Classifiers} EfficientNet has a classification accuracy of 0.8, 0.87, 0.91 and 0.86 for pleural effusion, cardiomegaly, no finding and support devices, respectively (see first row of Table~\ref{table:quant_results}). The accuracy and AUC of the binary AlexNet classifier on a held-out test set are 0.89 and 0.91, respectively. These classifiers are used to measure the CPG scores reported in Table~\ref{tab:counterfactual_images}. 
%indicating the classifier can correctly identify the presence of medical devices.
\begin{table}
\centering{%
\begin{tabular}{ccc}
\hline & $L_1\downarrow$ & CPG$\uparrow$ \\ \hline
\multicolumn{1}{c|}{\begin{tabular}[c]{@{}c@{}}Baseline (classifier-guided GAN-based CF)\end{tabular}} & 0.091          & 0.781          \\
\multicolumn{1}{c|}{PRISM \textbf{[Ours]}}                                                                  & \textbf{0.031} & \textbf{0.845} \\ \hline
\end{tabular}
}
\caption{Quantitative results comparing the scores for the CF generated by a classifier-guided GAN-based method and PRISM, when asked to remove the medical devices. The high CPG indicates that PRISM synthesizes CF images that correctly change the class labels.}
\label{tab:counterfactual_images}
\end{table}

\noindent\textbf{Qualitative Evaluations} Our qualitative evaluation demonstrates two primary capabilities of our method: (i) the ability to remove and, for completeness, \textit{add} medical devices to the original image, and (ii) the ability to emulate distinct visual pathologies of different diseases. 

Chest radiographs contain a variety of medical devices~\cite{gambato2023chest} such as chest tubes for draining air, blood, or fluid from the pleural space, surgical clips that are often visible after procedures like axillary node dissection, or pacemakers that regulate heart rhythm, typically seen as a small box near the clavicle~\cite{mathew2019chest}. These devices vary in shape, size and position in the X-ray image. Our method, PRISM, can remove medical devices, demonstrating robust performance across various device types and positions without any external classifier-based supervision or image-level mask/annotations.
\begin{figure}[t] 
    \centering \includegraphics[width=0.95\textwidth]{pdfs/qual_results.pdf} 
    \caption{Sample pairs of original and CF images demonstrate the capability of PRISM to remove and add medical devices (e.g. wires, pacemaker) in high resolution. \textcolor[HTML]{0066CC}{Left}: CF images with \textcolor[HTML]{FF66FF}{medical devices} removed. Language guidance is $T$: \texttt{chest xray of the patient with lots of medical devices}, $T'$: \texttt{chest xray of the patient without medical devices}. Note that the baseline method cannot properly remove \textcolor[HTML]{FF66FF}{medical devices}; \textcolor[HTML]{00CC00}{Right}: CF images with added medical devices. Language guidance is $T$: \texttt{chest xray of the patient with no support devices}, $T'$: \texttt{chest xray of the patient with lots of support devices}.} 
    \label{fig:qualresults} 
\end{figure}
In Fig.~\ref{fig:qualresults}, we show how, by using language guidance, we can remove complex medical devices from the given image without altering the pathology of the disease. We also compare our framework to a baseline method, GANterfactual~\cite{mertes2022ganterfactual}, a classifier-guided CF generator. This method relies on the gradient from a pre-trained classifier for guidance and fails to remove devices from the image. 
%Figure \ref{fig:qualresults} shows the original and edited text prompts used to guide the image generation, alongside visual examples of the removal and addition of medical devices. These results demonstrate the model's ability to identify such devices, determine their appropriate placement, and reconstruct the underlying anatomy when they are removed.
Next, we evaluate our method's ability to effectively \textit{differentiate} between diseases during CF image generation. Specifically, Fig.~\ref{fig:explainability} demonstrates PRISM's performance in generating CFs for two diseases: Pleural Effusion and Cardiomegaly.
\begin{figure}[ht] 
    \centering 
    \includegraphics[width=\textwidth]{pdfs/new_qal_results.pdf} \caption{Sample pairs of original and edited images showcasing accurate, precise and high-resolution generated CFs for \textcolor{cyan}{disease pathology} explainability. The original ($T$) and edited text prompts ($T'$) are - \underline{Row 1}: $T$ - \texttt{chest x-ray of the patient with severe pleural effusion}, $T'$ - \texttt{chest x-ray of the patient with no finding}; \underline{Row 2}: $T$ - \texttt{chest x-ray of the patient with severe cardiomegaly}, $T'$ - \texttt{chest x-ray of the patient with no finding}. } 
    \label{fig:explainability} 
\end{figure}
The difference maps in Fig.~\ref{fig:explainability} demonstrate that our approach can identify and remove the target disease while preserving the anatomical features of the subject, as well as the devices and other artifacts outside the regions of the expected changes. In Appendix~\ref{appen:comparison_sota}, we provide additional qualitative comparisons between PRISM and state-of-the-art (SOTA) text-guided image editing methods, including Imagic~\cite{kawar2023imagic}, Null-text Inversion~\cite{mokady2023null}, and RadEdit~\cite{perez2025radedit}. The results demonstrate PRISM's ability to generate precise CF images that remain consistent with the original factual image, outperforming other methods.


\noindent\textbf{Quantitative Evaluations} To quantitatively evaluate our approach, we compare our method with GANterfactual, a classifier-guided GAN-based approach for generating counterfactuals. Table~\ref{tab:counterfactual_images} shows the results for the task of removing medical devices. The counterfactual images generated by GANterfactual show similar $L_1$ scores to those produced by our method, indicating that the synthesized images in both cases remain close to their factual counterparts. However, counterfactuals generated by PRISM achieve higher CPG scores, suggesting that these images are more effectively converted to the opposite class (see  Appendix~\ref{appen:image validation} for additional results).  %confirms that PRISM generates images that are classified as belonging to the opposite class.


% \textbf{Quantitative evaluations } In Table~\ref{tab:counterfactual_images}, CF images generated by a classifier-guided GAN-based method~\cite{mertes2022ganterfactual} show similar result for the metric L1 indicating the synthesized images in both cases are similar to their factual counterpart. CFs synthesized by FLAME have higher CPG scores indicating that these images belong to the opposite class. Appendix~\ref{appen:image validation} provides additional validation that FLAME synthesizes image belonging to the opposite class.
%\textbf{Downstream Classifier Performance with Augmented Counterfactuals} 
Table~\ref{table:quant_results} shows the results of re-training the classifier with CFs for the classes Pleural Effusion, Cardiomegaly, No Finding, and Support Devices. As shown, augmented training leads to improved classifier performance, demonstrating that incorporating CFs synthesized by PRISM enhances the model's robustness. Notably, this increase in performance is not observed when the original data is randomly augmented with samples from the fine-tuned stable diffusion model, thus supporting the hypothesis that CF augmentation specifically improves classifier performance.
% Table~\ref{table:quant_results} shows the results of re-training the classifier augmented with CFs of the class Pleural Effusion, Cardiomegaly and Support devices. Augmented training improves the performance of the classifier. This also indicates that augmenting with CFs synthesized from FLAME can be used to train robust classifiers.
\begin{table}[h]
\centering
\begin{tabular}{c|cccc}
\hline
 & \textbf{\begin{tabular}[c]{@{}c@{}}Pleural\\ Effusion\end{tabular}} & \textbf{Cardiomegaly} & \textbf{\begin{tabular}[c]{@{}c@{}}No \\ Finding\end{tabular}} & \textbf{\begin{tabular}[c]{@{}c@{}}Support \\ Devices\end{tabular}} \\ \hline
Original Data & 0.80 & 0.87 & 0.91 & 0.86 \\
\begin{tabular}[c]{@{}c@{}}Original Data + SD samples\end{tabular} & 0.82 & 0.86 & 0.91 & 0.85 \\
\begin{tabular}[c]{@{}c@{}}Original Data + PRISM CFs\end{tabular} & \textbf{0.88} & \textbf{0.90} & \textbf{0.92} & \textbf{0.88} \\ \hline
\end{tabular}
\caption{Augmented classifier accuracies using Efficient-Net: Synthetic samples from PRISM [second row - Original Data+SD (Stable Diffusion) samples] and CF images generated by PRISM [third row - Original Data + PRISM CFs] are used to augment the training dataset. The accuracies are reported on the same held-out test set.}
\label{table:quant_results}
\end{table}
\section{Conclusion}
Developing a generative model in the medical domain to produce high-quality counterfactuals requires a balance between image fidelity and controllability.
%Developing generative models in the medical domain requires balancing image quality with controllability to effectively produce counterfactuals.
In this work, we present PRISM, a fine-tuned vision-language foundation model for counterfactual medical image generation that addresses these challenges. PRISM is the first framework to use language guidance to synthesize high-resolution ($512\times512$) medical images consistent with their factual counterparts. We demonstrate our results through extensive experiments on the CheXpert dataset.
%-- a resolution previously unattained in medical CF image generation. 
Our approach generates precise and accurate CFs representing disease states 
%(observed via explainable difference maps) 
and is able to cleanly remove medical devices. 
%We showcase the model's capability to explain the disease and highlight its potential for enabling robust downstream classifiers. 
We make our code and fine-tuned model weights publicly available to the medical imaging community for further development. 
%in the hopes that the avenue of fine-tuning foundation models can be further explored by the medical imaging community. 
Future work will investigate the use of synthesized counterfactual images to build robust classifiers for out-of-distribution generalization, and to assess the disentanglement capacity of language-guided foundation models.

% In medical image analysis, vision foundation models trained on natural images can be effectively fine-tuned for improved performance. This paper presents a counterfactual generation framework that leverages the backbone of a stable diffusion and uses classifier-free guidance to synthesize counterfactual images. We show several downstream tasks such as image pre-processing, explainability and building robust classifiers. Qualitative analysis of our extensive experiments indicates that we can synthesize high-resolution, $512\times512$, counterfactual images consistent with factual images. 
\clearpage 
\section*{Acknowledgements}
The authors are grateful for funding provided by the Natural Sciences and Engineering Research Council of Canada, the Canadian Institute for Advanced Research (CIFAR) Artificial Intelligence Chairs program, Mila - Quebec AI Institute, Google Research, Calcul Quebec, Fonds de recherche du Québec (FRQNT),  and the Digital Research Alliance of Canada.
% Acknowledgements, references, and appendix do not count toward the page limit (if any)
% Acknowledgments---Will not appear in anonymized version
\bibliography{midl25_230}
\appendix
\newpage

%\section{Implementation Details}\label{appen:implementation details}
%Similar to~\cite{prabhu2023lance}, we discard the edited samples with $S_\text{CLIP}<0.2$.

\section{Additional Implementation Details and Code Listings}\label{appen:implementation details}
We provide additional steps for our implementation.
\begin{listing}[h]
    \inputminted{python}{finetune.py}
    \caption{Generating text for the images in CheXpert dataset}
    \label{code:listing1}
\end{listing}


\begin{algorithm2e}
\SetAlgoLined
\DontPrintSemicolon
\caption{Fine-tuning Stable Diffusion on CheXpert}
\textbf{Pre-trained Stable Diffusion model components}: unet, vae, textEncoder, tokenizer, noiseScheduler \\
\textbf{CheXpert dataset}: dataloader \\
\textbf{Optimizer}: optimizer
\SetKwFor{ForEach}{for each}{do}{end}
    \ForEach{batch in dataloader}{
        latents = vae.encode(batch[``image'']) \Comment{encode images into latent space}\;
        noise = sampleRandomNoise()     \Comment{add random noise to latents}\;
        timesteps = sampleRandomTimesteps()\;
        noisyLatents = noiseScheduler.addNoise(latents, noise, timesteps)\;
        encoderHiddenStates = textEncoder(batch[``inputIds''])    \Comment{encode text captions}\;
        noisePred = unet(noisyLatents, timesteps, encoderHiddenStates) \Comment{predict noise residual with U-net}\;
        loss = mseLoss(noisePred, noise) \Comment{compute pixel wise loss}\;
        backward(loss)  \Comment{backpropagate}\;
        optimizer.step()    \Comment{update weights}\;
        optimizer.zeroGrad()\;
    }

\end{algorithm2e}
\newpage


\subsection{Language-guided Image Editing using PRISM}\label{appen:lgcigPRISM}
To generate CF medical images with language guidance, PRISM adopts the image-editing technique used in LANCE~\cite{prabhu2023lance}, which combines Null-text inversion~\cite{mokady2023null} with Prompt-to-prompt~\cite{hertz2022prompt} attention manipulation. Algorithm~\ref{algo:arlgorithm_prism} presents the detailed pseudo-code outlining the three key steps involved in PRISM's image editing process: (i) image inversion, (ii) image editing, and (iii) quality evaluation of the generated image.
 
\noindent\textbf{Image Inversion:}
In the inversion stage, the objective is to recover a latent representation of the original image and optimize unconditional embeddings to ensure accurate reconstruction. First, the original image $I_{\text{orig}}$ is encoded into the latent space as $z_T$ using an image encoder $E_I$. A deterministic DDIM reverse diffusion then produces the latent sequence $\{z_T, z_{T-1}, \ldots, z_0\}$.

Unconditional embeddings $E_{\text{uncond}}$ are randomly initialized, while conditional embeddings $E_{\text{cond}}$ are derived from the original prompt $P_{\text{orig}}$. For each diffusion step (from $t = T$ to $t = 1$), a predicted latent $\hat{z}_{t-1}$ is computed using $E_{\text{cond}}$ and the current $E_{\text{uncond}}$. The mean squared error, $\mathcal{L} = \|\hat{z}_{t-1} - z_{t-1}\|^2$, is minimized via gradient descent to update $E_{\text{uncond}}$. This null-text inversion process aligns the latent representation with the original image, preserving its structure and style for accurate reconstruction and reliable editing. Figure \ref{fig:appendix_b} shows the original and inverted images, with many details preserved during generation. Notably, the model struggles with the small text found within the images, which we further discuss in Appendix \ref{sec:appendix_F}. When the original and inverted images are passed through the state-of-the-art classifier, the changes in multi-class logit values are close to zero. This confirms that the inversion process maintains relevant details needed for accurate image classification. 


\noindent\textbf{Image Editing:}
In this step, the model modifies the original image by initiating the denoising diffusion process from the inverted latent representation $z_0$ obtained at the end of the inversion step. The goal is to progressively refine this latent representation towards a clean, edited image while applying changes specified by the edited prompt.

The process begins by encoding the original prompt $P_{\text{orig}}$ and the edited prompt $P_{\text{edit}}$ into their respective conditional embeddings $E_{\text{cond}}^{\text{orig}}$ and $E_{\text{cond}}^{\text{edit}}$. For each timestep $t$ (from 1 to $T$), the model retrieves attention maps for both the original and edited prompts, $A_{\text{orig}}$ and $A_{\text{edit}}$, based on the current latent representation $z_{t-1}'$. Here, cross-attention is implemented similar to Prompt-to-prompt~\cite{hertz2022prompt}. %These attention maps guide the denoising step where the model refines the latent representation by incorporating the edited prompt while preserving relevant aspects from the original. 
Once the diffusion process is completed, the final counterfactual image $I_{\text{CF}}$ is decoded from the final latent representation $z_T'$.

% In the second step, the model edits the original image by deploying the denoising diffusion process starting from the noisy latent obtained during inversion $z_T$. The model progressively denoises this latent representation back to a clean image while incorporating the edited text prompt with cross-attention control.}

\noindent\textbf{Quality Evaluation:}
Once the image has been generated, the CLIP similarity score, $S_\text{CLIP}$ (as defined in Equation~\ref{eq:edit_score}), is used to assess the quality of the edits. This score evaluates the similarity between the generated and original images and the alignment of the image with the edited text prompt~\cite{prabhu2023lance}.


\begin{algorithm2e}[!htb]
\caption{Counterfactual Medical Image Generation using PRISM}\label{algo:arlgorithm_prism}

\SetAlgoLined

\KwIn{$I_{\text{orig}}$ (Original Image), $P_{\text{orig}}$ (Original Image Prompt), $P_{\text{edit}}$ (Edit Prompt), $E_I$ (Image Encoder), $E_P$ (Text Prompt Encoder), $f_\theta$ (Diffusion model) }
\KwOut{$I_{\text{CF}}$ (Counterfactual Image)}
\BlankLine
\textbf{Step 1: Image Inversion}\\
\quad Encode image to latent space: $z_T \leftarrow E_I(I_{\text{orig}})$\\
\quad Perform DDIM reverse diffusion to get latent sequence: $\{z_T, z_{T-1}, \ldots, z_0\}$\\
\quad $E_{\text{uncond}} \leftarrow \text{Random\_Initialize}()$\\
\quad $E_{\text{cond}} \leftarrow E_P(P_{\text{orig}})$\\

\Indp
\For{$t = T$ to $1$}{
    \quad \quad $\hat{z}_{t-1} \leftarrow \text{DDIM\_Reverse\_Step}(z_t, E_{\text{cond}}, E_{\text{uncond}})$\\
    \quad \quad $\mathcal{L} \leftarrow \|\hat{z}_{t-1} - z_{t-1}\|^2_2$ \Comment{MSE loss}\\
    \quad \quad Update $E_{\text{uncond}}$ via gradient descent to minimize $\mathcal{L}$
}
\Indm

\BlankLine

\textbf{Step 2: Image Editing}\\
\quad $z_0' \leftarrow z_0$\Comment{initialize with inverted latent}\\
\quad $E_{\text{cond}}^{\text{orig}} \leftarrow E_P(P_{\text{orig}})$\Comment{encode original prompt}\\
\quad $E_{\text{cond}}^{\text{edit}} \leftarrow E_P(P_{\text{edit}})$\Comment{encode edited prompt}\\
\Indp
\For{$t = 1$ \textbf{to} $T$}{
    \quad \quad $A_{\text{orig}} \leftarrow \text{Get\_Attention\_Maps}(z_{t-1}', E_{\text{cond}}^{\text{orig}})$\\
    \quad \quad $A_{\text{edit}} \leftarrow \text{Get\_Attention\_Maps}(z_{t-1}', E_{\text{cond}}^{\text{edit}})$\\
    \quad \quad $z_t' \leftarrow \text{Forward\_Step}(z_{t-1}', E_{\text{cond}}^{\text{edit}}, E_{\text{uncond}}, A_{\text{orig}}, A_{\text{edit}})$ \\ \Comment{Forward diffusion with attention control}
}
\Indm

\quad $I_{\text{CF}} \leftarrow \text{Decode}(z_T')$\\

\BlankLine
\textbf{Step 3: Evaluate Edit Quality}\\
\quad $S_{\text{CLIP}} \leftarrow \text{Evaluate\_CLIP}(I_{\text{orig}}, I_{\text{CF}}, P_{\text{orig}}, P_{\text{edit}})$ \Comment{CLIP similarity score}\\
\Return{$I_{\text{CF}}$}
\end{algorithm2e}




\begin{figure}
    \centering
    \includegraphics[width=\textwidth]{pdfs/appendix1.pdf}
    \caption{The inversion quality of the proposed generative model.}
    \label{fig:appendix_b}
\end{figure}
\clearpage

\section{Qualitative Comparisons to Image Editing Methods utilizing a Stable-Diffusion Backbone}\label{appen:comparison_sota}
This section presents additional qualitative comparisons to other language-guided image-editing methods that use a stable-diffusion backbone, namely Imagic~\cite{kawar2023imagic}\footnote{Since the original implementation is unavailable for Imagic, we use the code available at \url{https://github.com/ShivamShrirao/diffusers/blob/main/examples/imagic/train_imagic.py}.}, Null-text inversion~\cite{mokady2023null}\footnote{Source code for Null-text inversion is available at \url{https://github.com/google/prompt-to-prompt}.}, and RadEdit~\cite{perez2025radedit}\footnote{Source code for RadEdit is available at \url{https://huggingface.co/microsoft/radedit}.}.
\textbf{RadEdit} uses Stable Diffusion models fine-tuned on multiple chest x-ray datasets such as CheXpert, MIMIC-CXR and ChestX-ray8. The method employs two masks: an edit mask indicating the area where changes should be applied based on a text prompt, and a keep mask that ensures other critical regions remain unchanged. These masks are combined with classifier-free guidance to ensure that edits are localized and consistent. RadEdit is trained on approximately 487k chest radiographs (compared to PRISM, which is trained on 80k images).\\
\textbf{Imagic} follows a three-step approach for language-guided image editing: (i) text embedding optimization to generate images similar to the input image based on the target text; (ii) generative model fine-tuning to improve the fidelity to the input image while freezing the optimized embeddings; and (iii) linear interpolation between the target text embedding and the optimized embedding, after which the generative diffusion process uses this combined representation to generate the final edited counterfactual (CF) image.
\begin{figure}[!htb]
    \centering
    \includegraphics[width=0.95\linewidth]{pdfs/compare_imagic.pdf}
    \caption{Comparison between PRISM (our method), RadEdit~\cite{perez2025radedit}, Imagic~\cite{kawar2023imagic} and Null-text inversion~\cite{mokady2023null} for the task of removing support devices from the original image. The edit text for PRISM, Imagic and Null-text Inversion was \texttt{Chest x-ray of a subject without support devices} while for RadEdit it was \texttt{remove support devices}. Note that RadEdit and Imagic are unable to remove support devices from the given image while Null-text inversion changes the patient's attributes. PRISM, Imagic and Null-text Inversion also use the same fine-tuned Stable Diffusion for image editing, while RadEdit uses its publicly released weights.}
    \label{fig:comp_imagic_null}
\end{figure}
\\
\textbf{Null-text inversion} uses DDIM inversion to map the input image to a sequence of noised latent codes that serve as pivotal latent codes, a reference point for further optimization. Next, the classifier-free guidance involves predicting noise twice: once conditionally with a text prompt and once unconditionally (using a null-text embedding). By optimizing around the pivotal latent codes, the null-text embedding is adjusted to align with the pivotal codes, allowing for efficient and high-fidelity editing of images using text prompts.

Fig.~\ref{fig:comp_imagic_null} shows that PRISM performs significantly better than RadEdit, Imagic, and Null-text inversion at removing devices from the original image.
It should be noted that Imagic and Null-text inversion were originally deployed with Stable Diffusion 1.4. For a fair comparison to PRISM, these two methods use the same fine-tuned model as PRISM for synthesizing the images in Fig.~\ref{fig:comp_imagic_null}.

\subsection{Sequential Image Editing}
The image-editing performance of PRISM was evaluated against RadEdit~\cite{perez2025radedit} in sequential image-editing scenarios. Fig.~\ref{fig:successive_edits} presents a comparative demonstration where both methods were tasked with first adding and then removing a medical support device from an image. While RadEdit successfully added medical devices to the image, it shows limitations when attempting to remove these same devices.
\begin{figure}[h]
    \centering
    \includegraphics[width=0.85\linewidth]{pdfs/successive_edits.pdf}
    \caption{\textbf{Sequential editing comparison:} RadEdit (top) and PRISM (bottom) first add a support device to the original image. When prompted to remove these \textcolor[HTML]{FF66FF}{support devices}, RadEdit fails or only partially succeeds, while PRISM successfully removes them completely. Note that RadEdit operated without masks in all experiments.}
    \label{fig:successive_edits}
\end{figure}

\clearpage
\section{Application of PRISM on ISIC Dataset}\label{append:ISIC}
We extend the applicability of PRISM to a different imaging modality to show its effectiveness. We use the ISIC 2019 dataset~\cite{tschandl2018ham10000,codella2018skin,combalia2019bcn20000}, a large-scale collection of dermoscopic images for skin cancer detection and classification. The 2019 version of the dataset contains 25,331 dermoscopic images across 8 different categories such as Melanoma (MEL), Melanocytic nevus (NV), Basal cell carcinoma (BCC), Actinic keratosis (AK), Benign keratosis (BKL), Dermatofibroma (DF), Vascular lesion (VASC), Squamous cell carcinoma (SCC). These dermoscopic images also contain artifacts such as dark corners, hairs, gel bubbles, rulers, ink, and patches.

As done for the CheXpert data in this manuscript, the tabular information is converted to sentences using the template \texttt{a dermoscopic image with [disease] showing [artifacts]} (Fig.~\ref{fig:isic_data}). Due to the limited number of samples available across different skin cancer types, we consider MEL and NV only as the \texttt{disease} types; and hairs, gel bubbles, rulers, and ink as the \texttt{artifacts}. Thus, Stable Diffusion v1.5 is trained on 12,000 dermoscopic images for 50 epochs.
\begin{figure}[h]
    \centering
    \includegraphics[width=0.65\linewidth]{images/isic.jpg}
    \caption{Deploying PRISM to remove/add artifacts on the dermoscopic images in ISIC data. The corresponding text prompts are available at the bottom of the image. Note the selective removal of hair (left) or addition of gel bubbles (right) to the factual images.}
    \label{fig:isic_data}
\end{figure}
\clearpage
\section{Classifier Performance on the Synthesized CF Images}
We use the classifier, Efficient-Net, in Table~\ref{table:quant_results} to validate the changes made when synthesizing CF images. Classifications across all heads of the classifier, along with the corresponding original and counterfactual images, are presented in Fig.~\ref{fig:appendix_classifier}. As shown, the intervened-upon attribute is successfully pushed across the decision boundary, while all other attributes retain their original classification. Notably, even when multiple attributes are present in the original image, only the targeted attribute undergoes a shift across the decision boundary, which is verified by the resulting counterfactual image. This demonstrates our model's ability to precisely distinguish and modify each attribute as intended.
\begin{figure}[h]
    \centering
    \includegraphics[width=0.75\textwidth]{pdfs/appendix-classifier.pdf}
    \caption{Classifier's performance on the original (left) and CF images (right). Note that the classifier is robust to changes made in the CF image. Text indicated in \textcolor[HTML]{00994D}{green} shows the ground truth for the given image.}
    \label{fig:appendix_classifier}
\end{figure}
\newpage
\section{Performance of the robust classifier}

To evaluate the utility of counterfactuals synthesized from PRISM for downstream tasks, we augment our dataset and retrain the original EfficientNet multi-head classifier (see Table~\ref{table:quant_results}). Notably, the original classifier, trained without augmented counterfactuals, continues to detect support devices even after their removal—likely due to the correlation between pleural effusion and medical devices in the dataset. By incorporating CF augmentation, the classifier learns the true features associated with the medical device, reducing its reliance on correlations with the disease, see Fig.~\ref{fig:robust_classifier}.
% After retraining with the CFs synthesized from PRISM, there is an improvement in the original EfficientNet multi-head classifier, see Table~\ref{table:quant_results}.
% This improvement in the performance of \textcolor[HTML]{00BFFF}{robust classifier} can be attributed to the fact it learns features associated with the attribute (medical device), see Fig.~\ref{fig:appendix_classifier}.
\begin{figure}[h]
    \centering
    \includegraphics[width=0.75\textwidth]{pdfs/robust_classifier.pdf}
    \caption{\textcolor[HTML]{00994D}{Original classifier} and \textcolor[HTML]{00BFFF}{robust classifier} performance on the original (left) and CF images (right) for the text prompt to remove medical devices. Note that the \textcolor[HTML]{00BFFF}{robust classifier} is able to correctly identify the absence of medical devices in the CF image while the \textcolor[HTML]{00994D}{original classifier} fails. Text indicated in \textcolor[HTML]{00994D}{green} in the image shows the ground truth for the given image.}
    \label{fig:robust_classifier}
\end{figure}
\newpage
\section{Validation of Image Modification Using State-of-the-Art VQA Models}\label{appen:image validation}
To verify that the image modifications reflect expected anatomical changes, we use state-of-the-art Visual Question Answering (VQA) models to classify the images and analyze the disease-related features. We apply two such models: Claude 3.5 Sonnet and LLaVA-Med. These models were chosen as they achieved high performance in disease diagnosis~\cite{kurokawa2024diagnostic}.
%, supporting their use in this context. 
During inference, we input an image $X$ and its corresponding edited image $X'$ after modifying the text $T$: ``an x-ray with severe pleural effusion'' to $T'$: ``an x-ray with no pleural effusion''. The resulting correspondence with the VQA models is shown in Table~\ref{tab:medical_comparison}. The responses from the VQA models indicate not only a successful shift in classification but also correct reasoning, aligning with the desired modifications.


\begin{table}[h] 
    \centering
    \begin{tabular}{|>{\centering\arraybackslash}m{0.2\textwidth}|>{\centering\arraybackslash}p{0.1\textwidth}|p{0.6\textwidth}|}
        \hline
        \multirow{3}{*}{%
            \begin{minipage}{0.2\textwidth}
                \centering
                \vspace{0.8cm}
                \includegraphics[width=\linewidth]{pdfs/patient53841_pe.pdf}
                \vspace{0.2cm}
                \small{Original Image}
            \end{minipage}%
        } 
        & \small{Question} & \small{Does this image have pleural effusion?} \\
        %\cdashline{2-3}
        & \small{Claude 3.5 Sonnet} & \small{\texttt{Yes, this chest X-ray shows evidence of pleural effusion. There appears to be blunting of the right costophrenic angle and increased opacity in the lower right hemithorax, which are characteristic radiographic signs of pleural effusion. The effusion appears to be unilateral (on the right side) and moderate in size based on this frontal (PA/AP) view.}} \\
        %\cdashline{2-3}
        & \small{LLaVA-Med} & \small{\texttt{Yes, the image shows bilateral pleural effusion, which means there is an accumulation of fluid in the pleural space on both sides of the lungs.}} \\
        \hline
        \multirow{3}{*}{%
            \begin{minipage}{0.2\textwidth}
                \centering
                \vspace{0.8cm}
                \includegraphics[width=\linewidth]{pdfs/patient53841_pe_edit.pdf}
                \vspace{0.2cm}
                \small{CF Image}\\ \textbf{[Ours]}
            \end{minipage}%
        }
        & \small{Question} & \small{Does this image have pleural effusion?} \\
        %\cdashline{2-3}
        & \small{Claude 3.5 Sonnet} & \small{\texttt{This chest X-ray appears to be normal without evidence of pleural effusion. The costophrenic angles (the corners where the ribs meet the diaphragm) are sharp and clear bilaterally, which is a key finding that suggests absence of pleural effusion. The lung fields appear symmetrical with normal transparency, and there is no blunting of the costophrenic angles or visible fluid levels that would indicate pleural effusion.}} \\
        %\cdashline{2-3}
        & \small{LLaVA-Med} & \small{\texttt{No, the chest X-ray image does not show any pleural effusion.}} \\
        \hline
    \end{tabular}
    \caption{Validation of the original and edited image using state-of-the-art VQA models.}
    \label{tab:medical_comparison}
\end{table}

%\newpage
%\section{Visualization of the improved and robust classifier}

\newpage
\section{Role of Confounders in CF Generation}
An interesting correlation we observed when generating counterfactuals to explain the disease cardiomegaly (an enlarged heart) was the removal of the pacemaker. Note that a pacemaker can be a treatment for cardiomegaly. Indeed, when removing cardiomegaly, the CF shows that the pacemaker is also removed (Fig.~\ref{fig:pacemaker-confounder}), whereas other devices, such as EKG (electrocardiogram) stickers and tubes, are maintained (see the final row of Fig.~\ref{fig:appendix_classifier} and the bottom left example in Fig.~\ref{fig:pacemaker-confounder}). In the context where the original image contains both pleural effusion and cardiomegaly, the CF image with the task of removing cardiomegaly also removes the pacemaker. However, when generating a CF image to remove pleural effusion, no such effect occurs (Fig.~\ref{fig:pacemaker-confounder-pe-cardio}). This suggests that the model associates the presence of a pacemaker specifically with cardiomegaly but not with pleural effusion.
 

\begin{figure}[h]
    \centering
    \includegraphics[width=0.95\textwidth]{pdfs/appendix-d_pacemaker.pdf}
    \caption{Samples showing the removal of cardiomegaly from original images containing a pacemaker. Note that our method removes both the disease (cardiomegaly) and the pacemaker.}
    \label{fig:pacemaker-confounder}
\end{figure}

\begin{figure}[h]
    \centering
    \includegraphics[width=0.95\textwidth]{pdfs/appendix-d_pacemaker2.pdf}
    \caption{Comparing the change from original image with both cardiomegaly and pleural effusion to two different CFs. Note that when synthesizing the CF image with no pleural effusion the pacemaker is retained.}
    \label{fig:pacemaker-confounder-pe-cardio}
\end{figure}

\newpage
\section{Validation: CF generation in Challenging Cases}
\label{sec:extreme_case}
\begin{figure}[h]
    \centering
    \includegraphics[width=0.85\textwidth]{pdfs/extreme-cases.pdf}
    \caption{Samples showing challenging cases. (a,c): Original images with devices; (b,d): CF images without medical devices.}
    \label{fig:extreme-cases}
\end{figure}
To demonstrate the robustness of PRISM, we examine cases that are particularly challenging to edit due to the placement of devices outside the field of view or devices in regions with bone structures. As shown in Fig.~\ref{fig:extreme-cases} (a), the device cables are located in low-light conditions near the arm.  Fig.~\ref{fig:extreme-cases} (b) shows the edited image where the cables are accurately removed by our model without impacting the humerus. In Fig.~\ref{fig:extreme-cases} (c), the artificial shoulder joint creates high-intensity pixels. The corresponding edited image in Fig.~\ref{fig:extreme-cases} (d) shows the successful removal of the joints, replacing the affected pixels with feasible anatomical structures for the region. The structures in other areas are not altered. These examples demonstrate the robustness of the proposed method in challenging settings.

\newpage
\section{Limitations of PRISM} \label{sec:appendix_F}
%As an important first step in using fine-tuned language-vision foundation models in the medical imaging domain, PRISM is not without its limitations. 
Although our method is capable of synthesizing high-resolution images ($512\times512$), it faces difficulties in reproducing the small text written in the corner of radiographs (Fig.~\ref{fig:limitation-text}) in both the inverted and CF images. This inability of Stable Diffusion to resolve fine text is a known phenomenon and is also seen in natural images~\cite{mokady2023null}.
%. This resolution loss in small text is also present in the image inversion, as demonstrated in Fig. \ref{fig:appendix_b}.
\begin{figure}[h]
    \centering
    \includegraphics[width=\textwidth]{pdfs/limitation1.pdf}
    \caption{Text at the corner of the image remains unresolved in the inverted and edited images.}
    \label{fig:limitation-text}
\end{figure}

%While our model demonstrates robust capabilities in differentiating between artificial and anatomical features in extreme cases (as discussed in Appendix \ref{sec:extreme_case}), 
There are challenging settings in which the model struggles to maintain consistent edits. This variation is partly dependent on the complexity of the image. For example, if there is significant overlap between the support devices and anatomical features such as bone (as in Fig.~\ref{fig:limitation-image}~(c)), the model attempts to remove the device and creates regions that change the identity of the subject. In cases where the original image is distorted, the CF image deviates from expected changes (see Fig.~\ref{fig:limitation-image}).
\begin{figure}[h] 
    \centering 
    \includegraphics[width=\textwidth]{pdfs/incorrect_cases.pdf} \caption{Examples of original (a, c, e) and CF image (b, d, f) pairs. The command was to remove the support device, and the edits were inconsistent with the expected outcome. Red boxes highlight areas where the changes are not as intended. (a-b): The radiograph shows a problem with the original image (at the bottom). The edited image incorrectly modifies this region instead of retaining it. (c-d): The red-boxed region contains multiple tubes. While removing the tubes, the model recreates the missing anatomical area improperly. (e-f): When removing the medical devices, the subject is depicted more strongly as female.} \label{fig:limitation-image} 
\end{figure}
\end{document}