\section{Supplemental Results}
\label{appendix:results}

\subsection{SemiSynCXR for Supplementary Training Data Generation -- Per-Finding Results}
\label{subsec:90_augment-vdcxr}

\begin{table}[ht!]
    \centering
    \caption{Per-finding effect of SemiSynCXR-generated CXRs as supplementary training data on in-distribution localization performance. We report AP$_{10:70}$ on VinDr-CXR, using YOLO11n and YOLOv8n detectors trained on VinDr-CXR with varying quantities of our semi-synthetic images. Augmenting with our data improves localization performance across nearly all findings, with gains especially observed for Pneumothorax (up to 96\% relative increase) and Consolidation (up to 35\% relative increase).}
    \label{tab:90_augment-vdcxr}
    \footnotesize
    \begin{tabular}{@{}lrrccccccc@{}}
        \toprule
        \multicolumn{1}{l}{\textbf{Model}} & \multicolumn{2}{c}{\textbf{Training Data}} & \multicolumn{7}{c}{\textbf{VinDr-CXR Localization [AP$_{10:70}$ (\%) $\uparrow$]}} \\ \cmidrule(lr){2-3} \cmidrule(lr){4-10}
        \multicolumn{1}{c}{} & \multicolumn{1}{c}{Real} & \multicolumn{1}{c}{Synth} & \multicolumn{1}{c}{Atel.} & \multicolumn{1}{c}{Cmgl.} & \multicolumn{1}{c}{Cnls.} & \multicolumn{1}{c}{Opac.} & \multicolumn{1}{c}{P.Eff.} & \multicolumn{1}{c}{Pneum.} & \multicolumn{1}{c}{Avg.} \\ \midrule
        YOLO11n & $15$k & -- & 1.9 & 64.1 & 14.4 & 2.4 & 31.7 & 16.8 & 21.9 \\
         & $15$k & $7$k & \textbf{2.9} & 63.9 & 15.9 & 2.3 & \textbf{31.9} & 20.8 & 22.9\\
         & $15$k & $17.5$k & 2.8 & \textbf{66.5} & 16.3 & \textbf{2.7} & 30.2 & 16.5 & 22.5\\
         & $15$k & $35$k & 1.4 & 66.0 & \textbf{19.4} & 2.1 & 31.4 & \textbf{25.0} & \textbf{24.2} \\ \midrule
        YOLOv8n & $15$k & -- & \textbf{3.2} & 64.2 & 15.9 & 2.8 & 32.4 & 12.8 & 21.9\\
         & $15$k & $7$k & 1.6 & 64.4 & \textbf{16.2} & \textbf{3.4} & 30.8 & 21.4 & 22.9\\
         & $15$k & $17.5$k & 3.0 & \textbf{66.6} & 15.8 & 3.0 & \textbf{32.6} & 20.9 & 23.6\\
         & $15$k & $35$k & 2.2 & 64.8 & 15.4 & \textbf{3.4} & 31.3 & \textbf{25.1} &  \textbf{23.7}\\ \bottomrule
    \end{tabular}
\end{table}

\subsection{Visual and Visual-Text Alignment}
\label{subsec:90_vt_alignment}
We measure the visual alignment of our generated images with the MIMIC-CXR dataset using the Fréchet Inception Distance (FID) score, obtained with InceptionV3 (layer 2048) \cite{inception}. To also assess the visual–text alignment, we employ the CLIPScore based on the CXR-CLIP model \cite{cxrclip} and the XrayCLIP model \cite{xrayclip}\footnote{StanfordAIMI/XrayCLIP\_\_vit-l-14\_\_laion2b-s32b-b82k}, which measures the similarity between each generated semi-synthetic image and its corresponding textual prompt.

The general and finding-level results are presented in \tableref{tab:90_vt_alignment,tab:90_vt_aligment_finding}, respectively. Our approach achieves performance comparable to most existing methods, with only LLM-CXR yielding a substantially better FID. However, a direct comparison of these scores across studies should be interpreted with caution because (i) the scores are computed on different subsets of finding classes, which alters the distribution of generated images, and (ii) the textual prompt distributions vary across studies, which can influence the CLIPScore. Overall, our method demonstrates image quality on par with state-of-the-art models, while uniquely providing ground-truth bounding boxes for the findings.

\begin{table}[ht!]
    \centering
    \caption{Visual alignment (FID, InceptionV3) and visual–text (CLIPScore, CXR-CLIP) alignment of \emph{SemiSynCXR}-generated CXRs. Our approach performs comparably to most existing methods, with only LLM-CXR achieving a notably better FID. These results, while broadly comparable, should be interpreted with caution due to differences in the findings considered and textual prompt distributions across studies. Overall, \emph{SemiSynCXR} achieves competitive image quality while providing precise bounding boxes for radiological findings.}
    \label{tab:90_vt_alignment}
    \small
    \begin{tabular}{@{}lcc@{}}
        \toprule
        \textbf{Model} & FID$_{\textnormal{InceptionV3}}$ $\downarrow$ & CLIPScore$_{\textnormal{CXR-CLIP}}$ $\uparrow$ \\ \midrule
        RoentGen \cite{c71roent} (synthetic) & 64.60\textsuperscript{$\ddagger$} & 0.29\textsuperscript{$\dag$}\textsuperscript{$\ddagger$} \\
        LLM-CXR \cite{llmcxr} (synthetic) & \textbf{22.75}\textsuperscript{$\ddagger$} & 0.20\textsuperscript{$\dag$}\textsuperscript{$\ddagger$} \\
        XReal \cite{xreal} (synthetic) & 55.12\textsuperscript{$\ddagger$} & -- \\
        CXRL \cite{RLCXR} (synthetic) & -- & \textbf{0.34}\textsuperscript{$\ddagger$}\\ \hdashline[0.5pt/2pt]
        SemiSynCXR (ours) & 63.99 & 0.30 \\ \bottomrule
    \end{tabular}
    \\
    \raggedright
    \textsuperscript{$\dag$}: Scores as reported in CXRL \cite{RLCXR}.\\
    \textsuperscript{$\ddagger$}: Limited comparability as these scores are averaged over a different set of finding classes.
\end{table}

\begin{table}[ht!]
    \centering
    \caption{Visual alignment and visual–text alignment per radiological finding, measured using FID (InceptionV3) and CLIPScore (CXR-CLIP, XrayCLIP), respectively.}
    \label{tab:90_vt_aligment_finding}
    \small
    \begin{tabular}{@{}lcccccccc@{}}
        \toprule
         & Atel. & Cmgl. & Cnls. & Edema & Opac. & P. Eff. & Pneum. & Avg. \\ \midrule
        \textbf{Visual Alignment} \\
        FID$_{\textnormal{InceptionV3}}$ $\downarrow$ & 61.62 & 73.22 & 63.24 & 67.09 & 58.57 & 61.34 & 62.84 & 63.99 \\ \midrule
        \textbf{Visual-Text Alignment}\\
        CLIPScore$_{\textnormal{CXR-CLIP}}$ $\uparrow$  & 0.28 & 0.28 & 0.34 & 0.14 & 0.29 & 0.42 & 0.36 & 0.30\\ 
        CLIPScore$_{\textnormal{XrayCLIP}}$ $\uparrow$   & 0.21 & 0.23 & 0.21 & 0.22 & 0.21 & 0.27 & 0.27 & 0.23\\ \bottomrule
    \end{tabular}
\end{table}

\begin{table}[ht!]
    \centering
    \caption{Qualitative evaluation of our generated CXRs. Three medical experts assessed 140 scans (70 generated, 70 real with findings). On average, 36\% of generated images were judged as real, and the intended finding was correctly identified in 54\% of generated cases, suggesting that many inpainted findings are recognizable.}
    \label{tab:90_qualitative}
    \small
    \begin{tabular}{@{}lcccccccc@{}}
        \toprule
         & Atel. & Cmgl. & Cnls. & Edema & Opac. & P. Eff. & Pneum. & Avg. \\ \midrule
        \textbf{Generated Images} \\
        As Real [FPR $\uparrow$] & 0.10 & \textbf{0.50} & 0.30 & 0.40 & 0.37 & 0.47 & 0.40 & 0.36 \\
        With Finding [TPR $\uparrow$] & 0.60 & 0.57 & \textbf{0.83} & 0.33 & 0.20 & 0.77 & 0.50 & 0.54 \\ \midrule
        \textbf{Real Images}\\
        As Real [TPR $\uparrow$] & 0.77 & 0.63 & 0.80 & 0.77 & 0.63 & 0.70 & 0.70 & 0.71 \\
        With Finding [TPR $\uparrow$] & 0.13 & 0.43 & 0.20 & 0.17 & 0.13 & 0.53 & 0.37 & 0.28 \\ \bottomrule
    \end{tabular}
\end{table}

\subsection{Qualitative Assessment}
\label{subsec:90_qualitative}
We conducted a systematic study to qualitatively evaluate our generated CXRs. Three medical experts rated 140 randomly selected CXRs: 70 generated images (10 per finding) and 70 real images (10 per finding). For each CXR, raters identified (i) whether the image was real (realism) and (ii) which finding was present (finding recognition). Results, presented in \tableref{tab:90_qualitative}, detail the False Positive Rate (FPR) for judging a generated image as real and the True Positive Rate (TPR) for correctly identifying the intended finding. For reference, we further report the TPRs in real images for realism and finding recognition.

On average, 36\% of the 70 generated images are judged as real (vs.\ 71\% on real images), and the intended finding is correctly identified in 54\% of the generated cases (vs.\ 28\% on real images). Pleural effusion and cardiomegaly demonstrate high rates for both realism and finding recognition. Atelectasis, consolidation, and pneumothorax also have high finding recognition rates, though there is room for improvement in their realism.
Overall, these results suggest that many inpainted findings are recognizable, even though image generation artifacts may remain.

\subsection{Ablation Studies}
\label{subsec:90_ablation}
We perform an ablation study comparing editing pipelines (diffusion model with blending strategy -- \sectionref{subsec:3_edit}) under different mask blurring parameters. Specifically, we evaluated four pipelines: RoentGen with blending before, RoentGen with blending after, RoentGen with CFG masking, and RadEdit with CFG masking. The overall generation quality score, as defined in \sectionref{subsec:3_optimal_config}, is presented in \figureref{fig:90_ablation_edit}. RoentGen generally achieves better or comparable performance to RadEdit, except for pneumothorax and consolidation, where RadEdit is notably superior. The choice of blending strategy has only a minor impact. Additional mask conditioning results are presented in \figureref{fig:90_ablation_config,fig:90_ablation_stop}.

\begin{figure}[th!]
    \centering
    \includegraphics[width=\textwidth]{imgs/90_ablation_edit.pdf}
    \caption{Ablation study on the different editing pipelines (RoentGen with blending before, RoentGen with blending after, RoentGen with CFG masking, and RadEdit with CFG masking). For each setting, we consider multiple mask blurring parameters shown as uncertainty intervals (\sectionref{subsec:3_mask}) and compute the overall generation quality score following \sectionref{subsec:3_optimal_config}. 
    We found that for most findings RoentGen generally performs better than or on par with RadEdit while the blending pipeline does not have a huge impact. However, note that for pneumothorax and consolidation RadEdit performs notably better.}
    \label{fig:90_ablation_edit}
\end{figure}

\begin{figure}[th!]
    \centering
    \includegraphics[width=\textwidth]{imgs/90_ablation_config.pdf}
    \caption{Ablation study on different blending pipelines and mask conditioning settings. We consider four different pipelines (RoentGen with blending before, RoentGen with blending after, RoentGen with CFG masking, and RadEdit with CFG masking) and vary the number of steps where mask conditioning is used. We enable mask conditioning for a specified percentage of steps ($x$-axis) before dropping it. For each setting, we consider multiple mask blurring parameters (uncertainty intervals in the graph) and compute the overall generation quality score, following \sectionref{subsec:3_optimal_config}.}
    \label{fig:90_ablation_config}
\end{figure}

\begin{figure}[th!]
    \centering
    \includegraphics[width=\textwidth]{imgs/90_ablation_stop.pdf}
    \caption{Example on the effect of using mask conditioning for a specified percentage of steps when inpainting edema with our approach.}
    \label{fig:90_ablation_stop}
\end{figure}

\clearpage
\subsection{Examples of CXRs Generated by \emph{SemiSynCXR}}
\label{subsec:90_visual}
\begin{figure}[th!]
    \centering
    \subfigure[Atelectasis]{\includegraphics[width=\textwidth]{imgs/90_atelectasis.eps}}
    \subfigure[Cardiomegaly]{\includegraphics[width=\textwidth]{imgs/90_cardiomegaly.eps}}
    \subfigure[Consolidation]{\includegraphics[width=\textwidth]{imgs/90_consolidation.eps}}
\end{figure}%
\begin{figure}[ht]
    \subfigure[Edema]{\includegraphics[width=\textwidth]{imgs/90_edema.eps}}
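    % NOTE(review): this panel previously included imgs/90_edema.eps (a duplicate of the Edema panel); confirm the lung-opacity file name.
    \subfigure[Lung Opacity]{\includegraphics[width=\textwidth]{imgs/90_lungopacity.eps}}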
    \subfigure[Pleural Effusion]{\includegraphics[width=\textwidth]{imgs/90_pleuraleffusion.eps}}
\end{figure}%
\begin{figure}[ht]
    \subfigure[Pneumothorax]{\includegraphics[width=\textwidth]{imgs/90_pneumothorax.eps}}
    \caption{Additional examples of images generated by our \emph{SemiSynCXR} framework, extending \figureref{fig:4_visual}. We show the real, healthy chest X-rays (top) and edited versions (bottom). The red outlines correspond to the conditioning masks alongside their non-blurred version, which serve as training targets (bounding boxes) for localization models.}
    \label{fig:90_visual}
\end{figure}