\documentclass{midl} % Include author names

% The following packages will be automatically loaded:
% jmlr, amsmath, amssymb, natbib, graphicx, url, algorithm2e
% ifoddpage, relsize and probably more
% make sure they are installed with your latex distribution
\usepackage{graphicx}
% \usepackage{subcaption}
% \usepackage{subfigure}
\usepackage{booktabs}
\usepackage{hyperref}
% \hypersetup{hidelinks,
% 	colorlinks=true,
% 	allcolors=black,
% 	pdfstartview=Fit,
% 	breaklinks=true}
\usepackage{multirow}
\usepackage{xcolor}

% \newcommand{\revise}[1]{\textcolor{red}{#1}}
\newcommand{\revise}[1]{#1}


\usepackage{array}
\usepackage{caption}
% \usepackage{floatrow}
\usepackage{makecell}
\usepackage{mwe} % to get dummy images
\usepackage{mathrsfs}
\usepackage{xcolor}

\jmlryear{2025}
\jmlrworkshop{Full Paper -- MIDL 2025}
\jmlrvolume{-- 32}
\editors{Accepted for publication at MIDL 2025}


\title[Style-Aligned Image Composition]{Style-Aligned Image Composition for Robust Detection of Abnormal Cells in Cytopathology}

% More complicate cases, e.g. with dual affiliations and joint authorship
\midlauthor{\Name{Qiuyi Qi\midljointauthortext{Equally contributed to this work.}\nametag{$^{1}$}} \Email{qiqiuyi@zju.edu.cn}\\
\Name{Xin Li\midlotherjointauthor\nametag{$^{2,3,5}$}} \Email{merley@mail.ustc.edu.cn}\\
\Name{Ming Kong\nametag{$^{1}$}} \Email{zjukongming@zju.edu.cn}\\
\Name{Zikang Xu\nametag{$^{2,3}$}} \Email{zikangxu@mail.ustc.edu.cn}\\
\Name{Bingdi Chen\nametag{$^{4,5}$}} \Email{inanochen@tongji.edu.cn}\\
\Name{Qiang Zhu\midljointauthortext{Corresponding author.}\nametag{$^{1}$}} \Email{zhuq@zju.edu.cn}\\
\Name{S Kevin Zhou\midlotherjointauthor\nametag{$^{2,3}$}} \Email{skevinzhou@ustc.edu.cn}\\
\addr $^{1}$ Zhejiang University, Hangzhou 310058, China \\
\addr $^{2}$ School of Biomedical Engineering, Division of Life Sciences and Medicine, University of Science and Technology of China (USTC), Hefei Anhui, 230026, China \\
\addr $^{3}$ Center for Medical Imaging, Robotics, Analytic Computing \& Learning (MIRACLE), Suzhou Institute for Advance Research, USTC, Suzhou Jiangsu, 215123, China \\
\addr $^{4}$ The Institute for Biomedical Engineering
\& Nano Science, Tongji University School
of Medicine, Shanghai, 200331, China \\
\addr $^{5}$ Zhihui Medical Technology (Shanghai)
Co., Ltd., Shanghai, 200333, China }

\begin{document}

\maketitle

\begin{abstract}
Challenges such as the lack of high-quality annotations, long-tailed data distributions, and inconsistent staining styles pose significant obstacles to training neural networks to detect abnormal cells in cytopathology robustly. This paper proposes a style-aligned image composition (SAIC) method that composes high-fidelity and style-preserved pathological images to enhance the effectiveness and robustness of detection models. 
Without additional training, SAIC first selects an appropriate candidate from the abnormal cell bank based on attribute guidance. Then, it employs a high-frequency feature reconstruction to achieve a style-aligned and high-fidelity composition of abnormal cells and pathological backgrounds. Finally, it introduces a large vision-language model to filter high-quality synthesis images.  
Experimental results demonstrate that incorporating SAIC-synthesized images effectively enhances the performance and robustness of abnormal cell detection for tail categories and styles, thereby improving overall detection performance. The comprehensive quality evaluation further confirms the generalizability and practicality of SAIC in clinical application scenarios. Our code will be released at \href{https://github.com/Joey-Qi/SAIC}{https://github.com/Joey-Qi/SAIC}.
\end{abstract}

\begin{keywords}
Cytopathological Diagnosis, Abnormal Cell Detection, Data Augmentation, Image Composition
\end{keywords}

\section{Introduction}

% The identification of abnormal cytopathological cells is crucial in medical diagnostics. Due to its non-invasive, efficient, simple, and cost-effective nature, it has been widely adopted in clinical practice. However, traditional cytopathological screening methods such as ThinPrep Cytology Test (TCT) rely on human experts to identify abnormal cells in gigapixel Whole Slide Images (WSIs). This not only represents a time-consuming and tedious task but also heavily depends on the subjective recognition ability of pathology experts, posing challenges in terms of the shortage and imbalance of medical resources.

% Recent works aim to introduce deep learning models into abnormal cell detection in pathology images, improving detection efficiency and reducing the workload of doctors. However, despite the availability of pathological image data, collecting high-quality training data for abnormal cell detection models remains a significant challenge. First, due to the high level of expertise required for accurately annotating abnormal cells and the privacy restrictions surrounding patient data, collecting large-scale, accurately annotated, and diverse positive cell images involves considerable manual effort. Second, the distribution of positive cell categories is severely imbalanced, exhibiting typical long-tail characteristics, which poses significant challenges for the accurate recognition of tail-class diseases. Lastly, the issue of staining style variation arises, as pathology images collected from different institutions or at different times exhibit substantial differences in staining style and image quality. This requires detection models to possess sufficient robustness to image styles to accommodate diverse real-world application scenarios. Therefore, how to leverage limited, long-tail distributed, and style-biased annotated pathology data to train abnormal cell detection models that are robust to both category and style variations is a key problem in addressing computer-aided pathology image diagnostics.

% Synthesizing diverse training data using limited and biased data for abnormal cell detection model training is one of the mainstream approaches to solving the above challenges. Traditional data augmentation methods introduce techniques such as image affine transformations (e.g., scaling and rotation) and noise addition to increase the diversity of training data and enhance the robustness of detection models. However, these methods fail to provide semantic diversity and cannot address the issues of long-tail distribution and style bias. Recently, some works have applied diffusion model-based image generation techniques to pathology image data augmentation. For instance, the authors of [38] explored the use of parameter-efficient fine-tuning (PEFT) methods to customize diffusion models for the synthesis of cervical cytopathology images. Meanwhile, [39] investigated the effects of introducing uncertainty guidance into sampling steps for diffusion model-generated samples. Although these methods have effectively improved model performance, the training of diffusion models itself is similarly affected by data scale, distribution, and style biases.

% This paper proposes a novel style-aligned image synthesis framework for cytopathological image data augmentation. Unlike diffusion model-based image generation methods, our approach is training-free and enables the synthesis of abnormal cells into designated regions of background pathology images while ensuring consistency in category, type, area, and staining style between the foreground and background. Specifically, as illustrated in Figure 1, our method consists of two stages:

% \begin{itemize}
%     \item \textbf{Attribute-based Selection}: Before synthesis, we utilize three types of prior information—category, type, and area—to select the most compatible abnormal cell from a known abnormal cell bank for the designated synthesis location in the background image. This mitigates inconsistencies between the foreground and background in image synthesis.
%     \item \textbf{Style-aligned Composition}: Despite the Attribute-based Selection, there may still be significant staining style differences between the abnormal cell and the background image, leading to suboptimal synthesis results. To address this, we perform online staining style transfer during the image synthesis process by recombining high-frequency maps between the abnormal cell and a reference cell (whose staining style closely matches the cell in the designated location of the background image). This enhances the fidelity of the synthesized image.
%     \item  \textbf{LVLM-based Filtration}: 
% \end{itemize}

% Results from comparisons with other data augmentation methods demonstrate that our approach can synthesize diverse, high-fidelity, and information-rich cytopathological images while effectively improving the performance of abnormal cell detection models.

% Our contributions can be summarized as follows:

% \begin{itemize}
%     \item We propose a novel pathology image abnormal cell detection method based on image synthesis augmentation, which generates synthetic data by seamlessly combining abnormal cells with background pathology images, improving the performance and robustness of abnormal cell detection models.
%     \item We introduce a two-stage cytopathological image synthesis framework (framework name): first, it selects abnormal cell templates from the cell bank based on similarity in category, type, and area; then, it performs style-aligned image synthesis. Compared to image generation strategies, it offers advantages such as training-free operation, style alignment, and high fidelity.
%     \item Experimental results on the largest publicly available cervical cancer cell detection dataset, the Comparison Detector Database, show that our data augmentation framework effectively improves the detection performance of long-tail categories and rare staining styles of cells (specific data) and enhances the overall performance of abnormal cell detection (specific data). Comprehensive experimental validation ensures the high fidelity and style consistency of synthesized samples, demonstrating its effectiveness and potential for practical application. 
% \end{itemize}


Due to its non-invasive, efficient, convenient, and cost-effective advantages, abnormal cell identification from cytopathological images has been widely applied in clinical diagnostics. However, traditional cytopathological screening, such as the ThinPrep Cytology Test (TCT), relies on human experts to identify abnormal cells from gigapixel whole slide images (WSIs), which is time-consuming, tedious, and heavily dependent on the subjective expertise of pathologists, posing challenges for scarce and imbalanced medical resources \cite{pan2024enhancing}. Consequently, introducing deep learning methods to address abnormal cell detection in pathological images to improve efficiency and reduce the workload of physicians has become a prominent research focus \cite{yin2024enhancing}.

%Despite the ava2ilability of pathological cell images, collecting training sets for abnormal cell detection models still faces numerous issues. First, accurately annotating a large-scale pathological image dataset requires highly specialized expertise and substantial labor costs, and patient privacy constraints further complicate the collection. Second, the category distribution of the abnormal cells is severely imbalanced, exhibiting a typical long-tailed characteristic, leading to performance and robustness concerns in tail categories. Lastly, there is the issue of staining style variability. Pathological images acquired from different institutions or times often exhibit significant differences in staining styles and image quality. Therefore, detection models must possess sufficient robustness to handle these variations and adapt to diverse real-world application scenarios. Consequently, a key challenge in computer-aided pathological image diagnosis is training robust models for abnormal cell detection in the presence of limited, long-tailed, and style-biased annotated data.
Collecting training sets for abnormal cell detection models faces numerous issues. First, curating a large-scale and accurate dataset requires highly specialized expertise and substantial labor costs. Second, the imbalanced abnormal cell distribution exhibits a typical long-tailed characteristic, leading to performance and robustness concerns in tail categories. Lastly, pathological images acquired from different institutions or periods often exhibit significant differences in staining style and image quality, requiring robustness to handle the variation and diversity in real-world scenarios.


% ===== 25.01.13-00:42 everything above this line has been revised ======

One mainstream approach to addressing these issues is to introduce data augmentation. Traditional methods like affine transformations or noise addition increase the diversity of geometric and local perturbations. However, they fail to effectively address the lack of diversity in long-tailed categories and staining styles. With the advancement of GANs \cite{goodfellow2020generative} and diffusion models \cite{ho2020denoising}, generation-based data augmentation has gained significant attention in pathological image analysis. For example, \cite{hou2019robust} proposed a GAN-based hybrid synthesis pipeline for generating pathological images using predefined rules and textures. \cite{shen2024two} explored applying parameter-efficient fine-tuning (PEFT) techniques to customize diffusion models for synthesizing cervical cytopathological images. Despite their effectiveness, these methods require fine-tuning, and thus remain constrained by data scale, distribution, and style biases.
Another mainstream strategy is to produce augmented data by composing existing foregrounds and backgrounds. For example, Paint-by-Example \cite{yang2023paint} and ObjectStitch \cite{song2023objectstitch} use the CLIP \cite{radford2021learning} image encoder to convert the foreground image into an embedding for guidance, thus painting a semantically consistent object onto the background image. However, these methods are not specifically designed for pathological diagnosis and exhibit deficiencies in style consistency and fidelity.


% One mainstream approach to addressing these issues is to introduce data augmentation. Traditional data augmentation methods, such as affine transformations (e.g., scaling, rotation) or noise addition, increase the diversity of geometric and local perturbations. However, they fail to effectively address the lack of diversity in long-tailed categories and staining styles. With the advancement of Generative Adversarial Networks (GANs) \cite{goodfellow2020generative} and diffusion models \cite{ho2020denoising}, generation-based data augmentation techniques have attracted a significant attention in pathological image analysis. For example, \cite{hou2019robust} proposed a GAN-based hybrid synthesis pipeline that generates pathological images using predefined rules and textures initialized with real images. AttributeGAN \cite{ye2021multi} achieves data augmentation for histopathological images based on controllable cell morphological features. HistoGAN \cite{xue2021selective} generates histopathological images conditioned on class labels and incorporates a sample selection process to filter out effective samples. \cite{shen2024two} explored applying parameter-efficient fine-tuning (PEFT) techniques to customize diffusion models for synthesizing cervical cytopathological images, while \cite{luo2024measurement} investigated the introduction of uncertainty into sampling steps to guide the content generated by diffusion models. Although these methods have demonstrated effectiveness in improving the performance of various downstream tasks, they remain reliant on existing data for training and are thus constrained by data scale, distribution, and style biases.


% One mainstream approach to addressing these issues is to introduce data augmentation. Traditional data augmentation methods, such as affine transformations (e.g., scaling, rotation) or noise addition, increase the diversity of geometric and local perturbations. However, they fail to effectively address the lack of semantic diversity in long-tailed categories and staining styles. With the advancement of Generative Adversarial Networks (GANs) and diffusion models, image-generation-based data augmentation techniques have attracted significant attention in pathological image analysis. For example, [65] proposed a GAN-based hybrid synthesis pipeline that generates pathological images using predefined rules and textures initialized with real images. AttributeGAN [66] achieves data augmentation for histopathological images based on controllable cell morphological features. HistoGAN [67] generates histopathological images conditioned on class labels and incorporates a sample selection process to filter out effective samples. [38] explored applying parameter-efficient fine-tuning (PEFT) techniques to customize diffusion models for synthesizing cervical cytopathological images, while [39] investigated the introduction of uncertainty into sampling steps to guide the content generated by diffusion models. Although these methods have demonstrated effectiveness in improving the performance of abnormal cell detection models, they remain reliant on existing data for training and are thus constrained by data scale, distribution, and style biases.

This paper proposes a novel training-free \textbf{S}tyle-\textbf{A}ligned \textbf{I}mage \textbf{C}omposition (SAIC) framework, which seamlessly ``injects'' abnormal cells into specified locations of pathological images while ensuring high fidelity and consistency in categories, types, areas, and staining styles between the foreground and background. Specifically, SAIC consists of three steps: (1) \textbf{Attribute-based selection}: Using prior knowledge of category, type, and area to select {candidate abnormal cells} from an existing cell bank; (2) \textbf{Style-aligned composition}: Performing online staining style alignment by reconstructing high-frequency details between the abnormal cell candidates and the style reference image to ensure style consistency in the synthesized area; (3) \textbf{LVLM-based filtration}: Leveraging a large vision-language model (LVLM) to filter high-quality samples from synthesized pathological images. Experimental results demonstrate that SAIC achieves high-fidelity, style-preserving data augmentation, effectively enhances the detection of tail categories and rare styles of abnormal cells, and improves overall cell detection performance.

% This paper proposes a novel training-free \textbf{S}tyle \textbf{A}ligned \textbf{I}mage \textbf{C}omposition (SAIC) framework, which seamlessly integrates abnormal cells into specified regions of pathological images while ensuring high fidelity and consistency in categories, areas, and staining styles between the foreground and background. Specifically, SAIC consists of three steps: (1) \textbf{Attribute-based Selection}: Using prior knowledge of category, type, and area to select candidate abnormal cells from an existing cell bank that most closely matches the target location; (2) \textbf{Style-aligned Composition}: Performing online staining style transfer by reconstructing high-frequency details between the abnormal cell candidates and the style reference image to ensure style consistency in the synthesized area; (3) \textbf{LVLM-based Filtration}: Leveraging a large visual-language model (LVLM) to filter high-quality samples from the synthesized pathological images. Experimental results demonstrate that SAIC achieves high-fidelity, style-preserving data augmentation, effectively enhances the detection of tail categories and rare styles of abnormal cells, and improves overall cell detection performance.

%We summarize the contributions of this paper as follows: (1) We propose an image composition-based data augmentation architecture for cytopathological abnormal cell detection. By seamlessly integrating abnormal cell candidates into cytopathological background images as augmented training data, this approach enhances the performance and robustness of abnormal cell detection models. (2) Compared to existing generation-based data augmentation methods, the proposed Style-Aligned Image Composition (SAIC) framework offers advantages such as being training-free, style-aligned, and high-fidelity. (3) Extensive experimental results demonstrate that SAIC-generated synthetic images effectively improve abnormal cell detection performance, particularly for tail categories and rare staining styles, while with high-fidelity.
\begin{figure}[t]
% \begin{figure}[htbp]
 % Caption and label go in the first argument and the figure contents
 % go in the second argument
\floatconts
  {fig:example}
  {\caption{Overall pipeline of SAIC.}}
  % {\includegraphics[width=0.5\linewidth]{example-image}}
  {\includegraphics[width=1\linewidth]{figures/figure1.png}}
  % {\includegraphics[width=1\linewidth]{figure1.png}}
\end{figure}
We summarize the contributions of this paper as follows: (1) We propose an image composition-based data augmentation architecture for cytopathological abnormal cell detection. (2) Compared to existing generation-based data augmentation methods, the proposed Style-Aligned Image Composition (SAIC) framework offers advantages such as being training-free, style-aligned, and high-fidelity. (3) Extensive experimental results demonstrate that SAIC-synthesized images effectively improve abnormal cell detection performance, particularly for tail categories and rare staining styles, while maintaining high fidelity.

% \subsection{Basic Pipeline}
% \figureref{fig:example} illustrates the overall pipeline of SAIC. Given a background image and the target location for synthesis, the most consistent abnormal cell in terms of category, type, and area is first selected from the abnormal cell bank (as detailed in Section 3.1) using Attribute-based Selection. Subsequently, two synthetic abnormal images are generated in parallel through Native Composition and Style-aligned Composition. Finally, the image with higher fidelity is selected from the two using LVLM-based Filtration as the final synthetic image.
% \begin{equation}
%     C_{c} = \left\{ C_i \in C_{bank} \mid c_i = c_l \right\}, \tag{1} 
% \end{equation}

% \begin{equation}
%     C_{ct} = \left\{ C_i \in C_{c} \mid t_i = t_l \right\}, \tag{2} 
% \end{equation}

% \begin{equation}
%     C_{t}=\underset{C_i\in C_{ct}}{\operatorname*{\arg\min}}|a_i - a_l|, \tag{3} 
% \end{equation}



\section{Method}
%\figureref{fig:example} illustrates the overall pipeline of SAIC. Firstly, Attribute-based Selection identifies candidate abnormal cells from an abnormal cell bank based on category, type, and area consistency. Subsequently, Style-aligned Composition produces two synthetic abnormal images, one with style alignment and one without, in parallel. Finally, LVLM-based Filtration selects the image with higher fidelity as the final synthetic result.
\figureref{fig:example} illustrates the overall pipeline of SAIC. First, \textbf{Attribute-based Selection} selects a candidate abnormal cell from an abnormal cell bank based on category, type, and area consistency. Then, \textbf{Style-aligned Composition} extracts ID maps and style maps, producing two synthetic images in parallel with self- and background-style alignment, respectively. Finally, \textbf{LVLM-based Filtration} picks the image with higher fidelity as the final output.



\subsection{Attribute-based Selection}

% The significant domain gap between natural and cytopathological images can lead to suboptimal synthesis when candidate abnormal cells are randomly selected. To address this, we use knowledge priors. Given a background image \( B \) and target location \( L \), we can determine the category \( c_l \), type \( t_l \) (cell/clumps), and area \( a_l \) of the target location cell \( C_l \). To ensure fidelity and preserve potential cellular distribution patterns, we select the candidate cell \( C_t \) from the abnormal cell bank \( C_{\text{bank}} \) (as detailed in Section 3.1) based on the following criteria:
% \begin{align}
%     &C_{c} = \left\{ C_i \in C_{bank} \mid c_i = c_l \right\},\tag{1} \\
%     &C_{ct} = \left\{ C_i \in C_{c} \mid t_i = t_l \right\},\tag{2} \\
%     &C_{t}=\underset{C_i\in C_{ct}}{\operatorname*{\arg\min}}|a_i - a_l|, \tag{3} 
% \end{align}
% where \( c_i \), \( t_i \), and \( a_i \) represent the category, type, and area of an abnormal cell \( C_i \), respectively. 

We first create an abnormal cell bank $\mathcal{C}_{Bank}$ that contains various abnormal cells {with region} and attribute annotations, which can be easily acquired from the labeled training set. Given a background cytopathological image $B$ and the target region $L$ containing the original cell $c_{orig}$, we select a candidate abnormal cell $c_{cand}$ for composition from the cell bank $\mathcal{C}_{Bank}$ according to the category $\hat{m}$, type $\hat{t}$ (cell/clumps), and area size $\hat{a}$ of $c_{orig}$:

\begin{equation}
    \begin{aligned}
        &c_{cand} = {\arg\min}_{c \in \mathcal{C}_{Bank}} \left| a_c - \hat{a} \right| \\
        \text{s.t.} \ &m_c=\hat{m} \  \text{and} \  t_c=\hat{t},
    \end{aligned}
\end{equation}
where $m_c$, $t_c$, and $a_c$ represent the abnormal cell $c$'s category, type, and area, respectively.

\subsection{Style-aligned Composition}

As the candidate abnormal cell is selected, we must compose it with the background while preserving identity and style consistency. To this end, we design a three-step process:

\noindent
\textbf{Identity map extraction.}  
% To extract concise and discriminative features for the candidate abnormal cells, we utilize a pre-trained visual encoder with two key enhancements. First, we remove the background and center-align the cell using the interactive SAM for high-quality segmentation. Second, leveraging the strong feature preservation ability of self-supervised models, we adopt DINOv2 \cite{oquab2023dinov2} as the backbone to encode the image into global (\( T_g \)) and patch tokens (\( T_p \)), which are concatenated and projected into the embedding space of a pre-trained text-to-image UNet to produce the final ID token \( T_{ID} \).
For identity preservation, we extract discriminative identity features from the abnormal cell as an ID map. First, we segment the cell region with an interactive SAM \cite{kirillov2023segment} and center-align it. Then, we extract its visual features with a DINOv2 encoder \cite{oquab2023dinov2}, followed by a visual-to-text linear projection, to obtain the ID map \( \mathcal{F}_{ID} \).

\noindent
\textbf{Style map extraction.}
%Given the low resolution of \( T_{ID} \) (16×16), we incorporate additional detail guidance using high-frequency maps. Specifically, We first select a style reference cell \( C_r \) from the abnormal cell bank \( C_{\text{bank}} \) based on similarity with the target location cell \( C_l \) using the DINOv2 Score. The reference cell will provide a high-frequency map containing background image's staining style details, avoiding the limitations of extracting such maps directly from the target location cell due to unremoved background interference. The similarity and selection are computed as:
For style preservation, we extract the style map with high-frequency information to incorporate style-aware detail guidance. Specifically, first, we select a style reference cell \( c_{ref} \) from the abnormal cell bank $\mathcal{C}_{Bank}$ to provide high-frequency information akin to the background's staining style. The selection process is denoted as:
\begin{equation}
   \begin{aligned}
    S(c_{orig},c) & = \frac{\text{DINOv2}(c_{orig})\cdot\text{DINOv2}(c)}{\|\text{DINOv2}(c_{orig})\|\|\text{DINOv2}(c)\|}, \\
    c_{ref} & = {\arg\max}_{c\in \mathcal{C}_{Bank}} S(c_{orig},c).
\end{aligned} 
\end{equation}

%Subsequently, we extract high-frequency maps \( H_t \) and \( H_r \) for the background-removed candidate cell and reference cell, respectively, using High-pass Filters \cite{kanopoulos1988design}. Since the scale of the abnormal cell bank is limited, the selected reference cells may still exhibit staining style inconsistencies with the background, causing low-quality synthetic images. To this end, we design two parallel composition processes: one with style alignment and one without. The final high-frequency maps \( H_n \) are defined as follows, respectively:
Subsequently, we extract high-frequency maps \( H_t \) and \( H_r \) for the candidate abnormal cell and the reference cell, respectively, using high-pass filters \cite{kanopoulos1988design}. To preserve the styles of both the candidate cell and the background, we extract two kinds of high-frequency maps in parallel, for self- and background-style alignment:
\begin{equation}
H_n = 
\begin{cases} 
H_t, & \text{self-style alignment},\\
\alpha \cdot H_t + (1-\alpha) \cdot H_r, & \text{background-style alignment}.
\end{cases}
\end{equation}
In practice, the reconstruction coefficient \( \alpha \) is empirically set to 0.1 to keep the most important textural information of the candidate cell.

% \colorbox{yellow}{In practice}, the reconstruction coefficient \( \alpha \) is searched over the range \([0, 1)\) with a step size of 0.1, and \( \alpha = 0.1 \) is found to yield stable results.

%Finally, concatenated with a shape mask indicating the synthesis location, these maps are respectively fed into a pre-trained ControlNet \cite{zhang2023adding} to generate hierarchical detail maps. In practice, \( \alpha \) is empirically set to 0.1 and some examples are shown in Appendix \ref{app:example1}. 
Finally, we stitch both kinds of high-frequency maps into the background and concatenate them with the shape mask of the composition location as inputs of a ControlNet \cite{zhang2023adding} to generate hierarchical style maps $\mathcal{F}_{style}$.


% Given the low resolution of \( T_{ID} \) (16×16), we incorporate additional detail guidance using high-frequency maps. A background-removed cell is represented as a high-frequency map \( I_h \), extracted via convolution with Sobel filters (\( K_h, K_v \)) and enhanced by the Hadamard product with the original image and an erosion mask. This map, concatenated with a shape mask indicating the synthesis location, is fed into a ControlNet-style encoder to generate hierarchical detail maps.


% Contents that can be excluded in the main paper
% \noindent
% \textbf{Detail feature extraction.}
% Considering that ID tokens are represented at a low resolution (16 × 16), they struggle to fully preserve low-level detail information. Therefore, additional guidance is required to supplement the details of the generated image. Inspired by [76, 77], using a collage as a control condition can provide strong prior information. Specifically, we attempt to concatenate the "background-removed cell" at the specified location in the background image, and use a high-frequency map to represent the former, thereby retaining fine details (such as cell color, edges, textures, etc.), which enhances the fidelity of the synthesized image. Given an abnormal cell image \( I \), the process of extracting its high-frequency map \( I_h \) is as follows:

% \begin{equation}
%     \mathbf{I}_h=(\mathbf{I}_\text{gray}\otimes\mathbf{K}_h+\mathbf{I}_\text{gray}\otimes\mathbf{K}_v)\odot\mathbf{I}\odot\mathbf{M}_\text{erode}, \tag{4}
% \end{equation}

% \noindent
% \( K_h \) and \( K_v \) represent the horizontal and vertical Sobel [78] kernels, used as high-pass filters; \( \otimes \) and \( \odot \) refer to convolution and the Hadamard product, respectively. Specifically, we first use the high-pass filters to extract high-frequency regions from the grayscale image \( I_{\text{gray}} \) corresponding to \( I \), and then combine the Hadamard product to extract RGB color information. Additionally, we apply an erosion mask \( M_{\text{erode}} \) to filter out irrelevant information near the outer contours of the target abnormal cell. Afterward, we concatenate the collage with a shape mask indicating the cell synthesis location and input them into the detail extractor, which is a ControlNet-style [71] UNet encoder capable of generating a series of detail maps with hierarchical resolutions.


\noindent
\textbf{Conditioned Composition.}  
%Both the ID tokens and detail maps are injected into a pre-trained Stable Diffusion model \cite{rombach2022high} to guide composition. The ID tokens (condition \( c_i \)) are integrated via cross-attention at each UNet layer, while the detail maps (condition \( c_d \)) are concatenated with decoder features at each resolution, ensuring accurate restoration of coarse- and fine-grained features. The latent representation of the synthesized image is generated using:
We inject the ID map $\mathcal{F}_{ID}$ and the style map $\mathcal{F}_{style}$ into a Stable Diffusion model \cite{rombach2022high} to guide the composition. Specifically, $\mathcal{F}_{ID}$ is integrated via cross-attention at each UNet layer for identity preservation, while $\mathcal{F}_{style}$ is concatenated with the decoder features at each resolution for style preservation. The latent representation of the synthesized image is generated as follows:
\begin{equation}
    \mathbf{z}_t = \alpha_t\hat{\mathbf{x}}_\theta(\epsilon,\mathcal{F}_{ID}, \mathcal{F}_{style}) + \sigma_t\epsilon, \tag{4}
\end{equation}
where \( \alpha_t \) and \( \sigma_t \) are denoising hyperparameters, which stay aligned with the setting of Stable Diffusion. 

All the aforementioned models, including the interactive SAM segmenter, the DINOv2-based encoder and its linear layer, ControlNet, and Stable Diffusion, directly use parameters pre-trained on general-domain datasets and require no domain-specific fine-tuning.

% Contents that can be excluded in the main paper
% \noindent
% \textbf{Feature injection.}
% After obtaining the ID token and detail maps, we inject them into a pre-trained text-to-image diffusion model to guide the generation process. We choose Stable Diffusion [14], which projects the image into a latent space and uses a UNet for probabilistic sampling. We denote the pre-trained UNet as \( \hat{x}_{\theta} \), which begins denoising from the initial latent noise \( \epsilon \sim U([0, 1]) \) and generates the latent representation \( z_t \) of the new image as follows:

% \begin{equation}
%     \mathbf{z}_t = \alpha_t\hat{\mathbf{x}}_\theta(\epsilon,\mathbf{c}_i, \mathbf{c}_d) + \sigma_t\epsilon, \tag{5}
% \end{equation}

% \noindent
% \( t \) is the diffusion timestep, \( c_i \) and \( c_d \) are the ID information control condition and detail information control condition, respectively, and \( \alpha_t \) and \( \sigma_t \) are the denoising hyperparameters. Specifically, the ID token is injected as the control condition \( c_i \) into each UNet layer through a cross-attention mechanism; the detail map is used as the control condition \( c_d \), and is concatenated with the UNet decoder features at each resolution level, ensuring that both coarse-grained and fine-grained features of the synthesized image are accurately restored.


% \noindent
% \textbf{Reference cell selection.}  
% For staining style alignment, we select a reference cell \( C_r \) from the abnormal cell bank \( C_{\text{bank}} \) based on similarity with the target location cell \( C_l \) using the DINOv2 Score. The reference cell provides a high-frequency map containing background image's staining style details, avoiding the limitations of extracting such maps directly from the target location cell due to unremoved background interference. The similarity and selection are computed as:
% \begin{align}
%     S(C_l,C_i)=&\frac{\text{DINOv2}(C_l)\cdot\text{DINOv2}(C_i)}{\|\text{DINOv2}(C_l)\|\|\text{DINOv2}(C_i)\|},\tag{6} \\
%     &C_{r}=\underset{C_i\in C_{bank}}{\operatorname*{\arg\max}}S(C_l,C_i),\tag{7}
% \end{align}


% Contents that can be excluded in the main paper
% It is important to note that the sole purpose of the reference cell here is to provide a high-frequency map containing details such as the staining style of the background image. While the target location cell already directly contains this information, due to the lack of an effective automated tool to remove the cell image background, it is not possible to quickly and easily extract a high-quality high-frequency map from it. This clearly limits the potential of our framework to create large-scale synthetic images. Therefore, we instead select a reference cell from the abnormal cell bank that has a staining style similar to the target location cell and has already had its background removed.


% \noindent
% \textbf{Staining style alignment.}  
% To achieve style alignment, the high-frequency maps of the target abnormal cell (\( H_t \)) and reference cell (\( H_r \)) are recombined with a weighted coefficient \( \alpha \), producing a new high-frequency map \( H_n \):
% \begin{equation}
%     \mathbf{H}_n=\alpha\cdot\mathbf{H}_t+(1-\alpha)\cdot\mathbf{H}_r,\tag{8}
% \end{equation}

% \noindent
% When \( \alpha = 0.1 \), \( H_t \) is replaced with \( H_n \), enabling stable staining style alignment. Some examples are shown in Appendix \ref{app:example1}.


% Contents that can be excluded in the main paper
% \noindent
% \( \alpha \) controls the recomposition ratio of two high-frequency maps. When \( \alpha = 0.1 \), by replacing the high-frequency map \( H_t \) with \( H_n \) while keeping the target abnormal cell's ID token unchanged, a relatively stable staining style alignment can be achieved during the synthesis process, as shown in Figure 3. Note that even if the target location does not include any abnormal cells, our strategy is still applicable.


\subsection{LVLM-based Filtration}
%When the abnormal cell bank is limited, reference cells selected via the DINOv2 Score may still exhibit staining style inconsistencies with the background images. In such cases, Style-aligned Composition may not effectively enhance the fidelity of the synthetic images. Since category, type, and area information alone cannot ensure staining style consistency among the target abnormal cell, reference cell, and background image, we execute two parallel synthesis processes: Native Composition and Style-aligned Composition, generating two synthetic images. A vision-language model (VLM), such as GPT-4, is then used to evaluate and select the image with higher fidelity as the final synthetic result. See Appendix \ref{app:example2} for an example in the supplementary material.
% Since the scale of the abnormal cell bank is limited, the selected reference cells may still exhibit staining style inconsistencies with the background, causing low-quality synthetic images. To this end, in the Style-Aligned Composition step, we generate two synthetic images. One with the selected style reference cell, and the other with the abnormal cell candidate itself as the style reference cell, i.e., without style alignment. We introduce an LVLM model to make a filtration from two synthetic samples and choose the more harmonized one. The detailed prompt setting of the filtration is shown in Appendix \ref{app:example2}.
Because we produce two synthetic images in parallel, one aligned to the candidate cell's own style and the other aligned to the background style (see Appendix \ref{app:example1} for examples), we keep the more harmonized one as the final output to improve the quality of the augmented data. We employ an LVLM (GPT-4) to make this choice.
\revise{To mitigate the potential positional bias of LVLMs, we systematically shuffle the order of the two choices during the experiments.}
The detailed prompt setting of the filtration and the LVLM's selection ratio between the two styled images are shown in Appendix \ref{app:example2}.


% The detailed prompt setting of filtration is shown in Appendix \ref{app:example2}.



% as illustrated in Figure 3.

\section{Experiment}

\subsection{Datasets and Settings}

\noindent
\textbf{Dataset.}
We conduct experiments on the Comparison Detector Database \cite{liang2018comparison}, the largest public dataset for cervical cancer cell detection. This dataset comprises 7,410 cervical microscopic images with 50,447 abnormal cells across 11 categories, as illustrated in \figureref{fig:example4}. We define the categories with fewer than 500 cells as tail categories, while the remainder are non-tail categories. The abnormal cell bank $\mathcal{C}_{Bank}$ is composed of 824 randomly selected abnormal cells, where each category includes 68--90 samples.


% To support experiments, we constructed an abnormal cell bank by cropping 824 abnormal cells from 11 categories based on the training set annotations. 
% Each cell is provided in two formats: original RGB images and RGBA images with background removed using the interactive SAM.
% \begin{figure}[t]
%  % Caption and label go in the first argument and the figure contents
%  % go in the second argument
% \floatconts
%   {fig:example4}
%   {\caption{Overview of Comparison detector Database.}}
%   % {\includegraphics[width=0.5\linewidth]{example-image}}
%   % {\includegraphics[width=0.5\linewidth]{figure1.png}}
%   {\includegraphics[width=1\linewidth]{figure4_revision.png}}
% \end{figure}

\begin{figure}[t]
% \begin{figure}[htbp]
 % Caption and label go in the first argument and the figure contents
 % go in the second argument
\floatconts
  {fig:example4}
  {\caption{Overview of the Comparison Detector Database. Panels (a) and (b) show examples of each category and the distribution of categories in the dataset, respectively.}}
  % {\includegraphics[width=0.5\linewidth]{example-image}}
  {\includegraphics[width=1\linewidth]{figures/figure4.png}}
  % {\includegraphics[width=1\linewidth]{figure1.png}}
\end{figure}


% \begin{figure}[t]
% % \begin{figure}[htbp]
% \floatconts
%   {fig:example4} % 总图的 label，用于 \figureref
%   {\caption{Overview of Comparison detector Database. Figures (a) and (b) respectively demonstrate examples of each categories and their distribution in the dataset.}} % 总图的标题
%   { % 这里定义具体内容
%     \begin{minipage}[b]{0.45\textwidth}
%         \centering
%         \includegraphics[width=\textwidth]{figure4a_revision.png}
%         % \caption*{(a) Demonstration of cells}
%         \caption*{(a)}
%         \label{fig:example4a}
%     \end{minipage}
%     \hfill
%     \begin{minipage}[b]{0.45\textwidth}
%         \centering
%         \includegraphics[width=\textwidth]{figure4b_revision.png}
%         % \caption*{(b) Distribution of cells.}
%         \caption*{(b)}
%         \label{fig:example4b}
%     \end{minipage}
%   }
% \end{figure}





% We conduct experiments on the long-tailed Comparison detector Database [72], the largest publicly available dataset for cervical cancer cell detection. It contains 7,410 cervical microscopic images and 50,447 abnormal cells across 11 categories, as shown in \figureref{fig:example4}. After preprocessing to filter out invalid annotations (e.g., Bboxes that are too small and do not cover any cells), the training set includes 6,538 images with 44,670 instances, and the test set contains 743 images with 5,024 instances. 
% To support experiments, we constructed an abnormal cell bank by cropping 824 abnormal cells from 11 categories based on the training set annotations. Each cell is provided in two formats: original RGB images and RGBA images with background removed using the interactive SAM. These cells cover diverse categories, types, areas, and staining styles, enabling different data augmentation methods.



% We conduct experiments on the long-tailed Comparison detector Database [72], the largest publicly available dataset for cervical cancer cell detection. It contains 7,410 cervical microscopic images and 50,447 abnormal cells across 11 categories, As shown in \figureref{fig:example4}. After preprocessing to filter out invalid annotations (e.g., Bboxes that are too small and do not cover any cells), the training set includes 6,538 images with 44,670 instances, and the test set contains 743 images with 5,024 instances. 
% To support experiments, we constructed an abnormal cell bank by cropping 824 abnormal cells from 11 categories based on the training set annotations. Each cell is provided in two formats: original RGB images and RGBA images with background removed using the interactive SAM. These cells cover diverse categories, types, areas, and staining styles, enabling different data augmentation methods.



% Contents that can be excluded in the main paper
% For all experiments, the training set always includes all cells in the abnormal cell bank, and background images are sourced exclusively from the training set to prevent data leakage. Target location selection is randomized, covering existing abnormal cell locations or arbitrary positions within the image. While these settings ensure experimental fairness, in practical applications, the abnormal cell bank can include cells not present in the training set, and background images can be freely obtained.


\noindent
\textbf{Implementation details.}  
We evaluate data augmentation methods using two object detectors: YOLOv8 \cite{varghese2024yolov8} and Faster R-CNN \cite{ren2015faster}. Models are trained using the SGD optimizer \cite{ruder2016overview} with an initial learning rate of 0.01, a momentum of 0.937, a weight decay of 0.0005, and a batch size of 8, for 150 epochs.

% We evaluate data augmentation methods using two object detectors: YOLOv8 (initialized by YOLOv8s) and Faster R-CNN (initialized by ResNet101). Input images are resized to \( 1024 \times 1024 \), and models are trained using the SGD optimizer [73] with an initial learning rate of 0.01, momentum of 0.937, weight decay of 0.0005, and batch size of 8, for 150 epochs. All implementations are based on PyTorch and run on two NVIDIA GeForce 3090 GPUs.

\noindent
\textbf{Evaluation metrics.}
We use mAP to evaluate augmentation effectiveness. Specifically, \( \text{AP}_{50} \) (a strong indicator of good localization and classification scores) is calculated with an IoU threshold of 0.5 for each category, and averaged for \( \text{mAP}_{50} \). Additionally, we use FID \cite{heusel2017gans} to assess the overall realism of synthetic images, and DINOv2 Score to evaluate the foreground fidelity.

% We use mAP (mean Average Precision) to evaluate augmentation effectiveness. Specifically, \( \text{AP}_{50} \) (a strong indicator of good localization and classification scores) is calculated with an IoU threshold of 0.5 for each category, and the averaged values represent the comprehensive score. Additionally, FID [79] assesses the overall realism of synthetic images, while the DINOv2 Score evaluates foreground fidelity by measuring the similarity between the synthetic target abnormal cell and the real target abnormal cell.

\noindent
\textbf{Methods for comparison.}
We compare three types of data augmentation methods. (1) {Copy \& Paste}: A simple method that copies abnormal cells from the abnormal cell bank, resizes them, and pastes them onto the specified locations in the background images. (2) {Generation-based method}: We adopt GLIGEN \cite{li2023gligen}, an advanced diffusion model for image inpainting. Following the setting of \cite{shen2024two}, we fine-tune it on the Comparison Detector Database for 50k iterations. (3) {Composition-based methods}: We include Paint-by-Example \cite{yang2023paint} and ObjectStitch \cite{song2023objectstitch}, two mainstream models that support the same input format as ours and require no additional parameter fine-tuning.
\revise{Note that since data augmentation methods are applied only during the model training phase (to generate enriched synthetic data for training), they do not alter the inference time or memory consumption of the abnormal cell detection model and therefore do not affect its practical deployment.}


% \subsection{Comparisons with Existing Alternatives}

% Traditional data augmentation methods, such as scaling, rotation, and affine transformations, do not really introduce new information. For fairness, we include them as supplements rather than separate baselines for the following data augmentation methods:

% \noindent
% \textbf{Copy \& Paste.}  
% This simple method generates new training samples by copying a target abnormal cell selected from the abnormal cell bank, resizing it, and pasting it onto a target location in a background image.

% \noindent
% \textbf{Image generation.}  
% We adopt GLIGEN [74], an advanced diffusion model for natural images, as the base model. GLIGEN enables target object generation at specified locations (i.e., image inpainting). Using the training set of the Comparison detector Database, comprising 44,670 instances across 11 categories (e.g., 25,212 hsil, 123 flora, etc.), we fine-tune GLIGEN for 50k iterations with a learning rate of \( 1.0 \times 10^{-4} \), batch size of 4, and the AdamW optimizer [75]. During inference, we use the DDIM sampler with 50 timesteps and a CFG scale of 5.0.

% \noindent
% \textbf{Image composition.}  
% We compare our proposed SAIC framework with two other mainstream image composition models: Paint-by-Example [60] and ObjectStitch [59]. Both models support the same input format as ours and require no additional parameter fine-tuning.

\subsection{Validation of Data Augmentation Effectiveness}
We validate the effectiveness of our SAIC through its improvement in detector performance and complement to staining styles in the training set.

\noindent
\textbf{Improvement in detector performance.}
%Under identical experimental conditions of adding 5,696 synthetic images into the initial training set, we compare the performance improvements achieved by different augmentation methods,  As shown in \tableref{tab:comparison1}, SAIC yields the best average performance improvement across both detection models, enhancing YOLOv8 by 3.1 points and Faster R-CNN by 2.5 points. This method is especially effective for tail categories, achieving significant performance gains for flora (15.5 for YOLOv8 and 10.9 for Faster R-CNN) and actinomyces (actin) (13.2 for YOLOv8 and 6.7 for Faster R-CNN).
% ======= Revised 2025.01.24: added the abbreviation for actinomyces
Under identical experimental conditions of adding 5,696 synthetic images into the initial training set, we compare the performance improvements achieved by different augmentation methods. As shown in \tableref{tab:comparison1}, SAIC yields the best average performance improvement across both detection models, enhancing YOLOv8 by 3.1 points and Faster R-CNN by 2.5 points. This method is especially effective for tail categories. For example, flora achieves improvements of 15.5 points for YOLOv8 and 10.9 points for Faster R-CNN, and actinomyces (actin) gains 13.2 points for YOLOv8 and 6.7 points for Faster R-CNN.

% As shown in \tableref{tab:comparison1}, incorporating augmented data improves/degrades detector performance compared to the baseline (w/o data augmentation), but the effectiveness varies significantly across methods. For example, our SAIC proves particularly effective for tail categories, achieving substantial performance improvements for flora (15.5/10.9 for YOLOv8/Faster R-CNN) and actinomyces (13.2/6.7 for YOLOv8/Faster R-CNN). It also delivers an overall improvement of 3.1 for YOLOv8 and 2.5 for Faster R-CNN, respectively.
\begin{table}[t]
% \begin{table}[htbp]
\centering
\caption{Performance comparison of different methods (\textbf{Best results}, \underline{second best results}).}
% \caption{Performance comparison of data augmentation methods on YOLOv8 and Faster R-CNN. Results are reported as \( \text{mAP}_{50} \) and \( \text{AP}_{50} \) for individual categories.}
\resizebox{\textwidth}{!}{%
\begin{tabular}{@{}l|l|c|cccc|ccccccc@{}}
\hline
% \toprule
\multirow{2}{*}{\textbf{Detector}} & \multirow{2}{*}{\textbf{Method}} & \multirow{2}{*}{\textbf{\( \text{mAP}_{50} \)}}  & \multicolumn{4}{c|}{\textbf{Tail}} & \multicolumn{7}{c@{}}{\textbf{Non-tail}} \\  \cline{4-7} \cline{8-14}

& & & \textbf{flora} & \textbf{actin} & \textbf{herps} & \textbf{cand} & \textbf{lsil} & \textbf{ascus} & \textbf{scc} & \textbf{asch} & \textbf{agc} & \textbf{trich} & \textbf{hsil} \\
\hline
% \midrule
\multirow{6}{*}{YOLOv8} 
& Baseline       & 51.7 & 68.9 & 64.3 & 83.6 & \underline{54.7} & \underline{43.8} & 28.0 & 21.1 & \textbf{19.8} & \underline{66.3} & \textbf{68.4} & 50.1 \\ 
& Copy \& Paste    & \underline{52.1} & 72.5 & \underline{68.9} & \textbf{86.7} & 48.3 & 43.7 & 28.5 & \textbf{22.9} & 18.2 & 66.2 & 66.7 & \underline{50.4} \\ 
& GLIGEN (CVPR 2023)     & 51.8 & \underline{77.6} & 65.6 & 79.9 & \textbf{54.8} & 41.2 & \underline{30.9} & 20.3 & 17.6 & 66.2 & 66.7 & 49.2 \\ 
& Paint-by-Example (CVPR 2023) & 48.7 & 64.6 & 62.9 & 79.9 & 44.3 & 40.9 & \underline{30.9} & 20.4 & 15.2 & 63.2 & 65.0 & 48.5 \\ 
& ObjectStitch (CVPR 2023)   & 50.2 & 69.9 & 66.0 & 84.8 & 47.0 & 40.4 & 30.0 & 17.6 & 17.3 & 64.3 & 65.2 & 49.3 \\ 
& \textbf{SAIC (Ours)}  & \textbf{54.8} & \textbf{84.4} & \textbf{77.5} & \underline{85.8} & 51.7 & \textbf{44.3} & \textbf{31.5} & \underline{22.8} & \underline{19.6} & \textbf{67.6} & \underline{66.9} & \textbf{50.7} \\ 
\hline
% \midrule
\multirow{6}{*}{Faster R-CNN} 
& Baseline       & 59.4 & 72.8 & 78.9 & 83.5 & 68.8 & \textbf{60.7} & 43.0 & 31.8 & \textbf{23.7} & \underline{69.6} & \underline{67.9} & 52.6 \\ 
& Copy \& Paste    & 59.5 & 70.5 & 77.6 & \underline{83.6} & 73.5 & \underline{59.9} & \underline{44.7} & \underline{35.0} & 22.8 & 68.3 & 66.8 & 51.3 \\ 
& GLIGEN (CVPR 2023)     & 59.5 & 76.6 & 77.4 & 82.0 & \textbf{79.5} & 57.3 & 42.5 & 32.9 & 19.8 & 69.1 & 66.1 & 50.9 \\ 
& Paint-by-Example (CVPR 2023) & 59.1 & \underline{81.4} & 74.7 & 77.2 & 72.3 & 58.2 & 43.8 & 31.0 & 22.7 & 67.6 & \textbf{68.4} & \underline{52.7} \\ 
& ObjectStitch (CVPR 2023)  & \underline{59.7} & 77.1 & \underline{82.5} & 82.9 & 68.2 & 55.6 & 44.1 & 34.8 & 21.9 & \textbf{70.4} & 65.8 & \textbf{52.9} \\ 
& \textbf{SAIC (Ours)}  & \textbf{61.9} & \textbf{83.7} & \textbf{85.6} & \textbf{85.9} & \underline{76.3} & 59.4 & \textbf{44.8} & \textbf{36.9} & \underline{22.9} & 68.4 & 65.2 & 52.3 \\ 
\hline
% \bottomrule
\end{tabular}%
}
\label{tab:comparison1}
\end{table}

% To evaluate the effectiveness of SAIC for data augmentation, we compare the performance improvements achieved by different augmentation methods on detectors under identical conditions. All target abnormal cells and background images are sourced from the training set. The training set of 6,538 images is expanded with 5,696 synthetic images containing 22,532 newly augmented abnormal cells, approximately half of the original annotated data. Subsequently, these synthetic images are respectively combined with the original real images to form the training data for detectors.
% As shown in \tableref{tab:comparison1}, incorporating augmented data improves/degrades detector performance compared to the baseline (w/o data augmentation), but the effectiveness varies significantly across methods. For example, our SAIC proves particularly effective for tailed categories, achieving substantial performance improvements for flora (15.5/10.9 for YOLOv8/Faster R-CNN) and actinomyces (13.2/6.7 for YOLOv8/Faster R-CNN). It also delivers an overall improvement of 3.1 for YOLOv8 and 2.5 for Faster R-CNN, respectively.
% In contrast, the effects of Copy \& Paste and Generation-based methods are minimal, and two alternative Composition-based methods even degrade performance, underscoring the superiority of SAIC for abnormal cell detection.

\noindent
\textbf{Complement to staining styles.}
For the four tail categories, we use color histograms to roughly represent their staining styles and perform t-SNE analysis on the style distributions of their training sets, augmented data, and test sets. As shown in \figureref{fig:example5}, SAIC effectively complements the staining style distribution of the training set, thereby enhancing the detector's robustness to staining variations.
Further analyses of data augmentation effectiveness are provided in Appendix \ref{app:example5}.
\begin{figure}[t]
% \begin{figure}[htbp]
 % Caption and label go in the first argument and the figure contents
 % go in the second argument
\floatconts
  {fig:example5}
  {\caption{t-SNE analysis of staining style complement on tail categories. The first/second value of legends indicates the detection state before/after augmentation, and T/F represents the true/false detection.}}
  % {\includegraphics[width=0.5\linewidth]{example-image}}
  % {\includegraphics[width=0.5\linewidth]{figure1.png}}
  {\includegraphics[width=1\linewidth]{figures/figure5.png}}
\end{figure}


% Contents that can be excluded in the main paper
% Interestingly, Faster R-CNN outperforms YOLOv8 in cervical cancer cell detection, likely due to YOLOv8's deeper and more complex network structure. While YOLOv8 extracts finer features, it is more prone to overfitting when detecting clustered cells in noisy backgrounds.





% \begin{table}[htbp]
% \centering
% \caption{Performance comparison of data augmentation methods on YOLOv8 and Faster R-CNN. Results are reported as \( \text{mAP}_{50} \) and \( \text{AP}_{50} \) for individual categories.}
% \resizebox{\textwidth}{!}{%
% \begin{tabular}{@{}l|l|c|c|c|c|c|c|c|c|c|c|c|c|c@{}}
% \toprule
% \textbf{Detector} & \textbf{Method} & \textbf{Venue} & \textbf{\( \text{mAP}_{50} \)} & \textbf{flora} & \textbf{actin} & \textbf{herps} & \textbf{cand} & \textbf{lsil} & \textbf{ascus} & \textbf{scc} & \textbf{asch} & \textbf{agc} & \textbf{trich} & \textbf{hsil} \\ 
% \midrule
% \multirow{6}{*}{YOLOv8} 
% & Baseline       & -      & 51.7 & 68.9 & 64.3 & 83.6 & \underline{54.7} & \underline{43.8} & 28.0 & 21.1 & \textbf{19.8} & \underline{66.3} & \textbf{68.4} & 50.1 \\ 
% & Copy \& Paste    & -    & \underline{52.1} & 72.5 & \underline{68.9} & \textbf{86.7} & 48.3 & 43.7 & 28.5 & \textbf{22.9} & 18.2 & 66.2 & 66.7 & \underline{50.4} \\ 
% & GLIGEN     & CVPR 2023      & 51.8 & \underline{77.6} & 65.6 & 79.9 & \textbf{54.8} & 41.2 & \underline{30.9} & 20.3 & 17.6 & 66.2 & 66.7 & 49.2 \\ 
% & Paint-by-Example & CVPR 2023 & 48.7 & 64.6 & 62.9 & 79.9 & 44.3 & 40.9 & \underline{30.9} & 20.4 & 15.2 & 63.2 & 65.0 & 48.5 \\ 
% & ObjectStitch  & CVPR 2023   & 50.2 & 69.9 & 66.0 & 84.8 & 47.0 & 40.4 & 30.0 & 17.6 & 17.3 & 64.3 & 65.2 & 49.3 \\ 
% & \textbf{Ours (SAIC)}  & -  & \textbf{54.8} & \textbf{84.4} & \textbf{77.5} & \underline{85.8} & 51.7 & \textbf{44.3} & \textbf{31.5} & \underline{22.8} & \underline{19.6} & \textbf{67.6} & \underline{66.9} & \textbf{50.7} \\ 
% \midrule
% \multirow{6}{*}{Faster R-CNN} 
% & Baseline       & -      & 59.4 & 72.8 & 78.9 & 83.5 & 68.8 & \textbf{60.7} & 43.0 & 31.8 & \textbf{23.7} & \underline{69.6} & \underline{67.9} & 52.6 \\ 
% & Copy \& Paste    & -    & 59.5 & 70.5 & 77.6 & \underline{83.6} & 73.5 & \underline{59.9} & \underline{44.7} & \underline{35.0} & 22.8 & 68.3 & 66.8 & 51.3 \\ 
% & GLIGEN      & CVPR 2023     & 59.5 & 76.6 & 77.4 & 82.0 & \textbf{79.5} & 57.3 & 42.5 & 32.9 & 19.8 & 69.1 & 66.1 & 50.9 \\ 
% & Paint-by-Example & CVPR 2023 & 59.1 & \underline{81.4} & 74.7 & 77.2 & 72.3 & 58.2 & 43.8 & 31.0 & 22.7 & 67.6 & \textbf{68.4} & \underline{52.7} \\ 
% & ObjectStitch   & CVPR 2023  & \underline{59.7} & 77.1 & \underline{82.5} & 82.9 & 68.2 & 55.6 & 44.1 & 34.8 & 21.9 & \textbf{70.4} & 65.8 & \textbf{52.9} \\ 
% & \textbf{Ours (SAIC)}  & -  & \textbf{61.9} & \textbf{83.7} & \textbf{85.6} & \textbf{85.9} & \underline{76.3} & 59.4 & \textbf{44.8} & \textbf{36.9} & \underline{22.9} & 68.4 & 65.2 & 52.3 \\ 
% \bottomrule
% \end{tabular}%
% }
% \label{tab:comparison1}
% \end{table}




\subsection{Quality Evaluation of Augmented Image}
We evaluate the quality of augmented images synthesized by SAIC through qualitative comparisons, quantitative comparisons, and a user study.

\noindent
\textbf{Qualitative comparisons.}  
As shown in \figureref{fig:example6}, Copy \& Paste produces synthetic images with a low informational density as it does not introduce new information. The Generation-based method GLIGEN \cite{li2023gligen} generates visually realistic images but struggles with tail category representation and fidelity to real cells due to diffusion models' bias toward simpler, in-distribution samples. In contrast, our SAIC synthesizes images with high fidelity and rich informational density, outperforming other Composition-based methods. More examples are shown in Appendix \ref{app:example6}.
\begin{figure}[t]
% \begin{figure}[htbp]
 % Caption and label go in the first argument and the figure contents
 % go in the second argument
\floatconts
  {fig:example6}
  {\caption{Qualitative comparisons across augmentation methods on flora, actin (top-2 tail categories) and hsil (top-1 non-tail category).}}
  % {\caption{Partial synthetic images across different methods (top-2/top-1 for tail/non-tail).}}
  % {\includegraphics[width=0.5\linewidth]{example-image}}
  % {\includegraphics[width=0.5\linewidth]{figure1.png}}
  {\includegraphics[width=1\linewidth]{figures/figure6.png}}
\end{figure}


% Contents that can be excluded in the main paper
% Furthermore, compared to the other two image synthesis-based methods, our approach has significant advantages in terms of quality and fidelity, as we replaced the ID extractor from CLIP with DINOv2 and introduced high-frequency maps as a source of detailed information. These improvements result in superior synthetic image quality.


% \noindent
% \textbf{Qualitative comparisons.}
% As shown in \figureref{fig:example6}, Copy \& Paste reliably places target abnormal cells (foreground) onto designated locations in the background image but suffers from two main limitations: (1) residual backgrounds from target cells may create inconsistencies with the background image, and (2) the method does not really introduce new information, resulting in synthetic images with low informational density.
% For the Generation-based method, while the generated images may visually resemble real ones, closer inspection reveals significant discrepancies between the generated abnormal cells and the real cells used as prompts. This stems from two issues: (1) diffusion models tend to generate simpler, in-distribution samples with lower informational density, limiting their utility for downstream tasks [39], and (2) the imbalance in cell category distributions impairs the model's ability to represent tail categories, leading to mismatches in appearance or even labels, which adversely affect downstream performance.
% In contrast, our SAIC combines the strengths of both methods, delivering synthetic images with high fidelity and rich informational density. Additionally, compared to other Composition-based methods, SAIC still demonstrates substantial advantages in both quality and fidelity.


% As shown in \figureref{fig:example6}, Copy \& Paste reliably pastes target abnormal cells onto designated locations in the background image. However, it faces two major challenges: (1) residual backgrounds from the target cells may cause inconsistencies with the background image, and (2) as it does not really introduce new information, the informational density of the synthetic images remains highly limited.
% For the Generation-based method, while the synthetic images initially appear similar to real images, closer examination reveals notable differences between the generated abnormal cells and the real cells used as prompts. This is attributed to two factors: (1) diffusion models’ sampling mechanisms tend to generate simpler, in-distribution samples with lower informational density, limiting their impact on downstream tasks [39], and (2) the severe imbalance in cell category distributions hinders the model's ability to learn the data distribution of tailed categories, resulting in appearance or even label mismatches between generated and real cells, which negatively affect downstream model performance.
% In contrast, our SAIC integrates the strengths of both methods, achieving high fidelity and rich informational density in synthetic images. Furthermore, compared to other Composition-based methods, SAIC demonstrates substantial advantages in synthetic quality and fidelity. 




% \noindent
% \textbf{Quantitative comparisons.}
% We also quantitatively evaluated the image quality produced by different data augmentation methods using FID and DINOv2 Scores, as shown in Table 2. Although image generation-based methods slightly outperform our image synthesis-based method in terms of the overall realism of synthetic images, they exhibit significant limitations in the fidelity of target abnormal cells. This hinders their potential for further application in data augmentation within the field of cytopathology. In contrast, the other two image synthesis-based methods face challenges due to structural design flaws, making it difficult for them to synthesize high-quality cytopathological images across domains in a zero-shot setting, unlike our proposed method.


\noindent
\textbf{Quantitative comparisons.}
As shown in \tableref{tab:comparison2}, compared to other composition-based methods, our SAIC significantly excels in the overall realism of synthetic images (indicated by FID), and achieves the highest foreground fidelity (indicated by DINOv2 Score). Although the fine-tuned GLIGEN \cite{li2023gligen} offers marginally better overall realism, it falls short of preserving the fidelity of candidate cells used as the foreground, which limits its effectiveness for downstream tasks. In contrast, our SAIC provides a balanced performance, excelling in both aspects.

% As shown in \tableref{tab:comparison2}, while the Generation-based method achieves slightly better overall realism (FID), it falls short in preserving the fidelity of cells used as foreground, limiting its potential for downstream tasks. In contrast, our SAIC demonstrates superior performance in synthesizing images with both high realism and fidelity, compared to other Composition-based methods.
% \begin{table}[t]
% % \begin{table}[htbp]
% \centering
% \caption{Quantitative comparisons of image quality across different methods.}
% \resizebox{0.5\textwidth}{!}{%
% % \begin{tabular}{@{}l|c|c@{}}
% \begin{tabular}{lcc}
% \hline
% % \toprule
% \textbf{Method}                     & \textbf{FID ↓} & \textbf{DINOv2 Score ↑} \\ 
% \hline
% % \midrule
% Generation (GLIGEN)                       & \textbf{9.1}     & \underline{81.6}                    \\ 
% Composition (Paint-by-Example)    & 191.0            & 74.3                    \\ 
% Composition (ObjectStitch)        & 95.6             & 79.2                    \\ 
% \textbf{Composition (Ours)}                & \underline{9.7}              & \textbf{86.5}           \\ 
% \hline
% % \bottomrule
% \end{tabular}%
% }
% \label{tab:comparison2}
% \end{table}



% \noindent
% \textbf{User study.}
% To evaluate the fidelity of synthetic cytopathological images generated by the SAIC framework, we designed and conducted a user study. This study involved 8 experienced pathologists from top-tier hospitals, who were tasked with distinguishing SAIC-generated synthetic cytopathological images from real images extracted from the long-tailed cervical cancer cell dataset (Comparison Detector Database). 
% Each dataset consisted of 50 images, including 25 synthetic images and 25 real images, which were randomly mixed. The participants were given 30 minutes to complete the classification task. We recorded the classification accuracy of each pathologist and calculated the average classification accuracy across all participants, which was X. Additionally, the distribution of classification results was further analyzed using a confusion matrix. The average classification precision and recall across all participants were Y and Z, respectively.
% This study further validated the fidelity of the synthetic images and provided empirical evidence supporting the application of SAIC in cytopathological diagnostics.


\noindent
\textbf{User study.}  
%We conduct a user study involving 4 experienced pathologists to evaluate the quality of images synthesized by SAIC. Each pathologist classifies 50 randomly mixed images (25 synthetic and 25 real) within 30 minutes. According to the confusion matrix shown in \figureref{fig:example11}, the average classification accuracy is 46\%, with a precision of 56\% and a recall of 46\%. These results reaffirm the high quality of SAIC-synthesized images and support its application in cytopathological diagnostics.
We conducted a user study involving 8 experienced pathologists to evaluate the quality of SAIC-synthesized images. Each pathologist was asked to judge the realism of 50 images (25 synthetic and 25 real) within a 30-minute timeframe. According to the mean and standard deviation results presented in \tableref{tab:comparison3}, the average accuracy of distinguishing between real and synthetic images was 50\%, and the judgment distributions for real and synthetic images were consistent. These results demonstrate that the fidelity of the synthetic images is sufficient to deceive human observers and reaffirm their high quality in supporting cytopathological diagnostics.

\noindent
\begin{minipage}[t]{\textwidth}
    \begin{minipage}[t]{0.48\textwidth}
        \centering
        \makeatletter\def\@captype{table}\makeatother
        \caption{Quantitative comparisons.}
        \resizebox{\textwidth}{!}{%
        \begin{tabular}{llcc}
        \hline
        \textbf{Framework} & \textbf{Method} & \textbf{FID ↓} & \textbf{DINOv2 Score ↑} \\ 
        \hline
        Generation & GLIGEN        & \textbf{9.1} & \underline{81.6} \\ 
        Composition & Paint-by-Example & 191.0 & 74.3 \\ 
        Composition & ObjectStitch     & 95.6 & 79.2 \\ 
        Composition & \textbf{SAIC (Ours)}   & \underline{9.7} & \textbf{86.5} \\ 
        \hline
        \end{tabular}
        }
        \label{tab:comparison2}
    \end{minipage}
    \hfill % automatically fill the remaining horizontal space
    \begin{minipage}[t]{0.42\textwidth}
        \centering
        \makeatletter\def\@captype{table}\makeatother
        \caption{User study results.}
        \resizebox{\textwidth}{!}{%
        \begin{tabular}{c|cc|c}
        \hline
         & Pred. Real & Pred. Syn & Total \\ 
        \hline
        Real      & $14.875\pm3.295$ & $10.125\pm3.295$ & 25 \\ 
        Syn       & $14.875\pm2.976$ & $10.125\pm2.976$ & 25 \\ 
        \hline
        Total & $29.750\pm5.449$ & $20.250\pm5.449$ & 50 \\
        \hline
        \end{tabular}
        }
        \label{tab:comparison3}            
    \end{minipage}
\end{minipage}
% \begin{table}[h]
% % \begin{table}[htbp]
% \centering
% % \caption{Performances in different ablation studies in terms of \( \text{mAP}_{50} \), FID, and DINOv2 Score.}
% \caption{Ablation study on impacts of various strategies.}
% \resizebox{\textwidth}{!}{%
% \begin{tabular}{@{}ccc|c|c|cc|c|c@{}}
% \hline
% % \toprule
% \multicolumn{3}{c|}{\textbf{Attribute-based Selection}} & \multirow{2}{*}{\makecell{\textbf{Style-aligned} \\ \textbf{Composition}}} & \multirow{2}{*}{\makecell{\textbf{LVLM-based} \\ \textbf{Filtration}}} & \multicolumn{2}{c|}{\textbf{\( \text{mAP}_{50} \) ↑}} & \multirow{2}{*}{\textbf{FID ↓}} & \multirow{2}{*}{\textbf{DINOv2 Score ↑}} \\ \cline{1-3} \cline{6-7}
% \textbf{Category} & \textbf{Area} & \textbf{Type} &  &  & \textbf{YOLOv8} & \textbf{\text{Faster R-CNN}} &  &  \\ 
% \hline
% % \midrule
%  &  &  &  &  & 51.5 & 58.4 & 12.0 & 86.8 \\ 
% \checkmark &  &  &  &  & 53.2 & 60.3 & 11.4 & 86.4 \\ 
% \checkmark & \checkmark &  &  &  & 53.6 & 60.5 & 10.6 & \textbf{87.0} \\ 
% \checkmark & \checkmark & \checkmark &  &  & 53.9 & 60.9 & 10.2 & \underline{86.9} \\ 
%  &  &  & \checkmark &  & 50.6 & 58.8 & 10.5 & 85.2 \\
% \checkmark & \checkmark & \checkmark & \checkmark &  & \underline{54.1} & \underline{61.3} & \textbf{9.5} & 85.8 \\
% \checkmark & \checkmark & \checkmark & \checkmark & \checkmark & \textbf{54.8} & \textbf{61.9} & \underline{9.7} & 86.5 \\ 
% \hline
% % \bottomrule
% \end{tabular}%
% }
% \label{tab:comparison4}
% \end{table}



\begin{table}[h]
% \begin{table}[htbp]
\centering
% \caption{Performances in different ablation studies in terms of \( \text{mAP}_{50} \), FID, and DINOv2 Score.}
\caption{\revise{Ablation study on impacts of various strategies.}}
\resizebox{\textwidth}{!}{%
\begin{tabular}{@{}ccc|cc|c|cc|c|c@{}}
\hline
% \toprule
\multicolumn{3}{c|}{\textbf{Attribute-based Selection}} & \multicolumn{2}{c|}{\textbf{Style-aligned Composition}} & \multirow{2}{*}{\makecell{\textbf{LVLM-based} \\ \textbf{Filtration}}} & \multicolumn{2}{c|}{\textbf{\( \text{mAP}_{50} \) ↑}} & \multirow{2}{*}{\textbf{FID ↓}} & \multirow{2}{*}{\textbf{DINOv2 Score ↑}} \\ \cline{1-3} \cline{4-5} \cline{7-8}

\textbf{Category} & \textbf{Area} & \textbf{Type} & \hspace{0.5cm}\textbf{Self}\hspace{0.5cm} & \hspace{0.5cm}\textbf{Background}\hspace{0.5cm}
 & & \textbf{YOLOv8} & \textbf{\text{Faster R-CNN}} &  &  \\ 

\hline
% \midrule
 &  &  & \checkmark &  &  & 51.5 & 58.4 & 12.0 & 86.8 \\ 
&  &  &  & \checkmark &  & 50.6 & 58.8 & 10.5 & 85.2 \\
\checkmark &  &  & \checkmark &  &  & 53.2 & 60.3 & 11.4 & 86.4 \\ 
\checkmark & \checkmark &  & \checkmark & &  & 53.6 & 60.5 & 10.6 & \textbf{87.0} \\ 
\checkmark & \checkmark & \checkmark & & & & 53.8 & 60.6 & 10.5 & 86.1 \\
\checkmark & \checkmark & \checkmark & \checkmark &  &  & 53.9 & 60.9 & 10.2 & \underline{86.9} \\
\checkmark & \checkmark & \checkmark &  & \checkmark &  & \underline{54.1} & \underline{61.3} & \textbf{9.5} & 85.8 \\
\checkmark & \checkmark & \checkmark & \checkmark & \checkmark & \checkmark & \textbf{54.8} & \textbf{61.9} & \underline{9.7} & 86.5 \\ 
\hline
% \bottomrule
\end{tabular}%
}
\label{tab:comparison4}
\end{table}













% \noindent of actual real and synthetic images are consistent. The results demonstrate that the fidelity of the synthetic images is sufficient to deceive human observers and reaffirm their high quality in supporting cytopathological diagnostics.

% We conducted a user study involving 8 experienced pathologists to evaluate the quality of SAIC-synthesized images. Each pathologist is required to judge 50 images (25 synthetic and 25 real) of their reality within 30 minutes. According to the average statistical results shown in \tableref{tab:comparison3}, the average prediction accuracy is 48\%. Even the proportion of classifying as real of synthetic images is higher than the actual images. The near-random discriminative probability shows that the fidelity of synthetic images is sufficient to deceive humans and reaffirm their high qualities of supporting cytopathological diagnostics.

% \begin{figure}[t]
% % \begin{figure}[htbp]
%  % Caption and label go in the first argument and the figure contents
%  % go in the second argument
% \floatconts
%   {fig:example11}
%   {\caption{User study results.}}
%   % {\includegraphics[width=0.5\linewidth]{example-image}}
%   % {\includegraphics[width=0.5\linewidth]{figure1.png}}
%   {\includegraphics[width=0.38\linewidth]{figure11.png}}
% \end{figure}


% \begin{figure}[t]
% \begin{minipage}{0.61\linewidth}
% \centering
% \resizebox{0.5\textwidth}{!}{%
% \begin{tabular}{lcc}
% \hline
% % \toprule
% \textbf{Method}                     & \textbf{FID ↓} & \textbf{DINOv2 Score ↑} \\ 
% \hline
% % \midrule
% Generation (GLIGEN)                       & \textbf{9.1}     & \underline{81.6}                    \\ 
% Composition (Paint-by-Example)    & 191.0            & 74.3                    \\ 
% Composition (ObjectStitch)        & 95.6             & 79.2                    \\ 
% \textbf{Composition (Ours)}                & \underline{9.7}              & \textbf{86.5}           \\ 
% \hline
% % \bottomrule
% \end{tabular}%
% }
% % \captionof{table}{Quantitative comparisons of image quality across different methods.}
% \caption{Quantitative comparisons of image quality across different methods.}
% \label{tab:comparison2}
% \end{minipage}
% \begin{minipage}{0.38\linewidth}
%         \centering
% 		\includegraphics[width=0.99\columnwidth]{figure11.png}
% 		\caption{User study results.}
% 		\label{fig:example11}
% \end{minipage}
% \end{figure}


% \begin{minipage}{\textwidth}
%         \begin{minipage}[h]{0.5\textwidth}
%             \centering
%             \includegraphics[height=0.7\textwidth]{figure11.png}
%             \makeatletter\def\@captype{figure}\makeatother\caption{}
%             \label{a}
%         \end{minipage}
%         \begin{minipage}[h]{0.5\textwidth}
%             \centering
%             \includegraphics[height=0.7\textwidth]{figure11.png}
%             \makeatletter\def\@captype{figure}\makeatother\caption{}
%             \label{b}   
%         \end{minipage}
% \end{minipage}

% \noindent
% \begin{minipage}[t]{\textwidth}
%     \begin{minipage}[t]{0.52\textwidth}
%         \centering
%         \makeatletter\def\@captype{table}\makeatother
%         \caption{Quantitative comparisons of image.} % Caption 移到表格前面
%         % \caption{Quantitative comparisons of image quality.} % Caption 移到表格前面
%         \resizebox{\textwidth}{!}{%
%         \begin{tabular}{llcc}
%         \hline
%         \textbf{Class} & \textbf{Method} & \textbf{FID ↓} & \textbf{DINOv2 Score ↑} \\ 
%         \hline
%         Generation & GLIGEN        & \textbf{9.1} & \underline{81.6} \\ 
%         Composition & Paint-by-Example & 191.0 & 74.3 \\ 
%         Composition & ObjectStitch     & 95.6 & 79.2 \\ 
%         Composition & \textbf{SAIC (Ours)}   & \underline{9.7} & \textbf{86.5} \\ 
%         \hline
%         \end{tabular}
%         }
%         \label{tab:comparison2}
%     \end{minipage}
%     % 右侧：图片
%     % \begin{minipage}[h]{0.38\textwidth}
%     %     \centering
%     %     \includegraphics[height=0.7\textwidth]{figure11_revision.png}
%     %     \makeatletter\def\@captype{figure}\makeatother\caption{\colorbox{yellow}{User study results.}}
%     %     \label{fig:example11}
%     % \end{minipage}
%     \begin{minipage}[t]{0.38\textwidth}
%         \centering
%         \makeatletter\def\@captype{table}\makeatother
%         \caption{User study results.} % Caption 移到表格前面
%         \resizebox{\textwidth}{!}{%
%         \begin{tabular}{l|cc|c}
%         \hline
%          & Pred. Real & Pred. Syn & Tot \\ 
%         \hline
%         Real      & 14 & 11 & 25\\ 
%         Syn       & 16.25 & 8.75 & 25\\ 
%         \hline
%         Tot & 30.25 & 19.75 & 50 \\
%         \hline
%         \end{tabular}
%         }
%         \label{tab:comparison3}            
%     \end{minipage}
        
% \end{minipage}


% \noindent
% \begin{minipage}[t]{\textwidth}
%     \begin{minipage}[t]{0.505\textwidth}
%         \centering
%         \makeatletter\def\@captype{table}\makeatother
%         \caption{Quantitative comparisons.}
%         \resizebox{\textwidth}{!}{%
%         \begin{tabular}{llcc}
%         \hline
%         \textbf{Class} & \textbf{Method} & \textbf{FID ↓} & \textbf{DINOv2 Score ↑} \\ 
%         \hline
%         Generation & GLIGEN        & \textbf{9.1} & \underline{81.6} \\ 
%         Composition & Paint-by-Example & 191.0 & 74.3 \\ 
%         Composition & ObjectStitch     & 95.6 & 79.2 \\ 
%         Composition & \textbf{SAIC (Ours)}   & \underline{9.7} & \textbf{86.5} \\ 
%         \hline
%         \end{tabular}
%         }
%         \label{tab:comparison2}
%     \end{minipage}
%     \hfill % 自动填充水平间距
%     \begin{minipage}[t]{0.395\textwidth}
%         \centering
%         \makeatletter\def\@captype{table}\makeatother
%         \caption{User study results.}
%         \resizebox{\textwidth}{!}{%
%         \begin{tabular}{c|cc|c}
%         \hline
%          & Pred. Real & Pred. Syn & Total \\ 
%         \hline
%         Real      & 14.5 & 10.5 & 25\\ 
%         Syn       & 15.7 & 9.3 & 25\\ 
%         \hline
%         Total & 30.2 & 19.8 & 50 \\
%         \hline
%         \end{tabular}
%         }
%         \label{tab:comparison3}            
%     \end{minipage}
% \end{minipage}


% \noindent
% \begin{minipage}[t]{\textwidth}
%     \begin{minipage}[t]{0.505\textwidth}
%         \centering
%         \makeatletter\def\@captype{table}\makeatother
%         \caption{Quantitative comparisons.}
%         \resizebox{\textwidth}{!}{%
%         \begin{tabular}{llcc}
%         \hline
%         \textbf{Framework} & \textbf{Method} & \textbf{FID ↓} & \textbf{DINOv2 Score ↑} \\ 
%         \hline
%         Generation & GLIGEN        & \textbf{9.1} & \underline{81.6} \\ 
%         Composition & Paint-by-Example & 191.0 & 74.3 \\ 
%         Composition & ObjectStitch     & 95.6 & 79.2 \\ 
%         Composition & \textbf{SAIC (Ours)}   & \underline{9.7} & \textbf{86.5} \\ 
%         \hline
%         \end{tabular}
%         }
%         \label{tab:comparison2}
%     \end{minipage}
%     \hfill % 自动填充水平间距
%     \begin{minipage}[t]{0.395\textwidth}
%         \centering
%         \makeatletter\def\@captype{table}\makeatother
%         \caption{User study results.}
%         \resizebox{\textwidth}{!}{%
%         \begin{tabular}{c|cc|c}
%         \hline
%          & Pred. Real & Pred. Syn & Total \\ 
%         \hline
%         Real      & 14.875 & 10.125 & 25\\ 
%         Syn       & 14.875 & 10.125 & 25\\ 
%         \hline
%         Total & 29.75 & 20.25 & 50 \\
%         \hline
%         \end{tabular}
%         }
%         \label{tab:comparison3}            
%     \end{minipage}
% \end{minipage}











\subsection{Ablation Study}
%We conduct extensive ablation studies to validate the effectiveness of different strategies used in our SAIC. Quantitative comparisons in \tableref{tab:comparison4} confirm that our strategies effectively enhance detection performance and improve synthetic quality. Specifically, Attribute-based Selection, despite its simplicity, achieves strong results independently. Incorporating Style-aligned Composition and LVLM-based Filtration further boosts the performance. Qualitative results are shown in Appendix \ref{app:example4}.

% We conduct comprehensive ablation studies to validate the effectiveness of the various steps employed in SAIC. The comparison results presented in \tableref{tab:comparison4} confirm that our designed steps improve the composition quality and significantly enhance detection performance. Specifically, Attribute-based Selection, despite its simplicity, yields robust improvements. Incorporating Style-aligned Composition and LVLM-based Filtration further boosts performance. Qualitative results of the ablation study are provided in Appendix \ref{app:example4}.

\revise{We conduct comprehensive ablation studies to validate the effectiveness of the various steps employed in SAIC. The comparison results presented in \tableref{tab:comparison4} confirm that our designed steps improve the composition quality and significantly enhance detection performance.} 

\revise{Specifically, Attribute-based Selection, despite its simplicity, yields robust improvements. Incorporating either self- or background-style-aligned composition leads to further performance boosts, with background-style alignment yielding more pronounced gains than self-style alignment. Moreover, self-style alignment prioritizes fidelity (higher DINOv2 scores), while background-style alignment favors realism (lower FID). Critically, integrating both mechanisms through LVLM-based Filtration combines their strengths, achieving superior overall performance. Qualitative results of the ablation study are provided in Appendix~\ref{app:example4}.}



% Contents that can be excluded in the main paper
% \colorbox{yellow}{(64\% for Back ..., 36\% for Self ...)}



% However, without prior knowledge, Style-aligned Composition alone falls short of producing satisfactory outcomes.






% We conducted extensive ablation studies to validate the effectiveness of our framework. As shown in Figure 7, given the same target abnormal cell, background image, and target location, we analyzed the results synthesized by different framework designs. The first and last columns show the real background image (Real) and the synthesis results of the full framework (Full), respectively, while the intermediate columns demonstrate the effects of omitting certain core strategies.
% First, we omitted both the Attribute-based Selection strategy (Stage 1) and the Style-aware Composition strategy (Stage 2), which essentially synthesizes cytopathological images without considering any prior knowledge or domain differences. We found that the synthesized results exhibited significant randomness and could disrupt potential cell distribution patterns (None). Next, we applied the Attribute-based Selection strategy and Style-aware Composition strategy separately. The results show that while the Attribute-based Selection strategy is simple, using it alone already yields decent synthesis quality (w/ Stage 1). Applying the online matching strategy on top of it further enhances the results (Full). Additionally, in the absence of prior knowledge, relying solely on staining style transfer does not produce a harmonious synthesis between the foreground and the background (w/ Stage 2).
% The quantitative comparisons in Table 3 provide further evidence supporting the effectiveness of our framework design in improving the quality of synthesized images, thereby enhancing abnormal cell detection performance.


% As shown in Figure 7, given the same target abnormal cell, background image, and target location, we compared synthesis results across different framework configurations. The first column shows the real background image (Real), while the last column displays the synthesis results of the complete framework (Full). Intermediate columns illustrate the impact of omitting key strategies.

% When both the Attribute-based Selection (Stage 1) and Style-aware Composition (Stage 2) strategies were omitted, the synthesis relied on no prior knowledge or domain considerations, resulting in random outputs that disrupted potential cell distribution patterns (None). Applying only the Attribute-based Selection strategy (w/ Stage 1) significantly improved synthesis quality, demonstrating its effectiveness despite its simplicity. Incorporating the online matching strategy further refined the results (Full). Conversely, relying solely on the Style-aware Composition strategy (w/ Stage 2) without prior knowledge failed to achieve harmonious integration between the foreground and background.




% \begin{figure}[htbp]
%  % Caption and label go in the first argument and the figure contents
%  % go in the second argument
% \floatconts
%   {fig:example7}
%   {\caption{Qualitative ablation study.}}
%   % {\includegraphics[width=0.5\linewidth]{example-image}}
%   % {\includegraphics[width=0.5\linewidth]{figure1.png}}
%   {\includegraphics[width=1\linewidth]{figure7.png}}
% \end{figure}

% \begin{table}[htbp]
% \centering
% % \caption{Performances in different ablation studies in terms of \( \text{mAP}_{50} \), FID, and DINOv2 Score.}
% \caption{Ablation study on impacts of various strategies.}
% \resizebox{\textwidth}{!}{%
% \begin{tabular}{@{}ccc|c|c|cc|c|c@{}}
% \hline
% % \toprule
% \multicolumn{3}{c|}{\textbf{Attribute-based Selection}} & \multirow{2}{*}{\textbf{Style-aligned Composition}} & \multirow{2}{*}{\textbf{LVLM-based Filtration}} & \multicolumn{2}{c|}{\textbf{\( \text{mAP}_{50} \) ↑}} & \multirow{2}{*}{\textbf{FID ↓}} & \multirow{2}{*}{\textbf{DINOv2 Score ↑}} \\ \cline{1-3} \cline{6-7}
% \textbf{Category} & \textbf{Area} & \textbf{Type} &  &  & \textbf{YOLOv8} & \textbf{\text{Faster R-CNN}} &  &  \\ 
% % \midrule
% \hline
%  &  &  &  &  & 51.5 & 58.4 & 12.0 & 86.8 \\ 
% \checkmark &  &  &  &  & 53.2 & 60.3 & 11.4 & 86.4 \\ 
% \checkmark & \checkmark &  &  &  & 53.6 & 60.5 & 10.6 & \textbf{87.0} \\ 
% \checkmark & \checkmark & \checkmark &  &  & 53.9 & 60.9 & 10.2 & \underline{86.9} \\ 
%  &  &  & \checkmark &  & 50.6 & 58.8 & 10.5 & 85.2 \\
% \checkmark & \checkmark & \checkmark & \checkmark &  & \underline{54.1} & \underline{61.3} & \textbf{9.5} & 85.8 \\
% \checkmark & \checkmark & \checkmark & \checkmark & \checkmark & \textbf{54.8} & \textbf{61.9} & \underline{9.7} & 86.5 \\ 
% \hline
% % \bottomrule
% \end{tabular}%
% }
% \label{tab:comparison2}
% \end{table}





% \begin{table}[htbp]
% \centering
% \caption{Performances in different ablation studies in terms of \( \text{mAP}_{50} \), FID, and DINOv2 Score. The first column is divided into three sub-columns, category\textbar area\textbar type.}
% \resizebox{\textwidth}{!}{%
% \begin{tabular}{@{}>{\centering\arraybackslash}p{1.5cm} >{\centering\arraybackslash}p{1.5cm} >{\centering\arraybackslash}p{1.5cm}|c|c|c|c|c|c@{}}
% \toprule
% \multicolumn{3}{c|}{\textbf{Attribute-based Selection}} & \textbf{Style-aligned Composition} & \textbf{LVLM-based Filtration} & \textbf{\( \text{mAP}_{50}^{\text{YOLOv8}} \) ↑} & \textbf{\( \text{mAP}_{50}^{\text{Faster R-CNN}} \) ↑} & \textbf{FID ↓} & \textbf{DINOv2 Score ↑} \\ 
% \midrule
%  &  &  &  &  & 51.5 & 58.4 & 12.0 & 86.8 \\ 
% \checkmark &  &  &  &  & 53.2 & 60.3 & 11.4 & 86.4 \\ 
% \checkmark & \checkmark &  &  &  & 53.6 & 60.5 & 10.6 & \textbf{87.0} \\ 
% \checkmark & \checkmark & \checkmark &  &  & 53.9 & 60.9 & 10.2 & \underline{86.9} \\ 
%  &  &  & \checkmark &  & 50.6 & 58.8 & 10.5 & 85.2 \\
% \checkmark & \checkmark & \checkmark & \checkmark &  & \underline{54.1} & \underline{61.3} & \textbf{9.5} & 85.8 \\
% \checkmark & \checkmark & \checkmark & \checkmark & \checkmark & \textbf{54.8} & \textbf{61.9} & \underline{9.7} & 86.5 \\ 
% \bottomrule
% \end{tabular}%
% }
% \label{tab:comparison2}
% \end{table}

\subsection{Cross Domain Application}
%Our SAIC excels in addressing the challenges of sparse circulating tumor cells (CTCs) and inter-batch variability, showcasing its effectiveness in enhancing CTC detection without requiring fine-tuning. This demonstrates its strong cross-domain applicability and highlights its potential to significantly improve the accuracy of CTC detection in clinical settings. Some examples are shown in Appendix \ref{app:example7}.

% We explore the application of SAIC in a new cellular domain (Circulating Tumor Cells) without fine-tuning, demonstrating its strong cross-domain applicability and highlighting its potential to positively impact a variety of downstream applications. Some examples are shown in Appendix \ref{app:example7}.

\revise{We demonstrate SAIC's generalizability by comparing it against baseline methods for synthesizing cells in three external pathological image types: circulating tumor cells (CTC), blood cells (BC), and urine cells (UC). The baselines include both the base GLIGEN and a version fine-tuned on the Comparison Detector Database (denoted as GLIGEN-base and GLIGEN-ft, respectively). As shown in \figureref{fig:example12}, SAIC consistently achieves superior fidelity and style coherence across all three cytopathological image synthesis tasks, thereby highlighting its potential for broader application in cytopathological image data augmentation.}

\begin{figure}[t]
% \begin{figure}[htbp]
 % Caption and label go in the first argument and the figure contents
 % go in the second argument
\floatconts
  {fig:example12}
  {\caption{\revise{Qualitative comparisons in three external pathological image types.}}}
  % {\caption{Partial synthetic images across different methods (top-2/top-1 for tail/non-tail).}}
  % {\includegraphics[width=0.5\linewidth]{example-image}}
  % {\includegraphics[width=0.5\linewidth]{figure1.png}}
  {\includegraphics[width=1\linewidth]{figures/figure12.png}}
\end{figure}

% We also explore the excesses of SAIC in addressing the challenges of sparse circulating tumor cells (CTCs) and inter-batch variability, showcasing its effectiveness in enhancing CTC detection without requiring fine-tuning. Some examples are shown in Appendix \ref{app:example7}. This demonstrates its strong cross-domain applicability and highlights its potential to significantly improve the accuracy of CTC detection in clinical settings. 

% Our SAIC performs well in the new cellular domain without fine-tuning, demonstrating its cross-domain applicability. 
% Some examples are shown in Appendix \ref{app:example7}.

% \subsection{Other Analysis}
% Firstly, we conduct deeper investigations into the effectiveness of data augmentation, focusing on its role in complementing staining styles and its impact under varying initial data sizes and augmentation degrees.
% Secondly, we validate the fidelity of synthetic cytopathological images through quantitative comparison with other methods and a user study.
% Finally, we briefly demonstrate the cross-domain applicability of our SAIC.
% See Appendices \ref{app:example5}, \ref{app:example6}, and \ref{app:example7} for more details in the supplementary material.





\section{Conclusion}
% We propose a training-free framework for cytopathological image data augmentation to improve the detection accuracy of abnormal cells. The core idea is to seamlessly combine abnormal cells with background images while ensuring consistency in category, area, type, and (staining) style, thereby generating high-fidelity and diverse training samples. This framework offers a general solution for cytopathological image data augmentation (or synthesis) tasks and can positively impact a variety of downstream applications.

% Despite its significant advantages in cytopathological image data augmentation, our framework faces certain limitations. Due to the training-free nature, it struggles when handling extremely complex and challenging cells. Additionally, standardizing the construction of the abnormal cell bank and determining the optimal ratio of augmented images in training remain potential obstacles that hinder the framework from fully realizing its potential.
% ==== 2025.01.13 ====
This paper proposes \textbf{S}tyle \textbf{A}ligned \textbf{I}mage \textbf{C}omposition (SAIC), a training-free data augmentation architecture for cytopathological abnormal cell detection, to address the issues of limited and long-tailed data distributions and biased staining styles in pathological image data. By introducing \textit{Attribute-based Selection}, \textit{Style-aligned Composition}, and \textit{LVLM-based Filtration}, SAIC achieves high-fidelity and style-preserved data augmentation. Experimental results demonstrate that, compared to existing data augmentation methods, SAIC-synthesized data more effectively enhances the performance and robustness of abnormal cell detection models for pathological images, showing notable advantages for tail categories. Moreover, SAIC exhibits outstanding fidelity and generalizability. This framework provides a universal data augmentation solution for cytopathological images and can potentially benefit various cross-domain applications.




\clearpage  % Acknowledgements, references, and appendix do not count toward the page limit (if any)
% Acknowledgments---Will not appear in anonymized version
\midlacknowledgments{This work is supported by the Natural Science Foundation of China under Grant 62271465, Suzhou Basic Research Program under Grant SYG202338, and Open Fund Project of Guangdong Academy of Medical Sciences, China (No. YKY-KF202206).}


\bibliography{midl25_32}


\clearpage

\appendix


\section{Staining Style Alignment}
\figureref{fig:example2} shows the composition results with self- and background-style alignment. Through self-style alignment, synthetic cells can effectively retain their own staining styles. However, this may result in inconsistencies with the staining style of the background image. By applying background-style alignment, synthetic cells can integrate more harmoniously into the background.

Note that we do not simply apply background-style alignment. The main reason is that, given the limited size of the abnormal cell bank, the reference cells selected using the DINOv2 Score may still be inconsistent with the staining style of the background image. In such cases, applying background-style alignment could negatively impact the fidelity of synthetic images.
\label{app:example1}
\begin{figure}[t]
% \begin{figure}[htbp]
 % Caption and label go in the first argument and the figure contents
 % go in the second argument
\floatconts
  {fig:example2}
  {\caption{Demonstration of staining style alignment.}}
  % {\includegraphics[width=0.5\linewidth]{example-image}}
  % {\includegraphics[width=0.5\linewidth]{figure1.png}}
  {\includegraphics[width=1\linewidth]{figures/figure2.png}}
\end{figure}

\section{LVLM-based Filtration}
\figureref{fig:example3} shows the detailed prompt setting of LVLM-based Filtration. Leveraging the perception of large vision-language models like GPT-4 for images and their understanding of text, we can design appropriate prompts that enable them to automatically select the more harmonious of two synthetic images and provide comprehensive reasons.

Note that in the subsequent experiments, through LVLM-based filtration, 64\% of the synthetic images produced by our SAIC are derived from background-style alignment, while 36\% are derived from self-style alignment.
\label{app:example2}
\begin{figure}[t]
% \begin{figure}[htbp]
 % Caption and label go in the first argument and the figure contents
 % go in the second argument
\floatconts
  {fig:example3}
  {\caption{Demonstration of LVLM-based Filtration.}}
  % {\includegraphics[width=0.5\linewidth]{example-image}}
  % {\includegraphics[width=0.5\linewidth]{figure1.png}}
  {\includegraphics[width=1.0\linewidth]{figures/figure3.png}}
\end{figure}

% \section{Dataset}
% \label{app:example3}
% \begin{figure}[htbp]
%  % Caption and label go in the first argument and the figure contents
%  % go in the second argument
% \floatconts
%   {fig:example4}
%   {\caption{Overview of Comparison detector Database.}}
%   % {\includegraphics[width=0.5\linewidth]{example-image}}
%   % {\includegraphics[width=0.5\linewidth]{figure1.png}}
%   {\includegraphics[width=1\linewidth]{figure4.png}}
% \end{figure}

\section{More Investigations of Data Augmentation Effectiveness}
\label{app:example5}
% \noindent
% \textbf{Effect of data augmentation on staining style complement.} 
% \begin{figure}[htbp]
%  % Caption and label go in the first argument and the figure contents
%  % go in the second argument
% \floatconts
%   {fig:example5}
%   {\caption{Effect of data augmentation on staining style complement.}}
%   % {\includegraphics[width=0.5\linewidth]{example-image}}
%   % {\includegraphics[width=0.5\linewidth]{figure1.png}}
%   {\includegraphics[width=1\linewidth]{figure5.png}}
% \end{figure}


\noindent
\textbf{Effectiveness of data augmentation across scaling ratios.} 
% Considering that many hospitals lack the resources to collect training datasets as large as the Comparison Detector Database, we also investigated the effectiveness of our framework under varying amounts of initial training data. Specifically, we sampled reduced training sets from the original training set at proportions of 0.1, 0.2, ..., 0.9 while maintaining the original class distribution. Using our framework, we applied consistent data augmentation to each of these reduced training sets (augmenting the data to approximately half the size of the original annotations). 
% As shown in Figure 8(a), compared to the baseline, our data augmentation framework significantly improves the performance of various detectors. However, as the amount of initial training data gradually increases, both the baseline accuracy of the detectors and the performance improvement achieved through data augmentation slowly converge. This convergence may be related to the bottleneck in the diversity of the data distribution.
To validate the effectiveness of SAIC in alleviating the practical difficulty of collecting large-scale training datasets, we evaluate our framework's performance with varying amounts of initial training data. Reduced training sets are sampled from the original data at proportions of 0.1 to 0.9, maintaining class distribution, and consistent data augmentation is applied to each reduced set. 
As shown in \figureref{fig:example8} (a), our SAIC significantly enhances detector performance compared to the baseline. However, as the initial training data increases, the baseline accuracy and the improvement from augmentation gradually converge, likely due to the limited diversity of the data distribution.

\noindent
\textbf{Effect of data augmentation across expanding ratios.} 
% In addition, we studied the effect of our framework with different levels of data augmentation under the same amount of initial training data. Specifically, starting with the training set obtained through 0.1 proportion sampling, we applied varying degrees of data augmentation using our framework. 
% As shown in Figure 8(b), compared to the baseline, the performance of the detectors generally shows an initial increase followed by a decline as the degree of augmentation increases. We attribute this decline to the fact that when too much synthetic data is added, it begins to dominate the training data distribution, inevitably causing a deviation from the original real data distribution and subsequently affecting performance on real test data. 
% Furthermore, we observed that when the initial training data is only one-tenth of the original size, YOLOv8 outperforms Faster R-CNN. This further validates our earlier hypothesis that YOLOv8 is prone to overfitting in certain scenarios.
We also evaluate the impact of different degrees of data augmentation on the training set sampled from the original data at the proportion of 0.1. As shown in \figureref{fig:example8} (b), detector performance initially improves but then declines as the degree of augmentation increases. This decline occurs because excessive synthetic data skews the training distribution, reducing alignment with real test data. 

% Contents that can be excluded in the main paper
% Notably, YOLOv8 outperforms Faster R-CNN with limited data, supporting our hypothesis that YOLOv8 is more prone to overfitting in such scenarios.

% \begin{figure}[htbp]
%  % Caption and label go in the first argument and the figure contents
%  % go in the second argument
% \floatconts
%   {fig:example8}
%   {\caption{Quantitative results.}}
%   % {\includegraphics[width=0.5\linewidth]{example-image}}
%   % {\includegraphics[width=0.5\linewidth]{figure1.png}}
%   {\includegraphics[width=1\linewidth]{figure8.png}}
% \end{figure}


% \begin{figure}[htbp]
%     \centering
%     \begin{minipage}[b]{0.6\textwidth}
%         \centering
%         \includegraphics[width=\textwidth]{figure8a.png}
%         \caption*{(a)}
%         \label{fig:example8a}
%     \end{minipage}
%     \hfill
%     \begin{minipage}[b]{0.3\textwidth}
%         \centering
%         \includegraphics[width=\textwidth]{figure8b.png}
%         \caption*{(b)}
%         \label{fig:example8b}
%     \end{minipage}
%     \caption{More investigations of data augmentation effectiveness.}
%     \label{fig:example8}
% \end{figure}
\begin{figure}[t]
% \begin{figure}[htbp]
\floatconts
  {fig:example8} % label of the overall figure, used by \figureref
  {\caption{More investigations of data augmentation effectiveness.}} % caption of the overall figure
  { % the figure contents are defined here
    \begin{minipage}[b]{0.6\textwidth}
        \centering
        \includegraphics[width=\textwidth]{figures/figure8a.png}
        \caption*{(a)}
        \label{fig:example8a}
    \end{minipage}
    \hfill
    \begin{minipage}[b]{0.3\textwidth}
        \centering
        \includegraphics[width=\textwidth]{figures/figure8b.png}
        \caption*{(b)}
        \label{fig:example8b}
    \end{minipage}
  }
\end{figure}








\section{More Qualitative Comparisons}
\figureref{fig:example7} shows more examples of qualitative comparisons. For candidate cells with various attributes (category, type, and area) and different background images, our SAIC consistently synthesizes images with high fidelity and rich informational density, outperforming other augmentation methods.
\label{app:example6}
\begin{figure}[t]
% \begin{figure}[htbp]
 % Caption and label go in the first argument and the figure contents
 % go in the second argument
\floatconts
  {fig:example7}
  {\caption{Qualitative comparisons across augmentation methods on agc, asch, and cand.}}
  % {\includegraphics[width=0.5\linewidth]{example-image}}
  % {\includegraphics[width=0.5\linewidth]{figure1.png}}
  {\includegraphics[width=1\linewidth]{figures/figure10.png}}
\end{figure}


\section{Ablation Study}
\figureref{fig:example10} shows the qualitative results of the ablation study. The first and last columns show the real background image (Real) and the synthesis results of the full framework (Full), respectively. The intermediate columns illustrate the effects of omitting core strategies: (1) None: No strategies applied, resulting in random synthesis with disrupted cell distribution patterns; (2) w/ Stage 1: Attribute-based Selection strategy applied alone, yielding decent synthesis quality; (3) w/ Stage 2: Style-aligned Composition strategy applied alone, resulting in a lack of harmony between the foreground and background.

Note that in our experiments, through LVLM-based filtration, 64\% of the synthetic images produced by our SAIC are derived from background-style alignment, while 36\% are derived from self-style alignment.
\label{app:example4}
\begin{figure}[t]
% \begin{figure}[htbp]
 % Caption and label go in the first argument and the figure contents
 % go in the second argument
\floatconts
  {fig:example10}
  {\caption{Qualitative results of the ablation study.}}
  % {\includegraphics[width=0.5\linewidth]{example-image}}
  % {\includegraphics[width=0.5\linewidth]{figure1.png}}
  {\includegraphics[width=1\linewidth]{figures/figure7.png}}
\end{figure}







% \section{Comprehensive Fidelity Validation}
% \label{app:example6}
% \noindent
% \textbf{Quantitative comparison with other methods.} We also quantitatively evaluated the image quality produced by different data augmentation methods using FID and DINOv2 Scores, as shown in Table 2. Although image generation-based methods slightly outperform our image synthesis-based method in terms of the overall realism of synthetic images, they exhibit significant limitations in the fidelity of target abnormal cells. This hinders their potential for further application in data augmentation within the field of cytopathology. In contrast, the other two image synthesis-based methods face challenges due to structural design flaws, making it difficult for them to synthesize high-quality cytopathological images across domains in a zero-shot setting, unlike our proposed method.

% \begin{table}[htbp]
% \centering
% \caption{Quantitative Evaluation of Image Quality using FID (↓) and DINO Score (↑). Lower FID indicates better overall realism, while higher DINO Score indicates better fidelity of the target abnormal cells.}
% \resizebox{\textwidth}{!}{%
% \begin{tabular}{@{}l|c|c@{}}
% \toprule
% \textbf{Data}                     & \textbf{FID (↓)} & \textbf{DINO Score (↑)} \\ 
% \midrule
% Generation                        & \textbf{9.1}     & 81.6                    \\ 
% Composition (Paint-by-Example)    & 191.0            & 74.3                    \\ 
% Composition (ObjectStitch)        & 95.6             & 79.2                    \\ 
% Composition (Ours)                & 9.7              & \textbf{86.5}           \\ 
% \bottomrule
% \end{tabular}%
% }
% \label{tab:quantitative_evaluation}
% \end{table}

% \noindent
% \textbf{User study.} To evaluate the fidelity of synthetic cytopathological images generated by the SAIC framework, we designed and conducted a user study. This study involved 8 experienced pathologists from top-tier hospitals, who were tasked with distinguishing SAIC-generated synthetic cytopathological images from real images extracted from the long-tailed cervical cancer cell dataset (Comparison Detector Database). 
% Each dataset consisted of 50 images, including 25 synthetic images and 25 real images, which were randomly mixed. The participants were given 30 minutes to complete the classification task. We recorded the classification accuracy of each pathologist and calculated the average classification accuracy across all participants, which was X. Additionally, the distribution of classification results was further analyzed using a confusion matrix. The average classification precision and recall across all participants were Y and Z, respectively.
% This study further validated the fidelity of the synthetic images and provided empirical evidence supporting the application of SAIC in cytopathological diagnostics.


% \section{Cross Domain Application}
% \figureref{fig:example9} briefly shows the application of our SAIC in the domain of circulating tumor cells. Although circulating tumor cells are less complex than the diverse cervical cancer cells, considering that our SAIC requires no fine-tuning, it still demonstrates strong cross-domain applicability and its potential to positively impact a variety of downstream applications.
% \label{app:example7}
% \begin{figure}[t]
% % \begin{figure}[htbp]
%  % Caption and label go in the first argument and the figure contents
%  % go in the second argument
% \floatconts
%   {fig:example9}
%   {\caption{Demonstration of cross domain application.}}
%   % {\includegraphics[width=0.5\linewidth]{example-image}}
%   % {\includegraphics[width=0.5\linewidth]{figure1.png}}
%   {\includegraphics[width=1\linewidth]{figure9.png}}
% \end{figure}

\revise{\section{Computational Efficiency Analysis}
We emphasize that since augmented data is utilized during the model training phase (to generate enriched synthetic data for training), it does not alter the inference time or memory consumption of the abnormal cell detection model. Given that this task does not impose stringent real-time requirements, the computational efficiency of YOLOv8 and Faster R-CNN is sufficient for practical deployment.
Furthermore, we compare the average time and memory consumption for augmented image generation, as summarized in \tableref{tab:efficiency_comparison}. SAIC achieves a generation speed of 12.81 seconds per image, with average time allocations of 0.04s, 8.79s, and 3.98s for the selection, composition, and filtration stages, respectively. While SAIC's generation time is marginally longer than that of baseline methods (e.g., GLIGEN), it eliminates the need for supervised fine-tuning on domain-specific data and delivers superior synthesis quality. Lastly, SAIC's memory usage remains comparable to baseline approaches.}
\label{app:example7}
\begin{table}[t]
    \centering
    \caption{\revise{Comparison of time cost and memory usage across augmentation methods.}}
    \resizebox{\textwidth}{!}{%
    \begin{tabular}{llcc@{}}
        \toprule
        \textbf{Framework} & \textbf{Method} & \textbf{Average Time (seconds per image)} & \textbf{Memory Usage (MiB)} \\
        \midrule
        Generation & GLIGEN & 10.20 & 16225 \\
        Composition & Paint-by-Example & 4.60 & 12161 \\
        Composition & ObjectStitch & 4.30 & 11883 \\
        Composition & SAIC (Ours) & $0.04 + 8.79 + 3.98 = 12.81$ & 12657 \\
        \bottomrule
    \end{tabular}
    }
    \label{tab:efficiency_comparison}
\end{table}


\revise{\section{Comparison with Model-based Methods}
We have introduced two model-based methods for long-tailed object detection performance comparison:
\begin{enumerate}
    \item Faster R-CNN (RS): A Faster R-CNN model trained with resampling, a common solution for long-tailed problems.
    \item BACL: A data-balancing method from \textit{Balanced Classification: A Unified Framework for Long-Tailed Object Detection} (TMM 2023).
\end{enumerate}
It is worth noting that, as an augmentation-based method, SAIC and the model-based methods are mutually compatible. We further demonstrate the synergistic effects of combining SAIC with these approaches.
As shown in \tableref{tab:model_method_comparison}, SAIC outperforms both baseline methods in improving abnormal cell detection performance for both overall and tail categories. This indicates that enhancing tail-class diversity via SAIC provides more substantial gains compared to the re-weighting strategies employed by model-based methods. Moreover, integrating SAIC with model-based methods yields additional performance improvements in abnormal cell detection, underscoring the complementary nature of these two methodological paradigms.}
\label{app:example8}
\begin{table}[t]
    \centering
    \caption{\revise{Comparison with model-based methods (\textbf{Best results}, \underline{second best results}).}}
    \resizebox{0.8\textwidth}{!}{%
    \begin{tabular}{l|c|cccc@{}}
        \hline
        \multirow{2}{*}{\textbf{Method}} & \multirow{2}{*}{\textbf{\( \text{mAP}_{50} \)}} & \multicolumn{4}{c}{\textbf{Tail}} \\ \cline{3-6}
        
        & & \textbf{Flora} & \textbf{Actin} & \textbf{Herps} & \textbf{Cand} \\
        \hline
        Faster R-CNN             & 59.4  & 72.8  & 78.9  & 83.5  & 68.8  \\
        Faster R-CNN (RS)        & 59.8  & 74.7  & 80.6  & 84.3  & 70.9  \\
        BACL                    & 60.7  & 76.5  & 81.3  & 85.2  & 72.1  \\
        Faster R-CNN + SAIC       & 61.9  & 83.7  & 85.6  & 85.9  & 76.3  \\
        Faster R-CNN (RS) + SAIC   & \underline{62.2}  & \underline{84.1}  & \underline{86.2}  & \underline{86.3}  & \underline{77.3}  \\
        BACL + SAIC               & \textbf{62.6}  & \textbf{84.3}  & \textbf{86.5}  & \textbf{86.4}  & \textbf{77.5}  \\
        \hline
    \end{tabular}
    }
    \label{tab:model_method_comparison}
\end{table}


% \revise{\section{Statistical Tests}
% We}
% \label{app:example9}
% \begin{table}[t]
%     \centering
%     \caption{Statistical tests.}
%     \resizebox{\textwidth}{!}{%
%     \begin{tabular}{l|cc|c|c}
%         \hline
%         \textbf{Detector} & \multicolumn{2}{c|}{\textbf{Methods}} & \textbf{p-value} & \textbf{t-value} \\
%         \hline
%         \multirow{2}{*}{YOLOv8} & SAIC & Baseline & 0.028 & 2.245 \\
%                                  & SAIC & Copy \& Paste & 0.039 & 2.080 \\
%         \hline
%         \multirow{2}{*}{Faster R-CNN} & SAIC & Baseline & 0.031 & 2.210 \\
%                                        & SAIC & ObjectStitch & 0.036 & 2.130 \\
%         \hline
%     \end{tabular}
%     }
%     \label{tab:model_method_comparison}
% \end{table}

\revise{\section{Statistical Tests}
We conduct bootstrap resampling tests comparing SAIC with baseline methods and the best-performing comparative methods across two detection models (Copy \& Paste for YOLOv8 and ObjectStitch for Faster R-CNN). The results, summarized in \tableref{tab:statistical_test}, demonstrate statistically significant differences between SAIC and all compared methods ($p < 0.05$).}
\label{app:example9}
\begin{table}[h]
    \centering
    \caption{\revise{Statistical tests.}}
    \resizebox{0.5\textwidth}{!}{%
    \begin{tabular}{l|c|c}
        \hline
        \textbf{Detector} & \textbf{Method} & \textbf{p-value} \\
        \hline
        \multirow{2}{*}{YOLOv8} & Baseline & 0.028 \\
                                 & Copy \& Paste & 0.039 \\
        \hline
        \multirow{2}{*}{Faster R-CNN} & Baseline & 0.031 \\
                                       & ObjectStitch & 0.036 \\
        \hline
    \end{tabular}
    }
    \label{tab:statistical_test}
\end{table}





\end{document}
