\documentclass{midl} % Include author names
% \documentclass[anon]{midl} % Anonymized submission

% The following packages will be automatically loaded:
% jmlr, amsmath, amssymb, natbib, graphicx, url, algorithm2e
% ifoddpage, relsize and probably more
% make sure they are installed with your latex distribution
\usepackage[font={footnotesize}]{caption}
\usepackage{setspace}
\usepackage[utf8]{inputenc}
\usepackage[T1]{fontenc}

\usepackage{url}
\usepackage{graphicx}
\usepackage{amsmath}
\usepackage{amssymb}
\usepackage{mathtools, nccmath}
\usepackage{svg}

\usepackage{booktabs}  % professional-quality tables
\usepackage{multirow}
\usepackage{multicol}

\usepackage{bm}
\usepackage{microtype}
\usepackage{amsmath}
\usepackage{amssymb}
\usepackage{array}
\usepackage{cases}

\usepackage{txfonts}
\usepackage{xcolor}
\usepackage{soul}
\usepackage{pifont}
\newcommand{\cmark}{\ding{51}}%
\newcommand{\xmark}{\ding{55}}%
\def\ie{\emph{i.e}., }
\def\eg{\emph{e.g}., }
\def\Eg{\emph{E.g}., }
\def\etal{\emph{et al}. }
\def\wrt{\emph{w.r.t}. }
\def\etc{\emph{etc}. }
\def\resp{\emph{resp}. }
\def\jiqing{\textcolor{orange}}
\def\viktor{\textcolor{red}}
\def\ingrid{\textcolor{purple}}

\usepackage{lineno}
\linenumbers

% \usepackage[symbol]{footmisc}
% \renewcommand{\thefootnote}{\fnsymbol{footnote}}
% \jmlrvolume{-- Under Review}
\jmlryear{2024}\jmlrworkshop{Full Paper -- MIDL 2024}\jmlrvolume{-- 12}\editors{Accepted for publication at MIDL 2024}
% \editors{Under Review for MIDL 2024}

\title[IST-editing]{IST-editing: Infinite spatial transcriptomic editing in a generated gigapixel mouse pup}

\midlauthor{\Name{Jiqing Wu\nametag{$^{1}$}} \Email{jiqing.wu@usz.ch}\\
\addr $^{1}$ Department of Pathology and Molecular Pathology, University Hospital, University of Zurich, Switzerland.\\
\Name{Ingrid Berg\nametag{$^{1}$}} \Email{ingrid.berg@biol.ethz.ch}\\
\Name{Viktor H. Koelzer\nametag{$^{1,2}$}} \Email{viktor.koelzer@usb.ch}\\
\addr $^{2}$ Institute of Medical Genetics and Pathology, University Hospital Basel, Switzerland.
}

\begin{document}

\maketitle

\begin{abstract}
Advanced spatial transcriptomics (ST) techniques provide comprehensive insights into complex organisms across multiple scales, while simultaneously posing challenges in biomedical image analysis. 
The spatial co-profiling of biological tissues by gigapixel whole slide images (WSI) and gene expression arrays motivates the development of innovative and efficient algorithmic approaches.
Using Generative Adversarial Nets (GAN), we introduce \textbf{I}nfinite \textbf{S}patial \textbf{T}ranscriptomic \textbf{e}diting~(IST-editing) and establish gene expression-guided editing in a generated gigapixel mouse pup. 
Trained with patch-wise high-plex gene expression (input) and matched image data (output), IST-editing enables the seamless synthesis of arbitrarily large bioimages at inference, \textit{e.g.}, with a $106496 \times 53248$ resolution.
After feeding edited gene expression values to the trained model, we simulate cell-, tissue- and animal-level morphological transitions in the generated mouse pup. Lastly, we discuss and evaluate editing effects on interpretable morphological features. The code and generated WSIs are publicly accessible via \url{https://github.com/CTPLab/IST-editing}. 
\end{abstract}

\begin{keywords}
Gene expression editing, spatial transcriptomics, GAN, WSI, mouse pup
\end{keywords}

\section{Introduction}
Recent advances in multi-omics technologies (\eg spatial transcriptomics (ST)~\cite{moses2022museum}) and generative artificial intelligence (AI) have the potential to revolutionize biomedical image analysis~\cite{royer2023future}.
Leveraging spatial co-profiling of high-plex mRNA transcripts~(acting as proxies for gene expression) and high-resolution biomedical images, researchers possess unprecedented opportunities to model the complex spatial organization of an entire organism. 

Concurrently, generative AI~\cite{bermano2022state,croitoru2023diffusion} has showcased remarkable progress in creating high-quality visual content, paving the way towards novel applications in the biomedical domain.
Trained with Hematoxylin and Eosin (H\&E)-stained or (immuno)fluorescence images, prior studies~\cite{carrillo2023synthetic, lamiable2023revealing,wu2023gilea} have achieved impressive results of bioimage generation and manipulation using GAN approaches. Recently, researchers~\cite{wu2023silico} further demonstrated the algorithmic editability on ST data and simulated cellular morphological transitions by shifting gene expression distributions. Notably, these studies were carried out at cell- or tissue-level and the generated bioimage resolution is usually smaller than $256 \times 256$.   
Due to scalability limitations and violation of the translation equivariance property, the generative competence of such algorithmic methods cannot be extended to the entirety of a WSI without inducing visible stitching artifacts, which can be partially mitigated by employing more hardware resources. A series of StyleGAN studies~\cite{karras2020analyzing,karras2021alias} first showed the feasibility of training $1024\times1024$ images on 8 V100 GPUs. In a recent paper, a GAN-based approach~\cite{kang2023scaling} has accomplished $4096\times4096$ image generation with remarkably fine details. Critically, this achievement was made possible by training a scaled GAN model on 96-128 A100 GPUs. Despite impressive breakthroughs in generating megapixel-resolution images, the hardware requirements for synthesizing WSIs at the gigapixel scale can be computationally intractable, making the model application prohibitively expensive in biomedical research. 

To extend the model applicability to arbitrarily large images, Single GAN (SinGAN)~\cite{shaham2019singan} and Single Denoising Diffusion Model (SinDDM)~\cite{kulikov2023sinddm} were proposed to learn the internal statistics of a given training image. Their shared coarse-to-fine architectural design enables the generation of image samples of any desired dimensions. Differing from single-image training, InfinityGAN~\cite{lin2021infinitygan} re-introduced large-scale training on patch-wise image data using low computational resources. Tailored for high-resolution natural scene creation, strong coordinate priors, such as vertical rapid saturation and horizontal repetitive patterns of sky, land, or ocean, were imposed within the structure and texture synthesizer of InfinityGAN. 

In bioimage generation, the utility of coordinate priors is nonetheless undesirable. This is because the arrangement of biological structures is not dictated by a rigid coordinate system, but rather by the intricate interplay between genetic, epigenetic, and gene expression variability that leads to the phenotype of a living system~\cite{haniffa2021roadmap}.
Here, we propose \textbf{I}nfinite \textbf{S}patial \textbf{T}ranscriptomic \textbf{e}diting (IST-editing) in a generated gigapixel mouse pup. To the best of our knowledge, we are the first to introduce algorithmic gene expression editing at the scale of an entire organism:
\begin{itemize}
\item Taking gene expression data as the input, we achieve the seamless generation of $106496 \times 53248$ WSIs of a whole mouse pup.
\item By gene expression-guided editing, we simulate cell-, tissue- and animal-level morphological transitions, measured with interpretable morphological features.
\item Importantly, the model training and inference can be efficiently executed on a single consumer-grade GPU, \eg GeForce RTX 3090 Ti.
\end{itemize}
\section{The proposed IST-editing}
To efficiently process the paired transcript count array and biomedical image with matched gigapixel resolutions, we develop IST-editing upon the StyleGAN~\cite{karras2020analyzing,karras2021alias} framework. This is motivated by recent GAN studies~\cite{sauer2022stylegan,sauer2023stylegan,kang2023scaling} in response to the remarkable advances made by diffusion models. While being orders of magnitude faster at inference time, these methods, built upon advanced GAN architectures such as StyleGAN, exhibit superior generation and editing performance that remains competitive with their diffusion counterparts. 


\begin{figure}[b!]
\vspace{-0.7cm}
\centering
\includegraphics[width=0.9\linewidth]{Figure_model.pdf}
\vspace{-0.3cm}
\caption{\textbf{Conceptual illustrations of the proposed model}.}
\vspace{-0.8cm}
\label{fig_m}
\end{figure}

\subsection{ST data}
\noindent\textbf{Spatial gene expression as the input and representation}:
 In the natural image domain, previous generative models~\cite{shaham2019singan,kulikov2023sinddm} typically utilize (spatial) noise input for unconditional image generation. In addition, learned textual representations~\cite{radford2021learning} can be incorporated into the model to guide the image alterations~\cite{bermano2022state}. However, semantic ambiguity often occurs in interpreting a single latent code and qualitative analysis is mostly made possible for a subset of representations~\cite{harkonen2020ganspace}. Given the well-established biological understanding of many individual genes, we utilize gene expression as both the \textbf{input data} and \textbf{interpretable representation} for bioimage generation and editing.

\noindent\textbf{Training data pair}:
As shown in Fig.~\ref{fig_m} (left), we take the patch-wise spatial gene expression (input) and biomedical image (output) as the training data pair. During the training, we randomly and densely crop $2n \times 2n$ gene expression arrays that are center-aligned on the paired $n \times n$ image. With a $2 \times$ higher resolution than the associated image, these gene arrays will eventually allow the construction of a spatial grid that imposes seamless WSI generation at inference.
To strike a balance between the generation quality and training efficiency, we employ the paired $256 \times 256$ gene array and $128 \times 128$ image in the experiments. To ensure boundary consistency between neighboring generated images, every boundary pixel at $(x, y)$ of the image tile is obtained using gene expression values located at $(x', y')$, where $|x' - x| \le 64 $ and $|y' - y| \le 64$. A larger $256 \times 256$ gene array, containing all these values, is thus necessary to generate a $128 \times 128$ image tile.  
Due to the sparse spatial presence of gene expression, we down-scale the sampled gene array to $8 \times 8$ by sum reduction, such that more densely distributed gene expression values are aggregated in the format of a smaller 3D array.


\subsection{Training}
\noindent\textbf{Coordinate- and padding-free generator $G$}:
Instead of relying on strong coordinate-based priors including the vertical saturation and horizontal repetition of natural scenes, the design of our generator is driven by the intricate interaction between genes (causative factors) and phenotypes (observable characteristics). To model the directed linkage from gene expression to the biomedical image, we propose a straightforward coordinate-free generator, which is constructed using a series of padding-free and translation equivariant StyledConv layers (Fig.~\ref{fig_m}). No external prior knowledge, aside from gene expression data, is incorporated into the output images. In all padding-free layers, we discard pixel values that are padded at both spatial ends of the output. 
For $i = 1, 2, \ldots, l$, the intermediate output of the $i$-th layer then has a spatial resolution of $(2^{i + 2} + 5) \times (2^{i + 2} + 5)$. After discarding 5 boundary pixels of the last layer output, we obtain the generated patch-wise image. Leveraging the consistent $2\times$ increase of image resolution, our model can be easily adapted to output patches with $256 \times 256$ or $512 \times 512$ resolutions.

\noindent\textbf{Cell-subtype conditioned discriminator $D$}:
Inspired by conditional generations of well-characterized normal and cancer cellular images~\cite{wu2023silico}, we integrate cell subtype information into the discriminator to adversarially and conditionally train the generator. Concretely, we project cell label embeddings into $D$ and train both models with the conditional adversarial loss $\mathcal{L}_{\mathsf{adc}}$. Along with the $R_1$ regularization $\mathcal{L}_{R_1}$ and path length regularization loss $\mathcal{L}_{\mathsf{path}}$~\cite{karras2020analyzing}, we have the loss function
$
    \min_{G}((\max_{D} \mathcal{L}_{\mathsf{adc}}) + \alpha_{R_1}\mathcal{L}_{R_1} + \alpha_{\mathsf{path}}\mathcal{L}_{\mathsf{path}}), 
$
where $\alpha_{R_1}$ and $\alpha_{\mathsf{path}}$ are hyperparameters and are set to 10 and 2, respectively, based on the prior study~\cite{wu2023silico}. Then, we train the GAN model for 800k iterations with a batch size of 16. Eventually, the optimal model is selected based on low Fr\'echet Inception Distance ($d_\mathsf{FID}$)~\cite{heusel2017gans} and high Peak Signal-to-Noise Ratio (PSNR). For the former, we use a more efficient implementation~\cite{wu2022sorted} and robust CLIP features~\cite{radford2021learning,kynkaanniemi2022role} to carry out the computation. 

\subsection{Inference}
\noindent\textbf{Spatial gene expression grid}: 
We employ a divide-and-conquer strategy at inference, breaking down the WSI generation into parallelizable subtasks of patch-wise image generation. To guarantee the boundary consistency of neighboring patches and as shown in Fig.~\ref{fig_m} (middle), we use the spatial gene expression grid (dotted lines) that is overlaid on the image grid (solid lines). This grid is formed and merged with $2n \times 2n$ gene expression arrays center-aligned on $n \times n$ images, in which the stride size of array shift is $n \times n$. Together with the padding-free layer design (Fig.~\ref{fig_m} (right)), we generate arbitrarily large WSIs given gene expression data as the input.
Using a single GeForce RTX 3090 Ti, it takes $\sim30$ mins to synthesize $106496\times53248$ WSIs, which are accessible via our GitHub \href{https://github.com/CTPLab/IST-editing}{repo} and can be thoroughly examined by open-source software such as QuPath~\cite{bankhead2017qupath}.

\section{Experiments}
We test IST-editing on the public Xenium~\cite{janesick2022high} ST dataset of a one-day mouse pup. This gigapixel-resolution dataset\footnote{The download link is \url{https://s3-us-west-2.amazonaws.com/10x.files/samples/xenium/1.6.0/Xenium_V1_mouse_pup/Xenium_V1_mouse_pup_outs.zip}} provides a well-curated sparse 3D array of 379-plex gene transcript counts (App. Fig.~\ref{app3}) and the spatially matched DAPI-stained WSI at the identical resolution, offering a comprehensive morpho-molecular landscape of the whole organism. 
In the absence of clear cell-level annotations in the Xenium dataset, we conducted a careful evaluation of the WSI and cell-level clustering\footnote{Please see also the 10x Genomics data summary provided at \url{https://cf.10xgenomics.com/samples/xenium/1.6.0/Xenium_V1_mouse_pup/Xenium_V1_mouse_pup_analysis_summary.html}.} within the context of tissue organizations. Instructed by domain biomedical experts, we confirmed the accuracy of subtype assignments derived from the `kmeans\_10\_clusters' results in the raw data. Depending on the majority vote of cell subtypes presented in the sampled training data, we assign the label of the predominant subtype to each image tile.

\begin{figure}[htp!]
\vspace{-1.2cm}
\centering
\includegraphics[width=0.85\linewidth]{Figure1.pdf}
\vspace{-0.2cm}
\caption{\textbf{Experimental results of the WSI generation}. \textbf{a}. The visual comparison of tissue-level ($4096 \times 4096$) synthesized images obtained by training with 100\% of the available data using SinGAN~\cite{shaham2019singan}, SinDDM~\cite{kulikov2023sinddm}, StyleGAN2~\cite{karras2020analyzing} and InfinityGAN~\cite{lin2021infinitygan} as compared to IST-editing. Using the coarse-to-fine upscaling technique introduced by SinGAN and SinDDM, we present unsatisfactory upscaling results ($4096 \times 4096$, right plots) next to the faithful generation of low-resolution input images (left subplots) for a clear side-by-side comparison. \textbf{b}. The mean and standard deviation of transcript counts of the highly expressed genes (per cell) \textit{w.r.t.} individual tissue regions. \textbf{c}. The comparison of tissue-level generation results between the compared methods by PSNR (left) and $d_{\mathsf{FID}}$ (right). \textbf{d}. The comparison of PSNR and $d_{\mathsf{FID}}$ scores obtained by training IST-editing on progressively smaller subsets of available data (left) and at different numbers of iterations for training with 3\% of the available data (right). For these experiments, subsets of the available data are sampled following the `checkerboard' patterns, as illustrated underneath the `Training data proportion' plot. \textbf{e}. The visual illustration of 3\% of available training data. \textbf{f}. The cell-, tissue- and animal-level visualization of ground-truth (left) and generated (right) mouse pup WSI. To visualize the spatial pattern of leading gene expression in the right plot, we first downscale the resolution of gene expression array using sum reduction and then shift the gene expression level to [0, 255].}
\label{fig1}
\end{figure}





\subsection{Evaluation of generation results}
We benchmark IST-editing against state-of-the-art diffusion- and GAN-based models such as InfinityGAN. 
Consistent with the IST-editing approach, 
we feed all the models with patch-wise spatial gene expression data (input) and DAPI images (output) for systematic and fair comparisons. Following the single-image training paradigm, we train SinGAN~\cite{shaham2019singan} and SinDDM~\cite{kulikov2023sinddm} on individual tissue-level images (\textit{e.g.}, $4096\times4096$) and generate high-resolution images for direct comparison with the IST-editing results.
In contrast, StyleGAN2, InfinityGAN, and IST-editing are trained on patch-wise data pairs extracted from the entire WSI. As shown in Fig.~\ref{fig1}~(a), SinGAN and SinDDM can recreate low-resolution images (small inset, left) including texture similarities to the original tissue such as the alveolar pattern observed in samples from the lung region. However, the image generation cannot be consistently scaled to a higher resolution: Only basic and biologically meaningless tissue textures remain. StyleGAN2 preserved a pattern resembling cell nuclei in generated high-resolution images. Nevertheless, the tissue structure corresponding to the individual organ regions is lost, as is evident from the `StyleGAN2' column of Fig.~\ref{fig1}~(a). Owing to undesired coordinate priors for bioimage generation, we observed horizontal lines and repetitive patterns in images generated by InfinityGAN and clearly identifiable tissue structures are not present in these image examples.
After inputting the 379-plex gene expression data (\eg see Fig.~\ref{fig1} (b) and App. Fig.~\ref{app3}),
our approach successfully generates tissue-level images at the scale of $4096\times4096$ resolution, with biologically meaningful details (Fig.~\ref{fig1} (a), right). The generated images show a high level of similarity both in tissue organization, texture and cell-level detail to the biological prior, as supported by expert pathologist interpretation. Quantitatively illustrated in Fig.~\ref{fig1} (c), IST-editing outperforms compared methods in terms of low $d_\mathsf{FID}$ and high PSNR score. Using the padding-free StyledConv operations and spatial gene expression grid,
IST-editing achieved the WSI generation with a $106496\times 53248$ pixel resolution. Please see also App. Fig.~\ref{app1} for more elaborated visualization.

\noindent\textbf{Training data utility (100\% - 3\%)}: Next, we evaluate the generation robustness of the proposed approach under conditions of increasing data scarcity. For this purpose, we utilize progressively smaller subsets of the available data for training. As depicted in Fig.~\ref{fig1} (d, left), the optimal $d_\mathsf{FID}$ and PSNR scores remain consistent as the amount of available data decreases. Only when reducing the training data to $1/36$ of the original size (Fig.~\ref{fig1} (e)) do we start to observe a mild degradation in quantitative performance by $d_\mathsf{FID}$. Upon comparing the cell-, tissue- and animal-level generation quality achieved by training on the entire dataset (App. Fig.~\ref{app1} (a, b)) and 3\%  (Fig.~\ref{fig1} (f, right)) of the available data, the visual discrepancy between the two gigapixel-resolution WSIs appears marginal, substantiating the adaptability of IST-editing to limited data scenarios, requiring the seamless synthesis of more than 97\% of the unseen data. 

\begin{figure}[htp!]
\vspace{-1.2cm}
\centering
\includegraphics[width=0.95\linewidth]{Figure2.pdf}
\vspace{-0.2cm}
\caption{\textbf{Gene expression profiles and experimental results of IST-editing}. \textbf{a}. The heatmap of highly expressed genes (average per cell) \textit{w.r.t.} different tissue regions of the whole mouse pup and selected organ systems of interest. \textbf{b}. The visual (left) and quantitative (right) IST-editing effects on individual tissue regions obtained by scaling the leading gene expression group (middle) while zeroing out the rest of gene expression values. \textbf{c}. The visual (left) and quantitative (right) IST-editing effects on individual tissue regions obtained by scaling the leading eigenvalues of the sample covariance matrix (SCM)~\cite{wu2022sorted,wu2023silico}. For both (b) and (c), **** means $p \le 0.0001$ and `ns' stands for not statistically significant. The error bar of the box plot represents the 5\%–95\% quantile. The radar plots report the proportional ratio of morphological features between edited (numerator) and GT cells (denominator).
\textbf{d}. The overall editing effects on the whole mouse pup achieved by the interpolation between random noise and ground-truth gene expression values. For plots (b)-(d), all the editing experiments are conducted using the model trained with 100\% of the available data.}
\label{fig2}
\end{figure}

\subsection{Evaluation of editing effects}
We investigate gene expression-guided editing of WSI data by three distinct strategies. Experiments are performed on the generated `in-silico mouse pup' which contains co-profiled ST and WSI data of all major mammalian organ systems.

\noindent\textbf{(1) Direct scaling of gene expression}: Organized structures of diverse tissue regions emerge when progressively scaling the expression levels of the top four genes by a factor of 0.5, 1 (baseline), and 2 (Fig.~\ref{fig2} (b, left and middle)), while remaining gene expression values are zeroed out. Such targeted editing is driven by the observable dominant impact on the morphological generation of the top four leading expressed genes (Fig.~\ref{fig1} (b)). Interestingly, the editing effects exhibited biologically explainable heterogeneity across the different regions. In the colon section, we observe the emergence of crypt epithelial structures orchestrated by the upscaling of leading genes including Epithelial Cell Adhesion Molecule, \textit{EPCAM}. As muscle-specific genes are not represented in the top colon gene sets, the outer muscle layer remains absent in the reconstruction. In other examples, the clear structures and organizations of the lung region have been recovered by our approach, closely resembling the GT lung image, and image artifacts (\eg white fluff on GT WSI scan) are effectively eliminated in the reconstruction. Calculated on the proportional ratio between edited and GT tissue regions highlighted in the bounding boxes (Fig.~\ref{fig2} (d)), the radar charts in Fig.~\ref{fig2} (b, right) demonstrate a consistent increase in cell-level metrics approaching the GT with the up-scaling coefficients.     

\noindent\textbf{(2)~Indirect scaling of gene expression}: Similar to the cell-level manipulation study~\cite{wu2023silico}, we perform algorithmic editing on the sample covariance matrix (SCM) and scale the leading eigenvalues by 0.1, 0.5, and 1 (baseline). Consider the SCM  $\frac{1}{n} \bm{G}^{\mathsf{T}} \bm{G} = \bm{O} \bm{\lambda} \bm{O}^{\mathsf{T}}$, where $\bm{G}$ is the collection of $n$ 379-plex gene expression data from a given tissue region, $\bm{O}$ is the $379 \times 379$ eigenbasis and $\bm{\lambda}$ is the diagonal matrix of (sorted) eigenvalues derived from eigenvalue decomposition. Then, we control $\bm{\lambda}$ for indirectly conducting gene expression-guided editing. As illustrated in Fig.~\ref{fig2} (c), there exists a rather homogeneous transition of tissue structures across the various regions of interest. In contrast to the results described above using the leading genes, the muscle layer of the colon tissue as well as global architectural features of lung and skin are already observed at the scale of 0.1 when using all genes as an input. After examining the editing effects with up-scaling of the eigenvalues, we witness a further increase in DAPI pixel intensity and increased sharpening of architectural details closely resembling the GT image. This is reflected by the quantitative analysis of the interpretable morphological features (Fig.~\ref{fig2} (c, right)), where we observe an expected increase in the cellular region and DAPI signals.   

\noindent\textbf{(3) Interpolation between unorganized and well-organized gene expression}: To simulate morphological transitions at the scale of a whole `in-silico mouse pup', we conduct linear interpolation between randomly sampled and ground truth spatial gene expression, generating WSI results at coefficients of 0 (noise), 0.5, and 1 (mouse pup). The resulting WSIs exhibit a gradual progression from chaotic cellular organization - as reflected through the appearance of `random noise' across the entire sample - to the highly organized structure of the one-day mouse pup. We thus demonstrate the versatility of IST-editing in simulating biological processes across multiple scales.       

\subsection{Evaluation of model limitations}
\noindent\textbf{Training data utility (0.1\%)}: Pushing the limits further, we conduct extreme stress tests on the proposed approach for reconstructing the whole mouse pup. This is carried out by training on a single $2048 \times 2048$ resolution image extracted from individual tissue regions such as kidney, lung, and brain. Though the overall outline and structure of the mouse pup are retained, IST-editing struggles to recreate the WSI with fine biologically aware details, as illustrated in App. Fig.~\ref{app4}. Remarkably, heterogeneous generation patterns for different organs arise when training solely on one single image. For instance, training on the gut region image leads to the generation of blank space in the mouse brain. This can be explained by the non-overlapping highly expressed genes between the gut (\textit{e.g.}, \textit{Cdh16}, \textit{Ldhb}, \textit{Epcam}, \textit{Tfcp2l1}) and brain (\textit{e.g.}, \textit{Stmn1}, \textit{Gap43}, \textit{Nnat}, \textit{Tubb3}) region, as presented in Fig.~\ref{fig2} (a) and App. Fig.~\ref{app3}. 
When utilizing an `almost black' image with a mere fragment of mouse skin (App. Fig.~\ref{app4}), the overall structure of the mouse pup remains preserved, though the cellular and tissue generation tends to exhibit a preference for mimicking skin epithelial morphology, suggesting a bias towards replicating trained cellular subtypes. 
To resolve the limitation of training and testing on the same WSI of the mouse pup, we reported generalization results on different brain sections from two mice (App. Fig.~\ref{app6}, \ref{app7}) using the synergistic ST data of mouse brain atlas~\cite{yao2023high}.

\section{Discussion and Conclusion}
This proof-of-concept study showcased the generative ability and editability in an in-silico mouse pup with DAPI nuclear staining and linked ST data. Notably, IST-editing can be readily extended to other broadly established staining techniques to visualize cellular detail, as exemplified by the first H\&E generation results of the same mouse pup (App. Fig.~\ref{app5}). In-silico modeling holds great potential for the Replacement, Reduction, and Refinement of animal research and extends beyond animal modeling. In future applications, IST-editing could enable the simulated intervention on biological samples from human pathology with reduced ethical, legal, and regulatory risks and provide a novel perspective to investigate the linkage between genotype and phenotype in human diseases.



\section*{Author contributions statement}
J.W. and V.H.K. conceived the research idea. J.W. implemented the algorithm and carried out the experiments. J.W., I.B. and V.H.K. analyzed the results. J.W. and V.H.K. drafted the manuscript. I.B. critically reviewed the manuscript and supplied biological interpretations. V.H.K. supervised the project.

\section*{Competing interests}
J.W. declares no competing interests. V.H.K. declares project-based research funding from Roche and the Image Analysis Group outside of the submitted work. V.H.K. is on an advisory board of Takeda and has served as an invited speaker on behalf of Indica Labs and for Sharing Progress in Cancer Care, an independent nonprofit organization, outside of the submitted work.

% Acknowledgments---Will not appear in anonymized version
\midlacknowledgments{This study is funded by core funding of the University of Zurich to the Computational and Translational Pathology Lab led by V.H.K. at the Department of Pathology and Molecular Pathology, University Hospital and University of Zurich.}


\bibliography{midl24_12}

\appendix
\clearpage
\section{The overall gene expression profiles}

\begin{figure}[htp!]
\centering
\includegraphics[width=1.\linewidth]{App3.pdf}
\caption{\textbf{The heatmap of 379-plex gene expression values (average per cell) \textit{w.r.t.} different tissue regions}.}
\label{app3}
\end{figure}

\clearpage
\section{The experimental results for the DAPI-stained WSI}

\begin{figure}[htp!]
\centering
\includegraphics[width=1.\linewidth]{App1.pdf}
\caption{\textbf{The generation results of tissue region images (a) and WSI (b)}.}
\label{app1}
\end{figure}

\begin{figure}[htp!]
\centering
\includegraphics[width=1.\linewidth]{App2.pdf}
\caption{\textbf{The experimental results of diverse editing effects}.  \textbf{a}. The visual (left) and quantitative (right) editing effects on various tissue regions by scaling the leading gene expression group (middle) while zeroing out the rest of gene expression values.  \textbf{b}. The visual (left) and quantitative (right) editing effects by scaling the leading eigenvalues of the sample covariance matrix (SCM) of individual tissue regions.}
\label{app2}
\end{figure}

\begin{figure}[htp!]
\centering
\includegraphics[width=0.8\linewidth]{App4.pdf}
\caption{\textbf{The failure cases of WSI generation brought by training on a single $\mathbf{2048 \times 2048}$ image extracted from individual tissue regions}.}
\label{app4}
\end{figure}

\clearpage
\section{The first results for the H\&E-stained WSI}

\begin{figure}[htp!]
\centering
\includegraphics[width=1.\linewidth]{App5.pdf}
\caption{\textbf{The ground truth (a) and generated (b) H\&E-stained WSIs}.}
\label{app5}
\end{figure}

\clearpage
\section{The generalization results for coronal brain sections \cite{yao2023high}}
\begin{figure}[htp!]
\centering
\vspace{-0.5cm}
\includegraphics[width=0.95\linewidth]{App6.pdf}
\caption{\textbf{The ground truth (a) and generation result of the trained (b) and test (c) WSIs}. Here, the training brain section comes from the female mouse (ID: 609882, file: 1198980117) and the test section comes from the male mouse (ID: 609889, file: 1198980478), where both ST datasets have been generated with the same gene panel. By tile-wise quantitatively comparing the generated and ground truth WSIs, we report the mean and standard deviation of PSNR and $d_{\mathsf{FID}}$ for both training and unseen test data. Same as the results reported in the main manuscript, we here use a more efficient implementation~\cite{wu2022sorted} and robust CLIP features~\cite{radford2021learning,kynkaanniemi2022role} to carry out the $d_{\mathsf{FID}}$ computation. 
To visualize the spatial pattern of leading gene expression values in the middle and bottom plots, we first downscale the resolution of the gene expression array using sum reduction and then shift the gene expression level to [0, 255].}
\label{app6}
\end{figure}

\begin{figure}[htp!]
\centering
\includegraphics[width=0.95\linewidth]{App7.pdf}
\caption{\textbf{The ground truth (a) and generation result of the trained (b) and test (c) WSIs}. Here, the training brain section comes from the male mouse (ID: 609889, file: 1198980478) and the test section comes from the female mouse (ID: 609882, file: 1198980117). By tile-wise quantitatively comparing the generated and ground truth WSIs, we report the mean and standard deviation of PSNR and $d_{\mathsf{FID}}$ for both training and unseen test data. Same as the results reported in the main manuscript, we here use a more efficient implementation~\cite{wu2022sorted} and robust CLIP features~\cite{radford2021learning,kynkaanniemi2022role} to carry out the $d_{\mathsf{FID}}$ computation. To visualize the spatial pattern of leading gene expression values in the middle and bottom plots, we first downscale the resolution of the gene expression array using sum reduction and then shift the gene expression level to [0, 255].}
\label{app7}
\end{figure}

\end{document}